From 36d8593ec74dc04d3bd7c1c897a7b7cfbd0b0dc6 Mon Sep 17 00:00:00 2001 From: Russell King - ARM Linux Date: Sat, 19 Feb 2011 15:34:50 +0000 Subject: Make clocksource name const As nothing should be writing to the clocksource name string, make the clocksource name pointer const. Build-tested on ARM Versatile Express. Signed-off-by: Russell King Signed-off-by: John Stultz --- include/linux/clocksource.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index c37b21a..94c1f38 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -161,7 +161,7 @@ struct clocksource { /* * First part of structure is read mostly */ - char *name; + const char *name; struct list_head list; int rating; cycle_t (*read)(struct clocksource *cs); -- cgit v1.1 From b01cc1b0eae0dea19257b29347116505fbedf679 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Mon, 26 Apr 2010 19:03:05 -0700 Subject: x86: Convert remaining x86 clocksources to clocksource_register_hz/khz This converts the remaining x86 clocksources to use clocksource_register_hz/khz. CC: jacob.jun.pan@intel.com CC: Glauber Costa CC: Dimitri Sivanich CC: Rusty Russell CC: Jeremy Fitzhardinge CC: Chris McDermott CC: Thomas Gleixner Tested-by: Konrad Rzeszutek Wilk [xen] Signed-off-by: John Stultz --- arch/x86/kernel/apb_timer.c | 10 +--------- arch/x86/kernel/i8253.c | 6 +----- arch/x86/kernel/kvmclock.c | 6 +----- arch/x86/lguest/boot.c | 4 +--- arch/x86/platform/uv/uv_time.c | 6 +----- arch/x86/xen/time.c | 6 +----- drivers/clocksource/cyclone.c | 10 ++-------- 7 files changed, 8 insertions(+), 40 deletions(-) diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c index 51ef31a..29ebf5a 100644 --- a/arch/x86/kernel/apb_timer.c +++ b/arch/x86/kernel/apb_timer.c @@ -177,7 +177,6 @@ static struct clocksource clocksource_apbt = { .rating = APBT_CLOCKSOURCE_RATING, .read = apbt_read_clocksource, .mask = APBT_MASK, - .shift = APBT_SHIFT, .flags = CLOCK_SOURCE_IS_CONTINUOUS, .resume = apbt_restart_clocksource, }; @@ -595,14 +594,7 @@ static int apbt_clocksource_register(void) if (t1 == apbt_read_clocksource(&clocksource_apbt)) panic("APBT counter not counting. APBT disabled\n"); - /* - * initialize and register APBT clocksource - * convert that to ns/clock cycle - * mult = (ns/c) * 2^APBT_SHIFT - */ - clocksource_apbt.mult = div_sc(MSEC_PER_SEC, - (unsigned long) apbt_freq, APBT_SHIFT); - clocksource_register(&clocksource_apbt); + clocksource_register_khz(&clocksource_apbt, (u32)apbt_freq*1000); return 0; } diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c index 2dfd315..212fe65 100644 --- a/arch/x86/kernel/i8253.c +++ b/arch/x86/kernel/i8253.c @@ -188,8 +188,6 @@ static struct clocksource pit_cs = { .rating = 110, .read = pit_read, .mask = CLOCKSOURCE_MASK(32), - .mult = 0, - .shift = 20, }; static int __init init_pit_clocksource(void) @@ -205,9 +203,7 @@ static int __init init_pit_clocksource(void) pit_ce.mode != CLOCK_EVT_MODE_PERIODIC) return 0; - pit_cs.mult = clocksource_hz2mult(CLOCK_TICK_RATE, pit_cs.shift); - - return clocksource_register(&pit_cs); + return clocksource_register_hz(&pit_cs, CLOCK_TICK_RATE); } arch_initcall(init_pit_clocksource); diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index f98d3ea..6389a6b 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -26,8 +26,6 @@ #include #include -#define KVM_SCALE 22 - static int kvmclock = 1; static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME; static int msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK; @@ -120,8 +118,6 @@ static struct clocksource kvm_clock = { .read = kvm_clock_get_cycles, .rating = 400, .mask = CLOCKSOURCE_MASK(64), - .mult = 1 << KVM_SCALE, - .shift = KVM_SCALE, .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; @@ -203,7 +199,7 @@ void __init kvmclock_init(void) machine_ops.crash_shutdown = kvm_crash_shutdown; #endif kvm_get_preset_lpj(); - clocksource_register(&kvm_clock); + clocksource_register_hz(&kvm_clock, NSEC_PER_SEC); pv_info.paravirt_enabled = 1; pv_info.name = "KVM"; diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index eba687f0..5b96fd9 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -913,8 +913,6 @@ static struct clocksource lguest_clock = { .rating = 200, .read = lguest_clock_read, .mask = CLOCKSOURCE_MASK(64), - .mult = 1 << 22, - .shift = 22, .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; @@ -997,7 +995,7 @@ static void lguest_time_init(void) /* Set up the timer interrupt (0) to go to our simple timer routine */ set_irq_handler(0, lguest_time_irq); - clocksource_register(&lguest_clock); + clocksource_register_hz(&lguest_clock, NSEC_PER_SEC); /* We can't set cpumask in the initializer: damn C limitations! Set it * here and register our timer device. */ diff --git a/arch/x86/platform/uv/uv_time.c b/arch/x86/platform/uv/uv_time.c index 9daf5d1..0eb9018 100644 --- a/arch/x86/platform/uv/uv_time.c +++ b/arch/x86/platform/uv/uv_time.c @@ -40,7 +40,6 @@ static struct clocksource clocksource_uv = { .rating = 400, .read = uv_read_rtc, .mask = (cycle_t)UVH_RTC_REAL_TIME_CLOCK_MASK, - .shift = 10, .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; @@ -372,14 +371,11 @@ static __init int uv_rtc_setup_clock(void) if (!is_uv_system()) return -ENODEV; - clocksource_uv.mult = clocksource_hz2mult(sn_rtc_cycles_per_second, - clocksource_uv.shift); - /* If single blade, prefer tsc */ if (uv_num_possible_blades() == 1) clocksource_uv.rating = 250; - rc = clocksource_register(&clocksource_uv); + rc = clocksource_register_hz(&clocksource_uv, sn_rtc_cycles_per_second); if (rc) printk(KERN_INFO "UV RTC clocksource failed rc %d\n", rc); else diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 067759e..04e1159 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -26,8 +26,6 @@ #include "xen-ops.h" -#define XEN_SHIFT 22 - /* Xen may fire a timer up to this many ns early */ #define TIMER_SLOP 100000 #define NS_PER_TICK (1000000000LL / HZ) @@ -211,8 +209,6 @@ static struct clocksource xen_clocksource __read_mostly = { .rating = 400, .read = xen_clocksource_get_cycles, .mask = ~0, - .mult = 1< Date: Mon, 26 Apr 2010 20:20:47 -0700 Subject: ia64: convert to clocksource_register_hz/khz This converts the ia64 clocksources to use clocksource_register_hz/khz CC: Tony Luck CC: Thomas Gleixner Tested-by: Tony Luck [clocksource_itc path] Signed-off-by: John Stultz --- arch/ia64/kernel/cyclone.c | 6 +----- arch/ia64/kernel/time.c | 9 ++------- arch/ia64/sn/kernel/sn2/timer.c | 6 +----- drivers/char/hpet.c | 6 +----- 4 files changed, 5 insertions(+), 22 deletions(-) diff --git a/arch/ia64/kernel/cyclone.c b/arch/ia64/kernel/cyclone.c index d52f1f7..f64097b 100644 --- a/arch/ia64/kernel/cyclone.c +++ b/arch/ia64/kernel/cyclone.c @@ -31,8 +31,6 @@ static struct clocksource clocksource_cyclone = { .rating = 300, .read = read_cyclone, .mask = (1LL << 40) - 1, - .mult = 0, /*to be caluclated*/ - .shift = 16, .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; @@ -118,9 +116,7 @@ int __init init_cyclone_clock(void) /* initialize last tick */ cyclone_mc = cyclone_timer; clocksource_cyclone.fsys_mmio = cyclone_timer; - clocksource_cyclone.mult = clocksource_hz2mult(CYCLONE_TIMER_FREQ, - clocksource_cyclone.shift); - clocksource_register(&clocksource_cyclone); + clocksource_register_hz(&clocksource_cyclone, CYCLONE_TIMER_FREQ); return 0; } diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c index 9702fa9..41c40f0 100644 --- a/arch/ia64/kernel/time.c +++ b/arch/ia64/kernel/time.c @@ -73,8 +73,6 @@ static struct clocksource clocksource_itc = { .rating = 350, .read = itc_get_cycles, .mask = CLOCKSOURCE_MASK(64), - .mult = 0, /*to be calculated*/ - .shift = 16, .flags = CLOCK_SOURCE_IS_CONTINUOUS, #ifdef CONFIG_PARAVIRT .resume = paravirt_clocksource_resume, @@ -374,11 +372,8 @@ ia64_init_itm (void) ia64_cpu_local_tick(); if (!itc_clocksource) { - /* Sort out mult/shift values: */ - clocksource_itc.mult = - clocksource_hz2mult(local_cpu_data->itc_freq, - clocksource_itc.shift); - clocksource_register(&clocksource_itc); + clocksource_register_hz(&clocksource_itc, + local_cpu_data->itc_freq); itc_clocksource = &clocksource_itc; } } diff --git a/arch/ia64/sn/kernel/sn2/timer.c b/arch/ia64/sn/kernel/sn2/timer.c index 21d6f09..c34efda 100644 --- a/arch/ia64/sn/kernel/sn2/timer.c +++ b/arch/ia64/sn/kernel/sn2/timer.c @@ -33,8 +33,6 @@ static struct clocksource clocksource_sn2 = { .rating = 450, .read = read_sn2, .mask = (1LL << 55) - 1, - .mult = 0, - .shift = 10, .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; @@ -57,9 +55,7 @@ ia64_sn_udelay (unsigned long usecs) void __init sn_timer_init(void) { clocksource_sn2.fsys_mmio = RTC_COUNTER_ADDR; - clocksource_sn2.mult = clocksource_hz2mult(sn_rtc_cycles_per_second, - clocksource_sn2.shift); - clocksource_register(&clocksource_sn2); + clocksource_register_hz(&clocksource_sn2, sn_rtc_cycles_per_second); ia64_udelay = &ia64_sn_udelay; } diff --git a/drivers/char/hpet.c b/drivers/char/hpet.c index 7066e80..051474c 100644 --- a/drivers/char/hpet.c +++ b/drivers/char/hpet.c @@ -84,8 +84,6 @@ static struct clocksource clocksource_hpet = { .rating = 250, .read = read_hpet, .mask = CLOCKSOURCE_MASK(64), - .mult = 0, /* to be calculated */ - .shift = 10, .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; static struct clocksource *hpet_clocksource; @@ -934,9 +932,7 @@ int hpet_alloc(struct hpet_data *hdp) if (!hpet_clocksource) { hpet_mctr = (void __iomem *)&hpetp->hp_hpet->hpet_mc; CLKSRC_FSYS_MMIO_SET(clocksource_hpet.fsys_mmio, hpet_mctr); - clocksource_hpet.mult = clocksource_hz2mult(hpetp->hp_tick_freq, - clocksource_hpet.shift); - clocksource_register(&clocksource_hpet); + clocksource_register_hz(&clocksource_hpet, hpetp->hp_tick_freq); hpetp->hp_clocksource = &clocksource_hpet; hpet_clocksource = &clocksource_hpet; } -- cgit v1.1 From b8f39f7dfe12d4c8402c493a24fbf1e21d086771 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Mon, 26 Apr 2010 20:22:23 -0700 Subject: microblaze: convert to clocksource_register_hz/khz This converts the microblaze clocksources to use clocksource_register_hz/khz CC: Michal Simek CC: Thomas Gleixner Tested-by: Michal Simek Signed-off-by: John Stultz --- arch/microblaze/kernel/timer.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/arch/microblaze/kernel/timer.c b/arch/microblaze/kernel/timer.c index a5aa33d..68ec7d1 100644 --- a/arch/microblaze/kernel/timer.c +++ b/arch/microblaze/kernel/timer.c @@ -217,16 +217,12 @@ static struct clocksource clocksource_microblaze = { .rating = 300, .read = microblaze_read, .mask = CLOCKSOURCE_MASK(32), - .shift = 8, /* I can shift it */ .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; static int __init microblaze_clocksource_init(void) { - clocksource_microblaze.mult = - clocksource_hz2mult(timer_clock_freq, - clocksource_microblaze.shift); - if (clocksource_register(&clocksource_microblaze)) + if (clocksource_register_hz(&clocksource_microblaze, timer_clock_freq)) panic("failed to register clocksource"); /* stop timer1 */ -- cgit v1.1 From 7861434fe9b5c9dad1bbb1f674cded95950e778e Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 19 Oct 2010 17:49:07 -0700 Subject: alpha: convert to clocksource_register_hz Converts alpha to use clocksource_register_hz. CC: Richard Henderson CC: Ivan Kokshaysky CC: Matt Turner CC: Thomas Gleixner Signed-off-by: John Stultz --- arch/alpha/kernel/time.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/alpha/kernel/time.c b/arch/alpha/kernel/time.c index c1f3e7c..33b8110 100644 --- a/arch/alpha/kernel/time.c +++ b/arch/alpha/kernel/time.c @@ -378,8 +378,7 @@ static struct clocksource clocksource_rpcc = { static inline void register_rpcc_clocksource(long cycle_freq) { - clocksource_calc_mult_shift(&clocksource_rpcc, cycle_freq, 4); - clocksource_register(&clocksource_rpcc); + clocksource_register_hz(&clocksource_rpcc, cycle_freq); } #else /* !CONFIG_SMP */ static inline void register_rpcc_clocksource(long cycle_freq) -- cgit v1.1 From 39280742efb00ab61ad62486c737fdd3e980c30f Mon Sep 17 00:00:00 2001 From: John Stultz Date: Mon, 26 Apr 2010 20:24:37 -0700 Subject: sparc: convert to clocksource_register_hz/khz This converts the sparc clocksources to use clocksource_register_hz/khz CC: "David S. Miller" CC: Thomas Gleixner Signed-off-by: John Stultz --- arch/sparc/kernel/time_64.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/sparc/kernel/time_64.c b/arch/sparc/kernel/time_64.c index 3bc9c99..58aa27b 100644 --- a/arch/sparc/kernel/time_64.c +++ b/arch/sparc/kernel/time_64.c @@ -816,14 +816,12 @@ void __init time_init(void) clocksource_hz2mult(freq, SPARC64_NSEC_PER_CYC_SHIFT); clocksource_tick.name = tick_ops->name; - clocksource_calc_mult_shift(&clocksource_tick, freq, 4); clocksource_tick.read = clocksource_tick_read; + clocksource_register_hz(&clocksource_tick, freq); printk("clocksource: mult[%x] shift[%d]\n", clocksource_tick.mult, clocksource_tick.shift); - clocksource_register(&clocksource_tick); - sparc64_clockevent.name = tick_ops->name; clockevents_calc_mult_shift(&sparc64_clockevent, freq, 4); -- cgit v1.1 From 75c4fd8c7862f37eeae5c80f33bbe4dce97571d4 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Mon, 26 Apr 2010 20:23:11 -0700 Subject: mips: convert to clocksource_register_hz/khz This converts the mips clocksources to use clocksource_register_hz/khz CC: Ralf Baechle CC: Thomas Gleixner Signed-off-by: John Stultz --- arch/mips/alchemy/common/time.c | 3 +-- arch/mips/cavium-octeon/csrc-octeon.c | 3 +-- arch/mips/include/asm/time.h | 6 ----- arch/mips/jz4740/time.c | 3 +-- arch/mips/kernel/cevt-txx9.c | 3 +-- arch/mips/kernel/csrc-bcm1480.c | 3 +-- arch/mips/kernel/csrc-ioasic.c | 4 +-- arch/mips/kernel/csrc-powertv.c | 35 +++---------------------- arch/mips/kernel/csrc-r4k.c | 4 +-- arch/mips/kernel/csrc-sb1250.c | 3 +-- arch/mips/kernel/i8253.c | 5 +--- arch/mips/loongson/common/cs5536/cs5536_mfgpt.c | 5 +--- arch/mips/sgi-ip27/ip27-timer.c | 3 +-- 13 files changed, 14 insertions(+), 66 deletions(-) diff --git a/arch/mips/alchemy/common/time.c b/arch/mips/alchemy/common/time.c index 2aecb2f..d5da6ad 100644 --- a/arch/mips/alchemy/common/time.c +++ b/arch/mips/alchemy/common/time.c @@ -141,8 +141,7 @@ static int __init alchemy_time_init(unsigned int m2int) goto cntr_err; /* register counter1 clocksource and event device */ - clocksource_set_clock(&au1x_counter1_clocksource, 32768); - clocksource_register(&au1x_counter1_clocksource); + clocksource_register_hz(&au1x_counter1_clocksource, 32768); cd->shift = 32; cd->mult = div_sc(32768, NSEC_PER_SEC, cd->shift); diff --git a/arch/mips/cavium-octeon/csrc-octeon.c b/arch/mips/cavium-octeon/csrc-octeon.c index 26bf711..29d56af 100644 --- a/arch/mips/cavium-octeon/csrc-octeon.c +++ b/arch/mips/cavium-octeon/csrc-octeon.c @@ -105,8 +105,7 @@ unsigned long long notrace sched_clock(void) void __init plat_time_init(void) { clocksource_mips.rating = 300; - clocksource_set_clock(&clocksource_mips, octeon_get_clock_rate()); - clocksource_register(&clocksource_mips); + clocksource_register_hz(&clocksource_mips, octeon_get_clock_rate()); } static u64 octeon_udelay_factor; diff --git a/arch/mips/include/asm/time.h b/arch/mips/include/asm/time.h index c7f1bfe..bc14447 100644 --- a/arch/mips/include/asm/time.h +++ b/arch/mips/include/asm/time.h @@ -84,12 +84,6 @@ static inline int init_mips_clocksource(void) #endif } -static inline void clocksource_set_clock(struct clocksource *cs, - unsigned int clock) -{ - clocksource_calc_mult_shift(cs, clock, 4); -} - static inline void clockevent_set_clock(struct clock_event_device *cd, unsigned int clock) { diff --git a/arch/mips/jz4740/time.c b/arch/mips/jz4740/time.c index fe01678..03dfd4e 100644 --- a/arch/mips/jz4740/time.c +++ b/arch/mips/jz4740/time.c @@ -121,8 +121,7 @@ void __init plat_time_init(void) clockevents_register_device(&jz4740_clockevent); - clocksource_set_clock(&jz4740_clocksource, clk_rate); - ret = clocksource_register(&jz4740_clocksource); + ret = clocksource_register_hz(&jz4740_clocksource, clk_rate); if (ret) printk(KERN_ERR "Failed to register clocksource: %d\n", ret); diff --git a/arch/mips/kernel/cevt-txx9.c b/arch/mips/kernel/cevt-txx9.c index 0b73773..f0ab92a 100644 --- a/arch/mips/kernel/cevt-txx9.c +++ b/arch/mips/kernel/cevt-txx9.c @@ -51,8 +51,7 @@ void __init txx9_clocksource_init(unsigned long baseaddr, { struct txx9_tmr_reg __iomem *tmrptr; - clocksource_set_clock(&txx9_clocksource.cs, TIMER_CLK(imbusclk)); - clocksource_register(&txx9_clocksource.cs); + clocksource_register_hz(&txx9_clocksource.cs, TIMER_CLK(imbusclk)); tmrptr = ioremap(baseaddr, sizeof(struct txx9_tmr_reg)); __raw_writel(TCR_BASE, &tmrptr->tcr); diff --git a/arch/mips/kernel/csrc-bcm1480.c b/arch/mips/kernel/csrc-bcm1480.c index 51489f8..f96f99c 100644 --- a/arch/mips/kernel/csrc-bcm1480.c +++ b/arch/mips/kernel/csrc-bcm1480.c @@ -49,6 +49,5 @@ void __init sb1480_clocksource_init(void) plldiv = G_BCM1480_SYS_PLL_DIV(__raw_readq(IOADDR(A_SCD_SYSTEM_CFG))); zbbus = ((plldiv >> 1) * 50000000) + ((plldiv & 1) * 25000000); - clocksource_set_clock(cs, zbbus); - clocksource_register(cs); + clocksource_register_hz(cs, zbbus); } diff --git a/arch/mips/kernel/csrc-ioasic.c b/arch/mips/kernel/csrc-ioasic.c index 23da108..46bd7fa 100644 --- a/arch/mips/kernel/csrc-ioasic.c +++ b/arch/mips/kernel/csrc-ioasic.c @@ -59,7 +59,5 @@ void __init dec_ioasic_clocksource_init(void) printk(KERN_INFO "I/O ASIC clock frequency %dHz\n", freq); clocksource_dec.rating = 200 + freq / 10000000; - clocksource_set_clock(&clocksource_dec, freq); - - clocksource_register(&clocksource_dec); + clocksource_register_hz(&clocksource_dec, freq); } diff --git a/arch/mips/kernel/csrc-powertv.c b/arch/mips/kernel/csrc-powertv.c index a27c16c..2e7c523 100644 --- a/arch/mips/kernel/csrc-powertv.c +++ b/arch/mips/kernel/csrc-powertv.c @@ -78,9 +78,7 @@ static void __init powertv_c0_hpt_clocksource_init(void) clocksource_mips.rating = 200 + mips_hpt_frequency / 10000000; - clocksource_set_clock(&clocksource_mips, mips_hpt_frequency); - - clocksource_register(&clocksource_mips); + clocksource_register_hz(&clocksource_mips, mips_hpt_frequency); } /** @@ -130,43 +128,16 @@ static struct clocksource clocksource_tim_c = { /** * powertv_tim_c_clocksource_init - set up a clock source for the TIM_C clock * - * The hard part here is coming up with a constant k and shift s such that - * the 48-bit TIM_C value multiplied by k doesn't overflow and that value, - * when shifted right by s, yields the corresponding number of nanoseconds. * We know that TIM_C counts at 27 MHz/8, so each cycle corresponds to - * 1 / (27,000,000/8) seconds. Multiply that by a billion and you get the - * number of nanoseconds. Since the TIM_C value has 48 bits and the math is - * done in 64 bits, avoiding an overflow means that k must be less than - * 64 - 48 = 16 bits. + * 1 / (27,000,000/8) seconds. */ static void __init powertv_tim_c_clocksource_init(void) { - int prescale; - unsigned long dividend; - unsigned long k; - int s; - const int max_k_bits = (64 - 48) - 1; - const unsigned long billion = 1000000000; const unsigned long counts_per_second = 27000000 / 8; - prescale = BITS_PER_LONG - ilog2(billion) - 1; - dividend = billion << prescale; - k = dividend / counts_per_second; - s = ilog2(k) - max_k_bits; - - if (s < 0) - s = prescale; - - else { - k >>= s; - s += prescale; - } - - clocksource_tim_c.mult = k; - clocksource_tim_c.shift = s; clocksource_tim_c.rating = 200; - clocksource_register(&clocksource_tim_c); + clocksource_register_hz(&clocksource_tim_c, counts_per_second); tim_c = (struct tim_c *) asic_reg_addr(tim_ch); } diff --git a/arch/mips/kernel/csrc-r4k.c b/arch/mips/kernel/csrc-r4k.c index e95a3cd..decd1fa 100644 --- a/arch/mips/kernel/csrc-r4k.c +++ b/arch/mips/kernel/csrc-r4k.c @@ -30,9 +30,7 @@ int __init init_r4k_clocksource(void) /* Calculate a somewhat reasonable rating value */ clocksource_mips.rating = 200 + mips_hpt_frequency / 10000000; - clocksource_set_clock(&clocksource_mips, mips_hpt_frequency); - - clocksource_register(&clocksource_mips); + clocksource_register_hz(&clocksource_mips, mips_hpt_frequency); return 0; } diff --git a/arch/mips/kernel/csrc-sb1250.c b/arch/mips/kernel/csrc-sb1250.c index d14d3d1..e9606d9 100644 --- a/arch/mips/kernel/csrc-sb1250.c +++ b/arch/mips/kernel/csrc-sb1250.c @@ -65,6 +65,5 @@ void __init sb1250_clocksource_init(void) IOADDR(A_SCD_TIMER_REGISTER(SB1250_HPT_NUM, R_SCD_TIMER_CFG))); - clocksource_set_clock(cs, V_SCD_TIMER_FREQ); - clocksource_register(cs); + clocksource_register_hz(cs, V_SCD_TIMER_FREQ); } diff --git a/arch/mips/kernel/i8253.c b/arch/mips/kernel/i8253.c index 2392a7a2..9fadd17 100644 --- a/arch/mips/kernel/i8253.c +++ b/arch/mips/kernel/i8253.c @@ -196,8 +196,6 @@ static struct clocksource clocksource_pit = { .rating = 110, .read = pit_read, .mask = CLOCKSOURCE_MASK(32), - .mult = 0, - .shift = 20, }; static int __init init_pit_clocksource(void) @@ -205,7 +203,6 @@ static int __init init_pit_clocksource(void) if (num_possible_cpus() > 1) /* PIT does not scale! */ return 0; - clocksource_pit.mult = clocksource_hz2mult(CLOCK_TICK_RATE, 20); - return clocksource_register(&clocksource_pit); + return clocksource_register_hz(&clocksource_pit, CLOCK_TICK_RATE); } arch_initcall(init_pit_clocksource); diff --git a/arch/mips/loongson/common/cs5536/cs5536_mfgpt.c b/arch/mips/loongson/common/cs5536/cs5536_mfgpt.c index 8c807c9..0cb1b97 100644 --- a/arch/mips/loongson/common/cs5536/cs5536_mfgpt.c +++ b/arch/mips/loongson/common/cs5536/cs5536_mfgpt.c @@ -201,8 +201,6 @@ static struct clocksource clocksource_mfgpt = { .rating = 120, /* Functional for real use, but not desired */ .read = mfgpt_read, .mask = CLOCKSOURCE_MASK(32), - .mult = 0, - .shift = 22, }; int __init init_mfgpt_clocksource(void) @@ -210,8 +208,7 @@ int __init init_mfgpt_clocksource(void) if (num_possible_cpus() > 1) /* MFGPT does not scale! */ return 0; - clocksource_mfgpt.mult = clocksource_hz2mult(MFGPT_TICK_RATE, 22); - return clocksource_register(&clocksource_mfgpt); + return clocksource_register_hz(&clocksource_mfgpt, MFGPT_TICK_RATE); } arch_initcall(init_mfgpt_clocksource); diff --git a/arch/mips/sgi-ip27/ip27-timer.c b/arch/mips/sgi-ip27/ip27-timer.c index d6802d6..3cac883 100644 --- a/arch/mips/sgi-ip27/ip27-timer.c +++ b/arch/mips/sgi-ip27/ip27-timer.c @@ -177,8 +177,7 @@ static void __init hub_rt_clocksource_init(void) { struct clocksource *cs = &hub_rt_clocksource; - clocksource_set_clock(cs, CYCLES_PER_SEC); - clocksource_register(cs); + clocksource_register_hz(cs, CYCLES_PER_SEC); } void __init plat_time_init(void) -- cgit v1.1 From a1c57e0fec53defe745e64417eacdbd3618c3e66 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Mon, 26 Apr 2010 20:20:07 -0700 Subject: blackfin: convert to clocksource_register_hz This converts the blackfin clocksource to use clocksource_register_hz. CC: Mike Frysinger CC: Thomas Gleixner Signed-off-by: John Stultz --- arch/blackfin/kernel/time-ts.c | 35 ++--------------------------------- 1 file changed, 2 insertions(+), 33 deletions(-) diff --git a/arch/blackfin/kernel/time-ts.c b/arch/blackfin/kernel/time-ts.c index 8c9a43d..4a01371 100644 --- a/arch/blackfin/kernel/time-ts.c +++ b/arch/blackfin/kernel/time-ts.c @@ -23,29 +23,6 @@ #include #include -/* Accelerators for sched_clock() - * convert from cycles(64bits) => nanoseconds (64bits) - * basic equation: - * ns = cycles / (freq / ns_per_sec) - * ns = cycles * (ns_per_sec / freq) - * ns = cycles * (10^9 / (cpu_khz * 10^3)) - * ns = cycles * (10^6 / cpu_khz) - * - * Then we use scaling math (suggested by george@mvista.com) to get: - * ns = cycles * (10^6 * SC / cpu_khz) / SC - * ns = cycles * cyc2ns_scale / SC - * - * And since SC is a constant power of two, we can convert the div - * into a shift. - * - * We can use khz divisor instead of mhz to keep a better precision, since - * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits. - * (mathieu.desnoyers@polymtl.ca) - * - * -johnstul@us.ibm.com "math is hard, lets go shopping!" - */ - -#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ #if defined(CONFIG_CYCLES_CLOCKSOURCE) @@ -63,7 +40,6 @@ static struct clocksource bfin_cs_cycles = { .rating = 400, .read = bfin_read_cycles, .mask = CLOCKSOURCE_MASK(64), - .shift = CYC2NS_SCALE_FACTOR, .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; @@ -75,10 +51,7 @@ static inline unsigned long long bfin_cs_cycles_sched_clock(void) static int __init bfin_cs_cycles_init(void) { - bfin_cs_cycles.mult = \ - clocksource_hz2mult(get_cclk(), bfin_cs_cycles.shift); - - if (clocksource_register(&bfin_cs_cycles)) + if (clocksource_register_hz(&bfin_cs_cycles, get_cclk())) panic("failed to register clocksource"); return 0; @@ -111,7 +84,6 @@ static struct clocksource bfin_cs_gptimer0 = { .rating = 350, .read = bfin_read_gptimer0, .mask = CLOCKSOURCE_MASK(32), - .shift = CYC2NS_SCALE_FACTOR, .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; @@ -125,10 +97,7 @@ static int __init bfin_cs_gptimer0_init(void) { setup_gptimer0(); - bfin_cs_gptimer0.mult = \ - clocksource_hz2mult(get_sclk(), bfin_cs_gptimer0.shift); - - if (clocksource_register(&bfin_cs_gptimer0)) + if (clocksource_register_hz(&bfin_cs_gptimer0, get_sclk())) panic("failed to register clocksource"); return 0; -- cgit v1.1 From d79647aea22732f39c81bbdc80931f96b46023f0 Mon Sep 17 00:00:00 2001 From: Daniel De Graaf Date: Mon, 7 Mar 2011 15:18:57 -0500 Subject: xen/gntdev,gntalloc: Remove unneeded VM flags The only time when granted pages need to be treated specially is when using Xen's PTE modification for grant mappings owned by another domain (that is, only gntdev on PV guests). Otherwise, the area does not require VM_DONTCOPY and VM_PFNMAP, since it can be accessed just like any other page of RAM. Since the vm_operations_struct close operations decrement reference counts, a corresponding open function that increments them is required now that it is possible to have multiple references to a single area. We are careful in the gntdev to check if we can remove those flags. The reason that we need to be careful in gntdev on PV guests is because we are not changing the PFN/MFN mapping on PV; instead, we change the application's page tables to point to the other domain's memory. This means that the vma cannot be copied without using another grant mapping hypercall; it also requires special handling on unmap, which is the reason for gntdev's dependency on the MMU notifier. For gntalloc, this is not a concern - the pages are owned by the domain using the gntalloc device, and can be mapped and unmapped in the same manner as any other page of memory. Acked-by: Ian Campbell Signed-off-by: Daniel De Graaf Signed-off-by: Konrad Rzeszutek Wilk [v2: Added in git commit "We are.." from email correspondence] --- drivers/xen/gntalloc.c | 14 ++++++++++++-- drivers/xen/gntdev.c | 16 ++++++++++++++-- 2 files changed, 26 insertions(+), 4 deletions(-) diff --git a/drivers/xen/gntalloc.c b/drivers/xen/gntalloc.c index a7ffdfe..f6832f4 100644 --- a/drivers/xen/gntalloc.c +++ b/drivers/xen/gntalloc.c @@ -427,6 +427,17 @@ static long gntalloc_ioctl(struct file *filp, unsigned int cmd, return 0; } +static void gntalloc_vma_open(struct vm_area_struct *vma) +{ + struct gntalloc_gref *gref = vma->vm_private_data; + if (!gref) + return; + + spin_lock(&gref_lock); + gref->users++; + spin_unlock(&gref_lock); +} + static void gntalloc_vma_close(struct vm_area_struct *vma) { struct gntalloc_gref *gref = vma->vm_private_data; @@ -441,6 +452,7 @@ static void gntalloc_vma_close(struct vm_area_struct *vma) } static struct vm_operations_struct gntalloc_vmops = { + .open = gntalloc_vma_open, .close = gntalloc_vma_close, }; @@ -471,8 +483,6 @@ static int gntalloc_mmap(struct file *filp, struct vm_area_struct *vma) vma->vm_private_data = gref; vma->vm_flags |= VM_RESERVED; - vma->vm_flags |= VM_DONTCOPY; - vma->vm_flags |= VM_PFNMAP | VM_PFN_AT_MMAP; vma->vm_ops = &gntalloc_vmops; diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c index d96d311..687761f 100644 --- a/drivers/xen/gntdev.c +++ b/drivers/xen/gntdev.c @@ -358,17 +358,26 @@ static int unmap_grant_pages(struct grant_map *map, int offset, int pages) /* ------------------------------------------------------------------ */ +static void gntdev_vma_open(struct vm_area_struct *vma) +{ + struct grant_map *map = vma->vm_private_data; + + pr_debug("gntdev_vma_open %p\n", vma); + atomic_inc(&map->users); +} + static void gntdev_vma_close(struct vm_area_struct *vma) { struct grant_map *map = vma->vm_private_data; - pr_debug("close %p\n", vma); + pr_debug("gntdev_vma_close %p\n", vma); map->vma = NULL; vma->vm_private_data = NULL; gntdev_put_map(map); } static struct vm_operations_struct gntdev_vmops = { + .open = gntdev_vma_open, .close = gntdev_vma_close, }; @@ -680,7 +689,10 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma) vma->vm_ops = &gntdev_vmops; - vma->vm_flags |= VM_RESERVED|VM_DONTCOPY|VM_DONTEXPAND|VM_PFNMAP; + vma->vm_flags |= VM_RESERVED|VM_DONTEXPAND; + + if (use_ptemod) + vma->vm_flags |= VM_DONTCOPY|VM_PFNMAP; vma->vm_private_data = map; -- cgit v1.1 From 45bb1674b976ef81429c1e42de05844b49d45dea Mon Sep 17 00:00:00 2001 From: Daniel Drake Date: Sun, 13 Mar 2011 15:10:17 +0000 Subject: x86, olpc: Use device tree for platform identification Make OLPC fully depend on device tree, and use it to identify the OLPC platform details. Some nodes are exposed as platform devices where we plan to use device tree for device probing. Signed-off-by: Daniel Drake Acked-by: Grant Likely LKML-Reference: <20110313151017.C255F9D401E@zog.reactivated.net> Signed-off-by: H. Peter Anvin --- arch/x86/Kconfig | 2 +- arch/x86/include/asm/olpc_ofw.h | 9 +++---- arch/x86/platform/olpc/Makefile | 4 +--- arch/x86/platform/olpc/olpc.c | 51 +++++++++++++++++++++------------------- arch/x86/platform/olpc/olpc_dt.c | 19 +++++++++++++++ 5 files changed, 51 insertions(+), 34 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index b4c2e9c..471221b 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2071,7 +2071,7 @@ config OLPC depends on !X86_PAE select GPIOLIB select OF - select OF_PROMTREE if PROC_DEVICETREE + select OF_PROMTREE ---help--- Add support for detecting the unique features of the OLPC XO hardware. diff --git a/arch/x86/include/asm/olpc_ofw.h b/arch/x86/include/asm/olpc_ofw.h index c5d3a5a..2448771 100644 --- a/arch/x86/include/asm/olpc_ofw.h +++ b/arch/x86/include/asm/olpc_ofw.h @@ -26,15 +26,12 @@ extern void setup_olpc_ofw_pgd(void); /* check if OFW was detected during boot */ extern bool olpc_ofw_present(void); +extern void olpc_dt_build_devicetree(void); + #else /* !CONFIG_OLPC */ static inline void olpc_ofw_detect(void) { } static inline void setup_olpc_ofw_pgd(void) { } -#endif /* !CONFIG_OLPC */ - -#ifdef CONFIG_OF_PROMTREE -extern void olpc_dt_build_devicetree(void); -#else static inline void olpc_dt_build_devicetree(void) { } -#endif +#endif /* !CONFIG_OLPC */ #endif /* _ASM_X86_OLPC_OFW_H */ diff --git a/arch/x86/platform/olpc/Makefile b/arch/x86/platform/olpc/Makefile index c2a8cab..81c5e21 100644 --- a/arch/x86/platform/olpc/Makefile +++ b/arch/x86/platform/olpc/Makefile @@ -1,4 +1,2 @@ -obj-$(CONFIG_OLPC) += olpc.o +obj-$(CONFIG_OLPC) += olpc.o olpc_ofw.o olpc_dt.o obj-$(CONFIG_OLPC_XO1) += olpc-xo1.o -obj-$(CONFIG_OLPC) += olpc_ofw.o -obj-$(CONFIG_OF_PROMTREE) += olpc_dt.o diff --git a/arch/x86/platform/olpc/olpc.c b/arch/x86/platform/olpc/olpc.c index edaf3fe..0060fd5 100644 --- a/arch/x86/platform/olpc/olpc.c +++ b/arch/x86/platform/olpc/olpc.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -187,41 +188,43 @@ err: } EXPORT_SYMBOL_GPL(olpc_ec_cmd); -static bool __init check_ofw_architecture(void) +static bool __init check_ofw_architecture(struct device_node *root) { - size_t propsize; - char olpc_arch[5]; - const void *args[] = { NULL, "architecture", olpc_arch, (void *)5 }; - void *res[] = { &propsize }; + const char *olpc_arch; + int propsize; - if (olpc_ofw("getprop", args, res)) { - printk(KERN_ERR "ofw: getprop call failed!\n"); - return false; - } + olpc_arch = of_get_property(root, "architecture", &propsize); return propsize == 5 && strncmp("OLPC", olpc_arch, 5) == 0; } -static u32 __init get_board_revision(void) +static u32 __init get_board_revision(struct device_node *root) { - size_t propsize; - __be32 rev; - const void *args[] = { NULL, "board-revision-int", &rev, (void *)4 }; - void *res[] = { &propsize }; - - if (olpc_ofw("getprop", args, res) || propsize != 4) { - printk(KERN_ERR "ofw: getprop call failed!\n"); - return cpu_to_be32(0); - } - return be32_to_cpu(rev); + int propsize; + const __be32 *rev; + + rev = of_get_property(root, "board-revision-int", &propsize); + if (propsize != 4) + return 0; + + return be32_to_cpu(*rev); } static bool __init platform_detect(void) { - if (!check_ofw_architecture()) + struct device_node *root = of_find_node_by_path("/"); + bool success; + + if (!root) return false; - olpc_platform_info.flags |= OLPC_F_PRESENT; - olpc_platform_info.boardrev = get_board_revision(); - return true; + + success = check_ofw_architecture(root); + if (success) { + olpc_platform_info.boardrev = get_board_revision(root); + olpc_platform_info.flags |= OLPC_F_PRESENT; + } + + of_node_put(root); + return success; } static int __init add_xo1_platform_devices(void) diff --git a/arch/x86/platform/olpc/olpc_dt.c b/arch/x86/platform/olpc/olpc_dt.c index dab8746..4ce208f 100644 --- a/arch/x86/platform/olpc/olpc_dt.c +++ b/arch/x86/platform/olpc/olpc_dt.c @@ -19,7 +19,9 @@ #include #include #include +#include #include +#include #include static phandle __init olpc_dt_getsibling(phandle node) @@ -181,3 +183,20 @@ void __init olpc_dt_build_devicetree(void) pr_info("PROM DT: Built device tree with %u bytes of memory.\n", prom_early_allocated); } + +/* A list of DT node/bus matches that we want to expose as platform devices */ +static struct of_device_id __initdata of_ids[] = { + { .compatible = "olpc,xo1-battery" }, + { .compatible = "olpc,xo1-dcon" }, + { .compatible = "olpc,xo1-rtc" }, + {}, +}; + +static int __init olpc_create_platform_devices(void) +{ + if (machine_is_olpc()) + return of_platform_bus_probe(NULL, of_ids, NULL); + else + return 0; +} +device_initcall(olpc_create_platform_devices); -- cgit v1.1 From 5d94e81f69d4b1d1102d3ab557ce0a817c11fbbb Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 8 Mar 2011 10:36:19 -0800 Subject: x86: Introduce pci_map_biosrom() The isci driver needs to retrieve its preboot OROM image which contains necessary runtime parameters like platform specific sas addresses and phy configuration. There is no ROM BAR associated with this area, instead we will need to scan legacy expansion ROM space. 1/ Promote the probe_roms_32 implementation to x86-64 2/ Add a facility to find and map an adapter rom by pci device (according to PCI Firmware Specification Revision 3.0) Signed-off-by: Dave Jiang LKML-Reference: <20110308183226.6246.90354.stgit@localhost6.localdomain6> Signed-off-by: Dan Williams Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/probe_roms.h | 8 ++ arch/x86/include/asm/setup.h | 2 +- arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/head32.c | 1 - arch/x86/kernel/probe_roms.c | 267 ++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/probe_roms_32.c | 166 ------------------------ arch/x86/kernel/x86_init.c | 2 +- 7 files changed, 278 insertions(+), 170 deletions(-) create mode 100644 arch/x86/include/asm/probe_roms.h create mode 100644 arch/x86/kernel/probe_roms.c delete mode 100644 arch/x86/kernel/probe_roms_32.c diff --git a/arch/x86/include/asm/probe_roms.h b/arch/x86/include/asm/probe_roms.h new file mode 100644 index 0000000..4950a0b --- /dev/null +++ b/arch/x86/include/asm/probe_roms.h @@ -0,0 +1,8 @@ +#ifndef _PROBE_ROMS_H_ +#define _PROBE_ROMS_H_ +struct pci_dev; + +extern void __iomem *pci_map_biosrom(struct pci_dev *pdev); +extern void pci_unmap_biosrom(void __iomem *rom); +extern size_t pci_biosrom_size(struct pci_dev *pdev); +#endif diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index db8aa19..03d3a32 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -104,10 +104,10 @@ void *extend_brk(size_t size, size_t align); type *name; \ RESERVE_BRK(name, sizeof(type) * entries) +extern void probe_roms(void); #ifdef __i386__ void __init i386_start_kernel(void); -extern void probe_roms(void); #else void __init x86_64_start_kernel(char *real_mode); diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 62445ba..f33d738 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -36,7 +36,7 @@ obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o obj-y += time.o ioport.o ldt.o dumpstack.o obj-y += setup.o x86_init.o i8259.o irqinit.o jump_label.o obj-$(CONFIG_IRQ_WORK) += irq_work.o -obj-$(CONFIG_X86_32) += probe_roms_32.o +obj-y += probe_roms.o obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index 7f138b3..eab4940 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -23,7 +23,6 @@ static void __init i386_default_early_setup(void) { /* Initialize 32bit specific setup functions */ - x86_init.resources.probe_roms = probe_roms; x86_init.resources.reserve_resources = i386_reserve_resources; x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc; diff --git a/arch/x86/kernel/probe_roms.c b/arch/x86/kernel/probe_roms.c new file mode 100644 index 0000000..ba0a4cc --- /dev/null +++ b/arch/x86/kernel/probe_roms.c @@ -0,0 +1,267 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#include +#include +#include +#include +#include +#include + +static struct resource system_rom_resource = { + .name = "System ROM", + .start = 0xf0000, + .end = 0xfffff, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}; + +static struct resource extension_rom_resource = { + .name = "Extension ROM", + .start = 0xe0000, + .end = 0xeffff, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}; + +static struct resource adapter_rom_resources[] = { { + .name = "Adapter ROM", + .start = 0xc8000, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}, { + .name = "Adapter ROM", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}, { + .name = "Adapter ROM", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}, { + .name = "Adapter ROM", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}, { + .name = "Adapter ROM", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}, { + .name = "Adapter ROM", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +} }; + +static struct resource video_rom_resource = { + .name = "Video ROM", + .start = 0xc0000, + .end = 0xc7fff, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}; + +/* does this oprom support the given pci device, or any of the devices + * that the driver supports? + */ +static bool match_id(struct pci_dev *pdev, unsigned short vendor, unsigned short device) +{ + struct pci_driver *drv = pdev->driver; + const struct pci_device_id *id; + + if (pdev->vendor == vendor && pdev->device == device) + return true; + + for (id = drv ? drv->id_table : NULL; id && id->vendor; id++) + if (id->vendor == vendor && id->device == device) + break; + + return id && id->vendor; +} + +static bool probe_list(struct pci_dev *pdev, unsigned short vendor, + const unsigned char *rom_list) +{ + unsigned short device; + + do { + if (probe_kernel_address(rom_list, device) != 0) + device = 0; + + if (device && match_id(pdev, vendor, device)) + break; + + rom_list += 2; + } while (device); + + return !!device; +} + +static struct resource *find_oprom(struct pci_dev *pdev) +{ + struct resource *oprom = NULL; + int i; + + for (i = 0; i < ARRAY_SIZE(adapter_rom_resources); i++) { + struct resource *res = &adapter_rom_resources[i]; + unsigned short offset, vendor, device, list, rev; + const unsigned char *rom; + + if (res->end == 0) + break; + + rom = isa_bus_to_virt(res->start); + if (probe_kernel_address(rom + 0x18, offset) != 0) + continue; + + if (probe_kernel_address(rom + offset + 0x4, vendor) != 0) + continue; + + if (probe_kernel_address(rom + offset + 0x6, device) != 0) + continue; + + if (match_id(pdev, vendor, device)) { + oprom = res; + break; + } + + if (probe_kernel_address(rom + offset + 0x8, list) == 0 && + probe_kernel_address(rom + offset + 0xc, rev) == 0 && + rev >= 3 && list && + probe_list(pdev, vendor, rom + offset + list)) { + oprom = res; + break; + } + } + + return oprom; +} + +void *pci_map_biosrom(struct pci_dev *pdev) +{ + struct resource *oprom = find_oprom(pdev); + + if (!oprom) + return NULL; + + return ioremap(oprom->start, resource_size(oprom)); +} +EXPORT_SYMBOL(pci_map_biosrom); + +void pci_unmap_biosrom(void __iomem *image) +{ + iounmap(image); +} +EXPORT_SYMBOL(pci_unmap_biosrom); + +size_t pci_biosrom_size(struct pci_dev *pdev) +{ + struct resource *oprom = find_oprom(pdev); + + return oprom ? resource_size(oprom) : 0; +} +EXPORT_SYMBOL(pci_biosrom_size); + +#define ROMSIGNATURE 0xaa55 + +static int __init romsignature(const unsigned char *rom) +{ + const unsigned short * const ptr = (const unsigned short *)rom; + unsigned short sig; + + return probe_kernel_address(ptr, sig) == 0 && sig == ROMSIGNATURE; +} + +static int __init romchecksum(const unsigned char *rom, unsigned long length) +{ + unsigned char sum, c; + + for (sum = 0; length && probe_kernel_address(rom++, c) == 0; length--) + sum += c; + return !length && !sum; +} + +void __init probe_roms(void) +{ + const unsigned char *rom; + unsigned long start, length, upper; + unsigned char c; + int i; + + /* video rom */ + upper = adapter_rom_resources[0].start; + for (start = video_rom_resource.start; start < upper; start += 2048) { + rom = isa_bus_to_virt(start); + if (!romsignature(rom)) + continue; + + video_rom_resource.start = start; + + if (probe_kernel_address(rom + 2, c) != 0) + continue; + + /* 0 < length <= 0x7f * 512, historically */ + length = c * 512; + + /* if checksum okay, trust length byte */ + if (length && romchecksum(rom, length)) + video_rom_resource.end = start + length - 1; + + request_resource(&iomem_resource, &video_rom_resource); + break; + } + + start = (video_rom_resource.end + 1 + 2047) & ~2047UL; + if (start < upper) + start = upper; + + /* system rom */ + request_resource(&iomem_resource, &system_rom_resource); + upper = system_rom_resource.start; + + /* check for extension rom (ignore length byte!) */ + rom = isa_bus_to_virt(extension_rom_resource.start); + if (romsignature(rom)) { + length = extension_rom_resource.end - extension_rom_resource.start + 1; + if (romchecksum(rom, length)) { + request_resource(&iomem_resource, &extension_rom_resource); + upper = extension_rom_resource.start; + } + } + + /* check for adapter roms on 2k boundaries */ + for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; start += 2048) { + rom = isa_bus_to_virt(start); + if (!romsignature(rom)) + continue; + + if (probe_kernel_address(rom + 2, c) != 0) + continue; + + /* 0 < length <= 0x7f * 512, historically */ + length = c * 512; + + /* but accept any length that fits if checksum okay */ + if (!length || start + length > upper || !romchecksum(rom, length)) + continue; + + adapter_rom_resources[i].start = start; + adapter_rom_resources[i].end = start + length - 1; + request_resource(&iomem_resource, &adapter_rom_resources[i]); + + start = adapter_rom_resources[i++].end & ~2047UL; + } +} + diff --git a/arch/x86/kernel/probe_roms_32.c b/arch/x86/kernel/probe_roms_32.c deleted file mode 100644 index 071e7fe..0000000 --- a/arch/x86/kernel/probe_roms_32.c +++ /dev/null @@ -1,166 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -#include -#include -#include -#include -#include -#include - -static struct resource system_rom_resource = { - .name = "System ROM", - .start = 0xf0000, - .end = 0xfffff, - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM -}; - -static struct resource extension_rom_resource = { - .name = "Extension ROM", - .start = 0xe0000, - .end = 0xeffff, - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM -}; - -static struct resource adapter_rom_resources[] = { { - .name = "Adapter ROM", - .start = 0xc8000, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM -}, { - .name = "Adapter ROM", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM -}, { - .name = "Adapter ROM", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM -}, { - .name = "Adapter ROM", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM -}, { - .name = "Adapter ROM", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM -}, { - .name = "Adapter ROM", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM -} }; - -static struct resource video_rom_resource = { - .name = "Video ROM", - .start = 0xc0000, - .end = 0xc7fff, - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM -}; - -#define ROMSIGNATURE 0xaa55 - -static int __init romsignature(const unsigned char *rom) -{ - const unsigned short * const ptr = (const unsigned short *)rom; - unsigned short sig; - - return probe_kernel_address(ptr, sig) == 0 && sig == ROMSIGNATURE; -} - -static int __init romchecksum(const unsigned char *rom, unsigned long length) -{ - unsigned char sum, c; - - for (sum = 0; length && probe_kernel_address(rom++, c) == 0; length--) - sum += c; - return !length && !sum; -} - -void __init probe_roms(void) -{ - const unsigned char *rom; - unsigned long start, length, upper; - unsigned char c; - int i; - - /* video rom */ - upper = adapter_rom_resources[0].start; - for (start = video_rom_resource.start; start < upper; start += 2048) { - rom = isa_bus_to_virt(start); - if (!romsignature(rom)) - continue; - - video_rom_resource.start = start; - - if (probe_kernel_address(rom + 2, c) != 0) - continue; - - /* 0 < length <= 0x7f * 512, historically */ - length = c * 512; - - /* if checksum okay, trust length byte */ - if (length && romchecksum(rom, length)) - video_rom_resource.end = start + length - 1; - - request_resource(&iomem_resource, &video_rom_resource); - break; - } - - start = (video_rom_resource.end + 1 + 2047) & ~2047UL; - if (start < upper) - start = upper; - - /* system rom */ - request_resource(&iomem_resource, &system_rom_resource); - upper = system_rom_resource.start; - - /* check for extension rom (ignore length byte!) */ - rom = isa_bus_to_virt(extension_rom_resource.start); - if (romsignature(rom)) { - length = extension_rom_resource.end - extension_rom_resource.start + 1; - if (romchecksum(rom, length)) { - request_resource(&iomem_resource, &extension_rom_resource); - upper = extension_rom_resource.start; - } - } - - /* check for adapter roms on 2k boundaries */ - for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; start += 2048) { - rom = isa_bus_to_virt(start); - if (!romsignature(rom)) - continue; - - if (probe_kernel_address(rom + 2, c) != 0) - continue; - - /* 0 < length <= 0x7f * 512, historically */ - length = c * 512; - - /* but accept any length that fits if checksum okay */ - if (!length || start + length > upper || !romchecksum(rom, length)) - continue; - - adapter_rom_resources[i].start = start; - adapter_rom_resources[i].end = start + length - 1; - request_resource(&iomem_resource, &adapter_rom_resources[i]); - - start = adapter_rom_resources[i++].end & ~2047UL; - } -} - diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index c11514e9..6eee082 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -35,7 +35,7 @@ void iommu_shutdown_noop(void) { } struct x86_init_ops x86_init __initdata = { .resources = { - .probe_roms = x86_init_noop, + .probe_roms = probe_roms, .reserve_resources = reserve_standard_io_resources, .memory_setup = default_machine_specific_memory_setup, }, -- cgit v1.1 From 9f1f1bfd8d7e579f07dbe56d6f93bd594da43b3d Mon Sep 17 00:00:00 2001 From: Rakib Mullick Date: Wed, 23 Mar 2011 21:31:40 +0600 Subject: x86, mpparse: Remove unnecessary variable 'ret' isn't used by check_slot(), gets initialized but has no real use, so remove it. Signed-off-by: Rakib Mullick LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/mpparse.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 6f789a8..ef32d4c 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -722,14 +722,12 @@ inline void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) {} static int check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length, int count) { - int ret = 0; - if (!mpc_new_phys || count <= mpc_new_length) { WARN(1, "update_mptable: No spare slots (length: %x)\n", count); return -1; } - return ret; + return 0; } static int __init replace_intsrc_all(struct mpc_table *mpc, -- cgit v1.1 From 1d321881afb955bba1e0a8f50d3a04e111fcb581 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Mon, 28 Mar 2011 00:46:11 +0400 Subject: perf, x86: P4 PMU - clean up the code a bit No change on the functional level, just align the table properly. Signed-off-by: Cyrill Gorcunov Cc: Lin Ming LKML-Reference: <4D8FA213.5050108@openvz.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_p4.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c index c2520e1..8ff882f 100644 --- a/arch/x86/kernel/cpu/perf_event_p4.c +++ b/arch/x86/kernel/cpu/perf_event_p4.c @@ -468,7 +468,7 @@ static struct p4_event_bind p4_event_bind_map[] = { .opcode = P4_OPCODE(P4_EVENT_MISPRED_BRANCH_RETIRED), .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 }, .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_MISPRED_BRANCH_RETIRED, NBOGUS), + P4_ESCR_EMASK_BIT(P4_EVENT_MISPRED_BRANCH_RETIRED, NBOGUS), .cntr = { {12, 13, 16}, {14, 15, 17} }, }, [P4_EVENT_X87_ASSIST] = { -- cgit v1.1 From 349c004e3d31fda23ad225b61861be38047fff16 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sat, 12 Mar 2011 12:50:10 +0100 Subject: x86: A fast way to check capabilities of the current cpu Add this_cpu_has() which determines if the current cpu has a certain ability using a segment prefix and a bit test operation. For that we need to add bit operations to x86s percpu.h. Many uses of cpu_has use a pointer passed to a function to determine the current flags. That is no longer necessary after this patch. However, this patch only converts the straightforward cases where cpu_has is used with this_cpu_ptr. The rest is work for later. -tj: Rolled up patch to add x86_ prefix and use percpu_read() instead of percpu_read_stable(). Signed-off-by: Christoph Lameter Acked-by: Tejun Heo Signed-off-by: Tejun Heo --- arch/x86/include/asm/cpufeature.h | 13 +++++++++---- arch/x86/include/asm/percpu.h | 27 +++++++++++++++++++++++++++ arch/x86/kernel/apic/apic.c | 2 +- arch/x86/kernel/process.c | 4 ++-- arch/x86/kernel/smpboot.c | 4 ++-- 5 files changed, 41 insertions(+), 9 deletions(-) diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 91f3e087..50c0d30 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -207,8 +207,7 @@ extern const char * const x86_power_flags[32]; #define test_cpu_cap(c, bit) \ test_bit(bit, (unsigned long *)((c)->x86_capability)) -#define cpu_has(c, bit) \ - (__builtin_constant_p(bit) && \ +#define REQUIRED_MASK_BIT_SET(bit) \ ( (((bit)>>5)==0 && (1UL<<((bit)&31) & REQUIRED_MASK0)) || \ (((bit)>>5)==1 && (1UL<<((bit)&31) & REQUIRED_MASK1)) || \ (((bit)>>5)==2 && (1UL<<((bit)&31) & REQUIRED_MASK2)) || \ @@ -218,10 +217,16 @@ extern const char * const x86_power_flags[32]; (((bit)>>5)==6 && (1UL<<((bit)&31) & REQUIRED_MASK6)) || \ (((bit)>>5)==7 && (1UL<<((bit)&31) & REQUIRED_MASK7)) || \ (((bit)>>5)==8 && (1UL<<((bit)&31) & REQUIRED_MASK8)) || \ - (((bit)>>5)==9 && (1UL<<((bit)&31) & REQUIRED_MASK9)) ) \ - ? 1 : \ + (((bit)>>5)==9 && (1UL<<((bit)&31) & REQUIRED_MASK9)) ) + +#define cpu_has(c, bit) \ + (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \ test_cpu_cap(c, bit)) +#define this_cpu_has(bit) \ + (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \ + x86_this_cpu_test_bit(bit, (unsigned long *)&cpu_info.x86_capability)) + #define boot_cpu_has(bit) cpu_has(&boot_cpu_data, bit) #define set_cpu_cap(c, bit) set_bit(bit, (unsigned long *)((c)->x86_capability)) diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index d475b43..76042d9 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -542,6 +542,33 @@ do { \ old__; \ }) +static __always_inline int x86_this_cpu_constant_test_bit(unsigned int nr, + const unsigned long __percpu *addr) +{ + unsigned long __percpu *a = (unsigned long *)addr + nr / BITS_PER_LONG; + + return ((1UL << (nr % BITS_PER_LONG)) & percpu_read(*a)) != 0; +} + +static inline int x86_this_cpu_variable_test_bit(int nr, + const unsigned long __percpu *addr) +{ + int oldbit; + + asm volatile("bt "__percpu_arg(2)",%1\n\t" + "sbb %0,%0" + : "=r" (oldbit) + : "m" (*(unsigned long *)addr), "Ir" (nr)); + + return oldbit; +} + +#define x86_this_cpu_test_bit(nr, addr) \ + (__builtin_constant_p((nr)) \ + ? x86_this_cpu_constant_test_bit((nr), (addr)) \ + : x86_this_cpu_variable_test_bit((nr), (addr))) + + #include /* We can use this directly for local CPU (faster). */ diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index fabf01e..2bc503b 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -505,7 +505,7 @@ static void __cpuinit setup_APIC_timer(void) { struct clock_event_device *levt = &__get_cpu_var(lapic_events); - if (cpu_has(__this_cpu_ptr(&cpu_info), X86_FEATURE_ARAT)) { + if (this_cpu_has(X86_FEATURE_ARAT)) { lapic_clockevent.features &= ~CLOCK_EVT_FEAT_C3STOP; /* Make LAPIC timer preferrable over percpu HPET */ lapic_clockevent.rating = 150; diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index d46cbe4..88a90a9 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -449,7 +449,7 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait); void mwait_idle_with_hints(unsigned long ax, unsigned long cx) { if (!need_resched()) { - if (cpu_has(__this_cpu_ptr(&cpu_info), X86_FEATURE_CLFLUSH_MONITOR)) + if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR)) clflush((void *)¤t_thread_info()->flags); __monitor((void *)¤t_thread_info()->flags, 0, 0); @@ -465,7 +465,7 @@ static void mwait_idle(void) if (!need_resched()) { trace_power_start(POWER_CSTATE, 1, smp_processor_id()); trace_cpu_idle(1, smp_processor_id()); - if (cpu_has(__this_cpu_ptr(&cpu_info), X86_FEATURE_CLFLUSH_MONITOR)) + if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR)) clflush((void *)¤t_thread_info()->flags); __monitor((void *)¤t_thread_info()->flags, 0, 0); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index c2871d3..a3c430b 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1332,9 +1332,9 @@ static inline void mwait_play_dead(void) void *mwait_ptr; struct cpuinfo_x86 *c = __this_cpu_ptr(&cpu_info); - if (!(cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c))) + if (!this_cpu_has(X86_FEATURE_MWAIT) && mwait_usable(c)) return; - if (!cpu_has(__this_cpu_ptr(&cpu_info), X86_FEATURE_CLFLSH)) + if (!this_cpu_has(X86_FEATURE_CLFLSH)) return; if (__this_cpu_read(cpu_info.cpuid_level) < CPUID_MWAIT_LEAF) return; -- cgit v1.1 From fe5042138b6fc60edde3b60025078884c2eb71ac Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sat, 12 Mar 2011 12:50:46 +0100 Subject: x86: Use this_cpu_has for thermal_interrupt current cpu It is more effective to use a segment prefix instead of calculating the address of the current cpu area amd then testing flags. Signed-off-by: Christoph Lameter Acked-by: Tejun Heo Signed-off-by: Tejun Heo --- arch/x86/kernel/cpu/mcheck/therm_throt.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 6f8c5e9..6b0f4cd 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -355,7 +355,6 @@ static void notify_thresholds(__u64 msr_val) static void intel_thermal_interrupt(void) { __u64 msr_val; - struct cpuinfo_x86 *c = &cpu_data(smp_processor_id()); rdmsrl(MSR_IA32_THERM_STATUS, msr_val); @@ -367,19 +366,19 @@ static void intel_thermal_interrupt(void) CORE_LEVEL) != 0) mce_log_therm_throt_event(CORE_THROTTLED | msr_val); - if (cpu_has(c, X86_FEATURE_PLN)) + if (this_cpu_has(X86_FEATURE_PLN)) if (therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT, POWER_LIMIT_EVENT, CORE_LEVEL) != 0) mce_log_therm_throt_event(CORE_POWER_LIMIT | msr_val); - if (cpu_has(c, X86_FEATURE_PTS)) { + if (this_cpu_has(X86_FEATURE_PTS)) { rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val); if (therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT, THERMAL_THROTTLING_EVENT, PACKAGE_LEVEL) != 0) mce_log_therm_throt_event(PACKAGE_THROTTLED | msr_val); - if (cpu_has(c, X86_FEATURE_PLN)) + if (this_cpu_has(X86_FEATURE_PLN)) if (therm_throt_process(msr_val & PACKAGE_THERM_STATUS_POWER_LIMIT, POWER_LIMIT_EVENT, -- cgit v1.1 From 9d42a53e0f46505b39494041d514372235817e15 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sat, 12 Mar 2011 12:51:12 +0100 Subject: acpi throttling: Use this_cpu_has and simplify code current cpu With the this_cpu_xx we no longer need to pass an acpi structure to the msr management code. Simplifies code and improves performance. NOTE: This code is x86 specific (see #ifdef CONFIG_X86) but not under arch/x86. Signed-off-by: Christoph Lameter Acked-by: Tejun Heo Signed-off-by: Tejun Heo --- drivers/acpi/processor_throttling.c | 32 ++++++++++---------------------- 1 file changed, 10 insertions(+), 22 deletions(-) diff --git a/drivers/acpi/processor_throttling.c b/drivers/acpi/processor_throttling.c index fa84e97..6f2c6d9 100644 --- a/drivers/acpi/processor_throttling.c +++ b/drivers/acpi/processor_throttling.c @@ -710,20 +710,14 @@ static int acpi_processor_get_throttling_fadt(struct acpi_processor *pr) } #ifdef CONFIG_X86 -static int acpi_throttling_rdmsr(struct acpi_processor *pr, - u64 *value) +static int acpi_throttling_rdmsr(u64 *value) { - struct cpuinfo_x86 *c; u64 msr_high, msr_low; - unsigned int cpu; u64 msr = 0; int ret = -1; - cpu = pr->id; - c = &cpu_data(cpu); - - if ((c->x86_vendor != X86_VENDOR_INTEL) || - !cpu_has(c, X86_FEATURE_ACPI)) { + if ((this_cpu_read(cpu_info.x86_vendor) != X86_VENDOR_INTEL) || + !this_cpu_has(X86_FEATURE_ACPI)) { printk(KERN_ERR PREFIX "HARDWARE addr space,NOT supported yet\n"); } else { @@ -738,18 +732,13 @@ static int acpi_throttling_rdmsr(struct acpi_processor *pr, return ret; } -static int acpi_throttling_wrmsr(struct acpi_processor *pr, u64 value) +static int acpi_throttling_wrmsr(u64 value) { - struct cpuinfo_x86 *c; - unsigned int cpu; int ret = -1; u64 msr; - cpu = pr->id; - c = &cpu_data(cpu); - - if ((c->x86_vendor != X86_VENDOR_INTEL) || - !cpu_has(c, X86_FEATURE_ACPI)) { + if ((this_cpu_read(cpu_info.x86_vendor) != X86_VENDOR_INTEL) || + !this_cpu_has(X86_FEATURE_ACPI)) { printk(KERN_ERR PREFIX "HARDWARE addr space,NOT supported yet\n"); } else { @@ -761,15 +750,14 @@ static int acpi_throttling_wrmsr(struct acpi_processor *pr, u64 value) return ret; } #else -static int acpi_throttling_rdmsr(struct acpi_processor *pr, - u64 *value) +static int acpi_throttling_rdmsr(u64 *value) { printk(KERN_ERR PREFIX "HARDWARE addr space,NOT supported yet\n"); return -1; } -static int acpi_throttling_wrmsr(struct acpi_processor *pr, u64 value) +static int acpi_throttling_wrmsr(u64 value) { printk(KERN_ERR PREFIX "HARDWARE addr space,NOT supported yet\n"); @@ -801,7 +789,7 @@ static int acpi_read_throttling_status(struct acpi_processor *pr, ret = 0; break; case ACPI_ADR_SPACE_FIXED_HARDWARE: - ret = acpi_throttling_rdmsr(pr, value); + ret = acpi_throttling_rdmsr(value); break; default: printk(KERN_ERR PREFIX "Unknown addr space %d\n", @@ -834,7 +822,7 @@ static int acpi_write_throttling_state(struct acpi_processor *pr, ret = 0; break; case ACPI_ADR_SPACE_FIXED_HARDWARE: - ret = acpi_throttling_wrmsr(pr, value); + ret = acpi_throttling_wrmsr(value); break; default: printk(KERN_ERR PREFIX "Unknown addr space %d\n", -- cgit v1.1 From cd25f8bc2696664877b21d33b7994e12fa570919 Mon Sep 17 00:00:00 2001 From: Lin Ming Date: Fri, 25 Mar 2011 16:27:48 +0800 Subject: perf probe: Add fastpath to do lookup by function name v3 -> v2: - Make pubname_search_cb more generic - Add fastpath to find_probes also v2 -> v1: - Don't compare file names with cu_find_realpath(...), instead, compare them with the name returned by dwarf_decl_file(sp_die) The vmlinux file may have thousands of CUs. We can lookup function name from .debug_pubnames section to avoid the slow loop on CUs. 1. Improvement data for find_line_range ./perf stat -e cycles -r 10 -- ./perf probe -k /home/mlin/vmlinux \ -s /home/mlin/linux-2.6 \ --line csum_partial_copy_to_user > tmp.log before patch applied ===================== 847,988,276 cycles 0.355075856 seconds time elapsed after patch applied ===================== 206,102,622 cycles 0.086883555 seconds time elapsed 2. Improvement data for find_probes ./perf stat -e cycles -r 10 -- ./perf probe -k /home/mlin/vmlinux \ -s /home/mlin/linux-2.6 \ --vars csum_partial_copy_to_user > tmp.log before patch applied ===================== 848,490,844 cycles 0.355307901 seconds time elapsed after patch applied ===================== 205,684,469 cycles 0.086694010 seconds time elapsed Acked-by: Masami Hiramatsu Cc: Ingo Molnar Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: linux-kernel LKML-Reference: <1301041668.14111.52.camel@minggr.sh.intel.com> Signed-off-by: Lin Ming Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/probe-finder.c | 72 ++++++++++++++++++++++++++++++++++++++++++ tools/perf/util/probe-finder.h | 2 ++ 2 files changed, 74 insertions(+) diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c index 194f9e2..ff416b8 100644 --- a/tools/perf/util/probe-finder.c +++ b/tools/perf/util/probe-finder.c @@ -1435,6 +1435,38 @@ static int find_probe_point_by_func(struct probe_finder *pf) return _param.retval; } +struct pubname_callback_param { + char *function; + char *file; + Dwarf_Die *cu_die; + Dwarf_Die *sp_die; + int found; +}; + +static int pubname_search_cb(Dwarf *dbg, Dwarf_Global *gl, void *data) +{ + struct pubname_callback_param *param = data; + + if (dwarf_offdie(dbg, gl->die_offset, param->sp_die)) { + if (dwarf_tag(param->sp_die) != DW_TAG_subprogram) + return DWARF_CB_OK; + + if (die_compare_name(param->sp_die, param->function)) { + if (!dwarf_offdie(dbg, gl->cu_offset, param->cu_die)) + return DWARF_CB_OK; + + if (param->file && + strtailcmp(param->file, dwarf_decl_file(param->sp_die))) + return DWARF_CB_OK; + + param->found = 1; + return DWARF_CB_ABORT; + } + } + + return DWARF_CB_OK; +} + /* Find probe points from debuginfo */ static int find_probes(int fd, struct probe_finder *pf) { @@ -1461,6 +1493,27 @@ static int find_probes(int fd, struct probe_finder *pf) off = 0; line_list__init(&pf->lcache); + + /* Fastpath: lookup by function name from .debug_pubnames section */ + if (pp->function) { + struct pubname_callback_param pubname_param = { + .function = pp->function, + .file = pp->file, + .cu_die = &pf->cu_die, + .sp_die = &pf->sp_die, + }; + struct dwarf_callback_param probe_param = { + .data = pf, + }; + + dwarf_getpubnames(dbg, pubname_search_cb, &pubname_param, 0); + if (pubname_param.found) { + ret = probe_point_search_cb(&pf->sp_die, &probe_param); + if (ret) + goto found; + } + } + /* Loop on CUs (Compilation Unit) */ while (!dwarf_nextcu(dbg, off, &noff, &cuhl, NULL, NULL, NULL)) { /* Get the DIE(Debugging Information Entry) of this CU */ @@ -1488,6 +1541,8 @@ static int find_probes(int fd, struct probe_finder *pf) } off = noff; } + +found: line_list__free(&pf->lcache); if (dwfl) dwfl_end(dwfl); @@ -1895,6 +1950,22 @@ int find_line_range(int fd, struct line_range *lr) return -EBADF; } + /* Fastpath: lookup by function name from .debug_pubnames section */ + if (lr->function) { + struct pubname_callback_param pubname_param = { + .function = lr->function, .file = lr->file, + .cu_die = &lf.cu_die, .sp_die = &lf.sp_die, .found = 0}; + struct dwarf_callback_param line_range_param = { + .data = (void *)&lf, .retval = 0}; + + dwarf_getpubnames(dbg, pubname_search_cb, &pubname_param, 0); + if (pubname_param.found) { + line_range_search_cb(&lf.sp_die, &line_range_param); + if (lf.found) + goto found; + } + } + /* Loop on CUs (Compilation Unit) */ while (!lf.found && ret >= 0) { if (dwarf_nextcu(dbg, off, &noff, &cuhl, NULL, NULL, NULL) != 0) @@ -1923,6 +1994,7 @@ int find_line_range(int fd, struct line_range *lr) off = noff; } +found: /* Store comp_dir */ if (lf.found) { comp_dir = cu_get_comp_dir(&lf.cu_die); diff --git a/tools/perf/util/probe-finder.h b/tools/perf/util/probe-finder.h index beaefc3..605730a 100644 --- a/tools/perf/util/probe-finder.h +++ b/tools/perf/util/probe-finder.h @@ -49,6 +49,7 @@ struct probe_finder { Dwarf_Addr addr; /* Address */ const char *fname; /* Real file name */ Dwarf_Die cu_die; /* Current CU */ + Dwarf_Die sp_die; struct list_head lcache; /* Line cache for lazy match */ /* For variable searching */ @@ -83,6 +84,7 @@ struct line_finder { int lno_s; /* Start line number */ int lno_e; /* End line number */ Dwarf_Die cu_die; /* Current CU */ + Dwarf_Die sp_die; int found; }; -- cgit v1.1 From a7f3c8f1da1050ad778c3a3930f07c63a5ec570b Mon Sep 17 00:00:00 2001 From: Daniel Kiper Date: Mon, 28 Mar 2011 11:32:31 +0200 Subject: xen/balloon: Use PageHighMem() for high memory page detection Replace pfn < max_low_pfn by !PageHighMem() in increase_reservation(). It makes more clearer what is going on. Acked-by: Ian Campbell Acked-by: Daniel De Graaf Signed-off-by: Daniel Kiper Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/balloon.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index 043af8a..61665b2 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -246,7 +246,7 @@ static enum bp_state increase_reservation(unsigned long nr_pages) set_phys_to_machine(pfn, frame_list[i]); /* Link back into the page tables if not highmem. */ - if (!xen_hvm_domain() && pfn < max_low_pfn) { + if (!xen_hvm_domain() && !PageHighMem(page)) { int ret; ret = HYPERVISOR_update_va_mapping( (unsigned long)__va(pfn << PAGE_SHIFT), -- cgit v1.1 From 4dfe22f5f24345511c378272189b7504d67767fb Mon Sep 17 00:00:00 2001 From: Daniel Kiper Date: Mon, 28 Mar 2011 11:33:18 +0200 Subject: xen/balloon: Simplify HVM integration Simplify HVM integration proposed by Stefano Stabellini in 53d5522cad291a0e93a385e0594b6aea6b54a071. Acked-by: Ian Campbell Acked-by: Daniel De Graaf Signed-off-by: Daniel Kiper Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/balloon.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index 61665b2..42a0ba0 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -246,7 +246,7 @@ static enum bp_state increase_reservation(unsigned long nr_pages) set_phys_to_machine(pfn, frame_list[i]); /* Link back into the page tables if not highmem. */ - if (!xen_hvm_domain() && !PageHighMem(page)) { + if (xen_pv_domain() && !PageHighMem(page)) { int ret; ret = HYPERVISOR_update_va_mapping( (unsigned long)__va(pfn << PAGE_SHIFT), @@ -293,7 +293,7 @@ static enum bp_state decrease_reservation(unsigned long nr_pages, gfp_t gfp) scrub_page(page); - if (!xen_hvm_domain() && !PageHighMem(page)) { + if (xen_pv_domain() && !PageHighMem(page)) { ret = HYPERVISOR_update_va_mapping( (unsigned long)__va(pfn << PAGE_SHIFT), __pte_ma(0), 0); @@ -429,7 +429,7 @@ EXPORT_SYMBOL(free_xenballooned_pages); static int __init balloon_init(void) { - unsigned long pfn, nr_pages, extra_pfn_end; + unsigned long pfn, extra_pfn_end; struct page *page; if (!xen_domain()) @@ -437,11 +437,7 @@ static int __init balloon_init(void) pr_info("xen/balloon: Initialising balloon driver.\n"); - if (xen_pv_domain()) - nr_pages = xen_start_info->nr_pages; - else - nr_pages = max_pfn; - balloon_stats.current_pages = min(nr_pages, max_pfn); + balloon_stats.current_pages = xen_pv_domain() ? min(xen_start_info->nr_pages, max_pfn) : max_pfn; balloon_stats.target_pages = balloon_stats.current_pages; balloon_stats.balloon_low = 0; balloon_stats.balloon_high = 0; -- cgit v1.1 From 83be7e52d46a5b3a9955a38a9597bf1de1851ea7 Mon Sep 17 00:00:00 2001 From: Daniel Kiper Date: Mon, 28 Mar 2011 11:34:10 +0200 Subject: xen/balloon: Clarify credit calculation Move credit calculation to current_target() and rename it to current_credit(). Acked-by: Ian Campbell Acked-by: Daniel De Graaf Signed-off-by: Daniel Kiper Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/balloon.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index 42a0ba0..a6d8e59 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -193,7 +193,7 @@ static enum bp_state update_schedule(enum bp_state state) return BP_EAGAIN; } -static unsigned long current_target(void) +static long current_credit(void) { unsigned long target = balloon_stats.target_pages; @@ -202,7 +202,7 @@ static unsigned long current_target(void) balloon_stats.balloon_low + balloon_stats.balloon_high); - return target; + return target - balloon_stats.current_pages; } static enum bp_state increase_reservation(unsigned long nr_pages) @@ -337,7 +337,7 @@ static void balloon_process(struct work_struct *work) mutex_lock(&balloon_mutex); do { - credit = current_target() - balloon_stats.current_pages; + credit = current_credit(); if (credit > 0) state = increase_reservation(credit); @@ -420,7 +420,7 @@ void free_xenballooned_pages(int nr_pages, struct page** pages) } /* The balloon may be too large now. Shrink it if needed. */ - if (current_target() != balloon_stats.current_pages) + if (current_credit()) schedule_delayed_work(&balloon_worker, 0); mutex_unlock(&balloon_mutex); -- cgit v1.1 From 09ca132a8e469f87504899b4016c7517511887d0 Mon Sep 17 00:00:00 2001 From: Daniel Kiper Date: Mon, 28 Mar 2011 11:35:59 +0200 Subject: xen/balloon: Move dec_totalhigh_pages() from __balloon_append() to balloon_append() git commit 9be4d4575906af9698de660e477f949a076c87e1 (xen: add extra pages to balloon) splited balloon_append() into two functions (balloon_append() and __balloon_append()) and left decrementation of totalram_pages counter in __balloon_append(). In this situation if __balloon_append() is called on i386 with highmem page referenced then totalhigh_pages is decremented, however, it should not. This patch corrects that issue and moves dec_totalhigh_pages() from __balloon_append() to balloon_append(). Now totalram_pages and totalhigh_pages are decremented simultaneously only when balloon_append() is called. Acked-by: Ian Campbell Acked-by: Daniel De Graaf Signed-off-by: Daniel Kiper Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/balloon.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index a6d8e59..f54290b 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -114,7 +114,6 @@ static void __balloon_append(struct page *page) if (PageHighMem(page)) { list_add_tail(&page->lru, &ballooned_pages); balloon_stats.balloon_high++; - dec_totalhigh_pages(); } else { list_add(&page->lru, &ballooned_pages); balloon_stats.balloon_low++; @@ -124,6 +123,8 @@ static void __balloon_append(struct page *page) static void balloon_append(struct page *page) { __balloon_append(page); + if (PageHighMem(page)) + dec_totalhigh_pages(); totalram_pages--; } @@ -462,7 +463,7 @@ static int __init balloon_init(void) pfn < extra_pfn_end; pfn++) { page = pfn_to_page(pfn); - /* totalram_pages doesn't include the boot-time + /* totalram_pages and totalhigh_pages do not include the boot-time balloon extension, so don't subtract from it. */ __balloon_append(page); } -- cgit v1.1 From 2c9e45f7a287384e1382932597e41a9a567811ba Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 17 Mar 2011 10:03:21 -0600 Subject: perf script: If type not given fields apply to all event types Allow: perf script -f to be equivalent to: perf script -f trace: -f sw: -f hw: i.e., the specified fields apply to all event types if the type string is not given. The field (-f) arguments are processed in the order received. A later usage can reset a prior request. e.g., -f trace: -f comm,tid,time,sym The first -f suppresses trace events (field list is ""), but then the second invocation sets the fields to comm,tid,time,sym. In this case a warning is given to the user: "Overriding previous field request for all events." Alternativey, consider the order: -f comm,tid,time,sym -f trace: The first -f sets the fields for all events and the second -f suppresses trace events. The user is given a warning message about the override, and the result of the above is that only S/W and H/W events are displayed with the given fields. For the 'wildcard' option if a user selected field is invalid for an event type, a message is displayed to the user that the option is ignored for that type. For example: perf script -f comm,tid,trace 2>&1 | less 'trace' not valid for hardware events. Ignoring. 'trace' not valid for software events. Ignoring. Alternatively, if the type is given an invalid field is specified it is an error. For example: perf script -v -f sw:comm,tid,trace 2>&1 | less 'trace' not valid for software events. At this point usage is displayed, and perf-script exits. Finally, a user may not set fields to none for all event types. i.e., -f "" is not allowed. Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Paul Mackerras Cc: Peter Zijlstra Cc: linux-kernel@vger.kernel.org LPU-Reference: <1300377801-27246-1-git-send-email-daahern@cisco.com> Signed-off-by: David Ahern Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-script.c | 171 +++++++++++++++++++++++++++++++------------- 1 file changed, 122 insertions(+), 49 deletions(-) diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index ac574ea..6cf811a 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -49,23 +49,52 @@ struct output_option { }; /* default set to maintain compatibility with current format */ -static u64 output_fields[PERF_TYPE_MAX] = { - [PERF_TYPE_HARDWARE] = PERF_OUTPUT_COMM | PERF_OUTPUT_TID | \ - PERF_OUTPUT_CPU | PERF_OUTPUT_TIME | \ - PERF_OUTPUT_EVNAME | PERF_OUTPUT_SYM, - - [PERF_TYPE_SOFTWARE] = PERF_OUTPUT_COMM | PERF_OUTPUT_TID | \ - PERF_OUTPUT_CPU | PERF_OUTPUT_TIME | \ - PERF_OUTPUT_EVNAME | PERF_OUTPUT_SYM, - - [PERF_TYPE_TRACEPOINT] = PERF_OUTPUT_COMM | PERF_OUTPUT_TID | \ - PERF_OUTPUT_CPU | PERF_OUTPUT_TIME | \ - PERF_OUTPUT_EVNAME | PERF_OUTPUT_TRACE, +static struct { + bool user_set; + u64 fields; + u64 invalid_fields; +} output[PERF_TYPE_MAX] = { + + [PERF_TYPE_HARDWARE] = { + .user_set = false, + + .fields = PERF_OUTPUT_COMM | PERF_OUTPUT_TID | + PERF_OUTPUT_CPU | PERF_OUTPUT_TIME | + PERF_OUTPUT_EVNAME | PERF_OUTPUT_SYM, + + .invalid_fields = PERF_OUTPUT_TRACE, + }, + + [PERF_TYPE_SOFTWARE] = { + .user_set = false, + + .fields = PERF_OUTPUT_COMM | PERF_OUTPUT_TID | + PERF_OUTPUT_CPU | PERF_OUTPUT_TIME | + PERF_OUTPUT_EVNAME | PERF_OUTPUT_SYM, + + .invalid_fields = PERF_OUTPUT_TRACE, + }, + + [PERF_TYPE_TRACEPOINT] = { + .user_set = false, + + .fields = PERF_OUTPUT_COMM | PERF_OUTPUT_TID | + PERF_OUTPUT_CPU | PERF_OUTPUT_TIME | + PERF_OUTPUT_EVNAME | PERF_OUTPUT_TRACE, + }, }; -static bool output_set_by_user; +static bool output_set_by_user(void) +{ + int j; + for (j = 0; j < PERF_TYPE_MAX; ++j) { + if (output[j].user_set) + return true; + } + return false; +} -#define PRINT_FIELD(x) (output_fields[attr->type] & PERF_OUTPUT_##x) +#define PRINT_FIELD(x) (output[attr->type].fields & PERF_OUTPUT_##x) static int perf_session__check_attr(struct perf_session *session, struct perf_event_attr *attr) @@ -168,7 +197,7 @@ static void process_event(union perf_event *event __unused, { struct perf_event_attr *attr = &evsel->attr; - if (output_fields[attr->type] == 0) + if (output[attr->type].fields == 0) return; if (perf_session__check_attr(session, attr) < 0) @@ -451,6 +480,7 @@ static int parse_output_fields(const struct option *opt __used, { char *tok; int i, imax = sizeof(all_output_options) / sizeof(struct output_option); + int j; int rc = 0; char *str = strdup(arg); int type = -1; @@ -458,52 +488,95 @@ static int parse_output_fields(const struct option *opt __used, if (!str) return -ENOMEM; - tok = strtok(str, ":"); - if (!tok) { - fprintf(stderr, - "Invalid field string - not prepended with type."); - return -EINVAL; - } - - /* first word should state which event type user - * is specifying the fields + /* first word can state for which event type the user is specifying + * the fields. If no type exists, the specified fields apply to all + * event types found in the file minus the invalid fields for a type. */ - if (!strcmp(tok, "hw")) - type = PERF_TYPE_HARDWARE; - else if (!strcmp(tok, "sw")) - type = PERF_TYPE_SOFTWARE; - else if (!strcmp(tok, "trace")) - type = PERF_TYPE_TRACEPOINT; - else { - fprintf(stderr, "Invalid event type in field string."); - return -EINVAL; + tok = strchr(str, ':'); + if (tok) { + *tok = '\0'; + tok++; + if (!strcmp(str, "hw")) + type = PERF_TYPE_HARDWARE; + else if (!strcmp(str, "sw")) + type = PERF_TYPE_SOFTWARE; + else if (!strcmp(str, "trace")) + type = PERF_TYPE_TRACEPOINT; + else { + fprintf(stderr, "Invalid event type in field string.\n"); + return -EINVAL; + } + + if (output[type].user_set) + pr_warning("Overriding previous field request for %s events.\n", + event_type(type)); + + output[type].fields = 0; + output[type].user_set = true; + + } else { + tok = str; + if (strlen(str) == 0) { + fprintf(stderr, + "Cannot set fields to 'none' for all event types.\n"); + rc = -EINVAL; + goto out; + } + + if (output_set_by_user()) + pr_warning("Overriding previous field request for all events.\n"); + + for (j = 0; j < PERF_TYPE_MAX; ++j) { + output[j].fields = 0; + output[j].user_set = true; + } } - output_fields[type] = 0; - while (1) { - tok = strtok(NULL, ","); - if (!tok) - break; + tok = strtok(tok, ","); + while (tok) { for (i = 0; i < imax; ++i) { - if (strcmp(tok, all_output_options[i].str) == 0) { - output_fields[type] |= all_output_options[i].field; + if (strcmp(tok, all_output_options[i].str) == 0) break; - } } if (i == imax) { - fprintf(stderr, "Invalid field requested."); + fprintf(stderr, "Invalid field requested.\n"); rc = -EINVAL; - break; + goto out; } - } - if (output_fields[type] == 0) { - pr_debug("No fields requested for %s type. " - "Events will not be displayed\n", event_type(type)); + if (type == -1) { + /* add user option to all events types for + * which it is valid + */ + for (j = 0; j < PERF_TYPE_MAX; ++j) { + if (output[j].invalid_fields & all_output_options[i].field) { + pr_warning("\'%s\' not valid for %s events. Ignoring.\n", + all_output_options[i].str, event_type(j)); + } else + output[j].fields |= all_output_options[i].field; + } + } else { + if (output[type].invalid_fields & all_output_options[i].field) { + fprintf(stderr, "\'%s\' not valid for %s events.\n", + all_output_options[i].str, event_type(type)); + + rc = -EINVAL; + goto out; + } + output[type].fields |= all_output_options[i].field; + } + + tok = strtok(NULL, ","); } - output_set_by_user = true; + if (type >= 0) { + if (output[type].fields == 0) { + pr_debug("No fields requested for %s type. " + "Events will not be displayed.\n", event_type(type)); + } + } +out: free(str); return rc; } @@ -1020,7 +1093,7 @@ int cmd_script(int argc, const char **argv, const char *prefix __used) struct stat perf_stat; int input; - if (output_set_by_user) { + if (output_set_by_user()) { fprintf(stderr, "custom fields not supported for generated scripts"); return -1; -- cgit v1.1 From 176fcc5c5f0131504a55e1e1d35389c49a9177c2 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 30 Mar 2011 15:30:43 -0300 Subject: perf script: Add more documentation about the -f/--fields parameters Using the commit log for 2c9e45f. Cc: David Ahern Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Mike Galbraith Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Tom Zanussi LKML-Reference: Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-script.txt | 52 ++++++++++++++++++++++++++++++-- 1 file changed, 50 insertions(+), 2 deletions(-) diff --git a/tools/perf/Documentation/perf-script.txt b/tools/perf/Documentation/perf-script.txt index 66f040b..86c87e2 100644 --- a/tools/perf/Documentation/perf-script.txt +++ b/tools/perf/Documentation/perf-script.txt @@ -113,13 +113,61 @@ OPTIONS Do various checks like samples ordering and lost events. -f:: ---fields +--fields:: Comma separated list of fields to print. Options are: comm, tid, pid, time, cpu, event, trace, sym. Field - list must be prepended with the type, trace, sw or hw, + list can be prepended with the type, trace, sw or hw, to indicate to which event type the field list applies. e.g., -f sw:comm,tid,time,sym and -f trace:time,cpu,trace + perf script -f + + is equivalent to: + + perf script -f trace: -f sw: -f hw: + + i.e., the specified fields apply to all event types if the type string + is not given. + + The arguments are processed in the order received. A later usage can + reset a prior request. e.g.: + + -f trace: -f comm,tid,time,sym + + The first -f suppresses trace events (field list is ""), but then the + second invocation sets the fields to comm,tid,time,sym. In this case a + warning is given to the user: + + "Overriding previous field request for all events." + + Alternativey, consider the order: + + -f comm,tid,time,sym -f trace: + + The first -f sets the fields for all events and the second -f + suppresses trace events. The user is given a warning message about + the override, and the result of the above is that only S/W and H/W + events are displayed with the given fields. + + For the 'wildcard' option if a user selected field is invalid for an + event type, a message is displayed to the user that the option is + ignored for that type. For example: + + $ perf script -f comm,tid,trace + 'trace' not valid for hardware events. Ignoring. + 'trace' not valid for software events. Ignoring. + + Alternatively, if the type is given an invalid field is specified it + is an error. For example: + + perf script -v -f sw:comm,tid,trace + 'trace' not valid for software events. + + At this point usage is displayed, and perf-script exits. + + Finally, a user may not set fields to none for all event types. + i.e., -f "" is not allowed. + -k:: --vmlinux=:: vmlinux pathname -- cgit v1.1 From 052936080c8fb2f791103995b21bd4018c8df886 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 1 Apr 2011 11:15:12 +0200 Subject: x86-64, NUMA: Remove custom phys_to_nid() implementation phys_to_nid() maps physical address to NUMA node id. This is implemented by building perfect hash in compute_hash_shift() during initialization. However, with SPARSE memory model, the nid is encoded in page flags. The perfect hash implementation was for DISCONTIG memory model which got removed years ago by b263295dbf (x86: 64-bit, make sparsemem vmemmap the only memory model). So, the perfect hash ends up being used only during initialization when the core SPARSE code already provides perfectly acceptable generic early_pfn_to_nid() implementation. Drop phys_to_nid() and use the generic ealry_pfn_to_nid() instead. Signed-off-by: Tejun Heo Reviewed-by: Christoph Lameter Acked-by: Yinghai Lu Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Thomas Gleixner --- arch/x86/Kconfig | 4 -- arch/x86/include/asm/mmzone_64.h | 23 -------- arch/x86/mm/numa_64.c | 123 +-------------------------------------- 3 files changed, 1 insertion(+), 149 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index cc6c53a..b034814 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1703,10 +1703,6 @@ config ARCH_ENABLE_MEMORY_HOTREMOVE def_bool y depends on MEMORY_HOTPLUG -config HAVE_ARCH_EARLY_PFN_TO_NID - def_bool X86_64 - depends on NUMA - config USE_PERCPU_NUMA_NODE_ID def_bool y depends on NUMA diff --git a/arch/x86/include/asm/mmzone_64.h b/arch/x86/include/asm/mmzone_64.h index 288b96f..b3f88d7 100644 --- a/arch/x86/include/asm/mmzone_64.h +++ b/arch/x86/include/asm/mmzone_64.h @@ -4,36 +4,13 @@ #ifndef _ASM_X86_MMZONE_64_H #define _ASM_X86_MMZONE_64_H - #ifdef CONFIG_NUMA #include - #include -/* Simple perfect hash to map physical addresses to node numbers */ -struct memnode { - int shift; - unsigned int mapsize; - s16 *map; - s16 embedded_map[64 - 8]; -} ____cacheline_aligned; /* total size = 128 bytes */ -extern struct memnode memnode; -#define memnode_shift memnode.shift -#define memnodemap memnode.map -#define memnodemapsize memnode.mapsize - extern struct pglist_data *node_data[]; -static inline __attribute__((pure)) int phys_to_nid(unsigned long addr) -{ - unsigned nid; - VIRTUAL_BUG_ON(!memnodemap); - nid = memnodemap[addr >> memnode_shift]; - VIRTUAL_BUG_ON(nid >= MAX_NUMNODES || !node_data[nid]); - return nid; -} - #define NODE_DATA(nid) (node_data[nid]) #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index e8c00cc..3951ee6 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -28,125 +28,10 @@ EXPORT_SYMBOL(node_data); nodemask_t numa_nodes_parsed __initdata; -struct memnode memnode; - -static unsigned long __initdata nodemap_addr; -static unsigned long __initdata nodemap_size; - static struct numa_meminfo numa_meminfo __initdata; - static int numa_distance_cnt; static u8 *numa_distance; -/* - * Given a shift value, try to populate memnodemap[] - * Returns : - * 1 if OK - * 0 if memnodmap[] too small (of shift too small) - * -1 if node overlap or lost ram (shift too big) - */ -static int __init populate_memnodemap(const struct numa_meminfo *mi, int shift) -{ - unsigned long addr, end; - int i, res = -1; - - memset(memnodemap, 0xff, sizeof(s16)*memnodemapsize); - for (i = 0; i < mi->nr_blks; i++) { - addr = mi->blk[i].start; - end = mi->blk[i].end; - if (addr >= end) - continue; - if ((end >> shift) >= memnodemapsize) - return 0; - do { - if (memnodemap[addr >> shift] != NUMA_NO_NODE) - return -1; - memnodemap[addr >> shift] = mi->blk[i].nid; - addr += (1UL << shift); - } while (addr < end); - res = 1; - } - return res; -} - -static int __init allocate_cachealigned_memnodemap(void) -{ - unsigned long addr; - - memnodemap = memnode.embedded_map; - if (memnodemapsize <= ARRAY_SIZE(memnode.embedded_map)) - return 0; - - addr = 0x8000; - nodemap_size = roundup(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES); - nodemap_addr = memblock_find_in_range(addr, get_max_mapped(), - nodemap_size, L1_CACHE_BYTES); - if (nodemap_addr == MEMBLOCK_ERROR) { - printk(KERN_ERR - "NUMA: Unable to allocate Memory to Node hash map\n"); - nodemap_addr = nodemap_size = 0; - return -1; - } - memnodemap = phys_to_virt(nodemap_addr); - memblock_x86_reserve_range(nodemap_addr, nodemap_addr + nodemap_size, "MEMNODEMAP"); - - printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n", - nodemap_addr, nodemap_addr + nodemap_size); - return 0; -} - -/* - * The LSB of all start and end addresses in the node map is the value of the - * maximum possible shift. - */ -static int __init extract_lsb_from_nodes(const struct numa_meminfo *mi) -{ - int i, nodes_used = 0; - unsigned long start, end; - unsigned long bitfield = 0, memtop = 0; - - for (i = 0; i < mi->nr_blks; i++) { - start = mi->blk[i].start; - end = mi->blk[i].end; - if (start >= end) - continue; - bitfield |= start; - nodes_used++; - if (end > memtop) - memtop = end; - } - if (nodes_used <= 1) - i = 63; - else - i = find_first_bit(&bitfield, sizeof(unsigned long)*8); - memnodemapsize = (memtop >> i)+1; - return i; -} - -static int __init compute_hash_shift(const struct numa_meminfo *mi) -{ - int shift; - - shift = extract_lsb_from_nodes(mi); - if (allocate_cachealigned_memnodemap()) - return -1; - printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n", - shift); - - if (populate_memnodemap(mi, shift) != 1) { - printk(KERN_INFO "Your memory is not aligned you need to " - "rebuild your kernel with a bigger NODEMAPSIZE " - "shift=%d\n", shift); - return -1; - } - return shift; -} - -int __meminit __early_pfn_to_nid(unsigned long pfn) -{ - return phys_to_nid(pfn << PAGE_SHIFT); -} - static void * __init early_node_mem(int nodeid, unsigned long start, unsigned long end, unsigned long size, unsigned long align) @@ -270,7 +155,7 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) memblock_x86_reserve_range(nodedata_phys, nodedata_phys + pgdat_size, "NODE_DATA"); printk(KERN_INFO " NODE_DATA [%016lx - %016lx]\n", nodedata_phys, nodedata_phys + pgdat_size - 1); - nid = phys_to_nid(nodedata_phys); + nid = early_pfn_to_nid(nodedata_phys >> PAGE_SHIFT); if (nid != nodeid) printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nodeid, nid); @@ -527,12 +412,6 @@ static int __init numa_register_memblks(struct numa_meminfo *mi) if (WARN_ON(nodes_empty(node_possible_map))) return -EINVAL; - memnode_shift = compute_hash_shift(mi); - if (memnode_shift < 0) { - printk(KERN_ERR "NUMA: No NUMA node hash function found. Contact maintainer\n"); - return -EINVAL; - } - for (i = 0; i < mi->nr_blks; i++) memblock_x86_register_active_regions(mi->blk[i].nid, mi->blk[i].start >> PAGE_SHIFT, -- cgit v1.1 From 3b16651f806d35b5c404f2525fbce76afa3c9297 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 1 Apr 2011 11:15:12 +0200 Subject: x86: Clean up memory model related configs in arch/x86/Kconfig * Remove bogus dependency on ARCH_SELECT_MEMORY_MODEL from ARCH_FLATMEM_ENABLE. ENABLE configs don't interfere with SELECT_MEMORY_MODEL. They just need to indicate whether the specific memory model is supported. * Relocate HAVE_ARCH_ALLOC_REMAP, ARCH_PROC_KCORE_TEXT and ARCH_SPARSEMEM_DEFAULT so that memory model related configs are together in consistent order. Signed-off-by: Tejun Heo Reviewed-by: Christoph Lameter Cc: Ingo Molnar Cc: Yinghai Lu Cc: "H. Peter Anvin" Cc: Thomas Gleixner --- arch/x86/Kconfig | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index b034814..8db4fbf 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1223,6 +1223,10 @@ config HAVE_ARCH_BOOTMEM def_bool y depends on X86_32 && NUMA +config HAVE_ARCH_ALLOC_REMAP + def_bool y + depends on X86_32 && NUMA + config ARCH_HAVE_MEMORY_PRESENT def_bool y depends on X86_32 && DISCONTIGMEM @@ -1231,13 +1235,9 @@ config NEED_NODE_MEMMAP_SIZE def_bool y depends on X86_32 && (DISCONTIGMEM || SPARSEMEM) -config HAVE_ARCH_ALLOC_REMAP - def_bool y - depends on X86_32 && NUMA - config ARCH_FLATMEM_ENABLE def_bool y - depends on X86_32 && ARCH_SELECT_MEMORY_MODEL && !NUMA + depends on X86_32 && !NUMA config ARCH_DISCONTIGMEM_ENABLE def_bool y @@ -1247,20 +1247,16 @@ config ARCH_DISCONTIGMEM_DEFAULT def_bool y depends on NUMA && X86_32 -config ARCH_PROC_KCORE_TEXT - def_bool y - depends on X86_64 && PROC_KCORE - -config ARCH_SPARSEMEM_DEFAULT - def_bool y - depends on X86_64 - config ARCH_SPARSEMEM_ENABLE def_bool y depends on X86_64 || NUMA || (EXPERIMENTAL && X86_32) || X86_32_NON_STANDARD select SPARSEMEM_STATIC if X86_32 select SPARSEMEM_VMEMMAP_ENABLE if X86_64 +config ARCH_SPARSEMEM_DEFAULT + def_bool y + depends on X86_64 + config ARCH_SELECT_MEMORY_MODEL def_bool y depends on ARCH_SPARSEMEM_ENABLE @@ -1269,6 +1265,10 @@ config ARCH_MEMORY_PROBE def_bool X86_64 depends on MEMORY_HOTPLUG +config ARCH_PROC_KCORE_TEXT + def_bool y + depends on X86_64 && PROC_KCORE + config ILLEGAL_POINTER_VALUE hex default 0 if X86_32 -- cgit v1.1 From 711b8c87a5fe6de78e90411cb67b506babfef74a Mon Sep 17 00:00:00 2001 From: Florian Mickler Date: Mon, 4 Apr 2011 01:17:40 +0200 Subject: x86-64, NUMA: Remove unused variable In case !CONFIG_ACPI_NUMA and !CONFIG_AMD_NUMA gcc emits a warning about the unused variable ret. As that variable is in fact not needed I choose to remove it. Signed-off-by: Florian Mickler LKML-Reference: <1301843624-22364-1-git-send-email-florian@mickler.org> Signed-off-by: Tejun Heo --- arch/x86/mm/numa_64.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 3951ee6..13f5b06 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -505,17 +505,13 @@ static int __init numa_init(int (*init_func)(void)) void __init initmem_init(void) { - int ret; - if (!numa_off) { #ifdef CONFIG_ACPI_NUMA - ret = numa_init(x86_acpi_numa_init); - if (!ret) + if (!numa_init(x86_acpi_numa_init)) return; #endif #ifdef CONFIG_AMD_NUMA - ret = numa_init(amd_numa_init); - if (!ret) + if (!numa_init(amd_numa_init)) return; #endif } -- cgit v1.1 From 64d21fc194e12bdf7347019bf10325a4b3d77e7b Mon Sep 17 00:00:00 2001 From: Rakib Mullick Date: Sat, 2 Apr 2011 13:17:48 +0600 Subject: x86, mpparse: Put check_slot() into .init section check_slot() is only called from replace_intsrc_all() - which is in the .init section. So, put check_slot into the .init section as well, so it can be freed after system boot. Signed-off-by: Rakib Mullick LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/mpparse.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 5a532ce..f1b2718 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -715,7 +715,7 @@ static void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) } } -static int +static int __init check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length, int count) { int ret = 0; -- cgit v1.1 From 0588fa30db44fd2d4032b36a061c87478a43fbee Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 21 Mar 2011 22:59:21 -0400 Subject: tracing: Convert trace_printk() formats for module to const char * The trace_printk() formats for modules do not show up in the debugfs/tracing/printk_formats file. Only the formats that are for trace_printk()s that are in the kernel core. To facilitate the change to add trace_printk() formats from modules into that file as well, we need to convert the structure that holds the formats from char fmt[], into const char *fmt, and allocate them separately. Signed-off-by: Steven Rostedt --- kernel/trace/trace_printk.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index 2547d88..b8b2681 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -32,7 +32,7 @@ static DEFINE_MUTEX(btrace_mutex); struct trace_bprintk_fmt { struct list_head list; - char fmt[0]; + const char *fmt; }; static inline struct trace_bprintk_fmt *lookup_format(const char *fmt) @@ -49,6 +49,7 @@ static void hold_module_trace_bprintk_format(const char **start, const char **end) { const char **iter; + char *fmt; mutex_lock(&btrace_mutex); for (iter = start; iter < end; iter++) { @@ -58,14 +59,18 @@ void hold_module_trace_bprintk_format(const char **start, const char **end) continue; } - tb_fmt = kmalloc(offsetof(struct trace_bprintk_fmt, fmt) - + strlen(*iter) + 1, GFP_KERNEL); - if (tb_fmt) { + tb_fmt = kmalloc(sizeof(*tb_fmt), GFP_KERNEL); + if (tb_fmt) + fmt = kmalloc(strlen(*iter) + 1, GFP_KERNEL); + if (tb_fmt && fmt) { list_add_tail(&tb_fmt->list, &trace_bprintk_fmt_list); - strcpy(tb_fmt->fmt, *iter); + strcpy(fmt, *iter); + tb_fmt->fmt = fmt; *iter = tb_fmt->fmt; - } else + } else { + kfree(tb_fmt); *iter = NULL; + } } mutex_unlock(&btrace_mutex); } -- cgit v1.1 From 1813dc3776c22ad4b0294a6df8434b9a02c98109 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 21 Mar 2011 23:36:31 -0400 Subject: tracing: Print trace_bprintk() formats for modules too The file debugfs/tracing/printk_formats maps the addresses to the formats that are used by trace_bprintk() so that userspace tools can read the buffer and be able to decode trace_bprintk events to get the format saved when reading the ring buffer directly. This is because trace_bprintk() does not store the format into the buffer, but just the address of the format, which is hidden in the kernel memory. But currently it only exports trace_bprintk()s from the kernel core and not for modules. The modules need their formats exported as well. Signed-off-by: Steven Rostedt --- kernel/trace/trace_printk.c | 103 +++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 97 insertions(+), 6 deletions(-) diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index b8b2681..dff763b 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -89,6 +89,76 @@ static int module_trace_bprintk_format_notify(struct notifier_block *self, return 0; } +/* + * The debugfs/tracing/printk_formats file maps the addresses with + * the ASCII formats that are used in the bprintk events in the + * buffer. For userspace tools to be able to decode the events from + * the buffer, they need to be able to map the address with the format. + * + * The addresses of the bprintk formats are in their own section + * __trace_printk_fmt. But for modules we copy them into a link list. + * The code to print the formats and their addresses passes around the + * address of the fmt string. If the fmt address passed into the seq + * functions is within the kernel core __trace_printk_fmt section, then + * it simply uses the next pointer in the list. + * + * When the fmt pointer is outside the kernel core __trace_printk_fmt + * section, then we need to read the link list pointers. The trick is + * we pass the address of the string to the seq function just like + * we do for the kernel core formats. To get back the structure that + * holds the format, we simply use containerof() and then go to the + * next format in the list. + */ +static const char ** +find_next_mod_format(int start_index, void *v, const char **fmt, loff_t *pos) +{ + struct trace_bprintk_fmt *mod_fmt; + + if (list_empty(&trace_bprintk_fmt_list)) + return NULL; + + /* + * v will point to the address of the fmt record from t_next + * v will be NULL from t_start. + * If this is the first pointer or called from start + * then we need to walk the list. + */ + if (!v || start_index == *pos) { + struct trace_bprintk_fmt *p; + + /* search the module list */ + list_for_each_entry(p, &trace_bprintk_fmt_list, list) { + if (start_index == *pos) + return &p->fmt; + start_index++; + } + /* pos > index */ + return NULL; + } + + /* + * v points to the address of the fmt field in the mod list + * structure that holds the module print format. + */ + mod_fmt = container_of(v, typeof(*mod_fmt), fmt); + if (mod_fmt->list.next == &trace_bprintk_fmt_list) + return NULL; + + mod_fmt = container_of(mod_fmt->list.next, typeof(*mod_fmt), list); + + return &mod_fmt->fmt; +} + +static void format_mod_start(void) +{ + mutex_lock(&btrace_mutex); +} + +static void format_mod_stop(void) +{ + mutex_unlock(&btrace_mutex); +} + #else /* !CONFIG_MODULES */ __init static int module_trace_bprintk_format_notify(struct notifier_block *self, @@ -96,6 +166,13 @@ module_trace_bprintk_format_notify(struct notifier_block *self, { return 0; } +static inline const char ** +find_next_mod_format(int start_index, void *v, const char **fmt, loff_t *pos) +{ + return NULL; +} +static inline void format_mod_start(void) { } +static inline void format_mod_stop(void) { } #endif /* CONFIG_MODULES */ @@ -158,20 +235,33 @@ int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap) } EXPORT_SYMBOL_GPL(__ftrace_vprintk); +static const char **find_next(void *v, loff_t *pos) +{ + const char **fmt = v; + int start_index; + + if (!fmt) + fmt = __start___trace_bprintk_fmt + *pos; + + start_index = __stop___trace_bprintk_fmt - __start___trace_bprintk_fmt; + + if (*pos < start_index) + return fmt; + + return find_next_mod_format(start_index, v, fmt, pos); +} + static void * t_start(struct seq_file *m, loff_t *pos) { - const char **fmt = __start___trace_bprintk_fmt + *pos; - - if ((unsigned long)fmt >= (unsigned long)__stop___trace_bprintk_fmt) - return NULL; - return fmt; + format_mod_start(); + return find_next(NULL, pos); } static void *t_next(struct seq_file *m, void * v, loff_t *pos) { (*pos)++; - return t_start(m, pos); + return find_next(v, pos); } static int t_show(struct seq_file *m, void *v) @@ -210,6 +300,7 @@ static int t_show(struct seq_file *m, void *v) static void t_stop(struct seq_file *m, void *p) { + format_mod_stop(); } static const struct seq_operations show_format_seq_ops = { -- cgit v1.1 From ee5e51f51be755830f57445e268ba50e88ccbdbb Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 25 Mar 2011 12:05:18 +0100 Subject: tracing: Avoid soft lockup in trace_pipe running following commands: # enable the binary option echo 1 > ./options/bin # disable context info option echo 0 > ./options/context-info # tracing only events echo 1 > ./events/enable cat trace_pipe plus forcing system to generate many tracing events, is causing lockup (in NON preemptive kernels) inside tracing_read_pipe function. The issue is also easily reproduced by running ltp stress test. (ftrace_stress_test.sh) The reasons are: - bin/hex/raw output functions for events are set to trace_nop_print function, which prints nothing and returns TRACE_TYPE_HANDLED value - LOST EVENT trace do not handle trace_seq overflow These reasons force the while loop in tracing_read_pipe function never to break. The attached patch fixies handling of lost event trace, and changes trace_nop_print to print minimal info, which is needed for the correct tracing_read_pipe processing. v2 changes: - omit the cond_resched changes by trace_nop_print changes - WARN changed to WARN_ONCE and added info to be able to find out the culprit v3 changes: - make more accurate patch comment Signed-off-by: Jiri Olsa LKML-Reference: <20110325110518.GC1922@jolsa.brq.redhat.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 15 ++++++++++++--- kernel/trace/trace_output.c | 3 +++ 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 9541c27..5af42f4 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2013,9 +2013,10 @@ enum print_line_t print_trace_line(struct trace_iterator *iter) { enum print_line_t ret; - if (iter->lost_events) - trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n", - iter->cpu, iter->lost_events); + if (iter->lost_events && + !trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n", + iter->cpu, iter->lost_events)) + return TRACE_TYPE_PARTIAL_LINE; if (iter->trace && iter->trace->print_line) { ret = iter->trace->print_line(iter); @@ -3229,6 +3230,14 @@ waitagain: if (iter->seq.len >= cnt) break; + + /* + * Setting the full flag means we reached the trace_seq buffer + * size and we should leave by partial output condition above. + * One of the trace_seq_* functions is not used properly. + */ + WARN_ONCE(iter->seq.full, "full flag set for trace type %d", + iter->ent->type); } trace_access_unlock(iter->cpu_file); trace_event_read_unlock(); diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 456be90..cf535cc 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -830,6 +830,9 @@ EXPORT_SYMBOL_GPL(unregister_ftrace_event); enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags, struct trace_event *event) { + if (!trace_seq_printf(&iter->seq, "type: %d\n", iter->ent->type)) + return TRACE_TYPE_PARTIAL_LINE; + return TRACE_TYPE_HANDLED; } -- cgit v1.1 From d430d3d7e646eb1eac2bb4aa244a644312e67c76 Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Wed, 16 Mar 2011 17:29:47 -0400 Subject: jump label: Introduce static_branch() interface Introduce: static __always_inline bool static_branch(struct jump_label_key *key); instead of the old JUMP_LABEL(key, label) macro. In this way, jump labels become really easy to use: Define: struct jump_label_key jump_key; Can be used as: if (static_branch(&jump_key)) do unlikely code enable/disale via: jump_label_inc(&jump_key); jump_label_dec(&jump_key); that's it! For the jump labels disabled case, the static_branch() becomes an atomic_read(), and jump_label_inc()/dec() are simply atomic_inc(), atomic_dec() operations. We show testing results for this change below. Thanks to H. Peter Anvin for suggesting the 'static_branch()' construct. Since we now require a 'struct jump_label_key *key', we can store a pointer into the jump table addresses. In this way, we can enable/disable jump labels, in basically constant time. This change allows us to completely remove the previous hashtable scheme. Thanks to Peter Zijlstra for this re-write. Testing: I ran a series of 'tbench 20' runs 5 times (with reboots) for 3 configurations, where tracepoints were disabled. jump label configured in avg: 815.6 jump label *not* configured in (using atomic reads) avg: 800.1 jump label *not* configured in (regular reads) avg: 803.4 Signed-off-by: Peter Zijlstra LKML-Reference: <20110316212947.GA8792@redhat.com> Signed-off-by: Jason Baron Suggested-by: H. Peter Anvin Tested-by: David Daney Acked-by: Ralf Baechle Acked-by: David S. Miller Acked-by: Mathieu Desnoyers Signed-off-by: Steven Rostedt --- arch/mips/include/asm/jump_label.h | 22 +- arch/sparc/include/asm/jump_label.h | 25 +- arch/x86/include/asm/alternative.h | 3 +- arch/x86/include/asm/jump_label.h | 26 +- arch/x86/kernel/alternative.c | 2 +- arch/x86/kernel/module.c | 1 + include/asm-generic/vmlinux.lds.h | 14 +- include/linux/dynamic_debug.h | 2 - include/linux/jump_label.h | 89 +++--- include/linux/jump_label_ref.h | 44 --- include/linux/perf_event.h | 26 +- include/linux/tracepoint.h | 22 +- kernel/jump_label.c | 539 +++++++++++++++--------------------- kernel/perf_event.c | 4 +- kernel/tracepoint.c | 23 +- 15 files changed, 356 insertions(+), 486 deletions(-) delete mode 100644 include/linux/jump_label_ref.h diff --git a/arch/mips/include/asm/jump_label.h b/arch/mips/include/asm/jump_label.h index 7622ccf..1881b31 100644 --- a/arch/mips/include/asm/jump_label.h +++ b/arch/mips/include/asm/jump_label.h @@ -20,16 +20,18 @@ #define WORD_INSN ".word" #endif -#define JUMP_LABEL(key, label) \ - do { \ - asm goto("1:\tnop\n\t" \ - "nop\n\t" \ - ".pushsection __jump_table, \"a\"\n\t" \ - WORD_INSN " 1b, %l[" #label "], %0\n\t" \ - ".popsection\n\t" \ - : : "i" (key) : : label); \ - } while (0) - +static __always_inline bool arch_static_branch(struct jump_label_key *key) +{ + asm goto("1:\tnop\n\t" + "nop\n\t" + ".pushsection __jump_table, \"aw\"\n\t" + WORD_INSN " 1b, %l[l_yes], %0\n\t" + ".popsection\n\t" + : : "i" (key) : : l_yes); + return false; +l_yes: + return true; +} #endif /* __KERNEL__ */ diff --git a/arch/sparc/include/asm/jump_label.h b/arch/sparc/include/asm/jump_label.h index 427d468..fc73a82 100644 --- a/arch/sparc/include/asm/jump_label.h +++ b/arch/sparc/include/asm/jump_label.h @@ -7,17 +7,20 @@ #define JUMP_LABEL_NOP_SIZE 4 -#define JUMP_LABEL(key, label) \ - do { \ - asm goto("1:\n\t" \ - "nop\n\t" \ - "nop\n\t" \ - ".pushsection __jump_table, \"a\"\n\t"\ - ".align 4\n\t" \ - ".word 1b, %l[" #label "], %c0\n\t" \ - ".popsection \n\t" \ - : : "i" (key) : : label);\ - } while (0) +static __always_inline bool arch_static_branch(struct jump_label_key *key) +{ + asm goto("1:\n\t" + "nop\n\t" + "nop\n\t" + ".pushsection __jump_table, \"aw\"\n\t" + ".align 4\n\t" + ".word 1b, %l[l_yes], %c0\n\t" + ".popsection \n\t" + : : "i" (key) : : l_yes); + return false; +l_yes: + return true; +} #endif /* __KERNEL__ */ diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 13009d1..8cdd1e2 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -4,7 +4,6 @@ #include #include #include -#include #include /* @@ -191,7 +190,7 @@ extern void *text_poke(void *addr, const void *opcode, size_t len); extern void *text_poke_smp(void *addr, const void *opcode, size_t len); extern void text_poke_smp_batch(struct text_poke_param *params, int n); -#if defined(CONFIG_DYNAMIC_FTRACE) || defined(HAVE_JUMP_LABEL) +#if defined(CONFIG_DYNAMIC_FTRACE) || defined(CONFIG_JUMP_LABEL) #define IDEAL_NOP_SIZE_5 5 extern unsigned char ideal_nop5[IDEAL_NOP_SIZE_5]; extern void arch_init_ideal_nop5(void); diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h index 574dbc2..f217cee 100644 --- a/arch/x86/include/asm/jump_label.h +++ b/arch/x86/include/asm/jump_label.h @@ -5,20 +5,24 @@ #include #include +#include #define JUMP_LABEL_NOP_SIZE 5 -# define JUMP_LABEL_INITIAL_NOP ".byte 0xe9 \n\t .long 0\n\t" - -# define JUMP_LABEL(key, label) \ - do { \ - asm goto("1:" \ - JUMP_LABEL_INITIAL_NOP \ - ".pushsection __jump_table, \"aw\" \n\t"\ - _ASM_PTR "1b, %l[" #label "], %c0 \n\t" \ - ".popsection \n\t" \ - : : "i" (key) : : label); \ - } while (0) +#define JUMP_LABEL_INITIAL_NOP ".byte 0xe9 \n\t .long 0\n\t" + +static __always_inline bool arch_static_branch(struct jump_label_key *key) +{ + asm goto("1:" + JUMP_LABEL_INITIAL_NOP + ".pushsection __jump_table, \"aw\" \n\t" + _ASM_PTR "1b, %l[l_yes], %c0 \n\t" + ".popsection \n\t" + : : "i" (key) : : l_yes); + return false; +l_yes: + return true; +} #endif /* __KERNEL__ */ diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 4a23467..651454b 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -679,7 +679,7 @@ void __kprobes text_poke_smp_batch(struct text_poke_param *params, int n) __stop_machine(stop_machine_text_poke, (void *)&tpp, NULL); } -#if defined(CONFIG_DYNAMIC_FTRACE) || defined(HAVE_JUMP_LABEL) +#if defined(CONFIG_DYNAMIC_FTRACE) || defined(CONFIG_JUMP_LABEL) #ifdef CONFIG_X86_64 unsigned char ideal_nop5[5] = { 0x66, 0x66, 0x66, 0x66, 0x90 }; diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index ab23f1a..52f256f 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 32c45e5..7952216 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -170,6 +170,10 @@ STRUCT_ALIGN(); \ *(__tracepoints) \ /* implement dynamic printk debug */ \ + . = ALIGN(8); \ + VMLINUX_SYMBOL(__start___jump_table) = .; \ + *(__jump_table) \ + VMLINUX_SYMBOL(__stop___jump_table) = .; \ . = ALIGN(8); \ VMLINUX_SYMBOL(__start___verbose) = .; \ *(__verbose) \ @@ -228,8 +232,6 @@ \ BUG_TABLE \ \ - JUMP_TABLE \ - \ /* PCI quirks */ \ .pci_fixup : AT(ADDR(.pci_fixup) - LOAD_OFFSET) { \ VMLINUX_SYMBOL(__start_pci_fixups_early) = .; \ @@ -589,14 +591,6 @@ #define BUG_TABLE #endif -#define JUMP_TABLE \ - . = ALIGN(8); \ - __jump_table : AT(ADDR(__jump_table) - LOAD_OFFSET) { \ - VMLINUX_SYMBOL(__start___jump_table) = .; \ - *(__jump_table) \ - VMLINUX_SYMBOL(__stop___jump_table) = .; \ - } - #ifdef CONFIG_PM_TRACE #define TRACEDATA \ . = ALIGN(4); \ diff --git a/include/linux/dynamic_debug.h b/include/linux/dynamic_debug.h index 0c9653f..e747ecd 100644 --- a/include/linux/dynamic_debug.h +++ b/include/linux/dynamic_debug.h @@ -1,8 +1,6 @@ #ifndef _DYNAMIC_DEBUG_H #define _DYNAMIC_DEBUG_H -#include - /* dynamic_printk_enabled, and dynamic_printk_enabled2 are bitmasks in which * bit n is set to 1 if any modname hashes into the bucket n, 0 otherwise. They * use independent hash functions, to reduce the chance of false positives. diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h index 7880f18..83e745f 100644 --- a/include/linux/jump_label.h +++ b/include/linux/jump_label.h @@ -1,20 +1,43 @@ #ifndef _LINUX_JUMP_LABEL_H #define _LINUX_JUMP_LABEL_H +#include +#include + #if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_JUMP_LABEL) + +struct jump_label_key { + atomic_t enabled; + struct jump_entry *entries; +#ifdef CONFIG_MODULES + struct jump_label_mod *next; +#endif +}; + # include # define HAVE_JUMP_LABEL #endif enum jump_label_type { + JUMP_LABEL_DISABLE = 0, JUMP_LABEL_ENABLE, - JUMP_LABEL_DISABLE }; struct module; #ifdef HAVE_JUMP_LABEL +#ifdef CONFIG_MODULES +#define JUMP_LABEL_INIT {{ 0 }, NULL, NULL} +#else +#define JUMP_LABEL_INIT {{ 0 }, NULL} +#endif + +static __always_inline bool static_branch(struct jump_label_key *key) +{ + return arch_static_branch(key); +} + extern struct jump_entry __start___jump_table[]; extern struct jump_entry __stop___jump_table[]; @@ -23,37 +46,37 @@ extern void jump_label_unlock(void); extern void arch_jump_label_transform(struct jump_entry *entry, enum jump_label_type type); extern void arch_jump_label_text_poke_early(jump_label_t addr); -extern void jump_label_update(unsigned long key, enum jump_label_type type); -extern void jump_label_apply_nops(struct module *mod); extern int jump_label_text_reserved(void *start, void *end); +extern void jump_label_inc(struct jump_label_key *key); +extern void jump_label_dec(struct jump_label_key *key); +extern bool jump_label_enabled(struct jump_label_key *key); +extern void jump_label_apply_nops(struct module *mod); -#define jump_label_enable(key) \ - jump_label_update((unsigned long)key, JUMP_LABEL_ENABLE); +#else -#define jump_label_disable(key) \ - jump_label_update((unsigned long)key, JUMP_LABEL_DISABLE); +#include -#else +#define JUMP_LABEL_INIT {ATOMIC_INIT(0)} -#define JUMP_LABEL(key, label) \ -do { \ - if (unlikely(*key)) \ - goto label; \ -} while (0) +struct jump_label_key { + atomic_t enabled; +}; -#define jump_label_enable(cond_var) \ -do { \ - *(cond_var) = 1; \ -} while (0) +static __always_inline bool static_branch(struct jump_label_key *key) +{ + if (unlikely(atomic_read(&key->enabled))) + return true; + return false; +} -#define jump_label_disable(cond_var) \ -do { \ - *(cond_var) = 0; \ -} while (0) +static inline void jump_label_inc(struct jump_label_key *key) +{ + atomic_inc(&key->enabled); +} -static inline int jump_label_apply_nops(struct module *mod) +static inline void jump_label_dec(struct jump_label_key *key) { - return 0; + atomic_dec(&key->enabled); } static inline int jump_label_text_reserved(void *start, void *end) @@ -64,16 +87,16 @@ static inline int jump_label_text_reserved(void *start, void *end) static inline void jump_label_lock(void) {} static inline void jump_label_unlock(void) {} -#endif +static inline bool jump_label_enabled(struct jump_label_key *key) +{ + return !!atomic_read(&key->enabled); +} -#define COND_STMT(key, stmt) \ -do { \ - __label__ jl_enabled; \ - JUMP_LABEL(key, jl_enabled); \ - if (0) { \ -jl_enabled: \ - stmt; \ - } \ -} while (0) +static inline int jump_label_apply_nops(struct module *mod) +{ + return 0; +} + +#endif #endif diff --git a/include/linux/jump_label_ref.h b/include/linux/jump_label_ref.h deleted file mode 100644 index e5d012a..0000000 --- a/include/linux/jump_label_ref.h +++ /dev/null @@ -1,44 +0,0 @@ -#ifndef _LINUX_JUMP_LABEL_REF_H -#define _LINUX_JUMP_LABEL_REF_H - -#include -#include - -#ifdef HAVE_JUMP_LABEL - -static inline void jump_label_inc(atomic_t *key) -{ - if (atomic_add_return(1, key) == 1) - jump_label_enable(key); -} - -static inline void jump_label_dec(atomic_t *key) -{ - if (atomic_dec_and_test(key)) - jump_label_disable(key); -} - -#else /* !HAVE_JUMP_LABEL */ - -static inline void jump_label_inc(atomic_t *key) -{ - atomic_inc(key); -} - -static inline void jump_label_dec(atomic_t *key) -{ - atomic_dec(key); -} - -#undef JUMP_LABEL -#define JUMP_LABEL(key, label) \ -do { \ - if (unlikely(__builtin_choose_expr( \ - __builtin_types_compatible_p(typeof(key), atomic_t *), \ - atomic_read((atomic_t *)(key)), *(key)))) \ - goto label; \ -} while (0) - -#endif /* HAVE_JUMP_LABEL */ - -#endif /* _LINUX_JUMP_LABEL_REF_H */ diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 311b4dc..730b782 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -505,7 +505,7 @@ struct perf_guest_info_callbacks { #include #include #include -#include +#include #include #include @@ -1034,7 +1034,7 @@ static inline int is_software_event(struct perf_event *event) return event->pmu->task_ctx_nr == perf_sw_context; } -extern atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX]; +extern struct jump_label_key perf_swevent_enabled[PERF_COUNT_SW_MAX]; extern void __perf_sw_event(u32, u64, int, struct pt_regs *, u64); @@ -1063,22 +1063,21 @@ perf_sw_event(u32 event_id, u64 nr, int nmi, struct pt_regs *regs, u64 addr) { struct pt_regs hot_regs; - JUMP_LABEL(&perf_swevent_enabled[event_id], have_event); - return; - -have_event: - if (!regs) { - perf_fetch_caller_regs(&hot_regs); - regs = &hot_regs; + if (static_branch(&perf_swevent_enabled[event_id])) { + if (!regs) { + perf_fetch_caller_regs(&hot_regs); + regs = &hot_regs; + } + __perf_sw_event(event_id, nr, nmi, regs, addr); } - __perf_sw_event(event_id, nr, nmi, regs, addr); } -extern atomic_t perf_sched_events; +extern struct jump_label_key perf_sched_events; static inline void perf_event_task_sched_in(struct task_struct *task) { - COND_STMT(&perf_sched_events, __perf_event_task_sched_in(task)); + if (static_branch(&perf_sched_events)) + __perf_event_task_sched_in(task); } static inline @@ -1086,7 +1085,8 @@ void perf_event_task_sched_out(struct task_struct *task, struct task_struct *nex { perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0); - COND_STMT(&perf_sched_events, __perf_event_task_sched_out(task, next)); + if (static_branch(&perf_sched_events)) + __perf_event_task_sched_out(task, next); } extern void perf_event_mmap(struct vm_area_struct *vma); diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 97c84a5..d530a44 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -29,7 +29,7 @@ struct tracepoint_func { struct tracepoint { const char *name; /* Tracepoint name */ - int state; /* State. */ + struct jump_label_key key; void (*regfunc)(void); void (*unregfunc)(void); struct tracepoint_func __rcu *funcs; @@ -146,9 +146,7 @@ void tracepoint_update_probe_range(struct tracepoint * const *begin, extern struct tracepoint __tracepoint_##name; \ static inline void trace_##name(proto) \ { \ - JUMP_LABEL(&__tracepoint_##name.state, do_trace); \ - return; \ -do_trace: \ + if (static_branch(&__tracepoint_##name.key)) \ __DO_TRACE(&__tracepoint_##name, \ TP_PROTO(data_proto), \ TP_ARGS(data_args), \ @@ -176,14 +174,14 @@ do_trace: \ * structures, so we create an array of pointers that will be used for iteration * on the tracepoints. */ -#define DEFINE_TRACE_FN(name, reg, unreg) \ - static const char __tpstrtab_##name[] \ - __attribute__((section("__tracepoints_strings"))) = #name; \ - struct tracepoint __tracepoint_##name \ - __attribute__((section("__tracepoints"))) = \ - { __tpstrtab_##name, 0, reg, unreg, NULL }; \ - static struct tracepoint * const __tracepoint_ptr_##name __used \ - __attribute__((section("__tracepoints_ptrs"))) = \ +#define DEFINE_TRACE_FN(name, reg, unreg) \ + static const char __tpstrtab_##name[] \ + __attribute__((section("__tracepoints_strings"))) = #name; \ + struct tracepoint __tracepoint_##name \ + __attribute__((section("__tracepoints"))) = \ + { __tpstrtab_##name, JUMP_LABEL_INIT, reg, unreg, NULL };\ + static struct tracepoint * const __tracepoint_ptr_##name __used \ + __attribute__((section("__tracepoints_ptrs"))) = \ &__tracepoint_##name; #define DEFINE_TRACE(name) \ diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 3b79bd9..74d1c09 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -2,43 +2,23 @@ * jump label support * * Copyright (C) 2009 Jason Baron + * Copyright (C) 2011 Peter Zijlstra * */ -#include #include #include #include #include -#include #include #include #include +#include #ifdef HAVE_JUMP_LABEL -#define JUMP_LABEL_HASH_BITS 6 -#define JUMP_LABEL_TABLE_SIZE (1 << JUMP_LABEL_HASH_BITS) -static struct hlist_head jump_label_table[JUMP_LABEL_TABLE_SIZE]; - /* mutex to protect coming/going of the the jump_label table */ static DEFINE_MUTEX(jump_label_mutex); -struct jump_label_entry { - struct hlist_node hlist; - struct jump_entry *table; - int nr_entries; - /* hang modules off here */ - struct hlist_head modules; - unsigned long key; -}; - -struct jump_label_module_entry { - struct hlist_node hlist; - struct jump_entry *table; - int nr_entries; - struct module *mod; -}; - void jump_label_lock(void) { mutex_lock(&jump_label_mutex); @@ -49,6 +29,11 @@ void jump_label_unlock(void) mutex_unlock(&jump_label_mutex); } +bool jump_label_enabled(struct jump_label_key *key) +{ + return !!atomic_read(&key->enabled); +} + static int jump_label_cmp(const void *a, const void *b) { const struct jump_entry *jea = a; @@ -64,7 +49,7 @@ static int jump_label_cmp(const void *a, const void *b) } static void -sort_jump_label_entries(struct jump_entry *start, struct jump_entry *stop) +jump_label_sort_entries(struct jump_entry *start, struct jump_entry *stop) { unsigned long size; @@ -73,118 +58,25 @@ sort_jump_label_entries(struct jump_entry *start, struct jump_entry *stop) sort(start, size, sizeof(struct jump_entry), jump_label_cmp, NULL); } -static struct jump_label_entry *get_jump_label_entry(jump_label_t key) -{ - struct hlist_head *head; - struct hlist_node *node; - struct jump_label_entry *e; - u32 hash = jhash((void *)&key, sizeof(jump_label_t), 0); - - head = &jump_label_table[hash & (JUMP_LABEL_TABLE_SIZE - 1)]; - hlist_for_each_entry(e, node, head, hlist) { - if (key == e->key) - return e; - } - return NULL; -} +static void jump_label_update(struct jump_label_key *key, int enable); -static struct jump_label_entry * -add_jump_label_entry(jump_label_t key, int nr_entries, struct jump_entry *table) +void jump_label_inc(struct jump_label_key *key) { - struct hlist_head *head; - struct jump_label_entry *e; - u32 hash; - - e = get_jump_label_entry(key); - if (e) - return ERR_PTR(-EEXIST); - - e = kmalloc(sizeof(struct jump_label_entry), GFP_KERNEL); - if (!e) - return ERR_PTR(-ENOMEM); - - hash = jhash((void *)&key, sizeof(jump_label_t), 0); - head = &jump_label_table[hash & (JUMP_LABEL_TABLE_SIZE - 1)]; - e->key = key; - e->table = table; - e->nr_entries = nr_entries; - INIT_HLIST_HEAD(&(e->modules)); - hlist_add_head(&e->hlist, head); - return e; -} + if (atomic_inc_not_zero(&key->enabled)) + return; -static int -build_jump_label_hashtable(struct jump_entry *start, struct jump_entry *stop) -{ - struct jump_entry *iter, *iter_begin; - struct jump_label_entry *entry; - int count; - - sort_jump_label_entries(start, stop); - iter = start; - while (iter < stop) { - entry = get_jump_label_entry(iter->key); - if (!entry) { - iter_begin = iter; - count = 0; - while ((iter < stop) && - (iter->key == iter_begin->key)) { - iter++; - count++; - } - entry = add_jump_label_entry(iter_begin->key, - count, iter_begin); - if (IS_ERR(entry)) - return PTR_ERR(entry); - } else { - WARN_ONCE(1, KERN_ERR "build_jump_hashtable: unexpected entry!\n"); - return -1; - } - } - return 0; + jump_label_lock(); + if (atomic_add_return(1, &key->enabled) == 1) + jump_label_update(key, JUMP_LABEL_ENABLE); + jump_label_unlock(); } -/*** - * jump_label_update - update jump label text - * @key - key value associated with a a jump label - * @type - enum set to JUMP_LABEL_ENABLE or JUMP_LABEL_DISABLE - * - * Will enable/disable the jump for jump label @key, depending on the - * value of @type. - * - */ - -void jump_label_update(unsigned long key, enum jump_label_type type) +void jump_label_dec(struct jump_label_key *key) { - struct jump_entry *iter; - struct jump_label_entry *entry; - struct hlist_node *module_node; - struct jump_label_module_entry *e_module; - int count; + if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) + return; - jump_label_lock(); - entry = get_jump_label_entry((jump_label_t)key); - if (entry) { - count = entry->nr_entries; - iter = entry->table; - while (count--) { - if (kernel_text_address(iter->code)) - arch_jump_label_transform(iter, type); - iter++; - } - /* eanble/disable jump labels in modules */ - hlist_for_each_entry(e_module, module_node, &(entry->modules), - hlist) { - count = e_module->nr_entries; - iter = e_module->table; - while (count--) { - if (iter->key && - kernel_text_address(iter->code)) - arch_jump_label_transform(iter, type); - iter++; - } - } - } + jump_label_update(key, JUMP_LABEL_DISABLE); jump_label_unlock(); } @@ -197,77 +89,33 @@ static int addr_conflict(struct jump_entry *entry, void *start, void *end) return 0; } -#ifdef CONFIG_MODULES - -static int module_conflict(void *start, void *end) +static int __jump_label_text_reserved(struct jump_entry *iter_start, + struct jump_entry *iter_stop, void *start, void *end) { - struct hlist_head *head; - struct hlist_node *node, *node_next, *module_node, *module_node_next; - struct jump_label_entry *e; - struct jump_label_module_entry *e_module; struct jump_entry *iter; - int i, count; - int conflict = 0; - - for (i = 0; i < JUMP_LABEL_TABLE_SIZE; i++) { - head = &jump_label_table[i]; - hlist_for_each_entry_safe(e, node, node_next, head, hlist) { - hlist_for_each_entry_safe(e_module, module_node, - module_node_next, - &(e->modules), hlist) { - count = e_module->nr_entries; - iter = e_module->table; - while (count--) { - if (addr_conflict(iter, start, end)) { - conflict = 1; - goto out; - } - iter++; - } - } - } - } -out: - return conflict; -} - -#endif - -/*** - * jump_label_text_reserved - check if addr range is reserved - * @start: start text addr - * @end: end text addr - * - * checks if the text addr located between @start and @end - * overlaps with any of the jump label patch addresses. Code - * that wants to modify kernel text should first verify that - * it does not overlap with any of the jump label addresses. - * Caller must hold jump_label_mutex. - * - * returns 1 if there is an overlap, 0 otherwise - */ -int jump_label_text_reserved(void *start, void *end) -{ - struct jump_entry *iter; - struct jump_entry *iter_start = __start___jump_table; - struct jump_entry *iter_stop = __start___jump_table; - int conflict = 0; iter = iter_start; while (iter < iter_stop) { - if (addr_conflict(iter, start, end)) { - conflict = 1; - goto out; - } + if (addr_conflict(iter, start, end)) + return 1; iter++; } - /* now check modules */ -#ifdef CONFIG_MODULES - conflict = module_conflict(start, end); -#endif -out: - return conflict; + return 0; +} + +static void __jump_label_update(struct jump_label_key *key, + struct jump_entry *entry, int enable) +{ + for (; entry->key == (jump_label_t)(unsigned long)key; entry++) { + /* + * entry->code set to 0 invalidates module init text sections + * kernel_text_address() verifies we are not in core kernel + * init code, see jump_label_invalidate_module_init(). + */ + if (entry->code && kernel_text_address(entry->code)) + arch_jump_label_transform(entry, enable); + } } /* @@ -277,142 +125,173 @@ void __weak arch_jump_label_text_poke_early(jump_label_t addr) { } -static __init int init_jump_label(void) +static __init int jump_label_init(void) { - int ret; struct jump_entry *iter_start = __start___jump_table; struct jump_entry *iter_stop = __stop___jump_table; + struct jump_label_key *key = NULL; struct jump_entry *iter; jump_label_lock(); - ret = build_jump_label_hashtable(__start___jump_table, - __stop___jump_table); - iter = iter_start; - while (iter < iter_stop) { + jump_label_sort_entries(iter_start, iter_stop); + + for (iter = iter_start; iter < iter_stop; iter++) { arch_jump_label_text_poke_early(iter->code); - iter++; + if (iter->key == (jump_label_t)(unsigned long)key) + continue; + + key = (struct jump_label_key *)(unsigned long)iter->key; + atomic_set(&key->enabled, 0); + key->entries = iter; +#ifdef CONFIG_MODULES + key->next = NULL; +#endif } jump_label_unlock(); - return ret; + + return 0; } -early_initcall(init_jump_label); +early_initcall(jump_label_init); #ifdef CONFIG_MODULES -static struct jump_label_module_entry * -add_jump_label_module_entry(struct jump_label_entry *entry, - struct jump_entry *iter_begin, - int count, struct module *mod) +struct jump_label_mod { + struct jump_label_mod *next; + struct jump_entry *entries; + struct module *mod; +}; + +static int __jump_label_mod_text_reserved(void *start, void *end) +{ + struct module *mod; + + mod = __module_text_address((unsigned long)start); + if (!mod) + return 0; + + WARN_ON_ONCE(__module_text_address((unsigned long)end) != mod); + + return __jump_label_text_reserved(mod->jump_entries, + mod->jump_entries + mod->num_jump_entries, + start, end); +} + +static void __jump_label_mod_update(struct jump_label_key *key, int enable) +{ + struct jump_label_mod *mod = key->next; + + while (mod) { + __jump_label_update(key, mod->entries, enable); + mod = mod->next; + } +} + +/*** + * apply_jump_label_nops - patch module jump labels with arch_get_jump_label_nop() + * @mod: module to patch + * + * Allow for run-time selection of the optimal nops. Before the module + * loads patch these with arch_get_jump_label_nop(), which is specified by + * the arch specific jump label code. + */ +void jump_label_apply_nops(struct module *mod) { - struct jump_label_module_entry *e; - - e = kmalloc(sizeof(struct jump_label_module_entry), GFP_KERNEL); - if (!e) - return ERR_PTR(-ENOMEM); - e->mod = mod; - e->nr_entries = count; - e->table = iter_begin; - hlist_add_head(&e->hlist, &entry->modules); - return e; + struct jump_entry *iter_start = mod->jump_entries; + struct jump_entry *iter_stop = iter_start + mod->num_jump_entries; + struct jump_entry *iter; + + /* if the module doesn't have jump label entries, just return */ + if (iter_start == iter_stop) + return; + + for (iter = iter_start; iter < iter_stop; iter++) + arch_jump_label_text_poke_early(iter->code); } -static int add_jump_label_module(struct module *mod) +static int jump_label_add_module(struct module *mod) { - struct jump_entry *iter, *iter_begin; - struct jump_label_entry *entry; - struct jump_label_module_entry *module_entry; - int count; + struct jump_entry *iter_start = mod->jump_entries; + struct jump_entry *iter_stop = iter_start + mod->num_jump_entries; + struct jump_entry *iter; + struct jump_label_key *key = NULL; + struct jump_label_mod *jlm; /* if the module doesn't have jump label entries, just return */ - if (!mod->num_jump_entries) + if (iter_start == iter_stop) return 0; - sort_jump_label_entries(mod->jump_entries, - mod->jump_entries + mod->num_jump_entries); - iter = mod->jump_entries; - while (iter < mod->jump_entries + mod->num_jump_entries) { - entry = get_jump_label_entry(iter->key); - iter_begin = iter; - count = 0; - while ((iter < mod->jump_entries + mod->num_jump_entries) && - (iter->key == iter_begin->key)) { - iter++; - count++; - } - if (!entry) { - entry = add_jump_label_entry(iter_begin->key, 0, NULL); - if (IS_ERR(entry)) - return PTR_ERR(entry); + jump_label_sort_entries(iter_start, iter_stop); + + for (iter = iter_start; iter < iter_stop; iter++) { + if (iter->key == (jump_label_t)(unsigned long)key) + continue; + + key = (struct jump_label_key *)(unsigned long)iter->key; + + if (__module_address(iter->key) == mod) { + atomic_set(&key->enabled, 0); + key->entries = iter; + key->next = NULL; + continue; } - module_entry = add_jump_label_module_entry(entry, iter_begin, - count, mod); - if (IS_ERR(module_entry)) - return PTR_ERR(module_entry); + + jlm = kzalloc(sizeof(struct jump_label_mod), GFP_KERNEL); + if (!jlm) + return -ENOMEM; + + jlm->mod = mod; + jlm->entries = iter; + jlm->next = key->next; + key->next = jlm; + + if (jump_label_enabled(key)) + __jump_label_update(key, iter, JUMP_LABEL_ENABLE); } + return 0; } -static void remove_jump_label_module(struct module *mod) +static void jump_label_del_module(struct module *mod) { - struct hlist_head *head; - struct hlist_node *node, *node_next, *module_node, *module_node_next; - struct jump_label_entry *e; - struct jump_label_module_entry *e_module; - int i; + struct jump_entry *iter_start = mod->jump_entries; + struct jump_entry *iter_stop = iter_start + mod->num_jump_entries; + struct jump_entry *iter; + struct jump_label_key *key = NULL; + struct jump_label_mod *jlm, **prev; - /* if the module doesn't have jump label entries, just return */ - if (!mod->num_jump_entries) - return; + for (iter = iter_start; iter < iter_stop; iter++) { + if (iter->key == (jump_label_t)(unsigned long)key) + continue; + + key = (struct jump_label_key *)(unsigned long)iter->key; + + if (__module_address(iter->key) == mod) + continue; + + prev = &key->next; + jlm = key->next; - for (i = 0; i < JUMP_LABEL_TABLE_SIZE; i++) { - head = &jump_label_table[i]; - hlist_for_each_entry_safe(e, node, node_next, head, hlist) { - hlist_for_each_entry_safe(e_module, module_node, - module_node_next, - &(e->modules), hlist) { - if (e_module->mod == mod) { - hlist_del(&e_module->hlist); - kfree(e_module); - } - } - if (hlist_empty(&e->modules) && (e->nr_entries == 0)) { - hlist_del(&e->hlist); - kfree(e); - } + while (jlm && jlm->mod != mod) { + prev = &jlm->next; + jlm = jlm->next; + } + + if (jlm) { + *prev = jlm->next; + kfree(jlm); } } } -static void remove_jump_label_module_init(struct module *mod) +static void jump_label_invalidate_module_init(struct module *mod) { - struct hlist_head *head; - struct hlist_node *node, *node_next, *module_node, *module_node_next; - struct jump_label_entry *e; - struct jump_label_module_entry *e_module; + struct jump_entry *iter_start = mod->jump_entries; + struct jump_entry *iter_stop = iter_start + mod->num_jump_entries; struct jump_entry *iter; - int i, count; - - /* if the module doesn't have jump label entries, just return */ - if (!mod->num_jump_entries) - return; - for (i = 0; i < JUMP_LABEL_TABLE_SIZE; i++) { - head = &jump_label_table[i]; - hlist_for_each_entry_safe(e, node, node_next, head, hlist) { - hlist_for_each_entry_safe(e_module, module_node, - module_node_next, - &(e->modules), hlist) { - if (e_module->mod != mod) - continue; - count = e_module->nr_entries; - iter = e_module->table; - while (count--) { - if (within_module_init(iter->code, mod)) - iter->key = 0; - iter++; - } - } - } + for (iter = iter_start; iter < iter_stop; iter++) { + if (within_module_init(iter->code, mod)) + iter->code = 0; } } @@ -426,59 +305,77 @@ jump_label_module_notify(struct notifier_block *self, unsigned long val, switch (val) { case MODULE_STATE_COMING: jump_label_lock(); - ret = add_jump_label_module(mod); + ret = jump_label_add_module(mod); if (ret) - remove_jump_label_module(mod); + jump_label_del_module(mod); jump_label_unlock(); break; case MODULE_STATE_GOING: jump_label_lock(); - remove_jump_label_module(mod); + jump_label_del_module(mod); jump_label_unlock(); break; case MODULE_STATE_LIVE: jump_label_lock(); - remove_jump_label_module_init(mod); + jump_label_invalidate_module_init(mod); jump_label_unlock(); break; } - return ret; -} -/*** - * apply_jump_label_nops - patch module jump labels with arch_get_jump_label_nop() - * @mod: module to patch - * - * Allow for run-time selection of the optimal nops. Before the module - * loads patch these with arch_get_jump_label_nop(), which is specified by - * the arch specific jump label code. - */ -void jump_label_apply_nops(struct module *mod) -{ - struct jump_entry *iter; - - /* if the module doesn't have jump label entries, just return */ - if (!mod->num_jump_entries) - return; - - iter = mod->jump_entries; - while (iter < mod->jump_entries + mod->num_jump_entries) { - arch_jump_label_text_poke_early(iter->code); - iter++; - } + return notifier_from_errno(ret); } struct notifier_block jump_label_module_nb = { .notifier_call = jump_label_module_notify, - .priority = 0, + .priority = 1, /* higher than tracepoints */ }; -static __init int init_jump_label_module(void) +static __init int jump_label_init_module(void) { return register_module_notifier(&jump_label_module_nb); } -early_initcall(init_jump_label_module); +early_initcall(jump_label_init_module); #endif /* CONFIG_MODULES */ +/*** + * jump_label_text_reserved - check if addr range is reserved + * @start: start text addr + * @end: end text addr + * + * checks if the text addr located between @start and @end + * overlaps with any of the jump label patch addresses. Code + * that wants to modify kernel text should first verify that + * it does not overlap with any of the jump label addresses. + * Caller must hold jump_label_mutex. + * + * returns 1 if there is an overlap, 0 otherwise + */ +int jump_label_text_reserved(void *start, void *end) +{ + int ret = __jump_label_text_reserved(__start___jump_table, + __stop___jump_table, start, end); + + if (ret) + return ret; + +#ifdef CONFIG_MODULES + ret = __jump_label_mod_text_reserved(start, end); +#endif + return ret; +} + +static void jump_label_update(struct jump_label_key *key, int enable) +{ + struct jump_entry *entry = key->entries; + + /* if there are no users, entry can be NULL */ + if (entry) + __jump_label_update(key, entry, enable); + +#ifdef CONFIG_MODULES + __jump_label_mod_update(key, enable); +#endif +} + #endif diff --git a/kernel/perf_event.c b/kernel/perf_event.c index c75925c..d665e92 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -125,7 +125,7 @@ enum event_type_t { * perf_sched_events : >0 events exist * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu */ -atomic_t perf_sched_events __read_mostly; +struct jump_label_key perf_sched_events __read_mostly; static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); static atomic_t nr_mmap_events __read_mostly; @@ -5417,7 +5417,7 @@ fail: return err; } -atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX]; +struct jump_label_key perf_swevent_enabled[PERF_COUNT_SW_MAX]; static void sw_perf_event_destroy(struct perf_event *event) { diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 68187af..b219f14 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -251,9 +251,9 @@ static void set_tracepoint(struct tracepoint_entry **entry, { WARN_ON(strcmp((*entry)->name, elem->name) != 0); - if (elem->regfunc && !elem->state && active) + if (elem->regfunc && !jump_label_enabled(&elem->key) && active) elem->regfunc(); - else if (elem->unregfunc && elem->state && !active) + else if (elem->unregfunc && jump_label_enabled(&elem->key) && !active) elem->unregfunc(); /* @@ -264,13 +264,10 @@ static void set_tracepoint(struct tracepoint_entry **entry, * is used. */ rcu_assign_pointer(elem->funcs, (*entry)->funcs); - if (!elem->state && active) { - jump_label_enable(&elem->state); - elem->state = active; - } else if (elem->state && !active) { - jump_label_disable(&elem->state); - elem->state = active; - } + if (active && !jump_label_enabled(&elem->key)) + jump_label_inc(&elem->key); + else if (!active && jump_label_enabled(&elem->key)) + jump_label_dec(&elem->key); } /* @@ -281,13 +278,11 @@ static void set_tracepoint(struct tracepoint_entry **entry, */ static void disable_tracepoint(struct tracepoint *elem) { - if (elem->unregfunc && elem->state) + if (elem->unregfunc && jump_label_enabled(&elem->key)) elem->unregfunc(); - if (elem->state) { - jump_label_disable(&elem->state); - elem->state = 0; - } + if (jump_label_enabled(&elem->key)) + jump_label_dec(&elem->key); rcu_assign_pointer(elem->funcs, NULL); } -- cgit v1.1 From ef64789413c73f32faa5e5f1bc393e5843b0aa51 Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Wed, 16 Mar 2011 15:58:27 -0400 Subject: jump label: Add _ASM_ALIGN for x86 and x86_64 The linker should not be adding holes to word size aligned pointers, but out of paranoia we are explicitly specifying that alignment. I have not seen any holes in the jump label section in practice. Signed-off-by: Jason Baron LKML-Reference: Acked-by: Peter Zijlstra Acked-by: Mathieu Desnoyers Signed-off-by: Steven Rostedt --- arch/x86/include/asm/jump_label.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h index f217cee..a32b18c 100644 --- a/arch/x86/include/asm/jump_label.h +++ b/arch/x86/include/asm/jump_label.h @@ -16,6 +16,7 @@ static __always_inline bool arch_static_branch(struct jump_label_key *key) asm goto("1:" JUMP_LABEL_INITIAL_NOP ".pushsection __jump_table, \"aw\" \n\t" + _ASM_ALIGN "\n\t" _ASM_PTR "1b, %l[l_yes], %c0 \n\t" ".popsection \n\t" : : "i" (key) : : l_yes); -- cgit v1.1 From 5373db886b791b2bc7811e2c115377916c409a5d Mon Sep 17 00:00:00 2001 From: Jan Glauber Date: Wed, 16 Mar 2011 15:58:30 -0400 Subject: jump label: Add s390 support Implement the architecture backend for jump label support on s390. For a shared kernel booted from a NSS silently disable jump labels because the NSS is read-only. Therefore jump labels will be disabled in a shared kernel and can't be activated. Signed-off-by: Jan Glauber LKML-Reference: <6935d2c41ce111e1719176ed4bbd3dbe4de80855.1300299760.git.jbaron@redhat.com> Acked-by: Peter Zijlstra Signed-off-by: Martin Schwidefsky Signed-off-by: Steven Rostedt --- arch/s390/Kconfig | 1 + arch/s390/include/asm/jump_label.h | 37 ++++++++++++++++++++++++ arch/s390/kernel/Makefile | 2 +- arch/s390/kernel/jump_label.c | 59 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 98 insertions(+), 1 deletion(-) create mode 100644 arch/s390/include/asm/jump_label.h create mode 100644 arch/s390/kernel/jump_label.c diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 2508a6f..4a7f140 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -88,6 +88,7 @@ config S390 select HAVE_KERNEL_XZ select HAVE_GET_USER_PAGES_FAST select HAVE_ARCH_MUTEX_CPU_RELAX + select HAVE_ARCH_JUMP_LABEL if !MARCH_G5 select ARCH_INLINE_SPIN_TRYLOCK select ARCH_INLINE_SPIN_TRYLOCK_BH select ARCH_INLINE_SPIN_LOCK diff --git a/arch/s390/include/asm/jump_label.h b/arch/s390/include/asm/jump_label.h new file mode 100644 index 0000000..95a6cf2 --- /dev/null +++ b/arch/s390/include/asm/jump_label.h @@ -0,0 +1,37 @@ +#ifndef _ASM_S390_JUMP_LABEL_H +#define _ASM_S390_JUMP_LABEL_H + +#include + +#define JUMP_LABEL_NOP_SIZE 6 + +#ifdef CONFIG_64BIT +#define ASM_PTR ".quad" +#define ASM_ALIGN ".balign 8" +#else +#define ASM_PTR ".long" +#define ASM_ALIGN ".balign 4" +#endif + +static __always_inline bool arch_static_branch(struct jump_label_key *key) +{ + asm goto("0: brcl 0,0\n" + ".pushsection __jump_table, \"aw\"\n" + ASM_ALIGN "\n" + ASM_PTR " 0b, %l[label], %0\n" + ".popsection\n" + : : "X" (key) : : label); + return false; +label: + return true; +} + +typedef unsigned long jump_label_t; + +struct jump_entry { + jump_label_t code; + jump_label_t target; + jump_label_t key; +}; + +#endif diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile index 64230bc..5ff15da 100644 --- a/arch/s390/kernel/Makefile +++ b/arch/s390/kernel/Makefile @@ -23,7 +23,7 @@ CFLAGS_sysinfo.o += -Iinclude/math-emu -Iarch/s390/math-emu -w obj-y := bitmap.o traps.o time.o process.o base.o early.o setup.o \ processor.o sys_s390.o ptrace.o signal.o cpcmd.o ebcdic.o \ s390_ext.o debug.o irq.o ipl.o dis.o diag.o mem_detect.o \ - vdso.o vtime.o sysinfo.o nmi.o sclp.o + vdso.o vtime.o sysinfo.o nmi.o sclp.o jump_label.o obj-y += $(if $(CONFIG_64BIT),entry64.o,entry.o) obj-y += $(if $(CONFIG_64BIT),reipl64.o,reipl.o) diff --git a/arch/s390/kernel/jump_label.c b/arch/s390/kernel/jump_label.c new file mode 100644 index 0000000..44cc06b --- /dev/null +++ b/arch/s390/kernel/jump_label.c @@ -0,0 +1,59 @@ +/* + * Jump label s390 support + * + * Copyright IBM Corp. 2011 + * Author(s): Jan Glauber + */ +#include +#include +#include +#include +#include + +#ifdef HAVE_JUMP_LABEL + +struct insn { + u16 opcode; + s32 offset; +} __packed; + +struct insn_args { + unsigned long *target; + struct insn *insn; + ssize_t size; +}; + +static int __arch_jump_label_transform(void *data) +{ + struct insn_args *args = data; + int rc; + + rc = probe_kernel_write(args->target, args->insn, args->size); + WARN_ON_ONCE(rc < 0); + return 0; +} + +void arch_jump_label_transform(struct jump_entry *entry, + enum jump_label_type type) +{ + struct insn_args args; + struct insn insn; + + if (type == JUMP_LABEL_ENABLE) { + /* brcl 15,offset */ + insn.opcode = 0xc0f4; + insn.offset = (entry->target - entry->code) >> 1; + } else { + /* brcl 0,0 */ + insn.opcode = 0xc004; + insn.offset = 0; + } + + args.target = (void *) entry->code; + args.insn = &insn; + args.size = JUMP_LABEL_NOP_SIZE; + + stop_machine(__arch_jump_label_transform, &args, NULL); +} + +#endif -- cgit v1.1 From 660e34cebf0a11d54f2d5dd8838607452355f321 Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Mon, 4 Apr 2011 13:55:05 -0400 Subject: x86: Reorder reboot method preferences We have a never ending stream of 'reboot quirks' for new boxes that will not reboot properly under Linux (they will hang on reboot). The reason is widespread 'Windows compatible' assumption of modern x86 hardware, which expects the following reboot sequence: - hitting the ACPI reboot vector (if available) - trying the keyboard controller - hitting the ACPI reboot vector again - then giving the keyboard controller one last go This sequence expectation gets more and more embedded in modern hardware, which often lacks a keyboard controller and may even lock up if the legacy io ports are hit - and which hardware is often not tested with Linux during development. The end result is that reboot works under Windows-alike OSs but not under Linux. Rework our reboot process to meet this hardware externality a little better and match this assumption of newer x86 hardware. In addition to the ACPI,kbd,ACPI,kbd sequence we'll still fall through to attempting a legacy triple fault if nothing else works - and keep trying that and the kbd reset. Signed-off-by: Matthew Garrett [ this commit will also save special casing Oaktrail boards ] Acked-by: Alan Cox Cc: Linus Torvalds Cc: Andrew Morton Cc: Leann Ogasawara Cc: Dave Jones Cc: Len Brown LKML-Reference: <1301939705-2404-1-git-send-email-mjg@redhat.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/reboot.c | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 08c44b0..0c016f7 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -36,7 +36,7 @@ EXPORT_SYMBOL(pm_power_off); static const struct desc_ptr no_idt = {}; static int reboot_mode; -enum reboot_type reboot_type = BOOT_KBD; +enum reboot_type reboot_type = BOOT_ACPI; int reboot_force; #if defined(CONFIG_X86_32) && defined(CONFIG_SMP) @@ -478,9 +478,24 @@ void __attribute__((weak)) mach_reboot_fixups(void) { } +/* + * Windows compatible x86 hardware expects the following on reboot: + * + * 1) If the FADT has the ACPI reboot register flag set, try it + * 2) If still alive, write to the keyboard controller + * 3) If still alive, write to the ACPI reboot register again + * 4) If still alive, write to the keyboard controller again + * + * If the machine is still alive at this stage, it gives up. We default to + * following the same pattern, except that if we're still alive after (4) we'll + * try to force a triple fault and then cycle between hitting the keyboard + * controller and doing that + */ static void native_machine_emergency_restart(void) { int i; + int attempt = 0; + int orig_reboot_type = reboot_type; if (reboot_emergency) emergency_vmx_disable_all(); @@ -502,6 +517,13 @@ static void native_machine_emergency_restart(void) outb(0xfe, 0x64); /* pulse reset low */ udelay(50); } + if (attempt == 0 && orig_reboot_type == BOOT_ACPI) { + attempt = 1; + reboot_type = BOOT_ACPI; + } else { + reboot_type = BOOT_TRIPLE; + } + break; case BOOT_TRIPLE: load_idt(&no_idt); -- cgit v1.1 From ded467374a34eb80020c2213456b1d9ca946b88c Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 6 Apr 2011 10:53:48 +0200 Subject: x86/amd-iommu: Move compl-wait command building to own function This patch introduces a seperate function for building completion-wait commands. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 57ca777..eebd504 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -383,6 +383,13 @@ irqreturn_t amd_iommu_int_handler(int irq, void *data) * ****************************************************************************/ +static void build_completion_wait(struct iommu_cmd *cmd) +{ + memset(cmd, 0, sizeof(*cmd)); + cmd->data[0] = CMD_COMPL_WAIT_INT_MASK; + CMD_SET_TYPE(cmd, CMD_COMPL_WAIT); +} + /* * Writes the command to the IOMMUs command buffer and informs the * hardware about the new command. Must be called with iommu->lock held. @@ -458,9 +465,7 @@ static int __iommu_completion_wait(struct amd_iommu *iommu) { struct iommu_cmd cmd; - memset(&cmd, 0, sizeof(cmd)); - cmd.data[0] = CMD_COMPL_WAIT_INT_MASK; - CMD_SET_TYPE(&cmd, CMD_COMPL_WAIT); + build_completion_wait(&cmd); return __iommu_queue_command(iommu, &cmd); } -- cgit v1.1 From 94fe79e2f100bfcd8e7689cbf8838634779b80a2 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 6 Apr 2011 11:07:21 +0200 Subject: x86/amd-iommu: Move inv-dte command building to own function This patch moves command building for the invalidate-dte command into its own function. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index eebd504..4e5631a 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -390,6 +390,13 @@ static void build_completion_wait(struct iommu_cmd *cmd) CMD_SET_TYPE(cmd, CMD_COMPL_WAIT); } +static void build_inv_dte(struct iommu_cmd *cmd, u16 devid) +{ + memset(cmd, 0, sizeof(*cmd)); + cmd->data[0] = devid; + CMD_SET_TYPE(cmd, CMD_INV_DEV_ENTRY); +} + /* * Writes the command to the IOMMUs command buffer and informs the * hardware about the new command. Must be called with iommu->lock held. @@ -533,10 +540,7 @@ static int iommu_flush_device(struct device *dev) devid = get_device_id(dev); iommu = amd_iommu_rlookup_table[devid]; - /* Build command */ - memset(&cmd, 0, sizeof(cmd)); - CMD_SET_TYPE(&cmd, CMD_INV_DEV_ENTRY); - cmd.data[0] = devid; + build_inv_dte(&cmd, devid); return iommu_queue_command(iommu, &cmd); } -- cgit v1.1 From 11b6402c6673b530fac9920c5640c75e99fee956 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 6 Apr 2011 11:49:28 +0200 Subject: x86/amd-iommu: Cleanup inv_pages command handling This patch reworks the processing of invalidate-pages commands to the IOMMU. The function building the the command is extended so we can get rid of another function. It was also renamed to match with the other function names. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 83 ++++++++++++++++++++------------------------- 1 file changed, 36 insertions(+), 47 deletions(-) diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 4e5631a..f8ec28e 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -397,6 +397,37 @@ static void build_inv_dte(struct iommu_cmd *cmd, u16 devid) CMD_SET_TYPE(cmd, CMD_INV_DEV_ENTRY); } +static void build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address, + size_t size, u16 domid, int pde) +{ + u64 pages; + int s; + + pages = iommu_num_pages(address, size, PAGE_SIZE); + s = 0; + + if (pages > 1) { + /* + * If we have to flush more than one page, flush all + * TLB entries for this domain + */ + address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS; + s = 1; + } + + address &= PAGE_MASK; + + memset(cmd, 0, sizeof(*cmd)); + cmd->data[1] |= domid; + cmd->data[2] = lower_32_bits(address); + cmd->data[3] = upper_32_bits(address); + CMD_SET_TYPE(cmd, CMD_INV_IOMMU_PAGES); + if (s) /* size bit - we flush more than one 4kb page */ + cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK; + if (pde) /* PDE bit - we wan't flush everything not only the PTEs */ + cmd->data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK; +} + /* * Writes the command to the IOMMUs command buffer and informs the * hardware about the new command. Must be called with iommu->lock held. @@ -545,37 +576,6 @@ static int iommu_flush_device(struct device *dev) return iommu_queue_command(iommu, &cmd); } -static void __iommu_build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address, - u16 domid, int pde, int s) -{ - memset(cmd, 0, sizeof(*cmd)); - address &= PAGE_MASK; - CMD_SET_TYPE(cmd, CMD_INV_IOMMU_PAGES); - cmd->data[1] |= domid; - cmd->data[2] = lower_32_bits(address); - cmd->data[3] = upper_32_bits(address); - if (s) /* size bit - we flush more than one 4kb page */ - cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK; - if (pde) /* PDE bit - we wan't flush everything not only the PTEs */ - cmd->data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK; -} - -/* - * Generic command send function for invalidaing TLB entries - */ -static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu, - u64 address, u16 domid, int pde, int s) -{ - struct iommu_cmd cmd; - int ret; - - __iommu_build_inv_iommu_pages(&cmd, address, domid, pde, s); - - ret = iommu_queue_command(iommu, &cmd); - - return ret; -} - /* * TLB invalidation function which is called from the mapping functions. * It invalidates a single PTE if the range to flush is within a single @@ -584,20 +584,10 @@ static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu, static void __iommu_flush_pages(struct protection_domain *domain, u64 address, size_t size, int pde) { - int s = 0, i; - unsigned long pages = iommu_num_pages(address, size, PAGE_SIZE); - - address &= PAGE_MASK; - - if (pages > 1) { - /* - * If we have to flush more than one page, flush all - * TLB entries for this domain - */ - address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS; - s = 1; - } + struct iommu_cmd cmd; + int ret = 0, i; + build_inv_iommu_pages(&cmd, address, size, domain->id, pde); for (i = 0; i < amd_iommus_present; ++i) { if (!domain->dev_iommu[i]) @@ -607,11 +597,10 @@ static void __iommu_flush_pages(struct protection_domain *domain, * Devices of this domain are behind this IOMMU * We need a TLB flush */ - iommu_queue_inv_iommu_pages(amd_iommus[i], address, - domain->id, pde, s); + ret |= iommu_queue_command(amd_iommus[i], &cmd); } - return; + WARN_ON(ret); } static void iommu_flush_pages(struct protection_domain *domain, -- cgit v1.1 From 3fe14ab541cd9b0d1f243afb7556046f12c8743c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 5 Apr 2011 00:23:47 +0200 Subject: x86-32, numa: Fix failure condition check in alloc_remap() node_remap_{start|end}_vaddr[] describe [start, end) ranges; however, alloc_remap() incorrectly failed when the current allocation + size equaled the end but it should fail only when it goes over. Fix it. Signed-off-by: Tejun Heo Link: http://lkml.kernel.org/r/1301955840-7246-2-git-send-email-tj@kernel.org Acked-by: Yinghai Lu Cc: David Rientjes Signed-off-by: H. Peter Anvin --- arch/x86/mm/numa_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index bde3906..84aac47 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -200,7 +200,7 @@ void *alloc_remap(int nid, unsigned long size) size = ALIGN(size, L1_CACHE_BYTES); - if (!allocation || (allocation + size) >= node_remap_end_vaddr[nid]) + if (!allocation || (allocation + size) > node_remap_end_vaddr[nid]) return NULL; node_remap_alloc_vaddr[nid] += size; -- cgit v1.1 From a6c24f7a705d939ddd2fcaa443fa3d8e852b933d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 5 Apr 2011 00:23:48 +0200 Subject: x86-32, numa: Align pgdat size while initializing alloc_remap When pgdat is reserved in init_remap_allocator(), PAGE_SIZE aligned size will be used. Match the size alignment in initialization to avoid allocation failure down the road. Signed-off-by: Tejun Heo Link: http://lkml.kernel.org/r/1301955840-7246-3-git-send-email-tj@kernel.org Acked-by: Yinghai Lu Cc: David Rientjes Signed-off-by: H. Peter Anvin --- arch/x86/mm/numa_32.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index 84aac47..50e8250 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -287,7 +287,8 @@ static __init unsigned long calculate_numa_remap_pages(void) node_end_pfn[nid] = max_pfn; /* ensure the remap includes space for the pgdat. */ - size = node_remap_size[nid] + sizeof(pg_data_t); + size = node_remap_size[nid]; + size += ALIGN(sizeof(pg_data_t), PAGE_SIZE); /* convert size to large (pmd size) pages, rounding up */ size = (size + LARGE_PAGE_BYTES - 1) / LARGE_PAGE_BYTES; -- cgit v1.1 From 5b8443b25c0f323ec190d094e4b441957b02664e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 5 Apr 2011 00:23:49 +0200 Subject: x86-32, numa: Remove redundant top-down alloc code from remap initialization memblock_find_in_range() now does top-down allocation by default, so there's no reason for its callers to explicitly implement it by gradually lowering the start address. Remove redundant top-down allocation logic from init_meminit() and calculate_numa_remap_pages(). Signed-off-by: Tejun Heo Link: http://lkml.kernel.org/r/1301955840-7246-4-git-send-email-tj@kernel.org Acked-by: Yinghai Lu Cc: David Rientjes Signed-off-by: H. Peter Anvin --- arch/x86/mm/numa_32.c | 43 ++++++++++++++----------------------------- 1 file changed, 14 insertions(+), 29 deletions(-) diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index 50e8250..60701a5 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -270,8 +270,7 @@ static __init unsigned long calculate_numa_remap_pages(void) unsigned long size, reserve_pages = 0; for_each_online_node(nid) { - u64 node_kva_target; - u64 node_kva_final; + u64 node_kva; /* * The acpi/srat node info can show hot-add memroy zones @@ -295,19 +294,11 @@ static __init unsigned long calculate_numa_remap_pages(void) /* now the roundup is correct, convert to PAGE_SIZE pages */ size = size * PTRS_PER_PTE; - node_kva_target = round_down(node_end_pfn[nid] - size, - PTRS_PER_PTE); - node_kva_target <<= PAGE_SHIFT; - do { - node_kva_final = memblock_find_in_range(node_kva_target, + node_kva = memblock_find_in_range(node_start_pfn[nid] << PAGE_SHIFT, ((u64)node_end_pfn[nid])<>PAGE_SHIFT) > (node_start_pfn[nid])); - - if (node_kva_final == MEMBLOCK_ERROR) + ((u64)size)<>PAGE_SHIFT); + size, nid, node_kva >> PAGE_SHIFT); /* * prevent kva address below max_low_pfn want it on system @@ -328,11 +319,11 @@ static __init unsigned long calculate_numa_remap_pages(void) * to use it as free. * So memblock_x86_reserve_range here, hope we don't run out of that array */ - memblock_x86_reserve_range(node_kva_final, - node_kva_final+(((u64)size)<>PAGE_SHIFT; + node_remap_start_pfn[nid] = node_kva >> PAGE_SHIFT; } printk(KERN_INFO "Reserving total of %lx pages for numa KVA remap\n", reserve_pages); @@ -356,7 +347,6 @@ static void init_remap_allocator(int nid) void __init initmem_init(void) { int nid; - long kva_target_pfn; /* * When mapping a NUMA machine we allocate the node_mem_map arrays @@ -371,15 +361,10 @@ void __init initmem_init(void) kva_pages = roundup(calculate_numa_remap_pages(), PTRS_PER_PTE); - kva_target_pfn = round_down(max_low_pfn - kva_pages, PTRS_PER_PTE); - do { - kva_start_pfn = memblock_find_in_range(kva_target_pfn<> PAGE_SHIFT; - kva_target_pfn -= PTRS_PER_PTE; - } while (kva_start_pfn == MEMBLOCK_ERROR && kva_target_pfn > min_low_pfn); - + kva_start_pfn = memblock_find_in_range(min_low_pfn << PAGE_SHIFT, + max_low_pfn << PAGE_SHIFT, + kva_pages << PAGE_SHIFT, + PTRS_PER_PTE << PAGE_SHIFT) >> PAGE_SHIFT; if (kva_start_pfn == MEMBLOCK_ERROR) panic("Can not get kva space\n"); -- cgit v1.1 From 5510db9c1be111528ce46c57f0bec1c9dce258f4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 5 Apr 2011 00:23:50 +0200 Subject: x86-32, numa: Reorganize calculate_numa_remap_page() Separate the outer node walking loop and per-node logic from calculate_numa_remap_pages(). The outer loop is collapsed into initmem_init() and the per-node logic is moved into a new function - init_alloc_remap(). The new function name is confusing with the existing init_remap_allocator() and the behavior is the function isn't very clean either at this point, but this is to prepare for further cleanups and it will become prettier. This function doesn't introduce any behavior change. Signed-off-by: Tejun Heo Link: http://lkml.kernel.org/r/1301955840-7246-5-git-send-email-tj@kernel.org Acked-by: Yinghai Lu Cc: David Rientjes Signed-off-by: H. Peter Anvin --- arch/x86/mm/numa_32.c | 125 +++++++++++++++++++++++++------------------------- 1 file changed, 62 insertions(+), 63 deletions(-) diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index 60701a5..5039e9b 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -264,70 +264,64 @@ void resume_map_numa_kva(pgd_t *pgd_base) } #endif -static __init unsigned long calculate_numa_remap_pages(void) +static __init unsigned long init_alloc_remap(int nid, unsigned long offset) { - int nid; - unsigned long size, reserve_pages = 0; + unsigned long size; + u64 node_kva; - for_each_online_node(nid) { - u64 node_kva; - - /* - * The acpi/srat node info can show hot-add memroy zones - * where memory could be added but not currently present. - */ - printk(KERN_DEBUG "node %d pfn: [%lx - %lx]\n", - nid, node_start_pfn[nid], node_end_pfn[nid]); - if (node_start_pfn[nid] > max_pfn) - continue; - if (!node_end_pfn[nid]) - continue; - if (node_end_pfn[nid] > max_pfn) - node_end_pfn[nid] = max_pfn; - - /* ensure the remap includes space for the pgdat. */ - size = node_remap_size[nid]; - size += ALIGN(sizeof(pg_data_t), PAGE_SIZE); - - /* convert size to large (pmd size) pages, rounding up */ - size = (size + LARGE_PAGE_BYTES - 1) / LARGE_PAGE_BYTES; - /* now the roundup is correct, convert to PAGE_SIZE pages */ - size = size * PTRS_PER_PTE; - - node_kva = memblock_find_in_range(node_start_pfn[nid] << PAGE_SHIFT, - ((u64)node_end_pfn[nid])<> PAGE_SHIFT); - - /* - * prevent kva address below max_low_pfn want it on system - * with less memory later. - * layout will be: KVA address , KVA RAM - * - * we are supposed to only record the one less then max_low_pfn - * but we could have some hole in high memory, and it will only - * check page_is_ram(pfn) && !page_is_reserved_early(pfn) to decide - * to use it as free. - * So memblock_x86_reserve_range here, hope we don't run out of that array - */ - memblock_x86_reserve_range(node_kva, - node_kva + (((u64)size)<> PAGE_SHIFT; - } - printk(KERN_INFO "Reserving total of %lx pages for numa KVA remap\n", - reserve_pages); - return reserve_pages; + /* + * The acpi/srat node info can show hot-add memroy zones where + * memory could be added but not currently present. + */ + printk(KERN_DEBUG "node %d pfn: [%lx - %lx]\n", + nid, node_start_pfn[nid], node_end_pfn[nid]); + if (node_start_pfn[nid] > max_pfn) + return 0; + if (!node_end_pfn[nid]) + return 0; + if (node_end_pfn[nid] > max_pfn) + node_end_pfn[nid] = max_pfn; + + /* ensure the remap includes space for the pgdat. */ + size = node_remap_size[nid]; + size += ALIGN(sizeof(pg_data_t), PAGE_SIZE); + + /* convert size to large (pmd size) pages, rounding up */ + size = (size + LARGE_PAGE_BYTES - 1) / LARGE_PAGE_BYTES; + /* now the roundup is correct, convert to PAGE_SIZE pages */ + size = size * PTRS_PER_PTE; + + node_kva = memblock_find_in_range(node_start_pfn[nid] << PAGE_SHIFT, + (u64)node_end_pfn[nid] << PAGE_SHIFT, + (u64)size << PAGE_SHIFT, + LARGE_PAGE_BYTES); + if (node_kva == MEMBLOCK_ERROR) + panic("Can not get kva ram\n"); + + node_remap_size[nid] = size; + node_remap_offset[nid] = offset; + printk(KERN_DEBUG "Reserving %ld pages of KVA for lmem_map of node %d at %llx\n", + size, nid, node_kva >> PAGE_SHIFT); + + /* + * prevent kva address below max_low_pfn want it on system + * with less memory later. + * layout will be: KVA address , KVA RAM + * + * we are supposed to only record the one less then + * max_low_pfn but we could have some hole in high memory, + * and it will only check page_is_ram(pfn) && + * !page_is_reserved_early(pfn) to decide to use it as free. + * So memblock_x86_reserve_range here, hope we don't run out + * of that array + */ + memblock_x86_reserve_range(node_kva, + node_kva + ((u64)size << PAGE_SHIFT), + "KVA RAM"); + + node_remap_start_pfn[nid] = node_kva >> PAGE_SHIFT; + + return size; } static void init_remap_allocator(int nid) @@ -346,6 +340,7 @@ static void init_remap_allocator(int nid) void __init initmem_init(void) { + unsigned long reserve_pages = 0; int nid; /* @@ -359,7 +354,11 @@ void __init initmem_init(void) get_memcfg_numa(); numa_init_array(); - kva_pages = roundup(calculate_numa_remap_pages(), PTRS_PER_PTE); + for_each_online_node(nid) + reserve_pages += init_alloc_remap(nid, reserve_pages); + kva_pages = roundup(reserve_pages, PTRS_PER_PTE); + printk(KERN_INFO "Reserving total of %lx pages for numa KVA remap\n", + reserve_pages); kva_start_pfn = memblock_find_in_range(min_low_pfn << PAGE_SHIFT, max_low_pfn << PAGE_SHIFT, -- cgit v1.1 From c4d4f577d49c441ab4f1bb6068247dafb366e635 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 5 Apr 2011 00:23:51 +0200 Subject: x86-32, numa: Rename @node_kva to @node_pa in init_alloc_remap() init_alloc_remap() is about to do more and using _kva suffix for physical address becomes confusing because the function will be handling both physical and virtual addresses. Rename @node_kva to @node_pa. This is trivial rename and doesn't cause any behavior difference. Signed-off-by: Tejun Heo Link: http://lkml.kernel.org/r/1301955840-7246-6-git-send-email-tj@kernel.org Signed-off-by: H. Peter Anvin --- arch/x86/mm/numa_32.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index 5039e9b..30933fe 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -267,7 +267,7 @@ void resume_map_numa_kva(pgd_t *pgd_base) static __init unsigned long init_alloc_remap(int nid, unsigned long offset) { unsigned long size; - u64 node_kva; + u64 node_pa; /* * The acpi/srat node info can show hot-add memroy zones where @@ -291,17 +291,17 @@ static __init unsigned long init_alloc_remap(int nid, unsigned long offset) /* now the roundup is correct, convert to PAGE_SIZE pages */ size = size * PTRS_PER_PTE; - node_kva = memblock_find_in_range(node_start_pfn[nid] << PAGE_SHIFT, - (u64)node_end_pfn[nid] << PAGE_SHIFT, - (u64)size << PAGE_SHIFT, - LARGE_PAGE_BYTES); - if (node_kva == MEMBLOCK_ERROR) + node_pa = memblock_find_in_range(node_start_pfn[nid] << PAGE_SHIFT, + (u64)node_end_pfn[nid] << PAGE_SHIFT, + (u64)size << PAGE_SHIFT, + LARGE_PAGE_BYTES); + if (node_pa == MEMBLOCK_ERROR) panic("Can not get kva ram\n"); node_remap_size[nid] = size; node_remap_offset[nid] = offset; printk(KERN_DEBUG "Reserving %ld pages of KVA for lmem_map of node %d at %llx\n", - size, nid, node_kva >> PAGE_SHIFT); + size, nid, node_pa >> PAGE_SHIFT); /* * prevent kva address below max_low_pfn want it on system @@ -315,11 +315,10 @@ static __init unsigned long init_alloc_remap(int nid, unsigned long offset) * So memblock_x86_reserve_range here, hope we don't run out * of that array */ - memblock_x86_reserve_range(node_kva, - node_kva + ((u64)size << PAGE_SHIFT), + memblock_x86_reserve_range(node_pa, node_pa + ((u64)size << PAGE_SHIFT), "KVA RAM"); - node_remap_start_pfn[nid] = node_kva >> PAGE_SHIFT; + node_remap_start_pfn[nid] = node_pa >> PAGE_SHIFT; return size; } -- cgit v1.1 From af7c1a6e8374e05aab4a98ce4d2fb07b66506a02 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 5 Apr 2011 00:23:52 +0200 Subject: x86-32, numa: Make @size in init_aloc_remap() represent bytes @size variable in init_alloc_remap() is confusing in that it starts as number of bytes as its name implies and then becomes number of pages. Make it consistently represent bytes. Signed-off-by: Tejun Heo Link: http://lkml.kernel.org/r/1301955840-7246-7-git-send-email-tj@kernel.org Acked-by: Yinghai Lu Cc: David Rientjes Signed-off-by: H. Peter Anvin --- arch/x86/mm/numa_32.c | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index 30933fe..99310d2 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -286,22 +286,19 @@ static __init unsigned long init_alloc_remap(int nid, unsigned long offset) size = node_remap_size[nid]; size += ALIGN(sizeof(pg_data_t), PAGE_SIZE); - /* convert size to large (pmd size) pages, rounding up */ - size = (size + LARGE_PAGE_BYTES - 1) / LARGE_PAGE_BYTES; - /* now the roundup is correct, convert to PAGE_SIZE pages */ - size = size * PTRS_PER_PTE; + /* align to large page */ + size = ALIGN(size, LARGE_PAGE_BYTES); node_pa = memblock_find_in_range(node_start_pfn[nid] << PAGE_SHIFT, (u64)node_end_pfn[nid] << PAGE_SHIFT, - (u64)size << PAGE_SHIFT, - LARGE_PAGE_BYTES); + size, LARGE_PAGE_BYTES); if (node_pa == MEMBLOCK_ERROR) panic("Can not get kva ram\n"); - node_remap_size[nid] = size; + node_remap_size[nid] = size >> PAGE_SHIFT; node_remap_offset[nid] = offset; printk(KERN_DEBUG "Reserving %ld pages of KVA for lmem_map of node %d at %llx\n", - size, nid, node_pa >> PAGE_SHIFT); + size >> PAGE_SHIFT, nid, node_pa >> PAGE_SHIFT); /* * prevent kva address below max_low_pfn want it on system @@ -315,12 +312,11 @@ static __init unsigned long init_alloc_remap(int nid, unsigned long offset) * So memblock_x86_reserve_range here, hope we don't run out * of that array */ - memblock_x86_reserve_range(node_pa, node_pa + ((u64)size << PAGE_SHIFT), - "KVA RAM"); + memblock_x86_reserve_range(node_pa, node_pa + size, "KVA RAM"); node_remap_start_pfn[nid] = node_pa >> PAGE_SHIFT; - return size; + return size >> PAGE_SHIFT; } static void init_remap_allocator(int nid) -- cgit v1.1 From 7210cf9217937e470a9acbc113a590f476b9c047 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 5 Apr 2011 00:23:53 +0200 Subject: x86-32, numa: Calculate remap size in common code Only pgdat and memmap use remap area and there isn't much benefit in allowing per-node override. In addition, the use of node_remap_size[] is confusing in that it contains number of bytes before remap initialization and then number of pages afterwards. Move remap size calculation for memap from specific NUMA config implementations to init_alloc_remap() and make node_remap_size[] static. The only behavior difference is that, before this patch, numaq_32 didn't consider max_pfn when calculating the memmap size but it's enforced after this patch, which is the right thing to do. Signed-off-by: Tejun Heo Link: http://lkml.kernel.org/r/1301955840-7246-8-git-send-email-tj@kernel.org Acked-by: Yinghai Lu Cc: David Rientjes Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/topology.h | 1 - arch/x86/kernel/apic/numaq_32.c | 4 ---- arch/x86/mm/numa_32.c | 10 ++++------ arch/x86/mm/srat_32.c | 1 - 4 files changed, 4 insertions(+), 12 deletions(-) diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 910a708..8dba769 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -95,7 +95,6 @@ extern void setup_node_to_cpumask_map(void); #ifdef CONFIG_X86_32 extern unsigned long node_start_pfn[]; extern unsigned long node_end_pfn[]; -extern unsigned long node_remap_size[]; #define node_has_online_mem(nid) (node_start_pfn[nid] != node_end_pfn[nid]) # define SD_CACHE_NICE_TRIES 1 diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index 6273eee..0aced70 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -93,10 +93,6 @@ static inline void numaq_register_node(int node, struct sys_cfg_data *scd) node_end_pfn[node]); memory_present(node, node_start_pfn[node], node_end_pfn[node]); - - node_remap_size[node] = node_memmap_size_bytes(node, - node_start_pfn[node], - node_end_pfn[node]); } /* diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index 99310d2..9a73365 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -104,7 +104,7 @@ extern unsigned long highend_pfn, highstart_pfn; #define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE) -unsigned long node_remap_size[MAX_NUMNODES]; +static unsigned long node_remap_size[MAX_NUMNODES]; static void *node_remap_start_vaddr[MAX_NUMNODES]; void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); @@ -129,7 +129,6 @@ int __init get_memcfg_numa_flat(void) node_end_pfn[0] = max_pfn; memblock_x86_register_active_regions(0, 0, max_pfn); memory_present(0, 0, max_pfn); - node_remap_size[0] = node_memmap_size_bytes(0, 0, max_pfn); /* Indicate there is one node available. */ nodes_clear(node_online_map); @@ -282,11 +281,10 @@ static __init unsigned long init_alloc_remap(int nid, unsigned long offset) if (node_end_pfn[nid] > max_pfn) node_end_pfn[nid] = max_pfn; - /* ensure the remap includes space for the pgdat. */ - size = node_remap_size[nid]; + /* calculate the necessary space aligned to large page size */ + size = node_memmap_size_bytes(nid, node_start_pfn[nid], + min(node_end_pfn[nid], max_pfn)); size += ALIGN(sizeof(pg_data_t), PAGE_SIZE); - - /* align to large page */ size = ALIGN(size, LARGE_PAGE_BYTES); node_pa = memblock_find_in_range(node_start_pfn[nid] << PAGE_SHIFT, diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c index 48651c6..1b9e82c 100644 --- a/arch/x86/mm/srat_32.c +++ b/arch/x86/mm/srat_32.c @@ -276,7 +276,6 @@ int __init get_memcfg_from_srat(void) unsigned long end = min(node_end_pfn[nid], max_pfn); memory_present(nid, start, end); - node_remap_size[nid] = node_memmap_size_bytes(nid, start, end); } return 1; out_fail: -- cgit v1.1 From 82044c328d6f6b22882c2a936e487e6d2240817a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 5 Apr 2011 00:23:54 +0200 Subject: x86-32, numa: Make init_alloc_remap() less panicky Remap allocator failure isn't fatal. The callers are required to fall back to regular early memory allocation mechanisms on failure anyway, so there's no reason to panic on remap init failure. Whining and returning are enough. Signed-off-by: Tejun Heo Link: http://lkml.kernel.org/r/1301955840-7246-9-git-send-email-tj@kernel.org Acked-by: Yinghai Lu Cc: David Rientjes Signed-off-by: H. Peter Anvin --- arch/x86/mm/numa_32.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index 9a73365..c127543 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -290,8 +290,11 @@ static __init unsigned long init_alloc_remap(int nid, unsigned long offset) node_pa = memblock_find_in_range(node_start_pfn[nid] << PAGE_SHIFT, (u64)node_end_pfn[nid] << PAGE_SHIFT, size, LARGE_PAGE_BYTES); - if (node_pa == MEMBLOCK_ERROR) - panic("Can not get kva ram\n"); + if (node_pa == MEMBLOCK_ERROR) { + pr_warning("remap_alloc: failed to allocate %lu bytes for node %d\n", + size, nid); + return 0; + } node_remap_size[nid] = size >> PAGE_SHIFT; node_remap_offset[nid] = offset; -- cgit v1.1 From 0e9f93c1c04c8ab10cc564df54a7ad0f83c67796 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 5 Apr 2011 00:23:55 +0200 Subject: x86-32, numa: Move lowmem address space reservation to init_alloc_remap() Remap alloc init is done in the following stages. 1. init_alloc_remap() calculates how much memory is necessary for each node and reserves node local memory. 2. initmem_init() collects how much each node needs and reserves a single contiguous lowmem area which can contain all. 3. init_remap_allocator() initializes allocator parameters from the determined lowmem address and per-node offsets. 4. Actual remap happens. There is no reason for the lowmem remap area to be reserved as a single contiguous area at one go. They don't interact with each other and the memblock allocator will put them side-by-side anyway. This patch breaks up the single lowmem address reservation and put per-node lowmem address reservation into init_alloc_remap() and initializes allocator parameters directly in the function as all the addresses are determined there. This merges steps 2 and 3 into 1. While at it, remove now largely irrelevant comments in init_alloc_remap(). This change causes the following behavior changes. * Remap lowmem areas are allocated in smaller per-node chunks. * Remap lowmem area reservation failure fail future remap allocations instead of panicking. * Remap allocator initialization is less verbose. Signed-off-by: Tejun Heo Link: http://lkml.kernel.org/r/1301955840-7246-10-git-send-email-tj@kernel.org Acked-by: Yinghai Lu Cc: David Rientjes Signed-off-by: H. Peter Anvin --- arch/x86/mm/numa_32.c | 82 ++++++++++++++++----------------------------------- 1 file changed, 25 insertions(+), 57 deletions(-) diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index c127543..12bb34c 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -108,9 +108,6 @@ static unsigned long node_remap_size[MAX_NUMNODES]; static void *node_remap_start_vaddr[MAX_NUMNODES]; void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); -static unsigned long kva_start_pfn; -static unsigned long kva_pages; - int __cpuinit numa_cpu_node(int cpu) { return apic->x86_32_numa_cpu_node(cpu); @@ -266,7 +263,8 @@ void resume_map_numa_kva(pgd_t *pgd_base) static __init unsigned long init_alloc_remap(int nid, unsigned long offset) { unsigned long size; - u64 node_pa; + u64 node_pa, remap_pa; + void *remap_va; /* * The acpi/srat node info can show hot-add memroy zones where @@ -287,6 +285,7 @@ static __init unsigned long init_alloc_remap(int nid, unsigned long offset) size += ALIGN(sizeof(pg_data_t), PAGE_SIZE); size = ALIGN(size, LARGE_PAGE_BYTES); + /* allocate node memory and the lowmem remap area */ node_pa = memblock_find_in_range(node_start_pfn[nid] << PAGE_SHIFT, (u64)node_end_pfn[nid] << PAGE_SHIFT, size, LARGE_PAGE_BYTES); @@ -295,45 +294,35 @@ static __init unsigned long init_alloc_remap(int nid, unsigned long offset) size, nid); return 0; } + memblock_x86_reserve_range(node_pa, node_pa + size, "KVA RAM"); + + remap_pa = memblock_find_in_range(min_low_pfn << PAGE_SHIFT, + max_low_pfn << PAGE_SHIFT, + size, LARGE_PAGE_BYTES); + if (remap_pa == MEMBLOCK_ERROR) { + pr_warning("remap_alloc: failed to allocate %lu bytes remap area for node %d\n", + size, nid); + memblock_x86_free_range(node_pa, node_pa + size); + return 0; + } + memblock_x86_reserve_range(remap_pa, remap_pa + size, "KVA PG"); + remap_va = phys_to_virt(remap_pa); + /* initialize remap allocator parameters */ + node_remap_start_pfn[nid] = node_pa >> PAGE_SHIFT; node_remap_size[nid] = size >> PAGE_SHIFT; node_remap_offset[nid] = offset; - printk(KERN_DEBUG "Reserving %ld pages of KVA for lmem_map of node %d at %llx\n", - size >> PAGE_SHIFT, nid, node_pa >> PAGE_SHIFT); - /* - * prevent kva address below max_low_pfn want it on system - * with less memory later. - * layout will be: KVA address , KVA RAM - * - * we are supposed to only record the one less then - * max_low_pfn but we could have some hole in high memory, - * and it will only check page_is_ram(pfn) && - * !page_is_reserved_early(pfn) to decide to use it as free. - * So memblock_x86_reserve_range here, hope we don't run out - * of that array - */ - memblock_x86_reserve_range(node_pa, node_pa + size, "KVA RAM"); + node_remap_start_vaddr[nid] = remap_va; + node_remap_end_vaddr[nid] = remap_va + size; + node_remap_alloc_vaddr[nid] = remap_va + ALIGN(sizeof(pg_data_t), PAGE_SIZE); - node_remap_start_pfn[nid] = node_pa >> PAGE_SHIFT; + printk(KERN_DEBUG "remap_alloc: node %d [%08llx-%08llx) -> [%p-%p)\n", + nid, node_pa, node_pa + size, remap_va, remap_va + size); return size >> PAGE_SHIFT; } -static void init_remap_allocator(int nid) -{ - node_remap_start_vaddr[nid] = pfn_to_kaddr( - kva_start_pfn + node_remap_offset[nid]); - node_remap_end_vaddr[nid] = node_remap_start_vaddr[nid] + - (node_remap_size[nid] * PAGE_SIZE); - node_remap_alloc_vaddr[nid] = node_remap_start_vaddr[nid] + - ALIGN(sizeof(pg_data_t), PAGE_SIZE); - - printk(KERN_DEBUG "node %d will remap to vaddr %08lx - %08lx\n", nid, - (ulong) node_remap_start_vaddr[nid], - (ulong) node_remap_end_vaddr[nid]); -} - void __init initmem_init(void) { unsigned long reserve_pages = 0; @@ -352,25 +341,7 @@ void __init initmem_init(void) for_each_online_node(nid) reserve_pages += init_alloc_remap(nid, reserve_pages); - kva_pages = roundup(reserve_pages, PTRS_PER_PTE); - printk(KERN_INFO "Reserving total of %lx pages for numa KVA remap\n", - reserve_pages); - - kva_start_pfn = memblock_find_in_range(min_low_pfn << PAGE_SHIFT, - max_low_pfn << PAGE_SHIFT, - kva_pages << PAGE_SHIFT, - PTRS_PER_PTE << PAGE_SHIFT) >> PAGE_SHIFT; - if (kva_start_pfn == MEMBLOCK_ERROR) - panic("Can not get kva space\n"); - - printk(KERN_INFO "kva_start_pfn ~ %lx max_low_pfn ~ %lx\n", - kva_start_pfn, max_low_pfn); - printk(KERN_INFO "max_pfn = %lx\n", max_pfn); - - /* avoid clash with initrd */ - memblock_x86_reserve_range(kva_start_pfn< max_low_pfn) @@ -390,11 +361,8 @@ void __init initmem_init(void) printk(KERN_DEBUG "Low memory ends at vaddr %08lx\n", (ulong) pfn_to_kaddr(max_low_pfn)); - for_each_online_node(nid) { - init_remap_allocator(nid); - + for_each_online_node(nid) allocate_pgdat(nid); - } remap_numa_kva(); printk(KERN_DEBUG "High memory starts at vaddr %08lx\n", -- cgit v1.1 From 2a286344f06d6341740b284494379373e87648f7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 5 Apr 2011 00:23:56 +0200 Subject: x86-32, numa: Move remapping for remap allocator into init_alloc_remap() There's no reason to perform the actual remapping separately. Collapse remap_numa_kva() into init_alloc_remap() and, while at it, make it less verbose. Signed-off-by: Tejun Heo Link: http://lkml.kernel.org/r/1301955840-7246-11-git-send-email-tj@kernel.org Acked-by: Yinghai Lu Cc: David Rientjes Signed-off-by: H. Peter Anvin --- arch/x86/mm/numa_32.c | 29 +++++++---------------------- 1 file changed, 7 insertions(+), 22 deletions(-) diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index 12bb34c..53ec13a 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -205,26 +205,6 @@ void *alloc_remap(int nid, unsigned long size) return allocation; } -static void __init remap_numa_kva(void) -{ - void *vaddr; - unsigned long pfn; - int node; - - for_each_online_node(node) { - printk(KERN_DEBUG "remap_numa_kva: node %d\n", node); - for (pfn=0; pfn < node_remap_size[node]; pfn += PTRS_PER_PTE) { - vaddr = node_remap_start_vaddr[node]+(pfn<> PAGE_SHIFT; pfn += PTRS_PER_PTE) + set_pmd_pfn((unsigned long)remap_va + (pfn << PAGE_SHIFT), + (node_pa >> PAGE_SHIFT) + pfn, + PAGE_KERNEL_LARGE); + /* initialize remap allocator parameters */ node_remap_start_pfn[nid] = node_pa >> PAGE_SHIFT; node_remap_size[nid] = size >> PAGE_SHIFT; @@ -363,7 +349,6 @@ void __init initmem_init(void) (ulong) pfn_to_kaddr(max_low_pfn)); for_each_online_node(nid) allocate_pgdat(nid); - remap_numa_kva(); printk(KERN_DEBUG "High memory starts at vaddr %08lx\n", (ulong) pfn_to_kaddr(highstart_pfn)); -- cgit v1.1 From b2e3e4fa3eee752b893687783f2a427106c93423 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 5 Apr 2011 00:23:57 +0200 Subject: x86-32, numa: Make pgdat allocation use alloc_remap() pgdat allocation is handled differnetly from other remap allocations - it's reserved during initialization. There's no reason to handle this any differnetly. Remap allocator is initialized for every node and if init failed the allocation will fail and pgdat allocation can fall back to generic code like anyone else. Remove special init-time pgdat reservation and make allocate_pgdat() use alloc_remap() like everyone else. Signed-off-by: Tejun Heo Link: http://lkml.kernel.org/r/1301955840-7246-12-git-send-email-tj@kernel.org Acked-by: Yinghai Lu Cc: David Rientjes Signed-off-by: H. Peter Anvin --- arch/x86/mm/numa_32.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index 53ec13a..0184a9f 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -160,9 +160,8 @@ static void __init allocate_pgdat(int nid) { char buf[16]; - if (node_has_online_mem(nid) && node_remap_start_vaddr[nid]) - NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid]; - else { + NODE_DATA(nid) = alloc_remap(nid, ALIGN(sizeof(pg_data_t), PAGE_SIZE)); + if (!NODE_DATA(nid)) { unsigned long pgdat_phys; pgdat_phys = memblock_find_in_range(min_low_pfn< [%p-%p)\n", nid, node_pa, node_pa + size, remap_va, remap_va + size); -- cgit v1.1 From 1d85b61baf0334dd6bb88261bec42b808204d694 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 5 Apr 2011 00:23:58 +0200 Subject: x86-32, numa: Remove now useless node_remap_offset[] With lowmem address reservation moved into init_alloc_remap(), node_remap_offset[] is no longer useful. Remove it and related offset handling code. Signed-off-by: Tejun Heo Link: http://lkml.kernel.org/r/1301955840-7246-13-git-send-email-tj@kernel.org Acked-by: Yinghai Lu Cc: David Rientjes Signed-off-by: H. Peter Anvin --- arch/x86/mm/numa_32.c | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index 0184a9f..960ea7b 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -187,7 +187,6 @@ static void __init allocate_pgdat(int nid) static unsigned long node_remap_start_pfn[MAX_NUMNODES]; static void *node_remap_end_vaddr[MAX_NUMNODES]; static void *node_remap_alloc_vaddr[MAX_NUMNODES]; -static unsigned long node_remap_offset[MAX_NUMNODES]; void *alloc_remap(int nid, unsigned long size) { @@ -239,7 +238,7 @@ void resume_map_numa_kva(pgd_t *pgd_base) } #endif -static __init unsigned long init_alloc_remap(int nid, unsigned long offset) +static __init void init_alloc_remap(int nid) { unsigned long size, pfn; u64 node_pa, remap_pa; @@ -252,9 +251,9 @@ static __init unsigned long init_alloc_remap(int nid, unsigned long offset) printk(KERN_DEBUG "node %d pfn: [%lx - %lx]\n", nid, node_start_pfn[nid], node_end_pfn[nid]); if (node_start_pfn[nid] > max_pfn) - return 0; + return; if (!node_end_pfn[nid]) - return 0; + return; if (node_end_pfn[nid] > max_pfn) node_end_pfn[nid] = max_pfn; @@ -271,7 +270,7 @@ static __init unsigned long init_alloc_remap(int nid, unsigned long offset) if (node_pa == MEMBLOCK_ERROR) { pr_warning("remap_alloc: failed to allocate %lu bytes for node %d\n", size, nid); - return 0; + return; } memblock_x86_reserve_range(node_pa, node_pa + size, "KVA RAM"); @@ -282,7 +281,7 @@ static __init unsigned long init_alloc_remap(int nid, unsigned long offset) pr_warning("remap_alloc: failed to allocate %lu bytes remap area for node %d\n", size, nid); memblock_x86_free_range(node_pa, node_pa + size); - return 0; + return; } memblock_x86_reserve_range(remap_pa, remap_pa + size, "KVA PG"); remap_va = phys_to_virt(remap_pa); @@ -296,7 +295,6 @@ static __init unsigned long init_alloc_remap(int nid, unsigned long offset) /* initialize remap allocator parameters */ node_remap_start_pfn[nid] = node_pa >> PAGE_SHIFT; node_remap_size[nid] = size >> PAGE_SHIFT; - node_remap_offset[nid] = offset; node_remap_start_vaddr[nid] = remap_va; node_remap_end_vaddr[nid] = remap_va + size; @@ -304,13 +302,10 @@ static __init unsigned long init_alloc_remap(int nid, unsigned long offset) printk(KERN_DEBUG "remap_alloc: node %d [%08llx-%08llx) -> [%p-%p)\n", nid, node_pa, node_pa + size, remap_va, remap_va + size); - - return size >> PAGE_SHIFT; } void __init initmem_init(void) { - unsigned long reserve_pages = 0; int nid; /* @@ -325,7 +320,7 @@ void __init initmem_init(void) numa_init_array(); for_each_online_node(nid) - reserve_pages += init_alloc_remap(nid, reserve_pages); + init_alloc_remap(nid); #ifdef CONFIG_HIGHMEM highstart_pfn = highend_pfn = max_pfn; -- cgit v1.1 From 198bd06bbfde2984027e91f64c55eb19a7034a27 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 5 Apr 2011 00:23:59 +0200 Subject: x86-32, numa: Remove redundant node_remap_size[] Remap area size can be determined from node_remap_start_vaddr[] and node_remap_end_vaddr[] making node_remap_size[] redundant. Remove it. While at it, make resume_map_numa_kva() use @nr_pages for number of pages instead of @size. Signed-off-by: Tejun Heo Link: http://lkml.kernel.org/r/1301955840-7246-14-git-send-email-tj@kernel.org Acked-by: Yinghai Lu Cc: David Rientjes Signed-off-by: H. Peter Anvin --- arch/x86/mm/numa_32.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index 960ea7b..f325e6f 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -104,7 +104,6 @@ extern unsigned long highend_pfn, highstart_pfn; #define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE) -static unsigned long node_remap_size[MAX_NUMNODES]; static void *node_remap_start_vaddr[MAX_NUMNODES]; void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); @@ -214,15 +213,16 @@ void resume_map_numa_kva(pgd_t *pgd_base) int node; for_each_online_node(node) { - unsigned long start_va, start_pfn, size, pfn; + unsigned long start_va, start_pfn, nr_pages, pfn; start_va = (unsigned long)node_remap_start_vaddr[node]; start_pfn = node_remap_start_pfn[node]; - size = node_remap_size[node]; + nr_pages = (node_remap_end_vaddr[node] - + node_remap_start_vaddr[node]) >> PAGE_SHIFT; printk(KERN_DEBUG "%s: node %d\n", __func__, node); - for (pfn = 0; pfn < size; pfn += PTRS_PER_PTE) { + for (pfn = 0; pfn < nr_pages; pfn += PTRS_PER_PTE) { unsigned long vaddr = start_va + (pfn << PAGE_SHIFT); pgd_t *pgd = pgd_base + pgd_index(vaddr); pud_t *pud = pud_offset(pgd, vaddr); @@ -294,8 +294,6 @@ static __init void init_alloc_remap(int nid) /* initialize remap allocator parameters */ node_remap_start_pfn[nid] = node_pa >> PAGE_SHIFT; - node_remap_size[nid] = size >> PAGE_SHIFT; - node_remap_start_vaddr[nid] = remap_va; node_remap_end_vaddr[nid] = remap_va + size; node_remap_alloc_vaddr[nid] = remap_va; -- cgit v1.1 From 993ba1585cbb03fab012e41d1a5d24330a283b31 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 5 Apr 2011 00:24:00 +0200 Subject: x86-32, numa: Update remap allocator comments Now that remap allocator is cleaned up, update comments such that they are in docbook function description format and reflect the actual implementation. Signed-off-by: Tejun Heo Link: http://lkml.kernel.org/r/1301955840-7246-15-git-send-email-tj@kernel.org Acked-by: Yinghai Lu Cc: David Rientjes Signed-off-by: H. Peter Anvin --- arch/x86/mm/numa_32.c | 56 ++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 42 insertions(+), 14 deletions(-) diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index f325e6f..c757c0a 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -176,17 +176,31 @@ static void __init allocate_pgdat(int nid) } /* - * In the DISCONTIGMEM and SPARSEMEM memory model, a portion of the kernel - * virtual address space (KVA) is reserved and portions of nodes are mapped - * using it. This is to allow node-local memory to be allocated for - * structures that would normally require ZONE_NORMAL. The memory is - * allocated with alloc_remap() and callers should be prepared to allocate - * from the bootmem allocator instead. + * Remap memory allocator */ static unsigned long node_remap_start_pfn[MAX_NUMNODES]; static void *node_remap_end_vaddr[MAX_NUMNODES]; static void *node_remap_alloc_vaddr[MAX_NUMNODES]; +/** + * alloc_remap - Allocate remapped memory + * @nid: NUMA node to allocate memory from + * @size: The size of allocation + * + * Allocate @size bytes from the remap area of NUMA node @nid. The + * size of the remap area is predetermined by init_alloc_remap() and + * only the callers considered there should call this function. For + * more info, please read the comment on top of init_alloc_remap(). + * + * The caller must be ready to handle allocation failure from this + * function and fall back to regular memory allocator in such cases. + * + * CONTEXT: + * Single CPU early boot context. + * + * RETURNS: + * Pointer to the allocated memory on success, %NULL on failure. + */ void *alloc_remap(int nid, unsigned long size) { void *allocation = node_remap_alloc_vaddr[nid]; @@ -238,6 +252,28 @@ void resume_map_numa_kva(pgd_t *pgd_base) } #endif +/** + * init_alloc_remap - Initialize remap allocator for a NUMA node + * @nid: NUMA node to initizlie remap allocator for + * + * NUMA nodes may end up without any lowmem. As allocating pgdat and + * memmap on a different node with lowmem is inefficient, a special + * remap allocator is implemented which can be used by alloc_remap(). + * + * For each node, the amount of memory which will be necessary for + * pgdat and memmap is calculated and two memory areas of the size are + * allocated - one in the node and the other in lowmem; then, the area + * in the node is remapped to the lowmem area. + * + * As pgdat and memmap must be allocated in lowmem anyway, this + * doesn't waste lowmem address space; however, the actual lowmem + * which gets remapped over is wasted. The amount shouldn't be + * problematic on machines this feature will be used. + * + * Initialization failure isn't fatal. alloc_remap() is used + * opportunistically and the callers will fall back to other memory + * allocation mechanisms on failure. + */ static __init void init_alloc_remap(int nid) { unsigned long size, pfn; @@ -306,14 +342,6 @@ void __init initmem_init(void) { int nid; - /* - * When mapping a NUMA machine we allocate the node_mem_map arrays - * from node local memory. They are then mapped directly into KVA - * between zone normal and vmalloc space. Calculate the size of - * this space and use it to adjust the boundary between ZONE_NORMAL - * and ZONE_HIGHMEM. - */ - get_memcfg_numa(); numa_init_array(); -- cgit v1.1 From 815b33fdc279d34ab40a8bfe1866623a4cc5669b Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 6 Apr 2011 17:26:49 +0200 Subject: x86/amd-iommu: Cleanup completion-wait handling This patch cleans up the implementation of completion-wait command sending. It also switches the completion indicator from the MMIO bit to a memory store which can be checked without IOMMU locking. As a side effect this patch makes the __iommu_queue_command function obsolete and so it is removed too. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 107 ++++++++++++-------------------------------- 1 file changed, 28 insertions(+), 79 deletions(-) diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index f8ec28e..073c64b 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -34,7 +35,7 @@ #define CMD_SET_TYPE(cmd, t) ((cmd)->data[1] |= ((t) << 28)) -#define EXIT_LOOP_COUNT 10000000 +#define LOOP_TIMEOUT 100000 static DEFINE_RWLOCK(amd_iommu_devtable_lock); @@ -383,10 +384,14 @@ irqreturn_t amd_iommu_int_handler(int irq, void *data) * ****************************************************************************/ -static void build_completion_wait(struct iommu_cmd *cmd) +static void build_completion_wait(struct iommu_cmd *cmd, u64 address) { + WARN_ON(address & 0x7ULL); + memset(cmd, 0, sizeof(*cmd)); - cmd->data[0] = CMD_COMPL_WAIT_INT_MASK; + cmd->data[0] = lower_32_bits(__pa(address)) | CMD_COMPL_WAIT_STORE_MASK; + cmd->data[1] = upper_32_bits(__pa(address)); + cmd->data[2] = 1; CMD_SET_TYPE(cmd, CMD_COMPL_WAIT); } @@ -432,12 +437,14 @@ static void build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address, * Writes the command to the IOMMUs command buffer and informs the * hardware about the new command. Must be called with iommu->lock held. */ -static int __iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd) +static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd) { + unsigned long flags; u32 tail, head; u8 *target; WARN_ON(iommu->cmd_buf_size & CMD_BUFFER_UNINITIALIZED); + spin_lock_irqsave(&iommu->lock, flags); tail = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET); target = iommu->cmd_buf + tail; memcpy_toio(target, cmd, sizeof(*cmd)); @@ -446,99 +453,41 @@ static int __iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd) if (tail == head) return -ENOMEM; writel(tail, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET); - - return 0; -} - -/* - * General queuing function for commands. Takes iommu->lock and calls - * __iommu_queue_command(). - */ -static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd) -{ - unsigned long flags; - int ret; - - spin_lock_irqsave(&iommu->lock, flags); - ret = __iommu_queue_command(iommu, cmd); - if (!ret) - iommu->need_sync = true; + iommu->need_sync = true; spin_unlock_irqrestore(&iommu->lock, flags); - return ret; -} - -/* - * This function waits until an IOMMU has completed a completion - * wait command - */ -static void __iommu_wait_for_completion(struct amd_iommu *iommu) -{ - int ready = 0; - unsigned status = 0; - unsigned long i = 0; - - INC_STATS_COUNTER(compl_wait); - - while (!ready && (i < EXIT_LOOP_COUNT)) { - ++i; - /* wait for the bit to become one */ - status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET); - ready = status & MMIO_STATUS_COM_WAIT_INT_MASK; - } - - /* set bit back to zero */ - status &= ~MMIO_STATUS_COM_WAIT_INT_MASK; - writel(status, iommu->mmio_base + MMIO_STATUS_OFFSET); - - if (unlikely(i == EXIT_LOOP_COUNT)) - iommu->reset_in_progress = true; + return 0; } /* * This function queues a completion wait command into the command * buffer of an IOMMU */ -static int __iommu_completion_wait(struct amd_iommu *iommu) -{ - struct iommu_cmd cmd; - - build_completion_wait(&cmd); - - return __iommu_queue_command(iommu, &cmd); -} - -/* - * This function is called whenever we need to ensure that the IOMMU has - * completed execution of all commands we sent. It sends a - * COMPLETION_WAIT command and waits for it to finish. The IOMMU informs - * us about that by writing a value to a physical address we pass with - * the command. - */ static int iommu_completion_wait(struct amd_iommu *iommu) { - int ret = 0; - unsigned long flags; - - spin_lock_irqsave(&iommu->lock, flags); + struct iommu_cmd cmd; + volatile u64 sem = 0; + int ret, i = 0; if (!iommu->need_sync) - goto out; - - ret = __iommu_completion_wait(iommu); + return 0; - iommu->need_sync = false; + build_completion_wait(&cmd, (u64)&sem); + ret = iommu_queue_command(iommu, &cmd); if (ret) - goto out; - - __iommu_wait_for_completion(iommu); + return ret; -out: - spin_unlock_irqrestore(&iommu->lock, flags); + while (sem == 0 && i < LOOP_TIMEOUT) { + udelay(1); + i += 1; + } - if (iommu->reset_in_progress) + if (i == LOOP_TIMEOUT) { + pr_alert("AMD-Vi: Completion-Wait loop timed out\n"); + iommu->reset_in_progress = true; reset_iommu_command_buffer(iommu); + } return 0; } -- cgit v1.1 From 61985a040f17c03b09a2772508ee02729571365b Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 6 Apr 2011 17:46:49 +0200 Subject: x86/amd-iommu: Remove command buffer resetting logic The logic to reset the command buffer caused more problems than it actually helped. The logic jumped in when the IOMMU hardware doesn't execute commands anymore but the reasons for this are usually not fixed by just resetting the command buffer. So the code can be removed to reduce complexity. Signed-off-by: Joerg Roedel --- arch/x86/include/asm/amd_iommu_types.h | 3 --- arch/x86/kernel/amd_iommu.c | 20 +------------------- 2 files changed, 1 insertion(+), 22 deletions(-) diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h index e3509fc..878ae00 100644 --- a/arch/x86/include/asm/amd_iommu_types.h +++ b/arch/x86/include/asm/amd_iommu_types.h @@ -409,9 +409,6 @@ struct amd_iommu { /* if one, we need to send a completion wait command */ bool need_sync; - /* becomes true if a command buffer reset is running */ - bool reset_in_progress; - /* default dma_ops domain for that IOMMU */ struct dma_ops_domain *default_dom; diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 073c64b..0147c5c 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -58,7 +58,6 @@ struct iommu_cmd { u32 data[4]; }; -static void reset_iommu_command_buffer(struct amd_iommu *iommu); static void update_domain(struct protection_domain *domain); /**************************************************************************** @@ -323,8 +322,6 @@ static void iommu_print_event(struct amd_iommu *iommu, void *__evt) break; case EVENT_TYPE_ILL_CMD: printk("ILLEGAL_COMMAND_ERROR address=0x%016llx]\n", address); - iommu->reset_in_progress = true; - reset_iommu_command_buffer(iommu); dump_command(address); break; case EVENT_TYPE_CMD_HARD_ERR: @@ -485,8 +482,7 @@ static int iommu_completion_wait(struct amd_iommu *iommu) if (i == LOOP_TIMEOUT) { pr_alert("AMD-Vi: Completion-Wait loop timed out\n"); - iommu->reset_in_progress = true; - reset_iommu_command_buffer(iommu); + ret = -EIO; } return 0; @@ -628,20 +624,6 @@ void amd_iommu_flush_all_domains(void) spin_unlock_irqrestore(&amd_iommu_pd_lock, flags); } -static void reset_iommu_command_buffer(struct amd_iommu *iommu) -{ - pr_err("AMD-Vi: Resetting IOMMU command buffer\n"); - - if (iommu->reset_in_progress) - panic("AMD-Vi: ILLEGAL_COMMAND_ERROR while resetting command buffer\n"); - - amd_iommu_reset_cmd_buffer(iommu); - amd_iommu_flush_all_devices(); - amd_iommu_flush_all_domains(); - - iommu->reset_in_progress = false; -} - /**************************************************************************** * * The functions below are used the create the page table mappings for -- cgit v1.1 From 17b124bf1463582005d662d4dd95f037ad863c57 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 6 Apr 2011 18:01:35 +0200 Subject: x86/amd-iommu: Rename iommu_flush* to domain_flush* These functions all operate on protection domains and not on singe IOMMUs. Represent that in their name. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 87 +++++++++++++++++++++++---------------------- 1 file changed, 44 insertions(+), 43 deletions(-) diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 0147c5c..9d66b20 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -488,22 +488,6 @@ static int iommu_completion_wait(struct amd_iommu *iommu) return 0; } -static void iommu_flush_complete(struct protection_domain *domain) -{ - int i; - - for (i = 0; i < amd_iommus_present; ++i) { - if (!domain->dev_iommu[i]) - continue; - - /* - * Devices of this domain are behind this IOMMU - * We need to wait for completion of all commands. - */ - iommu_completion_wait(amd_iommus[i]); - } -} - /* * Command send function for invalidating a device table entry */ @@ -526,8 +510,8 @@ static int iommu_flush_device(struct device *dev) * It invalidates a single PTE if the range to flush is within a single * page. Otherwise it flushes the whole TLB of the IOMMU. */ -static void __iommu_flush_pages(struct protection_domain *domain, - u64 address, size_t size, int pde) +static void __domain_flush_pages(struct protection_domain *domain, + u64 address, size_t size, int pde) { struct iommu_cmd cmd; int ret = 0, i; @@ -548,29 +532,45 @@ static void __iommu_flush_pages(struct protection_domain *domain, WARN_ON(ret); } -static void iommu_flush_pages(struct protection_domain *domain, - u64 address, size_t size) +static void domain_flush_pages(struct protection_domain *domain, + u64 address, size_t size) { - __iommu_flush_pages(domain, address, size, 0); + __domain_flush_pages(domain, address, size, 0); } /* Flush the whole IO/TLB for a given protection domain */ -static void iommu_flush_tlb(struct protection_domain *domain) +static void domain_flush_tlb(struct protection_domain *domain) { - __iommu_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 0); + __domain_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 0); } /* Flush the whole IO/TLB for a given protection domain - including PDE */ -static void iommu_flush_tlb_pde(struct protection_domain *domain) +static void domain_flush_tlb_pde(struct protection_domain *domain) { - __iommu_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 1); + __domain_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 1); +} + +static void domain_flush_complete(struct protection_domain *domain) +{ + int i; + + for (i = 0; i < amd_iommus_present; ++i) { + if (!domain->dev_iommu[i]) + continue; + + /* + * Devices of this domain are behind this IOMMU + * We need to wait for completion of all commands. + */ + iommu_completion_wait(amd_iommus[i]); + } } /* * This function flushes the DTEs for all devices in domain */ -static void iommu_flush_domain_devices(struct protection_domain *domain) +static void domain_flush_devices(struct protection_domain *domain) { struct iommu_dev_data *dev_data; unsigned long flags; @@ -591,8 +591,8 @@ static void iommu_flush_all_domain_devices(void) spin_lock_irqsave(&amd_iommu_pd_lock, flags); list_for_each_entry(domain, &amd_iommu_pd_list, list) { - iommu_flush_domain_devices(domain); - iommu_flush_complete(domain); + domain_flush_devices(domain); + domain_flush_complete(domain); } spin_unlock_irqrestore(&amd_iommu_pd_lock, flags); @@ -616,8 +616,8 @@ void amd_iommu_flush_all_domains(void) list_for_each_entry(domain, &amd_iommu_pd_list, list) { spin_lock(&domain->lock); - iommu_flush_tlb_pde(domain); - iommu_flush_complete(domain); + domain_flush_tlb_pde(domain); + domain_flush_complete(domain); spin_unlock(&domain->lock); } @@ -1480,7 +1480,7 @@ static int attach_device(struct device *dev, * left the caches in the IOMMU dirty. So we have to flush * here to evict all dirty stuff. */ - iommu_flush_tlb_pde(domain); + domain_flush_tlb_pde(domain); return ret; } @@ -1693,8 +1693,9 @@ static void update_domain(struct protection_domain *domain) return; update_device_table(domain); - iommu_flush_domain_devices(domain); - iommu_flush_tlb_pde(domain); + + domain_flush_devices(domain); + domain_flush_tlb_pde(domain); domain->updated = false; } @@ -1853,10 +1854,10 @@ retry: ADD_STATS_COUNTER(alloced_io_mem, size); if (unlikely(dma_dom->need_flush && !amd_iommu_unmap_flush)) { - iommu_flush_tlb(&dma_dom->domain); + domain_flush_tlb(&dma_dom->domain); dma_dom->need_flush = false; } else if (unlikely(amd_iommu_np_cache)) - iommu_flush_pages(&dma_dom->domain, address, size); + domain_flush_pages(&dma_dom->domain, address, size); out: return address; @@ -1905,7 +1906,7 @@ static void __unmap_single(struct dma_ops_domain *dma_dom, dma_ops_free_addresses(dma_dom, dma_addr, pages); if (amd_iommu_unmap_flush || dma_dom->need_flush) { - iommu_flush_pages(&dma_dom->domain, flush_addr, size); + domain_flush_pages(&dma_dom->domain, flush_addr, size); dma_dom->need_flush = false; } } @@ -1941,7 +1942,7 @@ static dma_addr_t map_page(struct device *dev, struct page *page, if (addr == DMA_ERROR_CODE) goto out; - iommu_flush_complete(domain); + domain_flush_complete(domain); out: spin_unlock_irqrestore(&domain->lock, flags); @@ -1968,7 +1969,7 @@ static void unmap_page(struct device *dev, dma_addr_t dma_addr, size_t size, __unmap_single(domain->priv, dma_addr, size, dir); - iommu_flush_complete(domain); + domain_flush_complete(domain); spin_unlock_irqrestore(&domain->lock, flags); } @@ -2033,7 +2034,7 @@ static int map_sg(struct device *dev, struct scatterlist *sglist, goto unmap; } - iommu_flush_complete(domain); + domain_flush_complete(domain); out: spin_unlock_irqrestore(&domain->lock, flags); @@ -2079,7 +2080,7 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist, s->dma_address = s->dma_length = 0; } - iommu_flush_complete(domain); + domain_flush_complete(domain); spin_unlock_irqrestore(&domain->lock, flags); } @@ -2129,7 +2130,7 @@ static void *alloc_coherent(struct device *dev, size_t size, goto out_free; } - iommu_flush_complete(domain); + domain_flush_complete(domain); spin_unlock_irqrestore(&domain->lock, flags); @@ -2161,7 +2162,7 @@ static void free_coherent(struct device *dev, size_t size, __unmap_single(domain->priv, dma_addr, size, DMA_BIDIRECTIONAL); - iommu_flush_complete(domain); + domain_flush_complete(domain); spin_unlock_irqrestore(&domain->lock, flags); @@ -2471,7 +2472,7 @@ static int amd_iommu_unmap(struct iommu_domain *dom, unsigned long iova, unmap_size = iommu_unmap_page(domain, iova, page_size); mutex_unlock(&domain->api_lock); - iommu_flush_tlb_pde(domain); + domain_flush_tlb_pde(domain); return get_order(unmap_size); } -- cgit v1.1 From ac0ea6e92b2227c86fe4f7f9eb429071d617a25d Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 6 Apr 2011 18:38:20 +0200 Subject: x86/amd-iommu: Improve handling of full command buffer This patch improved the handling of commands when the IOMMU command buffer is nearly full. In this case it issues an completion wait command and waits until the IOMMU has processed it before continuing queuing new commands. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 88 +++++++++++++++++++++++++++++++++------------ 1 file changed, 65 insertions(+), 23 deletions(-) diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 9d66b20..75c7f8c 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -381,6 +381,39 @@ irqreturn_t amd_iommu_int_handler(int irq, void *data) * ****************************************************************************/ +static int wait_on_sem(volatile u64 *sem) +{ + int i = 0; + + while (*sem == 0 && i < LOOP_TIMEOUT) { + udelay(1); + i += 1; + } + + if (i == LOOP_TIMEOUT) { + pr_alert("AMD-Vi: Completion-Wait loop timed out\n"); + return -EIO; + } + + return 0; +} + +static void copy_cmd_to_buffer(struct amd_iommu *iommu, + struct iommu_cmd *cmd, + u32 tail) +{ + u8 *target; + + target = iommu->cmd_buf + tail; + tail = (tail + sizeof(*cmd)) % iommu->cmd_buf_size; + + /* Copy command to buffer */ + memcpy(target, cmd, sizeof(*cmd)); + + /* Tell the IOMMU about it */ + writel(tail, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET); +} + static void build_completion_wait(struct iommu_cmd *cmd, u64 address) { WARN_ON(address & 0x7ULL); @@ -432,25 +465,44 @@ static void build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address, /* * Writes the command to the IOMMUs command buffer and informs the - * hardware about the new command. Must be called with iommu->lock held. + * hardware about the new command. */ static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd) { + u32 left, tail, head, next_tail; unsigned long flags; - u32 tail, head; - u8 *target; WARN_ON(iommu->cmd_buf_size & CMD_BUFFER_UNINITIALIZED); + +again: spin_lock_irqsave(&iommu->lock, flags); - tail = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET); - target = iommu->cmd_buf + tail; - memcpy_toio(target, cmd, sizeof(*cmd)); - tail = (tail + sizeof(*cmd)) % iommu->cmd_buf_size; - head = readl(iommu->mmio_base + MMIO_CMD_HEAD_OFFSET); - if (tail == head) - return -ENOMEM; - writel(tail, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET); + + head = readl(iommu->mmio_base + MMIO_CMD_HEAD_OFFSET); + tail = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET); + next_tail = (tail + sizeof(*cmd)) % iommu->cmd_buf_size; + left = (head - next_tail) % iommu->cmd_buf_size; + + if (left <= 2) { + struct iommu_cmd sync_cmd; + volatile u64 sem = 0; + int ret; + + build_completion_wait(&sync_cmd, (u64)&sem); + copy_cmd_to_buffer(iommu, &sync_cmd, tail); + + spin_unlock_irqrestore(&iommu->lock, flags); + + if ((ret = wait_on_sem(&sem)) != 0) + return ret; + + goto again; + } + + copy_cmd_to_buffer(iommu, cmd, tail); + + /* We need to sync now to make sure all commands are processed */ iommu->need_sync = true; + spin_unlock_irqrestore(&iommu->lock, flags); return 0; @@ -464,7 +516,7 @@ static int iommu_completion_wait(struct amd_iommu *iommu) { struct iommu_cmd cmd; volatile u64 sem = 0; - int ret, i = 0; + int ret; if (!iommu->need_sync) return 0; @@ -475,17 +527,7 @@ static int iommu_completion_wait(struct amd_iommu *iommu) if (ret) return ret; - while (sem == 0 && i < LOOP_TIMEOUT) { - udelay(1); - i += 1; - } - - if (i == LOOP_TIMEOUT) { - pr_alert("AMD-Vi: Completion-Wait loop timed out\n"); - ret = -EIO; - } - - return 0; + return wait_on_sem(&sem); } /* -- cgit v1.1 From d8c13085775c72e2d46edc54ed0c803c3a944ddb Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 6 Apr 2011 18:51:26 +0200 Subject: x86/amd-iommu: Rename iommu_flush_device This function operates on a struct device, so give it a name that represents that. As a side effect a new function is introduced which operates on am iommu and a device-id. It will be used again in a later patch. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 75c7f8c..3557f22 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -530,21 +530,27 @@ static int iommu_completion_wait(struct amd_iommu *iommu) return wait_on_sem(&sem); } +static int iommu_flush_dte(struct amd_iommu *iommu, u16 devid) +{ + struct iommu_cmd cmd; + + build_inv_dte(&cmd, devid); + + return iommu_queue_command(iommu, &cmd); +} + /* * Command send function for invalidating a device table entry */ -static int iommu_flush_device(struct device *dev) +static int device_flush_dte(struct device *dev) { struct amd_iommu *iommu; - struct iommu_cmd cmd; u16 devid; devid = get_device_id(dev); iommu = amd_iommu_rlookup_table[devid]; - build_inv_dte(&cmd, devid); - - return iommu_queue_command(iommu, &cmd); + return iommu_flush_dte(iommu, devid); } /* @@ -620,7 +626,7 @@ static void domain_flush_devices(struct protection_domain *domain) spin_lock_irqsave(&domain->lock, flags); list_for_each_entry(dev_data, &domain->dev_list, list) - iommu_flush_device(dev_data->dev); + device_flush_dte(dev_data->dev); spin_unlock_irqrestore(&domain->lock, flags); } @@ -1424,7 +1430,7 @@ static void do_attach(struct device *dev, struct protection_domain *domain) domain->dev_cnt += 1; /* Flush the DTE entry */ - iommu_flush_device(dev); + device_flush_dte(dev); } static void do_detach(struct device *dev) @@ -1447,7 +1453,7 @@ static void do_detach(struct device *dev) clear_dte_entry(devid); /* Flush the DTE entry */ - iommu_flush_device(dev); + device_flush_dte(dev); } /* @@ -1663,7 +1669,7 @@ static int device_change_notifier(struct notifier_block *nb, goto out; } - iommu_flush_device(dev); + device_flush_dte(dev); iommu_completion_wait(iommu); out: @@ -2448,7 +2454,7 @@ static void amd_iommu_detach_device(struct iommu_domain *dom, if (!iommu) return; - iommu_flush_device(dev); + device_flush_dte(dev); iommu_completion_wait(iommu); } -- cgit v1.1 From 7d0c5cc5be73f7ce26fdcca7b8ec2203f661eb93 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 7 Apr 2011 08:16:10 +0200 Subject: x86/amd-iommu: Flush all internal TLBs when IOMMUs are enabled The old code only flushed a DTE or a domain TLB before it is actually used by the IOMMU driver. While this is efficient and works when done right it is more likely to introduce new bugs when changing code (which happened in the past). This patch adds code to flush all DTEs and all domain TLBs in each IOMMU right after it is enabled (at boot and after resume). This reduces the complexity of the driver and makes it less likely to introduce stale-TLB bugs in the future. Signed-off-by: Joerg Roedel --- arch/x86/include/asm/amd_iommu_proto.h | 2 - arch/x86/kernel/amd_iommu.c | 75 +++++++++++++++------------------- arch/x86/kernel/amd_iommu_init.c | 11 ++++- 3 files changed, 43 insertions(+), 45 deletions(-) diff --git a/arch/x86/include/asm/amd_iommu_proto.h b/arch/x86/include/asm/amd_iommu_proto.h index 916bc81..1223c0f 100644 --- a/arch/x86/include/asm/amd_iommu_proto.h +++ b/arch/x86/include/asm/amd_iommu_proto.h @@ -24,8 +24,6 @@ struct amd_iommu; extern int amd_iommu_init_dma_ops(void); extern int amd_iommu_init_passthrough(void); extern irqreturn_t amd_iommu_int_handler(int irq, void *data); -extern void amd_iommu_flush_all_domains(void); -extern void amd_iommu_flush_all_devices(void); extern void amd_iommu_apply_erratum_63(u16 devid); extern void amd_iommu_reset_cmd_buffer(struct amd_iommu *iommu); extern int amd_iommu_init_devices(void); diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 3557f22..bcf58ea 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -539,6 +539,40 @@ static int iommu_flush_dte(struct amd_iommu *iommu, u16 devid) return iommu_queue_command(iommu, &cmd); } +static void iommu_flush_dte_all(struct amd_iommu *iommu) +{ + u32 devid; + + for (devid = 0; devid <= 0xffff; ++devid) + iommu_flush_dte(iommu, devid); + + iommu_completion_wait(iommu); +} + +/* + * This function uses heavy locking and may disable irqs for some time. But + * this is no issue because it is only called during resume. + */ +static void iommu_flush_tlb_all(struct amd_iommu *iommu) +{ + u32 dom_id; + + for (dom_id = 0; dom_id <= 0xffff; ++dom_id) { + struct iommu_cmd cmd; + build_inv_iommu_pages(&cmd, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, + dom_id, 1); + iommu_queue_command(iommu, &cmd); + } + + iommu_completion_wait(iommu); +} + +void iommu_flush_all_caches(struct amd_iommu *iommu) +{ + iommu_flush_dte_all(iommu); + iommu_flush_tlb_all(iommu); +} + /* * Command send function for invalidating a device table entry */ @@ -631,47 +665,6 @@ static void domain_flush_devices(struct protection_domain *domain) spin_unlock_irqrestore(&domain->lock, flags); } -static void iommu_flush_all_domain_devices(void) -{ - struct protection_domain *domain; - unsigned long flags; - - spin_lock_irqsave(&amd_iommu_pd_lock, flags); - - list_for_each_entry(domain, &amd_iommu_pd_list, list) { - domain_flush_devices(domain); - domain_flush_complete(domain); - } - - spin_unlock_irqrestore(&amd_iommu_pd_lock, flags); -} - -void amd_iommu_flush_all_devices(void) -{ - iommu_flush_all_domain_devices(); -} - -/* - * This function uses heavy locking and may disable irqs for some time. But - * this is no issue because it is only called during resume. - */ -void amd_iommu_flush_all_domains(void) -{ - struct protection_domain *domain; - unsigned long flags; - - spin_lock_irqsave(&amd_iommu_pd_lock, flags); - - list_for_each_entry(domain, &amd_iommu_pd_list, list) { - spin_lock(&domain->lock); - domain_flush_tlb_pde(domain); - domain_flush_complete(domain); - spin_unlock(&domain->lock); - } - - spin_unlock_irqrestore(&amd_iommu_pd_lock, flags); -} - /**************************************************************************** * * The functions below are used the create the page table mappings for diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 246d727..8848dda 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -180,6 +180,12 @@ static u32 dev_table_size; /* size of the device table */ static u32 alias_table_size; /* size of the alias table */ static u32 rlookup_table_size; /* size if the rlookup table */ +/* + * This function flushes all internal caches of + * the IOMMU used by this driver. + */ +extern void iommu_flush_all_caches(struct amd_iommu *iommu); + static inline void update_last_devid(u16 devid) { if (devid > amd_iommu_last_bdf) @@ -1244,6 +1250,7 @@ static void enable_iommus(void) iommu_set_exclusion_range(iommu); iommu_init_msi(iommu); iommu_enable(iommu); + iommu_flush_all_caches(iommu); } } @@ -1274,8 +1281,8 @@ static void amd_iommu_resume(void) * we have to flush after the IOMMUs are enabled because a * disabled IOMMU will never execute the commands we send */ - amd_iommu_flush_all_devices(); - amd_iommu_flush_all_domains(); + for_each_iommu(iommu) + iommu_flush_all_caches(iommu); } static int amd_iommu_suspend(void) -- cgit v1.1 From ba4b87ad5497cba555954885db99c99627f93748 Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Thu, 31 Mar 2011 08:08:09 -0400 Subject: dma-debug: print information about leaked entry When driver leak dma mapping, print additional information about one of leaked entries, to to help investigate problem. Patch should be useful for debugging drivers, which maps many different class of buffers. Signed-off-by: Stanislaw Gruszka Signed-off-by: Joerg Roedel --- lib/dma-debug.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/lib/dma-debug.c b/lib/dma-debug.c index 4bfb047..db07bfd 100644 --- a/lib/dma-debug.c +++ b/lib/dma-debug.c @@ -649,7 +649,7 @@ out_err: return -ENOMEM; } -static int device_dma_allocations(struct device *dev) +static int device_dma_allocations(struct device *dev, struct dma_debug_entry **out_entry) { struct dma_debug_entry *entry; unsigned long flags; @@ -660,8 +660,10 @@ static int device_dma_allocations(struct device *dev) for (i = 0; i < HASH_SIZE; ++i) { spin_lock(&dma_entry_hash[i].lock); list_for_each_entry(entry, &dma_entry_hash[i].list, list) { - if (entry->dev == dev) + if (entry->dev == dev) { count += 1; + *out_entry = entry; + } } spin_unlock(&dma_entry_hash[i].lock); } @@ -674,6 +676,7 @@ static int device_dma_allocations(struct device *dev) static int dma_debug_device_change(struct notifier_block *nb, unsigned long action, void *data) { struct device *dev = data; + struct dma_debug_entry *uninitialized_var(entry); int count; if (global_disable) @@ -681,12 +684,17 @@ static int dma_debug_device_change(struct notifier_block *nb, unsigned long acti switch (action) { case BUS_NOTIFY_UNBOUND_DRIVER: - count = device_dma_allocations(dev); + count = device_dma_allocations(dev, &entry); if (count == 0) break; - err_printk(dev, NULL, "DMA-API: device driver has pending " + err_printk(dev, entry, "DMA-API: device driver has pending " "DMA allocations while released from device " - "[count=%d]\n", count); + "[count=%d]\n" + "One of leaked entries details: " + "[device address=0x%016llx] [size=%llu bytes] " + "[mapped with %s] [mapped as %s]\n", + count, entry->dev_addr, entry->size, + dir2name[entry->direction], type2name[entry->type]); break; default: break; -- cgit v1.1 From ce9c99af8d4b3b0b9463654fd252d8640d804dc3 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Fri, 8 Apr 2011 07:42:29 +0100 Subject: x86, cpu: Move AMD Elan Kconfig under "Processor family" Currently the option resides under X86_EXTENDED_PLATFORM due to historical nonstandard A20M# handling. However that is no longer the case and so Elan can be treated as part of the standard processor choice Kconfig option. Signed-off-by: Ian Campbell Link: http://lkml.kernel.org/r/1302245177.31620.47.camel@localhost.localdomain Cc: H. Peter Anvin Signed-off-by: H. Peter Anvin --- arch/x86/Kconfig | 11 ----------- arch/x86/Kconfig.cpu | 16 ++++++++++------ arch/x86/Makefile_32.cpu | 2 +- arch/x86/include/asm/module.h | 2 +- arch/x86/kernel/cpu/cpufreq/Kconfig | 4 ++-- 5 files changed, 14 insertions(+), 21 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index cc6c53a..f00a3f3 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -365,17 +365,6 @@ config X86_UV # Following is an alphabetically sorted list of 32 bit extended platforms # Please maintain the alphabetic order if and when there are additions -config X86_ELAN - bool "AMD Elan" - depends on X86_32 - depends on X86_EXTENDED_PLATFORM - ---help--- - Select this for an AMD Elan processor. - - Do not use this option for K6/Athlon/Opteron processors! - - If unsure, choose "PC-compatible" instead. - config X86_INTEL_CE bool "CE4100 TV platform" depends on PCI diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index d161e93..6a7cfdf 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -1,6 +1,4 @@ # Put here option for CPU selection and depending optimization -if !X86_ELAN - choice prompt "Processor family" default M686 if X86_32 @@ -203,6 +201,14 @@ config MWINCHIP3D stores for this CPU, which can increase performance of some operations. +config MELAN + bool "AMD Elan" + depends on X86_32 + ---help--- + Select this for an AMD Elan processor. + + Do not use this option for K6/Athlon/Opteron processors! + config MGEODEGX1 bool "GeodeGX1" depends on X86_32 @@ -292,8 +298,6 @@ config X86_GENERIC This is really intended for distributors who need more generic optimizations. -endif - # # Define implied options from the CPU selection here config X86_INTERNODE_CACHE_SHIFT @@ -312,7 +316,7 @@ config X86_L1_CACHE_SHIFT int default "7" if MPENTIUM4 || MPSC default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU - default "4" if X86_ELAN || M486 || M386 || MGEODEGX1 + default "4" if MELAN || M486 || M386 || MGEODEGX1 default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX config X86_XADD @@ -358,7 +362,7 @@ config X86_POPAD_OK config X86_ALIGNMENT_16 def_bool y - depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || X86_ELAN || MK6 || M586MMX || M586TSC || M586 || M486 || MVIAC3_2 || MGEODEGX1 + depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MELAN || MK6 || M586MMX || M586TSC || M586 || M486 || MVIAC3_2 || MGEODEGX1 config X86_INTEL_USERCOPY def_bool y diff --git a/arch/x86/Makefile_32.cpu b/arch/x86/Makefile_32.cpu index f2ee1ab..86cee7b 100644 --- a/arch/x86/Makefile_32.cpu +++ b/arch/x86/Makefile_32.cpu @@ -37,7 +37,7 @@ cflags-$(CONFIG_MATOM) += $(call cc-option,-march=atom,$(call cc-option,-march= $(call cc-option,-mtune=atom,$(call cc-option,-mtune=generic)) # AMD Elan support -cflags-$(CONFIG_X86_ELAN) += -march=i486 +cflags-$(CONFIG_MELAN) += -march=i486 # Geode GX1 support cflags-$(CONFIG_MGEODEGX1) += -march=pentium-mmx diff --git a/arch/x86/include/asm/module.h b/arch/x86/include/asm/module.h index 67763c5..9eae775 100644 --- a/arch/x86/include/asm/module.h +++ b/arch/x86/include/asm/module.h @@ -35,7 +35,7 @@ #define MODULE_PROC_FAMILY "K7 " #elif defined CONFIG_MK8 #define MODULE_PROC_FAMILY "K8 " -#elif defined CONFIG_X86_ELAN +#elif defined CONFIG_MELAN #define MODULE_PROC_FAMILY "ELAN " #elif defined CONFIG_MCRUSOE #define MODULE_PROC_FAMILY "CRUSOE " diff --git a/arch/x86/kernel/cpu/cpufreq/Kconfig b/arch/x86/kernel/cpu/cpufreq/Kconfig index 870e6cc..0ab9b22 100644 --- a/arch/x86/kernel/cpu/cpufreq/Kconfig +++ b/arch/x86/kernel/cpu/cpufreq/Kconfig @@ -43,7 +43,7 @@ config X86_ACPI_CPUFREQ config ELAN_CPUFREQ tristate "AMD Elan SC400 and SC410" select CPU_FREQ_TABLE - depends on X86_ELAN + depends on MELAN ---help--- This adds the CPUFreq driver for AMD Elan SC400 and SC410 processors. @@ -59,7 +59,7 @@ config ELAN_CPUFREQ config SC520_CPUFREQ tristate "AMD Elan SC520" select CPU_FREQ_TABLE - depends on X86_ELAN + depends on MELAN ---help--- This adds the CPUFreq driver for AMD Elan SC520 processor. -- cgit v1.1 From 5cdede2408e80f190c5595e592c24e77c1bf44b2 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 4 Apr 2011 15:55:18 +0200 Subject: PCI: Move ATS declarations in seperate header file This patch moves the relevant declarations from the local header file in drivers/pci to a more accessible locations so that it can be used by the AMD IOMMU driver too. The file is named pci-ats.h because support for the PCI PRI capability will also be added there in a later patch-set. Signed-off-by: Joerg Roedel Acked-by: Jesse Barnes --- drivers/pci/intel-iommu.c | 1 + drivers/pci/iov.c | 1 + drivers/pci/pci.h | 37 --------------------------------- include/linux/pci-ats.h | 52 +++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 54 insertions(+), 37 deletions(-) create mode 100644 include/linux/pci-ats.h diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index 7da3bef..fdb2cef 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include #include "pci.h" diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c index 553d8ee..42fae47 100644 --- a/drivers/pci/iov.c +++ b/drivers/pci/iov.c @@ -13,6 +13,7 @@ #include #include #include +#include #include "pci.h" #define VIRTFN_ID_LEN 16 diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index a6ec200..4020025 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -250,15 +250,6 @@ struct pci_sriov { u8 __iomem *mstate; /* VF Migration State Array */ }; -/* Address Translation Service */ -struct pci_ats { - int pos; /* capability position */ - int stu; /* Smallest Translation Unit */ - int qdep; /* Invalidate Queue Depth */ - int ref_cnt; /* Physical Function reference count */ - unsigned int is_enabled:1; /* Enable bit is set */ -}; - #ifdef CONFIG_PCI_IOV extern int pci_iov_init(struct pci_dev *dev); extern void pci_iov_release(struct pci_dev *dev); @@ -269,19 +260,6 @@ extern resource_size_t pci_sriov_resource_alignment(struct pci_dev *dev, extern void pci_restore_iov_state(struct pci_dev *dev); extern int pci_iov_bus_range(struct pci_bus *bus); -extern int pci_enable_ats(struct pci_dev *dev, int ps); -extern void pci_disable_ats(struct pci_dev *dev); -extern int pci_ats_queue_depth(struct pci_dev *dev); -/** - * pci_ats_enabled - query the ATS status - * @dev: the PCI device - * - * Returns 1 if ATS capability is enabled, or 0 if not. - */ -static inline int pci_ats_enabled(struct pci_dev *dev) -{ - return dev->ats && dev->ats->is_enabled; -} #else static inline int pci_iov_init(struct pci_dev *dev) { @@ -304,21 +282,6 @@ static inline int pci_iov_bus_range(struct pci_bus *bus) return 0; } -static inline int pci_enable_ats(struct pci_dev *dev, int ps) -{ - return -ENODEV; -} -static inline void pci_disable_ats(struct pci_dev *dev) -{ -} -static inline int pci_ats_queue_depth(struct pci_dev *dev) -{ - return -ENODEV; -} -static inline int pci_ats_enabled(struct pci_dev *dev) -{ - return 0; -} #endif /* CONFIG_PCI_IOV */ static inline resource_size_t pci_resource_alignment(struct pci_dev *dev, diff --git a/include/linux/pci-ats.h b/include/linux/pci-ats.h new file mode 100644 index 0000000..655824f --- /dev/null +++ b/include/linux/pci-ats.h @@ -0,0 +1,52 @@ +#ifndef LINUX_PCI_ATS_H +#define LINUX_PCI_ATS_H + +/* Address Translation Service */ +struct pci_ats { + int pos; /* capability position */ + int stu; /* Smallest Translation Unit */ + int qdep; /* Invalidate Queue Depth */ + int ref_cnt; /* Physical Function reference count */ + unsigned int is_enabled:1; /* Enable bit is set */ +}; + +#ifdef CONFIG_PCI_IOV + +extern int pci_enable_ats(struct pci_dev *dev, int ps); +extern void pci_disable_ats(struct pci_dev *dev); +extern int pci_ats_queue_depth(struct pci_dev *dev); +/** + * pci_ats_enabled - query the ATS status + * @dev: the PCI device + * + * Returns 1 if ATS capability is enabled, or 0 if not. + */ +static inline int pci_ats_enabled(struct pci_dev *dev) +{ + return dev->ats && dev->ats->is_enabled; +} + +#else /* CONFIG_PCI_IOV */ + +static inline int pci_enable_ats(struct pci_dev *dev, int ps) +{ + return -ENODEV; +} + +static inline void pci_disable_ats(struct pci_dev *dev) +{ +} + +static inline int pci_ats_queue_depth(struct pci_dev *dev) +{ + return -ENODEV; +} + +static inline int pci_ats_enabled(struct pci_dev *dev) +{ + return 0; +} + +#endif /* CONFIG_PCI_IOV */ + +#endif /* LINUX_PCI_ATS_H*/ -- cgit v1.1 From 9844b4e5dd1932e175a23d84ce09702bdf4b5689 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 5 Apr 2011 09:22:56 +0200 Subject: x86/amd-iommu: Select PCI_IOV with AMD IOMMU driver In order to support ATS in the AMD IOMMU driver this patch makes sure that the generic support for ATS is compiled in. Signed-off-by: Joerg Roedel --- arch/x86/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index cc6c53a..8cc29da 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -690,6 +690,7 @@ config AMD_IOMMU bool "AMD IOMMU support" select SWIOTLB select PCI_MSI + select PCI_IOV depends on X86_64 && PCI && ACPI ---help--- With this option you can enable support for AMD IOMMU hardware in -- cgit v1.1 From cb41ed85efa01e633388314c03a4f3004c6b783b Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 5 Apr 2011 11:00:53 +0200 Subject: x86/amd-iommu: Flush device IOTLB if ATS is enabled This patch implements a function to flush the IOTLB on devices supporting ATS and makes sure that this TLB is also flushed if necessary. Signed-off-by: Joerg Roedel --- arch/x86/include/asm/amd_iommu_types.h | 3 +- arch/x86/kernel/amd_iommu.c | 74 +++++++++++++++++++++++++++++++++- 2 files changed, 75 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h index 878ae00..f5d184e7 100644 --- a/arch/x86/include/asm/amd_iommu_types.h +++ b/arch/x86/include/asm/amd_iommu_types.h @@ -113,7 +113,8 @@ /* command specific defines */ #define CMD_COMPL_WAIT 0x01 #define CMD_INV_DEV_ENTRY 0x02 -#define CMD_INV_IOMMU_PAGES 0x03 +#define CMD_INV_IOMMU_PAGES 0x03 +#define CMD_INV_IOTLB_PAGES 0x04 #define CMD_COMPL_WAIT_STORE_MASK 0x01 #define CMD_COMPL_WAIT_INT_MASK 0x02 diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index bcf58ea..f3ce433 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -18,6 +18,7 @@ */ #include +#include #include #include #include @@ -463,6 +464,37 @@ static void build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address, cmd->data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK; } +static void build_inv_iotlb_pages(struct iommu_cmd *cmd, u16 devid, int qdep, + u64 address, size_t size) +{ + u64 pages; + int s; + + pages = iommu_num_pages(address, size, PAGE_SIZE); + s = 0; + + if (pages > 1) { + /* + * If we have to flush more than one page, flush all + * TLB entries for this domain + */ + address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS; + s = 1; + } + + address &= PAGE_MASK; + + memset(cmd, 0, sizeof(*cmd)); + cmd->data[0] = devid; + cmd->data[0] |= (qdep & 0xff) << 24; + cmd->data[1] = devid; + cmd->data[2] = lower_32_bits(address); + cmd->data[3] = upper_32_bits(address); + CMD_SET_TYPE(cmd, CMD_INV_IOTLB_PAGES); + if (s) + cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK; +} + /* * Writes the command to the IOMMUs command buffer and informs the * hardware about the new command. @@ -574,17 +606,47 @@ void iommu_flush_all_caches(struct amd_iommu *iommu) } /* + * Command send function for flushing on-device TLB + */ +static int device_flush_iotlb(struct device *dev, u64 address, size_t size) +{ + struct pci_dev *pdev = to_pci_dev(dev); + struct amd_iommu *iommu; + struct iommu_cmd cmd; + u16 devid; + int qdep; + + qdep = pci_ats_queue_depth(pdev); + devid = get_device_id(dev); + iommu = amd_iommu_rlookup_table[devid]; + + build_inv_iotlb_pages(&cmd, devid, qdep, address, size); + + return iommu_queue_command(iommu, &cmd); +} + +/* * Command send function for invalidating a device table entry */ static int device_flush_dte(struct device *dev) { struct amd_iommu *iommu; + struct pci_dev *pdev; u16 devid; + int ret; + pdev = to_pci_dev(dev); devid = get_device_id(dev); iommu = amd_iommu_rlookup_table[devid]; - return iommu_flush_dte(iommu, devid); + ret = iommu_flush_dte(iommu, devid); + if (ret) + return ret; + + if (pci_ats_enabled(pdev)) + ret = device_flush_iotlb(dev, 0, ~0UL); + + return ret; } /* @@ -595,6 +657,7 @@ static int device_flush_dte(struct device *dev) static void __domain_flush_pages(struct protection_domain *domain, u64 address, size_t size, int pde) { + struct iommu_dev_data *dev_data; struct iommu_cmd cmd; int ret = 0, i; @@ -611,6 +674,15 @@ static void __domain_flush_pages(struct protection_domain *domain, ret |= iommu_queue_command(amd_iommus[i], &cmd); } + list_for_each_entry(dev_data, &domain->dev_list, list) { + struct pci_dev *pdev = to_pci_dev(dev_data->dev); + + if (!pci_ats_enabled(pdev)) + continue; + + ret |= device_flush_iotlb(dev_data->dev, address, size); + } + WARN_ON(ret); } -- cgit v1.1 From 60f723b4117507c05c8b0b5c8b98ecc12a76878e Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 5 Apr 2011 12:50:24 +0200 Subject: x86/amd-iommu: Add flag to indicate IOTLB support This patch adds a flag to the AMD IOMMU driver to indicate that all IOMMUs present in the system support device IOTLBs. Signed-off-by: Joerg Roedel --- arch/x86/include/asm/amd_iommu_types.h | 2 ++ arch/x86/kernel/amd_iommu_init.c | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h index f5d184e7..cb811c9 100644 --- a/arch/x86/include/asm/amd_iommu_types.h +++ b/arch/x86/include/asm/amd_iommu_types.h @@ -250,6 +250,8 @@ extern bool amd_iommu_dump; /* global flag if IOMMUs cache non-present entries */ extern bool amd_iommu_np_cache; +/* Only true if all IOMMUs support device IOTLBs */ +extern bool amd_iommu_iotlb_sup; /* * Make iterating over all IOMMUs easier diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 8848dda..b6c634f 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -137,6 +137,7 @@ int amd_iommus_present; /* IOMMUs have a non-present cache? */ bool amd_iommu_np_cache __read_mostly; +bool amd_iommu_iotlb_sup __read_mostly = true; /* * The ACPI table parsing functions set this variable on an error @@ -673,6 +674,9 @@ static void __init init_iommu_from_pci(struct amd_iommu *iommu) MMIO_GET_LD(range)); iommu->evt_msi_num = MMIO_MSI_NUM(misc); + if (!(iommu->cap & (1 << IOMMU_CAP_IOTLB))) + amd_iommu_iotlb_sup = false; + if (!is_rd890_iommu(iommu->dev)) return; -- cgit v1.1 From fd7b5535e10ce820f030842da3f289f80ec0d4f3 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 5 Apr 2011 15:31:08 +0200 Subject: x86/amd-iommu: Add ATS enable/disable code This patch adds the necessary code to the AMD IOMMU driver for enabling and disabling the ATS capability on a device and to setup the IOMMU data structures correctly. Signed-off-by: Joerg Roedel --- arch/x86/include/asm/amd_iommu_types.h | 2 ++ arch/x86/kernel/amd_iommu.c | 34 ++++++++++++++++++++++++++++------ 2 files changed, 30 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h index cb811c9..7434377 100644 --- a/arch/x86/include/asm/amd_iommu_types.h +++ b/arch/x86/include/asm/amd_iommu_types.h @@ -216,6 +216,8 @@ #define IOMMU_PTE_IR (1ULL << 61) #define IOMMU_PTE_IW (1ULL << 62) +#define DTE_FLAG_IOTLB 0x01 + #define IOMMU_PAGE_MASK (((1ULL << 52) - 1) & ~0xfffULL) #define IOMMU_PTE_PRESENT(pte) ((pte) & IOMMU_PTE_P) #define IOMMU_PTE_PAGE(pte) (phys_to_virt((pte) & IOMMU_PAGE_MASK)) diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index f3ce433..e4791f6 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -1452,17 +1452,22 @@ static bool dma_ops_domain(struct protection_domain *domain) return domain->flags & PD_DMA_OPS_MASK; } -static void set_dte_entry(u16 devid, struct protection_domain *domain) +static void set_dte_entry(u16 devid, struct protection_domain *domain, bool ats) { u64 pte_root = virt_to_phys(domain->pt_root); + u32 flags = 0; pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK) << DEV_ENTRY_MODE_SHIFT; pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | IOMMU_PTE_TV; - amd_iommu_dev_table[devid].data[2] = domain->id; - amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root); - amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root); + if (ats) + flags |= DTE_FLAG_IOTLB; + + amd_iommu_dev_table[devid].data[3] |= flags; + amd_iommu_dev_table[devid].data[2] = domain->id; + amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root); + amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root); } static void clear_dte_entry(u16 devid) @@ -1479,16 +1484,22 @@ static void do_attach(struct device *dev, struct protection_domain *domain) { struct iommu_dev_data *dev_data; struct amd_iommu *iommu; + struct pci_dev *pdev; + bool ats = false; u16 devid; devid = get_device_id(dev); iommu = amd_iommu_rlookup_table[devid]; dev_data = get_dev_data(dev); + pdev = to_pci_dev(dev); + + if (amd_iommu_iotlb_sup) + ats = pci_ats_enabled(pdev); /* Update data structures */ dev_data->domain = domain; list_add(&dev_data->list, &domain->dev_list); - set_dte_entry(devid, domain); + set_dte_entry(devid, domain, ats); /* Do reference counting */ domain->dev_iommu[iommu->index] += 1; @@ -1502,11 +1513,13 @@ static void do_detach(struct device *dev) { struct iommu_dev_data *dev_data; struct amd_iommu *iommu; + struct pci_dev *pdev; u16 devid; devid = get_device_id(dev); iommu = amd_iommu_rlookup_table[devid]; dev_data = get_dev_data(dev); + pdev = to_pci_dev(dev); /* decrease reference counters */ dev_data->domain->dev_iommu[iommu->index] -= 1; @@ -1581,9 +1594,13 @@ out_unlock: static int attach_device(struct device *dev, struct protection_domain *domain) { + struct pci_dev *pdev = to_pci_dev(dev); unsigned long flags; int ret; + if (amd_iommu_iotlb_sup) + pci_enable_ats(pdev, PAGE_SHIFT); + write_lock_irqsave(&amd_iommu_devtable_lock, flags); ret = __attach_device(dev, domain); write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); @@ -1640,12 +1657,16 @@ static void __detach_device(struct device *dev) */ static void detach_device(struct device *dev) { + struct pci_dev *pdev = to_pci_dev(dev); unsigned long flags; /* lock device table */ write_lock_irqsave(&amd_iommu_devtable_lock, flags); __detach_device(dev); write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); + + if (amd_iommu_iotlb_sup && pci_ats_enabled(pdev)) + pci_disable_ats(pdev); } /* @@ -1795,8 +1816,9 @@ static void update_device_table(struct protection_domain *domain) struct iommu_dev_data *dev_data; list_for_each_entry(dev_data, &domain->dev_list, list) { + struct pci_dev *pdev = to_pci_dev(dev_data->dev); u16 devid = get_device_id(dev_data->dev); - set_dte_entry(devid, domain); + set_dte_entry(devid, domain, pci_ats_enabled(pdev)); } } -- cgit v1.1 From f4ad9bd208c98f32a6f9136618e0b8bebe3fb370 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 8 Apr 2011 12:53:09 +0800 Subject: sched: Eliminate dead code from wakeup_gran() calc_delta_fair() checks NICE_0_LOAD already, delete duplicate check. Signed-off-by: Shaohua Li Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Link: http://lkml.kernel.org/r/1302238389.3981.92.camel@sli10-conroe Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 6fa833a..4ee50f0 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1789,10 +1789,7 @@ wakeup_gran(struct sched_entity *curr, struct sched_entity *se) * This is especially important for buddies when the leftmost * task is higher priority than the buddy. */ - if (unlikely(se->load.weight != NICE_0_LOAD)) - gran = calc_delta_fair(gran, se); - - return gran; + return calc_delta_fair(gran, se); } /* -- cgit v1.1 From c4a8849af939082052d8117f9ea3e170a99ff232 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 7 Apr 2011 14:09:42 +0200 Subject: sched: Remove obsolete arch_ prefixes Non weak static functions clearly are not arch specific, so remove the arch_ prefix. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20110407122941.820460566@chello.nl Signed-off-by: Ingo Molnar --- kernel/sched.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 4801363..d3e183c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -231,7 +231,7 @@ static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b) #endif /* - * sched_domains_mutex serializes calls to arch_init_sched_domains, + * sched_domains_mutex serializes calls to init_sched_domains, * detach_destroy_domains and partition_sched_domains. */ static DEFINE_MUTEX(sched_domains_mutex); @@ -7670,7 +7670,7 @@ void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms) * For now this just excludes isolated cpus, but could be used to * exclude other special cases in the future. */ -static int arch_init_sched_domains(const struct cpumask *cpu_map) +static int init_sched_domains(const struct cpumask *cpu_map) { int err; @@ -7687,7 +7687,7 @@ static int arch_init_sched_domains(const struct cpumask *cpu_map) return err; } -static void arch_destroy_sched_domains(const struct cpumask *cpu_map, +static void destroy_sched_domains(const struct cpumask *cpu_map, struct cpumask *tmpmask) { free_sched_groups(cpu_map, tmpmask); @@ -7706,7 +7706,7 @@ static void detach_destroy_domains(const struct cpumask *cpu_map) for_each_cpu(i, cpu_map) cpu_attach_domain(NULL, &def_root_domain, i); synchronize_sched(); - arch_destroy_sched_domains(cpu_map, to_cpumask(tmpmask)); + destroy_sched_domains(cpu_map, to_cpumask(tmpmask)); } /* handle null as "default" */ @@ -7815,7 +7815,7 @@ match2: } #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) -static void arch_reinit_sched_domains(void) +static void reinit_sched_domains(void) { get_online_cpus(); @@ -7848,7 +7848,7 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) else sched_mc_power_savings = level; - arch_reinit_sched_domains(); + reinit_sched_domains(); return count; } @@ -7974,7 +7974,7 @@ void __init sched_init_smp(void) #endif get_online_cpus(); mutex_lock(&sched_domains_mutex); - arch_init_sched_domains(cpu_active_mask); + init_sched_domains(cpu_active_mask); cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); if (cpumask_empty(non_isolated_cpus)) cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); -- cgit v1.1 From d274cb30f4a08045492d3f0c47cdf1a25668b1f5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 7 Apr 2011 14:09:43 +0200 Subject: sched: Simplify ->cpu_power initialization The code in update_group_power() does what init_sched_groups_power() does and more, so remove the special init_ code and call the generic code instead. Also move the sd->span_weight initialization because update_group_power() needs it. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20110407122941.875856012@chello.nl Signed-off-by: Ingo Molnar --- kernel/sched.c | 44 +++++--------------------------------------- 1 file changed, 5 insertions(+), 39 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index d3e183c..50d5fd3 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6679,9 +6679,6 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) struct rq *rq = cpu_rq(cpu); struct sched_domain *tmp; - for (tmp = sd; tmp; tmp = tmp->parent) - tmp->span_weight = cpumask_weight(sched_domain_span(tmp)); - /* Remove the sched domains which do not contribute to scheduling. */ for (tmp = sd; tmp; ) { struct sched_domain *parent = tmp->parent; @@ -7159,11 +7156,6 @@ static void free_sched_groups(const struct cpumask *cpu_map, */ static void init_sched_groups_power(int cpu, struct sched_domain *sd) { - struct sched_domain *child; - struct sched_group *group; - long power; - int weight; - WARN_ON(!sd || !sd->groups); if (cpu != group_first_cpu(sd->groups)) @@ -7171,36 +7163,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups)); - child = sd->child; - - sd->groups->cpu_power = 0; - - if (!child) { - power = SCHED_LOAD_SCALE; - weight = cpumask_weight(sched_domain_span(sd)); - /* - * SMT siblings share the power of a single core. - * Usually multiple threads get a better yield out of - * that one core than a single thread would have, - * reflect that in sd->smt_gain. - */ - if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { - power *= sd->smt_gain; - power /= weight; - power >>= SCHED_LOAD_SHIFT; - } - sd->groups->cpu_power += power; - return; - } - - /* - * Add cpu_power of each child group to this groups cpu_power. - */ - group = child->groups; - do { - sd->groups->cpu_power += group->cpu_power; - group = group->next; - } while (group != child->groups); + update_group_power(sd, cpu); } /* @@ -7507,7 +7470,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map, { enum s_alloc alloc_state = sa_none; struct s_data d; - struct sched_domain *sd; + struct sched_domain *sd, *tmp; int i; #ifdef CONFIG_NUMA d.sd_allnodes = 0; @@ -7530,6 +7493,9 @@ static int __build_sched_domains(const struct cpumask *cpu_map, sd = __build_book_sched_domain(&d, cpu_map, attr, sd, i); sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i); sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i); + + for (tmp = sd; tmp; tmp = tmp->parent) + tmp->span_weight = cpumask_weight(sched_domain_span(tmp)); } for_each_cpu(i, cpu_map) { -- cgit v1.1 From a06dadbec5c5df0bf3a35f33616f67d10ca9ba28 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 7 Apr 2011 14:09:44 +0200 Subject: sched: Simplify build_sched_groups() Notice that the mask being computed is the same as the domain span we just computed. By using the domain_span we can avoid some mask allocations and computations. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20110407122941.925028189@chello.nl Signed-off-by: Ingo Molnar --- kernel/sched.c | 52 ++++++++++++++++------------------------------------ 1 file changed, 16 insertions(+), 36 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 50d5fd3..e3818f1 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6866,9 +6866,6 @@ struct s_data { cpumask_var_t notcovered; #endif cpumask_var_t nodemask; - cpumask_var_t this_sibling_map; - cpumask_var_t this_core_map; - cpumask_var_t this_book_map; cpumask_var_t send_covered; cpumask_var_t tmpmask; struct sched_group **sched_group_nodes; @@ -6880,9 +6877,6 @@ enum s_alloc { sa_rootdomain, sa_tmpmask, sa_send_covered, - sa_this_book_map, - sa_this_core_map, - sa_this_sibling_map, sa_nodemask, sa_sched_group_nodes, #ifdef CONFIG_NUMA @@ -7251,12 +7245,6 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what, free_cpumask_var(d->tmpmask); /* fall through */ case sa_send_covered: free_cpumask_var(d->send_covered); /* fall through */ - case sa_this_book_map: - free_cpumask_var(d->this_book_map); /* fall through */ - case sa_this_core_map: - free_cpumask_var(d->this_core_map); /* fall through */ - case sa_this_sibling_map: - free_cpumask_var(d->this_sibling_map); /* fall through */ case sa_nodemask: free_cpumask_var(d->nodemask); /* fall through */ case sa_sched_group_nodes: @@ -7295,14 +7283,8 @@ static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, #endif if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL)) return sa_sched_group_nodes; - if (!alloc_cpumask_var(&d->this_sibling_map, GFP_KERNEL)) - return sa_nodemask; - if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL)) - return sa_this_sibling_map; - if (!alloc_cpumask_var(&d->this_book_map, GFP_KERNEL)) - return sa_this_core_map; if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL)) - return sa_this_book_map; + return sa_nodemask; if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL)) return sa_send_covered; d->rd = alloc_rootdomain(); @@ -7414,39 +7396,40 @@ static struct sched_domain *__build_smt_sched_domain(struct s_data *d, static void build_sched_groups(struct s_data *d, enum sched_domain_level l, const struct cpumask *cpu_map, int cpu) { + struct sched_domain *sd; + switch (l) { #ifdef CONFIG_SCHED_SMT case SD_LV_SIBLING: /* set up CPU (sibling) groups */ - cpumask_and(d->this_sibling_map, cpu_map, - topology_thread_cpumask(cpu)); - if (cpu == cpumask_first(d->this_sibling_map)) - init_sched_build_groups(d->this_sibling_map, cpu_map, + sd = &per_cpu(cpu_domains, cpu).sd; + if (cpu == cpumask_first(sched_domain_span(sd))) + init_sched_build_groups(sched_domain_span(sd), cpu_map, &cpu_to_cpu_group, d->send_covered, d->tmpmask); break; #endif #ifdef CONFIG_SCHED_MC case SD_LV_MC: /* set up multi-core groups */ - cpumask_and(d->this_core_map, cpu_map, cpu_coregroup_mask(cpu)); - if (cpu == cpumask_first(d->this_core_map)) - init_sched_build_groups(d->this_core_map, cpu_map, + sd = &per_cpu(core_domains, cpu).sd; + if (cpu == cpumask_first(sched_domain_span(sd))) + init_sched_build_groups(sched_domain_span(sd), cpu_map, &cpu_to_core_group, d->send_covered, d->tmpmask); break; #endif #ifdef CONFIG_SCHED_BOOK case SD_LV_BOOK: /* set up book groups */ - cpumask_and(d->this_book_map, cpu_map, cpu_book_mask(cpu)); - if (cpu == cpumask_first(d->this_book_map)) - init_sched_build_groups(d->this_book_map, cpu_map, + sd = &per_cpu(book_domains, cpu).sd; + if (cpu == cpumask_first(sched_domain_span(sd))) + init_sched_build_groups(sched_domain_span(sd), cpu_map, &cpu_to_book_group, d->send_covered, d->tmpmask); break; #endif case SD_LV_CPU: /* set up physical groups */ - cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map); - if (!cpumask_empty(d->nodemask)) - init_sched_build_groups(d->nodemask, cpu_map, + sd = &per_cpu(phys_domains, cpu).sd; + if (cpu == cpumask_first(sched_domain_span(sd))) + init_sched_build_groups(sched_domain_span(sd), cpu_map, &cpu_to_phys_group, d->send_covered, d->tmpmask); break; @@ -7502,11 +7485,8 @@ static int __build_sched_domains(const struct cpumask *cpu_map, build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i); build_sched_groups(&d, SD_LV_BOOK, cpu_map, i); build_sched_groups(&d, SD_LV_MC, cpu_map, i); - } - - /* Set up physical groups */ - for (i = 0; i < nr_node_ids; i++) build_sched_groups(&d, SD_LV_CPU, cpu_map, i); + } #ifdef CONFIG_NUMA /* Set up node groups */ -- cgit v1.1 From cd4ea6ae3982f6861da3b510e69cbc194f331d83 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 7 Apr 2011 14:09:45 +0200 Subject: sched: Change NODE sched_domain group creation The NODE sched_domain is 'special' in that it allocates sched_groups per CPU, instead of sharing the sched_groups between all CPUs. While this might have some benefits on large NUMA and avoid remote memory accesses when iterating the sched_groups, this does break current code that assumes sched_groups are shared between all sched_domains (since the dynamic cpu_power patches). So refactor the NODE groups to behave like all other groups. (The ALLNODES domain again shared its groups across the CPUs for some reason). If someone does measure a performance decrease due to this change we need to revisit this and come up with another way to have both dynamic cpu_power and NUMA work nice together. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20110407122941.978111700@chello.nl Signed-off-by: Ingo Molnar --- kernel/sched.c | 229 ++++++++------------------------------------------------- 1 file changed, 32 insertions(+), 197 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index e3818f1..72d561f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6861,29 +6861,18 @@ struct static_sched_domain { struct s_data { #ifdef CONFIG_NUMA int sd_allnodes; - cpumask_var_t domainspan; - cpumask_var_t covered; - cpumask_var_t notcovered; #endif cpumask_var_t nodemask; cpumask_var_t send_covered; cpumask_var_t tmpmask; - struct sched_group **sched_group_nodes; struct root_domain *rd; }; enum s_alloc { - sa_sched_groups = 0, sa_rootdomain, sa_tmpmask, sa_send_covered, sa_nodemask, - sa_sched_group_nodes, -#ifdef CONFIG_NUMA - sa_notcovered, - sa_covered, - sa_domainspan, -#endif sa_none, }; @@ -6979,18 +6968,10 @@ cpu_to_phys_group(int cpu, const struct cpumask *cpu_map, } #ifdef CONFIG_NUMA -/* - * The init_sched_build_groups can't handle what we want to do with node - * groups, so roll our own. Now each node has its own list of groups which - * gets dynamically allocated. - */ static DEFINE_PER_CPU(struct static_sched_domain, node_domains); -static struct sched_group ***sched_group_nodes_bycpu; - -static DEFINE_PER_CPU(struct static_sched_domain, allnodes_domains); -static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes); +static DEFINE_PER_CPU(struct static_sched_group, sched_group_node); -static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map, +static int cpu_to_node_group(int cpu, const struct cpumask *cpu_map, struct sched_group **sg, struct cpumask *nodemask) { @@ -7000,142 +6981,27 @@ static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map, group = cpumask_first(nodemask); if (sg) - *sg = &per_cpu(sched_group_allnodes, group).sg; + *sg = &per_cpu(sched_group_node, group).sg; return group; } -static void init_numa_sched_groups_power(struct sched_group *group_head) -{ - struct sched_group *sg = group_head; - int j; - - if (!sg) - return; - do { - for_each_cpu(j, sched_group_cpus(sg)) { - struct sched_domain *sd; - - sd = &per_cpu(phys_domains, j).sd; - if (j != group_first_cpu(sd->groups)) { - /* - * Only add "power" once for each - * physical package. - */ - continue; - } - - sg->cpu_power += sd->groups->cpu_power; - } - sg = sg->next; - } while (sg != group_head); -} +static DEFINE_PER_CPU(struct static_sched_domain, allnodes_domains); +static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes); -static int build_numa_sched_groups(struct s_data *d, - const struct cpumask *cpu_map, int num) +static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map, + struct sched_group **sg, + struct cpumask *nodemask) { - struct sched_domain *sd; - struct sched_group *sg, *prev; - int n, j; - - cpumask_clear(d->covered); - cpumask_and(d->nodemask, cpumask_of_node(num), cpu_map); - if (cpumask_empty(d->nodemask)) { - d->sched_group_nodes[num] = NULL; - goto out; - } - - sched_domain_node_span(num, d->domainspan); - cpumask_and(d->domainspan, d->domainspan, cpu_map); - - sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), - GFP_KERNEL, num); - if (!sg) { - printk(KERN_WARNING "Can not alloc domain group for node %d\n", - num); - return -ENOMEM; - } - d->sched_group_nodes[num] = sg; - - for_each_cpu(j, d->nodemask) { - sd = &per_cpu(node_domains, j).sd; - sd->groups = sg; - } + int group; - sg->cpu_power = 0; - cpumask_copy(sched_group_cpus(sg), d->nodemask); - sg->next = sg; - cpumask_or(d->covered, d->covered, d->nodemask); + cpumask_and(nodemask, cpumask_of_node(cpu_to_node(cpu)), cpu_map); + group = cpumask_first(nodemask); - prev = sg; - for (j = 0; j < nr_node_ids; j++) { - n = (num + j) % nr_node_ids; - cpumask_complement(d->notcovered, d->covered); - cpumask_and(d->tmpmask, d->notcovered, cpu_map); - cpumask_and(d->tmpmask, d->tmpmask, d->domainspan); - if (cpumask_empty(d->tmpmask)) - break; - cpumask_and(d->tmpmask, d->tmpmask, cpumask_of_node(n)); - if (cpumask_empty(d->tmpmask)) - continue; - sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), - GFP_KERNEL, num); - if (!sg) { - printk(KERN_WARNING - "Can not alloc domain group for node %d\n", j); - return -ENOMEM; - } - sg->cpu_power = 0; - cpumask_copy(sched_group_cpus(sg), d->tmpmask); - sg->next = prev->next; - cpumask_or(d->covered, d->covered, d->tmpmask); - prev->next = sg; - prev = sg; - } -out: - return 0; + if (sg) + *sg = &per_cpu(sched_group_allnodes, group).sg; + return group; } -#endif /* CONFIG_NUMA */ - -#ifdef CONFIG_NUMA -/* Free memory allocated for various sched_group structures */ -static void free_sched_groups(const struct cpumask *cpu_map, - struct cpumask *nodemask) -{ - int cpu, i; - for_each_cpu(cpu, cpu_map) { - struct sched_group **sched_group_nodes - = sched_group_nodes_bycpu[cpu]; - - if (!sched_group_nodes) - continue; - - for (i = 0; i < nr_node_ids; i++) { - struct sched_group *oldsg, *sg = sched_group_nodes[i]; - - cpumask_and(nodemask, cpumask_of_node(i), cpu_map); - if (cpumask_empty(nodemask)) - continue; - - if (sg == NULL) - continue; - sg = sg->next; -next_sg: - oldsg = sg; - sg = sg->next; - kfree(oldsg); - if (oldsg != sched_group_nodes[i]) - goto next_sg; - } - kfree(sched_group_nodes); - sched_group_nodes_bycpu[cpu] = NULL; - } -} -#else /* !CONFIG_NUMA */ -static void free_sched_groups(const struct cpumask *cpu_map, - struct cpumask *nodemask) -{ -} #endif /* CONFIG_NUMA */ /* @@ -7236,9 +7102,6 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what, const struct cpumask *cpu_map) { switch (what) { - case sa_sched_groups: - free_sched_groups(cpu_map, d->tmpmask); /* fall through */ - d->sched_group_nodes = NULL; case sa_rootdomain: free_rootdomain(d->rd); /* fall through */ case sa_tmpmask: @@ -7247,16 +7110,6 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what, free_cpumask_var(d->send_covered); /* fall through */ case sa_nodemask: free_cpumask_var(d->nodemask); /* fall through */ - case sa_sched_group_nodes: -#ifdef CONFIG_NUMA - kfree(d->sched_group_nodes); /* fall through */ - case sa_notcovered: - free_cpumask_var(d->notcovered); /* fall through */ - case sa_covered: - free_cpumask_var(d->covered); /* fall through */ - case sa_domainspan: - free_cpumask_var(d->domainspan); /* fall through */ -#endif case sa_none: break; } @@ -7265,24 +7118,8 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what, static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, const struct cpumask *cpu_map) { -#ifdef CONFIG_NUMA - if (!alloc_cpumask_var(&d->domainspan, GFP_KERNEL)) - return sa_none; - if (!alloc_cpumask_var(&d->covered, GFP_KERNEL)) - return sa_domainspan; - if (!alloc_cpumask_var(&d->notcovered, GFP_KERNEL)) - return sa_covered; - /* Allocate the per-node list of sched groups */ - d->sched_group_nodes = kcalloc(nr_node_ids, - sizeof(struct sched_group *), GFP_KERNEL); - if (!d->sched_group_nodes) { - printk(KERN_WARNING "Can not alloc sched group node list\n"); - return sa_notcovered; - } - sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes; -#endif if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL)) - return sa_sched_group_nodes; + return sa_none; if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL)) return sa_nodemask; if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL)) @@ -7322,6 +7159,7 @@ static struct sched_domain *__build_numa_sched_domains(struct s_data *d, if (parent) parent->child = sd; cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map); + cpu_to_node_group(i, cpu_map, &sd->groups, d->tmpmask); #endif return sd; } @@ -7434,6 +7272,13 @@ static void build_sched_groups(struct s_data *d, enum sched_domain_level l, d->send_covered, d->tmpmask); break; #ifdef CONFIG_NUMA + case SD_LV_NODE: + sd = &per_cpu(node_domains, cpu).sd; + if (cpu == cpumask_first(sched_domain_span(sd))) + init_sched_build_groups(sched_domain_span(sd), cpu_map, + &cpu_to_node_group, + d->send_covered, d->tmpmask); + case SD_LV_ALLNODES: init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group, d->send_covered, d->tmpmask); @@ -7462,7 +7307,6 @@ static int __build_sched_domains(const struct cpumask *cpu_map, alloc_state = __visit_domain_allocation_hell(&d, cpu_map); if (alloc_state != sa_rootdomain) goto error; - alloc_state = sa_sched_groups; /* * Set up domains for cpus specified by the cpu_map. @@ -7486,16 +7330,13 @@ static int __build_sched_domains(const struct cpumask *cpu_map, build_sched_groups(&d, SD_LV_BOOK, cpu_map, i); build_sched_groups(&d, SD_LV_MC, cpu_map, i); build_sched_groups(&d, SD_LV_CPU, cpu_map, i); + build_sched_groups(&d, SD_LV_NODE, cpu_map, i); } #ifdef CONFIG_NUMA /* Set up node groups */ if (d.sd_allnodes) build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0); - - for (i = 0; i < nr_node_ids; i++) - if (build_numa_sched_groups(&d, cpu_map, i)) - goto error; #endif /* Calculate CPU power for physical packages and nodes */ @@ -7524,15 +7365,16 @@ static int __build_sched_domains(const struct cpumask *cpu_map, } #ifdef CONFIG_NUMA - for (i = 0; i < nr_node_ids; i++) - init_numa_sched_groups_power(d.sched_group_nodes[i]); + for_each_cpu(i, cpu_map) { + sd = &per_cpu(node_domains, i).sd; + init_sched_groups_power(i, sd); + } if (d.sd_allnodes) { - struct sched_group *sg; - - cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg, - d.tmpmask); - init_numa_sched_groups_power(sg); + for_each_cpu(i, cpu_map) { + sd = &per_cpu(allnodes_domains, i).sd; + init_sched_groups_power(i, sd); + } } #endif @@ -7550,7 +7392,6 @@ static int __build_sched_domains(const struct cpumask *cpu_map, cpu_attach_domain(sd, d.rd, i); } - d.sched_group_nodes = NULL; /* don't free this we still need it */ __free_domain_allocs(&d, sa_tmpmask, cpu_map); return 0; @@ -7636,7 +7477,6 @@ static int init_sched_domains(const struct cpumask *cpu_map) static void destroy_sched_domains(const struct cpumask *cpu_map, struct cpumask *tmpmask) { - free_sched_groups(cpu_map, tmpmask); } /* @@ -7913,11 +7753,6 @@ void __init sched_init_smp(void) alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); alloc_cpumask_var(&fallback_doms, GFP_KERNEL); -#if defined(CONFIG_NUMA) - sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **), - GFP_KERNEL); - BUG_ON(sched_group_nodes_bycpu == NULL); -#endif get_online_cpus(); mutex_lock(&sched_domains_mutex); init_sched_domains(cpu_active_mask); -- cgit v1.1 From 3739494e08da50c8a68d65eed5ba3012a54b40d4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 7 Apr 2011 14:09:46 +0200 Subject: sched: Clean up some ALLNODES code Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20110407122942.025636011@chello.nl Signed-off-by: Ingo Molnar --- kernel/sched.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 72d561f..fa10cf7 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7280,7 +7280,9 @@ static void build_sched_groups(struct s_data *d, enum sched_domain_level l, d->send_covered, d->tmpmask); case SD_LV_ALLNODES: - init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group, + if (cpu == cpumask_first(cpu_map)) + init_sched_build_groups(cpu_map, cpu_map, + &cpu_to_allnodes_group, d->send_covered, d->tmpmask); break; #endif @@ -7331,14 +7333,9 @@ static int __build_sched_domains(const struct cpumask *cpu_map, build_sched_groups(&d, SD_LV_MC, cpu_map, i); build_sched_groups(&d, SD_LV_CPU, cpu_map, i); build_sched_groups(&d, SD_LV_NODE, cpu_map, i); + build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, i); } -#ifdef CONFIG_NUMA - /* Set up node groups */ - if (d.sd_allnodes) - build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0); -#endif - /* Calculate CPU power for physical packages and nodes */ #ifdef CONFIG_SCHED_SMT for_each_cpu(i, cpu_map) { -- cgit v1.1 From 1cf51902546d60b8a7a6aba2dd557bd4ba8840ea Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 7 Apr 2011 14:09:47 +0200 Subject: sched: Simplify sched_group creation Instead of calling build_sched_groups() for each possible sched_domain we might have created, note that we can simply iterate the sched_domain tree and call it for each sched_domain present. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20110407122942.077862519@chello.nl Signed-off-by: Ingo Molnar --- kernel/sched.c | 24 +++++------------------- 1 file changed, 5 insertions(+), 19 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index fa10cf7..e66d24a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7231,15 +7231,12 @@ static struct sched_domain *__build_smt_sched_domain(struct s_data *d, return sd; } -static void build_sched_groups(struct s_data *d, enum sched_domain_level l, +static void build_sched_groups(struct s_data *d, struct sched_domain *sd, const struct cpumask *cpu_map, int cpu) { - struct sched_domain *sd; - - switch (l) { + switch (sd->level) { #ifdef CONFIG_SCHED_SMT case SD_LV_SIBLING: /* set up CPU (sibling) groups */ - sd = &per_cpu(cpu_domains, cpu).sd; if (cpu == cpumask_first(sched_domain_span(sd))) init_sched_build_groups(sched_domain_span(sd), cpu_map, &cpu_to_cpu_group, @@ -7248,7 +7245,6 @@ static void build_sched_groups(struct s_data *d, enum sched_domain_level l, #endif #ifdef CONFIG_SCHED_MC case SD_LV_MC: /* set up multi-core groups */ - sd = &per_cpu(core_domains, cpu).sd; if (cpu == cpumask_first(sched_domain_span(sd))) init_sched_build_groups(sched_domain_span(sd), cpu_map, &cpu_to_core_group, @@ -7257,7 +7253,6 @@ static void build_sched_groups(struct s_data *d, enum sched_domain_level l, #endif #ifdef CONFIG_SCHED_BOOK case SD_LV_BOOK: /* set up book groups */ - sd = &per_cpu(book_domains, cpu).sd; if (cpu == cpumask_first(sched_domain_span(sd))) init_sched_build_groups(sched_domain_span(sd), cpu_map, &cpu_to_book_group, @@ -7265,7 +7260,6 @@ static void build_sched_groups(struct s_data *d, enum sched_domain_level l, break; #endif case SD_LV_CPU: /* set up physical groups */ - sd = &per_cpu(phys_domains, cpu).sd; if (cpu == cpumask_first(sched_domain_span(sd))) init_sched_build_groups(sched_domain_span(sd), cpu_map, &cpu_to_phys_group, @@ -7273,7 +7267,6 @@ static void build_sched_groups(struct s_data *d, enum sched_domain_level l, break; #ifdef CONFIG_NUMA case SD_LV_NODE: - sd = &per_cpu(node_domains, cpu).sd; if (cpu == cpumask_first(sched_domain_span(sd))) init_sched_build_groups(sched_domain_span(sd), cpu_map, &cpu_to_node_group, @@ -7323,17 +7316,10 @@ static int __build_sched_domains(const struct cpumask *cpu_map, sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i); sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i); - for (tmp = sd; tmp; tmp = tmp->parent) + for (tmp = sd; tmp; tmp = tmp->parent) { tmp->span_weight = cpumask_weight(sched_domain_span(tmp)); - } - - for_each_cpu(i, cpu_map) { - build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i); - build_sched_groups(&d, SD_LV_BOOK, cpu_map, i); - build_sched_groups(&d, SD_LV_MC, cpu_map, i); - build_sched_groups(&d, SD_LV_CPU, cpu_map, i); - build_sched_groups(&d, SD_LV_NODE, cpu_map, i); - build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, i); + build_sched_groups(&d, tmp, cpu_map, i); + } } /* Calculate CPU power for physical packages and nodes */ -- cgit v1.1 From 21d42ccfd6c6c11f96c2acfd32a85cfc33514d3a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 7 Apr 2011 14:09:48 +0200 Subject: sched: Simplify finding the lowest sched_domain Instead of relying on knowing the build order and various CONFIG_ flags simply remember the bottom most sched_domain when we created the domain hierarchy. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20110407122942.134511046@chello.nl Signed-off-by: Ingo Molnar --- kernel/sched.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index e66d24a..d6992bf 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6865,11 +6865,13 @@ struct s_data { cpumask_var_t nodemask; cpumask_var_t send_covered; cpumask_var_t tmpmask; + struct sched_domain ** __percpu sd; struct root_domain *rd; }; enum s_alloc { sa_rootdomain, + sa_sd, sa_tmpmask, sa_send_covered, sa_nodemask, @@ -7104,6 +7106,8 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what, switch (what) { case sa_rootdomain: free_rootdomain(d->rd); /* fall through */ + case sa_sd: + free_percpu(d->sd); /* fall through */ case sa_tmpmask: free_cpumask_var(d->tmpmask); /* fall through */ case sa_send_covered: @@ -7124,10 +7128,15 @@ static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, return sa_nodemask; if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL)) return sa_send_covered; + d->sd = alloc_percpu(struct sched_domain *); + if (!d->sd) { + printk(KERN_WARNING "Cannot alloc per-cpu pointers\n"); + return sa_tmpmask; + } d->rd = alloc_rootdomain(); if (!d->rd) { printk(KERN_WARNING "Cannot alloc root domain\n"); - return sa_tmpmask; + return sa_sd; } return sa_rootdomain; } @@ -7316,6 +7325,8 @@ static int __build_sched_domains(const struct cpumask *cpu_map, sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i); sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i); + *per_cpu_ptr(d.sd, i) = sd; + for (tmp = sd; tmp; tmp = tmp->parent) { tmp->span_weight = cpumask_weight(sched_domain_span(tmp)); build_sched_groups(&d, tmp, cpu_map, i); @@ -7363,15 +7374,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map, /* Attach the domains */ for_each_cpu(i, cpu_map) { -#ifdef CONFIG_SCHED_SMT - sd = &per_cpu(cpu_domains, i).sd; -#elif defined(CONFIG_SCHED_MC) - sd = &per_cpu(core_domains, i).sd; -#elif defined(CONFIG_SCHED_BOOK) - sd = &per_cpu(book_domains, i).sd; -#else - sd = &per_cpu(phys_domains, i).sd; -#endif + sd = *per_cpu_ptr(d.sd, i); cpu_attach_domain(sd, d.rd, i); } -- cgit v1.1 From a9c9a9b6bff27ac9c746344a9c1a19bf3327002c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 7 Apr 2011 14:09:49 +0200 Subject: sched: Simplify sched_groups_power initialization Again, instead of relying on knowing the possible domains and their order, simply rely on the sched_domain tree and whatever domains are present in there to initialize the sched_group cpu_power. Note: we need to iterate the CPU mask backwards because of the cpumask_first() condition for iterating up the tree. By iterating the mask backwards we ensure all groups of a domain are set-up before starting on the parent groups that rely on its children to be completely done. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20110407122942.187335414@chello.nl Signed-off-by: Ingo Molnar --- kernel/sched.c | 39 +++++---------------------------------- 1 file changed, 5 insertions(+), 34 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index d6992bf..1cca59e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7334,43 +7334,14 @@ static int __build_sched_domains(const struct cpumask *cpu_map, } /* Calculate CPU power for physical packages and nodes */ -#ifdef CONFIG_SCHED_SMT - for_each_cpu(i, cpu_map) { - sd = &per_cpu(cpu_domains, i).sd; - init_sched_groups_power(i, sd); - } -#endif -#ifdef CONFIG_SCHED_MC - for_each_cpu(i, cpu_map) { - sd = &per_cpu(core_domains, i).sd; - init_sched_groups_power(i, sd); - } -#endif -#ifdef CONFIG_SCHED_BOOK - for_each_cpu(i, cpu_map) { - sd = &per_cpu(book_domains, i).sd; - init_sched_groups_power(i, sd); - } -#endif - - for_each_cpu(i, cpu_map) { - sd = &per_cpu(phys_domains, i).sd; - init_sched_groups_power(i, sd); - } - -#ifdef CONFIG_NUMA - for_each_cpu(i, cpu_map) { - sd = &per_cpu(node_domains, i).sd; - init_sched_groups_power(i, sd); - } + for (i = nr_cpumask_bits-1; i >= 0; i--) { + if (!cpumask_test_cpu(i, cpu_map)) + continue; - if (d.sd_allnodes) { - for_each_cpu(i, cpu_map) { - sd = &per_cpu(allnodes_domains, i).sd; + sd = *per_cpu_ptr(d.sd, i); + for (; sd; sd = sd->parent) init_sched_groups_power(i, sd); - } } -#endif /* Attach the domains */ for_each_cpu(i, cpu_map) { -- cgit v1.1 From dce840a08702bd13a9a186e07e63d1ef82256b5e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 7 Apr 2011 14:09:50 +0200 Subject: sched: Dynamically allocate sched_domain/sched_group data-structures Instead of relying on static allocations for the sched_domain and sched_group trees, dynamically allocate and RCU free them. Allocating this dynamically also allows for some build_sched_groups() simplification since we can now (like with other simplifications) rely on the sched_domain tree instead of hard-coded knowledge. One tricky to note is that detach_destroy_domains() needs to hold rcu_read_lock() over the entire tear-down, per-cpu is not sufficient since that can lead to partial sched_group existance (could possibly be solved by doing the tear-down backwards but this is much more robust). A concequence of the above is that we can no longer print the sched_domain debug stuff from cpu_attach_domain() since that might now run with preemption disabled (due to classic RCU etc.) and sched_domain_debug() does some GFP_KERNEL allocations. Another thing to note is that we now fully rely on normal RCU and not RCU-sched, this is because with the new and exiting RCU flavours we grew over the years BH doesn't necessarily hold off RCU-sched grace periods (-rt is known to break this). This would in fact already cause us grief since we do sched_domain/sched_group iterations from softirq context. This patch is somewhat larger than I would like it to be, but I didn't find any means of shrinking/splitting this. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20110407122942.245307941@chello.nl Signed-off-by: Ingo Molnar --- include/linux/sched.h | 5 + kernel/sched.c | 479 ++++++++++++++++++++------------------------------ kernel/sched_fair.c | 30 +++- 3 files changed, 218 insertions(+), 296 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 4ec2c02..020b79d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -868,6 +868,7 @@ static inline int sd_power_saving_flags(void) struct sched_group { struct sched_group *next; /* Must be a circular list */ + atomic_t ref; /* * CPU power of this group, SCHED_LOAD_SCALE being max power for a @@ -973,6 +974,10 @@ struct sched_domain { #ifdef CONFIG_SCHED_DEBUG char *name; #endif + union { + void *private; /* used during construction */ + struct rcu_head rcu; /* used during destruction */ + }; unsigned int span_weight; /* diff --git a/kernel/sched.c b/kernel/sched.c index 1cca59e..6520484 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -417,6 +417,7 @@ struct rt_rq { */ struct root_domain { atomic_t refcount; + struct rcu_head rcu; cpumask_var_t span; cpumask_var_t online; @@ -571,7 +572,7 @@ static inline int cpu_of(struct rq *rq) #define rcu_dereference_check_sched_domain(p) \ rcu_dereference_check((p), \ - rcu_read_lock_sched_held() || \ + rcu_read_lock_held() || \ lockdep_is_held(&sched_domains_mutex)) /* @@ -6572,12 +6573,11 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) return 1; } -static void free_rootdomain(struct root_domain *rd) +static void free_rootdomain(struct rcu_head *rcu) { - synchronize_sched(); + struct root_domain *rd = container_of(rcu, struct root_domain, rcu); cpupri_cleanup(&rd->cpupri); - free_cpumask_var(rd->rto_mask); free_cpumask_var(rd->online); free_cpumask_var(rd->span); @@ -6618,7 +6618,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) raw_spin_unlock_irqrestore(&rq->lock, flags); if (old_rd) - free_rootdomain(old_rd); + call_rcu_sched(&old_rd->rcu, free_rootdomain); } static int init_rootdomain(struct root_domain *rd) @@ -6669,6 +6669,25 @@ static struct root_domain *alloc_rootdomain(void) return rd; } +static void free_sched_domain(struct rcu_head *rcu) +{ + struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu); + if (atomic_dec_and_test(&sd->groups->ref)) + kfree(sd->groups); + kfree(sd); +} + +static void destroy_sched_domain(struct sched_domain *sd, int cpu) +{ + call_rcu(&sd->rcu, free_sched_domain); +} + +static void destroy_sched_domains(struct sched_domain *sd, int cpu) +{ + for (; sd; sd = sd->parent) + destroy_sched_domain(sd, cpu); +} + /* * Attach the domain 'sd' to 'cpu' as its base domain. Callers must * hold the hotplug lock. @@ -6689,20 +6708,25 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) tmp->parent = parent->parent; if (parent->parent) parent->parent->child = tmp; + destroy_sched_domain(parent, cpu); } else tmp = tmp->parent; } if (sd && sd_degenerate(sd)) { + tmp = sd; sd = sd->parent; + destroy_sched_domain(tmp, cpu); if (sd) sd->child = NULL; } - sched_domain_debug(sd, cpu); + /* sched_domain_debug(sd, cpu); */ rq_attach_root(rq, rd); + tmp = rq->sd; rcu_assign_pointer(rq->sd, sd); + destroy_sched_domains(tmp, cpu); } /* cpus with isolated domains */ @@ -6718,56 +6742,6 @@ static int __init isolated_cpu_setup(char *str) __setup("isolcpus=", isolated_cpu_setup); -/* - * init_sched_build_groups takes the cpumask we wish to span, and a pointer - * to a function which identifies what group(along with sched group) a CPU - * belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids - * (due to the fact that we keep track of groups covered with a struct cpumask). - * - * init_sched_build_groups will build a circular linked list of the groups - * covered by the given span, and will set each group's ->cpumask correctly, - * and ->cpu_power to 0. - */ -static void -init_sched_build_groups(const struct cpumask *span, - const struct cpumask *cpu_map, - int (*group_fn)(int cpu, const struct cpumask *cpu_map, - struct sched_group **sg, - struct cpumask *tmpmask), - struct cpumask *covered, struct cpumask *tmpmask) -{ - struct sched_group *first = NULL, *last = NULL; - int i; - - cpumask_clear(covered); - - for_each_cpu(i, span) { - struct sched_group *sg; - int group = group_fn(i, cpu_map, &sg, tmpmask); - int j; - - if (cpumask_test_cpu(i, covered)) - continue; - - cpumask_clear(sched_group_cpus(sg)); - sg->cpu_power = 0; - - for_each_cpu(j, span) { - if (group_fn(j, cpu_map, NULL, tmpmask) != group) - continue; - - cpumask_set_cpu(j, covered); - cpumask_set_cpu(j, sched_group_cpus(sg)); - } - if (!first) - first = sg; - if (last) - last->next = sg; - last = sg; - } - last->next = first; -} - #define SD_NODES_PER_DOMAIN 16 #ifdef CONFIG_NUMA @@ -6858,154 +6832,96 @@ struct static_sched_domain { DECLARE_BITMAP(span, CONFIG_NR_CPUS); }; +struct sd_data { + struct sched_domain **__percpu sd; + struct sched_group **__percpu sg; +}; + struct s_data { #ifdef CONFIG_NUMA int sd_allnodes; #endif cpumask_var_t nodemask; cpumask_var_t send_covered; - cpumask_var_t tmpmask; struct sched_domain ** __percpu sd; + struct sd_data sdd[SD_LV_MAX]; struct root_domain *rd; }; enum s_alloc { sa_rootdomain, sa_sd, - sa_tmpmask, + sa_sd_storage, sa_send_covered, sa_nodemask, sa_none, }; /* - * SMT sched-domains: + * Assumes the sched_domain tree is fully constructed */ -#ifdef CONFIG_SCHED_SMT -static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains); -static DEFINE_PER_CPU(struct static_sched_group, sched_groups); - -static int -cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map, - struct sched_group **sg, struct cpumask *unused) +static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) { - if (sg) - *sg = &per_cpu(sched_groups, cpu).sg; - return cpu; -} -#endif /* CONFIG_SCHED_SMT */ + struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); + struct sched_domain *child = sd->child; -/* - * multi-core sched-domains: - */ -#ifdef CONFIG_SCHED_MC -static DEFINE_PER_CPU(struct static_sched_domain, core_domains); -static DEFINE_PER_CPU(struct static_sched_group, sched_group_core); + if (child) + cpu = cpumask_first(sched_domain_span(child)); -static int -cpu_to_core_group(int cpu, const struct cpumask *cpu_map, - struct sched_group **sg, struct cpumask *mask) -{ - int group; -#ifdef CONFIG_SCHED_SMT - cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); - group = cpumask_first(mask); -#else - group = cpu; -#endif if (sg) - *sg = &per_cpu(sched_group_core, group).sg; - return group; + *sg = *per_cpu_ptr(sdd->sg, cpu); + + return cpu; } -#endif /* CONFIG_SCHED_MC */ /* - * book sched-domains: + * build_sched_groups takes the cpumask we wish to span, and a pointer + * to a function which identifies what group(along with sched group) a CPU + * belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids + * (due to the fact that we keep track of groups covered with a struct cpumask). + * + * build_sched_groups will build a circular linked list of the groups + * covered by the given span, and will set each group's ->cpumask correctly, + * and ->cpu_power to 0. */ -#ifdef CONFIG_SCHED_BOOK -static DEFINE_PER_CPU(struct static_sched_domain, book_domains); -static DEFINE_PER_CPU(struct static_sched_group, sched_group_book); - -static int -cpu_to_book_group(int cpu, const struct cpumask *cpu_map, - struct sched_group **sg, struct cpumask *mask) -{ - int group = cpu; -#ifdef CONFIG_SCHED_MC - cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map); - group = cpumask_first(mask); -#elif defined(CONFIG_SCHED_SMT) - cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); - group = cpumask_first(mask); -#endif - if (sg) - *sg = &per_cpu(sched_group_book, group).sg; - return group; -} -#endif /* CONFIG_SCHED_BOOK */ - -static DEFINE_PER_CPU(struct static_sched_domain, phys_domains); -static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys); - -static int -cpu_to_phys_group(int cpu, const struct cpumask *cpu_map, - struct sched_group **sg, struct cpumask *mask) +static void +build_sched_groups(struct sched_domain *sd, struct cpumask *covered) { - int group; -#ifdef CONFIG_SCHED_BOOK - cpumask_and(mask, cpu_book_mask(cpu), cpu_map); - group = cpumask_first(mask); -#elif defined(CONFIG_SCHED_MC) - cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map); - group = cpumask_first(mask); -#elif defined(CONFIG_SCHED_SMT) - cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); - group = cpumask_first(mask); -#else - group = cpu; -#endif - if (sg) - *sg = &per_cpu(sched_group_phys, group).sg; - return group; -} - -#ifdef CONFIG_NUMA -static DEFINE_PER_CPU(struct static_sched_domain, node_domains); -static DEFINE_PER_CPU(struct static_sched_group, sched_group_node); + struct sched_group *first = NULL, *last = NULL; + struct sd_data *sdd = sd->private; + const struct cpumask *span = sched_domain_span(sd); + int i; -static int cpu_to_node_group(int cpu, const struct cpumask *cpu_map, - struct sched_group **sg, - struct cpumask *nodemask) -{ - int group; + cpumask_clear(covered); - cpumask_and(nodemask, cpumask_of_node(cpu_to_node(cpu)), cpu_map); - group = cpumask_first(nodemask); + for_each_cpu(i, span) { + struct sched_group *sg; + int group = get_group(i, sdd, &sg); + int j; - if (sg) - *sg = &per_cpu(sched_group_node, group).sg; - return group; -} + if (cpumask_test_cpu(i, covered)) + continue; -static DEFINE_PER_CPU(struct static_sched_domain, allnodes_domains); -static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes); + cpumask_clear(sched_group_cpus(sg)); + sg->cpu_power = 0; -static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map, - struct sched_group **sg, - struct cpumask *nodemask) -{ - int group; + for_each_cpu(j, span) { + if (get_group(j, sdd, NULL) != group) + continue; - cpumask_and(nodemask, cpumask_of_node(cpu_to_node(cpu)), cpu_map); - group = cpumask_first(nodemask); + cpumask_set_cpu(j, covered); + cpumask_set_cpu(j, sched_group_cpus(sg)); + } - if (sg) - *sg = &per_cpu(sched_group_allnodes, group).sg; - return group; + if (!first) + first = sg; + if (last) + last->next = sg; + last = sg; + } + last->next = first; } -#endif /* CONFIG_NUMA */ - /* * Initialize sched groups cpu_power. * @@ -7039,15 +6955,15 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) # define SD_INIT_NAME(sd, type) do { } while (0) #endif -#define SD_INIT(sd, type) sd_init_##type(sd) - -#define SD_INIT_FUNC(type) \ -static noinline void sd_init_##type(struct sched_domain *sd) \ -{ \ - memset(sd, 0, sizeof(*sd)); \ - *sd = SD_##type##_INIT; \ - sd->level = SD_LV_##type; \ - SD_INIT_NAME(sd, type); \ +#define SD_INIT_FUNC(type) \ +static noinline struct sched_domain *sd_init_##type(struct s_data *d, int cpu) \ +{ \ + struct sched_domain *sd = *per_cpu_ptr(d->sdd[SD_LV_##type].sd, cpu); \ + *sd = SD_##type##_INIT; \ + sd->level = SD_LV_##type; \ + SD_INIT_NAME(sd, type); \ + sd->private = &d->sdd[SD_LV_##type]; \ + return sd; \ } SD_INIT_FUNC(CPU) @@ -7103,13 +7019,22 @@ static void set_domain_attribute(struct sched_domain *sd, static void __free_domain_allocs(struct s_data *d, enum s_alloc what, const struct cpumask *cpu_map) { + int i, j; + switch (what) { case sa_rootdomain: - free_rootdomain(d->rd); /* fall through */ + free_rootdomain(&d->rd->rcu); /* fall through */ case sa_sd: free_percpu(d->sd); /* fall through */ - case sa_tmpmask: - free_cpumask_var(d->tmpmask); /* fall through */ + case sa_sd_storage: + for (i = 0; i < SD_LV_MAX; i++) { + for_each_cpu(j, cpu_map) { + kfree(*per_cpu_ptr(d->sdd[i].sd, j)); + kfree(*per_cpu_ptr(d->sdd[i].sg, j)); + } + free_percpu(d->sdd[i].sd); + free_percpu(d->sdd[i].sg); + } /* fall through */ case sa_send_covered: free_cpumask_var(d->send_covered); /* fall through */ case sa_nodemask: @@ -7122,25 +7047,70 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what, static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, const struct cpumask *cpu_map) { + int i, j; + + memset(d, 0, sizeof(*d)); + if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL)) return sa_none; if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL)) return sa_nodemask; - if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL)) - return sa_send_covered; - d->sd = alloc_percpu(struct sched_domain *); - if (!d->sd) { - printk(KERN_WARNING "Cannot alloc per-cpu pointers\n"); - return sa_tmpmask; + for (i = 0; i < SD_LV_MAX; i++) { + d->sdd[i].sd = alloc_percpu(struct sched_domain *); + if (!d->sdd[i].sd) + return sa_sd_storage; + + d->sdd[i].sg = alloc_percpu(struct sched_group *); + if (!d->sdd[i].sg) + return sa_sd_storage; + + for_each_cpu(j, cpu_map) { + struct sched_domain *sd; + struct sched_group *sg; + + sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(), + GFP_KERNEL, cpu_to_node(j)); + if (!sd) + return sa_sd_storage; + + *per_cpu_ptr(d->sdd[i].sd, j) = sd; + + sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), + GFP_KERNEL, cpu_to_node(j)); + if (!sg) + return sa_sd_storage; + + *per_cpu_ptr(d->sdd[i].sg, j) = sg; + } } + d->sd = alloc_percpu(struct sched_domain *); + if (!d->sd) + return sa_sd_storage; d->rd = alloc_rootdomain(); - if (!d->rd) { - printk(KERN_WARNING "Cannot alloc root domain\n"); + if (!d->rd) return sa_sd; - } return sa_rootdomain; } +/* + * NULL the sd_data elements we've used to build the sched_domain and + * sched_group structure so that the subsequent __free_domain_allocs() + * will not free the data we're using. + */ +static void claim_allocations(int cpu, struct sched_domain *sd) +{ + struct sd_data *sdd = sd->private; + struct sched_group *sg = sd->groups; + + WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd); + *per_cpu_ptr(sdd->sd, cpu) = NULL; + + if (cpu == cpumask_first(sched_group_cpus(sg))) { + WARN_ON_ONCE(*per_cpu_ptr(sdd->sg, cpu) != sg); + *per_cpu_ptr(sdd->sg, cpu) = NULL; + } +} + static struct sched_domain *__build_numa_sched_domains(struct s_data *d, const struct cpumask *cpu_map, struct sched_domain_attr *attr, int i) { @@ -7151,24 +7121,20 @@ static struct sched_domain *__build_numa_sched_domains(struct s_data *d, d->sd_allnodes = 0; if (cpumask_weight(cpu_map) > SD_NODES_PER_DOMAIN * cpumask_weight(d->nodemask)) { - sd = &per_cpu(allnodes_domains, i).sd; - SD_INIT(sd, ALLNODES); + sd = sd_init_ALLNODES(d, i); set_domain_attribute(sd, attr); cpumask_copy(sched_domain_span(sd), cpu_map); - cpu_to_allnodes_group(i, cpu_map, &sd->groups, d->tmpmask); d->sd_allnodes = 1; } parent = sd; - sd = &per_cpu(node_domains, i).sd; - SD_INIT(sd, NODE); + sd = sd_init_NODE(d, i); set_domain_attribute(sd, attr); sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd)); sd->parent = parent; if (parent) parent->child = sd; cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map); - cpu_to_node_group(i, cpu_map, &sd->groups, d->tmpmask); #endif return sd; } @@ -7178,14 +7144,12 @@ static struct sched_domain *__build_cpu_sched_domain(struct s_data *d, struct sched_domain *parent, int i) { struct sched_domain *sd; - sd = &per_cpu(phys_domains, i).sd; - SD_INIT(sd, CPU); + sd = sd_init_CPU(d, i); set_domain_attribute(sd, attr); cpumask_copy(sched_domain_span(sd), d->nodemask); sd->parent = parent; if (parent) parent->child = sd; - cpu_to_phys_group(i, cpu_map, &sd->groups, d->tmpmask); return sd; } @@ -7195,13 +7159,11 @@ static struct sched_domain *__build_book_sched_domain(struct s_data *d, { struct sched_domain *sd = parent; #ifdef CONFIG_SCHED_BOOK - sd = &per_cpu(book_domains, i).sd; - SD_INIT(sd, BOOK); + sd = sd_init_BOOK(d, i); set_domain_attribute(sd, attr); cpumask_and(sched_domain_span(sd), cpu_map, cpu_book_mask(i)); sd->parent = parent; parent->child = sd; - cpu_to_book_group(i, cpu_map, &sd->groups, d->tmpmask); #endif return sd; } @@ -7212,13 +7174,11 @@ static struct sched_domain *__build_mc_sched_domain(struct s_data *d, { struct sched_domain *sd = parent; #ifdef CONFIG_SCHED_MC - sd = &per_cpu(core_domains, i).sd; - SD_INIT(sd, MC); + sd = sd_init_MC(d, i); set_domain_attribute(sd, attr); cpumask_and(sched_domain_span(sd), cpu_map, cpu_coregroup_mask(i)); sd->parent = parent; parent->child = sd; - cpu_to_core_group(i, cpu_map, &sd->groups, d->tmpmask); #endif return sd; } @@ -7229,92 +7189,32 @@ static struct sched_domain *__build_smt_sched_domain(struct s_data *d, { struct sched_domain *sd = parent; #ifdef CONFIG_SCHED_SMT - sd = &per_cpu(cpu_domains, i).sd; - SD_INIT(sd, SIBLING); + sd = sd_init_SIBLING(d, i); set_domain_attribute(sd, attr); cpumask_and(sched_domain_span(sd), cpu_map, topology_thread_cpumask(i)); sd->parent = parent; parent->child = sd; - cpu_to_cpu_group(i, cpu_map, &sd->groups, d->tmpmask); #endif return sd; } -static void build_sched_groups(struct s_data *d, struct sched_domain *sd, - const struct cpumask *cpu_map, int cpu) -{ - switch (sd->level) { -#ifdef CONFIG_SCHED_SMT - case SD_LV_SIBLING: /* set up CPU (sibling) groups */ - if (cpu == cpumask_first(sched_domain_span(sd))) - init_sched_build_groups(sched_domain_span(sd), cpu_map, - &cpu_to_cpu_group, - d->send_covered, d->tmpmask); - break; -#endif -#ifdef CONFIG_SCHED_MC - case SD_LV_MC: /* set up multi-core groups */ - if (cpu == cpumask_first(sched_domain_span(sd))) - init_sched_build_groups(sched_domain_span(sd), cpu_map, - &cpu_to_core_group, - d->send_covered, d->tmpmask); - break; -#endif -#ifdef CONFIG_SCHED_BOOK - case SD_LV_BOOK: /* set up book groups */ - if (cpu == cpumask_first(sched_domain_span(sd))) - init_sched_build_groups(sched_domain_span(sd), cpu_map, - &cpu_to_book_group, - d->send_covered, d->tmpmask); - break; -#endif - case SD_LV_CPU: /* set up physical groups */ - if (cpu == cpumask_first(sched_domain_span(sd))) - init_sched_build_groups(sched_domain_span(sd), cpu_map, - &cpu_to_phys_group, - d->send_covered, d->tmpmask); - break; -#ifdef CONFIG_NUMA - case SD_LV_NODE: - if (cpu == cpumask_first(sched_domain_span(sd))) - init_sched_build_groups(sched_domain_span(sd), cpu_map, - &cpu_to_node_group, - d->send_covered, d->tmpmask); - - case SD_LV_ALLNODES: - if (cpu == cpumask_first(cpu_map)) - init_sched_build_groups(cpu_map, cpu_map, - &cpu_to_allnodes_group, - d->send_covered, d->tmpmask); - break; -#endif - default: - break; - } -} - /* * Build sched domains for a given set of cpus and attach the sched domains * to the individual cpus */ -static int __build_sched_domains(const struct cpumask *cpu_map, - struct sched_domain_attr *attr) +static int build_sched_domains(const struct cpumask *cpu_map, + struct sched_domain_attr *attr) { enum s_alloc alloc_state = sa_none; + struct sched_domain *sd; struct s_data d; - struct sched_domain *sd, *tmp; int i; -#ifdef CONFIG_NUMA - d.sd_allnodes = 0; -#endif alloc_state = __visit_domain_allocation_hell(&d, cpu_map); if (alloc_state != sa_rootdomain) goto error; - /* - * Set up domains for cpus specified by the cpu_map. - */ + /* Set up domains for cpus specified by the cpu_map. */ for_each_cpu(i, cpu_map) { cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)), cpu_map); @@ -7326,10 +7226,19 @@ static int __build_sched_domains(const struct cpumask *cpu_map, sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i); *per_cpu_ptr(d.sd, i) = sd; + } + + /* Build the groups for the domains */ + for_each_cpu(i, cpu_map) { + for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { + sd->span_weight = cpumask_weight(sched_domain_span(sd)); + get_group(i, sd->private, &sd->groups); + atomic_inc(&sd->groups->ref); - for (tmp = sd; tmp; tmp = tmp->parent) { - tmp->span_weight = cpumask_weight(sched_domain_span(tmp)); - build_sched_groups(&d, tmp, cpu_map, i); + if (i != cpumask_first(sched_domain_span(sd))) + continue; + + build_sched_groups(sd, d.send_covered); } } @@ -7338,18 +7247,21 @@ static int __build_sched_domains(const struct cpumask *cpu_map, if (!cpumask_test_cpu(i, cpu_map)) continue; - sd = *per_cpu_ptr(d.sd, i); - for (; sd; sd = sd->parent) + for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { + claim_allocations(i, sd); init_sched_groups_power(i, sd); + } } /* Attach the domains */ + rcu_read_lock(); for_each_cpu(i, cpu_map) { sd = *per_cpu_ptr(d.sd, i); cpu_attach_domain(sd, d.rd, i); } + rcu_read_unlock(); - __free_domain_allocs(&d, sa_tmpmask, cpu_map); + __free_domain_allocs(&d, sa_sd, cpu_map); return 0; error: @@ -7357,11 +7269,6 @@ error: return -ENOMEM; } -static int build_sched_domains(const struct cpumask *cpu_map) -{ - return __build_sched_domains(cpu_map, NULL); -} - static cpumask_var_t *doms_cur; /* current sched domains */ static int ndoms_cur; /* number of sched domains in 'doms_cur' */ static struct sched_domain_attr *dattr_cur; @@ -7425,31 +7332,24 @@ static int init_sched_domains(const struct cpumask *cpu_map) doms_cur = &fallback_doms; cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); dattr_cur = NULL; - err = build_sched_domains(doms_cur[0]); + err = build_sched_domains(doms_cur[0], NULL); register_sched_domain_sysctl(); return err; } -static void destroy_sched_domains(const struct cpumask *cpu_map, - struct cpumask *tmpmask) -{ -} - /* * Detach sched domains from a group of cpus specified in cpu_map * These cpus will now be attached to the NULL domain */ static void detach_destroy_domains(const struct cpumask *cpu_map) { - /* Save because hotplug lock held. */ - static DECLARE_BITMAP(tmpmask, CONFIG_NR_CPUS); int i; + rcu_read_lock(); for_each_cpu(i, cpu_map) cpu_attach_domain(NULL, &def_root_domain, i); - synchronize_sched(); - destroy_sched_domains(cpu_map, to_cpumask(tmpmask)); + rcu_read_unlock(); } /* handle null as "default" */ @@ -7538,8 +7438,7 @@ match1: goto match2; } /* no match - add a new doms_new */ - __build_sched_domains(doms_new[i], - dattr_new ? dattr_new + i : NULL); + build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL); match2: ; } diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 4ee50f0..4a8ac7c 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1622,6 +1622,7 @@ static int select_idle_sibling(struct task_struct *p, int target) /* * Otherwise, iterate the domains and find an elegible idle cpu. */ + rcu_read_lock(); for_each_domain(target, sd) { if (!(sd->flags & SD_SHARE_PKG_RESOURCES)) break; @@ -1641,6 +1642,7 @@ static int select_idle_sibling(struct task_struct *p, int target) cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) break; } + rcu_read_unlock(); return target; } @@ -1673,6 +1675,7 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_ new_cpu = prev_cpu; } + rcu_read_lock(); for_each_domain(cpu, tmp) { if (!(tmp->flags & SD_LOAD_BALANCE)) continue; @@ -1723,9 +1726,10 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_ if (affine_sd) { if (cpu == prev_cpu || wake_affine(affine_sd, p, sync)) - return select_idle_sibling(p, cpu); - else - return select_idle_sibling(p, prev_cpu); + prev_cpu = cpu; + + new_cpu = select_idle_sibling(p, prev_cpu); + goto unlock; } while (sd) { @@ -1766,6 +1770,8 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_ } /* while loop will break here if sd == NULL */ } +unlock: + rcu_read_unlock(); return new_cpu; } @@ -3462,6 +3468,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq) raw_spin_unlock(&this_rq->lock); update_shares(this_cpu); + rcu_read_lock(); for_each_domain(this_cpu, sd) { unsigned long interval; int balance = 1; @@ -3483,6 +3490,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq) break; } } + rcu_read_unlock(); raw_spin_lock(&this_rq->lock); @@ -3531,6 +3539,7 @@ static int active_load_balance_cpu_stop(void *data) double_lock_balance(busiest_rq, target_rq); /* Search for an sd spanning us and the target CPU. */ + rcu_read_lock(); for_each_domain(target_cpu, sd) { if ((sd->flags & SD_LOAD_BALANCE) && cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) @@ -3546,6 +3555,7 @@ static int active_load_balance_cpu_stop(void *data) else schedstat_inc(sd, alb_failed); } + rcu_read_unlock(); double_unlock_balance(busiest_rq, target_rq); out_unlock: busiest_rq->active_balance = 0; @@ -3672,6 +3682,7 @@ static int find_new_ilb(int cpu) { struct sched_domain *sd; struct sched_group *ilb_group; + int ilb = nr_cpu_ids; /* * Have idle load balancer selection from semi-idle packages only @@ -3687,20 +3698,25 @@ static int find_new_ilb(int cpu) if (cpumask_weight(nohz.idle_cpus_mask) < 2) goto out_done; + rcu_read_lock(); for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { ilb_group = sd->groups; do { - if (is_semi_idle_group(ilb_group)) - return cpumask_first(nohz.grp_idle_mask); + if (is_semi_idle_group(ilb_group)) { + ilb = cpumask_first(nohz.grp_idle_mask); + goto unlock; + } ilb_group = ilb_group->next; } while (ilb_group != sd->groups); } +unlock: + rcu_read_unlock(); out_done: - return nr_cpu_ids; + return ilb; } #else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ static inline int find_new_ilb(int call_cpu) @@ -3845,6 +3861,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) update_shares(cpu); + rcu_read_lock(); for_each_domain(cpu, sd) { if (!(sd->flags & SD_LOAD_BALANCE)) continue; @@ -3890,6 +3907,7 @@ out: if (!balance) break; } + rcu_read_unlock(); /* * next_balance will be updated only when there is a need. -- cgit v1.1 From 822ff793c34a5d4c8b5f3f9ce932602233d96464 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 7 Apr 2011 14:09:51 +0200 Subject: sched: Simplify the free path some If we check the root_domain reference count we can see if its been used or not, use this observation to simplify some of the return paths. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20110407122942.298339503@chello.nl Signed-off-by: Ingo Molnar --- kernel/sched.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 6520484..72c194c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7023,7 +7023,8 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what, switch (what) { case sa_rootdomain: - free_rootdomain(&d->rd->rcu); /* fall through */ + if (!atomic_read(&d->rd->refcount)) + free_rootdomain(&d->rd->rcu); /* fall through */ case sa_sd: free_percpu(d->sd); /* fall through */ case sa_sd_storage: @@ -7208,7 +7209,7 @@ static int build_sched_domains(const struct cpumask *cpu_map, enum s_alloc alloc_state = sa_none; struct sched_domain *sd; struct s_data d; - int i; + int i, ret = -ENOMEM; alloc_state = __visit_domain_allocation_hell(&d, cpu_map); if (alloc_state != sa_rootdomain) @@ -7261,12 +7262,10 @@ static int build_sched_domains(const struct cpumask *cpu_map, } rcu_read_unlock(); - __free_domain_allocs(&d, sa_sd, cpu_map); - return 0; - + ret = 0; error: __free_domain_allocs(&d, alloc_state, cpu_map); - return -ENOMEM; + return ret; } static cpumask_var_t *doms_cur; /* current sched domains */ -- cgit v1.1 From a6c75f2f8d988ecfecf971f98f1cb6fc4de522fe Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 7 Apr 2011 14:09:52 +0200 Subject: sched: Avoid using sd->level Don't use sd->level for identifying properties of the domain. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20110407122942.350174079@chello.nl Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 4a8ac7c..9c5679c 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -2651,7 +2651,7 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) /* * Only siblings can have significantly less than SCHED_LOAD_SCALE */ - if (sd->level != SD_LV_SIBLING) + if (!(sd->flags & SD_SHARE_CPUPOWER)) return 0; /* -- cgit v1.1 From 3859173d43658d51a749bc0201b943922577d39c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 7 Apr 2011 14:09:53 +0200 Subject: sched: Reduce some allocation pressure Since we now allocate SD_LV_MAX * nr_cpu_ids sched_domain/sched_group structures when rebuilding the scheduler toplogy it might make sense to shrink that depending on the CONFIG_ options. This is only needed until we get rid of SD_LV_* alltogether and provide a full dynamic topology interface. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20110407122942.406226449@chello.nl Signed-off-by: Ingo Molnar --- include/linux/sched.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/include/linux/sched.h b/include/linux/sched.h index 020b79d..5a9168b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -897,12 +897,20 @@ static inline struct cpumask *sched_group_cpus(struct sched_group *sg) enum sched_domain_level { SD_LV_NONE = 0, +#ifdef CONFIG_SCHED_SMT SD_LV_SIBLING, +#endif +#ifdef CONFIG_SCHED_MC SD_LV_MC, +#endif +#ifdef CONFIG_SCHED_BOOK SD_LV_BOOK, +#endif SD_LV_CPU, +#ifdef CONFIG_NUMA SD_LV_NODE, SD_LV_ALLNODES, +#endif SD_LV_MAX }; -- cgit v1.1 From 3bd65a80affb9768b91f03c56dba46ee79525f9b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 7 Apr 2011 14:09:54 +0200 Subject: sched: Simplify NODE/ALLNODES domain creation Don't treat ALLNODES/NODE different for difference's sake. Simply always create the ALLNODES domain and let the sd_degenerate() checks kill it when its redundant. This simplifies the code flow. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20110407122942.455464579@chello.nl Signed-off-by: Ingo Molnar --- kernel/sched.c | 40 ++++++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 72c194c..d395fe5 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6838,9 +6838,6 @@ struct sd_data { }; struct s_data { -#ifdef CONFIG_NUMA - int sd_allnodes; -#endif cpumask_var_t nodemask; cpumask_var_t send_covered; struct sched_domain ** __percpu sd; @@ -7112,30 +7109,35 @@ static void claim_allocations(int cpu, struct sched_domain *sd) } } -static struct sched_domain *__build_numa_sched_domains(struct s_data *d, - const struct cpumask *cpu_map, struct sched_domain_attr *attr, int i) +static struct sched_domain *__build_allnodes_sched_domain(struct s_data *d, + const struct cpumask *cpu_map, struct sched_domain_attr *attr, + struct sched_domain *parent, int i) { struct sched_domain *sd = NULL; #ifdef CONFIG_NUMA - struct sched_domain *parent; - - d->sd_allnodes = 0; - if (cpumask_weight(cpu_map) > - SD_NODES_PER_DOMAIN * cpumask_weight(d->nodemask)) { - sd = sd_init_ALLNODES(d, i); - set_domain_attribute(sd, attr); - cpumask_copy(sched_domain_span(sd), cpu_map); - d->sd_allnodes = 1; - } - parent = sd; + sd = sd_init_ALLNODES(d, i); + set_domain_attribute(sd, attr); + cpumask_copy(sched_domain_span(sd), cpu_map); + sd->parent = parent; + if (parent) + parent->child = sd; +#endif + return sd; +} +static struct sched_domain *__build_node_sched_domain(struct s_data *d, + const struct cpumask *cpu_map, struct sched_domain_attr *attr, + struct sched_domain *parent, int i) +{ + struct sched_domain *sd = NULL; +#ifdef CONFIG_NUMA sd = sd_init_NODE(d, i); set_domain_attribute(sd, attr); sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd)); + cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map); sd->parent = parent; if (parent) parent->child = sd; - cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map); #endif return sd; } @@ -7220,7 +7222,9 @@ static int build_sched_domains(const struct cpumask *cpu_map, cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)), cpu_map); - sd = __build_numa_sched_domains(&d, cpu_map, attr, i); + sd = NULL; + sd = __build_allnodes_sched_domain(&d, cpu_map, attr, sd, i); + sd = __build_node_sched_domain(&d, cpu_map, attr, sd, i); sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i); sd = __build_book_sched_domain(&d, cpu_map, attr, sd, i); sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i); -- cgit v1.1 From bf28b253266ebd73c331dde24d64606afde32ceb Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 7 Apr 2011 14:09:55 +0200 Subject: sched: Remove nodemask allocation There's only one nodemask user left so remove it with a direct computation and save some memory and reduce some code-flow complexity. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20110407122942.505608966@chello.nl Signed-off-by: Ingo Molnar --- kernel/sched.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index d395fe5..f4d3a62 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6838,7 +6838,6 @@ struct sd_data { }; struct s_data { - cpumask_var_t nodemask; cpumask_var_t send_covered; struct sched_domain ** __percpu sd; struct sd_data sdd[SD_LV_MAX]; @@ -6850,7 +6849,6 @@ enum s_alloc { sa_sd, sa_sd_storage, sa_send_covered, - sa_nodemask, sa_none, }; @@ -7035,8 +7033,6 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what, } /* fall through */ case sa_send_covered: free_cpumask_var(d->send_covered); /* fall through */ - case sa_nodemask: - free_cpumask_var(d->nodemask); /* fall through */ case sa_none: break; } @@ -7049,10 +7045,8 @@ static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, memset(d, 0, sizeof(*d)); - if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL)) - return sa_none; if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL)) - return sa_nodemask; + return sa_none; for (i = 0; i < SD_LV_MAX; i++) { d->sdd[i].sd = alloc_percpu(struct sched_domain *); if (!d->sdd[i].sd) @@ -7149,7 +7143,8 @@ static struct sched_domain *__build_cpu_sched_domain(struct s_data *d, struct sched_domain *sd; sd = sd_init_CPU(d, i); set_domain_attribute(sd, attr); - cpumask_copy(sched_domain_span(sd), d->nodemask); + cpumask_and(sched_domain_span(sd), + cpumask_of_node(cpu_to_node(i)), cpu_map); sd->parent = parent; if (parent) parent->child = sd; @@ -7219,9 +7214,6 @@ static int build_sched_domains(const struct cpumask *cpu_map, /* Set up domains for cpus specified by the cpu_map. */ for_each_cpu(i, cpu_map) { - cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)), - cpu_map); - sd = NULL; sd = __build_allnodes_sched_domain(&d, cpu_map, attr, sd, i); sd = __build_node_sched_domain(&d, cpu_map, attr, sd, i); -- cgit v1.1 From 7dd04b730749f957c116f363524fd622b05e5141 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 7 Apr 2011 14:09:56 +0200 Subject: sched: Remove some dead code Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20110407122942.553814623@chello.nl Signed-off-by: Ingo Molnar --- include/linux/sched.h | 6 ------ kernel/sched.c | 16 ---------------- 2 files changed, 22 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 5a9168b..09d9e02 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -883,9 +883,6 @@ struct sched_group { * NOTE: this field is variable length. (Allocated dynamically * by attaching extra space to the end of the structure, * depending on how many CPUs the kernel has booted up with) - * - * It is also be embedded into static data structures at build - * time. (See 'struct static_sched_group' in kernel/sched.c) */ unsigned long cpumask[0]; }; @@ -994,9 +991,6 @@ struct sched_domain { * NOTE: this field is variable length. (Allocated dynamically * by attaching extra space to the end of the structure, * depending on how many CPUs the kernel has booted up with) - * - * It is also be embedded into static data structures at build - * time. (See 'struct static_sched_domain' in kernel/sched.c) */ unsigned long span[0]; }; diff --git a/kernel/sched.c b/kernel/sched.c index f4d3a62..5ec685c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6816,22 +6816,6 @@ static void sched_domain_node_span(int node, struct cpumask *span) int sched_smt_power_savings = 0, sched_mc_power_savings = 0; -/* - * The cpus mask in sched_group and sched_domain hangs off the end. - * - * ( See the the comments in include/linux/sched.h:struct sched_group - * and struct sched_domain. ) - */ -struct static_sched_group { - struct sched_group sg; - DECLARE_BITMAP(cpus, CONFIG_NR_CPUS); -}; - -struct static_sched_domain { - struct sched_domain sd; - DECLARE_BITMAP(span, CONFIG_NR_CPUS); -}; - struct sd_data { struct sched_domain **__percpu sd; struct sched_group **__percpu sg; -- cgit v1.1 From f96225fd51893b6650cffd5427f13f6b1b356488 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 7 Apr 2011 14:09:57 +0200 Subject: sched: Create persistent sched_domains_tmpmask Since sched domain creation is fully serialized by the sched_domains_mutex we can create a single persistent tmpmask to use during domain creation. This removes the need for s_data::send_covered. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20110407122942.607287405@chello.nl Signed-off-by: Ingo Molnar --- kernel/sched.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 5ec685c..fd73e91 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6822,7 +6822,6 @@ struct sd_data { }; struct s_data { - cpumask_var_t send_covered; struct sched_domain ** __percpu sd; struct sd_data sdd[SD_LV_MAX]; struct root_domain *rd; @@ -6832,7 +6831,6 @@ enum s_alloc { sa_rootdomain, sa_sd, sa_sd_storage, - sa_send_covered, sa_none, }; @@ -6853,6 +6851,8 @@ static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) return cpu; } +static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */ + /* * build_sched_groups takes the cpumask we wish to span, and a pointer * to a function which identifies what group(along with sched group) a CPU @@ -6864,13 +6864,17 @@ static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) * and ->cpu_power to 0. */ static void -build_sched_groups(struct sched_domain *sd, struct cpumask *covered) +build_sched_groups(struct sched_domain *sd) { struct sched_group *first = NULL, *last = NULL; struct sd_data *sdd = sd->private; const struct cpumask *span = sched_domain_span(sd); + struct cpumask *covered; int i; + lockdep_assert_held(&sched_domains_mutex); + covered = sched_domains_tmpmask; + cpumask_clear(covered); for_each_cpu(i, span) { @@ -7015,8 +7019,6 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what, free_percpu(d->sdd[i].sd); free_percpu(d->sdd[i].sg); } /* fall through */ - case sa_send_covered: - free_cpumask_var(d->send_covered); /* fall through */ case sa_none: break; } @@ -7029,8 +7031,6 @@ static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, memset(d, 0, sizeof(*d)); - if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL)) - return sa_none; for (i = 0; i < SD_LV_MAX; i++) { d->sdd[i].sd = alloc_percpu(struct sched_domain *); if (!d->sdd[i].sd) @@ -7219,7 +7219,7 @@ static int build_sched_domains(const struct cpumask *cpu_map, if (i != cpumask_first(sched_domain_span(sd))) continue; - build_sched_groups(sd, d.send_covered); + build_sched_groups(sd); } } @@ -7896,6 +7896,7 @@ void __init sched_init(void) /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); + zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); #ifdef CONFIG_SMP #ifdef CONFIG_NO_HZ zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); -- cgit v1.1 From 4cb988395da6e16627a8be69729e50cd72ebb23e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 7 Apr 2011 14:09:58 +0200 Subject: sched: Avoid allocations in sched_domain_debug() Since we're all serialized by sched_domains_mutex we can use sched_domains_tmpmask and avoid having to do allocations. This means we can use sched_domains_debug() for cpu_attach_domain() again. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20110407122942.664347467@chello.nl Signed-off-by: Ingo Molnar --- kernel/sched.c | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index fd73e91..35fc995 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6395,6 +6395,8 @@ early_initcall(migration_init); #ifdef CONFIG_SMP +static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */ + #ifdef CONFIG_SCHED_DEBUG static __read_mostly int sched_domain_debug_enabled; @@ -6490,7 +6492,6 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, static void sched_domain_debug(struct sched_domain *sd, int cpu) { - cpumask_var_t groupmask; int level = 0; if (!sched_domain_debug_enabled) @@ -6503,20 +6504,14 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); - if (!alloc_cpumask_var(&groupmask, GFP_KERNEL)) { - printk(KERN_DEBUG "Cannot load-balance (out of memory)\n"); - return; - } - for (;;) { - if (sched_domain_debug_one(sd, cpu, level, groupmask)) + if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask)) break; level++; sd = sd->parent; if (!sd) break; } - free_cpumask_var(groupmask); } #else /* !CONFIG_SCHED_DEBUG */ # define sched_domain_debug(sd, cpu) do { } while (0) @@ -6721,7 +6716,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) sd->child = NULL; } - /* sched_domain_debug(sd, cpu); */ + sched_domain_debug(sd, cpu); rq_attach_root(rq, rd); tmp = rq->sd; @@ -6851,8 +6846,6 @@ static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) return cpu; } -static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */ - /* * build_sched_groups takes the cpumask we wish to span, and a pointer * to a function which identifies what group(along with sched group) a CPU @@ -7896,8 +7889,8 @@ void __init sched_init(void) /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); - zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); #ifdef CONFIG_SMP + zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); #ifdef CONFIG_NO_HZ zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT); -- cgit v1.1 From d3081f52f29da1ba6c27685519a9222b39eac763 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 7 Apr 2011 14:09:59 +0200 Subject: sched: Create proper cpu_$DOM_mask() functions In order to unify the sched domain creation more, create proper cpu_$DOM_mask() functions for those domains that didn't already have one. Use the sched_domains_tmpmask for the weird NUMA domain span. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20110407122942.717702108@chello.nl Signed-off-by: Ingo Molnar --- kernel/sched.c | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 35fc995..3ae1e02 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6807,8 +6807,22 @@ static void sched_domain_node_span(int node, struct cpumask *span) cpumask_or(span, span, cpumask_of_node(next_node)); } } + +static const struct cpumask *cpu_node_mask(int cpu) +{ + lockdep_assert_held(&sched_domains_mutex); + + sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask); + + return sched_domains_tmpmask; +} #endif /* CONFIG_NUMA */ +static const struct cpumask *cpu_cpu_mask(int cpu) +{ + return cpumask_of_node(cpu_to_node(cpu)); +} + int sched_smt_power_savings = 0, sched_mc_power_savings = 0; struct sd_data { @@ -7088,7 +7102,7 @@ static struct sched_domain *__build_allnodes_sched_domain(struct s_data *d, #ifdef CONFIG_NUMA sd = sd_init_ALLNODES(d, i); set_domain_attribute(sd, attr); - cpumask_copy(sched_domain_span(sd), cpu_map); + cpumask_and(sched_domain_span(sd), cpu_map, cpu_possible_mask); sd->parent = parent; if (parent) parent->child = sd; @@ -7104,8 +7118,7 @@ static struct sched_domain *__build_node_sched_domain(struct s_data *d, #ifdef CONFIG_NUMA sd = sd_init_NODE(d, i); set_domain_attribute(sd, attr); - sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd)); - cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map); + cpumask_and(sched_domain_span(sd), cpu_map, cpu_node_mask(i)); sd->parent = parent; if (parent) parent->child = sd; @@ -7120,8 +7133,7 @@ static struct sched_domain *__build_cpu_sched_domain(struct s_data *d, struct sched_domain *sd; sd = sd_init_CPU(d, i); set_domain_attribute(sd, attr); - cpumask_and(sched_domain_span(sd), - cpumask_of_node(cpu_to_node(i)), cpu_map); + cpumask_and(sched_domain_span(sd), cpu_map, cpu_cpu_mask(i)); sd->parent = parent; if (parent) parent->child = sd; -- cgit v1.1 From eb7a74e6cd936c00749e2921b9e058631d986648 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 7 Apr 2011 14:10:00 +0200 Subject: sched: Stuff the sched_domain creation in a data-structure In order to make the topology contruction fully dynamic, remove the still hard-coded list of possible domains and stick them in a data-structure. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20110407122942.770335383@chello.nl Signed-off-by: Ingo Molnar --- kernel/sched.c | 32 ++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 3ae1e02..f0e1821 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6843,6 +6843,16 @@ enum s_alloc { sa_none, }; +typedef struct sched_domain *(*sched_domain_build_f)(struct s_data *d, + const struct cpumask *cpu_map, struct sched_domain_attr *attr, + struct sched_domain *parent, int cpu); + +typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); + +struct sched_domain_topology_level { + sched_domain_build_f build; +}; + /* * Assumes the sched_domain tree is fully constructed */ @@ -7185,6 +7195,18 @@ static struct sched_domain *__build_smt_sched_domain(struct s_data *d, return sd; } +static struct sched_domain_topology_level default_topology[] = { + { __build_allnodes_sched_domain, }, + { __build_node_sched_domain, }, + { __build_cpu_sched_domain, }, + { __build_book_sched_domain, }, + { __build_mc_sched_domain, }, + { __build_smt_sched_domain, }, + { NULL, }, +}; + +static struct sched_domain_topology_level *sched_domain_topology = default_topology; + /* * Build sched domains for a given set of cpus and attach the sched domains * to the individual cpus @@ -7203,13 +7225,11 @@ static int build_sched_domains(const struct cpumask *cpu_map, /* Set up domains for cpus specified by the cpu_map. */ for_each_cpu(i, cpu_map) { + struct sched_domain_topology_level *tl; + sd = NULL; - sd = __build_allnodes_sched_domain(&d, cpu_map, attr, sd, i); - sd = __build_node_sched_domain(&d, cpu_map, attr, sd, i); - sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i); - sd = __build_book_sched_domain(&d, cpu_map, attr, sd, i); - sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i); - sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i); + for (tl = sched_domain_topology; tl->build; tl++) + sd = tl->build(&d, cpu_map, attr, sd, i); *per_cpu_ptr(d.sd, i) = sd; } -- cgit v1.1 From 2c402dc3bb502e9dd74fce72c14d293fcef4719d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 7 Apr 2011 14:10:01 +0200 Subject: sched: Unify the sched_domain build functions Since all the __build_$DOM_sched_domain() functions do pretty much the same thing, unify them. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20110407122942.826347257@chello.nl Signed-off-by: Ingo Molnar --- kernel/sched.c | 133 +++++++++++++++++---------------------------------------- 1 file changed, 39 insertions(+), 94 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index f0e1821..00d1e37 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6816,6 +6816,11 @@ static const struct cpumask *cpu_node_mask(int cpu) return sched_domains_tmpmask; } + +static const struct cpumask *cpu_allnodes_mask(int cpu) +{ + return cpu_possible_mask; +} #endif /* CONFIG_NUMA */ static const struct cpumask *cpu_cpu_mask(int cpu) @@ -6843,14 +6848,12 @@ enum s_alloc { sa_none, }; -typedef struct sched_domain *(*sched_domain_build_f)(struct s_data *d, - const struct cpumask *cpu_map, struct sched_domain_attr *attr, - struct sched_domain *parent, int cpu); - +typedef struct sched_domain *(*sched_domain_init_f)(struct s_data *d, int cpu); typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); struct sched_domain_topology_level { - sched_domain_build_f build; + sched_domain_init_f init; + sched_domain_mask_f mask; }; /* @@ -7104,109 +7107,51 @@ static void claim_allocations(int cpu, struct sched_domain *sd) } } -static struct sched_domain *__build_allnodes_sched_domain(struct s_data *d, - const struct cpumask *cpu_map, struct sched_domain_attr *attr, - struct sched_domain *parent, int i) +#ifdef CONFIG_SCHED_SMT +static const struct cpumask *cpu_smt_mask(int cpu) { - struct sched_domain *sd = NULL; -#ifdef CONFIG_NUMA - sd = sd_init_ALLNODES(d, i); - set_domain_attribute(sd, attr); - cpumask_and(sched_domain_span(sd), cpu_map, cpu_possible_mask); - sd->parent = parent; - if (parent) - parent->child = sd; -#endif - return sd; + return topology_thread_cpumask(cpu); } +#endif -static struct sched_domain *__build_node_sched_domain(struct s_data *d, - const struct cpumask *cpu_map, struct sched_domain_attr *attr, - struct sched_domain *parent, int i) -{ - struct sched_domain *sd = NULL; +static struct sched_domain_topology_level default_topology[] = { #ifdef CONFIG_NUMA - sd = sd_init_NODE(d, i); - set_domain_attribute(sd, attr); - cpumask_and(sched_domain_span(sd), cpu_map, cpu_node_mask(i)); - sd->parent = parent; - if (parent) - parent->child = sd; + { sd_init_ALLNODES, cpu_allnodes_mask, }, + { sd_init_NODE, cpu_node_mask, }, #endif - return sd; -} - -static struct sched_domain *__build_cpu_sched_domain(struct s_data *d, - const struct cpumask *cpu_map, struct sched_domain_attr *attr, - struct sched_domain *parent, int i) -{ - struct sched_domain *sd; - sd = sd_init_CPU(d, i); - set_domain_attribute(sd, attr); - cpumask_and(sched_domain_span(sd), cpu_map, cpu_cpu_mask(i)); - sd->parent = parent; - if (parent) - parent->child = sd; - return sd; -} - -static struct sched_domain *__build_book_sched_domain(struct s_data *d, - const struct cpumask *cpu_map, struct sched_domain_attr *attr, - struct sched_domain *parent, int i) -{ - struct sched_domain *sd = parent; + { sd_init_CPU, cpu_cpu_mask, }, #ifdef CONFIG_SCHED_BOOK - sd = sd_init_BOOK(d, i); - set_domain_attribute(sd, attr); - cpumask_and(sched_domain_span(sd), cpu_map, cpu_book_mask(i)); - sd->parent = parent; - parent->child = sd; + { sd_init_BOOK, cpu_book_mask, }, #endif - return sd; -} - -static struct sched_domain *__build_mc_sched_domain(struct s_data *d, - const struct cpumask *cpu_map, struct sched_domain_attr *attr, - struct sched_domain *parent, int i) -{ - struct sched_domain *sd = parent; #ifdef CONFIG_SCHED_MC - sd = sd_init_MC(d, i); - set_domain_attribute(sd, attr); - cpumask_and(sched_domain_span(sd), cpu_map, cpu_coregroup_mask(i)); - sd->parent = parent; - parent->child = sd; + { sd_init_MC, cpu_coregroup_mask, }, #endif - return sd; -} - -static struct sched_domain *__build_smt_sched_domain(struct s_data *d, - const struct cpumask *cpu_map, struct sched_domain_attr *attr, - struct sched_domain *parent, int i) -{ - struct sched_domain *sd = parent; #ifdef CONFIG_SCHED_SMT - sd = sd_init_SIBLING(d, i); - set_domain_attribute(sd, attr); - cpumask_and(sched_domain_span(sd), cpu_map, topology_thread_cpumask(i)); - sd->parent = parent; - parent->child = sd; + { sd_init_SIBLING, cpu_smt_mask, }, #endif - return sd; -} - -static struct sched_domain_topology_level default_topology[] = { - { __build_allnodes_sched_domain, }, - { __build_node_sched_domain, }, - { __build_cpu_sched_domain, }, - { __build_book_sched_domain, }, - { __build_mc_sched_domain, }, - { __build_smt_sched_domain, }, { NULL, }, }; static struct sched_domain_topology_level *sched_domain_topology = default_topology; +struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, + struct s_data *d, const struct cpumask *cpu_map, + struct sched_domain_attr *attr, struct sched_domain *parent, + int cpu) +{ + struct sched_domain *sd = tl->init(d, cpu); + if (!sd) + return parent; + + set_domain_attribute(sd, attr); + cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); + sd->parent = parent; + if (parent) + parent->child = sd; + + return sd; +} + /* * Build sched domains for a given set of cpus and attach the sched domains * to the individual cpus @@ -7228,8 +7173,8 @@ static int build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_topology_level *tl; sd = NULL; - for (tl = sched_domain_topology; tl->build; tl++) - sd = tl->build(&d, cpu_map, attr, sd, i); + for (tl = sched_domain_topology; tl->init; tl++) + sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i); *per_cpu_ptr(d.sd, i) = sd; } -- cgit v1.1 From d069b916f7b50021d41d6ce498f86da32a7afaec Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 7 Apr 2011 14:10:02 +0200 Subject: sched: Reverse the topology list In order to get rid of static sched_domain::level assignments, reverse the topology iteration. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20110407122942.876506131@chello.nl Signed-off-by: Ingo Molnar --- kernel/sched.c | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 00d1e37..38bc53b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7114,20 +7114,23 @@ static const struct cpumask *cpu_smt_mask(int cpu) } #endif +/* + * Topology list, bottom-up. + */ static struct sched_domain_topology_level default_topology[] = { -#ifdef CONFIG_NUMA - { sd_init_ALLNODES, cpu_allnodes_mask, }, - { sd_init_NODE, cpu_node_mask, }, -#endif - { sd_init_CPU, cpu_cpu_mask, }, -#ifdef CONFIG_SCHED_BOOK - { sd_init_BOOK, cpu_book_mask, }, +#ifdef CONFIG_SCHED_SMT + { sd_init_SIBLING, cpu_smt_mask, }, #endif #ifdef CONFIG_SCHED_MC { sd_init_MC, cpu_coregroup_mask, }, #endif -#ifdef CONFIG_SCHED_SMT - { sd_init_SIBLING, cpu_smt_mask, }, +#ifdef CONFIG_SCHED_BOOK + { sd_init_BOOK, cpu_book_mask, }, +#endif + { sd_init_CPU, cpu_cpu_mask, }, +#ifdef CONFIG_NUMA + { sd_init_NODE, cpu_node_mask, }, + { sd_init_ALLNODES, cpu_allnodes_mask, }, #endif { NULL, }, }; @@ -7136,18 +7139,18 @@ static struct sched_domain_topology_level *sched_domain_topology = default_topol struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, struct s_data *d, const struct cpumask *cpu_map, - struct sched_domain_attr *attr, struct sched_domain *parent, + struct sched_domain_attr *attr, struct sched_domain *child, int cpu) { struct sched_domain *sd = tl->init(d, cpu); if (!sd) - return parent; + return child; set_domain_attribute(sd, attr); cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); - sd->parent = parent; - if (parent) - parent->child = sd; + if (child) + child->parent = sd; + sd->child = child; return sd; } @@ -7176,6 +7179,9 @@ static int build_sched_domains(const struct cpumask *cpu_map, for (tl = sched_domain_topology; tl->init; tl++) sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i); + while (sd->child) + sd = sd->child; + *per_cpu_ptr(d.sd, i) = sd; } -- cgit v1.1 From 54ab4ff4316eb329d2c1acc110fbc623d2966931 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 7 Apr 2011 14:10:03 +0200 Subject: sched: Move sched domain storage into the topology list In order to remove the last dependency on the statid domain levels, move the sd_data storage into the topology structure. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20110407122942.924926412@chello.nl Signed-off-by: Ingo Molnar --- kernel/sched.c | 129 ++++++++++++++++++++++++++++++++++----------------------- 1 file changed, 77 insertions(+), 52 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 38bc53b..3231e19 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6837,7 +6837,6 @@ struct sd_data { struct s_data { struct sched_domain ** __percpu sd; - struct sd_data sdd[SD_LV_MAX]; struct root_domain *rd; }; @@ -6848,12 +6847,15 @@ enum s_alloc { sa_none, }; -typedef struct sched_domain *(*sched_domain_init_f)(struct s_data *d, int cpu); +struct sched_domain_topology_level; + +typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu); typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); struct sched_domain_topology_level { sched_domain_init_f init; sched_domain_mask_f mask; + struct sd_data data; }; /* @@ -6958,15 +6960,16 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) # define SD_INIT_NAME(sd, type) do { } while (0) #endif -#define SD_INIT_FUNC(type) \ -static noinline struct sched_domain *sd_init_##type(struct s_data *d, int cpu) \ -{ \ - struct sched_domain *sd = *per_cpu_ptr(d->sdd[SD_LV_##type].sd, cpu); \ - *sd = SD_##type##_INIT; \ - sd->level = SD_LV_##type; \ - SD_INIT_NAME(sd, type); \ - sd->private = &d->sdd[SD_LV_##type]; \ - return sd; \ +#define SD_INIT_FUNC(type) \ +static noinline struct sched_domain * \ +sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \ +{ \ + struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \ + *sd = SD_##type##_INIT; \ + sd->level = SD_LV_##type; \ + SD_INIT_NAME(sd, type); \ + sd->private = &tl->data; \ + return sd; \ } SD_INIT_FUNC(CPU) @@ -7019,11 +7022,12 @@ static void set_domain_attribute(struct sched_domain *sd, } } +static void __sdt_free(const struct cpumask *cpu_map); +static int __sdt_alloc(const struct cpumask *cpu_map); + static void __free_domain_allocs(struct s_data *d, enum s_alloc what, const struct cpumask *cpu_map) { - int i, j; - switch (what) { case sa_rootdomain: if (!atomic_read(&d->rd->refcount)) @@ -7031,14 +7035,7 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what, case sa_sd: free_percpu(d->sd); /* fall through */ case sa_sd_storage: - for (i = 0; i < SD_LV_MAX; i++) { - for_each_cpu(j, cpu_map) { - kfree(*per_cpu_ptr(d->sdd[i].sd, j)); - kfree(*per_cpu_ptr(d->sdd[i].sg, j)); - } - free_percpu(d->sdd[i].sd); - free_percpu(d->sdd[i].sg); - } /* fall through */ + __sdt_free(cpu_map); /* fall through */ case sa_none: break; } @@ -7047,38 +7044,10 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what, static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, const struct cpumask *cpu_map) { - int i, j; - memset(d, 0, sizeof(*d)); - for (i = 0; i < SD_LV_MAX; i++) { - d->sdd[i].sd = alloc_percpu(struct sched_domain *); - if (!d->sdd[i].sd) - return sa_sd_storage; - - d->sdd[i].sg = alloc_percpu(struct sched_group *); - if (!d->sdd[i].sg) - return sa_sd_storage; - - for_each_cpu(j, cpu_map) { - struct sched_domain *sd; - struct sched_group *sg; - - sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(), - GFP_KERNEL, cpu_to_node(j)); - if (!sd) - return sa_sd_storage; - - *per_cpu_ptr(d->sdd[i].sd, j) = sd; - - sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), - GFP_KERNEL, cpu_to_node(j)); - if (!sg) - return sa_sd_storage; - - *per_cpu_ptr(d->sdd[i].sg, j) = sg; - } - } + if (__sdt_alloc(cpu_map)) + return sa_sd_storage; d->sd = alloc_percpu(struct sched_domain *); if (!d->sd) return sa_sd_storage; @@ -7137,12 +7106,68 @@ static struct sched_domain_topology_level default_topology[] = { static struct sched_domain_topology_level *sched_domain_topology = default_topology; +static int __sdt_alloc(const struct cpumask *cpu_map) +{ + struct sched_domain_topology_level *tl; + int j; + + for (tl = sched_domain_topology; tl->init; tl++) { + struct sd_data *sdd = &tl->data; + + sdd->sd = alloc_percpu(struct sched_domain *); + if (!sdd->sd) + return -ENOMEM; + + sdd->sg = alloc_percpu(struct sched_group *); + if (!sdd->sg) + return -ENOMEM; + + for_each_cpu(j, cpu_map) { + struct sched_domain *sd; + struct sched_group *sg; + + sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(), + GFP_KERNEL, cpu_to_node(j)); + if (!sd) + return -ENOMEM; + + *per_cpu_ptr(sdd->sd, j) = sd; + + sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), + GFP_KERNEL, cpu_to_node(j)); + if (!sg) + return -ENOMEM; + + *per_cpu_ptr(sdd->sg, j) = sg; + } + } + + return 0; +} + +static void __sdt_free(const struct cpumask *cpu_map) +{ + struct sched_domain_topology_level *tl; + int j; + + for (tl = sched_domain_topology; tl->init; tl++) { + struct sd_data *sdd = &tl->data; + + for_each_cpu(j, cpu_map) { + kfree(*per_cpu_ptr(sdd->sd, j)); + kfree(*per_cpu_ptr(sdd->sg, j)); + } + free_percpu(sdd->sd); + free_percpu(sdd->sg); + } +} + struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, struct s_data *d, const struct cpumask *cpu_map, struct sched_domain_attr *attr, struct sched_domain *child, int cpu) { - struct sched_domain *sd = tl->init(d, cpu); + struct sched_domain *sd = tl->init(tl, cpu); if (!sd) return child; -- cgit v1.1 From 60495e7760d8ee364695006af37309b0755e0e17 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 7 Apr 2011 14:10:04 +0200 Subject: sched: Dynamic sched_domain::level Remove the SD_LV_ enum and use dynamic level assignments. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20110407122942.969433965@chello.nl Signed-off-by: Ingo Molnar --- include/linux/sched.h | 23 +++-------------------- kernel/cpuset.c | 2 +- kernel/sched.c | 9 ++++++--- 3 files changed, 10 insertions(+), 24 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 09d9e02..e43e5b0 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -892,25 +892,6 @@ static inline struct cpumask *sched_group_cpus(struct sched_group *sg) return to_cpumask(sg->cpumask); } -enum sched_domain_level { - SD_LV_NONE = 0, -#ifdef CONFIG_SCHED_SMT - SD_LV_SIBLING, -#endif -#ifdef CONFIG_SCHED_MC - SD_LV_MC, -#endif -#ifdef CONFIG_SCHED_BOOK - SD_LV_BOOK, -#endif - SD_LV_CPU, -#ifdef CONFIG_NUMA - SD_LV_NODE, - SD_LV_ALLNODES, -#endif - SD_LV_MAX -}; - struct sched_domain_attr { int relax_domain_level; }; @@ -919,6 +900,8 @@ struct sched_domain_attr { .relax_domain_level = -1, \ } +extern int sched_domain_level_max; + struct sched_domain { /* These fields must be setup */ struct sched_domain *parent; /* top domain must be null terminated */ @@ -936,7 +919,7 @@ struct sched_domain { unsigned int forkexec_idx; unsigned int smt_gain; int flags; /* See SD_* */ - enum sched_domain_level level; + int level; /* Runtime fields. */ unsigned long last_balance; /* init to jiffies. units in jiffies */ diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 33eee16..2bb8c2e 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1159,7 +1159,7 @@ int current_cpuset_is_being_rebound(void) static int update_relax_domain_level(struct cpuset *cs, s64 val) { #ifdef CONFIG_SMP - if (val < -1 || val >= SD_LV_MAX) + if (val < -1 || val >= sched_domain_level_max) return -EINVAL; #endif diff --git a/kernel/sched.c b/kernel/sched.c index 3231e19..506cb81 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6966,7 +6966,6 @@ sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \ { \ struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \ *sd = SD_##type##_INIT; \ - sd->level = SD_LV_##type; \ SD_INIT_NAME(sd, type); \ sd->private = &tl->data; \ return sd; \ @@ -6988,13 +6987,14 @@ SD_INIT_FUNC(CPU) #endif static int default_relax_domain_level = -1; +int sched_domain_level_max; static int __init setup_relax_domain_level(char *str) { unsigned long val; val = simple_strtoul(str, NULL, 0); - if (val < SD_LV_MAX) + if (val < sched_domain_level_max) default_relax_domain_level = val; return 1; @@ -7173,8 +7173,11 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, set_domain_attribute(sd, attr); cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); - if (child) + if (child) { + sd->level = child->level + 1; + sched_domain_level_max = max(sched_domain_level_max, sd->level); child->parent = sd; + } sd->child = child; return sd; -- cgit v1.1 From 3905c54f2bd2c6f937f87307987ca072eabc3e7b Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Tue, 12 Apr 2011 14:00:40 +1000 Subject: sched, sparc64: Turn cpu_coregroup_mask() into a real function This compile error triggers on Sparc64: kernel/sched.c:7140: error: 'cpu_coregroup_mask' undeclared here (not in a function) Because after the recent scheduler domain cleanups the scheduler uses this arch method as a function pointer in a scheduler topology data structure - which is not possible with a macro. Signed-off-by: Stephen Rothwell Acked-by: David S. Miller Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20110412140040.3020ef55.sfr@canb.auug.org.au Signed-off-by: Ingo Molnar --- arch/sparc/include/asm/topology_64.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/sparc/include/asm/topology_64.h b/arch/sparc/include/asm/topology_64.h index 1c79f32..8b9c556 100644 --- a/arch/sparc/include/asm/topology_64.h +++ b/arch/sparc/include/asm/topology_64.h @@ -65,6 +65,10 @@ static inline int pcibus_to_node(struct pci_bus *pbus) #define smt_capable() (sparc64_multi_core) #endif /* CONFIG_SMP */ -#define cpu_coregroup_mask(cpu) (&cpu_core_map[cpu]) +extern cpumask_t cpu_core_map[NR_CPUS]; +static inline const struct cpumask *cpu_coregroup_mask(int cpu) +{ + return &cpu_core_map[cpu]; +} #endif /* _ASM_SPARC64_TOPOLOGY_H */ -- cgit v1.1 From d99ddec3eee0be8a43b2c1ff624b9dfaaa26b959 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 11 Apr 2011 11:03:18 +0200 Subject: x86/amd-iommu: Add extended feature detection This patch adds detection of the extended features of an AMD IOMMU. The available features are printed to dmesg on boot. Signed-off-by: Joerg Roedel --- arch/x86/include/asm/amd_iommu_proto.h | 10 +++++++++- arch/x86/include/asm/amd_iommu_types.h | 17 +++++++++++++++++ arch/x86/kernel/amd_iommu_init.c | 24 ++++++++++++++++++++++-- 3 files changed, 48 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/amd_iommu_proto.h b/arch/x86/include/asm/amd_iommu_proto.h index 1223c0f..a4ae6c3 100644 --- a/arch/x86/include/asm/amd_iommu_proto.h +++ b/arch/x86/include/asm/amd_iommu_proto.h @@ -19,7 +19,7 @@ #ifndef _ASM_X86_AMD_IOMMU_PROTO_H #define _ASM_X86_AMD_IOMMU_PROTO_H -struct amd_iommu; +#include extern int amd_iommu_init_dma_ops(void); extern int amd_iommu_init_passthrough(void); @@ -42,4 +42,12 @@ static inline bool is_rd890_iommu(struct pci_dev *pdev) (pdev->device == PCI_DEVICE_ID_RD890_IOMMU); } +static inline bool iommu_feature(struct amd_iommu *iommu, u64 f) +{ + if (!(iommu->cap & (1 << IOMMU_CAP_EFR))) + return false; + + return !!(iommu->features & f); +} + #endif /* _ASM_X86_AMD_IOMMU_PROTO_H */ diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h index 878ae00..5c24e46 100644 --- a/arch/x86/include/asm/amd_iommu_types.h +++ b/arch/x86/include/asm/amd_iommu_types.h @@ -68,12 +68,25 @@ #define MMIO_CONTROL_OFFSET 0x0018 #define MMIO_EXCL_BASE_OFFSET 0x0020 #define MMIO_EXCL_LIMIT_OFFSET 0x0028 +#define MMIO_EXT_FEATURES 0x0030 #define MMIO_CMD_HEAD_OFFSET 0x2000 #define MMIO_CMD_TAIL_OFFSET 0x2008 #define MMIO_EVT_HEAD_OFFSET 0x2010 #define MMIO_EVT_TAIL_OFFSET 0x2018 #define MMIO_STATUS_OFFSET 0x2020 + +/* Extended Feature Bits */ +#define FEATURE_PREFETCH (1ULL<<0) +#define FEATURE_PPR (1ULL<<1) +#define FEATURE_X2APIC (1ULL<<2) +#define FEATURE_NX (1ULL<<3) +#define FEATURE_GT (1ULL<<4) +#define FEATURE_IA (1ULL<<6) +#define FEATURE_GA (1ULL<<7) +#define FEATURE_HE (1ULL<<8) +#define FEATURE_PC (1ULL<<9) + /* MMIO status bits */ #define MMIO_STATUS_COM_WAIT_INT_MASK 0x04 @@ -227,6 +240,7 @@ /* IOMMU capabilities */ #define IOMMU_CAP_IOTLB 24 #define IOMMU_CAP_NPCACHE 26 +#define IOMMU_CAP_EFR 27 #define MAX_DOMAIN_ID 65536 @@ -371,6 +385,9 @@ struct amd_iommu { /* flags read from acpi table */ u8 acpi_flags; + /* Extended features */ + u64 features; + /* * Capability pointer. There could be more than one IOMMU per PCI * device function if there are more than one AMD IOMMU capability diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 8848dda..047905d 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -299,9 +299,23 @@ static void iommu_feature_disable(struct amd_iommu *iommu, u8 bit) /* Function to enable the hardware */ static void iommu_enable(struct amd_iommu *iommu) { - printk(KERN_INFO "AMD-Vi: Enabling IOMMU at %s cap 0x%hx\n", + static const char * const feat_str[] = { + "PreF", "PPR", "X2APIC", "NX", "GT", "[5]", + "IA", "GA", "HE", "PC", NULL + }; + int i; + + printk(KERN_INFO "AMD-Vi: Enabling IOMMU at %s cap 0x%hx", dev_name(&iommu->dev->dev), iommu->cap_ptr); + if (iommu->cap & (1 << IOMMU_CAP_EFR)) { + printk(KERN_CONT " extended features: "); + for (i = 0; feat_str[i]; ++i) + if (iommu_feature(iommu, (1ULL << i))) + printk(KERN_CONT " %s", feat_str[i]); + } + printk(KERN_CONT "\n"); + iommu_feature_enable(iommu, CONTROL_IOMMU_EN); } @@ -657,7 +671,7 @@ static void __init set_device_exclusion_range(u16 devid, struct ivmd_header *m) static void __init init_iommu_from_pci(struct amd_iommu *iommu) { int cap_ptr = iommu->cap_ptr; - u32 range, misc; + u32 range, misc, low, high; int i, j; pci_read_config_dword(iommu->dev, cap_ptr + MMIO_CAP_HDR_OFFSET, @@ -673,6 +687,12 @@ static void __init init_iommu_from_pci(struct amd_iommu *iommu) MMIO_GET_LD(range)); iommu->evt_msi_num = MMIO_MSI_NUM(misc); + /* read extended feature bits */ + low = readl(iommu->mmio_base + MMIO_EXT_FEATURES); + high = readl(iommu->mmio_base + MMIO_EXT_FEATURES + 4); + + iommu->features = ((u64)high << 32) | low; + if (!is_rd890_iommu(iommu->dev)) return; -- cgit v1.1 From 58fc7f1419560efa9c426b829c195050e0147d7f Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 11 Apr 2011 11:13:24 +0200 Subject: x86/amd-iommu: Add support for invalidate_all command This patch adds support for the invalidate_all command present in new versions of the AMD IOMMU. Signed-off-by: Joerg Roedel --- arch/x86/include/asm/amd_iommu_types.h | 1 + arch/x86/kernel/amd_iommu.c | 24 ++++++++++++++++++++++-- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h index 5c24e46..df62d26 100644 --- a/arch/x86/include/asm/amd_iommu_types.h +++ b/arch/x86/include/asm/amd_iommu_types.h @@ -127,6 +127,7 @@ #define CMD_COMPL_WAIT 0x01 #define CMD_INV_DEV_ENTRY 0x02 #define CMD_INV_IOMMU_PAGES 0x03 +#define CMD_INV_ALL 0x08 #define CMD_COMPL_WAIT_STORE_MASK 0x01 #define CMD_COMPL_WAIT_INT_MASK 0x02 diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index bcf58ea..d6192bc 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -463,6 +463,12 @@ static void build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address, cmd->data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK; } +static void build_inv_all(struct iommu_cmd *cmd) +{ + memset(cmd, 0, sizeof(*cmd)); + CMD_SET_TYPE(cmd, CMD_INV_ALL); +} + /* * Writes the command to the IOMMUs command buffer and informs the * hardware about the new command. @@ -567,10 +573,24 @@ static void iommu_flush_tlb_all(struct amd_iommu *iommu) iommu_completion_wait(iommu); } +static void iommu_flush_all(struct amd_iommu *iommu) +{ + struct iommu_cmd cmd; + + build_inv_all(&cmd); + + iommu_queue_command(iommu, &cmd); + iommu_completion_wait(iommu); +} + void iommu_flush_all_caches(struct amd_iommu *iommu) { - iommu_flush_dte_all(iommu); - iommu_flush_tlb_all(iommu); + if (iommu_feature(iommu, FEATURE_IA)) { + iommu_flush_all(iommu); + } else { + iommu_flush_dte_all(iommu); + iommu_flush_tlb_all(iommu); + } } /* -- cgit v1.1 From 32a90b6e65792260d6212ac52e8f5be140b6f5be Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 8 Apr 2011 12:01:51 +0200 Subject: ARM: imx: fix usb related build failure for mach-vpr200 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This was broken by 4bd597b (ARM i.MX ehci: do ehci init in board specific functions) and fixes: CC arch/arm/mach-mx3/mach-vpr200.o arch/arm/mach-mx3/mach-vpr200.c:263: error: unknown field 'flags' specified in initializer arch/arm/mach-mx3/mach-vpr200.c:264: warning: initialization makes pointer from integer without a cast by just applying the change to mach-vpr200.c that the other machine files got by 4bd597b. LAKML-Reference: 1302257029-17397-1-git-send-email-u.kleine-koenig@pengutronix.de Acked-by: Marc Reilly Signed-off-by: Uwe Kleine-König Signed-off-by: Sascha Hauer --- arch/arm/mach-mx3/mach-vpr200.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/arch/arm/mach-mx3/mach-vpr200.c b/arch/arm/mach-mx3/mach-vpr200.c index 2cf390f..47a69cb 100644 --- a/arch/arm/mach-mx3/mach-vpr200.c +++ b/arch/arm/mach-mx3/mach-vpr200.c @@ -257,11 +257,16 @@ static const struct fsl_usb2_platform_data otg_device_pdata __initconst = { .workaround = FLS_USB2_WORKAROUND_ENGCM09152, }; +static int vpr200_usbh_init(struct platform_device *pdev) +{ + return mx35_initialize_usb_hw(pdev->id, + MXC_EHCI_INTERFACE_SINGLE_UNI | MXC_EHCI_INTERNAL_PHY); +} + /* USB HOST config */ static const struct mxc_usbh_platform_data usb_host_pdata __initconst = { - .portsc = MXC_EHCI_MODE_SERIAL, - .flags = MXC_EHCI_INTERFACE_SINGLE_UNI | - MXC_EHCI_INTERNAL_PHY, + .init = vpr200_usbh_init, + .portsc = MXC_EHCI_MODE_SERIAL, }; static struct platform_device *devices[] __initdata = { -- cgit v1.1 From f61b9fc27e5b61dbc330696f040cc66ba1dbcbaa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Mon, 4 Apr 2011 16:06:12 +0200 Subject: ARM: mxs/clock-mx28: fix up name##_set_rate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For the lcdif clock get_rate looks as follows: read div from HW_CLKCTRL_DIS_LCDIF.DIV return clk_get_rate(clk->parent) / div with clk->parent being ref_pix_clk on my system. ref_pix_clk's rate depends on HW_CLKCTRL_FRAC1.PIXFRAC. The set_rate function for lcdif does: parent_rate = clk_get_rate(clk->parent); based on that calculate frac and div such that parent_rate * 18 / frac / div is near the requested rate. HW_CLKCTRL_FRAC1.PIXFRAC is updated with frac HW_CLKCTRL_DIS_LCDIF.DIV is updated with div For this calculation to be correct parent_rate needs to be initialized not with the clock rate of lcdif's parent (i.e. ref_pix) but that of its grandparent (i.e. ref_pix' parent == pll0_clk). The obvious downside of this patch is that now set_rate(lcdif) changes its parent's rate, too. Still this is better than a wrong rate. Acked-by: Shawn Guo LAKML-Reference: 20110225084950.GA13684@S2101-09.ap.freescale.net Signed-off-by: Uwe Kleine-König Signed-off-by: Sascha Hauer --- arch/arm/mach-mxs/clock-mx28.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/arm/mach-mxs/clock-mx28.c b/arch/arm/mach-mxs/clock-mx28.c index 1ad97fe..5dcc59d 100644 --- a/arch/arm/mach-mxs/clock-mx28.c +++ b/arch/arm/mach-mxs/clock-mx28.c @@ -295,11 +295,11 @@ static int name##_set_rate(struct clk *clk, unsigned long rate) \ unsigned long diff, parent_rate, calc_rate; \ int i; \ \ - parent_rate = clk_get_rate(clk->parent); \ div_max = BM_CLKCTRL_##dr##_DIV >> BP_CLKCTRL_##dr##_DIV; \ bm_busy = BM_CLKCTRL_##dr##_BUSY; \ \ if (clk->parent == &ref_xtal_clk) { \ + parent_rate = clk_get_rate(clk->parent); \ div = DIV_ROUND_UP(parent_rate, rate); \ if (clk == &cpu_clk) { \ div_max = BM_CLKCTRL_CPU_DIV_XTAL >> \ @@ -309,6 +309,11 @@ static int name##_set_rate(struct clk *clk, unsigned long rate) \ if (div == 0 || div > div_max) \ return -EINVAL; \ } else { \ + /* \ + * hack alert: this block modifies clk->parent, too, \ + * so the base to use it the grand parent. \ + */ \ + parent_rate = clk_get_rate(clk->parent->parent); \ rate >>= PARENT_RATE_SHIFT; \ parent_rate >>= PARENT_RATE_SHIFT; \ diff = parent_rate; \ -- cgit v1.1 From 0575b4b83edb0a766a9c1518a5da57780f386340 Mon Sep 17 00:00:00 2001 From: Dave Martin Date: Tue, 16 Nov 2010 13:13:37 +0000 Subject: ARM: mxc: Correct data alignment in headsmp.S for CONFIG_THUMB2_KERNEL MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Directives such as .long and .word do not magically cause the assembler location counter to become aligned in gas. As a result, using these directives in code sections can result in misaligned data words when building a Thumb-2 kernel (CONFIG_THUMB2_KERNEL). This is a Bad Thing, since the ABI permits the compiler to assume that fundamental types of word size or above are word- aligned when accessing them from C. If the data is not really word-aligned, this can cause impaired performance and stray alignment faults in some circumstances. In general, the following rules should be applied when using data word declaration directives inside code sections: * .quad and .double: .align 3 * .long, .word, .single, .float: .align (or .align 2) * .short: No explicit alignment required, since Thumb-2 instructions are always 2 or 4 bytes in size. immediately after an instruction. Signed-off-by: Dave Martin Signed-off-by: Sascha Hauer LAKML-Reference: 1289913217-8672-1-git-send-email-dave.martin@linaro.org Signed-off-by: Uwe Kleine-König Signed-off-by: Sascha Hauer --- arch/arm/plat-mxc/ssi-fiq.S | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm/plat-mxc/ssi-fiq.S b/arch/arm/plat-mxc/ssi-fiq.S index 4ddce56..8397a2d 100644 --- a/arch/arm/plat-mxc/ssi-fiq.S +++ b/arch/arm/plat-mxc/ssi-fiq.S @@ -124,6 +124,8 @@ imx_ssi_fiq_start: 1: @ return from FIQ subs pc, lr, #4 + + .align imx_ssi_fiq_base: .word 0x0 imx_ssi_fiq_rx_buffer: -- cgit v1.1 From b5eee2fdefc89df312034e5e0e6f5dd55c4e9bae Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 Apr 2011 14:29:58 +0200 Subject: ARM: mxc: Add missing lockdep annotation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The irq_set_wake() function of the gpio irq_chip calls enable/disable_irq_wake() on the demultiplex interrupt. That leads to a lockdep warning "INFO: possible recursive locking detected" because irq_set_type() is called under irq_desc->lock and the *_irq_wake() calls take irq_desc->lock of the demux interrupt. Tell lockdep that the gpio irqs are in a different lock class. Documentation/SubmitChecklist: 15: All codepaths have been exercised with all lockdep features enabled. That's a non-optional requirement, AFAICT. Reported-and-tested-by: Arnaud Patard Signed-off-by: Thomas Gleixner LAKML-Reference: alpine.LFD.2.00.1104041416290.19945@localhost6.localdomain6 Signed-off-by: Uwe Kleine-König Signed-off-by: Sascha Hauer --- arch/arm/plat-mxc/gpio.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/arm/plat-mxc/gpio.c b/arch/arm/plat-mxc/gpio.c index 7a10724..6cd6d7f 100644 --- a/arch/arm/plat-mxc/gpio.c +++ b/arch/arm/plat-mxc/gpio.c @@ -295,6 +295,12 @@ static int mxc_gpio_direction_output(struct gpio_chip *chip, return 0; } +/* + * This lock class tells lockdep that GPIO irqs are in a different + * category than their parents, so it won't report false recursion. + */ +static struct lock_class_key gpio_lock_class; + int __init mxc_gpio_init(struct mxc_gpio_port *port, int cnt) { int i, j; @@ -311,6 +317,7 @@ int __init mxc_gpio_init(struct mxc_gpio_port *port, int cnt) __raw_writel(~0, port[i].base + GPIO_ISR); for (j = port[i].virtual_irq_start; j < port[i].virtual_irq_start + 32; j++) { + irq_set_lockdep_class(j, &gpio_lock_class); irq_set_chip_and_handler(j, &gpio_irq_chip, handle_level_irq); set_irq_flags(j, IRQF_VALID); -- cgit v1.1 From 184748cc50b2dceb8287f9fb657eda48ff8fcfe7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Apr 2011 17:23:39 +0200 Subject: sched: Provide scheduler_ipi() callback in response to smp_send_reschedule() For future rework of try_to_wake_up() we'd like to push part of that function onto the CPU the task is actually going to run on. In order to do so we need a generic callback from the existing scheduler IPI. This patch introduces such a generic callback: scheduler_ipi() and implements it as a NOP. BenH notes: PowerPC might use this IPI on offline CPUs under rare conditions! Acked-by: Russell King Acked-by: Martin Schwidefsky Acked-by: Chris Metcalf Acked-by: Jesper Nilsson Acked-by: Benjamin Herrenschmidt Signed-off-by: Ralf Baechle Reviewed-by: Frank Rowand Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110405152728.744338123@chello.nl --- arch/alpha/kernel/smp.c | 3 +-- arch/arm/kernel/smp.c | 5 +---- arch/blackfin/mach-common/smp.c | 3 +++ arch/cris/arch-v32/kernel/smp.c | 13 ++++++++----- arch/ia64/kernel/irq_ia64.c | 2 ++ arch/ia64/xen/irq_xen.c | 10 +++++++++- arch/m32r/kernel/smp.c | 4 +--- arch/mips/cavium-octeon/smp.c | 2 ++ arch/mips/kernel/smtc.c | 2 +- arch/mips/mti-malta/malta-int.c | 2 ++ arch/mips/pmc-sierra/yosemite/smp.c | 4 ++++ arch/mips/sgi-ip27/ip27-irq.c | 2 ++ arch/mips/sibyte/bcm1480/smp.c | 7 +++---- arch/mips/sibyte/sb1250/smp.c | 7 +++---- arch/mn10300/kernel/smp.c | 5 +---- arch/parisc/kernel/smp.c | 5 +---- arch/powerpc/kernel/smp.c | 4 ++-- arch/s390/kernel/smp.c | 6 +++--- arch/sh/kernel/smp.c | 2 ++ arch/sparc/kernel/smp_32.c | 4 +++- arch/sparc/kernel/smp_64.c | 1 + arch/tile/kernel/smp.c | 6 +----- arch/um/kernel/smp.c | 2 +- arch/x86/kernel/smp.c | 5 ++--- arch/x86/xen/smp.c | 5 ++--- include/linux/sched.h | 2 ++ 26 files changed, 63 insertions(+), 50 deletions(-) diff --git a/arch/alpha/kernel/smp.c b/arch/alpha/kernel/smp.c index 42aa078..5a621c6 100644 --- a/arch/alpha/kernel/smp.c +++ b/arch/alpha/kernel/smp.c @@ -585,8 +585,7 @@ handle_ipi(struct pt_regs *regs) switch (which) { case IPI_RESCHEDULE: - /* Reschedule callback. Everything to be done - is done by the interrupt return path. */ + scheduler_ipi(); break; case IPI_CALL_FUNC: diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c index 8fe05ad..7a561eb 100644 --- a/arch/arm/kernel/smp.c +++ b/arch/arm/kernel/smp.c @@ -560,10 +560,7 @@ asmlinkage void __exception_irq_entry do_IPI(int ipinr, struct pt_regs *regs) break; case IPI_RESCHEDULE: - /* - * nothing more to do - eveything is - * done on the interrupt return path - */ + scheduler_ipi(); break; case IPI_CALL_FUNC: diff --git a/arch/blackfin/mach-common/smp.c b/arch/blackfin/mach-common/smp.c index 6e17a26..326bb86 100644 --- a/arch/blackfin/mach-common/smp.c +++ b/arch/blackfin/mach-common/smp.c @@ -164,6 +164,9 @@ static irqreturn_t ipi_handler_int1(int irq, void *dev_instance) while (msg_queue->count) { msg = &msg_queue->ipi_message[msg_queue->head]; switch (msg->type) { + case BFIN_IPI_RESCHEDULE: + scheduler_ipi(); + break; case BFIN_IPI_CALL_FUNC: spin_unlock_irqrestore(&msg_queue->lock, flags); ipi_call_function(cpu, msg); diff --git a/arch/cris/arch-v32/kernel/smp.c b/arch/cris/arch-v32/kernel/smp.c index 4c9e3e1..66cc756 100644 --- a/arch/cris/arch-v32/kernel/smp.c +++ b/arch/cris/arch-v32/kernel/smp.c @@ -342,15 +342,18 @@ irqreturn_t crisv32_ipi_interrupt(int irq, void *dev_id) ipi = REG_RD(intr_vect, irq_regs[smp_processor_id()], rw_ipi); + if (ipi.vector & IPI_SCHEDULE) { + scheduler_ipi(); + } if (ipi.vector & IPI_CALL) { - func(info); + func(info); } if (ipi.vector & IPI_FLUSH_TLB) { - if (flush_mm == FLUSH_ALL) - __flush_tlb_all(); - else if (flush_vma == FLUSH_ALL) + if (flush_mm == FLUSH_ALL) + __flush_tlb_all(); + else if (flush_vma == FLUSH_ALL) __flush_tlb_mm(flush_mm); - else + else __flush_tlb_page(flush_vma, flush_addr); } diff --git a/arch/ia64/kernel/irq_ia64.c b/arch/ia64/kernel/irq_ia64.c index 5b70474..782c3a35 100644 --- a/arch/ia64/kernel/irq_ia64.c +++ b/arch/ia64/kernel/irq_ia64.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include @@ -496,6 +497,7 @@ ia64_handle_irq (ia64_vector vector, struct pt_regs *regs) smp_local_flush_tlb(); kstat_incr_irqs_this_cpu(irq, desc); } else if (unlikely(IS_RESCHEDULE(vector))) { + scheduler_ipi(); kstat_incr_irqs_this_cpu(irq, desc); } else { ia64_setreg(_IA64_REG_CR_TPR, vector); diff --git a/arch/ia64/xen/irq_xen.c b/arch/ia64/xen/irq_xen.c index 108bb85..b279e14 100644 --- a/arch/ia64/xen/irq_xen.c +++ b/arch/ia64/xen/irq_xen.c @@ -92,6 +92,8 @@ static unsigned short saved_irq_cnt; static int xen_slab_ready; #ifdef CONFIG_SMP +#include + /* Dummy stub. Though we may check XEN_RESCHEDULE_VECTOR before __do_IRQ, * it ends up to issue several memory accesses upon percpu data and * thus adds unnecessary traffic to other paths. @@ -99,7 +101,13 @@ static int xen_slab_ready; static irqreturn_t xen_dummy_handler(int irq, void *dev_id) { + return IRQ_HANDLED; +} +static irqreturn_t +xen_resched_handler(int irq, void *dev_id) +{ + scheduler_ipi(); return IRQ_HANDLED; } @@ -110,7 +118,7 @@ static struct irqaction xen_ipi_irqaction = { }; static struct irqaction xen_resched_irqaction = { - .handler = xen_dummy_handler, + .handler = xen_resched_handler, .flags = IRQF_DISABLED, .name = "resched" }; diff --git a/arch/m32r/kernel/smp.c b/arch/m32r/kernel/smp.c index 31cef20..fc10b39 100644 --- a/arch/m32r/kernel/smp.c +++ b/arch/m32r/kernel/smp.c @@ -122,8 +122,6 @@ void smp_send_reschedule(int cpu_id) * * Description: This routine executes on CPU which received * 'RESCHEDULE_IPI'. - * Rescheduling is processed at the exit of interrupt - * operation. * * Born on Date: 2002.02.05 * @@ -138,7 +136,7 @@ void smp_send_reschedule(int cpu_id) *==========================================================================*/ void smp_reschedule_interrupt(void) { - /* nothing to do */ + scheduler_ipi(); } /*==========================================================================* diff --git a/arch/mips/cavium-octeon/smp.c b/arch/mips/cavium-octeon/smp.c index ba78b21..76923ee 100644 --- a/arch/mips/cavium-octeon/smp.c +++ b/arch/mips/cavium-octeon/smp.c @@ -44,6 +44,8 @@ static irqreturn_t mailbox_interrupt(int irq, void *dev_id) if (action & SMP_CALL_FUNCTION) smp_call_function_interrupt(); + if (action & SMP_RESCHEDULE_YOURSELF) + scheduler_ipi(); /* Check if we've been told to flush the icache */ if (action & SMP_ICACHE_FLUSH) diff --git a/arch/mips/kernel/smtc.c b/arch/mips/kernel/smtc.c index 5a88cc4..cedac46 100644 --- a/arch/mips/kernel/smtc.c +++ b/arch/mips/kernel/smtc.c @@ -929,7 +929,7 @@ static void post_direct_ipi(int cpu, struct smtc_ipi *pipi) static void ipi_resched_interrupt(void) { - /* Return from interrupt should be enough to cause scheduler check */ + scheduler_ipi(); } static void ipi_call_interrupt(void) diff --git a/arch/mips/mti-malta/malta-int.c b/arch/mips/mti-malta/malta-int.c index 9027061..7d93e6fb 100644 --- a/arch/mips/mti-malta/malta-int.c +++ b/arch/mips/mti-malta/malta-int.c @@ -309,6 +309,8 @@ static void ipi_call_dispatch(void) static irqreturn_t ipi_resched_interrupt(int irq, void *dev_id) { + scheduler_ipi(); + return IRQ_HANDLED; } diff --git a/arch/mips/pmc-sierra/yosemite/smp.c b/arch/mips/pmc-sierra/yosemite/smp.c index efc9e88..2608752 100644 --- a/arch/mips/pmc-sierra/yosemite/smp.c +++ b/arch/mips/pmc-sierra/yosemite/smp.c @@ -55,6 +55,8 @@ void titan_mailbox_irq(void) if (status & 0x2) smp_call_function_interrupt(); + if (status & 0x4) + scheduler_ipi(); break; case 1: @@ -63,6 +65,8 @@ void titan_mailbox_irq(void) if (status & 0x2) smp_call_function_interrupt(); + if (status & 0x4) + scheduler_ipi(); break; } } diff --git a/arch/mips/sgi-ip27/ip27-irq.c b/arch/mips/sgi-ip27/ip27-irq.c index 0a04603..b18b04e 100644 --- a/arch/mips/sgi-ip27/ip27-irq.c +++ b/arch/mips/sgi-ip27/ip27-irq.c @@ -147,8 +147,10 @@ static void ip27_do_irq_mask0(void) #ifdef CONFIG_SMP if (pend0 & (1UL << CPU_RESCHED_A_IRQ)) { LOCAL_HUB_CLR_INTR(CPU_RESCHED_A_IRQ); + scheduler_ipi(); } else if (pend0 & (1UL << CPU_RESCHED_B_IRQ)) { LOCAL_HUB_CLR_INTR(CPU_RESCHED_B_IRQ); + scheduler_ipi(); } else if (pend0 & (1UL << CPU_CALL_A_IRQ)) { LOCAL_HUB_CLR_INTR(CPU_CALL_A_IRQ); smp_call_function_interrupt(); diff --git a/arch/mips/sibyte/bcm1480/smp.c b/arch/mips/sibyte/bcm1480/smp.c index 47b347c..d667875 100644 --- a/arch/mips/sibyte/bcm1480/smp.c +++ b/arch/mips/sibyte/bcm1480/smp.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -189,10 +190,8 @@ void bcm1480_mailbox_interrupt(void) /* Clear the mailbox to clear the interrupt */ __raw_writeq(((u64)action)<<48, mailbox_0_clear_regs[cpu]); - /* - * Nothing to do for SMP_RESCHEDULE_YOURSELF; returning from the - * interrupt will do the reschedule for us - */ + if (action & SMP_RESCHEDULE_YOURSELF) + scheduler_ipi(); if (action & SMP_CALL_FUNCTION) smp_call_function_interrupt(); diff --git a/arch/mips/sibyte/sb1250/smp.c b/arch/mips/sibyte/sb1250/smp.c index c00a5cb..38e7f6b 100644 --- a/arch/mips/sibyte/sb1250/smp.c +++ b/arch/mips/sibyte/sb1250/smp.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -177,10 +178,8 @@ void sb1250_mailbox_interrupt(void) /* Clear the mailbox to clear the interrupt */ ____raw_writeq(((u64)action) << 48, mailbox_clear_regs[cpu]); - /* - * Nothing to do for SMP_RESCHEDULE_YOURSELF; returning from the - * interrupt will do the reschedule for us - */ + if (action & SMP_RESCHEDULE_YOURSELF) + scheduler_ipi(); if (action & SMP_CALL_FUNCTION) smp_call_function_interrupt(); diff --git a/arch/mn10300/kernel/smp.c b/arch/mn10300/kernel/smp.c index 226c826..83fb279 100644 --- a/arch/mn10300/kernel/smp.c +++ b/arch/mn10300/kernel/smp.c @@ -494,14 +494,11 @@ void smp_send_stop(void) * @irq: The interrupt number. * @dev_id: The device ID. * - * We need do nothing here, since the scheduling will be effected on our way - * back through entry.S. - * * Returns IRQ_HANDLED to indicate we handled the interrupt successfully. */ static irqreturn_t smp_reschedule_interrupt(int irq, void *dev_id) { - /* do nothing */ + scheduler_ipi(); return IRQ_HANDLED; } diff --git a/arch/parisc/kernel/smp.c b/arch/parisc/kernel/smp.c index 69d63d3..828305f 100644 --- a/arch/parisc/kernel/smp.c +++ b/arch/parisc/kernel/smp.c @@ -155,10 +155,7 @@ ipi_interrupt(int irq, void *dev_id) case IPI_RESCHEDULE: smp_debug(100, KERN_DEBUG "CPU%d IPI_RESCHEDULE\n", this_cpu); - /* - * Reschedule callback. Everything to be - * done is done by the interrupt return path. - */ + scheduler_ipi(); break; case IPI_CALL_FUNC: diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index cbdbb14..9f9c204 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -116,7 +116,7 @@ void smp_message_recv(int msg) generic_smp_call_function_interrupt(); break; case PPC_MSG_RESCHEDULE: - /* we notice need_resched on exit */ + scheduler_ipi(); break; case PPC_MSG_CALL_FUNC_SINGLE: generic_smp_call_function_single_interrupt(); @@ -146,7 +146,7 @@ static irqreturn_t call_function_action(int irq, void *data) static irqreturn_t reschedule_action(int irq, void *data) { - /* we just need the return path side effect of checking need_resched */ + scheduler_ipi(); return IRQ_HANDLED; } diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index 63a97db..63c7d9f 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -165,12 +165,12 @@ static void do_ext_call_interrupt(unsigned int ext_int_code, kstat_cpu(smp_processor_id()).irqs[EXTINT_IPI]++; /* * handle bit signal external calls - * - * For the ec_schedule signal we have to do nothing. All the work - * is done automatically when we return from the interrupt. */ bits = xchg(&S390_lowcore.ext_call_fast, 0); + if (test_bit(ec_schedule, &bits)) + scheduler_ipi(); + if (test_bit(ec_call_function, &bits)) generic_smp_call_function_interrupt(); diff --git a/arch/sh/kernel/smp.c b/arch/sh/kernel/smp.c index 509b36b..6207561 100644 --- a/arch/sh/kernel/smp.c +++ b/arch/sh/kernel/smp.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -323,6 +324,7 @@ void smp_message_recv(unsigned int msg) generic_smp_call_function_interrupt(); break; case SMP_MSG_RESCHEDULE: + scheduler_ipi(); break; case SMP_MSG_FUNCTION_SINGLE: generic_smp_call_function_single_interrupt(); diff --git a/arch/sparc/kernel/smp_32.c b/arch/sparc/kernel/smp_32.c index 91c10fb..f95690c 100644 --- a/arch/sparc/kernel/smp_32.c +++ b/arch/sparc/kernel/smp_32.c @@ -125,7 +125,9 @@ struct linux_prom_registers smp_penguin_ctable __cpuinitdata = { 0 }; void smp_send_reschedule(int cpu) { - /* See sparc64 */ + /* + * XXX missing reschedule IPI, see scheduler_ipi() + */ } void smp_send_stop(void) diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c index 3e94a8c..9478da7 100644 --- a/arch/sparc/kernel/smp_64.c +++ b/arch/sparc/kernel/smp_64.c @@ -1368,6 +1368,7 @@ void smp_send_reschedule(int cpu) void __irq_entry smp_receive_signal_client(int irq, struct pt_regs *regs) { clear_softint(1 << irq); + scheduler_ipi(); } /* This is a nop because we capture all other cpus diff --git a/arch/tile/kernel/smp.c b/arch/tile/kernel/smp.c index a429310..c52224d 100644 --- a/arch/tile/kernel/smp.c +++ b/arch/tile/kernel/smp.c @@ -189,12 +189,8 @@ void flush_icache_range(unsigned long start, unsigned long end) /* Called when smp_send_reschedule() triggers IRQ_RESCHEDULE. */ static irqreturn_t handle_reschedule_ipi(int irq, void *token) { - /* - * Nothing to do here; when we return from interrupt, the - * rescheduling will occur there. But do bump the interrupt - * profiler count in the meantime. - */ __get_cpu_var(irq_stat).irq_resched_count++; + scheduler_ipi(); return IRQ_HANDLED; } diff --git a/arch/um/kernel/smp.c b/arch/um/kernel/smp.c index 106bf27..eefb107 100644 --- a/arch/um/kernel/smp.c +++ b/arch/um/kernel/smp.c @@ -173,7 +173,7 @@ void IPI_handler(int cpu) break; case 'R': - set_tsk_need_resched(current); + scheduler_ipi(); break; case 'S': diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 513deac..013e7eb 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -194,14 +194,13 @@ static void native_stop_other_cpus(int wait) } /* - * Reschedule call back. Nothing to do, - * all the work is done automatically when - * we return from the interrupt. + * Reschedule call back. */ void smp_reschedule_interrupt(struct pt_regs *regs) { ack_APIC_irq(); inc_irq_stat(irq_resched_count); + scheduler_ipi(); /* * KVM uses this interrupt to force a cpu out of guest mode */ diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 3061244..762b46a 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -46,13 +46,12 @@ static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id); static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id); /* - * Reschedule call back. Nothing to do, - * all the work is done automatically when - * we return from the interrupt. + * Reschedule call back. */ static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id) { inc_irq_stat(irq_resched_count); + scheduler_ipi(); return IRQ_HANDLED; } diff --git a/include/linux/sched.h b/include/linux/sched.h index 4ec2c02..758e27a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2189,8 +2189,10 @@ extern void set_task_comm(struct task_struct *tsk, char *from); extern char *get_task_comm(char *to, struct task_struct *tsk); #ifdef CONFIG_SMP +static inline void scheduler_ipi(void) { } extern unsigned long wait_task_inactive(struct task_struct *, long match_state); #else +static inline void scheduler_ipi(void) { } static inline unsigned long wait_task_inactive(struct task_struct *p, long match_state) { -- cgit v1.1 From 3ca7a440da394808571dad32d33d3bc0389982e6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Apr 2011 17:23:40 +0200 Subject: sched: Always provide p->on_cpu Always provide p->on_cpu so that we can determine if its on a cpu without having to lock the rq. Reviewed-by: Frank Rowand Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20110405152728.785452014@chello.nl Signed-off-by: Ingo Molnar --- include/linux/sched.h | 4 +--- kernel/sched.c | 46 +++++++++++++++++++++++++++++----------------- 2 files changed, 30 insertions(+), 20 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 758e27a..3435837 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1200,9 +1200,7 @@ struct task_struct { int lock_depth; /* BKL lock depth */ #ifdef CONFIG_SMP -#ifdef __ARCH_WANT_UNLOCKED_CTXSW - int oncpu; -#endif + int on_cpu; #endif int prio, static_prio, normal_prio; diff --git a/kernel/sched.c b/kernel/sched.c index a187c3f..cd2593e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -838,18 +838,39 @@ static inline int task_current(struct rq *rq, struct task_struct *p) return rq->curr == p; } -#ifndef __ARCH_WANT_UNLOCKED_CTXSW static inline int task_running(struct rq *rq, struct task_struct *p) { +#ifdef CONFIG_SMP + return p->on_cpu; +#else return task_current(rq, p); +#endif } +#ifndef __ARCH_WANT_UNLOCKED_CTXSW static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) { +#ifdef CONFIG_SMP + /* + * We can optimise this out completely for !SMP, because the + * SMP rebalancing from interrupt is the only thing that cares + * here. + */ + next->on_cpu = 1; +#endif } static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) { +#ifdef CONFIG_SMP + /* + * After ->on_cpu is cleared, the task can be moved to a different CPU. + * We must ensure this doesn't happen until the switch is completely + * finished. + */ + smp_wmb(); + prev->on_cpu = 0; +#endif #ifdef CONFIG_DEBUG_SPINLOCK /* this is a valid case when another task releases the spinlock */ rq->lock.owner = current; @@ -865,15 +886,6 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) } #else /* __ARCH_WANT_UNLOCKED_CTXSW */ -static inline int task_running(struct rq *rq, struct task_struct *p) -{ -#ifdef CONFIG_SMP - return p->oncpu; -#else - return task_current(rq, p); -#endif -} - static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) { #ifdef CONFIG_SMP @@ -882,7 +894,7 @@ static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) * SMP rebalancing from interrupt is the only thing that cares * here. */ - next->oncpu = 1; + next->on_cpu = 1; #endif #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW raw_spin_unlock_irq(&rq->lock); @@ -895,12 +907,12 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) { #ifdef CONFIG_SMP /* - * After ->oncpu is cleared, the task can be moved to a different CPU. + * After ->on_cpu is cleared, the task can be moved to a different CPU. * We must ensure this doesn't happen until the switch is completely * finished. */ smp_wmb(); - prev->oncpu = 0; + prev->on_cpu = 0; #endif #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW local_irq_enable(); @@ -2686,8 +2698,8 @@ void sched_fork(struct task_struct *p, int clone_flags) if (likely(sched_info_on())) memset(&p->sched_info, 0, sizeof(p->sched_info)); #endif -#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) - p->oncpu = 0; +#if defined(CONFIG_SMP) + p->on_cpu = 0; #endif #ifdef CONFIG_PREEMPT /* Want to start with kernel preemption disabled. */ @@ -5776,8 +5788,8 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) rcu_read_unlock(); rq->curr = rq->idle = idle; -#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) - idle->oncpu = 1; +#if defined(CONFIG_SMP) + idle->on_cpu = 1; #endif raw_spin_unlock_irqrestore(&rq->lock, flags); -- cgit v1.1 From c6eb3dda25892f1f974f5420f63e6721aab02f6f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Apr 2011 17:23:41 +0200 Subject: mutex: Use p->on_cpu for the adaptive spin Since we now have p->on_cpu unconditionally available, use it to re-implement mutex_spin_on_owner. Requested-by: Thomas Gleixner Reviewed-by: Frank Rowand Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110405152728.826338173@chello.nl --- include/linux/mutex.h | 2 +- include/linux/sched.h | 2 +- kernel/mutex-debug.c | 2 +- kernel/mutex-debug.h | 2 +- kernel/mutex.c | 2 +- kernel/mutex.h | 2 +- kernel/sched.c | 83 ++++++++++++++++++++------------------------------- 7 files changed, 39 insertions(+), 56 deletions(-) diff --git a/include/linux/mutex.h b/include/linux/mutex.h index 94b48bd..c75471d 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -51,7 +51,7 @@ struct mutex { spinlock_t wait_lock; struct list_head wait_list; #if defined(CONFIG_DEBUG_MUTEXES) || defined(CONFIG_SMP) - struct thread_info *owner; + struct task_struct *owner; #endif #ifdef CONFIG_DEBUG_MUTEXES const char *name; diff --git a/include/linux/sched.h b/include/linux/sched.h index 3435837..1738504 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -360,7 +360,7 @@ extern signed long schedule_timeout_interruptible(signed long timeout); extern signed long schedule_timeout_killable(signed long timeout); extern signed long schedule_timeout_uninterruptible(signed long timeout); asmlinkage void schedule(void); -extern int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner); +extern int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner); struct nsproxy; struct user_namespace; diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c index ec815a9..73da83a 100644 --- a/kernel/mutex-debug.c +++ b/kernel/mutex-debug.c @@ -75,7 +75,7 @@ void debug_mutex_unlock(struct mutex *lock) return; DEBUG_LOCKS_WARN_ON(lock->magic != lock); - DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info()); + DEBUG_LOCKS_WARN_ON(lock->owner != current); DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); mutex_clear_owner(lock); } diff --git a/kernel/mutex-debug.h b/kernel/mutex-debug.h index 57d527a..0799fd3 100644 --- a/kernel/mutex-debug.h +++ b/kernel/mutex-debug.h @@ -29,7 +29,7 @@ extern void debug_mutex_init(struct mutex *lock, const char *name, static inline void mutex_set_owner(struct mutex *lock) { - lock->owner = current_thread_info(); + lock->owner = current; } static inline void mutex_clear_owner(struct mutex *lock) diff --git a/kernel/mutex.c b/kernel/mutex.c index c4195fa..fe4706c 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -160,7 +160,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, */ for (;;) { - struct thread_info *owner; + struct task_struct *owner; /* * If we own the BKL, then don't spin. The owner of diff --git a/kernel/mutex.h b/kernel/mutex.h index 67578ca..4115fbf 100644 --- a/kernel/mutex.h +++ b/kernel/mutex.h @@ -19,7 +19,7 @@ #ifdef CONFIG_SMP static inline void mutex_set_owner(struct mutex *lock) { - lock->owner = current_thread_info(); + lock->owner = current; } static inline void mutex_clear_owner(struct mutex *lock) diff --git a/kernel/sched.c b/kernel/sched.c index cd2593e..55cc503 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4173,70 +4173,53 @@ need_resched: EXPORT_SYMBOL(schedule); #ifdef CONFIG_MUTEX_SPIN_ON_OWNER -/* - * Look out! "owner" is an entirely speculative pointer - * access and not reliable. - */ -int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) -{ - unsigned int cpu; - struct rq *rq; - if (!sched_feat(OWNER_SPIN)) - return 0; +static inline bool owner_running(struct mutex *lock, struct task_struct *owner) +{ + bool ret = false; -#ifdef CONFIG_DEBUG_PAGEALLOC - /* - * Need to access the cpu field knowing that - * DEBUG_PAGEALLOC could have unmapped it if - * the mutex owner just released it and exited. - */ - if (probe_kernel_address(&owner->cpu, cpu)) - return 0; -#else - cpu = owner->cpu; -#endif + rcu_read_lock(); + if (lock->owner != owner) + goto fail; /* - * Even if the access succeeded (likely case), - * the cpu field may no longer be valid. + * Ensure we emit the owner->on_cpu, dereference _after_ checking + * lock->owner still matches owner, if that fails, owner might + * point to free()d memory, if it still matches, the rcu_read_lock() + * ensures the memory stays valid. */ - if (cpu >= nr_cpumask_bits) - return 0; + barrier(); - /* - * We need to validate that we can do a - * get_cpu() and that we have the percpu area. - */ - if (!cpu_online(cpu)) - return 0; + ret = owner->on_cpu; +fail: + rcu_read_unlock(); - rq = cpu_rq(cpu); + return ret; +} - for (;;) { - /* - * Owner changed, break to re-assess state. - */ - if (lock->owner != owner) { - /* - * If the lock has switched to a different owner, - * we likely have heavy contention. Return 0 to quit - * optimistic spinning and not contend further: - */ - if (lock->owner) - return 0; - break; - } +/* + * Look out! "owner" is an entirely speculative pointer + * access and not reliable. + */ +int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) +{ + if (!sched_feat(OWNER_SPIN)) + return 0; - /* - * Is that owner really running on that cpu? - */ - if (task_thread_info(rq->curr) != owner || need_resched()) + while (owner_running(lock, owner)) { + if (need_resched()) return 0; arch_mutex_cpu_relax(); } + /* + * If the owner changed to another task there is likely + * heavy contention, stop spinning. + */ + if (lock->owner) + return 0; + return 1; } #endif -- cgit v1.1 From c2f7115e2e52a6c187b8c1f54f0e4970bb677be0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 13 Apr 2011 13:28:56 +0200 Subject: sched: Move wq_worker_waking to the correct site wq_worker_waking_up() needs to match wq_worker_sleeping(), since the latter is only called on deactivate, move the former near activate. Signed-off-by: Peter Zijlstra Cc: Tejun Heo Link: http://lkml.kernel.org/n/top-t3m7n70n9frmv4pv2n5fwmov@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 55cc503..81ab58e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2421,6 +2421,10 @@ static inline void ttwu_activate(struct task_struct *p, struct rq *rq, schedstat_inc(p, se.statistics.nr_wakeups_remote); activate_task(rq, p, en_flags); + + /* if a worker is waking up, notify workqueue */ + if (p->flags & PF_WQ_WORKER) + wq_worker_waking_up(p, cpu_of(rq)); } static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq, @@ -2445,9 +2449,6 @@ static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq, rq->idle_stamp = 0; } #endif - /* if a worker is waking up, notify workqueue */ - if ((p->flags & PF_WQ_WORKER) && success) - wq_worker_waking_up(p, cpu_of(rq)); } /** -- cgit v1.1 From 893633817f5b58f5227365d74344e0170a718213 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Apr 2011 17:23:42 +0200 Subject: sched: Change the ttwu() success details try_to_wake_up() would only return a success when it would have to place a task on a rq, change that to every time we change p->state to TASK_RUNNING, because that's the real measure of wakeups. This results in that success is always true for the tracepoints. Reviewed-by: Frank Rowand Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110405152728.866866929@chello.nl --- kernel/sched.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 81ab58e..3919aa4 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2427,10 +2427,10 @@ static inline void ttwu_activate(struct task_struct *p, struct rq *rq, wq_worker_waking_up(p, cpu_of(rq)); } -static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq, - int wake_flags, bool success) +static void +ttwu_post_activation(struct task_struct *p, struct rq *rq, int wake_flags) { - trace_sched_wakeup(p, success); + trace_sched_wakeup(p, true); check_preempt_curr(rq, p, wake_flags); p->state = TASK_RUNNING; @@ -2546,9 +2546,9 @@ out_activate: #endif /* CONFIG_SMP */ ttwu_activate(p, rq, wake_flags & WF_SYNC, orig_cpu != cpu, cpu == this_cpu, en_flags); - success = 1; out_running: - ttwu_post_activation(p, rq, wake_flags, success); + ttwu_post_activation(p, rq, wake_flags); + success = 1; out: task_rq_unlock(rq, &flags); put_cpu(); @@ -2567,7 +2567,6 @@ out: static void try_to_wake_up_local(struct task_struct *p) { struct rq *rq = task_rq(p); - bool success = false; BUG_ON(rq != this_rq()); BUG_ON(p == current); @@ -2582,9 +2581,8 @@ static void try_to_wake_up_local(struct task_struct *p) schedstat_inc(rq, ttwu_local); } ttwu_activate(p, rq, false, false, true, ENQUEUE_WAKEUP); - success = true; } - ttwu_post_activation(p, rq, 0, success); + ttwu_post_activation(p, rq, 0); } /** @@ -2747,7 +2745,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) rq = task_rq_lock(p, &flags); activate_task(rq, p, 0); - trace_sched_wakeup_new(p, 1); + trace_sched_wakeup_new(p, true); check_preempt_curr(rq, p, WF_FORK); #ifdef CONFIG_SMP if (p->sched_class->task_woken) -- cgit v1.1 From d7c01d27ab767a30d672d1fd657aa8336ebdcbca Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Apr 2011 17:23:43 +0200 Subject: sched: Clean up ttwu() stats Collect all ttwu() stat code into a single function and ensure its always called for an actual wakeup (changing p->state to TASK_RUNNING). Reviewed-by: Frank Rowand Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110405152728.908177058@chello.nl --- kernel/sched.c | 75 +++++++++++++++++++++++++++++++--------------------------- 1 file changed, 40 insertions(+), 35 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 3919aa4..4481638 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2406,20 +2406,43 @@ static void update_avg(u64 *avg, u64 sample) } #endif -static inline void ttwu_activate(struct task_struct *p, struct rq *rq, - bool is_sync, bool is_migrate, bool is_local, - unsigned long en_flags) +static void +ttwu_stat(struct rq *rq, struct task_struct *p, int cpu, int wake_flags) { +#ifdef CONFIG_SCHEDSTATS +#ifdef CONFIG_SMP + int this_cpu = smp_processor_id(); + + if (cpu == this_cpu) { + schedstat_inc(rq, ttwu_local); + schedstat_inc(p, se.statistics.nr_wakeups_local); + } else { + struct sched_domain *sd; + + schedstat_inc(p, se.statistics.nr_wakeups_remote); + for_each_domain(this_cpu, sd) { + if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { + schedstat_inc(sd, ttwu_wake_remote); + break; + } + } + } +#endif /* CONFIG_SMP */ + + schedstat_inc(rq, ttwu_count); schedstat_inc(p, se.statistics.nr_wakeups); - if (is_sync) + + if (wake_flags & WF_SYNC) schedstat_inc(p, se.statistics.nr_wakeups_sync); - if (is_migrate) + + if (cpu != task_cpu(p)) schedstat_inc(p, se.statistics.nr_wakeups_migrate); - if (is_local) - schedstat_inc(p, se.statistics.nr_wakeups_local); - else - schedstat_inc(p, se.statistics.nr_wakeups_remote); +#endif /* CONFIG_SCHEDSTATS */ +} + +static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) +{ activate_task(rq, p, en_flags); /* if a worker is waking up, notify workqueue */ @@ -2481,12 +2504,12 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, if (!(p->state & state)) goto out; + cpu = task_cpu(p); + if (p->se.on_rq) goto out_running; - cpu = task_cpu(p); orig_cpu = cpu; - #ifdef CONFIG_SMP if (unlikely(task_running(rq, p))) goto out_activate; @@ -2527,27 +2550,12 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, WARN_ON(task_cpu(p) != cpu); WARN_ON(p->state != TASK_WAKING); -#ifdef CONFIG_SCHEDSTATS - schedstat_inc(rq, ttwu_count); - if (cpu == this_cpu) - schedstat_inc(rq, ttwu_local); - else { - struct sched_domain *sd; - for_each_domain(this_cpu, sd) { - if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { - schedstat_inc(sd, ttwu_wake_remote); - break; - } - } - } -#endif /* CONFIG_SCHEDSTATS */ - out_activate: #endif /* CONFIG_SMP */ - ttwu_activate(p, rq, wake_flags & WF_SYNC, orig_cpu != cpu, - cpu == this_cpu, en_flags); + ttwu_activate(rq, p, en_flags); out_running: ttwu_post_activation(p, rq, wake_flags); + ttwu_stat(rq, p, cpu, wake_flags); success = 1; out: task_rq_unlock(rq, &flags); @@ -2575,14 +2583,11 @@ static void try_to_wake_up_local(struct task_struct *p) if (!(p->state & TASK_NORMAL)) return; - if (!p->se.on_rq) { - if (likely(!task_running(rq, p))) { - schedstat_inc(rq, ttwu_count); - schedstat_inc(rq, ttwu_local); - } - ttwu_activate(p, rq, false, false, true, ENQUEUE_WAKEUP); - } + if (!p->se.on_rq) + ttwu_activate(rq, p, ENQUEUE_WAKEUP); + ttwu_post_activation(p, rq, 0); + ttwu_stat(rq, p, smp_processor_id(), 0); } /** -- cgit v1.1 From fd2f4419b4cbe8fe90796df9617c355762afd6a4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Apr 2011 17:23:44 +0200 Subject: sched: Provide p->on_rq Provide a generic p->on_rq because the p->se.on_rq semantics are unfavourable for lockless wakeups but needed for sched_fair. In particular, p->on_rq is only cleared when we actually dequeue the task in schedule() and not on any random dequeue as done by things like __migrate_task() and __sched_setscheduler(). This also allows us to remove p->se usage from !sched_fair code. Reviewed-by: Frank Rowand Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110405152728.949545047@chello.nl --- include/linux/sched.h | 1 + kernel/sched.c | 38 ++++++++++++++++++++------------------ kernel/sched_debug.c | 2 +- kernel/sched_rt.c | 16 ++++++++-------- kernel/sched_stoptask.c | 2 +- 5 files changed, 31 insertions(+), 28 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 1738504..b33a700 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1202,6 +1202,7 @@ struct task_struct { #ifdef CONFIG_SMP int on_cpu; #endif + int on_rq; int prio, static_prio, normal_prio; unsigned int rt_priority; diff --git a/kernel/sched.c b/kernel/sched.c index 4481638..dece28e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1785,7 +1785,6 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) update_rq_clock(rq); sched_info_queued(p); p->sched_class->enqueue_task(rq, p, flags); - p->se.on_rq = 1; } static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) @@ -1793,7 +1792,6 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) update_rq_clock(rq); sched_info_dequeued(p); p->sched_class->dequeue_task(rq, p, flags); - p->se.on_rq = 0; } /* @@ -2128,7 +2126,7 @@ static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) * A queue event has occurred, and we're going to schedule. In * this case, we can save a useless back to back clock update. */ - if (rq->curr->se.on_rq && test_tsk_need_resched(rq->curr)) + if (rq->curr->on_rq && test_tsk_need_resched(rq->curr)) rq->skip_clock_update = 1; } @@ -2203,7 +2201,7 @@ static bool migrate_task(struct task_struct *p, struct rq *rq) * If the task is not on a runqueue (and not running), then * the next wake-up will properly place the task. */ - return p->se.on_rq || task_running(rq, p); + return p->on_rq || task_running(rq, p); } /* @@ -2263,7 +2261,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) rq = task_rq_lock(p, &flags); trace_sched_wait_task(p); running = task_running(rq, p); - on_rq = p->se.on_rq; + on_rq = p->on_rq; ncsw = 0; if (!match_state || p->state == match_state) ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ @@ -2444,6 +2442,7 @@ ttwu_stat(struct rq *rq, struct task_struct *p, int cpu, int wake_flags) static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) { activate_task(rq, p, en_flags); + p->on_rq = 1; /* if a worker is waking up, notify workqueue */ if (p->flags & PF_WQ_WORKER) @@ -2506,7 +2505,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, cpu = task_cpu(p); - if (p->se.on_rq) + if (p->on_rq) goto out_running; orig_cpu = cpu; @@ -2583,7 +2582,7 @@ static void try_to_wake_up_local(struct task_struct *p) if (!(p->state & TASK_NORMAL)) return; - if (!p->se.on_rq) + if (!p->on_rq) ttwu_activate(rq, p, ENQUEUE_WAKEUP); ttwu_post_activation(p, rq, 0); @@ -2620,19 +2619,21 @@ int wake_up_state(struct task_struct *p, unsigned int state) */ static void __sched_fork(struct task_struct *p) { + p->on_rq = 0; + + p->se.on_rq = 0; p->se.exec_start = 0; p->se.sum_exec_runtime = 0; p->se.prev_sum_exec_runtime = 0; p->se.nr_migrations = 0; p->se.vruntime = 0; + INIT_LIST_HEAD(&p->se.group_node); #ifdef CONFIG_SCHEDSTATS memset(&p->se.statistics, 0, sizeof(p->se.statistics)); #endif INIT_LIST_HEAD(&p->rt.run_list); - p->se.on_rq = 0; - INIT_LIST_HEAD(&p->se.group_node); #ifdef CONFIG_PREEMPT_NOTIFIERS INIT_HLIST_HEAD(&p->preempt_notifiers); @@ -2750,6 +2751,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) rq = task_rq_lock(p, &flags); activate_task(rq, p, 0); + p->on_rq = 1; trace_sched_wakeup_new(p, true); check_preempt_curr(rq, p, WF_FORK); #ifdef CONFIG_SMP @@ -4051,7 +4053,7 @@ static inline void schedule_debug(struct task_struct *prev) static void put_prev_task(struct rq *rq, struct task_struct *prev) { - if (prev->se.on_rq) + if (prev->on_rq) update_rq_clock(rq); prev->sched_class->put_prev_task(rq, prev); } @@ -4126,7 +4128,9 @@ need_resched: if (to_wakeup) try_to_wake_up_local(to_wakeup); } + deactivate_task(rq, prev, DEQUEUE_SLEEP); + prev->on_rq = 0; /* * If we are going to sleep and we have plugged IO queued, make @@ -4695,7 +4699,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) trace_sched_pi_setprio(p, prio); oldprio = p->prio; prev_class = p->sched_class; - on_rq = p->se.on_rq; + on_rq = p->on_rq; running = task_current(rq, p); if (on_rq) dequeue_task(rq, p, 0); @@ -4743,7 +4747,7 @@ void set_user_nice(struct task_struct *p, long nice) p->static_prio = NICE_TO_PRIO(nice); goto out_unlock; } - on_rq = p->se.on_rq; + on_rq = p->on_rq; if (on_rq) dequeue_task(rq, p, 0); @@ -4877,8 +4881,6 @@ static struct task_struct *find_process_by_pid(pid_t pid) static void __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) { - BUG_ON(p->se.on_rq); - p->policy = policy; p->rt_priority = prio; p->normal_prio = normal_prio(p); @@ -5044,7 +5046,7 @@ recheck: raw_spin_unlock_irqrestore(&p->pi_lock, flags); goto recheck; } - on_rq = p->se.on_rq; + on_rq = p->on_rq; running = task_current(rq, p); if (on_rq) deactivate_task(rq, p, 0); @@ -5965,7 +5967,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) * If we're not on a rq, the next wake-up will ensure we're * placed properly. */ - if (p->se.on_rq) { + if (p->on_rq) { deactivate_task(rq_src, p, 0); set_task_cpu(p, dest_cpu); activate_task(rq_dest, p, 0); @@ -8339,7 +8341,7 @@ static void normalize_task(struct rq *rq, struct task_struct *p) int old_prio = p->prio; int on_rq; - on_rq = p->se.on_rq; + on_rq = p->on_rq; if (on_rq) deactivate_task(rq, p, 0); __setscheduler(rq, p, SCHED_NORMAL, 0); @@ -8682,7 +8684,7 @@ void sched_move_task(struct task_struct *tsk) rq = task_rq_lock(tsk, &flags); running = task_current(rq, tsk); - on_rq = tsk->se.on_rq; + on_rq = tsk->on_rq; if (on_rq) dequeue_task(rq, tsk, 0); diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 7bacd83..3669bec6 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -152,7 +152,7 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) read_lock_irqsave(&tasklist_lock, flags); do_each_thread(g, p) { - if (!p->se.on_rq || task_cpu(p) != rq_cpu) + if (!p->on_rq || task_cpu(p) != rq_cpu) continue; print_task(m, rq, p); diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index e7cebdc..9ca4f5f 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -1136,7 +1136,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) * The previous task needs to be made eligible for pushing * if it is still active */ - if (p->se.on_rq && p->rt.nr_cpus_allowed > 1) + if (on_rt_rq(&p->rt) && p->rt.nr_cpus_allowed > 1) enqueue_pushable_task(rq, p); } @@ -1287,7 +1287,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_allowed) || task_running(rq, task) || - !task->se.on_rq)) { + !task->on_rq)) { raw_spin_unlock(&lowest_rq->lock); lowest_rq = NULL; @@ -1321,7 +1321,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq) BUG_ON(task_current(rq, p)); BUG_ON(p->rt.nr_cpus_allowed <= 1); - BUG_ON(!p->se.on_rq); + BUG_ON(!p->on_rq); BUG_ON(!rt_task(p)); return p; @@ -1467,7 +1467,7 @@ static int pull_rt_task(struct rq *this_rq) */ if (p && (p->prio < this_rq->rt.highest_prio.curr)) { WARN_ON(p == src_rq->curr); - WARN_ON(!p->se.on_rq); + WARN_ON(!p->on_rq); /* * There's a chance that p is higher in priority @@ -1538,7 +1538,7 @@ static void set_cpus_allowed_rt(struct task_struct *p, * Update the migration status of the RQ if we have an RT task * which is running AND changing its weight value. */ - if (p->se.on_rq && (weight != p->rt.nr_cpus_allowed)) { + if (p->on_rq && (weight != p->rt.nr_cpus_allowed)) { struct rq *rq = task_rq(p); if (!task_current(rq, p)) { @@ -1608,7 +1608,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p) * we may need to handle the pulling of RT tasks * now. */ - if (p->se.on_rq && !rq->rt.rt_nr_running) + if (p->on_rq && !rq->rt.rt_nr_running) pull_rt_task(rq); } @@ -1638,7 +1638,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) * If that current running task is also an RT task * then see if we can move to another run queue. */ - if (p->se.on_rq && rq->curr != p) { + if (p->on_rq && rq->curr != p) { #ifdef CONFIG_SMP if (rq->rt.overloaded && push_rt_task(rq) && /* Don't resched if we changed runqueues */ @@ -1657,7 +1657,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) static void prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) { - if (!p->se.on_rq) + if (!p->on_rq) return; if (rq->curr == p) { diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c index 1ba2bd4..f607de4 100644 --- a/kernel/sched_stoptask.c +++ b/kernel/sched_stoptask.c @@ -26,7 +26,7 @@ static struct task_struct *pick_next_task_stop(struct rq *rq) { struct task_struct *stop = rq->stop; - if (stop && stop->se.on_rq) + if (stop && stop->on_rq) return stop; return NULL; -- cgit v1.1 From 013fdb8086acaae5f8eb96f9ad48fcd98882ac46 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Apr 2011 17:23:45 +0200 Subject: sched: Serialize p->cpus_allowed and ttwu() using p->pi_lock Currently p->pi_lock already serializes p->sched_class, also put p->cpus_allowed and try_to_wake_up() under it, this prepares the way to do the first part of ttwu() without holding rq->lock. By having p->sched_class and p->cpus_allowed serialized by p->pi_lock, we prepare the way to call select_task_rq() without holding rq->lock. Reviewed-by: Frank Rowand Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20110405152728.990364093@chello.nl Signed-off-by: Ingo Molnar --- kernel/sched.c | 37 ++++++++++++++++--------------------- 1 file changed, 16 insertions(+), 21 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index dece28e..d398f2f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2340,7 +2340,7 @@ EXPORT_SYMBOL_GPL(kick_process); #ifdef CONFIG_SMP /* - * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held. + * ->cpus_allowed is protected by both rq->lock and p->pi_lock */ static int select_fallback_rq(int cpu, struct task_struct *p) { @@ -2373,7 +2373,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p) } /* - * The caller (fork, wakeup) owns TASK_WAKING, ->cpus_allowed is stable. + * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable. */ static inline int select_task_rq(struct rq *rq, struct task_struct *p, int sd_flags, int wake_flags) @@ -2499,7 +2499,8 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, this_cpu = get_cpu(); smp_wmb(); - rq = task_rq_lock(p, &flags); + raw_spin_lock_irqsave(&p->pi_lock, flags); + rq = __task_rq_lock(p); if (!(p->state & state)) goto out; @@ -2557,7 +2558,8 @@ out_running: ttwu_stat(rq, p, cpu, wake_flags); success = 1; out: - task_rq_unlock(rq, &flags); + __task_rq_unlock(rq); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); put_cpu(); return success; @@ -4694,6 +4696,8 @@ void rt_mutex_setprio(struct task_struct *p, int prio) BUG_ON(prio < 0 || prio > MAX_PRIO); + lockdep_assert_held(&p->pi_lock); + rq = task_rq_lock(p, &flags); trace_sched_pi_setprio(p, prio); @@ -5317,7 +5321,6 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask) { struct task_struct *p; unsigned long flags; - struct rq *rq; int retval; get_online_cpus(); @@ -5332,9 +5335,9 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask) if (retval) goto out_unlock; - rq = task_rq_lock(p, &flags); + raw_spin_lock_irqsave(&p->pi_lock, flags); cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); - task_rq_unlock(rq, &flags); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); out_unlock: rcu_read_unlock(); @@ -5882,18 +5885,8 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) unsigned int dest_cpu; int ret = 0; - /* - * Serialize against TASK_WAKING so that ttwu() and wunt() can - * drop the rq->lock and still rely on ->cpus_allowed. - */ -again: - while (task_is_waking(p)) - cpu_relax(); - rq = task_rq_lock(p, &flags); - if (task_is_waking(p)) { - task_rq_unlock(rq, &flags); - goto again; - } + raw_spin_lock_irqsave(&p->pi_lock, flags); + rq = __task_rq_lock(p); if (!cpumask_intersects(new_mask, cpu_active_mask)) { ret = -EINVAL; @@ -5921,13 +5914,15 @@ again: if (migrate_task(p, rq)) { struct migration_arg arg = { p, dest_cpu }; /* Need help from migration thread: drop lock and wait. */ - task_rq_unlock(rq, &flags); + __task_rq_unlock(rq); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); tlb_migrate_finish(p->mm); return 0; } out: - task_rq_unlock(rq, &flags); + __task_rq_unlock(rq); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); return ret; } -- cgit v1.1 From 7608dec2ce2004c234339bef8c8074e5e601d0e9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Apr 2011 17:23:46 +0200 Subject: sched: Drop the rq argument to sched_class::select_task_rq() In preparation of calling select_task_rq() without rq->lock held, drop the dependency on the rq argument. Reviewed-by: Frank Rowand Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20110405152729.031077745@chello.nl Signed-off-by: Ingo Molnar --- include/linux/sched.h | 3 +-- kernel/sched.c | 20 +++++++++++--------- kernel/sched_fair.c | 2 +- kernel/sched_idletask.c | 2 +- kernel/sched_rt.c | 38 ++++++++++++++++++++++++++------------ kernel/sched_stoptask.c | 3 +-- 6 files changed, 41 insertions(+), 27 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index b33a700..ff4e2f9 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1067,8 +1067,7 @@ struct sched_class { void (*put_prev_task) (struct rq *rq, struct task_struct *p); #ifdef CONFIG_SMP - int (*select_task_rq)(struct rq *rq, struct task_struct *p, - int sd_flag, int flags); + int (*select_task_rq)(struct task_struct *p, int sd_flag, int flags); void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); void (*post_schedule) (struct rq *this_rq); diff --git a/kernel/sched.c b/kernel/sched.c index d398f2f..d4b815d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2195,13 +2195,15 @@ static int migration_cpu_stop(void *data); * The task's runqueue lock must be held. * Returns true if you have to wait for migration thread. */ -static bool migrate_task(struct task_struct *p, struct rq *rq) +static bool need_migrate_task(struct task_struct *p) { /* * If the task is not on a runqueue (and not running), then * the next wake-up will properly place the task. */ - return p->on_rq || task_running(rq, p); + bool running = p->on_rq || p->on_cpu; + smp_rmb(); /* finish_lock_switch() */ + return running; } /* @@ -2376,9 +2378,9 @@ static int select_fallback_rq(int cpu, struct task_struct *p) * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable. */ static inline -int select_task_rq(struct rq *rq, struct task_struct *p, int sd_flags, int wake_flags) +int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) { - int cpu = p->sched_class->select_task_rq(rq, p, sd_flags, wake_flags); + int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags); /* * In order not to call set_task_cpu() on a blocking task we need @@ -2533,7 +2535,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, en_flags |= ENQUEUE_WAKING; } - cpu = select_task_rq(rq, p, SD_BALANCE_WAKE, wake_flags); + cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); if (cpu != orig_cpu) set_task_cpu(p, cpu); __task_rq_unlock(rq); @@ -2744,7 +2746,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) * We set TASK_WAKING so that select_task_rq() can drop rq->lock * without people poking at ->cpus_allowed. */ - cpu = select_task_rq(rq, p, SD_BALANCE_FORK, 0); + cpu = select_task_rq(p, SD_BALANCE_FORK, 0); set_task_cpu(p, cpu); p->state = TASK_RUNNING; @@ -3474,7 +3476,7 @@ void sched_exec(void) int dest_cpu; rq = task_rq_lock(p, &flags); - dest_cpu = p->sched_class->select_task_rq(rq, p, SD_BALANCE_EXEC, 0); + dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0); if (dest_cpu == smp_processor_id()) goto unlock; @@ -3482,7 +3484,7 @@ void sched_exec(void) * select_task_rq() can race against ->cpus_allowed */ if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) && - likely(cpu_active(dest_cpu)) && migrate_task(p, rq)) { + likely(cpu_active(dest_cpu)) && need_migrate_task(p)) { struct migration_arg arg = { p, dest_cpu }; task_rq_unlock(rq, &flags); @@ -5911,7 +5913,7 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) goto out; dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); - if (migrate_task(p, rq)) { + if (need_migrate_task(p)) { struct migration_arg arg = { p, dest_cpu }; /* Need help from migration thread: drop lock and wait. */ __task_rq_unlock(rq); diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 4ee50f0..96b2c95 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1657,7 +1657,7 @@ static int select_idle_sibling(struct task_struct *p, int target) * preempt must be disabled. */ static int -select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_flags) +select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) { struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; int cpu = smp_processor_id(); diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index a776a63..0a51882 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c @@ -7,7 +7,7 @@ #ifdef CONFIG_SMP static int -select_task_rq_idle(struct rq *rq, struct task_struct *p, int sd_flag, int flags) +select_task_rq_idle(struct task_struct *p, int sd_flag, int flags) { return task_cpu(p); /* IDLE tasks as never migrated */ } diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 9ca4f5f..19ecb31 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -977,13 +977,23 @@ static void yield_task_rt(struct rq *rq) static int find_lowest_rq(struct task_struct *task); static int -select_task_rq_rt(struct rq *rq, struct task_struct *p, int sd_flag, int flags) +select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) { + struct task_struct *curr; + struct rq *rq; + int cpu; + if (sd_flag != SD_BALANCE_WAKE) return smp_processor_id(); + cpu = task_cpu(p); + rq = cpu_rq(cpu); + + rcu_read_lock(); + curr = ACCESS_ONCE(rq->curr); /* unlocked access */ + /* - * If the current task is an RT task, then + * If the current task on @p's runqueue is an RT task, then * try to see if we can wake this RT task up on another * runqueue. Otherwise simply start this RT task * on its current runqueue. @@ -997,21 +1007,25 @@ select_task_rq_rt(struct rq *rq, struct task_struct *p, int sd_flag, int flags) * lock? * * For equal prio tasks, we just let the scheduler sort it out. + * + * Otherwise, just let it ride on the affined RQ and the + * post-schedule router will push the preempted task away + * + * This test is optimistic, if we get it wrong the load-balancer + * will have to sort it out. */ - if (unlikely(rt_task(rq->curr)) && - (rq->curr->rt.nr_cpus_allowed < 2 || - rq->curr->prio < p->prio) && + if (curr && unlikely(rt_task(curr)) && + (curr->rt.nr_cpus_allowed < 2 || + curr->prio < p->prio) && (p->rt.nr_cpus_allowed > 1)) { - int cpu = find_lowest_rq(p); + int target = find_lowest_rq(p); - return (cpu == -1) ? task_cpu(p) : cpu; + if (target != -1) + cpu = target; } + rcu_read_unlock(); - /* - * Otherwise, just let it ride on the affined RQ and the - * post-schedule router will push the preempted task away - */ - return task_cpu(p); + return cpu; } static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c index f607de4..6f43763 100644 --- a/kernel/sched_stoptask.c +++ b/kernel/sched_stoptask.c @@ -9,8 +9,7 @@ #ifdef CONFIG_SMP static int -select_task_rq_stop(struct rq *rq, struct task_struct *p, - int sd_flag, int flags) +select_task_rq_stop(struct task_struct *p, int sd_flag, int flags) { return task_cpu(p); /* stop tasks as never migrate */ } -- cgit v1.1 From 74f8e4b2335de45485b8d5b31a504747f13c8070 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Apr 2011 17:23:47 +0200 Subject: sched: Remove rq argument to sched_class::task_waking() In preparation of calling this without rq->lock held, remove the dependency on the rq argument. Reviewed-by: Frank Rowand Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20110405152729.071474242@chello.nl Signed-off-by: Ingo Molnar --- include/linux/sched.h | 10 +++++++--- kernel/sched.c | 2 +- kernel/sched_fair.c | 4 +++- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index ff4e2f9..7f5732f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1048,8 +1048,12 @@ struct sched_domain; #define WF_FORK 0x02 /* child wakeup after fork */ #define ENQUEUE_WAKEUP 1 -#define ENQUEUE_WAKING 2 -#define ENQUEUE_HEAD 4 +#define ENQUEUE_HEAD 2 +#ifdef CONFIG_SMP +#define ENQUEUE_WAKING 4 /* sched_class::task_waking was called */ +#else +#define ENQUEUE_WAKING 0 +#endif #define DEQUEUE_SLEEP 1 @@ -1071,7 +1075,7 @@ struct sched_class { void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); void (*post_schedule) (struct rq *this_rq); - void (*task_waking) (struct rq *this_rq, struct task_struct *task); + void (*task_waking) (struct task_struct *task); void (*task_woken) (struct rq *this_rq, struct task_struct *task); void (*set_cpus_allowed)(struct task_struct *p, diff --git a/kernel/sched.c b/kernel/sched.c index d4b815d..46f42ca 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2531,7 +2531,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, p->state = TASK_WAKING; if (p->sched_class->task_waking) { - p->sched_class->task_waking(rq, p); + p->sched_class->task_waking(p); en_flags |= ENQUEUE_WAKING; } diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 96b2c95..ad4c414f 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1372,11 +1372,13 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) #ifdef CONFIG_SMP -static void task_waking_fair(struct rq *rq, struct task_struct *p) +static void task_waking_fair(struct task_struct *p) { struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); + lockdep_assert_held(&task_rq(p)->lock); + se->vruntime -= cfs_rq->min_vruntime; } -- cgit v1.1 From 3fe1698b7fe05aeb063564e71e40d09f28d8e80c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Apr 2011 17:23:48 +0200 Subject: sched: Deal with non-atomic min_vruntime reads on 32bits In order to avoid reading partial updated min_vruntime values on 32bit implement a seqcount like solution. Reviewed-by: Frank Rowand Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20110405152729.111378493@chello.nl Signed-off-by: Ingo Molnar --- kernel/sched.c | 3 +++ kernel/sched_fair.c | 19 +++++++++++++++++-- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 46f42ca..7a5eb26 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -312,6 +312,9 @@ struct cfs_rq { u64 exec_clock; u64 min_vruntime; +#ifndef CONFIG_64BIT + u64 min_vruntime_copy; +#endif struct rb_root tasks_timeline; struct rb_node *rb_leftmost; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index ad4c414f..054cebb 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -358,6 +358,10 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq) } cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime); +#ifndef CONFIG_64BIT + smp_wmb(); + cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; +#endif } /* @@ -1376,10 +1380,21 @@ static void task_waking_fair(struct task_struct *p) { struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); + u64 min_vruntime; - lockdep_assert_held(&task_rq(p)->lock); +#ifndef CONFIG_64BIT + u64 min_vruntime_copy; - se->vruntime -= cfs_rq->min_vruntime; + do { + min_vruntime_copy = cfs_rq->min_vruntime_copy; + smp_rmb(); + min_vruntime = cfs_rq->min_vruntime; + } while (min_vruntime != min_vruntime_copy); +#else + min_vruntime = cfs_rq->min_vruntime; +#endif + + se->vruntime -= min_vruntime; } #ifdef CONFIG_FAIR_GROUP_SCHED -- cgit v1.1 From a8e4f2eaecc9bfa4954adf79a04f4f22fddd829c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Apr 2011 17:23:49 +0200 Subject: sched: Delay task_contributes_to_load() In prepratation of having to call task_contributes_to_load() without holding rq->lock, we need to store the result until we do and can update the rq accounting accordingly. Reviewed-by: Frank Rowand Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110405152729.151523907@chello.nl --- include/linux/sched.h | 1 + kernel/sched.c | 16 ++++------------ 2 files changed, 5 insertions(+), 12 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 7f5732f..25c5031 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1273,6 +1273,7 @@ struct task_struct { /* Revert to default priority/policy when forking */ unsigned sched_reset_on_fork:1; + unsigned sched_contributes_to_load:1; pid_t pid; pid_t tgid; diff --git a/kernel/sched.c b/kernel/sched.c index 7a5eb26..fd32b78 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2519,18 +2519,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, if (unlikely(task_running(rq, p))) goto out_activate; - /* - * In order to handle concurrent wakeups and release the rq->lock - * we put the task in TASK_WAKING state. - * - * First fix up the nr_uninterruptible count: - */ - if (task_contributes_to_load(p)) { - if (likely(cpu_online(orig_cpu))) - rq->nr_uninterruptible--; - else - this_rq()->nr_uninterruptible--; - } + p->sched_contributes_to_load = !!task_contributes_to_load(p); p->state = TASK_WAKING; if (p->sched_class->task_waking) { @@ -2555,6 +2544,9 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, WARN_ON(task_cpu(p) != cpu); WARN_ON(p->state != TASK_WAKING); + if (p->sched_contributes_to_load) + rq->nr_uninterruptible--; + out_activate: #endif /* CONFIG_SMP */ ttwu_activate(rq, p, en_flags); -- cgit v1.1 From 2acca55ed98ad9b9aa25e7e587ebe306c0313dc7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Apr 2011 17:23:50 +0200 Subject: sched: Also serialize ttwu_local() with p->pi_lock Since we now serialize ttwu() using p->pi_lock, we also need to serialize ttwu_local() using that, otherwise, once we drop the rq->lock from ttwu() it can race with ttwu_local(). Reviewed-by: Frank Rowand Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110405152729.192366907@chello.nl --- kernel/sched.c | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index fd32b78..6b269b7 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2566,9 +2566,9 @@ out: * try_to_wake_up_local - try to wake up a local task with rq lock held * @p: the thread to be awakened * - * Put @p on the run-queue if it's not already there. The caller must + * Put @p on the run-queue if it's not already there. The caller must * ensure that this_rq() is locked, @p is bound to this_rq() and not - * the current task. this_rq() stays locked over invocation. + * the current task. */ static void try_to_wake_up_local(struct task_struct *p) { @@ -2578,14 +2578,22 @@ static void try_to_wake_up_local(struct task_struct *p) BUG_ON(p == current); lockdep_assert_held(&rq->lock); + if (!raw_spin_trylock(&p->pi_lock)) { + raw_spin_unlock(&rq->lock); + raw_spin_lock(&p->pi_lock); + raw_spin_lock(&rq->lock); + } + if (!(p->state & TASK_NORMAL)) - return; + goto out; if (!p->on_rq) ttwu_activate(rq, p, ENQUEUE_WAKEUP); ttwu_post_activation(p, rq, 0); ttwu_stat(rq, p, smp_processor_id(), 0); +out: + raw_spin_unlock(&p->pi_lock); } /** @@ -4114,11 +4122,13 @@ need_resched: if (unlikely(signal_pending_state(prev->state, prev))) { prev->state = TASK_RUNNING; } else { + deactivate_task(rq, prev, DEQUEUE_SLEEP); + prev->on_rq = 0; + /* - * If a worker is going to sleep, notify and - * ask workqueue whether it wants to wake up a - * task to maintain concurrency. If so, wake - * up the task. + * If a worker went to sleep, notify and ask workqueue + * whether it wants to wake up a task to maintain + * concurrency. */ if (prev->flags & PF_WQ_WORKER) { struct task_struct *to_wakeup; @@ -4128,12 +4138,9 @@ need_resched: try_to_wake_up_local(to_wakeup); } - deactivate_task(rq, prev, DEQUEUE_SLEEP); - prev->on_rq = 0; - /* - * If we are going to sleep and we have plugged IO queued, make - * sure to submit it to avoid deadlocks. + * If we are going to sleep and we have plugged IO + * queued, make sure to submit it to avoid deadlocks. */ if (blk_needs_flush_plug(prev)) { raw_spin_unlock(&rq->lock); -- cgit v1.1 From 0122ec5b02f766c355b3168df53a6c038a24fa0d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Apr 2011 17:23:51 +0200 Subject: sched: Add p->pi_lock to task_rq_lock() In order to be able to call set_task_cpu() while either holding p->pi_lock or task_rq(p)->lock we need to hold both locks in order to stabilize task_rq(). This makes task_rq_lock() acquire both locks, and have __task_rq_lock() validate that p->pi_lock is held. This increases the locking overhead for most scheduler syscalls but allows reduction of rq->lock contention for some scheduler hot paths (ttwu). Reviewed-by: Frank Rowand Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20110405152729.232781355@chello.nl Signed-off-by: Ingo Molnar --- kernel/sched.c | 103 ++++++++++++++++++++++++++------------------------------- 1 file changed, 47 insertions(+), 56 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 6b269b7..f155127 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -599,7 +599,7 @@ static inline int cpu_of(struct rq *rq) * Return the group to which this tasks belongs. * * We use task_subsys_state_check() and extend the RCU verification - * with lockdep_is_held(&task_rq(p)->lock) because cpu_cgroup_attach() + * with lockdep_is_held(&p->pi_lock) because cpu_cgroup_attach() * holds that lock for each task it moves into the cgroup. Therefore * by holding that lock, we pin the task to the current cgroup. */ @@ -609,7 +609,7 @@ static inline struct task_group *task_group(struct task_struct *p) struct cgroup_subsys_state *css; css = task_subsys_state_check(p, cpu_cgroup_subsys_id, - lockdep_is_held(&task_rq(p)->lock)); + lockdep_is_held(&p->pi_lock)); tg = container_of(css, struct task_group, css); return autogroup_task_group(p, tg); @@ -924,23 +924,15 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ /* - * Check whether the task is waking, we use this to synchronize ->cpus_allowed - * against ttwu(). - */ -static inline int task_is_waking(struct task_struct *p) -{ - return unlikely(p->state == TASK_WAKING); -} - -/* - * __task_rq_lock - lock the runqueue a given task resides on. - * Must be called interrupts disabled. + * __task_rq_lock - lock the rq @p resides on. */ static inline struct rq *__task_rq_lock(struct task_struct *p) __acquires(rq->lock) { struct rq *rq; + lockdep_assert_held(&p->pi_lock); + for (;;) { rq = task_rq(p); raw_spin_lock(&rq->lock); @@ -951,22 +943,22 @@ static inline struct rq *__task_rq_lock(struct task_struct *p) } /* - * task_rq_lock - lock the runqueue a given task resides on and disable - * interrupts. Note the ordering: we can safely lookup the task_rq without - * explicitly disabling preemption. + * task_rq_lock - lock p->pi_lock and lock the rq @p resides on. */ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) + __acquires(p->pi_lock) __acquires(rq->lock) { struct rq *rq; for (;;) { - local_irq_save(*flags); + raw_spin_lock_irqsave(&p->pi_lock, *flags); rq = task_rq(p); raw_spin_lock(&rq->lock); if (likely(rq == task_rq(p))) return rq; - raw_spin_unlock_irqrestore(&rq->lock, *flags); + raw_spin_unlock(&rq->lock); + raw_spin_unlock_irqrestore(&p->pi_lock, *flags); } } @@ -976,10 +968,13 @@ static void __task_rq_unlock(struct rq *rq) raw_spin_unlock(&rq->lock); } -static inline void task_rq_unlock(struct rq *rq, unsigned long *flags) +static inline void +task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags) __releases(rq->lock) + __releases(p->pi_lock) { - raw_spin_unlock_irqrestore(&rq->lock, *flags); + raw_spin_unlock(&rq->lock); + raw_spin_unlock_irqrestore(&p->pi_lock, *flags); } /* @@ -2175,6 +2170,11 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) */ WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)); + +#ifdef CONFIG_LOCKDEP + WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || + lockdep_is_held(&task_rq(p)->lock))); +#endif #endif trace_sched_migrate_task(p, new_cpu); @@ -2270,7 +2270,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) ncsw = 0; if (!match_state || p->state == match_state) ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ - task_rq_unlock(rq, &flags); + task_rq_unlock(rq, p, &flags); /* * If it changed from the expected state, bail out now. @@ -2652,6 +2652,7 @@ static void __sched_fork(struct task_struct *p) */ void sched_fork(struct task_struct *p, int clone_flags) { + unsigned long flags; int cpu = get_cpu(); __sched_fork(p); @@ -2702,9 +2703,9 @@ void sched_fork(struct task_struct *p, int clone_flags) * * Silence PROVE_RCU. */ - rcu_read_lock(); + raw_spin_lock_irqsave(&p->pi_lock, flags); set_task_cpu(p, cpu); - rcu_read_unlock(); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) if (likely(sched_info_on())) @@ -2753,7 +2754,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) set_task_cpu(p, cpu); p->state = TASK_RUNNING; - task_rq_unlock(rq, &flags); + task_rq_unlock(rq, p, &flags); #endif rq = task_rq_lock(p, &flags); @@ -2765,7 +2766,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) if (p->sched_class->task_woken) p->sched_class->task_woken(rq, p); #endif - task_rq_unlock(rq, &flags); + task_rq_unlock(rq, p, &flags); put_cpu(); } @@ -3490,12 +3491,12 @@ void sched_exec(void) likely(cpu_active(dest_cpu)) && need_migrate_task(p)) { struct migration_arg arg = { p, dest_cpu }; - task_rq_unlock(rq, &flags); + task_rq_unlock(rq, p, &flags); stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); return; } unlock: - task_rq_unlock(rq, &flags); + task_rq_unlock(rq, p, &flags); } #endif @@ -3532,7 +3533,7 @@ unsigned long long task_delta_exec(struct task_struct *p) rq = task_rq_lock(p, &flags); ns = do_task_delta_exec(p, rq); - task_rq_unlock(rq, &flags); + task_rq_unlock(rq, p, &flags); return ns; } @@ -3550,7 +3551,7 @@ unsigned long long task_sched_runtime(struct task_struct *p) rq = task_rq_lock(p, &flags); ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq); - task_rq_unlock(rq, &flags); + task_rq_unlock(rq, p, &flags); return ns; } @@ -3574,7 +3575,7 @@ unsigned long long thread_group_sched_runtime(struct task_struct *p) rq = task_rq_lock(p, &flags); thread_group_cputime(p, &totals); ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq); - task_rq_unlock(rq, &flags); + task_rq_unlock(rq, p, &flags); return ns; } @@ -4693,16 +4694,13 @@ EXPORT_SYMBOL(sleep_on_timeout); */ void rt_mutex_setprio(struct task_struct *p, int prio) { - unsigned long flags; int oldprio, on_rq, running; struct rq *rq; const struct sched_class *prev_class; BUG_ON(prio < 0 || prio > MAX_PRIO); - lockdep_assert_held(&p->pi_lock); - - rq = task_rq_lock(p, &flags); + rq = __task_rq_lock(p); trace_sched_pi_setprio(p, prio); oldprio = p->prio; @@ -4727,7 +4725,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); check_class_changed(rq, p, prev_class, oldprio); - task_rq_unlock(rq, &flags); + __task_rq_unlock(rq); } #endif @@ -4775,7 +4773,7 @@ void set_user_nice(struct task_struct *p, long nice) resched_task(rq->curr); } out_unlock: - task_rq_unlock(rq, &flags); + task_rq_unlock(rq, p, &flags); } EXPORT_SYMBOL(set_user_nice); @@ -5003,20 +5001,17 @@ recheck: /* * make sure no PI-waiters arrive (or leave) while we are * changing the priority of the task: - */ - raw_spin_lock_irqsave(&p->pi_lock, flags); - /* + * * To be able to change p->policy safely, the appropriate * runqueue lock must be held. */ - rq = __task_rq_lock(p); + rq = task_rq_lock(p, &flags); /* * Changing the policy of the stop threads its a very bad idea */ if (p == rq->stop) { - __task_rq_unlock(rq); - raw_spin_unlock_irqrestore(&p->pi_lock, flags); + task_rq_unlock(rq, p, &flags); return -EINVAL; } @@ -5040,8 +5035,7 @@ recheck: if (rt_bandwidth_enabled() && rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0 && !task_group_is_autogroup(task_group(p))) { - __task_rq_unlock(rq); - raw_spin_unlock_irqrestore(&p->pi_lock, flags); + task_rq_unlock(rq, p, &flags); return -EPERM; } } @@ -5050,8 +5044,7 @@ recheck: /* recheck policy now with rq lock held */ if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { policy = oldpolicy = -1; - __task_rq_unlock(rq); - raw_spin_unlock_irqrestore(&p->pi_lock, flags); + task_rq_unlock(rq, p, &flags); goto recheck; } on_rq = p->on_rq; @@ -5073,8 +5066,7 @@ recheck: activate_task(rq, p, 0); check_class_changed(rq, p, prev_class, oldprio); - __task_rq_unlock(rq); - raw_spin_unlock_irqrestore(&p->pi_lock, flags); + task_rq_unlock(rq, p, &flags); rt_mutex_adjust_pi(p); @@ -5666,7 +5658,7 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, rq = task_rq_lock(p, &flags); time_slice = p->sched_class->get_rr_interval(rq, p); - task_rq_unlock(rq, &flags); + task_rq_unlock(rq, p, &flags); rcu_read_unlock(); jiffies_to_timespec(time_slice, &t); @@ -5889,8 +5881,7 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) unsigned int dest_cpu; int ret = 0; - raw_spin_lock_irqsave(&p->pi_lock, flags); - rq = __task_rq_lock(p); + rq = task_rq_lock(p, &flags); if (!cpumask_intersects(new_mask, cpu_active_mask)) { ret = -EINVAL; @@ -5918,15 +5909,13 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) if (need_migrate_task(p)) { struct migration_arg arg = { p, dest_cpu }; /* Need help from migration thread: drop lock and wait. */ - __task_rq_unlock(rq); - raw_spin_unlock_irqrestore(&p->pi_lock, flags); + task_rq_unlock(rq, p, &flags); stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); tlb_migrate_finish(p->mm); return 0; } out: - __task_rq_unlock(rq); - raw_spin_unlock_irqrestore(&p->pi_lock, flags); + task_rq_unlock(rq, p, &flags); return ret; } @@ -5954,6 +5943,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) rq_src = cpu_rq(src_cpu); rq_dest = cpu_rq(dest_cpu); + raw_spin_lock(&p->pi_lock); double_rq_lock(rq_src, rq_dest); /* Already moved. */ if (task_cpu(p) != src_cpu) @@ -5976,6 +5966,7 @@ done: ret = 1; fail: double_rq_unlock(rq_src, rq_dest); + raw_spin_unlock(&p->pi_lock); return ret; } @@ -8702,7 +8693,7 @@ void sched_move_task(struct task_struct *tsk) if (on_rq) enqueue_task(rq, tsk, 0); - task_rq_unlock(rq, &flags); + task_rq_unlock(rq, tsk, &flags); } #endif /* CONFIG_CGROUP_SCHED */ -- cgit v1.1 From ab2515c4b98f7bc4fa11cad9fa0f811d63a72a26 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Apr 2011 17:23:52 +0200 Subject: sched: Drop rq->lock from first part of wake_up_new_task() Since p->pi_lock now protects all things needed to call select_task_rq() avoid the double remote rq->lock acquisition and rely on p->pi_lock. Reviewed-by: Frank Rowand Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20110405152729.273362517@chello.nl Signed-off-by: Ingo Molnar --- kernel/sched.c | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index f155127..7c5494d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2736,28 +2736,18 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) { unsigned long flags; struct rq *rq; - int cpu __maybe_unused = get_cpu(); + raw_spin_lock_irqsave(&p->pi_lock, flags); #ifdef CONFIG_SMP - rq = task_rq_lock(p, &flags); - p->state = TASK_WAKING; - /* * Fork balancing, do it here and not earlier because: * - cpus_allowed can change in the fork path * - any previously selected cpu might disappear through hotplug - * - * We set TASK_WAKING so that select_task_rq() can drop rq->lock - * without people poking at ->cpus_allowed. */ - cpu = select_task_rq(p, SD_BALANCE_FORK, 0); - set_task_cpu(p, cpu); - - p->state = TASK_RUNNING; - task_rq_unlock(rq, p, &flags); + set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0)); #endif - rq = task_rq_lock(p, &flags); + rq = __task_rq_lock(p); activate_task(rq, p, 0); p->on_rq = 1; trace_sched_wakeup_new(p, true); @@ -2767,7 +2757,6 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) p->sched_class->task_woken(rq, p); #endif task_rq_unlock(rq, p, &flags); - put_cpu(); } #ifdef CONFIG_PREEMPT_NOTIFIERS -- cgit v1.1 From 8f42ced974df7d5af2de4cf5ea21fe978c7e4478 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Apr 2011 17:23:53 +0200 Subject: sched: Drop rq->lock from sched_exec() Since we can now call select_task_rq() and set_task_cpu() with only p->pi_lock held, and sched_exec() load-balancing has always been optimistic, drop all rq->lock usage. Oleg also noted that need_migrate_task() will always be true for current, so don't bother calling that at all. Reviewed-by: Frank Rowand Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20110405152729.314204889@chello.nl Signed-off-by: Ingo Molnar --- kernel/sched.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 7c5494d..1be1a09 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3465,27 +3465,22 @@ void sched_exec(void) { struct task_struct *p = current; unsigned long flags; - struct rq *rq; int dest_cpu; - rq = task_rq_lock(p, &flags); + raw_spin_lock_irqsave(&p->pi_lock, flags); dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0); if (dest_cpu == smp_processor_id()) goto unlock; - /* - * select_task_rq() can race against ->cpus_allowed - */ - if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) && - likely(cpu_active(dest_cpu)) && need_migrate_task(p)) { + if (likely(cpu_active(dest_cpu))) { struct migration_arg arg = { p, dest_cpu }; - task_rq_unlock(rq, p, &flags); - stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); + stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg); return; } unlock: - task_rq_unlock(rq, p, &flags); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); } #endif -- cgit v1.1 From e4a52bcb9a18142d79e231b6733cabdbf2e67c1f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Apr 2011 17:23:54 +0200 Subject: sched: Remove rq->lock from the first half of ttwu() Currently ttwu() does two rq->lock acquisitions, once on the task's old rq, holding it over the p->state fiddling and load-balance pass. Then it drops the old rq->lock to acquire the new rq->lock. By having serialized ttwu(), p->sched_class, p->cpus_allowed with p->pi_lock, we can now drop the whole first rq->lock acquisition. The p->pi_lock serializing concurrent ttwu() calls protects p->state, which we will set to TASK_WAKING to bridge possible p->pi_lock to rq->lock gaps and serialize set_task_cpu() calls against task_rq_lock(). The p->pi_lock serialization of p->sched_class allows us to call scheduling class methods without holding the rq->lock, and the serialization of p->cpus_allowed allows us to do the load-balancing bits without races. Reviewed-by: Frank Rowand Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110405152729.354401150@chello.nl --- kernel/sched.c | 65 +++++++++++++++++++++++++++++++++------------------------- 1 file changed, 37 insertions(+), 28 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 1be1a09..871dd9e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2493,69 +2493,78 @@ ttwu_post_activation(struct task_struct *p, struct rq *rq, int wake_flags) * Returns %true if @p was woken up, %false if it was already running * or @state didn't match @p's state. */ -static int try_to_wake_up(struct task_struct *p, unsigned int state, - int wake_flags) +static int +try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) { - int cpu, orig_cpu, this_cpu, success = 0; + int cpu, this_cpu, success = 0; unsigned long flags; - unsigned long en_flags = ENQUEUE_WAKEUP; struct rq *rq; this_cpu = get_cpu(); smp_wmb(); raw_spin_lock_irqsave(&p->pi_lock, flags); - rq = __task_rq_lock(p); if (!(p->state & state)) goto out; cpu = task_cpu(p); - if (p->on_rq) - goto out_running; + if (p->on_rq) { + rq = __task_rq_lock(p); + if (p->on_rq) + goto out_running; + __task_rq_unlock(rq); + } - orig_cpu = cpu; #ifdef CONFIG_SMP - if (unlikely(task_running(rq, p))) - goto out_activate; + while (p->on_cpu) { +#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW + /* + * If called from interrupt context we could have landed in the + * middle of schedule(), in this case we should take care not + * to spin on ->on_cpu if p is current, since that would + * deadlock. + */ + if (p == current) + goto out_activate; +#endif + cpu_relax(); + } + /* + * Pairs with the smp_wmb() in finish_lock_switch(). + */ + smp_rmb(); p->sched_contributes_to_load = !!task_contributes_to_load(p); p->state = TASK_WAKING; - if (p->sched_class->task_waking) { + if (p->sched_class->task_waking) p->sched_class->task_waking(p); - en_flags |= ENQUEUE_WAKING; - } cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); - if (cpu != orig_cpu) - set_task_cpu(p, cpu); - __task_rq_unlock(rq); +#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW +out_activate: +#endif +#endif /* CONFIG_SMP */ rq = cpu_rq(cpu); raw_spin_lock(&rq->lock); - /* - * We migrated the task without holding either rq->lock, however - * since the task is not on the task list itself, nobody else - * will try and migrate the task, hence the rq should match the - * cpu we just moved it to. - */ - WARN_ON(task_cpu(p) != cpu); - WARN_ON(p->state != TASK_WAKING); +#ifdef CONFIG_SMP + if (cpu != task_cpu(p)) + set_task_cpu(p, cpu); if (p->sched_contributes_to_load) rq->nr_uninterruptible--; +#endif -out_activate: -#endif /* CONFIG_SMP */ - ttwu_activate(rq, p, en_flags); + ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING); out_running: ttwu_post_activation(p, rq, wake_flags); ttwu_stat(rq, p, cpu, wake_flags); success = 1; -out: __task_rq_unlock(rq); +out: raw_spin_unlock_irqrestore(&p->pi_lock, flags); put_cpu(); -- cgit v1.1 From b84cb5df1f9ad6da3f214c638d5fb08d0c99de1f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Apr 2011 17:23:55 +0200 Subject: sched: Remove rq argument from ttwu_stat() In order to call ttwu_stat() without holding rq->lock we must remove its rq argument. Since we need to change rq stats, account to the local rq instead of the task rq, this is safe since we have IRQs disabled. Reviewed-by: Frank Rowand Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110405152729.394638826@chello.nl --- kernel/sched.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 871dd9e..5ec2e8b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2410,9 +2410,11 @@ static void update_avg(u64 *avg, u64 sample) #endif static void -ttwu_stat(struct rq *rq, struct task_struct *p, int cpu, int wake_flags) +ttwu_stat(struct task_struct *p, int cpu, int wake_flags) { #ifdef CONFIG_SCHEDSTATS + struct rq *rq = this_rq(); + #ifdef CONFIG_SMP int this_cpu = smp_processor_id(); @@ -2561,9 +2563,10 @@ out_activate: ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING); out_running: ttwu_post_activation(p, rq, wake_flags); - ttwu_stat(rq, p, cpu, wake_flags); success = 1; __task_rq_unlock(rq); + + ttwu_stat(p, cpu, wake_flags); out: raw_spin_unlock_irqrestore(&p->pi_lock, flags); put_cpu(); @@ -2600,7 +2603,7 @@ static void try_to_wake_up_local(struct task_struct *p) ttwu_activate(rq, p, ENQUEUE_WAKEUP); ttwu_post_activation(p, rq, 0); - ttwu_stat(rq, p, smp_processor_id(), 0); + ttwu_stat(p, smp_processor_id(), 0); out: raw_spin_unlock(&p->pi_lock); } -- cgit v1.1 From 23f41eeb42ce7f6f1210904e49e84718f02cb61c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Apr 2011 17:23:56 +0200 Subject: sched: Rename ttwu_post_activation() to ttwu_do_wakeup() The ttwu_post_activation() code does the core wakeup, it sets TASK_RUNNING and performs wakeup-preemption, so give is a more descriptive name. Reviewed-by: Frank Rowand Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110405152729.434609705@chello.nl --- kernel/sched.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 5ec2e8b..e309dba 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2456,8 +2456,11 @@ static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) wq_worker_waking_up(p, cpu_of(rq)); } +/* + * Mark the task runnable and perform wakeup-preemption. + */ static void -ttwu_post_activation(struct task_struct *p, struct rq *rq, int wake_flags) +ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) { trace_sched_wakeup(p, true); check_preempt_curr(rq, p, wake_flags); @@ -2562,7 +2565,7 @@ out_activate: ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING); out_running: - ttwu_post_activation(p, rq, wake_flags); + ttwu_do_wakeup(rq, p, wake_flags); success = 1; __task_rq_unlock(rq); @@ -2602,7 +2605,7 @@ static void try_to_wake_up_local(struct task_struct *p) if (!p->on_rq) ttwu_activate(rq, p, ENQUEUE_WAKEUP); - ttwu_post_activation(p, rq, 0); + ttwu_do_wakeup(rq, p, 0); ttwu_stat(p, smp_processor_id(), 0); out: raw_spin_unlock(&p->pi_lock); -- cgit v1.1 From c05fbafba1c5482bee399b360288fa405415e126 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Apr 2011 17:23:57 +0200 Subject: sched: Restructure ttwu() some more Factor our helper functions to make the inner workings of try_to_wake_up() more obvious, this also allows for adding remote queues. Reviewed-by: Frank Rowand Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110405152729.475848012@chello.nl --- kernel/sched.c | 91 +++++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 58 insertions(+), 33 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index e309dba..7d8b85f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2483,6 +2483,48 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) #endif } +static void +ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) +{ +#ifdef CONFIG_SMP + if (p->sched_contributes_to_load) + rq->nr_uninterruptible--; +#endif + + ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING); + ttwu_do_wakeup(rq, p, wake_flags); +} + +/* + * Called in case the task @p isn't fully descheduled from its runqueue, + * in this case we must do a remote wakeup. Its a 'light' wakeup though, + * since all we need to do is flip p->state to TASK_RUNNING, since + * the task is still ->on_rq. + */ +static int ttwu_remote(struct task_struct *p, int wake_flags) +{ + struct rq *rq; + int ret = 0; + + rq = __task_rq_lock(p); + if (p->on_rq) { + ttwu_do_wakeup(rq, p, wake_flags); + ret = 1; + } + __task_rq_unlock(rq); + + return ret; +} + +static void ttwu_queue(struct task_struct *p, int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + raw_spin_lock(&rq->lock); + ttwu_do_activate(rq, p, 0); + raw_spin_unlock(&rq->lock); +} + /** * try_to_wake_up - wake up a thread * @p: the thread to be awakened @@ -2501,27 +2543,25 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) static int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) { - int cpu, this_cpu, success = 0; unsigned long flags; - struct rq *rq; - - this_cpu = get_cpu(); + int cpu, success = 0; smp_wmb(); raw_spin_lock_irqsave(&p->pi_lock, flags); if (!(p->state & state)) goto out; + success = 1; /* we're going to change ->state */ cpu = task_cpu(p); - if (p->on_rq) { - rq = __task_rq_lock(p); - if (p->on_rq) - goto out_running; - __task_rq_unlock(rq); - } + if (p->on_rq && ttwu_remote(p, wake_flags)) + goto stat; #ifdef CONFIG_SMP + /* + * If the owning (remote) cpu is still in the middle of schedule() with + * this task as prev, wait until its done referencing the task. + */ while (p->on_cpu) { #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW /* @@ -2530,8 +2570,10 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * to spin on ->on_cpu if p is current, since that would * deadlock. */ - if (p == current) - goto out_activate; + if (p == current) { + ttwu_queue(p, cpu); + goto stat; + } #endif cpu_relax(); } @@ -2547,32 +2589,15 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) p->sched_class->task_waking(p); cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); -#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW -out_activate: -#endif -#endif /* CONFIG_SMP */ - - rq = cpu_rq(cpu); - raw_spin_lock(&rq->lock); - -#ifdef CONFIG_SMP - if (cpu != task_cpu(p)) + if (task_cpu(p) != cpu) set_task_cpu(p, cpu); +#endif /* CONFIG_SMP */ - if (p->sched_contributes_to_load) - rq->nr_uninterruptible--; -#endif - - ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING); -out_running: - ttwu_do_wakeup(rq, p, wake_flags); - success = 1; - __task_rq_unlock(rq); - + ttwu_queue(p, cpu); +stat: ttwu_stat(p, cpu, wake_flags); out: raw_spin_unlock_irqrestore(&p->pi_lock, flags); - put_cpu(); return success; } -- cgit v1.1 From 317f394160e9beb97d19a84c39b7e5eb3d7815a8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Apr 2011 17:23:58 +0200 Subject: sched: Move the second half of ttwu() to the remote cpu Now that we've removed the rq->lock requirement from the first part of ttwu() and can compute placement without holding any rq->lock, ensure we execute the second half of ttwu() on the actual cpu we want the task to run on. This avoids having to take rq->lock and doing the task enqueue remotely, saving lots on cacheline transfers. As measured using: http://oss.oracle.com/~mason/sembench.c $ for i in /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor ; do echo performance > $i; done $ echo 4096 32000 64 128 > /proc/sys/kernel/sem $ ./sembench -t 2048 -w 1900 -o 0 unpatched: run time 30 seconds 647278 worker burns per second patched: run time 30 seconds 816715 worker burns per second Reviewed-by: Frank Rowand Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110405152729.515897185@chello.nl --- include/linux/sched.h | 3 ++- init/Kconfig | 5 +++++ kernel/sched.c | 56 +++++++++++++++++++++++++++++++++++++++++++++++++ kernel/sched_features.h | 6 ++++++ 4 files changed, 69 insertions(+), 1 deletion(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 25c5031..e09dafa 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1203,6 +1203,7 @@ struct task_struct { int lock_depth; /* BKL lock depth */ #ifdef CONFIG_SMP + struct task_struct *wake_entry; int on_cpu; #endif int on_rq; @@ -2192,7 +2193,7 @@ extern void set_task_comm(struct task_struct *tsk, char *from); extern char *get_task_comm(char *to, struct task_struct *tsk); #ifdef CONFIG_SMP -static inline void scheduler_ipi(void) { } +void scheduler_ipi(void); extern unsigned long wait_task_inactive(struct task_struct *, long match_state); #else static inline void scheduler_ipi(void) { } diff --git a/init/Kconfig b/init/Kconfig index 56240e7..32745bf 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -827,6 +827,11 @@ config SCHED_AUTOGROUP desktop applications. Task group autogeneration is currently based upon task session. +config SCHED_TTWU_QUEUE + bool + depends on !SPARC32 + default y + config MM_OWNER bool diff --git a/kernel/sched.c b/kernel/sched.c index 7d8b85f..9e3ede1 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -556,6 +556,10 @@ struct rq { unsigned int ttwu_count; unsigned int ttwu_local; #endif + +#ifdef CONFIG_SMP + struct task_struct *wake_list; +#endif }; static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); @@ -2516,10 +2520,61 @@ static int ttwu_remote(struct task_struct *p, int wake_flags) return ret; } +#ifdef CONFIG_SMP +static void sched_ttwu_pending(void) +{ + struct rq *rq = this_rq(); + struct task_struct *list = xchg(&rq->wake_list, NULL); + + if (!list) + return; + + raw_spin_lock(&rq->lock); + + while (list) { + struct task_struct *p = list; + list = list->wake_entry; + ttwu_do_activate(rq, p, 0); + } + + raw_spin_unlock(&rq->lock); +} + +void scheduler_ipi(void) +{ + sched_ttwu_pending(); +} + +static void ttwu_queue_remote(struct task_struct *p, int cpu) +{ + struct rq *rq = cpu_rq(cpu); + struct task_struct *next = rq->wake_list; + + for (;;) { + struct task_struct *old = next; + + p->wake_entry = next; + next = cmpxchg(&rq->wake_list, old, p); + if (next == old) + break; + } + + if (!next) + smp_send_reschedule(cpu); +} +#endif + static void ttwu_queue(struct task_struct *p, int cpu) { struct rq *rq = cpu_rq(cpu); +#if defined(CONFIG_SMP) && defined(CONFIG_SCHED_TTWU_QUEUE) + if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) { + ttwu_queue_remote(p, cpu); + return; + } +#endif + raw_spin_lock(&rq->lock); ttwu_do_activate(rq, p, 0); raw_spin_unlock(&rq->lock); @@ -6331,6 +6386,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) #ifdef CONFIG_HOTPLUG_CPU case CPU_DYING: + sched_ttwu_pending(); /* Update our root-domain */ raw_spin_lock_irqsave(&rq->lock, flags); if (rq->rd) { diff --git a/kernel/sched_features.h b/kernel/sched_features.h index 68e69ac..be40f73 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -64,3 +64,9 @@ SCHED_FEAT(OWNER_SPIN, 1) * Decrement CPU power based on irq activity */ SCHED_FEAT(NONIRQ_POWER, 1) + +/* + * Queue remote wakeups on the target CPU and process them + * using the scheduler IPI. Reduces rq->lock contention/bounces. + */ +SCHED_FEAT(TTWU_QUEUE, 1) -- cgit v1.1 From bd8e7dded88a3e1c085c333f19ff31387616f71a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Apr 2011 17:23:59 +0200 Subject: sched: Remove need_migrate_task() Oleg noticed that need_migrate_task() doesn't need the ->on_cpu check now that ttwu() doesn't do remote enqueues for !->on_rq && ->on_cpu, so remove the helper and replace the single instance with a direct ->on_rq test. Suggested-by: Oleg Nesterov Reviewed-by: Frank Rowand Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20110405152729.556674812@chello.nl Signed-off-by: Ingo Molnar --- kernel/sched.c | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 9e3ede1..cd597c7 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2199,21 +2199,6 @@ struct migration_arg { static int migration_cpu_stop(void *data); /* - * The task's runqueue lock must be held. - * Returns true if you have to wait for migration thread. - */ -static bool need_migrate_task(struct task_struct *p) -{ - /* - * If the task is not on a runqueue (and not running), then - * the next wake-up will properly place the task. - */ - bool running = p->on_rq || p->on_cpu; - smp_rmb(); /* finish_lock_switch() */ - return running; -} - -/* * wait_task_inactive - wait for a thread to unschedule. * * If @match_state is nonzero, it's the @p->state value just checked and @@ -5985,7 +5970,7 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) goto out; dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); - if (need_migrate_task(p)) { + if (p->on_rq) { struct migration_arg arg = { p, dest_cpu }; /* Need help from migration thread: drop lock and wait. */ task_rq_unlock(rq, p, &flags); -- cgit v1.1 From c55fa78b13b32d3f19e19cd0c8b9378fdc09e521 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Mon, 8 Nov 2010 14:13:35 -0500 Subject: xen/pci: Add xen_[find|register|unregister]_device_domain_owner functions. When the Xen PCI backend is told to enable or disable MSI/MSI-X functions, the initial domain performs these operations. The initial domain needs to know which domain (guest) is going to use the PCI device so when it makes the appropiate hypercall to retrieve the MSI/MSI-X vector it will also assign the PCI device to the appropiate domain (guest). This boils down to us needing a mechanism to find, set and unset the domain id that will be using the device. [v2: EXPORT_SYMBOL -> EXPORT_SYMBOL_GPL.] Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/include/asm/xen/pci.h | 16 +++++++++ arch/x86/pci/xen.c | 73 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 89 insertions(+) diff --git a/arch/x86/include/asm/xen/pci.h b/arch/x86/include/asm/xen/pci.h index aa86209..4fbda9a 100644 --- a/arch/x86/include/asm/xen/pci.h +++ b/arch/x86/include/asm/xen/pci.h @@ -15,10 +15,26 @@ static inline int pci_xen_hvm_init(void) #endif #if defined(CONFIG_XEN_DOM0) void __init xen_setup_pirqs(void); +int xen_find_device_domain_owner(struct pci_dev *dev); +int xen_register_device_domain_owner(struct pci_dev *dev, uint16_t domain); +int xen_unregister_device_domain_owner(struct pci_dev *dev); #else static inline void __init xen_setup_pirqs(void) { } +static inline int xen_find_device_domain_owner(struct pci_dev *dev) +{ + return -1; +} +static inline int xen_register_device_domain_owner(struct pci_dev *dev, + uint16_t domain) +{ + return -1; +} +static inline int xen_unregister_device_domain_owner(struct pci_dev *dev) +{ + return -1; +} #endif #if defined(CONFIG_PCI_MSI) diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index e37b407..6075f2d 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -461,3 +461,76 @@ void __init xen_setup_pirqs(void) } } #endif + +struct xen_device_domain_owner { + domid_t domain; + struct pci_dev *dev; + struct list_head list; +}; + +static DEFINE_SPINLOCK(dev_domain_list_spinlock); +static struct list_head dev_domain_list = LIST_HEAD_INIT(dev_domain_list); + +static struct xen_device_domain_owner *find_device(struct pci_dev *dev) +{ + struct xen_device_domain_owner *owner; + + list_for_each_entry(owner, &dev_domain_list, list) { + if (owner->dev == dev) + return owner; + } + return NULL; +} + +int xen_find_device_domain_owner(struct pci_dev *dev) +{ + struct xen_device_domain_owner *owner; + int domain = -ENODEV; + + spin_lock(&dev_domain_list_spinlock); + owner = find_device(dev); + if (owner) + domain = owner->domain; + spin_unlock(&dev_domain_list_spinlock); + return domain; +} +EXPORT_SYMBOL_GPL(xen_find_device_domain_owner); + +int xen_register_device_domain_owner(struct pci_dev *dev, uint16_t domain) +{ + struct xen_device_domain_owner *owner; + + owner = kzalloc(sizeof(struct xen_device_domain_owner), GFP_KERNEL); + if (!owner) + return -ENODEV; + + spin_lock(&dev_domain_list_spinlock); + if (find_device(dev)) { + spin_unlock(&dev_domain_list_spinlock); + kfree(owner); + return -EEXIST; + } + owner->domain = domain; + owner->dev = dev; + list_add_tail(&owner->list, &dev_domain_list); + spin_unlock(&dev_domain_list_spinlock); + return 0; +} +EXPORT_SYMBOL_GPL(xen_register_device_domain_owner); + +int xen_unregister_device_domain_owner(struct pci_dev *dev) +{ + struct xen_device_domain_owner *owner; + + spin_lock(&dev_domain_list_spinlock); + owner = find_device(dev); + if (!owner) { + spin_unlock(&dev_domain_list_spinlock); + return -ENODEV; + } + list_del(&owner->list); + spin_unlock(&dev_domain_list_spinlock); + kfree(owner); + return 0; +} +EXPORT_SYMBOL_GPL(xen_unregister_device_domain_owner); -- cgit v1.1 From beafbdc1df02877612dc9039c1de0639921fddec Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 14 Apr 2011 11:17:36 -0400 Subject: xen/irq: Check if the PCI device is owned by a domain different than DOMID_SELF. We check if there is a domain owner for the PCI device. In case of failure (meaning no domain has registered for this device) we make DOMID_SELF the owner. Signed-off-by: Konrad Rzeszutek Wilk [v2: deal with rebasing on v2.6.37-1] [v3: deal with rebasing on stable/irq.cleanup] [v4: deal with rebasing on stable/irq.ween_of_nr_irqs] [v5: deal with rebasing on v2.6.39-rc3] Signed-off-by: Jeremy Fitzhardinge Acked-by: Xiantao Zhang --- arch/x86/pci/xen.c | 21 ++++++++++++++++----- drivers/xen/events.c | 12 ++++++++---- include/xen/events.h | 3 ++- 3 files changed, 26 insertions(+), 10 deletions(-) diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index 6075f2d..393981f 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -108,7 +108,8 @@ static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) } irq = xen_bind_pirq_msi_to_irq(dev, msidesc, pirq, 0, (type == PCI_CAP_ID_MSIX) ? - "msi-x" : "msi"); + "msi-x" : "msi", + DOMID_SELF); if (irq < 0) goto error; dev_dbg(&dev->dev, @@ -148,7 +149,8 @@ static int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) irq = xen_bind_pirq_msi_to_irq(dev, msidesc, v[i], 0, (type == PCI_CAP_ID_MSIX) ? "pcifront-msi-x" : - "pcifront-msi"); + "pcifront-msi", + DOMID_SELF); if (irq < 0) goto free; i++; @@ -190,9 +192,16 @@ static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) list_for_each_entry(msidesc, &dev->msi_list, list) { struct physdev_map_pirq map_irq; + domid_t domid; + + domid = ret = xen_find_device_domain_owner(dev); + /* N.B. Casting int's -ENODEV to uint16_t results in 0xFFED, + * hence check ret value for < 0. */ + if (ret < 0) + domid = DOMID_SELF; memset(&map_irq, 0, sizeof(map_irq)); - map_irq.domid = DOMID_SELF; + map_irq.domid = domid; map_irq.type = MAP_PIRQ_TYPE_MSI; map_irq.index = -1; map_irq.pirq = -1; @@ -215,14 +224,16 @@ static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) ret = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq); if (ret) { - dev_warn(&dev->dev, "xen map irq failed %d\n", ret); + dev_warn(&dev->dev, "xen map irq failed %d for %d domain\n", + ret, domid); goto out; } ret = xen_bind_pirq_msi_to_irq(dev, msidesc, map_irq.pirq, map_irq.index, (type == PCI_CAP_ID_MSIX) ? - "msi-x" : "msi"); + "msi-x" : "msi", + domid); if (ret < 0) goto out; } diff --git a/drivers/xen/events.c b/drivers/xen/events.c index 42d6c93..ac0e228 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -101,6 +101,7 @@ struct irq_info unsigned short gsi; unsigned char vector; unsigned char flags; + uint16_t domid; } pirq; } u; }; @@ -184,6 +185,7 @@ static void xen_irq_info_pirq_init(unsigned irq, unsigned short pirq, unsigned short gsi, unsigned short vector, + uint16_t domid, unsigned char flags) { struct irq_info *info = info_for_irq(irq); @@ -193,6 +195,7 @@ static void xen_irq_info_pirq_init(unsigned irq, info->u.pirq.pirq = pirq; info->u.pirq.gsi = gsi; info->u.pirq.vector = vector; + info->u.pirq.domid = domid; info->u.pirq.flags = flags; } @@ -655,7 +658,7 @@ int xen_bind_pirq_gsi_to_irq(unsigned gsi, goto out; } - xen_irq_info_pirq_init(irq, 0, pirq, gsi, irq_op.vector, + xen_irq_info_pirq_init(irq, 0, pirq, gsi, irq_op.vector, DOMID_SELF, shareable ? PIRQ_SHAREABLE : 0); out: @@ -680,7 +683,8 @@ int xen_allocate_pirq_msi(struct pci_dev *dev, struct msi_desc *msidesc) } int xen_bind_pirq_msi_to_irq(struct pci_dev *dev, struct msi_desc *msidesc, - int pirq, int vector, const char *name) + int pirq, int vector, const char *name, + domid_t domid) { int irq, ret; @@ -693,7 +697,7 @@ int xen_bind_pirq_msi_to_irq(struct pci_dev *dev, struct msi_desc *msidesc, irq_set_chip_and_handler_name(irq, &xen_pirq_chip, handle_level_irq, name); - xen_irq_info_pirq_init(irq, 0, pirq, 0, vector, 0); + xen_irq_info_pirq_init(irq, 0, pirq, 0, vector, domid, 0); ret = irq_set_msi_desc(irq, msidesc); if (ret < 0) goto error_irq; @@ -722,7 +726,7 @@ int xen_destroy_irq(int irq) if (xen_initial_domain()) { unmap_irq.pirq = info->u.pirq.pirq; - unmap_irq.domid = DOMID_SELF; + unmap_irq.domid = info->u.pirq.domid; rc = HYPERVISOR_physdev_op(PHYSDEVOP_unmap_pirq, &unmap_irq); if (rc) { printk(KERN_WARNING "unmap irq failed %d\n", rc); diff --git a/include/xen/events.h b/include/xen/events.h index f1b87ad..9aecc0b 100644 --- a/include/xen/events.h +++ b/include/xen/events.h @@ -85,7 +85,8 @@ int xen_bind_pirq_gsi_to_irq(unsigned gsi, int xen_allocate_pirq_msi(struct pci_dev *dev, struct msi_desc *msidesc); /* Bind an PSI pirq to an irq. */ int xen_bind_pirq_msi_to_irq(struct pci_dev *dev, struct msi_desc *msidesc, - int pirq, int vector, const char *name); + int pirq, int vector, const char *name, + domid_t domid); #endif /* De-allocates the above mentioned physical interrupt. */ -- cgit v1.1 From c7c2c3a28657cfdcef50c02b18ccca3761209e17 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Mon, 8 Nov 2010 14:26:36 -0500 Subject: xen/irq: Add support to check if IRQ line is shared with other domains. We do this via the PHYSDEVOP_irq_status_query support hypervisor call. We will get a positive value if another domain has binded its PIRQ to the specified GSI (IRQ line). [v2: Deal with v2.6.37-rc1 rebase fallout] [v3: Deal with stable/irq.cleanup fallout] [v4: xen_ignore_irq->xen_test_irq_shared] Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/events.c | 12 ++++++++++++ include/xen/events.h | 3 +++ 2 files changed, 15 insertions(+) diff --git a/drivers/xen/events.c b/drivers/xen/events.c index ac0e228..0ac7a14 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -1508,6 +1508,18 @@ void xen_poll_irq(int irq) xen_poll_irq_timeout(irq, 0 /* no timeout */); } +/* Check whether the IRQ line is shared with other guests. */ +int xen_test_irq_shared(int irq) +{ + struct irq_info *info = info_for_irq(irq); + struct physdev_irq_status_query irq_status = { .irq = info->u.pirq.pirq }; + + if (HYPERVISOR_physdev_op(PHYSDEVOP_irq_status_query, &irq_status)) + return 0; + return !(irq_status.flags & XENIRQSTAT_shared); +} +EXPORT_SYMBOL_GPL(xen_test_irq_shared); + void xen_irq_resume(void) { unsigned int cpu, evtchn; diff --git a/include/xen/events.h b/include/xen/events.h index 9aecc0b..932e540 100644 --- a/include/xen/events.h +++ b/include/xen/events.h @@ -95,4 +95,7 @@ int xen_destroy_irq(int irq); /* Return irq from pirq */ int xen_irq_from_pirq(unsigned pirq); +/* Determine whether to ignore this IRQ if it is passed to a guest. */ +int xen_test_irq_shared(int irq); + #endif /* _XEN_EVENTS_H */ -- cgit v1.1 From e6197acc726ab3baa60375a5891d58c2ee87e0f3 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 24 Feb 2011 14:20:12 -0500 Subject: xen/irq: Export 'xen_pirq_from_irq' function. We need this to find the real Xen PIRQ value for a device that requests an MSI or MSI-X. In the past we used 'xen_gsi_from_irq' since that function would return an Xen PIRQ or GSI depending on the provided IRQ. Now that we have seperated that we need to use the correct function. [v2: Deal with rebase on stable/irq.cleanup] Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/events.c | 6 ++++++ include/xen/events.h | 3 +++ 2 files changed, 9 insertions(+) diff --git a/drivers/xen/events.c b/drivers/xen/events.c index 0ac7a14..e4e8e9a 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -763,6 +763,12 @@ out: return irq; } + +int xen_pirq_from_irq(unsigned irq) +{ + return pirq_from_irq(irq); +} +EXPORT_SYMBOL_GPL(xen_pirq_from_irq); int bind_evtchn_to_irq(unsigned int evtchn) { int irq; diff --git a/include/xen/events.h b/include/xen/events.h index 932e540..9af21e1 100644 --- a/include/xen/events.h +++ b/include/xen/events.h @@ -95,6 +95,9 @@ int xen_destroy_irq(int irq); /* Return irq from pirq */ int xen_irq_from_pirq(unsigned pirq); +/* Return the pirq allocated to the irq. */ +int xen_pirq_from_irq(unsigned irq); + /* Determine whether to ignore this IRQ if it is passed to a guest. */ int xen_test_irq_shared(int irq); -- cgit v1.1 From 1eff1ad0285038e309a81da4a004f071608309fb Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Wed, 16 Feb 2011 16:26:44 -0500 Subject: xen/irq: The Xen hypervisor cleans up the PIRQs if the other domain forgot. And if the other domain forgot to clean up its PIRQs we don't need to fail the operation. Just take a note of it and continue on. Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/events.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/xen/events.c b/drivers/xen/events.c index e4e8e9a..e51f3c5 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -728,7 +728,14 @@ int xen_destroy_irq(int irq) unmap_irq.pirq = info->u.pirq.pirq; unmap_irq.domid = info->u.pirq.domid; rc = HYPERVISOR_physdev_op(PHYSDEVOP_unmap_pirq, &unmap_irq); - if (rc) { + /* If another domain quits without making the pci_disable_msix + * call, the Xen hypervisor takes care of freeing the PIRQs + * (free_domain_pirqs). + */ + if ((rc == -ESRCH && info->u.pirq.domid != DOMID_SELF)) + printk(KERN_INFO "domain %d does not have %d anymore\n", + info->u.pirq.domid, info->u.pirq.pirq); + else if (rc) { printk(KERN_WARNING "unmap irq failed %d\n", rc); goto out; } -- cgit v1.1 From 140363500ddadad0c09cb512cc0c96a4d3efa053 Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Thu, 14 Apr 2011 23:50:57 -0400 Subject: iscsi_ibft: search for broadcom specific ibft sign (v2) Broadcom iscsi offload firmware uses a non standard ibft sign of "BIFT". When we added support for boot, the anaconda team and I were using older firmware (I guess 4 years old), so boot does not work on current cards. This patch modifies the ibft search code to search for "BIFT" along with the other possible values. Broadcom has tested the patch and reported it works with their firmware. Mike has tested Chelsio and Intel cards. [v2: - Add ACPI_SIG_IBFT to ibft_signs - replace break with goto in find_ibft_in_mem innner loop.] Signed-off-by: Mike Christie Signed-off-by: Peter Jones Signed-off-by: Konrad Rzeszutek Wilk --- drivers/firmware/iscsi_ibft_find.c | 51 ++++++++++++++++++++++++-------------- 1 file changed, 32 insertions(+), 19 deletions(-) diff --git a/drivers/firmware/iscsi_ibft_find.c b/drivers/firmware/iscsi_ibft_find.c index 2192456..f032e44 100644 --- a/drivers/firmware/iscsi_ibft_find.c +++ b/drivers/firmware/iscsi_ibft_find.c @@ -42,7 +42,20 @@ struct acpi_table_ibft *ibft_addr; EXPORT_SYMBOL_GPL(ibft_addr); -#define IBFT_SIGN "iBFT" +static const struct { + char *sign; +} ibft_signs[] = { +#ifdef CONFIG_ACPI + /* + * One spec says "IBFT", the other says "iBFT". We have to check + * for both. + */ + { ACPI_SIG_IBFT }, +#endif + { "iBFT" }, + { "BIFT" }, /* Broadcom iSCSI Offload */ +}; + #define IBFT_SIGN_LEN 4 #define IBFT_START 0x80000 /* 512kB */ #define IBFT_END 0x100000 /* 1MB */ @@ -62,6 +75,7 @@ static int __init find_ibft_in_mem(void) unsigned long pos; unsigned int len = 0; void *virt; + int i; for (pos = IBFT_START; pos < IBFT_END; pos += 16) { /* The table can't be inside the VGA BIOS reserved space, @@ -69,18 +83,23 @@ static int __init find_ibft_in_mem(void) if (pos == VGA_MEM) pos += VGA_SIZE; virt = isa_bus_to_virt(pos); - if (memcmp(virt, IBFT_SIGN, IBFT_SIGN_LEN) == 0) { - unsigned long *addr = - (unsigned long *)isa_bus_to_virt(pos + 4); - len = *addr; - /* if the length of the table extends past 1M, - * the table cannot be valid. */ - if (pos + len <= (IBFT_END-1)) { - ibft_addr = (struct acpi_table_ibft *)virt; - break; + + for (i = 0; i < ARRAY_SIZE(ibft_signs); i++) { + if (memcmp(virt, ibft_signs[i].sign, IBFT_SIGN_LEN) == + 0) { + unsigned long *addr = + (unsigned long *)isa_bus_to_virt(pos + 4); + len = *addr; + /* if the length of the table extends past 1M, + * the table cannot be valid. */ + if (pos + len <= (IBFT_END-1)) { + ibft_addr = (struct acpi_table_ibft *)virt; + goto done; + } } } } +done: return len; } /* @@ -89,18 +108,12 @@ static int __init find_ibft_in_mem(void) */ unsigned long __init find_ibft_region(unsigned long *sizep) { - + int i; ibft_addr = NULL; #ifdef CONFIG_ACPI - /* - * One spec says "IBFT", the other says "iBFT". We have to check - * for both. - */ - if (!ibft_addr) - acpi_table_parse(ACPI_SIG_IBFT, acpi_find_ibft); - if (!ibft_addr) - acpi_table_parse(IBFT_SIGN, acpi_find_ibft); + for (i = 0; i < ARRAY_SIZE(ibft_signs) && !ibft_addr; i++) + acpi_table_parse(ibft_signs[i].sign, acpi_find_ibft); #endif /* CONFIG_ACPI */ /* iBFT 1.03 section 1.4.3.1 mandates that UEFI machines will -- cgit v1.1 From cf8d91633ddef9e816ccbf3da833c79ce508988d Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Mon, 28 Feb 2011 17:58:48 -0500 Subject: xen/p2m/m2p/gnttab: Support GNTMAP_host_map in the M2P override. We only supported the M2P (and P2M) override only for the GNTMAP_contains_pte type mappings. Meaning that we grants operations would "contain the machine address of the PTE to update" If the flag is unset, then the grant operation is "contains a host virtual address". The latter case means that the Hypervisor takes care of updating our page table (specifically the PTE entry) with the guest's MFN. As such we should not try to do anything with the PTE. Previous to this patch we would try to clear the PTE which resulted in Xen hypervisor being upset with us: (XEN) mm.c:1066:d0 Attempt to implicitly unmap a granted PTE c0100000ccc59067 (XEN) domain_crash called from mm.c:1067 (XEN) Domain 0 (vcpu#0) crashed on cpu#3: (XEN) ----[ Xen-4.0-110228 x86_64 debug=y Not tainted ]---- and crashing us. This patch allows us to inhibit the PTE clearing in the PV guest if the GNTMAP_contains_pte is not set. On the m2p_remove_override path we provide the same parameter. Sadly in the grant-table driver we do not have a mechanism to tell m2p_remove_override whether to clear the PTE or not. Since the grant-table driver is used by user-space, we can safely assume that it operates only on PTE's. Hence the implementation for it to work on !GNTMAP_contains_pte returns -EOPNOTSUPP. In the future we can implement the support for this. It will require some extra accounting structure to keep track of the page[i], and the flag. [v1: Added documentation details, made it return -EOPNOTSUPP instead of trying to do a half-way implementation] Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/include/asm/xen/page.h | 5 +++-- arch/x86/xen/p2m.c | 10 ++++------ drivers/xen/grant-table.c | 31 ++++++++++++++++++++++++------- 3 files changed, 31 insertions(+), 15 deletions(-) diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index c61934f..64a619d 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -47,8 +47,9 @@ extern bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn); extern unsigned long set_phys_range_identity(unsigned long pfn_s, unsigned long pfn_e); -extern int m2p_add_override(unsigned long mfn, struct page *page); -extern int m2p_remove_override(struct page *page); +extern int m2p_add_override(unsigned long mfn, struct page *page, + bool clear_pte); +extern int m2p_remove_override(struct page *page, bool clear_pte); extern struct page *m2p_find_override(unsigned long mfn); extern unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn); diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 141eb0d..2d2b32af 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -650,7 +650,7 @@ static unsigned long mfn_hash(unsigned long mfn) } /* Add an MFN override for a particular page */ -int m2p_add_override(unsigned long mfn, struct page *page) +int m2p_add_override(unsigned long mfn, struct page *page, bool clear_pte) { unsigned long flags; unsigned long pfn; @@ -662,7 +662,6 @@ int m2p_add_override(unsigned long mfn, struct page *page) if (!PageHighMem(page)) { address = (unsigned long)__va(pfn << PAGE_SHIFT); ptep = lookup_address(address, &level); - if (WARN(ptep == NULL || level != PG_LEVEL_4K, "m2p_add_override: pfn %lx not mapped", pfn)) return -EINVAL; @@ -674,10 +673,9 @@ int m2p_add_override(unsigned long mfn, struct page *page) if (unlikely(!set_phys_to_machine(pfn, FOREIGN_FRAME(mfn)))) return -ENOMEM; - if (!PageHighMem(page)) + if (clear_pte && !PageHighMem(page)) /* Just zap old mapping for now */ pte_clear(&init_mm, address, ptep); - spin_lock_irqsave(&m2p_override_lock, flags); list_add(&page->lru, &m2p_overrides[mfn_hash(mfn)]); spin_unlock_irqrestore(&m2p_override_lock, flags); @@ -685,7 +683,7 @@ int m2p_add_override(unsigned long mfn, struct page *page) return 0; } -int m2p_remove_override(struct page *page) +int m2p_remove_override(struct page *page, bool clear_pte) { unsigned long flags; unsigned long mfn; @@ -713,7 +711,7 @@ int m2p_remove_override(struct page *page) spin_unlock_irqrestore(&m2p_override_lock, flags); set_phys_to_machine(pfn, page->index); - if (!PageHighMem(page)) + if (clear_pte && !PageHighMem(page)) set_pte_at(&init_mm, address, ptep, pfn_pte(pfn, PAGE_KERNEL)); /* No tlb flush necessary because the caller already diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index 3745a31..fd725cd 100644 --- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -466,13 +466,30 @@ int gnttab_map_refs(struct gnttab_map_grant_ref *map_ops, if (map_ops[i].status) continue; - /* m2p override only supported for GNTMAP_contains_pte mappings */ - if (!(map_ops[i].flags & GNTMAP_contains_pte)) - continue; - pte = (pte_t *) (mfn_to_virt(PFN_DOWN(map_ops[i].host_addr)) + + if (map_ops[i].flags & GNTMAP_contains_pte) { + pte = (pte_t *) (mfn_to_virt(PFN_DOWN(map_ops[i].host_addr)) + (map_ops[i].host_addr & ~PAGE_MASK)); - mfn = pte_mfn(*pte); - ret = m2p_add_override(mfn, pages[i]); + mfn = pte_mfn(*pte); + } else { + /* If you really wanted to do this: + * mfn = PFN_DOWN(map_ops[i].dev_bus_addr); + * + * The reason we do not implement it is b/c on the + * unmap path (gnttab_unmap_refs) we have no means of + * checking whether the page is !GNTMAP_contains_pte. + * + * That is without some extra data-structure to carry + * the struct page, bool clear_pte, and list_head next + * tuples and deal with allocation/delallocation, etc. + * + * The users of this API set the GNTMAP_contains_pte + * flag so lets just return not supported until it + * becomes neccessary to implement. + */ + return -EOPNOTSUPP; + } + ret = m2p_add_override(mfn, pages[i], + map_ops[i].flags & GNTMAP_contains_pte); if (ret) return ret; } @@ -494,7 +511,7 @@ int gnttab_unmap_refs(struct gnttab_unmap_grant_ref *unmap_ops, return ret; for (i = 0; i < count; i++) { - ret = m2p_remove_override(pages[i]); + ret = m2p_remove_override(pages[i], true /* clear the PTE */); if (ret) return ret; } -- cgit v1.1 From 7caa2316bf0434f1150f58cb576542987a0466d7 Mon Sep 17 00:00:00 2001 From: Daniel Halperin Date: Wed, 6 Apr 2011 12:47:25 -0700 Subject: iwlwifi: fix frame injection for HT channels For some reason, sending QoS configuration causes transmission to stop after a single frame on HT channels when not associated. Removing the extra QoS configuration has no effect on station mode, and fixes injection mode. Signed-off-by: Daniel Halperin Signed-off-by: Wey-Yi Guy --- drivers/net/wireless/iwlwifi/iwl-agn-rxon.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/drivers/net/wireless/iwlwifi/iwl-agn-rxon.c b/drivers/net/wireless/iwlwifi/iwl-agn-rxon.c index dfdbea6..fbbde071 100644 --- a/drivers/net/wireless/iwlwifi/iwl-agn-rxon.c +++ b/drivers/net/wireless/iwlwifi/iwl-agn-rxon.c @@ -335,7 +335,6 @@ int iwlagn_mac_config(struct ieee80211_hw *hw, u32 changed) struct ieee80211_channel *channel = conf->channel; const struct iwl_channel_info *ch_info; int ret = 0; - bool ht_changed[NUM_IWL_RXON_CTX] = {}; IWL_DEBUG_MAC80211(priv, "changed %#x", changed); @@ -383,10 +382,8 @@ int iwlagn_mac_config(struct ieee80211_hw *hw, u32 changed) for_each_context(priv, ctx) { /* Configure HT40 channels */ - if (ctx->ht.enabled != conf_is_ht(conf)) { + if (ctx->ht.enabled != conf_is_ht(conf)) ctx->ht.enabled = conf_is_ht(conf); - ht_changed[ctx->ctxid] = true; - } if (ctx->ht.enabled) { if (conf_is_ht40_minus(conf)) { @@ -455,8 +452,6 @@ int iwlagn_mac_config(struct ieee80211_hw *hw, u32 changed) if (!memcmp(&ctx->staging, &ctx->active, sizeof(ctx->staging))) continue; iwlagn_commit_rxon(priv, ctx); - if (ht_changed[ctx->ctxid]) - iwlagn_update_qos(priv, ctx); } out: mutex_unlock(&priv->mutex); -- cgit v1.1 From c387aa3a1a910ce00b86f3a85082d24f144db256 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 18 Apr 2011 15:45:43 +0200 Subject: x86, gart: Don't enforce GART aperture lower-bound by alignment This patch changes the allocation of the GART aperture to enforce only natural alignment instead of aligning it on 512MB. This big alignment was used to force the GART aperture to be over 512MB. This is enforced by using 512MB as the lower-bound address in the allocation range. [ hpa: The actual number 512 MiB needs to be revisited, too. ] Cc: Yinghai Lu Signed-off-by: Joerg Roedel Link: http://lkml.kernel.org/r/1303134346-5805-2-git-send-email-joerg.roedel@amd.com Signed-off-by: H. Peter Anvin --- arch/x86/kernel/aperture_64.c | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 73fb469..3d2661c 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -30,6 +30,22 @@ #include #include +/* + * Using 512M as goal, in case kexec will load kernel_big + * that will do the on-position decompress, and could overlap with + * with the gart aperture that is used. + * Sequence: + * kernel_small + * ==> kexec (with kdump trigger path or gart still enabled) + * ==> kernel_small (gart area become e820_reserved) + * ==> kexec (with kdump trigger path or gart still enabled) + * ==> kerne_big (uncompressed size will be big than 64M or 128M) + * So don't use 512M below as gart iommu, leave the space for kernel + * code for safe. + */ +#define GART_MIN_ADDR (512ULL << 20) +#define GART_MAX_ADDR (1ULL << 32) + int gart_iommu_aperture; int gart_iommu_aperture_disabled __initdata; int gart_iommu_aperture_allowed __initdata; @@ -70,21 +86,9 @@ static u32 __init allocate_aperture(void) * memory. Unfortunately we cannot move it up because that would * make the IOMMU useless. */ - /* - * using 512M as goal, in case kexec will load kernel_big - * that will do the on position decompress, and could overlap with - * that position with gart that is used. - * sequende: - * kernel_small - * ==> kexec (with kdump trigger path or previous doesn't shutdown gart) - * ==> kernel_small(gart area become e820_reserved) - * ==> kexec (with kdump trigger path or previous doesn't shutdown gart) - * ==> kerne_big (uncompressed size will be big than 64M or 128M) - * so don't use 512M below as gart iommu, leave the space for kernel - * code for safe - */ - addr = memblock_find_in_range(0, 1ULL<<32, aper_size, 512ULL<<20); - if (addr == MEMBLOCK_ERROR || addr + aper_size > 0xffffffff) { + addr = memblock_find_in_range(GART_MIN_ADDR, GART_MAX_ADDR, + aper_size, aper_size); + if (addr == MEMBLOCK_ERROR || addr + aper_size > GART_MAX_ADDR) { printk(KERN_ERR "Cannot allocate aperture memory hole (%lx,%uK)\n", addr, aper_size>>10); -- cgit v1.1 From bcb22a94f68cde70e820dc7280d1c731e8e695f7 Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Sun, 3 Apr 2011 22:47:23 -0300 Subject: ARM: mx5/mx53_loco: Fix build warning related to gpio_keys_button structure Fix the following warning: CC arch/arm/mach-mx5/board-mx53_loco.o arch/arm/mach-mx5/board-mx53_loco.c:203: warning: initialization discards qualifiers from pointer target type Signed-off-by: Fabio Estevam Signed-off-by: Sascha Hauer --- arch/arm/mach-mx5/board-mx53_loco.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/mach-mx5/board-mx53_loco.c b/arch/arm/mach-mx5/board-mx53_loco.c index 10a1bea..6206b11 100644 --- a/arch/arm/mach-mx5/board-mx53_loco.c +++ b/arch/arm/mach-mx5/board-mx53_loco.c @@ -193,7 +193,7 @@ static iomux_v3_cfg_t mx53_loco_pads[] = { .wakeup = wake, \ } -static const struct gpio_keys_button loco_buttons[] __initconst = { +static struct gpio_keys_button loco_buttons[] = { GPIO_BUTTON(MX53_LOCO_POWER, KEY_POWER, 1, "power", 0), GPIO_BUTTON(MX53_LOCO_UI1, KEY_VOLUMEUP, 1, "volume-up", 0), GPIO_BUTTON(MX53_LOCO_UI2, KEY_VOLUMEDOWN, 1, "volume-down", 0), -- cgit v1.1 From 7a74aeb022b34a8fa8ef00545e66cf0568b5ddf6 Mon Sep 17 00:00:00 2001 From: Ville Tervo Date: Thu, 7 Apr 2011 14:59:50 +0300 Subject: Bluetooth: Fix refcount balance for hci connection hci_io_capa_reply_evt() holds reference for hciconnection. It's useless since hci_io_capa_request_evt()/hci_simple_pair_complete_evt() already protects the connection. In addition it leaves connection open after failed SSP pairing. Signed-off-by: Ville Tervo Signed-off-by: Gustavo F. Padovan --- net/bluetooth/hci_event.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index cebe7588..b257015 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -2387,8 +2387,6 @@ static inline void hci_io_capa_reply_evt(struct hci_dev *hdev, struct sk_buff *s if (!conn) goto unlock; - hci_conn_hold(conn); - conn->remote_cap = ev->capability; conn->remote_oob = ev->oob_data; conn->remote_auth = ev->authentication; -- cgit v1.1 From b79f44c16a4e2181b1d6423afe746745d5e949ff Mon Sep 17 00:00:00 2001 From: Vinicius Costa Gomes Date: Mon, 11 Apr 2011 18:46:55 -0300 Subject: Bluetooth: Fix keeping the command timer running In the teardown path the reset command is sent to the controller, this event causes the command timer to be reactivated. So the timer is removed in two situations, when the adapter isn't marked as UP and when we know that some command has been sent. Reported-by: Keith Packard Signed-off-by: Vinicius Costa Gomes Signed-off-by: Gustavo F. Padovan --- net/bluetooth/hci_core.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 2216620..e7dced9 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -587,10 +587,8 @@ static int hci_dev_do_close(struct hci_dev *hdev) hci_req_cancel(hdev, ENODEV); hci_req_lock(hdev); - /* Stop timer, it might be running */ - del_timer_sync(&hdev->cmd_timer); - if (!test_and_clear_bit(HCI_UP, &hdev->flags)) { + del_timer_sync(&hdev->cmd_timer); hci_req_unlock(hdev); return 0; } @@ -629,6 +627,7 @@ static int hci_dev_do_close(struct hci_dev *hdev) /* Drop last sent command */ if (hdev->sent_cmd) { + del_timer_sync(&hdev->cmd_timer); kfree_skb(hdev->sent_cmd); hdev->sent_cmd = NULL; } -- cgit v1.1 From f21ca5fff6e548833fa5ee8867239a8378623150 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Fri, 8 Apr 2011 17:10:41 +0300 Subject: Bluetooth: fix shutdown on SCO sockets shutdown should wait for SCO link to be properly disconnected before detroying the socket, otherwise an application using the socket may assume link is properly disconnected before it really happens which can be a problem when e.g synchronizing profile switch. Signed-off-by: Luiz Augusto von Dentz Signed-off-by: Gustavo F. Padovan --- net/bluetooth/sco.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index 42fdffd..94954c7 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -369,6 +369,15 @@ static void __sco_sock_close(struct sock *sk) case BT_CONNECTED: case BT_CONFIG: + if (sco_pi(sk)->conn) { + sk->sk_state = BT_DISCONN; + sco_sock_set_timer(sk, SCO_DISCONN_TIMEOUT); + hci_conn_put(sco_pi(sk)->conn->hcon); + sco_pi(sk)->conn = NULL; + } else + sco_chan_del(sk, ECONNRESET); + break; + case BT_CONNECT: case BT_DISCONN: sco_chan_del(sk, ECONNRESET); -- cgit v1.1 From a429b51930e64dd355840c37251a563000d7c10b Mon Sep 17 00:00:00 2001 From: Ruiyi Zhang Date: Mon, 18 Apr 2011 11:04:30 +0800 Subject: Bluetooth: Only keeping SAR bits when retransmitting one frame. When retrasmitting one frame, only SAR bits in control field should be kept. Signed-off-by: Ruiyi Zhang Signed-off-by: Gustavo F. Padovan --- net/bluetooth/l2cap_core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index ca27f3a..2c8dd44 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -1051,6 +1051,7 @@ static void l2cap_retransmit_one_frame(struct sock *sk, u8 tx_seq) tx_skb = skb_clone(skb, GFP_ATOMIC); bt_cb(skb)->retries++; control = get_unaligned_le16(tx_skb->data + L2CAP_HDR_SIZE); + control &= L2CAP_CTRL_SAR; if (pi->conn_state & L2CAP_CONN_SEND_FBIT) { control |= L2CAP_CTRL_FINAL; -- cgit v1.1 From b1e7734f024c9ce4393016a97c8d821e1f18d9b4 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 18 Apr 2011 15:18:02 -0700 Subject: x86, percpu: Use ASM_NOP4 instead of hardcoding P6_NOP4 For use in assembly constants, use the ASM_NOP* defines. Signed-off-by: H. Peter Anvin Cc: Christoph Lameter Cc: Tejun Heo Link: http://lkml.kernel.org/r/1303166160-10315-2-git-send-email-hpa@linux.intel.com --- arch/x86/include/asm/percpu.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index d475b43..751e7f3 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -517,7 +517,7 @@ do { \ typeof(o2) __o2 = o2; \ typeof(o2) __n2 = n2; \ typeof(o2) __dummy; \ - alternative_io("call this_cpu_cmpxchg16b_emu\n\t" P6_NOP4, \ + alternative_io("call this_cpu_cmpxchg16b_emu\n\t" ASM_NOP4, \ "cmpxchg16b " __percpu_prefix "(%%rsi)\n\tsetz %0\n\t", \ X86_FEATURE_CX16, \ ASM_OUTPUT2("=a"(__ret), "=d"(__dummy)), \ -- cgit v1.1 From dc326fca2b640fc41aed7c015d0f456935a66255 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 18 Apr 2011 15:19:51 -0700 Subject: x86, cpu: Clean up and unify the NOP selection infrastructure Clean up and unify the NOP selection infrastructure: - Make the atomic 5-byte NOP a part of the selection system. - Pick NOPs once during early boot and then be done with it. Signed-off-by: H. Peter Anvin Cc: Tejun Heo Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Jason Baron Link: http://lkml.kernel.org/r/1303166160-10315-3-git-send-email-hpa@linux.intel.com --- arch/x86/include/asm/alternative.h | 8 -- arch/x86/include/asm/nops.h | 146 ++++++++++++++++------------- arch/x86/kernel/alternative.c | 182 ++++++++++++++++++++----------------- arch/x86/kernel/ftrace.c | 4 +- arch/x86/kernel/jump_label.c | 5 +- arch/x86/kernel/setup.c | 6 +- 6 files changed, 190 insertions(+), 161 deletions(-) diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 13009d1..7da1682 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -191,12 +191,4 @@ extern void *text_poke(void *addr, const void *opcode, size_t len); extern void *text_poke_smp(void *addr, const void *opcode, size_t len); extern void text_poke_smp_batch(struct text_poke_param *params, int n); -#if defined(CONFIG_DYNAMIC_FTRACE) || defined(HAVE_JUMP_LABEL) -#define IDEAL_NOP_SIZE_5 5 -extern unsigned char ideal_nop5[IDEAL_NOP_SIZE_5]; -extern void arch_init_ideal_nop5(void); -#else -static inline void arch_init_ideal_nop5(void) {} -#endif - #endif /* _ASM_X86_ALTERNATIVE_H */ diff --git a/arch/x86/include/asm/nops.h b/arch/x86/include/asm/nops.h index af78849..405b4032 100644 --- a/arch/x86/include/asm/nops.h +++ b/arch/x86/include/asm/nops.h @@ -1,7 +1,13 @@ #ifndef _ASM_X86_NOPS_H #define _ASM_X86_NOPS_H -/* Define nops for use with alternative() */ +/* + * Define nops for use with alternative() and for tracing. + * + * *_NOP5_ATOMIC must be a single instruction. + */ + +#define NOP_DS_PREFIX 0x3e /* generic versions from gas 1: nop @@ -13,14 +19,15 @@ 6: leal 0x00000000(%esi),%esi 7: leal 0x00000000(,%esi,1),%esi */ -#define GENERIC_NOP1 ".byte 0x90\n" -#define GENERIC_NOP2 ".byte 0x89,0xf6\n" -#define GENERIC_NOP3 ".byte 0x8d,0x76,0x00\n" -#define GENERIC_NOP4 ".byte 0x8d,0x74,0x26,0x00\n" -#define GENERIC_NOP5 GENERIC_NOP1 GENERIC_NOP4 -#define GENERIC_NOP6 ".byte 0x8d,0xb6,0x00,0x00,0x00,0x00\n" -#define GENERIC_NOP7 ".byte 0x8d,0xb4,0x26,0x00,0x00,0x00,0x00\n" -#define GENERIC_NOP8 GENERIC_NOP1 GENERIC_NOP7 +#define GENERIC_NOP1 0x90 +#define GENERIC_NOP2 0x89,0xf6 +#define GENERIC_NOP3 0x8d,0x76,0x00 +#define GENERIC_NOP4 0x8d,0x74,0x26,0x00 +#define GENERIC_NOP5 GENERIC_NOP1,GENERIC_NOP4 +#define GENERIC_NOP6 0x8d,0xb6,0x00,0x00,0x00,0x00 +#define GENERIC_NOP7 0x8d,0xb4,0x26,0x00,0x00,0x00,0x00 +#define GENERIC_NOP8 GENERIC_NOP1,GENERIC_NOP7 +#define GENERIC_NOP5_ATOMIC NOP_DS_PREFIX,GENERIC_NOP4 /* Opteron 64bit nops 1: nop @@ -29,13 +36,14 @@ 4: osp osp osp nop */ #define K8_NOP1 GENERIC_NOP1 -#define K8_NOP2 ".byte 0x66,0x90\n" -#define K8_NOP3 ".byte 0x66,0x66,0x90\n" -#define K8_NOP4 ".byte 0x66,0x66,0x66,0x90\n" -#define K8_NOP5 K8_NOP3 K8_NOP2 -#define K8_NOP6 K8_NOP3 K8_NOP3 -#define K8_NOP7 K8_NOP4 K8_NOP3 -#define K8_NOP8 K8_NOP4 K8_NOP4 +#define K8_NOP2 0x66,K8_NOP1 +#define K8_NOP3 0x66,K8_NOP2 +#define K8_NOP4 0x66,K8_NOP3 +#define K8_NOP5 K8_NOP3,K8_NOP2 +#define K8_NOP6 K8_NOP3,K8_NOP3 +#define K8_NOP7 K8_NOP4,K8_NOP3 +#define K8_NOP8 K8_NOP4,K8_NOP4 +#define K8_NOP5_ATOMIC 0x66,K8_NOP4 /* K7 nops uses eax dependencies (arbitrary choice) @@ -47,13 +55,14 @@ 7: leal 0x00000000(,%eax,1),%eax */ #define K7_NOP1 GENERIC_NOP1 -#define K7_NOP2 ".byte 0x8b,0xc0\n" -#define K7_NOP3 ".byte 0x8d,0x04,0x20\n" -#define K7_NOP4 ".byte 0x8d,0x44,0x20,0x00\n" -#define K7_NOP5 K7_NOP4 ASM_NOP1 -#define K7_NOP6 ".byte 0x8d,0x80,0,0,0,0\n" -#define K7_NOP7 ".byte 0x8D,0x04,0x05,0,0,0,0\n" -#define K7_NOP8 K7_NOP7 ASM_NOP1 +#define K7_NOP2 0x8b,0xc0 +#define K7_NOP3 0x8d,0x04,0x20 +#define K7_NOP4 0x8d,0x44,0x20,0x00 +#define K7_NOP5 K7_NOP4,K7_NOP1 +#define K7_NOP6 0x8d,0x80,0,0,0,0 +#define K7_NOP7 0x8D,0x04,0x05,0,0,0,0 +#define K7_NOP8 K7_NOP7,K7_NOP1 +#define K7_NOP5_ATOMIC NOP_DS_PREFIX,K7_NOP4 /* P6 nops uses eax dependencies (Intel-recommended choice) @@ -69,52 +78,65 @@ There is kernel code that depends on this. */ #define P6_NOP1 GENERIC_NOP1 -#define P6_NOP2 ".byte 0x66,0x90\n" -#define P6_NOP3 ".byte 0x0f,0x1f,0x00\n" -#define P6_NOP4 ".byte 0x0f,0x1f,0x40,0\n" -#define P6_NOP5 ".byte 0x0f,0x1f,0x44,0x00,0\n" -#define P6_NOP6 ".byte 0x66,0x0f,0x1f,0x44,0x00,0\n" -#define P6_NOP7 ".byte 0x0f,0x1f,0x80,0,0,0,0\n" -#define P6_NOP8 ".byte 0x0f,0x1f,0x84,0x00,0,0,0,0\n" +#define P6_NOP2 0x66,0x90 +#define P6_NOP3 0x0f,0x1f,0x00 +#define P6_NOP4 0x0f,0x1f,0x40,0 +#define P6_NOP5 0x0f,0x1f,0x44,0x00,0 +#define P6_NOP6 0x66,0x0f,0x1f,0x44,0x00,0 +#define P6_NOP7 0x0f,0x1f,0x80,0,0,0,0 +#define P6_NOP8 0x0f,0x1f,0x84,0x00,0,0,0,0 +#define P6_NOP5_ATOMIC P6_NOP5 + +#define _ASM_MK_NOP(x) ".byte " __stringify(x) "\n" #if defined(CONFIG_MK7) -#define ASM_NOP1 K7_NOP1 -#define ASM_NOP2 K7_NOP2 -#define ASM_NOP3 K7_NOP3 -#define ASM_NOP4 K7_NOP4 -#define ASM_NOP5 K7_NOP5 -#define ASM_NOP6 K7_NOP6 -#define ASM_NOP7 K7_NOP7 -#define ASM_NOP8 K7_NOP8 +#define ASM_NOP1 _ASM_MK_NOP(K7_NOP1) +#define ASM_NOP2 _ASM_MK_NOP(K7_NOP2) +#define ASM_NOP3 _ASM_MK_NOP(K7_NOP3) +#define ASM_NOP4 _ASM_MK_NOP(K7_NOP4) +#define ASM_NOP5 _ASM_MK_NOP(K7_NOP5) +#define ASM_NOP6 _ASM_MK_NOP(K7_NOP6) +#define ASM_NOP7 _ASM_MK_NOP(K7_NOP7) +#define ASM_NOP8 _ASM_MK_NOP(K7_NOP8) +#define ASM_NOP5_ATOMIC _ASM_MK_NOP(K7_NOP5_ATOMIC) #elif defined(CONFIG_X86_P6_NOP) -#define ASM_NOP1 P6_NOP1 -#define ASM_NOP2 P6_NOP2 -#define ASM_NOP3 P6_NOP3 -#define ASM_NOP4 P6_NOP4 -#define ASM_NOP5 P6_NOP5 -#define ASM_NOP6 P6_NOP6 -#define ASM_NOP7 P6_NOP7 -#define ASM_NOP8 P6_NOP8 +#define ASM_NOP1 _ASM_MK_NOP(P6_NOP1) +#define ASM_NOP2 _ASM_MK_NOP(P6_NOP2) +#define ASM_NOP3 _ASM_MK_NOP(P6_NOP3) +#define ASM_NOP4 _ASM_MK_NOP(P6_NOP4) +#define ASM_NOP5 _ASM_MK_NOP(P6_NOP5) +#define ASM_NOP6 _ASM_MK_NOP(P6_NOP6) +#define ASM_NOP7 _ASM_MK_NOP(P6_NOP7) +#define ASM_NOP8 _ASM_MK_NOP(P6_NOP8) +#define ASM_NOP5_ATOMIC _ASM_MK_NOP(P6_NOP5_ATOMIC) #elif defined(CONFIG_X86_64) -#define ASM_NOP1 K8_NOP1 -#define ASM_NOP2 K8_NOP2 -#define ASM_NOP3 K8_NOP3 -#define ASM_NOP4 K8_NOP4 -#define ASM_NOP5 K8_NOP5 -#define ASM_NOP6 K8_NOP6 -#define ASM_NOP7 K8_NOP7 -#define ASM_NOP8 K8_NOP8 +#define ASM_NOP1 _ASM_MK_NOP(K8_NOP1) +#define ASM_NOP2 _ASM_MK_NOP(K8_NOP2) +#define ASM_NOP3 _ASM_MK_NOP(K8_NOP3) +#define ASM_NOP4 _ASM_MK_NOP(K8_NOP4) +#define ASM_NOP5 _ASM_MK_NOP(K8_NOP5) +#define ASM_NOP6 _ASM_MK_NOP(K8_NOP6) +#define ASM_NOP7 _ASM_MK_NOP(K8_NOP7) +#define ASM_NOP8 _ASM_MK_NOP(K8_NOP8) +#define ASM_NOP5_ATOMIC _ASM_MK_NOP(K8_NOP5_ATOMIC) #else -#define ASM_NOP1 GENERIC_NOP1 -#define ASM_NOP2 GENERIC_NOP2 -#define ASM_NOP3 GENERIC_NOP3 -#define ASM_NOP4 GENERIC_NOP4 -#define ASM_NOP5 GENERIC_NOP5 -#define ASM_NOP6 GENERIC_NOP6 -#define ASM_NOP7 GENERIC_NOP7 -#define ASM_NOP8 GENERIC_NOP8 +#define ASM_NOP1 _ASM_MK_NOP(GENERIC_NOP1) +#define ASM_NOP2 _ASM_MK_NOP(GENERIC_NOP2) +#define ASM_NOP3 _ASM_MK_NOP(GENERIC_NOP3) +#define ASM_NOP4 _ASM_MK_NOP(GENERIC_NOP4) +#define ASM_NOP5 _ASM_MK_NOP(GENERIC_NOP5) +#define ASM_NOP6 _ASM_MK_NOP(GENERIC_NOP6) +#define ASM_NOP7 _ASM_MK_NOP(GENERIC_NOP7) +#define ASM_NOP8 _ASM_MK_NOP(GENERIC_NOP8) +#define ASM_NOP5_ATOMIC _ASM_MK_NOP(GENERIC_NOP5_ATOMIC) #endif #define ASM_NOP_MAX 8 +#define NOP_ATOMIC5 (ASM_NOP_MAX+1) /* Entry for the 5-byte atomic NOP */ + +#ifndef __ASSEMBLY__ +extern const unsigned char * const *ideal_nops; +extern void arch_init_ideal_nops(void); +#endif #endif /* _ASM_X86_NOPS_H */ diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 4a23467..846f61e 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -67,17 +67,30 @@ __setup("noreplace-paravirt", setup_noreplace_paravirt); #define DPRINTK(fmt, args...) if (debug_alternative) \ printk(KERN_DEBUG fmt, args) +/* + * Each GENERIC_NOPX is of X bytes, and defined as an array of bytes + * that correspond to that nop. Getting from one nop to the next, we + * add to the array the offset that is equal to the sum of all sizes of + * nops preceding the one we are after. + * + * Note: The GENERIC_NOP5_ATOMIC is at the end, as it breaks the + * nice symmetry of sizes of the previous nops. + */ #if defined(GENERIC_NOP1) && !defined(CONFIG_X86_64) -/* Use inline assembly to define this because the nops are defined - as inline assembly strings in the include files and we cannot - get them easily into strings. */ -asm("\t" __stringify(__INITRODATA_OR_MODULE) "\nintelnops: " - GENERIC_NOP1 GENERIC_NOP2 GENERIC_NOP3 GENERIC_NOP4 GENERIC_NOP5 GENERIC_NOP6 - GENERIC_NOP7 GENERIC_NOP8 - "\t.previous"); -extern const unsigned char intelnops[]; -static const unsigned char *const __initconst_or_module -intel_nops[ASM_NOP_MAX+1] = { +static const unsigned char intelnops[] = +{ + GENERIC_NOP1, + GENERIC_NOP2, + GENERIC_NOP3, + GENERIC_NOP4, + GENERIC_NOP5, + GENERIC_NOP6, + GENERIC_NOP7, + GENERIC_NOP8, + GENERIC_NOP5_ATOMIC +}; +static const unsigned char * const intel_nops[ASM_NOP_MAX+2] = +{ NULL, intelnops, intelnops + 1, @@ -87,17 +100,25 @@ intel_nops[ASM_NOP_MAX+1] = { intelnops + 1 + 2 + 3 + 4 + 5, intelnops + 1 + 2 + 3 + 4 + 5 + 6, intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7, + intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8, }; #endif #ifdef K8_NOP1 -asm("\t" __stringify(__INITRODATA_OR_MODULE) "\nk8nops: " - K8_NOP1 K8_NOP2 K8_NOP3 K8_NOP4 K8_NOP5 K8_NOP6 - K8_NOP7 K8_NOP8 - "\t.previous"); -extern const unsigned char k8nops[]; -static const unsigned char *const __initconst_or_module -k8_nops[ASM_NOP_MAX+1] = { +static const unsigned char k8nops[] = +{ + K8_NOP1, + K8_NOP2, + K8_NOP3, + K8_NOP4, + K8_NOP5, + K8_NOP6, + K8_NOP7, + K8_NOP8, + K8_NOP5_ATOMIC +}; +static const unsigned char * const k8_nops[ASM_NOP_MAX+2] = +{ NULL, k8nops, k8nops + 1, @@ -107,17 +128,25 @@ k8_nops[ASM_NOP_MAX+1] = { k8nops + 1 + 2 + 3 + 4 + 5, k8nops + 1 + 2 + 3 + 4 + 5 + 6, k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7, + k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8, }; #endif #if defined(K7_NOP1) && !defined(CONFIG_X86_64) -asm("\t" __stringify(__INITRODATA_OR_MODULE) "\nk7nops: " - K7_NOP1 K7_NOP2 K7_NOP3 K7_NOP4 K7_NOP5 K7_NOP6 - K7_NOP7 K7_NOP8 - "\t.previous"); -extern const unsigned char k7nops[]; -static const unsigned char *const __initconst_or_module -k7_nops[ASM_NOP_MAX+1] = { +static const unsigned char k7nops[] = +{ + K7_NOP1, + K7_NOP2, + K7_NOP3, + K7_NOP4, + K7_NOP5, + K7_NOP6, + K7_NOP7, + K7_NOP8, + K7_NOP5_ATOMIC +}; +static const unsigned char * const k7_nops[ASM_NOP_MAX+2] = +{ NULL, k7nops, k7nops + 1, @@ -127,17 +156,25 @@ k7_nops[ASM_NOP_MAX+1] = { k7nops + 1 + 2 + 3 + 4 + 5, k7nops + 1 + 2 + 3 + 4 + 5 + 6, k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7, + k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8, }; #endif #ifdef P6_NOP1 -asm("\t" __stringify(__INITRODATA_OR_MODULE) "\np6nops: " - P6_NOP1 P6_NOP2 P6_NOP3 P6_NOP4 P6_NOP5 P6_NOP6 - P6_NOP7 P6_NOP8 - "\t.previous"); -extern const unsigned char p6nops[]; -static const unsigned char *const __initconst_or_module -p6_nops[ASM_NOP_MAX+1] = { +static const unsigned char __initconst_or_module p6nops[] = +{ + P6_NOP1, + P6_NOP2, + P6_NOP3, + P6_NOP4, + P6_NOP5, + P6_NOP6, + P6_NOP7, + P6_NOP8, + P6_NOP5_ATOMIC +}; +static const unsigned char * const p6_nops[ASM_NOP_MAX+2] = +{ NULL, p6nops, p6nops + 1, @@ -147,47 +184,53 @@ p6_nops[ASM_NOP_MAX+1] = { p6nops + 1 + 2 + 3 + 4 + 5, p6nops + 1 + 2 + 3 + 4 + 5 + 6, p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7, + p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8, }; #endif +/* Initialize these to a safe default */ #ifdef CONFIG_X86_64 +const unsigned char * const *ideal_nops = p6_nops; +#else +const unsigned char * const *ideal_nops = intel_nops; +#endif -extern char __vsyscall_0; -static const unsigned char *const *__init_or_module find_nop_table(void) +void __init arch_init_ideal_nops(void) { - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && - boot_cpu_has(X86_FEATURE_NOPL)) - return p6_nops; - else - return k8_nops; -} - -#else /* CONFIG_X86_64 */ + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_INTEL: + if (boot_cpu_has(X86_FEATURE_NOPL)) { + ideal_nops = p6_nops; + } else { +#ifdef CONFIG_X86_64 + ideal_nops = k8_nops; +#else + ideal_nops = intel_nops; +#endif + } -static const unsigned char *const *__init_or_module find_nop_table(void) -{ - if (boot_cpu_has(X86_FEATURE_K8)) - return k8_nops; - else if (boot_cpu_has(X86_FEATURE_K7)) - return k7_nops; - else if (boot_cpu_has(X86_FEATURE_NOPL)) - return p6_nops; - else - return intel_nops; + default: +#ifdef CONFIG_X86_64 + ideal_nops = k8_nops; +#else + if (boot_cpu_has(X86_FEATURE_K8)) + ideal_nops = k8_nops; + else if (boot_cpu_has(X86_FEATURE_K7)) + ideal_nops = k7_nops; + else + ideal_nops = intel_nops; +#endif + } } -#endif /* CONFIG_X86_64 */ - /* Use this to add nops to a buffer, then text_poke the whole buffer. */ static void __init_or_module add_nops(void *insns, unsigned int len) { - const unsigned char *const *noptable = find_nop_table(); - while (len > 0) { unsigned int noplen = len; if (noplen > ASM_NOP_MAX) noplen = ASM_NOP_MAX; - memcpy(insns, noptable[noplen], noplen); + memcpy(insns, ideal_nops[noplen], noplen); insns += noplen; len -= noplen; } @@ -195,6 +238,7 @@ static void __init_or_module add_nops(void *insns, unsigned int len) extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; extern s32 __smp_locks[], __smp_locks_end[]; +extern char __vsyscall_0; void *text_poke_early(void *addr, const void *opcode, size_t len); /* Replace instructions with better alternatives for this CPU type. @@ -678,29 +722,3 @@ void __kprobes text_poke_smp_batch(struct text_poke_param *params, int n) wrote_text = 0; __stop_machine(stop_machine_text_poke, (void *)&tpp, NULL); } - -#if defined(CONFIG_DYNAMIC_FTRACE) || defined(HAVE_JUMP_LABEL) - -#ifdef CONFIG_X86_64 -unsigned char ideal_nop5[5] = { 0x66, 0x66, 0x66, 0x66, 0x90 }; -#else -unsigned char ideal_nop5[5] = { 0x3e, 0x8d, 0x74, 0x26, 0x00 }; -#endif - -void __init arch_init_ideal_nop5(void) -{ - /* - * There is no good nop for all x86 archs. This selection - * algorithm should be unified with the one in find_nop_table(), - * but this should be good enough for now. - * - * For cases other than the ones below, use the safe (as in - * always functional) defaults above. - */ -#ifdef CONFIG_X86_64 - /* Don't use these on 32 bits due to broken virtualizers */ - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) - memcpy(ideal_nop5, p6_nops[5], 5); -#endif -} -#endif diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index a93742a..0ba15a6 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -260,9 +260,9 @@ do_ftrace_mod_code(unsigned long ip, void *new_code) return mod_code_status; } -static unsigned char *ftrace_nop_replace(void) +static const unsigned char *ftrace_nop_replace(void) { - return ideal_nop5; + return ideal_nops[NOP_ATOMIC5]; } static int diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c index 961b6b3..3fee346 100644 --- a/arch/x86/kernel/jump_label.c +++ b/arch/x86/kernel/jump_label.c @@ -34,7 +34,7 @@ void arch_jump_label_transform(struct jump_entry *entry, code.offset = entry->target - (entry->code + JUMP_LABEL_NOP_SIZE); } else - memcpy(&code, ideal_nop5, JUMP_LABEL_NOP_SIZE); + memcpy(&code, ideal_nops[NOP_ATOMIC5], JUMP_LABEL_NOP_SIZE); get_online_cpus(); mutex_lock(&text_mutex); text_poke_smp((void *)entry->code, &code, JUMP_LABEL_NOP_SIZE); @@ -44,7 +44,8 @@ void arch_jump_label_transform(struct jump_entry *entry, void arch_jump_label_text_poke_early(jump_label_t addr) { - text_poke_early((void *)addr, ideal_nop5, JUMP_LABEL_NOP_SIZE); + text_poke_early((void *)addr, ideal_nops[NOP_ATOMIC5], + JUMP_LABEL_NOP_SIZE); } #endif diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 5a0484a..390a663 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -691,8 +691,6 @@ early_param("reservelow", parse_reservelow); void __init setup_arch(char **cmdline_p) { - unsigned long flags; - #ifdef CONFIG_X86_32 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); visws_early_detect(); @@ -1036,9 +1034,7 @@ void __init setup_arch(char **cmdline_p) mcheck_init(); - local_irq_save(flags); - arch_init_ideal_nop5(); - local_irq_restore(flags); + arch_init_ideal_nops(); } #ifdef CONFIG_X86_32 -- cgit v1.1 From d8d9766c8c29f71c37bc4b74cc9fcf6a192c9bfd Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 18 Apr 2011 15:31:57 -0700 Subject: x86, cpu: Change NOP selection for certain Intel CPUs Due to a decoder implementation quirk, some specific Intel CPUs actually perform better with the "k8_nops" than with the SDM-recommended NOPs. For runtime-selected NOPs, if we detect those specific CPUs then use the k8_nops instead of the ones we would normally use. Signed-off-by: H. Peter Anvin Cc: Tejun Heo Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Jason Baron Link: http://lkml.kernel.org/r/1303166160-10315-4-git-send-email-hpa@linux.intel.com --- arch/x86/kernel/alternative.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 846f61e..c0501ea 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -199,7 +199,19 @@ void __init arch_init_ideal_nops(void) { switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_INTEL: - if (boot_cpu_has(X86_FEATURE_NOPL)) { + /* + * Due to a decoder implementation quirk, some + * specific Intel CPUs actually perform better with + * the "k8_nops" than with the SDM-recommended NOPs. + */ + if (boot_cpu_data.x86 == 6 && + boot_cpu_data.x86_model >= 0x0f && + boot_cpu_data.x86_model != 0x1c && + boot_cpu_data.x86_model != 0x26 && + boot_cpu_data.x86_model != 0x27 && + boot_cpu_data.x86_model < 0x30) { + ideal_nops = k8_nops; + } else if (boot_cpu_has(X86_FEATURE_NOPL)) { ideal_nops = p6_nops; } else { #ifdef CONFIG_X86_64 -- cgit v1.1 From c8e5910edf8bbe2e5c6c35a4ef2a578cc7893b25 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Sat, 16 Apr 2011 02:27:55 +0200 Subject: perf, x86: Use ALTERNATIVE() to check for X86_FEATURE_PERFCTR_CORE Using ALTERNATIVE() when checking for X86_FEATURE_PERFCTR_CORE avoids an extra pointer chase and data cache hit. Signed-off-by: Robert Richter Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1302913676-14352-4-git-send-email-robert.richter@amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index eed3673a..224a84f 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -31,6 +31,7 @@ #include #include #include +#include #if 0 #undef wrmsrl @@ -363,12 +364,18 @@ again: return new_raw_count; } -/* using X86_FEATURE_PERFCTR_CORE to later implement ALTERNATIVE() here */ static inline int x86_pmu_addr_offset(int index) { - if (boot_cpu_has(X86_FEATURE_PERFCTR_CORE)) - return index << 1; - return index; + int offset; + + /* offset = X86_FEATURE_PERFCTR_CORE ? index << 1 : index */ + alternative_io(ASM_NOP2, + "shll $1, %%eax", + X86_FEATURE_PERFCTR_CORE, + "=a" (offset), + "a" (index)); + + return offset; } static inline unsigned int x86_pmu_config_addr(int index) -- cgit v1.1 From 69c80f3e9d3c569f8a3cee94ba1a324b5a7fa6b9 Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Wed, 13 Apr 2011 18:21:09 -0700 Subject: sched: Make set_*_buddy() work on non-task entities Make set_*_buddy() work on non-task sched_entity, to facilitate the use of next_buddy to cache a group entity in cases where one of the tasks within that entity sleeps or gets preempted. set_skip_buddy() was incorrectly comparing the policy of task that is yielding to be not equal to SCHED_IDLE. Yielding should happen even when task yielding is SCHED_IDLE. This change removes the policy check on the yielding task. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1302744070-30079-2-git-send-email-venki@google.com Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 8744593..501ab63 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1846,26 +1846,26 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) static void set_last_buddy(struct sched_entity *se) { - if (likely(task_of(se)->policy != SCHED_IDLE)) { - for_each_sched_entity(se) - cfs_rq_of(se)->last = se; - } + if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE)) + return; + + for_each_sched_entity(se) + cfs_rq_of(se)->last = se; } static void set_next_buddy(struct sched_entity *se) { - if (likely(task_of(se)->policy != SCHED_IDLE)) { - for_each_sched_entity(se) - cfs_rq_of(se)->next = se; - } + if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE)) + return; + + for_each_sched_entity(se) + cfs_rq_of(se)->next = se; } static void set_skip_buddy(struct sched_entity *se) { - if (likely(task_of(se)->policy != SCHED_IDLE)) { - for_each_sched_entity(se) - cfs_rq_of(se)->skip = se; - } + for_each_sched_entity(se) + cfs_rq_of(se)->skip = se; } /* -- cgit v1.1 From 2f36825b176f67e5c5228aa33d828bc39718811f Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Thu, 14 Apr 2011 10:30:53 -0700 Subject: sched: Next buddy hint on sleep and preempt path When a task in a taskgroup sleeps, pick_next_task starts all the way back at the root and picks the task/taskgroup with the min vruntime across all runnable tasks. But when there are many frequently sleeping tasks across different taskgroups, it makes better sense to stay with same taskgroup for its slice period (or until all tasks in the taskgroup sleeps) instead of switching cross taskgroup on each sleep after a short runtime. This helps specifically where taskgroups corresponds to a process with multiple threads. The change reduces the number of CR3 switches in this case. Example: Two taskgroups with 2 threads each which are running for 2ms and sleeping for 1ms. Looking at sched:sched_switch shows: BEFORE: taskgroup_1 threads [5004, 5005], taskgroup_2 threads [5016, 5017] cpu-soaker-5004 [003] 3683.391089 cpu-soaker-5016 [003] 3683.393106 cpu-soaker-5005 [003] 3683.395119 cpu-soaker-5017 [003] 3683.397130 cpu-soaker-5004 [003] 3683.399143 cpu-soaker-5016 [003] 3683.401155 cpu-soaker-5005 [003] 3683.403168 cpu-soaker-5017 [003] 3683.405170 AFTER: taskgroup_1 threads [21890, 21891], taskgroup_2 threads [21934, 21935] cpu-soaker-21890 [003] 865.895494 cpu-soaker-21935 [003] 865.897506 cpu-soaker-21934 [003] 865.899520 cpu-soaker-21935 [003] 865.901532 cpu-soaker-21934 [003] 865.903543 cpu-soaker-21935 [003] 865.905546 cpu-soaker-21891 [003] 865.907548 cpu-soaker-21890 [003] 865.909560 cpu-soaker-21891 [003] 865.911571 cpu-soaker-21890 [003] 865.913582 cpu-soaker-21891 [003] 865.915594 cpu-soaker-21934 [003] 865.917606 Similar problem is there when there are multiple taskgroups and say a task A preempts currently running task B of taskgroup_1. On schedule, pick_next_task can pick an unrelated task on taskgroup_2. Here it would be better to give some preference to task B on pick_next_task. A simple (may be extreme case) benchmark I tried was tbench with 2 tbench client processes with 2 threads each running on a single CPU. Avg throughput across 5 50 sec runs was: BEFORE: 105.84 MB/sec AFTER: 112.42 MB/sec Signed-off-by: Venkatesh Pallipadi Acked-by: Rik van Riel Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1302802253-25760-1-git-send-email-venki@google.com Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 501ab63..5280272 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1344,6 +1344,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) hrtick_update(rq); } +static void set_next_buddy(struct sched_entity *se); + /* * The dequeue_task method is called before nr_running is * decreased. We remove the task from the rbtree and @@ -1353,14 +1355,22 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) { struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; + int task_sleep = flags & DEQUEUE_SLEEP; for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); dequeue_entity(cfs_rq, se, flags); /* Don't dequeue parent if it has other entities besides us */ - if (cfs_rq->load.weight) + if (cfs_rq->load.weight) { + /* + * Bias pick_next to pick a task from this cfs_rq, as + * p is sleeping when it is within its sched_slice. + */ + if (task_sleep && parent_entity(se)) + set_next_buddy(parent_entity(se)); break; + } flags |= DEQUEUE_SLEEP; } @@ -1877,12 +1887,15 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ struct sched_entity *se = &curr->se, *pse = &p->se; struct cfs_rq *cfs_rq = task_cfs_rq(curr); int scale = cfs_rq->nr_running >= sched_nr_latency; + int next_buddy_marked = 0; if (unlikely(se == pse)) return; - if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) + if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) { set_next_buddy(pse); + next_buddy_marked = 1; + } /* * We can come here with TIF_NEED_RESCHED already set from new task @@ -1910,8 +1923,15 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ update_curr(cfs_rq); find_matching_se(&se, &pse); BUG_ON(!pse); - if (wakeup_preempt_entity(se, pse) == 1) + if (wakeup_preempt_entity(se, pse) == 1) { + /* + * Bias pick_next to pick the sched entity that is + * triggering this preemption. + */ + if (!next_buddy_marked) + set_next_buddy(pse); goto preempt; + } return; -- cgit v1.1 From 057f3fadb347e9c51b07e1b277bbdda79f976768 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 18 Apr 2011 11:24:34 +0200 Subject: sched: Fix sched_domain iterations vs. RCU Vladis Kletnieks reported a new RCU debug warning in the scheduler. Since commit dce840a08702b ("sched: Dynamically allocate sched_domain/ sched_group data-structures") the sched_domain trees are protected by RCU instead of RCU-sched. This means that we need to include rcu_read_lock() protection when we iterate them since disabling preemption doesn't suffice anymore. Reported-by: Valdis.Kletnieks@vt.edu Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1302882741.2388.241.camel@twins Signed-off-by: Ingo Molnar --- kernel/sched.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 0cfe031..27d3e73 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1208,11 +1208,17 @@ int get_nohz_timer_target(void) int i; struct sched_domain *sd; + rcu_read_lock(); for_each_domain(cpu, sd) { - for_each_cpu(i, sched_domain_span(sd)) - if (!idle_cpu(i)) - return i; + for_each_cpu(i, sched_domain_span(sd)) { + if (!idle_cpu(i)) { + cpu = i; + goto unlock; + } + } } +unlock: + rcu_read_unlock(); return cpu; } /* @@ -2415,12 +2421,14 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) struct sched_domain *sd; schedstat_inc(p, se.statistics.nr_wakeups_remote); + rcu_read_lock(); for_each_domain(this_cpu, sd) { if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { schedstat_inc(sd, ttwu_wake_remote); break; } } + rcu_read_unlock(); } #endif /* CONFIG_SMP */ -- cgit v1.1 From aeafcbaf4fcfeb74aeed65609ea5ead48dfc09f8 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 31 Mar 2011 10:56:28 -0300 Subject: perf symbols: Give more useful names to 'self' parameters One more installment on an area that is mostly dormant. Suggested-by: Thomas Gleixner Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Mike Galbraith Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Tom Zanussi LKML-Reference: Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/symbol.c | 629 ++++++++++++++++++++++++----------------------- tools/perf/util/symbol.h | 78 +++--- 2 files changed, 361 insertions(+), 346 deletions(-) diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c index f06c10f..516876d 100644 --- a/tools/perf/util/symbol.c +++ b/tools/perf/util/symbol.c @@ -31,13 +31,13 @@ #define NT_GNU_BUILD_ID 3 #endif -static bool dso__build_id_equal(const struct dso *self, u8 *build_id); +static bool dso__build_id_equal(const struct dso *dso, u8 *build_id); static int elf_read_build_id(Elf *elf, void *bf, size_t size); static void dsos__add(struct list_head *head, struct dso *dso); static struct map *map__new2(u64 start, struct dso *dso, enum map_type type); -static int dso__load_kernel_sym(struct dso *self, struct map *map, +static int dso__load_kernel_sym(struct dso *dso, struct map *map, symbol_filter_t filter); -static int dso__load_guest_kernel_sym(struct dso *self, struct map *map, +static int dso__load_guest_kernel_sym(struct dso *dso, struct map *map, symbol_filter_t filter); static int vmlinux_path__nr_entries; static char **vmlinux_path; @@ -49,27 +49,27 @@ struct symbol_conf symbol_conf = { .symfs = "", }; -int dso__name_len(const struct dso *self) +int dso__name_len(const struct dso *dso) { if (verbose) - return self->long_name_len; + return dso->long_name_len; - return self->short_name_len; + return dso->short_name_len; } -bool dso__loaded(const struct dso *self, enum map_type type) +bool dso__loaded(const struct dso *dso, enum map_type type) { - return self->loaded & (1 << type); + return dso->loaded & (1 << type); } -bool dso__sorted_by_name(const struct dso *self, enum map_type type) +bool dso__sorted_by_name(const struct dso *dso, enum map_type type) { - return self->sorted_by_name & (1 << type); + return dso->sorted_by_name & (1 << type); } -static void dso__set_sorted_by_name(struct dso *self, enum map_type type) +static void dso__set_sorted_by_name(struct dso *dso, enum map_type type) { - self->sorted_by_name |= (1 << type); + dso->sorted_by_name |= (1 << type); } bool symbol_type__is_a(char symbol_type, enum map_type map_type) @@ -84,9 +84,9 @@ bool symbol_type__is_a(char symbol_type, enum map_type map_type) } } -static void symbols__fixup_end(struct rb_root *self) +static void symbols__fixup_end(struct rb_root *symbols) { - struct rb_node *nd, *prevnd = rb_first(self); + struct rb_node *nd, *prevnd = rb_first(symbols); struct symbol *curr, *prev; if (prevnd == NULL) @@ -107,10 +107,10 @@ static void symbols__fixup_end(struct rb_root *self) curr->end = roundup(curr->start, 4096); } -static void __map_groups__fixup_end(struct map_groups *self, enum map_type type) +static void __map_groups__fixup_end(struct map_groups *mg, enum map_type type) { struct map *prev, *curr; - struct rb_node *nd, *prevnd = rb_first(&self->maps[type]); + struct rb_node *nd, *prevnd = rb_first(&mg->maps[type]); if (prevnd == NULL) return; @@ -130,128 +130,128 @@ static void __map_groups__fixup_end(struct map_groups *self, enum map_type type) curr->end = ~0ULL; } -static void map_groups__fixup_end(struct map_groups *self) +static void map_groups__fixup_end(struct map_groups *mg) { int i; for (i = 0; i < MAP__NR_TYPES; ++i) - __map_groups__fixup_end(self, i); + __map_groups__fixup_end(mg, i); } static struct symbol *symbol__new(u64 start, u64 len, u8 binding, const char *name) { size_t namelen = strlen(name) + 1; - struct symbol *self = calloc(1, (symbol_conf.priv_size + - sizeof(*self) + namelen)); - if (self == NULL) + struct symbol *sym = calloc(1, (symbol_conf.priv_size + + sizeof(*sym) + namelen)); + if (sym == NULL) return NULL; if (symbol_conf.priv_size) - self = ((void *)self) + symbol_conf.priv_size; - - self->start = start; - self->end = len ? start + len - 1 : start; - self->binding = binding; - self->namelen = namelen - 1; + sym = ((void *)sym) + symbol_conf.priv_size; - pr_debug4("%s: %s %#" PRIx64 "-%#" PRIx64 "\n", __func__, name, start, self->end); + sym->start = start; + sym->end = len ? start + len - 1 : start; + sym->binding = binding; + sym->namelen = namelen - 1; - memcpy(self->name, name, namelen); + pr_debug4("%s: %s %#" PRIx64 "-%#" PRIx64 "\n", + __func__, name, start, sym->end); + memcpy(sym->name, name, namelen); - return self; + return sym; } -void symbol__delete(struct symbol *self) +void symbol__delete(struct symbol *sym) { - free(((void *)self) - symbol_conf.priv_size); + free(((void *)sym) - symbol_conf.priv_size); } -static size_t symbol__fprintf(struct symbol *self, FILE *fp) +static size_t symbol__fprintf(struct symbol *sym, FILE *fp) { return fprintf(fp, " %" PRIx64 "-%" PRIx64 " %c %s\n", - self->start, self->end, - self->binding == STB_GLOBAL ? 'g' : - self->binding == STB_LOCAL ? 'l' : 'w', - self->name); + sym->start, sym->end, + sym->binding == STB_GLOBAL ? 'g' : + sym->binding == STB_LOCAL ? 'l' : 'w', + sym->name); } -void dso__set_long_name(struct dso *self, char *name) +void dso__set_long_name(struct dso *dso, char *name) { if (name == NULL) return; - self->long_name = name; - self->long_name_len = strlen(name); + dso->long_name = name; + dso->long_name_len = strlen(name); } -static void dso__set_short_name(struct dso *self, const char *name) +static void dso__set_short_name(struct dso *dso, const char *name) { if (name == NULL) return; - self->short_name = name; - self->short_name_len = strlen(name); + dso->short_name = name; + dso->short_name_len = strlen(name); } -static void dso__set_basename(struct dso *self) +static void dso__set_basename(struct dso *dso) { - dso__set_short_name(self, basename(self->long_name)); + dso__set_short_name(dso, basename(dso->long_name)); } struct dso *dso__new(const char *name) { - struct dso *self = calloc(1, sizeof(*self) + strlen(name) + 1); + struct dso *dso = calloc(1, sizeof(*dso) + strlen(name) + 1); - if (self != NULL) { + if (dso != NULL) { int i; - strcpy(self->name, name); - dso__set_long_name(self, self->name); - dso__set_short_name(self, self->name); + strcpy(dso->name, name); + dso__set_long_name(dso, dso->name); + dso__set_short_name(dso, dso->name); for (i = 0; i < MAP__NR_TYPES; ++i) - self->symbols[i] = self->symbol_names[i] = RB_ROOT; - self->symtab_type = SYMTAB__NOT_FOUND; - self->loaded = 0; - self->sorted_by_name = 0; - self->has_build_id = 0; - self->kernel = DSO_TYPE_USER; - INIT_LIST_HEAD(&self->node); + dso->symbols[i] = dso->symbol_names[i] = RB_ROOT; + dso->symtab_type = SYMTAB__NOT_FOUND; + dso->loaded = 0; + dso->sorted_by_name = 0; + dso->has_build_id = 0; + dso->kernel = DSO_TYPE_USER; + INIT_LIST_HEAD(&dso->node); } - return self; + return dso; } -static void symbols__delete(struct rb_root *self) +static void symbols__delete(struct rb_root *symbols) { struct symbol *pos; - struct rb_node *next = rb_first(self); + struct rb_node *next = rb_first(symbols); while (next) { pos = rb_entry(next, struct symbol, rb_node); next = rb_next(&pos->rb_node); - rb_erase(&pos->rb_node, self); + rb_erase(&pos->rb_node, symbols); symbol__delete(pos); } } -void dso__delete(struct dso *self) +void dso__delete(struct dso *dso) { int i; for (i = 0; i < MAP__NR_TYPES; ++i) - symbols__delete(&self->symbols[i]); - if (self->sname_alloc) - free((char *)self->short_name); - if (self->lname_alloc) - free(self->long_name); - free(self); + symbols__delete(&dso->symbols[i]); + if (dso->sname_alloc) + free((char *)dso->short_name); + if (dso->lname_alloc) + free(dso->long_name); + free(dso); } -void dso__set_build_id(struct dso *self, void *build_id) +void dso__set_build_id(struct dso *dso, void *build_id) { - memcpy(self->build_id, build_id, sizeof(self->build_id)); - self->has_build_id = 1; + memcpy(dso->build_id, build_id, sizeof(dso->build_id)); + dso->has_build_id = 1; } -static void symbols__insert(struct rb_root *self, struct symbol *sym) +static void symbols__insert(struct rb_root *symbols, struct symbol *sym) { - struct rb_node **p = &self->rb_node; + struct rb_node **p = &symbols->rb_node; struct rb_node *parent = NULL; const u64 ip = sym->start; struct symbol *s; @@ -265,17 +265,17 @@ static void symbols__insert(struct rb_root *self, struct symbol *sym) p = &(*p)->rb_right; } rb_link_node(&sym->rb_node, parent, p); - rb_insert_color(&sym->rb_node, self); + rb_insert_color(&sym->rb_node, symbols); } -static struct symbol *symbols__find(struct rb_root *self, u64 ip) +static struct symbol *symbols__find(struct rb_root *symbols, u64 ip) { struct rb_node *n; - if (self == NULL) + if (symbols == NULL) return NULL; - n = self->rb_node; + n = symbols->rb_node; while (n) { struct symbol *s = rb_entry(n, struct symbol, rb_node); @@ -296,9 +296,9 @@ struct symbol_name_rb_node { struct symbol sym; }; -static void symbols__insert_by_name(struct rb_root *self, struct symbol *sym) +static void symbols__insert_by_name(struct rb_root *symbols, struct symbol *sym) { - struct rb_node **p = &self->rb_node; + struct rb_node **p = &symbols->rb_node; struct rb_node *parent = NULL; struct symbol_name_rb_node *symn, *s; @@ -313,27 +313,29 @@ static void symbols__insert_by_name(struct rb_root *self, struct symbol *sym) p = &(*p)->rb_right; } rb_link_node(&symn->rb_node, parent, p); - rb_insert_color(&symn->rb_node, self); + rb_insert_color(&symn->rb_node, symbols); } -static void symbols__sort_by_name(struct rb_root *self, struct rb_root *source) +static void symbols__sort_by_name(struct rb_root *symbols, + struct rb_root *source) { struct rb_node *nd; for (nd = rb_first(source); nd; nd = rb_next(nd)) { struct symbol *pos = rb_entry(nd, struct symbol, rb_node); - symbols__insert_by_name(self, pos); + symbols__insert_by_name(symbols, pos); } } -static struct symbol *symbols__find_by_name(struct rb_root *self, const char *name) +static struct symbol *symbols__find_by_name(struct rb_root *symbols, + const char *name) { struct rb_node *n; - if (self == NULL) + if (symbols == NULL) return NULL; - n = self->rb_node; + n = symbols->rb_node; while (n) { struct symbol_name_rb_node *s; @@ -353,29 +355,29 @@ static struct symbol *symbols__find_by_name(struct rb_root *self, const char *na return NULL; } -struct symbol *dso__find_symbol(struct dso *self, +struct symbol *dso__find_symbol(struct dso *dso, enum map_type type, u64 addr) { - return symbols__find(&self->symbols[type], addr); + return symbols__find(&dso->symbols[type], addr); } -struct symbol *dso__find_symbol_by_name(struct dso *self, enum map_type type, +struct symbol *dso__find_symbol_by_name(struct dso *dso, enum map_type type, const char *name) { - return symbols__find_by_name(&self->symbol_names[type], name); + return symbols__find_by_name(&dso->symbol_names[type], name); } -void dso__sort_by_name(struct dso *self, enum map_type type) +void dso__sort_by_name(struct dso *dso, enum map_type type) { - dso__set_sorted_by_name(self, type); - return symbols__sort_by_name(&self->symbol_names[type], - &self->symbols[type]); + dso__set_sorted_by_name(dso, type); + return symbols__sort_by_name(&dso->symbol_names[type], + &dso->symbols[type]); } -int build_id__sprintf(const u8 *self, int len, char *bf) +int build_id__sprintf(const u8 *build_id, int len, char *bf) { char *bid = bf; - const u8 *raw = self; + const u8 *raw = build_id; int i; for (i = 0; i < len; ++i) { @@ -384,24 +386,25 @@ int build_id__sprintf(const u8 *self, int len, char *bf) bid += 2; } - return raw - self; + return raw - build_id; } -size_t dso__fprintf_buildid(struct dso *self, FILE *fp) +size_t dso__fprintf_buildid(struct dso *dso, FILE *fp) { char sbuild_id[BUILD_ID_SIZE * 2 + 1]; - build_id__sprintf(self->build_id, sizeof(self->build_id), sbuild_id); + build_id__sprintf(dso->build_id, sizeof(dso->build_id), sbuild_id); return fprintf(fp, "%s", sbuild_id); } -size_t dso__fprintf_symbols_by_name(struct dso *self, enum map_type type, FILE *fp) +size_t dso__fprintf_symbols_by_name(struct dso *dso, + enum map_type type, FILE *fp) { size_t ret = 0; struct rb_node *nd; struct symbol_name_rb_node *pos; - for (nd = rb_first(&self->symbol_names[type]); nd; nd = rb_next(nd)) { + for (nd = rb_first(&dso->symbol_names[type]); nd; nd = rb_next(nd)) { pos = rb_entry(nd, struct symbol_name_rb_node, rb_node); fprintf(fp, "%s\n", pos->sym.name); } @@ -409,18 +412,18 @@ size_t dso__fprintf_symbols_by_name(struct dso *self, enum map_type type, FILE * return ret; } -size_t dso__fprintf(struct dso *self, enum map_type type, FILE *fp) +size_t dso__fprintf(struct dso *dso, enum map_type type, FILE *fp) { struct rb_node *nd; - size_t ret = fprintf(fp, "dso: %s (", self->short_name); + size_t ret = fprintf(fp, "dso: %s (", dso->short_name); - if (self->short_name != self->long_name) - ret += fprintf(fp, "%s, ", self->long_name); + if (dso->short_name != dso->long_name) + ret += fprintf(fp, "%s, ", dso->long_name); ret += fprintf(fp, "%s, %sloaded, ", map_type__name[type], - self->loaded ? "" : "NOT "); - ret += dso__fprintf_buildid(self, fp); + dso->loaded ? "" : "NOT "); + ret += dso__fprintf_buildid(dso, fp); ret += fprintf(fp, ")\n"); - for (nd = rb_first(&self->symbols[type]); nd; nd = rb_next(nd)) { + for (nd = rb_first(&dso->symbols[type]); nd; nd = rb_next(nd)) { struct symbol *pos = rb_entry(nd, struct symbol, rb_node); ret += symbol__fprintf(pos, fp); } @@ -543,10 +546,10 @@ static int map__process_kallsym_symbol(void *arg, const char *name, * so that we can in the next step set the symbol ->end address and then * call kernel_maps__split_kallsyms. */ -static int dso__load_all_kallsyms(struct dso *self, const char *filename, +static int dso__load_all_kallsyms(struct dso *dso, const char *filename, struct map *map) { - struct process_kallsyms_args args = { .map = map, .dso = self, }; + struct process_kallsyms_args args = { .map = map, .dso = dso, }; return kallsyms__parse(filename, &args, map__process_kallsym_symbol); } @@ -555,7 +558,7 @@ static int dso__load_all_kallsyms(struct dso *self, const char *filename, * kernel range is broken in several maps, named [kernel].N, as we don't have * the original ELF section names vmlinux have. */ -static int dso__split_kallsyms(struct dso *self, struct map *map, +static int dso__split_kallsyms(struct dso *dso, struct map *map, symbol_filter_t filter) { struct map_groups *kmaps = map__kmap(map)->kmaps; @@ -563,7 +566,7 @@ static int dso__split_kallsyms(struct dso *self, struct map *map, struct map *curr_map = map; struct symbol *pos; int count = 0, moved = 0; - struct rb_root *root = &self->symbols[map->type]; + struct rb_root *root = &dso->symbols[map->type]; struct rb_node *next = rb_first(root); int kernel_range = 0; @@ -582,7 +585,7 @@ static int dso__split_kallsyms(struct dso *self, struct map *map, if (strcmp(curr_map->dso->short_name, module)) { if (curr_map != map && - self->kernel == DSO_TYPE_GUEST_KERNEL && + dso->kernel == DSO_TYPE_GUEST_KERNEL && machine__is_default_guest(machine)) { /* * We assume all symbols of a module are @@ -618,14 +621,14 @@ static int dso__split_kallsyms(struct dso *self, struct map *map, pos->end = curr_map->map_ip(curr_map, pos->end); } else if (curr_map != map) { char dso_name[PATH_MAX]; - struct dso *dso; + struct dso *ndso; if (count == 0) { curr_map = map; goto filter_symbol; } - if (self->kernel == DSO_TYPE_GUEST_KERNEL) + if (dso->kernel == DSO_TYPE_GUEST_KERNEL) snprintf(dso_name, sizeof(dso_name), "[guest.kernel].%d", kernel_range++); @@ -634,15 +637,15 @@ static int dso__split_kallsyms(struct dso *self, struct map *map, "[kernel].%d", kernel_range++); - dso = dso__new(dso_name); - if (dso == NULL) + ndso = dso__new(dso_name); + if (ndso == NULL) return -1; - dso->kernel = self->kernel; + ndso->kernel = dso->kernel; - curr_map = map__new2(pos->start, dso, map->type); + curr_map = map__new2(pos->start, ndso, map->type); if (curr_map == NULL) { - dso__delete(dso); + dso__delete(ndso); return -1; } @@ -665,7 +668,7 @@ discard_symbol: rb_erase(&pos->rb_node, root); } if (curr_map != map && - self->kernel == DSO_TYPE_GUEST_KERNEL && + dso->kernel == DSO_TYPE_GUEST_KERNEL && machine__is_default_guest(kmaps->machine)) { dso__set_loaded(curr_map->dso, curr_map->type); } @@ -673,21 +676,21 @@ discard_symbol: rb_erase(&pos->rb_node, root); return count + moved; } -int dso__load_kallsyms(struct dso *self, const char *filename, +int dso__load_kallsyms(struct dso *dso, const char *filename, struct map *map, symbol_filter_t filter) { - if (dso__load_all_kallsyms(self, filename, map) < 0) + if (dso__load_all_kallsyms(dso, filename, map) < 0) return -1; - if (self->kernel == DSO_TYPE_GUEST_KERNEL) - self->symtab_type = SYMTAB__GUEST_KALLSYMS; + if (dso->kernel == DSO_TYPE_GUEST_KERNEL) + dso->symtab_type = SYMTAB__GUEST_KALLSYMS; else - self->symtab_type = SYMTAB__KALLSYMS; + dso->symtab_type = SYMTAB__KALLSYMS; - return dso__split_kallsyms(self, map, filter); + return dso__split_kallsyms(dso, map, filter); } -static int dso__load_perf_map(struct dso *self, struct map *map, +static int dso__load_perf_map(struct dso *dso, struct map *map, symbol_filter_t filter) { char *line = NULL; @@ -695,7 +698,7 @@ static int dso__load_perf_map(struct dso *self, struct map *map, FILE *file; int nr_syms = 0; - file = fopen(self->long_name, "r"); + file = fopen(dso->long_name, "r"); if (file == NULL) goto out_failure; @@ -733,7 +736,7 @@ static int dso__load_perf_map(struct dso *self, struct map *map, if (filter && filter(map, sym)) symbol__delete(sym); else { - symbols__insert(&self->symbols[map->type], sym); + symbols__insert(&dso->symbols[map->type], sym); nr_syms++; } } @@ -752,7 +755,7 @@ out_failure: /** * elf_symtab__for_each_symbol - iterate thru all the symbols * - * @self: struct elf_symtab instance to iterate + * @syms: struct elf_symtab instance to iterate * @idx: uint32_t idx * @sym: GElf_Sym iterator */ @@ -852,7 +855,7 @@ static Elf_Scn *elf_section_by_name(Elf *elf, GElf_Ehdr *ep, * And always look at the original dso, not at debuginfo packages, that * have the PLT data stripped out (shdr_rel_plt.sh_type == SHT_NOBITS). */ -static int dso__synthesize_plt_symbols(struct dso *self, struct map *map, +static int dso__synthesize_plt_symbols(struct dso *dso, struct map *map, symbol_filter_t filter) { uint32_t nr_rel_entries, idx; @@ -871,7 +874,7 @@ static int dso__synthesize_plt_symbols(struct dso *self, struct map *map, char name[PATH_MAX]; snprintf(name, sizeof(name), "%s%s", - symbol_conf.symfs, self->long_name); + symbol_conf.symfs, dso->long_name); fd = open(name, O_RDONLY); if (fd < 0) goto out; @@ -947,7 +950,7 @@ static int dso__synthesize_plt_symbols(struct dso *self, struct map *map, if (filter && filter(map, f)) symbol__delete(f); else { - symbols__insert(&self->symbols[map->type], f); + symbols__insert(&dso->symbols[map->type], f); ++nr; } } @@ -969,7 +972,7 @@ static int dso__synthesize_plt_symbols(struct dso *self, struct map *map, if (filter && filter(map, f)) symbol__delete(f); else { - symbols__insert(&self->symbols[map->type], f); + symbols__insert(&dso->symbols[map->type], f); ++nr; } } @@ -985,29 +988,30 @@ out_close: return nr; out: pr_debug("%s: problems reading %s PLT info.\n", - __func__, self->long_name); + __func__, dso->long_name); return 0; } -static bool elf_sym__is_a(GElf_Sym *self, enum map_type type) +static bool elf_sym__is_a(GElf_Sym *sym, enum map_type type) { switch (type) { case MAP__FUNCTION: - return elf_sym__is_function(self); + return elf_sym__is_function(sym); case MAP__VARIABLE: - return elf_sym__is_object(self); + return elf_sym__is_object(sym); default: return false; } } -static bool elf_sec__is_a(GElf_Shdr *self, Elf_Data *secstrs, enum map_type type) +static bool elf_sec__is_a(GElf_Shdr *shdr, Elf_Data *secstrs, + enum map_type type) { switch (type) { case MAP__FUNCTION: - return elf_sec__is_text(self, secstrs); + return elf_sec__is_text(shdr, secstrs); case MAP__VARIABLE: - return elf_sec__is_data(self, secstrs); + return elf_sec__is_data(shdr, secstrs); default: return false; } @@ -1032,13 +1036,13 @@ static size_t elf_addr_to_index(Elf *elf, GElf_Addr addr) return -1; } -static int dso__load_sym(struct dso *self, struct map *map, const char *name, +static int dso__load_sym(struct dso *dso, struct map *map, const char *name, int fd, symbol_filter_t filter, int kmodule, int want_symtab) { - struct kmap *kmap = self->kernel ? map__kmap(map) : NULL; + struct kmap *kmap = dso->kernel ? map__kmap(map) : NULL; struct map *curr_map = map; - struct dso *curr_dso = self; + struct dso *curr_dso = dso; Elf_Data *symstrs, *secstrs; uint32_t nr_syms; int err = -1; @@ -1064,14 +1068,14 @@ static int dso__load_sym(struct dso *self, struct map *map, const char *name, } /* Always reject images with a mismatched build-id: */ - if (self->has_build_id) { + if (dso->has_build_id) { u8 build_id[BUILD_ID_SIZE]; if (elf_read_build_id(elf, build_id, BUILD_ID_SIZE) != BUILD_ID_SIZE) goto out_elf_end; - if (!dso__build_id_equal(self, build_id)) + if (!dso__build_id_equal(dso, build_id)) goto out_elf_end; } @@ -1112,13 +1116,14 @@ static int dso__load_sym(struct dso *self, struct map *map, const char *name, nr_syms = shdr.sh_size / shdr.sh_entsize; memset(&sym, 0, sizeof(sym)); - if (self->kernel == DSO_TYPE_USER) { - self->adjust_symbols = (ehdr.e_type == ET_EXEC || + if (dso->kernel == DSO_TYPE_USER) { + dso->adjust_symbols = (ehdr.e_type == ET_EXEC || elf_section_by_name(elf, &ehdr, &shdr, ".gnu.prelink_undo", NULL) != NULL); - } else self->adjust_symbols = 0; - + } else { + dso->adjust_symbols = 0; + } elf_symtab__for_each_symbol(syms, nr_syms, idx, sym) { struct symbol *f; const char *elf_name = elf_sym__name(&sym, symstrs); @@ -1168,22 +1173,22 @@ static int dso__load_sym(struct dso *self, struct map *map, const char *name, (sym.st_value & 1)) --sym.st_value; - if (self->kernel != DSO_TYPE_USER || kmodule) { + if (dso->kernel != DSO_TYPE_USER || kmodule) { char dso_name[PATH_MAX]; if (strcmp(section_name, (curr_dso->short_name + - self->short_name_len)) == 0) + dso->short_name_len)) == 0) goto new_symbol; if (strcmp(section_name, ".text") == 0) { curr_map = map; - curr_dso = self; + curr_dso = dso; goto new_symbol; } snprintf(dso_name, sizeof(dso_name), - "%s%s", self->short_name, section_name); + "%s%s", dso->short_name, section_name); curr_map = map_groups__find_by_name(kmap->kmaps, map->type, dso_name); if (curr_map == NULL) { @@ -1195,9 +1200,9 @@ static int dso__load_sym(struct dso *self, struct map *map, const char *name, curr_dso = dso__new(dso_name); if (curr_dso == NULL) goto out_elf_end; - curr_dso->kernel = self->kernel; - curr_dso->long_name = self->long_name; - curr_dso->long_name_len = self->long_name_len; + curr_dso->kernel = dso->kernel; + curr_dso->long_name = dso->long_name; + curr_dso->long_name_len = dso->long_name_len; curr_map = map__new2(start, curr_dso, map->type); if (curr_map == NULL) { @@ -1206,9 +1211,9 @@ static int dso__load_sym(struct dso *self, struct map *map, const char *name, } curr_map->map_ip = identity__map_ip; curr_map->unmap_ip = identity__map_ip; - curr_dso->symtab_type = self->symtab_type; + curr_dso->symtab_type = dso->symtab_type; map_groups__insert(kmap->kmaps, curr_map); - dsos__add(&self->node, curr_dso); + dsos__add(&dso->node, curr_dso); dso__set_loaded(curr_dso, map->type); } else curr_dso = curr_map->dso; @@ -1250,7 +1255,7 @@ new_symbol: * For misannotated, zeroed, ASM function sizes. */ if (nr > 0) { - symbols__fixup_end(&self->symbols[map->type]); + symbols__fixup_end(&dso->symbols[map->type]); if (kmap) { /* * We need to fixup this here too because we create new @@ -1266,9 +1271,9 @@ out_close: return err; } -static bool dso__build_id_equal(const struct dso *self, u8 *build_id) +static bool dso__build_id_equal(const struct dso *dso, u8 *build_id) { - return memcmp(self->build_id, build_id, sizeof(self->build_id)) == 0; + return memcmp(dso->build_id, build_id, sizeof(dso->build_id)) == 0; } bool __dsos__read_build_ids(struct list_head *head, bool with_hits) @@ -1429,7 +1434,7 @@ out: return err; } -char dso__symtab_origin(const struct dso *self) +char dso__symtab_origin(const struct dso *dso) { static const char origin[] = { [SYMTAB__KALLSYMS] = 'k', @@ -1444,12 +1449,12 @@ char dso__symtab_origin(const struct dso *self) [SYMTAB__GUEST_KMODULE] = 'G', }; - if (self == NULL || self->symtab_type == SYMTAB__NOT_FOUND) + if (dso == NULL || dso->symtab_type == SYMTAB__NOT_FOUND) return '!'; - return origin[self->symtab_type]; + return origin[dso->symtab_type]; } -int dso__load(struct dso *self, struct map *map, symbol_filter_t filter) +int dso__load(struct dso *dso, struct map *map, symbol_filter_t filter) { int size = PATH_MAX; char *name; @@ -1459,12 +1464,12 @@ int dso__load(struct dso *self, struct map *map, symbol_filter_t filter) const char *root_dir; int want_symtab; - dso__set_loaded(self, map->type); + dso__set_loaded(dso, map->type); - if (self->kernel == DSO_TYPE_KERNEL) - return dso__load_kernel_sym(self, map, filter); - else if (self->kernel == DSO_TYPE_GUEST_KERNEL) - return dso__load_guest_kernel_sym(self, map, filter); + if (dso->kernel == DSO_TYPE_KERNEL) + return dso__load_kernel_sym(dso, map, filter); + else if (dso->kernel == DSO_TYPE_GUEST_KERNEL) + return dso__load_guest_kernel_sym(dso, map, filter); if (map->groups && map->groups->machine) machine = map->groups->machine; @@ -1475,11 +1480,11 @@ int dso__load(struct dso *self, struct map *map, symbol_filter_t filter) if (!name) return -1; - self->adjust_symbols = 0; + dso->adjust_symbols = 0; - if (strncmp(self->name, "/tmp/perf-", 10) == 0) { - ret = dso__load_perf_map(self, map, filter); - self->symtab_type = ret > 0 ? SYMTAB__JAVA_JIT : + if (strncmp(dso->name, "/tmp/perf-", 10) == 0) { + ret = dso__load_perf_map(dso, map, filter); + dso->symtab_type = ret > 0 ? SYMTAB__JAVA_JIT : SYMTAB__NOT_FOUND; return ret; } @@ -1490,33 +1495,33 @@ int dso__load(struct dso *self, struct map *map, symbol_filter_t filter) */ want_symtab = 1; restart: - for (self->symtab_type = SYMTAB__BUILD_ID_CACHE; - self->symtab_type != SYMTAB__NOT_FOUND; - self->symtab_type++) { - switch (self->symtab_type) { + for (dso->symtab_type = SYMTAB__BUILD_ID_CACHE; + dso->symtab_type != SYMTAB__NOT_FOUND; + dso->symtab_type++) { + switch (dso->symtab_type) { case SYMTAB__BUILD_ID_CACHE: /* skip the locally configured cache if a symfs is given */ if (symbol_conf.symfs[0] || - (dso__build_id_filename(self, name, size) == NULL)) { + (dso__build_id_filename(dso, name, size) == NULL)) { continue; } break; case SYMTAB__FEDORA_DEBUGINFO: snprintf(name, size, "%s/usr/lib/debug%s.debug", - symbol_conf.symfs, self->long_name); + symbol_conf.symfs, dso->long_name); break; case SYMTAB__UBUNTU_DEBUGINFO: snprintf(name, size, "%s/usr/lib/debug%s", - symbol_conf.symfs, self->long_name); + symbol_conf.symfs, dso->long_name); break; case SYMTAB__BUILDID_DEBUGINFO: { char build_id_hex[BUILD_ID_SIZE * 2 + 1]; - if (!self->has_build_id) + if (!dso->has_build_id) continue; - build_id__sprintf(self->build_id, - sizeof(self->build_id), + build_id__sprintf(dso->build_id, + sizeof(dso->build_id), build_id_hex); snprintf(name, size, "%s/usr/lib/debug/.build-id/%.2s/%s.debug", @@ -1525,7 +1530,7 @@ restart: break; case SYMTAB__SYSTEM_PATH_DSO: snprintf(name, size, "%s%s", - symbol_conf.symfs, self->long_name); + symbol_conf.symfs, dso->long_name); break; case SYMTAB__GUEST_KMODULE: if (map->groups && machine) @@ -1533,12 +1538,12 @@ restart: else root_dir = ""; snprintf(name, size, "%s%s%s", symbol_conf.symfs, - root_dir, self->long_name); + root_dir, dso->long_name); break; case SYMTAB__SYSTEM_PATH_KMODULE: snprintf(name, size, "%s%s", symbol_conf.symfs, - self->long_name); + dso->long_name); break; default:; } @@ -1548,7 +1553,7 @@ restart: if (fd < 0) continue; - ret = dso__load_sym(self, map, name, fd, filter, 0, + ret = dso__load_sym(dso, map, name, fd, filter, 0, want_symtab); close(fd); @@ -1560,7 +1565,8 @@ restart: continue; if (ret > 0) { - int nr_plt = dso__synthesize_plt_symbols(self, map, filter); + int nr_plt = dso__synthesize_plt_symbols(dso, map, + filter); if (nr_plt > 0) ret += nr_plt; break; @@ -1577,17 +1583,17 @@ restart: } free(name); - if (ret < 0 && strstr(self->name, " (deleted)") != NULL) + if (ret < 0 && strstr(dso->name, " (deleted)") != NULL) return 0; return ret; } -struct map *map_groups__find_by_name(struct map_groups *self, +struct map *map_groups__find_by_name(struct map_groups *mg, enum map_type type, const char *name) { struct rb_node *nd; - for (nd = rb_first(&self->maps[type]); nd; nd = rb_next(nd)) { + for (nd = rb_first(&mg->maps[type]); nd; nd = rb_next(nd)) { struct map *map = rb_entry(nd, struct map, rb_node); if (map->dso && strcmp(map->dso->short_name, name) == 0) @@ -1597,28 +1603,28 @@ struct map *map_groups__find_by_name(struct map_groups *self, return NULL; } -static int dso__kernel_module_get_build_id(struct dso *self, - const char *root_dir) +static int dso__kernel_module_get_build_id(struct dso *dso, + const char *root_dir) { char filename[PATH_MAX]; /* * kernel module short names are of the form "[module]" and * we need just "module" here. */ - const char *name = self->short_name + 1; + const char *name = dso->short_name + 1; snprintf(filename, sizeof(filename), "%s/sys/module/%.*s/notes/.note.gnu.build-id", root_dir, (int)strlen(name) - 1, name); - if (sysfs__read_build_id(filename, self->build_id, - sizeof(self->build_id)) == 0) - self->has_build_id = true; + if (sysfs__read_build_id(filename, dso->build_id, + sizeof(dso->build_id)) == 0) + dso->has_build_id = true; return 0; } -static int map_groups__set_modules_path_dir(struct map_groups *self, +static int map_groups__set_modules_path_dir(struct map_groups *mg, const char *dir_name) { struct dirent *dent; @@ -1646,7 +1652,7 @@ static int map_groups__set_modules_path_dir(struct map_groups *self, snprintf(path, sizeof(path), "%s/%s", dir_name, dent->d_name); - ret = map_groups__set_modules_path_dir(self, path); + ret = map_groups__set_modules_path_dir(mg, path); if (ret < 0) goto out; } else { @@ -1661,7 +1667,8 @@ static int map_groups__set_modules_path_dir(struct map_groups *self, (int)(dot - dent->d_name), dent->d_name); strxfrchar(dso_name, '-', '_'); - map = map_groups__find_by_name(self, MAP__FUNCTION, dso_name); + map = map_groups__find_by_name(mg, MAP__FUNCTION, + dso_name); if (map == NULL) continue; @@ -1711,20 +1718,20 @@ static char *get_kernel_version(const char *root_dir) return strdup(name); } -static int machine__set_modules_path(struct machine *self) +static int machine__set_modules_path(struct machine *machine) { char *version; char modules_path[PATH_MAX]; - version = get_kernel_version(self->root_dir); + version = get_kernel_version(machine->root_dir); if (!version) return -1; snprintf(modules_path, sizeof(modules_path), "%s/lib/modules/%s/kernel", - self->root_dir, version); + machine->root_dir, version); free(version); - return map_groups__set_modules_path_dir(&self->kmaps, modules_path); + return map_groups__set_modules_path_dir(&machine->kmaps, modules_path); } /* @@ -1734,23 +1741,23 @@ static int machine__set_modules_path(struct machine *self) */ static struct map *map__new2(u64 start, struct dso *dso, enum map_type type) { - struct map *self = calloc(1, (sizeof(*self) + - (dso->kernel ? sizeof(struct kmap) : 0))); - if (self != NULL) { + struct map *map = calloc(1, (sizeof(*map) + + (dso->kernel ? sizeof(struct kmap) : 0))); + if (map != NULL) { /* * ->end will be filled after we load all the symbols */ - map__init(self, type, start, 0, 0, dso); + map__init(map, type, start, 0, 0, dso); } - return self; + return map; } -struct map *machine__new_module(struct machine *self, u64 start, +struct map *machine__new_module(struct machine *machine, u64 start, const char *filename) { struct map *map; - struct dso *dso = __dsos__findnew(&self->kernel_dsos, filename); + struct dso *dso = __dsos__findnew(&machine->kernel_dsos, filename); if (dso == NULL) return NULL; @@ -1759,15 +1766,15 @@ struct map *machine__new_module(struct machine *self, u64 start, if (map == NULL) return NULL; - if (machine__is_host(self)) + if (machine__is_host(machine)) dso->symtab_type = SYMTAB__SYSTEM_PATH_KMODULE; else dso->symtab_type = SYMTAB__GUEST_KMODULE; - map_groups__insert(&self->kmaps, map); + map_groups__insert(&machine->kmaps, map); return map; } -static int machine__create_modules(struct machine *self) +static int machine__create_modules(struct machine *machine) { char *line = NULL; size_t n; @@ -1776,10 +1783,10 @@ static int machine__create_modules(struct machine *self) const char *modules; char path[PATH_MAX]; - if (machine__is_default_guest(self)) + if (machine__is_default_guest(machine)) modules = symbol_conf.default_guest_modules; else { - sprintf(path, "%s/proc/modules", self->root_dir); + sprintf(path, "%s/proc/modules", machine->root_dir); modules = path; } @@ -1815,16 +1822,16 @@ static int machine__create_modules(struct machine *self) *sep = '\0'; snprintf(name, sizeof(name), "[%s]", line); - map = machine__new_module(self, start, name); + map = machine__new_module(machine, start, name); if (map == NULL) goto out_delete_line; - dso__kernel_module_get_build_id(map->dso, self->root_dir); + dso__kernel_module_get_build_id(map->dso, machine->root_dir); } free(line); fclose(file); - return machine__set_modules_path(self); + return machine__set_modules_path(machine); out_delete_line: free(line); @@ -1832,7 +1839,7 @@ out_failure: return -1; } -int dso__load_vmlinux(struct dso *self, struct map *map, +int dso__load_vmlinux(struct dso *dso, struct map *map, const char *vmlinux, symbol_filter_t filter) { int err = -1, fd; @@ -1844,9 +1851,9 @@ int dso__load_vmlinux(struct dso *self, struct map *map, if (fd < 0) return -1; - dso__set_long_name(self, (char *)vmlinux); - dso__set_loaded(self, map->type); - err = dso__load_sym(self, map, symfs_vmlinux, fd, filter, 0, 0); + dso__set_long_name(dso, (char *)vmlinux); + dso__set_loaded(dso, map->type); + err = dso__load_sym(dso, map, symfs_vmlinux, fd, filter, 0, 0); close(fd); if (err > 0) @@ -1855,7 +1862,7 @@ int dso__load_vmlinux(struct dso *self, struct map *map, return err; } -int dso__load_vmlinux_path(struct dso *self, struct map *map, +int dso__load_vmlinux_path(struct dso *dso, struct map *map, symbol_filter_t filter) { int i, err = 0; @@ -1864,20 +1871,20 @@ int dso__load_vmlinux_path(struct dso *self, struct map *map, pr_debug("Looking at the vmlinux_path (%d entries long)\n", vmlinux_path__nr_entries + 1); - filename = dso__build_id_filename(self, NULL, 0); + filename = dso__build_id_filename(dso, NULL, 0); if (filename != NULL) { - err = dso__load_vmlinux(self, map, filename, filter); + err = dso__load_vmlinux(dso, map, filename, filter); if (err > 0) { - dso__set_long_name(self, filename); + dso__set_long_name(dso, filename); goto out; } free(filename); } for (i = 0; i < vmlinux_path__nr_entries; ++i) { - err = dso__load_vmlinux(self, map, vmlinux_path[i], filter); + err = dso__load_vmlinux(dso, map, vmlinux_path[i], filter); if (err > 0) { - dso__set_long_name(self, strdup(vmlinux_path[i])); + dso__set_long_name(dso, strdup(vmlinux_path[i])); break; } } @@ -1885,7 +1892,7 @@ out: return err; } -static int dso__load_kernel_sym(struct dso *self, struct map *map, +static int dso__load_kernel_sym(struct dso *dso, struct map *map, symbol_filter_t filter) { int err; @@ -1912,10 +1919,10 @@ static int dso__load_kernel_sym(struct dso *self, struct map *map, } if (symbol_conf.vmlinux_name != NULL) { - err = dso__load_vmlinux(self, map, + err = dso__load_vmlinux(dso, map, symbol_conf.vmlinux_name, filter); if (err > 0) { - dso__set_long_name(self, + dso__set_long_name(dso, strdup(symbol_conf.vmlinux_name)); goto out_fixup; } @@ -1923,7 +1930,7 @@ static int dso__load_kernel_sym(struct dso *self, struct map *map, } if (vmlinux_path != NULL) { - err = dso__load_vmlinux_path(self, map, filter); + err = dso__load_vmlinux_path(dso, map, filter); if (err > 0) goto out_fixup; } @@ -1937,13 +1944,13 @@ static int dso__load_kernel_sym(struct dso *self, struct map *map, * we have a build-id, so check if it is the same as the running kernel, * using it if it is. */ - if (self->has_build_id) { + if (dso->has_build_id) { u8 kallsyms_build_id[BUILD_ID_SIZE]; char sbuild_id[BUILD_ID_SIZE * 2 + 1]; if (sysfs__read_build_id("/sys/kernel/notes", kallsyms_build_id, sizeof(kallsyms_build_id)) == 0) { - if (dso__build_id_equal(self, kallsyms_build_id)) { + if (dso__build_id_equal(dso, kallsyms_build_id)) { kallsyms_filename = "/proc/kallsyms"; goto do_kallsyms; } @@ -1952,7 +1959,7 @@ static int dso__load_kernel_sym(struct dso *self, struct map *map, * Now look if we have it on the build-id cache in * $HOME/.debug/[kernel.kallsyms]. */ - build_id__sprintf(self->build_id, sizeof(self->build_id), + build_id__sprintf(dso->build_id, sizeof(dso->build_id), sbuild_id); if (asprintf(&kallsyms_allocated_filename, @@ -1979,7 +1986,7 @@ static int dso__load_kernel_sym(struct dso *self, struct map *map, } do_kallsyms: - err = dso__load_kallsyms(self, kallsyms_filename, map, filter); + err = dso__load_kallsyms(dso, kallsyms_filename, map, filter); if (err > 0) pr_debug("Using %s for symbols\n", kallsyms_filename); free(kallsyms_allocated_filename); @@ -1987,7 +1994,7 @@ do_kallsyms: if (err > 0) { out_fixup: if (kallsyms_filename != NULL) - dso__set_long_name(self, strdup("[kernel.kallsyms]")); + dso__set_long_name(dso, strdup("[kernel.kallsyms]")); map__fixup_start(map); map__fixup_end(map); } @@ -1995,8 +2002,8 @@ out_fixup: return err; } -static int dso__load_guest_kernel_sym(struct dso *self, struct map *map, - symbol_filter_t filter) +static int dso__load_guest_kernel_sym(struct dso *dso, struct map *map, + symbol_filter_t filter) { int err; const char *kallsyms_filename = NULL; @@ -2016,7 +2023,7 @@ static int dso__load_guest_kernel_sym(struct dso *self, struct map *map, * Or use file guest_kallsyms inputted by user on commandline */ if (symbol_conf.default_guest_vmlinux_name != NULL) { - err = dso__load_vmlinux(self, map, + err = dso__load_vmlinux(dso, map, symbol_conf.default_guest_vmlinux_name, filter); goto out_try_fixup; } @@ -2029,7 +2036,7 @@ static int dso__load_guest_kernel_sym(struct dso *self, struct map *map, kallsyms_filename = path; } - err = dso__load_kallsyms(self, kallsyms_filename, map, filter); + err = dso__load_kallsyms(dso, kallsyms_filename, map, filter); if (err > 0) pr_debug("Using %s for symbols\n", kallsyms_filename); @@ -2037,7 +2044,7 @@ out_try_fixup: if (err > 0) { if (kallsyms_filename != NULL) { machine__mmap_name(machine, path, sizeof(path)); - dso__set_long_name(self, strdup(path)); + dso__set_long_name(dso, strdup(path)); } map__fixup_start(map); map__fixup_end(map); @@ -2090,12 +2097,12 @@ size_t __dsos__fprintf(struct list_head *head, FILE *fp) return ret; } -size_t machines__fprintf_dsos(struct rb_root *self, FILE *fp) +size_t machines__fprintf_dsos(struct rb_root *machines, FILE *fp) { struct rb_node *nd; size_t ret = 0; - for (nd = rb_first(self); nd; nd = rb_next(nd)) { + for (nd = rb_first(machines); nd; nd = rb_next(nd)) { struct machine *pos = rb_entry(nd, struct machine, rb_node); ret += __dsos__fprintf(&pos->kernel_dsos, fp); ret += __dsos__fprintf(&pos->user_dsos, fp); @@ -2119,18 +2126,20 @@ static size_t __dsos__fprintf_buildid(struct list_head *head, FILE *fp, return ret; } -size_t machine__fprintf_dsos_buildid(struct machine *self, FILE *fp, bool with_hits) +size_t machine__fprintf_dsos_buildid(struct machine *machine, FILE *fp, + bool with_hits) { - return __dsos__fprintf_buildid(&self->kernel_dsos, fp, with_hits) + - __dsos__fprintf_buildid(&self->user_dsos, fp, with_hits); + return __dsos__fprintf_buildid(&machine->kernel_dsos, fp, with_hits) + + __dsos__fprintf_buildid(&machine->user_dsos, fp, with_hits); } -size_t machines__fprintf_dsos_buildid(struct rb_root *self, FILE *fp, bool with_hits) +size_t machines__fprintf_dsos_buildid(struct rb_root *machines, + FILE *fp, bool with_hits) { struct rb_node *nd; size_t ret = 0; - for (nd = rb_first(self); nd; nd = rb_next(nd)) { + for (nd = rb_first(machines); nd; nd = rb_next(nd)) { struct machine *pos = rb_entry(nd, struct machine, rb_node); ret += machine__fprintf_dsos_buildid(pos, fp, with_hits); } @@ -2139,59 +2148,59 @@ size_t machines__fprintf_dsos_buildid(struct rb_root *self, FILE *fp, bool with_ struct dso *dso__new_kernel(const char *name) { - struct dso *self = dso__new(name ?: "[kernel.kallsyms]"); + struct dso *dso = dso__new(name ?: "[kernel.kallsyms]"); - if (self != NULL) { - dso__set_short_name(self, "[kernel]"); - self->kernel = DSO_TYPE_KERNEL; + if (dso != NULL) { + dso__set_short_name(dso, "[kernel]"); + dso->kernel = DSO_TYPE_KERNEL; } - return self; + return dso; } static struct dso *dso__new_guest_kernel(struct machine *machine, const char *name) { char bf[PATH_MAX]; - struct dso *self = dso__new(name ?: machine__mmap_name(machine, bf, sizeof(bf))); - - if (self != NULL) { - dso__set_short_name(self, "[guest.kernel]"); - self->kernel = DSO_TYPE_GUEST_KERNEL; + struct dso *dso = dso__new(name ?: machine__mmap_name(machine, bf, + sizeof(bf))); + if (dso != NULL) { + dso__set_short_name(dso, "[guest.kernel]"); + dso->kernel = DSO_TYPE_GUEST_KERNEL; } - return self; + return dso; } -void dso__read_running_kernel_build_id(struct dso *self, struct machine *machine) +void dso__read_running_kernel_build_id(struct dso *dso, struct machine *machine) { char path[PATH_MAX]; if (machine__is_default_guest(machine)) return; sprintf(path, "%s/sys/kernel/notes", machine->root_dir); - if (sysfs__read_build_id(path, self->build_id, - sizeof(self->build_id)) == 0) - self->has_build_id = true; + if (sysfs__read_build_id(path, dso->build_id, + sizeof(dso->build_id)) == 0) + dso->has_build_id = true; } -static struct dso *machine__create_kernel(struct machine *self) +static struct dso *machine__create_kernel(struct machine *machine) { const char *vmlinux_name = NULL; struct dso *kernel; - if (machine__is_host(self)) { + if (machine__is_host(machine)) { vmlinux_name = symbol_conf.vmlinux_name; kernel = dso__new_kernel(vmlinux_name); } else { - if (machine__is_default_guest(self)) + if (machine__is_default_guest(machine)) vmlinux_name = symbol_conf.default_guest_vmlinux_name; - kernel = dso__new_guest_kernel(self, vmlinux_name); + kernel = dso__new_guest_kernel(machine, vmlinux_name); } if (kernel != NULL) { - dso__read_running_kernel_build_id(kernel, self); - dsos__add(&self->kernel_dsos, kernel); + dso__read_running_kernel_build_id(kernel, machine); + dsos__add(&machine->kernel_dsos, kernel); } return kernel; } @@ -2236,41 +2245,43 @@ static u64 machine__get_kernel_start_addr(struct machine *machine) return args.start; } -int __machine__create_kernel_maps(struct machine *self, struct dso *kernel) +int __machine__create_kernel_maps(struct machine *machine, struct dso *kernel) { enum map_type type; - u64 start = machine__get_kernel_start_addr(self); + u64 start = machine__get_kernel_start_addr(machine); for (type = 0; type < MAP__NR_TYPES; ++type) { struct kmap *kmap; - self->vmlinux_maps[type] = map__new2(start, kernel, type); - if (self->vmlinux_maps[type] == NULL) + machine->vmlinux_maps[type] = map__new2(start, kernel, type); + if (machine->vmlinux_maps[type] == NULL) return -1; - self->vmlinux_maps[type]->map_ip = - self->vmlinux_maps[type]->unmap_ip = identity__map_ip; - - kmap = map__kmap(self->vmlinux_maps[type]); - kmap->kmaps = &self->kmaps; - map_groups__insert(&self->kmaps, self->vmlinux_maps[type]); + machine->vmlinux_maps[type]->map_ip = + machine->vmlinux_maps[type]->unmap_ip = + identity__map_ip; + kmap = map__kmap(machine->vmlinux_maps[type]); + kmap->kmaps = &machine->kmaps; + map_groups__insert(&machine->kmaps, + machine->vmlinux_maps[type]); } return 0; } -void machine__destroy_kernel_maps(struct machine *self) +void machine__destroy_kernel_maps(struct machine *machine) { enum map_type type; for (type = 0; type < MAP__NR_TYPES; ++type) { struct kmap *kmap; - if (self->vmlinux_maps[type] == NULL) + if (machine->vmlinux_maps[type] == NULL) continue; - kmap = map__kmap(self->vmlinux_maps[type]); - map_groups__remove(&self->kmaps, self->vmlinux_maps[type]); + kmap = map__kmap(machine->vmlinux_maps[type]); + map_groups__remove(&machine->kmaps, + machine->vmlinux_maps[type]); if (kmap->ref_reloc_sym) { /* * ref_reloc_sym is shared among all maps, so free just @@ -2284,25 +2295,25 @@ void machine__destroy_kernel_maps(struct machine *self) kmap->ref_reloc_sym = NULL; } - map__delete(self->vmlinux_maps[type]); - self->vmlinux_maps[type] = NULL; + map__delete(machine->vmlinux_maps[type]); + machine->vmlinux_maps[type] = NULL; } } -int machine__create_kernel_maps(struct machine *self) +int machine__create_kernel_maps(struct machine *machine) { - struct dso *kernel = machine__create_kernel(self); + struct dso *kernel = machine__create_kernel(machine); if (kernel == NULL || - __machine__create_kernel_maps(self, kernel) < 0) + __machine__create_kernel_maps(machine, kernel) < 0) return -1; - if (symbol_conf.use_modules && machine__create_modules(self) < 0) + if (symbol_conf.use_modules && machine__create_modules(machine) < 0) pr_debug("Problems creating module maps, continuing anyway...\n"); /* * Now that we have all the maps created, just set the ->end of them: */ - map_groups__fixup_end(&self->kmaps); + map_groups__fixup_end(&machine->kmaps); return 0; } @@ -2366,11 +2377,11 @@ out_fail: return -1; } -size_t machine__fprintf_vmlinux_path(struct machine *self, FILE *fp) +size_t machine__fprintf_vmlinux_path(struct machine *machine, FILE *fp) { int i; size_t printed = 0; - struct dso *kdso = self->vmlinux_maps[MAP__FUNCTION]->dso; + struct dso *kdso = machine->vmlinux_maps[MAP__FUNCTION]->dso; if (kdso->has_build_id) { char filename[PATH_MAX]; @@ -2467,9 +2478,9 @@ void symbol__exit(void) symbol_conf.initialized = false; } -int machines__create_kernel_maps(struct rb_root *self, pid_t pid) +int machines__create_kernel_maps(struct rb_root *machines, pid_t pid) { - struct machine *machine = machines__findnew(self, pid); + struct machine *machine = machines__findnew(machines, pid); if (machine == NULL) return -1; @@ -2520,7 +2531,7 @@ char *strxfrchar(char *s, char from, char to) return s; } -int machines__create_guest_kernel_maps(struct rb_root *self) +int machines__create_guest_kernel_maps(struct rb_root *machines) { int ret = 0; struct dirent **namelist = NULL; @@ -2531,7 +2542,7 @@ int machines__create_guest_kernel_maps(struct rb_root *self) if (symbol_conf.default_guest_vmlinux_name || symbol_conf.default_guest_modules || symbol_conf.default_guest_kallsyms) { - machines__create_kernel_maps(self, DEFAULT_GUEST_KERNEL_ID); + machines__create_kernel_maps(machines, DEFAULT_GUEST_KERNEL_ID); } if (symbol_conf.guestmount) { @@ -2552,7 +2563,7 @@ int machines__create_guest_kernel_maps(struct rb_root *self) pr_debug("Can't access file %s\n", path); goto failure; } - machines__create_kernel_maps(self, pid); + machines__create_kernel_maps(machines, pid); } failure: free(namelist); @@ -2561,23 +2572,23 @@ failure: return ret; } -void machines__destroy_guest_kernel_maps(struct rb_root *self) +void machines__destroy_guest_kernel_maps(struct rb_root *machines) { - struct rb_node *next = rb_first(self); + struct rb_node *next = rb_first(machines); while (next) { struct machine *pos = rb_entry(next, struct machine, rb_node); next = rb_next(&pos->rb_node); - rb_erase(&pos->rb_node, self); + rb_erase(&pos->rb_node, machines); machine__delete(pos); } } -int machine__load_kallsyms(struct machine *self, const char *filename, +int machine__load_kallsyms(struct machine *machine, const char *filename, enum map_type type, symbol_filter_t filter) { - struct map *map = self->vmlinux_maps[type]; + struct map *map = machine->vmlinux_maps[type]; int ret = dso__load_kallsyms(map->dso, filename, map, filter); if (ret > 0) { @@ -2587,16 +2598,16 @@ int machine__load_kallsyms(struct machine *self, const char *filename, * kernel, with modules between them, fixup the end of all * sections. */ - __map_groups__fixup_end(&self->kmaps, type); + __map_groups__fixup_end(&machine->kmaps, type); } return ret; } -int machine__load_vmlinux_path(struct machine *self, enum map_type type, +int machine__load_vmlinux_path(struct machine *machine, enum map_type type, symbol_filter_t filter) { - struct map *map = self->vmlinux_maps[type]; + struct map *map = machine->vmlinux_maps[type]; int ret = dso__load_vmlinux_path(map->dso, map, filter); if (ret > 0) { diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h index 713b0b4..242de01 100644 --- a/tools/perf/util/symbol.h +++ b/tools/perf/util/symbol.h @@ -62,7 +62,7 @@ struct symbol { char name[0]; }; -void symbol__delete(struct symbol *self); +void symbol__delete(struct symbol *sym); struct strlist; @@ -96,9 +96,9 @@ struct symbol_conf { extern struct symbol_conf symbol_conf; -static inline void *symbol__priv(struct symbol *self) +static inline void *symbol__priv(struct symbol *sym) { - return ((void *)self) - symbol_conf.priv_size; + return ((void *)sym) - symbol_conf.priv_size; } struct ref_reloc_sym { @@ -155,43 +155,45 @@ struct dso { struct dso *dso__new(const char *name); struct dso *dso__new_kernel(const char *name); -void dso__delete(struct dso *self); +void dso__delete(struct dso *dso); -int dso__name_len(const struct dso *self); +int dso__name_len(const struct dso *dso); -bool dso__loaded(const struct dso *self, enum map_type type); -bool dso__sorted_by_name(const struct dso *self, enum map_type type); +bool dso__loaded(const struct dso *dso, enum map_type type); +bool dso__sorted_by_name(const struct dso *dso, enum map_type type); -static inline void dso__set_loaded(struct dso *self, enum map_type type) +static inline void dso__set_loaded(struct dso *dso, enum map_type type) { - self->loaded |= (1 << type); + dso->loaded |= (1 << type); } -void dso__sort_by_name(struct dso *self, enum map_type type); +void dso__sort_by_name(struct dso *dso, enum map_type type); struct dso *__dsos__findnew(struct list_head *head, const char *name); -int dso__load(struct dso *self, struct map *map, symbol_filter_t filter); -int dso__load_vmlinux(struct dso *self, struct map *map, +int dso__load(struct dso *dso, struct map *map, symbol_filter_t filter); +int dso__load_vmlinux(struct dso *dso, struct map *map, const char *vmlinux, symbol_filter_t filter); -int dso__load_vmlinux_path(struct dso *self, struct map *map, +int dso__load_vmlinux_path(struct dso *dso, struct map *map, symbol_filter_t filter); -int dso__load_kallsyms(struct dso *self, const char *filename, struct map *map, +int dso__load_kallsyms(struct dso *dso, const char *filename, struct map *map, symbol_filter_t filter); -int machine__load_kallsyms(struct machine *self, const char *filename, +int machine__load_kallsyms(struct machine *machine, const char *filename, enum map_type type, symbol_filter_t filter); -int machine__load_vmlinux_path(struct machine *self, enum map_type type, +int machine__load_vmlinux_path(struct machine *machine, enum map_type type, symbol_filter_t filter); size_t __dsos__fprintf(struct list_head *head, FILE *fp); -size_t machine__fprintf_dsos_buildid(struct machine *self, FILE *fp, bool with_hits); -size_t machines__fprintf_dsos(struct rb_root *self, FILE *fp); -size_t machines__fprintf_dsos_buildid(struct rb_root *self, FILE *fp, bool with_hits); - -size_t dso__fprintf_buildid(struct dso *self, FILE *fp); -size_t dso__fprintf_symbols_by_name(struct dso *self, enum map_type type, FILE *fp); -size_t dso__fprintf(struct dso *self, enum map_type type, FILE *fp); +size_t machine__fprintf_dsos_buildid(struct machine *machine, + FILE *fp, bool with_hits); +size_t machines__fprintf_dsos(struct rb_root *machines, FILE *fp); +size_t machines__fprintf_dsos_buildid(struct rb_root *machines, + FILE *fp, bool with_hits); +size_t dso__fprintf_buildid(struct dso *dso, FILE *fp); +size_t dso__fprintf_symbols_by_name(struct dso *dso, + enum map_type type, FILE *fp); +size_t dso__fprintf(struct dso *dso, enum map_type type, FILE *fp); enum symtab_type { SYMTAB__KALLSYMS = 0, @@ -207,34 +209,36 @@ enum symtab_type { SYMTAB__NOT_FOUND, }; -char dso__symtab_origin(const struct dso *self); -void dso__set_long_name(struct dso *self, char *name); -void dso__set_build_id(struct dso *self, void *build_id); -void dso__read_running_kernel_build_id(struct dso *self, struct machine *machine); -struct symbol *dso__find_symbol(struct dso *self, enum map_type type, u64 addr); -struct symbol *dso__find_symbol_by_name(struct dso *self, enum map_type type, +char dso__symtab_origin(const struct dso *dso); +void dso__set_long_name(struct dso *dso, char *name); +void dso__set_build_id(struct dso *dso, void *build_id); +void dso__read_running_kernel_build_id(struct dso *dso, + struct machine *machine); +struct symbol *dso__find_symbol(struct dso *dso, enum map_type type, + u64 addr); +struct symbol *dso__find_symbol_by_name(struct dso *dso, enum map_type type, const char *name); int filename__read_build_id(const char *filename, void *bf, size_t size); int sysfs__read_build_id(const char *filename, void *bf, size_t size); bool __dsos__read_build_ids(struct list_head *head, bool with_hits); -int build_id__sprintf(const u8 *self, int len, char *bf); +int build_id__sprintf(const u8 *build_id, int len, char *bf); int kallsyms__parse(const char *filename, void *arg, int (*process_symbol)(void *arg, const char *name, char type, u64 start, u64 end)); -void machine__destroy_kernel_maps(struct machine *self); -int __machine__create_kernel_maps(struct machine *self, struct dso *kernel); -int machine__create_kernel_maps(struct machine *self); +void machine__destroy_kernel_maps(struct machine *machine); +int __machine__create_kernel_maps(struct machine *machine, struct dso *kernel); +int machine__create_kernel_maps(struct machine *machine); -int machines__create_kernel_maps(struct rb_root *self, pid_t pid); -int machines__create_guest_kernel_maps(struct rb_root *self); -void machines__destroy_guest_kernel_maps(struct rb_root *self); +int machines__create_kernel_maps(struct rb_root *machines, pid_t pid); +int machines__create_guest_kernel_maps(struct rb_root *machines); +void machines__destroy_guest_kernel_maps(struct rb_root *machines); int symbol__init(void); void symbol__exit(void); bool symbol_type__is_a(char symbol_type, enum map_type map_type); -size_t machine__fprintf_vmlinux_path(struct machine *self, FILE *fp); +size_t machine__fprintf_vmlinux_path(struct machine *machine, FILE *fp); #endif /* __PERF_SYMBOL */ -- cgit v1.1 From 3643b133f2cb8023e8cedcbef43215a99d7df561 Mon Sep 17 00:00:00 2001 From: Michael Witten Date: Sat, 9 Apr 2011 01:12:56 +0000 Subject: perf tools: Makefile: Clean up `python/perf.so' rule There is no need for a subshell or an explicit `export'; as per the POSIX Shell Command Language specification: http://pubs.opengroup.org/onlinepubs/009695399/utilities/xcu_chap02.html#tag_02_09_01 http://pubs.opengroup.org/onlinepubs/009695399/utilities/xcu_chap02.html#tag_02_10_02 It is only necessary to include the environment variable assignment just before the command to be run. Also, it is better to use single-quotes, because GNU make might expand `$(BASIC_CFLAGS)' into something that the shell could interpret within double-quotes. Acked-by: Raghavendra D Prabhu Link: http://lkml.kernel.org/n/tip-58n38o02ocuzrm9qh096hsf5@git.kernel.org Signed-off-by: Michael Witten Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Makefile | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/tools/perf/Makefile b/tools/perf/Makefile index 207dee5..aaf4dd3 100644 --- a/tools/perf/Makefile +++ b/tools/perf/Makefile @@ -165,12 +165,10 @@ grep-libs = $(filter -l%,$(1)) strip-libs = $(filter-out -l%,$(1)) $(OUTPUT)python/perf.so: $(PYRF_OBJS) - $(QUIET_GEN)( \ - export CFLAGS="$(BASIC_CFLAGS)"; \ - python util/setup.py --quiet build_ext --build-lib='$(OUTPUT)python' \ - --build-temp='$(OUTPUT)python/temp' \ - ) - + $(QUIET_GEN)CFLAGS='$(BASIC_CFLAGS)' python util/setup.py \ + --quiet build_ext \ + --build-lib='$(OUTPUT)python' \ + --build-temp='$(OUTPUT)python/temp' # # No Perl scripts right now: # -- cgit v1.1 From ced465c400b23656ef2c4fbfb4add0e5b92e3d97 Mon Sep 17 00:00:00 2001 From: Michael Witten Date: Sat, 2 Apr 2011 21:46:09 +0000 Subject: perf tools: Makefile: PYTHON{,_CONFIG} to bandage Python 3 incompatibility Currently, Python 3 is not supported by perf's code; this can cause the build to fail for systems that have Python 3 installed as the default python: python{,-config} The Correct Solution is to write compatibility code so that Python 3 works out-of-the-box. However, users often have an ancillary Python 2 installed: python2{,-config} Therefore, a quick fix is to allow the user to specify those ancillary paths as the python binaries that Makefile should use, thereby avoiding Python 3 altogether; as an added benefit, the Python binaries may be installed in non-standard locations without the need for updating any PATH variable. This commit adds the ability to set PYTHON and/or PYTHON_CONFIG either as environment variables or as make variables on the command line; the paths may be relative, and usually only PYTHON is necessary in order for PYTHON_CONFIG to be defined implicitly. Some rudimentary error checking is performed when the user explicitly specifies a value for any of these variables. In addition, this commit introduces significantly robust makefile infrastructure for working with paths and communicating with the shell; it's currently only used for handling Python, but I hope it will prove useful in refactoring the makefiles. Thanks to: Raghavendra D Prabhu for motivating this patch. Acked-by: Raghavendra D Prabhu Link: http://lkml.kernel.org/r/e987828e-87ec-4973-95e7-47f10f5d9bab-mfwitten@gmail.com Signed-off-by: Michael Witten Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Makefile | 97 +++++++++++++++++----- tools/perf/config/utilities.mak | 180 ++++++++++++++++++++++++++++++++++++++++ tools/perf/feature-tests.mak | 8 +- 3 files changed, 264 insertions(+), 21 deletions(-) create mode 100644 tools/perf/config/utilities.mak diff --git a/tools/perf/Makefile b/tools/perf/Makefile index aaf4dd3..b5276c7 100644 --- a/tools/perf/Makefile +++ b/tools/perf/Makefile @@ -5,6 +5,8 @@ endif # The default target of this Makefile is... all: +include config/utilities.mak + ifneq ($(OUTPUT),) # check that the output directory actually exists OUTDIR := $(shell cd $(OUTPUT) && /bin/pwd) @@ -13,6 +15,12 @@ endif # Define V to have a more verbose compile. # +# Define PYTHON to point to the python binary if the default +# `python' is not correct; for example: PYTHON=python2 +# +# Define PYTHON_CONFIG to point to the python-config binary if +# the default `$(PYTHON)-config' is not correct. +# # Define ASCIIDOC8 if you want to format documentation with AsciiDoc 8 # # Define DOCBOOK_XSL_172 if you want to format man pages with DocBook XSL v1.72. @@ -165,7 +173,7 @@ grep-libs = $(filter -l%,$(1)) strip-libs = $(filter-out -l%,$(1)) $(OUTPUT)python/perf.so: $(PYRF_OBJS) - $(QUIET_GEN)CFLAGS='$(BASIC_CFLAGS)' python util/setup.py \ + $(QUIET_GEN)CFLAGS='$(BASIC_CFLAGS)' $(PYTHON_WORD) util/setup.py \ --quiet build_ext \ --build-lib='$(OUTPUT)python' \ --build-temp='$(OUTPUT)python/temp' @@ -473,24 +481,74 @@ else endif endif -ifdef NO_LIBPYTHON - BASIC_CFLAGS += -DNO_LIBPYTHON +disable-python = $(eval $(disable-python_code)) +define disable-python_code + BASIC_CFLAGS += -DNO_LIBPYTHON + $(if $(1),$(warning No $(1) was found)) + $(warning Python support won't be built) +endef + +override PYTHON := \ + $(call get-executable-or-default,PYTHON,python) + +ifndef PYTHON + $(call disable-python,python interpreter) + python-clean := else - PYTHON_EMBED_LDOPTS = $(shell python-config --ldflags 2>/dev/null) - PYTHON_EMBED_LDFLAGS = $(call strip-libs,$(PYTHON_EMBED_LDOPTS)) - PYTHON_EMBED_LIBADD = $(call grep-libs,$(PYTHON_EMBED_LDOPTS)) - PYTHON_EMBED_CCOPTS = `python-config --cflags 2>/dev/null` - FLAGS_PYTHON_EMBED=$(PYTHON_EMBED_CCOPTS) $(PYTHON_EMBED_LDOPTS) - ifneq ($(call try-cc,$(SOURCE_PYTHON_EMBED),$(FLAGS_PYTHON_EMBED)),y) - msg := $(warning No Python.h found, install python-dev[el] to have python support in 'perf script' and to build the python bindings) - BASIC_CFLAGS += -DNO_LIBPYTHON - else - ALL_LDFLAGS += $(PYTHON_EMBED_LDFLAGS) - EXTLIBS += $(PYTHON_EMBED_LIBADD) - LIB_OBJS += $(OUTPUT)util/scripting-engines/trace-event-python.o - LIB_OBJS += $(OUTPUT)scripts/python/Perf-Trace-Util/Context.o - LANG_BINDINGS += $(OUTPUT)python/perf.so - endif + + PYTHON_WORD := $(call shell-wordify,$(PYTHON)) + + python-clean := $(PYTHON_WORD) util/setup.py clean \ + --build-lib='$(OUTPUT)python' \ + --build-temp='$(OUTPUT)python/temp' + + ifdef NO_LIBPYTHON + $(call disable-python) + else + + override PYTHON_CONFIG := \ + $(call get-executable-or-default,PYTHON_CONFIG,$(PYTHON)-config) + + ifndef PYTHON_CONFIG + $(call disable-python,python-config tool) + else + + PYTHON_CONFIG_SQ := $(call shell-sq,$(PYTHON_CONFIG)) + + PYTHON_EMBED_LDOPTS := $(shell $(PYTHON_CONFIG_SQ) --ldflags 2>/dev/null) + PYTHON_EMBED_LDFLAGS := $(call strip-libs,$(PYTHON_EMBED_LDOPTS)) + PYTHON_EMBED_LIBADD := $(call grep-libs,$(PYTHON_EMBED_LDOPTS)) + PYTHON_EMBED_CCOPTS := $(shell $(PYTHON_CONFIG_SQ) --cflags 2>/dev/null) + FLAGS_PYTHON_EMBED := $(PYTHON_EMBED_CCOPTS) $(PYTHON_EMBED_LDOPTS) + + ifneq ($(call try-cc,$(SOURCE_PYTHON_EMBED),$(FLAGS_PYTHON_EMBED)),y) + $(call disable-python,Python.h (for Python 2.x)) + else + + ifneq ($(call try-cc,$(SOURCE_PYTHON_VERSION),$(FLAGS_PYTHON_EMBED)),y) + $(warning Python 3 is not yet supported; please set) + $(warning PYTHON and/or PYTHON_CONFIG appropriately.) + $(warning If you also have Python 2 installed, then) + $(warning try something like:) + $(warning $(and ,)) + $(warning $(and ,) make PYTHON=python2) + $(warning $(and ,)) + $(warning Otherwise, disable Python support entirely:) + $(warning $(and ,)) + $(warning $(and ,) make NO_LIBPYTHON=1) + $(warning $(and ,)) + $(error $(and ,)) + else + ALL_LDFLAGS += $(PYTHON_EMBED_LDFLAGS) + EXTLIBS += $(PYTHON_EMBED_LIBADD) + LIB_OBJS += $(OUTPUT)util/scripting-engines/trace-event-python.o + LIB_OBJS += $(OUTPUT)scripts/python/Perf-Trace-Util/Context.o + LANG_BINDINGS += $(OUTPUT)python/perf.so + endif + + endif + endif + endif endif ifdef NO_DEMANGLE @@ -831,8 +889,7 @@ clean: $(RM) *.spec *.pyc *.pyo */*.pyc */*.pyo $(OUTPUT)common-cmds.h TAGS tags cscope* $(MAKE) -C Documentation/ clean $(RM) $(OUTPUT)PERF-VERSION-FILE $(OUTPUT)PERF-CFLAGS - @python util/setup.py clean --build-lib='$(OUTPUT)python' \ - --build-temp='$(OUTPUT)python/temp' + $(python-clean) .PHONY: all install clean strip .PHONY: shell_compatibility_test please_set_SHELL_PATH_to_a_more_modern_shell diff --git a/tools/perf/config/utilities.mak b/tools/perf/config/utilities.mak new file mode 100644 index 0000000..6d8ff88 --- /dev/null +++ b/tools/perf/config/utilities.mak @@ -0,0 +1,180 @@ +# This allows us to work with the newline character: +define newline + + +endef +newline := $(newline) + +# nl-escape +# +# Usage: escape = $(call nl-escape[,escape]) +# +# This is used as the common way to specify +# what should replace a newline when escaping +# newlines; the default is a bizarre string. +# +nl-escape = $(or $(1),m822df3020w6a44id34bt574ctac44eb9f4n) + +# escape-nl +# +# Usage: escaped-text = $(call escape-nl,text[,escape]) +# +# GNU make's $(shell ...) function converts to a +# single space each newline character in the output +# produced during the expansion; this may not be +# desirable. +# +# The only solution is to change each newline into +# something that won't be converted, so that the +# information can be recovered later with +# $(call unescape-nl...) +# +escape-nl = $(subst $(newline),$(call nl-escape,$(2)),$(1)) + +# unescape-nl +# +# Usage: text = $(call unescape-nl,escaped-text[,escape]) +# +# See escape-nl. +# +unescape-nl = $(subst $(call nl-escape,$(2)),$(newline),$(1)) + +# shell-escape-nl +# +# Usage: $(shell some-command | $(call shell-escape-nl[,escape])) +# +# Use this to escape newlines from within a shell call; +# the default escape is a bizarre string. +# +# NOTE: The escape is used directly as a string constant +# in an `awk' program that is delimited by shell +# single-quotes, so be wary of the characters +# that are chosen. +# +define shell-escape-nl +awk 'NR==1 {t=$$0} NR>1 {t=t "$(nl-escape)" $$0} END {printf t}' +endef + +# shell-unescape-nl +# +# Usage: $(shell some-command | $(call shell-unescape-nl[,escape])) +# +# Use this to unescape newlines from within a shell call; +# the default escape is a bizarre string. +# +# NOTE: The escape is used directly as an extended regular +# expression constant in an `awk' program that is +# delimited by shell single-quotes, so be wary +# of the characters that are chosen. +# +# (The bash shell has a bug where `{gsub(...),...}' is +# misinterpreted as a brace expansion; this can be +# overcome by putting a space between `{' and `gsub'). +# +define shell-unescape-nl +awk 'NR==1 {t=$$0} NR>1 {t=t "\n" $$0} END { gsub(/$(nl-escape)/,"\n",t); printf t }' +endef + +# escape-for-shell-sq +# +# Usage: embeddable-text = $(call escape-for-shell-sq,text) +# +# This function produces text that is suitable for +# embedding in a shell string that is delimited by +# single-quotes. +# +escape-for-shell-sq = $(subst ','\'',$(1)) + +# shell-sq +# +# Usage: single-quoted-and-escaped-text = $(call shell-sq,text) +# +shell-sq = '$(escape-for-shell-sq)' + +# shell-wordify +# +# Usage: wordified-text = $(call shell-wordify,text) +# +# For instance: +# +# |define text +# |hello +# |world +# |endef +# | +# |target: +# | echo $(call shell-wordify,$(text)) +# +# At least GNU make gets confused by expanding a newline +# within the context of a command line of a makefile rule +# (this is in constrast to a `$(shell ...)' function call, +# which can handle it just fine). +# +# This function avoids the problem by producing a string +# that works as a shell word, regardless of whether or +# not it contains a newline. +# +# If the text to be wordified contains a newline, then +# an intrictate shell command substitution is constructed +# to render the text as a single line; when the shell +# processes the resulting escaped text, it transforms +# it into the original unescaped text. +# +# If the text does not contain a newline, then this function +# produces the same results as the `$(shell-sq)' function. +# +shell-wordify = $(if $(findstring $(newline),$(1)),$(_sw-esc-nl),$(shell-sq)) +define _sw-esc-nl +"$$(echo $(call escape-nl,$(shell-sq),$(2)) | $(call shell-unescape-nl,$(2)))" +endef + +# is-absolute +# +# Usage: bool-value = $(call is-absolute,path) +# +is-absolute = $(shell echo $(shell-sq) | grep ^/ -q && echo y) + +# lookup +# +# Usage: absolute-executable-path-or-empty = $(call lookup,path) +# +# (It's necessary to use `sh -c' because GNU make messes up by +# trying too hard and getting things wrong). +# +lookup = $(call unescape-nl,$(shell sh -c $(_l-sh))) +_l-sh = $(call shell-sq,command -v $(shell-sq) | $(call shell-escape-nl,)) + +# is-executable +# +# Usage: bool-value = $(call is-executable,path) +# +# (It's necessary to use `sh -c' because GNU make messes up by +# trying too hard and getting things wrong). +# +is-executable = $(call _is-executable-helper,$(shell-sq)) +_is-executable-helper = $(shell sh -c $(_is-executable-sh)) +_is-executable-sh = $(call shell-sq,test -f $(1) -a -x $(1) && echo y) + +# get-executable +# +# Usage: absolute-executable-path-or-empty = $(call get-executable,path) +# +# The goal is to get an absolute path for an executable; +# the `command -v' is defined by POSIX, but it's not +# necessarily very portable, so it's only used if +# relative path resolution is requested, as determined +# by the presence of a leading `/'. +# +get-executable = $(if $(1),$(if $(is-absolute),$(_ge-abspath),$(lookup))) +_ge-abspath = $(if $(is-executable),$(1)) + +# get-supplied-or-default-executable +# +# Usage: absolute-executable-path-or-empty = $(call get-executable-or-default,variable,default) +# +define get-executable-or-default +$(if $($(1)),$(call _ge_attempt,$($(1)),$(1)),$(call _ge_attempt,$(2))) +endef +_ge_attempt = $(or $(get-executable),$(_gea_warn),$(call _gea_err,$(2))) +_gea_warn = $(warning The path '$(1)' is not executable.) +_gea_err = $(if $(1),$(error Please set '$(1)' appropriately)) diff --git a/tools/perf/feature-tests.mak b/tools/perf/feature-tests.mak index b041ca6..1b33420 100644 --- a/tools/perf/feature-tests.mak +++ b/tools/perf/feature-tests.mak @@ -79,9 +79,15 @@ endef endif ifndef NO_LIBPYTHON +define SOURCE_PYTHON_VERSION +#include +#if PY_VERSION_HEX >= 0x03000000 + #error +#endif +int main(void){} +endef define SOURCE_PYTHON_EMBED #include - int main(void) { Py_Initialize(); -- cgit v1.1 From 7fbd065f5a2b299172502f09fc3fbde02b48f591 Mon Sep 17 00:00:00 2001 From: Michael Witten Date: Tue, 12 Apr 2011 20:27:59 +0000 Subject: perf tools: Move `try-cc' The `try-cc' user-defined function was in tools/perf/feature-tests.mak; this commit moves it to tools/perf/config/utilities.mak. Signed-off-by: Michael Witten Link: http://lkml.kernel.org/n/tip-bqhwcuxsrve0iodn6q4ejaoi@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/config/utilities.mak | 8 ++++++++ tools/perf/feature-tests.mak | 8 -------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/tools/perf/config/utilities.mak b/tools/perf/config/utilities.mak index 6d8ff88..8046182 100644 --- a/tools/perf/config/utilities.mak +++ b/tools/perf/config/utilities.mak @@ -178,3 +178,11 @@ endef _ge_attempt = $(or $(get-executable),$(_gea_warn),$(call _gea_err,$(2))) _gea_warn = $(warning The path '$(1)' is not executable.) _gea_err = $(if $(1),$(error Please set '$(1)' appropriately)) + +# try-cc +# Usage: option = $(call try-cc, source-to-build, cc-options) +try-cc = $(shell sh -c \ + 'TMP="$(OUTPUT)$(TMPOUT).$$$$"; \ + echo "$(1)" | \ + $(CC) -x c - $(2) -o "$$TMP" > /dev/null 2>&1 && echo y; \ + rm -f "$$TMP"') diff --git a/tools/perf/feature-tests.mak b/tools/perf/feature-tests.mak index 1b33420..6170fd2 100644 --- a/tools/perf/feature-tests.mak +++ b/tools/perf/feature-tests.mak @@ -126,11 +126,3 @@ int main(void) return 0; } endef - -# try-cc -# Usage: option = $(call try-cc, source-to-build, cc-options) -try-cc = $(shell sh -c \ - 'TMP="$(OUTPUT)$(TMPOUT).$$$$"; \ - echo "$(1)" | \ - $(CC) -x c - $(2) -o "$$TMP" > /dev/null 2>&1 && echo y; \ - rm -f "$$TMP"') -- cgit v1.1 From f18568aae5612ab37f20e5f383d6154ea69c9dfc Mon Sep 17 00:00:00 2001 From: Michael Witten Date: Tue, 12 Apr 2011 20:30:13 +0000 Subject: perf tools: git mv tools/perf/{features-tests.mak,config/} Signed-off-by: Michael Witten Link: http://lkml.kernel.org/n/tip-a6zhefjayuounko1tk5sjji2@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Makefile | 2 +- tools/perf/config/feature-tests.mak | 128 ++++++++++++++++++++++++++++++++++++ tools/perf/feature-tests.mak | 128 ------------------------------------ 3 files changed, 129 insertions(+), 129 deletions(-) create mode 100644 tools/perf/config/feature-tests.mak delete mode 100644 tools/perf/feature-tests.mak diff --git a/tools/perf/Makefile b/tools/perf/Makefile index b5276c7..91ad5cc 100644 --- a/tools/perf/Makefile +++ b/tools/perf/Makefile @@ -138,7 +138,7 @@ INSTALL = install # explicitly what architecture to check for. Fix this up for yours.. SPARSE_FLAGS = -D__BIG_ENDIAN__ -D__powerpc__ --include feature-tests.mak +-include config/feature-tests.mak ifeq ($(call try-cc,$(SOURCE_HELLO),-Werror -fstack-protector-all),y) CFLAGS := $(CFLAGS) -fstack-protector-all diff --git a/tools/perf/config/feature-tests.mak b/tools/perf/config/feature-tests.mak new file mode 100644 index 0000000..6170fd2 --- /dev/null +++ b/tools/perf/config/feature-tests.mak @@ -0,0 +1,128 @@ +define SOURCE_HELLO +#include +int main(void) +{ + return puts(\"hi\"); +} +endef + +ifndef NO_DWARF +define SOURCE_DWARF +#include +#include +#include +#ifndef _ELFUTILS_PREREQ +#error +#endif + +int main(void) +{ + Dwarf *dbg = dwarf_begin(0, DWARF_C_READ); + return (long)dbg; +} +endef +endif + +define SOURCE_LIBELF +#include + +int main(void) +{ + Elf *elf = elf_begin(0, ELF_C_READ, 0); + return (long)elf; +} +endef + +define SOURCE_GLIBC +#include + +int main(void) +{ + const char *version = gnu_get_libc_version(); + return (long)version; +} +endef + +define SOURCE_ELF_MMAP +#include +int main(void) +{ + Elf *elf = elf_begin(0, ELF_C_READ_MMAP, 0); + return (long)elf; +} +endef + +ifndef NO_NEWT +define SOURCE_NEWT +#include + +int main(void) +{ + newtInit(); + newtCls(); + return newtFinished(); +} +endef +endif + +ifndef NO_LIBPERL +define SOURCE_PERL_EMBED +#include +#include + +int main(void) +{ +perl_alloc(); +return 0; +} +endef +endif + +ifndef NO_LIBPYTHON +define SOURCE_PYTHON_VERSION +#include +#if PY_VERSION_HEX >= 0x03000000 + #error +#endif +int main(void){} +endef +define SOURCE_PYTHON_EMBED +#include +int main(void) +{ + Py_Initialize(); + return 0; +} +endef +endif + +define SOURCE_BFD +#include + +int main(void) +{ + bfd_demangle(0, 0, 0); + return 0; +} +endef + +define SOURCE_CPLUS_DEMANGLE +extern char *cplus_demangle(const char *, int); + +int main(void) +{ + cplus_demangle(0, 0); + return 0; +} +endef + +define SOURCE_STRLCPY +#include +extern size_t strlcpy(char *dest, const char *src, size_t size); + +int main(void) +{ + strlcpy(NULL, NULL, 0); + return 0; +} +endef diff --git a/tools/perf/feature-tests.mak b/tools/perf/feature-tests.mak deleted file mode 100644 index 6170fd2..0000000 --- a/tools/perf/feature-tests.mak +++ /dev/null @@ -1,128 +0,0 @@ -define SOURCE_HELLO -#include -int main(void) -{ - return puts(\"hi\"); -} -endef - -ifndef NO_DWARF -define SOURCE_DWARF -#include -#include -#include -#ifndef _ELFUTILS_PREREQ -#error -#endif - -int main(void) -{ - Dwarf *dbg = dwarf_begin(0, DWARF_C_READ); - return (long)dbg; -} -endef -endif - -define SOURCE_LIBELF -#include - -int main(void) -{ - Elf *elf = elf_begin(0, ELF_C_READ, 0); - return (long)elf; -} -endef - -define SOURCE_GLIBC -#include - -int main(void) -{ - const char *version = gnu_get_libc_version(); - return (long)version; -} -endef - -define SOURCE_ELF_MMAP -#include -int main(void) -{ - Elf *elf = elf_begin(0, ELF_C_READ_MMAP, 0); - return (long)elf; -} -endef - -ifndef NO_NEWT -define SOURCE_NEWT -#include - -int main(void) -{ - newtInit(); - newtCls(); - return newtFinished(); -} -endef -endif - -ifndef NO_LIBPERL -define SOURCE_PERL_EMBED -#include -#include - -int main(void) -{ -perl_alloc(); -return 0; -} -endef -endif - -ifndef NO_LIBPYTHON -define SOURCE_PYTHON_VERSION -#include -#if PY_VERSION_HEX >= 0x03000000 - #error -#endif -int main(void){} -endef -define SOURCE_PYTHON_EMBED -#include -int main(void) -{ - Py_Initialize(); - return 0; -} -endef -endif - -define SOURCE_BFD -#include - -int main(void) -{ - bfd_demangle(0, 0, 0); - return 0; -} -endef - -define SOURCE_CPLUS_DEMANGLE -extern char *cplus_demangle(const char *, int); - -int main(void) -{ - cplus_demangle(0, 0); - return 0; -} -endef - -define SOURCE_STRLCPY -#include -extern size_t strlcpy(char *dest, const char *src, size_t size); - -int main(void) -{ - strlcpy(NULL, NULL, 0); - return 0; -} -endef -- cgit v1.1 From 0817a6a3a4fc7c069111083351ca33a78da2a0c9 Mon Sep 17 00:00:00 2001 From: Arun Sharma Date: Thu, 14 Apr 2011 10:38:18 -0700 Subject: perf script: Add support for PERF_TYPE_RAW Useful for getting stack traces for hardware events not handled by PERF_TYPE_HARDWARE. Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Mike Galbraith Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Tom Zanussi Signed-off-by: Arun Sharma Link: http://lkml.kernel.org/n/tip-qimdcdpekjqxuyqovy4kjusx@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-script.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index 6cf811a..b3012c4 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -82,6 +82,16 @@ static struct { PERF_OUTPUT_CPU | PERF_OUTPUT_TIME | PERF_OUTPUT_EVNAME | PERF_OUTPUT_TRACE, }, + + [PERF_TYPE_RAW] = { + .user_set = false, + + .fields = PERF_OUTPUT_COMM | PERF_OUTPUT_TID | + PERF_OUTPUT_CPU | PERF_OUTPUT_TIME | + PERF_OUTPUT_EVNAME | PERF_OUTPUT_SYM, + + .invalid_fields = PERF_OUTPUT_TRACE, + }, }; static bool output_set_by_user(void) @@ -502,6 +512,8 @@ static int parse_output_fields(const struct option *opt __used, type = PERF_TYPE_SOFTWARE; else if (!strcmp(str, "trace")) type = PERF_TYPE_TRACEPOINT; + else if (!strcmp(str, "raw")) + type = PERF_TYPE_RAW; else { fprintf(stderr, "Invalid event type in field string.\n"); return -EINVAL; @@ -902,7 +914,7 @@ static const struct option options[] = { OPT_STRING(0, "symfs", &symbol_conf.symfs, "directory", "Look for files with symbols relative to this directory"), OPT_CALLBACK('f', "fields", NULL, "str", - "comma separated output fields prepend with 'type:'. Valid types: hw,sw,trace. Fields: comm,tid,pid,time,cpu,event,trace,sym", + "comma separated output fields prepend with 'type:'. Valid types: hw,sw,trace,raw. Fields: comm,tid,pid,time,cpu,event,trace,sym", parse_output_fields), OPT_END() -- cgit v1.1 From 2b398bd9f8f73be706b41adcbb240ce95793049a Mon Sep 17 00:00:00 2001 From: Youquan Song Date: Thu, 14 Apr 2011 14:36:08 +0800 Subject: x86, apic: Print verbose error interrupt reason on apic=debug End users worry about the error interrupt printout we generate currently: pr_debug("APIC error on CPU%d: %02x(%02x)\n", smp_processor_id(), v , v1); ... and would like to know the reason why error interrupts are generated. This patch prints out more detailed debug information. Another practical problem is that dynamic debug is not initialized yet when the APIC initializes, so the pr_debug() will not output the error interrupt debug information on bootup. In this patch, we use apic_printk(APIC_DEBUG, ...), so the apic=debug boot option will print verbose error interupts during bootup. Signed-off-by: Youquan Song Cc: Joe Perches Cc: hpa@linux.intel.com Cc: suresh.b.siddha@intel.com Cc: yong.y.wang@linux.intel.com Cc: jbaron@redhat.com Cc: trenn@suse.de Cc: kent.liu@intel.com Cc: chaohong.guo@intel.com Link: http://lkml.kernel.org/r/1302762968-24380-2-git-send-email-youquan.song@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 41 ++++++++++++++++++++++++++--------------- 1 file changed, 26 insertions(+), 15 deletions(-) diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index fabf01e..ae14712 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1812,30 +1812,41 @@ void smp_spurious_interrupt(struct pt_regs *regs) */ void smp_error_interrupt(struct pt_regs *regs) { - u32 v, v1; + u32 v0, v1; + u32 i = 0; + static const char * const error_interrupt_reason[] = { + "Send CS error", /* APIC Error Bit 0 */ + "Receive CS error", /* APIC Error Bit 1 */ + "Send accept error", /* APIC Error Bit 2 */ + "Receive accept error", /* APIC Error Bit 3 */ + "Redirectable IPI", /* APIC Error Bit 4 */ + "Send illegal vector", /* APIC Error Bit 5 */ + "Received illegal vector", /* APIC Error Bit 6 */ + "Illegal register address", /* APIC Error Bit 7 */ + }; exit_idle(); irq_enter(); /* First tickle the hardware, only then report what went on. -- REW */ - v = apic_read(APIC_ESR); + v0 = apic_read(APIC_ESR); apic_write(APIC_ESR, 0); v1 = apic_read(APIC_ESR); ack_APIC_irq(); atomic_inc(&irq_err_count); - /* - * Here is what the APIC error bits mean: - * 0: Send CS error - * 1: Receive CS error - * 2: Send accept error - * 3: Receive accept error - * 4: Reserved - * 5: Send illegal vector - * 6: Received illegal vector - * 7: Illegal register address - */ - pr_debug("APIC error on CPU%d: %02x(%02x)\n", - smp_processor_id(), v , v1); + apic_printk(APIC_DEBUG, KERN_DEBUG "APIC error on CPU%d: %02x(%02x)", + smp_processor_id(), v0 , v1); + + v1 = v1 & 0xff; + while (v1) { + if (v1 & 0x1) + apic_printk(APIC_DEBUG, KERN_CONT " : %s", error_interrupt_reason[i]); + i++; + v1 >>= 1; + }; + + apic_printk(APIC_DEBUG, KERN_CONT "\n"); + irq_exit(); } -- cgit v1.1 From 7b70bd3441437b7bc04fc9d321e17c8ed0e8f958 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 18 Apr 2011 16:00:21 +0200 Subject: x86, MCE: Do not taint when handling correctable errors Correctable errors are considered something rather normal on modern hardware these days. Even more importantly, correctable errors mean exactly that - they've been corrected by the hardware - and there's no need to taint the kernel since execution hasn't been compromised so far. Also, drop tainting in the thermal throttling code for a similar reason: crossing a thermal threshold does not mean corruption. Signed-off-by: Borislav Petkov Acked-by: Tony Luck Acked-by: Nagananda Chumbalkar Cc: Prarit Bhargava Cc: Russ Anderson Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/1303135222-17118-1-git-send-email-bp@amd64.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce.c | 1 - arch/x86/kernel/cpu/mcheck/therm_throt.c | 3 --- 2 files changed, 4 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 3385ea2..68e2303 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -590,7 +590,6 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) { mce_log(&m); atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, &m); - add_taint(TAINT_MACHINE_CHECK); } /* diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 6f8c5e9..5846a79 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -187,8 +187,6 @@ static int therm_throt_process(bool new_event, int event, int level) this_cpu, level == CORE_LEVEL ? "Core" : "Package", state->count); - - add_taint(TAINT_MACHINE_CHECK); return 1; } if (old_event) { @@ -393,7 +391,6 @@ static void unexpected_thermal_interrupt(void) { printk(KERN_ERR "CPU%d: Unexpected LVT thermal interrupt!\n", smp_processor_id()); - add_taint(TAINT_MACHINE_CHECK); } static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt; -- cgit v1.1 From 2232d31bf18ba02f5cd632bbfc3466aeca394c75 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Fri, 15 Apr 2011 00:41:43 +0200 Subject: ath9k: fix the return value of ath_stoprecv The patch 'ath9k_hw: fix stopping rx DMA during resets' added code to detect a condition where rx DMA was stopped, but the MAC failed to enter the idle state. This condition requires a hardware reset, however the return value of ath_stoprecv was 'true' in that case, which allowed it to skip the reset when issuing a fast channel change. Signed-off-by: Felix Fietkau Reported-by: Paul Stewart Cc: stable@kernel.org Signed-off-by: John W. Linville --- drivers/net/wireless/ath/ath9k/recv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/ath/ath9k/recv.c b/drivers/net/wireless/ath/ath9k/recv.c index dcd19bc..b29c80d 100644 --- a/drivers/net/wireless/ath/ath9k/recv.c +++ b/drivers/net/wireless/ath/ath9k/recv.c @@ -506,7 +506,7 @@ bool ath_stoprecv(struct ath_softc *sc) "confusing the DMA engine when we start RX up\n"); ATH_DBG_WARN_ON_ONCE(!stopped); } - return stopped || reset; + return stopped && !reset; } void ath_flushrecv(struct ath_softc *sc) -- cgit v1.1 From bcdd323b893ad3c9b7ef26b5e4a0bef974238501 Mon Sep 17 00:00:00 2001 From: Felipe Balbi Date: Wed, 16 Mar 2011 15:59:35 +0200 Subject: device: add dev_WARN_ONCE it's quite useful to print the device name on the stack dump caused by WARN(), but there are other cases where we might want to use WARN_ONCE. Introduce a helper similar to dev_WARN() for that case too. Signed-off-by: Felipe Balbi Signed-off-by: Greg Kroah-Hartman --- include/linux/device.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/include/linux/device.h b/include/linux/device.h index ab8dfc0..d484051 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -742,13 +742,17 @@ do { \ #endif /* - * dev_WARN() acts like dev_printk(), but with the key difference + * dev_WARN*() acts like dev_printk(), but with the key difference * of using a WARN/WARN_ON to get the message out, including the * file/line information and a backtrace. */ #define dev_WARN(dev, format, arg...) \ WARN(1, "Device: %s\n" format, dev_driver_string(dev), ## arg); +#define dev_WARN_ONCE(dev, condition, format, arg...) \ + WARN_ONCE(condition, "Device %s\n" format, \ + dev_driver_string(dev), ## arg) + /* Create alias, so I can be autoloaded. */ #define MODULE_ALIAS_CHARDEV(major,minor) \ MODULE_ALIAS("char-major-" __stringify(major) "-" __stringify(minor)) -- cgit v1.1 From 051d51bc6a867d9466a975e4d7ca51b21a9c2c4e Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 18 Mar 2011 10:12:14 +0300 Subject: efivars: memory leak on error in create_efivars_bin_attributes() This is a cut and paste bug. We intended to free ->del_var and ->new_var but we only free ->new_var. Signed-off-by: Dan Carpenter Acked-by: Mike Waychison Signed-off-by: Greg Kroah-Hartman --- drivers/firmware/efivars.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/firmware/efivars.c b/drivers/firmware/efivars.c index ff0c373..ff2fe40 100644 --- a/drivers/firmware/efivars.c +++ b/drivers/firmware/efivars.c @@ -677,8 +677,8 @@ create_efivars_bin_attributes(struct efivars *efivars) return 0; out_free: - kfree(efivars->new_var); - efivars->new_var = NULL; + kfree(efivars->del_var); + efivars->del_var = NULL; kfree(efivars->new_var); efivars->new_var = NULL; return error; -- cgit v1.1 From 3116aabc81ccfeeb73f183ed8b1e3031520d1e59 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 18 Mar 2011 10:12:38 +0300 Subject: efivars: handle errors from register_efivars() We should unwind and return an error if register_efivars() fails. Signed-off-by: Dan Carpenter Acked-by: Mike Waychison Signed-off-by: Greg Kroah-Hartman --- drivers/firmware/efivars.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/firmware/efivars.c b/drivers/firmware/efivars.c index ff2fe40..5d1ec68 100644 --- a/drivers/firmware/efivars.c +++ b/drivers/firmware/efivars.c @@ -803,6 +803,8 @@ efivars_init(void) ops.set_variable = efi.set_variable; ops.get_next_variable = efi.get_next_variable; error = register_efivars(&__efivars, &ops, efi_kobj); + if (error) + goto err_put; /* Don't forget the systab entry */ error = sysfs_create_group(efi_kobj, &efi_subsys_attr_group); @@ -810,10 +812,15 @@ efivars_init(void) printk(KERN_ERR "efivars: Sysfs attribute export failed with error %d.\n", error); - unregister_efivars(&__efivars); - kobject_put(efi_kobj); + goto err_unregister; } + return 0; + +err_unregister: + unregister_efivars(&__efivars); +err_put: + kobject_put(efi_kobj); return error; } -- cgit v1.1 From 695cca8763078c743417886e80af8ccb78bec9f2 Mon Sep 17 00:00:00 2001 From: Mike Waychison Date: Fri, 18 Mar 2011 16:50:03 -0700 Subject: firmware: Fix grammar in sysfs-firmware-dmi doc Fix the grammar in describing the position attribute of DMI entries in the dmi-sysfs module. While here, make a couple other small clarifying fixups to the docs. Reported-by: Signed-off-by: Mike Waychison Signed-off-by: Greg Kroah-Hartman --- Documentation/ABI/testing/sysfs-firmware-dmi | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-firmware-dmi b/Documentation/ABI/testing/sysfs-firmware-dmi index ba9da95..c78f9ab 100644 --- a/Documentation/ABI/testing/sysfs-firmware-dmi +++ b/Documentation/ABI/testing/sysfs-firmware-dmi @@ -14,14 +14,15 @@ Description: DMI is structured as a large table of entries, where each entry has a common header indicating the type and - length of the entry, as well as 'handle' that is - supposed to be unique amongst all entries. + length of the entry, as well as a firmware-provided + 'handle' that is supposed to be unique amongst all + entries. Some entries are required by the specification, but many others are optional. In general though, users should never expect to find a specific entry type on their system unless they know for certain what their firmware - is doing. Machine to machine will vary. + is doing. Machine to machine experiences will vary. Multiple entries of the same type are allowed. In order to handle these duplicate entry types, each entry is @@ -67,25 +68,24 @@ Description: and the two terminating nul characters. type : The type of the entry. This value is the same as found in the directory name. It indicates - how the rest of the entry should be - interpreted. + how the rest of the entry should be interpreted. instance: The instance ordinal of the entry for the given type. This value is the same as found in the parent directory name. - position: The position of the entry within the entirety - of the entirety. + position: The ordinal position (zero-based) of the entry + within the entirety of the DMI entry table. === Entry Specialization === Some entry types may have other information available in - sysfs. + sysfs. Not all types are specialized. --- Type 15 - System Event Log --- This entry allows the firmware to export a log of events the system has taken. This information is typically backed by nvram, but the implementation - details are abstracted by this table. This entries data + details are abstracted by this table. This entry's data is exported in the directory: /sys/firmware/dmi/entries/15-0/system_event_log -- cgit v1.1 From aed65af1cc2f6fc9ded5a8158f1405a02cf6d2ff Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Mon, 28 Mar 2011 09:12:52 -0700 Subject: drivers: make device_type const The device_type structure does not contain data that changes during usage and should be const. This allows devices to declare the struct const. I have patches to change all the subsystems, but need the infra structure change first. Signed-off-by: Stephen Hemminger Signed-off-by: Greg Kroah-Hartman --- drivers/base/core.c | 4 ++-- include/linux/device.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/base/core.c b/drivers/base/core.c index 81b78ed..fb8130c 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -400,7 +400,7 @@ static void device_remove_groups(struct device *dev, static int device_add_attrs(struct device *dev) { struct class *class = dev->class; - struct device_type *type = dev->type; + const struct device_type *type = dev->type; int error; if (class) { @@ -440,7 +440,7 @@ static int device_add_attrs(struct device *dev) static void device_remove_attrs(struct device *dev) { struct class *class = dev->class; - struct device_type *type = dev->type; + const struct device_type *type = dev->type; device_remove_groups(dev, dev->groups); diff --git a/include/linux/device.h b/include/linux/device.h index d484051..350ceda 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -408,7 +408,7 @@ struct device { struct kobject kobj; const char *init_name; /* initial name of the device */ - struct device_type *type; + const struct device_type *type; struct mutex mutex; /* mutex to synchronize calls to * its driver. -- cgit v1.1 From 088ab0b4d855d68a0f0c16b72fb8e492a533aaa1 Mon Sep 17 00:00:00 2001 From: Ludwig Nussel Date: Mon, 28 Feb 2011 15:57:17 +0100 Subject: kernel/ksysfs.c: expose file_caps_enabled in sysfs A kernel booted with no_file_caps allows to install fscaps on a binary but doesn't actually honor the fscaps when running the binary. Userspace currently has no sane way to determine whether installing fscaps actually has any effect. Since parsing /proc/cmdline is fragile this patch exposes the current setting (1 or 0) via /sys/kernel/fscaps Signed-off-by: Ludwig Nussel Signed-off-by: Greg Kroah-Hartman --- Documentation/ABI/testing/sysfs-kernel-fscaps | 8 ++++++++ kernel/ksysfs.c | 10 ++++++++++ 2 files changed, 18 insertions(+) create mode 100644 Documentation/ABI/testing/sysfs-kernel-fscaps diff --git a/Documentation/ABI/testing/sysfs-kernel-fscaps b/Documentation/ABI/testing/sysfs-kernel-fscaps new file mode 100644 index 0000000..50a3033 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-kernel-fscaps @@ -0,0 +1,8 @@ +What: /sys/kernel/fscaps +Date: February 2011 +KernelVersion: 2.6.38 +Contact: Ludwig Nussel +Description + Shows whether file system capabilities are honored + when executing a binary + diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 0b624e7..3b053c0 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -16,6 +16,7 @@ #include #include #include +#include #define KERNEL_ATTR_RO(_name) \ static struct kobj_attribute _name##_attr = __ATTR_RO(_name) @@ -131,6 +132,14 @@ KERNEL_ATTR_RO(vmcoreinfo); #endif /* CONFIG_KEXEC */ +/* whether file capabilities are enabled */ +static ssize_t fscaps_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%d\n", file_caps_enabled); +} +KERNEL_ATTR_RO(fscaps); + /* * Make /sys/kernel/notes give the raw contents of our kernel .notes section. */ @@ -158,6 +167,7 @@ struct kobject *kernel_kobj; EXPORT_SYMBOL_GPL(kernel_kobj); static struct attribute * kernel_attrs[] = { + &fscaps_attr.attr, #if defined(CONFIG_HOTPLUG) &uevent_seqnum_attr.attr, &uevent_helper_attr.attr, -- cgit v1.1 From 5934b5f3b0116858a5f950abcac344ddee054b69 Mon Sep 17 00:00:00 2001 From: Tsugikazu Shibata Date: Mon, 4 Apr 2011 11:47:36 +0900 Subject: HOWTO: sync up Documentaion/ja_JP/HOWTO Signed-off-by: Tsugikazu Shibata Signed-off-by: Greg Kroah-Hartman --- Documentation/ja_JP/HOWTO | 129 ++++++++++++++++------------------------------ 1 file changed, 43 insertions(+), 86 deletions(-) diff --git a/Documentation/ja_JP/HOWTO b/Documentation/ja_JP/HOWTO index b63301a..050d37f 100644 --- a/Documentation/ja_JP/HOWTO +++ b/Documentation/ja_JP/HOWTO @@ -11,14 +11,14 @@ for non English (read: Japanese) speakers and is not intended as a fork. So if you have any comments or updates for this file, please try to update the original English file first. -Last Updated: 2008/10/24 +Last Updated: 2011/03/31 ================================== ã“ã‚Œã¯ã€ -linux-2.6.28/Documentation/HOWTO +linux-2.6.38/Documentation/HOWTO ã®å’Œè¨³ã§ã™ã€‚ 翻訳団体: JF プロジェクト < http://www.linux.or.jp/JF/ > -翻訳日: 2008/10/24 +翻訳日: 2011/3/28 翻訳者: Tsugikazu Shibata 校正者: æ¾å€‰ã•ã‚“ å°æž— é›…å…¸ã•ã‚“ (Masanori Kobayasi) @@ -256,8 +256,8 @@ Linux カーãƒãƒ«ã®é–‹ç™ºãƒ—ロセスã¯ç¾åœ¨å¹¾ã¤ã‹ã®ç•°ãªã‚‹ãƒ¡ã‚¤ãƒ³ - メイン㮠2.6.x カーãƒãƒ«ãƒ„リー - 2.6.x.y -stable カーãƒãƒ«ãƒ„リー - 2.6.x -git カーãƒãƒ«ãƒ‘ッム- - 2.6.x -mm カーãƒãƒ«ãƒ‘ッム- サブシステム毎ã®ã‚«ãƒ¼ãƒãƒ«ãƒ„リーã¨ãƒ‘ッム+ - çµ±åˆãƒ†ã‚¹ãƒˆã®ãŸã‚ã® 2.6.x -next カーãƒãƒ«ãƒ„リー 2.6.x カーãƒãƒ«ãƒ„リー ----------------- @@ -268,9 +268,9 @@ Linux カーãƒãƒ«ã®é–‹ç™ºãƒ—ロセスã¯ç¾åœ¨å¹¾ã¤ã‹ã®ç•°ãªã‚‹ãƒ¡ã‚¤ãƒ³ - æ–°ã—ã„カーãƒãƒ«ãŒãƒªãƒªãƒ¼ã‚¹ã•ã‚ŒãŸç›´å¾Œã«ã€2週間ã®ç‰¹åˆ¥æœŸé–“ãŒè¨­ã‘られ〠ã“ã®æœŸé–“中ã«ã€ãƒ¡ãƒ³ãƒ†ãƒŠé”㯠Linus ã«å¤§ããªå·®åˆ†ã‚’é€ã‚‹ã“ã¨ãŒã§ãã¾ã™ã€‚ - ã“ã®ã‚ˆã†ãªå·®åˆ†ã¯é€šå¸¸ -mm カーãƒãƒ«ã«æ•°é€±é–“å«ã¾ã‚Œã¦ããŸãƒ‘ッãƒã§ã™ã€‚ + ã“ã®ã‚ˆã†ãªå·®åˆ†ã¯é€šå¸¸ -next カーãƒãƒ«ã«æ•°é€±é–“å«ã¾ã‚Œã¦ããŸãƒ‘ッãƒã§ã™ã€‚ 大ããªå¤‰æ›´ã¯ git(カーãƒãƒ«ã®ã‚½ãƒ¼ã‚¹ç®¡ç†ãƒ„ールã€è©³ç´°ã¯ - http://git.or.cz/ å‚ç…§) を使ã£ã¦é€ã‚‹ã®ãŒå¥½ã¾ã—ã„ã‚„ã‚Šæ–¹ã§ã™ãŒã€ãƒ‘ッ + http://git-scm.com/ å‚ç…§) を使ã£ã¦é€ã‚‹ã®ãŒå¥½ã¾ã—ã„ã‚„ã‚Šæ–¹ã§ã™ãŒã€ãƒ‘ッ ãƒãƒ•ã‚¡ã‚¤ãƒ«ã®å½¢å¼ã®ã¾ã¾é€ã‚‹ã®ã§ã‚‚å分ã§ã™ã€‚ - 2週間後ã€-rc1 カーãƒãƒ«ãŒãƒªãƒªãƒ¼ã‚¹ã•ã‚Œã€ã“ã®å¾Œã«ã¯ã‚«ãƒ¼ãƒãƒ«å…¨ä½“ã®å®‰å®š @@ -333,86 +333,44 @@ git リãƒã‚¸ãƒˆãƒªã§ç®¡ç†ã•ã‚Œã¦ã„ã‚‹Linus ã®ã‚«ãƒ¼ãƒãƒ«ãƒ„リーã®æ¯Ž れ㯠-rc カーãƒãƒ«ã¨æ¯”ã¹ã¦ã€ãƒ‘ッãƒãŒå¤§ä¸ˆå¤«ã‹ã©ã†ã‹ã‚‚確èªã—ãªã„ã§è‡ªå‹•çš„ ã«ç”Ÿæˆã•ã‚Œã‚‹ã®ã§ã€ã‚ˆã‚Šå®Ÿé¨“çš„ã§ã™ã€‚ -2.6.x -mm カーãƒãƒ«ãƒ‘ッム------------------------- - -Andrew Morton ã«ã‚ˆã£ã¦ãƒªãƒªãƒ¼ã‚¹ã•ã‚Œã‚‹å®Ÿé¨“çš„ãªã‚«ãƒ¼ãƒãƒ«ãƒ‘ッãƒç¾¤ã§ã™ã€‚ -Andrew ã¯å€‹åˆ¥ã®ã‚µãƒ–システムカーãƒãƒ«ãƒ„リーã¨ãƒ‘ッãƒã‚’å…¨ã¦é›†ã‚ã¦ã㦠-linux-kernel メーリングリストã§åŽé›†ã•ã‚ŒãŸå¤šæ•°ã®ãƒ‘ッãƒã¨åŒæ™‚ã«ä¸€ã¤ã«ã¾ -ã¨ã‚ã¾ã™ã€‚ -ã“ã®ãƒ„リーã¯æ–°æ©Ÿèƒ½ã¨ãƒ‘ッãƒãŒæ¤œè¨¼ã•ã‚Œã‚‹å ´ã¨ãªã‚Šã¾ã™ã€‚ã‚る期間ã®é–“パッム-㌠-mm ã«å…¥ã£ã¦ä¾¡å€¤ã‚’証明ã•ã‚ŒãŸã‚‰ã€Andrew やサブシステムメンテナãŒã€ -メインラインã¸å…¥ã‚Œã‚‹ã‚ˆã†ã« Linus ã«ãƒ—ッシュã—ã¾ã™ã€‚ - -メインカーãƒãƒ«ãƒ„リーã«å«ã‚ã‚‹ãŸã‚ã« Linus ã«é€ã‚‹å‰ã«ã€ã™ã¹ã¦ã®æ–°ã—ã„パッ -ãƒãŒ -mm ツリーã§ãƒ†ã‚¹ãƒˆã•ã‚Œã‚‹ã“ã¨ãŒå¼·ã推奨ã•ã‚Œã¦ã„ã¾ã™ã€‚マージウィン -ドウãŒé–‹ãå‰ã« -mm ツリーã«ç¾ã‚Œãªã‹ã£ãŸãƒ‘ッãƒã¯ãƒ¡ã‚¤ãƒ³ãƒ©ã‚¤ãƒ³ã«ãƒžãƒ¼ã‚¸ã• -れるã“ã¨ã¯å›°é›£ã«ãªã‚Šã¾ã™ã€‚ - -ã“れらã®ã‚«ãƒ¼ãƒãƒ«ã¯å®‰å®šã—ã¦å‹•ä½œã™ã¹ãシステムã¨ã—ã¦ä½¿ã†ã®ã«ã¯é©åˆ‡ã§ã¯ã‚ -ã‚Šã¾ã›ã‚“ã—ã€ã‚«ãƒ¼ãƒãƒ«ãƒ–ランãƒã®ä¸­ã§ã‚‚ã‚‚ã£ã¨ã‚‚動作ã«ãƒªã‚¹ã‚¯ãŒé«˜ã„ã‚‚ã®ã§ã™ã€‚ - -ã‚‚ã—ã‚ãªãŸãŒã€ã‚«ãƒ¼ãƒãƒ«é–‹ç™ºãƒ—ロセスã®æ”¯æ´ã‚’ã—ãŸã„ã¨æ€ã£ã¦ã„ã‚‹ã®ã§ã‚ã‚Œã°ã€ -ã©ã†ãžã“れらã®ã‚«ãƒ¼ãƒãƒ«ãƒªãƒªãƒ¼ã‚¹ã‚’テストã«ä½¿ã£ã¦ã¿ã¦ã€ãã—ã¦ã‚‚ã—å•é¡ŒãŒã‚ -ã‚Œã°ã€ã¾ãŸã‚‚ã—å…¨ã¦ãŒæ­£ã—ã動作ã—ãŸã¨ã—ã¦ã‚‚ã€linux-kernel メーリングリ -ストã«ãƒ•ã‚£ãƒ¼ãƒ‰ãƒãƒƒã‚¯ã‚’æä¾›ã—ã¦ãã ã•ã„。 - -ã™ã¹ã¦ã®ä»–ã®å®Ÿé¨“的パッãƒã«åŠ ãˆã¦ã€ã“れらã®ã‚«ãƒ¼ãƒãƒ«ã¯é€šå¸¸ãƒªãƒªãƒ¼ã‚¹æ™‚点㧠-メインライン㮠-git カーãƒãƒ«ã«å«ã¾ã‚Œã‚‹å…¨ã¦ã®å¤‰æ›´ã‚‚å«ã‚“ã§ã„ã¾ã™ã€‚ - --mm カーãƒãƒ«ã¯æ±ºã¾ã£ãŸã‚¹ã‚±ã‚¸ãƒ¥ãƒ¼ãƒ«ã§ã¯ãƒªãƒªãƒ¼ã‚¹ã•ã‚Œã¾ã›ã‚“ã€ã—ã‹ã—通常幾 -ã¤ã‹ã® -mm カーãƒãƒ« (1 ã‹ã‚‰ 3 ãŒæ™®é€šï¼‰ãŒå„-rc カーãƒãƒ«ã®é–“ã«ãƒªãƒªãƒ¼ã‚¹ã• -ã‚Œã¾ã™ã€‚ - サブシステム毎ã®ã‚«ãƒ¼ãƒãƒ«ãƒ„リーã¨ãƒ‘ッム------------------------------------------- -カーãƒãƒ«ã®æ§˜ã€…ãªé ˜åŸŸã§ä½•ãŒèµ·ãã¦ã„ã‚‹ã‹ã‚’見られるよã†ã«ã™ã‚‹ãŸã‚ã€å¤šãã® -カーãƒãƒ«ã‚µãƒ–システム開発者ã¯å½¼ã‚‰ã®é–‹ç™ºãƒ„リーを公開ã—ã¦ã„ã¾ã™ã€‚ã“れら㮠-ツリーã¯èª¬æ˜Žã—ãŸã‚ˆã†ã« -mm カーãƒãƒ«ãƒªãƒªãƒ¼ã‚¹ã«å…¥ã‚Œè¾¼ã¾ã‚Œã¾ã™ã€‚ - -以下ã¯ã•ã¾ã–ã¾ãªã‚«ãƒ¼ãƒãƒ«ãƒ„リーã®ä¸­ã®ã„ãã¤ã‹ã®ãƒªã‚¹ãƒˆ- - - git ツリー- - - Kbuild ã®é–‹ç™ºãƒ„リーã€Sam Ravnborg - git.kernel.org:/pub/scm/linux/kernel/git/sam/kbuild.git - - - ACPI ã®é–‹ç™ºãƒ„リー〠Len Brown - git.kernel.org:/pub/scm/linux/kernel/git/lenb/linux-acpi-2.6.git - - - Block ã®é–‹ç™ºãƒ„リーã€Jens Axboe - git.kernel.org:/pub/scm/linux/kernel/git/axboe/linux-2.6-block.git - - - DRM ã®é–‹ç™ºãƒ„リーã€Dave Airlie - git.kernel.org:/pub/scm/linux/kernel/git/airlied/drm-2.6.git - - - ia64 ã®é–‹ç™ºãƒ„リーã€Tony Luck - git.kernel.org:/pub/scm/linux/kernel/git/aegl/linux-2.6.git - - - infiniband, Roland Dreier - git.kernel.org:/pub/scm/linux/kernel/git/roland/infiniband.git - - - libata, Jeff Garzik - git.kernel.org:/pub/scm/linux/kernel/git/jgarzik/libata-dev.git - - - ãƒãƒƒãƒˆãƒ¯ãƒ¼ã‚¯ãƒ‰ãƒ©ã‚¤ãƒ, Jeff Garzik - git.kernel.org:/pub/scm/linux/kernel/git/jgarzik/netdev-2.6.git - - - pcmcia, Dominik Brodowski - git.kernel.org:/pub/scm/linux/kernel/git/brodo/pcmcia-2.6.git - - - SCSI, James Bottomley - git.kernel.org:/pub/scm/linux/kernel/git/jejb/scsi-misc-2.6.git - - - x86, Ingo Molnar - git://git.kernel.org/pub/scm/linux/kernel/git/x86/linux-2.6-x86.git - - quilt ツリー- - - USB, ドライãƒã‚³ã‚¢ã¨ I2C, Greg Kroah-Hartman - kernel.org/pub/linux/kernel/people/gregkh/gregkh-2.6/ +ãã‚Œãžã‚Œã®ã‚«ãƒ¼ãƒãƒ«ã‚µãƒ–システムã®ãƒ¡ãƒ³ãƒ†ãƒŠé”㯠--- ãã—ã¦å¤šãã®ã‚«ãƒ¼ãƒãƒ« +サブシステムã®é–‹ç™ºè€…é”ã‚‚ --- å„自ã®æœ€æ–°ã®é–‹ç™ºçŠ¶æ³ã‚’ソースリãƒã‚¸ãƒˆãƒªã« +公開ã—ã¦ã„ã¾ã™ã€‚ãã®ãŸã‚ã€è‡ªåˆ†ã¨ã¯ç•°ãªã‚‹é ˜åŸŸã®ã‚«ãƒ¼ãƒãƒ«ã§ä½•ãŒèµ·ãã¦ã„ã‚‹ +ã‹ã‚’ä»–ã®äººãŒè¦‹ã‚‰ã‚Œã‚‹ã‚ˆã†ã«ãªã£ã¦ã„ã¾ã™ã€‚開発ãŒæ—©ã進んã§ã„る領域ã§ã¯ã€ +開発者ã¯è‡ªèº«ã®æŠ•ç¨¿ãŒã©ã®ã‚µãƒ–システムカーãƒãƒ«ãƒ„リーを元ã«ã—ã¦ã„ã‚‹ã‹è³ªå• +ã•ã‚Œã‚‹ã®ã§ã€ãã®æŠ•ç¨¿ã¨ã™ã§ã«é€²è¡Œä¸­ã®ä»–ã®ä½œæ¥­ã¨ã®è¡çªãŒé¿ã‘られã¾ã™ã€‚ + +大部分ã®ã“れらã®ãƒªãƒã‚¸ãƒˆãƒªã¯ git ツリーã§ã™ã€‚ã—ã‹ã—ãã®ä»–ã® SCM ã‚„ +quilt シリーズã¨ã—ã¦å…¬é–‹ã•ã‚Œã¦ã„るパッãƒã‚­ãƒ¥ãƒ¼ã‚‚使ã‚ã‚Œã¦ã„ã¾ã™ã€‚ã“れら +ã®ã‚µãƒ–システムリãƒã‚¸ãƒˆãƒªã®ã‚¢ãƒ‰ãƒ¬ã‚¹ã¯ MAINTAINERS ファイルã«ãƒªã‚¹ãƒˆã•ã‚Œ +ã¦ã„ã¾ã™ã€‚ã“れらã®å¤šã㯠http://git.kernel.org/ ã§å‚ç…§ã™ã‚‹ã“ã¨ãŒã§ãã¾ +ã™ã€‚ - ãã®ä»–ã®ã‚«ãƒ¼ãƒãƒ«ãƒ„リー㯠http://git.kernel.org/ 㨠MAINTAINERS ファ - イルã«ä¸€è¦§è¡¨ãŒã‚ã‚Šã¾ã™ã€‚ +æ案ã•ã‚ŒãŸãƒ‘ッãƒãŒã“ã®ã‚ˆã†ãªã‚µãƒ–システムツリーã«ã‚³ãƒŸãƒƒãƒˆã•ã‚Œã‚‹å‰ã«ã€ãƒ¡ãƒ¼ +リングリストã§äº‹å‰ã«ãƒ¬ãƒ“ューã«ã‹ã‘られã¾ã™ï¼ˆä»¥ä¸‹ã®å¯¾å¿œã™ã‚‹ã‚»ã‚¯ã‚·ãƒ§ãƒ³ã‚’ +å‚照)。ã„ãã¤ã‹ã®ã‚«ãƒ¼ãƒãƒ«ã‚µãƒ–システムã§ã¯ã€ã“ã®ãƒ¬ãƒ“ュー㯠patchwork +ã¨ã„ã†ãƒ„ールã«ã‚ˆã£ã¦è¿½è·¡ã•ã‚Œã¾ã™ã€‚Patchwork 㯠web インターフェイス㫠+よã£ã¦ãƒ‘ッãƒæŠ•ç¨¿ã®è¡¨ç¤ºã€ãƒ‘ッãƒã¸ã®ã‚³ãƒ¡ãƒ³ãƒˆä»˜ã‘や改訂ãªã©ãŒã§ãã€ãã—㦠+メンテナã¯ãƒ‘ッãƒã«å¯¾ã—ã¦ã€ãƒ¬ãƒ“ュー中ã€å—付済ã¿ã€æ‹’å¦ã¨ã„ã†ã‚ˆã†ãªãƒžãƒ¼ã‚¯ +ã‚’ã¤ã‘ã‚‹ã“ã¨ãŒã§ãã¾ã™ã€‚大部分ã®ã“れら㮠patchwork ã®ã‚µã‚¤ãƒˆã¯ +http://patchwork.kernel.org/ ã§ãƒªã‚¹ãƒˆã•ã‚Œã¦ã„ã¾ã™ã€‚ + +çµ±åˆãƒ†ã‚¹ãƒˆã®ãŸã‚ã® 2.6.x -next カーãƒãƒ«ãƒ„リー +--------------------------------------------- + +サブシステムツリーã®æ›´æ–°å†…容ãŒãƒ¡ã‚¤ãƒ³ãƒ©ã‚¤ãƒ³ã® 2.6.x ツリーã«ãƒžãƒ¼ã‚¸ã•ã‚Œ +ã‚‹å‰ã«ã€ãれらã¯çµ±åˆãƒ†ã‚¹ãƒˆã•ã‚Œã‚‹å¿…è¦ãŒã‚ã‚Šã¾ã™ã€‚ã“ã®ç›®çš„ã®ãŸã‚ã€å®Ÿè³ªçš„ +ã«å…¨ã‚µãƒ–システムツリーã‹ã‚‰ã»ã¼æ¯Žæ—¥ãƒ—ルã•ã‚Œã¦ã§ãる特別ãªãƒ†ã‚¹ãƒˆç”¨ã®ãƒª +ãƒã‚¸ãƒˆãƒªãŒå­˜åœ¨ã—ã¾ã™- + http://git.kernel.org/?p=linux/kernel/git/sfr/linux-next.git + http://linux.f-seidel.de/linux-next/pmwiki/ + +ã“ã®ã‚„ã‚Šæ–¹ã«ã‚ˆã£ã¦ã€-next カーãƒãƒ«ã¯æ¬¡ã®ãƒžãƒ¼ã‚¸æ©Ÿä¼šã§ã©ã‚“ãªã‚‚ã®ãŒãƒ¡ã‚¤ãƒ³ +ラインカーãƒãƒ«ã«ãƒžãƒ¼ã‚¸ã•ã‚Œã‚‹ã‹ã€ãŠãŠã¾ã‹ãªã®å±•æœ›ã‚’æä¾›ã—ã¾ã™ã€‚-next +カーãƒãƒ«ã®å®Ÿè¡Œãƒ†ã‚¹ãƒˆã‚’è¡Œã†å†’険好ããªãƒ†ã‚¹ã‚¿ãƒ¼ã¯å¤§ã„ã«æ­“è¿Žã•ã‚Œã¾ã™ ãƒã‚°ãƒ¬ãƒãƒ¼ãƒˆ ------------- @@ -673,10 +631,9 @@ Linux カーãƒãƒ«ã‚³ãƒŸãƒ¥ãƒ‹ãƒ†ã‚£ã¯ã€ä¸€åº¦ã«å¤§é‡ã®ã‚³ãƒ¼ãƒ‰ã®å¡Šã‚’ ã˜ã¨ã“ã‚ã‹ã‚‰ã‚¹ã‚¿ãƒ¼ãƒˆã—ãŸã®ã§ã™ã‹ã‚‰ã€‚ Paolo Ciarrocchi ã«æ„Ÿè¬ã€å½¼ã¯å½¼ã®æ›¸ã„㟠"Development Process" -(http://linux.tar.bz/articles/2.6-development_process)セクショ -ンをã“ã®ãƒ†ã‚­ã‚¹ãƒˆã®åŽŸåž‹ã«ã™ã‚‹ã“ã¨ã‚’許å¯ã—ã¦ãã‚Œã¾ã—ãŸã€‚ -Rundy Dunlap 㨠Gerrit Huizenga ã¯ãƒ¡ãƒ¼ãƒªãƒ³ã‚°ãƒªã‚¹ãƒˆã§ã‚„ã‚‹ã¹ãã“ã¨ã¨ã‚„㣠-ã¦ã¯ã„ã‘ãªã„ã“ã¨ã®ãƒªã‚¹ãƒˆã‚’æä¾›ã—ã¦ãã‚Œã¾ã—ãŸã€‚ +(http://lwn.net/Articles/94386/) セクションをã“ã®ãƒ†ã‚­ã‚¹ãƒˆã®åŽŸåž‹ã«ã™ã‚‹ +ã“ã¨ã‚’許å¯ã—ã¦ãã‚Œã¾ã—ãŸã€‚Rundy Dunlap 㨠Gerrit Huizenga ã¯ãƒ¡ãƒ¼ãƒªãƒ³ã‚° +リストã§ã‚„ã‚‹ã¹ãã“ã¨ã¨ã‚„ã£ã¦ã¯ã„ã‘ãªã„ã“ã¨ã®ãƒªã‚¹ãƒˆã‚’æä¾›ã—ã¦ãã‚Œã¾ã—ãŸã€‚ 以下ã®äººã€…ã®ãƒ¬ãƒ“ューã€ã‚³ãƒ¡ãƒ³ãƒˆã€è²¢çŒ®ã«æ„Ÿè¬ã€‚ Pat Mochel, Hanna Linder, Randy Dunlap, Kay Sievers, Vojtech Pavlik, Jan Kara, Josh Boyer, Kees Cook, Andrew Morton, Andi -- cgit v1.1 From 5cf4c80a145d79db928ff55226e731de55c87644 Mon Sep 17 00:00:00 2001 From: Kevin Hilman Date: Tue, 19 Apr 2011 16:52:56 -0700 Subject: davinci: fix DEBUG_LL code for p2v changes Fixup davinci UART low-level debug code for new ARM generic p2v changes. Based on OMAP changes by Tony Lindgren Cc: Tony Lindgren Signed-off-by: Kevin Hilman --- arch/arm/mach-davinci/include/mach/debug-macro.S | 13 ++++++++----- arch/arm/mach-davinci/include/mach/serial.h | 2 +- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/arch/arm/mach-davinci/include/mach/debug-macro.S b/arch/arm/mach-davinci/include/mach/debug-macro.S index 9f1befc..f8b7ea4 100644 --- a/arch/arm/mach-davinci/include/mach/debug-macro.S +++ b/arch/arm/mach-davinci/include/mach/debug-macro.S @@ -24,6 +24,9 @@ #define UART_SHIFT 2 +#define davinci_uart_v2p(x) ((x) - PAGE_OFFSET + PLAT_PHYS_OFFSET) +#define davinci_uart_p2v(x) ((x) - PLAT_PHYS_OFFSET + PAGE_OFFSET) + .pushsection .data davinci_uart_phys: .word 0 davinci_uart_virt: .word 0 @@ -34,7 +37,7 @@ davinci_uart_virt: .word 0 /* Use davinci_uart_phys/virt if already configured */ 10: mrc p15, 0, \rp, c1, c0 tst \rp, #1 @ MMU enabled? - ldreq \rp, =__virt_to_phys(davinci_uart_phys) + ldreq \rp, =davinci_uart_v2p(davinci_uart_phys) ldrne \rp, =davinci_uart_phys add \rv, \rp, #4 @ davinci_uart_virt ldr \rp, [\rp, #0] @@ -48,18 +51,18 @@ davinci_uart_virt: .word 0 tst \rp, #1 @ MMU enabled? /* Copy uart phys address from decompressor uart info */ - ldreq \rv, =__virt_to_phys(davinci_uart_phys) + ldreq \rv, =davinci_uart_v2p(davinci_uart_phys) ldrne \rv, =davinci_uart_phys ldreq \rp, =DAVINCI_UART_INFO - ldrne \rp, =__phys_to_virt(DAVINCI_UART_INFO) + ldrne \rp, =davinci_uart_p2v(DAVINCI_UART_INFO) ldr \rp, [\rp, #0] str \rp, [\rv] /* Copy uart virt address from decompressor uart info */ - ldreq \rv, =__virt_to_phys(davinci_uart_virt) + ldreq \rv, =davinci_uart_v2p(davinci_uart_virt) ldrne \rv, =davinci_uart_virt ldreq \rp, =DAVINCI_UART_INFO - ldrne \rp, =__phys_to_virt(DAVINCI_UART_INFO) + ldrne \rp, =davinci_uart_p2v(DAVINCI_UART_INFO) ldr \rp, [\rp, #4] str \rp, [\rv] diff --git a/arch/arm/mach-davinci/include/mach/serial.h b/arch/arm/mach-davinci/include/mach/serial.h index 8051110..c9e6ce1 100644 --- a/arch/arm/mach-davinci/include/mach/serial.h +++ b/arch/arm/mach-davinci/include/mach/serial.h @@ -22,7 +22,7 @@ * * This area sits just below the page tables (see arch/arm/kernel/head.S). */ -#define DAVINCI_UART_INFO (PHYS_OFFSET + 0x3ff8) +#define DAVINCI_UART_INFO (PLAT_PHYS_OFFSET + 0x3ff8) #define DAVINCI_UART0_BASE (IO_PHYS + 0x20000) #define DAVINCI_UART1_BASE (IO_PHYS + 0x20400) -- cgit v1.1 From d8408aef910b5d538ae07218992b270a9e01067f Mon Sep 17 00:00:00 2001 From: Daniel Trautmann Date: Mon, 21 Mar 2011 15:36:35 +0100 Subject: uio_netx: Add support for netPLC cards This patch adds support for Hilscher / IBHsoftec netPLC cards to uio_netx userspace IO driver. Changes from v1 -> v2: Fixed whitespace errors reported by scripts/checkpatch.pl which were caused by email client. Signed-off-by: Daniel Trautmann Signed-off-by: "Hans J. Koch" Signed-off-by: Greg Kroah-Hartman --- drivers/uio/uio_netx.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/drivers/uio/uio_netx.c b/drivers/uio/uio_netx.c index 5ffdb48..a879fd5 100644 --- a/drivers/uio/uio_netx.c +++ b/drivers/uio/uio_netx.c @@ -18,6 +18,9 @@ #define PCI_VENDOR_ID_HILSCHER 0x15CF #define PCI_DEVICE_ID_HILSCHER_NETX 0x0000 +#define PCI_DEVICE_ID_HILSCHER_NETPLC 0x0010 +#define PCI_SUBDEVICE_ID_NETPLC_RAM 0x0000 +#define PCI_SUBDEVICE_ID_NETPLC_FLASH 0x0001 #define PCI_SUBDEVICE_ID_NXSB_PCA 0x3235 #define PCI_SUBDEVICE_ID_NXPCA 0x3335 @@ -66,6 +69,10 @@ static int __devinit netx_pci_probe(struct pci_dev *dev, bar = 0; info->name = "netx"; break; + case PCI_DEVICE_ID_HILSCHER_NETPLC: + bar = 0; + info->name = "netplc"; + break; default: bar = 2; info->name = "netx_plx"; @@ -134,6 +141,18 @@ static struct pci_device_id netx_pci_ids[] = { .subdevice = 0, }, { + .vendor = PCI_VENDOR_ID_HILSCHER, + .device = PCI_DEVICE_ID_HILSCHER_NETPLC, + .subvendor = PCI_VENDOR_ID_HILSCHER, + .subdevice = PCI_SUBDEVICE_ID_NETPLC_RAM, + }, + { + .vendor = PCI_VENDOR_ID_HILSCHER, + .device = PCI_DEVICE_ID_HILSCHER_NETPLC, + .subvendor = PCI_VENDOR_ID_HILSCHER, + .subdevice = PCI_SUBDEVICE_ID_NETPLC_FLASH, + }, + { .vendor = PCI_VENDOR_ID_PLX, .device = PCI_DEVICE_ID_PLX_9030, .subvendor = PCI_VENDOR_ID_PLX, -- cgit v1.1 From f0c554fddd3be561542cd37acdb3adc9ec5483ee Mon Sep 17 00:00:00 2001 From: Hillf Danton Date: Mon, 28 Mar 2011 23:33:26 +0200 Subject: uio: fix finding mm index for vma When finding mm index for vma it looks more flexible that the mm could be sparse, and both the size of mm and the pgoff of vma could give correct selection. Signed-off-by: Hillf Danton Signed-off-by: Hans J. Koch Signed-off-by: Greg Kroah-Hartman --- drivers/uio/uio.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/uio/uio.c b/drivers/uio/uio.c index 51fe179..10a029b 100644 --- a/drivers/uio/uio.c +++ b/drivers/uio/uio.c @@ -587,14 +587,12 @@ static ssize_t uio_write(struct file *filep, const char __user *buf, static int uio_find_mem_index(struct vm_area_struct *vma) { - int mi; struct uio_device *idev = vma->vm_private_data; - for (mi = 0; mi < MAX_UIO_MAPS; mi++) { - if (idev->info->mem[mi].size == 0) + if (vma->vm_pgoff < MAX_UIO_MAPS) { + if (idev->info->mem[vma->vm_pgoff].size == 0) return -1; - if (vma->vm_pgoff == mi) - return mi; + return (int)vma->vm_pgoff; } return -1; } -- cgit v1.1 From c6edc42fe1b5562abae22beabbebd9e557527ae3 Mon Sep 17 00:00:00 2001 From: Hillf Danton Date: Thu, 31 Mar 2011 20:38:47 +0800 Subject: uio: fix allocating minor id for uio device The number of uio devices that could be used should be less than UIO_MAX_DEVICES by design, and this work guards any cases in which id more than UIO_MAX_DEVICES is utilized. Signed-off-by: Hillf Danton Signed-off-by: Hans J. Koch Signed-off-by: Greg Kroah-Hartman --- drivers/uio/uio.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/uio/uio.c b/drivers/uio/uio.c index 10a029b..d2efe82 100644 --- a/drivers/uio/uio.c +++ b/drivers/uio/uio.c @@ -381,7 +381,13 @@ static int uio_get_minor(struct uio_device *idev) retval = -ENOMEM; goto exit; } - idev->minor = id & MAX_ID_MASK; + if (id < UIO_MAX_DEVICES) { + idev->minor = id; + } else { + dev_err(idev->dev, "too many uio devices\n"); + retval = -EINVAL; + idr_remove(&uio_idr, id); + } exit: mutex_unlock(&minor_lock); return retval; -- cgit v1.1 From 47296b1962ead8301488f0dbe8424c7db7eac635 Mon Sep 17 00:00:00 2001 From: Jie Zhou Date: Wed, 6 Apr 2011 14:42:40 +0800 Subject: uio: clean uioinfo when uninstall uio driver The uioinfo should be cleaned up when uninstall, otherwise re-install failure of uio_pdrv_genirq.ko will happen. Signed-off-by: Jie Zhou Signed-off-by: Aisheng Dong Signed-off-by: Hans J. Koch Signed-off-by: Greg Kroah-Hartman --- drivers/uio/uio_pdrv_genirq.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/uio/uio_pdrv_genirq.c b/drivers/uio/uio_pdrv_genirq.c index 7174d51..0f424af 100644 --- a/drivers/uio/uio_pdrv_genirq.c +++ b/drivers/uio/uio_pdrv_genirq.c @@ -189,6 +189,10 @@ static int uio_pdrv_genirq_remove(struct platform_device *pdev) uio_unregister_device(priv->uioinfo); pm_runtime_disable(&pdev->dev); + + priv->uioinfo->handler = NULL; + priv->uioinfo->irqcontrol = NULL; + kfree(priv); return 0; } -- cgit v1.1 From 7e5b58bcbcb3d7518389c1d82fb6e926f5a9f72c Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Thu, 7 Apr 2011 04:29:20 +0200 Subject: printk: /dev/kmsg - properly support writev() to avoid interleaved printk() lines printk: /dev/kmsg - properly support writev() to avoid interleaved printk lines We should avoid calling printk() in a loop, when we pass a single string to /dev/kmsg with writev(). Cc: Lennart Poettering Signed-off-by: Kay Sievers Cc: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- drivers/char/mem.c | 41 ++++++++++++++++++++++++++--------------- 1 file changed, 26 insertions(+), 15 deletions(-) diff --git a/drivers/char/mem.c b/drivers/char/mem.c index 436a990..78923a9 100644 --- a/drivers/char/mem.c +++ b/drivers/char/mem.c @@ -806,29 +806,40 @@ static const struct file_operations oldmem_fops = { }; #endif -static ssize_t kmsg_write(struct file *file, const char __user *buf, - size_t count, loff_t *ppos) +static ssize_t kmsg_writev(struct kiocb *iocb, const struct iovec *iv, + unsigned long count, loff_t pos) { - char *tmp; - ssize_t ret; + char *line, *p; + int len, i; + ssize_t ret = -EFAULT; - tmp = kmalloc(count + 1, GFP_KERNEL); - if (tmp == NULL) + len = iov_length(iv, count); + line = p = kmalloc(len + 1, GFP_KERNEL); + if (line == NULL) return -ENOMEM; - ret = -EFAULT; - if (!copy_from_user(tmp, buf, count)) { - tmp[count] = 0; - ret = printk("%s", tmp); - if (ret > count) - /* printk can add a prefix */ - ret = count; + + /* + * copy all vectors into a single string, to ensure we do + * not interleave our log line with other printk calls + */ + for (i = 0; i < count; i++) { + if (copy_from_user(p, iv[i].iov_base, iv[i].iov_len)) + goto out; + p += iv[i].iov_len; } - kfree(tmp); + p[0] = '\0'; + + ret = printk("%s", line); + /* printk can add a prefix */ + if (ret > len) + ret = len; +out: + kfree(line); return ret; } static const struct file_operations kmsg_fops = { - .write = kmsg_write, + .aio_write = kmsg_writev, .llseek = noop_llseek, }; -- cgit v1.1 From 70a5f52165bd04cf3b33f30d5d234be28dcf29d4 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 12 Apr 2011 16:13:31 -0700 Subject: kmsg: properly support writev to avoid interleaved printk lines fix make `len' size_t, avoid multiple-assignments. Cc: Kay Sievers Cc: Lennart Poettering Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- drivers/char/mem.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/char/mem.c b/drivers/char/mem.c index 78923a9..8fc04b4 100644 --- a/drivers/char/mem.c +++ b/drivers/char/mem.c @@ -810,11 +810,11 @@ static ssize_t kmsg_writev(struct kiocb *iocb, const struct iovec *iv, unsigned long count, loff_t pos) { char *line, *p; - int len, i; + int i; ssize_t ret = -EFAULT; + size_t len = iov_length(iv, count); - len = iov_length(iv, count); - line = p = kmalloc(len + 1, GFP_KERNEL); + line = kmalloc(len + 1, GFP_KERNEL); if (line == NULL) return -ENOMEM; @@ -822,6 +822,7 @@ static ssize_t kmsg_writev(struct kiocb *iocb, const struct iovec *iv, * copy all vectors into a single string, to ensure we do * not interleave our log line with other printk calls */ + p = line; for (i = 0; i < count; i++) { if (copy_from_user(p, iv[i].iov_base, iv[i].iov_len)) goto out; -- cgit v1.1 From c6914a6f261aca0c9f715f883a353ae7ff51fe83 Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Tue, 19 Apr 2011 20:36:59 -0700 Subject: can: Add missing socket check in can/bcm release. We can get here with a NULL socket argument passed from userspace, so we need to handle it accordingly. Signed-off-by: Dave Jones Signed-off-by: David S. Miller --- net/can/bcm.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/net/can/bcm.c b/net/can/bcm.c index 57b1aed..8a6a05e 100644 --- a/net/can/bcm.c +++ b/net/can/bcm.c @@ -1427,9 +1427,14 @@ static int bcm_init(struct sock *sk) static int bcm_release(struct socket *sock) { struct sock *sk = sock->sk; - struct bcm_sock *bo = bcm_sk(sk); + struct bcm_sock *bo; struct bcm_op *op, *next; + if (sk == NULL) + return 0; + + bo = bcm_sk(sk); + /* remove bcm_ops, timer, rx_unregister(), etc. */ unregister_netdevice_notifier(&bo->notifier); -- cgit v1.1 From 6f4d6dc167a001267eeff18bdea0ce3e9108c662 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Tue, 19 Apr 2011 09:39:22 +0000 Subject: ehea: Fix a DLPAR bug on ehea_rereg_mrs(). We are currently continuing if ehea_restart_qps() fails, when we do a memory DLPAR (remove or add more memory to the system). This patch just let the NAPI disabled if the ehea_restart_qps() fails. Signed-off-by: Breno Leitao Signed-off-by: David S. Miller --- drivers/net/ehea/ehea_main.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/net/ehea/ehea_main.c b/drivers/net/ehea/ehea_main.c index f75d314..53c0f04 100644 --- a/drivers/net/ehea/ehea_main.c +++ b/drivers/net/ehea/ehea_main.c @@ -3040,11 +3040,14 @@ static void ehea_rereg_mrs(void) if (dev->flags & IFF_UP) { mutex_lock(&port->port_lock); - port_napi_enable(port); ret = ehea_restart_qps(dev); - check_sqs(port); - if (!ret) + if (!ret) { + check_sqs(port); + port_napi_enable(port); netif_wake_queue(dev); + } else { + netdev_err(dev, "Unable to restart QPS\n"); + } mutex_unlock(&port->port_lock); } } -- cgit v1.1 From 2430af8b7fa37ac0be102c77f9dc6ee669d24ba9 Mon Sep 17 00:00:00 2001 From: Jiri Bohac Date: Tue, 19 Apr 2011 02:09:55 +0000 Subject: bonding: 802.3ad - fix agg_device_up The slave member of struct aggregator does not necessarily point to a slave which is part of the aggregator. It points to the slave structure containing the aggregator structure, while completely different slaves (or no slaves at all) may be part of the aggregator. The agg_device_up() function wrongly uses agg->slave to find the state of the aggregator. Use agg->lag_ports->slave instead. The bug has been introduced by commit 4cd6fe1c6483cde93e2ec91f58b7af9c9eea51ad ("bonding: fix link down handling in 802.3ad mode"). Signed-off-by: Jiri Bohac Signed-off-by: Jay Vosburgh Signed-off-by: David S. Miller --- drivers/net/bonding/bond_3ad.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/net/bonding/bond_3ad.c b/drivers/net/bonding/bond_3ad.c index 494bf96..31912f1 100644 --- a/drivers/net/bonding/bond_3ad.c +++ b/drivers/net/bonding/bond_3ad.c @@ -1482,8 +1482,11 @@ static struct aggregator *ad_agg_selection_test(struct aggregator *best, static int agg_device_up(const struct aggregator *agg) { - return (netif_running(agg->slave->dev) && - netif_carrier_ok(agg->slave->dev)); + struct port *port = agg->lag_ports; + if (!port) + return 0; + return (netif_running(port->slave->dev) && + netif_carrier_ok(port->slave->dev)); } /** -- cgit v1.1 From 9cbdb702092a2d82f909312f4ec3eeded77bb82e Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 6 Apr 2011 21:54:20 -0600 Subject: perf script: improve validation of sample attributes for output fields Check for required sample attributes using evsel rather than sample_type in the session header. If the attribute for a default field is not present for the event type (e.g., new command operating on file from older kernel) the field is removed from the output list. Expected event types must exist. For example, if a user specifies -f trace:time,trace -f sw:time,cpu,sym the perf.data file must contain both tracepoints and software events (ie., it is an error if either does not exist in the file). Attribute checking is done once at the beginning of perf-script rather than for each sample. v1 -> v2: - addressed comments from acme Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1302148460-570-1-git-send-email-daahern@cisco.com Signed-off-by: David Ahern Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-script.c | 111 +++++++++++++++++++++++++++++++++++++------- tools/perf/util/session.c | 12 +++++ tools/perf/util/session.h | 3 ++ 3 files changed, 109 insertions(+), 17 deletions(-) diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index b3012c4..974f6d3 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -51,6 +51,7 @@ struct output_option { /* default set to maintain compatibility with current format */ static struct { bool user_set; + bool wildcard_set; u64 fields; u64 invalid_fields; } output[PERF_TYPE_MAX] = { @@ -104,41 +105,113 @@ static bool output_set_by_user(void) return false; } +static const char *output_field2str(enum perf_output_field field) +{ + int i, imax = ARRAY_SIZE(all_output_options); + const char *str = ""; + + for (i = 0; i < imax; ++i) { + if (all_output_options[i].field == field) { + str = all_output_options[i].str; + break; + } + } + return str; +} + #define PRINT_FIELD(x) (output[attr->type].fields & PERF_OUTPUT_##x) -static int perf_session__check_attr(struct perf_session *session, - struct perf_event_attr *attr) +static int perf_event_attr__check_stype(struct perf_event_attr *attr, + u64 sample_type, const char *sample_msg, + enum perf_output_field field) { + int type = attr->type; + const char *evname; + + if (attr->sample_type & sample_type) + return 0; + + if (output[type].user_set) { + evname = __event_name(attr->type, attr->config); + pr_err("Samples for '%s' event do not have %s attribute set. " + "Cannot print '%s' field.\n", + evname, sample_msg, output_field2str(field)); + return -1; + } + + /* user did not ask for it explicitly so remove from the default list */ + output[type].fields &= ~field; + evname = __event_name(attr->type, attr->config); + pr_debug("Samples for '%s' event do not have %s attribute set. " + "Skipping '%s' field.\n", + evname, sample_msg, output_field2str(field)); + + return 0; +} + +static int perf_evsel__check_attr(struct perf_evsel *evsel, + struct perf_session *session) +{ + struct perf_event_attr *attr = &evsel->attr; + if (PRINT_FIELD(TRACE) && !perf_session__has_traces(session, "record -R")) return -EINVAL; if (PRINT_FIELD(SYM)) { - if (!(session->sample_type & PERF_SAMPLE_IP)) { - pr_err("Samples do not contain IP data.\n"); + if (perf_event_attr__check_stype(attr, PERF_SAMPLE_IP, "IP", + PERF_OUTPUT_SYM)) return -EINVAL; - } + if (!no_callchain && - !(session->sample_type & PERF_SAMPLE_CALLCHAIN)) + !(attr->sample_type & PERF_SAMPLE_CALLCHAIN)) symbol_conf.use_callchain = false; } if ((PRINT_FIELD(PID) || PRINT_FIELD(TID)) && - !(session->sample_type & PERF_SAMPLE_TID)) { - pr_err("Samples do not contain TID/PID data.\n"); + perf_event_attr__check_stype(attr, PERF_SAMPLE_TID, "TID", + PERF_OUTPUT_TID|PERF_OUTPUT_PID)) return -EINVAL; - } if (PRINT_FIELD(TIME) && - !(session->sample_type & PERF_SAMPLE_TIME)) { - pr_err("Samples do not contain timestamps.\n"); + perf_event_attr__check_stype(attr, PERF_SAMPLE_TIME, "TIME", + PERF_OUTPUT_TIME)) return -EINVAL; - } if (PRINT_FIELD(CPU) && - !(session->sample_type & PERF_SAMPLE_CPU)) { - pr_err("Samples do not contain cpu.\n"); + perf_event_attr__check_stype(attr, PERF_SAMPLE_CPU, "CPU", + PERF_OUTPUT_CPU)) return -EINVAL; + + return 0; +} + +/* + * verify all user requested events exist and the samples + * have the expected data + */ +static int perf_session__check_output_opt(struct perf_session *session) +{ + int j; + struct perf_evsel *evsel; + + for (j = 0; j < PERF_TYPE_MAX; ++j) { + evsel = perf_session__find_first_evtype(session, j); + + /* + * even if fields is set to 0 (ie., show nothing) event must + * exist if user explicitly includes it on the command line + */ + if (!evsel && output[j].user_set && !output[j].wildcard_set) { + pr_err("%s events do not exist. " + "Remove corresponding -f option to proceed.\n", + event_type(j)); + return -1; + } + + if (evsel && output[j].fields && + perf_evsel__check_attr(evsel, session)) + return -1; } return 0; @@ -210,9 +283,6 @@ static void process_event(union perf_event *event __unused, if (output[attr->type].fields == 0) return; - if (perf_session__check_attr(session, attr) < 0) - return; - print_sample_start(sample, thread, attr); if (PRINT_FIELD(TRACE)) @@ -525,6 +595,7 @@ static int parse_output_fields(const struct option *opt __used, output[type].fields = 0; output[type].user_set = true; + output[type].wildcard_set = false; } else { tok = str; @@ -541,6 +612,7 @@ static int parse_output_fields(const struct option *opt __used, for (j = 0; j < PERF_TYPE_MAX; ++j) { output[j].fields = 0; output[j].user_set = true; + output[j].wildcard_set = true; } } @@ -1145,6 +1217,11 @@ int cmd_script(int argc, const char **argv, const char *prefix __used) pr_debug("perf script started with script %s\n\n", script_name); } + + err = perf_session__check_output_opt(session); + if (err < 0) + goto out; + err = __cmd_script(session); perf_session__delete(session); diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index caa2245..fff6674 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -1156,6 +1156,18 @@ size_t perf_session__fprintf_nr_events(struct perf_session *session, FILE *fp) return ret; } +struct perf_evsel *perf_session__find_first_evtype(struct perf_session *session, + unsigned int type) +{ + struct perf_evsel *pos; + + list_for_each_entry(pos, &session->evlist->entries, node) { + if (pos->attr.type == type) + return pos; + } + return NULL; +} + void perf_session__print_symbols(union perf_event *event, struct perf_sample *sample, struct perf_session *session) diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h index 1ac481f..8daaa2d 100644 --- a/tools/perf/util/session.h +++ b/tools/perf/util/session.h @@ -162,6 +162,9 @@ static inline int perf_session__parse_sample(struct perf_session *session, session->sample_id_all, sample); } +struct perf_evsel *perf_session__find_first_evtype(struct perf_session *session, + unsigned int type); + void perf_session__print_symbols(union perf_event *event, struct perf_sample *sample, struct perf_session *session); -- cgit v1.1 From 8a91707d0a1a49193e23cb2d243632f2289feb24 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Wed, 20 Apr 2011 11:54:10 -0400 Subject: xen/p2m: Add EXPORT_SYMBOL_GPL to the M2P override functions. If the backends, which use these two functions, are compiled as a module we need these two functions to be exported. Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/p2m.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 2d2b32af..c851397 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -682,7 +682,7 @@ int m2p_add_override(unsigned long mfn, struct page *page, bool clear_pte) return 0; } - +EXPORT_SYMBOL_GPL(m2p_add_override); int m2p_remove_override(struct page *page, bool clear_pte) { unsigned long flags; @@ -719,6 +719,7 @@ int m2p_remove_override(struct page *page, bool clear_pte) return 0; } +EXPORT_SYMBOL_GPL(m2p_remove_override); struct page *m2p_find_override(unsigned long mfn) { -- cgit v1.1 From 10022a6c66e199d8f61d9044543f38785713cbbd Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Wed, 20 Apr 2011 01:57:15 +0000 Subject: can: add missing socket check in can/raw release v2: added space after 'if' according code style. We can get here with a NULL socket argument passed from userspace, so we need to handle it accordingly. Thanks to Dave Jones pointing at this issue in net/can/bcm.c Signed-off-by: Oliver Hartkopp Signed-off-by: David S. Miller --- net/can/raw.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/net/can/raw.c b/net/can/raw.c index 649acfa..0eb39a7 100644 --- a/net/can/raw.c +++ b/net/can/raw.c @@ -305,7 +305,12 @@ static int raw_init(struct sock *sk) static int raw_release(struct socket *sock) { struct sock *sk = sock->sk; - struct raw_sock *ro = raw_sk(sk); + struct raw_sock *ro; + + if (!sk) + return 0; + + ro = raw_sk(sk); unregister_netdevice_notifier(&ro->notifier); -- cgit v1.1 From 243e6df4ed919880d079d717641ad699c6530a03 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 19 Apr 2011 20:44:04 +0200 Subject: mac80211: fix SMPS debugfs locking The locking with SMPS requests means that the debugs file should lock the mgd mutex, not the iflist mutex. Calls to __ieee80211_request_smps() need to hold that mutex, so add an assertion. This has always been wrong, but for some reason never been noticed, probably because the locking error only happens while unassociated. Cc: stable@kernel.org [2.6.34+] Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- net/mac80211/cfg.c | 2 ++ net/mac80211/debugfs_netdev.c | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 3342135..4404973 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -1504,6 +1504,8 @@ int __ieee80211_request_smps(struct ieee80211_sub_if_data *sdata, enum ieee80211_smps_mode old_req; int err; + lockdep_assert_held(&sdata->u.mgd.mtx); + old_req = sdata->u.mgd.req_smps; sdata->u.mgd.req_smps = smps_mode; diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c index dacace6..9ea7c0d 100644 --- a/net/mac80211/debugfs_netdev.c +++ b/net/mac80211/debugfs_netdev.c @@ -177,9 +177,9 @@ static int ieee80211_set_smps(struct ieee80211_sub_if_data *sdata, if (sdata->vif.type != NL80211_IFTYPE_STATION) return -EOPNOTSUPP; - mutex_lock(&local->iflist_mtx); + mutex_lock(&sdata->u.mgd.mtx); err = __ieee80211_request_smps(sdata, smps_mode); - mutex_unlock(&local->iflist_mtx); + mutex_unlock(&sdata->u.mgd.mtx); return err; } -- cgit v1.1 From b25026981aecde3685dd0e45ad980fff9f528daa Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Wed, 20 Apr 2011 15:57:14 +0200 Subject: iwlwifi: fix skb usage after free Since commit a120e912eb51e347f36c71b60a1d13af74d30e83 Author: Stanislaw Gruszka Date: Fri Feb 19 15:47:33 2010 -0800 iwlwifi: sanity check before counting number of tfds can be free we use skb->data after calling ieee80211_tx_status_irqsafe(), which could free skb instantly. On current kernels I do not observe practical problems related with bug, but on 2.6.35.y it cause random system hangs when stressing wireless link. Cc: stable@kernel.org # 2.6.32+ Signed-off-by: Stanislaw Gruszka Acked-by: Wey-Yi Guy Signed-off-by: John W. Linville --- drivers/net/wireless/iwlwifi/iwl-agn-tx.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/net/wireless/iwlwifi/iwl-agn-tx.c b/drivers/net/wireless/iwlwifi/iwl-agn-tx.c index a709d05..2dd7d54 100644 --- a/drivers/net/wireless/iwlwifi/iwl-agn-tx.c +++ b/drivers/net/wireless/iwlwifi/iwl-agn-tx.c @@ -1224,12 +1224,16 @@ int iwlagn_tx_queue_reclaim(struct iwl_priv *priv, int txq_id, int index) q->read_ptr = iwl_queue_inc_wrap(q->read_ptr, q->n_bd)) { tx_info = &txq->txb[txq->q.read_ptr]; - iwlagn_tx_status(priv, tx_info, - txq_id >= IWLAGN_FIRST_AMPDU_QUEUE); + + if (WARN_ON_ONCE(tx_info->skb == NULL)) + continue; hdr = (struct ieee80211_hdr *)tx_info->skb->data; - if (hdr && ieee80211_is_data_qos(hdr->frame_control)) + if (ieee80211_is_data_qos(hdr->frame_control)) nfreed++; + + iwlagn_tx_status(priv, tx_info, + txq_id >= IWLAGN_FIRST_AMPDU_QUEUE); tx_info->skb = NULL; if (priv->cfg->ops->lib->txq_inval_byte_cnt_tbl) -- cgit v1.1 From 069f40fc07f6df3da325e7ea1698a0d6247983d5 Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Wed, 20 Apr 2011 16:01:46 +0200 Subject: iwl4965: fix skb usage after free Since commit a120e912eb51e347f36c71b60a1d13af74d30e83 Author: Stanislaw Gruszka Date: Fri Feb 19 15:47:33 2010 -0800 iwlwifi: sanity check before counting number of tfds can be free we use skb->data after calling ieee80211_tx_status_irqsafe(), which could free skb instantly. On current kernels I do not observe practical problems related with bug, but on 2.6.35.y it cause random system hangs when stressing wireless link, making bisection of other problems impossible. Signed-off-by: Stanislaw Gruszka Signed-off-by: John W. Linville --- drivers/net/wireless/iwlegacy/iwl-4965-tx.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/net/wireless/iwlegacy/iwl-4965-tx.c b/drivers/net/wireless/iwlegacy/iwl-4965-tx.c index 5c40502f..fbec88d 100644 --- a/drivers/net/wireless/iwlegacy/iwl-4965-tx.c +++ b/drivers/net/wireless/iwlegacy/iwl-4965-tx.c @@ -1127,12 +1127,16 @@ int iwl4965_tx_queue_reclaim(struct iwl_priv *priv, int txq_id, int index) q->read_ptr = iwl_legacy_queue_inc_wrap(q->read_ptr, q->n_bd)) { tx_info = &txq->txb[txq->q.read_ptr]; - iwl4965_tx_status(priv, tx_info, - txq_id >= IWL4965_FIRST_AMPDU_QUEUE); + + if (WARN_ON_ONCE(tx_info->skb == NULL)) + continue; hdr = (struct ieee80211_hdr *)tx_info->skb->data; - if (hdr && ieee80211_is_data_qos(hdr->frame_control)) + if (ieee80211_is_data_qos(hdr->frame_control)) nfreed++; + + iwl4965_tx_status(priv, tx_info, + txq_id >= IWL4965_FIRST_AMPDU_QUEUE); tx_info->skb = NULL; priv->cfg->ops->lib->txq_free_tfd(priv, txq); -- cgit v1.1 From dffa4b2f62ff28c982144c7033001b1ece4d3532 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 20 Apr 2011 12:23:49 +0200 Subject: x86, mce: Drop the default decoding notifier The default notifier doesn't make a lot of sense to call in the correctable errors case. Drop it and emit the mcelog decoding hint only in the uncorrectable errors case and when no notifier is registered. Also, limit issuing the "mcelog --ascii" message in the rare case when we dump unreported CEs before panicking. While at it, remove unused old x86_mce_decode_callback from the header. Signed-off-by: Borislav Petkov Signed-off-by: Prarit Bhargava Cc: Tony Luck Cc: Nagananda Chumbalkar Cc: Russ Anderson Link: http://lkml.kernel.org/r/20110420102349.GB1361@aftab Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mce.h | 2 -- arch/x86/kernel/cpu/mcheck/mce.c | 24 +++++++----------------- 2 files changed, 7 insertions(+), 19 deletions(-) diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index eb16e94..021979a6 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -142,8 +142,6 @@ static inline void winchip_mcheck_init(struct cpuinfo_x86 *c) {} static inline void enable_p5_mce(void) {} #endif -extern void (*x86_mce_decode_callback)(struct mce *m); - void mce_setup(struct mce *m); void mce_log(struct mce *m); DECLARE_PER_CPU(struct sys_device, mce_dev); diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 68e2303..ff1ae9b 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -105,20 +105,6 @@ static int cpu_missing; ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain); EXPORT_SYMBOL_GPL(x86_mce_decoder_chain); -static int default_decode_mce(struct notifier_block *nb, unsigned long val, - void *data) -{ - pr_emerg(HW_ERR "No human readable MCE decoding support on this CPU type.\n"); - pr_emerg(HW_ERR "Run the message through 'mcelog --ascii' to decode.\n"); - - return NOTIFY_STOP; -} - -static struct notifier_block mce_dec_nb = { - .notifier_call = default_decode_mce, - .priority = -1, -}; - /* MCA banks polled by the period polling timer for corrected events */ DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL @@ -212,6 +198,8 @@ void mce_log(struct mce *mce) static void print_mce(struct mce *m) { + int ret = 0; + pr_emerg(HW_ERR "CPU %d: Machine Check Exception: %Lx Bank %d: %016Lx\n", m->extcpu, m->mcgstatus, m->bank, m->status); @@ -239,7 +227,11 @@ static void print_mce(struct mce *m) * Print out human-readable details about the MCE error, * (if the CPU has an implementation for that) */ - atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m); + ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m); + if (ret == NOTIFY_STOP) + return; + + pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n"); } #define PANIC_TIMEOUT 5 /* 5 seconds */ @@ -1721,8 +1713,6 @@ __setup("mce", mcheck_enable); int __init mcheck_init(void) { - atomic_notifier_chain_register(&x86_mce_decoder_chain, &mce_dec_nb); - mcheck_intel_therm_init(); return 0; -- cgit v1.1 From d3bf52e998056a6002b2aecfe1d25486376382ac Mon Sep 17 00:00:00 2001 From: Rakib Mullick Date: Wed, 20 Apr 2011 21:27:32 +0600 Subject: sched: Remove obsolete comment from scheduler_tick() scheduler_tick() is no longer called by fork code - this got discarded a long time ago by commit bc947631d1d532 ("sched: improve efficiency of sched_fork()"). So, remove the comment which still claims otherwise. Signed-off-by: Rakib Mullick Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/BANLkTimO4iGP0QpaHO1HHF1QOnVcQpc0cw@mail.gmail.com Signed-off-by: Ingo Molnar --- kernel/sched.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 30a29ad..8cb0a57 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4002,9 +4002,6 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) /* * This function gets called by the timer code, with HZ frequency. * We call it with interrupts disabled. - * - * It also gets called by the fork code, when changing the parent's - * timeslices. */ void scheduler_tick(void) { -- cgit v1.1 From a9cf73ea7ff78f52662c8658d93c226effbbedde Mon Sep 17 00:00:00 2001 From: Shan Wei Date: Tue, 19 Apr 2011 22:52:49 +0000 Subject: ipv6: udp: fix the wrong headroom check At this point, skb->data points to skb_transport_header. So, headroom check is wrong. For some case:bridge(UFO is on) + eth device(UFO is off), there is no enough headroom for IPv6 frag head. But headroom check is always false. This will bring about data be moved to there prior to skb->head, when adding IPv6 frag header to skb. Signed-off-by: Shan Wei Acked-by: Herbert Xu Signed-off-by: David S. Miller --- net/ipv6/udp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 15c3774..9e305d74 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1335,7 +1335,7 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, u32 features) skb->ip_summed = CHECKSUM_NONE; /* Check if there is enough headroom to insert fragment header. */ - if ((skb_headroom(skb) < frag_hdr_sz) && + if ((skb_mac_header(skb) < skb->head + frag_hdr_sz) && pskb_expand_head(skb, frag_hdr_sz, 0, GFP_ATOMIC)) goto out; -- cgit v1.1 From 9fd097b14918875bd6f125ed699d7bbbba5893ee Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 21 Apr 2011 21:32:55 +0200 Subject: block: unexport DISK_EVENT_MEDIA_CHANGE for legacy/fringe drivers In-kernel disk event polling doesn't matter for legacy/fringe drivers and may lead to infinite event loop if ->check_events() implementation generates events on level condition instead of edge. Now that block layer supports suppressing exporting unlisted events, simply leaving disk->events cleared allows these drivers to keep the internal revalidation behavior intact while avoiding weird interactions with userland event handler. Signed-off-by: Tejun Heo Cc: Kay Sievers Signed-off-by: Jens Axboe --- drivers/block/DAC960.c | 1 - drivers/block/amiflop.c | 1 - drivers/block/ataflop.c | 1 - drivers/block/floppy.c | 1 - drivers/block/paride/pcd.c | 1 - drivers/block/paride/pd.c | 1 - drivers/block/paride/pf.c | 1 - drivers/block/swim.c | 1 - drivers/block/swim3.c | 1 - drivers/block/ub.c | 1 - drivers/block/xsysace.c | 1 - drivers/cdrom/gdrom.c | 1 - drivers/cdrom/viocd.c | 1 - drivers/message/i2o/i2o_block.c | 1 - drivers/s390/char/tape_block.c | 1 - 15 files changed, 15 deletions(-) diff --git a/drivers/block/DAC960.c b/drivers/block/DAC960.c index 8066d08..e086fbb 100644 --- a/drivers/block/DAC960.c +++ b/drivers/block/DAC960.c @@ -2547,7 +2547,6 @@ static bool DAC960_RegisterBlockDevice(DAC960_Controller_T *Controller) disk->major = MajorNumber; disk->first_minor = n << DAC960_MaxPartitionsBits; disk->fops = &DAC960_BlockDeviceOperations; - disk->events = DISK_EVENT_MEDIA_CHANGE; } /* Indicate the Block Device Registration completed successfully, diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c index 456c0cc..8eba86b 100644 --- a/drivers/block/amiflop.c +++ b/drivers/block/amiflop.c @@ -1736,7 +1736,6 @@ static int __init fd_probe_drives(void) disk->major = FLOPPY_MAJOR; disk->first_minor = drive; disk->fops = &floppy_fops; - disk->events = DISK_EVENT_MEDIA_CHANGE; sprintf(disk->disk_name, "fd%d", drive); disk->private_data = &unit[drive]; set_capacity(disk, 880*2); diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c index c871eae..ede16c6 100644 --- a/drivers/block/ataflop.c +++ b/drivers/block/ataflop.c @@ -1964,7 +1964,6 @@ static int __init atari_floppy_init (void) unit[i].disk->first_minor = i; sprintf(unit[i].disk->disk_name, "fd%d", i); unit[i].disk->fops = &floppy_fops; - unit[i].disk->events = DISK_EVENT_MEDIA_CHANGE; unit[i].disk->private_data = &unit[i]; unit[i].disk->queue = blk_init_queue(do_fd_request, &ataflop_lock); diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index 301d7a9..db8f885 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -4205,7 +4205,6 @@ static int __init floppy_init(void) disks[dr]->major = FLOPPY_MAJOR; disks[dr]->first_minor = TOMINOR(dr); disks[dr]->fops = &floppy_fops; - disks[dr]->events = DISK_EVENT_MEDIA_CHANGE; sprintf(disks[dr]->disk_name, "fd%d", dr); init_timer(&motor_off_timer[dr]); diff --git a/drivers/block/paride/pcd.c b/drivers/block/paride/pcd.c index 2f2ccf6..8690e31 100644 --- a/drivers/block/paride/pcd.c +++ b/drivers/block/paride/pcd.c @@ -320,7 +320,6 @@ static void pcd_init_units(void) disk->first_minor = unit; strcpy(disk->disk_name, cd->name); /* umm... */ disk->fops = &pcd_bdops; - disk->events = DISK_EVENT_MEDIA_CHANGE; } } diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c index 21dfdb77..869e767 100644 --- a/drivers/block/paride/pd.c +++ b/drivers/block/paride/pd.c @@ -837,7 +837,6 @@ static void pd_probe_drive(struct pd_unit *disk) p->fops = &pd_fops; p->major = major; p->first_minor = (disk - pd) << PD_BITS; - p->events = DISK_EVENT_MEDIA_CHANGE; disk->gd = p; p->private_data = disk; p->queue = pd_queue; diff --git a/drivers/block/paride/pf.c b/drivers/block/paride/pf.c index 7adeb1e..f21b520 100644 --- a/drivers/block/paride/pf.c +++ b/drivers/block/paride/pf.c @@ -294,7 +294,6 @@ static void __init pf_init_units(void) disk->first_minor = unit; strcpy(disk->disk_name, pf->name); disk->fops = &pf_fops; - disk->events = DISK_EVENT_MEDIA_CHANGE; if (!(*drives[unit])[D_PRT]) pf_drive_count++; } diff --git a/drivers/block/swim.c b/drivers/block/swim.c index 24a482f..fd5adcd 100644 --- a/drivers/block/swim.c +++ b/drivers/block/swim.c @@ -858,7 +858,6 @@ static int __devinit swim_floppy_init(struct swim_priv *swd) swd->unit[drive].disk->first_minor = drive; sprintf(swd->unit[drive].disk->disk_name, "fd%d", drive); swd->unit[drive].disk->fops = &floppy_fops; - swd->unit[drive].disk->events = DISK_EVENT_MEDIA_CHANGE; swd->unit[drive].disk->private_data = &swd->unit[drive]; swd->unit[drive].disk->queue = swd->queue; set_capacity(swd->unit[drive].disk, 2880); diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c index 4c10f56..773bfa7 100644 --- a/drivers/block/swim3.c +++ b/drivers/block/swim3.c @@ -1163,7 +1163,6 @@ static int __devinit swim3_attach(struct macio_dev *mdev, const struct of_device disk->major = FLOPPY_MAJOR; disk->first_minor = i; disk->fops = &floppy_fops; - disk->events = DISK_EVENT_MEDIA_CHANGE; disk->private_data = &floppy_states[i]; disk->queue = swim3_queue; disk->flags |= GENHD_FL_REMOVABLE; diff --git a/drivers/block/ub.c b/drivers/block/ub.c index 68b9430..0e376d4 100644 --- a/drivers/block/ub.c +++ b/drivers/block/ub.c @@ -2334,7 +2334,6 @@ static int ub_probe_lun(struct ub_dev *sc, int lnum) disk->major = UB_MAJOR; disk->first_minor = lun->id * UB_PARTS_PER_LUN; disk->fops = &ub_bd_fops; - disk->events = DISK_EVENT_MEDIA_CHANGE; disk->private_data = lun; disk->driverfs_dev = &sc->intf->dev; diff --git a/drivers/block/xsysace.c b/drivers/block/xsysace.c index 645ff76..6c7fd7d 100644 --- a/drivers/block/xsysace.c +++ b/drivers/block/xsysace.c @@ -1005,7 +1005,6 @@ static int __devinit ace_setup(struct ace_device *ace) ace->gd->major = ace_major; ace->gd->first_minor = ace->id * ACE_NUM_MINORS; ace->gd->fops = &ace_fops; - ace->gd->events = DISK_EVENT_MEDIA_CHANGE; ace->gd->queue = ace->queue; ace->gd->private_data = ace; snprintf(ace->gd->disk_name, 32, "xs%c", ace->id + 'a'); diff --git a/drivers/cdrom/gdrom.c b/drivers/cdrom/gdrom.c index b2b034f..3ceaf00 100644 --- a/drivers/cdrom/gdrom.c +++ b/drivers/cdrom/gdrom.c @@ -803,7 +803,6 @@ static int __devinit probe_gdrom(struct platform_device *devptr) goto probe_fail_cdrom_register; } gd.disk->fops = &gdrom_bdops; - gd.disk->events = DISK_EVENT_MEDIA_CHANGE; /* latch on to the interrupt */ err = gdrom_set_interrupt_handlers(); if (err) diff --git a/drivers/cdrom/viocd.c b/drivers/cdrom/viocd.c index 4e874c5..e427fbe 100644 --- a/drivers/cdrom/viocd.c +++ b/drivers/cdrom/viocd.c @@ -626,7 +626,6 @@ static int viocd_probe(struct vio_dev *vdev, const struct vio_device_id *id) gendisk->queue = q; gendisk->fops = &viocd_fops; gendisk->flags = GENHD_FL_CD|GENHD_FL_REMOVABLE; - gendisk->events = DISK_EVENT_MEDIA_CHANGE; set_capacity(gendisk, 0); gendisk->private_data = d; d->viocd_disk = gendisk; diff --git a/drivers/message/i2o/i2o_block.c b/drivers/message/i2o/i2o_block.c index 643ad52..4796bbf 100644 --- a/drivers/message/i2o/i2o_block.c +++ b/drivers/message/i2o/i2o_block.c @@ -1000,7 +1000,6 @@ static struct i2o_block_device *i2o_block_device_alloc(void) gd->major = I2O_MAJOR; gd->queue = queue; gd->fops = &i2o_block_fops; - gd->events = DISK_EVENT_MEDIA_CHANGE; gd->private_data = dev; dev->gd = gd; diff --git a/drivers/s390/char/tape_block.c b/drivers/s390/char/tape_block.c index 83cea9a55..1b3924c 100644 --- a/drivers/s390/char/tape_block.c +++ b/drivers/s390/char/tape_block.c @@ -236,7 +236,6 @@ tapeblock_setup_device(struct tape_device * device) disk->major = tapeblock_major; disk->first_minor = device->first_minor; disk->fops = &tapeblock_fops; - disk->events = DISK_EVENT_MEDIA_CHANGE; disk->private_data = tape_get_device(device); disk->queue = blkdat->request_queue; set_capacity(disk, 0); -- cgit v1.1 From 505d9147a72d4e14323af9581dde066bd5fc439c Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Thu, 21 Apr 2011 15:37:20 -0700 Subject: sparc32: fix section mismatch warnings in apc, pmc and time_32 In all cases there were a struct of_device_id variable defined __initdata. But it was referenced from struct platform_driver.of_match_table which is not guaranteed to be used during init only. So drop the __initdata annotation. This fixes following warnings: WARNING: arch/sparc/kernel/built-in.o(.data+0x810): Section mismatch in reference from the variable clock_driver to the variable .init.data:clock_match The variable clock_driver references the variable __initdata clock_match If the reference is valid then annotate the variable with __init* or __refdata (see linux/init.h) or name the variable: *_template, *_timer, *_sht, *_ops, *_probe, *_probe_one, *_console WARNING: arch/sparc/kernel/built-in.o(.data+0xcec): Section mismatch in reference from the variable apc_driver to the variable .init.data:apc_match The variable apc_driver references the variable __initdata apc_match If the reference is valid then annotate the variable with __init* or __refdata (see linux/init.h) or name the variable: *_template, *_timer, *_sht, *_ops, *_probe, *_probe_one, *_console WARNING: arch/sparc/kernel/built-in.o(.data+0xd60): Section mismatch in reference from the variable pmc_driver to the variable .init.data:pmc_match The variable pmc_driver references the variable __initdata pmc_match If the reference is valid then annotate the variable with __init* or __refdata (see linux/init.h) or name the variable: *_template, *_timer, *_sht, *_ops, *_probe, *_probe_one, *_console Signed-off-by: Sam Ravnborg Signed-off-by: David S. Miller --- arch/sparc/kernel/apc.c | 2 +- arch/sparc/kernel/pmc.c | 2 +- arch/sparc/kernel/time_32.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/sparc/kernel/apc.c b/arch/sparc/kernel/apc.c index f679c57..1e34f29 100644 --- a/arch/sparc/kernel/apc.c +++ b/arch/sparc/kernel/apc.c @@ -165,7 +165,7 @@ static int __devinit apc_probe(struct platform_device *op) return 0; } -static struct of_device_id __initdata apc_match[] = { +static struct of_device_id apc_match[] = { { .name = APC_OBPNAME, }, diff --git a/arch/sparc/kernel/pmc.c b/arch/sparc/kernel/pmc.c index 93d7b44..6a585d3 100644 --- a/arch/sparc/kernel/pmc.c +++ b/arch/sparc/kernel/pmc.c @@ -69,7 +69,7 @@ static int __devinit pmc_probe(struct platform_device *op) return 0; } -static struct of_device_id __initdata pmc_match[] = { +static struct of_device_id pmc_match[] = { { .name = PMC_OBPNAME, }, diff --git a/arch/sparc/kernel/time_32.c b/arch/sparc/kernel/time_32.c index 4e23639..96046a4 100644 --- a/arch/sparc/kernel/time_32.c +++ b/arch/sparc/kernel/time_32.c @@ -168,7 +168,7 @@ static int __devinit clock_probe(struct platform_device *op) return 0; } -static struct of_device_id __initdata clock_match[] = { +static struct of_device_id clock_match[] = { { .name = "eeprom", }, -- cgit v1.1 From f486b3dc2d048e7309a733f97eb9f9f83d586df2 Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Thu, 21 Apr 2011 16:35:46 -0700 Subject: sparc32: fix sparcstation 5 boot The sparcstation 5 I have available has no MID property for the CPU. This resulted in a panic when booting a SMP kernel on this box. The assigned field in cpu_data is never used, so if we fail to read the MID property then inform user and continue booting. Signed-off-by: Sam Ravnborg Signed-off-by: David S. Miller --- arch/sparc/kernel/smp_32.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/arch/sparc/kernel/smp_32.c b/arch/sparc/kernel/smp_32.c index 91c10fb..850a136 100644 --- a/arch/sparc/kernel/smp_32.c +++ b/arch/sparc/kernel/smp_32.c @@ -53,6 +53,7 @@ cpumask_t smp_commenced_mask = CPU_MASK_NONE; void __cpuinit smp_store_cpu_info(int id) { int cpu_node; + int mid; cpu_data(id).udelay_val = loops_per_jiffy; @@ -60,10 +61,13 @@ void __cpuinit smp_store_cpu_info(int id) cpu_data(id).clock_tick = prom_getintdefault(cpu_node, "clock-frequency", 0); cpu_data(id).prom_node = cpu_node; - cpu_data(id).mid = cpu_get_hwmid(cpu_node); + mid = cpu_get_hwmid(cpu_node); - if (cpu_data(id).mid < 0) - panic("No MID found for CPU%d at node 0x%08d", id, cpu_node); + if (mid < 0) { + printk(KERN_NOTICE "No MID found for CPU%d at node 0x%08d", id, cpu_node); + mid = 0; + } + cpu_data(id).mid = mid; } void __init smp_cpus_done(unsigned int max_cpus) -- cgit v1.1 From e2a85aecebc03d165bc2dcd233deadd5dd97ea9f Mon Sep 17 00:00:00 2001 From: Andrea Galbusera Date: Thu, 21 Apr 2011 02:21:21 +0000 Subject: powerpc: Fix multicast problem in fs_enet driver mac-fec.c was setting individual UDP address registers instead of multicast group address registers when joining a multicast group. This prevented from correctly receiving UDP multicast packets. According to datasheet, replaced hash_table_high and hash_table_low with grp_hash_table_high and grp_hash_table_low respectively. Also renamed hash_table_* with grp_hash_table_* in struct fec declaration for 8xx: these registers are used only for multicast there. Tested on a MPC5121 based board. Build tested also against mpc866_ads_defconfig. Signed-off-by: Andrea Galbusera Signed-off-by: David S. Miller --- arch/powerpc/include/asm/8xx_immap.h | 4 ++-- drivers/net/fs_enet/mac-fec.c | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/include/asm/8xx_immap.h b/arch/powerpc/include/asm/8xx_immap.h index 6b6dc20..bdf0563 100644 --- a/arch/powerpc/include/asm/8xx_immap.h +++ b/arch/powerpc/include/asm/8xx_immap.h @@ -393,8 +393,8 @@ typedef struct fec { uint fec_addr_low; /* lower 32 bits of station address */ ushort fec_addr_high; /* upper 16 bits of station address */ ushort res1; /* reserved */ - uint fec_hash_table_high; /* upper 32-bits of hash table */ - uint fec_hash_table_low; /* lower 32-bits of hash table */ + uint fec_grp_hash_table_high; /* upper 32-bits of hash table */ + uint fec_grp_hash_table_low; /* lower 32-bits of hash table */ uint fec_r_des_start; /* beginning of Rx descriptor ring */ uint fec_x_des_start; /* beginning of Tx descriptor ring */ uint fec_r_buff_size; /* Rx buffer size */ diff --git a/drivers/net/fs_enet/mac-fec.c b/drivers/net/fs_enet/mac-fec.c index 61035fc..b9fbc83 100644 --- a/drivers/net/fs_enet/mac-fec.c +++ b/drivers/net/fs_enet/mac-fec.c @@ -226,8 +226,8 @@ static void set_multicast_finish(struct net_device *dev) } FC(fecp, r_cntrl, FEC_RCNTRL_PROM); - FW(fecp, hash_table_high, fep->fec.hthi); - FW(fecp, hash_table_low, fep->fec.htlo); + FW(fecp, grp_hash_table_high, fep->fec.hthi); + FW(fecp, grp_hash_table_low, fep->fec.htlo); } static void set_multicast_list(struct net_device *dev) @@ -273,8 +273,8 @@ static void restart(struct net_device *dev) /* * Reset all multicast. */ - FW(fecp, hash_table_high, fep->fec.hthi); - FW(fecp, hash_table_low, fep->fec.htlo); + FW(fecp, grp_hash_table_high, fep->fec.hthi); + FW(fecp, grp_hash_table_low, fep->fec.htlo); /* * Set maximum receive buffer size. -- cgit v1.1 From e965c05dabdabb85af0187952ccd75e43995c4b3 Mon Sep 17 00:00:00 2001 From: Thomas Egerer Date: Wed, 20 Apr 2011 22:56:02 +0000 Subject: ipv6: Remove hoplimit initialization to -1 The changes introduced with git-commit a02e4b7d ("ipv6: Demark default hoplimit as zero.") missed to remove the hoplimit initialization. As a result, ipv6_get_mtu interprets the return value of dst_metric_raw (-1) as 255 and answers ping6 with this hoplimit. This patche removes the line such that ping6 is answered with the hoplimit value configured via sysctl. Signed-off-by: Thomas Egerer Signed-off-by: David S. Miller --- net/ipv6/route.c | 1 - 1 file changed, 1 deletion(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 843406f..0a5d02a 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2012,7 +2012,6 @@ struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev, rt->dst.output = ip6_output; rt->rt6i_dev = net->loopback_dev; rt->rt6i_idev = idev; - dst_metric_set(&rt->dst, RTAX_HOPLIMIT, -1); rt->dst.obsolete = -1; rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP; -- cgit v1.1 From e74fbd030223e29d269f4be17e3dce6de38f4c28 Mon Sep 17 00:00:00 2001 From: Ivan Vecera Date: Thu, 21 Apr 2011 00:20:04 +0000 Subject: be2net: increment work_counter in be_worker The commit 609ff3b ("be2net: add code to display temperature of ASIC") adds support to display temperature of ASIC but there is missing increment of work_counter in be_worker. Because of this 1) the function be_cmd_get_die_temperature is called every 1 second instead of every 32 seconds 2) be_cmd_get_die_temperature is called, although it is not supported. This patch fixes this bug. Signed-off-by: Ivan Vecera Signed-off-by: David S. Miller --- drivers/net/benet/be_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/benet/be_main.c b/drivers/net/benet/be_main.c index 7cb5a11..02a0443 100644 --- a/drivers/net/benet/be_main.c +++ b/drivers/net/benet/be_main.c @@ -1873,6 +1873,7 @@ static void be_worker(struct work_struct *work) be_detect_dump_ue(adapter); reschedule: + adapter->work_counter++; schedule_delayed_work(&adapter->work, msecs_to_jiffies(1000)); } -- cgit v1.1 From cb771838715b1c470bc5735bdae709b33b18e0ad Mon Sep 17 00:00:00 2001 From: Tim Gardner Date: Wed, 20 Apr 2011 09:00:49 +0000 Subject: atl1c: Fix work event interrupt/task races The mechanism used to initiate work events from the interrupt handler has a classic read/modify/write race between the interrupt handler that sets the condition, and the worker task that reads and clears the condition. Close these races by using atomic bit fields. Cc: stable@kernel.org Cc: Jie Yang Signed-off-by: Tim Gardner Signed-off-by: David S. Miller --- drivers/net/atl1c/atl1c.h | 6 +++--- drivers/net/atl1c/atl1c_main.c | 14 +++++--------- 2 files changed, 8 insertions(+), 12 deletions(-) diff --git a/drivers/net/atl1c/atl1c.h b/drivers/net/atl1c/atl1c.h index 7cb375e..925929d 100644 --- a/drivers/net/atl1c/atl1c.h +++ b/drivers/net/atl1c/atl1c.h @@ -566,9 +566,9 @@ struct atl1c_adapter { #define __AT_TESTING 0x0001 #define __AT_RESETTING 0x0002 #define __AT_DOWN 0x0003 - u8 work_event; -#define ATL1C_WORK_EVENT_RESET 0x01 -#define ATL1C_WORK_EVENT_LINK_CHANGE 0x02 + unsigned long work_event; +#define ATL1C_WORK_EVENT_RESET 0 +#define ATL1C_WORK_EVENT_LINK_CHANGE 1 u32 msg_enable; bool have_msi; diff --git a/drivers/net/atl1c/atl1c_main.c b/drivers/net/atl1c/atl1c_main.c index 7d9d506..a6e1c36 100644 --- a/drivers/net/atl1c/atl1c_main.c +++ b/drivers/net/atl1c/atl1c_main.c @@ -325,7 +325,7 @@ static void atl1c_link_chg_event(struct atl1c_adapter *adapter) } } - adapter->work_event |= ATL1C_WORK_EVENT_LINK_CHANGE; + set_bit(ATL1C_WORK_EVENT_LINK_CHANGE, &adapter->work_event); schedule_work(&adapter->common_task); } @@ -337,20 +337,16 @@ static void atl1c_common_task(struct work_struct *work) adapter = container_of(work, struct atl1c_adapter, common_task); netdev = adapter->netdev; - if (adapter->work_event & ATL1C_WORK_EVENT_RESET) { - adapter->work_event &= ~ATL1C_WORK_EVENT_RESET; + if (test_and_clear_bit(ATL1C_WORK_EVENT_RESET, &adapter->work_event)) { netif_device_detach(netdev); atl1c_down(adapter); atl1c_up(adapter); netif_device_attach(netdev); - return; } - if (adapter->work_event & ATL1C_WORK_EVENT_LINK_CHANGE) { - adapter->work_event &= ~ATL1C_WORK_EVENT_LINK_CHANGE; + if (test_and_clear_bit(ATL1C_WORK_EVENT_LINK_CHANGE, + &adapter->work_event)) atl1c_check_link_status(adapter); - } - return; } @@ -369,7 +365,7 @@ static void atl1c_tx_timeout(struct net_device *netdev) struct atl1c_adapter *adapter = netdev_priv(netdev); /* Do the reset outside of interrupt context */ - adapter->work_event |= ATL1C_WORK_EVENT_RESET; + set_bit(ATL1C_WORK_EVENT_RESET, &adapter->work_event); schedule_work(&adapter->common_task); } -- cgit v1.1 From f01cb5fbea1c1613621f9f32f385e12c1a29dde0 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Thu, 21 Apr 2011 21:17:25 -0700 Subject: Revert "bridge: Forward reserved group addresses if !STP" This reverts commit 1e253c3b8a1aeed51eef6fc366812f219b97de65. It breaks 802.3ad bonding inside of a bridge. The commit was meant to support transport bridging, and specifically virtual machines bridged to an ethernet interface connected to a switch port wiht 802.1x enabled. But this isn't the way to do it, it breaks too many other things. Signed-off-by: David S. Miller --- net/bridge/br_input.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index e216079..0c7bada 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -164,7 +164,7 @@ rx_handler_result_t br_handle_frame(struct sk_buff **pskb) goto drop; /* If STP is turned off, then forward */ - if (p->br->stp_enabled == BR_NO_STP) + if (p->br->stp_enabled == BR_NO_STP && dest[5] == 0) goto forward; if (NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN, skb, skb->dev, -- cgit v1.1 From 103b3934817a7c42fba6e1ef76ecb390a2837d40 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Thu, 21 Apr 2011 11:03:20 -0400 Subject: perf, x86: P4 PMU -- Use perf_sample_data_init helper Instead of opencoded assignments better to use perf_sample_data_init helper. Tested-by: Lin Ming Signed-off-by: Cyrill Gorcunov Signed-off-by: Don Zickus Cc: Cyrill Gorcunov Link: http://lkml.kernel.org/r/1303398203-2918-2-git-send-email-dzickus@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_p4.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c index 8ff882f..ae31e96 100644 --- a/arch/x86/kernel/cpu/perf_event_p4.c +++ b/arch/x86/kernel/cpu/perf_event_p4.c @@ -912,8 +912,7 @@ static int p4_pmu_handle_irq(struct pt_regs *regs) int idx, handled = 0; u64 val; - data.addr = 0; - data.raw = NULL; + perf_sample_data_init(&data, 0); cpuc = &__get_cpu_var(cpu_hw_events); -- cgit v1.1 From 3003eba313dd0e0502dd71548c36fe7c19801ce5 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 20 Apr 2011 21:41:54 -0400 Subject: lockdep: Print a nicer description for irq lock inversions Locking order inversion due to interrupts is a subtle problem. When an irq lockiinversion discovered by lockdep it currently reports something like: [ INFO: HARDIRQ-safe -> HARDIRQ-unsafe lock order detected ] ... and then prints out the locks that are involved, as back traces. Judging by lkml feedback developers were routinely confused by what a HARDIRQ->safe to unsafe issue is all about, and sometimes even blew it off as a bug in lockdep. It is not obvious when lockdep prints this message about a lock that is never taken in interrupt context. After explaining the problems that lockdep is reporting, I decided to add a description of the problem in visual form. Now the following is shown: --- other info that might help us debug this: Possible interrupt unsafe locking scenario: CPU0 CPU1 ---- ---- lock(lockA); local_irq_disable(); lock(&rq->lock); lock(lockA); lock(&rq->lock); *** DEADLOCK *** --- The above is the case when the unsafe lock is taken while holding a lock taken in irq context. But when a lock is taken that also grabs a unsafe lock, the call chain is shown: --- other info that might help us debug this: Chain exists of: &rq->lock --> lockA --> lockC Possible interrupt unsafe locking scenario: CPU0 CPU1 ---- ---- lock(lockC); local_irq_disable(); lock(&rq->lock); lock(lockA); lock(&rq->lock); *** DEADLOCK *** Signed-off-by: Steven Rostedt Acked-by: Peter Zijlstra Cc: Frederic Weisbecker Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20110421014259.132728798@goodmis.org Signed-off-by: Ingo Molnar --- kernel/lockdep.c | 70 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 53a6895..7b2ffee 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -490,6 +490,18 @@ void get_usage_chars(struct lock_class *class, char usage[LOCK_USAGE_CHARS]) usage[i] = '\0'; } +static int __print_lock_name(struct lock_class *class) +{ + char str[KSYM_NAME_LEN]; + const char *name; + + name = class->name; + if (!name) + name = __get_key_name(class->key, str); + + return printk("%s", name); +} + static void print_lock_name(struct lock_class *class) { char str[KSYM_NAME_LEN], usage[LOCK_USAGE_CHARS]; @@ -1325,6 +1337,62 @@ print_shortest_lock_dependencies(struct lock_list *leaf, return; } +static void +print_irq_lock_scenario(struct lock_list *safe_entry, + struct lock_list *unsafe_entry, + struct held_lock *prev, + struct held_lock *next) +{ + struct lock_class *safe_class = safe_entry->class; + struct lock_class *unsafe_class = unsafe_entry->class; + struct lock_class *middle_class = hlock_class(prev); + + if (middle_class == safe_class) + middle_class = hlock_class(next); + + /* + * A direct locking problem where unsafe_class lock is taken + * directly by safe_class lock, then all we need to show + * is the deadlock scenario, as it is obvious that the + * unsafe lock is taken under the safe lock. + * + * But if there is a chain instead, where the safe lock takes + * an intermediate lock (middle_class) where this lock is + * not the same as the safe lock, then the lock chain is + * used to describe the problem. Otherwise we would need + * to show a different CPU case for each link in the chain + * from the safe_class lock to the unsafe_class lock. + */ + if (middle_class != unsafe_class) { + printk("Chain exists of:\n "); + __print_lock_name(safe_class); + printk(" --> "); + __print_lock_name(middle_class); + printk(" --> "); + __print_lock_name(unsafe_class); + printk("\n\n"); + } + + printk(" Possible interrupt unsafe locking scenario:\n\n"); + printk(" CPU0 CPU1\n"); + printk(" ---- ----\n"); + printk(" lock("); + __print_lock_name(unsafe_class); + printk(");\n"); + printk(" local_irq_disable();\n"); + printk(" lock("); + __print_lock_name(safe_class); + printk(");\n"); + printk(" lock("); + __print_lock_name(middle_class); + printk(");\n"); + printk(" \n"); + printk(" lock("); + __print_lock_name(safe_class); + printk(");\n"); + printk("\n *** DEADLOCK ***\n\n"); +} + static int print_bad_irq_dependency(struct task_struct *curr, struct lock_list *prev_root, @@ -1376,6 +1444,8 @@ print_bad_irq_dependency(struct task_struct *curr, print_stack_trace(forwards_entry->class->usage_traces + bit2, 1); printk("\nother info that might help us debug this:\n\n"); + print_irq_lock_scenario(backwards_entry, forwards_entry, prev, next); + lockdep_print_held_locks(curr); printk("\nthe dependencies between %s-irq-safe lock", irqclass); -- cgit v1.1 From f4185812aa046ecb97e8817e10148cacdd7a6baa Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 20 Apr 2011 21:41:55 -0400 Subject: lockdep: Print a nicer description for normal deadlocks The lockdep output can be pretty cryptic, having nicer output can save a lot of head scratching. When a normal deadlock scenario is detected by lockdep (lock A -> lock B and there exists a place where lock B -> lock A) we now get the following new output: other info that might help us debug this: Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(lockB); lock(lockA); lock(lockB); lock(lockA); *** DEADLOCK *** On cases where there's a deeper chair, it shows the partial chain that can cause the issue: Chain exists of: lockC --> lockA --> lockB Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(lockB); lock(lockA); lock(lockB); lock(lockC); *** DEADLOCK *** Signed-off-by: Steven Rostedt Acked-by: Peter Zijlstra Cc: Frederic Weisbecker Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20110421014259.380621789@goodmis.org Signed-off-by: Ingo Molnar --- kernel/lockdep.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 7b2ffee..73cebd7 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -1065,6 +1065,56 @@ print_circular_bug_entry(struct lock_list *target, int depth) return 0; } +static void +print_circular_lock_scenario(struct held_lock *src, + struct held_lock *tgt, + struct lock_list *prt) +{ + struct lock_class *source = hlock_class(src); + struct lock_class *target = hlock_class(tgt); + struct lock_class *parent = prt->class; + + /* + * A direct locking problem where unsafe_class lock is taken + * directly by safe_class lock, then all we need to show + * is the deadlock scenario, as it is obvious that the + * unsafe lock is taken under the safe lock. + * + * But if there is a chain instead, where the safe lock takes + * an intermediate lock (middle_class) where this lock is + * not the same as the safe lock, then the lock chain is + * used to describe the problem. Otherwise we would need + * to show a different CPU case for each link in the chain + * from the safe_class lock to the unsafe_class lock. + */ + if (parent != source) { + printk("Chain exists of:\n "); + __print_lock_name(source); + printk(" --> "); + __print_lock_name(parent); + printk(" --> "); + __print_lock_name(target); + printk("\n\n"); + } + + printk(" Possible unsafe locking scenario:\n\n"); + printk(" CPU0 CPU1\n"); + printk(" ---- ----\n"); + printk(" lock("); + __print_lock_name(target); + printk(");\n"); + printk(" lock("); + __print_lock_name(parent); + printk(");\n"); + printk(" lock("); + __print_lock_name(target); + printk(");\n"); + printk(" lock("); + __print_lock_name(source); + printk(");\n"); + printk("\n *** DEADLOCK ***\n\n"); +} + /* * When a circular dependency is detected, print the * header first: @@ -1108,6 +1158,7 @@ static noinline int print_circular_bug(struct lock_list *this, { struct task_struct *curr = current; struct lock_list *parent; + struct lock_list *first_parent; int depth; if (!debug_locks_off_graph_unlock() || debug_locks_silent) @@ -1121,6 +1172,7 @@ static noinline int print_circular_bug(struct lock_list *this, print_circular_bug_header(target, depth, check_src, check_tgt); parent = get_lock_parent(target); + first_parent = parent; while (parent) { print_circular_bug_entry(parent, --depth); @@ -1128,6 +1180,9 @@ static noinline int print_circular_bug(struct lock_list *this, } printk("\nother info that might help us debug this:\n\n"); + print_circular_lock_scenario(check_src, check_tgt, + first_parent); + lockdep_print_held_locks(curr); printk("\nstack backtrace:\n"); -- cgit v1.1 From 48702ecf308e53f176c1f6fdc193d622ded54ac0 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 20 Apr 2011 21:41:56 -0400 Subject: lockdep: Print a nicer description for simple deadlocks Lockdep output can be pretty cryptic, having nicer output can save a lot of head scratching. When a simple deadlock scenario is detected by lockdep (lock A -> lock A) we now get the following new output: other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(&(lock)->rlock); lock(&(lock)->rlock); *** DEADLOCK *** Signed-off-by: Steven Rostedt Acked-by: Peter Zijlstra Cc: Frederic Weisbecker Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20110421014259.643930104@goodmis.org Signed-off-by: Ingo Molnar --- kernel/lockdep.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 73cebd7..c4cc5d1 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -1664,6 +1664,26 @@ static inline void inc_chains(void) #endif +static void +print_deadlock_scenario(struct held_lock *nxt, + struct held_lock *prv) +{ + struct lock_class *next = hlock_class(nxt); + struct lock_class *prev = hlock_class(prv); + + printk(" Possible unsafe locking scenario:\n\n"); + printk(" CPU0\n"); + printk(" ----\n"); + printk(" lock("); + __print_lock_name(prev); + printk(");\n"); + printk(" lock("); + __print_lock_name(next); + printk(");\n"); + printk("\n *** DEADLOCK ***\n\n"); + printk(" May be due to missing lock nesting notation\n\n"); +} + static int print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, struct held_lock *next) @@ -1682,6 +1702,7 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, print_lock(prev); printk("\nother info that might help us debug this:\n"); + print_deadlock_scenario(next, prev); lockdep_print_held_locks(curr); printk("\nstack backtrace:\n"); -- cgit v1.1 From dad3d7435e1d8c254d6877dc06852dc00c5da812 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 20 Apr 2011 21:41:57 -0400 Subject: lockdep: Print a nicer description for irq inversion bugs Irq inversion and irq dependency bugs are only subtly different. The diffenerence lies where the interrupt occurred. For irq dependency: irq_disable lock(A) lock(B) unlock(B) unlock(A) irq_enable lock(B) unlock(B) lock(A) The interrupt comes in after it has been established that lock A can be held when taking an irq unsafe lock. Lockdep detects the problem when taking lock A in interrupt context. With the irq_inversion the irq happens before it is established and lockdep detects the problem with the taking of lock B: lock(A) irq_disable lock(A) lock(B) unlock(B) unlock(A) irq_enable lock(B) unlock(B) Since the problem with the locking logic for both of these issues is in actuality the same, they both should report the same scenario. This patch implements that and prints this: other info that might help us debug this: Chain exists of: &rq->lock --> lockA --> lockC Possible interrupt unsafe locking scenario: CPU0 CPU1 ---- ---- lock(lockC); local_irq_disable(); lock(&rq->lock); lock(lockA); lock(&rq->lock); *** DEADLOCK *** Signed-off-by: Steven Rostedt Acked-by: Peter Zijlstra Cc: Frederic Weisbecker Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20110421014259.910720381@goodmis.org Signed-off-by: Ingo Molnar --- kernel/lockdep.c | 34 +++++++++++++++++++++++++++++----- 1 file changed, 29 insertions(+), 5 deletions(-) diff --git a/kernel/lockdep.c b/kernel/lockdep.c index c4cc5d1..0b497dd 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -1395,15 +1395,15 @@ print_shortest_lock_dependencies(struct lock_list *leaf, static void print_irq_lock_scenario(struct lock_list *safe_entry, struct lock_list *unsafe_entry, - struct held_lock *prev, - struct held_lock *next) + struct lock_class *prev_class, + struct lock_class *next_class) { struct lock_class *safe_class = safe_entry->class; struct lock_class *unsafe_class = unsafe_entry->class; - struct lock_class *middle_class = hlock_class(prev); + struct lock_class *middle_class = prev_class; if (middle_class == safe_class) - middle_class = hlock_class(next); + middle_class = next_class; /* * A direct locking problem where unsafe_class lock is taken @@ -1499,7 +1499,8 @@ print_bad_irq_dependency(struct task_struct *curr, print_stack_trace(forwards_entry->class->usage_traces + bit2, 1); printk("\nother info that might help us debug this:\n\n"); - print_irq_lock_scenario(backwards_entry, forwards_entry, prev, next); + print_irq_lock_scenario(backwards_entry, forwards_entry, + hlock_class(prev), hlock_class(next)); lockdep_print_held_locks(curr); @@ -2219,6 +2220,10 @@ print_irq_inversion_bug(struct task_struct *curr, struct held_lock *this, int forwards, const char *irqclass) { + struct lock_list *entry = other; + struct lock_list *middle = NULL; + int depth; + if (!debug_locks_off_graph_unlock() || debug_locks_silent) return 0; @@ -2237,6 +2242,25 @@ print_irq_inversion_bug(struct task_struct *curr, printk("\n\nand interrupts could create inverse lock ordering between them.\n\n"); printk("\nother info that might help us debug this:\n"); + + /* Find a middle lock (if one exists) */ + depth = get_lock_depth(other); + do { + if (depth == 0 && (entry != root)) { + printk("lockdep:%s bad path found in chain graph\n", __func__); + break; + } + middle = entry; + entry = get_lock_parent(entry); + depth--; + } while (entry && entry != root && (depth >= 0)); + if (forwards) + print_irq_lock_scenario(root, other, + middle ? middle->class : root->class, other->class); + else + print_irq_lock_scenario(other, root, + middle ? middle->class : other->class, root->class); + lockdep_print_held_locks(curr); printk("\nthe shortest dependencies between 2nd lock and 1st lock:\n"); -- cgit v1.1 From 6be8c3935b914dfbc24b27c91c2b6d583645e61a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 20 Apr 2011 21:41:58 -0400 Subject: lockdep: Replace "Bad BFS generated tree" message with something less cryptic The message of "Bad BFS generated tree" is a bit confusing. Replace it with a more sane error message. Thanks to Peter Zijlstra for helping me come up with a better message. Signed-off-by: Steven Rostedt Acked-by: Peter Zijlstra Cc: Frederic Weisbecker Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20110421014300.135521252@goodmis.org Signed-off-by: Ingo Molnar --- kernel/lockdep.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 0b497dd..270cfa4 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -1381,7 +1381,7 @@ print_shortest_lock_dependencies(struct lock_list *leaf, printk("\n"); if (depth == 0 && (entry != root)) { - printk("lockdep:%s bad BFS generated tree\n", __func__); + printk("lockdep:%s bad path found in chain graph\n", __func__); break; } -- cgit v1.1 From 282b5c2f6f663c008444321fd8fcdd374596046b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 20 Apr 2011 21:41:59 -0400 Subject: lockdep: Print a nicer description for simple irq lock inversions Lockdep output can be pretty cryptic, having nicer output can save a lot of head scratching. When a simple irq inversion scenario is detected by lockdep (lock A taken in interrupt context but also in thread context without disabling interrupts) we now get the following (hopefully more informative) output: other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(lockA); lock(lockA); *** DEADLOCK *** Signed-off-by: Steven Rostedt Acked-by: Peter Zijlstra Cc: Frederic Weisbecker Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20110421014300.436140880@goodmis.org Signed-off-by: Ingo Molnar --- kernel/lockdep.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 270cfa4..27c609f 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -2158,6 +2158,24 @@ static void check_chain_key(struct task_struct *curr) #endif } +static void +print_usage_bug_scenario(struct held_lock *lock) +{ + struct lock_class *class = hlock_class(lock); + + printk(" Possible unsafe locking scenario:\n\n"); + printk(" CPU0\n"); + printk(" ----\n"); + printk(" lock("); + __print_lock_name(class); + printk(");\n"); + printk(" \n"); + printk(" lock("); + __print_lock_name(class); + printk(");\n"); + printk("\n *** DEADLOCK ***\n\n"); +} + static int print_usage_bug(struct task_struct *curr, struct held_lock *this, enum lock_usage_bit prev_bit, enum lock_usage_bit new_bit) @@ -2186,6 +2204,8 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this, print_irqtrace_events(curr); printk("\nother info that might help us debug this:\n"); + print_usage_bug_scenario(this); + lockdep_print_held_locks(curr); printk("\nstack backtrace:\n"); -- cgit v1.1 From e0944ee63f7249802be74454cef81c97630ae1cd Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 20 Apr 2011 21:42:00 -0400 Subject: lockdep: Remove cmpxchg to update nr_chain_hlocks For some reason nr_chain_hlocks is updated with cmpxchg, but this is performed inside of the lockdep global "grab_lock()", which also makes simple modification of this variable atomic. Remove the cmpxchg logic for updating nr_chain_hlocks and simplify the code. Signed-off-by: Steven Rostedt Acked-by: Peter Zijlstra Cc: Frederic Weisbecker Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20110421014300.727863282@goodmis.org Signed-off-by: Ingo Molnar --- kernel/lockdep.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 27c609f..63437d0 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -1973,7 +1973,7 @@ static inline int lookup_chain_cache(struct task_struct *curr, struct list_head *hash_head = chainhashentry(chain_key); struct lock_chain *chain; struct held_lock *hlock_curr, *hlock_next; - int i, j, n, cn; + int i, j; if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return 0; @@ -2033,15 +2033,9 @@ cache_hit: } i++; chain->depth = curr->lockdep_depth + 1 - i; - cn = nr_chain_hlocks; - while (cn + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS) { - n = cmpxchg(&nr_chain_hlocks, cn, cn + chain->depth); - if (n == cn) - break; - cn = n; - } - if (likely(cn + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) { - chain->base = cn; + if (likely(nr_chain_hlocks + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) { + chain->base = nr_chain_hlocks; + nr_chain_hlocks += chain->depth; for (j = 0; j < chain->depth - 1; j++, i++) { int lock_id = curr->held_locks[i].class_idx - 1; chain_hlocks[chain->base + j] = lock_id; -- cgit v1.1 From 13f172ff26563995049abe73f6eeba828de3c09d Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Fri, 22 Apr 2011 08:10:59 +0000 Subject: netconsole: fix deadlock when removing net driver that netconsole is using (v2) A deadlock was reported to me recently that occured when netconsole was being used in a virtual guest. If the virtio_net driver was removed while netconsole was setup to use an interface that was driven by that driver, the guest deadlocked. No backtrace was provided because netconsole was the only console configured, but it became clear pretty quickly what the problem was. In netconsole_netdev_event, if we get an unregister event, we call __netpoll_cleanup with the target_list_lock held and irqs disabled. __netpoll_cleanup can, if pending netpoll packets are waiting call cancel_delayed_work_sync, which is a sleeping path. the might_sleep call in that path gets triggered, causing a console warning to be issued. The netconsole write handler of course tries to take the target_list_lock again, which we already hold, causing deadlock. The fix is pretty striaghtforward. Simply drop the target_list_lock and re-enable irqs prior to calling __netpoll_cleanup, the re-acquire the lock, and restart the loop. Confirmed by myself to fix the problem reported. Signed-off-by: Neil Horman CC: "David S. Miller" Signed-off-by: David S. Miller --- drivers/net/netconsole.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/net/netconsole.c b/drivers/net/netconsole.c index dfb67eb..eb41e44 100644 --- a/drivers/net/netconsole.c +++ b/drivers/net/netconsole.c @@ -671,6 +671,7 @@ static int netconsole_netdev_event(struct notifier_block *this, goto done; spin_lock_irqsave(&target_list_lock, flags); +restart: list_for_each_entry(nt, &target_list, list) { netconsole_target_get(nt); if (nt->np.dev == dev) { @@ -683,9 +684,16 @@ static int netconsole_netdev_event(struct notifier_block *this, * rtnl_lock already held */ if (nt->np.dev) { + spin_unlock_irqrestore( + &target_list_lock, + flags); __netpoll_cleanup(&nt->np); + spin_lock_irqsave(&target_list_lock, + flags); dev_put(nt->np.dev); nt->np.dev = NULL; + netconsole_target_put(nt); + goto restart; } /* Fall through */ case NETDEV_GOING_DOWN: -- cgit v1.1 From 764b0c4b3256ad4431cb52eaf99c0abe6df0a085 Mon Sep 17 00:00:00 2001 From: Pavan Savoy Date: Fri, 8 Apr 2011 04:57:42 -0500 Subject: drivers:misc:ti-st: handle delayed tty receive When certain technologies shutdown their interface without waiting for the acknowledgement from the chip. The receive_buf from the TTY would be invoked a while after the relevant technology is unregistered. This patch introduces a new flag "is_registered" which maintains the state of protocols BT, FM or GPS and thereby removes the need to clear the protocol data from ST when protocols gets unregistered. This fixes corner cases when HCI RESET is sent down from bluetooth stack and the receive_buf is called from tty after 250ms before which bluetooth would have unregistered from the system. OR - when FM application decides to close down the device without sending a power-off FM command resulting in some RDS data or interrupt data coming in after the driver is unregistered. Signed-off-by: Pavan Savoy Signed-off-by: Greg Kroah-Hartman --- drivers/misc/ti-st/st_core.c | 23 +++++++++++++---------- include/linux/ti_wilink_st.h | 3 ++- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/drivers/misc/ti-st/st_core.c b/drivers/misc/ti-st/st_core.c index 486117f..f91f82e 100644 --- a/drivers/misc/ti-st/st_core.c +++ b/drivers/misc/ti-st/st_core.c @@ -43,13 +43,15 @@ static void add_channel_to_table(struct st_data_s *st_gdata, pr_info("%s: id %d\n", __func__, new_proto->chnl_id); /* list now has the channel id as index itself */ st_gdata->list[new_proto->chnl_id] = new_proto; + st_gdata->is_registered[new_proto->chnl_id] = true; } static void remove_channel_from_table(struct st_data_s *st_gdata, struct st_proto_s *proto) { pr_info("%s: id %d\n", __func__, proto->chnl_id); - st_gdata->list[proto->chnl_id] = NULL; +/* st_gdata->list[proto->chnl_id] = NULL; */ + st_gdata->is_registered[proto->chnl_id] = false; } /* @@ -104,7 +106,7 @@ void st_send_frame(unsigned char chnl_id, struct st_data_s *st_gdata) if (unlikely (st_gdata == NULL || st_gdata->rx_skb == NULL - || st_gdata->list[chnl_id] == NULL)) { + || st_gdata->is_registered[chnl_id] == false)) { pr_err("chnl_id %d not registered, no data to send?", chnl_id); kfree_skb(st_gdata->rx_skb); @@ -141,14 +143,15 @@ void st_reg_complete(struct st_data_s *st_gdata, char err) unsigned char i = 0; pr_info(" %s ", __func__); for (i = 0; i < ST_MAX_CHANNELS; i++) { - if (likely(st_gdata != NULL && st_gdata->list[i] != NULL && - st_gdata->list[i]->reg_complete_cb != NULL)) { + if (likely(st_gdata != NULL && + st_gdata->is_registered[i] == true && + st_gdata->list[i]->reg_complete_cb != NULL)) { st_gdata->list[i]->reg_complete_cb (st_gdata->list[i]->priv_data, err); pr_info("protocol %d's cb sent %d\n", i, err); if (err) { /* cleanup registered protocol */ st_gdata->protos_registered--; - st_gdata->list[i] = NULL; + st_gdata->is_registered[i] = false; } } } @@ -475,9 +478,9 @@ void kim_st_list_protocols(struct st_data_s *st_gdata, void *buf) { seq_printf(buf, "[%d]\nBT=%c\nFM=%c\nGPS=%c\n", st_gdata->protos_registered, - st_gdata->list[0x04] != NULL ? 'R' : 'U', - st_gdata->list[0x08] != NULL ? 'R' : 'U', - st_gdata->list[0x09] != NULL ? 'R' : 'U'); + st_gdata->is_registered[0x04] == true ? 'R' : 'U', + st_gdata->is_registered[0x08] == true ? 'R' : 'U', + st_gdata->is_registered[0x09] == true ? 'R' : 'U'); } /********************************************************************/ @@ -504,7 +507,7 @@ long st_register(struct st_proto_s *new_proto) return -EPROTONOSUPPORT; } - if (st_gdata->list[new_proto->chnl_id] != NULL) { + if (st_gdata->is_registered[new_proto->chnl_id] == true) { pr_err("chnl_id %d already registered", new_proto->chnl_id); return -EALREADY; } @@ -563,7 +566,7 @@ long st_register(struct st_proto_s *new_proto) /* check for already registered once more, * since the above check is old */ - if (st_gdata->list[new_proto->chnl_id] != NULL) { + if (st_gdata->is_registered[new_proto->chnl_id] == true) { pr_err(" proto %d already registered ", new_proto->chnl_id); return -EALREADY; diff --git a/include/linux/ti_wilink_st.h b/include/linux/ti_wilink_st.h index 7071ec5..b004e55 100644 --- a/include/linux/ti_wilink_st.h +++ b/include/linux/ti_wilink_st.h @@ -140,12 +140,12 @@ extern long st_unregister(struct st_proto_s *); */ struct st_data_s { unsigned long st_state; - struct tty_struct *tty; struct sk_buff *tx_skb; #define ST_TX_SENDING 1 #define ST_TX_WAKEUP 2 unsigned long tx_state; struct st_proto_s *list[ST_MAX_CHANNELS]; + bool is_registered[ST_MAX_CHANNELS]; unsigned long rx_state; unsigned long rx_count; struct sk_buff *rx_skb; @@ -155,6 +155,7 @@ struct st_data_s { unsigned char protos_registered; unsigned long ll_state; void *kim_data; + struct tty_struct *tty; }; /* -- cgit v1.1 From fc2711992b8601c20b7cc078f533e55c3106fbd4 Mon Sep 17 00:00:00 2001 From: Pavan Savoy Date: Fri, 8 Apr 2011 04:57:43 -0500 Subject: drivers:misc:ti-st: remove rfkill dependency rfkill is no longer used by Texas Instruments shared transport driver to communicate with user-space. This patch removes the dependency of rfkill to be enabled to build shared transport driver in the Kconfig. Signed-off-by: Pavan Savoy Signed-off-by: Greg Kroah-Hartman --- drivers/misc/ti-st/Kconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/misc/ti-st/Kconfig b/drivers/misc/ti-st/Kconfig index 2c8c3f39..7c3e1069 100644 --- a/drivers/misc/ti-st/Kconfig +++ b/drivers/misc/ti-st/Kconfig @@ -5,7 +5,6 @@ menu "Texas Instruments shared transport line discipline" config TI_ST tristate "Shared transport core driver" - depends on RFKILL select FW_LOADER help This enables the shared transport core driver for TI -- cgit v1.1 From 8497d6a21c4b17052e868bd53a74c82b557a6c46 Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Tue, 12 Apr 2011 19:05:37 +0200 Subject: driver-core: fix race between device_register and driver_register When a device is registered to a bus it will be a) added to the list of devices of the bus and b) bind to a driver (if one matches). As a result of a driver being registered at this bus between a) and b) this device could already be bound to a driver. This leads to a warning and incorrect refcounting. To fix this add a check to device_attach to identify an already bound device. Signed-off-by: Sebastian Ott Signed-off-by: Greg Kroah-Hartman --- drivers/base/dd.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/base/dd.c b/drivers/base/dd.c index da57ee9..7e9219b 100644 --- a/drivers/base/dd.c +++ b/drivers/base/dd.c @@ -245,6 +245,10 @@ int device_attach(struct device *dev) device_lock(dev); if (dev->driver) { + if (klist_node_attached(&dev->p->knode_driver)) { + ret = 1; + goto out_unlock; + } ret = device_bind_driver(dev); if (ret == 0) ret = 1; @@ -257,6 +261,7 @@ int device_attach(struct device *dev) ret = bus_for_each_drv(dev->bus, NULL, dev, __device_attach); pm_runtime_put_sync(dev); } +out_unlock: device_unlock(dev); return ret; } -- cgit v1.1 From 7f100d1566d6ee353a43be92599511fc438ec281 Mon Sep 17 00:00:00 2001 From: Karthigan Srinivasan Date: Mon, 18 Apr 2011 16:16:52 -0500 Subject: drivers/base/core.c: Fixed brace coding style issue. Fixed brace coding style issue. Signed-off-by: Karthigan Srinivasan Signed-off-by: Greg Kroah-Hartman --- drivers/base/core.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/base/core.c b/drivers/base/core.c index fb8130c..bc8729d 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -1314,8 +1314,7 @@ EXPORT_SYMBOL_GPL(put_device); EXPORT_SYMBOL_GPL(device_create_file); EXPORT_SYMBOL_GPL(device_remove_file); -struct root_device -{ +struct root_device { struct device dev; struct module *owner; }; -- cgit v1.1 From 27a33f9e8fb203e71925257cf039fe6ec623c5d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 20 Apr 2011 09:44:42 +0200 Subject: driver core/platform_device_add_data: set platform_data to NULL if !data MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This makes the data = NULL case more consistent to the data != NULL case. The functional change is that now platform_device_add_data(somepdev, NULL, somesize) sets pdev->dev.platform_data to NULL instead of not touching it. Reviewed-by: Viresh Kumar Signed-off-by: Uwe Kleine-König Signed-off-by: Greg Kroah-Hartman --- drivers/base/platform.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/drivers/base/platform.c b/drivers/base/platform.c index 9e0e4fc..65cb4c3 100644 --- a/drivers/base/platform.c +++ b/drivers/base/platform.c @@ -220,17 +220,16 @@ EXPORT_SYMBOL_GPL(platform_device_add_resources); int platform_device_add_data(struct platform_device *pdev, const void *data, size_t size) { - void *d; + void *d = NULL; - if (!data) - return 0; - - d = kmemdup(data, size, GFP_KERNEL); - if (d) { - pdev->dev.platform_data = d; - return 0; + if (data) { + d = kmemdup(data, size, GFP_KERNEL); + if (!d) + return -ENOMEM; } - return -ENOMEM; + + pdev->dev.platform_data = d; + return 0; } EXPORT_SYMBOL_GPL(platform_device_add_data); -- cgit v1.1 From 251e031d132ea3d03e0a32f2240c67f449979c5d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 20 Apr 2011 09:44:43 +0200 Subject: driver core/platform_device_add_data: free platform data before overwriting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewed-by: Viresh Kumar Signed-off-by: Uwe Kleine-König Signed-off-by: Greg Kroah-Hartman --- drivers/base/platform.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/base/platform.c b/drivers/base/platform.c index 65cb4c3..58ad8e8 100644 --- a/drivers/base/platform.c +++ b/drivers/base/platform.c @@ -228,6 +228,7 @@ int platform_device_add_data(struct platform_device *pdev, const void *data, return -ENOMEM; } + kfree(pdev->dev.platform_data); pdev->dev.platform_data = d; return 0; } -- cgit v1.1 From cea896238fbfdbce254f51fc8fd78c59df50081f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 20 Apr 2011 09:44:44 +0200 Subject: driver core/platform_device_add_resources: set resource to NULL if !res MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This makes the res = NULL case more consistant to the res != NULL case as now both overwrite pdev->resource. Reviewed-by: Viresh Kumar Signed-off-by: Uwe Kleine-König Signed-off-by: Greg Kroah-Hartman --- drivers/base/platform.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/drivers/base/platform.c b/drivers/base/platform.c index 58ad8e8..667f282 100644 --- a/drivers/base/platform.c +++ b/drivers/base/platform.c @@ -192,18 +192,17 @@ EXPORT_SYMBOL_GPL(platform_device_alloc); int platform_device_add_resources(struct platform_device *pdev, const struct resource *res, unsigned int num) { - struct resource *r; + struct resource *r = NULL; - if (!res) - return 0; - - r = kmemdup(res, sizeof(struct resource) * num, GFP_KERNEL); - if (r) { - pdev->resource = r; - pdev->num_resources = num; - return 0; + if (res) { + r = kmemdup(res, sizeof(struct resource) * num, GFP_KERNEL); + if (!r) + return -ENOMEM; } - return -ENOMEM; + + pdev->resource = r; + pdev->num_resources = num; + return 0; } EXPORT_SYMBOL_GPL(platform_device_add_resources); -- cgit v1.1 From 4a03d6f7c863a039b937649a93341615f531358e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 20 Apr 2011 09:44:45 +0200 Subject: driver core/platform_device_add_resources: free resource before overwriting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewed-by: Viresh Kumar Signed-off-by: Uwe Kleine-König Signed-off-by: Greg Kroah-Hartman --- drivers/base/platform.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/base/platform.c b/drivers/base/platform.c index 667f282..7d4bdaf 100644 --- a/drivers/base/platform.c +++ b/drivers/base/platform.c @@ -200,6 +200,7 @@ int platform_device_add_resources(struct platform_device *pdev, return -ENOMEM; } + kfree(pdev->resource); pdev->resource = r; pdev->num_resources = num; return 0; -- cgit v1.1 From c8705082404823a5bb3e02a32ba0764399b9e6f2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 20 Apr 2011 09:44:46 +0200 Subject: driver core: let dev_set_drvdata return int instead of void as it can fail MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Before commit b402843 (Driver core: move dev_get/set_drvdata to drivers/base/dd.c) calling dev_set_drvdata with dev=NULL was an unchecked error. After some discussion about what to return in this case removing the check (and so producing a null pointer exception) seems fine. Signed-off-by: Uwe Kleine-König Signed-off-by: Greg Kroah-Hartman --- drivers/base/dd.c | 7 +++---- include/linux/device.h | 2 +- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/base/dd.c b/drivers/base/dd.c index 7e9219b..e3a3eff 100644 --- a/drivers/base/dd.c +++ b/drivers/base/dd.c @@ -413,17 +413,16 @@ void *dev_get_drvdata(const struct device *dev) } EXPORT_SYMBOL(dev_get_drvdata); -void dev_set_drvdata(struct device *dev, void *data) +int dev_set_drvdata(struct device *dev, void *data) { int error; - if (!dev) - return; if (!dev->p) { error = device_private_init(dev); if (error) - return; + return error; } dev->p->driver_data = data; + return 0; } EXPORT_SYMBOL(dev_set_drvdata); diff --git a/include/linux/device.h b/include/linux/device.h index 350ceda..2215d01 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -557,7 +557,7 @@ extern int device_move(struct device *dev, struct device *new_parent, extern const char *device_get_devnode(struct device *dev, mode_t *mode, const char **tmp); extern void *dev_get_drvdata(const struct device *dev); -extern void dev_set_drvdata(struct device *dev, void *data); +extern int dev_set_drvdata(struct device *dev, void *data); /* * Root device objects for grouping under /sys/devices -- cgit v1.1 From 568aa7508c99952ef18e4bfed87545e66f9e042f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Tue, 19 Apr 2011 10:21:20 +0200 Subject: ARM: at91: AT91CAP9 has a macb device MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit ee621dd (net: atmel_macb Kconfig: remove long dependency line) replaced a list of several explicit machines in the dependencies of MACB by a single symbol that is selected by the respective machines. ee621dd missed to let ARCH_AT91CAP9 select HAVE_NET_MACB though which is fixed here. Signed-off-by: Uwe Kleine-König Acked-by: Andrew Victor Signed-off-by: Jean-Christophe PLAGNIOL-VILLARD --- arch/arm/mach-at91/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm/mach-at91/Kconfig b/arch/arm/mach-at91/Kconfig index 1939023..2d299bf 100644 --- a/arch/arm/mach-at91/Kconfig +++ b/arch/arm/mach-at91/Kconfig @@ -83,6 +83,7 @@ config ARCH_AT91CAP9 select CPU_ARM926T select GENERIC_CLOCKEVENTS select HAVE_FB_ATMEL + select HAVE_NET_MACB config ARCH_AT572D940HF bool "AT572D940HF" -- cgit v1.1 From 0312e826a48168761dbbe10a1e8cbc22ebc99767 Mon Sep 17 00:00:00 2001 From: Greg Ungerer Date: Wed, 30 Mar 2011 22:31:15 +1000 Subject: arm: at91: minimal defconfig for at91x40 SoC A minimal defconfig for build testing the AT91x40 SoC. Signed-off-by: Greg Ungerer Signed-off-by: Jean-Christophe PLAGNIOL-VILLARD --- arch/arm/configs/at91x40_defconfig | 48 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) create mode 100644 arch/arm/configs/at91x40_defconfig diff --git a/arch/arm/configs/at91x40_defconfig b/arch/arm/configs/at91x40_defconfig new file mode 100644 index 0000000..c55e921 --- /dev/null +++ b/arch/arm/configs/at91x40_defconfig @@ -0,0 +1,48 @@ +CONFIG_EXPERIMENTAL=y +CONFIG_LOG_BUF_SHIFT=14 +CONFIG_EMBEDDED=y +# CONFIG_HOTPLUG is not set +# CONFIG_ELF_CORE is not set +# CONFIG_FUTEX is not set +# CONFIG_TIMERFD is not set +# CONFIG_VM_EVENT_COUNTERS is not set +# CONFIG_COMPAT_BRK is not set +CONFIG_SLAB=y +# CONFIG_LBDAF is not set +# CONFIG_BLK_DEV_BSG is not set +# CONFIG_IOSCHED_DEADLINE is not set +# CONFIG_IOSCHED_CFQ is not set +# CONFIG_MMU is not set +CONFIG_ARCH_AT91=y +CONFIG_ARCH_AT91X40=y +CONFIG_MACH_AT91EB01=y +CONFIG_AT91_EARLY_USART0=y +CONFIG_CPU_ARM7TDMI=y +CONFIG_SET_MEM_PARAM=y +CONFIG_DRAM_BASE=0x01000000 +CONFIG_DRAM_SIZE=0x00400000 +CONFIG_FLASH_MEM_BASE=0x01400000 +CONFIG_PROCESSOR_ID=0x14000040 +CONFIG_ZBOOT_ROM_TEXT=0x0 +CONFIG_ZBOOT_ROM_BSS=0x0 +CONFIG_BINFMT_FLAT=y +# CONFIG_SUSPEND is not set +# CONFIG_FW_LOADER is not set +CONFIG_MTD=y +CONFIG_MTD_PARTITIONS=y +CONFIG_MTD_CHAR=y +CONFIG_MTD_BLOCK=y +CONFIG_MTD_RAM=y +CONFIG_MTD_ROM=y +CONFIG_BLK_DEV_RAM=y +# CONFIG_INPUT is not set +# CONFIG_SERIO is not set +# CONFIG_VT is not set +# CONFIG_DEVKMEM is not set +# CONFIG_HW_RANDOM is not set +# CONFIG_HWMON is not set +# CONFIG_USB_SUPPORT is not set +CONFIG_EXT2_FS=y +# CONFIG_DNOTIFY is not set +CONFIG_ROMFS_FS=y +# CONFIG_ENABLE_MUST_CHECK is not set -- cgit v1.1 From 91a2f4d3cd2a2f1a2c6830896bd2403ca0130137 Mon Sep 17 00:00:00 2001 From: Greg Ungerer Date: Wed, 30 Mar 2011 14:43:32 +1000 Subject: arm: at91: fix compiler warning for eb01 board build Fix compiler warning when building for AT91EB01 board: arch/arm/mach-at91/board-eb01.c:41: warning: initialisation from incompatible pointer type Signed-off-by: Greg Ungerer Signed-off-by: Jean-Christophe PLAGNIOL-VILLARD --- arch/arm/mach-at91/board-eb01.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/arm/mach-at91/board-eb01.c b/arch/arm/mach-at91/board-eb01.c index 1f9d3cb..d8df59a 100644 --- a/arch/arm/mach-at91/board-eb01.c +++ b/arch/arm/mach-at91/board-eb01.c @@ -30,6 +30,11 @@ #include #include "generic.h" +static void __init at91eb01_init_irq(void) +{ + at91x40_init_interrupts(NULL); +} + static void __init at91eb01_map_io(void) { at91x40_initialize(40000000); @@ -38,7 +43,7 @@ static void __init at91eb01_map_io(void) MACHINE_START(AT91EB01, "Atmel AT91 EB01") /* Maintainer: Greg Ungerer */ .timer = &at91x40_timer, - .init_irq = at91x40_init_interrupts, + .init_irq = at91eb01_init_irq, .map_io = at91eb01_map_io, MACHINE_END -- cgit v1.1 From 9baeb7e47aed8e399d15d2ea8c032efe3680f20b Mon Sep 17 00:00:00 2001 From: Jean-Christophe PLAGNIOL-VILLARD Date: Sat, 23 Apr 2011 10:52:16 +0800 Subject: at91: Add ARCH_ID and basic cpu macros definition for 5series chips family. Signed-off-by: Jean-Christophe PLAGNIOL-VILLARD Signed-off-by: Nicolas Ferre --- arch/arm/mach-at91/include/mach/cpu.h | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/arch/arm/mach-at91/include/mach/cpu.h b/arch/arm/mach-at91/include/mach/cpu.h index 3bef931..0700f21 100644 --- a/arch/arm/mach-at91/include/mach/cpu.h +++ b/arch/arm/mach-at91/include/mach/cpu.h @@ -27,6 +27,7 @@ #define ARCH_ID_AT91SAM9G45 0x819b05a0 #define ARCH_ID_AT91SAM9G45MRL 0x819b05a2 /* aka 9G45-ES2 & non ES lots */ #define ARCH_ID_AT91SAM9G45ES 0x819b05a1 /* 9G45-ES (Engineering Sample) */ +#define ARCH_ID_AT91SAM9X5 0x819a05a0 #define ARCH_ID_AT91CAP9 0x039A03A0 #define ARCH_ID_AT91SAM9XE128 0x329973a0 @@ -55,6 +56,12 @@ static inline unsigned long at91_cpu_fully_identify(void) #define ARCH_EXID_AT91SAM9G46 0x00000003 #define ARCH_EXID_AT91SAM9G45 0x00000004 +#define ARCH_EXID_AT91SAM9G15 0x00000000 +#define ARCH_EXID_AT91SAM9G35 0x00000001 +#define ARCH_EXID_AT91SAM9X35 0x00000002 +#define ARCH_EXID_AT91SAM9G25 0x00000003 +#define ARCH_EXID_AT91SAM9X25 0x00000004 + static inline unsigned long at91_exid_identify(void) { return at91_sys_read(AT91_DBGU_EXID); @@ -143,6 +150,27 @@ static inline unsigned long at91cap9_rev_identify(void) #define cpu_is_at91sam9m11() (0) #endif +#ifdef CONFIG_ARCH_AT91SAM9X5 +#define cpu_is_at91sam9x5() (at91_cpu_identify() == ARCH_ID_AT91SAM9X5) +#define cpu_is_at91sam9g15() (cpu_is_at91sam9x5() && \ + (at91_exid_identify() == ARCH_EXID_AT91SAM9G15)) +#define cpu_is_at91sam9g35() (cpu_is_at91sam9x5() && \ + (at91_exid_identify() == ARCH_EXID_AT91SAM9G35)) +#define cpu_is_at91sam9x35() (cpu_is_at91sam9x5() && \ + (at91_exid_identify() == ARCH_EXID_AT91SAM9X35)) +#define cpu_is_at91sam9g25() (cpu_is_at91sam9x5() && \ + (at91_exid_identify() == ARCH_EXID_AT91SAM9G25)) +#define cpu_is_at91sam9x25() (cpu_is_at91sam9x5() && \ + (at91_exid_identify() == ARCH_EXID_AT91SAM9X25)) +#else +#define cpu_is_at91sam9x5() (0) +#define cpu_is_at91sam9g15() (0) +#define cpu_is_at91sam9g35() (0) +#define cpu_is_at91sam9x35() (0) +#define cpu_is_at91sam9g25() (0) +#define cpu_is_at91sam9x25() (0) +#endif + #ifdef CONFIG_ARCH_AT91CAP9 #define cpu_is_at91cap9() (at91_cpu_identify() == ARCH_ID_AT91CAP9) #define cpu_is_at91cap9_revB() (at91cap9_rev_identify() == ARCH_REVISION_CAP9_B) -- cgit v1.1 From 0911f124bf55357803d53197cc1ae5479f5e37e2 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Sun, 10 Apr 2011 11:01:51 +0200 Subject: genirq: Forgotten updates/deletions after removal of compat code commit 0c6f8a8b917ad361319c8ace3e9f28e69bfdb4c1 ("genirq: Remove compat code") removed the compat code, but forgot to update some references in comments and delete some of its documentation. Signed-off-by: Geert Uytterhoeven Link: http://lkml.kernel.org/r/%3C1302426113-13808-1-git-send-email-geert%40linux-m68k.org%3E Signed-off-by: Thomas Gleixner --- include/linux/irq.h | 19 +------------------ include/linux/irqdesc.h | 2 +- 2 files changed, 2 insertions(+), 19 deletions(-) diff --git a/include/linux/irq.h b/include/linux/irq.h index 09a3080..a71dd18 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -53,7 +53,7 @@ typedef void (*irq_preflow_handler_t)(struct irq_data *data); * Bits which can be modified via irq_set/clear/modify_status_flags() * IRQ_LEVEL - Interrupt is level type. Will be also * updated in the code when the above trigger - * bits are modified via set_irq_type() + * bits are modified via irq_set_irq_type() * IRQ_PER_CPU - Mark an interrupt PER_CPU. Will protect * it from affinity setting * IRQ_NOPROBE - Interrupt cannot be probed by autoprobing @@ -261,23 +261,6 @@ static inline void irqd_clr_chained_irq_inprogress(struct irq_data *d) * struct irq_chip - hardware interrupt chip descriptor * * @name: name for /proc/interrupts - * @startup: deprecated, replaced by irq_startup - * @shutdown: deprecated, replaced by irq_shutdown - * @enable: deprecated, replaced by irq_enable - * @disable: deprecated, replaced by irq_disable - * @ack: deprecated, replaced by irq_ack - * @mask: deprecated, replaced by irq_mask - * @mask_ack: deprecated, replaced by irq_mask_ack - * @unmask: deprecated, replaced by irq_unmask - * @eoi: deprecated, replaced by irq_eoi - * @end: deprecated, will go away with __do_IRQ() - * @set_affinity: deprecated, replaced by irq_set_affinity - * @retrigger: deprecated, replaced by irq_retrigger - * @set_type: deprecated, replaced by irq_set_type - * @set_wake: deprecated, replaced by irq_wake - * @bus_lock: deprecated, replaced by irq_bus_lock - * @bus_sync_unlock: deprecated, replaced by irq_bus_sync_unlock - * * @irq_startup: start up the interrupt (defaults to ->enable if NULL) * @irq_shutdown: shut down the interrupt (defaults to ->disable if NULL) * @irq_enable: enable the interrupt (defaults to chip->unmask if NULL) diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index a082905..8e1dc8e 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -21,7 +21,7 @@ struct timer_rand_state; * @status: status information * @core_internal_state__do_not_mess_with_it: core internal status information * @depth: disable-depth, for nested irq_disable() calls - * @wake_depth: enable depth, for multiple set_irq_wake() callers + * @wake_depth: enable depth, for multiple irq_set_irq_wake() callers * @irq_count: stats field to detect stalled irqs * @last_unhandled: aging timer for unhandled count * @irqs_unhandled: stats field for spurious unhandled interrupts -- cgit v1.1 From ee430599bf63c13ee521a352f562a4281cba5e61 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Sun, 10 Apr 2011 11:01:53 +0200 Subject: genirq: Update DocBook comments Fix some parts to match the actual code. [ tglx: Resolved the FIXMEs Gerd put in ] Signed-off-by: Geert Uytterhoeven Link: http://lkml.kernel.org/r/%3C1302426113-13808-3-git-send-email-geert%40linux-m68k.org%3E Signed-off-by: Thomas Gleixner Cc: linux-doc@vger.kernel.org --- Documentation/DocBook/genericirq.tmpl | 82 ++++++++++++++++++++--------------- 1 file changed, 47 insertions(+), 35 deletions(-) diff --git a/Documentation/DocBook/genericirq.tmpl b/Documentation/DocBook/genericirq.tmpl index fb10fd0..b342234 100644 --- a/Documentation/DocBook/genericirq.tmpl +++ b/Documentation/DocBook/genericirq.tmpl @@ -191,8 +191,8 @@ Whenever an interrupt triggers, the lowlevel arch code calls into the generic interrupt code by calling desc->handle_irq(). - This highlevel IRQ handling function only uses desc->chip primitives - referenced by the assigned chip descriptor structure. + This highlevel IRQ handling function only uses desc->irq_data.chip + primitives referenced by the assigned chip descriptor structure. @@ -206,11 +206,11 @@ enable_irq() disable_irq_nosync() (SMP only) synchronize_irq() (SMP only) - set_irq_type() - set_irq_wake() - set_irq_data() - set_irq_chip() - set_irq_chip_data() + irq_set_irq_type() + irq_set_irq_wake() + irq_set_handler_data() + irq_set_chip() + irq_set_chip_data() See the autogenerated function documentation for details. @@ -225,6 +225,8 @@ handle_fasteoi_irq handle_simple_irq handle_percpu_irq + handle_edge_eoi_irq + handle_bad_irq The interrupt flow handlers (either predefined or architecture specific) are assigned to specific interrupts by the architecture @@ -241,13 +243,13 @@ default_enable(struct irq_data *data) { - desc->chip->irq_unmask(data); + desc->irq_data.chip->irq_unmask(data); } default_disable(struct irq_data *data) { if (!delay_disable(data)) - desc->chip->irq_mask(data); + desc->irq_data.chip->irq_mask(data); } default_ack(struct irq_data *data) @@ -284,9 +286,9 @@ noop(struct irq_data *data)) The following control flow is implemented (simplified excerpt): -desc->chip->irq_mask(); -handle_IRQ_event(desc->action); -desc->chip->irq_unmask(); +desc->irq_data.chip->irq_mask_ack(); +handle_irq_event(desc->action); +desc->irq_data.chip->irq_unmask(); @@ -300,8 +302,8 @@ desc->chip->irq_unmask(); The following control flow is implemented (simplified excerpt): -handle_IRQ_event(desc->action); -desc->chip->irq_eoi(); +handle_irq_event(desc->action); +desc->irq_data.chip->irq_eoi(); @@ -315,17 +317,17 @@ desc->chip->irq_eoi(); The following control flow is implemented (simplified excerpt): if (desc->status & running) { - desc->chip->irq_mask(); + desc->irq_data.chip->irq_mask_ack(); desc->status |= pending | masked; return; } -desc->chip->irq_ack(); +desc->irq_data.chip->irq_ack(); desc->status |= running; do { if (desc->status & masked) - desc->chip->irq_unmask(); + desc->irq_data.chip->irq_unmask(); desc->status &= ~pending; - handle_IRQ_event(desc->action); + handle_irq_event(desc->action); } while (status & pending); desc->status &= ~running; @@ -344,7 +346,7 @@ desc->status &= ~running; The following control flow is implemented (simplified excerpt): -handle_IRQ_event(desc->action); +handle_irq_event(desc->action); @@ -362,12 +364,29 @@ handle_IRQ_event(desc->action); The following control flow is implemented (simplified excerpt): -handle_IRQ_event(desc->action); -if (desc->chip->irq_eoi) - desc->chip->irq_eoi(); +if (desc->irq_data.chip->irq_ack) + desc->irq_data.chip->irq_ack(); +handle_irq_event(desc->action); +if (desc->irq_data.chip->irq_eoi) + desc->irq_data.chip->irq_eoi(); + + EOI Edge IRQ flow handler + + handle_edge_eoi_irq provides an abnomination of the edge + handler which is solely used to tame a badly wreckaged + irq controller on powerpc/cell. + + + + Bad IRQ flow handler + + handle_bad_irq is used for spurious interrupts which + have no real handler assigned.. + + Quirks and optimizations @@ -410,6 +429,7 @@ if (desc->chip->irq_eoi) irq_mask_ack() - Optional, recommended for performance irq_mask() irq_unmask() + irq_eoi() - Optional, required for eoi flow handlers irq_retrigger() - Optional irq_set_type() - Optional irq_set_wake() - Optional @@ -424,32 +444,24 @@ if (desc->chip->irq_eoi) __do_IRQ entry point - The original implementation __do_IRQ() is an alternative entry - point for all types of interrupts. + The original implementation __do_IRQ() was an alternative entry + point for all types of interrupts. It not longer exists. This handler turned out to be not suitable for all interrupt hardware and was therefore reimplemented with split - functionality for egde/level/simple/percpu interrupts. This is not + functionality for edge/level/simple/percpu interrupts. This is not only a functional optimization. It also shortens code paths for interrupts. - - To make use of the split implementation, replace the call to - __do_IRQ by a call to desc->handle_irq() and associate - the appropriate handler function to desc->handle_irq(). - In most cases the generic handler implementations should - be sufficient. - Locking on SMP The locking of chip registers is up to the architecture that - defines the chip primitives. There is a chip->lock field that can be used - for serialization, but the generic layer does not touch it. The per-irq - structure is protected via desc->lock, by the generic layer. + defines the chip primitives. The per-irq structure is + protected via desc->lock, by the generic layer. -- cgit v1.1 From 770767787c23040dc152e7ae230597ff55b39470 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Sun, 10 Apr 2011 11:01:52 +0200 Subject: genirq: irq_desc: Document preflow_handler and affinity_hint [ tglx: Filled in the FIXME place holders ] Signed-off-by: Geert Uytterhoeven Link: http://lkml.kernel.org/r/%3C1302426113-13808-2-git-send-email-geert%40linux-m68k.org%3E Signed-off-by: Thomas Gleixner --- include/linux/irqdesc.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index 8e1dc8e..c70b1aa 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -16,7 +16,8 @@ struct timer_rand_state; * @irq_data: per irq and chip data passed down to chip functions * @timer_rand_state: pointer to timer rand state struct * @kstat_irqs: irq stats per cpu - * @handle_irq: highlevel irq-events handler [if NULL, __do_IRQ()] + * @handle_irq: highlevel irq-events handler + * @preflow_handler: handler called before the flow handler (currently used by sparc) * @action: the irq action chain * @status: status information * @core_internal_state__do_not_mess_with_it: core internal status information @@ -26,6 +27,7 @@ struct timer_rand_state; * @last_unhandled: aging timer for unhandled count * @irqs_unhandled: stats field for spurious unhandled interrupts * @lock: locking for SMP + * @affinity_hint: hint to user space for preferred irq affinity * @affinity_notify: context for notification of affinity changes * @pending_mask: pending rebalanced interrupts * @threads_oneshot: bitfield to handle shared oneshot threads -- cgit v1.1 From 7f1b1244e159a8490d7fb13667c6cb7e1e75046b Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Thu, 7 Apr 2011 06:01:44 +0900 Subject: genirq: Support per-IRQ thread disabling. This adds support for disabling threading on a per-IRQ basis via the IRQ status instead of the IRQ flow, which is necessary for interrupts that don't follow the natural IRQ flow channels, such as those that are virtually created. The new APIs added are simply: irq_set_thread() irq_set_nothread() which follow the rest of the IRQ status routines. Chained handlers also have IRQ_NOTHREAD set on them automatically, making the lack of threading explicit rather than implicit. Subsequently, the nothread flag can be viewed through the standard genirq debugging facilities. [ tglx: Fixed cleanup fallout ] Signed-off-by: Paul Mundt Link: http://lkml.kernel.org/r/%3C20110406210135.GF18426%40linux-sh.org%3E Signed-off-by: Thomas Gleixner --- include/linux/irq.h | 14 +++++++++++++- kernel/irq/chip.c | 1 + kernel/irq/debug.h | 1 + kernel/irq/manage.c | 3 ++- kernel/irq/settings.h | 17 +++++++++++++++++ 5 files changed, 34 insertions(+), 2 deletions(-) diff --git a/include/linux/irq.h b/include/linux/irq.h index a71dd18..39c2378 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -59,6 +59,7 @@ typedef void (*irq_preflow_handler_t)(struct irq_data *data); * IRQ_NOPROBE - Interrupt cannot be probed by autoprobing * IRQ_NOREQUEST - Interrupt cannot be requested via * request_irq() + * IRQ_NOTHREAD - Interrupt cannot be threaded * IRQ_NOAUTOEN - Interrupt is not automatically enabled in * request/setup_irq() * IRQ_NO_BALANCING - Interrupt cannot be balanced (affinity set) @@ -85,6 +86,7 @@ enum { IRQ_NO_BALANCING = (1 << 13), IRQ_MOVE_PCNTXT = (1 << 14), IRQ_NESTED_THREAD = (1 << 15), + IRQ_NOTHREAD = (1 << 16), }; #define IRQF_MODIFY_MASK \ @@ -422,7 +424,7 @@ irq_set_handler(unsigned int irq, irq_flow_handler_t handle) /* * Set a highlevel chained flow handler for a given IRQ. * (a chained handler is automatically enabled and set to - * IRQ_NOREQUEST and IRQ_NOPROBE) + * IRQ_NOREQUEST, IRQ_NOPROBE, and IRQ_NOTHREAD) */ static inline void irq_set_chained_handler(unsigned int irq, irq_flow_handler_t handle) @@ -452,6 +454,16 @@ static inline void irq_set_probe(unsigned int irq) irq_modify_status(irq, IRQ_NOPROBE, 0); } +static inline void irq_set_nothread(unsigned int irq) +{ + irq_modify_status(irq, 0, IRQ_NOTHREAD); +} + +static inline void irq_set_thread(unsigned int irq) +{ + irq_modify_status(irq, IRQ_NOTHREAD, 0); +} + static inline void irq_set_nested_thread(unsigned int irq, bool nest) { if (nest) diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 4af1e2b..52d856d 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -573,6 +573,7 @@ __irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, if (handle != handle_bad_irq && is_chained) { irq_settings_set_noprobe(desc); irq_settings_set_norequest(desc); + irq_settings_set_nothread(desc); irq_startup(desc); } out: diff --git a/kernel/irq/debug.h b/kernel/irq/debug.h index 306cba3..97a8bfa 100644 --- a/kernel/irq/debug.h +++ b/kernel/irq/debug.h @@ -27,6 +27,7 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) P(IRQ_PER_CPU); P(IRQ_NOPROBE); P(IRQ_NOREQUEST); + P(IRQ_NOTHREAD); P(IRQ_NOAUTOEN); PS(IRQS_AUTODETECT); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 07c1611..f7ce002 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -900,7 +900,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) */ new->handler = irq_nested_primary_handler; } else { - irq_setup_forced_threading(new); + if (irq_settings_can_thread(desc)) + irq_setup_forced_threading(new); } /* diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h index 0d91730..f166783 100644 --- a/kernel/irq/settings.h +++ b/kernel/irq/settings.h @@ -8,6 +8,7 @@ enum { _IRQ_LEVEL = IRQ_LEVEL, _IRQ_NOPROBE = IRQ_NOPROBE, _IRQ_NOREQUEST = IRQ_NOREQUEST, + _IRQ_NOTHREAD = IRQ_NOTHREAD, _IRQ_NOAUTOEN = IRQ_NOAUTOEN, _IRQ_MOVE_PCNTXT = IRQ_MOVE_PCNTXT, _IRQ_NO_BALANCING = IRQ_NO_BALANCING, @@ -20,6 +21,7 @@ enum { #define IRQ_LEVEL GOT_YOU_MORON #define IRQ_NOPROBE GOT_YOU_MORON #define IRQ_NOREQUEST GOT_YOU_MORON +#define IRQ_NOTHREAD GOT_YOU_MORON #define IRQ_NOAUTOEN GOT_YOU_MORON #define IRQ_NESTED_THREAD GOT_YOU_MORON #undef IRQF_MODIFY_MASK @@ -94,6 +96,21 @@ static inline void irq_settings_set_norequest(struct irq_desc *desc) desc->status_use_accessors |= _IRQ_NOREQUEST; } +static inline bool irq_settings_can_thread(struct irq_desc *desc) +{ + return !(desc->status_use_accessors & _IRQ_NOTHREAD); +} + +static inline void irq_settings_clr_nothread(struct irq_desc *desc) +{ + desc->status_use_accessors &= ~_IRQ_NOTHREAD; +} + +static inline void irq_settings_set_nothread(struct irq_desc *desc) +{ + desc->status_use_accessors |= _IRQ_NOTHREAD; +} + static inline bool irq_settings_can_probe(struct irq_desc *desc) { return !(desc->status_use_accessors & _IRQ_NOPROBE); -- cgit v1.1 From 7d8280624797bbe2f5170bd3c85c75a8c9c74242 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 3 Apr 2011 11:42:53 +0200 Subject: genirq: Implement a generic interrupt chip Implement a generic interrupt chip, which is configurable and is able to handle the most common irq chip implementations. Signed-off-by: Thomas Gleixner Cc: linux-arm-kernel@lists.infradead.org Tested-by: H Hartley Sweeten Tested-by: Tony Lindgren Tested-by; Kevin Hilman --- include/linux/irq.h | 135 ++++++++++++++++++++++++ kernel/irq/Makefile | 1 + kernel/irq/generic-chip.c | 261 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 397 insertions(+) create mode 100644 kernel/irq/generic-chip.c diff --git a/include/linux/irq.h b/include/linux/irq.h index 39c2378..2ba2f12 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -568,6 +568,141 @@ static inline int irq_reserve_irq(unsigned int irq) return irq_reserve_irqs(irq, 1); } +#ifndef irq_reg_writel +# define irq_reg_writel(val, addr) writel(val, addr) +#endif +#ifndef irq_reg_readl +# define irq_reg_readl(addr) readl(addr) +#endif + +/** + * struct irq_chip_regs - register offsets for struct irq_gci + * @enable: Enable register offset to reg_base + * @disable: Disable register offset to reg_base + * @mask: Mask register offset to reg_base + * @ack: Ack register offset to reg_base + * @eoi: Eoi register offset to reg_base + * @type: Type configuration register offset to reg_base + * @polarity: Polarity configuration register offset to reg_base + */ +struct irq_chip_regs { + unsigned long enable; + unsigned long disable; + unsigned long mask; + unsigned long ack; + unsigned long eoi; + unsigned long type; + unsigned long polarity; +}; + +/** + * struct irq_chip_type - Generic interrupt chip instance for a flow type + * @chip: The real interrupt chip which provides the callbacks + * @regs: Register offsets for this chip + * @handler: Flow handler associated with this chip + * @type: Chip can handle these flow types + * + * A irq_generic_chip can have several instances of irq_chip_type when + * it requires different functions and register offsets for different + * flow types. + */ +struct irq_chip_type { + struct irq_chip chip; + struct irq_chip_regs regs; + irq_flow_handler_t handler; + u32 type; +}; + +/** + * struct irq_chip_generic - Generic irq chip data structure + * @lock: Lock to protect register and cache data access + * @reg_base: Register base address (virtual) + * @irq_base: Interrupt base nr for this chip + * @irq_cnt: Number of interrupts handled by this chip + * @mask_cache: Cached mask register + * @type_cache: Cached type register + * @polarity_cache: Cached polarity register + * @wake_enabled: Interrupt can wakeup from suspend + * @wake_active: Interrupt is marked as an wakeup from suspend source + * @num_ct: Number of available irq_chip_type instances (usually 1) + * @private: Private data for non generic chip callbacks + * @chip_types: Array of interrupt irq_chip_types + * + * Note, that irq_chip_generic can have multiple irq_chip_type + * implementations which can be associated to a particular irq line of + * an irq_chip_generic instance. That allows to share and protect + * state in an irq_chip_generic instance when we need to implement + * different flow mechanisms (level/edge) for it. + */ +struct irq_chip_generic { + raw_spinlock_t lock; + void __iomem *reg_base; + unsigned int irq_base; + unsigned int irq_cnt; + u32 mask_cache; + u32 type_cache; + u32 polarity_cache; + u32 wake_enabled; + u32 wake_active; + unsigned int num_ct; + void *private; + struct irq_chip_type chip_types[0]; +}; + +/** + * enum irq_gc_flags - Initialization flags for generic irq chips + * @IRQ_GC_INIT_MASK_CACHE: Initialize the mask_cache by reading mask reg + * @IRQ_GC_INIT_NESTED_LOCK: Set the lock class of the irqs to nested for + * irq chips which need to call irq_set_wake() on + * the parent irq. Usually GPIO implementations + */ +enum irq_gc_flags { + IRQ_GC_INIT_MASK_CACHE = 1 << 0, + IRQ_GC_INIT_NESTED_LOCK = 1 << 1, +}; + +/* Generic chip callback functions */ +void irq_gc_noop(struct irq_data *d); +void irq_gc_mask_disable_reg(struct irq_data *d); +void irq_gc_mask_set_bit(struct irq_data *d); +void irq_gc_mask_clr_bit(struct irq_data *d); +void irq_gc_unmask_enable_reg(struct irq_data *d); +void irq_gc_ack(struct irq_data *d); +void irq_gc_mask_disable_reg_and_ack(struct irq_data *d); +void irq_gc_eoi(struct irq_data *d); +int irq_gc_set_wake(struct irq_data *d, unsigned int on); + +/* Setup functions for irq_chip_generic */ +struct irq_chip_generic * +irq_alloc_generic_chip(const char *name, int nr_ct, unsigned int irq_base, + void __iomem *reg_base, irq_flow_handler_t handler); +void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk, + enum irq_gc_flags flags, unsigned int clr, + unsigned int set); +int irq_setup_alt_chip(struct irq_data *d, unsigned int type); + +static inline struct irq_chip_type *irq_data_get_chip_type(struct irq_data *d) +{ + return container_of(d->chip, struct irq_chip_type, chip); +} + +#define IRQ_MSK(n) (u32)((n) < 32 ? ((1 << (n)) - 1) : UINT_MAX) + +#ifdef CONFIG_SMP +static inline void irq_gc_lock(struct irq_chip_generic *gc) +{ + raw_spin_lock(&gc->lock); +} + +static inline void irq_gc_unlock(struct irq_chip_generic *gc) +{ + raw_spin_unlock(&gc->lock); +} +#else +static inline void irq_gc_lock(struct irq_chip_generic *gc) { } +static inline void irq_gc_unlock(struct irq_chip_generic *gc) { } +#endif + #endif /* CONFIG_GENERIC_HARDIRQS */ #endif /* !CONFIG_S390 */ diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile index 54329cd..e7a13bd 100644 --- a/kernel/irq/Makefile +++ b/kernel/irq/Makefile @@ -1,5 +1,6 @@ obj-y := irqdesc.o handle.o manage.o spurious.o resend.o chip.o dummychip.o devres.o +obj-y += generic-chip.o obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c new file mode 100644 index 0000000..eb23e59 --- /dev/null +++ b/kernel/irq/generic-chip.c @@ -0,0 +1,261 @@ +/* + * Library implementing the most common irq chip callback functions + * + * Copyright (C) 2011, Thomas Gleixner + */ +#include +#include +#include +#include +#include + +#include "internals.h" + +static inline struct irq_chip_regs *cur_regs(struct irq_data *d) +{ + return &container_of(d->chip, struct irq_chip_type, chip)->regs; +} + +/** + * irq_gc_noop - NOOP function + * @d: irq_data + */ +void irq_gc_noop(struct irq_data *d) +{ +} + +/** + * irq_gc_mask_disable_reg - Mask chip via disable register + * @d: irq_data + * + * Chip has separate enable/disable registers instead of a single mask + * register. + */ +void irq_gc_mask_disable_reg(struct irq_data *d) +{ + struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); + u32 mask = 1 << (d->irq - gc->irq_base); + + irq_gc_lock(gc); + irq_reg_writel(mask, gc->reg_base + cur_regs(d)->disable); + gc->mask_cache &= ~mask; + irq_gc_unlock(gc); +} + +/** + * irq_gc_mask_set_mask_bit - Mask chip via setting bit in mask register + * @d: irq_data + * + * Chip has a single mask register. Values of this register are cached + * and protected by gc->lock + */ +void irq_gc_mask_set_bit(struct irq_data *d) +{ + struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); + u32 mask = 1 << (d->irq - gc->irq_base); + + irq_gc_lock(gc); + gc->mask_cache |= mask; + irq_reg_writel(gc->mask_cache, gc->reg_base + cur_regs(d)->mask); + irq_gc_unlock(gc); +} + +/** + * irq_gc_mask_set_mask_bit - Mask chip via clearing bit in mask register + * @d: irq_data + * + * Chip has a single mask register. Values of this register are cached + * and protected by gc->lock + */ +void irq_gc_mask_clr_bit(struct irq_data *d) +{ + struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); + u32 mask = 1 << (d->irq - gc->irq_base); + + irq_gc_lock(gc); + gc->mask_cache &= ~mask; + irq_reg_writel(gc->mask_cache, gc->reg_base + cur_regs(d)->mask); + irq_gc_unlock(gc); +} + +/** + * irq_gc_unmask_enable_reg - Unmask chip via enable register + * @d: irq_data + * + * Chip has separate enable/disable registers instead of a single mask + * register. + */ +void irq_gc_unmask_enable_reg(struct irq_data *d) +{ + struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); + u32 mask = 1 << (d->irq - gc->irq_base); + + irq_gc_lock(gc); + irq_reg_writel(mask, gc->reg_base + cur_regs(d)->enable); + gc->mask_cache |= mask; + irq_gc_unlock(gc); +} + +/** + * irq_gc_ack - Ack pending interrupt + * @d: irq_data + */ +void irq_gc_ack(struct irq_data *d) +{ + struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); + u32 mask = 1 << (d->irq - gc->irq_base); + + irq_gc_lock(gc); + irq_reg_writel(mask, gc->reg_base + cur_regs(d)->ack); + irq_gc_unlock(gc); +} + +/** + * irq_gc_mask_disable_reg_and_ack- Mask and ack pending interrupt + * @d: irq_data + */ +void irq_gc_mask_disable_reg_and_ack(struct irq_data *d) +{ + struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); + u32 mask = 1 << (d->irq - gc->irq_base); + + irq_gc_lock(gc); + irq_reg_writel(mask, gc->reg_base + cur_regs(d)->mask); + irq_reg_writel(mask, gc->reg_base + cur_regs(d)->ack); + irq_gc_unlock(gc); +} + +/** + * irq_gc_eoi - EOI interrupt + * @d: irq_data + */ +void irq_gc_eoi(struct irq_data *d) +{ + struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); + u32 mask = 1 << (d->irq - gc->irq_base); + + irq_gc_lock(gc); + irq_reg_writel(mask, gc->reg_base + cur_regs(d)->eoi); + irq_gc_unlock(gc); +} + +/** + * irq_gc_set_wake - Set/clr wake bit for an interrupt + * @d: irq_data + * + * For chips where the wake from suspend functionality is not + * configured in a separate register and the wakeup active state is + * just stored in a bitmask. + */ +int irq_gc_set_wake(struct irq_data *d, unsigned int on) +{ + struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); + u32 mask = 1 << (d->irq - gc->irq_base); + + if (!(mask & gc->wake_enabled)) + return -EINVAL; + + irq_gc_lock(gc); + if (on) + gc->wake_active |= mask; + else + gc->wake_active &= ~mask; + irq_gc_unlock(gc); + return 0; +} + +/** + * irq_alloc_generic_chip - Allocate a generic chip and initialize it + * @name: Name of the irq chip + * @num_ct: Number of irq_chip_type instances associated with this + * @irq_base: Interrupt base nr for this chip + * @reg_base: Register base address (virtual) + * @handler: Default flow handler associated with this chip + * + * Returns an initialized irq_chip_generic structure. The chip defaults + * to the primary (index 0) irq_chip_type and @handler + */ +struct irq_chip_generic * +irq_alloc_generic_chip(const char *name, int num_ct, unsigned int irq_base, + void __iomem *reg_base, irq_flow_handler_t handler) +{ + struct irq_chip_generic *gc; + unsigned long sz = sizeof(*gc) + num_ct * sizeof(struct irq_chip_type); + + gc = kzalloc(sz, GFP_KERNEL); + if (gc) { + raw_spin_lock_init(&gc->lock); + gc->num_ct = num_ct; + gc->irq_base = irq_base; + gc->reg_base = reg_base; + gc->chip_types->chip.name = name; + gc->chip_types->handler = handler; + } + return gc; +} + +/* + * Separate lockdep class for interrupt chip which can nest irq_desc + * lock. + */ +static struct lock_class_key irq_nested_lock_class; + +/** + * irq_setup_generic_chip - Setup a range of interrupts with a generic chip + * @gc: Generic irq chip holding all data + * @msk: Bitmask holding the irqs to initialize relative to gc->irq_base + * @flags: Flags for initialization + * @clr: IRQ_* bits to clear + * @set: IRQ_* bits to set + * + * Set up max. 32 interrupts starting from gc->irq_base. Note, this + * initializes all interrupts to the primary irq_chip_type and its + * associated handler. + */ +void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk, + enum irq_gc_flags flags, unsigned int clr, + unsigned int set) +{ + struct irq_chip_type *ct = gc->chip_types; + unsigned int i; + + /* Init mask cache ? */ + if (flags & IRQ_GC_INIT_MASK_CACHE) + gc->mask_cache = irq_reg_readl(gc->reg_base + ct->regs.mask); + + for (i = gc->irq_base; msk; msk >>= 1, i++) { + if (!msk & 0x01) + continue; + + if (flags & IRQ_GC_INIT_NESTED_LOCK) + irq_set_lockdep_class(i, &irq_nested_lock_class); + + irq_set_chip_and_handler(i, &ct->chip, ct->handler); + irq_set_chip_data(i, gc); + irq_modify_status(i, clr, set); + } + gc->irq_cnt = i - gc->irq_base; +} + +/** + * irq_setup_alt_chip - Switch to alternative chip + * @d: irq_data for this interrupt + * @type Flow type to be initialized + * + * Only to be called from chip->irq_set_type() callbacks. + */ +int irq_setup_alt_chip(struct irq_data *d, unsigned int type) +{ + struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); + struct irq_chip_type *ct = gc->chip_types; + unsigned int i; + + for (i = 0; i < gc->num_ct; i++, ct++) { + if (ct->type & type) { + d->chip = &ct->chip; + irq_data_to_desc(d)->handle_irq = ct->handler; + return 0; + } + } + return -EINVAL; +} -- cgit v1.1 From cfefd21e693dca791bf9ecfc9dd3794facad533c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Apr 2011 22:36:08 +0200 Subject: genirq: Add chip suspend and resume callbacks These callbacks are only called in the syscore suspend/resume code on interrupt chips which have been registered via the generic irq chip mechanism. Calling those callbacks per irq would be rather icky, but with the generic irq chip mechanism we can call this per registered chip. Signed-off-by: Thomas Gleixner Cc: linux-arm-kernel@lists.infradead.org --- include/linux/irq.h | 11 ++++++ kernel/irq/generic-chip.c | 93 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 104 insertions(+) diff --git a/include/linux/irq.h b/include/linux/irq.h index 2ba2f12..8b45384 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -280,6 +280,9 @@ static inline void irqd_clr_chained_irq_inprogress(struct irq_data *d) * @irq_bus_sync_unlock:function to sync and unlock slow bus (i2c) chips * @irq_cpu_online: configure an interrupt source for a secondary CPU * @irq_cpu_offline: un-configure an interrupt source for a secondary CPU + * @irq_suspend: function called from core code on suspend once per chip + * @irq_resume: function called from core code on resume once per chip + * @irq_pm_shutdown: function called from core code on shutdown once per chip * @irq_print_chip: optional to print special chip info in show_interrupts * @flags: chip specific flags * @@ -309,6 +312,10 @@ struct irq_chip { void (*irq_cpu_online)(struct irq_data *data); void (*irq_cpu_offline)(struct irq_data *data); + void (*irq_suspend)(struct irq_data *data); + void (*irq_resume)(struct irq_data *data); + void (*irq_pm_shutdown)(struct irq_data *data); + void (*irq_print_chip)(struct irq_data *data, struct seq_file *p); unsigned long flags; @@ -626,6 +633,7 @@ struct irq_chip_type { * @wake_active: Interrupt is marked as an wakeup from suspend source * @num_ct: Number of available irq_chip_type instances (usually 1) * @private: Private data for non generic chip callbacks + * @list: List head for keeping track of instances * @chip_types: Array of interrupt irq_chip_types * * Note, that irq_chip_generic can have multiple irq_chip_type @@ -646,6 +654,7 @@ struct irq_chip_generic { u32 wake_active; unsigned int num_ct; void *private; + struct list_head list; struct irq_chip_type chip_types[0]; }; @@ -680,6 +689,8 @@ void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk, enum irq_gc_flags flags, unsigned int clr, unsigned int set); int irq_setup_alt_chip(struct irq_data *d, unsigned int type); +void irq_remove_generic_chip(struct irq_chip_generic *gc, u32 msk, + unsigned int clr, unsigned int set); static inline struct irq_chip_type *irq_data_get_chip_type(struct irq_data *d) { diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index eb23e59..31a9db7 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -8,9 +8,13 @@ #include #include #include +#include #include "internals.h" +static LIST_HEAD(gc_list); +static DEFINE_RAW_SPINLOCK(gc_lock); + static inline struct irq_chip_regs *cur_regs(struct irq_data *d) { return &container_of(d->chip, struct irq_chip_type, chip)->regs; @@ -219,6 +223,10 @@ void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk, struct irq_chip_type *ct = gc->chip_types; unsigned int i; + raw_spin_lock(&gc_lock); + list_add_tail(&gc->list, &gc_list); + raw_spin_unlock(&gc_lock); + /* Init mask cache ? */ if (flags & IRQ_GC_INIT_MASK_CACHE) gc->mask_cache = irq_reg_readl(gc->reg_base + ct->regs.mask); @@ -259,3 +267,88 @@ int irq_setup_alt_chip(struct irq_data *d, unsigned int type) } return -EINVAL; } + +/** + * irq_remove_generic_chip - Remove a chip + * @gc: Generic irq chip holding all data + * @msk: Bitmask holding the irqs to initialize relative to gc->irq_base + * @clr: IRQ_* bits to clear + * @set: IRQ_* bits to set + * + * Remove up to 32 interrupts starting from gc->irq_base. + */ +void irq_remove_generic_chip(struct irq_chip_generic *gc, u32 msk, + unsigned int clr, unsigned int set) +{ + unsigned int i = gc->irq_base; + + raw_spin_lock(&gc_lock); + list_del(&gc->list); + raw_spin_unlock(&gc_lock); + + for (; msk; msk >>= 1, i++) { + if (!msk & 0x01) + continue; + + /* Remove handler first. That will mask the irq line */ + irq_set_handler(i, NULL); + irq_set_chip(i, &no_irq_chip); + irq_set_chip_data(i, NULL); + irq_modify_status(i, clr, set); + } +} + +#ifdef CONFIG_PM +static int irq_gc_suspend(void) +{ + struct irq_chip_generic *gc; + + list_for_each_entry(gc, &gc_list, list) { + struct irq_chip_type *ct = gc->chip_types; + + if (ct->chip.irq_suspend) + ct->chip.irq_suspend(irq_get_irq_data(gc->irq_base)); + } + return 0; +} + +static void irq_gc_resume(void) +{ + struct irq_chip_generic *gc; + + list_for_each_entry(gc, &gc_list, list) { + struct irq_chip_type *ct = gc->chip_types; + + if (ct->chip.irq_resume) + ct->chip.irq_resume(irq_get_irq_data(gc->irq_base)); + } +} +#else +#define irq_gc_suspend NULL +#define irq_gc_resume NULL +#endif + +static void irq_gc_shutdown(void) +{ + struct irq_chip_generic *gc; + + list_for_each_entry(gc, &gc_list, list) { + struct irq_chip_type *ct = gc->chip_types; + + if (ct->chip.irq_pm_shutdown) + ct->chip.irq_pm_shutdown(irq_get_irq_data(gc->irq_base)); + } +} + +static struct syscore_ops irq_gc_syscore_ops = { + .suspend = irq_gc_suspend, + .resume = irq_gc_resume, + .shutdown = irq_gc_shutdown, +}; + +static int __init irq_gc_init_ops(void) +{ + register_syscore_ops(&irq_gc_syscore_ops); + return 0; +} +device_initcall(irq_gc_init_ops); -- cgit v1.1 From e39aece7d41119c3d63f390420e00ab4d2a526a9 Mon Sep 17 00:00:00 2001 From: Vladislav Zolotarov Date: Sat, 23 Apr 2011 07:44:46 +0000 Subject: bnx2x: fix UDP csum offload Fixed packets parameters for FW in UDP checksum offload flow. Do not dereference TCP headers on non TCP frames. Reported-by: Eric Dumazet Signed-off-by: Dmitry Kravkov Signed-off-by: Eilon Greenstein Signed-off-by: David S. Miller --- drivers/net/bnx2x/bnx2x_cmn.c | 34 ++++++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 10 deletions(-) diff --git a/drivers/net/bnx2x/bnx2x_cmn.c b/drivers/net/bnx2x/bnx2x_cmn.c index e83ac6d..16581df 100644 --- a/drivers/net/bnx2x/bnx2x_cmn.c +++ b/drivers/net/bnx2x/bnx2x_cmn.c @@ -2019,15 +2019,23 @@ static inline void bnx2x_set_pbd_gso(struct sk_buff *skb, static inline u8 bnx2x_set_pbd_csum_e2(struct bnx2x *bp, struct sk_buff *skb, u32 *parsing_data, u32 xmit_type) { - *parsing_data |= ((tcp_hdrlen(skb)/4) << - ETH_TX_PARSE_BD_E2_TCP_HDR_LENGTH_DW_SHIFT) & - ETH_TX_PARSE_BD_E2_TCP_HDR_LENGTH_DW; + *parsing_data |= + ((((u8 *)skb_transport_header(skb) - skb->data) >> 1) << + ETH_TX_PARSE_BD_E2_TCP_HDR_START_OFFSET_W_SHIFT) & + ETH_TX_PARSE_BD_E2_TCP_HDR_START_OFFSET_W; - *parsing_data |= ((((u8 *)tcp_hdr(skb) - skb->data) / 2) << - ETH_TX_PARSE_BD_E2_TCP_HDR_START_OFFSET_W_SHIFT) & - ETH_TX_PARSE_BD_E2_TCP_HDR_START_OFFSET_W; + if (xmit_type & XMIT_CSUM_TCP) { + *parsing_data |= ((tcp_hdrlen(skb) / 4) << + ETH_TX_PARSE_BD_E2_TCP_HDR_LENGTH_DW_SHIFT) & + ETH_TX_PARSE_BD_E2_TCP_HDR_LENGTH_DW; - return skb_transport_header(skb) + tcp_hdrlen(skb) - skb->data; + return skb_transport_header(skb) + tcp_hdrlen(skb) - skb->data; + } else + /* We support checksum offload for TCP and UDP only. + * No need to pass the UDP header length - it's a constant. + */ + return skb_transport_header(skb) + + sizeof(struct udphdr) - skb->data; } /** @@ -2043,7 +2051,7 @@ static inline u8 bnx2x_set_pbd_csum(struct bnx2x *bp, struct sk_buff *skb, struct eth_tx_parse_bd_e1x *pbd, u32 xmit_type) { - u8 hlen = (skb_network_header(skb) - skb->data) / 2; + u8 hlen = (skb_network_header(skb) - skb->data) >> 1; /* for now NS flag is not used in Linux */ pbd->global_data = @@ -2051,9 +2059,15 @@ static inline u8 bnx2x_set_pbd_csum(struct bnx2x *bp, struct sk_buff *skb, ETH_TX_PARSE_BD_E1X_LLC_SNAP_EN_SHIFT)); pbd->ip_hlen_w = (skb_transport_header(skb) - - skb_network_header(skb)) / 2; + skb_network_header(skb)) >> 1; - hlen += pbd->ip_hlen_w + tcp_hdrlen(skb) / 2; + hlen += pbd->ip_hlen_w; + + /* We support checksum offload for TCP and UDP only */ + if (xmit_type & XMIT_CSUM_TCP) + hlen += tcp_hdrlen(skb) / 2; + else + hlen += sizeof(struct udphdr) / 2; pbd->total_hlen_w = cpu_to_le16(hlen); hlen = hlen*2; -- cgit v1.1 From fa7b69475a6c192853949ba496dd9c37b497b548 Mon Sep 17 00:00:00 2001 From: "Justin P. Mattock" Date: Fri, 22 Apr 2011 10:08:52 -0700 Subject: perf events, x86, P4: Fix typo in comment Signed-off-by: Justin P. Mattock Acked-by: Cyrill Gorcunov Cc: trivial@kernel.org Link: http://lkml.kernel.org/r/1303492132-3004-1-git-send-email-justinmattock@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_p4.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c index ae31e96..f4c1da2 100644 --- a/arch/x86/kernel/cpu/perf_event_p4.c +++ b/arch/x86/kernel/cpu/perf_event_p4.c @@ -1187,7 +1187,7 @@ static __init int p4_pmu_init(void) { unsigned int low, high; - /* If we get stripped -- indexig fails */ + /* If we get stripped -- indexing fails */ BUILD_BUG_ON(ARCH_P4_MAX_CCCR > X86_PMC_MAX_GENERIC); rdmsr(MSR_IA32_MISC_ENABLE, low, high); -- cgit v1.1 From 625f2a378e5a10f45fdc37932fc9f8a21676de9e Mon Sep 17 00:00:00 2001 From: Jonathan Corbet Date: Fri, 22 Apr 2011 11:19:10 -0600 Subject: sched: Get rid of lock_depth Neil Brown pointed out that lock_depth somehow escaped the BKL removal work. Let's get rid of it now. Note that the perf scripting utilities still have a bunch of code for dealing with common_lock_depth in tracepoints; I have left that in place in case anybody wants to use that code with older kernels. Suggested-by: Neil Brown Signed-off-by: Jonathan Corbet Cc: Arnd Bergmann Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20110422111910.456c0e84@bike.lwn.net Signed-off-by: Ingo Molnar --- Documentation/trace/kprobetrace.txt | 1 - include/linux/init_task.h | 1 - include/linux/sched.h | 6 ------ kernel/fork.c | 1 - kernel/mutex.c | 7 ------- kernel/sched.c | 11 +---------- kernel/sched_debug.c | 4 ---- kernel/trace/trace_kprobe.c | 1 - tools/perf/Documentation/perf-script-perl.txt | 1 - tools/perf/Documentation/perf-script-python.txt | 1 - 10 files changed, 1 insertion(+), 33 deletions(-) diff --git a/Documentation/trace/kprobetrace.txt b/Documentation/trace/kprobetrace.txt index 6d27ab8..c83bd6b 100644 --- a/Documentation/trace/kprobetrace.txt +++ b/Documentation/trace/kprobetrace.txt @@ -120,7 +120,6 @@ format: field:unsigned char common_flags; offset:2; size:1; signed:0; field:unsigned char common_preempt_count; offset:3; size:1;signed:0; field:int common_pid; offset:4; size:4; signed:1; - field:int common_lock_depth; offset:8; size:4; signed:1; field:unsigned long __probe_ip; offset:12; size:4; signed:0; field:int __probe_nargs; offset:16; size:4; signed:1; diff --git a/include/linux/init_task.h b/include/linux/init_task.h index caa151f..689496b 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -134,7 +134,6 @@ extern struct cred init_cred; .stack = &init_thread_info, \ .usage = ATOMIC_INIT(2), \ .flags = PF_KTHREAD, \ - .lock_depth = -1, \ .prio = MAX_PRIO-20, \ .static_prio = MAX_PRIO-20, \ .normal_prio = MAX_PRIO-20, \ diff --git a/include/linux/sched.h b/include/linux/sched.h index 171ba24..013314a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -731,10 +731,6 @@ struct sched_info { /* timestamps */ unsigned long long last_arrival,/* when we last ran on a cpu */ last_queued; /* when we were last queued to run */ -#ifdef CONFIG_SCHEDSTATS - /* BKL stats */ - unsigned int bkl_count; -#endif }; #endif /* defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) */ @@ -1190,8 +1186,6 @@ struct task_struct { unsigned int flags; /* per process flags, defined below */ unsigned int ptrace; - int lock_depth; /* BKL lock depth */ - #ifdef CONFIG_SMP struct task_struct *wake_entry; int on_cpu; diff --git a/kernel/fork.c b/kernel/fork.c index e7548de..aca6287 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1103,7 +1103,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, posix_cpu_timers_init(p); - p->lock_depth = -1; /* -1 = no lock */ do_posix_clock_monotonic_gettime(&p->start_time); p->real_start_time = p->start_time; monotonic_to_bootbased(&p->real_start_time); diff --git a/kernel/mutex.c b/kernel/mutex.c index fe4706c..2c938e2 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -163,13 +163,6 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, struct task_struct *owner; /* - * If we own the BKL, then don't spin. The owner of - * the mutex might be waiting on us to release the BKL. - */ - if (unlikely(current->lock_depth >= 0)) - break; - - /* * If there's an owner, wait for it to either * release the lock or go to sleep. */ diff --git a/kernel/sched.c b/kernel/sched.c index 8cb0a57..9cde2dd 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4121,12 +4121,6 @@ static inline void schedule_debug(struct task_struct *prev) profile_hit(SCHED_PROFILING, __builtin_return_address(0)); schedstat_inc(this_rq(), sched_count); -#ifdef CONFIG_SCHEDSTATS - if (unlikely(prev->lock_depth >= 0)) { - schedstat_inc(this_rq(), rq_sched_info.bkl_count); - schedstat_inc(prev, sched_info.bkl_count); - } -#endif } static void put_prev_task(struct rq *rq, struct task_struct *prev) @@ -5852,11 +5846,8 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) raw_spin_unlock_irqrestore(&rq->lock, flags); /* Set the preempt count _outside_ the spinlocks! */ -#if defined(CONFIG_PREEMPT) - task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0); -#else task_thread_info(idle)->preempt_count = 0; -#endif + /* * The idle tasks have their own, simple scheduling class: */ diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 3669bec6..a6710a1 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -296,9 +296,6 @@ static void print_cpu(struct seq_file *m, int cpu) P(ttwu_count); P(ttwu_local); - SEQ_printf(m, " .%-30s: %d\n", "bkl_count", - rq->rq_sched_info.bkl_count); - #undef P #undef P64 #endif @@ -441,7 +438,6 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) P(se.statistics.wait_count); PN(se.statistics.iowait_sum); P(se.statistics.iowait_count); - P(sched_info.bkl_count); P(se.nr_migrations); P(se.statistics.nr_migrations_cold); P(se.statistics.nr_failed_migrations_affine); diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 35d55a3..f925c45 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -53,7 +53,6 @@ const char *reserved_field_names[] = { "common_preempt_count", "common_pid", "common_tgid", - "common_lock_depth", FIELD_STRING_IP, FIELD_STRING_RETIP, FIELD_STRING_FUNC, diff --git a/tools/perf/Documentation/perf-script-perl.txt b/tools/perf/Documentation/perf-script-perl.txt index 5bb41e5..3152cca 100644 --- a/tools/perf/Documentation/perf-script-perl.txt +++ b/tools/perf/Documentation/perf-script-perl.txt @@ -63,7 +63,6 @@ The format file for the sched_wakep event defines the following fields field:unsigned char common_flags; field:unsigned char common_preempt_count; field:int common_pid; - field:int common_lock_depth; field:char comm[TASK_COMM_LEN]; field:pid_t pid; diff --git a/tools/perf/Documentation/perf-script-python.txt b/tools/perf/Documentation/perf-script-python.txt index 36b3827..4710220 100644 --- a/tools/perf/Documentation/perf-script-python.txt +++ b/tools/perf/Documentation/perf-script-python.txt @@ -463,7 +463,6 @@ The format file for the sched_wakep event defines the following fields field:unsigned char common_flags; field:unsigned char common_preempt_count; field:int common_pid; - field:int common_lock_depth; field:char comm[TASK_COMM_LEN]; field:pid_t pid; -- cgit v1.1 From 15d6aba24d88231415f4e7e091c0f1e60c3e6fd5 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 24 Apr 2011 11:11:51 +0300 Subject: x86: Demacro CONFIG_PARAVIRT cpu accessors Recently, we had a build failure on !CONFIG_PARAVIRT due to a callback ->wbinvd() clashing with a macro wbinvd(). While we worked around the issue, avoid it in the future by changing the macro (and a few surrounding ones) to an inline function. Signed-off-by: Avi Kivity Cc: "H. Peter Anvin" Link: http://lkml.kernel.org/r/1303632711-21662-1-git-send-email-avi@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/system.h | 85 ++++++++++++++++++++++++++++++++++++------- 1 file changed, 71 insertions(+), 14 deletions(-) diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h index 12569e6..c2ff2a1 100644 --- a/arch/x86/include/asm/system.h +++ b/arch/x86/include/asm/system.h @@ -303,24 +303,81 @@ static inline void native_wbinvd(void) #ifdef CONFIG_PARAVIRT #include #else -#define read_cr0() (native_read_cr0()) -#define write_cr0(x) (native_write_cr0(x)) -#define read_cr2() (native_read_cr2()) -#define write_cr2(x) (native_write_cr2(x)) -#define read_cr3() (native_read_cr3()) -#define write_cr3(x) (native_write_cr3(x)) -#define read_cr4() (native_read_cr4()) -#define read_cr4_safe() (native_read_cr4_safe()) -#define write_cr4(x) (native_write_cr4(x)) -#define wbinvd() (native_wbinvd()) + +static inline unsigned long read_cr0(void) +{ + return native_read_cr0(); +} + +static inline void write_cr0(unsigned long x) +{ + native_write_cr0(x); +} + +static inline unsigned long read_cr2(void) +{ + return native_read_cr2(); +} + +static inline void write_cr2(unsigned long x) +{ + native_write_cr2(x); +} + +static inline unsigned long read_cr3(void) +{ + return native_read_cr3(); +} + +static inline void write_cr3(unsigned long x) +{ + native_write_cr3(x); +} + +static inline unsigned long read_cr4(void) +{ + return native_read_cr4(); +} + +static inline unsigned long read_cr4_safe(void) +{ + return native_read_cr4_safe(); +} + +static inline void write_cr4(unsigned long x) +{ + native_write_cr4(x); +} + +static inline void wbinvd(void) +{ + native_wbinvd(); +} + #ifdef CONFIG_X86_64 -#define read_cr8() (native_read_cr8()) -#define write_cr8(x) (native_write_cr8(x)) -#define load_gs_index native_load_gs_index + +static inline unsigned long read_cr8(void) +{ + return native_read_cr8(); +} + +static inline void write_cr8(unsigned long x) +{ + native_write_cr8(x); +} + +static inline void load_gs_index(unsigned selector) +{ + native_load_gs_index(selector); +} + #endif /* Clear the 'TS' bit */ -#define clts() (native_clts()) +static inline void clts(void) +{ + native_clts(); +} #endif/* CONFIG_PARAVIRT */ -- cgit v1.1 From 953a12cc2889d1be92e80a2d0bab5ffef4942300 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Romieu?= Date: Sun, 24 Apr 2011 17:38:48 +0200 Subject: r8169: don't request firmware when there's no userspace. The firmware is cached during the first successfull call to open() and released once the network device is unregistered. The driver uses the cached firmware between open() and unregister_netdev(). So far the firmware is optional : a failure to load the firmware does not prevent open() to success. It is thus necessary to 1) unregister all 816x / 810[23] devices and 2) force a driver probe to issue a new firmware load. Signed-off-by: Francois Romieu Fixed-by: Ciprian Docan Cc: Realtek linux nic maintainers --- drivers/net/r8169.c | 99 ++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 71 insertions(+), 28 deletions(-) diff --git a/drivers/net/r8169.c b/drivers/net/r8169.c index 493b0de..397c368 100644 --- a/drivers/net/r8169.c +++ b/drivers/net/r8169.c @@ -170,6 +170,16 @@ static const struct { }; #undef _R +static const struct rtl_firmware_info { + int mac_version; + const char *fw_name; +} rtl_firmware_infos[] = { + { .mac_version = RTL_GIGA_MAC_VER_25, .fw_name = FIRMWARE_8168D_1 }, + { .mac_version = RTL_GIGA_MAC_VER_26, .fw_name = FIRMWARE_8168D_2 }, + { .mac_version = RTL_GIGA_MAC_VER_29, .fw_name = FIRMWARE_8105E_1 }, + { .mac_version = RTL_GIGA_MAC_VER_30, .fw_name = FIRMWARE_8105E_1 } +}; + enum cfg_version { RTL_CFG_0 = 0x00, RTL_CFG_1, @@ -565,6 +575,7 @@ struct rtl8169_private { u32 saved_wolopts; const struct firmware *fw; +#define RTL_FIRMWARE_UNKNOWN ERR_PTR(-EAGAIN); }; MODULE_AUTHOR("Realtek and the Linux r8169 crew "); @@ -1789,25 +1800,26 @@ rtl_phy_write_fw(struct rtl8169_private *tp, const struct firmware *fw) static void rtl_release_firmware(struct rtl8169_private *tp) { - release_firmware(tp->fw); - tp->fw = NULL; + if (!IS_ERR_OR_NULL(tp->fw)) + release_firmware(tp->fw); + tp->fw = RTL_FIRMWARE_UNKNOWN; } -static int rtl_apply_firmware(struct rtl8169_private *tp, const char *fw_name) +static void rtl_apply_firmware(struct rtl8169_private *tp) { - const struct firmware **fw = &tp->fw; - int rc = !*fw; - - if (rc) { - rc = request_firmware(fw, fw_name, &tp->pci_dev->dev); - if (rc < 0) - goto out; - } + const struct firmware *fw = tp->fw; /* TODO: release firmware once rtl_phy_write_fw signals failures. */ - rtl_phy_write_fw(tp, *fw); -out: - return rc; + if (!IS_ERR_OR_NULL(fw)) + rtl_phy_write_fw(tp, fw); +} + +static void rtl_apply_firmware_cond(struct rtl8169_private *tp, u8 reg, u16 val) +{ + if (rtl_readphy(tp, reg) != val) + netif_warn(tp, hw, tp->dev, "chipset not ready for firmware\n"); + else + rtl_apply_firmware(tp); } static void rtl8169s_hw_phy_config(struct rtl8169_private *tp) @@ -2246,10 +2258,8 @@ static void rtl8168d_1_hw_phy_config(struct rtl8169_private *tp) rtl_writephy(tp, 0x1f, 0x0005); rtl_writephy(tp, 0x05, 0x001b); - if ((rtl_readphy(tp, 0x06) != 0xbf00) || - (rtl_apply_firmware(tp, FIRMWARE_8168D_1) < 0)) { - netif_warn(tp, probe, tp->dev, "unable to apply firmware patch\n"); - } + + rtl_apply_firmware_cond(tp, MII_EXPANSION, 0xbf00); rtl_writephy(tp, 0x1f, 0x0000); } @@ -2351,10 +2361,8 @@ static void rtl8168d_2_hw_phy_config(struct rtl8169_private *tp) rtl_writephy(tp, 0x1f, 0x0005); rtl_writephy(tp, 0x05, 0x001b); - if ((rtl_readphy(tp, 0x06) != 0xb300) || - (rtl_apply_firmware(tp, FIRMWARE_8168D_2) < 0)) { - netif_warn(tp, probe, tp->dev, "unable to apply firmware patch\n"); - } + + rtl_apply_firmware_cond(tp, MII_EXPANSION, 0xb300); rtl_writephy(tp, 0x1f, 0x0000); } @@ -2474,8 +2482,7 @@ static void rtl8105e_hw_phy_config(struct rtl8169_private *tp) rtl_writephy(tp, 0x18, 0x0310); msleep(100); - if (rtl_apply_firmware(tp, FIRMWARE_8105E_1) < 0) - netif_warn(tp, probe, tp->dev, "unable to apply firmware patch\n"); + rtl_apply_firmware(tp); rtl_writephy_batch(tp, phy_reg_init, ARRAY_SIZE(phy_reg_init)); } @@ -3237,6 +3244,8 @@ rtl8169_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) tp->timer.data = (unsigned long) dev; tp->timer.function = rtl8169_phy_timer; + tp->fw = RTL_FIRMWARE_UNKNOWN; + rc = register_netdev(dev); if (rc < 0) goto err_out_msi_4; @@ -3288,10 +3297,10 @@ static void __devexit rtl8169_remove_one(struct pci_dev *pdev) cancel_delayed_work_sync(&tp->task); - rtl_release_firmware(tp); - unregister_netdev(dev); + rtl_release_firmware(tp); + if (pci_dev_run_wake(pdev)) pm_runtime_get_noresume(&pdev->dev); @@ -3303,6 +3312,37 @@ static void __devexit rtl8169_remove_one(struct pci_dev *pdev) pci_set_drvdata(pdev, NULL); } +static void rtl_request_firmware(struct rtl8169_private *tp) +{ + int i; + + /* Return early if the firmware is already loaded / cached. */ + if (!IS_ERR(tp->fw)) + goto out; + + for (i = 0; i < ARRAY_SIZE(rtl_firmware_infos); i++) { + const struct rtl_firmware_info *info = rtl_firmware_infos + i; + + if (info->mac_version == tp->mac_version) { + const char *name = info->fw_name; + int rc; + + rc = request_firmware(&tp->fw, name, &tp->pci_dev->dev); + if (rc < 0) { + netif_warn(tp, ifup, tp->dev, "unable to load " + "firmware patch %s (%d)\n", name, rc); + goto out_disable_request_firmware; + } + goto out; + } + } + +out_disable_request_firmware: + tp->fw = NULL; +out: + return; +} + static int rtl8169_open(struct net_device *dev) { struct rtl8169_private *tp = netdev_priv(dev); @@ -3334,11 +3374,13 @@ static int rtl8169_open(struct net_device *dev) smp_mb(); + rtl_request_firmware(tp); + retval = request_irq(dev->irq, rtl8169_interrupt, (tp->features & RTL_FEATURE_MSI) ? 0 : IRQF_SHARED, dev->name, dev); if (retval < 0) - goto err_release_ring_2; + goto err_release_fw_2; napi_enable(&tp->napi); @@ -3359,7 +3401,8 @@ static int rtl8169_open(struct net_device *dev) out: return retval; -err_release_ring_2: +err_release_fw_2: + rtl_release_firmware(tp); rtl8169_rx_clear(tp); err_free_rx_1: dma_free_coherent(&pdev->dev, R8169_RX_RING_BYTES, tp->RxDescArray, -- cgit v1.1 From 328f5cc30290a92ea3ca62b2a63d2b9ebcb0d334 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 22 Apr 2011 22:02:33 +0200 Subject: ARM: Use struct syscore_ops instead of sysdevs for PM in common code Convert some ARM architecture's common code to using struct syscore_ops objects for power management instead of sysdev classes and sysdevs. This simplifies the code and reduces the kernel's memory footprint. It also is necessary for removing sysdevs from the kernel entirely in the future. Signed-off-by: Rafael J. Wysocki Acked-by: Greg Kroah-Hartman --- arch/arm/common/vic.c | 69 +++++++++++++++------------------------- arch/arm/include/asm/mach/time.h | 1 - arch/arm/kernel/leds.c | 28 +++++++++------- arch/arm/kernel/time.c | 35 +++++++------------- arch/arm/vfp/vfpmodule.c | 19 +++-------- 5 files changed, 58 insertions(+), 94 deletions(-) diff --git a/arch/arm/common/vic.c b/arch/arm/common/vic.c index 113085a..7aa4262 100644 --- a/arch/arm/common/vic.c +++ b/arch/arm/common/vic.c @@ -22,17 +22,16 @@ #include #include #include -#include +#include #include #include #include #include -#if defined(CONFIG_PM) +#ifdef CONFIG_PM /** * struct vic_device - VIC PM device - * @sysdev: The system device which is registered. * @irq: The IRQ number for the base of the VIC. * @base: The register base for the VIC. * @resume_sources: A bitmask of interrupts for resume. @@ -43,8 +42,6 @@ * @protect: Save for VIC_PROTECT. */ struct vic_device { - struct sys_device sysdev; - void __iomem *base; int irq; u32 resume_sources; @@ -59,11 +56,6 @@ struct vic_device { static struct vic_device vic_devices[CONFIG_ARM_VIC_NR]; static int vic_id; - -static inline struct vic_device *to_vic(struct sys_device *sys) -{ - return container_of(sys, struct vic_device, sysdev); -} #endif /* CONFIG_PM */ /** @@ -85,10 +77,9 @@ static void vic_init2(void __iomem *base) writel(32, base + VIC_PL190_DEF_VECT_ADDR); } -#if defined(CONFIG_PM) -static int vic_class_resume(struct sys_device *dev) +#ifdef CONFIG_PM +static void resume_one_vic(struct vic_device *vic) { - struct vic_device *vic = to_vic(dev); void __iomem *base = vic->base; printk(KERN_DEBUG "%s: resuming vic at %p\n", __func__, base); @@ -107,13 +98,18 @@ static int vic_class_resume(struct sys_device *dev) writel(vic->soft_int, base + VIC_INT_SOFT); writel(~vic->soft_int, base + VIC_INT_SOFT_CLEAR); +} - return 0; +static void vic_resume(void) +{ + int id; + + for (id = vic_id - 1; id >= 0; id--) + resume_one_vic(vic_devices + id); } -static int vic_class_suspend(struct sys_device *dev, pm_message_t state) +static void suspend_one_vic(struct vic_device *vic) { - struct vic_device *vic = to_vic(dev); void __iomem *base = vic->base; printk(KERN_DEBUG "%s: suspending vic at %p\n", __func__, base); @@ -128,14 +124,21 @@ static int vic_class_suspend(struct sys_device *dev, pm_message_t state) writel(vic->resume_irqs, base + VIC_INT_ENABLE); writel(~vic->resume_irqs, base + VIC_INT_ENABLE_CLEAR); +} + +static int vic_suspend(void) +{ + int id; + + for (id = 0; id < vic_id; id++) + suspend_one_vic(vic_devices + id); return 0; } -struct sysdev_class vic_class = { - .name = "vic", - .suspend = vic_class_suspend, - .resume = vic_class_resume, +struct syscore_ops vic_syscore_ops = { + .suspend = vic_suspend, + .resume = vic_resume, }; /** @@ -147,30 +150,8 @@ struct sysdev_class vic_class = { */ static int __init vic_pm_init(void) { - struct vic_device *dev = vic_devices; - int err; - int id; - - if (vic_id == 0) - return 0; - - err = sysdev_class_register(&vic_class); - if (err) { - printk(KERN_ERR "%s: cannot register class\n", __func__); - return err; - } - - for (id = 0; id < vic_id; id++, dev++) { - dev->sysdev.id = id; - dev->sysdev.cls = &vic_class; - - err = sysdev_register(&dev->sysdev); - if (err) { - printk(KERN_ERR "%s: failed to register device\n", - __func__); - return err; - } - } + if (vic_id > 0) + register_syscore_ops(&vic_syscore_ops); return 0; } diff --git a/arch/arm/include/asm/mach/time.h b/arch/arm/include/asm/mach/time.h index 883f6be..d5adaae 100644 --- a/arch/arm/include/asm/mach/time.h +++ b/arch/arm/include/asm/mach/time.h @@ -34,7 +34,6 @@ * timer interrupt which may be pending. */ struct sys_timer { - struct sys_device dev; void (*init)(void); void (*suspend)(void); void (*resume)(void); diff --git a/arch/arm/kernel/leds.c b/arch/arm/kernel/leds.c index 31a316c..0f107dc 100644 --- a/arch/arm/kernel/leds.c +++ b/arch/arm/kernel/leds.c @@ -10,6 +10,7 @@ #include #include #include +#include #include @@ -69,36 +70,37 @@ static ssize_t leds_store(struct sys_device *dev, static SYSDEV_ATTR(event, 0200, NULL, leds_store); -static int leds_suspend(struct sys_device *dev, pm_message_t state) +static struct sysdev_class leds_sysclass = { + .name = "leds", +}; + +static struct sys_device leds_device = { + .id = 0, + .cls = &leds_sysclass, +}; + +static int leds_suspend(void) { leds_event(led_stop); return 0; } -static int leds_resume(struct sys_device *dev) +static void leds_resume(void) { leds_event(led_start); - return 0; } -static int leds_shutdown(struct sys_device *dev) +static void leds_shutdown(void) { leds_event(led_halted); - return 0; } -static struct sysdev_class leds_sysclass = { - .name = "leds", +static struct syscore_ops leds_syscore_ops = { .shutdown = leds_shutdown, .suspend = leds_suspend, .resume = leds_resume, }; -static struct sys_device leds_device = { - .id = 0, - .cls = &leds_sysclass, -}; - static int __init leds_init(void) { int ret; @@ -107,6 +109,8 @@ static int __init leds_init(void) ret = sysdev_register(&leds_device); if (ret == 0) ret = sysdev_create_file(&leds_device, &attr_event); + if (ret == 0) + register_syscore_ops(&leds_syscore_ops); return ret; } diff --git a/arch/arm/kernel/time.c b/arch/arm/kernel/time.c index 1ff46ca..cb634c3 100644 --- a/arch/arm/kernel/time.c +++ b/arch/arm/kernel/time.c @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #include @@ -115,48 +115,37 @@ void timer_tick(void) #endif #if defined(CONFIG_PM) && !defined(CONFIG_GENERIC_CLOCKEVENTS) -static int timer_suspend(struct sys_device *dev, pm_message_t state) +static int timer_suspend(void) { - struct sys_timer *timer = container_of(dev, struct sys_timer, dev); - - if (timer->suspend != NULL) - timer->suspend(); + if (system_timer->suspend) + system_timer->suspend(); return 0; } -static int timer_resume(struct sys_device *dev) +static void timer_resume(void) { - struct sys_timer *timer = container_of(dev, struct sys_timer, dev); - - if (timer->resume != NULL) - timer->resume(); - - return 0; + if (system_timer->resume) + system_timer->resume(); } #else #define timer_suspend NULL #define timer_resume NULL #endif -static struct sysdev_class timer_sysclass = { - .name = "timer", +static struct syscore_ops timer_syscore_ops = { .suspend = timer_suspend, .resume = timer_resume, }; -static int __init timer_init_sysfs(void) +static int __init timer_init_syscore_ops(void) { - int ret = sysdev_class_register(&timer_sysclass); - if (ret == 0) { - system_timer->dev.cls = &timer_sysclass; - ret = sysdev_register(&system_timer->dev); - } + register_syscore_ops(&timer_syscore_ops); - return ret; + return 0; } -device_initcall(timer_init_sysfs); +device_initcall(timer_init_syscore_ops); void __init time_init(void) { diff --git a/arch/arm/vfp/vfpmodule.c b/arch/arm/vfp/vfpmodule.c index f746950..f25e7ec 100644 --- a/arch/arm/vfp/vfpmodule.c +++ b/arch/arm/vfp/vfpmodule.c @@ -398,9 +398,9 @@ static void vfp_enable(void *unused) } #ifdef CONFIG_PM -#include +#include -static int vfp_pm_suspend(struct sys_device *dev, pm_message_t state) +static int vfp_pm_suspend(void) { struct thread_info *ti = current_thread_info(); u32 fpexc = fmrx(FPEXC); @@ -420,34 +420,25 @@ static int vfp_pm_suspend(struct sys_device *dev, pm_message_t state) return 0; } -static int vfp_pm_resume(struct sys_device *dev) +static void vfp_pm_resume(void) { /* ensure we have access to the vfp */ vfp_enable(NULL); /* and disable it to ensure the next usage restores the state */ fmxr(FPEXC, fmrx(FPEXC) & ~FPEXC_EN); - - return 0; } -static struct sysdev_class vfp_pm_sysclass = { - .name = "vfp", +static struct syscore_ops vfp_pm_syscore_ops = { .suspend = vfp_pm_suspend, .resume = vfp_pm_resume, }; -static struct sys_device vfp_pm_sysdev = { - .cls = &vfp_pm_sysclass, -}; - static void vfp_pm_init(void) { - sysdev_class_register(&vfp_pm_sysclass); - sysdev_register(&vfp_pm_sysdev); + register_syscore_ops(&vfp_pm_syscore_ops); } - #else static inline void vfp_pm_init(void) { } #endif /* CONFIG_PM */ -- cgit v1.1 From 3c437ffd20329619672b12a97bee944bccdd4ec9 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 22 Apr 2011 22:02:46 +0200 Subject: ARM / OMAP: Use struct syscore_ops for "core" power management Replace the sysdev class and struct sys_device used for power management in the OMAP's GPIO code with a struct syscore_ops object which is simpler. Signed-off-by: Rafael J. Wysocki Acked-by: Kevin Hilman Acked-by: Greg Kroah-Hartman --- arch/arm/plat-omap/gpio.c | 35 +++++++++-------------------------- 1 file changed, 9 insertions(+), 26 deletions(-) diff --git a/arch/arm/plat-omap/gpio.c b/arch/arm/plat-omap/gpio.c index d2adcdd..bd9e321 100644 --- a/arch/arm/plat-omap/gpio.c +++ b/arch/arm/plat-omap/gpio.c @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include #include #include @@ -1372,9 +1372,7 @@ static const struct dev_pm_ops omap_mpuio_dev_pm_ops = { .resume_noirq = omap_mpuio_resume_noirq, }; -/* use platform_driver for this, now that there's no longer any - * point to sys_device (other than not disturbing old code). - */ +/* use platform_driver for this. */ static struct platform_driver omap_mpuio_driver = { .driver = { .name = "mpuio", @@ -1745,7 +1743,7 @@ static int __devinit omap_gpio_probe(struct platform_device *pdev) } #if defined(CONFIG_ARCH_OMAP16XX) || defined(CONFIG_ARCH_OMAP2PLUS) -static int omap_gpio_suspend(struct sys_device *dev, pm_message_t mesg) +static int omap_gpio_suspend(void) { int i; @@ -1795,12 +1793,12 @@ static int omap_gpio_suspend(struct sys_device *dev, pm_message_t mesg) return 0; } -static int omap_gpio_resume(struct sys_device *dev) +static void omap_gpio_resume(void) { int i; if (!cpu_class_is_omap2() && !cpu_is_omap16xx()) - return 0; + return; for (i = 0; i < gpio_bank_count; i++) { struct gpio_bank *bank = &gpio_bank[i]; @@ -1836,21 +1834,13 @@ static int omap_gpio_resume(struct sys_device *dev) __raw_writel(bank->saved_wakeup, wake_set); spin_unlock_irqrestore(&bank->lock, flags); } - - return 0; } -static struct sysdev_class omap_gpio_sysclass = { - .name = "gpio", +static struct syscore_ops omap_gpio_syscore_ops = { .suspend = omap_gpio_suspend, .resume = omap_gpio_resume, }; -static struct sys_device omap_gpio_device = { - .id = 0, - .cls = &omap_gpio_sysclass, -}; - #endif #ifdef CONFIG_ARCH_OMAP2PLUS @@ -2108,21 +2098,14 @@ postcore_initcall(omap_gpio_drv_reg); static int __init omap_gpio_sysinit(void) { - int ret = 0; - mpuio_init(); #if defined(CONFIG_ARCH_OMAP16XX) || defined(CONFIG_ARCH_OMAP2PLUS) - if (cpu_is_omap16xx() || cpu_class_is_omap2()) { - if (ret == 0) { - ret = sysdev_class_register(&omap_gpio_sysclass); - if (ret == 0) - ret = sysdev_register(&omap_gpio_device); - } - } + if (cpu_is_omap16xx() || cpu_class_is_omap2()) + register_syscore_ops(&omap_gpio_syscore_ops); #endif - return ret; + return 0; } arch_initcall(omap_gpio_sysinit); -- cgit v1.1 From b7808056141bc4d67213036921a5a685ebec0274 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 22 Apr 2011 22:02:55 +0200 Subject: ARM / Integrator: Use struct syscore_ops for core PM Replace the sysdev class and struct sys_device used for power management by the Integrator interrupt-handling code with a struct syscore_ops object which is simpler. Signed-off-by: Rafael J. Wysocki Acked-by: Greg Kroah-Hartman --- arch/arm/mach-integrator/integrator_ap.c | 26 +++++++++----------------- 1 file changed, 9 insertions(+), 17 deletions(-) diff --git a/arch/arm/mach-integrator/integrator_ap.c b/arch/arm/mach-integrator/integrator_ap.c index 980803f..d3e9645 100644 --- a/arch/arm/mach-integrator/integrator_ap.c +++ b/arch/arm/mach-integrator/integrator_ap.c @@ -24,7 +24,7 @@ #include #include #include -#include +#include #include #include #include @@ -180,13 +180,13 @@ static void __init ap_init_irq(void) #ifdef CONFIG_PM static unsigned long ic_irq_enable; -static int irq_suspend(struct sys_device *dev, pm_message_t state) +static int irq_suspend(void) { ic_irq_enable = readl(VA_IC_BASE + IRQ_ENABLE); return 0; } -static int irq_resume(struct sys_device *dev) +static void irq_resume(void) { /* disable all irq sources */ writel(-1, VA_CMIC_BASE + IRQ_ENABLE_CLEAR); @@ -194,33 +194,25 @@ static int irq_resume(struct sys_device *dev) writel(-1, VA_IC_BASE + FIQ_ENABLE_CLEAR); writel(ic_irq_enable, VA_IC_BASE + IRQ_ENABLE_SET); - return 0; } #else #define irq_suspend NULL #define irq_resume NULL #endif -static struct sysdev_class irq_class = { - .name = "irq", +static struct syscore_ops irq_syscore_ops = { .suspend = irq_suspend, .resume = irq_resume, }; -static struct sys_device irq_device = { - .id = 0, - .cls = &irq_class, -}; - -static int __init irq_init_sysfs(void) +static int __init irq_syscore_init(void) { - int ret = sysdev_class_register(&irq_class); - if (ret == 0) - ret = sysdev_register(&irq_device); - return ret; + register_syscore_ops(&irq_syscore_ops); + + return 0; } -device_initcall(irq_init_sysfs); +device_initcall(irq_syscore_init); /* * Flash handling. -- cgit v1.1 From 905339807bde7bb726001b69fbdf69ab0cf69a9e Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 22 Apr 2011 22:03:03 +0200 Subject: ARM / SA1100: Use struct syscore_ops for "core" power management Replace the sysdev class and struct sys_device used for power management by the SA1100 interrupt-handling code with a struct syscore_ops object which is simpler. Signed-off-by: Rafael J. Wysocki Acked-by: Greg Kroah-Hartman --- arch/arm/mach-sa1100/irq.c | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/arch/arm/mach-sa1100/irq.c b/arch/arm/mach-sa1100/irq.c index 423ddb3..dfbf824 100644 --- a/arch/arm/mach-sa1100/irq.c +++ b/arch/arm/mach-sa1100/irq.c @@ -14,7 +14,7 @@ #include #include #include -#include +#include #include #include @@ -234,7 +234,7 @@ static struct sa1100irq_state { unsigned int iccr; } sa1100irq_state; -static int sa1100irq_suspend(struct sys_device *dev, pm_message_t state) +static int sa1100irq_suspend(void) { struct sa1100irq_state *st = &sa1100irq_state; @@ -264,7 +264,7 @@ static int sa1100irq_suspend(struct sys_device *dev, pm_message_t state) return 0; } -static int sa1100irq_resume(struct sys_device *dev) +static void sa1100irq_resume(void) { struct sa1100irq_state *st = &sa1100irq_state; @@ -277,24 +277,17 @@ static int sa1100irq_resume(struct sys_device *dev) ICMR = st->icmr; } - return 0; } -static struct sysdev_class sa1100irq_sysclass = { - .name = "sa11x0-irq", +static struct syscore_ops sa1100irq_syscore_ops = { .suspend = sa1100irq_suspend, .resume = sa1100irq_resume, }; -static struct sys_device sa1100irq_device = { - .id = 0, - .cls = &sa1100irq_sysclass, -}; - static int __init sa1100irq_init_devicefs(void) { - sysdev_class_register(&sa1100irq_sysclass); - return sysdev_register(&sa1100irq_device); + register_syscore_ops(&sa1100irq_syscore_ops); + return 0; } device_initcall(sa1100irq_init_devicefs); -- cgit v1.1 From 2eaa03b5bebd1e80014f780d7bf27c3e66daefd6 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 22 Apr 2011 22:03:11 +0200 Subject: ARM / PXA: Use struct syscore_ops for "core" power management Replace sysdev classes and struct sys_device objects used for "core" power management by the PXA platform code with struct syscore_ops objects that are simpler. This reduces the code size and the kernel memory footprint. It also is necessary for removing sysdevs entirely from the kernel in the future. Signed-off-by: Rafael J. Wysocki Acked-by: Greg Kroah-Hartman --- arch/arm/mach-pxa/balloon3.c | 1 - arch/arm/mach-pxa/clock-pxa2xx.c | 18 +++---------- arch/arm/mach-pxa/clock-pxa3xx.c | 17 +++--------- arch/arm/mach-pxa/clock.h | 7 ++--- arch/arm/mach-pxa/cm-x270.c | 1 - arch/arm/mach-pxa/cm-x2xx.c | 23 +++++------------ arch/arm/mach-pxa/colibri-evalboard.c | 1 - arch/arm/mach-pxa/colibri-pxa270-income.c | 1 - arch/arm/mach-pxa/colibri-pxa270.c | 1 - arch/arm/mach-pxa/generic.h | 8 +++--- arch/arm/mach-pxa/irq.c | 17 +++--------- arch/arm/mach-pxa/lpd270.c | 20 +++++--------- arch/arm/mach-pxa/lubbock.c | 21 +++++---------- arch/arm/mach-pxa/mainstone.c | 22 +++++----------- arch/arm/mach-pxa/mfp-pxa2xx.c | 12 ++++----- arch/arm/mach-pxa/mfp-pxa3xx.c | 21 ++++----------- arch/arm/mach-pxa/mioa701.c | 43 ++++++------------------------- arch/arm/mach-pxa/palmld.c | 1 - arch/arm/mach-pxa/palmtreo.c | 1 - arch/arm/mach-pxa/palmz72.c | 24 ++++++----------- arch/arm/mach-pxa/pxa25x.c | 25 +++++------------- arch/arm/mach-pxa/pxa27x.c | 25 +++++------------- arch/arm/mach-pxa/pxa3xx.c | 25 +++++------------- arch/arm/mach-pxa/pxa95x.c | 20 +++----------- arch/arm/mach-pxa/raumfeld.c | 1 - arch/arm/mach-pxa/smemc.c | 29 +++++---------------- arch/arm/mach-pxa/trizeps4.c | 1 - arch/arm/mach-pxa/viper.c | 12 ++++----- arch/arm/mach-pxa/vpac270.c | 1 - arch/arm/plat-pxa/gpio.c | 17 +++--------- arch/arm/plat-pxa/mfp.c | 1 - 31 files changed, 110 insertions(+), 307 deletions(-) diff --git a/arch/arm/mach-pxa/balloon3.c b/arch/arm/mach-pxa/balloon3.c index bfbecec..810a982 100644 --- a/arch/arm/mach-pxa/balloon3.c +++ b/arch/arm/mach-pxa/balloon3.c @@ -15,7 +15,6 @@ #include #include -#include #include #include #include diff --git a/arch/arm/mach-pxa/clock-pxa2xx.c b/arch/arm/mach-pxa/clock-pxa2xx.c index 1ce0904..1d5859d 100644 --- a/arch/arm/mach-pxa/clock-pxa2xx.c +++ b/arch/arm/mach-pxa/clock-pxa2xx.c @@ -9,7 +9,7 @@ #include #include #include -#include +#include #include @@ -33,32 +33,22 @@ const struct clkops clk_pxa2xx_cken_ops = { #ifdef CONFIG_PM static uint32_t saved_cken; -static int pxa2xx_clock_suspend(struct sys_device *d, pm_message_t state) +static int pxa2xx_clock_suspend(void) { saved_cken = CKEN; return 0; } -static int pxa2xx_clock_resume(struct sys_device *d) +static void pxa2xx_clock_resume(void) { CKEN = saved_cken; - return 0; } #else #define pxa2xx_clock_suspend NULL #define pxa2xx_clock_resume NULL #endif -struct sysdev_class pxa2xx_clock_sysclass = { - .name = "pxa2xx-clock", +struct syscore_ops pxa2xx_clock_syscore_ops = { .suspend = pxa2xx_clock_suspend, .resume = pxa2xx_clock_resume, }; - -static int __init pxa2xx_clock_init(void) -{ - if (cpu_is_pxa2xx()) - return sysdev_class_register(&pxa2xx_clock_sysclass); - return 0; -} -postcore_initcall(pxa2xx_clock_init); diff --git a/arch/arm/mach-pxa/clock-pxa3xx.c b/arch/arm/mach-pxa/clock-pxa3xx.c index 3f864cd..2a37a9a 100644 --- a/arch/arm/mach-pxa/clock-pxa3xx.c +++ b/arch/arm/mach-pxa/clock-pxa3xx.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -182,7 +183,7 @@ const struct clkops clk_pxa3xx_pout_ops = { static uint32_t cken[2]; static uint32_t accr; -static int pxa3xx_clock_suspend(struct sys_device *d, pm_message_t state) +static int pxa3xx_clock_suspend(void) { cken[0] = CKENA; cken[1] = CKENB; @@ -190,28 +191,18 @@ static int pxa3xx_clock_suspend(struct sys_device *d, pm_message_t state) return 0; } -static int pxa3xx_clock_resume(struct sys_device *d) +static void pxa3xx_clock_resume(void) { ACCR = accr; CKENA = cken[0]; CKENB = cken[1]; - return 0; } #else #define pxa3xx_clock_suspend NULL #define pxa3xx_clock_resume NULL #endif -struct sysdev_class pxa3xx_clock_sysclass = { - .name = "pxa3xx-clock", +struct syscore_ops pxa3xx_clock_syscore_ops = { .suspend = pxa3xx_clock_suspend, .resume = pxa3xx_clock_resume, }; - -static int __init pxa3xx_clock_init(void) -{ - if (cpu_is_pxa3xx() || cpu_is_pxa95x()) - return sysdev_class_register(&pxa3xx_clock_sysclass); - return 0; -} -postcore_initcall(pxa3xx_clock_init); diff --git a/arch/arm/mach-pxa/clock.h b/arch/arm/mach-pxa/clock.h index f9f349a..1f2fb9c 100644 --- a/arch/arm/mach-pxa/clock.h +++ b/arch/arm/mach-pxa/clock.h @@ -1,5 +1,5 @@ #include -#include +#include struct clkops { void (*enable)(struct clk *); @@ -54,7 +54,7 @@ extern const struct clkops clk_pxa2xx_cken_ops; void clk_pxa2xx_cken_enable(struct clk *clk); void clk_pxa2xx_cken_disable(struct clk *clk); -extern struct sysdev_class pxa2xx_clock_sysclass; +extern struct syscore_ops pxa2xx_clock_syscore_ops; #if defined(CONFIG_PXA3xx) || defined(CONFIG_PXA95x) #define DEFINE_PXA3_CKEN(_name, _cken, _rate, _delay) \ @@ -74,5 +74,6 @@ extern const struct clkops clk_pxa3xx_smemc_ops; extern void clk_pxa3xx_cken_enable(struct clk *); extern void clk_pxa3xx_cken_disable(struct clk *); -extern struct sysdev_class pxa3xx_clock_sysclass; +extern struct syscore_ops pxa3xx_clock_syscore_ops; + #endif diff --git a/arch/arm/mach-pxa/cm-x270.c b/arch/arm/mach-pxa/cm-x270.c index b88d601a..13518a7 100644 --- a/arch/arm/mach-pxa/cm-x270.c +++ b/arch/arm/mach-pxa/cm-x270.c @@ -10,7 +10,6 @@ */ #include -#include #include #include #include diff --git a/arch/arm/mach-pxa/cm-x2xx.c b/arch/arm/mach-pxa/cm-x2xx.c index 8225e2e..a109967 100644 --- a/arch/arm/mach-pxa/cm-x2xx.c +++ b/arch/arm/mach-pxa/cm-x2xx.c @@ -10,7 +10,7 @@ */ #include -#include +#include #include #include @@ -388,7 +388,7 @@ static inline void cmx2xx_init_display(void) {} #ifdef CONFIG_PM static unsigned long sleep_save_msc[10]; -static int cmx2xx_suspend(struct sys_device *dev, pm_message_t state) +static int cmx2xx_suspend(void) { cmx2xx_pci_suspend(); @@ -412,7 +412,7 @@ static int cmx2xx_suspend(struct sys_device *dev, pm_message_t state) return 0; } -static int cmx2xx_resume(struct sys_device *dev) +static void cmx2xx_resume(void) { cmx2xx_pci_resume(); @@ -420,27 +420,18 @@ static int cmx2xx_resume(struct sys_device *dev) __raw_writel(sleep_save_msc[0], MSC0); __raw_writel(sleep_save_msc[1], MSC1); __raw_writel(sleep_save_msc[2], MSC2); - - return 0; } -static struct sysdev_class cmx2xx_pm_sysclass = { - .name = "pm", +static struct syscore_ops cmx2xx_pm_syscore_ops = { .resume = cmx2xx_resume, .suspend = cmx2xx_suspend, }; -static struct sys_device cmx2xx_pm_device = { - .cls = &cmx2xx_pm_sysclass, -}; - static int __init cmx2xx_pm_init(void) { - int error; - error = sysdev_class_register(&cmx2xx_pm_sysclass); - if (error == 0) - error = sysdev_register(&cmx2xx_pm_device); - return error; + register_syscore_ops(&cmx2xx_pm_syscore_ops); + + return 0; } #else static int __init cmx2xx_pm_init(void) { return 0; } diff --git a/arch/arm/mach-pxa/colibri-evalboard.c b/arch/arm/mach-pxa/colibri-evalboard.c index 81c3c43..d28e802 100644 --- a/arch/arm/mach-pxa/colibri-evalboard.c +++ b/arch/arm/mach-pxa/colibri-evalboard.c @@ -13,7 +13,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/arm/mach-pxa/colibri-pxa270-income.c b/arch/arm/mach-pxa/colibri-pxa270-income.c index 44c1b77..80538b8 100644 --- a/arch/arm/mach-pxa/colibri-pxa270-income.c +++ b/arch/arm/mach-pxa/colibri-pxa270-income.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include diff --git a/arch/arm/mach-pxa/colibri-pxa270.c b/arch/arm/mach-pxa/colibri-pxa270.c index 6fc5d32..7545a48 100644 --- a/arch/arm/mach-pxa/colibri-pxa270.c +++ b/arch/arm/mach-pxa/colibri-pxa270.c @@ -17,7 +17,6 @@ #include #include #include -#include #include #include diff --git a/arch/arm/mach-pxa/generic.h b/arch/arm/mach-pxa/generic.h index a079d8b..e6c9344 100644 --- a/arch/arm/mach-pxa/generic.h +++ b/arch/arm/mach-pxa/generic.h @@ -61,10 +61,10 @@ extern unsigned pxa3xx_get_clk_frequency_khz(int); #define pxa3xx_get_clk_frequency_khz(x) (0) #endif -extern struct sysdev_class pxa_irq_sysclass; -extern struct sysdev_class pxa_gpio_sysclass; -extern struct sysdev_class pxa2xx_mfp_sysclass; -extern struct sysdev_class pxa3xx_mfp_sysclass; +extern struct syscore_ops pxa_irq_syscore_ops; +extern struct syscore_ops pxa_gpio_syscore_ops; +extern struct syscore_ops pxa2xx_mfp_syscore_ops; +extern struct syscore_ops pxa3xx_mfp_syscore_ops; void __init pxa_set_ffuart_info(void *info); void __init pxa_set_btuart_info(void *info); diff --git a/arch/arm/mach-pxa/irq.c b/arch/arm/mach-pxa/irq.c index 6251e3f..32ed551 100644 --- a/arch/arm/mach-pxa/irq.c +++ b/arch/arm/mach-pxa/irq.c @@ -15,7 +15,7 @@ #include #include #include -#include +#include #include #include @@ -183,7 +183,7 @@ void __init pxa_init_irq(int irq_nr, set_wake_t fn) static unsigned long saved_icmr[MAX_INTERNAL_IRQS/32]; static unsigned long saved_ipr[MAX_INTERNAL_IRQS]; -static int pxa_irq_suspend(struct sys_device *dev, pm_message_t state) +static int pxa_irq_suspend(void) { int i; @@ -202,7 +202,7 @@ static int pxa_irq_suspend(struct sys_device *dev, pm_message_t state) return 0; } -static int pxa_irq_resume(struct sys_device *dev) +static void pxa_irq_resume(void) { int i; @@ -218,22 +218,13 @@ static int pxa_irq_resume(struct sys_device *dev) __raw_writel(saved_ipr[i], IRQ_BASE + IPR(i)); __raw_writel(1, IRQ_BASE + ICCR); - return 0; } #else #define pxa_irq_suspend NULL #define pxa_irq_resume NULL #endif -struct sysdev_class pxa_irq_sysclass = { - .name = "irq", +struct syscore_ops pxa_irq_syscore_ops = { .suspend = pxa_irq_suspend, .resume = pxa_irq_resume, }; - -static int __init pxa_irq_init(void) -{ - return sysdev_class_register(&pxa_irq_sysclass); -} - -core_initcall(pxa_irq_init); diff --git a/arch/arm/mach-pxa/lpd270.c b/arch/arm/mach-pxa/lpd270.c index f5de541..6cf8180 100644 --- a/arch/arm/mach-pxa/lpd270.c +++ b/arch/arm/mach-pxa/lpd270.c @@ -15,7 +15,7 @@ #include #include -#include +#include #include #include #include @@ -159,30 +159,22 @@ static void __init lpd270_init_irq(void) #ifdef CONFIG_PM -static int lpd270_irq_resume(struct sys_device *dev) +static void lpd270_irq_resume(void) { __raw_writew(lpd270_irq_enabled, LPD270_INT_MASK); - return 0; } -static struct sysdev_class lpd270_irq_sysclass = { - .name = "cpld_irq", +static struct syscore_ops lpd270_irq_syscore_ops = { .resume = lpd270_irq_resume, }; -static struct sys_device lpd270_irq_device = { - .cls = &lpd270_irq_sysclass, -}; - static int __init lpd270_irq_device_init(void) { - int ret = -ENODEV; if (machine_is_logicpd_pxa270()) { - ret = sysdev_class_register(&lpd270_irq_sysclass); - if (ret == 0) - ret = sysdev_register(&lpd270_irq_device); + register_syscore_ops(&lpd270_irq_syscore_ops); + return 0; } - return ret; + return -ENODEV; } device_initcall(lpd270_irq_device_init); diff --git a/arch/arm/mach-pxa/lubbock.c b/arch/arm/mach-pxa/lubbock.c index 3ede978..e10ddb8 100644 --- a/arch/arm/mach-pxa/lubbock.c +++ b/arch/arm/mach-pxa/lubbock.c @@ -15,7 +15,7 @@ #include #include #include -#include +#include #include #include #include @@ -176,31 +176,22 @@ static void __init lubbock_init_irq(void) #ifdef CONFIG_PM -static int lubbock_irq_resume(struct sys_device *dev) +static void lubbock_irq_resume(void) { LUB_IRQ_MASK_EN = lubbock_irq_enabled; - return 0; } -static struct sysdev_class lubbock_irq_sysclass = { - .name = "cpld_irq", +static struct syscore_ops lubbock_irq_syscore_ops = { .resume = lubbock_irq_resume, }; -static struct sys_device lubbock_irq_device = { - .cls = &lubbock_irq_sysclass, -}; - static int __init lubbock_irq_device_init(void) { - int ret = -ENODEV; - if (machine_is_lubbock()) { - ret = sysdev_class_register(&lubbock_irq_sysclass); - if (ret == 0) - ret = sysdev_register(&lubbock_irq_device); + register_syscore_ops(&lubbock_irq_syscore_ops); + return 0; } - return ret; + return -ENODEV; } device_initcall(lubbock_irq_device_init); diff --git a/arch/arm/mach-pxa/mainstone.c b/arch/arm/mach-pxa/mainstone.c index 95163ba..3479e2b 100644 --- a/arch/arm/mach-pxa/mainstone.c +++ b/arch/arm/mach-pxa/mainstone.c @@ -15,7 +15,7 @@ #include #include -#include +#include #include #include #include @@ -185,31 +185,21 @@ static void __init mainstone_init_irq(void) #ifdef CONFIG_PM -static int mainstone_irq_resume(struct sys_device *dev) +static void mainstone_irq_resume(void) { MST_INTMSKENA = mainstone_irq_enabled; - return 0; } -static struct sysdev_class mainstone_irq_sysclass = { - .name = "cpld_irq", +static struct syscore_ops mainstone_irq_syscore_ops = { .resume = mainstone_irq_resume, }; -static struct sys_device mainstone_irq_device = { - .cls = &mainstone_irq_sysclass, -}; - static int __init mainstone_irq_device_init(void) { - int ret = -ENODEV; + if (machine_is_mainstone()) + register_syscore_ops(&mainstone_irq_syscore_ops); - if (machine_is_mainstone()) { - ret = sysdev_class_register(&mainstone_irq_sysclass); - if (ret == 0) - ret = sysdev_register(&mainstone_irq_device); - } - return ret; + return 0; } device_initcall(mainstone_irq_device_init); diff --git a/arch/arm/mach-pxa/mfp-pxa2xx.c b/arch/arm/mach-pxa/mfp-pxa2xx.c index 1d1419b..87ae312 100644 --- a/arch/arm/mach-pxa/mfp-pxa2xx.c +++ b/arch/arm/mach-pxa/mfp-pxa2xx.c @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #include @@ -338,7 +338,7 @@ static unsigned long saved_gafr[2][4]; static unsigned long saved_gpdr[4]; static unsigned long saved_pgsr[4]; -static int pxa2xx_mfp_suspend(struct sys_device *d, pm_message_t state) +static int pxa2xx_mfp_suspend(void) { int i; @@ -365,7 +365,7 @@ static int pxa2xx_mfp_suspend(struct sys_device *d, pm_message_t state) return 0; } -static int pxa2xx_mfp_resume(struct sys_device *d) +static void pxa2xx_mfp_resume(void) { int i; @@ -376,15 +376,13 @@ static int pxa2xx_mfp_resume(struct sys_device *d) PGSR(i) = saved_pgsr[i]; } PSSR = PSSR_RDH | PSSR_PH; - return 0; } #else #define pxa2xx_mfp_suspend NULL #define pxa2xx_mfp_resume NULL #endif -struct sysdev_class pxa2xx_mfp_sysclass = { - .name = "mfp", +struct syscore_ops pxa2xx_mfp_syscore_ops = { .suspend = pxa2xx_mfp_suspend, .resume = pxa2xx_mfp_resume, }; @@ -409,6 +407,6 @@ static int __init pxa2xx_mfp_init(void) for (i = 0; i <= gpio_to_bank(pxa_last_gpio); i++) gpdr_lpm[i] = GPDR(i * 32); - return sysdev_class_register(&pxa2xx_mfp_sysclass); + return 0; } postcore_initcall(pxa2xx_mfp_init); diff --git a/arch/arm/mach-pxa/mfp-pxa3xx.c b/arch/arm/mach-pxa/mfp-pxa3xx.c index 7a270ee..89863a0 100644 --- a/arch/arm/mach-pxa/mfp-pxa3xx.c +++ b/arch/arm/mach-pxa/mfp-pxa3xx.c @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include #include @@ -31,13 +31,13 @@ * a pull-down mode if they're an active low chip select, and we're * just entering standby. */ -static int pxa3xx_mfp_suspend(struct sys_device *d, pm_message_t state) +static int pxa3xx_mfp_suspend(void) { mfp_config_lpm(); return 0; } -static int pxa3xx_mfp_resume(struct sys_device *d) +static void pxa3xx_mfp_resume(void) { mfp_config_run(); @@ -47,24 +47,13 @@ static int pxa3xx_mfp_resume(struct sys_device *d) * preserve them here in case they will be referenced later */ ASCR &= ~(ASCR_RDH | ASCR_D1S | ASCR_D2S | ASCR_D3S); - return 0; } #else #define pxa3xx_mfp_suspend NULL #define pxa3xx_mfp_resume NULL #endif -struct sysdev_class pxa3xx_mfp_sysclass = { - .name = "mfp", +struct syscore_ops pxa3xx_mfp_syscore_ops = { .suspend = pxa3xx_mfp_suspend, - .resume = pxa3xx_mfp_resume, + .resume = pxa3xx_mfp_resume, }; - -static int __init mfp_init_devicefs(void) -{ - if (cpu_is_pxa3xx()) - return sysdev_class_register(&pxa3xx_mfp_sysclass); - - return 0; -} -postcore_initcall(mfp_init_devicefs); diff --git a/arch/arm/mach-pxa/mioa701.c b/arch/arm/mach-pxa/mioa701.c index 23925db..e347013 100644 --- a/arch/arm/mach-pxa/mioa701.c +++ b/arch/arm/mach-pxa/mioa701.c @@ -22,7 +22,7 @@ #include #include #include -#include +#include #include #include #include @@ -488,7 +488,7 @@ static void install_bootstrap(void) } -static int mioa701_sys_suspend(struct sys_device *sysdev, pm_message_t state) +static int mioa701_sys_suspend(void) { int i = 0, is_bt_on; u32 *mem_resume_vector = phys_to_virt(RESUME_VECTOR_ADDR); @@ -514,7 +514,7 @@ static int mioa701_sys_suspend(struct sys_device *sysdev, pm_message_t state) return 0; } -static int mioa701_sys_resume(struct sys_device *sysdev) +static void mioa701_sys_resume(void) { int i = 0; u32 *mem_resume_vector = phys_to_virt(RESUME_VECTOR_ADDR); @@ -527,43 +527,18 @@ static int mioa701_sys_resume(struct sys_device *sysdev) *mem_resume_enabler = save_buffer[i++]; *mem_resume_bt = save_buffer[i++]; *mem_resume_unknown = save_buffer[i++]; - - return 0; } -static struct sysdev_class mioa701_sysclass = { - .name = "mioa701", -}; - -static struct sys_device sysdev_bootstrap = { - .cls = &mioa701_sysclass, -}; - -static struct sysdev_driver driver_bootstrap = { - .suspend = &mioa701_sys_suspend, - .resume = &mioa701_sys_resume, +static struct syscore_ops mioa701_syscore_ops = { + .suspend = mioa701_sys_suspend, + .resume = mioa701_sys_resume, }; static int __init bootstrap_init(void) { - int rc; int save_size = mioa701_bootstrap_lg + (sizeof(u32) * 3); - rc = sysdev_class_register(&mioa701_sysclass); - if (rc) { - printk(KERN_ERR "Failed registering mioa701 sys class\n"); - return -ENODEV; - } - rc = sysdev_register(&sysdev_bootstrap); - if (rc) { - printk(KERN_ERR "Failed registering mioa701 sys device\n"); - return -ENODEV; - } - rc = sysdev_driver_register(&mioa701_sysclass, &driver_bootstrap); - if (rc) { - printk(KERN_ERR "Failed registering PMU sys driver\n"); - return -ENODEV; - } + register_syscore_ops(&mioa701_syscore_ops); save_buffer = kmalloc(save_size, GFP_KERNEL); if (!save_buffer) @@ -576,9 +551,7 @@ static int __init bootstrap_init(void) static void bootstrap_exit(void) { kfree(save_buffer); - sysdev_driver_unregister(&mioa701_sysclass, &driver_bootstrap); - sysdev_unregister(&sysdev_bootstrap); - sysdev_class_unregister(&mioa701_sysclass); + unregister_syscore_ops(&mioa701_syscore_ops); printk(KERN_CRIT "Unregistering mioa701 suspend will hang next" "resume !!!\n"); diff --git a/arch/arm/mach-pxa/palmld.c b/arch/arm/mach-pxa/palmld.c index a6f898c..4061ecd 100644 --- a/arch/arm/mach-pxa/palmld.c +++ b/arch/arm/mach-pxa/palmld.c @@ -24,7 +24,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/arm/mach-pxa/palmtreo.c b/arch/arm/mach-pxa/palmtreo.c index 8aadad5..20d1b18 100644 --- a/arch/arm/mach-pxa/palmtreo.c +++ b/arch/arm/mach-pxa/palmtreo.c @@ -25,7 +25,6 @@ #include #include #include -#include #include #include diff --git a/arch/arm/mach-pxa/palmz72.c b/arch/arm/mach-pxa/palmz72.c index 3b8a4f3..65f24f0 100644 --- a/arch/arm/mach-pxa/palmz72.c +++ b/arch/arm/mach-pxa/palmz72.c @@ -19,7 +19,7 @@ */ #include -#include +#include #include #include #include @@ -233,9 +233,9 @@ static struct palmz72_resume_info palmz72_resume_info = { static unsigned long store_ptr; -/* sys_device for Palm Zire 72 PM */ +/* syscore_ops for Palm Zire 72 PM */ -static int palmz72_pm_suspend(struct sys_device *dev, pm_message_t msg) +static int palmz72_pm_suspend(void) { /* setup the resume_info struct for the original bootloader */ palmz72_resume_info.resume_addr = (u32) cpu_resume; @@ -249,31 +249,23 @@ static int palmz72_pm_suspend(struct sys_device *dev, pm_message_t msg) return 0; } -static int palmz72_pm_resume(struct sys_device *dev) +static void palmz72_pm_resume(void) { *PALMZ72_SAVE_DWORD = store_ptr; - return 0; } -static struct sysdev_class palmz72_pm_sysclass = { - .name = "palmz72_pm", +static struct syscore_ops palmz72_pm_syscore_ops = { .suspend = palmz72_pm_suspend, .resume = palmz72_pm_resume, }; -static struct sys_device palmz72_pm_device = { - .cls = &palmz72_pm_sysclass, -}; - static int __init palmz72_pm_init(void) { - int ret = -ENODEV; if (machine_is_palmz72()) { - ret = sysdev_class_register(&palmz72_pm_sysclass); - if (ret == 0) - ret = sysdev_register(&palmz72_pm_device); + register_syscore_ops(&palmz72_pm_syscore_ops); + return 0; } - return ret; + return -ENODEV; } device_initcall(palmz72_pm_init); diff --git a/arch/arm/mach-pxa/pxa25x.c b/arch/arm/mach-pxa/pxa25x.c index a4af8c5..fed363c 100644 --- a/arch/arm/mach-pxa/pxa25x.c +++ b/arch/arm/mach-pxa/pxa25x.c @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #include @@ -350,21 +350,9 @@ static struct platform_device *pxa25x_devices[] __initdata = { &pxa_device_asoc_platform, }; -static struct sys_device pxa25x_sysdev[] = { - { - .cls = &pxa_irq_sysclass, - }, { - .cls = &pxa2xx_mfp_sysclass, - }, { - .cls = &pxa_gpio_sysclass, - }, { - .cls = &pxa2xx_clock_sysclass, - } -}; - static int __init pxa25x_init(void) { - int i, ret = 0; + int ret = 0; if (cpu_is_pxa25x()) { @@ -377,11 +365,10 @@ static int __init pxa25x_init(void) pxa25x_init_pm(); - for (i = 0; i < ARRAY_SIZE(pxa25x_sysdev); i++) { - ret = sysdev_register(&pxa25x_sysdev[i]); - if (ret) - pr_err("failed to register sysdev[%d]\n", i); - } + register_syscore_ops(&pxa_irq_syscore_ops); + register_syscore_ops(&pxa2xx_mfp_syscore_ops); + register_syscore_ops(&pxa_gpio_syscore_ops); + register_syscore_ops(&pxa2xx_clock_syscore_ops); ret = platform_add_devices(pxa25x_devices, ARRAY_SIZE(pxa25x_devices)); diff --git a/arch/arm/mach-pxa/pxa27x.c b/arch/arm/mach-pxa/pxa27x.c index 909756e..2fecbec 100644 --- a/arch/arm/mach-pxa/pxa27x.c +++ b/arch/arm/mach-pxa/pxa27x.c @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #include #include @@ -428,21 +428,9 @@ static struct platform_device *devices[] __initdata = { &pxa27x_device_pwm1, }; -static struct sys_device pxa27x_sysdev[] = { - { - .cls = &pxa_irq_sysclass, - }, { - .cls = &pxa2xx_mfp_sysclass, - }, { - .cls = &pxa_gpio_sysclass, - }, { - .cls = &pxa2xx_clock_sysclass, - } -}; - static int __init pxa27x_init(void) { - int i, ret = 0; + int ret = 0; if (cpu_is_pxa27x()) { @@ -455,11 +443,10 @@ static int __init pxa27x_init(void) pxa27x_init_pm(); - for (i = 0; i < ARRAY_SIZE(pxa27x_sysdev); i++) { - ret = sysdev_register(&pxa27x_sysdev[i]); - if (ret) - pr_err("failed to register sysdev[%d]\n", i); - } + register_syscore_ops(&pxa_irq_syscore_ops); + register_syscore_ops(&pxa2xx_mfp_syscore_ops); + register_syscore_ops(&pxa_gpio_syscore_ops); + register_syscore_ops(&pxa2xx_clock_syscore_ops); ret = platform_add_devices(devices, ARRAY_SIZE(devices)); } diff --git a/arch/arm/mach-pxa/pxa3xx.c b/arch/arm/mach-pxa/pxa3xx.c index 8dd1073..8521d7d 100644 --- a/arch/arm/mach-pxa/pxa3xx.c +++ b/arch/arm/mach-pxa/pxa3xx.c @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include #include @@ -427,21 +427,9 @@ static struct platform_device *devices[] __initdata = { &pxa27x_device_pwm1, }; -static struct sys_device pxa3xx_sysdev[] = { - { - .cls = &pxa_irq_sysclass, - }, { - .cls = &pxa3xx_mfp_sysclass, - }, { - .cls = &pxa_gpio_sysclass, - }, { - .cls = &pxa3xx_clock_sysclass, - } -}; - static int __init pxa3xx_init(void) { - int i, ret = 0; + int ret = 0; if (cpu_is_pxa3xx()) { @@ -462,11 +450,10 @@ static int __init pxa3xx_init(void) pxa3xx_init_pm(); - for (i = 0; i < ARRAY_SIZE(pxa3xx_sysdev); i++) { - ret = sysdev_register(&pxa3xx_sysdev[i]); - if (ret) - pr_err("failed to register sysdev[%d]\n", i); - } + register_syscore_ops(&pxa_irq_syscore_ops); + register_syscore_ops(&pxa3xx_mfp_syscore_ops); + register_syscore_ops(&pxa_gpio_syscore_ops); + register_syscore_ops(&pxa3xx_clock_syscore_ops); ret = platform_add_devices(devices, ARRAY_SIZE(devices)); } diff --git a/arch/arm/mach-pxa/pxa95x.c b/arch/arm/mach-pxa/pxa95x.c index 23b229b..ecc82a3 100644 --- a/arch/arm/mach-pxa/pxa95x.c +++ b/arch/arm/mach-pxa/pxa95x.c @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include #include @@ -260,16 +260,6 @@ static struct platform_device *devices[] __initdata = { &pxa27x_device_pwm1, }; -static struct sys_device pxa95x_sysdev[] = { - { - .cls = &pxa_irq_sysclass, - }, { - .cls = &pxa_gpio_sysclass, - }, { - .cls = &pxa3xx_clock_sysclass, - } -}; - static int __init pxa95x_init(void) { int ret = 0, i; @@ -293,11 +283,9 @@ static int __init pxa95x_init(void) if ((ret = pxa_init_dma(IRQ_DMA, 32))) return ret; - for (i = 0; i < ARRAY_SIZE(pxa95x_sysdev); i++) { - ret = sysdev_register(&pxa95x_sysdev[i]); - if (ret) - pr_err("failed to register sysdev[%d]\n", i); - } + register_syscore_ops(&pxa_irq_syscore_ops); + register_syscore_ops(&pxa_gpio_syscore_ops); + register_syscore_ops(&pxa3xx_clock_syscore_ops); ret = platform_add_devices(devices, ARRAY_SIZE(devices)); } diff --git a/arch/arm/mach-pxa/raumfeld.c b/arch/arm/mach-pxa/raumfeld.c index cd18613..d130f77 100644 --- a/arch/arm/mach-pxa/raumfeld.c +++ b/arch/arm/mach-pxa/raumfeld.c @@ -18,7 +18,6 @@ #include #include -#include #include #include #include diff --git a/arch/arm/mach-pxa/smemc.c b/arch/arm/mach-pxa/smemc.c index 232b731..7992305 100644 --- a/arch/arm/mach-pxa/smemc.c +++ b/arch/arm/mach-pxa/smemc.c @@ -6,7 +6,7 @@ #include #include #include -#include +#include #include #include @@ -16,7 +16,7 @@ static unsigned long msc[2]; static unsigned long sxcnfg, memclkcfg; static unsigned long csadrcfg[4]; -static int pxa3xx_smemc_suspend(struct sys_device *dev, pm_message_t state) +static int pxa3xx_smemc_suspend(void) { msc[0] = __raw_readl(MSC0); msc[1] = __raw_readl(MSC1); @@ -30,7 +30,7 @@ static int pxa3xx_smemc_suspend(struct sys_device *dev, pm_message_t state) return 0; } -static int pxa3xx_smemc_resume(struct sys_device *dev) +static void pxa3xx_smemc_resume(void) { __raw_writel(msc[0], MSC0); __raw_writel(msc[1], MSC1); @@ -40,34 +40,19 @@ static int pxa3xx_smemc_resume(struct sys_device *dev) __raw_writel(csadrcfg[1], CSADRCFG1); __raw_writel(csadrcfg[2], CSADRCFG2); __raw_writel(csadrcfg[3], CSADRCFG3); - - return 0; } -static struct sysdev_class smemc_sysclass = { - .name = "smemc", +static struct syscore_ops smemc_syscore_ops = { .suspend = pxa3xx_smemc_suspend, .resume = pxa3xx_smemc_resume, }; -static struct sys_device smemc_sysdev = { - .id = 0, - .cls = &smemc_sysclass, -}; - static int __init smemc_init(void) { - int ret = 0; + if (cpu_is_pxa3xx()) + register_syscore_ops(&smemc_syscore_ops); - if (cpu_is_pxa3xx()) { - ret = sysdev_class_register(&smemc_sysclass); - if (ret) - return ret; - - ret = sysdev_register(&smemc_sysdev); - } - - return ret; + return 0; } subsys_initcall(smemc_init); #endif diff --git a/arch/arm/mach-pxa/trizeps4.c b/arch/arm/mach-pxa/trizeps4.c index b9cfbeb..687417a 100644 --- a/arch/arm/mach-pxa/trizeps4.c +++ b/arch/arm/mach-pxa/trizeps4.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/arm/mach-pxa/viper.c b/arch/arm/mach-pxa/viper.c index b523f11..903218e 100644 --- a/arch/arm/mach-pxa/viper.c +++ b/arch/arm/mach-pxa/viper.c @@ -44,6 +44,7 @@ #include #include #include +#include #include #include @@ -130,20 +131,19 @@ static u8 viper_hw_version(void) return v1; } -/* CPU sysdev */ -static int viper_cpu_suspend(struct sys_device *sysdev, pm_message_t state) +/* CPU system core operations. */ +static int viper_cpu_suspend(void) { viper_icr_set_bit(VIPER_ICR_R_DIS); return 0; } -static int viper_cpu_resume(struct sys_device *sysdev) +static void viper_cpu_resume(void) { viper_icr_clear_bit(VIPER_ICR_R_DIS); - return 0; } -static struct sysdev_driver viper_cpu_sysdev_driver = { +static struct syscore_ops viper_cpu_syscore_ops = { .suspend = viper_cpu_suspend, .resume = viper_cpu_resume, }; @@ -945,7 +945,7 @@ static void __init viper_init(void) viper_init_vcore_gpios(); viper_init_cpufreq(); - sysdev_driver_register(&cpu_sysdev_class, &viper_cpu_sysdev_driver); + register_syscore_ops(&viper_cpu_syscore_ops); if (version) { pr_info("viper: hardware v%di%d detected. " diff --git a/arch/arm/mach-pxa/vpac270.c b/arch/arm/mach-pxa/vpac270.c index f71d377..67bd414 100644 --- a/arch/arm/mach-pxa/vpac270.c +++ b/arch/arm/mach-pxa/vpac270.c @@ -16,7 +16,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/arm/plat-pxa/gpio.c b/arch/arm/plat-pxa/gpio.c index dce088f..48ebb94 100644 --- a/arch/arm/plat-pxa/gpio.c +++ b/arch/arm/plat-pxa/gpio.c @@ -15,7 +15,7 @@ #include #include #include -#include +#include #include #include @@ -295,7 +295,7 @@ void __init pxa_init_gpio(int mux_irq, int start, int end, set_wake_t fn) } #ifdef CONFIG_PM -static int pxa_gpio_suspend(struct sys_device *dev, pm_message_t state) +static int pxa_gpio_suspend(void) { struct pxa_gpio_chip *c; int gpio; @@ -312,7 +312,7 @@ static int pxa_gpio_suspend(struct sys_device *dev, pm_message_t state) return 0; } -static int pxa_gpio_resume(struct sys_device *dev) +static void pxa_gpio_resume(void) { struct pxa_gpio_chip *c; int gpio; @@ -326,22 +326,13 @@ static int pxa_gpio_resume(struct sys_device *dev) __raw_writel(c->saved_gfer, c->regbase + GFER_OFFSET); __raw_writel(c->saved_gpdr, c->regbase + GPDR_OFFSET); } - return 0; } #else #define pxa_gpio_suspend NULL #define pxa_gpio_resume NULL #endif -struct sysdev_class pxa_gpio_sysclass = { - .name = "gpio", +struct syscore_ops pxa_gpio_syscore_ops = { .suspend = pxa_gpio_suspend, .resume = pxa_gpio_resume, }; - -static int __init pxa_gpio_init(void) -{ - return sysdev_class_register(&pxa_gpio_sysclass); -} - -core_initcall(pxa_gpio_init); diff --git a/arch/arm/plat-pxa/mfp.c b/arch/arm/plat-pxa/mfp.c index a9aa5ad..be12ead 100644 --- a/arch/arm/plat-pxa/mfp.c +++ b/arch/arm/plat-pxa/mfp.c @@ -17,7 +17,6 @@ #include #include #include -#include #include -- cgit v1.1 From bb072c3cf21d1c9a5a2eeb5a00679ee7bf39675b Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 22 Apr 2011 22:03:21 +0200 Subject: ARM / Samsung: Use struct syscore_ops for "core" power management Replace sysdev classes and struct sys_device objects used for "core" power management by Samsung platforms with struct syscore_ops objects that are simpler. This generally reduces the code size and the kernel memory footprint. It also is necessary for removing sysdevs entirely from the kernel in the future. Signed-off-by: Rafael J. Wysocki Acked-by: Greg Kroah-Hartman Acked-by: Kukjin Kim --- arch/arm/mach-exynos4/pm.c | 45 ++++++++++++--------- arch/arm/mach-s3c2410/irq.c | 30 +------------- arch/arm/mach-s3c2410/mach-bast.c | 17 +++----- arch/arm/mach-s3c2410/pm.c | 13 +++--- arch/arm/mach-s3c2410/s3c2410.c | 5 +++ arch/arm/mach-s3c2412/irq.c | 2 - arch/arm/mach-s3c2412/mach-jive.c | 19 +++------ arch/arm/mach-s3c2412/pm.c | 27 +++++++------ arch/arm/mach-s3c2412/s3c2412.c | 4 ++ arch/arm/mach-s3c2416/irq.c | 2 - arch/arm/mach-s3c2416/pm.c | 27 ++++++------- arch/arm/mach-s3c2416/s3c2416.c | 5 +++ arch/arm/mach-s3c2440/mach-osiris.c | 18 +++------ arch/arm/mach-s3c2440/s3c2440.c | 8 ++++ arch/arm/mach-s3c2440/s3c2442.c | 6 +++ arch/arm/mach-s3c2440/s3c244x-irq.c | 4 -- arch/arm/mach-s3c2440/s3c244x.c | 62 ++++++++++++++--------------- arch/arm/mach-s3c64xx/irq-pm.c | 18 ++++----- arch/arm/mach-s5pv210/pm.c | 25 ++++++++---- arch/arm/plat-s3c24xx/dma.c | 68 ++++++++++++-------------------- arch/arm/plat-s3c24xx/irq-pm.c | 7 +--- arch/arm/plat-s5p/irq-pm.c | 7 +--- arch/arm/plat-samsung/include/plat/cpu.h | 6 +++ arch/arm/plat-samsung/include/plat/pm.h | 6 ++- 24 files changed, 203 insertions(+), 228 deletions(-) diff --git a/arch/arm/mach-exynos4/pm.c b/arch/arm/mach-exynos4/pm.c index 10d917d..8755ca8 100644 --- a/arch/arm/mach-exynos4/pm.c +++ b/arch/arm/mach-exynos4/pm.c @@ -16,6 +16,7 @@ #include #include +#include #include #include @@ -372,7 +373,27 @@ void exynos4_scu_enable(void __iomem *scu_base) flush_cache_all(); } -static int exynos4_pm_resume(struct sys_device *dev) +static struct sysdev_driver exynos4_pm_driver = { + .add = exynos4_pm_add, +}; + +static __init int exynos4_pm_drvinit(void) +{ + unsigned int tmp; + + s3c_pm_init(); + + /* All wakeup disable */ + + tmp = __raw_readl(S5P_WAKEUP_MASK); + tmp |= ((0xFF << 8) | (0x1F << 1)); + __raw_writel(tmp, S5P_WAKEUP_MASK); + + return sysdev_driver_register(&exynos4_sysclass, &exynos4_pm_driver); +} +arch_initcall(exynos4_pm_drvinit); + +static void exynos4_pm_resume(void) { /* For release retention */ @@ -394,27 +415,15 @@ static int exynos4_pm_resume(struct sys_device *dev) /* enable L2X0*/ writel_relaxed(1, S5P_VA_L2CC + L2X0_CTRL); #endif - - return 0; } -static struct sysdev_driver exynos4_pm_driver = { - .add = exynos4_pm_add, +static struct syscore_ops exynos4_pm_syscore_ops = { .resume = exynos4_pm_resume, }; -static __init int exynos4_pm_drvinit(void) +static __init int exynos4_pm_syscore_init(void) { - unsigned int tmp; - - s3c_pm_init(); - - /* All wakeup disable */ - - tmp = __raw_readl(S5P_WAKEUP_MASK); - tmp |= ((0xFF << 8) | (0x1F << 1)); - __raw_writel(tmp, S5P_WAKEUP_MASK); - - return sysdev_driver_register(&exynos4_sysclass, &exynos4_pm_driver); + register_syscore_ops(&exynos4_pm_syscore_ops); + return 0; } -arch_initcall(exynos4_pm_drvinit); +arch_initcall(exynos4_pm_syscore_init); diff --git a/arch/arm/mach-s3c2410/irq.c b/arch/arm/mach-s3c2410/irq.c index 5e2f353..2854129 100644 --- a/arch/arm/mach-s3c2410/irq.c +++ b/arch/arm/mach-s3c2410/irq.c @@ -23,38 +23,12 @@ #include #include #include -#include +#include #include #include -static int s3c2410_irq_add(struct sys_device *sysdev) -{ - return 0; -} - -static struct sysdev_driver s3c2410_irq_driver = { - .add = s3c2410_irq_add, +struct syscore_ops s3c24xx_irq_syscore_ops = { .suspend = s3c24xx_irq_suspend, .resume = s3c24xx_irq_resume, }; - -static int __init s3c2410_irq_init(void) -{ - return sysdev_driver_register(&s3c2410_sysclass, &s3c2410_irq_driver); -} - -arch_initcall(s3c2410_irq_init); - -static struct sysdev_driver s3c2410a_irq_driver = { - .add = s3c2410_irq_add, - .suspend = s3c24xx_irq_suspend, - .resume = s3c24xx_irq_resume, -}; - -static int __init s3c2410a_irq_init(void) -{ - return sysdev_driver_register(&s3c2410a_sysclass, &s3c2410a_irq_driver); -} - -arch_initcall(s3c2410a_irq_init); diff --git a/arch/arm/mach-s3c2410/mach-bast.c b/arch/arm/mach-s3c2410/mach-bast.c index 2970ea9..1e2d536 100644 --- a/arch/arm/mach-s3c2410/mach-bast.c +++ b/arch/arm/mach-s3c2410/mach-bast.c @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include #include #include @@ -214,17 +214,16 @@ static struct s3c2410_uartcfg bast_uartcfgs[] __initdata = { /* NAND Flash on BAST board */ #ifdef CONFIG_PM -static int bast_pm_suspend(struct sys_device *sd, pm_message_t state) +static int bast_pm_suspend(void) { /* ensure that an nRESET is not generated on resume. */ gpio_direction_output(S3C2410_GPA(21), 1); return 0; } -static int bast_pm_resume(struct sys_device *sd) +static void bast_pm_resume(void) { s3c_gpio_cfgpin(S3C2410_GPA(21), S3C2410_GPA21_nRSTOUT); - return 0; } #else @@ -232,16 +231,11 @@ static int bast_pm_resume(struct sys_device *sd) #define bast_pm_resume NULL #endif -static struct sysdev_class bast_pm_sysclass = { - .name = "mach-bast", +static struct syscore_ops bast_pm_syscore_ops = { .suspend = bast_pm_suspend, .resume = bast_pm_resume, }; -static struct sys_device bast_pm_sysdev = { - .cls = &bast_pm_sysclass, -}; - static int smartmedia_map[] = { 0 }; static int chip0_map[] = { 1 }; static int chip1_map[] = { 2 }; @@ -642,8 +636,7 @@ static void __init bast_map_io(void) static void __init bast_init(void) { - sysdev_class_register(&bast_pm_sysclass); - sysdev_register(&bast_pm_sysdev); + register_syscore_ops(&bast_pm_syscore_ops); s3c_i2c0_set_platdata(&bast_i2c_info); s3c_nand_set_platdata(&bast_nand_info); diff --git a/arch/arm/mach-s3c2410/pm.c b/arch/arm/mach-s3c2410/pm.c index 725636f..4728f9a 100644 --- a/arch/arm/mach-s3c2410/pm.c +++ b/arch/arm/mach-s3c2410/pm.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -92,7 +93,7 @@ static void s3c2410_pm_prepare(void) } } -static int s3c2410_pm_resume(struct sys_device *dev) +static void s3c2410_pm_resume(void) { unsigned long tmp; @@ -104,10 +105,12 @@ static int s3c2410_pm_resume(struct sys_device *dev) if ( machine_is_aml_m5900() ) s3c2410_gpio_setpin(S3C2410_GPF(2), 0); - - return 0; } +struct syscore_ops s3c2410_pm_syscore_ops = { + .resume = s3c2410_pm_resume, +}; + static int s3c2410_pm_add(struct sys_device *dev) { pm_cpu_prep = s3c2410_pm_prepare; @@ -119,7 +122,6 @@ static int s3c2410_pm_add(struct sys_device *dev) #if defined(CONFIG_CPU_S3C2410) static struct sysdev_driver s3c2410_pm_driver = { .add = s3c2410_pm_add, - .resume = s3c2410_pm_resume, }; /* register ourselves */ @@ -133,7 +135,6 @@ arch_initcall(s3c2410_pm_drvinit); static struct sysdev_driver s3c2410a_pm_driver = { .add = s3c2410_pm_add, - .resume = s3c2410_pm_resume, }; static int __init s3c2410a_pm_drvinit(void) @@ -147,7 +148,6 @@ arch_initcall(s3c2410a_pm_drvinit); #if defined(CONFIG_CPU_S3C2440) static struct sysdev_driver s3c2440_pm_driver = { .add = s3c2410_pm_add, - .resume = s3c2410_pm_resume, }; static int __init s3c2440_pm_drvinit(void) @@ -161,7 +161,6 @@ arch_initcall(s3c2440_pm_drvinit); #if defined(CONFIG_CPU_S3C2442) static struct sysdev_driver s3c2442_pm_driver = { .add = s3c2410_pm_add, - .resume = s3c2410_pm_resume, }; static int __init s3c2442_pm_drvinit(void) diff --git a/arch/arm/mach-s3c2410/s3c2410.c b/arch/arm/mach-s3c2410/s3c2410.c index adc90a3..f1d3bd8 100644 --- a/arch/arm/mach-s3c2410/s3c2410.c +++ b/arch/arm/mach-s3c2410/s3c2410.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -40,6 +41,7 @@ #include #include #include +#include #include #include @@ -168,6 +170,9 @@ int __init s3c2410_init(void) { printk("S3C2410: Initialising architecture\n"); + register_syscore_ops(&s3c2410_pm_syscore_ops); + register_syscore_ops(&s3c24xx_irq_syscore_ops); + return sysdev_register(&s3c2410_sysdev); } diff --git a/arch/arm/mach-s3c2412/irq.c b/arch/arm/mach-s3c2412/irq.c index f3355d2..1a1aa22 100644 --- a/arch/arm/mach-s3c2412/irq.c +++ b/arch/arm/mach-s3c2412/irq.c @@ -202,8 +202,6 @@ static int s3c2412_irq_add(struct sys_device *sysdev) static struct sysdev_driver s3c2412_irq_driver = { .add = s3c2412_irq_add, - .suspend = s3c24xx_irq_suspend, - .resume = s3c24xx_irq_resume, }; static int s3c2412_irq_init(void) diff --git a/arch/arm/mach-s3c2412/mach-jive.c b/arch/arm/mach-s3c2412/mach-jive.c index 923e01b..85dcaeb 100644 --- a/arch/arm/mach-s3c2412/mach-jive.c +++ b/arch/arm/mach-s3c2412/mach-jive.c @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include #include #include @@ -486,7 +486,7 @@ static struct s3c2410_udc_mach_info jive_udc_cfg __initdata = { /* Jive power management device */ #ifdef CONFIG_PM -static int jive_pm_suspend(struct sys_device *sd, pm_message_t state) +static int jive_pm_suspend(void) { /* Write the magic value u-boot uses to check for resume into * the INFORM0 register, and ensure INFORM1 is set to the @@ -498,10 +498,9 @@ static int jive_pm_suspend(struct sys_device *sd, pm_message_t state) return 0; } -static int jive_pm_resume(struct sys_device *sd) +static void jive_pm_resume(void) { __raw_writel(0x0, S3C2412_INFORM0); - return 0; } #else @@ -509,16 +508,11 @@ static int jive_pm_resume(struct sys_device *sd) #define jive_pm_resume NULL #endif -static struct sysdev_class jive_pm_sysclass = { - .name = "jive-pm", +static struct syscore_ops jive_pm_syscore_ops = { .suspend = jive_pm_suspend, .resume = jive_pm_resume, }; -static struct sys_device jive_pm_sysdev = { - .cls = &jive_pm_sysclass, -}; - static void __init jive_map_io(void) { s3c24xx_init_io(jive_iodesc, ARRAY_SIZE(jive_iodesc)); @@ -536,10 +530,9 @@ static void jive_power_off(void) static void __init jive_machine_init(void) { - /* register system devices for managing low level suspend */ + /* register system core operations for managing low level suspend */ - sysdev_class_register(&jive_pm_sysclass); - sysdev_register(&jive_pm_sysdev); + register_syscore_ops(&jive_pm_syscore_ops); /* write our sleep configurations for the IO. Pull down all unused * IO, ensure that we have turned off all peripherals we do not diff --git a/arch/arm/mach-s3c2412/pm.c b/arch/arm/mach-s3c2412/pm.c index a7417c4..752b13a 100644 --- a/arch/arm/mach-s3c2412/pm.c +++ b/arch/arm/mach-s3c2412/pm.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -86,13 +87,24 @@ static struct sleep_save s3c2412_sleep[] = { SAVE_ITEM(S3C2413_GPJSLPCON), }; -static int s3c2412_pm_suspend(struct sys_device *dev, pm_message_t state) +static struct sysdev_driver s3c2412_pm_driver = { + .add = s3c2412_pm_add, +}; + +static __init int s3c2412_pm_init(void) +{ + return sysdev_driver_register(&s3c2412_sysclass, &s3c2412_pm_driver); +} + +arch_initcall(s3c2412_pm_init); + +static int s3c2412_pm_suspend(void) { s3c_pm_do_save(s3c2412_sleep, ARRAY_SIZE(s3c2412_sleep)); return 0; } -static int s3c2412_pm_resume(struct sys_device *dev) +static void s3c2412_pm_resume(void) { unsigned long tmp; @@ -102,18 +114,9 @@ static int s3c2412_pm_resume(struct sys_device *dev) __raw_writel(tmp, S3C2412_PWRCFG); s3c_pm_do_restore(s3c2412_sleep, ARRAY_SIZE(s3c2412_sleep)); - return 0; } -static struct sysdev_driver s3c2412_pm_driver = { - .add = s3c2412_pm_add, +struct syscore_ops s3c2412_pm_syscore_ops = { .suspend = s3c2412_pm_suspend, .resume = s3c2412_pm_resume, }; - -static __init int s3c2412_pm_init(void) -{ - return sysdev_driver_register(&s3c2412_sysclass, &s3c2412_pm_driver); -} - -arch_initcall(s3c2412_pm_init); diff --git a/arch/arm/mach-s3c2412/s3c2412.c b/arch/arm/mach-s3c2412/s3c2412.c index 4c6df51..ef0958d 100644 --- a/arch/arm/mach-s3c2412/s3c2412.c +++ b/arch/arm/mach-s3c2412/s3c2412.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -244,5 +245,8 @@ int __init s3c2412_init(void) { printk("S3C2412: Initialising architecture\n"); + register_syscore_ops(&s3c2412_pm_syscore_ops); + register_syscore_ops(&s3c24xx_irq_syscore_ops); + return sysdev_register(&s3c2412_sysdev); } diff --git a/arch/arm/mach-s3c2416/irq.c b/arch/arm/mach-s3c2416/irq.c index 77b38f2..28ad20d 100644 --- a/arch/arm/mach-s3c2416/irq.c +++ b/arch/arm/mach-s3c2416/irq.c @@ -236,8 +236,6 @@ static int __init s3c2416_irq_add(struct sys_device *sysdev) static struct sysdev_driver s3c2416_irq_driver = { .add = s3c2416_irq_add, - .suspend = s3c24xx_irq_suspend, - .resume = s3c24xx_irq_resume, }; static int __init s3c2416_irq_init(void) diff --git a/arch/arm/mach-s3c2416/pm.c b/arch/arm/mach-s3c2416/pm.c index 4a04205..41db2b2 100644 --- a/arch/arm/mach-s3c2416/pm.c +++ b/arch/arm/mach-s3c2416/pm.c @@ -11,6 +11,7 @@ */ #include +#include #include #include @@ -55,30 +56,26 @@ static int s3c2416_pm_add(struct sys_device *sysdev) return 0; } -static int s3c2416_pm_suspend(struct sys_device *dev, pm_message_t state) +static struct sysdev_driver s3c2416_pm_driver = { + .add = s3c2416_pm_add, +}; + +static __init int s3c2416_pm_init(void) { - return 0; + return sysdev_driver_register(&s3c2416_sysclass, &s3c2416_pm_driver); } -static int s3c2416_pm_resume(struct sys_device *dev) +arch_initcall(s3c2416_pm_init); + + +static void s3c2416_pm_resume(void) { /* unset the return-from-sleep amd inform flags */ __raw_writel(0x0, S3C2443_PWRMODE); __raw_writel(0x0, S3C2412_INFORM0); __raw_writel(0x0, S3C2412_INFORM1); - - return 0; } -static struct sysdev_driver s3c2416_pm_driver = { - .add = s3c2416_pm_add, - .suspend = s3c2416_pm_suspend, +struct syscore_ops s3c2416_pm_syscore_ops = { .resume = s3c2416_pm_resume, }; - -static __init int s3c2416_pm_init(void) -{ - return sysdev_driver_register(&s3c2416_sysclass, &s3c2416_pm_driver); -} - -arch_initcall(s3c2416_pm_init); diff --git a/arch/arm/mach-s3c2416/s3c2416.c b/arch/arm/mach-s3c2416/s3c2416.c index ba7fd87..494ce91 100644 --- a/arch/arm/mach-s3c2416/s3c2416.c +++ b/arch/arm/mach-s3c2416/s3c2416.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include @@ -54,6 +55,7 @@ #include #include #include +#include #include #include @@ -95,6 +97,9 @@ int __init s3c2416_init(void) s3c_fb_setname("s3c2443-fb"); + register_syscore_ops(&s3c2416_pm_syscore_ops); + register_syscore_ops(&s3c24xx_irq_syscore_ops); + return sysdev_register(&s3c2416_sysdev); } diff --git a/arch/arm/mach-s3c2440/mach-osiris.c b/arch/arm/mach-s3c2440/mach-osiris.c index 14dc678..d885363 100644 --- a/arch/arm/mach-s3c2440/mach-osiris.c +++ b/arch/arm/mach-s3c2440/mach-osiris.c @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include #include #include @@ -284,7 +284,7 @@ static struct platform_device osiris_pcmcia = { #ifdef CONFIG_PM static unsigned char pm_osiris_ctrl0; -static int osiris_pm_suspend(struct sys_device *sd, pm_message_t state) +static int osiris_pm_suspend(void) { unsigned int tmp; @@ -304,7 +304,7 @@ static int osiris_pm_suspend(struct sys_device *sd, pm_message_t state) return 0; } -static int osiris_pm_resume(struct sys_device *sd) +static void osiris_pm_resume(void) { if (pm_osiris_ctrl0 & OSIRIS_CTRL0_FIX8) __raw_writeb(OSIRIS_CTRL1_FIX8, OSIRIS_VA_CTRL1); @@ -312,8 +312,6 @@ static int osiris_pm_resume(struct sys_device *sd) __raw_writeb(pm_osiris_ctrl0, OSIRIS_VA_CTRL0); s3c_gpio_cfgpin(S3C2410_GPA(21), S3C2410_GPA21_nRSTOUT); - - return 0; } #else @@ -321,16 +319,11 @@ static int osiris_pm_resume(struct sys_device *sd) #define osiris_pm_resume NULL #endif -static struct sysdev_class osiris_pm_sysclass = { - .name = "mach-osiris", +static struct syscore_ops osiris_pm_syscore_ops = { .suspend = osiris_pm_suspend, .resume = osiris_pm_resume, }; -static struct sys_device osiris_pm_sysdev = { - .cls = &osiris_pm_sysclass, -}; - /* Link for DVS driver to TPS65011 */ static void osiris_tps_release(struct device *dev) @@ -439,8 +432,7 @@ static void __init osiris_map_io(void) static void __init osiris_init(void) { - sysdev_class_register(&osiris_pm_sysclass); - sysdev_register(&osiris_pm_sysdev); + register_syscore_ops(&osiris_pm_syscore_ops); s3c_i2c0_set_platdata(NULL); s3c_nand_set_platdata(&osiris_nand_info); diff --git a/arch/arm/mach-s3c2440/s3c2440.c b/arch/arm/mach-s3c2440/s3c2440.c index f7663f7..ce99ff7 100644 --- a/arch/arm/mach-s3c2440/s3c2440.c +++ b/arch/arm/mach-s3c2440/s3c2440.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -33,6 +34,7 @@ #include #include #include +#include #include #include @@ -51,6 +53,12 @@ int __init s3c2440_init(void) s3c_device_wdt.resource[1].start = IRQ_S3C2440_WDT; s3c_device_wdt.resource[1].end = IRQ_S3C2440_WDT; + /* register suspend/resume handlers */ + + register_syscore_ops(&s3c2410_pm_syscore_ops); + register_syscore_ops(&s3c244x_pm_syscore_ops); + register_syscore_ops(&s3c24xx_irq_syscore_ops); + /* register our system device for everything else */ return sysdev_register(&s3c2440_sysdev); diff --git a/arch/arm/mach-s3c2440/s3c2442.c b/arch/arm/mach-s3c2440/s3c2442.c index ecf8135..6224bad 100644 --- a/arch/arm/mach-s3c2440/s3c2442.c +++ b/arch/arm/mach-s3c2440/s3c2442.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -45,6 +46,7 @@ #include #include #include +#include #include #include @@ -167,6 +169,10 @@ int __init s3c2442_init(void) { printk("S3C2442: Initialising architecture\n"); + register_syscore_ops(&s3c2410_pm_syscore_ops); + register_syscore_ops(&s3c244x_pm_syscore_ops); + register_syscore_ops(&s3c24xx_irq_syscore_ops); + return sysdev_register(&s3c2442_sysdev); } diff --git a/arch/arm/mach-s3c2440/s3c244x-irq.c b/arch/arm/mach-s3c2440/s3c244x-irq.c index de07c2f..c63e8f2 100644 --- a/arch/arm/mach-s3c2440/s3c244x-irq.c +++ b/arch/arm/mach-s3c2440/s3c244x-irq.c @@ -116,8 +116,6 @@ static int s3c244x_irq_add(struct sys_device *sysdev) static struct sysdev_driver s3c2440_irq_driver = { .add = s3c244x_irq_add, - .suspend = s3c24xx_irq_suspend, - .resume = s3c24xx_irq_resume, }; static int s3c2440_irq_init(void) @@ -129,8 +127,6 @@ arch_initcall(s3c2440_irq_init); static struct sysdev_driver s3c2442_irq_driver = { .add = s3c244x_irq_add, - .suspend = s3c24xx_irq_suspend, - .resume = s3c24xx_irq_resume, }; diff --git a/arch/arm/mach-s3c2440/s3c244x.c b/arch/arm/mach-s3c2440/s3c244x.c index 90c1707..7e8a23d 100644 --- a/arch/arm/mach-s3c2440/s3c244x.c +++ b/arch/arm/mach-s3c2440/s3c244x.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -134,45 +135,14 @@ void __init s3c244x_init_clocks(int xtal) s3c2410_baseclk_add(); } -#ifdef CONFIG_PM - -static struct sleep_save s3c244x_sleep[] = { - SAVE_ITEM(S3C2440_DSC0), - SAVE_ITEM(S3C2440_DSC1), - SAVE_ITEM(S3C2440_GPJDAT), - SAVE_ITEM(S3C2440_GPJCON), - SAVE_ITEM(S3C2440_GPJUP) -}; - -static int s3c244x_suspend(struct sys_device *dev, pm_message_t state) -{ - s3c_pm_do_save(s3c244x_sleep, ARRAY_SIZE(s3c244x_sleep)); - return 0; -} - -static int s3c244x_resume(struct sys_device *dev) -{ - s3c_pm_do_restore(s3c244x_sleep, ARRAY_SIZE(s3c244x_sleep)); - return 0; -} - -#else -#define s3c244x_suspend NULL -#define s3c244x_resume NULL -#endif - /* Since the S3C2442 and S3C2440 share items, put both sysclasses here */ struct sysdev_class s3c2440_sysclass = { .name = "s3c2440-core", - .suspend = s3c244x_suspend, - .resume = s3c244x_resume }; struct sysdev_class s3c2442_sysclass = { .name = "s3c2442-core", - .suspend = s3c244x_suspend, - .resume = s3c244x_resume }; /* need to register class before we actually register the device, and @@ -194,3 +164,33 @@ static int __init s3c2442_core_init(void) } core_initcall(s3c2442_core_init); + + +#ifdef CONFIG_PM +static struct sleep_save s3c244x_sleep[] = { + SAVE_ITEM(S3C2440_DSC0), + SAVE_ITEM(S3C2440_DSC1), + SAVE_ITEM(S3C2440_GPJDAT), + SAVE_ITEM(S3C2440_GPJCON), + SAVE_ITEM(S3C2440_GPJUP) +}; + +static int s3c244x_suspend(void) +{ + s3c_pm_do_save(s3c244x_sleep, ARRAY_SIZE(s3c244x_sleep)); + return 0; +} + +static void s3c244x_resume(void) +{ + s3c_pm_do_restore(s3c244x_sleep, ARRAY_SIZE(s3c244x_sleep)); +} +#else +#define s3c244x_suspend NULL +#define s3c244x_resume NULL +#endif + +struct syscore_ops s3c244x_pm_syscore_ops = { + .suspend = s3c244x_suspend, + .resume = s3c244x_resume, +}; diff --git a/arch/arm/mach-s3c64xx/irq-pm.c b/arch/arm/mach-s3c64xx/irq-pm.c index da1bec6..8bec61e 100644 --- a/arch/arm/mach-s3c64xx/irq-pm.c +++ b/arch/arm/mach-s3c64xx/irq-pm.c @@ -13,7 +13,7 @@ */ #include -#include +#include #include #include #include @@ -54,7 +54,7 @@ static struct irq_grp_save { static u32 irq_uart_mask[CONFIG_SERIAL_SAMSUNG_UARTS]; -static int s3c64xx_irq_pm_suspend(struct sys_device *dev, pm_message_t state) +static int s3c64xx_irq_pm_suspend(void) { struct irq_grp_save *grp = eint_grp_save; int i; @@ -75,7 +75,7 @@ static int s3c64xx_irq_pm_suspend(struct sys_device *dev, pm_message_t state) return 0; } -static int s3c64xx_irq_pm_resume(struct sys_device *dev) +static void s3c64xx_irq_pm_resume(void) { struct irq_grp_save *grp = eint_grp_save; int i; @@ -94,18 +94,18 @@ static int s3c64xx_irq_pm_resume(struct sys_device *dev) } S3C_PMDBG("%s: IRQ configuration restored\n", __func__); - return 0; } -static struct sysdev_driver s3c64xx_irq_driver = { +struct syscore_ops s3c64xx_irq_syscore_ops = { .suspend = s3c64xx_irq_pm_suspend, .resume = s3c64xx_irq_pm_resume, }; -static int __init s3c64xx_irq_pm_init(void) +static __init int s3c64xx_syscore_init(void) { - return sysdev_driver_register(&s3c64xx_sysclass, &s3c64xx_irq_driver); -} + register_syscore_ops(&s3c64xx_irq_syscore_ops); -arch_initcall(s3c64xx_irq_pm_init); + return 0; +} +core_initcall(s3c64xx_syscore_init); diff --git a/arch/arm/mach-s5pv210/pm.c b/arch/arm/mach-s5pv210/pm.c index 549d792..24febae 100644 --- a/arch/arm/mach-s5pv210/pm.c +++ b/arch/arm/mach-s5pv210/pm.c @@ -16,6 +16,7 @@ #include #include +#include #include #include @@ -140,7 +141,17 @@ static int s5pv210_pm_add(struct sys_device *sysdev) return 0; } -static int s5pv210_pm_resume(struct sys_device *dev) +static struct sysdev_driver s5pv210_pm_driver = { + .add = s5pv210_pm_add, +}; + +static __init int s5pv210_pm_drvinit(void) +{ + return sysdev_driver_register(&s5pv210_sysclass, &s5pv210_pm_driver); +} +arch_initcall(s5pv210_pm_drvinit); + +static void s5pv210_pm_resume(void) { u32 tmp; @@ -150,17 +161,15 @@ static int s5pv210_pm_resume(struct sys_device *dev) __raw_writel(tmp , S5P_OTHERS); s3c_pm_do_restore_core(s5pv210_core_save, ARRAY_SIZE(s5pv210_core_save)); - - return 0; } -static struct sysdev_driver s5pv210_pm_driver = { - .add = s5pv210_pm_add, +static struct syscore_ops s5pv210_pm_syscore_ops = { .resume = s5pv210_pm_resume, }; -static __init int s5pv210_pm_drvinit(void) +static __init int s5pv210_pm_syscore_init(void) { - return sysdev_driver_register(&s5pv210_sysclass, &s5pv210_pm_driver); + register_syscore_ops(&s5pv210_pm_syscore_ops); + return 0; } -arch_initcall(s5pv210_pm_drvinit); +arch_initcall(s5pv210_pm_syscore_init); diff --git a/arch/arm/plat-s3c24xx/dma.c b/arch/arm/plat-s3c24xx/dma.c index 27ea852..c10d10c 100644 --- a/arch/arm/plat-s3c24xx/dma.c +++ b/arch/arm/plat-s3c24xx/dma.c @@ -22,7 +22,7 @@ #include #include #include -#include +#include #include #include #include @@ -1195,19 +1195,12 @@ int s3c2410_dma_getposition(unsigned int channel, dma_addr_t *src, dma_addr_t *d EXPORT_SYMBOL(s3c2410_dma_getposition); -static inline struct s3c2410_dma_chan *to_dma_chan(struct sys_device *dev) -{ - return container_of(dev, struct s3c2410_dma_chan, dev); -} - -/* system device class */ +/* system core operations */ #ifdef CONFIG_PM -static int s3c2410_dma_suspend(struct sys_device *dev, pm_message_t state) +static void s3c2410_dma_suspend_chan(s3c2410_dma_chan *cp) { - struct s3c2410_dma_chan *cp = to_dma_chan(dev); - printk(KERN_DEBUG "suspending dma channel %d\n", cp->number); if (dma_rdreg(cp, S3C2410_DMA_DMASKTRIG) & S3C2410_DMASKTRIG_ON) { @@ -1222,13 +1215,21 @@ static int s3c2410_dma_suspend(struct sys_device *dev, pm_message_t state) s3c2410_dma_dostop(cp); } +} + +static int s3c2410_dma_suspend(void) +{ + struct s3c2410_dma_chan *cp = s3c2410_chans; + int channel; + + for (channel = 0; channel < dma_channels; cp++, channel++) + s3c2410_dma_suspend_chan(cp); return 0; } -static int s3c2410_dma_resume(struct sys_device *dev) +static void s3c2410_dma_resume_chan(struct s3c2410_dma_chan *cp) { - struct s3c2410_dma_chan *cp = to_dma_chan(dev); unsigned int no = cp->number | DMACH_LOW_LEVEL; /* restore channel's hardware configuration */ @@ -1249,13 +1250,21 @@ static int s3c2410_dma_resume(struct sys_device *dev) return 0; } +static void s3c2410_dma_resume(void) +{ + struct s3c2410_dma_chan *cp = s3c2410_chans + dma_channels - 1; + int channel; + + for (channel = dma_channels - 1; channel >= 0; cp++, channel--) + s3c2410_dma_resume_chan(cp); +} + #else #define s3c2410_dma_suspend NULL #define s3c2410_dma_resume NULL #endif /* CONFIG_PM */ -struct sysdev_class dma_sysclass = { - .name = "s3c24xx-dma", +struct syscore_ops dma_syscore_ops = { .suspend = s3c2410_dma_suspend, .resume = s3c2410_dma_resume, }; @@ -1269,39 +1278,14 @@ static void s3c2410_dma_cache_ctor(void *p) /* initialisation code */ -static int __init s3c24xx_dma_sysclass_init(void) +static int __init s3c24xx_dma_syscore_init(void) { - int ret = sysdev_class_register(&dma_sysclass); - - if (ret != 0) - printk(KERN_ERR "dma sysclass registration failed\n"); - - return ret; -} - -core_initcall(s3c24xx_dma_sysclass_init); - -static int __init s3c24xx_dma_sysdev_register(void) -{ - struct s3c2410_dma_chan *cp = s3c2410_chans; - int channel, ret; - - for (channel = 0; channel < dma_channels; cp++, channel++) { - cp->dev.cls = &dma_sysclass; - cp->dev.id = channel; - ret = sysdev_register(&cp->dev); - - if (ret) { - printk(KERN_ERR "error registering dev for dma %d\n", - channel); - return ret; - } - } + register_syscore_ops(&dma_syscore_ops); return 0; } -late_initcall(s3c24xx_dma_sysdev_register); +late_initcall(s3c24xx_dma_syscore_init); int __init s3c24xx_dma_init(unsigned int channels, unsigned int irq, unsigned int stride) diff --git a/arch/arm/plat-s3c24xx/irq-pm.c b/arch/arm/plat-s3c24xx/irq-pm.c index c3624d8..0efb2e2 100644 --- a/arch/arm/plat-s3c24xx/irq-pm.c +++ b/arch/arm/plat-s3c24xx/irq-pm.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include @@ -65,7 +64,7 @@ static unsigned long save_extint[3]; static unsigned long save_eintflt[4]; static unsigned long save_eintmask; -int s3c24xx_irq_suspend(struct sys_device *dev, pm_message_t state) +int s3c24xx_irq_suspend(void) { unsigned int i; @@ -81,7 +80,7 @@ int s3c24xx_irq_suspend(struct sys_device *dev, pm_message_t state) return 0; } -int s3c24xx_irq_resume(struct sys_device *dev) +void s3c24xx_irq_resume(void) { unsigned int i; @@ -93,6 +92,4 @@ int s3c24xx_irq_resume(struct sys_device *dev) s3c_pm_do_restore(irq_save, ARRAY_SIZE(irq_save)); __raw_writel(save_eintmask, S3C24XX_EINTMASK); - - return 0; } diff --git a/arch/arm/plat-s5p/irq-pm.c b/arch/arm/plat-s5p/irq-pm.c index 5259ad4..327acb3 100644 --- a/arch/arm/plat-s5p/irq-pm.c +++ b/arch/arm/plat-s5p/irq-pm.c @@ -16,7 +16,6 @@ #include #include #include -#include #include #include @@ -77,17 +76,15 @@ static struct sleep_save eint_save[] = { SAVE_ITEM(S5P_EINT_MASK(3)), }; -int s3c24xx_irq_suspend(struct sys_device *dev, pm_message_t state) +int s3c24xx_irq_suspend(void) { s3c_pm_do_save(eint_save, ARRAY_SIZE(eint_save)); return 0; } -int s3c24xx_irq_resume(struct sys_device *dev) +void s3c24xx_irq_resume(void) { s3c_pm_do_restore(eint_save, ARRAY_SIZE(eint_save)); - - return 0; } diff --git a/arch/arm/plat-samsung/include/plat/cpu.h b/arch/arm/plat-samsung/include/plat/cpu.h index cedfff5..3aedac0 100644 --- a/arch/arm/plat-samsung/include/plat/cpu.h +++ b/arch/arm/plat-samsung/include/plat/cpu.h @@ -68,6 +68,12 @@ extern void s3c24xx_init_uartdevs(char *name, struct sys_timer; extern struct sys_timer s3c24xx_timer; +extern struct syscore_ops s3c2410_pm_syscore_ops; +extern struct syscore_ops s3c2412_pm_syscore_ops; +extern struct syscore_ops s3c2416_pm_syscore_ops; +extern struct syscore_ops s3c244x_pm_syscore_ops; +extern struct syscore_ops s3c64xx_irq_syscore_ops; + /* system device classes */ extern struct sysdev_class s3c2410_sysclass; diff --git a/arch/arm/plat-samsung/include/plat/pm.h b/arch/arm/plat-samsung/include/plat/pm.h index 937cc2a..7fb6f6b 100644 --- a/arch/arm/plat-samsung/include/plat/pm.h +++ b/arch/arm/plat-samsung/include/plat/pm.h @@ -103,14 +103,16 @@ extern void s3c_pm_do_restore_core(struct sleep_save *ptr, int count); #ifdef CONFIG_PM extern int s3c_irqext_wake(struct irq_data *data, unsigned int state); -extern int s3c24xx_irq_suspend(struct sys_device *dev, pm_message_t state); -extern int s3c24xx_irq_resume(struct sys_device *dev); +extern int s3c24xx_irq_suspend(void); +extern void s3c24xx_irq_resume(void); #else #define s3c_irqext_wake NULL #define s3c24xx_irq_suspend NULL #define s3c24xx_irq_resume NULL #endif +extern struct syscore_ops s3c24xx_irq_syscore_ops; + /* PM debug functions */ #ifdef CONFIG_SAMSUNG_PM_DEBUG -- cgit v1.1 From 67f9cbf9affe39f67cd3f1d2e2a2a43089d9ab3a Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 22 Apr 2011 22:03:31 +0200 Subject: PM / Blackfin: Use struct syscore_ops instead of sysdevs for PM Convert some Blackfin architecture's code to using struct syscore_ops objects for power management instead of sysdev classes and sysdevs. This simplifies the code and reduces the kernel's memory footprint. It also is necessary for removing sysdevs from the kernel entirely in the future. Signed-off-by: Rafael J. Wysocki Acked-by: Greg Kroah-Hartman Acked-by: Mike Frysinger --- arch/blackfin/kernel/nmi.c | 30 +++++++++--------------------- 1 file changed, 9 insertions(+), 21 deletions(-) diff --git a/arch/blackfin/kernel/nmi.c b/arch/blackfin/kernel/nmi.c index 0b5f72f..401eb1d 100644 --- a/arch/blackfin/kernel/nmi.c +++ b/arch/blackfin/kernel/nmi.c @@ -12,7 +12,7 @@ #include #include -#include +#include #include #include #include @@ -196,43 +196,31 @@ void touch_nmi_watchdog(void) /* Suspend/resume support */ #ifdef CONFIG_PM -static int nmi_wdt_suspend(struct sys_device *dev, pm_message_t state) +static int nmi_wdt_suspend(void) { nmi_wdt_stop(); return 0; } -static int nmi_wdt_resume(struct sys_device *dev) +static void nmi_wdt_resume(void) { if (nmi_active) nmi_wdt_start(); - return 0; } -static struct sysdev_class nmi_sysclass = { - .name = DRV_NAME, +static struct syscore_ops nmi_syscore_ops = { .resume = nmi_wdt_resume, .suspend = nmi_wdt_suspend, }; -static struct sys_device device_nmi_wdt = { - .id = 0, - .cls = &nmi_sysclass, -}; - -static int __init init_nmi_wdt_sysfs(void) +static int __init init_nmi_wdt_syscore(void) { - int error; - - if (!nmi_active) - return 0; + if (nmi_active) + register_syscore_ops(&nmi_syscore_ops); - error = sysdev_class_register(&nmi_sysclass); - if (!error) - error = sysdev_register(&device_nmi_wdt); - return error; + return 0; } -late_initcall(init_nmi_wdt_sysfs); +late_initcall(init_nmi_wdt_syscore); #endif /* CONFIG_PM */ -- cgit v1.1 From 745b1f4f62852340e97a67d8c414d52fa1529185 Mon Sep 17 00:00:00 2001 From: Paul Parsons Date: Fri, 15 Apr 2011 14:39:27 +0000 Subject: ARM: pxa/hx4700: bq24022 regulator needs to be enabled Add REGULATOR_CHANGE_STATUS flag to hx4700 bq24022 regulator. Without this flag the bq24022 cannot be enabled and the battery will not charge. Signed-off-by: Paul Parsons Cc: Philipp Zabel Tested-by: Dmitry Artamonow Signed-off-by: Eric Miao --- arch/arm/mach-pxa/hx4700.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/mach-pxa/hx4700.c b/arch/arm/mach-pxa/hx4700.c index 6de0ad0..9cdcca5 100644 --- a/arch/arm/mach-pxa/hx4700.c +++ b/arch/arm/mach-pxa/hx4700.c @@ -711,7 +711,7 @@ static struct regulator_consumer_supply bq24022_consumers[] = { static struct regulator_init_data bq24022_init_data = { .constraints = { .max_uA = 500000, - .valid_ops_mask = REGULATOR_CHANGE_CURRENT, + .valid_ops_mask = REGULATOR_CHANGE_CURRENT|REGULATOR_CHANGE_STATUS, }, .num_consumer_supplies = ARRAY_SIZE(bq24022_consumers), .consumer_supplies = bq24022_consumers, -- cgit v1.1 From e454d1632000b6e86003209811bda1186b34f8f3 Mon Sep 17 00:00:00 2001 From: Philipp Zabel Date: Wed, 20 Apr 2011 15:41:29 +0200 Subject: ARM: pxa/magician: bq24022 regulator needs to be enabled Add REGULATOR_CHANGE_STATUS flag to magician bq24022 regulator to enable charging. Signed-off-by: Philipp Zabel Signed-off-by: Eric Miao --- arch/arm/mach-pxa/magician.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/mach-pxa/magician.c b/arch/arm/mach-pxa/magician.c index a72993d..9984ef7 100644 --- a/arch/arm/mach-pxa/magician.c +++ b/arch/arm/mach-pxa/magician.c @@ -599,7 +599,7 @@ static struct regulator_consumer_supply bq24022_consumers[] = { static struct regulator_init_data bq24022_init_data = { .constraints = { .max_uA = 500000, - .valid_ops_mask = REGULATOR_CHANGE_CURRENT, + .valid_ops_mask = REGULATOR_CHANGE_CURRENT | REGULATOR_CHANGE_STATUS, }, .num_consumer_supplies = ARRAY_SIZE(bq24022_consumers), .consumer_supplies = bq24022_consumers, -- cgit v1.1 From 8c61d9d611cb5b290f1b4ac57c4631acfd6e3b5a Mon Sep 17 00:00:00 2001 From: Hans Petter Selasky Date: Sun, 24 Apr 2011 22:35:19 -0700 Subject: cdc_ncm: fix short packet issue on some devices The default maximum transmit length for NCM USB frames should be so that a short packet happens at the end if the device supports a length greater than the defined maximum. This is achieved by adding 4 bytes to the maximum length so that the existing logic can fit a short packet there. Signed-off-by: Hans Petter Selasky Signed-off-by: Greg Kroah-Hartman Signed-off-by: David S. Miller --- drivers/net/usb/cdc_ncm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/usb/cdc_ncm.c b/drivers/net/usb/cdc_ncm.c index 967371f..1033ef6 100644 --- a/drivers/net/usb/cdc_ncm.c +++ b/drivers/net/usb/cdc_ncm.c @@ -54,13 +54,13 @@ #include #include -#define DRIVER_VERSION "7-Feb-2011" +#define DRIVER_VERSION "23-Apr-2011" /* CDC NCM subclass 3.2.1 */ #define USB_CDC_NCM_NDP16_LENGTH_MIN 0x10 /* Maximum NTB length */ -#define CDC_NCM_NTB_MAX_SIZE_TX 16384 /* bytes */ +#define CDC_NCM_NTB_MAX_SIZE_TX (16384 + 4) /* bytes, must be short terminated */ #define CDC_NCM_NTB_MAX_SIZE_RX 16384 /* bytes */ /* Minimum value for MaxDatagramSize, ch. 6.2.9 */ -- cgit v1.1 From ad10e1051ec97a7e56e001eb1e9721b3d883e030 Mon Sep 17 00:00:00 2001 From: Michael Williamson Date: Wed, 23 Mar 2011 12:15:41 +0000 Subject: davinci: mityomapl138: Use correct id for NAND controller For the MityDSP-L138/MityARM-1808 SOMS, the NAND controller id (which needs to correspond to the chipselect, and is used for controlling the HW ECC computation) is not correct. Fix it. Signed-off-by: Michael Williamson Signed-off-by: Sekhar Nori --- arch/arm/mach-davinci/board-mityomapl138.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/mach-davinci/board-mityomapl138.c b/arch/arm/mach-davinci/board-mityomapl138.c index 2aa79c5..e5d554c 100644 --- a/arch/arm/mach-davinci/board-mityomapl138.c +++ b/arch/arm/mach-davinci/board-mityomapl138.c @@ -414,7 +414,7 @@ static struct resource mityomapl138_nandflash_resource[] = { static struct platform_device mityomapl138_nandflash_device = { .name = "davinci_nand", - .id = 0, + .id = 1, .dev = { .platform_data = &mityomapl138_nandflash_data, }, -- cgit v1.1 From 336f402790defe4c4d33838b41bcd239537a5431 Mon Sep 17 00:00:00 2001 From: Michael Williamson Date: Wed, 23 Mar 2011 12:15:42 +0000 Subject: davinci: mityomapl138: Use auto-probe to determine attached PHY ID Current board configurations involving the MityDSP-L138 and MityARM-1808 only have one attached PHY, but it's address may not be the same. Default the behavior to auto-probe for the PHY and use the first one found. Signed-off-by: Michael Williamson Signed-off-by: Sekhar Nori --- arch/arm/mach-davinci/board-mityomapl138.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/mach-davinci/board-mityomapl138.c b/arch/arm/mach-davinci/board-mityomapl138.c index e5d554c..606a6f2 100644 --- a/arch/arm/mach-davinci/board-mityomapl138.c +++ b/arch/arm/mach-davinci/board-mityomapl138.c @@ -29,7 +29,7 @@ #include #include -#define MITYOMAPL138_PHY_ID "0:03" +#define MITYOMAPL138_PHY_ID "" #define FACTORY_CONFIG_MAGIC 0x012C0138 #define FACTORY_CONFIG_VERSION 0x00010001 -- cgit v1.1 From 9e7d24f622fc57ff5d58af3d44ba503430148354 Mon Sep 17 00:00:00 2001 From: Sergei Shtylyov Date: Wed, 6 Apr 2011 17:17:21 +0000 Subject: DA830: fix SPI1 base address Commit 54ce6883d29630ff334bee4256a25e3f8719a181 (davinci: da8xx: add spi resources and registration routine) wrongly assumed that SPI1 is mapped at the same address on DA830/OMAP-L137 and DA850/OMAP-L138; actually, the base address was valid only for the latter SoC. Teach the code to pass the correct SPI1 memory resource for both SoCs... Signed-off-by: Sergei Shtylyov Signed-off-by: Sekhar Nori --- arch/arm/mach-davinci/devices-da8xx.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/arch/arm/mach-davinci/devices-da8xx.c b/arch/arm/mach-davinci/devices-da8xx.c index 625d4b6..58a02dc 100644 --- a/arch/arm/mach-davinci/devices-da8xx.c +++ b/arch/arm/mach-davinci/devices-da8xx.c @@ -39,7 +39,8 @@ #define DA8XX_GPIO_BASE 0x01e26000 #define DA8XX_I2C1_BASE 0x01e28000 #define DA8XX_SPI0_BASE 0x01c41000 -#define DA8XX_SPI1_BASE 0x01f0e000 +#define DA830_SPI1_BASE 0x01e12000 +#define DA850_SPI1_BASE 0x01f0e000 #define DA8XX_EMAC_CTRL_REG_OFFSET 0x3000 #define DA8XX_EMAC_MOD_REG_OFFSET 0x2000 @@ -762,8 +763,8 @@ static struct resource da8xx_spi0_resources[] = { static struct resource da8xx_spi1_resources[] = { [0] = { - .start = DA8XX_SPI1_BASE, - .end = DA8XX_SPI1_BASE + SZ_4K - 1, + .start = DA830_SPI1_BASE, + .end = DA830_SPI1_BASE + SZ_4K - 1, .flags = IORESOURCE_MEM, }, [1] = { @@ -832,5 +833,10 @@ int __init da8xx_register_spi(int instance, struct spi_board_info *info, da8xx_spi_pdata[instance].num_chipselect = len; + if (instance == 1 && cpu_is_davinci_da850()) { + da8xx_spi1_resources[0].start = DA850_SPI1_BASE; + da8xx_spi1_resources[0].end = DA850_SPI1_BASE + SZ_4K - 1; + } + return platform_device_register(&da8xx_spi_device[instance]); } -- cgit v1.1 From 45b146d746ea1b7f87b023a79d5186d0e87793eb Mon Sep 17 00:00:00 2001 From: Russell King - ARM Linux Date: Fri, 15 Apr 2011 12:32:40 +0000 Subject: ARM: Davinci: Fix I2C build errors Several Davinci platforms select the I2C EEPROM support, but don't select I2C support. This causes I2C EEPROM support to be built into the kernel, but I2C support may not be configured to be built in. This leads to linker errors due to missing I2C symbols. Arrange for I2C to be selected whenever EEPROM_AT24 is selected. Signed-off-by: Russell King Signed-off-by: Sekhar Nori --- arch/arm/mach-davinci/Kconfig | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/arm/mach-davinci/Kconfig b/arch/arm/mach-davinci/Kconfig index 32f1479..c0deaca 100644 --- a/arch/arm/mach-davinci/Kconfig +++ b/arch/arm/mach-davinci/Kconfig @@ -63,6 +63,7 @@ config MACH_DAVINCI_EVM depends on ARCH_DAVINCI_DM644x select MISC_DEVICES select EEPROM_AT24 + select I2C help Configure this option to specify the whether the board used for development is a DM644x EVM @@ -72,6 +73,7 @@ config MACH_SFFSDR depends on ARCH_DAVINCI_DM644x select MISC_DEVICES select EEPROM_AT24 + select I2C help Say Y here to select the Lyrtech Small Form Factor Software Defined Radio (SFFSDR) board. @@ -105,6 +107,7 @@ config MACH_DAVINCI_DM6467_EVM select MACH_DAVINCI_DM6467TEVM select MISC_DEVICES select EEPROM_AT24 + select I2C help Configure this option to specify the whether the board used for development is a DM6467 EVM @@ -118,6 +121,7 @@ config MACH_DAVINCI_DM365_EVM depends on ARCH_DAVINCI_DM365 select MISC_DEVICES select EEPROM_AT24 + select I2C help Configure this option to specify whether the board used for development is a DM365 EVM @@ -129,6 +133,7 @@ config MACH_DAVINCI_DA830_EVM select GPIO_PCF857X select MISC_DEVICES select EEPROM_AT24 + select I2C help Say Y here to select the TI DA830/OMAP-L137/AM17x Evaluation Module. @@ -205,6 +210,7 @@ config MACH_MITYOMAPL138 depends on ARCH_DAVINCI_DA850 select MISC_DEVICES select EEPROM_AT24 + select I2C help Say Y here to select the Critical Link MityDSP-L138/MityARM-1808 System on Module. Information on this SoM may be found at -- cgit v1.1 From bf26c018490c2fce7fe9b629083b96ce0e6ad019 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 7 Apr 2011 16:53:20 +0200 Subject: ptrace: Prepare to fix racy accesses on task breakpoints When a task is traced and is in a stopped state, the tracer may execute a ptrace request to examine the tracee state and get its task struct. Right after, the tracee can be killed and thus its breakpoints released. This can happen concurrently when the tracer is in the middle of reading or modifying these breakpoints, leading to dereferencing a freed pointer. Hence, to prepare the fix, create a generic breakpoint reference holding API. When a reference on the breakpoints of a task is held, the breakpoints won't be released until the last reference is dropped. After that, no more ptrace request on the task's breakpoints can be serviced for the tracer. Reported-by: Oleg Nesterov Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Will Deacon Cc: Prasad Cc: Paul Mundt Cc: v2.6.33.. Link: http://lkml.kernel.org/r/1302284067-7860-2-git-send-email-fweisbec@gmail.com --- include/linux/ptrace.h | 13 ++++++++++++- include/linux/sched.h | 3 +++ kernel/exit.c | 2 +- kernel/ptrace.c | 17 +++++++++++++++++ 4 files changed, 33 insertions(+), 2 deletions(-) diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index a1147e5..9178d5c 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -189,6 +189,10 @@ static inline void ptrace_init_task(struct task_struct *child, bool ptrace) child->ptrace = current->ptrace; __ptrace_link(child, current->parent); } + +#ifdef CONFIG_HAVE_HW_BREAKPOINT + atomic_set(&child->ptrace_bp_refcnt, 1); +#endif } /** @@ -350,6 +354,13 @@ extern int task_current_syscall(struct task_struct *target, long *callno, unsigned long args[6], unsigned int maxargs, unsigned long *sp, unsigned long *pc); -#endif +#ifdef CONFIG_HAVE_HW_BREAKPOINT +extern int ptrace_get_breakpoints(struct task_struct *tsk); +extern void ptrace_put_breakpoints(struct task_struct *tsk); +#else +static inline void ptrace_put_breakpoints(struct task_struct *tsk) { } +#endif /* CONFIG_HAVE_HW_BREAKPOINT */ + +#endif /* __KERNEL */ #endif diff --git a/include/linux/sched.h b/include/linux/sched.h index 18d63ce..781abd1 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1537,6 +1537,9 @@ struct task_struct { unsigned long memsw_nr_pages; /* uncharged mem+swap usage */ } memcg_batch; #endif +#ifdef CONFIG_HAVE_HW_BREAKPOINT + atomic_t ptrace_bp_refcnt; +#endif }; /* Future-safe accessor for struct task_struct's cpus_allowed. */ diff --git a/kernel/exit.c b/kernel/exit.c index f5d2f63..8dd8741 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1016,7 +1016,7 @@ NORET_TYPE void do_exit(long code) /* * FIXME: do that only when needed, using sched_exit tracepoint */ - flush_ptrace_hw_breakpoint(tsk); + ptrace_put_breakpoints(tsk); exit_notify(tsk, group_dead); #ifdef CONFIG_NUMA diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 0fc1eed..dc7ab65 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -22,6 +22,7 @@ #include #include #include +#include /* @@ -879,3 +880,19 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid, return ret; } #endif /* CONFIG_COMPAT */ + +#ifdef CONFIG_HAVE_HW_BREAKPOINT +int ptrace_get_breakpoints(struct task_struct *tsk) +{ + if (atomic_inc_not_zero(&tsk->ptrace_bp_refcnt)) + return 0; + + return -1; +} + +void ptrace_put_breakpoints(struct task_struct *tsk) +{ + if (atomic_dec_and_test(&tsk->ptrace_bp_refcnt)) + flush_ptrace_hw_breakpoint(tsk); +} +#endif /* CONFIG_HAVE_HW_BREAKPOINT */ -- cgit v1.1 From 87dc669ba25777b67796d7262c569429e58b1ed4 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 8 Apr 2011 17:29:36 +0200 Subject: x86, hw_breakpoints: Fix racy access to ptrace breakpoints While the tracer accesses ptrace breakpoints, the child task may concurrently exit due to a SIGKILL and thus release its breakpoints at the same time. We can then dereference some freed pointers. To fix this, hold a reference on the child breakpoints before manipulating them. Reported-by: Oleg Nesterov Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Will Deacon Cc: Prasad Cc: Paul Mundt Cc: v2.6.33.. Link: http://lkml.kernel.org/r/1302284067-7860-3-git-send-email-fweisbec@gmail.com --- arch/x86/kernel/ptrace.c | 36 ++++++++++++++++++++++++++---------- 1 file changed, 26 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 45892dc..f65e5b5 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -608,6 +608,9 @@ static int ptrace_write_dr7(struct task_struct *tsk, unsigned long data) unsigned len, type; struct perf_event *bp; + if (ptrace_get_breakpoints(tsk) < 0) + return -ESRCH; + data &= ~DR_CONTROL_RESERVED; old_dr7 = ptrace_get_dr7(thread->ptrace_bps); restore: @@ -655,6 +658,9 @@ restore: } goto restore; } + + ptrace_put_breakpoints(tsk); + return ((orig_ret < 0) ? orig_ret : rc); } @@ -668,10 +674,17 @@ static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n) if (n < HBP_NUM) { struct perf_event *bp; + + if (ptrace_get_breakpoints(tsk) < 0) + return -ESRCH; + bp = thread->ptrace_bps[n]; if (!bp) - return 0; - val = bp->hw.info.address; + val = 0; + else + val = bp->hw.info.address; + + ptrace_put_breakpoints(tsk); } else if (n == 6) { val = thread->debugreg6; } else if (n == 7) { @@ -686,6 +699,10 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr, struct perf_event *bp; struct thread_struct *t = &tsk->thread; struct perf_event_attr attr; + int err = 0; + + if (ptrace_get_breakpoints(tsk) < 0) + return -ESRCH; if (!t->ptrace_bps[nr]) { ptrace_breakpoint_init(&attr); @@ -709,24 +726,23 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr, * writing for the user. And anyway this is the previous * behaviour. */ - if (IS_ERR(bp)) - return PTR_ERR(bp); + if (IS_ERR(bp)) { + err = PTR_ERR(bp); + goto put; + } t->ptrace_bps[nr] = bp; } else { - int err; - bp = t->ptrace_bps[nr]; attr = bp->attr; attr.bp_addr = addr; err = modify_user_hw_breakpoint(bp, &attr); - if (err) - return err; } - - return 0; +put: + ptrace_put_breakpoints(tsk); + return err; } /* -- cgit v1.1 From 07fa7a0a8a586c01a8b416358c7012dcb9dc688d Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 8 Apr 2011 17:29:36 +0200 Subject: powerpc, hw_breakpoints: Fix racy access to ptrace breakpoints While the tracer accesses ptrace breakpoints, the child task may concurrently exit due to a SIGKILL and thus release its breakpoints at the same time. We can then dereference some freed pointers. To fix this, hold a reference on the child breakpoints before manipulating them. Reported-by: Oleg Nesterov Signed-off-by: Frederic Weisbecker Acked-by: Prasad Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Will Deacon Cc: Paul Mundt Cc: v2.6.33.. Link: http://lkml.kernel.org/r/1302284067-7860-4-git-send-email-fweisbec@gmail.com --- arch/powerpc/kernel/ptrace.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c index 55613e3..4edeeb3 100644 --- a/arch/powerpc/kernel/ptrace.c +++ b/arch/powerpc/kernel/ptrace.c @@ -1591,7 +1591,10 @@ long arch_ptrace(struct task_struct *child, long request, } case PTRACE_SET_DEBUGREG: + if (ptrace_get_breakpoints(child) < 0) + return -ESRCH; ret = ptrace_set_debugreg(child, addr, data); + ptrace_put_breakpoints(child); break; #ifdef CONFIG_PPC64 -- cgit v1.1 From bf0b8f4b55e591ba417c2dbaff42769e1fc773b0 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 8 Apr 2011 17:29:36 +0200 Subject: arm, hw_breakpoints: Fix racy access to ptrace breakpoints While the tracer accesses ptrace breakpoints, the child task may concurrently exit due to a SIGKILL and thus release its breakpoints at the same time. We can then dereference some freed pointers. To fix this, hold a reference on the child breakpoints before manipulating them. Reported-by: Oleg Nesterov Signed-off-by: Frederic Weisbecker Acked-by: Will Deacon Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Prasad Cc: Paul Mundt Link: http://lkml.kernel.org/r/1302284067-7860-5-git-send-email-fweisbec@gmail.com --- arch/arm/kernel/ptrace.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/arm/kernel/ptrace.c b/arch/arm/kernel/ptrace.c index 2bf27f3..8182f45 100644 --- a/arch/arm/kernel/ptrace.c +++ b/arch/arm/kernel/ptrace.c @@ -767,12 +767,20 @@ long arch_ptrace(struct task_struct *child, long request, #ifdef CONFIG_HAVE_HW_BREAKPOINT case PTRACE_GETHBPREGS: + if (ptrace_get_breakpoints(child) < 0) + return -ESRCH; + ret = ptrace_gethbpregs(child, addr, (unsigned long __user *)data); + ptrace_put_breakpoints(child); break; case PTRACE_SETHBPREGS: + if (ptrace_get_breakpoints(child) < 0) + return -ESRCH; + ret = ptrace_sethbpregs(child, addr, (unsigned long __user *)data); + ptrace_put_breakpoints(child); break; #endif -- cgit v1.1 From e0ac8457d020c0289ea566917267da9e5e6d9865 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 8 Apr 2011 17:29:36 +0200 Subject: sh, hw_breakpoints: Fix racy access to ptrace breakpoints While the tracer accesses ptrace breakpoints, the child task may concurrently exit due to a SIGKILL and thus release its breakpoints at the same time. We can then dereference some freed pointers. To fix this, hold a reference on the child breakpoints before manipulating them. Reported-by: Oleg Nesterov Signed-off-by: Frederic Weisbecker Acked-by: Paul Mundt Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Will Deacon Cc: Prasad Link: http://lkml.kernel.org/r/1302284067-7860-6-git-send-email-fweisbec@gmail.com --- arch/sh/kernel/ptrace_32.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/sh/kernel/ptrace_32.c b/arch/sh/kernel/ptrace_32.c index 2130ca6..3d7b209 100644 --- a/arch/sh/kernel/ptrace_32.c +++ b/arch/sh/kernel/ptrace_32.c @@ -117,7 +117,11 @@ void user_enable_single_step(struct task_struct *child) set_tsk_thread_flag(child, TIF_SINGLESTEP); + if (ptrace_get_breakpoints(child) < 0) + return; + set_single_step(child, pc); + ptrace_put_breakpoints(child); } void user_disable_single_step(struct task_struct *child) -- cgit v1.1 From 0972ddb2373d5e127aabdcabd8305eff0242cd0b Mon Sep 17 00:00:00 2001 From: Held Bernhard Date: Sun, 24 Apr 2011 22:07:32 +0000 Subject: net: provide cow_metrics() methods to blackhole dst_ops Since commit 62fa8a846d7d (net: Implement read-only protection and COW'ing of metrics.) the kernel throws an oops. [ 101.620985] BUG: unable to handle kernel NULL pointer dereference at (null) [ 101.621050] IP: [< (null)>] (null) [ 101.621084] PGD 6e53c067 PUD 3dd6a067 PMD 0 [ 101.621122] Oops: 0010 [#1] SMP [ 101.621153] last sysfs file: /sys/devices/virtual/ppp/ppp/uevent [ 101.621192] CPU 2 [ 101.621206] Modules linked in: l2tp_ppp pppox ppp_generic slhc l2tp_netlink l2tp_core deflate zlib_deflate twofish_x86_64 twofish_common des_generic cbc ecb sha1_generic hmac af_key iptable_filter snd_pcm_oss snd_mixer_oss snd_seq snd_seq_device loop snd_hda_codec_hdmi snd_hda_codec_realtek snd_hda_intel snd_hda_codec snd_pcm snd_timer snd i2c_i801 iTCO_wdt psmouse soundcore snd_page_alloc evdev uhci_hcd ehci_hcd thermal [ 101.621552] [ 101.621567] Pid: 5129, comm: openl2tpd Not tainted 2.6.39-rc4-Quad #3 Gigabyte Technology Co., Ltd. G33-DS3R/G33-DS3R [ 101.621637] RIP: 0010:[<0000000000000000>] [< (null)>] (null) [ 101.621684] RSP: 0018:ffff88003ddeba60 EFLAGS: 00010202 [ 101.621716] RAX: ffff88003ddb5600 RBX: ffff88003ddb5600 RCX: 0000000000000020 [ 101.621758] RDX: ffffffff81a69a00 RSI: ffffffff81b7ee61 RDI: ffff88003ddb5600 [ 101.621800] RBP: ffff8800537cd900 R08: 0000000000000000 R09: ffff88003ddb5600 [ 101.621840] R10: 0000000000000005 R11: 0000000000014b38 R12: ffff88003ddb5600 [ 101.621881] R13: ffffffff81b7e480 R14: ffffffff81b7e8b8 R15: ffff88003ddebad8 [ 101.621924] FS: 00007f06e4182700(0000) GS:ffff88007fd00000(0000) knlGS:0000000000000000 [ 101.621971] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 101.622005] CR2: 0000000000000000 CR3: 0000000045274000 CR4: 00000000000006e0 [ 101.622046] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 101.622087] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 [ 101.622129] Process openl2tpd (pid: 5129, threadinfo ffff88003ddea000, task ffff88003de9a280) [ 101.622177] Stack: [ 101.622191] ffffffff81447efa ffff88007d3ded80 ffff88003de9a280 ffff88007d3ded80 [ 101.622245] 0000000000000001 ffff88003ddebbb8 ffffffff8148d5a7 0000000000000212 [ 101.622299] ffff88003dcea000 ffff88003dcea188 ffffffff00000001 ffffffff81b7e480 [ 101.622353] Call Trace: [ 101.622374] [] ? ipv4_blackhole_route+0x1ba/0x210 [ 101.622415] [] ? xfrm_lookup+0x417/0x510 [ 101.622450] [] ? extract_buf+0x9a/0x140 [ 101.622485] [] ? __ip_flush_pending_frames+0x70/0x70 [ 101.622526] [] ? udp_sendmsg+0x62f/0x810 [ 101.622562] [] ? sock_sendmsg+0x116/0x130 [ 101.622599] [] ? find_get_page+0x18/0x90 [ 101.622633] [] ? filemap_fault+0x12a/0x4b0 [ 101.622668] [] ? move_addr_to_kernel+0x64/0x90 [ 101.622706] [] ? verify_iovec+0x7a/0xf0 [ 101.622739] [] ? sys_sendmsg+0x292/0x420 [ 101.622774] [] ? handle_pte_fault+0x8a/0x7c0 [ 101.622810] [] ? __pte_alloc+0xae/0x130 [ 101.622844] [] ? handle_mm_fault+0x138/0x380 [ 101.622880] [] ? do_page_fault+0x189/0x410 [ 101.622915] [] ? sys_getsockname+0xf3/0x110 [ 101.622952] [] ? ip_setsockopt+0x4d/0xa0 [ 101.622986] [] ? sockfd_lookup_light+0x22/0x90 [ 101.623024] [] ? system_call_fastpath+0x16/0x1b [ 101.623060] Code: Bad RIP value. [ 101.623090] RIP [< (null)>] (null) [ 101.623125] RSP [ 101.623146] CR2: 0000000000000000 [ 101.650871] ---[ end trace ca3856a7d8e8dad4 ]--- [ 101.651011] __sk_free: optmem leakage (160 bytes) detected. The oops happens in dst_metrics_write_ptr() include/net/dst.h:124: return dst->ops->cow_metrics(dst, p); dst->ops->cow_metrics is NULL and causes the oops. Provide cow_metrics() methods, like we did in commit 214f45c91bb (net: provide default_advmss() methods to blackhole dst_ops) Signed-off-by: Held Bernhard Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/route.c | 7 +++++++ net/ipv6/route.c | 7 +++++++ 2 files changed, 14 insertions(+) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index c1acf69..99e6e4b 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2690,6 +2690,12 @@ static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu) { } +static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst, + unsigned long old) +{ + return NULL; +} + static struct dst_ops ipv4_dst_blackhole_ops = { .family = AF_INET, .protocol = cpu_to_be16(ETH_P_IP), @@ -2698,6 +2704,7 @@ static struct dst_ops ipv4_dst_blackhole_ops = { .default_mtu = ipv4_blackhole_default_mtu, .default_advmss = ipv4_default_advmss, .update_pmtu = ipv4_rt_blackhole_update_pmtu, + .cow_metrics = ipv4_rt_blackhole_cow_metrics, }; struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 0a5d02a..fd0eec6 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -153,6 +153,12 @@ static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu) { } +static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst, + unsigned long old) +{ + return NULL; +} + static struct dst_ops ip6_dst_blackhole_ops = { .family = AF_INET6, .protocol = cpu_to_be16(ETH_P_IPV6), @@ -161,6 +167,7 @@ static struct dst_ops ip6_dst_blackhole_ops = { .default_mtu = ip6_blackhole_default_mtu, .default_advmss = ip6_default_advmss, .update_pmtu = ip6_rt_blackhole_update_pmtu, + .cow_metrics = ip6_rt_blackhole_cow_metrics, }; static const u32 ip6_template_metrics[RTAX_MAX] = { -- cgit v1.1 From 85723943537bf6a73bdf1140b2088fbe0c17c3c2 Mon Sep 17 00:00:00 2001 From: Wanlong Gao Date: Sat, 23 Apr 2011 22:18:26 +0800 Subject: drivers:base:fix the coding format of memory.c Fix the line longer than 80 of memory_uevent function . Signed-off-by: Wanlong Gao Signed-off-by: Greg Kroah-Hartman --- drivers/base/memory.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 3da6a43..3e9aa3d 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -48,7 +48,8 @@ static const char *memory_uevent_name(struct kset *kset, struct kobject *kobj) return MEMORY_CLASS_NAME; } -static int memory_uevent(struct kset *kset, struct kobject *obj, struct kobj_uevent_env *env) +static int memory_uevent(struct kset *kset, struct kobject *obj, + struct kobj_uevent_env *env) { int retval = 0; -- cgit v1.1 From ad58671cf32c74a8d6e8f51e63e9cf4e7a73bf1e Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Tue, 19 Apr 2011 12:43:45 +0100 Subject: Add a strtobool function matching semantics of existing in kernel equivalents This is a rename of the usr_strtobool proposal, which was a renamed, relocated and fixed version of previous kstrtobool RFC Signed-off-by: Jonathan Cameron Signed-off-by: Greg Kroah-Hartman --- include/linux/string.h | 1 + lib/string.c | 29 +++++++++++++++++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/include/linux/string.h b/include/linux/string.h index a716ee2..a176db2 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -123,6 +123,7 @@ extern char **argv_split(gfp_t gfp, const char *str, int *argcp); extern void argv_free(char **argv); extern bool sysfs_streq(const char *s1, const char *s2); +extern int strtobool(const char *s, bool *res); #ifdef CONFIG_BINARY_PRINTF int vbin_printf(u32 *bin_buf, size_t size, const char *fmt, va_list args); diff --git a/lib/string.c b/lib/string.c index f71bead..01fad9b 100644 --- a/lib/string.c +++ b/lib/string.c @@ -535,6 +535,35 @@ bool sysfs_streq(const char *s1, const char *s2) } EXPORT_SYMBOL(sysfs_streq); +/** + * strtobool - convert common user inputs into boolean values + * @s: input string + * @res: result + * + * This routine returns 0 iff the first character is one of 'Yy1Nn0'. + * Otherwise it will return -EINVAL. Value pointed to by res is + * updated upon finding a match. + */ +int strtobool(const char *s, bool *res) +{ + switch (s[0]) { + case 'y': + case 'Y': + case '1': + *res = true; + break; + case 'n': + case 'N': + case '0': + *res = false; + break; + default: + return -EINVAL; + } + return 0; +} +EXPORT_SYMBOL(strtobool); + #ifndef __HAVE_ARCH_MEMSET /** * memset - Fill a region of memory with the given value -- cgit v1.1 From 8705b48e7159655c116154928fe104fd6561fa94 Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Tue, 19 Apr 2011 12:43:46 +0100 Subject: debugfs: move to new strtobool No functional changes requires that we eat errors from strtobool. If people want to not do this, then it should be fixed at a later date. V2: Simplification suggested by Rusty Russell removes the need for additional variable ret. Signed-off-by: Jonathan Cameron Signed-off-by: Greg Kroah-Hartman --- fs/debugfs/file.c | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index 89d394d..568304d 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -429,25 +429,16 @@ static ssize_t write_file_bool(struct file *file, const char __user *user_buf, { char buf[32]; int buf_size; + bool bv; u32 *val = file->private_data; buf_size = min(count, (sizeof(buf)-1)); if (copy_from_user(buf, user_buf, buf_size)) return -EFAULT; - switch (buf[0]) { - case 'y': - case 'Y': - case '1': - *val = 1; - break; - case 'n': - case 'N': - case '0': - *val = 0; - break; - } - + if (strtobool(buf, &bv) == 0) + *val = bv; + return count; } -- cgit v1.1 From e7e09cd667a43d8287f85d453a16fc0ec1e2c7b7 Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Tue, 19 Apr 2011 12:43:47 +0100 Subject: params.c: Use new strtobool function to process boolean inputs No functional changes. Signed-off-by: Jonathan Cameron Signed-off-by: Greg Kroah-Hartman --- kernel/params.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/kernel/params.c b/kernel/params.c index 7ab388a..6888761 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -297,21 +297,15 @@ EXPORT_SYMBOL(param_ops_charp); int param_set_bool(const char *val, const struct kernel_param *kp) { bool v; + int ret; /* No equals means "set"... */ if (!val) val = "1"; /* One of =[yYnN01] */ - switch (val[0]) { - case 'y': case 'Y': case '1': - v = true; - break; - case 'n': case 'N': case '0': - v = false; - break; - default: - return -EINVAL; - } + ret = strtobool(val, &v); + if (ret) + return ret; if (kp->flags & KPARAM_ISBOOL) *(bool *)kp->arg = v; -- cgit v1.1 From 97232fd666f27d1a1cc560adaac446b0bb57f578 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Tue, 12 Apr 2011 14:55:11 -0400 Subject: staging: olpc: Add The olpc dcon xo1 driver uses udelay() without including . This patch adds it. Signed-off-by: Jeff Mahoney Signed-off-by: Randy Dunlap Cc: Andres Salomon Signed-off-by: Greg Kroah-Hartman --- drivers/staging/olpc_dcon/olpc_dcon_xo_1.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/staging/olpc_dcon/olpc_dcon_xo_1.c b/drivers/staging/olpc_dcon/olpc_dcon_xo_1.c index b5d21f6..22c04ea 100644 --- a/drivers/staging/olpc_dcon/olpc_dcon_xo_1.c +++ b/drivers/staging/olpc_dcon/olpc_dcon_xo_1.c @@ -12,6 +12,7 @@ */ #include #include +#include #include #include "olpc_dcon.h" -- cgit v1.1 From 4f80494daa1f9705c3717a02e7d713ea2f874bd0 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Tue, 12 Apr 2011 15:32:36 -0400 Subject: staging: gma500: Depend on X86 The gma500 driver calls set_pages_uc, which is an x86 pageattr call. Since this driver is only used with Intel x86 motherboard chipsets, make the driver depend on X86. Signed-off-by: Jeff Mahoney Signed-off-by: Greg Kroah-Hartman --- drivers/staging/gma500/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/staging/gma500/Kconfig b/drivers/staging/gma500/Kconfig index 5501eb9..ce8beda 100644 --- a/drivers/staging/gma500/Kconfig +++ b/drivers/staging/gma500/Kconfig @@ -1,6 +1,6 @@ config DRM_PSB tristate "Intel GMA500 KMS Framebuffer" - depends on DRM && PCI + depends on DRM && PCI && X86 select FB_CFB_COPYAREA select FB_CFB_FILLRECT select FB_CFB_IMAGEBLIT -- cgit v1.1 From c6cdaded146875e2e47e946f6592c1775eaee849 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Tue, 12 Apr 2011 21:05:07 -0400 Subject: staging: rts_pstor: Add There are a few files in the rts_pstor driver that use vmalloc/vfree without including the header for it. This patch adds to those files. Signed-off-by: Jeff Mahoney Signed-off-by: Greg Kroah-Hartman --- drivers/staging/rts_pstor/ms.c | 1 + drivers/staging/rts_pstor/rtsx_chip.c | 1 + drivers/staging/rts_pstor/rtsx_scsi.c | 1 + drivers/staging/rts_pstor/xd.c | 1 + 4 files changed, 4 insertions(+) diff --git a/drivers/staging/rts_pstor/ms.c b/drivers/staging/rts_pstor/ms.c index 810e170..d89795c 100644 --- a/drivers/staging/rts_pstor/ms.c +++ b/drivers/staging/rts_pstor/ms.c @@ -23,6 +23,7 @@ #include #include #include +#include #include "rtsx.h" #include "rtsx_transport.h" diff --git a/drivers/staging/rts_pstor/rtsx_chip.c b/drivers/staging/rts_pstor/rtsx_chip.c index d2f1c71..755be43 100644 --- a/drivers/staging/rts_pstor/rtsx_chip.c +++ b/drivers/staging/rts_pstor/rtsx_chip.c @@ -24,6 +24,7 @@ #include #include #include +#include #include "rtsx.h" #include "rtsx_transport.h" diff --git a/drivers/staging/rts_pstor/rtsx_scsi.c b/drivers/staging/rts_pstor/rtsx_scsi.c index 20c2464..7de1fae 100644 --- a/drivers/staging/rts_pstor/rtsx_scsi.c +++ b/drivers/staging/rts_pstor/rtsx_scsi.c @@ -23,6 +23,7 @@ #include #include #include +#include #include "rtsx.h" #include "rtsx_transport.h" diff --git a/drivers/staging/rts_pstor/xd.c b/drivers/staging/rts_pstor/xd.c index 7bcd468..9f3add1 100644 --- a/drivers/staging/rts_pstor/xd.c +++ b/drivers/staging/rts_pstor/xd.c @@ -23,6 +23,7 @@ #include #include #include +#include #include "rtsx.h" #include "rtsx_transport.h" -- cgit v1.1 From 1035117d2a47583f9539c28bf6ce5f677946e172 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Tue, 12 Apr 2011 21:05:33 -0400 Subject: staging: rts_pstor: use #ifdef instead of #if This patch fixes a number of the following warnings: warning: "CONFIG_RTS_PSTOR_DEBUG" is not defined The code uses '#if CONFIG_RTS_PSTOR_DEBUG' when it should be using '#ifdef' Signed-off-by: Jeff Mahoney Signed-off-by: Greg Kroah-Hartman --- drivers/staging/rts_pstor/debug.h | 2 +- drivers/staging/rts_pstor/rtsx_chip.c | 4 ++-- drivers/staging/rts_pstor/sd.c | 4 ++-- drivers/staging/rts_pstor/trace.h | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/staging/rts_pstor/debug.h b/drivers/staging/rts_pstor/debug.h index e1408b0..ab305be 100644 --- a/drivers/staging/rts_pstor/debug.h +++ b/drivers/staging/rts_pstor/debug.h @@ -28,7 +28,7 @@ #define RTSX_STOR "rts_pstor: " -#if CONFIG_RTS_PSTOR_DEBUG +#ifdef CONFIG_RTS_PSTOR_DEBUG #define RTSX_DEBUGP(x...) printk(KERN_DEBUG RTSX_STOR x) #define RTSX_DEBUGPN(x...) printk(KERN_DEBUG x) #define RTSX_DEBUGPX(x...) printk(x) diff --git a/drivers/staging/rts_pstor/rtsx_chip.c b/drivers/staging/rts_pstor/rtsx_chip.c index 755be43..4e60780 100644 --- a/drivers/staging/rts_pstor/rtsx_chip.c +++ b/drivers/staging/rts_pstor/rtsx_chip.c @@ -1312,11 +1312,11 @@ void rtsx_polling_func(struct rtsx_chip *chip) #ifdef SUPPORT_OCP if (CHECK_LUN_MODE(chip, SD_MS_2LUN)) { - #if CONFIG_RTS_PSTOR_DEBUG +#ifdef CONFIG_RTS_PSTOR_DEBUG if (chip->ocp_stat & (SD_OC_NOW | SD_OC_EVER | MS_OC_NOW | MS_OC_EVER)) { RTSX_DEBUGP("Over current, OCPSTAT is 0x%x\n", chip->ocp_stat); } - #endif +#endif if (chip->ocp_stat & (SD_OC_NOW | SD_OC_EVER)) { if (chip->card_exist & SD_CARD) { diff --git a/drivers/staging/rts_pstor/sd.c b/drivers/staging/rts_pstor/sd.c index 8d066bd..b1277a6 100644 --- a/drivers/staging/rts_pstor/sd.c +++ b/drivers/staging/rts_pstor/sd.c @@ -909,7 +909,7 @@ static int sd_change_phase(struct rtsx_chip *chip, u8 sample_point, u8 tune_dir) RTSX_WRITE_REG(chip, SD_VPCLK0_CTL, PHASE_NOT_RESET, PHASE_NOT_RESET); RTSX_WRITE_REG(chip, CLK_CTL, CHANGE_CLK, 0); } else { -#if CONFIG_RTS_PSTOR_DEBUG +#ifdef CONFIG_RTS_PSTOR_DEBUG rtsx_read_register(chip, SD_VP_CTL, &val); RTSX_DEBUGP("SD_VP_CTL: 0x%x\n", val); rtsx_read_register(chip, SD_DCMPS_CTL, &val); @@ -958,7 +958,7 @@ static int sd_change_phase(struct rtsx_chip *chip, u8 sample_point, u8 tune_dir) return STATUS_SUCCESS; Fail: -#if CONFIG_RTS_PSTOR_DEBUG +#ifdef CONFIG_RTS_PSTOR_DEBUG rtsx_read_register(chip, SD_VP_CTL, &val); RTSX_DEBUGP("SD_VP_CTL: 0x%x\n", val); rtsx_read_register(chip, SD_DCMPS_CTL, &val); diff --git a/drivers/staging/rts_pstor/trace.h b/drivers/staging/rts_pstor/trace.h index 2c668ba..bc83b49 100644 --- a/drivers/staging/rts_pstor/trace.h +++ b/drivers/staging/rts_pstor/trace.h @@ -82,7 +82,7 @@ do { \ #define TRACE_GOTO(chip, label) goto label #endif -#if CONFIG_RTS_PSTOR_DEBUG +#ifdef CONFIG_RTS_PSTOR_DEBUG static inline void rtsx_dump(u8 *buf, int buf_len) { int i; -- cgit v1.1 From 9f908a9eaa94f07265811c59b740a4608d3dfc96 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Tue, 12 Apr 2011 22:57:57 -0400 Subject: staging: ft1000: Remove unnecessary EXPORT_SYMBOLs ft1000-pcmcia uses EXPORT_SYMBOL unnecessarily for sharing symbols inside the same module. For some reason, this is causing section conflicts on ia64 as well, even though neither are static. error: __ksymtab_stop_ft1000_card causes a section type conflict error: __ksymtab_init_ft1000_card causes a section type conflict Signed-off-by: Jeff Mahoney Signed-off-by: Greg Kroah-Hartman --- drivers/staging/ft1000/ft1000-pcmcia/ft1000_hw.c | 4 ---- drivers/staging/ft1000/ft1000-pcmcia/ft1000_proc.c | 3 --- 2 files changed, 7 deletions(-) diff --git a/drivers/staging/ft1000/ft1000-pcmcia/ft1000_hw.c b/drivers/staging/ft1000/ft1000-pcmcia/ft1000_hw.c index eeb7dd4..830822f 100644 --- a/drivers/staging/ft1000/ft1000-pcmcia/ft1000_hw.c +++ b/drivers/staging/ft1000/ft1000-pcmcia/ft1000_hw.c @@ -2288,7 +2288,3 @@ err_dev: free_netdev(dev); return NULL; } - -EXPORT_SYMBOL(init_ft1000_card); -EXPORT_SYMBOL(stop_ft1000_card); -EXPORT_SYMBOL(flarion_ft1000_cnt); diff --git a/drivers/staging/ft1000/ft1000-pcmcia/ft1000_proc.c b/drivers/staging/ft1000/ft1000-pcmcia/ft1000_proc.c index 935608e..bdfb1ae 100644 --- a/drivers/staging/ft1000/ft1000-pcmcia/ft1000_proc.c +++ b/drivers/staging/ft1000/ft1000-pcmcia/ft1000_proc.c @@ -214,6 +214,3 @@ void ft1000CleanupProc(struct net_device *dev) remove_proc_entry(FT1000_PROC, init_net.proc_net); unregister_netdevice_notifier(&ft1000_netdev_notifier); } - -EXPORT_SYMBOL(ft1000InitProc); -EXPORT_SYMBOL(ft1000CleanupProc); -- cgit v1.1 From d1b2e95ab016a0c5b01748986f6ce42e9d11cab2 Mon Sep 17 00:00:00 2001 From: Max Vozeler Date: Mon, 18 Apr 2011 21:44:10 +0200 Subject: staging: usbip: vhci: fix oops on subsequent attach vhci_rx/vhci_tx threads are created once but stopped each time the vdev is shut down. On subsequent attach wake_up_process() oopses trying to access the stopped threads. This patch does as before the kthread conversion which is to create the threads each time a device is attached and stop the threads when the device is shut down. Signed-off-by: Max Vozeler Cc: Arnd Bergmann Cc: Takahiro Hirofuchi Cc: Arjan Mels Signed-off-by: Greg Kroah-Hartman --- drivers/staging/usbip/vhci_hcd.c | 9 ++++----- drivers/staging/usbip/vhci_sysfs.c | 7 ++++--- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/staging/usbip/vhci_hcd.c b/drivers/staging/usbip/vhci_hcd.c index 0f02a4b..14caae2 100644 --- a/drivers/staging/usbip/vhci_hcd.c +++ b/drivers/staging/usbip/vhci_hcd.c @@ -876,8 +876,10 @@ static void vhci_shutdown_connection(struct usbip_device *ud) } /* kill threads related to this sdev, if v.c. exists */ - kthread_stop(vdev->ud.tcp_rx); - kthread_stop(vdev->ud.tcp_tx); + if (vdev->ud.tcp_rx) + kthread_stop(vdev->ud.tcp_rx); + if (vdev->ud.tcp_tx) + kthread_stop(vdev->ud.tcp_tx); usbip_uinfo("stop threads\n"); @@ -949,9 +951,6 @@ static void vhci_device_init(struct vhci_device *vdev) { memset(vdev, 0, sizeof(*vdev)); - vdev->ud.tcp_rx = kthread_create(vhci_rx_loop, &vdev->ud, "vhci_rx"); - vdev->ud.tcp_tx = kthread_create(vhci_tx_loop, &vdev->ud, "vhci_tx"); - vdev->ud.side = USBIP_VHCI; vdev->ud.status = VDEV_ST_NULL; /* vdev->ud.lock = SPIN_LOCK_UNLOCKED; */ diff --git a/drivers/staging/usbip/vhci_sysfs.c b/drivers/staging/usbip/vhci_sysfs.c index 3f2459f..e2dadbd 100644 --- a/drivers/staging/usbip/vhci_sysfs.c +++ b/drivers/staging/usbip/vhci_sysfs.c @@ -21,6 +21,7 @@ #include "vhci.h" #include +#include /* TODO: refine locking ?*/ @@ -220,13 +221,13 @@ static ssize_t store_attach(struct device *dev, struct device_attribute *attr, vdev->ud.tcp_socket = socket; vdev->ud.status = VDEV_ST_NOTASSIGNED; - wake_up_process(vdev->ud.tcp_rx); - wake_up_process(vdev->ud.tcp_tx); - spin_unlock(&vdev->ud.lock); spin_unlock(&the_controller->lock); /* end the lock */ + vdev->ud.tcp_rx = kthread_run(vhci_rx_loop, &vdev->ud, "vhci_rx"); + vdev->ud.tcp_tx = kthread_run(vhci_tx_loop, &vdev->ud, "vhci_tx"); + rh_port_connect(rhport, speed); return count; -- cgit v1.1 From 95c2b71b7b7422749950e3d497a894957d724b16 Mon Sep 17 00:00:00 2001 From: Peter Foley Date: Sun, 24 Apr 2011 17:15:09 -0400 Subject: staging: solo6x10: add select SND_PCM to fix build error This patch fixes a build error when SND_PCM is not set by adding a select statment. Signed-off-by: Peter Foley Acked-By: Ben Collins Signed-off-by: Greg Kroah-Hartman --- drivers/staging/solo6x10/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/staging/solo6x10/Kconfig b/drivers/staging/solo6x10/Kconfig index 2cf77c9..03dcac4 100644 --- a/drivers/staging/solo6x10/Kconfig +++ b/drivers/staging/solo6x10/Kconfig @@ -2,6 +2,7 @@ config SOLO6X10 tristate "Softlogic 6x10 MPEG codec cards" depends on PCI && VIDEO_DEV && SND && I2C select VIDEOBUF_DMA_SG + select SND_PCM ---help--- This driver supports the Softlogic based MPEG-4 and h.264 codec codec cards. -- cgit v1.1 From b27b8ea853c4c1c1d5d95448cf69b3f10a9558d4 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 19 Apr 2011 11:26:35 -0700 Subject: staging: intel_sst: intelmid needs delay.h intel_sst drivers need to #include so that they build cleanly: drivers/staging/intel_sst/intelmid_v1_control.c:188: error: implicit declaration of function 'msleep' drivers/staging/intel_sst/intelmid_v2_control.c:172: error: implicit declaration of function 'msleep' Signed-off-by: Randy Dunlap Cc: Vinod Koul Cc: Harsha Priya Cc: KP Jeeja Cc: Dharageswari R Signed-off-by: Greg Kroah-Hartman --- drivers/staging/intel_sst/intelmid_v1_control.c | 1 + drivers/staging/intel_sst/intelmid_v2_control.c | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/staging/intel_sst/intelmid_v1_control.c b/drivers/staging/intel_sst/intelmid_v1_control.c index 9cc15c1..1ea8142 100644 --- a/drivers/staging/intel_sst/intelmid_v1_control.c +++ b/drivers/staging/intel_sst/intelmid_v1_control.c @@ -28,6 +28,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include +#include #include #include #include diff --git a/drivers/staging/intel_sst/intelmid_v2_control.c b/drivers/staging/intel_sst/intelmid_v2_control.c index 26d815a6..3c6b3ab 100644 --- a/drivers/staging/intel_sst/intelmid_v2_control.c +++ b/drivers/staging/intel_sst/intelmid_v2_control.c @@ -29,6 +29,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include +#include #include #include "intel_sst.h" #include "intelmid_snd_control.h" -- cgit v1.1 From 049cfaaa47cb9b796bbc298869c0a27d434bb766 Mon Sep 17 00:00:00 2001 From: Ben Gardiner Date: Thu, 21 Apr 2011 14:19:01 -0400 Subject: ASoC: davinci-mcasp: correct tdm_slots limit The current check for the number of tdm-slots specified by platform data is always true (x >= 2 || x <= 32); therefore the else branch that warns of an incorrect number of slots can never be taken. Check that the number of tdm slots specified by platform data is between 2 and 32, inclusive. Signed-off-by: Ben Gardiner Reviewed-by: James Nuss Acked-by: Liam Girdwood Signed-off-by: Mark Brown --- sound/soc/davinci/davinci-mcasp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/soc/davinci/davinci-mcasp.c b/sound/soc/davinci/davinci-mcasp.c index a5af834..1456a17 100644 --- a/sound/soc/davinci/davinci-mcasp.c +++ b/sound/soc/davinci/davinci-mcasp.c @@ -644,7 +644,7 @@ static void davinci_hw_param(struct davinci_audio_dev *dev, int stream) mcasp_set_reg(dev->base + DAVINCI_MCASP_TXTDM_REG, mask); mcasp_set_bits(dev->base + DAVINCI_MCASP_TXFMT_REG, TXORD); - if ((dev->tdm_slots >= 2) || (dev->tdm_slots <= 32)) + if ((dev->tdm_slots >= 2) && (dev->tdm_slots <= 32)) mcasp_mod_bits(dev->base + DAVINCI_MCASP_TXFMCTL_REG, FSXMOD(dev->tdm_slots), FSXMOD(0x1FF)); else @@ -660,7 +660,7 @@ static void davinci_hw_param(struct davinci_audio_dev *dev, int stream) AHCLKRE); mcasp_set_reg(dev->base + DAVINCI_MCASP_RXTDM_REG, mask); - if ((dev->tdm_slots >= 2) || (dev->tdm_slots <= 32)) + if ((dev->tdm_slots >= 2) && (dev->tdm_slots <= 32)) mcasp_mod_bits(dev->base + DAVINCI_MCASP_RXFMCTL_REG, FSRMOD(dev->tdm_slots), FSRMOD(0x1FF)); else -- cgit v1.1 From 9595c8f035829d0c5deffbfdc6819d6797b3b402 Mon Sep 17 00:00:00 2001 From: Ben Gardiner Date: Thu, 21 Apr 2011 14:19:02 -0400 Subject: davinci-mcasp: use bitfield definitions for PDIR The current driver creates value for set/clr of PDIR using (x<<26) instead of the #defines that are convieniently made available. Update the driver to use the bitfield definitions of PDIR. There is no functional change introduced by this patch. Signed-off-by: Ben Gardiner Reviewed-by: James Nuss Acked-by: Liam Girdwood Signed-off-by: Mark Brown --- sound/soc/davinci/davinci-mcasp.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/sound/soc/davinci/davinci-mcasp.c b/sound/soc/davinci/davinci-mcasp.c index 1456a17..2b637e0 100644 --- a/sound/soc/davinci/davinci-mcasp.c +++ b/sound/soc/davinci/davinci-mcasp.c @@ -434,7 +434,8 @@ static int davinci_mcasp_set_dai_fmt(struct snd_soc_dai *cpu_dai, mcasp_set_bits(base + DAVINCI_MCASP_ACLKRCTL_REG, ACLKRE); mcasp_set_bits(base + DAVINCI_MCASP_RXFMCTL_REG, AFSRE); - mcasp_set_bits(base + DAVINCI_MCASP_PDIR_REG, (0x7 << 26)); + mcasp_set_bits(base + DAVINCI_MCASP_PDIR_REG, + ACLKX | AHCLKX | AFSX); break; case SND_SOC_DAIFMT_CBM_CFS: /* codec is clock master and frame slave */ @@ -444,7 +445,8 @@ static int davinci_mcasp_set_dai_fmt(struct snd_soc_dai *cpu_dai, mcasp_set_bits(base + DAVINCI_MCASP_ACLKRCTL_REG, ACLKRE); mcasp_set_bits(base + DAVINCI_MCASP_RXFMCTL_REG, AFSRE); - mcasp_set_bits(base + DAVINCI_MCASP_PDIR_REG, (0x2d << 26)); + mcasp_set_bits(base + DAVINCI_MCASP_PDIR_REG, + ACLKX | AFSX | ACLKR | AFSR); break; case SND_SOC_DAIFMT_CBM_CFM: /* codec is clock and frame master */ @@ -454,7 +456,8 @@ static int davinci_mcasp_set_dai_fmt(struct snd_soc_dai *cpu_dai, mcasp_clr_bits(base + DAVINCI_MCASP_ACLKRCTL_REG, ACLKRE); mcasp_clr_bits(base + DAVINCI_MCASP_RXFMCTL_REG, AFSRE); - mcasp_clr_bits(base + DAVINCI_MCASP_PDIR_REG, (0x3f << 26)); + mcasp_clr_bits(base + DAVINCI_MCASP_PDIR_REG, + ACLKX | AHCLKX | AFSX | ACLKR | AHCLKR | AFSR); break; default: -- cgit v1.1 From a90f549e25fa77544aaff18bdf534912f3090d39 Mon Sep 17 00:00:00 2001 From: Ben Gardiner Date: Thu, 21 Apr 2011 14:19:03 -0400 Subject: davinci-mcasp: fix _CBM_CFS hw_params The current davinci_mcasp_set_dai_fmt() sets bits ACLKXE and ACLKRE (CLKXM and CLKRM as they are reffered to in SPRUFM1 [1]) for codec clock-slave/ frame-slave mode (_CBS_CFS) which selects internally generated bit-clock and frame-sync signals; however, it does the same thing again for codec clock-master/frame-slave mode (_CBM_CFS) in the very next case statement which is incorrectly selecting internally generated bit-clocks in this mode. For codec clock-master/frame-slave mode (_CBM_CFS), clear bits ACLKXE and ACLKRE to select externally-generated bit-clocks. [1] http://www.ti.com/litv/pdf/sprufm1 Signed-off-by: Ben Gardiner Reviewed-by: James Nuss Acked-by: Liam Girdwood Signed-off-by: Mark Brown --- sound/soc/davinci/davinci-mcasp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/soc/davinci/davinci-mcasp.c b/sound/soc/davinci/davinci-mcasp.c index 2b637e0..09c4ff9 100644 --- a/sound/soc/davinci/davinci-mcasp.c +++ b/sound/soc/davinci/davinci-mcasp.c @@ -439,10 +439,10 @@ static int davinci_mcasp_set_dai_fmt(struct snd_soc_dai *cpu_dai, break; case SND_SOC_DAIFMT_CBM_CFS: /* codec is clock master and frame slave */ - mcasp_set_bits(base + DAVINCI_MCASP_ACLKXCTL_REG, ACLKXE); + mcasp_clr_bits(base + DAVINCI_MCASP_ACLKXCTL_REG, ACLKXE); mcasp_set_bits(base + DAVINCI_MCASP_TXFMCTL_REG, AFSXE); - mcasp_set_bits(base + DAVINCI_MCASP_ACLKRCTL_REG, ACLKRE); + mcasp_clr_bits(base + DAVINCI_MCASP_ACLKRCTL_REG, ACLKRE); mcasp_set_bits(base + DAVINCI_MCASP_RXFMCTL_REG, AFSRE); mcasp_set_bits(base + DAVINCI_MCASP_PDIR_REG, -- cgit v1.1 From db92f43745dc07acd05ca64a06075801c042cb57 Mon Sep 17 00:00:00 2001 From: Ben Gardiner Date: Thu, 21 Apr 2011 14:19:04 -0400 Subject: davinci-mcasp: fix _CBM_CFS pin directions The current davinci_mcasp_set_dai_fmt() sets bits ACLKX and ACLKR in the PDIR register for the codec clock-master/frame-slave mode; however, this results in the ACLKX and ACLKR pins being outputs according to SPRUFM1 [1] which conflicts with "codec is clock master." Similarly to the previous patch in this series, "fix _CBM_CFS hw_params" -- For codec clock-master/frame-slave mode (_CMB_CFS), clear bits ACLKX and ACLKR in the PDIR register to set the pins as inputs and hence allow externally sourced bit-clocks. [1] http://www.ti.com/litv/pdf/sprufm1 Signed-off-by: Ben Gardiner Reviewed-by: James Nuss Acked-by: Liam Girdwood Signed-off-by: Mark Brown --- sound/soc/davinci/davinci-mcasp.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sound/soc/davinci/davinci-mcasp.c b/sound/soc/davinci/davinci-mcasp.c index 09c4ff9..4ddc6d3 100644 --- a/sound/soc/davinci/davinci-mcasp.c +++ b/sound/soc/davinci/davinci-mcasp.c @@ -445,8 +445,10 @@ static int davinci_mcasp_set_dai_fmt(struct snd_soc_dai *cpu_dai, mcasp_clr_bits(base + DAVINCI_MCASP_ACLKRCTL_REG, ACLKRE); mcasp_set_bits(base + DAVINCI_MCASP_RXFMCTL_REG, AFSRE); + mcasp_clr_bits(base + DAVINCI_MCASP_PDIR_REG, + ACLKX | ACLKR); mcasp_set_bits(base + DAVINCI_MCASP_PDIR_REG, - ACLKX | AFSX | ACLKR | AFSR); + AFSX | AFSR); break; case SND_SOC_DAIFMT_CBM_CFM: /* codec is clock and frame master */ -- cgit v1.1 From 1437f5bca3c2d162f058cba37dfbeb20f619040d Mon Sep 17 00:00:00 2001 From: Hillf Danton Date: Sat, 23 Apr 2011 21:29:05 +0800 Subject: sched: Remove noop in alloc_rt_sched_group() The rq varible, though computed for each possible cpu, has nothing to do in the function, so it can be removed. This also eliminates a build warning. Signed-off-by: Hillf Danton Reviewed-by: Yong Zhang Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/BANLkTin-FfQfqW5ym1iuEmrk8s777Y1LAg@mail.gmail.com Signed-off-by: Ingo Molnar --- kernel/sched.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 9cde2dd..f11a2a5 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8226,7 +8226,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) { struct rt_rq *rt_rq; struct sched_rt_entity *rt_se; - struct rq *rq; int i; tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL); @@ -8240,8 +8239,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) ktime_to_ns(def_rt_bandwidth.rt_period), 0); for_each_possible_cpu(i) { - rq = cpu_rq(i); - rt_rq = kzalloc_node(sizeof(struct rt_rq), GFP_KERNEL, cpu_to_node(i)); if (!rt_rq) -- cgit v1.1 From 94403f8863d0d1d2005291b2ef0719c2534aa303 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 24 Apr 2011 08:18:31 +0200 Subject: perf events: Add stalled cycles generic event - PERF_COUNT_HW_STALLED_CYCLES The new PERF_COUNT_HW_STALLED_CYCLES event tries to approximate cycles the CPU does nothing useful, because it is stalled on a cache-miss or some other condition. Acked-by: Peter Zijlstra Acked-by: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Link: http://lkml.kernel.org/n/tip-fue11vymwqsoo5to72jxxjyl@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel.c | 3 +++ include/linux/perf_event.h | 1 + tools/perf/util/parse-events.c | 1 + tools/perf/util/python.c | 1 + 4 files changed, 6 insertions(+) diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 9ae4a2a..efa2704 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1413,6 +1413,9 @@ static __init int intel_pmu_init(void) x86_pmu.enable_all = intel_pmu_nhm_enable_all; x86_pmu.extra_regs = intel_nehalem_extra_regs; + /* Install the stalled-cycles event: 0xff: All reasons, 0xa2: Resource stalls */ + intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES] = 0xffa2; + if (ebx & 0x40) { /* * Erratum AAJ80 detected, we work it around by using diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index ee9f1e7..ac636dd 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -52,6 +52,7 @@ enum perf_hw_id { PERF_COUNT_HW_BRANCH_INSTRUCTIONS = 4, PERF_COUNT_HW_BRANCH_MISSES = 5, PERF_COUNT_HW_BUS_CYCLES = 6, + PERF_COUNT_HW_STALLED_CYCLES = 7, PERF_COUNT_HW_MAX, /* non-ABI */ }; diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 952b4ae..1869e4c 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -38,6 +38,7 @@ static struct event_symbol event_symbols[] = { { CHW(BRANCH_INSTRUCTIONS), "branch-instructions", "branches" }, { CHW(BRANCH_MISSES), "branch-misses", "" }, { CHW(BUS_CYCLES), "bus-cycles", "" }, + { CHW(STALLED_CYCLES), "stalled-cycles", "" }, { CSW(CPU_CLOCK), "cpu-clock", "" }, { CSW(TASK_CLOCK), "task-clock", "" }, diff --git a/tools/perf/util/python.c b/tools/perf/util/python.c index f5e3845..406f613 100644 --- a/tools/perf/util/python.c +++ b/tools/perf/util/python.c @@ -798,6 +798,7 @@ static struct { { "COUNT_HW_BRANCH_INSTRUCTIONS", PERF_COUNT_HW_BRANCH_INSTRUCTIONS }, { "COUNT_HW_BRANCH_MISSES", PERF_COUNT_HW_BRANCH_MISSES }, { "COUNT_HW_BUS_CYCLES", PERF_COUNT_HW_BUS_CYCLES }, + { "COUNT_HW_STALLED_CYCLES", PERF_COUNT_HW_STALLED_CYCLES }, { "COUNT_HW_CACHE_L1D", PERF_COUNT_HW_CACHE_L1D }, { "COUNT_HW_CACHE_L1I", PERF_COUNT_HW_CACHE_L1I }, { "COUNT_HW_CACHE_LL", PERF_COUNT_HW_CACHE_LL }, -- cgit v1.1 From 5c543e3c442d6382db127152c7096ca6a55283de Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 27 Apr 2011 12:02:04 +0200 Subject: perf events, x86: Mark constrant tables read mostly Various constraint tables were not marked read-mostly. Acked-by: Peter Zijlstra Acked-by: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Link: http://lkml.kernel.org/n/tip-wpqwwvmhxucy5e718wnamjiv@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index efa2704..067a48b 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -36,7 +36,7 @@ static u64 intel_perfmon_event_map[PERF_COUNT_HW_MAX] __read_mostly = [PERF_COUNT_HW_BUS_CYCLES] = 0x013c, }; -static struct event_constraint intel_core_event_constraints[] = +static struct event_constraint intel_core_event_constraints[] __read_mostly = { INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */ INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ @@ -47,7 +47,7 @@ static struct event_constraint intel_core_event_constraints[] = EVENT_CONSTRAINT_END }; -static struct event_constraint intel_core2_event_constraints[] = +static struct event_constraint intel_core2_event_constraints[] __read_mostly = { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ @@ -70,7 +70,7 @@ static struct event_constraint intel_core2_event_constraints[] = EVENT_CONSTRAINT_END }; -static struct event_constraint intel_nehalem_event_constraints[] = +static struct event_constraint intel_nehalem_event_constraints[] __read_mostly = { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ @@ -86,19 +86,19 @@ static struct event_constraint intel_nehalem_event_constraints[] = EVENT_CONSTRAINT_END }; -static struct extra_reg intel_nehalem_extra_regs[] = +static struct extra_reg intel_nehalem_extra_regs[] __read_mostly = { INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff), EVENT_EXTRA_END }; -static struct event_constraint intel_nehalem_percore_constraints[] = +static struct event_constraint intel_nehalem_percore_constraints[] __read_mostly = { INTEL_EVENT_CONSTRAINT(0xb7, 0), EVENT_CONSTRAINT_END }; -static struct event_constraint intel_westmere_event_constraints[] = +static struct event_constraint intel_westmere_event_constraints[] __read_mostly = { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ @@ -110,7 +110,7 @@ static struct event_constraint intel_westmere_event_constraints[] = EVENT_CONSTRAINT_END }; -static struct event_constraint intel_snb_event_constraints[] = +static struct event_constraint intel_snb_event_constraints[] __read_mostly = { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ @@ -123,21 +123,21 @@ static struct event_constraint intel_snb_event_constraints[] = EVENT_CONSTRAINT_END }; -static struct extra_reg intel_westmere_extra_regs[] = +static struct extra_reg intel_westmere_extra_regs[] __read_mostly = { INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff), INTEL_EVENT_EXTRA_REG(0xbb, MSR_OFFCORE_RSP_1, 0xffff), EVENT_EXTRA_END }; -static struct event_constraint intel_westmere_percore_constraints[] = +static struct event_constraint intel_westmere_percore_constraints[] __read_mostly = { INTEL_EVENT_CONSTRAINT(0xb7, 0), INTEL_EVENT_CONSTRAINT(0xbb, 0), EVENT_CONSTRAINT_END }; -static struct event_constraint intel_gen_event_constraints[] = +static struct event_constraint intel_gen_event_constraints[] __read_mostly = { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ -- cgit v1.1 From 11ba2b85f506306c8dfc9fe144aa4ddc43242855 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 24 Apr 2011 15:05:10 +0200 Subject: perf stat: Print stalled cycles percentage Print: 611,527 cycles 400,553 instructions # ( 0.71 instructions per cycle ) 77,809 stalled-cycles # ( 12.71% of all cycles ) 0.000610987 seconds time elapsed Acked-by: Peter Zijlstra Acked-by: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Signed-off-by: Ingo Molnar Link: http://lkml.kernel.org/n/tip-fd6x8r1cpyb6zhlrc4ix8m45@git.kernel.org --- tools/perf/builtin-stat.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 03f0e45..3a29041 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -442,7 +442,7 @@ static void abs_printout(int cpu, struct perf_evsel *evsel, double avg) if (total) ratio = avg / total; - fprintf(stderr, " # %10.3f IPC ", ratio); + fprintf(stderr, " # ( %4.2f instructions per cycle )", ratio); } else if (perf_evsel__match(evsel, HARDWARE, HW_BRANCH_MISSES) && runtime_branches_stats[cpu].n != 0) { total = avg_stats(&runtime_branches_stats[cpu]); @@ -450,7 +450,7 @@ static void abs_printout(int cpu, struct perf_evsel *evsel, double avg) if (total) ratio = avg * 100 / total; - fprintf(stderr, " # %10.3f %% ", ratio); + fprintf(stderr, " # %10.3f %%", ratio); } else if (runtime_nsecs_stats[cpu].n != 0) { total = avg_stats(&runtime_nsecs_stats[cpu]); @@ -459,6 +459,13 @@ static void abs_printout(int cpu, struct perf_evsel *evsel, double avg) ratio = 1000.0 * avg / total; fprintf(stderr, " # %10.3f M/sec", ratio); + } else if (perf_evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES)) { + total = avg_stats(&runtime_cycles_stats[cpu]); + + if (total) + ratio = avg / total * 100.0; + + fprintf(stderr, " # (%5.2f%% of all cycles )", ratio); } } -- cgit v1.1 From d58f4c82fed69fdd4a79fa54fe17fd14d98c27aa Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 27 Apr 2011 03:42:18 +0200 Subject: perf stat: Print cache misses as percentage Before: 113,393,041 cache-references # 83.636 M/sec 7,052,454 cache-misses # 5.202 M/sec After: 112,589,441 cache-references # 87.925 M/sec 6,556,354 cache-misses # 5.823 % misses/hits percentages are more expressive than absolute numbers or rates. (Also prettify the CPUs printout line to not have a trailing whitespace.) Acked-by: Peter Zijlstra Acked-by: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Link: http://lkml.kernel.org/n/tip-axm28f43x439bl41zkvfzd63@git.kernel.org Signed-off-by: Ingo Molnar --- tools/perf/builtin-stat.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 3a29041..0de3a20 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -157,6 +157,7 @@ static double stddev_stats(struct stats *stats) struct stats runtime_nsecs_stats[MAX_NR_CPUS]; struct stats runtime_cycles_stats[MAX_NR_CPUS]; struct stats runtime_branches_stats[MAX_NR_CPUS]; +struct stats runtime_cacherefs_stats[MAX_NR_CPUS]; struct stats walltime_nsecs_stats; static int create_perf_stat_counter(struct perf_evsel *evsel) @@ -219,10 +220,12 @@ static int read_counter_aggr(struct perf_evsel *counter) */ if (perf_evsel__match(counter, SOFTWARE, SW_TASK_CLOCK)) update_stats(&runtime_nsecs_stats[0], count[0]); - if (perf_evsel__match(counter, HARDWARE, HW_CPU_CYCLES)) + else if (perf_evsel__match(counter, HARDWARE, HW_CPU_CYCLES)) update_stats(&runtime_cycles_stats[0], count[0]); - if (perf_evsel__match(counter, HARDWARE, HW_BRANCH_INSTRUCTIONS)) + else if (perf_evsel__match(counter, HARDWARE, HW_BRANCH_INSTRUCTIONS)) update_stats(&runtime_branches_stats[0], count[0]); + else if (perf_evsel__match(counter, HARDWARE, HW_CACHE_REFERENCES)) + update_stats(&runtime_cacherefs_stats[0], count[0]); return 0; } @@ -404,7 +407,7 @@ static void nsec_printout(int cpu, struct perf_evsel *evsel, double avg) return; if (perf_evsel__match(evsel, SOFTWARE, SW_TASK_CLOCK)) - fprintf(stderr, " # %10.3f CPUs ", + fprintf(stderr, " # %10.3f CPUs", avg / avg_stats(&walltime_nsecs_stats)); } @@ -452,6 +455,15 @@ static void abs_printout(int cpu, struct perf_evsel *evsel, double avg) fprintf(stderr, " # %10.3f %%", ratio); + } else if (perf_evsel__match(evsel, HARDWARE, HW_CACHE_MISSES) && + runtime_cacherefs_stats[cpu].n != 0) { + total = avg_stats(&runtime_cacherefs_stats[cpu]); + + if (total) + ratio = avg * 100 / total; + + fprintf(stderr, " # %10.3f %%", ratio); + } else if (runtime_nsecs_stats[cpu].n != 0) { total = avg_stats(&runtime_nsecs_stats[cpu]); -- cgit v1.1 From b908debd4eef91471016138569f7a9e292be682e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 27 Apr 2011 03:55:40 +0200 Subject: perf tools: Accept case-insensitive symbolic event variants We currently fail on something like '-e CPU-migrations', with: invalid or unsupported event: 'CPU-migrations' While 'CPU-migrations' is how we actually print out the event in the default perf stat output: Performance counter stats for 'true': 0.202204 task-clock-msecs # 0.282 CPUs 0 context-switches # 0.000 M/sec 0 CPU-migrations # 0.000 M/sec So change the matching to be case-insensitive. Acked-by: Peter Zijlstra Acked-by: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Link: http://lkml.kernel.org/n/tip-omcm3edjjtx83a4kh2e244se@git.kernel.org Signed-off-by: Ingo Molnar --- tools/perf/util/parse-events.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 1869e4c..8e54bdb 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -649,13 +649,15 @@ static int check_events(const char *str, unsigned int i) int n; n = strlen(event_symbols[i].symbol); - if (!strncmp(str, event_symbols[i].symbol, n)) + if (!strncasecmp(str, event_symbols[i].symbol, n)) return n; n = strlen(event_symbols[i].alias); - if (n) - if (!strncmp(str, event_symbols[i].alias, n)) + if (n) { + if (!strncasecmp(str, event_symbols[i].alias, n)) return n; + } + return 0; } -- cgit v1.1 From ceb53fbf6dbb1df26d38379a262c6981fe73dd36 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 27 Apr 2011 04:06:33 +0200 Subject: perf stat: Fail more clearly when an invalid modifier is specified Currently we fail without printing any error message on "perf stat -e task-clock-msecs". The reason is that the task-clock event is matched and the "-msecs" postfix is assumed to be an event modifier - but is not recognized. This patch changes the code to be more informative: $ perf stat -e task-clock-msecs true invalid event modifier: '-msecs' Run 'perf list' for a list of valid events and modifiers And restructures the return value of parse_event_modifier() to allow the printing of all variants of invalid event modifiers. Acked-by: Peter Zijlstra Acked-by: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Link: http://lkml.kernel.org/n/tip-wlaw3dvz1ly6wple8l52cfca@git.kernel.org Signed-off-by: Ingo Molnar --- tools/perf/util/parse-events.c | 33 ++++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 8e54bdb..b686269 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -721,15 +721,19 @@ parse_numeric_event(const char **strp, struct perf_event_attr *attr) return EVT_FAILED; } -static enum event_result +static int parse_event_modifier(const char **strp, struct perf_event_attr *attr) { const char *str = *strp; int exclude = 0; int eu = 0, ek = 0, eh = 0, precise = 0; - if (*str++ != ':') + if (!*str) return 0; + + if (*str++ != ':') + return -1; + while (*str) { if (*str == 'u') { if (!exclude) @@ -750,14 +754,16 @@ parse_event_modifier(const char **strp, struct perf_event_attr *attr) ++str; } - if (str >= *strp + 2) { - *strp = str; - attr->exclude_user = eu; - attr->exclude_kernel = ek; - attr->exclude_hv = eh; - attr->precise_ip = precise; - return 1; - } + if (str < *strp + 2) + return -1; + + *strp = str; + + attr->exclude_user = eu; + attr->exclude_kernel = ek; + attr->exclude_hv = eh; + attr->precise_ip = precise; + return 0; } @@ -800,7 +806,12 @@ parse_event_symbols(const struct option *opt, const char **str, return EVT_FAILED; modifier: - parse_event_modifier(str, attr); + if (parse_event_modifier(str, attr) < 0) { + fprintf(stderr, "invalid event modifier: '%s'\n", *str); + fprintf(stderr, "Run 'perf list' for a list of valid events and modifiers\n"); + + return EVT_FAILED; + } return ret; } -- cgit v1.1 From 749141d926faf23ef811686a8050e7cf13dc223f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 27 Apr 2011 04:24:57 +0200 Subject: perf stat: Make all displayed event names parseable as well Right now we display this by default: 0.202204 task-clock-msecs # 0.282 CPUs 0 context-switches # 0.000 M/sec 0 CPU-migrations # 0.000 M/sec 85 page-faults # 0.420 M/sec The task-clock-msecs event cannot actually be passed back as an event name, the event name we recognize is 'task-clock'. So change the output of the cpu-clock and task-clock events to be idempotent. ( Units should be printed out in the right-side column, if needed. ) Acked-by: Peter Zijlstra Acked-by: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Link: http://lkml.kernel.org/n/tip-lexrnbzy09asscgd4f7oac4i@git.kernel.org Signed-off-by: Ingo Molnar --- tools/perf/util/parse-events.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index b686269..b5bfef1 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -70,8 +70,8 @@ static const char *hw_event_names[] = { }; static const char *sw_event_names[] = { - "cpu-clock-msecs", - "task-clock-msecs", + "cpu-clock", + "task-clock", "page-faults", "context-switches", "CPU-migrations", -- cgit v1.1 From dcd9936a5a6d89512b5323c1145647f2dbe0236f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 27 Apr 2011 04:36:37 +0200 Subject: perf stat: Factor our shadow stats Create update_shadow_stats() which is then used in both read_counter_aggr() and read_counter(). This not only simplifies the code but also fixes a bug: HW_CACHE_REFERENCES was not updated in read_counter(). Acked-by: Peter Zijlstra Acked-by: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Link: http://lkml.kernel.org/n/tip-9uc55z3g88r47exde7zxjm6p@git.kernel.org Signed-off-by: Ingo Molnar --- tools/perf/builtin-stat.c | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 0de3a20..e5e82f6 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -194,6 +194,23 @@ static inline int nsec_counter(struct perf_evsel *evsel) } /* + * Update various tracking values we maintain to print + * more semantic information such as miss/hit ratios, + * instruction rates, etc: + */ +static void update_shadow_stats(struct perf_evsel *counter, u64 *count) +{ + if (perf_evsel__match(counter, SOFTWARE, SW_TASK_CLOCK)) + update_stats(&runtime_nsecs_stats[0], count[0]); + else if (perf_evsel__match(counter, HARDWARE, HW_CPU_CYCLES)) + update_stats(&runtime_cycles_stats[0], count[0]); + else if (perf_evsel__match(counter, HARDWARE, HW_BRANCH_INSTRUCTIONS)) + update_stats(&runtime_branches_stats[0], count[0]); + else if (perf_evsel__match(counter, HARDWARE, HW_CACHE_REFERENCES)) + update_stats(&runtime_cacherefs_stats[0], count[0]); +} + +/* * Read out the results of a single counter: * aggregate counts across CPUs in system-wide mode */ @@ -218,14 +235,7 @@ static int read_counter_aggr(struct perf_evsel *counter) /* * Save the full runtime - to allow normalization during printout: */ - if (perf_evsel__match(counter, SOFTWARE, SW_TASK_CLOCK)) - update_stats(&runtime_nsecs_stats[0], count[0]); - else if (perf_evsel__match(counter, HARDWARE, HW_CPU_CYCLES)) - update_stats(&runtime_cycles_stats[0], count[0]); - else if (perf_evsel__match(counter, HARDWARE, HW_BRANCH_INSTRUCTIONS)) - update_stats(&runtime_branches_stats[0], count[0]); - else if (perf_evsel__match(counter, HARDWARE, HW_CACHE_REFERENCES)) - update_stats(&runtime_cacherefs_stats[0], count[0]); + update_shadow_stats(counter, count); return 0; } @@ -245,12 +255,7 @@ static int read_counter(struct perf_evsel *counter) count = counter->counts->cpu[cpu].values; - if (perf_evsel__match(counter, SOFTWARE, SW_TASK_CLOCK)) - update_stats(&runtime_nsecs_stats[cpu], count[0]); - if (perf_evsel__match(counter, HARDWARE, HW_CPU_CYCLES)) - update_stats(&runtime_cycles_stats[cpu], count[0]); - if (perf_evsel__match(counter, HARDWARE, HW_BRANCH_INSTRUCTIONS)) - update_stats(&runtime_branches_stats[cpu], count[0]); + update_shadow_stats(counter, count); } return 0; -- cgit v1.1 From 481f988a016f7a0327a5537bde4794349fc4625c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 27 Apr 2011 04:34:16 +0200 Subject: perf stat: Add stalled cycles accounting, prettify the resulting output Add stalled cycles accounting and use it to print the "cycles stalled per instruction" value. Also change the unit of the cycles output from M/sec to GHz - this is more intuitive. Prettify the output to: Performance counter stats for './loop_1b_instructions': 239.775036 task-clock # 0.997 CPUs utilized 761,903,912 cycles # 3.178 GHz 356,620,620 stalled-cycles # 46.81% of all cycles are idle 1,001,578,351 instructions # 1.31 insns per cycle # 0.36 stalled cycles per insn 14,782 cache-references # 0.062 M/sec 5,694 cache-misses # 38.520 % of all cache refs 0.240493656 seconds time elapsed Also adjust the --repeat output to make the percentages align vertically: Performance counter stats for './loop_1b_instructions' (10 runs): 236.096793 task-clock # 0.997 CPUs utilized ( +- 0.011% ) 756,553,086 cycles # 3.204 GHz ( +- 0.002% ) 354,942,692 stalled-cycles # 46.92% of all cycles are idle ( +- 0.008% ) 1,001,389,700 instructions # 1.32 insns per cycle # 0.35 stalled cycles per insn ( +- 0.000% ) 10,166 cache-references # 0.043 M/sec ( +- 0.742% ) 468 cache-misses # 4.608 % of all cache refs ( +- 13.385% ) 0.236874136 seconds time elapsed ( +- 0.01% ) Acked-by: Peter Zijlstra Acked-by: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Link: http://lkml.kernel.org/n/tip-uapziqny39601apdmmhoz7hk@git.kernel.org Signed-off-by: Ingo Molnar --- tools/perf/builtin-stat.c | 43 ++++++++++++++++++++++++++++++------------- 1 file changed, 30 insertions(+), 13 deletions(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index e5e82f6..e881c20 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -156,6 +156,7 @@ static double stddev_stats(struct stats *stats) struct stats runtime_nsecs_stats[MAX_NR_CPUS]; struct stats runtime_cycles_stats[MAX_NR_CPUS]; +struct stats runtime_stalled_cycles_stats[MAX_NR_CPUS]; struct stats runtime_branches_stats[MAX_NR_CPUS]; struct stats runtime_cacherefs_stats[MAX_NR_CPUS]; struct stats walltime_nsecs_stats; @@ -204,6 +205,8 @@ static void update_shadow_stats(struct perf_evsel *counter, u64 *count) update_stats(&runtime_nsecs_stats[0], count[0]); else if (perf_evsel__match(counter, HARDWARE, HW_CPU_CYCLES)) update_stats(&runtime_cycles_stats[0], count[0]); + else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES)) + update_stats(&runtime_stalled_cycles_stats[0], count[0]); else if (perf_evsel__match(counter, HARDWARE, HW_BRANCH_INSTRUCTIONS)) update_stats(&runtime_branches_stats[0], count[0]); else if (perf_evsel__match(counter, HARDWARE, HW_CACHE_REFERENCES)) @@ -412,8 +415,7 @@ static void nsec_printout(int cpu, struct perf_evsel *evsel, double avg) return; if (perf_evsel__match(evsel, SOFTWARE, SW_TASK_CLOCK)) - fprintf(stderr, " # %10.3f CPUs", - avg / avg_stats(&walltime_nsecs_stats)); + fprintf(stderr, " # %8.3f CPUs utilized ", avg / avg_stats(&walltime_nsecs_stats)); } static void abs_printout(int cpu, struct perf_evsel *evsel, double avg) @@ -450,7 +452,15 @@ static void abs_printout(int cpu, struct perf_evsel *evsel, double avg) if (total) ratio = avg / total; - fprintf(stderr, " # ( %4.2f instructions per cycle )", ratio); + fprintf(stderr, " # %4.2f insns per cycle", ratio); + + total = avg_stats(&runtime_stalled_cycles_stats[cpu]); + + if (total && avg) { + ratio = total / avg; + fprintf(stderr, "\n # %4.2f stalled cycles per insn", ratio); + } + } else if (perf_evsel__match(evsel, HARDWARE, HW_BRANCH_MISSES) && runtime_branches_stats[cpu].n != 0) { total = avg_stats(&runtime_branches_stats[cpu]); @@ -458,7 +468,7 @@ static void abs_printout(int cpu, struct perf_evsel *evsel, double avg) if (total) ratio = avg * 100 / total; - fprintf(stderr, " # %10.3f %%", ratio); + fprintf(stderr, " # %8.3f %% of all branches", ratio); } else if (perf_evsel__match(evsel, HARDWARE, HW_CACHE_MISSES) && runtime_cacherefs_stats[cpu].n != 0) { @@ -467,22 +477,29 @@ static void abs_printout(int cpu, struct perf_evsel *evsel, double avg) if (total) ratio = avg * 100 / total; - fprintf(stderr, " # %10.3f %%", ratio); + fprintf(stderr, " # %8.3f %% of all cache refs ", ratio); - } else if (runtime_nsecs_stats[cpu].n != 0) { + } else if (perf_evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES)) { + total = avg_stats(&runtime_cycles_stats[cpu]); + + if (total) + ratio = avg / total * 100.0; + + fprintf(stderr, " # %5.2f%% of all cycles are idle ", ratio); + } else if (perf_evsel__match(evsel, HARDWARE, HW_CPU_CYCLES)) { total = avg_stats(&runtime_nsecs_stats[cpu]); if (total) - ratio = 1000.0 * avg / total; + ratio = 1.0 * avg / total; - fprintf(stderr, " # %10.3f M/sec", ratio); - } else if (perf_evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES)) { - total = avg_stats(&runtime_cycles_stats[cpu]); + fprintf(stderr, " # %8.3f GHz ", ratio); + } else if (runtime_nsecs_stats[cpu].n != 0) { + total = avg_stats(&runtime_nsecs_stats[cpu]); if (total) - ratio = avg / total * 100.0; + ratio = 1000.0 * avg / total; - fprintf(stderr, " # (%5.2f%% of all cycles )", ratio); + fprintf(stderr, " # %8.3f M/sec ", ratio); } } @@ -619,7 +636,7 @@ static void print_stat(int argc, const char **argv) fprintf(stderr, " %18.9f seconds time elapsed", avg_stats(&walltime_nsecs_stats)/1e9); if (run_count > 1) { - fprintf(stderr, " ( +- %7.3f%% )", + fprintf(stderr, " ( +-%5.2f%% )", 100*stddev_stats(&walltime_nsecs_stats) / avg_stats(&walltime_nsecs_stats)); } -- cgit v1.1 From 1fc570ad89e55dc32dfa4dda1311948b38f26524 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 27 Apr 2011 05:20:22 +0200 Subject: perf stat: Add stalled cycles to the default output The new default output looks like this: Performance counter stats for './loop_1b_instructions': 236.010686 task-clock # 0.996 CPUs utilized 0 context-switches # 0.000 M/sec 0 CPU-migrations # 0.000 M/sec 99 page-faults # 0.000 M/sec 756,487,646 cycles # 3.205 GHz 354,938,996 stalled-cycles # 46.92% of all cycles are idle 1,001,403,797 instructions # 1.32 insns per cycle # 0.35 stalled cycles per insn 100,279,773 branches # 424.895 M/sec 12,646 branch-misses # 0.013 % of all branches 0.236902540 seconds time elapsed We dropped cache-refs and cache-misses and added stalled-cycles - this is a more generic "how well utilized is the CPU" metric. If the stalled-cycles ratio is too high then more specific measurements can be taken to figure out the source of the inefficiency. Acked-by: Peter Zijlstra Acked-by: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Link: http://lkml.kernel.org/n/tip-pbpl2l4mn797s69bclfpwkwn@git.kernel.org Signed-off-by: Ingo Molnar --- tools/perf/builtin-stat.c | 5 ++--- tools/perf/util/parse-events.c | 11 ++++++----- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index e881c20..924d18c 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -65,11 +65,10 @@ static struct perf_event_attr default_attrs[] = { { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_PAGE_FAULTS }, { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES }, + { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_STALLED_CYCLES }, { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_INSTRUCTIONS }, { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_INSTRUCTIONS }, { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_MISSES }, - { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CACHE_REFERENCES }, - { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CACHE_MISSES }, }; @@ -468,7 +467,7 @@ static void abs_printout(int cpu, struct perf_evsel *evsel, double avg) if (total) ratio = avg * 100 / total; - fprintf(stderr, " # %8.3f %% of all branches", ratio); + fprintf(stderr, " # %5.2f %% of all branches ", ratio); } else if (perf_evsel__match(evsel, HARDWARE, HW_CACHE_MISSES) && runtime_cacherefs_stats[cpu].n != 0) { diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index b5bfef1..bbbb735 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -32,13 +32,13 @@ char debugfs_path[MAXPATHLEN]; static struct event_symbol event_symbols[] = { { CHW(CPU_CYCLES), "cpu-cycles", "cycles" }, + { CHW(STALLED_CYCLES), "stalled-cycles", "idle-cycles" }, { CHW(INSTRUCTIONS), "instructions", "" }, { CHW(CACHE_REFERENCES), "cache-references", "" }, { CHW(CACHE_MISSES), "cache-misses", "" }, { CHW(BRANCH_INSTRUCTIONS), "branch-instructions", "branches" }, { CHW(BRANCH_MISSES), "branch-misses", "" }, { CHW(BUS_CYCLES), "bus-cycles", "" }, - { CHW(STALLED_CYCLES), "stalled-cycles", "" }, { CSW(CPU_CLOCK), "cpu-clock", "" }, { CSW(TASK_CLOCK), "task-clock", "" }, @@ -54,9 +54,9 @@ static struct event_symbol event_symbols[] = { #define __PERF_EVENT_FIELD(config, name) \ ((config & PERF_EVENT_##name##_MASK) >> PERF_EVENT_##name##_SHIFT) -#define PERF_EVENT_RAW(config) __PERF_EVENT_FIELD(config, RAW) +#define PERF_EVENT_RAW(config) __PERF_EVENT_FIELD(config, RAW) #define PERF_EVENT_CONFIG(config) __PERF_EVENT_FIELD(config, CONFIG) -#define PERF_EVENT_TYPE(config) __PERF_EVENT_FIELD(config, TYPE) +#define PERF_EVENT_TYPE(config) __PERF_EVENT_FIELD(config, TYPE) #define PERF_EVENT_ID(config) __PERF_EVENT_FIELD(config, EVENT) static const char *hw_event_names[] = { @@ -67,6 +67,7 @@ static const char *hw_event_names[] = { "branches", "branch-misses", "bus-cycles", + "stalled-cycles", }; static const char *sw_event_names[] = { @@ -308,7 +309,7 @@ const char *__event_name(int type, u64 config) switch (type) { case PERF_TYPE_HARDWARE: - if (config < PERF_COUNT_HW_MAX) + if (config < PERF_COUNT_HW_MAX && hw_event_names[config]) return hw_event_names[config]; return "unknown-hardware"; @@ -334,7 +335,7 @@ const char *__event_name(int type, u64 config) } case PERF_TYPE_SOFTWARE: - if (config < PERF_COUNT_SW_MAX) + if (config < PERF_COUNT_SW_MAX && sw_event_names[config]) return sw_event_names[config]; return "unknown-software"; -- cgit v1.1 From f99844cb76b7d347711c22cdcb94266b7214141f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 27 Apr 2011 05:35:39 +0200 Subject: perf stat: Fix -nan% output in perf stat noise printouts Before: 0 CPU-migrations # 0.000 M/sec ( +- -nan% ) After: 0 CPU-migrations # 0.000 M/sec ( +- 0.00% ) Also factor out the noise printing function. Acked-by: Peter Zijlstra Acked-by: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Link: http://lkml.kernel.org/n/tip-z89h2v1bk1mikcbsf7e6v34q@git.kernel.org Signed-off-by: Ingo Molnar --- tools/perf/builtin-stat.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 924d18c..845ded8 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -382,6 +382,16 @@ static int run_perf_stat(int argc __used, const char **argv) return WEXITSTATUS(status); } +static void print_noise_pct(double total, double avg) +{ + double pct = 0.0; + + if (avg) + pct = 100.0*total/avg; + + fprintf(stderr, " ( +-%6.2f%% )", pct); +} + static void print_noise(struct perf_evsel *evsel, double avg) { struct perf_stat *ps; @@ -390,8 +400,7 @@ static void print_noise(struct perf_evsel *evsel, double avg) return; ps = evsel->priv; - fprintf(stderr, " ( +- %7.3f%% )", - 100 * stddev_stats(&ps->res_stats[0]) / avg); + print_noise_pct(stddev_stats(&ps->res_stats[0]), avg); } static void nsec_printout(int cpu, struct perf_evsel *evsel, double avg) @@ -635,9 +644,8 @@ static void print_stat(int argc, const char **argv) fprintf(stderr, " %18.9f seconds time elapsed", avg_stats(&walltime_nsecs_stats)/1e9); if (run_count > 1) { - fprintf(stderr, " ( +-%5.2f%% )", - 100*stddev_stats(&walltime_nsecs_stats) / - avg_stats(&walltime_nsecs_stats)); + print_noise_pct(stddev_stats(&walltime_nsecs_stats), + avg_stats(&walltime_nsecs_stats)); } fprintf(stderr, "\n\n"); } -- cgit v1.1 From a5d243d04a150acbfa79d641154f49e5d920f64f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 27 Apr 2011 05:39:24 +0200 Subject: perf stat: Print stalled cycles warning colors Print the stalled-cycles percentage with different warning level ASCII colors, as the percentage passes the 25%/50%/75% thresholds. Acked-by: Peter Zijlstra Acked-by: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Link: http://lkml.kernel.org/n/tip-e25zz44rcms7mu9az4fu5zp0@git.kernel.org Signed-off-by: Ingo Molnar --- tools/perf/builtin-stat.c | 33 +++++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 845ded8..e7d91f9 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -46,6 +46,7 @@ #include "util/evlist.h" #include "util/evsel.h" #include "util/debug.h" +#include "util/color.h" #include "util/header.h" #include "util/cpumap.h" #include "util/thread.h" @@ -426,6 +427,29 @@ static void nsec_printout(int cpu, struct perf_evsel *evsel, double avg) fprintf(stderr, " # %8.3f CPUs utilized ", avg / avg_stats(&walltime_nsecs_stats)); } +static void print_stalled_cycles(int cpu, struct perf_evsel *evsel __used, double avg) +{ + double total, ratio = 0.0; + const char *color; + + total = avg_stats(&runtime_cycles_stats[cpu]); + + if (total) + ratio = avg / total * 100.0; + + color = PERF_COLOR_NORMAL; + if (ratio > 75.0) + color = PERF_COLOR_RED; + else if (ratio > 50.0) + color = PERF_COLOR_MAGENTA; + else if (ratio > 25.0) + color = PERF_COLOR_YELLOW; + + fprintf(stderr, " # "); + color_fprintf(stderr, color, "%5.2f%%", ratio); + fprintf(stderr, " of all cycles are idle "); +} + static void abs_printout(int cpu, struct perf_evsel *evsel, double avg) { double total, ratio = 0.0; @@ -488,12 +512,7 @@ static void abs_printout(int cpu, struct perf_evsel *evsel, double avg) fprintf(stderr, " # %8.3f %% of all cache refs ", ratio); } else if (perf_evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES)) { - total = avg_stats(&runtime_cycles_stats[cpu]); - - if (total) - ratio = avg / total * 100.0; - - fprintf(stderr, " # %5.2f%% of all cycles are idle ", ratio); + print_stalled_cycles(cpu, evsel, avg); } else if (perf_evsel__match(evsel, HARDWARE, HW_CPU_CYCLES)) { total = avg_stats(&runtime_nsecs_stats[cpu]); @@ -508,6 +527,8 @@ static void abs_printout(int cpu, struct perf_evsel *evsel, double avg) ratio = 1000.0 * avg / total; fprintf(stderr, " # %8.3f M/sec ", ratio); + } else { + fprintf(stderr, " "); } } -- cgit v1.1 From c78df6c1d49b5d798f1579141e3a12be7c325d1e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 27 Apr 2011 12:16:10 +0200 Subject: perf stat: Print branch misses warning colors Print the missed-branches percentage with different warning level ASCII colors, as the percentage passes the 5%/10%/20% thresholds. These thresholds are set to relatively low levels, because on most CPUs even a moderate percentage of branch-misses already shows up as a slowdown. Acked-by: Peter Zijlstra Acked-by: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Link: http://lkml.kernel.org/n/tip-ybqukg7p86leiup7gl03ecgk@git.kernel.org Signed-off-by: Ingo Molnar --- tools/perf/builtin-stat.c | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index e7d91f9..5d4e1b9 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -450,6 +450,29 @@ static void print_stalled_cycles(int cpu, struct perf_evsel *evsel __used, doubl fprintf(stderr, " of all cycles are idle "); } +static void print_branch_misses(int cpu, struct perf_evsel *evsel __used, double avg) +{ + double total, ratio = 0.0; + const char *color; + + total = avg_stats(&runtime_branches_stats[cpu]); + + if (total) + ratio = avg / total * 100.0; + + color = PERF_COLOR_NORMAL; + if (ratio > 20.0) + color = PERF_COLOR_RED; + else if (ratio > 10.0) + color = PERF_COLOR_MAGENTA; + else if (ratio > 5.0) + color = PERF_COLOR_YELLOW; + + fprintf(stderr, " # "); + color_fprintf(stderr, color, "%5.2f%%", ratio); + fprintf(stderr, " of all branches "); +} + static void abs_printout(int cpu, struct perf_evsel *evsel, double avg) { double total, ratio = 0.0; @@ -495,13 +518,7 @@ static void abs_printout(int cpu, struct perf_evsel *evsel, double avg) } else if (perf_evsel__match(evsel, HARDWARE, HW_BRANCH_MISSES) && runtime_branches_stats[cpu].n != 0) { - total = avg_stats(&runtime_branches_stats[cpu]); - - if (total) - ratio = avg * 100 / total; - - fprintf(stderr, " # %5.2f %% of all branches ", ratio); - + print_branch_misses(cpu, evsel, avg); } else if (perf_evsel__match(evsel, HARDWARE, HW_CACHE_MISSES) && runtime_cacherefs_stats[cpu].n != 0) { total = avg_stats(&runtime_cacherefs_stats[cpu]); -- cgit v1.1 From 8bb6c79f24e66538f606076915e918242c02ec7c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 27 Apr 2011 13:25:24 +0200 Subject: perf stat: Print out miss/hit ratio for L1 data-cache events Print out this kind of l1-dcache-misses percentage: Performance counter stats for './bw_tcp localhost': 29,956,262,201 cycles # 3.002 GHz (scaled from 85.14%) 8,255,209,558 stalled-cycles # 27.56% of all cycles are idle (scaled from 86.56%) 1,206,130,308 l1-dcache-misses # 40.49% of all L1-dcache hits (scaled from 86.30%) 2,978,756,779 l1-dcache-refs # 298.512 M/sec (scaled from 70.02%) 8,861,956,159 instructions # 0.30 insns per cycle # 0.93 stalled cycles per insn (scaled from 84.27%) 1,644,306,068 branches # 164.782 M/sec (scaled from 86.43%) 74,778,443 branch-misses # 4.55% of all branches (scaled from 70.69%) 9978.695711 task-clock # 0.693 CPUs utilized 14.404347983 seconds time elapsed And color the result depending on the severity of cache-trashing. Acked-by: Peter Zijlstra Acked-by Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Link: http://lkml.kernel.org/n/tip-54gmz0zymaid84zcs7joq02p@git.kernel.org Signed-off-by: Ingo Molnar --- tools/perf/builtin-stat.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 5d4e1b9..03bac6a 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -159,6 +159,7 @@ struct stats runtime_cycles_stats[MAX_NR_CPUS]; struct stats runtime_stalled_cycles_stats[MAX_NR_CPUS]; struct stats runtime_branches_stats[MAX_NR_CPUS]; struct stats runtime_cacherefs_stats[MAX_NR_CPUS]; +struct stats runtime_l1_dcache_stats[MAX_NR_CPUS]; struct stats walltime_nsecs_stats; static int create_perf_stat_counter(struct perf_evsel *evsel) @@ -211,6 +212,8 @@ static void update_shadow_stats(struct perf_evsel *counter, u64 *count) update_stats(&runtime_branches_stats[0], count[0]); else if (perf_evsel__match(counter, HARDWARE, HW_CACHE_REFERENCES)) update_stats(&runtime_cacherefs_stats[0], count[0]); + else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_L1D)) + update_stats(&runtime_l1_dcache_stats[0], count[0]); } /* @@ -473,6 +476,29 @@ static void print_branch_misses(int cpu, struct perf_evsel *evsel __used, double fprintf(stderr, " of all branches "); } +static void print_l1_dcache_misses(int cpu, struct perf_evsel *evsel __used, double avg) +{ + double total, ratio = 0.0; + const char *color; + + total = avg_stats(&runtime_l1_dcache_stats[cpu]); + + if (total) + ratio = avg / total * 100.0; + + color = PERF_COLOR_NORMAL; + if (ratio > 20.0) + color = PERF_COLOR_RED; + else if (ratio > 10.0) + color = PERF_COLOR_MAGENTA; + else if (ratio > 5.0) + color = PERF_COLOR_YELLOW; + + fprintf(stderr, " # "); + color_fprintf(stderr, color, "%5.2f%%", ratio); + fprintf(stderr, " of all L1-dcache hits "); +} + static void abs_printout(int cpu, struct perf_evsel *evsel, double avg) { double total, ratio = 0.0; @@ -519,6 +545,13 @@ static void abs_printout(int cpu, struct perf_evsel *evsel, double avg) } else if (perf_evsel__match(evsel, HARDWARE, HW_BRANCH_MISSES) && runtime_branches_stats[cpu].n != 0) { print_branch_misses(cpu, evsel, avg); + } else if ( + evsel->attr.type == PERF_TYPE_HW_CACHE && + evsel->attr.config == ( PERF_COUNT_HW_CACHE_L1D | + ((PERF_COUNT_HW_CACHE_OP_READ) << 8) | + ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) && + runtime_branches_stats[cpu].n != 0) { + print_l1_dcache_misses(cpu, evsel, avg); } else if (perf_evsel__match(evsel, HARDWARE, HW_CACHE_MISSES) && runtime_cacherefs_stats[cpu].n != 0) { total = avg_stats(&runtime_cacherefs_stats[cpu]); -- cgit v1.1 From c6264deff7ea6125492b442edad885e5429679af Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 27 Apr 2011 13:50:47 +0200 Subject: perf stat: Add -d/--detailed flag to run with a lot of events Add the new -d/--detailed flag, which generates a pretty detailed event list: Performance counter stats for './hackbench 10' (10 runs): 1514.287888 task-clock # 10.897 CPUs utilized ( +- 3.05% ) 39,698 context-switches # 0.026 M/sec ( +- 12.19% ) 8,147 CPU-migrations # 0.005 M/sec ( +- 16.55% ) 17,918 page-faults # 0.012 M/sec ( +- 0.37% ) 2,944,504,050 cycles # 1.944 GHz ( +- 3.89% ) (32.60%) 1,043,971,283 stalled-cycles # 35.45% of all cycles are idle ( +- 5.22% ) (44.48%) 1,655,906,768 instructions # 0.56 insns per cycle # 0.63 stalled cycles per insn ( +- 1.95% ) (55.09%) 338,832,373 branches # 223.757 M/sec ( +- 1.96% ) (64.47%) 3,892,416 branch-misses # 1.15% of all branches ( +- 5.49% ) (73.12%) 606,410,482 L1-dcache-loads # 400.459 M/sec ( +- 1.29% ) (71.21%) 31,204,395 L1-dcache-load-misses # 5.15% of all L1-dcache hits ( +- 3.04% ) (60.43%) 3,922,751 LLC-loads # 2.590 M/sec ( +- 6.80% ) (46.87%) 5,037,288 LLC-load-misses # 3.327 M/sec ( +- 3.56% ) (13.00%) 0.138966828 seconds time elapsed ( +- 4.11% ) This can be used "at a glance" for narrower analysis. -d can also be used in addition to other -e events, to further expand an event list. Acked-by: Peter Zijlstra Acked-by: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Link: http://lkml.kernel.org/n/tip-cxs98quixs3qyvdqx3goojc4@git.kernel.org Signed-off-by: Ingo Molnar --- tools/perf/builtin-stat.c | 68 +++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 60 insertions(+), 8 deletions(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 03bac6a..6959fde 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -73,6 +73,47 @@ static struct perf_event_attr default_attrs[] = { }; +/* + * Detailed stats: + */ +static struct perf_event_attr detailed_attrs[] = { + + { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_TASK_CLOCK }, + { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CONTEXT_SWITCHES }, + { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CPU_MIGRATIONS }, + { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_PAGE_FAULTS }, + + { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES }, + { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_STALLED_CYCLES }, + { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_INSTRUCTIONS }, + { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_INSTRUCTIONS }, + { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_MISSES }, + + { .type = PERF_TYPE_HW_CACHE, + .config = + PERF_COUNT_HW_CACHE_L1D << 0 | + (PERF_COUNT_HW_CACHE_OP_READ << 8) | + (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) }, + + { .type = PERF_TYPE_HW_CACHE, + .config = + PERF_COUNT_HW_CACHE_L1D << 0 | + (PERF_COUNT_HW_CACHE_OP_READ << 8) | + (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) }, + + { .type = PERF_TYPE_HW_CACHE, + .config = + PERF_COUNT_HW_CACHE_LL << 0 | + (PERF_COUNT_HW_CACHE_OP_READ << 8) | + (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) }, + + { .type = PERF_TYPE_HW_CACHE, + .config = + PERF_COUNT_HW_CACHE_LL << 0 | + (PERF_COUNT_HW_CACHE_OP_READ << 8) | + (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) }, +}; + struct perf_evlist *evsel_list; static bool system_wide = false; @@ -86,6 +127,7 @@ static pid_t target_pid = -1; static pid_t target_tid = -1; static pid_t child_pid = -1; static bool null_run = false; +static bool detailed_run = false; static bool big_num = true; static int big_num_opt = -1; static const char *cpu_list; @@ -550,7 +592,7 @@ static void abs_printout(int cpu, struct perf_evsel *evsel, double avg) evsel->attr.config == ( PERF_COUNT_HW_CACHE_L1D | ((PERF_COUNT_HW_CACHE_OP_READ) << 8) | ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) && - runtime_branches_stats[cpu].n != 0) { + runtime_l1_dcache_stats[cpu].n != 0) { print_l1_dcache_misses(cpu, evsel, avg); } else if (perf_evsel__match(evsel, HARDWARE, HW_CACHE_MISSES) && runtime_cacherefs_stats[cpu].n != 0) { @@ -625,8 +667,7 @@ static void print_counter_aggr(struct perf_evsel *counter) avg_enabled = avg_stats(&ps->res_stats[1]); avg_running = avg_stats(&ps->res_stats[2]); - fprintf(stderr, " (scaled from %.2f%%)", - 100 * avg_running / avg_enabled); + fprintf(stderr, " (%.2f%%)", 100 * avg_running / avg_enabled); } fprintf(stderr, "\n"); } @@ -668,10 +709,8 @@ static void print_counter(struct perf_evsel *counter) if (!csv_output) { print_noise(counter, 1.0); - if (run != ena) { - fprintf(stderr, " (scaled from %.2f%%)", - 100.0 * run / ena); - } + if (run != ena) + fprintf(stderr, " (%.2f%%)", 100.0 * run / ena); } fputc('\n', stderr); } @@ -778,6 +817,8 @@ static const struct option options[] = { "repeat command and print average + stddev (max: 100)"), OPT_BOOLEAN('n', "null", &null_run, "null run - dont start any counters"), + OPT_BOOLEAN('d', "detailed", &detailed_run, + "detailed run - start a lot of events"), OPT_CALLBACK_NOOPT('B', "big-num", NULL, NULL, "print large numbers with thousands\' separators", stat__set_big_num), @@ -839,7 +880,18 @@ int cmd_stat(int argc, const char **argv, const char *prefix __used) } /* Set attrs and nr_counters if no event is selected and !null_run */ - if (!null_run && !evsel_list->nr_entries) { + if (detailed_run) { + size_t c; + + for (c = 0; c < ARRAY_SIZE(detailed_attrs); ++c) { + pos = perf_evsel__new(&detailed_attrs[c], c); + if (pos == NULL) + goto out; + perf_evlist__add(evsel_list, pos); + } + } + /* Set attrs and nr_counters if no event is selected and !null_run */ + if (!detailed_run && !null_run && !evsel_list->nr_entries) { size_t c; for (c = 0; c < ARRAY_SIZE(default_attrs); ++c) { -- cgit v1.1 From 3f602b08dec32c418fc391fc838db357aab84f8a Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Mon, 25 Apr 2011 19:39:24 +0000 Subject: xfrm: Fix replay window size calculation on initialization On replay initialization, we compute the size of the replay buffer to see if the replay window fits into the buffer. This computation lacks a mutliplication by 8 because we need the size in bit, not in byte. So we might return an error even though the replay window would fit into the buffer. This patch fixes this issue. Signed-off-by: Steffen Klassert Acked-by: Herbert Xu Signed-off-by: David S. Miller --- net/xfrm/xfrm_replay.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/xfrm/xfrm_replay.c b/net/xfrm/xfrm_replay.c index f218385..e8a7814 100644 --- a/net/xfrm/xfrm_replay.c +++ b/net/xfrm/xfrm_replay.c @@ -532,7 +532,7 @@ int xfrm_init_replay(struct xfrm_state *x) if (replay_esn) { if (replay_esn->replay_window > - replay_esn->bmp_len * sizeof(__u32)) + replay_esn->bmp_len * sizeof(__u32) * 8) return -EINVAL; if ((x->props.flags & XFRM_STATE_ESN) && x->replay_esn) -- cgit v1.1 From c0a56e64aec331f33ead29ba493ee184d9bdc840 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Mon, 25 Apr 2011 19:40:23 +0000 Subject: esp6: Fix scatterlist initialization When we use IPsec extended sequence numbers, we may overwrite the last scatterlist of the associated data by the scatterlist for the skb. This patch fixes this by placing the scatterlist for the skb right behind the last scatterlist of the associated data. esp4 does it already like that. Signed-off-by: Steffen Klassert Acked-by: Herbert Xu Signed-off-by: David S. Miller --- net/ipv6/esp6.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 5aa8ec8..59dccfb 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -371,7 +371,7 @@ static int esp6_input(struct xfrm_state *x, struct sk_buff *skb) iv = esp_tmp_iv(aead, tmp, seqhilen); req = esp_tmp_req(aead, iv); asg = esp_req_sg(aead, req); - sg = asg + 1; + sg = asg + sglists; skb->ip_summed = CHECKSUM_NONE; -- cgit v1.1 From 7833aa05b8db63484b43b4b4c389cd4533140afb Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Mon, 25 Apr 2011 19:41:21 +0000 Subject: xfrm: Check for the new replay implementation if an esn state is inserted IPsec extended sequence numbers can be used only with the new anti-replay window implementation. So check if the new implementation is used if an esn state is inserted and return an error if it is not. Signed-off-by: Steffen Klassert Acked-by: Herbert Xu Signed-off-by: David S. Miller --- net/xfrm/xfrm_user.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 5d1d60d..c658cb3 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -124,6 +124,9 @@ static inline int verify_replay(struct xfrm_usersa_info *p, { struct nlattr *rt = attrs[XFRMA_REPLAY_ESN_VAL]; + if ((p->flags & XFRM_STATE_ESN) && !rt) + return -EINVAL; + if (!rt) return 0; -- cgit v1.1 From 9ceb1c3d1fe15c2f9b55eaa8978019ef0e0a06ac Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 28 Apr 2011 02:57:53 +0200 Subject: perf stat: Fix printout vertical alignment Before: | | Performance counter stats for '/home/mingo/hackbench 20' (5 runs): | | 71,321,607 instructions:u # 0.42 insns per cycle ( +- 0.00% ) | 168,040,009 cycles:u # 0.000 GHz ( +- 0.81% ) | | 1.468002368 seconds time elapsed ( +- 1.33% ) | After: | | Performance counter stats for '/home/mingo/hackbench 20' (5 runs): | | 71,321,607 instructions:u # 0.42 insns per cycle ( +- 0.00% ) | 168,040,009 cycles:u # 0.000 GHz ( +- 0.81% ) | | 1.468002368 seconds time elapsed ( +- 1.33% ) | The last column (stddev noise) is properly aligned, vertically. Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Link: http://lkml.kernel.org/n/tip-7y40wib8n1eqio7hjpn0dsrm@git.kernel.org Signed-off-by: Ingo Molnar --- tools/perf/builtin-stat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 6959fde..003caa8 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -575,7 +575,7 @@ static void abs_printout(int cpu, struct perf_evsel *evsel, double avg) if (total) ratio = avg / total; - fprintf(stderr, " # %4.2f insns per cycle", ratio); + fprintf(stderr, " # %4.2f insns per cycle ", ratio); total = avg_stats(&runtime_stalled_cycles_stats[cpu]); -- cgit v1.1 From 860ad7823fdc00cd61dc70e7f35e07fb327cc9a4 Mon Sep 17 00:00:00 2001 From: Sonny Rao Date: Mon, 18 Apr 2011 22:12:59 +0100 Subject: ARM: 6884/1: Fix infinite loop in ARM user perf_event backtrace code The ARM user backtrace code can get into an infinite loop if it runs into an invalid stack frame which points back to itself. This situation has been observed in practice. Fix it by capping the number of entries in the backtrace. This is also what other architectures do in their backtrace code. Signed-off-by: Sonny Rao Acked-by: Jamie Iles Acked-by: Olof Johansson Signed-off-by: Russell King --- arch/arm/kernel/perf_event.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/arm/kernel/perf_event.c b/arch/arm/kernel/perf_event.c index 979da39..139e3c8 100644 --- a/arch/arm/kernel/perf_event.c +++ b/arch/arm/kernel/perf_event.c @@ -746,7 +746,8 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) tail = (struct frame_tail __user *)regs->ARM_fp - 1; - while (tail && !((unsigned long)tail & 0x3)) + while ((entry->nr < PERF_MAX_STACK_DEPTH) && + tail && !((unsigned long)tail & 0x3)) tail = user_backtrace(tail, entry); } -- cgit v1.1 From a8d2518c2a7235f0b772c1f1bd30218130e631a7 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Wed, 27 Apr 2011 01:34:27 +0100 Subject: ARM: 6887/1: Mark broadcast_timer_setup() __cpuinit This function is only called by percpu_timer_setup() which is also __cpuinit marked. Thus it's safe to mark this function as __cpuinit as well. Signed-off-by: Stephen Boyd Signed-off-by: Russell King --- arch/arm/kernel/smp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c index 8fe05ad..f29b8a2 100644 --- a/arch/arm/kernel/smp.c +++ b/arch/arm/kernel/smp.c @@ -479,7 +479,7 @@ static void broadcast_timer_set_mode(enum clock_event_mode mode, { } -static void broadcast_timer_setup(struct clock_event_device *evt) +static void __cpuinit broadcast_timer_setup(struct clock_event_device *evt) { evt->name = "dummy_timer"; evt->features = CLOCK_EVT_FEAT_ONESHOT | -- cgit v1.1 From 0c61227094b3ddaca2f847ee287c4a2e3762b5a2 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Tue, 26 Apr 2011 11:21:32 +0300 Subject: x86, setup: Fix EDD3.0 data verification. Check for nonzero path in edd_has_edd30() has no sense. First, it looks at the wrong memory. Device path starts at offset 30 of the info->params structure which is at offset 8 from the beginning of info structure, but code looks at info + 4 instead. This was correct when code was introduced, but around v2.6.4 three more fields were added to edd_info structure (commit 66b61a5c in history.git). Second, even if it will check correct memory it will always succeed since at offset 30 (params->key) there will be non-zero values otherwise previous check would fail. The patch replaces this bogus check with one that verifies checksum. Signed-off-by: Gleb Natapov Link: http://lkml.kernel.org/r/20110426082132.GG2265@redhat.com Signed-off-by: H. Peter Anvin --- drivers/firmware/edd.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/drivers/firmware/edd.c b/drivers/firmware/edd.c index 96c25d9..f1b7f65 100644 --- a/drivers/firmware/edd.c +++ b/drivers/firmware/edd.c @@ -531,8 +531,8 @@ static int edd_has_edd30(struct edd_device *edev) { struct edd_info *info; - int i, nonzero_path = 0; - char c; + int i; + u8 csum = 0; if (!edev) return 0; @@ -544,16 +544,16 @@ edd_has_edd30(struct edd_device *edev) return 0; } - for (i = 30; i <= 73; i++) { - c = *(((uint8_t *) info) + i + 4); - if (c) { - nonzero_path++; - break; - } - } - if (!nonzero_path) { + + /* We support only T13 spec */ + if (info->params.device_path_info_length != 44) + return 0; + + for (i = 30; i < info->params.device_path_info_length + 30; i++) + csum += *(((u8 *)&info->params) + i); + + if (csum) return 0; - } return 1; } -- cgit v1.1 From 9fdcdbb0d84922e7ccda2f717a04ea62629f7e18 Mon Sep 17 00:00:00 2001 From: Chris Ball Date: Tue, 29 Mar 2011 00:46:12 -0400 Subject: mmc: sdhci-pci: Fix error case in sdhci_pci_probe_slot() If pci_ioremap_bar() fails during probe, we "goto release;" and free the host, but then we return 0 -- which tells sdhci_pci_probe() that the probe succeeded. Since we think the probe succeeded, when we unload sdhci we'll go to sdhci_pci_remove_slot() and it will try to dereference slot->host, which is now NULL because we freed it in the error path earlier. The patch simply sets ret appropriately, so that sdhci_pci_probe() will detect the failure immediately and bail out. Signed-off-by: Chris Ball Cc: --- drivers/mmc/host/sdhci-pci.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/mmc/host/sdhci-pci.c b/drivers/mmc/host/sdhci-pci.c index a136be7..f8b5f37 100644 --- a/drivers/mmc/host/sdhci-pci.c +++ b/drivers/mmc/host/sdhci-pci.c @@ -957,6 +957,7 @@ static struct sdhci_pci_slot * __devinit sdhci_pci_probe_slot( host->ioaddr = pci_ioremap_bar(pdev, bar); if (!host->ioaddr) { dev_err(&pdev->dev, "failed to remap registers\n"); + ret = -ENOMEM; goto release; } -- cgit v1.1 From 9bc21848b1d6cb8389d927196b16c9950b5e21e9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Sat, 9 Apr 2011 08:16:47 +0200 Subject: mmc: core: mmc_add_card(): fix missing break in switch statement MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes a cosmetic bug that affects printk() for SD-combo cards. Reported-by: Prashanth Bhat Signed-off-by: MichaÅ‚ MirosÅ‚aw Signed-off-by: Chris Ball --- drivers/mmc/core/bus.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/mmc/core/bus.c b/drivers/mmc/core/bus.c index 63667a8..d6d62fd 100644 --- a/drivers/mmc/core/bus.c +++ b/drivers/mmc/core/bus.c @@ -284,6 +284,7 @@ int mmc_add_card(struct mmc_card *card) type = "SD-combo"; if (mmc_card_blockaddr(card)) type = "SDHC-combo"; + break; default: type = "?"; break; -- cgit v1.1 From f69475142136c8ad9b9c717aea2ff907aed9f863 Mon Sep 17 00:00:00 2001 From: Michael Buesch Date: Mon, 11 Apr 2011 17:00:44 -0400 Subject: mmc: omap: Fix possible NULL pointer deref Either OMAP_MMC_STAT_CARD_ERR or OMAP_MMC_STAT_END_OF_CMD might fire if there is no host->cmd pointer. Check for a valid host->cmd pointer before calling mmc_omap_cmd_done(). Signed-off-by: Michael Buesch Acked-by: Tony Lindgren Signed-off-by: Chris Ball --- drivers/mmc/host/omap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mmc/host/omap.c b/drivers/mmc/host/omap.c index 2e032f0..a6c3290 100644 --- a/drivers/mmc/host/omap.c +++ b/drivers/mmc/host/omap.c @@ -832,7 +832,7 @@ static irqreturn_t mmc_omap_irq(int irq, void *dev_id) return IRQ_HANDLED; } - if (end_command) + if (end_command && host->cmd) mmc_omap_cmd_done(host, host->cmd); if (host->data != NULL) { if (transfer_error) -- cgit v1.1 From 26fc8775b51484d8c0a671198639c6d5ae60533e Mon Sep 17 00:00:00 2001 From: Guennadi Liakhovetski Date: Fri, 15 Apr 2011 20:08:19 +0200 Subject: mmc: fix a race between card-detect rescan and clock-gate work instances Currently there is a race in the MMC core between a card-detect rescan work and the clock-gating work, scheduled from a command completion. Fix it by removing the dedicated clock-gating mutex and using the MMC standard locking mechanism instead. Signed-off-by: Guennadi Liakhovetski Cc: Simon Horman Cc: Magnus Damm Acked-by: Linus Walleij Cc: Signed-off-by: Chris Ball --- drivers/mmc/core/host.c | 9 ++++----- include/linux/mmc/host.h | 1 - 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/mmc/core/host.c b/drivers/mmc/core/host.c index 461e6a1..2b200c1 100644 --- a/drivers/mmc/core/host.c +++ b/drivers/mmc/core/host.c @@ -94,7 +94,7 @@ static void mmc_host_clk_gate_delayed(struct mmc_host *host) spin_unlock_irqrestore(&host->clk_lock, flags); return; } - mutex_lock(&host->clk_gate_mutex); + mmc_claim_host(host); spin_lock_irqsave(&host->clk_lock, flags); if (!host->clk_requests) { spin_unlock_irqrestore(&host->clk_lock, flags); @@ -104,7 +104,7 @@ static void mmc_host_clk_gate_delayed(struct mmc_host *host) pr_debug("%s: gated MCI clock\n", mmc_hostname(host)); } spin_unlock_irqrestore(&host->clk_lock, flags); - mutex_unlock(&host->clk_gate_mutex); + mmc_release_host(host); } /* @@ -130,7 +130,7 @@ void mmc_host_clk_ungate(struct mmc_host *host) { unsigned long flags; - mutex_lock(&host->clk_gate_mutex); + mmc_claim_host(host); spin_lock_irqsave(&host->clk_lock, flags); if (host->clk_gated) { spin_unlock_irqrestore(&host->clk_lock, flags); @@ -140,7 +140,7 @@ void mmc_host_clk_ungate(struct mmc_host *host) } host->clk_requests++; spin_unlock_irqrestore(&host->clk_lock, flags); - mutex_unlock(&host->clk_gate_mutex); + mmc_release_host(host); } /** @@ -215,7 +215,6 @@ static inline void mmc_host_clk_init(struct mmc_host *host) host->clk_gated = false; INIT_WORK(&host->clk_gate_work, mmc_host_clk_gate_work); spin_lock_init(&host->clk_lock); - mutex_init(&host->clk_gate_mutex); } /** diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index bcb793e..eb792cb 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -183,7 +183,6 @@ struct mmc_host { struct work_struct clk_gate_work; /* delayed clock gate */ unsigned int clk_old; /* old clock value cache */ spinlock_t clk_lock; /* lock for clk fields */ - struct mutex clk_gate_mutex; /* mutex for clock gating */ #endif /* host specific block data */ -- cgit v1.1 From c919c2a073c5d2e076e52a56b44281d922721b61 Mon Sep 17 00:00:00 2001 From: Guennadi Liakhovetski Date: Thu, 21 Apr 2011 09:09:59 +0200 Subject: mmc: tmio: fix .set_ios(MMC_POWER_UP) handling The aggressive clock gating for TMIO MMC patch has broken switching interface power on, using MFD or platform callbacks. Restore the ios->power_mode == MMC_POWER_UP && ios->clock == 0 case handling. Signed-off-by: Guennadi Liakhovetski Signed-off-by: Chris Ball --- drivers/mmc/host/tmio_mmc_pio.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/mmc/host/tmio_mmc_pio.c b/drivers/mmc/host/tmio_mmc_pio.c index 62d37de..710339a 100644 --- a/drivers/mmc/host/tmio_mmc_pio.c +++ b/drivers/mmc/host/tmio_mmc_pio.c @@ -728,15 +728,15 @@ static void tmio_mmc_set_ios(struct mmc_host *mmc, struct mmc_ios *ios) tmio_mmc_set_clock(host, ios->clock); /* Power sequence - OFF -> UP -> ON */ - if (ios->power_mode == MMC_POWER_OFF || !ios->clock) { + if (ios->power_mode == MMC_POWER_UP) { + /* power up SD bus */ + if (host->set_pwr) + host->set_pwr(host->pdev, 1); + } else if (ios->power_mode == MMC_POWER_OFF || !ios->clock) { /* power down SD bus */ if (ios->power_mode == MMC_POWER_OFF && host->set_pwr) host->set_pwr(host->pdev, 0); tmio_mmc_clk_stop(host); - } else if (ios->power_mode == MMC_POWER_UP) { - /* power up SD bus */ - if (host->set_pwr) - host->set_pwr(host->pdev, 1); } else { /* start bus clock */ tmio_mmc_clk_start(host); -- cgit v1.1 From b7b4d3426d2b5ecab21578eb20d8e456a1aace8f Mon Sep 17 00:00:00 2001 From: Ben Dooks Date: Wed, 27 Apr 2011 14:24:19 +0100 Subject: mmc: sdhci: Check mrq->cmd in sdhci_tasklet_finish It seems that under certain circumstances that the sdhci_tasklet_finish() call can be entered with mrq->cmd set to NULL, causing the system to crash with a NULL pointer de-reference. Unable to handle kernel NULL pointer dereference at virtual address 00000000 PC is at sdhci_tasklet_finish+0x34/0xe8 LR is at sdhci_tasklet_finish+0x24/0xe8 Seen on S3C6410 system. Signed-off-by: Ben Dooks Signed-off-by: Mark Brown Cc: Signed-off-by: Chris Ball --- drivers/mmc/host/sdhci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mmc/host/sdhci.c b/drivers/mmc/host/sdhci.c index 9e15f41..fd8f488 100644 --- a/drivers/mmc/host/sdhci.c +++ b/drivers/mmc/host/sdhci.c @@ -1345,7 +1345,7 @@ static void sdhci_tasklet_finish(unsigned long param) * upon error conditions. */ if (!(host->flags & SDHCI_DEVICE_DEAD) && - (mrq->cmd->error || + ((mrq->cmd && mrq->cmd->error) || (mrq->data && (mrq->data->error || (mrq->data->stop && mrq->data->stop->error))) || (host->quirks & SDHCI_QUIRK_RESET_AFTER_REQUEST))) { -- cgit v1.1 From 0c9c99a765321104cc5f9c97f949382a9ba4927e Mon Sep 17 00:00:00 2001 From: Chris Ball Date: Wed, 27 Apr 2011 17:35:31 -0400 Subject: mmc: sdhci: Check mrq != NULL in sdhci_tasklet_finish It seems that under certain circumstances the sdhci_tasklet_finish() call can be entered with mrq set to NULL, causing the system to crash with a NULL pointer de-reference. Seen on S3C6410 system. Based on a patch by Dimitris Papastamos. Reported-by: Dimitris Papastamos Cc: Signed-off-by: Chris Ball --- drivers/mmc/host/sdhci.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/mmc/host/sdhci.c b/drivers/mmc/host/sdhci.c index fd8f488..5d20661 100644 --- a/drivers/mmc/host/sdhci.c +++ b/drivers/mmc/host/sdhci.c @@ -1334,6 +1334,13 @@ static void sdhci_tasklet_finish(unsigned long param) host = (struct sdhci_host*)param; + /* + * If this tasklet gets rescheduled while running, it will + * be run again afterwards but without any active request. + */ + if (!host->mrq) + return; + spin_lock_irqsave(&host->lock, flags); del_timer(&host->timer); -- cgit v1.1 From f5346668150c37094b42cc2d07ec5fd1451eb980 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 27 Apr 2011 23:08:34 -0700 Subject: Input: wm831x-ts - fix races with IRQ management If the WM831x pen down and data IRQs run in parallel it is possible for the data and pen down IRQs to deadlock themselves as one is part way through disabling its operation while the other is part way through enabling. Fix this by always disabling the pen down interrupt while data is active and vice versa. When a changeover is required we disable the IRQ that is to be stopped then schedule work that will enable the new IRQ. We need to handle the data flow in the data IRQ as the readback from the device needs to be ordered correctly with the IRQ for robust operation. This also fixes an issue when using the built in IRQs due to enable_irq() not being valid from interrupt context on an interrupt controller with bus operations like the built in IRQ controller - this issue may also have affected other interrupt controllers. We can't rely on having the data and pen down IRQs available via GPIOs on the CPU on every system. Signed-off-by: Mark Brown Signed-off-by: Dmitry Torokhov --- drivers/input/touchscreen/wm831x-ts.c | 54 +++++++++++++++++++++++++++++++---- 1 file changed, 49 insertions(+), 5 deletions(-) diff --git a/drivers/input/touchscreen/wm831x-ts.c b/drivers/input/touchscreen/wm831x-ts.c index 6ae054f..b937301 100644 --- a/drivers/input/touchscreen/wm831x-ts.c +++ b/drivers/input/touchscreen/wm831x-ts.c @@ -68,8 +68,23 @@ struct wm831x_ts { unsigned int pd_irq; bool pressure; bool pen_down; + struct work_struct pd_data_work; }; +static void wm831x_pd_data_work(struct work_struct *work) +{ + struct wm831x_ts *wm831x_ts = + container_of(work, struct wm831x_ts, pd_data_work); + + if (wm831x_ts->pen_down) { + enable_irq(wm831x_ts->data_irq); + dev_dbg(wm831x_ts->wm831x->dev, "IRQ PD->DATA done\n"); + } else { + enable_irq(wm831x_ts->pd_irq); + dev_dbg(wm831x_ts->wm831x->dev, "IRQ DATA->PD done\n"); + } +} + static irqreturn_t wm831x_ts_data_irq(int irq, void *irq_data) { struct wm831x_ts *wm831x_ts = irq_data; @@ -110,6 +125,9 @@ static irqreturn_t wm831x_ts_data_irq(int irq, void *irq_data) } if (!wm831x_ts->pen_down) { + /* Switch from data to pen down */ + dev_dbg(wm831x->dev, "IRQ DATA->PD\n"); + disable_irq_nosync(wm831x_ts->data_irq); /* Don't need data any more */ @@ -128,6 +146,8 @@ static irqreturn_t wm831x_ts_data_irq(int irq, void *irq_data) ABS_PRESSURE, 0); input_report_key(wm831x_ts->input_dev, BTN_TOUCH, 0); + + schedule_work(&wm831x_ts->pd_data_work); } input_sync(wm831x_ts->input_dev); @@ -141,6 +161,11 @@ static irqreturn_t wm831x_ts_pen_down_irq(int irq, void *irq_data) struct wm831x *wm831x = wm831x_ts->wm831x; int ena = 0; + if (wm831x_ts->pen_down) + return IRQ_HANDLED; + + disable_irq_nosync(wm831x_ts->pd_irq); + /* Start collecting data */ if (wm831x_ts->pressure) ena |= WM831X_TCH_Z_ENA; @@ -156,7 +181,10 @@ static irqreturn_t wm831x_ts_pen_down_irq(int irq, void *irq_data) WM831X_TCHPD_EINT, WM831X_TCHPD_EINT); wm831x_ts->pen_down = true; - enable_irq(wm831x_ts->data_irq); + + /* Switch from pen down to data */ + dev_dbg(wm831x->dev, "IRQ PD->DATA\n"); + schedule_work(&wm831x_ts->pd_data_work); return IRQ_HANDLED; } @@ -182,13 +210,28 @@ static void wm831x_ts_input_close(struct input_dev *idev) struct wm831x_ts *wm831x_ts = input_get_drvdata(idev); struct wm831x *wm831x = wm831x_ts->wm831x; + /* Shut the controller down, disabling all other functionality too */ wm831x_set_bits(wm831x, WM831X_TOUCH_CONTROL_1, - WM831X_TCH_ENA | WM831X_TCH_CVT_ENA | - WM831X_TCH_X_ENA | WM831X_TCH_Y_ENA | - WM831X_TCH_Z_ENA, 0); + WM831X_TCH_ENA | WM831X_TCH_X_ENA | + WM831X_TCH_Y_ENA | WM831X_TCH_Z_ENA, 0); - if (wm831x_ts->pen_down) + /* Make sure any pending IRQs are done, the above will prevent + * new ones firing. + */ + synchronize_irq(wm831x_ts->data_irq); + synchronize_irq(wm831x_ts->pd_irq); + + /* Make sure the IRQ completion work is quiesced */ + flush_work_sync(&wm831x_ts->pd_data_work); + + /* If we ended up with the pen down then make sure we revert back + * to pen detection state for the next time we start up. + */ + if (wm831x_ts->pen_down) { disable_irq(wm831x_ts->data_irq); + enable_irq(wm831x_ts->pd_irq); + wm831x_ts->pen_down = false; + } } static __devinit int wm831x_ts_probe(struct platform_device *pdev) @@ -212,6 +255,7 @@ static __devinit int wm831x_ts_probe(struct platform_device *pdev) wm831x_ts->wm831x = wm831x; wm831x_ts->input_dev = input_dev; + INIT_WORK(&wm831x_ts->pd_data_work, wm831x_pd_data_work); /* * If we have a direct IRQ use it, otherwise use the interrupt -- cgit v1.1 From acad9853b95df6a3887f52e0ec88e4a77119ee28 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 27 Apr 2011 23:08:51 -0700 Subject: Input: wm831x-ts - allow IRQ flags to be specified This allows maximum flexibility for configuring the direct GPIO based interrupts. Signed-off-by: Mark Brown Signed-off-by: Dmitry Torokhov --- drivers/input/touchscreen/wm831x-ts.c | 16 +++++++++++++--- include/linux/mfd/wm831x/pdata.h | 2 ++ 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/drivers/input/touchscreen/wm831x-ts.c b/drivers/input/touchscreen/wm831x-ts.c index b937301..78e8705 100644 --- a/drivers/input/touchscreen/wm831x-ts.c +++ b/drivers/input/touchscreen/wm831x-ts.c @@ -241,7 +241,7 @@ static __devinit int wm831x_ts_probe(struct platform_device *pdev) struct wm831x_pdata *core_pdata = dev_get_platdata(pdev->dev.parent); struct wm831x_touch_pdata *pdata = NULL; struct input_dev *input_dev; - int error; + int error, irqf; if (core_pdata) pdata = core_pdata->touch; @@ -314,9 +314,14 @@ static __devinit int wm831x_ts_probe(struct platform_device *pdev) wm831x_set_bits(wm831x, WM831X_TOUCH_CONTROL_1, WM831X_TCH_RATE_MASK, 6); + if (pdata && pdata->data_irqf) + irqf = pdata->data_irqf; + else + irqf = IRQF_TRIGGER_HIGH; + error = request_threaded_irq(wm831x_ts->data_irq, NULL, wm831x_ts_data_irq, - IRQF_ONESHOT, + irqf | IRQF_ONESHOT, "Touchscreen data", wm831x_ts); if (error) { dev_err(&pdev->dev, "Failed to request data IRQ %d: %d\n", @@ -325,9 +330,14 @@ static __devinit int wm831x_ts_probe(struct platform_device *pdev) } disable_irq(wm831x_ts->data_irq); + if (pdata && pdata->pd_irqf) + irqf = pdata->pd_irqf; + else + irqf = IRQF_TRIGGER_HIGH; + error = request_threaded_irq(wm831x_ts->pd_irq, NULL, wm831x_ts_pen_down_irq, - IRQF_ONESHOT, + irqf | IRQF_ONESHOT, "Touchscreen pen down", wm831x_ts); if (error) { dev_err(&pdev->dev, "Failed to request pen down IRQ %d: %d\n", diff --git a/include/linux/mfd/wm831x/pdata.h b/include/linux/mfd/wm831x/pdata.h index 173086d..6b0eb13 100644 --- a/include/linux/mfd/wm831x/pdata.h +++ b/include/linux/mfd/wm831x/pdata.h @@ -81,7 +81,9 @@ struct wm831x_touch_pdata { int rpu; /** Pen down sensitivity resistor divider */ int pressure; /** Report pressure (boolean) */ unsigned int data_irq; /** Touch data ready IRQ */ + int data_irqf; /** IRQ flags for data ready IRQ */ unsigned int pd_irq; /** Touch pendown detect IRQ */ + int pd_irqf; /** IRQ flags for pen down IRQ */ }; enum wm831x_watchdog_action { -- cgit v1.1 From bf283707d5fb174ec09215ae19860ad04ba7b67a Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 27 Apr 2011 23:09:26 -0700 Subject: Input: wm831x-ts - move BTN_TOUCH reporting to data transfer Don't report BTN_TOUCH until we've got data as some less robust applications can be confused by getting a touch event by itself and it doesn't seem unreasonable for them to expect coordinates along with a touch. Signed-off-by: Mark Brown Signed-off-by: Dmitry Torokhov --- drivers/input/touchscreen/wm831x-ts.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/input/touchscreen/wm831x-ts.c b/drivers/input/touchscreen/wm831x-ts.c index 78e8705..9175d49 100644 --- a/drivers/input/touchscreen/wm831x-ts.c +++ b/drivers/input/touchscreen/wm831x-ts.c @@ -148,6 +148,8 @@ static irqreturn_t wm831x_ts_data_irq(int irq, void *irq_data) input_report_key(wm831x_ts->input_dev, BTN_TOUCH, 0); schedule_work(&wm831x_ts->pd_data_work); + } else { + input_report_key(wm831x_ts->input_dev, BTN_TOUCH, 1); } input_sync(wm831x_ts->input_dev); @@ -174,9 +176,6 @@ static irqreturn_t wm831x_ts_pen_down_irq(int irq, void *irq_data) WM831X_TCH_X_ENA | WM831X_TCH_Y_ENA | WM831X_TCH_Z_ENA, WM831X_TCH_X_ENA | WM831X_TCH_Y_ENA | ena); - input_report_key(wm831x_ts->input_dev, BTN_TOUCH, 1); - input_sync(wm831x_ts->input_dev); - wm831x_set_bits(wm831x, WM831X_INTERRUPT_STATUS_1, WM831X_TCHPD_EINT, WM831X_TCHPD_EINT); -- cgit v1.1 From 8a850cadca0e387c87a0911a61e99fd66aeb57ec Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 28 Apr 2011 11:16:44 +0200 Subject: perf event, x86: Use better stalled cycles metric Use the UOPS_EXECUTED.*,c=1,i=1 event on Intel CPUs - it is a rather good indicator of CPU execution stalls, more sensitive and more inclusive than the 0xa2 resource stalls event (which does not count nearly as many stall types). Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Link: http://lkml.kernel.org/n/tip-7y40wib8n1eqio7hjpn2dsrm@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 067a48b..1ea9422 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1413,8 +1413,8 @@ static __init int intel_pmu_init(void) x86_pmu.enable_all = intel_pmu_nhm_enable_all; x86_pmu.extra_regs = intel_nehalem_extra_regs; - /* Install the stalled-cycles event: 0xff: All reasons, 0xa2: Resource stalls */ - intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES] = 0xffa2; + /* Install the stalled-cycles event: UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */ + intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES] = 0x1803fb1; if (ebx & 0x40) { /* -- cgit v1.1 From f9cef0a90c4e7637f1ec98474a1a099aec45eb65 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 28 Apr 2011 18:17:11 +0200 Subject: perf stat: Add --sync/-S option --sync will tell perf stat to run sync() before starting a command. This allows IO-heavy tests to be used with --repeat, without one iteration impacting the other. Elapsed time will stabilize for example: before: 3.971525714 seconds time elapsed ( +- 8.56% ) after: 3.211098537 seconds time elapsed ( +- 1.52% ) So measurements will be more accurate. Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Link: http://lkml.kernel.org/n/tip-7y40wib8n1eqio7hjpn1dsrm@git.kernel.org Signed-off-by: Ingo Molnar --- tools/perf/builtin-stat.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 003caa8..5658a77 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -128,6 +128,7 @@ static pid_t target_tid = -1; static pid_t child_pid = -1; static bool null_run = false; static bool detailed_run = false; +static bool sync_run = false; static bool big_num = true; static int big_num_opt = -1; static const char *cpu_list; @@ -819,6 +820,8 @@ static const struct option options[] = { "null run - dont start any counters"), OPT_BOOLEAN('d', "detailed", &detailed_run, "detailed run - start a lot of events"), + OPT_BOOLEAN('S', "sync", &sync_run, + "call sync() before starting a run"), OPT_CALLBACK_NOOPT('B', "big-num", NULL, NULL, "print large numbers with thousands\' separators", stat__set_big_num), @@ -944,6 +947,10 @@ int cmd_stat(int argc, const char **argv, const char *prefix __used) for (run_idx = 0; run_idx < run_count; run_idx++) { if (run_count != 1 && verbose) fprintf(stderr, "[ perf stat: executing run #%d ... ]\n", run_idx + 1); + + if (sync_run) + sync(); + status = run_perf_stat(argc, argv); } -- cgit v1.1 From ede70290046043b2638204cab55e26ea1d0c6cd9 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 28 Apr 2011 08:48:42 +0200 Subject: perf stat: Fix compatibility behavior Instead of failing on an unknown event, when new perf stat is run on older kernels: $ ./perf stat true Error: open_counter returned with 22 (Invalid argument). /bin/dmesg may provide additional information. Fatal: Not all events could be opened. Just ignore EINVAL and ENOSYS, we'll print the results as not counted: Performance counter stats for 'true': 0.239483 task-clock # 0.493 CPUs utilized 0 context-switches # 0.000 M/sec 0 CPU-migrations # 0.000 M/sec 86 page-faults # 0.359 M/sec 704,766 cycles # 2.943 GHz stalled-cycles 381,961 instructions # 0.54 insns per cycle 69,626 branches # 290.735 M/sec 4,594 branch-misses # 6.60% of all branches 0.000485883 seconds time elapsed Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Link: http://lkml.kernel.org/n/tip-7y40wib8n1eqio5hjpn3dsrm@git.kernel.org Signed-off-by: Ingo Molnar --- tools/perf/builtin-stat.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 5658a77..da77077 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -372,7 +372,10 @@ static int run_perf_stat(int argc __used, const char **argv) list_for_each_entry(counter, &evsel_list->entries, node) { if (create_perf_stat_counter(counter) < 0) { - if (errno == -EPERM || errno == -EACCES) { + if (errno == EINVAL || errno == ENOSYS) + continue; + + if (errno == EPERM || errno == EACCES) { error("You may not have permission to collect %sstats.\n" "\t Consider tweaking" " /proc/sys/kernel/perf_event_paranoid or running as root.", -- cgit v1.1 From e11feaa1192a079ba8e88a12121e9b12d55d4239 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Wed, 27 Apr 2011 14:27:24 -0400 Subject: watchdog, hung_task_timeout: Add Kconfig configurable default This patch allows the default value for sysctl_hung_task_timeout_secs to be set at build time. The feature carries virtually no overhead, so it makes sense to keep it enabled. On heavily loaded systems, though, it can end up triggering stack traces when there is no bug other than the system being underprovisioned. We use this patch to keep the hung task facility available but disabled at boot-time. The default of 120 seconds is preserved. As a note, commit e162b39a may have accidentally reverted commit fb822db4, which raised the default from 120 seconds to 480 seconds. Signed-off-by: Jeff Mahoney Acked-by: Mandeep Singh Baines Link: http://lkml.kernel.org/r/4DB8600C.8080000@suse.com Signed-off-by: Ingo Molnar --- kernel/hung_task.c | 2 +- lib/Kconfig.debug | 15 +++++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 53ead17..ea64012 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -33,7 +33,7 @@ unsigned long __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT; /* * Zero means infinite timeout - no checking done: */ -unsigned long __read_mostly sysctl_hung_task_timeout_secs = 120; +unsigned long __read_mostly sysctl_hung_task_timeout_secs = CONFIG_DEFAULT_HUNG_TASK_TIMEOUT; unsigned long __read_mostly sysctl_hung_task_warnings = 10; diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index c768bcd..debbb05 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -238,6 +238,21 @@ config DETECT_HUNG_TASK enabled then all held locks will also be reported. This feature has negligible overhead. +config DEFAULT_HUNG_TASK_TIMEOUT + int "Default timeout for hung task detection (in seconds)" + depends on DETECT_HUNG_TASK + default 120 + help + This option controls the default timeout (in seconds) used + to determine when a task has become non-responsive and should + be considered hung. + + It can be adjusted at runtime via the kernel.hung_task_timeout + sysctl or by writing a value to /proc/sys/kernel/hung_task_timeout. + + A timeout of 0 disables the check. The default is two minutes. + Keeping the default should be fine in most cases. + config BOOTPARAM_HUNG_TASK_PANIC bool "Panic (Reboot) On Hung Tasks" depends on DETECT_HUNG_TASK -- cgit v1.1 From 83a5d2d1b4cc06ce17b44873e004fc0b55552183 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Sat, 12 Mar 2011 03:27:22 +0000 Subject: ARM: Fix .size directive for xscale_dma_a0_map_area gas used to accept (and ignore?) .size directives which referred to undefined symbols, as this does. In binutils 2.21 these are treated as fatal errors. Signed-off-by: Ben Hutchings Signed-off-by: Eric Miao --- arch/arm/mm/proc-xscale.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/mm/proc-xscale.S b/arch/arm/mm/proc-xscale.S index ce233bc..42af976 100644 --- a/arch/arm/mm/proc-xscale.S +++ b/arch/arm/mm/proc-xscale.S @@ -395,7 +395,7 @@ ENTRY(xscale_dma_a0_map_area) teq r2, #DMA_TO_DEVICE beq xscale_dma_clean_range b xscale_dma_flush_range -ENDPROC(xscsale_dma_a0_map_area) +ENDPROC(xscale_dma_a0_map_area) /* * dma_unmap_area(start, size, dir) -- cgit v1.1 From 1270b01f7530ac73bcf08325bcd85c94e2bbebc1 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 27 Apr 2011 18:19:17 +0100 Subject: ASoC: Fix CODEC name in Goni This was typoed at some point in the multi-component merge, though the driver was added along with that. Signed-off-by: Mark Brown Acked-by: Jassi Brar Acked-by: Liam Girdwood --- sound/soc/samsung/goni_wm8994.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/soc/samsung/goni_wm8994.c b/sound/soc/samsung/goni_wm8994.c index f6b3a3c..3a7861c 100644 --- a/sound/soc/samsung/goni_wm8994.c +++ b/sound/soc/samsung/goni_wm8994.c @@ -238,7 +238,7 @@ static struct snd_soc_dai_link goni_dai[] = { .cpu_dai_name = "samsung-i2s.0", .codec_dai_name = "wm8994-hifi", .platform_name = "samsung-audio", - .codec_name = "wm8994-codec.0-0x1a", + .codec_name = "wm8994-codec.0-001a", .init = goni_wm8994_init, .ops = &goni_hifi_ops, }, { @@ -247,7 +247,7 @@ static struct snd_soc_dai_link goni_dai[] = { .cpu_dai_name = "goni-voice-dai", .codec_dai_name = "wm8994-voice", .platform_name = "samsung-audio", - .codec_name = "wm8994-codec.0-0x1a", + .codec_name = "wm8994-codec.0-001a", .ops = &goni_voice_ops, }, }; -- cgit v1.1 From 69b91bc1551a2fc746a01fea9d3291e60be3780d Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 27 Apr 2011 18:24:35 +0100 Subject: ASoC: Fix CODEC DAI names for Goni Immediately after sending the last fix I realised that the CODEC DAI names also don't correspond to the WM8994 driver. Update the DAI names to match. Signed-off-by: Mark Brown Acked-by: Jassi Brar Acked-by: Liam Girdwood --- sound/soc/samsung/goni_wm8994.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/soc/samsung/goni_wm8994.c b/sound/soc/samsung/goni_wm8994.c index 3a7861c..0e80dae 100644 --- a/sound/soc/samsung/goni_wm8994.c +++ b/sound/soc/samsung/goni_wm8994.c @@ -236,7 +236,7 @@ static struct snd_soc_dai_link goni_dai[] = { .name = "WM8994", .stream_name = "WM8994 HiFi", .cpu_dai_name = "samsung-i2s.0", - .codec_dai_name = "wm8994-hifi", + .codec_dai_name = "wm8994-aif1", .platform_name = "samsung-audio", .codec_name = "wm8994-codec.0-001a", .init = goni_wm8994_init, @@ -245,7 +245,7 @@ static struct snd_soc_dai_link goni_dai[] = { .name = "WM8994 Voice", .stream_name = "Voice", .cpu_dai_name = "goni-voice-dai", - .codec_dai_name = "wm8994-voice", + .codec_dai_name = "wm8994-aif2", .platform_name = "samsung-audio", .codec_name = "wm8994-codec.0-001a", .ops = &goni_voice_ops, -- cgit v1.1 From f61583941667c96d61fc6991b9f23307f9bfa87e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= Date: Tue, 19 Apr 2011 22:49:29 +0200 Subject: b43: trivial: update module info about ucode16_mimo firmware MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: RafaÅ‚ MiÅ‚ecki Signed-off-by: John W. Linville --- drivers/net/wireless/b43/main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/wireless/b43/main.c b/drivers/net/wireless/b43/main.c index 57eb5b6..7841b95 100644 --- a/drivers/net/wireless/b43/main.c +++ b/drivers/net/wireless/b43/main.c @@ -72,6 +72,7 @@ MODULE_FIRMWARE("b43/ucode11.fw"); MODULE_FIRMWARE("b43/ucode13.fw"); MODULE_FIRMWARE("b43/ucode14.fw"); MODULE_FIRMWARE("b43/ucode15.fw"); +MODULE_FIRMWARE("b43/ucode16_mimo.fw"); MODULE_FIRMWARE("b43/ucode5.fw"); MODULE_FIRMWARE("b43/ucode9.fw"); -- cgit v1.1 From 1501b6764f0c363a9f1d72f9d422841f81f1bd8c Mon Sep 17 00:00:00 2001 From: Wey-Yi Guy Date: Mon, 25 Apr 2011 11:12:57 -0700 Subject: iwlegacy: led stay solid on when no traffic commit 5ed540aecc2aae92d5c97b9a9306a5bf88ad5574 change the led behavior for iwlwifi driver; the side effect cause led blink all the time. Modify the led blink table to fix this problem Signed-off-by: Wey-Yi Guy Signed-off-by: John W. Linville --- drivers/net/wireless/iwlegacy/iwl-led.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/iwlegacy/iwl-led.c b/drivers/net/wireless/iwlegacy/iwl-led.c index 15eb8b7..bda0d61 100644 --- a/drivers/net/wireless/iwlegacy/iwl-led.c +++ b/drivers/net/wireless/iwlegacy/iwl-led.c @@ -48,8 +48,21 @@ module_param(led_mode, int, S_IRUGO); MODULE_PARM_DESC(led_mode, "0=system default, " "1=On(RF On)/Off(RF Off), 2=blinking"); +/* Throughput OFF time(ms) ON time (ms) + * >300 25 25 + * >200 to 300 40 40 + * >100 to 200 55 55 + * >70 to 100 65 65 + * >50 to 70 75 75 + * >20 to 50 85 85 + * >10 to 20 95 95 + * >5 to 10 110 110 + * >1 to 5 130 130 + * >0 to 1 167 167 + * <=0 SOLID ON + */ static const struct ieee80211_tpt_blink iwl_blink[] = { - { .throughput = 0 * 1024 - 1, .blink_time = 334 }, + { .throughput = 0, .blink_time = 334 }, { .throughput = 1 * 1024 - 1, .blink_time = 260 }, { .throughput = 5 * 1024 - 1, .blink_time = 220 }, { .throughput = 10 * 1024 - 1, .blink_time = 190 }, @@ -101,6 +114,11 @@ static int iwl_legacy_led_cmd(struct iwl_priv *priv, if (priv->blink_on == on && priv->blink_off == off) return 0; + if (off == 0) { + /* led is SOLID_ON */ + on = IWL_LED_SOLID; + } + IWL_DEBUG_LED(priv, "Led blink time compensation=%u\n", priv->cfg->base_params->led_compensation); led_cmd.on = iwl_legacy_blink_compensation(priv, on, -- cgit v1.1 From f325757ab2812b42da4d690cf8da73c0e678368c Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Thu, 28 Apr 2011 11:36:54 +0200 Subject: iwl4965: fix "TX Power requested while scanning" Fix the following: WARNING: at drivers/net/wireless/iwlegacy/iwl-4965.c:1128 \ iwl4965_send_tx_power+0x61/0x102 [iwl4965]() Hardware name: [...] TX Power requested while scanning! Pid: 5723, comm: kworker/u:28 Not tainted 2.6.39-0.rc4.4.fc14.x86_64 #1 Call Trace: [] warn_slowpath_common+0x85/0x9d [] ? iwl4965_show_temperature+0x49/0x49 [iwl4965] [] warn_slowpath_fmt+0x46/0x48 [] iwl4965_send_tx_power+0x61/0x102 [iwl4965] [] ? mutex_lock+0x36/0x50 [] iwl4965_bg_txpower_work+0x57/0x73 [iwl4965] [] process_one_work+0x18d/0x286 [] worker_thread+0xfd/0x181 [] ? manage_workers.clone.16+0x172/0x172 [] kthread+0x82/0x8a [] kernel_thread_helper+0x4/0x10 [] ? kthread_worker_fn+0x14b/0x14b [] ? gs_change+0x13/0x13 Reported-and-tested-by: Paul Bolle Signed-off-by: Stanislaw Gruszka Signed-off-by: John W. Linville --- drivers/net/wireless/iwlegacy/iwl4965-base.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/wireless/iwlegacy/iwl4965-base.c b/drivers/net/wireless/iwlegacy/iwl4965-base.c index d484c36..a62fe24 100644 --- a/drivers/net/wireless/iwlegacy/iwl4965-base.c +++ b/drivers/net/wireless/iwlegacy/iwl4965-base.c @@ -2984,15 +2984,15 @@ static void iwl4965_bg_txpower_work(struct work_struct *work) struct iwl_priv *priv = container_of(work, struct iwl_priv, txpower_work); + mutex_lock(&priv->mutex); + /* If a scan happened to start before we got here * then just return; the statistics notification will * kick off another scheduled work to compensate for * any temperature delta we missed here. */ if (test_bit(STATUS_EXIT_PENDING, &priv->status) || test_bit(STATUS_SCANNING, &priv->status)) - return; - - mutex_lock(&priv->mutex); + goto out; /* Regardless of if we are associated, we must reconfigure the * TX power since frames can be sent on non-radar channels while @@ -3002,7 +3002,7 @@ static void iwl4965_bg_txpower_work(struct work_struct *work) /* Update last_temperature to keep is_calib_needed from running * when it isn't needed... */ priv->last_temperature = priv->temperature; - +out: mutex_unlock(&priv->mutex); } -- cgit v1.1 From 68972efa657040f891c7eda07c7da8c8dd576788 Mon Sep 17 00:00:00 2001 From: Paul Stewart Date: Thu, 28 Apr 2011 05:43:37 +0000 Subject: usbnet: Resubmit interrupt URB if device is open Resubmit interrupt URB if device is open. Use a flag set in usbnet_open() to determine this state. Also kill and free interrupt URB in usbnet_disconnect(). [Rebased off git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6.git] Signed-off-by: Paul Stewart Signed-off-by: David S. Miller --- drivers/net/usb/usbnet.c | 8 ++++++++ include/linux/usb/usbnet.h | 1 + 2 files changed, 9 insertions(+) diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c index 069c1cf..009bba3 100644 --- a/drivers/net/usb/usbnet.c +++ b/drivers/net/usb/usbnet.c @@ -736,6 +736,7 @@ int usbnet_open (struct net_device *net) } } + set_bit(EVENT_DEV_OPEN, &dev->flags); netif_start_queue (net); netif_info(dev, ifup, dev->net, "open: enable queueing (rx %d, tx %d) mtu %d %s framing\n", @@ -1259,6 +1260,9 @@ void usbnet_disconnect (struct usb_interface *intf) if (dev->driver_info->unbind) dev->driver_info->unbind (dev, intf); + usb_kill_urb(dev->interrupt); + usb_free_urb(dev->interrupt); + free_netdev(net); usb_put_dev (xdev); } @@ -1498,6 +1502,10 @@ int usbnet_resume (struct usb_interface *intf) int retval; if (!--dev->suspend_count) { + /* resume interrupt URBs */ + if (dev->interrupt && test_bit(EVENT_DEV_OPEN, &dev->flags)) + usb_submit_urb(dev->interrupt, GFP_NOIO); + spin_lock_irq(&dev->txq.lock); while ((res = usb_get_from_anchor(&dev->deferred))) { diff --git a/include/linux/usb/usbnet.h b/include/linux/usb/usbnet.h index 0e18550..605b0aa 100644 --- a/include/linux/usb/usbnet.h +++ b/include/linux/usb/usbnet.h @@ -68,6 +68,7 @@ struct usbnet { # define EVENT_RX_PAUSED 5 # define EVENT_DEV_WAKING 6 # define EVENT_DEV_ASLEEP 7 +# define EVENT_DEV_OPEN 8 }; static inline struct usb_driver *driver_of(struct usb_interface *intf) -- cgit v1.1 From cb1e922fa104bb0bb3aa5fc6ca7f7e070f3b55e9 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Thu, 28 Apr 2011 15:11:21 -0400 Subject: SELinux: pass last path component in may_create New inodes are created in a two stage process. We first will compute the label on a new inode in security_inode_create() and check if the operation is allowed. We will then actually re-compute that same label and apply it in security_inode_init_security(). The change to do new label calculations based in part on the last component of the path name only passed the path component information all the way down the security_inode_init_security hook. Down the security_inode_create hook the path information did not make it past may_create. Thus the two calculations came up differently and the permissions check might not actually be against the label that is created. Pass and use the same information in both places to harmonize the calculations and checks. Reported-by: Dominick Grift Signed-off-by: Eric Paris --- security/selinux/hooks.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index d52a925..9a93af8 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -1573,7 +1573,8 @@ static int may_create(struct inode *dir, return rc; if (!newsid || !(sbsec->flags & SE_SBLABELSUPP)) { - rc = security_transition_sid(sid, dsec->sid, tclass, NULL, &newsid); + rc = security_transition_sid(sid, dsec->sid, tclass, + &dentry->d_name, &newsid); if (rc) return rc; } -- cgit v1.1 From 5d30b10bd68df007e7ae21e77d1e0ce184b53040 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Thu, 28 Apr 2011 15:55:52 -0400 Subject: flex_array: flex_array_prealloc takes a number of elements, not an end Change flex_array_prealloc to take the number of elements for which space should be allocated instead of the last (inclusive) element. Users and documentation are updated accordingly. flex_arrays got introduced before they had users. When folks started using it, they ended up needing a different API than was coded up originally. This swaps over to the API that folks apparently need. Based-on-patch-by: Steffen Klassert Signed-off-by: Eric Paris Tested-by: Chris Richards Acked-by: Dave Hansen Cc: stable@kernel.org [2.6.38+] --- Documentation/flexible-arrays.txt | 4 ++-- include/linux/flex_array.h | 2 +- lib/flex_array.c | 13 ++++++++----- security/selinux/ss/policydb.c | 6 +++--- 4 files changed, 14 insertions(+), 11 deletions(-) diff --git a/Documentation/flexible-arrays.txt b/Documentation/flexible-arrays.txt index cb8a3a0..df904ae 100644 --- a/Documentation/flexible-arrays.txt +++ b/Documentation/flexible-arrays.txt @@ -66,10 +66,10 @@ trick is to ensure that any needed memory allocations are done before entering atomic context, using: int flex_array_prealloc(struct flex_array *array, unsigned int start, - unsigned int end, gfp_t flags); + unsigned int nr_elements, gfp_t flags); This function will ensure that memory for the elements indexed in the range -defined by start and end has been allocated. Thereafter, a +defined by start and nr_elements has been allocated. Thereafter, a flex_array_put() call on an element in that range is guaranteed not to block. diff --git a/include/linux/flex_array.h b/include/linux/flex_array.h index 70e4efa..ebeb2f3 100644 --- a/include/linux/flex_array.h +++ b/include/linux/flex_array.h @@ -61,7 +61,7 @@ struct flex_array { struct flex_array *flex_array_alloc(int element_size, unsigned int total, gfp_t flags); int flex_array_prealloc(struct flex_array *fa, unsigned int start, - unsigned int end, gfp_t flags); + unsigned int nr_elements, gfp_t flags); void flex_array_free(struct flex_array *fa); void flex_array_free_parts(struct flex_array *fa); int flex_array_put(struct flex_array *fa, unsigned int element_nr, void *src, diff --git a/lib/flex_array.c b/lib/flex_array.c index c0ea40b..0c33b24 100644 --- a/lib/flex_array.c +++ b/lib/flex_array.c @@ -232,10 +232,10 @@ EXPORT_SYMBOL(flex_array_clear); /** * flex_array_prealloc - guarantee that array space exists - * @fa: the flex array for which to preallocate parts - * @start: index of first array element for which space is allocated - * @end: index of last (inclusive) element for which space is allocated - * @flags: page allocation flags + * @fa: the flex array for which to preallocate parts + * @start: index of first array element for which space is allocated + * @nr_elements: number of elements for which space is allocated + * @flags: page allocation flags * * This will guarantee that no future calls to flex_array_put() * will allocate memory. It can be used if you are expecting to @@ -245,13 +245,16 @@ EXPORT_SYMBOL(flex_array_clear); * Locking must be provided by the caller. */ int flex_array_prealloc(struct flex_array *fa, unsigned int start, - unsigned int end, gfp_t flags) + unsigned int nr_elements, gfp_t flags) { int start_part; int end_part; int part_nr; + unsigned int end; struct flex_array_part *part; + end = start + nr_elements - 1; + if (start >= fa->total_nr_elements || end >= fa->total_nr_elements) return -ENOSPC; if (elements_fit_in_base(fa)) diff --git a/security/selinux/ss/policydb.c b/security/selinux/ss/policydb.c index e7b850a..e6e7ce0 100644 --- a/security/selinux/ss/policydb.c +++ b/security/selinux/ss/policydb.c @@ -502,7 +502,7 @@ static int policydb_index(struct policydb *p) goto out; rc = flex_array_prealloc(p->type_val_to_struct_array, 0, - p->p_types.nprim - 1, GFP_KERNEL | __GFP_ZERO); + p->p_types.nprim, GFP_KERNEL | __GFP_ZERO); if (rc) goto out; @@ -519,7 +519,7 @@ static int policydb_index(struct policydb *p) goto out; rc = flex_array_prealloc(p->sym_val_to_name[i], - 0, p->symtab[i].nprim - 1, + 0, p->symtab[i].nprim, GFP_KERNEL | __GFP_ZERO); if (rc) goto out; @@ -2375,7 +2375,7 @@ int policydb_read(struct policydb *p, void *fp) goto bad; /* preallocate so we don't have to worry about the put ever failing */ - rc = flex_array_prealloc(p->type_attr_map_array, 0, p->p_types.nprim - 1, + rc = flex_array_prealloc(p->type_attr_map_array, 0, p->p_types.nprim, GFP_KERNEL | __GFP_ZERO); if (rc) goto bad; -- cgit v1.1 From bf69d41d198138e3c601e9a6645f4f1369aff7e0 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Thu, 28 Apr 2011 15:55:52 -0400 Subject: flex_arrays: allow zero length flex arrays Just like kmalloc will allow one to allocate a 0 length segment of memory flex arrays should do the same thing. It should bomb if you try to use something, but it should at least allow the allocation. This is needed because when SELinux switched to using flex_arrays in 2.6.38 the inability to allocate a 0 length array resulted in SELinux policy load returning -ENOSPC when previously it worked. Based-on-patch-by: Steffen Klassert Signed-off-by: Eric Paris Tested-by: Chris Richards Cc: stable@kernel.org [2.6.38+] --- lib/flex_array.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/lib/flex_array.c b/lib/flex_array.c index 0c33b24..854b57b 100644 --- a/lib/flex_array.c +++ b/lib/flex_array.c @@ -253,9 +253,16 @@ int flex_array_prealloc(struct flex_array *fa, unsigned int start, unsigned int end; struct flex_array_part *part; + if (!start && !nr_elements) + return 0; + if (start >= fa->total_nr_elements) + return -ENOSPC; + if (!nr_elements) + return 0; + end = start + nr_elements - 1; - if (start >= fa->total_nr_elements || end >= fa->total_nr_elements) + if (end >= fa->total_nr_elements) return -ENOSPC; if (elements_fit_in_base(fa)) return 0; @@ -346,6 +353,8 @@ int flex_array_shrink(struct flex_array *fa) int part_nr; int ret = 0; + if (!fa->total_nr_elements) + return 0; if (elements_fit_in_base(fa)) return ret; for (part_nr = 0; part_nr < FLEX_ARRAY_NR_BASE_PTRS; part_nr++) { -- cgit v1.1 From b3b270054b80e6195b1d2b2ce082239911261839 Mon Sep 17 00:00:00 2001 From: Peter Korsgaard Date: Tue, 26 Apr 2011 01:45:41 +0000 Subject: dsa/mv88e6131: fix unknown multicast/broadcast forwarding on mv88e6085 The 88e6085 has a few differences from the other devices in the port control registers, causing unknown multicast/broadcast packets to get dropped when using the standard port setup. At the same time update kconfig to clarify that the mv88e6085 is now supported. Signed-off-by: Peter Korsgaard Acked-by: Lennert Buytenhek Signed-off-by: David S. Miller --- net/dsa/Kconfig | 4 ++-- net/dsa/mv88e6131.c | 26 +++++++++++++++++++++----- 2 files changed, 23 insertions(+), 7 deletions(-) diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig index 87bb5f4..c53ded2 100644 --- a/net/dsa/Kconfig +++ b/net/dsa/Kconfig @@ -41,12 +41,12 @@ config NET_DSA_MV88E6XXX_NEED_PPU default n config NET_DSA_MV88E6131 - bool "Marvell 88E6095/6095F/6131 ethernet switch chip support" + bool "Marvell 88E6085/6095/6095F/6131 ethernet switch chip support" select NET_DSA_MV88E6XXX select NET_DSA_MV88E6XXX_NEED_PPU select NET_DSA_TAG_DSA ---help--- - This enables support for the Marvell 88E6095/6095F/6131 + This enables support for the Marvell 88E6085/6095/6095F/6131 ethernet switch chips. config NET_DSA_MV88E6123_61_65 diff --git a/net/dsa/mv88e6131.c b/net/dsa/mv88e6131.c index 3da4188..45f7411 100644 --- a/net/dsa/mv88e6131.c +++ b/net/dsa/mv88e6131.c @@ -207,8 +207,15 @@ static int mv88e6131_setup_port(struct dsa_switch *ds, int p) * mode, but do not enable forwarding of unknown unicasts. */ val = 0x0433; - if (p == dsa_upstream_port(ds)) + if (p == dsa_upstream_port(ds)) { val |= 0x0104; + /* + * On 6085, unknown multicast forward is controlled + * here rather than in Port Control 2 register. + */ + if (ps->id == ID_6085) + val |= 0x0008; + } if (ds->dsa_port_mask & (1 << p)) val |= 0x0100; REG_WRITE(addr, 0x04, val); @@ -251,10 +258,19 @@ static int mv88e6131_setup_port(struct dsa_switch *ds, int p) * If this is the upstream port for this switch, enable * forwarding of unknown multicast addresses. */ - val = 0x0080 | dsa_upstream_port(ds); - if (p == dsa_upstream_port(ds)) - val |= 0x0040; - REG_WRITE(addr, 0x08, val); + if (ps->id == ID_6085) + /* + * on 6085, bits 3:0 are reserved, bit 6 control ARP + * mirroring, and multicast forward is handled in + * Port Control register. + */ + REG_WRITE(addr, 0x08, 0x0080); + else { + val = 0x0080 | dsa_upstream_port(ds); + if (p == dsa_upstream_port(ds)) + val |= 0x0040; + REG_WRITE(addr, 0x08, val); + } /* * Rate Control: disable ingress rate limiting. -- cgit v1.1 From 4d27e9dcff00a6425d779b065ec8892e4f391661 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 29 Apr 2011 00:35:50 +0200 Subject: PM: Make power domain callbacks take precedence over subsystem ones Change the PM core's behavior related to power domains in such a way that, if a power domain is defined for a given device, its callbacks will be executed instead of and not in addition to the device subsystem's PM callbacks. The idea behind the initial implementation of power domains handling by the PM core was that power domain callbacks would be executed in addition to subsystem callbacks, so that it would be possible to extend the subsystem callbacks by using power domains. It turns out, however, that this wouldn't be really convenient in some important situations. For example, there are systems in which power can only be removed from entire power domains. On those systems it is not desirable to execute device drivers' PM callbacks until it is known that power is going to be removed from the devices in question, which means that they should be executed by power domain callbacks rather then by subsystem (e.g. bus type) PM callbacks, because subsystems generally have no information about what devices belong to which power domain. Thus, for instance, if the bus type in question is the platform bus type, its PM callbacks generally should not be called in addition to power domain callbacks, because they run device drivers' callbacks unconditionally if defined. While in principle the default subsystem PM callbacks, or a subset of them, may be replaced with different functions, it doesn't seem correct to do so, because that would change the subsystem's behavior with respect to all devices in the system, regardless of whether or not they belong to any power domains. Thus, the only remaining option is to make power domain callbacks take precedence over subsystem callbacks. Signed-off-by: Rafael J. Wysocki Acked-by: Grant Likely Acked-by: Kevin Hilman --- drivers/base/power/main.c | 64 +++++++++++++++++++++----------------------- drivers/base/power/runtime.c | 29 +++++++------------- 2 files changed, 41 insertions(+), 52 deletions(-) diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index abe3ab7..3b35456 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -426,10 +426,8 @@ static int device_resume_noirq(struct device *dev, pm_message_t state) if (dev->pwr_domain) { pm_dev_dbg(dev, state, "EARLY power domain "); - pm_noirq_op(dev, &dev->pwr_domain->ops, state); - } - - if (dev->type && dev->type->pm) { + error = pm_noirq_op(dev, &dev->pwr_domain->ops, state); + } else if (dev->type && dev->type->pm) { pm_dev_dbg(dev, state, "EARLY type "); error = pm_noirq_op(dev, dev->type->pm, state); } else if (dev->class && dev->class->pm) { @@ -517,7 +515,8 @@ static int device_resume(struct device *dev, pm_message_t state, bool async) if (dev->pwr_domain) { pm_dev_dbg(dev, state, "power domain "); - pm_op(dev, &dev->pwr_domain->ops, state); + error = pm_op(dev, &dev->pwr_domain->ops, state); + goto End; } if (dev->type && dev->type->pm) { @@ -629,12 +628,11 @@ static void device_complete(struct device *dev, pm_message_t state) { device_lock(dev); - if (dev->pwr_domain && dev->pwr_domain->ops.complete) { + if (dev->pwr_domain) { pm_dev_dbg(dev, state, "completing power domain "); - dev->pwr_domain->ops.complete(dev); - } - - if (dev->type && dev->type->pm) { + if (dev->pwr_domain->ops.complete) + dev->pwr_domain->ops.complete(dev); + } else if (dev->type && dev->type->pm) { pm_dev_dbg(dev, state, "completing type "); if (dev->type->pm->complete) dev->type->pm->complete(dev); @@ -732,7 +730,12 @@ static int device_suspend_noirq(struct device *dev, pm_message_t state) { int error; - if (dev->type && dev->type->pm) { + if (dev->pwr_domain) { + pm_dev_dbg(dev, state, "LATE power domain "); + error = pm_noirq_op(dev, &dev->pwr_domain->ops, state); + if (error) + return error; + } else if (dev->type && dev->type->pm) { pm_dev_dbg(dev, state, "LATE type "); error = pm_noirq_op(dev, dev->type->pm, state); if (error) @@ -749,11 +752,6 @@ static int device_suspend_noirq(struct device *dev, pm_message_t state) return error; } - if (dev->pwr_domain) { - pm_dev_dbg(dev, state, "LATE power domain "); - pm_noirq_op(dev, &dev->pwr_domain->ops, state); - } - return 0; } @@ -841,21 +839,27 @@ static int __device_suspend(struct device *dev, pm_message_t state, bool async) goto End; } + if (dev->pwr_domain) { + pm_dev_dbg(dev, state, "power domain "); + error = pm_op(dev, &dev->pwr_domain->ops, state); + goto End; + } + if (dev->type && dev->type->pm) { pm_dev_dbg(dev, state, "type "); error = pm_op(dev, dev->type->pm, state); - goto Domain; + goto End; } if (dev->class) { if (dev->class->pm) { pm_dev_dbg(dev, state, "class "); error = pm_op(dev, dev->class->pm, state); - goto Domain; + goto End; } else if (dev->class->suspend) { pm_dev_dbg(dev, state, "legacy class "); error = legacy_suspend(dev, state, dev->class->suspend); - goto Domain; + goto End; } } @@ -869,12 +873,6 @@ static int __device_suspend(struct device *dev, pm_message_t state, bool async) } } - Domain: - if (!error && dev->pwr_domain) { - pm_dev_dbg(dev, state, "power domain "); - pm_op(dev, &dev->pwr_domain->ops, state); - } - End: device_unlock(dev); complete_all(&dev->power.completion); @@ -965,7 +963,14 @@ static int device_prepare(struct device *dev, pm_message_t state) device_lock(dev); - if (dev->type && dev->type->pm) { + if (dev->pwr_domain) { + pm_dev_dbg(dev, state, "preparing power domain "); + if (dev->pwr_domain->ops.prepare) + error = dev->pwr_domain->ops.prepare(dev); + suspend_report_result(dev->pwr_domain->ops.prepare, error); + if (error) + goto End; + } else if (dev->type && dev->type->pm) { pm_dev_dbg(dev, state, "preparing type "); if (dev->type->pm->prepare) error = dev->type->pm->prepare(dev); @@ -984,13 +989,6 @@ static int device_prepare(struct device *dev, pm_message_t state) if (dev->bus->pm->prepare) error = dev->bus->pm->prepare(dev); suspend_report_result(dev->bus->pm->prepare, error); - if (error) - goto End; - } - - if (dev->pwr_domain && dev->pwr_domain->ops.prepare) { - pm_dev_dbg(dev, state, "preparing power domain "); - dev->pwr_domain->ops.prepare(dev); } End: diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c index 3172c60d..0d4587b 100644 --- a/drivers/base/power/runtime.c +++ b/drivers/base/power/runtime.c @@ -168,7 +168,6 @@ static int rpm_check_suspend_allowed(struct device *dev) static int rpm_idle(struct device *dev, int rpmflags) { int (*callback)(struct device *); - int (*domain_callback)(struct device *); int retval; retval = rpm_check_suspend_allowed(dev); @@ -214,7 +213,9 @@ static int rpm_idle(struct device *dev, int rpmflags) dev->power.idle_notification = true; - if (dev->type && dev->type->pm) + if (dev->pwr_domain) + callback = dev->pwr_domain->ops.runtime_idle; + else if (dev->type && dev->type->pm) callback = dev->type->pm->runtime_idle; else if (dev->class && dev->class->pm) callback = dev->class->pm->runtime_idle; @@ -223,19 +224,10 @@ static int rpm_idle(struct device *dev, int rpmflags) else callback = NULL; - if (dev->pwr_domain) - domain_callback = dev->pwr_domain->ops.runtime_idle; - else - domain_callback = NULL; - - if (callback || domain_callback) { + if (callback) { spin_unlock_irq(&dev->power.lock); - if (domain_callback) - retval = domain_callback(dev); - - if (!retval && callback) - callback(dev); + callback(dev); spin_lock_irq(&dev->power.lock); } @@ -382,7 +374,9 @@ static int rpm_suspend(struct device *dev, int rpmflags) __update_runtime_status(dev, RPM_SUSPENDING); - if (dev->type && dev->type->pm) + if (dev->pwr_domain) + callback = dev->pwr_domain->ops.runtime_suspend; + else if (dev->type && dev->type->pm) callback = dev->type->pm->runtime_suspend; else if (dev->class && dev->class->pm) callback = dev->class->pm->runtime_suspend; @@ -400,8 +394,6 @@ static int rpm_suspend(struct device *dev, int rpmflags) else pm_runtime_cancel_pending(dev); } else { - if (dev->pwr_domain) - rpm_callback(dev->pwr_domain->ops.runtime_suspend, dev); no_callback: __update_runtime_status(dev, RPM_SUSPENDED); pm_runtime_deactivate_timer(dev); @@ -582,9 +574,8 @@ static int rpm_resume(struct device *dev, int rpmflags) __update_runtime_status(dev, RPM_RESUMING); if (dev->pwr_domain) - rpm_callback(dev->pwr_domain->ops.runtime_resume, dev); - - if (dev->type && dev->type->pm) + callback = dev->pwr_domain->ops.runtime_resume; + else if (dev->type && dev->type->pm) callback = dev->type->pm->runtime_resume; else if (dev->class && dev->class->pm) callback = dev->class->pm->runtime_resume; -- cgit v1.1 From 69c9dd1ecf446ad8a830e4afc539a2a1adc85b78 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 29 Apr 2011 00:36:05 +0200 Subject: PM: Export platform bus type's default PM callbacks Export the default PM callbacks defined for the platform bus type so that they can be used by power domains for suspending and resuming platform devices in the future. Signed-off-by: Rafael J. Wysocki --- drivers/base/platform.c | 72 +++++++++++------------------------------ include/linux/platform_device.h | 60 ++++++++++++++++++++++++++++++++++ 2 files changed, 78 insertions(+), 54 deletions(-) diff --git a/drivers/base/platform.c b/drivers/base/platform.c index 9e0e4fc..313556f 100644 --- a/drivers/base/platform.c +++ b/drivers/base/platform.c @@ -667,7 +667,7 @@ static int platform_legacy_resume(struct device *dev) return ret; } -static int platform_pm_prepare(struct device *dev) +int platform_pm_prepare(struct device *dev) { struct device_driver *drv = dev->driver; int ret = 0; @@ -678,7 +678,7 @@ static int platform_pm_prepare(struct device *dev) return ret; } -static void platform_pm_complete(struct device *dev) +void platform_pm_complete(struct device *dev) { struct device_driver *drv = dev->driver; @@ -686,16 +686,11 @@ static void platform_pm_complete(struct device *dev) drv->pm->complete(dev); } -#else /* !CONFIG_PM_SLEEP */ - -#define platform_pm_prepare NULL -#define platform_pm_complete NULL - -#endif /* !CONFIG_PM_SLEEP */ +#endif /* CONFIG_PM_SLEEP */ #ifdef CONFIG_SUSPEND -int __weak platform_pm_suspend(struct device *dev) +int platform_pm_suspend(struct device *dev) { struct device_driver *drv = dev->driver; int ret = 0; @@ -713,7 +708,7 @@ int __weak platform_pm_suspend(struct device *dev) return ret; } -int __weak platform_pm_suspend_noirq(struct device *dev) +int platform_pm_suspend_noirq(struct device *dev) { struct device_driver *drv = dev->driver; int ret = 0; @@ -729,7 +724,7 @@ int __weak platform_pm_suspend_noirq(struct device *dev) return ret; } -int __weak platform_pm_resume(struct device *dev) +int platform_pm_resume(struct device *dev) { struct device_driver *drv = dev->driver; int ret = 0; @@ -747,7 +742,7 @@ int __weak platform_pm_resume(struct device *dev) return ret; } -int __weak platform_pm_resume_noirq(struct device *dev) +int platform_pm_resume_noirq(struct device *dev) { struct device_driver *drv = dev->driver; int ret = 0; @@ -763,18 +758,11 @@ int __weak platform_pm_resume_noirq(struct device *dev) return ret; } -#else /* !CONFIG_SUSPEND */ - -#define platform_pm_suspend NULL -#define platform_pm_resume NULL -#define platform_pm_suspend_noirq NULL -#define platform_pm_resume_noirq NULL - -#endif /* !CONFIG_SUSPEND */ +#endif /* CONFIG_SUSPEND */ #ifdef CONFIG_HIBERNATE_CALLBACKS -static int platform_pm_freeze(struct device *dev) +int platform_pm_freeze(struct device *dev) { struct device_driver *drv = dev->driver; int ret = 0; @@ -792,7 +780,7 @@ static int platform_pm_freeze(struct device *dev) return ret; } -static int platform_pm_freeze_noirq(struct device *dev) +int platform_pm_freeze_noirq(struct device *dev) { struct device_driver *drv = dev->driver; int ret = 0; @@ -808,7 +796,7 @@ static int platform_pm_freeze_noirq(struct device *dev) return ret; } -static int platform_pm_thaw(struct device *dev) +int platform_pm_thaw(struct device *dev) { struct device_driver *drv = dev->driver; int ret = 0; @@ -826,7 +814,7 @@ static int platform_pm_thaw(struct device *dev) return ret; } -static int platform_pm_thaw_noirq(struct device *dev) +int platform_pm_thaw_noirq(struct device *dev) { struct device_driver *drv = dev->driver; int ret = 0; @@ -842,7 +830,7 @@ static int platform_pm_thaw_noirq(struct device *dev) return ret; } -static int platform_pm_poweroff(struct device *dev) +int platform_pm_poweroff(struct device *dev) { struct device_driver *drv = dev->driver; int ret = 0; @@ -860,7 +848,7 @@ static int platform_pm_poweroff(struct device *dev) return ret; } -static int platform_pm_poweroff_noirq(struct device *dev) +int platform_pm_poweroff_noirq(struct device *dev) { struct device_driver *drv = dev->driver; int ret = 0; @@ -876,7 +864,7 @@ static int platform_pm_poweroff_noirq(struct device *dev) return ret; } -static int platform_pm_restore(struct device *dev) +int platform_pm_restore(struct device *dev) { struct device_driver *drv = dev->driver; int ret = 0; @@ -894,7 +882,7 @@ static int platform_pm_restore(struct device *dev) return ret; } -static int platform_pm_restore_noirq(struct device *dev) +int platform_pm_restore_noirq(struct device *dev) { struct device_driver *drv = dev->driver; int ret = 0; @@ -910,18 +898,7 @@ static int platform_pm_restore_noirq(struct device *dev) return ret; } -#else /* !CONFIG_HIBERNATE_CALLBACKS */ - -#define platform_pm_freeze NULL -#define platform_pm_thaw NULL -#define platform_pm_poweroff NULL -#define platform_pm_restore NULL -#define platform_pm_freeze_noirq NULL -#define platform_pm_thaw_noirq NULL -#define platform_pm_poweroff_noirq NULL -#define platform_pm_restore_noirq NULL - -#endif /* !CONFIG_HIBERNATE_CALLBACKS */ +#endif /* CONFIG_HIBERNATE_CALLBACKS */ #ifdef CONFIG_PM_RUNTIME @@ -949,23 +926,10 @@ int __weak platform_pm_runtime_idle(struct device *dev) #endif /* !CONFIG_PM_RUNTIME */ static const struct dev_pm_ops platform_dev_pm_ops = { - .prepare = platform_pm_prepare, - .complete = platform_pm_complete, - .suspend = platform_pm_suspend, - .resume = platform_pm_resume, - .freeze = platform_pm_freeze, - .thaw = platform_pm_thaw, - .poweroff = platform_pm_poweroff, - .restore = platform_pm_restore, - .suspend_noirq = platform_pm_suspend_noirq, - .resume_noirq = platform_pm_resume_noirq, - .freeze_noirq = platform_pm_freeze_noirq, - .thaw_noirq = platform_pm_thaw_noirq, - .poweroff_noirq = platform_pm_poweroff_noirq, - .restore_noirq = platform_pm_restore_noirq, .runtime_suspend = platform_pm_runtime_suspend, .runtime_resume = platform_pm_runtime_resume, .runtime_idle = platform_pm_runtime_idle, + USE_PLATFORM_PM_SLEEP_OPS }; struct bus_type platform_bus_type = { diff --git a/include/linux/platform_device.h b/include/linux/platform_device.h index 744942c..e0093e0 100644 --- a/include/linux/platform_device.h +++ b/include/linux/platform_device.h @@ -205,4 +205,64 @@ static inline char *early_platform_driver_setup_func(void) \ } #endif /* MODULE */ +#ifdef CONFIG_PM_SLEEP +extern int platform_pm_prepare(struct device *dev); +extern void platform_pm_complete(struct device *dev); +#else +#define platform_pm_prepare NULL +#define platform_pm_complete NULL +#endif + +#ifdef CONFIG_SUSPEND +extern int platform_pm_suspend(struct device *dev); +extern int platform_pm_suspend_noirq(struct device *dev); +extern int platform_pm_resume(struct device *dev); +extern int platform_pm_resume_noirq(struct device *dev); +#else +#define platform_pm_suspend NULL +#define platform_pm_resume NULL +#define platform_pm_suspend_noirq NULL +#define platform_pm_resume_noirq NULL +#endif + +#ifdef CONFIG_HIBERNATE_CALLBACKS +extern int platform_pm_freeze(struct device *dev); +extern int platform_pm_freeze_noirq(struct device *dev); +extern int platform_pm_thaw(struct device *dev); +extern int platform_pm_thaw_noirq(struct device *dev); +extern int platform_pm_poweroff(struct device *dev); +extern int platform_pm_poweroff_noirq(struct device *dev); +extern int platform_pm_restore(struct device *dev); +extern int platform_pm_restore_noirq(struct device *dev); +#else +#define platform_pm_freeze NULL +#define platform_pm_thaw NULL +#define platform_pm_poweroff NULL +#define platform_pm_restore NULL +#define platform_pm_freeze_noirq NULL +#define platform_pm_thaw_noirq NULL +#define platform_pm_poweroff_noirq NULL +#define platform_pm_restore_noirq NULL +#endif + +#ifdef CONFIG_PM_SLEEP +#define USE_PLATFORM_PM_SLEEP_OPS \ + .prepare = platform_pm_prepare, \ + .complete = platform_pm_complete, \ + .suspend = platform_pm_suspend, \ + .resume = platform_pm_resume, \ + .freeze = platform_pm_freeze, \ + .thaw = platform_pm_thaw, \ + .poweroff = platform_pm_poweroff, \ + .restore = platform_pm_restore, \ + .suspend_noirq = platform_pm_suspend_noirq, \ + .resume_noirq = platform_pm_resume_noirq, \ + .freeze_noirq = platform_pm_freeze_noirq, \ + .thaw_noirq = platform_pm_thaw_noirq, \ + .poweroff_noirq = platform_pm_poweroff_noirq, \ + .restore_noirq = platform_pm_restore_noirq, +#else +#define USE_PLATFORM_PM_SLEEP_OPS +#endif + #endif /* _PLATFORM_DEVICE_H_ */ -- cgit v1.1 From 38ade3a1fa0421c12627c7b48c33e89414fc9b76 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 29 Apr 2011 00:36:21 +0200 Subject: shmobile: Use power domains for platform runtime PM shmobile platforms replace the runtime PM callbacks of the platform bus type with their own routines, but this means that the callbacks are replaced system-wide. This may not be the right approach if the platform devices on the system are not of the same type (e.g. some of them belong to an SoC and the others are located in separate chips), because in those cases they may require different handling. Thus it is better to use power domains to override the platform bus type's PM handling, as it generally is possible to use different power domains for devices with different PM requirements. Define a default power domain for shmobile in both the SH and ARM falvors and use it to override the platform bus type's PM callbacks. Since the suspend and hibernate callbacks of the new "default" power domains need to be the same and the platform bus type's suspend and hibernate callbacks for the time being, export those callbacks so that can be used outside of the platform bus type code. Signed-off-by: Rafael J. Wysocki --- arch/arm/mach-shmobile/pm_runtime.c | 25 +++++++++++++++++------- arch/sh/kernel/cpu/shmobile/pm_runtime.c | 33 +++++++++++++++++++++----------- 2 files changed, 40 insertions(+), 18 deletions(-) diff --git a/arch/arm/mach-shmobile/pm_runtime.c b/arch/arm/mach-shmobile/pm_runtime.c index 94912d3..12bb504 100644 --- a/arch/arm/mach-shmobile/pm_runtime.c +++ b/arch/arm/mach-shmobile/pm_runtime.c @@ -66,11 +66,11 @@ static void platform_pm_runtime_bug(struct device *dev, dev_err(dev, "runtime pm suspend before resume\n"); } -int platform_pm_runtime_suspend(struct device *dev) +static int default_platform_runtime_suspend(struct device *dev) { struct pm_runtime_data *prd = __to_prd(dev); - dev_dbg(dev, "platform_pm_runtime_suspend()\n"); + dev_dbg(dev, "%s()\n", __func__); platform_pm_runtime_bug(dev, prd); @@ -82,11 +82,11 @@ int platform_pm_runtime_suspend(struct device *dev) return 0; } -int platform_pm_runtime_resume(struct device *dev) +static int default_platform_runtime_resume(struct device *dev) { struct pm_runtime_data *prd = __to_prd(dev); - dev_dbg(dev, "platform_pm_runtime_resume()\n"); + dev_dbg(dev, "%s()\n", __func__); platform_pm_runtime_init(dev, prd); @@ -98,12 +98,21 @@ int platform_pm_runtime_resume(struct device *dev) return 0; } -int platform_pm_runtime_idle(struct device *dev) +static int default_platform_runtime_idle(struct device *dev) { /* suspend synchronously to disable clocks immediately */ return pm_runtime_suspend(dev); } +static struct dev_power_domain default_power_domain = { + .ops = { + .runtime_suspend = default_platform_runtime_suspend, + .runtime_resume = default_platform_runtime_resume, + .runtime_idle = default_platform_runtime_idle, + USE_PLATFORM_PM_SLEEP_OPS + }, +}; + static int platform_bus_notify(struct notifier_block *nb, unsigned long action, void *data) { @@ -114,10 +123,12 @@ static int platform_bus_notify(struct notifier_block *nb, if (action == BUS_NOTIFY_BIND_DRIVER) { prd = devres_alloc(__devres_release, sizeof(*prd), GFP_KERNEL); - if (prd) + if (prd) { devres_add(dev, prd); - else + dev->pwr_domain = &default_power_domain; + } else { dev_err(dev, "unable to alloc memory for runtime pm\n"); + } } return 0; diff --git a/arch/sh/kernel/cpu/shmobile/pm_runtime.c b/arch/sh/kernel/cpu/shmobile/pm_runtime.c index 6dcb816..22db127 100644 --- a/arch/sh/kernel/cpu/shmobile/pm_runtime.c +++ b/arch/sh/kernel/cpu/shmobile/pm_runtime.c @@ -139,7 +139,7 @@ void platform_pm_runtime_suspend_idle(void) queue_work(pm_wq, &hwblk_work); } -int platform_pm_runtime_suspend(struct device *dev) +static int default_platform_runtime_suspend(struct device *dev) { struct platform_device *pdev = to_platform_device(dev); struct pdev_archdata *ad = &pdev->archdata; @@ -147,7 +147,7 @@ int platform_pm_runtime_suspend(struct device *dev) int hwblk = ad->hwblk_id; int ret = 0; - dev_dbg(dev, "platform_pm_runtime_suspend() [%d]\n", hwblk); + dev_dbg(dev, "%s() [%d]\n", __func__, hwblk); /* ignore off-chip platform devices */ if (!hwblk) @@ -183,20 +183,20 @@ int platform_pm_runtime_suspend(struct device *dev) mutex_unlock(&ad->mutex); out: - dev_dbg(dev, "platform_pm_runtime_suspend() [%d] returns %d\n", - hwblk, ret); + dev_dbg(dev, "%s() [%d] returns %d\n", + __func__, hwblk, ret); return ret; } -int platform_pm_runtime_resume(struct device *dev) +static int default_platform_runtime_resume(struct device *dev) { struct platform_device *pdev = to_platform_device(dev); struct pdev_archdata *ad = &pdev->archdata; int hwblk = ad->hwblk_id; int ret = 0; - dev_dbg(dev, "platform_pm_runtime_resume() [%d]\n", hwblk); + dev_dbg(dev, "%s() [%d]\n", __func__, hwblk); /* ignore off-chip platform devices */ if (!hwblk) @@ -228,19 +228,19 @@ int platform_pm_runtime_resume(struct device *dev) */ mutex_unlock(&ad->mutex); out: - dev_dbg(dev, "platform_pm_runtime_resume() [%d] returns %d\n", - hwblk, ret); + dev_dbg(dev, "%s() [%d] returns %d\n", + __func__, hwblk, ret); return ret; } -int platform_pm_runtime_idle(struct device *dev) +static int default_platform_runtime_idle(struct device *dev) { struct platform_device *pdev = to_platform_device(dev); int hwblk = pdev->archdata.hwblk_id; int ret = 0; - dev_dbg(dev, "platform_pm_runtime_idle() [%d]\n", hwblk); + dev_dbg(dev, "%s() [%d]\n", __func__, hwblk); /* ignore off-chip platform devices */ if (!hwblk) @@ -252,10 +252,19 @@ int platform_pm_runtime_idle(struct device *dev) /* suspend synchronously to disable clocks immediately */ ret = pm_runtime_suspend(dev); out: - dev_dbg(dev, "platform_pm_runtime_idle() [%d] done!\n", hwblk); + dev_dbg(dev, "%s() [%d] done!\n", __func__, hwblk); return ret; } +static struct dev_power_domain default_power_domain = { + .ops = { + .runtime_suspend = default_platform_runtime_suspend, + .runtime_resume = default_platform_runtime_resume, + .runtime_idle = default_platform_runtime_idle, + USE_PLATFORM_PM_SLEEP_OPS + }, +}; + static int platform_bus_notify(struct notifier_block *nb, unsigned long action, void *data) { @@ -276,6 +285,7 @@ static int platform_bus_notify(struct notifier_block *nb, hwblk_disable(hwblk_info, hwblk); /* make sure driver re-inits itself once */ __set_bit(PDEV_ARCHDATA_FLAG_INIT, &pdev->archdata.flags); + dev->pwr_domain = &default_power_domain; break; /* TODO: add BUS_NOTIFY_BIND_DRIVER and increase idle count */ case BUS_NOTIFY_BOUND_DRIVER: @@ -289,6 +299,7 @@ static int platform_bus_notify(struct notifier_block *nb, __set_bit(PDEV_ARCHDATA_FLAG_INIT, &pdev->archdata.flags); break; case BUS_NOTIFY_DEL_DEVICE: + dev->pwr_domain = NULL; break; } return 0; -- cgit v1.1 From 8b313a38ecffc0ff0b4c5115f0a461f73b7dfdb6 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 29 Apr 2011 00:36:32 +0200 Subject: PM / Platform: Use generic runtime PM callbacks directly Once shmobile platforms have been converted to using power domains for overriding the platform bus type's PM callbacks, it isn't necessary to use the __weakly defined wrappers around the generinc runtime PM callbacks in the platform bus type any more. Signed-off-by: Rafael J. Wysocki --- drivers/base/platform.c | 31 +++---------------------------- 1 file changed, 3 insertions(+), 28 deletions(-) diff --git a/drivers/base/platform.c b/drivers/base/platform.c index 313556f..079c18a 100644 --- a/drivers/base/platform.c +++ b/drivers/base/platform.c @@ -900,35 +900,10 @@ int platform_pm_restore_noirq(struct device *dev) #endif /* CONFIG_HIBERNATE_CALLBACKS */ -#ifdef CONFIG_PM_RUNTIME - -int __weak platform_pm_runtime_suspend(struct device *dev) -{ - return pm_generic_runtime_suspend(dev); -}; - -int __weak platform_pm_runtime_resume(struct device *dev) -{ - return pm_generic_runtime_resume(dev); -}; - -int __weak platform_pm_runtime_idle(struct device *dev) -{ - return pm_generic_runtime_idle(dev); -}; - -#else /* !CONFIG_PM_RUNTIME */ - -#define platform_pm_runtime_suspend NULL -#define platform_pm_runtime_resume NULL -#define platform_pm_runtime_idle NULL - -#endif /* !CONFIG_PM_RUNTIME */ - static const struct dev_pm_ops platform_dev_pm_ops = { - .runtime_suspend = platform_pm_runtime_suspend, - .runtime_resume = platform_pm_runtime_resume, - .runtime_idle = platform_pm_runtime_idle, + .runtime_suspend = pm_generic_runtime_suspend, + .runtime_resume = pm_generic_runtime_resume, + .runtime_idle = pm_generic_runtime_idle, USE_PLATFORM_PM_SLEEP_OPS }; -- cgit v1.1 From 638080c37ae08fd0c44cec13d7948ca5385ae851 Mon Sep 17 00:00:00 2001 From: Kevin Hilman Date: Fri, 29 Apr 2011 00:36:42 +0200 Subject: OMAP2+ / PM: move runtime PM implementation to use device power domains In commit 7538e3db6e015e890825fbd9f8659952896ddd5b (PM: add support for device power domains) a better way for handling platform-specific power hooks was introduced. Rather than using the platform_bus dev_pm_ops overrides (platform_bus_set_pm_ops()), this patch moves the OMAP runtime PM implementation over to using device power domains. Since OMAP is the only user of platform_bus_set_pm_ops(), that interface can be removed (and will be in a forthcoming patch.) [rjw: Rebased on top of a previous change modifying the handling of power domains by the PM core so that power domain callbacks take precendence over subsystem-level PM callbacks.] Signed-off-by: Kevin Hilman Acked-by: Grant Likely Signed-off-by: Rafael J. Wysocki --- arch/arm/mach-omap2/Makefile | 6 +-- arch/arm/mach-omap2/pm_bus.c | 85 ---------------------------------------- arch/arm/plat-omap/omap_device.c | 23 +++++++++++ 3 files changed, 26 insertions(+), 88 deletions(-) delete mode 100644 arch/arm/mach-omap2/pm_bus.c diff --git a/arch/arm/mach-omap2/Makefile b/arch/arm/mach-omap2/Makefile index a45cd64..b353584 100644 --- a/arch/arm/mach-omap2/Makefile +++ b/arch/arm/mach-omap2/Makefile @@ -59,10 +59,10 @@ endif # Power Management ifeq ($(CONFIG_PM),y) obj-$(CONFIG_ARCH_OMAP2) += pm24xx.o -obj-$(CONFIG_ARCH_OMAP2) += sleep24xx.o pm_bus.o +obj-$(CONFIG_ARCH_OMAP2) += sleep24xx.o obj-$(CONFIG_ARCH_OMAP3) += pm34xx.o sleep34xx.o \ - cpuidle34xx.o pm_bus.o -obj-$(CONFIG_ARCH_OMAP4) += pm44xx.o pm_bus.o + cpuidle34xx.o +obj-$(CONFIG_ARCH_OMAP4) += pm44xx.o obj-$(CONFIG_PM_DEBUG) += pm-debug.o obj-$(CONFIG_OMAP_SMARTREFLEX) += sr_device.o smartreflex.o obj-$(CONFIG_OMAP_SMARTREFLEX_CLASS3) += smartreflex-class3.o diff --git a/arch/arm/mach-omap2/pm_bus.c b/arch/arm/mach-omap2/pm_bus.c deleted file mode 100644 index 5acd2ab..0000000 --- a/arch/arm/mach-omap2/pm_bus.c +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Runtime PM support code for OMAP - * - * Author: Kevin Hilman, Deep Root Systems, LLC - * - * Copyright (C) 2010 Texas Instruments, Inc. - * - * This file is licensed under the terms of the GNU General Public - * License version 2. This program is licensed "as is" without any - * warranty of any kind, whether express or implied. - */ -#include -#include -#include -#include -#include -#include - -#include -#include - -#ifdef CONFIG_PM_RUNTIME -static int omap_pm_runtime_suspend(struct device *dev) -{ - struct platform_device *pdev = to_platform_device(dev); - int r, ret = 0; - - dev_dbg(dev, "%s\n", __func__); - - ret = pm_generic_runtime_suspend(dev); - - if (!ret && dev->parent == &omap_device_parent) { - r = omap_device_idle(pdev); - WARN_ON(r); - } - - return ret; -}; - -static int omap_pm_runtime_resume(struct device *dev) -{ - struct platform_device *pdev = to_platform_device(dev); - int r; - - dev_dbg(dev, "%s\n", __func__); - - if (dev->parent == &omap_device_parent) { - r = omap_device_enable(pdev); - WARN_ON(r); - } - - return pm_generic_runtime_resume(dev); -}; -#else -#define omap_pm_runtime_suspend NULL -#define omap_pm_runtime_resume NULL -#endif /* CONFIG_PM_RUNTIME */ - -static int __init omap_pm_runtime_init(void) -{ - const struct dev_pm_ops *pm; - struct dev_pm_ops *omap_pm; - - pm = platform_bus_get_pm_ops(); - if (!pm) { - pr_err("%s: unable to get dev_pm_ops from platform_bus\n", - __func__); - return -ENODEV; - } - - omap_pm = kmemdup(pm, sizeof(struct dev_pm_ops), GFP_KERNEL); - if (!omap_pm) { - pr_err("%s: unable to alloc memory for new dev_pm_ops\n", - __func__); - return -ENOMEM; - } - - omap_pm->runtime_suspend = omap_pm_runtime_suspend; - omap_pm->runtime_resume = omap_pm_runtime_resume; - - platform_bus_set_pm_ops(omap_pm); - - return 0; -} -core_initcall(omap_pm_runtime_init); diff --git a/arch/arm/plat-omap/omap_device.c b/arch/arm/plat-omap/omap_device.c index 9bbda9a..a37b8eb 100644 --- a/arch/arm/plat-omap/omap_device.c +++ b/arch/arm/plat-omap/omap_device.c @@ -536,6 +536,28 @@ int omap_early_device_register(struct omap_device *od) return 0; } +static int _od_runtime_suspend(struct device *dev) +{ + struct platform_device *pdev = to_platform_device(dev); + + return omap_device_idle(pdev); +} + +static int _od_runtime_resume(struct device *dev) +{ + struct platform_device *pdev = to_platform_device(dev); + + return omap_device_enable(pdev); +} + +static struct dev_power_domain omap_device_power_domain = { + .ops = { + .runtime_suspend = _od_runtime_suspend, + .runtime_resume = _od_runtime_resume, + USE_PLATFORM_PM_SLEEP_OPS + } +}; + /** * omap_device_register - register an omap_device with one omap_hwmod * @od: struct omap_device * to register @@ -549,6 +571,7 @@ int omap_device_register(struct omap_device *od) pr_debug("omap_device: %s: registering\n", od->pdev.name); od->pdev.dev.parent = &omap_device_parent; + od->pdev.dev.pwr_domain = &omap_device_power_domain; return platform_device_register(&od->pdev); } -- cgit v1.1 From 1d2b71f61b6a10216274e27b717becf9ae101fc7 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 29 Apr 2011 00:36:53 +0200 Subject: PM / Runtime: Add subsystem data field to struct dev_pm_info Some subsystems need to attach PM-related data to struct device and they need to use devres for this purpose. For their convenience and to make code more straightforward, add a new field called subsys_data to struct dev_pm_info and let subsystems use it for attaching PM-related information to devices. Convert the ARM shmobile platform to using the new field. Signed-off-by: Rafael J. Wysocki --- arch/arm/mach-shmobile/pm_runtime.c | 34 +++++++++++++++++----------------- include/linux/pm.h | 1 + 2 files changed, 18 insertions(+), 17 deletions(-) diff --git a/arch/arm/mach-shmobile/pm_runtime.c b/arch/arm/mach-shmobile/pm_runtime.c index 12bb504..30bbe9a 100644 --- a/arch/arm/mach-shmobile/pm_runtime.c +++ b/arch/arm/mach-shmobile/pm_runtime.c @@ -18,6 +18,7 @@ #include #include #include +#include #ifdef CONFIG_PM_RUNTIME #define BIT_ONCE 0 @@ -29,22 +30,9 @@ struct pm_runtime_data { struct clk *clk; }; -static void __devres_release(struct device *dev, void *res) -{ - struct pm_runtime_data *prd = res; - - dev_dbg(dev, "__devres_release()\n"); - - if (test_bit(BIT_CLK_ENABLED, &prd->flags)) - clk_disable(prd->clk); - - if (test_bit(BIT_ACTIVE, &prd->flags)) - clk_put(prd->clk); -} - static struct pm_runtime_data *__to_prd(struct device *dev) { - return devres_find(dev, __devres_release, NULL, NULL); + return dev ? dev->power.subsys_data : NULL; } static void platform_pm_runtime_init(struct device *dev, @@ -121,14 +109,26 @@ static int platform_bus_notify(struct notifier_block *nb, dev_dbg(dev, "platform_bus_notify() %ld !\n", action); - if (action == BUS_NOTIFY_BIND_DRIVER) { - prd = devres_alloc(__devres_release, sizeof(*prd), GFP_KERNEL); + switch (action) { + case BUS_NOTIFY_BIND_DRIVER: + prd = kzalloc(sizeof(*prd), GFP_KERNEL); if (prd) { - devres_add(dev, prd); + dev->power.subsys_data = prd; dev->pwr_domain = &default_power_domain; } else { dev_err(dev, "unable to alloc memory for runtime pm\n"); } + break; + case BUS_NOTIFY_UNBOUND_DRIVER: + prd = __to_prd(dev); + if (prd) { + if (test_bit(BIT_CLK_ENABLED, &prd->flags)) + clk_disable(prd->clk); + + if (test_bit(BIT_ACTIVE, &prd->flags)) + clk_put(prd->clk); + } + break; } return 0; diff --git a/include/linux/pm.h b/include/linux/pm.h index 512e091..f4167d0 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -460,6 +460,7 @@ struct dev_pm_info { unsigned long active_jiffies; unsigned long suspended_jiffies; unsigned long accounting_timestamp; + void *subsys_data; /* Owned by the subsystem. */ #endif }; -- cgit v1.1 From cf3cc1aa9b6d0bf1750143af65829f4368d77492 Mon Sep 17 00:00:00 2001 From: Viktor Rosendahl Date: Mon, 28 Mar 2011 18:56:05 +0300 Subject: kprobes/arm: Fix ldrd/strd emulation Currently emulate_ldrd and emulate_strd don't even have the adjustment of the PC value, so in case of Rn == PC, it will not update the PC incorrectly but instead load/store from the wrong address. Let's add both the adjustment of the PC value and the check for PC == PC. Signed-off-by: Viktor Rosendahl Signed-off-by: Nicolas Pitre --- arch/arm/kernel/kprobes-decode.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/arch/arm/kernel/kprobes-decode.c b/arch/arm/kernel/kprobes-decode.c index 2389131..3b0cf90 100644 --- a/arch/arm/kernel/kprobes-decode.c +++ b/arch/arm/kernel/kprobes-decode.c @@ -540,9 +540,12 @@ static void __kprobes emulate_ldrd(struct kprobe *p, struct pt_regs *regs) { insn_2arg_fn_t *i_fn = (insn_2arg_fn_t *)&p->ainsn.insn[0]; kprobe_opcode_t insn = p->opcode; + long ppc = (long)p->addr + 8; int rd = (insn >> 12) & 0xf; int rn = (insn >> 16) & 0xf; int rm = insn & 0xf; /* rm may be invalid, don't care. */ + long rmv = (rm == 15) ? ppc : regs->uregs[rm]; + long rnv = (rn == 15) ? ppc : regs->uregs[rn]; /* Not following the C calling convention here, so need asm(). */ __asm__ __volatile__ ( @@ -554,29 +557,36 @@ static void __kprobes emulate_ldrd(struct kprobe *p, struct pt_regs *regs) "str r0, %[rn] \n\t" /* in case of writeback */ "str r2, %[rd0] \n\t" "str r3, %[rd1] \n\t" - : [rn] "+m" (regs->uregs[rn]), + : [rn] "+m" (rnv), [rd0] "=m" (regs->uregs[rd]), [rd1] "=m" (regs->uregs[rd+1]) - : [rm] "m" (regs->uregs[rm]), + : [rm] "m" (rmv), [cpsr] "r" (regs->ARM_cpsr), [i_fn] "r" (i_fn) : "r0", "r1", "r2", "r3", "lr", "cc" ); + if (rn != 15) + regs->uregs[rn] = rnv; /* Save Rn in case of writeback. */ } static void __kprobes emulate_strd(struct kprobe *p, struct pt_regs *regs) { insn_4arg_fn_t *i_fn = (insn_4arg_fn_t *)&p->ainsn.insn[0]; kprobe_opcode_t insn = p->opcode; + long ppc = (long)p->addr + 8; int rd = (insn >> 12) & 0xf; int rn = (insn >> 16) & 0xf; int rm = insn & 0xf; - long rnv = regs->uregs[rn]; - long rmv = regs->uregs[rm]; /* rm/rmv may be invalid, don't care. */ + long rnv = (rn == 15) ? ppc : regs->uregs[rn]; + /* rm/rmv may be invalid, don't care. */ + long rmv = (rm == 15) ? ppc : regs->uregs[rm]; + long rnv_wb; - regs->uregs[rn] = insnslot_4arg_rflags(rnv, rmv, regs->uregs[rd], + rnv_wb = insnslot_4arg_rflags(rnv, rmv, regs->uregs[rd], regs->uregs[rd+1], regs->ARM_cpsr, i_fn); + if (rn != 15) + regs->uregs[rn] = rnv_wb; /* Save Rn in case of writeback. */ } static void __kprobes emulate_ldr(struct kprobe *p, struct pt_regs *regs) -- cgit v1.1 From 073090cb701148396b1130be81f8ac84a41f196d Mon Sep 17 00:00:00 2001 From: Jon Medhurst Date: Wed, 6 Apr 2011 11:17:09 +0100 Subject: ARM: kprobes: Fix probing of conditionally executed instructions When a kprobe is placed onto conditionally executed ARM instructions, many of the emulation routines used to single step them produce corrupt register results. Rather than fix all of these cases we modify the framework which calls them to test the relevant condition flags and, if the test fails, skip calling the emulation code. Signed-off-by: Jon Medhurst Signed-off-by: Nicolas Pitre --- arch/arm/include/asm/kprobes.h | 3 ++ arch/arm/kernel/kprobes-decode.c | 91 ++++++++++++++++++++++++++++++++++++++++ arch/arm/kernel/kprobes.c | 3 +- 3 files changed, 96 insertions(+), 1 deletion(-) diff --git a/arch/arm/include/asm/kprobes.h b/arch/arm/include/asm/kprobes.h index bb8a19b..e46bdd0 100644 --- a/arch/arm/include/asm/kprobes.h +++ b/arch/arm/include/asm/kprobes.h @@ -39,10 +39,13 @@ typedef u32 kprobe_opcode_t; struct kprobe; typedef void (kprobe_insn_handler_t)(struct kprobe *, struct pt_regs *); +typedef unsigned long (kprobe_check_cc)(unsigned long); + /* Architecture specific copy of original instruction. */ struct arch_specific_insn { kprobe_opcode_t *insn; kprobe_insn_handler_t *insn_handler; + kprobe_check_cc *insn_check_cc; }; struct prev_kprobe { diff --git a/arch/arm/kernel/kprobes-decode.c b/arch/arm/kernel/kprobes-decode.c index 3b0cf90..2e84169 100644 --- a/arch/arm/kernel/kprobes-decode.c +++ b/arch/arm/kernel/kprobes-decode.c @@ -1396,6 +1396,96 @@ space_cccc_111x(kprobe_opcode_t insn, struct arch_specific_insn *asi) return INSN_GOOD; } +static unsigned long __kprobes __check_eq(unsigned long cpsr) +{ + return cpsr & PSR_Z_BIT; +} + +static unsigned long __kprobes __check_ne(unsigned long cpsr) +{ + return (~cpsr) & PSR_Z_BIT; +} + +static unsigned long __kprobes __check_cs(unsigned long cpsr) +{ + return cpsr & PSR_C_BIT; +} + +static unsigned long __kprobes __check_cc(unsigned long cpsr) +{ + return (~cpsr) & PSR_C_BIT; +} + +static unsigned long __kprobes __check_mi(unsigned long cpsr) +{ + return cpsr & PSR_N_BIT; +} + +static unsigned long __kprobes __check_pl(unsigned long cpsr) +{ + return (~cpsr) & PSR_N_BIT; +} + +static unsigned long __kprobes __check_vs(unsigned long cpsr) +{ + return cpsr & PSR_V_BIT; +} + +static unsigned long __kprobes __check_vc(unsigned long cpsr) +{ + return (~cpsr) & PSR_V_BIT; +} + +static unsigned long __kprobes __check_hi(unsigned long cpsr) +{ + cpsr &= ~(cpsr >> 1); /* PSR_C_BIT &= ~PSR_Z_BIT */ + return cpsr & PSR_C_BIT; +} + +static unsigned long __kprobes __check_ls(unsigned long cpsr) +{ + cpsr &= ~(cpsr >> 1); /* PSR_C_BIT &= ~PSR_Z_BIT */ + return (~cpsr) & PSR_C_BIT; +} + +static unsigned long __kprobes __check_ge(unsigned long cpsr) +{ + cpsr ^= (cpsr << 3); /* PSR_N_BIT ^= PSR_V_BIT */ + return (~cpsr) & PSR_N_BIT; +} + +static unsigned long __kprobes __check_lt(unsigned long cpsr) +{ + cpsr ^= (cpsr << 3); /* PSR_N_BIT ^= PSR_V_BIT */ + return cpsr & PSR_N_BIT; +} + +static unsigned long __kprobes __check_gt(unsigned long cpsr) +{ + unsigned long temp = cpsr ^ (cpsr << 3); /* PSR_N_BIT ^= PSR_V_BIT */ + temp |= (cpsr << 1); /* PSR_N_BIT |= PSR_Z_BIT */ + return (~temp) & PSR_N_BIT; +} + +static unsigned long __kprobes __check_le(unsigned long cpsr) +{ + unsigned long temp = cpsr ^ (cpsr << 3); /* PSR_N_BIT ^= PSR_V_BIT */ + temp |= (cpsr << 1); /* PSR_N_BIT |= PSR_Z_BIT */ + return temp & PSR_N_BIT; +} + +static unsigned long __kprobes __check_al(unsigned long cpsr) +{ + return true; +} + +static kprobe_check_cc * const condition_checks[16] = { + &__check_eq, &__check_ne, &__check_cs, &__check_cc, + &__check_mi, &__check_pl, &__check_vs, &__check_vc, + &__check_hi, &__check_ls, &__check_ge, &__check_lt, + &__check_gt, &__check_le, &__check_al, &__check_al +}; + /* Return: * INSN_REJECTED If instruction is one not allowed to kprobe, * INSN_GOOD If instruction is supported and uses instruction slot, @@ -1411,6 +1501,7 @@ space_cccc_111x(kprobe_opcode_t insn, struct arch_specific_insn *asi) enum kprobe_insn __kprobes arm_kprobe_decode_insn(kprobe_opcode_t insn, struct arch_specific_insn *asi) { + asi->insn_check_cc = condition_checks[insn>>28]; asi->insn[1] = KPROBE_RETURN_INSTRUCTION; if ((insn & 0xf0000000) == 0xf0000000) { diff --git a/arch/arm/kernel/kprobes.c b/arch/arm/kernel/kprobes.c index 2ba7deb..1656c87 100644 --- a/arch/arm/kernel/kprobes.c +++ b/arch/arm/kernel/kprobes.c @@ -134,7 +134,8 @@ static void __kprobes singlestep(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb) { regs->ARM_pc += 4; - p->ainsn.insn_handler(p, regs); + if (p->ainsn.insn_check_cc(regs->ARM_cpsr)) + p->ainsn.insn_handler(p, regs); } /* -- cgit v1.1 From a539f5d46c868cb1f37b92e62e040b400571aed4 Mon Sep 17 00:00:00 2001 From: Jon Medhurst Date: Wed, 6 Apr 2011 11:17:10 +0100 Subject: ARM: kprobes: Remove redundant condition checks from simulation routines Now we have the framework code handling conditionally executed instructions we can remove redundant checks in individual simulation routines. Signed-off-by: Jon Medhurst Signed-off-by: Nicolas Pitre --- arch/arm/kernel/kprobes-decode.c | 30 +++--------------------------- 1 file changed, 3 insertions(+), 27 deletions(-) diff --git a/arch/arm/kernel/kprobes-decode.c b/arch/arm/kernel/kprobes-decode.c index 2e84169..f52fac0c5 100644 --- a/arch/arm/kernel/kprobes-decode.c +++ b/arch/arm/kernel/kprobes-decode.c @@ -71,10 +71,6 @@ #define PSR_fs (PSR_f|PSR_s) #define KPROBE_RETURN_INSTRUCTION 0xe1a0f00e /* mov pc, lr */ -#define SET_R0_TRUE_INSTRUCTION 0xe3a00001 /* mov r0, #1 */ - -#define truecc_insn(insn) (((insn) & 0xf0000000) | \ - (SET_R0_TRUE_INSTRUCTION & 0x0fffffff)) typedef long (insn_0arg_fn_t)(void); typedef long (insn_1arg_fn_t)(long); @@ -419,14 +415,10 @@ insnslot_llret_4arg_rwflags(long r0, long r1, long r2, long r3, long *cpsr, static void __kprobes simulate_bbl(struct kprobe *p, struct pt_regs *regs) { - insn_1arg_fn_t *i_fn = (insn_1arg_fn_t *)&p->ainsn.insn[0]; kprobe_opcode_t insn = p->opcode; long iaddr = (long)p->addr; int disp = branch_displacement(insn); - if (!insnslot_1arg_rflags(0, regs->ARM_cpsr, i_fn)) - return; - if (insn & (1 << 24)) regs->ARM_lr = iaddr + 4; @@ -446,14 +438,10 @@ static void __kprobes simulate_blx1(struct kprobe *p, struct pt_regs *regs) static void __kprobes simulate_blx2bx(struct kprobe *p, struct pt_regs *regs) { - insn_1arg_fn_t *i_fn = (insn_1arg_fn_t *)&p->ainsn.insn[0]; kprobe_opcode_t insn = p->opcode; int rm = insn & 0xf; long rmv = regs->uregs[rm]; - if (!insnslot_1arg_rflags(0, regs->ARM_cpsr, i_fn)) - return; - if (insn & (1 << 5)) regs->ARM_lr = (long)p->addr + 4; @@ -465,7 +453,6 @@ static void __kprobes simulate_blx2bx(struct kprobe *p, struct pt_regs *regs) static void __kprobes simulate_ldm1stm1(struct kprobe *p, struct pt_regs *regs) { - insn_1arg_fn_t *i_fn = (insn_1arg_fn_t *)&p->ainsn.insn[0]; kprobe_opcode_t insn = p->opcode; int rn = (insn >> 16) & 0xf; int lbit = insn & (1 << 20); @@ -476,9 +463,6 @@ static void __kprobes simulate_ldm1stm1(struct kprobe *p, struct pt_regs *regs) int reg_bit_vector; int reg_count; - if (!insnslot_1arg_rflags(0, regs->ARM_cpsr, i_fn)) - return; - reg_count = 0; reg_bit_vector = insn & 0xffff; while (reg_bit_vector) { @@ -510,11 +494,6 @@ static void __kprobes simulate_ldm1stm1(struct kprobe *p, struct pt_regs *regs) static void __kprobes simulate_stm1_pc(struct kprobe *p, struct pt_regs *regs) { - insn_1arg_fn_t *i_fn = (insn_1arg_fn_t *)&p->ainsn.insn[0]; - - if (!insnslot_1arg_rflags(0, regs->ARM_cpsr, i_fn)) - return; - regs->ARM_pc = (long)p->addr + str_pc_offset; simulate_ldm1stm1(p, regs); regs->ARM_pc = (long)p->addr + 4; @@ -1056,9 +1035,8 @@ space_cccc_000x(kprobe_opcode_t insn, struct arch_specific_insn *asi) /* BLX(2) : cccc 0001 0010 xxxx xxxx xxxx 0011 xxxx */ /* BX : cccc 0001 0010 xxxx xxxx xxxx 0001 xxxx */ if ((insn & 0x0ff000d0) == 0x01200010) { - asi->insn[0] = truecc_insn(insn); asi->insn_handler = simulate_blx2bx; - return INSN_GOOD; + return INSN_GOOD_NO_SLOT; } /* CLZ : cccc 0001 0110 xxxx xxxx xxxx 0001 xxxx */ @@ -1333,10 +1311,9 @@ space_cccc_100x(kprobe_opcode_t insn, struct arch_specific_insn *asi) /* LDM(1) : cccc 100x x0x1 xxxx xxxx xxxx xxxx xxxx */ /* STM(1) : cccc 100x x0x0 xxxx xxxx xxxx xxxx xxxx */ - asi->insn[0] = truecc_insn(insn); asi->insn_handler = ((insn & 0x108000) == 0x008000) ? /* STM & R15 */ simulate_stm1_pc : simulate_ldm1stm1; - return INSN_GOOD; + return INSN_GOOD_NO_SLOT; } static enum kprobe_insn __kprobes @@ -1344,9 +1321,8 @@ space_cccc_101x(kprobe_opcode_t insn, struct arch_specific_insn *asi) { /* B : cccc 1010 xxxx xxxx xxxx xxxx xxxx xxxx */ /* BL : cccc 1011 xxxx xxxx xxxx xxxx xxxx xxxx */ - asi->insn[0] = truecc_insn(insn); asi->insn_handler = simulate_bbl; - return INSN_GOOD; + return INSN_GOOD_NO_SLOT; } static enum kprobe_insn __kprobes -- cgit v1.1 From ad111ce46674a473370d302325db8f418ac4fe92 Mon Sep 17 00:00:00 2001 From: Jon Medhurst Date: Wed, 6 Apr 2011 11:17:11 +0100 Subject: ARM: kprobes: Fix emulation of CMP, CMN, TST and TEQ instructions. Probing these instructions was corrupting R0 because the emulation code didn't account for the fact that they don't write a result to a register. Signed-off-by: Jon Medhurst Signed-off-by: Nicolas Pitre --- arch/arm/kernel/kprobes-decode.c | 55 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 53 insertions(+), 2 deletions(-) diff --git a/arch/arm/kernel/kprobes-decode.c b/arch/arm/kernel/kprobes-decode.c index f52fac0c5..ee29d33 100644 --- a/arch/arm/kernel/kprobes-decode.c +++ b/arch/arm/kernel/kprobes-decode.c @@ -808,6 +808,17 @@ emulate_alu_imm_rwflags(struct kprobe *p, struct pt_regs *regs) } static void __kprobes +emulate_alu_tests_imm(struct kprobe *p, struct pt_regs *regs) +{ + insn_1arg_fn_t *i_fn = (insn_1arg_fn_t *)&p->ainsn.insn[0]; + kprobe_opcode_t insn = p->opcode; + int rn = (insn >> 16) & 0xf; + long rnv = (rn == 15) ? (long)p->addr + 8 : regs->uregs[rn]; + + insnslot_1arg_rwflags(rnv, ®s->ARM_cpsr, i_fn); +} + +static void __kprobes emulate_alu_rflags(struct kprobe *p, struct pt_regs *regs) { insn_3arg_fn_t *i_fn = (insn_3arg_fn_t *)&p->ainsn.insn[0]; @@ -843,6 +854,22 @@ emulate_alu_rwflags(struct kprobe *p, struct pt_regs *regs) insnslot_3arg_rwflags(rnv, rmv, rsv, ®s->ARM_cpsr, i_fn); } +static void __kprobes +emulate_alu_tests(struct kprobe *p, struct pt_regs *regs) +{ + insn_3arg_fn_t *i_fn = (insn_3arg_fn_t *)&p->ainsn.insn[0]; + kprobe_opcode_t insn = p->opcode; + long ppc = (long)p->addr + 8; + int rn = (insn >> 16) & 0xf; + int rs = (insn >> 8) & 0xf; /* rs/rsv may be invalid, don't care. */ + int rm = insn & 0xf; + long rnv = (rn == 15) ? ppc : regs->uregs[rn]; + long rmv = (rm == 15) ? ppc : regs->uregs[rm]; + long rsv = regs->uregs[rs]; + + insnslot_3arg_rwflags(rnv, rmv, rsv, ®s->ARM_cpsr, i_fn); +} + static enum kprobe_insn __kprobes prep_emulate_ldr_str(kprobe_opcode_t insn, struct arch_specific_insn *asi) { @@ -1142,8 +1169,20 @@ space_cccc_000x(kprobe_opcode_t insn, struct arch_specific_insn *asi) insn |= 0x00000200; /* Rs = r2 */ } asi->insn[0] = insn; - asi->insn_handler = (insn & (1 << 20)) ? /* S-bit */ + + if ((insn & 0x0f900000) == 0x01100000) { + /* + * TST : cccc 0001 0001 xxxx xxxx xxxx xxxx xxxx + * TEQ : cccc 0001 0011 xxxx xxxx xxxx xxxx xxxx + * CMP : cccc 0001 0101 xxxx xxxx xxxx xxxx xxxx + * CMN : cccc 0001 0111 xxxx xxxx xxxx xxxx xxxx + */ + asi->insn_handler = emulate_alu_tests; + } else { + /* ALU ops which write to Rd */ + asi->insn_handler = (insn & (1 << 20)) ? /* S-bit */ emulate_alu_rwflags : emulate_alu_rflags; + } return INSN_GOOD; } @@ -1170,8 +1209,20 @@ space_cccc_001x(kprobe_opcode_t insn, struct arch_specific_insn *asi) */ insn &= 0xffff0fff; /* Rd = r0 */ asi->insn[0] = insn; - asi->insn_handler = (insn & (1 << 20)) ? /* S-bit */ + + if ((insn & 0x0f900000) == 0x03100000) { + /* + * TST : cccc 0011 0001 xxxx xxxx xxxx xxxx xxxx + * TEQ : cccc 0011 0011 xxxx xxxx xxxx xxxx xxxx + * CMP : cccc 0011 0101 xxxx xxxx xxxx xxxx xxxx + * CMN : cccc 0011 0111 xxxx xxxx xxxx xxxx xxxx + */ + asi->insn_handler = emulate_alu_tests_imm; + } else { + /* ALU ops which write to Rd */ + asi->insn_handler = (insn & (1 << 20)) ? /* S-bit */ emulate_alu_imm_rwflags : emulate_alu_imm_rflags; + } return INSN_GOOD; } -- cgit v1.1 From 896a74e19d0131413a96502429994bc8e6bbbe5a Mon Sep 17 00:00:00 2001 From: Jon Medhurst Date: Wed, 6 Apr 2011 11:17:12 +0100 Subject: ARM: kprobes: Fix emulation of Data-processing (immediate) instructions Emulation of instructions like "ADD rd, rn, #" would result in a corrupted value for rd. Signed-off-by: Jon Medhurst Signed-off-by: Nicolas Pitre --- arch/arm/kernel/kprobes-decode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/kernel/kprobes-decode.c b/arch/arm/kernel/kprobes-decode.c index ee29d33..baf053e 100644 --- a/arch/arm/kernel/kprobes-decode.c +++ b/arch/arm/kernel/kprobes-decode.c @@ -1207,7 +1207,7 @@ space_cccc_001x(kprobe_opcode_t insn, struct arch_specific_insn *asi) * *S (bit 20) updates condition codes * ADC/SBC/RSC reads the C flag */ - insn &= 0xffff0fff; /* Rd = r0 */ + insn &= 0xfff00fff; /* Rn = r0 and Rd = r0 */ asi->insn[0] = insn; if ((insn & 0x0f900000) == 0x03100000) { -- cgit v1.1 From 51468ea91efad9c7e6dbae43cd8bdc423ec61709 Mon Sep 17 00:00:00 2001 From: Jon Medhurst Date: Thu, 7 Apr 2011 13:25:15 +0100 Subject: ARM: kprobes: Reject probing MRS instructions which read SPSR We need to reject probing of instructions which read SPSR because we can't handle this as the value in SPSR is lost when the exception handler for the probe breakpoint first runs. This patch also fixes the bitmask for MRS instructions decoding to include checking bits 5-7. Signed-off-by: Jon Medhurst Signed-off-by: Nicolas Pitre --- arch/arm/kernel/kprobes-decode.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/arch/arm/kernel/kprobes-decode.c b/arch/arm/kernel/kprobes-decode.c index baf053e..e5bc576 100644 --- a/arch/arm/kernel/kprobes-decode.c +++ b/arch/arm/kernel/kprobes-decode.c @@ -1026,14 +1026,16 @@ space_cccc_000x(kprobe_opcode_t insn, struct arch_specific_insn *asi) /* cccc 0001 0xx0 xxxx xxxx xxxx xxxx xxx0 xxxx */ if ((insn & 0x0f900010) == 0x01000000) { - /* BXJ : cccc 0001 0010 xxxx xxxx xxxx 0010 xxxx */ - /* MSR : cccc 0001 0x10 xxxx xxxx xxxx 0000 xxxx */ + /* BXJ : cccc 0001 0010 xxxx xxxx xxxx 0010 xxxx */ + /* MSR : cccc 0001 0x10 xxxx xxxx xxxx 0000 xxxx */ + /* MRS spsr : cccc 0001 0100 xxxx xxxx xxxx 0000 xxxx */ if ((insn & 0x0ff000f0) == 0x01200020 || - (insn & 0x0fb000f0) == 0x01200000) + (insn & 0x0fb000f0) == 0x01200000 || + (insn & 0x0ff000f0) == 0x01400000) return INSN_REJECTED; - /* MRS : cccc 0001 0x00 xxxx xxxx xxxx 0000 xxxx */ - if ((insn & 0x0fb00010) == 0x01000000) + /* MRS cpsr : cccc 0001 0000 xxxx xxxx xxxx 0000 xxxx */ + if ((insn & 0x0ff000f0) == 0x01000000) return prep_emulate_rd12(insn, asi); /* SMLALxy : cccc 0001 0100 xxxx xxxx xxxx 1xx0 xxxx */ -- cgit v1.1 From c412aba2a1243192a4d53736805a96bdda915608 Mon Sep 17 00:00:00 2001 From: Jon Medhurst Date: Thu, 7 Apr 2011 13:25:16 +0100 Subject: ARM: kprobes: Fix emulation of MRS instruction The MRS instruction should set mode and interrupt bits in the read value so it is simpler to use a new simulation routine (simulate_mrs) rather than some modified emulation. prep_emulate_rd12 is now unused and removed. Signed-off-by: Jon Medhurst Signed-off-by: Nicolas Pitre --- arch/arm/kernel/kprobes-decode.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/arch/arm/kernel/kprobes-decode.c b/arch/arm/kernel/kprobes-decode.c index e5bc576..4ab83f4 100644 --- a/arch/arm/kernel/kprobes-decode.c +++ b/arch/arm/kernel/kprobes-decode.c @@ -451,6 +451,14 @@ static void __kprobes simulate_blx2bx(struct kprobe *p, struct pt_regs *regs) regs->ARM_cpsr |= PSR_T_BIT; } +static void __kprobes simulate_mrs(struct kprobe *p, struct pt_regs *regs) +{ + kprobe_opcode_t insn = p->opcode; + int rd = (insn >> 12) & 0xf; + unsigned long mask = 0xf8ff03df; /* Mask out execution state */ + regs->uregs[rd] = regs->ARM_cpsr & mask; +} + static void __kprobes simulate_ldm1stm1(struct kprobe *p, struct pt_regs *regs) { kprobe_opcode_t insn = p->opcode; @@ -896,15 +904,6 @@ prep_emulate_rd12rm0(kprobe_opcode_t insn, struct arch_specific_insn *asi) } static enum kprobe_insn __kprobes -prep_emulate_rd12(kprobe_opcode_t insn, struct arch_specific_insn *asi) -{ - insn &= 0xffff0fff; /* Rd = r0 */ - asi->insn[0] = insn; - asi->insn_handler = emulate_rd12; - return INSN_GOOD; -} - -static enum kprobe_insn __kprobes prep_emulate_rd12rn16rm0_wflags(kprobe_opcode_t insn, struct arch_specific_insn *asi) { @@ -1035,8 +1034,10 @@ space_cccc_000x(kprobe_opcode_t insn, struct arch_specific_insn *asi) return INSN_REJECTED; /* MRS cpsr : cccc 0001 0000 xxxx xxxx xxxx 0000 xxxx */ - if ((insn & 0x0ff000f0) == 0x01000000) - return prep_emulate_rd12(insn, asi); + if ((insn & 0x0ff000f0) == 0x01000000) { + asi->insn_handler = simulate_mrs; + return INSN_GOOD_NO_SLOT; + } /* SMLALxy : cccc 0001 0100 xxxx xxxx xxxx 1xx0 xxxx */ if ((insn & 0x0ff00090) == 0x01400080) -- cgit v1.1 From 983ebd9365096c3386f6ed0978333e15f66024f5 Mon Sep 17 00:00:00 2001 From: Jon Medhurst Date: Thu, 7 Apr 2011 13:25:17 +0100 Subject: ARM: kprobes: Reject probing of instructions which write to PC unpredictably. Signed-off-by: Jon Medhurst Signed-off-by: Nicolas Pitre --- arch/arm/kernel/kprobes-decode.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/arch/arm/kernel/kprobes-decode.c b/arch/arm/kernel/kprobes-decode.c index 4ab83f4..6d09db9 100644 --- a/arch/arm/kernel/kprobes-decode.c +++ b/arch/arm/kernel/kprobes-decode.c @@ -68,6 +68,8 @@ #define branch_displacement(insn) sign_extend(((insn) & 0xffffff) << 2, 25) +#define is_r15(insn, bitpos) (((insn) & (0xf << bitpos)) == (0xf << bitpos)) + #define PSR_fs (PSR_f|PSR_s) #define KPROBE_RETURN_INSTRUCTION 0xe1a0f00e /* mov pc, lr */ @@ -897,6 +899,9 @@ prep_emulate_ldr_str(kprobe_opcode_t insn, struct arch_specific_insn *asi) static enum kprobe_insn __kprobes prep_emulate_rd12rm0(kprobe_opcode_t insn, struct arch_specific_insn *asi) { + if (is_r15(insn, 12)) + return INSN_REJECTED; /* Rd is PC */ + insn &= 0xffff0ff0; /* Rd = r0, Rm = r0 */ asi->insn[0] = insn; asi->insn_handler = emulate_rd12rm0; @@ -907,6 +912,9 @@ static enum kprobe_insn __kprobes prep_emulate_rd12rn16rm0_wflags(kprobe_opcode_t insn, struct arch_specific_insn *asi) { + if (is_r15(insn, 12)) + return INSN_REJECTED; /* Rd is PC */ + insn &= 0xfff00ff0; /* Rd = r0, Rn = r0 */ insn |= 0x00000001; /* Rm = r1 */ asi->insn[0] = insn; @@ -918,6 +926,9 @@ static enum kprobe_insn __kprobes prep_emulate_rd16rs8rm0_wflags(kprobe_opcode_t insn, struct arch_specific_insn *asi) { + if (is_r15(insn, 16)) + return INSN_REJECTED; /* Rd is PC */ + insn &= 0xfff0f0f0; /* Rd = r0, Rs = r0 */ insn |= 0x00000001; /* Rm = r1 */ asi->insn[0] = insn; @@ -929,6 +940,9 @@ static enum kprobe_insn __kprobes prep_emulate_rd16rn12rs8rm0_wflags(kprobe_opcode_t insn, struct arch_specific_insn *asi) { + if (is_r15(insn, 16)) + return INSN_REJECTED; /* Rd is PC */ + insn &= 0xfff000f0; /* Rd = r0, Rn = r0 */ insn |= 0x00000102; /* Rs = r1, Rm = r2 */ asi->insn[0] = insn; @@ -940,6 +954,9 @@ static enum kprobe_insn __kprobes prep_emulate_rdhi16rdlo12rs8rm0_wflags(kprobe_opcode_t insn, struct arch_specific_insn *asi) { + if (is_r15(insn, 16) || is_r15(insn, 12)) + return INSN_REJECTED; /* RdHi or RdLo is PC */ + insn &= 0xfff000f0; /* RdHi = r0, RdLo = r1 */ insn |= 0x00001203; /* Rs = r2, Rm = r3 */ asi->insn[0] = insn; @@ -1035,6 +1052,8 @@ space_cccc_000x(kprobe_opcode_t insn, struct arch_specific_insn *asi) /* MRS cpsr : cccc 0001 0000 xxxx xxxx xxxx 0000 xxxx */ if ((insn & 0x0ff000f0) == 0x01000000) { + if (is_r15(insn, 12)) + return INSN_REJECTED; /* Rd is PC */ asi->insn_handler = simulate_mrs; return INSN_GOOD_NO_SLOT; } @@ -1065,6 +1084,8 @@ space_cccc_000x(kprobe_opcode_t insn, struct arch_specific_insn *asi) /* BLX(2) : cccc 0001 0010 xxxx xxxx xxxx 0011 xxxx */ /* BX : cccc 0001 0010 xxxx xxxx xxxx 0001 xxxx */ if ((insn & 0x0ff000d0) == 0x01200010) { + if ((insn & 0x0ff000ff) == 0x0120003f) + return INSN_REJECTED; /* BLX pc */ asi->insn_handler = simulate_blx2bx; return INSN_GOOD_NO_SLOT; } @@ -1234,6 +1255,8 @@ space_cccc_0110__1(kprobe_opcode_t insn, struct arch_specific_insn *asi) { /* SEL : cccc 0110 1000 xxxx xxxx xxxx 1011 xxxx GE: !!! */ if ((insn & 0x0ff000f0) == 0x068000b0) { + if (is_r15(insn, 12)) + return INSN_REJECTED; /* Rd is PC */ insn &= 0xfff00ff0; /* Rd = r0, Rn = r0 */ insn |= 0x00000001; /* Rm = r1 */ asi->insn[0] = insn; @@ -1247,6 +1270,8 @@ space_cccc_0110__1(kprobe_opcode_t insn, struct arch_specific_insn *asi) /* USAT16 : cccc 0110 1110 xxxx xxxx xxxx 0011 xxxx :Q */ if ((insn & 0x0fa00030) == 0x06a00010 || (insn & 0x0fb000f0) == 0x06a00030) { + if (is_r15(insn, 12)) + return INSN_REJECTED; /* Rd is PC */ insn &= 0xffff0ff0; /* Rd = r0, Rm = r0 */ asi->insn[0] = insn; asi->insn_handler = emulate_sat; @@ -1384,6 +1409,9 @@ space_cccc_1100_010x(kprobe_opcode_t insn, struct arch_specific_insn *asi) { /* MCRR : cccc 1100 0100 xxxx xxxx xxxx xxxx xxxx : (Rd!=Rn) */ /* MRRC : cccc 1100 0101 xxxx xxxx xxxx xxxx xxxx : (Rd!=Rn) */ + if (is_r15(insn, 16) || is_r15(insn, 12)) + return INSN_REJECTED; /* Rn or Rd is PC */ + insn &= 0xfff00fff; insn |= 0x00001000; /* Rn = r0, Rd = r1 */ asi->insn[0] = insn; -- cgit v1.1 From 75539aea4cb96823171cabac7391f1a26fbf9c1b Mon Sep 17 00:00:00 2001 From: Jon Medhurst Date: Thu, 7 Apr 2011 13:25:18 +0100 Subject: ARM: kprobes: Fix error in comment Signed-off-by: Jon Medhurst Signed-off-by: Nicolas Pitre --- arch/arm/kernel/kprobes-decode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/kernel/kprobes-decode.c b/arch/arm/kernel/kprobes-decode.c index 6d09db9..54c1759 100644 --- a/arch/arm/kernel/kprobes-decode.c +++ b/arch/arm/kernel/kprobes-decode.c @@ -1069,7 +1069,7 @@ space_cccc_000x(kprobe_opcode_t insn, struct arch_specific_insn *asi) return prep_emulate_rd16rs8rm0_wflags(insn, asi); /* SMLAxy : cccc 0001 0000 xxxx xxxx xxxx 1xx0 xxxx : Q */ - /* SMLAWy : cccc 0001 0010 xxxx xxxx xxxx 0x00 xxxx : Q */ + /* SMLAWy : cccc 0001 0010 xxxx xxxx xxxx 1x00 xxxx : Q */ return prep_emulate_rd16rn12rs8rm0_wflags(insn, asi); } -- cgit v1.1 From ba48d40713ae5afe758dd5525a0f8740cce71d22 Mon Sep 17 00:00:00 2001 From: Jon Medhurst Date: Thu, 7 Apr 2011 13:25:19 +0100 Subject: ARM: kprobes: Reject probing of undefined multiply instructions The instructions space for 'Multiply and multiply-accumulate' instructions contains some undefined patterns. We need to reject probing of these because they may in future become defined and the kprobes code may then emulate them faultily. This has already happened with the new MLS instruction which this patch also adds correct decoding for as well as tightening up other decoding tests. (Before this patch the wrong emulation routine was being called for MLS though it still produced correct results.) Signed-off-by: Jon Medhurst Signed-off-by: Nicolas Pitre --- arch/arm/kernel/kprobes-decode.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/arch/arm/kernel/kprobes-decode.c b/arch/arm/kernel/kprobes-decode.c index 54c1759..8f3f03e 100644 --- a/arch/arm/kernel/kprobes-decode.c +++ b/arch/arm/kernel/kprobes-decode.c @@ -1102,13 +1102,16 @@ space_cccc_000x(kprobe_opcode_t insn, struct arch_specific_insn *asi) } /* cccc 0000 xxxx xxxx xxxx xxxx xxxx 1001 xxxx */ - else if ((insn & 0x0f000090) == 0x00000090) { + else if ((insn & 0x0f0000f0) == 0x00000090) { /* MUL : cccc 0000 0000 xxxx xxxx xxxx 1001 xxxx : */ /* MULS : cccc 0000 0001 xxxx xxxx xxxx 1001 xxxx :cc */ /* MLA : cccc 0000 0010 xxxx xxxx xxxx 1001 xxxx : */ /* MLAS : cccc 0000 0011 xxxx xxxx xxxx 1001 xxxx :cc */ /* UMAAL : cccc 0000 0100 xxxx xxxx xxxx 1001 xxxx : */ + /* undef : cccc 0000 0101 xxxx xxxx xxxx 1001 xxxx : */ + /* MLS : cccc 0000 0110 xxxx xxxx xxxx 1001 xxxx : */ + /* undef : cccc 0000 0111 xxxx xxxx xxxx 1001 xxxx : */ /* UMULL : cccc 0000 1000 xxxx xxxx xxxx 1001 xxxx : */ /* UMULLS : cccc 0000 1001 xxxx xxxx xxxx 1001 xxxx :cc */ /* UMLAL : cccc 0000 1010 xxxx xxxx xxxx 1001 xxxx : */ @@ -1117,9 +1120,11 @@ space_cccc_000x(kprobe_opcode_t insn, struct arch_specific_insn *asi) /* SMULLS : cccc 0000 1101 xxxx xxxx xxxx 1001 xxxx :cc */ /* SMLAL : cccc 0000 1110 xxxx xxxx xxxx 1001 xxxx : */ /* SMLALS : cccc 0000 1111 xxxx xxxx xxxx 1001 xxxx :cc */ - if ((insn & 0x0fe000f0) == 0x00000090) { + if ((insn & 0x00d00000) == 0x00500000) { + return INSN_REJECTED; + } else if ((insn & 0x00e00000) == 0x00000000) { return prep_emulate_rd16rs8rm0_wflags(insn, asi); - } else if ((insn & 0x0fe000f0) == 0x00200090) { + } else if ((insn & 0x00a00000) == 0x00200000) { return prep_emulate_rd16rn12rs8rm0_wflags(insn, asi); } else { return prep_emulate_rdhi16rdlo12rs8rm0_wflags(insn, asi); -- cgit v1.1 From ec58d7f2373cec47f30b773d40a89f18bf6fc489 Mon Sep 17 00:00:00 2001 From: Jon Medhurst Date: Fri, 8 Apr 2011 15:32:53 +0100 Subject: ARM: kprobes: Reject probing of STREX and LDREX instructions The emulation code for STREX and LDREX instructions is faulty, however, rather than attempting to fix this we reject probes of these instructions. We do this because they can never succeed in gaining exclusive access as the exception framework clears the exclusivity monitor when a probes breakpoint is hit. (This is a general problem when probing all instructions executing between a LDREX and its corresponding STREX and can lead to infinite retry loops.) Signed-off-by: Jon Medhurst Signed-off-by: Nicolas Pitre --- arch/arm/kernel/kprobes-decode.c | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/arch/arm/kernel/kprobes-decode.c b/arch/arm/kernel/kprobes-decode.c index 8f3f03e..c24e21e 100644 --- a/arch/arm/kernel/kprobes-decode.c +++ b/arch/arm/kernel/kprobes-decode.c @@ -1136,17 +1136,34 @@ space_cccc_000x(kprobe_opcode_t insn, struct arch_specific_insn *asi) /* SWP : cccc 0001 0000 xxxx xxxx xxxx 1001 xxxx */ /* SWPB : cccc 0001 0100 xxxx xxxx xxxx 1001 xxxx */ - /* LDRD : cccc 000x xxx0 xxxx xxxx xxxx 1101 xxxx */ - /* STRD : cccc 000x xxx0 xxxx xxxx xxxx 1111 xxxx */ + /* ??? : cccc 0001 0x01 xxxx xxxx xxxx 1001 xxxx */ + /* ??? : cccc 0001 0x10 xxxx xxxx xxxx 1001 xxxx */ + /* ??? : cccc 0001 0x11 xxxx xxxx xxxx 1001 xxxx */ /* STREX : cccc 0001 1000 xxxx xxxx xxxx 1001 xxxx */ /* LDREX : cccc 0001 1001 xxxx xxxx xxxx 1001 xxxx */ + /* STREXD: cccc 0001 1010 xxxx xxxx xxxx 1001 xxxx */ + /* LDREXD: cccc 0001 1011 xxxx xxxx xxxx 1001 xxxx */ + /* STREXB: cccc 0001 1100 xxxx xxxx xxxx 1001 xxxx */ + /* LDREXB: cccc 0001 1101 xxxx xxxx xxxx 1001 xxxx */ + /* STREXH: cccc 0001 1110 xxxx xxxx xxxx 1001 xxxx */ + /* LDREXH: cccc 0001 1111 xxxx xxxx xxxx 1001 xxxx */ + + /* LDRD : cccc 000x xxx0 xxxx xxxx xxxx 1101 xxxx */ + /* STRD : cccc 000x xxx0 xxxx xxxx xxxx 1111 xxxx */ /* LDRH : cccc 000x xxx1 xxxx xxxx xxxx 1011 xxxx */ /* STRH : cccc 000x xxx0 xxxx xxxx xxxx 1011 xxxx */ /* LDRSB : cccc 000x xxx1 xxxx xxxx xxxx 1101 xxxx */ /* LDRSH : cccc 000x xxx1 xxxx xxxx xxxx 1111 xxxx */ - if ((insn & 0x0fb000f0) == 0x01000090) { - /* SWP/SWPB */ - return prep_emulate_rd12rn16rm0_wflags(insn, asi); + if ((insn & 0x0f0000f0) == 0x01000090) { + if ((insn & 0x0fb000f0) == 0x01000090) { + /* SWP/SWPB */ + return prep_emulate_rd12rn16rm0_wflags(insn, + asi); + } else { + /* STREX/LDREX variants and unallocaed space */ + return INSN_REJECTED; + } + } else if ((insn & 0x0e1000d0) == 0x00000d0) { /* STRD/LDRD */ insn &= 0xfff00fff; -- cgit v1.1 From 6823fc85fcfba11675f2027aadf2d5291c6f351b Mon Sep 17 00:00:00 2001 From: Jon Medhurst Date: Fri, 8 Apr 2011 15:32:54 +0100 Subject: ARM: kprobes: Fix emulation of LDRH, STRH, LDRSB and LDRSH instructions The decoding of these instructions got the register indexed and immediate indexed forms the wrong way around, causing incorrect emulation. Signed-off-by: Jon Medhurst Signed-off-by: Nicolas Pitre --- arch/arm/kernel/kprobes-decode.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/arm/kernel/kprobes-decode.c b/arch/arm/kernel/kprobes-decode.c index c24e21e..348ff47 100644 --- a/arch/arm/kernel/kprobes-decode.c +++ b/arch/arm/kernel/kprobes-decode.c @@ -883,11 +883,12 @@ emulate_alu_tests(struct kprobe *p, struct pt_regs *regs) static enum kprobe_insn __kprobes prep_emulate_ldr_str(kprobe_opcode_t insn, struct arch_specific_insn *asi) { - int ibit = (insn & (1 << 26)) ? 25 : 22; + int not_imm = (insn & (1 << 26)) ? (insn & (1 << 25)) + : (~insn & (1 << 22)); insn &= 0xfff00fff; insn |= 0x00001000; /* Rn = r0, Rd = r1 */ - if (insn & (1 << ibit)) { + if (not_imm) { insn &= ~0xf; insn |= 2; /* Rm = r2 */ } -- cgit v1.1 From 54823accfcfc715e9e757a621afb40dabc01d033 Mon Sep 17 00:00:00 2001 From: Jon Medhurst Date: Fri, 8 Apr 2011 15:32:55 +0100 Subject: ARM: kprobes: Reject probing of LDR/STR instructions which update PC unpredictably Using PC as an base register with writeback is UNPREDICTABLE, as is non word-sized loads or stores of PC. (We only really care about preventing loads to PC but it keeps the code simpler if we also exclude stores.) Signed-off-by: Jon Medhurst Signed-off-by: Nicolas Pitre --- arch/arm/kernel/kprobes-decode.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/arch/arm/kernel/kprobes-decode.c b/arch/arm/kernel/kprobes-decode.c index 348ff47..a4dba1f 100644 --- a/arch/arm/kernel/kprobes-decode.c +++ b/arch/arm/kernel/kprobes-decode.c @@ -70,6 +70,12 @@ #define is_r15(insn, bitpos) (((insn) & (0xf << bitpos)) == (0xf << bitpos)) +/* + * Test if load/store instructions writeback the address register. + * if P (bit 24) == 0 or W (bit 21) == 1 + */ +#define is_writeback(insn) ((insn ^ 0x01000000) & 0x01200000) + #define PSR_fs (PSR_f|PSR_s) #define KPROBE_RETURN_INSTRUCTION 0xe1a0f00e /* mov pc, lr */ @@ -886,6 +892,9 @@ prep_emulate_ldr_str(kprobe_opcode_t insn, struct arch_specific_insn *asi) int not_imm = (insn & (1 << 26)) ? (insn & (1 << 25)) : (~insn & (1 << 22)); + if (is_writeback(insn) && is_r15(insn, 16)) + return INSN_REJECTED; /* Writeback to PC */ + insn &= 0xfff00fff; insn |= 0x00001000; /* Rn = r0, Rd = r1 */ if (not_imm) { @@ -1167,6 +1176,11 @@ space_cccc_000x(kprobe_opcode_t insn, struct arch_specific_insn *asi) } else if ((insn & 0x0e1000d0) == 0x00000d0) { /* STRD/LDRD */ + if ((insn & 0x0000e000) == 0x0000e000) + return INSN_REJECTED; /* Rd is LR or PC */ + if (is_writeback(insn) && is_r15(insn, 16)) + return INSN_REJECTED; /* Writeback to PC */ + insn &= 0xfff00fff; insn |= 0x00002000; /* Rn = r0, Rd = r2 */ if (insn & (1 << 22)) { @@ -1180,6 +1194,9 @@ space_cccc_000x(kprobe_opcode_t insn, struct arch_specific_insn *asi) return INSN_GOOD; } + /* LDRH/STRH/LDRSB/LDRSH */ + if (is_r15(insn, 12)) + return INSN_REJECTED; /* Rd is PC */ return prep_emulate_ldr_str(insn, asi); } -- cgit v1.1 From 5c6b76fc7d8220e8f00e7a49fb56ca852d7fb661 Mon Sep 17 00:00:00 2001 From: Jon Medhurst Date: Fri, 8 Apr 2011 15:32:56 +0100 Subject: ARM: kprobes: Fix emulation of LDRD and STRD instructions The decoding of these instructions got the register indexed and immediate indexed forms the wrong way around, causing incorrect emulation. Instructions like "LDRD Rx, [Rx]" were corrupting Rx because the base register writeback was being performed unconditionally, overwriting the value just loaded from memory. The fix is to only writeback the base register when that form of the instruction is used. Note, now that we reject probing writeback with PC the emulation code doesn't need the check rn!=15. Signed-off-by: Jon Medhurst Signed-off-by: Nicolas Pitre --- arch/arm/kernel/kprobes-decode.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/arm/kernel/kprobes-decode.c b/arch/arm/kernel/kprobes-decode.c index a4dba1f..826abc1 100644 --- a/arch/arm/kernel/kprobes-decode.c +++ b/arch/arm/kernel/kprobes-decode.c @@ -560,8 +560,8 @@ static void __kprobes emulate_ldrd(struct kprobe *p, struct pt_regs *regs) [i_fn] "r" (i_fn) : "r0", "r1", "r2", "r3", "lr", "cc" ); - if (rn != 15) - regs->uregs[rn] = rnv; /* Save Rn in case of writeback. */ + if (is_writeback(insn)) + regs->uregs[rn] = rnv; } static void __kprobes emulate_strd(struct kprobe *p, struct pt_regs *regs) @@ -580,8 +580,8 @@ static void __kprobes emulate_strd(struct kprobe *p, struct pt_regs *regs) rnv_wb = insnslot_4arg_rflags(rnv, rmv, regs->uregs[rd], regs->uregs[rd+1], regs->ARM_cpsr, i_fn); - if (rn != 15) - regs->uregs[rn] = rnv_wb; /* Save Rn in case of writeback. */ + if (is_writeback(insn)) + regs->uregs[rn] = rnv_wb; } static void __kprobes emulate_ldr(struct kprobe *p, struct pt_regs *regs) @@ -1183,8 +1183,8 @@ space_cccc_000x(kprobe_opcode_t insn, struct arch_specific_insn *asi) insn &= 0xfff00fff; insn |= 0x00002000; /* Rn = r0, Rd = r2 */ - if (insn & (1 << 22)) { - /* I bit */ + if (!(insn & (1 << 22))) { + /* Register index */ insn &= ~0xf; insn |= 1; /* Rm = r1 */ } -- cgit v1.1 From 81ff5720b9561b48e3dc640ca15799ba3919f5a6 Mon Sep 17 00:00:00 2001 From: Jon Medhurst Date: Tue, 12 Apr 2011 07:45:21 +0100 Subject: ARM: kprobes: Reject probing of LDRB instructions which load PC These instructions are specified as UNPREDICTABLE. Signed-off-by: Jon Medhurst Signed-off-by: Nicolas Pitre --- arch/arm/kernel/kprobes-decode.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/arm/kernel/kprobes-decode.c b/arch/arm/kernel/kprobes-decode.c index 826abc1..a37745f 100644 --- a/arch/arm/kernel/kprobes-decode.c +++ b/arch/arm/kernel/kprobes-decode.c @@ -1416,6 +1416,10 @@ space_cccc_01xx(kprobe_opcode_t insn, struct arch_specific_insn *asi) /* STRB : cccc 01xx x1x0 xxxx xxxx xxxx xxxx xxxx */ /* STRBT : cccc 01x0 x110 xxxx xxxx xxxx xxxx xxxx */ /* STRT : cccc 01x0 x010 xxxx xxxx xxxx xxxx xxxx */ + + if ((insn & 0x00500000) == 0x00500000 && is_r15(insn, 12)) + return INSN_REJECTED; /* LDRB into PC */ + return prep_emulate_ldr_str(insn, asi); } -- cgit v1.1 From 0e384ed164bdc2ad832270e81dbd14a17c143e78 Mon Sep 17 00:00:00 2001 From: Jon Medhurst Date: Tue, 12 Apr 2011 07:45:22 +0100 Subject: ARM: kprobes: Add emulation of RBIT instruction The v6T2 RBIT instruction was accidentally being emulated correctly, this patch adds correct decoding for the instruction. Signed-off-by: Jon Medhurst Signed-off-by: Nicolas Pitre --- arch/arm/kernel/kprobes-decode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/arm/kernel/kprobes-decode.c b/arch/arm/kernel/kprobes-decode.c index a37745f..b6bbc0b 100644 --- a/arch/arm/kernel/kprobes-decode.c +++ b/arch/arm/kernel/kprobes-decode.c @@ -1320,9 +1320,10 @@ space_cccc_0110__1(kprobe_opcode_t insn, struct arch_specific_insn *asi) /* REV : cccc 0110 1011 xxxx xxxx xxxx 0011 xxxx */ /* REV16 : cccc 0110 1011 xxxx xxxx xxxx 1011 xxxx */ + /* RBIT : cccc 0110 1111 xxxx xxxx xxxx 0011 xxxx */ /* REVSH : cccc 0110 1111 xxxx xxxx xxxx 1011 xxxx */ if ((insn & 0x0ff00070) == 0x06b00030 || - (insn & 0x0ff000f0) == 0x06f000b0) + (insn & 0x0ff00070) == 0x06f00030) return prep_emulate_rd12rm0(insn, asi); /* SADD16 : cccc 0110 0001 xxxx xxxx xxxx 0001 xxxx :GE */ -- cgit v1.1 From 780b5c1162286cc75ef2dcc0f14215d9f9f06230 Mon Sep 17 00:00:00 2001 From: Jon Medhurst Date: Tue, 12 Apr 2011 07:45:23 +0100 Subject: ARM: kprobes: Reject probing of undefined media instructions The instructions space for media instructions contains some undefined patterns. We need to reject probing of these because they may in future become defined and the kprobes code may then emulate them faultily. Signed-off-by: Jon Medhurst Signed-off-by: Nicolas Pitre --- arch/arm/kernel/kprobes-decode.c | 36 +++++++++++++++++++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/arch/arm/kernel/kprobes-decode.c b/arch/arm/kernel/kprobes-decode.c index b6bbc0b..f0ca7f4 100644 --- a/arch/arm/kernel/kprobes-decode.c +++ b/arch/arm/kernel/kprobes-decode.c @@ -1326,52 +1326,86 @@ space_cccc_0110__1(kprobe_opcode_t insn, struct arch_specific_insn *asi) (insn & 0x0ff00070) == 0x06f00030) return prep_emulate_rd12rm0(insn, asi); + /* ??? : cccc 0110 0000 xxxx xxxx xxxx xxx1 xxxx : */ /* SADD16 : cccc 0110 0001 xxxx xxxx xxxx 0001 xxxx :GE */ /* SADDSUBX : cccc 0110 0001 xxxx xxxx xxxx 0011 xxxx :GE */ /* SSUBADDX : cccc 0110 0001 xxxx xxxx xxxx 0101 xxxx :GE */ /* SSUB16 : cccc 0110 0001 xxxx xxxx xxxx 0111 xxxx :GE */ /* SADD8 : cccc 0110 0001 xxxx xxxx xxxx 1001 xxxx :GE */ + /* ??? : cccc 0110 0001 xxxx xxxx xxxx 1011 xxxx : */ + /* ??? : cccc 0110 0001 xxxx xxxx xxxx 1101 xxxx : */ /* SSUB8 : cccc 0110 0001 xxxx xxxx xxxx 1111 xxxx :GE */ /* QADD16 : cccc 0110 0010 xxxx xxxx xxxx 0001 xxxx : */ /* QADDSUBX : cccc 0110 0010 xxxx xxxx xxxx 0011 xxxx : */ /* QSUBADDX : cccc 0110 0010 xxxx xxxx xxxx 0101 xxxx : */ /* QSUB16 : cccc 0110 0010 xxxx xxxx xxxx 0111 xxxx : */ /* QADD8 : cccc 0110 0010 xxxx xxxx xxxx 1001 xxxx : */ + /* ??? : cccc 0110 0010 xxxx xxxx xxxx 1011 xxxx : */ + /* ??? : cccc 0110 0010 xxxx xxxx xxxx 1101 xxxx : */ /* QSUB8 : cccc 0110 0010 xxxx xxxx xxxx 1111 xxxx : */ /* SHADD16 : cccc 0110 0011 xxxx xxxx xxxx 0001 xxxx : */ /* SHADDSUBX : cccc 0110 0011 xxxx xxxx xxxx 0011 xxxx : */ /* SHSUBADDX : cccc 0110 0011 xxxx xxxx xxxx 0101 xxxx : */ /* SHSUB16 : cccc 0110 0011 xxxx xxxx xxxx 0111 xxxx : */ /* SHADD8 : cccc 0110 0011 xxxx xxxx xxxx 1001 xxxx : */ + /* ??? : cccc 0110 0011 xxxx xxxx xxxx 1011 xxxx : */ + /* ??? : cccc 0110 0011 xxxx xxxx xxxx 1101 xxxx : */ /* SHSUB8 : cccc 0110 0011 xxxx xxxx xxxx 1111 xxxx : */ + /* ??? : cccc 0110 0100 xxxx xxxx xxxx xxx1 xxxx : */ /* UADD16 : cccc 0110 0101 xxxx xxxx xxxx 0001 xxxx :GE */ /* UADDSUBX : cccc 0110 0101 xxxx xxxx xxxx 0011 xxxx :GE */ /* USUBADDX : cccc 0110 0101 xxxx xxxx xxxx 0101 xxxx :GE */ /* USUB16 : cccc 0110 0101 xxxx xxxx xxxx 0111 xxxx :GE */ /* UADD8 : cccc 0110 0101 xxxx xxxx xxxx 1001 xxxx :GE */ + /* ??? : cccc 0110 0101 xxxx xxxx xxxx 1011 xxxx : */ + /* ??? : cccc 0110 0101 xxxx xxxx xxxx 1101 xxxx : */ /* USUB8 : cccc 0110 0101 xxxx xxxx xxxx 1111 xxxx :GE */ /* UQADD16 : cccc 0110 0110 xxxx xxxx xxxx 0001 xxxx : */ /* UQADDSUBX : cccc 0110 0110 xxxx xxxx xxxx 0011 xxxx : */ /* UQSUBADDX : cccc 0110 0110 xxxx xxxx xxxx 0101 xxxx : */ /* UQSUB16 : cccc 0110 0110 xxxx xxxx xxxx 0111 xxxx : */ /* UQADD8 : cccc 0110 0110 xxxx xxxx xxxx 1001 xxxx : */ + /* ??? : cccc 0110 0110 xxxx xxxx xxxx 1011 xxxx : */ + /* ??? : cccc 0110 0110 xxxx xxxx xxxx 1101 xxxx : */ /* UQSUB8 : cccc 0110 0110 xxxx xxxx xxxx 1111 xxxx : */ /* UHADD16 : cccc 0110 0111 xxxx xxxx xxxx 0001 xxxx : */ /* UHADDSUBX : cccc 0110 0111 xxxx xxxx xxxx 0011 xxxx : */ /* UHSUBADDX : cccc 0110 0111 xxxx xxxx xxxx 0101 xxxx : */ /* UHSUB16 : cccc 0110 0111 xxxx xxxx xxxx 0111 xxxx : */ /* UHADD8 : cccc 0110 0111 xxxx xxxx xxxx 1001 xxxx : */ + /* ??? : cccc 0110 0111 xxxx xxxx xxxx 1011 xxxx : */ + /* ??? : cccc 0110 0111 xxxx xxxx xxxx 1101 xxxx : */ /* UHSUB8 : cccc 0110 0111 xxxx xxxx xxxx 1111 xxxx : */ + if ((insn & 0x0f800010) == 0x06000010) { + if ((insn & 0x00300000) == 0x00000000 || + (insn & 0x000000e0) == 0x000000a0 || + (insn & 0x000000e0) == 0x000000c0) + return INSN_REJECTED; /* Unallocated space */ + return prep_emulate_rd12rn16rm0_wflags(insn, asi); + } + /* PKHBT : cccc 0110 1000 xxxx xxxx xxxx x001 xxxx : */ /* PKHTB : cccc 0110 1000 xxxx xxxx xxxx x101 xxxx : */ + if ((insn & 0x0ff00030) == 0x06800010) + return prep_emulate_rd12rn16rm0_wflags(insn, asi); + /* SXTAB16 : cccc 0110 1000 xxxx xxxx xxxx 0111 xxxx : */ /* SXTB : cccc 0110 1010 xxxx xxxx xxxx 0111 xxxx : */ + /* ??? : cccc 0110 1001 xxxx xxxx xxxx 0111 xxxx : */ /* SXTAB : cccc 0110 1010 xxxx xxxx xxxx 0111 xxxx : */ /* SXTAH : cccc 0110 1011 xxxx xxxx xxxx 0111 xxxx : */ /* UXTAB16 : cccc 0110 1100 xxxx xxxx xxxx 0111 xxxx : */ + /* ??? : cccc 0110 1101 xxxx xxxx xxxx 0111 xxxx : */ /* UXTAB : cccc 0110 1110 xxxx xxxx xxxx 0111 xxxx : */ /* UXTAH : cccc 0110 1111 xxxx xxxx xxxx 0111 xxxx : */ - return prep_emulate_rd12rn16rm0_wflags(insn, asi); + if ((insn & 0x0f8000f0) == 0x06800070) { + if ((insn & 0x00300000) == 0x00100000) + return INSN_REJECTED; /* Unallocated space */ + return prep_emulate_rd12rn16rm0_wflags(insn, asi); + } + + /* Other instruction encodings aren't yet defined */ + return INSN_REJECTED; } static enum kprobe_insn __kprobes -- cgit v1.1 From 8dd7cfbed83c74b1fb991fae264944e041e22e62 Mon Sep 17 00:00:00 2001 From: Jon Medhurst Date: Tue, 12 Apr 2011 07:45:24 +0100 Subject: ARM: kprobes: Fix emulation of SXTB16, SXTB, SXTH, UXTB16, UXTB and UXTH instructions These sign extension instructions are encoded as extend-and-add instructions where the register to add is specified as r15. The decoding routines weren't checking for this and were using the incorrect emulation code, giving incorrect results. Signed-off-by: Jon Medhurst Signed-off-by: Nicolas Pitre --- arch/arm/kernel/kprobes-decode.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/arch/arm/kernel/kprobes-decode.c b/arch/arm/kernel/kprobes-decode.c index f0ca7f4..1e413a9c 100644 --- a/arch/arm/kernel/kprobes-decode.c +++ b/arch/arm/kernel/kprobes-decode.c @@ -1390,18 +1390,28 @@ space_cccc_0110__1(kprobe_opcode_t insn, struct arch_specific_insn *asi) return prep_emulate_rd12rn16rm0_wflags(insn, asi); /* SXTAB16 : cccc 0110 1000 xxxx xxxx xxxx 0111 xxxx : */ - /* SXTB : cccc 0110 1010 xxxx xxxx xxxx 0111 xxxx : */ + /* SXTB16 : cccc 0110 1000 1111 xxxx xxxx 0111 xxxx : */ /* ??? : cccc 0110 1001 xxxx xxxx xxxx 0111 xxxx : */ /* SXTAB : cccc 0110 1010 xxxx xxxx xxxx 0111 xxxx : */ + /* SXTB : cccc 0110 1010 1111 xxxx xxxx 0111 xxxx : */ /* SXTAH : cccc 0110 1011 xxxx xxxx xxxx 0111 xxxx : */ + /* SXTH : cccc 0110 1011 1111 xxxx xxxx 0111 xxxx : */ /* UXTAB16 : cccc 0110 1100 xxxx xxxx xxxx 0111 xxxx : */ + /* UXTB16 : cccc 0110 1100 1111 xxxx xxxx 0111 xxxx : */ /* ??? : cccc 0110 1101 xxxx xxxx xxxx 0111 xxxx : */ /* UXTAB : cccc 0110 1110 xxxx xxxx xxxx 0111 xxxx : */ + /* UXTB : cccc 0110 1110 1111 xxxx xxxx 0111 xxxx : */ /* UXTAH : cccc 0110 1111 xxxx xxxx xxxx 0111 xxxx : */ + /* UXTH : cccc 0110 1111 1111 xxxx xxxx 0111 xxxx : */ if ((insn & 0x0f8000f0) == 0x06800070) { if ((insn & 0x00300000) == 0x00100000) return INSN_REJECTED; /* Unallocated space */ - return prep_emulate_rd12rn16rm0_wflags(insn, asi); + + if ((insn & 0x000f0000) == 0x000f0000) { + return prep_emulate_rd12rm0(insn, asi); + } else { + return prep_emulate_rd12rn16rm0_wflags(insn, asi); + } } /* Other instruction encodings aren't yet defined */ -- cgit v1.1 From 038c3839c917e3eea1150a1dc55607b9bde2d5ac Mon Sep 17 00:00:00 2001 From: Jon Medhurst Date: Tue, 12 Apr 2011 07:45:25 +0100 Subject: ARM: kprobes: Fix emulation of SMUAD, SMUSD and SMMUL instructions The signed multiply instructions were being decoded incorrectly. Signed-off-by: Jon Medhurst Signed-off-by: Nicolas Pitre --- arch/arm/kernel/kprobes-decode.c | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/arch/arm/kernel/kprobes-decode.c b/arch/arm/kernel/kprobes-decode.c index 1e413a9c..068e5c8 100644 --- a/arch/arm/kernel/kprobes-decode.c +++ b/arch/arm/kernel/kprobes-decode.c @@ -1436,18 +1436,26 @@ space_cccc_0111__1(kprobe_opcode_t insn, struct arch_specific_insn *asi) return prep_emulate_rdhi16rdlo12rs8rm0_wflags(insn, asi); /* SMLAD : cccc 0111 0000 xxxx xxxx xxxx 00x1 xxxx :Q */ + /* SMUAD : cccc 0111 0000 xxxx 1111 xxxx 00x1 xxxx :Q */ /* SMLSD : cccc 0111 0000 xxxx xxxx xxxx 01x1 xxxx :Q */ + /* SMUSD : cccc 0111 0000 xxxx 1111 xxxx 01x1 xxxx : */ /* SMMLA : cccc 0111 0101 xxxx xxxx xxxx 00x1 xxxx : */ - /* SMMLS : cccc 0111 0101 xxxx xxxx xxxx 11x1 xxxx : */ + /* SMMUL : cccc 0111 0101 xxxx 1111 xxxx 00x1 xxxx : */ if ((insn & 0x0ff00090) == 0x07000010 || - (insn & 0x0ff000d0) == 0x07500010 || - (insn & 0x0ff000d0) == 0x075000d0) + (insn & 0x0ff000d0) == 0x07500010) { + + if ((insn & 0x0000f000) == 0x0000f000) { + return prep_emulate_rd16rs8rm0_wflags(insn, asi); + } else { + return prep_emulate_rd16rn12rs8rm0_wflags(insn, asi); + } + } + + /* SMMLS : cccc 0111 0101 xxxx xxxx xxxx 11x1 xxxx : */ + if ((insn & 0x0ff000d0) == 0x075000d0) return prep_emulate_rd16rn12rs8rm0_wflags(insn, asi); - /* SMUSD : cccc 0111 0000 xxxx xxxx xxxx 01x1 xxxx : */ - /* SMUAD : cccc 0111 0000 xxxx 1111 xxxx 00x1 xxxx :Q */ - /* SMMUL : cccc 0111 0101 xxxx 1111 xxxx 00x1 xxxx : */ - return prep_emulate_rd16rs8rm0_wflags(insn, asi); + return INSN_REJECTED; } static enum kprobe_insn __kprobes -- cgit v1.1 From c6e4ae32911c2a0ad02d47ee59ccba352a74c38e Mon Sep 17 00:00:00 2001 From: Jon Medhurst Date: Tue, 12 Apr 2011 07:45:26 +0100 Subject: ARM: kprobes: Fix emulation of USAD8 instructions The USAD8 instruction wasn't being explicitly decoded leading to the incorrect emulation routine being called. It can be correctly decoded in the same way as the signed multiply instructions so we move the decoding there. Signed-off-by: Jon Medhurst Signed-off-by: Nicolas Pitre --- arch/arm/kernel/kprobes-decode.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/arch/arm/kernel/kprobes-decode.c b/arch/arm/kernel/kprobes-decode.c index 068e5c8..8391dac 100644 --- a/arch/arm/kernel/kprobes-decode.c +++ b/arch/arm/kernel/kprobes-decode.c @@ -1425,11 +1425,6 @@ space_cccc_0111__1(kprobe_opcode_t insn, struct arch_specific_insn *asi) if ((insn & 0x0ff000f0) == 0x03f000f0) return INSN_REJECTED; - /* USADA8 : cccc 0111 1000 xxxx xxxx xxxx 0001 xxxx */ - /* USAD8 : cccc 0111 1000 xxxx 1111 xxxx 0001 xxxx */ - if ((insn & 0x0ff000f0) == 0x07800010) - return prep_emulate_rd16rn12rs8rm0_wflags(insn, asi); - /* SMLALD : cccc 0111 0100 xxxx xxxx xxxx 00x1 xxxx */ /* SMLSLD : cccc 0111 0100 xxxx xxxx xxxx 01x1 xxxx */ if ((insn & 0x0ff00090) == 0x07400010) @@ -1441,8 +1436,11 @@ space_cccc_0111__1(kprobe_opcode_t insn, struct arch_specific_insn *asi) /* SMUSD : cccc 0111 0000 xxxx 1111 xxxx 01x1 xxxx : */ /* SMMLA : cccc 0111 0101 xxxx xxxx xxxx 00x1 xxxx : */ /* SMMUL : cccc 0111 0101 xxxx 1111 xxxx 00x1 xxxx : */ + /* USADA8 : cccc 0111 1000 xxxx xxxx xxxx 0001 xxxx : */ + /* USAD8 : cccc 0111 1000 xxxx 1111 xxxx 0001 xxxx : */ if ((insn & 0x0ff00090) == 0x07000010 || - (insn & 0x0ff000d0) == 0x07500010) { + (insn & 0x0ff000d0) == 0x07500010 || + (insn & 0x0ff000f0) == 0x07800010) { if ((insn & 0x0000f000) == 0x0000f000) { return prep_emulate_rd16rs8rm0_wflags(insn, asi); -- cgit v1.1 From fa1a03b429b3fd5f28e7fdd20ce99ca572bfd236 Mon Sep 17 00:00:00 2001 From: Jon Medhurst Date: Mon, 18 Apr 2011 08:53:54 +0100 Subject: ARM: kprobes: Reject probing of all coprocessor instructions The kernel doesn't currently support VFP or Neon code, and probing of code with CP15 operations is fraught with bad consequences. Therefore we don't need the ability to probe coprocessor instructions and the code to support this can be removed. The removed code also had at least two bugs: - MRC into R15 should set CPSR not trash PC - LDC and STC which use PC as base register needed the address offset by 8 Signed-off-by: Jon Medhurst Signed-off-by: Nicolas Pitre --- arch/arm/kernel/kprobes-decode.c | 113 ++++----------------------------------- 1 file changed, 11 insertions(+), 102 deletions(-) diff --git a/arch/arm/kernel/kprobes-decode.c b/arch/arm/kernel/kprobes-decode.c index 8391dac..ff39707 100644 --- a/arch/arm/kernel/kprobes-decode.c +++ b/arch/arm/kernel/kprobes-decode.c @@ -520,17 +520,6 @@ static void __kprobes simulate_mov_ipsp(struct kprobe *p, struct pt_regs *regs) regs->uregs[12] = regs->uregs[13]; } -static void __kprobes emulate_ldcstc(struct kprobe *p, struct pt_regs *regs) -{ - insn_1arg_fn_t *i_fn = (insn_1arg_fn_t *)&p->ainsn.insn[0]; - kprobe_opcode_t insn = p->opcode; - int rn = (insn >> 16) & 0xf; - long rnv = regs->uregs[rn]; - - /* Save Rn in case of writeback. */ - regs->uregs[rn] = insnslot_1arg_rflags(rnv, regs->ARM_cpsr, i_fn); -} - static void __kprobes emulate_ldrd(struct kprobe *p, struct pt_regs *regs) { insn_2arg_fn_t *i_fn = (insn_2arg_fn_t *)&p->ainsn.insn[0]; @@ -635,31 +624,6 @@ static void __kprobes emulate_str(struct kprobe *p, struct pt_regs *regs) regs->uregs[rn] = rnv_wb; /* Save Rn in case of writeback. */ } -static void __kprobes emulate_mrrc(struct kprobe *p, struct pt_regs *regs) -{ - insn_llret_0arg_fn_t *i_fn = (insn_llret_0arg_fn_t *)&p->ainsn.insn[0]; - kprobe_opcode_t insn = p->opcode; - union reg_pair fnr; - int rd = (insn >> 12) & 0xf; - int rn = (insn >> 16) & 0xf; - - fnr.dr = insnslot_llret_0arg_rflags(regs->ARM_cpsr, i_fn); - regs->uregs[rn] = fnr.r0; - regs->uregs[rd] = fnr.r1; -} - -static void __kprobes emulate_mcrr(struct kprobe *p, struct pt_regs *regs) -{ - insn_2arg_fn_t *i_fn = (insn_2arg_fn_t *)&p->ainsn.insn[0]; - kprobe_opcode_t insn = p->opcode; - int rd = (insn >> 12) & 0xf; - int rn = (insn >> 16) & 0xf; - long rnv = regs->uregs[rn]; - long rdv = regs->uregs[rd]; - - insnslot_2arg_rflags(rnv, rdv, regs->ARM_cpsr, i_fn); -} - static void __kprobes emulate_sat(struct kprobe *p, struct pt_regs *regs) { insn_1arg_fn_t *i_fn = (insn_1arg_fn_t *)&p->ainsn.insn[0]; @@ -693,24 +657,6 @@ static void __kprobes emulate_none(struct kprobe *p, struct pt_regs *regs) insnslot_0arg_rflags(regs->ARM_cpsr, i_fn); } -static void __kprobes emulate_rd12(struct kprobe *p, struct pt_regs *regs) -{ - insn_0arg_fn_t *i_fn = (insn_0arg_fn_t *)&p->ainsn.insn[0]; - kprobe_opcode_t insn = p->opcode; - int rd = (insn >> 12) & 0xf; - - regs->uregs[rd] = insnslot_0arg_rflags(regs->ARM_cpsr, i_fn); -} - -static void __kprobes emulate_ird12(struct kprobe *p, struct pt_regs *regs) -{ - insn_1arg_fn_t *i_fn = (insn_1arg_fn_t *)&p->ainsn.insn[0]; - kprobe_opcode_t insn = p->opcode; - int ird = (insn >> 12) & 0xf; - - insnslot_1arg_rflags(regs->uregs[ird], regs->ARM_cpsr, i_fn); -} - static void __kprobes emulate_rn16(struct kprobe *p, struct pt_regs *regs) { insn_1arg_fn_t *i_fn = (insn_1arg_fn_t *)&p->ainsn.insn[0]; @@ -1010,40 +956,22 @@ space_1111(kprobe_opcode_t insn, struct arch_specific_insn *asi) } /* SETEND : 1111 0001 0000 0001 xxxx xxxx 0000 xxxx */ - /* CDP2 : 1111 1110 xxxx xxxx xxxx xxxx xxx0 xxxx */ - if ((insn & 0xffff00f0) == 0xf1010000 || - (insn & 0xff000010) == 0xfe000000) { + if ((insn & 0xffff00f0) == 0xf1010000) { asi->insn[0] = insn; asi->insn_handler = emulate_none; return INSN_GOOD; } + /* Coprocessor instructions... */ /* MCRR2 : 1111 1100 0100 xxxx xxxx xxxx xxxx xxxx : (Rd != Rn) */ /* MRRC2 : 1111 1100 0101 xxxx xxxx xxxx xxxx xxxx : (Rd != Rn) */ - if ((insn & 0xffe00000) == 0xfc400000) { - insn &= 0xfff00fff; /* Rn = r0 */ - insn |= 0x00001000; /* Rd = r1 */ - asi->insn[0] = insn; - asi->insn_handler = - (insn & (1 << 20)) ? emulate_mrrc : emulate_mcrr; - return INSN_GOOD; - } + /* LDC2 : 1111 110x xxx1 xxxx xxxx xxxx xxxx xxxx */ + /* STC2 : 1111 110x xxx0 xxxx xxxx xxxx xxxx xxxx */ + /* CDP2 : 1111 1110 xxxx xxxx xxxx xxxx xxx0 xxxx */ + /* MCR2 : 1111 1110 xxx0 xxxx xxxx xxxx xxx1 xxxx */ + /* MRC2 : 1111 1110 xxx1 xxxx xxxx xxxx xxx1 xxxx */ - /* LDC2 : 1111 110x xxx1 xxxx xxxx xxxx xxxx xxxx */ - /* STC2 : 1111 110x xxx0 xxxx xxxx xxxx xxxx xxxx */ - if ((insn & 0xfe000000) == 0xfc000000) { - insn &= 0xfff0ffff; /* Rn = r0 */ - asi->insn[0] = insn; - asi->insn_handler = emulate_ldcstc; - return INSN_GOOD; - } - - /* MCR2 : 1111 1110 xxx0 xxxx xxxx xxxx xxx1 xxxx */ - /* MRC2 : 1111 1110 xxx1 xxxx xxxx xxxx xxx1 xxxx */ - insn &= 0xffff0fff; /* Rd = r0 */ - asi->insn[0] = insn; - asi->insn_handler = (insn & (1 << 20)) ? emulate_rd12 : emulate_ird12; - return INSN_GOOD; + return INSN_REJECTED; } static enum kprobe_insn __kprobes @@ -1504,14 +1432,7 @@ space_cccc_1100_010x(kprobe_opcode_t insn, struct arch_specific_insn *asi) { /* MCRR : cccc 1100 0100 xxxx xxxx xxxx xxxx xxxx : (Rd!=Rn) */ /* MRRC : cccc 1100 0101 xxxx xxxx xxxx xxxx xxxx : (Rd!=Rn) */ - if (is_r15(insn, 16) || is_r15(insn, 12)) - return INSN_REJECTED; /* Rn or Rd is PC */ - - insn &= 0xfff00fff; - insn |= 0x00001000; /* Rn = r0, Rd = r1 */ - asi->insn[0] = insn; - asi->insn_handler = (insn & (1 << 20)) ? emulate_mrrc : emulate_mcrr; - return INSN_GOOD; + return INSN_REJECTED; } static enum kprobe_insn __kprobes @@ -1519,10 +1440,7 @@ space_cccc_110x(kprobe_opcode_t insn, struct arch_specific_insn *asi) { /* LDC : cccc 110x xxx1 xxxx xxxx xxxx xxxx xxxx */ /* STC : cccc 110x xxx0 xxxx xxxx xxxx xxxx xxxx */ - insn &= 0xfff0ffff; /* Rn = r0 */ - asi->insn[0] = insn; - asi->insn_handler = emulate_ldcstc; - return INSN_GOOD; + return INSN_REJECTED; } static enum kprobe_insn __kprobes @@ -1535,18 +1453,9 @@ space_cccc_111x(kprobe_opcode_t insn, struct arch_specific_insn *asi) return INSN_REJECTED; /* CDP : cccc 1110 xxxx xxxx xxxx xxxx xxx0 xxxx */ - if ((insn & 0x0f000010) == 0x0e000000) { - asi->insn[0] = insn; - asi->insn_handler = emulate_none; - return INSN_GOOD; - } - /* MCR : cccc 1110 xxx0 xxxx xxxx xxxx xxx1 xxxx */ /* MRC : cccc 1110 xxx1 xxxx xxxx xxxx xxx1 xxxx */ - insn &= 0xffff0fff; /* Rd = r0 */ - asi->insn[0] = insn; - asi->insn_handler = (insn & (1 << 20)) ? emulate_rd12 : emulate_ird12; - return INSN_GOOD; + return INSN_REJECTED; } static unsigned long __kprobes __check_eq(unsigned long cpsr) -- cgit v1.1 From ac211c6994fb5f1f282745054c00d29e53639cb1 Mon Sep 17 00:00:00 2001 From: Jon Medhurst Date: Mon, 18 Apr 2011 08:53:55 +0100 Subject: ARM: kprobes: Consolidate stub decoding functions Following the change to remove support for coprocessor instructions we are left with three stub functions which can be consolidated. Signed-off-by: Jon Medhurst Signed-off-by: Nicolas Pitre --- arch/arm/kernel/kprobes-decode.c | 40 +++++++++------------------------------- 1 file changed, 9 insertions(+), 31 deletions(-) diff --git a/arch/arm/kernel/kprobes-decode.c b/arch/arm/kernel/kprobes-decode.c index ff39707..fb81897 100644 --- a/arch/arm/kernel/kprobes-decode.c +++ b/arch/arm/kernel/kprobes-decode.c @@ -1428,33 +1428,19 @@ space_cccc_101x(kprobe_opcode_t insn, struct arch_specific_insn *asi) } static enum kprobe_insn __kprobes -space_cccc_1100_010x(kprobe_opcode_t insn, struct arch_specific_insn *asi) +space_cccc_11xx(kprobe_opcode_t insn, struct arch_specific_insn *asi) { + /* Coprocessor instructions... */ /* MCRR : cccc 1100 0100 xxxx xxxx xxxx xxxx xxxx : (Rd!=Rn) */ /* MRRC : cccc 1100 0101 xxxx xxxx xxxx xxxx xxxx : (Rd!=Rn) */ - return INSN_REJECTED; -} + /* LDC : cccc 110x xxx1 xxxx xxxx xxxx xxxx xxxx */ + /* STC : cccc 110x xxx0 xxxx xxxx xxxx xxxx xxxx */ + /* CDP : cccc 1110 xxxx xxxx xxxx xxxx xxx0 xxxx */ + /* MCR : cccc 1110 xxx0 xxxx xxxx xxxx xxx1 xxxx */ + /* MRC : cccc 1110 xxx1 xxxx xxxx xxxx xxx1 xxxx */ -static enum kprobe_insn __kprobes -space_cccc_110x(kprobe_opcode_t insn, struct arch_specific_insn *asi) -{ - /* LDC : cccc 110x xxx1 xxxx xxxx xxxx xxxx xxxx */ - /* STC : cccc 110x xxx0 xxxx xxxx xxxx xxxx xxxx */ - return INSN_REJECTED; -} + /* SVC : cccc 1111 xxxx xxxx xxxx xxxx xxxx xxxx */ -static enum kprobe_insn __kprobes -space_cccc_111x(kprobe_opcode_t insn, struct arch_specific_insn *asi) -{ - /* BKPT : 1110 0001 0010 xxxx xxxx xxxx 0111 xxxx */ - /* SWI : cccc 1111 xxxx xxxx xxxx xxxx xxxx xxxx */ - if ((insn & 0xfff000f0) == 0xe1200070 || - (insn & 0x0f000000) == 0x0f000000) - return INSN_REJECTED; - - /* CDP : cccc 1110 xxxx xxxx xxxx xxxx xxx0 xxxx */ - /* MCR : cccc 1110 xxx0 xxxx xxxx xxxx xxx1 xxxx */ - /* MRC : cccc 1110 xxx1 xxxx xxxx xxxx xxx1 xxxx */ return INSN_REJECTED; } @@ -1598,17 +1584,9 @@ arm_kprobe_decode_insn(kprobe_opcode_t insn, struct arch_specific_insn *asi) return space_cccc_101x(insn, asi); - } else if ((insn & 0x0fe00000) == 0x0c400000) { - - return space_cccc_1100_010x(insn, asi); - - } else if ((insn & 0x0e000000) == 0x0c000000) { - - return space_cccc_110x(insn, asi); - } - return space_cccc_111x(insn, asi); + return space_cccc_11xx(insn, asi); } void __init arm_kprobe_decode_init(void) -- cgit v1.1 From f0aeb8bff0fe9de50e1e4093ef86ff8f17a9b1b0 Mon Sep 17 00:00:00 2001 From: Jon Medhurst Date: Mon, 18 Apr 2011 08:53:56 +0100 Subject: ARM: kprobes: Reject probing of SETEND instructions The emulation of SETEND was broken as it changed the endianess for the running kprobes handling code. Rather than adding a new simulation routine to fix this we'll just reject probing of SETEND as these should be very rare in the kernel. Note, the function emulate_none is now unused but it is left in the source code as future patches will use it. Signed-off-by: Jon Medhurst Signed-off-by: Nicolas Pitre --- arch/arm/kernel/kprobes-decode.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/arch/arm/kernel/kprobes-decode.c b/arch/arm/kernel/kprobes-decode.c index fb81897..b81fbfb 100644 --- a/arch/arm/kernel/kprobes-decode.c +++ b/arch/arm/kernel/kprobes-decode.c @@ -956,11 +956,6 @@ space_1111(kprobe_opcode_t insn, struct arch_specific_insn *asi) } /* SETEND : 1111 0001 0000 0001 xxxx xxxx 0000 xxxx */ - if ((insn & 0xffff00f0) == 0xf1010000) { - asi->insn[0] = insn; - asi->insn_handler = emulate_none; - return INSN_GOOD; - } /* Coprocessor instructions... */ /* MCRR2 : 1111 1100 0100 xxxx xxxx xxxx xxxx xxxx : (Rd != Rn) */ -- cgit v1.1 From 41713d1396312a027ec02abc4767041627c178ef Mon Sep 17 00:00:00 2001 From: Jon Medhurst Date: Mon, 18 Apr 2011 08:53:57 +0100 Subject: ARM: kprobes: Fix emulation of PLD instructions The PLD instructions wasn't being decoded correctly and the emulation code wasn't adjusting PC correctly. As the PLD instruction is only a performance hint we emulate it as a simple nop, and we can broaden the instruction decoding to take into account newer PLI and PLDW instructions. Signed-off-by: Jon Medhurst Signed-off-by: Nicolas Pitre --- arch/arm/kernel/kprobes-decode.c | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/arch/arm/kernel/kprobes-decode.c b/arch/arm/kernel/kprobes-decode.c index b81fbfb..bddf8d0 100644 --- a/arch/arm/kernel/kprobes-decode.c +++ b/arch/arm/kernel/kprobes-decode.c @@ -657,14 +657,8 @@ static void __kprobes emulate_none(struct kprobe *p, struct pt_regs *regs) insnslot_0arg_rflags(regs->ARM_cpsr, i_fn); } -static void __kprobes emulate_rn16(struct kprobe *p, struct pt_regs *regs) +static void __kprobes emulate_nop(struct kprobe *p, struct pt_regs *regs) { - insn_1arg_fn_t *i_fn = (insn_1arg_fn_t *)&p->ainsn.insn[0]; - kprobe_opcode_t insn = p->opcode; - int rn = (insn >> 16) & 0xf; - long rnv = regs->uregs[rn]; - - insnslot_1arg_rflags(rnv, regs->ARM_cpsr, i_fn); } static void __kprobes emulate_rd12rm0(struct kprobe *p, struct pt_regs *regs) @@ -941,12 +935,13 @@ space_1111(kprobe_opcode_t insn, struct arch_specific_insn *asi) (insn & 0xfe5f0f00) == 0xf84d0500) return INSN_REJECTED; - /* PLD : 1111 01x1 x101 xxxx xxxx xxxx xxxx xxxx : */ - if ((insn & 0xfd700000) == 0xf4500000) { - insn &= 0xfff0ffff; /* Rn = r0 */ - asi->insn[0] = insn; - asi->insn_handler = emulate_rn16; - return INSN_GOOD; + /* memory hint : 1111 0100 x001 xxxx xxxx xxxx xxxx xxxx : */ + /* PLDI : 1111 0100 x101 xxxx xxxx xxxx xxxx xxxx : */ + /* PLDW : 1111 0101 x001 xxxx xxxx xxxx xxxx xxxx : */ + /* PLD : 1111 0101 x101 xxxx xxxx xxxx xxxx xxxx : */ + if ((insn & 0xfe300000) == 0xf4100000) { + asi->insn_handler = emulate_nop; + return INSN_GOOD_NO_SLOT; } /* BLX(1) : 1111 101x xxxx xxxx xxxx xxxx xxxx xxxx : */ -- cgit v1.1 From 72c2bab2be734d63dee4342e67b1754c54fded70 Mon Sep 17 00:00:00 2001 From: Jon Medhurst Date: Mon, 18 Apr 2011 08:53:58 +0100 Subject: ARM: kprobes: Remove redundant code in space_1111 The tests to explicitly reject probing CPS, RFE and SRS instructions are redundant as the default case is now to reject undecoded patterns. Signed-off-by: Jon Medhurst Signed-off-by: Nicolas Pitre --- arch/arm/kernel/kprobes-decode.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/arch/arm/kernel/kprobes-decode.c b/arch/arm/kernel/kprobes-decode.c index bddf8d0..dc315c1 100644 --- a/arch/arm/kernel/kprobes-decode.c +++ b/arch/arm/kernel/kprobes-decode.c @@ -927,14 +927,6 @@ prep_emulate_rdhi16rdlo12rs8rm0_wflags(kprobe_opcode_t insn, static enum kprobe_insn __kprobes space_1111(kprobe_opcode_t insn, struct arch_specific_insn *asi) { - /* CPS mmod == 1 : 1111 0001 0000 xx10 xxxx xxxx xx0x xxxx */ - /* RFE : 1111 100x x0x1 xxxx xxxx 1010 xxxx xxxx */ - /* SRS : 1111 100x x1x0 1101 xxxx 0101 xxxx xxxx */ - if ((insn & 0xfff30020) == 0xf1020000 || - (insn & 0xfe500f00) == 0xf8100a00 || - (insn & 0xfe5f0f00) == 0xf84d0500) - return INSN_REJECTED; - /* memory hint : 1111 0100 x001 xxxx xxxx xxxx xxxx xxxx : */ /* PLDI : 1111 0100 x101 xxxx xxxx xxxx xxxx xxxx : */ /* PLDW : 1111 0101 x001 xxxx xxxx xxxx xxxx xxxx : */ @@ -950,7 +942,11 @@ space_1111(kprobe_opcode_t insn, struct arch_specific_insn *asi) return INSN_GOOD_NO_SLOT; } - /* SETEND : 1111 0001 0000 0001 xxxx xxxx 0000 xxxx */ + /* CPS : 1111 0001 0000 xxx0 xxxx xxxx xx0x xxxx */ + /* SETEND: 1111 0001 0000 0001 xxxx xxxx 0000 xxxx */ + + /* SRS : 1111 100x x1x0 xxxx xxxx xxxx xxxx xxxx */ + /* RFE : 1111 100x x0x1 xxxx xxxx xxxx xxxx xxxx */ /* Coprocessor instructions... */ /* MCRR2 : 1111 1100 0100 xxxx xxxx xxxx xxxx xxxx : (Rd != Rn) */ -- cgit v1.1 From f704a6e25bd07e1381af81f19fb1d123975807b1 Mon Sep 17 00:00:00 2001 From: Jon Medhurst Date: Tue, 19 Apr 2011 10:52:16 +0100 Subject: ARM: kprobes: Reject probing of undefined data processing instructions The instruction decoding in space_cccc_000x needs to reject probing of instructions with undefined patterns as they may in future become defined and then emulated faultily - as has already happened with the SMC instruction. This fix is achieved by testing for the instruction patterns we want to probe and making the the default fall-through paths reject probes. This also allows us to remove some explicit tests for instructions that we wish to reject, as that is now the default action. Signed-off-by: Jon Medhurst Signed-off-by: Nicolas Pitre --- arch/arm/kernel/kprobes-decode.c | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/arch/arm/kernel/kprobes-decode.c b/arch/arm/kernel/kprobes-decode.c index dc315c1..4715537 100644 --- a/arch/arm/kernel/kprobes-decode.c +++ b/arch/arm/kernel/kprobes-decode.c @@ -966,14 +966,6 @@ space_cccc_000x(kprobe_opcode_t insn, struct arch_specific_insn *asi) /* cccc 0001 0xx0 xxxx xxxx xxxx xxxx xxx0 xxxx */ if ((insn & 0x0f900010) == 0x01000000) { - /* BXJ : cccc 0001 0010 xxxx xxxx xxxx 0010 xxxx */ - /* MSR : cccc 0001 0x10 xxxx xxxx xxxx 0000 xxxx */ - /* MRS spsr : cccc 0001 0100 xxxx xxxx xxxx 0000 xxxx */ - if ((insn & 0x0ff000f0) == 0x01200020 || - (insn & 0x0fb000f0) == 0x01200000 || - (insn & 0x0ff000f0) == 0x01400000) - return INSN_REJECTED; - /* MRS cpsr : cccc 0001 0000 xxxx xxxx xxxx 0000 xxxx */ if ((insn & 0x0ff000f0) == 0x01000000) { if (is_r15(insn, 12)) @@ -994,17 +986,21 @@ space_cccc_000x(kprobe_opcode_t insn, struct arch_specific_insn *asi) /* SMLAxy : cccc 0001 0000 xxxx xxxx xxxx 1xx0 xxxx : Q */ /* SMLAWy : cccc 0001 0010 xxxx xxxx xxxx 1x00 xxxx : Q */ - return prep_emulate_rd16rn12rs8rm0_wflags(insn, asi); + if ((insn & 0x0ff00090) == 0x01000080 || + (insn & 0x0ff000b0) == 0x01200080) + return prep_emulate_rd16rn12rs8rm0_wflags(insn, asi); + + /* BXJ : cccc 0001 0010 xxxx xxxx xxxx 0010 xxxx */ + /* MSR : cccc 0001 0x10 xxxx xxxx xxxx 0000 xxxx */ + /* MRS spsr : cccc 0001 0100 xxxx xxxx xxxx 0000 xxxx */ + /* Other instruction encodings aren't yet defined */ + return INSN_REJECTED; } /* cccc 0001 0xx0 xxxx xxxx xxxx xxxx 0xx1 xxxx */ else if ((insn & 0x0f900090) == 0x01000010) { - /* BKPT : 1110 0001 0010 xxxx xxxx xxxx 0111 xxxx */ - if ((insn & 0xfff000f0) == 0xe1200070) - return INSN_REJECTED; - /* BLX(2) : cccc 0001 0010 xxxx xxxx xxxx 0011 xxxx */ /* BX : cccc 0001 0010 xxxx xxxx xxxx 0001 xxxx */ if ((insn & 0x0ff000d0) == 0x01200010) { @@ -1022,7 +1018,14 @@ space_cccc_000x(kprobe_opcode_t insn, struct arch_specific_insn *asi) /* QSUB : cccc 0001 0010 xxxx xxxx xxxx 0101 xxxx :Q */ /* QDADD : cccc 0001 0100 xxxx xxxx xxxx 0101 xxxx :Q */ /* QDSUB : cccc 0001 0110 xxxx xxxx xxxx 0101 xxxx :Q */ - return prep_emulate_rd12rn16rm0_wflags(insn, asi); + if ((insn & 0x0f9000f0) == 0x01000050) + return prep_emulate_rd12rn16rm0_wflags(insn, asi); + + /* BKPT : 1110 0001 0010 xxxx xxxx xxxx 0111 xxxx */ + /* SMC : cccc 0001 0110 xxxx xxxx xxxx 0111 xxxx */ + + /* Other instruction encodings aren't yet defined */ + return INSN_REJECTED; } /* cccc 0000 xxxx xxxx xxxx xxxx xxxx 1001 xxxx */ -- cgit v1.1 From c9836777d5f22e64eefc759d682511560b6d6d78 Mon Sep 17 00:00:00 2001 From: Jon Medhurst Date: Tue, 19 Apr 2011 10:52:17 +0100 Subject: ARM: kprobes: Add emulation of MOVW and MOVT instructions The MOVW and MOVT instructions account for approximately 7% of all instructions in a ARMv7 kernel as GCC uses them instead of a literal pool. Signed-off-by: Jon Medhurst Signed-off-by: Nicolas Pitre --- arch/arm/kernel/kprobes-decode.c | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/arch/arm/kernel/kprobes-decode.c b/arch/arm/kernel/kprobes-decode.c index 4715537..a063f15 100644 --- a/arch/arm/kernel/kprobes-decode.c +++ b/arch/arm/kernel/kprobes-decode.c @@ -661,6 +661,17 @@ static void __kprobes emulate_nop(struct kprobe *p, struct pt_regs *regs) { } +static void __kprobes +emulate_rd12_modify(struct kprobe *p, struct pt_regs *regs) +{ + insn_1arg_fn_t *i_fn = (insn_1arg_fn_t *)&p->ainsn.insn[0]; + kprobe_opcode_t insn = p->opcode; + int rd = (insn >> 12) & 0xf; + long rdv = regs->uregs[rd]; + + regs->uregs[rd] = insnslot_1arg_rflags(rdv, regs->ARM_cpsr, i_fn); +} + static void __kprobes emulate_rd12rm0(struct kprobe *p, struct pt_regs *regs) { insn_1arg_fn_t *i_fn = (insn_1arg_fn_t *)&p->ainsn.insn[0]; @@ -847,6 +858,18 @@ prep_emulate_ldr_str(kprobe_opcode_t insn, struct arch_specific_insn *asi) } static enum kprobe_insn __kprobes +prep_emulate_rd12_modify(kprobe_opcode_t insn, struct arch_specific_insn *asi) +{ + if (is_r15(insn, 12)) + return INSN_REJECTED; /* Rd is PC */ + + insn &= 0xffff0fff; /* Rd = r0 */ + asi->insn[0] = insn; + asi->insn_handler = emulate_rd12_modify; + return INSN_GOOD; +} + +static enum kprobe_insn __kprobes prep_emulate_rd12rm0(kprobe_opcode_t insn, struct arch_specific_insn *asi) { if (is_r15(insn, 12)) @@ -1170,14 +1193,17 @@ space_cccc_000x(kprobe_opcode_t insn, struct arch_specific_insn *asi) static enum kprobe_insn __kprobes space_cccc_001x(kprobe_opcode_t insn, struct arch_specific_insn *asi) { + /* MOVW : cccc 0011 0000 xxxx xxxx xxxx xxxx xxxx */ + /* MOVT : cccc 0011 0100 xxxx xxxx xxxx xxxx xxxx */ + if ((insn & 0x0fb00000) == 0x03000000) + return prep_emulate_rd12_modify(insn, asi); + /* * MSR : cccc 0011 0x10 xxxx xxxx xxxx xxxx xxxx - * Undef : cccc 0011 0100 xxxx xxxx xxxx xxxx xxxx * ALU op with S bit and Rd == 15 : * cccc 001x xxx1 xxxx 1111 xxxx xxxx xxxx */ if ((insn & 0x0fb00000) == 0x03200000 || /* MSR */ - (insn & 0x0ff00000) == 0x03400000 || /* Undef */ (insn & 0x0e10f000) == 0x0210f000) /* ALU s-bit, R15 */ return INSN_REJECTED; -- cgit v1.1 From 20e8155e24ba5c04558780d0a8da13c39f98c002 Mon Sep 17 00:00:00 2001 From: Jon Medhurst Date: Tue, 19 Apr 2011 10:52:18 +0100 Subject: ARM: kprobes: Add emulation of SBFX, UBFX, BFI and BFC instructions These bit field manipulation instructions occur several thousand times in an ARMv7 kernel. Signed-off-by: Jon Medhurst Signed-off-by: Nicolas Pitre --- arch/arm/kernel/kprobes-decode.c | 42 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/arch/arm/kernel/kprobes-decode.c b/arch/arm/kernel/kprobes-decode.c index a063f15..6bed8b0 100644 --- a/arch/arm/kernel/kprobes-decode.c +++ b/arch/arm/kernel/kprobes-decode.c @@ -672,6 +672,19 @@ emulate_rd12_modify(struct kprobe *p, struct pt_regs *regs) regs->uregs[rd] = insnslot_1arg_rflags(rdv, regs->ARM_cpsr, i_fn); } +static void __kprobes +emulate_rd12rn0_modify(struct kprobe *p, struct pt_regs *regs) +{ + insn_2arg_fn_t *i_fn = (insn_2arg_fn_t *)&p->ainsn.insn[0]; + kprobe_opcode_t insn = p->opcode; + int rd = (insn >> 12) & 0xf; + int rn = insn & 0xf; + long rdv = regs->uregs[rd]; + long rnv = regs->uregs[rn]; + + regs->uregs[rd] = insnslot_2arg_rflags(rdv, rnv, regs->ARM_cpsr, i_fn); +} + static void __kprobes emulate_rd12rm0(struct kprobe *p, struct pt_regs *regs) { insn_1arg_fn_t *i_fn = (insn_1arg_fn_t *)&p->ainsn.insn[0]; @@ -870,6 +883,20 @@ prep_emulate_rd12_modify(kprobe_opcode_t insn, struct arch_specific_insn *asi) } static enum kprobe_insn __kprobes +prep_emulate_rd12rn0_modify(kprobe_opcode_t insn, + struct arch_specific_insn *asi) +{ + if (is_r15(insn, 12)) + return INSN_REJECTED; /* Rd is PC */ + + insn &= 0xffff0ff0; /* Rd = r0 */ + insn |= 0x00000001; /* Rn = r1 */ + asi->insn[0] = insn; + asi->insn_handler = emulate_rd12rn0_modify; + return INSN_GOOD; +} + +static enum kprobe_insn __kprobes prep_emulate_rd12rm0(kprobe_opcode_t insn, struct arch_specific_insn *asi) { if (is_r15(insn, 12)) @@ -1396,6 +1423,21 @@ space_cccc_0111__1(kprobe_opcode_t insn, struct arch_specific_insn *asi) if ((insn & 0x0ff000d0) == 0x075000d0) return prep_emulate_rd16rn12rs8rm0_wflags(insn, asi); + /* SBFX : cccc 0111 101x xxxx xxxx xxxx x101 xxxx : */ + /* UBFX : cccc 0111 111x xxxx xxxx xxxx x101 xxxx : */ + if ((insn & 0x0fa00070) == 0x07a00050) + return prep_emulate_rd12rm0(insn, asi); + + /* BFI : cccc 0111 110x xxxx xxxx xxxx x001 xxxx : */ + /* BFC : cccc 0111 110x xxxx xxxx xxxx x001 1111 : */ + if ((insn & 0x0fe00070) == 0x07c00010) { + + if ((insn & 0x0000000f) == 0x0000000f) + return prep_emulate_rd12_modify(insn, asi); + else + return prep_emulate_rd12rn0_modify(insn, asi); + } + return INSN_REJECTED; } -- cgit v1.1 From 94254930786216106100e9a28c9a244df58be61f Mon Sep 17 00:00:00 2001 From: Jon Medhurst Date: Tue, 19 Apr 2011 10:52:19 +0100 Subject: ARM: kprobes: Add emulation of hint instructions like NOP and WFI Being able to probe NOP instructions is useful for hard-coding probeable locations and is used by the kprobes test code. Signed-off-by: Jon Medhurst Signed-off-by: Nicolas Pitre --- arch/arm/kernel/kprobes-decode.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/arch/arm/kernel/kprobes-decode.c b/arch/arm/kernel/kprobes-decode.c index 6bed8b0..03b85ad 100644 --- a/arch/arm/kernel/kprobes-decode.c +++ b/arch/arm/kernel/kprobes-decode.c @@ -1225,6 +1225,30 @@ space_cccc_001x(kprobe_opcode_t insn, struct arch_specific_insn *asi) if ((insn & 0x0fb00000) == 0x03000000) return prep_emulate_rd12_modify(insn, asi); + /* hints : cccc 0011 0010 0000 xxxx xxxx xxxx xxxx */ + if ((insn & 0x0fff0000) == 0x03200000) { + unsigned op2 = insn & 0x000000ff; + if (op2 == 0x01 || op2 == 0x04) { + /* YIELD : cccc 0011 0010 0000 xxxx xxxx 0000 0001 */ + /* SEV : cccc 0011 0010 0000 xxxx xxxx 0000 0100 */ + asi->insn[0] = insn; + asi->insn_handler = emulate_none; + return INSN_GOOD; + } else if (op2 <= 0x03) { + /* NOP : cccc 0011 0010 0000 xxxx xxxx 0000 0000 */ + /* WFE : cccc 0011 0010 0000 xxxx xxxx 0000 0010 */ + /* WFI : cccc 0011 0010 0000 xxxx xxxx 0000 0011 */ + /* + * We make WFE and WFI true NOPs to avoid stalls due + * to missing events whilst processing the probe. + */ + asi->insn_handler = emulate_nop; + return INSN_GOOD_NO_SLOT; + } + /* For DBG and unallocated hints it's safest to reject them */ + return INSN_REJECTED; + } + /* * MSR : cccc 0011 0x10 xxxx xxxx xxxx xxxx xxxx * ALU op with S bit and Rd == 15 : -- cgit v1.1 From cdc253611582f08036ccb6e10efe95b8e760a7e2 Mon Sep 17 00:00:00 2001 From: Jon Medhurst Date: Tue, 19 Apr 2011 10:52:20 +0100 Subject: ARM: kprobes: Tidy-up kprobes-decode.c - Remove coding standard violations reported by checkpatch.pl - Delete comment about handling of conditional branches which is no longer true. - Delete comment at end of file which lists all ARM instructions. This duplicates data available in the ARM ARM and seems like an unnecessary maintenance burden to keep this up to date and accurate. Signed-off-by: Jon Medhurst Signed-off-by: Nicolas Pitre --- arch/arm/kernel/kprobes-decode.c | 131 +++++++-------------------------------- 1 file changed, 23 insertions(+), 108 deletions(-) diff --git a/arch/arm/kernel/kprobes-decode.c b/arch/arm/kernel/kprobes-decode.c index 03b85ad..15eeff6 100644 --- a/arch/arm/kernel/kprobes-decode.c +++ b/arch/arm/kernel/kprobes-decode.c @@ -34,9 +34,6 @@ * * *) If the PC is written to by the instruction, the * instruction must be fully simulated in software. - * If it is a conditional instruction, the handler - * will use insn[0] to copy its condition code to - * set r0 to 1 and insn[1] to "mov pc, lr" to return. * * *) Otherwise, a modified form of the instruction is * directly executed. Its handler calls the @@ -1026,7 +1023,8 @@ space_cccc_000x(kprobe_opcode_t insn, struct arch_specific_insn *asi) /* SMLALxy : cccc 0001 0100 xxxx xxxx xxxx 1xx0 xxxx */ if ((insn & 0x0ff00090) == 0x01400080) - return prep_emulate_rdhi16rdlo12rs8rm0_wflags(insn, asi); + return prep_emulate_rdhi16rdlo12rs8rm0_wflags(insn, + asi); /* SMULWy : cccc 0001 0010 xxxx xxxx xxxx 1x10 xxxx */ /* SMULxy : cccc 0001 0110 xxxx xxxx xxxx 1xx0 xxxx */ @@ -1097,15 +1095,15 @@ space_cccc_000x(kprobe_opcode_t insn, struct arch_specific_insn *asi) /* SMULLS : cccc 0000 1101 xxxx xxxx xxxx 1001 xxxx :cc */ /* SMLAL : cccc 0000 1110 xxxx xxxx xxxx 1001 xxxx : */ /* SMLALS : cccc 0000 1111 xxxx xxxx xxxx 1001 xxxx :cc */ - if ((insn & 0x00d00000) == 0x00500000) { + if ((insn & 0x00d00000) == 0x00500000) return INSN_REJECTED; - } else if ((insn & 0x00e00000) == 0x00000000) { - return prep_emulate_rd16rs8rm0_wflags(insn, asi); - } else if ((insn & 0x00a00000) == 0x00200000) { - return prep_emulate_rd16rn12rs8rm0_wflags(insn, asi); - } else { - return prep_emulate_rdhi16rdlo12rs8rm0_wflags(insn, asi); - } + else if ((insn & 0x00e00000) == 0x00000000) + return prep_emulate_rd16rs8rm0_wflags(insn, asi); + else if ((insn & 0x00a00000) == 0x00200000) + return prep_emulate_rd16rn12rs8rm0_wflags(insn, asi); + else + return prep_emulate_rdhi16rdlo12rs8rm0_wflags(insn, + asi); } /* cccc 000x xxxx xxxx xxxx xxxx xxxx 1xx1 xxxx */ @@ -1171,7 +1169,7 @@ space_cccc_000x(kprobe_opcode_t insn, struct arch_specific_insn *asi) /* * ALU op with S bit and Rd == 15 : - * cccc 000x xxx1 xxxx 1111 xxxx xxxx xxxx + * cccc 000x xxx1 xxxx 1111 xxxx xxxx xxxx */ if ((insn & 0x0e10f000) == 0x0010f000) return INSN_REJECTED; @@ -1401,11 +1399,10 @@ space_cccc_0110__1(kprobe_opcode_t insn, struct arch_specific_insn *asi) if ((insn & 0x00300000) == 0x00100000) return INSN_REJECTED; /* Unallocated space */ - if ((insn & 0x000f0000) == 0x000f0000) { + if ((insn & 0x000f0000) == 0x000f0000) return prep_emulate_rd12rm0(insn, asi); - } else { + else return prep_emulate_rd12rn16rm0_wflags(insn, asi); - } } /* Other instruction encodings aren't yet defined */ @@ -1436,11 +1433,10 @@ space_cccc_0111__1(kprobe_opcode_t insn, struct arch_specific_insn *asi) (insn & 0x0ff000d0) == 0x07500010 || (insn & 0x0ff000f0) == 0x07800010) { - if ((insn & 0x0000f000) == 0x0000f000) { + if ((insn & 0x0000f000) == 0x0000f000) return prep_emulate_rd16rs8rm0_wflags(insn, asi); - } else { + else return prep_emulate_rd16rn12rs8rm0_wflags(insn, asi); - } } /* SMMLS : cccc 0111 0101 xxxx xxxx xxxx 11x1 xxxx : */ @@ -1633,40 +1629,38 @@ arm_kprobe_decode_insn(kprobe_opcode_t insn, struct arch_specific_insn *asi) asi->insn_check_cc = condition_checks[insn>>28]; asi->insn[1] = KPROBE_RETURN_INSTRUCTION; - if ((insn & 0xf0000000) == 0xf0000000) { + if ((insn & 0xf0000000) == 0xf0000000) return space_1111(insn, asi); - } else if ((insn & 0x0e000000) == 0x00000000) { + else if ((insn & 0x0e000000) == 0x00000000) return space_cccc_000x(insn, asi); - } else if ((insn & 0x0e000000) == 0x02000000) { + else if ((insn & 0x0e000000) == 0x02000000) return space_cccc_001x(insn, asi); - } else if ((insn & 0x0f000010) == 0x06000010) { + else if ((insn & 0x0f000010) == 0x06000010) return space_cccc_0110__1(insn, asi); - } else if ((insn & 0x0f000010) == 0x07000010) { + else if ((insn & 0x0f000010) == 0x07000010) return space_cccc_0111__1(insn, asi); - } else if ((insn & 0x0c000000) == 0x04000000) { + else if ((insn & 0x0c000000) == 0x04000000) return space_cccc_01xx(insn, asi); - } else if ((insn & 0x0e000000) == 0x08000000) { + else if ((insn & 0x0e000000) == 0x08000000) return space_cccc_100x(insn, asi); - } else if ((insn & 0x0e000000) == 0x0a000000) { + else if ((insn & 0x0e000000) == 0x0a000000) return space_cccc_101x(insn, asi); - } - return space_cccc_11xx(insn, asi); } @@ -1674,82 +1668,3 @@ void __init arm_kprobe_decode_init(void) { find_str_pc_offset(); } - - -/* - * All ARM instructions listed below. - * - * Instructions and their general purpose registers are given. - * If a particular register may not use R15, it is prefixed with a "!". - * If marked with a "*" means the value returned by reading R15 - * is implementation defined. - * - * ADC/ADD/AND/BIC/CMN/CMP/EOR/MOV/MVN/ORR/RSB/RSC/SBC/SUB/TEQ - * TST: Rd, Rn, Rm, !Rs - * BX: Rm - * BLX(2): !Rm - * BX: Rm (R15 legal, but discouraged) - * BXJ: !Rm, - * CLZ: !Rd, !Rm - * CPY: Rd, Rm - * LDC/2,STC/2 immediate offset & unindex: Rn - * LDC/2,STC/2 immediate pre/post-indexed: !Rn - * LDM(1/3): !Rn, register_list - * LDM(2): !Rn, !register_list - * LDR,STR,PLD immediate offset: Rd, Rn - * LDR,STR,PLD register offset: Rd, Rn, !Rm - * LDR,STR,PLD scaled register offset: Rd, !Rn, !Rm - * LDR,STR immediate pre/post-indexed: Rd, !Rn - * LDR,STR register pre/post-indexed: Rd, !Rn, !Rm - * LDR,STR scaled register pre/post-indexed: Rd, !Rn, !Rm - * LDRB,STRB immediate offset: !Rd, Rn - * LDRB,STRB register offset: !Rd, Rn, !Rm - * LDRB,STRB scaled register offset: !Rd, !Rn, !Rm - * LDRB,STRB immediate pre/post-indexed: !Rd, !Rn - * LDRB,STRB register pre/post-indexed: !Rd, !Rn, !Rm - * LDRB,STRB scaled register pre/post-indexed: !Rd, !Rn, !Rm - * LDRT,LDRBT,STRBT immediate pre/post-indexed: !Rd, !Rn - * LDRT,LDRBT,STRBT register pre/post-indexed: !Rd, !Rn, !Rm - * LDRT,LDRBT,STRBT scaled register pre/post-indexed: !Rd, !Rn, !Rm - * LDRH/SH/SB/D,STRH/SH/SB/D immediate offset: !Rd, Rn - * LDRH/SH/SB/D,STRH/SH/SB/D register offset: !Rd, Rn, !Rm - * LDRH/SH/SB/D,STRH/SH/SB/D immediate pre/post-indexed: !Rd, !Rn - * LDRH/SH/SB/D,STRH/SH/SB/D register pre/post-indexed: !Rd, !Rn, !Rm - * LDREX: !Rd, !Rn - * MCR/2: !Rd - * MCRR/2,MRRC/2: !Rd, !Rn - * MLA: !Rd, !Rn, !Rm, !Rs - * MOV: Rd - * MRC/2: !Rd (if Rd==15, only changes cond codes, not the register) - * MRS,MSR: !Rd - * MUL: !Rd, !Rm, !Rs - * PKH{BT,TB}: !Rd, !Rn, !Rm - * QDADD,[U]QADD/16/8/SUBX: !Rd, !Rm, !Rn - * QDSUB,[U]QSUB/16/8/ADDX: !Rd, !Rm, !Rn - * REV/16/SH: !Rd, !Rm - * RFE: !Rn - * {S,U}[H]ADD{16,8,SUBX},{S,U}[H]SUB{16,8,ADDX}: !Rd, !Rn, !Rm - * SEL: !Rd, !Rn, !Rm - * SMLA,SMLA{D,W},SMLSD,SMML{A,S}: !Rd, !Rn, !Rm, !Rs - * SMLAL,SMLA{D,LD},SMLSLD,SMMULL,SMULW: !RdHi, !RdLo, !Rm, !Rs - * SMMUL,SMUAD,SMUL,SMUSD: !Rd, !Rm, !Rs - * SSAT/16: !Rd, !Rm - * STM(1/2): !Rn, register_list* (R15 in reg list not recommended) - * STRT immediate pre/post-indexed: Rd*, !Rn - * STRT register pre/post-indexed: Rd*, !Rn, !Rm - * STRT scaled register pre/post-indexed: Rd*, !Rn, !Rm - * STREX: !Rd, !Rn, !Rm - * SWP/B: !Rd, !Rn, !Rm - * {S,U}XTA{B,B16,H}: !Rd, !Rn, !Rm - * {S,U}XT{B,B16,H}: !Rd, !Rm - * UM{AA,LA,UL}L: !RdHi, !RdLo, !Rm, !Rs - * USA{D8,A8,T,T16}: !Rd, !Rm, !Rs - * - * May transfer control by writing R15 (possible mode changes or alternate - * mode accesses marked by "*"): - * ALU op (* with s-bit), B, BL, BKPT, BLX(1/2), BX, BXJ, CPS*, CPY, - * LDM(1), LDM(2/3)*, LDR, MOV, RFE*, SWI* - * - * Instructions that do not take general registers, nor transfer control: - * CDP/2, SETEND, SRS* - */ -- cgit v1.1 From bfacf2225a955bea9c41c707fc72ba16009674a0 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 27 Apr 2011 13:25:51 -0400 Subject: cifs: change bleft in decode_unicode_ssetup back to signed type The buffer length checks in this function depend on this value being a signed data type, but 690c522fa converted it to an unsigned type. Also, eliminate a problem with the null termination check in the same function. cifs_strndup_from_ucs handles that situation correctly already, and the existing check could potentially lead to a buffer overrun since it increments bleft without checking to see whether it falls off the end of the buffer. Cc: stable@kernel.org Reported-and-Acked-by: David Howells Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/sess.c | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index f6728eb..2e2c911 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -276,7 +276,7 @@ static void ascii_ssetup_strings(char **pbcc_area, struct cifsSesInfo *ses, } static void -decode_unicode_ssetup(char **pbcc_area, __u16 bleft, struct cifsSesInfo *ses, +decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifsSesInfo *ses, const struct nls_table *nls_cp) { int len; @@ -284,19 +284,6 @@ decode_unicode_ssetup(char **pbcc_area, __u16 bleft, struct cifsSesInfo *ses, cFYI(1, "bleft %d", bleft); - /* - * Windows servers do not always double null terminate their final - * Unicode string. Check to see if there are an uneven number of bytes - * left. If so, then add an extra NULL pad byte to the end of the - * response. - * - * See section 2.7.2 in "Implementing CIFS" for details - */ - if (bleft % 2) { - data[bleft] = 0; - ++bleft; - } - kfree(ses->serverOS); ses->serverOS = cifs_strndup_from_ucs(data, bleft, true, nls_cp); cFYI(1, "serverOS=%s", ses->serverOS); -- cgit v1.1 From fcda7f4578bbf9717444ca6da8a421d21489d078 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 27 Apr 2011 13:25:51 -0400 Subject: cifs: check for bytes_remaining going to zero in CIFS_SessSetup It's possible that when we go to decode the string area in the SESSION_SETUP response, that bytes_remaining will be 0. Decrementing it at that point will mean that it can go "negative" and wrap. Check for a bytes_remaining value of 0, and don't try to decode the string area if that's the case. Cc: stable@kernel.org Reported-and-Acked-by: David Howells Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/sess.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 2e2c911..645114a 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -916,7 +916,9 @@ ssetup_ntlmssp_authenticate: } /* BB check if Unicode and decode strings */ - if (smb_buf->Flags2 & SMBFLG2_UNICODE) { + if (bytes_remaining == 0) { + /* no string area to decode, do nothing */ + } else if (smb_buf->Flags2 & SMBFLG2_UNICODE) { /* unicode string area must be word-aligned */ if (((unsigned long) bcc_ptr - (unsigned long) smb_buf) % 2) { ++bcc_ptr; -- cgit v1.1 From 2a2047bc94d0efc316401170c3d078d9edc20dc4 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 27 Apr 2011 13:29:49 -0400 Subject: cifs: sanitize length checking in coalesce_t2 (try #3) There are a couple of places in this code where these values can wrap or go negative, and that could potentially end up overflowing the buffer. Ensure that that doesn't happen. Do all of the length calculation and checks first, and only perform the memcpy after they pass. Also, increase some stack variables to 32 bits to ensure that they don't wrap without being detected. Finally, change the error codes to be a bit more descriptive of any problems detected. -EINVAL isn't very accurate. Cc: stable@kernel.org Reported-and-Acked-by: David Howells Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/connect.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 4bc862a..8b75a8ec 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -274,7 +274,8 @@ static int coalesce_t2(struct smb_hdr *psecond, struct smb_hdr *pTargetSMB) char *data_area_of_target; char *data_area_of_buf2; int remaining; - __u16 byte_count, total_data_size, total_in_buf, total_in_buf2; + unsigned int byte_count, total_in_buf; + __u16 total_data_size, total_in_buf2; total_data_size = get_unaligned_le16(&pSMBt->t2_rsp.TotalDataCount); @@ -287,7 +288,7 @@ static int coalesce_t2(struct smb_hdr *psecond, struct smb_hdr *pTargetSMB) remaining = total_data_size - total_in_buf; if (remaining < 0) - return -EINVAL; + return -EPROTO; if (remaining == 0) /* nothing to do, ignore */ return 0; @@ -308,20 +309,29 @@ static int coalesce_t2(struct smb_hdr *psecond, struct smb_hdr *pTargetSMB) data_area_of_target += total_in_buf; /* copy second buffer into end of first buffer */ - memcpy(data_area_of_target, data_area_of_buf2, total_in_buf2); total_in_buf += total_in_buf2; + /* is the result too big for the field? */ + if (total_in_buf > USHRT_MAX) + return -EPROTO; put_unaligned_le16(total_in_buf, &pSMBt->t2_rsp.DataCount); + + /* fix up the BCC */ byte_count = get_bcc_le(pTargetSMB); byte_count += total_in_buf2; + /* is the result too big for the field? */ + if (byte_count > USHRT_MAX) + return -EPROTO; put_bcc_le(byte_count, pTargetSMB); byte_count = pTargetSMB->smb_buf_length; byte_count += total_in_buf2; - - /* BB also add check that we are not beyond maximum buffer size */ - + /* don't allow buffer to overflow */ + if (byte_count > CIFSMaxBufSize) + return -ENOBUFS; pTargetSMB->smb_buf_length = byte_count; + memcpy(data_area_of_target, data_area_of_buf2, total_in_buf2); + if (remaining == total_in_buf2) { cFYI(1, "found the last secondary response"); return 0; /* we are done */ -- cgit v1.1 From c7a7b814c9dca9ee01b38e63b4a46de87156d3b6 Mon Sep 17 00:00:00 2001 From: Tim Gardner Date: Thu, 28 Apr 2011 11:00:30 -0600 Subject: ioremap: Delay sanity check until after a successful mapping While tracking down the reason for an ioremap() failure I was distracted by the WARN_ONCE() in __ioremap_caller(). Performing a WARN_ONCE() sanity check before the mapping is successful seems pointless if the caller sends bad values. A case in point is when the BIOS provides erroneous screen_info values causing vesafb_probe() to request an outrageuous size. The WARN_ONCE is then wasted on bogosity. Move the warning to a point where the mapping has been successfully allocated. Addresses: http://bugs.launchpad.net/bugs/772042 Reviewed-by: Suresh Siddha Signed-off-by: Tim Gardner Link: http://lkml.kernel.org/r/4DB99D2E.9080106@canonical.com Signed-off-by: Ingo Molnar --- arch/x86/mm/ioremap.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 0369843..be1ef57 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -91,13 +91,6 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, return (__force void __iomem *)phys_to_virt(phys_addr); /* - * Check if the request spans more than any BAR in the iomem resource - * tree. - */ - WARN_ONCE(iomem_map_sanity_check(phys_addr, size), - KERN_INFO "Info: mapping multiple BARs. Your kernel is fine."); - - /* * Don't allow anybody to remap normal RAM that we're using.. */ last_pfn = last_addr >> PAGE_SHIFT; @@ -170,6 +163,13 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, ret_addr = (void __iomem *) (vaddr + offset); mmiotrace_ioremap(unaligned_phys_addr, unaligned_size, ret_addr); + /* + * Check if the request spans more than any BAR in the iomem resource + * tree. + */ + WARN_ONCE(iomem_map_sanity_check(unaligned_phys_addr, unaligned_size), + KERN_INFO "Info: mapping multiple BARs. Your kernel is fine."); + return ret_addr; err_free_area: free_vm_area(area); -- cgit v1.1 From bf2253a6f00e8fea5b026e471e9f0d0a1b3621f2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 29 Apr 2011 10:15:14 +0200 Subject: cdrom: always check_disk_change() on open cdrom_open() called check_disk_change() after the rest of open path succeeded which leads to the following bizarre behavior. * After media change, if the device opened without O_NONBLOCK, open_for_data() naturally fails with -ENOMEDIA and check_disk_change() is never called. The media is known to be gone and the open failure makes it obvious to the userland but device invalidation never happens. * But if the device is opened with O_NONBLOCK, all the checks are bypassed and cdrom_open() doesn't notice that the media is not there and check_disk_change() is called and invalidation happens. There's nothing to be gained by avoiding calling check_disk_change() on open failure. Common cases end up calling check_disk_change() anyway. All we get is inconsistent behavior. Fix it by moving check_disk_change() invocation to the top of cdrom_open() so that it always gets called regardless of how the rest of open proceeds. Stable: 2.6.38 Signed-off-by: Tejun Heo Reported-by: Amit Shah Tested-by: Amit Shah Cc: stable@kernel.org Signed-off-by: Jens Axboe --- drivers/cdrom/cdrom.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c index 514dd8e..75fb965 100644 --- a/drivers/cdrom/cdrom.c +++ b/drivers/cdrom/cdrom.c @@ -986,6 +986,9 @@ int cdrom_open(struct cdrom_device_info *cdi, struct block_device *bdev, fmode_t cdinfo(CD_OPEN, "entering cdrom_open\n"); + /* open is event synchronization point, check events first */ + check_disk_change(bdev); + /* if this was a O_NONBLOCK open and we should honor the flags, * do a quick open without drive/disc integrity checks. */ cdi->use_count++; @@ -1012,9 +1015,6 @@ int cdrom_open(struct cdrom_device_info *cdi, struct block_device *bdev, fmode_t cdinfo(CD_OPEN, "Use count for \"/dev/%s\" now %d\n", cdi->name, cdi->use_count); - /* Do this on open. Don't wait for mount, because they might - not be mounting, but opening with O_NONBLOCK */ - check_disk_change(bdev); return 0; err_release: if (CDROM_CAN(CDC_LOCK) && cdi->options & CDO_LOCK) { -- cgit v1.1 From 02e352287a40bd456eb78df705bf888bc3161d3f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 29 Apr 2011 10:15:20 +0200 Subject: block: rescan partitions on invalidated devices on -ENOMEDIA too __blkdev_get() doesn't rescan partitions if disk->fops->open() fails, which leads to ghost partition devices lingering after medimum removal is known to both the kernel and userland. The behavior also creates a subtle inconsistency where O_NONBLOCK open, which doesn't fail even if there's no medium, clears the ghots partitions, which is exploited to work around the problem from userland. Fix it by updating __blkdev_get() to issue partition rescan after -ENOMEDIA too. This was reported in the following bz. https://bugzilla.kernel.org/show_bug.cgi?id=13029 Stable: 2.6.38 Signed-off-by: Tejun Heo Reported-by: David Zeuthen Reported-by: Martin Pitt Reported-by: Kay Sievers Tested-by: Kay Sievers Cc: Alan Cox Cc: stable@kernel.org Signed-off-by: Jens Axboe --- fs/block_dev.c | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/fs/block_dev.c b/fs/block_dev.c index 5147bdd..257b00e 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1102,6 +1102,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) if (!bdev->bd_part) goto out_clear; + ret = 0; if (disk->fops->open) { ret = disk->fops->open(bdev, mode); if (ret == -ERESTARTSYS) { @@ -1118,9 +1119,18 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) put_disk(disk); goto restart; } - if (ret) - goto out_clear; } + /* + * If the device is invalidated, rescan partition + * if open succeeded or failed with -ENOMEDIUM. + * The latter is necessary to prevent ghost + * partitions on a removed medium. + */ + if (bdev->bd_invalidated && (!ret || ret == -ENOMEDIUM)) + rescan_partitions(disk, bdev); + if (ret) + goto out_clear; + if (!bdev->bd_openers) { bd_set_size(bdev,(loff_t)get_capacity(disk)<<9); bdi = blk_get_backing_dev_info(bdev); @@ -1128,8 +1138,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) bdi = &default_backing_dev_info; bdev_inode_switch_bdi(bdev->bd_inode, bdi); } - if (bdev->bd_invalidated) - rescan_partitions(disk, bdev); } else { struct block_device *whole; whole = bdget_disk(disk, 0); @@ -1153,13 +1161,14 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) } } else { if (bdev->bd_contains == bdev) { - if (bdev->bd_disk->fops->open) { + ret = 0; + if (bdev->bd_disk->fops->open) ret = bdev->bd_disk->fops->open(bdev, mode); - if (ret) - goto out_unlock_bdev; - } - if (bdev->bd_invalidated) + /* the same as first opener case, read comment there */ + if (bdev->bd_invalidated && (!ret || ret == -ENOMEDIUM)) rescan_partitions(bdev->bd_disk, bdev); + if (ret) + goto out_unlock_bdev; } /* only one opener holds refs to the module and disk */ module_put(disk->fops->owner); -- cgit v1.1 From 8f62242246351b5a4bc0c1f00c0c7003edea128a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 29 Apr 2011 13:19:47 +0200 Subject: perf events: Add generic front-end and back-end stalled cycle event definitions Add two generic hardware events: front-end and back-end stalled cycles. These events measure conditions when the CPU is executing code but its capabilities are not fully utilized. Understanding such situations and analyzing them is an important sub-task of code optimization workflows. Both events limit performance: most front end stalls tend to be caused by branch misprediction or instruction fetch cachemisses, backend stalls can be caused by various resource shortages or inefficient instruction scheduling. Front-end stalls are the more important ones: code cannot run fast if the instruction stream is not being kept up. An over-utilized back-end can cause front-end stalls and thus has to be kept an eye on as well. The exact composition is very program logic and instruction mix dependent. We use the terms 'stall', 'front-end' and 'back-end' loosely and try to use the best available events from specific CPUs that approximate these concepts. Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Link: http://lkml.kernel.org/n/tip-7y40wib8n000io7hjpn1dsrm@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel.c | 2 +- include/linux/perf_event.h | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 1ea9422..393085b 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1414,7 +1414,7 @@ static __init int intel_pmu_init(void) x86_pmu.extra_regs = intel_nehalem_extra_regs; /* Install the stalled-cycles event: UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */ - intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES] = 0x1803fb1; + intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x1803fb1; if (ebx & 0x40) { /* diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index ac636dd..4e2d7ae 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -52,7 +52,8 @@ enum perf_hw_id { PERF_COUNT_HW_BRANCH_INSTRUCTIONS = 4, PERF_COUNT_HW_BRANCH_MISSES = 5, PERF_COUNT_HW_BUS_CYCLES = 6, - PERF_COUNT_HW_STALLED_CYCLES = 7, + PERF_COUNT_HW_STALLED_CYCLES_FRONTEND = 7, + PERF_COUNT_HW_STALLED_CYCLES_BACKEND = 8, PERF_COUNT_HW_MAX, /* non-ABI */ }; -- cgit v1.1 From 91fc4cc00099986bc1ba50e1f421c3548cffae42 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 29 Apr 2011 14:17:19 +0200 Subject: perf, x86: Add new stalled cycles events for Intel and AMD CPUs Extend the Intel and AMD event definitions with generic front-end and back-end stall events. ( These are only approximations - suggestions are welcome for better events. ) Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Link: http://lkml.kernel.org/n/tip-7y40wib8n001io7hjpn1dsrm@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_amd.c | 14 ++++++++------ arch/x86/kernel/cpu/perf_event_intel.c | 4 +++- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index cf4e369..fe29c1d 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -96,12 +96,14 @@ static __initconst const u64 amd_hw_cache_event_ids */ static const u64 amd_perfmon_event_map[] = { - [PERF_COUNT_HW_CPU_CYCLES] = 0x0076, - [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, - [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080, - [PERF_COUNT_HW_CACHE_MISSES] = 0x0081, - [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c2, - [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c3, + [PERF_COUNT_HW_CPU_CYCLES] = 0x0076, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080, + [PERF_COUNT_HW_CACHE_MISSES] = 0x0081, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c2, + [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c3, + [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x00d0, /* "Decoder empty" event */ + [PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x00d1, /* "Dispatch stalls" event */ }; static u64 amd_pmu_event_map(int hw_event) diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 393085b..7983b9a 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1413,7 +1413,9 @@ static __init int intel_pmu_init(void) x86_pmu.enable_all = intel_pmu_nhm_enable_all; x86_pmu.extra_regs = intel_nehalem_extra_regs; - /* Install the stalled-cycles event: UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */ + /* UOPS_ISSUED.STALLED_CYCLES */ + intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e; + /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */ intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x1803fb1; if (ebx & 0x40) { -- cgit v1.1 From e296e1276ca389156d7f06eed668dac30141c37d Mon Sep 17 00:00:00 2001 From: Jarod Wilson Date: Mon, 25 Apr 2011 14:48:18 -0300 Subject: [media] mceusb: add Dell transceiver ID Add device ID for a Dell-branded, Philips device ID transceiver reported by an OpenELEC user on their forums. http://openelec.tv/forum/27-hardware-support/5622-adding-support-for-an-ir-receiver--dell-branded--#5622 Signed-off-by: Jarod Wilson Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/mceusb.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/media/rc/mceusb.c b/drivers/media/rc/mceusb.c index 044fb7a..0c273ec 100644 --- a/drivers/media/rc/mceusb.c +++ b/drivers/media/rc/mceusb.c @@ -220,6 +220,8 @@ static struct usb_device_id mceusb_dev_table[] = { { USB_DEVICE(VENDOR_PHILIPS, 0x206c) }, /* Philips/Spinel plus IR transceiver for ASUS */ { USB_DEVICE(VENDOR_PHILIPS, 0x2088) }, + /* Philips IR transceiver (Dell branded) */ + { USB_DEVICE(VENDOR_PHILIPS, 0x2093) }, /* Realtek MCE IR Receiver and card reader */ { USB_DEVICE(VENDOR_REALTEK, 0x0161), .driver_info = MULTIFUNCTION }, -- cgit v1.1 From d7516c7cf37a5fcb84b8f229947e3f52c62e19ae Mon Sep 17 00:00:00 2001 From: Jarod Wilson Date: Mon, 25 Apr 2011 14:50:50 -0300 Subject: [media] ite-cir: modular build on ppc requires delay.h include Signed-off-by: Jarod Wilson Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/ite-cir.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/media/rc/ite-cir.c b/drivers/media/rc/ite-cir.c index ac0e42b..9485dac 100644 --- a/drivers/media/rc/ite-cir.c +++ b/drivers/media/rc/ite-cir.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include -- cgit v1.1 From b30039333ae2a1cdd19ebd856a69e96918a46637 Mon Sep 17 00:00:00 2001 From: Jarod Wilson Date: Tue, 26 Apr 2011 12:25:02 -0300 Subject: [media] rc: show RC_TYPE_OTHER in sysfs Signed-off-by: Jarod Wilson Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/rc-main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/media/rc/rc-main.c b/drivers/media/rc/rc-main.c index 5ac1baf..9f0a2d9 100644 --- a/drivers/media/rc/rc-main.c +++ b/drivers/media/rc/rc-main.c @@ -733,6 +733,7 @@ static struct { { RC_TYPE_SONY, "sony" }, { RC_TYPE_RC5_SZ, "rc-5-sz" }, { RC_TYPE_LIRC, "lirc" }, + { RC_TYPE_OTHER, "other" }, }; #define PROTO_NONE "none" -- cgit v1.1 From 23ef710e1a6c4d6b9ef1c2fa19410f7f1479401e Mon Sep 17 00:00:00 2001 From: Jarod Wilson Date: Wed, 27 Apr 2011 19:01:44 -0300 Subject: [media] imon: add conditional locking in change_protocol The imon_ir_change_protocol function gets called two different ways, one way is from rc_register_device, for initial protocol selection/setup, and the other is via a userspace-initiated protocol change request, either by direct sysfs prodding or by something like ir-keytable. In the rc_register_device case, the imon context lock is already held, but when initiated from userspace, it is not, so we must acquire it, prior to calling send_packet, which requires that the lock is held. Without this change, there's an easily reproduceable deadlock when another function calls send_packet (such as either of the display write fops) after a userspace-initiated change_protocol. With a lock-debugging-enabled kernel, I was getting this: [ 15.014153] ===================================== [ 15.015048] [ BUG: bad unlock balance detected! ] [ 15.015048] ------------------------------------- [ 15.015048] ir-keytable/773 is trying to release lock (&ictx->lock) at: [ 15.015048] [] mutex_unlock+0xe/0x10 [ 15.015048] but there are no more locks to release! [ 15.015048] [ 15.015048] other info that might help us debug this: [ 15.015048] 2 locks held by ir-keytable/773: [ 15.015048] #0: (&buffer->mutex){+.+.+.}, at: [] sysfs_write_file+0x3c/0x144 [ 15.015048] #1: (s_active#87){.+.+.+}, at: [] sysfs_write_file+0xe7/0x144 [ 15.015048] [ 15.015048] stack backtrace: [ 15.015048] Pid: 773, comm: ir-keytable Not tainted 2.6.38.4-20.fc15.x86_64.debug #1 [ 15.015048] Call Trace: [ 15.015048] [] ? print_unlock_inbalance_bug+0xca/0xd5 [ 15.015048] [] ? lock_release_non_nested+0xc1/0x263 [ 15.015048] [] ? mutex_unlock+0xe/0x10 [ 15.015048] [] ? mutex_unlock+0xe/0x10 [ 15.015048] [] ? lock_release+0x17d/0x1a4 [ 15.015048] [] ? __mutex_unlock_slowpath+0xc5/0x125 [ 15.015048] [] ? mutex_unlock+0xe/0x10 [ 15.015048] [] ? send_packet+0x1c9/0x264 [imon] [ 15.015048] [] ? lock_release_non_nested+0xdb/0x263 [ 15.015048] [] ? imon_ir_change_protocol+0x126/0x15e [imon] [ 15.015048] [] ? store_protocols+0x1c3/0x286 [rc_core] [ 15.015048] [] ? dev_attr_store+0x20/0x22 [ 15.015048] [] ? sysfs_write_file+0x108/0x144 ... The original report that led to the investigation was the following: [ 1679.457305] INFO: task LCDd:8460 blocked for more than 120 seconds. [ 1679.457307] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [ 1679.457309] LCDd D ffff88010fcd89c8 0 8460 1 0x00000000 [ 1679.457312] ffff8800d5a03b48 0000000000000082 0000000000000000 ffff8800d5a03fd8 [ 1679.457314] 00000000012dcd30 fffffffffffffffd ffff8800d5a03fd8 ffff88010fcd86f0 [ 1679.457316] ffff8800d5a03fd8 ffff8800d5a03fd8 ffff88010fcd89d0 ffff8800d5a03fd8 [ 1679.457319] Call Trace: [ 1679.457324] [] ? zone_statistics+0x75/0x90 [ 1679.457327] [] ? get_page_from_freelist+0x3c7/0x820 [ 1679.457330] [] __mutex_lock_slowpath+0x139/0x320 [ 1679.457335] [] mutex_lock+0x11/0x30 [ 1679.457338] [] display_open+0x66/0x130 [imon] [ 1679.457345] [] usb_open+0x180/0x310 [usbcore] [ 1679.457349] [] chrdev_open+0x1bb/0x2d0 [ 1679.457350] [] __dentry_open+0x10d/0x370 [ 1679.457352] [] ? chrdev_open+0x0/0x2d0 ... Bump the driver version here so its easier to tell if people have this locking fix or not, and also make locking during probe easier to follow. CC: stable@kernel.org Reported-by: Benjamin Hodgetts Signed-off-by: Jarod Wilson Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/imon.c | 31 +++++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/drivers/media/rc/imon.c b/drivers/media/rc/imon.c index f714e1a..3ca8659 100644 --- a/drivers/media/rc/imon.c +++ b/drivers/media/rc/imon.c @@ -46,7 +46,7 @@ #define MOD_AUTHOR "Jarod Wilson " #define MOD_DESC "Driver for SoundGraph iMON MultiMedia IR/Display" #define MOD_NAME "imon" -#define MOD_VERSION "0.9.2" +#define MOD_VERSION "0.9.3" #define DISPLAY_MINOR_BASE 144 #define DEVICE_NAME "lcd%d" @@ -460,8 +460,9 @@ static int display_close(struct inode *inode, struct file *file) } /** - * Sends a packet to the device -- this function must be called - * with ictx->lock held. + * Sends a packet to the device -- this function must be called with + * ictx->lock held, or its unlock/lock sequence while waiting for tx + * to complete can/will lead to a deadlock. */ static int send_packet(struct imon_context *ictx) { @@ -991,12 +992,21 @@ static void imon_touch_display_timeout(unsigned long data) * the iMON remotes, and those used by the Windows MCE remotes (which is * really just RC-6), but only one or the other at a time, as the signals * are decoded onboard the receiver. + * + * This function gets called two different ways, one way is from + * rc_register_device, for initial protocol selection/setup, and the other is + * via a userspace-initiated protocol change request, either by direct sysfs + * prodding or by something like ir-keytable. In the rc_register_device case, + * the imon context lock is already held, but when initiated from userspace, + * it is not, so we must acquire it prior to calling send_packet, which + * requires that the lock is held. */ static int imon_ir_change_protocol(struct rc_dev *rc, u64 rc_type) { int retval; struct imon_context *ictx = rc->priv; struct device *dev = ictx->dev; + bool unlock = false; unsigned char ir_proto_packet[] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x86 }; @@ -1029,6 +1039,11 @@ static int imon_ir_change_protocol(struct rc_dev *rc, u64 rc_type) memcpy(ictx->usb_tx_buf, &ir_proto_packet, sizeof(ir_proto_packet)); + if (!mutex_is_locked(&ictx->lock)) { + unlock = true; + mutex_lock(&ictx->lock); + } + retval = send_packet(ictx); if (retval) goto out; @@ -1037,6 +1052,9 @@ static int imon_ir_change_protocol(struct rc_dev *rc, u64 rc_type) ictx->pad_mouse = false; out: + if (unlock) + mutex_unlock(&ictx->lock); + return retval; } @@ -2134,6 +2152,7 @@ static struct imon_context *imon_init_intf0(struct usb_interface *intf) goto rdev_setup_failed; } + mutex_unlock(&ictx->lock); return ictx; rdev_setup_failed: @@ -2205,6 +2224,7 @@ static struct imon_context *imon_init_intf1(struct usb_interface *intf, goto urb_submit_failed; } + mutex_unlock(&ictx->lock); return ictx; urb_submit_failed: @@ -2299,6 +2319,8 @@ static int __devinit imon_probe(struct usb_interface *interface, usb_set_intfdata(interface, ictx); if (ifnum == 0) { + mutex_lock(&ictx->lock); + if (product == 0xffdc && ictx->rf_device) { sysfs_err = sysfs_create_group(&interface->dev.kobj, &imon_rf_attr_group); @@ -2309,13 +2331,14 @@ static int __devinit imon_probe(struct usb_interface *interface, if (ictx->display_supported) imon_init_display(ictx, interface); + + mutex_unlock(&ictx->lock); } dev_info(dev, "iMON device (%04x:%04x, intf%d) on " "usb<%d:%d> initialized\n", vendor, product, ifnum, usbdev->bus->busnum, usbdev->devnum); - mutex_unlock(&ictx->lock); mutex_unlock(&driver_lock); return 0; -- cgit v1.1 From 129c04cb8ce2e4bf3f17223f58ef16aa8a2cb3b8 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 29 Apr 2011 14:41:28 +0200 Subject: perf tools: Add front-end and back-end stalled cycles support Update perf tooling to deal with front-end and back-end stalled cycles events. Add both the default 'perf stat' output. Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Link: http://lkml.kernel.org/n/tip-7y40wib8n002io7hjpn1dsrm@git.kernel.org Signed-off-by: Ingo Molnar --- tools/perf/builtin-stat.c | 10 ++++++---- tools/perf/util/parse-events.c | 37 +++++++++++++++++++------------------ tools/perf/util/python.c | 4 +++- 3 files changed, 28 insertions(+), 23 deletions(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index da77077..6a4a8a3 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -66,7 +66,8 @@ static struct perf_event_attr default_attrs[] = { { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_PAGE_FAULTS }, { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES }, - { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_STALLED_CYCLES }, + { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_STALLED_CYCLES_FRONTEND }, + { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_STALLED_CYCLES_BACKEND }, { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_INSTRUCTIONS }, { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_INSTRUCTIONS }, { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_MISSES }, @@ -84,7 +85,8 @@ static struct perf_event_attr detailed_attrs[] = { { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_PAGE_FAULTS }, { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES }, - { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_STALLED_CYCLES }, + { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_STALLED_CYCLES_FRONTEND }, + { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_STALLED_CYCLES_BACKEND }, { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_INSTRUCTIONS }, { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_INSTRUCTIONS }, { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_MISSES }, @@ -249,7 +251,7 @@ static void update_shadow_stats(struct perf_evsel *counter, u64 *count) update_stats(&runtime_nsecs_stats[0], count[0]); else if (perf_evsel__match(counter, HARDWARE, HW_CPU_CYCLES)) update_stats(&runtime_cycles_stats[0], count[0]); - else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES)) + else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_BACKEND)) update_stats(&runtime_stalled_cycles_stats[0], count[0]); else if (perf_evsel__match(counter, HARDWARE, HW_BRANCH_INSTRUCTIONS)) update_stats(&runtime_branches_stats[0], count[0]); @@ -607,7 +609,7 @@ static void abs_printout(int cpu, struct perf_evsel *evsel, double avg) fprintf(stderr, " # %8.3f %% of all cache refs ", ratio); - } else if (perf_evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES)) { + } else if (perf_evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES_BACKEND)) { print_stalled_cycles(cpu, evsel, avg); } else if (perf_evsel__match(evsel, HARDWARE, HW_CPU_CYCLES)) { total = avg_stats(&runtime_nsecs_stats[cpu]); diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index bbbb735..04d2f0a 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -31,24 +31,25 @@ char debugfs_path[MAXPATHLEN]; #define CSW(x) .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_##x static struct event_symbol event_symbols[] = { - { CHW(CPU_CYCLES), "cpu-cycles", "cycles" }, - { CHW(STALLED_CYCLES), "stalled-cycles", "idle-cycles" }, - { CHW(INSTRUCTIONS), "instructions", "" }, - { CHW(CACHE_REFERENCES), "cache-references", "" }, - { CHW(CACHE_MISSES), "cache-misses", "" }, - { CHW(BRANCH_INSTRUCTIONS), "branch-instructions", "branches" }, - { CHW(BRANCH_MISSES), "branch-misses", "" }, - { CHW(BUS_CYCLES), "bus-cycles", "" }, - - { CSW(CPU_CLOCK), "cpu-clock", "" }, - { CSW(TASK_CLOCK), "task-clock", "" }, - { CSW(PAGE_FAULTS), "page-faults", "faults" }, - { CSW(PAGE_FAULTS_MIN), "minor-faults", "" }, - { CSW(PAGE_FAULTS_MAJ), "major-faults", "" }, - { CSW(CONTEXT_SWITCHES), "context-switches", "cs" }, - { CSW(CPU_MIGRATIONS), "cpu-migrations", "migrations" }, - { CSW(ALIGNMENT_FAULTS), "alignment-faults", "" }, - { CSW(EMULATION_FAULTS), "emulation-faults", "" }, + { CHW(CPU_CYCLES), "cpu-cycles", "cycles" }, + { CHW(STALLED_CYCLES_FRONTEND), "stalled-cycles-frontend", "idle-cycles-frontend" }, + { CHW(STALLED_CYCLES_BACKEND), "stalled-cycles-backend", "idle-cycles-backend" }, + { CHW(INSTRUCTIONS), "instructions", "" }, + { CHW(CACHE_REFERENCES), "cache-references", "" }, + { CHW(CACHE_MISSES), "cache-misses", "" }, + { CHW(BRANCH_INSTRUCTIONS), "branch-instructions", "branches" }, + { CHW(BRANCH_MISSES), "branch-misses", "" }, + { CHW(BUS_CYCLES), "bus-cycles", "" }, + + { CSW(CPU_CLOCK), "cpu-clock", "" }, + { CSW(TASK_CLOCK), "task-clock", "" }, + { CSW(PAGE_FAULTS), "page-faults", "faults" }, + { CSW(PAGE_FAULTS_MIN), "minor-faults", "" }, + { CSW(PAGE_FAULTS_MAJ), "major-faults", "" }, + { CSW(CONTEXT_SWITCHES), "context-switches", "cs" }, + { CSW(CPU_MIGRATIONS), "cpu-migrations", "migrations" }, + { CSW(ALIGNMENT_FAULTS), "alignment-faults", "" }, + { CSW(EMULATION_FAULTS), "emulation-faults", "" }, }; #define __PERF_EVENT_FIELD(config, name) \ diff --git a/tools/perf/util/python.c b/tools/perf/util/python.c index 406f613..8b0eff8 100644 --- a/tools/perf/util/python.c +++ b/tools/perf/util/python.c @@ -798,7 +798,6 @@ static struct { { "COUNT_HW_BRANCH_INSTRUCTIONS", PERF_COUNT_HW_BRANCH_INSTRUCTIONS }, { "COUNT_HW_BRANCH_MISSES", PERF_COUNT_HW_BRANCH_MISSES }, { "COUNT_HW_BUS_CYCLES", PERF_COUNT_HW_BUS_CYCLES }, - { "COUNT_HW_STALLED_CYCLES", PERF_COUNT_HW_STALLED_CYCLES }, { "COUNT_HW_CACHE_L1D", PERF_COUNT_HW_CACHE_L1D }, { "COUNT_HW_CACHE_L1I", PERF_COUNT_HW_CACHE_L1I }, { "COUNT_HW_CACHE_LL", PERF_COUNT_HW_CACHE_LL }, @@ -811,6 +810,9 @@ static struct { { "COUNT_HW_CACHE_RESULT_ACCESS", PERF_COUNT_HW_CACHE_RESULT_ACCESS }, { "COUNT_HW_CACHE_RESULT_MISS", PERF_COUNT_HW_CACHE_RESULT_MISS }, + { "COUNT_HW_STALLED_CYCLES_FRONTEND", PERF_COUNT_HW_STALLED_CYCLES_FRONTEND }, + { "COUNT_HW_STALLED_CYCLES_BACKEND", PERF_COUNT_HW_STALLED_CYCLES_BACKEND }, + { "COUNT_SW_CPU_CLOCK", PERF_COUNT_SW_CPU_CLOCK }, { "COUNT_SW_TASK_CLOCK", PERF_COUNT_SW_TASK_CLOCK }, { "COUNT_SW_PAGE_FAULTS", PERF_COUNT_SW_PAGE_FAULTS }, -- cgit v1.1 From d3d1e86da07b4565815e3dbcd082f53017d215f8 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 29 Apr 2011 13:49:08 +0200 Subject: perf stat: Analyze front-end and back-end stall counts Sample output: Performance counter stats for './loop_1b': 873.691065 task-clock # 1.000 CPUs utilized 1 context-switches # 0.000 M/sec 1 CPU-migrations # 0.000 M/sec 96 page-faults # 0.000 M/sec 2,012,637,222 cycles # 2.304 GHz (66.58%) 1,001,397,911 stalled-cycles-frontend # 49.76% frontend cycles idle (66.58%) 7,523,398 stalled-cycles-backend # 0.37% backend cycles idle (66.76%) 2,004,551,046 instructions # 1.00 insns per cycle # 0.50 stalled cycles per insn (66.80%) 1,001,304,992 branches # 1146.063 M/sec (66.76%) 39,453 branch-misses # 0.00% of all branches (66.64%) 0.874046121 seconds time elapsed Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Link: http://lkml.kernel.org/n/tip-7y40wib8n003io7hjpn1dsrm@git.kernel.org Signed-off-by: Ingo Molnar --- tools/perf/builtin-stat.c | 41 +++++++++++++++++++++++++++++++++++------ tools/perf/util/parse-events.c | 7 ++++--- 2 files changed, 39 insertions(+), 9 deletions(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 6a4a8a3..e454499 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -201,7 +201,8 @@ static double stddev_stats(struct stats *stats) struct stats runtime_nsecs_stats[MAX_NR_CPUS]; struct stats runtime_cycles_stats[MAX_NR_CPUS]; -struct stats runtime_stalled_cycles_stats[MAX_NR_CPUS]; +struct stats runtime_stalled_cycles_front_stats[MAX_NR_CPUS]; +struct stats runtime_stalled_cycles_back_stats[MAX_NR_CPUS]; struct stats runtime_branches_stats[MAX_NR_CPUS]; struct stats runtime_cacherefs_stats[MAX_NR_CPUS]; struct stats runtime_l1_dcache_stats[MAX_NR_CPUS]; @@ -251,8 +252,10 @@ static void update_shadow_stats(struct perf_evsel *counter, u64 *count) update_stats(&runtime_nsecs_stats[0], count[0]); else if (perf_evsel__match(counter, HARDWARE, HW_CPU_CYCLES)) update_stats(&runtime_cycles_stats[0], count[0]); + else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_FRONTEND)) + update_stats(&runtime_stalled_cycles_front_stats[0], count[0]); else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_BACKEND)) - update_stats(&runtime_stalled_cycles_stats[0], count[0]); + update_stats(&runtime_stalled_cycles_back_stats[0], count[0]); else if (perf_evsel__match(counter, HARDWARE, HW_BRANCH_INSTRUCTIONS)) update_stats(&runtime_branches_stats[0], count[0]); else if (perf_evsel__match(counter, HARDWARE, HW_CACHE_REFERENCES)) @@ -478,7 +481,30 @@ static void nsec_printout(int cpu, struct perf_evsel *evsel, double avg) fprintf(stderr, " # %8.3f CPUs utilized ", avg / avg_stats(&walltime_nsecs_stats)); } -static void print_stalled_cycles(int cpu, struct perf_evsel *evsel __used, double avg) +static void print_stalled_cycles_frontend(int cpu, struct perf_evsel *evsel __used, double avg) +{ + double total, ratio = 0.0; + const char *color; + + total = avg_stats(&runtime_cycles_stats[cpu]); + + if (total) + ratio = avg / total * 100.0; + + color = PERF_COLOR_NORMAL; + if (ratio > 75.0) + color = PERF_COLOR_RED; + else if (ratio > 50.0) + color = PERF_COLOR_MAGENTA; + else if (ratio > 20.0) + color = PERF_COLOR_YELLOW; + + fprintf(stderr, " # "); + color_fprintf(stderr, color, "%5.2f%%", ratio); + fprintf(stderr, " frontend cycles idle "); +} + +static void print_stalled_cycles_backend(int cpu, struct perf_evsel *evsel __used, double avg) { double total, ratio = 0.0; const char *color; @@ -498,7 +524,7 @@ static void print_stalled_cycles(int cpu, struct perf_evsel *evsel __used, doubl fprintf(stderr, " # "); color_fprintf(stderr, color, "%5.2f%%", ratio); - fprintf(stderr, " of all cycles are idle "); + fprintf(stderr, " backend cycles idle "); } static void print_branch_misses(int cpu, struct perf_evsel *evsel __used, double avg) @@ -583,7 +609,8 @@ static void abs_printout(int cpu, struct perf_evsel *evsel, double avg) fprintf(stderr, " # %4.2f insns per cycle ", ratio); - total = avg_stats(&runtime_stalled_cycles_stats[cpu]); + total = avg_stats(&runtime_stalled_cycles_front_stats[cpu]); + total = max(total, avg_stats(&runtime_stalled_cycles_back_stats[cpu])); if (total && avg) { ratio = total / avg; @@ -609,8 +636,10 @@ static void abs_printout(int cpu, struct perf_evsel *evsel, double avg) fprintf(stderr, " # %8.3f %% of all cache refs ", ratio); + } else if (perf_evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES_FRONTEND)) { + print_stalled_cycles_frontend(cpu, evsel, avg); } else if (perf_evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES_BACKEND)) { - print_stalled_cycles(cpu, evsel, avg); + print_stalled_cycles_backend(cpu, evsel, avg); } else if (perf_evsel__match(evsel, HARDWARE, HW_CPU_CYCLES)) { total = avg_stats(&runtime_nsecs_stats[cpu]); diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 04d2f0a..8a407f3 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -60,7 +60,7 @@ static struct event_symbol event_symbols[] = { #define PERF_EVENT_TYPE(config) __PERF_EVENT_FIELD(config, TYPE) #define PERF_EVENT_ID(config) __PERF_EVENT_FIELD(config, EVENT) -static const char *hw_event_names[] = { +static const char *hw_event_names[PERF_COUNT_HW_MAX] = { "cycles", "instructions", "cache-references", @@ -68,10 +68,11 @@ static const char *hw_event_names[] = { "branches", "branch-misses", "bus-cycles", - "stalled-cycles", + "stalled-cycles-frontend", + "stalled-cycles-backend", }; -static const char *sw_event_names[] = { +static const char *sw_event_names[PERF_COUNT_SW_MAX] = { "cpu-clock", "task-clock", "page-faults", -- cgit v1.1 From 2b427e14b77dbf3e05f1bd0785f1d07ea5fe924e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 29 Apr 2011 14:16:18 +0200 Subject: perf stat: Adjust stall cycles warning percentages Adjust to color thresholds to better match the percentages seen in real workloads. Both are now a bit more sensitive. Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Link: http://lkml.kernel.org/n/tip-7y40wib8n004io7hjpn1dsrm@git.kernel.org Signed-off-by: Ingo Molnar --- tools/perf/builtin-stat.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index e454499..2492a0e 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -492,11 +492,11 @@ static void print_stalled_cycles_frontend(int cpu, struct perf_evsel *evsel __us ratio = avg / total * 100.0; color = PERF_COLOR_NORMAL; - if (ratio > 75.0) + if (ratio > 50.0) color = PERF_COLOR_RED; - else if (ratio > 50.0) + else if (ratio > 30.0) color = PERF_COLOR_MAGENTA; - else if (ratio > 20.0) + else if (ratio > 10.0) color = PERF_COLOR_YELLOW; fprintf(stderr, " # "); @@ -519,7 +519,7 @@ static void print_stalled_cycles_backend(int cpu, struct perf_evsel *evsel __use color = PERF_COLOR_RED; else if (ratio > 50.0) color = PERF_COLOR_MAGENTA; - else if (ratio > 25.0) + else if (ratio > 20.0) color = PERF_COLOR_YELLOW; fprintf(stderr, " # "); -- cgit v1.1 From fce3c786d3a49eff397583b4b62fa38df90db937 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 30 Apr 2011 09:03:15 +0200 Subject: perf stat: Leave more room for percentages Triple digit percentages do not fit otherwise. Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Link: http://lkml.kernel.org/n/tip-7y40wib8n005io7hjpn1dsrm@git.kernel.org Signed-off-by: Ingo Molnar --- tools/perf/builtin-stat.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 2492a0e..9e596ab 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -499,8 +499,8 @@ static void print_stalled_cycles_frontend(int cpu, struct perf_evsel *evsel __us else if (ratio > 10.0) color = PERF_COLOR_YELLOW; - fprintf(stderr, " # "); - color_fprintf(stderr, color, "%5.2f%%", ratio); + fprintf(stderr, " # "); + color_fprintf(stderr, color, "%6.2f%%", ratio); fprintf(stderr, " frontend cycles idle "); } @@ -522,9 +522,9 @@ static void print_stalled_cycles_backend(int cpu, struct perf_evsel *evsel __use else if (ratio > 20.0) color = PERF_COLOR_YELLOW; - fprintf(stderr, " # "); - color_fprintf(stderr, color, "%5.2f%%", ratio); - fprintf(stderr, " backend cycles idle "); + fprintf(stderr, " # "); + color_fprintf(stderr, color, "%6.2f%%", ratio); + fprintf(stderr, " backend cycles idle "); } static void print_branch_misses(int cpu, struct perf_evsel *evsel __used, double avg) @@ -545,8 +545,8 @@ static void print_branch_misses(int cpu, struct perf_evsel *evsel __used, double else if (ratio > 5.0) color = PERF_COLOR_YELLOW; - fprintf(stderr, " # "); - color_fprintf(stderr, color, "%5.2f%%", ratio); + fprintf(stderr, " # "); + color_fprintf(stderr, color, "%6.2f%%", ratio); fprintf(stderr, " of all branches "); } @@ -568,8 +568,8 @@ static void print_l1_dcache_misses(int cpu, struct perf_evsel *evsel __used, dou else if (ratio > 5.0) color = PERF_COLOR_YELLOW; - fprintf(stderr, " # "); - color_fprintf(stderr, color, "%5.2f%%", ratio); + fprintf(stderr, " # "); + color_fprintf(stderr, color, "%6.2f%%", ratio); fprintf(stderr, " of all L1-dcache hits "); } @@ -607,14 +607,14 @@ static void abs_printout(int cpu, struct perf_evsel *evsel, double avg) if (total) ratio = avg / total; - fprintf(stderr, " # %4.2f insns per cycle ", ratio); + fprintf(stderr, " # %5.2f insns per cycle ", ratio); total = avg_stats(&runtime_stalled_cycles_front_stats[cpu]); total = max(total, avg_stats(&runtime_stalled_cycles_back_stats[cpu])); if (total && avg) { ratio = total / avg; - fprintf(stderr, "\n # %4.2f stalled cycles per insn", ratio); + fprintf(stderr, "\n # %5.2f stalled cycles per insn", ratio); } } else if (perf_evsel__match(evsel, HARDWARE, HW_BRANCH_MISSES) && -- cgit v1.1 From 370faf1dd0461ad811852c8abbbcd3d73b1e4fc4 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 29 Apr 2011 16:11:03 +0200 Subject: perf stat: Fail softly on unsupported events David Ahern reported this perf stat failure: > # /tmp/build-perf/perf stat -- sleep 1 > Error: stalled-cycles-frontend event is not supported. > Fatal: Not all events could be opened. > > This is a Dell R410 with an E5620 processor. Fail in a softer fashion on unknown/unsupported events. Reported-by: David Ahern Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Link: http://lkml.kernel.org/n/tip-7y40wib8n006io7hjpn1dsrm@git.kernel.org Signed-off-by: Ingo Molnar --- tools/perf/builtin-stat.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 9e596ab..c8b535b 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -377,7 +377,7 @@ static int run_perf_stat(int argc __used, const char **argv) list_for_each_entry(counter, &evsel_list->entries, node) { if (create_perf_stat_counter(counter) < 0) { - if (errno == EINVAL || errno == ENOSYS) + if (errno == EINVAL || errno == ENOSYS || errno == ENOENT) continue; if (errno == EPERM || errno == EACCES) { @@ -385,8 +385,6 @@ static int run_perf_stat(int argc __used, const char **argv) "\t Consider tweaking" " /proc/sys/kernel/perf_event_paranoid or running as root.", system_wide ? "system-wide " : ""); - } else if (errno == ENOENT) { - error("%s event is not supported. ", event_name(counter)); } else { error("open_counter returned with %d (%s). " "/bin/dmesg may provide additional information.\n", -- cgit v1.1 From 301120396b766ae4480e52ece220516a1707822b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 30 Apr 2011 09:14:54 +0200 Subject: perf events, x86: Add Westmere stalled-cycles-frontend/backend events Extend the Intel Westmere PMU driver with definitions for generic front-end and back-end stall events. ( These are only approximations. ) Reported-by: David Ahern Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Link: http://lkml.kernel.org/n/tip-7y40wib8n008io7hjpn1dsrm@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 7983b9a..be8363a 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1458,6 +1458,12 @@ static __init int intel_pmu_init(void) x86_pmu.enable_all = intel_pmu_nhm_enable_all; x86_pmu.pebs_constraints = intel_westmere_pebs_event_constraints; x86_pmu.extra_regs = intel_westmere_extra_regs; + + /* UOPS_ISSUED.STALLED_CYCLES */ + intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e; + /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */ + intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x1803fb1; + pr_cont("Westmere events, "); break; -- cgit v1.1 From 88fda5619e6cd7988dc1d9a52f2da9ee8fd0e64d Mon Sep 17 00:00:00 2001 From: Hussam Al-Tayeb Date: Mon, 21 Feb 2011 15:20:26 -0300 Subject: [media] rc_core: avoid kernel oops when rmmod saa7134 The following is a patch to avoid a kernel oops when running rmmod saa7134 on kernel 2.6.27.1. The change is as suggested by mchehab on irc.freenode.org Signed-off-by: Hussam Al-Tayeb Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/rc-main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/media/rc/rc-main.c b/drivers/media/rc/rc-main.c index 9f0a2d9..33afd98 100644 --- a/drivers/media/rc/rc-main.c +++ b/drivers/media/rc/rc-main.c @@ -707,7 +707,8 @@ static void ir_close(struct input_dev *idev) { struct rc_dev *rdev = input_get_drvdata(idev); - rdev->close(rdev); + if (rdev) + rdev->close(rdev); } /* class for /sys/class/rc */ -- cgit v1.1 From 2b5cb549f8a31b44575fe25ccd043ddb7e901cf8 Mon Sep 17 00:00:00 2001 From: Malcolm Priestley Date: Tue, 22 Feb 2011 03:14:34 -0300 Subject: [media] Missing frontend config for LME DM04/QQBOX Forgot to add the DVB_STV0299/DVB_PLL to config Signed-off-by: Malcolm Priestley Signed-off-by: Mauro Carvalho Chehab --- drivers/media/dvb/dvb-usb/Kconfig | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/media/dvb/dvb-usb/Kconfig b/drivers/media/dvb/dvb-usb/Kconfig index ccbd39a..c545039 100644 --- a/drivers/media/dvb/dvb-usb/Kconfig +++ b/drivers/media/dvb/dvb-usb/Kconfig @@ -356,6 +356,8 @@ config DVB_USB_LME2510 select DVB_TDA826X if !DVB_FE_CUSTOMISE select DVB_STV0288 if !DVB_FE_CUSTOMISE select DVB_IX2505V if !DVB_FE_CUSTOMISE + select DVB_STV0299 if !DVB_FE_CUSTOMISE + select DVB_PLL if !DVB_FE_CUSTOMISE help Say Y here to support the LME DM04/QQBOX DVB-S USB2.0 . -- cgit v1.1 From 0f22072ab50cac7983f9660d33974b45184da4f9 Mon Sep 17 00:00:00 2001 From: Dan Rosenberg Date: Fri, 29 Apr 2011 15:48:07 +0100 Subject: ARM: 6891/1: prevent heap corruption in OABI semtimedop When CONFIG_OABI_COMPAT is set, the wrapper for semtimedop does not bound the nsops argument. A sufficiently large value will cause an integer overflow in allocation size, followed by copying too much data into the allocated buffer. Fix this by restricting nsops to SEMOPM. Untested. Cc: stable@kernel.org Signed-off-by: Dan Rosenberg Signed-off-by: Russell King --- arch/arm/kernel/sys_oabi-compat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/kernel/sys_oabi-compat.c b/arch/arm/kernel/sys_oabi-compat.c index 4ad8da1..af0aaeb 100644 --- a/arch/arm/kernel/sys_oabi-compat.c +++ b/arch/arm/kernel/sys_oabi-compat.c @@ -311,7 +311,7 @@ asmlinkage long sys_oabi_semtimedop(int semid, long err; int i; - if (nsops < 1) + if (nsops < 1 || nsops > SEMOPM) return -EINVAL; sops = kmalloc(sizeof(*sops) * nsops, GFP_KERNEL); if (!sops) -- cgit v1.1 From 80845a33165278f3236812009e9c568ba8c29938 Mon Sep 17 00:00:00 2001 From: Herton Ronaldo Krzesinski Date: Fri, 1 Apr 2011 14:12:02 -0300 Subject: [media] v4l: make sure drivers supply a zeroed struct v4l2_subdev Some v4l drivers currently don't initialize their struct v4l2_subdev with zeros, and this is a problem since some of the v4l2 code expects this. One example is the addition of internal_ops in commit 45f6f84, after that we are at risk of random oopses with these drivers when code in v4l2_device_register_subdev tries to dereference sd->internal_ops->*, as can be shown by the report at http://bugs.launchpad.net/bugs/745213 and analysis of its crash at https://lkml.org/lkml/2011/4/1/168 Use kzalloc within problematic drivers to ensure we have a zeroed struct v4l2_subdev. BugLink: http://bugs.launchpad.net/bugs/745213 Cc: Signed-off-by: Herton Ronaldo Krzesinski Signed-off-by: Mauro Carvalho Chehab --- drivers/media/radio/saa7706h.c | 2 +- drivers/media/radio/tef6862.c | 2 +- drivers/media/video/m52790.c | 2 +- drivers/media/video/tda9840.c | 2 +- drivers/media/video/tea6415c.c | 2 +- drivers/media/video/tea6420.c | 2 +- drivers/media/video/upd64031a.c | 2 +- drivers/media/video/upd64083.c | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/media/radio/saa7706h.c b/drivers/media/radio/saa7706h.c index 585680f..b1193df 100644 --- a/drivers/media/radio/saa7706h.c +++ b/drivers/media/radio/saa7706h.c @@ -376,7 +376,7 @@ static int __devinit saa7706h_probe(struct i2c_client *client, v4l_info(client, "chip found @ 0x%02x (%s)\n", client->addr << 1, client->adapter->name); - state = kmalloc(sizeof(struct saa7706h_state), GFP_KERNEL); + state = kzalloc(sizeof(struct saa7706h_state), GFP_KERNEL); if (state == NULL) return -ENOMEM; sd = &state->sd; diff --git a/drivers/media/radio/tef6862.c b/drivers/media/radio/tef6862.c index 7c0d777..0991e19 100644 --- a/drivers/media/radio/tef6862.c +++ b/drivers/media/radio/tef6862.c @@ -176,7 +176,7 @@ static int __devinit tef6862_probe(struct i2c_client *client, v4l_info(client, "chip found @ 0x%02x (%s)\n", client->addr << 1, client->adapter->name); - state = kmalloc(sizeof(struct tef6862_state), GFP_KERNEL); + state = kzalloc(sizeof(struct tef6862_state), GFP_KERNEL); if (state == NULL) return -ENOMEM; state->freq = TEF6862_LO_FREQ; diff --git a/drivers/media/video/m52790.c b/drivers/media/video/m52790.c index 5e1c9a8..303ffa7 100644 --- a/drivers/media/video/m52790.c +++ b/drivers/media/video/m52790.c @@ -174,7 +174,7 @@ static int m52790_probe(struct i2c_client *client, v4l_info(client, "chip found @ 0x%x (%s)\n", client->addr << 1, client->adapter->name); - state = kmalloc(sizeof(struct m52790_state), GFP_KERNEL); + state = kzalloc(sizeof(struct m52790_state), GFP_KERNEL); if (state == NULL) return -ENOMEM; diff --git a/drivers/media/video/tda9840.c b/drivers/media/video/tda9840.c index 5d4cf3b..22fa820 100644 --- a/drivers/media/video/tda9840.c +++ b/drivers/media/video/tda9840.c @@ -171,7 +171,7 @@ static int tda9840_probe(struct i2c_client *client, v4l_info(client, "chip found @ 0x%x (%s)\n", client->addr << 1, client->adapter->name); - sd = kmalloc(sizeof(struct v4l2_subdev), GFP_KERNEL); + sd = kzalloc(sizeof(struct v4l2_subdev), GFP_KERNEL); if (sd == NULL) return -ENOMEM; v4l2_i2c_subdev_init(sd, client, &tda9840_ops); diff --git a/drivers/media/video/tea6415c.c b/drivers/media/video/tea6415c.c index 19621ed5..827425c 100644 --- a/drivers/media/video/tea6415c.c +++ b/drivers/media/video/tea6415c.c @@ -152,7 +152,7 @@ static int tea6415c_probe(struct i2c_client *client, v4l_info(client, "chip found @ 0x%x (%s)\n", client->addr << 1, client->adapter->name); - sd = kmalloc(sizeof(struct v4l2_subdev), GFP_KERNEL); + sd = kzalloc(sizeof(struct v4l2_subdev), GFP_KERNEL); if (sd == NULL) return -ENOMEM; v4l2_i2c_subdev_init(sd, client, &tea6415c_ops); diff --git a/drivers/media/video/tea6420.c b/drivers/media/video/tea6420.c index 5ea8404..f350b6c 100644 --- a/drivers/media/video/tea6420.c +++ b/drivers/media/video/tea6420.c @@ -125,7 +125,7 @@ static int tea6420_probe(struct i2c_client *client, v4l_info(client, "chip found @ 0x%x (%s)\n", client->addr << 1, client->adapter->name); - sd = kmalloc(sizeof(struct v4l2_subdev), GFP_KERNEL); + sd = kzalloc(sizeof(struct v4l2_subdev), GFP_KERNEL); if (sd == NULL) return -ENOMEM; v4l2_i2c_subdev_init(sd, client, &tea6420_ops); diff --git a/drivers/media/video/upd64031a.c b/drivers/media/video/upd64031a.c index f8138c7..1aab96a 100644 --- a/drivers/media/video/upd64031a.c +++ b/drivers/media/video/upd64031a.c @@ -230,7 +230,7 @@ static int upd64031a_probe(struct i2c_client *client, v4l_info(client, "chip found @ 0x%x (%s)\n", client->addr << 1, client->adapter->name); - state = kmalloc(sizeof(struct upd64031a_state), GFP_KERNEL); + state = kzalloc(sizeof(struct upd64031a_state), GFP_KERNEL); if (state == NULL) return -ENOMEM; sd = &state->sd; diff --git a/drivers/media/video/upd64083.c b/drivers/media/video/upd64083.c index 28e0e6b..9bbe617 100644 --- a/drivers/media/video/upd64083.c +++ b/drivers/media/video/upd64083.c @@ -202,7 +202,7 @@ static int upd64083_probe(struct i2c_client *client, v4l_info(client, "chip found @ 0x%x (%s)\n", client->addr << 1, client->adapter->name); - state = kmalloc(sizeof(struct upd64083_state), GFP_KERNEL); + state = kzalloc(sizeof(struct upd64083_state), GFP_KERNEL); if (state == NULL) return -ENOMEM; sd = &state->sd; -- cgit v1.1 From b730011061e2805a46b7291e708b6caaf2be6869 Mon Sep 17 00:00:00 2001 From: Oliver Endriss Date: Tue, 29 Mar 2011 17:35:24 -0300 Subject: [media] ngene: Fix CI data transfer regression Fix CI data transfer regression introduced by previous cleanup. Signed-off-by: Oliver Endriss Signed-off-by: Mauro Carvalho Chehab --- drivers/media/dvb/ngene/ngene-core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/media/dvb/ngene/ngene-core.c b/drivers/media/dvb/ngene/ngene-core.c index 175a0f6..9630705 100644 --- a/drivers/media/dvb/ngene/ngene-core.c +++ b/drivers/media/dvb/ngene/ngene-core.c @@ -1520,6 +1520,7 @@ static int init_channel(struct ngene_channel *chan) if (dev->ci.en && (io & NGENE_IO_TSOUT)) { dvb_ca_en50221_init(adapter, dev->ci.en, 0, 1); set_transfer(chan, 1); + chan->dev->channel[2].DataFormatFlags = DF_SWAP32; set_transfer(&chan->dev->channel[2], 1); dvb_register_device(adapter, &chan->ci_dev, &ngene_dvbdev_ci, (void *) chan, -- cgit v1.1 From bfd36103ec26599557c2bd3225a1f1c9267f8fcb Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Fri, 29 Apr 2011 17:51:06 +0200 Subject: iwlagn: fix "Received BA when not expected" Need to use broadcast sta_id for management frames, otherwise we broke BA session in the firmware and get messages like that: "Received BA when not expected" or (on older kernels): "BA scd_flow 0 does not match txq_id 10" This fix regression introduced in 2.6.35 during station management code rewrite by: commit 2a87c26bbe9587baeb9e56d3ce0b4971bd777643 Author: Johannes Berg Date: Fri Apr 30 11:30:45 2010 -0700 iwlwifi: use iwl_find_station less Patch partially resolve: https://bugzilla.kernel.org/show_bug.cgi?id=16691 However, there are still 11n performance problems on 4965 and 5xxx devices that need to be investigated. Cc: stable@kernel.org # 2.6.35+ Signed-off-by: Stanislaw Gruszka Acked-by: Johannes Berg Acked-by: Wey-Yi Guy Signed-off-by: John W. Linville --- drivers/net/wireless/iwlwifi/iwl-agn-tx.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/drivers/net/wireless/iwlwifi/iwl-agn-tx.c b/drivers/net/wireless/iwlwifi/iwl-agn-tx.c index 2dd7d54..0712b672 100644 --- a/drivers/net/wireless/iwlwifi/iwl-agn-tx.c +++ b/drivers/net/wireless/iwlwifi/iwl-agn-tx.c @@ -568,12 +568,17 @@ int iwlagn_tx_skb(struct iwl_priv *priv, struct sk_buff *skb) hdr_len = ieee80211_hdrlen(fc); - /* Find index into station table for destination station */ - sta_id = iwl_sta_id_or_broadcast(priv, ctx, info->control.sta); - if (sta_id == IWL_INVALID_STATION) { - IWL_DEBUG_DROP(priv, "Dropping - INVALID STATION: %pM\n", - hdr->addr1); - goto drop_unlock; + /* For management frames use broadcast id to do not break aggregation */ + if (!ieee80211_is_data(fc)) + sta_id = ctx->bcast_sta_id; + else { + /* Find index into station table for destination station */ + sta_id = iwl_sta_id_or_broadcast(priv, ctx, info->control.sta); + if (sta_id == IWL_INVALID_STATION) { + IWL_DEBUG_DROP(priv, "Dropping - INVALID STATION: %pM\n", + hdr->addr1); + goto drop_unlock; + } } IWL_DEBUG_TX(priv, "station Id %d\n", sta_id); -- cgit v1.1 From 16b345d89686ca0482a9ca741a1167def1abdd7f Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Fri, 29 Apr 2011 17:51:56 +0200 Subject: iwl4965: fix "Received BA when not expected" Need to use broadcast sta_id for management frames, otherwise we broke BA session in the firmware and get messages like that: "Received BA when not expected" or (on older kernels): "BA scd_flow 0 does not match txq_id 10" This fix regression introduced in 2.6.35 during station management code rewrite by: commit 2a87c26bbe9587baeb9e56d3ce0b4971bd777643 Author: Johannes Berg Date: Fri Apr 30 11:30:45 2010 -0700 iwlwifi: use iwl_find_station less Signed-off-by: Stanislaw Gruszka Signed-off-by: John W. Linville --- drivers/net/wireless/iwlegacy/iwl-4965-tx.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/drivers/net/wireless/iwlegacy/iwl-4965-tx.c b/drivers/net/wireless/iwlegacy/iwl-4965-tx.c index fbec88d..79ac081 100644 --- a/drivers/net/wireless/iwlegacy/iwl-4965-tx.c +++ b/drivers/net/wireless/iwlegacy/iwl-4965-tx.c @@ -316,12 +316,18 @@ int iwl4965_tx_skb(struct iwl_priv *priv, struct sk_buff *skb) hdr_len = ieee80211_hdrlen(fc); - /* Find index into station table for destination station */ - sta_id = iwl_legacy_sta_id_or_broadcast(priv, ctx, info->control.sta); - if (sta_id == IWL_INVALID_STATION) { - IWL_DEBUG_DROP(priv, "Dropping - INVALID STATION: %pM\n", - hdr->addr1); - goto drop_unlock; + /* For management frames use broadcast id to do not break aggregation */ + if (!ieee80211_is_data(fc)) + sta_id = ctx->bcast_sta_id; + else { + /* Find index into station table for destination station */ + sta_id = iwl_legacy_sta_id_or_broadcast(priv, ctx, info->control.sta); + + if (sta_id == IWL_INVALID_STATION) { + IWL_DEBUG_DROP(priv, "Dropping - INVALID STATION: %pM\n", + hdr->addr1); + goto drop_unlock; + } } IWL_DEBUG_TX(priv, "station Id %d\n", sta_id); -- cgit v1.1 From 8333a46ad3877485e4d67ef499c6dda36bfd1f9a Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Tue, 26 Apr 2011 10:30:11 +0000 Subject: bnx2: cancel timer on device removal This oops was recently reported to me: invalid opcode: 0000 [#1] SMP last sysfs file: /sys/devices/pci0000:00/0000:00:01.0/0000:01:0d.0/0000:02:05.0/device CPU 1 Modules linked in: bnx2(+) sunrpc ipv6 dm_mirror dm_region_hash dm_log sg microcode serio_raw amd64_edac_mod edac_core edac_mce_amd k8temp i2c_piix4 shpchp ext4 mbcache jbd2 sd_mod crc_t10dif mptsas mptscsih mptbase scsi_transport_sas radeon ttm drm_kms_helper drm hwmon i2c_algo_bit i2c_core dm_mod [last unloaded: bnx2] Modules linked in: bnx2(+) sunrpc ipv6 dm_mirror dm_region_hash dm_log sg microcode serio_raw amd64_edac_mod edac_core edac_mce_amd k8temp i2c_piix4 shpchp ext4 mbcache jbd2 sd_mod crc_t10dif mptsas mptscsih mptbase scsi_transport_sas radeon ttm drm_kms_helper drm hwmon i2c_algo_bit i2c_core dm_mod [last unloaded: bnx2] Pid: 23900, comm: pidof Not tainted 2.6.32-130.el6.x86_64 #1 BladeCenter LS21 -[797251Z]- RIP: 0010:[] [] 0xffffffffa058b270 RSP: 0018:ffff880002083e48 EFLAGS: 00010246 RAX: ffff880002083e90 RBX: ffff88007ccd4000 RCX: 0000000000000000 RDX: 0000000000000100 RSI: dead000000200200 RDI: ffff8800007b8700 RBP: ffff880002083ed0 R08: ffff88000208db40 R09: 0000022d191d27c8 R10: 0000000000000000 R11: 0000000000000000 R12: ffff8800007b9bc8 R13: ffff880002083e90 R14: ffff8800007b8700 R15: ffffffffa058b270 FS: 00007fbb3bcf7700(0000) GS:ffff880002080000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000001664a98 CR3: 0000000060395000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process pidof (pid: 23900, threadinfo ffff8800007e8000, task ffff8800091c0040) Stack: ffffffff81079f77 ffffffff8109e010 ffff88007ccd5c20 ffff88007ccd5820 <0> ffff88007ccd5420 ffff8800007e9fd8 ffff8800007e9fd8 0000010000000000 <0> ffff88007ccd5020 ffff880002083e90 ffff880002083e90 ffffffff8102a00d Call Trace: [] ? run_timer_softirq+0x197/0x340 [] ? tick_sched_timer+0x0/0xc0 [] ? lapic_next_event+0x1d/0x30 [] __do_softirq+0xb7/0x1e0 [] ? hrtimer_interrupt+0x140/0x250 [] ? filldir+0x0/0xe0 [] call_softirq+0x1c/0x30 [] do_softirq+0x65/0xa0 [] irq_exit+0x85/0x90 [] smp_apic_timer_interrupt+0x70/0x9b [] apic_timer_interrupt+0x13/0x20 [] ? selinux_file_permission+0x45/0x150 [] ? _atomic_dec_and_lock+0x55/0x80 [] security_file_permission+0x16/0x20 [] vfs_readdir+0x71/0xe0 [] sys_getdents+0x89/0xf0 [] system_call_fastpath+0x16/0x1b It occured during some stress testing, in which the reporter was repeatedly removing and modprobing the bnx2 module while doing various other random operations on the bnx2 registered net device. Noting that this error occured on a serdes based device, we noted that there were a few ethtool operations (most notably self_test and set_phys_id) that have execution paths that lead into bnx2_setup_serdes_phy. This function is notable because it executes a mod_timer call, which starts the bp->timer running. Currently bnx2 is setup to assume that this timer only nees to be stopped when bnx2_close or bnx2_suspend is called. Since the above ethtool operations are not gated on the net device having been opened however, that assumption is incorrect, and can lead to the timer still running after the module has been removed, leading to the oops above (as well as other simmilar oopses). Fix the problem by ensuring that the timer is stopped when pci_device_unregister is called. Signed-off-by: Neil Horman Reported-by: Hushan Jia CC: Michael Chan CC: "David S. Miller" Acked-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/bnx2.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/bnx2.c b/drivers/net/bnx2.c index 8e6d618..d8383a9 100644 --- a/drivers/net/bnx2.c +++ b/drivers/net/bnx2.c @@ -8413,6 +8413,8 @@ bnx2_remove_one(struct pci_dev *pdev) unregister_netdev(dev); + del_timer_sync(&bp->timer); + if (bp->mips_firmware) release_firmware(bp->mips_firmware); if (bp->rv2p_firmware) -- cgit v1.1 From b3c914aa84f4e4bbb3efc8f41c359d96e5e932d2 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 27 Apr 2011 09:54:28 +0000 Subject: usbnet: add support for some Huawei modems with cdc-ether ports Some newer Huawei devices (T-Mobile Rocket, others) have cdc-ether compatible ports, so recognize and expose them. Signed-off-by: Dan Williams Acked-by: Oliver Neukum Signed-off-by: David S. Miller --- drivers/net/usb/cdc_ether.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/net/usb/cdc_ether.c b/drivers/net/usb/cdc_ether.c index 341f705..a301479 100644 --- a/drivers/net/usb/cdc_ether.c +++ b/drivers/net/usb/cdc_ether.c @@ -460,7 +460,7 @@ static const struct driver_info cdc_info = { .manage_power = cdc_manage_power, }; -static const struct driver_info mbm_info = { +static const struct driver_info wwan_info = { .description = "Mobile Broadband Network Device", .flags = FLAG_WWAN, .bind = usbnet_cdc_bind, @@ -471,6 +471,7 @@ static const struct driver_info mbm_info = { /*-------------------------------------------------------------------------*/ +#define HUAWEI_VENDOR_ID 0x12D1 static const struct usb_device_id products [] = { /* @@ -587,8 +588,17 @@ static const struct usb_device_id products [] = { }, { USB_INTERFACE_INFO(USB_CLASS_COMM, USB_CDC_SUBCLASS_MDLM, USB_CDC_PROTO_NONE), - .driver_info = (unsigned long)&mbm_info, + .driver_info = (unsigned long)&wwan_info, +}, { + /* Various Huawei modems with a network port like the UMG1831 */ + .match_flags = USB_DEVICE_ID_MATCH_VENDOR + | USB_DEVICE_ID_MATCH_INT_INFO, + .idVendor = HUAWEI_VENDOR_ID, + .bInterfaceClass = USB_CLASS_COMM, + .bInterfaceSubClass = USB_CDC_SUBCLASS_ETHERNET, + .bInterfaceProtocol = 255, + .driver_info = (unsigned long)&wwan_info, }, { }, // END }; -- cgit v1.1 From 686f13bb17784fbf8595a59ff4e4bd707d5ae66f Mon Sep 17 00:00:00 2001 From: Oliver Neukum Date: Fri, 29 Apr 2011 14:15:53 +0200 Subject: usbnet: Transfer of maintainership Somebody has to do it, however unfortunate be the cause. Signed-off-by: Oliver Neukum Signed-off-by: David S. Miller --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index c85368d..eb4f996 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6551,7 +6551,7 @@ S: Maintained F: drivers/usb/host/uhci* USB "USBNET" DRIVER FRAMEWORK -M: David Brownell +M: Oliver Neukum L: netdev@vger.kernel.org W: http://www.linux-usb.org/usbnet S: Maintained -- cgit v1.1 From eee9700c5dd8fbac517c8c1e85c60b688bc311a9 Mon Sep 17 00:00:00 2001 From: Adam Jaremko Date: Thu, 28 Apr 2011 07:41:18 +0000 Subject: net: ftmac100: fix scheduling while atomic during PHY link status change Signed-off-by: Adam Jaremko Acked-by: Po-Yu Chuang Signed-off-by: David S. Miller --- drivers/net/ftmac100.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/ftmac100.c b/drivers/net/ftmac100.c index a316619..9bd7746 100644 --- a/drivers/net/ftmac100.c +++ b/drivers/net/ftmac100.c @@ -139,11 +139,11 @@ static int ftmac100_reset(struct ftmac100 *priv) * that hardware reset completed (what the f*ck). * We still need to wait for a while. */ - usleep_range(500, 1000); + udelay(500); return 0; } - usleep_range(1000, 10000); + udelay(1000); } netdev_err(netdev, "software reset failed\n"); @@ -772,7 +772,7 @@ static int ftmac100_mdio_read(struct net_device *netdev, int phy_id, int reg) if ((phycr & FTMAC100_PHYCR_MIIRD) == 0) return phycr & FTMAC100_PHYCR_MIIRDATA; - usleep_range(100, 1000); + udelay(100); } netdev_err(netdev, "mdio read timed out\n"); @@ -801,7 +801,7 @@ static void ftmac100_mdio_write(struct net_device *netdev, int phy_id, int reg, if ((phycr & FTMAC100_PHYCR_MIIWR) == 0) return; - usleep_range(100, 1000); + udelay(100); } netdev_err(netdev, "mdio write timed out\n"); -- cgit v1.1 From 947b4ad1d198b7303ecc961f4939a331be0c48f0 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 29 Apr 2011 22:52:42 +0200 Subject: perf list: Fix max event string size Recent stalled-cycles event names were larger than the 40 chars printout used by perf list. Extend that, make it robust for future extensions and also adjust alignments in face of wider event names. Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Link: http://lkml.kernel.org/n/tip-7y40wib8n009io7hjpn1dsrm@git.kernel.org Signed-off-by: Ingo Molnar --- tools/perf/util/parse-events.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 8a407f3..ffa493a 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -929,7 +929,7 @@ void print_tracepoint_events(const char *subsys_glob, const char *event_glob) snprintf(evt_path, MAXPATHLEN, "%s:%s", sys_dirent.d_name, evt_dirent.d_name); - printf(" %-42s [%s]\n", evt_path, + printf(" %-50s [%s]\n", evt_path, event_type_descriptors[PERF_TYPE_TRACEPOINT]); } closedir(evt_dir); @@ -994,7 +994,7 @@ void print_events_type(u8 type) else snprintf(name, sizeof(name), "%s", syms->symbol); - printf(" %-42s [%s]\n", name, + printf(" %-50s [%s]\n", name, event_type_descriptors[type]); } } @@ -1012,11 +1012,10 @@ int print_hwcache_events(const char *event_glob) for (i = 0; i < PERF_COUNT_HW_CACHE_RESULT_MAX; i++) { char *name = event_cache_name(type, op, i); - if (event_glob != NULL && - !strglobmatch(name, event_glob)) + if (event_glob != NULL && !strglobmatch(name, event_glob)) continue; - printf(" %-42s [%s]\n", name, + printf(" %-50s [%s]\n", name, event_type_descriptors[PERF_TYPE_HW_CACHE]); ++printed; } @@ -1026,14 +1025,16 @@ int print_hwcache_events(const char *event_glob) return printed; } +#define MAX_NAME_LEN 100 + /* * Print the help text for the event symbols: */ void print_events(const char *event_glob) { - struct event_symbol *syms = event_symbols; unsigned int i, type, prev_type = -1, printed = 0, ntypes_printed = 0; - char name[40]; + struct event_symbol *syms = event_symbols; + char name[MAX_NAME_LEN]; printf("\n"); printf("List of pre-defined events (to be used in -e):\n"); @@ -1053,10 +1054,10 @@ void print_events(const char *event_glob) continue; if (strlen(syms->alias)) - sprintf(name, "%s OR %s", syms->symbol, syms->alias); + snprintf(name, MAX_NAME_LEN, "%s OR %s", syms->symbol, syms->alias); else - strcpy(name, syms->symbol); - printf(" %-42s [%s]\n", name, + strncpy(name, syms->symbol, MAX_NAME_LEN); + printf(" %-50s [%s]\n", name, event_type_descriptors[type]); prev_type = type; @@ -1073,12 +1074,12 @@ void print_events(const char *event_glob) return; printf("\n"); - printf(" %-42s [%s]\n", + printf(" %-50s [%s]\n", "rNNN (see 'perf list --help' on how to encode it)", event_type_descriptors[PERF_TYPE_RAW]); printf("\n"); - printf(" %-42s [%s]\n", + printf(" %-50s [%s]\n", "mem:[:access]", event_type_descriptors[PERF_TYPE_BREAKPOINT]); printf("\n"); -- cgit v1.1 From 2b5a4ace664cfe05c17bee60c4da66263a05fccf Mon Sep 17 00:00:00 2001 From: artpol Date: Wed, 27 Apr 2011 17:49:14 +0000 Subject: mii: add support of pause frames in mii_get_an Add support of pause frames advertise in mii_get_an. This provides all drivers that use mii_ethtool_gset to represent their own and Link partner flow control abilities in ethtool. Signed-off-by: Artem Polyakov Signed-off-by: David S. Miller --- drivers/net/mii.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/mii.c b/drivers/net/mii.c index 0a6c6a2..d4fc00b 100644 --- a/drivers/net/mii.c +++ b/drivers/net/mii.c @@ -49,6 +49,10 @@ static u32 mii_get_an(struct mii_if_info *mii, u16 addr) result |= ADVERTISED_100baseT_Half; if (advert & ADVERTISE_100FULL) result |= ADVERTISED_100baseT_Full; + if (advert & ADVERTISE_PAUSE_CAP) + result |= ADVERTISED_Pause; + if (advert & ADVERTISE_PAUSE_ASYM) + result |= ADVERTISED_Asym_Pause; return result; } -- cgit v1.1 From 773d67903ad608d3f64cc5b00e2f881473413c13 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 26 Apr 2011 09:18:51 -0700 Subject: misc: fix ti-st build issues st_drv uses skb*() interfaces, so it should depend on NET. It also uses GPIO interfaces, so it should depend on GPIOLIB. st_kim.c uses syss_*() calls, so it should #include . Fixes these observed build errors: ERROR: "skb_queue_purge" [drivers/misc/ti-st/st_drv.ko] undefined! ERROR: "skb_pull" [drivers/misc/ti-st/st_drv.ko] undefined! ERROR: "skb_queue_tail" [drivers/misc/ti-st/st_drv.ko] undefined! ERROR: "__alloc_skb" [drivers/misc/ti-st/st_drv.ko] undefined! ERROR: "kfree_skb" [drivers/misc/ti-st/st_drv.ko] undefined! ERROR: "skb_dequeue" [drivers/misc/ti-st/st_drv.ko] undefined! ERROR: "skb_put" [drivers/misc/ti-st/st_drv.ko] undefined! Signed-off-by: Randy Dunlap Cc: Pavan Savoy Signed-off-by: Greg Kroah-Hartman --- drivers/misc/ti-st/Kconfig | 1 + drivers/misc/ti-st/st_kim.c | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/misc/ti-st/Kconfig b/drivers/misc/ti-st/Kconfig index 7c3e1069..abb5de1 100644 --- a/drivers/misc/ti-st/Kconfig +++ b/drivers/misc/ti-st/Kconfig @@ -5,6 +5,7 @@ menu "Texas Instruments shared transport line discipline" config TI_ST tristate "Shared transport core driver" + depends on NET && GPIOLIB select FW_LOADER help This enables the shared transport core driver for TI diff --git a/drivers/misc/ti-st/st_kim.c b/drivers/misc/ti-st/st_kim.c index b4488c8..5da93ee 100644 --- a/drivers/misc/ti-st/st_kim.c +++ b/drivers/misc/ti-st/st_kim.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include -- cgit v1.1 From 57d5f9f808b7650a92f31e9cd3acd3f415a22530 Mon Sep 17 00:00:00 2001 From: Mike Waychison Date: Mon, 14 Mar 2011 23:58:45 -0700 Subject: x86: get_bios_ebda_length() Add a wrapper routine that tells us the length of the EBDA if it is present. This guy also ensures that the returned length doesn't let the EBDA run past the 640KiB mark. Signed-off-by: Mike Waychison Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/bios_ebda.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/arch/x86/include/asm/bios_ebda.h b/arch/x86/include/asm/bios_ebda.h index 3c75210..5174cf0 100644 --- a/arch/x86/include/asm/bios_ebda.h +++ b/arch/x86/include/asm/bios_ebda.h @@ -14,6 +14,27 @@ static inline unsigned int get_bios_ebda(void) return address; /* 0 means none */ } +/* + * Return the sanitized length of the EBDA in bytes, if it exists. + */ +static inline unsigned int get_bios_ebda_length(void) +{ + unsigned int address; + unsigned int length; + + address = get_bios_ebda(); + if (!address) + return 0; + + /* EBDA length is byte 0 of the EBDA (stored in KiB) */ + length = *(unsigned char *)phys_to_virt(address); + length <<= 10; + + /* Trim the length if it extends beyond 640KiB */ + length = min_t(unsigned int, (640 * 1024) - address, length); + return length; +} + void reserve_ebda_region(void); #ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION -- cgit v1.1 From f548ccd47d608e88d432745091e13f927ced83f7 Mon Sep 17 00:00:00 2001 From: Mike Waychison Date: Mon, 14 Mar 2011 23:58:50 -0700 Subject: x86: Better comments for get_bios_ebda() Make the comments a bit clearer for get_bios_ebda so that it actually tells us what it is returning. Signed-off-by: Mike Waychison Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/bios_ebda.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/bios_ebda.h b/arch/x86/include/asm/bios_ebda.h index 5174cf0..aa6a317 100644 --- a/arch/x86/include/asm/bios_ebda.h +++ b/arch/x86/include/asm/bios_ebda.h @@ -4,11 +4,14 @@ #include /* - * there is a real-mode segmented pointer pointing to the - * 4K EBDA area at 0x40E. + * Returns physical address of EBDA. Returns 0 if there is no EBDA. */ static inline unsigned int get_bios_ebda(void) { + /* + * There is a real-mode segmented pointer pointing to the + * 4K EBDA area at 0x40E. + */ unsigned int address = *(unsigned short *)phys_to_virt(0x40E); address <<= 4; return address; /* 0 means none */ -- cgit v1.1 From c63ca0c01d73563d4e2ab174bb3dd1e5efb907e6 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 29 Apr 2011 16:04:15 -0600 Subject: perf stat: Tell user about unsupported events in the list Similar to perf-record, tell user about unsupported events that will not be counted if invoked in verbose mode. e.g., $ perf stat -e dTLB-prefetch-misses -v -- sleep 1 dTLB-prefetch-misses event is not supported by the kernel. dTLB-prefetch-misses: 0 0 0 Performance counter stats for 'sleep 1': dTLB-prefetch-misses 1.001884783 seconds time elapsed Signed-off-by: David Ahern Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/1304114655-10600-1-git-send-email-dsahern@gmail.com Signed-off-by: Ingo Molnar --- tools/perf/builtin-stat.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index c8b535b..602c3c9 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -377,8 +377,12 @@ static int run_perf_stat(int argc __used, const char **argv) list_for_each_entry(counter, &evsel_list->entries, node) { if (create_perf_stat_counter(counter) < 0) { - if (errno == EINVAL || errno == ENOSYS || errno == ENOENT) + if (errno == EINVAL || errno == ENOSYS || errno == ENOENT) { + if (verbose) + ui__warning("%s event is not supported by the kernel.\n", + event_name(counter)); continue; + } if (errno == EPERM || errno == EACCES) { error("You may not have permission to collect %sstats.\n" -- cgit v1.1 From 85eb8c8d0b0900c073b0e6f89979ac9c439ade1a Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sat, 30 Apr 2011 00:25:44 +0200 Subject: PM / Runtime: Generic clock manipulation rountines for runtime PM (v6) Many different platforms and subsystems may want to disable device clocks during suspend and enable them during resume which is going to be done in a very similar way in all those cases. For this reason, provide generic routines for the manipulation of device clocks during suspend and resume. Convert the ARM shmobile platform to using the new routines. Signed-off-by: Rafael J. Wysocki --- arch/arm/mach-shmobile/pm_runtime.c | 140 +----------- drivers/base/power/Makefile | 1 + drivers/base/power/clock_ops.c | 430 ++++++++++++++++++++++++++++++++++++ include/linux/pm_runtime.h | 42 ++++ kernel/power/Kconfig | 4 + 5 files changed, 486 insertions(+), 131 deletions(-) create mode 100644 drivers/base/power/clock_ops.c diff --git a/arch/arm/mach-shmobile/pm_runtime.c b/arch/arm/mach-shmobile/pm_runtime.c index 30bbe9a..2d1b67a 100644 --- a/arch/arm/mach-shmobile/pm_runtime.c +++ b/arch/arm/mach-shmobile/pm_runtime.c @@ -21,70 +21,6 @@ #include #ifdef CONFIG_PM_RUNTIME -#define BIT_ONCE 0 -#define BIT_ACTIVE 1 -#define BIT_CLK_ENABLED 2 - -struct pm_runtime_data { - unsigned long flags; - struct clk *clk; -}; - -static struct pm_runtime_data *__to_prd(struct device *dev) -{ - return dev ? dev->power.subsys_data : NULL; -} - -static void platform_pm_runtime_init(struct device *dev, - struct pm_runtime_data *prd) -{ - if (prd && !test_and_set_bit(BIT_ONCE, &prd->flags)) { - prd->clk = clk_get(dev, NULL); - if (!IS_ERR(prd->clk)) { - set_bit(BIT_ACTIVE, &prd->flags); - dev_info(dev, "clocks managed by runtime pm\n"); - } - } -} - -static void platform_pm_runtime_bug(struct device *dev, - struct pm_runtime_data *prd) -{ - if (prd && !test_and_set_bit(BIT_ONCE, &prd->flags)) - dev_err(dev, "runtime pm suspend before resume\n"); -} - -static int default_platform_runtime_suspend(struct device *dev) -{ - struct pm_runtime_data *prd = __to_prd(dev); - - dev_dbg(dev, "%s()\n", __func__); - - platform_pm_runtime_bug(dev, prd); - - if (prd && test_bit(BIT_ACTIVE, &prd->flags)) { - clk_disable(prd->clk); - clear_bit(BIT_CLK_ENABLED, &prd->flags); - } - - return 0; -} - -static int default_platform_runtime_resume(struct device *dev) -{ - struct pm_runtime_data *prd = __to_prd(dev); - - dev_dbg(dev, "%s()\n", __func__); - - platform_pm_runtime_init(dev, prd); - - if (prd && test_bit(BIT_ACTIVE, &prd->flags)) { - clk_enable(prd->clk); - set_bit(BIT_CLK_ENABLED, &prd->flags); - } - - return 0; -} static int default_platform_runtime_idle(struct device *dev) { @@ -94,87 +30,29 @@ static int default_platform_runtime_idle(struct device *dev) static struct dev_power_domain default_power_domain = { .ops = { - .runtime_suspend = default_platform_runtime_suspend, - .runtime_resume = default_platform_runtime_resume, + .runtime_suspend = pm_runtime_clk_suspend, + .runtime_resume = pm_runtime_clk_resume, .runtime_idle = default_platform_runtime_idle, USE_PLATFORM_PM_SLEEP_OPS }, }; -static int platform_bus_notify(struct notifier_block *nb, - unsigned long action, void *data) -{ - struct device *dev = data; - struct pm_runtime_data *prd; - - dev_dbg(dev, "platform_bus_notify() %ld !\n", action); - - switch (action) { - case BUS_NOTIFY_BIND_DRIVER: - prd = kzalloc(sizeof(*prd), GFP_KERNEL); - if (prd) { - dev->power.subsys_data = prd; - dev->pwr_domain = &default_power_domain; - } else { - dev_err(dev, "unable to alloc memory for runtime pm\n"); - } - break; - case BUS_NOTIFY_UNBOUND_DRIVER: - prd = __to_prd(dev); - if (prd) { - if (test_bit(BIT_CLK_ENABLED, &prd->flags)) - clk_disable(prd->clk); +#define DEFAULT_PWR_DOMAIN_PTR (&default_power_domain) - if (test_bit(BIT_ACTIVE, &prd->flags)) - clk_put(prd->clk); - } - break; - } +#else - return 0; -} - -#else /* CONFIG_PM_RUNTIME */ - -static int platform_bus_notify(struct notifier_block *nb, - unsigned long action, void *data) -{ - struct device *dev = data; - struct clk *clk; - - dev_dbg(dev, "platform_bus_notify() %ld !\n", action); - - switch (action) { - case BUS_NOTIFY_BIND_DRIVER: - clk = clk_get(dev, NULL); - if (!IS_ERR(clk)) { - clk_enable(clk); - clk_put(clk); - dev_info(dev, "runtime pm disabled, clock forced on\n"); - } - break; - case BUS_NOTIFY_UNBOUND_DRIVER: - clk = clk_get(dev, NULL); - if (!IS_ERR(clk)) { - clk_disable(clk); - clk_put(clk); - dev_info(dev, "runtime pm disabled, clock forced off\n"); - } - break; - } - - return 0; -} +#define DEFAULT_PWR_DOMAIN_PTR NULL #endif /* CONFIG_PM_RUNTIME */ -static struct notifier_block platform_bus_notifier = { - .notifier_call = platform_bus_notify +static struct pm_clk_notifier_block platform_bus_notifier = { + .pwr_domain = DEFAULT_PWR_DOMAIN_PTR, + .con_ids = { NULL, }, }; static int __init sh_pm_runtime_init(void) { - bus_register_notifier(&platform_bus_type, &platform_bus_notifier); + pm_runtime_clk_add_notifier(&platform_bus_type, &platform_bus_notifier); return 0; } core_initcall(sh_pm_runtime_init); diff --git a/drivers/base/power/Makefile b/drivers/base/power/Makefile index 118c1b9..06a7073 100644 --- a/drivers/base/power/Makefile +++ b/drivers/base/power/Makefile @@ -3,6 +3,7 @@ obj-$(CONFIG_PM_SLEEP) += main.o wakeup.o obj-$(CONFIG_PM_RUNTIME) += runtime.o obj-$(CONFIG_PM_TRACE_RTC) += trace.o obj-$(CONFIG_PM_OPP) += opp.o +obj-$(CONFIG_HAVE_CLK) += clock_ops.o ccflags-$(CONFIG_DEBUG_DRIVER) := -DDEBUG ccflags-$(CONFIG_PM_VERBOSE) += -DDEBUG diff --git a/drivers/base/power/clock_ops.c b/drivers/base/power/clock_ops.c new file mode 100644 index 0000000..d74abf3 --- /dev/null +++ b/drivers/base/power/clock_ops.c @@ -0,0 +1,430 @@ +/* + * drivers/base/power/clock_ops.c - Generic clock manipulation PM callbacks + * + * Copyright (c) 2011 Rafael J. Wysocki , Renesas Electronics Corp. + * + * This file is released under the GPLv2. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_PM_RUNTIME + +struct pm_runtime_clk_data { + struct list_head clock_list; + struct mutex lock; +}; + +enum pce_status { + PCE_STATUS_NONE = 0, + PCE_STATUS_ACQUIRED, + PCE_STATUS_ENABLED, + PCE_STATUS_ERROR, +}; + +struct pm_clock_entry { + struct list_head node; + char *con_id; + struct clk *clk; + enum pce_status status; +}; + +static struct pm_runtime_clk_data *__to_prd(struct device *dev) +{ + return dev ? dev->power.subsys_data : NULL; +} + +/** + * pm_runtime_clk_add - Start using a device clock for runtime PM. + * @dev: Device whose clock is going to be used for runtime PM. + * @con_id: Connection ID of the clock. + * + * Add the clock represented by @con_id to the list of clocks used for + * the runtime PM of @dev. + */ +int pm_runtime_clk_add(struct device *dev, const char *con_id) +{ + struct pm_runtime_clk_data *prd = __to_prd(dev); + struct pm_clock_entry *ce; + + if (!prd) + return -EINVAL; + + ce = kzalloc(sizeof(*ce), GFP_KERNEL); + if (!ce) { + dev_err(dev, "Not enough memory for clock entry.\n"); + return -ENOMEM; + } + + if (con_id) { + ce->con_id = kstrdup(con_id, GFP_KERNEL); + if (!ce->con_id) { + dev_err(dev, + "Not enough memory for clock connection ID.\n"); + kfree(ce); + return -ENOMEM; + } + } + + mutex_lock(&prd->lock); + list_add_tail(&ce->node, &prd->clock_list); + mutex_unlock(&prd->lock); + return 0; +} + +/** + * __pm_runtime_clk_remove - Destroy runtime PM clock entry. + * @ce: Runtime PM clock entry to destroy. + * + * This routine must be called under the mutex protecting the runtime PM list + * of clocks corresponding the the @ce's device. + */ +static void __pm_runtime_clk_remove(struct pm_clock_entry *ce) +{ + if (!ce) + return; + + list_del(&ce->node); + + if (ce->status < PCE_STATUS_ERROR) { + if (ce->status == PCE_STATUS_ENABLED) + clk_disable(ce->clk); + + if (ce->status >= PCE_STATUS_ACQUIRED) + clk_put(ce->clk); + } + + if (ce->con_id) + kfree(ce->con_id); + + kfree(ce); +} + +/** + * pm_runtime_clk_remove - Stop using a device clock for runtime PM. + * @dev: Device whose clock should not be used for runtime PM any more. + * @con_id: Connection ID of the clock. + * + * Remove the clock represented by @con_id from the list of clocks used for + * the runtime PM of @dev. + */ +void pm_runtime_clk_remove(struct device *dev, const char *con_id) +{ + struct pm_runtime_clk_data *prd = __to_prd(dev); + struct pm_clock_entry *ce; + + if (!prd) + return; + + mutex_lock(&prd->lock); + + list_for_each_entry(ce, &prd->clock_list, node) { + if (!con_id && !ce->con_id) { + __pm_runtime_clk_remove(ce); + break; + } else if (!con_id || !ce->con_id) { + continue; + } else if (!strcmp(con_id, ce->con_id)) { + __pm_runtime_clk_remove(ce); + break; + } + } + + mutex_unlock(&prd->lock); +} + +/** + * pm_runtime_clk_init - Initialize a device's list of runtime PM clocks. + * @dev: Device to initialize the list of runtime PM clocks for. + * + * Allocate a struct pm_runtime_clk_data object, initialize its lock member and + * make the @dev's power.subsys_data field point to it. + */ +int pm_runtime_clk_init(struct device *dev) +{ + struct pm_runtime_clk_data *prd; + + prd = kzalloc(sizeof(*prd), GFP_KERNEL); + if (!prd) { + dev_err(dev, "Not enough memory fo runtime PM data.\n"); + return -ENOMEM; + } + + INIT_LIST_HEAD(&prd->clock_list); + mutex_init(&prd->lock); + dev->power.subsys_data = prd; + return 0; +} + +/** + * pm_runtime_clk_destroy - Destroy a device's list of runtime PM clocks. + * @dev: Device to destroy the list of runtime PM clocks for. + * + * Clear the @dev's power.subsys_data field, remove the list of clock entries + * from the struct pm_runtime_clk_data object pointed to by it before and free + * that object. + */ +void pm_runtime_clk_destroy(struct device *dev) +{ + struct pm_runtime_clk_data *prd = __to_prd(dev); + struct pm_clock_entry *ce, *c; + + if (!prd) + return; + + dev->power.subsys_data = NULL; + + mutex_lock(&prd->lock); + + list_for_each_entry_safe_reverse(ce, c, &prd->clock_list, node) + __pm_runtime_clk_remove(ce); + + mutex_unlock(&prd->lock); + + kfree(prd); +} + +/** + * pm_runtime_clk_acquire - Acquire a device clock. + * @dev: Device whose clock is to be acquired. + * @con_id: Connection ID of the clock. + */ +static void pm_runtime_clk_acquire(struct device *dev, + struct pm_clock_entry *ce) +{ + ce->clk = clk_get(dev, ce->con_id); + if (IS_ERR(ce->clk)) { + ce->status = PCE_STATUS_ERROR; + } else { + ce->status = PCE_STATUS_ACQUIRED; + dev_dbg(dev, "Clock %s managed by runtime PM.\n", ce->con_id); + } +} + +/** + * pm_runtime_clk_suspend - Disable clocks in a device's runtime PM clock list. + * @dev: Device to disable the clocks for. + */ +int pm_runtime_clk_suspend(struct device *dev) +{ + struct pm_runtime_clk_data *prd = __to_prd(dev); + struct pm_clock_entry *ce; + + dev_dbg(dev, "%s()\n", __func__); + + if (!prd) + return 0; + + mutex_lock(&prd->lock); + + list_for_each_entry_reverse(ce, &prd->clock_list, node) { + if (ce->status == PCE_STATUS_NONE) + pm_runtime_clk_acquire(dev, ce); + + if (ce->status < PCE_STATUS_ERROR) { + clk_disable(ce->clk); + ce->status = PCE_STATUS_ACQUIRED; + } + } + + mutex_unlock(&prd->lock); + + return 0; +} + +/** + * pm_runtime_clk_resume - Enable clocks in a device's runtime PM clock list. + * @dev: Device to enable the clocks for. + */ +int pm_runtime_clk_resume(struct device *dev) +{ + struct pm_runtime_clk_data *prd = __to_prd(dev); + struct pm_clock_entry *ce; + + dev_dbg(dev, "%s()\n", __func__); + + if (!prd) + return 0; + + mutex_lock(&prd->lock); + + list_for_each_entry(ce, &prd->clock_list, node) { + if (ce->status == PCE_STATUS_NONE) + pm_runtime_clk_acquire(dev, ce); + + if (ce->status < PCE_STATUS_ERROR) { + clk_enable(ce->clk); + ce->status = PCE_STATUS_ENABLED; + } + } + + mutex_unlock(&prd->lock); + + return 0; +} + +/** + * pm_runtime_clk_notify - Notify routine for device addition and removal. + * @nb: Notifier block object this function is a member of. + * @action: Operation being carried out by the caller. + * @data: Device the routine is being run for. + * + * For this function to work, @nb must be a member of an object of type + * struct pm_clk_notifier_block containing all of the requisite data. + * Specifically, the pwr_domain member of that object is copied to the device's + * pwr_domain field and its con_ids member is used to populate the device's list + * of runtime PM clocks, depending on @action. + * + * If the device's pwr_domain field is already populated with a value different + * from the one stored in the struct pm_clk_notifier_block object, the function + * does nothing. + */ +static int pm_runtime_clk_notify(struct notifier_block *nb, + unsigned long action, void *data) +{ + struct pm_clk_notifier_block *clknb; + struct device *dev = data; + char *con_id; + int error; + + dev_dbg(dev, "%s() %ld\n", __func__, action); + + clknb = container_of(nb, struct pm_clk_notifier_block, nb); + + switch (action) { + case BUS_NOTIFY_ADD_DEVICE: + if (dev->pwr_domain) + break; + + error = pm_runtime_clk_init(dev); + if (error) + break; + + dev->pwr_domain = clknb->pwr_domain; + if (clknb->con_ids[0]) { + for (con_id = clknb->con_ids[0]; *con_id; con_id++) + pm_runtime_clk_add(dev, con_id); + } else { + pm_runtime_clk_add(dev, NULL); + } + + break; + case BUS_NOTIFY_DEL_DEVICE: + if (dev->pwr_domain != clknb->pwr_domain) + break; + + dev->pwr_domain = NULL; + pm_runtime_clk_destroy(dev); + break; + } + + return 0; +} + +#else /* !CONFIG_PM_RUNTIME */ + +/** + * enable_clock - Enable a device clock. + * @dev: Device whose clock is to be enabled. + * @con_id: Connection ID of the clock. + */ +static void enable_clock(struct device *dev, const char *con_id) +{ + struct clk *clk; + + clk = clk_get(dev, con_id); + if (!IS_ERR(clk)) { + clk_enable(clk); + clk_put(clk); + dev_info(dev, "Runtime PM disabled, clock forced on.\n"); + } +} + +/** + * disable_clock - Disable a device clock. + * @dev: Device whose clock is to be disabled. + * @con_id: Connection ID of the clock. + */ +static void disable_clock(struct device *dev, const char *con_id) +{ + struct clk *clk; + + clk = clk_get(dev, con_id); + if (!IS_ERR(clk)) { + clk_disable(clk); + clk_put(clk); + dev_info(dev, "Runtime PM disabled, clock forced off.\n"); + } +} + +/** + * pm_runtime_clk_notify - Notify routine for device addition and removal. + * @nb: Notifier block object this function is a member of. + * @action: Operation being carried out by the caller. + * @data: Device the routine is being run for. + * + * For this function to work, @nb must be a member of an object of type + * struct pm_clk_notifier_block containing all of the requisite data. + * Specifically, the con_ids member of that object is used to enable or disable + * the device's clocks, depending on @action. + */ +static int pm_runtime_clk_notify(struct notifier_block *nb, + unsigned long action, void *data) +{ + struct pm_clk_notifier_block *clknb; + struct device *dev = data; + + dev_dbg(dev, "%s() %ld\n", __func__, action); + + clknb = container_of(nb, struct pm_clk_notifier_block, nb); + + switch (action) { + case BUS_NOTIFY_ADD_DEVICE: + if (clknb->con_ids[0]) { + for (con_id = clknb->con_ids[0]; *con_id; con_id++) + enable_clock(dev, con_id); + } else { + enable_clock(dev, NULL); + } + break; + case BUS_NOTIFY_DEL_DEVICE: + if (clknb->con_ids[0]) { + for (con_id = clknb->con_ids[0]; *con_id; con_id++) + disable_clock(dev, con_id); + } else { + disable_clock(dev, NULL); + } + break; + } + + return 0; +} + +#endif /* !CONFIG_PM_RUNTIME */ + +/** + * pm_runtime_clk_add_notifier - Add bus type notifier for runtime PM clocks. + * @bus: Bus type to add the notifier to. + * @clknb: Notifier to be added to the given bus type. + * + * The nb member of @clknb is not expected to be initialized and its + * notifier_call member will be replaced with pm_runtime_clk_notify(). However, + * the remaining members of @clknb should be populated prior to calling this + * routine. + */ +void pm_runtime_clk_add_notifier(struct bus_type *bus, + struct pm_clk_notifier_block *clknb) +{ + if (!bus || !clknb) + return; + + clknb->nb.notifier_call = pm_runtime_clk_notify; + bus_register_notifier(bus, &clknb->nb); +} diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h index 8de9aa6..878cf84 100644 --- a/include/linux/pm_runtime.h +++ b/include/linux/pm_runtime.h @@ -245,4 +245,46 @@ static inline void pm_runtime_dont_use_autosuspend(struct device *dev) __pm_runtime_use_autosuspend(dev, false); } +struct pm_clk_notifier_block { + struct notifier_block nb; + struct dev_power_domain *pwr_domain; + char *con_ids[]; +}; + +#ifdef CONFIG_PM_RUNTIME_CLK +extern int pm_runtime_clk_init(struct device *dev); +extern void pm_runtime_clk_destroy(struct device *dev); +extern int pm_runtime_clk_add(struct device *dev, const char *con_id); +extern void pm_runtime_clk_remove(struct device *dev, const char *con_id); +extern int pm_runtime_clk_suspend(struct device *dev); +extern int pm_runtime_clk_resume(struct device *dev); +#else +static inline int pm_runtime_clk_init(struct device *dev) +{ + return -EINVAL; +} +static inline void pm_runtime_clk_destroy(struct device *dev) +{ +} +static inline int pm_runtime_clk_add(struct device *dev, const char *con_id) +{ + return -EINVAL; +} +static inline void pm_runtime_clk_remove(struct device *dev, const char *con_id) +{ +} +#define pm_runtime_clock_suspend NULL +#define pm_runtime_clock_resume NULL +#endif + +#ifdef CONFIG_HAVE_CLK +extern void pm_runtime_clk_add_notifier(struct bus_type *bus, + struct pm_clk_notifier_block *clknb); +#else +static inline void pm_runtime_clk_add_notifier(struct bus_type *bus, + struct pm_clk_notifier_block *clknb) +{ +} +#endif + #endif diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 6de9a8f..d74ad4a 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -229,3 +229,7 @@ config PM_OPP representing individual voltage domains and provides SOC implementations a ready to use framework to manage OPPs. For more information, read + +config PM_RUNTIME_CLK + def_bool y + depends on PM_RUNTIME && HAVE_CLK -- cgit v1.1 From 74c5b31c6618f01079212332b2e5f6c42f2d6307 Mon Sep 17 00:00:00 2001 From: Mike Waychison Date: Fri, 29 Apr 2011 17:39:19 -0700 Subject: driver: Google EFI SMI The "gsmi" driver bridges userland with firmware specific routines for accessing hardware. Currently, this driver only supports NVRAM and eventlog information. Deprecated functions have been removed from the driver, though their op-codes are left in place so that they are not re-used. This driver works by trampolining into the firmware via the smi_command outlined in the FADT table. Three protocols are used due to various limitations over time, but all are included herein. This driver should only ever load on Google boards, identified by either a "Google, Inc." board vendor string in DMI, or "GOOGLE" in the OEM strings of the FADT ACPI table. This logic happens in gsmi_system_valid(). Signed-off-by: Duncan Laurie Signed-off-by: Aaron Durbin Signed-off-by: Mike Waychison Signed-off-by: Greg Kroah-Hartman --- Documentation/ABI/testing/sysfs-firmware-gsmi | 58 ++ drivers/firmware/Kconfig | 2 + drivers/firmware/Makefile | 2 + drivers/firmware/google/Kconfig | 9 + drivers/firmware/google/Makefile | 2 + drivers/firmware/google/gsmi.c | 940 ++++++++++++++++++++++++++ 6 files changed, 1013 insertions(+) create mode 100644 Documentation/ABI/testing/sysfs-firmware-gsmi create mode 100644 drivers/firmware/google/Kconfig create mode 100644 drivers/firmware/google/Makefile create mode 100644 drivers/firmware/google/gsmi.c diff --git a/Documentation/ABI/testing/sysfs-firmware-gsmi b/Documentation/ABI/testing/sysfs-firmware-gsmi new file mode 100644 index 0000000..0faa0aa --- /dev/null +++ b/Documentation/ABI/testing/sysfs-firmware-gsmi @@ -0,0 +1,58 @@ +What: /sys/firmware/gsmi +Date: March 2011 +Contact: Mike Waychison +Description: + Some servers used internally at Google have firmware + that provides callback functionality via explicit SMI + triggers. Some of the callbacks are similar to those + provided by the EFI runtime services page, but due to + historical reasons this different entry-point has been + used. + + The gsmi driver implements the kernel's abstraction for + these firmware callbacks. Currently, this functionality + is limited to handling the system event log and getting + access to EFI-style variables stored in nvram. + + Layout: + + /sys/firmware/gsmi/vars: + + This directory has the same layout (and + underlying implementation as /sys/firmware/efi/vars. + See Documentation/ABI/*/sysfs-firmware-efi-vars + for more information on how to interact with + this structure. + + /sys/firmware/gsmi/append_to_eventlog - write-only: + + This file takes a binary blob and passes it onto + the firmware to be timestamped and appended to + the system eventlog. The binary format is + interpreted by the firmware and may change from + platform to platform. The only kernel-enforced + requirement is that the blob be prefixed with a + 32bit host-endian type used as part of the + firmware call. + + /sys/firmware/gsmi/clear_config - write-only: + + Writing any value to this file will cause the + entire firmware configuration to be reset to + "factory defaults". Callers should assume that + a reboot is required for the configuration to be + cleared. + + /sys/firmware/gsmi/clear_eventlog - write-only: + + This file is used to clear out a portion/the + whole of the system event log. Values written + should be values between 1 and 100 inclusive (in + ASCII) representing the fraction of the log to + clear. Not all platforms support fractional + clearing though, and this writes to this file + will error out if the firmware doesn't like your + submitted fraction. + + Callers should assume that a reboot is needed + for this operation to complete. diff --git a/drivers/firmware/Kconfig b/drivers/firmware/Kconfig index b3a25a5..efba163 100644 --- a/drivers/firmware/Kconfig +++ b/drivers/firmware/Kconfig @@ -157,4 +157,6 @@ config SIGMA If unsure, say N here. Drivers that need these helpers will select this option automatically. +source "drivers/firmware/google/Kconfig" + endmenu diff --git a/drivers/firmware/Makefile b/drivers/firmware/Makefile index 00bb0b8..d7d6009 100644 --- a/drivers/firmware/Makefile +++ b/drivers/firmware/Makefile @@ -13,3 +13,5 @@ obj-$(CONFIG_ISCSI_IBFT_FIND) += iscsi_ibft_find.o obj-$(CONFIG_ISCSI_IBFT) += iscsi_ibft.o obj-$(CONFIG_FIRMWARE_MEMMAP) += memmap.o obj-$(CONFIG_SIGMA) += sigma.o + +obj-y += google/ diff --git a/drivers/firmware/google/Kconfig b/drivers/firmware/google/Kconfig new file mode 100644 index 0000000..4a03835 --- /dev/null +++ b/drivers/firmware/google/Kconfig @@ -0,0 +1,9 @@ +config GOOGLE_SMI + tristate "SMI interface for Google platforms" + depends on ACPI && DMI + select EFI_VARS + help + Say Y here if you want to enable SMI callbacks for Google + platforms. This provides an interface for writing to and + clearing the EFI event log and reading and writing NVRAM + variables. diff --git a/drivers/firmware/google/Makefile b/drivers/firmware/google/Makefile new file mode 100644 index 0000000..fb127d7 --- /dev/null +++ b/drivers/firmware/google/Makefile @@ -0,0 +1,2 @@ + +obj-$(CONFIG_GOOGLE_SMI) += gsmi.o diff --git a/drivers/firmware/google/gsmi.c b/drivers/firmware/google/gsmi.c new file mode 100644 index 0000000..fa7f0b3 --- /dev/null +++ b/drivers/firmware/google/gsmi.c @@ -0,0 +1,940 @@ +/* + * Copyright 2010 Google Inc. All Rights Reserved. + * Author: dlaurie@google.com (Duncan Laurie) + * + * Re-worked to expose sysfs APIs by mikew@google.com (Mike Waychison) + * + * EFI SMI interface for Google platforms + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define GSMI_SHUTDOWN_CLEAN 0 /* Clean Shutdown */ +/* TODO(mikew@google.com): Tie in HARDLOCKUP_DETECTOR with NMIWDT */ +#define GSMI_SHUTDOWN_NMIWDT 1 /* NMI Watchdog */ +#define GSMI_SHUTDOWN_PANIC 2 /* Panic */ +#define GSMI_SHUTDOWN_OOPS 3 /* Oops */ +#define GSMI_SHUTDOWN_DIE 4 /* Die -- No longer meaningful */ +#define GSMI_SHUTDOWN_MCE 5 /* Machine Check */ +#define GSMI_SHUTDOWN_SOFTWDT 6 /* Software Watchdog */ +#define GSMI_SHUTDOWN_MBE 7 /* Uncorrected ECC */ +#define GSMI_SHUTDOWN_TRIPLE 8 /* Triple Fault */ + +#define DRIVER_VERSION "1.0" +#define GSMI_GUID_SIZE 16 +#define GSMI_BUF_SIZE 1024 +#define GSMI_BUF_ALIGN sizeof(u64) +#define GSMI_CALLBACK 0xef + +/* SMI return codes */ +#define GSMI_SUCCESS 0x00 +#define GSMI_UNSUPPORTED2 0x03 +#define GSMI_LOG_FULL 0x0b +#define GSMI_VAR_NOT_FOUND 0x0e +#define GSMI_HANDSHAKE_SPIN 0x7d +#define GSMI_HANDSHAKE_CF 0x7e +#define GSMI_HANDSHAKE_NONE 0x7f +#define GSMI_INVALID_PARAMETER 0x82 +#define GSMI_UNSUPPORTED 0x83 +#define GSMI_BUFFER_TOO_SMALL 0x85 +#define GSMI_NOT_READY 0x86 +#define GSMI_DEVICE_ERROR 0x87 +#define GSMI_NOT_FOUND 0x8e + +#define QUIRKY_BOARD_HASH 0x78a30a50 + +/* Internally used commands passed to the firmware */ +#define GSMI_CMD_GET_NVRAM_VAR 0x01 +#define GSMI_CMD_GET_NEXT_VAR 0x02 +#define GSMI_CMD_SET_NVRAM_VAR 0x03 +#define GSMI_CMD_SET_EVENT_LOG 0x08 +#define GSMI_CMD_CLEAR_EVENT_LOG 0x09 +#define GSMI_CMD_CLEAR_CONFIG 0x20 +#define GSMI_CMD_HANDSHAKE_TYPE 0xC1 + +/* Magic entry type for kernel events */ +#define GSMI_LOG_ENTRY_TYPE_KERNEL 0xDEAD + +/* SMI buffers must be in 32bit physical address space */ +struct gsmi_buf { + u8 *start; /* start of buffer */ + size_t length; /* length of buffer */ + dma_addr_t handle; /* dma allocation handle */ + u32 address; /* physical address of buffer */ +}; + +struct gsmi_device { + struct platform_device *pdev; /* platform device */ + struct gsmi_buf *name_buf; /* variable name buffer */ + struct gsmi_buf *data_buf; /* generic data buffer */ + struct gsmi_buf *param_buf; /* parameter buffer */ + spinlock_t lock; /* serialize access to SMIs */ + u16 smi_cmd; /* SMI command port */ + int handshake_type; /* firmware handler interlock type */ + struct dma_pool *dma_pool; /* DMA buffer pool */ +} gsmi_dev; + +/* Packed structures for communicating with the firmware */ +struct gsmi_nvram_var_param { + efi_guid_t guid; + u32 name_ptr; + u32 attributes; + u32 data_len; + u32 data_ptr; +} __packed; + +struct gsmi_get_next_var_param { + u8 guid[GSMI_GUID_SIZE]; + u32 name_ptr; + u32 name_len; +} __packed; + +struct gsmi_set_eventlog_param { + u32 data_ptr; + u32 data_len; + u32 type; +} __packed; + +/* Event log formats */ +struct gsmi_log_entry_type_1 { + u16 type; + u32 instance; +} __packed; + + +/* + * Some platforms don't have explicit SMI handshake + * and need to wait for SMI to complete. + */ +#define GSMI_DEFAULT_SPINCOUNT 0x10000 +static unsigned int spincount = GSMI_DEFAULT_SPINCOUNT; +module_param(spincount, uint, 0600); +MODULE_PARM_DESC(spincount, + "The number of loop iterations to use when using the spin handshake."); + +static struct gsmi_buf *gsmi_buf_alloc(void) +{ + struct gsmi_buf *smibuf; + + smibuf = kzalloc(sizeof(*smibuf), GFP_KERNEL); + if (!smibuf) { + printk(KERN_ERR "gsmi: out of memory\n"); + return NULL; + } + + /* allocate buffer in 32bit address space */ + smibuf->start = dma_pool_alloc(gsmi_dev.dma_pool, GFP_KERNEL, + &smibuf->handle); + if (!smibuf->start) { + printk(KERN_ERR "gsmi: failed to allocate name buffer\n"); + kfree(smibuf); + return NULL; + } + + /* fill in the buffer handle */ + smibuf->length = GSMI_BUF_SIZE; + smibuf->address = (u32)virt_to_phys(smibuf->start); + + return smibuf; +} + +static void gsmi_buf_free(struct gsmi_buf *smibuf) +{ + if (smibuf) { + if (smibuf->start) + dma_pool_free(gsmi_dev.dma_pool, smibuf->start, + smibuf->handle); + kfree(smibuf); + } +} + +/* + * Make a call to gsmi func(sub). GSMI error codes are translated to + * in-kernel errnos (0 on success, -ERRNO on error). + */ +static int gsmi_exec(u8 func, u8 sub) +{ + u16 cmd = (sub << 8) | func; + u16 result = 0; + int rc = 0; + + /* + * AH : Subfunction number + * AL : Function number + * EBX : Parameter block address + * DX : SMI command port + * + * Three protocols here. See also the comment in gsmi_init(). + */ + if (gsmi_dev.handshake_type == GSMI_HANDSHAKE_CF) { + /* + * If handshake_type == HANDSHAKE_CF then set CF on the + * way in and wait for the handler to clear it; this avoids + * corrupting register state on those chipsets which have + * a delay between writing the SMI trigger register and + * entering SMM. + */ + asm volatile ( + "stc\n" + "outb %%al, %%dx\n" + "1: jc 1b\n" + : "=a" (result) + : "0" (cmd), + "d" (gsmi_dev.smi_cmd), + "b" (gsmi_dev.param_buf->address) + : "memory", "cc" + ); + } else if (gsmi_dev.handshake_type == GSMI_HANDSHAKE_SPIN) { + /* + * If handshake_type == HANDSHAKE_SPIN we spin a + * hundred-ish usecs to ensure the SMI has triggered. + */ + asm volatile ( + "outb %%al, %%dx\n" + "1: loop 1b\n" + : "=a" (result) + : "0" (cmd), + "d" (gsmi_dev.smi_cmd), + "b" (gsmi_dev.param_buf->address), + "c" (spincount) + : "memory", "cc" + ); + } else { + /* + * If handshake_type == HANDSHAKE_NONE we do nothing; + * either we don't need to or it's legacy firmware that + * doesn't understand the CF protocol. + */ + asm volatile ( + "outb %%al, %%dx\n\t" + : "=a" (result) + : "0" (cmd), + "d" (gsmi_dev.smi_cmd), + "b" (gsmi_dev.param_buf->address) + : "memory", "cc" + ); + } + + /* check return code from SMI handler */ + switch (result) { + case GSMI_SUCCESS: + break; + case GSMI_VAR_NOT_FOUND: + /* not really an error, but let the caller know */ + rc = 1; + break; + case GSMI_INVALID_PARAMETER: + printk(KERN_ERR "gsmi: exec 0x%04x: Invalid parameter\n", cmd); + rc = -EINVAL; + break; + case GSMI_BUFFER_TOO_SMALL: + printk(KERN_ERR "gsmi: exec 0x%04x: Buffer too small\n", cmd); + rc = -ENOMEM; + break; + case GSMI_UNSUPPORTED: + case GSMI_UNSUPPORTED2: + if (sub != GSMI_CMD_HANDSHAKE_TYPE) + printk(KERN_ERR "gsmi: exec 0x%04x: Not supported\n", + cmd); + rc = -ENOSYS; + break; + case GSMI_NOT_READY: + printk(KERN_ERR "gsmi: exec 0x%04x: Not ready\n", cmd); + rc = -EBUSY; + break; + case GSMI_DEVICE_ERROR: + printk(KERN_ERR "gsmi: exec 0x%04x: Device error\n", cmd); + rc = -EFAULT; + break; + case GSMI_NOT_FOUND: + printk(KERN_ERR "gsmi: exec 0x%04x: Data not found\n", cmd); + rc = -ENOENT; + break; + case GSMI_LOG_FULL: + printk(KERN_ERR "gsmi: exec 0x%04x: Log full\n", cmd); + rc = -ENOSPC; + break; + case GSMI_HANDSHAKE_CF: + case GSMI_HANDSHAKE_SPIN: + case GSMI_HANDSHAKE_NONE: + rc = result; + break; + default: + printk(KERN_ERR "gsmi: exec 0x%04x: Unknown error 0x%04x\n", + cmd, result); + rc = -ENXIO; + } + + return rc; +} + +/* Return the number of unicode characters in data */ +static size_t +utf16_strlen(efi_char16_t *data, unsigned long maxlength) +{ + unsigned long length = 0; + + while (*data++ != 0 && length < maxlength) + length++; + return length; +} + +static efi_status_t gsmi_get_variable(efi_char16_t *name, + efi_guid_t *vendor, u32 *attr, + unsigned long *data_size, + void *data) +{ + struct gsmi_nvram_var_param param = { + .name_ptr = gsmi_dev.name_buf->address, + .data_ptr = gsmi_dev.data_buf->address, + .data_len = (u32)*data_size, + }; + efi_status_t ret = EFI_SUCCESS; + unsigned long flags; + size_t name_len = utf16_strlen(name, GSMI_BUF_SIZE / 2); + int rc; + + if (name_len >= GSMI_BUF_SIZE / 2) + return EFI_BAD_BUFFER_SIZE; + + spin_lock_irqsave(&gsmi_dev.lock, flags); + + /* Vendor guid */ + memcpy(¶m.guid, vendor, sizeof(param.guid)); + + /* variable name, already in UTF-16 */ + memset(gsmi_dev.name_buf->start, 0, gsmi_dev.name_buf->length); + memcpy(gsmi_dev.name_buf->start, name, name_len * 2); + + /* data pointer */ + memset(gsmi_dev.data_buf->start, 0, gsmi_dev.data_buf->length); + + /* parameter buffer */ + memset(gsmi_dev.param_buf->start, 0, gsmi_dev.param_buf->length); + memcpy(gsmi_dev.param_buf->start, ¶m, sizeof(param)); + + rc = gsmi_exec(GSMI_CALLBACK, GSMI_CMD_GET_NVRAM_VAR); + if (rc < 0) { + printk(KERN_ERR "gsmi: Get Variable failed\n"); + ret = EFI_LOAD_ERROR; + } else if (rc == 1) { + /* variable was not found */ + ret = EFI_NOT_FOUND; + } else { + /* Get the arguments back */ + memcpy(¶m, gsmi_dev.param_buf->start, sizeof(param)); + + /* The size reported is the min of all of our buffers */ + *data_size = min(*data_size, gsmi_dev.data_buf->length); + *data_size = min_t(unsigned long, *data_size, param.data_len); + + /* Copy data back to return buffer. */ + memcpy(data, gsmi_dev.data_buf->start, *data_size); + + /* All variables are have the following attributes */ + *attr = EFI_VARIABLE_NON_VOLATILE | + EFI_VARIABLE_BOOTSERVICE_ACCESS | + EFI_VARIABLE_RUNTIME_ACCESS; + } + + spin_unlock_irqrestore(&gsmi_dev.lock, flags); + + return ret; +} + +static efi_status_t gsmi_get_next_variable(unsigned long *name_size, + efi_char16_t *name, + efi_guid_t *vendor) +{ + struct gsmi_get_next_var_param param = { + .name_ptr = gsmi_dev.name_buf->address, + .name_len = gsmi_dev.name_buf->length, + }; + efi_status_t ret = EFI_SUCCESS; + int rc; + unsigned long flags; + + /* For the moment, only support buffers that exactly match in size */ + if (*name_size != GSMI_BUF_SIZE) + return EFI_BAD_BUFFER_SIZE; + + /* Let's make sure the thing is at least null-terminated */ + if (utf16_strlen(name, GSMI_BUF_SIZE / 2) == GSMI_BUF_SIZE / 2) + return EFI_INVALID_PARAMETER; + + spin_lock_irqsave(&gsmi_dev.lock, flags); + + /* guid */ + memcpy(¶m.guid, vendor, sizeof(param.guid)); + + /* variable name, already in UTF-16 */ + memcpy(gsmi_dev.name_buf->start, name, *name_size); + + /* parameter buffer */ + memset(gsmi_dev.param_buf->start, 0, gsmi_dev.param_buf->length); + memcpy(gsmi_dev.param_buf->start, ¶m, sizeof(param)); + + rc = gsmi_exec(GSMI_CALLBACK, GSMI_CMD_GET_NEXT_VAR); + if (rc < 0) { + printk(KERN_ERR "gsmi: Get Next Variable Name failed\n"); + ret = EFI_LOAD_ERROR; + } else if (rc == 1) { + /* variable not found -- end of list */ + ret = EFI_NOT_FOUND; + } else { + /* copy variable data back to return buffer */ + memcpy(¶m, gsmi_dev.param_buf->start, sizeof(param)); + + /* Copy the name back */ + memcpy(name, gsmi_dev.name_buf->start, GSMI_BUF_SIZE); + *name_size = utf16_strlen(name, GSMI_BUF_SIZE / 2) * 2; + + /* copy guid to return buffer */ + memcpy(vendor, ¶m.guid, sizeof(param.guid)); + ret = EFI_SUCCESS; + } + + spin_unlock_irqrestore(&gsmi_dev.lock, flags); + + return ret; +} + +static efi_status_t gsmi_set_variable(efi_char16_t *name, + efi_guid_t *vendor, + unsigned long attr, + unsigned long data_size, + void *data) +{ + struct gsmi_nvram_var_param param = { + .name_ptr = gsmi_dev.name_buf->address, + .data_ptr = gsmi_dev.data_buf->address, + .data_len = (u32)data_size, + .attributes = EFI_VARIABLE_NON_VOLATILE | + EFI_VARIABLE_BOOTSERVICE_ACCESS | + EFI_VARIABLE_RUNTIME_ACCESS, + }; + size_t name_len = utf16_strlen(name, GSMI_BUF_SIZE / 2); + efi_status_t ret = EFI_SUCCESS; + int rc; + unsigned long flags; + + if (name_len >= GSMI_BUF_SIZE / 2) + return EFI_BAD_BUFFER_SIZE; + + spin_lock_irqsave(&gsmi_dev.lock, flags); + + /* guid */ + memcpy(¶m.guid, vendor, sizeof(param.guid)); + + /* variable name, already in UTF-16 */ + memset(gsmi_dev.name_buf->start, 0, gsmi_dev.name_buf->length); + memcpy(gsmi_dev.name_buf->start, name, name_len * 2); + + /* data pointer */ + memset(gsmi_dev.data_buf->start, 0, gsmi_dev.data_buf->length); + memcpy(gsmi_dev.data_buf->start, data, data_size); + + /* parameter buffer */ + memset(gsmi_dev.param_buf->start, 0, gsmi_dev.param_buf->length); + memcpy(gsmi_dev.param_buf->start, ¶m, sizeof(param)); + + rc = gsmi_exec(GSMI_CALLBACK, GSMI_CMD_SET_NVRAM_VAR); + if (rc < 0) { + printk(KERN_ERR "gsmi: Set Variable failed\n"); + ret = EFI_INVALID_PARAMETER; + } + + spin_unlock_irqrestore(&gsmi_dev.lock, flags); + + return ret; +} + +static const struct efivar_operations efivar_ops = { + .get_variable = gsmi_get_variable, + .set_variable = gsmi_set_variable, + .get_next_variable = gsmi_get_next_variable, +}; + +static ssize_t eventlog_write(struct file *filp, struct kobject *kobj, + struct bin_attribute *bin_attr, + char *buf, loff_t pos, size_t count) +{ + struct gsmi_set_eventlog_param param = { + .data_ptr = gsmi_dev.data_buf->address, + }; + int rc = 0; + unsigned long flags; + + /* Pull the type out */ + if (count < sizeof(u32)) + return -EINVAL; + param.type = *(u32 *)buf; + count -= sizeof(u32); + buf += sizeof(u32); + + /* The remaining buffer is the data payload */ + if (count > gsmi_dev.data_buf->length) + return -EINVAL; + param.data_len = count - sizeof(u32); + + spin_lock_irqsave(&gsmi_dev.lock, flags); + + /* data pointer */ + memset(gsmi_dev.data_buf->start, 0, gsmi_dev.data_buf->length); + memcpy(gsmi_dev.data_buf->start, buf, param.data_len); + + /* parameter buffer */ + memset(gsmi_dev.param_buf->start, 0, gsmi_dev.param_buf->length); + memcpy(gsmi_dev.param_buf->start, ¶m, sizeof(param)); + + rc = gsmi_exec(GSMI_CALLBACK, GSMI_CMD_SET_EVENT_LOG); + if (rc < 0) + printk(KERN_ERR "gsmi: Set Event Log failed\n"); + + spin_unlock_irqrestore(&gsmi_dev.lock, flags); + + return rc; + +} + +static struct bin_attribute eventlog_bin_attr = { + .attr = {.name = "append_to_eventlog", .mode = 0200}, + .write = eventlog_write, +}; + +static ssize_t gsmi_clear_eventlog_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int rc; + unsigned long flags; + unsigned long val; + struct { + u32 percentage; + u32 data_type; + } param; + + rc = strict_strtoul(buf, 0, &val); + if (rc) + return rc; + + /* + * Value entered is a percentage, 0 through 100, anything else + * is invalid. + */ + if (val > 100) + return -EINVAL; + + /* data_type here selects the smbios event log. */ + param.percentage = val; + param.data_type = 0; + + spin_lock_irqsave(&gsmi_dev.lock, flags); + + /* parameter buffer */ + memset(gsmi_dev.param_buf->start, 0, gsmi_dev.param_buf->length); + memcpy(gsmi_dev.param_buf->start, ¶m, sizeof(param)); + + rc = gsmi_exec(GSMI_CALLBACK, GSMI_CMD_CLEAR_EVENT_LOG); + + spin_unlock_irqrestore(&gsmi_dev.lock, flags); + + if (rc) + return rc; + return count; +} + +static struct kobj_attribute gsmi_clear_eventlog_attr = { + .attr = {.name = "clear_eventlog", .mode = 0200}, + .store = gsmi_clear_eventlog_store, +}; + +static ssize_t gsmi_clear_config_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int rc; + unsigned long flags; + + spin_lock_irqsave(&gsmi_dev.lock, flags); + + /* clear parameter buffer */ + memset(gsmi_dev.param_buf->start, 0, gsmi_dev.param_buf->length); + + rc = gsmi_exec(GSMI_CALLBACK, GSMI_CMD_CLEAR_CONFIG); + + spin_unlock_irqrestore(&gsmi_dev.lock, flags); + + if (rc) + return rc; + return count; +} + +static struct kobj_attribute gsmi_clear_config_attr = { + .attr = {.name = "clear_config", .mode = 0200}, + .store = gsmi_clear_config_store, +}; + +static const struct attribute *gsmi_attrs[] = { + &gsmi_clear_config_attr.attr, + &gsmi_clear_eventlog_attr.attr, + NULL, +}; + +static int gsmi_shutdown_reason(int reason) +{ + struct gsmi_log_entry_type_1 entry = { + .type = GSMI_LOG_ENTRY_TYPE_KERNEL, + .instance = reason, + }; + struct gsmi_set_eventlog_param param = { + .data_len = sizeof(entry), + .type = 1, + }; + static int saved_reason; + int rc = 0; + unsigned long flags; + + /* avoid duplicate entries in the log */ + if (saved_reason & (1 << reason)) + return 0; + + spin_lock_irqsave(&gsmi_dev.lock, flags); + + saved_reason |= (1 << reason); + + /* data pointer */ + memset(gsmi_dev.data_buf->start, 0, gsmi_dev.data_buf->length); + memcpy(gsmi_dev.data_buf->start, &entry, sizeof(entry)); + + /* parameter buffer */ + param.data_ptr = gsmi_dev.data_buf->address; + memset(gsmi_dev.param_buf->start, 0, gsmi_dev.param_buf->length); + memcpy(gsmi_dev.param_buf->start, ¶m, sizeof(param)); + + rc = gsmi_exec(GSMI_CALLBACK, GSMI_CMD_SET_EVENT_LOG); + + spin_unlock_irqrestore(&gsmi_dev.lock, flags); + + if (rc < 0) + printk(KERN_ERR "gsmi: Log Shutdown Reason failed\n"); + else + printk(KERN_EMERG "gsmi: Log Shutdown Reason 0x%02x\n", + reason); + + return rc; +} + +static int gsmi_reboot_callback(struct notifier_block *nb, + unsigned long reason, void *arg) +{ + gsmi_shutdown_reason(GSMI_SHUTDOWN_CLEAN); + return NOTIFY_DONE; +} + +static struct notifier_block gsmi_reboot_notifier = { + .notifier_call = gsmi_reboot_callback +}; + +static int gsmi_die_callback(struct notifier_block *nb, + unsigned long reason, void *arg) +{ + if (reason == DIE_OOPS) + gsmi_shutdown_reason(GSMI_SHUTDOWN_OOPS); + return NOTIFY_DONE; +} + +static struct notifier_block gsmi_die_notifier = { + .notifier_call = gsmi_die_callback +}; + +static int gsmi_panic_callback(struct notifier_block *nb, + unsigned long reason, void *arg) +{ + gsmi_shutdown_reason(GSMI_SHUTDOWN_PANIC); + return NOTIFY_DONE; +} + +static struct notifier_block gsmi_panic_notifier = { + .notifier_call = gsmi_panic_callback, +}; + +/* + * This hash function was blatantly copied from include/linux/hash.h. + * It is used by this driver to obfuscate a board name that requires a + * quirk within this driver. + * + * Please do not remove this copy of the function as any changes to the + * global utility hash_64() function would break this driver's ability + * to identify a board and provide the appropriate quirk -- mikew@google.com + */ +static u64 __init local_hash_64(u64 val, unsigned bits) +{ + u64 hash = val; + + /* Sigh, gcc can't optimise this alone like it does for 32 bits. */ + u64 n = hash; + n <<= 18; + hash -= n; + n <<= 33; + hash -= n; + n <<= 3; + hash += n; + n <<= 3; + hash -= n; + n <<= 4; + hash += n; + n <<= 2; + hash += n; + + /* High bits are more random, so use them. */ + return hash >> (64 - bits); +} + +static u32 __init hash_oem_table_id(char s[8]) +{ + u64 input; + memcpy(&input, s, 8); + return local_hash_64(input, 32); +} + +static struct dmi_system_id gsmi_dmi_table[] __initdata = { + { + .ident = "Google Board", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "Google, Inc."), + }, + }, + {} +}; +MODULE_DEVICE_TABLE(dmi, gsmi_dmi_table); + +static __init int gsmi_system_valid(void) +{ + u32 hash; + + if (!dmi_check_system(gsmi_dmi_table)) + return -ENODEV; + + /* + * Only newer firmware supports the gsmi interface. All older + * firmware that didn't support this interface used to plug the + * table name in the first four bytes of the oem_table_id field. + * Newer firmware doesn't do that though, so use that as the + * discriminant factor. We have to do this in order to + * whitewash our board names out of the public driver. + */ + if (!strncmp(acpi_gbl_FADT.header.oem_table_id, "FACP", 4)) { + printk(KERN_INFO "gsmi: Board is too old\n"); + return -ENODEV; + } + + /* Disable on board with 1.0 BIOS due to Google bug 2602657 */ + hash = hash_oem_table_id(acpi_gbl_FADT.header.oem_table_id); + if (hash == QUIRKY_BOARD_HASH) { + const char *bios_ver = dmi_get_system_info(DMI_BIOS_VERSION); + if (strncmp(bios_ver, "1.0", 3) == 0) { + pr_info("gsmi: disabled on this board's BIOS %s\n", + bios_ver); + return -ENODEV; + } + } + + /* check for valid SMI command port in ACPI FADT */ + if (acpi_gbl_FADT.smi_command == 0) { + pr_info("gsmi: missing smi_command\n"); + return -ENODEV; + } + + /* Found */ + return 0; +} + +static struct kobject *gsmi_kobj; +static struct efivars efivars; + +static __init int gsmi_init(void) +{ + unsigned long flags; + int ret; + + ret = gsmi_system_valid(); + if (ret) + return ret; + + gsmi_dev.smi_cmd = acpi_gbl_FADT.smi_command; + + /* register device */ + gsmi_dev.pdev = platform_device_register_simple("gsmi", -1, NULL, 0); + if (IS_ERR(gsmi_dev.pdev)) { + printk(KERN_ERR "gsmi: unable to register platform device\n"); + return PTR_ERR(gsmi_dev.pdev); + } + + /* SMI access needs to be serialized */ + spin_lock_init(&gsmi_dev.lock); + + /* SMI callbacks require 32bit addresses */ + gsmi_dev.pdev->dev.coherent_dma_mask = DMA_BIT_MASK(32); + gsmi_dev.pdev->dev.dma_mask = + &gsmi_dev.pdev->dev.coherent_dma_mask; + ret = -ENOMEM; + gsmi_dev.dma_pool = dma_pool_create("gsmi", &gsmi_dev.pdev->dev, + GSMI_BUF_SIZE, GSMI_BUF_ALIGN, 0); + if (!gsmi_dev.dma_pool) + goto out_err; + + /* + * pre-allocate buffers because sometimes we are called when + * this is not feasible: oops, panic, die, mce, etc + */ + gsmi_dev.name_buf = gsmi_buf_alloc(); + if (!gsmi_dev.name_buf) { + printk(KERN_ERR "gsmi: failed to allocate name buffer\n"); + goto out_err; + } + + gsmi_dev.data_buf = gsmi_buf_alloc(); + if (!gsmi_dev.data_buf) { + printk(KERN_ERR "gsmi: failed to allocate data buffer\n"); + goto out_err; + } + + gsmi_dev.param_buf = gsmi_buf_alloc(); + if (!gsmi_dev.param_buf) { + printk(KERN_ERR "gsmi: failed to allocate param buffer\n"); + goto out_err; + } + + /* + * Determine type of handshake used to serialize the SMI + * entry. See also gsmi_exec(). + * + * There's a "behavior" present on some chipsets where writing the + * SMI trigger register in the southbridge doesn't result in an + * immediate SMI. Rather, the processor can execute "a few" more + * instructions before the SMI takes effect. To ensure synchronous + * behavior, implement a handshake between the kernel driver and the + * firmware handler to spin until released. This ioctl determines + * the type of handshake. + * + * NONE: The firmware handler does not implement any + * handshake. Either it doesn't need to, or it's legacy firmware + * that doesn't know it needs to and never will. + * + * CF: The firmware handler will clear the CF in the saved + * state before returning. The driver may set the CF and test for + * it to clear before proceeding. + * + * SPIN: The firmware handler does not implement any handshake + * but the driver should spin for a hundred or so microseconds + * to ensure the SMI has triggered. + * + * Finally, the handler will return -ENOSYS if + * GSMI_CMD_HANDSHAKE_TYPE is unimplemented, which implies + * HANDSHAKE_NONE. + */ + spin_lock_irqsave(&gsmi_dev.lock, flags); + gsmi_dev.handshake_type = GSMI_HANDSHAKE_SPIN; + gsmi_dev.handshake_type = + gsmi_exec(GSMI_CALLBACK, GSMI_CMD_HANDSHAKE_TYPE); + if (gsmi_dev.handshake_type == -ENOSYS) + gsmi_dev.handshake_type = GSMI_HANDSHAKE_NONE; + spin_unlock_irqrestore(&gsmi_dev.lock, flags); + + /* Remove and clean up gsmi if the handshake could not complete. */ + if (gsmi_dev.handshake_type == -ENXIO) { + printk(KERN_INFO "gsmi version " DRIVER_VERSION + " failed to load\n"); + ret = -ENODEV; + goto out_err; + } + + printk(KERN_INFO "gsmi version " DRIVER_VERSION " loaded\n"); + + /* Register in the firmware directory */ + ret = -ENOMEM; + gsmi_kobj = kobject_create_and_add("gsmi", firmware_kobj); + if (!gsmi_kobj) { + printk(KERN_INFO "gsmi: Failed to create firmware kobj\n"); + goto out_err; + } + + /* Setup eventlog access */ + ret = sysfs_create_bin_file(gsmi_kobj, &eventlog_bin_attr); + if (ret) { + printk(KERN_INFO "gsmi: Failed to setup eventlog"); + goto out_err; + } + + /* Other attributes */ + ret = sysfs_create_files(gsmi_kobj, gsmi_attrs); + if (ret) { + printk(KERN_INFO "gsmi: Failed to add attrs"); + goto out_err; + } + + if (register_efivars(&efivars, &efivar_ops, gsmi_kobj)) { + printk(KERN_INFO "gsmi: Failed to register efivars\n"); + goto out_err; + } + + register_reboot_notifier(&gsmi_reboot_notifier); + register_die_notifier(&gsmi_die_notifier); + atomic_notifier_chain_register(&panic_notifier_list, + &gsmi_panic_notifier); + + return 0; + + out_err: + kobject_put(gsmi_kobj); + gsmi_buf_free(gsmi_dev.param_buf); + gsmi_buf_free(gsmi_dev.data_buf); + gsmi_buf_free(gsmi_dev.name_buf); + if (gsmi_dev.dma_pool) + dma_pool_destroy(gsmi_dev.dma_pool); + platform_device_unregister(gsmi_dev.pdev); + pr_info("gsmi: failed to load: %d\n", ret); + return ret; +} + +static void __exit gsmi_exit(void) +{ + unregister_reboot_notifier(&gsmi_reboot_notifier); + unregister_die_notifier(&gsmi_die_notifier); + atomic_notifier_chain_unregister(&panic_notifier_list, + &gsmi_panic_notifier); + unregister_efivars(&efivars); + + kobject_put(gsmi_kobj); + gsmi_buf_free(gsmi_dev.param_buf); + gsmi_buf_free(gsmi_dev.data_buf); + gsmi_buf_free(gsmi_dev.name_buf); + dma_pool_destroy(gsmi_dev.dma_pool); + platform_device_unregister(gsmi_dev.pdev); +} + +module_init(gsmi_init); +module_exit(gsmi_exit); + +MODULE_AUTHOR("Google, Inc."); +MODULE_LICENSE("GPL"); -- cgit v1.1 From e561bc45920aade3f8a5aad9058a00e750af1345 Mon Sep 17 00:00:00 2001 From: Mike Waychison Date: Fri, 29 Apr 2011 17:39:25 -0700 Subject: driver: Google Memory Console This patch introduces the 'memconsole' driver. Our firmware gives us access to an in-memory log of the firmware's output. This gives us visibility in a data-center of headless machines as to what the firmware is doing. The memory console is found by the driver by finding a header block in the EBDA. The buffer is then copied out, and is exported to userland in the file /sys/firmware/log. Signed-off-by: San Mehat Signed-off-by: Mike Waychison Signed-off-by: Greg Kroah-Hartman --- Documentation/ABI/testing/sysfs-firmware-log | 7 ++ drivers/firmware/google/Kconfig | 8 ++ drivers/firmware/google/Makefile | 1 + drivers/firmware/google/memconsole.c | 166 +++++++++++++++++++++++++++ 4 files changed, 182 insertions(+) create mode 100644 Documentation/ABI/testing/sysfs-firmware-log create mode 100644 drivers/firmware/google/memconsole.c diff --git a/Documentation/ABI/testing/sysfs-firmware-log b/Documentation/ABI/testing/sysfs-firmware-log new file mode 100644 index 0000000..9b58e7c --- /dev/null +++ b/Documentation/ABI/testing/sysfs-firmware-log @@ -0,0 +1,7 @@ +What: /sys/firmware/log +Date: February 2011 +Contact: Mike Waychison +Description: + The /sys/firmware/log is a binary file that represents a + read-only copy of the firmware's log if one is + available. diff --git a/drivers/firmware/google/Kconfig b/drivers/firmware/google/Kconfig index 4a03835..640dc6b 100644 --- a/drivers/firmware/google/Kconfig +++ b/drivers/firmware/google/Kconfig @@ -7,3 +7,11 @@ config GOOGLE_SMI platforms. This provides an interface for writing to and clearing the EFI event log and reading and writing NVRAM variables. + +config GOOGLE_MEMCONSOLE + tristate "Firmware Memory Console" + depends on DMI + help + This option enables the kernel to search for a firmware log in + the EBDA on Google servers. If found, this log is exported to + userland in the file /sys/firmware/log. diff --git a/drivers/firmware/google/Makefile b/drivers/firmware/google/Makefile index fb127d7..54a294e 100644 --- a/drivers/firmware/google/Makefile +++ b/drivers/firmware/google/Makefile @@ -1,2 +1,3 @@ obj-$(CONFIG_GOOGLE_SMI) += gsmi.o +obj-$(CONFIG_GOOGLE_MEMCONSOLE) += memconsole.o diff --git a/drivers/firmware/google/memconsole.c b/drivers/firmware/google/memconsole.c new file mode 100644 index 0000000..2a90ba6 --- /dev/null +++ b/drivers/firmware/google/memconsole.c @@ -0,0 +1,166 @@ +/* + * memconsole.c + * + * Infrastructure for importing the BIOS memory based console + * into the kernel log ringbuffer. + * + * Copyright 2010 Google Inc. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define BIOS_MEMCONSOLE_V1_MAGIC 0xDEADBABE +#define BIOS_MEMCONSOLE_V2_MAGIC (('M')|('C'<<8)|('O'<<16)|('N'<<24)) + +struct biosmemcon_ebda { + u32 signature; + union { + struct { + u8 enabled; + u32 buffer_addr; + u16 start; + u16 end; + u16 num_chars; + u8 wrapped; + } __packed v1; + struct { + u32 buffer_addr; + /* Misdocumented as number of pages! */ + u16 num_bytes; + u16 start; + u16 end; + } __packed v2; + }; +} __packed; + +static char *memconsole_baseaddr; +static size_t memconsole_length; + +static ssize_t memconsole_read(struct file *filp, struct kobject *kobp, + struct bin_attribute *bin_attr, char *buf, + loff_t pos, size_t count) +{ + return memory_read_from_buffer(buf, count, &pos, memconsole_baseaddr, + memconsole_length); +} + +static struct bin_attribute memconsole_bin_attr = { + .attr = {.name = "log", .mode = 0444}, + .read = memconsole_read, +}; + + +static void found_v1_header(struct biosmemcon_ebda *hdr) +{ + printk(KERN_INFO "BIOS console v1 EBDA structure found at %p\n", hdr); + printk(KERN_INFO "BIOS console buffer at 0x%.8x, " + "start = %d, end = %d, num = %d\n", + hdr->v1.buffer_addr, hdr->v1.start, + hdr->v1.end, hdr->v1.num_chars); + + memconsole_length = hdr->v1.num_chars; + memconsole_baseaddr = phys_to_virt(hdr->v1.buffer_addr); +} + +static void found_v2_header(struct biosmemcon_ebda *hdr) +{ + printk(KERN_INFO "BIOS console v2 EBDA structure found at %p\n", hdr); + printk(KERN_INFO "BIOS console buffer at 0x%.8x, " + "start = %d, end = %d, num_bytes = %d\n", + hdr->v2.buffer_addr, hdr->v2.start, + hdr->v2.end, hdr->v2.num_bytes); + + memconsole_length = hdr->v2.end - hdr->v2.start; + memconsole_baseaddr = phys_to_virt(hdr->v2.buffer_addr + + hdr->v2.start); +} + +/* + * Search through the EBDA for the BIOS Memory Console, and + * set the global variables to point to it. Return true if found. + */ +static bool found_memconsole(void) +{ + unsigned int address; + size_t length, cur; + + address = get_bios_ebda(); + if (!address) { + printk(KERN_INFO "BIOS EBDA non-existent.\n"); + return false; + } + + /* EBDA length is byte 0 of EBDA (in KB) */ + length = *(u8 *)phys_to_virt(address); + length <<= 10; /* convert to bytes */ + + /* + * Search through EBDA for BIOS memory console structure + * note: signature is not necessarily dword-aligned + */ + for (cur = 0; cur < length; cur++) { + struct biosmemcon_ebda *hdr = phys_to_virt(address + cur); + + /* memconsole v1 */ + if (hdr->signature == BIOS_MEMCONSOLE_V1_MAGIC) { + found_v1_header(hdr); + return true; + } + + /* memconsole v2 */ + if (hdr->signature == BIOS_MEMCONSOLE_V2_MAGIC) { + found_v2_header(hdr); + return true; + } + } + + printk(KERN_INFO "BIOS console EBDA structure not found!\n"); + return false; +} + +static struct dmi_system_id memconsole_dmi_table[] __initdata = { + { + .ident = "Google Board", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "Google, Inc."), + }, + }, + {} +}; +MODULE_DEVICE_TABLE(dmi, memconsole_dmi_table); + +static int __init memconsole_init(void) +{ + int ret; + + if (!dmi_check_system(memconsole_dmi_table)) + return -ENODEV; + + if (!found_memconsole()) + return -ENODEV; + + memconsole_bin_attr.size = memconsole_length; + + ret = sysfs_create_bin_file(firmware_kobj, &memconsole_bin_attr); + + return ret; +} + +static void __exit memconsole_exit(void) +{ + sysfs_remove_bin_file(firmware_kobj, &memconsole_bin_attr); +} + +module_init(memconsole_init); +module_exit(memconsole_exit); + +MODULE_AUTHOR("Google, Inc."); +MODULE_LICENSE("GPL"); -- cgit v1.1 From a1d9a09ae8003380a7f2297ee4367947cbdf874f Mon Sep 17 00:00:00 2001 From: Mike Waychison Date: Fri, 29 Apr 2011 17:39:31 -0700 Subject: Introduce CONFIG_GOOGLE_FIRMWARE In order to keep Google's firmware drivers organized amongst themselves, all Google firmware drivers are gated on CONFIG_GOOGLE_FIRMWARE=y, which defaults to 'n' in the kernel build. Signed-off-by: Mike Waychison Signed-off-by: Greg Kroah-Hartman --- drivers/firmware/Makefile | 2 +- drivers/firmware/google/Kconfig | 14 ++++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/drivers/firmware/Makefile b/drivers/firmware/Makefile index d7d6009..47338c9 100644 --- a/drivers/firmware/Makefile +++ b/drivers/firmware/Makefile @@ -14,4 +14,4 @@ obj-$(CONFIG_ISCSI_IBFT) += iscsi_ibft.o obj-$(CONFIG_FIRMWARE_MEMMAP) += memmap.o obj-$(CONFIG_SIGMA) += sigma.o -obj-y += google/ +obj-$(CONFIG_GOOGLE_FIRMWARE) += google/ diff --git a/drivers/firmware/google/Kconfig b/drivers/firmware/google/Kconfig index 640dc6b..87096b6 100644 --- a/drivers/firmware/google/Kconfig +++ b/drivers/firmware/google/Kconfig @@ -1,3 +1,15 @@ +config GOOGLE_FIRMWARE + bool "Google Firmware Drivers" + depends on X86 + default n + help + These firmware drivers are used by Google's servers. They are + only useful if you are working directly on one of their + proprietary servers. If in doubt, say "N". + +menu "Google Firmware Drivers" + depends on GOOGLE_FIRMWARE + config GOOGLE_SMI tristate "SMI interface for Google platforms" depends on ACPI && DMI @@ -15,3 +27,5 @@ config GOOGLE_MEMCONSOLE This option enables the kernel to search for a firmware log in the EBDA on Google servers. If found, this log is exported to userland in the file /sys/firmware/log. + +endmenu -- cgit v1.1 From 058e297d34a404caaa5ed277de15698d8dc43000 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 29 Apr 2011 22:35:33 -0400 Subject: ftrace: Only update the function code on write to filter files If function tracing is enabled, a read of the filter files will cause the call to stop_machine to update the function trace sites. It should only call stop_machine on write. Cc: stable@kernel.org Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index ee24fa1..666880d 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2413,14 +2413,16 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable) ftrace_match_records(parser->buffer, parser->idx, enable); } - mutex_lock(&ftrace_lock); - if (ftrace_start_up && ftrace_enabled) - ftrace_run_update_code(FTRACE_ENABLE_CALLS); - mutex_unlock(&ftrace_lock); - trace_parser_put(parser); kfree(iter); + if (file->f_mode & FMODE_WRITE) { + mutex_lock(&ftrace_lock); + if (ftrace_start_up && ftrace_enabled) + ftrace_run_update_code(FTRACE_ENABLE_CALLS); + mutex_unlock(&ftrace_lock); + } + mutex_unlock(&ftrace_regex_lock); return 0; } -- cgit v1.1 From 0778d9ad33898faab7bf6316108b471790376e35 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 29 Apr 2011 10:36:31 -0400 Subject: ftrace: Make FTRACE_WARN_ON() work in if condition Let FTRACE_WARN_ON() be used as a stand alone statement or inside a conditional: if (FTRACE_WARN_ON(x)) Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index ee24fa1..4ff65599 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -39,16 +39,20 @@ #include "trace_stat.h" #define FTRACE_WARN_ON(cond) \ - do { \ - if (WARN_ON(cond)) \ + ({ \ + int ___r = cond; \ + if (WARN_ON(___r)) \ ftrace_kill(); \ - } while (0) + ___r; \ + }) #define FTRACE_WARN_ON_ONCE(cond) \ - do { \ - if (WARN_ON_ONCE(cond)) \ + ({ \ + int ___r = cond; \ + if (WARN_ON_ONCE(___r)) \ ftrace_kill(); \ - } while (0) + ___r; \ + }) /* hash bits for specific function selection */ #define FTRACE_HASH_BITS 7 -- cgit v1.1 From 8ab2b7efd3e2ccf2c2dda3206b8171ecdbd0af40 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 21 Apr 2011 22:41:35 -0400 Subject: ftrace: Remove unnecessary disabling of irqs The disabling of interrupts around ftrace_update_code() was used to protect against the evil ftrace daemon from years past. But that daemon has long been killed. It is safe to keep interrupts enabled while updating the initial mcount into nops. The ftrace_mutex is also held which keeps other users at bay. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 4ff65599..f199fb2 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2707,7 +2707,6 @@ static int ftrace_process_locs(struct module *mod, { unsigned long *p; unsigned long addr; - unsigned long flags; mutex_lock(&ftrace_lock); p = start; @@ -2724,10 +2723,7 @@ static int ftrace_process_locs(struct module *mod, ftrace_record_ip(addr); } - /* disable interrupts to prevent kstop machine */ - local_irq_save(flags); ftrace_update_code(mod); - local_irq_restore(flags); mutex_unlock(&ftrace_lock); return 0; -- cgit v1.1 From 3499e461147636bf55c41128d83b679ac6ab2d86 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 21 Apr 2011 22:59:12 -0400 Subject: ftrace: Remove failures file The failures file in the debugfs tracing directory would list the functions that failed to convert when the old dead ftrace daemon tried to update code but failed. Since this code is now dead along with the daemon the failures file is useless. Remove it. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 39 ++------------------------------------- 1 file changed, 2 insertions(+), 37 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f199fb2..97b30f8 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1355,9 +1355,8 @@ static int __init ftrace_dyn_table_alloc(unsigned long num_to_init) enum { FTRACE_ITER_FILTER = (1 << 0), FTRACE_ITER_NOTRACE = (1 << 1), - FTRACE_ITER_FAILURES = (1 << 2), - FTRACE_ITER_PRINTALL = (1 << 3), - FTRACE_ITER_HASH = (1 << 4), + FTRACE_ITER_PRINTALL = (1 << 2), + FTRACE_ITER_HASH = (1 << 3), }; #define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ @@ -1487,12 +1486,6 @@ t_next(struct seq_file *m, void *v, loff_t *pos) rec = &iter->pg->records[iter->idx++]; if ((rec->flags & FTRACE_FL_FREE) || - (!(iter->flags & FTRACE_ITER_FAILURES) && - (rec->flags & FTRACE_FL_FAILED)) || - - ((iter->flags & FTRACE_ITER_FAILURES) && - !(rec->flags & FTRACE_FL_FAILED)) || - ((iter->flags & FTRACE_ITER_FILTER) && !(rec->flags & FTRACE_FL_FILTER)) || @@ -1633,24 +1626,6 @@ ftrace_avail_open(struct inode *inode, struct file *file) return ret; } -static int -ftrace_failures_open(struct inode *inode, struct file *file) -{ - int ret; - struct seq_file *m; - struct ftrace_iterator *iter; - - ret = ftrace_avail_open(inode, file); - if (!ret) { - m = file->private_data; - iter = m->private; - iter->flags = FTRACE_ITER_FAILURES; - } - - return ret; -} - - static void ftrace_filter_reset(int enable) { struct ftrace_page *pg; @@ -2448,13 +2423,6 @@ static const struct file_operations ftrace_avail_fops = { .release = seq_release_private, }; -static const struct file_operations ftrace_failures_fops = { - .open = ftrace_failures_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release_private, -}; - static const struct file_operations ftrace_filter_fops = { .open = ftrace_filter_open, .read = seq_read, @@ -2683,9 +2651,6 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) trace_create_file("available_filter_functions", 0444, d_tracer, NULL, &ftrace_avail_fops); - trace_create_file("failures", 0444, - d_tracer, NULL, &ftrace_failures_fops); - trace_create_file("set_ftrace_filter", 0644, d_tracer, NULL, &ftrace_filter_fops); -- cgit v1.1 From 45a4a2372b364107cabea79f255b333236626416 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 21 Apr 2011 23:16:46 -0400 Subject: ftrace: Remove FTRACE_FL_FAILED flag Since we disable all function tracer processing if we detect that a modification of a instruction had failed, we do not need to track that the record has failed. No more ftrace processing is allowed, and the FTRACE_FL_FAILED flag is pointless. Removing this flag simplifies some of the code, but some ftrace_disabled checks needed to be added or move around a little. Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 9 +++--- kernel/trace/ftrace.c | 76 +++++++++++++++++++++++++++++++------------------- 2 files changed, 51 insertions(+), 34 deletions(-) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index ca29e03..2a195ff 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -147,11 +147,10 @@ extern int ftrace_text_reserved(void *start, void *end); enum { FTRACE_FL_FREE = (1 << 0), - FTRACE_FL_FAILED = (1 << 1), - FTRACE_FL_FILTER = (1 << 2), - FTRACE_FL_ENABLED = (1 << 3), - FTRACE_FL_NOTRACE = (1 << 4), - FTRACE_FL_CONVERTED = (1 << 5), + FTRACE_FL_FILTER = (1 << 1), + FTRACE_FL_ENABLED = (1 << 2), + FTRACE_FL_NOTRACE = (1 << 3), + FTRACE_FL_CONVERTED = (1 << 4), }; struct dyn_ftrace { diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 97b30f8..eb19fae 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1083,19 +1083,20 @@ static void ftrace_replace_code(int enable) struct ftrace_page *pg; int failed; + if (unlikely(ftrace_disabled)) + return; + do_for_each_ftrace_rec(pg, rec) { /* * Skip over free records, records that have * failed and not converted. */ if (rec->flags & FTRACE_FL_FREE || - rec->flags & FTRACE_FL_FAILED || !(rec->flags & FTRACE_FL_CONVERTED)) continue; failed = __ftrace_replace_code(rec, enable); if (failed) { - rec->flags |= FTRACE_FL_FAILED; ftrace_bug(failed, rec->ip); /* Stop processing */ return; @@ -1111,10 +1112,12 @@ ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec) ip = rec->ip; + if (unlikely(ftrace_disabled)) + return 0; + ret = ftrace_make_nop(mod, rec, MCOUNT_ADDR); if (ret) { ftrace_bug(ret, ip); - rec->flags |= FTRACE_FL_FAILED; return 0; } return 1; @@ -1466,6 +1469,9 @@ t_next(struct seq_file *m, void *v, loff_t *pos) struct ftrace_iterator *iter = m->private; struct dyn_ftrace *rec = NULL; + if (unlikely(ftrace_disabled)) + return NULL; + if (iter->flags & FTRACE_ITER_HASH) return t_hash_next(m, pos); @@ -1518,6 +1524,10 @@ static void *t_start(struct seq_file *m, loff_t *pos) loff_t l; mutex_lock(&ftrace_lock); + + if (unlikely(ftrace_disabled)) + return NULL; + /* * If an lseek was done, then reset and start from beginning. */ @@ -1636,8 +1646,6 @@ static void ftrace_filter_reset(int enable) if (enable) ftrace_filtered = 0; do_for_each_ftrace_rec(pg, rec) { - if (rec->flags & FTRACE_FL_FAILED) - continue; rec->flags &= ~type; } while_for_each_ftrace_rec(); mutex_unlock(&ftrace_lock); @@ -1767,9 +1775,6 @@ static int ftrace_match_records(char *buff, int len, int enable) mutex_lock(&ftrace_lock); do_for_each_ftrace_rec(pg, rec) { - if (rec->flags & FTRACE_FL_FAILED) - continue; - if (ftrace_match_record(rec, search, search_len, type)) { if (not) rec->flags &= ~flag; @@ -1837,10 +1842,11 @@ static int ftrace_match_module_records(char *buff, char *mod, int enable) } mutex_lock(&ftrace_lock); - do_for_each_ftrace_rec(pg, rec) { - if (rec->flags & FTRACE_FL_FAILED) - continue; + if (unlikely(ftrace_disabled)) + goto out_unlock; + + do_for_each_ftrace_rec(pg, rec) { if (ftrace_match_module_record(rec, mod, search, search_len, type)) { @@ -1854,6 +1860,7 @@ static int ftrace_match_module_records(char *buff, char *mod, int enable) ftrace_filtered = 1; } while_for_each_ftrace_rec(); + out_unlock: mutex_unlock(&ftrace_lock); return found; @@ -2008,10 +2015,11 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, return -EINVAL; mutex_lock(&ftrace_lock); - do_for_each_ftrace_rec(pg, rec) { - if (rec->flags & FTRACE_FL_FAILED) - continue; + if (unlikely(ftrace_disabled)) + goto out_unlock; + + do_for_each_ftrace_rec(pg, rec) { if (!ftrace_match_record(rec, search, len, type)) continue; @@ -2218,6 +2226,10 @@ ftrace_regex_write(struct file *file, const char __user *ubuf, mutex_lock(&ftrace_regex_lock); + ret = -ENODEV; + if (unlikely(ftrace_disabled)) + goto out_unlock; + if (file->f_mode & FMODE_READ) { struct seq_file *m = file->private_data; iter = m->private; @@ -2545,9 +2557,6 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer) bool exists; int i; - if (ftrace_disabled) - return -ENODEV; - /* decode regex */ type = filter_parse_regex(buffer, strlen(buffer), &search, ¬); if (!not && *idx >= FTRACE_GRAPH_MAX_FUNCS) @@ -2556,9 +2565,15 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer) search_len = strlen(search); mutex_lock(&ftrace_lock); + + if (unlikely(ftrace_disabled)) { + mutex_unlock(&ftrace_lock); + return -ENODEV; + } + do_for_each_ftrace_rec(pg, rec) { - if (rec->flags & (FTRACE_FL_FAILED | FTRACE_FL_FREE)) + if (rec->flags & FTRACE_FL_FREE) continue; if (ftrace_match_record(rec, search, search_len, type)) { @@ -2700,10 +2715,11 @@ void ftrace_release_mod(struct module *mod) struct dyn_ftrace *rec; struct ftrace_page *pg; + mutex_lock(&ftrace_lock); + if (ftrace_disabled) - return; + goto out_unlock; - mutex_lock(&ftrace_lock); do_for_each_ftrace_rec(pg, rec) { if (within_module_core(rec->ip, mod)) { /* @@ -2714,6 +2730,7 @@ void ftrace_release_mod(struct module *mod) ftrace_free_rec(rec); } } while_for_each_ftrace_rec(); + out_unlock: mutex_unlock(&ftrace_lock); } @@ -3108,16 +3125,17 @@ void ftrace_kill(void) */ int register_ftrace_function(struct ftrace_ops *ops) { - int ret; - - if (unlikely(ftrace_disabled)) - return -1; + int ret = -1; mutex_lock(&ftrace_lock); + if (unlikely(ftrace_disabled)) + goto out_unlock; + ret = __register_ftrace_function(ops); ftrace_startup(0); + out_unlock: mutex_unlock(&ftrace_lock); return ret; } @@ -3145,14 +3163,14 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - int ret; - - if (unlikely(ftrace_disabled)) - return -ENODEV; + int ret = -ENODEV; mutex_lock(&ftrace_lock); - ret = proc_dointvec(table, write, buffer, lenp, ppos); + if (unlikely(ftrace_disabled)) + goto out; + + ret = proc_dointvec(table, write, buffer, lenp, ppos); if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled)) goto out; -- cgit v1.1 From d2c8c3eafbf715306ec891e7ca52d3d999acbe31 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 25 Apr 2011 14:32:42 -0400 Subject: ftrace: Remove FTRACE_FL_CONVERTED flag Since we disable all function tracer processing if we detect that a modification of a instruction had failed, we do not need to track that the record has failed. No more ftrace processing is allowed, and the FTRACE_FL_CONVERTED flag is pointless. The FTRACE_FL_CONVERTED flag was used to denote records that were successfully converted from mcount calls into nops. But if a single record fails, all of ftrace is disabled. Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 1 - kernel/trace/ftrace.c | 12 ++++-------- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 2a195ff..3204744 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -150,7 +150,6 @@ enum { FTRACE_FL_FILTER = (1 << 1), FTRACE_FL_ENABLED = (1 << 2), FTRACE_FL_NOTRACE = (1 << 3), - FTRACE_FL_CONVERTED = (1 << 4), }; struct dyn_ftrace { diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index eb19fae..9abaaf4 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1087,12 +1087,8 @@ static void ftrace_replace_code(int enable) return; do_for_each_ftrace_rec(pg, rec) { - /* - * Skip over free records, records that have - * failed and not converted. - */ - if (rec->flags & FTRACE_FL_FREE || - !(rec->flags & FTRACE_FL_CONVERTED)) + /* Skip over free records */ + if (rec->flags & FTRACE_FL_FREE) continue; failed = __ftrace_replace_code(rec, enable); @@ -1280,10 +1276,10 @@ static int ftrace_update_code(struct module *mod) */ if (!ftrace_code_disable(mod, p)) { ftrace_free_rec(p); - continue; + /* Game over */ + break; } - p->flags |= FTRACE_FL_CONVERTED; ftrace_update_cnt++; /* -- cgit v1.1 From 996e87be7f537fb34638875dd37083166c733425 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 26 Apr 2011 16:11:03 -0400 Subject: ftrace: Move record update for normal and modules into a separate function The updating of a function record is moved to a single function. This will allow us to add specific changes in one location for both modules and kernel functions. Later patches will determine if the function record itself needs to be updated (which enables the mcount caller), or just the ftrace_ops needs the update. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 9abaaf4..5b758ea 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1743,6 +1743,15 @@ static int ftrace_match(char *str, char *regex, int len, int type) return matched; } +static void +update_record(struct dyn_ftrace *rec, unsigned long flag, int not) +{ + if (not) + rec->flags &= ~flag; + else + rec->flags |= flag; +} + static int ftrace_match_record(struct dyn_ftrace *rec, char *regex, int len, int type) { @@ -1772,10 +1781,7 @@ static int ftrace_match_records(char *buff, int len, int enable) do_for_each_ftrace_rec(pg, rec) { if (ftrace_match_record(rec, search, search_len, type)) { - if (not) - rec->flags &= ~flag; - else - rec->flags |= flag; + update_record(rec, flag, not); found = 1; } /* @@ -1846,10 +1852,7 @@ static int ftrace_match_module_records(char *buff, char *mod, int enable) if (ftrace_match_module_record(rec, mod, search, search_len, type)) { - if (not) - rec->flags &= ~flag; - else - rec->flags |= flag; + update_record(rec, flag, not); found = 1; } if (enable && (rec->flags & FTRACE_FL_FILTER)) -- cgit v1.1 From 491d0dcfb9707e1f83eff93ca503eb7573162ef2 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 27 Apr 2011 21:43:36 -0400 Subject: ftrace: Consolidate updating of ftrace_trace_function There are three locations that perform almost identical functions in order to update the ftrace_trace_function (the ftrace function variable that gets called by mcount). Consolidate these into a single function called update_ftrace_function(). Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 95 ++++++++++++++++++--------------------------------- 1 file changed, 34 insertions(+), 61 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 5b758ea..33bcc71 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -151,6 +151,34 @@ static void ftrace_test_stop_func(unsigned long ip, unsigned long parent_ip) } #endif +static void update_ftrace_function(void) +{ + ftrace_func_t func; + + /* + * If there's only one function registered, then call that + * function directly. Otherwise, we need to iterate over the + * registered callers. + */ + if (ftrace_list == &ftrace_list_end || + ftrace_list->next == &ftrace_list_end) + func = ftrace_list->func; + else + func = ftrace_list_func; + + /* If we filter on pids, update to use the pid function */ + if (!list_empty(&ftrace_pids)) { + set_ftrace_pid_function(func); + func = ftrace_pid_func; + } +#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST + ftrace_trace_function = func; +#else + __ftrace_trace_function = func; + ftrace_trace_function = ftrace_test_stop_func; +#endif +} + static int __register_ftrace_function(struct ftrace_ops *ops) { ops->next = ftrace_list; @@ -162,30 +190,8 @@ static int __register_ftrace_function(struct ftrace_ops *ops) */ rcu_assign_pointer(ftrace_list, ops); - if (ftrace_enabled) { - ftrace_func_t func; - - if (ops->next == &ftrace_list_end) - func = ops->func; - else - func = ftrace_list_func; - - if (!list_empty(&ftrace_pids)) { - set_ftrace_pid_function(func); - func = ftrace_pid_func; - } - - /* - * For one func, simply call it directly. - * For more than one func, call the chain. - */ -#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST - ftrace_trace_function = func; -#else - __ftrace_trace_function = func; - ftrace_trace_function = ftrace_test_stop_func; -#endif - } + if (ftrace_enabled) + update_ftrace_function(); return 0; } @@ -213,52 +219,19 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) *p = (*p)->next; - if (ftrace_enabled) { - /* If we only have one func left, then call that directly */ - if (ftrace_list->next == &ftrace_list_end) { - ftrace_func_t func = ftrace_list->func; - - if (!list_empty(&ftrace_pids)) { - set_ftrace_pid_function(func); - func = ftrace_pid_func; - } -#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST - ftrace_trace_function = func; -#else - __ftrace_trace_function = func; -#endif - } - } + if (ftrace_enabled) + update_ftrace_function(); return 0; } static void ftrace_update_pid_func(void) { - ftrace_func_t func; - + /* Only do something if we are tracing something */ if (ftrace_trace_function == ftrace_stub) return; -#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST - func = ftrace_trace_function; -#else - func = __ftrace_trace_function; -#endif - - if (!list_empty(&ftrace_pids)) { - set_ftrace_pid_function(func); - func = ftrace_pid_func; - } else { - if (func == ftrace_pid_func) - func = ftrace_pid_function; - } - -#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST - ftrace_trace_function = func; -#else - __ftrace_trace_function = func; -#endif + update_ftrace_function(); } #ifdef CONFIG_FUNCTION_PROFILER -- cgit v1.1 From b9df92d2a94eef8811061aecb1396290df440e2e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 28 Apr 2011 20:32:08 -0400 Subject: ftrace: Consolidate the function match routines for normal and mods The code used for matching functions is almost identical between normal selecting of functions and using the :mod: feature of set_ftrace_notrace. Consolidate the two users into one function. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 98 +++++++++++++++++++-------------------------------- 1 file changed, 36 insertions(+), 62 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 33bcc71..4f19dbb 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1726,34 +1726,52 @@ update_record(struct dyn_ftrace *rec, unsigned long flag, int not) } static int -ftrace_match_record(struct dyn_ftrace *rec, char *regex, int len, int type) +ftrace_match_record(struct dyn_ftrace *rec, char *mod, + char *regex, int len, int type) { char str[KSYM_SYMBOL_LEN]; + char *modname; + + kallsyms_lookup(rec->ip, NULL, NULL, &modname, str); + + if (mod) { + /* module lookup requires matching the module */ + if (!modname || strcmp(modname, mod)) + return 0; + + /* blank search means to match all funcs in the mod */ + if (!len) + return 1; + } - kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); return ftrace_match(str, regex, len, type); } -static int ftrace_match_records(char *buff, int len, int enable) +static int match_records(char *buff, int len, char *mod, int enable, int not) { - unsigned int search_len; + unsigned search_len = 0; struct ftrace_page *pg; struct dyn_ftrace *rec; + int type = MATCH_FULL; + char *search = buff; unsigned long flag; - char *search; - int type; - int not; int found = 0; - flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; - type = filter_parse_regex(buff, len, &search, ¬); + if (len) { + type = filter_parse_regex(buff, len, &search, ¬); + search_len = strlen(search); + } - search_len = strlen(search); + flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; mutex_lock(&ftrace_lock); + + if (unlikely(ftrace_disabled)) + goto out_unlock; + do_for_each_ftrace_rec(pg, rec) { - if (ftrace_match_record(rec, search, search_len, type)) { + if (ftrace_match_record(rec, mod, search, search_len, type)) { update_record(rec, flag, not); found = 1; } @@ -1763,43 +1781,23 @@ static int ftrace_match_records(char *buff, int len, int enable) */ if (enable && (rec->flags & FTRACE_FL_FILTER)) ftrace_filtered = 1; + } while_for_each_ftrace_rec(); + out_unlock: mutex_unlock(&ftrace_lock); return found; } static int -ftrace_match_module_record(struct dyn_ftrace *rec, char *mod, - char *regex, int len, int type) +ftrace_match_records(char *buff, int len, int enable) { - char str[KSYM_SYMBOL_LEN]; - char *modname; - - kallsyms_lookup(rec->ip, NULL, NULL, &modname, str); - - if (!modname || strcmp(modname, mod)) - return 0; - - /* blank search means to match all funcs in the mod */ - if (len) - return ftrace_match(str, regex, len, type); - else - return 1; + return match_records(buff, len, NULL, enable, 0); } static int ftrace_match_module_records(char *buff, char *mod, int enable) { - unsigned search_len = 0; - struct ftrace_page *pg; - struct dyn_ftrace *rec; - int type = MATCH_FULL; - char *search = buff; - unsigned long flag; int not = 0; - int found = 0; - - flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; /* blank or '*' mean the same */ if (strcmp(buff, "*") == 0) @@ -1811,31 +1809,7 @@ static int ftrace_match_module_records(char *buff, char *mod, int enable) not = 1; } - if (strlen(buff)) { - type = filter_parse_regex(buff, strlen(buff), &search, ¬); - search_len = strlen(search); - } - - mutex_lock(&ftrace_lock); - - if (unlikely(ftrace_disabled)) - goto out_unlock; - - do_for_each_ftrace_rec(pg, rec) { - - if (ftrace_match_module_record(rec, mod, - search, search_len, type)) { - update_record(rec, flag, not); - found = 1; - } - if (enable && (rec->flags & FTRACE_FL_FILTER)) - ftrace_filtered = 1; - - } while_for_each_ftrace_rec(); - out_unlock: - mutex_unlock(&ftrace_lock); - - return found; + return match_records(buff, strlen(buff), mod, enable, not); } /* @@ -1993,7 +1967,7 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, do_for_each_ftrace_rec(pg, rec) { - if (!ftrace_match_record(rec, search, len, type)) + if (!ftrace_match_record(rec, NULL, search, len, type)) continue; entry = kmalloc(sizeof(*entry), GFP_KERNEL); @@ -2548,7 +2522,7 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer) if (rec->flags & FTRACE_FL_FREE) continue; - if (ftrace_match_record(rec, search, search_len, type)) { + if (ftrace_match_record(rec, NULL, search, search_len, type)) { /* if it is in the array */ exists = false; for (i = 0; i < *idx; i++) { -- cgit v1.1 From e8bf8df9c296b782c32236c6a5893aec301320c7 Mon Sep 17 00:00:00 2001 From: Russell King Date: Sat, 30 Apr 2011 10:14:08 +0100 Subject: CLKDEV: Fix clkdev return value for NULL clk case clkdev may incorrectly cause a clkdev entry with a NULL clk to return -ENOENT. This is not the intention of this code; -ENOENT should only be returned if the clock entry can not be found in the table. Fix this. Reported-by: Stephen Boyd Signed-off-by: Russell King --- drivers/clk/clkdev.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/drivers/clk/clkdev.c b/drivers/clk/clkdev.c index 0fc0a79..6db161f 100644 --- a/drivers/clk/clkdev.c +++ b/drivers/clk/clkdev.c @@ -32,10 +32,9 @@ static DEFINE_MUTEX(clocks_mutex); * Then we take the most specific entry - with the following * order of precedence: dev+con > dev only > con only. */ -static struct clk *clk_find(const char *dev_id, const char *con_id) +static struct clk_lookup *clk_find(const char *dev_id, const char *con_id) { - struct clk_lookup *p; - struct clk *clk = NULL; + struct clk_lookup *p, *cl = NULL; int match, best = 0; list_for_each_entry(p, &clocks, node) { @@ -52,27 +51,27 @@ static struct clk *clk_find(const char *dev_id, const char *con_id) } if (match > best) { - clk = p->clk; + cl = p; if (match != 3) best = match; else break; } } - return clk; + return cl; } struct clk *clk_get_sys(const char *dev_id, const char *con_id) { - struct clk *clk; + struct clk_lookup *cl; mutex_lock(&clocks_mutex); - clk = clk_find(dev_id, con_id); - if (clk && !__clk_get(clk)) - clk = NULL; + cl = clk_find(dev_id, con_id); + if (cl && !__clk_get(cl->clk)) + cl = NULL; mutex_unlock(&clocks_mutex); - return clk ? clk : ERR_PTR(-ENOENT); + return cl ? cl->clk : ERR_PTR(-ENOENT); } EXPORT_SYMBOL(clk_get_sys); -- cgit v1.1 From e20a2d205c05cef6b5783df339a7d54adeb50962 Mon Sep 17 00:00:00 2001 From: Boris Ostrovsky Date: Fri, 29 Apr 2011 17:47:43 -0400 Subject: x86, AMD: Fix APIC timer erratum 400 affecting K8 Rev.A-E processors Older AMD K8 processors (Revisions A-E) are affected by erratum 400 (APIC timer interrupts don't occur in C states greater than C1). This, for example, means that X86_FEATURE_ARAT flag should not be set for these parts. This addresses regression introduced by commit b87cf80af3ba4b4c008b4face3c68d604e1715c6 ("x86, AMD: Set ARAT feature on AMD processors") where the system may become unresponsive until external interrupt (such as keyboard input) occurs. This results, for example, in time not being reported correctly, lack of progress on the system and other lockups. Reported-by: Joerg-Volker Peetz Tested-by: Joerg-Volker Peetz Acked-by: Borislav Petkov Signed-off-by: Boris Ostrovsky Cc: stable@kernel.org Link: http://lkml.kernel.org/r/1304113663-6586-1-git-send-email-ostr@amd64.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/amd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 3532d3b..bb9eb29 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -698,7 +698,7 @@ cpu_dev_register(amd_cpu_dev); */ const int amd_erratum_400[] = - AMD_OSVW_ERRATUM(1, AMD_MODEL_RANGE(0xf, 0x41, 0x2, 0xff, 0xf), + AMD_OSVW_ERRATUM(1, AMD_MODEL_RANGE(0x0f, 0x4, 0x2, 0xff, 0xf), AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0xff, 0xf)); EXPORT_SYMBOL_GPL(amd_erratum_400); -- cgit v1.1 From 2be19102b71c1a45d37fec50303791daa1a06869 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 1 May 2011 19:12:04 +0200 Subject: x86, NUMA: Fix empty memblk detection in numa_cleanup_meminfo() numa_cleanup_meminfo() trims each memblk between low (0) and high (max_pfn) limits and discards empty ones. However, the emptiness detection incorrectly used equality test. If the start of a memblk is higher than max_pfn, it is empty but fails the equality test and doesn't get discarded. The condition triggers when max_pfn is lower than start of a NUMA node and results in memory misconfiguration - leading to WARN_ON()s and other funnies. The bug was discovered in devel branch where 32bit too uses this code path for NUMA init. If a node is above the addressing limit, max_pfn ends up lower than the node triggering this problem. The failure hasn't been observed on x86-64 but is still possible with broken hardware e820/NUMA info. As the fix is very low risk, it would be better to apply it even for 64bit. Fix it by using >= instead of ==. Signed-off-by: Yinghai Lu [ Extracted the actual fix from the original patch and rewrote patch description. ] Signed-off-by: Tejun Heo Link: http://lkml.kernel.org/r/20110501171204.GO29280@htj.dyndns.org Signed-off-by: Ingo Molnar --- arch/x86/mm/numa_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index e8c00cc..85b52fc 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -306,7 +306,7 @@ int __init numa_cleanup_meminfo(struct numa_meminfo *mi) bi->end = min(bi->end, high); /* and there's no empty block */ - if (bi->start == bi->end) { + if (bi->start >= bi->end) { numa_remove_memblk_from(i--, mi); continue; } -- cgit v1.1 From 9de4966a4d218f29c68e96e8e7b4d2840dedec79 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Sun, 1 May 2011 14:09:21 +0200 Subject: x86: Fix spelling error in the memcpy() source code comment Signed-off-by: Bart Van Assche Cc: "H. Peter Anvin" Link: http://lkml.kernel.org/r/201105011409.21629.bvanassche@acm.org Signed-off-by: Ingo Molnar --- arch/x86/lib/memcpy_64.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S index 75ef61e..2a560bb 100644 --- a/arch/x86/lib/memcpy_64.S +++ b/arch/x86/lib/memcpy_64.S @@ -49,7 +49,7 @@ ENTRY(memcpy) jb .Lhandle_tail /* - * We check whether memory false dependece could occur, + * We check whether memory false dependence could occur, * then jump to corresponding copy mode. */ cmp %dil, %sil -- cgit v1.1 From a05d2ad1c1f391c7f514a1d1e09b5417968a7d07 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 24 Apr 2011 01:54:57 +0000 Subject: af_unix: Only allow recv on connected seqpacket sockets. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This fixes the following oops discovered by Dan Aloni: > Anyway, the following is the output of the Oops that I got on the > Ubuntu kernel on which I first detected the problem > (2.6.37-12-generic). The Oops that followed will be more useful, I > guess. >[ 5594.669852] BUG: unable to handle kernel NULL pointer dereference > at           (null) > [ 5594.681606] IP: [] unix_dgram_recvmsg+0x1fb/0x420 > [ 5594.687576] PGD 2a05d067 PUD 2b951067 PMD 0 > [ 5594.693720] Oops: 0002 [#1] SMP > [ 5594.699888] last sysfs file: The bug was that unix domain sockets use a pseduo packet for connecting and accept uses that psudo packet to get the socket. In the buggy seqpacket case we were allowing unconnected sockets to call recvmsg and try to receive the pseudo packet. That is always wrong and as of commit 7361c36c5 the pseudo packet had become enough different from a normal packet that the kernel started oopsing. Do for seqpacket_recv what was done for seqpacket_send in 2.5 and only allow it on connected seqpacket sockets. Cc: stable@kernel.org Tested-by: Dan Aloni Signed-off-by: Eric W. Biederman Signed-off-by: David S. Miller --- net/unix/af_unix.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 3a43a83..b1d75be 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -524,6 +524,8 @@ static int unix_dgram_connect(struct socket *, struct sockaddr *, int, int); static int unix_seqpacket_sendmsg(struct kiocb *, struct socket *, struct msghdr *, size_t); +static int unix_seqpacket_recvmsg(struct kiocb *, struct socket *, + struct msghdr *, size_t, int); static const struct proto_ops unix_stream_ops = { .family = PF_UNIX, @@ -583,7 +585,7 @@ static const struct proto_ops unix_seqpacket_ops = { .setsockopt = sock_no_setsockopt, .getsockopt = sock_no_getsockopt, .sendmsg = unix_seqpacket_sendmsg, - .recvmsg = unix_dgram_recvmsg, + .recvmsg = unix_seqpacket_recvmsg, .mmap = sock_no_mmap, .sendpage = sock_no_sendpage, }; @@ -1699,6 +1701,18 @@ static int unix_seqpacket_sendmsg(struct kiocb *kiocb, struct socket *sock, return unix_dgram_sendmsg(kiocb, sock, msg, len); } +static int unix_seqpacket_recvmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t size, + int flags) +{ + struct sock *sk = sock->sk; + + if (sk->sk_state != TCP_ESTABLISHED) + return -ENOTCONN; + + return unix_dgram_recvmsg(iocb, sock, msg, size, flags); +} + static void unix_copy_addr(struct msghdr *msg, struct sock *sk) { struct unix_sock *u = unix_sk(sk); -- cgit v1.1 From 7cfd260910b881250cde76ba92ebe3cbf8493a8f Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Sun, 1 May 2011 02:04:11 +0000 Subject: ipv4: don't spam dmesg with "Using LC-trie" messages fib_trie_table() is called during netns creation and Chromium uses clone(CLONE_NEWNET) to sandbox renderer process. Don't print anything. Signed-off-by: Alexey Dobriyan Signed-off-by: David S. Miller --- net/ipv4/fib_trie.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index e9013d6..5fe9b8b 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1978,9 +1978,6 @@ struct fib_table *fib_trie_table(u32 id) t = (struct trie *) tb->tb_data; memset(t, 0, sizeof(*t)); - if (id == RT_TABLE_LOCAL) - pr_info("IPv4 FIB: Using LC-trie version %s\n", VERSION); - return tb; } -- cgit v1.1 From 383cf4e8d47f902600263191f8f167379c376985 Mon Sep 17 00:00:00 2001 From: Jarkko Nikula Date: Wed, 27 Apr 2011 17:02:37 +0300 Subject: usb: musb: omap2430: Fix retention idle on musb peripheral only boards Recent runtime pm and hwmod conversions for 2.6.39 broke the musb peripheral mode OMAP retention idle on boards where the board mode in struct musb_hdrc_platform_data is set to MUSB_PERIPHERAL. These conversions changed the way how the OTG_SYSCONFIG register is configured and used in runtime. Before 2.6.39 smart standby/idle modes were activated statically in OTG_SYSCONFIG. Those modes allow that the musb is able to idle when peripheral device is not connected to host. In 2.6.39 the OTG_SYSCONFIG is updated runtime depending on VBUS status. No standby/idle modes are used when device is connected and force standby/idle when disconnected. Unfortunately VBUS disconnect event that handles the disconnect case lets the peripheral musb to idle only when board mode is MUSB_OTG. Fix this by checking the peripheral mode also. Signed-off-by: Jarkko Nikula Signed-off-by: Felipe Balbi --- drivers/usb/musb/omap2430.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/musb/omap2430.c b/drivers/usb/musb/omap2430.c index 57a27fa..e9e60b6 100644 --- a/drivers/usb/musb/omap2430.c +++ b/drivers/usb/musb/omap2430.c @@ -270,7 +270,7 @@ static int musb_otg_notifications(struct notifier_block *nb, DBG(4, "VBUS Disconnect\n"); #ifdef CONFIG_USB_GADGET_MUSB_HDRC - if (is_otg_enabled(musb)) + if (is_otg_enabled(musb) || is_peripheral_enabled(musb)) if (musb->gadget_driver) #endif { -- cgit v1.1 From cdefce169594742ace29a2016dfa381755428ab5 Mon Sep 17 00:00:00 2001 From: Jarkko Nikula Date: Fri, 29 Apr 2011 16:17:35 +0300 Subject: usb: musb: gadget: Fix out-of-sync runtime pm calls If cable is not connected to peripheral only board when initializing the gadget driver, then runtime pm calls are out-of-sync and the musb cannot idle with omap2430.c. This was noted on Nokia N900 where musb prevented the CPU to be able to enter deeper retention idle state. This was working in 2.6.38 before runtime pm conversions but there musb smart standby/idle modes were configured statically where they are now updated runtime depending on use and cable status. Reason for out-of-sync is that runtime pm is activated in function musb_gadget.c: usb_gadget_probe_driver but suspended only in OTG mode if cable is not connected when initializing. In peripheral only mode this leads to out-of-sync runtime pm since runtime pm remain active and is activated another time in omap2430.c: musb_otg_notifications for VBUS Connect event and thus cannot suspend for VBUS Disconnect event since the use count remains active. Fix this by moving cable status check and pm_runtime_put call in usb_gadget_probe_driver out of is_otg_enabled block. Signed-off-by: Jarkko Nikula Signed-off-by: Felipe Balbi --- drivers/usb/musb/musb_gadget.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/usb/musb/musb_gadget.c b/drivers/usb/musb/musb_gadget.c index 6dfbf9f..f47c201 100644 --- a/drivers/usb/musb/musb_gadget.c +++ b/drivers/usb/musb/musb_gadget.c @@ -1887,11 +1887,9 @@ int usb_gadget_probe_driver(struct usb_gadget_driver *driver, otg_set_vbus(musb->xceiv, 1); hcd->self.uses_pio_for_control = 1; - - if (musb->xceiv->last_event == USB_EVENT_NONE) - pm_runtime_put(musb->controller); - } + if (musb->xceiv->last_event == USB_EVENT_NONE) + pm_runtime_put(musb->controller); return 0; -- cgit v1.1 From d11536e4e0e99c26d33c849b44cd279cdd67b032 Mon Sep 17 00:00:00 2001 From: Axel Lin Date: Thu, 21 Apr 2011 19:52:41 +0530 Subject: mfd: Fix usbhs_enable error handling In the case of missing platform_data we do not hold a spin_lock, thus we should not call spin_unlock_irqrestore in the error path. Also simplify the error handling by separating the successful path from error path. I think this change improves readability. Signed-off-by: Axel Lin Tested-by: Steve Calfee Signed-off-by: Felipe Balbi --- drivers/mfd/omap-usb-host.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/mfd/omap-usb-host.c b/drivers/mfd/omap-usb-host.c index 53450f4..b3bb3ac 100644 --- a/drivers/mfd/omap-usb-host.c +++ b/drivers/mfd/omap-usb-host.c @@ -700,8 +700,7 @@ static int usbhs_enable(struct device *dev) dev_dbg(dev, "starting TI HSUSB Controller\n"); if (!pdata) { dev_dbg(dev, "missing platform_data\n"); - ret = -ENODEV; - goto end_enable; + return -ENODEV; } spin_lock_irqsave(&omap->lock, flags); @@ -915,7 +914,8 @@ static int usbhs_enable(struct device *dev) end_count: omap->count++; - goto end_enable; + spin_unlock_irqrestore(&omap->lock, flags); + return 0; err_tll: if (pdata->ehci_data->phy_reset) { @@ -931,8 +931,6 @@ err_tll: clk_disable(omap->usbhost_fs_fck); clk_disable(omap->usbhost_hs_fck); clk_disable(omap->usbhost_ick); - -end_enable: spin_unlock_irqrestore(&omap->lock, flags); return ret; } -- cgit v1.1 From 9688678a6670c7f0ae3872450a8047c0ad401efb Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 2 May 2011 14:18:51 +0200 Subject: x86-64, NUMA: Simplify hotadd memory handling The only special handling NUMA needs to do for hotadd memory is determining the node for the hotadd memory given the address of it and there's nothing specific to specific config method used. srat_64.c does somewhat elaborate error checking on ACPI_SRAT_MEM_HOT_PLUGGABLE regions, remembers them and implements memory_add_physaddr_to_nid() which determines the node for given hotadd address. This is almost completely redundant. All the information is already available to the generic NUMA code which already performs all the sanity checking and merging. All that's necessary is not using __initdata from numa_meminfo and providing a function which uses it to map address to node. Drop the specific implementation from srat_64.c and add generic memory_add_physaddr_to_nid() in numa_64.c, which is enabled if CONFIG_MEMORY_HOTPLUG is set. Other than dropping the code, srat_64.c doesn't need any change as it already calls numa_add_memblk() for hot pluggable regions which is enough. While at it, change CONFIG_MEMORY_HOTPLUG_SPARSE in srat_64.c to CONFIG_MEMORY_HOTPLUG, for NUMA on x86-64, the two are always the same. Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: Yinghai Lu Cc: David Rientjes Cc: Thomas Gleixner Cc: "H. Peter Anvin" --- arch/x86/mm/init_64.c | 8 ------ arch/x86/mm/numa_64.c | 22 ++++++++++++++- arch/x86/mm/srat_64.c | 78 +-------------------------------------------------- 3 files changed, 22 insertions(+), 86 deletions(-) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 7942335..0404bb3 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -679,14 +679,6 @@ int arch_add_memory(int nid, u64 start, u64 size) } EXPORT_SYMBOL_GPL(arch_add_memory); -#if !defined(CONFIG_ACPI_NUMA) && defined(CONFIG_NUMA) -int memory_add_physaddr_to_nid(u64 start) -{ - return 0; -} -EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); -#endif - #endif /* CONFIG_MEMORY_HOTPLUG */ static struct kcore_list kcore_vsyscall; diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index a96767c..4057b5d 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -28,7 +28,12 @@ EXPORT_SYMBOL(node_data); nodemask_t numa_nodes_parsed __initdata; -static struct numa_meminfo numa_meminfo __initdata; +static struct numa_meminfo numa_meminfo +#ifndef CONFIG_MEMORY_HOTPLUG +__initdata +#endif +; + static int numa_distance_cnt; static u8 *numa_distance; @@ -540,3 +545,18 @@ int __cpuinit numa_cpu_node(int cpu) return __apicid_to_node[apicid]; return NUMA_NO_NODE; } + +#ifdef CONFIG_MEMORY_HOTPLUG +int memory_add_physaddr_to_nid(u64 start) +{ + struct numa_meminfo *mi = &numa_meminfo; + int nid = mi->blk[0].nid; + int i; + + for (i = 0; i < mi->nr_blks; i++) + if (mi->blk[i].start <= start && mi->blk[i].end > start) + nid = mi->blk[i].nid; + return nid; +} +EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); +#endif diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index 8e9d339..9994d2c 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -26,8 +26,6 @@ int acpi_numa __initdata; -static struct bootnode nodes_add[MAX_NUMNODES]; - static __init int setup_node(int pxm) { return acpi_map_pxm_to_node(pxm); @@ -37,7 +35,6 @@ static __init void bad_srat(void) { printk(KERN_ERR "SRAT: SRAT not used.\n"); acpi_numa = -1; - memset(nodes_add, 0, sizeof(nodes_add)); } static __init inline int srat_disabled(void) @@ -131,67 +128,11 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) pxm, apic_id, node); } -#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE +#ifdef CONFIG_MEMORY_HOTPLUG static inline int save_add_info(void) {return 1;} #else static inline int save_add_info(void) {return 0;} #endif -/* - * Update nodes_add[] - * This code supports one contiguous hot add area per node - */ -static void __init -update_nodes_add(int node, unsigned long start, unsigned long end) -{ - unsigned long s_pfn = start >> PAGE_SHIFT; - unsigned long e_pfn = end >> PAGE_SHIFT; - int changed = 0; - struct bootnode *nd = &nodes_add[node]; - - /* I had some trouble with strange memory hotadd regions breaking - the boot. Be very strict here and reject anything unexpected. - If you want working memory hotadd write correct SRATs. - - The node size check is a basic sanity check to guard against - mistakes */ - if ((signed long)(end - start) < NODE_MIN_SIZE) { - printk(KERN_ERR "SRAT: Hotplug area too small\n"); - return; - } - - /* This check might be a bit too strict, but I'm keeping it for now. */ - if (absent_pages_in_range(s_pfn, e_pfn) != e_pfn - s_pfn) { - printk(KERN_ERR - "SRAT: Hotplug area %lu -> %lu has existing memory\n", - s_pfn, e_pfn); - return; - } - - /* Looks good */ - - if (nd->start == nd->end) { - nd->start = start; - nd->end = end; - changed = 1; - } else { - if (nd->start == end) { - nd->start = start; - changed = 1; - } - if (nd->end == start) { - nd->end = end; - changed = 1; - } - if (!changed) - printk(KERN_ERR "SRAT: Hotplug zone not continuous. Partly ignored\n"); - } - - if (changed) { - node_set(node, numa_nodes_parsed); - printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n", - nd->start, nd->end); - } -} /* Callback for parsing of the Proximity Domain <-> Memory Area mappings */ void __init @@ -228,9 +169,6 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) printk(KERN_INFO "SRAT: Node %u PXM %u %lx-%lx\n", node, pxm, start, end); - - if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) - update_nodes_add(node, start, end); } void __init acpi_numa_arch_fixup(void) {} @@ -244,17 +182,3 @@ int __init x86_acpi_numa_init(void) return ret; return srat_disabled() ? -EINVAL : 0; } - -#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) || defined(CONFIG_ACPI_HOTPLUG_MEMORY) -int memory_add_physaddr_to_nid(u64 start) -{ - int i, ret = 0; - - for_each_node(i) - if (nodes_add[i].start <= start && nodes_add[i].end > start) - ret = i; - - return ret; -} -EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); -#endif -- cgit v1.1 From ebe685f24eeb85fbdb0f33792f1dabdbf35eff38 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 2 May 2011 14:18:51 +0200 Subject: x86-64, NUMA: trivial cleanups for setup_node_bootmem() Make the following trivial changes in preparation for further updates. * nodeid -> nid, nid -> tnid * use nd_ prefix for nodedata related variables * remove start/end_pfn and use start/end directly Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: Yinghai Lu Cc: David Rientjes Cc: Thomas Gleixner Cc: "H. Peter Anvin" --- arch/x86/mm/numa_64.c | 52 +++++++++++++++++++++++---------------------------- 1 file changed, 23 insertions(+), 29 deletions(-) diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 4057b5d..8043d5e 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -128,14 +128,11 @@ int __init numa_add_memblk(int nid, u64 start, u64 end) /* Initialize bootmem allocator for a node */ void __init -setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) +setup_node_bootmem(int nid, unsigned long start, unsigned long end) { - unsigned long start_pfn, last_pfn, nodedata_phys; - const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE); - int nid; - - if (!end) - return; + const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE); + unsigned long nd_pa; + int tnid; /* * Don't confuse VM with a node that doesn't have the @@ -146,30 +143,27 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) start = roundup(start, ZONE_ALIGN); - printk(KERN_INFO "Initmem setup node %d %016lx-%016lx\n", nodeid, - start, end); - - start_pfn = start >> PAGE_SHIFT; - last_pfn = end >> PAGE_SHIFT; + printk(KERN_INFO "Initmem setup node %d %016lx-%016lx\n", + nid, start, end); - node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size, - SMP_CACHE_BYTES); - if (node_data[nodeid] == NULL) + node_data[nid] = early_node_mem(nid, start, end, nd_size, + SMP_CACHE_BYTES); + if (node_data[nid] == NULL) return; - nodedata_phys = __pa(node_data[nodeid]); - memblock_x86_reserve_range(nodedata_phys, nodedata_phys + pgdat_size, "NODE_DATA"); - printk(KERN_INFO " NODE_DATA [%016lx - %016lx]\n", nodedata_phys, - nodedata_phys + pgdat_size - 1); - nid = early_pfn_to_nid(nodedata_phys >> PAGE_SHIFT); - if (nid != nodeid) - printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nodeid, nid); - - memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t)); - NODE_DATA(nodeid)->node_id = nodeid; - NODE_DATA(nodeid)->node_start_pfn = start_pfn; - NODE_DATA(nodeid)->node_spanned_pages = last_pfn - start_pfn; - - node_set_online(nodeid); + nd_pa = __pa(node_data[nid]); + memblock_x86_reserve_range(nd_pa, nd_pa + nd_size, "NODE_DATA"); + printk(KERN_INFO " NODE_DATA [%016lx - %016lx]\n", + nd_pa, nd_pa + nd_size - 1); + tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT); + if (tnid != nid) + printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nid, tnid); + + memset(NODE_DATA(nid), 0, sizeof(pg_data_t)); + NODE_DATA(nid)->node_id = nid; + NODE_DATA(nid)->node_start_pfn = start >> PAGE_SHIFT; + NODE_DATA(nid)->node_spanned_pages = (end - start) >> PAGE_SHIFT; + + node_set_online(nid); } /** -- cgit v1.1 From acd26d611e60c1a7c2a14269ab99760f779121f4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 2 May 2011 14:18:51 +0200 Subject: x86-64, NUMA: simplify nodedata allocation With top-down memblock allocation, the allocation range limits in ealry_node_mem() can be simplified - try node-local first, then any node but in any case don't allocate below DMA limit. Remove early_node_mem() and implement simplified allocation directly in setup_node_bootmem(). Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: Yinghai Lu Cc: David Rientjes Cc: Thomas Gleixner Cc: "H. Peter Anvin" --- arch/x86/mm/numa_64.c | 53 +++++++++++++++++---------------------------------- 1 file changed, 17 insertions(+), 36 deletions(-) diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 8043d5e..b4fd25e 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -37,38 +37,6 @@ __initdata static int numa_distance_cnt; static u8 *numa_distance; -static void * __init early_node_mem(int nodeid, unsigned long start, - unsigned long end, unsigned long size, - unsigned long align) -{ - unsigned long mem; - - /* - * put it on high as possible - * something will go with NODE_DATA - */ - if (start < (MAX_DMA_PFN< (MAX_DMA32_PFN<> PAGE_SHIFT); if (tnid != nid) printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nid, tnid); + node_data[nid] = __va(nd_pa); memset(NODE_DATA(nid), 0, sizeof(pg_data_t)); NODE_DATA(nid)->node_id = nid; NODE_DATA(nid)->node_start_pfn = start >> PAGE_SHIFT; -- cgit v1.1 From c4b90c11992e61123071977c0e5556e59a70852c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 2 May 2011 14:18:52 +0200 Subject: x86-32, NUMA: Automatically set apicid -> node in setup_local_APIC() Some x86-32 NUMA implementations (NUMAQ) don't initialize apicid -> node mapping using set_apicid_to_node() during NUMA init but implement custom apic->x86_32_numa_cpu_node() instead. This patch automatically initializes the default apic -> node mapping table from apic->x86_32_numa_cpu_node() from setup_local_APIC() such that the mapping table is in sync with the actual mapping. As the table isn't used by custom implementations, this doesn't make any difference at this point. This is in preparation of unifying numa_cpu_node() between x86-32 and 64. Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: Yinghai Lu Cc: David Rientjes Cc: Thomas Gleixner Cc: "H. Peter Anvin" --- arch/x86/kernel/apic/apic.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 2bc503b..a6cd02a 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1237,6 +1237,16 @@ void __cpuinit setup_local_APIC(void) /* always use the value from LDR */ early_per_cpu(x86_cpu_to_logical_apicid, cpu) = logical_smp_processor_id(); + + /* + * Some NUMA implementations (NUMAQ) don't initialize apicid to + * node mapping during NUMA init. Now that logical apicid is + * guaranteed to be known, give it another chance. This is already + * a bit too late - percpu allocation has already happened without + * proper NUMA affinity. + */ + set_apicid_to_node(early_per_cpu(x86_cpu_to_apicid, cpu), + apic->x86_32_numa_cpu_node(cpu)); #endif /* -- cgit v1.1 From 6bd262731bf7559bab8c749786e8652e2df1fb4e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 2 May 2011 14:18:52 +0200 Subject: x86, NUMA: Unify 32/64bit numa_cpu_node() implementation Currently, the only meaningful user of apic->x86_32_numa_cpu_node() is NUMAQ which returns valid mapping only after CPU is initialized during SMP bringup; thus, the previous patch to set apicid -> node in setup_local_APIC() makes __apicid_to_node[] always contain the correct mapping whether custom apic->x86_32_numa_cpu_node() is used or not. So, there is no reason to keep separate 32bit implementation. We can always consult __apicid_to_node[]. Move 64bit implementation from numa_64.c to numa.c and remove 32bit implementation from numa_32.c. Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: Yinghai Lu Cc: David Rientjes Cc: Thomas Gleixner Cc: "H. Peter Anvin" --- arch/x86/include/asm/numa.h | 10 ++++++++++ arch/x86/include/asm/numa_32.h | 6 ------ arch/x86/include/asm/numa_64.h | 3 --- arch/x86/mm/numa.c | 9 +++++++++ arch/x86/mm/numa_32.c | 5 ----- arch/x86/mm/numa_64.c | 9 --------- 6 files changed, 19 insertions(+), 23 deletions(-) diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h index a50fc9f..5982d41 100644 --- a/arch/x86/include/asm/numa.h +++ b/arch/x86/include/asm/numa.h @@ -1,6 +1,8 @@ #ifndef _ASM_X86_NUMA_H #define _ASM_X86_NUMA_H +#include + #include #include @@ -22,10 +24,18 @@ static inline void set_apicid_to_node(int apicid, s16 node) { __apicid_to_node[apicid] = node; } + +extern int __cpuinit numa_cpu_node(int cpu); + #else /* CONFIG_NUMA */ static inline void set_apicid_to_node(int apicid, s16 node) { } + +static inline int numa_cpu_node(int cpu) +{ + return NUMA_NO_NODE; +} #endif /* CONFIG_NUMA */ #ifdef CONFIG_X86_32 diff --git a/arch/x86/include/asm/numa_32.h b/arch/x86/include/asm/numa_32.h index c6beed1e..242522f 100644 --- a/arch/x86/include/asm/numa_32.h +++ b/arch/x86/include/asm/numa_32.h @@ -5,12 +5,6 @@ extern int numa_off; extern int pxm_to_nid(int pxm); -#ifdef CONFIG_NUMA -extern int __cpuinit numa_cpu_node(int cpu); -#else /* CONFIG_NUMA */ -static inline int numa_cpu_node(int cpu) { return NUMA_NO_NODE; } -#endif /* CONFIG_NUMA */ - #ifdef CONFIG_HIGHMEM extern void set_highmem_pages_init(void); #else diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h index 344eb17..12461eb 100644 --- a/arch/x86/include/asm/numa_64.h +++ b/arch/x86/include/asm/numa_64.h @@ -26,7 +26,6 @@ extern void setup_node_bootmem(int nodeid, unsigned long start, extern nodemask_t numa_nodes_parsed __initdata; -extern int __cpuinit numa_cpu_node(int cpu); extern int __init numa_add_memblk(int nodeid, u64 start, u64 end); extern void __init numa_set_distance(int from, int to, int distance); @@ -35,8 +34,6 @@ extern void __init numa_set_distance(int from, int to, int distance); #define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL)) void numa_emu_cmdline(char *); #endif /* CONFIG_NUMA_EMU */ -#else -static inline int numa_cpu_node(int cpu) { return NUMA_NO_NODE; } #endif #endif /* _ASM_X86_NUMA_64_H */ diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 745258d..e9005c4 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -32,6 +32,15 @@ s16 __apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE }; +int __cpuinit numa_cpu_node(int cpu) +{ + int apicid = early_per_cpu(x86_cpu_to_apicid, cpu); + + if (apicid != BAD_APICID) + return __apicid_to_node[apicid]; + return NUMA_NO_NODE; +} + cpumask_var_t node_to_cpumask_map[MAX_NUMNODES]; EXPORT_SYMBOL(node_to_cpumask_map); diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index c757c0a..e0d9716 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -107,11 +107,6 @@ extern unsigned long highend_pfn, highstart_pfn; static void *node_remap_start_vaddr[MAX_NUMNODES]; void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); -int __cpuinit numa_cpu_node(int cpu) -{ - return apic->x86_32_numa_cpu_node(cpu); -} - /* * FLAT - support for basic PC memory model with discontig enabled, essentially * a single node with all available processors in it with a flat diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index b4fd25e..7f83ade 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -512,15 +512,6 @@ unsigned long __init numa_free_all_bootmem(void) return pages; } -int __cpuinit numa_cpu_node(int cpu) -{ - int apicid = early_per_cpu(x86_cpu_to_apicid, cpu); - - if (apicid != BAD_APICID) - return __apicid_to_node[apicid]; - return NUMA_NO_NODE; -} - #ifdef CONFIG_MEMORY_HOTPLUG int memory_add_physaddr_to_nid(u64 start) { -- cgit v1.1 From 84914ed0ec6787d38e84b510f92ad4ca3a572fd8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 2 May 2011 14:18:52 +0200 Subject: x86-32, NUMA: Make apic->x86_32_numa_cpu_node() optional NUMAQ is the only meaningful user of this callback and setup_local_APIC() the only callsite. Stop torturing everyone else by making the callback optional and removing all the boilerplate implementations and assignments. Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: Yinghai Lu Cc: David Rientjes Cc: Thomas Gleixner Cc: "H. Peter Anvin" --- arch/x86/include/asm/apic.h | 9 ++++++--- arch/x86/kernel/apic/apic.c | 20 +++----------------- arch/x86/kernel/apic/apic_noop.c | 9 --------- arch/x86/kernel/apic/bigsmp_32.c | 1 - arch/x86/kernel/apic/es7000_32.c | 7 ------- arch/x86/kernel/apic/probe_32.c | 1 - arch/x86/kernel/apic/summit_32.c | 1 - 7 files changed, 9 insertions(+), 39 deletions(-) diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 2b7d573..a0c46f0 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -363,7 +363,12 @@ struct apic { */ int (*x86_32_early_logical_apicid)(int cpu); - /* determine CPU -> NUMA node mapping */ + /* + * Optional method called from setup_local_APIC() after logical + * apicid is guaranteed to be known to initialize apicid -> node + * mapping if NUMA initialization hasn't done so already. Don't + * add new users. + */ int (*x86_32_numa_cpu_node)(int cpu); #endif }; @@ -537,8 +542,6 @@ static inline int default_phys_pkg_id(int cpuid_apic, int index_msb) return cpuid_apic >> index_msb; } -extern int default_x86_32_numa_cpu_node(int cpu); - #endif static inline unsigned int diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index a6cd02a..0c67b4f 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1245,8 +1245,9 @@ void __cpuinit setup_local_APIC(void) * a bit too late - percpu allocation has already happened without * proper NUMA affinity. */ - set_apicid_to_node(early_per_cpu(x86_cpu_to_apicid, cpu), - apic->x86_32_numa_cpu_node(cpu)); + if (apic->x86_32_numa_cpu_node) + set_apicid_to_node(early_per_cpu(x86_cpu_to_apicid, cpu), + apic->x86_32_numa_cpu_node(cpu)); #endif /* @@ -2013,21 +2014,6 @@ void default_init_apic_ldr(void) apic_write(APIC_LDR, val); } -#ifdef CONFIG_X86_32 -int default_x86_32_numa_cpu_node(int cpu) -{ -#ifdef CONFIG_NUMA - int apicid = early_per_cpu(x86_cpu_to_apicid, cpu); - - if (apicid != BAD_APICID) - return __apicid_to_node[apicid]; - return NUMA_NO_NODE; -#else - return 0; -#endif -} -#endif - /* * Power management */ diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index f1baa2d..775b82b 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -119,14 +119,6 @@ static void noop_apic_write(u32 reg, u32 v) WARN_ON_ONCE(cpu_has_apic && !disable_apic); } -#ifdef CONFIG_X86_32 -static int noop_x86_32_numa_cpu_node(int cpu) -{ - /* we're always on node 0 */ - return 0; -} -#endif - struct apic apic_noop = { .name = "noop", .probe = noop_probe, @@ -195,6 +187,5 @@ struct apic apic_noop = { #ifdef CONFIG_X86_32 .x86_32_early_logical_apicid = noop_x86_32_early_logical_apicid, - .x86_32_numa_cpu_node = noop_x86_32_numa_cpu_node, #endif }; diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index 541a2e4..d84ac5a5 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -253,5 +253,4 @@ struct apic apic_bigsmp = { .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, .x86_32_early_logical_apicid = bigsmp_early_logical_apicid, - .x86_32_numa_cpu_node = default_x86_32_numa_cpu_node, }; diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c index 3e9de48..70533de 100644 --- a/arch/x86/kernel/apic/es7000_32.c +++ b/arch/x86/kernel/apic/es7000_32.c @@ -510,11 +510,6 @@ static void es7000_setup_apic_routing(void) nr_ioapics, cpumask_bits(es7000_target_cpus())[0]); } -static int es7000_numa_cpu_node(int cpu) -{ - return 0; -} - static int es7000_cpu_present_to_apicid(int mps_cpu) { if (!mps_cpu) @@ -688,7 +683,6 @@ struct apic __refdata apic_es7000_cluster = { .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, .x86_32_early_logical_apicid = es7000_early_logical_apicid, - .x86_32_numa_cpu_node = es7000_numa_cpu_node, }; struct apic __refdata apic_es7000 = { @@ -752,5 +746,4 @@ struct apic __refdata apic_es7000 = { .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, .x86_32_early_logical_apicid = es7000_early_logical_apicid, - .x86_32_numa_cpu_node = es7000_numa_cpu_node, }; diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index fc84c7b..6541e47 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -172,7 +172,6 @@ struct apic apic_default = { .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, .x86_32_early_logical_apicid = default_x86_32_early_logical_apicid, - .x86_32_numa_cpu_node = default_x86_32_numa_cpu_node, }; extern struct apic apic_numaq; diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c index e4b8059..35bcd7d 100644 --- a/arch/x86/kernel/apic/summit_32.c +++ b/arch/x86/kernel/apic/summit_32.c @@ -551,5 +551,4 @@ struct apic apic_summit = { .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, .x86_32_early_logical_apicid = summit_early_logical_apicid, - .x86_32_numa_cpu_node = default_x86_32_numa_cpu_node, }; -- cgit v1.1 From 797390d8554b1e07aabea37d0140933b0412dba0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 2 May 2011 14:18:52 +0200 Subject: x86-32, NUMA: use sparse_memory_present_with_active_regions() Instead of calling memory_present() for each region from NUMA init, call sparse_memory_present_with_active_regions() from paging_init() similarly to x86-64. For flat and numaq, this results in exactly the same memory_present() calls. For srat, if there are multiple memory chunks for a node, after this change, memory_present() will be called separately for each chunk instead of being called once to encompass the whole range, which doesn't cause any harm and actually is the better behavior. Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: Yinghai Lu Cc: David Rientjes Cc: Thomas Gleixner Cc: "H. Peter Anvin" --- arch/x86/kernel/apic/numaq_32.c | 2 -- arch/x86/mm/init_32.c | 1 + arch/x86/mm/numa_32.c | 1 - arch/x86/mm/srat_32.c | 8 +------- 4 files changed, 2 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index 0aced70..41b8b29 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -91,8 +91,6 @@ static inline void numaq_register_node(int node, struct sys_cfg_data *scd) memblock_x86_register_active_regions(node, node_start_pfn[node], node_end_pfn[node]); - - memory_present(node, node_start_pfn[node], node_end_pfn[node]); } /* diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 80088f9..2cde0a34 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -716,6 +716,7 @@ void __init paging_init(void) * NOTE: at this point the bootmem allocator is fully available. */ olpc_dt_build_devicetree(); + sparse_memory_present_with_active_regions(MAX_NUMNODES); sparse_init(); zone_sizes_init(); } diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index e0d9716..f847fa1 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -119,7 +119,6 @@ int __init get_memcfg_numa_flat(void) node_start_pfn[0] = 0; node_end_pfn[0] = max_pfn; memblock_x86_register_active_regions(0, 0, max_pfn); - memory_present(0, 0, max_pfn); /* Indicate there is one node available. */ nodes_clear(node_online_map); diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c index ae20046..6b9bfd7 100644 --- a/arch/x86/mm/srat_32.c +++ b/arch/x86/mm/srat_32.c @@ -209,7 +209,7 @@ static __init int node_read_chunk(int nid, struct node_memory_chunk_s *memory_ch int __init get_memcfg_from_srat(void) { - int i, j, nid; + int i, j; if (srat_disabled()) goto out_fail; @@ -273,12 +273,6 @@ int __init get_memcfg_from_srat(void) /* for out of order entries in SRAT */ sort_node_map(); - for_each_online_node(nid) { - unsigned long start = node_start_pfn[nid]; - unsigned long end = min(node_end_pfn[nid], max_pfn); - - memory_present(nid, start, end); - } return 1; out_fail: printk(KERN_DEBUG "failed to get NUMA memory information from SRAT" -- cgit v1.1 From 1201e10a092adc9c88a6ce5f27740cc5cd0d26e5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 2 May 2011 14:18:52 +0200 Subject: x86, NUMA: trivial cleanups * Kill no longer used struct bootnode. * Kill dangling declaration of pxm_to_nid() in numa_32.h. * Make setup_node_bootmem() static. Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: Yinghai Lu Cc: David Rientjes Cc: Thomas Gleixner Cc: "H. Peter Anvin" --- arch/x86/include/asm/acpi.h | 2 -- arch/x86/include/asm/amd_nb.h | 1 - arch/x86/include/asm/numa_32.h | 2 -- arch/x86/include/asm/numa_64.h | 7 ------- arch/x86/mm/numa_64.c | 2 +- 5 files changed, 1 insertion(+), 13 deletions(-) diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index 12e0e7d..416d865 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -183,8 +183,6 @@ static inline void disable_acpi(void) { } #define ARCH_HAS_POWER_INIT 1 -struct bootnode; - #ifdef CONFIG_ACPI_NUMA extern int acpi_numa; extern int x86_acpi_numa_init(void); diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h index 3316822..67f87f2 100644 --- a/arch/x86/include/asm/amd_nb.h +++ b/arch/x86/include/asm/amd_nb.h @@ -11,7 +11,6 @@ struct amd_nb_bus_dev_range { extern const struct pci_device_id amd_nb_misc_ids[]; extern const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[]; -struct bootnode; extern bool early_is_amd_nb(u32 value); extern int amd_cache_northbridges(void); diff --git a/arch/x86/include/asm/numa_32.h b/arch/x86/include/asm/numa_32.h index 242522f..7e54b64 100644 --- a/arch/x86/include/asm/numa_32.h +++ b/arch/x86/include/asm/numa_32.h @@ -3,8 +3,6 @@ extern int numa_off; -extern int pxm_to_nid(int pxm); - #ifdef CONFIG_HIGHMEM extern void set_highmem_pages_init(void); #else diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h index 12461eb..794da6d 100644 --- a/arch/x86/include/asm/numa_64.h +++ b/arch/x86/include/asm/numa_64.h @@ -3,18 +3,11 @@ #include -struct bootnode { - u64 start; - u64 end; -}; - #define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT)) extern int numa_off; extern unsigned long numa_free_all_bootmem(void); -extern void setup_node_bootmem(int nodeid, unsigned long start, - unsigned long end); #ifdef CONFIG_NUMA /* diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 7f83ade..287ae79 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -95,7 +95,7 @@ int __init numa_add_memblk(int nid, u64 start, u64 end) } /* Initialize bootmem allocator for a node */ -void __init +static void __init setup_node_bootmem(int nid, unsigned long start, unsigned long end) { const u64 nd_low = (u64)MAX_DMA_PFN << PAGE_SHIFT; -- cgit v1.1 From 7b2600f8ee0536bb738f3387cf2c30e8e334e149 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 2 May 2011 14:18:52 +0200 Subject: x86, NUMA: rename srat_64.c to srat.c Rename srat_64.c to srat.c. This is to prepare for unification of NUMA init paths between 32 and 64bit. Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: Yinghai Lu Cc: David Rientjes Cc: Thomas Gleixner Cc: "H. Peter Anvin" --- arch/x86/mm/Makefile | 5 +- arch/x86/mm/srat.c | 184 ++++++++++++++++++++++++++++++++++++++++++++++++++ arch/x86/mm/srat_64.c | 184 -------------------------------------------------- 3 files changed, 188 insertions(+), 185 deletions(-) create mode 100644 arch/x86/mm/srat.c delete mode 100644 arch/x86/mm/srat_64.c diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 3e608ed..37e7043 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -24,7 +24,10 @@ obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o obj-$(CONFIG_NUMA) += numa.o numa_$(BITS).o obj-$(CONFIG_AMD_NUMA) += amdtopology_64.o -obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o +ifeq ($(CONFIG_ACPI_NUMA),y) +obj-$(CONFIG_X86_64) += srat.o +obj-$(CONFIG_X86_32) += srat_32.o +endif obj-$(CONFIG_NUMA_EMU) += numa_emulation.o obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c new file mode 100644 index 0000000..9994d2c --- /dev/null +++ b/arch/x86/mm/srat.c @@ -0,0 +1,184 @@ +/* + * ACPI 3.0 based NUMA setup + * Copyright 2004 Andi Kleen, SuSE Labs. + * + * Reads the ACPI SRAT table to figure out what memory belongs to which CPUs. + * + * Called from acpi_numa_init while reading the SRAT and SLIT tables. + * Assumes all memory regions belonging to a single proximity domain + * are in one chunk. Holes between them will be included in the node. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int acpi_numa __initdata; + +static __init int setup_node(int pxm) +{ + return acpi_map_pxm_to_node(pxm); +} + +static __init void bad_srat(void) +{ + printk(KERN_ERR "SRAT: SRAT not used.\n"); + acpi_numa = -1; +} + +static __init inline int srat_disabled(void) +{ + return acpi_numa < 0; +} + +/* Callback for SLIT parsing */ +void __init acpi_numa_slit_init(struct acpi_table_slit *slit) +{ + int i, j; + + for (i = 0; i < slit->locality_count; i++) + for (j = 0; j < slit->locality_count; j++) + numa_set_distance(pxm_to_node(i), pxm_to_node(j), + slit->entry[slit->locality_count * i + j]); +} + +/* Callback for Proximity Domain -> x2APIC mapping */ +void __init +acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa) +{ + int pxm, node; + int apic_id; + + if (srat_disabled()) + return; + if (pa->header.length < sizeof(struct acpi_srat_x2apic_cpu_affinity)) { + bad_srat(); + return; + } + if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0) + return; + pxm = pa->proximity_domain; + node = setup_node(pxm); + if (node < 0) { + printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm); + bad_srat(); + return; + } + + apic_id = pa->apic_id; + if (apic_id >= MAX_LOCAL_APIC) { + printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node); + return; + } + set_apicid_to_node(apic_id, node); + node_set(node, numa_nodes_parsed); + acpi_numa = 1; + printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u\n", + pxm, apic_id, node); +} + +/* Callback for Proximity Domain -> LAPIC mapping */ +void __init +acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) +{ + int pxm, node; + int apic_id; + + if (srat_disabled()) + return; + if (pa->header.length != sizeof(struct acpi_srat_cpu_affinity)) { + bad_srat(); + return; + } + if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0) + return; + pxm = pa->proximity_domain_lo; + node = setup_node(pxm); + if (node < 0) { + printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm); + bad_srat(); + return; + } + + if (get_uv_system_type() >= UV_X2APIC) + apic_id = (pa->apic_id << 8) | pa->local_sapic_eid; + else + apic_id = pa->apic_id; + + if (apic_id >= MAX_LOCAL_APIC) { + printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node); + return; + } + + set_apicid_to_node(apic_id, node); + node_set(node, numa_nodes_parsed); + acpi_numa = 1; + printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u\n", + pxm, apic_id, node); +} + +#ifdef CONFIG_MEMORY_HOTPLUG +static inline int save_add_info(void) {return 1;} +#else +static inline int save_add_info(void) {return 0;} +#endif + +/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */ +void __init +acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) +{ + unsigned long start, end; + int node, pxm; + + if (srat_disabled()) + return; + if (ma->header.length != sizeof(struct acpi_srat_mem_affinity)) { + bad_srat(); + return; + } + if ((ma->flags & ACPI_SRAT_MEM_ENABLED) == 0) + return; + + if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && !save_add_info()) + return; + start = ma->base_address; + end = start + ma->length; + pxm = ma->proximity_domain; + node = setup_node(pxm); + if (node < 0) { + printk(KERN_ERR "SRAT: Too many proximity domains.\n"); + bad_srat(); + return; + } + + if (numa_add_memblk(node, start, end) < 0) { + bad_srat(); + return; + } + + printk(KERN_INFO "SRAT: Node %u PXM %u %lx-%lx\n", node, pxm, + start, end); +} + +void __init acpi_numa_arch_fixup(void) {} + +int __init x86_acpi_numa_init(void) +{ + int ret; + + ret = acpi_numa_init(); + if (ret < 0) + return ret; + return srat_disabled() ? -EINVAL : 0; +} diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c deleted file mode 100644 index 9994d2c..0000000 --- a/arch/x86/mm/srat_64.c +++ /dev/null @@ -1,184 +0,0 @@ -/* - * ACPI 3.0 based NUMA setup - * Copyright 2004 Andi Kleen, SuSE Labs. - * - * Reads the ACPI SRAT table to figure out what memory belongs to which CPUs. - * - * Called from acpi_numa_init while reading the SRAT and SLIT tables. - * Assumes all memory regions belonging to a single proximity domain - * are in one chunk. Holes between them will be included in the node. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -int acpi_numa __initdata; - -static __init int setup_node(int pxm) -{ - return acpi_map_pxm_to_node(pxm); -} - -static __init void bad_srat(void) -{ - printk(KERN_ERR "SRAT: SRAT not used.\n"); - acpi_numa = -1; -} - -static __init inline int srat_disabled(void) -{ - return acpi_numa < 0; -} - -/* Callback for SLIT parsing */ -void __init acpi_numa_slit_init(struct acpi_table_slit *slit) -{ - int i, j; - - for (i = 0; i < slit->locality_count; i++) - for (j = 0; j < slit->locality_count; j++) - numa_set_distance(pxm_to_node(i), pxm_to_node(j), - slit->entry[slit->locality_count * i + j]); -} - -/* Callback for Proximity Domain -> x2APIC mapping */ -void __init -acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa) -{ - int pxm, node; - int apic_id; - - if (srat_disabled()) - return; - if (pa->header.length < sizeof(struct acpi_srat_x2apic_cpu_affinity)) { - bad_srat(); - return; - } - if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0) - return; - pxm = pa->proximity_domain; - node = setup_node(pxm); - if (node < 0) { - printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm); - bad_srat(); - return; - } - - apic_id = pa->apic_id; - if (apic_id >= MAX_LOCAL_APIC) { - printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node); - return; - } - set_apicid_to_node(apic_id, node); - node_set(node, numa_nodes_parsed); - acpi_numa = 1; - printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u\n", - pxm, apic_id, node); -} - -/* Callback for Proximity Domain -> LAPIC mapping */ -void __init -acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) -{ - int pxm, node; - int apic_id; - - if (srat_disabled()) - return; - if (pa->header.length != sizeof(struct acpi_srat_cpu_affinity)) { - bad_srat(); - return; - } - if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0) - return; - pxm = pa->proximity_domain_lo; - node = setup_node(pxm); - if (node < 0) { - printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm); - bad_srat(); - return; - } - - if (get_uv_system_type() >= UV_X2APIC) - apic_id = (pa->apic_id << 8) | pa->local_sapic_eid; - else - apic_id = pa->apic_id; - - if (apic_id >= MAX_LOCAL_APIC) { - printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node); - return; - } - - set_apicid_to_node(apic_id, node); - node_set(node, numa_nodes_parsed); - acpi_numa = 1; - printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u\n", - pxm, apic_id, node); -} - -#ifdef CONFIG_MEMORY_HOTPLUG -static inline int save_add_info(void) {return 1;} -#else -static inline int save_add_info(void) {return 0;} -#endif - -/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */ -void __init -acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) -{ - unsigned long start, end; - int node, pxm; - - if (srat_disabled()) - return; - if (ma->header.length != sizeof(struct acpi_srat_mem_affinity)) { - bad_srat(); - return; - } - if ((ma->flags & ACPI_SRAT_MEM_ENABLED) == 0) - return; - - if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && !save_add_info()) - return; - start = ma->base_address; - end = start + ma->length; - pxm = ma->proximity_domain; - node = setup_node(pxm); - if (node < 0) { - printk(KERN_ERR "SRAT: Too many proximity domains.\n"); - bad_srat(); - return; - } - - if (numa_add_memblk(node, start, end) < 0) { - bad_srat(); - return; - } - - printk(KERN_INFO "SRAT: Node %u PXM %u %lx-%lx\n", node, pxm, - start, end); -} - -void __init acpi_numa_arch_fixup(void) {} - -int __init x86_acpi_numa_init(void) -{ - int ret; - - ret = acpi_numa_init(); - if (ret < 0) - return ret; - return srat_disabled() ? -EINVAL : 0; -} -- cgit v1.1 From eca9ad313293c41021bfcf23e985a14f6991a121 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 2 May 2011 14:18:52 +0200 Subject: x86, NUMA: make srat.c 32bit safe Make srat.c 32bit safe by removing the assumption that unsigned long is 64bit. Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: Yinghai Lu Cc: David Rientjes Cc: Thomas Gleixner Cc: "H. Peter Anvin" --- arch/x86/mm/srat.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c index 9994d2c..81dbfde 100644 --- a/arch/x86/mm/srat.c +++ b/arch/x86/mm/srat.c @@ -138,7 +138,7 @@ static inline int save_add_info(void) {return 0;} void __init acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) { - unsigned long start, end; + u64 start, end; int node, pxm; if (srat_disabled()) @@ -167,7 +167,7 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) return; } - printk(KERN_INFO "SRAT: Node %u PXM %u %lx-%lx\n", node, pxm, + printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm, start, end); } -- cgit v1.1 From daf4f480ae24270bac06db4293908d36b4834e21 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 2 May 2011 14:18:53 +0200 Subject: x86-32, NUMA: Move get_memcfg_numa() into numa_32.c There's no reason get_memcfg_numa() to be implemented inline in mmzone_32.h. Move it to numa_32.c and also make get_memcfg_numa_flag() static. Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: Yinghai Lu Cc: David Rientjes Cc: Thomas Gleixner Cc: "H. Peter Anvin" --- arch/x86/include/asm/mmzone_32.h | 18 ------------------ arch/x86/mm/numa_32.c | 11 ++++++++++- 2 files changed, 10 insertions(+), 19 deletions(-) diff --git a/arch/x86/include/asm/mmzone_32.h b/arch/x86/include/asm/mmzone_32.h index 91df7c5..73e5745 100644 --- a/arch/x86/include/asm/mmzone_32.h +++ b/arch/x86/include/asm/mmzone_32.h @@ -16,28 +16,10 @@ extern struct pglist_data *node_data[]; /* summit or generic arch */ #include -extern int get_memcfg_numa_flat(void); -/* - * This allows any one NUMA architecture to be compiled - * for, and still fall back to the flat function if it - * fails. - */ -static inline void get_memcfg_numa(void) -{ - - if (get_memcfg_numaq()) - return; - if (get_memcfg_from_srat()) - return; - get_memcfg_numa_flat(); -} - extern void resume_map_numa_kva(pgd_t *pgd); #else /* !CONFIG_NUMA */ -#define get_memcfg_numa get_memcfg_numa_flat - static inline void resume_map_numa_kva(pgd_t *pgd) {} #endif /* CONFIG_NUMA */ diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index f847fa1..abf1247 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -112,7 +112,7 @@ void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); * a single node with all available processors in it with a flat * memory map. */ -int __init get_memcfg_numa_flat(void) +static int __init get_memcfg_numa_flat(void) { printk(KERN_DEBUG "NUMA - single node, flat memory mode\n"); @@ -332,6 +332,15 @@ static __init void init_alloc_remap(int nid) nid, node_pa, node_pa + size, remap_va, remap_va + size); } +static void get_memcfg_numa(void) +{ + if (get_memcfg_numaq()) + return; + if (get_memcfg_from_srat()) + return; + get_memcfg_numa_flat(); +} + void __init initmem_init(void) { int nid; -- cgit v1.1 From e6df595b37c7c033ef7400b4fdd382a2dc4f4131 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 2 May 2011 14:18:53 +0200 Subject: x86, NUMA: Move numa_nodes_parsed to numa.[hc] Move numa_nodes_parsed from numa_64.[hc] to numa.[hc] to prepare for NUMA init path unification. Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: Yinghai Lu Cc: David Rientjes Cc: Thomas Gleixner Cc: "H. Peter Anvin" --- arch/x86/include/asm/numa.h | 1 + arch/x86/include/asm/numa_64.h | 4 ---- arch/x86/mm/numa.c | 1 + arch/x86/mm/numa_64.c | 2 -- 4 files changed, 2 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h index 5982d41..c24306c 100644 --- a/arch/x86/include/asm/numa.h +++ b/arch/x86/include/asm/numa.h @@ -19,6 +19,7 @@ * numa_cpu_node(). */ extern s16 __apicid_to_node[MAX_LOCAL_APIC]; +extern nodemask_t numa_nodes_parsed __initdata; static inline void set_apicid_to_node(int apicid, s16 node) { diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h index 794da6d..e84113f 100644 --- a/arch/x86/include/asm/numa_64.h +++ b/arch/x86/include/asm/numa_64.h @@ -1,8 +1,6 @@ #ifndef _ASM_X86_NUMA_64_H #define _ASM_X86_NUMA_64_H -#include - #define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT)) extern int numa_off; @@ -17,8 +15,6 @@ extern unsigned long numa_free_all_bootmem(void); */ #define NODE_MIN_SIZE (4*1024*1024) -extern nodemask_t numa_nodes_parsed __initdata; - extern int __init numa_add_memblk(int nodeid, u64 start, u64 end); extern void __init numa_set_distance(int from, int to, int distance); diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index e9005c4..cce1741 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -6,6 +6,7 @@ #include int __initdata numa_off; +nodemask_t numa_nodes_parsed __initdata; static __init int numa_setup(char *opt) { diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 287ae79..70bd822 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -26,8 +26,6 @@ struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; EXPORT_SYMBOL(node_data); -nodemask_t numa_nodes_parsed __initdata; - static struct numa_meminfo numa_meminfo #ifndef CONFIG_MEMORY_HOTPLUG __initdata -- cgit v1.1 From b0d310801a4c1f95b44357e4ebc22a9903e3bf3d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 2 May 2011 14:18:53 +0200 Subject: x86-32, NUMA: implement temporary NUMA init shims To help transition to common NUMA init, implement temporary 32bit shims for numa_add_memblk() and numa_set_distance(). numa_add_memblk() registers the memblk and adjusts node_start/end_pfn[]. numa_set_distance() is noop. These shims will allow using 64bit NUMA init functions on 32bit and gradual transition to common NUMA init path. For detailed description, please read description of commits which make use of the shim functions. Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: Yinghai Lu Cc: David Rientjes Cc: Thomas Gleixner Cc: "H. Peter Anvin" --- arch/x86/include/asm/numa.h | 3 +++ arch/x86/include/asm/numa_64.h | 3 --- arch/x86/mm/numa_32.c | 34 ++++++++++++++++++++++++++++++++++ 3 files changed, 37 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h index c24306c..db449c7 100644 --- a/arch/x86/include/asm/numa.h +++ b/arch/x86/include/asm/numa.h @@ -21,6 +21,9 @@ extern s16 __apicid_to_node[MAX_LOCAL_APIC]; extern nodemask_t numa_nodes_parsed __initdata; +extern int __init numa_add_memblk(int nodeid, u64 start, u64 end); +extern void __init numa_set_distance(int from, int to, int distance); + static inline void set_apicid_to_node(int apicid, s16 node) { __apicid_to_node[apicid] = node; diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h index e84113f..506dd05 100644 --- a/arch/x86/include/asm/numa_64.h +++ b/arch/x86/include/asm/numa_64.h @@ -15,9 +15,6 @@ extern unsigned long numa_free_all_bootmem(void); */ #define NODE_MIN_SIZE (4*1024*1024) -extern int __init numa_add_memblk(int nodeid, u64 start, u64 end); -extern void __init numa_set_distance(int from, int to, int distance); - #ifdef CONFIG_NUMA_EMU #define FAKE_NODE_MIN_SIZE ((u64)32 << 20) #define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL)) diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index abf1247..d0369a5 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -414,3 +414,37 @@ int memory_add_physaddr_to_nid(u64 addr) EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); #endif +/* temporary shim, will go away soon */ +int __init numa_add_memblk(int nid, u64 start, u64 end) +{ + unsigned long start_pfn = start >> PAGE_SHIFT; + unsigned long end_pfn = end >> PAGE_SHIFT; + + printk(KERN_DEBUG "nid %d start_pfn %08lx end_pfn %08lx\n", + nid, start_pfn, end_pfn); + + if (start >= (u64)max_pfn << PAGE_SHIFT) { + printk(KERN_INFO "Ignoring SRAT pfns: %08lx - %08lx\n", + start_pfn, end_pfn); + return 0; + } + + node_set_online(nid); + memblock_x86_register_active_regions(nid, start_pfn, + min(end_pfn, max_pfn)); + + if (!node_has_online_mem(nid)) { + node_start_pfn[nid] = start_pfn; + node_end_pfn[nid] = end_pfn; + } else { + node_start_pfn[nid] = min(node_start_pfn[nid], start_pfn); + node_end_pfn[nid] = max(node_end_pfn[nid], end_pfn); + } + return 0; +} + +/* temporary shim, will go away soon */ +void __init numa_set_distance(int from, int to, int distance) +{ + /* nada */ +} -- cgit v1.1 From 5acd91ab837c9d066af7345aea6462dc55695db7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 2 May 2011 14:18:53 +0200 Subject: x86-32, NUMA: Replace srat_32.c with srat.c SRAT support implementation in srat_32.c and srat.c are generally similar; however, there are some differences. First of all, 64bit implementation supports more types of SRAT entries. 64bit supports x2apic, affinity, memory and SLIT. 32bit only supports processor and memory. Most other differences stem from different initialization protocols employed by 64bit and 32bit NUMA init paths. On 64bit, * Mappings among PXM, node and apicid are directly done in each SRAT entry callback. * Memory affinity information is passed to numa_add_memblk() which takes care of all interfacing with NUMA init. * Doesn't directly initialize NUMA configurations. All the information is recorded in numa_nodes_parsed and memblks. On 32bit, * Checks numa_off. * Things go through one more level of indirection via private tables but eventually end up initializing the same mappings. * node_start/end_pfn[] are initialized and memblock_x86_register_active_regions() is called for each memory chunk. * node_set_online() is called for each online node. * sort_node_map() is called. There are also other minor differences in sanity checking and messages but taking 64bit version should be good enough. This patch drops the 32bit specific implementation and makes the 64bit implementation common for both 32 and 64bit. The init protocol differences are dealt with in two places - the numa_add_memblk() shim added in the previous patch and new temporary numa_32.c:get_memcfg_from_srat() which wraps invocation of x86_acpi_numa_init(). The shim numa_add_memblk() handles the folowings. * node_start/end_pfn[] initialization. * node_set_online() for memory nodes. * Invocation of memblock_x86_register_active_regions(). The shim get_memcfg_from_srat() handles the followings. * numa_off check. * node_set_online() for CPU nodes. * sort_node_map() invocation. * Clearing of numa_nodes_parsed and active_ranges on failure. The shims are temporary and will be removed as the generic NUMA init path in 32bit is replaced with 64bit one. Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: Yinghai Lu Cc: David Rientjes Cc: Thomas Gleixner Cc: "H. Peter Anvin" --- arch/x86/include/asm/mmzone_32.h | 2 - arch/x86/include/asm/srat.h | 39 ------ arch/x86/mm/Makefile | 5 +- arch/x86/mm/numa_32.c | 23 ++++ arch/x86/mm/srat_32.c | 281 --------------------------------------- 5 files changed, 24 insertions(+), 326 deletions(-) delete mode 100644 arch/x86/include/asm/srat.h delete mode 100644 arch/x86/mm/srat_32.c diff --git a/arch/x86/include/asm/mmzone_32.h b/arch/x86/include/asm/mmzone_32.h index 73e5745..5e83a41 100644 --- a/arch/x86/include/asm/mmzone_32.h +++ b/arch/x86/include/asm/mmzone_32.h @@ -13,8 +13,6 @@ extern struct pglist_data *node_data[]; #define NODE_DATA(nid) (node_data[nid]) #include -/* summit or generic arch */ -#include extern void resume_map_numa_kva(pgd_t *pgd); diff --git a/arch/x86/include/asm/srat.h b/arch/x86/include/asm/srat.h deleted file mode 100644 index b508d63..0000000 --- a/arch/x86/include/asm/srat.h +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Some of the code in this file has been gleaned from the 64 bit - * discontigmem support code base. - * - * Copyright (C) 2002, IBM Corp. - * - * All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for more - * details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - * - * Send feedback to Pat Gaughen - */ - -#ifndef _ASM_X86_SRAT_H -#define _ASM_X86_SRAT_H - -#ifdef CONFIG_ACPI_NUMA -extern int get_memcfg_from_srat(void); -#else -static inline int get_memcfg_from_srat(void) -{ - return 0; -} -#endif - -#endif /* _ASM_X86_SRAT_H */ diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 37e7043..62997be 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -24,10 +24,7 @@ obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o obj-$(CONFIG_NUMA) += numa.o numa_$(BITS).o obj-$(CONFIG_AMD_NUMA) += amdtopology_64.o -ifeq ($(CONFIG_ACPI_NUMA),y) -obj-$(CONFIG_X86_64) += srat.o -obj-$(CONFIG_X86_32) += srat_32.o -endif +obj-$(CONFIG_ACPI_NUMA) += srat.o obj-$(CONFIG_NUMA_EMU) += numa_emulation.o obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index d0369a5..8641239 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -332,6 +332,29 @@ static __init void init_alloc_remap(int nid) nid, node_pa, node_pa + size, remap_va, remap_va + size); } +static int get_memcfg_from_srat(void) +{ +#ifdef CONFIG_ACPI_NUMA + int nid; + + if (numa_off) + return 0; + + if (x86_acpi_numa_init() < 0) { + nodes_clear(numa_nodes_parsed); + remove_all_active_ranges(); + return 0; + } + + for_each_node_mask(nid, numa_nodes_parsed) + node_set_online(nid); + sort_node_map(); + return 1; +#else + return 0; +#endif +} + static void get_memcfg_numa(void) { if (get_memcfg_numaq()) diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c deleted file mode 100644 index 6b9bfd7..0000000 --- a/arch/x86/mm/srat_32.c +++ /dev/null @@ -1,281 +0,0 @@ -/* - * Some of the code in this file has been gleaned from the 64 bit - * discontigmem support code base. - * - * Copyright (C) 2002, IBM Corp. - * - * All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for more - * details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - * - * Send feedback to Pat Gaughen - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * proximity macros and definitions - */ -#define NODE_ARRAY_INDEX(x) ((x) / 8) /* 8 bits/char */ -#define NODE_ARRAY_OFFSET(x) ((x) % 8) /* 8 bits/char */ -#define BMAP_SET(bmap, bit) ((bmap)[NODE_ARRAY_INDEX(bit)] |= 1 << NODE_ARRAY_OFFSET(bit)) -#define BMAP_TEST(bmap, bit) ((bmap)[NODE_ARRAY_INDEX(bit)] & (1 << NODE_ARRAY_OFFSET(bit))) -/* bitmap length; _PXM is at most 255 */ -#define PXM_BITMAP_LEN (MAX_PXM_DOMAINS / 8) -static u8 __initdata pxm_bitmap[PXM_BITMAP_LEN]; /* bitmap of proximity domains */ - -#define MAX_CHUNKS_PER_NODE 3 -#define MAXCHUNKS (MAX_CHUNKS_PER_NODE * MAX_NUMNODES) -struct node_memory_chunk_s { - unsigned long start_pfn; - unsigned long end_pfn; - u8 pxm; // proximity domain of node - u8 nid; // which cnode contains this chunk? - u8 bank; // which mem bank on this node -}; -static struct node_memory_chunk_s __initdata node_memory_chunk[MAXCHUNKS]; - -static int __initdata num_memory_chunks; /* total number of memory chunks */ -static u8 __initdata apicid_to_pxm[MAX_LOCAL_APIC]; - -int acpi_numa __initdata; - -static __init void bad_srat(void) -{ - printk(KERN_ERR "SRAT: SRAT not used.\n"); - acpi_numa = -1; - num_memory_chunks = 0; -} - -static __init inline int srat_disabled(void) -{ - return numa_off || acpi_numa < 0; -} - -/* Identify CPU proximity domains */ -void __init -acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *cpu_affinity) -{ - if (srat_disabled()) - return; - if (cpu_affinity->header.length != - sizeof(struct acpi_srat_cpu_affinity)) { - bad_srat(); - return; - } - - if ((cpu_affinity->flags & ACPI_SRAT_CPU_ENABLED) == 0) - return; /* empty entry */ - - /* mark this node as "seen" in node bitmap */ - BMAP_SET(pxm_bitmap, cpu_affinity->proximity_domain_lo); - - /* don't need to check apic_id here, because it is always 8 bits */ - apicid_to_pxm[cpu_affinity->apic_id] = cpu_affinity->proximity_domain_lo; - - printk(KERN_DEBUG "CPU %02x in proximity domain %02x\n", - cpu_affinity->apic_id, cpu_affinity->proximity_domain_lo); -} - -/* - * Identify memory proximity domains and hot-remove capabilities. - * Fill node memory chunk list structure. - */ -void __init -acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *memory_affinity) -{ - unsigned long long paddr, size; - unsigned long start_pfn, end_pfn; - u8 pxm; - struct node_memory_chunk_s *p, *q, *pend; - - if (srat_disabled()) - return; - if (memory_affinity->header.length != - sizeof(struct acpi_srat_mem_affinity)) { - bad_srat(); - return; - } - - if ((memory_affinity->flags & ACPI_SRAT_MEM_ENABLED) == 0) - return; /* empty entry */ - - pxm = memory_affinity->proximity_domain & 0xff; - - /* mark this node as "seen" in node bitmap */ - BMAP_SET(pxm_bitmap, pxm); - - /* calculate info for memory chunk structure */ - paddr = memory_affinity->base_address; - size = memory_affinity->length; - - start_pfn = paddr >> PAGE_SHIFT; - end_pfn = (paddr + size) >> PAGE_SHIFT; - - - if (num_memory_chunks >= MAXCHUNKS) { - printk(KERN_WARNING "Too many mem chunks in SRAT." - " Ignoring %lld MBytes at %llx\n", - size/(1024*1024), paddr); - return; - } - - /* Insertion sort based on base address */ - pend = &node_memory_chunk[num_memory_chunks]; - for (p = &node_memory_chunk[0]; p < pend; p++) { - if (start_pfn < p->start_pfn) - break; - } - if (p < pend) { - for (q = pend; q >= p; q--) - *(q + 1) = *q; - } - p->start_pfn = start_pfn; - p->end_pfn = end_pfn; - p->pxm = pxm; - - num_memory_chunks++; - - printk(KERN_DEBUG "Memory range %08lx to %08lx" - " in proximity domain %02x %s\n", - start_pfn, end_pfn, - pxm, - ((memory_affinity->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) ? - "enabled and removable" : "enabled" ) ); -} - -/* Callback for SLIT parsing */ -void __init acpi_numa_slit_init(struct acpi_table_slit *slit) -{ -} - -void acpi_numa_arch_fixup(void) -{ -} -/* - * The SRAT table always lists ascending addresses, so can always - * assume that the first "start" address that you see is the real - * start of the node, and that the current "end" address is after - * the previous one. - */ -static __init int node_read_chunk(int nid, struct node_memory_chunk_s *memory_chunk) -{ - /* - * Only add present memory as told by the e820. - * There is no guarantee from the SRAT that the memory it - * enumerates is present at boot time because it represents - * *possible* memory hotplug areas the same as normal RAM. - */ - if (memory_chunk->start_pfn >= max_pfn) { - printk(KERN_INFO "Ignoring SRAT pfns: %08lx - %08lx\n", - memory_chunk->start_pfn, memory_chunk->end_pfn); - return -1; - } - if (memory_chunk->nid != nid) - return -1; - - if (!node_has_online_mem(nid)) - node_start_pfn[nid] = memory_chunk->start_pfn; - - if (node_start_pfn[nid] > memory_chunk->start_pfn) - node_start_pfn[nid] = memory_chunk->start_pfn; - - if (node_end_pfn[nid] < memory_chunk->end_pfn) - node_end_pfn[nid] = memory_chunk->end_pfn; - - return 0; -} - -int __init get_memcfg_from_srat(void) -{ - int i, j; - - if (srat_disabled()) - goto out_fail; - - if (acpi_numa_init() < 0) - goto out_fail; - - if (num_memory_chunks == 0) { - printk(KERN_DEBUG - "could not find any ACPI SRAT memory areas.\n"); - goto out_fail; - } - - /* Calculate total number of nodes in system from PXM bitmap and create - * a set of sequential node IDs starting at zero. (ACPI doesn't seem - * to specify the range of _PXM values.) - */ - /* - * MCD - we no longer HAVE to number nodes sequentially. PXM domain - * numbers could go as high as 256, and MAX_NUMNODES for i386 is typically - * 32, so we will continue numbering them in this manner until MAX_NUMNODES - * approaches MAX_PXM_DOMAINS for i386. - */ - nodes_clear(node_online_map); - for (i = 0; i < MAX_PXM_DOMAINS; i++) { - if (BMAP_TEST(pxm_bitmap, i)) { - int nid = acpi_map_pxm_to_node(i); - node_set_online(nid); - } - } - BUG_ON(num_online_nodes() == 0); - - /* set cnode id in memory chunk structure */ - for (i = 0; i < num_memory_chunks; i++) - node_memory_chunk[i].nid = pxm_to_node(node_memory_chunk[i].pxm); - - printk(KERN_DEBUG "pxm bitmap: "); - for (i = 0; i < sizeof(pxm_bitmap); i++) { - printk(KERN_CONT "%02x ", pxm_bitmap[i]); - } - printk(KERN_CONT "\n"); - printk(KERN_DEBUG "Number of logical nodes in system = %d\n", - num_online_nodes()); - printk(KERN_DEBUG "Number of memory chunks in system = %d\n", - num_memory_chunks); - - for (i = 0; i < MAX_LOCAL_APIC; i++) - set_apicid_to_node(i, pxm_to_node(apicid_to_pxm[i])); - - for (j = 0; j < num_memory_chunks; j++){ - struct node_memory_chunk_s * chunk = &node_memory_chunk[j]; - printk(KERN_DEBUG - "chunk %d nid %d start_pfn %08lx end_pfn %08lx\n", - j, chunk->nid, chunk->start_pfn, chunk->end_pfn); - if (node_read_chunk(chunk->nid, chunk)) - continue; - - memblock_x86_register_active_regions(chunk->nid, chunk->start_pfn, - min(chunk->end_pfn, max_pfn)); - } - /* for out of order entries in SRAT */ - sort_node_map(); - - return 1; -out_fail: - printk(KERN_DEBUG "failed to get NUMA memory information from SRAT" - " table\n"); - return 0; -} -- cgit v1.1 From 299a180aec6a8ee3069cf0fe90d722ac20c1f837 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 2 May 2011 14:18:53 +0200 Subject: x86-32, NUMA: Update numaq to use new NUMA init protocol Update numaq such that it calls numa_add_memblk() and sets numa_nodes_parsed instead of directly diddling with NUMA states. The original get_memcfg_numaq() is renamed to numaq_numa_init() and new get_memcfg_numaq() is created in numa_32.c. The shim numa_add_memblk() implementation handles node_start/end_pfn[] and node_set_online() for nodes with memory. The new get_memcfg_numaq() exactly the same with get_memcfg_from_srat() other than calling the numaq init function. Things get_memcfgs_numaq() do are not strictly necessary for numaq but added for consistency and to help unifying NUMA init handling. Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: Yinghai Lu Cc: David Rientjes Cc: Thomas Gleixner Cc: "H. Peter Anvin" --- arch/x86/include/asm/numaq.h | 7 +------ arch/x86/kernel/apic/numaq_32.c | 28 ++++++++++------------------ arch/x86/mm/numa_32.c | 23 +++++++++++++++++++++++ 3 files changed, 34 insertions(+), 24 deletions(-) diff --git a/arch/x86/include/asm/numaq.h b/arch/x86/include/asm/numaq.h index 37c5165..c3b3c32 100644 --- a/arch/x86/include/asm/numaq.h +++ b/arch/x86/include/asm/numaq.h @@ -29,7 +29,7 @@ #ifdef CONFIG_X86_NUMAQ extern int found_numaq; -extern int get_memcfg_numaq(void); +extern int numaq_numa_init(void); extern int pci_numaq_init(void); extern void *xquad_portio; @@ -166,11 +166,6 @@ struct sys_cfg_data { void numaq_tsc_disable(void); -#else -static inline int get_memcfg_numaq(void) -{ - return 0; -} #endif /* CONFIG_X86_NUMAQ */ #endif /* _ASM_X86_NUMAQ_H */ diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index 41b8b29..30f1331 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -48,8 +48,6 @@ #include #include -#define MB_TO_PAGES(addr) ((addr) << (20 - PAGE_SHIFT)) - int found_numaq; /* @@ -79,25 +77,20 @@ int quad_local_to_mp_bus_id[NR_CPUS/4][4]; static inline void numaq_register_node(int node, struct sys_cfg_data *scd) { struct eachquadmem *eq = scd->eq + node; + u64 start = (u64)(eq->hi_shrd_mem_start - eq->priv_mem_size) << 20; + u64 end = (u64)(eq->hi_shrd_mem_start + eq->hi_shrd_mem_size) << 20; + int ret; - node_set_online(node); - - /* Convert to pages */ - node_start_pfn[node] = - MB_TO_PAGES(eq->hi_shrd_mem_start - eq->priv_mem_size); - - node_end_pfn[node] = - MB_TO_PAGES(eq->hi_shrd_mem_start + eq->hi_shrd_mem_size); - - memblock_x86_register_active_regions(node, node_start_pfn[node], - node_end_pfn[node]); + node_set(node, numa_nodes_parsed); + ret = numa_add_memblk(node, start, end); + BUG_ON(ret < 0); } /* * Function: smp_dump_qct() * * Description: gets memory layout from the quad config table. This - * function also updates node_online_map with the nodes (quads) present. + * function also updates numa_nodes_parsed with the nodes (quads) present. */ static void __init smp_dump_qct(void) { @@ -106,7 +99,6 @@ static void __init smp_dump_qct(void) scd = (void *)__va(SYS_CFG_DATA_PRIV_ADDR); - nodes_clear(node_online_map); for_each_node(node) { if (scd->quads_present31_0 & (1 << node)) numaq_register_node(node, scd); @@ -276,14 +268,14 @@ static __init void early_check_numaq(void) } } -int __init get_memcfg_numaq(void) +int __init numaq_numa_init(void) { early_check_numaq(); if (!found_numaq) - return 0; + return -ENOENT; smp_dump_qct(); - return 1; + return 0; } #define NUMAQ_APIC_DFR_VALUE (APIC_DFR_CLUSTER) diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index 8641239..14135e5 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -332,6 +332,29 @@ static __init void init_alloc_remap(int nid) nid, node_pa, node_pa + size, remap_va, remap_va + size); } +static int get_memcfg_numaq(void) +{ +#ifdef CONFIG_X86_NUMAQ + int nid; + + if (numa_off) + return 0; + + if (numaq_numa_init() < 0) { + nodes_clear(numa_nodes_parsed); + remove_all_active_ranges(); + return 0; + } + + for_each_node_mask(nid, numa_nodes_parsed) + node_set_online(nid); + sort_node_map(); + return 1; +#else + return 0; +#endif +} + static int get_memcfg_from_srat(void) { #ifdef CONFIG_ACPI_NUMA -- cgit v1.1 From a4106eae650a4d5d30fcdd36d998edfa5ccb0ec4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 2 May 2011 14:18:53 +0200 Subject: x86, NUMA: Move NUMA init logic from numa_64.c to numa.c Move the generic 64bit NUMA init machinery from numa_64.c to numa.c. * node_data[], numa_mem_info and numa_distance * numa_add_memblk[_to](), numa_remove_memblk[_from]() * numa_set_distance() and friends * numa_init() and all the numa_meminfo handling helpers called from it * dummy_numa_init() * memory_add_physaddr_to_nid() A new function x86_numa_init() is added and the content of numa_64.c::initmem_init() is moved into it. initmem_init() now simply calls x86_numa_init(). Constants and numa_off declaration are moved from numa_{32|64}.h to numa.h. This is code reorganization and doesn't involve any functional change. Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: Yinghai Lu Cc: David Rientjes Cc: Thomas Gleixner Cc: "H. Peter Anvin" --- arch/x86/include/asm/numa.h | 16 ++ arch/x86/include/asm/numa_32.h | 2 - arch/x86/include/asm/numa_64.h | 19 -- arch/x86/mm/numa.c | 523 ++++++++++++++++++++++++++++++++++++++++- arch/x86/mm/numa_64.c | 503 +-------------------------------------- arch/x86/mm/numa_internal.h | 2 + 6 files changed, 539 insertions(+), 526 deletions(-) diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h index db449c7..7454086 100644 --- a/arch/x86/include/asm/numa.h +++ b/arch/x86/include/asm/numa.h @@ -9,6 +9,16 @@ #ifdef CONFIG_NUMA #define NR_NODE_MEMBLKS (MAX_NUMNODES*2) +#define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT)) + +/* + * Too small node sizes may confuse the VM badly. Usually they + * result from BIOS bugs. So dont recognize nodes as standalone + * NUMA entities that have less than this amount of RAM listed: + */ +#define NODE_MIN_SIZE (4*1024*1024) + +extern int numa_off; /* * __apicid_to_node[] stores the raw mapping between physical apicid and @@ -68,4 +78,10 @@ static inline void numa_remove_cpu(int cpu) { } void debug_cpumask_set_cpu(int cpu, int node, bool enable); #endif +#ifdef CONFIG_NUMA_EMU +#define FAKE_NODE_MIN_SIZE ((u64)32 << 20) +#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL)) +void numa_emu_cmdline(char *); +#endif /* CONFIG_NUMA_EMU */ + #endif /* _ASM_X86_NUMA_H */ diff --git a/arch/x86/include/asm/numa_32.h b/arch/x86/include/asm/numa_32.h index 7e54b64..e7d6b82 100644 --- a/arch/x86/include/asm/numa_32.h +++ b/arch/x86/include/asm/numa_32.h @@ -1,8 +1,6 @@ #ifndef _ASM_X86_NUMA_32_H #define _ASM_X86_NUMA_32_H -extern int numa_off; - #ifdef CONFIG_HIGHMEM extern void set_highmem_pages_init(void); #else diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h index 506dd05..0c05f7a 100644 --- a/arch/x86/include/asm/numa_64.h +++ b/arch/x86/include/asm/numa_64.h @@ -1,25 +1,6 @@ #ifndef _ASM_X86_NUMA_64_H #define _ASM_X86_NUMA_64_H -#define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT)) - -extern int numa_off; - extern unsigned long numa_free_all_bootmem(void); -#ifdef CONFIG_NUMA -/* - * Too small node sizes may confuse the VM badly. Usually they - * result from BIOS bugs. So dont recognize nodes as standalone - * NUMA entities that have less than this amount of RAM listed: - */ -#define NODE_MIN_SIZE (4*1024*1024) - -#ifdef CONFIG_NUMA_EMU -#define FAKE_NODE_MIN_SIZE ((u64)32 << 20) -#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL)) -void numa_emu_cmdline(char *); -#endif /* CONFIG_NUMA_EMU */ -#endif - #endif /* _ASM_X86_NUMA_64_H */ diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index cce1741..ed1daba 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -1,13 +1,42 @@ /* Common code for 32 and 64-bit NUMA */ -#include -#include +#include +#include +#include +#include #include -#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include #include +#include + +#include "numa_internal.h" int __initdata numa_off; nodemask_t numa_nodes_parsed __initdata; +#ifdef CONFIG_X86_64 +struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; +EXPORT_SYMBOL(node_data); + +static struct numa_meminfo numa_meminfo +#ifndef CONFIG_MEMORY_HOTPLUG +__initdata +#endif +; + +static int numa_distance_cnt; +static u8 *numa_distance; +#endif + static __init int numa_setup(char *opt) { if (!opt) @@ -105,6 +134,392 @@ void __init setup_node_to_cpumask_map(void) pr_debug("Node to cpumask map for %d nodes\n", nr_node_ids); } +#ifdef CONFIG_X86_64 +static int __init numa_add_memblk_to(int nid, u64 start, u64 end, + struct numa_meminfo *mi) +{ + /* ignore zero length blks */ + if (start == end) + return 0; + + /* whine about and ignore invalid blks */ + if (start > end || nid < 0 || nid >= MAX_NUMNODES) { + pr_warning("NUMA: Warning: invalid memblk node %d (%Lx-%Lx)\n", + nid, start, end); + return 0; + } + + if (mi->nr_blks >= NR_NODE_MEMBLKS) { + pr_err("NUMA: too many memblk ranges\n"); + return -EINVAL; + } + + mi->blk[mi->nr_blks].start = start; + mi->blk[mi->nr_blks].end = end; + mi->blk[mi->nr_blks].nid = nid; + mi->nr_blks++; + return 0; +} + +/** + * numa_remove_memblk_from - Remove one numa_memblk from a numa_meminfo + * @idx: Index of memblk to remove + * @mi: numa_meminfo to remove memblk from + * + * Remove @idx'th numa_memblk from @mi by shifting @mi->blk[] and + * decrementing @mi->nr_blks. + */ +void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi) +{ + mi->nr_blks--; + memmove(&mi->blk[idx], &mi->blk[idx + 1], + (mi->nr_blks - idx) * sizeof(mi->blk[0])); +} + +/** + * numa_add_memblk - Add one numa_memblk to numa_meminfo + * @nid: NUMA node ID of the new memblk + * @start: Start address of the new memblk + * @end: End address of the new memblk + * + * Add a new memblk to the default numa_meminfo. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +int __init numa_add_memblk(int nid, u64 start, u64 end) +{ + return numa_add_memblk_to(nid, start, end, &numa_meminfo); +} + +/* Initialize bootmem allocator for a node */ +static void __init +setup_node_bootmem(int nid, unsigned long start, unsigned long end) +{ + const u64 nd_low = (u64)MAX_DMA_PFN << PAGE_SHIFT; + const u64 nd_high = (u64)max_pfn_mapped << PAGE_SHIFT; + const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE); + unsigned long nd_pa; + int tnid; + + /* + * Don't confuse VM with a node that doesn't have the + * minimum amount of memory: + */ + if (end && (end - start) < NODE_MIN_SIZE) + return; + + start = roundup(start, ZONE_ALIGN); + + printk(KERN_INFO "Initmem setup node %d %016lx-%016lx\n", + nid, start, end); + + /* + * Try to allocate node data on local node and then fall back to + * all nodes. Never allocate in DMA zone. + */ + nd_pa = memblock_x86_find_in_range_node(nid, nd_low, nd_high, + nd_size, SMP_CACHE_BYTES); + if (nd_pa == MEMBLOCK_ERROR) + nd_pa = memblock_find_in_range(nd_low, nd_high, + nd_size, SMP_CACHE_BYTES); + if (nd_pa == MEMBLOCK_ERROR) { + pr_err("Cannot find %lu bytes in node %d\n", nd_size, nid); + return; + } + memblock_x86_reserve_range(nd_pa, nd_pa + nd_size, "NODE_DATA"); + + /* report and initialize */ + printk(KERN_INFO " NODE_DATA [%016lx - %016lx]\n", + nd_pa, nd_pa + nd_size - 1); + tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT); + if (tnid != nid) + printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nid, tnid); + + node_data[nid] = __va(nd_pa); + memset(NODE_DATA(nid), 0, sizeof(pg_data_t)); + NODE_DATA(nid)->node_id = nid; + NODE_DATA(nid)->node_start_pfn = start >> PAGE_SHIFT; + NODE_DATA(nid)->node_spanned_pages = (end - start) >> PAGE_SHIFT; + + node_set_online(nid); +} + +/** + * numa_cleanup_meminfo - Cleanup a numa_meminfo + * @mi: numa_meminfo to clean up + * + * Sanitize @mi by merging and removing unncessary memblks. Also check for + * conflicts and clear unused memblks. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +int __init numa_cleanup_meminfo(struct numa_meminfo *mi) +{ + const u64 low = 0; + const u64 high = (u64)max_pfn << PAGE_SHIFT; + int i, j, k; + + for (i = 0; i < mi->nr_blks; i++) { + struct numa_memblk *bi = &mi->blk[i]; + + /* make sure all blocks are inside the limits */ + bi->start = max(bi->start, low); + bi->end = min(bi->end, high); + + /* and there's no empty block */ + if (bi->start >= bi->end) { + numa_remove_memblk_from(i--, mi); + continue; + } + + for (j = i + 1; j < mi->nr_blks; j++) { + struct numa_memblk *bj = &mi->blk[j]; + unsigned long start, end; + + /* + * See whether there are overlapping blocks. Whine + * about but allow overlaps of the same nid. They + * will be merged below. + */ + if (bi->end > bj->start && bi->start < bj->end) { + if (bi->nid != bj->nid) { + pr_err("NUMA: node %d (%Lx-%Lx) overlaps with node %d (%Lx-%Lx)\n", + bi->nid, bi->start, bi->end, + bj->nid, bj->start, bj->end); + return -EINVAL; + } + pr_warning("NUMA: Warning: node %d (%Lx-%Lx) overlaps with itself (%Lx-%Lx)\n", + bi->nid, bi->start, bi->end, + bj->start, bj->end); + } + + /* + * Join together blocks on the same node, holes + * between which don't overlap with memory on other + * nodes. + */ + if (bi->nid != bj->nid) + continue; + start = max(min(bi->start, bj->start), low); + end = min(max(bi->end, bj->end), high); + for (k = 0; k < mi->nr_blks; k++) { + struct numa_memblk *bk = &mi->blk[k]; + + if (bi->nid == bk->nid) + continue; + if (start < bk->end && end > bk->start) + break; + } + if (k < mi->nr_blks) + continue; + printk(KERN_INFO "NUMA: Node %d [%Lx,%Lx) + [%Lx,%Lx) -> [%lx,%lx)\n", + bi->nid, bi->start, bi->end, bj->start, bj->end, + start, end); + bi->start = start; + bi->end = end; + numa_remove_memblk_from(j--, mi); + } + } + + for (i = mi->nr_blks; i < ARRAY_SIZE(mi->blk); i++) { + mi->blk[i].start = mi->blk[i].end = 0; + mi->blk[i].nid = NUMA_NO_NODE; + } + + return 0; +} + +/* + * Set nodes, which have memory in @mi, in *@nodemask. + */ +static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask, + const struct numa_meminfo *mi) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(mi->blk); i++) + if (mi->blk[i].start != mi->blk[i].end && + mi->blk[i].nid != NUMA_NO_NODE) + node_set(mi->blk[i].nid, *nodemask); +} + +/** + * numa_reset_distance - Reset NUMA distance table + * + * The current table is freed. The next numa_set_distance() call will + * create a new one. + */ +void __init numa_reset_distance(void) +{ + size_t size = numa_distance_cnt * numa_distance_cnt * sizeof(numa_distance[0]); + + /* numa_distance could be 1LU marking allocation failure, test cnt */ + if (numa_distance_cnt) + memblock_x86_free_range(__pa(numa_distance), + __pa(numa_distance) + size); + numa_distance_cnt = 0; + numa_distance = NULL; /* enable table creation */ +} + +static int __init numa_alloc_distance(void) +{ + nodemask_t nodes_parsed; + size_t size; + int i, j, cnt = 0; + u64 phys; + + /* size the new table and allocate it */ + nodes_parsed = numa_nodes_parsed; + numa_nodemask_from_meminfo(&nodes_parsed, &numa_meminfo); + + for_each_node_mask(i, nodes_parsed) + cnt = i; + cnt++; + size = cnt * cnt * sizeof(numa_distance[0]); + + phys = memblock_find_in_range(0, (u64)max_pfn_mapped << PAGE_SHIFT, + size, PAGE_SIZE); + if (phys == MEMBLOCK_ERROR) { + pr_warning("NUMA: Warning: can't allocate distance table!\n"); + /* don't retry until explicitly reset */ + numa_distance = (void *)1LU; + return -ENOMEM; + } + memblock_x86_reserve_range(phys, phys + size, "NUMA DIST"); + + numa_distance = __va(phys); + numa_distance_cnt = cnt; + + /* fill with the default distances */ + for (i = 0; i < cnt; i++) + for (j = 0; j < cnt; j++) + numa_distance[i * cnt + j] = i == j ? + LOCAL_DISTANCE : REMOTE_DISTANCE; + printk(KERN_DEBUG "NUMA: Initialized distance table, cnt=%d\n", cnt); + + return 0; +} + +/** + * numa_set_distance - Set NUMA distance from one NUMA to another + * @from: the 'from' node to set distance + * @to: the 'to' node to set distance + * @distance: NUMA distance + * + * Set the distance from node @from to @to to @distance. If distance table + * doesn't exist, one which is large enough to accommodate all the currently + * known nodes will be created. + * + * If such table cannot be allocated, a warning is printed and further + * calls are ignored until the distance table is reset with + * numa_reset_distance(). + * + * If @from or @to is higher than the highest known node at the time of + * table creation or @distance doesn't make sense, the call is ignored. + * This is to allow simplification of specific NUMA config implementations. + */ +void __init numa_set_distance(int from, int to, int distance) +{ + if (!numa_distance && numa_alloc_distance() < 0) + return; + + if (from >= numa_distance_cnt || to >= numa_distance_cnt) { + printk_once(KERN_DEBUG "NUMA: Debug: distance out of bound, from=%d to=%d distance=%d\n", + from, to, distance); + return; + } + + if ((u8)distance != distance || + (from == to && distance != LOCAL_DISTANCE)) { + pr_warn_once("NUMA: Warning: invalid distance parameter, from=%d to=%d distance=%d\n", + from, to, distance); + return; + } + + numa_distance[from * numa_distance_cnt + to] = distance; +} + +int __node_distance(int from, int to) +{ + if (from >= numa_distance_cnt || to >= numa_distance_cnt) + return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE; + return numa_distance[from * numa_distance_cnt + to]; +} +EXPORT_SYMBOL(__node_distance); + +/* + * Sanity check to catch more bad NUMA configurations (they are amazingly + * common). Make sure the nodes cover all memory. + */ +static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi) +{ + unsigned long numaram, e820ram; + int i; + + numaram = 0; + for (i = 0; i < mi->nr_blks; i++) { + unsigned long s = mi->blk[i].start >> PAGE_SHIFT; + unsigned long e = mi->blk[i].end >> PAGE_SHIFT; + numaram += e - s; + numaram -= __absent_pages_in_range(mi->blk[i].nid, s, e); + if ((long)numaram < 0) + numaram = 0; + } + + e820ram = max_pfn - (memblock_x86_hole_size(0, + max_pfn << PAGE_SHIFT) >> PAGE_SHIFT); + /* We seem to lose 3 pages somewhere. Allow 1M of slack. */ + if ((long)(e820ram - numaram) >= (1 << (20 - PAGE_SHIFT))) { + printk(KERN_ERR "NUMA: nodes only cover %luMB of your %luMB e820 RAM. Not used.\n", + (numaram << PAGE_SHIFT) >> 20, + (e820ram << PAGE_SHIFT) >> 20); + return false; + } + return true; +} + +static int __init numa_register_memblks(struct numa_meminfo *mi) +{ + int i, nid; + + /* Account for nodes with cpus and no memory */ + node_possible_map = numa_nodes_parsed; + numa_nodemask_from_meminfo(&node_possible_map, mi); + if (WARN_ON(nodes_empty(node_possible_map))) + return -EINVAL; + + for (i = 0; i < mi->nr_blks; i++) + memblock_x86_register_active_regions(mi->blk[i].nid, + mi->blk[i].start >> PAGE_SHIFT, + mi->blk[i].end >> PAGE_SHIFT); + + /* for out of order entries */ + sort_node_map(); + if (!numa_meminfo_cover_memory(mi)) + return -EINVAL; + + /* Finally register nodes. */ + for_each_node_mask(nid, node_possible_map) { + u64 start = (u64)max_pfn << PAGE_SHIFT; + u64 end = 0; + + for (i = 0; i < mi->nr_blks; i++) { + if (nid != mi->blk[i].nid) + continue; + start = min(mi->blk[i].start, start); + end = max(mi->blk[i].end, end); + } + + if (start < end) + setup_node_bootmem(nid, start, end); + } + + return 0; +} +#endif + /* * There are unfortunately some poorly designed mainboards around that * only connect memory to a single CPU. This breaks the 1:1 cpu->node @@ -127,6 +542,93 @@ void __init numa_init_array(void) } } +#ifdef CONFIG_X86_64 +static int __init numa_init(int (*init_func)(void)) +{ + int i; + int ret; + + for (i = 0; i < MAX_LOCAL_APIC; i++) + set_apicid_to_node(i, NUMA_NO_NODE); + + nodes_clear(numa_nodes_parsed); + nodes_clear(node_possible_map); + nodes_clear(node_online_map); + memset(&numa_meminfo, 0, sizeof(numa_meminfo)); + remove_all_active_ranges(); + numa_reset_distance(); + + ret = init_func(); + if (ret < 0) + return ret; + ret = numa_cleanup_meminfo(&numa_meminfo); + if (ret < 0) + return ret; + + numa_emulation(&numa_meminfo, numa_distance_cnt); + + ret = numa_register_memblks(&numa_meminfo); + if (ret < 0) + return ret; + + for (i = 0; i < nr_cpu_ids; i++) { + int nid = early_cpu_to_node(i); + + if (nid == NUMA_NO_NODE) + continue; + if (!node_online(nid)) + numa_clear_node(i); + } + numa_init_array(); + return 0; +} + +/** + * dummy_numa_init - Fallback dummy NUMA init + * + * Used if there's no underlying NUMA architecture, NUMA initialization + * fails, or NUMA is disabled on the command line. + * + * Must online at least one node and add memory blocks that cover all + * allowed memory. This function must not fail. + */ +static int __init dummy_numa_init(void) +{ + printk(KERN_INFO "%s\n", + numa_off ? "NUMA turned off" : "No NUMA configuration found"); + printk(KERN_INFO "Faking a node at %016lx-%016lx\n", + 0LU, max_pfn << PAGE_SHIFT); + + node_set(0, numa_nodes_parsed); + numa_add_memblk(0, 0, (u64)max_pfn << PAGE_SHIFT); + + return 0; +} + +/** + * x86_numa_init - Initialize NUMA + * + * Try each configured NUMA initialization method until one succeeds. The + * last fallback is dummy single node config encomapssing whole memory and + * never fails. + */ +void __init x86_numa_init(void) +{ + if (!numa_off) { +#ifdef CONFIG_ACPI_NUMA + if (!numa_init(x86_acpi_numa_init)) + return; +#endif +#ifdef CONFIG_AMD_NUMA + if (!numa_init(amd_numa_init)) + return; +#endif + } + + numa_init(dummy_numa_init); +} +#endif + static __init int find_near_online_node(int node) { int n, val; @@ -292,3 +794,18 @@ const struct cpumask *cpumask_of_node(int node) EXPORT_SYMBOL(cpumask_of_node); #endif /* !CONFIG_DEBUG_PER_CPU_MAPS */ + +#if defined(CONFIG_X86_64) && defined(CONFIG_MEMORY_HOTPLUG) +int memory_add_physaddr_to_nid(u64 start) +{ + struct numa_meminfo *mi = &numa_meminfo; + int nid = mi->blk[0].nid; + int i; + + for (i = 0; i < mi->nr_blks; i++) + if (mi->blk[i].start <= start && mi->blk[i].end > start) + nid = mi->blk[i].nid; + return nid; +} +EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); +#endif diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 70bd822..dd27f40 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -2,499 +2,13 @@ * Generic VM initialization for x86-64 NUMA setups. * Copyright 2002,2003 Andi Kleen, SuSE Labs. */ -#include -#include -#include -#include #include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include #include "numa_internal.h" -struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; -EXPORT_SYMBOL(node_data); - -static struct numa_meminfo numa_meminfo -#ifndef CONFIG_MEMORY_HOTPLUG -__initdata -#endif -; - -static int numa_distance_cnt; -static u8 *numa_distance; - -static int __init numa_add_memblk_to(int nid, u64 start, u64 end, - struct numa_meminfo *mi) -{ - /* ignore zero length blks */ - if (start == end) - return 0; - - /* whine about and ignore invalid blks */ - if (start > end || nid < 0 || nid >= MAX_NUMNODES) { - pr_warning("NUMA: Warning: invalid memblk node %d (%Lx-%Lx)\n", - nid, start, end); - return 0; - } - - if (mi->nr_blks >= NR_NODE_MEMBLKS) { - pr_err("NUMA: too many memblk ranges\n"); - return -EINVAL; - } - - mi->blk[mi->nr_blks].start = start; - mi->blk[mi->nr_blks].end = end; - mi->blk[mi->nr_blks].nid = nid; - mi->nr_blks++; - return 0; -} - -/** - * numa_remove_memblk_from - Remove one numa_memblk from a numa_meminfo - * @idx: Index of memblk to remove - * @mi: numa_meminfo to remove memblk from - * - * Remove @idx'th numa_memblk from @mi by shifting @mi->blk[] and - * decrementing @mi->nr_blks. - */ -void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi) -{ - mi->nr_blks--; - memmove(&mi->blk[idx], &mi->blk[idx + 1], - (mi->nr_blks - idx) * sizeof(mi->blk[0])); -} - -/** - * numa_add_memblk - Add one numa_memblk to numa_meminfo - * @nid: NUMA node ID of the new memblk - * @start: Start address of the new memblk - * @end: End address of the new memblk - * - * Add a new memblk to the default numa_meminfo. - * - * RETURNS: - * 0 on success, -errno on failure. - */ -int __init numa_add_memblk(int nid, u64 start, u64 end) -{ - return numa_add_memblk_to(nid, start, end, &numa_meminfo); -} - -/* Initialize bootmem allocator for a node */ -static void __init -setup_node_bootmem(int nid, unsigned long start, unsigned long end) -{ - const u64 nd_low = (u64)MAX_DMA_PFN << PAGE_SHIFT; - const u64 nd_high = (u64)max_pfn_mapped << PAGE_SHIFT; - const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE); - unsigned long nd_pa; - int tnid; - - /* - * Don't confuse VM with a node that doesn't have the - * minimum amount of memory: - */ - if (end && (end - start) < NODE_MIN_SIZE) - return; - - start = roundup(start, ZONE_ALIGN); - - printk(KERN_INFO "Initmem setup node %d %016lx-%016lx\n", - nid, start, end); - - /* - * Try to allocate node data on local node and then fall back to - * all nodes. Never allocate in DMA zone. - */ - nd_pa = memblock_x86_find_in_range_node(nid, nd_low, nd_high, - nd_size, SMP_CACHE_BYTES); - if (nd_pa == MEMBLOCK_ERROR) - nd_pa = memblock_find_in_range(nd_low, nd_high, - nd_size, SMP_CACHE_BYTES); - if (nd_pa == MEMBLOCK_ERROR) { - pr_err("Cannot find %lu bytes in node %d\n", nd_size, nid); - return; - } - memblock_x86_reserve_range(nd_pa, nd_pa + nd_size, "NODE_DATA"); - - /* report and initialize */ - printk(KERN_INFO " NODE_DATA [%016lx - %016lx]\n", - nd_pa, nd_pa + nd_size - 1); - tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT); - if (tnid != nid) - printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nid, tnid); - - node_data[nid] = __va(nd_pa); - memset(NODE_DATA(nid), 0, sizeof(pg_data_t)); - NODE_DATA(nid)->node_id = nid; - NODE_DATA(nid)->node_start_pfn = start >> PAGE_SHIFT; - NODE_DATA(nid)->node_spanned_pages = (end - start) >> PAGE_SHIFT; - - node_set_online(nid); -} - -/** - * numa_cleanup_meminfo - Cleanup a numa_meminfo - * @mi: numa_meminfo to clean up - * - * Sanitize @mi by merging and removing unncessary memblks. Also check for - * conflicts and clear unused memblks. - * - * RETURNS: - * 0 on success, -errno on failure. - */ -int __init numa_cleanup_meminfo(struct numa_meminfo *mi) -{ - const u64 low = 0; - const u64 high = (u64)max_pfn << PAGE_SHIFT; - int i, j, k; - - for (i = 0; i < mi->nr_blks; i++) { - struct numa_memblk *bi = &mi->blk[i]; - - /* make sure all blocks are inside the limits */ - bi->start = max(bi->start, low); - bi->end = min(bi->end, high); - - /* and there's no empty block */ - if (bi->start >= bi->end) { - numa_remove_memblk_from(i--, mi); - continue; - } - - for (j = i + 1; j < mi->nr_blks; j++) { - struct numa_memblk *bj = &mi->blk[j]; - unsigned long start, end; - - /* - * See whether there are overlapping blocks. Whine - * about but allow overlaps of the same nid. They - * will be merged below. - */ - if (bi->end > bj->start && bi->start < bj->end) { - if (bi->nid != bj->nid) { - pr_err("NUMA: node %d (%Lx-%Lx) overlaps with node %d (%Lx-%Lx)\n", - bi->nid, bi->start, bi->end, - bj->nid, bj->start, bj->end); - return -EINVAL; - } - pr_warning("NUMA: Warning: node %d (%Lx-%Lx) overlaps with itself (%Lx-%Lx)\n", - bi->nid, bi->start, bi->end, - bj->start, bj->end); - } - - /* - * Join together blocks on the same node, holes - * between which don't overlap with memory on other - * nodes. - */ - if (bi->nid != bj->nid) - continue; - start = max(min(bi->start, bj->start), low); - end = min(max(bi->end, bj->end), high); - for (k = 0; k < mi->nr_blks; k++) { - struct numa_memblk *bk = &mi->blk[k]; - - if (bi->nid == bk->nid) - continue; - if (start < bk->end && end > bk->start) - break; - } - if (k < mi->nr_blks) - continue; - printk(KERN_INFO "NUMA: Node %d [%Lx,%Lx) + [%Lx,%Lx) -> [%lx,%lx)\n", - bi->nid, bi->start, bi->end, bj->start, bj->end, - start, end); - bi->start = start; - bi->end = end; - numa_remove_memblk_from(j--, mi); - } - } - - for (i = mi->nr_blks; i < ARRAY_SIZE(mi->blk); i++) { - mi->blk[i].start = mi->blk[i].end = 0; - mi->blk[i].nid = NUMA_NO_NODE; - } - - return 0; -} - -/* - * Set nodes, which have memory in @mi, in *@nodemask. - */ -static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask, - const struct numa_meminfo *mi) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(mi->blk); i++) - if (mi->blk[i].start != mi->blk[i].end && - mi->blk[i].nid != NUMA_NO_NODE) - node_set(mi->blk[i].nid, *nodemask); -} - -/** - * numa_reset_distance - Reset NUMA distance table - * - * The current table is freed. The next numa_set_distance() call will - * create a new one. - */ -void __init numa_reset_distance(void) -{ - size_t size = numa_distance_cnt * numa_distance_cnt * sizeof(numa_distance[0]); - - /* numa_distance could be 1LU marking allocation failure, test cnt */ - if (numa_distance_cnt) - memblock_x86_free_range(__pa(numa_distance), - __pa(numa_distance) + size); - numa_distance_cnt = 0; - numa_distance = NULL; /* enable table creation */ -} - -static int __init numa_alloc_distance(void) -{ - nodemask_t nodes_parsed; - size_t size; - int i, j, cnt = 0; - u64 phys; - - /* size the new table and allocate it */ - nodes_parsed = numa_nodes_parsed; - numa_nodemask_from_meminfo(&nodes_parsed, &numa_meminfo); - - for_each_node_mask(i, nodes_parsed) - cnt = i; - cnt++; - size = cnt * cnt * sizeof(numa_distance[0]); - - phys = memblock_find_in_range(0, (u64)max_pfn_mapped << PAGE_SHIFT, - size, PAGE_SIZE); - if (phys == MEMBLOCK_ERROR) { - pr_warning("NUMA: Warning: can't allocate distance table!\n"); - /* don't retry until explicitly reset */ - numa_distance = (void *)1LU; - return -ENOMEM; - } - memblock_x86_reserve_range(phys, phys + size, "NUMA DIST"); - - numa_distance = __va(phys); - numa_distance_cnt = cnt; - - /* fill with the default distances */ - for (i = 0; i < cnt; i++) - for (j = 0; j < cnt; j++) - numa_distance[i * cnt + j] = i == j ? - LOCAL_DISTANCE : REMOTE_DISTANCE; - printk(KERN_DEBUG "NUMA: Initialized distance table, cnt=%d\n", cnt); - - return 0; -} - -/** - * numa_set_distance - Set NUMA distance from one NUMA to another - * @from: the 'from' node to set distance - * @to: the 'to' node to set distance - * @distance: NUMA distance - * - * Set the distance from node @from to @to to @distance. If distance table - * doesn't exist, one which is large enough to accommodate all the currently - * known nodes will be created. - * - * If such table cannot be allocated, a warning is printed and further - * calls are ignored until the distance table is reset with - * numa_reset_distance(). - * - * If @from or @to is higher than the highest known node at the time of - * table creation or @distance doesn't make sense, the call is ignored. - * This is to allow simplification of specific NUMA config implementations. - */ -void __init numa_set_distance(int from, int to, int distance) -{ - if (!numa_distance && numa_alloc_distance() < 0) - return; - - if (from >= numa_distance_cnt || to >= numa_distance_cnt) { - printk_once(KERN_DEBUG "NUMA: Debug: distance out of bound, from=%d to=%d distance=%d\n", - from, to, distance); - return; - } - - if ((u8)distance != distance || - (from == to && distance != LOCAL_DISTANCE)) { - pr_warn_once("NUMA: Warning: invalid distance parameter, from=%d to=%d distance=%d\n", - from, to, distance); - return; - } - - numa_distance[from * numa_distance_cnt + to] = distance; -} - -int __node_distance(int from, int to) -{ - if (from >= numa_distance_cnt || to >= numa_distance_cnt) - return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE; - return numa_distance[from * numa_distance_cnt + to]; -} -EXPORT_SYMBOL(__node_distance); - -/* - * Sanity check to catch more bad NUMA configurations (they are amazingly - * common). Make sure the nodes cover all memory. - */ -static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi) -{ - unsigned long numaram, e820ram; - int i; - - numaram = 0; - for (i = 0; i < mi->nr_blks; i++) { - unsigned long s = mi->blk[i].start >> PAGE_SHIFT; - unsigned long e = mi->blk[i].end >> PAGE_SHIFT; - numaram += e - s; - numaram -= __absent_pages_in_range(mi->blk[i].nid, s, e); - if ((long)numaram < 0) - numaram = 0; - } - - e820ram = max_pfn - (memblock_x86_hole_size(0, - max_pfn << PAGE_SHIFT) >> PAGE_SHIFT); - /* We seem to lose 3 pages somewhere. Allow 1M of slack. */ - if ((long)(e820ram - numaram) >= (1 << (20 - PAGE_SHIFT))) { - printk(KERN_ERR "NUMA: nodes only cover %luMB of your %luMB e820 RAM. Not used.\n", - (numaram << PAGE_SHIFT) >> 20, - (e820ram << PAGE_SHIFT) >> 20); - return false; - } - return true; -} - -static int __init numa_register_memblks(struct numa_meminfo *mi) -{ - int i, nid; - - /* Account for nodes with cpus and no memory */ - node_possible_map = numa_nodes_parsed; - numa_nodemask_from_meminfo(&node_possible_map, mi); - if (WARN_ON(nodes_empty(node_possible_map))) - return -EINVAL; - - for (i = 0; i < mi->nr_blks; i++) - memblock_x86_register_active_regions(mi->blk[i].nid, - mi->blk[i].start >> PAGE_SHIFT, - mi->blk[i].end >> PAGE_SHIFT); - - /* for out of order entries */ - sort_node_map(); - if (!numa_meminfo_cover_memory(mi)) - return -EINVAL; - - /* Finally register nodes. */ - for_each_node_mask(nid, node_possible_map) { - u64 start = (u64)max_pfn << PAGE_SHIFT; - u64 end = 0; - - for (i = 0; i < mi->nr_blks; i++) { - if (nid != mi->blk[i].nid) - continue; - start = min(mi->blk[i].start, start); - end = max(mi->blk[i].end, end); - } - - if (start < end) - setup_node_bootmem(nid, start, end); - } - - return 0; -} - -/** - * dummy_numma_init - Fallback dummy NUMA init - * - * Used if there's no underlying NUMA architecture, NUMA initialization - * fails, or NUMA is disabled on the command line. - * - * Must online at least one node and add memory blocks that cover all - * allowed memory. This function must not fail. - */ -static int __init dummy_numa_init(void) -{ - printk(KERN_INFO "%s\n", - numa_off ? "NUMA turned off" : "No NUMA configuration found"); - printk(KERN_INFO "Faking a node at %016lx-%016lx\n", - 0LU, max_pfn << PAGE_SHIFT); - - node_set(0, numa_nodes_parsed); - numa_add_memblk(0, 0, (u64)max_pfn << PAGE_SHIFT); - - return 0; -} - -static int __init numa_init(int (*init_func)(void)) -{ - int i; - int ret; - - for (i = 0; i < MAX_LOCAL_APIC; i++) - set_apicid_to_node(i, NUMA_NO_NODE); - - nodes_clear(numa_nodes_parsed); - nodes_clear(node_possible_map); - nodes_clear(node_online_map); - memset(&numa_meminfo, 0, sizeof(numa_meminfo)); - remove_all_active_ranges(); - numa_reset_distance(); - - ret = init_func(); - if (ret < 0) - return ret; - ret = numa_cleanup_meminfo(&numa_meminfo); - if (ret < 0) - return ret; - - numa_emulation(&numa_meminfo, numa_distance_cnt); - - ret = numa_register_memblks(&numa_meminfo); - if (ret < 0) - return ret; - - for (i = 0; i < nr_cpu_ids; i++) { - int nid = early_cpu_to_node(i); - - if (nid == NUMA_NO_NODE) - continue; - if (!node_online(nid)) - numa_clear_node(i); - } - numa_init_array(); - return 0; -} - void __init initmem_init(void) { - if (!numa_off) { -#ifdef CONFIG_ACPI_NUMA - if (!numa_init(x86_acpi_numa_init)) - return; -#endif -#ifdef CONFIG_AMD_NUMA - if (!numa_init(amd_numa_init)) - return; -#endif - } - - numa_init(dummy_numa_init); + x86_numa_init(); } unsigned long __init numa_free_all_bootmem(void) @@ -509,18 +23,3 @@ unsigned long __init numa_free_all_bootmem(void) return pages; } - -#ifdef CONFIG_MEMORY_HOTPLUG -int memory_add_physaddr_to_nid(u64 start) -{ - struct numa_meminfo *mi = &numa_meminfo; - int nid = mi->blk[0].nid; - int i; - - for (i = 0; i < mi->nr_blks; i++) - if (mi->blk[i].start <= start && mi->blk[i].end > start) - nid = mi->blk[i].nid; - return nid; -} -EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); -#endif diff --git a/arch/x86/mm/numa_internal.h b/arch/x86/mm/numa_internal.h index ef2d973..ad86ec9 100644 --- a/arch/x86/mm/numa_internal.h +++ b/arch/x86/mm/numa_internal.h @@ -19,6 +19,8 @@ void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi); int __init numa_cleanup_meminfo(struct numa_meminfo *mi); void __init numa_reset_distance(void); +void __init x86_numa_init(void); + #ifdef CONFIG_NUMA_EMU void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt); -- cgit v1.1 From 744baba0c4072b04664952a89292e4708eaf949a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 2 May 2011 14:18:53 +0200 Subject: x86, NUMA: Enable build of generic NUMA init code on 32bit Generic NUMA init code was moved to numa.c from numa_64.c but is still guaraded by CONFIG_X86_64. This patch removes the compile guard and enables compiling on 32bit. * numa_add_memblk() and numa_set_distance() clash with the shim implementation in numa_32.c and are left out. * memory_add_physaddr_to_nid() clashes with 32bit implementation and is left out. * MAX_DMA_PFN definition in dma.h moved out of !CONFIG_X86_32. * node_data definition in numa_32.c removed in favor of the one in numa.c. There are places where ulong is assumed to be 64bit. The next patch will fix them up. Note that although the code is compiled it isn't used yet and this patch doesn't cause any functional change. Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: Yinghai Lu Cc: David Rientjes Cc: Thomas Gleixner Cc: "H. Peter Anvin" --- arch/x86/include/asm/dma.h | 6 +++--- arch/x86/mm/numa.c | 10 ++++------ arch/x86/mm/numa_32.c | 3 --- 3 files changed, 7 insertions(+), 12 deletions(-) diff --git a/arch/x86/include/asm/dma.h b/arch/x86/include/asm/dma.h index 057099e..d1a314b 100644 --- a/arch/x86/include/asm/dma.h +++ b/arch/x86/include/asm/dma.h @@ -69,6 +69,9 @@ #define MAX_DMA_CHANNELS 8 +/* 16MB ISA DMA zone */ +#define MAX_DMA_PFN ((16 * 1024 * 1024) >> PAGE_SHIFT) + #ifdef CONFIG_X86_32 /* The maximum address that we can perform a DMA transfer to on this platform */ @@ -76,9 +79,6 @@ #else -/* 16MB ISA DMA zone */ -#define MAX_DMA_PFN ((16 * 1024 * 1024) >> PAGE_SHIFT) - /* 4GB broken PCI/AGP hardware bus master zone */ #define MAX_DMA32_PFN ((4UL * 1024 * 1024 * 1024) >> PAGE_SHIFT) diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index ed1daba..c400f3b 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -23,7 +23,6 @@ int __initdata numa_off; nodemask_t numa_nodes_parsed __initdata; -#ifdef CONFIG_X86_64 struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; EXPORT_SYMBOL(node_data); @@ -35,7 +34,6 @@ __initdata static int numa_distance_cnt; static u8 *numa_distance; -#endif static __init int numa_setup(char *opt) { @@ -134,7 +132,6 @@ void __init setup_node_to_cpumask_map(void) pr_debug("Node to cpumask map for %d nodes\n", nr_node_ids); } -#ifdef CONFIG_X86_64 static int __init numa_add_memblk_to(int nid, u64 start, u64 end, struct numa_meminfo *mi) { @@ -176,6 +173,7 @@ void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi) (mi->nr_blks - idx) * sizeof(mi->blk[0])); } +#ifdef CONFIG_X86_64 /** * numa_add_memblk - Add one numa_memblk to numa_meminfo * @nid: NUMA node ID of the new memblk @@ -191,6 +189,7 @@ int __init numa_add_memblk(int nid, u64 start, u64 end) { return numa_add_memblk_to(nid, start, end, &numa_meminfo); } +#endif /* Initialize bootmem allocator for a node */ static void __init @@ -402,6 +401,7 @@ static int __init numa_alloc_distance(void) return 0; } +#ifdef CONFIG_X86_64 /** * numa_set_distance - Set NUMA distance from one NUMA to another * @from: the 'from' node to set distance @@ -440,6 +440,7 @@ void __init numa_set_distance(int from, int to, int distance) numa_distance[from * numa_distance_cnt + to] = distance; } +#endif int __node_distance(int from, int to) { @@ -518,7 +519,6 @@ static int __init numa_register_memblks(struct numa_meminfo *mi) return 0; } -#endif /* * There are unfortunately some poorly designed mainboards around that @@ -542,7 +542,6 @@ void __init numa_init_array(void) } } -#ifdef CONFIG_X86_64 static int __init numa_init(int (*init_func)(void)) { int i; @@ -627,7 +626,6 @@ void __init x86_numa_init(void) numa_init(dummy_numa_init); } -#endif static __init int find_near_online_node(int node) { diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index 14135e5..975a76f 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -41,9 +41,6 @@ #include #include -struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; -EXPORT_SYMBOL(node_data); - /* * numa interface - we expect the numa architecture specific code to have * populated the following initialisation. -- cgit v1.1 From 38f3e1ca24cc3ec416855e02676f91c898a8a262 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 2 May 2011 14:18:53 +0200 Subject: x86, NUMA: Remove long 64bit assumption from numa.c Code moved from numa_64.c has assumption that long is 64bit in several places. This patch removes the assumption by using {s|u}64_t explicity, using PFN_PHYS() for page number -> addr conversions and adjusting printf formats. Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: Yinghai Lu Cc: David Rientjes Cc: Thomas Gleixner Cc: "H. Peter Anvin" --- arch/x86/mm/numa.c | 45 ++++++++++++++++++++++----------------------- 1 file changed, 22 insertions(+), 23 deletions(-) diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index c400f3b..b45caa3 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -192,13 +192,12 @@ int __init numa_add_memblk(int nid, u64 start, u64 end) #endif /* Initialize bootmem allocator for a node */ -static void __init -setup_node_bootmem(int nid, unsigned long start, unsigned long end) +static void __init setup_node_bootmem(int nid, u64 start, u64 end) { - const u64 nd_low = (u64)MAX_DMA_PFN << PAGE_SHIFT; - const u64 nd_high = (u64)max_pfn_mapped << PAGE_SHIFT; + const u64 nd_low = PFN_PHYS(MAX_DMA_PFN); + const u64 nd_high = PFN_PHYS(max_pfn_mapped); const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE); - unsigned long nd_pa; + u64 nd_pa; int tnid; /* @@ -210,7 +209,7 @@ setup_node_bootmem(int nid, unsigned long start, unsigned long end) start = roundup(start, ZONE_ALIGN); - printk(KERN_INFO "Initmem setup node %d %016lx-%016lx\n", + printk(KERN_INFO "Initmem setup node %d %016Lx-%016Lx\n", nid, start, end); /* @@ -223,13 +222,13 @@ setup_node_bootmem(int nid, unsigned long start, unsigned long end) nd_pa = memblock_find_in_range(nd_low, nd_high, nd_size, SMP_CACHE_BYTES); if (nd_pa == MEMBLOCK_ERROR) { - pr_err("Cannot find %lu bytes in node %d\n", nd_size, nid); + pr_err("Cannot find %zu bytes in node %d\n", nd_size, nid); return; } memblock_x86_reserve_range(nd_pa, nd_pa + nd_size, "NODE_DATA"); /* report and initialize */ - printk(KERN_INFO " NODE_DATA [%016lx - %016lx]\n", + printk(KERN_INFO " NODE_DATA [%016Lx - %016Lx]\n", nd_pa, nd_pa + nd_size - 1); tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT); if (tnid != nid) @@ -257,7 +256,7 @@ setup_node_bootmem(int nid, unsigned long start, unsigned long end) int __init numa_cleanup_meminfo(struct numa_meminfo *mi) { const u64 low = 0; - const u64 high = (u64)max_pfn << PAGE_SHIFT; + const u64 high = PFN_PHYS(max_pfn); int i, j, k; for (i = 0; i < mi->nr_blks; i++) { @@ -275,7 +274,7 @@ int __init numa_cleanup_meminfo(struct numa_meminfo *mi) for (j = i + 1; j < mi->nr_blks; j++) { struct numa_memblk *bj = &mi->blk[j]; - unsigned long start, end; + u64 start, end; /* * See whether there are overlapping blocks. Whine @@ -313,7 +312,7 @@ int __init numa_cleanup_meminfo(struct numa_meminfo *mi) } if (k < mi->nr_blks) continue; - printk(KERN_INFO "NUMA: Node %d [%Lx,%Lx) + [%Lx,%Lx) -> [%lx,%lx)\n", + printk(KERN_INFO "NUMA: Node %d [%Lx,%Lx) + [%Lx,%Lx) -> [%Lx,%Lx)\n", bi->nid, bi->start, bi->end, bj->start, bj->end, start, end); bi->start = start; @@ -378,7 +377,7 @@ static int __init numa_alloc_distance(void) cnt++; size = cnt * cnt * sizeof(numa_distance[0]); - phys = memblock_find_in_range(0, (u64)max_pfn_mapped << PAGE_SHIFT, + phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), size, PAGE_SIZE); if (phys == MEMBLOCK_ERROR) { pr_warning("NUMA: Warning: can't allocate distance table!\n"); @@ -456,24 +455,24 @@ EXPORT_SYMBOL(__node_distance); */ static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi) { - unsigned long numaram, e820ram; + u64 numaram, e820ram; int i; numaram = 0; for (i = 0; i < mi->nr_blks; i++) { - unsigned long s = mi->blk[i].start >> PAGE_SHIFT; - unsigned long e = mi->blk[i].end >> PAGE_SHIFT; + u64 s = mi->blk[i].start >> PAGE_SHIFT; + u64 e = mi->blk[i].end >> PAGE_SHIFT; numaram += e - s; numaram -= __absent_pages_in_range(mi->blk[i].nid, s, e); - if ((long)numaram < 0) + if ((s64)numaram < 0) numaram = 0; } e820ram = max_pfn - (memblock_x86_hole_size(0, - max_pfn << PAGE_SHIFT) >> PAGE_SHIFT); + PFN_PHYS(max_pfn)) >> PAGE_SHIFT); /* We seem to lose 3 pages somewhere. Allow 1M of slack. */ - if ((long)(e820ram - numaram) >= (1 << (20 - PAGE_SHIFT))) { - printk(KERN_ERR "NUMA: nodes only cover %luMB of your %luMB e820 RAM. Not used.\n", + if ((s64)(e820ram - numaram) >= (1 << (20 - PAGE_SHIFT))) { + printk(KERN_ERR "NUMA: nodes only cover %LuMB of your %LuMB e820 RAM. Not used.\n", (numaram << PAGE_SHIFT) >> 20, (e820ram << PAGE_SHIFT) >> 20); return false; @@ -503,7 +502,7 @@ static int __init numa_register_memblks(struct numa_meminfo *mi) /* Finally register nodes. */ for_each_node_mask(nid, node_possible_map) { - u64 start = (u64)max_pfn << PAGE_SHIFT; + u64 start = PFN_PHYS(max_pfn); u64 end = 0; for (i = 0; i < mi->nr_blks; i++) { @@ -595,11 +594,11 @@ static int __init dummy_numa_init(void) { printk(KERN_INFO "%s\n", numa_off ? "NUMA turned off" : "No NUMA configuration found"); - printk(KERN_INFO "Faking a node at %016lx-%016lx\n", - 0LU, max_pfn << PAGE_SHIFT); + printk(KERN_INFO "Faking a node at %016Lx-%016Lx\n", + 0LLU, PFN_PHYS(max_pfn)); node_set(0, numa_nodes_parsed); - numa_add_memblk(0, 0, (u64)max_pfn << PAGE_SHIFT); + numa_add_memblk(0, 0, PFN_PHYS(max_pfn)); return 0; } -- cgit v1.1 From 99cca492ea8ced305bfd687521ed69fb9e0147aa Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 2 May 2011 14:18:54 +0200 Subject: x86-32, NUMA: Add @start and @end to init_alloc_remap() Instead of dereferencing node_start/end_pfn[] directly, make init_alloc_remap() take @start and @end and let the caller be responsible for making sure the range is sane. This is to prepare for use from unified NUMA init code. Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: Yinghai Lu Cc: David Rientjes Cc: Thomas Gleixner Cc: "H. Peter Anvin" --- arch/x86/mm/numa_32.c | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index 975a76f..9008632 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -265,8 +265,10 @@ void resume_map_numa_kva(pgd_t *pgd_base) * opportunistically and the callers will fall back to other memory * allocation mechanisms on failure. */ -static __init void init_alloc_remap(int nid) +static __init void init_alloc_remap(int nid, u64 start, u64 end) { + unsigned long start_pfn = start >> PAGE_SHIFT; + unsigned long end_pfn = end >> PAGE_SHIFT; unsigned long size, pfn; u64 node_pa, remap_pa; void *remap_va; @@ -276,24 +278,15 @@ static __init void init_alloc_remap(int nid) * memory could be added but not currently present. */ printk(KERN_DEBUG "node %d pfn: [%lx - %lx]\n", - nid, node_start_pfn[nid], node_end_pfn[nid]); - if (node_start_pfn[nid] > max_pfn) - return; - if (!node_end_pfn[nid]) - return; - if (node_end_pfn[nid] > max_pfn) - node_end_pfn[nid] = max_pfn; + nid, start_pfn, end_pfn); /* calculate the necessary space aligned to large page size */ - size = node_memmap_size_bytes(nid, node_start_pfn[nid], - min(node_end_pfn[nid], max_pfn)); + size = node_memmap_size_bytes(nid, start_pfn, end_pfn); size += ALIGN(sizeof(pg_data_t), PAGE_SIZE); size = ALIGN(size, LARGE_PAGE_BYTES); /* allocate node memory and the lowmem remap area */ - node_pa = memblock_find_in_range(node_start_pfn[nid] << PAGE_SHIFT, - (u64)node_end_pfn[nid] << PAGE_SHIFT, - size, LARGE_PAGE_BYTES); + node_pa = memblock_find_in_range(start, end, size, LARGE_PAGE_BYTES); if (node_pa == MEMBLOCK_ERROR) { pr_warning("remap_alloc: failed to allocate %lu bytes for node %d\n", size, nid); @@ -391,8 +384,14 @@ void __init initmem_init(void) get_memcfg_numa(); numa_init_array(); - for_each_online_node(nid) - init_alloc_remap(nid); + for_each_online_node(nid) { + u64 start = (u64)node_start_pfn[nid] << PAGE_SHIFT; + u64 end = min((u64)node_end_pfn[nid] << PAGE_SHIFT, + (u64)max_pfn << PAGE_SHIFT); + + if (start < end) + init_alloc_remap(nid, start, end); + } #ifdef CONFIG_HIGHMEM highstart_pfn = highend_pfn = max_pfn; -- cgit v1.1 From 7888e96b264fad27f97f58c0f3a4d20326eaf181 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 2 May 2011 14:18:54 +0200 Subject: x86, NUMA: Initialize and use remap allocator from setup_node_bootmem() setup_node_bootmem() is taken from 64bit and doesn't use remap allocator. It's about to be shared with 32bit so add support for it. If NODE_DATA is remapped, it's noted in the debug message and node locality check is skipped as the __pa() of the remapped address doesn't reflect the actual physical address. On 64bit, remap allocator becomes noop and doesn't affect the behavior. Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: Yinghai Lu Cc: David Rientjes Cc: Thomas Gleixner Cc: "H. Peter Anvin" --- arch/x86/mm/numa.c | 41 +++++++++++++++++++++++++++-------------- arch/x86/mm/numa_32.c | 2 +- arch/x86/mm/numa_internal.h | 6 ++++++ 3 files changed, 34 insertions(+), 15 deletions(-) diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index b45caa3..a72317a 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -197,7 +197,9 @@ static void __init setup_node_bootmem(int nid, u64 start, u64 end) const u64 nd_low = PFN_PHYS(MAX_DMA_PFN); const u64 nd_high = PFN_PHYS(max_pfn_mapped); const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE); + bool remapped = false; u64 nd_pa; + void *nd; int tnid; /* @@ -207,34 +209,45 @@ static void __init setup_node_bootmem(int nid, u64 start, u64 end) if (end && (end - start) < NODE_MIN_SIZE) return; + /* initialize remap allocator before aligning to ZONE_ALIGN */ + init_alloc_remap(nid, start, end); + start = roundup(start, ZONE_ALIGN); printk(KERN_INFO "Initmem setup node %d %016Lx-%016Lx\n", nid, start, end); /* - * Try to allocate node data on local node and then fall back to - * all nodes. Never allocate in DMA zone. + * Allocate node data. Try remap allocator first, node-local + * memory and then any node. Never allocate in DMA zone. */ - nd_pa = memblock_x86_find_in_range_node(nid, nd_low, nd_high, + nd = alloc_remap(nid, nd_size); + if (nd) { + nd_pa = __pa(nd); + remapped = true; + } else { + nd_pa = memblock_x86_find_in_range_node(nid, nd_low, nd_high, nd_size, SMP_CACHE_BYTES); - if (nd_pa == MEMBLOCK_ERROR) - nd_pa = memblock_find_in_range(nd_low, nd_high, - nd_size, SMP_CACHE_BYTES); - if (nd_pa == MEMBLOCK_ERROR) { - pr_err("Cannot find %zu bytes in node %d\n", nd_size, nid); - return; + if (nd_pa == MEMBLOCK_ERROR) + nd_pa = memblock_find_in_range(nd_low, nd_high, + nd_size, SMP_CACHE_BYTES); + if (nd_pa == MEMBLOCK_ERROR) { + pr_err("Cannot find %zu bytes in node %d\n", + nd_size, nid); + return; + } + memblock_x86_reserve_range(nd_pa, nd_pa + nd_size, "NODE_DATA"); + nd = __va(nd_pa); } - memblock_x86_reserve_range(nd_pa, nd_pa + nd_size, "NODE_DATA"); /* report and initialize */ - printk(KERN_INFO " NODE_DATA [%016Lx - %016Lx]\n", - nd_pa, nd_pa + nd_size - 1); + printk(KERN_INFO " NODE_DATA [%016Lx - %016Lx]%s\n", + nd_pa, nd_pa + nd_size - 1, remapped ? " (remapped)" : ""); tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT); - if (tnid != nid) + if (!remapped && tnid != nid) printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nid, tnid); - node_data[nid] = __va(nd_pa); + node_data[nid] = nd; memset(NODE_DATA(nid), 0, sizeof(pg_data_t)); NODE_DATA(nid)->node_id = nid; NODE_DATA(nid)->node_start_pfn = start >> PAGE_SHIFT; diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index 9008632..fbd558f 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -265,7 +265,7 @@ void resume_map_numa_kva(pgd_t *pgd_base) * opportunistically and the callers will fall back to other memory * allocation mechanisms on failure. */ -static __init void init_alloc_remap(int nid, u64 start, u64 end) +void __init init_alloc_remap(int nid, u64 start, u64 end) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long end_pfn = end >> PAGE_SHIFT; diff --git a/arch/x86/mm/numa_internal.h b/arch/x86/mm/numa_internal.h index ad86ec9..7178c3a 100644 --- a/arch/x86/mm/numa_internal.h +++ b/arch/x86/mm/numa_internal.h @@ -21,6 +21,12 @@ void __init numa_reset_distance(void); void __init x86_numa_init(void); +#ifdef CONFIG_X86_64 +static inline void init_alloc_remap(int nid, u64 start, u64 end) { } +#else +void __init init_alloc_remap(int nid, u64 start, u64 end); +#endif + #ifdef CONFIG_NUMA_EMU void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt); -- cgit v1.1 From 6e3d4bec6b1e0829ed8b23be750762255f225019 Mon Sep 17 00:00:00 2001 From: Keshava Munegowda Date: Thu, 21 Apr 2011 19:52:43 +0530 Subject: omap:usb: add regulator support for EHCI in case of ehci phy mode; regulator of phy should be enabled before initializing the usbhs core driver. Signed-off-by: Keshava Munegowda Tested-by: Steve Calfee Signed-off-by: Felipe Balbi --- drivers/mfd/omap-usb-host.c | 1 - drivers/usb/host/ehci-omap.c | 20 ++++++++++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/drivers/mfd/omap-usb-host.c b/drivers/mfd/omap-usb-host.c index b3bb3ac..2e16511 100644 --- a/drivers/mfd/omap-usb-host.c +++ b/drivers/mfd/omap-usb-host.c @@ -25,7 +25,6 @@ #include #include #include -#include #include #define USBHS_DRIVER_NAME "usbhs-omap" diff --git a/drivers/usb/host/ehci-omap.c b/drivers/usb/host/ehci-omap.c index 7e41a95..627f3a6 100644 --- a/drivers/usb/host/ehci-omap.c +++ b/drivers/usb/host/ehci-omap.c @@ -40,6 +40,7 @@ #include #include #include +#include /* EHCI Register Set */ #define EHCI_INSNREG04 (0xA0) @@ -118,6 +119,8 @@ static int ehci_hcd_omap_probe(struct platform_device *pdev) struct ehci_hcd *omap_ehci; int ret = -ENODEV; int irq; + int i; + char supply[7]; if (usb_disabled()) return -ENODEV; @@ -158,6 +161,23 @@ static int ehci_hcd_omap_probe(struct platform_device *pdev) hcd->rsrc_len = resource_size(res); hcd->regs = regs; + /* get ehci regulator and enable */ + for (i = 0 ; i < OMAP3_HS_USB_PORTS ; i++) { + if (pdata->port_mode[i] != OMAP_EHCI_PORT_MODE_PHY) { + pdata->regulator[i] = NULL; + continue; + } + snprintf(supply, sizeof(supply), "hsusb%d", i); + pdata->regulator[i] = regulator_get(dev, supply); + if (IS_ERR(pdata->regulator[i])) { + pdata->regulator[i] = NULL; + dev_dbg(dev, + "failed to get ehci port%d regulator\n", i); + } else { + regulator_enable(pdata->regulator[i]); + } + } + ret = omap_usbhs_enable(dev); if (ret) { dev_err(dev, "failed to start usbhs with err %d\n", ret); -- cgit v1.1 From bd6709a91a593d8fe35d08da542e9f93bb74a304 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 2 May 2011 17:24:48 +0200 Subject: x86, NUMA: Make 32bit use common NUMA init path With both _numa_init() methods converted and the rest of init code adjusted, numa_32.c now can switch from the 32bit only init code to the common one in numa.c. * Shim get_memcfg_*()'s are dropped and initmem_init() calls x86_numa_init(), which is updated to handle NUMAQ. * All boilerplate operations including node range limiting, pgdat alloc/init are handled by numa_init(). 32bit only implementation is removed. * 32bit numa_add_memblk(), numa_set_distance() and memory_add_physaddr_to_nid() removed and common versions in numa_32.c enabled for 32bit. This change causes the following behavior changes. * NODE_DATA()->node_start_pfn/node_spanned_pages properly initialized for 32bit too. * Much more sanity checks and configuration cleanups. * Proper handling of node distances. * The same NUMA init messages as 64bit. Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: Yinghai Lu Cc: David Rientjes Cc: Thomas Gleixner Cc: "H. Peter Anvin" --- arch/x86/include/asm/topology.h | 7 -- arch/x86/mm/numa.c | 10 +- arch/x86/mm/numa_32.c | 231 +--------------------------------------- 3 files changed, 7 insertions(+), 241 deletions(-) diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 8dba769..c006924 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -93,18 +93,11 @@ extern void setup_node_to_cpumask_map(void); #define pcibus_to_node(bus) __pcibus_to_node(bus) #ifdef CONFIG_X86_32 -extern unsigned long node_start_pfn[]; -extern unsigned long node_end_pfn[]; -#define node_has_online_mem(nid) (node_start_pfn[nid] != node_end_pfn[nid]) - # define SD_CACHE_NICE_TRIES 1 # define SD_IDLE_IDX 1 - #else - # define SD_CACHE_NICE_TRIES 2 # define SD_IDLE_IDX 2 - #endif /* sched_domains SD_NODE_INIT for NUMA machines */ diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index a72317a..56ed714 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -173,7 +173,6 @@ void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi) (mi->nr_blks - idx) * sizeof(mi->blk[0])); } -#ifdef CONFIG_X86_64 /** * numa_add_memblk - Add one numa_memblk to numa_meminfo * @nid: NUMA node ID of the new memblk @@ -189,7 +188,6 @@ int __init numa_add_memblk(int nid, u64 start, u64 end) { return numa_add_memblk_to(nid, start, end, &numa_meminfo); } -#endif /* Initialize bootmem allocator for a node */ static void __init setup_node_bootmem(int nid, u64 start, u64 end) @@ -413,7 +411,6 @@ static int __init numa_alloc_distance(void) return 0; } -#ifdef CONFIG_X86_64 /** * numa_set_distance - Set NUMA distance from one NUMA to another * @from: the 'from' node to set distance @@ -452,7 +449,6 @@ void __init numa_set_distance(int from, int to, int distance) numa_distance[from * numa_distance_cnt + to] = distance; } -#endif int __node_distance(int from, int to) { @@ -626,6 +622,10 @@ static int __init dummy_numa_init(void) void __init x86_numa_init(void) { if (!numa_off) { +#ifdef CONFIG_X86_NUMAQ + if (!numa_init(numaq_numa_init)) + return; +#endif #ifdef CONFIG_ACPI_NUMA if (!numa_init(x86_acpi_numa_init)) return; @@ -805,7 +805,7 @@ EXPORT_SYMBOL(cpumask_of_node); #endif /* !CONFIG_DEBUG_PER_CPU_MAPS */ -#if defined(CONFIG_X86_64) && defined(CONFIG_MEMORY_HOTPLUG) +#ifdef CONFIG_MEMORY_HOTPLUG int memory_add_physaddr_to_nid(u64 start) { struct numa_meminfo *mi = &numa_meminfo; diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index fbd558f..849a975 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -22,36 +22,11 @@ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ -#include #include #include -#include -#include -#include -#include #include -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -/* - * numa interface - we expect the numa architecture specific code to have - * populated the following initialisation. - * - * 1) node_online_map - the map of all nodes configured (online) in the system - * 2) node_start_pfn - the starting page frame number for a node - * 3) node_end_pfn - the ending page fram number for a node - */ -unsigned long node_start_pfn[MAX_NUMNODES] __read_mostly; -unsigned long node_end_pfn[MAX_NUMNODES] __read_mostly; +#include "numa_internal.h" #ifdef CONFIG_DISCONTIGMEM /* @@ -96,7 +71,6 @@ unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn, } #endif -extern unsigned long find_max_low_pfn(void); extern unsigned long highend_pfn, highstart_pfn; #define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE) @@ -105,68 +79,6 @@ static void *node_remap_start_vaddr[MAX_NUMNODES]; void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); /* - * FLAT - support for basic PC memory model with discontig enabled, essentially - * a single node with all available processors in it with a flat - * memory map. - */ -static int __init get_memcfg_numa_flat(void) -{ - printk(KERN_DEBUG "NUMA - single node, flat memory mode\n"); - - node_start_pfn[0] = 0; - node_end_pfn[0] = max_pfn; - memblock_x86_register_active_regions(0, 0, max_pfn); - - /* Indicate there is one node available. */ - nodes_clear(node_online_map); - node_set_online(0); - return 1; -} - -/* - * Find the highest page frame number we have available for the node - */ -static void __init propagate_e820_map_node(int nid) -{ - if (node_end_pfn[nid] > max_pfn) - node_end_pfn[nid] = max_pfn; - /* - * if a user has given mem=XXXX, then we need to make sure - * that the node _starts_ before that, too, not just ends - */ - if (node_start_pfn[nid] > max_pfn) - node_start_pfn[nid] = max_pfn; - BUG_ON(node_start_pfn[nid] > node_end_pfn[nid]); -} - -/* - * Allocate memory for the pg_data_t for this node via a crude pre-bootmem - * method. For node zero take this from the bottom of memory, for - * subsequent nodes place them at node_remap_start_vaddr which contains - * node local data in physically node local memory. See setup_memory() - * for details. - */ -static void __init allocate_pgdat(int nid) -{ - char buf[16]; - - NODE_DATA(nid) = alloc_remap(nid, ALIGN(sizeof(pg_data_t), PAGE_SIZE)); - if (!NODE_DATA(nid)) { - unsigned long pgdat_phys; - pgdat_phys = memblock_find_in_range(min_low_pfn<>PAGE_SHIFT)); - memset(buf, 0, sizeof(buf)); - sprintf(buf, "NODE_DATA %d", nid); - memblock_x86_reserve_range(pgdat_phys, pgdat_phys + sizeof(pg_data_t), buf); - } - printk(KERN_DEBUG "allocate_pgdat: node %d NODE_DATA %08lx\n", - nid, (unsigned long)NODE_DATA(nid)); -} - -/* * Remap memory allocator */ static unsigned long node_remap_start_pfn[MAX_NUMNODES]; @@ -322,76 +234,9 @@ void __init init_alloc_remap(int nid, u64 start, u64 end) nid, node_pa, node_pa + size, remap_va, remap_va + size); } -static int get_memcfg_numaq(void) -{ -#ifdef CONFIG_X86_NUMAQ - int nid; - - if (numa_off) - return 0; - - if (numaq_numa_init() < 0) { - nodes_clear(numa_nodes_parsed); - remove_all_active_ranges(); - return 0; - } - - for_each_node_mask(nid, numa_nodes_parsed) - node_set_online(nid); - sort_node_map(); - return 1; -#else - return 0; -#endif -} - -static int get_memcfg_from_srat(void) -{ -#ifdef CONFIG_ACPI_NUMA - int nid; - - if (numa_off) - return 0; - - if (x86_acpi_numa_init() < 0) { - nodes_clear(numa_nodes_parsed); - remove_all_active_ranges(); - return 0; - } - - for_each_node_mask(nid, numa_nodes_parsed) - node_set_online(nid); - sort_node_map(); - return 1; -#else - return 0; -#endif -} - -static void get_memcfg_numa(void) -{ - if (get_memcfg_numaq()) - return; - if (get_memcfg_from_srat()) - return; - get_memcfg_numa_flat(); -} - void __init initmem_init(void) { - int nid; - - get_memcfg_numa(); - numa_init_array(); - - for_each_online_node(nid) { - u64 start = (u64)node_start_pfn[nid] << PAGE_SHIFT; - u64 end = min((u64)node_end_pfn[nid] << PAGE_SHIFT, - (u64)max_pfn << PAGE_SHIFT); - - if (start < end) - init_alloc_remap(nid, start, end); - } + x86_numa_init(); #ifdef CONFIG_HIGHMEM highstart_pfn = highend_pfn = max_pfn; @@ -412,81 +257,9 @@ void __init initmem_init(void) printk(KERN_DEBUG "Low memory ends at vaddr %08lx\n", (ulong) pfn_to_kaddr(max_low_pfn)); - for_each_online_node(nid) - allocate_pgdat(nid); printk(KERN_DEBUG "High memory starts at vaddr %08lx\n", (ulong) pfn_to_kaddr(highstart_pfn)); - for_each_online_node(nid) - propagate_e820_map_node(nid); - - for_each_online_node(nid) { - memset(NODE_DATA(nid), 0, sizeof(struct pglist_data)); - NODE_DATA(nid)->node_id = nid; - } setup_bootmem_allocator(); } - -#ifdef CONFIG_MEMORY_HOTPLUG -static int paddr_to_nid(u64 addr) -{ - int nid; - unsigned long pfn = PFN_DOWN(addr); - - for_each_node(nid) - if (node_start_pfn[nid] <= pfn && - pfn < node_end_pfn[nid]) - return nid; - - return -1; -} - -/* - * This function is used to ask node id BEFORE memmap and mem_section's - * initialization (pfn_to_nid() can't be used yet). - * If _PXM is not defined on ACPI's DSDT, node id must be found by this. - */ -int memory_add_physaddr_to_nid(u64 addr) -{ - int nid = paddr_to_nid(addr); - return (nid >= 0) ? nid : 0; -} - -EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); -#endif - -/* temporary shim, will go away soon */ -int __init numa_add_memblk(int nid, u64 start, u64 end) -{ - unsigned long start_pfn = start >> PAGE_SHIFT; - unsigned long end_pfn = end >> PAGE_SHIFT; - - printk(KERN_DEBUG "nid %d start_pfn %08lx end_pfn %08lx\n", - nid, start_pfn, end_pfn); - - if (start >= (u64)max_pfn << PAGE_SHIFT) { - printk(KERN_INFO "Ignoring SRAT pfns: %08lx - %08lx\n", - start_pfn, end_pfn); - return 0; - } - - node_set_online(nid); - memblock_x86_register_active_regions(nid, start_pfn, - min(end_pfn, max_pfn)); - - if (!node_has_online_mem(nid)) { - node_start_pfn[nid] = start_pfn; - node_end_pfn[nid] = end_pfn; - } else { - node_start_pfn[nid] = min(node_start_pfn[nid], start_pfn); - node_end_pfn[nid] = max(node_end_pfn[nid], end_pfn); - } - return 0; -} - -/* temporary shim, will go away soon */ -void __init numa_set_distance(int from, int to, int distance) -{ - /* nada */ -} -- cgit v1.1 From 752d4f372f90a2f6eb562aaffb639957890cbcab Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 2 May 2011 17:24:48 +0200 Subject: x86, NUMA: Make numa_init_array() static numa_init_array() no longer has users outside of numa.c. Make it static. Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: Yinghai Lu Cc: David Rientjes Cc: Thomas Gleixner Cc: "H. Peter Anvin" --- arch/x86/include/asm/numa.h | 2 -- arch/x86/mm/numa.c | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h index 7454086..bfacd2c 100644 --- a/arch/x86/include/asm/numa.h +++ b/arch/x86/include/asm/numa.h @@ -61,14 +61,12 @@ static inline int numa_cpu_node(int cpu) #ifdef CONFIG_NUMA extern void __cpuinit numa_set_node(int cpu, int node); extern void __cpuinit numa_clear_node(int cpu); -extern void __init numa_init_array(void); extern void __init init_cpu_to_node(void); extern void __cpuinit numa_add_cpu(int cpu); extern void __cpuinit numa_remove_cpu(int cpu); #else /* CONFIG_NUMA */ static inline void numa_set_node(int cpu, int node) { } static inline void numa_clear_node(int cpu) { } -static inline void numa_init_array(void) { } static inline void init_cpu_to_node(void) { } static inline void numa_add_cpu(int cpu) { } static inline void numa_remove_cpu(int cpu) { } diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 56ed714..ecb5685 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -535,7 +535,7 @@ static int __init numa_register_memblks(struct numa_meminfo *mi) * as the number of CPUs is not known yet. We round robin the existing * nodes. */ -void __init numa_init_array(void) +static void __init numa_init_array(void) { int rr, i; -- cgit v1.1 From c6f58878204b0414e00b43f9bcebf754284f95b4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 2 May 2011 17:24:48 +0200 Subject: x86, NUMA: Rename amdtopology_64.c to amdtopology.c amdtopology is going to be used by 32bit too drop _64 suffix. This is pure rename. Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: Yinghai Lu Cc: David Rientjes Cc: Thomas Gleixner Cc: "H. Peter Anvin" --- arch/x86/mm/Makefile | 2 +- arch/x86/mm/amdtopology.c | 196 +++++++++++++++++++++++++++++++++++++++++++ arch/x86/mm/amdtopology_64.c | 196 ------------------------------------------- 3 files changed, 197 insertions(+), 197 deletions(-) create mode 100644 arch/x86/mm/amdtopology.c delete mode 100644 arch/x86/mm/amdtopology_64.c diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 62997be..3d11327 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -23,7 +23,7 @@ mmiotrace-y := kmmio.o pf_in.o mmio-mod.o obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o obj-$(CONFIG_NUMA) += numa.o numa_$(BITS).o -obj-$(CONFIG_AMD_NUMA) += amdtopology_64.o +obj-$(CONFIG_AMD_NUMA) += amdtopology.o obj-$(CONFIG_ACPI_NUMA) += srat.o obj-$(CONFIG_NUMA_EMU) += numa_emulation.o diff --git a/arch/x86/mm/amdtopology.c b/arch/x86/mm/amdtopology.c new file mode 100644 index 0000000..0919c26 --- /dev/null +++ b/arch/x86/mm/amdtopology.c @@ -0,0 +1,196 @@ +/* + * AMD NUMA support. + * Discover the memory map and associated nodes. + * + * This version reads it directly from the AMD northbridge. + * + * Copyright 2002,2003 Andi Kleen, SuSE Labs. + */ +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static unsigned char __initdata nodeids[8]; + +static __init int find_northbridge(void) +{ + int num; + + for (num = 0; num < 32; num++) { + u32 header; + + header = read_pci_config(0, num, 0, 0x00); + if (header != (PCI_VENDOR_ID_AMD | (0x1100<<16)) && + header != (PCI_VENDOR_ID_AMD | (0x1200<<16)) && + header != (PCI_VENDOR_ID_AMD | (0x1300<<16))) + continue; + + header = read_pci_config(0, num, 1, 0x00); + if (header != (PCI_VENDOR_ID_AMD | (0x1101<<16)) && + header != (PCI_VENDOR_ID_AMD | (0x1201<<16)) && + header != (PCI_VENDOR_ID_AMD | (0x1301<<16))) + continue; + return num; + } + + return -ENOENT; +} + +static __init void early_get_boot_cpu_id(void) +{ + /* + * need to get the APIC ID of the BSP so can use that to + * create apicid_to_node in amd_scan_nodes() + */ +#ifdef CONFIG_X86_MPPARSE + /* + * get boot-time SMP configuration: + */ + if (smp_found_config) + early_get_smp_config(); +#endif +} + +int __init amd_numa_init(void) +{ + unsigned long start = PFN_PHYS(0); + unsigned long end = PFN_PHYS(max_pfn); + unsigned numnodes; + unsigned long prevbase; + int i, j, nb; + u32 nodeid, reg; + unsigned int bits, cores, apicid_base; + + if (!early_pci_allowed()) + return -EINVAL; + + nb = find_northbridge(); + if (nb < 0) + return nb; + + pr_info("Scanning NUMA topology in Northbridge %d\n", nb); + + reg = read_pci_config(0, nb, 0, 0x60); + numnodes = ((reg >> 4) & 0xF) + 1; + if (numnodes <= 1) + return -ENOENT; + + pr_info("Number of physical nodes %d\n", numnodes); + + prevbase = 0; + for (i = 0; i < 8; i++) { + unsigned long base, limit; + + base = read_pci_config(0, nb, 1, 0x40 + i*8); + limit = read_pci_config(0, nb, 1, 0x44 + i*8); + + nodeids[i] = nodeid = limit & 7; + if ((base & 3) == 0) { + if (i < numnodes) + pr_info("Skipping disabled node %d\n", i); + continue; + } + if (nodeid >= numnodes) { + pr_info("Ignoring excess node %d (%lx:%lx)\n", nodeid, + base, limit); + continue; + } + + if (!limit) { + pr_info("Skipping node entry %d (base %lx)\n", + i, base); + continue; + } + if ((base >> 8) & 3 || (limit >> 8) & 3) { + pr_err("Node %d using interleaving mode %lx/%lx\n", + nodeid, (base >> 8) & 3, (limit >> 8) & 3); + return -EINVAL; + } + if (node_isset(nodeid, numa_nodes_parsed)) { + pr_info("Node %d already present, skipping\n", + nodeid); + continue; + } + + limit >>= 16; + limit <<= 24; + limit |= (1<<24)-1; + limit++; + + if (limit > end) + limit = end; + if (limit <= base) + continue; + + base >>= 16; + base <<= 24; + + if (base < start) + base = start; + if (limit > end) + limit = end; + if (limit == base) { + pr_err("Empty node %d\n", nodeid); + continue; + } + if (limit < base) { + pr_err("Node %d bogus settings %lx-%lx.\n", + nodeid, base, limit); + continue; + } + + /* Could sort here, but pun for now. Should not happen anyroads. */ + if (prevbase > base) { + pr_err("Node map not sorted %lx,%lx\n", + prevbase, base); + return -EINVAL; + } + + pr_info("Node %d MemBase %016lx Limit %016lx\n", + nodeid, base, limit); + + prevbase = base; + numa_add_memblk(nodeid, base, limit); + node_set(nodeid, numa_nodes_parsed); + } + + if (!nodes_weight(numa_nodes_parsed)) + return -ENOENT; + + /* + * We seem to have valid NUMA configuration. Map apicids to nodes + * using the coreid bits from early_identify_cpu. + */ + bits = boot_cpu_data.x86_coreid_bits; + cores = 1 << bits; + apicid_base = 0; + + /* get the APIC ID of the BSP early for systems with apicid lifting */ + early_get_boot_cpu_id(); + if (boot_cpu_physical_apicid > 0) { + pr_info("BSP APIC ID: %02x\n", boot_cpu_physical_apicid); + apicid_base = boot_cpu_physical_apicid; + } + + for_each_node_mask(i, numa_nodes_parsed) + for (j = apicid_base; j < cores + apicid_base; j++) + set_apicid_to_node((i << bits) + j, i); + + return 0; +} diff --git a/arch/x86/mm/amdtopology_64.c b/arch/x86/mm/amdtopology_64.c deleted file mode 100644 index 0919c26..0000000 --- a/arch/x86/mm/amdtopology_64.c +++ /dev/null @@ -1,196 +0,0 @@ -/* - * AMD NUMA support. - * Discover the memory map and associated nodes. - * - * This version reads it directly from the AMD northbridge. - * - * Copyright 2002,2003 Andi Kleen, SuSE Labs. - */ -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static unsigned char __initdata nodeids[8]; - -static __init int find_northbridge(void) -{ - int num; - - for (num = 0; num < 32; num++) { - u32 header; - - header = read_pci_config(0, num, 0, 0x00); - if (header != (PCI_VENDOR_ID_AMD | (0x1100<<16)) && - header != (PCI_VENDOR_ID_AMD | (0x1200<<16)) && - header != (PCI_VENDOR_ID_AMD | (0x1300<<16))) - continue; - - header = read_pci_config(0, num, 1, 0x00); - if (header != (PCI_VENDOR_ID_AMD | (0x1101<<16)) && - header != (PCI_VENDOR_ID_AMD | (0x1201<<16)) && - header != (PCI_VENDOR_ID_AMD | (0x1301<<16))) - continue; - return num; - } - - return -ENOENT; -} - -static __init void early_get_boot_cpu_id(void) -{ - /* - * need to get the APIC ID of the BSP so can use that to - * create apicid_to_node in amd_scan_nodes() - */ -#ifdef CONFIG_X86_MPPARSE - /* - * get boot-time SMP configuration: - */ - if (smp_found_config) - early_get_smp_config(); -#endif -} - -int __init amd_numa_init(void) -{ - unsigned long start = PFN_PHYS(0); - unsigned long end = PFN_PHYS(max_pfn); - unsigned numnodes; - unsigned long prevbase; - int i, j, nb; - u32 nodeid, reg; - unsigned int bits, cores, apicid_base; - - if (!early_pci_allowed()) - return -EINVAL; - - nb = find_northbridge(); - if (nb < 0) - return nb; - - pr_info("Scanning NUMA topology in Northbridge %d\n", nb); - - reg = read_pci_config(0, nb, 0, 0x60); - numnodes = ((reg >> 4) & 0xF) + 1; - if (numnodes <= 1) - return -ENOENT; - - pr_info("Number of physical nodes %d\n", numnodes); - - prevbase = 0; - for (i = 0; i < 8; i++) { - unsigned long base, limit; - - base = read_pci_config(0, nb, 1, 0x40 + i*8); - limit = read_pci_config(0, nb, 1, 0x44 + i*8); - - nodeids[i] = nodeid = limit & 7; - if ((base & 3) == 0) { - if (i < numnodes) - pr_info("Skipping disabled node %d\n", i); - continue; - } - if (nodeid >= numnodes) { - pr_info("Ignoring excess node %d (%lx:%lx)\n", nodeid, - base, limit); - continue; - } - - if (!limit) { - pr_info("Skipping node entry %d (base %lx)\n", - i, base); - continue; - } - if ((base >> 8) & 3 || (limit >> 8) & 3) { - pr_err("Node %d using interleaving mode %lx/%lx\n", - nodeid, (base >> 8) & 3, (limit >> 8) & 3); - return -EINVAL; - } - if (node_isset(nodeid, numa_nodes_parsed)) { - pr_info("Node %d already present, skipping\n", - nodeid); - continue; - } - - limit >>= 16; - limit <<= 24; - limit |= (1<<24)-1; - limit++; - - if (limit > end) - limit = end; - if (limit <= base) - continue; - - base >>= 16; - base <<= 24; - - if (base < start) - base = start; - if (limit > end) - limit = end; - if (limit == base) { - pr_err("Empty node %d\n", nodeid); - continue; - } - if (limit < base) { - pr_err("Node %d bogus settings %lx-%lx.\n", - nodeid, base, limit); - continue; - } - - /* Could sort here, but pun for now. Should not happen anyroads. */ - if (prevbase > base) { - pr_err("Node map not sorted %lx,%lx\n", - prevbase, base); - return -EINVAL; - } - - pr_info("Node %d MemBase %016lx Limit %016lx\n", - nodeid, base, limit); - - prevbase = base; - numa_add_memblk(nodeid, base, limit); - node_set(nodeid, numa_nodes_parsed); - } - - if (!nodes_weight(numa_nodes_parsed)) - return -ENOENT; - - /* - * We seem to have valid NUMA configuration. Map apicids to nodes - * using the coreid bits from early_identify_cpu. - */ - bits = boot_cpu_data.x86_coreid_bits; - cores = 1 << bits; - apicid_base = 0; - - /* get the APIC ID of the BSP early for systems with apicid lifting */ - early_get_boot_cpu_id(); - if (boot_cpu_physical_apicid > 0) { - pr_info("BSP APIC ID: %02x\n", boot_cpu_physical_apicid); - apicid_base = boot_cpu_physical_apicid; - } - - for_each_node_mask(i, numa_nodes_parsed) - for (j = apicid_base; j < cores + apicid_base; j++) - set_apicid_to_node((i << bits) + j, i); - - return 0; -} -- cgit v1.1 From 2706a0bf7b02693ed88752df877f10c2206292ff Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 2 May 2011 17:24:48 +0200 Subject: x86, NUMA: Enable CONFIG_AMD_NUMA on 32bit too Now that NUMA init path is unified, amdtopology can be enabled on 32bit. Make amdtopology.c safe on 32bit by explicitly using u64 and drop X86_64 dependency from Kconfig. Inclusion of bootmem.h is added for max_pfn declaration. Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: Yinghai Lu Cc: David Rientjes Cc: Thomas Gleixner Cc: "H. Peter Anvin" --- arch/x86/Kconfig | 2 +- arch/x86/mm/amdtopology.c | 21 +++++++++++---------- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 8db4fbf..50cb68d 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1174,7 +1174,7 @@ comment "NUMA (Summit) requires SMP, 64GB highmem support, ACPI" config AMD_NUMA def_bool y prompt "Old style AMD Opteron NUMA detection" - depends on X86_64 && NUMA && PCI + depends on NUMA && PCI ---help--- Enable AMD NUMA node topology detection. You should say Y here if you have a multi processor AMD system. This uses an old method to diff --git a/arch/x86/mm/amdtopology.c b/arch/x86/mm/amdtopology.c index 0919c26..5247d01 100644 --- a/arch/x86/mm/amdtopology.c +++ b/arch/x86/mm/amdtopology.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -69,10 +70,10 @@ static __init void early_get_boot_cpu_id(void) int __init amd_numa_init(void) { - unsigned long start = PFN_PHYS(0); - unsigned long end = PFN_PHYS(max_pfn); + u64 start = PFN_PHYS(0); + u64 end = PFN_PHYS(max_pfn); unsigned numnodes; - unsigned long prevbase; + u64 prevbase; int i, j, nb; u32 nodeid, reg; unsigned int bits, cores, apicid_base; @@ -95,7 +96,7 @@ int __init amd_numa_init(void) prevbase = 0; for (i = 0; i < 8; i++) { - unsigned long base, limit; + u64 base, limit; base = read_pci_config(0, nb, 1, 0x40 + i*8); limit = read_pci_config(0, nb, 1, 0x44 + i*8); @@ -107,18 +108,18 @@ int __init amd_numa_init(void) continue; } if (nodeid >= numnodes) { - pr_info("Ignoring excess node %d (%lx:%lx)\n", nodeid, + pr_info("Ignoring excess node %d (%Lx:%Lx)\n", nodeid, base, limit); continue; } if (!limit) { - pr_info("Skipping node entry %d (base %lx)\n", + pr_info("Skipping node entry %d (base %Lx)\n", i, base); continue; } if ((base >> 8) & 3 || (limit >> 8) & 3) { - pr_err("Node %d using interleaving mode %lx/%lx\n", + pr_err("Node %d using interleaving mode %Lx/%Lx\n", nodeid, (base >> 8) & 3, (limit >> 8) & 3); return -EINVAL; } @@ -150,19 +151,19 @@ int __init amd_numa_init(void) continue; } if (limit < base) { - pr_err("Node %d bogus settings %lx-%lx.\n", + pr_err("Node %d bogus settings %Lx-%Lx.\n", nodeid, base, limit); continue; } /* Could sort here, but pun for now. Should not happen anyroads. */ if (prevbase > base) { - pr_err("Node map not sorted %lx,%lx\n", + pr_err("Node map not sorted %Lx,%Lx\n", prevbase, base); return -EINVAL; } - pr_info("Node %d MemBase %016lx Limit %016lx\n", + pr_info("Node %d MemBase %016Lx Limit %016Lx\n", nodeid, base, limit); prevbase = base; -- cgit v1.1 From 1b7e03ef7570568d2fb9e6640d7006a0edd728f6 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 2 May 2011 17:24:48 +0200 Subject: x86, NUMA: Enable emulation on 32bit too Now that NUMA init path is unified, NUMA emulation can be enabled on 32bit. Make numa_emluation.c safe on 32bit by doing the followings. * Define MAX_DMA32_PFN on 32bit too. * Include bootmem.h for max_pfn declaration. * Use u64 explicitly and always use PFN_PHYS() when converting page number to address. * Avoid __udivdi3() generation on 32bit by doing number of pages calculation instead in split_nodes_interleave(). And drop X86_64 dependency from Kconfig. Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: Yinghai Lu Cc: David Rientjes Cc: Thomas Gleixner Cc: "H. Peter Anvin" --- arch/x86/Kconfig | 2 +- arch/x86/include/asm/dma.h | 10 +++------- arch/x86/mm/numa_emulation.c | 16 +++++++++++----- 3 files changed, 15 insertions(+), 13 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 50cb68d..648fca4 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1201,7 +1201,7 @@ config NODES_SPAN_OTHER_NODES config NUMA_EMU bool "NUMA emulation" - depends on X86_64 && NUMA + depends on NUMA ---help--- Enable NUMA emulation. A flat machine will be split into virtual nodes when booted with "numa=fake=N", where N is the diff --git a/arch/x86/include/asm/dma.h b/arch/x86/include/asm/dma.h index d1a314b..0bdb0c5 100644 --- a/arch/x86/include/asm/dma.h +++ b/arch/x86/include/asm/dma.h @@ -72,19 +72,15 @@ /* 16MB ISA DMA zone */ #define MAX_DMA_PFN ((16 * 1024 * 1024) >> PAGE_SHIFT) -#ifdef CONFIG_X86_32 +/* 4GB broken PCI/AGP hardware bus master zone */ +#define MAX_DMA32_PFN ((4UL * 1024 * 1024 * 1024) >> PAGE_SHIFT) +#ifdef CONFIG_X86_32 /* The maximum address that we can perform a DMA transfer to on this platform */ #define MAX_DMA_ADDRESS (PAGE_OFFSET + 0x1000000) - #else - -/* 4GB broken PCI/AGP hardware bus master zone */ -#define MAX_DMA32_PFN ((4UL * 1024 * 1024 * 1024) >> PAGE_SHIFT) - /* Compat define for old dma zone */ #define MAX_DMA_ADDRESS ((unsigned long)__va(MAX_DMA_PFN << PAGE_SHIFT)) - #endif /* 8237 DMA controllers */ diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c index de84cc1..d0ed086 100644 --- a/arch/x86/mm/numa_emulation.c +++ b/arch/x86/mm/numa_emulation.c @@ -5,6 +5,7 @@ #include #include #include +#include #include #include "numa_internal.h" @@ -84,7 +85,13 @@ static int __init split_nodes_interleave(struct numa_meminfo *ei, nr_nodes = MAX_NUMNODES; } - size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) / nr_nodes; + /* + * Calculate target node size. x86_32 freaks on __udivdi3() so do + * the division in ulong number of pages and convert back. + */ + size = max_addr - addr - memblock_x86_hole_size(addr, max_addr); + size = PFN_PHYS((unsigned long)(size >> PAGE_SHIFT) / nr_nodes); + /* * Calculate the number of big nodes that can be allocated as a result * of consolidating the remainder. @@ -226,7 +233,7 @@ static int __init split_nodes_size_interleave(struct numa_meminfo *ei, */ while (nodes_weight(physnode_mask)) { for_each_node_mask(i, physnode_mask) { - u64 dma32_end = MAX_DMA32_PFN << PAGE_SHIFT; + u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN); u64 start, limit, end; int phys_blk; @@ -298,7 +305,7 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) { static struct numa_meminfo ei __initdata; static struct numa_meminfo pi __initdata; - const u64 max_addr = max_pfn << PAGE_SHIFT; + const u64 max_addr = PFN_PHYS(max_pfn); u8 *phys_dist = NULL; size_t phys_size = numa_dist_cnt * numa_dist_cnt * sizeof(phys_dist[0]); int max_emu_nid, dfl_phys_nid; @@ -342,8 +349,7 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) if (numa_dist_cnt) { u64 phys; - phys = memblock_find_in_range(0, - (u64)max_pfn_mapped << PAGE_SHIFT, + phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), phys_size, PAGE_SIZE); if (phys == MEMBLOCK_ERROR) { pr_warning("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n"); -- cgit v1.1 From a56bca80db8903bb557b9ac38da68dc5b98ea672 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 2 May 2011 17:24:49 +0200 Subject: x86, NUMA: Rename setup_node_bootmem() to setup_node_data() After using memblock to replace bootmem, that function only sets up node_data now. Change the name to reflect what it actually does. tj: Minor adjustment to the patch description. Signed-off-by: Yinghai Lu Signed-off-by: Tejun Heo --- arch/x86/mm/numa.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index ecb5685..9a0ed31 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -189,8 +189,8 @@ int __init numa_add_memblk(int nid, u64 start, u64 end) return numa_add_memblk_to(nid, start, end, &numa_meminfo); } -/* Initialize bootmem allocator for a node */ -static void __init setup_node_bootmem(int nid, u64 start, u64 end) +/* Initialize NODE_DATA for a node on the local memory */ +static void __init setup_node_data(int nid, u64 start, u64 end) { const u64 nd_low = PFN_PHYS(MAX_DMA_PFN); const u64 nd_high = PFN_PHYS(max_pfn_mapped); @@ -522,7 +522,7 @@ static int __init numa_register_memblks(struct numa_meminfo *mi) } if (start < end) - setup_node_bootmem(nid, start, end); + setup_node_data(nid, start, end); } return 0; -- cgit v1.1 From e5a10c1bd12a5d71bbb6406c1b0dbbc9d8958397 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 2 May 2011 17:24:49 +0200 Subject: x86, NUMA: Trim numa meminfo with max_pfn in a separate loop During testing 32bit numa unifying code from tj, found one system with more than 64g fails to use numa. It turns out we do not trim numa meminfo correctly against max_pfn in case start address of a node is higher than 64GiB. Bug fix made it to tip tree. This patch moves the checking and trimming to a separate loop. So we don't need to compare low/high in following merge loops. It makes the code more readable. Also it makes the node merge printouts less strange. On a 512GiB numa system with 32bit, before: > NUMA: Node 0 [0,a0000) + [100000,80000000) -> [0,80000000) > NUMA: Node 0 [0,80000000) + [100000000,1080000000) -> [0,1000000000) after: > NUMA: Node 0 [0,a0000) + [100000,80000000) -> [0,80000000) > NUMA: Node 0 [0,80000000) + [100000000,1000000000) -> [0,1000000000) Signed-off-by: Yinghai Lu [Updated patch description and comment slightly.] Signed-off-by: Tejun Heo --- arch/x86/mm/numa.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 9a0ed31..f5510d8 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -270,6 +270,7 @@ int __init numa_cleanup_meminfo(struct numa_meminfo *mi) const u64 high = PFN_PHYS(max_pfn); int i, j, k; + /* first, trim all entries */ for (i = 0; i < mi->nr_blks; i++) { struct numa_memblk *bi = &mi->blk[i]; @@ -278,10 +279,13 @@ int __init numa_cleanup_meminfo(struct numa_meminfo *mi) bi->end = min(bi->end, high); /* and there's no empty block */ - if (bi->start >= bi->end) { + if (bi->start >= bi->end) numa_remove_memblk_from(i--, mi); - continue; - } + } + + /* merge neighboring / overlapping entries */ + for (i = 0; i < mi->nr_blks; i++) { + struct numa_memblk *bi = &mi->blk[i]; for (j = i + 1; j < mi->nr_blks; j++) { struct numa_memblk *bj = &mi->blk[j]; @@ -311,8 +315,8 @@ int __init numa_cleanup_meminfo(struct numa_meminfo *mi) */ if (bi->nid != bj->nid) continue; - start = max(min(bi->start, bj->start), low); - end = min(max(bi->end, bj->end), high); + start = min(bi->start, bj->start); + end = max(bi->end, bj->end); for (k = 0; k < mi->nr_blks; k++) { struct numa_memblk *bk = &mi->blk[k]; @@ -332,6 +336,7 @@ int __init numa_cleanup_meminfo(struct numa_meminfo *mi) } } + /* clear unused ones */ for (i = mi->nr_blks; i < ARRAY_SIZE(mi->blk); i++) { mi->blk[i].start = mi->blk[i].end = 0; mi->blk[i].nid = NUMA_NO_NODE; -- cgit v1.1 From c42321c76b0ef472e3bae4bfcb0f46ab19e038ef Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 2 May 2011 18:16:22 +0200 Subject: genirq: Make generic irq chip depend on CONFIG_GENERIC_IRQ_CHIP Only compile it in when there are users. Signed-off-by: Thomas Gleixner Cc: linux-arm-kernel@lists.infradead.org --- kernel/irq/Kconfig | 4 ++++ kernel/irq/Makefile | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index c574f9a..d1d051b3 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -48,6 +48,10 @@ config IRQ_PREFLOW_FASTEOI config IRQ_EDGE_EOI_HANDLER bool +# Generic configurable interrupt chip implementation +config GENERIC_IRQ_CHIP + bool + # Support forced irq threading config IRQ_FORCED_THREADING bool diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile index e7a13bd..7329005 100644 --- a/kernel/irq/Makefile +++ b/kernel/irq/Makefile @@ -1,6 +1,6 @@ obj-y := irqdesc.o handle.o manage.o spurious.o resend.o chip.o dummychip.o devres.o -obj-y += generic-chip.o +obj-$(CONFIG_GENERIC_IRQ_CHIP) += generic-chip.o obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o -- cgit v1.1 From b50b9f408502a2ea90459ae36ba8cdc9cc005cfe Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Mon, 25 Apr 2011 18:17:09 +0300 Subject: UBIFS: do not free write-buffers when in R/O mode Currently UBIFS has a small optimization - it frees write-buffers when it is re-mounted from R/W mode to R/O mode. Of course, when it is mounted R/O, it does not allocate write-buffers as well. This optimization is nice but it leads to subtle problems and complications in recovery, which I can reproduce using the integck test. The symptoms are that after a power cut the file-system cannot be mounted if we first mount it R/O, and then re-mount R/W - 'ubifs_rcvry_gc_commit()' prints: UBIFS error (pid 34456): could not find an empty LEB Analysis of the problem. When mounting R/W, the reply process sets journal heads to buds [1], but when mounting R/O - it does not do this, because the write-buffers are not allocated. So 'ubifs_rcvry_gc_commit()' works completely differently for the same file-system but for the following 2 cases: 1. mounting R/W after a power cut and recover 2. mounting R/O after a power cut, re-mounting R/W and run deferred recovery In the former case, we have journal heads seeked to the a bud, in the latter case, they are non-seeked (wbuf->lnum == -1). So in the latter case we do not try to recover the GC LEB by garbage-collecting to the GC head, but we just try to find an empty LEB, and there may be no empty LEBs, so we just fail. On the other hand, in the former case (mount R/W), we are able to make a GC LEB (@c->gc_lnum) by garbage-collecting. Thus, let's remove this small nice optimization and always allocate write-buffers. This should not make too big difference - we have only 3 of them, each of max. write unit size, which is usually 2KiB. So this is about 6KiB of RAM for the typical case, and only when mounted R/O. [1]: Note, currently the replay process is setting (seeking) the journal heads to _some_ buds, not necessarily to the buds which had been the journal heads before the power cut happened. This will be fixed separately. Signed-off-by: Artem Bityutskiy Cc: stable@kernel.org --- fs/ubifs/log.c | 20 -------------------- fs/ubifs/super.c | 15 ++++----------- 2 files changed, 4 insertions(+), 31 deletions(-) diff --git a/fs/ubifs/log.c b/fs/ubifs/log.c index 4d0cb12..40fa780 100644 --- a/fs/ubifs/log.c +++ b/fs/ubifs/log.c @@ -175,26 +175,6 @@ void ubifs_add_bud(struct ubifs_info *c, struct ubifs_bud *bud) } /** - * ubifs_create_buds_lists - create journal head buds lists for remount rw. - * @c: UBIFS file-system description object - */ -void ubifs_create_buds_lists(struct ubifs_info *c) -{ - struct rb_node *p; - - spin_lock(&c->buds_lock); - p = rb_first(&c->buds); - while (p) { - struct ubifs_bud *bud = rb_entry(p, struct ubifs_bud, rb); - struct ubifs_jhead *jhead = &c->jheads[bud->jhead]; - - list_add_tail(&bud->list, &jhead->buds_list); - p = rb_next(p); - } - spin_unlock(&c->buds_lock); -} - -/** * ubifs_add_bud_to_log - add a new bud to the log. * @c: UBIFS file-system description object * @jhead: journal head the bud belongs to diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index be6c7b0..04ad07f 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -1257,12 +1257,12 @@ static int mount_ubifs(struct ubifs_info *c) goto out_free; } + err = alloc_wbufs(c); + if (err) + goto out_cbuf; + sprintf(c->bgt_name, BGT_NAME_PATTERN, c->vi.ubi_num, c->vi.vol_id); if (!c->ro_mount) { - err = alloc_wbufs(c); - if (err) - goto out_cbuf; - /* Create background thread */ c->bgt = kthread_create(ubifs_bg_thread, c, "%s", c->bgt_name); if (IS_ERR(c->bgt)) { @@ -1631,12 +1631,6 @@ static int ubifs_remount_rw(struct ubifs_info *c) if (err) goto out; - err = alloc_wbufs(c); - if (err) - goto out; - - ubifs_create_buds_lists(c); - /* Create background thread */ c->bgt = kthread_create(ubifs_bg_thread, c, "%s", c->bgt_name); if (IS_ERR(c->bgt)) { @@ -1744,7 +1738,6 @@ static void ubifs_remount_ro(struct ubifs_info *c) if (err) ubifs_ro_mode(c, err); - free_wbufs(c); vfree(c->orph_buf); c->orph_buf = NULL; kfree(c->write_reserve_buf); -- cgit v1.1 From 52c6e6f990669deac3f370f1603815adb55a1dbd Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Mon, 25 Apr 2011 18:46:31 +0300 Subject: UBIFS: seek journal heads to the latest bud in replay This is the second fix of the following symptom: UBIFS error (pid 34456): could not find an empty LEB which sometimes happens after power cuts when we mount the file-system - UBIFS refuses it with the above error message which comes from the 'ubifs_rcvry_gc_commit()' function. I can reproduce this using the integck test with the UBIFS power cut emulation enabled. Analysis of the problem. Currently UBIFS replay seeks the journal heads to the last _replayed_ bud. But the buds are replayed out-of-order, so the replay basically seeks journal heads to the "random" bud belonging to this head, and not to the _last_ one. The result of this is that the GC head may be seeked to a full LEB with no free space, or very little free space. And 'ubifs_rcvry_gc_commit()' tries to find a fully or mostly dirty LEB to match the current GC head (because we need to garbage-collect that dirty LEB at one go, because we do not have @c->gc_lnum). So 'ubifs_find_dirty_leb()' fails and we fall back to finding an empty LEB and also fail. As a result - recovery fails and mounting fails. This patch teaches the replay to initialize the GC heads exactly to the latest buds, i.e. the buds which have the largest sequence number in corresponding log reference nodes. Signed-off-by: Artem Bityutskiy Cc: stable@kernel.org --- fs/ubifs/replay.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c index eed0fcf..d3d6d36 100644 --- a/fs/ubifs/replay.c +++ b/fs/ubifs/replay.c @@ -59,6 +59,7 @@ enum { * @new_size: truncation new size * @free: amount of free space in a bud * @dirty: amount of dirty space in a bud from padding and deletion nodes + * @jhead: journal head number of the bud * * UBIFS journal replay must compare node sequence numbers, which means it must * build a tree of node information to insert into the TNC. @@ -80,6 +81,7 @@ struct replay_entry { struct { int free; int dirty; + int jhead; }; }; }; @@ -159,6 +161,11 @@ static int set_bud_lprops(struct ubifs_info *c, struct replay_entry *r) err = PTR_ERR(lp); goto out; } + + /* Make sure the journal head points to the latest bud */ + err = ubifs_wbuf_seek_nolock(&c->jheads[r->jhead].wbuf, r->lnum, + c->leb_size - r->free, UBI_SHORTTERM); + out: ubifs_release_lprops(c); return err; @@ -627,10 +634,6 @@ static int replay_bud(struct ubifs_info *c, int lnum, int offs, int jhead, ubifs_assert(sleb->endpt - offs >= used); ubifs_assert(sleb->endpt % c->min_io_size == 0); - if (sleb->endpt + c->min_io_size <= c->leb_size && !c->ro_mount) - err = ubifs_wbuf_seek_nolock(&c->jheads[jhead].wbuf, lnum, - sleb->endpt, UBI_SHORTTERM); - *dirty = sleb->endpt - offs - used; *free = c->leb_size - sleb->endpt; @@ -653,12 +656,14 @@ out_dump: * @sqnum: sequence number * @free: amount of free space in bud * @dirty: amount of dirty space from padding and deletion nodes + * @jhead: journal head number for the bud * * This function inserts a reference node to the replay tree and returns zero * in case of success or a negative error code in case of failure. */ static int insert_ref_node(struct ubifs_info *c, int lnum, int offs, - unsigned long long sqnum, int free, int dirty) + unsigned long long sqnum, int free, int dirty, + int jhead) { struct rb_node **p = &c->replay_tree.rb_node, *parent = NULL; struct replay_entry *r; @@ -688,6 +693,7 @@ static int insert_ref_node(struct ubifs_info *c, int lnum, int offs, r->flags = REPLAY_REF; r->free = free; r->dirty = dirty; + r->jhead = jhead; rb_link_node(&r->rb, parent, p); rb_insert_color(&r->rb, &c->replay_tree); @@ -712,7 +718,7 @@ static int replay_buds(struct ubifs_info *c) if (err) return err; err = insert_ref_node(c, b->bud->lnum, b->bud->start, b->sqnum, - free, dirty); + free, dirty, b->bud->jhead); if (err) return err; } -- cgit v1.1 From 94b2c363dcf732a4edab4ed66041cb36e7f28fbf Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Sat, 30 Apr 2011 22:56:20 +0200 Subject: genirq: Fix typo CONFIG_GENIRC_IRQ_SHOW_LEVEL commit ab7798ffcf98b11a9525cf65bacdae3fd58d357f ("genirq: Expand generic show_interrupts()") added the Kconfig option GENERIC_IRQ_SHOW_LEVEL to accomodate PowerPC, but this doesn't actually enable the functionality due to a typo in the #ifdef check. Signed-off-by: Geert Uytterhoeven Cc: Linux/PPC Development Link: http://lkml.kernel.org/r/%3Calpine.DEB.2.00.1104302251370.19068%40ayla.of.borg%3E Signed-off-by: Thomas Gleixner --- kernel/irq/proc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index dd201bd..834899f 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -419,7 +419,7 @@ int show_interrupts(struct seq_file *p, void *v) } else { seq_printf(p, " %8s", "None"); } -#ifdef CONFIG_GENIRC_IRQ_SHOW_LEVEL +#ifdef CONFIG_GENERIC_IRQ_SHOW_LEVEL seq_printf(p, " %-8s", irqd_is_level_type(&desc->irq_data) ? "Level" : "Edge"); #endif if (desc->name) -- cgit v1.1 From a38647837a411f7df79623128421eef2118b5884 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Fri, 29 Apr 2011 11:34:00 -0400 Subject: xen/mmu: Add workaround "x86-64, mm: Put early page table high" As a consequence of the commit: commit 4b239f458c229de044d6905c2b0f9fe16ed9e01e Author: Yinghai Lu Date: Fri Dec 17 16:58:28 2010 -0800 x86-64, mm: Put early page table high it causes the Linux kernel to crash under Xen: mapping kernel into physical memory Xen: setup ISA identity maps about to get started... (XEN) mm.c:2466:d0 Bad type (saw 7400000000000001 != exp 1000000000000000) for mfn b1d89 (pfn bacf7) (XEN) mm.c:3027:d0 Error while pinning mfn b1d89 (XEN) traps.c:481:d0 Unhandled invalid opcode fault/trap [#6] on VCPU 0 [ec=0000] (XEN) domain_crash_sync called from entry.S (XEN) Domain 0 (vcpu#0) crashed on cpu#0: ... The reason is that at some point init_memory_mapping is going to reach the pagetable pages area and map those pages too (mapping them as normal memory that falls in the range of addresses passed to init_memory_mapping as argument). Some of those pages are already pagetable pages (they are in the range pgt_buf_start-pgt_buf_end) therefore they are going to be mapped RO and everything is fine. Some of these pages are not pagetable pages yet (they fall in the range pgt_buf_end-pgt_buf_top; for example the page at pgt_buf_end) so they are going to be mapped RW. When these pages become pagetable pages and are hooked into the pagetable, xen will find that the guest has already a RW mapping of them somewhere and fail the operation. The reason Xen requires pagetables to be RO is that the hypervisor needs to verify that the pagetables are valid before using them. The validation operations are called "pinning" (more details in arch/x86/xen/mmu.c). In order to fix the issue we mark all the pages in the entire range pgt_buf_start-pgt_buf_top as RO, however when the pagetable allocation is completed only the range pgt_buf_start-pgt_buf_end is reserved by init_memory_mapping. Hence the kernel is going to crash as soon as one of the pages in the range pgt_buf_end-pgt_buf_top is reused (b/c those ranges are RO). For this reason, this function is introduced which is called _after_ the init_memory_mapping has completed (in a perfect world we would call this function from init_memory_mapping, but lets ignore that). Because we are called _after_ init_memory_mapping the pgt_buf_[start, end,top] have all changed to new values (b/c another init_memory_mapping is called). Hence, the first time we enter this function, we save away the pgt_buf_start value and update the pgt_buf_[end,top]. When we detect that the "old" pgt_buf_start through pgt_buf_end PFNs have been reserved (so memblock_x86_reserve_range has been called), we immediately set out to RW the "old" pgt_buf_end through pgt_buf_top. And then we update those "old" pgt_buf_[end|top] with the new ones so that we can redo this on the next pagetable. Acked-by: "H. Peter Anvin" Reviewed-by: Jeremy Fitzhardinge [v1: Updated with Jeremy's comments] [v2: Added the crash output] Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/mmu.c | 123 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 123 insertions(+) diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index aef7af9..1bca25f 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1463,6 +1463,119 @@ static int xen_pgd_alloc(struct mm_struct *mm) return ret; } +#ifdef CONFIG_X86_64 +static __initdata u64 __last_pgt_set_rw = 0; +static __initdata u64 __pgt_buf_start = 0; +static __initdata u64 __pgt_buf_end = 0; +static __initdata u64 __pgt_buf_top = 0; +/* + * As a consequence of the commit: + * + * commit 4b239f458c229de044d6905c2b0f9fe16ed9e01e + * Author: Yinghai Lu + * Date: Fri Dec 17 16:58:28 2010 -0800 + * + * x86-64, mm: Put early page table high + * + * at some point init_memory_mapping is going to reach the pagetable pages + * area and map those pages too (mapping them as normal memory that falls + * in the range of addresses passed to init_memory_mapping as argument). + * Some of those pages are already pagetable pages (they are in the range + * pgt_buf_start-pgt_buf_end) therefore they are going to be mapped RO and + * everything is fine. + * Some of these pages are not pagetable pages yet (they fall in the range + * pgt_buf_end-pgt_buf_top; for example the page at pgt_buf_end) so they + * are going to be mapped RW. When these pages become pagetable pages and + * are hooked into the pagetable, xen will find that the guest has already + * a RW mapping of them somewhere and fail the operation. + * The reason Xen requires pagetables to be RO is that the hypervisor needs + * to verify that the pagetables are valid before using them. The validation + * operations are called "pinning". + * + * In order to fix the issue we mark all the pages in the entire range + * pgt_buf_start-pgt_buf_top as RO, however when the pagetable allocation + * is completed only the range pgt_buf_start-pgt_buf_end is reserved by + * init_memory_mapping. Hence the kernel is going to crash as soon as one + * of the pages in the range pgt_buf_end-pgt_buf_top is reused (b/c those + * ranges are RO). + * + * For this reason, 'mark_rw_past_pgt' is introduced which is called _after_ + * the init_memory_mapping has completed (in a perfect world we would + * call this function from init_memory_mapping, but lets ignore that). + * + * Because we are called _after_ init_memory_mapping the pgt_buf_[start, + * end,top] have all changed to new values (b/c init_memory_mapping + * is called and setting up another new page-table). Hence, the first time + * we enter this function, we save away the pgt_buf_start value and update + * the pgt_buf_[end,top]. + * + * When we detect that the "old" pgt_buf_start through pgt_buf_end + * PFNs have been reserved (so memblock_x86_reserve_range has been called), + * we immediately set out to RW the "old" pgt_buf_end through pgt_buf_top. + * + * And then we update those "old" pgt_buf_[end|top] with the new ones + * so that we can redo this on the next pagetable. + */ +static __init void mark_rw_past_pgt(void) { + + if (pgt_buf_end > pgt_buf_start) { + u64 addr, size; + + /* Save it away. */ + if (!__pgt_buf_start) { + __pgt_buf_start = pgt_buf_start; + __pgt_buf_end = pgt_buf_end; + __pgt_buf_top = pgt_buf_top; + return; + } + /* If we get the range that starts at __pgt_buf_end that means + * the range is reserved, and that in 'init_memory_mapping' + * the 'memblock_x86_reserve_range' has been called with the + * outdated __pgt_buf_start, __pgt_buf_end (the "new" + * pgt_buf_[start|end|top] refer now to a new pagetable. + * Note: we are called _after_ the pgt_buf_[..] have been + * updated.*/ + + addr = memblock_x86_find_in_range_size(PFN_PHYS(__pgt_buf_start), + &size, PAGE_SIZE); + + /* Still not reserved, meaning 'memblock_x86_reserve_range' + * hasn't been called yet. Update the _end and _top.*/ + if (addr == PFN_PHYS(__pgt_buf_start)) { + __pgt_buf_end = pgt_buf_end; + __pgt_buf_top = pgt_buf_top; + return; + } + + /* OK, the area is reserved, meaning it is time for us to + * set RW for the old end->top PFNs. */ + + /* ..unless we had already done this. */ + if (__pgt_buf_end == __last_pgt_set_rw) + return; + + addr = PFN_PHYS(__pgt_buf_end); + + /* set as RW the rest */ + printk(KERN_DEBUG "xen: setting RW the range %llx - %llx\n", + PFN_PHYS(__pgt_buf_end), PFN_PHYS(__pgt_buf_top)); + + while (addr < PFN_PHYS(__pgt_buf_top)) { + make_lowmem_page_readwrite(__va(addr)); + addr += PAGE_SIZE; + } + /* And update everything so that we are ready for the next + * pagetable (the one created for regions past 4GB) */ + __last_pgt_set_rw = __pgt_buf_end; + __pgt_buf_start = pgt_buf_start; + __pgt_buf_end = pgt_buf_end; + __pgt_buf_top = pgt_buf_top; + } + return; +} +#else +static __init void mark_rw_past_pgt(void) { } +#endif static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd) { #ifdef CONFIG_X86_64 @@ -1489,6 +1602,14 @@ static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte) unsigned long pfn = pte_pfn(pte); /* + * A bit of optimization. We do not need to call the workaround + * when xen_set_pte_init is called with a PTE with 0 as PFN. + * That is b/c the pagetable at that point are just being populated + * with empty values and we can save some cycles by not calling + * the 'memblock' code.*/ + if (pfn) + mark_rw_past_pgt(); + /* * If the new pfn is within the range of the newly allocated * kernel pagetable, and it isn't being mapped into an * early_ioremap fixmap slot as a freshly allocated page, make sure @@ -1997,6 +2118,8 @@ __init void xen_ident_map_ISA(void) static __init void xen_post_allocator_init(void) { + mark_rw_past_pgt(); + #ifdef CONFIG_XEN_DEBUG pv_mmu_ops.make_pte = PV_CALLEE_SAVE(xen_make_pte_debug); #endif -- cgit v1.1 From b9269dc7bfdf8c985971c09f2dcb2aa04ad7986d Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Tue, 12 Apr 2011 12:19:49 +0100 Subject: xen: mask_rw_pte mark RO all pagetable pages up to pgt_buf_top mask_rw_pte is currently checking if a pfn is a pagetable page if it falls in the range pgt_buf_start - pgt_buf_end but that is incorrect because pgt_buf_end is a moving target: pgt_buf_top is the real boundary. Acked-by: "H. Peter Anvin" Signed-off-by: Stefano Stabellini Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 1bca25f..55c965b 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1616,7 +1616,7 @@ static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte) * it is RO. */ if (((!is_early_ioremap_ptep(ptep) && - pfn >= pgt_buf_start && pfn < pgt_buf_end)) || + pfn >= pgt_buf_start && pfn < pgt_buf_top)) || (is_early_ioremap_ptep(ptep) && pfn != (pgt_buf_end - 1))) pte = pte_wrprotect(pte); -- cgit v1.1 From 2e053a27d9d5ad5e0831e002cbf8043836fb2060 Mon Sep 17 00:00:00 2001 From: "B.J. Buchalter" Date: Mon, 2 May 2011 13:33:42 -0400 Subject: firewire: Fix for broken configrom updates in quick succession Current implementation of ohci_set_config_rom() uses a deferred bus reset via fw_schedule_bus_reset(). If clients add multiple unit descriptors to the config_rom in quick succession, the deferred bus reset may not have fired before succeeding update requests have come in. This can lead to an incorrect partial update of the config_rom for both addition and removal of config_rom descriptors, as the ohci_set_config_rom() routine will return -EBUSY if a previous pending update has not been completed yet; the requested update just gets dropped on the floor. This patch recognizes that the "in-flight" update can be modified until it has been processed by the bus-reset, and the locking in the bus_reset_tasklet ensures that the update is done atomically with respect to modifications made by ohci_set_config_rom(). The -EBUSY error case is simply removed. [Stefan R: The bug always existed at least theoretically. But it became easy to trigger since 2.6.36 commit 02d37bed188c "firewire: core: integrate software-forced bus resets with bus management" which introduced long mandatory delays between janitorial bus resets.] Signed-off-by: Benjamin Buchalter Signed-off-by: Stefan Richter (trivial style changes) Cc: # 2.6.36.y and newer --- drivers/firewire/ohci.c | 39 +++++++++++++++++++++++++-------------- 1 file changed, 25 insertions(+), 14 deletions(-) diff --git a/drivers/firewire/ohci.c b/drivers/firewire/ohci.c index f903d7b6..23d1468 100644 --- a/drivers/firewire/ohci.c +++ b/drivers/firewire/ohci.c @@ -2199,7 +2199,6 @@ static int ohci_set_config_rom(struct fw_card *card, { struct fw_ohci *ohci; unsigned long flags; - int ret = -EBUSY; __be32 *next_config_rom; dma_addr_t uninitialized_var(next_config_rom_bus); @@ -2240,22 +2239,37 @@ static int ohci_set_config_rom(struct fw_card *card, spin_lock_irqsave(&ohci->lock, flags); + /* + * If there is not an already pending config_rom update, + * push our new allocation into the ohci->next_config_rom + * and then mark the local variable as null so that we + * won't deallocate the new buffer. + * + * OTOH, if there is a pending config_rom update, just + * use that buffer with the new config_rom data, and + * let this routine free the unused DMA allocation. + */ + if (ohci->next_config_rom == NULL) { ohci->next_config_rom = next_config_rom; ohci->next_config_rom_bus = next_config_rom_bus; + next_config_rom = NULL; + } - copy_config_rom(ohci->next_config_rom, config_rom, length); + copy_config_rom(ohci->next_config_rom, config_rom, length); - ohci->next_header = config_rom[0]; - ohci->next_config_rom[0] = 0; + ohci->next_header = config_rom[0]; + ohci->next_config_rom[0] = 0; - reg_write(ohci, OHCI1394_ConfigROMmap, - ohci->next_config_rom_bus); - ret = 0; - } + reg_write(ohci, OHCI1394_ConfigROMmap, ohci->next_config_rom_bus); spin_unlock_irqrestore(&ohci->lock, flags); + /* If we didn't use the DMA allocation, delete it. */ + if (next_config_rom != NULL) + dma_free_coherent(ohci->card.device, CONFIG_ROM_SIZE, + next_config_rom, next_config_rom_bus); + /* * Now initiate a bus reset to have the changes take * effect. We clean up the old config rom memory and DMA @@ -2263,13 +2277,10 @@ static int ohci_set_config_rom(struct fw_card *card, * controller could need to access it before the bus reset * takes effect. */ - if (ret == 0) - fw_schedule_bus_reset(&ohci->card, true, true); - else - dma_free_coherent(ohci->card.device, CONFIG_ROM_SIZE, - next_config_rom, next_config_rom_bus); - return ret; + fw_schedule_bus_reset(&ohci->card, true, true); + + return 0; } static void ohci_send_request(struct fw_card *card, struct fw_packet *packet) -- cgit v1.1 From 983960b159a75621855283030d92a80bea92e071 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Mon, 2 May 2011 09:59:29 +0000 Subject: amd8111e: trivial typo spelling: Negotitate -> Negotiate Signed-off-by: Joe Perches Signed-off-by: David S. Miller --- drivers/net/amd8111e.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/amd8111e.c b/drivers/net/amd8111e.c index 88495c4..241b185 100644 --- a/drivers/net/amd8111e.c +++ b/drivers/net/amd8111e.c @@ -106,7 +106,7 @@ MODULE_DESCRIPTION ("AMD8111 based 10/100 Ethernet Controller. Driver Version "M MODULE_LICENSE("GPL"); MODULE_DEVICE_TABLE(pci, amd8111e_pci_tbl); module_param_array(speed_duplex, int, NULL, 0); -MODULE_PARM_DESC(speed_duplex, "Set device speed and duplex modes, 0: Auto Negotitate, 1: 10Mbps Half Duplex, 2: 10Mbps Full Duplex, 3: 100Mbps Half Duplex, 4: 100Mbps Full Duplex"); +MODULE_PARM_DESC(speed_duplex, "Set device speed and duplex modes, 0: Auto Negotiate, 1: 10Mbps Half Duplex, 2: 10Mbps Full Duplex, 3: 100Mbps Half Duplex, 4: 100Mbps Full Duplex"); module_param_array(coalesce, bool, NULL, 0); MODULE_PARM_DESC(coalesce, "Enable or Disable interrupt coalescing, 1: Enable, 0: Disable"); module_param_array(dynamic_ipg, bool, NULL, 0); -- cgit v1.1 From 7806a49ab625ebeb1709e5e87299b64932b807a7 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 2 May 2011 14:33:24 -0700 Subject: x86, reboot: Fix relocations in reboot_32.S The use of base for %ebx in this file is arbitrary, *except* that we also use it to compute the real-mode segment. Therefore, make it so that r_base really is the true address to which %ebx points. This resolves kernel bugzilla 33302. Reported-and-tested-by: Alexey Zaytsev Signed-off-by: H. Peter Anvin Link: http://lkml.kernel.org/n/tip-08os5wi3yq1no0y4i5m4z7he@git.kernel.org --- arch/x86/kernel/reboot_32.S | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/reboot_32.S b/arch/x86/kernel/reboot_32.S index 29092b3..1d5c46d 100644 --- a/arch/x86/kernel/reboot_32.S +++ b/arch/x86/kernel/reboot_32.S @@ -21,26 +21,26 @@ r_base = . /* Get our own relocated address */ call 1f 1: popl %ebx - subl $1b, %ebx + subl $(1b - r_base), %ebx /* Compute the equivalent real-mode segment */ movl %ebx, %ecx shrl $4, %ecx /* Patch post-real-mode segment jump */ - movw dispatch_table(%ebx,%eax,2),%ax - movw %ax, 101f(%ebx) - movw %cx, 102f(%ebx) + movw (dispatch_table - r_base)(%ebx,%eax,2),%ax + movw %ax, (101f - r_base)(%ebx) + movw %cx, (102f - r_base)(%ebx) /* Set up the IDT for real mode. */ - lidtl machine_real_restart_idt(%ebx) + lidtl (machine_real_restart_idt - r_base)(%ebx) /* * Set up a GDT from which we can load segment descriptors for real * mode. The GDT is not used in real mode; it is just needed here to * prepare the descriptors. */ - lgdtl machine_real_restart_gdt(%ebx) + lgdtl (machine_real_restart_gdt - r_base)(%ebx) /* * Load the data segment registers with 16-bit compatible values -- cgit v1.1 From 41c31f318a5209922d051e293c61e4724daad11c Mon Sep 17 00:00:00 2001 From: Lifeng Sun Date: Wed, 27 Apr 2011 22:04:51 +0000 Subject: networking: inappropriate ioctl operation should return ENOTTY ioctl() calls against a socket with an inappropriate ioctl operation are incorrectly returning EINVAL rather than ENOTTY: [ENOTTY] Inappropriate I/O control operation. BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=33992 Signed-off-by: Lifeng Sun Signed-off-by: David S. Miller --- net/core/dev.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index c2ac599..856b6ee 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4773,7 +4773,7 @@ static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cm * is never reached */ WARN_ON(1); - err = -EINVAL; + err = -ENOTTY; break; } @@ -5041,7 +5041,7 @@ int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg) /* Set the per device memory buffer space. * Not applicable in our case */ case SIOCSIFLINK: - return -EINVAL; + return -ENOTTY; /* * Unknown or private ioctl. @@ -5062,7 +5062,7 @@ int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg) /* Take care of Wireless Extensions */ if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) return wext_handle_ioctl(net, &ifr, cmd, arg); - return -EINVAL; + return -ENOTTY; } } -- cgit v1.1 From 6fdbab9d93e04bfe71f2b3fde485d092e2ffe3ec Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 28 Apr 2011 11:02:15 +0000 Subject: tg3: Fix failure to enable WoL by default when possible tg3 is supposed to enable WoL by default on adapters which support that, but it fails to do so unless the adapter's /sys/devices/.../power/wakeup file contains 'enabled' during the initialization of the adapter. Fix that by making tg3 use device_set_wakeup_enable() to enable wakeup automatically whenever WoL should be enabled by default. Signed-off-by: Rafael J. Wysocki Signed-off-by: David S. Miller --- drivers/net/tg3.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c index b8c5f35..7a5daef 100644 --- a/drivers/net/tg3.c +++ b/drivers/net/tg3.c @@ -12327,8 +12327,10 @@ static void __devinit tg3_get_eeprom_hw_cfg(struct tg3 *tp) if (val & VCPU_CFGSHDW_ASPM_DBNC) tp->tg3_flags |= TG3_FLAG_ASPM_WORKAROUND; if ((val & VCPU_CFGSHDW_WOL_ENABLE) && - (val & VCPU_CFGSHDW_WOL_MAGPKT)) + (val & VCPU_CFGSHDW_WOL_MAGPKT)) { tp->tg3_flags |= TG3_FLAG_WOL_ENABLE; + device_set_wakeup_enable(&tp->pdev->dev, true); + } goto done; } @@ -12461,8 +12463,10 @@ static void __devinit tg3_get_eeprom_hw_cfg(struct tg3 *tp) tp->tg3_flags &= ~TG3_FLAG_WOL_CAP; if ((tp->tg3_flags & TG3_FLAG_WOL_CAP) && - (nic_cfg & NIC_SRAM_DATA_CFG_WOL_ENABLE)) + (nic_cfg & NIC_SRAM_DATA_CFG_WOL_ENABLE)) { tp->tg3_flags |= TG3_FLAG_WOL_ENABLE; + device_set_wakeup_enable(&tp->pdev->dev, true); + } if (cfg2 & (1 << 17)) tp->phy_flags |= TG3_PHYFLG_CAPACITIVE_COUPLING; -- cgit v1.1 From d946092000698fd204d82a9d239103c656fb63bf Mon Sep 17 00:00:00 2001 From: Rabin Vincent Date: Sat, 30 Apr 2011 08:29:27 +0000 Subject: smsc95xx: fix reset check The reset loop check should check the MII_BMCR register value for BMCR_RESET rather than for MII_BMCR (the register address, which also happens to be zero). Signed-off-by: Rabin Vincent Signed-off-by: David S. Miller --- drivers/net/usb/smsc95xx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/usb/smsc95xx.c b/drivers/net/usb/smsc95xx.c index 47a6c87..48d4efd 100644 --- a/drivers/net/usb/smsc95xx.c +++ b/drivers/net/usb/smsc95xx.c @@ -730,7 +730,7 @@ static int smsc95xx_phy_initialize(struct usbnet *dev) msleep(10); bmcr = smsc95xx_mdio_read(dev->net, dev->mii.phy_id, MII_BMCR); timeout++; - } while ((bmcr & MII_BMCR) && (timeout < 100)); + } while ((bmcr & BMCR_RESET) && (timeout < 100)); if (timeout >= 100) { netdev_warn(dev->net, "timeout on PHY Reset"); -- cgit v1.1 From 6c8c44462ac8ac3f95929328f0c56e9e8b6dd524 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Sat, 30 Apr 2011 01:28:17 +0000 Subject: Revert: veth: remove unneeded ifname code from veth_newlink() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 84c49d8c3e4abefb0a41a77b25aa37ebe8d6b743 ("veth: remove unneeded ifname code from veth_newlink()") caused regression on veth creation. This patch reverts the original one. Reported-by: MichaÅ‚ MirosÅ‚aw Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/veth.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/net/veth.c b/drivers/net/veth.c index 2de9b90..3b99f64 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -403,6 +403,17 @@ static int veth_newlink(struct net *src_net, struct net_device *dev, if (tb[IFLA_ADDRESS] == NULL) random_ether_addr(dev->dev_addr); + if (tb[IFLA_IFNAME]) + nla_strlcpy(dev->name, tb[IFLA_IFNAME], IFNAMSIZ); + else + snprintf(dev->name, IFNAMSIZ, DRV_NAME "%%d"); + + if (strchr(dev->name, '%')) { + err = dev_alloc_name(dev, dev->name); + if (err < 0) + goto err_alloc_name; + } + err = register_netdevice(dev); if (err < 0) goto err_register_dev; @@ -422,6 +433,7 @@ static int veth_newlink(struct net *src_net, struct net_device *dev, err_register_dev: /* nothing to do */ +err_alloc_name: err_configure_peer: unregister_netdevice(peer); return err; -- cgit v1.1 From ff538818f4a82c4cf02d2d6bd6ac5c7360b9d41d Mon Sep 17 00:00:00 2001 From: Lucian Adrian Grijincu Date: Sun, 1 May 2011 01:44:01 +0000 Subject: sysctl: net: call unregister_net_sysctl_table where needed ctl_table_headers registered with register_net_sysctl_table should have been unregistered with the equivalent unregister_net_sysctl_table Signed-off-by: Lucian Adrian Grijincu Signed-off-by: David S. Miller --- net/ipv4/devinet.c | 2 +- net/ipv6/addrconf.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 5345b0b..cd9ca08 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1680,7 +1680,7 @@ static void __devinet_sysctl_unregister(struct ipv4_devconf *cnf) return; cnf->sysctl = NULL; - unregister_sysctl_table(t->sysctl_header); + unregister_net_sysctl_table(t->sysctl_header); kfree(t->dev_name); kfree(t); } diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 1493534..a7bda07 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -4537,7 +4537,7 @@ static void __addrconf_sysctl_unregister(struct ipv6_devconf *p) t = p->sysctl; p->sysctl = NULL; - unregister_sysctl_table(t->sysctl_header); + unregister_net_sysctl_table(t->sysctl_header); kfree(t->dev_name); kfree(t); } -- cgit v1.1 From d93da492d9a8840a0bdda88e74df8d0f593f1977 Mon Sep 17 00:00:00 2001 From: Arvid Brodin Date: Tue, 26 Apr 2011 21:46:47 +0200 Subject: usb/isp1760: Report correct urb status after unlink This fixes a bug in my previous (2.6.38) patch series which caused urb->status value to be wrong after unlink (broke usbtest 11, 12). Signed-off-by: Arvid Brodin Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/isp1760-hcd.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/usb/host/isp1760-hcd.c b/drivers/usb/host/isp1760-hcd.c index 795345a..7b2e69a 100644 --- a/drivers/usb/host/isp1760-hcd.c +++ b/drivers/usb/host/isp1760-hcd.c @@ -1633,6 +1633,7 @@ static int isp1760_urb_dequeue(struct usb_hcd *hcd, struct urb *urb, int status) ints[i].qh = NULL; ints[i].qtd = NULL; + urb->status = status; isp1760_urb_done(hcd, urb); if (qtd) pe(hcd, qh, qtd); -- cgit v1.1 From cee6a262550f53a13acfefbc1e3e5ff35c96182c Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Mon, 2 May 2011 14:21:44 -0400 Subject: USB: fix regression in usbip by setting has_tt flag This patch (as1460) fixes a regression in the usbip driver caused by the new check for Transaction Translators in USB-2 hubs. The root hub registered by vhci_hcd needs to have the has_tt flag set, because it can connect to low- and full-speed devices as well as high-speed devices. Signed-off-by: Alan Stern Reported-and-tested-by: Nikola Ciprich CC: Signed-off-by: Greg Kroah-Hartman --- drivers/staging/usbip/vhci_hcd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/staging/usbip/vhci_hcd.c b/drivers/staging/usbip/vhci_hcd.c index 0f02a4b..5d0caa8 100644 --- a/drivers/staging/usbip/vhci_hcd.c +++ b/drivers/staging/usbip/vhci_hcd.c @@ -1139,7 +1139,7 @@ static int vhci_hcd_probe(struct platform_device *pdev) usbip_uerr("create hcd failed\n"); return -ENOMEM; } - + hcd->has_tt = 1; /* this is private data for vhci_hcd */ the_controller = hcd_to_vhci(hcd); -- cgit v1.1 From 146f9f65bd13f56665205aed7205d531c810cb35 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 29 Apr 2011 06:52:43 -0400 Subject: cifs: refactor mid finding loop in cifs_demultiplex_thread ...to reduce the extreme indentation. This should introduce no behavioral changes. Cc: stable@kernel.org Acked-by: David Howells Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/connect.c | 92 +++++++++++++++++++++++++++---------------------------- 1 file changed, 46 insertions(+), 46 deletions(-) diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 8b75a8ec..bfbf323 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -617,59 +617,59 @@ incomplete_rcv: list_for_each_safe(tmp, tmp2, &server->pending_mid_q) { mid_entry = list_entry(tmp, struct mid_q_entry, qhead); - if ((mid_entry->mid == smb_buffer->Mid) && - (mid_entry->midState == MID_REQUEST_SUBMITTED) && - (mid_entry->command == smb_buffer->Command)) { - if (length == 0 && - check2ndT2(smb_buffer, server->maxBuf) > 0) { - /* We have a multipart transact2 resp */ - isMultiRsp = true; - if (mid_entry->resp_buf) { - /* merge response - fix up 1st*/ - if (coalesce_t2(smb_buffer, + if (mid_entry->mid != smb_buffer->Mid || + mid_entry->midState != MID_REQUEST_SUBMITTED || + mid_entry->command != smb_buffer->Command) { + mid_entry = NULL; + continue; + } + + if (length == 0 && + check2ndT2(smb_buffer, server->maxBuf) > 0) { + /* We have a multipart transact2 resp */ + isMultiRsp = true; + if (mid_entry->resp_buf) { + /* merge response - fix up 1st*/ + if (coalesce_t2(smb_buffer, mid_entry->resp_buf)) { - mid_entry->multiRsp = - true; - break; - } else { - /* all parts received */ - mid_entry->multiEnd = - true; - goto multi_t2_fnd; - } + mid_entry->multiRsp = true; + break; } else { - if (!isLargeBuf) { - cERROR(1, "1st trans2 resp needs bigbuf"); - /* BB maybe we can fix this up, switch - to already allocated large buffer? */ - } else { - /* Have first buffer */ - mid_entry->resp_buf = - smb_buffer; - mid_entry->largeBuf = - true; - bigbuf = NULL; - } + /* all parts received */ + mid_entry->multiEnd = true; + goto multi_t2_fnd; + } + } else { + if (!isLargeBuf) { + /* + * FIXME: switch to already + * allocated largebuf? + */ + cERROR(1, "1st trans2 resp " + "needs bigbuf"); + } else { + /* Have first buffer */ + mid_entry->resp_buf = + smb_buffer; + mid_entry->largeBuf = true; + bigbuf = NULL; } - break; } - mid_entry->resp_buf = smb_buffer; - mid_entry->largeBuf = isLargeBuf; + break; + } + mid_entry->resp_buf = smb_buffer; + mid_entry->largeBuf = isLargeBuf; multi_t2_fnd: - if (length == 0) - mid_entry->midState = - MID_RESPONSE_RECEIVED; - else - mid_entry->midState = - MID_RESPONSE_MALFORMED; + if (length == 0) + mid_entry->midState = MID_RESPONSE_RECEIVED; + else + mid_entry->midState = MID_RESPONSE_MALFORMED; #ifdef CONFIG_CIFS_STATS2 - mid_entry->when_received = jiffies; + mid_entry->when_received = jiffies; #endif - list_del_init(&mid_entry->qhead); - mid_entry->callback(mid_entry); - break; - } - mid_entry = NULL; + list_del_init(&mid_entry->qhead); + mid_entry->callback(mid_entry); + break; } spin_unlock(&GlobalMid_Lock); -- cgit v1.1 From 16541ba11c4f04ffe94b073e301f00b749fb84a1 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 29 Apr 2011 06:52:44 -0400 Subject: cifs: handle errors from coalesce_t2 cifs_demultiplex_thread calls coalesce_t2 to try and merge follow-on t2 responses into the original mid buffer. coalesce_t2 however can return errors, but the caller doesn't handle that situation properly. Fix the thread to treat such a case as it would a malformed packet. Mark the mid as being malformed and issue the callback. Cc: stable@kernel.org Acked-by: David Howells Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/connect.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index bfbf323..05f1dcf 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -630,12 +630,16 @@ incomplete_rcv: isMultiRsp = true; if (mid_entry->resp_buf) { /* merge response - fix up 1st*/ - if (coalesce_t2(smb_buffer, - mid_entry->resp_buf)) { + length = coalesce_t2(smb_buffer, + mid_entry->resp_buf); + if (length > 0) { + length = 0; mid_entry->multiRsp = true; break; } else { - /* all parts received */ + /* all parts received or + * packet is malformed + */ mid_entry->multiEnd = true; goto multi_t2_fnd; } -- cgit v1.1 From fae85b7c8bcc7de9c0a2698587e20c15beb7d5a6 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 26 Oct 2010 20:24:03 +0200 Subject: perf: Start the restructuring mv kernel/perf_event.c -> kernel/events/core.c. From there, all further sensible splitting can happen. The idea is that due to perf_event.c becoming pretty sizable and with the advent of the marriage with ftrace, splitting functionality into its logical parts should help speeding up the unification and to manage the complexity of the subsystem. Signed-off-by: Borislav Petkov --- kernel/Makefile | 5 +- kernel/events/Makefile | 5 + kernel/events/core.c | 7455 ++++++++++++++++++++++++++++++++++++++++++++++++ kernel/perf_event.c | 7455 ------------------------------------------------ 4 files changed, 7463 insertions(+), 7457 deletions(-) create mode 100644 kernel/events/Makefile create mode 100644 kernel/events/core.c delete mode 100644 kernel/perf_event.c diff --git a/kernel/Makefile b/kernel/Makefile index 85cbfb3..7981530 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -21,7 +21,6 @@ CFLAGS_REMOVE_mutex-debug.o = -pg CFLAGS_REMOVE_rtmutex-debug.o = -pg CFLAGS_REMOVE_cgroup-debug.o = -pg CFLAGS_REMOVE_sched_clock.o = -pg -CFLAGS_REMOVE_perf_event.o = -pg CFLAGS_REMOVE_irq_work.o = -pg endif @@ -103,7 +102,9 @@ obj-$(CONFIG_RING_BUFFER) += trace/ obj-$(CONFIG_TRACEPOINTS) += trace/ obj-$(CONFIG_SMP) += sched_cpupri.o obj-$(CONFIG_IRQ_WORK) += irq_work.o -obj-$(CONFIG_PERF_EVENTS) += perf_event.o + +obj-$(CONFIG_PERF_EVENTS) += events/ + obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o obj-$(CONFIG_PADATA) += padata.o diff --git a/kernel/events/Makefile b/kernel/events/Makefile new file mode 100644 index 0000000..26c00e4 --- /dev/null +++ b/kernel/events/Makefile @@ -0,0 +1,5 @@ +ifdef CONFIG_FUNCTION_TRACER +CFLAGS_REMOVE_core.o = -pg +endif + +obj-y += core.o diff --git a/kernel/events/core.c b/kernel/events/core.c new file mode 100644 index 0000000..440bc48 --- /dev/null +++ b/kernel/events/core.c @@ -0,0 +1,7455 @@ +/* + * Performance events core code: + * + * Copyright (C) 2008 Thomas Gleixner + * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar + * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra + * Copyright © 2009 Paul Mackerras, IBM Corp. + * + * For licensing details see kernel-base/COPYING + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +struct remote_function_call { + struct task_struct *p; + int (*func)(void *info); + void *info; + int ret; +}; + +static void remote_function(void *data) +{ + struct remote_function_call *tfc = data; + struct task_struct *p = tfc->p; + + if (p) { + tfc->ret = -EAGAIN; + if (task_cpu(p) != smp_processor_id() || !task_curr(p)) + return; + } + + tfc->ret = tfc->func(tfc->info); +} + +/** + * task_function_call - call a function on the cpu on which a task runs + * @p: the task to evaluate + * @func: the function to be called + * @info: the function call argument + * + * Calls the function @func when the task is currently running. This might + * be on the current CPU, which just calls the function directly + * + * returns: @func return value, or + * -ESRCH - when the process isn't running + * -EAGAIN - when the process moved away + */ +static int +task_function_call(struct task_struct *p, int (*func) (void *info), void *info) +{ + struct remote_function_call data = { + .p = p, + .func = func, + .info = info, + .ret = -ESRCH, /* No such (running) process */ + }; + + if (task_curr(p)) + smp_call_function_single(task_cpu(p), remote_function, &data, 1); + + return data.ret; +} + +/** + * cpu_function_call - call a function on the cpu + * @func: the function to be called + * @info: the function call argument + * + * Calls the function @func on the remote cpu. + * + * returns: @func return value or -ENXIO when the cpu is offline + */ +static int cpu_function_call(int cpu, int (*func) (void *info), void *info) +{ + struct remote_function_call data = { + .p = NULL, + .func = func, + .info = info, + .ret = -ENXIO, /* No such CPU */ + }; + + smp_call_function_single(cpu, remote_function, &data, 1); + + return data.ret; +} + +#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\ + PERF_FLAG_FD_OUTPUT |\ + PERF_FLAG_PID_CGROUP) + +enum event_type_t { + EVENT_FLEXIBLE = 0x1, + EVENT_PINNED = 0x2, + EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, +}; + +/* + * perf_sched_events : >0 events exist + * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu + */ +struct jump_label_key perf_sched_events __read_mostly; +static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); + +static atomic_t nr_mmap_events __read_mostly; +static atomic_t nr_comm_events __read_mostly; +static atomic_t nr_task_events __read_mostly; + +static LIST_HEAD(pmus); +static DEFINE_MUTEX(pmus_lock); +static struct srcu_struct pmus_srcu; + +/* + * perf event paranoia level: + * -1 - not paranoid at all + * 0 - disallow raw tracepoint access for unpriv + * 1 - disallow cpu events for unpriv + * 2 - disallow kernel profiling for unpriv + */ +int sysctl_perf_event_paranoid __read_mostly = 1; + +/* Minimum for 512 kiB + 1 user control page */ +int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */ + +/* + * max perf event sample rate + */ +#define DEFAULT_MAX_SAMPLE_RATE 100000 +int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE; +static int max_samples_per_tick __read_mostly = + DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ); + +int perf_proc_update_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret = proc_dointvec(table, write, buffer, lenp, ppos); + + if (ret || !write) + return ret; + + max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ); + + return 0; +} + +static atomic64_t perf_event_id; + +static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, + enum event_type_t event_type); + +static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, + enum event_type_t event_type, + struct task_struct *task); + +static void update_context_time(struct perf_event_context *ctx); +static u64 perf_event_time(struct perf_event *event); + +void __weak perf_event_print_debug(void) { } + +extern __weak const char *perf_pmu_name(void) +{ + return "pmu"; +} + +static inline u64 perf_clock(void) +{ + return local_clock(); +} + +static inline struct perf_cpu_context * +__get_cpu_context(struct perf_event_context *ctx) +{ + return this_cpu_ptr(ctx->pmu->pmu_cpu_context); +} + +#ifdef CONFIG_CGROUP_PERF + +/* + * Must ensure cgroup is pinned (css_get) before calling + * this function. In other words, we cannot call this function + * if there is no cgroup event for the current CPU context. + */ +static inline struct perf_cgroup * +perf_cgroup_from_task(struct task_struct *task) +{ + return container_of(task_subsys_state(task, perf_subsys_id), + struct perf_cgroup, css); +} + +static inline bool +perf_cgroup_match(struct perf_event *event) +{ + struct perf_event_context *ctx = event->ctx; + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + + return !event->cgrp || event->cgrp == cpuctx->cgrp; +} + +static inline void perf_get_cgroup(struct perf_event *event) +{ + css_get(&event->cgrp->css); +} + +static inline void perf_put_cgroup(struct perf_event *event) +{ + css_put(&event->cgrp->css); +} + +static inline void perf_detach_cgroup(struct perf_event *event) +{ + perf_put_cgroup(event); + event->cgrp = NULL; +} + +static inline int is_cgroup_event(struct perf_event *event) +{ + return event->cgrp != NULL; +} + +static inline u64 perf_cgroup_event_time(struct perf_event *event) +{ + struct perf_cgroup_info *t; + + t = per_cpu_ptr(event->cgrp->info, event->cpu); + return t->time; +} + +static inline void __update_cgrp_time(struct perf_cgroup *cgrp) +{ + struct perf_cgroup_info *info; + u64 now; + + now = perf_clock(); + + info = this_cpu_ptr(cgrp->info); + + info->time += now - info->timestamp; + info->timestamp = now; +} + +static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx) +{ + struct perf_cgroup *cgrp_out = cpuctx->cgrp; + if (cgrp_out) + __update_cgrp_time(cgrp_out); +} + +static inline void update_cgrp_time_from_event(struct perf_event *event) +{ + struct perf_cgroup *cgrp; + + /* + * ensure we access cgroup data only when needed and + * when we know the cgroup is pinned (css_get) + */ + if (!is_cgroup_event(event)) + return; + + cgrp = perf_cgroup_from_task(current); + /* + * Do not update time when cgroup is not active + */ + if (cgrp == event->cgrp) + __update_cgrp_time(event->cgrp); +} + +static inline void +perf_cgroup_set_timestamp(struct task_struct *task, + struct perf_event_context *ctx) +{ + struct perf_cgroup *cgrp; + struct perf_cgroup_info *info; + + /* + * ctx->lock held by caller + * ensure we do not access cgroup data + * unless we have the cgroup pinned (css_get) + */ + if (!task || !ctx->nr_cgroups) + return; + + cgrp = perf_cgroup_from_task(task); + info = this_cpu_ptr(cgrp->info); + info->timestamp = ctx->timestamp; +} + +#define PERF_CGROUP_SWOUT 0x1 /* cgroup switch out every event */ +#define PERF_CGROUP_SWIN 0x2 /* cgroup switch in events based on task */ + +/* + * reschedule events based on the cgroup constraint of task. + * + * mode SWOUT : schedule out everything + * mode SWIN : schedule in based on cgroup for next + */ +void perf_cgroup_switch(struct task_struct *task, int mode) +{ + struct perf_cpu_context *cpuctx; + struct pmu *pmu; + unsigned long flags; + + /* + * disable interrupts to avoid geting nr_cgroup + * changes via __perf_event_disable(). Also + * avoids preemption. + */ + local_irq_save(flags); + + /* + * we reschedule only in the presence of cgroup + * constrained events. + */ + rcu_read_lock(); + + list_for_each_entry_rcu(pmu, &pmus, entry) { + + cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); + + perf_pmu_disable(cpuctx->ctx.pmu); + + /* + * perf_cgroup_events says at least one + * context on this CPU has cgroup events. + * + * ctx->nr_cgroups reports the number of cgroup + * events for a context. + */ + if (cpuctx->ctx.nr_cgroups > 0) { + + if (mode & PERF_CGROUP_SWOUT) { + cpu_ctx_sched_out(cpuctx, EVENT_ALL); + /* + * must not be done before ctxswout due + * to event_filter_match() in event_sched_out() + */ + cpuctx->cgrp = NULL; + } + + if (mode & PERF_CGROUP_SWIN) { + WARN_ON_ONCE(cpuctx->cgrp); + /* set cgrp before ctxsw in to + * allow event_filter_match() to not + * have to pass task around + */ + cpuctx->cgrp = perf_cgroup_from_task(task); + cpu_ctx_sched_in(cpuctx, EVENT_ALL, task); + } + } + + perf_pmu_enable(cpuctx->ctx.pmu); + } + + rcu_read_unlock(); + + local_irq_restore(flags); +} + +static inline void perf_cgroup_sched_out(struct task_struct *task) +{ + perf_cgroup_switch(task, PERF_CGROUP_SWOUT); +} + +static inline void perf_cgroup_sched_in(struct task_struct *task) +{ + perf_cgroup_switch(task, PERF_CGROUP_SWIN); +} + +static inline int perf_cgroup_connect(int fd, struct perf_event *event, + struct perf_event_attr *attr, + struct perf_event *group_leader) +{ + struct perf_cgroup *cgrp; + struct cgroup_subsys_state *css; + struct file *file; + int ret = 0, fput_needed; + + file = fget_light(fd, &fput_needed); + if (!file) + return -EBADF; + + css = cgroup_css_from_dir(file, perf_subsys_id); + if (IS_ERR(css)) { + ret = PTR_ERR(css); + goto out; + } + + cgrp = container_of(css, struct perf_cgroup, css); + event->cgrp = cgrp; + + /* must be done before we fput() the file */ + perf_get_cgroup(event); + + /* + * all events in a group must monitor + * the same cgroup because a task belongs + * to only one perf cgroup at a time + */ + if (group_leader && group_leader->cgrp != cgrp) { + perf_detach_cgroup(event); + ret = -EINVAL; + } +out: + fput_light(file, fput_needed); + return ret; +} + +static inline void +perf_cgroup_set_shadow_time(struct perf_event *event, u64 now) +{ + struct perf_cgroup_info *t; + t = per_cpu_ptr(event->cgrp->info, event->cpu); + event->shadow_ctx_time = now - t->timestamp; +} + +static inline void +perf_cgroup_defer_enabled(struct perf_event *event) +{ + /* + * when the current task's perf cgroup does not match + * the event's, we need to remember to call the + * perf_mark_enable() function the first time a task with + * a matching perf cgroup is scheduled in. + */ + if (is_cgroup_event(event) && !perf_cgroup_match(event)) + event->cgrp_defer_enabled = 1; +} + +static inline void +perf_cgroup_mark_enabled(struct perf_event *event, + struct perf_event_context *ctx) +{ + struct perf_event *sub; + u64 tstamp = perf_event_time(event); + + if (!event->cgrp_defer_enabled) + return; + + event->cgrp_defer_enabled = 0; + + event->tstamp_enabled = tstamp - event->total_time_enabled; + list_for_each_entry(sub, &event->sibling_list, group_entry) { + if (sub->state >= PERF_EVENT_STATE_INACTIVE) { + sub->tstamp_enabled = tstamp - sub->total_time_enabled; + sub->cgrp_defer_enabled = 0; + } + } +} +#else /* !CONFIG_CGROUP_PERF */ + +static inline bool +perf_cgroup_match(struct perf_event *event) +{ + return true; +} + +static inline void perf_detach_cgroup(struct perf_event *event) +{} + +static inline int is_cgroup_event(struct perf_event *event) +{ + return 0; +} + +static inline u64 perf_cgroup_event_cgrp_time(struct perf_event *event) +{ + return 0; +} + +static inline void update_cgrp_time_from_event(struct perf_event *event) +{ +} + +static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx) +{ +} + +static inline void perf_cgroup_sched_out(struct task_struct *task) +{ +} + +static inline void perf_cgroup_sched_in(struct task_struct *task) +{ +} + +static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event, + struct perf_event_attr *attr, + struct perf_event *group_leader) +{ + return -EINVAL; +} + +static inline void +perf_cgroup_set_timestamp(struct task_struct *task, + struct perf_event_context *ctx) +{ +} + +void +perf_cgroup_switch(struct task_struct *task, struct task_struct *next) +{ +} + +static inline void +perf_cgroup_set_shadow_time(struct perf_event *event, u64 now) +{ +} + +static inline u64 perf_cgroup_event_time(struct perf_event *event) +{ + return 0; +} + +static inline void +perf_cgroup_defer_enabled(struct perf_event *event) +{ +} + +static inline void +perf_cgroup_mark_enabled(struct perf_event *event, + struct perf_event_context *ctx) +{ +} +#endif + +void perf_pmu_disable(struct pmu *pmu) +{ + int *count = this_cpu_ptr(pmu->pmu_disable_count); + if (!(*count)++) + pmu->pmu_disable(pmu); +} + +void perf_pmu_enable(struct pmu *pmu) +{ + int *count = this_cpu_ptr(pmu->pmu_disable_count); + if (!--(*count)) + pmu->pmu_enable(pmu); +} + +static DEFINE_PER_CPU(struct list_head, rotation_list); + +/* + * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized + * because they're strictly cpu affine and rotate_start is called with IRQs + * disabled, while rotate_context is called from IRQ context. + */ +static void perf_pmu_rotate_start(struct pmu *pmu) +{ + struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); + struct list_head *head = &__get_cpu_var(rotation_list); + + WARN_ON(!irqs_disabled()); + + if (list_empty(&cpuctx->rotation_list)) + list_add(&cpuctx->rotation_list, head); +} + +static void get_ctx(struct perf_event_context *ctx) +{ + WARN_ON(!atomic_inc_not_zero(&ctx->refcount)); +} + +static void free_ctx(struct rcu_head *head) +{ + struct perf_event_context *ctx; + + ctx = container_of(head, struct perf_event_context, rcu_head); + kfree(ctx); +} + +static void put_ctx(struct perf_event_context *ctx) +{ + if (atomic_dec_and_test(&ctx->refcount)) { + if (ctx->parent_ctx) + put_ctx(ctx->parent_ctx); + if (ctx->task) + put_task_struct(ctx->task); + call_rcu(&ctx->rcu_head, free_ctx); + } +} + +static void unclone_ctx(struct perf_event_context *ctx) +{ + if (ctx->parent_ctx) { + put_ctx(ctx->parent_ctx); + ctx->parent_ctx = NULL; + } +} + +static u32 perf_event_pid(struct perf_event *event, struct task_struct *p) +{ + /* + * only top level events have the pid namespace they were created in + */ + if (event->parent) + event = event->parent; + + return task_tgid_nr_ns(p, event->ns); +} + +static u32 perf_event_tid(struct perf_event *event, struct task_struct *p) +{ + /* + * only top level events have the pid namespace they were created in + */ + if (event->parent) + event = event->parent; + + return task_pid_nr_ns(p, event->ns); +} + +/* + * If we inherit events we want to return the parent event id + * to userspace. + */ +static u64 primary_event_id(struct perf_event *event) +{ + u64 id = event->id; + + if (event->parent) + id = event->parent->id; + + return id; +} + +/* + * Get the perf_event_context for a task and lock it. + * This has to cope with with the fact that until it is locked, + * the context could get moved to another task. + */ +static struct perf_event_context * +perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags) +{ + struct perf_event_context *ctx; + + rcu_read_lock(); +retry: + ctx = rcu_dereference(task->perf_event_ctxp[ctxn]); + if (ctx) { + /* + * If this context is a clone of another, it might + * get swapped for another underneath us by + * perf_event_task_sched_out, though the + * rcu_read_lock() protects us from any context + * getting freed. Lock the context and check if it + * got swapped before we could get the lock, and retry + * if so. If we locked the right context, then it + * can't get swapped on us any more. + */ + raw_spin_lock_irqsave(&ctx->lock, *flags); + if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) { + raw_spin_unlock_irqrestore(&ctx->lock, *flags); + goto retry; + } + + if (!atomic_inc_not_zero(&ctx->refcount)) { + raw_spin_unlock_irqrestore(&ctx->lock, *flags); + ctx = NULL; + } + } + rcu_read_unlock(); + return ctx; +} + +/* + * Get the context for a task and increment its pin_count so it + * can't get swapped to another task. This also increments its + * reference count so that the context can't get freed. + */ +static struct perf_event_context * +perf_pin_task_context(struct task_struct *task, int ctxn) +{ + struct perf_event_context *ctx; + unsigned long flags; + + ctx = perf_lock_task_context(task, ctxn, &flags); + if (ctx) { + ++ctx->pin_count; + raw_spin_unlock_irqrestore(&ctx->lock, flags); + } + return ctx; +} + +static void perf_unpin_context(struct perf_event_context *ctx) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&ctx->lock, flags); + --ctx->pin_count; + raw_spin_unlock_irqrestore(&ctx->lock, flags); +} + +/* + * Update the record of the current time in a context. + */ +static void update_context_time(struct perf_event_context *ctx) +{ + u64 now = perf_clock(); + + ctx->time += now - ctx->timestamp; + ctx->timestamp = now; +} + +static u64 perf_event_time(struct perf_event *event) +{ + struct perf_event_context *ctx = event->ctx; + + if (is_cgroup_event(event)) + return perf_cgroup_event_time(event); + + return ctx ? ctx->time : 0; +} + +/* + * Update the total_time_enabled and total_time_running fields for a event. + */ +static void update_event_times(struct perf_event *event) +{ + struct perf_event_context *ctx = event->ctx; + u64 run_end; + + if (event->state < PERF_EVENT_STATE_INACTIVE || + event->group_leader->state < PERF_EVENT_STATE_INACTIVE) + return; + /* + * in cgroup mode, time_enabled represents + * the time the event was enabled AND active + * tasks were in the monitored cgroup. This is + * independent of the activity of the context as + * there may be a mix of cgroup and non-cgroup events. + * + * That is why we treat cgroup events differently + * here. + */ + if (is_cgroup_event(event)) + run_end = perf_event_time(event); + else if (ctx->is_active) + run_end = ctx->time; + else + run_end = event->tstamp_stopped; + + event->total_time_enabled = run_end - event->tstamp_enabled; + + if (event->state == PERF_EVENT_STATE_INACTIVE) + run_end = event->tstamp_stopped; + else + run_end = perf_event_time(event); + + event->total_time_running = run_end - event->tstamp_running; + +} + +/* + * Update total_time_enabled and total_time_running for all events in a group. + */ +static void update_group_times(struct perf_event *leader) +{ + struct perf_event *event; + + update_event_times(leader); + list_for_each_entry(event, &leader->sibling_list, group_entry) + update_event_times(event); +} + +static struct list_head * +ctx_group_list(struct perf_event *event, struct perf_event_context *ctx) +{ + if (event->attr.pinned) + return &ctx->pinned_groups; + else + return &ctx->flexible_groups; +} + +/* + * Add a event from the lists for its context. + * Must be called with ctx->mutex and ctx->lock held. + */ +static void +list_add_event(struct perf_event *event, struct perf_event_context *ctx) +{ + WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT); + event->attach_state |= PERF_ATTACH_CONTEXT; + + /* + * If we're a stand alone event or group leader, we go to the context + * list, group events are kept attached to the group so that + * perf_group_detach can, at all times, locate all siblings. + */ + if (event->group_leader == event) { + struct list_head *list; + + if (is_software_event(event)) + event->group_flags |= PERF_GROUP_SOFTWARE; + + list = ctx_group_list(event, ctx); + list_add_tail(&event->group_entry, list); + } + + if (is_cgroup_event(event)) + ctx->nr_cgroups++; + + list_add_rcu(&event->event_entry, &ctx->event_list); + if (!ctx->nr_events) + perf_pmu_rotate_start(ctx->pmu); + ctx->nr_events++; + if (event->attr.inherit_stat) + ctx->nr_stat++; +} + +/* + * Called at perf_event creation and when events are attached/detached from a + * group. + */ +static void perf_event__read_size(struct perf_event *event) +{ + int entry = sizeof(u64); /* value */ + int size = 0; + int nr = 1; + + if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) + size += sizeof(u64); + + if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) + size += sizeof(u64); + + if (event->attr.read_format & PERF_FORMAT_ID) + entry += sizeof(u64); + + if (event->attr.read_format & PERF_FORMAT_GROUP) { + nr += event->group_leader->nr_siblings; + size += sizeof(u64); + } + + size += entry * nr; + event->read_size = size; +} + +static void perf_event__header_size(struct perf_event *event) +{ + struct perf_sample_data *data; + u64 sample_type = event->attr.sample_type; + u16 size = 0; + + perf_event__read_size(event); + + if (sample_type & PERF_SAMPLE_IP) + size += sizeof(data->ip); + + if (sample_type & PERF_SAMPLE_ADDR) + size += sizeof(data->addr); + + if (sample_type & PERF_SAMPLE_PERIOD) + size += sizeof(data->period); + + if (sample_type & PERF_SAMPLE_READ) + size += event->read_size; + + event->header_size = size; +} + +static void perf_event__id_header_size(struct perf_event *event) +{ + struct perf_sample_data *data; + u64 sample_type = event->attr.sample_type; + u16 size = 0; + + if (sample_type & PERF_SAMPLE_TID) + size += sizeof(data->tid_entry); + + if (sample_type & PERF_SAMPLE_TIME) + size += sizeof(data->time); + + if (sample_type & PERF_SAMPLE_ID) + size += sizeof(data->id); + + if (sample_type & PERF_SAMPLE_STREAM_ID) + size += sizeof(data->stream_id); + + if (sample_type & PERF_SAMPLE_CPU) + size += sizeof(data->cpu_entry); + + event->id_header_size = size; +} + +static void perf_group_attach(struct perf_event *event) +{ + struct perf_event *group_leader = event->group_leader, *pos; + + /* + * We can have double attach due to group movement in perf_event_open. + */ + if (event->attach_state & PERF_ATTACH_GROUP) + return; + + event->attach_state |= PERF_ATTACH_GROUP; + + if (group_leader == event) + return; + + if (group_leader->group_flags & PERF_GROUP_SOFTWARE && + !is_software_event(event)) + group_leader->group_flags &= ~PERF_GROUP_SOFTWARE; + + list_add_tail(&event->group_entry, &group_leader->sibling_list); + group_leader->nr_siblings++; + + perf_event__header_size(group_leader); + + list_for_each_entry(pos, &group_leader->sibling_list, group_entry) + perf_event__header_size(pos); +} + +/* + * Remove a event from the lists for its context. + * Must be called with ctx->mutex and ctx->lock held. + */ +static void +list_del_event(struct perf_event *event, struct perf_event_context *ctx) +{ + struct perf_cpu_context *cpuctx; + /* + * We can have double detach due to exit/hot-unplug + close. + */ + if (!(event->attach_state & PERF_ATTACH_CONTEXT)) + return; + + event->attach_state &= ~PERF_ATTACH_CONTEXT; + + if (is_cgroup_event(event)) { + ctx->nr_cgroups--; + cpuctx = __get_cpu_context(ctx); + /* + * if there are no more cgroup events + * then cler cgrp to avoid stale pointer + * in update_cgrp_time_from_cpuctx() + */ + if (!ctx->nr_cgroups) + cpuctx->cgrp = NULL; + } + + ctx->nr_events--; + if (event->attr.inherit_stat) + ctx->nr_stat--; + + list_del_rcu(&event->event_entry); + + if (event->group_leader == event) + list_del_init(&event->group_entry); + + update_group_times(event); + + /* + * If event was in error state, then keep it + * that way, otherwise bogus counts will be + * returned on read(). The only way to get out + * of error state is by explicit re-enabling + * of the event + */ + if (event->state > PERF_EVENT_STATE_OFF) + event->state = PERF_EVENT_STATE_OFF; +} + +static void perf_group_detach(struct perf_event *event) +{ + struct perf_event *sibling, *tmp; + struct list_head *list = NULL; + + /* + * We can have double detach due to exit/hot-unplug + close. + */ + if (!(event->attach_state & PERF_ATTACH_GROUP)) + return; + + event->attach_state &= ~PERF_ATTACH_GROUP; + + /* + * If this is a sibling, remove it from its group. + */ + if (event->group_leader != event) { + list_del_init(&event->group_entry); + event->group_leader->nr_siblings--; + goto out; + } + + if (!list_empty(&event->group_entry)) + list = &event->group_entry; + + /* + * If this was a group event with sibling events then + * upgrade the siblings to singleton events by adding them + * to whatever list we are on. + */ + list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) { + if (list) + list_move_tail(&sibling->group_entry, list); + sibling->group_leader = sibling; + + /* Inherit group flags from the previous leader */ + sibling->group_flags = event->group_flags; + } + +out: + perf_event__header_size(event->group_leader); + + list_for_each_entry(tmp, &event->group_leader->sibling_list, group_entry) + perf_event__header_size(tmp); +} + +static inline int +event_filter_match(struct perf_event *event) +{ + return (event->cpu == -1 || event->cpu == smp_processor_id()) + && perf_cgroup_match(event); +} + +static void +event_sched_out(struct perf_event *event, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx) +{ + u64 tstamp = perf_event_time(event); + u64 delta; + /* + * An event which could not be activated because of + * filter mismatch still needs to have its timings + * maintained, otherwise bogus information is return + * via read() for time_enabled, time_running: + */ + if (event->state == PERF_EVENT_STATE_INACTIVE + && !event_filter_match(event)) { + delta = tstamp - event->tstamp_stopped; + event->tstamp_running += delta; + event->tstamp_stopped = tstamp; + } + + if (event->state != PERF_EVENT_STATE_ACTIVE) + return; + + event->state = PERF_EVENT_STATE_INACTIVE; + if (event->pending_disable) { + event->pending_disable = 0; + event->state = PERF_EVENT_STATE_OFF; + } + event->tstamp_stopped = tstamp; + event->pmu->del(event, 0); + event->oncpu = -1; + + if (!is_software_event(event)) + cpuctx->active_oncpu--; + ctx->nr_active--; + if (event->attr.exclusive || !cpuctx->active_oncpu) + cpuctx->exclusive = 0; +} + +static void +group_sched_out(struct perf_event *group_event, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx) +{ + struct perf_event *event; + int state = group_event->state; + + event_sched_out(group_event, cpuctx, ctx); + + /* + * Schedule out siblings (if any): + */ + list_for_each_entry(event, &group_event->sibling_list, group_entry) + event_sched_out(event, cpuctx, ctx); + + if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive) + cpuctx->exclusive = 0; +} + +/* + * Cross CPU call to remove a performance event + * + * We disable the event on the hardware level first. After that we + * remove it from the context list. + */ +static int __perf_remove_from_context(void *info) +{ + struct perf_event *event = info; + struct perf_event_context *ctx = event->ctx; + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + + raw_spin_lock(&ctx->lock); + event_sched_out(event, cpuctx, ctx); + list_del_event(event, ctx); + raw_spin_unlock(&ctx->lock); + + return 0; +} + + +/* + * Remove the event from a task's (or a CPU's) list of events. + * + * CPU events are removed with a smp call. For task events we only + * call when the task is on a CPU. + * + * If event->ctx is a cloned context, callers must make sure that + * every task struct that event->ctx->task could possibly point to + * remains valid. This is OK when called from perf_release since + * that only calls us on the top-level context, which can't be a clone. + * When called from perf_event_exit_task, it's OK because the + * context has been detached from its task. + */ +static void perf_remove_from_context(struct perf_event *event) +{ + struct perf_event_context *ctx = event->ctx; + struct task_struct *task = ctx->task; + + lockdep_assert_held(&ctx->mutex); + + if (!task) { + /* + * Per cpu events are removed via an smp call and + * the removal is always successful. + */ + cpu_function_call(event->cpu, __perf_remove_from_context, event); + return; + } + +retry: + if (!task_function_call(task, __perf_remove_from_context, event)) + return; + + raw_spin_lock_irq(&ctx->lock); + /* + * If we failed to find a running task, but find the context active now + * that we've acquired the ctx->lock, retry. + */ + if (ctx->is_active) { + raw_spin_unlock_irq(&ctx->lock); + goto retry; + } + + /* + * Since the task isn't running, its safe to remove the event, us + * holding the ctx->lock ensures the task won't get scheduled in. + */ + list_del_event(event, ctx); + raw_spin_unlock_irq(&ctx->lock); +} + +/* + * Cross CPU call to disable a performance event + */ +static int __perf_event_disable(void *info) +{ + struct perf_event *event = info; + struct perf_event_context *ctx = event->ctx; + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + + /* + * If this is a per-task event, need to check whether this + * event's task is the current task on this cpu. + * + * Can trigger due to concurrent perf_event_context_sched_out() + * flipping contexts around. + */ + if (ctx->task && cpuctx->task_ctx != ctx) + return -EINVAL; + + raw_spin_lock(&ctx->lock); + + /* + * If the event is on, turn it off. + * If it is in error state, leave it in error state. + */ + if (event->state >= PERF_EVENT_STATE_INACTIVE) { + update_context_time(ctx); + update_cgrp_time_from_event(event); + update_group_times(event); + if (event == event->group_leader) + group_sched_out(event, cpuctx, ctx); + else + event_sched_out(event, cpuctx, ctx); + event->state = PERF_EVENT_STATE_OFF; + } + + raw_spin_unlock(&ctx->lock); + + return 0; +} + +/* + * Disable a event. + * + * If event->ctx is a cloned context, callers must make sure that + * every task struct that event->ctx->task could possibly point to + * remains valid. This condition is satisifed when called through + * perf_event_for_each_child or perf_event_for_each because they + * hold the top-level event's child_mutex, so any descendant that + * goes to exit will block in sync_child_event. + * When called from perf_pending_event it's OK because event->ctx + * is the current context on this CPU and preemption is disabled, + * hence we can't get into perf_event_task_sched_out for this context. + */ +void perf_event_disable(struct perf_event *event) +{ + struct perf_event_context *ctx = event->ctx; + struct task_struct *task = ctx->task; + + if (!task) { + /* + * Disable the event on the cpu that it's on + */ + cpu_function_call(event->cpu, __perf_event_disable, event); + return; + } + +retry: + if (!task_function_call(task, __perf_event_disable, event)) + return; + + raw_spin_lock_irq(&ctx->lock); + /* + * If the event is still active, we need to retry the cross-call. + */ + if (event->state == PERF_EVENT_STATE_ACTIVE) { + raw_spin_unlock_irq(&ctx->lock); + /* + * Reload the task pointer, it might have been changed by + * a concurrent perf_event_context_sched_out(). + */ + task = ctx->task; + goto retry; + } + + /* + * Since we have the lock this context can't be scheduled + * in, so we can change the state safely. + */ + if (event->state == PERF_EVENT_STATE_INACTIVE) { + update_group_times(event); + event->state = PERF_EVENT_STATE_OFF; + } + raw_spin_unlock_irq(&ctx->lock); +} + +static void perf_set_shadow_time(struct perf_event *event, + struct perf_event_context *ctx, + u64 tstamp) +{ + /* + * use the correct time source for the time snapshot + * + * We could get by without this by leveraging the + * fact that to get to this function, the caller + * has most likely already called update_context_time() + * and update_cgrp_time_xx() and thus both timestamp + * are identical (or very close). Given that tstamp is, + * already adjusted for cgroup, we could say that: + * tstamp - ctx->timestamp + * is equivalent to + * tstamp - cgrp->timestamp. + * + * Then, in perf_output_read(), the calculation would + * work with no changes because: + * - event is guaranteed scheduled in + * - no scheduled out in between + * - thus the timestamp would be the same + * + * But this is a bit hairy. + * + * So instead, we have an explicit cgroup call to remain + * within the time time source all along. We believe it + * is cleaner and simpler to understand. + */ + if (is_cgroup_event(event)) + perf_cgroup_set_shadow_time(event, tstamp); + else + event->shadow_ctx_time = tstamp - ctx->timestamp; +} + +#define MAX_INTERRUPTS (~0ULL) + +static void perf_log_throttle(struct perf_event *event, int enable); + +static int +event_sched_in(struct perf_event *event, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx) +{ + u64 tstamp = perf_event_time(event); + + if (event->state <= PERF_EVENT_STATE_OFF) + return 0; + + event->state = PERF_EVENT_STATE_ACTIVE; + event->oncpu = smp_processor_id(); + + /* + * Unthrottle events, since we scheduled we might have missed several + * ticks already, also for a heavily scheduling task there is little + * guarantee it'll get a tick in a timely manner. + */ + if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) { + perf_log_throttle(event, 1); + event->hw.interrupts = 0; + } + + /* + * The new state must be visible before we turn it on in the hardware: + */ + smp_wmb(); + + if (event->pmu->add(event, PERF_EF_START)) { + event->state = PERF_EVENT_STATE_INACTIVE; + event->oncpu = -1; + return -EAGAIN; + } + + event->tstamp_running += tstamp - event->tstamp_stopped; + + perf_set_shadow_time(event, ctx, tstamp); + + if (!is_software_event(event)) + cpuctx->active_oncpu++; + ctx->nr_active++; + + if (event->attr.exclusive) + cpuctx->exclusive = 1; + + return 0; +} + +static int +group_sched_in(struct perf_event *group_event, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx) +{ + struct perf_event *event, *partial_group = NULL; + struct pmu *pmu = group_event->pmu; + u64 now = ctx->time; + bool simulate = false; + + if (group_event->state == PERF_EVENT_STATE_OFF) + return 0; + + pmu->start_txn(pmu); + + if (event_sched_in(group_event, cpuctx, ctx)) { + pmu->cancel_txn(pmu); + return -EAGAIN; + } + + /* + * Schedule in siblings as one group (if any): + */ + list_for_each_entry(event, &group_event->sibling_list, group_entry) { + if (event_sched_in(event, cpuctx, ctx)) { + partial_group = event; + goto group_error; + } + } + + if (!pmu->commit_txn(pmu)) + return 0; + +group_error: + /* + * Groups can be scheduled in as one unit only, so undo any + * partial group before returning: + * The events up to the failed event are scheduled out normally, + * tstamp_stopped will be updated. + * + * The failed events and the remaining siblings need to have + * their timings updated as if they had gone thru event_sched_in() + * and event_sched_out(). This is required to get consistent timings + * across the group. This also takes care of the case where the group + * could never be scheduled by ensuring tstamp_stopped is set to mark + * the time the event was actually stopped, such that time delta + * calculation in update_event_times() is correct. + */ + list_for_each_entry(event, &group_event->sibling_list, group_entry) { + if (event == partial_group) + simulate = true; + + if (simulate) { + event->tstamp_running += now - event->tstamp_stopped; + event->tstamp_stopped = now; + } else { + event_sched_out(event, cpuctx, ctx); + } + } + event_sched_out(group_event, cpuctx, ctx); + + pmu->cancel_txn(pmu); + + return -EAGAIN; +} + +/* + * Work out whether we can put this event group on the CPU now. + */ +static int group_can_go_on(struct perf_event *event, + struct perf_cpu_context *cpuctx, + int can_add_hw) +{ + /* + * Groups consisting entirely of software events can always go on. + */ + if (event->group_flags & PERF_GROUP_SOFTWARE) + return 1; + /* + * If an exclusive group is already on, no other hardware + * events can go on. + */ + if (cpuctx->exclusive) + return 0; + /* + * If this group is exclusive and there are already + * events on the CPU, it can't go on. + */ + if (event->attr.exclusive && cpuctx->active_oncpu) + return 0; + /* + * Otherwise, try to add it if all previous groups were able + * to go on. + */ + return can_add_hw; +} + +static void add_event_to_ctx(struct perf_event *event, + struct perf_event_context *ctx) +{ + u64 tstamp = perf_event_time(event); + + list_add_event(event, ctx); + perf_group_attach(event); + event->tstamp_enabled = tstamp; + event->tstamp_running = tstamp; + event->tstamp_stopped = tstamp; +} + +static void perf_event_context_sched_in(struct perf_event_context *ctx, + struct task_struct *tsk); + +/* + * Cross CPU call to install and enable a performance event + * + * Must be called with ctx->mutex held + */ +static int __perf_install_in_context(void *info) +{ + struct perf_event *event = info; + struct perf_event_context *ctx = event->ctx; + struct perf_event *leader = event->group_leader; + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + int err; + + /* + * In case we're installing a new context to an already running task, + * could also happen before perf_event_task_sched_in() on architectures + * which do context switches with IRQs enabled. + */ + if (ctx->task && !cpuctx->task_ctx) + perf_event_context_sched_in(ctx, ctx->task); + + raw_spin_lock(&ctx->lock); + ctx->is_active = 1; + update_context_time(ctx); + /* + * update cgrp time only if current cgrp + * matches event->cgrp. Must be done before + * calling add_event_to_ctx() + */ + update_cgrp_time_from_event(event); + + add_event_to_ctx(event, ctx); + + if (!event_filter_match(event)) + goto unlock; + + /* + * Don't put the event on if it is disabled or if + * it is in a group and the group isn't on. + */ + if (event->state != PERF_EVENT_STATE_INACTIVE || + (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)) + goto unlock; + + /* + * An exclusive event can't go on if there are already active + * hardware events, and no hardware event can go on if there + * is already an exclusive event on. + */ + if (!group_can_go_on(event, cpuctx, 1)) + err = -EEXIST; + else + err = event_sched_in(event, cpuctx, ctx); + + if (err) { + /* + * This event couldn't go on. If it is in a group + * then we have to pull the whole group off. + * If the event group is pinned then put it in error state. + */ + if (leader != event) + group_sched_out(leader, cpuctx, ctx); + if (leader->attr.pinned) { + update_group_times(leader); + leader->state = PERF_EVENT_STATE_ERROR; + } + } + +unlock: + raw_spin_unlock(&ctx->lock); + + return 0; +} + +/* + * Attach a performance event to a context + * + * First we add the event to the list with the hardware enable bit + * in event->hw_config cleared. + * + * If the event is attached to a task which is on a CPU we use a smp + * call to enable it in the task context. The task might have been + * scheduled away, but we check this in the smp call again. + */ +static void +perf_install_in_context(struct perf_event_context *ctx, + struct perf_event *event, + int cpu) +{ + struct task_struct *task = ctx->task; + + lockdep_assert_held(&ctx->mutex); + + event->ctx = ctx; + + if (!task) { + /* + * Per cpu events are installed via an smp call and + * the install is always successful. + */ + cpu_function_call(cpu, __perf_install_in_context, event); + return; + } + +retry: + if (!task_function_call(task, __perf_install_in_context, event)) + return; + + raw_spin_lock_irq(&ctx->lock); + /* + * If we failed to find a running task, but find the context active now + * that we've acquired the ctx->lock, retry. + */ + if (ctx->is_active) { + raw_spin_unlock_irq(&ctx->lock); + goto retry; + } + + /* + * Since the task isn't running, its safe to add the event, us holding + * the ctx->lock ensures the task won't get scheduled in. + */ + add_event_to_ctx(event, ctx); + raw_spin_unlock_irq(&ctx->lock); +} + +/* + * Put a event into inactive state and update time fields. + * Enabling the leader of a group effectively enables all + * the group members that aren't explicitly disabled, so we + * have to update their ->tstamp_enabled also. + * Note: this works for group members as well as group leaders + * since the non-leader members' sibling_lists will be empty. + */ +static void __perf_event_mark_enabled(struct perf_event *event, + struct perf_event_context *ctx) +{ + struct perf_event *sub; + u64 tstamp = perf_event_time(event); + + event->state = PERF_EVENT_STATE_INACTIVE; + event->tstamp_enabled = tstamp - event->total_time_enabled; + list_for_each_entry(sub, &event->sibling_list, group_entry) { + if (sub->state >= PERF_EVENT_STATE_INACTIVE) + sub->tstamp_enabled = tstamp - sub->total_time_enabled; + } +} + +/* + * Cross CPU call to enable a performance event + */ +static int __perf_event_enable(void *info) +{ + struct perf_event *event = info; + struct perf_event_context *ctx = event->ctx; + struct perf_event *leader = event->group_leader; + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + int err; + + if (WARN_ON_ONCE(!ctx->is_active)) + return -EINVAL; + + raw_spin_lock(&ctx->lock); + update_context_time(ctx); + + if (event->state >= PERF_EVENT_STATE_INACTIVE) + goto unlock; + + /* + * set current task's cgroup time reference point + */ + perf_cgroup_set_timestamp(current, ctx); + + __perf_event_mark_enabled(event, ctx); + + if (!event_filter_match(event)) { + if (is_cgroup_event(event)) + perf_cgroup_defer_enabled(event); + goto unlock; + } + + /* + * If the event is in a group and isn't the group leader, + * then don't put it on unless the group is on. + */ + if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) + goto unlock; + + if (!group_can_go_on(event, cpuctx, 1)) { + err = -EEXIST; + } else { + if (event == leader) + err = group_sched_in(event, cpuctx, ctx); + else + err = event_sched_in(event, cpuctx, ctx); + } + + if (err) { + /* + * If this event can't go on and it's part of a + * group, then the whole group has to come off. + */ + if (leader != event) + group_sched_out(leader, cpuctx, ctx); + if (leader->attr.pinned) { + update_group_times(leader); + leader->state = PERF_EVENT_STATE_ERROR; + } + } + +unlock: + raw_spin_unlock(&ctx->lock); + + return 0; +} + +/* + * Enable a event. + * + * If event->ctx is a cloned context, callers must make sure that + * every task struct that event->ctx->task could possibly point to + * remains valid. This condition is satisfied when called through + * perf_event_for_each_child or perf_event_for_each as described + * for perf_event_disable. + */ +void perf_event_enable(struct perf_event *event) +{ + struct perf_event_context *ctx = event->ctx; + struct task_struct *task = ctx->task; + + if (!task) { + /* + * Enable the event on the cpu that it's on + */ + cpu_function_call(event->cpu, __perf_event_enable, event); + return; + } + + raw_spin_lock_irq(&ctx->lock); + if (event->state >= PERF_EVENT_STATE_INACTIVE) + goto out; + + /* + * If the event is in error state, clear that first. + * That way, if we see the event in error state below, we + * know that it has gone back into error state, as distinct + * from the task having been scheduled away before the + * cross-call arrived. + */ + if (event->state == PERF_EVENT_STATE_ERROR) + event->state = PERF_EVENT_STATE_OFF; + +retry: + if (!ctx->is_active) { + __perf_event_mark_enabled(event, ctx); + goto out; + } + + raw_spin_unlock_irq(&ctx->lock); + + if (!task_function_call(task, __perf_event_enable, event)) + return; + + raw_spin_lock_irq(&ctx->lock); + + /* + * If the context is active and the event is still off, + * we need to retry the cross-call. + */ + if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF) { + /* + * task could have been flipped by a concurrent + * perf_event_context_sched_out() + */ + task = ctx->task; + goto retry; + } + +out: + raw_spin_unlock_irq(&ctx->lock); +} + +static int perf_event_refresh(struct perf_event *event, int refresh) +{ + /* + * not supported on inherited events + */ + if (event->attr.inherit || !is_sampling_event(event)) + return -EINVAL; + + atomic_add(refresh, &event->event_limit); + perf_event_enable(event); + + return 0; +} + +static void ctx_sched_out(struct perf_event_context *ctx, + struct perf_cpu_context *cpuctx, + enum event_type_t event_type) +{ + struct perf_event *event; + + raw_spin_lock(&ctx->lock); + perf_pmu_disable(ctx->pmu); + ctx->is_active = 0; + if (likely(!ctx->nr_events)) + goto out; + update_context_time(ctx); + update_cgrp_time_from_cpuctx(cpuctx); + + if (!ctx->nr_active) + goto out; + + if (event_type & EVENT_PINNED) { + list_for_each_entry(event, &ctx->pinned_groups, group_entry) + group_sched_out(event, cpuctx, ctx); + } + + if (event_type & EVENT_FLEXIBLE) { + list_for_each_entry(event, &ctx->flexible_groups, group_entry) + group_sched_out(event, cpuctx, ctx); + } +out: + perf_pmu_enable(ctx->pmu); + raw_spin_unlock(&ctx->lock); +} + +/* + * Test whether two contexts are equivalent, i.e. whether they + * have both been cloned from the same version of the same context + * and they both have the same number of enabled events. + * If the number of enabled events is the same, then the set + * of enabled events should be the same, because these are both + * inherited contexts, therefore we can't access individual events + * in them directly with an fd; we can only enable/disable all + * events via prctl, or enable/disable all events in a family + * via ioctl, which will have the same effect on both contexts. + */ +static int context_equiv(struct perf_event_context *ctx1, + struct perf_event_context *ctx2) +{ + return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx + && ctx1->parent_gen == ctx2->parent_gen + && !ctx1->pin_count && !ctx2->pin_count; +} + +static void __perf_event_sync_stat(struct perf_event *event, + struct perf_event *next_event) +{ + u64 value; + + if (!event->attr.inherit_stat) + return; + + /* + * Update the event value, we cannot use perf_event_read() + * because we're in the middle of a context switch and have IRQs + * disabled, which upsets smp_call_function_single(), however + * we know the event must be on the current CPU, therefore we + * don't need to use it. + */ + switch (event->state) { + case PERF_EVENT_STATE_ACTIVE: + event->pmu->read(event); + /* fall-through */ + + case PERF_EVENT_STATE_INACTIVE: + update_event_times(event); + break; + + default: + break; + } + + /* + * In order to keep per-task stats reliable we need to flip the event + * values when we flip the contexts. + */ + value = local64_read(&next_event->count); + value = local64_xchg(&event->count, value); + local64_set(&next_event->count, value); + + swap(event->total_time_enabled, next_event->total_time_enabled); + swap(event->total_time_running, next_event->total_time_running); + + /* + * Since we swizzled the values, update the user visible data too. + */ + perf_event_update_userpage(event); + perf_event_update_userpage(next_event); +} + +#define list_next_entry(pos, member) \ + list_entry(pos->member.next, typeof(*pos), member) + +static void perf_event_sync_stat(struct perf_event_context *ctx, + struct perf_event_context *next_ctx) +{ + struct perf_event *event, *next_event; + + if (!ctx->nr_stat) + return; + + update_context_time(ctx); + + event = list_first_entry(&ctx->event_list, + struct perf_event, event_entry); + + next_event = list_first_entry(&next_ctx->event_list, + struct perf_event, event_entry); + + while (&event->event_entry != &ctx->event_list && + &next_event->event_entry != &next_ctx->event_list) { + + __perf_event_sync_stat(event, next_event); + + event = list_next_entry(event, event_entry); + next_event = list_next_entry(next_event, event_entry); + } +} + +static void perf_event_context_sched_out(struct task_struct *task, int ctxn, + struct task_struct *next) +{ + struct perf_event_context *ctx = task->perf_event_ctxp[ctxn]; + struct perf_event_context *next_ctx; + struct perf_event_context *parent; + struct perf_cpu_context *cpuctx; + int do_switch = 1; + + if (likely(!ctx)) + return; + + cpuctx = __get_cpu_context(ctx); + if (!cpuctx->task_ctx) + return; + + rcu_read_lock(); + parent = rcu_dereference(ctx->parent_ctx); + next_ctx = next->perf_event_ctxp[ctxn]; + if (parent && next_ctx && + rcu_dereference(next_ctx->parent_ctx) == parent) { + /* + * Looks like the two contexts are clones, so we might be + * able to optimize the context switch. We lock both + * contexts and check that they are clones under the + * lock (including re-checking that neither has been + * uncloned in the meantime). It doesn't matter which + * order we take the locks because no other cpu could + * be trying to lock both of these tasks. + */ + raw_spin_lock(&ctx->lock); + raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); + if (context_equiv(ctx, next_ctx)) { + /* + * XXX do we need a memory barrier of sorts + * wrt to rcu_dereference() of perf_event_ctxp + */ + task->perf_event_ctxp[ctxn] = next_ctx; + next->perf_event_ctxp[ctxn] = ctx; + ctx->task = next; + next_ctx->task = task; + do_switch = 0; + + perf_event_sync_stat(ctx, next_ctx); + } + raw_spin_unlock(&next_ctx->lock); + raw_spin_unlock(&ctx->lock); + } + rcu_read_unlock(); + + if (do_switch) { + ctx_sched_out(ctx, cpuctx, EVENT_ALL); + cpuctx->task_ctx = NULL; + } +} + +#define for_each_task_context_nr(ctxn) \ + for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++) + +/* + * Called from scheduler to remove the events of the current task, + * with interrupts disabled. + * + * We stop each event and update the event value in event->count. + * + * This does not protect us against NMI, but disable() + * sets the disabled bit in the control field of event _before_ + * accessing the event control register. If a NMI hits, then it will + * not restart the event. + */ +void __perf_event_task_sched_out(struct task_struct *task, + struct task_struct *next) +{ + int ctxn; + + for_each_task_context_nr(ctxn) + perf_event_context_sched_out(task, ctxn, next); + + /* + * if cgroup events exist on this CPU, then we need + * to check if we have to switch out PMU state. + * cgroup event are system-wide mode only + */ + if (atomic_read(&__get_cpu_var(perf_cgroup_events))) + perf_cgroup_sched_out(task); +} + +static void task_ctx_sched_out(struct perf_event_context *ctx, + enum event_type_t event_type) +{ + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + + if (!cpuctx->task_ctx) + return; + + if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) + return; + + ctx_sched_out(ctx, cpuctx, event_type); + cpuctx->task_ctx = NULL; +} + +/* + * Called with IRQs disabled + */ +static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, + enum event_type_t event_type) +{ + ctx_sched_out(&cpuctx->ctx, cpuctx, event_type); +} + +static void +ctx_pinned_sched_in(struct perf_event_context *ctx, + struct perf_cpu_context *cpuctx) +{ + struct perf_event *event; + + list_for_each_entry(event, &ctx->pinned_groups, group_entry) { + if (event->state <= PERF_EVENT_STATE_OFF) + continue; + if (!event_filter_match(event)) + continue; + + /* may need to reset tstamp_enabled */ + if (is_cgroup_event(event)) + perf_cgroup_mark_enabled(event, ctx); + + if (group_can_go_on(event, cpuctx, 1)) + group_sched_in(event, cpuctx, ctx); + + /* + * If this pinned group hasn't been scheduled, + * put it in error state. + */ + if (event->state == PERF_EVENT_STATE_INACTIVE) { + update_group_times(event); + event->state = PERF_EVENT_STATE_ERROR; + } + } +} + +static void +ctx_flexible_sched_in(struct perf_event_context *ctx, + struct perf_cpu_context *cpuctx) +{ + struct perf_event *event; + int can_add_hw = 1; + + list_for_each_entry(event, &ctx->flexible_groups, group_entry) { + /* Ignore events in OFF or ERROR state */ + if (event->state <= PERF_EVENT_STATE_OFF) + continue; + /* + * Listen to the 'cpu' scheduling filter constraint + * of events: + */ + if (!event_filter_match(event)) + continue; + + /* may need to reset tstamp_enabled */ + if (is_cgroup_event(event)) + perf_cgroup_mark_enabled(event, ctx); + + if (group_can_go_on(event, cpuctx, can_add_hw)) { + if (group_sched_in(event, cpuctx, ctx)) + can_add_hw = 0; + } + } +} + +static void +ctx_sched_in(struct perf_event_context *ctx, + struct perf_cpu_context *cpuctx, + enum event_type_t event_type, + struct task_struct *task) +{ + u64 now; + + raw_spin_lock(&ctx->lock); + ctx->is_active = 1; + if (likely(!ctx->nr_events)) + goto out; + + now = perf_clock(); + ctx->timestamp = now; + perf_cgroup_set_timestamp(task, ctx); + /* + * First go through the list and put on any pinned groups + * in order to give them the best chance of going on. + */ + if (event_type & EVENT_PINNED) + ctx_pinned_sched_in(ctx, cpuctx); + + /* Then walk through the lower prio flexible groups */ + if (event_type & EVENT_FLEXIBLE) + ctx_flexible_sched_in(ctx, cpuctx); + +out: + raw_spin_unlock(&ctx->lock); +} + +static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, + enum event_type_t event_type, + struct task_struct *task) +{ + struct perf_event_context *ctx = &cpuctx->ctx; + + ctx_sched_in(ctx, cpuctx, event_type, task); +} + +static void task_ctx_sched_in(struct perf_event_context *ctx, + enum event_type_t event_type) +{ + struct perf_cpu_context *cpuctx; + + cpuctx = __get_cpu_context(ctx); + if (cpuctx->task_ctx == ctx) + return; + + ctx_sched_in(ctx, cpuctx, event_type, NULL); + cpuctx->task_ctx = ctx; +} + +static void perf_event_context_sched_in(struct perf_event_context *ctx, + struct task_struct *task) +{ + struct perf_cpu_context *cpuctx; + + cpuctx = __get_cpu_context(ctx); + if (cpuctx->task_ctx == ctx) + return; + + perf_pmu_disable(ctx->pmu); + /* + * We want to keep the following priority order: + * cpu pinned (that don't need to move), task pinned, + * cpu flexible, task flexible. + */ + cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); + + ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task); + cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task); + ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task); + + cpuctx->task_ctx = ctx; + + /* + * Since these rotations are per-cpu, we need to ensure the + * cpu-context we got scheduled on is actually rotating. + */ + perf_pmu_rotate_start(ctx->pmu); + perf_pmu_enable(ctx->pmu); +} + +/* + * Called from scheduler to add the events of the current task + * with interrupts disabled. + * + * We restore the event value and then enable it. + * + * This does not protect us against NMI, but enable() + * sets the enabled bit in the control field of event _before_ + * accessing the event control register. If a NMI hits, then it will + * keep the event running. + */ +void __perf_event_task_sched_in(struct task_struct *task) +{ + struct perf_event_context *ctx; + int ctxn; + + for_each_task_context_nr(ctxn) { + ctx = task->perf_event_ctxp[ctxn]; + if (likely(!ctx)) + continue; + + perf_event_context_sched_in(ctx, task); + } + /* + * if cgroup events exist on this CPU, then we need + * to check if we have to switch in PMU state. + * cgroup event are system-wide mode only + */ + if (atomic_read(&__get_cpu_var(perf_cgroup_events))) + perf_cgroup_sched_in(task); +} + +static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) +{ + u64 frequency = event->attr.sample_freq; + u64 sec = NSEC_PER_SEC; + u64 divisor, dividend; + + int count_fls, nsec_fls, frequency_fls, sec_fls; + + count_fls = fls64(count); + nsec_fls = fls64(nsec); + frequency_fls = fls64(frequency); + sec_fls = 30; + + /* + * We got @count in @nsec, with a target of sample_freq HZ + * the target period becomes: + * + * @count * 10^9 + * period = ------------------- + * @nsec * sample_freq + * + */ + + /* + * Reduce accuracy by one bit such that @a and @b converge + * to a similar magnitude. + */ +#define REDUCE_FLS(a, b) \ +do { \ + if (a##_fls > b##_fls) { \ + a >>= 1; \ + a##_fls--; \ + } else { \ + b >>= 1; \ + b##_fls--; \ + } \ +} while (0) + + /* + * Reduce accuracy until either term fits in a u64, then proceed with + * the other, so that finally we can do a u64/u64 division. + */ + while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) { + REDUCE_FLS(nsec, frequency); + REDUCE_FLS(sec, count); + } + + if (count_fls + sec_fls > 64) { + divisor = nsec * frequency; + + while (count_fls + sec_fls > 64) { + REDUCE_FLS(count, sec); + divisor >>= 1; + } + + dividend = count * sec; + } else { + dividend = count * sec; + + while (nsec_fls + frequency_fls > 64) { + REDUCE_FLS(nsec, frequency); + dividend >>= 1; + } + + divisor = nsec * frequency; + } + + if (!divisor) + return dividend; + + return div64_u64(dividend, divisor); +} + +static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) +{ + struct hw_perf_event *hwc = &event->hw; + s64 period, sample_period; + s64 delta; + + period = perf_calculate_period(event, nsec, count); + + delta = (s64)(period - hwc->sample_period); + delta = (delta + 7) / 8; /* low pass filter */ + + sample_period = hwc->sample_period + delta; + + if (!sample_period) + sample_period = 1; + + hwc->sample_period = sample_period; + + if (local64_read(&hwc->period_left) > 8*sample_period) { + event->pmu->stop(event, PERF_EF_UPDATE); + local64_set(&hwc->period_left, 0); + event->pmu->start(event, PERF_EF_RELOAD); + } +} + +static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period) +{ + struct perf_event *event; + struct hw_perf_event *hwc; + u64 interrupts, now; + s64 delta; + + raw_spin_lock(&ctx->lock); + list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { + if (event->state != PERF_EVENT_STATE_ACTIVE) + continue; + + if (!event_filter_match(event)) + continue; + + hwc = &event->hw; + + interrupts = hwc->interrupts; + hwc->interrupts = 0; + + /* + * unthrottle events on the tick + */ + if (interrupts == MAX_INTERRUPTS) { + perf_log_throttle(event, 1); + event->pmu->start(event, 0); + } + + if (!event->attr.freq || !event->attr.sample_freq) + continue; + + event->pmu->read(event); + now = local64_read(&event->count); + delta = now - hwc->freq_count_stamp; + hwc->freq_count_stamp = now; + + if (delta > 0) + perf_adjust_period(event, period, delta); + } + raw_spin_unlock(&ctx->lock); +} + +/* + * Round-robin a context's events: + */ +static void rotate_ctx(struct perf_event_context *ctx) +{ + raw_spin_lock(&ctx->lock); + + /* + * Rotate the first entry last of non-pinned groups. Rotation might be + * disabled by the inheritance code. + */ + if (!ctx->rotate_disable) + list_rotate_left(&ctx->flexible_groups); + + raw_spin_unlock(&ctx->lock); +} + +/* + * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized + * because they're strictly cpu affine and rotate_start is called with IRQs + * disabled, while rotate_context is called from IRQ context. + */ +static void perf_rotate_context(struct perf_cpu_context *cpuctx) +{ + u64 interval = (u64)cpuctx->jiffies_interval * TICK_NSEC; + struct perf_event_context *ctx = NULL; + int rotate = 0, remove = 1; + + if (cpuctx->ctx.nr_events) { + remove = 0; + if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active) + rotate = 1; + } + + ctx = cpuctx->task_ctx; + if (ctx && ctx->nr_events) { + remove = 0; + if (ctx->nr_events != ctx->nr_active) + rotate = 1; + } + + perf_pmu_disable(cpuctx->ctx.pmu); + perf_ctx_adjust_freq(&cpuctx->ctx, interval); + if (ctx) + perf_ctx_adjust_freq(ctx, interval); + + if (!rotate) + goto done; + + cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); + if (ctx) + task_ctx_sched_out(ctx, EVENT_FLEXIBLE); + + rotate_ctx(&cpuctx->ctx); + if (ctx) + rotate_ctx(ctx); + + cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, current); + if (ctx) + task_ctx_sched_in(ctx, EVENT_FLEXIBLE); + +done: + if (remove) + list_del_init(&cpuctx->rotation_list); + + perf_pmu_enable(cpuctx->ctx.pmu); +} + +void perf_event_task_tick(void) +{ + struct list_head *head = &__get_cpu_var(rotation_list); + struct perf_cpu_context *cpuctx, *tmp; + + WARN_ON(!irqs_disabled()); + + list_for_each_entry_safe(cpuctx, tmp, head, rotation_list) { + if (cpuctx->jiffies_interval == 1 || + !(jiffies % cpuctx->jiffies_interval)) + perf_rotate_context(cpuctx); + } +} + +static int event_enable_on_exec(struct perf_event *event, + struct perf_event_context *ctx) +{ + if (!event->attr.enable_on_exec) + return 0; + + event->attr.enable_on_exec = 0; + if (event->state >= PERF_EVENT_STATE_INACTIVE) + return 0; + + __perf_event_mark_enabled(event, ctx); + + return 1; +} + +/* + * Enable all of a task's events that have been marked enable-on-exec. + * This expects task == current. + */ +static void perf_event_enable_on_exec(struct perf_event_context *ctx) +{ + struct perf_event *event; + unsigned long flags; + int enabled = 0; + int ret; + + local_irq_save(flags); + if (!ctx || !ctx->nr_events) + goto out; + + /* + * We must ctxsw out cgroup events to avoid conflict + * when invoking perf_task_event_sched_in() later on + * in this function. Otherwise we end up trying to + * ctxswin cgroup events which are already scheduled + * in. + */ + perf_cgroup_sched_out(current); + task_ctx_sched_out(ctx, EVENT_ALL); + + raw_spin_lock(&ctx->lock); + + list_for_each_entry(event, &ctx->pinned_groups, group_entry) { + ret = event_enable_on_exec(event, ctx); + if (ret) + enabled = 1; + } + + list_for_each_entry(event, &ctx->flexible_groups, group_entry) { + ret = event_enable_on_exec(event, ctx); + if (ret) + enabled = 1; + } + + /* + * Unclone this context if we enabled any event. + */ + if (enabled) + unclone_ctx(ctx); + + raw_spin_unlock(&ctx->lock); + + /* + * Also calls ctxswin for cgroup events, if any: + */ + perf_event_context_sched_in(ctx, ctx->task); +out: + local_irq_restore(flags); +} + +/* + * Cross CPU call to read the hardware event + */ +static void __perf_event_read(void *info) +{ + struct perf_event *event = info; + struct perf_event_context *ctx = event->ctx; + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + + /* + * If this is a task context, we need to check whether it is + * the current task context of this cpu. If not it has been + * scheduled out before the smp call arrived. In that case + * event->count would have been updated to a recent sample + * when the event was scheduled out. + */ + if (ctx->task && cpuctx->task_ctx != ctx) + return; + + raw_spin_lock(&ctx->lock); + if (ctx->is_active) { + update_context_time(ctx); + update_cgrp_time_from_event(event); + } + update_event_times(event); + if (event->state == PERF_EVENT_STATE_ACTIVE) + event->pmu->read(event); + raw_spin_unlock(&ctx->lock); +} + +static inline u64 perf_event_count(struct perf_event *event) +{ + return local64_read(&event->count) + atomic64_read(&event->child_count); +} + +static u64 perf_event_read(struct perf_event *event) +{ + /* + * If event is enabled and currently active on a CPU, update the + * value in the event structure: + */ + if (event->state == PERF_EVENT_STATE_ACTIVE) { + smp_call_function_single(event->oncpu, + __perf_event_read, event, 1); + } else if (event->state == PERF_EVENT_STATE_INACTIVE) { + struct perf_event_context *ctx = event->ctx; + unsigned long flags; + + raw_spin_lock_irqsave(&ctx->lock, flags); + /* + * may read while context is not active + * (e.g., thread is blocked), in that case + * we cannot update context time + */ + if (ctx->is_active) { + update_context_time(ctx); + update_cgrp_time_from_event(event); + } + update_event_times(event); + raw_spin_unlock_irqrestore(&ctx->lock, flags); + } + + return perf_event_count(event); +} + +/* + * Callchain support + */ + +struct callchain_cpus_entries { + struct rcu_head rcu_head; + struct perf_callchain_entry *cpu_entries[0]; +}; + +static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]); +static atomic_t nr_callchain_events; +static DEFINE_MUTEX(callchain_mutex); +struct callchain_cpus_entries *callchain_cpus_entries; + + +__weak void perf_callchain_kernel(struct perf_callchain_entry *entry, + struct pt_regs *regs) +{ +} + +__weak void perf_callchain_user(struct perf_callchain_entry *entry, + struct pt_regs *regs) +{ +} + +static void release_callchain_buffers_rcu(struct rcu_head *head) +{ + struct callchain_cpus_entries *entries; + int cpu; + + entries = container_of(head, struct callchain_cpus_entries, rcu_head); + + for_each_possible_cpu(cpu) + kfree(entries->cpu_entries[cpu]); + + kfree(entries); +} + +static void release_callchain_buffers(void) +{ + struct callchain_cpus_entries *entries; + + entries = callchain_cpus_entries; + rcu_assign_pointer(callchain_cpus_entries, NULL); + call_rcu(&entries->rcu_head, release_callchain_buffers_rcu); +} + +static int alloc_callchain_buffers(void) +{ + int cpu; + int size; + struct callchain_cpus_entries *entries; + + /* + * We can't use the percpu allocation API for data that can be + * accessed from NMI. Use a temporary manual per cpu allocation + * until that gets sorted out. + */ + size = offsetof(struct callchain_cpus_entries, cpu_entries[nr_cpu_ids]); + + entries = kzalloc(size, GFP_KERNEL); + if (!entries) + return -ENOMEM; + + size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS; + + for_each_possible_cpu(cpu) { + entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL, + cpu_to_node(cpu)); + if (!entries->cpu_entries[cpu]) + goto fail; + } + + rcu_assign_pointer(callchain_cpus_entries, entries); + + return 0; + +fail: + for_each_possible_cpu(cpu) + kfree(entries->cpu_entries[cpu]); + kfree(entries); + + return -ENOMEM; +} + +static int get_callchain_buffers(void) +{ + int err = 0; + int count; + + mutex_lock(&callchain_mutex); + + count = atomic_inc_return(&nr_callchain_events); + if (WARN_ON_ONCE(count < 1)) { + err = -EINVAL; + goto exit; + } + + if (count > 1) { + /* If the allocation failed, give up */ + if (!callchain_cpus_entries) + err = -ENOMEM; + goto exit; + } + + err = alloc_callchain_buffers(); + if (err) + release_callchain_buffers(); +exit: + mutex_unlock(&callchain_mutex); + + return err; +} + +static void put_callchain_buffers(void) +{ + if (atomic_dec_and_mutex_lock(&nr_callchain_events, &callchain_mutex)) { + release_callchain_buffers(); + mutex_unlock(&callchain_mutex); + } +} + +static int get_recursion_context(int *recursion) +{ + int rctx; + + if (in_nmi()) + rctx = 3; + else if (in_irq()) + rctx = 2; + else if (in_softirq()) + rctx = 1; + else + rctx = 0; + + if (recursion[rctx]) + return -1; + + recursion[rctx]++; + barrier(); + + return rctx; +} + +static inline void put_recursion_context(int *recursion, int rctx) +{ + barrier(); + recursion[rctx]--; +} + +static struct perf_callchain_entry *get_callchain_entry(int *rctx) +{ + int cpu; + struct callchain_cpus_entries *entries; + + *rctx = get_recursion_context(__get_cpu_var(callchain_recursion)); + if (*rctx == -1) + return NULL; + + entries = rcu_dereference(callchain_cpus_entries); + if (!entries) + return NULL; + + cpu = smp_processor_id(); + + return &entries->cpu_entries[cpu][*rctx]; +} + +static void +put_callchain_entry(int rctx) +{ + put_recursion_context(__get_cpu_var(callchain_recursion), rctx); +} + +static struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) +{ + int rctx; + struct perf_callchain_entry *entry; + + + entry = get_callchain_entry(&rctx); + if (rctx == -1) + return NULL; + + if (!entry) + goto exit_put; + + entry->nr = 0; + + if (!user_mode(regs)) { + perf_callchain_store(entry, PERF_CONTEXT_KERNEL); + perf_callchain_kernel(entry, regs); + if (current->mm) + regs = task_pt_regs(current); + else + regs = NULL; + } + + if (regs) { + perf_callchain_store(entry, PERF_CONTEXT_USER); + perf_callchain_user(entry, regs); + } + +exit_put: + put_callchain_entry(rctx); + + return entry; +} + +/* + * Initialize the perf_event context in a task_struct: + */ +static void __perf_event_init_context(struct perf_event_context *ctx) +{ + raw_spin_lock_init(&ctx->lock); + mutex_init(&ctx->mutex); + INIT_LIST_HEAD(&ctx->pinned_groups); + INIT_LIST_HEAD(&ctx->flexible_groups); + INIT_LIST_HEAD(&ctx->event_list); + atomic_set(&ctx->refcount, 1); +} + +static struct perf_event_context * +alloc_perf_context(struct pmu *pmu, struct task_struct *task) +{ + struct perf_event_context *ctx; + + ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL); + if (!ctx) + return NULL; + + __perf_event_init_context(ctx); + if (task) { + ctx->task = task; + get_task_struct(task); + } + ctx->pmu = pmu; + + return ctx; +} + +static struct task_struct * +find_lively_task_by_vpid(pid_t vpid) +{ + struct task_struct *task; + int err; + + rcu_read_lock(); + if (!vpid) + task = current; + else + task = find_task_by_vpid(vpid); + if (task) + get_task_struct(task); + rcu_read_unlock(); + + if (!task) + return ERR_PTR(-ESRCH); + + /* Reuse ptrace permission checks for now. */ + err = -EACCES; + if (!ptrace_may_access(task, PTRACE_MODE_READ)) + goto errout; + + return task; +errout: + put_task_struct(task); + return ERR_PTR(err); + +} + +/* + * Returns a matching context with refcount and pincount. + */ +static struct perf_event_context * +find_get_context(struct pmu *pmu, struct task_struct *task, int cpu) +{ + struct perf_event_context *ctx; + struct perf_cpu_context *cpuctx; + unsigned long flags; + int ctxn, err; + + if (!task) { + /* Must be root to operate on a CPU event: */ + if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) + return ERR_PTR(-EACCES); + + /* + * We could be clever and allow to attach a event to an + * offline CPU and activate it when the CPU comes up, but + * that's for later. + */ + if (!cpu_online(cpu)) + return ERR_PTR(-ENODEV); + + cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); + ctx = &cpuctx->ctx; + get_ctx(ctx); + ++ctx->pin_count; + + return ctx; + } + + err = -EINVAL; + ctxn = pmu->task_ctx_nr; + if (ctxn < 0) + goto errout; + +retry: + ctx = perf_lock_task_context(task, ctxn, &flags); + if (ctx) { + unclone_ctx(ctx); + ++ctx->pin_count; + raw_spin_unlock_irqrestore(&ctx->lock, flags); + } + + if (!ctx) { + ctx = alloc_perf_context(pmu, task); + err = -ENOMEM; + if (!ctx) + goto errout; + + get_ctx(ctx); + + err = 0; + mutex_lock(&task->perf_event_mutex); + /* + * If it has already passed perf_event_exit_task(). + * we must see PF_EXITING, it takes this mutex too. + */ + if (task->flags & PF_EXITING) + err = -ESRCH; + else if (task->perf_event_ctxp[ctxn]) + err = -EAGAIN; + else { + ++ctx->pin_count; + rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx); + } + mutex_unlock(&task->perf_event_mutex); + + if (unlikely(err)) { + put_task_struct(task); + kfree(ctx); + + if (err == -EAGAIN) + goto retry; + goto errout; + } + } + + return ctx; + +errout: + return ERR_PTR(err); +} + +static void perf_event_free_filter(struct perf_event *event); + +static void free_event_rcu(struct rcu_head *head) +{ + struct perf_event *event; + + event = container_of(head, struct perf_event, rcu_head); + if (event->ns) + put_pid_ns(event->ns); + perf_event_free_filter(event); + kfree(event); +} + +static void perf_buffer_put(struct perf_buffer *buffer); + +static void free_event(struct perf_event *event) +{ + irq_work_sync(&event->pending); + + if (!event->parent) { + if (event->attach_state & PERF_ATTACH_TASK) + jump_label_dec(&perf_sched_events); + if (event->attr.mmap || event->attr.mmap_data) + atomic_dec(&nr_mmap_events); + if (event->attr.comm) + atomic_dec(&nr_comm_events); + if (event->attr.task) + atomic_dec(&nr_task_events); + if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) + put_callchain_buffers(); + if (is_cgroup_event(event)) { + atomic_dec(&per_cpu(perf_cgroup_events, event->cpu)); + jump_label_dec(&perf_sched_events); + } + } + + if (event->buffer) { + perf_buffer_put(event->buffer); + event->buffer = NULL; + } + + if (is_cgroup_event(event)) + perf_detach_cgroup(event); + + if (event->destroy) + event->destroy(event); + + if (event->ctx) + put_ctx(event->ctx); + + call_rcu(&event->rcu_head, free_event_rcu); +} + +int perf_event_release_kernel(struct perf_event *event) +{ + struct perf_event_context *ctx = event->ctx; + + /* + * Remove from the PMU, can't get re-enabled since we got + * here because the last ref went. + */ + perf_event_disable(event); + + WARN_ON_ONCE(ctx->parent_ctx); + /* + * There are two ways this annotation is useful: + * + * 1) there is a lock recursion from perf_event_exit_task + * see the comment there. + * + * 2) there is a lock-inversion with mmap_sem through + * perf_event_read_group(), which takes faults while + * holding ctx->mutex, however this is called after + * the last filedesc died, so there is no possibility + * to trigger the AB-BA case. + */ + mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING); + raw_spin_lock_irq(&ctx->lock); + perf_group_detach(event); + list_del_event(event, ctx); + raw_spin_unlock_irq(&ctx->lock); + mutex_unlock(&ctx->mutex); + + free_event(event); + + return 0; +} +EXPORT_SYMBOL_GPL(perf_event_release_kernel); + +/* + * Called when the last reference to the file is gone. + */ +static int perf_release(struct inode *inode, struct file *file) +{ + struct perf_event *event = file->private_data; + struct task_struct *owner; + + file->private_data = NULL; + + rcu_read_lock(); + owner = ACCESS_ONCE(event->owner); + /* + * Matches the smp_wmb() in perf_event_exit_task(). If we observe + * !owner it means the list deletion is complete and we can indeed + * free this event, otherwise we need to serialize on + * owner->perf_event_mutex. + */ + smp_read_barrier_depends(); + if (owner) { + /* + * Since delayed_put_task_struct() also drops the last + * task reference we can safely take a new reference + * while holding the rcu_read_lock(). + */ + get_task_struct(owner); + } + rcu_read_unlock(); + + if (owner) { + mutex_lock(&owner->perf_event_mutex); + /* + * We have to re-check the event->owner field, if it is cleared + * we raced with perf_event_exit_task(), acquiring the mutex + * ensured they're done, and we can proceed with freeing the + * event. + */ + if (event->owner) + list_del_init(&event->owner_entry); + mutex_unlock(&owner->perf_event_mutex); + put_task_struct(owner); + } + + return perf_event_release_kernel(event); +} + +u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) +{ + struct perf_event *child; + u64 total = 0; + + *enabled = 0; + *running = 0; + + mutex_lock(&event->child_mutex); + total += perf_event_read(event); + *enabled += event->total_time_enabled + + atomic64_read(&event->child_total_time_enabled); + *running += event->total_time_running + + atomic64_read(&event->child_total_time_running); + + list_for_each_entry(child, &event->child_list, child_list) { + total += perf_event_read(child); + *enabled += child->total_time_enabled; + *running += child->total_time_running; + } + mutex_unlock(&event->child_mutex); + + return total; +} +EXPORT_SYMBOL_GPL(perf_event_read_value); + +static int perf_event_read_group(struct perf_event *event, + u64 read_format, char __user *buf) +{ + struct perf_event *leader = event->group_leader, *sub; + int n = 0, size = 0, ret = -EFAULT; + struct perf_event_context *ctx = leader->ctx; + u64 values[5]; + u64 count, enabled, running; + + mutex_lock(&ctx->mutex); + count = perf_event_read_value(leader, &enabled, &running); + + values[n++] = 1 + leader->nr_siblings; + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) + values[n++] = enabled; + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) + values[n++] = running; + values[n++] = count; + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_event_id(leader); + + size = n * sizeof(u64); + + if (copy_to_user(buf, values, size)) + goto unlock; + + ret = size; + + list_for_each_entry(sub, &leader->sibling_list, group_entry) { + n = 0; + + values[n++] = perf_event_read_value(sub, &enabled, &running); + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_event_id(sub); + + size = n * sizeof(u64); + + if (copy_to_user(buf + ret, values, size)) { + ret = -EFAULT; + goto unlock; + } + + ret += size; + } +unlock: + mutex_unlock(&ctx->mutex); + + return ret; +} + +static int perf_event_read_one(struct perf_event *event, + u64 read_format, char __user *buf) +{ + u64 enabled, running; + u64 values[4]; + int n = 0; + + values[n++] = perf_event_read_value(event, &enabled, &running); + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) + values[n++] = enabled; + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) + values[n++] = running; + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_event_id(event); + + if (copy_to_user(buf, values, n * sizeof(u64))) + return -EFAULT; + + return n * sizeof(u64); +} + +/* + * Read the performance event - simple non blocking version for now + */ +static ssize_t +perf_read_hw(struct perf_event *event, char __user *buf, size_t count) +{ + u64 read_format = event->attr.read_format; + int ret; + + /* + * Return end-of-file for a read on a event that is in + * error state (i.e. because it was pinned but it couldn't be + * scheduled on to the CPU at some point). + */ + if (event->state == PERF_EVENT_STATE_ERROR) + return 0; + + if (count < event->read_size) + return -ENOSPC; + + WARN_ON_ONCE(event->ctx->parent_ctx); + if (read_format & PERF_FORMAT_GROUP) + ret = perf_event_read_group(event, read_format, buf); + else + ret = perf_event_read_one(event, read_format, buf); + + return ret; +} + +static ssize_t +perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) +{ + struct perf_event *event = file->private_data; + + return perf_read_hw(event, buf, count); +} + +static unsigned int perf_poll(struct file *file, poll_table *wait) +{ + struct perf_event *event = file->private_data; + struct perf_buffer *buffer; + unsigned int events = POLL_HUP; + + rcu_read_lock(); + buffer = rcu_dereference(event->buffer); + if (buffer) + events = atomic_xchg(&buffer->poll, 0); + rcu_read_unlock(); + + poll_wait(file, &event->waitq, wait); + + return events; +} + +static void perf_event_reset(struct perf_event *event) +{ + (void)perf_event_read(event); + local64_set(&event->count, 0); + perf_event_update_userpage(event); +} + +/* + * Holding the top-level event's child_mutex means that any + * descendant process that has inherited this event will block + * in sync_child_event if it goes to exit, thus satisfying the + * task existence requirements of perf_event_enable/disable. + */ +static void perf_event_for_each_child(struct perf_event *event, + void (*func)(struct perf_event *)) +{ + struct perf_event *child; + + WARN_ON_ONCE(event->ctx->parent_ctx); + mutex_lock(&event->child_mutex); + func(event); + list_for_each_entry(child, &event->child_list, child_list) + func(child); + mutex_unlock(&event->child_mutex); +} + +static void perf_event_for_each(struct perf_event *event, + void (*func)(struct perf_event *)) +{ + struct perf_event_context *ctx = event->ctx; + struct perf_event *sibling; + + WARN_ON_ONCE(ctx->parent_ctx); + mutex_lock(&ctx->mutex); + event = event->group_leader; + + perf_event_for_each_child(event, func); + func(event); + list_for_each_entry(sibling, &event->sibling_list, group_entry) + perf_event_for_each_child(event, func); + mutex_unlock(&ctx->mutex); +} + +static int perf_event_period(struct perf_event *event, u64 __user *arg) +{ + struct perf_event_context *ctx = event->ctx; + int ret = 0; + u64 value; + + if (!is_sampling_event(event)) + return -EINVAL; + + if (copy_from_user(&value, arg, sizeof(value))) + return -EFAULT; + + if (!value) + return -EINVAL; + + raw_spin_lock_irq(&ctx->lock); + if (event->attr.freq) { + if (value > sysctl_perf_event_sample_rate) { + ret = -EINVAL; + goto unlock; + } + + event->attr.sample_freq = value; + } else { + event->attr.sample_period = value; + event->hw.sample_period = value; + } +unlock: + raw_spin_unlock_irq(&ctx->lock); + + return ret; +} + +static const struct file_operations perf_fops; + +static struct perf_event *perf_fget_light(int fd, int *fput_needed) +{ + struct file *file; + + file = fget_light(fd, fput_needed); + if (!file) + return ERR_PTR(-EBADF); + + if (file->f_op != &perf_fops) { + fput_light(file, *fput_needed); + *fput_needed = 0; + return ERR_PTR(-EBADF); + } + + return file->private_data; +} + +static int perf_event_set_output(struct perf_event *event, + struct perf_event *output_event); +static int perf_event_set_filter(struct perf_event *event, void __user *arg); + +static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct perf_event *event = file->private_data; + void (*func)(struct perf_event *); + u32 flags = arg; + + switch (cmd) { + case PERF_EVENT_IOC_ENABLE: + func = perf_event_enable; + break; + case PERF_EVENT_IOC_DISABLE: + func = perf_event_disable; + break; + case PERF_EVENT_IOC_RESET: + func = perf_event_reset; + break; + + case PERF_EVENT_IOC_REFRESH: + return perf_event_refresh(event, arg); + + case PERF_EVENT_IOC_PERIOD: + return perf_event_period(event, (u64 __user *)arg); + + case PERF_EVENT_IOC_SET_OUTPUT: + { + struct perf_event *output_event = NULL; + int fput_needed = 0; + int ret; + + if (arg != -1) { + output_event = perf_fget_light(arg, &fput_needed); + if (IS_ERR(output_event)) + return PTR_ERR(output_event); + } + + ret = perf_event_set_output(event, output_event); + if (output_event) + fput_light(output_event->filp, fput_needed); + + return ret; + } + + case PERF_EVENT_IOC_SET_FILTER: + return perf_event_set_filter(event, (void __user *)arg); + + default: + return -ENOTTY; + } + + if (flags & PERF_IOC_FLAG_GROUP) + perf_event_for_each(event, func); + else + perf_event_for_each_child(event, func); + + return 0; +} + +int perf_event_task_enable(void) +{ + struct perf_event *event; + + mutex_lock(¤t->perf_event_mutex); + list_for_each_entry(event, ¤t->perf_event_list, owner_entry) + perf_event_for_each_child(event, perf_event_enable); + mutex_unlock(¤t->perf_event_mutex); + + return 0; +} + +int perf_event_task_disable(void) +{ + struct perf_event *event; + + mutex_lock(¤t->perf_event_mutex); + list_for_each_entry(event, ¤t->perf_event_list, owner_entry) + perf_event_for_each_child(event, perf_event_disable); + mutex_unlock(¤t->perf_event_mutex); + + return 0; +} + +#ifndef PERF_EVENT_INDEX_OFFSET +# define PERF_EVENT_INDEX_OFFSET 0 +#endif + +static int perf_event_index(struct perf_event *event) +{ + if (event->hw.state & PERF_HES_STOPPED) + return 0; + + if (event->state != PERF_EVENT_STATE_ACTIVE) + return 0; + + return event->hw.idx + 1 - PERF_EVENT_INDEX_OFFSET; +} + +/* + * Callers need to ensure there can be no nesting of this function, otherwise + * the seqlock logic goes bad. We can not serialize this because the arch + * code calls this from NMI context. + */ +void perf_event_update_userpage(struct perf_event *event) +{ + struct perf_event_mmap_page *userpg; + struct perf_buffer *buffer; + + rcu_read_lock(); + buffer = rcu_dereference(event->buffer); + if (!buffer) + goto unlock; + + userpg = buffer->user_page; + + /* + * Disable preemption so as to not let the corresponding user-space + * spin too long if we get preempted. + */ + preempt_disable(); + ++userpg->lock; + barrier(); + userpg->index = perf_event_index(event); + userpg->offset = perf_event_count(event); + if (event->state == PERF_EVENT_STATE_ACTIVE) + userpg->offset -= local64_read(&event->hw.prev_count); + + userpg->time_enabled = event->total_time_enabled + + atomic64_read(&event->child_total_time_enabled); + + userpg->time_running = event->total_time_running + + atomic64_read(&event->child_total_time_running); + + barrier(); + ++userpg->lock; + preempt_enable(); +unlock: + rcu_read_unlock(); +} + +static unsigned long perf_data_size(struct perf_buffer *buffer); + +static void +perf_buffer_init(struct perf_buffer *buffer, long watermark, int flags) +{ + long max_size = perf_data_size(buffer); + + if (watermark) + buffer->watermark = min(max_size, watermark); + + if (!buffer->watermark) + buffer->watermark = max_size / 2; + + if (flags & PERF_BUFFER_WRITABLE) + buffer->writable = 1; + + atomic_set(&buffer->refcount, 1); +} + +#ifndef CONFIG_PERF_USE_VMALLOC + +/* + * Back perf_mmap() with regular GFP_KERNEL-0 pages. + */ + +static struct page * +perf_mmap_to_page(struct perf_buffer *buffer, unsigned long pgoff) +{ + if (pgoff > buffer->nr_pages) + return NULL; + + if (pgoff == 0) + return virt_to_page(buffer->user_page); + + return virt_to_page(buffer->data_pages[pgoff - 1]); +} + +static void *perf_mmap_alloc_page(int cpu) +{ + struct page *page; + int node; + + node = (cpu == -1) ? cpu : cpu_to_node(cpu); + page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); + if (!page) + return NULL; + + return page_address(page); +} + +static struct perf_buffer * +perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags) +{ + struct perf_buffer *buffer; + unsigned long size; + int i; + + size = sizeof(struct perf_buffer); + size += nr_pages * sizeof(void *); + + buffer = kzalloc(size, GFP_KERNEL); + if (!buffer) + goto fail; + + buffer->user_page = perf_mmap_alloc_page(cpu); + if (!buffer->user_page) + goto fail_user_page; + + for (i = 0; i < nr_pages; i++) { + buffer->data_pages[i] = perf_mmap_alloc_page(cpu); + if (!buffer->data_pages[i]) + goto fail_data_pages; + } + + buffer->nr_pages = nr_pages; + + perf_buffer_init(buffer, watermark, flags); + + return buffer; + +fail_data_pages: + for (i--; i >= 0; i--) + free_page((unsigned long)buffer->data_pages[i]); + + free_page((unsigned long)buffer->user_page); + +fail_user_page: + kfree(buffer); + +fail: + return NULL; +} + +static void perf_mmap_free_page(unsigned long addr) +{ + struct page *page = virt_to_page((void *)addr); + + page->mapping = NULL; + __free_page(page); +} + +static void perf_buffer_free(struct perf_buffer *buffer) +{ + int i; + + perf_mmap_free_page((unsigned long)buffer->user_page); + for (i = 0; i < buffer->nr_pages; i++) + perf_mmap_free_page((unsigned long)buffer->data_pages[i]); + kfree(buffer); +} + +static inline int page_order(struct perf_buffer *buffer) +{ + return 0; +} + +#else + +/* + * Back perf_mmap() with vmalloc memory. + * + * Required for architectures that have d-cache aliasing issues. + */ + +static inline int page_order(struct perf_buffer *buffer) +{ + return buffer->page_order; +} + +static struct page * +perf_mmap_to_page(struct perf_buffer *buffer, unsigned long pgoff) +{ + if (pgoff > (1UL << page_order(buffer))) + return NULL; + + return vmalloc_to_page((void *)buffer->user_page + pgoff * PAGE_SIZE); +} + +static void perf_mmap_unmark_page(void *addr) +{ + struct page *page = vmalloc_to_page(addr); + + page->mapping = NULL; +} + +static void perf_buffer_free_work(struct work_struct *work) +{ + struct perf_buffer *buffer; + void *base; + int i, nr; + + buffer = container_of(work, struct perf_buffer, work); + nr = 1 << page_order(buffer); + + base = buffer->user_page; + for (i = 0; i < nr + 1; i++) + perf_mmap_unmark_page(base + (i * PAGE_SIZE)); + + vfree(base); + kfree(buffer); +} + +static void perf_buffer_free(struct perf_buffer *buffer) +{ + schedule_work(&buffer->work); +} + +static struct perf_buffer * +perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags) +{ + struct perf_buffer *buffer; + unsigned long size; + void *all_buf; + + size = sizeof(struct perf_buffer); + size += sizeof(void *); + + buffer = kzalloc(size, GFP_KERNEL); + if (!buffer) + goto fail; + + INIT_WORK(&buffer->work, perf_buffer_free_work); + + all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE); + if (!all_buf) + goto fail_all_buf; + + buffer->user_page = all_buf; + buffer->data_pages[0] = all_buf + PAGE_SIZE; + buffer->page_order = ilog2(nr_pages); + buffer->nr_pages = 1; + + perf_buffer_init(buffer, watermark, flags); + + return buffer; + +fail_all_buf: + kfree(buffer); + +fail: + return NULL; +} + +#endif + +static unsigned long perf_data_size(struct perf_buffer *buffer) +{ + return buffer->nr_pages << (PAGE_SHIFT + page_order(buffer)); +} + +static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct perf_event *event = vma->vm_file->private_data; + struct perf_buffer *buffer; + int ret = VM_FAULT_SIGBUS; + + if (vmf->flags & FAULT_FLAG_MKWRITE) { + if (vmf->pgoff == 0) + ret = 0; + return ret; + } + + rcu_read_lock(); + buffer = rcu_dereference(event->buffer); + if (!buffer) + goto unlock; + + if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE)) + goto unlock; + + vmf->page = perf_mmap_to_page(buffer, vmf->pgoff); + if (!vmf->page) + goto unlock; + + get_page(vmf->page); + vmf->page->mapping = vma->vm_file->f_mapping; + vmf->page->index = vmf->pgoff; + + ret = 0; +unlock: + rcu_read_unlock(); + + return ret; +} + +static void perf_buffer_free_rcu(struct rcu_head *rcu_head) +{ + struct perf_buffer *buffer; + + buffer = container_of(rcu_head, struct perf_buffer, rcu_head); + perf_buffer_free(buffer); +} + +static struct perf_buffer *perf_buffer_get(struct perf_event *event) +{ + struct perf_buffer *buffer; + + rcu_read_lock(); + buffer = rcu_dereference(event->buffer); + if (buffer) { + if (!atomic_inc_not_zero(&buffer->refcount)) + buffer = NULL; + } + rcu_read_unlock(); + + return buffer; +} + +static void perf_buffer_put(struct perf_buffer *buffer) +{ + if (!atomic_dec_and_test(&buffer->refcount)) + return; + + call_rcu(&buffer->rcu_head, perf_buffer_free_rcu); +} + +static void perf_mmap_open(struct vm_area_struct *vma) +{ + struct perf_event *event = vma->vm_file->private_data; + + atomic_inc(&event->mmap_count); +} + +static void perf_mmap_close(struct vm_area_struct *vma) +{ + struct perf_event *event = vma->vm_file->private_data; + + if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) { + unsigned long size = perf_data_size(event->buffer); + struct user_struct *user = event->mmap_user; + struct perf_buffer *buffer = event->buffer; + + atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm); + vma->vm_mm->locked_vm -= event->mmap_locked; + rcu_assign_pointer(event->buffer, NULL); + mutex_unlock(&event->mmap_mutex); + + perf_buffer_put(buffer); + free_uid(user); + } +} + +static const struct vm_operations_struct perf_mmap_vmops = { + .open = perf_mmap_open, + .close = perf_mmap_close, + .fault = perf_mmap_fault, + .page_mkwrite = perf_mmap_fault, +}; + +static int perf_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct perf_event *event = file->private_data; + unsigned long user_locked, user_lock_limit; + struct user_struct *user = current_user(); + unsigned long locked, lock_limit; + struct perf_buffer *buffer; + unsigned long vma_size; + unsigned long nr_pages; + long user_extra, extra; + int ret = 0, flags = 0; + + /* + * Don't allow mmap() of inherited per-task counters. This would + * create a performance issue due to all children writing to the + * same buffer. + */ + if (event->cpu == -1 && event->attr.inherit) + return -EINVAL; + + if (!(vma->vm_flags & VM_SHARED)) + return -EINVAL; + + vma_size = vma->vm_end - vma->vm_start; + nr_pages = (vma_size / PAGE_SIZE) - 1; + + /* + * If we have buffer pages ensure they're a power-of-two number, so we + * can do bitmasks instead of modulo. + */ + if (nr_pages != 0 && !is_power_of_2(nr_pages)) + return -EINVAL; + + if (vma_size != PAGE_SIZE * (1 + nr_pages)) + return -EINVAL; + + if (vma->vm_pgoff != 0) + return -EINVAL; + + WARN_ON_ONCE(event->ctx->parent_ctx); + mutex_lock(&event->mmap_mutex); + if (event->buffer) { + if (event->buffer->nr_pages == nr_pages) + atomic_inc(&event->buffer->refcount); + else + ret = -EINVAL; + goto unlock; + } + + user_extra = nr_pages + 1; + user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10); + + /* + * Increase the limit linearly with more CPUs: + */ + user_lock_limit *= num_online_cpus(); + + user_locked = atomic_long_read(&user->locked_vm) + user_extra; + + extra = 0; + if (user_locked > user_lock_limit) + extra = user_locked - user_lock_limit; + + lock_limit = rlimit(RLIMIT_MEMLOCK); + lock_limit >>= PAGE_SHIFT; + locked = vma->vm_mm->locked_vm + extra; + + if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() && + !capable(CAP_IPC_LOCK)) { + ret = -EPERM; + goto unlock; + } + + WARN_ON(event->buffer); + + if (vma->vm_flags & VM_WRITE) + flags |= PERF_BUFFER_WRITABLE; + + buffer = perf_buffer_alloc(nr_pages, event->attr.wakeup_watermark, + event->cpu, flags); + if (!buffer) { + ret = -ENOMEM; + goto unlock; + } + rcu_assign_pointer(event->buffer, buffer); + + atomic_long_add(user_extra, &user->locked_vm); + event->mmap_locked = extra; + event->mmap_user = get_current_user(); + vma->vm_mm->locked_vm += event->mmap_locked; + +unlock: + if (!ret) + atomic_inc(&event->mmap_count); + mutex_unlock(&event->mmap_mutex); + + vma->vm_flags |= VM_RESERVED; + vma->vm_ops = &perf_mmap_vmops; + + return ret; +} + +static int perf_fasync(int fd, struct file *filp, int on) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + struct perf_event *event = filp->private_data; + int retval; + + mutex_lock(&inode->i_mutex); + retval = fasync_helper(fd, filp, on, &event->fasync); + mutex_unlock(&inode->i_mutex); + + if (retval < 0) + return retval; + + return 0; +} + +static const struct file_operations perf_fops = { + .llseek = no_llseek, + .release = perf_release, + .read = perf_read, + .poll = perf_poll, + .unlocked_ioctl = perf_ioctl, + .compat_ioctl = perf_ioctl, + .mmap = perf_mmap, + .fasync = perf_fasync, +}; + +/* + * Perf event wakeup + * + * If there's data, ensure we set the poll() state and publish everything + * to user-space before waking everybody up. + */ + +void perf_event_wakeup(struct perf_event *event) +{ + wake_up_all(&event->waitq); + + if (event->pending_kill) { + kill_fasync(&event->fasync, SIGIO, event->pending_kill); + event->pending_kill = 0; + } +} + +static void perf_pending_event(struct irq_work *entry) +{ + struct perf_event *event = container_of(entry, + struct perf_event, pending); + + if (event->pending_disable) { + event->pending_disable = 0; + __perf_event_disable(event); + } + + if (event->pending_wakeup) { + event->pending_wakeup = 0; + perf_event_wakeup(event); + } +} + +/* + * We assume there is only KVM supporting the callbacks. + * Later on, we might change it to a list if there is + * another virtualization implementation supporting the callbacks. + */ +struct perf_guest_info_callbacks *perf_guest_cbs; + +int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) +{ + perf_guest_cbs = cbs; + return 0; +} +EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks); + +int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) +{ + perf_guest_cbs = NULL; + return 0; +} +EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks); + +/* + * Output + */ +static bool perf_output_space(struct perf_buffer *buffer, unsigned long tail, + unsigned long offset, unsigned long head) +{ + unsigned long mask; + + if (!buffer->writable) + return true; + + mask = perf_data_size(buffer) - 1; + + offset = (offset - tail) & mask; + head = (head - tail) & mask; + + if ((int)(head - offset) < 0) + return false; + + return true; +} + +static void perf_output_wakeup(struct perf_output_handle *handle) +{ + atomic_set(&handle->buffer->poll, POLL_IN); + + if (handle->nmi) { + handle->event->pending_wakeup = 1; + irq_work_queue(&handle->event->pending); + } else + perf_event_wakeup(handle->event); +} + +/* + * We need to ensure a later event_id doesn't publish a head when a former + * event isn't done writing. However since we need to deal with NMIs we + * cannot fully serialize things. + * + * We only publish the head (and generate a wakeup) when the outer-most + * event completes. + */ +static void perf_output_get_handle(struct perf_output_handle *handle) +{ + struct perf_buffer *buffer = handle->buffer; + + preempt_disable(); + local_inc(&buffer->nest); + handle->wakeup = local_read(&buffer->wakeup); +} + +static void perf_output_put_handle(struct perf_output_handle *handle) +{ + struct perf_buffer *buffer = handle->buffer; + unsigned long head; + +again: + head = local_read(&buffer->head); + + /* + * IRQ/NMI can happen here, which means we can miss a head update. + */ + + if (!local_dec_and_test(&buffer->nest)) + goto out; + + /* + * Publish the known good head. Rely on the full barrier implied + * by atomic_dec_and_test() order the buffer->head read and this + * write. + */ + buffer->user_page->data_head = head; + + /* + * Now check if we missed an update, rely on the (compiler) + * barrier in atomic_dec_and_test() to re-read buffer->head. + */ + if (unlikely(head != local_read(&buffer->head))) { + local_inc(&buffer->nest); + goto again; + } + + if (handle->wakeup != local_read(&buffer->wakeup)) + perf_output_wakeup(handle); + +out: + preempt_enable(); +} + +__always_inline void perf_output_copy(struct perf_output_handle *handle, + const void *buf, unsigned int len) +{ + do { + unsigned long size = min_t(unsigned long, handle->size, len); + + memcpy(handle->addr, buf, size); + + len -= size; + handle->addr += size; + buf += size; + handle->size -= size; + if (!handle->size) { + struct perf_buffer *buffer = handle->buffer; + + handle->page++; + handle->page &= buffer->nr_pages - 1; + handle->addr = buffer->data_pages[handle->page]; + handle->size = PAGE_SIZE << page_order(buffer); + } + } while (len); +} + +static void __perf_event_header__init_id(struct perf_event_header *header, + struct perf_sample_data *data, + struct perf_event *event) +{ + u64 sample_type = event->attr.sample_type; + + data->type = sample_type; + header->size += event->id_header_size; + + if (sample_type & PERF_SAMPLE_TID) { + /* namespace issues */ + data->tid_entry.pid = perf_event_pid(event, current); + data->tid_entry.tid = perf_event_tid(event, current); + } + + if (sample_type & PERF_SAMPLE_TIME) + data->time = perf_clock(); + + if (sample_type & PERF_SAMPLE_ID) + data->id = primary_event_id(event); + + if (sample_type & PERF_SAMPLE_STREAM_ID) + data->stream_id = event->id; + + if (sample_type & PERF_SAMPLE_CPU) { + data->cpu_entry.cpu = raw_smp_processor_id(); + data->cpu_entry.reserved = 0; + } +} + +static void perf_event_header__init_id(struct perf_event_header *header, + struct perf_sample_data *data, + struct perf_event *event) +{ + if (event->attr.sample_id_all) + __perf_event_header__init_id(header, data, event); +} + +static void __perf_event__output_id_sample(struct perf_output_handle *handle, + struct perf_sample_data *data) +{ + u64 sample_type = data->type; + + if (sample_type & PERF_SAMPLE_TID) + perf_output_put(handle, data->tid_entry); + + if (sample_type & PERF_SAMPLE_TIME) + perf_output_put(handle, data->time); + + if (sample_type & PERF_SAMPLE_ID) + perf_output_put(handle, data->id); + + if (sample_type & PERF_SAMPLE_STREAM_ID) + perf_output_put(handle, data->stream_id); + + if (sample_type & PERF_SAMPLE_CPU) + perf_output_put(handle, data->cpu_entry); +} + +static void perf_event__output_id_sample(struct perf_event *event, + struct perf_output_handle *handle, + struct perf_sample_data *sample) +{ + if (event->attr.sample_id_all) + __perf_event__output_id_sample(handle, sample); +} + +int perf_output_begin(struct perf_output_handle *handle, + struct perf_event *event, unsigned int size, + int nmi, int sample) +{ + struct perf_buffer *buffer; + unsigned long tail, offset, head; + int have_lost; + struct perf_sample_data sample_data; + struct { + struct perf_event_header header; + u64 id; + u64 lost; + } lost_event; + + rcu_read_lock(); + /* + * For inherited events we send all the output towards the parent. + */ + if (event->parent) + event = event->parent; + + buffer = rcu_dereference(event->buffer); + if (!buffer) + goto out; + + handle->buffer = buffer; + handle->event = event; + handle->nmi = nmi; + handle->sample = sample; + + if (!buffer->nr_pages) + goto out; + + have_lost = local_read(&buffer->lost); + if (have_lost) { + lost_event.header.size = sizeof(lost_event); + perf_event_header__init_id(&lost_event.header, &sample_data, + event); + size += lost_event.header.size; + } + + perf_output_get_handle(handle); + + do { + /* + * Userspace could choose to issue a mb() before updating the + * tail pointer. So that all reads will be completed before the + * write is issued. + */ + tail = ACCESS_ONCE(buffer->user_page->data_tail); + smp_rmb(); + offset = head = local_read(&buffer->head); + head += size; + if (unlikely(!perf_output_space(buffer, tail, offset, head))) + goto fail; + } while (local_cmpxchg(&buffer->head, offset, head) != offset); + + if (head - local_read(&buffer->wakeup) > buffer->watermark) + local_add(buffer->watermark, &buffer->wakeup); + + handle->page = offset >> (PAGE_SHIFT + page_order(buffer)); + handle->page &= buffer->nr_pages - 1; + handle->size = offset & ((PAGE_SIZE << page_order(buffer)) - 1); + handle->addr = buffer->data_pages[handle->page]; + handle->addr += handle->size; + handle->size = (PAGE_SIZE << page_order(buffer)) - handle->size; + + if (have_lost) { + lost_event.header.type = PERF_RECORD_LOST; + lost_event.header.misc = 0; + lost_event.id = event->id; + lost_event.lost = local_xchg(&buffer->lost, 0); + + perf_output_put(handle, lost_event); + perf_event__output_id_sample(event, handle, &sample_data); + } + + return 0; + +fail: + local_inc(&buffer->lost); + perf_output_put_handle(handle); +out: + rcu_read_unlock(); + + return -ENOSPC; +} + +void perf_output_end(struct perf_output_handle *handle) +{ + struct perf_event *event = handle->event; + struct perf_buffer *buffer = handle->buffer; + + int wakeup_events = event->attr.wakeup_events; + + if (handle->sample && wakeup_events) { + int events = local_inc_return(&buffer->events); + if (events >= wakeup_events) { + local_sub(wakeup_events, &buffer->events); + local_inc(&buffer->wakeup); + } + } + + perf_output_put_handle(handle); + rcu_read_unlock(); +} + +static void perf_output_read_one(struct perf_output_handle *handle, + struct perf_event *event, + u64 enabled, u64 running) +{ + u64 read_format = event->attr.read_format; + u64 values[4]; + int n = 0; + + values[n++] = perf_event_count(event); + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { + values[n++] = enabled + + atomic64_read(&event->child_total_time_enabled); + } + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { + values[n++] = running + + atomic64_read(&event->child_total_time_running); + } + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_event_id(event); + + perf_output_copy(handle, values, n * sizeof(u64)); +} + +/* + * XXX PERF_FORMAT_GROUP vs inherited events seems difficult. + */ +static void perf_output_read_group(struct perf_output_handle *handle, + struct perf_event *event, + u64 enabled, u64 running) +{ + struct perf_event *leader = event->group_leader, *sub; + u64 read_format = event->attr.read_format; + u64 values[5]; + int n = 0; + + values[n++] = 1 + leader->nr_siblings; + + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) + values[n++] = enabled; + + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) + values[n++] = running; + + if (leader != event) + leader->pmu->read(leader); + + values[n++] = perf_event_count(leader); + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_event_id(leader); + + perf_output_copy(handle, values, n * sizeof(u64)); + + list_for_each_entry(sub, &leader->sibling_list, group_entry) { + n = 0; + + if (sub != event) + sub->pmu->read(sub); + + values[n++] = perf_event_count(sub); + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_event_id(sub); + + perf_output_copy(handle, values, n * sizeof(u64)); + } +} + +#define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\ + PERF_FORMAT_TOTAL_TIME_RUNNING) + +static void perf_output_read(struct perf_output_handle *handle, + struct perf_event *event) +{ + u64 enabled = 0, running = 0, now, ctx_time; + u64 read_format = event->attr.read_format; + + /* + * compute total_time_enabled, total_time_running + * based on snapshot values taken when the event + * was last scheduled in. + * + * we cannot simply called update_context_time() + * because of locking issue as we are called in + * NMI context + */ + if (read_format & PERF_FORMAT_TOTAL_TIMES) { + now = perf_clock(); + ctx_time = event->shadow_ctx_time + now; + enabled = ctx_time - event->tstamp_enabled; + running = ctx_time - event->tstamp_running; + } + + if (event->attr.read_format & PERF_FORMAT_GROUP) + perf_output_read_group(handle, event, enabled, running); + else + perf_output_read_one(handle, event, enabled, running); +} + +void perf_output_sample(struct perf_output_handle *handle, + struct perf_event_header *header, + struct perf_sample_data *data, + struct perf_event *event) +{ + u64 sample_type = data->type; + + perf_output_put(handle, *header); + + if (sample_type & PERF_SAMPLE_IP) + perf_output_put(handle, data->ip); + + if (sample_type & PERF_SAMPLE_TID) + perf_output_put(handle, data->tid_entry); + + if (sample_type & PERF_SAMPLE_TIME) + perf_output_put(handle, data->time); + + if (sample_type & PERF_SAMPLE_ADDR) + perf_output_put(handle, data->addr); + + if (sample_type & PERF_SAMPLE_ID) + perf_output_put(handle, data->id); + + if (sample_type & PERF_SAMPLE_STREAM_ID) + perf_output_put(handle, data->stream_id); + + if (sample_type & PERF_SAMPLE_CPU) + perf_output_put(handle, data->cpu_entry); + + if (sample_type & PERF_SAMPLE_PERIOD) + perf_output_put(handle, data->period); + + if (sample_type & PERF_SAMPLE_READ) + perf_output_read(handle, event); + + if (sample_type & PERF_SAMPLE_CALLCHAIN) { + if (data->callchain) { + int size = 1; + + if (data->callchain) + size += data->callchain->nr; + + size *= sizeof(u64); + + perf_output_copy(handle, data->callchain, size); + } else { + u64 nr = 0; + perf_output_put(handle, nr); + } + } + + if (sample_type & PERF_SAMPLE_RAW) { + if (data->raw) { + perf_output_put(handle, data->raw->size); + perf_output_copy(handle, data->raw->data, + data->raw->size); + } else { + struct { + u32 size; + u32 data; + } raw = { + .size = sizeof(u32), + .data = 0, + }; + perf_output_put(handle, raw); + } + } +} + +void perf_prepare_sample(struct perf_event_header *header, + struct perf_sample_data *data, + struct perf_event *event, + struct pt_regs *regs) +{ + u64 sample_type = event->attr.sample_type; + + header->type = PERF_RECORD_SAMPLE; + header->size = sizeof(*header) + event->header_size; + + header->misc = 0; + header->misc |= perf_misc_flags(regs); + + __perf_event_header__init_id(header, data, event); + + if (sample_type & PERF_SAMPLE_IP) + data->ip = perf_instruction_pointer(regs); + + if (sample_type & PERF_SAMPLE_CALLCHAIN) { + int size = 1; + + data->callchain = perf_callchain(regs); + + if (data->callchain) + size += data->callchain->nr; + + header->size += size * sizeof(u64); + } + + if (sample_type & PERF_SAMPLE_RAW) { + int size = sizeof(u32); + + if (data->raw) + size += data->raw->size; + else + size += sizeof(u32); + + WARN_ON_ONCE(size & (sizeof(u64)-1)); + header->size += size; + } +} + +static void perf_event_output(struct perf_event *event, int nmi, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + struct perf_output_handle handle; + struct perf_event_header header; + + /* protect the callchain buffers */ + rcu_read_lock(); + + perf_prepare_sample(&header, data, event, regs); + + if (perf_output_begin(&handle, event, header.size, nmi, 1)) + goto exit; + + perf_output_sample(&handle, &header, data, event); + + perf_output_end(&handle); + +exit: + rcu_read_unlock(); +} + +/* + * read event_id + */ + +struct perf_read_event { + struct perf_event_header header; + + u32 pid; + u32 tid; +}; + +static void +perf_event_read_event(struct perf_event *event, + struct task_struct *task) +{ + struct perf_output_handle handle; + struct perf_sample_data sample; + struct perf_read_event read_event = { + .header = { + .type = PERF_RECORD_READ, + .misc = 0, + .size = sizeof(read_event) + event->read_size, + }, + .pid = perf_event_pid(event, task), + .tid = perf_event_tid(event, task), + }; + int ret; + + perf_event_header__init_id(&read_event.header, &sample, event); + ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0); + if (ret) + return; + + perf_output_put(&handle, read_event); + perf_output_read(&handle, event); + perf_event__output_id_sample(event, &handle, &sample); + + perf_output_end(&handle); +} + +/* + * task tracking -- fork/exit + * + * enabled by: attr.comm | attr.mmap | attr.mmap_data | attr.task + */ + +struct perf_task_event { + struct task_struct *task; + struct perf_event_context *task_ctx; + + struct { + struct perf_event_header header; + + u32 pid; + u32 ppid; + u32 tid; + u32 ptid; + u64 time; + } event_id; +}; + +static void perf_event_task_output(struct perf_event *event, + struct perf_task_event *task_event) +{ + struct perf_output_handle handle; + struct perf_sample_data sample; + struct task_struct *task = task_event->task; + int ret, size = task_event->event_id.header.size; + + perf_event_header__init_id(&task_event->event_id.header, &sample, event); + + ret = perf_output_begin(&handle, event, + task_event->event_id.header.size, 0, 0); + if (ret) + goto out; + + task_event->event_id.pid = perf_event_pid(event, task); + task_event->event_id.ppid = perf_event_pid(event, current); + + task_event->event_id.tid = perf_event_tid(event, task); + task_event->event_id.ptid = perf_event_tid(event, current); + + perf_output_put(&handle, task_event->event_id); + + perf_event__output_id_sample(event, &handle, &sample); + + perf_output_end(&handle); +out: + task_event->event_id.header.size = size; +} + +static int perf_event_task_match(struct perf_event *event) +{ + if (event->state < PERF_EVENT_STATE_INACTIVE) + return 0; + + if (!event_filter_match(event)) + return 0; + + if (event->attr.comm || event->attr.mmap || + event->attr.mmap_data || event->attr.task) + return 1; + + return 0; +} + +static void perf_event_task_ctx(struct perf_event_context *ctx, + struct perf_task_event *task_event) +{ + struct perf_event *event; + + list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { + if (perf_event_task_match(event)) + perf_event_task_output(event, task_event); + } +} + +static void perf_event_task_event(struct perf_task_event *task_event) +{ + struct perf_cpu_context *cpuctx; + struct perf_event_context *ctx; + struct pmu *pmu; + int ctxn; + + rcu_read_lock(); + list_for_each_entry_rcu(pmu, &pmus, entry) { + cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); + if (cpuctx->active_pmu != pmu) + goto next; + perf_event_task_ctx(&cpuctx->ctx, task_event); + + ctx = task_event->task_ctx; + if (!ctx) { + ctxn = pmu->task_ctx_nr; + if (ctxn < 0) + goto next; + ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); + } + if (ctx) + perf_event_task_ctx(ctx, task_event); +next: + put_cpu_ptr(pmu->pmu_cpu_context); + } + rcu_read_unlock(); +} + +static void perf_event_task(struct task_struct *task, + struct perf_event_context *task_ctx, + int new) +{ + struct perf_task_event task_event; + + if (!atomic_read(&nr_comm_events) && + !atomic_read(&nr_mmap_events) && + !atomic_read(&nr_task_events)) + return; + + task_event = (struct perf_task_event){ + .task = task, + .task_ctx = task_ctx, + .event_id = { + .header = { + .type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT, + .misc = 0, + .size = sizeof(task_event.event_id), + }, + /* .pid */ + /* .ppid */ + /* .tid */ + /* .ptid */ + .time = perf_clock(), + }, + }; + + perf_event_task_event(&task_event); +} + +void perf_event_fork(struct task_struct *task) +{ + perf_event_task(task, NULL, 1); +} + +/* + * comm tracking + */ + +struct perf_comm_event { + struct task_struct *task; + char *comm; + int comm_size; + + struct { + struct perf_event_header header; + + u32 pid; + u32 tid; + } event_id; +}; + +static void perf_event_comm_output(struct perf_event *event, + struct perf_comm_event *comm_event) +{ + struct perf_output_handle handle; + struct perf_sample_data sample; + int size = comm_event->event_id.header.size; + int ret; + + perf_event_header__init_id(&comm_event->event_id.header, &sample, event); + ret = perf_output_begin(&handle, event, + comm_event->event_id.header.size, 0, 0); + + if (ret) + goto out; + + comm_event->event_id.pid = perf_event_pid(event, comm_event->task); + comm_event->event_id.tid = perf_event_tid(event, comm_event->task); + + perf_output_put(&handle, comm_event->event_id); + perf_output_copy(&handle, comm_event->comm, + comm_event->comm_size); + + perf_event__output_id_sample(event, &handle, &sample); + + perf_output_end(&handle); +out: + comm_event->event_id.header.size = size; +} + +static int perf_event_comm_match(struct perf_event *event) +{ + if (event->state < PERF_EVENT_STATE_INACTIVE) + return 0; + + if (!event_filter_match(event)) + return 0; + + if (event->attr.comm) + return 1; + + return 0; +} + +static void perf_event_comm_ctx(struct perf_event_context *ctx, + struct perf_comm_event *comm_event) +{ + struct perf_event *event; + + list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { + if (perf_event_comm_match(event)) + perf_event_comm_output(event, comm_event); + } +} + +static void perf_event_comm_event(struct perf_comm_event *comm_event) +{ + struct perf_cpu_context *cpuctx; + struct perf_event_context *ctx; + char comm[TASK_COMM_LEN]; + unsigned int size; + struct pmu *pmu; + int ctxn; + + memset(comm, 0, sizeof(comm)); + strlcpy(comm, comm_event->task->comm, sizeof(comm)); + size = ALIGN(strlen(comm)+1, sizeof(u64)); + + comm_event->comm = comm; + comm_event->comm_size = size; + + comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; + rcu_read_lock(); + list_for_each_entry_rcu(pmu, &pmus, entry) { + cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); + if (cpuctx->active_pmu != pmu) + goto next; + perf_event_comm_ctx(&cpuctx->ctx, comm_event); + + ctxn = pmu->task_ctx_nr; + if (ctxn < 0) + goto next; + + ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); + if (ctx) + perf_event_comm_ctx(ctx, comm_event); +next: + put_cpu_ptr(pmu->pmu_cpu_context); + } + rcu_read_unlock(); +} + +void perf_event_comm(struct task_struct *task) +{ + struct perf_comm_event comm_event; + struct perf_event_context *ctx; + int ctxn; + + for_each_task_context_nr(ctxn) { + ctx = task->perf_event_ctxp[ctxn]; + if (!ctx) + continue; + + perf_event_enable_on_exec(ctx); + } + + if (!atomic_read(&nr_comm_events)) + return; + + comm_event = (struct perf_comm_event){ + .task = task, + /* .comm */ + /* .comm_size */ + .event_id = { + .header = { + .type = PERF_RECORD_COMM, + .misc = 0, + /* .size */ + }, + /* .pid */ + /* .tid */ + }, + }; + + perf_event_comm_event(&comm_event); +} + +/* + * mmap tracking + */ + +struct perf_mmap_event { + struct vm_area_struct *vma; + + const char *file_name; + int file_size; + + struct { + struct perf_event_header header; + + u32 pid; + u32 tid; + u64 start; + u64 len; + u64 pgoff; + } event_id; +}; + +static void perf_event_mmap_output(struct perf_event *event, + struct perf_mmap_event *mmap_event) +{ + struct perf_output_handle handle; + struct perf_sample_data sample; + int size = mmap_event->event_id.header.size; + int ret; + + perf_event_header__init_id(&mmap_event->event_id.header, &sample, event); + ret = perf_output_begin(&handle, event, + mmap_event->event_id.header.size, 0, 0); + if (ret) + goto out; + + mmap_event->event_id.pid = perf_event_pid(event, current); + mmap_event->event_id.tid = perf_event_tid(event, current); + + perf_output_put(&handle, mmap_event->event_id); + perf_output_copy(&handle, mmap_event->file_name, + mmap_event->file_size); + + perf_event__output_id_sample(event, &handle, &sample); + + perf_output_end(&handle); +out: + mmap_event->event_id.header.size = size; +} + +static int perf_event_mmap_match(struct perf_event *event, + struct perf_mmap_event *mmap_event, + int executable) +{ + if (event->state < PERF_EVENT_STATE_INACTIVE) + return 0; + + if (!event_filter_match(event)) + return 0; + + if ((!executable && event->attr.mmap_data) || + (executable && event->attr.mmap)) + return 1; + + return 0; +} + +static void perf_event_mmap_ctx(struct perf_event_context *ctx, + struct perf_mmap_event *mmap_event, + int executable) +{ + struct perf_event *event; + + list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { + if (perf_event_mmap_match(event, mmap_event, executable)) + perf_event_mmap_output(event, mmap_event); + } +} + +static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) +{ + struct perf_cpu_context *cpuctx; + struct perf_event_context *ctx; + struct vm_area_struct *vma = mmap_event->vma; + struct file *file = vma->vm_file; + unsigned int size; + char tmp[16]; + char *buf = NULL; + const char *name; + struct pmu *pmu; + int ctxn; + + memset(tmp, 0, sizeof(tmp)); + + if (file) { + /* + * d_path works from the end of the buffer backwards, so we + * need to add enough zero bytes after the string to handle + * the 64bit alignment we do later. + */ + buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL); + if (!buf) { + name = strncpy(tmp, "//enomem", sizeof(tmp)); + goto got_name; + } + name = d_path(&file->f_path, buf, PATH_MAX); + if (IS_ERR(name)) { + name = strncpy(tmp, "//toolong", sizeof(tmp)); + goto got_name; + } + } else { + if (arch_vma_name(mmap_event->vma)) { + name = strncpy(tmp, arch_vma_name(mmap_event->vma), + sizeof(tmp)); + goto got_name; + } + + if (!vma->vm_mm) { + name = strncpy(tmp, "[vdso]", sizeof(tmp)); + goto got_name; + } else if (vma->vm_start <= vma->vm_mm->start_brk && + vma->vm_end >= vma->vm_mm->brk) { + name = strncpy(tmp, "[heap]", sizeof(tmp)); + goto got_name; + } else if (vma->vm_start <= vma->vm_mm->start_stack && + vma->vm_end >= vma->vm_mm->start_stack) { + name = strncpy(tmp, "[stack]", sizeof(tmp)); + goto got_name; + } + + name = strncpy(tmp, "//anon", sizeof(tmp)); + goto got_name; + } + +got_name: + size = ALIGN(strlen(name)+1, sizeof(u64)); + + mmap_event->file_name = name; + mmap_event->file_size = size; + + mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; + + rcu_read_lock(); + list_for_each_entry_rcu(pmu, &pmus, entry) { + cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); + if (cpuctx->active_pmu != pmu) + goto next; + perf_event_mmap_ctx(&cpuctx->ctx, mmap_event, + vma->vm_flags & VM_EXEC); + + ctxn = pmu->task_ctx_nr; + if (ctxn < 0) + goto next; + + ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); + if (ctx) { + perf_event_mmap_ctx(ctx, mmap_event, + vma->vm_flags & VM_EXEC); + } +next: + put_cpu_ptr(pmu->pmu_cpu_context); + } + rcu_read_unlock(); + + kfree(buf); +} + +void perf_event_mmap(struct vm_area_struct *vma) +{ + struct perf_mmap_event mmap_event; + + if (!atomic_read(&nr_mmap_events)) + return; + + mmap_event = (struct perf_mmap_event){ + .vma = vma, + /* .file_name */ + /* .file_size */ + .event_id = { + .header = { + .type = PERF_RECORD_MMAP, + .misc = PERF_RECORD_MISC_USER, + /* .size */ + }, + /* .pid */ + /* .tid */ + .start = vma->vm_start, + .len = vma->vm_end - vma->vm_start, + .pgoff = (u64)vma->vm_pgoff << PAGE_SHIFT, + }, + }; + + perf_event_mmap_event(&mmap_event); +} + +/* + * IRQ throttle logging + */ + +static void perf_log_throttle(struct perf_event *event, int enable) +{ + struct perf_output_handle handle; + struct perf_sample_data sample; + int ret; + + struct { + struct perf_event_header header; + u64 time; + u64 id; + u64 stream_id; + } throttle_event = { + .header = { + .type = PERF_RECORD_THROTTLE, + .misc = 0, + .size = sizeof(throttle_event), + }, + .time = perf_clock(), + .id = primary_event_id(event), + .stream_id = event->id, + }; + + if (enable) + throttle_event.header.type = PERF_RECORD_UNTHROTTLE; + + perf_event_header__init_id(&throttle_event.header, &sample, event); + + ret = perf_output_begin(&handle, event, + throttle_event.header.size, 1, 0); + if (ret) + return; + + perf_output_put(&handle, throttle_event); + perf_event__output_id_sample(event, &handle, &sample); + perf_output_end(&handle); +} + +/* + * Generic event overflow handling, sampling. + */ + +static int __perf_event_overflow(struct perf_event *event, int nmi, + int throttle, struct perf_sample_data *data, + struct pt_regs *regs) +{ + int events = atomic_read(&event->event_limit); + struct hw_perf_event *hwc = &event->hw; + int ret = 0; + + /* + * Non-sampling counters might still use the PMI to fold short + * hardware counters, ignore those. + */ + if (unlikely(!is_sampling_event(event))) + return 0; + + if (unlikely(hwc->interrupts >= max_samples_per_tick)) { + if (throttle) { + hwc->interrupts = MAX_INTERRUPTS; + perf_log_throttle(event, 0); + ret = 1; + } + } else + hwc->interrupts++; + + if (event->attr.freq) { + u64 now = perf_clock(); + s64 delta = now - hwc->freq_time_stamp; + + hwc->freq_time_stamp = now; + + if (delta > 0 && delta < 2*TICK_NSEC) + perf_adjust_period(event, delta, hwc->last_period); + } + + /* + * XXX event_limit might not quite work as expected on inherited + * events + */ + + event->pending_kill = POLL_IN; + if (events && atomic_dec_and_test(&event->event_limit)) { + ret = 1; + event->pending_kill = POLL_HUP; + if (nmi) { + event->pending_disable = 1; + irq_work_queue(&event->pending); + } else + perf_event_disable(event); + } + + if (event->overflow_handler) + event->overflow_handler(event, nmi, data, regs); + else + perf_event_output(event, nmi, data, regs); + + return ret; +} + +int perf_event_overflow(struct perf_event *event, int nmi, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + return __perf_event_overflow(event, nmi, 1, data, regs); +} + +/* + * Generic software event infrastructure + */ + +struct swevent_htable { + struct swevent_hlist *swevent_hlist; + struct mutex hlist_mutex; + int hlist_refcount; + + /* Recursion avoidance in each contexts */ + int recursion[PERF_NR_CONTEXTS]; +}; + +static DEFINE_PER_CPU(struct swevent_htable, swevent_htable); + +/* + * We directly increment event->count and keep a second value in + * event->hw.period_left to count intervals. This period event + * is kept in the range [-sample_period, 0] so that we can use the + * sign as trigger. + */ + +static u64 perf_swevent_set_period(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + u64 period = hwc->last_period; + u64 nr, offset; + s64 old, val; + + hwc->last_period = hwc->sample_period; + +again: + old = val = local64_read(&hwc->period_left); + if (val < 0) + return 0; + + nr = div64_u64(period + val, period); + offset = nr * period; + val -= offset; + if (local64_cmpxchg(&hwc->period_left, old, val) != old) + goto again; + + return nr; +} + +static void perf_swevent_overflow(struct perf_event *event, u64 overflow, + int nmi, struct perf_sample_data *data, + struct pt_regs *regs) +{ + struct hw_perf_event *hwc = &event->hw; + int throttle = 0; + + data->period = event->hw.last_period; + if (!overflow) + overflow = perf_swevent_set_period(event); + + if (hwc->interrupts == MAX_INTERRUPTS) + return; + + for (; overflow; overflow--) { + if (__perf_event_overflow(event, nmi, throttle, + data, regs)) { + /* + * We inhibit the overflow from happening when + * hwc->interrupts == MAX_INTERRUPTS. + */ + break; + } + throttle = 1; + } +} + +static void perf_swevent_event(struct perf_event *event, u64 nr, + int nmi, struct perf_sample_data *data, + struct pt_regs *regs) +{ + struct hw_perf_event *hwc = &event->hw; + + local64_add(nr, &event->count); + + if (!regs) + return; + + if (!is_sampling_event(event)) + return; + + if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) + return perf_swevent_overflow(event, 1, nmi, data, regs); + + if (local64_add_negative(nr, &hwc->period_left)) + return; + + perf_swevent_overflow(event, 0, nmi, data, regs); +} + +static int perf_exclude_event(struct perf_event *event, + struct pt_regs *regs) +{ + if (event->hw.state & PERF_HES_STOPPED) + return 1; + + if (regs) { + if (event->attr.exclude_user && user_mode(regs)) + return 1; + + if (event->attr.exclude_kernel && !user_mode(regs)) + return 1; + } + + return 0; +} + +static int perf_swevent_match(struct perf_event *event, + enum perf_type_id type, + u32 event_id, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + if (event->attr.type != type) + return 0; + + if (event->attr.config != event_id) + return 0; + + if (perf_exclude_event(event, regs)) + return 0; + + return 1; +} + +static inline u64 swevent_hash(u64 type, u32 event_id) +{ + u64 val = event_id | (type << 32); + + return hash_64(val, SWEVENT_HLIST_BITS); +} + +static inline struct hlist_head * +__find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id) +{ + u64 hash = swevent_hash(type, event_id); + + return &hlist->heads[hash]; +} + +/* For the read side: events when they trigger */ +static inline struct hlist_head * +find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id) +{ + struct swevent_hlist *hlist; + + hlist = rcu_dereference(swhash->swevent_hlist); + if (!hlist) + return NULL; + + return __find_swevent_head(hlist, type, event_id); +} + +/* For the event head insertion and removal in the hlist */ +static inline struct hlist_head * +find_swevent_head(struct swevent_htable *swhash, struct perf_event *event) +{ + struct swevent_hlist *hlist; + u32 event_id = event->attr.config; + u64 type = event->attr.type; + + /* + * Event scheduling is always serialized against hlist allocation + * and release. Which makes the protected version suitable here. + * The context lock guarantees that. + */ + hlist = rcu_dereference_protected(swhash->swevent_hlist, + lockdep_is_held(&event->ctx->lock)); + if (!hlist) + return NULL; + + return __find_swevent_head(hlist, type, event_id); +} + +static void do_perf_sw_event(enum perf_type_id type, u32 event_id, + u64 nr, int nmi, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); + struct perf_event *event; + struct hlist_node *node; + struct hlist_head *head; + + rcu_read_lock(); + head = find_swevent_head_rcu(swhash, type, event_id); + if (!head) + goto end; + + hlist_for_each_entry_rcu(event, node, head, hlist_entry) { + if (perf_swevent_match(event, type, event_id, data, regs)) + perf_swevent_event(event, nr, nmi, data, regs); + } +end: + rcu_read_unlock(); +} + +int perf_swevent_get_recursion_context(void) +{ + struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); + + return get_recursion_context(swhash->recursion); +} +EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context); + +inline void perf_swevent_put_recursion_context(int rctx) +{ + struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); + + put_recursion_context(swhash->recursion, rctx); +} + +void __perf_sw_event(u32 event_id, u64 nr, int nmi, + struct pt_regs *regs, u64 addr) +{ + struct perf_sample_data data; + int rctx; + + preempt_disable_notrace(); + rctx = perf_swevent_get_recursion_context(); + if (rctx < 0) + return; + + perf_sample_data_init(&data, addr); + + do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs); + + perf_swevent_put_recursion_context(rctx); + preempt_enable_notrace(); +} + +static void perf_swevent_read(struct perf_event *event) +{ +} + +static int perf_swevent_add(struct perf_event *event, int flags) +{ + struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); + struct hw_perf_event *hwc = &event->hw; + struct hlist_head *head; + + if (is_sampling_event(event)) { + hwc->last_period = hwc->sample_period; + perf_swevent_set_period(event); + } + + hwc->state = !(flags & PERF_EF_START); + + head = find_swevent_head(swhash, event); + if (WARN_ON_ONCE(!head)) + return -EINVAL; + + hlist_add_head_rcu(&event->hlist_entry, head); + + return 0; +} + +static void perf_swevent_del(struct perf_event *event, int flags) +{ + hlist_del_rcu(&event->hlist_entry); +} + +static void perf_swevent_start(struct perf_event *event, int flags) +{ + event->hw.state = 0; +} + +static void perf_swevent_stop(struct perf_event *event, int flags) +{ + event->hw.state = PERF_HES_STOPPED; +} + +/* Deref the hlist from the update side */ +static inline struct swevent_hlist * +swevent_hlist_deref(struct swevent_htable *swhash) +{ + return rcu_dereference_protected(swhash->swevent_hlist, + lockdep_is_held(&swhash->hlist_mutex)); +} + +static void swevent_hlist_release_rcu(struct rcu_head *rcu_head) +{ + struct swevent_hlist *hlist; + + hlist = container_of(rcu_head, struct swevent_hlist, rcu_head); + kfree(hlist); +} + +static void swevent_hlist_release(struct swevent_htable *swhash) +{ + struct swevent_hlist *hlist = swevent_hlist_deref(swhash); + + if (!hlist) + return; + + rcu_assign_pointer(swhash->swevent_hlist, NULL); + call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu); +} + +static void swevent_hlist_put_cpu(struct perf_event *event, int cpu) +{ + struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); + + mutex_lock(&swhash->hlist_mutex); + + if (!--swhash->hlist_refcount) + swevent_hlist_release(swhash); + + mutex_unlock(&swhash->hlist_mutex); +} + +static void swevent_hlist_put(struct perf_event *event) +{ + int cpu; + + if (event->cpu != -1) { + swevent_hlist_put_cpu(event, event->cpu); + return; + } + + for_each_possible_cpu(cpu) + swevent_hlist_put_cpu(event, cpu); +} + +static int swevent_hlist_get_cpu(struct perf_event *event, int cpu) +{ + struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); + int err = 0; + + mutex_lock(&swhash->hlist_mutex); + + if (!swevent_hlist_deref(swhash) && cpu_online(cpu)) { + struct swevent_hlist *hlist; + + hlist = kzalloc(sizeof(*hlist), GFP_KERNEL); + if (!hlist) { + err = -ENOMEM; + goto exit; + } + rcu_assign_pointer(swhash->swevent_hlist, hlist); + } + swhash->hlist_refcount++; +exit: + mutex_unlock(&swhash->hlist_mutex); + + return err; +} + +static int swevent_hlist_get(struct perf_event *event) +{ + int err; + int cpu, failed_cpu; + + if (event->cpu != -1) + return swevent_hlist_get_cpu(event, event->cpu); + + get_online_cpus(); + for_each_possible_cpu(cpu) { + err = swevent_hlist_get_cpu(event, cpu); + if (err) { + failed_cpu = cpu; + goto fail; + } + } + put_online_cpus(); + + return 0; +fail: + for_each_possible_cpu(cpu) { + if (cpu == failed_cpu) + break; + swevent_hlist_put_cpu(event, cpu); + } + + put_online_cpus(); + return err; +} + +struct jump_label_key perf_swevent_enabled[PERF_COUNT_SW_MAX]; + +static void sw_perf_event_destroy(struct perf_event *event) +{ + u64 event_id = event->attr.config; + + WARN_ON(event->parent); + + jump_label_dec(&perf_swevent_enabled[event_id]); + swevent_hlist_put(event); +} + +static int perf_swevent_init(struct perf_event *event) +{ + int event_id = event->attr.config; + + if (event->attr.type != PERF_TYPE_SOFTWARE) + return -ENOENT; + + switch (event_id) { + case PERF_COUNT_SW_CPU_CLOCK: + case PERF_COUNT_SW_TASK_CLOCK: + return -ENOENT; + + default: + break; + } + + if (event_id >= PERF_COUNT_SW_MAX) + return -ENOENT; + + if (!event->parent) { + int err; + + err = swevent_hlist_get(event); + if (err) + return err; + + jump_label_inc(&perf_swevent_enabled[event_id]); + event->destroy = sw_perf_event_destroy; + } + + return 0; +} + +static struct pmu perf_swevent = { + .task_ctx_nr = perf_sw_context, + + .event_init = perf_swevent_init, + .add = perf_swevent_add, + .del = perf_swevent_del, + .start = perf_swevent_start, + .stop = perf_swevent_stop, + .read = perf_swevent_read, +}; + +#ifdef CONFIG_EVENT_TRACING + +static int perf_tp_filter_match(struct perf_event *event, + struct perf_sample_data *data) +{ + void *record = data->raw->data; + + if (likely(!event->filter) || filter_match_preds(event->filter, record)) + return 1; + return 0; +} + +static int perf_tp_event_match(struct perf_event *event, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + if (event->hw.state & PERF_HES_STOPPED) + return 0; + /* + * All tracepoints are from kernel-space. + */ + if (event->attr.exclude_kernel) + return 0; + + if (!perf_tp_filter_match(event, data)) + return 0; + + return 1; +} + +void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, + struct pt_regs *regs, struct hlist_head *head, int rctx) +{ + struct perf_sample_data data; + struct perf_event *event; + struct hlist_node *node; + + struct perf_raw_record raw = { + .size = entry_size, + .data = record, + }; + + perf_sample_data_init(&data, addr); + data.raw = &raw; + + hlist_for_each_entry_rcu(event, node, head, hlist_entry) { + if (perf_tp_event_match(event, &data, regs)) + perf_swevent_event(event, count, 1, &data, regs); + } + + perf_swevent_put_recursion_context(rctx); +} +EXPORT_SYMBOL_GPL(perf_tp_event); + +static void tp_perf_event_destroy(struct perf_event *event) +{ + perf_trace_destroy(event); +} + +static int perf_tp_event_init(struct perf_event *event) +{ + int err; + + if (event->attr.type != PERF_TYPE_TRACEPOINT) + return -ENOENT; + + err = perf_trace_init(event); + if (err) + return err; + + event->destroy = tp_perf_event_destroy; + + return 0; +} + +static struct pmu perf_tracepoint = { + .task_ctx_nr = perf_sw_context, + + .event_init = perf_tp_event_init, + .add = perf_trace_add, + .del = perf_trace_del, + .start = perf_swevent_start, + .stop = perf_swevent_stop, + .read = perf_swevent_read, +}; + +static inline void perf_tp_register(void) +{ + perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT); +} + +static int perf_event_set_filter(struct perf_event *event, void __user *arg) +{ + char *filter_str; + int ret; + + if (event->attr.type != PERF_TYPE_TRACEPOINT) + return -EINVAL; + + filter_str = strndup_user(arg, PAGE_SIZE); + if (IS_ERR(filter_str)) + return PTR_ERR(filter_str); + + ret = ftrace_profile_set_filter(event, event->attr.config, filter_str); + + kfree(filter_str); + return ret; +} + +static void perf_event_free_filter(struct perf_event *event) +{ + ftrace_profile_free_filter(event); +} + +#else + +static inline void perf_tp_register(void) +{ +} + +static int perf_event_set_filter(struct perf_event *event, void __user *arg) +{ + return -ENOENT; +} + +static void perf_event_free_filter(struct perf_event *event) +{ +} + +#endif /* CONFIG_EVENT_TRACING */ + +#ifdef CONFIG_HAVE_HW_BREAKPOINT +void perf_bp_event(struct perf_event *bp, void *data) +{ + struct perf_sample_data sample; + struct pt_regs *regs = data; + + perf_sample_data_init(&sample, bp->attr.bp_addr); + + if (!bp->hw.state && !perf_exclude_event(bp, regs)) + perf_swevent_event(bp, 1, 1, &sample, regs); +} +#endif + +/* + * hrtimer based swevent callback + */ + +static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) +{ + enum hrtimer_restart ret = HRTIMER_RESTART; + struct perf_sample_data data; + struct pt_regs *regs; + struct perf_event *event; + u64 period; + + event = container_of(hrtimer, struct perf_event, hw.hrtimer); + + if (event->state != PERF_EVENT_STATE_ACTIVE) + return HRTIMER_NORESTART; + + event->pmu->read(event); + + perf_sample_data_init(&data, 0); + data.period = event->hw.last_period; + regs = get_irq_regs(); + + if (regs && !perf_exclude_event(event, regs)) { + if (!(event->attr.exclude_idle && current->pid == 0)) + if (perf_event_overflow(event, 0, &data, regs)) + ret = HRTIMER_NORESTART; + } + + period = max_t(u64, 10000, event->hw.sample_period); + hrtimer_forward_now(hrtimer, ns_to_ktime(period)); + + return ret; +} + +static void perf_swevent_start_hrtimer(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + s64 period; + + if (!is_sampling_event(event)) + return; + + period = local64_read(&hwc->period_left); + if (period) { + if (period < 0) + period = 10000; + + local64_set(&hwc->period_left, 0); + } else { + period = max_t(u64, 10000, hwc->sample_period); + } + __hrtimer_start_range_ns(&hwc->hrtimer, + ns_to_ktime(period), 0, + HRTIMER_MODE_REL_PINNED, 0); +} + +static void perf_swevent_cancel_hrtimer(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if (is_sampling_event(event)) { + ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer); + local64_set(&hwc->period_left, ktime_to_ns(remaining)); + + hrtimer_cancel(&hwc->hrtimer); + } +} + +static void perf_swevent_init_hrtimer(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if (!is_sampling_event(event)) + return; + + hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hwc->hrtimer.function = perf_swevent_hrtimer; + + /* + * Since hrtimers have a fixed rate, we can do a static freq->period + * mapping and avoid the whole period adjust feedback stuff. + */ + if (event->attr.freq) { + long freq = event->attr.sample_freq; + + event->attr.sample_period = NSEC_PER_SEC / freq; + hwc->sample_period = event->attr.sample_period; + local64_set(&hwc->period_left, hwc->sample_period); + event->attr.freq = 0; + } +} + +/* + * Software event: cpu wall time clock + */ + +static void cpu_clock_event_update(struct perf_event *event) +{ + s64 prev; + u64 now; + + now = local_clock(); + prev = local64_xchg(&event->hw.prev_count, now); + local64_add(now - prev, &event->count); +} + +static void cpu_clock_event_start(struct perf_event *event, int flags) +{ + local64_set(&event->hw.prev_count, local_clock()); + perf_swevent_start_hrtimer(event); +} + +static void cpu_clock_event_stop(struct perf_event *event, int flags) +{ + perf_swevent_cancel_hrtimer(event); + cpu_clock_event_update(event); +} + +static int cpu_clock_event_add(struct perf_event *event, int flags) +{ + if (flags & PERF_EF_START) + cpu_clock_event_start(event, flags); + + return 0; +} + +static void cpu_clock_event_del(struct perf_event *event, int flags) +{ + cpu_clock_event_stop(event, flags); +} + +static void cpu_clock_event_read(struct perf_event *event) +{ + cpu_clock_event_update(event); +} + +static int cpu_clock_event_init(struct perf_event *event) +{ + if (event->attr.type != PERF_TYPE_SOFTWARE) + return -ENOENT; + + if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK) + return -ENOENT; + + perf_swevent_init_hrtimer(event); + + return 0; +} + +static struct pmu perf_cpu_clock = { + .task_ctx_nr = perf_sw_context, + + .event_init = cpu_clock_event_init, + .add = cpu_clock_event_add, + .del = cpu_clock_event_del, + .start = cpu_clock_event_start, + .stop = cpu_clock_event_stop, + .read = cpu_clock_event_read, +}; + +/* + * Software event: task time clock + */ + +static void task_clock_event_update(struct perf_event *event, u64 now) +{ + u64 prev; + s64 delta; + + prev = local64_xchg(&event->hw.prev_count, now); + delta = now - prev; + local64_add(delta, &event->count); +} + +static void task_clock_event_start(struct perf_event *event, int flags) +{ + local64_set(&event->hw.prev_count, event->ctx->time); + perf_swevent_start_hrtimer(event); +} + +static void task_clock_event_stop(struct perf_event *event, int flags) +{ + perf_swevent_cancel_hrtimer(event); + task_clock_event_update(event, event->ctx->time); +} + +static int task_clock_event_add(struct perf_event *event, int flags) +{ + if (flags & PERF_EF_START) + task_clock_event_start(event, flags); + + return 0; +} + +static void task_clock_event_del(struct perf_event *event, int flags) +{ + task_clock_event_stop(event, PERF_EF_UPDATE); +} + +static void task_clock_event_read(struct perf_event *event) +{ + u64 now = perf_clock(); + u64 delta = now - event->ctx->timestamp; + u64 time = event->ctx->time + delta; + + task_clock_event_update(event, time); +} + +static int task_clock_event_init(struct perf_event *event) +{ + if (event->attr.type != PERF_TYPE_SOFTWARE) + return -ENOENT; + + if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK) + return -ENOENT; + + perf_swevent_init_hrtimer(event); + + return 0; +} + +static struct pmu perf_task_clock = { + .task_ctx_nr = perf_sw_context, + + .event_init = task_clock_event_init, + .add = task_clock_event_add, + .del = task_clock_event_del, + .start = task_clock_event_start, + .stop = task_clock_event_stop, + .read = task_clock_event_read, +}; + +static void perf_pmu_nop_void(struct pmu *pmu) +{ +} + +static int perf_pmu_nop_int(struct pmu *pmu) +{ + return 0; +} + +static void perf_pmu_start_txn(struct pmu *pmu) +{ + perf_pmu_disable(pmu); +} + +static int perf_pmu_commit_txn(struct pmu *pmu) +{ + perf_pmu_enable(pmu); + return 0; +} + +static void perf_pmu_cancel_txn(struct pmu *pmu) +{ + perf_pmu_enable(pmu); +} + +/* + * Ensures all contexts with the same task_ctx_nr have the same + * pmu_cpu_context too. + */ +static void *find_pmu_context(int ctxn) +{ + struct pmu *pmu; + + if (ctxn < 0) + return NULL; + + list_for_each_entry(pmu, &pmus, entry) { + if (pmu->task_ctx_nr == ctxn) + return pmu->pmu_cpu_context; + } + + return NULL; +} + +static void update_pmu_context(struct pmu *pmu, struct pmu *old_pmu) +{ + int cpu; + + for_each_possible_cpu(cpu) { + struct perf_cpu_context *cpuctx; + + cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); + + if (cpuctx->active_pmu == old_pmu) + cpuctx->active_pmu = pmu; + } +} + +static void free_pmu_context(struct pmu *pmu) +{ + struct pmu *i; + + mutex_lock(&pmus_lock); + /* + * Like a real lame refcount. + */ + list_for_each_entry(i, &pmus, entry) { + if (i->pmu_cpu_context == pmu->pmu_cpu_context) { + update_pmu_context(i, pmu); + goto out; + } + } + + free_percpu(pmu->pmu_cpu_context); +out: + mutex_unlock(&pmus_lock); +} +static struct idr pmu_idr; + +static ssize_t +type_show(struct device *dev, struct device_attribute *attr, char *page) +{ + struct pmu *pmu = dev_get_drvdata(dev); + + return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type); +} + +static struct device_attribute pmu_dev_attrs[] = { + __ATTR_RO(type), + __ATTR_NULL, +}; + +static int pmu_bus_running; +static struct bus_type pmu_bus = { + .name = "event_source", + .dev_attrs = pmu_dev_attrs, +}; + +static void pmu_dev_release(struct device *dev) +{ + kfree(dev); +} + +static int pmu_dev_alloc(struct pmu *pmu) +{ + int ret = -ENOMEM; + + pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL); + if (!pmu->dev) + goto out; + + device_initialize(pmu->dev); + ret = dev_set_name(pmu->dev, "%s", pmu->name); + if (ret) + goto free_dev; + + dev_set_drvdata(pmu->dev, pmu); + pmu->dev->bus = &pmu_bus; + pmu->dev->release = pmu_dev_release; + ret = device_add(pmu->dev); + if (ret) + goto free_dev; + +out: + return ret; + +free_dev: + put_device(pmu->dev); + goto out; +} + +static struct lock_class_key cpuctx_mutex; + +int perf_pmu_register(struct pmu *pmu, char *name, int type) +{ + int cpu, ret; + + mutex_lock(&pmus_lock); + ret = -ENOMEM; + pmu->pmu_disable_count = alloc_percpu(int); + if (!pmu->pmu_disable_count) + goto unlock; + + pmu->type = -1; + if (!name) + goto skip_type; + pmu->name = name; + + if (type < 0) { + int err = idr_pre_get(&pmu_idr, GFP_KERNEL); + if (!err) + goto free_pdc; + + err = idr_get_new_above(&pmu_idr, pmu, PERF_TYPE_MAX, &type); + if (err) { + ret = err; + goto free_pdc; + } + } + pmu->type = type; + + if (pmu_bus_running) { + ret = pmu_dev_alloc(pmu); + if (ret) + goto free_idr; + } + +skip_type: + pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr); + if (pmu->pmu_cpu_context) + goto got_cpu_context; + + pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context); + if (!pmu->pmu_cpu_context) + goto free_dev; + + for_each_possible_cpu(cpu) { + struct perf_cpu_context *cpuctx; + + cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); + __perf_event_init_context(&cpuctx->ctx); + lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex); + cpuctx->ctx.type = cpu_context; + cpuctx->ctx.pmu = pmu; + cpuctx->jiffies_interval = 1; + INIT_LIST_HEAD(&cpuctx->rotation_list); + cpuctx->active_pmu = pmu; + } + +got_cpu_context: + if (!pmu->start_txn) { + if (pmu->pmu_enable) { + /* + * If we have pmu_enable/pmu_disable calls, install + * transaction stubs that use that to try and batch + * hardware accesses. + */ + pmu->start_txn = perf_pmu_start_txn; + pmu->commit_txn = perf_pmu_commit_txn; + pmu->cancel_txn = perf_pmu_cancel_txn; + } else { + pmu->start_txn = perf_pmu_nop_void; + pmu->commit_txn = perf_pmu_nop_int; + pmu->cancel_txn = perf_pmu_nop_void; + } + } + + if (!pmu->pmu_enable) { + pmu->pmu_enable = perf_pmu_nop_void; + pmu->pmu_disable = perf_pmu_nop_void; + } + + list_add_rcu(&pmu->entry, &pmus); + ret = 0; +unlock: + mutex_unlock(&pmus_lock); + + return ret; + +free_dev: + device_del(pmu->dev); + put_device(pmu->dev); + +free_idr: + if (pmu->type >= PERF_TYPE_MAX) + idr_remove(&pmu_idr, pmu->type); + +free_pdc: + free_percpu(pmu->pmu_disable_count); + goto unlock; +} + +void perf_pmu_unregister(struct pmu *pmu) +{ + mutex_lock(&pmus_lock); + list_del_rcu(&pmu->entry); + mutex_unlock(&pmus_lock); + + /* + * We dereference the pmu list under both SRCU and regular RCU, so + * synchronize against both of those. + */ + synchronize_srcu(&pmus_srcu); + synchronize_rcu(); + + free_percpu(pmu->pmu_disable_count); + if (pmu->type >= PERF_TYPE_MAX) + idr_remove(&pmu_idr, pmu->type); + device_del(pmu->dev); + put_device(pmu->dev); + free_pmu_context(pmu); +} + +struct pmu *perf_init_event(struct perf_event *event) +{ + struct pmu *pmu = NULL; + int idx; + int ret; + + idx = srcu_read_lock(&pmus_srcu); + + rcu_read_lock(); + pmu = idr_find(&pmu_idr, event->attr.type); + rcu_read_unlock(); + if (pmu) { + ret = pmu->event_init(event); + if (ret) + pmu = ERR_PTR(ret); + goto unlock; + } + + list_for_each_entry_rcu(pmu, &pmus, entry) { + ret = pmu->event_init(event); + if (!ret) + goto unlock; + + if (ret != -ENOENT) { + pmu = ERR_PTR(ret); + goto unlock; + } + } + pmu = ERR_PTR(-ENOENT); +unlock: + srcu_read_unlock(&pmus_srcu, idx); + + return pmu; +} + +/* + * Allocate and initialize a event structure + */ +static struct perf_event * +perf_event_alloc(struct perf_event_attr *attr, int cpu, + struct task_struct *task, + struct perf_event *group_leader, + struct perf_event *parent_event, + perf_overflow_handler_t overflow_handler) +{ + struct pmu *pmu; + struct perf_event *event; + struct hw_perf_event *hwc; + long err; + + if ((unsigned)cpu >= nr_cpu_ids) { + if (!task || cpu != -1) + return ERR_PTR(-EINVAL); + } + + event = kzalloc(sizeof(*event), GFP_KERNEL); + if (!event) + return ERR_PTR(-ENOMEM); + + /* + * Single events are their own group leaders, with an + * empty sibling list: + */ + if (!group_leader) + group_leader = event; + + mutex_init(&event->child_mutex); + INIT_LIST_HEAD(&event->child_list); + + INIT_LIST_HEAD(&event->group_entry); + INIT_LIST_HEAD(&event->event_entry); + INIT_LIST_HEAD(&event->sibling_list); + init_waitqueue_head(&event->waitq); + init_irq_work(&event->pending, perf_pending_event); + + mutex_init(&event->mmap_mutex); + + event->cpu = cpu; + event->attr = *attr; + event->group_leader = group_leader; + event->pmu = NULL; + event->oncpu = -1; + + event->parent = parent_event; + + event->ns = get_pid_ns(current->nsproxy->pid_ns); + event->id = atomic64_inc_return(&perf_event_id); + + event->state = PERF_EVENT_STATE_INACTIVE; + + if (task) { + event->attach_state = PERF_ATTACH_TASK; +#ifdef CONFIG_HAVE_HW_BREAKPOINT + /* + * hw_breakpoint is a bit difficult here.. + */ + if (attr->type == PERF_TYPE_BREAKPOINT) + event->hw.bp_target = task; +#endif + } + + if (!overflow_handler && parent_event) + overflow_handler = parent_event->overflow_handler; + + event->overflow_handler = overflow_handler; + + if (attr->disabled) + event->state = PERF_EVENT_STATE_OFF; + + pmu = NULL; + + hwc = &event->hw; + hwc->sample_period = attr->sample_period; + if (attr->freq && attr->sample_freq) + hwc->sample_period = 1; + hwc->last_period = hwc->sample_period; + + local64_set(&hwc->period_left, hwc->sample_period); + + /* + * we currently do not support PERF_FORMAT_GROUP on inherited events + */ + if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP)) + goto done; + + pmu = perf_init_event(event); + +done: + err = 0; + if (!pmu) + err = -EINVAL; + else if (IS_ERR(pmu)) + err = PTR_ERR(pmu); + + if (err) { + if (event->ns) + put_pid_ns(event->ns); + kfree(event); + return ERR_PTR(err); + } + + event->pmu = pmu; + + if (!event->parent) { + if (event->attach_state & PERF_ATTACH_TASK) + jump_label_inc(&perf_sched_events); + if (event->attr.mmap || event->attr.mmap_data) + atomic_inc(&nr_mmap_events); + if (event->attr.comm) + atomic_inc(&nr_comm_events); + if (event->attr.task) + atomic_inc(&nr_task_events); + if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) { + err = get_callchain_buffers(); + if (err) { + free_event(event); + return ERR_PTR(err); + } + } + } + + return event; +} + +static int perf_copy_attr(struct perf_event_attr __user *uattr, + struct perf_event_attr *attr) +{ + u32 size; + int ret; + + if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0)) + return -EFAULT; + + /* + * zero the full structure, so that a short copy will be nice. + */ + memset(attr, 0, sizeof(*attr)); + + ret = get_user(size, &uattr->size); + if (ret) + return ret; + + if (size > PAGE_SIZE) /* silly large */ + goto err_size; + + if (!size) /* abi compat */ + size = PERF_ATTR_SIZE_VER0; + + if (size < PERF_ATTR_SIZE_VER0) + goto err_size; + + /* + * If we're handed a bigger struct than we know of, + * ensure all the unknown bits are 0 - i.e. new + * user-space does not rely on any kernel feature + * extensions we dont know about yet. + */ + if (size > sizeof(*attr)) { + unsigned char __user *addr; + unsigned char __user *end; + unsigned char val; + + addr = (void __user *)uattr + sizeof(*attr); + end = (void __user *)uattr + size; + + for (; addr < end; addr++) { + ret = get_user(val, addr); + if (ret) + return ret; + if (val) + goto err_size; + } + size = sizeof(*attr); + } + + ret = copy_from_user(attr, uattr, size); + if (ret) + return -EFAULT; + + /* + * If the type exists, the corresponding creation will verify + * the attr->config. + */ + if (attr->type >= PERF_TYPE_MAX) + return -EINVAL; + + if (attr->__reserved_1) + return -EINVAL; + + if (attr->sample_type & ~(PERF_SAMPLE_MAX-1)) + return -EINVAL; + + if (attr->read_format & ~(PERF_FORMAT_MAX-1)) + return -EINVAL; + +out: + return ret; + +err_size: + put_user(sizeof(*attr), &uattr->size); + ret = -E2BIG; + goto out; +} + +static int +perf_event_set_output(struct perf_event *event, struct perf_event *output_event) +{ + struct perf_buffer *buffer = NULL, *old_buffer = NULL; + int ret = -EINVAL; + + if (!output_event) + goto set; + + /* don't allow circular references */ + if (event == output_event) + goto out; + + /* + * Don't allow cross-cpu buffers + */ + if (output_event->cpu != event->cpu) + goto out; + + /* + * If its not a per-cpu buffer, it must be the same task. + */ + if (output_event->cpu == -1 && output_event->ctx != event->ctx) + goto out; + +set: + mutex_lock(&event->mmap_mutex); + /* Can't redirect output if we've got an active mmap() */ + if (atomic_read(&event->mmap_count)) + goto unlock; + + if (output_event) { + /* get the buffer we want to redirect to */ + buffer = perf_buffer_get(output_event); + if (!buffer) + goto unlock; + } + + old_buffer = event->buffer; + rcu_assign_pointer(event->buffer, buffer); + ret = 0; +unlock: + mutex_unlock(&event->mmap_mutex); + + if (old_buffer) + perf_buffer_put(old_buffer); +out: + return ret; +} + +/** + * sys_perf_event_open - open a performance event, associate it to a task/cpu + * + * @attr_uptr: event_id type attributes for monitoring/sampling + * @pid: target pid + * @cpu: target cpu + * @group_fd: group leader event fd + */ +SYSCALL_DEFINE5(perf_event_open, + struct perf_event_attr __user *, attr_uptr, + pid_t, pid, int, cpu, int, group_fd, unsigned long, flags) +{ + struct perf_event *group_leader = NULL, *output_event = NULL; + struct perf_event *event, *sibling; + struct perf_event_attr attr; + struct perf_event_context *ctx; + struct file *event_file = NULL; + struct file *group_file = NULL; + struct task_struct *task = NULL; + struct pmu *pmu; + int event_fd; + int move_group = 0; + int fput_needed = 0; + int err; + + /* for future expandability... */ + if (flags & ~PERF_FLAG_ALL) + return -EINVAL; + + err = perf_copy_attr(attr_uptr, &attr); + if (err) + return err; + + if (!attr.exclude_kernel) { + if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) + return -EACCES; + } + + if (attr.freq) { + if (attr.sample_freq > sysctl_perf_event_sample_rate) + return -EINVAL; + } + + /* + * In cgroup mode, the pid argument is used to pass the fd + * opened to the cgroup directory in cgroupfs. The cpu argument + * designates the cpu on which to monitor threads from that + * cgroup. + */ + if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1)) + return -EINVAL; + + event_fd = get_unused_fd_flags(O_RDWR); + if (event_fd < 0) + return event_fd; + + if (group_fd != -1) { + group_leader = perf_fget_light(group_fd, &fput_needed); + if (IS_ERR(group_leader)) { + err = PTR_ERR(group_leader); + goto err_fd; + } + group_file = group_leader->filp; + if (flags & PERF_FLAG_FD_OUTPUT) + output_event = group_leader; + if (flags & PERF_FLAG_FD_NO_GROUP) + group_leader = NULL; + } + + if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) { + task = find_lively_task_by_vpid(pid); + if (IS_ERR(task)) { + err = PTR_ERR(task); + goto err_group_fd; + } + } + + event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, NULL); + if (IS_ERR(event)) { + err = PTR_ERR(event); + goto err_task; + } + + if (flags & PERF_FLAG_PID_CGROUP) { + err = perf_cgroup_connect(pid, event, &attr, group_leader); + if (err) + goto err_alloc; + /* + * one more event: + * - that has cgroup constraint on event->cpu + * - that may need work on context switch + */ + atomic_inc(&per_cpu(perf_cgroup_events, event->cpu)); + jump_label_inc(&perf_sched_events); + } + + /* + * Special case software events and allow them to be part of + * any hardware group. + */ + pmu = event->pmu; + + if (group_leader && + (is_software_event(event) != is_software_event(group_leader))) { + if (is_software_event(event)) { + /* + * If event and group_leader are not both a software + * event, and event is, then group leader is not. + * + * Allow the addition of software events to !software + * groups, this is safe because software events never + * fail to schedule. + */ + pmu = group_leader->pmu; + } else if (is_software_event(group_leader) && + (group_leader->group_flags & PERF_GROUP_SOFTWARE)) { + /* + * In case the group is a pure software group, and we + * try to add a hardware event, move the whole group to + * the hardware context. + */ + move_group = 1; + } + } + + /* + * Get the target context (task or percpu): + */ + ctx = find_get_context(pmu, task, cpu); + if (IS_ERR(ctx)) { + err = PTR_ERR(ctx); + goto err_alloc; + } + + if (task) { + put_task_struct(task); + task = NULL; + } + + /* + * Look up the group leader (we will attach this event to it): + */ + if (group_leader) { + err = -EINVAL; + + /* + * Do not allow a recursive hierarchy (this new sibling + * becoming part of another group-sibling): + */ + if (group_leader->group_leader != group_leader) + goto err_context; + /* + * Do not allow to attach to a group in a different + * task or CPU context: + */ + if (move_group) { + if (group_leader->ctx->type != ctx->type) + goto err_context; + } else { + if (group_leader->ctx != ctx) + goto err_context; + } + + /* + * Only a group leader can be exclusive or pinned + */ + if (attr.exclusive || attr.pinned) + goto err_context; + } + + if (output_event) { + err = perf_event_set_output(event, output_event); + if (err) + goto err_context; + } + + event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, O_RDWR); + if (IS_ERR(event_file)) { + err = PTR_ERR(event_file); + goto err_context; + } + + if (move_group) { + struct perf_event_context *gctx = group_leader->ctx; + + mutex_lock(&gctx->mutex); + perf_remove_from_context(group_leader); + list_for_each_entry(sibling, &group_leader->sibling_list, + group_entry) { + perf_remove_from_context(sibling); + put_ctx(gctx); + } + mutex_unlock(&gctx->mutex); + put_ctx(gctx); + } + + event->filp = event_file; + WARN_ON_ONCE(ctx->parent_ctx); + mutex_lock(&ctx->mutex); + + if (move_group) { + perf_install_in_context(ctx, group_leader, cpu); + get_ctx(ctx); + list_for_each_entry(sibling, &group_leader->sibling_list, + group_entry) { + perf_install_in_context(ctx, sibling, cpu); + get_ctx(ctx); + } + } + + perf_install_in_context(ctx, event, cpu); + ++ctx->generation; + perf_unpin_context(ctx); + mutex_unlock(&ctx->mutex); + + event->owner = current; + + mutex_lock(¤t->perf_event_mutex); + list_add_tail(&event->owner_entry, ¤t->perf_event_list); + mutex_unlock(¤t->perf_event_mutex); + + /* + * Precalculate sample_data sizes + */ + perf_event__header_size(event); + perf_event__id_header_size(event); + + /* + * Drop the reference on the group_event after placing the + * new event on the sibling_list. This ensures destruction + * of the group leader will find the pointer to itself in + * perf_group_detach(). + */ + fput_light(group_file, fput_needed); + fd_install(event_fd, event_file); + return event_fd; + +err_context: + perf_unpin_context(ctx); + put_ctx(ctx); +err_alloc: + free_event(event); +err_task: + if (task) + put_task_struct(task); +err_group_fd: + fput_light(group_file, fput_needed); +err_fd: + put_unused_fd(event_fd); + return err; +} + +/** + * perf_event_create_kernel_counter + * + * @attr: attributes of the counter to create + * @cpu: cpu in which the counter is bound + * @task: task to profile (NULL for percpu) + */ +struct perf_event * +perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, + struct task_struct *task, + perf_overflow_handler_t overflow_handler) +{ + struct perf_event_context *ctx; + struct perf_event *event; + int err; + + /* + * Get the target context (task or percpu): + */ + + event = perf_event_alloc(attr, cpu, task, NULL, NULL, overflow_handler); + if (IS_ERR(event)) { + err = PTR_ERR(event); + goto err; + } + + ctx = find_get_context(event->pmu, task, cpu); + if (IS_ERR(ctx)) { + err = PTR_ERR(ctx); + goto err_free; + } + + event->filp = NULL; + WARN_ON_ONCE(ctx->parent_ctx); + mutex_lock(&ctx->mutex); + perf_install_in_context(ctx, event, cpu); + ++ctx->generation; + perf_unpin_context(ctx); + mutex_unlock(&ctx->mutex); + + return event; + +err_free: + free_event(event); +err: + return ERR_PTR(err); +} +EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter); + +static void sync_child_event(struct perf_event *child_event, + struct task_struct *child) +{ + struct perf_event *parent_event = child_event->parent; + u64 child_val; + + if (child_event->attr.inherit_stat) + perf_event_read_event(child_event, child); + + child_val = perf_event_count(child_event); + + /* + * Add back the child's count to the parent's count: + */ + atomic64_add(child_val, &parent_event->child_count); + atomic64_add(child_event->total_time_enabled, + &parent_event->child_total_time_enabled); + atomic64_add(child_event->total_time_running, + &parent_event->child_total_time_running); + + /* + * Remove this event from the parent's list + */ + WARN_ON_ONCE(parent_event->ctx->parent_ctx); + mutex_lock(&parent_event->child_mutex); + list_del_init(&child_event->child_list); + mutex_unlock(&parent_event->child_mutex); + + /* + * Release the parent event, if this was the last + * reference to it. + */ + fput(parent_event->filp); +} + +static void +__perf_event_exit_task(struct perf_event *child_event, + struct perf_event_context *child_ctx, + struct task_struct *child) +{ + if (child_event->parent) { + raw_spin_lock_irq(&child_ctx->lock); + perf_group_detach(child_event); + raw_spin_unlock_irq(&child_ctx->lock); + } + + perf_remove_from_context(child_event); + + /* + * It can happen that the parent exits first, and has events + * that are still around due to the child reference. These + * events need to be zapped. + */ + if (child_event->parent) { + sync_child_event(child_event, child); + free_event(child_event); + } +} + +static void perf_event_exit_task_context(struct task_struct *child, int ctxn) +{ + struct perf_event *child_event, *tmp; + struct perf_event_context *child_ctx; + unsigned long flags; + + if (likely(!child->perf_event_ctxp[ctxn])) { + perf_event_task(child, NULL, 0); + return; + } + + local_irq_save(flags); + /* + * We can't reschedule here because interrupts are disabled, + * and either child is current or it is a task that can't be + * scheduled, so we are now safe from rescheduling changing + * our context. + */ + child_ctx = rcu_dereference_raw(child->perf_event_ctxp[ctxn]); + task_ctx_sched_out(child_ctx, EVENT_ALL); + + /* + * Take the context lock here so that if find_get_context is + * reading child->perf_event_ctxp, we wait until it has + * incremented the context's refcount before we do put_ctx below. + */ + raw_spin_lock(&child_ctx->lock); + child->perf_event_ctxp[ctxn] = NULL; + /* + * If this context is a clone; unclone it so it can't get + * swapped to another process while we're removing all + * the events from it. + */ + unclone_ctx(child_ctx); + update_context_time(child_ctx); + raw_spin_unlock_irqrestore(&child_ctx->lock, flags); + + /* + * Report the task dead after unscheduling the events so that we + * won't get any samples after PERF_RECORD_EXIT. We can however still + * get a few PERF_RECORD_READ events. + */ + perf_event_task(child, child_ctx, 0); + + /* + * We can recurse on the same lock type through: + * + * __perf_event_exit_task() + * sync_child_event() + * fput(parent_event->filp) + * perf_release() + * mutex_lock(&ctx->mutex) + * + * But since its the parent context it won't be the same instance. + */ + mutex_lock(&child_ctx->mutex); + +again: + list_for_each_entry_safe(child_event, tmp, &child_ctx->pinned_groups, + group_entry) + __perf_event_exit_task(child_event, child_ctx, child); + + list_for_each_entry_safe(child_event, tmp, &child_ctx->flexible_groups, + group_entry) + __perf_event_exit_task(child_event, child_ctx, child); + + /* + * If the last event was a group event, it will have appended all + * its siblings to the list, but we obtained 'tmp' before that which + * will still point to the list head terminating the iteration. + */ + if (!list_empty(&child_ctx->pinned_groups) || + !list_empty(&child_ctx->flexible_groups)) + goto again; + + mutex_unlock(&child_ctx->mutex); + + put_ctx(child_ctx); +} + +/* + * When a child task exits, feed back event values to parent events. + */ +void perf_event_exit_task(struct task_struct *child) +{ + struct perf_event *event, *tmp; + int ctxn; + + mutex_lock(&child->perf_event_mutex); + list_for_each_entry_safe(event, tmp, &child->perf_event_list, + owner_entry) { + list_del_init(&event->owner_entry); + + /* + * Ensure the list deletion is visible before we clear + * the owner, closes a race against perf_release() where + * we need to serialize on the owner->perf_event_mutex. + */ + smp_wmb(); + event->owner = NULL; + } + mutex_unlock(&child->perf_event_mutex); + + for_each_task_context_nr(ctxn) + perf_event_exit_task_context(child, ctxn); +} + +static void perf_free_event(struct perf_event *event, + struct perf_event_context *ctx) +{ + struct perf_event *parent = event->parent; + + if (WARN_ON_ONCE(!parent)) + return; + + mutex_lock(&parent->child_mutex); + list_del_init(&event->child_list); + mutex_unlock(&parent->child_mutex); + + fput(parent->filp); + + perf_group_detach(event); + list_del_event(event, ctx); + free_event(event); +} + +/* + * free an unexposed, unused context as created by inheritance by + * perf_event_init_task below, used by fork() in case of fail. + */ +void perf_event_free_task(struct task_struct *task) +{ + struct perf_event_context *ctx; + struct perf_event *event, *tmp; + int ctxn; + + for_each_task_context_nr(ctxn) { + ctx = task->perf_event_ctxp[ctxn]; + if (!ctx) + continue; + + mutex_lock(&ctx->mutex); +again: + list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, + group_entry) + perf_free_event(event, ctx); + + list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, + group_entry) + perf_free_event(event, ctx); + + if (!list_empty(&ctx->pinned_groups) || + !list_empty(&ctx->flexible_groups)) + goto again; + + mutex_unlock(&ctx->mutex); + + put_ctx(ctx); + } +} + +void perf_event_delayed_put(struct task_struct *task) +{ + int ctxn; + + for_each_task_context_nr(ctxn) + WARN_ON_ONCE(task->perf_event_ctxp[ctxn]); +} + +/* + * inherit a event from parent task to child task: + */ +static struct perf_event * +inherit_event(struct perf_event *parent_event, + struct task_struct *parent, + struct perf_event_context *parent_ctx, + struct task_struct *child, + struct perf_event *group_leader, + struct perf_event_context *child_ctx) +{ + struct perf_event *child_event; + unsigned long flags; + + /* + * Instead of creating recursive hierarchies of events, + * we link inherited events back to the original parent, + * which has a filp for sure, which we use as the reference + * count: + */ + if (parent_event->parent) + parent_event = parent_event->parent; + + child_event = perf_event_alloc(&parent_event->attr, + parent_event->cpu, + child, + group_leader, parent_event, + NULL); + if (IS_ERR(child_event)) + return child_event; + get_ctx(child_ctx); + + /* + * Make the child state follow the state of the parent event, + * not its attr.disabled bit. We hold the parent's mutex, + * so we won't race with perf_event_{en, dis}able_family. + */ + if (parent_event->state >= PERF_EVENT_STATE_INACTIVE) + child_event->state = PERF_EVENT_STATE_INACTIVE; + else + child_event->state = PERF_EVENT_STATE_OFF; + + if (parent_event->attr.freq) { + u64 sample_period = parent_event->hw.sample_period; + struct hw_perf_event *hwc = &child_event->hw; + + hwc->sample_period = sample_period; + hwc->last_period = sample_period; + + local64_set(&hwc->period_left, sample_period); + } + + child_event->ctx = child_ctx; + child_event->overflow_handler = parent_event->overflow_handler; + + /* + * Precalculate sample_data sizes + */ + perf_event__header_size(child_event); + perf_event__id_header_size(child_event); + + /* + * Link it up in the child's context: + */ + raw_spin_lock_irqsave(&child_ctx->lock, flags); + add_event_to_ctx(child_event, child_ctx); + raw_spin_unlock_irqrestore(&child_ctx->lock, flags); + + /* + * Get a reference to the parent filp - we will fput it + * when the child event exits. This is safe to do because + * we are in the parent and we know that the filp still + * exists and has a nonzero count: + */ + atomic_long_inc(&parent_event->filp->f_count); + + /* + * Link this into the parent event's child list + */ + WARN_ON_ONCE(parent_event->ctx->parent_ctx); + mutex_lock(&parent_event->child_mutex); + list_add_tail(&child_event->child_list, &parent_event->child_list); + mutex_unlock(&parent_event->child_mutex); + + return child_event; +} + +static int inherit_group(struct perf_event *parent_event, + struct task_struct *parent, + struct perf_event_context *parent_ctx, + struct task_struct *child, + struct perf_event_context *child_ctx) +{ + struct perf_event *leader; + struct perf_event *sub; + struct perf_event *child_ctr; + + leader = inherit_event(parent_event, parent, parent_ctx, + child, NULL, child_ctx); + if (IS_ERR(leader)) + return PTR_ERR(leader); + list_for_each_entry(sub, &parent_event->sibling_list, group_entry) { + child_ctr = inherit_event(sub, parent, parent_ctx, + child, leader, child_ctx); + if (IS_ERR(child_ctr)) + return PTR_ERR(child_ctr); + } + return 0; +} + +static int +inherit_task_group(struct perf_event *event, struct task_struct *parent, + struct perf_event_context *parent_ctx, + struct task_struct *child, int ctxn, + int *inherited_all) +{ + int ret; + struct perf_event_context *child_ctx; + + if (!event->attr.inherit) { + *inherited_all = 0; + return 0; + } + + child_ctx = child->perf_event_ctxp[ctxn]; + if (!child_ctx) { + /* + * This is executed from the parent task context, so + * inherit events that have been marked for cloning. + * First allocate and initialize a context for the + * child. + */ + + child_ctx = alloc_perf_context(event->pmu, child); + if (!child_ctx) + return -ENOMEM; + + child->perf_event_ctxp[ctxn] = child_ctx; + } + + ret = inherit_group(event, parent, parent_ctx, + child, child_ctx); + + if (ret) + *inherited_all = 0; + + return ret; +} + +/* + * Initialize the perf_event context in task_struct + */ +int perf_event_init_context(struct task_struct *child, int ctxn) +{ + struct perf_event_context *child_ctx, *parent_ctx; + struct perf_event_context *cloned_ctx; + struct perf_event *event; + struct task_struct *parent = current; + int inherited_all = 1; + unsigned long flags; + int ret = 0; + + if (likely(!parent->perf_event_ctxp[ctxn])) + return 0; + + /* + * If the parent's context is a clone, pin it so it won't get + * swapped under us. + */ + parent_ctx = perf_pin_task_context(parent, ctxn); + + /* + * No need to check if parent_ctx != NULL here; since we saw + * it non-NULL earlier, the only reason for it to become NULL + * is if we exit, and since we're currently in the middle of + * a fork we can't be exiting at the same time. + */ + + /* + * Lock the parent list. No need to lock the child - not PID + * hashed yet and not running, so nobody can access it. + */ + mutex_lock(&parent_ctx->mutex); + + /* + * We dont have to disable NMIs - we are only looking at + * the list, not manipulating it: + */ + list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) { + ret = inherit_task_group(event, parent, parent_ctx, + child, ctxn, &inherited_all); + if (ret) + break; + } + + /* + * We can't hold ctx->lock when iterating the ->flexible_group list due + * to allocations, but we need to prevent rotation because + * rotate_ctx() will change the list from interrupt context. + */ + raw_spin_lock_irqsave(&parent_ctx->lock, flags); + parent_ctx->rotate_disable = 1; + raw_spin_unlock_irqrestore(&parent_ctx->lock, flags); + + list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) { + ret = inherit_task_group(event, parent, parent_ctx, + child, ctxn, &inherited_all); + if (ret) + break; + } + + raw_spin_lock_irqsave(&parent_ctx->lock, flags); + parent_ctx->rotate_disable = 0; + + child_ctx = child->perf_event_ctxp[ctxn]; + + if (child_ctx && inherited_all) { + /* + * Mark the child context as a clone of the parent + * context, or of whatever the parent is a clone of. + * + * Note that if the parent is a clone, the holding of + * parent_ctx->lock avoids it from being uncloned. + */ + cloned_ctx = parent_ctx->parent_ctx; + if (cloned_ctx) { + child_ctx->parent_ctx = cloned_ctx; + child_ctx->parent_gen = parent_ctx->parent_gen; + } else { + child_ctx->parent_ctx = parent_ctx; + child_ctx->parent_gen = parent_ctx->generation; + } + get_ctx(child_ctx->parent_ctx); + } + + raw_spin_unlock_irqrestore(&parent_ctx->lock, flags); + mutex_unlock(&parent_ctx->mutex); + + perf_unpin_context(parent_ctx); + put_ctx(parent_ctx); + + return ret; +} + +/* + * Initialize the perf_event context in task_struct + */ +int perf_event_init_task(struct task_struct *child) +{ + int ctxn, ret; + + memset(child->perf_event_ctxp, 0, sizeof(child->perf_event_ctxp)); + mutex_init(&child->perf_event_mutex); + INIT_LIST_HEAD(&child->perf_event_list); + + for_each_task_context_nr(ctxn) { + ret = perf_event_init_context(child, ctxn); + if (ret) + return ret; + } + + return 0; +} + +static void __init perf_event_init_all_cpus(void) +{ + struct swevent_htable *swhash; + int cpu; + + for_each_possible_cpu(cpu) { + swhash = &per_cpu(swevent_htable, cpu); + mutex_init(&swhash->hlist_mutex); + INIT_LIST_HEAD(&per_cpu(rotation_list, cpu)); + } +} + +static void __cpuinit perf_event_init_cpu(int cpu) +{ + struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); + + mutex_lock(&swhash->hlist_mutex); + if (swhash->hlist_refcount > 0) { + struct swevent_hlist *hlist; + + hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu)); + WARN_ON(!hlist); + rcu_assign_pointer(swhash->swevent_hlist, hlist); + } + mutex_unlock(&swhash->hlist_mutex); +} + +#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC +static void perf_pmu_rotate_stop(struct pmu *pmu) +{ + struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); + + WARN_ON(!irqs_disabled()); + + list_del_init(&cpuctx->rotation_list); +} + +static void __perf_event_exit_context(void *__info) +{ + struct perf_event_context *ctx = __info; + struct perf_event *event, *tmp; + + perf_pmu_rotate_stop(ctx->pmu); + + list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) + __perf_remove_from_context(event); + list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry) + __perf_remove_from_context(event); +} + +static void perf_event_exit_cpu_context(int cpu) +{ + struct perf_event_context *ctx; + struct pmu *pmu; + int idx; + + idx = srcu_read_lock(&pmus_srcu); + list_for_each_entry_rcu(pmu, &pmus, entry) { + ctx = &per_cpu_ptr(pmu->pmu_cpu_context, cpu)->ctx; + + mutex_lock(&ctx->mutex); + smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1); + mutex_unlock(&ctx->mutex); + } + srcu_read_unlock(&pmus_srcu, idx); +} + +static void perf_event_exit_cpu(int cpu) +{ + struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); + + mutex_lock(&swhash->hlist_mutex); + swevent_hlist_release(swhash); + mutex_unlock(&swhash->hlist_mutex); + + perf_event_exit_cpu_context(cpu); +} +#else +static inline void perf_event_exit_cpu(int cpu) { } +#endif + +static int +perf_reboot(struct notifier_block *notifier, unsigned long val, void *v) +{ + int cpu; + + for_each_online_cpu(cpu) + perf_event_exit_cpu(cpu); + + return NOTIFY_OK; +} + +/* + * Run the perf reboot notifier at the very last possible moment so that + * the generic watchdog code runs as long as possible. + */ +static struct notifier_block perf_reboot_notifier = { + .notifier_call = perf_reboot, + .priority = INT_MIN, +}; + +static int __cpuinit +perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) +{ + unsigned int cpu = (long)hcpu; + + switch (action & ~CPU_TASKS_FROZEN) { + + case CPU_UP_PREPARE: + case CPU_DOWN_FAILED: + perf_event_init_cpu(cpu); + break; + + case CPU_UP_CANCELED: + case CPU_DOWN_PREPARE: + perf_event_exit_cpu(cpu); + break; + + default: + break; + } + + return NOTIFY_OK; +} + +void __init perf_event_init(void) +{ + int ret; + + idr_init(&pmu_idr); + + perf_event_init_all_cpus(); + init_srcu_struct(&pmus_srcu); + perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE); + perf_pmu_register(&perf_cpu_clock, NULL, -1); + perf_pmu_register(&perf_task_clock, NULL, -1); + perf_tp_register(); + perf_cpu_notifier(perf_cpu_notify); + register_reboot_notifier(&perf_reboot_notifier); + + ret = init_hw_breakpoint(); + WARN(ret, "hw_breakpoint initialization failed with: %d", ret); +} + +static int __init perf_event_sysfs_init(void) +{ + struct pmu *pmu; + int ret; + + mutex_lock(&pmus_lock); + + ret = bus_register(&pmu_bus); + if (ret) + goto unlock; + + list_for_each_entry(pmu, &pmus, entry) { + if (!pmu->name || pmu->type < 0) + continue; + + ret = pmu_dev_alloc(pmu); + WARN(ret, "Failed to register pmu: %s, reason %d\n", pmu->name, ret); + } + pmu_bus_running = 1; + ret = 0; + +unlock: + mutex_unlock(&pmus_lock); + + return ret; +} +device_initcall(perf_event_sysfs_init); + +#ifdef CONFIG_CGROUP_PERF +static struct cgroup_subsys_state *perf_cgroup_create( + struct cgroup_subsys *ss, struct cgroup *cont) +{ + struct perf_cgroup *jc; + + jc = kzalloc(sizeof(*jc), GFP_KERNEL); + if (!jc) + return ERR_PTR(-ENOMEM); + + jc->info = alloc_percpu(struct perf_cgroup_info); + if (!jc->info) { + kfree(jc); + return ERR_PTR(-ENOMEM); + } + + return &jc->css; +} + +static void perf_cgroup_destroy(struct cgroup_subsys *ss, + struct cgroup *cont) +{ + struct perf_cgroup *jc; + jc = container_of(cgroup_subsys_state(cont, perf_subsys_id), + struct perf_cgroup, css); + free_percpu(jc->info); + kfree(jc); +} + +static int __perf_cgroup_move(void *info) +{ + struct task_struct *task = info; + perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN); + return 0; +} + +static void perf_cgroup_move(struct task_struct *task) +{ + task_function_call(task, __perf_cgroup_move, task); +} + +static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, + struct cgroup *old_cgrp, struct task_struct *task, + bool threadgroup) +{ + perf_cgroup_move(task); + if (threadgroup) { + struct task_struct *c; + rcu_read_lock(); + list_for_each_entry_rcu(c, &task->thread_group, thread_group) { + perf_cgroup_move(c); + } + rcu_read_unlock(); + } +} + +static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, + struct cgroup *old_cgrp, struct task_struct *task) +{ + /* + * cgroup_exit() is called in the copy_process() failure path. + * Ignore this case since the task hasn't ran yet, this avoids + * trying to poke a half freed task state from generic code. + */ + if (!(task->flags & PF_EXITING)) + return; + + perf_cgroup_move(task); +} + +struct cgroup_subsys perf_subsys = { + .name = "perf_event", + .subsys_id = perf_subsys_id, + .create = perf_cgroup_create, + .destroy = perf_cgroup_destroy, + .exit = perf_cgroup_exit, + .attach = perf_cgroup_attach, +}; +#endif /* CONFIG_CGROUP_PERF */ diff --git a/kernel/perf_event.c b/kernel/perf_event.c deleted file mode 100644 index 440bc48..0000000 --- a/kernel/perf_event.c +++ /dev/null @@ -1,7455 +0,0 @@ -/* - * Performance events core code: - * - * Copyright (C) 2008 Thomas Gleixner - * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar - * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra - * Copyright © 2009 Paul Mackerras, IBM Corp. - * - * For licensing details see kernel-base/COPYING - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -struct remote_function_call { - struct task_struct *p; - int (*func)(void *info); - void *info; - int ret; -}; - -static void remote_function(void *data) -{ - struct remote_function_call *tfc = data; - struct task_struct *p = tfc->p; - - if (p) { - tfc->ret = -EAGAIN; - if (task_cpu(p) != smp_processor_id() || !task_curr(p)) - return; - } - - tfc->ret = tfc->func(tfc->info); -} - -/** - * task_function_call - call a function on the cpu on which a task runs - * @p: the task to evaluate - * @func: the function to be called - * @info: the function call argument - * - * Calls the function @func when the task is currently running. This might - * be on the current CPU, which just calls the function directly - * - * returns: @func return value, or - * -ESRCH - when the process isn't running - * -EAGAIN - when the process moved away - */ -static int -task_function_call(struct task_struct *p, int (*func) (void *info), void *info) -{ - struct remote_function_call data = { - .p = p, - .func = func, - .info = info, - .ret = -ESRCH, /* No such (running) process */ - }; - - if (task_curr(p)) - smp_call_function_single(task_cpu(p), remote_function, &data, 1); - - return data.ret; -} - -/** - * cpu_function_call - call a function on the cpu - * @func: the function to be called - * @info: the function call argument - * - * Calls the function @func on the remote cpu. - * - * returns: @func return value or -ENXIO when the cpu is offline - */ -static int cpu_function_call(int cpu, int (*func) (void *info), void *info) -{ - struct remote_function_call data = { - .p = NULL, - .func = func, - .info = info, - .ret = -ENXIO, /* No such CPU */ - }; - - smp_call_function_single(cpu, remote_function, &data, 1); - - return data.ret; -} - -#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\ - PERF_FLAG_FD_OUTPUT |\ - PERF_FLAG_PID_CGROUP) - -enum event_type_t { - EVENT_FLEXIBLE = 0x1, - EVENT_PINNED = 0x2, - EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, -}; - -/* - * perf_sched_events : >0 events exist - * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu - */ -struct jump_label_key perf_sched_events __read_mostly; -static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); - -static atomic_t nr_mmap_events __read_mostly; -static atomic_t nr_comm_events __read_mostly; -static atomic_t nr_task_events __read_mostly; - -static LIST_HEAD(pmus); -static DEFINE_MUTEX(pmus_lock); -static struct srcu_struct pmus_srcu; - -/* - * perf event paranoia level: - * -1 - not paranoid at all - * 0 - disallow raw tracepoint access for unpriv - * 1 - disallow cpu events for unpriv - * 2 - disallow kernel profiling for unpriv - */ -int sysctl_perf_event_paranoid __read_mostly = 1; - -/* Minimum for 512 kiB + 1 user control page */ -int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */ - -/* - * max perf event sample rate - */ -#define DEFAULT_MAX_SAMPLE_RATE 100000 -int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE; -static int max_samples_per_tick __read_mostly = - DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ); - -int perf_proc_update_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) -{ - int ret = proc_dointvec(table, write, buffer, lenp, ppos); - - if (ret || !write) - return ret; - - max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ); - - return 0; -} - -static atomic64_t perf_event_id; - -static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, - enum event_type_t event_type); - -static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, - enum event_type_t event_type, - struct task_struct *task); - -static void update_context_time(struct perf_event_context *ctx); -static u64 perf_event_time(struct perf_event *event); - -void __weak perf_event_print_debug(void) { } - -extern __weak const char *perf_pmu_name(void) -{ - return "pmu"; -} - -static inline u64 perf_clock(void) -{ - return local_clock(); -} - -static inline struct perf_cpu_context * -__get_cpu_context(struct perf_event_context *ctx) -{ - return this_cpu_ptr(ctx->pmu->pmu_cpu_context); -} - -#ifdef CONFIG_CGROUP_PERF - -/* - * Must ensure cgroup is pinned (css_get) before calling - * this function. In other words, we cannot call this function - * if there is no cgroup event for the current CPU context. - */ -static inline struct perf_cgroup * -perf_cgroup_from_task(struct task_struct *task) -{ - return container_of(task_subsys_state(task, perf_subsys_id), - struct perf_cgroup, css); -} - -static inline bool -perf_cgroup_match(struct perf_event *event) -{ - struct perf_event_context *ctx = event->ctx; - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); - - return !event->cgrp || event->cgrp == cpuctx->cgrp; -} - -static inline void perf_get_cgroup(struct perf_event *event) -{ - css_get(&event->cgrp->css); -} - -static inline void perf_put_cgroup(struct perf_event *event) -{ - css_put(&event->cgrp->css); -} - -static inline void perf_detach_cgroup(struct perf_event *event) -{ - perf_put_cgroup(event); - event->cgrp = NULL; -} - -static inline int is_cgroup_event(struct perf_event *event) -{ - return event->cgrp != NULL; -} - -static inline u64 perf_cgroup_event_time(struct perf_event *event) -{ - struct perf_cgroup_info *t; - - t = per_cpu_ptr(event->cgrp->info, event->cpu); - return t->time; -} - -static inline void __update_cgrp_time(struct perf_cgroup *cgrp) -{ - struct perf_cgroup_info *info; - u64 now; - - now = perf_clock(); - - info = this_cpu_ptr(cgrp->info); - - info->time += now - info->timestamp; - info->timestamp = now; -} - -static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx) -{ - struct perf_cgroup *cgrp_out = cpuctx->cgrp; - if (cgrp_out) - __update_cgrp_time(cgrp_out); -} - -static inline void update_cgrp_time_from_event(struct perf_event *event) -{ - struct perf_cgroup *cgrp; - - /* - * ensure we access cgroup data only when needed and - * when we know the cgroup is pinned (css_get) - */ - if (!is_cgroup_event(event)) - return; - - cgrp = perf_cgroup_from_task(current); - /* - * Do not update time when cgroup is not active - */ - if (cgrp == event->cgrp) - __update_cgrp_time(event->cgrp); -} - -static inline void -perf_cgroup_set_timestamp(struct task_struct *task, - struct perf_event_context *ctx) -{ - struct perf_cgroup *cgrp; - struct perf_cgroup_info *info; - - /* - * ctx->lock held by caller - * ensure we do not access cgroup data - * unless we have the cgroup pinned (css_get) - */ - if (!task || !ctx->nr_cgroups) - return; - - cgrp = perf_cgroup_from_task(task); - info = this_cpu_ptr(cgrp->info); - info->timestamp = ctx->timestamp; -} - -#define PERF_CGROUP_SWOUT 0x1 /* cgroup switch out every event */ -#define PERF_CGROUP_SWIN 0x2 /* cgroup switch in events based on task */ - -/* - * reschedule events based on the cgroup constraint of task. - * - * mode SWOUT : schedule out everything - * mode SWIN : schedule in based on cgroup for next - */ -void perf_cgroup_switch(struct task_struct *task, int mode) -{ - struct perf_cpu_context *cpuctx; - struct pmu *pmu; - unsigned long flags; - - /* - * disable interrupts to avoid geting nr_cgroup - * changes via __perf_event_disable(). Also - * avoids preemption. - */ - local_irq_save(flags); - - /* - * we reschedule only in the presence of cgroup - * constrained events. - */ - rcu_read_lock(); - - list_for_each_entry_rcu(pmu, &pmus, entry) { - - cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); - - perf_pmu_disable(cpuctx->ctx.pmu); - - /* - * perf_cgroup_events says at least one - * context on this CPU has cgroup events. - * - * ctx->nr_cgroups reports the number of cgroup - * events for a context. - */ - if (cpuctx->ctx.nr_cgroups > 0) { - - if (mode & PERF_CGROUP_SWOUT) { - cpu_ctx_sched_out(cpuctx, EVENT_ALL); - /* - * must not be done before ctxswout due - * to event_filter_match() in event_sched_out() - */ - cpuctx->cgrp = NULL; - } - - if (mode & PERF_CGROUP_SWIN) { - WARN_ON_ONCE(cpuctx->cgrp); - /* set cgrp before ctxsw in to - * allow event_filter_match() to not - * have to pass task around - */ - cpuctx->cgrp = perf_cgroup_from_task(task); - cpu_ctx_sched_in(cpuctx, EVENT_ALL, task); - } - } - - perf_pmu_enable(cpuctx->ctx.pmu); - } - - rcu_read_unlock(); - - local_irq_restore(flags); -} - -static inline void perf_cgroup_sched_out(struct task_struct *task) -{ - perf_cgroup_switch(task, PERF_CGROUP_SWOUT); -} - -static inline void perf_cgroup_sched_in(struct task_struct *task) -{ - perf_cgroup_switch(task, PERF_CGROUP_SWIN); -} - -static inline int perf_cgroup_connect(int fd, struct perf_event *event, - struct perf_event_attr *attr, - struct perf_event *group_leader) -{ - struct perf_cgroup *cgrp; - struct cgroup_subsys_state *css; - struct file *file; - int ret = 0, fput_needed; - - file = fget_light(fd, &fput_needed); - if (!file) - return -EBADF; - - css = cgroup_css_from_dir(file, perf_subsys_id); - if (IS_ERR(css)) { - ret = PTR_ERR(css); - goto out; - } - - cgrp = container_of(css, struct perf_cgroup, css); - event->cgrp = cgrp; - - /* must be done before we fput() the file */ - perf_get_cgroup(event); - - /* - * all events in a group must monitor - * the same cgroup because a task belongs - * to only one perf cgroup at a time - */ - if (group_leader && group_leader->cgrp != cgrp) { - perf_detach_cgroup(event); - ret = -EINVAL; - } -out: - fput_light(file, fput_needed); - return ret; -} - -static inline void -perf_cgroup_set_shadow_time(struct perf_event *event, u64 now) -{ - struct perf_cgroup_info *t; - t = per_cpu_ptr(event->cgrp->info, event->cpu); - event->shadow_ctx_time = now - t->timestamp; -} - -static inline void -perf_cgroup_defer_enabled(struct perf_event *event) -{ - /* - * when the current task's perf cgroup does not match - * the event's, we need to remember to call the - * perf_mark_enable() function the first time a task with - * a matching perf cgroup is scheduled in. - */ - if (is_cgroup_event(event) && !perf_cgroup_match(event)) - event->cgrp_defer_enabled = 1; -} - -static inline void -perf_cgroup_mark_enabled(struct perf_event *event, - struct perf_event_context *ctx) -{ - struct perf_event *sub; - u64 tstamp = perf_event_time(event); - - if (!event->cgrp_defer_enabled) - return; - - event->cgrp_defer_enabled = 0; - - event->tstamp_enabled = tstamp - event->total_time_enabled; - list_for_each_entry(sub, &event->sibling_list, group_entry) { - if (sub->state >= PERF_EVENT_STATE_INACTIVE) { - sub->tstamp_enabled = tstamp - sub->total_time_enabled; - sub->cgrp_defer_enabled = 0; - } - } -} -#else /* !CONFIG_CGROUP_PERF */ - -static inline bool -perf_cgroup_match(struct perf_event *event) -{ - return true; -} - -static inline void perf_detach_cgroup(struct perf_event *event) -{} - -static inline int is_cgroup_event(struct perf_event *event) -{ - return 0; -} - -static inline u64 perf_cgroup_event_cgrp_time(struct perf_event *event) -{ - return 0; -} - -static inline void update_cgrp_time_from_event(struct perf_event *event) -{ -} - -static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx) -{ -} - -static inline void perf_cgroup_sched_out(struct task_struct *task) -{ -} - -static inline void perf_cgroup_sched_in(struct task_struct *task) -{ -} - -static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event, - struct perf_event_attr *attr, - struct perf_event *group_leader) -{ - return -EINVAL; -} - -static inline void -perf_cgroup_set_timestamp(struct task_struct *task, - struct perf_event_context *ctx) -{ -} - -void -perf_cgroup_switch(struct task_struct *task, struct task_struct *next) -{ -} - -static inline void -perf_cgroup_set_shadow_time(struct perf_event *event, u64 now) -{ -} - -static inline u64 perf_cgroup_event_time(struct perf_event *event) -{ - return 0; -} - -static inline void -perf_cgroup_defer_enabled(struct perf_event *event) -{ -} - -static inline void -perf_cgroup_mark_enabled(struct perf_event *event, - struct perf_event_context *ctx) -{ -} -#endif - -void perf_pmu_disable(struct pmu *pmu) -{ - int *count = this_cpu_ptr(pmu->pmu_disable_count); - if (!(*count)++) - pmu->pmu_disable(pmu); -} - -void perf_pmu_enable(struct pmu *pmu) -{ - int *count = this_cpu_ptr(pmu->pmu_disable_count); - if (!--(*count)) - pmu->pmu_enable(pmu); -} - -static DEFINE_PER_CPU(struct list_head, rotation_list); - -/* - * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized - * because they're strictly cpu affine and rotate_start is called with IRQs - * disabled, while rotate_context is called from IRQ context. - */ -static void perf_pmu_rotate_start(struct pmu *pmu) -{ - struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); - struct list_head *head = &__get_cpu_var(rotation_list); - - WARN_ON(!irqs_disabled()); - - if (list_empty(&cpuctx->rotation_list)) - list_add(&cpuctx->rotation_list, head); -} - -static void get_ctx(struct perf_event_context *ctx) -{ - WARN_ON(!atomic_inc_not_zero(&ctx->refcount)); -} - -static void free_ctx(struct rcu_head *head) -{ - struct perf_event_context *ctx; - - ctx = container_of(head, struct perf_event_context, rcu_head); - kfree(ctx); -} - -static void put_ctx(struct perf_event_context *ctx) -{ - if (atomic_dec_and_test(&ctx->refcount)) { - if (ctx->parent_ctx) - put_ctx(ctx->parent_ctx); - if (ctx->task) - put_task_struct(ctx->task); - call_rcu(&ctx->rcu_head, free_ctx); - } -} - -static void unclone_ctx(struct perf_event_context *ctx) -{ - if (ctx->parent_ctx) { - put_ctx(ctx->parent_ctx); - ctx->parent_ctx = NULL; - } -} - -static u32 perf_event_pid(struct perf_event *event, struct task_struct *p) -{ - /* - * only top level events have the pid namespace they were created in - */ - if (event->parent) - event = event->parent; - - return task_tgid_nr_ns(p, event->ns); -} - -static u32 perf_event_tid(struct perf_event *event, struct task_struct *p) -{ - /* - * only top level events have the pid namespace they were created in - */ - if (event->parent) - event = event->parent; - - return task_pid_nr_ns(p, event->ns); -} - -/* - * If we inherit events we want to return the parent event id - * to userspace. - */ -static u64 primary_event_id(struct perf_event *event) -{ - u64 id = event->id; - - if (event->parent) - id = event->parent->id; - - return id; -} - -/* - * Get the perf_event_context for a task and lock it. - * This has to cope with with the fact that until it is locked, - * the context could get moved to another task. - */ -static struct perf_event_context * -perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags) -{ - struct perf_event_context *ctx; - - rcu_read_lock(); -retry: - ctx = rcu_dereference(task->perf_event_ctxp[ctxn]); - if (ctx) { - /* - * If this context is a clone of another, it might - * get swapped for another underneath us by - * perf_event_task_sched_out, though the - * rcu_read_lock() protects us from any context - * getting freed. Lock the context and check if it - * got swapped before we could get the lock, and retry - * if so. If we locked the right context, then it - * can't get swapped on us any more. - */ - raw_spin_lock_irqsave(&ctx->lock, *flags); - if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) { - raw_spin_unlock_irqrestore(&ctx->lock, *flags); - goto retry; - } - - if (!atomic_inc_not_zero(&ctx->refcount)) { - raw_spin_unlock_irqrestore(&ctx->lock, *flags); - ctx = NULL; - } - } - rcu_read_unlock(); - return ctx; -} - -/* - * Get the context for a task and increment its pin_count so it - * can't get swapped to another task. This also increments its - * reference count so that the context can't get freed. - */ -static struct perf_event_context * -perf_pin_task_context(struct task_struct *task, int ctxn) -{ - struct perf_event_context *ctx; - unsigned long flags; - - ctx = perf_lock_task_context(task, ctxn, &flags); - if (ctx) { - ++ctx->pin_count; - raw_spin_unlock_irqrestore(&ctx->lock, flags); - } - return ctx; -} - -static void perf_unpin_context(struct perf_event_context *ctx) -{ - unsigned long flags; - - raw_spin_lock_irqsave(&ctx->lock, flags); - --ctx->pin_count; - raw_spin_unlock_irqrestore(&ctx->lock, flags); -} - -/* - * Update the record of the current time in a context. - */ -static void update_context_time(struct perf_event_context *ctx) -{ - u64 now = perf_clock(); - - ctx->time += now - ctx->timestamp; - ctx->timestamp = now; -} - -static u64 perf_event_time(struct perf_event *event) -{ - struct perf_event_context *ctx = event->ctx; - - if (is_cgroup_event(event)) - return perf_cgroup_event_time(event); - - return ctx ? ctx->time : 0; -} - -/* - * Update the total_time_enabled and total_time_running fields for a event. - */ -static void update_event_times(struct perf_event *event) -{ - struct perf_event_context *ctx = event->ctx; - u64 run_end; - - if (event->state < PERF_EVENT_STATE_INACTIVE || - event->group_leader->state < PERF_EVENT_STATE_INACTIVE) - return; - /* - * in cgroup mode, time_enabled represents - * the time the event was enabled AND active - * tasks were in the monitored cgroup. This is - * independent of the activity of the context as - * there may be a mix of cgroup and non-cgroup events. - * - * That is why we treat cgroup events differently - * here. - */ - if (is_cgroup_event(event)) - run_end = perf_event_time(event); - else if (ctx->is_active) - run_end = ctx->time; - else - run_end = event->tstamp_stopped; - - event->total_time_enabled = run_end - event->tstamp_enabled; - - if (event->state == PERF_EVENT_STATE_INACTIVE) - run_end = event->tstamp_stopped; - else - run_end = perf_event_time(event); - - event->total_time_running = run_end - event->tstamp_running; - -} - -/* - * Update total_time_enabled and total_time_running for all events in a group. - */ -static void update_group_times(struct perf_event *leader) -{ - struct perf_event *event; - - update_event_times(leader); - list_for_each_entry(event, &leader->sibling_list, group_entry) - update_event_times(event); -} - -static struct list_head * -ctx_group_list(struct perf_event *event, struct perf_event_context *ctx) -{ - if (event->attr.pinned) - return &ctx->pinned_groups; - else - return &ctx->flexible_groups; -} - -/* - * Add a event from the lists for its context. - * Must be called with ctx->mutex and ctx->lock held. - */ -static void -list_add_event(struct perf_event *event, struct perf_event_context *ctx) -{ - WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT); - event->attach_state |= PERF_ATTACH_CONTEXT; - - /* - * If we're a stand alone event or group leader, we go to the context - * list, group events are kept attached to the group so that - * perf_group_detach can, at all times, locate all siblings. - */ - if (event->group_leader == event) { - struct list_head *list; - - if (is_software_event(event)) - event->group_flags |= PERF_GROUP_SOFTWARE; - - list = ctx_group_list(event, ctx); - list_add_tail(&event->group_entry, list); - } - - if (is_cgroup_event(event)) - ctx->nr_cgroups++; - - list_add_rcu(&event->event_entry, &ctx->event_list); - if (!ctx->nr_events) - perf_pmu_rotate_start(ctx->pmu); - ctx->nr_events++; - if (event->attr.inherit_stat) - ctx->nr_stat++; -} - -/* - * Called at perf_event creation and when events are attached/detached from a - * group. - */ -static void perf_event__read_size(struct perf_event *event) -{ - int entry = sizeof(u64); /* value */ - int size = 0; - int nr = 1; - - if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) - size += sizeof(u64); - - if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) - size += sizeof(u64); - - if (event->attr.read_format & PERF_FORMAT_ID) - entry += sizeof(u64); - - if (event->attr.read_format & PERF_FORMAT_GROUP) { - nr += event->group_leader->nr_siblings; - size += sizeof(u64); - } - - size += entry * nr; - event->read_size = size; -} - -static void perf_event__header_size(struct perf_event *event) -{ - struct perf_sample_data *data; - u64 sample_type = event->attr.sample_type; - u16 size = 0; - - perf_event__read_size(event); - - if (sample_type & PERF_SAMPLE_IP) - size += sizeof(data->ip); - - if (sample_type & PERF_SAMPLE_ADDR) - size += sizeof(data->addr); - - if (sample_type & PERF_SAMPLE_PERIOD) - size += sizeof(data->period); - - if (sample_type & PERF_SAMPLE_READ) - size += event->read_size; - - event->header_size = size; -} - -static void perf_event__id_header_size(struct perf_event *event) -{ - struct perf_sample_data *data; - u64 sample_type = event->attr.sample_type; - u16 size = 0; - - if (sample_type & PERF_SAMPLE_TID) - size += sizeof(data->tid_entry); - - if (sample_type & PERF_SAMPLE_TIME) - size += sizeof(data->time); - - if (sample_type & PERF_SAMPLE_ID) - size += sizeof(data->id); - - if (sample_type & PERF_SAMPLE_STREAM_ID) - size += sizeof(data->stream_id); - - if (sample_type & PERF_SAMPLE_CPU) - size += sizeof(data->cpu_entry); - - event->id_header_size = size; -} - -static void perf_group_attach(struct perf_event *event) -{ - struct perf_event *group_leader = event->group_leader, *pos; - - /* - * We can have double attach due to group movement in perf_event_open. - */ - if (event->attach_state & PERF_ATTACH_GROUP) - return; - - event->attach_state |= PERF_ATTACH_GROUP; - - if (group_leader == event) - return; - - if (group_leader->group_flags & PERF_GROUP_SOFTWARE && - !is_software_event(event)) - group_leader->group_flags &= ~PERF_GROUP_SOFTWARE; - - list_add_tail(&event->group_entry, &group_leader->sibling_list); - group_leader->nr_siblings++; - - perf_event__header_size(group_leader); - - list_for_each_entry(pos, &group_leader->sibling_list, group_entry) - perf_event__header_size(pos); -} - -/* - * Remove a event from the lists for its context. - * Must be called with ctx->mutex and ctx->lock held. - */ -static void -list_del_event(struct perf_event *event, struct perf_event_context *ctx) -{ - struct perf_cpu_context *cpuctx; - /* - * We can have double detach due to exit/hot-unplug + close. - */ - if (!(event->attach_state & PERF_ATTACH_CONTEXT)) - return; - - event->attach_state &= ~PERF_ATTACH_CONTEXT; - - if (is_cgroup_event(event)) { - ctx->nr_cgroups--; - cpuctx = __get_cpu_context(ctx); - /* - * if there are no more cgroup events - * then cler cgrp to avoid stale pointer - * in update_cgrp_time_from_cpuctx() - */ - if (!ctx->nr_cgroups) - cpuctx->cgrp = NULL; - } - - ctx->nr_events--; - if (event->attr.inherit_stat) - ctx->nr_stat--; - - list_del_rcu(&event->event_entry); - - if (event->group_leader == event) - list_del_init(&event->group_entry); - - update_group_times(event); - - /* - * If event was in error state, then keep it - * that way, otherwise bogus counts will be - * returned on read(). The only way to get out - * of error state is by explicit re-enabling - * of the event - */ - if (event->state > PERF_EVENT_STATE_OFF) - event->state = PERF_EVENT_STATE_OFF; -} - -static void perf_group_detach(struct perf_event *event) -{ - struct perf_event *sibling, *tmp; - struct list_head *list = NULL; - - /* - * We can have double detach due to exit/hot-unplug + close. - */ - if (!(event->attach_state & PERF_ATTACH_GROUP)) - return; - - event->attach_state &= ~PERF_ATTACH_GROUP; - - /* - * If this is a sibling, remove it from its group. - */ - if (event->group_leader != event) { - list_del_init(&event->group_entry); - event->group_leader->nr_siblings--; - goto out; - } - - if (!list_empty(&event->group_entry)) - list = &event->group_entry; - - /* - * If this was a group event with sibling events then - * upgrade the siblings to singleton events by adding them - * to whatever list we are on. - */ - list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) { - if (list) - list_move_tail(&sibling->group_entry, list); - sibling->group_leader = sibling; - - /* Inherit group flags from the previous leader */ - sibling->group_flags = event->group_flags; - } - -out: - perf_event__header_size(event->group_leader); - - list_for_each_entry(tmp, &event->group_leader->sibling_list, group_entry) - perf_event__header_size(tmp); -} - -static inline int -event_filter_match(struct perf_event *event) -{ - return (event->cpu == -1 || event->cpu == smp_processor_id()) - && perf_cgroup_match(event); -} - -static void -event_sched_out(struct perf_event *event, - struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx) -{ - u64 tstamp = perf_event_time(event); - u64 delta; - /* - * An event which could not be activated because of - * filter mismatch still needs to have its timings - * maintained, otherwise bogus information is return - * via read() for time_enabled, time_running: - */ - if (event->state == PERF_EVENT_STATE_INACTIVE - && !event_filter_match(event)) { - delta = tstamp - event->tstamp_stopped; - event->tstamp_running += delta; - event->tstamp_stopped = tstamp; - } - - if (event->state != PERF_EVENT_STATE_ACTIVE) - return; - - event->state = PERF_EVENT_STATE_INACTIVE; - if (event->pending_disable) { - event->pending_disable = 0; - event->state = PERF_EVENT_STATE_OFF; - } - event->tstamp_stopped = tstamp; - event->pmu->del(event, 0); - event->oncpu = -1; - - if (!is_software_event(event)) - cpuctx->active_oncpu--; - ctx->nr_active--; - if (event->attr.exclusive || !cpuctx->active_oncpu) - cpuctx->exclusive = 0; -} - -static void -group_sched_out(struct perf_event *group_event, - struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx) -{ - struct perf_event *event; - int state = group_event->state; - - event_sched_out(group_event, cpuctx, ctx); - - /* - * Schedule out siblings (if any): - */ - list_for_each_entry(event, &group_event->sibling_list, group_entry) - event_sched_out(event, cpuctx, ctx); - - if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive) - cpuctx->exclusive = 0; -} - -/* - * Cross CPU call to remove a performance event - * - * We disable the event on the hardware level first. After that we - * remove it from the context list. - */ -static int __perf_remove_from_context(void *info) -{ - struct perf_event *event = info; - struct perf_event_context *ctx = event->ctx; - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); - - raw_spin_lock(&ctx->lock); - event_sched_out(event, cpuctx, ctx); - list_del_event(event, ctx); - raw_spin_unlock(&ctx->lock); - - return 0; -} - - -/* - * Remove the event from a task's (or a CPU's) list of events. - * - * CPU events are removed with a smp call. For task events we only - * call when the task is on a CPU. - * - * If event->ctx is a cloned context, callers must make sure that - * every task struct that event->ctx->task could possibly point to - * remains valid. This is OK when called from perf_release since - * that only calls us on the top-level context, which can't be a clone. - * When called from perf_event_exit_task, it's OK because the - * context has been detached from its task. - */ -static void perf_remove_from_context(struct perf_event *event) -{ - struct perf_event_context *ctx = event->ctx; - struct task_struct *task = ctx->task; - - lockdep_assert_held(&ctx->mutex); - - if (!task) { - /* - * Per cpu events are removed via an smp call and - * the removal is always successful. - */ - cpu_function_call(event->cpu, __perf_remove_from_context, event); - return; - } - -retry: - if (!task_function_call(task, __perf_remove_from_context, event)) - return; - - raw_spin_lock_irq(&ctx->lock); - /* - * If we failed to find a running task, but find the context active now - * that we've acquired the ctx->lock, retry. - */ - if (ctx->is_active) { - raw_spin_unlock_irq(&ctx->lock); - goto retry; - } - - /* - * Since the task isn't running, its safe to remove the event, us - * holding the ctx->lock ensures the task won't get scheduled in. - */ - list_del_event(event, ctx); - raw_spin_unlock_irq(&ctx->lock); -} - -/* - * Cross CPU call to disable a performance event - */ -static int __perf_event_disable(void *info) -{ - struct perf_event *event = info; - struct perf_event_context *ctx = event->ctx; - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); - - /* - * If this is a per-task event, need to check whether this - * event's task is the current task on this cpu. - * - * Can trigger due to concurrent perf_event_context_sched_out() - * flipping contexts around. - */ - if (ctx->task && cpuctx->task_ctx != ctx) - return -EINVAL; - - raw_spin_lock(&ctx->lock); - - /* - * If the event is on, turn it off. - * If it is in error state, leave it in error state. - */ - if (event->state >= PERF_EVENT_STATE_INACTIVE) { - update_context_time(ctx); - update_cgrp_time_from_event(event); - update_group_times(event); - if (event == event->group_leader) - group_sched_out(event, cpuctx, ctx); - else - event_sched_out(event, cpuctx, ctx); - event->state = PERF_EVENT_STATE_OFF; - } - - raw_spin_unlock(&ctx->lock); - - return 0; -} - -/* - * Disable a event. - * - * If event->ctx is a cloned context, callers must make sure that - * every task struct that event->ctx->task could possibly point to - * remains valid. This condition is satisifed when called through - * perf_event_for_each_child or perf_event_for_each because they - * hold the top-level event's child_mutex, so any descendant that - * goes to exit will block in sync_child_event. - * When called from perf_pending_event it's OK because event->ctx - * is the current context on this CPU and preemption is disabled, - * hence we can't get into perf_event_task_sched_out for this context. - */ -void perf_event_disable(struct perf_event *event) -{ - struct perf_event_context *ctx = event->ctx; - struct task_struct *task = ctx->task; - - if (!task) { - /* - * Disable the event on the cpu that it's on - */ - cpu_function_call(event->cpu, __perf_event_disable, event); - return; - } - -retry: - if (!task_function_call(task, __perf_event_disable, event)) - return; - - raw_spin_lock_irq(&ctx->lock); - /* - * If the event is still active, we need to retry the cross-call. - */ - if (event->state == PERF_EVENT_STATE_ACTIVE) { - raw_spin_unlock_irq(&ctx->lock); - /* - * Reload the task pointer, it might have been changed by - * a concurrent perf_event_context_sched_out(). - */ - task = ctx->task; - goto retry; - } - - /* - * Since we have the lock this context can't be scheduled - * in, so we can change the state safely. - */ - if (event->state == PERF_EVENT_STATE_INACTIVE) { - update_group_times(event); - event->state = PERF_EVENT_STATE_OFF; - } - raw_spin_unlock_irq(&ctx->lock); -} - -static void perf_set_shadow_time(struct perf_event *event, - struct perf_event_context *ctx, - u64 tstamp) -{ - /* - * use the correct time source for the time snapshot - * - * We could get by without this by leveraging the - * fact that to get to this function, the caller - * has most likely already called update_context_time() - * and update_cgrp_time_xx() and thus both timestamp - * are identical (or very close). Given that tstamp is, - * already adjusted for cgroup, we could say that: - * tstamp - ctx->timestamp - * is equivalent to - * tstamp - cgrp->timestamp. - * - * Then, in perf_output_read(), the calculation would - * work with no changes because: - * - event is guaranteed scheduled in - * - no scheduled out in between - * - thus the timestamp would be the same - * - * But this is a bit hairy. - * - * So instead, we have an explicit cgroup call to remain - * within the time time source all along. We believe it - * is cleaner and simpler to understand. - */ - if (is_cgroup_event(event)) - perf_cgroup_set_shadow_time(event, tstamp); - else - event->shadow_ctx_time = tstamp - ctx->timestamp; -} - -#define MAX_INTERRUPTS (~0ULL) - -static void perf_log_throttle(struct perf_event *event, int enable); - -static int -event_sched_in(struct perf_event *event, - struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx) -{ - u64 tstamp = perf_event_time(event); - - if (event->state <= PERF_EVENT_STATE_OFF) - return 0; - - event->state = PERF_EVENT_STATE_ACTIVE; - event->oncpu = smp_processor_id(); - - /* - * Unthrottle events, since we scheduled we might have missed several - * ticks already, also for a heavily scheduling task there is little - * guarantee it'll get a tick in a timely manner. - */ - if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) { - perf_log_throttle(event, 1); - event->hw.interrupts = 0; - } - - /* - * The new state must be visible before we turn it on in the hardware: - */ - smp_wmb(); - - if (event->pmu->add(event, PERF_EF_START)) { - event->state = PERF_EVENT_STATE_INACTIVE; - event->oncpu = -1; - return -EAGAIN; - } - - event->tstamp_running += tstamp - event->tstamp_stopped; - - perf_set_shadow_time(event, ctx, tstamp); - - if (!is_software_event(event)) - cpuctx->active_oncpu++; - ctx->nr_active++; - - if (event->attr.exclusive) - cpuctx->exclusive = 1; - - return 0; -} - -static int -group_sched_in(struct perf_event *group_event, - struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx) -{ - struct perf_event *event, *partial_group = NULL; - struct pmu *pmu = group_event->pmu; - u64 now = ctx->time; - bool simulate = false; - - if (group_event->state == PERF_EVENT_STATE_OFF) - return 0; - - pmu->start_txn(pmu); - - if (event_sched_in(group_event, cpuctx, ctx)) { - pmu->cancel_txn(pmu); - return -EAGAIN; - } - - /* - * Schedule in siblings as one group (if any): - */ - list_for_each_entry(event, &group_event->sibling_list, group_entry) { - if (event_sched_in(event, cpuctx, ctx)) { - partial_group = event; - goto group_error; - } - } - - if (!pmu->commit_txn(pmu)) - return 0; - -group_error: - /* - * Groups can be scheduled in as one unit only, so undo any - * partial group before returning: - * The events up to the failed event are scheduled out normally, - * tstamp_stopped will be updated. - * - * The failed events and the remaining siblings need to have - * their timings updated as if they had gone thru event_sched_in() - * and event_sched_out(). This is required to get consistent timings - * across the group. This also takes care of the case where the group - * could never be scheduled by ensuring tstamp_stopped is set to mark - * the time the event was actually stopped, such that time delta - * calculation in update_event_times() is correct. - */ - list_for_each_entry(event, &group_event->sibling_list, group_entry) { - if (event == partial_group) - simulate = true; - - if (simulate) { - event->tstamp_running += now - event->tstamp_stopped; - event->tstamp_stopped = now; - } else { - event_sched_out(event, cpuctx, ctx); - } - } - event_sched_out(group_event, cpuctx, ctx); - - pmu->cancel_txn(pmu); - - return -EAGAIN; -} - -/* - * Work out whether we can put this event group on the CPU now. - */ -static int group_can_go_on(struct perf_event *event, - struct perf_cpu_context *cpuctx, - int can_add_hw) -{ - /* - * Groups consisting entirely of software events can always go on. - */ - if (event->group_flags & PERF_GROUP_SOFTWARE) - return 1; - /* - * If an exclusive group is already on, no other hardware - * events can go on. - */ - if (cpuctx->exclusive) - return 0; - /* - * If this group is exclusive and there are already - * events on the CPU, it can't go on. - */ - if (event->attr.exclusive && cpuctx->active_oncpu) - return 0; - /* - * Otherwise, try to add it if all previous groups were able - * to go on. - */ - return can_add_hw; -} - -static void add_event_to_ctx(struct perf_event *event, - struct perf_event_context *ctx) -{ - u64 tstamp = perf_event_time(event); - - list_add_event(event, ctx); - perf_group_attach(event); - event->tstamp_enabled = tstamp; - event->tstamp_running = tstamp; - event->tstamp_stopped = tstamp; -} - -static void perf_event_context_sched_in(struct perf_event_context *ctx, - struct task_struct *tsk); - -/* - * Cross CPU call to install and enable a performance event - * - * Must be called with ctx->mutex held - */ -static int __perf_install_in_context(void *info) -{ - struct perf_event *event = info; - struct perf_event_context *ctx = event->ctx; - struct perf_event *leader = event->group_leader; - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); - int err; - - /* - * In case we're installing a new context to an already running task, - * could also happen before perf_event_task_sched_in() on architectures - * which do context switches with IRQs enabled. - */ - if (ctx->task && !cpuctx->task_ctx) - perf_event_context_sched_in(ctx, ctx->task); - - raw_spin_lock(&ctx->lock); - ctx->is_active = 1; - update_context_time(ctx); - /* - * update cgrp time only if current cgrp - * matches event->cgrp. Must be done before - * calling add_event_to_ctx() - */ - update_cgrp_time_from_event(event); - - add_event_to_ctx(event, ctx); - - if (!event_filter_match(event)) - goto unlock; - - /* - * Don't put the event on if it is disabled or if - * it is in a group and the group isn't on. - */ - if (event->state != PERF_EVENT_STATE_INACTIVE || - (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)) - goto unlock; - - /* - * An exclusive event can't go on if there are already active - * hardware events, and no hardware event can go on if there - * is already an exclusive event on. - */ - if (!group_can_go_on(event, cpuctx, 1)) - err = -EEXIST; - else - err = event_sched_in(event, cpuctx, ctx); - - if (err) { - /* - * This event couldn't go on. If it is in a group - * then we have to pull the whole group off. - * If the event group is pinned then put it in error state. - */ - if (leader != event) - group_sched_out(leader, cpuctx, ctx); - if (leader->attr.pinned) { - update_group_times(leader); - leader->state = PERF_EVENT_STATE_ERROR; - } - } - -unlock: - raw_spin_unlock(&ctx->lock); - - return 0; -} - -/* - * Attach a performance event to a context - * - * First we add the event to the list with the hardware enable bit - * in event->hw_config cleared. - * - * If the event is attached to a task which is on a CPU we use a smp - * call to enable it in the task context. The task might have been - * scheduled away, but we check this in the smp call again. - */ -static void -perf_install_in_context(struct perf_event_context *ctx, - struct perf_event *event, - int cpu) -{ - struct task_struct *task = ctx->task; - - lockdep_assert_held(&ctx->mutex); - - event->ctx = ctx; - - if (!task) { - /* - * Per cpu events are installed via an smp call and - * the install is always successful. - */ - cpu_function_call(cpu, __perf_install_in_context, event); - return; - } - -retry: - if (!task_function_call(task, __perf_install_in_context, event)) - return; - - raw_spin_lock_irq(&ctx->lock); - /* - * If we failed to find a running task, but find the context active now - * that we've acquired the ctx->lock, retry. - */ - if (ctx->is_active) { - raw_spin_unlock_irq(&ctx->lock); - goto retry; - } - - /* - * Since the task isn't running, its safe to add the event, us holding - * the ctx->lock ensures the task won't get scheduled in. - */ - add_event_to_ctx(event, ctx); - raw_spin_unlock_irq(&ctx->lock); -} - -/* - * Put a event into inactive state and update time fields. - * Enabling the leader of a group effectively enables all - * the group members that aren't explicitly disabled, so we - * have to update their ->tstamp_enabled also. - * Note: this works for group members as well as group leaders - * since the non-leader members' sibling_lists will be empty. - */ -static void __perf_event_mark_enabled(struct perf_event *event, - struct perf_event_context *ctx) -{ - struct perf_event *sub; - u64 tstamp = perf_event_time(event); - - event->state = PERF_EVENT_STATE_INACTIVE; - event->tstamp_enabled = tstamp - event->total_time_enabled; - list_for_each_entry(sub, &event->sibling_list, group_entry) { - if (sub->state >= PERF_EVENT_STATE_INACTIVE) - sub->tstamp_enabled = tstamp - sub->total_time_enabled; - } -} - -/* - * Cross CPU call to enable a performance event - */ -static int __perf_event_enable(void *info) -{ - struct perf_event *event = info; - struct perf_event_context *ctx = event->ctx; - struct perf_event *leader = event->group_leader; - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); - int err; - - if (WARN_ON_ONCE(!ctx->is_active)) - return -EINVAL; - - raw_spin_lock(&ctx->lock); - update_context_time(ctx); - - if (event->state >= PERF_EVENT_STATE_INACTIVE) - goto unlock; - - /* - * set current task's cgroup time reference point - */ - perf_cgroup_set_timestamp(current, ctx); - - __perf_event_mark_enabled(event, ctx); - - if (!event_filter_match(event)) { - if (is_cgroup_event(event)) - perf_cgroup_defer_enabled(event); - goto unlock; - } - - /* - * If the event is in a group and isn't the group leader, - * then don't put it on unless the group is on. - */ - if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) - goto unlock; - - if (!group_can_go_on(event, cpuctx, 1)) { - err = -EEXIST; - } else { - if (event == leader) - err = group_sched_in(event, cpuctx, ctx); - else - err = event_sched_in(event, cpuctx, ctx); - } - - if (err) { - /* - * If this event can't go on and it's part of a - * group, then the whole group has to come off. - */ - if (leader != event) - group_sched_out(leader, cpuctx, ctx); - if (leader->attr.pinned) { - update_group_times(leader); - leader->state = PERF_EVENT_STATE_ERROR; - } - } - -unlock: - raw_spin_unlock(&ctx->lock); - - return 0; -} - -/* - * Enable a event. - * - * If event->ctx is a cloned context, callers must make sure that - * every task struct that event->ctx->task could possibly point to - * remains valid. This condition is satisfied when called through - * perf_event_for_each_child or perf_event_for_each as described - * for perf_event_disable. - */ -void perf_event_enable(struct perf_event *event) -{ - struct perf_event_context *ctx = event->ctx; - struct task_struct *task = ctx->task; - - if (!task) { - /* - * Enable the event on the cpu that it's on - */ - cpu_function_call(event->cpu, __perf_event_enable, event); - return; - } - - raw_spin_lock_irq(&ctx->lock); - if (event->state >= PERF_EVENT_STATE_INACTIVE) - goto out; - - /* - * If the event is in error state, clear that first. - * That way, if we see the event in error state below, we - * know that it has gone back into error state, as distinct - * from the task having been scheduled away before the - * cross-call arrived. - */ - if (event->state == PERF_EVENT_STATE_ERROR) - event->state = PERF_EVENT_STATE_OFF; - -retry: - if (!ctx->is_active) { - __perf_event_mark_enabled(event, ctx); - goto out; - } - - raw_spin_unlock_irq(&ctx->lock); - - if (!task_function_call(task, __perf_event_enable, event)) - return; - - raw_spin_lock_irq(&ctx->lock); - - /* - * If the context is active and the event is still off, - * we need to retry the cross-call. - */ - if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF) { - /* - * task could have been flipped by a concurrent - * perf_event_context_sched_out() - */ - task = ctx->task; - goto retry; - } - -out: - raw_spin_unlock_irq(&ctx->lock); -} - -static int perf_event_refresh(struct perf_event *event, int refresh) -{ - /* - * not supported on inherited events - */ - if (event->attr.inherit || !is_sampling_event(event)) - return -EINVAL; - - atomic_add(refresh, &event->event_limit); - perf_event_enable(event); - - return 0; -} - -static void ctx_sched_out(struct perf_event_context *ctx, - struct perf_cpu_context *cpuctx, - enum event_type_t event_type) -{ - struct perf_event *event; - - raw_spin_lock(&ctx->lock); - perf_pmu_disable(ctx->pmu); - ctx->is_active = 0; - if (likely(!ctx->nr_events)) - goto out; - update_context_time(ctx); - update_cgrp_time_from_cpuctx(cpuctx); - - if (!ctx->nr_active) - goto out; - - if (event_type & EVENT_PINNED) { - list_for_each_entry(event, &ctx->pinned_groups, group_entry) - group_sched_out(event, cpuctx, ctx); - } - - if (event_type & EVENT_FLEXIBLE) { - list_for_each_entry(event, &ctx->flexible_groups, group_entry) - group_sched_out(event, cpuctx, ctx); - } -out: - perf_pmu_enable(ctx->pmu); - raw_spin_unlock(&ctx->lock); -} - -/* - * Test whether two contexts are equivalent, i.e. whether they - * have both been cloned from the same version of the same context - * and they both have the same number of enabled events. - * If the number of enabled events is the same, then the set - * of enabled events should be the same, because these are both - * inherited contexts, therefore we can't access individual events - * in them directly with an fd; we can only enable/disable all - * events via prctl, or enable/disable all events in a family - * via ioctl, which will have the same effect on both contexts. - */ -static int context_equiv(struct perf_event_context *ctx1, - struct perf_event_context *ctx2) -{ - return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx - && ctx1->parent_gen == ctx2->parent_gen - && !ctx1->pin_count && !ctx2->pin_count; -} - -static void __perf_event_sync_stat(struct perf_event *event, - struct perf_event *next_event) -{ - u64 value; - - if (!event->attr.inherit_stat) - return; - - /* - * Update the event value, we cannot use perf_event_read() - * because we're in the middle of a context switch and have IRQs - * disabled, which upsets smp_call_function_single(), however - * we know the event must be on the current CPU, therefore we - * don't need to use it. - */ - switch (event->state) { - case PERF_EVENT_STATE_ACTIVE: - event->pmu->read(event); - /* fall-through */ - - case PERF_EVENT_STATE_INACTIVE: - update_event_times(event); - break; - - default: - break; - } - - /* - * In order to keep per-task stats reliable we need to flip the event - * values when we flip the contexts. - */ - value = local64_read(&next_event->count); - value = local64_xchg(&event->count, value); - local64_set(&next_event->count, value); - - swap(event->total_time_enabled, next_event->total_time_enabled); - swap(event->total_time_running, next_event->total_time_running); - - /* - * Since we swizzled the values, update the user visible data too. - */ - perf_event_update_userpage(event); - perf_event_update_userpage(next_event); -} - -#define list_next_entry(pos, member) \ - list_entry(pos->member.next, typeof(*pos), member) - -static void perf_event_sync_stat(struct perf_event_context *ctx, - struct perf_event_context *next_ctx) -{ - struct perf_event *event, *next_event; - - if (!ctx->nr_stat) - return; - - update_context_time(ctx); - - event = list_first_entry(&ctx->event_list, - struct perf_event, event_entry); - - next_event = list_first_entry(&next_ctx->event_list, - struct perf_event, event_entry); - - while (&event->event_entry != &ctx->event_list && - &next_event->event_entry != &next_ctx->event_list) { - - __perf_event_sync_stat(event, next_event); - - event = list_next_entry(event, event_entry); - next_event = list_next_entry(next_event, event_entry); - } -} - -static void perf_event_context_sched_out(struct task_struct *task, int ctxn, - struct task_struct *next) -{ - struct perf_event_context *ctx = task->perf_event_ctxp[ctxn]; - struct perf_event_context *next_ctx; - struct perf_event_context *parent; - struct perf_cpu_context *cpuctx; - int do_switch = 1; - - if (likely(!ctx)) - return; - - cpuctx = __get_cpu_context(ctx); - if (!cpuctx->task_ctx) - return; - - rcu_read_lock(); - parent = rcu_dereference(ctx->parent_ctx); - next_ctx = next->perf_event_ctxp[ctxn]; - if (parent && next_ctx && - rcu_dereference(next_ctx->parent_ctx) == parent) { - /* - * Looks like the two contexts are clones, so we might be - * able to optimize the context switch. We lock both - * contexts and check that they are clones under the - * lock (including re-checking that neither has been - * uncloned in the meantime). It doesn't matter which - * order we take the locks because no other cpu could - * be trying to lock both of these tasks. - */ - raw_spin_lock(&ctx->lock); - raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); - if (context_equiv(ctx, next_ctx)) { - /* - * XXX do we need a memory barrier of sorts - * wrt to rcu_dereference() of perf_event_ctxp - */ - task->perf_event_ctxp[ctxn] = next_ctx; - next->perf_event_ctxp[ctxn] = ctx; - ctx->task = next; - next_ctx->task = task; - do_switch = 0; - - perf_event_sync_stat(ctx, next_ctx); - } - raw_spin_unlock(&next_ctx->lock); - raw_spin_unlock(&ctx->lock); - } - rcu_read_unlock(); - - if (do_switch) { - ctx_sched_out(ctx, cpuctx, EVENT_ALL); - cpuctx->task_ctx = NULL; - } -} - -#define for_each_task_context_nr(ctxn) \ - for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++) - -/* - * Called from scheduler to remove the events of the current task, - * with interrupts disabled. - * - * We stop each event and update the event value in event->count. - * - * This does not protect us against NMI, but disable() - * sets the disabled bit in the control field of event _before_ - * accessing the event control register. If a NMI hits, then it will - * not restart the event. - */ -void __perf_event_task_sched_out(struct task_struct *task, - struct task_struct *next) -{ - int ctxn; - - for_each_task_context_nr(ctxn) - perf_event_context_sched_out(task, ctxn, next); - - /* - * if cgroup events exist on this CPU, then we need - * to check if we have to switch out PMU state. - * cgroup event are system-wide mode only - */ - if (atomic_read(&__get_cpu_var(perf_cgroup_events))) - perf_cgroup_sched_out(task); -} - -static void task_ctx_sched_out(struct perf_event_context *ctx, - enum event_type_t event_type) -{ - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); - - if (!cpuctx->task_ctx) - return; - - if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) - return; - - ctx_sched_out(ctx, cpuctx, event_type); - cpuctx->task_ctx = NULL; -} - -/* - * Called with IRQs disabled - */ -static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, - enum event_type_t event_type) -{ - ctx_sched_out(&cpuctx->ctx, cpuctx, event_type); -} - -static void -ctx_pinned_sched_in(struct perf_event_context *ctx, - struct perf_cpu_context *cpuctx) -{ - struct perf_event *event; - - list_for_each_entry(event, &ctx->pinned_groups, group_entry) { - if (event->state <= PERF_EVENT_STATE_OFF) - continue; - if (!event_filter_match(event)) - continue; - - /* may need to reset tstamp_enabled */ - if (is_cgroup_event(event)) - perf_cgroup_mark_enabled(event, ctx); - - if (group_can_go_on(event, cpuctx, 1)) - group_sched_in(event, cpuctx, ctx); - - /* - * If this pinned group hasn't been scheduled, - * put it in error state. - */ - if (event->state == PERF_EVENT_STATE_INACTIVE) { - update_group_times(event); - event->state = PERF_EVENT_STATE_ERROR; - } - } -} - -static void -ctx_flexible_sched_in(struct perf_event_context *ctx, - struct perf_cpu_context *cpuctx) -{ - struct perf_event *event; - int can_add_hw = 1; - - list_for_each_entry(event, &ctx->flexible_groups, group_entry) { - /* Ignore events in OFF or ERROR state */ - if (event->state <= PERF_EVENT_STATE_OFF) - continue; - /* - * Listen to the 'cpu' scheduling filter constraint - * of events: - */ - if (!event_filter_match(event)) - continue; - - /* may need to reset tstamp_enabled */ - if (is_cgroup_event(event)) - perf_cgroup_mark_enabled(event, ctx); - - if (group_can_go_on(event, cpuctx, can_add_hw)) { - if (group_sched_in(event, cpuctx, ctx)) - can_add_hw = 0; - } - } -} - -static void -ctx_sched_in(struct perf_event_context *ctx, - struct perf_cpu_context *cpuctx, - enum event_type_t event_type, - struct task_struct *task) -{ - u64 now; - - raw_spin_lock(&ctx->lock); - ctx->is_active = 1; - if (likely(!ctx->nr_events)) - goto out; - - now = perf_clock(); - ctx->timestamp = now; - perf_cgroup_set_timestamp(task, ctx); - /* - * First go through the list and put on any pinned groups - * in order to give them the best chance of going on. - */ - if (event_type & EVENT_PINNED) - ctx_pinned_sched_in(ctx, cpuctx); - - /* Then walk through the lower prio flexible groups */ - if (event_type & EVENT_FLEXIBLE) - ctx_flexible_sched_in(ctx, cpuctx); - -out: - raw_spin_unlock(&ctx->lock); -} - -static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, - enum event_type_t event_type, - struct task_struct *task) -{ - struct perf_event_context *ctx = &cpuctx->ctx; - - ctx_sched_in(ctx, cpuctx, event_type, task); -} - -static void task_ctx_sched_in(struct perf_event_context *ctx, - enum event_type_t event_type) -{ - struct perf_cpu_context *cpuctx; - - cpuctx = __get_cpu_context(ctx); - if (cpuctx->task_ctx == ctx) - return; - - ctx_sched_in(ctx, cpuctx, event_type, NULL); - cpuctx->task_ctx = ctx; -} - -static void perf_event_context_sched_in(struct perf_event_context *ctx, - struct task_struct *task) -{ - struct perf_cpu_context *cpuctx; - - cpuctx = __get_cpu_context(ctx); - if (cpuctx->task_ctx == ctx) - return; - - perf_pmu_disable(ctx->pmu); - /* - * We want to keep the following priority order: - * cpu pinned (that don't need to move), task pinned, - * cpu flexible, task flexible. - */ - cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); - - ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task); - cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task); - ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task); - - cpuctx->task_ctx = ctx; - - /* - * Since these rotations are per-cpu, we need to ensure the - * cpu-context we got scheduled on is actually rotating. - */ - perf_pmu_rotate_start(ctx->pmu); - perf_pmu_enable(ctx->pmu); -} - -/* - * Called from scheduler to add the events of the current task - * with interrupts disabled. - * - * We restore the event value and then enable it. - * - * This does not protect us against NMI, but enable() - * sets the enabled bit in the control field of event _before_ - * accessing the event control register. If a NMI hits, then it will - * keep the event running. - */ -void __perf_event_task_sched_in(struct task_struct *task) -{ - struct perf_event_context *ctx; - int ctxn; - - for_each_task_context_nr(ctxn) { - ctx = task->perf_event_ctxp[ctxn]; - if (likely(!ctx)) - continue; - - perf_event_context_sched_in(ctx, task); - } - /* - * if cgroup events exist on this CPU, then we need - * to check if we have to switch in PMU state. - * cgroup event are system-wide mode only - */ - if (atomic_read(&__get_cpu_var(perf_cgroup_events))) - perf_cgroup_sched_in(task); -} - -static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) -{ - u64 frequency = event->attr.sample_freq; - u64 sec = NSEC_PER_SEC; - u64 divisor, dividend; - - int count_fls, nsec_fls, frequency_fls, sec_fls; - - count_fls = fls64(count); - nsec_fls = fls64(nsec); - frequency_fls = fls64(frequency); - sec_fls = 30; - - /* - * We got @count in @nsec, with a target of sample_freq HZ - * the target period becomes: - * - * @count * 10^9 - * period = ------------------- - * @nsec * sample_freq - * - */ - - /* - * Reduce accuracy by one bit such that @a and @b converge - * to a similar magnitude. - */ -#define REDUCE_FLS(a, b) \ -do { \ - if (a##_fls > b##_fls) { \ - a >>= 1; \ - a##_fls--; \ - } else { \ - b >>= 1; \ - b##_fls--; \ - } \ -} while (0) - - /* - * Reduce accuracy until either term fits in a u64, then proceed with - * the other, so that finally we can do a u64/u64 division. - */ - while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) { - REDUCE_FLS(nsec, frequency); - REDUCE_FLS(sec, count); - } - - if (count_fls + sec_fls > 64) { - divisor = nsec * frequency; - - while (count_fls + sec_fls > 64) { - REDUCE_FLS(count, sec); - divisor >>= 1; - } - - dividend = count * sec; - } else { - dividend = count * sec; - - while (nsec_fls + frequency_fls > 64) { - REDUCE_FLS(nsec, frequency); - dividend >>= 1; - } - - divisor = nsec * frequency; - } - - if (!divisor) - return dividend; - - return div64_u64(dividend, divisor); -} - -static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) -{ - struct hw_perf_event *hwc = &event->hw; - s64 period, sample_period; - s64 delta; - - period = perf_calculate_period(event, nsec, count); - - delta = (s64)(period - hwc->sample_period); - delta = (delta + 7) / 8; /* low pass filter */ - - sample_period = hwc->sample_period + delta; - - if (!sample_period) - sample_period = 1; - - hwc->sample_period = sample_period; - - if (local64_read(&hwc->period_left) > 8*sample_period) { - event->pmu->stop(event, PERF_EF_UPDATE); - local64_set(&hwc->period_left, 0); - event->pmu->start(event, PERF_EF_RELOAD); - } -} - -static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period) -{ - struct perf_event *event; - struct hw_perf_event *hwc; - u64 interrupts, now; - s64 delta; - - raw_spin_lock(&ctx->lock); - list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { - if (event->state != PERF_EVENT_STATE_ACTIVE) - continue; - - if (!event_filter_match(event)) - continue; - - hwc = &event->hw; - - interrupts = hwc->interrupts; - hwc->interrupts = 0; - - /* - * unthrottle events on the tick - */ - if (interrupts == MAX_INTERRUPTS) { - perf_log_throttle(event, 1); - event->pmu->start(event, 0); - } - - if (!event->attr.freq || !event->attr.sample_freq) - continue; - - event->pmu->read(event); - now = local64_read(&event->count); - delta = now - hwc->freq_count_stamp; - hwc->freq_count_stamp = now; - - if (delta > 0) - perf_adjust_period(event, period, delta); - } - raw_spin_unlock(&ctx->lock); -} - -/* - * Round-robin a context's events: - */ -static void rotate_ctx(struct perf_event_context *ctx) -{ - raw_spin_lock(&ctx->lock); - - /* - * Rotate the first entry last of non-pinned groups. Rotation might be - * disabled by the inheritance code. - */ - if (!ctx->rotate_disable) - list_rotate_left(&ctx->flexible_groups); - - raw_spin_unlock(&ctx->lock); -} - -/* - * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized - * because they're strictly cpu affine and rotate_start is called with IRQs - * disabled, while rotate_context is called from IRQ context. - */ -static void perf_rotate_context(struct perf_cpu_context *cpuctx) -{ - u64 interval = (u64)cpuctx->jiffies_interval * TICK_NSEC; - struct perf_event_context *ctx = NULL; - int rotate = 0, remove = 1; - - if (cpuctx->ctx.nr_events) { - remove = 0; - if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active) - rotate = 1; - } - - ctx = cpuctx->task_ctx; - if (ctx && ctx->nr_events) { - remove = 0; - if (ctx->nr_events != ctx->nr_active) - rotate = 1; - } - - perf_pmu_disable(cpuctx->ctx.pmu); - perf_ctx_adjust_freq(&cpuctx->ctx, interval); - if (ctx) - perf_ctx_adjust_freq(ctx, interval); - - if (!rotate) - goto done; - - cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); - if (ctx) - task_ctx_sched_out(ctx, EVENT_FLEXIBLE); - - rotate_ctx(&cpuctx->ctx); - if (ctx) - rotate_ctx(ctx); - - cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, current); - if (ctx) - task_ctx_sched_in(ctx, EVENT_FLEXIBLE); - -done: - if (remove) - list_del_init(&cpuctx->rotation_list); - - perf_pmu_enable(cpuctx->ctx.pmu); -} - -void perf_event_task_tick(void) -{ - struct list_head *head = &__get_cpu_var(rotation_list); - struct perf_cpu_context *cpuctx, *tmp; - - WARN_ON(!irqs_disabled()); - - list_for_each_entry_safe(cpuctx, tmp, head, rotation_list) { - if (cpuctx->jiffies_interval == 1 || - !(jiffies % cpuctx->jiffies_interval)) - perf_rotate_context(cpuctx); - } -} - -static int event_enable_on_exec(struct perf_event *event, - struct perf_event_context *ctx) -{ - if (!event->attr.enable_on_exec) - return 0; - - event->attr.enable_on_exec = 0; - if (event->state >= PERF_EVENT_STATE_INACTIVE) - return 0; - - __perf_event_mark_enabled(event, ctx); - - return 1; -} - -/* - * Enable all of a task's events that have been marked enable-on-exec. - * This expects task == current. - */ -static void perf_event_enable_on_exec(struct perf_event_context *ctx) -{ - struct perf_event *event; - unsigned long flags; - int enabled = 0; - int ret; - - local_irq_save(flags); - if (!ctx || !ctx->nr_events) - goto out; - - /* - * We must ctxsw out cgroup events to avoid conflict - * when invoking perf_task_event_sched_in() later on - * in this function. Otherwise we end up trying to - * ctxswin cgroup events which are already scheduled - * in. - */ - perf_cgroup_sched_out(current); - task_ctx_sched_out(ctx, EVENT_ALL); - - raw_spin_lock(&ctx->lock); - - list_for_each_entry(event, &ctx->pinned_groups, group_entry) { - ret = event_enable_on_exec(event, ctx); - if (ret) - enabled = 1; - } - - list_for_each_entry(event, &ctx->flexible_groups, group_entry) { - ret = event_enable_on_exec(event, ctx); - if (ret) - enabled = 1; - } - - /* - * Unclone this context if we enabled any event. - */ - if (enabled) - unclone_ctx(ctx); - - raw_spin_unlock(&ctx->lock); - - /* - * Also calls ctxswin for cgroup events, if any: - */ - perf_event_context_sched_in(ctx, ctx->task); -out: - local_irq_restore(flags); -} - -/* - * Cross CPU call to read the hardware event - */ -static void __perf_event_read(void *info) -{ - struct perf_event *event = info; - struct perf_event_context *ctx = event->ctx; - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); - - /* - * If this is a task context, we need to check whether it is - * the current task context of this cpu. If not it has been - * scheduled out before the smp call arrived. In that case - * event->count would have been updated to a recent sample - * when the event was scheduled out. - */ - if (ctx->task && cpuctx->task_ctx != ctx) - return; - - raw_spin_lock(&ctx->lock); - if (ctx->is_active) { - update_context_time(ctx); - update_cgrp_time_from_event(event); - } - update_event_times(event); - if (event->state == PERF_EVENT_STATE_ACTIVE) - event->pmu->read(event); - raw_spin_unlock(&ctx->lock); -} - -static inline u64 perf_event_count(struct perf_event *event) -{ - return local64_read(&event->count) + atomic64_read(&event->child_count); -} - -static u64 perf_event_read(struct perf_event *event) -{ - /* - * If event is enabled and currently active on a CPU, update the - * value in the event structure: - */ - if (event->state == PERF_EVENT_STATE_ACTIVE) { - smp_call_function_single(event->oncpu, - __perf_event_read, event, 1); - } else if (event->state == PERF_EVENT_STATE_INACTIVE) { - struct perf_event_context *ctx = event->ctx; - unsigned long flags; - - raw_spin_lock_irqsave(&ctx->lock, flags); - /* - * may read while context is not active - * (e.g., thread is blocked), in that case - * we cannot update context time - */ - if (ctx->is_active) { - update_context_time(ctx); - update_cgrp_time_from_event(event); - } - update_event_times(event); - raw_spin_unlock_irqrestore(&ctx->lock, flags); - } - - return perf_event_count(event); -} - -/* - * Callchain support - */ - -struct callchain_cpus_entries { - struct rcu_head rcu_head; - struct perf_callchain_entry *cpu_entries[0]; -}; - -static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]); -static atomic_t nr_callchain_events; -static DEFINE_MUTEX(callchain_mutex); -struct callchain_cpus_entries *callchain_cpus_entries; - - -__weak void perf_callchain_kernel(struct perf_callchain_entry *entry, - struct pt_regs *regs) -{ -} - -__weak void perf_callchain_user(struct perf_callchain_entry *entry, - struct pt_regs *regs) -{ -} - -static void release_callchain_buffers_rcu(struct rcu_head *head) -{ - struct callchain_cpus_entries *entries; - int cpu; - - entries = container_of(head, struct callchain_cpus_entries, rcu_head); - - for_each_possible_cpu(cpu) - kfree(entries->cpu_entries[cpu]); - - kfree(entries); -} - -static void release_callchain_buffers(void) -{ - struct callchain_cpus_entries *entries; - - entries = callchain_cpus_entries; - rcu_assign_pointer(callchain_cpus_entries, NULL); - call_rcu(&entries->rcu_head, release_callchain_buffers_rcu); -} - -static int alloc_callchain_buffers(void) -{ - int cpu; - int size; - struct callchain_cpus_entries *entries; - - /* - * We can't use the percpu allocation API for data that can be - * accessed from NMI. Use a temporary manual per cpu allocation - * until that gets sorted out. - */ - size = offsetof(struct callchain_cpus_entries, cpu_entries[nr_cpu_ids]); - - entries = kzalloc(size, GFP_KERNEL); - if (!entries) - return -ENOMEM; - - size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS; - - for_each_possible_cpu(cpu) { - entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL, - cpu_to_node(cpu)); - if (!entries->cpu_entries[cpu]) - goto fail; - } - - rcu_assign_pointer(callchain_cpus_entries, entries); - - return 0; - -fail: - for_each_possible_cpu(cpu) - kfree(entries->cpu_entries[cpu]); - kfree(entries); - - return -ENOMEM; -} - -static int get_callchain_buffers(void) -{ - int err = 0; - int count; - - mutex_lock(&callchain_mutex); - - count = atomic_inc_return(&nr_callchain_events); - if (WARN_ON_ONCE(count < 1)) { - err = -EINVAL; - goto exit; - } - - if (count > 1) { - /* If the allocation failed, give up */ - if (!callchain_cpus_entries) - err = -ENOMEM; - goto exit; - } - - err = alloc_callchain_buffers(); - if (err) - release_callchain_buffers(); -exit: - mutex_unlock(&callchain_mutex); - - return err; -} - -static void put_callchain_buffers(void) -{ - if (atomic_dec_and_mutex_lock(&nr_callchain_events, &callchain_mutex)) { - release_callchain_buffers(); - mutex_unlock(&callchain_mutex); - } -} - -static int get_recursion_context(int *recursion) -{ - int rctx; - - if (in_nmi()) - rctx = 3; - else if (in_irq()) - rctx = 2; - else if (in_softirq()) - rctx = 1; - else - rctx = 0; - - if (recursion[rctx]) - return -1; - - recursion[rctx]++; - barrier(); - - return rctx; -} - -static inline void put_recursion_context(int *recursion, int rctx) -{ - barrier(); - recursion[rctx]--; -} - -static struct perf_callchain_entry *get_callchain_entry(int *rctx) -{ - int cpu; - struct callchain_cpus_entries *entries; - - *rctx = get_recursion_context(__get_cpu_var(callchain_recursion)); - if (*rctx == -1) - return NULL; - - entries = rcu_dereference(callchain_cpus_entries); - if (!entries) - return NULL; - - cpu = smp_processor_id(); - - return &entries->cpu_entries[cpu][*rctx]; -} - -static void -put_callchain_entry(int rctx) -{ - put_recursion_context(__get_cpu_var(callchain_recursion), rctx); -} - -static struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) -{ - int rctx; - struct perf_callchain_entry *entry; - - - entry = get_callchain_entry(&rctx); - if (rctx == -1) - return NULL; - - if (!entry) - goto exit_put; - - entry->nr = 0; - - if (!user_mode(regs)) { - perf_callchain_store(entry, PERF_CONTEXT_KERNEL); - perf_callchain_kernel(entry, regs); - if (current->mm) - regs = task_pt_regs(current); - else - regs = NULL; - } - - if (regs) { - perf_callchain_store(entry, PERF_CONTEXT_USER); - perf_callchain_user(entry, regs); - } - -exit_put: - put_callchain_entry(rctx); - - return entry; -} - -/* - * Initialize the perf_event context in a task_struct: - */ -static void __perf_event_init_context(struct perf_event_context *ctx) -{ - raw_spin_lock_init(&ctx->lock); - mutex_init(&ctx->mutex); - INIT_LIST_HEAD(&ctx->pinned_groups); - INIT_LIST_HEAD(&ctx->flexible_groups); - INIT_LIST_HEAD(&ctx->event_list); - atomic_set(&ctx->refcount, 1); -} - -static struct perf_event_context * -alloc_perf_context(struct pmu *pmu, struct task_struct *task) -{ - struct perf_event_context *ctx; - - ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL); - if (!ctx) - return NULL; - - __perf_event_init_context(ctx); - if (task) { - ctx->task = task; - get_task_struct(task); - } - ctx->pmu = pmu; - - return ctx; -} - -static struct task_struct * -find_lively_task_by_vpid(pid_t vpid) -{ - struct task_struct *task; - int err; - - rcu_read_lock(); - if (!vpid) - task = current; - else - task = find_task_by_vpid(vpid); - if (task) - get_task_struct(task); - rcu_read_unlock(); - - if (!task) - return ERR_PTR(-ESRCH); - - /* Reuse ptrace permission checks for now. */ - err = -EACCES; - if (!ptrace_may_access(task, PTRACE_MODE_READ)) - goto errout; - - return task; -errout: - put_task_struct(task); - return ERR_PTR(err); - -} - -/* - * Returns a matching context with refcount and pincount. - */ -static struct perf_event_context * -find_get_context(struct pmu *pmu, struct task_struct *task, int cpu) -{ - struct perf_event_context *ctx; - struct perf_cpu_context *cpuctx; - unsigned long flags; - int ctxn, err; - - if (!task) { - /* Must be root to operate on a CPU event: */ - if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) - return ERR_PTR(-EACCES); - - /* - * We could be clever and allow to attach a event to an - * offline CPU and activate it when the CPU comes up, but - * that's for later. - */ - if (!cpu_online(cpu)) - return ERR_PTR(-ENODEV); - - cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); - ctx = &cpuctx->ctx; - get_ctx(ctx); - ++ctx->pin_count; - - return ctx; - } - - err = -EINVAL; - ctxn = pmu->task_ctx_nr; - if (ctxn < 0) - goto errout; - -retry: - ctx = perf_lock_task_context(task, ctxn, &flags); - if (ctx) { - unclone_ctx(ctx); - ++ctx->pin_count; - raw_spin_unlock_irqrestore(&ctx->lock, flags); - } - - if (!ctx) { - ctx = alloc_perf_context(pmu, task); - err = -ENOMEM; - if (!ctx) - goto errout; - - get_ctx(ctx); - - err = 0; - mutex_lock(&task->perf_event_mutex); - /* - * If it has already passed perf_event_exit_task(). - * we must see PF_EXITING, it takes this mutex too. - */ - if (task->flags & PF_EXITING) - err = -ESRCH; - else if (task->perf_event_ctxp[ctxn]) - err = -EAGAIN; - else { - ++ctx->pin_count; - rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx); - } - mutex_unlock(&task->perf_event_mutex); - - if (unlikely(err)) { - put_task_struct(task); - kfree(ctx); - - if (err == -EAGAIN) - goto retry; - goto errout; - } - } - - return ctx; - -errout: - return ERR_PTR(err); -} - -static void perf_event_free_filter(struct perf_event *event); - -static void free_event_rcu(struct rcu_head *head) -{ - struct perf_event *event; - - event = container_of(head, struct perf_event, rcu_head); - if (event->ns) - put_pid_ns(event->ns); - perf_event_free_filter(event); - kfree(event); -} - -static void perf_buffer_put(struct perf_buffer *buffer); - -static void free_event(struct perf_event *event) -{ - irq_work_sync(&event->pending); - - if (!event->parent) { - if (event->attach_state & PERF_ATTACH_TASK) - jump_label_dec(&perf_sched_events); - if (event->attr.mmap || event->attr.mmap_data) - atomic_dec(&nr_mmap_events); - if (event->attr.comm) - atomic_dec(&nr_comm_events); - if (event->attr.task) - atomic_dec(&nr_task_events); - if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) - put_callchain_buffers(); - if (is_cgroup_event(event)) { - atomic_dec(&per_cpu(perf_cgroup_events, event->cpu)); - jump_label_dec(&perf_sched_events); - } - } - - if (event->buffer) { - perf_buffer_put(event->buffer); - event->buffer = NULL; - } - - if (is_cgroup_event(event)) - perf_detach_cgroup(event); - - if (event->destroy) - event->destroy(event); - - if (event->ctx) - put_ctx(event->ctx); - - call_rcu(&event->rcu_head, free_event_rcu); -} - -int perf_event_release_kernel(struct perf_event *event) -{ - struct perf_event_context *ctx = event->ctx; - - /* - * Remove from the PMU, can't get re-enabled since we got - * here because the last ref went. - */ - perf_event_disable(event); - - WARN_ON_ONCE(ctx->parent_ctx); - /* - * There are two ways this annotation is useful: - * - * 1) there is a lock recursion from perf_event_exit_task - * see the comment there. - * - * 2) there is a lock-inversion with mmap_sem through - * perf_event_read_group(), which takes faults while - * holding ctx->mutex, however this is called after - * the last filedesc died, so there is no possibility - * to trigger the AB-BA case. - */ - mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING); - raw_spin_lock_irq(&ctx->lock); - perf_group_detach(event); - list_del_event(event, ctx); - raw_spin_unlock_irq(&ctx->lock); - mutex_unlock(&ctx->mutex); - - free_event(event); - - return 0; -} -EXPORT_SYMBOL_GPL(perf_event_release_kernel); - -/* - * Called when the last reference to the file is gone. - */ -static int perf_release(struct inode *inode, struct file *file) -{ - struct perf_event *event = file->private_data; - struct task_struct *owner; - - file->private_data = NULL; - - rcu_read_lock(); - owner = ACCESS_ONCE(event->owner); - /* - * Matches the smp_wmb() in perf_event_exit_task(). If we observe - * !owner it means the list deletion is complete and we can indeed - * free this event, otherwise we need to serialize on - * owner->perf_event_mutex. - */ - smp_read_barrier_depends(); - if (owner) { - /* - * Since delayed_put_task_struct() also drops the last - * task reference we can safely take a new reference - * while holding the rcu_read_lock(). - */ - get_task_struct(owner); - } - rcu_read_unlock(); - - if (owner) { - mutex_lock(&owner->perf_event_mutex); - /* - * We have to re-check the event->owner field, if it is cleared - * we raced with perf_event_exit_task(), acquiring the mutex - * ensured they're done, and we can proceed with freeing the - * event. - */ - if (event->owner) - list_del_init(&event->owner_entry); - mutex_unlock(&owner->perf_event_mutex); - put_task_struct(owner); - } - - return perf_event_release_kernel(event); -} - -u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) -{ - struct perf_event *child; - u64 total = 0; - - *enabled = 0; - *running = 0; - - mutex_lock(&event->child_mutex); - total += perf_event_read(event); - *enabled += event->total_time_enabled + - atomic64_read(&event->child_total_time_enabled); - *running += event->total_time_running + - atomic64_read(&event->child_total_time_running); - - list_for_each_entry(child, &event->child_list, child_list) { - total += perf_event_read(child); - *enabled += child->total_time_enabled; - *running += child->total_time_running; - } - mutex_unlock(&event->child_mutex); - - return total; -} -EXPORT_SYMBOL_GPL(perf_event_read_value); - -static int perf_event_read_group(struct perf_event *event, - u64 read_format, char __user *buf) -{ - struct perf_event *leader = event->group_leader, *sub; - int n = 0, size = 0, ret = -EFAULT; - struct perf_event_context *ctx = leader->ctx; - u64 values[5]; - u64 count, enabled, running; - - mutex_lock(&ctx->mutex); - count = perf_event_read_value(leader, &enabled, &running); - - values[n++] = 1 + leader->nr_siblings; - if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) - values[n++] = enabled; - if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) - values[n++] = running; - values[n++] = count; - if (read_format & PERF_FORMAT_ID) - values[n++] = primary_event_id(leader); - - size = n * sizeof(u64); - - if (copy_to_user(buf, values, size)) - goto unlock; - - ret = size; - - list_for_each_entry(sub, &leader->sibling_list, group_entry) { - n = 0; - - values[n++] = perf_event_read_value(sub, &enabled, &running); - if (read_format & PERF_FORMAT_ID) - values[n++] = primary_event_id(sub); - - size = n * sizeof(u64); - - if (copy_to_user(buf + ret, values, size)) { - ret = -EFAULT; - goto unlock; - } - - ret += size; - } -unlock: - mutex_unlock(&ctx->mutex); - - return ret; -} - -static int perf_event_read_one(struct perf_event *event, - u64 read_format, char __user *buf) -{ - u64 enabled, running; - u64 values[4]; - int n = 0; - - values[n++] = perf_event_read_value(event, &enabled, &running); - if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) - values[n++] = enabled; - if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) - values[n++] = running; - if (read_format & PERF_FORMAT_ID) - values[n++] = primary_event_id(event); - - if (copy_to_user(buf, values, n * sizeof(u64))) - return -EFAULT; - - return n * sizeof(u64); -} - -/* - * Read the performance event - simple non blocking version for now - */ -static ssize_t -perf_read_hw(struct perf_event *event, char __user *buf, size_t count) -{ - u64 read_format = event->attr.read_format; - int ret; - - /* - * Return end-of-file for a read on a event that is in - * error state (i.e. because it was pinned but it couldn't be - * scheduled on to the CPU at some point). - */ - if (event->state == PERF_EVENT_STATE_ERROR) - return 0; - - if (count < event->read_size) - return -ENOSPC; - - WARN_ON_ONCE(event->ctx->parent_ctx); - if (read_format & PERF_FORMAT_GROUP) - ret = perf_event_read_group(event, read_format, buf); - else - ret = perf_event_read_one(event, read_format, buf); - - return ret; -} - -static ssize_t -perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) -{ - struct perf_event *event = file->private_data; - - return perf_read_hw(event, buf, count); -} - -static unsigned int perf_poll(struct file *file, poll_table *wait) -{ - struct perf_event *event = file->private_data; - struct perf_buffer *buffer; - unsigned int events = POLL_HUP; - - rcu_read_lock(); - buffer = rcu_dereference(event->buffer); - if (buffer) - events = atomic_xchg(&buffer->poll, 0); - rcu_read_unlock(); - - poll_wait(file, &event->waitq, wait); - - return events; -} - -static void perf_event_reset(struct perf_event *event) -{ - (void)perf_event_read(event); - local64_set(&event->count, 0); - perf_event_update_userpage(event); -} - -/* - * Holding the top-level event's child_mutex means that any - * descendant process that has inherited this event will block - * in sync_child_event if it goes to exit, thus satisfying the - * task existence requirements of perf_event_enable/disable. - */ -static void perf_event_for_each_child(struct perf_event *event, - void (*func)(struct perf_event *)) -{ - struct perf_event *child; - - WARN_ON_ONCE(event->ctx->parent_ctx); - mutex_lock(&event->child_mutex); - func(event); - list_for_each_entry(child, &event->child_list, child_list) - func(child); - mutex_unlock(&event->child_mutex); -} - -static void perf_event_for_each(struct perf_event *event, - void (*func)(struct perf_event *)) -{ - struct perf_event_context *ctx = event->ctx; - struct perf_event *sibling; - - WARN_ON_ONCE(ctx->parent_ctx); - mutex_lock(&ctx->mutex); - event = event->group_leader; - - perf_event_for_each_child(event, func); - func(event); - list_for_each_entry(sibling, &event->sibling_list, group_entry) - perf_event_for_each_child(event, func); - mutex_unlock(&ctx->mutex); -} - -static int perf_event_period(struct perf_event *event, u64 __user *arg) -{ - struct perf_event_context *ctx = event->ctx; - int ret = 0; - u64 value; - - if (!is_sampling_event(event)) - return -EINVAL; - - if (copy_from_user(&value, arg, sizeof(value))) - return -EFAULT; - - if (!value) - return -EINVAL; - - raw_spin_lock_irq(&ctx->lock); - if (event->attr.freq) { - if (value > sysctl_perf_event_sample_rate) { - ret = -EINVAL; - goto unlock; - } - - event->attr.sample_freq = value; - } else { - event->attr.sample_period = value; - event->hw.sample_period = value; - } -unlock: - raw_spin_unlock_irq(&ctx->lock); - - return ret; -} - -static const struct file_operations perf_fops; - -static struct perf_event *perf_fget_light(int fd, int *fput_needed) -{ - struct file *file; - - file = fget_light(fd, fput_needed); - if (!file) - return ERR_PTR(-EBADF); - - if (file->f_op != &perf_fops) { - fput_light(file, *fput_needed); - *fput_needed = 0; - return ERR_PTR(-EBADF); - } - - return file->private_data; -} - -static int perf_event_set_output(struct perf_event *event, - struct perf_event *output_event); -static int perf_event_set_filter(struct perf_event *event, void __user *arg); - -static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) -{ - struct perf_event *event = file->private_data; - void (*func)(struct perf_event *); - u32 flags = arg; - - switch (cmd) { - case PERF_EVENT_IOC_ENABLE: - func = perf_event_enable; - break; - case PERF_EVENT_IOC_DISABLE: - func = perf_event_disable; - break; - case PERF_EVENT_IOC_RESET: - func = perf_event_reset; - break; - - case PERF_EVENT_IOC_REFRESH: - return perf_event_refresh(event, arg); - - case PERF_EVENT_IOC_PERIOD: - return perf_event_period(event, (u64 __user *)arg); - - case PERF_EVENT_IOC_SET_OUTPUT: - { - struct perf_event *output_event = NULL; - int fput_needed = 0; - int ret; - - if (arg != -1) { - output_event = perf_fget_light(arg, &fput_needed); - if (IS_ERR(output_event)) - return PTR_ERR(output_event); - } - - ret = perf_event_set_output(event, output_event); - if (output_event) - fput_light(output_event->filp, fput_needed); - - return ret; - } - - case PERF_EVENT_IOC_SET_FILTER: - return perf_event_set_filter(event, (void __user *)arg); - - default: - return -ENOTTY; - } - - if (flags & PERF_IOC_FLAG_GROUP) - perf_event_for_each(event, func); - else - perf_event_for_each_child(event, func); - - return 0; -} - -int perf_event_task_enable(void) -{ - struct perf_event *event; - - mutex_lock(¤t->perf_event_mutex); - list_for_each_entry(event, ¤t->perf_event_list, owner_entry) - perf_event_for_each_child(event, perf_event_enable); - mutex_unlock(¤t->perf_event_mutex); - - return 0; -} - -int perf_event_task_disable(void) -{ - struct perf_event *event; - - mutex_lock(¤t->perf_event_mutex); - list_for_each_entry(event, ¤t->perf_event_list, owner_entry) - perf_event_for_each_child(event, perf_event_disable); - mutex_unlock(¤t->perf_event_mutex); - - return 0; -} - -#ifndef PERF_EVENT_INDEX_OFFSET -# define PERF_EVENT_INDEX_OFFSET 0 -#endif - -static int perf_event_index(struct perf_event *event) -{ - if (event->hw.state & PERF_HES_STOPPED) - return 0; - - if (event->state != PERF_EVENT_STATE_ACTIVE) - return 0; - - return event->hw.idx + 1 - PERF_EVENT_INDEX_OFFSET; -} - -/* - * Callers need to ensure there can be no nesting of this function, otherwise - * the seqlock logic goes bad. We can not serialize this because the arch - * code calls this from NMI context. - */ -void perf_event_update_userpage(struct perf_event *event) -{ - struct perf_event_mmap_page *userpg; - struct perf_buffer *buffer; - - rcu_read_lock(); - buffer = rcu_dereference(event->buffer); - if (!buffer) - goto unlock; - - userpg = buffer->user_page; - - /* - * Disable preemption so as to not let the corresponding user-space - * spin too long if we get preempted. - */ - preempt_disable(); - ++userpg->lock; - barrier(); - userpg->index = perf_event_index(event); - userpg->offset = perf_event_count(event); - if (event->state == PERF_EVENT_STATE_ACTIVE) - userpg->offset -= local64_read(&event->hw.prev_count); - - userpg->time_enabled = event->total_time_enabled + - atomic64_read(&event->child_total_time_enabled); - - userpg->time_running = event->total_time_running + - atomic64_read(&event->child_total_time_running); - - barrier(); - ++userpg->lock; - preempt_enable(); -unlock: - rcu_read_unlock(); -} - -static unsigned long perf_data_size(struct perf_buffer *buffer); - -static void -perf_buffer_init(struct perf_buffer *buffer, long watermark, int flags) -{ - long max_size = perf_data_size(buffer); - - if (watermark) - buffer->watermark = min(max_size, watermark); - - if (!buffer->watermark) - buffer->watermark = max_size / 2; - - if (flags & PERF_BUFFER_WRITABLE) - buffer->writable = 1; - - atomic_set(&buffer->refcount, 1); -} - -#ifndef CONFIG_PERF_USE_VMALLOC - -/* - * Back perf_mmap() with regular GFP_KERNEL-0 pages. - */ - -static struct page * -perf_mmap_to_page(struct perf_buffer *buffer, unsigned long pgoff) -{ - if (pgoff > buffer->nr_pages) - return NULL; - - if (pgoff == 0) - return virt_to_page(buffer->user_page); - - return virt_to_page(buffer->data_pages[pgoff - 1]); -} - -static void *perf_mmap_alloc_page(int cpu) -{ - struct page *page; - int node; - - node = (cpu == -1) ? cpu : cpu_to_node(cpu); - page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); - if (!page) - return NULL; - - return page_address(page); -} - -static struct perf_buffer * -perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags) -{ - struct perf_buffer *buffer; - unsigned long size; - int i; - - size = sizeof(struct perf_buffer); - size += nr_pages * sizeof(void *); - - buffer = kzalloc(size, GFP_KERNEL); - if (!buffer) - goto fail; - - buffer->user_page = perf_mmap_alloc_page(cpu); - if (!buffer->user_page) - goto fail_user_page; - - for (i = 0; i < nr_pages; i++) { - buffer->data_pages[i] = perf_mmap_alloc_page(cpu); - if (!buffer->data_pages[i]) - goto fail_data_pages; - } - - buffer->nr_pages = nr_pages; - - perf_buffer_init(buffer, watermark, flags); - - return buffer; - -fail_data_pages: - for (i--; i >= 0; i--) - free_page((unsigned long)buffer->data_pages[i]); - - free_page((unsigned long)buffer->user_page); - -fail_user_page: - kfree(buffer); - -fail: - return NULL; -} - -static void perf_mmap_free_page(unsigned long addr) -{ - struct page *page = virt_to_page((void *)addr); - - page->mapping = NULL; - __free_page(page); -} - -static void perf_buffer_free(struct perf_buffer *buffer) -{ - int i; - - perf_mmap_free_page((unsigned long)buffer->user_page); - for (i = 0; i < buffer->nr_pages; i++) - perf_mmap_free_page((unsigned long)buffer->data_pages[i]); - kfree(buffer); -} - -static inline int page_order(struct perf_buffer *buffer) -{ - return 0; -} - -#else - -/* - * Back perf_mmap() with vmalloc memory. - * - * Required for architectures that have d-cache aliasing issues. - */ - -static inline int page_order(struct perf_buffer *buffer) -{ - return buffer->page_order; -} - -static struct page * -perf_mmap_to_page(struct perf_buffer *buffer, unsigned long pgoff) -{ - if (pgoff > (1UL << page_order(buffer))) - return NULL; - - return vmalloc_to_page((void *)buffer->user_page + pgoff * PAGE_SIZE); -} - -static void perf_mmap_unmark_page(void *addr) -{ - struct page *page = vmalloc_to_page(addr); - - page->mapping = NULL; -} - -static void perf_buffer_free_work(struct work_struct *work) -{ - struct perf_buffer *buffer; - void *base; - int i, nr; - - buffer = container_of(work, struct perf_buffer, work); - nr = 1 << page_order(buffer); - - base = buffer->user_page; - for (i = 0; i < nr + 1; i++) - perf_mmap_unmark_page(base + (i * PAGE_SIZE)); - - vfree(base); - kfree(buffer); -} - -static void perf_buffer_free(struct perf_buffer *buffer) -{ - schedule_work(&buffer->work); -} - -static struct perf_buffer * -perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags) -{ - struct perf_buffer *buffer; - unsigned long size; - void *all_buf; - - size = sizeof(struct perf_buffer); - size += sizeof(void *); - - buffer = kzalloc(size, GFP_KERNEL); - if (!buffer) - goto fail; - - INIT_WORK(&buffer->work, perf_buffer_free_work); - - all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE); - if (!all_buf) - goto fail_all_buf; - - buffer->user_page = all_buf; - buffer->data_pages[0] = all_buf + PAGE_SIZE; - buffer->page_order = ilog2(nr_pages); - buffer->nr_pages = 1; - - perf_buffer_init(buffer, watermark, flags); - - return buffer; - -fail_all_buf: - kfree(buffer); - -fail: - return NULL; -} - -#endif - -static unsigned long perf_data_size(struct perf_buffer *buffer) -{ - return buffer->nr_pages << (PAGE_SHIFT + page_order(buffer)); -} - -static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) -{ - struct perf_event *event = vma->vm_file->private_data; - struct perf_buffer *buffer; - int ret = VM_FAULT_SIGBUS; - - if (vmf->flags & FAULT_FLAG_MKWRITE) { - if (vmf->pgoff == 0) - ret = 0; - return ret; - } - - rcu_read_lock(); - buffer = rcu_dereference(event->buffer); - if (!buffer) - goto unlock; - - if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE)) - goto unlock; - - vmf->page = perf_mmap_to_page(buffer, vmf->pgoff); - if (!vmf->page) - goto unlock; - - get_page(vmf->page); - vmf->page->mapping = vma->vm_file->f_mapping; - vmf->page->index = vmf->pgoff; - - ret = 0; -unlock: - rcu_read_unlock(); - - return ret; -} - -static void perf_buffer_free_rcu(struct rcu_head *rcu_head) -{ - struct perf_buffer *buffer; - - buffer = container_of(rcu_head, struct perf_buffer, rcu_head); - perf_buffer_free(buffer); -} - -static struct perf_buffer *perf_buffer_get(struct perf_event *event) -{ - struct perf_buffer *buffer; - - rcu_read_lock(); - buffer = rcu_dereference(event->buffer); - if (buffer) { - if (!atomic_inc_not_zero(&buffer->refcount)) - buffer = NULL; - } - rcu_read_unlock(); - - return buffer; -} - -static void perf_buffer_put(struct perf_buffer *buffer) -{ - if (!atomic_dec_and_test(&buffer->refcount)) - return; - - call_rcu(&buffer->rcu_head, perf_buffer_free_rcu); -} - -static void perf_mmap_open(struct vm_area_struct *vma) -{ - struct perf_event *event = vma->vm_file->private_data; - - atomic_inc(&event->mmap_count); -} - -static void perf_mmap_close(struct vm_area_struct *vma) -{ - struct perf_event *event = vma->vm_file->private_data; - - if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) { - unsigned long size = perf_data_size(event->buffer); - struct user_struct *user = event->mmap_user; - struct perf_buffer *buffer = event->buffer; - - atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm); - vma->vm_mm->locked_vm -= event->mmap_locked; - rcu_assign_pointer(event->buffer, NULL); - mutex_unlock(&event->mmap_mutex); - - perf_buffer_put(buffer); - free_uid(user); - } -} - -static const struct vm_operations_struct perf_mmap_vmops = { - .open = perf_mmap_open, - .close = perf_mmap_close, - .fault = perf_mmap_fault, - .page_mkwrite = perf_mmap_fault, -}; - -static int perf_mmap(struct file *file, struct vm_area_struct *vma) -{ - struct perf_event *event = file->private_data; - unsigned long user_locked, user_lock_limit; - struct user_struct *user = current_user(); - unsigned long locked, lock_limit; - struct perf_buffer *buffer; - unsigned long vma_size; - unsigned long nr_pages; - long user_extra, extra; - int ret = 0, flags = 0; - - /* - * Don't allow mmap() of inherited per-task counters. This would - * create a performance issue due to all children writing to the - * same buffer. - */ - if (event->cpu == -1 && event->attr.inherit) - return -EINVAL; - - if (!(vma->vm_flags & VM_SHARED)) - return -EINVAL; - - vma_size = vma->vm_end - vma->vm_start; - nr_pages = (vma_size / PAGE_SIZE) - 1; - - /* - * If we have buffer pages ensure they're a power-of-two number, so we - * can do bitmasks instead of modulo. - */ - if (nr_pages != 0 && !is_power_of_2(nr_pages)) - return -EINVAL; - - if (vma_size != PAGE_SIZE * (1 + nr_pages)) - return -EINVAL; - - if (vma->vm_pgoff != 0) - return -EINVAL; - - WARN_ON_ONCE(event->ctx->parent_ctx); - mutex_lock(&event->mmap_mutex); - if (event->buffer) { - if (event->buffer->nr_pages == nr_pages) - atomic_inc(&event->buffer->refcount); - else - ret = -EINVAL; - goto unlock; - } - - user_extra = nr_pages + 1; - user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10); - - /* - * Increase the limit linearly with more CPUs: - */ - user_lock_limit *= num_online_cpus(); - - user_locked = atomic_long_read(&user->locked_vm) + user_extra; - - extra = 0; - if (user_locked > user_lock_limit) - extra = user_locked - user_lock_limit; - - lock_limit = rlimit(RLIMIT_MEMLOCK); - lock_limit >>= PAGE_SHIFT; - locked = vma->vm_mm->locked_vm + extra; - - if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() && - !capable(CAP_IPC_LOCK)) { - ret = -EPERM; - goto unlock; - } - - WARN_ON(event->buffer); - - if (vma->vm_flags & VM_WRITE) - flags |= PERF_BUFFER_WRITABLE; - - buffer = perf_buffer_alloc(nr_pages, event->attr.wakeup_watermark, - event->cpu, flags); - if (!buffer) { - ret = -ENOMEM; - goto unlock; - } - rcu_assign_pointer(event->buffer, buffer); - - atomic_long_add(user_extra, &user->locked_vm); - event->mmap_locked = extra; - event->mmap_user = get_current_user(); - vma->vm_mm->locked_vm += event->mmap_locked; - -unlock: - if (!ret) - atomic_inc(&event->mmap_count); - mutex_unlock(&event->mmap_mutex); - - vma->vm_flags |= VM_RESERVED; - vma->vm_ops = &perf_mmap_vmops; - - return ret; -} - -static int perf_fasync(int fd, struct file *filp, int on) -{ - struct inode *inode = filp->f_path.dentry->d_inode; - struct perf_event *event = filp->private_data; - int retval; - - mutex_lock(&inode->i_mutex); - retval = fasync_helper(fd, filp, on, &event->fasync); - mutex_unlock(&inode->i_mutex); - - if (retval < 0) - return retval; - - return 0; -} - -static const struct file_operations perf_fops = { - .llseek = no_llseek, - .release = perf_release, - .read = perf_read, - .poll = perf_poll, - .unlocked_ioctl = perf_ioctl, - .compat_ioctl = perf_ioctl, - .mmap = perf_mmap, - .fasync = perf_fasync, -}; - -/* - * Perf event wakeup - * - * If there's data, ensure we set the poll() state and publish everything - * to user-space before waking everybody up. - */ - -void perf_event_wakeup(struct perf_event *event) -{ - wake_up_all(&event->waitq); - - if (event->pending_kill) { - kill_fasync(&event->fasync, SIGIO, event->pending_kill); - event->pending_kill = 0; - } -} - -static void perf_pending_event(struct irq_work *entry) -{ - struct perf_event *event = container_of(entry, - struct perf_event, pending); - - if (event->pending_disable) { - event->pending_disable = 0; - __perf_event_disable(event); - } - - if (event->pending_wakeup) { - event->pending_wakeup = 0; - perf_event_wakeup(event); - } -} - -/* - * We assume there is only KVM supporting the callbacks. - * Later on, we might change it to a list if there is - * another virtualization implementation supporting the callbacks. - */ -struct perf_guest_info_callbacks *perf_guest_cbs; - -int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) -{ - perf_guest_cbs = cbs; - return 0; -} -EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks); - -int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) -{ - perf_guest_cbs = NULL; - return 0; -} -EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks); - -/* - * Output - */ -static bool perf_output_space(struct perf_buffer *buffer, unsigned long tail, - unsigned long offset, unsigned long head) -{ - unsigned long mask; - - if (!buffer->writable) - return true; - - mask = perf_data_size(buffer) - 1; - - offset = (offset - tail) & mask; - head = (head - tail) & mask; - - if ((int)(head - offset) < 0) - return false; - - return true; -} - -static void perf_output_wakeup(struct perf_output_handle *handle) -{ - atomic_set(&handle->buffer->poll, POLL_IN); - - if (handle->nmi) { - handle->event->pending_wakeup = 1; - irq_work_queue(&handle->event->pending); - } else - perf_event_wakeup(handle->event); -} - -/* - * We need to ensure a later event_id doesn't publish a head when a former - * event isn't done writing. However since we need to deal with NMIs we - * cannot fully serialize things. - * - * We only publish the head (and generate a wakeup) when the outer-most - * event completes. - */ -static void perf_output_get_handle(struct perf_output_handle *handle) -{ - struct perf_buffer *buffer = handle->buffer; - - preempt_disable(); - local_inc(&buffer->nest); - handle->wakeup = local_read(&buffer->wakeup); -} - -static void perf_output_put_handle(struct perf_output_handle *handle) -{ - struct perf_buffer *buffer = handle->buffer; - unsigned long head; - -again: - head = local_read(&buffer->head); - - /* - * IRQ/NMI can happen here, which means we can miss a head update. - */ - - if (!local_dec_and_test(&buffer->nest)) - goto out; - - /* - * Publish the known good head. Rely on the full barrier implied - * by atomic_dec_and_test() order the buffer->head read and this - * write. - */ - buffer->user_page->data_head = head; - - /* - * Now check if we missed an update, rely on the (compiler) - * barrier in atomic_dec_and_test() to re-read buffer->head. - */ - if (unlikely(head != local_read(&buffer->head))) { - local_inc(&buffer->nest); - goto again; - } - - if (handle->wakeup != local_read(&buffer->wakeup)) - perf_output_wakeup(handle); - -out: - preempt_enable(); -} - -__always_inline void perf_output_copy(struct perf_output_handle *handle, - const void *buf, unsigned int len) -{ - do { - unsigned long size = min_t(unsigned long, handle->size, len); - - memcpy(handle->addr, buf, size); - - len -= size; - handle->addr += size; - buf += size; - handle->size -= size; - if (!handle->size) { - struct perf_buffer *buffer = handle->buffer; - - handle->page++; - handle->page &= buffer->nr_pages - 1; - handle->addr = buffer->data_pages[handle->page]; - handle->size = PAGE_SIZE << page_order(buffer); - } - } while (len); -} - -static void __perf_event_header__init_id(struct perf_event_header *header, - struct perf_sample_data *data, - struct perf_event *event) -{ - u64 sample_type = event->attr.sample_type; - - data->type = sample_type; - header->size += event->id_header_size; - - if (sample_type & PERF_SAMPLE_TID) { - /* namespace issues */ - data->tid_entry.pid = perf_event_pid(event, current); - data->tid_entry.tid = perf_event_tid(event, current); - } - - if (sample_type & PERF_SAMPLE_TIME) - data->time = perf_clock(); - - if (sample_type & PERF_SAMPLE_ID) - data->id = primary_event_id(event); - - if (sample_type & PERF_SAMPLE_STREAM_ID) - data->stream_id = event->id; - - if (sample_type & PERF_SAMPLE_CPU) { - data->cpu_entry.cpu = raw_smp_processor_id(); - data->cpu_entry.reserved = 0; - } -} - -static void perf_event_header__init_id(struct perf_event_header *header, - struct perf_sample_data *data, - struct perf_event *event) -{ - if (event->attr.sample_id_all) - __perf_event_header__init_id(header, data, event); -} - -static void __perf_event__output_id_sample(struct perf_output_handle *handle, - struct perf_sample_data *data) -{ - u64 sample_type = data->type; - - if (sample_type & PERF_SAMPLE_TID) - perf_output_put(handle, data->tid_entry); - - if (sample_type & PERF_SAMPLE_TIME) - perf_output_put(handle, data->time); - - if (sample_type & PERF_SAMPLE_ID) - perf_output_put(handle, data->id); - - if (sample_type & PERF_SAMPLE_STREAM_ID) - perf_output_put(handle, data->stream_id); - - if (sample_type & PERF_SAMPLE_CPU) - perf_output_put(handle, data->cpu_entry); -} - -static void perf_event__output_id_sample(struct perf_event *event, - struct perf_output_handle *handle, - struct perf_sample_data *sample) -{ - if (event->attr.sample_id_all) - __perf_event__output_id_sample(handle, sample); -} - -int perf_output_begin(struct perf_output_handle *handle, - struct perf_event *event, unsigned int size, - int nmi, int sample) -{ - struct perf_buffer *buffer; - unsigned long tail, offset, head; - int have_lost; - struct perf_sample_data sample_data; - struct { - struct perf_event_header header; - u64 id; - u64 lost; - } lost_event; - - rcu_read_lock(); - /* - * For inherited events we send all the output towards the parent. - */ - if (event->parent) - event = event->parent; - - buffer = rcu_dereference(event->buffer); - if (!buffer) - goto out; - - handle->buffer = buffer; - handle->event = event; - handle->nmi = nmi; - handle->sample = sample; - - if (!buffer->nr_pages) - goto out; - - have_lost = local_read(&buffer->lost); - if (have_lost) { - lost_event.header.size = sizeof(lost_event); - perf_event_header__init_id(&lost_event.header, &sample_data, - event); - size += lost_event.header.size; - } - - perf_output_get_handle(handle); - - do { - /* - * Userspace could choose to issue a mb() before updating the - * tail pointer. So that all reads will be completed before the - * write is issued. - */ - tail = ACCESS_ONCE(buffer->user_page->data_tail); - smp_rmb(); - offset = head = local_read(&buffer->head); - head += size; - if (unlikely(!perf_output_space(buffer, tail, offset, head))) - goto fail; - } while (local_cmpxchg(&buffer->head, offset, head) != offset); - - if (head - local_read(&buffer->wakeup) > buffer->watermark) - local_add(buffer->watermark, &buffer->wakeup); - - handle->page = offset >> (PAGE_SHIFT + page_order(buffer)); - handle->page &= buffer->nr_pages - 1; - handle->size = offset & ((PAGE_SIZE << page_order(buffer)) - 1); - handle->addr = buffer->data_pages[handle->page]; - handle->addr += handle->size; - handle->size = (PAGE_SIZE << page_order(buffer)) - handle->size; - - if (have_lost) { - lost_event.header.type = PERF_RECORD_LOST; - lost_event.header.misc = 0; - lost_event.id = event->id; - lost_event.lost = local_xchg(&buffer->lost, 0); - - perf_output_put(handle, lost_event); - perf_event__output_id_sample(event, handle, &sample_data); - } - - return 0; - -fail: - local_inc(&buffer->lost); - perf_output_put_handle(handle); -out: - rcu_read_unlock(); - - return -ENOSPC; -} - -void perf_output_end(struct perf_output_handle *handle) -{ - struct perf_event *event = handle->event; - struct perf_buffer *buffer = handle->buffer; - - int wakeup_events = event->attr.wakeup_events; - - if (handle->sample && wakeup_events) { - int events = local_inc_return(&buffer->events); - if (events >= wakeup_events) { - local_sub(wakeup_events, &buffer->events); - local_inc(&buffer->wakeup); - } - } - - perf_output_put_handle(handle); - rcu_read_unlock(); -} - -static void perf_output_read_one(struct perf_output_handle *handle, - struct perf_event *event, - u64 enabled, u64 running) -{ - u64 read_format = event->attr.read_format; - u64 values[4]; - int n = 0; - - values[n++] = perf_event_count(event); - if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { - values[n++] = enabled + - atomic64_read(&event->child_total_time_enabled); - } - if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { - values[n++] = running + - atomic64_read(&event->child_total_time_running); - } - if (read_format & PERF_FORMAT_ID) - values[n++] = primary_event_id(event); - - perf_output_copy(handle, values, n * sizeof(u64)); -} - -/* - * XXX PERF_FORMAT_GROUP vs inherited events seems difficult. - */ -static void perf_output_read_group(struct perf_output_handle *handle, - struct perf_event *event, - u64 enabled, u64 running) -{ - struct perf_event *leader = event->group_leader, *sub; - u64 read_format = event->attr.read_format; - u64 values[5]; - int n = 0; - - values[n++] = 1 + leader->nr_siblings; - - if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) - values[n++] = enabled; - - if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) - values[n++] = running; - - if (leader != event) - leader->pmu->read(leader); - - values[n++] = perf_event_count(leader); - if (read_format & PERF_FORMAT_ID) - values[n++] = primary_event_id(leader); - - perf_output_copy(handle, values, n * sizeof(u64)); - - list_for_each_entry(sub, &leader->sibling_list, group_entry) { - n = 0; - - if (sub != event) - sub->pmu->read(sub); - - values[n++] = perf_event_count(sub); - if (read_format & PERF_FORMAT_ID) - values[n++] = primary_event_id(sub); - - perf_output_copy(handle, values, n * sizeof(u64)); - } -} - -#define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\ - PERF_FORMAT_TOTAL_TIME_RUNNING) - -static void perf_output_read(struct perf_output_handle *handle, - struct perf_event *event) -{ - u64 enabled = 0, running = 0, now, ctx_time; - u64 read_format = event->attr.read_format; - - /* - * compute total_time_enabled, total_time_running - * based on snapshot values taken when the event - * was last scheduled in. - * - * we cannot simply called update_context_time() - * because of locking issue as we are called in - * NMI context - */ - if (read_format & PERF_FORMAT_TOTAL_TIMES) { - now = perf_clock(); - ctx_time = event->shadow_ctx_time + now; - enabled = ctx_time - event->tstamp_enabled; - running = ctx_time - event->tstamp_running; - } - - if (event->attr.read_format & PERF_FORMAT_GROUP) - perf_output_read_group(handle, event, enabled, running); - else - perf_output_read_one(handle, event, enabled, running); -} - -void perf_output_sample(struct perf_output_handle *handle, - struct perf_event_header *header, - struct perf_sample_data *data, - struct perf_event *event) -{ - u64 sample_type = data->type; - - perf_output_put(handle, *header); - - if (sample_type & PERF_SAMPLE_IP) - perf_output_put(handle, data->ip); - - if (sample_type & PERF_SAMPLE_TID) - perf_output_put(handle, data->tid_entry); - - if (sample_type & PERF_SAMPLE_TIME) - perf_output_put(handle, data->time); - - if (sample_type & PERF_SAMPLE_ADDR) - perf_output_put(handle, data->addr); - - if (sample_type & PERF_SAMPLE_ID) - perf_output_put(handle, data->id); - - if (sample_type & PERF_SAMPLE_STREAM_ID) - perf_output_put(handle, data->stream_id); - - if (sample_type & PERF_SAMPLE_CPU) - perf_output_put(handle, data->cpu_entry); - - if (sample_type & PERF_SAMPLE_PERIOD) - perf_output_put(handle, data->period); - - if (sample_type & PERF_SAMPLE_READ) - perf_output_read(handle, event); - - if (sample_type & PERF_SAMPLE_CALLCHAIN) { - if (data->callchain) { - int size = 1; - - if (data->callchain) - size += data->callchain->nr; - - size *= sizeof(u64); - - perf_output_copy(handle, data->callchain, size); - } else { - u64 nr = 0; - perf_output_put(handle, nr); - } - } - - if (sample_type & PERF_SAMPLE_RAW) { - if (data->raw) { - perf_output_put(handle, data->raw->size); - perf_output_copy(handle, data->raw->data, - data->raw->size); - } else { - struct { - u32 size; - u32 data; - } raw = { - .size = sizeof(u32), - .data = 0, - }; - perf_output_put(handle, raw); - } - } -} - -void perf_prepare_sample(struct perf_event_header *header, - struct perf_sample_data *data, - struct perf_event *event, - struct pt_regs *regs) -{ - u64 sample_type = event->attr.sample_type; - - header->type = PERF_RECORD_SAMPLE; - header->size = sizeof(*header) + event->header_size; - - header->misc = 0; - header->misc |= perf_misc_flags(regs); - - __perf_event_header__init_id(header, data, event); - - if (sample_type & PERF_SAMPLE_IP) - data->ip = perf_instruction_pointer(regs); - - if (sample_type & PERF_SAMPLE_CALLCHAIN) { - int size = 1; - - data->callchain = perf_callchain(regs); - - if (data->callchain) - size += data->callchain->nr; - - header->size += size * sizeof(u64); - } - - if (sample_type & PERF_SAMPLE_RAW) { - int size = sizeof(u32); - - if (data->raw) - size += data->raw->size; - else - size += sizeof(u32); - - WARN_ON_ONCE(size & (sizeof(u64)-1)); - header->size += size; - } -} - -static void perf_event_output(struct perf_event *event, int nmi, - struct perf_sample_data *data, - struct pt_regs *regs) -{ - struct perf_output_handle handle; - struct perf_event_header header; - - /* protect the callchain buffers */ - rcu_read_lock(); - - perf_prepare_sample(&header, data, event, regs); - - if (perf_output_begin(&handle, event, header.size, nmi, 1)) - goto exit; - - perf_output_sample(&handle, &header, data, event); - - perf_output_end(&handle); - -exit: - rcu_read_unlock(); -} - -/* - * read event_id - */ - -struct perf_read_event { - struct perf_event_header header; - - u32 pid; - u32 tid; -}; - -static void -perf_event_read_event(struct perf_event *event, - struct task_struct *task) -{ - struct perf_output_handle handle; - struct perf_sample_data sample; - struct perf_read_event read_event = { - .header = { - .type = PERF_RECORD_READ, - .misc = 0, - .size = sizeof(read_event) + event->read_size, - }, - .pid = perf_event_pid(event, task), - .tid = perf_event_tid(event, task), - }; - int ret; - - perf_event_header__init_id(&read_event.header, &sample, event); - ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0); - if (ret) - return; - - perf_output_put(&handle, read_event); - perf_output_read(&handle, event); - perf_event__output_id_sample(event, &handle, &sample); - - perf_output_end(&handle); -} - -/* - * task tracking -- fork/exit - * - * enabled by: attr.comm | attr.mmap | attr.mmap_data | attr.task - */ - -struct perf_task_event { - struct task_struct *task; - struct perf_event_context *task_ctx; - - struct { - struct perf_event_header header; - - u32 pid; - u32 ppid; - u32 tid; - u32 ptid; - u64 time; - } event_id; -}; - -static void perf_event_task_output(struct perf_event *event, - struct perf_task_event *task_event) -{ - struct perf_output_handle handle; - struct perf_sample_data sample; - struct task_struct *task = task_event->task; - int ret, size = task_event->event_id.header.size; - - perf_event_header__init_id(&task_event->event_id.header, &sample, event); - - ret = perf_output_begin(&handle, event, - task_event->event_id.header.size, 0, 0); - if (ret) - goto out; - - task_event->event_id.pid = perf_event_pid(event, task); - task_event->event_id.ppid = perf_event_pid(event, current); - - task_event->event_id.tid = perf_event_tid(event, task); - task_event->event_id.ptid = perf_event_tid(event, current); - - perf_output_put(&handle, task_event->event_id); - - perf_event__output_id_sample(event, &handle, &sample); - - perf_output_end(&handle); -out: - task_event->event_id.header.size = size; -} - -static int perf_event_task_match(struct perf_event *event) -{ - if (event->state < PERF_EVENT_STATE_INACTIVE) - return 0; - - if (!event_filter_match(event)) - return 0; - - if (event->attr.comm || event->attr.mmap || - event->attr.mmap_data || event->attr.task) - return 1; - - return 0; -} - -static void perf_event_task_ctx(struct perf_event_context *ctx, - struct perf_task_event *task_event) -{ - struct perf_event *event; - - list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { - if (perf_event_task_match(event)) - perf_event_task_output(event, task_event); - } -} - -static void perf_event_task_event(struct perf_task_event *task_event) -{ - struct perf_cpu_context *cpuctx; - struct perf_event_context *ctx; - struct pmu *pmu; - int ctxn; - - rcu_read_lock(); - list_for_each_entry_rcu(pmu, &pmus, entry) { - cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); - if (cpuctx->active_pmu != pmu) - goto next; - perf_event_task_ctx(&cpuctx->ctx, task_event); - - ctx = task_event->task_ctx; - if (!ctx) { - ctxn = pmu->task_ctx_nr; - if (ctxn < 0) - goto next; - ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); - } - if (ctx) - perf_event_task_ctx(ctx, task_event); -next: - put_cpu_ptr(pmu->pmu_cpu_context); - } - rcu_read_unlock(); -} - -static void perf_event_task(struct task_struct *task, - struct perf_event_context *task_ctx, - int new) -{ - struct perf_task_event task_event; - - if (!atomic_read(&nr_comm_events) && - !atomic_read(&nr_mmap_events) && - !atomic_read(&nr_task_events)) - return; - - task_event = (struct perf_task_event){ - .task = task, - .task_ctx = task_ctx, - .event_id = { - .header = { - .type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT, - .misc = 0, - .size = sizeof(task_event.event_id), - }, - /* .pid */ - /* .ppid */ - /* .tid */ - /* .ptid */ - .time = perf_clock(), - }, - }; - - perf_event_task_event(&task_event); -} - -void perf_event_fork(struct task_struct *task) -{ - perf_event_task(task, NULL, 1); -} - -/* - * comm tracking - */ - -struct perf_comm_event { - struct task_struct *task; - char *comm; - int comm_size; - - struct { - struct perf_event_header header; - - u32 pid; - u32 tid; - } event_id; -}; - -static void perf_event_comm_output(struct perf_event *event, - struct perf_comm_event *comm_event) -{ - struct perf_output_handle handle; - struct perf_sample_data sample; - int size = comm_event->event_id.header.size; - int ret; - - perf_event_header__init_id(&comm_event->event_id.header, &sample, event); - ret = perf_output_begin(&handle, event, - comm_event->event_id.header.size, 0, 0); - - if (ret) - goto out; - - comm_event->event_id.pid = perf_event_pid(event, comm_event->task); - comm_event->event_id.tid = perf_event_tid(event, comm_event->task); - - perf_output_put(&handle, comm_event->event_id); - perf_output_copy(&handle, comm_event->comm, - comm_event->comm_size); - - perf_event__output_id_sample(event, &handle, &sample); - - perf_output_end(&handle); -out: - comm_event->event_id.header.size = size; -} - -static int perf_event_comm_match(struct perf_event *event) -{ - if (event->state < PERF_EVENT_STATE_INACTIVE) - return 0; - - if (!event_filter_match(event)) - return 0; - - if (event->attr.comm) - return 1; - - return 0; -} - -static void perf_event_comm_ctx(struct perf_event_context *ctx, - struct perf_comm_event *comm_event) -{ - struct perf_event *event; - - list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { - if (perf_event_comm_match(event)) - perf_event_comm_output(event, comm_event); - } -} - -static void perf_event_comm_event(struct perf_comm_event *comm_event) -{ - struct perf_cpu_context *cpuctx; - struct perf_event_context *ctx; - char comm[TASK_COMM_LEN]; - unsigned int size; - struct pmu *pmu; - int ctxn; - - memset(comm, 0, sizeof(comm)); - strlcpy(comm, comm_event->task->comm, sizeof(comm)); - size = ALIGN(strlen(comm)+1, sizeof(u64)); - - comm_event->comm = comm; - comm_event->comm_size = size; - - comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; - rcu_read_lock(); - list_for_each_entry_rcu(pmu, &pmus, entry) { - cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); - if (cpuctx->active_pmu != pmu) - goto next; - perf_event_comm_ctx(&cpuctx->ctx, comm_event); - - ctxn = pmu->task_ctx_nr; - if (ctxn < 0) - goto next; - - ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); - if (ctx) - perf_event_comm_ctx(ctx, comm_event); -next: - put_cpu_ptr(pmu->pmu_cpu_context); - } - rcu_read_unlock(); -} - -void perf_event_comm(struct task_struct *task) -{ - struct perf_comm_event comm_event; - struct perf_event_context *ctx; - int ctxn; - - for_each_task_context_nr(ctxn) { - ctx = task->perf_event_ctxp[ctxn]; - if (!ctx) - continue; - - perf_event_enable_on_exec(ctx); - } - - if (!atomic_read(&nr_comm_events)) - return; - - comm_event = (struct perf_comm_event){ - .task = task, - /* .comm */ - /* .comm_size */ - .event_id = { - .header = { - .type = PERF_RECORD_COMM, - .misc = 0, - /* .size */ - }, - /* .pid */ - /* .tid */ - }, - }; - - perf_event_comm_event(&comm_event); -} - -/* - * mmap tracking - */ - -struct perf_mmap_event { - struct vm_area_struct *vma; - - const char *file_name; - int file_size; - - struct { - struct perf_event_header header; - - u32 pid; - u32 tid; - u64 start; - u64 len; - u64 pgoff; - } event_id; -}; - -static void perf_event_mmap_output(struct perf_event *event, - struct perf_mmap_event *mmap_event) -{ - struct perf_output_handle handle; - struct perf_sample_data sample; - int size = mmap_event->event_id.header.size; - int ret; - - perf_event_header__init_id(&mmap_event->event_id.header, &sample, event); - ret = perf_output_begin(&handle, event, - mmap_event->event_id.header.size, 0, 0); - if (ret) - goto out; - - mmap_event->event_id.pid = perf_event_pid(event, current); - mmap_event->event_id.tid = perf_event_tid(event, current); - - perf_output_put(&handle, mmap_event->event_id); - perf_output_copy(&handle, mmap_event->file_name, - mmap_event->file_size); - - perf_event__output_id_sample(event, &handle, &sample); - - perf_output_end(&handle); -out: - mmap_event->event_id.header.size = size; -} - -static int perf_event_mmap_match(struct perf_event *event, - struct perf_mmap_event *mmap_event, - int executable) -{ - if (event->state < PERF_EVENT_STATE_INACTIVE) - return 0; - - if (!event_filter_match(event)) - return 0; - - if ((!executable && event->attr.mmap_data) || - (executable && event->attr.mmap)) - return 1; - - return 0; -} - -static void perf_event_mmap_ctx(struct perf_event_context *ctx, - struct perf_mmap_event *mmap_event, - int executable) -{ - struct perf_event *event; - - list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { - if (perf_event_mmap_match(event, mmap_event, executable)) - perf_event_mmap_output(event, mmap_event); - } -} - -static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) -{ - struct perf_cpu_context *cpuctx; - struct perf_event_context *ctx; - struct vm_area_struct *vma = mmap_event->vma; - struct file *file = vma->vm_file; - unsigned int size; - char tmp[16]; - char *buf = NULL; - const char *name; - struct pmu *pmu; - int ctxn; - - memset(tmp, 0, sizeof(tmp)); - - if (file) { - /* - * d_path works from the end of the buffer backwards, so we - * need to add enough zero bytes after the string to handle - * the 64bit alignment we do later. - */ - buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL); - if (!buf) { - name = strncpy(tmp, "//enomem", sizeof(tmp)); - goto got_name; - } - name = d_path(&file->f_path, buf, PATH_MAX); - if (IS_ERR(name)) { - name = strncpy(tmp, "//toolong", sizeof(tmp)); - goto got_name; - } - } else { - if (arch_vma_name(mmap_event->vma)) { - name = strncpy(tmp, arch_vma_name(mmap_event->vma), - sizeof(tmp)); - goto got_name; - } - - if (!vma->vm_mm) { - name = strncpy(tmp, "[vdso]", sizeof(tmp)); - goto got_name; - } else if (vma->vm_start <= vma->vm_mm->start_brk && - vma->vm_end >= vma->vm_mm->brk) { - name = strncpy(tmp, "[heap]", sizeof(tmp)); - goto got_name; - } else if (vma->vm_start <= vma->vm_mm->start_stack && - vma->vm_end >= vma->vm_mm->start_stack) { - name = strncpy(tmp, "[stack]", sizeof(tmp)); - goto got_name; - } - - name = strncpy(tmp, "//anon", sizeof(tmp)); - goto got_name; - } - -got_name: - size = ALIGN(strlen(name)+1, sizeof(u64)); - - mmap_event->file_name = name; - mmap_event->file_size = size; - - mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; - - rcu_read_lock(); - list_for_each_entry_rcu(pmu, &pmus, entry) { - cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); - if (cpuctx->active_pmu != pmu) - goto next; - perf_event_mmap_ctx(&cpuctx->ctx, mmap_event, - vma->vm_flags & VM_EXEC); - - ctxn = pmu->task_ctx_nr; - if (ctxn < 0) - goto next; - - ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); - if (ctx) { - perf_event_mmap_ctx(ctx, mmap_event, - vma->vm_flags & VM_EXEC); - } -next: - put_cpu_ptr(pmu->pmu_cpu_context); - } - rcu_read_unlock(); - - kfree(buf); -} - -void perf_event_mmap(struct vm_area_struct *vma) -{ - struct perf_mmap_event mmap_event; - - if (!atomic_read(&nr_mmap_events)) - return; - - mmap_event = (struct perf_mmap_event){ - .vma = vma, - /* .file_name */ - /* .file_size */ - .event_id = { - .header = { - .type = PERF_RECORD_MMAP, - .misc = PERF_RECORD_MISC_USER, - /* .size */ - }, - /* .pid */ - /* .tid */ - .start = vma->vm_start, - .len = vma->vm_end - vma->vm_start, - .pgoff = (u64)vma->vm_pgoff << PAGE_SHIFT, - }, - }; - - perf_event_mmap_event(&mmap_event); -} - -/* - * IRQ throttle logging - */ - -static void perf_log_throttle(struct perf_event *event, int enable) -{ - struct perf_output_handle handle; - struct perf_sample_data sample; - int ret; - - struct { - struct perf_event_header header; - u64 time; - u64 id; - u64 stream_id; - } throttle_event = { - .header = { - .type = PERF_RECORD_THROTTLE, - .misc = 0, - .size = sizeof(throttle_event), - }, - .time = perf_clock(), - .id = primary_event_id(event), - .stream_id = event->id, - }; - - if (enable) - throttle_event.header.type = PERF_RECORD_UNTHROTTLE; - - perf_event_header__init_id(&throttle_event.header, &sample, event); - - ret = perf_output_begin(&handle, event, - throttle_event.header.size, 1, 0); - if (ret) - return; - - perf_output_put(&handle, throttle_event); - perf_event__output_id_sample(event, &handle, &sample); - perf_output_end(&handle); -} - -/* - * Generic event overflow handling, sampling. - */ - -static int __perf_event_overflow(struct perf_event *event, int nmi, - int throttle, struct perf_sample_data *data, - struct pt_regs *regs) -{ - int events = atomic_read(&event->event_limit); - struct hw_perf_event *hwc = &event->hw; - int ret = 0; - - /* - * Non-sampling counters might still use the PMI to fold short - * hardware counters, ignore those. - */ - if (unlikely(!is_sampling_event(event))) - return 0; - - if (unlikely(hwc->interrupts >= max_samples_per_tick)) { - if (throttle) { - hwc->interrupts = MAX_INTERRUPTS; - perf_log_throttle(event, 0); - ret = 1; - } - } else - hwc->interrupts++; - - if (event->attr.freq) { - u64 now = perf_clock(); - s64 delta = now - hwc->freq_time_stamp; - - hwc->freq_time_stamp = now; - - if (delta > 0 && delta < 2*TICK_NSEC) - perf_adjust_period(event, delta, hwc->last_period); - } - - /* - * XXX event_limit might not quite work as expected on inherited - * events - */ - - event->pending_kill = POLL_IN; - if (events && atomic_dec_and_test(&event->event_limit)) { - ret = 1; - event->pending_kill = POLL_HUP; - if (nmi) { - event->pending_disable = 1; - irq_work_queue(&event->pending); - } else - perf_event_disable(event); - } - - if (event->overflow_handler) - event->overflow_handler(event, nmi, data, regs); - else - perf_event_output(event, nmi, data, regs); - - return ret; -} - -int perf_event_overflow(struct perf_event *event, int nmi, - struct perf_sample_data *data, - struct pt_regs *regs) -{ - return __perf_event_overflow(event, nmi, 1, data, regs); -} - -/* - * Generic software event infrastructure - */ - -struct swevent_htable { - struct swevent_hlist *swevent_hlist; - struct mutex hlist_mutex; - int hlist_refcount; - - /* Recursion avoidance in each contexts */ - int recursion[PERF_NR_CONTEXTS]; -}; - -static DEFINE_PER_CPU(struct swevent_htable, swevent_htable); - -/* - * We directly increment event->count and keep a second value in - * event->hw.period_left to count intervals. This period event - * is kept in the range [-sample_period, 0] so that we can use the - * sign as trigger. - */ - -static u64 perf_swevent_set_period(struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - u64 period = hwc->last_period; - u64 nr, offset; - s64 old, val; - - hwc->last_period = hwc->sample_period; - -again: - old = val = local64_read(&hwc->period_left); - if (val < 0) - return 0; - - nr = div64_u64(period + val, period); - offset = nr * period; - val -= offset; - if (local64_cmpxchg(&hwc->period_left, old, val) != old) - goto again; - - return nr; -} - -static void perf_swevent_overflow(struct perf_event *event, u64 overflow, - int nmi, struct perf_sample_data *data, - struct pt_regs *regs) -{ - struct hw_perf_event *hwc = &event->hw; - int throttle = 0; - - data->period = event->hw.last_period; - if (!overflow) - overflow = perf_swevent_set_period(event); - - if (hwc->interrupts == MAX_INTERRUPTS) - return; - - for (; overflow; overflow--) { - if (__perf_event_overflow(event, nmi, throttle, - data, regs)) { - /* - * We inhibit the overflow from happening when - * hwc->interrupts == MAX_INTERRUPTS. - */ - break; - } - throttle = 1; - } -} - -static void perf_swevent_event(struct perf_event *event, u64 nr, - int nmi, struct perf_sample_data *data, - struct pt_regs *regs) -{ - struct hw_perf_event *hwc = &event->hw; - - local64_add(nr, &event->count); - - if (!regs) - return; - - if (!is_sampling_event(event)) - return; - - if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) - return perf_swevent_overflow(event, 1, nmi, data, regs); - - if (local64_add_negative(nr, &hwc->period_left)) - return; - - perf_swevent_overflow(event, 0, nmi, data, regs); -} - -static int perf_exclude_event(struct perf_event *event, - struct pt_regs *regs) -{ - if (event->hw.state & PERF_HES_STOPPED) - return 1; - - if (regs) { - if (event->attr.exclude_user && user_mode(regs)) - return 1; - - if (event->attr.exclude_kernel && !user_mode(regs)) - return 1; - } - - return 0; -} - -static int perf_swevent_match(struct perf_event *event, - enum perf_type_id type, - u32 event_id, - struct perf_sample_data *data, - struct pt_regs *regs) -{ - if (event->attr.type != type) - return 0; - - if (event->attr.config != event_id) - return 0; - - if (perf_exclude_event(event, regs)) - return 0; - - return 1; -} - -static inline u64 swevent_hash(u64 type, u32 event_id) -{ - u64 val = event_id | (type << 32); - - return hash_64(val, SWEVENT_HLIST_BITS); -} - -static inline struct hlist_head * -__find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id) -{ - u64 hash = swevent_hash(type, event_id); - - return &hlist->heads[hash]; -} - -/* For the read side: events when they trigger */ -static inline struct hlist_head * -find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id) -{ - struct swevent_hlist *hlist; - - hlist = rcu_dereference(swhash->swevent_hlist); - if (!hlist) - return NULL; - - return __find_swevent_head(hlist, type, event_id); -} - -/* For the event head insertion and removal in the hlist */ -static inline struct hlist_head * -find_swevent_head(struct swevent_htable *swhash, struct perf_event *event) -{ - struct swevent_hlist *hlist; - u32 event_id = event->attr.config; - u64 type = event->attr.type; - - /* - * Event scheduling is always serialized against hlist allocation - * and release. Which makes the protected version suitable here. - * The context lock guarantees that. - */ - hlist = rcu_dereference_protected(swhash->swevent_hlist, - lockdep_is_held(&event->ctx->lock)); - if (!hlist) - return NULL; - - return __find_swevent_head(hlist, type, event_id); -} - -static void do_perf_sw_event(enum perf_type_id type, u32 event_id, - u64 nr, int nmi, - struct perf_sample_data *data, - struct pt_regs *regs) -{ - struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); - struct perf_event *event; - struct hlist_node *node; - struct hlist_head *head; - - rcu_read_lock(); - head = find_swevent_head_rcu(swhash, type, event_id); - if (!head) - goto end; - - hlist_for_each_entry_rcu(event, node, head, hlist_entry) { - if (perf_swevent_match(event, type, event_id, data, regs)) - perf_swevent_event(event, nr, nmi, data, regs); - } -end: - rcu_read_unlock(); -} - -int perf_swevent_get_recursion_context(void) -{ - struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); - - return get_recursion_context(swhash->recursion); -} -EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context); - -inline void perf_swevent_put_recursion_context(int rctx) -{ - struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); - - put_recursion_context(swhash->recursion, rctx); -} - -void __perf_sw_event(u32 event_id, u64 nr, int nmi, - struct pt_regs *regs, u64 addr) -{ - struct perf_sample_data data; - int rctx; - - preempt_disable_notrace(); - rctx = perf_swevent_get_recursion_context(); - if (rctx < 0) - return; - - perf_sample_data_init(&data, addr); - - do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs); - - perf_swevent_put_recursion_context(rctx); - preempt_enable_notrace(); -} - -static void perf_swevent_read(struct perf_event *event) -{ -} - -static int perf_swevent_add(struct perf_event *event, int flags) -{ - struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); - struct hw_perf_event *hwc = &event->hw; - struct hlist_head *head; - - if (is_sampling_event(event)) { - hwc->last_period = hwc->sample_period; - perf_swevent_set_period(event); - } - - hwc->state = !(flags & PERF_EF_START); - - head = find_swevent_head(swhash, event); - if (WARN_ON_ONCE(!head)) - return -EINVAL; - - hlist_add_head_rcu(&event->hlist_entry, head); - - return 0; -} - -static void perf_swevent_del(struct perf_event *event, int flags) -{ - hlist_del_rcu(&event->hlist_entry); -} - -static void perf_swevent_start(struct perf_event *event, int flags) -{ - event->hw.state = 0; -} - -static void perf_swevent_stop(struct perf_event *event, int flags) -{ - event->hw.state = PERF_HES_STOPPED; -} - -/* Deref the hlist from the update side */ -static inline struct swevent_hlist * -swevent_hlist_deref(struct swevent_htable *swhash) -{ - return rcu_dereference_protected(swhash->swevent_hlist, - lockdep_is_held(&swhash->hlist_mutex)); -} - -static void swevent_hlist_release_rcu(struct rcu_head *rcu_head) -{ - struct swevent_hlist *hlist; - - hlist = container_of(rcu_head, struct swevent_hlist, rcu_head); - kfree(hlist); -} - -static void swevent_hlist_release(struct swevent_htable *swhash) -{ - struct swevent_hlist *hlist = swevent_hlist_deref(swhash); - - if (!hlist) - return; - - rcu_assign_pointer(swhash->swevent_hlist, NULL); - call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu); -} - -static void swevent_hlist_put_cpu(struct perf_event *event, int cpu) -{ - struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); - - mutex_lock(&swhash->hlist_mutex); - - if (!--swhash->hlist_refcount) - swevent_hlist_release(swhash); - - mutex_unlock(&swhash->hlist_mutex); -} - -static void swevent_hlist_put(struct perf_event *event) -{ - int cpu; - - if (event->cpu != -1) { - swevent_hlist_put_cpu(event, event->cpu); - return; - } - - for_each_possible_cpu(cpu) - swevent_hlist_put_cpu(event, cpu); -} - -static int swevent_hlist_get_cpu(struct perf_event *event, int cpu) -{ - struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); - int err = 0; - - mutex_lock(&swhash->hlist_mutex); - - if (!swevent_hlist_deref(swhash) && cpu_online(cpu)) { - struct swevent_hlist *hlist; - - hlist = kzalloc(sizeof(*hlist), GFP_KERNEL); - if (!hlist) { - err = -ENOMEM; - goto exit; - } - rcu_assign_pointer(swhash->swevent_hlist, hlist); - } - swhash->hlist_refcount++; -exit: - mutex_unlock(&swhash->hlist_mutex); - - return err; -} - -static int swevent_hlist_get(struct perf_event *event) -{ - int err; - int cpu, failed_cpu; - - if (event->cpu != -1) - return swevent_hlist_get_cpu(event, event->cpu); - - get_online_cpus(); - for_each_possible_cpu(cpu) { - err = swevent_hlist_get_cpu(event, cpu); - if (err) { - failed_cpu = cpu; - goto fail; - } - } - put_online_cpus(); - - return 0; -fail: - for_each_possible_cpu(cpu) { - if (cpu == failed_cpu) - break; - swevent_hlist_put_cpu(event, cpu); - } - - put_online_cpus(); - return err; -} - -struct jump_label_key perf_swevent_enabled[PERF_COUNT_SW_MAX]; - -static void sw_perf_event_destroy(struct perf_event *event) -{ - u64 event_id = event->attr.config; - - WARN_ON(event->parent); - - jump_label_dec(&perf_swevent_enabled[event_id]); - swevent_hlist_put(event); -} - -static int perf_swevent_init(struct perf_event *event) -{ - int event_id = event->attr.config; - - if (event->attr.type != PERF_TYPE_SOFTWARE) - return -ENOENT; - - switch (event_id) { - case PERF_COUNT_SW_CPU_CLOCK: - case PERF_COUNT_SW_TASK_CLOCK: - return -ENOENT; - - default: - break; - } - - if (event_id >= PERF_COUNT_SW_MAX) - return -ENOENT; - - if (!event->parent) { - int err; - - err = swevent_hlist_get(event); - if (err) - return err; - - jump_label_inc(&perf_swevent_enabled[event_id]); - event->destroy = sw_perf_event_destroy; - } - - return 0; -} - -static struct pmu perf_swevent = { - .task_ctx_nr = perf_sw_context, - - .event_init = perf_swevent_init, - .add = perf_swevent_add, - .del = perf_swevent_del, - .start = perf_swevent_start, - .stop = perf_swevent_stop, - .read = perf_swevent_read, -}; - -#ifdef CONFIG_EVENT_TRACING - -static int perf_tp_filter_match(struct perf_event *event, - struct perf_sample_data *data) -{ - void *record = data->raw->data; - - if (likely(!event->filter) || filter_match_preds(event->filter, record)) - return 1; - return 0; -} - -static int perf_tp_event_match(struct perf_event *event, - struct perf_sample_data *data, - struct pt_regs *regs) -{ - if (event->hw.state & PERF_HES_STOPPED) - return 0; - /* - * All tracepoints are from kernel-space. - */ - if (event->attr.exclude_kernel) - return 0; - - if (!perf_tp_filter_match(event, data)) - return 0; - - return 1; -} - -void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, - struct pt_regs *regs, struct hlist_head *head, int rctx) -{ - struct perf_sample_data data; - struct perf_event *event; - struct hlist_node *node; - - struct perf_raw_record raw = { - .size = entry_size, - .data = record, - }; - - perf_sample_data_init(&data, addr); - data.raw = &raw; - - hlist_for_each_entry_rcu(event, node, head, hlist_entry) { - if (perf_tp_event_match(event, &data, regs)) - perf_swevent_event(event, count, 1, &data, regs); - } - - perf_swevent_put_recursion_context(rctx); -} -EXPORT_SYMBOL_GPL(perf_tp_event); - -static void tp_perf_event_destroy(struct perf_event *event) -{ - perf_trace_destroy(event); -} - -static int perf_tp_event_init(struct perf_event *event) -{ - int err; - - if (event->attr.type != PERF_TYPE_TRACEPOINT) - return -ENOENT; - - err = perf_trace_init(event); - if (err) - return err; - - event->destroy = tp_perf_event_destroy; - - return 0; -} - -static struct pmu perf_tracepoint = { - .task_ctx_nr = perf_sw_context, - - .event_init = perf_tp_event_init, - .add = perf_trace_add, - .del = perf_trace_del, - .start = perf_swevent_start, - .stop = perf_swevent_stop, - .read = perf_swevent_read, -}; - -static inline void perf_tp_register(void) -{ - perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT); -} - -static int perf_event_set_filter(struct perf_event *event, void __user *arg) -{ - char *filter_str; - int ret; - - if (event->attr.type != PERF_TYPE_TRACEPOINT) - return -EINVAL; - - filter_str = strndup_user(arg, PAGE_SIZE); - if (IS_ERR(filter_str)) - return PTR_ERR(filter_str); - - ret = ftrace_profile_set_filter(event, event->attr.config, filter_str); - - kfree(filter_str); - return ret; -} - -static void perf_event_free_filter(struct perf_event *event) -{ - ftrace_profile_free_filter(event); -} - -#else - -static inline void perf_tp_register(void) -{ -} - -static int perf_event_set_filter(struct perf_event *event, void __user *arg) -{ - return -ENOENT; -} - -static void perf_event_free_filter(struct perf_event *event) -{ -} - -#endif /* CONFIG_EVENT_TRACING */ - -#ifdef CONFIG_HAVE_HW_BREAKPOINT -void perf_bp_event(struct perf_event *bp, void *data) -{ - struct perf_sample_data sample; - struct pt_regs *regs = data; - - perf_sample_data_init(&sample, bp->attr.bp_addr); - - if (!bp->hw.state && !perf_exclude_event(bp, regs)) - perf_swevent_event(bp, 1, 1, &sample, regs); -} -#endif - -/* - * hrtimer based swevent callback - */ - -static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) -{ - enum hrtimer_restart ret = HRTIMER_RESTART; - struct perf_sample_data data; - struct pt_regs *regs; - struct perf_event *event; - u64 period; - - event = container_of(hrtimer, struct perf_event, hw.hrtimer); - - if (event->state != PERF_EVENT_STATE_ACTIVE) - return HRTIMER_NORESTART; - - event->pmu->read(event); - - perf_sample_data_init(&data, 0); - data.period = event->hw.last_period; - regs = get_irq_regs(); - - if (regs && !perf_exclude_event(event, regs)) { - if (!(event->attr.exclude_idle && current->pid == 0)) - if (perf_event_overflow(event, 0, &data, regs)) - ret = HRTIMER_NORESTART; - } - - period = max_t(u64, 10000, event->hw.sample_period); - hrtimer_forward_now(hrtimer, ns_to_ktime(period)); - - return ret; -} - -static void perf_swevent_start_hrtimer(struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - s64 period; - - if (!is_sampling_event(event)) - return; - - period = local64_read(&hwc->period_left); - if (period) { - if (period < 0) - period = 10000; - - local64_set(&hwc->period_left, 0); - } else { - period = max_t(u64, 10000, hwc->sample_period); - } - __hrtimer_start_range_ns(&hwc->hrtimer, - ns_to_ktime(period), 0, - HRTIMER_MODE_REL_PINNED, 0); -} - -static void perf_swevent_cancel_hrtimer(struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - - if (is_sampling_event(event)) { - ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer); - local64_set(&hwc->period_left, ktime_to_ns(remaining)); - - hrtimer_cancel(&hwc->hrtimer); - } -} - -static void perf_swevent_init_hrtimer(struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - - if (!is_sampling_event(event)) - return; - - hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - hwc->hrtimer.function = perf_swevent_hrtimer; - - /* - * Since hrtimers have a fixed rate, we can do a static freq->period - * mapping and avoid the whole period adjust feedback stuff. - */ - if (event->attr.freq) { - long freq = event->attr.sample_freq; - - event->attr.sample_period = NSEC_PER_SEC / freq; - hwc->sample_period = event->attr.sample_period; - local64_set(&hwc->period_left, hwc->sample_period); - event->attr.freq = 0; - } -} - -/* - * Software event: cpu wall time clock - */ - -static void cpu_clock_event_update(struct perf_event *event) -{ - s64 prev; - u64 now; - - now = local_clock(); - prev = local64_xchg(&event->hw.prev_count, now); - local64_add(now - prev, &event->count); -} - -static void cpu_clock_event_start(struct perf_event *event, int flags) -{ - local64_set(&event->hw.prev_count, local_clock()); - perf_swevent_start_hrtimer(event); -} - -static void cpu_clock_event_stop(struct perf_event *event, int flags) -{ - perf_swevent_cancel_hrtimer(event); - cpu_clock_event_update(event); -} - -static int cpu_clock_event_add(struct perf_event *event, int flags) -{ - if (flags & PERF_EF_START) - cpu_clock_event_start(event, flags); - - return 0; -} - -static void cpu_clock_event_del(struct perf_event *event, int flags) -{ - cpu_clock_event_stop(event, flags); -} - -static void cpu_clock_event_read(struct perf_event *event) -{ - cpu_clock_event_update(event); -} - -static int cpu_clock_event_init(struct perf_event *event) -{ - if (event->attr.type != PERF_TYPE_SOFTWARE) - return -ENOENT; - - if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK) - return -ENOENT; - - perf_swevent_init_hrtimer(event); - - return 0; -} - -static struct pmu perf_cpu_clock = { - .task_ctx_nr = perf_sw_context, - - .event_init = cpu_clock_event_init, - .add = cpu_clock_event_add, - .del = cpu_clock_event_del, - .start = cpu_clock_event_start, - .stop = cpu_clock_event_stop, - .read = cpu_clock_event_read, -}; - -/* - * Software event: task time clock - */ - -static void task_clock_event_update(struct perf_event *event, u64 now) -{ - u64 prev; - s64 delta; - - prev = local64_xchg(&event->hw.prev_count, now); - delta = now - prev; - local64_add(delta, &event->count); -} - -static void task_clock_event_start(struct perf_event *event, int flags) -{ - local64_set(&event->hw.prev_count, event->ctx->time); - perf_swevent_start_hrtimer(event); -} - -static void task_clock_event_stop(struct perf_event *event, int flags) -{ - perf_swevent_cancel_hrtimer(event); - task_clock_event_update(event, event->ctx->time); -} - -static int task_clock_event_add(struct perf_event *event, int flags) -{ - if (flags & PERF_EF_START) - task_clock_event_start(event, flags); - - return 0; -} - -static void task_clock_event_del(struct perf_event *event, int flags) -{ - task_clock_event_stop(event, PERF_EF_UPDATE); -} - -static void task_clock_event_read(struct perf_event *event) -{ - u64 now = perf_clock(); - u64 delta = now - event->ctx->timestamp; - u64 time = event->ctx->time + delta; - - task_clock_event_update(event, time); -} - -static int task_clock_event_init(struct perf_event *event) -{ - if (event->attr.type != PERF_TYPE_SOFTWARE) - return -ENOENT; - - if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK) - return -ENOENT; - - perf_swevent_init_hrtimer(event); - - return 0; -} - -static struct pmu perf_task_clock = { - .task_ctx_nr = perf_sw_context, - - .event_init = task_clock_event_init, - .add = task_clock_event_add, - .del = task_clock_event_del, - .start = task_clock_event_start, - .stop = task_clock_event_stop, - .read = task_clock_event_read, -}; - -static void perf_pmu_nop_void(struct pmu *pmu) -{ -} - -static int perf_pmu_nop_int(struct pmu *pmu) -{ - return 0; -} - -static void perf_pmu_start_txn(struct pmu *pmu) -{ - perf_pmu_disable(pmu); -} - -static int perf_pmu_commit_txn(struct pmu *pmu) -{ - perf_pmu_enable(pmu); - return 0; -} - -static void perf_pmu_cancel_txn(struct pmu *pmu) -{ - perf_pmu_enable(pmu); -} - -/* - * Ensures all contexts with the same task_ctx_nr have the same - * pmu_cpu_context too. - */ -static void *find_pmu_context(int ctxn) -{ - struct pmu *pmu; - - if (ctxn < 0) - return NULL; - - list_for_each_entry(pmu, &pmus, entry) { - if (pmu->task_ctx_nr == ctxn) - return pmu->pmu_cpu_context; - } - - return NULL; -} - -static void update_pmu_context(struct pmu *pmu, struct pmu *old_pmu) -{ - int cpu; - - for_each_possible_cpu(cpu) { - struct perf_cpu_context *cpuctx; - - cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); - - if (cpuctx->active_pmu == old_pmu) - cpuctx->active_pmu = pmu; - } -} - -static void free_pmu_context(struct pmu *pmu) -{ - struct pmu *i; - - mutex_lock(&pmus_lock); - /* - * Like a real lame refcount. - */ - list_for_each_entry(i, &pmus, entry) { - if (i->pmu_cpu_context == pmu->pmu_cpu_context) { - update_pmu_context(i, pmu); - goto out; - } - } - - free_percpu(pmu->pmu_cpu_context); -out: - mutex_unlock(&pmus_lock); -} -static struct idr pmu_idr; - -static ssize_t -type_show(struct device *dev, struct device_attribute *attr, char *page) -{ - struct pmu *pmu = dev_get_drvdata(dev); - - return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type); -} - -static struct device_attribute pmu_dev_attrs[] = { - __ATTR_RO(type), - __ATTR_NULL, -}; - -static int pmu_bus_running; -static struct bus_type pmu_bus = { - .name = "event_source", - .dev_attrs = pmu_dev_attrs, -}; - -static void pmu_dev_release(struct device *dev) -{ - kfree(dev); -} - -static int pmu_dev_alloc(struct pmu *pmu) -{ - int ret = -ENOMEM; - - pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL); - if (!pmu->dev) - goto out; - - device_initialize(pmu->dev); - ret = dev_set_name(pmu->dev, "%s", pmu->name); - if (ret) - goto free_dev; - - dev_set_drvdata(pmu->dev, pmu); - pmu->dev->bus = &pmu_bus; - pmu->dev->release = pmu_dev_release; - ret = device_add(pmu->dev); - if (ret) - goto free_dev; - -out: - return ret; - -free_dev: - put_device(pmu->dev); - goto out; -} - -static struct lock_class_key cpuctx_mutex; - -int perf_pmu_register(struct pmu *pmu, char *name, int type) -{ - int cpu, ret; - - mutex_lock(&pmus_lock); - ret = -ENOMEM; - pmu->pmu_disable_count = alloc_percpu(int); - if (!pmu->pmu_disable_count) - goto unlock; - - pmu->type = -1; - if (!name) - goto skip_type; - pmu->name = name; - - if (type < 0) { - int err = idr_pre_get(&pmu_idr, GFP_KERNEL); - if (!err) - goto free_pdc; - - err = idr_get_new_above(&pmu_idr, pmu, PERF_TYPE_MAX, &type); - if (err) { - ret = err; - goto free_pdc; - } - } - pmu->type = type; - - if (pmu_bus_running) { - ret = pmu_dev_alloc(pmu); - if (ret) - goto free_idr; - } - -skip_type: - pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr); - if (pmu->pmu_cpu_context) - goto got_cpu_context; - - pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context); - if (!pmu->pmu_cpu_context) - goto free_dev; - - for_each_possible_cpu(cpu) { - struct perf_cpu_context *cpuctx; - - cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); - __perf_event_init_context(&cpuctx->ctx); - lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex); - cpuctx->ctx.type = cpu_context; - cpuctx->ctx.pmu = pmu; - cpuctx->jiffies_interval = 1; - INIT_LIST_HEAD(&cpuctx->rotation_list); - cpuctx->active_pmu = pmu; - } - -got_cpu_context: - if (!pmu->start_txn) { - if (pmu->pmu_enable) { - /* - * If we have pmu_enable/pmu_disable calls, install - * transaction stubs that use that to try and batch - * hardware accesses. - */ - pmu->start_txn = perf_pmu_start_txn; - pmu->commit_txn = perf_pmu_commit_txn; - pmu->cancel_txn = perf_pmu_cancel_txn; - } else { - pmu->start_txn = perf_pmu_nop_void; - pmu->commit_txn = perf_pmu_nop_int; - pmu->cancel_txn = perf_pmu_nop_void; - } - } - - if (!pmu->pmu_enable) { - pmu->pmu_enable = perf_pmu_nop_void; - pmu->pmu_disable = perf_pmu_nop_void; - } - - list_add_rcu(&pmu->entry, &pmus); - ret = 0; -unlock: - mutex_unlock(&pmus_lock); - - return ret; - -free_dev: - device_del(pmu->dev); - put_device(pmu->dev); - -free_idr: - if (pmu->type >= PERF_TYPE_MAX) - idr_remove(&pmu_idr, pmu->type); - -free_pdc: - free_percpu(pmu->pmu_disable_count); - goto unlock; -} - -void perf_pmu_unregister(struct pmu *pmu) -{ - mutex_lock(&pmus_lock); - list_del_rcu(&pmu->entry); - mutex_unlock(&pmus_lock); - - /* - * We dereference the pmu list under both SRCU and regular RCU, so - * synchronize against both of those. - */ - synchronize_srcu(&pmus_srcu); - synchronize_rcu(); - - free_percpu(pmu->pmu_disable_count); - if (pmu->type >= PERF_TYPE_MAX) - idr_remove(&pmu_idr, pmu->type); - device_del(pmu->dev); - put_device(pmu->dev); - free_pmu_context(pmu); -} - -struct pmu *perf_init_event(struct perf_event *event) -{ - struct pmu *pmu = NULL; - int idx; - int ret; - - idx = srcu_read_lock(&pmus_srcu); - - rcu_read_lock(); - pmu = idr_find(&pmu_idr, event->attr.type); - rcu_read_unlock(); - if (pmu) { - ret = pmu->event_init(event); - if (ret) - pmu = ERR_PTR(ret); - goto unlock; - } - - list_for_each_entry_rcu(pmu, &pmus, entry) { - ret = pmu->event_init(event); - if (!ret) - goto unlock; - - if (ret != -ENOENT) { - pmu = ERR_PTR(ret); - goto unlock; - } - } - pmu = ERR_PTR(-ENOENT); -unlock: - srcu_read_unlock(&pmus_srcu, idx); - - return pmu; -} - -/* - * Allocate and initialize a event structure - */ -static struct perf_event * -perf_event_alloc(struct perf_event_attr *attr, int cpu, - struct task_struct *task, - struct perf_event *group_leader, - struct perf_event *parent_event, - perf_overflow_handler_t overflow_handler) -{ - struct pmu *pmu; - struct perf_event *event; - struct hw_perf_event *hwc; - long err; - - if ((unsigned)cpu >= nr_cpu_ids) { - if (!task || cpu != -1) - return ERR_PTR(-EINVAL); - } - - event = kzalloc(sizeof(*event), GFP_KERNEL); - if (!event) - return ERR_PTR(-ENOMEM); - - /* - * Single events are their own group leaders, with an - * empty sibling list: - */ - if (!group_leader) - group_leader = event; - - mutex_init(&event->child_mutex); - INIT_LIST_HEAD(&event->child_list); - - INIT_LIST_HEAD(&event->group_entry); - INIT_LIST_HEAD(&event->event_entry); - INIT_LIST_HEAD(&event->sibling_list); - init_waitqueue_head(&event->waitq); - init_irq_work(&event->pending, perf_pending_event); - - mutex_init(&event->mmap_mutex); - - event->cpu = cpu; - event->attr = *attr; - event->group_leader = group_leader; - event->pmu = NULL; - event->oncpu = -1; - - event->parent = parent_event; - - event->ns = get_pid_ns(current->nsproxy->pid_ns); - event->id = atomic64_inc_return(&perf_event_id); - - event->state = PERF_EVENT_STATE_INACTIVE; - - if (task) { - event->attach_state = PERF_ATTACH_TASK; -#ifdef CONFIG_HAVE_HW_BREAKPOINT - /* - * hw_breakpoint is a bit difficult here.. - */ - if (attr->type == PERF_TYPE_BREAKPOINT) - event->hw.bp_target = task; -#endif - } - - if (!overflow_handler && parent_event) - overflow_handler = parent_event->overflow_handler; - - event->overflow_handler = overflow_handler; - - if (attr->disabled) - event->state = PERF_EVENT_STATE_OFF; - - pmu = NULL; - - hwc = &event->hw; - hwc->sample_period = attr->sample_period; - if (attr->freq && attr->sample_freq) - hwc->sample_period = 1; - hwc->last_period = hwc->sample_period; - - local64_set(&hwc->period_left, hwc->sample_period); - - /* - * we currently do not support PERF_FORMAT_GROUP on inherited events - */ - if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP)) - goto done; - - pmu = perf_init_event(event); - -done: - err = 0; - if (!pmu) - err = -EINVAL; - else if (IS_ERR(pmu)) - err = PTR_ERR(pmu); - - if (err) { - if (event->ns) - put_pid_ns(event->ns); - kfree(event); - return ERR_PTR(err); - } - - event->pmu = pmu; - - if (!event->parent) { - if (event->attach_state & PERF_ATTACH_TASK) - jump_label_inc(&perf_sched_events); - if (event->attr.mmap || event->attr.mmap_data) - atomic_inc(&nr_mmap_events); - if (event->attr.comm) - atomic_inc(&nr_comm_events); - if (event->attr.task) - atomic_inc(&nr_task_events); - if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) { - err = get_callchain_buffers(); - if (err) { - free_event(event); - return ERR_PTR(err); - } - } - } - - return event; -} - -static int perf_copy_attr(struct perf_event_attr __user *uattr, - struct perf_event_attr *attr) -{ - u32 size; - int ret; - - if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0)) - return -EFAULT; - - /* - * zero the full structure, so that a short copy will be nice. - */ - memset(attr, 0, sizeof(*attr)); - - ret = get_user(size, &uattr->size); - if (ret) - return ret; - - if (size > PAGE_SIZE) /* silly large */ - goto err_size; - - if (!size) /* abi compat */ - size = PERF_ATTR_SIZE_VER0; - - if (size < PERF_ATTR_SIZE_VER0) - goto err_size; - - /* - * If we're handed a bigger struct than we know of, - * ensure all the unknown bits are 0 - i.e. new - * user-space does not rely on any kernel feature - * extensions we dont know about yet. - */ - if (size > sizeof(*attr)) { - unsigned char __user *addr; - unsigned char __user *end; - unsigned char val; - - addr = (void __user *)uattr + sizeof(*attr); - end = (void __user *)uattr + size; - - for (; addr < end; addr++) { - ret = get_user(val, addr); - if (ret) - return ret; - if (val) - goto err_size; - } - size = sizeof(*attr); - } - - ret = copy_from_user(attr, uattr, size); - if (ret) - return -EFAULT; - - /* - * If the type exists, the corresponding creation will verify - * the attr->config. - */ - if (attr->type >= PERF_TYPE_MAX) - return -EINVAL; - - if (attr->__reserved_1) - return -EINVAL; - - if (attr->sample_type & ~(PERF_SAMPLE_MAX-1)) - return -EINVAL; - - if (attr->read_format & ~(PERF_FORMAT_MAX-1)) - return -EINVAL; - -out: - return ret; - -err_size: - put_user(sizeof(*attr), &uattr->size); - ret = -E2BIG; - goto out; -} - -static int -perf_event_set_output(struct perf_event *event, struct perf_event *output_event) -{ - struct perf_buffer *buffer = NULL, *old_buffer = NULL; - int ret = -EINVAL; - - if (!output_event) - goto set; - - /* don't allow circular references */ - if (event == output_event) - goto out; - - /* - * Don't allow cross-cpu buffers - */ - if (output_event->cpu != event->cpu) - goto out; - - /* - * If its not a per-cpu buffer, it must be the same task. - */ - if (output_event->cpu == -1 && output_event->ctx != event->ctx) - goto out; - -set: - mutex_lock(&event->mmap_mutex); - /* Can't redirect output if we've got an active mmap() */ - if (atomic_read(&event->mmap_count)) - goto unlock; - - if (output_event) { - /* get the buffer we want to redirect to */ - buffer = perf_buffer_get(output_event); - if (!buffer) - goto unlock; - } - - old_buffer = event->buffer; - rcu_assign_pointer(event->buffer, buffer); - ret = 0; -unlock: - mutex_unlock(&event->mmap_mutex); - - if (old_buffer) - perf_buffer_put(old_buffer); -out: - return ret; -} - -/** - * sys_perf_event_open - open a performance event, associate it to a task/cpu - * - * @attr_uptr: event_id type attributes for monitoring/sampling - * @pid: target pid - * @cpu: target cpu - * @group_fd: group leader event fd - */ -SYSCALL_DEFINE5(perf_event_open, - struct perf_event_attr __user *, attr_uptr, - pid_t, pid, int, cpu, int, group_fd, unsigned long, flags) -{ - struct perf_event *group_leader = NULL, *output_event = NULL; - struct perf_event *event, *sibling; - struct perf_event_attr attr; - struct perf_event_context *ctx; - struct file *event_file = NULL; - struct file *group_file = NULL; - struct task_struct *task = NULL; - struct pmu *pmu; - int event_fd; - int move_group = 0; - int fput_needed = 0; - int err; - - /* for future expandability... */ - if (flags & ~PERF_FLAG_ALL) - return -EINVAL; - - err = perf_copy_attr(attr_uptr, &attr); - if (err) - return err; - - if (!attr.exclude_kernel) { - if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) - return -EACCES; - } - - if (attr.freq) { - if (attr.sample_freq > sysctl_perf_event_sample_rate) - return -EINVAL; - } - - /* - * In cgroup mode, the pid argument is used to pass the fd - * opened to the cgroup directory in cgroupfs. The cpu argument - * designates the cpu on which to monitor threads from that - * cgroup. - */ - if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1)) - return -EINVAL; - - event_fd = get_unused_fd_flags(O_RDWR); - if (event_fd < 0) - return event_fd; - - if (group_fd != -1) { - group_leader = perf_fget_light(group_fd, &fput_needed); - if (IS_ERR(group_leader)) { - err = PTR_ERR(group_leader); - goto err_fd; - } - group_file = group_leader->filp; - if (flags & PERF_FLAG_FD_OUTPUT) - output_event = group_leader; - if (flags & PERF_FLAG_FD_NO_GROUP) - group_leader = NULL; - } - - if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) { - task = find_lively_task_by_vpid(pid); - if (IS_ERR(task)) { - err = PTR_ERR(task); - goto err_group_fd; - } - } - - event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, NULL); - if (IS_ERR(event)) { - err = PTR_ERR(event); - goto err_task; - } - - if (flags & PERF_FLAG_PID_CGROUP) { - err = perf_cgroup_connect(pid, event, &attr, group_leader); - if (err) - goto err_alloc; - /* - * one more event: - * - that has cgroup constraint on event->cpu - * - that may need work on context switch - */ - atomic_inc(&per_cpu(perf_cgroup_events, event->cpu)); - jump_label_inc(&perf_sched_events); - } - - /* - * Special case software events and allow them to be part of - * any hardware group. - */ - pmu = event->pmu; - - if (group_leader && - (is_software_event(event) != is_software_event(group_leader))) { - if (is_software_event(event)) { - /* - * If event and group_leader are not both a software - * event, and event is, then group leader is not. - * - * Allow the addition of software events to !software - * groups, this is safe because software events never - * fail to schedule. - */ - pmu = group_leader->pmu; - } else if (is_software_event(group_leader) && - (group_leader->group_flags & PERF_GROUP_SOFTWARE)) { - /* - * In case the group is a pure software group, and we - * try to add a hardware event, move the whole group to - * the hardware context. - */ - move_group = 1; - } - } - - /* - * Get the target context (task or percpu): - */ - ctx = find_get_context(pmu, task, cpu); - if (IS_ERR(ctx)) { - err = PTR_ERR(ctx); - goto err_alloc; - } - - if (task) { - put_task_struct(task); - task = NULL; - } - - /* - * Look up the group leader (we will attach this event to it): - */ - if (group_leader) { - err = -EINVAL; - - /* - * Do not allow a recursive hierarchy (this new sibling - * becoming part of another group-sibling): - */ - if (group_leader->group_leader != group_leader) - goto err_context; - /* - * Do not allow to attach to a group in a different - * task or CPU context: - */ - if (move_group) { - if (group_leader->ctx->type != ctx->type) - goto err_context; - } else { - if (group_leader->ctx != ctx) - goto err_context; - } - - /* - * Only a group leader can be exclusive or pinned - */ - if (attr.exclusive || attr.pinned) - goto err_context; - } - - if (output_event) { - err = perf_event_set_output(event, output_event); - if (err) - goto err_context; - } - - event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, O_RDWR); - if (IS_ERR(event_file)) { - err = PTR_ERR(event_file); - goto err_context; - } - - if (move_group) { - struct perf_event_context *gctx = group_leader->ctx; - - mutex_lock(&gctx->mutex); - perf_remove_from_context(group_leader); - list_for_each_entry(sibling, &group_leader->sibling_list, - group_entry) { - perf_remove_from_context(sibling); - put_ctx(gctx); - } - mutex_unlock(&gctx->mutex); - put_ctx(gctx); - } - - event->filp = event_file; - WARN_ON_ONCE(ctx->parent_ctx); - mutex_lock(&ctx->mutex); - - if (move_group) { - perf_install_in_context(ctx, group_leader, cpu); - get_ctx(ctx); - list_for_each_entry(sibling, &group_leader->sibling_list, - group_entry) { - perf_install_in_context(ctx, sibling, cpu); - get_ctx(ctx); - } - } - - perf_install_in_context(ctx, event, cpu); - ++ctx->generation; - perf_unpin_context(ctx); - mutex_unlock(&ctx->mutex); - - event->owner = current; - - mutex_lock(¤t->perf_event_mutex); - list_add_tail(&event->owner_entry, ¤t->perf_event_list); - mutex_unlock(¤t->perf_event_mutex); - - /* - * Precalculate sample_data sizes - */ - perf_event__header_size(event); - perf_event__id_header_size(event); - - /* - * Drop the reference on the group_event after placing the - * new event on the sibling_list. This ensures destruction - * of the group leader will find the pointer to itself in - * perf_group_detach(). - */ - fput_light(group_file, fput_needed); - fd_install(event_fd, event_file); - return event_fd; - -err_context: - perf_unpin_context(ctx); - put_ctx(ctx); -err_alloc: - free_event(event); -err_task: - if (task) - put_task_struct(task); -err_group_fd: - fput_light(group_file, fput_needed); -err_fd: - put_unused_fd(event_fd); - return err; -} - -/** - * perf_event_create_kernel_counter - * - * @attr: attributes of the counter to create - * @cpu: cpu in which the counter is bound - * @task: task to profile (NULL for percpu) - */ -struct perf_event * -perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, - struct task_struct *task, - perf_overflow_handler_t overflow_handler) -{ - struct perf_event_context *ctx; - struct perf_event *event; - int err; - - /* - * Get the target context (task or percpu): - */ - - event = perf_event_alloc(attr, cpu, task, NULL, NULL, overflow_handler); - if (IS_ERR(event)) { - err = PTR_ERR(event); - goto err; - } - - ctx = find_get_context(event->pmu, task, cpu); - if (IS_ERR(ctx)) { - err = PTR_ERR(ctx); - goto err_free; - } - - event->filp = NULL; - WARN_ON_ONCE(ctx->parent_ctx); - mutex_lock(&ctx->mutex); - perf_install_in_context(ctx, event, cpu); - ++ctx->generation; - perf_unpin_context(ctx); - mutex_unlock(&ctx->mutex); - - return event; - -err_free: - free_event(event); -err: - return ERR_PTR(err); -} -EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter); - -static void sync_child_event(struct perf_event *child_event, - struct task_struct *child) -{ - struct perf_event *parent_event = child_event->parent; - u64 child_val; - - if (child_event->attr.inherit_stat) - perf_event_read_event(child_event, child); - - child_val = perf_event_count(child_event); - - /* - * Add back the child's count to the parent's count: - */ - atomic64_add(child_val, &parent_event->child_count); - atomic64_add(child_event->total_time_enabled, - &parent_event->child_total_time_enabled); - atomic64_add(child_event->total_time_running, - &parent_event->child_total_time_running); - - /* - * Remove this event from the parent's list - */ - WARN_ON_ONCE(parent_event->ctx->parent_ctx); - mutex_lock(&parent_event->child_mutex); - list_del_init(&child_event->child_list); - mutex_unlock(&parent_event->child_mutex); - - /* - * Release the parent event, if this was the last - * reference to it. - */ - fput(parent_event->filp); -} - -static void -__perf_event_exit_task(struct perf_event *child_event, - struct perf_event_context *child_ctx, - struct task_struct *child) -{ - if (child_event->parent) { - raw_spin_lock_irq(&child_ctx->lock); - perf_group_detach(child_event); - raw_spin_unlock_irq(&child_ctx->lock); - } - - perf_remove_from_context(child_event); - - /* - * It can happen that the parent exits first, and has events - * that are still around due to the child reference. These - * events need to be zapped. - */ - if (child_event->parent) { - sync_child_event(child_event, child); - free_event(child_event); - } -} - -static void perf_event_exit_task_context(struct task_struct *child, int ctxn) -{ - struct perf_event *child_event, *tmp; - struct perf_event_context *child_ctx; - unsigned long flags; - - if (likely(!child->perf_event_ctxp[ctxn])) { - perf_event_task(child, NULL, 0); - return; - } - - local_irq_save(flags); - /* - * We can't reschedule here because interrupts are disabled, - * and either child is current or it is a task that can't be - * scheduled, so we are now safe from rescheduling changing - * our context. - */ - child_ctx = rcu_dereference_raw(child->perf_event_ctxp[ctxn]); - task_ctx_sched_out(child_ctx, EVENT_ALL); - - /* - * Take the context lock here so that if find_get_context is - * reading child->perf_event_ctxp, we wait until it has - * incremented the context's refcount before we do put_ctx below. - */ - raw_spin_lock(&child_ctx->lock); - child->perf_event_ctxp[ctxn] = NULL; - /* - * If this context is a clone; unclone it so it can't get - * swapped to another process while we're removing all - * the events from it. - */ - unclone_ctx(child_ctx); - update_context_time(child_ctx); - raw_spin_unlock_irqrestore(&child_ctx->lock, flags); - - /* - * Report the task dead after unscheduling the events so that we - * won't get any samples after PERF_RECORD_EXIT. We can however still - * get a few PERF_RECORD_READ events. - */ - perf_event_task(child, child_ctx, 0); - - /* - * We can recurse on the same lock type through: - * - * __perf_event_exit_task() - * sync_child_event() - * fput(parent_event->filp) - * perf_release() - * mutex_lock(&ctx->mutex) - * - * But since its the parent context it won't be the same instance. - */ - mutex_lock(&child_ctx->mutex); - -again: - list_for_each_entry_safe(child_event, tmp, &child_ctx->pinned_groups, - group_entry) - __perf_event_exit_task(child_event, child_ctx, child); - - list_for_each_entry_safe(child_event, tmp, &child_ctx->flexible_groups, - group_entry) - __perf_event_exit_task(child_event, child_ctx, child); - - /* - * If the last event was a group event, it will have appended all - * its siblings to the list, but we obtained 'tmp' before that which - * will still point to the list head terminating the iteration. - */ - if (!list_empty(&child_ctx->pinned_groups) || - !list_empty(&child_ctx->flexible_groups)) - goto again; - - mutex_unlock(&child_ctx->mutex); - - put_ctx(child_ctx); -} - -/* - * When a child task exits, feed back event values to parent events. - */ -void perf_event_exit_task(struct task_struct *child) -{ - struct perf_event *event, *tmp; - int ctxn; - - mutex_lock(&child->perf_event_mutex); - list_for_each_entry_safe(event, tmp, &child->perf_event_list, - owner_entry) { - list_del_init(&event->owner_entry); - - /* - * Ensure the list deletion is visible before we clear - * the owner, closes a race against perf_release() where - * we need to serialize on the owner->perf_event_mutex. - */ - smp_wmb(); - event->owner = NULL; - } - mutex_unlock(&child->perf_event_mutex); - - for_each_task_context_nr(ctxn) - perf_event_exit_task_context(child, ctxn); -} - -static void perf_free_event(struct perf_event *event, - struct perf_event_context *ctx) -{ - struct perf_event *parent = event->parent; - - if (WARN_ON_ONCE(!parent)) - return; - - mutex_lock(&parent->child_mutex); - list_del_init(&event->child_list); - mutex_unlock(&parent->child_mutex); - - fput(parent->filp); - - perf_group_detach(event); - list_del_event(event, ctx); - free_event(event); -} - -/* - * free an unexposed, unused context as created by inheritance by - * perf_event_init_task below, used by fork() in case of fail. - */ -void perf_event_free_task(struct task_struct *task) -{ - struct perf_event_context *ctx; - struct perf_event *event, *tmp; - int ctxn; - - for_each_task_context_nr(ctxn) { - ctx = task->perf_event_ctxp[ctxn]; - if (!ctx) - continue; - - mutex_lock(&ctx->mutex); -again: - list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, - group_entry) - perf_free_event(event, ctx); - - list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, - group_entry) - perf_free_event(event, ctx); - - if (!list_empty(&ctx->pinned_groups) || - !list_empty(&ctx->flexible_groups)) - goto again; - - mutex_unlock(&ctx->mutex); - - put_ctx(ctx); - } -} - -void perf_event_delayed_put(struct task_struct *task) -{ - int ctxn; - - for_each_task_context_nr(ctxn) - WARN_ON_ONCE(task->perf_event_ctxp[ctxn]); -} - -/* - * inherit a event from parent task to child task: - */ -static struct perf_event * -inherit_event(struct perf_event *parent_event, - struct task_struct *parent, - struct perf_event_context *parent_ctx, - struct task_struct *child, - struct perf_event *group_leader, - struct perf_event_context *child_ctx) -{ - struct perf_event *child_event; - unsigned long flags; - - /* - * Instead of creating recursive hierarchies of events, - * we link inherited events back to the original parent, - * which has a filp for sure, which we use as the reference - * count: - */ - if (parent_event->parent) - parent_event = parent_event->parent; - - child_event = perf_event_alloc(&parent_event->attr, - parent_event->cpu, - child, - group_leader, parent_event, - NULL); - if (IS_ERR(child_event)) - return child_event; - get_ctx(child_ctx); - - /* - * Make the child state follow the state of the parent event, - * not its attr.disabled bit. We hold the parent's mutex, - * so we won't race with perf_event_{en, dis}able_family. - */ - if (parent_event->state >= PERF_EVENT_STATE_INACTIVE) - child_event->state = PERF_EVENT_STATE_INACTIVE; - else - child_event->state = PERF_EVENT_STATE_OFF; - - if (parent_event->attr.freq) { - u64 sample_period = parent_event->hw.sample_period; - struct hw_perf_event *hwc = &child_event->hw; - - hwc->sample_period = sample_period; - hwc->last_period = sample_period; - - local64_set(&hwc->period_left, sample_period); - } - - child_event->ctx = child_ctx; - child_event->overflow_handler = parent_event->overflow_handler; - - /* - * Precalculate sample_data sizes - */ - perf_event__header_size(child_event); - perf_event__id_header_size(child_event); - - /* - * Link it up in the child's context: - */ - raw_spin_lock_irqsave(&child_ctx->lock, flags); - add_event_to_ctx(child_event, child_ctx); - raw_spin_unlock_irqrestore(&child_ctx->lock, flags); - - /* - * Get a reference to the parent filp - we will fput it - * when the child event exits. This is safe to do because - * we are in the parent and we know that the filp still - * exists and has a nonzero count: - */ - atomic_long_inc(&parent_event->filp->f_count); - - /* - * Link this into the parent event's child list - */ - WARN_ON_ONCE(parent_event->ctx->parent_ctx); - mutex_lock(&parent_event->child_mutex); - list_add_tail(&child_event->child_list, &parent_event->child_list); - mutex_unlock(&parent_event->child_mutex); - - return child_event; -} - -static int inherit_group(struct perf_event *parent_event, - struct task_struct *parent, - struct perf_event_context *parent_ctx, - struct task_struct *child, - struct perf_event_context *child_ctx) -{ - struct perf_event *leader; - struct perf_event *sub; - struct perf_event *child_ctr; - - leader = inherit_event(parent_event, parent, parent_ctx, - child, NULL, child_ctx); - if (IS_ERR(leader)) - return PTR_ERR(leader); - list_for_each_entry(sub, &parent_event->sibling_list, group_entry) { - child_ctr = inherit_event(sub, parent, parent_ctx, - child, leader, child_ctx); - if (IS_ERR(child_ctr)) - return PTR_ERR(child_ctr); - } - return 0; -} - -static int -inherit_task_group(struct perf_event *event, struct task_struct *parent, - struct perf_event_context *parent_ctx, - struct task_struct *child, int ctxn, - int *inherited_all) -{ - int ret; - struct perf_event_context *child_ctx; - - if (!event->attr.inherit) { - *inherited_all = 0; - return 0; - } - - child_ctx = child->perf_event_ctxp[ctxn]; - if (!child_ctx) { - /* - * This is executed from the parent task context, so - * inherit events that have been marked for cloning. - * First allocate and initialize a context for the - * child. - */ - - child_ctx = alloc_perf_context(event->pmu, child); - if (!child_ctx) - return -ENOMEM; - - child->perf_event_ctxp[ctxn] = child_ctx; - } - - ret = inherit_group(event, parent, parent_ctx, - child, child_ctx); - - if (ret) - *inherited_all = 0; - - return ret; -} - -/* - * Initialize the perf_event context in task_struct - */ -int perf_event_init_context(struct task_struct *child, int ctxn) -{ - struct perf_event_context *child_ctx, *parent_ctx; - struct perf_event_context *cloned_ctx; - struct perf_event *event; - struct task_struct *parent = current; - int inherited_all = 1; - unsigned long flags; - int ret = 0; - - if (likely(!parent->perf_event_ctxp[ctxn])) - return 0; - - /* - * If the parent's context is a clone, pin it so it won't get - * swapped under us. - */ - parent_ctx = perf_pin_task_context(parent, ctxn); - - /* - * No need to check if parent_ctx != NULL here; since we saw - * it non-NULL earlier, the only reason for it to become NULL - * is if we exit, and since we're currently in the middle of - * a fork we can't be exiting at the same time. - */ - - /* - * Lock the parent list. No need to lock the child - not PID - * hashed yet and not running, so nobody can access it. - */ - mutex_lock(&parent_ctx->mutex); - - /* - * We dont have to disable NMIs - we are only looking at - * the list, not manipulating it: - */ - list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) { - ret = inherit_task_group(event, parent, parent_ctx, - child, ctxn, &inherited_all); - if (ret) - break; - } - - /* - * We can't hold ctx->lock when iterating the ->flexible_group list due - * to allocations, but we need to prevent rotation because - * rotate_ctx() will change the list from interrupt context. - */ - raw_spin_lock_irqsave(&parent_ctx->lock, flags); - parent_ctx->rotate_disable = 1; - raw_spin_unlock_irqrestore(&parent_ctx->lock, flags); - - list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) { - ret = inherit_task_group(event, parent, parent_ctx, - child, ctxn, &inherited_all); - if (ret) - break; - } - - raw_spin_lock_irqsave(&parent_ctx->lock, flags); - parent_ctx->rotate_disable = 0; - - child_ctx = child->perf_event_ctxp[ctxn]; - - if (child_ctx && inherited_all) { - /* - * Mark the child context as a clone of the parent - * context, or of whatever the parent is a clone of. - * - * Note that if the parent is a clone, the holding of - * parent_ctx->lock avoids it from being uncloned. - */ - cloned_ctx = parent_ctx->parent_ctx; - if (cloned_ctx) { - child_ctx->parent_ctx = cloned_ctx; - child_ctx->parent_gen = parent_ctx->parent_gen; - } else { - child_ctx->parent_ctx = parent_ctx; - child_ctx->parent_gen = parent_ctx->generation; - } - get_ctx(child_ctx->parent_ctx); - } - - raw_spin_unlock_irqrestore(&parent_ctx->lock, flags); - mutex_unlock(&parent_ctx->mutex); - - perf_unpin_context(parent_ctx); - put_ctx(parent_ctx); - - return ret; -} - -/* - * Initialize the perf_event context in task_struct - */ -int perf_event_init_task(struct task_struct *child) -{ - int ctxn, ret; - - memset(child->perf_event_ctxp, 0, sizeof(child->perf_event_ctxp)); - mutex_init(&child->perf_event_mutex); - INIT_LIST_HEAD(&child->perf_event_list); - - for_each_task_context_nr(ctxn) { - ret = perf_event_init_context(child, ctxn); - if (ret) - return ret; - } - - return 0; -} - -static void __init perf_event_init_all_cpus(void) -{ - struct swevent_htable *swhash; - int cpu; - - for_each_possible_cpu(cpu) { - swhash = &per_cpu(swevent_htable, cpu); - mutex_init(&swhash->hlist_mutex); - INIT_LIST_HEAD(&per_cpu(rotation_list, cpu)); - } -} - -static void __cpuinit perf_event_init_cpu(int cpu) -{ - struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); - - mutex_lock(&swhash->hlist_mutex); - if (swhash->hlist_refcount > 0) { - struct swevent_hlist *hlist; - - hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu)); - WARN_ON(!hlist); - rcu_assign_pointer(swhash->swevent_hlist, hlist); - } - mutex_unlock(&swhash->hlist_mutex); -} - -#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC -static void perf_pmu_rotate_stop(struct pmu *pmu) -{ - struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); - - WARN_ON(!irqs_disabled()); - - list_del_init(&cpuctx->rotation_list); -} - -static void __perf_event_exit_context(void *__info) -{ - struct perf_event_context *ctx = __info; - struct perf_event *event, *tmp; - - perf_pmu_rotate_stop(ctx->pmu); - - list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) - __perf_remove_from_context(event); - list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry) - __perf_remove_from_context(event); -} - -static void perf_event_exit_cpu_context(int cpu) -{ - struct perf_event_context *ctx; - struct pmu *pmu; - int idx; - - idx = srcu_read_lock(&pmus_srcu); - list_for_each_entry_rcu(pmu, &pmus, entry) { - ctx = &per_cpu_ptr(pmu->pmu_cpu_context, cpu)->ctx; - - mutex_lock(&ctx->mutex); - smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1); - mutex_unlock(&ctx->mutex); - } - srcu_read_unlock(&pmus_srcu, idx); -} - -static void perf_event_exit_cpu(int cpu) -{ - struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); - - mutex_lock(&swhash->hlist_mutex); - swevent_hlist_release(swhash); - mutex_unlock(&swhash->hlist_mutex); - - perf_event_exit_cpu_context(cpu); -} -#else -static inline void perf_event_exit_cpu(int cpu) { } -#endif - -static int -perf_reboot(struct notifier_block *notifier, unsigned long val, void *v) -{ - int cpu; - - for_each_online_cpu(cpu) - perf_event_exit_cpu(cpu); - - return NOTIFY_OK; -} - -/* - * Run the perf reboot notifier at the very last possible moment so that - * the generic watchdog code runs as long as possible. - */ -static struct notifier_block perf_reboot_notifier = { - .notifier_call = perf_reboot, - .priority = INT_MIN, -}; - -static int __cpuinit -perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) -{ - unsigned int cpu = (long)hcpu; - - switch (action & ~CPU_TASKS_FROZEN) { - - case CPU_UP_PREPARE: - case CPU_DOWN_FAILED: - perf_event_init_cpu(cpu); - break; - - case CPU_UP_CANCELED: - case CPU_DOWN_PREPARE: - perf_event_exit_cpu(cpu); - break; - - default: - break; - } - - return NOTIFY_OK; -} - -void __init perf_event_init(void) -{ - int ret; - - idr_init(&pmu_idr); - - perf_event_init_all_cpus(); - init_srcu_struct(&pmus_srcu); - perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE); - perf_pmu_register(&perf_cpu_clock, NULL, -1); - perf_pmu_register(&perf_task_clock, NULL, -1); - perf_tp_register(); - perf_cpu_notifier(perf_cpu_notify); - register_reboot_notifier(&perf_reboot_notifier); - - ret = init_hw_breakpoint(); - WARN(ret, "hw_breakpoint initialization failed with: %d", ret); -} - -static int __init perf_event_sysfs_init(void) -{ - struct pmu *pmu; - int ret; - - mutex_lock(&pmus_lock); - - ret = bus_register(&pmu_bus); - if (ret) - goto unlock; - - list_for_each_entry(pmu, &pmus, entry) { - if (!pmu->name || pmu->type < 0) - continue; - - ret = pmu_dev_alloc(pmu); - WARN(ret, "Failed to register pmu: %s, reason %d\n", pmu->name, ret); - } - pmu_bus_running = 1; - ret = 0; - -unlock: - mutex_unlock(&pmus_lock); - - return ret; -} -device_initcall(perf_event_sysfs_init); - -#ifdef CONFIG_CGROUP_PERF -static struct cgroup_subsys_state *perf_cgroup_create( - struct cgroup_subsys *ss, struct cgroup *cont) -{ - struct perf_cgroup *jc; - - jc = kzalloc(sizeof(*jc), GFP_KERNEL); - if (!jc) - return ERR_PTR(-ENOMEM); - - jc->info = alloc_percpu(struct perf_cgroup_info); - if (!jc->info) { - kfree(jc); - return ERR_PTR(-ENOMEM); - } - - return &jc->css; -} - -static void perf_cgroup_destroy(struct cgroup_subsys *ss, - struct cgroup *cont) -{ - struct perf_cgroup *jc; - jc = container_of(cgroup_subsys_state(cont, perf_subsys_id), - struct perf_cgroup, css); - free_percpu(jc->info); - kfree(jc); -} - -static int __perf_cgroup_move(void *info) -{ - struct task_struct *task = info; - perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN); - return 0; -} - -static void perf_cgroup_move(struct task_struct *task) -{ - task_function_call(task, __perf_cgroup_move, task); -} - -static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct cgroup *old_cgrp, struct task_struct *task, - bool threadgroup) -{ - perf_cgroup_move(task); - if (threadgroup) { - struct task_struct *c; - rcu_read_lock(); - list_for_each_entry_rcu(c, &task->thread_group, thread_group) { - perf_cgroup_move(c); - } - rcu_read_unlock(); - } -} - -static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct cgroup *old_cgrp, struct task_struct *task) -{ - /* - * cgroup_exit() is called in the copy_process() failure path. - * Ignore this case since the task hasn't ran yet, this avoids - * trying to poke a half freed task state from generic code. - */ - if (!(task->flags & PF_EXITING)) - return; - - perf_cgroup_move(task); -} - -struct cgroup_subsys perf_subsys = { - .name = "perf_event", - .subsys_id = perf_subsys_id, - .create = perf_cgroup_create, - .destroy = perf_cgroup_destroy, - .exit = perf_cgroup_exit, - .attach = perf_cgroup_attach, -}; -#endif /* CONFIG_CGROUP_PERF */ -- cgit v1.1 From 48dbb6dc86ca5d1b2224937d774c7ba98bc3a485 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 3 May 2011 15:26:43 +0200 Subject: hw breakpoints: Move to kernel/events/ As part of the events sybsystem unification, relocate hw_breakpoint.c into its new destination. Cc: Frederic Weisbecker Signed-off-by: Borislav Petkov --- kernel/Makefile | 1 - kernel/events/Makefile | 3 +- kernel/events/hw_breakpoint.c | 659 ++++++++++++++++++++++++++++++++++++++++++ kernel/hw_breakpoint.c | 659 ------------------------------------------ 4 files changed, 661 insertions(+), 661 deletions(-) create mode 100644 kernel/events/hw_breakpoint.c delete mode 100644 kernel/hw_breakpoint.c diff --git a/kernel/Makefile b/kernel/Makefile index 7981530..e9cf191 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -105,7 +105,6 @@ obj-$(CONFIG_IRQ_WORK) += irq_work.o obj-$(CONFIG_PERF_EVENTS) += events/ -obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o obj-$(CONFIG_PADATA) += padata.o obj-$(CONFIG_CRASH_DUMP) += crash_dump.o diff --git a/kernel/events/Makefile b/kernel/events/Makefile index 26c00e4..1ce23d3 100644 --- a/kernel/events/Makefile +++ b/kernel/events/Makefile @@ -2,4 +2,5 @@ ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_core.o = -pg endif -obj-y += core.o +obj-y := core.o +obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c new file mode 100644 index 0000000..086adf2 --- /dev/null +++ b/kernel/events/hw_breakpoint.c @@ -0,0 +1,659 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) 2007 Alan Stern + * Copyright (C) IBM Corporation, 2009 + * Copyright (C) 2009, Frederic Weisbecker + * + * Thanks to Ingo Molnar for his many suggestions. + * + * Authors: Alan Stern + * K.Prasad + * Frederic Weisbecker + */ + +/* + * HW_breakpoint: a unified kernel/user-space hardware breakpoint facility, + * using the CPU's debug registers. + * This file contains the arch-independent routines. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + + +/* + * Constraints data + */ + +/* Number of pinned cpu breakpoints in a cpu */ +static DEFINE_PER_CPU(unsigned int, nr_cpu_bp_pinned[TYPE_MAX]); + +/* Number of pinned task breakpoints in a cpu */ +static DEFINE_PER_CPU(unsigned int *, nr_task_bp_pinned[TYPE_MAX]); + +/* Number of non-pinned cpu/task breakpoints in a cpu */ +static DEFINE_PER_CPU(unsigned int, nr_bp_flexible[TYPE_MAX]); + +static int nr_slots[TYPE_MAX]; + +/* Keep track of the breakpoints attached to tasks */ +static LIST_HEAD(bp_task_head); + +static int constraints_initialized; + +/* Gather the number of total pinned and un-pinned bp in a cpuset */ +struct bp_busy_slots { + unsigned int pinned; + unsigned int flexible; +}; + +/* Serialize accesses to the above constraints */ +static DEFINE_MUTEX(nr_bp_mutex); + +__weak int hw_breakpoint_weight(struct perf_event *bp) +{ + return 1; +} + +static inline enum bp_type_idx find_slot_idx(struct perf_event *bp) +{ + if (bp->attr.bp_type & HW_BREAKPOINT_RW) + return TYPE_DATA; + + return TYPE_INST; +} + +/* + * Report the maximum number of pinned breakpoints a task + * have in this cpu + */ +static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type) +{ + int i; + unsigned int *tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu); + + for (i = nr_slots[type] - 1; i >= 0; i--) { + if (tsk_pinned[i] > 0) + return i + 1; + } + + return 0; +} + +/* + * Count the number of breakpoints of the same type and same task. + * The given event must be not on the list. + */ +static int task_bp_pinned(struct perf_event *bp, enum bp_type_idx type) +{ + struct task_struct *tsk = bp->hw.bp_target; + struct perf_event *iter; + int count = 0; + + list_for_each_entry(iter, &bp_task_head, hw.bp_list) { + if (iter->hw.bp_target == tsk && find_slot_idx(iter) == type) + count += hw_breakpoint_weight(iter); + } + + return count; +} + +/* + * Report the number of pinned/un-pinned breakpoints we have in + * a given cpu (cpu > -1) or in all of them (cpu = -1). + */ +static void +fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp, + enum bp_type_idx type) +{ + int cpu = bp->cpu; + struct task_struct *tsk = bp->hw.bp_target; + + if (cpu >= 0) { + slots->pinned = per_cpu(nr_cpu_bp_pinned[type], cpu); + if (!tsk) + slots->pinned += max_task_bp_pinned(cpu, type); + else + slots->pinned += task_bp_pinned(bp, type); + slots->flexible = per_cpu(nr_bp_flexible[type], cpu); + + return; + } + + for_each_online_cpu(cpu) { + unsigned int nr; + + nr = per_cpu(nr_cpu_bp_pinned[type], cpu); + if (!tsk) + nr += max_task_bp_pinned(cpu, type); + else + nr += task_bp_pinned(bp, type); + + if (nr > slots->pinned) + slots->pinned = nr; + + nr = per_cpu(nr_bp_flexible[type], cpu); + + if (nr > slots->flexible) + slots->flexible = nr; + } +} + +/* + * For now, continue to consider flexible as pinned, until we can + * ensure no flexible event can ever be scheduled before a pinned event + * in a same cpu. + */ +static void +fetch_this_slot(struct bp_busy_slots *slots, int weight) +{ + slots->pinned += weight; +} + +/* + * Add a pinned breakpoint for the given task in our constraint table + */ +static void toggle_bp_task_slot(struct perf_event *bp, int cpu, bool enable, + enum bp_type_idx type, int weight) +{ + unsigned int *tsk_pinned; + int old_count = 0; + int old_idx = 0; + int idx = 0; + + old_count = task_bp_pinned(bp, type); + old_idx = old_count - 1; + idx = old_idx + weight; + + /* tsk_pinned[n] is the number of tasks having n breakpoints */ + tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu); + if (enable) { + tsk_pinned[idx]++; + if (old_count > 0) + tsk_pinned[old_idx]--; + } else { + tsk_pinned[idx]--; + if (old_count > 0) + tsk_pinned[old_idx]++; + } +} + +/* + * Add/remove the given breakpoint in our constraint table + */ +static void +toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, + int weight) +{ + int cpu = bp->cpu; + struct task_struct *tsk = bp->hw.bp_target; + + /* Pinned counter cpu profiling */ + if (!tsk) { + + if (enable) + per_cpu(nr_cpu_bp_pinned[type], bp->cpu) += weight; + else + per_cpu(nr_cpu_bp_pinned[type], bp->cpu) -= weight; + return; + } + + /* Pinned counter task profiling */ + + if (!enable) + list_del(&bp->hw.bp_list); + + if (cpu >= 0) { + toggle_bp_task_slot(bp, cpu, enable, type, weight); + } else { + for_each_online_cpu(cpu) + toggle_bp_task_slot(bp, cpu, enable, type, weight); + } + + if (enable) + list_add_tail(&bp->hw.bp_list, &bp_task_head); +} + +/* + * Function to perform processor-specific cleanup during unregistration + */ +__weak void arch_unregister_hw_breakpoint(struct perf_event *bp) +{ + /* + * A weak stub function here for those archs that don't define + * it inside arch/.../kernel/hw_breakpoint.c + */ +} + +/* + * Contraints to check before allowing this new breakpoint counter: + * + * == Non-pinned counter == (Considered as pinned for now) + * + * - If attached to a single cpu, check: + * + * (per_cpu(nr_bp_flexible, cpu) || (per_cpu(nr_cpu_bp_pinned, cpu) + * + max(per_cpu(nr_task_bp_pinned, cpu)))) < HBP_NUM + * + * -> If there are already non-pinned counters in this cpu, it means + * there is already a free slot for them. + * Otherwise, we check that the maximum number of per task + * breakpoints (for this cpu) plus the number of per cpu breakpoint + * (for this cpu) doesn't cover every registers. + * + * - If attached to every cpus, check: + * + * (per_cpu(nr_bp_flexible, *) || (max(per_cpu(nr_cpu_bp_pinned, *)) + * + max(per_cpu(nr_task_bp_pinned, *)))) < HBP_NUM + * + * -> This is roughly the same, except we check the number of per cpu + * bp for every cpu and we keep the max one. Same for the per tasks + * breakpoints. + * + * + * == Pinned counter == + * + * - If attached to a single cpu, check: + * + * ((per_cpu(nr_bp_flexible, cpu) > 1) + per_cpu(nr_cpu_bp_pinned, cpu) + * + max(per_cpu(nr_task_bp_pinned, cpu))) < HBP_NUM + * + * -> Same checks as before. But now the nr_bp_flexible, if any, must keep + * one register at least (or they will never be fed). + * + * - If attached to every cpus, check: + * + * ((per_cpu(nr_bp_flexible, *) > 1) + max(per_cpu(nr_cpu_bp_pinned, *)) + * + max(per_cpu(nr_task_bp_pinned, *))) < HBP_NUM + */ +static int __reserve_bp_slot(struct perf_event *bp) +{ + struct bp_busy_slots slots = {0}; + enum bp_type_idx type; + int weight; + + /* We couldn't initialize breakpoint constraints on boot */ + if (!constraints_initialized) + return -ENOMEM; + + /* Basic checks */ + if (bp->attr.bp_type == HW_BREAKPOINT_EMPTY || + bp->attr.bp_type == HW_BREAKPOINT_INVALID) + return -EINVAL; + + type = find_slot_idx(bp); + weight = hw_breakpoint_weight(bp); + + fetch_bp_busy_slots(&slots, bp, type); + /* + * Simulate the addition of this breakpoint to the constraints + * and see the result. + */ + fetch_this_slot(&slots, weight); + + /* Flexible counters need to keep at least one slot */ + if (slots.pinned + (!!slots.flexible) > nr_slots[type]) + return -ENOSPC; + + toggle_bp_slot(bp, true, type, weight); + + return 0; +} + +int reserve_bp_slot(struct perf_event *bp) +{ + int ret; + + mutex_lock(&nr_bp_mutex); + + ret = __reserve_bp_slot(bp); + + mutex_unlock(&nr_bp_mutex); + + return ret; +} + +static void __release_bp_slot(struct perf_event *bp) +{ + enum bp_type_idx type; + int weight; + + type = find_slot_idx(bp); + weight = hw_breakpoint_weight(bp); + toggle_bp_slot(bp, false, type, weight); +} + +void release_bp_slot(struct perf_event *bp) +{ + mutex_lock(&nr_bp_mutex); + + arch_unregister_hw_breakpoint(bp); + __release_bp_slot(bp); + + mutex_unlock(&nr_bp_mutex); +} + +/* + * Allow the kernel debugger to reserve breakpoint slots without + * taking a lock using the dbg_* variant of for the reserve and + * release breakpoint slots. + */ +int dbg_reserve_bp_slot(struct perf_event *bp) +{ + if (mutex_is_locked(&nr_bp_mutex)) + return -1; + + return __reserve_bp_slot(bp); +} + +int dbg_release_bp_slot(struct perf_event *bp) +{ + if (mutex_is_locked(&nr_bp_mutex)) + return -1; + + __release_bp_slot(bp); + + return 0; +} + +static int validate_hw_breakpoint(struct perf_event *bp) +{ + int ret; + + ret = arch_validate_hwbkpt_settings(bp); + if (ret) + return ret; + + if (arch_check_bp_in_kernelspace(bp)) { + if (bp->attr.exclude_kernel) + return -EINVAL; + /* + * Don't let unprivileged users set a breakpoint in the trap + * path to avoid trap recursion attacks. + */ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + } + + return 0; +} + +int register_perf_hw_breakpoint(struct perf_event *bp) +{ + int ret; + + ret = reserve_bp_slot(bp); + if (ret) + return ret; + + ret = validate_hw_breakpoint(bp); + + /* if arch_validate_hwbkpt_settings() fails then release bp slot */ + if (ret) + release_bp_slot(bp); + + return ret; +} + +/** + * register_user_hw_breakpoint - register a hardware breakpoint for user space + * @attr: breakpoint attributes + * @triggered: callback to trigger when we hit the breakpoint + * @tsk: pointer to 'task_struct' of the process to which the address belongs + */ +struct perf_event * +register_user_hw_breakpoint(struct perf_event_attr *attr, + perf_overflow_handler_t triggered, + struct task_struct *tsk) +{ + return perf_event_create_kernel_counter(attr, -1, tsk, triggered); +} +EXPORT_SYMBOL_GPL(register_user_hw_breakpoint); + +/** + * modify_user_hw_breakpoint - modify a user-space hardware breakpoint + * @bp: the breakpoint structure to modify + * @attr: new breakpoint attributes + * @triggered: callback to trigger when we hit the breakpoint + * @tsk: pointer to 'task_struct' of the process to which the address belongs + */ +int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr) +{ + u64 old_addr = bp->attr.bp_addr; + u64 old_len = bp->attr.bp_len; + int old_type = bp->attr.bp_type; + int err = 0; + + perf_event_disable(bp); + + bp->attr.bp_addr = attr->bp_addr; + bp->attr.bp_type = attr->bp_type; + bp->attr.bp_len = attr->bp_len; + + if (attr->disabled) + goto end; + + err = validate_hw_breakpoint(bp); + if (!err) + perf_event_enable(bp); + + if (err) { + bp->attr.bp_addr = old_addr; + bp->attr.bp_type = old_type; + bp->attr.bp_len = old_len; + if (!bp->attr.disabled) + perf_event_enable(bp); + + return err; + } + +end: + bp->attr.disabled = attr->disabled; + + return 0; +} +EXPORT_SYMBOL_GPL(modify_user_hw_breakpoint); + +/** + * unregister_hw_breakpoint - unregister a user-space hardware breakpoint + * @bp: the breakpoint structure to unregister + */ +void unregister_hw_breakpoint(struct perf_event *bp) +{ + if (!bp) + return; + perf_event_release_kernel(bp); +} +EXPORT_SYMBOL_GPL(unregister_hw_breakpoint); + +/** + * register_wide_hw_breakpoint - register a wide breakpoint in the kernel + * @attr: breakpoint attributes + * @triggered: callback to trigger when we hit the breakpoint + * + * @return a set of per_cpu pointers to perf events + */ +struct perf_event * __percpu * +register_wide_hw_breakpoint(struct perf_event_attr *attr, + perf_overflow_handler_t triggered) +{ + struct perf_event * __percpu *cpu_events, **pevent, *bp; + long err; + int cpu; + + cpu_events = alloc_percpu(typeof(*cpu_events)); + if (!cpu_events) + return (void __percpu __force *)ERR_PTR(-ENOMEM); + + get_online_cpus(); + for_each_online_cpu(cpu) { + pevent = per_cpu_ptr(cpu_events, cpu); + bp = perf_event_create_kernel_counter(attr, cpu, NULL, triggered); + + *pevent = bp; + + if (IS_ERR(bp)) { + err = PTR_ERR(bp); + goto fail; + } + } + put_online_cpus(); + + return cpu_events; + +fail: + for_each_online_cpu(cpu) { + pevent = per_cpu_ptr(cpu_events, cpu); + if (IS_ERR(*pevent)) + break; + unregister_hw_breakpoint(*pevent); + } + put_online_cpus(); + + free_percpu(cpu_events); + return (void __percpu __force *)ERR_PTR(err); +} +EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint); + +/** + * unregister_wide_hw_breakpoint - unregister a wide breakpoint in the kernel + * @cpu_events: the per cpu set of events to unregister + */ +void unregister_wide_hw_breakpoint(struct perf_event * __percpu *cpu_events) +{ + int cpu; + struct perf_event **pevent; + + for_each_possible_cpu(cpu) { + pevent = per_cpu_ptr(cpu_events, cpu); + unregister_hw_breakpoint(*pevent); + } + free_percpu(cpu_events); +} +EXPORT_SYMBOL_GPL(unregister_wide_hw_breakpoint); + +static struct notifier_block hw_breakpoint_exceptions_nb = { + .notifier_call = hw_breakpoint_exceptions_notify, + /* we need to be notified first */ + .priority = 0x7fffffff +}; + +static void bp_perf_event_destroy(struct perf_event *event) +{ + release_bp_slot(event); +} + +static int hw_breakpoint_event_init(struct perf_event *bp) +{ + int err; + + if (bp->attr.type != PERF_TYPE_BREAKPOINT) + return -ENOENT; + + err = register_perf_hw_breakpoint(bp); + if (err) + return err; + + bp->destroy = bp_perf_event_destroy; + + return 0; +} + +static int hw_breakpoint_add(struct perf_event *bp, int flags) +{ + if (!(flags & PERF_EF_START)) + bp->hw.state = PERF_HES_STOPPED; + + return arch_install_hw_breakpoint(bp); +} + +static void hw_breakpoint_del(struct perf_event *bp, int flags) +{ + arch_uninstall_hw_breakpoint(bp); +} + +static void hw_breakpoint_start(struct perf_event *bp, int flags) +{ + bp->hw.state = 0; +} + +static void hw_breakpoint_stop(struct perf_event *bp, int flags) +{ + bp->hw.state = PERF_HES_STOPPED; +} + +static struct pmu perf_breakpoint = { + .task_ctx_nr = perf_sw_context, /* could eventually get its own */ + + .event_init = hw_breakpoint_event_init, + .add = hw_breakpoint_add, + .del = hw_breakpoint_del, + .start = hw_breakpoint_start, + .stop = hw_breakpoint_stop, + .read = hw_breakpoint_pmu_read, +}; + +int __init init_hw_breakpoint(void) +{ + unsigned int **task_bp_pinned; + int cpu, err_cpu; + int i; + + for (i = 0; i < TYPE_MAX; i++) + nr_slots[i] = hw_breakpoint_slots(i); + + for_each_possible_cpu(cpu) { + for (i = 0; i < TYPE_MAX; i++) { + task_bp_pinned = &per_cpu(nr_task_bp_pinned[i], cpu); + *task_bp_pinned = kzalloc(sizeof(int) * nr_slots[i], + GFP_KERNEL); + if (!*task_bp_pinned) + goto err_alloc; + } + } + + constraints_initialized = 1; + + perf_pmu_register(&perf_breakpoint, "breakpoint", PERF_TYPE_BREAKPOINT); + + return register_die_notifier(&hw_breakpoint_exceptions_nb); + + err_alloc: + for_each_possible_cpu(err_cpu) { + if (err_cpu == cpu) + break; + for (i = 0; i < TYPE_MAX; i++) + kfree(per_cpu(nr_task_bp_pinned[i], cpu)); + } + + return -ENOMEM; +} + + diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c deleted file mode 100644 index 086adf2..0000000 --- a/kernel/hw_breakpoint.c +++ /dev/null @@ -1,659 +0,0 @@ -/* - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * - * Copyright (C) 2007 Alan Stern - * Copyright (C) IBM Corporation, 2009 - * Copyright (C) 2009, Frederic Weisbecker - * - * Thanks to Ingo Molnar for his many suggestions. - * - * Authors: Alan Stern - * K.Prasad - * Frederic Weisbecker - */ - -/* - * HW_breakpoint: a unified kernel/user-space hardware breakpoint facility, - * using the CPU's debug registers. - * This file contains the arch-independent routines. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - - -/* - * Constraints data - */ - -/* Number of pinned cpu breakpoints in a cpu */ -static DEFINE_PER_CPU(unsigned int, nr_cpu_bp_pinned[TYPE_MAX]); - -/* Number of pinned task breakpoints in a cpu */ -static DEFINE_PER_CPU(unsigned int *, nr_task_bp_pinned[TYPE_MAX]); - -/* Number of non-pinned cpu/task breakpoints in a cpu */ -static DEFINE_PER_CPU(unsigned int, nr_bp_flexible[TYPE_MAX]); - -static int nr_slots[TYPE_MAX]; - -/* Keep track of the breakpoints attached to tasks */ -static LIST_HEAD(bp_task_head); - -static int constraints_initialized; - -/* Gather the number of total pinned and un-pinned bp in a cpuset */ -struct bp_busy_slots { - unsigned int pinned; - unsigned int flexible; -}; - -/* Serialize accesses to the above constraints */ -static DEFINE_MUTEX(nr_bp_mutex); - -__weak int hw_breakpoint_weight(struct perf_event *bp) -{ - return 1; -} - -static inline enum bp_type_idx find_slot_idx(struct perf_event *bp) -{ - if (bp->attr.bp_type & HW_BREAKPOINT_RW) - return TYPE_DATA; - - return TYPE_INST; -} - -/* - * Report the maximum number of pinned breakpoints a task - * have in this cpu - */ -static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type) -{ - int i; - unsigned int *tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu); - - for (i = nr_slots[type] - 1; i >= 0; i--) { - if (tsk_pinned[i] > 0) - return i + 1; - } - - return 0; -} - -/* - * Count the number of breakpoints of the same type and same task. - * The given event must be not on the list. - */ -static int task_bp_pinned(struct perf_event *bp, enum bp_type_idx type) -{ - struct task_struct *tsk = bp->hw.bp_target; - struct perf_event *iter; - int count = 0; - - list_for_each_entry(iter, &bp_task_head, hw.bp_list) { - if (iter->hw.bp_target == tsk && find_slot_idx(iter) == type) - count += hw_breakpoint_weight(iter); - } - - return count; -} - -/* - * Report the number of pinned/un-pinned breakpoints we have in - * a given cpu (cpu > -1) or in all of them (cpu = -1). - */ -static void -fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp, - enum bp_type_idx type) -{ - int cpu = bp->cpu; - struct task_struct *tsk = bp->hw.bp_target; - - if (cpu >= 0) { - slots->pinned = per_cpu(nr_cpu_bp_pinned[type], cpu); - if (!tsk) - slots->pinned += max_task_bp_pinned(cpu, type); - else - slots->pinned += task_bp_pinned(bp, type); - slots->flexible = per_cpu(nr_bp_flexible[type], cpu); - - return; - } - - for_each_online_cpu(cpu) { - unsigned int nr; - - nr = per_cpu(nr_cpu_bp_pinned[type], cpu); - if (!tsk) - nr += max_task_bp_pinned(cpu, type); - else - nr += task_bp_pinned(bp, type); - - if (nr > slots->pinned) - slots->pinned = nr; - - nr = per_cpu(nr_bp_flexible[type], cpu); - - if (nr > slots->flexible) - slots->flexible = nr; - } -} - -/* - * For now, continue to consider flexible as pinned, until we can - * ensure no flexible event can ever be scheduled before a pinned event - * in a same cpu. - */ -static void -fetch_this_slot(struct bp_busy_slots *slots, int weight) -{ - slots->pinned += weight; -} - -/* - * Add a pinned breakpoint for the given task in our constraint table - */ -static void toggle_bp_task_slot(struct perf_event *bp, int cpu, bool enable, - enum bp_type_idx type, int weight) -{ - unsigned int *tsk_pinned; - int old_count = 0; - int old_idx = 0; - int idx = 0; - - old_count = task_bp_pinned(bp, type); - old_idx = old_count - 1; - idx = old_idx + weight; - - /* tsk_pinned[n] is the number of tasks having n breakpoints */ - tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu); - if (enable) { - tsk_pinned[idx]++; - if (old_count > 0) - tsk_pinned[old_idx]--; - } else { - tsk_pinned[idx]--; - if (old_count > 0) - tsk_pinned[old_idx]++; - } -} - -/* - * Add/remove the given breakpoint in our constraint table - */ -static void -toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, - int weight) -{ - int cpu = bp->cpu; - struct task_struct *tsk = bp->hw.bp_target; - - /* Pinned counter cpu profiling */ - if (!tsk) { - - if (enable) - per_cpu(nr_cpu_bp_pinned[type], bp->cpu) += weight; - else - per_cpu(nr_cpu_bp_pinned[type], bp->cpu) -= weight; - return; - } - - /* Pinned counter task profiling */ - - if (!enable) - list_del(&bp->hw.bp_list); - - if (cpu >= 0) { - toggle_bp_task_slot(bp, cpu, enable, type, weight); - } else { - for_each_online_cpu(cpu) - toggle_bp_task_slot(bp, cpu, enable, type, weight); - } - - if (enable) - list_add_tail(&bp->hw.bp_list, &bp_task_head); -} - -/* - * Function to perform processor-specific cleanup during unregistration - */ -__weak void arch_unregister_hw_breakpoint(struct perf_event *bp) -{ - /* - * A weak stub function here for those archs that don't define - * it inside arch/.../kernel/hw_breakpoint.c - */ -} - -/* - * Contraints to check before allowing this new breakpoint counter: - * - * == Non-pinned counter == (Considered as pinned for now) - * - * - If attached to a single cpu, check: - * - * (per_cpu(nr_bp_flexible, cpu) || (per_cpu(nr_cpu_bp_pinned, cpu) - * + max(per_cpu(nr_task_bp_pinned, cpu)))) < HBP_NUM - * - * -> If there are already non-pinned counters in this cpu, it means - * there is already a free slot for them. - * Otherwise, we check that the maximum number of per task - * breakpoints (for this cpu) plus the number of per cpu breakpoint - * (for this cpu) doesn't cover every registers. - * - * - If attached to every cpus, check: - * - * (per_cpu(nr_bp_flexible, *) || (max(per_cpu(nr_cpu_bp_pinned, *)) - * + max(per_cpu(nr_task_bp_pinned, *)))) < HBP_NUM - * - * -> This is roughly the same, except we check the number of per cpu - * bp for every cpu and we keep the max one. Same for the per tasks - * breakpoints. - * - * - * == Pinned counter == - * - * - If attached to a single cpu, check: - * - * ((per_cpu(nr_bp_flexible, cpu) > 1) + per_cpu(nr_cpu_bp_pinned, cpu) - * + max(per_cpu(nr_task_bp_pinned, cpu))) < HBP_NUM - * - * -> Same checks as before. But now the nr_bp_flexible, if any, must keep - * one register at least (or they will never be fed). - * - * - If attached to every cpus, check: - * - * ((per_cpu(nr_bp_flexible, *) > 1) + max(per_cpu(nr_cpu_bp_pinned, *)) - * + max(per_cpu(nr_task_bp_pinned, *))) < HBP_NUM - */ -static int __reserve_bp_slot(struct perf_event *bp) -{ - struct bp_busy_slots slots = {0}; - enum bp_type_idx type; - int weight; - - /* We couldn't initialize breakpoint constraints on boot */ - if (!constraints_initialized) - return -ENOMEM; - - /* Basic checks */ - if (bp->attr.bp_type == HW_BREAKPOINT_EMPTY || - bp->attr.bp_type == HW_BREAKPOINT_INVALID) - return -EINVAL; - - type = find_slot_idx(bp); - weight = hw_breakpoint_weight(bp); - - fetch_bp_busy_slots(&slots, bp, type); - /* - * Simulate the addition of this breakpoint to the constraints - * and see the result. - */ - fetch_this_slot(&slots, weight); - - /* Flexible counters need to keep at least one slot */ - if (slots.pinned + (!!slots.flexible) > nr_slots[type]) - return -ENOSPC; - - toggle_bp_slot(bp, true, type, weight); - - return 0; -} - -int reserve_bp_slot(struct perf_event *bp) -{ - int ret; - - mutex_lock(&nr_bp_mutex); - - ret = __reserve_bp_slot(bp); - - mutex_unlock(&nr_bp_mutex); - - return ret; -} - -static void __release_bp_slot(struct perf_event *bp) -{ - enum bp_type_idx type; - int weight; - - type = find_slot_idx(bp); - weight = hw_breakpoint_weight(bp); - toggle_bp_slot(bp, false, type, weight); -} - -void release_bp_slot(struct perf_event *bp) -{ - mutex_lock(&nr_bp_mutex); - - arch_unregister_hw_breakpoint(bp); - __release_bp_slot(bp); - - mutex_unlock(&nr_bp_mutex); -} - -/* - * Allow the kernel debugger to reserve breakpoint slots without - * taking a lock using the dbg_* variant of for the reserve and - * release breakpoint slots. - */ -int dbg_reserve_bp_slot(struct perf_event *bp) -{ - if (mutex_is_locked(&nr_bp_mutex)) - return -1; - - return __reserve_bp_slot(bp); -} - -int dbg_release_bp_slot(struct perf_event *bp) -{ - if (mutex_is_locked(&nr_bp_mutex)) - return -1; - - __release_bp_slot(bp); - - return 0; -} - -static int validate_hw_breakpoint(struct perf_event *bp) -{ - int ret; - - ret = arch_validate_hwbkpt_settings(bp); - if (ret) - return ret; - - if (arch_check_bp_in_kernelspace(bp)) { - if (bp->attr.exclude_kernel) - return -EINVAL; - /* - * Don't let unprivileged users set a breakpoint in the trap - * path to avoid trap recursion attacks. - */ - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - } - - return 0; -} - -int register_perf_hw_breakpoint(struct perf_event *bp) -{ - int ret; - - ret = reserve_bp_slot(bp); - if (ret) - return ret; - - ret = validate_hw_breakpoint(bp); - - /* if arch_validate_hwbkpt_settings() fails then release bp slot */ - if (ret) - release_bp_slot(bp); - - return ret; -} - -/** - * register_user_hw_breakpoint - register a hardware breakpoint for user space - * @attr: breakpoint attributes - * @triggered: callback to trigger when we hit the breakpoint - * @tsk: pointer to 'task_struct' of the process to which the address belongs - */ -struct perf_event * -register_user_hw_breakpoint(struct perf_event_attr *attr, - perf_overflow_handler_t triggered, - struct task_struct *tsk) -{ - return perf_event_create_kernel_counter(attr, -1, tsk, triggered); -} -EXPORT_SYMBOL_GPL(register_user_hw_breakpoint); - -/** - * modify_user_hw_breakpoint - modify a user-space hardware breakpoint - * @bp: the breakpoint structure to modify - * @attr: new breakpoint attributes - * @triggered: callback to trigger when we hit the breakpoint - * @tsk: pointer to 'task_struct' of the process to which the address belongs - */ -int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr) -{ - u64 old_addr = bp->attr.bp_addr; - u64 old_len = bp->attr.bp_len; - int old_type = bp->attr.bp_type; - int err = 0; - - perf_event_disable(bp); - - bp->attr.bp_addr = attr->bp_addr; - bp->attr.bp_type = attr->bp_type; - bp->attr.bp_len = attr->bp_len; - - if (attr->disabled) - goto end; - - err = validate_hw_breakpoint(bp); - if (!err) - perf_event_enable(bp); - - if (err) { - bp->attr.bp_addr = old_addr; - bp->attr.bp_type = old_type; - bp->attr.bp_len = old_len; - if (!bp->attr.disabled) - perf_event_enable(bp); - - return err; - } - -end: - bp->attr.disabled = attr->disabled; - - return 0; -} -EXPORT_SYMBOL_GPL(modify_user_hw_breakpoint); - -/** - * unregister_hw_breakpoint - unregister a user-space hardware breakpoint - * @bp: the breakpoint structure to unregister - */ -void unregister_hw_breakpoint(struct perf_event *bp) -{ - if (!bp) - return; - perf_event_release_kernel(bp); -} -EXPORT_SYMBOL_GPL(unregister_hw_breakpoint); - -/** - * register_wide_hw_breakpoint - register a wide breakpoint in the kernel - * @attr: breakpoint attributes - * @triggered: callback to trigger when we hit the breakpoint - * - * @return a set of per_cpu pointers to perf events - */ -struct perf_event * __percpu * -register_wide_hw_breakpoint(struct perf_event_attr *attr, - perf_overflow_handler_t triggered) -{ - struct perf_event * __percpu *cpu_events, **pevent, *bp; - long err; - int cpu; - - cpu_events = alloc_percpu(typeof(*cpu_events)); - if (!cpu_events) - return (void __percpu __force *)ERR_PTR(-ENOMEM); - - get_online_cpus(); - for_each_online_cpu(cpu) { - pevent = per_cpu_ptr(cpu_events, cpu); - bp = perf_event_create_kernel_counter(attr, cpu, NULL, triggered); - - *pevent = bp; - - if (IS_ERR(bp)) { - err = PTR_ERR(bp); - goto fail; - } - } - put_online_cpus(); - - return cpu_events; - -fail: - for_each_online_cpu(cpu) { - pevent = per_cpu_ptr(cpu_events, cpu); - if (IS_ERR(*pevent)) - break; - unregister_hw_breakpoint(*pevent); - } - put_online_cpus(); - - free_percpu(cpu_events); - return (void __percpu __force *)ERR_PTR(err); -} -EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint); - -/** - * unregister_wide_hw_breakpoint - unregister a wide breakpoint in the kernel - * @cpu_events: the per cpu set of events to unregister - */ -void unregister_wide_hw_breakpoint(struct perf_event * __percpu *cpu_events) -{ - int cpu; - struct perf_event **pevent; - - for_each_possible_cpu(cpu) { - pevent = per_cpu_ptr(cpu_events, cpu); - unregister_hw_breakpoint(*pevent); - } - free_percpu(cpu_events); -} -EXPORT_SYMBOL_GPL(unregister_wide_hw_breakpoint); - -static struct notifier_block hw_breakpoint_exceptions_nb = { - .notifier_call = hw_breakpoint_exceptions_notify, - /* we need to be notified first */ - .priority = 0x7fffffff -}; - -static void bp_perf_event_destroy(struct perf_event *event) -{ - release_bp_slot(event); -} - -static int hw_breakpoint_event_init(struct perf_event *bp) -{ - int err; - - if (bp->attr.type != PERF_TYPE_BREAKPOINT) - return -ENOENT; - - err = register_perf_hw_breakpoint(bp); - if (err) - return err; - - bp->destroy = bp_perf_event_destroy; - - return 0; -} - -static int hw_breakpoint_add(struct perf_event *bp, int flags) -{ - if (!(flags & PERF_EF_START)) - bp->hw.state = PERF_HES_STOPPED; - - return arch_install_hw_breakpoint(bp); -} - -static void hw_breakpoint_del(struct perf_event *bp, int flags) -{ - arch_uninstall_hw_breakpoint(bp); -} - -static void hw_breakpoint_start(struct perf_event *bp, int flags) -{ - bp->hw.state = 0; -} - -static void hw_breakpoint_stop(struct perf_event *bp, int flags) -{ - bp->hw.state = PERF_HES_STOPPED; -} - -static struct pmu perf_breakpoint = { - .task_ctx_nr = perf_sw_context, /* could eventually get its own */ - - .event_init = hw_breakpoint_event_init, - .add = hw_breakpoint_add, - .del = hw_breakpoint_del, - .start = hw_breakpoint_start, - .stop = hw_breakpoint_stop, - .read = hw_breakpoint_pmu_read, -}; - -int __init init_hw_breakpoint(void) -{ - unsigned int **task_bp_pinned; - int cpu, err_cpu; - int i; - - for (i = 0; i < TYPE_MAX; i++) - nr_slots[i] = hw_breakpoint_slots(i); - - for_each_possible_cpu(cpu) { - for (i = 0; i < TYPE_MAX; i++) { - task_bp_pinned = &per_cpu(nr_task_bp_pinned[i], cpu); - *task_bp_pinned = kzalloc(sizeof(int) * nr_slots[i], - GFP_KERNEL); - if (!*task_bp_pinned) - goto err_alloc; - } - } - - constraints_initialized = 1; - - perf_pmu_register(&perf_breakpoint, "breakpoint", PERF_TYPE_BREAKPOINT); - - return register_die_notifier(&hw_breakpoint_exceptions_nb); - - err_alloc: - for_each_possible_cpu(err_cpu) { - if (err_cpu == cpu) - break; - for (i = 0; i < TYPE_MAX; i++) - kfree(per_cpu(nr_task_bp_pinned[i], cpu)); - } - - return -ENOMEM; -} - - -- cgit v1.1 From 3772d26d87efc2d91b2e4247e0001c89ed09a980 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 3 May 2011 09:28:08 -0700 Subject: ceph: use ihold() when i_lock is held See 0444d76ae64fffc7851797fc1b6ebdbb44ac504a. Signed-off-by: Sage Weil --- fs/ceph/caps.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 5323c33..010ba9c 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1357,7 +1357,7 @@ void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) list_add(&ci->i_dirty_item, &mdsc->cap_dirty); spin_unlock(&mdsc->cap_dirty_lock); if (ci->i_flushing_caps == 0) { - igrab(inode); + ihold(inode); dirty |= I_DIRTY_SYNC; } } @@ -1991,7 +1991,7 @@ static void __take_cap_refs(struct ceph_inode_info *ci, int got) ci->i_wr_ref++; if (got & CEPH_CAP_FILE_BUFFER) { if (ci->i_wrbuffer_ref == 0) - igrab(&ci->vfs_inode); + ihold(&ci->vfs_inode); ci->i_wrbuffer_ref++; dout("__take_cap_refs %p wrbuffer %d -> %d (?)\n", &ci->vfs_inode, ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref); -- cgit v1.1 From ca20892db7567c40e8ed0668f46cf0d085d7db6d Mon Sep 17 00:00:00 2001 From: Henry C Chang Date: Tue, 3 May 2011 02:29:56 +0000 Subject: libceph: fix ceph_msg_new error path If memory allocation failed, calling ceph_msg_put() will cause GPF since some of ceph_msg variables are not initialized first. Fix Bug #970. Signed-off-by: Henry C Chang Signed-off-by: Sage Weil --- net/ceph/messenger.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 05f3578..e15a82c 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -2267,6 +2267,19 @@ struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags) m->more_to_follow = false; m->pool = NULL; + /* middle */ + m->middle = NULL; + + /* data */ + m->nr_pages = 0; + m->page_alignment = 0; + m->pages = NULL; + m->pagelist = NULL; + m->bio = NULL; + m->bio_iter = NULL; + m->bio_seg = 0; + m->trail = NULL; + /* front */ if (front_len) { if (front_len > PAGE_CACHE_SIZE) { @@ -2286,19 +2299,6 @@ struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags) } m->front.iov_len = front_len; - /* middle */ - m->middle = NULL; - - /* data */ - m->nr_pages = 0; - m->page_alignment = 0; - m->pages = NULL; - m->pagelist = NULL; - m->bio = NULL; - m->bio_iter = NULL; - m->bio_seg = 0; - m->trail = NULL; - dout("ceph_msg_new %p front %d\n", m, front_len); return m; -- cgit v1.1 From 8c71897be2ddfd84969412635ca42fa9e137f7b6 Mon Sep 17 00:00:00 2001 From: Henry C Chang Date: Tue, 3 May 2011 09:45:16 +0000 Subject: ceph: handle ceph_osdc_new_request failure in ceph_writepages_start We should unlock the page and return -ENOMEM if ceph_osdc_new_request failed. Signed-off-by: Henry C Chang Signed-off-by: Sage Weil --- fs/ceph/addr.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index e159c52..38b8ab5 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -775,6 +775,13 @@ get_more_pages: ci->i_truncate_seq, ci->i_truncate_size, &inode->i_mtime, true, 1, 0); + + if (!req) { + rc = -ENOMEM; + unlock_page(page); + break; + } + max_pages = req->r_num_pages; alloc_page_vec(fsc, req); -- cgit v1.1 From 4ad12621e442b7a072e81270808f617cb65c5672 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 3 May 2011 09:23:36 -0700 Subject: libceph: fix ceph_osdc_alloc_request error checks ceph_osdc_alloc_request returns NULL on failure. Signed-off-by: Sage Weil --- drivers/block/rbd.c | 4 ++-- net/ceph/osd_client.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 16dc364..3e90471 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -777,9 +777,9 @@ static int rbd_do_request(struct request *rq, ops, false, GFP_NOIO, pages, bio); - if (IS_ERR(req)) { + if (!req) { up_read(&header->snap_rwsem); - ret = PTR_ERR(req); + ret = -ENOMEM; goto done_pages; } diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 5a80f41..6b5dda1 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -470,8 +470,8 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, snapc, ops, use_mempool, GFP_NOFS, NULL, NULL); - if (IS_ERR(req)) - return req; + if (!req) + return NULL; /* calculate max write size */ calc_layout(osdc, vino, layout, off, plen, req, ops); -- cgit v1.1 From 005967a1df80980acb47c72d758ec05059105492 Mon Sep 17 00:00:00 2001 From: Lars-Peter Clausen Date: Sat, 30 Apr 2011 22:28:20 +0200 Subject: ASoC: JZ4740: Fix i2s shutdown The i2s shutdown callback has the check whether it should be disabled reversed. Currently it is disabled if another stream is still active, but kept enabled if the last stream is closed. This patch fixes it. Signed-off-by: Lars-Peter Clausen Acked-by: Liam Girdwood Signed-off-by: Mark Brown --- sound/soc/jz4740/jz4740-i2s.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/soc/jz4740/jz4740-i2s.c b/sound/soc/jz4740/jz4740-i2s.c index 419bf4f..cd22a54 100644 --- a/sound/soc/jz4740/jz4740-i2s.c +++ b/sound/soc/jz4740/jz4740-i2s.c @@ -133,7 +133,7 @@ static void jz4740_i2s_shutdown(struct snd_pcm_substream *substream, struct jz4740_i2s *i2s = snd_soc_dai_get_drvdata(dai); uint32_t conf; - if (!dai->active) + if (dai->active) return; conf = jz4740_i2s_read(i2s, JZ_REG_AIC_CONF); -- cgit v1.1 From 4f0871a6c7811a433513c3788a7cce27033bb8b8 Mon Sep 17 00:00:00 2001 From: Andiry Xu Date: Tue, 19 Apr 2011 17:17:39 +0800 Subject: xHCI: Clear PLC in xhci_bus_resume() This patch clears PORT_PLC if xhci_bus_resume() resumes a previous suspended port, because if a port transition from U3 to U0 state, it will report a port link state change, and software should clear the corresponding PLC bit. It also uses hcd->speed to check if a port is a USB2 protocol port. The patch fixes the issue that USB keyboard can not wakeup system from hibernation. Signed-off-by: Andiry Xu Signed-off-by: Sarah Sharp --- drivers/usb/host/xhci-hub.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/drivers/usb/host/xhci-hub.c b/drivers/usb/host/xhci-hub.c index a78f2eb..73f75d2 100644 --- a/drivers/usb/host/xhci-hub.c +++ b/drivers/usb/host/xhci-hub.c @@ -777,7 +777,7 @@ int xhci_bus_suspend(struct usb_hcd *hcd) if (t1 != t2) xhci_writel(xhci, t2, port_array[port_index]); - if (DEV_HIGHSPEED(t1)) { + if (hcd->speed != HCD_USB3) { /* enable remote wake up for USB 2.0 */ u32 __iomem *addr; u32 tmp; @@ -866,6 +866,21 @@ int xhci_bus_resume(struct usb_hcd *hcd) temp |= PORT_LINK_STROBE | XDEV_U0; xhci_writel(xhci, temp, port_array[port_index]); } + /* wait for the port to enter U0 and report port link + * state change. + */ + spin_unlock_irqrestore(&xhci->lock, flags); + msleep(20); + spin_lock_irqsave(&xhci->lock, flags); + + /* Clear PLC */ + temp = xhci_readl(xhci, port_array[port_index]); + if (temp & PORT_PLC) { + temp = xhci_port_state_to_neutral(temp); + temp |= PORT_PLC; + xhci_writel(xhci, temp, port_array[port_index]); + } + slot_id = xhci_find_slot_id_by_port(hcd, xhci, port_index + 1); if (slot_id) @@ -873,7 +888,7 @@ int xhci_bus_resume(struct usb_hcd *hcd) } else xhci_writel(xhci, temp, port_array[port_index]); - if (DEV_HIGHSPEED(temp)) { + if (hcd->speed != HCD_USB3) { /* disable remote wake up for USB 2.0 */ u32 __iomem *addr; u32 tmp; -- cgit v1.1 From 9005fcd89c24f2a0fa6f87588b9208aae6d4d6cd Mon Sep 17 00:00:00 2001 From: Harry Wei Date: Thu, 28 Apr 2011 09:30:01 +0800 Subject: staging: Remove a warning for drivers/staging/wlan-ng/cfg80211.c Hi us, When i was compiling kernel, a warning happened to me. The warning said like following. drivers/staging/wlan-ng/cfg80211.c:709: warning: initialization from incompatible pointer type. See http://s1202.photobucket.com/albums/bb364/harrywei/?action=view¤t=patched2.png for more details. So i patch like following. Signed-off-by: Harry Wei Acked-by: Randy Dunlap Signed-off-by: Greg Kroah-Hartman --- drivers/staging/wlan-ng/cfg80211.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/staging/wlan-ng/cfg80211.c b/drivers/staging/wlan-ng/cfg80211.c index 6a71f52..7637839 100644 --- a/drivers/staging/wlan-ng/cfg80211.c +++ b/drivers/staging/wlan-ng/cfg80211.c @@ -273,7 +273,7 @@ exit: } int prism2_set_default_key(struct wiphy *wiphy, struct net_device *dev, - u8 key_index) + u8 key_index, bool unicast, bool multicast) { wlandevice_t *wlandev = dev->ml_priv; -- cgit v1.1 From c055f5b2614b4f758ae6cc86733f31fa4c2c5844 Mon Sep 17 00:00:00 2001 From: James Bottomley Date: Sun, 1 May 2011 09:42:07 -0500 Subject: [SCSI] fix oops in scsi_run_queue() The recent commit closing the race window in device teardown: commit 86cbfb5607d4b81b1a993ff689bbd2addd5d3a9b Author: James Bottomley Date: Fri Apr 22 10:39:59 2011 -0500 [SCSI] put stricter guards on queue dead checks is causing a potential NULL deref in scsi_run_queue() because the q->queuedata may already be NULL by the time this function is called. Since we shouldn't be running a queue that is being torn down, simply add a NULL check in scsi_run_queue() to forestall this. Tested-by: Jim Schutt Cc: stable@kernel.org Signed-off-by: James Bottomley --- drivers/scsi/scsi_lib.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index e9901b8..0bac91e 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -400,10 +400,15 @@ static inline int scsi_host_is_busy(struct Scsi_Host *shost) static void scsi_run_queue(struct request_queue *q) { struct scsi_device *sdev = q->queuedata; - struct Scsi_Host *shost = sdev->host; + struct Scsi_Host *shost; LIST_HEAD(starved_list); unsigned long flags; + /* if the device is dead, sdev will be NULL, so no queue to run */ + if (!sdev) + return; + + shost = sdev->host; if (scsi_target(sdev)->single_lun) scsi_single_lun_run(sdev); -- cgit v1.1 From ed77cc122a8402db8f9c3492649aa0c3fee7b385 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 3 May 2011 18:25:34 +0100 Subject: ASoC: Don't crash on PM operations The move over to exposing snd_soc_register_card() let the initialisation of the driver data we use to find the card in PM operations go AWOL. Fix this by setting the driver data when we register the card. Signed-off-by: Mark Brown Acked-by: Liam Girdwood --- sound/soc/soc-core.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sound/soc/soc-core.c b/sound/soc/soc-core.c index d8562ce..dd55d10 100644 --- a/sound/soc/soc-core.c +++ b/sound/soc/soc-core.c @@ -3291,6 +3291,8 @@ int snd_soc_register_card(struct snd_soc_card *card) if (!card->name || !card->dev) return -EINVAL; + dev_set_drvdata(card->dev, card); + snd_soc_initialize_card_lists(card); soc_init_card_debugfs(card); -- cgit v1.1 From 9ab88434e8b5ffc5a638b5b1d3b9a67dceb28e5d Mon Sep 17 00:00:00 2001 From: xingchao Date: Wed, 27 Apr 2011 16:58:54 -0400 Subject: ASoC: sst_platform: add hw_free callback to fix resource leak Signed-off-by: xingchao Acked-by: Liam Girdwood Signed-off-by: Mark Brown --- sound/soc/mid-x86/sst_platform.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/sound/soc/mid-x86/sst_platform.c b/sound/soc/mid-x86/sst_platform.c index d567c32..6b1f9d3 100644 --- a/sound/soc/mid-x86/sst_platform.c +++ b/sound/soc/mid-x86/sst_platform.c @@ -376,6 +376,11 @@ static int sst_platform_pcm_hw_params(struct snd_pcm_substream *substream, return 0; } +static int sst_platform_pcm_hw_free(struct snd_pcm_substream *substream) +{ + return snd_pcm_lib_free_pages(substream); +} + static struct snd_pcm_ops sst_platform_ops = { .open = sst_platform_open, .close = sst_platform_close, @@ -384,6 +389,7 @@ static struct snd_pcm_ops sst_platform_ops = { .trigger = sst_platform_pcm_trigger, .pointer = sst_platform_pcm_pointer, .hw_params = sst_platform_pcm_hw_params, + .hw_free = sst_platform_pcm_hw_free, }; static void sst_pcm_free(struct snd_pcm *pcm) -- cgit v1.1 From cce2c56e7666199916525907dc817209dd58287c Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 3 May 2011 16:10:25 -0700 Subject: logfs: initialize superblock entries earlier In particular, s_freeing_list needs to be initialized early, since it is used on some of the error paths when mounts fail. The mapping inode, for example, would be initialized and then free'd on an error path before s_freeing_list was initialized, but the inode drop operation needs the s_freeing_list to be set up. Normally you'd never see this, because not only is logfs fairly rare, but a successful mount will never have any issues. Reported-by: werner Signed-off-by: Linus Torvalds --- fs/logfs/super.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/logfs/super.c b/fs/logfs/super.c index 33435e4..ce03a18 100644 --- a/fs/logfs/super.c +++ b/fs/logfs/super.c @@ -480,10 +480,6 @@ static int logfs_read_sb(struct super_block *sb, int read_only) !read_only) return -EIO; - mutex_init(&super->s_dirop_mutex); - mutex_init(&super->s_object_alias_mutex); - INIT_LIST_HEAD(&super->s_freeing_list); - ret = logfs_init_rw(sb); if (ret) return ret; @@ -601,6 +597,10 @@ static struct dentry *logfs_mount(struct file_system_type *type, int flags, if (!super) return ERR_PTR(-ENOMEM); + mutex_init(&super->s_dirop_mutex); + mutex_init(&super->s_object_alias_mutex); + INIT_LIST_HEAD(&super->s_freeing_list); + if (!devname) err = logfs_get_sb_bdev(super, type, devname); else if (strncmp(devname, "mtd", 3)) -- cgit v1.1 From e2c85d8e3974c9041ad7b080846b28d2243e771b Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Tue, 3 May 2011 15:15:55 -0400 Subject: drm/radeon/kms: add some new pci ids Signed-off-by: Alex Deucher Cc: stable@kernel.org Signed-off-by: Dave Airlie --- include/drm/drm_pciids.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/drm/drm_pciids.h b/include/drm/drm_pciids.h index 816e30c..f04b2a3 100644 --- a/include/drm/drm_pciids.h +++ b/include/drm/drm_pciids.h @@ -155,6 +155,7 @@ {0x1002, 0x6719, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_CAYMAN|RADEON_NEW_MEMMAP}, \ {0x1002, 0x671c, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_CAYMAN|RADEON_NEW_MEMMAP}, \ {0x1002, 0x671d, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_CAYMAN|RADEON_NEW_MEMMAP}, \ + {0x1002, 0x671f, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_CAYMAN|RADEON_NEW_MEMMAP}, \ {0x1002, 0x6720, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_BARTS|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP}, \ {0x1002, 0x6721, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_BARTS|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP}, \ {0x1002, 0x6722, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_BARTS|RADEON_NEW_MEMMAP}, \ @@ -167,6 +168,7 @@ {0x1002, 0x6729, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_BARTS|RADEON_NEW_MEMMAP}, \ {0x1002, 0x6738, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_BARTS|RADEON_NEW_MEMMAP}, \ {0x1002, 0x6739, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_BARTS|RADEON_NEW_MEMMAP}, \ + {0x1002, 0x673e, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_BARTS|RADEON_NEW_MEMMAP}, \ {0x1002, 0x6740, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_TURKS|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP}, \ {0x1002, 0x6741, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_TURKS|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP}, \ {0x1002, 0x6742, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_TURKS|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP}, \ @@ -199,6 +201,7 @@ {0x1002, 0x688D, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_CYPRESS|RADEON_NEW_MEMMAP}, \ {0x1002, 0x6898, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_CYPRESS|RADEON_NEW_MEMMAP}, \ {0x1002, 0x6899, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_CYPRESS|RADEON_NEW_MEMMAP}, \ + {0x1002, 0x689b, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_CYPRESS|RADEON_NEW_MEMMAP}, \ {0x1002, 0x689c, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_HEMLOCK|RADEON_NEW_MEMMAP}, \ {0x1002, 0x689d, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_HEMLOCK|RADEON_NEW_MEMMAP}, \ {0x1002, 0x689e, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_CYPRESS|RADEON_NEW_MEMMAP}, \ @@ -209,7 +212,9 @@ {0x1002, 0x68b0, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_JUNIPER|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP}, \ {0x1002, 0x68b8, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_JUNIPER|RADEON_NEW_MEMMAP}, \ {0x1002, 0x68b9, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_JUNIPER|RADEON_NEW_MEMMAP}, \ + {0x1002, 0x68ba, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_JUNIPER|RADEON_NEW_MEMMAP}, \ {0x1002, 0x68be, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_JUNIPER|RADEON_NEW_MEMMAP}, \ + {0x1002, 0x68bf, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_JUNIPER|RADEON_NEW_MEMMAP}, \ {0x1002, 0x68c0, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_REDWOOD|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP}, \ {0x1002, 0x68c1, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_REDWOOD|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP}, \ {0x1002, 0x68c7, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_REDWOOD|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP}, \ -- cgit v1.1 From eaa4f5e1d0b816291a59a47917e569c0384f2b6f Mon Sep 17 00:00:00 2001 From: Dave Airlie Date: Sun, 1 May 2011 20:16:30 +1000 Subject: drm/radeon: fix regression on atom cards with hardcoded EDID record. Since fafcf94e2b5732d1e13b440291c53115d2b172e9 introduced an edid size, it seems to have broken this path. This manifest as oops on T500 Lenovo laptops with dual graphics primarily. Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=33812 cc: stable@kernel.org Reviewed-by: Alex Deucher Signed-off-by: Dave Airlie --- drivers/gpu/drm/radeon/radeon_atombios.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/radeon/radeon_atombios.c b/drivers/gpu/drm/radeon/radeon_atombios.c index f5d12fb..f116516 100644 --- a/drivers/gpu/drm/radeon/radeon_atombios.c +++ b/drivers/gpu/drm/radeon/radeon_atombios.c @@ -1599,9 +1599,10 @@ struct radeon_encoder_atom_dig *radeon_atombios_get_lvds_info(struct memcpy((u8 *)edid, (u8 *)&fake_edid_record->ucFakeEDIDString[0], fake_edid_record->ucFakeEDIDLength); - if (drm_edid_is_valid(edid)) + if (drm_edid_is_valid(edid)) { rdev->mode_info.bios_hardcoded_edid = edid; - else + rdev->mode_info.bios_hardcoded_edid_size = edid_size; + } else kfree(edid); } } -- cgit v1.1 From 498548ec69c6897fe4376b2ca90758762fa0b817 Mon Sep 17 00:00:00 2001 From: Christopher James Halse Rogers Date: Wed, 27 Apr 2011 16:10:57 +1000 Subject: drm: Send pending vblank events before disabling vblank. This is the least-bad behaviour. It means that we signal the vblank event before it actually happens, but since we're disabling vblanks there's no guarantee that it will *ever* happen otherwise. This prevents GL applications which use WaitMSC from hanging indefinitely. Signed-off-by: Christopher James Halse Rogers Signed-off-by: Dave Airlie --- drivers/gpu/drm/drm_irq.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/drivers/gpu/drm/drm_irq.c b/drivers/gpu/drm/drm_irq.c index 741457b..a1f12cb 100644 --- a/drivers/gpu/drm/drm_irq.c +++ b/drivers/gpu/drm/drm_irq.c @@ -932,11 +932,34 @@ EXPORT_SYMBOL(drm_vblank_put); void drm_vblank_off(struct drm_device *dev, int crtc) { + struct drm_pending_vblank_event *e, *t; + struct timeval now; unsigned long irqflags; + unsigned int seq; spin_lock_irqsave(&dev->vbl_lock, irqflags); vblank_disable_and_save(dev, crtc); DRM_WAKEUP(&dev->vbl_queue[crtc]); + + /* Send any queued vblank events, lest the natives grow disquiet */ + seq = drm_vblank_count_and_time(dev, crtc, &now); + list_for_each_entry_safe(e, t, &dev->vblank_event_list, base.link) { + if (e->pipe != crtc) + continue; + DRM_DEBUG("Sending premature vblank event on disable: \ + wanted %d, current %d\n", + e->event.sequence, seq); + + e->event.sequence = seq; + e->event.tv_sec = now.tv_sec; + e->event.tv_usec = now.tv_usec; + drm_vblank_put(dev, e->pipe); + list_move_tail(&e->base.link, &e->base.file_priv->event_list); + wake_up_interruptible(&e->base.file_priv->event_wait); + trace_drm_vblank_event_delivered(e->base.pid, e->pipe, + e->event.sequence); + } + spin_unlock_irqrestore(&dev->vbl_lock, irqflags); } EXPORT_SYMBOL(drm_vblank_off); -- cgit v1.1 From 8aeb96f80232e9a701b5c4715504f4c9173978bd Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Tue, 3 May 2011 19:28:02 -0400 Subject: drm/radeon/kms: fix gart setup on fusion parts (v2) Out of the entire GART/VM subsystem, the hw designers changed the location of 3 regs. v2: airlied: add parameter for userspace to work from. Signed-off-by: Alex Deucher Signed-off-by: Jerome Glisse Cc: stable@kernel.org Signed-off-by: Dave Airlie --- drivers/gpu/drm/radeon/evergreen.c | 17 +++++++++-------- drivers/gpu/drm/radeon/evergreend.h | 5 +++++ drivers/gpu/drm/radeon/radeon_kms.c | 3 +++ include/drm/radeon_drm.h | 1 + 4 files changed, 18 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/radeon/evergreen.c b/drivers/gpu/drm/radeon/evergreen.c index e9bc135..c20eac3 100644 --- a/drivers/gpu/drm/radeon/evergreen.c +++ b/drivers/gpu/drm/radeon/evergreen.c @@ -862,9 +862,15 @@ int evergreen_pcie_gart_enable(struct radeon_device *rdev) SYSTEM_ACCESS_MODE_NOT_IN_SYS | SYSTEM_APERTURE_UNMAPPED_ACCESS_PASS_THRU | EFFECTIVE_L1_TLB_SIZE(5) | EFFECTIVE_L1_QUEUE_SIZE(5); - WREG32(MC_VM_MD_L1_TLB0_CNTL, tmp); - WREG32(MC_VM_MD_L1_TLB1_CNTL, tmp); - WREG32(MC_VM_MD_L1_TLB2_CNTL, tmp); + if (rdev->flags & RADEON_IS_IGP) { + WREG32(FUS_MC_VM_MD_L1_TLB0_CNTL, tmp); + WREG32(FUS_MC_VM_MD_L1_TLB1_CNTL, tmp); + WREG32(FUS_MC_VM_MD_L1_TLB2_CNTL, tmp); + } else { + WREG32(MC_VM_MD_L1_TLB0_CNTL, tmp); + WREG32(MC_VM_MD_L1_TLB1_CNTL, tmp); + WREG32(MC_VM_MD_L1_TLB2_CNTL, tmp); + } WREG32(MC_VM_MB_L1_TLB0_CNTL, tmp); WREG32(MC_VM_MB_L1_TLB1_CNTL, tmp); WREG32(MC_VM_MB_L1_TLB2_CNTL, tmp); @@ -2923,11 +2929,6 @@ static int evergreen_startup(struct radeon_device *rdev) rdev->asic->copy = NULL; dev_warn(rdev->dev, "failed blitter (%d) falling back to memcpy\n", r); } - /* XXX: ontario has problems blitting to gart at the moment */ - if (rdev->family == CHIP_PALM) { - rdev->asic->copy = NULL; - radeon_ttm_set_active_vram_size(rdev, rdev->mc.visible_vram_size); - } /* allocate wb buffer */ r = radeon_wb_init(rdev); diff --git a/drivers/gpu/drm/radeon/evergreend.h b/drivers/gpu/drm/radeon/evergreend.h index 9aaa3f0..9453384 100644 --- a/drivers/gpu/drm/radeon/evergreend.h +++ b/drivers/gpu/drm/radeon/evergreend.h @@ -221,6 +221,11 @@ #define MC_VM_MD_L1_TLB0_CNTL 0x2654 #define MC_VM_MD_L1_TLB1_CNTL 0x2658 #define MC_VM_MD_L1_TLB2_CNTL 0x265C + +#define FUS_MC_VM_MD_L1_TLB0_CNTL 0x265C +#define FUS_MC_VM_MD_L1_TLB1_CNTL 0x2660 +#define FUS_MC_VM_MD_L1_TLB2_CNTL 0x2664 + #define MC_VM_SYSTEM_APERTURE_DEFAULT_ADDR 0x203C #define MC_VM_SYSTEM_APERTURE_HIGH_ADDR 0x2038 #define MC_VM_SYSTEM_APERTURE_LOW_ADDR 0x2034 diff --git a/drivers/gpu/drm/radeon/radeon_kms.c b/drivers/gpu/drm/radeon/radeon_kms.c index 871df03..bd58af6 100644 --- a/drivers/gpu/drm/radeon/radeon_kms.c +++ b/drivers/gpu/drm/radeon/radeon_kms.c @@ -234,6 +234,9 @@ int radeon_info_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) return -EINVAL; } break; + case RADEON_INFO_FUSION_GART_WORKING: + value = 1; + break; default: DRM_DEBUG_KMS("Invalid request %d\n", info->request); return -EINVAL; diff --git a/include/drm/radeon_drm.h b/include/drm/radeon_drm.h index 7aa5ddd..787f7b6 100644 --- a/include/drm/radeon_drm.h +++ b/include/drm/radeon_drm.h @@ -910,6 +910,7 @@ struct drm_radeon_cs { #define RADEON_INFO_CLOCK_CRYSTAL_FREQ 0x09 /* clock crystal frequency */ #define RADEON_INFO_NUM_BACKENDS 0x0a /* DB/backends for r600+ - need for OQ */ #define RADEON_INFO_NUM_TILE_PIPES 0x0b /* tile pipes for r600+ */ +#define RADEON_INFO_FUSION_GART_WORKING 0x0c /* fusion writes to GTT were broken before this */ struct drm_radeon_info { uint32_t request; -- cgit v1.1 From 0ee5623f9a6e52df90a78bd21179f8ab370e102e Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 3 May 2011 19:59:13 -0700 Subject: Linux 2.6.39-rc6 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 5a7a2e4..28820f7 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ VERSION = 2 PATCHLEVEL = 6 SUBLEVEL = 39 -EXTRAVERSION = -rc5 +EXTRAVERSION = -rc6 NAME = Flesh-Eating Bats with Fangs # *DOCUMENTATION* -- cgit v1.1 From e7e7ee2eab2080248084d71fe0a115ab745eb2aa Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 4 May 2011 08:42:29 +0200 Subject: perf events: Clean up definitions and initializers, update copyrights Fix a few inconsistent style bits that were added over the past few months. Cc: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-yv4hwf9yhnzoada8pcpb3a97@git.kernel.org Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 96 +++++++++++++++++++++------------------------- kernel/events/core.c | 40 +++++++++---------- 2 files changed, 64 insertions(+), 72 deletions(-) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 9eec53d..207c169 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -2,8 +2,8 @@ * Performance events: * * Copyright (C) 2008-2009, Thomas Gleixner - * Copyright (C) 2008-2009, Red Hat, Inc., Ingo Molnar - * Copyright (C) 2008-2009, Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2008-2011, Red Hat, Inc., Ingo Molnar + * Copyright (C) 2008-2011, Red Hat, Inc., Peter Zijlstra * * Data type definitions, declarations, prototypes. * @@ -468,9 +468,9 @@ enum perf_callchain_context { PERF_CONTEXT_MAX = (__u64)-4095, }; -#define PERF_FLAG_FD_NO_GROUP (1U << 0) -#define PERF_FLAG_FD_OUTPUT (1U << 1) -#define PERF_FLAG_PID_CGROUP (1U << 2) /* pid=cgroup id, per-cpu mode only */ +#define PERF_FLAG_FD_NO_GROUP (1U << 0) +#define PERF_FLAG_FD_OUTPUT (1U << 1) +#define PERF_FLAG_PID_CGROUP (1U << 2) /* pid=cgroup id, per-cpu mode only */ #ifdef __KERNEL__ /* @@ -484,9 +484,9 @@ enum perf_callchain_context { #endif struct perf_guest_info_callbacks { - int (*is_in_guest) (void); - int (*is_user_mode) (void); - unsigned long (*get_guest_ip) (void); + int (*is_in_guest)(void); + int (*is_user_mode)(void); + unsigned long (*get_guest_ip)(void); }; #ifdef CONFIG_HAVE_HW_BREAKPOINT @@ -652,19 +652,19 @@ struct pmu { * Start the transaction, after this ->add() doesn't need to * do schedulability tests. */ - void (*start_txn) (struct pmu *pmu); /* optional */ + void (*start_txn) (struct pmu *pmu); /* optional */ /* * If ->start_txn() disabled the ->add() schedulability test * then ->commit_txn() is required to perform one. On success * the transaction is closed. On error the transaction is kept * open until ->cancel_txn() is called. */ - int (*commit_txn) (struct pmu *pmu); /* optional */ + int (*commit_txn) (struct pmu *pmu); /* optional */ /* * Will cancel the transaction, assumes ->del() is called * for each successful ->add() during the transaction. */ - void (*cancel_txn) (struct pmu *pmu); /* optional */ + void (*cancel_txn) (struct pmu *pmu); /* optional */ }; /** @@ -712,15 +712,15 @@ typedef void (*perf_overflow_handler_t)(struct perf_event *, int, struct pt_regs *regs); enum perf_group_flag { - PERF_GROUP_SOFTWARE = 0x1, + PERF_GROUP_SOFTWARE = 0x1, }; -#define SWEVENT_HLIST_BITS 8 -#define SWEVENT_HLIST_SIZE (1 << SWEVENT_HLIST_BITS) +#define SWEVENT_HLIST_BITS 8 +#define SWEVENT_HLIST_SIZE (1 << SWEVENT_HLIST_BITS) struct swevent_hlist { - struct hlist_head heads[SWEVENT_HLIST_SIZE]; - struct rcu_head rcu_head; + struct hlist_head heads[SWEVENT_HLIST_SIZE]; + struct rcu_head rcu_head; }; #define PERF_ATTACH_CONTEXT 0x01 @@ -733,13 +733,13 @@ struct swevent_hlist { * This is a per-cpu dynamically allocated data structure. */ struct perf_cgroup_info { - u64 time; - u64 timestamp; + u64 time; + u64 timestamp; }; struct perf_cgroup { - struct cgroup_subsys_state css; - struct perf_cgroup_info *info; /* timing info, one per cpu */ + struct cgroup_subsys_state css; + struct perf_cgroup_info *info; /* timing info, one per cpu */ }; #endif @@ -923,7 +923,7 @@ struct perf_event_context { /* * Number of contexts where an event can trigger: - * task, softirq, hardirq, nmi. + * task, softirq, hardirq, nmi. */ #define PERF_NR_CONTEXTS 4 @@ -1001,8 +1001,7 @@ struct perf_sample_data { struct perf_raw_record *raw; }; -static inline -void perf_sample_data_init(struct perf_sample_data *data, u64 addr) +static inline void perf_sample_data_init(struct perf_sample_data *data, u64 addr) { data->addr = addr; data->raw = NULL; @@ -1039,8 +1038,7 @@ extern struct jump_label_key perf_swevent_enabled[PERF_COUNT_SW_MAX]; extern void __perf_sw_event(u32, u64, int, struct pt_regs *, u64); #ifndef perf_arch_fetch_caller_regs -static inline void -perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip) { } +static inline void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip) { } #endif /* @@ -1080,8 +1078,7 @@ static inline void perf_event_task_sched_in(struct task_struct *task) __perf_event_task_sched_in(task); } -static inline -void perf_event_task_sched_out(struct task_struct *task, struct task_struct *next) +static inline void perf_event_task_sched_out(struct task_struct *task, struct task_struct *next) { perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0); @@ -1099,14 +1096,10 @@ extern void perf_event_fork(struct task_struct *tsk); /* Callchains */ DECLARE_PER_CPU(struct perf_callchain_entry, perf_callchain_entry); -extern void perf_callchain_user(struct perf_callchain_entry *entry, - struct pt_regs *regs); -extern void perf_callchain_kernel(struct perf_callchain_entry *entry, - struct pt_regs *regs); - +extern void perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs); +extern void perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs); -static inline void -perf_callchain_store(struct perf_callchain_entry *entry, u64 ip) +static inline void perf_callchain_store(struct perf_callchain_entry *entry, u64 ip) { if (entry->nr < PERF_MAX_STACK_DEPTH) entry->ip[entry->nr++] = ip; @@ -1142,9 +1135,9 @@ extern void perf_tp_event(u64 addr, u64 count, void *record, extern void perf_bp_event(struct perf_event *event, void *data); #ifndef perf_misc_flags -#define perf_misc_flags(regs) (user_mode(regs) ? PERF_RECORD_MISC_USER : \ - PERF_RECORD_MISC_KERNEL) -#define perf_instruction_pointer(regs) instruction_pointer(regs) +# define perf_misc_flags(regs) \ + (user_mode(regs) ? PERF_RECORD_MISC_USER : PERF_RECORD_MISC_KERNEL) +# define perf_instruction_pointer(regs) instruction_pointer(regs) #endif extern int perf_output_begin(struct perf_output_handle *handle, @@ -1179,9 +1172,9 @@ static inline void perf_bp_event(struct perf_event *event, void *data) { } static inline int perf_register_guest_info_callbacks -(struct perf_guest_info_callbacks *callbacks) { return 0; } +(struct perf_guest_info_callbacks *callbacks) { return 0; } static inline int perf_unregister_guest_info_callbacks -(struct perf_guest_info_callbacks *callbacks) { return 0; } +(struct perf_guest_info_callbacks *callbacks) { return 0; } static inline void perf_event_mmap(struct vm_area_struct *vma) { } static inline void perf_event_comm(struct task_struct *tsk) { } @@ -1194,23 +1187,22 @@ static inline void perf_event_disable(struct perf_event *event) { } static inline void perf_event_task_tick(void) { } #endif -#define perf_output_put(handle, x) \ - perf_output_copy((handle), &(x), sizeof(x)) +#define perf_output_put(handle, x) perf_output_copy((handle), &(x), sizeof(x)) /* * This has to have a higher priority than migration_notifier in sched.c. */ -#define perf_cpu_notifier(fn) \ -do { \ - static struct notifier_block fn##_nb __cpuinitdata = \ - { .notifier_call = fn, .priority = CPU_PRI_PERF }; \ - fn(&fn##_nb, (unsigned long)CPU_UP_PREPARE, \ - (void *)(unsigned long)smp_processor_id()); \ - fn(&fn##_nb, (unsigned long)CPU_STARTING, \ - (void *)(unsigned long)smp_processor_id()); \ - fn(&fn##_nb, (unsigned long)CPU_ONLINE, \ - (void *)(unsigned long)smp_processor_id()); \ - register_cpu_notifier(&fn##_nb); \ +#define perf_cpu_notifier(fn) \ +do { \ + static struct notifier_block fn##_nb __cpuinitdata = \ + { .notifier_call = fn, .priority = CPU_PRI_PERF }; \ + fn(&fn##_nb, (unsigned long)CPU_UP_PREPARE, \ + (void *)(unsigned long)smp_processor_id()); \ + fn(&fn##_nb, (unsigned long)CPU_STARTING, \ + (void *)(unsigned long)smp_processor_id()); \ + fn(&fn##_nb, (unsigned long)CPU_ONLINE, \ + (void *)(unsigned long)smp_processor_id()); \ + register_cpu_notifier(&fn##_nb); \ } while (0) #endif /* __KERNEL__ */ diff --git a/kernel/events/core.c b/kernel/events/core.c index 440bc48..0fc34a3 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2,8 +2,8 @@ * Performance events core code: * * Copyright (C) 2008 Thomas Gleixner - * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar - * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar + * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra * Copyright © 2009 Paul Mackerras, IBM Corp. * * For licensing details see kernel-base/COPYING @@ -39,10 +39,10 @@ #include struct remote_function_call { - struct task_struct *p; - int (*func)(void *info); - void *info; - int ret; + struct task_struct *p; + int (*func)(void *info); + void *info; + int ret; }; static void remote_function(void *data) @@ -76,10 +76,10 @@ static int task_function_call(struct task_struct *p, int (*func) (void *info), void *info) { struct remote_function_call data = { - .p = p, - .func = func, - .info = info, - .ret = -ESRCH, /* No such (running) process */ + .p = p, + .func = func, + .info = info, + .ret = -ESRCH, /* No such (running) process */ }; if (task_curr(p)) @@ -100,10 +100,10 @@ task_function_call(struct task_struct *p, int (*func) (void *info), void *info) static int cpu_function_call(int cpu, int (*func) (void *info), void *info) { struct remote_function_call data = { - .p = NULL, - .func = func, - .info = info, - .ret = -ENXIO, /* No such CPU */ + .p = NULL, + .func = func, + .info = info, + .ret = -ENXIO, /* No such CPU */ }; smp_call_function_single(cpu, remote_function, &data, 1); @@ -7445,11 +7445,11 @@ static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, } struct cgroup_subsys perf_subsys = { - .name = "perf_event", - .subsys_id = perf_subsys_id, - .create = perf_cgroup_create, - .destroy = perf_cgroup_destroy, - .exit = perf_cgroup_exit, - .attach = perf_cgroup_attach, + .name = "perf_event", + .subsys_id = perf_subsys_id, + .create = perf_cgroup_create, + .destroy = perf_cgroup_destroy, + .exit = perf_cgroup_exit, + .attach = perf_cgroup_attach, }; #endif /* CONFIG_CGROUP_PERF */ -- cgit v1.1 From 931aeeda0dca81152aec48f30be01e86a268bf89 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Tue, 3 May 2011 22:31:07 +0400 Subject: sched: Remove unused 'this_best_prio arg' from balance_tasks() It's passed across multiple functions but is never really used, so remove it. Signed-off-by: Vladimir Davydov Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1304447467-29200-1-git-send-email-vdavydov@parallels.com Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 5280272..37f2262 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -2142,7 +2142,7 @@ static unsigned long balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long max_load_move, struct sched_domain *sd, enum cpu_idle_type idle, int *all_pinned, - int *this_best_prio, struct cfs_rq *busiest_cfs_rq) + struct cfs_rq *busiest_cfs_rq) { int loops = 0, pulled = 0; long rem_load_move = max_load_move; @@ -2180,9 +2180,6 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, */ if (rem_load_move <= 0) break; - - if (p->prio < *this_best_prio) - *this_best_prio = p->prio; } out: /* @@ -2242,7 +2239,7 @@ static unsigned long load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long max_load_move, struct sched_domain *sd, enum cpu_idle_type idle, - int *all_pinned, int *this_best_prio) + int *all_pinned) { long rem_load_move = max_load_move; int busiest_cpu = cpu_of(busiest); @@ -2267,7 +2264,7 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, rem_load = div_u64(rem_load, busiest_h_load + 1); moved_load = balance_tasks(this_rq, this_cpu, busiest, - rem_load, sd, idle, all_pinned, this_best_prio, + rem_load, sd, idle, all_pinned, busiest_cfs_rq); if (!moved_load) @@ -2293,11 +2290,11 @@ static unsigned long load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long max_load_move, struct sched_domain *sd, enum cpu_idle_type idle, - int *all_pinned, int *this_best_prio) + int *all_pinned) { return balance_tasks(this_rq, this_cpu, busiest, max_load_move, sd, idle, all_pinned, - this_best_prio, &busiest->cfs); + &busiest->cfs); } #endif @@ -2314,12 +2311,11 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, int *all_pinned) { unsigned long total_load_moved = 0, load_moved; - int this_best_prio = this_rq->curr->prio; do { load_moved = load_balance_fair(this_rq, this_cpu, busiest, max_load_move - total_load_moved, - sd, idle, all_pinned, &this_best_prio); + sd, idle, all_pinned); total_load_moved += load_moved; -- cgit v1.1 From 904cc1e637a00dba1b58e7752f485f90ebf2a568 Mon Sep 17 00:00:00 2001 From: Naga Chumbalkar Date: Tue, 26 Apr 2011 17:05:18 +0000 Subject: [CPUFREQ] Fix _OSC UUID in pcc-cpufreq UUID needs to be written out the way it is described in Sec 18.5.124 of ACPI 4.0a Specification. Platform firmware's use of this UUID/_OSC is optional, which is why we didn't notice this bug earlier. Signed-off-by: Naga Chumbalkar Signed-off-by: Dave Jones Cc: stable@kernel.org --- arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c index 755a31e..907c8e6 100644 --- a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c @@ -39,7 +39,7 @@ #include -#define PCC_VERSION "1.00.00" +#define PCC_VERSION "1.10.00" #define POLL_LOOPS 300 #define CMD_COMPLETE 0x1 @@ -102,7 +102,7 @@ static struct acpi_generic_address doorbell; static u64 doorbell_preserve; static u64 doorbell_write; -static u8 OSC_UUID[16] = {0x63, 0x9B, 0x2C, 0x9F, 0x70, 0x91, 0x49, 0x1f, +static u8 OSC_UUID[16] = {0x9F, 0x2C, 0x9B, 0x63, 0x91, 0x70, 0x1f, 0x49, 0xBB, 0x4F, 0xA5, 0x98, 0x2F, 0xA1, 0xB5, 0x46}; struct pcc_cpu { -- cgit v1.1 From 27ecddc2a9f99ce4ac9a59a0acd77f7100b6d034 Mon Sep 17 00:00:00 2001 From: Jacob Shin Date: Wed, 27 Apr 2011 13:32:11 -0500 Subject: [CPUFREQ] CPU hotplug, re-create sysfs directory and symlinks When we discover CPUs that are affected by each other's frequency/voltage transitions, the first CPU gets a sysfs directory created, and rest of the siblings get symlinks. Currently, when we hotplug off only the first CPU, all of the symlinks and the sysfs directory gets removed. Even though rest of the siblings are still online and functional, they are orphaned, and no longer governed by cpufreq. This patch, given the above scenario, creates a sysfs directory for the first sibling and symlinks for the rest of the siblings. Please note the recursive call, it was rather too ugly to roll it out. And the removal of redundant NULL setting (it is already taken care of near the top of the function). Signed-off-by: Jacob Shin Acked-by: Mark Langsdorf Reviewed-by: Thomas Renninger Signed-off-by: Dave Jones Cc: stable@kernel.org --- drivers/cpufreq/cpufreq.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 2dafc5c..7c10f96 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -1208,12 +1208,28 @@ static int __cpufreq_remove_dev(struct sys_device *sys_dev) cpufreq_driver->exit(data); unlock_policy_rwsem_write(cpu); + cpufreq_debug_enable_ratelimit(); + +#ifdef CONFIG_HOTPLUG_CPU + /* when the CPU which is the parent of the kobj is hotplugged + * offline, check for siblings, and create cpufreq sysfs interface + * and symlinks + */ + if (unlikely(cpumask_weight(data->cpus) > 1)) { + /* first sibling now owns the new sysfs dir */ + cpumask_clear_cpu(cpu, data->cpus); + cpufreq_add_dev(get_cpu_sysdev(cpumask_first(data->cpus))); + + /* finally remove our own symlink */ + lock_policy_rwsem_write(cpu); + __cpufreq_remove_dev(sys_dev); + } +#endif + free_cpumask_var(data->related_cpus); free_cpumask_var(data->cpus); kfree(data); - per_cpu(cpufreq_cpu_data, cpu) = NULL; - cpufreq_debug_enable_ratelimit(); return 0; } -- cgit v1.1 From 2d06d8c49afdcc9bb35a85039fa50f0fe35bd40e Mon Sep 17 00:00:00 2001 From: Dominik Brodowski Date: Sun, 27 Mar 2011 15:04:46 +0200 Subject: [CPUFREQ] use dynamic debug instead of custom infrastructure With dynamic debug having gained the capability to report debug messages also during the boot process, it offers a far superior interface for debug messages than the custom cpufreq infrastructure. As a first step, remove the old cpufreq_debug_printk() function and replace it with a call to the generic pr_debug() function. How can dynamic debug be used on cpufreq? You need a kernel which has CONFIG_DYNAMIC_DEBUG enabled. To enabled debugging during runtime, mount debugfs and $ echo -n 'module cpufreq +p' > /sys/kernel/debug/dynamic_debug/control for debugging the complete "cpufreq" module. To achieve the same goal during boot, append ddebug_query="module cpufreq +p" as a boot parameter to the kernel of your choice. For more detailled instructions, please see Documentation/dynamic-debug-howto.txt Signed-off-by: Dominik Brodowski Signed-off-by: Dave Jones --- arch/arm/mach-davinci/cpufreq.c | 4 +- arch/blackfin/mach-common/dpmc.c | 3 - arch/ia64/kernel/cpufreq/acpi-cpufreq.c | 44 +++--- arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 45 +++--- arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c | 6 +- arch/x86/kernel/cpu/cpufreq/gx-suspmod.c | 21 ++- arch/x86/kernel/cpu/cpufreq/longhaul.c | 11 +- arch/x86/kernel/cpu/cpufreq/longrun.c | 17 +-- arch/x86/kernel/cpu/cpufreq/p4-clockmod.c | 10 +- arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c | 47 +++--- arch/x86/kernel/cpu/cpufreq/powernow-k7.c | 33 ++--- arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 100 ++++++------- arch/x86/kernel/cpu/cpufreq/powernow-k8.h | 2 - arch/x86/kernel/cpu/cpufreq/sc520_freq.c | 6 +- arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c | 23 ++- arch/x86/kernel/cpu/cpufreq/speedstep-ich.c | 28 ++-- arch/x86/kernel/cpu/cpufreq/speedstep-lib.c | 43 +++--- arch/x86/kernel/cpu/cpufreq/speedstep-smi.c | 41 +++--- drivers/acpi/processor_perflib.c | 6 +- drivers/cpufreq/Kconfig | 13 -- drivers/cpufreq/cpufreq.c | 180 +++++------------------ drivers/cpufreq/cpufreq_performance.c | 5 +- drivers/cpufreq/cpufreq_powersave.c | 5 +- drivers/cpufreq/cpufreq_userspace.c | 13 +- drivers/cpufreq/freq_table.c | 19 +-- include/linux/cpufreq.h | 19 --- 26 files changed, 270 insertions(+), 474 deletions(-) diff --git a/arch/arm/mach-davinci/cpufreq.c b/arch/arm/mach-davinci/cpufreq.c index 0a95be1..41669ec 100644 --- a/arch/arm/mach-davinci/cpufreq.c +++ b/arch/arm/mach-davinci/cpufreq.c @@ -94,9 +94,7 @@ static int davinci_target(struct cpufreq_policy *policy, if (freqs.old == freqs.new) return ret; - cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, - dev_driver_string(cpufreq.dev), - "transition: %u --> %u\n", freqs.old, freqs.new); + dev_dbg(&cpufreq.dev, "transition: %u --> %u\n", freqs.old, freqs.new); ret = cpufreq_frequency_table_target(policy, pdata->freq_table, freqs.new, relation, &idx); diff --git a/arch/blackfin/mach-common/dpmc.c b/arch/blackfin/mach-common/dpmc.c index 382099f..5e4112e 100644 --- a/arch/blackfin/mach-common/dpmc.c +++ b/arch/blackfin/mach-common/dpmc.c @@ -19,9 +19,6 @@ #define DRIVER_NAME "bfin dpmc" -#define dprintk(msg...) \ - cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, DRIVER_NAME, msg) - struct bfin_dpmc_platform_data *pdata; /** diff --git a/arch/ia64/kernel/cpufreq/acpi-cpufreq.c b/arch/ia64/kernel/cpufreq/acpi-cpufreq.c index 22f6152..f09b174 100644 --- a/arch/ia64/kernel/cpufreq/acpi-cpufreq.c +++ b/arch/ia64/kernel/cpufreq/acpi-cpufreq.c @@ -23,8 +23,6 @@ #include #include -#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "acpi-cpufreq", msg) - MODULE_AUTHOR("Venkatesh Pallipadi"); MODULE_DESCRIPTION("ACPI Processor P-States Driver"); MODULE_LICENSE("GPL"); @@ -47,12 +45,12 @@ processor_set_pstate ( { s64 retval; - dprintk("processor_set_pstate\n"); + pr_debug("processor_set_pstate\n"); retval = ia64_pal_set_pstate((u64)value); if (retval) { - dprintk("Failed to set freq to 0x%x, with error 0x%lx\n", + pr_debug("Failed to set freq to 0x%x, with error 0x%lx\n", value, retval); return -ENODEV; } @@ -67,14 +65,14 @@ processor_get_pstate ( u64 pstate_index = 0; s64 retval; - dprintk("processor_get_pstate\n"); + pr_debug("processor_get_pstate\n"); retval = ia64_pal_get_pstate(&pstate_index, PAL_GET_PSTATE_TYPE_INSTANT); *value = (u32) pstate_index; if (retval) - dprintk("Failed to get current freq with " + pr_debug("Failed to get current freq with " "error 0x%lx, idx 0x%x\n", retval, *value); return (int)retval; @@ -90,7 +88,7 @@ extract_clock ( { unsigned long i; - dprintk("extract_clock\n"); + pr_debug("extract_clock\n"); for (i = 0; i < data->acpi_data.state_count; i++) { if (value == data->acpi_data.states[i].status) @@ -110,7 +108,7 @@ processor_get_freq ( cpumask_t saved_mask; unsigned long clock_freq; - dprintk("processor_get_freq\n"); + pr_debug("processor_get_freq\n"); saved_mask = current->cpus_allowed; set_cpus_allowed_ptr(current, cpumask_of(cpu)); @@ -148,7 +146,7 @@ processor_set_freq ( cpumask_t saved_mask; int retval; - dprintk("processor_set_freq\n"); + pr_debug("processor_set_freq\n"); saved_mask = current->cpus_allowed; set_cpus_allowed_ptr(current, cpumask_of(cpu)); @@ -159,16 +157,16 @@ processor_set_freq ( if (state == data->acpi_data.state) { if (unlikely(data->resume)) { - dprintk("Called after resume, resetting to P%d\n", state); + pr_debug("Called after resume, resetting to P%d\n", state); data->resume = 0; } else { - dprintk("Already at target state (P%d)\n", state); + pr_debug("Already at target state (P%d)\n", state); retval = 0; goto migrate_end; } } - dprintk("Transitioning from P%d to P%d\n", + pr_debug("Transitioning from P%d to P%d\n", data->acpi_data.state, state); /* cpufreq frequency struct */ @@ -186,7 +184,7 @@ processor_set_freq ( value = (u32) data->acpi_data.states[state].control; - dprintk("Transitioning to state: 0x%08x\n", value); + pr_debug("Transitioning to state: 0x%08x\n", value); ret = processor_set_pstate(value); if (ret) { @@ -219,7 +217,7 @@ acpi_cpufreq_get ( { struct cpufreq_acpi_io *data = acpi_io_data[cpu]; - dprintk("acpi_cpufreq_get\n"); + pr_debug("acpi_cpufreq_get\n"); return processor_get_freq(data, cpu); } @@ -235,7 +233,7 @@ acpi_cpufreq_target ( unsigned int next_state = 0; unsigned int result = 0; - dprintk("acpi_cpufreq_setpolicy\n"); + pr_debug("acpi_cpufreq_setpolicy\n"); result = cpufreq_frequency_table_target(policy, data->freq_table, target_freq, relation, &next_state); @@ -255,7 +253,7 @@ acpi_cpufreq_verify ( unsigned int result = 0; struct cpufreq_acpi_io *data = acpi_io_data[policy->cpu]; - dprintk("acpi_cpufreq_verify\n"); + pr_debug("acpi_cpufreq_verify\n"); result = cpufreq_frequency_table_verify(policy, data->freq_table); @@ -273,7 +271,7 @@ acpi_cpufreq_cpu_init ( struct cpufreq_acpi_io *data; unsigned int result = 0; - dprintk("acpi_cpufreq_cpu_init\n"); + pr_debug("acpi_cpufreq_cpu_init\n"); data = kzalloc(sizeof(struct cpufreq_acpi_io), GFP_KERNEL); if (!data) @@ -288,7 +286,7 @@ acpi_cpufreq_cpu_init ( /* capability check */ if (data->acpi_data.state_count <= 1) { - dprintk("No P-States\n"); + pr_debug("No P-States\n"); result = -ENODEV; goto err_unreg; } @@ -297,7 +295,7 @@ acpi_cpufreq_cpu_init ( ACPI_ADR_SPACE_FIXED_HARDWARE) || (data->acpi_data.status_register.space_id != ACPI_ADR_SPACE_FIXED_HARDWARE)) { - dprintk("Unsupported address space [%d, %d]\n", + pr_debug("Unsupported address space [%d, %d]\n", (u32) (data->acpi_data.control_register.space_id), (u32) (data->acpi_data.status_register.space_id)); result = -ENODEV; @@ -348,7 +346,7 @@ acpi_cpufreq_cpu_init ( "activated.\n", cpu); for (i = 0; i < data->acpi_data.state_count; i++) - dprintk(" %cP%d: %d MHz, %d mW, %d uS, %d uS, 0x%x 0x%x\n", + pr_debug(" %cP%d: %d MHz, %d mW, %d uS, %d uS, 0x%x 0x%x\n", (i == data->acpi_data.state?'*':' '), i, (u32) data->acpi_data.states[i].core_frequency, (u32) data->acpi_data.states[i].power, @@ -383,7 +381,7 @@ acpi_cpufreq_cpu_exit ( { struct cpufreq_acpi_io *data = acpi_io_data[policy->cpu]; - dprintk("acpi_cpufreq_cpu_exit\n"); + pr_debug("acpi_cpufreq_cpu_exit\n"); if (data) { cpufreq_frequency_table_put_attr(policy->cpu); @@ -418,7 +416,7 @@ static struct cpufreq_driver acpi_cpufreq_driver = { static int __init acpi_cpufreq_init (void) { - dprintk("acpi_cpufreq_init\n"); + pr_debug("acpi_cpufreq_init\n"); return cpufreq_register_driver(&acpi_cpufreq_driver); } @@ -427,7 +425,7 @@ acpi_cpufreq_init (void) static void __exit acpi_cpufreq_exit (void) { - dprintk("acpi_cpufreq_exit\n"); + pr_debug("acpi_cpufreq_exit\n"); cpufreq_unregister_driver(&acpi_cpufreq_driver); return; diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index a2baafb..4e04e12 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -47,9 +47,6 @@ #include #include "mperf.h" -#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \ - "acpi-cpufreq", msg) - MODULE_AUTHOR("Paul Diefenbaugh, Dominik Brodowski"); MODULE_DESCRIPTION("ACPI Processor P-States Driver"); MODULE_LICENSE("GPL"); @@ -233,7 +230,7 @@ static u32 get_cur_val(const struct cpumask *mask) cmd.mask = mask; drv_read(&cmd); - dprintk("get_cur_val = %u\n", cmd.val); + pr_debug("get_cur_val = %u\n", cmd.val); return cmd.val; } @@ -244,7 +241,7 @@ static unsigned int get_cur_freq_on_cpu(unsigned int cpu) unsigned int freq; unsigned int cached_freq; - dprintk("get_cur_freq_on_cpu (%d)\n", cpu); + pr_debug("get_cur_freq_on_cpu (%d)\n", cpu); if (unlikely(data == NULL || data->acpi_data == NULL || data->freq_table == NULL)) { @@ -261,7 +258,7 @@ static unsigned int get_cur_freq_on_cpu(unsigned int cpu) data->resume = 1; } - dprintk("cur freq = %u\n", freq); + pr_debug("cur freq = %u\n", freq); return freq; } @@ -293,7 +290,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy, unsigned int i; int result = 0; - dprintk("acpi_cpufreq_target %d (%d)\n", target_freq, policy->cpu); + pr_debug("acpi_cpufreq_target %d (%d)\n", target_freq, policy->cpu); if (unlikely(data == NULL || data->acpi_data == NULL || data->freq_table == NULL)) { @@ -313,11 +310,11 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy, next_perf_state = data->freq_table[next_state].index; if (perf->state == next_perf_state) { if (unlikely(data->resume)) { - dprintk("Called after resume, resetting to P%d\n", + pr_debug("Called after resume, resetting to P%d\n", next_perf_state); data->resume = 0; } else { - dprintk("Already at target state (P%d)\n", + pr_debug("Already at target state (P%d)\n", next_perf_state); goto out; } @@ -357,7 +354,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy, if (acpi_pstate_strict) { if (!check_freqs(cmd.mask, freqs.new, data)) { - dprintk("acpi_cpufreq_target failed (%d)\n", + pr_debug("acpi_cpufreq_target failed (%d)\n", policy->cpu); result = -EAGAIN; goto out; @@ -378,7 +375,7 @@ static int acpi_cpufreq_verify(struct cpufreq_policy *policy) { struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu); - dprintk("acpi_cpufreq_verify\n"); + pr_debug("acpi_cpufreq_verify\n"); return cpufreq_frequency_table_verify(policy, data->freq_table); } @@ -433,11 +430,11 @@ static void free_acpi_perf_data(void) static int __init acpi_cpufreq_early_init(void) { unsigned int i; - dprintk("acpi_cpufreq_early_init\n"); + pr_debug("acpi_cpufreq_early_init\n"); acpi_perf_data = alloc_percpu(struct acpi_processor_performance); if (!acpi_perf_data) { - dprintk("Memory allocation error for acpi_perf_data.\n"); + pr_debug("Memory allocation error for acpi_perf_data.\n"); return -ENOMEM; } for_each_possible_cpu(i) { @@ -519,7 +516,7 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) static int blacklisted; #endif - dprintk("acpi_cpufreq_cpu_init\n"); + pr_debug("acpi_cpufreq_cpu_init\n"); #ifdef CONFIG_SMP if (blacklisted) @@ -566,7 +563,7 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) /* capability check */ if (perf->state_count <= 1) { - dprintk("No P-States\n"); + pr_debug("No P-States\n"); result = -ENODEV; goto err_unreg; } @@ -578,11 +575,11 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) switch (perf->control_register.space_id) { case ACPI_ADR_SPACE_SYSTEM_IO: - dprintk("SYSTEM IO addr space\n"); + pr_debug("SYSTEM IO addr space\n"); data->cpu_feature = SYSTEM_IO_CAPABLE; break; case ACPI_ADR_SPACE_FIXED_HARDWARE: - dprintk("HARDWARE addr space\n"); + pr_debug("HARDWARE addr space\n"); if (!check_est_cpu(cpu)) { result = -ENODEV; goto err_unreg; @@ -590,7 +587,7 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) data->cpu_feature = SYSTEM_INTEL_MSR_CAPABLE; break; default: - dprintk("Unknown addr space %d\n", + pr_debug("Unknown addr space %d\n", (u32) (perf->control_register.space_id)); result = -ENODEV; goto err_unreg; @@ -661,9 +658,9 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) if (cpu_has(c, X86_FEATURE_APERFMPERF)) acpi_cpufreq_driver.getavg = cpufreq_get_measured_perf; - dprintk("CPU%u - ACPI performance management activated.\n", cpu); + pr_debug("CPU%u - ACPI performance management activated.\n", cpu); for (i = 0; i < perf->state_count; i++) - dprintk(" %cP%d: %d MHz, %d mW, %d uS\n", + pr_debug(" %cP%d: %d MHz, %d mW, %d uS\n", (i == perf->state ? '*' : ' '), i, (u32) perf->states[i].core_frequency, (u32) perf->states[i].power, @@ -694,7 +691,7 @@ static int acpi_cpufreq_cpu_exit(struct cpufreq_policy *policy) { struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu); - dprintk("acpi_cpufreq_cpu_exit\n"); + pr_debug("acpi_cpufreq_cpu_exit\n"); if (data) { cpufreq_frequency_table_put_attr(policy->cpu); @@ -712,7 +709,7 @@ static int acpi_cpufreq_resume(struct cpufreq_policy *policy) { struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu); - dprintk("acpi_cpufreq_resume\n"); + pr_debug("acpi_cpufreq_resume\n"); data->resume = 1; @@ -743,7 +740,7 @@ static int __init acpi_cpufreq_init(void) if (acpi_disabled) return 0; - dprintk("acpi_cpufreq_init\n"); + pr_debug("acpi_cpufreq_init\n"); ret = acpi_cpufreq_early_init(); if (ret) @@ -758,7 +755,7 @@ static int __init acpi_cpufreq_init(void) static void __exit acpi_cpufreq_exit(void) { - dprintk("acpi_cpufreq_exit\n"); + pr_debug("acpi_cpufreq_exit\n"); cpufreq_unregister_driver(&acpi_cpufreq_driver); diff --git a/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c b/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c index 141abeb..7bac808 100644 --- a/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c +++ b/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c @@ -57,8 +57,6 @@ MODULE_PARM_DESC(min_fsb, "Minimum FSB to use, if not defined: current FSB - 50"); #define PFX "cpufreq-nforce2: " -#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \ - "cpufreq-nforce2", msg) /** * nforce2_calc_fsb - calculate FSB @@ -270,7 +268,7 @@ static int nforce2_target(struct cpufreq_policy *policy, if (freqs.old == freqs.new) return 0; - dprintk("Old CPU frequency %d kHz, new %d kHz\n", + pr_debug("Old CPU frequency %d kHz, new %d kHz\n", freqs.old, freqs.new); cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); @@ -282,7 +280,7 @@ static int nforce2_target(struct cpufreq_policy *policy, printk(KERN_ERR PFX "Changing FSB to %d failed\n", target_fsb); else - dprintk("Changed FSB successfully to %d\n", + pr_debug("Changed FSB successfully to %d\n", target_fsb); /* Enable IRQs */ diff --git a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c index 32974cf..ffe1f2c 100644 --- a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c +++ b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c @@ -142,9 +142,6 @@ module_param(max_duration, int, 0444); #define POLICY_MIN_DIV 20 -#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \ - "gx-suspmod", msg) - /** * we can detect a core multipiler from dir0_lsb * from GX1 datasheet p.56, @@ -191,7 +188,7 @@ static __init struct pci_dev *gx_detect_chipset(void) /* check if CPU is a MediaGX or a Geode. */ if ((boot_cpu_data.x86_vendor != X86_VENDOR_NSC) && (boot_cpu_data.x86_vendor != X86_VENDOR_CYRIX)) { - dprintk("error: no MediaGX/Geode processor found!\n"); + pr_debug("error: no MediaGX/Geode processor found!\n"); return NULL; } @@ -201,7 +198,7 @@ static __init struct pci_dev *gx_detect_chipset(void) return gx_pci; } - dprintk("error: no supported chipset found!\n"); + pr_debug("error: no supported chipset found!\n"); return NULL; } @@ -305,14 +302,14 @@ static void gx_set_cpuspeed(unsigned int khz) break; default: local_irq_restore(flags); - dprintk("fatal: try to set unknown chipset.\n"); + pr_debug("fatal: try to set unknown chipset.\n"); return; } } else { suscfg = gx_params->pci_suscfg & ~(SUSMOD); gx_params->off_duration = 0; gx_params->on_duration = 0; - dprintk("suspend modulation disabled: cpu runs 100%% speed.\n"); + pr_debug("suspend modulation disabled: cpu runs 100%% speed.\n"); } gx_write_byte(PCI_MODOFF, gx_params->off_duration); @@ -327,9 +324,9 @@ static void gx_set_cpuspeed(unsigned int khz) cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); - dprintk("suspend modulation w/ duration of ON:%d us, OFF:%d us\n", + pr_debug("suspend modulation w/ duration of ON:%d us, OFF:%d us\n", gx_params->on_duration * 32, gx_params->off_duration * 32); - dprintk("suspend modulation w/ clock speed: %d kHz.\n", freqs.new); + pr_debug("suspend modulation w/ clock speed: %d kHz.\n", freqs.new); } /**************************************************************** @@ -428,8 +425,8 @@ static int cpufreq_gx_cpu_init(struct cpufreq_policy *policy) stock_freq = maxfreq; curfreq = gx_get_cpuspeed(0); - dprintk("cpu max frequency is %d.\n", maxfreq); - dprintk("cpu current frequency is %dkHz.\n", curfreq); + pr_debug("cpu max frequency is %d.\n", maxfreq); + pr_debug("cpu current frequency is %dkHz.\n", curfreq); /* setup basic struct for cpufreq API */ policy->cpu = 0; @@ -475,7 +472,7 @@ static int __init cpufreq_gx_init(void) if (max_duration > 0xff) max_duration = 0xff; - dprintk("geode suspend modulation available.\n"); + pr_debug("geode suspend modulation available.\n"); params = kzalloc(sizeof(struct gxfreq_params), GFP_KERNEL); if (params == NULL) diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.c b/arch/x86/kernel/cpu/cpufreq/longhaul.c index cf48cdd..f47d26e 100644 --- a/arch/x86/kernel/cpu/cpufreq/longhaul.c +++ b/arch/x86/kernel/cpu/cpufreq/longhaul.c @@ -77,9 +77,6 @@ static int scale_voltage; static int disable_acpi_c3; static int revid_errata; -#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \ - "longhaul", msg) - /* Clock ratios multiplied by 10 */ static int mults[32]; @@ -87,7 +84,6 @@ static int eblcr[32]; static int longhaul_version; static struct cpufreq_frequency_table *longhaul_table; -#ifdef CONFIG_CPU_FREQ_DEBUG static char speedbuffer[8]; static char *print_speed(int speed) @@ -106,7 +102,6 @@ static char *print_speed(int speed) return speedbuffer; } -#endif static unsigned int calc_speed(int mult) @@ -275,7 +270,7 @@ static void longhaul_setstate(unsigned int table_index) cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - dprintk("Setting to FSB:%dMHz Mult:%d.%dx (%s)\n", + pr_debug("Setting to FSB:%dMHz Mult:%d.%dx (%s)\n", fsb, mult/10, mult%10, print_speed(speed/1000)); retry_loop: preempt_disable(); @@ -460,12 +455,12 @@ static int __cpuinit longhaul_get_ranges(void) break; } - dprintk("MinMult:%d.%dx MaxMult:%d.%dx\n", + pr_debug("MinMult:%d.%dx MaxMult:%d.%dx\n", minmult/10, minmult%10, maxmult/10, maxmult%10); highest_speed = calc_speed(maxmult); lowest_speed = calc_speed(minmult); - dprintk("FSB:%dMHz Lowest speed: %s Highest speed:%s\n", fsb, + pr_debug("FSB:%dMHz Lowest speed: %s Highest speed:%s\n", fsb, print_speed(lowest_speed/1000), print_speed(highest_speed/1000)); diff --git a/arch/x86/kernel/cpu/cpufreq/longrun.c b/arch/x86/kernel/cpu/cpufreq/longrun.c index d9f5136..34ea359 100644 --- a/arch/x86/kernel/cpu/cpufreq/longrun.c +++ b/arch/x86/kernel/cpu/cpufreq/longrun.c @@ -15,9 +15,6 @@ #include #include -#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \ - "longrun", msg) - static struct cpufreq_driver longrun_driver; /** @@ -40,14 +37,14 @@ static void __cpuinit longrun_get_policy(struct cpufreq_policy *policy) u32 msr_lo, msr_hi; rdmsr(MSR_TMTA_LONGRUN_FLAGS, msr_lo, msr_hi); - dprintk("longrun flags are %x - %x\n", msr_lo, msr_hi); + pr_debug("longrun flags are %x - %x\n", msr_lo, msr_hi); if (msr_lo & 0x01) policy->policy = CPUFREQ_POLICY_PERFORMANCE; else policy->policy = CPUFREQ_POLICY_POWERSAVE; rdmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi); - dprintk("longrun ctrl is %x - %x\n", msr_lo, msr_hi); + pr_debug("longrun ctrl is %x - %x\n", msr_lo, msr_hi); msr_lo &= 0x0000007F; msr_hi &= 0x0000007F; @@ -150,7 +147,7 @@ static unsigned int longrun_get(unsigned int cpu) return 0; cpuid(0x80860007, &eax, &ebx, &ecx, &edx); - dprintk("cpuid eax is %u\n", eax); + pr_debug("cpuid eax is %u\n", eax); return eax * 1000; } @@ -196,7 +193,7 @@ static int __cpuinit longrun_determine_freqs(unsigned int *low_freq, rdmsr(MSR_TMTA_LRTI_VOLT_MHZ, msr_lo, msr_hi); *high_freq = msr_lo * 1000; /* to kHz */ - dprintk("longrun table interface told %u - %u kHz\n", + pr_debug("longrun table interface told %u - %u kHz\n", *low_freq, *high_freq); if (*low_freq > *high_freq) @@ -207,7 +204,7 @@ static int __cpuinit longrun_determine_freqs(unsigned int *low_freq, /* set the upper border to the value determined during TSC init */ *high_freq = (cpu_khz / 1000); *high_freq = *high_freq * 1000; - dprintk("high frequency is %u kHz\n", *high_freq); + pr_debug("high frequency is %u kHz\n", *high_freq); /* get current borders */ rdmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi); @@ -233,7 +230,7 @@ static int __cpuinit longrun_determine_freqs(unsigned int *low_freq, /* restore values */ wrmsr(MSR_TMTA_LONGRUN_CTRL, save_lo, save_hi); } - dprintk("percentage is %u %%, freq is %u MHz\n", ecx, eax); + pr_debug("percentage is %u %%, freq is %u MHz\n", ecx, eax); /* performance_pctg = (current_freq - low_freq)/(high_freq - low_freq) * eqals @@ -249,7 +246,7 @@ static int __cpuinit longrun_determine_freqs(unsigned int *low_freq, edx = ((eax - ebx) * 100) / (100 - ecx); *low_freq = edx * 1000; /* back to kHz */ - dprintk("low frequency is %u kHz\n", *low_freq); + pr_debug("low frequency is %u kHz\n", *low_freq); if (*low_freq > *high_freq) *low_freq = *high_freq; diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c index 52c9364..6be3e07 100644 --- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c +++ b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c @@ -35,8 +35,6 @@ #include "speedstep-lib.h" #define PFX "p4-clockmod: " -#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \ - "p4-clockmod", msg) /* * Duty Cycle (3bits), note DC_DISABLE is not specified in @@ -66,7 +64,7 @@ static int cpufreq_p4_setdc(unsigned int cpu, unsigned int newstate) rdmsr_on_cpu(cpu, MSR_IA32_THERM_STATUS, &l, &h); if (l & 0x01) - dprintk("CPU#%d currently thermal throttled\n", cpu); + pr_debug("CPU#%d currently thermal throttled\n", cpu); if (has_N44_O17_errata[cpu] && (newstate == DC_25PT || newstate == DC_DFLT)) @@ -74,10 +72,10 @@ static int cpufreq_p4_setdc(unsigned int cpu, unsigned int newstate) rdmsr_on_cpu(cpu, MSR_IA32_THERM_CONTROL, &l, &h); if (newstate == DC_DISABLE) { - dprintk("CPU#%d disabling modulation\n", cpu); + pr_debug("CPU#%d disabling modulation\n", cpu); wrmsr_on_cpu(cpu, MSR_IA32_THERM_CONTROL, l & ~(1<<4), h); } else { - dprintk("CPU#%d setting duty cycle to %d%%\n", + pr_debug("CPU#%d setting duty cycle to %d%%\n", cpu, ((125 * newstate) / 10)); /* bits 63 - 5 : reserved * bit 4 : enable/disable @@ -217,7 +215,7 @@ static int cpufreq_p4_cpu_init(struct cpufreq_policy *policy) case 0x0f11: case 0x0f12: has_N44_O17_errata[policy->cpu] = 1; - dprintk("has errata -- disabling low frequencies\n"); + pr_debug("has errata -- disabling low frequencies\n"); } if (speedstep_detect_processor() == SPEEDSTEP_CPU_P4D && diff --git a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c index 907c8e6..7b0603e 100644 --- a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c @@ -48,9 +48,6 @@ #define BUF_SZ 4 -#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \ - "pcc-cpufreq", msg) - struct pcc_register_resource { u8 descriptor; u16 length; @@ -152,7 +149,7 @@ static unsigned int pcc_get_freq(unsigned int cpu) spin_lock(&pcc_lock); - dprintk("get: get_freq for CPU %d\n", cpu); + pr_debug("get: get_freq for CPU %d\n", cpu); pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu); input_buffer = 0x1; @@ -170,7 +167,7 @@ static unsigned int pcc_get_freq(unsigned int cpu) status = ioread16(&pcch_hdr->status); if (status != CMD_COMPLETE) { - dprintk("get: FAILED: for CPU %d, status is %d\n", + pr_debug("get: FAILED: for CPU %d, status is %d\n", cpu, status); goto cmd_incomplete; } @@ -178,14 +175,14 @@ static unsigned int pcc_get_freq(unsigned int cpu) curr_freq = (((ioread32(&pcch_hdr->nominal) * (output_buffer & 0xff)) / 100) * 1000); - dprintk("get: SUCCESS: (virtual) output_offset for cpu %d is " - "0x%x, contains a value of: 0x%x. Speed is: %d MHz\n", + pr_debug("get: SUCCESS: (virtual) output_offset for cpu %d is " + "0x%p, contains a value of: 0x%x. Speed is: %d MHz\n", cpu, (pcch_virt_addr + pcc_cpu_data->output_offset), output_buffer, curr_freq); freq_limit = (output_buffer >> 8) & 0xff; if (freq_limit != 0xff) { - dprintk("get: frequency for cpu %d is being temporarily" + pr_debug("get: frequency for cpu %d is being temporarily" " capped at %d\n", cpu, curr_freq); } @@ -212,8 +209,8 @@ static int pcc_cpufreq_target(struct cpufreq_policy *policy, cpu = policy->cpu; pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu); - dprintk("target: CPU %d should go to target freq: %d " - "(virtual) input_offset is 0x%x\n", + pr_debug("target: CPU %d should go to target freq: %d " + "(virtual) input_offset is 0x%p\n", cpu, target_freq, (pcch_virt_addr + pcc_cpu_data->input_offset)); @@ -234,14 +231,14 @@ static int pcc_cpufreq_target(struct cpufreq_policy *policy, status = ioread16(&pcch_hdr->status); if (status != CMD_COMPLETE) { - dprintk("target: FAILED for cpu %d, with status: 0x%x\n", + pr_debug("target: FAILED for cpu %d, with status: 0x%x\n", cpu, status); goto cmd_incomplete; } iowrite16(0, &pcch_hdr->status); cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); - dprintk("target: was SUCCESSFUL for cpu %d\n", cpu); + pr_debug("target: was SUCCESSFUL for cpu %d\n", cpu); spin_unlock(&pcc_lock); return 0; @@ -293,7 +290,7 @@ static int pcc_get_offset(int cpu) memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ); memset_io((pcch_virt_addr + pcc_cpu_data->output_offset), 0, BUF_SZ); - dprintk("pcc_get_offset: for CPU %d: pcc_cpu_data " + pr_debug("pcc_get_offset: for CPU %d: pcc_cpu_data " "input_offset: 0x%x, pcc_cpu_data output_offset: 0x%x\n", cpu, pcc_cpu_data->input_offset, pcc_cpu_data->output_offset); out_free: @@ -410,7 +407,7 @@ static int __init pcc_cpufreq_probe(void) if (ACPI_SUCCESS(status)) { ret = pcc_cpufreq_do_osc(&osc_handle); if (ret) - dprintk("probe: _OSC evaluation did not succeed\n"); + pr_debug("probe: _OSC evaluation did not succeed\n"); /* Firmware's use of _OSC is optional */ ret = 0; } @@ -433,7 +430,7 @@ static int __init pcc_cpufreq_probe(void) mem_resource = (struct pcc_memory_resource *)member->buffer.pointer; - dprintk("probe: mem_resource descriptor: 0x%x," + pr_debug("probe: mem_resource descriptor: 0x%x," " length: %d, space_id: %d, resource_usage: %d," " type_specific: %d, granularity: 0x%llx," " minimum: 0x%llx, maximum: 0x%llx," @@ -453,13 +450,13 @@ static int __init pcc_cpufreq_probe(void) pcch_virt_addr = ioremap_nocache(mem_resource->minimum, mem_resource->address_length); if (pcch_virt_addr == NULL) { - dprintk("probe: could not map shared mem region\n"); + pr_debug("probe: could not map shared mem region\n"); goto out_free; } pcch_hdr = pcch_virt_addr; - dprintk("probe: PCCH header (virtual) addr: 0x%p\n", pcch_hdr); - dprintk("probe: PCCH header is at physical address: 0x%llx," + pr_debug("probe: PCCH header (virtual) addr: 0x%p\n", pcch_hdr); + pr_debug("probe: PCCH header is at physical address: 0x%llx," " signature: 0x%x, length: %d bytes, major: %d, minor: %d," " supported features: 0x%x, command field: 0x%x," " status field: 0x%x, nominal latency: %d us\n", @@ -469,7 +466,7 @@ static int __init pcc_cpufreq_probe(void) ioread16(&pcch_hdr->command), ioread16(&pcch_hdr->status), ioread32(&pcch_hdr->latency)); - dprintk("probe: min time between commands: %d us," + pr_debug("probe: min time between commands: %d us," " max time between commands: %d us," " nominal CPU frequency: %d MHz," " minimum CPU frequency: %d MHz," @@ -494,7 +491,7 @@ static int __init pcc_cpufreq_probe(void) doorbell.access_width = 64; doorbell.address = reg_resource->address; - dprintk("probe: doorbell: space_id is %d, bit_width is %d, " + pr_debug("probe: doorbell: space_id is %d, bit_width is %d, " "bit_offset is %d, access_width is %d, address is 0x%llx\n", doorbell.space_id, doorbell.bit_width, doorbell.bit_offset, doorbell.access_width, reg_resource->address); @@ -515,7 +512,7 @@ static int __init pcc_cpufreq_probe(void) doorbell_write = member->integer.value; - dprintk("probe: doorbell_preserve: 0x%llx," + pr_debug("probe: doorbell_preserve: 0x%llx," " doorbell_write: 0x%llx\n", doorbell_preserve, doorbell_write); @@ -550,7 +547,7 @@ static int pcc_cpufreq_cpu_init(struct cpufreq_policy *policy) result = pcc_get_offset(cpu); if (result) { - dprintk("init: PCCP evaluation failed\n"); + pr_debug("init: PCCP evaluation failed\n"); goto out; } @@ -561,12 +558,12 @@ static int pcc_cpufreq_cpu_init(struct cpufreq_policy *policy) policy->cur = pcc_get_freq(cpu); if (!policy->cur) { - dprintk("init: Unable to get current CPU frequency\n"); + pr_debug("init: Unable to get current CPU frequency\n"); result = -EINVAL; goto out; } - dprintk("init: policy->max is %d, policy->min is %d\n", + pr_debug("init: policy->max is %d, policy->min is %d\n", policy->max, policy->min); out: return result; @@ -597,7 +594,7 @@ static int __init pcc_cpufreq_init(void) ret = pcc_cpufreq_probe(); if (ret) { - dprintk("pcc_cpufreq_init: PCCH evaluation failed\n"); + pr_debug("pcc_cpufreq_init: PCCH evaluation failed\n"); return ret; } diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c index 4a45fd6..d71d9f3 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c @@ -68,7 +68,6 @@ union powernow_acpi_control_t { }; #endif -#ifdef CONFIG_CPU_FREQ_DEBUG /* divide by 1000 to get VCore voltage in V. */ static const int mobile_vid_table[32] = { 2000, 1950, 1900, 1850, 1800, 1750, 1700, 1650, @@ -76,7 +75,6 @@ static const int mobile_vid_table[32] = { 1275, 1250, 1225, 1200, 1175, 1150, 1125, 1100, 1075, 1050, 1025, 1000, 975, 950, 925, 0, }; -#endif /* divide by 10 to get FID. */ static const int fid_codes[32] = { @@ -103,9 +101,6 @@ static unsigned int fsb; static unsigned int latency; static char have_a0; -#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \ - "powernow-k7", msg) - static int check_fsb(unsigned int fsbspeed) { int delta; @@ -209,7 +204,7 @@ static int get_ranges(unsigned char *pst) vid = *pst++; powernow_table[j].index |= (vid << 8); /* upper 8 bits */ - dprintk(" FID: 0x%x (%d.%dx [%dMHz]) " + pr_debug(" FID: 0x%x (%d.%dx [%dMHz]) " "VID: 0x%x (%d.%03dV)\n", fid, fid_codes[fid] / 10, fid_codes[fid] % 10, speed/1000, vid, mobile_vid_table[vid]/1000, @@ -367,7 +362,7 @@ static int powernow_acpi_init(void) unsigned int speed, speed_mhz; pc.val = (unsigned long) state->control; - dprintk("acpi: P%d: %d MHz %d mW %d uS control %08x SGTC %d\n", + pr_debug("acpi: P%d: %d MHz %d mW %d uS control %08x SGTC %d\n", i, (u32) state->core_frequency, (u32) state->power, @@ -401,7 +396,7 @@ static int powernow_acpi_init(void) invalidate_entry(i); } - dprintk(" FID: 0x%x (%d.%dx [%dMHz]) " + pr_debug(" FID: 0x%x (%d.%dx [%dMHz]) " "VID: 0x%x (%d.%03dV)\n", fid, fid_codes[fid] / 10, fid_codes[fid] % 10, speed_mhz, vid, mobile_vid_table[vid]/1000, @@ -409,7 +404,7 @@ static int powernow_acpi_init(void) if (state->core_frequency != speed_mhz) { state->core_frequency = speed_mhz; - dprintk(" Corrected ACPI frequency to %d\n", + pr_debug(" Corrected ACPI frequency to %d\n", speed_mhz); } @@ -453,8 +448,8 @@ static int powernow_acpi_init(void) static void print_pst_entry(struct pst_s *pst, unsigned int j) { - dprintk("PST:%d (@%p)\n", j, pst); - dprintk(" cpuid: 0x%x fsb: %d maxFID: 0x%x startvid: 0x%x\n", + pr_debug("PST:%d (@%p)\n", j, pst); + pr_debug(" cpuid: 0x%x fsb: %d maxFID: 0x%x startvid: 0x%x\n", pst->cpuid, pst->fsbspeed, pst->maxfid, pst->startvid); } @@ -474,20 +469,20 @@ static int powernow_decode_bios(int maxfid, int startvid) p = phys_to_virt(i); if (memcmp(p, "AMDK7PNOW!", 10) == 0) { - dprintk("Found PSB header at %p\n", p); + pr_debug("Found PSB header at %p\n", p); psb = (struct psb_s *) p; - dprintk("Table version: 0x%x\n", psb->tableversion); + pr_debug("Table version: 0x%x\n", psb->tableversion); if (psb->tableversion != 0x12) { printk(KERN_INFO PFX "Sorry, only v1.2 tables" " supported right now\n"); return -ENODEV; } - dprintk("Flags: 0x%x\n", psb->flags); + pr_debug("Flags: 0x%x\n", psb->flags); if ((psb->flags & 1) == 0) - dprintk("Mobile voltage regulator\n"); + pr_debug("Mobile voltage regulator\n"); else - dprintk("Desktop voltage regulator\n"); + pr_debug("Desktop voltage regulator\n"); latency = psb->settlingtime; if (latency < 100) { @@ -497,9 +492,9 @@ static int powernow_decode_bios(int maxfid, int startvid) "Correcting.\n", latency); latency = 100; } - dprintk("Settling Time: %d microseconds.\n", + pr_debug("Settling Time: %d microseconds.\n", psb->settlingtime); - dprintk("Has %d PST tables. (Only dumping ones " + pr_debug("Has %d PST tables. (Only dumping ones " "relevant to this CPU).\n", psb->numpst); @@ -650,7 +645,7 @@ static int __cpuinit powernow_cpu_init(struct cpufreq_policy *policy) printk(KERN_WARNING PFX "can not determine bus frequency\n"); return -EINVAL; } - dprintk("FSB: %3dMHz\n", fsb/1000); + pr_debug("FSB: %3dMHz\n", fsb/1000); if (dmi_check_system(powernow_dmi_table) || acpi_force) { printk(KERN_INFO PFX "PSB/PST known to be broken. " diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index 2368e38..83479b6 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -139,7 +139,7 @@ static int query_current_values_with_pending_wait(struct powernow_k8_data *data) } do { if (i++ > 10000) { - dprintk("detected change pending stuck\n"); + pr_debug("detected change pending stuck\n"); return 1; } rdmsr(MSR_FIDVID_STATUS, lo, hi); @@ -176,7 +176,7 @@ static void fidvid_msr_init(void) fid = lo & MSR_S_LO_CURRENT_FID; lo = fid | (vid << MSR_C_LO_VID_SHIFT); hi = MSR_C_HI_STP_GNT_BENIGN; - dprintk("cpu%d, init lo 0x%x, hi 0x%x\n", smp_processor_id(), lo, hi); + pr_debug("cpu%d, init lo 0x%x, hi 0x%x\n", smp_processor_id(), lo, hi); wrmsr(MSR_FIDVID_CTL, lo, hi); } @@ -196,7 +196,7 @@ static int write_new_fid(struct powernow_k8_data *data, u32 fid) lo |= (data->currvid << MSR_C_LO_VID_SHIFT); lo |= MSR_C_LO_INIT_FID_VID; - dprintk("writing fid 0x%x, lo 0x%x, hi 0x%x\n", + pr_debug("writing fid 0x%x, lo 0x%x, hi 0x%x\n", fid, lo, data->plllock * PLL_LOCK_CONVERSION); do { @@ -244,7 +244,7 @@ static int write_new_vid(struct powernow_k8_data *data, u32 vid) lo |= (vid << MSR_C_LO_VID_SHIFT); lo |= MSR_C_LO_INIT_FID_VID; - dprintk("writing vid 0x%x, lo 0x%x, hi 0x%x\n", + pr_debug("writing vid 0x%x, lo 0x%x, hi 0x%x\n", vid, lo, STOP_GRANT_5NS); do { @@ -325,7 +325,7 @@ static int transition_fid_vid(struct powernow_k8_data *data, return 1; } - dprintk("transitioned (cpu%d): new fid 0x%x, vid 0x%x\n", + pr_debug("transitioned (cpu%d): new fid 0x%x, vid 0x%x\n", smp_processor_id(), data->currfid, data->currvid); return 0; @@ -339,7 +339,7 @@ static int core_voltage_pre_transition(struct powernow_k8_data *data, u32 savefid = data->currfid; u32 maxvid, lo, rvomult = 1; - dprintk("ph1 (cpu%d): start, currfid 0x%x, currvid 0x%x, " + pr_debug("ph1 (cpu%d): start, currfid 0x%x, currvid 0x%x, " "reqvid 0x%x, rvo 0x%x\n", smp_processor_id(), data->currfid, data->currvid, reqvid, data->rvo); @@ -349,12 +349,12 @@ static int core_voltage_pre_transition(struct powernow_k8_data *data, rvosteps *= rvomult; rdmsr(MSR_FIDVID_STATUS, lo, maxvid); maxvid = 0x1f & (maxvid >> 16); - dprintk("ph1 maxvid=0x%x\n", maxvid); + pr_debug("ph1 maxvid=0x%x\n", maxvid); if (reqvid < maxvid) /* lower numbers are higher voltages */ reqvid = maxvid; while (data->currvid > reqvid) { - dprintk("ph1: curr 0x%x, req vid 0x%x\n", + pr_debug("ph1: curr 0x%x, req vid 0x%x\n", data->currvid, reqvid); if (decrease_vid_code_by_step(data, reqvid, data->vidmvs)) return 1; @@ -365,7 +365,7 @@ static int core_voltage_pre_transition(struct powernow_k8_data *data, if (data->currvid == maxvid) { rvosteps = 0; } else { - dprintk("ph1: changing vid for rvo, req 0x%x\n", + pr_debug("ph1: changing vid for rvo, req 0x%x\n", data->currvid - 1); if (decrease_vid_code_by_step(data, data->currvid-1, 1)) return 1; @@ -382,7 +382,7 @@ static int core_voltage_pre_transition(struct powernow_k8_data *data, return 1; } - dprintk("ph1 complete, currfid 0x%x, currvid 0x%x\n", + pr_debug("ph1 complete, currfid 0x%x, currvid 0x%x\n", data->currfid, data->currvid); return 0; @@ -400,7 +400,7 @@ static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid) return 0; } - dprintk("ph2 (cpu%d): starting, currfid 0x%x, currvid 0x%x, " + pr_debug("ph2 (cpu%d): starting, currfid 0x%x, currvid 0x%x, " "reqfid 0x%x\n", smp_processor_id(), data->currfid, data->currvid, reqfid); @@ -457,7 +457,7 @@ static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid) return 1; } - dprintk("ph2 complete, currfid 0x%x, currvid 0x%x\n", + pr_debug("ph2 complete, currfid 0x%x, currvid 0x%x\n", data->currfid, data->currvid); return 0; @@ -470,7 +470,7 @@ static int core_voltage_post_transition(struct powernow_k8_data *data, u32 savefid = data->currfid; u32 savereqvid = reqvid; - dprintk("ph3 (cpu%d): starting, currfid 0x%x, currvid 0x%x\n", + pr_debug("ph3 (cpu%d): starting, currfid 0x%x, currvid 0x%x\n", smp_processor_id(), data->currfid, data->currvid); @@ -498,17 +498,17 @@ static int core_voltage_post_transition(struct powernow_k8_data *data, return 1; if (savereqvid != data->currvid) { - dprintk("ph3 failed, currvid 0x%x\n", data->currvid); + pr_debug("ph3 failed, currvid 0x%x\n", data->currvid); return 1; } if (savefid != data->currfid) { - dprintk("ph3 failed, currfid changed 0x%x\n", + pr_debug("ph3 failed, currfid changed 0x%x\n", data->currfid); return 1; } - dprintk("ph3 complete, currfid 0x%x, currvid 0x%x\n", + pr_debug("ph3 complete, currfid 0x%x, currvid 0x%x\n", data->currfid, data->currvid); return 0; @@ -707,7 +707,7 @@ static int fill_powernow_table(struct powernow_k8_data *data, return -EIO; } - dprintk("cfid 0x%x, cvid 0x%x\n", data->currfid, data->currvid); + pr_debug("cfid 0x%x, cvid 0x%x\n", data->currfid, data->currvid); data->powernow_table = powernow_table; if (cpumask_first(cpu_core_mask(data->cpu)) == data->cpu) print_basics(data); @@ -717,7 +717,7 @@ static int fill_powernow_table(struct powernow_k8_data *data, (pst[j].vid == data->currvid)) return 0; - dprintk("currfid/vid do not match PST, ignoring\n"); + pr_debug("currfid/vid do not match PST, ignoring\n"); return 0; } @@ -739,36 +739,36 @@ static int find_psb_table(struct powernow_k8_data *data) if (memcmp(psb, PSB_ID_STRING, PSB_ID_STRING_LEN) != 0) continue; - dprintk("found PSB header at 0x%p\n", psb); + pr_debug("found PSB header at 0x%p\n", psb); - dprintk("table vers: 0x%x\n", psb->tableversion); + pr_debug("table vers: 0x%x\n", psb->tableversion); if (psb->tableversion != PSB_VERSION_1_4) { printk(KERN_ERR FW_BUG PFX "PSB table is not v1.4\n"); return -ENODEV; } - dprintk("flags: 0x%x\n", psb->flags1); + pr_debug("flags: 0x%x\n", psb->flags1); if (psb->flags1) { printk(KERN_ERR FW_BUG PFX "unknown flags\n"); return -ENODEV; } data->vstable = psb->vstable; - dprintk("voltage stabilization time: %d(*20us)\n", + pr_debug("voltage stabilization time: %d(*20us)\n", data->vstable); - dprintk("flags2: 0x%x\n", psb->flags2); + pr_debug("flags2: 0x%x\n", psb->flags2); data->rvo = psb->flags2 & 3; data->irt = ((psb->flags2) >> 2) & 3; mvs = ((psb->flags2) >> 4) & 3; data->vidmvs = 1 << mvs; data->batps = ((psb->flags2) >> 6) & 3; - dprintk("ramp voltage offset: %d\n", data->rvo); - dprintk("isochronous relief time: %d\n", data->irt); - dprintk("maximum voltage step: %d - 0x%x\n", mvs, data->vidmvs); + pr_debug("ramp voltage offset: %d\n", data->rvo); + pr_debug("isochronous relief time: %d\n", data->irt); + pr_debug("maximum voltage step: %d - 0x%x\n", mvs, data->vidmvs); - dprintk("numpst: 0x%x\n", psb->num_tables); + pr_debug("numpst: 0x%x\n", psb->num_tables); cpst = psb->num_tables; if ((psb->cpuid == 0x00000fc0) || (psb->cpuid == 0x00000fe0)) { @@ -783,13 +783,13 @@ static int find_psb_table(struct powernow_k8_data *data) } data->plllock = psb->plllocktime; - dprintk("plllocktime: 0x%x (units 1us)\n", psb->plllocktime); - dprintk("maxfid: 0x%x\n", psb->maxfid); - dprintk("maxvid: 0x%x\n", psb->maxvid); + pr_debug("plllocktime: 0x%x (units 1us)\n", psb->plllocktime); + pr_debug("maxfid: 0x%x\n", psb->maxfid); + pr_debug("maxvid: 0x%x\n", psb->maxvid); maxvid = psb->maxvid; data->numps = psb->numps; - dprintk("numpstates: 0x%x\n", data->numps); + pr_debug("numpstates: 0x%x\n", data->numps); return fill_powernow_table(data, (struct pst_s *)(psb+1), maxvid); } @@ -834,13 +834,13 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) u64 control, status; if (acpi_processor_register_performance(&data->acpi_data, data->cpu)) { - dprintk("register performance failed: bad ACPI data\n"); + pr_debug("register performance failed: bad ACPI data\n"); return -EIO; } /* verify the data contained in the ACPI structures */ if (data->acpi_data.state_count <= 1) { - dprintk("No ACPI P-States\n"); + pr_debug("No ACPI P-States\n"); goto err_out; } @@ -849,7 +849,7 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) if ((control != ACPI_ADR_SPACE_FIXED_HARDWARE) || (status != ACPI_ADR_SPACE_FIXED_HARDWARE)) { - dprintk("Invalid control/status registers (%x - %x)\n", + pr_debug("Invalid control/status registers (%llx - %llx)\n", control, status); goto err_out; } @@ -858,7 +858,7 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) powernow_table = kmalloc((sizeof(struct cpufreq_frequency_table) * (data->acpi_data.state_count + 1)), GFP_KERNEL); if (!powernow_table) { - dprintk("powernow_table memory alloc failure\n"); + pr_debug("powernow_table memory alloc failure\n"); goto err_out; } @@ -928,7 +928,7 @@ static int fill_powernow_table_pstate(struct powernow_k8_data *data, } rdmsr(MSR_PSTATE_DEF_BASE + index, lo, hi); if (!(hi & HW_PSTATE_VALID_MASK)) { - dprintk("invalid pstate %d, ignoring\n", index); + pr_debug("invalid pstate %d, ignoring\n", index); invalidate_entry(powernow_table, i); continue; } @@ -968,7 +968,7 @@ static int fill_powernow_table_fidvid(struct powernow_k8_data *data, vid = (control >> VID_SHIFT) & VID_MASK; } - dprintk(" %d : fid 0x%x, vid 0x%x\n", i, fid, vid); + pr_debug(" %d : fid 0x%x, vid 0x%x\n", i, fid, vid); index = fid | (vid<<8); powernow_table[i].index = index; @@ -978,7 +978,7 @@ static int fill_powernow_table_fidvid(struct powernow_k8_data *data, /* verify frequency is OK */ if ((freq > (MAX_FREQ * 1000)) || (freq < (MIN_FREQ * 1000))) { - dprintk("invalid freq %u kHz, ignoring\n", freq); + pr_debug("invalid freq %u kHz, ignoring\n", freq); invalidate_entry(powernow_table, i); continue; } @@ -986,7 +986,7 @@ static int fill_powernow_table_fidvid(struct powernow_k8_data *data, /* verify voltage is OK - * BIOSs are using "off" to indicate invalid */ if (vid == VID_OFF) { - dprintk("invalid vid %u, ignoring\n", vid); + pr_debug("invalid vid %u, ignoring\n", vid); invalidate_entry(powernow_table, i); continue; } @@ -1047,7 +1047,7 @@ static int transition_frequency_fidvid(struct powernow_k8_data *data, int res, i; struct cpufreq_freqs freqs; - dprintk("cpu %d transition to index %u\n", smp_processor_id(), index); + pr_debug("cpu %d transition to index %u\n", smp_processor_id(), index); /* fid/vid correctness check for k8 */ /* fid are the lower 8 bits of the index we stored into @@ -1057,18 +1057,18 @@ static int transition_frequency_fidvid(struct powernow_k8_data *data, fid = data->powernow_table[index].index & 0xFF; vid = (data->powernow_table[index].index & 0xFF00) >> 8; - dprintk("table matched fid 0x%x, giving vid 0x%x\n", fid, vid); + pr_debug("table matched fid 0x%x, giving vid 0x%x\n", fid, vid); if (query_current_values_with_pending_wait(data)) return 1; if ((data->currvid == vid) && (data->currfid == fid)) { - dprintk("target matches current values (fid 0x%x, vid 0x%x)\n", + pr_debug("target matches current values (fid 0x%x, vid 0x%x)\n", fid, vid); return 0; } - dprintk("cpu %d, changing to fid 0x%x, vid 0x%x\n", + pr_debug("cpu %d, changing to fid 0x%x, vid 0x%x\n", smp_processor_id(), fid, vid); freqs.old = find_khz_freq_from_fid(data->currfid); freqs.new = find_khz_freq_from_fid(fid); @@ -1096,7 +1096,7 @@ static int transition_frequency_pstate(struct powernow_k8_data *data, int res, i; struct cpufreq_freqs freqs; - dprintk("cpu %d transition to index %u\n", smp_processor_id(), index); + pr_debug("cpu %d transition to index %u\n", smp_processor_id(), index); /* get MSR index for hardware pstate transition */ pstate = index & HW_PSTATE_MASK; @@ -1156,14 +1156,14 @@ static int powernowk8_target(struct cpufreq_policy *pol, goto err_out; } - dprintk("targ: cpu %d, %d kHz, min %d, max %d, relation %d\n", + pr_debug("targ: cpu %d, %d kHz, min %d, max %d, relation %d\n", pol->cpu, targfreq, pol->min, pol->max, relation); if (query_current_values_with_pending_wait(data)) goto err_out; if (cpu_family != CPU_HW_PSTATE) { - dprintk("targ: curr fid 0x%x, vid 0x%x\n", + pr_debug("targ: curr fid 0x%x, vid 0x%x\n", data->currfid, data->currvid); if ((checkvid != data->currvid) || @@ -1319,7 +1319,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol) data->currpstate); else pol->cur = find_khz_freq_from_fid(data->currfid); - dprintk("policy current frequency %d kHz\n", pol->cur); + pr_debug("policy current frequency %d kHz\n", pol->cur); /* min/max the cpu is capable of */ if (cpufreq_frequency_table_cpuinfo(pol, data->powernow_table)) { @@ -1337,10 +1337,10 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol) cpufreq_frequency_table_get_attr(data->powernow_table, pol->cpu); if (cpu_family == CPU_HW_PSTATE) - dprintk("cpu_init done, current pstate 0x%x\n", + pr_debug("cpu_init done, current pstate 0x%x\n", data->currpstate); else - dprintk("cpu_init done, current fid 0x%x, vid 0x%x\n", + pr_debug("cpu_init done, current fid 0x%x, vid 0x%x\n", data->currfid, data->currvid); per_cpu(powernow_data, pol->cpu) = data; @@ -1586,7 +1586,7 @@ static int __cpuinit powernowk8_init(void) /* driver entry point for term */ static void __exit powernowk8_exit(void) { - dprintk("exit\n"); + pr_debug("exit\n"); if (boot_cpu_has(X86_FEATURE_CPB)) { msrs_free(msrs); diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h index df3529b..3744d26 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h @@ -211,8 +211,6 @@ struct pst_s { u8 vid; }; -#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "powernow-k8", msg) - static int core_voltage_pre_transition(struct powernow_k8_data *data, u32 reqvid, u32 regfid); static int core_voltage_post_transition(struct powernow_k8_data *data, u32 reqvid); diff --git a/arch/x86/kernel/cpu/cpufreq/sc520_freq.c b/arch/x86/kernel/cpu/cpufreq/sc520_freq.c index 435a996..1e205e6 100644 --- a/arch/x86/kernel/cpu/cpufreq/sc520_freq.c +++ b/arch/x86/kernel/cpu/cpufreq/sc520_freq.c @@ -29,8 +29,6 @@ static __u8 __iomem *cpuctl; -#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \ - "sc520_freq", msg) #define PFX "sc520_freq: " static struct cpufreq_frequency_table sc520_freq_table[] = { @@ -66,7 +64,7 @@ static void sc520_freq_set_cpu_state(unsigned int state) cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - dprintk("attempting to set frequency to %i kHz\n", + pr_debug("attempting to set frequency to %i kHz\n", sc520_freq_table[state].frequency); local_irq_disable(); @@ -161,7 +159,7 @@ static int __init sc520_freq_init(void) /* Test if we have the right hardware */ if (c->x86_vendor != X86_VENDOR_AMD || c->x86 != 4 || c->x86_model != 9) { - dprintk("no Elan SC520 processor found!\n"); + pr_debug("no Elan SC520 processor found!\n"); return -ENODEV; } cpuctl = ioremap((unsigned long)(MMCR_BASE + OFFS_CPUCTL), 1); diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c index 9b1ff37..6ea3455 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c @@ -29,9 +29,6 @@ #define PFX "speedstep-centrino: " #define MAINTAINER "cpufreq@vger.kernel.org" -#define dprintk(msg...) \ - cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "speedstep-centrino", msg) - #define INTEL_MSR_RANGE (0xffff) struct cpu_id @@ -244,7 +241,7 @@ static int centrino_cpu_init_table(struct cpufreq_policy *policy) if (model->cpu_id == NULL) { /* No match at all */ - dprintk("no support for CPU model \"%s\": " + pr_debug("no support for CPU model \"%s\": " "send /proc/cpuinfo to " MAINTAINER "\n", cpu->x86_model_id); return -ENOENT; @@ -252,15 +249,15 @@ static int centrino_cpu_init_table(struct cpufreq_policy *policy) if (model->op_points == NULL) { /* Matched a non-match */ - dprintk("no table support for CPU model \"%s\"\n", + pr_debug("no table support for CPU model \"%s\"\n", cpu->x86_model_id); - dprintk("try using the acpi-cpufreq driver\n"); + pr_debug("try using the acpi-cpufreq driver\n"); return -ENOENT; } per_cpu(centrino_model, policy->cpu) = model; - dprintk("found \"%s\": max frequency: %dkHz\n", + pr_debug("found \"%s\": max frequency: %dkHz\n", model->model_name, model->max_freq); return 0; @@ -369,7 +366,7 @@ static int centrino_cpu_init(struct cpufreq_policy *policy) per_cpu(centrino_cpu, policy->cpu) = &cpu_ids[i]; if (!per_cpu(centrino_cpu, policy->cpu)) { - dprintk("found unsupported CPU with " + pr_debug("found unsupported CPU with " "Enhanced SpeedStep: send /proc/cpuinfo to " MAINTAINER "\n"); return -ENODEV; @@ -385,7 +382,7 @@ static int centrino_cpu_init(struct cpufreq_policy *policy) if (!(l & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) { l |= MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP; - dprintk("trying to enable Enhanced SpeedStep (%x)\n", l); + pr_debug("trying to enable Enhanced SpeedStep (%x)\n", l); wrmsr(MSR_IA32_MISC_ENABLE, l, h); /* check to see if it stuck */ @@ -402,7 +399,7 @@ static int centrino_cpu_init(struct cpufreq_policy *policy) /* 10uS transition latency */ policy->cur = freq; - dprintk("centrino_cpu_init: cur=%dkHz\n", policy->cur); + pr_debug("centrino_cpu_init: cur=%dkHz\n", policy->cur); ret = cpufreq_frequency_table_cpuinfo(policy, per_cpu(centrino_model, policy->cpu)->op_points); @@ -498,7 +495,7 @@ static int centrino_target (struct cpufreq_policy *policy, good_cpu = j; if (good_cpu >= nr_cpu_ids) { - dprintk("couldn't limit to CPUs in this domain\n"); + pr_debug("couldn't limit to CPUs in this domain\n"); retval = -EAGAIN; if (first_cpu) { /* We haven't started the transition yet. */ @@ -512,7 +509,7 @@ static int centrino_target (struct cpufreq_policy *policy, if (first_cpu) { rdmsr_on_cpu(good_cpu, MSR_IA32_PERF_CTL, &oldmsr, &h); if (msr == (oldmsr & 0xffff)) { - dprintk("no change needed - msr was and needs " + pr_debug("no change needed - msr was and needs " "to be %x\n", oldmsr); retval = 0; goto out; @@ -521,7 +518,7 @@ static int centrino_target (struct cpufreq_policy *policy, freqs.old = extract_clock(oldmsr, cpu, 0); freqs.new = extract_clock(msr, cpu, 0); - dprintk("target=%dkHz old=%d new=%d msr=%04x\n", + pr_debug("target=%dkHz old=%d new=%d msr=%04x\n", target_freq, freqs.old, freqs.new, msr); for_each_cpu(k, policy->cpus) { diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c index 561758e..a748ce7 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c @@ -53,10 +53,6 @@ static struct cpufreq_frequency_table speedstep_freqs[] = { }; -#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \ - "speedstep-ich", msg) - - /** * speedstep_find_register - read the PMBASE address * @@ -80,7 +76,7 @@ static int speedstep_find_register(void) return -ENODEV; } - dprintk("pmbase is 0x%x\n", pmbase); + pr_debug("pmbase is 0x%x\n", pmbase); return 0; } @@ -106,13 +102,13 @@ static void speedstep_set_state(unsigned int state) /* read state */ value = inb(pmbase + 0x50); - dprintk("read at pmbase 0x%x + 0x50 returned 0x%x\n", pmbase, value); + pr_debug("read at pmbase 0x%x + 0x50 returned 0x%x\n", pmbase, value); /* write new state */ value &= 0xFE; value |= state; - dprintk("writing 0x%x to pmbase 0x%x + 0x50\n", value, pmbase); + pr_debug("writing 0x%x to pmbase 0x%x + 0x50\n", value, pmbase); /* Disable bus master arbitration */ pm2_blk = inb(pmbase + 0x20); @@ -132,10 +128,10 @@ static void speedstep_set_state(unsigned int state) /* Enable IRQs */ local_irq_restore(flags); - dprintk("read at pmbase 0x%x + 0x50 returned 0x%x\n", pmbase, value); + pr_debug("read at pmbase 0x%x + 0x50 returned 0x%x\n", pmbase, value); if (state == (value & 0x1)) - dprintk("change to %u MHz succeeded\n", + pr_debug("change to %u MHz succeeded\n", speedstep_get_frequency(speedstep_processor) / 1000); else printk(KERN_ERR "cpufreq: change failed - I/O error\n"); @@ -165,7 +161,7 @@ static int speedstep_activate(void) pci_read_config_word(speedstep_chipset_dev, 0x00A0, &value); if (!(value & 0x08)) { value |= 0x08; - dprintk("activating SpeedStep (TM) registers\n"); + pr_debug("activating SpeedStep (TM) registers\n"); pci_write_config_word(speedstep_chipset_dev, 0x00A0, value); } @@ -218,7 +214,7 @@ static unsigned int speedstep_detect_chipset(void) return 2; /* 2-M */ if (hostbridge->revision < 5) { - dprintk("hostbridge does not support speedstep\n"); + pr_debug("hostbridge does not support speedstep\n"); speedstep_chipset_dev = NULL; pci_dev_put(hostbridge); return 0; @@ -246,7 +242,7 @@ static unsigned int speedstep_get(unsigned int cpu) if (smp_call_function_single(cpu, get_freq_data, &speed, 1) != 0) BUG(); - dprintk("detected %u kHz as current frequency\n", speed); + pr_debug("detected %u kHz as current frequency\n", speed); return speed; } @@ -276,7 +272,7 @@ static int speedstep_target(struct cpufreq_policy *policy, freqs.new = speedstep_freqs[newstate].frequency; freqs.cpu = policy->cpu; - dprintk("transiting from %u to %u kHz\n", freqs.old, freqs.new); + pr_debug("transiting from %u to %u kHz\n", freqs.old, freqs.new); /* no transition necessary */ if (freqs.old == freqs.new) @@ -351,7 +347,7 @@ static int speedstep_cpu_init(struct cpufreq_policy *policy) if (!speed) return -EIO; - dprintk("currently at %s speed setting - %i MHz\n", + pr_debug("currently at %s speed setting - %i MHz\n", (speed == speedstep_freqs[SPEEDSTEP_LOW].frequency) ? "low" : "high", (speed / 1000)); @@ -405,14 +401,14 @@ static int __init speedstep_init(void) /* detect processor */ speedstep_processor = speedstep_detect_processor(); if (!speedstep_processor) { - dprintk("Intel(R) SpeedStep(TM) capable processor " + pr_debug("Intel(R) SpeedStep(TM) capable processor " "not found\n"); return -ENODEV; } /* detect chipset */ if (!speedstep_detect_chipset()) { - dprintk("Intel(R) SpeedStep(TM) for this chipset not " + pr_debug("Intel(R) SpeedStep(TM) for this chipset not " "(yet) available.\n"); return -ENODEV; } diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c index a94ec6b..8af2d2f 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c @@ -18,9 +18,6 @@ #include #include "speedstep-lib.h" -#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \ - "speedstep-lib", msg) - #define PFX "speedstep-lib: " #ifdef CONFIG_X86_SPEEDSTEP_RELAXED_CAP_CHECK @@ -75,7 +72,7 @@ static unsigned int pentium3_get_frequency(enum speedstep_processor processor) /* read MSR 0x2a - we only need the low 32 bits */ rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_tmp); - dprintk("P3 - MSR_IA32_EBL_CR_POWERON: 0x%x 0x%x\n", msr_lo, msr_tmp); + pr_debug("P3 - MSR_IA32_EBL_CR_POWERON: 0x%x 0x%x\n", msr_lo, msr_tmp); msr_tmp = msr_lo; /* decode the FSB */ @@ -89,7 +86,7 @@ static unsigned int pentium3_get_frequency(enum speedstep_processor processor) /* decode the multiplier */ if (processor == SPEEDSTEP_CPU_PIII_C_EARLY) { - dprintk("workaround for early PIIIs\n"); + pr_debug("workaround for early PIIIs\n"); msr_lo &= 0x03c00000; } else msr_lo &= 0x0bc00000; @@ -100,7 +97,7 @@ static unsigned int pentium3_get_frequency(enum speedstep_processor processor) j++; } - dprintk("speed is %u\n", + pr_debug("speed is %u\n", (msr_decode_mult[j].ratio * msr_decode_fsb[i].value * 100)); return msr_decode_mult[j].ratio * msr_decode_fsb[i].value * 100; @@ -112,7 +109,7 @@ static unsigned int pentiumM_get_frequency(void) u32 msr_lo, msr_tmp; rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_tmp); - dprintk("PM - MSR_IA32_EBL_CR_POWERON: 0x%x 0x%x\n", msr_lo, msr_tmp); + pr_debug("PM - MSR_IA32_EBL_CR_POWERON: 0x%x 0x%x\n", msr_lo, msr_tmp); /* see table B-2 of 24547212.pdf */ if (msr_lo & 0x00040000) { @@ -122,7 +119,7 @@ static unsigned int pentiumM_get_frequency(void) } msr_tmp = (msr_lo >> 22) & 0x1f; - dprintk("bits 22-26 are 0x%x, speed is %u\n", + pr_debug("bits 22-26 are 0x%x, speed is %u\n", msr_tmp, (msr_tmp * 100 * 1000)); return msr_tmp * 100 * 1000; @@ -160,11 +157,11 @@ static unsigned int pentium_core_get_frequency(void) } rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_tmp); - dprintk("PCORE - MSR_IA32_EBL_CR_POWERON: 0x%x 0x%x\n", + pr_debug("PCORE - MSR_IA32_EBL_CR_POWERON: 0x%x 0x%x\n", msr_lo, msr_tmp); msr_tmp = (msr_lo >> 22) & 0x1f; - dprintk("bits 22-26 are 0x%x, speed is %u\n", + pr_debug("bits 22-26 are 0x%x, speed is %u\n", msr_tmp, (msr_tmp * fsb)); ret = (msr_tmp * fsb); @@ -190,7 +187,7 @@ static unsigned int pentium4_get_frequency(void) rdmsr(0x2c, msr_lo, msr_hi); - dprintk("P4 - MSR_EBC_FREQUENCY_ID: 0x%x 0x%x\n", msr_lo, msr_hi); + pr_debug("P4 - MSR_EBC_FREQUENCY_ID: 0x%x 0x%x\n", msr_lo, msr_hi); /* decode the FSB: see IA-32 Intel (C) Architecture Software * Developer's Manual, Volume 3: System Prgramming Guide, @@ -217,7 +214,7 @@ static unsigned int pentium4_get_frequency(void) /* Multiplier. */ mult = msr_lo >> 24; - dprintk("P4 - FSB %u kHz; Multiplier %u; Speed %u kHz\n", + pr_debug("P4 - FSB %u kHz; Multiplier %u; Speed %u kHz\n", fsb, mult, (fsb * mult)); ret = (fsb * mult); @@ -257,7 +254,7 @@ unsigned int speedstep_detect_processor(void) struct cpuinfo_x86 *c = &cpu_data(0); u32 ebx, msr_lo, msr_hi; - dprintk("x86: %x, model: %x\n", c->x86, c->x86_model); + pr_debug("x86: %x, model: %x\n", c->x86, c->x86_model); if ((c->x86_vendor != X86_VENDOR_INTEL) || ((c->x86 != 6) && (c->x86 != 0xF))) @@ -272,7 +269,7 @@ unsigned int speedstep_detect_processor(void) ebx = cpuid_ebx(0x00000001); ebx &= 0x000000FF; - dprintk("ebx value is %x, x86_mask is %x\n", ebx, c->x86_mask); + pr_debug("ebx value is %x, x86_mask is %x\n", ebx, c->x86_mask); switch (c->x86_mask) { case 4: @@ -327,7 +324,7 @@ unsigned int speedstep_detect_processor(void) /* cpuid_ebx(1) is 0x04 for desktop PIII, * 0x06 for mobile PIII-M */ ebx = cpuid_ebx(0x00000001); - dprintk("ebx is %x\n", ebx); + pr_debug("ebx is %x\n", ebx); ebx &= 0x000000FF; @@ -344,7 +341,7 @@ unsigned int speedstep_detect_processor(void) /* all mobile PIII Coppermines have FSB 100 MHz * ==> sort out a few desktop PIIIs. */ rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_hi); - dprintk("Coppermine: MSR_IA32_EBL_CR_POWERON is 0x%x, 0x%x\n", + pr_debug("Coppermine: MSR_IA32_EBL_CR_POWERON is 0x%x, 0x%x\n", msr_lo, msr_hi); msr_lo &= 0x00c0000; if (msr_lo != 0x0080000) @@ -357,12 +354,12 @@ unsigned int speedstep_detect_processor(void) * bit 56 or 57 is set */ rdmsr(MSR_IA32_PLATFORM_ID, msr_lo, msr_hi); - dprintk("Coppermine: MSR_IA32_PLATFORM ID is 0x%x, 0x%x\n", + pr_debug("Coppermine: MSR_IA32_PLATFORM ID is 0x%x, 0x%x\n", msr_lo, msr_hi); if ((msr_hi & (1<<18)) && (relaxed_check ? 1 : (msr_hi & (3<<24)))) { if (c->x86_mask == 0x01) { - dprintk("early PIII version\n"); + pr_debug("early PIII version\n"); return SPEEDSTEP_CPU_PIII_C_EARLY; } else return SPEEDSTEP_CPU_PIII_C; @@ -393,14 +390,14 @@ unsigned int speedstep_get_freqs(enum speedstep_processor processor, if ((!processor) || (!low_speed) || (!high_speed) || (!set_state)) return -EINVAL; - dprintk("trying to determine both speeds\n"); + pr_debug("trying to determine both speeds\n"); /* get current speed */ prev_speed = speedstep_get_frequency(processor); if (!prev_speed) return -EIO; - dprintk("previous speed is %u\n", prev_speed); + pr_debug("previous speed is %u\n", prev_speed); local_irq_save(flags); @@ -412,7 +409,7 @@ unsigned int speedstep_get_freqs(enum speedstep_processor processor, goto out; } - dprintk("low speed is %u\n", *low_speed); + pr_debug("low speed is %u\n", *low_speed); /* start latency measurement */ if (transition_latency) @@ -431,7 +428,7 @@ unsigned int speedstep_get_freqs(enum speedstep_processor processor, goto out; } - dprintk("high speed is %u\n", *high_speed); + pr_debug("high speed is %u\n", *high_speed); if (*low_speed == *high_speed) { ret = -ENODEV; @@ -445,7 +442,7 @@ unsigned int speedstep_get_freqs(enum speedstep_processor processor, if (transition_latency) { *transition_latency = (tv2.tv_sec - tv1.tv_sec) * USEC_PER_SEC + tv2.tv_usec - tv1.tv_usec; - dprintk("transition latency is %u uSec\n", *transition_latency); + pr_debug("transition latency is %u uSec\n", *transition_latency); /* convert uSec to nSec and add 20% for safety reasons */ *transition_latency *= 1200; diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c b/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c index 91bc25b..c76ead3 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c @@ -55,9 +55,6 @@ static struct cpufreq_frequency_table speedstep_freqs[] = { * of DMA activity going on? */ #define SMI_TRIES 5 -#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \ - "speedstep-smi", msg) - /** * speedstep_smi_ownership */ @@ -70,7 +67,7 @@ static int speedstep_smi_ownership(void) command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff); magic = virt_to_phys(magic_data); - dprintk("trying to obtain ownership with command %x at port %x\n", + pr_debug("trying to obtain ownership with command %x at port %x\n", command, smi_port); __asm__ __volatile__( @@ -85,7 +82,7 @@ static int speedstep_smi_ownership(void) : "memory" ); - dprintk("result is %x\n", result); + pr_debug("result is %x\n", result); return result; } @@ -106,13 +103,13 @@ static int speedstep_smi_get_freqs(unsigned int *low, unsigned int *high) u32 function = GET_SPEEDSTEP_FREQS; if (!(ist_info.event & 0xFFFF)) { - dprintk("bug #1422 -- can't read freqs from BIOS\n"); + pr_debug("bug #1422 -- can't read freqs from BIOS\n"); return -ENODEV; } command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff); - dprintk("trying to determine frequencies with command %x at port %x\n", + pr_debug("trying to determine frequencies with command %x at port %x\n", command, smi_port); __asm__ __volatile__( @@ -129,7 +126,7 @@ static int speedstep_smi_get_freqs(unsigned int *low, unsigned int *high) "d" (smi_port), "S" (0), "D" (0) ); - dprintk("result %x, low_freq %u, high_freq %u\n", + pr_debug("result %x, low_freq %u, high_freq %u\n", result, low_mhz, high_mhz); /* abort if results are obviously incorrect... */ @@ -154,7 +151,7 @@ static int speedstep_get_state(void) command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff); - dprintk("trying to determine current setting with command %x " + pr_debug("trying to determine current setting with command %x " "at port %x\n", command, smi_port); __asm__ __volatile__( @@ -168,7 +165,7 @@ static int speedstep_get_state(void) "d" (smi_port), "S" (0), "D" (0) ); - dprintk("state is %x, result is %x\n", state, result); + pr_debug("state is %x, result is %x\n", state, result); return state & 1; } @@ -194,13 +191,13 @@ static void speedstep_set_state(unsigned int state) command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff); - dprintk("trying to set frequency to state %u " + pr_debug("trying to set frequency to state %u " "with command %x at port %x\n", state, command, smi_port); do { if (retry) { - dprintk("retry %u, previous result %u, waiting...\n", + pr_debug("retry %u, previous result %u, waiting...\n", retry, result); mdelay(retry * 50); } @@ -221,7 +218,7 @@ static void speedstep_set_state(unsigned int state) local_irq_restore(flags); if (new_state == state) - dprintk("change to %u MHz succeeded after %u tries " + pr_debug("change to %u MHz succeeded after %u tries " "with result %u\n", (speedstep_freqs[new_state].frequency / 1000), retry, result); @@ -292,7 +289,7 @@ static int speedstep_cpu_init(struct cpufreq_policy *policy) result = speedstep_smi_ownership(); if (result) { - dprintk("fails in acquiring ownership of a SMI interface.\n"); + pr_debug("fails in acquiring ownership of a SMI interface.\n"); return -EINVAL; } @@ -304,7 +301,7 @@ static int speedstep_cpu_init(struct cpufreq_policy *policy) if (result) { /* fall back to speedstep_lib.c dection mechanism: * try both states out */ - dprintk("could not detect low and high frequencies " + pr_debug("could not detect low and high frequencies " "by SMI call.\n"); result = speedstep_get_freqs(speedstep_processor, low, high, @@ -312,18 +309,18 @@ static int speedstep_cpu_init(struct cpufreq_policy *policy) &speedstep_set_state); if (result) { - dprintk("could not detect two different speeds" + pr_debug("could not detect two different speeds" " -- aborting.\n"); return result; } else - dprintk("workaround worked.\n"); + pr_debug("workaround worked.\n"); } /* get current speed setting */ state = speedstep_get_state(); speed = speedstep_freqs[state].frequency; - dprintk("currently at %s speed setting - %i MHz\n", + pr_debug("currently at %s speed setting - %i MHz\n", (speed == speedstep_freqs[SPEEDSTEP_LOW].frequency) ? "low" : "high", (speed / 1000)); @@ -360,7 +357,7 @@ static int speedstep_resume(struct cpufreq_policy *policy) int result = speedstep_smi_ownership(); if (result) - dprintk("fails in re-acquiring ownership of a SMI interface.\n"); + pr_debug("fails in re-acquiring ownership of a SMI interface.\n"); return result; } @@ -403,12 +400,12 @@ static int __init speedstep_init(void) } if (!speedstep_processor) { - dprintk("No supported Intel CPU detected.\n"); + pr_debug("No supported Intel CPU detected.\n"); return -ENODEV; } - dprintk("signature:0x%.8lx, command:0x%.8lx, " - "event:0x%.8lx, perf_level:0x%.8lx.\n", + pr_debug("signature:0x%.8ulx, command:0x%.8ulx, " + "event:0x%.8ulx, perf_level:0x%.8ulx.\n", ist_info.signature, ist_info.command, ist_info.event, ist_info.perf_level); diff --git a/drivers/acpi/processor_perflib.c b/drivers/acpi/processor_perflib.c index 3a73a93..85b3237 100644 --- a/drivers/acpi/processor_perflib.c +++ b/drivers/acpi/processor_perflib.c @@ -49,10 +49,6 @@ ACPI_MODULE_NAME("processor_perflib"); static DEFINE_MUTEX(performance_mutex); -/* Use cpufreq debug layer for _PPC changes. */ -#define cpufreq_printk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_CORE, \ - "cpufreq-core", msg) - /* * _PPC support is implemented as a CPUfreq policy notifier: * This means each time a CPUfreq driver registered also with @@ -145,7 +141,7 @@ static int acpi_processor_get_platform_limit(struct acpi_processor *pr) return -ENODEV; } - cpufreq_printk("CPU %d: _PPC is %d - frequency %s limited\n", pr->id, + pr_debug("CPU %d: _PPC is %d - frequency %s limited\n", pr->id, (int)ppc, ppc ? "" : "not"); pr->performance_platform_limit = (int)ppc; diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig index ca8ee80..b78baa54 100644 --- a/drivers/cpufreq/Kconfig +++ b/drivers/cpufreq/Kconfig @@ -18,19 +18,6 @@ if CPU_FREQ config CPU_FREQ_TABLE tristate -config CPU_FREQ_DEBUG - bool "Enable CPUfreq debugging" - help - Say Y here to enable CPUfreq subsystem (including drivers) - debugging. You will need to activate it via the kernel - command line by passing - cpufreq.debug= - - To get , add - 1 to activate CPUfreq core debugging, - 2 to activate CPUfreq drivers debugging, and - 4 to activate CPUfreq governor debugging - config CPU_FREQ_STAT tristate "CPU frequency translation statistics" select CPU_FREQ_TABLE diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 7c10f96..1e08af4 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -32,9 +32,6 @@ #include -#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_CORE, \ - "cpufreq-core", msg) - /** * The "cpufreq driver" - the arch- or hardware-dependent low * level driver of CPUFreq support, and its spinlock. This lock @@ -181,93 +178,6 @@ EXPORT_SYMBOL_GPL(cpufreq_cpu_put); /********************************************************************* - * UNIFIED DEBUG HELPERS * - *********************************************************************/ -#ifdef CONFIG_CPU_FREQ_DEBUG - -/* what part(s) of the CPUfreq subsystem are debugged? */ -static unsigned int debug; - -/* is the debug output ratelimit'ed using printk_ratelimit? User can - * set or modify this value. - */ -static unsigned int debug_ratelimit = 1; - -/* is the printk_ratelimit'ing enabled? It's enabled after a successful - * loading of a cpufreq driver, temporarily disabled when a new policy - * is set, and disabled upon cpufreq driver removal - */ -static unsigned int disable_ratelimit = 1; -static DEFINE_SPINLOCK(disable_ratelimit_lock); - -static void cpufreq_debug_enable_ratelimit(void) -{ - unsigned long flags; - - spin_lock_irqsave(&disable_ratelimit_lock, flags); - if (disable_ratelimit) - disable_ratelimit--; - spin_unlock_irqrestore(&disable_ratelimit_lock, flags); -} - -static void cpufreq_debug_disable_ratelimit(void) -{ - unsigned long flags; - - spin_lock_irqsave(&disable_ratelimit_lock, flags); - disable_ratelimit++; - spin_unlock_irqrestore(&disable_ratelimit_lock, flags); -} - -void cpufreq_debug_printk(unsigned int type, const char *prefix, - const char *fmt, ...) -{ - char s[256]; - va_list args; - unsigned int len; - unsigned long flags; - - WARN_ON(!prefix); - if (type & debug) { - spin_lock_irqsave(&disable_ratelimit_lock, flags); - if (!disable_ratelimit && debug_ratelimit - && !printk_ratelimit()) { - spin_unlock_irqrestore(&disable_ratelimit_lock, flags); - return; - } - spin_unlock_irqrestore(&disable_ratelimit_lock, flags); - - len = snprintf(s, 256, KERN_DEBUG "%s: ", prefix); - - va_start(args, fmt); - len += vsnprintf(&s[len], (256 - len), fmt, args); - va_end(args); - - printk(s); - - WARN_ON(len < 5); - } -} -EXPORT_SYMBOL(cpufreq_debug_printk); - - -module_param(debug, uint, 0644); -MODULE_PARM_DESC(debug, "CPUfreq debugging: add 1 to debug core," - " 2 to debug drivers, and 4 to debug governors."); - -module_param(debug_ratelimit, uint, 0644); -MODULE_PARM_DESC(debug_ratelimit, "CPUfreq debugging:" - " set to 0 to disable ratelimiting."); - -#else /* !CONFIG_CPU_FREQ_DEBUG */ - -static inline void cpufreq_debug_enable_ratelimit(void) { return; } -static inline void cpufreq_debug_disable_ratelimit(void) { return; } - -#endif /* CONFIG_CPU_FREQ_DEBUG */ - - -/********************************************************************* * EXTERNALLY AFFECTING FREQUENCY CHANGES * *********************************************************************/ @@ -291,7 +201,7 @@ static void adjust_jiffies(unsigned long val, struct cpufreq_freqs *ci) if (!l_p_j_ref_freq) { l_p_j_ref = loops_per_jiffy; l_p_j_ref_freq = ci->old; - dprintk("saving %lu as reference value for loops_per_jiffy; " + pr_debug("saving %lu as reference value for loops_per_jiffy; " "freq is %u kHz\n", l_p_j_ref, l_p_j_ref_freq); } if ((val == CPUFREQ_PRECHANGE && ci->old < ci->new) || @@ -299,7 +209,7 @@ static void adjust_jiffies(unsigned long val, struct cpufreq_freqs *ci) (val == CPUFREQ_RESUMECHANGE || val == CPUFREQ_SUSPENDCHANGE)) { loops_per_jiffy = cpufreq_scale(l_p_j_ref, l_p_j_ref_freq, ci->new); - dprintk("scaling loops_per_jiffy to %lu " + pr_debug("scaling loops_per_jiffy to %lu " "for frequency %u kHz\n", loops_per_jiffy, ci->new); } } @@ -326,7 +236,7 @@ void cpufreq_notify_transition(struct cpufreq_freqs *freqs, unsigned int state) BUG_ON(irqs_disabled()); freqs->flags = cpufreq_driver->flags; - dprintk("notification %u of frequency transition to %u kHz\n", + pr_debug("notification %u of frequency transition to %u kHz\n", state, freqs->new); policy = per_cpu(cpufreq_cpu_data, freqs->cpu); @@ -340,7 +250,7 @@ void cpufreq_notify_transition(struct cpufreq_freqs *freqs, unsigned int state) if (!(cpufreq_driver->flags & CPUFREQ_CONST_LOOPS)) { if ((policy) && (policy->cpu == freqs->cpu) && (policy->cur) && (policy->cur != freqs->old)) { - dprintk("Warning: CPU frequency is" + pr_debug("Warning: CPU frequency is" " %u, cpufreq assumed %u kHz.\n", freqs->old, policy->cur); freqs->old = policy->cur; @@ -353,7 +263,7 @@ void cpufreq_notify_transition(struct cpufreq_freqs *freqs, unsigned int state) case CPUFREQ_POSTCHANGE: adjust_jiffies(CPUFREQ_POSTCHANGE, freqs); - dprintk("FREQ: %lu - CPU: %lu", (unsigned long)freqs->new, + pr_debug("FREQ: %lu - CPU: %lu", (unsigned long)freqs->new, (unsigned long)freqs->cpu); trace_power_frequency(POWER_PSTATE, freqs->new, freqs->cpu); trace_cpu_frequency(freqs->new, freqs->cpu); @@ -753,7 +663,7 @@ no_policy: static void cpufreq_sysfs_release(struct kobject *kobj) { struct cpufreq_policy *policy = to_policy(kobj); - dprintk("last reference is dropped\n"); + pr_debug("last reference is dropped\n"); complete(&policy->kobj_unregister); } @@ -788,7 +698,7 @@ static int cpufreq_add_dev_policy(unsigned int cpu, gov = __find_governor(per_cpu(cpufreq_cpu_governor, cpu)); if (gov) { policy->governor = gov; - dprintk("Restoring governor %s for cpu %d\n", + pr_debug("Restoring governor %s for cpu %d\n", policy->governor->name, cpu); } #endif @@ -824,7 +734,7 @@ static int cpufreq_add_dev_policy(unsigned int cpu, per_cpu(cpufreq_cpu_data, cpu) = managed_policy; spin_unlock_irqrestore(&cpufreq_driver_lock, flags); - dprintk("CPU already managed, adding link\n"); + pr_debug("CPU already managed, adding link\n"); ret = sysfs_create_link(&sys_dev->kobj, &managed_policy->kobj, "cpufreq"); @@ -865,7 +775,7 @@ static int cpufreq_add_dev_symlink(unsigned int cpu, if (!cpu_online(j)) continue; - dprintk("CPU %u already managed, adding link\n", j); + pr_debug("CPU %u already managed, adding link\n", j); managed_policy = cpufreq_cpu_get(cpu); cpu_sys_dev = get_cpu_sysdev(j); ret = sysfs_create_link(&cpu_sys_dev->kobj, &policy->kobj, @@ -941,7 +851,7 @@ static int cpufreq_add_dev_interface(unsigned int cpu, policy->user_policy.governor = policy->governor; if (ret) { - dprintk("setting policy failed\n"); + pr_debug("setting policy failed\n"); if (cpufreq_driver->exit) cpufreq_driver->exit(policy); } @@ -977,8 +887,7 @@ static int cpufreq_add_dev(struct sys_device *sys_dev) if (cpu_is_offline(cpu)) return 0; - cpufreq_debug_disable_ratelimit(); - dprintk("adding CPU %u\n", cpu); + pr_debug("adding CPU %u\n", cpu); #ifdef CONFIG_SMP /* check whether a different CPU already registered this @@ -986,7 +895,6 @@ static int cpufreq_add_dev(struct sys_device *sys_dev) policy = cpufreq_cpu_get(cpu); if (unlikely(policy)) { cpufreq_cpu_put(policy); - cpufreq_debug_enable_ratelimit(); return 0; } #endif @@ -1037,7 +945,7 @@ static int cpufreq_add_dev(struct sys_device *sys_dev) */ ret = cpufreq_driver->init(policy); if (ret) { - dprintk("initialization failed\n"); + pr_debug("initialization failed\n"); goto err_unlock_policy; } policy->user_policy.min = policy->min; @@ -1063,8 +971,7 @@ static int cpufreq_add_dev(struct sys_device *sys_dev) kobject_uevent(&policy->kobj, KOBJ_ADD); module_put(cpufreq_driver->owner); - dprintk("initialization complete\n"); - cpufreq_debug_enable_ratelimit(); + pr_debug("initialization complete\n"); return 0; @@ -1088,7 +995,6 @@ err_free_policy: nomem_out: module_put(cpufreq_driver->owner); module_out: - cpufreq_debug_enable_ratelimit(); return ret; } @@ -1112,15 +1018,13 @@ static int __cpufreq_remove_dev(struct sys_device *sys_dev) unsigned int j; #endif - cpufreq_debug_disable_ratelimit(); - dprintk("unregistering CPU %u\n", cpu); + pr_debug("unregistering CPU %u\n", cpu); spin_lock_irqsave(&cpufreq_driver_lock, flags); data = per_cpu(cpufreq_cpu_data, cpu); if (!data) { spin_unlock_irqrestore(&cpufreq_driver_lock, flags); - cpufreq_debug_enable_ratelimit(); unlock_policy_rwsem_write(cpu); return -EINVAL; } @@ -1132,12 +1036,11 @@ static int __cpufreq_remove_dev(struct sys_device *sys_dev) * only need to unlink, put and exit */ if (unlikely(cpu != data->cpu)) { - dprintk("removing link\n"); + pr_debug("removing link\n"); cpumask_clear_cpu(cpu, data->cpus); spin_unlock_irqrestore(&cpufreq_driver_lock, flags); kobj = &sys_dev->kobj; cpufreq_cpu_put(data); - cpufreq_debug_enable_ratelimit(); unlock_policy_rwsem_write(cpu); sysfs_remove_link(kobj, "cpufreq"); return 0; @@ -1170,7 +1073,7 @@ static int __cpufreq_remove_dev(struct sys_device *sys_dev) for_each_cpu(j, data->cpus) { if (j == cpu) continue; - dprintk("removing link for cpu %u\n", j); + pr_debug("removing link for cpu %u\n", j); #ifdef CONFIG_HOTPLUG_CPU strncpy(per_cpu(cpufreq_cpu_governor, j), data->governor->name, CPUFREQ_NAME_LEN); @@ -1199,17 +1102,15 @@ static int __cpufreq_remove_dev(struct sys_device *sys_dev) * not referenced anymore by anybody before we proceed with * unloading. */ - dprintk("waiting for dropping of refcount\n"); + pr_debug("waiting for dropping of refcount\n"); wait_for_completion(cmp); - dprintk("wait complete\n"); + pr_debug("wait complete\n"); lock_policy_rwsem_write(cpu); if (cpufreq_driver->exit) cpufreq_driver->exit(data); unlock_policy_rwsem_write(cpu); - cpufreq_debug_enable_ratelimit(); - #ifdef CONFIG_HOTPLUG_CPU /* when the CPU which is the parent of the kobj is hotplugged * offline, check for siblings, and create cpufreq sysfs interface @@ -1255,7 +1156,7 @@ static void handle_update(struct work_struct *work) struct cpufreq_policy *policy = container_of(work, struct cpufreq_policy, update); unsigned int cpu = policy->cpu; - dprintk("handle_update for cpu %u called\n", cpu); + pr_debug("handle_update for cpu %u called\n", cpu); cpufreq_update_policy(cpu); } @@ -1273,7 +1174,7 @@ static void cpufreq_out_of_sync(unsigned int cpu, unsigned int old_freq, { struct cpufreq_freqs freqs; - dprintk("Warning: CPU frequency out of sync: cpufreq and timing " + pr_debug("Warning: CPU frequency out of sync: cpufreq and timing " "core thinks of %u, is %u kHz.\n", old_freq, new_freq); freqs.cpu = cpu; @@ -1376,7 +1277,7 @@ static int cpufreq_bp_suspend(void) int cpu = smp_processor_id(); struct cpufreq_policy *cpu_policy; - dprintk("suspending cpu %u\n", cpu); + pr_debug("suspending cpu %u\n", cpu); /* If there's no policy for the boot CPU, we have nothing to do. */ cpu_policy = cpufreq_cpu_get(cpu); @@ -1414,7 +1315,7 @@ static void cpufreq_bp_resume(void) int cpu = smp_processor_id(); struct cpufreq_policy *cpu_policy; - dprintk("resuming cpu %u\n", cpu); + pr_debug("resuming cpu %u\n", cpu); /* If there's no policy for the boot CPU, we have nothing to do. */ cpu_policy = cpufreq_cpu_get(cpu); @@ -1526,7 +1427,7 @@ int __cpufreq_driver_target(struct cpufreq_policy *policy, { int retval = -EINVAL; - dprintk("target for CPU %u: %u kHz, relation %u\n", policy->cpu, + pr_debug("target for CPU %u: %u kHz, relation %u\n", policy->cpu, target_freq, relation); if (cpu_online(policy->cpu) && cpufreq_driver->target) retval = cpufreq_driver->target(policy, target_freq, relation); @@ -1612,7 +1513,7 @@ static int __cpufreq_governor(struct cpufreq_policy *policy, if (!try_module_get(policy->governor->owner)) return -EINVAL; - dprintk("__cpufreq_governor for CPU %u, event %u\n", + pr_debug("__cpufreq_governor for CPU %u, event %u\n", policy->cpu, event); ret = policy->governor->governor(policy, event); @@ -1713,8 +1614,7 @@ static int __cpufreq_set_policy(struct cpufreq_policy *data, { int ret = 0; - cpufreq_debug_disable_ratelimit(); - dprintk("setting new policy for CPU %u: %u - %u kHz\n", policy->cpu, + pr_debug("setting new policy for CPU %u: %u - %u kHz\n", policy->cpu, policy->min, policy->max); memcpy(&policy->cpuinfo, &data->cpuinfo, @@ -1751,19 +1651,19 @@ static int __cpufreq_set_policy(struct cpufreq_policy *data, data->min = policy->min; data->max = policy->max; - dprintk("new min and max freqs are %u - %u kHz\n", + pr_debug("new min and max freqs are %u - %u kHz\n", data->min, data->max); if (cpufreq_driver->setpolicy) { data->policy = policy->policy; - dprintk("setting range\n"); + pr_debug("setting range\n"); ret = cpufreq_driver->setpolicy(policy); } else { if (policy->governor != data->governor) { /* save old, working values */ struct cpufreq_governor *old_gov = data->governor; - dprintk("governor switch\n"); + pr_debug("governor switch\n"); /* end old governor */ if (data->governor) @@ -1773,7 +1673,7 @@ static int __cpufreq_set_policy(struct cpufreq_policy *data, data->governor = policy->governor; if (__cpufreq_governor(data, CPUFREQ_GOV_START)) { /* new governor failed, so re-start old one */ - dprintk("starting governor %s failed\n", + pr_debug("starting governor %s failed\n", data->governor->name); if (old_gov) { data->governor = old_gov; @@ -1785,12 +1685,11 @@ static int __cpufreq_set_policy(struct cpufreq_policy *data, } /* might be a policy change, too, so fall through */ } - dprintk("governor: change or update limits\n"); + pr_debug("governor: change or update limits\n"); __cpufreq_governor(data, CPUFREQ_GOV_LIMITS); } error_out: - cpufreq_debug_enable_ratelimit(); return ret; } @@ -1817,7 +1716,7 @@ int cpufreq_update_policy(unsigned int cpu) goto fail; } - dprintk("updating policy for CPU %u\n", cpu); + pr_debug("updating policy for CPU %u\n", cpu); memcpy(&policy, data, sizeof(struct cpufreq_policy)); policy.min = data->user_policy.min; policy.max = data->user_policy.max; @@ -1829,7 +1728,7 @@ int cpufreq_update_policy(unsigned int cpu) if (cpufreq_driver->get) { policy.cur = cpufreq_driver->get(cpu); if (!data->cur) { - dprintk("Driver did not initialize current freq"); + pr_debug("Driver did not initialize current freq"); data->cur = policy.cur; } else { if (data->cur != policy.cur) @@ -1905,7 +1804,7 @@ int cpufreq_register_driver(struct cpufreq_driver *driver_data) ((!driver_data->setpolicy) && (!driver_data->target))) return -EINVAL; - dprintk("trying to register driver %s\n", driver_data->name); + pr_debug("trying to register driver %s\n", driver_data->name); if (driver_data->setpolicy) driver_data->flags |= CPUFREQ_CONST_LOOPS; @@ -1936,15 +1835,14 @@ int cpufreq_register_driver(struct cpufreq_driver *driver_data) /* if all ->init() calls failed, unregister */ if (ret) { - dprintk("no CPU initialized for driver %s\n", + pr_debug("no CPU initialized for driver %s\n", driver_data->name); goto err_sysdev_unreg; } } register_hotcpu_notifier(&cpufreq_cpu_notifier); - dprintk("driver %s up and running\n", driver_data->name); - cpufreq_debug_enable_ratelimit(); + pr_debug("driver %s up and running\n", driver_data->name); return 0; err_sysdev_unreg: @@ -1971,14 +1869,10 @@ int cpufreq_unregister_driver(struct cpufreq_driver *driver) { unsigned long flags; - cpufreq_debug_disable_ratelimit(); - - if (!cpufreq_driver || (driver != cpufreq_driver)) { - cpufreq_debug_enable_ratelimit(); + if (!cpufreq_driver || (driver != cpufreq_driver)) return -EINVAL; - } - dprintk("unregistering driver %s\n", driver->name); + pr_debug("unregistering driver %s\n", driver->name); sysdev_driver_unregister(&cpu_sysdev_class, &cpufreq_sysdev_driver); unregister_hotcpu_notifier(&cpufreq_cpu_notifier); diff --git a/drivers/cpufreq/cpufreq_performance.c b/drivers/cpufreq/cpufreq_performance.c index 7e2e5150..f13a8a9 100644 --- a/drivers/cpufreq/cpufreq_performance.c +++ b/drivers/cpufreq/cpufreq_performance.c @@ -15,9 +15,6 @@ #include #include -#define dprintk(msg...) \ - cpufreq_debug_printk(CPUFREQ_DEBUG_GOVERNOR, "performance", msg) - static int cpufreq_governor_performance(struct cpufreq_policy *policy, unsigned int event) @@ -25,7 +22,7 @@ static int cpufreq_governor_performance(struct cpufreq_policy *policy, switch (event) { case CPUFREQ_GOV_START: case CPUFREQ_GOV_LIMITS: - dprintk("setting to %u kHz because of event %u\n", + pr_debug("setting to %u kHz because of event %u\n", policy->max, event); __cpufreq_driver_target(policy, policy->max, CPUFREQ_RELATION_H); diff --git a/drivers/cpufreq/cpufreq_powersave.c b/drivers/cpufreq/cpufreq_powersave.c index e6db5fa..4c2eb51 100644 --- a/drivers/cpufreq/cpufreq_powersave.c +++ b/drivers/cpufreq/cpufreq_powersave.c @@ -15,16 +15,13 @@ #include #include -#define dprintk(msg...) \ - cpufreq_debug_printk(CPUFREQ_DEBUG_GOVERNOR, "powersave", msg) - static int cpufreq_governor_powersave(struct cpufreq_policy *policy, unsigned int event) { switch (event) { case CPUFREQ_GOV_START: case CPUFREQ_GOV_LIMITS: - dprintk("setting to %u kHz because of event %u\n", + pr_debug("setting to %u kHz because of event %u\n", policy->min, event); __cpufreq_driver_target(policy, policy->min, CPUFREQ_RELATION_L); diff --git a/drivers/cpufreq/cpufreq_userspace.c b/drivers/cpufreq/cpufreq_userspace.c index 66d2d1d..f231015 100644 --- a/drivers/cpufreq/cpufreq_userspace.c +++ b/drivers/cpufreq/cpufreq_userspace.c @@ -37,9 +37,6 @@ static DEFINE_PER_CPU(unsigned int, cpu_is_managed); static DEFINE_MUTEX(userspace_mutex); static int cpus_using_userspace_governor; -#define dprintk(msg...) \ - cpufreq_debug_printk(CPUFREQ_DEBUG_GOVERNOR, "userspace", msg) - /* keep track of frequency transitions */ static int userspace_cpufreq_notifier(struct notifier_block *nb, unsigned long val, @@ -50,7 +47,7 @@ userspace_cpufreq_notifier(struct notifier_block *nb, unsigned long val, if (!per_cpu(cpu_is_managed, freq->cpu)) return 0; - dprintk("saving cpu_cur_freq of cpu %u to be %u kHz\n", + pr_debug("saving cpu_cur_freq of cpu %u to be %u kHz\n", freq->cpu, freq->new); per_cpu(cpu_cur_freq, freq->cpu) = freq->new; @@ -73,7 +70,7 @@ static int cpufreq_set(struct cpufreq_policy *policy, unsigned int freq) { int ret = -EINVAL; - dprintk("cpufreq_set for cpu %u, freq %u kHz\n", policy->cpu, freq); + pr_debug("cpufreq_set for cpu %u, freq %u kHz\n", policy->cpu, freq); mutex_lock(&userspace_mutex); if (!per_cpu(cpu_is_managed, policy->cpu)) @@ -134,7 +131,7 @@ static int cpufreq_governor_userspace(struct cpufreq_policy *policy, per_cpu(cpu_max_freq, cpu) = policy->max; per_cpu(cpu_cur_freq, cpu) = policy->cur; per_cpu(cpu_set_freq, cpu) = policy->cur; - dprintk("managing cpu %u started " + pr_debug("managing cpu %u started " "(%u - %u kHz, currently %u kHz)\n", cpu, per_cpu(cpu_min_freq, cpu), @@ -156,12 +153,12 @@ static int cpufreq_governor_userspace(struct cpufreq_policy *policy, per_cpu(cpu_min_freq, cpu) = 0; per_cpu(cpu_max_freq, cpu) = 0; per_cpu(cpu_set_freq, cpu) = 0; - dprintk("managing cpu %u stopped\n", cpu); + pr_debug("managing cpu %u stopped\n", cpu); mutex_unlock(&userspace_mutex); break; case CPUFREQ_GOV_LIMITS: mutex_lock(&userspace_mutex); - dprintk("limit event for cpu %u: %u - %u kHz, " + pr_debug("limit event for cpu %u: %u - %u kHz, " "currently %u kHz, last set to %u kHz\n", cpu, policy->min, policy->max, per_cpu(cpu_cur_freq, cpu), diff --git a/drivers/cpufreq/freq_table.c b/drivers/cpufreq/freq_table.c index 0543221..90431cb 100644 --- a/drivers/cpufreq/freq_table.c +++ b/drivers/cpufreq/freq_table.c @@ -14,9 +14,6 @@ #include #include -#define dprintk(msg...) \ - cpufreq_debug_printk(CPUFREQ_DEBUG_CORE, "freq-table", msg) - /********************************************************************* * FREQUENCY TABLE HELPERS * *********************************************************************/ @@ -31,11 +28,11 @@ int cpufreq_frequency_table_cpuinfo(struct cpufreq_policy *policy, for (i = 0; (table[i].frequency != CPUFREQ_TABLE_END); i++) { unsigned int freq = table[i].frequency; if (freq == CPUFREQ_ENTRY_INVALID) { - dprintk("table entry %u is invalid, skipping\n", i); + pr_debug("table entry %u is invalid, skipping\n", i); continue; } - dprintk("table entry %u: %u kHz, %u index\n", + pr_debug("table entry %u: %u kHz, %u index\n", i, freq, table[i].index); if (freq < min_freq) min_freq = freq; @@ -61,7 +58,7 @@ int cpufreq_frequency_table_verify(struct cpufreq_policy *policy, unsigned int i; unsigned int count = 0; - dprintk("request for verification of policy (%u - %u kHz) for cpu %u\n", + pr_debug("request for verification of policy (%u - %u kHz) for cpu %u\n", policy->min, policy->max, policy->cpu); if (!cpu_online(policy->cpu)) @@ -86,7 +83,7 @@ int cpufreq_frequency_table_verify(struct cpufreq_policy *policy, cpufreq_verify_within_limits(policy, policy->cpuinfo.min_freq, policy->cpuinfo.max_freq); - dprintk("verification lead to (%u - %u kHz) for cpu %u\n", + pr_debug("verification lead to (%u - %u kHz) for cpu %u\n", policy->min, policy->max, policy->cpu); return 0; @@ -110,7 +107,7 @@ int cpufreq_frequency_table_target(struct cpufreq_policy *policy, }; unsigned int i; - dprintk("request for target %u kHz (relation: %u) for cpu %u\n", + pr_debug("request for target %u kHz (relation: %u) for cpu %u\n", target_freq, relation, policy->cpu); switch (relation) { @@ -167,7 +164,7 @@ int cpufreq_frequency_table_target(struct cpufreq_policy *policy, } else *index = optimal.index; - dprintk("target is %u (%u kHz, %u)\n", *index, table[*index].frequency, + pr_debug("target is %u (%u kHz, %u)\n", *index, table[*index].frequency, table[*index].index); return 0; @@ -216,14 +213,14 @@ EXPORT_SYMBOL_GPL(cpufreq_freq_attr_scaling_available_freqs); void cpufreq_frequency_table_get_attr(struct cpufreq_frequency_table *table, unsigned int cpu) { - dprintk("setting show_table for cpu %u to %p\n", cpu, table); + pr_debug("setting show_table for cpu %u to %p\n", cpu, table); per_cpu(cpufreq_show_table, cpu) = table; } EXPORT_SYMBOL_GPL(cpufreq_frequency_table_get_attr); void cpufreq_frequency_table_put_attr(unsigned int cpu) { - dprintk("clearing show_table for cpu %u\n", cpu); + pr_debug("clearing show_table for cpu %u\n", cpu); per_cpu(cpufreq_show_table, cpu) = NULL; } EXPORT_SYMBOL_GPL(cpufreq_frequency_table_put_attr); diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 9343dd3..2845f6e6 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -397,23 +397,4 @@ void cpufreq_frequency_table_get_attr(struct cpufreq_frequency_table *table, void cpufreq_frequency_table_put_attr(unsigned int cpu); -/********************************************************************* - * UNIFIED DEBUG HELPERS * - *********************************************************************/ - -#define CPUFREQ_DEBUG_CORE 1 -#define CPUFREQ_DEBUG_DRIVER 2 -#define CPUFREQ_DEBUG_GOVERNOR 4 - -#ifdef CONFIG_CPU_FREQ_DEBUG - -extern void cpufreq_debug_printk(unsigned int type, const char *prefix, - const char *fmt, ...); - -#else - -#define cpufreq_debug_printk(msg...) do { } while(0) - -#endif /* CONFIG_CPU_FREQ_DEBUG */ - #endif /* _LINUX_CPUFREQ_H */ -- cgit v1.1 From 335dc3335ff692173bc766b248f7a97f3a23b30a Mon Sep 17 00:00:00 2001 From: Thiago Farina Date: Thu, 28 Apr 2011 20:42:53 -0300 Subject: [CPUFREQ] cpufreq.h: Fix some checkpatch.pl coding style issues. Before: $ scripts/checkpatch.pl --file --terse include/linux/cpufreq.h total: 14 errors, 11 warnings, 419 lines checked After: $ scripts/checkpatch.pl --file --terse include/linux/cpufreq.h total: 2 errors, 4 warnings, 422 lines checked Signed-off-by: Thiago Farina Signed-off-by: Dave Jones --- include/linux/cpufreq.h | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 2845f6e6..11be48e 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -3,7 +3,7 @@ * * Copyright (C) 2001 Russell King * (C) 2002 - 2003 Dominik Brodowski - * + * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. @@ -56,9 +56,9 @@ static inline int cpufreq_unregister_notifier(struct notifier_block *nb, #define CPUFREQ_POLICY_POWERSAVE (1) #define CPUFREQ_POLICY_PERFORMANCE (2) -/* Frequency values here are CPU kHz so that hardware which doesn't run - * with some frequencies can complain without having to guess what per - * cent / per mille means. +/* Frequency values here are CPU kHz so that hardware which doesn't run + * with some frequencies can complain without having to guess what per + * cent / per mille means. * Maximum transition latency is in nanoseconds - if it's unknown, * CPUFREQ_ETERNAL shall be used. */ @@ -72,13 +72,15 @@ extern struct kobject *cpufreq_global_kobject; struct cpufreq_cpuinfo { unsigned int max_freq; unsigned int min_freq; - unsigned int transition_latency; /* in 10^(-9) s = nanoseconds */ + + /* in 10^(-9) s = nanoseconds */ + unsigned int transition_latency; }; struct cpufreq_real_policy { unsigned int min; /* in kHz */ unsigned int max; /* in kHz */ - unsigned int policy; /* see above */ + unsigned int policy; /* see above */ struct cpufreq_governor *governor; /* see below */ }; @@ -94,7 +96,7 @@ struct cpufreq_policy { unsigned int max; /* in kHz */ unsigned int cur; /* in kHz, only needed if cpufreq * governors are used */ - unsigned int policy; /* see above */ + unsigned int policy; /* see above */ struct cpufreq_governor *governor; /* see below */ struct work_struct update; /* if update_policy() needs to be @@ -167,11 +169,11 @@ static inline unsigned long cpufreq_scale(unsigned long old, u_int div, u_int mu struct cpufreq_governor { char name[CPUFREQ_NAME_LEN]; - int (*governor) (struct cpufreq_policy *policy, + int (*governor) (struct cpufreq_policy *policy, unsigned int event); ssize_t (*show_setspeed) (struct cpufreq_policy *policy, char *buf); - int (*store_setspeed) (struct cpufreq_policy *policy, + int (*store_setspeed) (struct cpufreq_policy *policy, unsigned int freq); unsigned int max_transition_latency; /* HW must be able to switch to next freq faster than this value in nano secs or we @@ -180,7 +182,8 @@ struct cpufreq_governor { struct module *owner; }; -/* pass a target to the cpufreq driver +/* + * Pass a target to the cpufreq driver. */ extern int cpufreq_driver_target(struct cpufreq_policy *policy, unsigned int target_freq, @@ -237,9 +240,9 @@ struct cpufreq_driver { /* flags */ -#define CPUFREQ_STICKY 0x01 /* the driver isn't removed even if +#define CPUFREQ_STICKY 0x01 /* the driver isn't removed even if * all ->init() calls failed */ -#define CPUFREQ_CONST_LOOPS 0x02 /* loops_per_jiffy or other kernel +#define CPUFREQ_CONST_LOOPS 0x02 /* loops_per_jiffy or other kernel * "constants" aren't affected by * frequency transitions */ #define CPUFREQ_PM_NO_WARN 0x04 /* don't warn on suspend/resume speed @@ -252,7 +255,7 @@ int cpufreq_unregister_driver(struct cpufreq_driver *driver_data); void cpufreq_notify_transition(struct cpufreq_freqs *freqs, unsigned int state); -static inline void cpufreq_verify_within_limits(struct cpufreq_policy *policy, unsigned int min, unsigned int max) +static inline void cpufreq_verify_within_limits(struct cpufreq_policy *policy, unsigned int min, unsigned int max) { if (policy->min < min) policy->min = min; @@ -386,12 +389,12 @@ int cpufreq_frequency_table_target(struct cpufreq_policy *policy, /* the following 3 funtions are for cpufreq core use only */ struct cpufreq_frequency_table *cpufreq_frequency_get_table(unsigned int cpu); struct cpufreq_policy *cpufreq_cpu_get(unsigned int cpu); -void cpufreq_cpu_put (struct cpufreq_policy *data); +void cpufreq_cpu_put(struct cpufreq_policy *data); /* the following are really really optional */ extern struct freq_attr cpufreq_freq_attr_scaling_available_freqs; -void cpufreq_frequency_table_get_attr(struct cpufreq_frequency_table *table, +void cpufreq_frequency_table_get_attr(struct cpufreq_frequency_table *table, unsigned int cpu); void cpufreq_frequency_table_put_attr(unsigned int cpu); -- cgit v1.1 From 98586ed8b8878e10691203687e89a42fa3355300 Mon Sep 17 00:00:00 2001 From: steven finney Date: Mon, 2 May 2011 11:29:17 -0700 Subject: [CPUFREQ] Fix memory leak in cpufreq_stat When a CPU is taken offline in an SMP system, cpufreq_remove_dev() nulls out the per-cpu policy before cpufreq_stats_free_table() can make use of it. cpufreq_stats_free_table() then skips the call to sysfs_remove_group(), leaving about 100 bytes of sysfs-related memory unclaimed each time a CPU-removal occurs. Break up cpu_stats_free_table into sysfs and table portions, and call the sysfs portion early. Signed-off-by: Steven Finney Signed-off-by: Dave Jones Cc: stable@kernel.org --- drivers/cpufreq/cpufreq_stats.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c index 00d73fc..4f1b8de 100644 --- a/drivers/cpufreq/cpufreq_stats.c +++ b/drivers/cpufreq/cpufreq_stats.c @@ -165,17 +165,27 @@ static int freq_table_get_index(struct cpufreq_stats *stat, unsigned int freq) return -1; } +/* should be called late in the CPU removal sequence so that the stats + * memory is still available in case someone tries to use it. + */ static void cpufreq_stats_free_table(unsigned int cpu) { struct cpufreq_stats *stat = per_cpu(cpufreq_stats_table, cpu); - struct cpufreq_policy *policy = cpufreq_cpu_get(cpu); - if (policy && policy->cpu == cpu) - sysfs_remove_group(&policy->kobj, &stats_attr_group); if (stat) { kfree(stat->time_in_state); kfree(stat); } per_cpu(cpufreq_stats_table, cpu) = NULL; +} + +/* must be called early in the CPU removal sequence (before + * cpufreq_remove_dev) so that policy is still valid. + */ +static void cpufreq_stats_free_sysfs(unsigned int cpu) +{ + struct cpufreq_policy *policy = cpufreq_cpu_get(cpu); + if (policy && policy->cpu == cpu) + sysfs_remove_group(&policy->kobj, &stats_attr_group); if (policy) cpufreq_cpu_put(policy); } @@ -316,6 +326,9 @@ static int __cpuinit cpufreq_stat_cpu_callback(struct notifier_block *nfb, case CPU_ONLINE_FROZEN: cpufreq_update_policy(cpu); break; + case CPU_DOWN_PREPARE: + cpufreq_stats_free_sysfs(cpu); + break; case CPU_DEAD: case CPU_DEAD_FROZEN: cpufreq_stats_free_table(cpu); @@ -324,9 +337,11 @@ static int __cpuinit cpufreq_stat_cpu_callback(struct notifier_block *nfb, return NOTIFY_OK; } +/* priority=1 so this will get called before cpufreq_remove_dev */ static struct notifier_block cpufreq_stat_cpu_notifier __refdata = { .notifier_call = cpufreq_stat_cpu_callback, + .priority = 1, }; static struct notifier_block notifier_policy_block = { -- cgit v1.1 From 469057d587a9de2cd6087d71a008b908e785a5b6 Mon Sep 17 00:00:00 2001 From: Karthigan Srinivasan Date: Fri, 1 Apr 2011 17:34:47 -0500 Subject: [CPUFREQ] cpufreq_stats.c: Fixed brace coding style issue Fixed brace coding style issue. Signed-off-by: Karthigan Srinivasan Signed-off-by: Dave Jones --- drivers/cpufreq/cpufreq_stats.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c index 4f1b8de..b60a4c2 100644 --- a/drivers/cpufreq/cpufreq_stats.c +++ b/drivers/cpufreq/cpufreq_stats.c @@ -338,8 +338,7 @@ static int __cpuinit cpufreq_stat_cpu_callback(struct notifier_block *nfb, } /* priority=1 so this will get called before cpufreq_remove_dev */ -static struct notifier_block cpufreq_stat_cpu_notifier __refdata = -{ +static struct notifier_block cpufreq_stat_cpu_notifier __refdata = { .notifier_call = cpufreq_stat_cpu_callback, .priority = 1, }; -- cgit v1.1 From 1a8e1463a49aaa452da1cefe184a00d4df47f1ef Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 4 May 2011 08:38:56 -0700 Subject: [CPUFREQ] remove redundant sprintf from request_module call. Since format string handling is part of request_module, there is no need to construct the module name. As such, drop the redundant sprintf and heap usage. Signed-off-by: Kees Cook Signed-off-by: Dave Jones --- drivers/cpufreq/cpufreq.c | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 1e08af4..0a5bea9 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -321,21 +321,14 @@ static int cpufreq_parse_governor(char *str_governor, unsigned int *policy, t = __find_governor(str_governor); if (t == NULL) { - char *name = kasprintf(GFP_KERNEL, "cpufreq_%s", - str_governor); + int ret; - if (name) { - int ret; + mutex_unlock(&cpufreq_governor_mutex); + ret = request_module("cpufreq_%s", str_governor); + mutex_lock(&cpufreq_governor_mutex); - mutex_unlock(&cpufreq_governor_mutex); - ret = request_module("%s", name); - mutex_lock(&cpufreq_governor_mutex); - - if (ret == 0) - t = __find_governor(str_governor); - } - - kfree(name); + if (ret == 0) + t = __find_governor(str_governor); } if (t != NULL) { -- cgit v1.1 From b7534f002d3c81d18abfbf57179d07d3ec763bb5 Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Sat, 30 Apr 2011 10:34:05 -0300 Subject: [media] v4l: Release module if subdev registration fails If v4l2_device_register_subdev() fails, the reference to the subdev module taken by the function isn't released. Fix this. Signed-off-by: Laurent Pinchart Cc: stable@kernel.org Acked-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- drivers/media/video/v4l2-device.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/media/video/v4l2-device.c b/drivers/media/video/v4l2-device.c index 5aeaf87..4aae501 100644 --- a/drivers/media/video/v4l2-device.c +++ b/drivers/media/video/v4l2-device.c @@ -155,8 +155,10 @@ int v4l2_device_register_subdev(struct v4l2_device *v4l2_dev, sd->v4l2_dev = v4l2_dev; if (sd->internal_ops && sd->internal_ops->registered) { err = sd->internal_ops->registered(sd); - if (err) + if (err) { + module_put(sd->owner); return err; + } } /* This just returns 0 if either of the two args is NULL */ @@ -164,6 +166,7 @@ int v4l2_device_register_subdev(struct v4l2_device *v4l2_dev, if (err) { if (sd->internal_ops && sd->internal_ops->unregistered) sd->internal_ops->unregistered(sd); + module_put(sd->owner); return err; } -- cgit v1.1 From 2a164d02dd34c6b49a3f0995900e0f8af102b804 Mon Sep 17 00:00:00 2001 From: Lawrence Rust Date: Fri, 8 Apr 2011 09:50:45 -0300 Subject: [media] Fix cx88 remote control input In the IR interrupt handler of cx88-input.c there's a 32-bit multiply overflow which causes IR pulse durations to be incorrectly calculated. This is a regression caused by commit 2997137be8eba. Cc: stable@kernel.org Signed-off-by: Mauro Carvalho Chehab --- drivers/media/video/cx88/cx88-input.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/media/video/cx88/cx88-input.c b/drivers/media/video/cx88/cx88-input.c index c820e2f..3f44200 100644 --- a/drivers/media/video/cx88/cx88-input.c +++ b/drivers/media/video/cx88/cx88-input.c @@ -524,7 +524,7 @@ void cx88_ir_irq(struct cx88_core *core) for (todo = 32; todo > 0; todo -= bits) { ev.pulse = samples & 0x80000000 ? false : true; bits = min(todo, 32U - fls(ev.pulse ? samples : ~samples)); - ev.duration = (bits * NSEC_PER_SEC) / (1000 * ir_samplerate); + ev.duration = (bits * (NSEC_PER_SEC / 1000)) / ir_samplerate; ir_raw_event_store_with_filter(ir->dev, &ev); samples <<= bits; } -- cgit v1.1 From 2dd251f0a294300a1cf8f4b63768145fa6153c4d Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Sat, 16 Apr 2011 10:23:51 +0100 Subject: drm/i915: Release object along create user fb error path Reported-by: Alan Cox Signed-off-by: Chris Wilson Cc: stable@kernel.org Signed-off-by: Keith Packard --- drivers/gpu/drm/i915/intel_display.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c index e522c70..aab06cf 100644 --- a/drivers/gpu/drm/i915/intel_display.c +++ b/drivers/gpu/drm/i915/intel_display.c @@ -6579,8 +6579,10 @@ intel_user_framebuffer_create(struct drm_device *dev, return ERR_PTR(-ENOENT); intel_fb = kzalloc(sizeof(*intel_fb), GFP_KERNEL); - if (!intel_fb) + if (!intel_fb) { + drm_gem_object_unreference_unlocked(&obj->base); return ERR_PTR(-ENOMEM); + } ret = intel_framebuffer_init(dev, intel_fb, mode_cmd, obj); if (ret) { -- cgit v1.1 From 31acbcc408f412d1ba73765b846c38642be553c3 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Sun, 17 Apr 2011 06:38:35 +0100 Subject: drm/i915/dp: Be paranoid in case we disable a DP before it is attached Given that the hardware may be left in a random condition by the BIOS, it is conceivable that we then attempt to clear the DP_PIPEB_SELECT bit without us ever enabling/attaching the DP encoder to a pipe. Thus causing a NULL deference when we attempt to wait for a vblank on that crtc. Reported-and-tested-by: Bryan Christ Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=36314 Signed-off-by: Chris Wilson Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=36456 Reported-and-tested-by: Bo Wang Cc: stable@kernel.org Signed-off-by: Keith Packard --- drivers/gpu/drm/i915/intel_dp.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/intel_dp.c b/drivers/gpu/drm/i915/intel_dp.c index cb8578b..a4d8031 100644 --- a/drivers/gpu/drm/i915/intel_dp.c +++ b/drivers/gpu/drm/i915/intel_dp.c @@ -1470,7 +1470,8 @@ intel_dp_link_down(struct intel_dp *intel_dp) if (!HAS_PCH_CPT(dev) && I915_READ(intel_dp->output_reg) & DP_PIPEB_SELECT) { - struct intel_crtc *intel_crtc = to_intel_crtc(intel_dp->base.base.crtc); + struct drm_crtc *crtc = intel_dp->base.base.crtc; + /* Hardware workaround: leaving our transcoder select * set to transcoder B while it's off will prevent the * corresponding HDMI output on transcoder A. @@ -1485,7 +1486,19 @@ intel_dp_link_down(struct intel_dp *intel_dp) /* Changes to enable or select take place the vblank * after being written. */ - intel_wait_for_vblank(dev, intel_crtc->pipe); + if (crtc == NULL) { + /* We can arrive here never having been attached + * to a CRTC, for instance, due to inheriting + * random state from the BIOS. + * + * If the pipe is not running, play safe and + * wait for the clocks to stabilise before + * continuing. + */ + POSTING_READ(intel_dp->output_reg); + msleep(50); + } else + intel_wait_for_vblank(dev, to_intel_crtc(crtc)->pipe); } I915_WRITE(intel_dp->output_reg, DP & ~DP_PORT_EN); -- cgit v1.1 From 0b84834a5a9f5fe8f3760560ef8c5b1536d22bd1 Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Mon, 2 May 2011 08:09:25 -0300 Subject: [media] v4l2-subdev: fix broken subdev control enumeration The v4l2_subdev_* functions are meant for older V4L2 drivers that do not use the control framework yet. These functions should not be used by subdev_do_ioctl. Most of those backwards compatibility functions are just stubs, but commit 87a0c94ce616b231f3c0bd09d7dbd39d43b0557a actually changed the behavior of v4l2_subdev_queryctrl, so calling that one from subdev_do_ioctl broke the control enumeration in subdev nodes. The fix is simply not to use those compatibility functions in v4l2-subdev.c. Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- drivers/media/video/v4l2-subdev.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/media/video/v4l2-subdev.c b/drivers/media/video/v4l2-subdev.c index 0b80644..812729e 100644 --- a/drivers/media/video/v4l2-subdev.c +++ b/drivers/media/video/v4l2-subdev.c @@ -155,25 +155,25 @@ static long subdev_do_ioctl(struct file *file, unsigned int cmd, void *arg) switch (cmd) { case VIDIOC_QUERYCTRL: - return v4l2_subdev_queryctrl(sd, arg); + return v4l2_queryctrl(sd->ctrl_handler, arg); case VIDIOC_QUERYMENU: - return v4l2_subdev_querymenu(sd, arg); + return v4l2_querymenu(sd->ctrl_handler, arg); case VIDIOC_G_CTRL: - return v4l2_subdev_g_ctrl(sd, arg); + return v4l2_g_ctrl(sd->ctrl_handler, arg); case VIDIOC_S_CTRL: - return v4l2_subdev_s_ctrl(sd, arg); + return v4l2_s_ctrl(sd->ctrl_handler, arg); case VIDIOC_G_EXT_CTRLS: - return v4l2_subdev_g_ext_ctrls(sd, arg); + return v4l2_g_ext_ctrls(sd->ctrl_handler, arg); case VIDIOC_S_EXT_CTRLS: - return v4l2_subdev_s_ext_ctrls(sd, arg); + return v4l2_s_ext_ctrls(sd->ctrl_handler, arg); case VIDIOC_TRY_EXT_CTRLS: - return v4l2_subdev_try_ext_ctrls(sd, arg); + return v4l2_try_ext_ctrls(sd->ctrl_handler, arg); case VIDIOC_DQEVENT: if (!(sd->flags & V4L2_SUBDEV_FL_HAS_EVENTS)) -- cgit v1.1 From dca6b6d18fa4428c4539e45f7a43040e388ab99e Mon Sep 17 00:00:00 2001 From: Sergio Aguirre Date: Mon, 11 Apr 2011 13:14:33 -0300 Subject: [media] V4L: soc-camera: regression fix: calculate .sizeimage in soc_camera.c A recent patch has given individual soc-camera host drivers a possibility to calculate .sizeimage and .bytesperline pixel format fields internally, however, some drivers relied on the core calculating these values for them, following a default algorithm. This patch restores the default calculation for such drivers. Based on initial patch by Guennadi Liakhovetski, found here: http://www.spinics.net/lists/linux-media/msg31282.html Except that this covers try_fmt aswell. Signed-off-by: Sergio Aguirre Signed-off-by: Guennadi Liakhovetski Signed-off-by: Mauro Carvalho Chehab --- drivers/media/video/soc_camera.c | 48 +++++++++++++++++++++++++++++++++++----- 1 file changed, 42 insertions(+), 6 deletions(-) diff --git a/drivers/media/video/soc_camera.c b/drivers/media/video/soc_camera.c index 3973f9a..ddb4c09 100644 --- a/drivers/media/video/soc_camera.c +++ b/drivers/media/video/soc_camera.c @@ -136,11 +136,50 @@ unsigned long soc_camera_apply_sensor_flags(struct soc_camera_link *icl, } EXPORT_SYMBOL(soc_camera_apply_sensor_flags); +#define pixfmtstr(x) (x) & 0xff, ((x) >> 8) & 0xff, ((x) >> 16) & 0xff, \ + ((x) >> 24) & 0xff + +static int soc_camera_try_fmt(struct soc_camera_device *icd, + struct v4l2_format *f) +{ + struct soc_camera_host *ici = to_soc_camera_host(icd->dev.parent); + struct v4l2_pix_format *pix = &f->fmt.pix; + int ret; + + dev_dbg(&icd->dev, "TRY_FMT(%c%c%c%c, %ux%u)\n", + pixfmtstr(pix->pixelformat), pix->width, pix->height); + + pix->bytesperline = 0; + pix->sizeimage = 0; + + ret = ici->ops->try_fmt(icd, f); + if (ret < 0) + return ret; + + if (!pix->sizeimage) { + if (!pix->bytesperline) { + const struct soc_camera_format_xlate *xlate; + + xlate = soc_camera_xlate_by_fourcc(icd, pix->pixelformat); + if (!xlate) + return -EINVAL; + + ret = soc_mbus_bytes_per_line(pix->width, + xlate->host_fmt); + if (ret > 0) + pix->bytesperline = ret; + } + if (pix->bytesperline) + pix->sizeimage = pix->bytesperline * pix->height; + } + + return 0; +} + static int soc_camera_try_fmt_vid_cap(struct file *file, void *priv, struct v4l2_format *f) { struct soc_camera_device *icd = file->private_data; - struct soc_camera_host *ici = to_soc_camera_host(icd->dev.parent); WARN_ON(priv != file->private_data); @@ -149,7 +188,7 @@ static int soc_camera_try_fmt_vid_cap(struct file *file, void *priv, return -EINVAL; /* limit format to hardware capabilities */ - return ici->ops->try_fmt(icd, f); + return soc_camera_try_fmt(icd, f); } static int soc_camera_enum_input(struct file *file, void *priv, @@ -362,9 +401,6 @@ static void soc_camera_free_user_formats(struct soc_camera_device *icd) icd->user_formats = NULL; } -#define pixfmtstr(x) (x) & 0xff, ((x) >> 8) & 0xff, ((x) >> 16) & 0xff, \ - ((x) >> 24) & 0xff - /* Called with .vb_lock held, or from the first open(2), see comment there */ static int soc_camera_set_fmt(struct soc_camera_device *icd, struct v4l2_format *f) @@ -377,7 +413,7 @@ static int soc_camera_set_fmt(struct soc_camera_device *icd, pixfmtstr(pix->pixelformat), pix->width, pix->height); /* We always call try_fmt() before set_fmt() or set_crop() */ - ret = ici->ops->try_fmt(icd, f); + ret = soc_camera_try_fmt(icd, f); if (ret < 0) return ret; -- cgit v1.1 From fca65b4ad72d28cbb43a029114d04b89f06faadb Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 4 May 2011 11:33:47 -0700 Subject: ceph: do not call __mark_dirty_inode under i_lock The __mark_dirty_inode helper now takes i_lock as of 250df6ed. Fix the one ceph callers that held i_lock (__ceph_mark_dirty_caps) to return the flags value so that the callers can do it outside of i_lock. Signed-off-by: Sage Weil --- fs/ceph/caps.c | 10 +++++----- fs/ceph/file.c | 5 ++++- fs/ceph/inode.c | 6 +++++- fs/ceph/super.h | 2 +- fs/ceph/xattr.c | 12 ++++++++---- 5 files changed, 23 insertions(+), 12 deletions(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 010ba9c..9fa0866 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1331,10 +1331,11 @@ static void ceph_flush_snaps(struct ceph_inode_info *ci) } /* - * Mark caps dirty. If inode is newly dirty, add to the global dirty - * list. + * Mark caps dirty. If inode is newly dirty, return the dirty flags. + * Caller is then responsible for calling __mark_inode_dirty with the + * returned flags value. */ -void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) +int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) { struct ceph_mds_client *mdsc = ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; @@ -1365,9 +1366,8 @@ void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) if (((was | ci->i_flushing_caps) & CEPH_CAP_FILE_BUFFER) && (mask & CEPH_CAP_FILE_BUFFER)) dirty |= I_DIRTY_DATASYNC; - if (dirty) - __mark_inode_dirty(inode, dirty); __cap_delay_requeue(mdsc, ci); + return dirty; } /* diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 159b512..203252d 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -734,9 +734,12 @@ retry_snap: } } if (ret >= 0) { + int dirty; spin_lock(&inode->i_lock); - __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); + dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); spin_unlock(&inode->i_lock); + if (dirty) + __mark_inode_dirty(inode, dirty); } out: diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index b54c97da..03d6daf 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1567,6 +1567,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) int release = 0, dirtied = 0; int mask = 0; int err = 0; + int inode_dirty_flags = 0; if (ceph_snap(inode) != CEPH_NOSNAP) return -EROFS; @@ -1725,13 +1726,16 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) dout("setattr %p ATTR_FILE ... hrm!\n", inode); if (dirtied) { - __ceph_mark_dirty_caps(ci, dirtied); + inode_dirty_flags = __ceph_mark_dirty_caps(ci, dirtied); inode->i_ctime = CURRENT_TIME; } release &= issued; spin_unlock(&inode->i_lock); + if (inode_dirty_flags) + __mark_inode_dirty(inode, inode_dirty_flags); + if (mask) { req->r_inode = igrab(inode); req->r_inode_drop = release; diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 619fe71..b1f1b8b 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -506,7 +506,7 @@ static inline int __ceph_caps_dirty(struct ceph_inode_info *ci) { return ci->i_dirty_caps | ci->i_flushing_caps; } -extern void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask); +extern int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask); extern int ceph_caps_revoking(struct ceph_inode_info *ci, int mask); extern int __ceph_caps_used(struct ceph_inode_info *ci); diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 8c9eba6..f2b6286 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -703,6 +703,7 @@ int ceph_setxattr(struct dentry *dentry, const char *name, struct ceph_inode_xattr *xattr = NULL; int issued; int required_blob_size; + int dirty; if (ceph_snap(inode) != CEPH_NOSNAP) return -EROFS; @@ -763,11 +764,12 @@ retry: dout("setxattr %p issued %s\n", inode, ceph_cap_string(issued)); err = __set_xattr(ci, newname, name_len, newval, val_len, 1, 1, 1, &xattr); - __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL); + dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL); ci->i_xattrs.dirty = true; inode->i_ctime = CURRENT_TIME; spin_unlock(&inode->i_lock); - + if (dirty) + __mark_inode_dirty(inode, dirty); return err; do_sync: @@ -810,6 +812,7 @@ int ceph_removexattr(struct dentry *dentry, const char *name) struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode); int issued; int err; + int dirty; if (ceph_snap(inode) != CEPH_NOSNAP) return -EROFS; @@ -833,12 +836,13 @@ int ceph_removexattr(struct dentry *dentry, const char *name) goto do_sync; err = __remove_xattr_by_name(ceph_inode(inode), name); - __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL); + dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL); ci->i_xattrs.dirty = true; inode->i_ctime = CURRENT_TIME; spin_unlock(&inode->i_lock); - + if (dirty) + __mark_inode_dirty(inode, dirty); return err; do_sync: spin_unlock(&inode->i_lock); -- cgit v1.1 From 64f3b9e203bd06855072e295557dca1485a2ecba Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 4 May 2011 10:02:26 +0000 Subject: net: ip_expire() must revalidate route Commit 4a94445c9a5c (net: Use ip_route_input_noref() in input path) added a bug in IP defragmentation handling, in case timeout is fired. When a frame is defragmented, we use last skb dst field when building final skb. Its dst is valid, since we are in rcu read section. But if a timeout occurs, we take first queued fragment to build one ICMP TIME EXCEEDED message. Problem is all queued skb have weak dst pointers, since we escaped RCU critical section after their queueing. icmp_send() might dereference a now freed (and possibly reused) part of memory. Calling skb_dst_drop() and ip_route_input_noref() to revalidate route is the only possible choice. Reported-by: Denys Fedoryshchenko Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/ip_fragment.c | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index a1151b8..b1d282f 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -223,31 +223,30 @@ static void ip_expire(unsigned long arg) if ((qp->q.last_in & INET_FRAG_FIRST_IN) && qp->q.fragments != NULL) { struct sk_buff *head = qp->q.fragments; + const struct iphdr *iph; + int err; rcu_read_lock(); head->dev = dev_get_by_index_rcu(net, qp->iif); if (!head->dev) goto out_rcu_unlock; + /* skb dst is stale, drop it, and perform route lookup again */ + skb_dst_drop(head); + iph = ip_hdr(head); + err = ip_route_input_noref(head, iph->daddr, iph->saddr, + iph->tos, head->dev); + if (err) + goto out_rcu_unlock; + /* - * Only search router table for the head fragment, - * when defraging timeout at PRE_ROUTING HOOK. + * Only an end host needs to send an ICMP + * "Fragment Reassembly Timeout" message, per RFC792. */ - if (qp->user == IP_DEFRAG_CONNTRACK_IN && !skb_dst(head)) { - const struct iphdr *iph = ip_hdr(head); - int err = ip_route_input(head, iph->daddr, iph->saddr, - iph->tos, head->dev); - if (unlikely(err)) - goto out_rcu_unlock; - - /* - * Only an end host needs to send an ICMP - * "Fragment Reassembly Timeout" message, per RFC792. - */ - if (skb_rtable(head)->rt_type != RTN_LOCAL) - goto out_rcu_unlock; + if (qp->user == IP_DEFRAG_CONNTRACK_IN && + skb_rtable(head)->rt_type != RTN_LOCAL) + goto out_rcu_unlock; - } /* Send an ICMP "Fragment Reassembly Timeout" message. */ icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0); -- cgit v1.1 From 30106b8ce2cc2243514116d6f29086e6deecc754 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 4 May 2011 15:38:19 +0200 Subject: slub: Fix the lockless code on 32-bit platforms with no 64-bit cmpxchg The SLUB allocator use of the cmpxchg_double logic was wrong: it actually needs the irq-safe one. That happens automatically when we use the native unlocked 'cmpxchg8b' instruction, but when compiling the kernel for older x86 CPUs that do not support that instruction, we fall back to the generic emulation code. And if you don't specify that you want the irq-safe version, the generic code ends up just open-coding the cmpxchg8b equivalent without any protection against interrupts or preemption. Which definitely doesn't work for SLUB. This was reported by Werner Landgraf , who saw instability with his distro-kernel that was compiled to support pretty much everything under the sun. Most big Linux distributions tend to compile for PPro and later, and would never have noticed this problem. This also fixes the prototypes for the irqsafe cmpxchg_double functions to use 'bool' like they should. [ Btw, that whole "generic code defaults to no protection" design just sounds stupid - if the code needs no protection, there is no reason to use "cmpxchg_double" to begin with. So we should probably just remove the unprotected version entirely as pointless. - Linus ] Signed-off-by: Thomas Gleixner Reported-and-tested-by: werner Acked-and-tested-by: Ingo Molnar Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: Jens Axboe Cc: Tejun Heo Link: http://lkml.kernel.org/r/alpine.LFD.2.02.1105041539050.3005@ionos Signed-off-by: Ingo Molnar Signed-off-by: Linus Torvalds --- include/linux/percpu.h | 2 +- mm/slub.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 3a5c444..8b97308 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -948,7 +948,7 @@ do { \ irqsafe_generic_cpu_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2) # endif # define irqsafe_cpu_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2) \ - __pcpu_double_call_return_int(irqsafe_cpu_cmpxchg_double_, (pcp1), (pcp2), (oval1), (oval2), (nval1), (nval2)) + __pcpu_double_call_return_bool(irqsafe_cpu_cmpxchg_double_, (pcp1), (pcp2), (oval1), (oval2), (nval1), (nval2)) #endif #endif /* __LINUX_PERCPU_H */ diff --git a/mm/slub.c b/mm/slub.c index 94d2a33..9d2e5e4 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1940,7 +1940,7 @@ redo: * Since this is without lock semantics the protection is only against * code executing on this cpu *not* from access by other cpus. */ - if (unlikely(!this_cpu_cmpxchg_double( + if (unlikely(!irqsafe_cpu_cmpxchg_double( s->cpu_slab->freelist, s->cpu_slab->tid, object, tid, get_freepointer(s, object), next_tid(tid)))) { @@ -2145,7 +2145,7 @@ redo: set_freepointer(s, object, c->freelist); #ifdef CONFIG_CMPXCHG_LOCAL - if (unlikely(!this_cpu_cmpxchg_double( + if (unlikely(!irqsafe_cpu_cmpxchg_double( s->cpu_slab->freelist, s->cpu_slab->tid, c->freelist, tid, object, next_tid(tid)))) { -- cgit v1.1 From a1fde08c74e90accd62d4cfdbf580d2ede938fe7 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 4 May 2011 21:30:28 -0700 Subject: VM: skip the stack guard page lookup in get_user_pages only for mlock The logic in __get_user_pages() used to skip the stack guard page lookup whenever the caller wasn't interested in seeing what the actual page was. But Michel Lespinasse points out that there are cases where we don't care about the physical page itself (so 'pages' may be NULL), but do want to make sure a page is mapped into the virtual address space. So using the existence of the "pages" array as an indication of whether to look up the guard page or not isn't actually so great, and we really should just use the FOLL_MLOCK bit. But because that bit was only set for the VM_LOCKED case (and not all vma's necessarily have it, even for mlock()), we couldn't do that originally. Fix that by moving the VM_LOCKED check deeper into the call-chain, which actually simplifies many things. Now mlock() gets simpler, and we can also check for FOLL_MLOCK in __get_user_pages() and the code ends up much more straightforward. Reported-and-reviewed-by: Michel Lespinasse Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- mm/memory.c | 7 +++---- mm/mlock.c | 5 +---- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 607098d4..27f4253 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1359,7 +1359,7 @@ split_fallthrough: */ mark_page_accessed(page); } - if (flags & FOLL_MLOCK) { + if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { /* * The preliminary mapping check is mainly to avoid the * pointless overhead of lock_page on the ZERO_PAGE @@ -1552,10 +1552,9 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, } /* - * If we don't actually want the page itself, - * and it's the stack guard page, just skip it. + * For mlock, just skip the stack guard page. */ - if (!pages && stack_guard_page(vma, start)) + if ((gup_flags & FOLL_MLOCK) && stack_guard_page(vma, start)) goto next_page; do { diff --git a/mm/mlock.c b/mm/mlock.c index 6b55e3e..516b2c2 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -162,7 +162,7 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma, VM_BUG_ON(end > vma->vm_end); VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); - gup_flags = FOLL_TOUCH; + gup_flags = FOLL_TOUCH | FOLL_MLOCK; /* * We want to touch writable mappings with a write fault in order * to break COW, except for shared mappings because these don't COW @@ -178,9 +178,6 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma, if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)) gup_flags |= FOLL_FORCE; - if (vma->vm_flags & VM_LOCKED) - gup_flags |= FOLL_MLOCK; - return __get_user_pages(current, mm, addr, nr_pages, gup_flags, NULL, NULL, nonblocking); } -- cgit v1.1 From e05b2efb82596905ebfe88e8612ee81dec9b6592 Mon Sep 17 00:00:00 2001 From: john stultz Date: Wed, 4 May 2011 18:16:50 -0700 Subject: clocksource: Install completely before selecting Christian Hoffmann reported that the command line clocksource override with acpi_pm timer fails: Kernel command line: clocksource=acpi_pm hpet clockevent registered Switching to clocksource hpet Override clocksource acpi_pm is not HRT compatible. Cannot switch while in HRT/NOHZ mode. The watchdog code is what enables CLOCK_SOURCE_VALID_FOR_HRES, but we actually end up selecting the clocksource before we enqueue it into the watchdog list, so that's why we see the warning and fail to switch to acpi_pm timer as requested. That's particularly bad when we want to debug timekeeping related problems in early boot. Put the selection call last. Reported-by: Christian Hoffmann Signed-off-by: John Stultz Cc: stable@kernel.org # 32... Link: http://lkml.kernel.org/r/%3C1304558210.2943.24.camel%40work-vm%3E Signed-off-by: Thomas Gleixner --- kernel/time/clocksource.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 6519cf62..0e17c10 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -685,8 +685,8 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) /* Add clocksource to the clcoksource list */ mutex_lock(&clocksource_mutex); clocksource_enqueue(cs); - clocksource_select(); clocksource_enqueue_watchdog(cs); + clocksource_select(); mutex_unlock(&clocksource_mutex); return 0; } @@ -706,8 +706,8 @@ int clocksource_register(struct clocksource *cs) mutex_lock(&clocksource_mutex); clocksource_enqueue(cs); - clocksource_select(); clocksource_enqueue_watchdog(cs); + clocksource_select(); mutex_unlock(&clocksource_mutex); return 0; } -- cgit v1.1 From 1dbe7dada2d03d1313183d439068f1f951a91244 Mon Sep 17 00:00:00 2001 From: Alexander Stein Date: Thu, 5 May 2011 08:40:14 -0700 Subject: Input: ads7846 - make transfer buffers DMA safe req.sample needs its own cacheline otherwise accessing req.msg fetches it in again. Note: This effect doesn't occur if the underlying SPI driver doesn't use DMA at all. Signed-off-by: Alexander Stein Acked-by: Jonathan Cameron Signed-off-by: Dmitry Torokhov --- drivers/input/touchscreen/ads7846.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/input/touchscreen/ads7846.c b/drivers/input/touchscreen/ads7846.c index c24946f..ab51a8db 100644 --- a/drivers/input/touchscreen/ads7846.c +++ b/drivers/input/touchscreen/ads7846.c @@ -281,17 +281,25 @@ struct ser_req { u8 command; u8 ref_off; u16 scratch; - __be16 sample; struct spi_message msg; struct spi_transfer xfer[6]; + /* + * DMA (thus cache coherency maintenance) requires the + * transfer buffers to live in their own cache lines. + */ + __be16 sample ____cacheline_aligned; }; struct ads7845_ser_req { u8 command[3]; u8 pwrdown[3]; - u8 sample[3]; struct spi_message msg; struct spi_transfer xfer[2]; + /* + * DMA (thus cache coherency maintenance) requires the + * transfer buffers to live in their own cache lines. + */ + u8 sample[3] ____cacheline_aligned; }; static int ads7846_read12_ser(struct device *dev, unsigned command) -- cgit v1.1 From 28350e330cfab46b60a1dbf763b678d859f9f3d9 Mon Sep 17 00:00:00 2001 From: Alexander Stein Date: Thu, 5 May 2011 08:40:46 -0700 Subject: Input: ads7846 - remove unused variable from struct ads7845_ser_req Signed-off-by: Alexander Stein Signed-off-by: Dmitry Torokhov --- drivers/input/touchscreen/ads7846.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/input/touchscreen/ads7846.c b/drivers/input/touchscreen/ads7846.c index ab51a8db..1de1c19 100644 --- a/drivers/input/touchscreen/ads7846.c +++ b/drivers/input/touchscreen/ads7846.c @@ -292,7 +292,6 @@ struct ser_req { struct ads7845_ser_req { u8 command[3]; - u8 pwrdown[3]; struct spi_message msg; struct spi_transfer xfer[2]; /* -- cgit v1.1 From 75bd0cbdc21d80859c80bdd5dd00125c1a3ccbca Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 28 Apr 2011 22:37:09 +0000 Subject: usbnet: runtime pm: fix out of memory This patch makes use of the EVENT_DEV_OPEN flag introduced recently to fix one out of memory issue, which can be reproduced on omap3/4 based pandaboard/beagle XM easily with steps below: - enable runtime pm echo auto > /sys/devices/platform/usbhs-omap.0/ehci-omap.0/usb1/1-1/1-1.1/power/control - ifconfig eth0 up - then out of memroy happened, see [1] for kernel message. Follows my analysis: - 'ifconfig eth0 up' brings eth0 out of suspend, and usbnet_resume is called to schedule dev->bh, then rx urbs are submited to prepare for recieving data; - some usbnet devices will produce garbage rx packets flood if info->reset is not called in usbnet_open. - so there is no enough chances for usbnet_bh to handle and release recieved skb buffers since many rx interrupts consumes cpu, so out of memory for atomic allocation in rx_submit happened. This patch fixes the issue by simply not allowing schedule of usbnet_bh until device is opened. [1], dmesg [ 234.712005] smsc95xx 1-1.1:1.0: rpm_resume flags 0x4 [ 234.712066] usb 1-1.1: rpm_resume flags 0x0 [ 234.712066] usb 1-1: rpm_resume flags 0x0 [ 234.712097] usb usb1: rpm_resume flags 0x0 [ 234.712127] usb usb1: usb auto-resume [ 234.712158] ehci-omap ehci-omap.0: resume root hub [ 234.754028] hub 1-0:1.0: hub_resume [ 234.754821] hub 1-0:1.0: port 1: status 0507 change 0000 [ 234.756011] hub 1-0:1.0: state 7 ports 3 chg 0000 evt 0000 [ 234.756042] hub 1-0:1.0: rpm_resume flags 0x4 [ 234.756072] usb usb1: rpm_resume flags 0x0 [ 234.756164] usb usb1: rpm_resume returns 1 [ 234.756195] hub 1-0:1.0: rpm_resume returns 0 [ 234.756195] hub 1-0:1.0: rpm_suspend flags 0x4 [ 234.756225] hub 1-0:1.0: rpm_suspend returns 0 [ 234.756256] usb usb1: rpm_resume returns 0 [ 234.757141] usb 1-1: usb auto-resume [ 234.793151] ehci-omap ehci-omap.0: GetStatus port:1 status 001005 0 ACK POWER sig=se0 PE CONNECT [ 234.816558] usb 1-1: finish resume [ 234.817871] hub 1-1:1.0: hub_resume [ 234.818420] hub 1-1:1.0: port 1: status 0507 change 0000 [ 234.820495] ehci-omap ehci-omap.0: reused qh eec50220 schedule [ 234.820495] usb 1-1: link qh256-0001/eec50220 start 1 [1/0 us] [ 234.820587] usb 1-1: rpm_resume returns 0 [ 234.820800] hub 1-1:1.0: state 7 ports 5 chg 0000 evt 0000 [ 234.820800] hub 1-1:1.0: rpm_resume flags 0x4 [ 234.820831] hub 1-1:1.0: rpm_resume returns 0 [ 234.820861] hub 1-1:1.0: rpm_suspend flags 0x4 [ 234.820861] hub 1-1:1.0: rpm_suspend returns 0 [ 234.821777] usb 1-1.1: usb auto-resume [ 234.868591] hub 1-1:1.0: state 7 ports 5 chg 0000 evt 0002 [ 234.868591] hub 1-1:1.0: rpm_resume flags 0x4 [ 234.868621] hub 1-1:1.0: rpm_resume returns 0 [ 234.868652] hub 1-1:1.0: rpm_suspend flags 0x4 [ 234.868652] hub 1-1:1.0: rpm_suspend returns 0 [ 234.879486] usb 1-1.1: finish resume [ 234.880279] usb 1-1.1: rpm_resume returns 0 [ 234.880310] smsc95xx 1-1.1:1.0: rpm_resume returns 0 [ 238.880187] ksoftirqd/0: page allocation failure. order:0, mode:0x20 [ 238.880218] Backtrace: [ 238.880249] [] (dump_backtrace+0x0/0xf8) from [] (dump_stack+0x18/0x1c) [ 238.880249] r6:00000000 r5:00000000 r4:00000020 r3:00000002 [ 238.880310] [] (dump_stack+0x0/0x1c) from [] (__alloc_pages_nodemask+0x620/0x724) [ 238.880340] [] (__alloc_pages_nodemask+0x0/0x724) from [] (kmem_getpages.clone.34+0x34/0xc8) [ 238.880371] [] (kmem_getpages.clone.34+0x0/0xc8) from [] (cache_grow.clone.42+0x84/0x154) [ 238.880371] r6:ef871aa4 r5:ef871a80 r4:ef81fd40 r3:00000020 [ 238.880401] [] (cache_grow.clone.42+0x0/0x154) from [] (cache_alloc_refill+0x19c/0x1f0) [ 238.880432] [] (cache_alloc_refill+0x0/0x1f0) from [] (kmem_cache_alloc+0x90/0x190) [ 238.880462] [] (kmem_cache_alloc+0x0/0x190) from [] (__alloc_skb+0x34/0xe8) [ 238.880493] [] (__alloc_skb+0x0/0xe8) from [] (rx_submit+0x2c/0x1d4 [usbnet]) [ 238.880523] [] (rx_submit+0x0/0x1d4 [usbnet]) from [] (rx_complete+0x19c/0x1b0 [usbnet]) [ 238.880737] [] (rx_complete+0x0/0x1b0 [usbnet]) from [] (usb_hcd_giveback_urb+0xa8/0xf4 [usbcore]) [ 238.880737] r8:eeeced34 r7:eeecec00 r6:eeecec00 r5:00000000 r4:eec2dd20 [ 238.880767] r3:bf050b9c [ 238.880859] [] (usb_hcd_giveback_urb+0x0/0xf4 [usbcore]) from [] (ehci_urb_done+0xb0/0xbc [ehci_hcd]) [ 238.880859] r6:00000000 r5:eec2dd20 r4:eeeced44 r3:eec2dd34 [ 238.880920] [] (ehci_urb_done+0x0/0xbc [ehci_hcd]) from [] (qh_completions+0x308/0x3bc [ehci_hcd]) [ 238.880920] r7:00000000 r6:eeda21a0 r5:ffdfe3c0 r4:eeda21ac [ 238.880981] [] (qh_completions+0x0/0x3bc [ehci_hcd]) from [] (scan_async+0xb0/0x16c [ehci_hcd]) [ 238.881011] [] (scan_async+0x0/0x16c [ehci_hcd]) from [] (ehci_work+0x38/0x90 [ehci_hcd]) [ 238.881042] [] (ehci_work+0x0/0x90 [ehci_hcd]) from [] (ehci_irq+0x300/0x34c [ehci_hcd]) [ 238.881072] r4:eeeced34 r3:00000001 [ 238.881134] [] (ehci_irq+0x0/0x34c [ehci_hcd]) from [] (usb_hcd_irq+0x40/0xac [usbcore]) [ 238.881195] [] (usb_hcd_irq+0x0/0xac [usbcore]) from [] (handle_irq_event_percpu+0xb8/0x240) [ 238.881225] r6:eec504e0 r5:0000006d r4:eec504e0 r3:bf0067e8 [ 238.881256] [] (handle_irq_event_percpu+0x0/0x240) from [] (handle_irq_event+0x44/0x64) [ 238.881256] [] (handle_irq_event+0x0/0x64) from [] (handle_level_irq+0xe0/0x114) [ 238.881286] r6:0000006d r5:c080c14c r4:c080c100 r3:00020000 [ 238.881317] [] (handle_level_irq+0x0/0x114) from [] (asm_do_IRQ+0x90/0xd0) [ 238.881317] r5:00000000 r4:0000006d [ 238.881347] [] (asm_do_IRQ+0x0/0xd0) from [] (__irq_svc+0x50/0x134) [ 238.881378] Exception stack(0xef837e20 to 0xef837e68) [ 238.881378] 7e20: 00000001 00185610 016cc000 c00490c0 eb380000 ef800540 00000020 00004ae0 [ 238.881408] 7e40: 00000020 bf0509f4 60000013 ef837e9c ef837e40 ef837e68 c0226f0c c0298ca0 [ 238.881408] 7e60: 20000013 ffffffff [ 238.881408] r5:fa240100 r4:ffffffff [ 238.881439] [] (__kmalloc_track_caller+0x0/0x1d0) from [] (__alloc_skb+0x58/0xe8) [ 238.881469] [] (__alloc_skb+0x0/0xe8) from [] (rx_submit+0x2c/0x1d4 [usbnet]) [ 238.881500] [] (rx_submit+0x0/0x1d4 [usbnet]) from [] (usbnet_bh+0x1b4/0x250 [usbnet]) [ 238.881530] [] (usbnet_bh+0x0/0x250 [usbnet]) from [] (tasklet_action+0xb0/0x1f8) [ 238.881530] r6:00000000 r5:ef9757f0 r4:ef9757ec r3:bf051224 [ 238.881561] [] (tasklet_action+0x0/0x1f8) from [] (__do_softirq+0x140/0x290) [ 238.881561] r8:00000006 r7:00000101 r6:00000000 r5:c0806098 r4:00000001 [ 238.881591] r3:c01f907c [ 238.881622] [] (__do_softirq+0x0/0x290) from [] (run_ksoftirqd+0xd0/0x1f4) [ 238.881622] [] (run_ksoftirqd+0x0/0x1f4) from [] (kthread+0x90/0x98) [ 238.881652] r7:00000013 r6:c01f98fc r5:00000000 r4:ef831efc [ 238.881683] [] (kthread+0x0/0x98) from [] (do_exit+0x0/0x374) [ 238.881713] r6:c01f62f4 r5:c0211320 r4:ef831efc [ 238.881713] Mem-info: [ 238.881744] Normal per-cpu: [ 238.881744] CPU 0: hi: 186, btch: 31 usd: 38 [ 238.881744] CPU 1: hi: 186, btch: 31 usd: 169 [ 238.881774] HighMem per-cpu: [ 238.881774] CPU 0: hi: 90, btch: 15 usd: 66 [ 238.881774] CPU 1: hi: 90, btch: 15 usd: 86 [ 238.881805] active_anon:544 inactive_anon:71 isolated_anon:0 [ 238.881805] active_file:926 inactive_file:2538 isolated_file:0 [ 238.881805] unevictable:0 dirty:10 writeback:0 unstable:0 [ 238.881805] free:57782 slab_reclaimable:864 slab_unreclaimable:186898 [ 238.881805] mapped:632 shmem:144 pagetables:50 bounce:0 [ 238.881835] Normal free:1328kB min:3532kB low:4412kB high:5296kB active_anon:0kB inactive_anon:0kB active_file:880kB inactive_file:848kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:780288kB mlocked:0kB dirty:36kB writeback:0kB mapped:0kB shmem:0kB slab_reclaimable:3456kB slab_unreclaimable:747592kB kernel_stack:392kB pagetables:200kB unstable:0kB bounce:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? no [ 238.881866] lowmem_reserve[]: 0 1904 1904 [ 238.881896] HighMem free:229800kB min:236kB low:508kB high:784kB active_anon:2176kB inactive_anon:284kB active_file:2824kB inactive_file:9304kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:243712kB mlocked:0kB dirty:4kB writeback:0kB mapped:2528kB shmem:576kB slab_reclaimable:0kB slab_unreclaimable:0kB kernel_stack:0kB pagetables:0kB unstable:0kB bounce:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? no [ 238.881927] lowmem_reserve[]: 0 0 0 [ 238.881958] Normal: 0*4kB 4*8kB 6*16kB 0*32kB 1*64kB 1*128kB 0*256kB 2*512kB 0*1024kB 0*2048kB 0*4096kB = 1344kB [ 238.882019] HighMem: 6*4kB 2*8kB 4*16kB 4*32kB 1*64kB 1*128kB 0*256kB 2*512kB 3*1024kB 0*2048kB 55*4096kB = 229800kB [ 238.882080] 3610 total pagecache pages [ 238.882080] 0 pages in swap cache [ 238.882080] Swap cache stats: add 0, delete 0, find 0/0 [ 238.882110] Free swap = 0kB [ 238.882110] Total swap = 0kB [ 238.933776] 262144 pages of RAM [ 238.933776] 58240 free pages [ 238.933776] 10503 reserved pages [ 238.933776] 187773 slab pages [ 238.933807] 2475 pages shared [ 238.933807] 0 pages swap cached Signed-off-by: Ming Lei Acked-by: Oliver Neukum Signed-off-by: David S. Miller --- drivers/net/usb/usbnet.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c index 009bba3..9ab439d 100644 --- a/drivers/net/usb/usbnet.c +++ b/drivers/net/usb/usbnet.c @@ -645,6 +645,7 @@ int usbnet_stop (struct net_device *net) struct driver_info *info = dev->driver_info; int retval; + clear_bit(EVENT_DEV_OPEN, &dev->flags); netif_stop_queue (net); netif_info(dev, ifdown, dev->net, @@ -1524,9 +1525,12 @@ int usbnet_resume (struct usb_interface *intf) smp_mb(); clear_bit(EVENT_DEV_ASLEEP, &dev->flags); spin_unlock_irq(&dev->txq.lock); - if (!(dev->txq.qlen >= TX_QLEN(dev))) - netif_start_queue(dev->net); - tasklet_schedule (&dev->bh); + + if (test_bit(EVENT_DEV_OPEN, &dev->flags)) { + if (!(dev->txq.qlen >= TX_QLEN(dev))) + netif_start_queue(dev->net); + tasklet_schedule (&dev->bh); + } } return 0; } -- cgit v1.1 From 87e9af6cc67d842cd92b52b81f3f14e665e7ab05 Mon Sep 17 00:00:00 2001 From: Kurt Van Dijck Date: Mon, 2 May 2011 04:50:48 +0000 Subject: can: fix SJA1000 dlc for RTR packets RTR frames do have a valid data length code on CAN. The driver for SJA1000 did not handle that situation properly. Signed-off-by: Kurt Van Dijck Acked-by: Marc Kleine-Budde Signed-off-by: David S. Miller --- drivers/net/can/sja1000/sja1000.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/can/sja1000/sja1000.c b/drivers/net/can/sja1000/sja1000.c index a358ea9..f501bba 100644 --- a/drivers/net/can/sja1000/sja1000.c +++ b/drivers/net/can/sja1000/sja1000.c @@ -346,10 +346,10 @@ static void sja1000_rx(struct net_device *dev) | (priv->read_reg(priv, REG_ID2) >> 5); } + cf->can_dlc = get_can_dlc(fi & 0x0F); if (fi & FI_RTR) { id |= CAN_RTR_FLAG; } else { - cf->can_dlc = get_can_dlc(fi & 0x0F); for (i = 0; i < cf->can_dlc; i++) cf->data[i] = priv->read_reg(priv, dreg++); } -- cgit v1.1 From a00e0d714fbded07a7a2254391ce9ed5a5cb9d82 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 8 Feb 2011 17:14:39 -0800 Subject: rcu: Remove conditional compilation for RCU CPU stall warnings The RCU CPU stall warnings can now be controlled using the rcu_cpu_stall_suppress boot-time parameter or via the same parameter from sysfs. There is therefore no longer any reason to have kernel config parameters for this feature. This commit therefore removes the RCU_CPU_STALL_DETECTOR and RCU_CPU_STALL_DETECTOR_RUNNABLE kernel config parameters. The RCU_CPU_STALL_TIMEOUT parameter remains to allow the timeout to be tuned and the RCU_CPU_STALL_VERBOSE parameter remains to allow task-stall information to be suppressed if desired. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- Documentation/RCU/00-INDEX | 2 +- Documentation/RCU/stallwarn.txt | 23 +++++++++++++---------- kernel/rcutree.c | 26 +------------------------- kernel/rcutree.h | 12 ------------ kernel/rcutree_plugin.h | 12 ------------ lib/Kconfig.debug | 30 ++---------------------------- 6 files changed, 17 insertions(+), 88 deletions(-) diff --git a/Documentation/RCU/00-INDEX b/Documentation/RCU/00-INDEX index 71b6f50..1d7a885 100644 --- a/Documentation/RCU/00-INDEX +++ b/Documentation/RCU/00-INDEX @@ -21,7 +21,7 @@ rcu.txt RTFP.txt - List of RCU papers (bibliography) going back to 1980. stallwarn.txt - - RCU CPU stall warnings (CONFIG_RCU_CPU_STALL_DETECTOR) + - RCU CPU stall warnings (module parameter rcu_cpu_stall_suppress) torture.txt - RCU Torture Test Operation (CONFIG_RCU_TORTURE_TEST) trace.txt diff --git a/Documentation/RCU/stallwarn.txt b/Documentation/RCU/stallwarn.txt index 862c08e..4e95920 100644 --- a/Documentation/RCU/stallwarn.txt +++ b/Documentation/RCU/stallwarn.txt @@ -1,22 +1,25 @@ Using RCU's CPU Stall Detector -The CONFIG_RCU_CPU_STALL_DETECTOR kernel config parameter enables -RCU's CPU stall detector, which detects conditions that unduly delay -RCU grace periods. The stall detector's idea of what constitutes -"unduly delayed" is controlled by a set of C preprocessor macros: +The rcu_cpu_stall_suppress module parameter enables RCU's CPU stall +detector, which detects conditions that unduly delay RCU grace periods. +This module parameter enables CPU stall detection by default, but +may be overridden via boot-time parameter or at runtime via sysfs. +The stall detector's idea of what constitutes "unduly delayed" is +controlled by a set of kernel configuration variables and cpp macros: -RCU_SECONDS_TILL_STALL_CHECK +CONFIG_RCU_CPU_STALL_TIMEOUT - This macro defines the period of time that RCU will wait from - the beginning of a grace period until it issues an RCU CPU - stall warning. This time period is normally ten seconds. + This kernel configuration parameter defines the period of time + that RCU will wait from the beginning of a grace period until it + issues an RCU CPU stall warning. This time period is normally + ten seconds. RCU_SECONDS_TILL_STALL_RECHECK This macro defines the period of time that RCU will wait after issuing a stall warning until it issues another stall warning - for the same stall. This time period is normally set to thirty - seconds. + for the same stall. This time period is normally set to three + times the check interval plus thirty seconds. RCU_STALL_RAT_DELAY diff --git a/kernel/rcutree.c b/kernel/rcutree.c index dd4aea8..18f7a59 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -140,10 +140,8 @@ module_param(blimit, int, 0); module_param(qhimark, int, 0); module_param(qlowmark, int, 0); -#ifdef CONFIG_RCU_CPU_STALL_DETECTOR -int rcu_cpu_stall_suppress __read_mostly = RCU_CPU_STALL_SUPPRESS_INIT; +int rcu_cpu_stall_suppress __read_mostly; module_param(rcu_cpu_stall_suppress, int, 0644); -#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ static void force_quiescent_state(struct rcu_state *rsp, int relaxed); static int rcu_pending(int cpu); @@ -450,8 +448,6 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) #endif /* #else #ifdef CONFIG_NO_HZ */ -#ifdef CONFIG_RCU_CPU_STALL_DETECTOR - int rcu_cpu_stall_suppress __read_mostly; static void record_gp_stall_check_time(struct rcu_state *rsp) @@ -587,26 +583,6 @@ static void __init check_cpu_stall_init(void) atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block); } -#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ - -static void record_gp_stall_check_time(struct rcu_state *rsp) -{ -} - -static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) -{ -} - -void rcu_cpu_stall_reset(void) -{ -} - -static void __init check_cpu_stall_init(void) -{ -} - -#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ - /* * Update CPU-local rcu_data state to record the newly noticed grace period. * This is used both when we started the grace period and when we notice diff --git a/kernel/rcutree.h b/kernel/rcutree.h index e8f057e..e1a6663 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -254,7 +254,6 @@ struct rcu_data { #endif /* #else #ifdef CONFIG_NO_HZ */ #define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ -#ifdef CONFIG_RCU_CPU_STALL_DETECTOR #ifdef CONFIG_PROVE_RCU #define RCU_STALL_DELAY_DELTA (5 * HZ) @@ -272,13 +271,6 @@ struct rcu_data { /* scheduling clock irq */ /* before ratting on them. */ -#ifdef CONFIG_RCU_CPU_STALL_DETECTOR_RUNNABLE -#define RCU_CPU_STALL_SUPPRESS_INIT 0 -#else -#define RCU_CPU_STALL_SUPPRESS_INIT 1 -#endif - -#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ /* * RCU global state, including node hierarchy. This hierarchy is @@ -325,12 +317,10 @@ struct rcu_state { /* due to lock unavailable. */ unsigned long n_force_qs_ngp; /* Number of calls leaving */ /* due to no GP active. */ -#ifdef CONFIG_RCU_CPU_STALL_DETECTOR unsigned long gp_start; /* Time at which GP started, */ /* but in jiffies. */ unsigned long jiffies_stall; /* Time at which to check */ /* for CPU stalls. */ -#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ char *name; /* Name of structure. */ }; @@ -366,11 +356,9 @@ static int rcu_preempted_readers(struct rcu_node *rnp); static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags); #endif /* #ifdef CONFIG_HOTPLUG_CPU */ -#ifdef CONFIG_RCU_CPU_STALL_DETECTOR static void rcu_print_detail_task_stall(struct rcu_state *rsp); static void rcu_print_task_stall(struct rcu_node *rnp); static void rcu_preempt_stall_reset(void); -#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); #ifdef CONFIG_HOTPLUG_CPU static int rcu_preempt_offline_tasks(struct rcu_state *rsp, diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index a363871..38426ef 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -54,10 +54,6 @@ static void __init rcu_bootup_announce_oddness(void) #ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE printk(KERN_INFO "\tRCU torture testing starts during boot.\n"); #endif -#ifndef CONFIG_RCU_CPU_STALL_DETECTOR - printk(KERN_INFO - "\tRCU-based detection of stalled CPUs is disabled.\n"); -#endif #if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE) printk(KERN_INFO "\tVerbose stalled-CPUs detection is disabled.\n"); #endif @@ -356,8 +352,6 @@ void __rcu_read_unlock(void) } EXPORT_SYMBOL_GPL(__rcu_read_unlock); -#ifdef CONFIG_RCU_CPU_STALL_DETECTOR - #ifdef CONFIG_RCU_CPU_STALL_VERBOSE /* @@ -430,8 +424,6 @@ static void rcu_preempt_stall_reset(void) rcu_preempt_state.jiffies_stall = jiffies + ULONG_MAX / 2; } -#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ - /* * Check that the list of blocked tasks for the newly completed grace * period is in fact empty. It is a serious bug to complete a grace @@ -862,8 +854,6 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) #endif /* #ifdef CONFIG_HOTPLUG_CPU */ -#ifdef CONFIG_RCU_CPU_STALL_DETECTOR - /* * Because preemptable RCU does not exist, we never have to check for * tasks blocked within RCU read-side critical sections. @@ -888,8 +878,6 @@ static void rcu_preempt_stall_reset(void) { } -#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ - /* * Because there is no preemptable RCU, there can be no readers blocked, * so there is no need to check for blocked tasks. So check only for diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index c768bcd..93ce6de3 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -875,22 +875,9 @@ config RCU_TORTURE_TEST_RUNNABLE Say N here if you want the RCU torture tests to start only after being manually enabled via /proc. -config RCU_CPU_STALL_DETECTOR - bool "Check for stalled CPUs delaying RCU grace periods" - depends on TREE_RCU || TREE_PREEMPT_RCU - default y - help - This option causes RCU to printk information on which - CPUs are delaying the current grace period, but only when - the grace period extends for excessive time periods. - - Say N if you want to disable such checks. - - Say Y if you are unsure. - config RCU_CPU_STALL_TIMEOUT int "RCU CPU stall timeout in seconds" - depends on RCU_CPU_STALL_DETECTOR + depends on TREE_RCU || TREE_PREEMPT_RCU range 3 300 default 60 help @@ -899,22 +886,9 @@ config RCU_CPU_STALL_TIMEOUT RCU grace period persists, additional CPU stall warnings are printed at more widely spaced intervals. -config RCU_CPU_STALL_DETECTOR_RUNNABLE - bool "RCU CPU stall checking starts automatically at boot" - depends on RCU_CPU_STALL_DETECTOR - default y - help - If set, start checking for RCU CPU stalls immediately on - boot. Otherwise, RCU CPU stall checking must be manually - enabled. - - Say Y if you are unsure. - - Say N if you wish to suppress RCU CPU stall checking during boot. - config RCU_CPU_STALL_VERBOSE bool "Print additional per-task information for RCU_CPU_STALL_DETECTOR" - depends on RCU_CPU_STALL_DETECTOR && TREE_PREEMPT_RCU + depends on TREE_PREEMPT_RCU default y help This option causes RCU to printk detailed per-task information -- cgit v1.1 From e59fb3120becfb36b22ddb8bd27d065d3cdca499 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 7 Sep 2010 10:38:22 -0700 Subject: rcu: Decrease memory-barrier usage based on semi-formal proof Commit d09b62d fixed grace-period synchronization, but left some smp_mb() invocations in rcu_process_callbacks() that are no longer needed, but sheer paranoia prevented them from being removed. This commit removes them and provides a proof of correctness in their absence. It also adds a memory barrier to rcu_report_qs_rsp() immediately before the update to rsp->completed in order to handle the theoretical possibility that the compiler or CPU might move massive quantities of code into a lock-based critical section. This also proves that the sheer paranoia was not entirely unjustified, at least from a theoretical point of view. In addition, the old dyntick-idle synchronization depended on the fact that grace periods were many milliseconds in duration, so that it could be assumed that no dyntick-idle CPU could reorder a memory reference across an entire grace period. Unfortunately for this design, the addition of expedited grace periods breaks this assumption, which has the unfortunate side-effect of requiring atomic operations in the functions that track dyntick-idle state for RCU. (There is some hope that the algorithms used in user-level RCU might be applied here, but some work is required to handle the NMIs that user-space applications can happily ignore. For the short term, better safe than sorry.) This proof assumes that neither compiler nor CPU will allow a lock acquisition and release to be reordered, as doing so can result in deadlock. The proof is as follows: 1. A given CPU declares a quiescent state under the protection of its leaf rcu_node's lock. 2. If there is more than one level of rcu_node hierarchy, the last CPU to declare a quiescent state will also acquire the ->lock of the next rcu_node up in the hierarchy, but only after releasing the lower level's lock. The acquisition of this lock clearly cannot occur prior to the acquisition of the leaf node's lock. 3. Step 2 repeats until we reach the root rcu_node structure. Please note again that only one lock is held at a time through this process. The acquisition of the root rcu_node's ->lock must occur after the release of that of the leaf rcu_node. 4. At this point, we set the ->completed field in the rcu_state structure in rcu_report_qs_rsp(). However, if the rcu_node hierarchy contains only one rcu_node, then in theory the code preceding the quiescent state could leak into the critical section. We therefore precede the update of ->completed with a memory barrier. All CPUs will therefore agree that any updates preceding any report of a quiescent state will have happened before the update of ->completed. 5. Regardless of whether a new grace period is needed, rcu_start_gp() will propagate the new value of ->completed to all of the leaf rcu_node structures, under the protection of each rcu_node's ->lock. If a new grace period is needed immediately, this propagation will occur in the same critical section that ->completed was set in, but courtesy of the memory barrier in #4 above, is still seen to follow any pre-quiescent-state activity. 6. When a given CPU invokes __rcu_process_gp_end(), it becomes aware of the end of the old grace period and therefore makes any RCU callbacks that were waiting on that grace period eligible for invocation. If this CPU is the same one that detected the end of the grace period, and if there is but a single rcu_node in the hierarchy, we will still be in the single critical section. In this case, the memory barrier in step #4 guarantees that all callbacks will be seen to execute after each CPU's quiescent state. On the other hand, if this is a different CPU, it will acquire the leaf rcu_node's ->lock, and will again be serialized after each CPU's quiescent state for the old grace period. On the strength of this proof, this commit therefore removes the memory barriers from rcu_process_callbacks() and adds one to rcu_report_qs_rsp(). The effect is to reduce the number of memory barriers by one and to reduce the frequency of execution from about once per scheduling tick per CPU to once per grace period. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- Documentation/RCU/trace.txt | 48 +++++++--------- kernel/rcutree.c | 130 +++++++++++++++++++------------------------- kernel/rcutree.h | 9 +-- kernel/rcutree_plugin.h | 7 +-- kernel/rcutree_trace.c | 12 ++-- 5 files changed, 88 insertions(+), 118 deletions(-) diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt index 6a8c73f..e731ad2 100644 --- a/Documentation/RCU/trace.txt +++ b/Documentation/RCU/trace.txt @@ -21,23 +21,23 @@ rcu_pending() function decided that there was core RCU work to do). The output of "cat rcu/rcudata" looks as follows: rcu_sched: - 0 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=10951/1 dn=0 df=1101 of=0 ri=36 ql=0 b=10 - 1 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=16117/1 dn=0 df=1015 of=0 ri=0 ql=0 b=10 - 2 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=1445/1 dn=0 df=1839 of=0 ri=0 ql=0 b=10 - 3 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=6681/1 dn=0 df=1545 of=0 ri=0 ql=0 b=10 - 4 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=1003/1 dn=0 df=1992 of=0 ri=0 ql=0 b=10 - 5 c=17829 g=17830 pq=1 pqc=17829 qp=1 dt=3887/1 dn=0 df=3331 of=0 ri=4 ql=2 b=10 - 6 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=859/1 dn=0 df=3224 of=0 ri=0 ql=0 b=10 - 7 c=17829 g=17830 pq=0 pqc=17829 qp=1 dt=3761/1 dn=0 df=1818 of=0 ri=0 ql=2 b=10 + 0 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=10951/1/0 df=1101 of=0 ri=36 ql=0 b=10 + 1 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=16117/1/0 df=1015 of=0 ri=0 ql=0 b=10 + 2 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=1445/1/0 df=1839 of=0 ri=0 ql=0 b=10 + 3 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=6681/1/0 df=1545 of=0 ri=0 ql=0 b=10 + 4 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=1003/1/0 df=1992 of=0 ri=0 ql=0 b=10 + 5 c=17829 g=17830 pq=1 pqc=17829 qp=1 dt=3887/1/0 df=3331 of=0 ri=4 ql=2 b=10 + 6 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=859/1/0 df=3224 of=0 ri=0 ql=0 b=10 + 7 c=17829 g=17830 pq=0 pqc=17829 qp=1 dt=3761/1/0 df=1818 of=0 ri=0 ql=2 b=10 rcu_bh: - 0 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=10951/1 dn=0 df=0 of=0 ri=0 ql=0 b=10 - 1 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=16117/1 dn=0 df=13 of=0 ri=0 ql=0 b=10 - 2 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=1445/1 dn=0 df=15 of=0 ri=0 ql=0 b=10 - 3 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=6681/1 dn=0 df=9 of=0 ri=0 ql=0 b=10 - 4 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=1003/1 dn=0 df=15 of=0 ri=0 ql=0 b=10 - 5 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=3887/1 dn=0 df=15 of=0 ri=0 ql=0 b=10 - 6 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=859/1 dn=0 df=15 of=0 ri=0 ql=0 b=10 - 7 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=3761/1 dn=0 df=15 of=0 ri=0 ql=0 b=10 + 0 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=10951/1/0 df=0 of=0 ri=0 ql=0 b=10 + 1 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=16117/1/0 df=13 of=0 ri=0 ql=0 b=10 + 2 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=1445/1/0 df=15 of=0 ri=0 ql=0 b=10 + 3 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=6681/1/0 df=9 of=0 ri=0 ql=0 b=10 + 4 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=1003/1/0 df=15 of=0 ri=0 ql=0 b=10 + 5 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=3887/1/0 df=15 of=0 ri=0 ql=0 b=10 + 6 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=859/1/0 df=15 of=0 ri=0 ql=0 b=10 + 7 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=3761/1/0 df=15 of=0 ri=0 ql=0 b=10 The first section lists the rcu_data structures for rcu_sched, the second for rcu_bh. Note that CONFIG_TREE_PREEMPT_RCU kernels will have an @@ -85,18 +85,10 @@ o "qp" indicates that RCU still expects a quiescent state from o "dt" is the current value of the dyntick counter that is incremented when entering or leaving dynticks idle state, either by the - scheduler or by irq. The number after the "/" is the interrupt - nesting depth when in dyntick-idle state, or one greater than - the interrupt-nesting depth otherwise. - - This field is displayed only for CONFIG_NO_HZ kernels. - -o "dn" is the current value of the dyntick counter that is incremented - when entering or leaving dynticks idle state via NMI. If both - the "dt" and "dn" values are even, then this CPU is in dynticks - idle mode and may be ignored by RCU. If either of these two - counters is odd, then RCU must be alert to the possibility of - an RCU read-side critical section running on this CPU. + scheduler or by irq. The number after the first "/" is the + interrupt nesting depth when in dyntick-idle state, or one + greater than the interrupt-nesting depth otherwise. The number + after the second "/" is the NMI nesting depth. This field is displayed only for CONFIG_NO_HZ kernels. diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 18f7a59..90104a1 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -128,7 +128,7 @@ void rcu_note_context_switch(int cpu) #ifdef CONFIG_NO_HZ DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { .dynticks_nesting = 1, - .dynticks = 1, + .dynticks = ATOMIC_INIT(1), }; #endif /* #ifdef CONFIG_NO_HZ */ @@ -262,13 +262,25 @@ void rcu_enter_nohz(void) unsigned long flags; struct rcu_dynticks *rdtp; - smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */ local_irq_save(flags); rdtp = &__get_cpu_var(rcu_dynticks); - rdtp->dynticks++; - rdtp->dynticks_nesting--; - WARN_ON_ONCE(rdtp->dynticks & 0x1); + if (--rdtp->dynticks_nesting) { + local_irq_restore(flags); + return; + } + /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ + smp_mb__before_atomic_inc(); /* See above. */ + atomic_inc(&rdtp->dynticks); + smp_mb__after_atomic_inc(); /* Force ordering with next sojourn. */ + WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); local_irq_restore(flags); + + /* If the interrupt queued a callback, get out of dyntick mode. */ + if (in_irq() && + (__get_cpu_var(rcu_sched_data).nxtlist || + __get_cpu_var(rcu_bh_data).nxtlist || + rcu_preempt_needs_cpu(smp_processor_id()))) + set_need_resched(); } /* @@ -284,11 +296,16 @@ void rcu_exit_nohz(void) local_irq_save(flags); rdtp = &__get_cpu_var(rcu_dynticks); - rdtp->dynticks++; - rdtp->dynticks_nesting++; - WARN_ON_ONCE(!(rdtp->dynticks & 0x1)); + if (rdtp->dynticks_nesting++) { + local_irq_restore(flags); + return; + } + smp_mb__before_atomic_inc(); /* Force ordering w/previous sojourn. */ + atomic_inc(&rdtp->dynticks); + /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ + smp_mb__after_atomic_inc(); /* See above. */ + WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); local_irq_restore(flags); - smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */ } /** @@ -302,11 +319,15 @@ void rcu_nmi_enter(void) { struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); - if (rdtp->dynticks & 0x1) + if (rdtp->dynticks_nmi_nesting == 0 && + (atomic_read(&rdtp->dynticks) & 0x1)) return; - rdtp->dynticks_nmi++; - WARN_ON_ONCE(!(rdtp->dynticks_nmi & 0x1)); - smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */ + rdtp->dynticks_nmi_nesting++; + smp_mb__before_atomic_inc(); /* Force delay from prior write. */ + atomic_inc(&rdtp->dynticks); + /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ + smp_mb__after_atomic_inc(); /* See above. */ + WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); } /** @@ -320,11 +341,14 @@ void rcu_nmi_exit(void) { struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); - if (rdtp->dynticks & 0x1) + if (rdtp->dynticks_nmi_nesting == 0 || + --rdtp->dynticks_nmi_nesting != 0) return; - smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */ - rdtp->dynticks_nmi++; - WARN_ON_ONCE(rdtp->dynticks_nmi & 0x1); + /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ + smp_mb__before_atomic_inc(); /* See above. */ + atomic_inc(&rdtp->dynticks); + smp_mb__after_atomic_inc(); /* Force delay to next write. */ + WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); } /** @@ -335,13 +359,7 @@ void rcu_nmi_exit(void) */ void rcu_irq_enter(void) { - struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); - - if (rdtp->dynticks_nesting++) - return; - rdtp->dynticks++; - WARN_ON_ONCE(!(rdtp->dynticks & 0x1)); - smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */ + rcu_exit_nohz(); } /** @@ -353,18 +371,7 @@ void rcu_irq_enter(void) */ void rcu_irq_exit(void) { - struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); - - if (--rdtp->dynticks_nesting) - return; - smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */ - rdtp->dynticks++; - WARN_ON_ONCE(rdtp->dynticks & 0x1); - - /* If the interrupt queued a callback, get out of dyntick mode. */ - if (__this_cpu_read(rcu_sched_data.nxtlist) || - __this_cpu_read(rcu_bh_data.nxtlist)) - set_need_resched(); + rcu_enter_nohz(); } #ifdef CONFIG_SMP @@ -376,19 +383,8 @@ void rcu_irq_exit(void) */ static int dyntick_save_progress_counter(struct rcu_data *rdp) { - int ret; - int snap; - int snap_nmi; - - snap = rdp->dynticks->dynticks; - snap_nmi = rdp->dynticks->dynticks_nmi; - smp_mb(); /* Order sampling of snap with end of grace period. */ - rdp->dynticks_snap = snap; - rdp->dynticks_nmi_snap = snap_nmi; - ret = ((snap & 0x1) == 0) && ((snap_nmi & 0x1) == 0); - if (ret) - rdp->dynticks_fqs++; - return ret; + rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks); + return 0; } /* @@ -399,16 +395,11 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp) */ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) { - long curr; - long curr_nmi; - long snap; - long snap_nmi; + unsigned long curr; + unsigned long snap; - curr = rdp->dynticks->dynticks; - snap = rdp->dynticks_snap; - curr_nmi = rdp->dynticks->dynticks_nmi; - snap_nmi = rdp->dynticks_nmi_snap; - smp_mb(); /* force ordering with cpu entering/leaving dynticks. */ + curr = (unsigned long)atomic_add_return(0, &rdp->dynticks->dynticks); + snap = (unsigned long)rdp->dynticks_snap; /* * If the CPU passed through or entered a dynticks idle phase with @@ -418,8 +409,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) * read-side critical section that started before the beginning * of the current RCU grace period. */ - if ((curr != snap || (curr & 0x1) == 0) && - (curr_nmi != snap_nmi || (curr_nmi & 0x1) == 0)) { + if ((curr & 0x1) == 0 || ULONG_CMP_GE(curr, snap + 2)) { rdp->dynticks_fqs++; return 1; } @@ -841,6 +831,12 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) __releases(rcu_get_root(rsp)->lock) { WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); + + /* + * Ensure that all grace-period and pre-grace-period activity + * is seen before the assignment to rsp->completed. + */ + smp_mb(); /* See above block comment. */ rsp->completed = rsp->gpnum; rsp->signaled = RCU_GP_IDLE; rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */ @@ -1367,25 +1363,11 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) */ static void rcu_process_callbacks(struct softirq_action *unused) { - /* - * Memory references from any prior RCU read-side critical sections - * executed by the interrupted code must be seen before any RCU - * grace-period manipulations below. - */ - smp_mb(); /* See above block comment. */ - __rcu_process_callbacks(&rcu_sched_state, &__get_cpu_var(rcu_sched_data)); __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data)); rcu_preempt_process_callbacks(); - /* - * Memory references from any later RCU read-side critical sections - * executed by the interrupted code must be seen after any RCU - * grace-period manipulations above. - */ - smp_mb(); /* See above block comment. */ - /* If we are last CPU on way to dyntick-idle mode, accelerate it. */ rcu_needs_cpu_flush(); } diff --git a/kernel/rcutree.h b/kernel/rcutree.h index e1a6663..bd891de 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -84,11 +84,9 @@ * Dynticks per-CPU state. */ struct rcu_dynticks { - int dynticks_nesting; /* Track nesting level, sort of. */ - int dynticks; /* Even value for dynticks-idle, else odd. */ - int dynticks_nmi; /* Even value for either dynticks-idle or */ - /* not in nmi handler, else odd. So this */ - /* remains even for nmi from irq handler. */ + int dynticks_nesting; /* Track irq/process nesting level. */ + int dynticks_nmi_nesting; /* Track NMI nesting level. */ + atomic_t dynticks; /* Even value for dynticks-idle, else odd. */ }; /* @@ -218,7 +216,6 @@ struct rcu_data { /* 3) dynticks interface. */ struct rcu_dynticks *dynticks; /* Shared per-CPU dynticks state. */ int dynticks_snap; /* Per-GP tracking for dynticks. */ - int dynticks_nmi_snap; /* Per-GP tracking for dynticks_nmi. */ #endif /* #ifdef CONFIG_NO_HZ */ /* 4) reasons this CPU needed to be kicked by force_quiescent_state */ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 38426ef..764b5fc 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -1182,7 +1182,6 @@ int rcu_needs_cpu(int cpu) { int c = 0; int snap; - int snap_nmi; int thatcpu; /* Check for being in the holdoff period. */ @@ -1193,10 +1192,10 @@ int rcu_needs_cpu(int cpu) for_each_online_cpu(thatcpu) { if (thatcpu == cpu) continue; - snap = per_cpu(rcu_dynticks, thatcpu).dynticks; - snap_nmi = per_cpu(rcu_dynticks, thatcpu).dynticks_nmi; + snap = atomic_add_return(0, &per_cpu(rcu_dynticks, + thatcpu).dynticks); smp_mb(); /* Order sampling of snap with end of grace period. */ - if (((snap & 0x1) != 0) || ((snap_nmi & 0x1) != 0)) { + if ((snap & 0x1) != 0) { per_cpu(rcu_dyntick_drain, cpu) = 0; per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; return rcu_needs_cpu_quick_check(cpu); diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index c8e9785..4a21ca5 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -57,10 +57,10 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) rdp->passed_quiesc, rdp->passed_quiesc_completed, rdp->qs_pending); #ifdef CONFIG_NO_HZ - seq_printf(m, " dt=%d/%d dn=%d df=%lu", - rdp->dynticks->dynticks, + seq_printf(m, " dt=%d/%d/%d df=%lu", + atomic_read(&rdp->dynticks->dynticks), rdp->dynticks->dynticks_nesting, - rdp->dynticks->dynticks_nmi, + rdp->dynticks->dynticks_nmi_nesting, rdp->dynticks_fqs); #endif /* #ifdef CONFIG_NO_HZ */ seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi); @@ -115,9 +115,9 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp) rdp->qs_pending); #ifdef CONFIG_NO_HZ seq_printf(m, ",%d,%d,%d,%lu", - rdp->dynticks->dynticks, + atomic_read(&rdp->dynticks->dynticks), rdp->dynticks->dynticks_nesting, - rdp->dynticks->dynticks_nmi, + rdp->dynticks->dynticks_nmi_nesting, rdp->dynticks_fqs); #endif /* #ifdef CONFIG_NO_HZ */ seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi); @@ -130,7 +130,7 @@ static int show_rcudata_csv(struct seq_file *m, void *unused) { seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pqc\",\"pq\","); #ifdef CONFIG_NO_HZ - seq_puts(m, "\"dt\",\"dt nesting\",\"dn\",\"df\","); + seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\","); #endif /* #ifdef CONFIG_NO_HZ */ seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\",\"ci\",\"co\",\"ca\"\n"); #ifdef CONFIG_TREE_PREEMPT_RCU -- cgit v1.1 From 12f5f524cafef3ab689929b118f2dfb8bf2be321 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 29 Nov 2010 21:56:39 -0800 Subject: rcu: merge TREE_PREEPT_RCU blocked_tasks[] lists Combine the current TREE_PREEMPT_RCU ->blocked_tasks[] lists in the rcu_node structure into a single ->blkd_tasks list with ->gp_tasks and ->exp_tasks tail pointers. This is in preparation for RCU priority boosting, which will add a third dimension to the combinatorial explosion in the ->blocked_tasks[] case, but simply a third pointer in the new ->blkd_tasks case. Also update documentation to reflect blocked_tasks[] merge Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- Documentation/RCU/trace.txt | 29 +++++--- kernel/rcutree.c | 5 +- kernel/rcutree.h | 21 ++++-- kernel/rcutree_plugin.h | 163 ++++++++++++++++++++++++++------------------ kernel/rcutree_trace.c | 11 ++- 5 files changed, 135 insertions(+), 94 deletions(-) diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt index e731ad2..5a704ff 100644 --- a/Documentation/RCU/trace.txt +++ b/Documentation/RCU/trace.txt @@ -166,14 +166,14 @@ o "gpnum" is the number of grace periods that have started. It is The output of "cat rcu/rcuhier" looks as follows, with very long lines: c=6902 g=6903 s=2 jfq=3 j=72c7 nfqs=13142/nfqsng=0(13142) fqlh=6 -1/1 .>. 0:127 ^0 -3/3 .>. 0:35 ^0 0/0 .>. 36:71 ^1 0/0 .>. 72:107 ^2 0/0 .>. 108:127 ^3 -3/3f .>. 0:5 ^0 2/3 .>. 6:11 ^1 0/0 .>. 12:17 ^2 0/0 .>. 18:23 ^3 0/0 .>. 24:29 ^4 0/0 .>. 30:35 ^5 0/0 .>. 36:41 ^0 0/0 .>. 42:47 ^1 0/0 .>. 48:53 ^2 0/0 .>. 54:59 ^3 0/0 .>. 60:65 ^4 0/0 .>. 66:71 ^5 0/0 .>. 72:77 ^0 0/0 .>. 78:83 ^1 0/0 .>. 84:89 ^2 0/0 .>. 90:95 ^3 0/0 .>. 96:101 ^4 0/0 .>. 102:107 ^5 0/0 .>. 108:113 ^0 0/0 .>. 114:119 ^1 0/0 .>. 120:125 ^2 0/0 .>. 126:127 ^3 +1/1 ..>. 0:127 ^0 +3/3 ..>. 0:35 ^0 0/0 ..>. 36:71 ^1 0/0 ..>. 72:107 ^2 0/0 ..>. 108:127 ^3 +3/3f ..>. 0:5 ^0 2/3 ..>. 6:11 ^1 0/0 ..>. 12:17 ^2 0/0 ..>. 18:23 ^3 0/0 ..>. 24:29 ^4 0/0 ..>. 30:35 ^5 0/0 ..>. 36:41 ^0 0/0 ..>. 42:47 ^1 0/0 ..>. 48:53 ^2 0/0 ..>. 54:59 ^3 0/0 ..>. 60:65 ^4 0/0 ..>. 66:71 ^5 0/0 ..>. 72:77 ^0 0/0 ..>. 78:83 ^1 0/0 ..>. 84:89 ^2 0/0 ..>. 90:95 ^3 0/0 ..>. 96:101 ^4 0/0 ..>. 102:107 ^5 0/0 ..>. 108:113 ^0 0/0 ..>. 114:119 ^1 0/0 ..>. 120:125 ^2 0/0 ..>. 126:127 ^3 rcu_bh: c=-226 g=-226 s=1 jfq=-5701 j=72c7 nfqs=88/nfqsng=0(88) fqlh=0 -0/1 .>. 0:127 ^0 -0/3 .>. 0:35 ^0 0/0 .>. 36:71 ^1 0/0 .>. 72:107 ^2 0/0 .>. 108:127 ^3 -0/3f .>. 0:5 ^0 0/3 .>. 6:11 ^1 0/0 .>. 12:17 ^2 0/0 .>. 18:23 ^3 0/0 .>. 24:29 ^4 0/0 .>. 30:35 ^5 0/0 .>. 36:41 ^0 0/0 .>. 42:47 ^1 0/0 .>. 48:53 ^2 0/0 .>. 54:59 ^3 0/0 .>. 60:65 ^4 0/0 .>. 66:71 ^5 0/0 .>. 72:77 ^0 0/0 .>. 78:83 ^1 0/0 .>. 84:89 ^2 0/0 .>. 90:95 ^3 0/0 .>. 96:101 ^4 0/0 .>. 102:107 ^5 0/0 .>. 108:113 ^0 0/0 .>. 114:119 ^1 0/0 .>. 120:125 ^2 0/0 .>. 126:127 ^3 +0/1 ..>. 0:127 ^0 +0/3 ..>. 0:35 ^0 0/0 ..>. 36:71 ^1 0/0 ..>. 72:107 ^2 0/0 ..>. 108:127 ^3 +0/3f ..>. 0:5 ^0 0/3 ..>. 6:11 ^1 0/0 ..>. 12:17 ^2 0/0 ..>. 18:23 ^3 0/0 ..>. 24:29 ^4 0/0 ..>. 30:35 ^5 0/0 ..>. 36:41 ^0 0/0 ..>. 42:47 ^1 0/0 ..>. 48:53 ^2 0/0 ..>. 54:59 ^3 0/0 ..>. 60:65 ^4 0/0 ..>. 66:71 ^5 0/0 ..>. 72:77 ^0 0/0 ..>. 78:83 ^1 0/0 ..>. 84:89 ^2 0/0 ..>. 90:95 ^3 0/0 ..>. 96:101 ^4 0/0 ..>. 102:107 ^5 0/0 ..>. 108:113 ^0 0/0 ..>. 114:119 ^1 0/0 ..>. 120:125 ^2 0/0 ..>. 126:127 ^3 This is once again split into "rcu_sched" and "rcu_bh" portions, and CONFIG_TREE_PREEMPT_RCU kernels will again have an additional @@ -232,13 +232,20 @@ o Each element of the form "1/1 0:127 ^0" represents one struct current grace period. o The characters separated by the ">" indicate the state - of the blocked-tasks lists. A "T" preceding the ">" + of the blocked-tasks lists. A "G" preceding the ">" indicates that at least one task blocked in an RCU read-side critical section blocks the current grace - period, while a "." preceding the ">" indicates otherwise. - The character following the ">" indicates similarly for - the next grace period. A "T" should appear in this - field only for rcu-preempt. + period, while a "E" preceding the ">" indicates that + at least one task blocked in an RCU read-side critical + section blocks the current expedited grace period. + A "T" character following the ">" indicates that at + least one task is blocked within an RCU read-side + critical section, regardless of whether any current + grace period (expedited or normal) is inconvenienced. + A "." character appears if the corresponding condition + does not hold, so that "..>." indicates that no tasks + are blocked. In contrast, "GE>T" indicates maximal + inconvenience from blocked tasks. o The numbers separated by the ":" are the range of CPUs served by this struct rcu_node. This can be helpful diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 90104a1..0ac1cc0 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1901,10 +1901,7 @@ static void __init rcu_init_one(struct rcu_state *rsp, j / rsp->levelspread[i - 1]; } rnp->level = i; - INIT_LIST_HEAD(&rnp->blocked_tasks[0]); - INIT_LIST_HEAD(&rnp->blocked_tasks[1]); - INIT_LIST_HEAD(&rnp->blocked_tasks[2]); - INIT_LIST_HEAD(&rnp->blocked_tasks[3]); + INIT_LIST_HEAD(&rnp->blkd_tasks); } } diff --git a/kernel/rcutree.h b/kernel/rcutree.h index bd891de..5a439c1 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -107,7 +107,7 @@ struct rcu_node { /* an rcu_data structure, otherwise, each */ /* bit corresponds to a child rcu_node */ /* structure. */ - unsigned long expmask; /* Groups that have ->blocked_tasks[] */ + unsigned long expmask; /* Groups that have ->blkd_tasks */ /* elements that need to drain to allow the */ /* current expedited grace period to */ /* complete (only for TREE_PREEMPT_RCU). */ @@ -120,11 +120,20 @@ struct rcu_node { u8 grpnum; /* CPU/group number for next level up. */ u8 level; /* root is at level 0. */ struct rcu_node *parent; - struct list_head blocked_tasks[4]; - /* Tasks blocked in RCU read-side critsect. */ - /* Grace period number (->gpnum) x blocked */ - /* by tasks on the (x & 0x1) element of the */ - /* blocked_tasks[] array. */ + struct list_head blkd_tasks; + /* Tasks blocked in RCU read-side critical */ + /* section. Tasks are placed at the head */ + /* of this list and age towards the tail. */ + struct list_head *gp_tasks; + /* Pointer to the first task blocking the */ + /* current grace period, or NULL if there */ + /* is no such task. */ + struct list_head *exp_tasks; + /* Pointer to the first task blocking the */ + /* current expedited grace period, or NULL */ + /* if there is no such task. If there */ + /* is no current expedited grace period, */ + /* then there can cannot be any such task. */ } ____cacheline_internodealigned_in_smp; /* diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 764b5fc..774f010 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -130,12 +130,12 @@ static void rcu_preempt_qs(int cpu) * We have entered the scheduler, and the current task might soon be * context-switched away from. If this task is in an RCU read-side * critical section, we will no longer be able to rely on the CPU to - * record that fact, so we enqueue the task on the appropriate entry - * of the blocked_tasks[] array. The task will dequeue itself when - * it exits the outermost enclosing RCU read-side critical section. - * Therefore, the current grace period cannot be permitted to complete - * until the blocked_tasks[] entry indexed by the low-order bit of - * rnp->gpnum empties. + * record that fact, so we enqueue the task on the blkd_tasks list. + * The task will dequeue itself when it exits the outermost enclosing + * RCU read-side critical section. Therefore, the current grace period + * cannot be permitted to complete until the blkd_tasks list entries + * predating the current grace period drain, in other words, until + * rnp->gp_tasks becomes NULL. * * Caller must disable preemption. */ @@ -143,7 +143,6 @@ static void rcu_preempt_note_context_switch(int cpu) { struct task_struct *t = current; unsigned long flags; - int phase; struct rcu_data *rdp; struct rcu_node *rnp; @@ -165,15 +164,26 @@ static void rcu_preempt_note_context_switch(int cpu) * (i.e., this CPU has not yet passed through a quiescent * state for the current grace period), then as long * as that task remains queued, the current grace period - * cannot end. + * cannot end. Note that there is some uncertainty as + * to exactly when the current grace period started. + * We take a conservative approach, which can result + * in unnecessarily waiting on tasks that started very + * slightly after the current grace period began. C'est + * la vie!!! * * But first, note that the current CPU must still be * on line! */ WARN_ON_ONCE((rdp->grpmask & rnp->qsmaskinit) == 0); WARN_ON_ONCE(!list_empty(&t->rcu_node_entry)); - phase = (rnp->gpnum + !(rnp->qsmask & rdp->grpmask)) & 0x1; - list_add(&t->rcu_node_entry, &rnp->blocked_tasks[phase]); + if ((rnp->qsmask & rdp->grpmask) && rnp->gp_tasks != NULL) { + list_add(&t->rcu_node_entry, rnp->gp_tasks->prev); + rnp->gp_tasks = &t->rcu_node_entry; + } else { + list_add(&t->rcu_node_entry, &rnp->blkd_tasks); + if (rnp->qsmask & rdp->grpmask) + rnp->gp_tasks = &t->rcu_node_entry; + } raw_spin_unlock_irqrestore(&rnp->lock, flags); } @@ -210,10 +220,7 @@ EXPORT_SYMBOL_GPL(__rcu_read_lock); */ static int rcu_preempted_readers(struct rcu_node *rnp) { - int phase = rnp->gpnum & 0x1; - - return !list_empty(&rnp->blocked_tasks[phase]) || - !list_empty(&rnp->blocked_tasks[phase + 2]); + return rnp->gp_tasks != NULL; } /* @@ -253,6 +260,21 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) } /* + * Advance a ->blkd_tasks-list pointer to the next entry, instead + * returning NULL if at the end of the list. + */ +static struct list_head *rcu_next_node_entry(struct task_struct *t, + struct rcu_node *rnp) +{ + struct list_head *np; + + np = t->rcu_node_entry.next; + if (np == &rnp->blkd_tasks) + np = NULL; + return np; +} + +/* * Handle special cases during rcu_read_unlock(), such as needing to * notify RCU core processing or task having blocked during the RCU * read-side critical section. @@ -262,6 +284,7 @@ static void rcu_read_unlock_special(struct task_struct *t) int empty; int empty_exp; unsigned long flags; + struct list_head *np; struct rcu_node *rnp; int special; @@ -305,7 +328,12 @@ static void rcu_read_unlock_special(struct task_struct *t) empty = !rcu_preempted_readers(rnp); empty_exp = !rcu_preempted_readers_exp(rnp); smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ + np = rcu_next_node_entry(t, rnp); list_del_init(&t->rcu_node_entry); + if (&t->rcu_node_entry == rnp->gp_tasks) + rnp->gp_tasks = np; + if (&t->rcu_node_entry == rnp->exp_tasks) + rnp->exp_tasks = np; t->rcu_blocked_node = NULL; /* @@ -361,18 +389,16 @@ EXPORT_SYMBOL_GPL(__rcu_read_unlock); static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) { unsigned long flags; - struct list_head *lp; - int phase; struct task_struct *t; - if (rcu_preempted_readers(rnp)) { - raw_spin_lock_irqsave(&rnp->lock, flags); - phase = rnp->gpnum & 0x1; - lp = &rnp->blocked_tasks[phase]; - list_for_each_entry(t, lp, rcu_node_entry) - sched_show_task(t); - raw_spin_unlock_irqrestore(&rnp->lock, flags); - } + if (!rcu_preempted_readers(rnp)) + return; + raw_spin_lock_irqsave(&rnp->lock, flags); + t = list_entry(rnp->gp_tasks, + struct task_struct, rcu_node_entry); + list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) + sched_show_task(t); + raw_spin_unlock_irqrestore(&rnp->lock, flags); } /* @@ -402,16 +428,14 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp) */ static void rcu_print_task_stall(struct rcu_node *rnp) { - struct list_head *lp; - int phase; struct task_struct *t; - if (rcu_preempted_readers(rnp)) { - phase = rnp->gpnum & 0x1; - lp = &rnp->blocked_tasks[phase]; - list_for_each_entry(t, lp, rcu_node_entry) - printk(" P%d", t->pid); - } + if (!rcu_preempted_readers(rnp)) + return; + t = list_entry(rnp->gp_tasks, + struct task_struct, rcu_node_entry); + list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) + printk(" P%d", t->pid); } /* @@ -430,10 +454,15 @@ static void rcu_preempt_stall_reset(void) * period that still has RCU readers blocked! This function must be * invoked -before- updating this rnp's ->gpnum, and the rnp's ->lock * must be held by the caller. + * + * Also, if there are blocked tasks on the list, they automatically + * block the newly created grace period, so set up ->gp_tasks accordingly. */ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) { WARN_ON_ONCE(rcu_preempted_readers(rnp)); + if (!list_empty(&rnp->blkd_tasks)) + rnp->gp_tasks = rnp->blkd_tasks.next; WARN_ON_ONCE(rnp->qsmask); } @@ -457,45 +486,49 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) { - int i; struct list_head *lp; struct list_head *lp_root; int retval = 0; struct rcu_node *rnp_root = rcu_get_root(rsp); - struct task_struct *tp; + struct task_struct *t; if (rnp == rnp_root) { WARN_ONCE(1, "Last CPU thought to be offlined?"); return 0; /* Shouldn't happen: at least one CPU online. */ } - WARN_ON_ONCE(rnp != rdp->mynode && - (!list_empty(&rnp->blocked_tasks[0]) || - !list_empty(&rnp->blocked_tasks[1]) || - !list_empty(&rnp->blocked_tasks[2]) || - !list_empty(&rnp->blocked_tasks[3]))); + + /* If we are on an internal node, complain bitterly. */ + WARN_ON_ONCE(rnp != rdp->mynode); /* - * Move tasks up to root rcu_node. Rely on the fact that the - * root rcu_node can be at most one ahead of the rest of the - * rcu_nodes in terms of gp_num value. This fact allows us to - * move the blocked_tasks[] array directly, element by element. + * Move tasks up to root rcu_node. Don't try to get fancy for + * this corner-case operation -- just put this node's tasks + * at the head of the root node's list, and update the root node's + * ->gp_tasks and ->exp_tasks pointers to those of this node's, + * if non-NULL. This might result in waiting for more tasks than + * absolutely necessary, but this is a good performance/complexity + * tradeoff. */ if (rcu_preempted_readers(rnp)) retval |= RCU_OFL_TASKS_NORM_GP; if (rcu_preempted_readers_exp(rnp)) retval |= RCU_OFL_TASKS_EXP_GP; - for (i = 0; i < 4; i++) { - lp = &rnp->blocked_tasks[i]; - lp_root = &rnp_root->blocked_tasks[i]; - while (!list_empty(lp)) { - tp = list_entry(lp->next, typeof(*tp), rcu_node_entry); - raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ - list_del(&tp->rcu_node_entry); - tp->rcu_blocked_node = rnp_root; - list_add(&tp->rcu_node_entry, lp_root); - raw_spin_unlock(&rnp_root->lock); /* irqs remain disabled */ - } + lp = &rnp->blkd_tasks; + lp_root = &rnp_root->blkd_tasks; + while (!list_empty(lp)) { + t = list_entry(lp->next, typeof(*t), rcu_node_entry); + raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ + list_del(&t->rcu_node_entry); + t->rcu_blocked_node = rnp_root; + list_add(&t->rcu_node_entry, lp_root); + if (&t->rcu_node_entry == rnp->gp_tasks) + rnp_root->gp_tasks = rnp->gp_tasks; + if (&t->rcu_node_entry == rnp->exp_tasks) + rnp_root->exp_tasks = rnp->exp_tasks; + raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */ } + rnp->gp_tasks = NULL; + rnp->exp_tasks = NULL; return retval; } @@ -586,8 +619,7 @@ static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex); */ static int rcu_preempted_readers_exp(struct rcu_node *rnp) { - return !list_empty(&rnp->blocked_tasks[2]) || - !list_empty(&rnp->blocked_tasks[3]); + return rnp->exp_tasks != NULL; } /* @@ -647,12 +679,13 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp) static void sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) { - int must_wait; + int must_wait = 0; raw_spin_lock(&rnp->lock); /* irqs already disabled */ - list_splice_init(&rnp->blocked_tasks[0], &rnp->blocked_tasks[2]); - list_splice_init(&rnp->blocked_tasks[1], &rnp->blocked_tasks[3]); - must_wait = rcu_preempted_readers_exp(rnp); + if (!list_empty(&rnp->blkd_tasks)) { + rnp->exp_tasks = rnp->blkd_tasks.next; + must_wait = 1; + } raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ if (!must_wait) rcu_report_exp_rnp(rsp, rnp); @@ -661,9 +694,7 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) /* * Wait for an rcu-preempt grace period, but expedite it. The basic idea * is to invoke synchronize_sched_expedited() to push all the tasks to - * the ->blocked_tasks[] lists, move all entries from the first set of - * ->blocked_tasks[] lists to the second set, and finally wait for this - * second set to drain. + * the ->blkd_tasks lists and wait for this list to drain. */ void synchronize_rcu_expedited(void) { @@ -695,7 +726,7 @@ void synchronize_rcu_expedited(void) if ((ACCESS_ONCE(sync_rcu_preempt_exp_count) - snap) > 0) goto unlock_mb_ret; /* Others did our work for us. */ - /* force all RCU readers onto blocked_tasks[]. */ + /* force all RCU readers onto ->blkd_tasks lists. */ synchronize_sched_expedited(); raw_spin_lock_irqsave(&rsp->onofflock, flags); @@ -707,7 +738,7 @@ void synchronize_rcu_expedited(void) raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ } - /* Snapshot current state of ->blocked_tasks[] lists. */ + /* Snapshot current state of ->blkd_tasks lists. */ rcu_for_each_leaf_node(rsp, rnp) sync_rcu_preempt_exp_init(rsp, rnp); if (NUM_RCU_NODES > 1) @@ -715,7 +746,7 @@ void synchronize_rcu_expedited(void) raw_spin_unlock_irqrestore(&rsp->onofflock, flags); - /* Wait for snapshotted ->blocked_tasks[] lists to drain. */ + /* Wait for snapshotted ->blkd_tasks lists to drain. */ rnp = rcu_get_root(rsp); wait_event(sync_rcu_preempt_exp_wq, sync_rcu_preempt_exp_done(rnp)); diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index 4a21ca5..1cedf94 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -161,7 +161,6 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) { unsigned long gpnum; int level = 0; - int phase; struct rcu_node *rnp; gpnum = rsp->gpnum; @@ -178,13 +177,11 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) seq_puts(m, "\n"); level = rnp->level; } - phase = gpnum & 0x1; - seq_printf(m, "%lx/%lx %c%c>%c%c %d:%d ^%d ", + seq_printf(m, "%lx/%lx %c%c>%c %d:%d ^%d ", rnp->qsmask, rnp->qsmaskinit, - "T."[list_empty(&rnp->blocked_tasks[phase])], - "E."[list_empty(&rnp->blocked_tasks[phase + 2])], - "T."[list_empty(&rnp->blocked_tasks[!phase])], - "E."[list_empty(&rnp->blocked_tasks[!phase + 2])], + ".G"[rnp->gp_tasks != NULL], + ".E"[rnp->exp_tasks != NULL], + ".T"[!list_empty(&rnp->blkd_tasks)], rnp->grplo, rnp->grphi, rnp->grpnum); } seq_puts(m, "\n"); -- cgit v1.1 From a26ac2455ffcf3be5c6ef92bc6df7182700f2114 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 12 Jan 2011 14:10:23 -0800 Subject: rcu: move TREE_RCU from softirq to kthread If RCU priority boosting is to be meaningful, callback invocation must be boosted in addition to preempted RCU readers. Otherwise, in presence of CPU real-time threads, the grace period ends, but the callbacks don't get invoked. If the callbacks don't get invoked, the associated memory doesn't get freed, so the system is still subject to OOM. But it is not reasonable to priority-boost RCU_SOFTIRQ, so this commit moves the callback invocations to a kthread, which can be boosted easily. Also add comments and properly synchronized all accesses to rcu_cpu_kthread_task, as suggested by Lai Jiangshan. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- Documentation/filesystems/proc.txt | 1 - include/linux/interrupt.h | 1 - include/trace/events/irq.h | 3 +- kernel/rcutree.c | 340 +++++++++++++++++++++++++++++++++++- kernel/rcutree.h | 8 + kernel/rcutree_plugin.h | 4 +- kernel/softirq.c | 2 +- tools/perf/util/trace-event-parse.c | 1 - 8 files changed, 348 insertions(+), 12 deletions(-) diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index b0b814d..60740e8 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -836,7 +836,6 @@ Provides counts of softirq handlers serviced since boot time, for each cpu. TASKLET: 0 0 0 290 SCHED: 27035 26983 26971 26746 HRTIMER: 0 0 0 0 - RCU: 1678 1769 2178 2250 1.3 IDE devices in /proc/ide diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index bea0ac7..6c12989 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -414,7 +414,6 @@ enum TASKLET_SOFTIRQ, SCHED_SOFTIRQ, HRTIMER_SOFTIRQ, - RCU_SOFTIRQ, /* Preferable RCU should always be the last softirq */ NR_SOFTIRQS }; diff --git a/include/trace/events/irq.h b/include/trace/events/irq.h index 1c09820..ae045ca 100644 --- a/include/trace/events/irq.h +++ b/include/trace/events/irq.h @@ -20,8 +20,7 @@ struct softirq_action; softirq_name(BLOCK_IOPOLL), \ softirq_name(TASKLET), \ softirq_name(SCHED), \ - softirq_name(HRTIMER), \ - softirq_name(RCU)) + softirq_name(HRTIMER)) /** * irq_handler_entry - called immediately before the irq action handler diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 0ac1cc0..18e3331 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -47,6 +47,8 @@ #include #include #include +#include +#include #include "rcutree.h" @@ -83,6 +85,20 @@ int rcu_scheduler_active __read_mostly; EXPORT_SYMBOL_GPL(rcu_scheduler_active); /* + * Control variables for per-CPU and per-rcu_node kthreads. These + * handle all flavors of RCU. + */ +static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task); +static DEFINE_PER_CPU(wait_queue_head_t, rcu_cpu_wq); +static DEFINE_PER_CPU(char, rcu_cpu_has_work); +static char rcu_kthreads_spawnable; + +static void rcu_node_kthread_setaffinity(struct rcu_node *rnp); +static void invoke_rcu_kthread(void); + +#define RCU_KTHREAD_PRIO 1 /* RT priority for per-CPU kthreads. */ + +/* * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s * permit this function to be invoked without holding the root rcu_node * structure's ->lock, but of course results can be subject to change. @@ -1009,6 +1025,8 @@ static void rcu_send_cbs_to_online(struct rcu_state *rsp) /* * Remove the outgoing CPU from the bitmasks in the rcu_node hierarchy * and move all callbacks from the outgoing CPU to the current one. + * There can only be one CPU hotplug operation at a time, so no other + * CPU can be attempting to update rcu_cpu_kthread_task. */ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) { @@ -1017,6 +1035,14 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) int need_report = 0; struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_node *rnp; + struct task_struct *t; + + /* Stop the CPU's kthread. */ + t = per_cpu(rcu_cpu_kthread_task, cpu); + if (t != NULL) { + per_cpu(rcu_cpu_kthread_task, cpu) = NULL; + kthread_stop(t); + } /* Exclude any attempts to start a new grace period. */ raw_spin_lock_irqsave(&rsp->onofflock, flags); @@ -1054,6 +1080,19 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) raw_spin_unlock_irqrestore(&rnp->lock, flags); if (need_report & RCU_OFL_TASKS_EXP_GP) rcu_report_exp_rnp(rsp, rnp); + + /* + * If there are no more online CPUs for this rcu_node structure, + * kill the rcu_node structure's kthread. Otherwise, adjust its + * affinity. + */ + t = rnp->node_kthread_task; + if (t != NULL && + rnp->qsmaskinit == 0) { + kthread_stop(t); + rnp->node_kthread_task = NULL; + } else + rcu_node_kthread_setaffinity(rnp); } /* @@ -1151,7 +1190,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) /* Re-raise the RCU softirq if there are callbacks remaining. */ if (cpu_has_callbacks_ready_to_invoke(rdp)) - raise_softirq(RCU_SOFTIRQ); + invoke_rcu_kthread(); } /* @@ -1197,7 +1236,7 @@ void rcu_check_callbacks(int cpu, int user) } rcu_preempt_check_callbacks(cpu); if (rcu_pending(cpu)) - raise_softirq(RCU_SOFTIRQ); + invoke_rcu_kthread(); } #ifdef CONFIG_SMP @@ -1361,7 +1400,7 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) /* * Do softirq processing for the current CPU. */ -static void rcu_process_callbacks(struct softirq_action *unused) +static void rcu_process_callbacks(void) { __rcu_process_callbacks(&rcu_sched_state, &__get_cpu_var(rcu_sched_data)); @@ -1372,6 +1411,281 @@ static void rcu_process_callbacks(struct softirq_action *unused) rcu_needs_cpu_flush(); } +/* + * Wake up the current CPU's kthread. This replaces raise_softirq() + * in earlier versions of RCU. Note that because we are running on + * the current CPU with interrupts disabled, the rcu_cpu_kthread_task + * cannot disappear out from under us. + */ +static void invoke_rcu_kthread(void) +{ + unsigned long flags; + wait_queue_head_t *q; + int cpu; + + local_irq_save(flags); + cpu = smp_processor_id(); + per_cpu(rcu_cpu_has_work, cpu) = 1; + if (per_cpu(rcu_cpu_kthread_task, cpu) == NULL) { + local_irq_restore(flags); + return; + } + q = &per_cpu(rcu_cpu_wq, cpu); + wake_up(q); + local_irq_restore(flags); +} + +/* + * Timer handler to initiate the waking up of per-CPU kthreads that + * have yielded the CPU due to excess numbers of RCU callbacks. + */ +static void rcu_cpu_kthread_timer(unsigned long arg) +{ + unsigned long flags; + struct rcu_data *rdp = (struct rcu_data *)arg; + struct rcu_node *rnp = rdp->mynode; + struct task_struct *t; + + raw_spin_lock_irqsave(&rnp->lock, flags); + rnp->wakemask |= rdp->grpmask; + t = rnp->node_kthread_task; + if (t == NULL) { + raw_spin_unlock_irqrestore(&rnp->lock, flags); + return; + } + wake_up_process(t); + raw_spin_unlock_irqrestore(&rnp->lock, flags); +} + +/* + * Drop to non-real-time priority and yield, but only after posting a + * timer that will cause us to regain our real-time priority if we + * remain preempted. Either way, we restore our real-time priority + * before returning. + */ +static void rcu_yield(int cpu) +{ + struct rcu_data *rdp = per_cpu_ptr(rcu_sched_state.rda, cpu); + struct sched_param sp; + struct timer_list yield_timer; + + setup_timer_on_stack(&yield_timer, rcu_cpu_kthread_timer, (unsigned long)rdp); + mod_timer(&yield_timer, jiffies + 2); + sp.sched_priority = 0; + sched_setscheduler_nocheck(current, SCHED_NORMAL, &sp); + schedule(); + sp.sched_priority = RCU_KTHREAD_PRIO; + sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); + del_timer(&yield_timer); +} + +/* + * Handle cases where the rcu_cpu_kthread() ends up on the wrong CPU. + * This can happen while the corresponding CPU is either coming online + * or going offline. We cannot wait until the CPU is fully online + * before starting the kthread, because the various notifier functions + * can wait for RCU grace periods. So we park rcu_cpu_kthread() until + * the corresponding CPU is online. + * + * Return 1 if the kthread needs to stop, 0 otherwise. + * + * Caller must disable bh. This function can momentarily enable it. + */ +static int rcu_cpu_kthread_should_stop(int cpu) +{ + while (cpu_is_offline(cpu) || + !cpumask_equal(¤t->cpus_allowed, cpumask_of(cpu)) || + smp_processor_id() != cpu) { + if (kthread_should_stop()) + return 1; + local_bh_enable(); + schedule_timeout_uninterruptible(1); + if (!cpumask_equal(¤t->cpus_allowed, cpumask_of(cpu))) + set_cpus_allowed_ptr(current, cpumask_of(cpu)); + local_bh_disable(); + } + return 0; +} + +/* + * Per-CPU kernel thread that invokes RCU callbacks. This replaces the + * earlier RCU softirq. + */ +static int rcu_cpu_kthread(void *arg) +{ + int cpu = (int)(long)arg; + unsigned long flags; + int spincnt = 0; + wait_queue_head_t *wqp = &per_cpu(rcu_cpu_wq, cpu); + char work; + char *workp = &per_cpu(rcu_cpu_has_work, cpu); + + for (;;) { + wait_event_interruptible(*wqp, + *workp != 0 || kthread_should_stop()); + local_bh_disable(); + if (rcu_cpu_kthread_should_stop(cpu)) { + local_bh_enable(); + break; + } + local_irq_save(flags); + work = *workp; + *workp = 0; + local_irq_restore(flags); + if (work) + rcu_process_callbacks(); + local_bh_enable(); + if (*workp != 0) + spincnt++; + else + spincnt = 0; + if (spincnt > 10) { + rcu_yield(cpu); + spincnt = 0; + } + } + return 0; +} + +/* + * Spawn a per-CPU kthread, setting up affinity and priority. + * Because the CPU hotplug lock is held, no other CPU will be attempting + * to manipulate rcu_cpu_kthread_task. There might be another CPU + * attempting to access it during boot, but the locking in kthread_bind() + * will enforce sufficient ordering. + */ +static int __cpuinit rcu_spawn_one_cpu_kthread(int cpu) +{ + struct sched_param sp; + struct task_struct *t; + + if (!rcu_kthreads_spawnable || + per_cpu(rcu_cpu_kthread_task, cpu) != NULL) + return 0; + t = kthread_create(rcu_cpu_kthread, (void *)(long)cpu, "rcuc%d", cpu); + if (IS_ERR(t)) + return PTR_ERR(t); + kthread_bind(t, cpu); + WARN_ON_ONCE(per_cpu(rcu_cpu_kthread_task, cpu) != NULL); + per_cpu(rcu_cpu_kthread_task, cpu) = t; + wake_up_process(t); + sp.sched_priority = RCU_KTHREAD_PRIO; + sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); + return 0; +} + +/* + * Per-rcu_node kthread, which is in charge of waking up the per-CPU + * kthreads when needed. We ignore requests to wake up kthreads + * for offline CPUs, which is OK because force_quiescent_state() + * takes care of this case. + */ +static int rcu_node_kthread(void *arg) +{ + int cpu; + unsigned long flags; + unsigned long mask; + struct rcu_node *rnp = (struct rcu_node *)arg; + struct sched_param sp; + struct task_struct *t; + + for (;;) { + wait_event_interruptible(rnp->node_wq, rnp->wakemask != 0 || + kthread_should_stop()); + if (kthread_should_stop()) + break; + raw_spin_lock_irqsave(&rnp->lock, flags); + mask = rnp->wakemask; + rnp->wakemask = 0; + raw_spin_unlock_irqrestore(&rnp->lock, flags); + for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) { + if ((mask & 0x1) == 0) + continue; + preempt_disable(); + t = per_cpu(rcu_cpu_kthread_task, cpu); + if (!cpu_online(cpu) || t == NULL) { + preempt_enable(); + continue; + } + per_cpu(rcu_cpu_has_work, cpu) = 1; + sp.sched_priority = RCU_KTHREAD_PRIO; + sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); + preempt_enable(); + } + } + return 0; +} + +/* + * Set the per-rcu_node kthread's affinity to cover all CPUs that are + * served by the rcu_node in question. + */ +static void rcu_node_kthread_setaffinity(struct rcu_node *rnp) +{ + cpumask_var_t cm; + int cpu; + unsigned long mask = rnp->qsmaskinit; + + if (rnp->node_kthread_task == NULL || + rnp->qsmaskinit == 0) + return; + if (!alloc_cpumask_var(&cm, GFP_KERNEL)) + return; + cpumask_clear(cm); + for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) + if (mask & 0x1) + cpumask_set_cpu(cpu, cm); + set_cpus_allowed_ptr(rnp->node_kthread_task, cm); + free_cpumask_var(cm); +} + +/* + * Spawn a per-rcu_node kthread, setting priority and affinity. + */ +static int __cpuinit rcu_spawn_one_node_kthread(struct rcu_state *rsp, + struct rcu_node *rnp) +{ + int rnp_index = rnp - &rsp->node[0]; + struct sched_param sp; + struct task_struct *t; + + if (!rcu_kthreads_spawnable || + rnp->qsmaskinit == 0 || + rnp->node_kthread_task != NULL) + return 0; + t = kthread_create(rcu_node_kthread, (void *)rnp, "rcun%d", rnp_index); + if (IS_ERR(t)) + return PTR_ERR(t); + rnp->node_kthread_task = t; + wake_up_process(t); + sp.sched_priority = 99; + sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); + return 0; +} + +/* + * Spawn all kthreads -- called as soon as the scheduler is running. + */ +static int __init rcu_spawn_kthreads(void) +{ + int cpu; + struct rcu_node *rnp; + + rcu_kthreads_spawnable = 1; + for_each_possible_cpu(cpu) { + init_waitqueue_head(&per_cpu(rcu_cpu_wq, cpu)); + per_cpu(rcu_cpu_has_work, cpu) = 0; + if (cpu_online(cpu)) + (void)rcu_spawn_one_cpu_kthread(cpu); + } + rcu_for_each_leaf_node(&rcu_sched_state, rnp) { + init_waitqueue_head(&rnp->node_wq); + (void)rcu_spawn_one_node_kthread(&rcu_sched_state, rnp); + } + return 0; +} +early_initcall(rcu_spawn_kthreads); + static void __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), struct rcu_state *rsp) @@ -1771,6 +2085,19 @@ static void __cpuinit rcu_online_cpu(int cpu) rcu_preempt_init_percpu_data(cpu); } +static void __cpuinit rcu_online_kthreads(int cpu) +{ + struct rcu_data *rdp = per_cpu_ptr(rcu_sched_state.rda, cpu); + struct rcu_node *rnp = rdp->mynode; + + /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */ + if (rcu_kthreads_spawnable) { + (void)rcu_spawn_one_cpu_kthread(cpu); + if (rnp->node_kthread_task == NULL) + (void)rcu_spawn_one_node_kthread(&rcu_sched_state, rnp); + } +} + /* * Handle CPU online/offline notification events. */ @@ -1778,11 +2105,17 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { long cpu = (long)hcpu; + struct rcu_data *rdp = per_cpu_ptr(rcu_sched_state.rda, cpu); + struct rcu_node *rnp = rdp->mynode; switch (action) { case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: rcu_online_cpu(cpu); + rcu_online_kthreads(cpu); + break; + case CPU_ONLINE: + rcu_node_kthread_setaffinity(rnp); break; case CPU_DYING: case CPU_DYING_FROZEN: @@ -1923,7 +2256,6 @@ void __init rcu_init(void) rcu_init_one(&rcu_sched_state, &rcu_sched_data); rcu_init_one(&rcu_bh_state, &rcu_bh_data); __rcu_init_preempt(); - open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); /* * We don't need protection against CPU-hotplug here because diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 5a439c1..c021380 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -111,6 +111,7 @@ struct rcu_node { /* elements that need to drain to allow the */ /* current expedited grace period to */ /* complete (only for TREE_PREEMPT_RCU). */ + unsigned long wakemask; /* CPUs whose kthread needs to be awakened. */ unsigned long qsmaskinit; /* Per-GP initial value for qsmask & expmask. */ unsigned long grpmask; /* Mask to apply to parent qsmask. */ @@ -134,6 +135,13 @@ struct rcu_node { /* if there is no such task. If there */ /* is no current expedited grace period, */ /* then there can cannot be any such task. */ + struct task_struct *node_kthread_task; + /* kthread that takes care of this rcu_node */ + /* structure, for example, awakening the */ + /* per-CPU kthreads as needed. */ + wait_queue_head_t node_wq; + /* Wait queue on which to park the per-node */ + /* kthread. */ } ____cacheline_internodealigned_in_smp; /* diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 774f010..b9bd69a 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -1206,7 +1206,7 @@ static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff); * * Because it is not legal to invoke rcu_process_callbacks() with irqs * disabled, we do one pass of force_quiescent_state(), then do a - * raise_softirq() to cause rcu_process_callbacks() to be invoked later. + * invoke_rcu_kthread() to cause rcu_process_callbacks() to be invoked later. * The per-cpu rcu_dyntick_drain variable controls the sequencing. */ int rcu_needs_cpu(int cpu) @@ -1257,7 +1257,7 @@ int rcu_needs_cpu(int cpu) /* If RCU callbacks are still pending, RCU still needs this CPU. */ if (c) - raise_softirq(RCU_SOFTIRQ); + invoke_rcu_kthread(); return c; } diff --git a/kernel/softirq.c b/kernel/softirq.c index 174f976..1396017 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -58,7 +58,7 @@ DEFINE_PER_CPU(struct task_struct *, ksoftirqd); char *softirq_to_name[NR_SOFTIRQS] = { "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL", - "TASKLET", "SCHED", "HRTIMER", "RCU" + "TASKLET", "SCHED", "HRTIMER" }; /* diff --git a/tools/perf/util/trace-event-parse.c b/tools/perf/util/trace-event-parse.c index 0a7ed5b..1e88485 100644 --- a/tools/perf/util/trace-event-parse.c +++ b/tools/perf/util/trace-event-parse.c @@ -2187,7 +2187,6 @@ static const struct flag flags[] = { { "TASKLET_SOFTIRQ", 6 }, { "SCHED_SOFTIRQ", 7 }, { "HRTIMER_SOFTIRQ", 8 }, - { "RCU_SOFTIRQ", 9 }, { "HRTIMER_NORESTART", 0 }, { "HRTIMER_RESTART", 1 }, -- cgit v1.1 From 27f4d28057adf98750cf863c40baefb12f5b6d21 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 7 Feb 2011 12:47:15 -0800 Subject: rcu: priority boosting for TREE_PREEMPT_RCU Add priority boosting for TREE_PREEMPT_RCU, similar to that for TINY_PREEMPT_RCU. This is enabled by the default-off RCU_BOOST kernel parameter. The priority to which to boost preempted RCU readers is controlled by the RCU_BOOST_PRIO kernel parameter (defaulting to real-time priority 1) and the time to wait before boosting the readers who are blocking a given grace period is controlled by the RCU_BOOST_DELAY kernel parameter (defaulting to 500 milliseconds). Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- init/Kconfig | 2 +- kernel/rcutree.c | 115 ++++++++++++------ kernel/rcutree.h | 31 ++++- kernel/rcutree_plugin.h | 314 ++++++++++++++++++++++++++++++++++++++++++++++-- 4 files changed, 411 insertions(+), 51 deletions(-) diff --git a/init/Kconfig b/init/Kconfig index d886b1e..2d964fa 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -485,7 +485,7 @@ config TREE_RCU_TRACE config RCU_BOOST bool "Enable RCU priority boosting" - depends on RT_MUTEXES && TINY_PREEMPT_RCU + depends on RT_MUTEXES && PREEMPT_RCU default n help This option boosts the priority of preempted RCU readers that diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 18e3331..28fd92a 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -81,6 +81,8 @@ DEFINE_PER_CPU(struct rcu_data, rcu_sched_data); struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); +static struct rcu_state *rcu_state; + int rcu_scheduler_active __read_mostly; EXPORT_SYMBOL_GPL(rcu_scheduler_active); @@ -94,7 +96,7 @@ static DEFINE_PER_CPU(char, rcu_cpu_has_work); static char rcu_kthreads_spawnable; static void rcu_node_kthread_setaffinity(struct rcu_node *rnp); -static void invoke_rcu_kthread(void); +static void invoke_rcu_cpu_kthread(void); #define RCU_KTHREAD_PRIO 1 /* RT priority for per-CPU kthreads. */ @@ -791,6 +793,7 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) rnp->completed = rsp->completed; rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */ rcu_start_gp_per_cpu(rsp, rnp, rdp); + rcu_preempt_boost_start_gp(rnp); raw_spin_unlock_irqrestore(&rnp->lock, flags); return; } @@ -826,6 +829,7 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) rnp->completed = rsp->completed; if (rnp == rdp->mynode) rcu_start_gp_per_cpu(rsp, rnp, rdp); + rcu_preempt_boost_start_gp(rnp); raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ } @@ -882,7 +886,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, return; } rnp->qsmask &= ~mask; - if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) { + if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { /* Other bits still set at this level, so done. */ raw_spin_unlock_irqrestore(&rnp->lock, flags); @@ -1089,8 +1093,11 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) t = rnp->node_kthread_task; if (t != NULL && rnp->qsmaskinit == 0) { - kthread_stop(t); + raw_spin_lock_irqsave(&rnp->lock, flags); rnp->node_kthread_task = NULL; + raw_spin_unlock_irqrestore(&rnp->lock, flags); + kthread_stop(t); + rcu_stop_boost_kthread(rnp); } else rcu_node_kthread_setaffinity(rnp); } @@ -1190,7 +1197,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) /* Re-raise the RCU softirq if there are callbacks remaining. */ if (cpu_has_callbacks_ready_to_invoke(rdp)) - invoke_rcu_kthread(); + invoke_rcu_cpu_kthread(); } /* @@ -1236,7 +1243,7 @@ void rcu_check_callbacks(int cpu, int user) } rcu_preempt_check_callbacks(cpu); if (rcu_pending(cpu)) - invoke_rcu_kthread(); + invoke_rcu_cpu_kthread(); } #ifdef CONFIG_SMP @@ -1244,6 +1251,8 @@ void rcu_check_callbacks(int cpu, int user) /* * Scan the leaf rcu_node structures, processing dyntick state for any that * have not yet encountered a quiescent state, using the function specified. + * Also initiate boosting for any threads blocked on the root rcu_node. + * * The caller must have suppressed start of new grace periods. */ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)) @@ -1262,6 +1271,7 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)) return; } if (rnp->qsmask == 0) { + rcu_initiate_boost(rnp); raw_spin_unlock_irqrestore(&rnp->lock, flags); continue; } @@ -1280,6 +1290,11 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)) } raw_spin_unlock_irqrestore(&rnp->lock, flags); } + rnp = rcu_get_root(rsp); + raw_spin_lock_irqsave(&rnp->lock, flags); + if (rnp->qsmask == 0) + rcu_initiate_boost(rnp); + raw_spin_unlock_irqrestore(&rnp->lock, flags); } /* @@ -1417,7 +1432,7 @@ static void rcu_process_callbacks(void) * the current CPU with interrupts disabled, the rcu_cpu_kthread_task * cannot disappear out from under us. */ -static void invoke_rcu_kthread(void) +static void invoke_rcu_cpu_kthread(void) { unsigned long flags; wait_queue_head_t *q; @@ -1436,24 +1451,33 @@ static void invoke_rcu_kthread(void) } /* + * Wake up the specified per-rcu_node-structure kthread. + * The caller must hold ->lock. + */ +static void invoke_rcu_node_kthread(struct rcu_node *rnp) +{ + struct task_struct *t; + + t = rnp->node_kthread_task; + if (t != NULL) + wake_up_process(t); +} + +/* * Timer handler to initiate the waking up of per-CPU kthreads that * have yielded the CPU due to excess numbers of RCU callbacks. + * We wake up the per-rcu_node kthread, which in turn will wake up + * the booster kthread. */ static void rcu_cpu_kthread_timer(unsigned long arg) { unsigned long flags; - struct rcu_data *rdp = (struct rcu_data *)arg; + struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, arg); struct rcu_node *rnp = rdp->mynode; - struct task_struct *t; raw_spin_lock_irqsave(&rnp->lock, flags); rnp->wakemask |= rdp->grpmask; - t = rnp->node_kthread_task; - if (t == NULL) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); - return; - } - wake_up_process(t); + invoke_rcu_node_kthread(rnp); raw_spin_unlock_irqrestore(&rnp->lock, flags); } @@ -1463,13 +1487,12 @@ static void rcu_cpu_kthread_timer(unsigned long arg) * remain preempted. Either way, we restore our real-time priority * before returning. */ -static void rcu_yield(int cpu) +static void rcu_yield(void (*f)(unsigned long), unsigned long arg) { - struct rcu_data *rdp = per_cpu_ptr(rcu_sched_state.rda, cpu); struct sched_param sp; struct timer_list yield_timer; - setup_timer_on_stack(&yield_timer, rcu_cpu_kthread_timer, (unsigned long)rdp); + setup_timer_on_stack(&yield_timer, f, arg); mod_timer(&yield_timer, jiffies + 2); sp.sched_priority = 0; sched_setscheduler_nocheck(current, SCHED_NORMAL, &sp); @@ -1540,7 +1563,7 @@ static int rcu_cpu_kthread(void *arg) else spincnt = 0; if (spincnt > 10) { - rcu_yield(cpu); + rcu_yield(rcu_cpu_kthread_timer, (unsigned long)cpu); spincnt = 0; } } @@ -1597,6 +1620,7 @@ static int rcu_node_kthread(void *arg) raw_spin_lock_irqsave(&rnp->lock, flags); mask = rnp->wakemask; rnp->wakemask = 0; + rcu_initiate_boost(rnp); raw_spin_unlock_irqrestore(&rnp->lock, flags); for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) { if ((mask & 0x1) == 0) @@ -1618,7 +1642,8 @@ static int rcu_node_kthread(void *arg) /* * Set the per-rcu_node kthread's affinity to cover all CPUs that are - * served by the rcu_node in question. + * served by the rcu_node in question. The CPU hotplug lock is still + * held, so the value of rnp->qsmaskinit will be stable. */ static void rcu_node_kthread_setaffinity(struct rcu_node *rnp) { @@ -1626,8 +1651,7 @@ static void rcu_node_kthread_setaffinity(struct rcu_node *rnp) int cpu; unsigned long mask = rnp->qsmaskinit; - if (rnp->node_kthread_task == NULL || - rnp->qsmaskinit == 0) + if (rnp->node_kthread_task == NULL || mask == 0) return; if (!alloc_cpumask_var(&cm, GFP_KERNEL)) return; @@ -1636,31 +1660,40 @@ static void rcu_node_kthread_setaffinity(struct rcu_node *rnp) if (mask & 0x1) cpumask_set_cpu(cpu, cm); set_cpus_allowed_ptr(rnp->node_kthread_task, cm); + rcu_boost_kthread_setaffinity(rnp, cm); free_cpumask_var(cm); } /* * Spawn a per-rcu_node kthread, setting priority and affinity. + * Called during boot before online/offline can happen, or, if + * during runtime, with the main CPU-hotplug locks held. So only + * one of these can be executing at a time. */ static int __cpuinit rcu_spawn_one_node_kthread(struct rcu_state *rsp, struct rcu_node *rnp) { + unsigned long flags; int rnp_index = rnp - &rsp->node[0]; struct sched_param sp; struct task_struct *t; if (!rcu_kthreads_spawnable || - rnp->qsmaskinit == 0 || - rnp->node_kthread_task != NULL) + rnp->qsmaskinit == 0) return 0; - t = kthread_create(rcu_node_kthread, (void *)rnp, "rcun%d", rnp_index); - if (IS_ERR(t)) - return PTR_ERR(t); - rnp->node_kthread_task = t; - wake_up_process(t); - sp.sched_priority = 99; - sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); - return 0; + if (rnp->node_kthread_task == NULL) { + t = kthread_create(rcu_node_kthread, (void *)rnp, + "rcun%d", rnp_index); + if (IS_ERR(t)) + return PTR_ERR(t); + raw_spin_lock_irqsave(&rnp->lock, flags); + rnp->node_kthread_task = t; + raw_spin_unlock_irqrestore(&rnp->lock, flags); + wake_up_process(t); + sp.sched_priority = 99; + sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); + } + return rcu_spawn_one_boost_kthread(rsp, rnp, rnp_index); } /* @@ -1678,10 +1711,16 @@ static int __init rcu_spawn_kthreads(void) if (cpu_online(cpu)) (void)rcu_spawn_one_cpu_kthread(cpu); } - rcu_for_each_leaf_node(&rcu_sched_state, rnp) { - init_waitqueue_head(&rnp->node_wq); - (void)rcu_spawn_one_node_kthread(&rcu_sched_state, rnp); - } + rnp = rcu_get_root(rcu_state); + init_waitqueue_head(&rnp->node_wq); + rcu_init_boost_waitqueue(rnp); + (void)rcu_spawn_one_node_kthread(rcu_state, rnp); + if (NUM_RCU_NODES > 1) + rcu_for_each_leaf_node(rcu_state, rnp) { + init_waitqueue_head(&rnp->node_wq); + rcu_init_boost_waitqueue(rnp); + (void)rcu_spawn_one_node_kthread(rcu_state, rnp); + } return 0; } early_initcall(rcu_spawn_kthreads); @@ -2087,14 +2126,14 @@ static void __cpuinit rcu_online_cpu(int cpu) static void __cpuinit rcu_online_kthreads(int cpu) { - struct rcu_data *rdp = per_cpu_ptr(rcu_sched_state.rda, cpu); + struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); struct rcu_node *rnp = rdp->mynode; /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */ if (rcu_kthreads_spawnable) { (void)rcu_spawn_one_cpu_kthread(cpu); if (rnp->node_kthread_task == NULL) - (void)rcu_spawn_one_node_kthread(&rcu_sched_state, rnp); + (void)rcu_spawn_one_node_kthread(rcu_state, rnp); } } @@ -2105,7 +2144,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { long cpu = (long)hcpu; - struct rcu_data *rdp = per_cpu_ptr(rcu_sched_state.rda, cpu); + struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); struct rcu_node *rnp = rdp->mynode; switch (action) { diff --git a/kernel/rcutree.h b/kernel/rcutree.h index c021380..8db0cdc 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -135,6 +135,24 @@ struct rcu_node { /* if there is no such task. If there */ /* is no current expedited grace period, */ /* then there can cannot be any such task. */ +#ifdef CONFIG_RCU_BOOST + struct list_head *boost_tasks; + /* Pointer to first task that needs to be */ + /* priority boosted, or NULL if no priority */ + /* boosting is needed for this rcu_node */ + /* structure. If there are no tasks */ + /* queued on this rcu_node structure that */ + /* are blocking the current grace period, */ + /* there can be no such task. */ + unsigned long boost_time; + /* When to start boosting (jiffies). */ + struct task_struct *boost_kthread_task; + /* kthread that takes care of priority */ + /* boosting for this rcu_node structure. */ + wait_queue_head_t boost_wq; + /* Wait queue on which to park the boost */ + /* kthread. */ +#endif /* #ifdef CONFIG_RCU_BOOST */ struct task_struct *node_kthread_task; /* kthread that takes care of this rcu_node */ /* structure, for example, awakening the */ @@ -365,7 +383,7 @@ DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data); static void rcu_bootup_announce(void); long rcu_batches_completed(void); static void rcu_preempt_note_context_switch(int cpu); -static int rcu_preempted_readers(struct rcu_node *rnp); +static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); #ifdef CONFIG_HOTPLUG_CPU static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags); @@ -392,5 +410,16 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu); static void rcu_preempt_send_cbs_to_online(void); static void __init __rcu_init_preempt(void); static void rcu_needs_cpu_flush(void); +static void __init rcu_init_boost_waitqueue(struct rcu_node *rnp); +static void rcu_initiate_boost(struct rcu_node *rnp); +static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, + cpumask_var_t cm); +static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); +static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, + struct rcu_node *rnp, + int rnp_index); +#ifdef CONFIG_HOTPLUG_CPU +static void rcu_stop_boost_kthread(struct rcu_node *rnp); +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ #endif /* #ifndef RCU_TREE_NONCORE */ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index b9bd69a..5964f82 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -66,6 +66,7 @@ static void __init rcu_bootup_announce_oddness(void) struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state); DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data); +static struct rcu_state *rcu_state = &rcu_preempt_state; static int rcu_preempted_readers_exp(struct rcu_node *rnp); @@ -179,6 +180,10 @@ static void rcu_preempt_note_context_switch(int cpu) if ((rnp->qsmask & rdp->grpmask) && rnp->gp_tasks != NULL) { list_add(&t->rcu_node_entry, rnp->gp_tasks->prev); rnp->gp_tasks = &t->rcu_node_entry; +#ifdef CONFIG_RCU_BOOST + if (rnp->boost_tasks != NULL) + rnp->boost_tasks = rnp->gp_tasks; +#endif /* #ifdef CONFIG_RCU_BOOST */ } else { list_add(&t->rcu_node_entry, &rnp->blkd_tasks); if (rnp->qsmask & rdp->grpmask) @@ -218,7 +223,7 @@ EXPORT_SYMBOL_GPL(__rcu_read_lock); * for the specified rcu_node structure. If the caller needs a reliable * answer, it must hold the rcu_node's ->lock. */ -static int rcu_preempted_readers(struct rcu_node *rnp) +static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp) { return rnp->gp_tasks != NULL; } @@ -236,7 +241,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) unsigned long mask; struct rcu_node *rnp_p; - if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) { + if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { raw_spin_unlock_irqrestore(&rnp->lock, flags); return; /* Still need more quiescent states! */ } @@ -325,7 +330,7 @@ static void rcu_read_unlock_special(struct task_struct *t) break; raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ } - empty = !rcu_preempted_readers(rnp); + empty = !rcu_preempt_blocked_readers_cgp(rnp); empty_exp = !rcu_preempted_readers_exp(rnp); smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ np = rcu_next_node_entry(t, rnp); @@ -334,6 +339,10 @@ static void rcu_read_unlock_special(struct task_struct *t) rnp->gp_tasks = np; if (&t->rcu_node_entry == rnp->exp_tasks) rnp->exp_tasks = np; +#ifdef CONFIG_RCU_BOOST + if (&t->rcu_node_entry == rnp->boost_tasks) + rnp->boost_tasks = np; +#endif /* #ifdef CONFIG_RCU_BOOST */ t->rcu_blocked_node = NULL; /* @@ -346,6 +355,15 @@ static void rcu_read_unlock_special(struct task_struct *t) else rcu_report_unblock_qs_rnp(rnp, flags); +#ifdef CONFIG_RCU_BOOST + /* Unboost if we were boosted. */ + if (special & RCU_READ_UNLOCK_BOOSTED) { + t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BOOSTED; + rt_mutex_unlock(t->rcu_boost_mutex); + t->rcu_boost_mutex = NULL; + } +#endif /* #ifdef CONFIG_RCU_BOOST */ + /* * If this was the last task on the expedited lists, * then we need to report up the rcu_node hierarchy. @@ -391,7 +409,7 @@ static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) unsigned long flags; struct task_struct *t; - if (!rcu_preempted_readers(rnp)) + if (!rcu_preempt_blocked_readers_cgp(rnp)) return; raw_spin_lock_irqsave(&rnp->lock, flags); t = list_entry(rnp->gp_tasks, @@ -430,7 +448,7 @@ static void rcu_print_task_stall(struct rcu_node *rnp) { struct task_struct *t; - if (!rcu_preempted_readers(rnp)) + if (!rcu_preempt_blocked_readers_cgp(rnp)) return; t = list_entry(rnp->gp_tasks, struct task_struct, rcu_node_entry); @@ -460,7 +478,7 @@ static void rcu_preempt_stall_reset(void) */ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) { - WARN_ON_ONCE(rcu_preempted_readers(rnp)); + WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)); if (!list_empty(&rnp->blkd_tasks)) rnp->gp_tasks = rnp->blkd_tasks.next; WARN_ON_ONCE(rnp->qsmask); @@ -509,7 +527,7 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp, * absolutely necessary, but this is a good performance/complexity * tradeoff. */ - if (rcu_preempted_readers(rnp)) + if (rcu_preempt_blocked_readers_cgp(rnp)) retval |= RCU_OFL_TASKS_NORM_GP; if (rcu_preempted_readers_exp(rnp)) retval |= RCU_OFL_TASKS_EXP_GP; @@ -525,8 +543,22 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp, rnp_root->gp_tasks = rnp->gp_tasks; if (&t->rcu_node_entry == rnp->exp_tasks) rnp_root->exp_tasks = rnp->exp_tasks; +#ifdef CONFIG_RCU_BOOST + if (&t->rcu_node_entry == rnp->boost_tasks) + rnp_root->boost_tasks = rnp->boost_tasks; +#endif /* #ifdef CONFIG_RCU_BOOST */ raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */ } + +#ifdef CONFIG_RCU_BOOST + /* In case root is being boosted and leaf is not. */ + raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ + if (rnp_root->boost_tasks != NULL && + rnp_root->boost_tasks != rnp_root->gp_tasks) + rnp_root->boost_tasks = rnp_root->gp_tasks; + raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */ +#endif /* #ifdef CONFIG_RCU_BOOST */ + rnp->gp_tasks = NULL; rnp->exp_tasks = NULL; return retval; @@ -684,6 +716,7 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) raw_spin_lock(&rnp->lock); /* irqs already disabled */ if (!list_empty(&rnp->blkd_tasks)) { rnp->exp_tasks = rnp->blkd_tasks.next; + rcu_initiate_boost(rnp); must_wait = 1; } raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ @@ -830,6 +863,8 @@ void exit_rcu(void) #else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ +static struct rcu_state *rcu_state = &rcu_sched_state; + /* * Tell them what RCU they are running. */ @@ -870,7 +905,7 @@ static void rcu_preempt_note_context_switch(int cpu) * Because preemptable RCU does not exist, there are never any preempted * RCU readers. */ -static int rcu_preempted_readers(struct rcu_node *rnp) +static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp) { return 0; } @@ -1034,6 +1069,263 @@ static void __init __rcu_init_preempt(void) #endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ +#ifdef CONFIG_RCU_BOOST + +#include "rtmutex_common.h" + +/* + * Carry out RCU priority boosting on the task indicated by ->exp_tasks + * or ->boost_tasks, advancing the pointer to the next task in the + * ->blkd_tasks list. + * + * Note that irqs must be enabled: boosting the task can block. + * Returns 1 if there are more tasks needing to be boosted. + */ +static int rcu_boost(struct rcu_node *rnp) +{ + unsigned long flags; + struct rt_mutex mtx; + struct task_struct *t; + struct list_head *tb; + + if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL) + return 0; /* Nothing left to boost. */ + + raw_spin_lock_irqsave(&rnp->lock, flags); + + /* + * Recheck under the lock: all tasks in need of boosting + * might exit their RCU read-side critical sections on their own. + */ + if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL) { + raw_spin_unlock_irqrestore(&rnp->lock, flags); + return 0; + } + + /* + * Preferentially boost tasks blocking expedited grace periods. + * This cannot starve the normal grace periods because a second + * expedited grace period must boost all blocked tasks, including + * those blocking the pre-existing normal grace period. + */ + if (rnp->exp_tasks != NULL) + tb = rnp->exp_tasks; + else + tb = rnp->boost_tasks; + + /* + * We boost task t by manufacturing an rt_mutex that appears to + * be held by task t. We leave a pointer to that rt_mutex where + * task t can find it, and task t will release the mutex when it + * exits its outermost RCU read-side critical section. Then + * simply acquiring this artificial rt_mutex will boost task + * t's priority. (Thanks to tglx for suggesting this approach!) + * + * Note that task t must acquire rnp->lock to remove itself from + * the ->blkd_tasks list, which it will do from exit() if from + * nowhere else. We therefore are guaranteed that task t will + * stay around at least until we drop rnp->lock. Note that + * rnp->lock also resolves races between our priority boosting + * and task t's exiting its outermost RCU read-side critical + * section. + */ + t = container_of(tb, struct task_struct, rcu_node_entry); + rt_mutex_init_proxy_locked(&mtx, t); + t->rcu_boost_mutex = &mtx; + t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BOOSTED; + raw_spin_unlock_irqrestore(&rnp->lock, flags); + rt_mutex_lock(&mtx); /* Side effect: boosts task t's priority. */ + rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ + + return rnp->exp_tasks != NULL || rnp->boost_tasks != NULL; +} + +/* + * Timer handler to initiate waking up of boost kthreads that + * have yielded the CPU due to excessive numbers of tasks to + * boost. We wake up the per-rcu_node kthread, which in turn + * will wake up the booster kthread. + */ +static void rcu_boost_kthread_timer(unsigned long arg) +{ + unsigned long flags; + struct rcu_node *rnp = (struct rcu_node *)arg; + + raw_spin_lock_irqsave(&rnp->lock, flags); + invoke_rcu_node_kthread(rnp); + raw_spin_unlock_irqrestore(&rnp->lock, flags); +} + +/* + * Priority-boosting kthread. One per leaf rcu_node and one for the + * root rcu_node. + */ +static int rcu_boost_kthread(void *arg) +{ + struct rcu_node *rnp = (struct rcu_node *)arg; + int spincnt = 0; + int more2boost; + + for (;;) { + wait_event_interruptible(rnp->boost_wq, rnp->boost_tasks || + rnp->exp_tasks || + kthread_should_stop()); + if (kthread_should_stop()) + break; + more2boost = rcu_boost(rnp); + if (more2boost) + spincnt++; + else + spincnt = 0; + if (spincnt > 10) { + rcu_yield(rcu_boost_kthread_timer, (unsigned long)rnp); + spincnt = 0; + } + } + return 0; +} + +/* + * Check to see if it is time to start boosting RCU readers that are + * blocking the current grace period, and, if so, tell the per-rcu_node + * kthread to start boosting them. If there is an expedited grace + * period in progress, it is always time to boost. + * + * The caller must hold rnp->lock. + */ +static void rcu_initiate_boost(struct rcu_node *rnp) +{ + struct task_struct *t; + + if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) + return; + if (rnp->exp_tasks != NULL || + (rnp->gp_tasks != NULL && + rnp->boost_tasks == NULL && + rnp->qsmask == 0 && + ULONG_CMP_GE(jiffies, rnp->boost_time))) { + if (rnp->exp_tasks == NULL) + rnp->boost_tasks = rnp->gp_tasks; + t = rnp->boost_kthread_task; + if (t != NULL) + wake_up_process(t); + } +} + +static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, + cpumask_var_t cm) +{ + unsigned long flags; + struct task_struct *t; + + raw_spin_lock_irqsave(&rnp->lock, flags); + t = rnp->boost_kthread_task; + if (t != NULL) + set_cpus_allowed_ptr(rnp->boost_kthread_task, cm); + raw_spin_unlock_irqrestore(&rnp->lock, flags); +} + +#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000) + +/* + * Do priority-boost accounting for the start of a new grace period. + */ +static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) +{ + rnp->boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES; +} + +/* + * Initialize the RCU-boost waitqueue. + */ +static void __init rcu_init_boost_waitqueue(struct rcu_node *rnp) +{ + init_waitqueue_head(&rnp->boost_wq); +} + +/* + * Create an RCU-boost kthread for the specified node if one does not + * already exist. We only create this kthread for preemptible RCU. + * Returns zero if all is well, a negated errno otherwise. + */ +static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, + struct rcu_node *rnp, + int rnp_index) +{ + unsigned long flags; + struct sched_param sp; + struct task_struct *t; + + if (&rcu_preempt_state != rsp) + return 0; + if (rnp->boost_kthread_task != NULL) + return 0; + t = kthread_create(rcu_boost_kthread, (void *)rnp, + "rcub%d", rnp_index); + if (IS_ERR(t)) + return PTR_ERR(t); + raw_spin_lock_irqsave(&rnp->lock, flags); + rnp->boost_kthread_task = t; + raw_spin_unlock_irqrestore(&rnp->lock, flags); + wake_up_process(t); + sp.sched_priority = RCU_KTHREAD_PRIO; + sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); + return 0; +} + +#ifdef CONFIG_HOTPLUG_CPU + +static void rcu_stop_boost_kthread(struct rcu_node *rnp) +{ + unsigned long flags; + struct task_struct *t; + + raw_spin_lock_irqsave(&rnp->lock, flags); + t = rnp->boost_kthread_task; + rnp->boost_kthread_task = NULL; + raw_spin_unlock_irqrestore(&rnp->lock, flags); + if (t != NULL) + kthread_stop(t); +} + +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ + +#else /* #ifdef CONFIG_RCU_BOOST */ + +static void rcu_initiate_boost(struct rcu_node *rnp) +{ +} + +static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, + cpumask_var_t cm) +{ +} + +static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) +{ +} + +static void __init rcu_init_boost_waitqueue(struct rcu_node *rnp) +{ +} + +static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, + struct rcu_node *rnp, + int rnp_index) +{ + return 0; +} + +#ifdef CONFIG_HOTPLUG_CPU + +static void rcu_stop_boost_kthread(struct rcu_node *rnp) +{ +} + +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ + +#endif /* #else #ifdef CONFIG_RCU_BOOST */ + #ifndef CONFIG_SMP void synchronize_sched_expedited(void) @@ -1206,8 +1498,8 @@ static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff); * * Because it is not legal to invoke rcu_process_callbacks() with irqs * disabled, we do one pass of force_quiescent_state(), then do a - * invoke_rcu_kthread() to cause rcu_process_callbacks() to be invoked later. - * The per-cpu rcu_dyntick_drain variable controls the sequencing. + * invoke_rcu_cpu_kthread() to cause rcu_process_callbacks() to be invoked + * later. The per-cpu rcu_dyntick_drain variable controls the sequencing. */ int rcu_needs_cpu(int cpu) { @@ -1257,7 +1549,7 @@ int rcu_needs_cpu(int cpu) /* If RCU callbacks are still pending, RCU still needs this CPU. */ if (c) - invoke_rcu_kthread(); + invoke_rcu_cpu_kthread(); return c; } -- cgit v1.1 From 0f962a5e7277c34987b77dc82fc9aefcedc95e27 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 14 Apr 2011 12:13:53 -0700 Subject: rcu: Force per-rcu_node kthreads off of the outgoing CPU The scheduler has had some heartburn in the past when too many real-time kthreads were affinitied to the outgoing CPU. So, this commit lightens the load by forcing the per-rcu_node and the boost kthreads off of the outgoing CPU. Note that RCU's per-CPU kthread remains on the outgoing CPU until the bitter end, as it must in order to preserve correctness. Also avoid disabling hardirqs across calls to set_cpus_allowed_ptr(), given that this function can block. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 24 +++++++++++++++++++----- kernel/rcutree_plugin.h | 8 +++++--- 2 files changed, 24 insertions(+), 8 deletions(-) diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 28fd92a..51eef41 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -95,7 +95,7 @@ static DEFINE_PER_CPU(wait_queue_head_t, rcu_cpu_wq); static DEFINE_PER_CPU(char, rcu_cpu_has_work); static char rcu_kthreads_spawnable; -static void rcu_node_kthread_setaffinity(struct rcu_node *rnp); +static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); static void invoke_rcu_cpu_kthread(void); #define RCU_KTHREAD_PRIO 1 /* RT priority for per-CPU kthreads. */ @@ -1099,7 +1099,7 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) kthread_stop(t); rcu_stop_boost_kthread(rnp); } else - rcu_node_kthread_setaffinity(rnp); + rcu_node_kthread_setaffinity(rnp, -1); } /* @@ -1644,8 +1644,12 @@ static int rcu_node_kthread(void *arg) * Set the per-rcu_node kthread's affinity to cover all CPUs that are * served by the rcu_node in question. The CPU hotplug lock is still * held, so the value of rnp->qsmaskinit will be stable. + * + * We don't include outgoingcpu in the affinity set, use -1 if there is + * no outgoing CPU. If there are no CPUs left in the affinity set, + * this function allows the kthread to execute on any CPU. */ -static void rcu_node_kthread_setaffinity(struct rcu_node *rnp) +static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) { cpumask_var_t cm; int cpu; @@ -1657,8 +1661,14 @@ static void rcu_node_kthread_setaffinity(struct rcu_node *rnp) return; cpumask_clear(cm); for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) - if (mask & 0x1) + if ((mask & 0x1) && cpu != outgoingcpu) cpumask_set_cpu(cpu, cm); + if (cpumask_weight(cm) == 0) { + cpumask_setall(cm); + for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++) + cpumask_clear_cpu(cpu, cm); + WARN_ON_ONCE(cpumask_weight(cm) == 0); + } set_cpus_allowed_ptr(rnp->node_kthread_task, cm); rcu_boost_kthread_setaffinity(rnp, cm); free_cpumask_var(cm); @@ -2154,7 +2164,11 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, rcu_online_kthreads(cpu); break; case CPU_ONLINE: - rcu_node_kthread_setaffinity(rnp); + case CPU_DOWN_FAILED: + rcu_node_kthread_setaffinity(rnp, -1); + break; + case CPU_DOWN_PREPARE: + rcu_node_kthread_setaffinity(rnp, cpu); break; case CPU_DYING: case CPU_DYING_FROZEN: diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 5964f82..4e48625 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -1212,17 +1212,19 @@ static void rcu_initiate_boost(struct rcu_node *rnp) } } +/* + * Set the affinity of the boost kthread. The CPU-hotplug locks are + * held, so no one should be messing with the existence of the boost + * kthread. + */ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, cpumask_var_t cm) { - unsigned long flags; struct task_struct *t; - raw_spin_lock_irqsave(&rnp->lock, flags); t = rnp->boost_kthread_task; if (t != NULL) set_cpus_allowed_ptr(rnp->boost_kthread_task, cm); - raw_spin_unlock_irqrestore(&rnp->lock, flags); } #define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000) -- cgit v1.1 From e3995a25fa361ce987a7d0ade00b17e3151519d7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 18 Apr 2011 15:31:26 -0700 Subject: rcu: put per-CPU kthread at non-RT priority during CPU hotplug operations If you are doing CPU hotplug operations, it is best not to have CPU-bound realtime tasks running CPU-bound on the outgoing CPU. So this commit makes per-CPU kthreads run at non-realtime priority during that time. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 51eef41..198e4df 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1464,6 +1464,30 @@ static void invoke_rcu_node_kthread(struct rcu_node *rnp) } /* + * Set the specified CPU's kthread to run RT or not, as specified by + * the to_rt argument. The CPU-hotplug locks are held, so the task + * is not going away. + */ +static void rcu_cpu_kthread_setrt(int cpu, int to_rt) +{ + int policy; + struct sched_param sp; + struct task_struct *t; + + t = per_cpu(rcu_cpu_kthread_task, cpu); + if (t == NULL) + return; + if (to_rt) { + policy = SCHED_FIFO; + sp.sched_priority = RCU_KTHREAD_PRIO; + } else { + policy = SCHED_NORMAL; + sp.sched_priority = 0; + } + sched_setscheduler_nocheck(t, policy, &sp); +} + +/* * Timer handler to initiate the waking up of per-CPU kthreads that * have yielded the CPU due to excess numbers of RCU callbacks. * We wake up the per-rcu_node kthread, which in turn will wake up @@ -2166,9 +2190,11 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, case CPU_ONLINE: case CPU_DOWN_FAILED: rcu_node_kthread_setaffinity(rnp, -1); + rcu_cpu_kthread_setrt(cpu, 1); break; case CPU_DOWN_PREPARE: rcu_node_kthread_setaffinity(rnp, cpu); + rcu_cpu_kthread_setrt(cpu, 0); break; case CPU_DYING: case CPU_DYING_FROZEN: -- cgit v1.1 From 3acf4a9a3d63f23430f940842829175b0778a1b8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 17 Apr 2011 23:45:23 -0700 Subject: rcu: avoid hammering sched with yet another bound RT kthread The scheduler does not appear to take kindly to having multiple real-time threads bound to a CPU that is going offline. So this commit is a temporary hack-around to avoid that happening. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutorture.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index c224da4..4d256db 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -163,11 +163,11 @@ static int stutter_pause_test; #endif int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; -#ifdef CONFIG_RCU_BOOST +#if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) #define rcu_can_boost() 1 -#else /* #ifdef CONFIG_RCU_BOOST */ +#else /* #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */ #define rcu_can_boost() 0 -#endif /* #else #ifdef CONFIG_RCU_BOOST */ +#endif /* #else #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */ static unsigned long boost_starttime; /* jiffies of next boost test start. */ DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ -- cgit v1.1 From 67b98dba474f293c389fc2b7254dcf7c0492e3bd Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 21 Feb 2011 13:31:55 -0800 Subject: rcu: eliminate unused boosting statistics The n_rcu_torture_boost_allocerror and n_rcu_torture_boost_afferror statistics are not actually incremented anymore, so eliminate them. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutorture.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 4d256db..22b0e74 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -146,8 +146,6 @@ static atomic_t n_rcu_torture_mberror; static atomic_t n_rcu_torture_error; static long n_rcu_torture_boost_ktrerror; static long n_rcu_torture_boost_rterror; -static long n_rcu_torture_boost_allocerror; -static long n_rcu_torture_boost_afferror; static long n_rcu_torture_boost_failure; static long n_rcu_torture_boosts; static long n_rcu_torture_timers; @@ -1067,7 +1065,7 @@ rcu_torture_printk(char *page) cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG); cnt += sprintf(&page[cnt], "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d " - "rtmbe: %d rtbke: %ld rtbre: %ld rtbae: %ld rtbafe: %ld " + "rtmbe: %d rtbke: %ld rtbre: %ld " "rtbf: %ld rtb: %ld nt: %ld", rcu_torture_current, rcu_torture_current_version, @@ -1078,16 +1076,12 @@ rcu_torture_printk(char *page) atomic_read(&n_rcu_torture_mberror), n_rcu_torture_boost_ktrerror, n_rcu_torture_boost_rterror, - n_rcu_torture_boost_allocerror, - n_rcu_torture_boost_afferror, n_rcu_torture_boost_failure, n_rcu_torture_boosts, n_rcu_torture_timers); if (atomic_read(&n_rcu_torture_mberror) != 0 || n_rcu_torture_boost_ktrerror != 0 || n_rcu_torture_boost_rterror != 0 || - n_rcu_torture_boost_allocerror != 0 || - n_rcu_torture_boost_afferror != 0 || n_rcu_torture_boost_failure != 0) cnt += sprintf(&page[cnt], " !!!"); cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); @@ -1486,8 +1480,6 @@ rcu_torture_init(void) atomic_set(&n_rcu_torture_error, 0); n_rcu_torture_boost_ktrerror = 0; n_rcu_torture_boost_rterror = 0; - n_rcu_torture_boost_allocerror = 0; - n_rcu_torture_boost_afferror = 0; n_rcu_torture_boost_failure = 0; n_rcu_torture_boosts = 0; for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) -- cgit v1.1 From 0ea1f2ebeb217d38770aebf91c4ecaa8e01b3305 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 22 Feb 2011 13:42:43 -0800 Subject: rcu: Add boosting to TREE_PREEMPT_RCU tracing Includes total number of tasks boosted, number boosted on behalf of each of normal and expedited grace periods, and statistics on attempts to initiate boosting that failed for various reasons. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.h | 19 ++++++++++++++ kernel/rcutree_plugin.h | 42 +++++++++++++++++++++++++++--- kernel/rcutree_trace.c | 68 +++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 125 insertions(+), 4 deletions(-) diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 8db0cdc..d49046c 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -152,6 +152,25 @@ struct rcu_node { wait_queue_head_t boost_wq; /* Wait queue on which to park the boost */ /* kthread. */ + unsigned long n_tasks_boosted; + /* Total number of tasks boosted. */ + unsigned long n_exp_boosts; + /* Number of tasks boosted for expedited GP. */ + unsigned long n_normal_boosts; + /* Number of tasks boosted for normal GP. */ + unsigned long n_balk_blkd_tasks; + /* Refused to boost: no blocked tasks. */ + unsigned long n_balk_exp_gp_tasks; + /* Refused to boost: nothing blocking GP. */ + unsigned long n_balk_boost_tasks; + /* Refused to boost: already boosting. */ + unsigned long n_balk_notblocked; + /* Refused to boost: RCU RS CS still running. */ + unsigned long n_balk_notyet; + /* Refused to boost: not yet time. */ + unsigned long n_balk_nos; + /* Refused to boost: not sure why, though. */ + /* This can happen due to race conditions. */ #endif /* #ifdef CONFIG_RCU_BOOST */ struct task_struct *node_kthread_task; /* kthread that takes care of this rcu_node */ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 4e48625..07d3464 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -1073,6 +1073,33 @@ static void __init __rcu_init_preempt(void) #include "rtmutex_common.h" +#ifdef CONFIG_RCU_TRACE + +static void rcu_initiate_boost_trace(struct rcu_node *rnp) +{ + if (list_empty(&rnp->blkd_tasks)) + rnp->n_balk_blkd_tasks++; + else if (rnp->exp_tasks == NULL && rnp->gp_tasks == NULL) + rnp->n_balk_exp_gp_tasks++; + else if (rnp->gp_tasks != NULL && rnp->boost_tasks != NULL) + rnp->n_balk_boost_tasks++; + else if (rnp->gp_tasks != NULL && rnp->qsmask != 0) + rnp->n_balk_notblocked++; + else if (rnp->gp_tasks != NULL && + ULONG_CMP_GE(jiffies, rnp->boost_time)) + rnp->n_balk_notyet++; + else + rnp->n_balk_nos++; +} + +#else /* #ifdef CONFIG_RCU_TRACE */ + +static void rcu_initiate_boost_trace(struct rcu_node *rnp) +{ +} + +#endif /* #else #ifdef CONFIG_RCU_TRACE */ + /* * Carry out RCU priority boosting on the task indicated by ->exp_tasks * or ->boost_tasks, advancing the pointer to the next task in the @@ -1108,10 +1135,14 @@ static int rcu_boost(struct rcu_node *rnp) * expedited grace period must boost all blocked tasks, including * those blocking the pre-existing normal grace period. */ - if (rnp->exp_tasks != NULL) + if (rnp->exp_tasks != NULL) { tb = rnp->exp_tasks; - else + rnp->n_exp_boosts++; + } else { tb = rnp->boost_tasks; + rnp->n_normal_boosts++; + } + rnp->n_tasks_boosted++; /* * We boost task t by manufacturing an rt_mutex that appears to @@ -1197,8 +1228,10 @@ static void rcu_initiate_boost(struct rcu_node *rnp) { struct task_struct *t; - if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) + if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) { + rnp->n_balk_exp_gp_tasks++; return; + } if (rnp->exp_tasks != NULL || (rnp->gp_tasks != NULL && rnp->boost_tasks == NULL && @@ -1209,7 +1242,8 @@ static void rcu_initiate_boost(struct rcu_node *rnp) t = rnp->boost_kthread_task; if (t != NULL) wake_up_process(t); - } + } else + rcu_initiate_boost_trace(rnp); } /* diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index 1cedf94..ead5736 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -157,6 +157,71 @@ static const struct file_operations rcudata_csv_fops = { .release = single_release, }; +#ifdef CONFIG_RCU_BOOST + +static void print_one_rcu_node_boost(struct seq_file *m, struct rcu_node *rnp) +{ + seq_printf(m, "%d:%d tasks=%c%c%c%c ntb=%lu neb=%lu nnb=%lu " + "j=%04x bt=%04x\n", + rnp->grplo, rnp->grphi, + "T."[list_empty(&rnp->blkd_tasks)], + "N."[!rnp->gp_tasks], + "E."[!rnp->exp_tasks], + "B."[!rnp->boost_tasks], + rnp->n_tasks_boosted, rnp->n_exp_boosts, + rnp->n_normal_boosts, + (int)(jiffies & 0xffff), + (int)(rnp->boost_time & 0xffff)); + seq_printf(m, "%s: nt=%lu egt=%lu bt=%lu nb=%lu ny=%lu nos=%lu\n", + " balk", + rnp->n_balk_blkd_tasks, + rnp->n_balk_exp_gp_tasks, + rnp->n_balk_boost_tasks, + rnp->n_balk_notblocked, + rnp->n_balk_notyet, + rnp->n_balk_nos); +} + +static int show_rcu_node_boost(struct seq_file *m, void *unused) +{ + struct rcu_node *rnp; + + rcu_for_each_leaf_node(&rcu_preempt_state, rnp) + print_one_rcu_node_boost(m, rnp); + return 0; +} + +static int rcu_node_boost_open(struct inode *inode, struct file *file) +{ + return single_open(file, show_rcu_node_boost, NULL); +} + +static const struct file_operations rcu_node_boost_fops = { + .owner = THIS_MODULE, + .open = rcu_node_boost_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +/* + * Create the rcuboost debugfs entry. Standard error return. + */ +static int rcu_boost_trace_create_file(struct dentry *rcudir) +{ + return !debugfs_create_file("rcuboost", 0444, rcudir, NULL, + &rcu_node_boost_fops); +} + +#else /* #ifdef CONFIG_RCU_BOOST */ + +static int rcu_boost_trace_create_file(struct dentry *rcudir) +{ + return 0; /* There cannot be an error if we didn't create it! */ +} + +#endif /* #else #ifdef CONFIG_RCU_BOOST */ + static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) { unsigned long gpnum; @@ -315,6 +380,9 @@ static int __init rcutree_trace_init(void) if (!retval) goto free_out; + if (rcu_boost_trace_create_file(rcudir)) + goto free_out; + retval = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops); if (!retval) goto free_out; -- cgit v1.1 From 2fa218d8bbcff239302f9f36e19d7187077dd636 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 27 Mar 2011 21:37:58 -0700 Subject: rcu: Update RCU's trace.txt documentation for new format The trace.txt file had obsolete output for the debugfs rcu/rcudata file, so update it. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- Documentation/RCU/trace.txt | 65 ++++++++++++++++++++++++--------------------- 1 file changed, 34 insertions(+), 31 deletions(-) diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt index 5a704ff..13dbf9b 100644 --- a/Documentation/RCU/trace.txt +++ b/Documentation/RCU/trace.txt @@ -21,23 +21,23 @@ rcu_pending() function decided that there was core RCU work to do). The output of "cat rcu/rcudata" looks as follows: rcu_sched: - 0 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=10951/1/0 df=1101 of=0 ri=36 ql=0 b=10 - 1 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=16117/1/0 df=1015 of=0 ri=0 ql=0 b=10 - 2 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=1445/1/0 df=1839 of=0 ri=0 ql=0 b=10 - 3 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=6681/1/0 df=1545 of=0 ri=0 ql=0 b=10 - 4 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=1003/1/0 df=1992 of=0 ri=0 ql=0 b=10 - 5 c=17829 g=17830 pq=1 pqc=17829 qp=1 dt=3887/1/0 df=3331 of=0 ri=4 ql=2 b=10 - 6 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=859/1/0 df=3224 of=0 ri=0 ql=0 b=10 - 7 c=17829 g=17830 pq=0 pqc=17829 qp=1 dt=3761/1/0 df=1818 of=0 ri=0 ql=2 b=10 + 0!c=423090 g=423091 pq=1 pqc=423090 qp=1 dt=86475/1/0 df=16319 of=163 ri=1519 ql=0 b=10 ci=1460693 co=1648 ca=6448 + 1!c=423329 g=423330 pq=1 pqc=423329 qp=1 dt=90875/1/0 df=16231 of=157 ri=1249 ql=0 b=10 ci=1459002 co=1614 ca=3310 + 2!c=423370 g=423371 pq=1 pqc=423370 qp=1 dt=69661/1/0 df=16125 of=163 ri=1469 ql=0 b=10 ci=1610701 co=2015 ca=2378 + 3!c=422967 g=422968 pq=1 pqc=422967 qp=1 dt=70349/1/0 df=12528 of=163 ri=1450 ql=0 b=10 ci=1427543 co=1430 ca=897 + 4!c=423196 g=423197 pq=1 pqc=423196 qp=0 dt=38935/1/0 df=10959 of=177 ri=1657 ql=0 b=10 ci=1562249 co=1896 ca=533 + 5!c=422950 g=422951 pq=1 pqc=422950 qp=0 dt=25127/1/0 df=5895 of=167 ri=1549 ql=0 b=10 ci=1777260 co=2137 ca=274 + 6!c=423396 g=423397 pq=1 pqc=423396 qp=1 dt=22639/1/0 df=4590 of=149 ri=1572 ql=0 b=10 ci=1471186 co=1530 ca=243 + 7 c=460203 g=460203 pq=1 pqc=460202 qp=0 dt=937087/1/0 df=3298 of=149 ri=1584 ql=6 b=10 ci=4026154 co=1948 ca=135 rcu_bh: - 0 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=10951/1/0 df=0 of=0 ri=0 ql=0 b=10 - 1 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=16117/1/0 df=13 of=0 ri=0 ql=0 b=10 - 2 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=1445/1/0 df=15 of=0 ri=0 ql=0 b=10 - 3 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=6681/1/0 df=9 of=0 ri=0 ql=0 b=10 - 4 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=1003/1/0 df=15 of=0 ri=0 ql=0 b=10 - 5 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=3887/1/0 df=15 of=0 ri=0 ql=0 b=10 - 6 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=859/1/0 df=15 of=0 ri=0 ql=0 b=10 - 7 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=3761/1/0 df=15 of=0 ri=0 ql=0 b=10 + 0!c=18446744073709551494 g=18446744073709551494 pq=0 pqc=18446744073709551493 qp=1 dt=86475/1/0 df=11 of=0 ri=0 ql=0 b=10 ci=112 co=0 ca=0 + 1!c=18446744073709551496 g=18446744073709551496 pq=1 pqc=18446744073709551495 qp=0 dt=90875/1/0 df=15 of=0 ri=0 ql=0 b=10 ci=143 co=0 ca=0 + 2!c=18446744073709551496 g=18446744073709551496 pq=1 pqc=18446744073709551495 qp=0 dt=69661/1/0 df=21 of=0 ri=1 ql=0 b=10 ci=88 co=0 ca=0 + 3!c=18446744073709551494 g=18446744073709551494 pq=1 pqc=18446744073709551493 qp=0 dt=70349/1/0 df=13 of=0 ri=0 ql=0 b=10 ci=100 co=0 ca=0 + 4!c=18446744073709551494 g=18446744073709551494 pq=0 pqc=18446744073709551493 qp=1 dt=38935/1/0 df=17 of=0 ri=0 ql=0 b=10 ci=36 co=0 ca=0 + 5!c=18446744073709551494 g=18446744073709551494 pq=0 pqc=18446744073709551493 qp=1 dt=25127/1/0 df=7 of=0 ri=0 ql=0 b=10 ci=32 co=0 ca=0 + 6!c=18446744073709551496 g=18446744073709551496 pq=1 pqc=18446744073709551495 qp=0 dt=22639/1/0 df=9 of=0 ri=0 ql=0 b=10 ci=44 co=0 ca=0 + 7 c=182 g=182 pq=1 pqc=181 qp=0 dt=937087/1/0 df=14 of=0 ri=1 ql=0 b=10 ci=627 co=0 ca=0 The first section lists the rcu_data structures for rcu_sched, the second for rcu_bh. Note that CONFIG_TREE_PREEMPT_RCU kernels will have an @@ -52,17 +52,18 @@ o The number at the beginning of each line is the CPU number. substantially larger than the number of actual CPUs. o "c" is the count of grace periods that this CPU believes have - completed. CPUs in dynticks idle mode may lag quite a ways - behind, for example, CPU 4 under "rcu_sched" above, which has - slept through the past 25 RCU grace periods. It is not unusual - to see CPUs lagging by thousands of grace periods. + completed. Offlined CPUs and CPUs in dynticks idle mode may + lag quite a ways behind, for example, CPU 6 under "rcu_sched" + above, which has been offline through not quite 40,000 RCU grace + periods. It is not unusual to see CPUs lagging by thousands of + grace periods. o "g" is the count of grace periods that this CPU believes have - started. Again, CPUs in dynticks idle mode may lag behind. - If the "c" and "g" values are equal, this CPU has already - reported a quiescent state for the last RCU grace period that - it is aware of, otherwise, the CPU believes that it owes RCU a - quiescent state. + started. Again, offlined CPUs and CPUs in dynticks idle mode + may lag behind. If the "c" and "g" values are equal, this CPU + has already reported a quiescent state for the last RCU grace + period that it is aware of, otherwise, the CPU believes that it + owes RCU a quiescent state. o "pq" indicates that this CPU has passed through a quiescent state for the current grace period. It is possible for "pq" to be @@ -81,14 +82,16 @@ o "pqc" indicates which grace period the last-observed quiescent the next grace period! o "qp" indicates that RCU still expects a quiescent state from - this CPU. + this CPU. Offlined CPUs and CPUs in dyntick idle mode might + well have qp=1, which is OK: RCU is still ignoring them. o "dt" is the current value of the dyntick counter that is incremented when entering or leaving dynticks idle state, either by the - scheduler or by irq. The number after the first "/" is the - interrupt nesting depth when in dyntick-idle state, or one - greater than the interrupt-nesting depth otherwise. The number - after the second "/" is the NMI nesting depth. + scheduler or by irq. This number is even if the CPU is in + dyntick idle mode and odd otherwise. The number after the first + "/" is the interrupt nesting depth when in dyntick-idle state, + or one greater than the interrupt-nesting depth otherwise. + The number after the second "/" is the NMI nesting depth. This field is displayed only for CONFIG_NO_HZ kernels. @@ -100,7 +103,7 @@ o "df" is the number of times that some other CPU has forced a o "of" is the number of times that some other CPU has forced a quiescent state on behalf of this CPU due to this CPU being - offline. In a perfect world, this might neve happen, but it + offline. In a perfect world, this might never happen, but it turns out that offlining and onlining a CPU can take several grace periods, and so there is likely to be an extended period of time when RCU believes that the CPU is online when it really is not. -- cgit v1.1 From 0ac3d136b2e3cdf1161178223bc5da14a06241d0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 28 Mar 2011 15:47:07 -0700 Subject: rcu: add callback-queue information to rcudata output This commit adds an indication of the state of the callback queue using a string of four characters following the "ql=" integer queue length. The first character is "N" if there are callbacks that have been queued that are not yet ready to be handled by the next grace period, or "." otherwise. The second character is "R" if there are callbacks queued that are ready to be handled by the next grace period, or "." otherwise. The third character is "W" if there are callbacks waiting for the current grace period, or "." otherwise. Finally, the fourth character is "D" if there are callbacks that have been handled by a prior grace period and are waiting to be invoked, or ".". Note that callbacks that are in the process of being invoked are not shown. These callbacks would have been removed from the rcu_data structure's list by rcu_do_batch() prior to being executed. (These callbacks are also not reflected in the "ql=" total, FWIW.) Also, document the new callback-queue trace information. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- Documentation/RCU/trace.txt | 58 ++++++++++++++++++++++++++++++++------------- kernel/rcutree_trace.c | 21 ++++++++++++++-- 2 files changed, 61 insertions(+), 18 deletions(-) diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt index 13dbf9b..5aefd5f 100644 --- a/Documentation/RCU/trace.txt +++ b/Documentation/RCU/trace.txt @@ -21,23 +21,23 @@ rcu_pending() function decided that there was core RCU work to do). The output of "cat rcu/rcudata" looks as follows: rcu_sched: - 0!c=423090 g=423091 pq=1 pqc=423090 qp=1 dt=86475/1/0 df=16319 of=163 ri=1519 ql=0 b=10 ci=1460693 co=1648 ca=6448 - 1!c=423329 g=423330 pq=1 pqc=423329 qp=1 dt=90875/1/0 df=16231 of=157 ri=1249 ql=0 b=10 ci=1459002 co=1614 ca=3310 - 2!c=423370 g=423371 pq=1 pqc=423370 qp=1 dt=69661/1/0 df=16125 of=163 ri=1469 ql=0 b=10 ci=1610701 co=2015 ca=2378 - 3!c=422967 g=422968 pq=1 pqc=422967 qp=1 dt=70349/1/0 df=12528 of=163 ri=1450 ql=0 b=10 ci=1427543 co=1430 ca=897 - 4!c=423196 g=423197 pq=1 pqc=423196 qp=0 dt=38935/1/0 df=10959 of=177 ri=1657 ql=0 b=10 ci=1562249 co=1896 ca=533 - 5!c=422950 g=422951 pq=1 pqc=422950 qp=0 dt=25127/1/0 df=5895 of=167 ri=1549 ql=0 b=10 ci=1777260 co=2137 ca=274 - 6!c=423396 g=423397 pq=1 pqc=423396 qp=1 dt=22639/1/0 df=4590 of=149 ri=1572 ql=0 b=10 ci=1471186 co=1530 ca=243 - 7 c=460203 g=460203 pq=1 pqc=460202 qp=0 dt=937087/1/0 df=3298 of=149 ri=1584 ql=6 b=10 ci=4026154 co=1948 ca=135 + 0!c=423090 g=423091 pq=1 pqc=423090 qp=1 dt=86475/1/0 df=16319 of=163 ri=1519 ql=0 qs=.... b=10 ci=1460693 co=1648 ca=6448 + 1!c=423329 g=423330 pq=1 pqc=423329 qp=1 dt=90875/1/0 df=16231 of=157 ri=1249 ql=0 qs=.... b=10 ci=1459002 co=1614 ca=3310 + 2!c=423370 g=423371 pq=1 pqc=423370 qp=1 dt=69661/1/0 df=16125 of=163 ri=1469 ql=0 qs=.... b=10 ci=1610701 co=2015 ca=2378 + 3!c=422967 g=422968 pq=1 pqc=422967 qp=1 dt=70349/1/0 df=12528 of=163 ri=1450 ql=0 qs=.... b=10 ci=1427543 co=1430 ca=897 + 4!c=423196 g=423197 pq=1 pqc=423196 qp=0 dt=38935/1/0 df=10959 of=177 ri=1657 ql=0 qs=.... b=10 ci=1562249 co=1896 ca=533 + 5!c=422950 g=422951 pq=1 pqc=422950 qp=0 dt=25127/1/0 df=5895 of=167 ri=1549 ql=0 qs=.... b=10 ci=1777260 co=2137 ca=274 + 6!c=423396 g=423397 pq=1 pqc=423396 qp=1 dt=22639/1/0 df=4590 of=149 ri=1572 ql=0 qs=.... b=10 ci=1471186 co=1530 ca=243 + 7 c=460203 g=460203 pq=1 pqc=460202 qp=0 dt=937087/1/0 df=3298 of=149 ri=1584 ql=6 qs=N.W. b=10 ci=4026154 co=1948 ca=135 rcu_bh: - 0!c=18446744073709551494 g=18446744073709551494 pq=0 pqc=18446744073709551493 qp=1 dt=86475/1/0 df=11 of=0 ri=0 ql=0 b=10 ci=112 co=0 ca=0 - 1!c=18446744073709551496 g=18446744073709551496 pq=1 pqc=18446744073709551495 qp=0 dt=90875/1/0 df=15 of=0 ri=0 ql=0 b=10 ci=143 co=0 ca=0 - 2!c=18446744073709551496 g=18446744073709551496 pq=1 pqc=18446744073709551495 qp=0 dt=69661/1/0 df=21 of=0 ri=1 ql=0 b=10 ci=88 co=0 ca=0 - 3!c=18446744073709551494 g=18446744073709551494 pq=1 pqc=18446744073709551493 qp=0 dt=70349/1/0 df=13 of=0 ri=0 ql=0 b=10 ci=100 co=0 ca=0 - 4!c=18446744073709551494 g=18446744073709551494 pq=0 pqc=18446744073709551493 qp=1 dt=38935/1/0 df=17 of=0 ri=0 ql=0 b=10 ci=36 co=0 ca=0 - 5!c=18446744073709551494 g=18446744073709551494 pq=0 pqc=18446744073709551493 qp=1 dt=25127/1/0 df=7 of=0 ri=0 ql=0 b=10 ci=32 co=0 ca=0 - 6!c=18446744073709551496 g=18446744073709551496 pq=1 pqc=18446744073709551495 qp=0 dt=22639/1/0 df=9 of=0 ri=0 ql=0 b=10 ci=44 co=0 ca=0 - 7 c=182 g=182 pq=1 pqc=181 qp=0 dt=937087/1/0 df=14 of=0 ri=1 ql=0 b=10 ci=627 co=0 ca=0 + 0!c=18446744073709551494 g=18446744073709551494 pq=0 pqc=18446744073709551493 qp=1 dt=86475/1/0 df=11 of=0 ri=0 ql=0 qs=.... b=10 ci=112 co=0 ca=0 + 1!c=18446744073709551496 g=18446744073709551496 pq=1 pqc=18446744073709551495 qp=0 dt=90875/1/0 df=15 of=0 ri=0 ql=0 qs=.... b=10 ci=143 co=0 ca=0 + 2!c=18446744073709551496 g=18446744073709551496 pq=1 pqc=18446744073709551495 qp=0 dt=69661/1/0 df=21 of=0 ri=1 ql=0 qs=.... b=10 ci=88 co=0 ca=0 + 3!c=18446744073709551494 g=18446744073709551494 pq=1 pqc=18446744073709551493 qp=0 dt=70349/1/0 df=13 of=0 ri=0 ql=0 qs=.... b=10 ci=100 co=0 ca=0 + 4!c=18446744073709551494 g=18446744073709551494 pq=0 pqc=18446744073709551493 qp=1 dt=38935/1/0 df=17 of=0 ri=0 ql=0 qs=.... b=10 ci=36 co=0 ca=0 + 5!c=18446744073709551494 g=18446744073709551494 pq=0 pqc=18446744073709551493 qp=1 dt=25127/1/0 df=7 of=0 ri=0 ql=0 qs=.... b=10 ci=32 co=0 ca=0 + 6!c=18446744073709551496 g=18446744073709551496 pq=1 pqc=18446744073709551495 qp=0 dt=22639/1/0 df=9 of=0 ri=0 ql=0 qs=.... b=10 ci=44 co=0 ca=0 + 7 c=182 g=182 pq=1 pqc=181 qp=0 dt=937087/1/0 df=14 of=0 ri=1 ql=0 qs=.... b=10 ci=627 co=0 ca=0 The first section lists the rcu_data structures for rcu_sched, the second for rcu_bh. Note that CONFIG_TREE_PREEMPT_RCU kernels will have an @@ -120,6 +120,32 @@ o "ql" is the number of RCU callbacks currently residing on of what state they are in (new, waiting for grace period to start, waiting for grace period to end, ready to invoke). +o "qs" gives an indication of the state of the callback queue + with four characters: + + "N" Indicates that there are callbacks queued that are not + ready to be handled by the next grace period, and thus + will be handled by the grace period following the next + one. + + "R" Indicates that there are callbacks queued that are + ready to be handled by the next grace period. + + "W" Indicates that there are callbacks queued that are + waiting on the current grace period. + + "D" Indicates that there are callbacks queued that have + already been handled by a prior grace period, and are + thus waiting to be invoked. Note that callbacks in + the process of being invoked are not counted here. + Callbacks in the process of being invoked are those + that have been removed from the rcu_data structures + queues by rcu_do_batch(), but which have not yet been + invoked. + + If there are no callbacks in a given one of the above states, + the corresponding character is replaced by ".". + o "b" is the batch limit for this CPU. If more than this number of RCU callbacks is ready to invoke, then the remainder will be deferred. diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index ead5736..afd262f 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -64,7 +64,16 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) rdp->dynticks_fqs); #endif /* #ifdef CONFIG_NO_HZ */ seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi); - seq_printf(m, " ql=%ld b=%ld", rdp->qlen, rdp->blimit); + seq_printf(m, " ql=%ld qs=%c%c%c%c b=%ld", + rdp->qlen, + ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != + rdp->nxttail[RCU_NEXT_TAIL]], + ".R"[rdp->nxttail[RCU_WAIT_TAIL] != + rdp->nxttail[RCU_NEXT_READY_TAIL]], + ".W"[rdp->nxttail[RCU_DONE_TAIL] != + rdp->nxttail[RCU_WAIT_TAIL]], + ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]], + rdp->blimit); seq_printf(m, " ci=%lu co=%lu ca=%lu\n", rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); } @@ -121,7 +130,15 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp) rdp->dynticks_fqs); #endif /* #ifdef CONFIG_NO_HZ */ seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi); - seq_printf(m, ",%ld,%ld", rdp->qlen, rdp->blimit); + seq_printf(m, ",%ld,\"%c%c%c%c\",%ld", rdp->qlen, + ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != + rdp->nxttail[RCU_NEXT_TAIL]], + ".R"[rdp->nxttail[RCU_WAIT_TAIL] != + rdp->nxttail[RCU_NEXT_READY_TAIL]], + ".W"[rdp->nxttail[RCU_DONE_TAIL] != + rdp->nxttail[RCU_WAIT_TAIL]], + ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]], + rdp->blimit); seq_printf(m, ",%lu,%lu,%lu\n", rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); } -- cgit v1.1 From d71df90eadfc35aa549ff9a850842673febca71f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 29 Mar 2011 17:48:28 -0700 Subject: rcu: add tracing for RCU's kthread run states. Add tracing to help debugging situations when RCU's kthreads are not running but are supposed to be. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 11 ++++++++++- kernel/rcutree.h | 11 +++++++++++ kernel/rcutree_plugin.h | 3 +++ kernel/rcutree_trace.c | 23 ++++++++++++++++++++--- 4 files changed, 44 insertions(+), 4 deletions(-) diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 198e4df..d891740 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -91,8 +91,9 @@ EXPORT_SYMBOL_GPL(rcu_scheduler_active); * handle all flavors of RCU. */ static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task); +DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status); static DEFINE_PER_CPU(wait_queue_head_t, rcu_cpu_wq); -static DEFINE_PER_CPU(char, rcu_cpu_has_work); +DEFINE_PER_CPU(char, rcu_cpu_has_work); static char rcu_kthreads_spawnable; static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); @@ -1563,11 +1564,13 @@ static int rcu_cpu_kthread(void *arg) int cpu = (int)(long)arg; unsigned long flags; int spincnt = 0; + unsigned int *statusp = &per_cpu(rcu_cpu_kthread_status, cpu); wait_queue_head_t *wqp = &per_cpu(rcu_cpu_wq, cpu); char work; char *workp = &per_cpu(rcu_cpu_has_work, cpu); for (;;) { + *statusp = RCU_KTHREAD_WAITING; wait_event_interruptible(*wqp, *workp != 0 || kthread_should_stop()); local_bh_disable(); @@ -1575,6 +1578,7 @@ static int rcu_cpu_kthread(void *arg) local_bh_enable(); break; } + *statusp = RCU_KTHREAD_RUNNING; local_irq_save(flags); work = *workp; *workp = 0; @@ -1587,10 +1591,12 @@ static int rcu_cpu_kthread(void *arg) else spincnt = 0; if (spincnt > 10) { + *statusp = RCU_KTHREAD_YIELDING; rcu_yield(rcu_cpu_kthread_timer, (unsigned long)cpu); spincnt = 0; } } + *statusp = RCU_KTHREAD_STOPPED; return 0; } @@ -1637,10 +1643,12 @@ static int rcu_node_kthread(void *arg) struct task_struct *t; for (;;) { + rnp->node_kthread_status = RCU_KTHREAD_WAITING; wait_event_interruptible(rnp->node_wq, rnp->wakemask != 0 || kthread_should_stop()); if (kthread_should_stop()) break; + rnp->node_kthread_status = RCU_KTHREAD_RUNNING; raw_spin_lock_irqsave(&rnp->lock, flags); mask = rnp->wakemask; rnp->wakemask = 0; @@ -1661,6 +1669,7 @@ static int rcu_node_kthread(void *arg) preempt_enable(); } } + rnp->node_kthread_status = RCU_KTHREAD_STOPPED; return 0; } diff --git a/kernel/rcutree.h b/kernel/rcutree.h index d49046c..67341db 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -89,6 +89,13 @@ struct rcu_dynticks { atomic_t dynticks; /* Even value for dynticks-idle, else odd. */ }; +/* RCU's kthread states for tracing. */ +#define RCU_KTHREAD_STOPPED 0 +#define RCU_KTHREAD_RUNNING 1 +#define RCU_KTHREAD_WAITING 2 +#define RCU_KTHREAD_YIELDING 3 +#define RCU_KTHREAD_MAX 3 + /* * Definition for node within the RCU grace-period-detection hierarchy. */ @@ -152,6 +159,8 @@ struct rcu_node { wait_queue_head_t boost_wq; /* Wait queue on which to park the boost */ /* kthread. */ + unsigned int boost_kthread_status; + /* State of boost_kthread_task for tracing. */ unsigned long n_tasks_boosted; /* Total number of tasks boosted. */ unsigned long n_exp_boosts; @@ -179,6 +188,8 @@ struct rcu_node { wait_queue_head_t node_wq; /* Wait queue on which to park the per-node */ /* kthread. */ + unsigned int node_kthread_status; + /* State of node_kthread_task for tracing. */ } ____cacheline_internodealigned_in_smp; /* diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 07d3464..22a6a46 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -1198,11 +1198,13 @@ static int rcu_boost_kthread(void *arg) int more2boost; for (;;) { + rnp->boost_kthread_status = RCU_KTHREAD_WAITING; wait_event_interruptible(rnp->boost_wq, rnp->boost_tasks || rnp->exp_tasks || kthread_should_stop()); if (kthread_should_stop()) break; + rnp->boost_kthread_status = RCU_KTHREAD_RUNNING; more2boost = rcu_boost(rnp); if (more2boost) spincnt++; @@ -1213,6 +1215,7 @@ static int rcu_boost_kthread(void *arg) spincnt = 0; } } + rnp->boost_kthread_status = RCU_KTHREAD_STOPPED; return 0; } diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index afd262f..fc40e89 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -46,6 +46,16 @@ #define RCU_TREE_NONCORE #include "rcutree.h" +DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status); +DECLARE_PER_CPU(char, rcu_cpu_has_work); + +static char convert_kthread_status(unsigned int kthread_status) +{ + if (kthread_status > RCU_KTHREAD_MAX) + return '?'; + return "SRWY"[kthread_status]; +} + static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) { if (!rdp->beenonline) @@ -64,7 +74,7 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) rdp->dynticks_fqs); #endif /* #ifdef CONFIG_NO_HZ */ seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi); - seq_printf(m, " ql=%ld qs=%c%c%c%c b=%ld", + seq_printf(m, " ql=%ld qs=%c%c%c%c kt=%d/%c b=%ld", rdp->qlen, ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != rdp->nxttail[RCU_NEXT_TAIL]], @@ -73,6 +83,9 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) ".W"[rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_WAIT_TAIL]], ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]], + per_cpu(rcu_cpu_has_work, rdp->cpu), + convert_kthread_status(per_cpu(rcu_cpu_kthread_status, + rdp->cpu)), rdp->blimit); seq_printf(m, " ci=%lu co=%lu ca=%lu\n", rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); @@ -130,7 +143,7 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp) rdp->dynticks_fqs); #endif /* #ifdef CONFIG_NO_HZ */ seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi); - seq_printf(m, ",%ld,\"%c%c%c%c\",%ld", rdp->qlen, + seq_printf(m, ",%ld,\"%c%c%c%c\",%d,\"%c\",%ld", rdp->qlen, ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != rdp->nxttail[RCU_NEXT_TAIL]], ".R"[rdp->nxttail[RCU_WAIT_TAIL] != @@ -138,6 +151,9 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp) ".W"[rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_WAIT_TAIL]], ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]], + per_cpu(rcu_cpu_has_work, rdp->cpu), + convert_kthread_status(per_cpu(rcu_cpu_kthread_status, + rdp->cpu)), rdp->blimit); seq_printf(m, ",%lu,%lu,%lu\n", rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); @@ -178,13 +194,14 @@ static const struct file_operations rcudata_csv_fops = { static void print_one_rcu_node_boost(struct seq_file *m, struct rcu_node *rnp) { - seq_printf(m, "%d:%d tasks=%c%c%c%c ntb=%lu neb=%lu nnb=%lu " + seq_printf(m, "%d:%d tasks=%c%c%c%c kt=%c ntb=%lu neb=%lu nnb=%lu " "j=%04x bt=%04x\n", rnp->grplo, rnp->grphi, "T."[list_empty(&rnp->blkd_tasks)], "N."[!rnp->gp_tasks], "E."[!rnp->exp_tasks], "B."[!rnp->boost_tasks], + convert_kthread_status(rnp->boost_kthread_status), rnp->n_tasks_boosted, rnp->n_exp_boosts, rnp->n_normal_boosts, (int)(jiffies & 0xffff), -- cgit v1.1 From 4a29865689dbb87a02e3b0fff4a4ae5041273173 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 3 Apr 2011 21:33:51 -0700 Subject: rcu: make rcutorture version numbers available through debugfs It is not possible to accurately correlate rcutorture output with that of debugfs. This patch therefore adds a debugfs file that prints out the rcutorture version number, permitting easy correlation. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- include/linux/rcupdate.h | 13 ++++++++++++- include/linux/rcutree.h | 3 +++ kernel/rcutorture.c | 8 +++++--- kernel/rcutree.c | 37 +++++++++++++++++++++++++++++++++++++ kernel/rcutree_trace.c | 28 ++++++++++++++++++++++++++++ 5 files changed, 85 insertions(+), 4 deletions(-) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index ff422d2..9e169c2 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -47,6 +47,18 @@ extern int rcutorture_runnable; /* for sysctl */ #endif /* #ifdef CONFIG_RCU_TORTURE_TEST */ +#if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) +extern void rcutorture_record_test_transition(void); +extern void rcutorture_record_progress(unsigned long vernum); +#else +static inline void rcutorture_record_test_transition(void) +{ +} +static inline void rcutorture_record_progress(unsigned long vernum) +{ +} +#endif + #define UINT_CMP_GE(a, b) (UINT_MAX / 2 >= (a) - (b)) #define UINT_CMP_LT(a, b) (UINT_MAX / 2 < (a) - (b)) #define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b)) @@ -68,7 +80,6 @@ extern void call_rcu_sched(struct rcu_head *head, extern void synchronize_sched(void); extern void rcu_barrier_bh(void); extern void rcu_barrier_sched(void); -extern int sched_expedited_torture_stats(char *page); static inline void __rcu_read_lock_bh(void) { diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 3a93348..284dad1 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -58,9 +58,12 @@ static inline void synchronize_rcu_bh_expedited(void) extern void rcu_barrier(void); +extern unsigned long rcutorture_testseq; +extern unsigned long rcutorture_vernum; extern long rcu_batches_completed(void); extern long rcu_batches_completed_bh(void); extern long rcu_batches_completed_sched(void); + extern void rcu_force_quiescent_state(void); extern void rcu_bh_force_quiescent_state(void); extern void rcu_sched_force_quiescent_state(void); diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 22b0e74..c2f58ec 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -131,7 +131,7 @@ struct rcu_torture { static LIST_HEAD(rcu_torture_freelist); static struct rcu_torture __rcu *rcu_torture_current; -static long rcu_torture_current_version; +static unsigned long rcu_torture_current_version; static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; static DEFINE_SPINLOCK(rcu_torture_lock); static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) = @@ -884,7 +884,7 @@ rcu_torture_writer(void *arg) old_rp->rtort_pipe_count++; cur_ops->deferred_free(old_rp); } - rcu_torture_current_version++; + rcutorture_record_progress(++rcu_torture_current_version); oldbatch = cur_ops->completed(); rcu_stutter_wait("rcu_torture_writer"); } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); @@ -1064,7 +1064,7 @@ rcu_torture_printk(char *page) } cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG); cnt += sprintf(&page[cnt], - "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d " + "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d " "rtmbe: %d rtbke: %ld rtbre: %ld " "rtbf: %ld rtb: %ld nt: %ld", rcu_torture_current, @@ -1325,6 +1325,7 @@ rcu_torture_cleanup(void) int i; mutex_lock(&fullstop_mutex); + rcutorture_record_test_transition(); if (fullstop == FULLSTOP_SHUTDOWN) { printk(KERN_WARNING /* but going down anyway, so... */ "Concurrent 'rmmod rcutorture' and shutdown illegal!\n"); @@ -1616,6 +1617,7 @@ rcu_torture_init(void) } } register_reboot_notifier(&rcutorture_shutdown_nb); + rcutorture_record_test_transition(); mutex_unlock(&fullstop_mutex); return 0; diff --git a/kernel/rcutree.c b/kernel/rcutree.c index d891740..bb84dec 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -102,6 +102,18 @@ static void invoke_rcu_cpu_kthread(void); #define RCU_KTHREAD_PRIO 1 /* RT priority for per-CPU kthreads. */ /* + * Track the rcutorture test sequence number and the update version + * number within a given test. The rcutorture_testseq is incremented + * on every rcutorture module load and unload, so has an odd value + * when a test is running. The rcutorture_vernum is set to zero + * when rcutorture starts and is incremented on each rcutorture update. + * These variables enable correlating rcutorture output with the + * RCU tracing information. + */ +unsigned long rcutorture_testseq; +unsigned long rcutorture_vernum; + +/* * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s * permit this function to be invoked without holding the root rcu_node * structure's ->lock, but of course results can be subject to change. @@ -193,6 +205,31 @@ void rcu_bh_force_quiescent_state(void) EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state); /* + * Record the number of times rcutorture tests have been initiated and + * terminated. This information allows the debugfs tracing stats to be + * correlated to the rcutorture messages, even when the rcutorture module + * is being repeatedly loaded and unloaded. In other words, we cannot + * store this state in rcutorture itself. + */ +void rcutorture_record_test_transition(void) +{ + rcutorture_testseq++; + rcutorture_vernum = 0; +} +EXPORT_SYMBOL_GPL(rcutorture_record_test_transition); + +/* + * Record the number of writer passes through the current rcutorture test. + * This is also used to correlate debugfs tracing stats with the rcutorture + * messages. + */ +void rcutorture_record_progress(unsigned long vernum) +{ + rcutorture_vernum++; +} +EXPORT_SYMBOL_GPL(rcutorture_record_progress); + +/* * Force a quiescent state for RCU-sched. */ void rcu_sched_force_quiescent_state(void) diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index fc40e89..3baa235 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -394,6 +394,29 @@ static const struct file_operations rcu_pending_fops = { .release = single_release, }; +static int show_rcutorture(struct seq_file *m, void *unused) +{ + seq_printf(m, "rcutorture test sequence: %lu %s\n", + rcutorture_testseq >> 1, + (rcutorture_testseq & 0x1) ? "(test in progress)" : ""); + seq_printf(m, "rcutorture update version number: %lu\n", + rcutorture_vernum); + return 0; +} + +static int rcutorture_open(struct inode *inode, struct file *file) +{ + return single_open(file, show_rcutorture, NULL); +} + +static const struct file_operations rcutorture_fops = { + .owner = THIS_MODULE, + .open = rcutorture_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + static struct dentry *rcudir; static int __init rcutree_trace_init(void) @@ -430,6 +453,11 @@ static int __init rcutree_trace_init(void) NULL, &rcu_pending_fops); if (!retval) goto free_out; + + retval = debugfs_create_file("rcutorture", 0444, rcudir, + NULL, &rcutorture_fops); + if (!retval) + goto free_out; return 0; free_out: debugfs_remove_recursive(rcudir); -- cgit v1.1 From 90e6ac3657fd3b0446d585082000af3cf46439a7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 6 Apr 2011 15:20:47 -0700 Subject: rcu: update tracing documentation for new rcutorture and rcuboost This commit documents the new debugfs rcu/rcutorture and rcu/rcuboost trace files. The description has been updated as suggested by Josh Triplett. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- Documentation/RCU/trace.txt | 186 ++++++++++++++++++++++++++++++++++++++------ 1 file changed, 161 insertions(+), 25 deletions(-) diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt index 5aefd5f..40b530d 100644 --- a/Documentation/RCU/trace.txt +++ b/Documentation/RCU/trace.txt @@ -10,34 +10,46 @@ for rcutree and next for rcutiny. CONFIG_TREE_RCU and CONFIG_TREE_PREEMPT_RCU debugfs Files and Formats -These implementations of RCU provides five debugfs files under the -top-level directory RCU: rcu/rcudata (which displays fields in struct -rcu_data), rcu/rcudata.csv (which is a .csv spreadsheet version of -rcu/rcudata), rcu/rcugp (which displays grace-period counters), -rcu/rcuhier (which displays the struct rcu_node hierarchy), and -rcu/rcu_pending (which displays counts of the reasons that the -rcu_pending() function decided that there was core RCU work to do). +These implementations of RCU provides several debugfs files under the +top-level directory "rcu": + +rcu/rcudata: + Displays fields in struct rcu_data. +rcu/rcudata.csv: + Comma-separated values spreadsheet version of rcudata. +rcu/rcugp: + Displays grace-period counters. +rcu/rcuhier: + Displays the struct rcu_node hierarchy. +rcu/rcu_pending: + Displays counts of the reasons rcu_pending() decided that RCU had + work to do. +rcu/rcutorture: + Displays rcutorture test progress. +rcu/rcuboost: + Displays RCU boosting statistics. Only present if + CONFIG_RCU_BOOST=y. The output of "cat rcu/rcudata" looks as follows: rcu_sched: - 0!c=423090 g=423091 pq=1 pqc=423090 qp=1 dt=86475/1/0 df=16319 of=163 ri=1519 ql=0 qs=.... b=10 ci=1460693 co=1648 ca=6448 - 1!c=423329 g=423330 pq=1 pqc=423329 qp=1 dt=90875/1/0 df=16231 of=157 ri=1249 ql=0 qs=.... b=10 ci=1459002 co=1614 ca=3310 - 2!c=423370 g=423371 pq=1 pqc=423370 qp=1 dt=69661/1/0 df=16125 of=163 ri=1469 ql=0 qs=.... b=10 ci=1610701 co=2015 ca=2378 - 3!c=422967 g=422968 pq=1 pqc=422967 qp=1 dt=70349/1/0 df=12528 of=163 ri=1450 ql=0 qs=.... b=10 ci=1427543 co=1430 ca=897 - 4!c=423196 g=423197 pq=1 pqc=423196 qp=0 dt=38935/1/0 df=10959 of=177 ri=1657 ql=0 qs=.... b=10 ci=1562249 co=1896 ca=533 - 5!c=422950 g=422951 pq=1 pqc=422950 qp=0 dt=25127/1/0 df=5895 of=167 ri=1549 ql=0 qs=.... b=10 ci=1777260 co=2137 ca=274 - 6!c=423396 g=423397 pq=1 pqc=423396 qp=1 dt=22639/1/0 df=4590 of=149 ri=1572 ql=0 qs=.... b=10 ci=1471186 co=1530 ca=243 - 7 c=460203 g=460203 pq=1 pqc=460202 qp=0 dt=937087/1/0 df=3298 of=149 ri=1584 ql=6 qs=N.W. b=10 ci=4026154 co=1948 ca=135 + 0!c=423090 g=423091 pq=1 pqc=423090 qp=1 dt=86475/1/0 df=16319 of=163 ri=1519 ql=0 qs=.... kt=0/W b=10 ci=1460693 co=1648 ca=6448 + 1!c=423329 g=423330 pq=1 pqc=423329 qp=1 dt=90875/1/0 df=16231 of=157 ri=1249 ql=0 qs=.... kt=0/W b=10 ci=1459002 co=1614 ca=3310 + 2!c=423370 g=423371 pq=1 pqc=423370 qp=1 dt=69661/1/0 df=16125 of=163 ri=1469 ql=0 qs=.... kt=0/W b=10 ci=1610701 co=2015 ca=2378 + 3!c=422967 g=422968 pq=1 pqc=422967 qp=1 dt=70349/1/0 df=12528 of=163 ri=1450 ql=0 qs=.... kt=0/W b=10 ci=1427543 co=1430 ca=897 + 4!c=423196 g=423197 pq=1 pqc=423196 qp=0 dt=38935/1/0 df=10959 of=177 ri=1657 ql=0 qs=.... kt=0/W b=10 ci=1562249 co=1896 ca=533 + 5!c=422950 g=422951 pq=1 pqc=422950 qp=0 dt=25127/1/0 df=5895 of=167 ri=1549 ql=0 qs=.... kt=0/W b=10 ci=1777260 co=2137 ca=274 + 6!c=423396 g=423397 pq=1 pqc=423396 qp=1 dt=22639/1/0 df=4590 of=149 ri=1572 ql=0 qs=.... kt=0/W b=10 ci=1471186 co=1530 ca=243 + 7 c=460203 g=460203 pq=1 pqc=460202 qp=0 dt=937087/1/0 df=3298 of=149 ri=1584 ql=6 qs=N.W. kt=0/W b=10 ci=4026154 co=1948 ca=135 rcu_bh: - 0!c=18446744073709551494 g=18446744073709551494 pq=0 pqc=18446744073709551493 qp=1 dt=86475/1/0 df=11 of=0 ri=0 ql=0 qs=.... b=10 ci=112 co=0 ca=0 - 1!c=18446744073709551496 g=18446744073709551496 pq=1 pqc=18446744073709551495 qp=0 dt=90875/1/0 df=15 of=0 ri=0 ql=0 qs=.... b=10 ci=143 co=0 ca=0 - 2!c=18446744073709551496 g=18446744073709551496 pq=1 pqc=18446744073709551495 qp=0 dt=69661/1/0 df=21 of=0 ri=1 ql=0 qs=.... b=10 ci=88 co=0 ca=0 - 3!c=18446744073709551494 g=18446744073709551494 pq=1 pqc=18446744073709551493 qp=0 dt=70349/1/0 df=13 of=0 ri=0 ql=0 qs=.... b=10 ci=100 co=0 ca=0 - 4!c=18446744073709551494 g=18446744073709551494 pq=0 pqc=18446744073709551493 qp=1 dt=38935/1/0 df=17 of=0 ri=0 ql=0 qs=.... b=10 ci=36 co=0 ca=0 - 5!c=18446744073709551494 g=18446744073709551494 pq=0 pqc=18446744073709551493 qp=1 dt=25127/1/0 df=7 of=0 ri=0 ql=0 qs=.... b=10 ci=32 co=0 ca=0 - 6!c=18446744073709551496 g=18446744073709551496 pq=1 pqc=18446744073709551495 qp=0 dt=22639/1/0 df=9 of=0 ri=0 ql=0 qs=.... b=10 ci=44 co=0 ca=0 - 7 c=182 g=182 pq=1 pqc=181 qp=0 dt=937087/1/0 df=14 of=0 ri=1 ql=0 qs=.... b=10 ci=627 co=0 ca=0 + 0!c=18446744073709551494 g=18446744073709551494 pq=0 pqc=18446744073709551493 qp=1 dt=86475/1/0 df=11 of=0 ri=0 ql=0 qs=.... kt=0/W b=10 ci=112 co=0 ca=0 + 1!c=18446744073709551496 g=18446744073709551496 pq=1 pqc=18446744073709551495 qp=0 dt=90875/1/0 df=15 of=0 ri=0 ql=0 qs=.... kt=0/W b=10 ci=143 co=0 ca=0 + 2!c=18446744073709551496 g=18446744073709551496 pq=1 pqc=18446744073709551495 qp=0 dt=69661/1/0 df=21 of=0 ri=1 ql=0 qs=.... kt=0/W b=10 ci=88 co=0 ca=0 + 3!c=18446744073709551494 g=18446744073709551494 pq=1 pqc=18446744073709551493 qp=0 dt=70349/1/0 df=13 of=0 ri=0 ql=0 qs=.... kt=0/W b=10 ci=100 co=0 ca=0 + 4!c=18446744073709551494 g=18446744073709551494 pq=0 pqc=18446744073709551493 qp=1 dt=38935/1/0 df=17 of=0 ri=0 ql=0 qs=.... kt=0/W b=10 ci=36 co=0 ca=0 + 5!c=18446744073709551494 g=18446744073709551494 pq=0 pqc=18446744073709551493 qp=1 dt=25127/1/0 df=7 of=0 ri=0 ql=0 qs=.... kt=0/W b=10 ci=32 co=0 ca=0 + 6!c=18446744073709551496 g=18446744073709551496 pq=1 pqc=18446744073709551495 qp=0 dt=22639/1/0 df=9 of=0 ri=0 ql=0 qs=.... kt=0/W b=10 ci=44 co=0 ca=0 + 7 c=182 g=182 pq=1 pqc=181 qp=0 dt=937087/1/0 df=14 of=0 ri=1 ql=0 qs=.... kt=0/W b=10 ci=627 co=0 ca=0 The first section lists the rcu_data structures for rcu_sched, the second for rcu_bh. Note that CONFIG_TREE_PREEMPT_RCU kernels will have an @@ -146,6 +158,23 @@ o "qs" gives an indication of the state of the callback queue If there are no callbacks in a given one of the above states, the corresponding character is replaced by ".". +o "kt" is the per-CPU kernel-thread state. The digit preceding + the slash is zero if there is no work pending and 1 otherwise. + The character after the slash is as follows: + + "S" The kernel thread is stopped, in other words, all + CPUs corresponding to this rcu_node structure are + offline. + + "R" The kernel thread is running. + + "W" The kernel thread is waiting because there is no work + for it to do. + + "Y" The kernel thread is yielding to avoid hogging CPU. + + "?" Unknown value, indicates a bug. + o "b" is the batch limit for this CPU. If more than this number of RCU callbacks is ready to invoke, then the remainder will be deferred. @@ -356,6 +385,113 @@ o "nn" is the number of times that this CPU needed nothing. Alert is due to short-circuit evaluation in rcu_pending(). +The output of "cat rcu/rcutorture" looks as follows: + +rcutorture test sequence: 0 (test in progress) +rcutorture update version number: 615 + +The first line shows the number of rcutorture tests that have completed +since boot. If a test is currently running, the "(test in progress)" +string will appear as shown above. The second line shows the number of +update cycles that the current test has started, or zero if there is +no test in progress. + + +The output of "cat rcu/rcuboost" looks as follows: + +0:5 tasks=.... kt=W ntb=0 neb=0 nnb=0 j=2f95 bt=300f + balk: nt=0 egt=989 bt=0 nb=0 ny=0 nos=16 +6:7 tasks=.... kt=W ntb=0 neb=0 nnb=0 j=2f95 bt=300f + balk: nt=0 egt=225 bt=0 nb=0 ny=0 nos=6 + +This information is output only for rcu_preempt. Each two-line entry +corresponds to a leaf rcu_node strcuture. The fields are as follows: + +o "n:m" is the CPU-number range for the corresponding two-line + entry. In the sample output above, the first entry covers + CPUs zero through five and the second entry covers CPUs 6 + and 7. + +o "tasks=TNEB" gives the state of the various segments of the + rnp->blocked_tasks list: + + "T" This indicates that there are some tasks that blocked + while running on one of the corresponding CPUs while + in an RCU read-side critical section. + + "N" This indicates that some of the blocked tasks are preventing + the current normal (non-expedited) grace period from + completing. + + "E" This indicates that some of the blocked tasks are preventing + the current expedited grace period from completing. + + "B" This indicates that some of the blocked tasks are in + need of RCU priority boosting. + + Each character is replaced with "." if the corresponding + condition does not hold. + +o "kt" is the state of the RCU priority-boosting kernel + thread associated with the corresponding rcu_node structure. + The state can be one of the following: + + "S" The kernel thread is stopped, in other words, all + CPUs corresponding to this rcu_node structure are + offline. + + "R" The kernel thread is running. + + "W" The kernel thread is waiting because there is no work + for it to do. + + "Y" The kernel thread is yielding to avoid hogging CPU. + + "?" Unknown value, indicates a bug. + +o "ntb" is the number of tasks boosted. + +o "neb" is the number of tasks boosted in order to complete an + expedited grace period. + +o "nnb" is the number of tasks boosted in order to complete a + normal (non-expedited) grace period. When boosting a task + that was blocking both an expedited and a normal grace period, + it is counted against the expedited total above. + +o "j" is the low-order 16 bits of the jiffies counter in + hexadecimal. + +o "bt" is the low-order 16 bits of the value that the jiffies + counter will have when we next start boosting, assuming that + the current grace period does not end beforehand. This is + also in hexadecimal. + +o "balk: nt" counts the number of times we didn't boost (in + other words, we balked) even though it was time to boost because + there were no blocked tasks to boost. This situation occurs + when there is one blocked task on one rcu_node structure and + none on some other rcu_node structure. + +o "egt" counts the number of times we balked because although + there were blocked tasks, none of them were blocking the + current grace period, whether expedited or otherwise. + +o "bt" counts the number of times we balked because boosting + had already been initiated for the current grace period. + +o "nb" counts the number of times we balked because there + was at least one task blocking the current non-expedited grace + period that never had blocked. If it is already running, it + just won't help to boost its priority! + +o "ny" counts the number of times we balked because it was + not yet time to start boosting. + +o "nos" counts the number of times we balked for other + reasons, e.g., the grace period ended first. + + CONFIG_TINY_RCU and CONFIG_TINY_PREEMPT_RCU debugfs Files and Formats These implementations of RCU provides a single debugfs file under the @@ -422,9 +558,9 @@ o "neb" is the number of expedited grace periods that have had o "nnb" is the number of normal grace periods that have had to resort to RCU priority boosting since boot. -o "j" is the low-order 12 bits of the jiffies counter in hexadecimal. +o "j" is the low-order 16 bits of the jiffies counter in hexadecimal. -o "bt" is the low-order 12 bits of the value that the jiffies counter +o "bt" is the low-order 16 bits of the value that the jiffies counter will have at the next time that boosting is scheduled to begin. o In the line beginning with "normal balk", the fields are as follows: -- cgit v1.1 From a9f4793d8900dc5dc09b3951bdcd4731290e06fe Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 2 May 2011 03:46:10 -0700 Subject: rcu: fix tracing bug thinko on boost-balk attribution The rcu_initiate_boost_trace() function mis-attributed refusals to initiate RCU priority boosting that were in fact due to its not yet being time to boost. This patch fixes the faulty comparison. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutree_plugin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 22a6a46..a21413d 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -1086,7 +1086,7 @@ static void rcu_initiate_boost_trace(struct rcu_node *rnp) else if (rnp->gp_tasks != NULL && rnp->qsmask != 0) rnp->n_balk_notblocked++; else if (rnp->gp_tasks != NULL && - ULONG_CMP_GE(jiffies, rnp->boost_time)) + ULONG_CMP_LT(jiffies, rnp->boost_time)) rnp->n_balk_notyet++; else rnp->n_balk_nos++; -- cgit v1.1 From 15ba0ba860871cf74b48b1bb47c26c91a66126f3 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 6 Apr 2011 16:01:16 -0700 Subject: rcu: add grace-period age and more kthread state to tracing This commit adds the age in jiffies of the current grace period along with the duration in jiffies of the longest grace period since boot to the rcu/rcugp debugfs file. It also adds an additional "O" state to kthread tracing to differentiate between the kthread waiting due to having nothing to do on the one hand and waiting due to being on the wrong CPU on the other hand. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- Documentation/RCU/trace.txt | 12 ++++++++++-- kernel/rcutree.c | 10 ++++++++++ kernel/rcutree.h | 7 +++++-- kernel/rcutree_trace.c | 37 +++++++++++++++++++++++++++++-------- 4 files changed, 54 insertions(+), 12 deletions(-) diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt index 40b530d..fd4bffb 100644 --- a/Documentation/RCU/trace.txt +++ b/Documentation/RCU/trace.txt @@ -159,8 +159,8 @@ o "qs" gives an indication of the state of the callback queue the corresponding character is replaced by ".". o "kt" is the per-CPU kernel-thread state. The digit preceding - the slash is zero if there is no work pending and 1 otherwise. - The character after the slash is as follows: + the first slash is zero if there is no work pending and 1 + otherwise. The character between the slashes is as follows: "S" The kernel thread is stopped, in other words, all CPUs corresponding to this rcu_node structure are @@ -171,10 +171,18 @@ o "kt" is the per-CPU kernel-thread state. The digit preceding "W" The kernel thread is waiting because there is no work for it to do. + "O" The kernel thread is waiting because it has been + forced off of its designated CPU or because its + ->cpus_allowed mask permits it to run on other than + its designated CPU. + "Y" The kernel thread is yielding to avoid hogging CPU. "?" Unknown value, indicates a bug. + The number after the final slash is the CPU that the kthread + is actually running on. + o "b" is the batch limit for this CPU. If more than this number of RCU callbacks is ready to invoke, then the remainder will be deferred. diff --git a/kernel/rcutree.c b/kernel/rcutree.c index bb84dec..27b6d8d 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -92,6 +92,7 @@ EXPORT_SYMBOL_GPL(rcu_scheduler_active); */ static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task); DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status); +DEFINE_PER_CPU(int, rcu_cpu_kthread_cpu); static DEFINE_PER_CPU(wait_queue_head_t, rcu_cpu_wq); DEFINE_PER_CPU(char, rcu_cpu_has_work); static char rcu_kthreads_spawnable; @@ -888,6 +889,8 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) __releases(rcu_get_root(rsp)->lock) { + unsigned long gp_duration; + WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); /* @@ -895,6 +898,9 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) * is seen before the assignment to rsp->completed. */ smp_mb(); /* See above block comment. */ + gp_duration = jiffies - rsp->gp_start; + if (gp_duration > rsp->gp_max) + rsp->gp_max = gp_duration; rsp->completed = rsp->gpnum; rsp->signaled = RCU_GP_IDLE; rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */ @@ -1583,12 +1589,15 @@ static int rcu_cpu_kthread_should_stop(int cpu) smp_processor_id() != cpu) { if (kthread_should_stop()) return 1; + per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU; + per_cpu(rcu_cpu_kthread_cpu, cpu) = raw_smp_processor_id(); local_bh_enable(); schedule_timeout_uninterruptible(1); if (!cpumask_equal(¤t->cpus_allowed, cpumask_of(cpu))) set_cpus_allowed_ptr(current, cpumask_of(cpu)); local_bh_disable(); } + per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu; return 0; } @@ -1656,6 +1665,7 @@ static int __cpuinit rcu_spawn_one_cpu_kthread(int cpu) if (IS_ERR(t)) return PTR_ERR(t); kthread_bind(t, cpu); + per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu; WARN_ON_ONCE(per_cpu(rcu_cpu_kthread_task, cpu) != NULL); per_cpu(rcu_cpu_kthread_task, cpu) = t; wake_up_process(t); diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 67341db..37502a2 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -93,8 +93,9 @@ struct rcu_dynticks { #define RCU_KTHREAD_STOPPED 0 #define RCU_KTHREAD_RUNNING 1 #define RCU_KTHREAD_WAITING 2 -#define RCU_KTHREAD_YIELDING 3 -#define RCU_KTHREAD_MAX 3 +#define RCU_KTHREAD_OFFCPU 3 +#define RCU_KTHREAD_YIELDING 4 +#define RCU_KTHREAD_MAX 4 /* * Definition for node within the RCU grace-period-detection hierarchy. @@ -383,6 +384,8 @@ struct rcu_state { /* but in jiffies. */ unsigned long jiffies_stall; /* Time at which to check */ /* for CPU stalls. */ + unsigned long gp_max; /* Maximum GP duration in */ + /* jiffies. */ char *name; /* Name of structure. */ }; diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index 3baa235..564b8fe 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -47,13 +47,14 @@ #include "rcutree.h" DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status); +DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_cpu); DECLARE_PER_CPU(char, rcu_cpu_has_work); static char convert_kthread_status(unsigned int kthread_status) { if (kthread_status > RCU_KTHREAD_MAX) return '?'; - return "SRWY"[kthread_status]; + return "SRWOY"[kthread_status]; } static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) @@ -74,7 +75,7 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) rdp->dynticks_fqs); #endif /* #ifdef CONFIG_NO_HZ */ seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi); - seq_printf(m, " ql=%ld qs=%c%c%c%c kt=%d/%c b=%ld", + seq_printf(m, " ql=%ld qs=%c%c%c%c kt=%d/%c/%d b=%ld", rdp->qlen, ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != rdp->nxttail[RCU_NEXT_TAIL]], @@ -86,6 +87,7 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) per_cpu(rcu_cpu_has_work, rdp->cpu), convert_kthread_status(per_cpu(rcu_cpu_kthread_status, rdp->cpu)), + per_cpu(rcu_cpu_kthread_cpu, rdp->cpu), rdp->blimit); seq_printf(m, " ci=%lu co=%lu ca=%lu\n", rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); @@ -312,16 +314,35 @@ static const struct file_operations rcuhier_fops = { .release = single_release, }; +static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp) +{ + unsigned long flags; + unsigned long completed; + unsigned long gpnum; + unsigned long gpage; + unsigned long gpmax; + struct rcu_node *rnp = &rsp->node[0]; + + raw_spin_lock_irqsave(&rnp->lock, flags); + completed = rsp->completed; + gpnum = rsp->gpnum; + if (rsp->completed == rsp->gpnum) + gpage = 0; + else + gpage = jiffies - rsp->gp_start; + gpmax = rsp->gp_max; + raw_spin_unlock_irqrestore(&rnp->lock, flags); + seq_printf(m, "%s: completed=%ld gpnum=%lu age=%ld max=%ld\n", + rsp->name, completed, gpnum, gpage, gpmax); +} + static int show_rcugp(struct seq_file *m, void *unused) { #ifdef CONFIG_TREE_PREEMPT_RCU - seq_printf(m, "rcu_preempt: completed=%ld gpnum=%lu\n", - rcu_preempt_state.completed, rcu_preempt_state.gpnum); + show_one_rcugp(m, &rcu_preempt_state); #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ - seq_printf(m, "rcu_sched: completed=%ld gpnum=%lu\n", - rcu_sched_state.completed, rcu_sched_state.gpnum); - seq_printf(m, "rcu_bh: completed=%ld gpnum=%lu\n", - rcu_bh_state.completed, rcu_bh_state.gpnum); + show_one_rcugp(m, &rcu_sched_state); + show_one_rcugp(m, &rcu_bh_state); return 0; } -- cgit v1.1 From 5ece5bab3ed8594ce2c85c6c6e6b82109db36ca7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 22 Apr 2011 18:08:51 -0700 Subject: rcu: Add forward-progress diagnostic for per-CPU kthreads Increment a per-CPU counter on each pass through rcu_cpu_kthread()'s service loop, and add it to the rcudata trace output. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- Documentation/RCU/trace.txt | 39 ++++++++++++++++++++++----------------- kernel/rcutree.c | 2 ++ kernel/rcutree_trace.c | 4 +++- 3 files changed, 27 insertions(+), 18 deletions(-) diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt index fd4bffb..8173cec 100644 --- a/Documentation/RCU/trace.txt +++ b/Documentation/RCU/trace.txt @@ -33,23 +33,23 @@ rcu/rcuboost: The output of "cat rcu/rcudata" looks as follows: rcu_sched: - 0!c=423090 g=423091 pq=1 pqc=423090 qp=1 dt=86475/1/0 df=16319 of=163 ri=1519 ql=0 qs=.... kt=0/W b=10 ci=1460693 co=1648 ca=6448 - 1!c=423329 g=423330 pq=1 pqc=423329 qp=1 dt=90875/1/0 df=16231 of=157 ri=1249 ql=0 qs=.... kt=0/W b=10 ci=1459002 co=1614 ca=3310 - 2!c=423370 g=423371 pq=1 pqc=423370 qp=1 dt=69661/1/0 df=16125 of=163 ri=1469 ql=0 qs=.... kt=0/W b=10 ci=1610701 co=2015 ca=2378 - 3!c=422967 g=422968 pq=1 pqc=422967 qp=1 dt=70349/1/0 df=12528 of=163 ri=1450 ql=0 qs=.... kt=0/W b=10 ci=1427543 co=1430 ca=897 - 4!c=423196 g=423197 pq=1 pqc=423196 qp=0 dt=38935/1/0 df=10959 of=177 ri=1657 ql=0 qs=.... kt=0/W b=10 ci=1562249 co=1896 ca=533 - 5!c=422950 g=422951 pq=1 pqc=422950 qp=0 dt=25127/1/0 df=5895 of=167 ri=1549 ql=0 qs=.... kt=0/W b=10 ci=1777260 co=2137 ca=274 - 6!c=423396 g=423397 pq=1 pqc=423396 qp=1 dt=22639/1/0 df=4590 of=149 ri=1572 ql=0 qs=.... kt=0/W b=10 ci=1471186 co=1530 ca=243 - 7 c=460203 g=460203 pq=1 pqc=460202 qp=0 dt=937087/1/0 df=3298 of=149 ri=1584 ql=6 qs=N.W. kt=0/W b=10 ci=4026154 co=1948 ca=135 + 0 c=20972 g=20973 pq=1 pqc=20972 qp=0 dt=545/1/0 df=50 of=0 ri=0 ql=163 qs=NRW. kt=0/W/0 ktl=ebc3 b=10 ci=153737 co=0 ca=0 + 1 c=20972 g=20973 pq=1 pqc=20972 qp=0 dt=967/1/0 df=58 of=0 ri=0 ql=634 qs=NRW. kt=0/W/1 ktl=58c b=10 ci=191037 co=0 ca=0 + 2 c=20972 g=20973 pq=1 pqc=20972 qp=0 dt=1081/1/0 df=175 of=0 ri=0 ql=74 qs=N.W. kt=0/W/2 ktl=da94 b=10 ci=75991 co=0 ca=0 + 3 c=20942 g=20943 pq=1 pqc=20942 qp=1 dt=1846/0/0 df=404 of=0 ri=0 ql=0 qs=.... kt=0/W/3 ktl=d1cd b=10 ci=72261 co=0 ca=0 + 4 c=20972 g=20973 pq=1 pqc=20972 qp=0 dt=369/1/0 df=83 of=0 ri=0 ql=48 qs=N.W. kt=0/W/4 ktl=e0e7 b=10 ci=128365 co=0 ca=0 + 5 c=20972 g=20973 pq=1 pqc=20972 qp=0 dt=381/1/0 df=64 of=0 ri=0 ql=169 qs=NRW. kt=0/W/5 ktl=fb2f b=10 ci=164360 co=0 ca=0 + 6 c=20972 g=20973 pq=1 pqc=20972 qp=0 dt=1037/1/0 df=183 of=0 ri=0 ql=62 qs=N.W. kt=0/W/6 ktl=d2ad b=10 ci=65663 co=0 ca=0 + 7 c=20897 g=20897 pq=1 pqc=20896 qp=0 dt=1572/0/0 df=382 of=0 ri=0 ql=0 qs=.... kt=0/W/7 ktl=cf15 b=10 ci=75006 co=0 ca=0 rcu_bh: - 0!c=18446744073709551494 g=18446744073709551494 pq=0 pqc=18446744073709551493 qp=1 dt=86475/1/0 df=11 of=0 ri=0 ql=0 qs=.... kt=0/W b=10 ci=112 co=0 ca=0 - 1!c=18446744073709551496 g=18446744073709551496 pq=1 pqc=18446744073709551495 qp=0 dt=90875/1/0 df=15 of=0 ri=0 ql=0 qs=.... kt=0/W b=10 ci=143 co=0 ca=0 - 2!c=18446744073709551496 g=18446744073709551496 pq=1 pqc=18446744073709551495 qp=0 dt=69661/1/0 df=21 of=0 ri=1 ql=0 qs=.... kt=0/W b=10 ci=88 co=0 ca=0 - 3!c=18446744073709551494 g=18446744073709551494 pq=1 pqc=18446744073709551493 qp=0 dt=70349/1/0 df=13 of=0 ri=0 ql=0 qs=.... kt=0/W b=10 ci=100 co=0 ca=0 - 4!c=18446744073709551494 g=18446744073709551494 pq=0 pqc=18446744073709551493 qp=1 dt=38935/1/0 df=17 of=0 ri=0 ql=0 qs=.... kt=0/W b=10 ci=36 co=0 ca=0 - 5!c=18446744073709551494 g=18446744073709551494 pq=0 pqc=18446744073709551493 qp=1 dt=25127/1/0 df=7 of=0 ri=0 ql=0 qs=.... kt=0/W b=10 ci=32 co=0 ca=0 - 6!c=18446744073709551496 g=18446744073709551496 pq=1 pqc=18446744073709551495 qp=0 dt=22639/1/0 df=9 of=0 ri=0 ql=0 qs=.... kt=0/W b=10 ci=44 co=0 ca=0 - 7 c=182 g=182 pq=1 pqc=181 qp=0 dt=937087/1/0 df=14 of=0 ri=1 ql=0 qs=.... kt=0/W b=10 ci=627 co=0 ca=0 + 0 c=1480 g=1480 pq=1 pqc=1479 qp=0 dt=545/1/0 df=6 of=0 ri=1 ql=0 qs=.... kt=0/W/0 ktl=ebc3 b=10 ci=0 co=0 ca=0 + 1 c=1480 g=1480 pq=1 pqc=1479 qp=0 dt=967/1/0 df=3 of=0 ri=1 ql=0 qs=.... kt=0/W/1 ktl=58c b=10 ci=151 co=0 ca=0 + 2 c=1480 g=1480 pq=1 pqc=1479 qp=0 dt=1081/1/0 df=6 of=0 ri=1 ql=0 qs=.... kt=0/W/2 ktl=da94 b=10 ci=0 co=0 ca=0 + 3 c=1480 g=1480 pq=1 pqc=1479 qp=0 dt=1846/0/0 df=8 of=0 ri=1 ql=0 qs=.... kt=0/W/3 ktl=d1cd b=10 ci=0 co=0 ca=0 + 4 c=1480 g=1480 pq=1 pqc=1479 qp=0 dt=369/1/0 df=6 of=0 ri=1 ql=0 qs=.... kt=0/W/4 ktl=e0e7 b=10 ci=0 co=0 ca=0 + 5 c=1480 g=1480 pq=1 pqc=1479 qp=0 dt=381/1/0 df=4 of=0 ri=1 ql=0 qs=.... kt=0/W/5 ktl=fb2f b=10 ci=0 co=0 ca=0 + 6 c=1480 g=1480 pq=1 pqc=1479 qp=0 dt=1037/1/0 df=6 of=0 ri=1 ql=0 qs=.... kt=0/W/6 ktl=d2ad b=10 ci=0 co=0 ca=0 + 7 c=1474 g=1474 pq=1 pqc=1473 qp=0 dt=1572/0/0 df=8 of=0 ri=1 ql=0 qs=.... kt=0/W/7 ktl=cf15 b=10 ci=0 co=0 ca=0 The first section lists the rcu_data structures for rcu_sched, the second for rcu_bh. Note that CONFIG_TREE_PREEMPT_RCU kernels will have an @@ -160,7 +160,8 @@ o "qs" gives an indication of the state of the callback queue o "kt" is the per-CPU kernel-thread state. The digit preceding the first slash is zero if there is no work pending and 1 - otherwise. The character between the slashes is as follows: + otherwise. The character between the first pair of slashes is + as follows: "S" The kernel thread is stopped, in other words, all CPUs corresponding to this rcu_node structure are @@ -183,6 +184,10 @@ o "kt" is the per-CPU kernel-thread state. The digit preceding The number after the final slash is the CPU that the kthread is actually running on. +o "ktl" is the low-order 16 bits (in hexadecimal) of the count of + the number of times that this CPU's per-CPU kthread has gone + through its loop servicing invoke_rcu_cpu_kthread() requests. + o "b" is the batch limit for this CPU. If more than this number of RCU callbacks is ready to invoke, then the remainder will be deferred. diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 27b6d8d..575d641 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -93,6 +93,7 @@ EXPORT_SYMBOL_GPL(rcu_scheduler_active); static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task); DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status); DEFINE_PER_CPU(int, rcu_cpu_kthread_cpu); +DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops); static DEFINE_PER_CPU(wait_queue_head_t, rcu_cpu_wq); DEFINE_PER_CPU(char, rcu_cpu_has_work); static char rcu_kthreads_spawnable; @@ -1625,6 +1626,7 @@ static int rcu_cpu_kthread(void *arg) break; } *statusp = RCU_KTHREAD_RUNNING; + per_cpu(rcu_cpu_kthread_loops, cpu)++; local_irq_save(flags); work = *workp; *workp = 0; diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index 564b8fe..9678cc3 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -48,6 +48,7 @@ DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status); DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_cpu); +DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_loops); DECLARE_PER_CPU(char, rcu_cpu_has_work); static char convert_kthread_status(unsigned int kthread_status) @@ -75,7 +76,7 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) rdp->dynticks_fqs); #endif /* #ifdef CONFIG_NO_HZ */ seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi); - seq_printf(m, " ql=%ld qs=%c%c%c%c kt=%d/%c/%d b=%ld", + seq_printf(m, " ql=%ld qs=%c%c%c%c kt=%d/%c/%d ktl=%x b=%ld", rdp->qlen, ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != rdp->nxttail[RCU_NEXT_TAIL]], @@ -88,6 +89,7 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) convert_kthread_status(per_cpu(rcu_cpu_kthread_status, rdp->cpu)), per_cpu(rcu_cpu_kthread_cpu, rdp->cpu), + per_cpu(rcu_cpu_kthread_loops, rdp->cpu) & 0xffff, rdp->blimit); seq_printf(m, " ci=%lu co=%lu ca=%lu\n", rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); -- cgit v1.1 From fc2ecf7ec76c5ee150b83dcefc863fa03fd365fb Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Wed, 23 Feb 2011 09:42:14 -0800 Subject: rcu: Enable DEBUG_OBJECTS_RCU_HEAD from !PREEMPT The prohibition of DEBUG_OBJECTS_RCU_HEAD from !PREEMPT was due to the fixup actions. So just produce a warning from !PREEMPT. Signed-off-by: Mathieu Desnoyers Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcupdate.c | 26 +++++++++++++++++++++----- lib/Kconfig.debug | 2 +- 2 files changed, 22 insertions(+), 6 deletions(-) diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index f3240e9..b54d6d1 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -142,7 +142,14 @@ static int rcuhead_fixup_init(void *addr, enum debug_obj_state state) * Ensure that queued callbacks are all executed. * If we detect that we are nested in a RCU read-side critical * section, we should simply fail, otherwise we would deadlock. + * In !PREEMPT configurations, there is no way to tell if we are + * in a RCU read-side critical section or not, so we never + * attempt any fixup and just print a warning. */ +#ifndef CONFIG_PREEMPT + WARN_ON(1); + return 0; +#endif if (rcu_preempt_depth() != 0 || preempt_count() != 0 || irqs_disabled()) { WARN_ON(1); @@ -184,7 +191,14 @@ static int rcuhead_fixup_activate(void *addr, enum debug_obj_state state) * Ensure that queued callbacks are all executed. * If we detect that we are nested in a RCU read-side critical * section, we should simply fail, otherwise we would deadlock. + * In !PREEMPT configurations, there is no way to tell if we are + * in a RCU read-side critical section or not, so we never + * attempt any fixup and just print a warning. */ +#ifndef CONFIG_PREEMPT + WARN_ON(1); + return 0; +#endif if (rcu_preempt_depth() != 0 || preempt_count() != 0 || irqs_disabled()) { WARN_ON(1); @@ -214,12 +228,14 @@ static int rcuhead_fixup_free(void *addr, enum debug_obj_state state) * Ensure that queued callbacks are all executed. * If we detect that we are nested in a RCU read-side critical * section, we should simply fail, otherwise we would deadlock. - * Note that the machinery to reliably determine whether - * or not we are in an RCU read-side critical section - * exists only in the preemptible RCU implementations - * (TINY_PREEMPT_RCU and TREE_PREEMPT_RCU), which is why - * DEBUG_OBJECTS_RCU_HEAD is disallowed if !PREEMPT. + * In !PREEMPT configurations, there is no way to tell if we are + * in a RCU read-side critical section or not, so we never + * attempt any fixup and just print a warning. */ +#ifndef CONFIG_PREEMPT + WARN_ON(1); + return 0; +#endif if (rcu_preempt_depth() != 0 || preempt_count() != 0 || irqs_disabled()) { WARN_ON(1); diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 93ce6de3..3aa2780 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -337,7 +337,7 @@ config DEBUG_OBJECTS_WORK config DEBUG_OBJECTS_RCU_HEAD bool "Debug RCU callbacks objects" - depends on DEBUG_OBJECTS && PREEMPT + depends on DEBUG_OBJECTS help Enable this to turn on debugging of RCU list heads (call_rcu() usage). -- cgit v1.1 From b0c9d7ff2793502650ad987c3f237d5fe5587a1e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 29 Mar 2011 12:56:56 -0700 Subject: rcu: add DEBUG_OBJECTS_RCU_HEAD check for alignment Verify that rcu_head structures are aligned to a four-byte boundary. This check is enabled by CONFIG_DEBUG_OBJECTS_RCU_HEAD. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- include/linux/rcupdate.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 9e169c2..c7aeacf 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -785,6 +785,7 @@ extern struct debug_obj_descr rcuhead_debug_descr; static inline void debug_rcu_head_queue(struct rcu_head *head) { + WARN_ON_ONCE((unsigned long)head & 0x3); debug_object_activate(head, &rcuhead_debug_descr); debug_object_active_state(head, &rcuhead_debug_descr, STATE_RCU_HEAD_READY, -- cgit v1.1 From 561190e3b3db372403fb6a327b0121b4cae1b87e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 30 Mar 2011 09:10:44 -0700 Subject: rcu: mark rcutorture boosting callback as being on-stack The CONFIG_DEBUG_OBJECTS_RCU_HEAD facility requires that on-stack RCU callbacks be flagged explicitly to debug-objects using the init_rcu_head_on_stack() and destroy_rcu_head_on_stack() functions. This commit applies those functions to the rcutorture code that tests RCU priority boosting. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutorture.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index c2f58ec..2e138db 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -749,6 +749,7 @@ static int rcu_torture_boost(void *arg) n_rcu_torture_boost_rterror++; } + init_rcu_head_on_stack(&rbi.rcu); /* Each pass through the following loop does one boost-test cycle. */ do { /* Wait for the next test interval. */ @@ -808,6 +809,7 @@ checkwait: rcu_stutter_wait("rcu_torture_boost"); /* Clean up and exit. */ VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping"); + destroy_rcu_head_on_stack(&rbi.rcu); rcutorture_shutdown_absorb("rcu_torture_boost"); while (!kthread_should_stop() || rbi.inflight) schedule_timeout_uninterruptible(1); -- cgit v1.1 From 108aae22339f445c134aeb48eca25df1014ab08d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 23 Feb 2011 09:56:00 -0800 Subject: rcu: Use WARN_ON_ONCE for DEBUG_OBJECTS_RCU_HEAD warnings Avoid additional multiple-warning confusion in memory-corruption scenarios. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcupdate.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index b54d6d1..7784bd2 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -147,12 +147,12 @@ static int rcuhead_fixup_init(void *addr, enum debug_obj_state state) * attempt any fixup and just print a warning. */ #ifndef CONFIG_PREEMPT - WARN_ON(1); + WARN_ON_ONCE(1); return 0; #endif if (rcu_preempt_depth() != 0 || preempt_count() != 0 || irqs_disabled()) { - WARN_ON(1); + WARN_ON_ONCE(1); return 0; } rcu_barrier(); @@ -196,12 +196,12 @@ static int rcuhead_fixup_activate(void *addr, enum debug_obj_state state) * attempt any fixup and just print a warning. */ #ifndef CONFIG_PREEMPT - WARN_ON(1); + WARN_ON_ONCE(1); return 0; #endif if (rcu_preempt_depth() != 0 || preempt_count() != 0 || irqs_disabled()) { - WARN_ON(1); + WARN_ON_ONCE(1); return 0; } rcu_barrier(); @@ -233,12 +233,12 @@ static int rcuhead_fixup_free(void *addr, enum debug_obj_state state) * attempt any fixup and just print a warning. */ #ifndef CONFIG_PREEMPT - WARN_ON(1); + WARN_ON_ONCE(1); return 0; #endif if (rcu_preempt_depth() != 0 || preempt_count() != 0 || irqs_disabled()) { - WARN_ON(1); + WARN_ON_ONCE(1); return 0; } rcu_barrier(); -- cgit v1.1 From f0a07aeaf8935b7e9ef8032ce6546169f143951c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 23 Feb 2011 11:10:52 -0800 Subject: rcu: Switch to this_cpu() primitives This removes a couple of lines from invoke_rcu_cpu_kthread(), improving readability. Reported-by: Christoph Lameter Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 575d641..27304bc 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1480,18 +1480,14 @@ static void rcu_process_callbacks(void) static void invoke_rcu_cpu_kthread(void) { unsigned long flags; - wait_queue_head_t *q; - int cpu; local_irq_save(flags); - cpu = smp_processor_id(); - per_cpu(rcu_cpu_has_work, cpu) = 1; - if (per_cpu(rcu_cpu_kthread_task, cpu) == NULL) { + __this_cpu_write(rcu_cpu_has_work, 1); + if (__this_cpu_read(rcu_cpu_kthread_task) == NULL) { local_irq_restore(flags); return; } - q = &per_cpu(rcu_cpu_wq, cpu); - wake_up(q); + wake_up(&__get_cpu_var(rcu_cpu_wq)); local_irq_restore(flags); } -- cgit v1.1 From ddeb75814f09205df795121d9e373e82de7f2aca Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 23 Feb 2011 17:03:06 -0800 Subject: rcu: code cleanups in TINY_RCU priority boosting. Extraneous semicolon, bad comment, and fold INIT_LIST_HEAD() into list_del() to get list_del_init(). Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutiny_plugin.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index 3cb8e36..0bc9f73 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h @@ -339,7 +339,7 @@ static void rcu_initiate_expedited_boost(void) raw_local_irq_restore(flags); } -#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000); +#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000) /* * Do priority-boost accounting for the start of a new grace period. @@ -418,7 +418,7 @@ static void rcu_preempt_cpu_qs(void) if (!rcu_preempt_gp_in_progress()) return; /* - * Check up on boosting. If there are no readers blocking the + * Check up on boosting. If there are readers blocking the * current grace period, leave. */ if (rcu_initiate_boost()) @@ -578,7 +578,7 @@ static void rcu_read_unlock_special(struct task_struct *t) empty = !rcu_preempt_blocked_readers_cgp(); empty_exp = rcu_preempt_ctrlblk.exp_tasks == NULL; np = rcu_next_node_entry(t); - list_del(&t->rcu_node_entry); + list_del_init(&t->rcu_node_entry); if (&t->rcu_node_entry == rcu_preempt_ctrlblk.gp_tasks) rcu_preempt_ctrlblk.gp_tasks = np; if (&t->rcu_node_entry == rcu_preempt_ctrlblk.exp_tasks) @@ -587,7 +587,6 @@ static void rcu_read_unlock_special(struct task_struct *t) if (&t->rcu_node_entry == rcu_preempt_ctrlblk.boost_tasks) rcu_preempt_ctrlblk.boost_tasks = np; #endif /* #ifdef CONFIG_RCU_BOOST */ - INIT_LIST_HEAD(&t->rcu_node_entry); /* * If this was the last task on the current list, and if -- cgit v1.1 From 203373c81b83e98da82836c4b8b5dd1e6fc9011f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 24 Feb 2011 15:25:21 -0800 Subject: rcu: remove useless ->boosted_this_gp field The ->boosted_this_gp field is a holdover from an earlier design that was to carry out multiple boost operations in parallel. It is not required by the current design, which boosts one task at a time. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutiny_plugin.h | 35 ++++------------------------------- 1 file changed, 4 insertions(+), 31 deletions(-) diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index 0bc9f73..2b8d529 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h @@ -100,7 +100,6 @@ struct rcu_preempt_ctrlblk { u8 completed; /* Last grace period completed. */ /* If all three are equal, RCU is idle. */ #ifdef CONFIG_RCU_BOOST - s8 boosted_this_gp; /* Has boosting already happened? */ unsigned long boost_time; /* When to start boosting (jiffies) */ #endif /* #ifdef CONFIG_RCU_BOOST */ #ifdef CONFIG_RCU_TRACE @@ -112,7 +111,6 @@ struct rcu_preempt_ctrlblk { unsigned long n_normal_balk_blkd_tasks; unsigned long n_normal_balk_gp_tasks; unsigned long n_normal_balk_boost_tasks; - unsigned long n_normal_balk_boosted; unsigned long n_normal_balk_notyet; unsigned long n_normal_balk_nos; unsigned long n_exp_balk_blkd_tasks; @@ -219,36 +217,19 @@ static void show_tiny_preempt_stats(struct seq_file *m) "N."[!rcu_preempt_ctrlblk.gp_tasks], "E."[!rcu_preempt_ctrlblk.exp_tasks]); #ifdef CONFIG_RCU_BOOST - seq_printf(m, " ttb=%c btg=", - "B."[!rcu_preempt_ctrlblk.boost_tasks]); - switch (rcu_preempt_ctrlblk.boosted_this_gp) { - case -1: - seq_puts(m, "exp"); - break; - case 0: - seq_puts(m, "no"); - break; - case 1: - seq_puts(m, "begun"); - break; - case 2: - seq_puts(m, "done"); - break; - default: - seq_printf(m, "?%d?", rcu_preempt_ctrlblk.boosted_this_gp); - } - seq_printf(m, " ntb=%lu neb=%lu nnb=%lu j=%04x bt=%04x\n", + seq_printf(m, "%sttb=%c ntb=%lu neb=%lu nnb=%lu j=%04x bt=%04x\n", + " ", + "B."[!rcu_preempt_ctrlblk.boost_tasks], rcu_preempt_ctrlblk.n_tasks_boosted, rcu_preempt_ctrlblk.n_exp_boosts, rcu_preempt_ctrlblk.n_normal_boosts, (int)(jiffies & 0xffff), (int)(rcu_preempt_ctrlblk.boost_time & 0xffff)); - seq_printf(m, " %s: nt=%lu gt=%lu bt=%lu b=%lu ny=%lu nos=%lu\n", + seq_printf(m, " %s: nt=%lu gt=%lu bt=%lu ny=%lu nos=%lu\n", "normal balk", rcu_preempt_ctrlblk.n_normal_balk_blkd_tasks, rcu_preempt_ctrlblk.n_normal_balk_gp_tasks, rcu_preempt_ctrlblk.n_normal_balk_boost_tasks, - rcu_preempt_ctrlblk.n_normal_balk_boosted, rcu_preempt_ctrlblk.n_normal_balk_notyet, rcu_preempt_ctrlblk.n_normal_balk_nos); seq_printf(m, " exp balk: bt=%lu nos=%lu\n", @@ -277,7 +258,6 @@ static int rcu_boost(void) if (rcu_preempt_ctrlblk.boost_tasks == NULL) return 0; /* Nothing to boost. */ raw_local_irq_save(flags); - rcu_preempt_ctrlblk.boosted_this_gp++; t = container_of(rcu_preempt_ctrlblk.boost_tasks, struct task_struct, rcu_node_entry); np = rcu_next_node_entry(t); @@ -287,7 +267,6 @@ static int rcu_boost(void) raw_local_irq_restore(flags); rt_mutex_lock(&mtx); RCU_TRACE(rcu_preempt_ctrlblk.n_tasks_boosted++); - rcu_preempt_ctrlblk.boosted_this_gp++; rt_mutex_unlock(&mtx); return rcu_preempt_ctrlblk.boost_tasks != NULL; } @@ -310,7 +289,6 @@ static int rcu_initiate_boost(void) } if (rcu_preempt_ctrlblk.gp_tasks != NULL && rcu_preempt_ctrlblk.boost_tasks == NULL && - rcu_preempt_ctrlblk.boosted_this_gp == 0 && ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time)) { rcu_preempt_ctrlblk.boost_tasks = rcu_preempt_ctrlblk.gp_tasks; invoke_rcu_kthread(); @@ -331,7 +309,6 @@ static void rcu_initiate_expedited_boost(void) if (!list_empty(&rcu_preempt_ctrlblk.blkd_tasks)) { rcu_preempt_ctrlblk.boost_tasks = rcu_preempt_ctrlblk.blkd_tasks.next; - rcu_preempt_ctrlblk.boosted_this_gp = -1; invoke_rcu_kthread(); RCU_TRACE(rcu_preempt_ctrlblk.n_exp_boosts++); } else @@ -347,8 +324,6 @@ static void rcu_initiate_expedited_boost(void) static void rcu_preempt_boost_start_gp(void) { rcu_preempt_ctrlblk.boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES; - if (rcu_preempt_ctrlblk.boosted_this_gp > 0) - rcu_preempt_ctrlblk.boosted_this_gp = 0; } #else /* #ifdef CONFIG_RCU_BOOST */ @@ -934,8 +909,6 @@ static void rcu_initiate_boost_trace(void) rcu_preempt_ctrlblk.n_normal_balk_gp_tasks++; else if (rcu_preempt_ctrlblk.boost_tasks != NULL) rcu_preempt_ctrlblk.n_normal_balk_boost_tasks++; - else if (rcu_preempt_ctrlblk.boosted_this_gp != 0) - rcu_preempt_ctrlblk.n_normal_balk_boosted++; else if (!ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time)) rcu_preempt_ctrlblk.n_normal_balk_notyet++; else -- cgit v1.1 From 7e8b4c72344e0d904b0e3fa9fd2eb116f04b3d41 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 24 Feb 2011 19:26:21 -0800 Subject: rcu: Converge TINY_RCU expedited and normal boosting This applies a trick from TREE_RCU boosting to TINY_RCU, eliminating code and adding comments. The key point is that it is possible for the booster thread itself to work out whether there is a normal or expedited boost required based solely on local information. There is therefore no need for boost initiation to know or care what type of boosting is required. In addition, when boosting is complete for a given grace period, then by definition there cannot be any more boosting for that grace period. This allows eliminating yet more state and statistics. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutiny_plugin.h | 163 ++++++++++++++++++++++++++---------------------- 1 file changed, 89 insertions(+), 74 deletions(-) diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index 2b8d529..f259c67 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h @@ -106,15 +106,22 @@ struct rcu_preempt_ctrlblk { unsigned long n_grace_periods; #ifdef CONFIG_RCU_BOOST unsigned long n_tasks_boosted; + /* Total number of tasks boosted. */ unsigned long n_exp_boosts; + /* Number of tasks boosted for expedited GP. */ unsigned long n_normal_boosts; - unsigned long n_normal_balk_blkd_tasks; - unsigned long n_normal_balk_gp_tasks; - unsigned long n_normal_balk_boost_tasks; - unsigned long n_normal_balk_notyet; - unsigned long n_normal_balk_nos; - unsigned long n_exp_balk_blkd_tasks; - unsigned long n_exp_balk_nos; + /* Number of tasks boosted for normal GP. */ + unsigned long n_balk_blkd_tasks; + /* Refused to boost: no blocked tasks. */ + unsigned long n_balk_exp_gp_tasks; + /* Refused to boost: nothing blocking GP. */ + unsigned long n_balk_boost_tasks; + /* Refused to boost: already boosting. */ + unsigned long n_balk_notyet; + /* Refused to boost: not yet time. */ + unsigned long n_balk_nos; + /* Refused to boost: not sure why, though. */ + /* This can happen due to race conditions. */ #endif /* #ifdef CONFIG_RCU_BOOST */ #endif /* #ifdef CONFIG_RCU_TRACE */ }; @@ -199,7 +206,6 @@ static struct list_head *rcu_next_node_entry(struct task_struct *t) #ifdef CONFIG_RCU_BOOST static void rcu_initiate_boost_trace(void); -static void rcu_initiate_exp_boost_trace(void); #endif /* #ifdef CONFIG_RCU_BOOST */ /* @@ -225,16 +231,13 @@ static void show_tiny_preempt_stats(struct seq_file *m) rcu_preempt_ctrlblk.n_normal_boosts, (int)(jiffies & 0xffff), (int)(rcu_preempt_ctrlblk.boost_time & 0xffff)); - seq_printf(m, " %s: nt=%lu gt=%lu bt=%lu ny=%lu nos=%lu\n", - "normal balk", - rcu_preempt_ctrlblk.n_normal_balk_blkd_tasks, - rcu_preempt_ctrlblk.n_normal_balk_gp_tasks, - rcu_preempt_ctrlblk.n_normal_balk_boost_tasks, - rcu_preempt_ctrlblk.n_normal_balk_notyet, - rcu_preempt_ctrlblk.n_normal_balk_nos); - seq_printf(m, " exp balk: bt=%lu nos=%lu\n", - rcu_preempt_ctrlblk.n_exp_balk_blkd_tasks, - rcu_preempt_ctrlblk.n_exp_balk_nos); + seq_printf(m, "%s: nt=%lu egt=%lu bt=%lu ny=%lu nos=%lu\n", + " balk", + rcu_preempt_ctrlblk.n_balk_blkd_tasks, + rcu_preempt_ctrlblk.n_balk_exp_gp_tasks, + rcu_preempt_ctrlblk.n_balk_boost_tasks, + rcu_preempt_ctrlblk.n_balk_notyet, + rcu_preempt_ctrlblk.n_balk_nos); #endif /* #ifdef CONFIG_RCU_BOOST */ } @@ -252,23 +255,59 @@ static int rcu_boost(void) { unsigned long flags; struct rt_mutex mtx; - struct list_head *np; struct task_struct *t; + struct list_head *tb; - if (rcu_preempt_ctrlblk.boost_tasks == NULL) + if (rcu_preempt_ctrlblk.boost_tasks == NULL && + rcu_preempt_ctrlblk.exp_tasks == NULL) return 0; /* Nothing to boost. */ + raw_local_irq_save(flags); - t = container_of(rcu_preempt_ctrlblk.boost_tasks, struct task_struct, - rcu_node_entry); - np = rcu_next_node_entry(t); + + /* + * Recheck with irqs disabled: all tasks in need of boosting + * might exit their RCU read-side critical sections on their own + * if we are preempted just before disabling irqs. + */ + if (rcu_preempt_ctrlblk.boost_tasks == NULL && + rcu_preempt_ctrlblk.exp_tasks == NULL) { + raw_local_irq_restore(flags); + return 0; + } + + /* + * Preferentially boost tasks blocking expedited grace periods. + * This cannot starve the normal grace periods because a second + * expedited grace period must boost all blocked tasks, including + * those blocking the pre-existing normal grace period. + */ + if (rcu_preempt_ctrlblk.exp_tasks != NULL) { + tb = rcu_preempt_ctrlblk.exp_tasks; + RCU_TRACE(rcu_preempt_ctrlblk.n_exp_boosts++); + } else { + tb = rcu_preempt_ctrlblk.boost_tasks; + RCU_TRACE(rcu_preempt_ctrlblk.n_normal_boosts++); + } + RCU_TRACE(rcu_preempt_ctrlblk.n_tasks_boosted++); + + /* + * We boost task t by manufacturing an rt_mutex that appears to + * be held by task t. We leave a pointer to that rt_mutex where + * task t can find it, and task t will release the mutex when it + * exits its outermost RCU read-side critical section. Then + * simply acquiring this artificial rt_mutex will boost task + * t's priority. (Thanks to tglx for suggesting this approach!) + */ + t = container_of(tb, struct task_struct, rcu_node_entry); rt_mutex_init_proxy_locked(&mtx, t); t->rcu_boost_mutex = &mtx; t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BOOSTED; raw_local_irq_restore(flags); rt_mutex_lock(&mtx); - RCU_TRACE(rcu_preempt_ctrlblk.n_tasks_boosted++); - rt_mutex_unlock(&mtx); - return rcu_preempt_ctrlblk.boost_tasks != NULL; + rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ + + return rcu_preempt_ctrlblk.boost_tasks != NULL || + rcu_preempt_ctrlblk.exp_tasks != NULL; } /* @@ -283,39 +322,24 @@ static int rcu_boost(void) */ static int rcu_initiate_boost(void) { - if (!rcu_preempt_blocked_readers_cgp()) { - RCU_TRACE(rcu_preempt_ctrlblk.n_normal_balk_blkd_tasks++); + if (!rcu_preempt_blocked_readers_cgp() && + rcu_preempt_ctrlblk.exp_tasks == NULL) { + RCU_TRACE(rcu_preempt_ctrlblk.n_balk_exp_gp_tasks++); return 0; } - if (rcu_preempt_ctrlblk.gp_tasks != NULL && - rcu_preempt_ctrlblk.boost_tasks == NULL && - ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time)) { - rcu_preempt_ctrlblk.boost_tasks = rcu_preempt_ctrlblk.gp_tasks; + if (rcu_preempt_ctrlblk.exp_tasks != NULL || + (rcu_preempt_ctrlblk.gp_tasks != NULL && + rcu_preempt_ctrlblk.boost_tasks == NULL && + ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time))) { + if (rcu_preempt_ctrlblk.exp_tasks == NULL) + rcu_preempt_ctrlblk.boost_tasks = + rcu_preempt_ctrlblk.gp_tasks; invoke_rcu_kthread(); - RCU_TRACE(rcu_preempt_ctrlblk.n_normal_boosts++); } else RCU_TRACE(rcu_initiate_boost_trace()); return 1; } -/* - * Initiate boosting for an expedited grace period. - */ -static void rcu_initiate_expedited_boost(void) -{ - unsigned long flags; - - raw_local_irq_save(flags); - if (!list_empty(&rcu_preempt_ctrlblk.blkd_tasks)) { - rcu_preempt_ctrlblk.boost_tasks = - rcu_preempt_ctrlblk.blkd_tasks.next; - invoke_rcu_kthread(); - RCU_TRACE(rcu_preempt_ctrlblk.n_exp_boosts++); - } else - RCU_TRACE(rcu_initiate_exp_boost_trace()); - raw_local_irq_restore(flags); -} - #define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000) /* @@ -347,13 +371,6 @@ static int rcu_initiate_boost(void) } /* - * If there is no RCU priority boosting, we don't initiate expedited boosting. - */ -static void rcu_initiate_expedited_boost(void) -{ -} - -/* * If there is no RCU priority boosting, nothing to do at grace-period start. */ static void rcu_preempt_boost_start_gp(void) @@ -786,13 +803,16 @@ void synchronize_rcu_expedited(void) rpcp->exp_tasks = rpcp->blkd_tasks.next; if (rpcp->exp_tasks == &rpcp->blkd_tasks) rpcp->exp_tasks = NULL; - local_irq_restore(flags); /* Wait for tail of ->blkd_tasks list to drain. */ - if (rcu_preempted_readers_exp()) - rcu_initiate_expedited_boost(); + if (!rcu_preempted_readers_exp()) + local_irq_restore(flags); + else { + rcu_initiate_boost(); + local_irq_restore(flags); wait_event(sync_rcu_preempt_exp_wq, !rcu_preempted_readers_exp()); + } /* Clean up and exit. */ barrier(); /* ensure expedited GP seen before counter increment. */ @@ -905,22 +925,17 @@ void __init rcu_scheduler_starting(void) static void rcu_initiate_boost_trace(void) { - if (rcu_preempt_ctrlblk.gp_tasks == NULL) - rcu_preempt_ctrlblk.n_normal_balk_gp_tasks++; + if (list_empty(&rcu_preempt_ctrlblk.blkd_tasks)) + rcu_preempt_ctrlblk.n_balk_blkd_tasks++; + else if (rcu_preempt_ctrlblk.gp_tasks == NULL && + rcu_preempt_ctrlblk.exp_tasks == NULL) + rcu_preempt_ctrlblk.n_balk_exp_gp_tasks++; else if (rcu_preempt_ctrlblk.boost_tasks != NULL) - rcu_preempt_ctrlblk.n_normal_balk_boost_tasks++; + rcu_preempt_ctrlblk.n_balk_boost_tasks++; else if (!ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time)) - rcu_preempt_ctrlblk.n_normal_balk_notyet++; - else - rcu_preempt_ctrlblk.n_normal_balk_nos++; -} - -static void rcu_initiate_exp_boost_trace(void) -{ - if (list_empty(&rcu_preempt_ctrlblk.blkd_tasks)) - rcu_preempt_ctrlblk.n_exp_balk_blkd_tasks++; + rcu_preempt_ctrlblk.n_balk_notyet++; else - rcu_preempt_ctrlblk.n_exp_balk_nos++; + rcu_preempt_ctrlblk.n_balk_nos++; } #endif /* #ifdef CONFIG_RCU_BOOST */ -- cgit v1.1 From 13491a0ee1ef862b6c842132b6eb9c5e721af5ad Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 25 Feb 2011 11:37:59 -0800 Subject: rcu: call __rcu_read_unlock() in exit_rcu for tree RCU Using __rcu_read_lock() in place of rcu_read_lock() leaves any debug state as it really should be, namely with the lock still held. Signed-off-by: Lai Jiangshan Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree_plugin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index a21413d..11b27f3 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -858,7 +858,7 @@ void exit_rcu(void) if (t->rcu_read_lock_nesting == 0) return; t->rcu_read_lock_nesting = 1; - rcu_read_unlock(); + __rcu_read_unlock(); } #else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ -- cgit v1.1 From 6cc68793e380bb51f447d8d02af873b7bc01f222 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 2 Mar 2011 13:15:15 -0800 Subject: rcu: fix spelling The "preemptible" spelling is preferable. May as well fix it. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 10 ++++---- kernel/rcutree.h | 2 +- kernel/rcutree_plugin.h | 62 ++++++++++++++++++++++++------------------------- 3 files changed, 37 insertions(+), 37 deletions(-) diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 27304bc..b579e4f 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -290,8 +290,8 @@ static int rcu_implicit_offline_qs(struct rcu_data *rdp) return 1; } - /* If preemptable RCU, no point in sending reschedule IPI. */ - if (rdp->preemptable) + /* If preemptible RCU, no point in sending reschedule IPI. */ + if (rdp->preemptible) return 0; /* The CPU is online, so send it a reschedule IPI. */ @@ -1982,7 +1982,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) * or RCU-bh, force a local reschedule. */ rdp->n_rp_qs_pending++; - if (!rdp->preemptable && + if (!rdp->preemptible && ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs) - 1, jiffies)) set_need_resched(); @@ -2159,7 +2159,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) * that this CPU cannot possibly have any RCU callbacks in flight yet. */ static void __cpuinit -rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable) +rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) { unsigned long flags; unsigned long mask; @@ -2171,7 +2171,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable) rdp->passed_quiesc = 0; /* We could be racing with new GP, */ rdp->qs_pending = 1; /* so set up to respond to current GP. */ rdp->beenonline = 1; /* We have now been online. */ - rdp->preemptable = preemptable; + rdp->preemptible = preemptible; rdp->qlen_last_fqs_check = 0; rdp->n_force_qs_snap = rsp->n_force_qs; rdp->blimit = blimit; diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 37502a2..a6a9717 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -239,7 +239,7 @@ struct rcu_data { bool passed_quiesc; /* User-mode/idle loop etc. */ bool qs_pending; /* Core waits for quiesc state. */ bool beenonline; /* CPU online at least once. */ - bool preemptable; /* Preemptable RCU? */ + bool preemptible; /* Preemptible RCU? */ struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ unsigned long grpmask; /* Mask to apply to leaf qsmask. */ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 11b27f3..f629479 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -1,7 +1,7 @@ /* * Read-Copy Update mechanism for mutual exclusion (tree-based version) * Internal non-public definitions that provide either classic - * or preemptable semantics. + * or preemptible semantics. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -75,7 +75,7 @@ static int rcu_preempted_readers_exp(struct rcu_node *rnp); */ static void __init rcu_bootup_announce(void) { - printk(KERN_INFO "Preemptable hierarchical RCU implementation.\n"); + printk(KERN_INFO "Preemptible hierarchical RCU implementation.\n"); rcu_bootup_announce_oddness(); } @@ -108,7 +108,7 @@ void rcu_force_quiescent_state(void) EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); /* - * Record a preemptable-RCU quiescent state for the specified CPU. Note + * Record a preemptible-RCU quiescent state for the specified CPU. Note * that this just means that the task currently running on the CPU is * not in a quiescent state. There might be any number of tasks blocked * while in an RCU read-side critical section. @@ -207,7 +207,7 @@ static void rcu_preempt_note_context_switch(int cpu) } /* - * Tree-preemptable RCU implementation for rcu_read_lock(). + * Tree-preemptible RCU implementation for rcu_read_lock(). * Just increment ->rcu_read_lock_nesting, shared state will be updated * if we block. */ @@ -376,7 +376,7 @@ static void rcu_read_unlock_special(struct task_struct *t) } /* - * Tree-preemptable RCU implementation for rcu_read_unlock(). + * Tree-preemptible RCU implementation for rcu_read_unlock(). * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then * invoke rcu_read_unlock_special() to clean up after a context switch @@ -565,7 +565,7 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp, } /* - * Do CPU-offline processing for preemptable RCU. + * Do CPU-offline processing for preemptible RCU. */ static void rcu_preempt_offline_cpu(int cpu) { @@ -594,7 +594,7 @@ static void rcu_preempt_check_callbacks(int cpu) } /* - * Process callbacks for preemptable RCU. + * Process callbacks for preemptible RCU. */ static void rcu_preempt_process_callbacks(void) { @@ -603,7 +603,7 @@ static void rcu_preempt_process_callbacks(void) } /* - * Queue a preemptable-RCU callback for invocation after a grace period. + * Queue a preemptible-RCU callback for invocation after a grace period. */ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) { @@ -795,7 +795,7 @@ mb_ret: EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); /* - * Check to see if there is any immediate preemptable-RCU-related work + * Check to see if there is any immediate preemptible-RCU-related work * to be done. */ static int rcu_preempt_pending(int cpu) @@ -805,7 +805,7 @@ static int rcu_preempt_pending(int cpu) } /* - * Does preemptable RCU need the CPU to stay out of dynticks mode? + * Does preemptible RCU need the CPU to stay out of dynticks mode? */ static int rcu_preempt_needs_cpu(int cpu) { @@ -822,7 +822,7 @@ void rcu_barrier(void) EXPORT_SYMBOL_GPL(rcu_barrier); /* - * Initialize preemptable RCU's per-CPU data. + * Initialize preemptible RCU's per-CPU data. */ static void __cpuinit rcu_preempt_init_percpu_data(int cpu) { @@ -830,7 +830,7 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu) } /* - * Move preemptable RCU's callbacks from dying CPU to other online CPU. + * Move preemptible RCU's callbacks from dying CPU to other online CPU. */ static void rcu_preempt_send_cbs_to_online(void) { @@ -838,7 +838,7 @@ static void rcu_preempt_send_cbs_to_online(void) } /* - * Initialize preemptable RCU's state structures. + * Initialize preemptible RCU's state structures. */ static void __init __rcu_init_preempt(void) { @@ -846,7 +846,7 @@ static void __init __rcu_init_preempt(void) } /* - * Check for a task exiting while in a preemptable-RCU read-side + * Check for a task exiting while in a preemptible-RCU read-side * critical section, clean up if so. No need to issue warnings, * as debug_check_no_locks_held() already does this if lockdep * is enabled. @@ -894,7 +894,7 @@ void rcu_force_quiescent_state(void) EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); /* - * Because preemptable RCU does not exist, we never have to check for + * Because preemptible RCU does not exist, we never have to check for * CPUs being in quiescent states. */ static void rcu_preempt_note_context_switch(int cpu) @@ -902,7 +902,7 @@ static void rcu_preempt_note_context_switch(int cpu) } /* - * Because preemptable RCU does not exist, there are never any preempted + * Because preemptible RCU does not exist, there are never any preempted * RCU readers. */ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp) @@ -921,7 +921,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) #endif /* #ifdef CONFIG_HOTPLUG_CPU */ /* - * Because preemptable RCU does not exist, we never have to check for + * Because preemptible RCU does not exist, we never have to check for * tasks blocked within RCU read-side critical sections. */ static void rcu_print_detail_task_stall(struct rcu_state *rsp) @@ -929,7 +929,7 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp) } /* - * Because preemptable RCU does not exist, we never have to check for + * Because preemptible RCU does not exist, we never have to check for * tasks blocked within RCU read-side critical sections. */ static void rcu_print_task_stall(struct rcu_node *rnp) @@ -945,7 +945,7 @@ static void rcu_preempt_stall_reset(void) } /* - * Because there is no preemptable RCU, there can be no readers blocked, + * Because there is no preemptible RCU, there can be no readers blocked, * so there is no need to check for blocked tasks. So check only for * bogus qsmask values. */ @@ -957,7 +957,7 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) #ifdef CONFIG_HOTPLUG_CPU /* - * Because preemptable RCU does not exist, it never needs to migrate + * Because preemptible RCU does not exist, it never needs to migrate * tasks that were blocked within RCU read-side critical sections, and * such non-existent tasks cannot possibly have been blocking the current * grace period. @@ -970,7 +970,7 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp, } /* - * Because preemptable RCU does not exist, it never needs CPU-offline + * Because preemptible RCU does not exist, it never needs CPU-offline * processing. */ static void rcu_preempt_offline_cpu(int cpu) @@ -980,7 +980,7 @@ static void rcu_preempt_offline_cpu(int cpu) #endif /* #ifdef CONFIG_HOTPLUG_CPU */ /* - * Because preemptable RCU does not exist, it never has any callbacks + * Because preemptible RCU does not exist, it never has any callbacks * to check. */ static void rcu_preempt_check_callbacks(int cpu) @@ -988,7 +988,7 @@ static void rcu_preempt_check_callbacks(int cpu) } /* - * Because preemptable RCU does not exist, it never has any callbacks + * Because preemptible RCU does not exist, it never has any callbacks * to process. */ static void rcu_preempt_process_callbacks(void) @@ -997,7 +997,7 @@ static void rcu_preempt_process_callbacks(void) /* * Wait for an rcu-preempt grace period, but make it happen quickly. - * But because preemptable RCU does not exist, map to rcu-sched. + * But because preemptible RCU does not exist, map to rcu-sched. */ void synchronize_rcu_expedited(void) { @@ -1008,7 +1008,7 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); #ifdef CONFIG_HOTPLUG_CPU /* - * Because preemptable RCU does not exist, there is never any need to + * Because preemptible RCU does not exist, there is never any need to * report on tasks preempted in RCU read-side critical sections during * expedited RCU grace periods. */ @@ -1020,7 +1020,7 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp) #endif /* #ifdef CONFIG_HOTPLUG_CPU */ /* - * Because preemptable RCU does not exist, it never has any work to do. + * Because preemptible RCU does not exist, it never has any work to do. */ static int rcu_preempt_pending(int cpu) { @@ -1028,7 +1028,7 @@ static int rcu_preempt_pending(int cpu) } /* - * Because preemptable RCU does not exist, it never needs any CPU. + * Because preemptible RCU does not exist, it never needs any CPU. */ static int rcu_preempt_needs_cpu(int cpu) { @@ -1036,7 +1036,7 @@ static int rcu_preempt_needs_cpu(int cpu) } /* - * Because preemptable RCU does not exist, rcu_barrier() is just + * Because preemptible RCU does not exist, rcu_barrier() is just * another name for rcu_barrier_sched(). */ void rcu_barrier(void) @@ -1046,7 +1046,7 @@ void rcu_barrier(void) EXPORT_SYMBOL_GPL(rcu_barrier); /* - * Because preemptable RCU does not exist, there is no per-CPU + * Because preemptible RCU does not exist, there is no per-CPU * data to initialize. */ static void __cpuinit rcu_preempt_init_percpu_data(int cpu) @@ -1054,14 +1054,14 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu) } /* - * Because there is no preemptable RCU, there are no callbacks to move. + * Because there is no preemptible RCU, there are no callbacks to move. */ static void rcu_preempt_send_cbs_to_online(void) { } /* - * Because preemptable RCU does not exist, it need not be initialized. + * Because preemptible RCU does not exist, it need not be initialized. */ static void __init __rcu_init_preempt(void) { -- cgit v1.1 From 9ab1544eb4196ca8d05c433b2eb56f74496b1ee3 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 18 Mar 2011 11:15:47 +0800 Subject: rcu: introduce kfree_rcu() Many rcu callbacks functions just call kfree() on the base structure. These functions are trivial, but their size adds up, and furthermore when they are used in a kernel module, that module must invoke the high-latency rcu_barrier() function at module-unload time. The kfree_rcu() function introduced by this commit addresses this issue. Rather than encoding a function address in the embedded rcu_head structure, kfree_rcu() instead encodes the offset of the rcu_head structure within the base structure. Because the functions are not allowed in the low-order 4096 bytes of kernel virtual memory, offsets up to 4095 bytes can be accommodated. If the offset is larger than 4095 bytes, a compile-time error will be generated in __kfree_rcu(). If this error is triggered, you can either fall back to use of call_rcu() or rearrange the structure to position the rcu_head structure into the first 4096 bytes. Note that the allowable offset might decrease in the future, for example, to allow something like kmem_cache_free_rcu(). The new kfree_rcu() function can replace code as follows: call_rcu(&p->rcu, simple_kfree_callback); where "simple_kfree_callback()" might be defined as follows: void simple_kfree_callback(struct rcu_head *p) { struct foo *q = container_of(p, struct foo, rcu); kfree(q); } with the following: kfree_rcu(&p->rcu, rcu); Note that the "rcu" is the name of a field in the structure being freed. The reason for using this rather than passing in a pointer to the base structure is that the above approach allows better type checking. This commit is based on earlier work by Lai Jiangshan and Manfred Spraul: Lai's V1 patch: http://lkml.org/lkml/2008/9/18/1 Manfred's patch: http://lkml.org/lkml/2009/1/2/115 Signed-off-by: Lai Jiangshan Signed-off-by: Manfred Spraul Signed-off-by: Paul E. McKenney Reviewed-by: David Howells Reviewed-by: Josh Triplett --- include/linux/rcupdate.h | 56 ++++++++++++++++++++++++++++++++++++++++++++++++ kernel/rcutiny.c | 2 +- kernel/rcutree.c | 2 +- 3 files changed, 58 insertions(+), 2 deletions(-) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index c7aeacf..99f9aa7 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -809,4 +809,60 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head) } #endif /* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ +static __always_inline bool __is_kfree_rcu_offset(unsigned long offset) +{ + return offset < 4096; +} + +static __always_inline +void __kfree_rcu(struct rcu_head *head, unsigned long offset) +{ + typedef void (*rcu_callback)(struct rcu_head *); + + BUILD_BUG_ON(!__builtin_constant_p(offset)); + + /* See the kfree_rcu() header comment. */ + BUILD_BUG_ON(!__is_kfree_rcu_offset(offset)); + + call_rcu(head, (rcu_callback)offset); +} + +extern void kfree(const void *); + +static inline void __rcu_reclaim(struct rcu_head *head) +{ + unsigned long offset = (unsigned long)head->func; + + if (__is_kfree_rcu_offset(offset)) + kfree((void *)head - offset); + else + head->func(head); +} + +/** + * kfree_rcu() - kfree an object after a grace period. + * @ptr: pointer to kfree + * @rcu_head: the name of the struct rcu_head within the type of @ptr. + * + * Many rcu callbacks functions just call kfree() on the base structure. + * These functions are trivial, but their size adds up, and furthermore + * when they are used in a kernel module, that module must invoke the + * high-latency rcu_barrier() function at module-unload time. + * + * The kfree_rcu() function handles this issue. Rather than encoding a + * function address in the embedded rcu_head structure, kfree_rcu() instead + * encodes the offset of the rcu_head structure within the base structure. + * Because the functions are not allowed in the low-order 4096 bytes of + * kernel virtual memory, offsets up to 4095 bytes can be accommodated. + * If the offset is larger than 4095 bytes, a compile-time error will + * be generated in __kfree_rcu(). If this error is triggered, you can + * either fall back to use of call_rcu() or rearrange the structure to + * position the rcu_head structure into the first 4096 bytes. + * + * Note that the allowable offset might decrease in the future, for example, + * to allow something like kmem_cache_free_rcu(). + */ +#define kfree_rcu(ptr, rcu_head) \ + __kfree_rcu(&((ptr)->rcu_head), offsetof(typeof(*(ptr)), rcu_head)) + #endif /* __LINUX_RCUPDATE_H */ diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index 0c343b9..4d60fbc 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -167,7 +167,7 @@ static void rcu_process_callbacks(struct rcu_ctrlblk *rcp) prefetch(next); debug_rcu_head_unqueue(list); local_bh_disable(); - list->func(list); + __rcu_reclaim(list); local_bh_enable(); list = next; RCU_TRACE(cb_count++); diff --git a/kernel/rcutree.c b/kernel/rcutree.c index b579e4f..2c07adb 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1206,7 +1206,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) next = list->next; prefetch(next); debug_rcu_head_unqueue(list); - list->func(list); + __rcu_reclaim(list); list = next; if (++count >= rdp->blimit) break; -- cgit v1.1 From baa1ae0c9f1c618bc60706efa75fef3508bcee58 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 26 Mar 2011 22:01:35 -0700 Subject: rcu: further lower priority in rcu_yield() Although rcu_yield() dropped from real-time to normal priority, there is always the possibility that the competing tasks have been niced. So nice to 19 in rcu_yield() to help ensure that other tasks have a better chance of running. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 2c07adb..fe85600 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1561,6 +1561,7 @@ static void rcu_yield(void (*f)(unsigned long), unsigned long arg) mod_timer(&yield_timer, jiffies + 2); sp.sched_priority = 0; sched_setscheduler_nocheck(current, SCHED_NORMAL, &sp); + set_user_nice(current, 19); schedule(); sp.sched_priority = RCU_KTHREAD_PRIO; sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); -- cgit v1.1 From 2655d57ef35aa327a2e58a1c5dc7b65c65003f4e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 7 Apr 2011 22:47:23 -0700 Subject: rcu: prevent call_rcu() from diving into rcu core if irqs disabled This commit marks a first step towards making call_rcu() have real-time behavior. If irqs are disabled, don't dive into the RCU core. Later on, this new early exit will wake up the per-CPU kthread, which first must be modified to handle the cases involving callback storms. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/kernel/rcutree.c b/kernel/rcutree.c index fe85600..78923a5 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1839,6 +1839,13 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), /* Add the callback to our list. */ *rdp->nxttail[RCU_NEXT_TAIL] = head; rdp->nxttail[RCU_NEXT_TAIL] = &head->next; + rdp->qlen++; + + /* If interrupts were disabled, don't dive into RCU core. */ + if (irqs_disabled_flags(flags)) { + local_irq_restore(flags); + return; + } /* * Force the grace period if too many callbacks or too long waiting. @@ -1847,7 +1854,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), * invoking force_quiescent_state() if the newly enqueued callback * is the only one waiting for a grace period to complete. */ - if (unlikely(++rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) { + if (unlikely(rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) { /* Are we ignoring a completed grace period? */ rcu_process_gp_end(rsp, rdp); -- cgit v1.1 From b554d7de8d112fca4188da3bf0d7f8b56f42fb95 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 28 Apr 2011 07:23:45 +0200 Subject: rcu: optimize rcutiny rcu_sched_qs() currently calls local_irq_save()/local_irq_restore() up to three times. Remove irq masking from rcu_qsctr_help() / invoke_rcu_kthread() and do it once in rcu_sched_qs() / rcu_bh_qs() This generates smaller code as well. text data bss dec hex filename 2314 156 24 2494 9be kernel/rcutiny.old.o 2250 156 24 2430 97e kernel/rcutiny.new.o Fix an outdated comment for rcu_qsctr_help() Move invoke_rcu_kthread() definition before its use. Signed-off-by: Eric Dumazet Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutiny.c | 43 +++++++++++++++++++++---------------------- 1 file changed, 21 insertions(+), 22 deletions(-) diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index 4d60fbc..421abfd 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -40,10 +40,10 @@ static struct task_struct *rcu_kthread_task; static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq); static unsigned long have_rcu_kthread_work; -static void invoke_rcu_kthread(void); /* Forward declarations for rcutiny_plugin.h. */ struct rcu_ctrlblk; +static void invoke_rcu_kthread(void); static void rcu_process_callbacks(struct rcu_ctrlblk *rcp); static int rcu_kthread(void *arg); static void __call_rcu(struct rcu_head *head, @@ -79,36 +79,45 @@ void rcu_exit_nohz(void) #endif /* #ifdef CONFIG_NO_HZ */ /* - * Helper function for rcu_qsctr_inc() and rcu_bh_qsctr_inc(). - * Also disable irqs to avoid confusion due to interrupt handlers + * Helper function for rcu_sched_qs() and rcu_bh_qs(). + * Also irqs are disabled to avoid confusion due to interrupt handlers * invoking call_rcu(). */ static int rcu_qsctr_help(struct rcu_ctrlblk *rcp) { - unsigned long flags; - - local_irq_save(flags); if (rcp->rcucblist != NULL && rcp->donetail != rcp->curtail) { rcp->donetail = rcp->curtail; - local_irq_restore(flags); return 1; } - local_irq_restore(flags); return 0; } /* + * Wake up rcu_kthread() to process callbacks now eligible for invocation + * or to boost readers. + */ +static void invoke_rcu_kthread(void) +{ + have_rcu_kthread_work = 1; + wake_up(&rcu_kthread_wq); +} + +/* * Record an rcu quiescent state. And an rcu_bh quiescent state while we * are at it, given that any rcu quiescent state is also an rcu_bh * quiescent state. Use "+" instead of "||" to defeat short circuiting. */ void rcu_sched_qs(int cpu) { + unsigned long flags; + + local_irq_save(flags); if (rcu_qsctr_help(&rcu_sched_ctrlblk) + rcu_qsctr_help(&rcu_bh_ctrlblk)) invoke_rcu_kthread(); + local_irq_restore(flags); } /* @@ -116,8 +125,12 @@ void rcu_sched_qs(int cpu) */ void rcu_bh_qs(int cpu) { + unsigned long flags; + + local_irq_save(flags); if (rcu_qsctr_help(&rcu_bh_ctrlblk)) invoke_rcu_kthread(); + local_irq_restore(flags); } /* @@ -208,20 +221,6 @@ static int rcu_kthread(void *arg) } /* - * Wake up rcu_kthread() to process callbacks now eligible for invocation - * or to boost readers. - */ -static void invoke_rcu_kthread(void) -{ - unsigned long flags; - - local_irq_save(flags); - have_rcu_kthread_work = 1; - wake_up(&rcu_kthread_wq); - local_irq_restore(flags); -} - -/* * Wait for a grace period to elapse. But it is illegal to invoke * synchronize_sched() from within an RCU read-side critical section. * Therefore, any legal call to synchronize_sched() is a quiescent -- cgit v1.1 From bad6e1393cb505fe17747344a23666464daa3fa7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 2 May 2011 23:40:04 -0700 Subject: rcu: get rid of signed overflow in check_cpu_stall() Signed integer overflow is undefined by the C standard, so move calculations to unsigned. Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 78923a5..b2fe2a2 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -581,21 +581,24 @@ static void print_cpu_stall(struct rcu_state *rsp) static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) { - long delta; + unsigned long j; + unsigned long js; struct rcu_node *rnp; if (rcu_cpu_stall_suppress) return; - delta = jiffies - ACCESS_ONCE(rsp->jiffies_stall); + j = ACCESS_ONCE(jiffies); + js = ACCESS_ONCE(rsp->jiffies_stall); rnp = rdp->mynode; - if ((ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && delta >= 0) { + if ((ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && ULONG_CMP_GE(j, js)) { /* We haven't checked in, so go dump stack. */ print_cpu_stall(rsp); - } else if (rcu_gp_in_progress(rsp) && delta >= RCU_STALL_RAT_DELAY) { + } else if (rcu_gp_in_progress(rsp) && + ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY)) { - /* They had two time units to dump stack, so complain. */ + /* They had a few time units to dump stack, so complain. */ print_other_cpu_stall(rsp); } } -- cgit v1.1 From 29ce831000081dd757d3116bf774aafffc4b6b20 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 4 May 2011 16:31:03 +0300 Subject: rcu: provide rcu_virt_note_context_switch() function. Provide rcu_virt_note_context_switch() for vitalization use to note quiescent state during guest entry. Signed-off-by: Gleb Natapov Signed-off-by: Paul E. McKenney --- include/linux/rcutiny.h | 8 ++++++++ include/linux/rcutree.h | 10 ++++++++++ kernel/rcutree.c | 1 + 3 files changed, 19 insertions(+) diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 30ebd7c..52b3e02 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -100,6 +100,14 @@ static inline void rcu_note_context_switch(int cpu) } /* + * Take advantage of the fact that there is only one CPU, which + * allows us to ignore virtualization-based context switches. + */ +static inline void rcu_virt_note_context_switch(int cpu) +{ +} + +/* * Return the number of grace periods. */ static inline long rcu_batches_completed(void) diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 284dad1..e65d066 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -35,6 +35,16 @@ extern void rcu_note_context_switch(int cpu); extern int rcu_needs_cpu(int cpu); extern void rcu_cpu_stall_reset(void); +/* + * Note a virtualization-based context switch. This is simply a + * wrapper around rcu_note_context_switch(), which allows TINY_RCU + * to save a few bytes. + */ +static inline void rcu_virt_note_context_switch(int cpu) +{ + rcu_note_context_switch(cpu); +} + #ifdef CONFIG_TREE_PREEMPT_RCU extern void exit_rcu(void); diff --git a/kernel/rcutree.c b/kernel/rcutree.c index b2fe2a2..54ff7eb 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -157,6 +157,7 @@ void rcu_note_context_switch(int cpu) rcu_sched_qs(cpu); rcu_preempt_note_context_switch(cpu); } +EXPORT_SYMBOL_GPL(rcu_note_context_switch); #ifdef CONFIG_NO_HZ DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { -- cgit v1.1 From 4934a4d3d3fa775601a9f1b35cc0e2aa93f81355 Mon Sep 17 00:00:00 2001 From: Rakib Mullick Date: Wed, 4 May 2011 22:53:46 +0600 Subject: sched: Wrap the 'cfs_rq->nr_spread_over' field with CONFIG_SCHED_DEBUG cfs_rq->nr_spread_over is only used when CONFIG_SCHED_DEBUG is set. So wrap it with CONFIG_SCHED_DEBUG. Signed-off-by: Rakib Mullick Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1304528026.15681.3.camel@localhost.localdomain Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/sched.c b/kernel/sched.c index f11a2a5..3d8a1b2 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -328,7 +328,9 @@ struct cfs_rq { */ struct sched_entity *curr, *next, *last, *skip; +#ifdef CONFIG_SCHED_DEBUG unsigned int nr_spread_over; +#endif #ifdef CONFIG_FAIR_GROUP_SCHED struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ -- cgit v1.1 From 7142d17e8f935fa842e9f6eece2281b6d41625d6 Mon Sep 17 00:00:00 2001 From: Hillf Danton Date: Thu, 5 May 2011 20:53:20 +0800 Subject: sched: Shorten the construction of the span cpu mask of sched domain For a given node, when constructing the cpumask for its sched_domain to span, if there is no best node available after searching, further efforts could be saved, based on small change in the return value of find_next_best_node(). Signed-off-by: Hillf Danton Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Yong Zhang Link: http://lkml.kernel.org/r/BANLkTi%3DqPWxRAa6%2BdT3ohEP6Z%3D0v%2Be4EXA@mail.gmail.com Signed-off-by: Ingo Molnar --- kernel/sched.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 3d8a1b2..da93381 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6807,7 +6807,7 @@ __setup("isolcpus=", isolated_cpu_setup); */ static int find_next_best_node(int node, nodemask_t *used_nodes) { - int i, n, val, min_val, best_node = 0; + int i, n, val, min_val, best_node = -1; min_val = INT_MAX; @@ -6831,7 +6831,8 @@ static int find_next_best_node(int node, nodemask_t *used_nodes) } } - node_set(best_node, *used_nodes); + if (best_node != -1) + node_set(best_node, *used_nodes); return best_node; } @@ -6857,7 +6858,8 @@ static void sched_domain_node_span(int node, struct cpumask *span) for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { int next_node = find_next_best_node(node, &used_nodes); - + if (next_node < 0) + break; cpumask_or(span, span, cpumask_of_node(next_node)); } } -- cgit v1.1 From 56f3aeb2c14b9d000dfc77f352250bc3b67af5c0 Mon Sep 17 00:00:00 2001 From: Russell King Date: Fri, 6 May 2011 08:14:57 +0100 Subject: ARM: RiscPC: etherh: fix section mismatches WARNING: drivers/net/arm/built-in.o(.data+0x0): Section mismatch in reference from the variable etherh_driver to the function .init.text:etherh_probe() The variable etherh_driver references the function __init etherh_probe() If the reference is valid then annotate the variable with __init* or __refdata (see linux/init.h) or name the variable: *_template, *_timer, *_sht, *_ops, *_probe, *_probe_one, *_console Signed-off-by: Russell King --- drivers/net/arm/etherh.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/arm/etherh.c b/drivers/net/arm/etherh.c index 4af235d..fbfb5b4 100644 --- a/drivers/net/arm/etherh.c +++ b/drivers/net/arm/etherh.c @@ -527,7 +527,7 @@ static void __init etherh_banner(void) * Read the ethernet address string from the on board rom. * This is an ascii string... */ -static int __init etherh_addr(char *addr, struct expansion_card *ec) +static int __devinit etherh_addr(char *addr, struct expansion_card *ec) { struct in_chunk_dir cd; char *s; @@ -655,7 +655,7 @@ static const struct net_device_ops etherh_netdev_ops = { static u32 etherh_regoffsets[16]; static u32 etherm_regoffsets[16]; -static int __init +static int __devinit etherh_probe(struct expansion_card *ec, const struct ecard_id *id) { const struct etherh_data *data = id->data; -- cgit v1.1 From 52fe116376129b29572f55acc9c73ebd485052c9 Mon Sep 17 00:00:00 2001 From: Russell King Date: Fri, 6 May 2011 08:16:51 +0100 Subject: ARM: RiscPC: acornfb: fix section mismatches WARNING: drivers/video/built-in.o(.devinit.text+0x38): Section mismatch in reference from the function acornfb_probe() to the function .init.text:acornfb_setup() The function __devinit acornfb_probe() references a function __init acornfb_setup(). If acornfb_setup is only used by acornfb_probe then annotate acornfb_setup with a matching annotation. WARNING: drivers/video/built-in.o(.devinit.text+0x3c): Section mismatch in reference from the function acornfb_probe() to the function .init.text:acornfb_init_fbinfo() The function __devinit acornfb_probe() references a function __init acornfb_init_fbinfo(). If acornfb_init_fbinfo is only used by acornfb_probe then annotate acornfb_init_fbinfo with a matching annotation. WARNING: drivers/video/built-in.o(.devinit.text+0x4c0): Section mismatch in reference from the function acornfb_probe() to the (unknown reference) .init.data:(unknown) The function __devinit acornfb_probe() references a (unknown reference) __initdata (unknown). If (unknown) is only used by acornfb_probe then annotate (unknown) with a matching annotation. WARNING: drivers/video/built-in.o(.devinit.text+0x4c8): Section mismatch in reference from the function acornfb_probe() to the (unknown reference) .init.data:(unknown) The function __devinit acornfb_probe() references a (unknown reference) __initdata (unknown). If (unknown) is only used by acornfb_probe then annotate (unknown) with a matching annotation. WARNING: drivers/video/built-in.o(.devinit.text+0x4cc): Section mismatch in reference from the function acornfb_probe() to the (unknown reference) .init.data:(unknown) The function __devinit acornfb_probe() references a (unknown reference) __initdata (unknown). If (unknown) is only used by acornfb_probe then annotate (unknown) with a matching annotation. Signed-off-by: Russell King --- drivers/video/acornfb.c | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/drivers/video/acornfb.c b/drivers/video/acornfb.c index 82acb8d..6183a57 100644 --- a/drivers/video/acornfb.c +++ b/drivers/video/acornfb.c @@ -66,7 +66,7 @@ * have. Allow 1% either way on the nominal for TVs. */ #define NR_MONTYPES 6 -static struct fb_monspecs monspecs[NR_MONTYPES] __initdata = { +static struct fb_monspecs monspecs[NR_MONTYPES] __devinitdata = { { /* TV */ .hfmin = 15469, .hfmax = 15781, @@ -873,7 +873,7 @@ static struct fb_ops acornfb_ops = { /* * Everything after here is initialisation!!! */ -static struct fb_videomode modedb[] __initdata = { +static struct fb_videomode modedb[] __devinitdata = { { /* 320x256 @ 50Hz */ NULL, 50, 320, 256, 125000, 92, 62, 35, 19, 38, 2, FB_SYNC_COMP_HIGH_ACT, @@ -925,8 +925,7 @@ static struct fb_videomode modedb[] __initdata = { } }; -static struct fb_videomode __initdata -acornfb_default_mode = { +static struct fb_videomode acornfb_default_mode __devinitdata = { .name = NULL, .refresh = 60, .xres = 640, @@ -942,7 +941,7 @@ acornfb_default_mode = { .vmode = FB_VMODE_NONINTERLACED }; -static void __init acornfb_init_fbinfo(void) +static void __devinit acornfb_init_fbinfo(void) { static int first = 1; @@ -1018,8 +1017,7 @@ static void __init acornfb_init_fbinfo(void) * size can optionally be followed by 'M' or 'K' for * MB or KB respectively. */ -static void __init -acornfb_parse_mon(char *opt) +static void __devinit acornfb_parse_mon(char *opt) { char *p = opt; @@ -1066,8 +1064,7 @@ bad: current_par.montype = -1; } -static void __init -acornfb_parse_montype(char *opt) +static void __devinit acornfb_parse_montype(char *opt) { current_par.montype = -2; @@ -1108,8 +1105,7 @@ acornfb_parse_montype(char *opt) } } -static void __init -acornfb_parse_dram(char *opt) +static void __devinit acornfb_parse_dram(char *opt) { unsigned int size; @@ -1134,15 +1130,14 @@ acornfb_parse_dram(char *opt) static struct options { char *name; void (*parse)(char *opt); -} opt_table[] __initdata = { +} opt_table[] __devinitdata = { { "mon", acornfb_parse_mon }, { "montype", acornfb_parse_montype }, { "dram", acornfb_parse_dram }, { NULL, NULL } }; -int __init -acornfb_setup(char *options) +static int __devinit acornfb_setup(char *options) { struct options *optp; char *opt; @@ -1179,8 +1174,7 @@ acornfb_setup(char *options) * Detect type of monitor connected * For now, we just assume SVGA */ -static int __init -acornfb_detect_monitortype(void) +static int __devinit acornfb_detect_monitortype(void) { return 4; } -- cgit v1.1 From e04d1b23f9706186187dcb0be1a752e48dcc540b Mon Sep 17 00:00:00 2001 From: Lin Ming Date: Fri, 6 May 2011 07:14:02 +0000 Subject: perf events, x86: Add SandyBridge stalled-cycles-frontend/backend events Extend the Intel SandyBridge PMU driver with definitions for generic front-end and back-end stall events. ( As commit 3011203 "perf events, x86: Add Westmere stalled-cycles-frontend/backend events" says, these are only approximations. ) Signed-off-by: Lin Ming Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: Mike Galbraith Cc: Steven Rostedt Link: http://lkml.kernel.org/r/1304666042-17577-1-git-send-email-ming.m.lin@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index e61539b..7cf2ec5 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1474,6 +1474,12 @@ static __init int intel_pmu_init(void) x86_pmu.event_constraints = intel_snb_event_constraints; x86_pmu.pebs_constraints = intel_snb_pebs_events; + + /* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */ + intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e; + /* UOPS_DISPATCHED.THREAD,c=1,i=1 to count stall cycles*/ + intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x18001b1; + pr_cont("SandyBridge events, "); break; -- cgit v1.1 From 925f83c085e1bb08435556c5b4844a60de002e31 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 6 May 2011 01:53:18 +0200 Subject: hw_breakpoints, powerpc: Fix CONFIG_HAVE_HW_BREAKPOINT off-case in ptrace_set_debugreg() We make use of ptrace_get_breakpoints() / ptrace_put_breakpoints() to protect ptrace_set_debugreg() even if CONFIG_HAVE_HW_BREAKPOINT if off. However in this case, these APIs are not implemented. To fix this, push the protection down inside the relevant ifdef. Best would be to export the code inside CONFIG_HAVE_HW_BREAKPOINT into a standalone function to cleanup the ifdefury there and call the breakpoint ref API inside. But as it is more invasive, this should be rather made in an -rc1. Fixes this build error: arch/powerpc/kernel/ptrace.c:1594: error: implicit declaration of function 'ptrace_get_breakpoints' make[2]: *** Reported-by: Ingo Molnar Signed-off-by: Frederic Weisbecker Cc: LPPC Cc: Prasad Cc: v2.6.33.. Link: http://lkml.kernel.org/r/1304639598-4707-1-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/ptrace.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c index 4edeeb3..a6ae1cf 100644 --- a/arch/powerpc/kernel/ptrace.c +++ b/arch/powerpc/kernel/ptrace.c @@ -933,12 +933,16 @@ int ptrace_set_debugreg(struct task_struct *task, unsigned long addr, if (data && !(data & DABR_TRANSLATION)) return -EIO; #ifdef CONFIG_HAVE_HW_BREAKPOINT + if (ptrace_get_breakpoints(task) < 0) + return -ESRCH; + bp = thread->ptrace_bps[0]; if ((!data) || !(data & (DABR_DATA_WRITE | DABR_DATA_READ))) { if (bp) { unregister_hw_breakpoint(bp); thread->ptrace_bps[0] = NULL; } + ptrace_put_breakpoints(task); return 0; } if (bp) { @@ -948,9 +952,12 @@ int ptrace_set_debugreg(struct task_struct *task, unsigned long addr, (DABR_DATA_WRITE | DABR_DATA_READ), &attr.bp_type); ret = modify_user_hw_breakpoint(bp, &attr); - if (ret) + if (ret) { + ptrace_put_breakpoints(task); return ret; + } thread->ptrace_bps[0] = bp; + ptrace_put_breakpoints(task); thread->dabr = data; return 0; } @@ -965,9 +972,12 @@ int ptrace_set_debugreg(struct task_struct *task, unsigned long addr, ptrace_triggered, task); if (IS_ERR(bp)) { thread->ptrace_bps[0] = NULL; + ptrace_put_breakpoints(task); return PTR_ERR(bp); } + ptrace_put_breakpoints(task); + #endif /* CONFIG_HAVE_HW_BREAKPOINT */ /* Move contents to the DABR register */ @@ -1591,10 +1601,7 @@ long arch_ptrace(struct task_struct *child, long request, } case PTRACE_SET_DEBUGREG: - if (ptrace_get_breakpoints(child) < 0) - return -ESRCH; ret = ptrace_set_debugreg(child, addr, data); - ptrace_put_breakpoints(child); break; #ifdef CONFIG_PPC64 -- cgit v1.1 From 63b6a6758eede2f9283c3594265b6e32e75d7456 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 23 Apr 2011 00:57:42 +0200 Subject: perf events, x86: Fix Intel Nehalem and Westmere last level cache event definitions The Intel Nehalem offcore bits implemented in: e994d7d23a0b: perf: Fix LLC-* events on Intel Nehalem/Westmere ... are wrong: they implemented _ACCESS as _HIT and counted OTHER_CORE_HIT* as MISS even though its clearly documented as an L3 hit ... Fix them and the Westmere definitions as well. Cc: Andi Kleen Cc: Lin Ming Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: Mike Galbraith Cc: Steven Rostedt Link: http://lkml.kernel.org/r/1299119690-13991-3-git-send-email-ming.m.lin@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel.c | 87 ++++++++++++++++++++-------------- 1 file changed, 52 insertions(+), 35 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index e61539b..447a28d 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -184,26 +184,23 @@ static __initconst const u64 snb_hw_cache_event_ids }, }, [ C(LL ) ] = { - /* - * TBD: Need Off-core Response Performance Monitoring support - */ [ C(OP_READ) ] = { - /* OFFCORE_RESPONSE_0.ANY_DATA.LOCAL_CACHE */ + /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */ [ C(RESULT_ACCESS) ] = 0x01b7, - /* OFFCORE_RESPONSE_1.ANY_DATA.ANY_LLC_MISS */ - [ C(RESULT_MISS) ] = 0x01bb, + /* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, }, [ C(OP_WRITE) ] = { - /* OFFCORE_RESPONSE_0.ANY_RFO.LOCAL_CACHE */ + /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */ [ C(RESULT_ACCESS) ] = 0x01b7, - /* OFFCORE_RESPONSE_1.ANY_RFO.ANY_LLC_MISS */ - [ C(RESULT_MISS) ] = 0x01bb, + /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, }, [ C(OP_PREFETCH) ] = { - /* OFFCORE_RESPONSE_0.PREFETCH.LOCAL_CACHE */ + /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */ [ C(RESULT_ACCESS) ] = 0x01b7, - /* OFFCORE_RESPONSE_1.PREFETCH.ANY_LLC_MISS */ - [ C(RESULT_MISS) ] = 0x01bb, + /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, }, }, [ C(DTLB) ] = { @@ -285,26 +282,26 @@ static __initconst const u64 westmere_hw_cache_event_ids }, [ C(LL ) ] = { [ C(OP_READ) ] = { - /* OFFCORE_RESPONSE_0.ANY_DATA.LOCAL_CACHE */ + /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */ [ C(RESULT_ACCESS) ] = 0x01b7, - /* OFFCORE_RESPONSE_1.ANY_DATA.ANY_LLC_MISS */ - [ C(RESULT_MISS) ] = 0x01bb, + /* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, }, /* * Use RFO, not WRITEBACK, because a write miss would typically occur * on RFO. */ [ C(OP_WRITE) ] = { - /* OFFCORE_RESPONSE_1.ANY_RFO.LOCAL_CACHE */ - [ C(RESULT_ACCESS) ] = 0x01bb, - /* OFFCORE_RESPONSE_0.ANY_RFO.ANY_LLC_MISS */ + /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01b7, + /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */ [ C(RESULT_MISS) ] = 0x01b7, }, [ C(OP_PREFETCH) ] = { - /* OFFCORE_RESPONSE_0.PREFETCH.LOCAL_CACHE */ + /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */ [ C(RESULT_ACCESS) ] = 0x01b7, - /* OFFCORE_RESPONSE_1.PREFETCH.ANY_LLC_MISS */ - [ C(RESULT_MISS) ] = 0x01bb, + /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, }, }, [ C(DTLB) ] = { @@ -352,16 +349,36 @@ static __initconst const u64 westmere_hw_cache_event_ids }; /* - * OFFCORE_RESPONSE MSR bits (subset), See IA32 SDM Vol 3 30.6.1.3 + * Nehalem/Westmere MSR_OFFCORE_RESPONSE bits; + * See IA32 SDM Vol 3B 30.6.1.3 */ -#define DMND_DATA_RD (1 << 0) -#define DMND_RFO (1 << 1) -#define DMND_WB (1 << 3) -#define PF_DATA_RD (1 << 4) -#define PF_DATA_RFO (1 << 5) -#define RESP_UNCORE_HIT (1 << 8) -#define RESP_MISS (0xf600) /* non uncore hit */ +#define NHM_DMND_DATA_RD (1 << 0) +#define NHM_DMND_RFO (1 << 1) +#define NHM_DMND_IFETCH (1 << 2) +#define NHM_DMND_WB (1 << 3) +#define NHM_PF_DATA_RD (1 << 4) +#define NHM_PF_DATA_RFO (1 << 5) +#define NHM_PF_IFETCH (1 << 6) +#define NHM_OFFCORE_OTHER (1 << 7) +#define NHM_UNCORE_HIT (1 << 8) +#define NHM_OTHER_CORE_HIT_SNP (1 << 9) +#define NHM_OTHER_CORE_HITM (1 << 10) + /* reserved */ +#define NHM_REMOTE_CACHE_FWD (1 << 12) +#define NHM_REMOTE_DRAM (1 << 13) +#define NHM_LOCAL_DRAM (1 << 14) +#define NHM_NON_DRAM (1 << 15) + +#define NHM_ALL_DRAM (NHM_REMOTE_DRAM|NHM_LOCAL_DRAM) + +#define NHM_DMND_READ (NHM_DMND_DATA_RD) +#define NHM_DMND_WRITE (NHM_DMND_RFO|NHM_DMND_WB) +#define NHM_DMND_PREFETCH (NHM_PF_DATA_RD|NHM_PF_DATA_RFO) + +#define NHM_L3_HIT (NHM_UNCORE_HIT|NHM_OTHER_CORE_HIT_SNP|NHM_OTHER_CORE_HITM) +#define NHM_L3_MISS (NHM_NON_DRAM|NHM_ALL_DRAM|NHM_REMOTE_CACHE_FWD) +#define NHM_L3_ACCESS (NHM_L3_HIT|NHM_L3_MISS) static __initconst const u64 nehalem_hw_cache_extra_regs [PERF_COUNT_HW_CACHE_MAX] @@ -370,16 +387,16 @@ static __initconst const u64 nehalem_hw_cache_extra_regs { [ C(LL ) ] = { [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = DMND_DATA_RD|RESP_UNCORE_HIT, - [ C(RESULT_MISS) ] = DMND_DATA_RD|RESP_MISS, + [ C(RESULT_ACCESS) ] = NHM_DMND_READ|NHM_L3_ACCESS, + [ C(RESULT_MISS) ] = NHM_DMND_READ|NHM_L3_MISS, }, [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = DMND_RFO|DMND_WB|RESP_UNCORE_HIT, - [ C(RESULT_MISS) ] = DMND_RFO|DMND_WB|RESP_MISS, + [ C(RESULT_ACCESS) ] = NHM_DMND_WRITE|NHM_L3_ACCESS, + [ C(RESULT_MISS) ] = NHM_DMND_WRITE|NHM_L3_MISS, }, [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = PF_DATA_RD|PF_DATA_RFO|RESP_UNCORE_HIT, - [ C(RESULT_MISS) ] = PF_DATA_RD|PF_DATA_RFO|RESP_MISS, + [ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_L3_ACCESS, + [ C(RESULT_MISS) ] = NHM_DMND_PREFETCH|NHM_L3_MISS, }, } }; -- cgit v1.1 From fa039d5f6b126fbd65eefa05db2f67e44df8f121 Mon Sep 17 00:00:00 2001 From: Timo Warns Date: Fri, 6 May 2011 13:47:35 +0200 Subject: Validate size of EFI GUID partition entries. Otherwise corrupted EFI partition tables can cause total confusion. Signed-off-by: Timo Warns Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- fs/partitions/efi.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/partitions/efi.c b/fs/partitions/efi.c index ac0ccb5..19d6750 100644 --- a/fs/partitions/efi.c +++ b/fs/partitions/efi.c @@ -348,6 +348,12 @@ static int is_gpt_valid(struct parsed_partitions *state, u64 lba, goto fail; } + /* Check that sizeof_partition_entry has the correct value */ + if (le32_to_cpu((*gpt)->sizeof_partition_entry) != sizeof(gpt_entry)) { + pr_debug("GUID Partitition Entry Size check failed.\n"); + goto fail; + } + if (!(*ptes = alloc_read_gpt_entries(state, *gpt))) goto fail; -- cgit v1.1 From ed16648eb5b86917f0b90bdcdbc857202da72f90 Mon Sep 17 00:00:00 2001 From: Rob Landley Date: Fri, 6 May 2011 09:22:02 -0700 Subject: Move kvm, uml, and lguest subdirectories under a common "virtual" directory, I.E: cd Documentation mkdir virtual git mv kvm uml lguest virtual Signed-off-by: Rob Landley Signed-off-by: Randy Dunlap --- Documentation/kvm/api.txt | 1451 ------- Documentation/kvm/cpuid.txt | 45 - Documentation/kvm/locking.txt | 25 - Documentation/kvm/mmu.txt | 348 -- Documentation/kvm/msr.txt | 187 - Documentation/kvm/ppc-pv.txt | 196 - Documentation/kvm/review-checklist.txt | 38 - Documentation/kvm/timekeeping.txt | 612 --- Documentation/lguest/.gitignore | 1 - Documentation/lguest/Makefile | 8 - Documentation/lguest/extract | 58 - Documentation/lguest/lguest.c | 2095 ---------- Documentation/lguest/lguest.txt | 128 - Documentation/uml/UserModeLinux-HOWTO.txt | 4579 --------------------- Documentation/virtual/kvm/api.txt | 1451 +++++++ Documentation/virtual/kvm/cpuid.txt | 45 + Documentation/virtual/kvm/locking.txt | 25 + Documentation/virtual/kvm/mmu.txt | 348 ++ Documentation/virtual/kvm/msr.txt | 187 + Documentation/virtual/kvm/ppc-pv.txt | 196 + Documentation/virtual/kvm/review-checklist.txt | 38 + Documentation/virtual/kvm/timekeeping.txt | 612 +++ Documentation/virtual/lguest/.gitignore | 1 + Documentation/virtual/lguest/Makefile | 8 + Documentation/virtual/lguest/extract | 58 + Documentation/virtual/lguest/lguest.c | 2095 ++++++++++ Documentation/virtual/lguest/lguest.txt | 128 + Documentation/virtual/uml/UserModeLinux-HOWTO.txt | 4579 +++++++++++++++++++++ 28 files changed, 9771 insertions(+), 9771 deletions(-) delete mode 100644 Documentation/kvm/api.txt delete mode 100644 Documentation/kvm/cpuid.txt delete mode 100644 Documentation/kvm/locking.txt delete mode 100644 Documentation/kvm/mmu.txt delete mode 100644 Documentation/kvm/msr.txt delete mode 100644 Documentation/kvm/ppc-pv.txt delete mode 100644 Documentation/kvm/review-checklist.txt delete mode 100644 Documentation/kvm/timekeeping.txt delete mode 100644 Documentation/lguest/.gitignore delete mode 100644 Documentation/lguest/Makefile delete mode 100644 Documentation/lguest/extract delete mode 100644 Documentation/lguest/lguest.c delete mode 100644 Documentation/lguest/lguest.txt delete mode 100644 Documentation/uml/UserModeLinux-HOWTO.txt create mode 100644 Documentation/virtual/kvm/api.txt create mode 100644 Documentation/virtual/kvm/cpuid.txt create mode 100644 Documentation/virtual/kvm/locking.txt create mode 100644 Documentation/virtual/kvm/mmu.txt create mode 100644 Documentation/virtual/kvm/msr.txt create mode 100644 Documentation/virtual/kvm/ppc-pv.txt create mode 100644 Documentation/virtual/kvm/review-checklist.txt create mode 100644 Documentation/virtual/kvm/timekeeping.txt create mode 100644 Documentation/virtual/lguest/.gitignore create mode 100644 Documentation/virtual/lguest/Makefile create mode 100644 Documentation/virtual/lguest/extract create mode 100644 Documentation/virtual/lguest/lguest.c create mode 100644 Documentation/virtual/lguest/lguest.txt create mode 100644 Documentation/virtual/uml/UserModeLinux-HOWTO.txt diff --git a/Documentation/kvm/api.txt b/Documentation/kvm/api.txt deleted file mode 100644 index 9bef4e4..0000000 --- a/Documentation/kvm/api.txt +++ /dev/null @@ -1,1451 +0,0 @@ -The Definitive KVM (Kernel-based Virtual Machine) API Documentation -=================================================================== - -1. General description - -The kvm API is a set of ioctls that are issued to control various aspects -of a virtual machine. The ioctls belong to three classes - - - System ioctls: These query and set global attributes which affect the - whole kvm subsystem. In addition a system ioctl is used to create - virtual machines - - - VM ioctls: These query and set attributes that affect an entire virtual - machine, for example memory layout. In addition a VM ioctl is used to - create virtual cpus (vcpus). - - Only run VM ioctls from the same process (address space) that was used - to create the VM. - - - vcpu ioctls: These query and set attributes that control the operation - of a single virtual cpu. - - Only run vcpu ioctls from the same thread that was used to create the - vcpu. - -2. File descriptors - -The kvm API is centered around file descriptors. An initial -open("/dev/kvm") obtains a handle to the kvm subsystem; this handle -can be used to issue system ioctls. A KVM_CREATE_VM ioctl on this -handle will create a VM file descriptor which can be used to issue VM -ioctls. A KVM_CREATE_VCPU ioctl on a VM fd will create a virtual cpu -and return a file descriptor pointing to it. Finally, ioctls on a vcpu -fd can be used to control the vcpu, including the important task of -actually running guest code. - -In general file descriptors can be migrated among processes by means -of fork() and the SCM_RIGHTS facility of unix domain socket. These -kinds of tricks are explicitly not supported by kvm. While they will -not cause harm to the host, their actual behavior is not guaranteed by -the API. The only supported use is one virtual machine per process, -and one vcpu per thread. - -3. Extensions - -As of Linux 2.6.22, the KVM ABI has been stabilized: no backward -incompatible change are allowed. However, there is an extension -facility that allows backward-compatible extensions to the API to be -queried and used. - -The extension mechanism is not based on on the Linux version number. -Instead, kvm defines extension identifiers and a facility to query -whether a particular extension identifier is available. If it is, a -set of ioctls is available for application use. - -4. API description - -This section describes ioctls that can be used to control kvm guests. -For each ioctl, the following information is provided along with a -description: - - Capability: which KVM extension provides this ioctl. Can be 'basic', - which means that is will be provided by any kernel that supports - API version 12 (see section 4.1), or a KVM_CAP_xyz constant, which - means availability needs to be checked with KVM_CHECK_EXTENSION - (see section 4.4). - - Architectures: which instruction set architectures provide this ioctl. - x86 includes both i386 and x86_64. - - Type: system, vm, or vcpu. - - Parameters: what parameters are accepted by the ioctl. - - Returns: the return value. General error numbers (EBADF, ENOMEM, EINVAL) - are not detailed, but errors with specific meanings are. - -4.1 KVM_GET_API_VERSION - -Capability: basic -Architectures: all -Type: system ioctl -Parameters: none -Returns: the constant KVM_API_VERSION (=12) - -This identifies the API version as the stable kvm API. It is not -expected that this number will change. However, Linux 2.6.20 and -2.6.21 report earlier versions; these are not documented and not -supported. Applications should refuse to run if KVM_GET_API_VERSION -returns a value other than 12. If this check passes, all ioctls -described as 'basic' will be available. - -4.2 KVM_CREATE_VM - -Capability: basic -Architectures: all -Type: system ioctl -Parameters: none -Returns: a VM fd that can be used to control the new virtual machine. - -The new VM has no virtual cpus and no memory. An mmap() of a VM fd -will access the virtual machine's physical address space; offset zero -corresponds to guest physical address zero. Use of mmap() on a VM fd -is discouraged if userspace memory allocation (KVM_CAP_USER_MEMORY) is -available. - -4.3 KVM_GET_MSR_INDEX_LIST - -Capability: basic -Architectures: x86 -Type: system -Parameters: struct kvm_msr_list (in/out) -Returns: 0 on success; -1 on error -Errors: - E2BIG: the msr index list is to be to fit in the array specified by - the user. - -struct kvm_msr_list { - __u32 nmsrs; /* number of msrs in entries */ - __u32 indices[0]; -}; - -This ioctl returns the guest msrs that are supported. The list varies -by kvm version and host processor, but does not change otherwise. The -user fills in the size of the indices array in nmsrs, and in return -kvm adjusts nmsrs to reflect the actual number of msrs and fills in -the indices array with their numbers. - -Note: if kvm indicates supports MCE (KVM_CAP_MCE), then the MCE bank MSRs are -not returned in the MSR list, as different vcpus can have a different number -of banks, as set via the KVM_X86_SETUP_MCE ioctl. - -4.4 KVM_CHECK_EXTENSION - -Capability: basic -Architectures: all -Type: system ioctl -Parameters: extension identifier (KVM_CAP_*) -Returns: 0 if unsupported; 1 (or some other positive integer) if supported - -The API allows the application to query about extensions to the core -kvm API. Userspace passes an extension identifier (an integer) and -receives an integer that describes the extension availability. -Generally 0 means no and 1 means yes, but some extensions may report -additional information in the integer return value. - -4.5 KVM_GET_VCPU_MMAP_SIZE - -Capability: basic -Architectures: all -Type: system ioctl -Parameters: none -Returns: size of vcpu mmap area, in bytes - -The KVM_RUN ioctl (cf.) communicates with userspace via a shared -memory region. This ioctl returns the size of that region. See the -KVM_RUN documentation for details. - -4.6 KVM_SET_MEMORY_REGION - -Capability: basic -Architectures: all -Type: vm ioctl -Parameters: struct kvm_memory_region (in) -Returns: 0 on success, -1 on error - -This ioctl is obsolete and has been removed. - -4.7 KVM_CREATE_VCPU - -Capability: basic -Architectures: all -Type: vm ioctl -Parameters: vcpu id (apic id on x86) -Returns: vcpu fd on success, -1 on error - -This API adds a vcpu to a virtual machine. The vcpu id is a small integer -in the range [0, max_vcpus). - -4.8 KVM_GET_DIRTY_LOG (vm ioctl) - -Capability: basic -Architectures: x86 -Type: vm ioctl -Parameters: struct kvm_dirty_log (in/out) -Returns: 0 on success, -1 on error - -/* for KVM_GET_DIRTY_LOG */ -struct kvm_dirty_log { - __u32 slot; - __u32 padding; - union { - void __user *dirty_bitmap; /* one bit per page */ - __u64 padding; - }; -}; - -Given a memory slot, return a bitmap containing any pages dirtied -since the last call to this ioctl. Bit 0 is the first page in the -memory slot. Ensure the entire structure is cleared to avoid padding -issues. - -4.9 KVM_SET_MEMORY_ALIAS - -Capability: basic -Architectures: x86 -Type: vm ioctl -Parameters: struct kvm_memory_alias (in) -Returns: 0 (success), -1 (error) - -This ioctl is obsolete and has been removed. - -4.10 KVM_RUN - -Capability: basic -Architectures: all -Type: vcpu ioctl -Parameters: none -Returns: 0 on success, -1 on error -Errors: - EINTR: an unmasked signal is pending - -This ioctl is used to run a guest virtual cpu. While there are no -explicit parameters, there is an implicit parameter block that can be -obtained by mmap()ing the vcpu fd at offset 0, with the size given by -KVM_GET_VCPU_MMAP_SIZE. The parameter block is formatted as a 'struct -kvm_run' (see below). - -4.11 KVM_GET_REGS - -Capability: basic -Architectures: all -Type: vcpu ioctl -Parameters: struct kvm_regs (out) -Returns: 0 on success, -1 on error - -Reads the general purpose registers from the vcpu. - -/* x86 */ -struct kvm_regs { - /* out (KVM_GET_REGS) / in (KVM_SET_REGS) */ - __u64 rax, rbx, rcx, rdx; - __u64 rsi, rdi, rsp, rbp; - __u64 r8, r9, r10, r11; - __u64 r12, r13, r14, r15; - __u64 rip, rflags; -}; - -4.12 KVM_SET_REGS - -Capability: basic -Architectures: all -Type: vcpu ioctl -Parameters: struct kvm_regs (in) -Returns: 0 on success, -1 on error - -Writes the general purpose registers into the vcpu. - -See KVM_GET_REGS for the data structure. - -4.13 KVM_GET_SREGS - -Capability: basic -Architectures: x86 -Type: vcpu ioctl -Parameters: struct kvm_sregs (out) -Returns: 0 on success, -1 on error - -Reads special registers from the vcpu. - -/* x86 */ -struct kvm_sregs { - struct kvm_segment cs, ds, es, fs, gs, ss; - struct kvm_segment tr, ldt; - struct kvm_dtable gdt, idt; - __u64 cr0, cr2, cr3, cr4, cr8; - __u64 efer; - __u64 apic_base; - __u64 interrupt_bitmap[(KVM_NR_INTERRUPTS + 63) / 64]; -}; - -interrupt_bitmap is a bitmap of pending external interrupts. At most -one bit may be set. This interrupt has been acknowledged by the APIC -but not yet injected into the cpu core. - -4.14 KVM_SET_SREGS - -Capability: basic -Architectures: x86 -Type: vcpu ioctl -Parameters: struct kvm_sregs (in) -Returns: 0 on success, -1 on error - -Writes special registers into the vcpu. See KVM_GET_SREGS for the -data structures. - -4.15 KVM_TRANSLATE - -Capability: basic -Architectures: x86 -Type: vcpu ioctl -Parameters: struct kvm_translation (in/out) -Returns: 0 on success, -1 on error - -Translates a virtual address according to the vcpu's current address -translation mode. - -struct kvm_translation { - /* in */ - __u64 linear_address; - - /* out */ - __u64 physical_address; - __u8 valid; - __u8 writeable; - __u8 usermode; - __u8 pad[5]; -}; - -4.16 KVM_INTERRUPT - -Capability: basic -Architectures: x86, ppc -Type: vcpu ioctl -Parameters: struct kvm_interrupt (in) -Returns: 0 on success, -1 on error - -Queues a hardware interrupt vector to be injected. This is only -useful if in-kernel local APIC or equivalent is not used. - -/* for KVM_INTERRUPT */ -struct kvm_interrupt { - /* in */ - __u32 irq; -}; - -X86: - -Note 'irq' is an interrupt vector, not an interrupt pin or line. - -PPC: - -Queues an external interrupt to be injected. This ioctl is overleaded -with 3 different irq values: - -a) KVM_INTERRUPT_SET - - This injects an edge type external interrupt into the guest once it's ready - to receive interrupts. When injected, the interrupt is done. - -b) KVM_INTERRUPT_UNSET - - This unsets any pending interrupt. - - Only available with KVM_CAP_PPC_UNSET_IRQ. - -c) KVM_INTERRUPT_SET_LEVEL - - This injects a level type external interrupt into the guest context. The - interrupt stays pending until a specific ioctl with KVM_INTERRUPT_UNSET - is triggered. - - Only available with KVM_CAP_PPC_IRQ_LEVEL. - -Note that any value for 'irq' other than the ones stated above is invalid -and incurs unexpected behavior. - -4.17 KVM_DEBUG_GUEST - -Capability: basic -Architectures: none -Type: vcpu ioctl -Parameters: none) -Returns: -1 on error - -Support for this has been removed. Use KVM_SET_GUEST_DEBUG instead. - -4.18 KVM_GET_MSRS - -Capability: basic -Architectures: x86 -Type: vcpu ioctl -Parameters: struct kvm_msrs (in/out) -Returns: 0 on success, -1 on error - -Reads model-specific registers from the vcpu. Supported msr indices can -be obtained using KVM_GET_MSR_INDEX_LIST. - -struct kvm_msrs { - __u32 nmsrs; /* number of msrs in entries */ - __u32 pad; - - struct kvm_msr_entry entries[0]; -}; - -struct kvm_msr_entry { - __u32 index; - __u32 reserved; - __u64 data; -}; - -Application code should set the 'nmsrs' member (which indicates the -size of the entries array) and the 'index' member of each array entry. -kvm will fill in the 'data' member. - -4.19 KVM_SET_MSRS - -Capability: basic -Architectures: x86 -Type: vcpu ioctl -Parameters: struct kvm_msrs (in) -Returns: 0 on success, -1 on error - -Writes model-specific registers to the vcpu. See KVM_GET_MSRS for the -data structures. - -Application code should set the 'nmsrs' member (which indicates the -size of the entries array), and the 'index' and 'data' members of each -array entry. - -4.20 KVM_SET_CPUID - -Capability: basic -Architectures: x86 -Type: vcpu ioctl -Parameters: struct kvm_cpuid (in) -Returns: 0 on success, -1 on error - -Defines the vcpu responses to the cpuid instruction. Applications -should use the KVM_SET_CPUID2 ioctl if available. - - -struct kvm_cpuid_entry { - __u32 function; - __u32 eax; - __u32 ebx; - __u32 ecx; - __u32 edx; - __u32 padding; -}; - -/* for KVM_SET_CPUID */ -struct kvm_cpuid { - __u32 nent; - __u32 padding; - struct kvm_cpuid_entry entries[0]; -}; - -4.21 KVM_SET_SIGNAL_MASK - -Capability: basic -Architectures: x86 -Type: vcpu ioctl -Parameters: struct kvm_signal_mask (in) -Returns: 0 on success, -1 on error - -Defines which signals are blocked during execution of KVM_RUN. This -signal mask temporarily overrides the threads signal mask. Any -unblocked signal received (except SIGKILL and SIGSTOP, which retain -their traditional behaviour) will cause KVM_RUN to return with -EINTR. - -Note the signal will only be delivered if not blocked by the original -signal mask. - -/* for KVM_SET_SIGNAL_MASK */ -struct kvm_signal_mask { - __u32 len; - __u8 sigset[0]; -}; - -4.22 KVM_GET_FPU - -Capability: basic -Architectures: x86 -Type: vcpu ioctl -Parameters: struct kvm_fpu (out) -Returns: 0 on success, -1 on error - -Reads the floating point state from the vcpu. - -/* for KVM_GET_FPU and KVM_SET_FPU */ -struct kvm_fpu { - __u8 fpr[8][16]; - __u16 fcw; - __u16 fsw; - __u8 ftwx; /* in fxsave format */ - __u8 pad1; - __u16 last_opcode; - __u64 last_ip; - __u64 last_dp; - __u8 xmm[16][16]; - __u32 mxcsr; - __u32 pad2; -}; - -4.23 KVM_SET_FPU - -Capability: basic -Architectures: x86 -Type: vcpu ioctl -Parameters: struct kvm_fpu (in) -Returns: 0 on success, -1 on error - -Writes the floating point state to the vcpu. - -/* for KVM_GET_FPU and KVM_SET_FPU */ -struct kvm_fpu { - __u8 fpr[8][16]; - __u16 fcw; - __u16 fsw; - __u8 ftwx; /* in fxsave format */ - __u8 pad1; - __u16 last_opcode; - __u64 last_ip; - __u64 last_dp; - __u8 xmm[16][16]; - __u32 mxcsr; - __u32 pad2; -}; - -4.24 KVM_CREATE_IRQCHIP - -Capability: KVM_CAP_IRQCHIP -Architectures: x86, ia64 -Type: vm ioctl -Parameters: none -Returns: 0 on success, -1 on error - -Creates an interrupt controller model in the kernel. On x86, creates a virtual -ioapic, a virtual PIC (two PICs, nested), and sets up future vcpus to have a -local APIC. IRQ routing for GSIs 0-15 is set to both PIC and IOAPIC; GSI 16-23 -only go to the IOAPIC. On ia64, a IOSAPIC is created. - -4.25 KVM_IRQ_LINE - -Capability: KVM_CAP_IRQCHIP -Architectures: x86, ia64 -Type: vm ioctl -Parameters: struct kvm_irq_level -Returns: 0 on success, -1 on error - -Sets the level of a GSI input to the interrupt controller model in the kernel. -Requires that an interrupt controller model has been previously created with -KVM_CREATE_IRQCHIP. Note that edge-triggered interrupts require the level -to be set to 1 and then back to 0. - -struct kvm_irq_level { - union { - __u32 irq; /* GSI */ - __s32 status; /* not used for KVM_IRQ_LEVEL */ - }; - __u32 level; /* 0 or 1 */ -}; - -4.26 KVM_GET_IRQCHIP - -Capability: KVM_CAP_IRQCHIP -Architectures: x86, ia64 -Type: vm ioctl -Parameters: struct kvm_irqchip (in/out) -Returns: 0 on success, -1 on error - -Reads the state of a kernel interrupt controller created with -KVM_CREATE_IRQCHIP into a buffer provided by the caller. - -struct kvm_irqchip { - __u32 chip_id; /* 0 = PIC1, 1 = PIC2, 2 = IOAPIC */ - __u32 pad; - union { - char dummy[512]; /* reserving space */ - struct kvm_pic_state pic; - struct kvm_ioapic_state ioapic; - } chip; -}; - -4.27 KVM_SET_IRQCHIP - -Capability: KVM_CAP_IRQCHIP -Architectures: x86, ia64 -Type: vm ioctl -Parameters: struct kvm_irqchip (in) -Returns: 0 on success, -1 on error - -Sets the state of a kernel interrupt controller created with -KVM_CREATE_IRQCHIP from a buffer provided by the caller. - -struct kvm_irqchip { - __u32 chip_id; /* 0 = PIC1, 1 = PIC2, 2 = IOAPIC */ - __u32 pad; - union { - char dummy[512]; /* reserving space */ - struct kvm_pic_state pic; - struct kvm_ioapic_state ioapic; - } chip; -}; - -4.28 KVM_XEN_HVM_CONFIG - -Capability: KVM_CAP_XEN_HVM -Architectures: x86 -Type: vm ioctl -Parameters: struct kvm_xen_hvm_config (in) -Returns: 0 on success, -1 on error - -Sets the MSR that the Xen HVM guest uses to initialize its hypercall -page, and provides the starting address and size of the hypercall -blobs in userspace. When the guest writes the MSR, kvm copies one -page of a blob (32- or 64-bit, depending on the vcpu mode) to guest -memory. - -struct kvm_xen_hvm_config { - __u32 flags; - __u32 msr; - __u64 blob_addr_32; - __u64 blob_addr_64; - __u8 blob_size_32; - __u8 blob_size_64; - __u8 pad2[30]; -}; - -4.29 KVM_GET_CLOCK - -Capability: KVM_CAP_ADJUST_CLOCK -Architectures: x86 -Type: vm ioctl -Parameters: struct kvm_clock_data (out) -Returns: 0 on success, -1 on error - -Gets the current timestamp of kvmclock as seen by the current guest. In -conjunction with KVM_SET_CLOCK, it is used to ensure monotonicity on scenarios -such as migration. - -struct kvm_clock_data { - __u64 clock; /* kvmclock current value */ - __u32 flags; - __u32 pad[9]; -}; - -4.30 KVM_SET_CLOCK - -Capability: KVM_CAP_ADJUST_CLOCK -Architectures: x86 -Type: vm ioctl -Parameters: struct kvm_clock_data (in) -Returns: 0 on success, -1 on error - -Sets the current timestamp of kvmclock to the value specified in its parameter. -In conjunction with KVM_GET_CLOCK, it is used to ensure monotonicity on scenarios -such as migration. - -struct kvm_clock_data { - __u64 clock; /* kvmclock current value */ - __u32 flags; - __u32 pad[9]; -}; - -4.31 KVM_GET_VCPU_EVENTS - -Capability: KVM_CAP_VCPU_EVENTS -Extended by: KVM_CAP_INTR_SHADOW -Architectures: x86 -Type: vm ioctl -Parameters: struct kvm_vcpu_event (out) -Returns: 0 on success, -1 on error - -Gets currently pending exceptions, interrupts, and NMIs as well as related -states of the vcpu. - -struct kvm_vcpu_events { - struct { - __u8 injected; - __u8 nr; - __u8 has_error_code; - __u8 pad; - __u32 error_code; - } exception; - struct { - __u8 injected; - __u8 nr; - __u8 soft; - __u8 shadow; - } interrupt; - struct { - __u8 injected; - __u8 pending; - __u8 masked; - __u8 pad; - } nmi; - __u32 sipi_vector; - __u32 flags; -}; - -KVM_VCPUEVENT_VALID_SHADOW may be set in the flags field to signal that -interrupt.shadow contains a valid state. Otherwise, this field is undefined. - -4.32 KVM_SET_VCPU_EVENTS - -Capability: KVM_CAP_VCPU_EVENTS -Extended by: KVM_CAP_INTR_SHADOW -Architectures: x86 -Type: vm ioctl -Parameters: struct kvm_vcpu_event (in) -Returns: 0 on success, -1 on error - -Set pending exceptions, interrupts, and NMIs as well as related states of the -vcpu. - -See KVM_GET_VCPU_EVENTS for the data structure. - -Fields that may be modified asynchronously by running VCPUs can be excluded -from the update. These fields are nmi.pending and sipi_vector. Keep the -corresponding bits in the flags field cleared to suppress overwriting the -current in-kernel state. The bits are: - -KVM_VCPUEVENT_VALID_NMI_PENDING - transfer nmi.pending to the kernel -KVM_VCPUEVENT_VALID_SIPI_VECTOR - transfer sipi_vector - -If KVM_CAP_INTR_SHADOW is available, KVM_VCPUEVENT_VALID_SHADOW can be set in -the flags field to signal that interrupt.shadow contains a valid state and -shall be written into the VCPU. - -4.33 KVM_GET_DEBUGREGS - -Capability: KVM_CAP_DEBUGREGS -Architectures: x86 -Type: vm ioctl -Parameters: struct kvm_debugregs (out) -Returns: 0 on success, -1 on error - -Reads debug registers from the vcpu. - -struct kvm_debugregs { - __u64 db[4]; - __u64 dr6; - __u64 dr7; - __u64 flags; - __u64 reserved[9]; -}; - -4.34 KVM_SET_DEBUGREGS - -Capability: KVM_CAP_DEBUGREGS -Architectures: x86 -Type: vm ioctl -Parameters: struct kvm_debugregs (in) -Returns: 0 on success, -1 on error - -Writes debug registers into the vcpu. - -See KVM_GET_DEBUGREGS for the data structure. The flags field is unused -yet and must be cleared on entry. - -4.35 KVM_SET_USER_MEMORY_REGION - -Capability: KVM_CAP_USER_MEM -Architectures: all -Type: vm ioctl -Parameters: struct kvm_userspace_memory_region (in) -Returns: 0 on success, -1 on error - -struct kvm_userspace_memory_region { - __u32 slot; - __u32 flags; - __u64 guest_phys_addr; - __u64 memory_size; /* bytes */ - __u64 userspace_addr; /* start of the userspace allocated memory */ -}; - -/* for kvm_memory_region::flags */ -#define KVM_MEM_LOG_DIRTY_PAGES 1UL - -This ioctl allows the user to create or modify a guest physical memory -slot. When changing an existing slot, it may be moved in the guest -physical memory space, or its flags may be modified. It may not be -resized. Slots may not overlap in guest physical address space. - -Memory for the region is taken starting at the address denoted by the -field userspace_addr, which must point at user addressable memory for -the entire memory slot size. Any object may back this memory, including -anonymous memory, ordinary files, and hugetlbfs. - -It is recommended that the lower 21 bits of guest_phys_addr and userspace_addr -be identical. This allows large pages in the guest to be backed by large -pages in the host. - -The flags field supports just one flag, KVM_MEM_LOG_DIRTY_PAGES, which -instructs kvm to keep track of writes to memory within the slot. See -the KVM_GET_DIRTY_LOG ioctl. - -When the KVM_CAP_SYNC_MMU capability, changes in the backing of the memory -region are automatically reflected into the guest. For example, an mmap() -that affects the region will be made visible immediately. Another example -is madvise(MADV_DROP). - -It is recommended to use this API instead of the KVM_SET_MEMORY_REGION ioctl. -The KVM_SET_MEMORY_REGION does not allow fine grained control over memory -allocation and is deprecated. - -4.36 KVM_SET_TSS_ADDR - -Capability: KVM_CAP_SET_TSS_ADDR -Architectures: x86 -Type: vm ioctl -Parameters: unsigned long tss_address (in) -Returns: 0 on success, -1 on error - -This ioctl defines the physical address of a three-page region in the guest -physical address space. The region must be within the first 4GB of the -guest physical address space and must not conflict with any memory slot -or any mmio address. The guest may malfunction if it accesses this memory -region. - -This ioctl is required on Intel-based hosts. This is needed on Intel hardware -because of a quirk in the virtualization implementation (see the internals -documentation when it pops into existence). - -4.37 KVM_ENABLE_CAP - -Capability: KVM_CAP_ENABLE_CAP -Architectures: ppc -Type: vcpu ioctl -Parameters: struct kvm_enable_cap (in) -Returns: 0 on success; -1 on error - -+Not all extensions are enabled by default. Using this ioctl the application -can enable an extension, making it available to the guest. - -On systems that do not support this ioctl, it always fails. On systems that -do support it, it only works for extensions that are supported for enablement. - -To check if a capability can be enabled, the KVM_CHECK_EXTENSION ioctl should -be used. - -struct kvm_enable_cap { - /* in */ - __u32 cap; - -The capability that is supposed to get enabled. - - __u32 flags; - -A bitfield indicating future enhancements. Has to be 0 for now. - - __u64 args[4]; - -Arguments for enabling a feature. If a feature needs initial values to -function properly, this is the place to put them. - - __u8 pad[64]; -}; - -4.38 KVM_GET_MP_STATE - -Capability: KVM_CAP_MP_STATE -Architectures: x86, ia64 -Type: vcpu ioctl -Parameters: struct kvm_mp_state (out) -Returns: 0 on success; -1 on error - -struct kvm_mp_state { - __u32 mp_state; -}; - -Returns the vcpu's current "multiprocessing state" (though also valid on -uniprocessor guests). - -Possible values are: - - - KVM_MP_STATE_RUNNABLE: the vcpu is currently running - - KVM_MP_STATE_UNINITIALIZED: the vcpu is an application processor (AP) - which has not yet received an INIT signal - - KVM_MP_STATE_INIT_RECEIVED: the vcpu has received an INIT signal, and is - now ready for a SIPI - - KVM_MP_STATE_HALTED: the vcpu has executed a HLT instruction and - is waiting for an interrupt - - KVM_MP_STATE_SIPI_RECEIVED: the vcpu has just received a SIPI (vector - accessible via KVM_GET_VCPU_EVENTS) - -This ioctl is only useful after KVM_CREATE_IRQCHIP. Without an in-kernel -irqchip, the multiprocessing state must be maintained by userspace. - -4.39 KVM_SET_MP_STATE - -Capability: KVM_CAP_MP_STATE -Architectures: x86, ia64 -Type: vcpu ioctl -Parameters: struct kvm_mp_state (in) -Returns: 0 on success; -1 on error - -Sets the vcpu's current "multiprocessing state"; see KVM_GET_MP_STATE for -arguments. - -This ioctl is only useful after KVM_CREATE_IRQCHIP. Without an in-kernel -irqchip, the multiprocessing state must be maintained by userspace. - -4.40 KVM_SET_IDENTITY_MAP_ADDR - -Capability: KVM_CAP_SET_IDENTITY_MAP_ADDR -Architectures: x86 -Type: vm ioctl -Parameters: unsigned long identity (in) -Returns: 0 on success, -1 on error - -This ioctl defines the physical address of a one-page region in the guest -physical address space. The region must be within the first 4GB of the -guest physical address space and must not conflict with any memory slot -or any mmio address. The guest may malfunction if it accesses this memory -region. - -This ioctl is required on Intel-based hosts. This is needed on Intel hardware -because of a quirk in the virtualization implementation (see the internals -documentation when it pops into existence). - -4.41 KVM_SET_BOOT_CPU_ID - -Capability: KVM_CAP_SET_BOOT_CPU_ID -Architectures: x86, ia64 -Type: vm ioctl -Parameters: unsigned long vcpu_id -Returns: 0 on success, -1 on error - -Define which vcpu is the Bootstrap Processor (BSP). Values are the same -as the vcpu id in KVM_CREATE_VCPU. If this ioctl is not called, the default -is vcpu 0. - -4.42 KVM_GET_XSAVE - -Capability: KVM_CAP_XSAVE -Architectures: x86 -Type: vcpu ioctl -Parameters: struct kvm_xsave (out) -Returns: 0 on success, -1 on error - -struct kvm_xsave { - __u32 region[1024]; -}; - -This ioctl would copy current vcpu's xsave struct to the userspace. - -4.43 KVM_SET_XSAVE - -Capability: KVM_CAP_XSAVE -Architectures: x86 -Type: vcpu ioctl -Parameters: struct kvm_xsave (in) -Returns: 0 on success, -1 on error - -struct kvm_xsave { - __u32 region[1024]; -}; - -This ioctl would copy userspace's xsave struct to the kernel. - -4.44 KVM_GET_XCRS - -Capability: KVM_CAP_XCRS -Architectures: x86 -Type: vcpu ioctl -Parameters: struct kvm_xcrs (out) -Returns: 0 on success, -1 on error - -struct kvm_xcr { - __u32 xcr; - __u32 reserved; - __u64 value; -}; - -struct kvm_xcrs { - __u32 nr_xcrs; - __u32 flags; - struct kvm_xcr xcrs[KVM_MAX_XCRS]; - __u64 padding[16]; -}; - -This ioctl would copy current vcpu's xcrs to the userspace. - -4.45 KVM_SET_XCRS - -Capability: KVM_CAP_XCRS -Architectures: x86 -Type: vcpu ioctl -Parameters: struct kvm_xcrs (in) -Returns: 0 on success, -1 on error - -struct kvm_xcr { - __u32 xcr; - __u32 reserved; - __u64 value; -}; - -struct kvm_xcrs { - __u32 nr_xcrs; - __u32 flags; - struct kvm_xcr xcrs[KVM_MAX_XCRS]; - __u64 padding[16]; -}; - -This ioctl would set vcpu's xcr to the value userspace specified. - -4.46 KVM_GET_SUPPORTED_CPUID - -Capability: KVM_CAP_EXT_CPUID -Architectures: x86 -Type: system ioctl -Parameters: struct kvm_cpuid2 (in/out) -Returns: 0 on success, -1 on error - -struct kvm_cpuid2 { - __u32 nent; - __u32 padding; - struct kvm_cpuid_entry2 entries[0]; -}; - -#define KVM_CPUID_FLAG_SIGNIFCANT_INDEX 1 -#define KVM_CPUID_FLAG_STATEFUL_FUNC 2 -#define KVM_CPUID_FLAG_STATE_READ_NEXT 4 - -struct kvm_cpuid_entry2 { - __u32 function; - __u32 index; - __u32 flags; - __u32 eax; - __u32 ebx; - __u32 ecx; - __u32 edx; - __u32 padding[3]; -}; - -This ioctl returns x86 cpuid features which are supported by both the hardware -and kvm. Userspace can use the information returned by this ioctl to -construct cpuid information (for KVM_SET_CPUID2) that is consistent with -hardware, kernel, and userspace capabilities, and with user requirements (for -example, the user may wish to constrain cpuid to emulate older hardware, -or for feature consistency across a cluster). - -Userspace invokes KVM_GET_SUPPORTED_CPUID by passing a kvm_cpuid2 structure -with the 'nent' field indicating the number of entries in the variable-size -array 'entries'. If the number of entries is too low to describe the cpu -capabilities, an error (E2BIG) is returned. If the number is too high, -the 'nent' field is adjusted and an error (ENOMEM) is returned. If the -number is just right, the 'nent' field is adjusted to the number of valid -entries in the 'entries' array, which is then filled. - -The entries returned are the host cpuid as returned by the cpuid instruction, -with unknown or unsupported features masked out. Some features (for example, -x2apic), may not be present in the host cpu, but are exposed by kvm if it can -emulate them efficiently. The fields in each entry are defined as follows: - - function: the eax value used to obtain the entry - index: the ecx value used to obtain the entry (for entries that are - affected by ecx) - flags: an OR of zero or more of the following: - KVM_CPUID_FLAG_SIGNIFCANT_INDEX: - if the index field is valid - KVM_CPUID_FLAG_STATEFUL_FUNC: - if cpuid for this function returns different values for successive - invocations; there will be several entries with the same function, - all with this flag set - KVM_CPUID_FLAG_STATE_READ_NEXT: - for KVM_CPUID_FLAG_STATEFUL_FUNC entries, set if this entry is - the first entry to be read by a cpu - eax, ebx, ecx, edx: the values returned by the cpuid instruction for - this function/index combination - -4.47 KVM_PPC_GET_PVINFO - -Capability: KVM_CAP_PPC_GET_PVINFO -Architectures: ppc -Type: vm ioctl -Parameters: struct kvm_ppc_pvinfo (out) -Returns: 0 on success, !0 on error - -struct kvm_ppc_pvinfo { - __u32 flags; - __u32 hcall[4]; - __u8 pad[108]; -}; - -This ioctl fetches PV specific information that need to be passed to the guest -using the device tree or other means from vm context. - -For now the only implemented piece of information distributed here is an array -of 4 instructions that make up a hypercall. - -If any additional field gets added to this structure later on, a bit for that -additional piece of information will be set in the flags bitmap. - -4.48 KVM_ASSIGN_PCI_DEVICE - -Capability: KVM_CAP_DEVICE_ASSIGNMENT -Architectures: x86 ia64 -Type: vm ioctl -Parameters: struct kvm_assigned_pci_dev (in) -Returns: 0 on success, -1 on error - -Assigns a host PCI device to the VM. - -struct kvm_assigned_pci_dev { - __u32 assigned_dev_id; - __u32 busnr; - __u32 devfn; - __u32 flags; - __u32 segnr; - union { - __u32 reserved[11]; - }; -}; - -The PCI device is specified by the triple segnr, busnr, and devfn. -Identification in succeeding service requests is done via assigned_dev_id. The -following flags are specified: - -/* Depends on KVM_CAP_IOMMU */ -#define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) - -4.49 KVM_DEASSIGN_PCI_DEVICE - -Capability: KVM_CAP_DEVICE_DEASSIGNMENT -Architectures: x86 ia64 -Type: vm ioctl -Parameters: struct kvm_assigned_pci_dev (in) -Returns: 0 on success, -1 on error - -Ends PCI device assignment, releasing all associated resources. - -See KVM_CAP_DEVICE_ASSIGNMENT for the data structure. Only assigned_dev_id is -used in kvm_assigned_pci_dev to identify the device. - -4.50 KVM_ASSIGN_DEV_IRQ - -Capability: KVM_CAP_ASSIGN_DEV_IRQ -Architectures: x86 ia64 -Type: vm ioctl -Parameters: struct kvm_assigned_irq (in) -Returns: 0 on success, -1 on error - -Assigns an IRQ to a passed-through device. - -struct kvm_assigned_irq { - __u32 assigned_dev_id; - __u32 host_irq; - __u32 guest_irq; - __u32 flags; - union { - struct { - __u32 addr_lo; - __u32 addr_hi; - __u32 data; - } guest_msi; - __u32 reserved[12]; - }; -}; - -The following flags are defined: - -#define KVM_DEV_IRQ_HOST_INTX (1 << 0) -#define KVM_DEV_IRQ_HOST_MSI (1 << 1) -#define KVM_DEV_IRQ_HOST_MSIX (1 << 2) - -#define KVM_DEV_IRQ_GUEST_INTX (1 << 8) -#define KVM_DEV_IRQ_GUEST_MSI (1 << 9) -#define KVM_DEV_IRQ_GUEST_MSIX (1 << 10) - -It is not valid to specify multiple types per host or guest IRQ. However, the -IRQ type of host and guest can differ or can even be null. - -4.51 KVM_DEASSIGN_DEV_IRQ - -Capability: KVM_CAP_ASSIGN_DEV_IRQ -Architectures: x86 ia64 -Type: vm ioctl -Parameters: struct kvm_assigned_irq (in) -Returns: 0 on success, -1 on error - -Ends an IRQ assignment to a passed-through device. - -See KVM_ASSIGN_DEV_IRQ for the data structure. The target device is specified -by assigned_dev_id, flags must correspond to the IRQ type specified on -KVM_ASSIGN_DEV_IRQ. Partial deassignment of host or guest IRQ is allowed. - -4.52 KVM_SET_GSI_ROUTING - -Capability: KVM_CAP_IRQ_ROUTING -Architectures: x86 ia64 -Type: vm ioctl -Parameters: struct kvm_irq_routing (in) -Returns: 0 on success, -1 on error - -Sets the GSI routing table entries, overwriting any previously set entries. - -struct kvm_irq_routing { - __u32 nr; - __u32 flags; - struct kvm_irq_routing_entry entries[0]; -}; - -No flags are specified so far, the corresponding field must be set to zero. - -struct kvm_irq_routing_entry { - __u32 gsi; - __u32 type; - __u32 flags; - __u32 pad; - union { - struct kvm_irq_routing_irqchip irqchip; - struct kvm_irq_routing_msi msi; - __u32 pad[8]; - } u; -}; - -/* gsi routing entry types */ -#define KVM_IRQ_ROUTING_IRQCHIP 1 -#define KVM_IRQ_ROUTING_MSI 2 - -No flags are specified so far, the corresponding field must be set to zero. - -struct kvm_irq_routing_irqchip { - __u32 irqchip; - __u32 pin; -}; - -struct kvm_irq_routing_msi { - __u32 address_lo; - __u32 address_hi; - __u32 data; - __u32 pad; -}; - -4.53 KVM_ASSIGN_SET_MSIX_NR - -Capability: KVM_CAP_DEVICE_MSIX -Architectures: x86 ia64 -Type: vm ioctl -Parameters: struct kvm_assigned_msix_nr (in) -Returns: 0 on success, -1 on error - -Set the number of MSI-X interrupts for an assigned device. This service can -only be called once in the lifetime of an assigned device. - -struct kvm_assigned_msix_nr { - __u32 assigned_dev_id; - __u16 entry_nr; - __u16 padding; -}; - -#define KVM_MAX_MSIX_PER_DEV 256 - -4.54 KVM_ASSIGN_SET_MSIX_ENTRY - -Capability: KVM_CAP_DEVICE_MSIX -Architectures: x86 ia64 -Type: vm ioctl -Parameters: struct kvm_assigned_msix_entry (in) -Returns: 0 on success, -1 on error - -Specifies the routing of an MSI-X assigned device interrupt to a GSI. Setting -the GSI vector to zero means disabling the interrupt. - -struct kvm_assigned_msix_entry { - __u32 assigned_dev_id; - __u32 gsi; - __u16 entry; /* The index of entry in the MSI-X table */ - __u16 padding[3]; -}; - -5. The kvm_run structure - -Application code obtains a pointer to the kvm_run structure by -mmap()ing a vcpu fd. From that point, application code can control -execution by changing fields in kvm_run prior to calling the KVM_RUN -ioctl, and obtain information about the reason KVM_RUN returned by -looking up structure members. - -struct kvm_run { - /* in */ - __u8 request_interrupt_window; - -Request that KVM_RUN return when it becomes possible to inject external -interrupts into the guest. Useful in conjunction with KVM_INTERRUPT. - - __u8 padding1[7]; - - /* out */ - __u32 exit_reason; - -When KVM_RUN has returned successfully (return value 0), this informs -application code why KVM_RUN has returned. Allowable values for this -field are detailed below. - - __u8 ready_for_interrupt_injection; - -If request_interrupt_window has been specified, this field indicates -an interrupt can be injected now with KVM_INTERRUPT. - - __u8 if_flag; - -The value of the current interrupt flag. Only valid if in-kernel -local APIC is not used. - - __u8 padding2[2]; - - /* in (pre_kvm_run), out (post_kvm_run) */ - __u64 cr8; - -The value of the cr8 register. Only valid if in-kernel local APIC is -not used. Both input and output. - - __u64 apic_base; - -The value of the APIC BASE msr. Only valid if in-kernel local -APIC is not used. Both input and output. - - union { - /* KVM_EXIT_UNKNOWN */ - struct { - __u64 hardware_exit_reason; - } hw; - -If exit_reason is KVM_EXIT_UNKNOWN, the vcpu has exited due to unknown -reasons. Further architecture-specific information is available in -hardware_exit_reason. - - /* KVM_EXIT_FAIL_ENTRY */ - struct { - __u64 hardware_entry_failure_reason; - } fail_entry; - -If exit_reason is KVM_EXIT_FAIL_ENTRY, the vcpu could not be run due -to unknown reasons. Further architecture-specific information is -available in hardware_entry_failure_reason. - - /* KVM_EXIT_EXCEPTION */ - struct { - __u32 exception; - __u32 error_code; - } ex; - -Unused. - - /* KVM_EXIT_IO */ - struct { -#define KVM_EXIT_IO_IN 0 -#define KVM_EXIT_IO_OUT 1 - __u8 direction; - __u8 size; /* bytes */ - __u16 port; - __u32 count; - __u64 data_offset; /* relative to kvm_run start */ - } io; - -If exit_reason is KVM_EXIT_IO, then the vcpu has -executed a port I/O instruction which could not be satisfied by kvm. -data_offset describes where the data is located (KVM_EXIT_IO_OUT) or -where kvm expects application code to place the data for the next -KVM_RUN invocation (KVM_EXIT_IO_IN). Data format is a packed array. - - struct { - struct kvm_debug_exit_arch arch; - } debug; - -Unused. - - /* KVM_EXIT_MMIO */ - struct { - __u64 phys_addr; - __u8 data[8]; - __u32 len; - __u8 is_write; - } mmio; - -If exit_reason is KVM_EXIT_MMIO, then the vcpu has -executed a memory-mapped I/O instruction which could not be satisfied -by kvm. The 'data' member contains the written data if 'is_write' is -true, and should be filled by application code otherwise. - -NOTE: For KVM_EXIT_IO, KVM_EXIT_MMIO and KVM_EXIT_OSI, the corresponding -operations are complete (and guest state is consistent) only after userspace -has re-entered the kernel with KVM_RUN. The kernel side will first finish -incomplete operations and then check for pending signals. Userspace -can re-enter the guest with an unmasked signal pending to complete -pending operations. - - /* KVM_EXIT_HYPERCALL */ - struct { - __u64 nr; - __u64 args[6]; - __u64 ret; - __u32 longmode; - __u32 pad; - } hypercall; - -Unused. This was once used for 'hypercall to userspace'. To implement -such functionality, use KVM_EXIT_IO (x86) or KVM_EXIT_MMIO (all except s390). -Note KVM_EXIT_IO is significantly faster than KVM_EXIT_MMIO. - - /* KVM_EXIT_TPR_ACCESS */ - struct { - __u64 rip; - __u32 is_write; - __u32 pad; - } tpr_access; - -To be documented (KVM_TPR_ACCESS_REPORTING). - - /* KVM_EXIT_S390_SIEIC */ - struct { - __u8 icptcode; - __u64 mask; /* psw upper half */ - __u64 addr; /* psw lower half */ - __u16 ipa; - __u32 ipb; - } s390_sieic; - -s390 specific. - - /* KVM_EXIT_S390_RESET */ -#define KVM_S390_RESET_POR 1 -#define KVM_S390_RESET_CLEAR 2 -#define KVM_S390_RESET_SUBSYSTEM 4 -#define KVM_S390_RESET_CPU_INIT 8 -#define KVM_S390_RESET_IPL 16 - __u64 s390_reset_flags; - -s390 specific. - - /* KVM_EXIT_DCR */ - struct { - __u32 dcrn; - __u32 data; - __u8 is_write; - } dcr; - -powerpc specific. - - /* KVM_EXIT_OSI */ - struct { - __u64 gprs[32]; - } osi; - -MOL uses a special hypercall interface it calls 'OSI'. To enable it, we catch -hypercalls and exit with this exit struct that contains all the guest gprs. - -If exit_reason is KVM_EXIT_OSI, then the vcpu has triggered such a hypercall. -Userspace can now handle the hypercall and when it's done modify the gprs as -necessary. Upon guest entry all guest GPRs will then be replaced by the values -in this struct. - - /* Fix the size of the union. */ - char padding[256]; - }; -}; diff --git a/Documentation/kvm/cpuid.txt b/Documentation/kvm/cpuid.txt deleted file mode 100644 index 8820685..0000000 --- a/Documentation/kvm/cpuid.txt +++ /dev/null @@ -1,45 +0,0 @@ -KVM CPUID bits -Glauber Costa , Red Hat Inc, 2010 -===================================================== - -A guest running on a kvm host, can check some of its features using -cpuid. This is not always guaranteed to work, since userspace can -mask-out some, or even all KVM-related cpuid features before launching -a guest. - -KVM cpuid functions are: - -function: KVM_CPUID_SIGNATURE (0x40000000) -returns : eax = 0, - ebx = 0x4b4d564b, - ecx = 0x564b4d56, - edx = 0x4d. -Note that this value in ebx, ecx and edx corresponds to the string "KVMKVMKVM". -This function queries the presence of KVM cpuid leafs. - - -function: define KVM_CPUID_FEATURES (0x40000001) -returns : ebx, ecx, edx = 0 - eax = and OR'ed group of (1 << flag), where each flags is: - - -flag || value || meaning -============================================================================= -KVM_FEATURE_CLOCKSOURCE || 0 || kvmclock available at msrs - || || 0x11 and 0x12. ------------------------------------------------------------------------------- -KVM_FEATURE_NOP_IO_DELAY || 1 || not necessary to perform delays - || || on PIO operations. ------------------------------------------------------------------------------- -KVM_FEATURE_MMU_OP || 2 || deprecated. ------------------------------------------------------------------------------- -KVM_FEATURE_CLOCKSOURCE2 || 3 || kvmclock available at msrs - || || 0x4b564d00 and 0x4b564d01 ------------------------------------------------------------------------------- -KVM_FEATURE_ASYNC_PF || 4 || async pf can be enabled by - || || writing to msr 0x4b564d02 ------------------------------------------------------------------------------- -KVM_FEATURE_CLOCKSOURCE_STABLE_BIT || 24 || host will warn if no guest-side - || || per-cpu warps are expected in - || || kvmclock. ------------------------------------------------------------------------------- diff --git a/Documentation/kvm/locking.txt b/Documentation/kvm/locking.txt deleted file mode 100644 index 3b4cd3b..0000000 --- a/Documentation/kvm/locking.txt +++ /dev/null @@ -1,25 +0,0 @@ -KVM Lock Overview -================= - -1. Acquisition Orders ---------------------- - -(to be written) - -2. Reference ------------- - -Name: kvm_lock -Type: raw_spinlock -Arch: any -Protects: - vm_list - - hardware virtualization enable/disable -Comment: 'raw' because hardware enabling/disabling must be atomic /wrt - migration. - -Name: kvm_arch::tsc_write_lock -Type: raw_spinlock -Arch: x86 -Protects: - kvm_arch::{last_tsc_write,last_tsc_nsec,last_tsc_offset} - - tsc offset in vmcb -Comment: 'raw' because updating the tsc offsets must not be preempted. diff --git a/Documentation/kvm/mmu.txt b/Documentation/kvm/mmu.txt deleted file mode 100644 index f46aa58..0000000 --- a/Documentation/kvm/mmu.txt +++ /dev/null @@ -1,348 +0,0 @@ -The x86 kvm shadow mmu -====================== - -The mmu (in arch/x86/kvm, files mmu.[ch] and paging_tmpl.h) is responsible -for presenting a standard x86 mmu to the guest, while translating guest -physical addresses to host physical addresses. - -The mmu code attempts to satisfy the following requirements: - -- correctness: the guest should not be able to determine that it is running - on an emulated mmu except for timing (we attempt to comply - with the specification, not emulate the characteristics of - a particular implementation such as tlb size) -- security: the guest must not be able to touch host memory not assigned - to it -- performance: minimize the performance penalty imposed by the mmu -- scaling: need to scale to large memory and large vcpu guests -- hardware: support the full range of x86 virtualization hardware -- integration: Linux memory management code must be in control of guest memory - so that swapping, page migration, page merging, transparent - hugepages, and similar features work without change -- dirty tracking: report writes to guest memory to enable live migration - and framebuffer-based displays -- footprint: keep the amount of pinned kernel memory low (most memory - should be shrinkable) -- reliability: avoid multipage or GFP_ATOMIC allocations - -Acronyms -======== - -pfn host page frame number -hpa host physical address -hva host virtual address -gfn guest frame number -gpa guest physical address -gva guest virtual address -ngpa nested guest physical address -ngva nested guest virtual address -pte page table entry (used also to refer generically to paging structure - entries) -gpte guest pte (referring to gfns) -spte shadow pte (referring to pfns) -tdp two dimensional paging (vendor neutral term for NPT and EPT) - -Virtual and real hardware supported -=================================== - -The mmu supports first-generation mmu hardware, which allows an atomic switch -of the current paging mode and cr3 during guest entry, as well as -two-dimensional paging (AMD's NPT and Intel's EPT). The emulated hardware -it exposes is the traditional 2/3/4 level x86 mmu, with support for global -pages, pae, pse, pse36, cr0.wp, and 1GB pages. Work is in progress to support -exposing NPT capable hardware on NPT capable hosts. - -Translation -=========== - -The primary job of the mmu is to program the processor's mmu to translate -addresses for the guest. Different translations are required at different -times: - -- when guest paging is disabled, we translate guest physical addresses to - host physical addresses (gpa->hpa) -- when guest paging is enabled, we translate guest virtual addresses, to - guest physical addresses, to host physical addresses (gva->gpa->hpa) -- when the guest launches a guest of its own, we translate nested guest - virtual addresses, to nested guest physical addresses, to guest physical - addresses, to host physical addresses (ngva->ngpa->gpa->hpa) - -The primary challenge is to encode between 1 and 3 translations into hardware -that support only 1 (traditional) and 2 (tdp) translations. When the -number of required translations matches the hardware, the mmu operates in -direct mode; otherwise it operates in shadow mode (see below). - -Memory -====== - -Guest memory (gpa) is part of the user address space of the process that is -using kvm. Userspace defines the translation between guest addresses and user -addresses (gpa->hva); note that two gpas may alias to the same hva, but not -vice versa. - -These hvas may be backed using any method available to the host: anonymous -memory, file backed memory, and device memory. Memory might be paged by the -host at any time. - -Events -====== - -The mmu is driven by events, some from the guest, some from the host. - -Guest generated events: -- writes to control registers (especially cr3) -- invlpg/invlpga instruction execution -- access to missing or protected translations - -Host generated events: -- changes in the gpa->hpa translation (either through gpa->hva changes or - through hva->hpa changes) -- memory pressure (the shrinker) - -Shadow pages -============ - -The principal data structure is the shadow page, 'struct kvm_mmu_page'. A -shadow page contains 512 sptes, which can be either leaf or nonleaf sptes. A -shadow page may contain a mix of leaf and nonleaf sptes. - -A nonleaf spte allows the hardware mmu to reach the leaf pages and -is not related to a translation directly. It points to other shadow pages. - -A leaf spte corresponds to either one or two translations encoded into -one paging structure entry. These are always the lowest level of the -translation stack, with optional higher level translations left to NPT/EPT. -Leaf ptes point at guest pages. - -The following table shows translations encoded by leaf ptes, with higher-level -translations in parentheses: - - Non-nested guests: - nonpaging: gpa->hpa - paging: gva->gpa->hpa - paging, tdp: (gva->)gpa->hpa - Nested guests: - non-tdp: ngva->gpa->hpa (*) - tdp: (ngva->)ngpa->gpa->hpa - -(*) the guest hypervisor will encode the ngva->gpa translation into its page - tables if npt is not present - -Shadow pages contain the following information: - role.level: - The level in the shadow paging hierarchy that this shadow page belongs to. - 1=4k sptes, 2=2M sptes, 3=1G sptes, etc. - role.direct: - If set, leaf sptes reachable from this page are for a linear range. - Examples include real mode translation, large guest pages backed by small - host pages, and gpa->hpa translations when NPT or EPT is active. - The linear range starts at (gfn << PAGE_SHIFT) and its size is determined - by role.level (2MB for first level, 1GB for second level, 0.5TB for third - level, 256TB for fourth level) - If clear, this page corresponds to a guest page table denoted by the gfn - field. - role.quadrant: - When role.cr4_pae=0, the guest uses 32-bit gptes while the host uses 64-bit - sptes. That means a guest page table contains more ptes than the host, - so multiple shadow pages are needed to shadow one guest page. - For first-level shadow pages, role.quadrant can be 0 or 1 and denotes the - first or second 512-gpte block in the guest page table. For second-level - page tables, each 32-bit gpte is converted to two 64-bit sptes - (since each first-level guest page is shadowed by two first-level - shadow pages) so role.quadrant takes values in the range 0..3. Each - quadrant maps 1GB virtual address space. - role.access: - Inherited guest access permissions in the form uwx. Note execute - permission is positive, not negative. - role.invalid: - The page is invalid and should not be used. It is a root page that is - currently pinned (by a cpu hardware register pointing to it); once it is - unpinned it will be destroyed. - role.cr4_pae: - Contains the value of cr4.pae for which the page is valid (e.g. whether - 32-bit or 64-bit gptes are in use). - role.nxe: - Contains the value of efer.nxe for which the page is valid. - role.cr0_wp: - Contains the value of cr0.wp for which the page is valid. - gfn: - Either the guest page table containing the translations shadowed by this - page, or the base page frame for linear translations. See role.direct. - spt: - A pageful of 64-bit sptes containing the translations for this page. - Accessed by both kvm and hardware. - The page pointed to by spt will have its page->private pointing back - at the shadow page structure. - sptes in spt point either at guest pages, or at lower-level shadow pages. - Specifically, if sp1 and sp2 are shadow pages, then sp1->spt[n] may point - at __pa(sp2->spt). sp2 will point back at sp1 through parent_pte. - The spt array forms a DAG structure with the shadow page as a node, and - guest pages as leaves. - gfns: - An array of 512 guest frame numbers, one for each present pte. Used to - perform a reverse map from a pte to a gfn. When role.direct is set, any - element of this array can be calculated from the gfn field when used, in - this case, the array of gfns is not allocated. See role.direct and gfn. - slot_bitmap: - A bitmap containing one bit per memory slot. If the page contains a pte - mapping a page from memory slot n, then bit n of slot_bitmap will be set - (if a page is aliased among several slots, then it is not guaranteed that - all slots will be marked). - Used during dirty logging to avoid scanning a shadow page if none if its - pages need tracking. - root_count: - A counter keeping track of how many hardware registers (guest cr3 or - pdptrs) are now pointing at the page. While this counter is nonzero, the - page cannot be destroyed. See role.invalid. - multimapped: - Whether there exist multiple sptes pointing at this page. - parent_pte/parent_ptes: - If multimapped is zero, parent_pte points at the single spte that points at - this page's spt. Otherwise, parent_ptes points at a data structure - with a list of parent_ptes. - unsync: - If true, then the translations in this page may not match the guest's - translation. This is equivalent to the state of the tlb when a pte is - changed but before the tlb entry is flushed. Accordingly, unsync ptes - are synchronized when the guest executes invlpg or flushes its tlb by - other means. Valid for leaf pages. - unsync_children: - How many sptes in the page point at pages that are unsync (or have - unsynchronized children). - unsync_child_bitmap: - A bitmap indicating which sptes in spt point (directly or indirectly) at - pages that may be unsynchronized. Used to quickly locate all unsychronized - pages reachable from a given page. - -Reverse map -=========== - -The mmu maintains a reverse mapping whereby all ptes mapping a page can be -reached given its gfn. This is used, for example, when swapping out a page. - -Synchronized and unsynchronized pages -===================================== - -The guest uses two events to synchronize its tlb and page tables: tlb flushes -and page invalidations (invlpg). - -A tlb flush means that we need to synchronize all sptes reachable from the -guest's cr3. This is expensive, so we keep all guest page tables write -protected, and synchronize sptes to gptes when a gpte is written. - -A special case is when a guest page table is reachable from the current -guest cr3. In this case, the guest is obliged to issue an invlpg instruction -before using the translation. We take advantage of that by removing write -protection from the guest page, and allowing the guest to modify it freely. -We synchronize modified gptes when the guest invokes invlpg. This reduces -the amount of emulation we have to do when the guest modifies multiple gptes, -or when the a guest page is no longer used as a page table and is used for -random guest data. - -As a side effect we have to resynchronize all reachable unsynchronized shadow -pages on a tlb flush. - - -Reaction to events -================== - -- guest page fault (or npt page fault, or ept violation) - -This is the most complicated event. The cause of a page fault can be: - - - a true guest fault (the guest translation won't allow the access) (*) - - access to a missing translation - - access to a protected translation - - when logging dirty pages, memory is write protected - - synchronized shadow pages are write protected (*) - - access to untranslatable memory (mmio) - - (*) not applicable in direct mode - -Handling a page fault is performed as follows: - - - if needed, walk the guest page tables to determine the guest translation - (gva->gpa or ngpa->gpa) - - if permissions are insufficient, reflect the fault back to the guest - - determine the host page - - if this is an mmio request, there is no host page; call the emulator - to emulate the instruction instead - - walk the shadow page table to find the spte for the translation, - instantiating missing intermediate page tables as necessary - - try to unsynchronize the page - - if successful, we can let the guest continue and modify the gpte - - emulate the instruction - - if failed, unshadow the page and let the guest continue - - update any translations that were modified by the instruction - -invlpg handling: - - - walk the shadow page hierarchy and drop affected translations - - try to reinstantiate the indicated translation in the hope that the - guest will use it in the near future - -Guest control register updates: - -- mov to cr3 - - look up new shadow roots - - synchronize newly reachable shadow pages - -- mov to cr0/cr4/efer - - set up mmu context for new paging mode - - look up new shadow roots - - synchronize newly reachable shadow pages - -Host translation updates: - - - mmu notifier called with updated hva - - look up affected sptes through reverse map - - drop (or update) translations - -Emulating cr0.wp -================ - -If tdp is not enabled, the host must keep cr0.wp=1 so page write protection -works for the guest kernel, not guest guest userspace. When the guest -cr0.wp=1, this does not present a problem. However when the guest cr0.wp=0, -we cannot map the permissions for gpte.u=1, gpte.w=0 to any spte (the -semantics require allowing any guest kernel access plus user read access). - -We handle this by mapping the permissions to two possible sptes, depending -on fault type: - -- kernel write fault: spte.u=0, spte.w=1 (allows full kernel access, - disallows user access) -- read fault: spte.u=1, spte.w=0 (allows full read access, disallows kernel - write access) - -(user write faults generate a #PF) - -Large pages -=========== - -The mmu supports all combinations of large and small guest and host pages. -Supported page sizes include 4k, 2M, 4M, and 1G. 4M pages are treated as -two separate 2M pages, on both guest and host, since the mmu always uses PAE -paging. - -To instantiate a large spte, four constraints must be satisfied: - -- the spte must point to a large host page -- the guest pte must be a large pte of at least equivalent size (if tdp is - enabled, there is no guest pte and this condition is satisified) -- if the spte will be writeable, the large page frame may not overlap any - write-protected pages -- the guest page must be wholly contained by a single memory slot - -To check the last two conditions, the mmu maintains a ->write_count set of -arrays for each memory slot and large page size. Every write protected page -causes its write_count to be incremented, thus preventing instantiation of -a large spte. The frames at the end of an unaligned memory slot have -artificically inflated ->write_counts so they can never be instantiated. - -Further reading -=============== - -- NPT presentation from KVM Forum 2008 - http://www.linux-kvm.org/wiki/images/c/c8/KvmForum2008%24kdf2008_21.pdf - diff --git a/Documentation/kvm/msr.txt b/Documentation/kvm/msr.txt deleted file mode 100644 index d079aed..0000000 --- a/Documentation/kvm/msr.txt +++ /dev/null @@ -1,187 +0,0 @@ -KVM-specific MSRs. -Glauber Costa , Red Hat Inc, 2010 -===================================================== - -KVM makes use of some custom MSRs to service some requests. - -Custom MSRs have a range reserved for them, that goes from -0x4b564d00 to 0x4b564dff. There are MSRs outside this area, -but they are deprecated and their use is discouraged. - -Custom MSR list --------- - -The current supported Custom MSR list is: - -MSR_KVM_WALL_CLOCK_NEW: 0x4b564d00 - - data: 4-byte alignment physical address of a memory area which must be - in guest RAM. This memory is expected to hold a copy of the following - structure: - - struct pvclock_wall_clock { - u32 version; - u32 sec; - u32 nsec; - } __attribute__((__packed__)); - - whose data will be filled in by the hypervisor. The hypervisor is only - guaranteed to update this data at the moment of MSR write. - Users that want to reliably query this information more than once have - to write more than once to this MSR. Fields have the following meanings: - - version: guest has to check version before and after grabbing - time information and check that they are both equal and even. - An odd version indicates an in-progress update. - - sec: number of seconds for wallclock. - - nsec: number of nanoseconds for wallclock. - - Note that although MSRs are per-CPU entities, the effect of this - particular MSR is global. - - Availability of this MSR must be checked via bit 3 in 0x4000001 cpuid - leaf prior to usage. - -MSR_KVM_SYSTEM_TIME_NEW: 0x4b564d01 - - data: 4-byte aligned physical address of a memory area which must be in - guest RAM, plus an enable bit in bit 0. This memory is expected to hold - a copy of the following structure: - - struct pvclock_vcpu_time_info { - u32 version; - u32 pad0; - u64 tsc_timestamp; - u64 system_time; - u32 tsc_to_system_mul; - s8 tsc_shift; - u8 flags; - u8 pad[2]; - } __attribute__((__packed__)); /* 32 bytes */ - - whose data will be filled in by the hypervisor periodically. Only one - write, or registration, is needed for each VCPU. The interval between - updates of this structure is arbitrary and implementation-dependent. - The hypervisor may update this structure at any time it sees fit until - anything with bit0 == 0 is written to it. - - Fields have the following meanings: - - version: guest has to check version before and after grabbing - time information and check that they are both equal and even. - An odd version indicates an in-progress update. - - tsc_timestamp: the tsc value at the current VCPU at the time - of the update of this structure. Guests can subtract this value - from current tsc to derive a notion of elapsed time since the - structure update. - - system_time: a host notion of monotonic time, including sleep - time at the time this structure was last updated. Unit is - nanoseconds. - - tsc_to_system_mul: a function of the tsc frequency. One has - to multiply any tsc-related quantity by this value to get - a value in nanoseconds, besides dividing by 2^tsc_shift - - tsc_shift: cycle to nanosecond divider, as a power of two, to - allow for shift rights. One has to shift right any tsc-related - quantity by this value to get a value in nanoseconds, besides - multiplying by tsc_to_system_mul. - - With this information, guests can derive per-CPU time by - doing: - - time = (current_tsc - tsc_timestamp) - time = (time * tsc_to_system_mul) >> tsc_shift - time = time + system_time - - flags: bits in this field indicate extended capabilities - coordinated between the guest and the hypervisor. Availability - of specific flags has to be checked in 0x40000001 cpuid leaf. - Current flags are: - - flag bit | cpuid bit | meaning - ------------------------------------------------------------- - | | time measures taken across - 0 | 24 | multiple cpus are guaranteed to - | | be monotonic - ------------------------------------------------------------- - - Availability of this MSR must be checked via bit 3 in 0x4000001 cpuid - leaf prior to usage. - - -MSR_KVM_WALL_CLOCK: 0x11 - - data and functioning: same as MSR_KVM_WALL_CLOCK_NEW. Use that instead. - - This MSR falls outside the reserved KVM range and may be removed in the - future. Its usage is deprecated. - - Availability of this MSR must be checked via bit 0 in 0x4000001 cpuid - leaf prior to usage. - -MSR_KVM_SYSTEM_TIME: 0x12 - - data and functioning: same as MSR_KVM_SYSTEM_TIME_NEW. Use that instead. - - This MSR falls outside the reserved KVM range and may be removed in the - future. Its usage is deprecated. - - Availability of this MSR must be checked via bit 0 in 0x4000001 cpuid - leaf prior to usage. - - The suggested algorithm for detecting kvmclock presence is then: - - if (!kvm_para_available()) /* refer to cpuid.txt */ - return NON_PRESENT; - - flags = cpuid_eax(0x40000001); - if (flags & 3) { - msr_kvm_system_time = MSR_KVM_SYSTEM_TIME_NEW; - msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK_NEW; - return PRESENT; - } else if (flags & 0) { - msr_kvm_system_time = MSR_KVM_SYSTEM_TIME; - msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK; - return PRESENT; - } else - return NON_PRESENT; - -MSR_KVM_ASYNC_PF_EN: 0x4b564d02 - data: Bits 63-6 hold 64-byte aligned physical address of a - 64 byte memory area which must be in guest RAM and must be - zeroed. Bits 5-2 are reserved and should be zero. Bit 0 is 1 - when asynchronous page faults are enabled on the vcpu 0 when - disabled. Bit 2 is 1 if asynchronous page faults can be injected - when vcpu is in cpl == 0. - - First 4 byte of 64 byte memory location will be written to by - the hypervisor at the time of asynchronous page fault (APF) - injection to indicate type of asynchronous page fault. Value - of 1 means that the page referred to by the page fault is not - present. Value 2 means that the page is now available. Disabling - interrupt inhibits APFs. Guest must not enable interrupt - before the reason is read, or it may be overwritten by another - APF. Since APF uses the same exception vector as regular page - fault guest must reset the reason to 0 before it does - something that can generate normal page fault. If during page - fault APF reason is 0 it means that this is regular page - fault. - - During delivery of type 1 APF cr2 contains a token that will - be used to notify a guest when missing page becomes - available. When page becomes available type 2 APF is sent with - cr2 set to the token associated with the page. There is special - kind of token 0xffffffff which tells vcpu that it should wake - up all processes waiting for APFs and no individual type 2 APFs - will be sent. - - If APF is disabled while there are outstanding APFs, they will - not be delivered. - - Currently type 2 APF will be always delivered on the same vcpu as - type 1 was, but guest should not rely on that. diff --git a/Documentation/kvm/ppc-pv.txt b/Documentation/kvm/ppc-pv.txt deleted file mode 100644 index 3ab969c..0000000 --- a/Documentation/kvm/ppc-pv.txt +++ /dev/null @@ -1,196 +0,0 @@ -The PPC KVM paravirtual interface -================================= - -The basic execution principle by which KVM on PowerPC works is to run all kernel -space code in PR=1 which is user space. This way we trap all privileged -instructions and can emulate them accordingly. - -Unfortunately that is also the downfall. There are quite some privileged -instructions that needlessly return us to the hypervisor even though they -could be handled differently. - -This is what the PPC PV interface helps with. It takes privileged instructions -and transforms them into unprivileged ones with some help from the hypervisor. -This cuts down virtualization costs by about 50% on some of my benchmarks. - -The code for that interface can be found in arch/powerpc/kernel/kvm* - -Querying for existence -====================== - -To find out if we're running on KVM or not, we leverage the device tree. When -Linux is running on KVM, a node /hypervisor exists. That node contains a -compatible property with the value "linux,kvm". - -Once you determined you're running under a PV capable KVM, you can now use -hypercalls as described below. - -KVM hypercalls -============== - -Inside the device tree's /hypervisor node there's a property called -'hypercall-instructions'. This property contains at most 4 opcodes that make -up the hypercall. To call a hypercall, just call these instructions. - -The parameters are as follows: - - Register IN OUT - - r0 - volatile - r3 1st parameter Return code - r4 2nd parameter 1st output value - r5 3rd parameter 2nd output value - r6 4th parameter 3rd output value - r7 5th parameter 4th output value - r8 6th parameter 5th output value - r9 7th parameter 6th output value - r10 8th parameter 7th output value - r11 hypercall number 8th output value - r12 - volatile - -Hypercall definitions are shared in generic code, so the same hypercall numbers -apply for x86 and powerpc alike with the exception that each KVM hypercall -also needs to be ORed with the KVM vendor code which is (42 << 16). - -Return codes can be as follows: - - Code Meaning - - 0 Success - 12 Hypercall not implemented - <0 Error - -The magic page -============== - -To enable communication between the hypervisor and guest there is a new shared -page that contains parts of supervisor visible register state. The guest can -map this shared page using the KVM hypercall KVM_HC_PPC_MAP_MAGIC_PAGE. - -With this hypercall issued the guest always gets the magic page mapped at the -desired location in effective and physical address space. For now, we always -map the page to -4096. This way we can access it using absolute load and store -functions. The following instruction reads the first field of the magic page: - - ld rX, -4096(0) - -The interface is designed to be extensible should there be need later to add -additional registers to the magic page. If you add fields to the magic page, -also define a new hypercall feature to indicate that the host can give you more -registers. Only if the host supports the additional features, make use of them. - -The magic page has the following layout as described in -arch/powerpc/include/asm/kvm_para.h: - -struct kvm_vcpu_arch_shared { - __u64 scratch1; - __u64 scratch2; - __u64 scratch3; - __u64 critical; /* Guest may not get interrupts if == r1 */ - __u64 sprg0; - __u64 sprg1; - __u64 sprg2; - __u64 sprg3; - __u64 srr0; - __u64 srr1; - __u64 dar; - __u64 msr; - __u32 dsisr; - __u32 int_pending; /* Tells the guest if we have an interrupt */ -}; - -Additions to the page must only occur at the end. Struct fields are always 32 -or 64 bit aligned, depending on them being 32 or 64 bit wide respectively. - -Magic page features -=================== - -When mapping the magic page using the KVM hypercall KVM_HC_PPC_MAP_MAGIC_PAGE, -a second return value is passed to the guest. This second return value contains -a bitmap of available features inside the magic page. - -The following enhancements to the magic page are currently available: - - KVM_MAGIC_FEAT_SR Maps SR registers r/w in the magic page - -For enhanced features in the magic page, please check for the existence of the -feature before using them! - -MSR bits -======== - -The MSR contains bits that require hypervisor intervention and bits that do -not require direct hypervisor intervention because they only get interpreted -when entering the guest or don't have any impact on the hypervisor's behavior. - -The following bits are safe to be set inside the guest: - - MSR_EE - MSR_RI - MSR_CR - MSR_ME - -If any other bit changes in the MSR, please still use mtmsr(d). - -Patched instructions -==================== - -The "ld" and "std" instructions are transormed to "lwz" and "stw" instructions -respectively on 32 bit systems with an added offset of 4 to accommodate for big -endianness. - -The following is a list of mapping the Linux kernel performs when running as -guest. Implementing any of those mappings is optional, as the instruction traps -also act on the shared page. So calling privileged instructions still works as -before. - -From To -==== == - -mfmsr rX ld rX, magic_page->msr -mfsprg rX, 0 ld rX, magic_page->sprg0 -mfsprg rX, 1 ld rX, magic_page->sprg1 -mfsprg rX, 2 ld rX, magic_page->sprg2 -mfsprg rX, 3 ld rX, magic_page->sprg3 -mfsrr0 rX ld rX, magic_page->srr0 -mfsrr1 rX ld rX, magic_page->srr1 -mfdar rX ld rX, magic_page->dar -mfdsisr rX lwz rX, magic_page->dsisr - -mtmsr rX std rX, magic_page->msr -mtsprg 0, rX std rX, magic_page->sprg0 -mtsprg 1, rX std rX, magic_page->sprg1 -mtsprg 2, rX std rX, magic_page->sprg2 -mtsprg 3, rX std rX, magic_page->sprg3 -mtsrr0 rX std rX, magic_page->srr0 -mtsrr1 rX std rX, magic_page->srr1 -mtdar rX std rX, magic_page->dar -mtdsisr rX stw rX, magic_page->dsisr - -tlbsync nop - -mtmsrd rX, 0 b -mtmsr rX b - -mtmsrd rX, 1 b - -[Book3S only] -mtsrin rX, rY b - -[BookE only] -wrteei [0|1] b - - -Some instructions require more logic to determine what's going on than a load -or store instruction can deliver. To enable patching of those, we keep some -RAM around where we can live translate instructions to. What happens is the -following: - - 1) copy emulation code to memory - 2) patch that code to fit the emulated instruction - 3) patch that code to return to the original pc + 4 - 4) patch the original instruction to branch to the new code - -That way we can inject an arbitrary amount of code as replacement for a single -instruction. This allows us to check for pending interrupts when setting EE=1 -for example. diff --git a/Documentation/kvm/review-checklist.txt b/Documentation/kvm/review-checklist.txt deleted file mode 100644 index 730475a..0000000 --- a/Documentation/kvm/review-checklist.txt +++ /dev/null @@ -1,38 +0,0 @@ -Review checklist for kvm patches -================================ - -1. The patch must follow Documentation/CodingStyle and - Documentation/SubmittingPatches. - -2. Patches should be against kvm.git master branch. - -3. If the patch introduces or modifies a new userspace API: - - the API must be documented in Documentation/kvm/api.txt - - the API must be discoverable using KVM_CHECK_EXTENSION - -4. New state must include support for save/restore. - -5. New features must default to off (userspace should explicitly request them). - Performance improvements can and should default to on. - -6. New cpu features should be exposed via KVM_GET_SUPPORTED_CPUID2 - -7. Emulator changes should be accompanied by unit tests for qemu-kvm.git - kvm/test directory. - -8. Changes should be vendor neutral when possible. Changes to common code - are better than duplicating changes to vendor code. - -9. Similarly, prefer changes to arch independent code than to arch dependent - code. - -10. User/kernel interfaces and guest/host interfaces must be 64-bit clean - (all variables and sizes naturally aligned on 64-bit; use specific types - only - u64 rather than ulong). - -11. New guest visible features must either be documented in a hardware manual - or be accompanied by documentation. - -12. Features must be robust against reset and kexec - for example, shared - host/guest memory must be unshared to prevent the host from writing to - guest memory that the guest has not reserved for this purpose. diff --git a/Documentation/kvm/timekeeping.txt b/Documentation/kvm/timekeeping.txt deleted file mode 100644 index df894637..0000000 --- a/Documentation/kvm/timekeeping.txt +++ /dev/null @@ -1,612 +0,0 @@ - - Timekeeping Virtualization for X86-Based Architectures - - Zachary Amsden - Copyright (c) 2010, Red Hat. All rights reserved. - -1) Overview -2) Timing Devices -3) TSC Hardware -4) Virtualization Problems - -========================================================================= - -1) Overview - -One of the most complicated parts of the X86 platform, and specifically, -the virtualization of this platform is the plethora of timing devices available -and the complexity of emulating those devices. In addition, virtualization of -time introduces a new set of challenges because it introduces a multiplexed -division of time beyond the control of the guest CPU. - -First, we will describe the various timekeeping hardware available, then -present some of the problems which arise and solutions available, giving -specific recommendations for certain classes of KVM guests. - -The purpose of this document is to collect data and information relevant to -timekeeping which may be difficult to find elsewhere, specifically, -information relevant to KVM and hardware-based virtualization. - -========================================================================= - -2) Timing Devices - -First we discuss the basic hardware devices available. TSC and the related -KVM clock are special enough to warrant a full exposition and are described in -the following section. - -2.1) i8254 - PIT - -One of the first timer devices available is the programmable interrupt timer, -or PIT. The PIT has a fixed frequency 1.193182 MHz base clock and three -channels which can be programmed to deliver periodic or one-shot interrupts. -These three channels can be configured in different modes and have individual -counters. Channel 1 and 2 were not available for general use in the original -IBM PC, and historically were connected to control RAM refresh and the PC -speaker. Now the PIT is typically integrated as part of an emulated chipset -and a separate physical PIT is not used. - -The PIT uses I/O ports 0x40 - 0x43. Access to the 16-bit counters is done -using single or multiple byte access to the I/O ports. There are 6 modes -available, but not all modes are available to all timers, as only timer 2 -has a connected gate input, required for modes 1 and 5. The gate line is -controlled by port 61h, bit 0, as illustrated in the following diagram. - - -------------- ---------------- -| | | | -| 1.1932 MHz |---------->| CLOCK OUT | ---------> IRQ 0 -| Clock | | | | - -------------- | +->| GATE TIMER 0 | - | ---------------- - | - | ---------------- - | | | - |------>| CLOCK OUT | ---------> 66.3 KHZ DRAM - | | | (aka /dev/null) - | +->| GATE TIMER 1 | - | ---------------- - | - | ---------------- - | | | - |------>| CLOCK OUT | ---------> Port 61h, bit 5 - | | | -Port 61h, bit 0 ---------->| GATE TIMER 2 | \_.---- ____ - ---------------- _| )--|LPF|---Speaker - / *---- \___/ -Port 61h, bit 1 -----------------------------------/ - -The timer modes are now described. - -Mode 0: Single Timeout. This is a one-shot software timeout that counts down - when the gate is high (always true for timers 0 and 1). When the count - reaches zero, the output goes high. - -Mode 1: Triggered One-shot. The output is initially set high. When the gate - line is set high, a countdown is initiated (which does not stop if the gate is - lowered), during which the output is set low. When the count reaches zero, - the output goes high. - -Mode 2: Rate Generator. The output is initially set high. When the countdown - reaches 1, the output goes low for one count and then returns high. The value - is reloaded and the countdown automatically resumes. If the gate line goes - low, the count is halted. If the output is low when the gate is lowered, the - output automatically goes high (this only affects timer 2). - -Mode 3: Square Wave. This generates a high / low square wave. The count - determines the length of the pulse, which alternates between high and low - when zero is reached. The count only proceeds when gate is high and is - automatically reloaded on reaching zero. The count is decremented twice at - each clock to generate a full high / low cycle at the full periodic rate. - If the count is even, the clock remains high for N/2 counts and low for N/2 - counts; if the clock is odd, the clock is high for (N+1)/2 counts and low - for (N-1)/2 counts. Only even values are latched by the counter, so odd - values are not observed when reading. This is the intended mode for timer 2, - which generates sine-like tones by low-pass filtering the square wave output. - -Mode 4: Software Strobe. After programming this mode and loading the counter, - the output remains high until the counter reaches zero. Then the output - goes low for 1 clock cycle and returns high. The counter is not reloaded. - Counting only occurs when gate is high. - -Mode 5: Hardware Strobe. After programming and loading the counter, the - output remains high. When the gate is raised, a countdown is initiated - (which does not stop if the gate is lowered). When the counter reaches zero, - the output goes low for 1 clock cycle and then returns high. The counter is - not reloaded. - -In addition to normal binary counting, the PIT supports BCD counting. The -command port, 0x43 is used to set the counter and mode for each of the three -timers. - -PIT commands, issued to port 0x43, using the following bit encoding: - -Bit 7-4: Command (See table below) -Bit 3-1: Mode (000 = Mode 0, 101 = Mode 5, 11X = undefined) -Bit 0 : Binary (0) / BCD (1) - -Command table: - -0000 - Latch Timer 0 count for port 0x40 - sample and hold the count to be read in port 0x40; - additional commands ignored until counter is read; - mode bits ignored. - -0001 - Set Timer 0 LSB mode for port 0x40 - set timer to read LSB only and force MSB to zero; - mode bits set timer mode - -0010 - Set Timer 0 MSB mode for port 0x40 - set timer to read MSB only and force LSB to zero; - mode bits set timer mode - -0011 - Set Timer 0 16-bit mode for port 0x40 - set timer to read / write LSB first, then MSB; - mode bits set timer mode - -0100 - Latch Timer 1 count for port 0x41 - as described above -0101 - Set Timer 1 LSB mode for port 0x41 - as described above -0110 - Set Timer 1 MSB mode for port 0x41 - as described above -0111 - Set Timer 1 16-bit mode for port 0x41 - as described above - -1000 - Latch Timer 2 count for port 0x42 - as described above -1001 - Set Timer 2 LSB mode for port 0x42 - as described above -1010 - Set Timer 2 MSB mode for port 0x42 - as described above -1011 - Set Timer 2 16-bit mode for port 0x42 as described above - -1101 - General counter latch - Latch combination of counters into corresponding ports - Bit 3 = Counter 2 - Bit 2 = Counter 1 - Bit 1 = Counter 0 - Bit 0 = Unused - -1110 - Latch timer status - Latch combination of counter mode into corresponding ports - Bit 3 = Counter 2 - Bit 2 = Counter 1 - Bit 1 = Counter 0 - - The output of ports 0x40-0x42 following this command will be: - - Bit 7 = Output pin - Bit 6 = Count loaded (0 if timer has expired) - Bit 5-4 = Read / Write mode - 01 = MSB only - 10 = LSB only - 11 = LSB / MSB (16-bit) - Bit 3-1 = Mode - Bit 0 = Binary (0) / BCD mode (1) - -2.2) RTC - -The second device which was available in the original PC was the MC146818 real -time clock. The original device is now obsolete, and usually emulated by the -system chipset, sometimes by an HPET and some frankenstein IRQ routing. - -The RTC is accessed through CMOS variables, which uses an index register to -control which bytes are read. Since there is only one index register, read -of the CMOS and read of the RTC require lock protection (in addition, it is -dangerous to allow userspace utilities such as hwclock to have direct RTC -access, as they could corrupt kernel reads and writes of CMOS memory). - -The RTC generates an interrupt which is usually routed to IRQ 8. The interrupt -can function as a periodic timer, an additional once a day alarm, and can issue -interrupts after an update of the CMOS registers by the MC146818 is complete. -The type of interrupt is signalled in the RTC status registers. - -The RTC will update the current time fields by battery power even while the -system is off. The current time fields should not be read while an update is -in progress, as indicated in the status register. - -The clock uses a 32.768kHz crystal, so bits 6-4 of register A should be -programmed to a 32kHz divider if the RTC is to count seconds. - -This is the RAM map originally used for the RTC/CMOS: - -Location Size Description ------------------------------------------- -00h byte Current second (BCD) -01h byte Seconds alarm (BCD) -02h byte Current minute (BCD) -03h byte Minutes alarm (BCD) -04h byte Current hour (BCD) -05h byte Hours alarm (BCD) -06h byte Current day of week (BCD) -07h byte Current day of month (BCD) -08h byte Current month (BCD) -09h byte Current year (BCD) -0Ah byte Register A - bit 7 = Update in progress - bit 6-4 = Divider for clock - 000 = 4.194 MHz - 001 = 1.049 MHz - 010 = 32 kHz - 10X = test modes - 110 = reset / disable - 111 = reset / disable - bit 3-0 = Rate selection for periodic interrupt - 000 = periodic timer disabled - 001 = 3.90625 uS - 010 = 7.8125 uS - 011 = .122070 mS - 100 = .244141 mS - ... - 1101 = 125 mS - 1110 = 250 mS - 1111 = 500 mS -0Bh byte Register B - bit 7 = Run (0) / Halt (1) - bit 6 = Periodic interrupt enable - bit 5 = Alarm interrupt enable - bit 4 = Update-ended interrupt enable - bit 3 = Square wave interrupt enable - bit 2 = BCD calendar (0) / Binary (1) - bit 1 = 12-hour mode (0) / 24-hour mode (1) - bit 0 = 0 (DST off) / 1 (DST enabled) -OCh byte Register C (read only) - bit 7 = interrupt request flag (IRQF) - bit 6 = periodic interrupt flag (PF) - bit 5 = alarm interrupt flag (AF) - bit 4 = update interrupt flag (UF) - bit 3-0 = reserved -ODh byte Register D (read only) - bit 7 = RTC has power - bit 6-0 = reserved -32h byte Current century BCD (*) - (*) location vendor specific and now determined from ACPI global tables - -2.3) APIC - -On Pentium and later processors, an on-board timer is available to each CPU -as part of the Advanced Programmable Interrupt Controller. The APIC is -accessed through memory-mapped registers and provides interrupt service to each -CPU, used for IPIs and local timer interrupts. - -Although in theory the APIC is a safe and stable source for local interrupts, -in practice, many bugs and glitches have occurred due to the special nature of -the APIC CPU-local memory-mapped hardware. Beware that CPU errata may affect -the use of the APIC and that workarounds may be required. In addition, some of -these workarounds pose unique constraints for virtualization - requiring either -extra overhead incurred from extra reads of memory-mapped I/O or additional -functionality that may be more computationally expensive to implement. - -Since the APIC is documented quite well in the Intel and AMD manuals, we will -avoid repetition of the detail here. It should be pointed out that the APIC -timer is programmed through the LVT (local vector timer) register, is capable -of one-shot or periodic operation, and is based on the bus clock divided down -by the programmable divider register. - -2.4) HPET - -HPET is quite complex, and was originally intended to replace the PIT / RTC -support of the X86 PC. It remains to be seen whether that will be the case, as -the de facto standard of PC hardware is to emulate these older devices. Some -systems designated as legacy free may support only the HPET as a hardware timer -device. - -The HPET spec is rather loose and vague, requiring at least 3 hardware timers, -but allowing implementation freedom to support many more. It also imposes no -fixed rate on the timer frequency, but does impose some extremal values on -frequency, error and slew. - -In general, the HPET is recommended as a high precision (compared to PIT /RTC) -time source which is independent of local variation (as there is only one HPET -in any given system). The HPET is also memory-mapped, and its presence is -indicated through ACPI tables by the BIOS. - -Detailed specification of the HPET is beyond the current scope of this -document, as it is also very well documented elsewhere. - -2.5) Offboard Timers - -Several cards, both proprietary (watchdog boards) and commonplace (e1000) have -timing chips built into the cards which may have registers which are accessible -to kernel or user drivers. To the author's knowledge, using these to generate -a clocksource for a Linux or other kernel has not yet been attempted and is in -general frowned upon as not playing by the agreed rules of the game. Such a -timer device would require additional support to be virtualized properly and is -not considered important at this time as no known operating system does this. - -========================================================================= - -3) TSC Hardware - -The TSC or time stamp counter is relatively simple in theory; it counts -instruction cycles issued by the processor, which can be used as a measure of -time. In practice, due to a number of problems, it is the most complicated -timekeeping device to use. - -The TSC is represented internally as a 64-bit MSR which can be read with the -RDMSR, RDTSC, or RDTSCP (when available) instructions. In the past, hardware -limitations made it possible to write the TSC, but generally on old hardware it -was only possible to write the low 32-bits of the 64-bit counter, and the upper -32-bits of the counter were cleared. Now, however, on Intel processors family -0Fh, for models 3, 4 and 6, and family 06h, models e and f, this restriction -has been lifted and all 64-bits are writable. On AMD systems, the ability to -write the TSC MSR is not an architectural guarantee. - -The TSC is accessible from CPL-0 and conditionally, for CPL > 0 software by -means of the CR4.TSD bit, which when enabled, disables CPL > 0 TSC access. - -Some vendors have implemented an additional instruction, RDTSCP, which returns -atomically not just the TSC, but an indicator which corresponds to the -processor number. This can be used to index into an array of TSC variables to -determine offset information in SMP systems where TSCs are not synchronized. -The presence of this instruction must be determined by consulting CPUID feature -bits. - -Both VMX and SVM provide extension fields in the virtualization hardware which -allows the guest visible TSC to be offset by a constant. Newer implementations -promise to allow the TSC to additionally be scaled, but this hardware is not -yet widely available. - -3.1) TSC synchronization - -The TSC is a CPU-local clock in most implementations. This means, on SMP -platforms, the TSCs of different CPUs may start at different times depending -on when the CPUs are powered on. Generally, CPUs on the same die will share -the same clock, however, this is not always the case. - -The BIOS may attempt to resynchronize the TSCs during the poweron process and -the operating system or other system software may attempt to do this as well. -Several hardware limitations make the problem worse - if it is not possible to -write the full 64-bits of the TSC, it may be impossible to match the TSC in -newly arriving CPUs to that of the rest of the system, resulting in -unsynchronized TSCs. This may be done by BIOS or system software, but in -practice, getting a perfectly synchronized TSC will not be possible unless all -values are read from the same clock, which generally only is possible on single -socket systems or those with special hardware support. - -3.2) TSC and CPU hotplug - -As touched on already, CPUs which arrive later than the boot time of the system -may not have a TSC value that is synchronized with the rest of the system. -Either system software, BIOS, or SMM code may actually try to establish the TSC -to a value matching the rest of the system, but a perfect match is usually not -a guarantee. This can have the effect of bringing a system from a state where -TSC is synchronized back to a state where TSC synchronization flaws, however -small, may be exposed to the OS and any virtualization environment. - -3.3) TSC and multi-socket / NUMA - -Multi-socket systems, especially large multi-socket systems are likely to have -individual clocksources rather than a single, universally distributed clock. -Since these clocks are driven by different crystals, they will not have -perfectly matched frequency, and temperature and electrical variations will -cause the CPU clocks, and thus the TSCs to drift over time. Depending on the -exact clock and bus design, the drift may or may not be fixed in absolute -error, and may accumulate over time. - -In addition, very large systems may deliberately slew the clocks of individual -cores. This technique, known as spread-spectrum clocking, reduces EMI at the -clock frequency and harmonics of it, which may be required to pass FCC -standards for telecommunications and computer equipment. - -It is recommended not to trust the TSCs to remain synchronized on NUMA or -multiple socket systems for these reasons. - -3.4) TSC and C-states - -C-states, or idling states of the processor, especially C1E and deeper sleep -states may be problematic for TSC as well. The TSC may stop advancing in such -a state, resulting in a TSC which is behind that of other CPUs when execution -is resumed. Such CPUs must be detected and flagged by the operating system -based on CPU and chipset identifications. - -The TSC in such a case may be corrected by catching it up to a known external -clocksource. - -3.5) TSC frequency change / P-states - -To make things slightly more interesting, some CPUs may change frequency. They -may or may not run the TSC at the same rate, and because the frequency change -may be staggered or slewed, at some points in time, the TSC rate may not be -known other than falling within a range of values. In this case, the TSC will -not be a stable time source, and must be calibrated against a known, stable, -external clock to be a usable source of time. - -Whether the TSC runs at a constant rate or scales with the P-state is model -dependent and must be determined by inspecting CPUID, chipset or vendor -specific MSR fields. - -In addition, some vendors have known bugs where the P-state is actually -compensated for properly during normal operation, but when the processor is -inactive, the P-state may be raised temporarily to service cache misses from -other processors. In such cases, the TSC on halted CPUs could advance faster -than that of non-halted processors. AMD Turion processors are known to have -this problem. - -3.6) TSC and STPCLK / T-states - -External signals given to the processor may also have the effect of stopping -the TSC. This is typically done for thermal emergency power control to prevent -an overheating condition, and typically, there is no way to detect that this -condition has happened. - -3.7) TSC virtualization - VMX - -VMX provides conditional trapping of RDTSC, RDMSR, WRMSR and RDTSCP -instructions, which is enough for full virtualization of TSC in any manner. In -addition, VMX allows passing through the host TSC plus an additional TSC_OFFSET -field specified in the VMCS. Special instructions must be used to read and -write the VMCS field. - -3.8) TSC virtualization - SVM - -SVM provides conditional trapping of RDTSC, RDMSR, WRMSR and RDTSCP -instructions, which is enough for full virtualization of TSC in any manner. In -addition, SVM allows passing through the host TSC plus an additional offset -field specified in the SVM control block. - -3.9) TSC feature bits in Linux - -In summary, there is no way to guarantee the TSC remains in perfect -synchronization unless it is explicitly guaranteed by the architecture. Even -if so, the TSCs in multi-sockets or NUMA systems may still run independently -despite being locally consistent. - -The following feature bits are used by Linux to signal various TSC attributes, -but they can only be taken to be meaningful for UP or single node systems. - -X86_FEATURE_TSC : The TSC is available in hardware -X86_FEATURE_RDTSCP : The RDTSCP instruction is available -X86_FEATURE_CONSTANT_TSC : The TSC rate is unchanged with P-states -X86_FEATURE_NONSTOP_TSC : The TSC does not stop in C-states -X86_FEATURE_TSC_RELIABLE : TSC sync checks are skipped (VMware) - -4) Virtualization Problems - -Timekeeping is especially problematic for virtualization because a number of -challenges arise. The most obvious problem is that time is now shared between -the host and, potentially, a number of virtual machines. Thus the virtual -operating system does not run with 100% usage of the CPU, despite the fact that -it may very well make that assumption. It may expect it to remain true to very -exacting bounds when interrupt sources are disabled, but in reality only its -virtual interrupt sources are disabled, and the machine may still be preempted -at any time. This causes problems as the passage of real time, the injection -of machine interrupts and the associated clock sources are no longer completely -synchronized with real time. - -This same problem can occur on native harware to a degree, as SMM mode may -steal cycles from the naturally on X86 systems when SMM mode is used by the -BIOS, but not in such an extreme fashion. However, the fact that SMM mode may -cause similar problems to virtualization makes it a good justification for -solving many of these problems on bare metal. - -4.1) Interrupt clocking - -One of the most immediate problems that occurs with legacy operating systems -is that the system timekeeping routines are often designed to keep track of -time by counting periodic interrupts. These interrupts may come from the PIT -or the RTC, but the problem is the same: the host virtualization engine may not -be able to deliver the proper number of interrupts per second, and so guest -time may fall behind. This is especially problematic if a high interrupt rate -is selected, such as 1000 HZ, which is unfortunately the default for many Linux -guests. - -There are three approaches to solving this problem; first, it may be possible -to simply ignore it. Guests which have a separate time source for tracking -'wall clock' or 'real time' may not need any adjustment of their interrupts to -maintain proper time. If this is not sufficient, it may be necessary to inject -additional interrupts into the guest in order to increase the effective -interrupt rate. This approach leads to complications in extreme conditions, -where host load or guest lag is too much to compensate for, and thus another -solution to the problem has risen: the guest may need to become aware of lost -ticks and compensate for them internally. Although promising in theory, the -implementation of this policy in Linux has been extremely error prone, and a -number of buggy variants of lost tick compensation are distributed across -commonly used Linux systems. - -Windows uses periodic RTC clocking as a means of keeping time internally, and -thus requires interrupt slewing to keep proper time. It does use a low enough -rate (ed: is it 18.2 Hz?) however that it has not yet been a problem in -practice. - -4.2) TSC sampling and serialization - -As the highest precision time source available, the cycle counter of the CPU -has aroused much interest from developers. As explained above, this timer has -many problems unique to its nature as a local, potentially unstable and -potentially unsynchronized source. One issue which is not unique to the TSC, -but is highlighted because of its very precise nature is sampling delay. By -definition, the counter, once read is already old. However, it is also -possible for the counter to be read ahead of the actual use of the result. -This is a consequence of the superscalar execution of the instruction stream, -which may execute instructions out of order. Such execution is called -non-serialized. Forcing serialized execution is necessary for precise -measurement with the TSC, and requires a serializing instruction, such as CPUID -or an MSR read. - -Since CPUID may actually be virtualized by a trap and emulate mechanism, this -serialization can pose a performance issue for hardware virtualization. An -accurate time stamp counter reading may therefore not always be available, and -it may be necessary for an implementation to guard against "backwards" reads of -the TSC as seen from other CPUs, even in an otherwise perfectly synchronized -system. - -4.3) Timespec aliasing - -Additionally, this lack of serialization from the TSC poses another challenge -when using results of the TSC when measured against another time source. As -the TSC is much higher precision, many possible values of the TSC may be read -while another clock is still expressing the same value. - -That is, you may read (T,T+10) while external clock C maintains the same value. -Due to non-serialized reads, you may actually end up with a range which -fluctuates - from (T-1.. T+10). Thus, any time calculated from a TSC, but -calibrated against an external value may have a range of valid values. -Re-calibrating this computation may actually cause time, as computed after the -calibration, to go backwards, compared with time computed before the -calibration. - -This problem is particularly pronounced with an internal time source in Linux, -the kernel time, which is expressed in the theoretically high resolution -timespec - but which advances in much larger granularity intervals, sometimes -at the rate of jiffies, and possibly in catchup modes, at a much larger step. - -This aliasing requires care in the computation and recalibration of kvmclock -and any other values derived from TSC computation (such as TSC virtualization -itself). - -4.4) Migration - -Migration of a virtual machine raises problems for timekeeping in two ways. -First, the migration itself may take time, during which interrupts cannot be -delivered, and after which, the guest time may need to be caught up. NTP may -be able to help to some degree here, as the clock correction required is -typically small enough to fall in the NTP-correctable window. - -An additional concern is that timers based off the TSC (or HPET, if the raw bus -clock is exposed) may now be running at different rates, requiring compensation -in some way in the hypervisor by virtualizing these timers. In addition, -migrating to a faster machine may preclude the use of a passthrough TSC, as a -faster clock cannot be made visible to a guest without the potential of time -advancing faster than usual. A slower clock is less of a problem, as it can -always be caught up to the original rate. KVM clock avoids these problems by -simply storing multipliers and offsets against the TSC for the guest to convert -back into nanosecond resolution values. - -4.5) Scheduling - -Since scheduling may be based on precise timing and firing of interrupts, the -scheduling algorithms of an operating system may be adversely affected by -virtualization. In theory, the effect is random and should be universally -distributed, but in contrived as well as real scenarios (guest device access, -causes of virtualization exits, possible context switch), this may not always -be the case. The effect of this has not been well studied. - -In an attempt to work around this, several implementations have provided a -paravirtualized scheduler clock, which reveals the true amount of CPU time for -which a virtual machine has been running. - -4.6) Watchdogs - -Watchdog timers, such as the lock detector in Linux may fire accidentally when -running under hardware virtualization due to timer interrupts being delayed or -misinterpretation of the passage of real time. Usually, these warnings are -spurious and can be ignored, but in some circumstances it may be necessary to -disable such detection. - -4.7) Delays and precision timing - -Precise timing and delays may not be possible in a virtualized system. This -can happen if the system is controlling physical hardware, or issues delays to -compensate for slower I/O to and from devices. The first issue is not solvable -in general for a virtualized system; hardware control software can't be -adequately virtualized without a full real-time operating system, which would -require an RT aware virtualization platform. - -The second issue may cause performance problems, but this is unlikely to be a -significant issue. In many cases these delays may be eliminated through -configuration or paravirtualization. - -4.8) Covert channels and leaks - -In addition to the above problems, time information will inevitably leak to the -guest about the host in anything but a perfect implementation of virtualized -time. This may allow the guest to infer the presence of a hypervisor (as in a -red-pill type detection), and it may allow information to leak between guests -by using CPU utilization itself as a signalling channel. Preventing such -problems would require completely isolated virtual time which may not track -real time any longer. This may be useful in certain security or QA contexts, -but in general isn't recommended for real-world deployment scenarios. diff --git a/Documentation/lguest/.gitignore b/Documentation/lguest/.gitignore deleted file mode 100644 index 115587f..0000000 --- a/Documentation/lguest/.gitignore +++ /dev/null @@ -1 +0,0 @@ -lguest diff --git a/Documentation/lguest/Makefile b/Documentation/lguest/Makefile deleted file mode 100644 index bebac6b..0000000 --- a/Documentation/lguest/Makefile +++ /dev/null @@ -1,8 +0,0 @@ -# This creates the demonstration utility "lguest" which runs a Linux guest. -# Missing headers? Add "-I../../include -I../../arch/x86/include" -CFLAGS:=-m32 -Wall -Wmissing-declarations -Wmissing-prototypes -O3 -U_FORTIFY_SOURCE - -all: lguest - -clean: - rm -f lguest diff --git a/Documentation/lguest/extract b/Documentation/lguest/extract deleted file mode 100644 index 7730bb6..0000000 --- a/Documentation/lguest/extract +++ /dev/null @@ -1,58 +0,0 @@ -#! /bin/sh - -set -e - -PREFIX=$1 -shift - -trap 'rm -r $TMPDIR' 0 -TMPDIR=`mktemp -d` - -exec 3>/dev/null -for f; do - while IFS=" -" read -r LINE; do - case "$LINE" in - *$PREFIX:[0-9]*:\**) - NUM=`echo "$LINE" | sed "s/.*$PREFIX:\([0-9]*\).*/\1/"` - if [ -f $TMPDIR/$NUM ]; then - echo "$TMPDIR/$NUM already exits prior to $f" - exit 1 - fi - exec 3>>$TMPDIR/$NUM - echo $f | sed 's,\.\./,,g' > $TMPDIR/.$NUM - /bin/echo "$LINE" | sed -e "s/$PREFIX:[0-9]*//" -e "s/:\*/*/" >&3 - ;; - *$PREFIX:[0-9]*) - NUM=`echo "$LINE" | sed "s/.*$PREFIX:\([0-9]*\).*/\1/"` - if [ -f $TMPDIR/$NUM ]; then - echo "$TMPDIR/$NUM already exits prior to $f" - exit 1 - fi - exec 3>>$TMPDIR/$NUM - echo $f | sed 's,\.\./,,g' > $TMPDIR/.$NUM - /bin/echo "$LINE" | sed "s/$PREFIX:[0-9]*//" >&3 - ;; - *:\**) - /bin/echo "$LINE" | sed -e "s/:\*/*/" -e "s,/\*\*/,," >&3 - echo >&3 - exec 3>/dev/null - ;; - *) - /bin/echo "$LINE" >&3 - ;; - esac - done < $f - echo >&3 - exec 3>/dev/null -done - -LASTFILE="" -for f in $TMPDIR/*; do - if [ "$LASTFILE" != $(cat $TMPDIR/.$(basename $f) ) ]; then - LASTFILE=$(cat $TMPDIR/.$(basename $f) ) - echo "[ $LASTFILE ]" - fi - cat $f -done - diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c deleted file mode 100644 index d9da7e1..0000000 --- a/Documentation/lguest/lguest.c +++ /dev/null @@ -1,2095 +0,0 @@ -/*P:100 - * This is the Launcher code, a simple program which lays out the "physical" - * memory for the new Guest by mapping the kernel image and the virtual - * devices, then opens /dev/lguest to tell the kernel about the Guest and - * control it. -:*/ -#define _LARGEFILE64_SOURCE -#define _GNU_SOURCE -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include "../../include/linux/lguest_launcher.h" -/*L:110 - * We can ignore the 42 include files we need for this program, but I do want - * to draw attention to the use of kernel-style types. - * - * As Linus said, "C is a Spartan language, and so should your naming be." I - * like these abbreviations, so we define them here. Note that u64 is always - * unsigned long long, which works on all Linux systems: this means that we can - * use %llu in printf for any u64. - */ -typedef unsigned long long u64; -typedef uint32_t u32; -typedef uint16_t u16; -typedef uint8_t u8; -/*:*/ - -#define PAGE_PRESENT 0x7 /* Present, RW, Execute */ -#define BRIDGE_PFX "bridge:" -#ifndef SIOCBRADDIF -#define SIOCBRADDIF 0x89a2 /* add interface to bridge */ -#endif -/* We can have up to 256 pages for devices. */ -#define DEVICE_PAGES 256 -/* This will occupy 3 pages: it must be a power of 2. */ -#define VIRTQUEUE_NUM 256 - -/*L:120 - * verbose is both a global flag and a macro. The C preprocessor allows - * this, and although I wouldn't recommend it, it works quite nicely here. - */ -static bool verbose; -#define verbose(args...) \ - do { if (verbose) printf(args); } while(0) -/*:*/ - -/* The pointer to the start of guest memory. */ -static void *guest_base; -/* The maximum guest physical address allowed, and maximum possible. */ -static unsigned long guest_limit, guest_max; -/* The /dev/lguest file descriptor. */ -static int lguest_fd; - -/* a per-cpu variable indicating whose vcpu is currently running */ -static unsigned int __thread cpu_id; - -/* This is our list of devices. */ -struct device_list { - /* Counter to assign interrupt numbers. */ - unsigned int next_irq; - - /* Counter to print out convenient device numbers. */ - unsigned int device_num; - - /* The descriptor page for the devices. */ - u8 *descpage; - - /* A single linked list of devices. */ - struct device *dev; - /* And a pointer to the last device for easy append. */ - struct device *lastdev; -}; - -/* The list of Guest devices, based on command line arguments. */ -static struct device_list devices; - -/* The device structure describes a single device. */ -struct device { - /* The linked-list pointer. */ - struct device *next; - - /* The device's descriptor, as mapped into the Guest. */ - struct lguest_device_desc *desc; - - /* We can't trust desc values once Guest has booted: we use these. */ - unsigned int feature_len; - unsigned int num_vq; - - /* The name of this device, for --verbose. */ - const char *name; - - /* Any queues attached to this device */ - struct virtqueue *vq; - - /* Is it operational */ - bool running; - - /* Does Guest want an intrrupt on empty? */ - bool irq_on_empty; - - /* Device-specific data. */ - void *priv; -}; - -/* The virtqueue structure describes a queue attached to a device. */ -struct virtqueue { - struct virtqueue *next; - - /* Which device owns me. */ - struct device *dev; - - /* The configuration for this queue. */ - struct lguest_vqconfig config; - - /* The actual ring of buffers. */ - struct vring vring; - - /* Last available index we saw. */ - u16 last_avail_idx; - - /* How many are used since we sent last irq? */ - unsigned int pending_used; - - /* Eventfd where Guest notifications arrive. */ - int eventfd; - - /* Function for the thread which is servicing this virtqueue. */ - void (*service)(struct virtqueue *vq); - pid_t thread; -}; - -/* Remember the arguments to the program so we can "reboot" */ -static char **main_args; - -/* The original tty settings to restore on exit. */ -static struct termios orig_term; - -/* - * We have to be careful with barriers: our devices are all run in separate - * threads and so we need to make sure that changes visible to the Guest happen - * in precise order. - */ -#define wmb() __asm__ __volatile__("" : : : "memory") -#define mb() __asm__ __volatile__("" : : : "memory") - -/* - * Convert an iovec element to the given type. - * - * This is a fairly ugly trick: we need to know the size of the type and - * alignment requirement to check the pointer is kosher. It's also nice to - * have the name of the type in case we report failure. - * - * Typing those three things all the time is cumbersome and error prone, so we - * have a macro which sets them all up and passes to the real function. - */ -#define convert(iov, type) \ - ((type *)_convert((iov), sizeof(type), __alignof__(type), #type)) - -static void *_convert(struct iovec *iov, size_t size, size_t align, - const char *name) -{ - if (iov->iov_len != size) - errx(1, "Bad iovec size %zu for %s", iov->iov_len, name); - if ((unsigned long)iov->iov_base % align != 0) - errx(1, "Bad alignment %p for %s", iov->iov_base, name); - return iov->iov_base; -} - -/* Wrapper for the last available index. Makes it easier to change. */ -#define lg_last_avail(vq) ((vq)->last_avail_idx) - -/* - * The virtio configuration space is defined to be little-endian. x86 is - * little-endian too, but it's nice to be explicit so we have these helpers. - */ -#define cpu_to_le16(v16) (v16) -#define cpu_to_le32(v32) (v32) -#define cpu_to_le64(v64) (v64) -#define le16_to_cpu(v16) (v16) -#define le32_to_cpu(v32) (v32) -#define le64_to_cpu(v64) (v64) - -/* Is this iovec empty? */ -static bool iov_empty(const struct iovec iov[], unsigned int num_iov) -{ - unsigned int i; - - for (i = 0; i < num_iov; i++) - if (iov[i].iov_len) - return false; - return true; -} - -/* Take len bytes from the front of this iovec. */ -static void iov_consume(struct iovec iov[], unsigned num_iov, unsigned len) -{ - unsigned int i; - - for (i = 0; i < num_iov; i++) { - unsigned int used; - - used = iov[i].iov_len < len ? iov[i].iov_len : len; - iov[i].iov_base += used; - iov[i].iov_len -= used; - len -= used; - } - assert(len == 0); -} - -/* The device virtqueue descriptors are followed by feature bitmasks. */ -static u8 *get_feature_bits(struct device *dev) -{ - return (u8 *)(dev->desc + 1) - + dev->num_vq * sizeof(struct lguest_vqconfig); -} - -/*L:100 - * The Launcher code itself takes us out into userspace, that scary place where - * pointers run wild and free! Unfortunately, like most userspace programs, - * it's quite boring (which is why everyone likes to hack on the kernel!). - * Perhaps if you make up an Lguest Drinking Game at this point, it will get - * you through this section. Or, maybe not. - * - * The Launcher sets up a big chunk of memory to be the Guest's "physical" - * memory and stores it in "guest_base". In other words, Guest physical == - * Launcher virtual with an offset. - * - * This can be tough to get your head around, but usually it just means that we - * use these trivial conversion functions when the Guest gives us its - * "physical" addresses: - */ -static void *from_guest_phys(unsigned long addr) -{ - return guest_base + addr; -} - -static unsigned long to_guest_phys(const void *addr) -{ - return (addr - guest_base); -} - -/*L:130 - * Loading the Kernel. - * - * We start with couple of simple helper routines. open_or_die() avoids - * error-checking code cluttering the callers: - */ -static int open_or_die(const char *name, int flags) -{ - int fd = open(name, flags); - if (fd < 0) - err(1, "Failed to open %s", name); - return fd; -} - -/* map_zeroed_pages() takes a number of pages. */ -static void *map_zeroed_pages(unsigned int num) -{ - int fd = open_or_die("/dev/zero", O_RDONLY); - void *addr; - - /* - * We use a private mapping (ie. if we write to the page, it will be - * copied). We allocate an extra two pages PROT_NONE to act as guard - * pages against read/write attempts that exceed allocated space. - */ - addr = mmap(NULL, getpagesize() * (num+2), - PROT_NONE, MAP_PRIVATE, fd, 0); - - if (addr == MAP_FAILED) - err(1, "Mmapping %u pages of /dev/zero", num); - - if (mprotect(addr + getpagesize(), getpagesize() * num, - PROT_READ|PROT_WRITE) == -1) - err(1, "mprotect rw %u pages failed", num); - - /* - * One neat mmap feature is that you can close the fd, and it - * stays mapped. - */ - close(fd); - - /* Return address after PROT_NONE page */ - return addr + getpagesize(); -} - -/* Get some more pages for a device. */ -static void *get_pages(unsigned int num) -{ - void *addr = from_guest_phys(guest_limit); - - guest_limit += num * getpagesize(); - if (guest_limit > guest_max) - errx(1, "Not enough memory for devices"); - return addr; -} - -/* - * This routine is used to load the kernel or initrd. It tries mmap, but if - * that fails (Plan 9's kernel file isn't nicely aligned on page boundaries), - * it falls back to reading the memory in. - */ -static void map_at(int fd, void *addr, unsigned long offset, unsigned long len) -{ - ssize_t r; - - /* - * We map writable even though for some segments are marked read-only. - * The kernel really wants to be writable: it patches its own - * instructions. - * - * MAP_PRIVATE means that the page won't be copied until a write is - * done to it. This allows us to share untouched memory between - * Guests. - */ - if (mmap(addr, len, PROT_READ|PROT_WRITE, - MAP_FIXED|MAP_PRIVATE, fd, offset) != MAP_FAILED) - return; - - /* pread does a seek and a read in one shot: saves a few lines. */ - r = pread(fd, addr, len, offset); - if (r != len) - err(1, "Reading offset %lu len %lu gave %zi", offset, len, r); -} - -/* - * This routine takes an open vmlinux image, which is in ELF, and maps it into - * the Guest memory. ELF = Embedded Linking Format, which is the format used - * by all modern binaries on Linux including the kernel. - * - * The ELF headers give *two* addresses: a physical address, and a virtual - * address. We use the physical address; the Guest will map itself to the - * virtual address. - * - * We return the starting address. - */ -static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr) -{ - Elf32_Phdr phdr[ehdr->e_phnum]; - unsigned int i; - - /* - * Sanity checks on the main ELF header: an x86 executable with a - * reasonable number of correctly-sized program headers. - */ - if (ehdr->e_type != ET_EXEC - || ehdr->e_machine != EM_386 - || ehdr->e_phentsize != sizeof(Elf32_Phdr) - || ehdr->e_phnum < 1 || ehdr->e_phnum > 65536U/sizeof(Elf32_Phdr)) - errx(1, "Malformed elf header"); - - /* - * An ELF executable contains an ELF header and a number of "program" - * headers which indicate which parts ("segments") of the program to - * load where. - */ - - /* We read in all the program headers at once: */ - if (lseek(elf_fd, ehdr->e_phoff, SEEK_SET) < 0) - err(1, "Seeking to program headers"); - if (read(elf_fd, phdr, sizeof(phdr)) != sizeof(phdr)) - err(1, "Reading program headers"); - - /* - * Try all the headers: there are usually only three. A read-only one, - * a read-write one, and a "note" section which we don't load. - */ - for (i = 0; i < ehdr->e_phnum; i++) { - /* If this isn't a loadable segment, we ignore it */ - if (phdr[i].p_type != PT_LOAD) - continue; - - verbose("Section %i: size %i addr %p\n", - i, phdr[i].p_memsz, (void *)phdr[i].p_paddr); - - /* We map this section of the file at its physical address. */ - map_at(elf_fd, from_guest_phys(phdr[i].p_paddr), - phdr[i].p_offset, phdr[i].p_filesz); - } - - /* The entry point is given in the ELF header. */ - return ehdr->e_entry; -} - -/*L:150 - * A bzImage, unlike an ELF file, is not meant to be loaded. You're supposed - * to jump into it and it will unpack itself. We used to have to perform some - * hairy magic because the unpacking code scared me. - * - * Fortunately, Jeremy Fitzhardinge convinced me it wasn't that hard and wrote - * a small patch to jump over the tricky bits in the Guest, so now we just read - * the funky header so we know where in the file to load, and away we go! - */ -static unsigned long load_bzimage(int fd) -{ - struct boot_params boot; - int r; - /* Modern bzImages get loaded at 1M. */ - void *p = from_guest_phys(0x100000); - - /* - * Go back to the start of the file and read the header. It should be - * a Linux boot header (see Documentation/x86/i386/boot.txt) - */ - lseek(fd, 0, SEEK_SET); - read(fd, &boot, sizeof(boot)); - - /* Inside the setup_hdr, we expect the magic "HdrS" */ - if (memcmp(&boot.hdr.header, "HdrS", 4) != 0) - errx(1, "This doesn't look like a bzImage to me"); - - /* Skip over the extra sectors of the header. */ - lseek(fd, (boot.hdr.setup_sects+1) * 512, SEEK_SET); - - /* Now read everything into memory. in nice big chunks. */ - while ((r = read(fd, p, 65536)) > 0) - p += r; - - /* Finally, code32_start tells us where to enter the kernel. */ - return boot.hdr.code32_start; -} - -/*L:140 - * Loading the kernel is easy when it's a "vmlinux", but most kernels - * come wrapped up in the self-decompressing "bzImage" format. With a little - * work, we can load those, too. - */ -static unsigned long load_kernel(int fd) -{ - Elf32_Ehdr hdr; - - /* Read in the first few bytes. */ - if (read(fd, &hdr, sizeof(hdr)) != sizeof(hdr)) - err(1, "Reading kernel"); - - /* If it's an ELF file, it starts with "\177ELF" */ - if (memcmp(hdr.e_ident, ELFMAG, SELFMAG) == 0) - return map_elf(fd, &hdr); - - /* Otherwise we assume it's a bzImage, and try to load it. */ - return load_bzimage(fd); -} - -/* - * This is a trivial little helper to align pages. Andi Kleen hated it because - * it calls getpagesize() twice: "it's dumb code." - * - * Kernel guys get really het up about optimization, even when it's not - * necessary. I leave this code as a reaction against that. - */ -static inline unsigned long page_align(unsigned long addr) -{ - /* Add upwards and truncate downwards. */ - return ((addr + getpagesize()-1) & ~(getpagesize()-1)); -} - -/*L:180 - * An "initial ram disk" is a disk image loaded into memory along with the - * kernel which the kernel can use to boot from without needing any drivers. - * Most distributions now use this as standard: the initrd contains the code to - * load the appropriate driver modules for the current machine. - * - * Importantly, James Morris works for RedHat, and Fedora uses initrds for its - * kernels. He sent me this (and tells me when I break it). - */ -static unsigned long load_initrd(const char *name, unsigned long mem) -{ - int ifd; - struct stat st; - unsigned long len; - - ifd = open_or_die(name, O_RDONLY); - /* fstat() is needed to get the file size. */ - if (fstat(ifd, &st) < 0) - err(1, "fstat() on initrd '%s'", name); - - /* - * We map the initrd at the top of memory, but mmap wants it to be - * page-aligned, so we round the size up for that. - */ - len = page_align(st.st_size); - map_at(ifd, from_guest_phys(mem - len), 0, st.st_size); - /* - * Once a file is mapped, you can close the file descriptor. It's a - * little odd, but quite useful. - */ - close(ifd); - verbose("mapped initrd %s size=%lu @ %p\n", name, len, (void*)mem-len); - - /* We return the initrd size. */ - return len; -} -/*:*/ - -/* - * Simple routine to roll all the commandline arguments together with spaces - * between them. - */ -static void concat(char *dst, char *args[]) -{ - unsigned int i, len = 0; - - for (i = 0; args[i]; i++) { - if (i) { - strcat(dst+len, " "); - len++; - } - strcpy(dst+len, args[i]); - len += strlen(args[i]); - } - /* In case it's empty. */ - dst[len] = '\0'; -} - -/*L:185 - * This is where we actually tell the kernel to initialize the Guest. We - * saw the arguments it expects when we looked at initialize() in lguest_user.c: - * the base of Guest "physical" memory, the top physical page to allow and the - * entry point for the Guest. - */ -static void tell_kernel(unsigned long start) -{ - unsigned long args[] = { LHREQ_INITIALIZE, - (unsigned long)guest_base, - guest_limit / getpagesize(), start }; - verbose("Guest: %p - %p (%#lx)\n", - guest_base, guest_base + guest_limit, guest_limit); - lguest_fd = open_or_die("/dev/lguest", O_RDWR); - if (write(lguest_fd, args, sizeof(args)) < 0) - err(1, "Writing to /dev/lguest"); -} -/*:*/ - -/*L:200 - * Device Handling. - * - * When the Guest gives us a buffer, it sends an array of addresses and sizes. - * We need to make sure it's not trying to reach into the Launcher itself, so - * we have a convenient routine which checks it and exits with an error message - * if something funny is going on: - */ -static void *_check_pointer(unsigned long addr, unsigned int size, - unsigned int line) -{ - /* - * Check if the requested address and size exceeds the allocated memory, - * or addr + size wraps around. - */ - if ((addr + size) > guest_limit || (addr + size) < addr) - errx(1, "%s:%i: Invalid address %#lx", __FILE__, line, addr); - /* - * We return a pointer for the caller's convenience, now we know it's - * safe to use. - */ - return from_guest_phys(addr); -} -/* A macro which transparently hands the line number to the real function. */ -#define check_pointer(addr,size) _check_pointer(addr, size, __LINE__) - -/* - * Each buffer in the virtqueues is actually a chain of descriptors. This - * function returns the next descriptor in the chain, or vq->vring.num if we're - * at the end. - */ -static unsigned next_desc(struct vring_desc *desc, - unsigned int i, unsigned int max) -{ - unsigned int next; - - /* If this descriptor says it doesn't chain, we're done. */ - if (!(desc[i].flags & VRING_DESC_F_NEXT)) - return max; - - /* Check they're not leading us off end of descriptors. */ - next = desc[i].next; - /* Make sure compiler knows to grab that: we don't want it changing! */ - wmb(); - - if (next >= max) - errx(1, "Desc next is %u", next); - - return next; -} - -/* - * This actually sends the interrupt for this virtqueue, if we've used a - * buffer. - */ -static void trigger_irq(struct virtqueue *vq) -{ - unsigned long buf[] = { LHREQ_IRQ, vq->config.irq }; - - /* Don't inform them if nothing used. */ - if (!vq->pending_used) - return; - vq->pending_used = 0; - - /* If they don't want an interrupt, don't send one... */ - if (vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT) { - /* ... unless they've asked us to force one on empty. */ - if (!vq->dev->irq_on_empty - || lg_last_avail(vq) != vq->vring.avail->idx) - return; - } - - /* Send the Guest an interrupt tell them we used something up. */ - if (write(lguest_fd, buf, sizeof(buf)) != 0) - err(1, "Triggering irq %i", vq->config.irq); -} - -/* - * This looks in the virtqueue for the first available buffer, and converts - * it to an iovec for convenient access. Since descriptors consist of some - * number of output then some number of input descriptors, it's actually two - * iovecs, but we pack them into one and note how many of each there were. - * - * This function waits if necessary, and returns the descriptor number found. - */ -static unsigned wait_for_vq_desc(struct virtqueue *vq, - struct iovec iov[], - unsigned int *out_num, unsigned int *in_num) -{ - unsigned int i, head, max; - struct vring_desc *desc; - u16 last_avail = lg_last_avail(vq); - - /* There's nothing available? */ - while (last_avail == vq->vring.avail->idx) { - u64 event; - - /* - * Since we're about to sleep, now is a good time to tell the - * Guest about what we've used up to now. - */ - trigger_irq(vq); - - /* OK, now we need to know about added descriptors. */ - vq->vring.used->flags &= ~VRING_USED_F_NO_NOTIFY; - - /* - * They could have slipped one in as we were doing that: make - * sure it's written, then check again. - */ - mb(); - if (last_avail != vq->vring.avail->idx) { - vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY; - break; - } - - /* Nothing new? Wait for eventfd to tell us they refilled. */ - if (read(vq->eventfd, &event, sizeof(event)) != sizeof(event)) - errx(1, "Event read failed?"); - - /* We don't need to be notified again. */ - vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY; - } - - /* Check it isn't doing very strange things with descriptor numbers. */ - if ((u16)(vq->vring.avail->idx - last_avail) > vq->vring.num) - errx(1, "Guest moved used index from %u to %u", - last_avail, vq->vring.avail->idx); - - /* - * Grab the next descriptor number they're advertising, and increment - * the index we've seen. - */ - head = vq->vring.avail->ring[last_avail % vq->vring.num]; - lg_last_avail(vq)++; - - /* If their number is silly, that's a fatal mistake. */ - if (head >= vq->vring.num) - errx(1, "Guest says index %u is available", head); - - /* When we start there are none of either input nor output. */ - *out_num = *in_num = 0; - - max = vq->vring.num; - desc = vq->vring.desc; - i = head; - - /* - * If this is an indirect entry, then this buffer contains a descriptor - * table which we handle as if it's any normal descriptor chain. - */ - if (desc[i].flags & VRING_DESC_F_INDIRECT) { - if (desc[i].len % sizeof(struct vring_desc)) - errx(1, "Invalid size for indirect buffer table"); - - max = desc[i].len / sizeof(struct vring_desc); - desc = check_pointer(desc[i].addr, desc[i].len); - i = 0; - } - - do { - /* Grab the first descriptor, and check it's OK. */ - iov[*out_num + *in_num].iov_len = desc[i].len; - iov[*out_num + *in_num].iov_base - = check_pointer(desc[i].addr, desc[i].len); - /* If this is an input descriptor, increment that count. */ - if (desc[i].flags & VRING_DESC_F_WRITE) - (*in_num)++; - else { - /* - * If it's an output descriptor, they're all supposed - * to come before any input descriptors. - */ - if (*in_num) - errx(1, "Descriptor has out after in"); - (*out_num)++; - } - - /* If we've got too many, that implies a descriptor loop. */ - if (*out_num + *in_num > max) - errx(1, "Looped descriptor"); - } while ((i = next_desc(desc, i, max)) != max); - - return head; -} - -/* - * After we've used one of their buffers, we tell the Guest about it. Sometime - * later we'll want to send them an interrupt using trigger_irq(); note that - * wait_for_vq_desc() does that for us if it has to wait. - */ -static void add_used(struct virtqueue *vq, unsigned int head, int len) -{ - struct vring_used_elem *used; - - /* - * The virtqueue contains a ring of used buffers. Get a pointer to the - * next entry in that used ring. - */ - used = &vq->vring.used->ring[vq->vring.used->idx % vq->vring.num]; - used->id = head; - used->len = len; - /* Make sure buffer is written before we update index. */ - wmb(); - vq->vring.used->idx++; - vq->pending_used++; -} - -/* And here's the combo meal deal. Supersize me! */ -static void add_used_and_trigger(struct virtqueue *vq, unsigned head, int len) -{ - add_used(vq, head, len); - trigger_irq(vq); -} - -/* - * The Console - * - * We associate some data with the console for our exit hack. - */ -struct console_abort { - /* How many times have they hit ^C? */ - int count; - /* When did they start? */ - struct timeval start; -}; - -/* This is the routine which handles console input (ie. stdin). */ -static void console_input(struct virtqueue *vq) -{ - int len; - unsigned int head, in_num, out_num; - struct console_abort *abort = vq->dev->priv; - struct iovec iov[vq->vring.num]; - - /* Make sure there's a descriptor available. */ - head = wait_for_vq_desc(vq, iov, &out_num, &in_num); - if (out_num) - errx(1, "Output buffers in console in queue?"); - - /* Read into it. This is where we usually wait. */ - len = readv(STDIN_FILENO, iov, in_num); - if (len <= 0) { - /* Ran out of input? */ - warnx("Failed to get console input, ignoring console."); - /* - * For simplicity, dying threads kill the whole Launcher. So - * just nap here. - */ - for (;;) - pause(); - } - - /* Tell the Guest we used a buffer. */ - add_used_and_trigger(vq, head, len); - - /* - * Three ^C within one second? Exit. - * - * This is such a hack, but works surprisingly well. Each ^C has to - * be in a buffer by itself, so they can't be too fast. But we check - * that we get three within about a second, so they can't be too - * slow. - */ - if (len != 1 || ((char *)iov[0].iov_base)[0] != 3) { - abort->count = 0; - return; - } - - abort->count++; - if (abort->count == 1) - gettimeofday(&abort->start, NULL); - else if (abort->count == 3) { - struct timeval now; - gettimeofday(&now, NULL); - /* Kill all Launcher processes with SIGINT, like normal ^C */ - if (now.tv_sec <= abort->start.tv_sec+1) - kill(0, SIGINT); - abort->count = 0; - } -} - -/* This is the routine which handles console output (ie. stdout). */ -static void console_output(struct virtqueue *vq) -{ - unsigned int head, out, in; - struct iovec iov[vq->vring.num]; - - /* We usually wait in here, for the Guest to give us something. */ - head = wait_for_vq_desc(vq, iov, &out, &in); - if (in) - errx(1, "Input buffers in console output queue?"); - - /* writev can return a partial write, so we loop here. */ - while (!iov_empty(iov, out)) { - int len = writev(STDOUT_FILENO, iov, out); - if (len <= 0) - err(1, "Write to stdout gave %i", len); - iov_consume(iov, out, len); - } - - /* - * We're finished with that buffer: if we're going to sleep, - * wait_for_vq_desc() will prod the Guest with an interrupt. - */ - add_used(vq, head, 0); -} - -/* - * The Network - * - * Handling output for network is also simple: we get all the output buffers - * and write them to /dev/net/tun. - */ -struct net_info { - int tunfd; -}; - -static void net_output(struct virtqueue *vq) -{ - struct net_info *net_info = vq->dev->priv; - unsigned int head, out, in; - struct iovec iov[vq->vring.num]; - - /* We usually wait in here for the Guest to give us a packet. */ - head = wait_for_vq_desc(vq, iov, &out, &in); - if (in) - errx(1, "Input buffers in net output queue?"); - /* - * Send the whole thing through to /dev/net/tun. It expects the exact - * same format: what a coincidence! - */ - if (writev(net_info->tunfd, iov, out) < 0) - errx(1, "Write to tun failed?"); - - /* - * Done with that one; wait_for_vq_desc() will send the interrupt if - * all packets are processed. - */ - add_used(vq, head, 0); -} - -/* - * Handling network input is a bit trickier, because I've tried to optimize it. - * - * First we have a helper routine which tells is if from this file descriptor - * (ie. the /dev/net/tun device) will block: - */ -static bool will_block(int fd) -{ - fd_set fdset; - struct timeval zero = { 0, 0 }; - FD_ZERO(&fdset); - FD_SET(fd, &fdset); - return select(fd+1, &fdset, NULL, NULL, &zero) != 1; -} - -/* - * This handles packets coming in from the tun device to our Guest. Like all - * service routines, it gets called again as soon as it returns, so you don't - * see a while(1) loop here. - */ -static void net_input(struct virtqueue *vq) -{ - int len; - unsigned int head, out, in; - struct iovec iov[vq->vring.num]; - struct net_info *net_info = vq->dev->priv; - - /* - * Get a descriptor to write an incoming packet into. This will also - * send an interrupt if they're out of descriptors. - */ - head = wait_for_vq_desc(vq, iov, &out, &in); - if (out) - errx(1, "Output buffers in net input queue?"); - - /* - * If it looks like we'll block reading from the tun device, send them - * an interrupt. - */ - if (vq->pending_used && will_block(net_info->tunfd)) - trigger_irq(vq); - - /* - * Read in the packet. This is where we normally wait (when there's no - * incoming network traffic). - */ - len = readv(net_info->tunfd, iov, in); - if (len <= 0) - err(1, "Failed to read from tun."); - - /* - * Mark that packet buffer as used, but don't interrupt here. We want - * to wait until we've done as much work as we can. - */ - add_used(vq, head, len); -} -/*:*/ - -/* This is the helper to create threads: run the service routine in a loop. */ -static int do_thread(void *_vq) -{ - struct virtqueue *vq = _vq; - - for (;;) - vq->service(vq); - return 0; -} - -/* - * When a child dies, we kill our entire process group with SIGTERM. This - * also has the side effect that the shell restores the console for us! - */ -static void kill_launcher(int signal) -{ - kill(0, SIGTERM); -} - -static void reset_device(struct device *dev) -{ - struct virtqueue *vq; - - verbose("Resetting device %s\n", dev->name); - - /* Clear any features they've acked. */ - memset(get_feature_bits(dev) + dev->feature_len, 0, dev->feature_len); - - /* We're going to be explicitly killing threads, so ignore them. */ - signal(SIGCHLD, SIG_IGN); - - /* Zero out the virtqueues, get rid of their threads */ - for (vq = dev->vq; vq; vq = vq->next) { - if (vq->thread != (pid_t)-1) { - kill(vq->thread, SIGTERM); - waitpid(vq->thread, NULL, 0); - vq->thread = (pid_t)-1; - } - memset(vq->vring.desc, 0, - vring_size(vq->config.num, LGUEST_VRING_ALIGN)); - lg_last_avail(vq) = 0; - } - dev->running = false; - - /* Now we care if threads die. */ - signal(SIGCHLD, (void *)kill_launcher); -} - -/*L:216 - * This actually creates the thread which services the virtqueue for a device. - */ -static void create_thread(struct virtqueue *vq) -{ - /* - * Create stack for thread. Since the stack grows upwards, we point - * the stack pointer to the end of this region. - */ - char *stack = malloc(32768); - unsigned long args[] = { LHREQ_EVENTFD, - vq->config.pfn*getpagesize(), 0 }; - - /* Create a zero-initialized eventfd. */ - vq->eventfd = eventfd(0, 0); - if (vq->eventfd < 0) - err(1, "Creating eventfd"); - args[2] = vq->eventfd; - - /* - * Attach an eventfd to this virtqueue: it will go off when the Guest - * does an LHCALL_NOTIFY for this vq. - */ - if (write(lguest_fd, &args, sizeof(args)) != 0) - err(1, "Attaching eventfd"); - - /* - * CLONE_VM: because it has to access the Guest memory, and SIGCHLD so - * we get a signal if it dies. - */ - vq->thread = clone(do_thread, stack + 32768, CLONE_VM | SIGCHLD, vq); - if (vq->thread == (pid_t)-1) - err(1, "Creating clone"); - - /* We close our local copy now the child has it. */ - close(vq->eventfd); -} - -static bool accepted_feature(struct device *dev, unsigned int bit) -{ - const u8 *features = get_feature_bits(dev) + dev->feature_len; - - if (dev->feature_len < bit / CHAR_BIT) - return false; - return features[bit / CHAR_BIT] & (1 << (bit % CHAR_BIT)); -} - -static void start_device(struct device *dev) -{ - unsigned int i; - struct virtqueue *vq; - - verbose("Device %s OK: offered", dev->name); - for (i = 0; i < dev->feature_len; i++) - verbose(" %02x", get_feature_bits(dev)[i]); - verbose(", accepted"); - for (i = 0; i < dev->feature_len; i++) - verbose(" %02x", get_feature_bits(dev) - [dev->feature_len+i]); - - dev->irq_on_empty = accepted_feature(dev, VIRTIO_F_NOTIFY_ON_EMPTY); - - for (vq = dev->vq; vq; vq = vq->next) { - if (vq->service) - create_thread(vq); - } - dev->running = true; -} - -static void cleanup_devices(void) -{ - struct device *dev; - - for (dev = devices.dev; dev; dev = dev->next) - reset_device(dev); - - /* If we saved off the original terminal settings, restore them now. */ - if (orig_term.c_lflag & (ISIG|ICANON|ECHO)) - tcsetattr(STDIN_FILENO, TCSANOW, &orig_term); -} - -/* When the Guest tells us they updated the status field, we handle it. */ -static void update_device_status(struct device *dev) -{ - /* A zero status is a reset, otherwise it's a set of flags. */ - if (dev->desc->status == 0) - reset_device(dev); - else if (dev->desc->status & VIRTIO_CONFIG_S_FAILED) { - warnx("Device %s configuration FAILED", dev->name); - if (dev->running) - reset_device(dev); - } else if (dev->desc->status & VIRTIO_CONFIG_S_DRIVER_OK) { - if (!dev->running) - start_device(dev); - } -} - -/*L:215 - * This is the generic routine we call when the Guest uses LHCALL_NOTIFY. In - * particular, it's used to notify us of device status changes during boot. - */ -static void handle_output(unsigned long addr) -{ - struct device *i; - - /* Check each device. */ - for (i = devices.dev; i; i = i->next) { - struct virtqueue *vq; - - /* - * Notifications to device descriptors mean they updated the - * device status. - */ - if (from_guest_phys(addr) == i->desc) { - update_device_status(i); - return; - } - - /* - * Devices *can* be used before status is set to DRIVER_OK. - * The original plan was that they would never do this: they - * would always finish setting up their status bits before - * actually touching the virtqueues. In practice, we allowed - * them to, and they do (eg. the disk probes for partition - * tables as part of initialization). - * - * If we see this, we start the device: once it's running, we - * expect the device to catch all the notifications. - */ - for (vq = i->vq; vq; vq = vq->next) { - if (addr != vq->config.pfn*getpagesize()) - continue; - if (i->running) - errx(1, "Notification on running %s", i->name); - /* This just calls create_thread() for each virtqueue */ - start_device(i); - return; - } - } - - /* - * Early console write is done using notify on a nul-terminated string - * in Guest memory. It's also great for hacking debugging messages - * into a Guest. - */ - if (addr >= guest_limit) - errx(1, "Bad NOTIFY %#lx", addr); - - write(STDOUT_FILENO, from_guest_phys(addr), - strnlen(from_guest_phys(addr), guest_limit - addr)); -} - -/*L:190 - * Device Setup - * - * All devices need a descriptor so the Guest knows it exists, and a "struct - * device" so the Launcher can keep track of it. We have common helper - * routines to allocate and manage them. - */ - -/* - * The layout of the device page is a "struct lguest_device_desc" followed by a - * number of virtqueue descriptors, then two sets of feature bits, then an - * array of configuration bytes. This routine returns the configuration - * pointer. - */ -static u8 *device_config(const struct device *dev) -{ - return (void *)(dev->desc + 1) - + dev->num_vq * sizeof(struct lguest_vqconfig) - + dev->feature_len * 2; -} - -/* - * This routine allocates a new "struct lguest_device_desc" from descriptor - * table page just above the Guest's normal memory. It returns a pointer to - * that descriptor. - */ -static struct lguest_device_desc *new_dev_desc(u16 type) -{ - struct lguest_device_desc d = { .type = type }; - void *p; - - /* Figure out where the next device config is, based on the last one. */ - if (devices.lastdev) - p = device_config(devices.lastdev) - + devices.lastdev->desc->config_len; - else - p = devices.descpage; - - /* We only have one page for all the descriptors. */ - if (p + sizeof(d) > (void *)devices.descpage + getpagesize()) - errx(1, "Too many devices"); - - /* p might not be aligned, so we memcpy in. */ - return memcpy(p, &d, sizeof(d)); -} - -/* - * Each device descriptor is followed by the description of its virtqueues. We - * specify how many descriptors the virtqueue is to have. - */ -static void add_virtqueue(struct device *dev, unsigned int num_descs, - void (*service)(struct virtqueue *)) -{ - unsigned int pages; - struct virtqueue **i, *vq = malloc(sizeof(*vq)); - void *p; - - /* First we need some memory for this virtqueue. */ - pages = (vring_size(num_descs, LGUEST_VRING_ALIGN) + getpagesize() - 1) - / getpagesize(); - p = get_pages(pages); - - /* Initialize the virtqueue */ - vq->next = NULL; - vq->last_avail_idx = 0; - vq->dev = dev; - - /* - * This is the routine the service thread will run, and its Process ID - * once it's running. - */ - vq->service = service; - vq->thread = (pid_t)-1; - - /* Initialize the configuration. */ - vq->config.num = num_descs; - vq->config.irq = devices.next_irq++; - vq->config.pfn = to_guest_phys(p) / getpagesize(); - - /* Initialize the vring. */ - vring_init(&vq->vring, num_descs, p, LGUEST_VRING_ALIGN); - - /* - * Append virtqueue to this device's descriptor. We use - * device_config() to get the end of the device's current virtqueues; - * we check that we haven't added any config or feature information - * yet, otherwise we'd be overwriting them. - */ - assert(dev->desc->config_len == 0 && dev->desc->feature_len == 0); - memcpy(device_config(dev), &vq->config, sizeof(vq->config)); - dev->num_vq++; - dev->desc->num_vq++; - - verbose("Virtqueue page %#lx\n", to_guest_phys(p)); - - /* - * Add to tail of list, so dev->vq is first vq, dev->vq->next is - * second. - */ - for (i = &dev->vq; *i; i = &(*i)->next); - *i = vq; -} - -/* - * The first half of the feature bitmask is for us to advertise features. The - * second half is for the Guest to accept features. - */ -static void add_feature(struct device *dev, unsigned bit) -{ - u8 *features = get_feature_bits(dev); - - /* We can't extend the feature bits once we've added config bytes */ - if (dev->desc->feature_len <= bit / CHAR_BIT) { - assert(dev->desc->config_len == 0); - dev->feature_len = dev->desc->feature_len = (bit/CHAR_BIT) + 1; - } - - features[bit / CHAR_BIT] |= (1 << (bit % CHAR_BIT)); -} - -/* - * This routine sets the configuration fields for an existing device's - * descriptor. It only works for the last device, but that's OK because that's - * how we use it. - */ -static void set_config(struct device *dev, unsigned len, const void *conf) -{ - /* Check we haven't overflowed our single page. */ - if (device_config(dev) + len > devices.descpage + getpagesize()) - errx(1, "Too many devices"); - - /* Copy in the config information, and store the length. */ - memcpy(device_config(dev), conf, len); - dev->desc->config_len = len; - - /* Size must fit in config_len field (8 bits)! */ - assert(dev->desc->config_len == len); -} - -/* - * This routine does all the creation and setup of a new device, including - * calling new_dev_desc() to allocate the descriptor and device memory. We - * don't actually start the service threads until later. - * - * See what I mean about userspace being boring? - */ -static struct device *new_device(const char *name, u16 type) -{ - struct device *dev = malloc(sizeof(*dev)); - - /* Now we populate the fields one at a time. */ - dev->desc = new_dev_desc(type); - dev->name = name; - dev->vq = NULL; - dev->feature_len = 0; - dev->num_vq = 0; - dev->running = false; - - /* - * Append to device list. Prepending to a single-linked list is - * easier, but the user expects the devices to be arranged on the bus - * in command-line order. The first network device on the command line - * is eth0, the first block device /dev/vda, etc. - */ - if (devices.lastdev) - devices.lastdev->next = dev; - else - devices.dev = dev; - devices.lastdev = dev; - - return dev; -} - -/* - * Our first setup routine is the console. It's a fairly simple device, but - * UNIX tty handling makes it uglier than it could be. - */ -static void setup_console(void) -{ - struct device *dev; - - /* If we can save the initial standard input settings... */ - if (tcgetattr(STDIN_FILENO, &orig_term) == 0) { - struct termios term = orig_term; - /* - * Then we turn off echo, line buffering and ^C etc: We want a - * raw input stream to the Guest. - */ - term.c_lflag &= ~(ISIG|ICANON|ECHO); - tcsetattr(STDIN_FILENO, TCSANOW, &term); - } - - dev = new_device("console", VIRTIO_ID_CONSOLE); - - /* We store the console state in dev->priv, and initialize it. */ - dev->priv = malloc(sizeof(struct console_abort)); - ((struct console_abort *)dev->priv)->count = 0; - - /* - * The console needs two virtqueues: the input then the output. When - * they put something the input queue, we make sure we're listening to - * stdin. When they put something in the output queue, we write it to - * stdout. - */ - add_virtqueue(dev, VIRTQUEUE_NUM, console_input); - add_virtqueue(dev, VIRTQUEUE_NUM, console_output); - - verbose("device %u: console\n", ++devices.device_num); -} -/*:*/ - -/*M:010 - * Inter-guest networking is an interesting area. Simplest is to have a - * --sharenet= option which opens or creates a named pipe. This can be - * used to send packets to another guest in a 1:1 manner. - * - * More sopisticated is to use one of the tools developed for project like UML - * to do networking. - * - * Faster is to do virtio bonding in kernel. Doing this 1:1 would be - * completely generic ("here's my vring, attach to your vring") and would work - * for any traffic. Of course, namespace and permissions issues need to be - * dealt with. A more sophisticated "multi-channel" virtio_net.c could hide - * multiple inter-guest channels behind one interface, although it would - * require some manner of hotplugging new virtio channels. - * - * Finally, we could implement a virtio network switch in the kernel. -:*/ - -static u32 str2ip(const char *ipaddr) -{ - unsigned int b[4]; - - if (sscanf(ipaddr, "%u.%u.%u.%u", &b[0], &b[1], &b[2], &b[3]) != 4) - errx(1, "Failed to parse IP address '%s'", ipaddr); - return (b[0] << 24) | (b[1] << 16) | (b[2] << 8) | b[3]; -} - -static void str2mac(const char *macaddr, unsigned char mac[6]) -{ - unsigned int m[6]; - if (sscanf(macaddr, "%02x:%02x:%02x:%02x:%02x:%02x", - &m[0], &m[1], &m[2], &m[3], &m[4], &m[5]) != 6) - errx(1, "Failed to parse mac address '%s'", macaddr); - mac[0] = m[0]; - mac[1] = m[1]; - mac[2] = m[2]; - mac[3] = m[3]; - mac[4] = m[4]; - mac[5] = m[5]; -} - -/* - * This code is "adapted" from libbridge: it attaches the Host end of the - * network device to the bridge device specified by the command line. - * - * This is yet another James Morris contribution (I'm an IP-level guy, so I - * dislike bridging), and I just try not to break it. - */ -static void add_to_bridge(int fd, const char *if_name, const char *br_name) -{ - int ifidx; - struct ifreq ifr; - - if (!*br_name) - errx(1, "must specify bridge name"); - - ifidx = if_nametoindex(if_name); - if (!ifidx) - errx(1, "interface %s does not exist!", if_name); - - strncpy(ifr.ifr_name, br_name, IFNAMSIZ); - ifr.ifr_name[IFNAMSIZ-1] = '\0'; - ifr.ifr_ifindex = ifidx; - if (ioctl(fd, SIOCBRADDIF, &ifr) < 0) - err(1, "can't add %s to bridge %s", if_name, br_name); -} - -/* - * This sets up the Host end of the network device with an IP address, brings - * it up so packets will flow, the copies the MAC address into the hwaddr - * pointer. - */ -static void configure_device(int fd, const char *tapif, u32 ipaddr) -{ - struct ifreq ifr; - struct sockaddr_in sin; - - memset(&ifr, 0, sizeof(ifr)); - strcpy(ifr.ifr_name, tapif); - - /* Don't read these incantations. Just cut & paste them like I did! */ - sin.sin_family = AF_INET; - sin.sin_addr.s_addr = htonl(ipaddr); - memcpy(&ifr.ifr_addr, &sin, sizeof(sin)); - if (ioctl(fd, SIOCSIFADDR, &ifr) != 0) - err(1, "Setting %s interface address", tapif); - ifr.ifr_flags = IFF_UP; - if (ioctl(fd, SIOCSIFFLAGS, &ifr) != 0) - err(1, "Bringing interface %s up", tapif); -} - -static int get_tun_device(char tapif[IFNAMSIZ]) -{ - struct ifreq ifr; - int netfd; - - /* Start with this zeroed. Messy but sure. */ - memset(&ifr, 0, sizeof(ifr)); - - /* - * We open the /dev/net/tun device and tell it we want a tap device. A - * tap device is like a tun device, only somehow different. To tell - * the truth, I completely blundered my way through this code, but it - * works now! - */ - netfd = open_or_die("/dev/net/tun", O_RDWR); - ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_VNET_HDR; - strcpy(ifr.ifr_name, "tap%d"); - if (ioctl(netfd, TUNSETIFF, &ifr) != 0) - err(1, "configuring /dev/net/tun"); - - if (ioctl(netfd, TUNSETOFFLOAD, - TUN_F_CSUM|TUN_F_TSO4|TUN_F_TSO6|TUN_F_TSO_ECN) != 0) - err(1, "Could not set features for tun device"); - - /* - * We don't need checksums calculated for packets coming in this - * device: trust us! - */ - ioctl(netfd, TUNSETNOCSUM, 1); - - memcpy(tapif, ifr.ifr_name, IFNAMSIZ); - return netfd; -} - -/*L:195 - * Our network is a Host<->Guest network. This can either use bridging or - * routing, but the principle is the same: it uses the "tun" device to inject - * packets into the Host as if they came in from a normal network card. We - * just shunt packets between the Guest and the tun device. - */ -static void setup_tun_net(char *arg) -{ - struct device *dev; - struct net_info *net_info = malloc(sizeof(*net_info)); - int ipfd; - u32 ip = INADDR_ANY; - bool bridging = false; - char tapif[IFNAMSIZ], *p; - struct virtio_net_config conf; - - net_info->tunfd = get_tun_device(tapif); - - /* First we create a new network device. */ - dev = new_device("net", VIRTIO_ID_NET); - dev->priv = net_info; - - /* Network devices need a recv and a send queue, just like console. */ - add_virtqueue(dev, VIRTQUEUE_NUM, net_input); - add_virtqueue(dev, VIRTQUEUE_NUM, net_output); - - /* - * We need a socket to perform the magic network ioctls to bring up the - * tap interface, connect to the bridge etc. Any socket will do! - */ - ipfd = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP); - if (ipfd < 0) - err(1, "opening IP socket"); - - /* If the command line was --tunnet=bridge: do bridging. */ - if (!strncmp(BRIDGE_PFX, arg, strlen(BRIDGE_PFX))) { - arg += strlen(BRIDGE_PFX); - bridging = true; - } - - /* A mac address may follow the bridge name or IP address */ - p = strchr(arg, ':'); - if (p) { - str2mac(p+1, conf.mac); - add_feature(dev, VIRTIO_NET_F_MAC); - *p = '\0'; - } - - /* arg is now either an IP address or a bridge name */ - if (bridging) - add_to_bridge(ipfd, tapif, arg); - else - ip = str2ip(arg); - - /* Set up the tun device. */ - configure_device(ipfd, tapif, ip); - - add_feature(dev, VIRTIO_F_NOTIFY_ON_EMPTY); - /* Expect Guest to handle everything except UFO */ - add_feature(dev, VIRTIO_NET_F_CSUM); - add_feature(dev, VIRTIO_NET_F_GUEST_CSUM); - add_feature(dev, VIRTIO_NET_F_GUEST_TSO4); - add_feature(dev, VIRTIO_NET_F_GUEST_TSO6); - add_feature(dev, VIRTIO_NET_F_GUEST_ECN); - add_feature(dev, VIRTIO_NET_F_HOST_TSO4); - add_feature(dev, VIRTIO_NET_F_HOST_TSO6); - add_feature(dev, VIRTIO_NET_F_HOST_ECN); - /* We handle indirect ring entries */ - add_feature(dev, VIRTIO_RING_F_INDIRECT_DESC); - set_config(dev, sizeof(conf), &conf); - - /* We don't need the socket any more; setup is done. */ - close(ipfd); - - devices.device_num++; - - if (bridging) - verbose("device %u: tun %s attached to bridge: %s\n", - devices.device_num, tapif, arg); - else - verbose("device %u: tun %s: %s\n", - devices.device_num, tapif, arg); -} -/*:*/ - -/* This hangs off device->priv. */ -struct vblk_info { - /* The size of the file. */ - off64_t len; - - /* The file descriptor for the file. */ - int fd; - -}; - -/*L:210 - * The Disk - * - * The disk only has one virtqueue, so it only has one thread. It is really - * simple: the Guest asks for a block number and we read or write that position - * in the file. - * - * Before we serviced each virtqueue in a separate thread, that was unacceptably - * slow: the Guest waits until the read is finished before running anything - * else, even if it could have been doing useful work. - * - * We could have used async I/O, except it's reputed to suck so hard that - * characters actually go missing from your code when you try to use it. - */ -static void blk_request(struct virtqueue *vq) -{ - struct vblk_info *vblk = vq->dev->priv; - unsigned int head, out_num, in_num, wlen; - int ret; - u8 *in; - struct virtio_blk_outhdr *out; - struct iovec iov[vq->vring.num]; - off64_t off; - - /* - * Get the next request, where we normally wait. It triggers the - * interrupt to acknowledge previously serviced requests (if any). - */ - head = wait_for_vq_desc(vq, iov, &out_num, &in_num); - - /* - * Every block request should contain at least one output buffer - * (detailing the location on disk and the type of request) and one - * input buffer (to hold the result). - */ - if (out_num == 0 || in_num == 0) - errx(1, "Bad virtblk cmd %u out=%u in=%u", - head, out_num, in_num); - - out = convert(&iov[0], struct virtio_blk_outhdr); - in = convert(&iov[out_num+in_num-1], u8); - /* - * For historical reasons, block operations are expressed in 512 byte - * "sectors". - */ - off = out->sector * 512; - - /* - * In general the virtio block driver is allowed to try SCSI commands. - * It'd be nice if we supported eject, for example, but we don't. - */ - if (out->type & VIRTIO_BLK_T_SCSI_CMD) { - fprintf(stderr, "Scsi commands unsupported\n"); - *in = VIRTIO_BLK_S_UNSUPP; - wlen = sizeof(*in); - } else if (out->type & VIRTIO_BLK_T_OUT) { - /* - * Write - * - * Move to the right location in the block file. This can fail - * if they try to write past end. - */ - if (lseek64(vblk->fd, off, SEEK_SET) != off) - err(1, "Bad seek to sector %llu", out->sector); - - ret = writev(vblk->fd, iov+1, out_num-1); - verbose("WRITE to sector %llu: %i\n", out->sector, ret); - - /* - * Grr... Now we know how long the descriptor they sent was, we - * make sure they didn't try to write over the end of the block - * file (possibly extending it). - */ - if (ret > 0 && off + ret > vblk->len) { - /* Trim it back to the correct length */ - ftruncate64(vblk->fd, vblk->len); - /* Die, bad Guest, die. */ - errx(1, "Write past end %llu+%u", off, ret); - } - - wlen = sizeof(*in); - *in = (ret >= 0 ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR); - } else if (out->type & VIRTIO_BLK_T_FLUSH) { - /* Flush */ - ret = fdatasync(vblk->fd); - verbose("FLUSH fdatasync: %i\n", ret); - wlen = sizeof(*in); - *in = (ret >= 0 ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR); - } else { - /* - * Read - * - * Move to the right location in the block file. This can fail - * if they try to read past end. - */ - if (lseek64(vblk->fd, off, SEEK_SET) != off) - err(1, "Bad seek to sector %llu", out->sector); - - ret = readv(vblk->fd, iov+1, in_num-1); - verbose("READ from sector %llu: %i\n", out->sector, ret); - if (ret >= 0) { - wlen = sizeof(*in) + ret; - *in = VIRTIO_BLK_S_OK; - } else { - wlen = sizeof(*in); - *in = VIRTIO_BLK_S_IOERR; - } - } - - /* Finished that request. */ - add_used(vq, head, wlen); -} - -/*L:198 This actually sets up a virtual block device. */ -static void setup_block_file(const char *filename) -{ - struct device *dev; - struct vblk_info *vblk; - struct virtio_blk_config conf; - - /* Creat the device. */ - dev = new_device("block", VIRTIO_ID_BLOCK); - - /* The device has one virtqueue, where the Guest places requests. */ - add_virtqueue(dev, VIRTQUEUE_NUM, blk_request); - - /* Allocate the room for our own bookkeeping */ - vblk = dev->priv = malloc(sizeof(*vblk)); - - /* First we open the file and store the length. */ - vblk->fd = open_or_die(filename, O_RDWR|O_LARGEFILE); - vblk->len = lseek64(vblk->fd, 0, SEEK_END); - - /* We support FLUSH. */ - add_feature(dev, VIRTIO_BLK_F_FLUSH); - - /* Tell Guest how many sectors this device has. */ - conf.capacity = cpu_to_le64(vblk->len / 512); - - /* - * Tell Guest not to put in too many descriptors at once: two are used - * for the in and out elements. - */ - add_feature(dev, VIRTIO_BLK_F_SEG_MAX); - conf.seg_max = cpu_to_le32(VIRTQUEUE_NUM - 2); - - /* Don't try to put whole struct: we have 8 bit limit. */ - set_config(dev, offsetof(struct virtio_blk_config, geometry), &conf); - - verbose("device %u: virtblock %llu sectors\n", - ++devices.device_num, le64_to_cpu(conf.capacity)); -} - -/*L:211 - * Our random number generator device reads from /dev/random into the Guest's - * input buffers. The usual case is that the Guest doesn't want random numbers - * and so has no buffers although /dev/random is still readable, whereas - * console is the reverse. - * - * The same logic applies, however. - */ -struct rng_info { - int rfd; -}; - -static void rng_input(struct virtqueue *vq) -{ - int len; - unsigned int head, in_num, out_num, totlen = 0; - struct rng_info *rng_info = vq->dev->priv; - struct iovec iov[vq->vring.num]; - - /* First we need a buffer from the Guests's virtqueue. */ - head = wait_for_vq_desc(vq, iov, &out_num, &in_num); - if (out_num) - errx(1, "Output buffers in rng?"); - - /* - * Just like the console write, we loop to cover the whole iovec. - * In this case, short reads actually happen quite a bit. - */ - while (!iov_empty(iov, in_num)) { - len = readv(rng_info->rfd, iov, in_num); - if (len <= 0) - err(1, "Read from /dev/random gave %i", len); - iov_consume(iov, in_num, len); - totlen += len; - } - - /* Tell the Guest about the new input. */ - add_used(vq, head, totlen); -} - -/*L:199 - * This creates a "hardware" random number device for the Guest. - */ -static void setup_rng(void) -{ - struct device *dev; - struct rng_info *rng_info = malloc(sizeof(*rng_info)); - - /* Our device's privat info simply contains the /dev/random fd. */ - rng_info->rfd = open_or_die("/dev/random", O_RDONLY); - - /* Create the new device. */ - dev = new_device("rng", VIRTIO_ID_RNG); - dev->priv = rng_info; - - /* The device has one virtqueue, where the Guest places inbufs. */ - add_virtqueue(dev, VIRTQUEUE_NUM, rng_input); - - verbose("device %u: rng\n", devices.device_num++); -} -/* That's the end of device setup. */ - -/*L:230 Reboot is pretty easy: clean up and exec() the Launcher afresh. */ -static void __attribute__((noreturn)) restart_guest(void) -{ - unsigned int i; - - /* - * Since we don't track all open fds, we simply close everything beyond - * stderr. - */ - for (i = 3; i < FD_SETSIZE; i++) - close(i); - - /* Reset all the devices (kills all threads). */ - cleanup_devices(); - - execv(main_args[0], main_args); - err(1, "Could not exec %s", main_args[0]); -} - -/*L:220 - * Finally we reach the core of the Launcher which runs the Guest, serves - * its input and output, and finally, lays it to rest. - */ -static void __attribute__((noreturn)) run_guest(void) -{ - for (;;) { - unsigned long notify_addr; - int readval; - - /* We read from the /dev/lguest device to run the Guest. */ - readval = pread(lguest_fd, ¬ify_addr, - sizeof(notify_addr), cpu_id); - - /* One unsigned long means the Guest did HCALL_NOTIFY */ - if (readval == sizeof(notify_addr)) { - verbose("Notify on address %#lx\n", notify_addr); - handle_output(notify_addr); - /* ENOENT means the Guest died. Reading tells us why. */ - } else if (errno == ENOENT) { - char reason[1024] = { 0 }; - pread(lguest_fd, reason, sizeof(reason)-1, cpu_id); - errx(1, "%s", reason); - /* ERESTART means that we need to reboot the guest */ - } else if (errno == ERESTART) { - restart_guest(); - /* Anything else means a bug or incompatible change. */ - } else - err(1, "Running guest failed"); - } -} -/*L:240 - * This is the end of the Launcher. The good news: we are over halfway - * through! The bad news: the most fiendish part of the code still lies ahead - * of us. - * - * Are you ready? Take a deep breath and join me in the core of the Host, in - * "make Host". -:*/ - -static struct option opts[] = { - { "verbose", 0, NULL, 'v' }, - { "tunnet", 1, NULL, 't' }, - { "block", 1, NULL, 'b' }, - { "rng", 0, NULL, 'r' }, - { "initrd", 1, NULL, 'i' }, - { "username", 1, NULL, 'u' }, - { "chroot", 1, NULL, 'c' }, - { NULL }, -}; -static void usage(void) -{ - errx(1, "Usage: lguest [--verbose] " - "[--tunnet=(:|bridge::)\n" - "|--block=|--initrd=]...\n" - " vmlinux [args...]"); -} - -/*L:105 The main routine is where the real work begins: */ -int main(int argc, char *argv[]) -{ - /* Memory, code startpoint and size of the (optional) initrd. */ - unsigned long mem = 0, start, initrd_size = 0; - /* Two temporaries. */ - int i, c; - /* The boot information for the Guest. */ - struct boot_params *boot; - /* If they specify an initrd file to load. */ - const char *initrd_name = NULL; - - /* Password structure for initgroups/setres[gu]id */ - struct passwd *user_details = NULL; - - /* Directory to chroot to */ - char *chroot_path = NULL; - - /* Save the args: we "reboot" by execing ourselves again. */ - main_args = argv; - - /* - * First we initialize the device list. We keep a pointer to the last - * device, and the next interrupt number to use for devices (1: - * remember that 0 is used by the timer). - */ - devices.lastdev = NULL; - devices.next_irq = 1; - - /* We're CPU 0. In fact, that's the only CPU possible right now. */ - cpu_id = 0; - - /* - * We need to know how much memory so we can set up the device - * descriptor and memory pages for the devices as we parse the command - * line. So we quickly look through the arguments to find the amount - * of memory now. - */ - for (i = 1; i < argc; i++) { - if (argv[i][0] != '-') { - mem = atoi(argv[i]) * 1024 * 1024; - /* - * We start by mapping anonymous pages over all of - * guest-physical memory range. This fills it with 0, - * and ensures that the Guest won't be killed when it - * tries to access it. - */ - guest_base = map_zeroed_pages(mem / getpagesize() - + DEVICE_PAGES); - guest_limit = mem; - guest_max = mem + DEVICE_PAGES*getpagesize(); - devices.descpage = get_pages(1); - break; - } - } - - /* The options are fairly straight-forward */ - while ((c = getopt_long(argc, argv, "v", opts, NULL)) != EOF) { - switch (c) { - case 'v': - verbose = true; - break; - case 't': - setup_tun_net(optarg); - break; - case 'b': - setup_block_file(optarg); - break; - case 'r': - setup_rng(); - break; - case 'i': - initrd_name = optarg; - break; - case 'u': - user_details = getpwnam(optarg); - if (!user_details) - err(1, "getpwnam failed, incorrect username?"); - break; - case 'c': - chroot_path = optarg; - break; - default: - warnx("Unknown argument %s", argv[optind]); - usage(); - } - } - /* - * After the other arguments we expect memory and kernel image name, - * followed by command line arguments for the kernel. - */ - if (optind + 2 > argc) - usage(); - - verbose("Guest base is at %p\n", guest_base); - - /* We always have a console device */ - setup_console(); - - /* Now we load the kernel */ - start = load_kernel(open_or_die(argv[optind+1], O_RDONLY)); - - /* Boot information is stashed at physical address 0 */ - boot = from_guest_phys(0); - - /* Map the initrd image if requested (at top of physical memory) */ - if (initrd_name) { - initrd_size = load_initrd(initrd_name, mem); - /* - * These are the location in the Linux boot header where the - * start and size of the initrd are expected to be found. - */ - boot->hdr.ramdisk_image = mem - initrd_size; - boot->hdr.ramdisk_size = initrd_size; - /* The bootloader type 0xFF means "unknown"; that's OK. */ - boot->hdr.type_of_loader = 0xFF; - } - - /* - * The Linux boot header contains an "E820" memory map: ours is a - * simple, single region. - */ - boot->e820_entries = 1; - boot->e820_map[0] = ((struct e820entry) { 0, mem, E820_RAM }); - /* - * The boot header contains a command line pointer: we put the command - * line after the boot header. - */ - boot->hdr.cmd_line_ptr = to_guest_phys(boot + 1); - /* We use a simple helper to copy the arguments separated by spaces. */ - concat((char *)(boot + 1), argv+optind+2); - - /* Boot protocol version: 2.07 supports the fields for lguest. */ - boot->hdr.version = 0x207; - - /* The hardware_subarch value of "1" tells the Guest it's an lguest. */ - boot->hdr.hardware_subarch = 1; - - /* Tell the entry path not to try to reload segment registers. */ - boot->hdr.loadflags |= KEEP_SEGMENTS; - - /* - * We tell the kernel to initialize the Guest: this returns the open - * /dev/lguest file descriptor. - */ - tell_kernel(start); - - /* Ensure that we terminate if a device-servicing child dies. */ - signal(SIGCHLD, kill_launcher); - - /* If we exit via err(), this kills all the threads, restores tty. */ - atexit(cleanup_devices); - - /* If requested, chroot to a directory */ - if (chroot_path) { - if (chroot(chroot_path) != 0) - err(1, "chroot(\"%s\") failed", chroot_path); - - if (chdir("/") != 0) - err(1, "chdir(\"/\") failed"); - - verbose("chroot done\n"); - } - - /* If requested, drop privileges */ - if (user_details) { - uid_t u; - gid_t g; - - u = user_details->pw_uid; - g = user_details->pw_gid; - - if (initgroups(user_details->pw_name, g) != 0) - err(1, "initgroups failed"); - - if (setresgid(g, g, g) != 0) - err(1, "setresgid failed"); - - if (setresuid(u, u, u) != 0) - err(1, "setresuid failed"); - - verbose("Dropping privileges completed\n"); - } - - /* Finally, run the Guest. This doesn't return. */ - run_guest(); -} -/*:*/ - -/*M:999 - * Mastery is done: you now know everything I do. - * - * But surely you have seen code, features and bugs in your wanderings which - * you now yearn to attack? That is the real game, and I look forward to you - * patching and forking lguest into the Your-Name-Here-visor. - * - * Farewell, and good coding! - * Rusty Russell. - */ diff --git a/Documentation/lguest/lguest.txt b/Documentation/lguest/lguest.txt deleted file mode 100644 index dad9997..0000000 --- a/Documentation/lguest/lguest.txt +++ /dev/null @@ -1,128 +0,0 @@ - __ - (___()'`; Rusty's Remarkably Unreliable Guide to Lguest - /, /` - or, A Young Coder's Illustrated Hypervisor - \\"--\\ http://lguest.ozlabs.org - -Lguest is designed to be a minimal 32-bit x86 hypervisor for the Linux kernel, -for Linux developers and users to experiment with virtualization with the -minimum of complexity. Nonetheless, it should have sufficient features to -make it useful for specific tasks, and, of course, you are encouraged to fork -and enhance it (see drivers/lguest/README). - -Features: - -- Kernel module which runs in a normal kernel. -- Simple I/O model for communication. -- Simple program to create new guests. -- Logo contains cute puppies: http://lguest.ozlabs.org - -Developer features: - -- Fun to hack on. -- No ABI: being tied to a specific kernel anyway, you can change anything. -- Many opportunities for improvement or feature implementation. - -Running Lguest: - -- The easiest way to run lguest is to use same kernel as guest and host. - You can configure them differently, but usually it's easiest not to. - - You will need to configure your kernel with the following options: - - "General setup": - "Prompt for development and/or incomplete code/drivers" = Y - (CONFIG_EXPERIMENTAL=y) - - "Processor type and features": - "Paravirtualized guest support" = Y - "Lguest guest support" = Y - "High Memory Support" = off/4GB - "Alignment value to which kernel should be aligned" = 0x100000 - (CONFIG_PARAVIRT=y, CONFIG_LGUEST_GUEST=y, CONFIG_HIGHMEM64G=n and - CONFIG_PHYSICAL_ALIGN=0x100000) - - "Device Drivers": - "Block devices" - "Virtio block driver (EXPERIMENTAL)" = M/Y - "Network device support" - "Universal TUN/TAP device driver support" = M/Y - "Virtio network driver (EXPERIMENTAL)" = M/Y - (CONFIG_VIRTIO_BLK=m, CONFIG_VIRTIO_NET=m and CONFIG_TUN=m) - - "Virtualization" - "Linux hypervisor example code" = M/Y - (CONFIG_LGUEST=m) - -- A tool called "lguest" is available in this directory: type "make" - to build it. If you didn't build your kernel in-tree, use "make - O=". - -- Create or find a root disk image. There are several useful ones - around, such as the xm-test tiny root image at - http://xm-test.xensource.com/ramdisks/initrd-1.1-i386.img - - For more serious work, I usually use a distribution ISO image and - install it under qemu, then make multiple copies: - - dd if=/dev/zero of=rootfile bs=1M count=2048 - qemu -cdrom image.iso -hda rootfile -net user -net nic -boot d - - Make sure that you install a getty on /dev/hvc0 if you want to log in on the - console! - -- "modprobe lg" if you built it as a module. - -- Run an lguest as root: - - Documentation/lguest/lguest 64 vmlinux --tunnet=192.168.19.1 --block=rootfile root=/dev/vda - - Explanation: - 64: the amount of memory to use, in MB. - - vmlinux: the kernel image found in the top of your build directory. You - can also use a standard bzImage. - - --tunnet=192.168.19.1: configures a "tap" device for networking with this - IP address. - - --block=rootfile: a file or block device which becomes /dev/vda - inside the guest. - - root=/dev/vda: this (and anything else on the command line) are - kernel boot parameters. - -- Configuring networking. I usually have the host masquerade, using - "iptables -t nat -A POSTROUTING -o eth0 -j MASQUERADE" and "echo 1 > - /proc/sys/net/ipv4/ip_forward". In this example, I would configure - eth0 inside the guest at 192.168.19.2. - - Another method is to bridge the tap device to an external interface - using --tunnet=bridge:, and perhaps run dhcp on the guest - to obtain an IP address. The bridge needs to be configured first: - this option simply adds the tap interface to it. - - A simple example on my system: - - ifconfig eth0 0.0.0.0 - brctl addbr lg0 - ifconfig lg0 up - brctl addif lg0 eth0 - dhclient lg0 - - Then use --tunnet=bridge:lg0 when launching the guest. - - See: - - http://www.linuxfoundation.org/collaborate/workgroups/networking/bridge - - for general information on how to get bridging to work. - -- Random number generation. Using the --rng option will provide a - /dev/hwrng in the guest that will read from the host's /dev/random. - Use this option in conjunction with rng-tools (see ../hw_random.txt) - to provide entropy to the guest kernel's /dev/random. - -There is a helpful mailing list at http://ozlabs.org/mailman/listinfo/lguest - -Good luck! -Rusty Russell rusty@rustcorp.com.au. diff --git a/Documentation/uml/UserModeLinux-HOWTO.txt b/Documentation/uml/UserModeLinux-HOWTO.txt deleted file mode 100644 index 9b7e190..0000000 --- a/Documentation/uml/UserModeLinux-HOWTO.txt +++ /dev/null @@ -1,4579 +0,0 @@ - User Mode Linux HOWTO - User Mode Linux Core Team - Mon Nov 18 14:16:16 EST 2002 - - This document describes the use and abuse of Jeff Dike's User Mode - Linux: a port of the Linux kernel as a normal Intel Linux process. - ______________________________________________________________________ - - Table of Contents - - 1. Introduction - - 1.1 How is User Mode Linux Different? - 1.2 Why Would I Want User Mode Linux? - - 2. Compiling the kernel and modules - - 2.1 Compiling the kernel - 2.2 Compiling and installing kernel modules - 2.3 Compiling and installing uml_utilities - - 3. Running UML and logging in - - 3.1 Running UML - 3.2 Logging in - 3.3 Examples - - 4. UML on 2G/2G hosts - - 4.1 Introduction - 4.2 The problem - 4.3 The solution - - 5. Setting up serial lines and consoles - - 5.1 Specifying the device - 5.2 Specifying the channel - 5.3 Examples - - 6. Setting up the network - - 6.1 General setup - 6.2 Userspace daemons - 6.3 Specifying ethernet addresses - 6.4 UML interface setup - 6.5 Multicast - 6.6 TUN/TAP with the uml_net helper - 6.7 TUN/TAP with a preconfigured tap device - 6.8 Ethertap - 6.9 The switch daemon - 6.10 Slip - 6.11 Slirp - 6.12 pcap - 6.13 Setting up the host yourself - - 7. Sharing Filesystems between Virtual Machines - - 7.1 A warning - 7.2 Using layered block devices - 7.3 Note! - 7.4 Another warning - 7.5 uml_moo : Merging a COW file with its backing file - - 8. Creating filesystems - - 8.1 Create the filesystem file - 8.2 Assign the file to a UML device - 8.3 Creating and mounting the filesystem - - 9. Host file access - - 9.1 Using hostfs - 9.2 hostfs as the root filesystem - 9.3 Building hostfs - - 10. The Management Console - 10.1 version - 10.2 halt and reboot - 10.3 config - 10.4 remove - 10.5 sysrq - 10.6 help - 10.7 cad - 10.8 stop - 10.9 go - - 11. Kernel debugging - - 11.1 Starting the kernel under gdb - 11.2 Examining sleeping processes - 11.3 Running ddd on UML - 11.4 Debugging modules - 11.5 Attaching gdb to the kernel - 11.6 Using alternate debuggers - - 12. Kernel debugging examples - - 12.1 The case of the hung fsck - 12.2 Episode 2: The case of the hung fsck - - 13. What to do when UML doesn't work - - 13.1 Strange compilation errors when you build from source - 13.2 (obsolete) - 13.3 A variety of panics and hangs with /tmp on a reiserfs filesystem - 13.4 The compile fails with errors about conflicting types for 'open', 'dup', and 'waitpid' - 13.5 UML doesn't work when /tmp is an NFS filesystem - 13.6 UML hangs on boot when compiled with gprof support - 13.7 syslogd dies with a SIGTERM on startup - 13.8 TUN/TAP networking doesn't work on a 2.4 host - 13.9 You can network to the host but not to other machines on the net - 13.10 I have no root and I want to scream - 13.11 UML build conflict between ptrace.h and ucontext.h - 13.12 The UML BogoMips is exactly half the host's BogoMips - 13.13 When you run UML, it immediately segfaults - 13.14 xterms appear, then immediately disappear - 13.15 Any other panic, hang, or strange behavior - - 14. Diagnosing Problems - - 14.1 Case 1 : Normal kernel panics - 14.2 Case 2 : Tracing thread panics - 14.3 Case 3 : Tracing thread panics caused by other threads - 14.4 Case 4 : Hangs - - 15. Thanks - - 15.1 Code and Documentation - 15.2 Flushing out bugs - 15.3 Buglets and clean-ups - 15.4 Case Studies - 15.5 Other contributions - - - ______________________________________________________________________ - - 11.. IInnttrroodduuccttiioonn - - Welcome to User Mode Linux. It's going to be fun. - - - - 11..11.. HHooww iiss UUsseerr MMooddee LLiinnuuxx DDiiffffeerreenntt?? - - Normally, the Linux Kernel talks straight to your hardware (video - card, keyboard, hard drives, etc), and any programs which run ask the - kernel to operate the hardware, like so: - - - - +-----------+-----------+----+ - | Process 1 | Process 2 | ...| - +-----------+-----------+----+ - | Linux Kernel | - +----------------------------+ - | Hardware | - +----------------------------+ - - - - - The User Mode Linux Kernel is different; instead of talking to the - hardware, it talks to a `real' Linux kernel (called the `host kernel' - from now on), like any other program. Programs can then run inside - User-Mode Linux as if they were running under a normal kernel, like - so: - - - - +----------------+ - | Process 2 | ...| - +-----------+----------------+ - | Process 1 | User-Mode Linux| - +----------------------------+ - | Linux Kernel | - +----------------------------+ - | Hardware | - +----------------------------+ - - - - - - 11..22.. WWhhyy WWoouulldd II WWaanntt UUsseerr MMooddee LLiinnuuxx?? - - - 1. If User Mode Linux crashes, your host kernel is still fine. - - 2. You can run a usermode kernel as a non-root user. - - 3. You can debug the User Mode Linux like any normal process. - - 4. You can run gprof (profiling) and gcov (coverage testing). - - 5. You can play with your kernel without breaking things. - - 6. You can use it as a sandbox for testing new apps. - - 7. You can try new development kernels safely. - - 8. You can run different distributions simultaneously. - - 9. It's extremely fun. - - - - - - 22.. CCoommppiilliinngg tthhee kkeerrnneell aanndd mmoodduulleess - - - - - 22..11.. CCoommppiilliinngg tthhee kkeerrnneell - - - Compiling the user mode kernel is just like compiling any other - kernel. Let's go through the steps, using 2.4.0-prerelease (current - as of this writing) as an example: - - - 1. Download the latest UML patch from - - the download page - . - - - 3. Make a directory and unpack the kernel into it. - - - - host% - mkdir ~/uml - - - - - - - host% - cd ~/uml - - - - - - - host% - tar -xzvf linux-2.4.0-prerelease.tar.bz2 - - - - - - - 4. Apply the patch using - - - - host% - cd ~/uml/linux - - - - host% - bzcat uml-patch-2.4.0-prerelease.bz2 | patch -p1 - - - - - - - 5. Run your favorite config; `make xconfig ARCH=um' is the most - convenient. `make config ARCH=um' and 'make menuconfig ARCH=um' - will work as well. The defaults will give you a useful kernel. If - you want to change something, go ahead, it probably won't hurt - anything. - - - Note: If the host is configured with a 2G/2G address space split - rather than the usual 3G/1G split, then the packaged UML binaries - will not run. They will immediately segfault. See ``UML on 2G/2G - hosts'' for the scoop on running UML on your system. - - - - 6. Finish with `make linux ARCH=um': the result is a file called - `linux' in the top directory of your source tree. - - Make sure that you don't build this kernel in /usr/src/linux. On some - distributions, /usr/include/asm is a link into this pool. The user- - mode build changes the other end of that link, and things that include - stop compiling. - - The sources are also available from cvs at the project's cvs page, - which has directions on getting the sources. You can also browse the - CVS pool from there. - - If you get the CVS sources, you will have to check them out into an - empty directory. You will then have to copy each file into the - corresponding directory in the appropriate kernel pool. - - If you don't have the latest kernel pool, you can get the - corresponding user-mode sources with - - - host% cvs co -r v_2_3_x linux - - - - - where 'x' is the version in your pool. Note that you will not get the - bug fixes and enhancements that have gone into subsequent releases. - - - 22..22.. CCoommppiilliinngg aanndd iinnssttaalllliinngg kkeerrnneell mmoodduulleess - - UML modules are built in the same way as the native kernel (with the - exception of the 'ARCH=um' that you always need for UML): - - - host% make modules ARCH=um - - - - - Any modules that you want to load into this kernel need to be built in - the user-mode pool. Modules from the native kernel won't work. - - You can install them by using ftp or something to copy them into the - virtual machine and dropping them into /lib/modules/`uname -r`. - - You can also get the kernel build process to install them as follows: - - 1. with the kernel not booted, mount the root filesystem in the top - level of the kernel pool: - - - host% mount root_fs mnt -o loop - - - - - - - 2. run - - - host% - make modules_install INSTALL_MOD_PATH=`pwd`/mnt ARCH=um - - - - - - - 3. unmount the filesystem - - - host% umount mnt - - - - - - - 4. boot the kernel on it - - - When the system is booted, you can use insmod as usual to get the - modules into the kernel. A number of things have been loaded into UML - as modules, especially filesystems and network protocols and filters, - so most symbols which need to be exported probably already are. - However, if you do find symbols that need exporting, let us - know, and - they'll be "taken care of". - - - - 22..33.. CCoommppiilliinngg aanndd iinnssttaalllliinngg uummll__uuttiilliittiieess - - Many features of the UML kernel require a user-space helper program, - so a uml_utilities package is distributed separately from the kernel - patch which provides these helpers. Included within this is: - - +o port-helper - Used by consoles which connect to xterms or ports - - +o tunctl - Configuration tool to create and delete tap devices - - +o uml_net - Setuid binary for automatic tap device configuration - - +o uml_switch - User-space virtual switch required for daemon - transport - - The uml_utilities tree is compiled with: - - - host# - make && make install - - - - - Note that UML kernel patches may require a specific version of the - uml_utilities distribution. If you don't keep up with the mailing - lists, ensure that you have the latest release of uml_utilities if you - are experiencing problems with your UML kernel, particularly when - dealing with consoles or command-line switches to the helper programs - - - - - - - - - 33.. RRuunnnniinngg UUMMLL aanndd llooggggiinngg iinn - - - - 33..11.. RRuunnnniinngg UUMMLL - - It runs on 2.2.15 or later, and all 2.4 kernels. - - - Booting UML is straightforward. Simply run 'linux': it will try to - mount the file `root_fs' in the current directory. You do not need to - run it as root. If your root filesystem is not named `root_fs', then - you need to put a `ubd0=root_fs_whatever' switch on the linux command - line. - - - You will need a filesystem to boot UML from. There are a number - available for download from here . There are also several tools - which can be - used to generate UML-compatible filesystem images from media. - The kernel will boot up and present you with a login prompt. - - - Note: If the host is configured with a 2G/2G address space split - rather than the usual 3G/1G split, then the packaged UML binaries will - not run. They will immediately segfault. See ``UML on 2G/2G hosts'' - for the scoop on running UML on your system. - - - - 33..22.. LLooggggiinngg iinn - - - - The prepackaged filesystems have a root account with password 'root' - and a user account with password 'user'. The login banner will - generally tell you how to log in. So, you log in and you will find - yourself inside a little virtual machine. Our filesystems have a - variety of commands and utilities installed (and it is fairly easy to - add more), so you will have a lot of tools with which to poke around - the system. - - There are a couple of other ways to log in: - - +o On a virtual console - - - - Each virtual console that is configured (i.e. the device exists in - /dev and /etc/inittab runs a getty on it) will come up in its own - xterm. If you get tired of the xterms, read ``Setting up serial - lines and consoles'' to see how to attach the consoles to - something else, like host ptys. - - - - +o Over the serial line - - - In the boot output, find a line that looks like: - - - - serial line 0 assigned pty /dev/ptyp1 - - - - - Attach your favorite terminal program to the corresponding tty. I.e. - for minicom, the command would be - - - host% minicom -o -p /dev/ttyp1 - - - - - - - +o Over the net - - - If the network is running, then you can telnet to the virtual - machine and log in to it. See ``Setting up the network'' to learn - about setting up a virtual network. - - When you're done using it, run halt, and the kernel will bring itself - down and the process will exit. - - - 33..33.. EExxaammpplleess - - Here are some examples of UML in action: - - +o A login session - - +o A virtual network - - - - - - - - 44.. UUMMLL oonn 22GG//22GG hhoossttss - - - - - 44..11.. IInnttrroodduuccttiioonn - - - Most Linux machines are configured so that the kernel occupies the - upper 1G (0xc0000000 - 0xffffffff) of the 4G address space and - processes use the lower 3G (0x00000000 - 0xbfffffff). However, some - machine are configured with a 2G/2G split, with the kernel occupying - the upper 2G (0x80000000 - 0xffffffff) and processes using the lower - 2G (0x00000000 - 0x7fffffff). - - - - - 44..22.. TThhee pprroobblleemm - - - The prebuilt UML binaries on this site will not run on 2G/2G hosts - because UML occupies the upper .5G of the 3G process address space - (0xa0000000 - 0xbfffffff). Obviously, on 2G/2G hosts, this is right - in the middle of the kernel address space, so UML won't even load - it - will immediately segfault. - - - - - 44..33.. TThhee ssoolluuttiioonn - - - The fix for this is to rebuild UML from source after enabling - CONFIG_HOST_2G_2G (under 'General Setup'). This will cause UML to - load itself in the top .5G of that smaller process address space, - where it will run fine. See ``Compiling the kernel and modules'' if - you need help building UML from source. - - - - - - - - - - - 55.. SSeettttiinngg uupp sseerriiaall lliinneess aanndd ccoonnssoolleess - - - It is possible to attach UML serial lines and consoles to many types - of host I/O channels by specifying them on the command line. - - - You can attach them to host ptys, ttys, file descriptors, and ports. - This allows you to do things like - - +o have a UML console appear on an unused host console, - - +o hook two virtual machines together by having one attach to a pty - and having the other attach to the corresponding tty - - +o make a virtual machine accessible from the net by attaching a - console to a port on the host. - - - The general format of the command line option is device=channel. - - - - 55..11.. SSppeecciiffyyiinngg tthhee ddeevviiccee - - Devices are specified with "con" or "ssl" (console or serial line, - respectively), optionally with a device number if you are talking - about a specific device. - - - Using just "con" or "ssl" describes all of the consoles or serial - lines. If you want to talk about console #3 or serial line #10, they - would be "con3" and "ssl10", respectively. - - - A specific device name will override a less general "con=" or "ssl=". - So, for example, you can assign a pty to each of the serial lines - except for the first two like this: - - - ssl=pty ssl0=tty:/dev/tty0 ssl1=tty:/dev/tty1 - - - - - The specificity of the device name is all that matters; order on the - command line is irrelevant. - - - - 55..22.. SSppeecciiffyyiinngg tthhee cchhaannnneell - - There are a number of different types of channels to attach a UML - device to, each with a different way of specifying exactly what to - attach to. - - +o pseudo-terminals - device=pty pts terminals - device=pts - - - This will cause UML to allocate a free host pseudo-terminal for the - device. The terminal that it got will be announced in the boot - log. You access it by attaching a terminal program to the - corresponding tty: - - +o screen /dev/pts/n - - +o screen /dev/ttyxx - - +o minicom -o -p /dev/ttyxx - minicom seems not able to handle pts - devices - - +o kermit - start it up, 'open' the device, then 'connect' - - - - - - +o terminals - device=tty:tty device file - - - This will make UML attach the device to the specified tty (i.e - - - con1=tty:/dev/tty3 - - - - - will attach UML's console 1 to the host's /dev/tty3). If the tty that - you specify is the slave end of a tty/pty pair, something else must - have already opened the corresponding pty in order for this to work. - - - - - - +o xterms - device=xterm - - - UML will run an xterm and the device will be attached to it. - - - - - - +o Port - device=port:port number - - - This will attach the UML devices to the specified host port. - Attaching console 1 to the host's port 9000 would be done like - this: - - - con1=port:9000 - - - - - Attaching all the serial lines to that port would be done similarly: - - - ssl=port:9000 - - - - - You access these devices by telnetting to that port. Each active tel- - net session gets a different device. If there are more telnets to a - port than UML devices attached to it, then the extra telnet sessions - will block until an existing telnet detaches, or until another device - becomes active (i.e. by being activated in /etc/inittab). - - This channel has the advantage that you can both attach multiple UML - devices to it and know how to access them without reading the UML boot - log. It is also unique in allowing access to a UML from remote - machines without requiring that the UML be networked. This could be - useful in allowing public access to UMLs because they would be - accessible from the net, but wouldn't need any kind of network - filtering or access control because they would have no network access. - - - If you attach the main console to a portal, then the UML boot will - appear to hang. In reality, it's waiting for a telnet to connect, at - which point the boot will proceed. - - - - - - +o already-existing file descriptors - device=file descriptor - - - If you set up a file descriptor on the UML command line, you can - attach a UML device to it. This is most commonly used to put the - main console back on stdin and stdout after assigning all the other - consoles to something else: - - - con0=fd:0,fd:1 con=pts - - - - - - - - - +o Nothing - device=null - - - This allows the device to be opened, in contrast to 'none', but - reads will block, and writes will succeed and the data will be - thrown out. - - - - - - +o None - device=none - - - This causes the device to disappear. - - - - You can also specify different input and output channels for a device - by putting a comma between them: - - - ssl3=tty:/dev/tty2,xterm - - - - - will cause serial line 3 to accept input on the host's /dev/tty3 and - display output on an xterm. That's a silly example - the most common - use of this syntax is to reattach the main console to stdin and stdout - as shown above. - - - If you decide to move the main console away from stdin/stdout, the - initial boot output will appear in the terminal that you're running - UML in. However, once the console driver has been officially - initialized, then the boot output will start appearing wherever you - specified that console 0 should be. That device will receive all - subsequent output. - - - - 55..33.. EExxaammpplleess - - There are a number of interesting things you can do with this - capability. - - - First, this is how you get rid of those bleeding console xterms by - attaching them to host ptys: - - - con=pty con0=fd:0,fd:1 - - - - - This will make a UML console take over an unused host virtual console, - so that when you switch to it, you will see the UML login prompt - rather than the host login prompt: - - - con1=tty:/dev/tty6 - - - - - You can attach two virtual machines together with what amounts to a - serial line as follows: - - Run one UML with a serial line attached to a pty - - - - ssl1=pty - - - - - Look at the boot log to see what pty it got (this example will assume - that it got /dev/ptyp1). - - Boot the other UML with a serial line attached to the corresponding - tty - - - - ssl1=tty:/dev/ttyp1 - - - - - Log in, make sure that it has no getty on that serial line, attach a - terminal program like minicom to it, and you should see the login - prompt of the other virtual machine. - - - 66.. SSeettttiinngg uupp tthhee nneettwwoorrkk - - - - This page describes how to set up the various transports and to - provide a UML instance with network access to the host, other machines - on the local net, and the rest of the net. - - - As of 2.4.5, UML networking has been completely redone to make it much - easier to set up, fix bugs, and add new features. - - - There is a new helper, uml_net, which does the host setup that - requires root privileges. - - - There are currently five transport types available for a UML virtual - machine to exchange packets with other hosts: - - +o ethertap - - +o TUN/TAP - - +o Multicast - - +o a switch daemon - - +o slip - - +o slirp - - +o pcap - - The TUN/TAP, ethertap, slip, and slirp transports allow a UML - instance to exchange packets with the host. They may be directed - to the host or the host may just act as a router to provide access - to other physical or virtual machines. - - - The pcap transport is a synthetic read-only interface, using the - libpcap binary to collect packets from interfaces on the host and - filter them. This is useful for building preconfigured traffic - monitors or sniffers. - - - The daemon and multicast transports provide a completely virtual - network to other virtual machines. This network is completely - disconnected from the physical network unless one of the virtual - machines on it is acting as a gateway. - - - With so many host transports, which one should you use? Here's when - you should use each one: - - +o ethertap - if you want access to the host networking and it is - running 2.2 - - +o TUN/TAP - if you want access to the host networking and it is - running 2.4. Also, the TUN/TAP transport is able to use a - preconfigured device, allowing it to avoid using the setuid uml_net - helper, which is a security advantage. - - +o Multicast - if you want a purely virtual network and you don't want - to set up anything but the UML - - +o a switch daemon - if you want a purely virtual network and you - don't mind running the daemon in order to get somewhat better - performance - - +o slip - there is no particular reason to run the slip backend unless - ethertap and TUN/TAP are just not available for some reason - - +o slirp - if you don't have root access on the host to setup - networking, or if you don't want to allocate an IP to your UML - - +o pcap - not much use for actual network connectivity, but great for - monitoring traffic on the host - - Ethertap is available on 2.4 and works fine. TUN/TAP is preferred - to it because it has better performance and ethertap is officially - considered obsolete in 2.4. Also, the root helper only needs to - run occasionally for TUN/TAP, rather than handling every packet, as - it does with ethertap. This is a slight security advantage since - it provides fewer opportunities for a nasty UML user to somehow - exploit the helper's root privileges. - - - 66..11.. GGeenneerraall sseettuupp - - First, you must have the virtual network enabled in your UML. If are - running a prebuilt kernel from this site, everything is already - enabled. If you build the kernel yourself, under the "Network device - support" menu, enable "Network device support", and then the three - transports. - - - The next step is to provide a network device to the virtual machine. - This is done by describing it on the kernel command line. - - The general format is - - - eth = , - - - - - For example, a virtual ethernet device may be attached to a host - ethertap device as follows: - - - eth0=ethertap,tap0,fe:fd:0:0:0:1,192.168.0.254 - - - - - This sets up eth0 inside the virtual machine to attach itself to the - host /dev/tap0, assigns it an ethernet address, and assigns the host - tap0 interface an IP address. - - - - Note that the IP address you assign to the host end of the tap device - must be different than the IP you assign to the eth device inside UML. - If you are short on IPs and don't want to consume two per UML, then - you can reuse the host's eth IP address for the host ends of the tap - devices. Internally, the UMLs must still get unique IPs for their eth - devices. You can also give the UMLs non-routable IPs (192.168.x.x or - 10.x.x.x) and have the host masquerade them. This will let outgoing - connections work, but incoming connections won't without more work, - such as port forwarding from the host. - Also note that when you configure the host side of an interface, it is - only acting as a gateway. It will respond to pings sent to it - locally, but is not useful to do that since it's a host interface. - You are not talking to the UML when you ping that interface and get a - response. - - - You can also add devices to a UML and remove them at runtime. See the - ``The Management Console'' page for details. - - - The sections below describe this in more detail. - - - Once you've decided how you're going to set up the devices, you boot - UML, log in, configure the UML side of the devices, and set up routes - to the outside world. At that point, you will be able to talk to any - other machines, physical or virtual, on the net. - - - If ifconfig inside UML fails and the network refuses to come up, run - tell you what went wrong. - - - - 66..22.. UUsseerrssppaaccee ddaaeemmoonnss - - You will likely need the setuid helper, or the switch daemon, or both. - They are both installed with the RPM and deb, so if you've installed - either, you can skip the rest of this section. - - - If not, then you need to check them out of CVS, build them, and - install them. The helper is uml_net, in CVS /tools/uml_net, and the - daemon is uml_switch, in CVS /tools/uml_router. They are both built - with a plain 'make'. Both need to be installed in a directory that's - in your path - /usr/bin is recommend. On top of that, uml_net needs - to be setuid root. - - - - 66..33.. SSppeecciiffyyiinngg eetthheerrnneett aaddddrreesssseess - - Below, you will see that the TUN/TAP, ethertap, and daemon interfaces - allow you to specify hardware addresses for the virtual ethernet - devices. This is generally not necessary. If you don't have a - specific reason to do it, you probably shouldn't. If one is not - specified on the command line, the driver will assign one based on the - device IP address. It will provide the address fe:fd:nn:nn:nn:nn - where nn.nn.nn.nn is the device IP address. This is nearly always - sufficient to guarantee a unique hardware address for the device. A - couple of exceptions are: - - +o Another set of virtual ethernet devices are on the same network and - they are assigned hardware addresses using a different scheme which - may conflict with the UML IP address-based scheme - - +o You aren't going to use the device for IP networking, so you don't - assign the device an IP address - - If you let the driver provide the hardware address, you should make - sure that the device IP address is known before the interface is - brought up. So, inside UML, this will guarantee that: - - - - UML# - ifconfig eth0 192.168.0.250 up - - - - - If you decide to assign the hardware address yourself, make sure that - the first byte of the address is even. Addresses with an odd first - byte are broadcast addresses, which you don't want assigned to a - device. - - - - 66..44.. UUMMLL iinntteerrffaaccee sseettuupp - - Once the network devices have been described on the command line, you - should boot UML and log in. - - - The first thing to do is bring the interface up: - - - UML# ifconfig ethn ip-address up - - - - - You should be able to ping the host at this point. - - - To reach the rest of the world, you should set a default route to the - host: - - - UML# route add default gw host ip - - - - - Again, with host ip of 192.168.0.4: - - - UML# route add default gw 192.168.0.4 - - - - - This page used to recommend setting a network route to your local net. - This is wrong, because it will cause UML to try to figure out hardware - addresses of the local machines by arping on the interface to the - host. Since that interface is basically a single strand of ethernet - with two nodes on it (UML and the host) and arp requests don't cross - networks, they will fail to elicit any responses. So, what you want - is for UML to just blindly throw all packets at the host and let it - figure out what to do with them, which is what leaving out the network - route and adding the default route does. - - - Note: If you can't communicate with other hosts on your physical - ethernet, it's probably because of a network route that's - automatically set up. If you run 'route -n' and see a route that - looks like this: - - - - - Destination Gateway Genmask Flags Metric Ref Use Iface - 192.168.0.0 0.0.0.0 255.255.255.0 U 0 0 0 eth0 - - - - - with a mask that's not 255.255.255.255, then replace it with a route - to your host: - - - UML# - route del -net 192.168.0.0 dev eth0 netmask 255.255.255.0 - - - - - - - UML# - route add -host 192.168.0.4 dev eth0 - - - - - This, plus the default route to the host, will allow UML to exchange - packets with any machine on your ethernet. - - - - 66..55.. MMuullttiiccaasstt - - The simplest way to set up a virtual network between multiple UMLs is - to use the mcast transport. This was written by Harald Welte and is - present in UML version 2.4.5-5um and later. Your system must have - multicast enabled in the kernel and there must be a multicast-capable - network device on the host. Normally, this is eth0, but if there is - no ethernet card on the host, then you will likely get strange error - messages when you bring the device up inside UML. - - - To use it, run two UMLs with - - - eth0=mcast - - - - - on their command lines. Log in, configure the ethernet device in each - machine with different IP addresses: - - - UML1# ifconfig eth0 192.168.0.254 - - - - - - - UML2# ifconfig eth0 192.168.0.253 - - - - - and they should be able to talk to each other. - - The full set of command line options for this transport are - - - - ethn=mcast,ethernet address,multicast - address,multicast port,ttl - - - - - Harald's original README is here and explains these in detail, as well as - some other issues. - - - - 66..66.. TTUUNN//TTAAPP wwiitthh tthhee uummll__nneett hheellppeerr - - TUN/TAP is the preferred mechanism on 2.4 to exchange packets with the - host. The TUN/TAP backend has been in UML since 2.4.9-3um. - - - The easiest way to get up and running is to let the setuid uml_net - helper do the host setup for you. This involves insmod-ing the tun.o - module if necessary, configuring the device, and setting up IP - forwarding, routing, and proxy arp. If you are new to UML networking, - do this first. If you're concerned about the security implications of - the setuid helper, use it to get up and running, then read the next - section to see how to have UML use a preconfigured tap device, which - avoids the use of uml_net. - - - If you specify an IP address for the host side of the device, the - uml_net helper will do all necessary setup on the host - the only - requirement is that TUN/TAP be available, either built in to the host - kernel or as the tun.o module. - - The format of the command line switch to attach a device to a TUN/TAP - device is - - - eth =tuntap,,, - - - - - For example, this argument will attach the UML's eth0 to the next - available tap device and assign an ethernet address to it based on its - IP address - - - eth0=tuntap,,,192.168.0.254 - - - - - - - Note that the IP address that must be used for the eth device inside - UML is fixed by the routing and proxy arp that is set up on the - TUN/TAP device on the host. You can use a different one, but it won't - work because reply packets won't reach the UML. This is a feature. - It prevents a nasty UML user from doing things like setting the UML IP - to the same as the network's nameserver or mail server. - - - There are a couple potential problems with running the TUN/TAP - transport on a 2.4 host kernel - - +o TUN/TAP seems not to work on 2.4.3 and earlier. Upgrade the host - kernel or use the ethertap transport. - - +o With an upgraded kernel, TUN/TAP may fail with - - - File descriptor in bad state - - - - - This is due to a header mismatch between the upgraded kernel and the - kernel that was originally installed on the machine. The fix is to - make sure that /usr/src/linux points to the headers for the running - kernel. - - These were pointed out by Tim Robinson in - name="this uml- - user post"> . - - - - 66..77.. TTUUNN//TTAAPP wwiitthh aa pprreeccoonnffiigguurreedd ttaapp ddeevviiccee - - If you prefer not to have UML use uml_net (which is somewhat - insecure), with UML 2.4.17-11, you can set up a TUN/TAP device - beforehand. The setup needs to be done as root, but once that's done, - there is no need for root assistance. Setting up the device is done - as follows: - - +o Create the device with tunctl (available from the UML utilities - tarball) - - - - - host# tunctl -u uid - - - - - where uid is the user id or username that UML will be run as. This - will tell you what device was created. - - +o Configure the device IP (change IP addresses and device name to - suit) - - - - - host# ifconfig tap0 192.168.0.254 up - - - - - - +o Set up routing and arping if desired - this is my recipe, there are - other ways of doing the same thing - - - host# - bash -c 'echo 1 > /proc/sys/net/ipv4/ip_forward' - - host# - route add -host 192.168.0.253 dev tap0 - - - - - - - host# - bash -c 'echo 1 > /proc/sys/net/ipv4/conf/tap0/proxy_arp' - - - - - - - host# - arp -Ds 192.168.0.253 eth0 pub - - - - - Note that this must be done every time the host boots - this configu- - ration is not stored across host reboots. So, it's probably a good - idea to stick it in an rc file. An even better idea would be a little - utility which reads the information from a config file and sets up - devices at boot time. - - +o Rather than using up two IPs and ARPing for one of them, you can - also provide direct access to your LAN by the UML by using a - bridge. - - - host# - brctl addbr br0 - - - - - - - host# - ifconfig eth0 0.0.0.0 promisc up - - - - - - - host# - ifconfig tap0 0.0.0.0 promisc up - - - - - - - host# - ifconfig br0 192.168.0.1 netmask 255.255.255.0 up - - - - - - - - host# - brctl stp br0 off - - - - - - - host# - brctl setfd br0 1 - - - - - - - host# - brctl sethello br0 1 - - - - - - - host# - brctl addif br0 eth0 - - - - - - - host# - brctl addif br0 tap0 - - - - - Note that 'br0' should be setup using ifconfig with the existing IP - address of eth0, as eth0 no longer has its own IP. - - +o - - - Also, the /dev/net/tun device must be writable by the user running - UML in order for the UML to use the device that's been configured - for it. The simplest thing to do is - - - host# chmod 666 /dev/net/tun - - - - - Making it world-writable looks bad, but it seems not to be - exploitable as a security hole. However, it does allow anyone to cre- - ate useless tap devices (useless because they can't configure them), - which is a DOS attack. A somewhat more secure alternative would to be - to create a group containing all the users who have preconfigured tap - devices and chgrp /dev/net/tun to that group with mode 664 or 660. - - - +o Once the device is set up, run UML with 'eth0=tuntap,device name' - (i.e. 'eth0=tuntap,tap0') on the command line (or do it with the - mconsole config command). - - +o Bring the eth device up in UML and you're in business. - - If you don't want that tap device any more, you can make it non- - persistent with - - - host# tunctl -d tap device - - - - - Finally, tunctl has a -b (for brief mode) switch which causes it to - output only the name of the tap device it created. This makes it - suitable for capture by a script: - - - host# TAP=`tunctl -u 1000 -b` - - - - - - - 66..88.. EEtthheerrttaapp - - Ethertap is the general mechanism on 2.2 for userspace processes to - exchange packets with the kernel. - - - - To use this transport, you need to describe the virtual network device - on the UML command line. The general format for this is - - - eth =ethertap, , , - - - - - So, the previous example - - - eth0=ethertap,tap0,fe:fd:0:0:0:1,192.168.0.254 - - - - - attaches the UML eth0 device to the host /dev/tap0, assigns it the - ethernet address fe:fd:0:0:0:1, and assigns the IP address - 192.168.0.254 to the tap device. - - - - The tap device is mandatory, but the others are optional. If the - ethernet address is omitted, one will be assigned to it. - - - The presence of the tap IP address will cause the helper to run and do - whatever host setup is needed to allow the virtual machine to - communicate with the outside world. If you're not sure you know what - you're doing, this is the way to go. - - - If it is absent, then you must configure the tap device and whatever - arping and routing you will need on the host. However, even in this - case, the uml_net helper still needs to be in your path and it must be - setuid root if you're not running UML as root. This is because the - tap device doesn't support SIGIO, which UML needs in order to use - something as a source of input. So, the helper is used as a - convenient asynchronous IO thread. - - If you're using the uml_net helper, you can ignore the following host - setup - uml_net will do it for you. You just need to make sure you - have ethertap available, either built in to the host kernel or - available as a module. - - - If you want to set things up yourself, you need to make sure that the - appropriate /dev entry exists. If it doesn't, become root and create - it as follows: - - - mknod /dev/tap c 36 + 16 - - - - - For example, this is how to create /dev/tap0: - - - mknod /dev/tap0 c 36 0 + 16 - - - - - You also need to make sure that the host kernel has ethertap support. - If ethertap is enabled as a module, you apparently need to insmod - ethertap once for each ethertap device you want to enable. So, - - - host# - insmod ethertap - - - - - will give you the tap0 interface. To get the tap1 interface, you need - to run - - - host# - insmod ethertap unit=1 -o ethertap1 - - - - - - - - 66..99.. TThhee sswwiittcchh ddaaeemmoonn - - NNoottee: This is the daemon formerly known as uml_router, but which was - renamed so the network weenies of the world would stop growling at me. - - - The switch daemon, uml_switch, provides a mechanism for creating a - totally virtual network. By default, it provides no connection to the - host network (but see -tap, below). - - - The first thing you need to do is run the daemon. Running it with no - arguments will make it listen on a default pair of unix domain - sockets. - - - If you want it to listen on a different pair of sockets, use - - - -unix control socket data socket - - - - - - If you want it to act as a hub rather than a switch, use - - - -hub - - - - - - If you want the switch to be connected to host networking (allowing - the umls to get access to the outside world through the host), use - - - -tap tap0 - - - - - - Note that the tap device must be preconfigured (see "TUN/TAP with a - preconfigured tap device", above). If you're using a different tap - device than tap0, specify that instead of tap0. - - - uml_switch can be backgrounded as follows - - - host% - uml_switch [ options ] < /dev/null > /dev/null - - - - - The reason it doesn't background by default is that it listens to - stdin for EOF. When it sees that, it exits. - - - The general format of the kernel command line switch is - - - - ethn=daemon,ethernet address,socket - type,control socket,data socket - - - - - You can leave off everything except the 'daemon'. You only need to - specify the ethernet address if the one that will be assigned to it - isn't acceptable for some reason. The rest of the arguments describe - how to communicate with the daemon. You should only specify them if - you told the daemon to use different sockets than the default. So, if - you ran the daemon with no arguments, running the UML on the same - machine with - eth0=daemon - - - - - will cause the eth0 driver to attach itself to the daemon correctly. - - - - 66..1100.. SSlliipp - - Slip is another, less general, mechanism for a process to communicate - with the host networking. In contrast to the ethertap interface, - which exchanges ethernet frames with the host and can be used to - transport any higher-level protocol, it can only be used to transport - IP. - - - The general format of the command line switch is - - - - ethn=slip,slip IP - - - - - The slip IP argument is the IP address that will be assigned to the - host end of the slip device. If it is specified, the helper will run - and will set up the host so that the virtual machine can reach it and - the rest of the network. - - - There are some oddities with this interface that you should be aware - of. You should only specify one slip device on a given virtual - machine, and its name inside UML will be 'umn', not 'eth0' or whatever - you specified on the command line. These problems will be fixed at - some point. - - - - 66..1111.. SSlliirrpp - - slirp uses an external program, usually /usr/bin/slirp, to provide IP - only networking connectivity through the host. This is similar to IP - masquerading with a firewall, although the translation is performed in - user-space, rather than by the kernel. As slirp does not set up any - interfaces on the host, or changes routing, slirp does not require - root access or setuid binaries on the host. - - - The general format of the command line switch for slirp is: - - - - ethn=slirp,ethernet address,slirp path - - - - - The ethernet address is optional, as UML will set up the interface - with an ethernet address based upon the initial IP address of the - interface. The slirp path is generally /usr/bin/slirp, although it - will depend on distribution. - - - The slirp program can have a number of options passed to the command - line and we can't add them to the UML command line, as they will be - parsed incorrectly. Instead, a wrapper shell script can be written or - the options inserted into the /.slirprc file. More information on - all of the slirp options can be found in its man pages. - - - The eth0 interface on UML should be set up with the IP 10.2.0.15, - although you can use anything as long as it is not used by a network - you will be connecting to. The default route on UML should be set to - use - - - UML# - route add default dev eth0 - - - - - slirp provides a number of useful IP addresses which can be used by - UML, such as 10.0.2.3 which is an alias for the DNS server specified - in /etc/resolv.conf on the host or the IP given in the 'dns' option - for slirp. - - - Even with a baudrate setting higher than 115200, the slirp connection - is limited to 115200. If you need it to go faster, the slirp binary - needs to be compiled with FULL_BOLT defined in config.h. - - - - 66..1122.. ppccaapp - - The pcap transport is attached to a UML ethernet device on the command - line or with uml_mconsole with the following syntax: - - - - ethn=pcap,host interface,filter - expression,option1,option2 - - - - - The expression and options are optional. - - - The interface is whatever network device on the host you want to - sniff. The expression is a pcap filter expression, which is also what - tcpdump uses, so if you know how to specify tcpdump filters, you will - use the same expressions here. The options are up to two of - 'promisc', control whether pcap puts the host interface into - promiscuous mode. 'optimize' and 'nooptimize' control whether the pcap - expression optimizer is used. - - - Example: - - - - eth0=pcap,eth0,tcp - - eth1=pcap,eth0,!tcp - - - - will cause the UML eth0 to emit all tcp packets on the host eth0 and - the UML eth1 to emit all non-tcp packets on the host eth0. - - - - 66..1133.. SSeettttiinngg uupp tthhee hhoosstt yyoouurrsseellff - - If you don't specify an address for the host side of the ethertap or - slip device, UML won't do any setup on the host. So this is what is - needed to get things working (the examples use a host-side IP of - 192.168.0.251 and a UML-side IP of 192.168.0.250 - adjust to suit your - own network): - - +o The device needs to be configured with its IP address. Tap devices - are also configured with an mtu of 1484. Slip devices are - configured with a point-to-point address pointing at the UML ip - address. - - - host# ifconfig tap0 arp mtu 1484 192.168.0.251 up - - - - - - - host# - ifconfig sl0 192.168.0.251 pointopoint 192.168.0.250 up - - - - - - +o If a tap device is being set up, a route is set to the UML IP. - - - UML# route add -host 192.168.0.250 gw 192.168.0.251 - - - - - - +o To allow other hosts on your network to see the virtual machine, - proxy arp is set up for it. - - - host# arp -Ds 192.168.0.250 eth0 pub - - - - - - +o Finally, the host is set up to route packets. - - - host# echo 1 > /proc/sys/net/ipv4/ip_forward - - - - - - - - - - - 77.. SShhaarriinngg FFiilleessyysstteemmss bbeettwweeeenn VViirrttuuaall MMaacchhiinneess - - - - - 77..11.. AA wwaarrnniinngg - - Don't attempt to share filesystems simply by booting two UMLs from the - same file. That's the same thing as booting two physical machines - from a shared disk. It will result in filesystem corruption. - - - - 77..22.. UUssiinngg llaayyeerreedd bblloocckk ddeevviicceess - - The way to share a filesystem between two virtual machines is to use - the copy-on-write (COW) layering capability of the ubd block driver. - As of 2.4.6-2um, the driver supports layering a read-write private - device over a read-only shared device. A machine's writes are stored - in the private device, while reads come from either device - the - private one if the requested block is valid in it, the shared one if - not. Using this scheme, the majority of data which is unchanged is - shared between an arbitrary number of virtual machines, each of which - has a much smaller file containing the changes that it has made. With - a large number of UMLs booting from a large root filesystem, this - leads to a huge disk space saving. It will also help performance, - since the host will be able to cache the shared data using a much - smaller amount of memory, so UML disk requests will be served from the - host's memory rather than its disks. - - - - - To add a copy-on-write layer to an existing block device file, simply - add the name of the COW file to the appropriate ubd switch: - - - ubd0=root_fs_cow,root_fs_debian_22 - - - - - where 'root_fs_cow' is the private COW file and 'root_fs_debian_22' is - the existing shared filesystem. The COW file need not exist. If it - doesn't, the driver will create and initialize it. Once the COW file - has been initialized, it can be used on its own on the command line: - - - ubd0=root_fs_cow - - - - - The name of the backing file is stored in the COW file header, so it - would be redundant to continue specifying it on the command line. - - - - 77..33.. NNoottee!! - - When checking the size of the COW file in order to see the gobs of - space that you're saving, make sure you use 'ls -ls' to see the actual - disk consumption rather than the length of the file. The COW file is - sparse, so the length will be very different from the disk usage. - Here is a 'ls -l' of a COW file and backing file from one boot and - shutdown: - host% ls -l cow.debian debian2.2 - -rw-r--r-- 1 jdike jdike 492504064 Aug 6 21:16 cow.debian - -rwxrw-rw- 1 jdike jdike 537919488 Aug 6 20:42 debian2.2 - - - - - Doesn't look like much saved space, does it? Well, here's 'ls -ls': - - - host% ls -ls cow.debian debian2.2 - 880 -rw-r--r-- 1 jdike jdike 492504064 Aug 6 21:16 cow.debian - 525832 -rwxrw-rw- 1 jdike jdike 537919488 Aug 6 20:42 debian2.2 - - - - - Now, you can see that the COW file has less than a meg of disk, rather - than 492 meg. - - - - 77..44.. AAnnootthheerr wwaarrnniinngg - - Once a filesystem is being used as a readonly backing file for a COW - file, do not boot directly from it or modify it in any way. Doing so - will invalidate any COW files that are using it. The mtime and size - of the backing file are stored in the COW file header at its creation, - and they must continue to match. If they don't, the driver will - refuse to use the COW file. - - - - - If you attempt to evade this restriction by changing either the - backing file or the COW header by hand, you will get a corrupted - filesystem. - - - - - Among other things, this means that upgrading the distribution in a - backing file and expecting that all of the COW files using it will see - the upgrade will not work. - - - - - 77..55.. uummll__mmoooo :: MMeerrggiinngg aa CCOOWW ffiillee wwiitthh iittss bbaacckkiinngg ffiillee - - Depending on how you use UML and COW devices, it may be advisable to - merge the changes in the COW file into the backing file every once in - a while. - - - - - The utility that does this is uml_moo. Its usage is - - - host% uml_moo COW file new backing file - - - - - There's no need to specify the backing file since that information is - already in the COW file header. If you're paranoid, boot the new - merged file, and if you're happy with it, move it over the old backing - file. - - - - - uml_moo creates a new backing file by default as a safety measure. It - also has a destructive merge option which will merge the COW file - directly into its current backing file. This is really only usable - when the backing file only has one COW file associated with it. If - there are multiple COWs associated with a backing file, a -d merge of - one of them will invalidate all of the others. However, it is - convenient if you're short of disk space, and it should also be - noticeably faster than a non-destructive merge. - - - - - uml_moo is installed with the UML deb and RPM. If you didn't install - UML from one of those packages, you can also get it from the UML - utilities tar file in tools/moo. - - - - - - - - - 88.. CCrreeaattiinngg ffiilleessyysstteemmss - - - You may want to create and mount new UML filesystems, either because - your root filesystem isn't large enough or because you want to use a - filesystem other than ext2. - - - This was written on the occasion of reiserfs being included in the - 2.4.1 kernel pool, and therefore the 2.4.1 UML, so the examples will - talk about reiserfs. This information is generic, and the examples - should be easy to translate to the filesystem of your choice. - - - 88..11.. CCrreeaattee tthhee ffiilleessyysstteemm ffiillee - - dd is your friend. All you need to do is tell dd to create an empty - file of the appropriate size. I usually make it sparse to save time - and to avoid allocating disk space until it's actually used. For - example, the following command will create a sparse 100 meg file full - of zeroes. - - - host% - dd if=/dev/zero of=new_filesystem seek=100 count=1 bs=1M - - - - - - - 88..22.. AAssssiiggnn tthhee ffiillee ttoo aa UUMMLL ddeevviiccee - - Add an argument like the following to the UML command line: - - ubd4=new_filesystem - - - - - making sure that you use an unassigned ubd device number. - - - - 88..33.. CCrreeaattiinngg aanndd mmoouunnttiinngg tthhee ffiilleessyysstteemm - - Make sure that the filesystem is available, either by being built into - the kernel, or available as a module, then boot up UML and log in. If - the root filesystem doesn't have the filesystem utilities (mkfs, fsck, - etc), then get them into UML by way of the net or hostfs. - - - Make the new filesystem on the device assigned to the new file: - - - host# mkreiserfs /dev/ubd/4 - - - <----------- MKREISERFSv2 -----------> - - ReiserFS version 3.6.25 - Block size 4096 bytes - Block count 25856 - Used blocks 8212 - Journal - 8192 blocks (18-8209), journal header is in block 8210 - Bitmaps: 17 - Root block 8211 - Hash function "r5" - ATTENTION: ALL DATA WILL BE LOST ON '/dev/ubd/4'! (y/n)y - journal size 8192 (from 18) - Initializing journal - 0%....20%....40%....60%....80%....100% - Syncing..done. - - - - - Now, mount it: - - - UML# - mount /dev/ubd/4 /mnt - - - - - and you're in business. - - - - - - - - - - 99.. HHoosstt ffiillee aacccceessss - - - If you want to access files on the host machine from inside UML, you - can treat it as a separate machine and either nfs mount directories - from the host or copy files into the virtual machine with scp or rcp. - However, since UML is running on the host, it can access those - files just like any other process and make them available inside the - virtual machine without needing to use the network. - - - This is now possible with the hostfs virtual filesystem. With it, you - can mount a host directory into the UML filesystem and access the - files contained in it just as you would on the host. - - - 99..11.. UUssiinngg hhoossttffss - - To begin with, make sure that hostfs is available inside the virtual - machine with - - - UML# cat /proc/filesystems - - - - . hostfs should be listed. If it's not, either rebuild the kernel - with hostfs configured into it or make sure that hostfs is built as a - module and available inside the virtual machine, and insmod it. - - - Now all you need to do is run mount: - - - UML# mount none /mnt/host -t hostfs - - - - - will mount the host's / on the virtual machine's /mnt/host. - - - If you don't want to mount the host root directory, then you can - specify a subdirectory to mount with the -o switch to mount: - - - UML# mount none /mnt/home -t hostfs -o /home - - - - - will mount the hosts's /home on the virtual machine's /mnt/home. - - - - 99..22.. hhoossttffss aass tthhee rroooott ffiilleessyysstteemm - - It's possible to boot from a directory hierarchy on the host using - hostfs rather than using the standard filesystem in a file. - - To start, you need that hierarchy. The easiest way is to loop mount - an existing root_fs file: - - - host# mount root_fs uml_root_dir -o loop - - - - - You need to change the filesystem type of / in etc/fstab to be - 'hostfs', so that line looks like this: - - /dev/ubd/0 / hostfs defaults 1 1 - - - - - Then you need to chown to yourself all the files in that directory - that are owned by root. This worked for me: - - - host# find . -uid 0 -exec chown jdike {} \; - - - - - Next, make sure that your UML kernel has hostfs compiled in, not as a - module. Then run UML with the boot device pointing at that directory: - - - ubd0=/path/to/uml/root/directory - - - - - UML should then boot as it does normally. - - - 99..33.. BBuuiillddiinngg hhoossttffss - - If you need to build hostfs because it's not in your kernel, you have - two choices: - - - - +o Compiling hostfs into the kernel: - - - Reconfigure the kernel and set the 'Host filesystem' option under - - - +o Compiling hostfs as a module: - - - Reconfigure the kernel and set the 'Host filesystem' option under - be in arch/um/fs/hostfs/hostfs.o. Install that in - /lib/modules/`uname -r`/fs in the virtual machine, boot it up, and - - - UML# insmod hostfs - - - - - - - - - - - - - 1100.. TThhee MMaannaaggeemmeenntt CCoonnssoollee - - - - The UML management console is a low-level interface to the kernel, - somewhat like the i386 SysRq interface. Since there is a full-blown - operating system under UML, there is much greater flexibility possible - than with the SysRq mechanism. - - - There are a number of things you can do with the mconsole interface: - - +o get the kernel version - - +o add and remove devices - - +o halt or reboot the machine - - +o Send SysRq commands - - +o Pause and resume the UML - - - You need the mconsole client (uml_mconsole) which is present in CVS - (/tools/mconsole) in 2.4.5-9um and later, and will be in the RPM in - 2.4.6. - - - You also need CONFIG_MCONSOLE (under 'General Setup') enabled in UML. - When you boot UML, you'll see a line like: - - - mconsole initialized on /home/jdike/.uml/umlNJ32yL/mconsole - - - - - If you specify a unique machine id one the UML command line, i.e. - - - umid=debian - - - - - you'll see this - - - mconsole initialized on /home/jdike/.uml/debian/mconsole - - - - - That file is the socket that uml_mconsole will use to communicate with - UML. Run it with either the umid or the full path as its argument: - - - host% uml_mconsole debian - - - - - or - - - host% uml_mconsole /home/jdike/.uml/debian/mconsole - - - - - You'll get a prompt, at which you can run one of these commands: - - +o version - - +o halt - - +o reboot - - +o config - - +o remove - - +o sysrq - - +o help - - +o cad - - +o stop - - +o go - - - 1100..11.. vveerrssiioonn - - This takes no arguments. It prints the UML version. - - - (mconsole) version - OK Linux usermode 2.4.5-9um #1 Wed Jun 20 22:47:08 EDT 2001 i686 - - - - - There are a couple actual uses for this. It's a simple no-op which - can be used to check that a UML is running. It's also a way of - sending an interrupt to the UML. This is sometimes useful on SMP - hosts, where there's a bug which causes signals to UML to be lost, - often causing it to appear to hang. Sending such a UML the mconsole - version command is a good way to 'wake it up' before networking has - been enabled, as it does not do anything to the function of the UML. - - - - 1100..22.. hhaalltt aanndd rreebboooott - - These take no arguments. They shut the machine down immediately, with - no syncing of disks and no clean shutdown of userspace. So, they are - pretty close to crashing the machine. - - - (mconsole) halt - OK - - - - - - - 1100..33.. ccoonnffiigg - - "config" adds a new device to the virtual machine. Currently the ubd - and network drivers support this. It takes one argument, which is the - device to add, with the same syntax as the kernel command line. - - - - - (mconsole) - config ubd3=/home/jdike/incoming/roots/root_fs_debian22 - - OK - (mconsole) config eth1=mcast - OK - - - - - - - 1100..44.. rreemmoovvee - - "remove" deletes a device from the system. Its argument is just the - name of the device to be removed. The device must be idle in whatever - sense the driver considers necessary. In the case of the ubd driver, - the removed block device must not be mounted, swapped on, or otherwise - open, and in the case of the network driver, the device must be down. - - - (mconsole) remove ubd3 - OK - (mconsole) remove eth1 - OK - - - - - - - 1100..55.. ssyyssrrqq - - This takes one argument, which is a single letter. It calls the - generic kernel's SysRq driver, which does whatever is called for by - that argument. See the SysRq documentation in Documentation/sysrq.txt - in your favorite kernel tree to see what letters are valid and what - they do. - - - - 1100..66.. hheellpp - - "help" returns a string listing the valid commands and what each one - does. - - - - 1100..77.. ccaadd - - This invokes the Ctl-Alt-Del action on init. What exactly this ends - up doing is up to /etc/inittab. Normally, it reboots the machine. - With UML, this is usually not desired, so if a halt would be better, - then find the section of inittab that looks like this - - - # What to do when CTRL-ALT-DEL is pressed. - ca:12345:ctrlaltdel:/sbin/shutdown -t1 -a -r now - - - - - and change the command to halt. - - - - 1100..88.. ssttoopp - - This puts the UML in a loop reading mconsole requests until a 'go' - mconsole command is received. This is very useful for making backups - of UML filesystems, as the UML can be stopped, then synced via 'sysrq - s', so that everything is written to the filesystem. You can then copy - the filesystem and then send the UML 'go' via mconsole. - - - Note that a UML running with more than one CPU will have problems - after you send the 'stop' command, as only one CPU will be held in a - mconsole loop and all others will continue as normal. This is a bug, - and will be fixed. - - - - 1100..99.. ggoo - - This resumes a UML after being paused by a 'stop' command. Note that - when the UML has resumed, TCP connections may have timed out and if - the UML is paused for a long period of time, crond might go a little - crazy, running all the jobs it didn't do earlier. - - - - - - - - - 1111.. KKeerrnneell ddeebbuuggggiinngg - - - NNoottee:: The interface that makes debugging, as described here, possible - is present in 2.4.0-test6 kernels and later. - - - Since the user-mode kernel runs as a normal Linux process, it is - possible to debug it with gdb almost like any other process. It is - slightly different because the kernel's threads are already being - ptraced for system call interception, so gdb can't ptrace them. - However, a mechanism has been added to work around that problem. - - - In order to debug the kernel, you need build it from source. See - ``Compiling the kernel and modules'' for information on doing that. - Make sure that you enable CONFIG_DEBUGSYM and CONFIG_PT_PROXY during - the config. These will compile the kernel with -g, and enable the - ptrace proxy so that gdb works with UML, respectively. - - - - - 1111..11.. SSttaarrttiinngg tthhee kkeerrnneell uunnddeerr ggddbb - - You can have the kernel running under the control of gdb from the - beginning by putting 'debug' on the command line. You will get an - xterm with gdb running inside it. The kernel will send some commands - to gdb which will leave it stopped at the beginning of start_kernel. - At this point, you can get things going with 'next', 'step', or - 'cont'. - - - There is a transcript of a debugging session here , with breakpoints being set in the scheduler and in an - interrupt handler. - 1111..22.. EExxaammiinniinngg sslleeeeppiinngg pprroocceesssseess - - Not every bug is evident in the currently running process. Sometimes, - processes hang in the kernel when they shouldn't because they've - deadlocked on a semaphore or something similar. In this case, when - you ^C gdb and get a backtrace, you will see the idle thread, which - isn't very relevant. - - - What you want is the stack of whatever process is sleeping when it - shouldn't be. You need to figure out which process that is, which is - generally fairly easy. Then you need to get its host process id, - which you can do either by looking at ps on the host or at - task.thread.extern_pid in gdb. - - - Now what you do is this: - - +o detach from the current thread - - - (UML gdb) det - - - - - - +o attach to the thread you are interested in - - - (UML gdb) att - - - - - - +o look at its stack and anything else of interest - - - (UML gdb) bt - - - - - Note that you can't do anything at this point that requires that a - process execute, e.g. calling a function - - +o when you're done looking at that process, reattach to the current - thread and continue it - - - (UML gdb) - att 1 - - - - - - - (UML gdb) - c - - - - - Here, specifying any pid which is not the process id of a UML thread - will cause gdb to reattach to the current thread. I commonly use 1, - but any other invalid pid would work. - - - - 1111..33.. RRuunnnniinngg dddddd oonn UUMMLL - - ddd works on UML, but requires a special kludge. The process goes - like this: - - +o Start ddd - - - host% ddd linux - - - - - - +o With ps, get the pid of the gdb that ddd started. You can ask the - gdb to tell you, but for some reason that confuses things and - causes a hang. - - +o run UML with 'debug=parent gdb-pid=' added to the command line - - it will just sit there after you hit return - - +o type 'att 1' to the ddd gdb and you will see something like - - - 0xa013dc51 in __kill () - - - (gdb) - - - - - - +o At this point, type 'c', UML will boot up, and you can use ddd just - as you do on any other process. - - - - 1111..44.. DDeebbuuggggiinngg mmoodduulleess - - gdb has support for debugging code which is dynamically loaded into - the process. This support is what is needed to debug kernel modules - under UML. - - - Using that support is somewhat complicated. You have to tell gdb what - object file you just loaded into UML and where in memory it is. Then, - it can read the symbol table, and figure out where all the symbols are - from the load address that you provided. It gets more interesting - when you load the module again (i.e. after an rmmod). You have to - tell gdb to forget about all its symbols, including the main UML ones - for some reason, then load then all back in again. - - - There's an easy way and a hard way to do this. The easy way is to use - the umlgdb expect script written by Chandan Kudige. It basically - automates the process for you. - - - First, you must tell it where your modules are. There is a list in - the script that looks like this: - set MODULE_PATHS { - "fat" "/usr/src/uml/linux-2.4.18/fs/fat/fat.o" - "isofs" "/usr/src/uml/linux-2.4.18/fs/isofs/isofs.o" - "minix" "/usr/src/uml/linux-2.4.18/fs/minix/minix.o" - } - - - - - You change that to list the names and paths of the modules that you - are going to debug. Then you run it from the toplevel directory of - your UML pool and it basically tells you what to do: - - - - - ******** GDB pid is 21903 ******** - Start UML as: ./linux debug gdb-pid=21903 - - - - GNU gdb 5.0rh-5 Red Hat Linux 7.1 - Copyright 2001 Free Software Foundation, Inc. - GDB is free software, covered by the GNU General Public License, and you are - welcome to change it and/or distribute copies of it under certain conditions. - Type "show copying" to see the conditions. - There is absolutely no warranty for GDB. Type "show warranty" for details. - This GDB was configured as "i386-redhat-linux"... - (gdb) b sys_init_module - Breakpoint 1 at 0xa0011923: file module.c, line 349. - (gdb) att 1 - - - - - After you run UML and it sits there doing nothing, you hit return at - the 'att 1' and continue it: - - - Attaching to program: /home/jdike/linux/2.4/um/./linux, process 1 - 0xa00f4221 in __kill () - (UML gdb) c - Continuing. - - - - - At this point, you debug normally. When you insmod something, the - expect magic will kick in and you'll see something like: - - - - - - - - - - - - - - - - - - *** Module hostfs loaded *** - Breakpoint 1, sys_init_module (name_user=0x805abb0 "hostfs", - mod_user=0x8070e00) at module.c:349 - 349 char *name, *n_name, *name_tmp = NULL; - (UML gdb) finish - Run till exit from #0 sys_init_module (name_user=0x805abb0 "hostfs", - mod_user=0x8070e00) at module.c:349 - 0xa00e2e23 in execute_syscall (r=0xa8140284) at syscall_kern.c:411 - 411 else res = EXECUTE_SYSCALL(syscall, regs); - Value returned is $1 = 0 - (UML gdb) - p/x (int)module_list + module_list->size_of_struct - - $2 = 0xa9021054 - (UML gdb) symbol-file ./linux - Load new symbol table from "./linux"? (y or n) y - Reading symbols from ./linux... - done. - (UML gdb) - add-symbol-file /home/jdike/linux/2.4/um/arch/um/fs/hostfs/hostfs.o 0xa9021054 - - add symbol table from file "/home/jdike/linux/2.4/um/arch/um/fs/hostfs/hostfs.o" at - .text_addr = 0xa9021054 - (y or n) y - - Reading symbols from /home/jdike/linux/2.4/um/arch/um/fs/hostfs/hostfs.o... - done. - (UML gdb) p *module_list - $1 = {size_of_struct = 84, next = 0xa0178720, name = 0xa9022de0 "hostfs", - size = 9016, uc = {usecount = {counter = 0}, pad = 0}, flags = 1, - nsyms = 57, ndeps = 0, syms = 0xa9023170, deps = 0x0, refs = 0x0, - init = 0xa90221f0 , cleanup = 0xa902222c , - ex_table_start = 0x0, ex_table_end = 0x0, persist_start = 0x0, - persist_end = 0x0, can_unload = 0, runsize = 0, kallsyms_start = 0x0, - kallsyms_end = 0x0, - archdata_start = 0x1b855
, - archdata_end = 0xe5890000
, - kernel_data = 0xf689c35d
} - >> Finished loading symbols for hostfs ... - - - - - That's the easy way. It's highly recommended. The hard way is - described below in case you're interested in what's going on. - - - Boot the kernel under the debugger and load the module with insmod or - modprobe. With gdb, do: - - - (UML gdb) p module_list - - - - - This is a list of modules that have been loaded into the kernel, with - the most recently loaded module first. Normally, the module you want - is at module_list. If it's not, walk down the next links, looking at - the name fields until find the module you want to debug. Take the - address of that structure, and add module.size_of_struct (which in - 2.4.10 kernels is 96 (0x60)) to it. Gdb can make this hard addition - for you :-): - - - - (UML gdb) - printf "%#x\n", (int)module_list module_list->size_of_struct - - - - - The offset from the module start occasionally changes (before 2.4.0, - it was module.size_of_struct + 4), so it's a good idea to check the - init and cleanup addresses once in a while, as describe below. Now - do: - - - (UML gdb) - add-symbol-file /path/to/module/on/host that_address - - - - - Tell gdb you really want to do it, and you're in business. - - - If there's any doubt that you got the offset right, like breakpoints - appear not to work, or they're appearing in the wrong place, you can - check it by looking at the module structure. The init and cleanup - fields should look like: - - - init = 0x588066b0 , cleanup = 0x588066c0 - - - - - with no offsets on the symbol names. If the names are right, but they - are offset, then the offset tells you how much you need to add to the - address you gave to add-symbol-file. - - - When you want to load in a new version of the module, you need to get - gdb to forget about the old one. The only way I've found to do that - is to tell gdb to forget about all symbols that it knows about: - - - (UML gdb) symbol-file - - - - - Then reload the symbols from the kernel binary: - - - (UML gdb) symbol-file /path/to/kernel - - - - - and repeat the process above. You'll also need to re-enable break- - points. They were disabled when you dumped all the symbols because - gdb couldn't figure out where they should go. - - - - 1111..55.. AAttttaacchhiinngg ggddbb ttoo tthhee kkeerrnneell - - If you don't have the kernel running under gdb, you can attach gdb to - it later by sending the tracing thread a SIGUSR1. The first line of - the console output identifies its pid: - tracing thread pid = 20093 - - - - - When you send it the signal: - - - host% kill -USR1 20093 - - - - - you will get an xterm with gdb running in it. - - - If you have the mconsole compiled into UML, then the mconsole client - can be used to start gdb: - - - (mconsole) (mconsole) config gdb=xterm - - - - - will fire up an xterm with gdb running in it. - - - - 1111..66.. UUssiinngg aalltteerrnnaattee ddeebbuuggggeerrss - - UML has support for attaching to an already running debugger rather - than starting gdb itself. This is present in CVS as of 17 Apr 2001. - I sent it to Alan for inclusion in the ac tree, and it will be in my - 2.4.4 release. - - - This is useful when gdb is a subprocess of some UI, such as emacs or - ddd. It can also be used to run debuggers other than gdb on UML. - Below is an example of using strace as an alternate debugger. - - - To do this, you need to get the pid of the debugger and pass it in - with the - - - If you are using gdb under some UI, then tell it to 'att 1', and - you'll find yourself attached to UML. - - - If you are using something other than gdb as your debugger, then - you'll need to get it to do the equivalent of 'att 1' if it doesn't do - it automatically. - - - An example of an alternate debugger is strace. You can strace the - actual kernel as follows: - - +o Run the following in a shell - - - host% - sh -c 'echo pid=$$; echo -n hit return; read x; exec strace -p 1 -o strace.out' - - - - +o Run UML with 'debug' and 'gdb-pid=' with the pid printed out - by the previous command - - +o Hit return in the shell, and UML will start running, and strace - output will start accumulating in the output file. - - Note that this is different from running - - - host% strace ./linux - - - - - That will strace only the main UML thread, the tracing thread, which - doesn't do any of the actual kernel work. It just oversees the vir- - tual machine. In contrast, using strace as described above will show - you the low-level activity of the virtual machine. - - - - - - 1122.. KKeerrnneell ddeebbuuggggiinngg eexxaammpplleess - - 1122..11.. TThhee ccaassee ooff tthhee hhuunngg ffsscckk - - When booting up the kernel, fsck failed, and dropped me into a shell - to fix things up. I ran fsck -y, which hung: - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Setting hostname uml [ OK ] - Checking root filesystem - /dev/fhd0 was not cleanly unmounted, check forced. - Error reading block 86894 (Attempt to read block from filesystem resulted in short read) while reading indirect blocks of inode 19780. - - /dev/fhd0: UNEXPECTED INCONSISTENCY; RUN fsck MANUALLY. - (i.e., without -a or -p options) - [ FAILED ] - - *** An error occurred during the file system check. - *** Dropping you to a shell; the system will reboot - *** when you leave the shell. - Give root password for maintenance - (or type Control-D for normal startup): - - [root@uml /root]# fsck -y /dev/fhd0 - fsck -y /dev/fhd0 - Parallelizing fsck version 1.14 (9-Jan-1999) - e2fsck 1.14, 9-Jan-1999 for EXT2 FS 0.5b, 95/08/09 - /dev/fhd0 contains a file system with errors, check forced. - Pass 1: Checking inodes, blocks, and sizes - Error reading block 86894 (Attempt to read block from filesystem resulted in short read) while reading indirect blocks of inode 19780. Ignore error? yes - - Inode 19780, i_blocks is 1548, should be 540. Fix? yes - - Pass 2: Checking directory structure - Error reading block 49405 (Attempt to read block from filesystem resulted in short read). Ignore error? yes - - Directory inode 11858, block 0, offset 0: directory corrupted - Salvage? yes - - Missing '.' in directory inode 11858. - Fix? yes - - Missing '..' in directory inode 11858. - Fix? yes - - - - - - The standard drill in this sort of situation is to fire up gdb on the - signal thread, which, in this case, was pid 1935. In another window, - I run gdb and attach pid 1935. - - - - - ~/linux/2.3.26/um 1016: gdb linux - GNU gdb 4.17.0.11 with Linux support - Copyright 1998 Free Software Foundation, Inc. - GDB is free software, covered by the GNU General Public License, and you are - welcome to change it and/or distribute copies of it under certain conditions. - Type "show copying" to see the conditions. - There is absolutely no warranty for GDB. Type "show warranty" for details. - This GDB was configured as "i386-redhat-linux"... - - (gdb) att 1935 - Attaching to program `/home/dike/linux/2.3.26/um/linux', Pid 1935 - 0x100756d9 in __wait4 () - - - - - - - Let's see what's currently running: - - - - (gdb) p current_task.pid - $1 = 0 - - - - - - It's the idle thread, which means that fsck went to sleep for some - reason and never woke up. - - - Let's guess that the last process in the process list is fsck: - - - - (gdb) p current_task.prev_task.comm - $13 = "fsck.ext2\000\000\000\000\000\000" - - - - - - It is, so let's see what it thinks it's up to: - - - - (gdb) p current_task.prev_task.thread - $14 = {extern_pid = 1980, tracing = 0, want_tracing = 0, forking = 0, - kernel_stack_page = 0, signal_stack = 1342627840, syscall = {id = 4, args = { - 3, 134973440, 1024, 0, 1024}, have_result = 0, result = 50590720}, - request = {op = 2, u = {exec = {ip = 1350467584, sp = 2952789424}, fork = { - regs = {1350467584, 2952789424, 0 }, sigstack = 0, - pid = 0}, switch_to = 0x507e8000, thread = {proc = 0x507e8000, - arg = 0xaffffdb0, flags = 0, new_pid = 0}, input_request = { - op = 1350467584, fd = -1342177872, proc = 0, pid = 0}}}} - - - - - - The interesting things here are the fact that its .thread.syscall.id - is __NR_write (see the big switch in arch/um/kernel/syscall_kern.c or - the defines in include/asm-um/arch/unistd.h), and that it never - returned. Also, its .request.op is OP_SWITCH (see - arch/um/include/user_util.h). These mean that it went into a write, - and, for some reason, called schedule(). - - - The fact that it never returned from write means that its stack should - be fairly interesting. Its pid is 1980 (.thread.extern_pid). That - process is being ptraced by the signal thread, so it must be detached - before gdb can attach it: - - - - - - - - - - - (gdb) call detach(1980) - - Program received signal SIGSEGV, Segmentation fault. - - The program being debugged stopped while in a function called from GDB. - When the function (detach) is done executing, GDB will silently - stop (instead of continuing to evaluate the expression containing - the function call). - (gdb) call detach(1980) - $15 = 0 - - - - - - The first detach segfaults for some reason, and the second one - succeeds. - - - Now I detach from the signal thread, attach to the fsck thread, and - look at its stack: - - - (gdb) det - Detaching from program: /home/dike/linux/2.3.26/um/linux Pid 1935 - (gdb) att 1980 - Attaching to program `/home/dike/linux/2.3.26/um/linux', Pid 1980 - 0x10070451 in __kill () - (gdb) bt - #0 0x10070451 in __kill () - #1 0x10068ccd in usr1_pid (pid=1980) at process.c:30 - #2 0x1006a03f in _switch_to (prev=0x50072000, next=0x507e8000) - at process_kern.c:156 - #3 0x1006a052 in switch_to (prev=0x50072000, next=0x507e8000, last=0x50072000) - at process_kern.c:161 - #4 0x10001d12 in schedule () at sched.c:777 - #5 0x1006a744 in __down (sem=0x507d241c) at semaphore.c:71 - #6 0x1006aa10 in __down_failed () at semaphore.c:157 - #7 0x1006c5d8 in segv_handler (sc=0x5006e940) at trap_user.c:174 - #8 0x1006c5ec in kern_segv_handler (sig=11) at trap_user.c:182 - #9 - #10 0x10155404 in errno () - #11 0x1006c0aa in segv (address=1342179328, is_write=2) at trap_kern.c:50 - #12 0x1006c5d8 in segv_handler (sc=0x5006eaf8) at trap_user.c:174 - #13 0x1006c5ec in kern_segv_handler (sig=11) at trap_user.c:182 - #14 - #15 0xc0fd in ?? () - #16 0x10016647 in sys_write (fd=3, - buf=0x80b8800
, count=1024) - at read_write.c:159 - #17 0x1006d5b3 in execute_syscall (syscall=4, args=0x5006ef08) - at syscall_kern.c:254 - #18 0x1006af87 in really_do_syscall (sig=12) at syscall_user.c:35 - #19 - #20 0x400dc8b0 in ?? () - - - - - - The interesting things here are : - - +o There are two segfaults on this stack (frames 9 and 14) - - +o The first faulting address (frame 11) is 0x50000800 - - (gdb) p (void *)1342179328 - $16 = (void *) 0x50000800 - - - - - - The initial faulting address is interesting because it is on the idle - thread's stack. I had been seeing the idle thread segfault for no - apparent reason, and the cause looked like stack corruption. In hopes - of catching the culprit in the act, I had turned off all protections - to that stack while the idle thread wasn't running. This apparently - tripped that trap. - - - However, the more immediate problem is that second segfault and I'm - going to concentrate on that. First, I want to see where the fault - happened, so I have to go look at the sigcontent struct in frame 8: - - - - (gdb) up - #1 0x10068ccd in usr1_pid (pid=1980) at process.c:30 - 30 kill(pid, SIGUSR1); - (gdb) - #2 0x1006a03f in _switch_to (prev=0x50072000, next=0x507e8000) - at process_kern.c:156 - 156 usr1_pid(getpid()); - (gdb) - #3 0x1006a052 in switch_to (prev=0x50072000, next=0x507e8000, last=0x50072000) - at process_kern.c:161 - 161 _switch_to(prev, next); - (gdb) - #4 0x10001d12 in schedule () at sched.c:777 - 777 switch_to(prev, next, prev); - (gdb) - #5 0x1006a744 in __down (sem=0x507d241c) at semaphore.c:71 - 71 schedule(); - (gdb) - #6 0x1006aa10 in __down_failed () at semaphore.c:157 - 157 } - (gdb) - #7 0x1006c5d8 in segv_handler (sc=0x5006e940) at trap_user.c:174 - 174 segv(sc->cr2, sc->err & 2); - (gdb) - #8 0x1006c5ec in kern_segv_handler (sig=11) at trap_user.c:182 - 182 segv_handler(sc); - (gdb) p *sc - Cannot access memory at address 0x0. - - - - - That's not very useful, so I'll try a more manual method: - - - (gdb) p *((struct sigcontext *) (&sig + 1)) - $19 = {gs = 0, __gsh = 0, fs = 0, __fsh = 0, es = 43, __esh = 0, ds = 43, - __dsh = 0, edi = 1342179328, esi = 1350378548, ebp = 1342630440, - esp = 1342630420, ebx = 1348150624, edx = 1280, ecx = 0, eax = 0, - trapno = 14, err = 4, eip = 268480945, cs = 35, __csh = 0, eflags = 66118, - esp_at_signal = 1342630420, ss = 43, __ssh = 0, fpstate = 0x0, oldmask = 0, - cr2 = 1280} - - - - The ip is in handle_mm_fault: - - - (gdb) p (void *)268480945 - $20 = (void *) 0x1000b1b1 - (gdb) i sym $20 - handle_mm_fault + 57 in section .text - - - - - - Specifically, it's in pte_alloc: - - - (gdb) i line *$20 - Line 124 of "/home/dike/linux/2.3.26/um/include/asm/pgalloc.h" - starts at address 0x1000b1b1 - and ends at 0x1000b1b7 . - - - - - - To find where in handle_mm_fault this is, I'll jump forward in the - code until I see an address in that procedure: - - - - (gdb) i line *0x1000b1c0 - Line 126 of "/home/dike/linux/2.3.26/um/include/asm/pgalloc.h" - starts at address 0x1000b1b7 - and ends at 0x1000b1c3 . - (gdb) i line *0x1000b1d0 - Line 131 of "/home/dike/linux/2.3.26/um/include/asm/pgalloc.h" - starts at address 0x1000b1d0 - and ends at 0x1000b1da . - (gdb) i line *0x1000b1e0 - Line 61 of "/home/dike/linux/2.3.26/um/include/asm/pgalloc.h" - starts at address 0x1000b1da - and ends at 0x1000b1e1 . - (gdb) i line *0x1000b1f0 - Line 134 of "/home/dike/linux/2.3.26/um/include/asm/pgalloc.h" - starts at address 0x1000b1f0 - and ends at 0x1000b200 . - (gdb) i line *0x1000b200 - Line 135 of "/home/dike/linux/2.3.26/um/include/asm/pgalloc.h" - starts at address 0x1000b200 - and ends at 0x1000b208 . - (gdb) i line *0x1000b210 - Line 139 of "/home/dike/linux/2.3.26/um/include/asm/pgalloc.h" - starts at address 0x1000b210 - and ends at 0x1000b219 . - (gdb) i line *0x1000b220 - Line 1168 of "memory.c" starts at address 0x1000b21e - and ends at 0x1000b222 . - - - - - - Something is apparently wrong with the page tables or vma_structs, so - lets go back to frame 11 and have a look at them: - - - - #11 0x1006c0aa in segv (address=1342179328, is_write=2) at trap_kern.c:50 - 50 handle_mm_fault(current, vma, address, is_write); - (gdb) call pgd_offset_proc(vma->vm_mm, address) - $22 = (pgd_t *) 0x80a548c - - - - - - That's pretty bogus. Page tables aren't supposed to be in process - text or data areas. Let's see what's in the vma: - - - (gdb) p *vma - $23 = {vm_mm = 0x507d2434, vm_start = 0, vm_end = 134512640, - vm_next = 0x80a4f8c, vm_page_prot = {pgprot = 0}, vm_flags = 31200, - vm_avl_height = 2058, vm_avl_left = 0x80a8c94, vm_avl_right = 0x80d1000, - vm_next_share = 0xaffffdb0, vm_pprev_share = 0xaffffe63, - vm_ops = 0xaffffe7a, vm_pgoff = 2952789626, vm_file = 0xafffffec, - vm_private_data = 0x62} - (gdb) p *vma.vm_mm - $24 = {mmap = 0x507d2434, mmap_avl = 0x0, mmap_cache = 0x8048000, - pgd = 0x80a4f8c, mm_users = {counter = 0}, mm_count = {counter = 134904288}, - map_count = 134909076, mmap_sem = {count = {counter = 135073792}, - sleepers = -1342177872, wait = {lock = , - task_list = {next = 0xaffffe63, prev = 0xaffffe7a}, - __magic = -1342177670, __creator = -1342177300}, __magic = 98}, - page_table_lock = {}, context = 138, start_code = 0, end_code = 0, - start_data = 0, end_data = 0, start_brk = 0, brk = 0, start_stack = 0, - arg_start = 0, arg_end = 0, env_start = 0, env_end = 0, rss = 1350381536, - total_vm = 0, locked_vm = 0, def_flags = 0, cpu_vm_mask = 0, swap_cnt = 0, - swap_address = 0, segments = 0x0} - - - - - - This also pretty bogus. With all of the 0x80xxxxx and 0xaffffxxx - addresses, this is looking like a stack was plonked down on top of - these structures. Maybe it's a stack overflow from the next page: - - - - (gdb) p vma - $25 = (struct vm_area_struct *) 0x507d2434 - - - - - - That's towards the lower quarter of the page, so that would have to - have been pretty heavy stack overflow: - - - - - - - - - - - - - - - (gdb) x/100x $25 - 0x507d2434: 0x507d2434 0x00000000 0x08048000 0x080a4f8c - 0x507d2444: 0x00000000 0x080a79e0 0x080a8c94 0x080d1000 - 0x507d2454: 0xaffffdb0 0xaffffe63 0xaffffe7a 0xaffffe7a - 0x507d2464: 0xafffffec 0x00000062 0x0000008a 0x00000000 - 0x507d2474: 0x00000000 0x00000000 0x00000000 0x00000000 - 0x507d2484: 0x00000000 0x00000000 0x00000000 0x00000000 - 0x507d2494: 0x00000000 0x00000000 0x507d2fe0 0x00000000 - 0x507d24a4: 0x00000000 0x00000000 0x00000000 0x00000000 - 0x507d24b4: 0x00000000 0x00000000 0x00000000 0x00000000 - 0x507d24c4: 0x00000000 0x00000000 0x00000000 0x00000000 - 0x507d24d4: 0x00000000 0x00000000 0x00000000 0x00000000 - 0x507d24e4: 0x00000000 0x00000000 0x00000000 0x00000000 - 0x507d24f4: 0x00000000 0x00000000 0x00000000 0x00000000 - 0x507d2504: 0x00000000 0x00000000 0x00000000 0x00000000 - 0x507d2514: 0x00000000 0x00000000 0x00000000 0x00000000 - 0x507d2524: 0x00000000 0x00000000 0x00000000 0x00000000 - 0x507d2534: 0x00000000 0x00000000 0x507d25dc 0x00000000 - 0x507d2544: 0x00000000 0x00000000 0x00000000 0x00000000 - 0x507d2554: 0x00000000 0x00000000 0x00000000 0x00000000 - 0x507d2564: 0x00000000 0x00000000 0x00000000 0x00000000 - 0x507d2574: 0x00000000 0x00000000 0x00000000 0x00000000 - 0x507d2584: 0x00000000 0x00000000 0x00000000 0x00000000 - 0x507d2594: 0x00000000 0x00000000 0x00000000 0x00000000 - 0x507d25a4: 0x00000000 0x00000000 0x00000000 0x00000000 - 0x507d25b4: 0x00000000 0x00000000 0x00000000 0x00000000 - - - - - - It's not stack overflow. The only "stack-like" piece of this data is - the vma_struct itself. - - - At this point, I don't see any avenues to pursue, so I just have to - admit that I have no idea what's going on. What I will do, though, is - stick a trap on the segfault handler which will stop if it sees any - writes to the idle thread's stack. That was the thing that happened - first, and it may be that if I can catch it immediately, what's going - on will be somewhat clearer. - - - 1122..22.. EEppiissooddee 22:: TThhee ccaassee ooff tthhee hhuunngg ffsscckk - - After setting a trap in the SEGV handler for accesses to the signal - thread's stack, I reran the kernel. - - - fsck hung again, this time by hitting the trap: - - - - - - - - - - - - - - - - - Setting hostname uml [ OK ] - Checking root filesystem - /dev/fhd0 contains a file system with errors, check forced. - Error reading block 86894 (Attempt to read block from filesystem resulted in short read) while reading indirect blocks of inode 19780. - - /dev/fhd0: UNEXPECTED INCONSISTENCY; RUN fsck MANUALLY. - (i.e., without -a or -p options) - [ FAILED ] - - *** An error occurred during the file system check. - *** Dropping you to a shell; the system will reboot - *** when you leave the shell. - Give root password for maintenance - (or type Control-D for normal startup): - - [root@uml /root]# fsck -y /dev/fhd0 - fsck -y /dev/fhd0 - Parallelizing fsck version 1.14 (9-Jan-1999) - e2fsck 1.14, 9-Jan-1999 for EXT2 FS 0.5b, 95/08/09 - /dev/fhd0 contains a file system with errors, check forced. - Pass 1: Checking inodes, blocks, and sizes - Error reading block 86894 (Attempt to read block from filesystem resulted in short read) while reading indirect blocks of inode 19780. Ignore error? yes - - Pass 2: Checking directory structure - Error reading block 49405 (Attempt to read block from filesystem resulted in short read). Ignore error? yes - - Directory inode 11858, block 0, offset 0: directory corrupted - Salvage? yes - - Missing '.' in directory inode 11858. - Fix? yes - - Missing '..' in directory inode 11858. - Fix? yes - - Untested (4127) [100fe44c]: trap_kern.c line 31 - - - - - - I need to get the signal thread to detach from pid 4127 so that I can - attach to it with gdb. This is done by sending it a SIGUSR1, which is - caught by the signal thread, which detaches the process: - - - kill -USR1 4127 - - - - - - Now I can run gdb on it: - - - - - - - - - - - - - - ~/linux/2.3.26/um 1034: gdb linux - GNU gdb 4.17.0.11 with Linux support - Copyright 1998 Free Software Foundation, Inc. - GDB is free software, covered by the GNU General Public License, and you are - welcome to change it and/or distribute copies of it under certain conditions. - Type "show copying" to see the conditions. - There is absolutely no warranty for GDB. Type "show warranty" for details. - This GDB was configured as "i386-redhat-linux"... - (gdb) att 4127 - Attaching to program `/home/dike/linux/2.3.26/um/linux', Pid 4127 - 0x10075891 in __libc_nanosleep () - - - - - - The backtrace shows that it was in a write and that the fault address - (address in frame 3) is 0x50000800, which is right in the middle of - the signal thread's stack page: - - - (gdb) bt - #0 0x10075891 in __libc_nanosleep () - #1 0x1007584d in __sleep (seconds=1000000) - at ../sysdeps/unix/sysv/linux/sleep.c:78 - #2 0x1006ce9a in stop () at user_util.c:191 - #3 0x1006bf88 in segv (address=1342179328, is_write=2) at trap_kern.c:31 - #4 0x1006c628 in segv_handler (sc=0x5006eaf8) at trap_user.c:174 - #5 0x1006c63c in kern_segv_handler (sig=11) at trap_user.c:182 - #6 - #7 0xc0fd in ?? () - #8 0x10016647 in sys_write (fd=3, buf=0x80b8800 "R.", count=1024) - at read_write.c:159 - #9 0x1006d603 in execute_syscall (syscall=4, args=0x5006ef08) - at syscall_kern.c:254 - #10 0x1006af87 in really_do_syscall (sig=12) at syscall_user.c:35 - #11 - #12 0x400dc8b0 in ?? () - #13 - #14 0x400dc8b0 in ?? () - #15 0x80545fd in ?? () - #16 0x804daae in ?? () - #17 0x8054334 in ?? () - #18 0x804d23e in ?? () - #19 0x8049632 in ?? () - #20 0x80491d2 in ?? () - #21 0x80596b5 in ?? () - (gdb) p (void *)1342179328 - $3 = (void *) 0x50000800 - - - - - - Going up the stack to the segv_handler frame and looking at where in - the code the access happened shows that it happened near line 110 of - block_dev.c: - - - - - - - - - - (gdb) up - #1 0x1007584d in __sleep (seconds=1000000) - at ../sysdeps/unix/sysv/linux/sleep.c:78 - ../sysdeps/unix/sysv/linux/sleep.c:78: No such file or directory. - (gdb) - #2 0x1006ce9a in stop () at user_util.c:191 - 191 while(1) sleep(1000000); - (gdb) - #3 0x1006bf88 in segv (address=1342179328, is_write=2) at trap_kern.c:31 - 31 KERN_UNTESTED(); - (gdb) - #4 0x1006c628 in segv_handler (sc=0x5006eaf8) at trap_user.c:174 - 174 segv(sc->cr2, sc->err & 2); - (gdb) p *sc - $1 = {gs = 0, __gsh = 0, fs = 0, __fsh = 0, es = 43, __esh = 0, ds = 43, - __dsh = 0, edi = 1342179328, esi = 134973440, ebp = 1342631484, - esp = 1342630864, ebx = 256, edx = 0, ecx = 256, eax = 1024, trapno = 14, - err = 6, eip = 268550834, cs = 35, __csh = 0, eflags = 66070, - esp_at_signal = 1342630864, ss = 43, __ssh = 0, fpstate = 0x0, oldmask = 0, - cr2 = 1342179328} - (gdb) p (void *)268550834 - $2 = (void *) 0x1001c2b2 - (gdb) i sym $2 - block_write + 1090 in section .text - (gdb) i line *$2 - Line 209 of "/home/dike/linux/2.3.26/um/include/asm/arch/string.h" - starts at address 0x1001c2a1 - and ends at 0x1001c2bf . - (gdb) i line *0x1001c2c0 - Line 110 of "block_dev.c" starts at address 0x1001c2bf - and ends at 0x1001c2e3 . - - - - - - Looking at the source shows that the fault happened during a call to - copy_to_user to copy the data into the kernel: - - - 107 count -= chars; - 108 copy_from_user(p,buf,chars); - 109 p += chars; - 110 buf += chars; - - - - - - p is the pointer which must contain 0x50000800, since buf contains - 0x80b8800 (frame 8 above). It is defined as: - - - p = offset + bh->b_data; - - - - - - I need to figure out what bh is, and it just so happens that bh is - passed as an argument to mark_buffer_uptodate and mark_buffer_dirty a - few lines later, so I do a little disassembly: - - - - - (gdb) disas 0x1001c2bf 0x1001c2e0 - Dump of assembler code from 0x1001c2bf to 0x1001c2d0: - 0x1001c2bf : addl %eax,0xc(%ebp) - 0x1001c2c2 : movl 0xfffffdd4(%ebp),%edx - 0x1001c2c8 : btsl $0x0,0x18(%edx) - 0x1001c2cd : btsl $0x1,0x18(%edx) - 0x1001c2d2 : sbbl %ecx,%ecx - 0x1001c2d4 : testl %ecx,%ecx - 0x1001c2d6 : jne 0x1001c2e3 - 0x1001c2d8 : pushl $0x0 - 0x1001c2da : pushl %edx - 0x1001c2db : call 0x1001819c <__mark_buffer_dirty> - End of assembler dump. - - - - - - At that point, bh is in %edx (address 0x1001c2da), which is calculated - at 0x1001c2c2 as %ebp + 0xfffffdd4, so I figure exactly what that is, - taking %ebp from the sigcontext_struct above: - - - (gdb) p (void *)1342631484 - $5 = (void *) 0x5006ee3c - (gdb) p 0x5006ee3c+0xfffffdd4 - $6 = 1342630928 - (gdb) p (void *)$6 - $7 = (void *) 0x5006ec10 - (gdb) p *((void **)$7) - $8 = (void *) 0x50100200 - - - - - - Now, I look at the structure to see what's in it, and particularly, - what its b_data field contains: - - - (gdb) p *((struct buffer_head *)0x50100200) - $13 = {b_next = 0x50289380, b_blocknr = 49405, b_size = 1024, b_list = 0, - b_dev = 15872, b_count = {counter = 1}, b_rdev = 15872, b_state = 24, - b_flushtime = 0, b_next_free = 0x501001a0, b_prev_free = 0x50100260, - b_this_page = 0x501001a0, b_reqnext = 0x0, b_pprev = 0x507fcf58, - b_data = 0x50000800 "", b_page = 0x50004000, - b_end_io = 0x10017f60 , b_dev_id = 0x0, - b_rsector = 98810, b_wait = {lock = , - task_list = {next = 0x50100248, prev = 0x50100248}, __magic = 1343226448, - __creator = 0}, b_kiobuf = 0x0} - - - - - - The b_data field is indeed 0x50000800, so the question becomes how - that happened. The rest of the structure looks fine, so this probably - is not a case of data corruption. It happened on purpose somehow. - - - The b_page field is a pointer to the page_struct representing the - 0x50000000 page. Looking at it shows the kernel's idea of the state - of that page: - - - - (gdb) p *$13.b_page - $17 = {list = {next = 0x50004a5c, prev = 0x100c5174}, mapping = 0x0, - index = 0, next_hash = 0x0, count = {counter = 1}, flags = 132, lru = { - next = 0x50008460, prev = 0x50019350}, wait = { - lock = , task_list = {next = 0x50004024, - prev = 0x50004024}, __magic = 1342193708, __creator = 0}, - pprev_hash = 0x0, buffers = 0x501002c0, virtual = 1342177280, - zone = 0x100c5160} - - - - - - Some sanity-checking: the virtual field shows the "virtual" address of - this page, which in this kernel is the same as its "physical" address, - and the page_struct itself should be mem_map[0], since it represents - the first page of memory: - - - - (gdb) p (void *)1342177280 - $18 = (void *) 0x50000000 - (gdb) p mem_map - $19 = (mem_map_t *) 0x50004000 - - - - - - These check out fine. - - - Now to check out the page_struct itself. In particular, the flags - field shows whether the page is considered free or not: - - - (gdb) p (void *)132 - $21 = (void *) 0x84 - - - - - - The "reserved" bit is the high bit, which is definitely not set, so - the kernel considers the signal stack page to be free and available to - be used. - - - At this point, I jump to conclusions and start looking at my early - boot code, because that's where that page is supposed to be reserved. - - - In my setup_arch procedure, I have the following code which looks just - fine: - - - - bootmap_size = init_bootmem(start_pfn, end_pfn - start_pfn); - free_bootmem(__pa(low_physmem) + bootmap_size, high_physmem - low_physmem); - - - - - - Two stack pages have already been allocated, and low_physmem points to - the third page, which is the beginning of free memory. - The init_bootmem call declares the entire memory to the boot memory - manager, which marks it all reserved. The free_bootmem call frees up - all of it, except for the first two pages. This looks correct to me. - - - So, I decide to see init_bootmem run and make sure that it is marking - those first two pages as reserved. I never get that far. - - - Stepping into init_bootmem, and looking at bootmem_map before looking - at what it contains shows the following: - - - - (gdb) p bootmem_map - $3 = (void *) 0x50000000 - - - - - - Aha! The light dawns. That first page is doing double duty as a - stack and as the boot memory map. The last thing that the boot memory - manager does is to free the pages used by its memory map, so this page - is getting freed even its marked as reserved. - - - The fix was to initialize the boot memory manager before allocating - those two stack pages, and then allocate them through the boot memory - manager. After doing this, and fixing a couple of subsequent buglets, - the stack corruption problem disappeared. - - - - - - 1133.. WWhhaatt ttoo ddoo wwhheenn UUMMLL ddooeessnn''tt wwoorrkk - - - - - 1133..11.. SSttrraannggee ccoommppiillaattiioonn eerrrroorrss wwhheenn yyoouu bbuuiilldd ffrroomm ssoouurrccee - - As of test11, it is necessary to have "ARCH=um" in the environment or - on the make command line for all steps in building UML, including - clean, distclean, or mrproper, config, menuconfig, or xconfig, dep, - and linux. If you forget for any of them, the i386 build seems to - contaminate the UML build. If this happens, start from scratch with - - - host% - make mrproper ARCH=um - - - - - and repeat the build process with ARCH=um on all the steps. - - - See ``Compiling the kernel and modules'' for more details. - - - Another cause of strange compilation errors is building UML in - /usr/src/linux. If you do this, the first thing you need to do is - clean up the mess you made. The /usr/src/linux/asm link will now - point to /usr/src/linux/asm-um. Make it point back to - /usr/src/linux/asm-i386. Then, move your UML pool someplace else and - build it there. Also see below, where a more specific set of symptoms - is described. - - - - 1133..33.. AA vvaarriieettyy ooff ppaanniiccss aanndd hhaannggss wwiitthh //ttmmpp oonn aa rreeiisseerrffss ffiilleessyyss-- - tteemm - - I saw this on reiserfs 3.5.21 and it seems to be fixed in 3.5.27. - Panics preceded by - - - Detaching pid nnnn - - - - are diagnostic of this problem. This is a reiserfs bug which causes a - thread to occasionally read stale data from a mmapped page shared with - another thread. The fix is to upgrade the filesystem or to have /tmp - be an ext2 filesystem. - - - - 1133..44.. TThhee ccoommppiillee ffaaiillss wwiitthh eerrrroorrss aabboouutt ccoonnfflliiccttiinngg ttyyppeess ffoorr - ''ooppeenn'',, ''dduupp'',, aanndd ''wwaaiittppiidd'' - - This happens when you build in /usr/src/linux. The UML build makes - the include/asm link point to include/asm-um. /usr/include/asm points - to /usr/src/linux/include/asm, so when that link gets moved, files - which need to include the asm-i386 versions of headers get the - incompatible asm-um versions. The fix is to move the include/asm link - back to include/asm-i386 and to do UML builds someplace else. - - - - 1133..55.. UUMMLL ddooeessnn''tt wwoorrkk wwhheenn //ttmmpp iiss aann NNFFSS ffiilleessyysstteemm - - This seems to be a similar situation with the ReiserFS problem above. - Some versions of NFS seems not to handle mmap correctly, which UML - depends on. The workaround is have /tmp be a non-NFS directory. - - - 1133..66.. UUMMLL hhaannggss oonn bboooott wwhheenn ccoommppiilleedd wwiitthh ggpprrooff ssuuppppoorrtt - - If you build UML with gprof support and, early in the boot, it does - this - - - kernel BUG at page_alloc.c:100! - - - - - you have a buggy gcc. You can work around the problem by removing - UM_FASTCALL from CFLAGS in arch/um/Makefile-i386. This will open up - another bug, but that one is fairly hard to reproduce. - - - - 1133..77.. ssyyssllooggdd ddiieess wwiitthh aa SSIIGGTTEERRMM oonn ssttaarrttuupp - - The exact boot error depends on the distribution that you're booting, - but Debian produces this: - - - /etc/rc2.d/S10sysklogd: line 49: 93 Terminated - start-stop-daemon --start --quiet --exec /sbin/syslogd -- $SYSLOGD - - - - - This is a syslogd bug. There's a race between a parent process - installing a signal handler and its child sending the signal. See - this uml-devel post for the details. - - - - 1133..88.. TTUUNN//TTAAPP nneettwwoorrkkiinngg ddooeessnn''tt wwoorrkk oonn aa 22..44 hhoosstt - - There are a couple of problems which were - name="pointed - out"> by Tim Robinson - - +o It doesn't work on hosts running 2.4.7 (or thereabouts) or earlier. - The fix is to upgrade to something more recent and then read the - next item. - - +o If you see - - - File descriptor in bad state - - - - when you bring up the device inside UML, you have a header mismatch - between the original kernel and the upgraded one. Make /usr/src/linux - point at the new headers. This will only be a problem if you build - uml_net yourself. - - - - 1133..99.. YYoouu ccaann nneettwwoorrkk ttoo tthhee hhoosstt bbuutt nnoott ttoo ootthheerr mmaacchhiinneess oonn tthhee - nneett - - If you can connect to the host, and the host can connect to UML, but - you cannot connect to any other machines, then you may need to enable - IP Masquerading on the host. Usually this is only experienced when - using private IP addresses (192.168.x.x or 10.x.x.x) for host/UML - networking, rather than the public address space that your host is - connected to. UML does not enable IP Masquerading, so you will need - to create a static rule to enable it: - - - host% - iptables -t nat -A POSTROUTING -o eth0 -j MASQUERADE - - - - - Replace eth0 with the interface that you use to talk to the rest of - the world. - - - Documentation on IP Masquerading, and SNAT, can be found at - www.netfilter.org . - - - If you can reach the local net, but not the outside Internet, then - that is usually a routing problem. The UML needs a default route: - - - UML# - route add default gw gateway IP - - - - - The gateway IP can be any machine on the local net that knows how to - reach the outside world. Usually, this is the host or the local net- - work's gateway. - - - Occasionally, we hear from someone who can reach some machines, but - not others on the same net, or who can reach some ports on other - machines, but not others. These are usually caused by strange - firewalling somewhere between the UML and the other box. You track - this down by running tcpdump on every interface the packets travel - over and see where they disappear. When you find a machine that takes - the packets in, but does not send them onward, that's the culprit. - - - - 1133..1100.. II hhaavvee nnoo rroooott aanndd II wwaanntt ttoo ssccrreeaamm - - Thanks to Birgit Wahlich for telling me about this strange one. It - turns out that there's a limit of six environment variables on the - kernel command line. When that limit is reached or exceeded, argument - processing stops, which means that the 'root=' argument that UML - usually adds is not seen. So, the filesystem has no idea what the - root device is, so it panics. - - - The fix is to put less stuff on the command line. Glomming all your - setup variables into one is probably the best way to go. - - - - 1133..1111.. UUMMLL bbuuiilldd ccoonnfflliicctt bbeettwweeeenn ppttrraaccee..hh aanndd uuccoonntteexxtt..hh - - On some older systems, /usr/include/asm/ptrace.h and - /usr/include/sys/ucontext.h define the same names. So, when they're - included together, the defines from one completely mess up the parsing - of the other, producing errors like: - /usr/include/sys/ucontext.h:47: parse error before - `10' - - - - - plus a pile of warnings. - - - This is a libc botch, which has since been fixed, and I don't see any - way around it besides upgrading. - - - - 1133..1122.. TThhee UUMMLL BBooggooMMiippss iiss eexxaaccttllyy hhaallff tthhee hhoosstt''ss BBooggooMMiippss - - On i386 kernels, there are two ways of running the loop that is used - to calculate the BogoMips rating, using the TSC if it's there or using - a one-instruction loop. The TSC produces twice the BogoMips as the - loop. UML uses the loop, since it has nothing resembling a TSC, and - will get almost exactly the same BogoMips as a host using the loop. - However, on a host with a TSC, its BogoMips will be double the loop - BogoMips, and therefore double the UML BogoMips. - - - - 1133..1133.. WWhheenn yyoouu rruunn UUMMLL,, iitt iimmmmeeddiiaatteellyy sseeggffaauullttss - - If the host is configured with the 2G/2G address space split, that's - why. See ``UML on 2G/2G hosts'' for the details on getting UML to - run on your host. - - - - 1133..1144.. xxtteerrmmss aappppeeaarr,, tthheenn iimmmmeeddiiaatteellyy ddiissaappppeeaarr - - If you're running an up to date kernel with an old release of - uml_utilities, the port-helper program will not work properly, so - xterms will exit straight after they appear. The solution is to - upgrade to the latest release of uml_utilities. Usually this problem - occurs when you have installed a packaged release of UML then compiled - your own development kernel without upgrading the uml_utilities from - the source distribution. - - - - 1133..1155.. AAnnyy ootthheerr ppaanniicc,, hhaanngg,, oorr ssttrraannggee bbeehhaavviioorr - - If you're seeing truly strange behavior, such as hangs or panics that - happen in random places, or you try running the debugger to see what's - happening and it acts strangely, then it could be a problem in the - host kernel. If you're not running a stock Linus or -ac kernel, then - try that. An early version of the preemption patch and a 2.4.10 SuSE - kernel have caused very strange problems in UML. - - - Otherwise, let me know about it. Send a message to one of the UML - mailing lists - either the developer list - user-mode-linux-devel at - lists dot sourceforge dot net (subscription info) or the user list - - user-mode-linux-user at lists dot sourceforge do net (subscription - info), whichever you prefer. Don't assume that everyone knows about - it and that a fix is imminent. - - - If you want to be super-helpful, read ``Diagnosing Problems'' and - follow the instructions contained therein. - 1144.. DDiiaaggnnoossiinngg PPrroobblleemmss - - - If you get UML to crash, hang, or otherwise misbehave, you should - report this on one of the project mailing lists, either the developer - list - user-mode-linux-devel at lists dot sourceforge dot net - (subscription info) or the user list - user-mode-linux-user at lists - dot sourceforge dot net (subscription info). When you do, it is - likely that I will want more information. So, it would be helpful to - read the stuff below, do whatever is applicable in your case, and - report the results to the list. - - - For any diagnosis, you're going to need to build a debugging kernel. - The binaries from this site aren't debuggable. If you haven't done - this before, read about ``Compiling the kernel and modules'' and - ``Kernel debugging'' UML first. - - - 1144..11.. CCaassee 11 :: NNoorrmmaall kkeerrnneell ppaanniiccss - - The most common case is for a normal thread to panic. To debug this, - you will need to run it under the debugger (add 'debug' to the command - line). An xterm will start up with gdb running inside it. Continue - it when it stops in start_kernel and make it crash. Now ^C gdb and - - - If the panic was a "Kernel mode fault", then there will be a segv - frame on the stack and I'm going to want some more information. The - stack might look something like this: - - - (UML gdb) backtrace - #0 0x1009bf76 in __sigprocmask (how=1, set=0x5f347940, oset=0x0) - at ../sysdeps/unix/sysv/linux/sigprocmask.c:49 - #1 0x10091411 in change_sig (signal=10, on=1) at process.c:218 - #2 0x10094785 in timer_handler (sig=26) at time_kern.c:32 - #3 0x1009bf38 in __restore () - at ../sysdeps/unix/sysv/linux/i386/sigaction.c:125 - #4 0x1009534c in segv (address=8, ip=268849158, is_write=2, is_user=0) - at trap_kern.c:66 - #5 0x10095c04 in segv_handler (sig=11) at trap_user.c:285 - #6 0x1009bf38 in __restore () - - - - - I'm going to want to see the symbol and line information for the value - of ip in the segv frame. In this case, you would do the following: - - - (UML gdb) i sym 268849158 - - - - - and - - - (UML gdb) i line *268849158 - - - - - The reason for this is the __restore frame right above the segv_han- - dler frame is hiding the frame that actually segfaulted. So, I have - to get that information from the faulting ip. - - - 1144..22.. CCaassee 22 :: TTrraacciinngg tthhrreeaadd ppaanniiccss - - The less common and more painful case is when the tracing thread - panics. In this case, the kernel debugger will be useless because it - needs a healthy tracing thread in order to work. The first thing to - do is get a backtrace from the tracing thread. This is done by - figuring out what its pid is, firing up gdb, and attaching it to that - pid. You can figure out the tracing thread pid by looking at the - first line of the console output, which will look like this: - - - tracing thread pid = 15851 - - - - - or by running ps on the host and finding the line that looks like - this: - - - jdike 15851 4.5 0.4 132568 1104 pts/0 S 21:34 0:05 ./linux [(tracing thread)] - - - - - If the panic was 'segfault in signals', then follow the instructions - above for collecting information about the location of the seg fault. - - - If the tracing thread flaked out all by itself, then send that - backtrace in and wait for our crack debugging team to fix the problem. - - - 1144..33.. CCaassee 33 :: TTrraacciinngg tthhrreeaadd ppaanniiccss ccaauusseedd bbyy ootthheerr tthhrreeaaddss - - However, there are cases where the misbehavior of another thread - caused the problem. The most common panic of this type is: - - - wait_for_stop failed to wait for to stop with - - - - - In this case, you'll need to get a backtrace from the process men- - tioned in the panic, which is complicated by the fact that the kernel - debugger is defunct and without some fancy footwork, another gdb can't - attach to it. So, this is how the fancy footwork goes: - - In a shell: - - - host% kill -STOP pid - - - - - Run gdb on the tracing thread as described in case 2 and do: - - - (host gdb) call detach(pid) - - - If you get a segfault, do it again. It always works the second time. - - Detach from the tracing thread and attach to that other thread: - - - (host gdb) detach - - - - - - - (host gdb) attach pid - - - - - If gdb hangs when attaching to that process, go back to a shell and - do: - - - host% - kill -CONT pid - - - - - And then get the backtrace: - - - (host gdb) backtrace - - - - - - 1144..44.. CCaassee 44 :: HHaannggss - - Hangs seem to be fairly rare, but they sometimes happen. When a hang - happens, we need a backtrace from the offending process. Run the - kernel debugger as described in case 1 and get a backtrace. If the - current process is not the idle thread, then send in the backtrace. - You can tell that it's the idle thread if the stack looks like this: - - - #0 0x100b1401 in __libc_nanosleep () - #1 0x100a2885 in idle_sleep (secs=10) at time.c:122 - #2 0x100a546f in do_idle () at process_kern.c:445 - #3 0x100a5508 in cpu_idle () at process_kern.c:471 - #4 0x100ec18f in start_kernel () at init/main.c:592 - #5 0x100a3e10 in start_kernel_proc (unused=0x0) at um_arch.c:71 - #6 0x100a383f in signal_tramp (arg=0x100a3dd8) at trap_user.c:50 - - - - - If this is the case, then some other process is at fault, and went to - sleep when it shouldn't have. Run ps on the host and figure out which - process should not have gone to sleep and stayed asleep. Then attach - to it with gdb and get a backtrace as described in case 3. - - - - - - - 1155.. TThhaannkkss - - - A number of people have helped this project in various ways, and this - page gives recognition where recognition is due. - - - If you're listed here and you would prefer a real link on your name, - or no link at all, instead of the despammed email address pseudo-link, - let me know. - - - If you're not listed here and you think maybe you should be, please - let me know that as well. I try to get everyone, but sometimes my - bookkeeping lapses and I forget about contributions. - - - 1155..11.. CCooddee aanndd DDooccuummeennttaattiioonn - - Rusty Russell - - - +o wrote the HOWTO - - +o prodded me into making this project official and putting it on - SourceForge - - +o came up with the way cool UML logo - - +o redid the config process - - - Peter Moulder - Fixed my config and build - processes, and added some useful code to the block driver - - - Bill Stearns - - - +o HOWTO updates - - +o lots of bug reports - - +o lots of testing - - +o dedicated a box (uml.ists.dartmouth.edu) to support UML development - - +o wrote the mkrootfs script, which allows bootable filesystems of - RPM-based distributions to be cranked out - - +o cranked out a large number of filesystems with said script - - - Jim Leu - Wrote the virtual ethernet driver - and associated usermode tools - - Lars Brinkhoff - Contributed the ptrace - proxy from his own project to allow easier - kernel debugging - - - Andrea Arcangeli - Redid some of the early boot - code so that it would work on machines with Large File Support - - - Chris Emerson - Did - the first UML port to Linux/ppc - - - Harald Welte - Wrote the multicast - transport for the network driver - - - Jorgen Cederlof - Added special file support to hostfs - - - Greg Lonnon - Changed the ubd driver - to allow it to layer a COW file on a shared read-only filesystem and - wrote the iomem emulation support - - - Henrik Nordstrom - Provided a variety - of patches, fixes, and clues - - - Lennert Buytenhek - Contributed various patches, a rewrite of the - network driver, the first implementation of the mconsole driver, and - did the bulk of the work needed to get SMP working again. - - - Yon Uriarte - Fixed the TUN/TAP network backend while I slept. - - - Adam Heath - Made a bunch of nice cleanups to the initialization code, - plus various other small patches. - - - Matt Zimmerman - Matt volunteered to be the UML Debian maintainer and - is doing a real nice job of it. He also noticed and fixed a number of - actually and potentially exploitable security holes in uml_net. Plus - the occasional patch. I like patches. - - - James McMechan - James seems to have taken over maintenance of the ubd - driver and is doing a nice job of it. - - - Chandan Kudige - wrote the umlgdb script which automates the reloading - of module symbols. - - - Steve Schmidtke - wrote the UML slirp transport and hostaudio drivers, - enabling UML processes to access audio devices on the host. He also - submitted patches for the slip transport and lots of other things. - - - David Coulson - - - +o Set up the usermodelinux.org site, - which is a great way of keeping the UML user community on top of - UML goings-on. - - +o Site documentation and updates - - +o Nifty little UML management daemon UMLd - - - +o Lots of testing and bug reports - - - - - 1155..22.. FFlluusshhiinngg oouutt bbuuggss - - - - +o Yuri Pudgorodsky - - +o Gerald Britton - - +o Ian Wehrman - - +o Gord Lamb - - +o Eugene Koontz - - +o John H. Hartman - - +o Anders Karlsson - - +o Daniel Phillips - - +o John Fremlin - - +o Rainer Burgstaller - - +o James Stevenson - - +o Matt Clay - - +o Cliff Jefferies - - +o Geoff Hoff - - +o Lennert Buytenhek - - +o Al Viro - - +o Frank Klingenhoefer - - +o Livio Baldini Soares - - +o Jon Burgess - - +o Petru Paler - - +o Paul - - +o Chris Reahard - - +o Sverker Nilsson - - +o Gong Su - - +o johan verrept - - +o Bjorn Eriksson - - +o Lorenzo Allegrucci - - +o Muli Ben-Yehuda - - +o David Mansfield - - +o Howard Goff - - +o Mike Anderson - - +o John Byrne - - +o Sapan J. Batia - - +o Iris Huang - - +o Jan Hudec - - +o Voluspa - - - - - 1155..33.. BBuugglleettss aanndd cclleeaann--uuppss - - - - +o Dave Zarzycki - - +o Adam Lazur - - +o Boria Feigin - - +o Brian J. Murrell - - +o JS - - +o Roman Zippel - - +o Wil Cooley - - +o Ayelet Shemesh - - +o Will Dyson - - +o Sverker Nilsson - - +o dvorak - - +o v.naga srinivas - - +o Shlomi Fish - - +o Roger Binns - - +o johan verrept - - +o MrChuoi - - +o Peter Cleve - - +o Vincent Guffens - - +o Nathan Scott - - +o Patrick Caulfield - - +o jbearce - - +o Catalin Marinas - - +o Shane Spencer - - +o Zou Min - - - +o Ryan Boder - - +o Lorenzo Colitti - - +o Gwendal Grignou - - +o Andre' Breiler - - +o Tsutomu Yasuda - - - - 1155..44.. CCaassee SSttuuddiieess - - - +o Jon Wright - - +o William McEwan - - +o Michael Richardson - - - - 1155..55.. OOtthheerr ccoonnttrriibbuuttiioonnss - - - Bill Carr made the Red Hat mkrootfs script - work with RH 6.2. - - Michael Jennings sent in some material which - is now gracing the top of the index page of this site. - - SGI (and more specifically Ralf Baechle ) gave me an account on oss.sgi.com - . The bandwidth there made it possible to - produce most of the filesystems available on the project download - page. - - Laurent Bonnaud took the old grotty - Debian filesystem that I've been distributing and updated it to 2.2. - It is now available by itself here. - - Rik van Riel gave me some ftp space on ftp.nl.linux.org so I can make - releases even when Sourceforge is broken. - - Rodrigo de Castro looked at my broken pte code and told me what was - wrong with it, letting me fix a long-standing (several weeks) and - serious set of bugs. - - Chris Reahard built a specialized root filesystem for running a DNS - server jailed inside UML. It's available from the download - page in the Jail - Filesystems section. - - - - - - - - - - - - diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt new file mode 100644 index 0000000..9bef4e4 --- /dev/null +++ b/Documentation/virtual/kvm/api.txt @@ -0,0 +1,1451 @@ +The Definitive KVM (Kernel-based Virtual Machine) API Documentation +=================================================================== + +1. General description + +The kvm API is a set of ioctls that are issued to control various aspects +of a virtual machine. The ioctls belong to three classes + + - System ioctls: These query and set global attributes which affect the + whole kvm subsystem. In addition a system ioctl is used to create + virtual machines + + - VM ioctls: These query and set attributes that affect an entire virtual + machine, for example memory layout. In addition a VM ioctl is used to + create virtual cpus (vcpus). + + Only run VM ioctls from the same process (address space) that was used + to create the VM. + + - vcpu ioctls: These query and set attributes that control the operation + of a single virtual cpu. + + Only run vcpu ioctls from the same thread that was used to create the + vcpu. + +2. File descriptors + +The kvm API is centered around file descriptors. An initial +open("/dev/kvm") obtains a handle to the kvm subsystem; this handle +can be used to issue system ioctls. A KVM_CREATE_VM ioctl on this +handle will create a VM file descriptor which can be used to issue VM +ioctls. A KVM_CREATE_VCPU ioctl on a VM fd will create a virtual cpu +and return a file descriptor pointing to it. Finally, ioctls on a vcpu +fd can be used to control the vcpu, including the important task of +actually running guest code. + +In general file descriptors can be migrated among processes by means +of fork() and the SCM_RIGHTS facility of unix domain socket. These +kinds of tricks are explicitly not supported by kvm. While they will +not cause harm to the host, their actual behavior is not guaranteed by +the API. The only supported use is one virtual machine per process, +and one vcpu per thread. + +3. Extensions + +As of Linux 2.6.22, the KVM ABI has been stabilized: no backward +incompatible change are allowed. However, there is an extension +facility that allows backward-compatible extensions to the API to be +queried and used. + +The extension mechanism is not based on on the Linux version number. +Instead, kvm defines extension identifiers and a facility to query +whether a particular extension identifier is available. If it is, a +set of ioctls is available for application use. + +4. API description + +This section describes ioctls that can be used to control kvm guests. +For each ioctl, the following information is provided along with a +description: + + Capability: which KVM extension provides this ioctl. Can be 'basic', + which means that is will be provided by any kernel that supports + API version 12 (see section 4.1), or a KVM_CAP_xyz constant, which + means availability needs to be checked with KVM_CHECK_EXTENSION + (see section 4.4). + + Architectures: which instruction set architectures provide this ioctl. + x86 includes both i386 and x86_64. + + Type: system, vm, or vcpu. + + Parameters: what parameters are accepted by the ioctl. + + Returns: the return value. General error numbers (EBADF, ENOMEM, EINVAL) + are not detailed, but errors with specific meanings are. + +4.1 KVM_GET_API_VERSION + +Capability: basic +Architectures: all +Type: system ioctl +Parameters: none +Returns: the constant KVM_API_VERSION (=12) + +This identifies the API version as the stable kvm API. It is not +expected that this number will change. However, Linux 2.6.20 and +2.6.21 report earlier versions; these are not documented and not +supported. Applications should refuse to run if KVM_GET_API_VERSION +returns a value other than 12. If this check passes, all ioctls +described as 'basic' will be available. + +4.2 KVM_CREATE_VM + +Capability: basic +Architectures: all +Type: system ioctl +Parameters: none +Returns: a VM fd that can be used to control the new virtual machine. + +The new VM has no virtual cpus and no memory. An mmap() of a VM fd +will access the virtual machine's physical address space; offset zero +corresponds to guest physical address zero. Use of mmap() on a VM fd +is discouraged if userspace memory allocation (KVM_CAP_USER_MEMORY) is +available. + +4.3 KVM_GET_MSR_INDEX_LIST + +Capability: basic +Architectures: x86 +Type: system +Parameters: struct kvm_msr_list (in/out) +Returns: 0 on success; -1 on error +Errors: + E2BIG: the msr index list is to be to fit in the array specified by + the user. + +struct kvm_msr_list { + __u32 nmsrs; /* number of msrs in entries */ + __u32 indices[0]; +}; + +This ioctl returns the guest msrs that are supported. The list varies +by kvm version and host processor, but does not change otherwise. The +user fills in the size of the indices array in nmsrs, and in return +kvm adjusts nmsrs to reflect the actual number of msrs and fills in +the indices array with their numbers. + +Note: if kvm indicates supports MCE (KVM_CAP_MCE), then the MCE bank MSRs are +not returned in the MSR list, as different vcpus can have a different number +of banks, as set via the KVM_X86_SETUP_MCE ioctl. + +4.4 KVM_CHECK_EXTENSION + +Capability: basic +Architectures: all +Type: system ioctl +Parameters: extension identifier (KVM_CAP_*) +Returns: 0 if unsupported; 1 (or some other positive integer) if supported + +The API allows the application to query about extensions to the core +kvm API. Userspace passes an extension identifier (an integer) and +receives an integer that describes the extension availability. +Generally 0 means no and 1 means yes, but some extensions may report +additional information in the integer return value. + +4.5 KVM_GET_VCPU_MMAP_SIZE + +Capability: basic +Architectures: all +Type: system ioctl +Parameters: none +Returns: size of vcpu mmap area, in bytes + +The KVM_RUN ioctl (cf.) communicates with userspace via a shared +memory region. This ioctl returns the size of that region. See the +KVM_RUN documentation for details. + +4.6 KVM_SET_MEMORY_REGION + +Capability: basic +Architectures: all +Type: vm ioctl +Parameters: struct kvm_memory_region (in) +Returns: 0 on success, -1 on error + +This ioctl is obsolete and has been removed. + +4.7 KVM_CREATE_VCPU + +Capability: basic +Architectures: all +Type: vm ioctl +Parameters: vcpu id (apic id on x86) +Returns: vcpu fd on success, -1 on error + +This API adds a vcpu to a virtual machine. The vcpu id is a small integer +in the range [0, max_vcpus). + +4.8 KVM_GET_DIRTY_LOG (vm ioctl) + +Capability: basic +Architectures: x86 +Type: vm ioctl +Parameters: struct kvm_dirty_log (in/out) +Returns: 0 on success, -1 on error + +/* for KVM_GET_DIRTY_LOG */ +struct kvm_dirty_log { + __u32 slot; + __u32 padding; + union { + void __user *dirty_bitmap; /* one bit per page */ + __u64 padding; + }; +}; + +Given a memory slot, return a bitmap containing any pages dirtied +since the last call to this ioctl. Bit 0 is the first page in the +memory slot. Ensure the entire structure is cleared to avoid padding +issues. + +4.9 KVM_SET_MEMORY_ALIAS + +Capability: basic +Architectures: x86 +Type: vm ioctl +Parameters: struct kvm_memory_alias (in) +Returns: 0 (success), -1 (error) + +This ioctl is obsolete and has been removed. + +4.10 KVM_RUN + +Capability: basic +Architectures: all +Type: vcpu ioctl +Parameters: none +Returns: 0 on success, -1 on error +Errors: + EINTR: an unmasked signal is pending + +This ioctl is used to run a guest virtual cpu. While there are no +explicit parameters, there is an implicit parameter block that can be +obtained by mmap()ing the vcpu fd at offset 0, with the size given by +KVM_GET_VCPU_MMAP_SIZE. The parameter block is formatted as a 'struct +kvm_run' (see below). + +4.11 KVM_GET_REGS + +Capability: basic +Architectures: all +Type: vcpu ioctl +Parameters: struct kvm_regs (out) +Returns: 0 on success, -1 on error + +Reads the general purpose registers from the vcpu. + +/* x86 */ +struct kvm_regs { + /* out (KVM_GET_REGS) / in (KVM_SET_REGS) */ + __u64 rax, rbx, rcx, rdx; + __u64 rsi, rdi, rsp, rbp; + __u64 r8, r9, r10, r11; + __u64 r12, r13, r14, r15; + __u64 rip, rflags; +}; + +4.12 KVM_SET_REGS + +Capability: basic +Architectures: all +Type: vcpu ioctl +Parameters: struct kvm_regs (in) +Returns: 0 on success, -1 on error + +Writes the general purpose registers into the vcpu. + +See KVM_GET_REGS for the data structure. + +4.13 KVM_GET_SREGS + +Capability: basic +Architectures: x86 +Type: vcpu ioctl +Parameters: struct kvm_sregs (out) +Returns: 0 on success, -1 on error + +Reads special registers from the vcpu. + +/* x86 */ +struct kvm_sregs { + struct kvm_segment cs, ds, es, fs, gs, ss; + struct kvm_segment tr, ldt; + struct kvm_dtable gdt, idt; + __u64 cr0, cr2, cr3, cr4, cr8; + __u64 efer; + __u64 apic_base; + __u64 interrupt_bitmap[(KVM_NR_INTERRUPTS + 63) / 64]; +}; + +interrupt_bitmap is a bitmap of pending external interrupts. At most +one bit may be set. This interrupt has been acknowledged by the APIC +but not yet injected into the cpu core. + +4.14 KVM_SET_SREGS + +Capability: basic +Architectures: x86 +Type: vcpu ioctl +Parameters: struct kvm_sregs (in) +Returns: 0 on success, -1 on error + +Writes special registers into the vcpu. See KVM_GET_SREGS for the +data structures. + +4.15 KVM_TRANSLATE + +Capability: basic +Architectures: x86 +Type: vcpu ioctl +Parameters: struct kvm_translation (in/out) +Returns: 0 on success, -1 on error + +Translates a virtual address according to the vcpu's current address +translation mode. + +struct kvm_translation { + /* in */ + __u64 linear_address; + + /* out */ + __u64 physical_address; + __u8 valid; + __u8 writeable; + __u8 usermode; + __u8 pad[5]; +}; + +4.16 KVM_INTERRUPT + +Capability: basic +Architectures: x86, ppc +Type: vcpu ioctl +Parameters: struct kvm_interrupt (in) +Returns: 0 on success, -1 on error + +Queues a hardware interrupt vector to be injected. This is only +useful if in-kernel local APIC or equivalent is not used. + +/* for KVM_INTERRUPT */ +struct kvm_interrupt { + /* in */ + __u32 irq; +}; + +X86: + +Note 'irq' is an interrupt vector, not an interrupt pin or line. + +PPC: + +Queues an external interrupt to be injected. This ioctl is overleaded +with 3 different irq values: + +a) KVM_INTERRUPT_SET + + This injects an edge type external interrupt into the guest once it's ready + to receive interrupts. When injected, the interrupt is done. + +b) KVM_INTERRUPT_UNSET + + This unsets any pending interrupt. + + Only available with KVM_CAP_PPC_UNSET_IRQ. + +c) KVM_INTERRUPT_SET_LEVEL + + This injects a level type external interrupt into the guest context. The + interrupt stays pending until a specific ioctl with KVM_INTERRUPT_UNSET + is triggered. + + Only available with KVM_CAP_PPC_IRQ_LEVEL. + +Note that any value for 'irq' other than the ones stated above is invalid +and incurs unexpected behavior. + +4.17 KVM_DEBUG_GUEST + +Capability: basic +Architectures: none +Type: vcpu ioctl +Parameters: none) +Returns: -1 on error + +Support for this has been removed. Use KVM_SET_GUEST_DEBUG instead. + +4.18 KVM_GET_MSRS + +Capability: basic +Architectures: x86 +Type: vcpu ioctl +Parameters: struct kvm_msrs (in/out) +Returns: 0 on success, -1 on error + +Reads model-specific registers from the vcpu. Supported msr indices can +be obtained using KVM_GET_MSR_INDEX_LIST. + +struct kvm_msrs { + __u32 nmsrs; /* number of msrs in entries */ + __u32 pad; + + struct kvm_msr_entry entries[0]; +}; + +struct kvm_msr_entry { + __u32 index; + __u32 reserved; + __u64 data; +}; + +Application code should set the 'nmsrs' member (which indicates the +size of the entries array) and the 'index' member of each array entry. +kvm will fill in the 'data' member. + +4.19 KVM_SET_MSRS + +Capability: basic +Architectures: x86 +Type: vcpu ioctl +Parameters: struct kvm_msrs (in) +Returns: 0 on success, -1 on error + +Writes model-specific registers to the vcpu. See KVM_GET_MSRS for the +data structures. + +Application code should set the 'nmsrs' member (which indicates the +size of the entries array), and the 'index' and 'data' members of each +array entry. + +4.20 KVM_SET_CPUID + +Capability: basic +Architectures: x86 +Type: vcpu ioctl +Parameters: struct kvm_cpuid (in) +Returns: 0 on success, -1 on error + +Defines the vcpu responses to the cpuid instruction. Applications +should use the KVM_SET_CPUID2 ioctl if available. + + +struct kvm_cpuid_entry { + __u32 function; + __u32 eax; + __u32 ebx; + __u32 ecx; + __u32 edx; + __u32 padding; +}; + +/* for KVM_SET_CPUID */ +struct kvm_cpuid { + __u32 nent; + __u32 padding; + struct kvm_cpuid_entry entries[0]; +}; + +4.21 KVM_SET_SIGNAL_MASK + +Capability: basic +Architectures: x86 +Type: vcpu ioctl +Parameters: struct kvm_signal_mask (in) +Returns: 0 on success, -1 on error + +Defines which signals are blocked during execution of KVM_RUN. This +signal mask temporarily overrides the threads signal mask. Any +unblocked signal received (except SIGKILL and SIGSTOP, which retain +their traditional behaviour) will cause KVM_RUN to return with -EINTR. + +Note the signal will only be delivered if not blocked by the original +signal mask. + +/* for KVM_SET_SIGNAL_MASK */ +struct kvm_signal_mask { + __u32 len; + __u8 sigset[0]; +}; + +4.22 KVM_GET_FPU + +Capability: basic +Architectures: x86 +Type: vcpu ioctl +Parameters: struct kvm_fpu (out) +Returns: 0 on success, -1 on error + +Reads the floating point state from the vcpu. + +/* for KVM_GET_FPU and KVM_SET_FPU */ +struct kvm_fpu { + __u8 fpr[8][16]; + __u16 fcw; + __u16 fsw; + __u8 ftwx; /* in fxsave format */ + __u8 pad1; + __u16 last_opcode; + __u64 last_ip; + __u64 last_dp; + __u8 xmm[16][16]; + __u32 mxcsr; + __u32 pad2; +}; + +4.23 KVM_SET_FPU + +Capability: basic +Architectures: x86 +Type: vcpu ioctl +Parameters: struct kvm_fpu (in) +Returns: 0 on success, -1 on error + +Writes the floating point state to the vcpu. + +/* for KVM_GET_FPU and KVM_SET_FPU */ +struct kvm_fpu { + __u8 fpr[8][16]; + __u16 fcw; + __u16 fsw; + __u8 ftwx; /* in fxsave format */ + __u8 pad1; + __u16 last_opcode; + __u64 last_ip; + __u64 last_dp; + __u8 xmm[16][16]; + __u32 mxcsr; + __u32 pad2; +}; + +4.24 KVM_CREATE_IRQCHIP + +Capability: KVM_CAP_IRQCHIP +Architectures: x86, ia64 +Type: vm ioctl +Parameters: none +Returns: 0 on success, -1 on error + +Creates an interrupt controller model in the kernel. On x86, creates a virtual +ioapic, a virtual PIC (two PICs, nested), and sets up future vcpus to have a +local APIC. IRQ routing for GSIs 0-15 is set to both PIC and IOAPIC; GSI 16-23 +only go to the IOAPIC. On ia64, a IOSAPIC is created. + +4.25 KVM_IRQ_LINE + +Capability: KVM_CAP_IRQCHIP +Architectures: x86, ia64 +Type: vm ioctl +Parameters: struct kvm_irq_level +Returns: 0 on success, -1 on error + +Sets the level of a GSI input to the interrupt controller model in the kernel. +Requires that an interrupt controller model has been previously created with +KVM_CREATE_IRQCHIP. Note that edge-triggered interrupts require the level +to be set to 1 and then back to 0. + +struct kvm_irq_level { + union { + __u32 irq; /* GSI */ + __s32 status; /* not used for KVM_IRQ_LEVEL */ + }; + __u32 level; /* 0 or 1 */ +}; + +4.26 KVM_GET_IRQCHIP + +Capability: KVM_CAP_IRQCHIP +Architectures: x86, ia64 +Type: vm ioctl +Parameters: struct kvm_irqchip (in/out) +Returns: 0 on success, -1 on error + +Reads the state of a kernel interrupt controller created with +KVM_CREATE_IRQCHIP into a buffer provided by the caller. + +struct kvm_irqchip { + __u32 chip_id; /* 0 = PIC1, 1 = PIC2, 2 = IOAPIC */ + __u32 pad; + union { + char dummy[512]; /* reserving space */ + struct kvm_pic_state pic; + struct kvm_ioapic_state ioapic; + } chip; +}; + +4.27 KVM_SET_IRQCHIP + +Capability: KVM_CAP_IRQCHIP +Architectures: x86, ia64 +Type: vm ioctl +Parameters: struct kvm_irqchip (in) +Returns: 0 on success, -1 on error + +Sets the state of a kernel interrupt controller created with +KVM_CREATE_IRQCHIP from a buffer provided by the caller. + +struct kvm_irqchip { + __u32 chip_id; /* 0 = PIC1, 1 = PIC2, 2 = IOAPIC */ + __u32 pad; + union { + char dummy[512]; /* reserving space */ + struct kvm_pic_state pic; + struct kvm_ioapic_state ioapic; + } chip; +}; + +4.28 KVM_XEN_HVM_CONFIG + +Capability: KVM_CAP_XEN_HVM +Architectures: x86 +Type: vm ioctl +Parameters: struct kvm_xen_hvm_config (in) +Returns: 0 on success, -1 on error + +Sets the MSR that the Xen HVM guest uses to initialize its hypercall +page, and provides the starting address and size of the hypercall +blobs in userspace. When the guest writes the MSR, kvm copies one +page of a blob (32- or 64-bit, depending on the vcpu mode) to guest +memory. + +struct kvm_xen_hvm_config { + __u32 flags; + __u32 msr; + __u64 blob_addr_32; + __u64 blob_addr_64; + __u8 blob_size_32; + __u8 blob_size_64; + __u8 pad2[30]; +}; + +4.29 KVM_GET_CLOCK + +Capability: KVM_CAP_ADJUST_CLOCK +Architectures: x86 +Type: vm ioctl +Parameters: struct kvm_clock_data (out) +Returns: 0 on success, -1 on error + +Gets the current timestamp of kvmclock as seen by the current guest. In +conjunction with KVM_SET_CLOCK, it is used to ensure monotonicity on scenarios +such as migration. + +struct kvm_clock_data { + __u64 clock; /* kvmclock current value */ + __u32 flags; + __u32 pad[9]; +}; + +4.30 KVM_SET_CLOCK + +Capability: KVM_CAP_ADJUST_CLOCK +Architectures: x86 +Type: vm ioctl +Parameters: struct kvm_clock_data (in) +Returns: 0 on success, -1 on error + +Sets the current timestamp of kvmclock to the value specified in its parameter. +In conjunction with KVM_GET_CLOCK, it is used to ensure monotonicity on scenarios +such as migration. + +struct kvm_clock_data { + __u64 clock; /* kvmclock current value */ + __u32 flags; + __u32 pad[9]; +}; + +4.31 KVM_GET_VCPU_EVENTS + +Capability: KVM_CAP_VCPU_EVENTS +Extended by: KVM_CAP_INTR_SHADOW +Architectures: x86 +Type: vm ioctl +Parameters: struct kvm_vcpu_event (out) +Returns: 0 on success, -1 on error + +Gets currently pending exceptions, interrupts, and NMIs as well as related +states of the vcpu. + +struct kvm_vcpu_events { + struct { + __u8 injected; + __u8 nr; + __u8 has_error_code; + __u8 pad; + __u32 error_code; + } exception; + struct { + __u8 injected; + __u8 nr; + __u8 soft; + __u8 shadow; + } interrupt; + struct { + __u8 injected; + __u8 pending; + __u8 masked; + __u8 pad; + } nmi; + __u32 sipi_vector; + __u32 flags; +}; + +KVM_VCPUEVENT_VALID_SHADOW may be set in the flags field to signal that +interrupt.shadow contains a valid state. Otherwise, this field is undefined. + +4.32 KVM_SET_VCPU_EVENTS + +Capability: KVM_CAP_VCPU_EVENTS +Extended by: KVM_CAP_INTR_SHADOW +Architectures: x86 +Type: vm ioctl +Parameters: struct kvm_vcpu_event (in) +Returns: 0 on success, -1 on error + +Set pending exceptions, interrupts, and NMIs as well as related states of the +vcpu. + +See KVM_GET_VCPU_EVENTS for the data structure. + +Fields that may be modified asynchronously by running VCPUs can be excluded +from the update. These fields are nmi.pending and sipi_vector. Keep the +corresponding bits in the flags field cleared to suppress overwriting the +current in-kernel state. The bits are: + +KVM_VCPUEVENT_VALID_NMI_PENDING - transfer nmi.pending to the kernel +KVM_VCPUEVENT_VALID_SIPI_VECTOR - transfer sipi_vector + +If KVM_CAP_INTR_SHADOW is available, KVM_VCPUEVENT_VALID_SHADOW can be set in +the flags field to signal that interrupt.shadow contains a valid state and +shall be written into the VCPU. + +4.33 KVM_GET_DEBUGREGS + +Capability: KVM_CAP_DEBUGREGS +Architectures: x86 +Type: vm ioctl +Parameters: struct kvm_debugregs (out) +Returns: 0 on success, -1 on error + +Reads debug registers from the vcpu. + +struct kvm_debugregs { + __u64 db[4]; + __u64 dr6; + __u64 dr7; + __u64 flags; + __u64 reserved[9]; +}; + +4.34 KVM_SET_DEBUGREGS + +Capability: KVM_CAP_DEBUGREGS +Architectures: x86 +Type: vm ioctl +Parameters: struct kvm_debugregs (in) +Returns: 0 on success, -1 on error + +Writes debug registers into the vcpu. + +See KVM_GET_DEBUGREGS for the data structure. The flags field is unused +yet and must be cleared on entry. + +4.35 KVM_SET_USER_MEMORY_REGION + +Capability: KVM_CAP_USER_MEM +Architectures: all +Type: vm ioctl +Parameters: struct kvm_userspace_memory_region (in) +Returns: 0 on success, -1 on error + +struct kvm_userspace_memory_region { + __u32 slot; + __u32 flags; + __u64 guest_phys_addr; + __u64 memory_size; /* bytes */ + __u64 userspace_addr; /* start of the userspace allocated memory */ +}; + +/* for kvm_memory_region::flags */ +#define KVM_MEM_LOG_DIRTY_PAGES 1UL + +This ioctl allows the user to create or modify a guest physical memory +slot. When changing an existing slot, it may be moved in the guest +physical memory space, or its flags may be modified. It may not be +resized. Slots may not overlap in guest physical address space. + +Memory for the region is taken starting at the address denoted by the +field userspace_addr, which must point at user addressable memory for +the entire memory slot size. Any object may back this memory, including +anonymous memory, ordinary files, and hugetlbfs. + +It is recommended that the lower 21 bits of guest_phys_addr and userspace_addr +be identical. This allows large pages in the guest to be backed by large +pages in the host. + +The flags field supports just one flag, KVM_MEM_LOG_DIRTY_PAGES, which +instructs kvm to keep track of writes to memory within the slot. See +the KVM_GET_DIRTY_LOG ioctl. + +When the KVM_CAP_SYNC_MMU capability, changes in the backing of the memory +region are automatically reflected into the guest. For example, an mmap() +that affects the region will be made visible immediately. Another example +is madvise(MADV_DROP). + +It is recommended to use this API instead of the KVM_SET_MEMORY_REGION ioctl. +The KVM_SET_MEMORY_REGION does not allow fine grained control over memory +allocation and is deprecated. + +4.36 KVM_SET_TSS_ADDR + +Capability: KVM_CAP_SET_TSS_ADDR +Architectures: x86 +Type: vm ioctl +Parameters: unsigned long tss_address (in) +Returns: 0 on success, -1 on error + +This ioctl defines the physical address of a three-page region in the guest +physical address space. The region must be within the first 4GB of the +guest physical address space and must not conflict with any memory slot +or any mmio address. The guest may malfunction if it accesses this memory +region. + +This ioctl is required on Intel-based hosts. This is needed on Intel hardware +because of a quirk in the virtualization implementation (see the internals +documentation when it pops into existence). + +4.37 KVM_ENABLE_CAP + +Capability: KVM_CAP_ENABLE_CAP +Architectures: ppc +Type: vcpu ioctl +Parameters: struct kvm_enable_cap (in) +Returns: 0 on success; -1 on error + ++Not all extensions are enabled by default. Using this ioctl the application +can enable an extension, making it available to the guest. + +On systems that do not support this ioctl, it always fails. On systems that +do support it, it only works for extensions that are supported for enablement. + +To check if a capability can be enabled, the KVM_CHECK_EXTENSION ioctl should +be used. + +struct kvm_enable_cap { + /* in */ + __u32 cap; + +The capability that is supposed to get enabled. + + __u32 flags; + +A bitfield indicating future enhancements. Has to be 0 for now. + + __u64 args[4]; + +Arguments for enabling a feature. If a feature needs initial values to +function properly, this is the place to put them. + + __u8 pad[64]; +}; + +4.38 KVM_GET_MP_STATE + +Capability: KVM_CAP_MP_STATE +Architectures: x86, ia64 +Type: vcpu ioctl +Parameters: struct kvm_mp_state (out) +Returns: 0 on success; -1 on error + +struct kvm_mp_state { + __u32 mp_state; +}; + +Returns the vcpu's current "multiprocessing state" (though also valid on +uniprocessor guests). + +Possible values are: + + - KVM_MP_STATE_RUNNABLE: the vcpu is currently running + - KVM_MP_STATE_UNINITIALIZED: the vcpu is an application processor (AP) + which has not yet received an INIT signal + - KVM_MP_STATE_INIT_RECEIVED: the vcpu has received an INIT signal, and is + now ready for a SIPI + - KVM_MP_STATE_HALTED: the vcpu has executed a HLT instruction and + is waiting for an interrupt + - KVM_MP_STATE_SIPI_RECEIVED: the vcpu has just received a SIPI (vector + accessible via KVM_GET_VCPU_EVENTS) + +This ioctl is only useful after KVM_CREATE_IRQCHIP. Without an in-kernel +irqchip, the multiprocessing state must be maintained by userspace. + +4.39 KVM_SET_MP_STATE + +Capability: KVM_CAP_MP_STATE +Architectures: x86, ia64 +Type: vcpu ioctl +Parameters: struct kvm_mp_state (in) +Returns: 0 on success; -1 on error + +Sets the vcpu's current "multiprocessing state"; see KVM_GET_MP_STATE for +arguments. + +This ioctl is only useful after KVM_CREATE_IRQCHIP. Without an in-kernel +irqchip, the multiprocessing state must be maintained by userspace. + +4.40 KVM_SET_IDENTITY_MAP_ADDR + +Capability: KVM_CAP_SET_IDENTITY_MAP_ADDR +Architectures: x86 +Type: vm ioctl +Parameters: unsigned long identity (in) +Returns: 0 on success, -1 on error + +This ioctl defines the physical address of a one-page region in the guest +physical address space. The region must be within the first 4GB of the +guest physical address space and must not conflict with any memory slot +or any mmio address. The guest may malfunction if it accesses this memory +region. + +This ioctl is required on Intel-based hosts. This is needed on Intel hardware +because of a quirk in the virtualization implementation (see the internals +documentation when it pops into existence). + +4.41 KVM_SET_BOOT_CPU_ID + +Capability: KVM_CAP_SET_BOOT_CPU_ID +Architectures: x86, ia64 +Type: vm ioctl +Parameters: unsigned long vcpu_id +Returns: 0 on success, -1 on error + +Define which vcpu is the Bootstrap Processor (BSP). Values are the same +as the vcpu id in KVM_CREATE_VCPU. If this ioctl is not called, the default +is vcpu 0. + +4.42 KVM_GET_XSAVE + +Capability: KVM_CAP_XSAVE +Architectures: x86 +Type: vcpu ioctl +Parameters: struct kvm_xsave (out) +Returns: 0 on success, -1 on error + +struct kvm_xsave { + __u32 region[1024]; +}; + +This ioctl would copy current vcpu's xsave struct to the userspace. + +4.43 KVM_SET_XSAVE + +Capability: KVM_CAP_XSAVE +Architectures: x86 +Type: vcpu ioctl +Parameters: struct kvm_xsave (in) +Returns: 0 on success, -1 on error + +struct kvm_xsave { + __u32 region[1024]; +}; + +This ioctl would copy userspace's xsave struct to the kernel. + +4.44 KVM_GET_XCRS + +Capability: KVM_CAP_XCRS +Architectures: x86 +Type: vcpu ioctl +Parameters: struct kvm_xcrs (out) +Returns: 0 on success, -1 on error + +struct kvm_xcr { + __u32 xcr; + __u32 reserved; + __u64 value; +}; + +struct kvm_xcrs { + __u32 nr_xcrs; + __u32 flags; + struct kvm_xcr xcrs[KVM_MAX_XCRS]; + __u64 padding[16]; +}; + +This ioctl would copy current vcpu's xcrs to the userspace. + +4.45 KVM_SET_XCRS + +Capability: KVM_CAP_XCRS +Architectures: x86 +Type: vcpu ioctl +Parameters: struct kvm_xcrs (in) +Returns: 0 on success, -1 on error + +struct kvm_xcr { + __u32 xcr; + __u32 reserved; + __u64 value; +}; + +struct kvm_xcrs { + __u32 nr_xcrs; + __u32 flags; + struct kvm_xcr xcrs[KVM_MAX_XCRS]; + __u64 padding[16]; +}; + +This ioctl would set vcpu's xcr to the value userspace specified. + +4.46 KVM_GET_SUPPORTED_CPUID + +Capability: KVM_CAP_EXT_CPUID +Architectures: x86 +Type: system ioctl +Parameters: struct kvm_cpuid2 (in/out) +Returns: 0 on success, -1 on error + +struct kvm_cpuid2 { + __u32 nent; + __u32 padding; + struct kvm_cpuid_entry2 entries[0]; +}; + +#define KVM_CPUID_FLAG_SIGNIFCANT_INDEX 1 +#define KVM_CPUID_FLAG_STATEFUL_FUNC 2 +#define KVM_CPUID_FLAG_STATE_READ_NEXT 4 + +struct kvm_cpuid_entry2 { + __u32 function; + __u32 index; + __u32 flags; + __u32 eax; + __u32 ebx; + __u32 ecx; + __u32 edx; + __u32 padding[3]; +}; + +This ioctl returns x86 cpuid features which are supported by both the hardware +and kvm. Userspace can use the information returned by this ioctl to +construct cpuid information (for KVM_SET_CPUID2) that is consistent with +hardware, kernel, and userspace capabilities, and with user requirements (for +example, the user may wish to constrain cpuid to emulate older hardware, +or for feature consistency across a cluster). + +Userspace invokes KVM_GET_SUPPORTED_CPUID by passing a kvm_cpuid2 structure +with the 'nent' field indicating the number of entries in the variable-size +array 'entries'. If the number of entries is too low to describe the cpu +capabilities, an error (E2BIG) is returned. If the number is too high, +the 'nent' field is adjusted and an error (ENOMEM) is returned. If the +number is just right, the 'nent' field is adjusted to the number of valid +entries in the 'entries' array, which is then filled. + +The entries returned are the host cpuid as returned by the cpuid instruction, +with unknown or unsupported features masked out. Some features (for example, +x2apic), may not be present in the host cpu, but are exposed by kvm if it can +emulate them efficiently. The fields in each entry are defined as follows: + + function: the eax value used to obtain the entry + index: the ecx value used to obtain the entry (for entries that are + affected by ecx) + flags: an OR of zero or more of the following: + KVM_CPUID_FLAG_SIGNIFCANT_INDEX: + if the index field is valid + KVM_CPUID_FLAG_STATEFUL_FUNC: + if cpuid for this function returns different values for successive + invocations; there will be several entries with the same function, + all with this flag set + KVM_CPUID_FLAG_STATE_READ_NEXT: + for KVM_CPUID_FLAG_STATEFUL_FUNC entries, set if this entry is + the first entry to be read by a cpu + eax, ebx, ecx, edx: the values returned by the cpuid instruction for + this function/index combination + +4.47 KVM_PPC_GET_PVINFO + +Capability: KVM_CAP_PPC_GET_PVINFO +Architectures: ppc +Type: vm ioctl +Parameters: struct kvm_ppc_pvinfo (out) +Returns: 0 on success, !0 on error + +struct kvm_ppc_pvinfo { + __u32 flags; + __u32 hcall[4]; + __u8 pad[108]; +}; + +This ioctl fetches PV specific information that need to be passed to the guest +using the device tree or other means from vm context. + +For now the only implemented piece of information distributed here is an array +of 4 instructions that make up a hypercall. + +If any additional field gets added to this structure later on, a bit for that +additional piece of information will be set in the flags bitmap. + +4.48 KVM_ASSIGN_PCI_DEVICE + +Capability: KVM_CAP_DEVICE_ASSIGNMENT +Architectures: x86 ia64 +Type: vm ioctl +Parameters: struct kvm_assigned_pci_dev (in) +Returns: 0 on success, -1 on error + +Assigns a host PCI device to the VM. + +struct kvm_assigned_pci_dev { + __u32 assigned_dev_id; + __u32 busnr; + __u32 devfn; + __u32 flags; + __u32 segnr; + union { + __u32 reserved[11]; + }; +}; + +The PCI device is specified by the triple segnr, busnr, and devfn. +Identification in succeeding service requests is done via assigned_dev_id. The +following flags are specified: + +/* Depends on KVM_CAP_IOMMU */ +#define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) + +4.49 KVM_DEASSIGN_PCI_DEVICE + +Capability: KVM_CAP_DEVICE_DEASSIGNMENT +Architectures: x86 ia64 +Type: vm ioctl +Parameters: struct kvm_assigned_pci_dev (in) +Returns: 0 on success, -1 on error + +Ends PCI device assignment, releasing all associated resources. + +See KVM_CAP_DEVICE_ASSIGNMENT for the data structure. Only assigned_dev_id is +used in kvm_assigned_pci_dev to identify the device. + +4.50 KVM_ASSIGN_DEV_IRQ + +Capability: KVM_CAP_ASSIGN_DEV_IRQ +Architectures: x86 ia64 +Type: vm ioctl +Parameters: struct kvm_assigned_irq (in) +Returns: 0 on success, -1 on error + +Assigns an IRQ to a passed-through device. + +struct kvm_assigned_irq { + __u32 assigned_dev_id; + __u32 host_irq; + __u32 guest_irq; + __u32 flags; + union { + struct { + __u32 addr_lo; + __u32 addr_hi; + __u32 data; + } guest_msi; + __u32 reserved[12]; + }; +}; + +The following flags are defined: + +#define KVM_DEV_IRQ_HOST_INTX (1 << 0) +#define KVM_DEV_IRQ_HOST_MSI (1 << 1) +#define KVM_DEV_IRQ_HOST_MSIX (1 << 2) + +#define KVM_DEV_IRQ_GUEST_INTX (1 << 8) +#define KVM_DEV_IRQ_GUEST_MSI (1 << 9) +#define KVM_DEV_IRQ_GUEST_MSIX (1 << 10) + +It is not valid to specify multiple types per host or guest IRQ. However, the +IRQ type of host and guest can differ or can even be null. + +4.51 KVM_DEASSIGN_DEV_IRQ + +Capability: KVM_CAP_ASSIGN_DEV_IRQ +Architectures: x86 ia64 +Type: vm ioctl +Parameters: struct kvm_assigned_irq (in) +Returns: 0 on success, -1 on error + +Ends an IRQ assignment to a passed-through device. + +See KVM_ASSIGN_DEV_IRQ for the data structure. The target device is specified +by assigned_dev_id, flags must correspond to the IRQ type specified on +KVM_ASSIGN_DEV_IRQ. Partial deassignment of host or guest IRQ is allowed. + +4.52 KVM_SET_GSI_ROUTING + +Capability: KVM_CAP_IRQ_ROUTING +Architectures: x86 ia64 +Type: vm ioctl +Parameters: struct kvm_irq_routing (in) +Returns: 0 on success, -1 on error + +Sets the GSI routing table entries, overwriting any previously set entries. + +struct kvm_irq_routing { + __u32 nr; + __u32 flags; + struct kvm_irq_routing_entry entries[0]; +}; + +No flags are specified so far, the corresponding field must be set to zero. + +struct kvm_irq_routing_entry { + __u32 gsi; + __u32 type; + __u32 flags; + __u32 pad; + union { + struct kvm_irq_routing_irqchip irqchip; + struct kvm_irq_routing_msi msi; + __u32 pad[8]; + } u; +}; + +/* gsi routing entry types */ +#define KVM_IRQ_ROUTING_IRQCHIP 1 +#define KVM_IRQ_ROUTING_MSI 2 + +No flags are specified so far, the corresponding field must be set to zero. + +struct kvm_irq_routing_irqchip { + __u32 irqchip; + __u32 pin; +}; + +struct kvm_irq_routing_msi { + __u32 address_lo; + __u32 address_hi; + __u32 data; + __u32 pad; +}; + +4.53 KVM_ASSIGN_SET_MSIX_NR + +Capability: KVM_CAP_DEVICE_MSIX +Architectures: x86 ia64 +Type: vm ioctl +Parameters: struct kvm_assigned_msix_nr (in) +Returns: 0 on success, -1 on error + +Set the number of MSI-X interrupts for an assigned device. This service can +only be called once in the lifetime of an assigned device. + +struct kvm_assigned_msix_nr { + __u32 assigned_dev_id; + __u16 entry_nr; + __u16 padding; +}; + +#define KVM_MAX_MSIX_PER_DEV 256 + +4.54 KVM_ASSIGN_SET_MSIX_ENTRY + +Capability: KVM_CAP_DEVICE_MSIX +Architectures: x86 ia64 +Type: vm ioctl +Parameters: struct kvm_assigned_msix_entry (in) +Returns: 0 on success, -1 on error + +Specifies the routing of an MSI-X assigned device interrupt to a GSI. Setting +the GSI vector to zero means disabling the interrupt. + +struct kvm_assigned_msix_entry { + __u32 assigned_dev_id; + __u32 gsi; + __u16 entry; /* The index of entry in the MSI-X table */ + __u16 padding[3]; +}; + +5. The kvm_run structure + +Application code obtains a pointer to the kvm_run structure by +mmap()ing a vcpu fd. From that point, application code can control +execution by changing fields in kvm_run prior to calling the KVM_RUN +ioctl, and obtain information about the reason KVM_RUN returned by +looking up structure members. + +struct kvm_run { + /* in */ + __u8 request_interrupt_window; + +Request that KVM_RUN return when it becomes possible to inject external +interrupts into the guest. Useful in conjunction with KVM_INTERRUPT. + + __u8 padding1[7]; + + /* out */ + __u32 exit_reason; + +When KVM_RUN has returned successfully (return value 0), this informs +application code why KVM_RUN has returned. Allowable values for this +field are detailed below. + + __u8 ready_for_interrupt_injection; + +If request_interrupt_window has been specified, this field indicates +an interrupt can be injected now with KVM_INTERRUPT. + + __u8 if_flag; + +The value of the current interrupt flag. Only valid if in-kernel +local APIC is not used. + + __u8 padding2[2]; + + /* in (pre_kvm_run), out (post_kvm_run) */ + __u64 cr8; + +The value of the cr8 register. Only valid if in-kernel local APIC is +not used. Both input and output. + + __u64 apic_base; + +The value of the APIC BASE msr. Only valid if in-kernel local +APIC is not used. Both input and output. + + union { + /* KVM_EXIT_UNKNOWN */ + struct { + __u64 hardware_exit_reason; + } hw; + +If exit_reason is KVM_EXIT_UNKNOWN, the vcpu has exited due to unknown +reasons. Further architecture-specific information is available in +hardware_exit_reason. + + /* KVM_EXIT_FAIL_ENTRY */ + struct { + __u64 hardware_entry_failure_reason; + } fail_entry; + +If exit_reason is KVM_EXIT_FAIL_ENTRY, the vcpu could not be run due +to unknown reasons. Further architecture-specific information is +available in hardware_entry_failure_reason. + + /* KVM_EXIT_EXCEPTION */ + struct { + __u32 exception; + __u32 error_code; + } ex; + +Unused. + + /* KVM_EXIT_IO */ + struct { +#define KVM_EXIT_IO_IN 0 +#define KVM_EXIT_IO_OUT 1 + __u8 direction; + __u8 size; /* bytes */ + __u16 port; + __u32 count; + __u64 data_offset; /* relative to kvm_run start */ + } io; + +If exit_reason is KVM_EXIT_IO, then the vcpu has +executed a port I/O instruction which could not be satisfied by kvm. +data_offset describes where the data is located (KVM_EXIT_IO_OUT) or +where kvm expects application code to place the data for the next +KVM_RUN invocation (KVM_EXIT_IO_IN). Data format is a packed array. + + struct { + struct kvm_debug_exit_arch arch; + } debug; + +Unused. + + /* KVM_EXIT_MMIO */ + struct { + __u64 phys_addr; + __u8 data[8]; + __u32 len; + __u8 is_write; + } mmio; + +If exit_reason is KVM_EXIT_MMIO, then the vcpu has +executed a memory-mapped I/O instruction which could not be satisfied +by kvm. The 'data' member contains the written data if 'is_write' is +true, and should be filled by application code otherwise. + +NOTE: For KVM_EXIT_IO, KVM_EXIT_MMIO and KVM_EXIT_OSI, the corresponding +operations are complete (and guest state is consistent) only after userspace +has re-entered the kernel with KVM_RUN. The kernel side will first finish +incomplete operations and then check for pending signals. Userspace +can re-enter the guest with an unmasked signal pending to complete +pending operations. + + /* KVM_EXIT_HYPERCALL */ + struct { + __u64 nr; + __u64 args[6]; + __u64 ret; + __u32 longmode; + __u32 pad; + } hypercall; + +Unused. This was once used for 'hypercall to userspace'. To implement +such functionality, use KVM_EXIT_IO (x86) or KVM_EXIT_MMIO (all except s390). +Note KVM_EXIT_IO is significantly faster than KVM_EXIT_MMIO. + + /* KVM_EXIT_TPR_ACCESS */ + struct { + __u64 rip; + __u32 is_write; + __u32 pad; + } tpr_access; + +To be documented (KVM_TPR_ACCESS_REPORTING). + + /* KVM_EXIT_S390_SIEIC */ + struct { + __u8 icptcode; + __u64 mask; /* psw upper half */ + __u64 addr; /* psw lower half */ + __u16 ipa; + __u32 ipb; + } s390_sieic; + +s390 specific. + + /* KVM_EXIT_S390_RESET */ +#define KVM_S390_RESET_POR 1 +#define KVM_S390_RESET_CLEAR 2 +#define KVM_S390_RESET_SUBSYSTEM 4 +#define KVM_S390_RESET_CPU_INIT 8 +#define KVM_S390_RESET_IPL 16 + __u64 s390_reset_flags; + +s390 specific. + + /* KVM_EXIT_DCR */ + struct { + __u32 dcrn; + __u32 data; + __u8 is_write; + } dcr; + +powerpc specific. + + /* KVM_EXIT_OSI */ + struct { + __u64 gprs[32]; + } osi; + +MOL uses a special hypercall interface it calls 'OSI'. To enable it, we catch +hypercalls and exit with this exit struct that contains all the guest gprs. + +If exit_reason is KVM_EXIT_OSI, then the vcpu has triggered such a hypercall. +Userspace can now handle the hypercall and when it's done modify the gprs as +necessary. Upon guest entry all guest GPRs will then be replaced by the values +in this struct. + + /* Fix the size of the union. */ + char padding[256]; + }; +}; diff --git a/Documentation/virtual/kvm/cpuid.txt b/Documentation/virtual/kvm/cpuid.txt new file mode 100644 index 0000000..8820685 --- /dev/null +++ b/Documentation/virtual/kvm/cpuid.txt @@ -0,0 +1,45 @@ +KVM CPUID bits +Glauber Costa , Red Hat Inc, 2010 +===================================================== + +A guest running on a kvm host, can check some of its features using +cpuid. This is not always guaranteed to work, since userspace can +mask-out some, or even all KVM-related cpuid features before launching +a guest. + +KVM cpuid functions are: + +function: KVM_CPUID_SIGNATURE (0x40000000) +returns : eax = 0, + ebx = 0x4b4d564b, + ecx = 0x564b4d56, + edx = 0x4d. +Note that this value in ebx, ecx and edx corresponds to the string "KVMKVMKVM". +This function queries the presence of KVM cpuid leafs. + + +function: define KVM_CPUID_FEATURES (0x40000001) +returns : ebx, ecx, edx = 0 + eax = and OR'ed group of (1 << flag), where each flags is: + + +flag || value || meaning +============================================================================= +KVM_FEATURE_CLOCKSOURCE || 0 || kvmclock available at msrs + || || 0x11 and 0x12. +------------------------------------------------------------------------------ +KVM_FEATURE_NOP_IO_DELAY || 1 || not necessary to perform delays + || || on PIO operations. +------------------------------------------------------------------------------ +KVM_FEATURE_MMU_OP || 2 || deprecated. +------------------------------------------------------------------------------ +KVM_FEATURE_CLOCKSOURCE2 || 3 || kvmclock available at msrs + || || 0x4b564d00 and 0x4b564d01 +------------------------------------------------------------------------------ +KVM_FEATURE_ASYNC_PF || 4 || async pf can be enabled by + || || writing to msr 0x4b564d02 +------------------------------------------------------------------------------ +KVM_FEATURE_CLOCKSOURCE_STABLE_BIT || 24 || host will warn if no guest-side + || || per-cpu warps are expected in + || || kvmclock. +------------------------------------------------------------------------------ diff --git a/Documentation/virtual/kvm/locking.txt b/Documentation/virtual/kvm/locking.txt new file mode 100644 index 0000000..3b4cd3b --- /dev/null +++ b/Documentation/virtual/kvm/locking.txt @@ -0,0 +1,25 @@ +KVM Lock Overview +================= + +1. Acquisition Orders +--------------------- + +(to be written) + +2. Reference +------------ + +Name: kvm_lock +Type: raw_spinlock +Arch: any +Protects: - vm_list + - hardware virtualization enable/disable +Comment: 'raw' because hardware enabling/disabling must be atomic /wrt + migration. + +Name: kvm_arch::tsc_write_lock +Type: raw_spinlock +Arch: x86 +Protects: - kvm_arch::{last_tsc_write,last_tsc_nsec,last_tsc_offset} + - tsc offset in vmcb +Comment: 'raw' because updating the tsc offsets must not be preempted. diff --git a/Documentation/virtual/kvm/mmu.txt b/Documentation/virtual/kvm/mmu.txt new file mode 100644 index 0000000..f46aa58 --- /dev/null +++ b/Documentation/virtual/kvm/mmu.txt @@ -0,0 +1,348 @@ +The x86 kvm shadow mmu +====================== + +The mmu (in arch/x86/kvm, files mmu.[ch] and paging_tmpl.h) is responsible +for presenting a standard x86 mmu to the guest, while translating guest +physical addresses to host physical addresses. + +The mmu code attempts to satisfy the following requirements: + +- correctness: the guest should not be able to determine that it is running + on an emulated mmu except for timing (we attempt to comply + with the specification, not emulate the characteristics of + a particular implementation such as tlb size) +- security: the guest must not be able to touch host memory not assigned + to it +- performance: minimize the performance penalty imposed by the mmu +- scaling: need to scale to large memory and large vcpu guests +- hardware: support the full range of x86 virtualization hardware +- integration: Linux memory management code must be in control of guest memory + so that swapping, page migration, page merging, transparent + hugepages, and similar features work without change +- dirty tracking: report writes to guest memory to enable live migration + and framebuffer-based displays +- footprint: keep the amount of pinned kernel memory low (most memory + should be shrinkable) +- reliability: avoid multipage or GFP_ATOMIC allocations + +Acronyms +======== + +pfn host page frame number +hpa host physical address +hva host virtual address +gfn guest frame number +gpa guest physical address +gva guest virtual address +ngpa nested guest physical address +ngva nested guest virtual address +pte page table entry (used also to refer generically to paging structure + entries) +gpte guest pte (referring to gfns) +spte shadow pte (referring to pfns) +tdp two dimensional paging (vendor neutral term for NPT and EPT) + +Virtual and real hardware supported +=================================== + +The mmu supports first-generation mmu hardware, which allows an atomic switch +of the current paging mode and cr3 during guest entry, as well as +two-dimensional paging (AMD's NPT and Intel's EPT). The emulated hardware +it exposes is the traditional 2/3/4 level x86 mmu, with support for global +pages, pae, pse, pse36, cr0.wp, and 1GB pages. Work is in progress to support +exposing NPT capable hardware on NPT capable hosts. + +Translation +=========== + +The primary job of the mmu is to program the processor's mmu to translate +addresses for the guest. Different translations are required at different +times: + +- when guest paging is disabled, we translate guest physical addresses to + host physical addresses (gpa->hpa) +- when guest paging is enabled, we translate guest virtual addresses, to + guest physical addresses, to host physical addresses (gva->gpa->hpa) +- when the guest launches a guest of its own, we translate nested guest + virtual addresses, to nested guest physical addresses, to guest physical + addresses, to host physical addresses (ngva->ngpa->gpa->hpa) + +The primary challenge is to encode between 1 and 3 translations into hardware +that support only 1 (traditional) and 2 (tdp) translations. When the +number of required translations matches the hardware, the mmu operates in +direct mode; otherwise it operates in shadow mode (see below). + +Memory +====== + +Guest memory (gpa) is part of the user address space of the process that is +using kvm. Userspace defines the translation between guest addresses and user +addresses (gpa->hva); note that two gpas may alias to the same hva, but not +vice versa. + +These hvas may be backed using any method available to the host: anonymous +memory, file backed memory, and device memory. Memory might be paged by the +host at any time. + +Events +====== + +The mmu is driven by events, some from the guest, some from the host. + +Guest generated events: +- writes to control registers (especially cr3) +- invlpg/invlpga instruction execution +- access to missing or protected translations + +Host generated events: +- changes in the gpa->hpa translation (either through gpa->hva changes or + through hva->hpa changes) +- memory pressure (the shrinker) + +Shadow pages +============ + +The principal data structure is the shadow page, 'struct kvm_mmu_page'. A +shadow page contains 512 sptes, which can be either leaf or nonleaf sptes. A +shadow page may contain a mix of leaf and nonleaf sptes. + +A nonleaf spte allows the hardware mmu to reach the leaf pages and +is not related to a translation directly. It points to other shadow pages. + +A leaf spte corresponds to either one or two translations encoded into +one paging structure entry. These are always the lowest level of the +translation stack, with optional higher level translations left to NPT/EPT. +Leaf ptes point at guest pages. + +The following table shows translations encoded by leaf ptes, with higher-level +translations in parentheses: + + Non-nested guests: + nonpaging: gpa->hpa + paging: gva->gpa->hpa + paging, tdp: (gva->)gpa->hpa + Nested guests: + non-tdp: ngva->gpa->hpa (*) + tdp: (ngva->)ngpa->gpa->hpa + +(*) the guest hypervisor will encode the ngva->gpa translation into its page + tables if npt is not present + +Shadow pages contain the following information: + role.level: + The level in the shadow paging hierarchy that this shadow page belongs to. + 1=4k sptes, 2=2M sptes, 3=1G sptes, etc. + role.direct: + If set, leaf sptes reachable from this page are for a linear range. + Examples include real mode translation, large guest pages backed by small + host pages, and gpa->hpa translations when NPT or EPT is active. + The linear range starts at (gfn << PAGE_SHIFT) and its size is determined + by role.level (2MB for first level, 1GB for second level, 0.5TB for third + level, 256TB for fourth level) + If clear, this page corresponds to a guest page table denoted by the gfn + field. + role.quadrant: + When role.cr4_pae=0, the guest uses 32-bit gptes while the host uses 64-bit + sptes. That means a guest page table contains more ptes than the host, + so multiple shadow pages are needed to shadow one guest page. + For first-level shadow pages, role.quadrant can be 0 or 1 and denotes the + first or second 512-gpte block in the guest page table. For second-level + page tables, each 32-bit gpte is converted to two 64-bit sptes + (since each first-level guest page is shadowed by two first-level + shadow pages) so role.quadrant takes values in the range 0..3. Each + quadrant maps 1GB virtual address space. + role.access: + Inherited guest access permissions in the form uwx. Note execute + permission is positive, not negative. + role.invalid: + The page is invalid and should not be used. It is a root page that is + currently pinned (by a cpu hardware register pointing to it); once it is + unpinned it will be destroyed. + role.cr4_pae: + Contains the value of cr4.pae for which the page is valid (e.g. whether + 32-bit or 64-bit gptes are in use). + role.nxe: + Contains the value of efer.nxe for which the page is valid. + role.cr0_wp: + Contains the value of cr0.wp for which the page is valid. + gfn: + Either the guest page table containing the translations shadowed by this + page, or the base page frame for linear translations. See role.direct. + spt: + A pageful of 64-bit sptes containing the translations for this page. + Accessed by both kvm and hardware. + The page pointed to by spt will have its page->private pointing back + at the shadow page structure. + sptes in spt point either at guest pages, or at lower-level shadow pages. + Specifically, if sp1 and sp2 are shadow pages, then sp1->spt[n] may point + at __pa(sp2->spt). sp2 will point back at sp1 through parent_pte. + The spt array forms a DAG structure with the shadow page as a node, and + guest pages as leaves. + gfns: + An array of 512 guest frame numbers, one for each present pte. Used to + perform a reverse map from a pte to a gfn. When role.direct is set, any + element of this array can be calculated from the gfn field when used, in + this case, the array of gfns is not allocated. See role.direct and gfn. + slot_bitmap: + A bitmap containing one bit per memory slot. If the page contains a pte + mapping a page from memory slot n, then bit n of slot_bitmap will be set + (if a page is aliased among several slots, then it is not guaranteed that + all slots will be marked). + Used during dirty logging to avoid scanning a shadow page if none if its + pages need tracking. + root_count: + A counter keeping track of how many hardware registers (guest cr3 or + pdptrs) are now pointing at the page. While this counter is nonzero, the + page cannot be destroyed. See role.invalid. + multimapped: + Whether there exist multiple sptes pointing at this page. + parent_pte/parent_ptes: + If multimapped is zero, parent_pte points at the single spte that points at + this page's spt. Otherwise, parent_ptes points at a data structure + with a list of parent_ptes. + unsync: + If true, then the translations in this page may not match the guest's + translation. This is equivalent to the state of the tlb when a pte is + changed but before the tlb entry is flushed. Accordingly, unsync ptes + are synchronized when the guest executes invlpg or flushes its tlb by + other means. Valid for leaf pages. + unsync_children: + How many sptes in the page point at pages that are unsync (or have + unsynchronized children). + unsync_child_bitmap: + A bitmap indicating which sptes in spt point (directly or indirectly) at + pages that may be unsynchronized. Used to quickly locate all unsychronized + pages reachable from a given page. + +Reverse map +=========== + +The mmu maintains a reverse mapping whereby all ptes mapping a page can be +reached given its gfn. This is used, for example, when swapping out a page. + +Synchronized and unsynchronized pages +===================================== + +The guest uses two events to synchronize its tlb and page tables: tlb flushes +and page invalidations (invlpg). + +A tlb flush means that we need to synchronize all sptes reachable from the +guest's cr3. This is expensive, so we keep all guest page tables write +protected, and synchronize sptes to gptes when a gpte is written. + +A special case is when a guest page table is reachable from the current +guest cr3. In this case, the guest is obliged to issue an invlpg instruction +before using the translation. We take advantage of that by removing write +protection from the guest page, and allowing the guest to modify it freely. +We synchronize modified gptes when the guest invokes invlpg. This reduces +the amount of emulation we have to do when the guest modifies multiple gptes, +or when the a guest page is no longer used as a page table and is used for +random guest data. + +As a side effect we have to resynchronize all reachable unsynchronized shadow +pages on a tlb flush. + + +Reaction to events +================== + +- guest page fault (or npt page fault, or ept violation) + +This is the most complicated event. The cause of a page fault can be: + + - a true guest fault (the guest translation won't allow the access) (*) + - access to a missing translation + - access to a protected translation + - when logging dirty pages, memory is write protected + - synchronized shadow pages are write protected (*) + - access to untranslatable memory (mmio) + + (*) not applicable in direct mode + +Handling a page fault is performed as follows: + + - if needed, walk the guest page tables to determine the guest translation + (gva->gpa or ngpa->gpa) + - if permissions are insufficient, reflect the fault back to the guest + - determine the host page + - if this is an mmio request, there is no host page; call the emulator + to emulate the instruction instead + - walk the shadow page table to find the spte for the translation, + instantiating missing intermediate page tables as necessary + - try to unsynchronize the page + - if successful, we can let the guest continue and modify the gpte + - emulate the instruction + - if failed, unshadow the page and let the guest continue + - update any translations that were modified by the instruction + +invlpg handling: + + - walk the shadow page hierarchy and drop affected translations + - try to reinstantiate the indicated translation in the hope that the + guest will use it in the near future + +Guest control register updates: + +- mov to cr3 + - look up new shadow roots + - synchronize newly reachable shadow pages + +- mov to cr0/cr4/efer + - set up mmu context for new paging mode + - look up new shadow roots + - synchronize newly reachable shadow pages + +Host translation updates: + + - mmu notifier called with updated hva + - look up affected sptes through reverse map + - drop (or update) translations + +Emulating cr0.wp +================ + +If tdp is not enabled, the host must keep cr0.wp=1 so page write protection +works for the guest kernel, not guest guest userspace. When the guest +cr0.wp=1, this does not present a problem. However when the guest cr0.wp=0, +we cannot map the permissions for gpte.u=1, gpte.w=0 to any spte (the +semantics require allowing any guest kernel access plus user read access). + +We handle this by mapping the permissions to two possible sptes, depending +on fault type: + +- kernel write fault: spte.u=0, spte.w=1 (allows full kernel access, + disallows user access) +- read fault: spte.u=1, spte.w=0 (allows full read access, disallows kernel + write access) + +(user write faults generate a #PF) + +Large pages +=========== + +The mmu supports all combinations of large and small guest and host pages. +Supported page sizes include 4k, 2M, 4M, and 1G. 4M pages are treated as +two separate 2M pages, on both guest and host, since the mmu always uses PAE +paging. + +To instantiate a large spte, four constraints must be satisfied: + +- the spte must point to a large host page +- the guest pte must be a large pte of at least equivalent size (if tdp is + enabled, there is no guest pte and this condition is satisified) +- if the spte will be writeable, the large page frame may not overlap any + write-protected pages +- the guest page must be wholly contained by a single memory slot + +To check the last two conditions, the mmu maintains a ->write_count set of +arrays for each memory slot and large page size. Every write protected page +causes its write_count to be incremented, thus preventing instantiation of +a large spte. The frames at the end of an unaligned memory slot have +artificically inflated ->write_counts so they can never be instantiated. + +Further reading +=============== + +- NPT presentation from KVM Forum 2008 + http://www.linux-kvm.org/wiki/images/c/c8/KvmForum2008%24kdf2008_21.pdf + diff --git a/Documentation/virtual/kvm/msr.txt b/Documentation/virtual/kvm/msr.txt new file mode 100644 index 0000000..d079aed --- /dev/null +++ b/Documentation/virtual/kvm/msr.txt @@ -0,0 +1,187 @@ +KVM-specific MSRs. +Glauber Costa , Red Hat Inc, 2010 +===================================================== + +KVM makes use of some custom MSRs to service some requests. + +Custom MSRs have a range reserved for them, that goes from +0x4b564d00 to 0x4b564dff. There are MSRs outside this area, +but they are deprecated and their use is discouraged. + +Custom MSR list +-------- + +The current supported Custom MSR list is: + +MSR_KVM_WALL_CLOCK_NEW: 0x4b564d00 + + data: 4-byte alignment physical address of a memory area which must be + in guest RAM. This memory is expected to hold a copy of the following + structure: + + struct pvclock_wall_clock { + u32 version; + u32 sec; + u32 nsec; + } __attribute__((__packed__)); + + whose data will be filled in by the hypervisor. The hypervisor is only + guaranteed to update this data at the moment of MSR write. + Users that want to reliably query this information more than once have + to write more than once to this MSR. Fields have the following meanings: + + version: guest has to check version before and after grabbing + time information and check that they are both equal and even. + An odd version indicates an in-progress update. + + sec: number of seconds for wallclock. + + nsec: number of nanoseconds for wallclock. + + Note that although MSRs are per-CPU entities, the effect of this + particular MSR is global. + + Availability of this MSR must be checked via bit 3 in 0x4000001 cpuid + leaf prior to usage. + +MSR_KVM_SYSTEM_TIME_NEW: 0x4b564d01 + + data: 4-byte aligned physical address of a memory area which must be in + guest RAM, plus an enable bit in bit 0. This memory is expected to hold + a copy of the following structure: + + struct pvclock_vcpu_time_info { + u32 version; + u32 pad0; + u64 tsc_timestamp; + u64 system_time; + u32 tsc_to_system_mul; + s8 tsc_shift; + u8 flags; + u8 pad[2]; + } __attribute__((__packed__)); /* 32 bytes */ + + whose data will be filled in by the hypervisor periodically. Only one + write, or registration, is needed for each VCPU. The interval between + updates of this structure is arbitrary and implementation-dependent. + The hypervisor may update this structure at any time it sees fit until + anything with bit0 == 0 is written to it. + + Fields have the following meanings: + + version: guest has to check version before and after grabbing + time information and check that they are both equal and even. + An odd version indicates an in-progress update. + + tsc_timestamp: the tsc value at the current VCPU at the time + of the update of this structure. Guests can subtract this value + from current tsc to derive a notion of elapsed time since the + structure update. + + system_time: a host notion of monotonic time, including sleep + time at the time this structure was last updated. Unit is + nanoseconds. + + tsc_to_system_mul: a function of the tsc frequency. One has + to multiply any tsc-related quantity by this value to get + a value in nanoseconds, besides dividing by 2^tsc_shift + + tsc_shift: cycle to nanosecond divider, as a power of two, to + allow for shift rights. One has to shift right any tsc-related + quantity by this value to get a value in nanoseconds, besides + multiplying by tsc_to_system_mul. + + With this information, guests can derive per-CPU time by + doing: + + time = (current_tsc - tsc_timestamp) + time = (time * tsc_to_system_mul) >> tsc_shift + time = time + system_time + + flags: bits in this field indicate extended capabilities + coordinated between the guest and the hypervisor. Availability + of specific flags has to be checked in 0x40000001 cpuid leaf. + Current flags are: + + flag bit | cpuid bit | meaning + ------------------------------------------------------------- + | | time measures taken across + 0 | 24 | multiple cpus are guaranteed to + | | be monotonic + ------------------------------------------------------------- + + Availability of this MSR must be checked via bit 3 in 0x4000001 cpuid + leaf prior to usage. + + +MSR_KVM_WALL_CLOCK: 0x11 + + data and functioning: same as MSR_KVM_WALL_CLOCK_NEW. Use that instead. + + This MSR falls outside the reserved KVM range and may be removed in the + future. Its usage is deprecated. + + Availability of this MSR must be checked via bit 0 in 0x4000001 cpuid + leaf prior to usage. + +MSR_KVM_SYSTEM_TIME: 0x12 + + data and functioning: same as MSR_KVM_SYSTEM_TIME_NEW. Use that instead. + + This MSR falls outside the reserved KVM range and may be removed in the + future. Its usage is deprecated. + + Availability of this MSR must be checked via bit 0 in 0x4000001 cpuid + leaf prior to usage. + + The suggested algorithm for detecting kvmclock presence is then: + + if (!kvm_para_available()) /* refer to cpuid.txt */ + return NON_PRESENT; + + flags = cpuid_eax(0x40000001); + if (flags & 3) { + msr_kvm_system_time = MSR_KVM_SYSTEM_TIME_NEW; + msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK_NEW; + return PRESENT; + } else if (flags & 0) { + msr_kvm_system_time = MSR_KVM_SYSTEM_TIME; + msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK; + return PRESENT; + } else + return NON_PRESENT; + +MSR_KVM_ASYNC_PF_EN: 0x4b564d02 + data: Bits 63-6 hold 64-byte aligned physical address of a + 64 byte memory area which must be in guest RAM and must be + zeroed. Bits 5-2 are reserved and should be zero. Bit 0 is 1 + when asynchronous page faults are enabled on the vcpu 0 when + disabled. Bit 2 is 1 if asynchronous page faults can be injected + when vcpu is in cpl == 0. + + First 4 byte of 64 byte memory location will be written to by + the hypervisor at the time of asynchronous page fault (APF) + injection to indicate type of asynchronous page fault. Value + of 1 means that the page referred to by the page fault is not + present. Value 2 means that the page is now available. Disabling + interrupt inhibits APFs. Guest must not enable interrupt + before the reason is read, or it may be overwritten by another + APF. Since APF uses the same exception vector as regular page + fault guest must reset the reason to 0 before it does + something that can generate normal page fault. If during page + fault APF reason is 0 it means that this is regular page + fault. + + During delivery of type 1 APF cr2 contains a token that will + be used to notify a guest when missing page becomes + available. When page becomes available type 2 APF is sent with + cr2 set to the token associated with the page. There is special + kind of token 0xffffffff which tells vcpu that it should wake + up all processes waiting for APFs and no individual type 2 APFs + will be sent. + + If APF is disabled while there are outstanding APFs, they will + not be delivered. + + Currently type 2 APF will be always delivered on the same vcpu as + type 1 was, but guest should not rely on that. diff --git a/Documentation/virtual/kvm/ppc-pv.txt b/Documentation/virtual/kvm/ppc-pv.txt new file mode 100644 index 0000000..3ab969c --- /dev/null +++ b/Documentation/virtual/kvm/ppc-pv.txt @@ -0,0 +1,196 @@ +The PPC KVM paravirtual interface +================================= + +The basic execution principle by which KVM on PowerPC works is to run all kernel +space code in PR=1 which is user space. This way we trap all privileged +instructions and can emulate them accordingly. + +Unfortunately that is also the downfall. There are quite some privileged +instructions that needlessly return us to the hypervisor even though they +could be handled differently. + +This is what the PPC PV interface helps with. It takes privileged instructions +and transforms them into unprivileged ones with some help from the hypervisor. +This cuts down virtualization costs by about 50% on some of my benchmarks. + +The code for that interface can be found in arch/powerpc/kernel/kvm* + +Querying for existence +====================== + +To find out if we're running on KVM or not, we leverage the device tree. When +Linux is running on KVM, a node /hypervisor exists. That node contains a +compatible property with the value "linux,kvm". + +Once you determined you're running under a PV capable KVM, you can now use +hypercalls as described below. + +KVM hypercalls +============== + +Inside the device tree's /hypervisor node there's a property called +'hypercall-instructions'. This property contains at most 4 opcodes that make +up the hypercall. To call a hypercall, just call these instructions. + +The parameters are as follows: + + Register IN OUT + + r0 - volatile + r3 1st parameter Return code + r4 2nd parameter 1st output value + r5 3rd parameter 2nd output value + r6 4th parameter 3rd output value + r7 5th parameter 4th output value + r8 6th parameter 5th output value + r9 7th parameter 6th output value + r10 8th parameter 7th output value + r11 hypercall number 8th output value + r12 - volatile + +Hypercall definitions are shared in generic code, so the same hypercall numbers +apply for x86 and powerpc alike with the exception that each KVM hypercall +also needs to be ORed with the KVM vendor code which is (42 << 16). + +Return codes can be as follows: + + Code Meaning + + 0 Success + 12 Hypercall not implemented + <0 Error + +The magic page +============== + +To enable communication between the hypervisor and guest there is a new shared +page that contains parts of supervisor visible register state. The guest can +map this shared page using the KVM hypercall KVM_HC_PPC_MAP_MAGIC_PAGE. + +With this hypercall issued the guest always gets the magic page mapped at the +desired location in effective and physical address space. For now, we always +map the page to -4096. This way we can access it using absolute load and store +functions. The following instruction reads the first field of the magic page: + + ld rX, -4096(0) + +The interface is designed to be extensible should there be need later to add +additional registers to the magic page. If you add fields to the magic page, +also define a new hypercall feature to indicate that the host can give you more +registers. Only if the host supports the additional features, make use of them. + +The magic page has the following layout as described in +arch/powerpc/include/asm/kvm_para.h: + +struct kvm_vcpu_arch_shared { + __u64 scratch1; + __u64 scratch2; + __u64 scratch3; + __u64 critical; /* Guest may not get interrupts if == r1 */ + __u64 sprg0; + __u64 sprg1; + __u64 sprg2; + __u64 sprg3; + __u64 srr0; + __u64 srr1; + __u64 dar; + __u64 msr; + __u32 dsisr; + __u32 int_pending; /* Tells the guest if we have an interrupt */ +}; + +Additions to the page must only occur at the end. Struct fields are always 32 +or 64 bit aligned, depending on them being 32 or 64 bit wide respectively. + +Magic page features +=================== + +When mapping the magic page using the KVM hypercall KVM_HC_PPC_MAP_MAGIC_PAGE, +a second return value is passed to the guest. This second return value contains +a bitmap of available features inside the magic page. + +The following enhancements to the magic page are currently available: + + KVM_MAGIC_FEAT_SR Maps SR registers r/w in the magic page + +For enhanced features in the magic page, please check for the existence of the +feature before using them! + +MSR bits +======== + +The MSR contains bits that require hypervisor intervention and bits that do +not require direct hypervisor intervention because they only get interpreted +when entering the guest or don't have any impact on the hypervisor's behavior. + +The following bits are safe to be set inside the guest: + + MSR_EE + MSR_RI + MSR_CR + MSR_ME + +If any other bit changes in the MSR, please still use mtmsr(d). + +Patched instructions +==================== + +The "ld" and "std" instructions are transormed to "lwz" and "stw" instructions +respectively on 32 bit systems with an added offset of 4 to accommodate for big +endianness. + +The following is a list of mapping the Linux kernel performs when running as +guest. Implementing any of those mappings is optional, as the instruction traps +also act on the shared page. So calling privileged instructions still works as +before. + +From To +==== == + +mfmsr rX ld rX, magic_page->msr +mfsprg rX, 0 ld rX, magic_page->sprg0 +mfsprg rX, 1 ld rX, magic_page->sprg1 +mfsprg rX, 2 ld rX, magic_page->sprg2 +mfsprg rX, 3 ld rX, magic_page->sprg3 +mfsrr0 rX ld rX, magic_page->srr0 +mfsrr1 rX ld rX, magic_page->srr1 +mfdar rX ld rX, magic_page->dar +mfdsisr rX lwz rX, magic_page->dsisr + +mtmsr rX std rX, magic_page->msr +mtsprg 0, rX std rX, magic_page->sprg0 +mtsprg 1, rX std rX, magic_page->sprg1 +mtsprg 2, rX std rX, magic_page->sprg2 +mtsprg 3, rX std rX, magic_page->sprg3 +mtsrr0 rX std rX, magic_page->srr0 +mtsrr1 rX std rX, magic_page->srr1 +mtdar rX std rX, magic_page->dar +mtdsisr rX stw rX, magic_page->dsisr + +tlbsync nop + +mtmsrd rX, 0 b +mtmsr rX b + +mtmsrd rX, 1 b + +[Book3S only] +mtsrin rX, rY b + +[BookE only] +wrteei [0|1] b + + +Some instructions require more logic to determine what's going on than a load +or store instruction can deliver. To enable patching of those, we keep some +RAM around where we can live translate instructions to. What happens is the +following: + + 1) copy emulation code to memory + 2) patch that code to fit the emulated instruction + 3) patch that code to return to the original pc + 4 + 4) patch the original instruction to branch to the new code + +That way we can inject an arbitrary amount of code as replacement for a single +instruction. This allows us to check for pending interrupts when setting EE=1 +for example. diff --git a/Documentation/virtual/kvm/review-checklist.txt b/Documentation/virtual/kvm/review-checklist.txt new file mode 100644 index 0000000..730475a --- /dev/null +++ b/Documentation/virtual/kvm/review-checklist.txt @@ -0,0 +1,38 @@ +Review checklist for kvm patches +================================ + +1. The patch must follow Documentation/CodingStyle and + Documentation/SubmittingPatches. + +2. Patches should be against kvm.git master branch. + +3. If the patch introduces or modifies a new userspace API: + - the API must be documented in Documentation/kvm/api.txt + - the API must be discoverable using KVM_CHECK_EXTENSION + +4. New state must include support for save/restore. + +5. New features must default to off (userspace should explicitly request them). + Performance improvements can and should default to on. + +6. New cpu features should be exposed via KVM_GET_SUPPORTED_CPUID2 + +7. Emulator changes should be accompanied by unit tests for qemu-kvm.git + kvm/test directory. + +8. Changes should be vendor neutral when possible. Changes to common code + are better than duplicating changes to vendor code. + +9. Similarly, prefer changes to arch independent code than to arch dependent + code. + +10. User/kernel interfaces and guest/host interfaces must be 64-bit clean + (all variables and sizes naturally aligned on 64-bit; use specific types + only - u64 rather than ulong). + +11. New guest visible features must either be documented in a hardware manual + or be accompanied by documentation. + +12. Features must be robust against reset and kexec - for example, shared + host/guest memory must be unshared to prevent the host from writing to + guest memory that the guest has not reserved for this purpose. diff --git a/Documentation/virtual/kvm/timekeeping.txt b/Documentation/virtual/kvm/timekeeping.txt new file mode 100644 index 0000000..df894637 --- /dev/null +++ b/Documentation/virtual/kvm/timekeeping.txt @@ -0,0 +1,612 @@ + + Timekeeping Virtualization for X86-Based Architectures + + Zachary Amsden + Copyright (c) 2010, Red Hat. All rights reserved. + +1) Overview +2) Timing Devices +3) TSC Hardware +4) Virtualization Problems + +========================================================================= + +1) Overview + +One of the most complicated parts of the X86 platform, and specifically, +the virtualization of this platform is the plethora of timing devices available +and the complexity of emulating those devices. In addition, virtualization of +time introduces a new set of challenges because it introduces a multiplexed +division of time beyond the control of the guest CPU. + +First, we will describe the various timekeeping hardware available, then +present some of the problems which arise and solutions available, giving +specific recommendations for certain classes of KVM guests. + +The purpose of this document is to collect data and information relevant to +timekeeping which may be difficult to find elsewhere, specifically, +information relevant to KVM and hardware-based virtualization. + +========================================================================= + +2) Timing Devices + +First we discuss the basic hardware devices available. TSC and the related +KVM clock are special enough to warrant a full exposition and are described in +the following section. + +2.1) i8254 - PIT + +One of the first timer devices available is the programmable interrupt timer, +or PIT. The PIT has a fixed frequency 1.193182 MHz base clock and three +channels which can be programmed to deliver periodic or one-shot interrupts. +These three channels can be configured in different modes and have individual +counters. Channel 1 and 2 were not available for general use in the original +IBM PC, and historically were connected to control RAM refresh and the PC +speaker. Now the PIT is typically integrated as part of an emulated chipset +and a separate physical PIT is not used. + +The PIT uses I/O ports 0x40 - 0x43. Access to the 16-bit counters is done +using single or multiple byte access to the I/O ports. There are 6 modes +available, but not all modes are available to all timers, as only timer 2 +has a connected gate input, required for modes 1 and 5. The gate line is +controlled by port 61h, bit 0, as illustrated in the following diagram. + + -------------- ---------------- +| | | | +| 1.1932 MHz |---------->| CLOCK OUT | ---------> IRQ 0 +| Clock | | | | + -------------- | +->| GATE TIMER 0 | + | ---------------- + | + | ---------------- + | | | + |------>| CLOCK OUT | ---------> 66.3 KHZ DRAM + | | | (aka /dev/null) + | +->| GATE TIMER 1 | + | ---------------- + | + | ---------------- + | | | + |------>| CLOCK OUT | ---------> Port 61h, bit 5 + | | | +Port 61h, bit 0 ---------->| GATE TIMER 2 | \_.---- ____ + ---------------- _| )--|LPF|---Speaker + / *---- \___/ +Port 61h, bit 1 -----------------------------------/ + +The timer modes are now described. + +Mode 0: Single Timeout. This is a one-shot software timeout that counts down + when the gate is high (always true for timers 0 and 1). When the count + reaches zero, the output goes high. + +Mode 1: Triggered One-shot. The output is initially set high. When the gate + line is set high, a countdown is initiated (which does not stop if the gate is + lowered), during which the output is set low. When the count reaches zero, + the output goes high. + +Mode 2: Rate Generator. The output is initially set high. When the countdown + reaches 1, the output goes low for one count and then returns high. The value + is reloaded and the countdown automatically resumes. If the gate line goes + low, the count is halted. If the output is low when the gate is lowered, the + output automatically goes high (this only affects timer 2). + +Mode 3: Square Wave. This generates a high / low square wave. The count + determines the length of the pulse, which alternates between high and low + when zero is reached. The count only proceeds when gate is high and is + automatically reloaded on reaching zero. The count is decremented twice at + each clock to generate a full high / low cycle at the full periodic rate. + If the count is even, the clock remains high for N/2 counts and low for N/2 + counts; if the clock is odd, the clock is high for (N+1)/2 counts and low + for (N-1)/2 counts. Only even values are latched by the counter, so odd + values are not observed when reading. This is the intended mode for timer 2, + which generates sine-like tones by low-pass filtering the square wave output. + +Mode 4: Software Strobe. After programming this mode and loading the counter, + the output remains high until the counter reaches zero. Then the output + goes low for 1 clock cycle and returns high. The counter is not reloaded. + Counting only occurs when gate is high. + +Mode 5: Hardware Strobe. After programming and loading the counter, the + output remains high. When the gate is raised, a countdown is initiated + (which does not stop if the gate is lowered). When the counter reaches zero, + the output goes low for 1 clock cycle and then returns high. The counter is + not reloaded. + +In addition to normal binary counting, the PIT supports BCD counting. The +command port, 0x43 is used to set the counter and mode for each of the three +timers. + +PIT commands, issued to port 0x43, using the following bit encoding: + +Bit 7-4: Command (See table below) +Bit 3-1: Mode (000 = Mode 0, 101 = Mode 5, 11X = undefined) +Bit 0 : Binary (0) / BCD (1) + +Command table: + +0000 - Latch Timer 0 count for port 0x40 + sample and hold the count to be read in port 0x40; + additional commands ignored until counter is read; + mode bits ignored. + +0001 - Set Timer 0 LSB mode for port 0x40 + set timer to read LSB only and force MSB to zero; + mode bits set timer mode + +0010 - Set Timer 0 MSB mode for port 0x40 + set timer to read MSB only and force LSB to zero; + mode bits set timer mode + +0011 - Set Timer 0 16-bit mode for port 0x40 + set timer to read / write LSB first, then MSB; + mode bits set timer mode + +0100 - Latch Timer 1 count for port 0x41 - as described above +0101 - Set Timer 1 LSB mode for port 0x41 - as described above +0110 - Set Timer 1 MSB mode for port 0x41 - as described above +0111 - Set Timer 1 16-bit mode for port 0x41 - as described above + +1000 - Latch Timer 2 count for port 0x42 - as described above +1001 - Set Timer 2 LSB mode for port 0x42 - as described above +1010 - Set Timer 2 MSB mode for port 0x42 - as described above +1011 - Set Timer 2 16-bit mode for port 0x42 as described above + +1101 - General counter latch + Latch combination of counters into corresponding ports + Bit 3 = Counter 2 + Bit 2 = Counter 1 + Bit 1 = Counter 0 + Bit 0 = Unused + +1110 - Latch timer status + Latch combination of counter mode into corresponding ports + Bit 3 = Counter 2 + Bit 2 = Counter 1 + Bit 1 = Counter 0 + + The output of ports 0x40-0x42 following this command will be: + + Bit 7 = Output pin + Bit 6 = Count loaded (0 if timer has expired) + Bit 5-4 = Read / Write mode + 01 = MSB only + 10 = LSB only + 11 = LSB / MSB (16-bit) + Bit 3-1 = Mode + Bit 0 = Binary (0) / BCD mode (1) + +2.2) RTC + +The second device which was available in the original PC was the MC146818 real +time clock. The original device is now obsolete, and usually emulated by the +system chipset, sometimes by an HPET and some frankenstein IRQ routing. + +The RTC is accessed through CMOS variables, which uses an index register to +control which bytes are read. Since there is only one index register, read +of the CMOS and read of the RTC require lock protection (in addition, it is +dangerous to allow userspace utilities such as hwclock to have direct RTC +access, as they could corrupt kernel reads and writes of CMOS memory). + +The RTC generates an interrupt which is usually routed to IRQ 8. The interrupt +can function as a periodic timer, an additional once a day alarm, and can issue +interrupts after an update of the CMOS registers by the MC146818 is complete. +The type of interrupt is signalled in the RTC status registers. + +The RTC will update the current time fields by battery power even while the +system is off. The current time fields should not be read while an update is +in progress, as indicated in the status register. + +The clock uses a 32.768kHz crystal, so bits 6-4 of register A should be +programmed to a 32kHz divider if the RTC is to count seconds. + +This is the RAM map originally used for the RTC/CMOS: + +Location Size Description +------------------------------------------ +00h byte Current second (BCD) +01h byte Seconds alarm (BCD) +02h byte Current minute (BCD) +03h byte Minutes alarm (BCD) +04h byte Current hour (BCD) +05h byte Hours alarm (BCD) +06h byte Current day of week (BCD) +07h byte Current day of month (BCD) +08h byte Current month (BCD) +09h byte Current year (BCD) +0Ah byte Register A + bit 7 = Update in progress + bit 6-4 = Divider for clock + 000 = 4.194 MHz + 001 = 1.049 MHz + 010 = 32 kHz + 10X = test modes + 110 = reset / disable + 111 = reset / disable + bit 3-0 = Rate selection for periodic interrupt + 000 = periodic timer disabled + 001 = 3.90625 uS + 010 = 7.8125 uS + 011 = .122070 mS + 100 = .244141 mS + ... + 1101 = 125 mS + 1110 = 250 mS + 1111 = 500 mS +0Bh byte Register B + bit 7 = Run (0) / Halt (1) + bit 6 = Periodic interrupt enable + bit 5 = Alarm interrupt enable + bit 4 = Update-ended interrupt enable + bit 3 = Square wave interrupt enable + bit 2 = BCD calendar (0) / Binary (1) + bit 1 = 12-hour mode (0) / 24-hour mode (1) + bit 0 = 0 (DST off) / 1 (DST enabled) +OCh byte Register C (read only) + bit 7 = interrupt request flag (IRQF) + bit 6 = periodic interrupt flag (PF) + bit 5 = alarm interrupt flag (AF) + bit 4 = update interrupt flag (UF) + bit 3-0 = reserved +ODh byte Register D (read only) + bit 7 = RTC has power + bit 6-0 = reserved +32h byte Current century BCD (*) + (*) location vendor specific and now determined from ACPI global tables + +2.3) APIC + +On Pentium and later processors, an on-board timer is available to each CPU +as part of the Advanced Programmable Interrupt Controller. The APIC is +accessed through memory-mapped registers and provides interrupt service to each +CPU, used for IPIs and local timer interrupts. + +Although in theory the APIC is a safe and stable source for local interrupts, +in practice, many bugs and glitches have occurred due to the special nature of +the APIC CPU-local memory-mapped hardware. Beware that CPU errata may affect +the use of the APIC and that workarounds may be required. In addition, some of +these workarounds pose unique constraints for virtualization - requiring either +extra overhead incurred from extra reads of memory-mapped I/O or additional +functionality that may be more computationally expensive to implement. + +Since the APIC is documented quite well in the Intel and AMD manuals, we will +avoid repetition of the detail here. It should be pointed out that the APIC +timer is programmed through the LVT (local vector timer) register, is capable +of one-shot or periodic operation, and is based on the bus clock divided down +by the programmable divider register. + +2.4) HPET + +HPET is quite complex, and was originally intended to replace the PIT / RTC +support of the X86 PC. It remains to be seen whether that will be the case, as +the de facto standard of PC hardware is to emulate these older devices. Some +systems designated as legacy free may support only the HPET as a hardware timer +device. + +The HPET spec is rather loose and vague, requiring at least 3 hardware timers, +but allowing implementation freedom to support many more. It also imposes no +fixed rate on the timer frequency, but does impose some extremal values on +frequency, error and slew. + +In general, the HPET is recommended as a high precision (compared to PIT /RTC) +time source which is independent of local variation (as there is only one HPET +in any given system). The HPET is also memory-mapped, and its presence is +indicated through ACPI tables by the BIOS. + +Detailed specification of the HPET is beyond the current scope of this +document, as it is also very well documented elsewhere. + +2.5) Offboard Timers + +Several cards, both proprietary (watchdog boards) and commonplace (e1000) have +timing chips built into the cards which may have registers which are accessible +to kernel or user drivers. To the author's knowledge, using these to generate +a clocksource for a Linux or other kernel has not yet been attempted and is in +general frowned upon as not playing by the agreed rules of the game. Such a +timer device would require additional support to be virtualized properly and is +not considered important at this time as no known operating system does this. + +========================================================================= + +3) TSC Hardware + +The TSC or time stamp counter is relatively simple in theory; it counts +instruction cycles issued by the processor, which can be used as a measure of +time. In practice, due to a number of problems, it is the most complicated +timekeeping device to use. + +The TSC is represented internally as a 64-bit MSR which can be read with the +RDMSR, RDTSC, or RDTSCP (when available) instructions. In the past, hardware +limitations made it possible to write the TSC, but generally on old hardware it +was only possible to write the low 32-bits of the 64-bit counter, and the upper +32-bits of the counter were cleared. Now, however, on Intel processors family +0Fh, for models 3, 4 and 6, and family 06h, models e and f, this restriction +has been lifted and all 64-bits are writable. On AMD systems, the ability to +write the TSC MSR is not an architectural guarantee. + +The TSC is accessible from CPL-0 and conditionally, for CPL > 0 software by +means of the CR4.TSD bit, which when enabled, disables CPL > 0 TSC access. + +Some vendors have implemented an additional instruction, RDTSCP, which returns +atomically not just the TSC, but an indicator which corresponds to the +processor number. This can be used to index into an array of TSC variables to +determine offset information in SMP systems where TSCs are not synchronized. +The presence of this instruction must be determined by consulting CPUID feature +bits. + +Both VMX and SVM provide extension fields in the virtualization hardware which +allows the guest visible TSC to be offset by a constant. Newer implementations +promise to allow the TSC to additionally be scaled, but this hardware is not +yet widely available. + +3.1) TSC synchronization + +The TSC is a CPU-local clock in most implementations. This means, on SMP +platforms, the TSCs of different CPUs may start at different times depending +on when the CPUs are powered on. Generally, CPUs on the same die will share +the same clock, however, this is not always the case. + +The BIOS may attempt to resynchronize the TSCs during the poweron process and +the operating system or other system software may attempt to do this as well. +Several hardware limitations make the problem worse - if it is not possible to +write the full 64-bits of the TSC, it may be impossible to match the TSC in +newly arriving CPUs to that of the rest of the system, resulting in +unsynchronized TSCs. This may be done by BIOS or system software, but in +practice, getting a perfectly synchronized TSC will not be possible unless all +values are read from the same clock, which generally only is possible on single +socket systems or those with special hardware support. + +3.2) TSC and CPU hotplug + +As touched on already, CPUs which arrive later than the boot time of the system +may not have a TSC value that is synchronized with the rest of the system. +Either system software, BIOS, or SMM code may actually try to establish the TSC +to a value matching the rest of the system, but a perfect match is usually not +a guarantee. This can have the effect of bringing a system from a state where +TSC is synchronized back to a state where TSC synchronization flaws, however +small, may be exposed to the OS and any virtualization environment. + +3.3) TSC and multi-socket / NUMA + +Multi-socket systems, especially large multi-socket systems are likely to have +individual clocksources rather than a single, universally distributed clock. +Since these clocks are driven by different crystals, they will not have +perfectly matched frequency, and temperature and electrical variations will +cause the CPU clocks, and thus the TSCs to drift over time. Depending on the +exact clock and bus design, the drift may or may not be fixed in absolute +error, and may accumulate over time. + +In addition, very large systems may deliberately slew the clocks of individual +cores. This technique, known as spread-spectrum clocking, reduces EMI at the +clock frequency and harmonics of it, which may be required to pass FCC +standards for telecommunications and computer equipment. + +It is recommended not to trust the TSCs to remain synchronized on NUMA or +multiple socket systems for these reasons. + +3.4) TSC and C-states + +C-states, or idling states of the processor, especially C1E and deeper sleep +states may be problematic for TSC as well. The TSC may stop advancing in such +a state, resulting in a TSC which is behind that of other CPUs when execution +is resumed. Such CPUs must be detected and flagged by the operating system +based on CPU and chipset identifications. + +The TSC in such a case may be corrected by catching it up to a known external +clocksource. + +3.5) TSC frequency change / P-states + +To make things slightly more interesting, some CPUs may change frequency. They +may or may not run the TSC at the same rate, and because the frequency change +may be staggered or slewed, at some points in time, the TSC rate may not be +known other than falling within a range of values. In this case, the TSC will +not be a stable time source, and must be calibrated against a known, stable, +external clock to be a usable source of time. + +Whether the TSC runs at a constant rate or scales with the P-state is model +dependent and must be determined by inspecting CPUID, chipset or vendor +specific MSR fields. + +In addition, some vendors have known bugs where the P-state is actually +compensated for properly during normal operation, but when the processor is +inactive, the P-state may be raised temporarily to service cache misses from +other processors. In such cases, the TSC on halted CPUs could advance faster +than that of non-halted processors. AMD Turion processors are known to have +this problem. + +3.6) TSC and STPCLK / T-states + +External signals given to the processor may also have the effect of stopping +the TSC. This is typically done for thermal emergency power control to prevent +an overheating condition, and typically, there is no way to detect that this +condition has happened. + +3.7) TSC virtualization - VMX + +VMX provides conditional trapping of RDTSC, RDMSR, WRMSR and RDTSCP +instructions, which is enough for full virtualization of TSC in any manner. In +addition, VMX allows passing through the host TSC plus an additional TSC_OFFSET +field specified in the VMCS. Special instructions must be used to read and +write the VMCS field. + +3.8) TSC virtualization - SVM + +SVM provides conditional trapping of RDTSC, RDMSR, WRMSR and RDTSCP +instructions, which is enough for full virtualization of TSC in any manner. In +addition, SVM allows passing through the host TSC plus an additional offset +field specified in the SVM control block. + +3.9) TSC feature bits in Linux + +In summary, there is no way to guarantee the TSC remains in perfect +synchronization unless it is explicitly guaranteed by the architecture. Even +if so, the TSCs in multi-sockets or NUMA systems may still run independently +despite being locally consistent. + +The following feature bits are used by Linux to signal various TSC attributes, +but they can only be taken to be meaningful for UP or single node systems. + +X86_FEATURE_TSC : The TSC is available in hardware +X86_FEATURE_RDTSCP : The RDTSCP instruction is available +X86_FEATURE_CONSTANT_TSC : The TSC rate is unchanged with P-states +X86_FEATURE_NONSTOP_TSC : The TSC does not stop in C-states +X86_FEATURE_TSC_RELIABLE : TSC sync checks are skipped (VMware) + +4) Virtualization Problems + +Timekeeping is especially problematic for virtualization because a number of +challenges arise. The most obvious problem is that time is now shared between +the host and, potentially, a number of virtual machines. Thus the virtual +operating system does not run with 100% usage of the CPU, despite the fact that +it may very well make that assumption. It may expect it to remain true to very +exacting bounds when interrupt sources are disabled, but in reality only its +virtual interrupt sources are disabled, and the machine may still be preempted +at any time. This causes problems as the passage of real time, the injection +of machine interrupts and the associated clock sources are no longer completely +synchronized with real time. + +This same problem can occur on native harware to a degree, as SMM mode may +steal cycles from the naturally on X86 systems when SMM mode is used by the +BIOS, but not in such an extreme fashion. However, the fact that SMM mode may +cause similar problems to virtualization makes it a good justification for +solving many of these problems on bare metal. + +4.1) Interrupt clocking + +One of the most immediate problems that occurs with legacy operating systems +is that the system timekeeping routines are often designed to keep track of +time by counting periodic interrupts. These interrupts may come from the PIT +or the RTC, but the problem is the same: the host virtualization engine may not +be able to deliver the proper number of interrupts per second, and so guest +time may fall behind. This is especially problematic if a high interrupt rate +is selected, such as 1000 HZ, which is unfortunately the default for many Linux +guests. + +There are three approaches to solving this problem; first, it may be possible +to simply ignore it. Guests which have a separate time source for tracking +'wall clock' or 'real time' may not need any adjustment of their interrupts to +maintain proper time. If this is not sufficient, it may be necessary to inject +additional interrupts into the guest in order to increase the effective +interrupt rate. This approach leads to complications in extreme conditions, +where host load or guest lag is too much to compensate for, and thus another +solution to the problem has risen: the guest may need to become aware of lost +ticks and compensate for them internally. Although promising in theory, the +implementation of this policy in Linux has been extremely error prone, and a +number of buggy variants of lost tick compensation are distributed across +commonly used Linux systems. + +Windows uses periodic RTC clocking as a means of keeping time internally, and +thus requires interrupt slewing to keep proper time. It does use a low enough +rate (ed: is it 18.2 Hz?) however that it has not yet been a problem in +practice. + +4.2) TSC sampling and serialization + +As the highest precision time source available, the cycle counter of the CPU +has aroused much interest from developers. As explained above, this timer has +many problems unique to its nature as a local, potentially unstable and +potentially unsynchronized source. One issue which is not unique to the TSC, +but is highlighted because of its very precise nature is sampling delay. By +definition, the counter, once read is already old. However, it is also +possible for the counter to be read ahead of the actual use of the result. +This is a consequence of the superscalar execution of the instruction stream, +which may execute instructions out of order. Such execution is called +non-serialized. Forcing serialized execution is necessary for precise +measurement with the TSC, and requires a serializing instruction, such as CPUID +or an MSR read. + +Since CPUID may actually be virtualized by a trap and emulate mechanism, this +serialization can pose a performance issue for hardware virtualization. An +accurate time stamp counter reading may therefore not always be available, and +it may be necessary for an implementation to guard against "backwards" reads of +the TSC as seen from other CPUs, even in an otherwise perfectly synchronized +system. + +4.3) Timespec aliasing + +Additionally, this lack of serialization from the TSC poses another challenge +when using results of the TSC when measured against another time source. As +the TSC is much higher precision, many possible values of the TSC may be read +while another clock is still expressing the same value. + +That is, you may read (T,T+10) while external clock C maintains the same value. +Due to non-serialized reads, you may actually end up with a range which +fluctuates - from (T-1.. T+10). Thus, any time calculated from a TSC, but +calibrated against an external value may have a range of valid values. +Re-calibrating this computation may actually cause time, as computed after the +calibration, to go backwards, compared with time computed before the +calibration. + +This problem is particularly pronounced with an internal time source in Linux, +the kernel time, which is expressed in the theoretically high resolution +timespec - but which advances in much larger granularity intervals, sometimes +at the rate of jiffies, and possibly in catchup modes, at a much larger step. + +This aliasing requires care in the computation and recalibration of kvmclock +and any other values derived from TSC computation (such as TSC virtualization +itself). + +4.4) Migration + +Migration of a virtual machine raises problems for timekeeping in two ways. +First, the migration itself may take time, during which interrupts cannot be +delivered, and after which, the guest time may need to be caught up. NTP may +be able to help to some degree here, as the clock correction required is +typically small enough to fall in the NTP-correctable window. + +An additional concern is that timers based off the TSC (or HPET, if the raw bus +clock is exposed) may now be running at different rates, requiring compensation +in some way in the hypervisor by virtualizing these timers. In addition, +migrating to a faster machine may preclude the use of a passthrough TSC, as a +faster clock cannot be made visible to a guest without the potential of time +advancing faster than usual. A slower clock is less of a problem, as it can +always be caught up to the original rate. KVM clock avoids these problems by +simply storing multipliers and offsets against the TSC for the guest to convert +back into nanosecond resolution values. + +4.5) Scheduling + +Since scheduling may be based on precise timing and firing of interrupts, the +scheduling algorithms of an operating system may be adversely affected by +virtualization. In theory, the effect is random and should be universally +distributed, but in contrived as well as real scenarios (guest device access, +causes of virtualization exits, possible context switch), this may not always +be the case. The effect of this has not been well studied. + +In an attempt to work around this, several implementations have provided a +paravirtualized scheduler clock, which reveals the true amount of CPU time for +which a virtual machine has been running. + +4.6) Watchdogs + +Watchdog timers, such as the lock detector in Linux may fire accidentally when +running under hardware virtualization due to timer interrupts being delayed or +misinterpretation of the passage of real time. Usually, these warnings are +spurious and can be ignored, but in some circumstances it may be necessary to +disable such detection. + +4.7) Delays and precision timing + +Precise timing and delays may not be possible in a virtualized system. This +can happen if the system is controlling physical hardware, or issues delays to +compensate for slower I/O to and from devices. The first issue is not solvable +in general for a virtualized system; hardware control software can't be +adequately virtualized without a full real-time operating system, which would +require an RT aware virtualization platform. + +The second issue may cause performance problems, but this is unlikely to be a +significant issue. In many cases these delays may be eliminated through +configuration or paravirtualization. + +4.8) Covert channels and leaks + +In addition to the above problems, time information will inevitably leak to the +guest about the host in anything but a perfect implementation of virtualized +time. This may allow the guest to infer the presence of a hypervisor (as in a +red-pill type detection), and it may allow information to leak between guests +by using CPU utilization itself as a signalling channel. Preventing such +problems would require completely isolated virtual time which may not track +real time any longer. This may be useful in certain security or QA contexts, +but in general isn't recommended for real-world deployment scenarios. diff --git a/Documentation/virtual/lguest/.gitignore b/Documentation/virtual/lguest/.gitignore new file mode 100644 index 0000000..115587f --- /dev/null +++ b/Documentation/virtual/lguest/.gitignore @@ -0,0 +1 @@ +lguest diff --git a/Documentation/virtual/lguest/Makefile b/Documentation/virtual/lguest/Makefile new file mode 100644 index 0000000..bebac6b --- /dev/null +++ b/Documentation/virtual/lguest/Makefile @@ -0,0 +1,8 @@ +# This creates the demonstration utility "lguest" which runs a Linux guest. +# Missing headers? Add "-I../../include -I../../arch/x86/include" +CFLAGS:=-m32 -Wall -Wmissing-declarations -Wmissing-prototypes -O3 -U_FORTIFY_SOURCE + +all: lguest + +clean: + rm -f lguest diff --git a/Documentation/virtual/lguest/extract b/Documentation/virtual/lguest/extract new file mode 100644 index 0000000..7730bb6 --- /dev/null +++ b/Documentation/virtual/lguest/extract @@ -0,0 +1,58 @@ +#! /bin/sh + +set -e + +PREFIX=$1 +shift + +trap 'rm -r $TMPDIR' 0 +TMPDIR=`mktemp -d` + +exec 3>/dev/null +for f; do + while IFS=" +" read -r LINE; do + case "$LINE" in + *$PREFIX:[0-9]*:\**) + NUM=`echo "$LINE" | sed "s/.*$PREFIX:\([0-9]*\).*/\1/"` + if [ -f $TMPDIR/$NUM ]; then + echo "$TMPDIR/$NUM already exits prior to $f" + exit 1 + fi + exec 3>>$TMPDIR/$NUM + echo $f | sed 's,\.\./,,g' > $TMPDIR/.$NUM + /bin/echo "$LINE" | sed -e "s/$PREFIX:[0-9]*//" -e "s/:\*/*/" >&3 + ;; + *$PREFIX:[0-9]*) + NUM=`echo "$LINE" | sed "s/.*$PREFIX:\([0-9]*\).*/\1/"` + if [ -f $TMPDIR/$NUM ]; then + echo "$TMPDIR/$NUM already exits prior to $f" + exit 1 + fi + exec 3>>$TMPDIR/$NUM + echo $f | sed 's,\.\./,,g' > $TMPDIR/.$NUM + /bin/echo "$LINE" | sed "s/$PREFIX:[0-9]*//" >&3 + ;; + *:\**) + /bin/echo "$LINE" | sed -e "s/:\*/*/" -e "s,/\*\*/,," >&3 + echo >&3 + exec 3>/dev/null + ;; + *) + /bin/echo "$LINE" >&3 + ;; + esac + done < $f + echo >&3 + exec 3>/dev/null +done + +LASTFILE="" +for f in $TMPDIR/*; do + if [ "$LASTFILE" != $(cat $TMPDIR/.$(basename $f) ) ]; then + LASTFILE=$(cat $TMPDIR/.$(basename $f) ) + echo "[ $LASTFILE ]" + fi + cat $f +done + diff --git a/Documentation/virtual/lguest/lguest.c b/Documentation/virtual/lguest/lguest.c new file mode 100644 index 0000000..d9da7e1 --- /dev/null +++ b/Documentation/virtual/lguest/lguest.c @@ -0,0 +1,2095 @@ +/*P:100 + * This is the Launcher code, a simple program which lays out the "physical" + * memory for the new Guest by mapping the kernel image and the virtual + * devices, then opens /dev/lguest to tell the kernel about the Guest and + * control it. +:*/ +#define _LARGEFILE64_SOURCE +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include "../../include/linux/lguest_launcher.h" +/*L:110 + * We can ignore the 42 include files we need for this program, but I do want + * to draw attention to the use of kernel-style types. + * + * As Linus said, "C is a Spartan language, and so should your naming be." I + * like these abbreviations, so we define them here. Note that u64 is always + * unsigned long long, which works on all Linux systems: this means that we can + * use %llu in printf for any u64. + */ +typedef unsigned long long u64; +typedef uint32_t u32; +typedef uint16_t u16; +typedef uint8_t u8; +/*:*/ + +#define PAGE_PRESENT 0x7 /* Present, RW, Execute */ +#define BRIDGE_PFX "bridge:" +#ifndef SIOCBRADDIF +#define SIOCBRADDIF 0x89a2 /* add interface to bridge */ +#endif +/* We can have up to 256 pages for devices. */ +#define DEVICE_PAGES 256 +/* This will occupy 3 pages: it must be a power of 2. */ +#define VIRTQUEUE_NUM 256 + +/*L:120 + * verbose is both a global flag and a macro. The C preprocessor allows + * this, and although I wouldn't recommend it, it works quite nicely here. + */ +static bool verbose; +#define verbose(args...) \ + do { if (verbose) printf(args); } while(0) +/*:*/ + +/* The pointer to the start of guest memory. */ +static void *guest_base; +/* The maximum guest physical address allowed, and maximum possible. */ +static unsigned long guest_limit, guest_max; +/* The /dev/lguest file descriptor. */ +static int lguest_fd; + +/* a per-cpu variable indicating whose vcpu is currently running */ +static unsigned int __thread cpu_id; + +/* This is our list of devices. */ +struct device_list { + /* Counter to assign interrupt numbers. */ + unsigned int next_irq; + + /* Counter to print out convenient device numbers. */ + unsigned int device_num; + + /* The descriptor page for the devices. */ + u8 *descpage; + + /* A single linked list of devices. */ + struct device *dev; + /* And a pointer to the last device for easy append. */ + struct device *lastdev; +}; + +/* The list of Guest devices, based on command line arguments. */ +static struct device_list devices; + +/* The device structure describes a single device. */ +struct device { + /* The linked-list pointer. */ + struct device *next; + + /* The device's descriptor, as mapped into the Guest. */ + struct lguest_device_desc *desc; + + /* We can't trust desc values once Guest has booted: we use these. */ + unsigned int feature_len; + unsigned int num_vq; + + /* The name of this device, for --verbose. */ + const char *name; + + /* Any queues attached to this device */ + struct virtqueue *vq; + + /* Is it operational */ + bool running; + + /* Does Guest want an intrrupt on empty? */ + bool irq_on_empty; + + /* Device-specific data. */ + void *priv; +}; + +/* The virtqueue structure describes a queue attached to a device. */ +struct virtqueue { + struct virtqueue *next; + + /* Which device owns me. */ + struct device *dev; + + /* The configuration for this queue. */ + struct lguest_vqconfig config; + + /* The actual ring of buffers. */ + struct vring vring; + + /* Last available index we saw. */ + u16 last_avail_idx; + + /* How many are used since we sent last irq? */ + unsigned int pending_used; + + /* Eventfd where Guest notifications arrive. */ + int eventfd; + + /* Function for the thread which is servicing this virtqueue. */ + void (*service)(struct virtqueue *vq); + pid_t thread; +}; + +/* Remember the arguments to the program so we can "reboot" */ +static char **main_args; + +/* The original tty settings to restore on exit. */ +static struct termios orig_term; + +/* + * We have to be careful with barriers: our devices are all run in separate + * threads and so we need to make sure that changes visible to the Guest happen + * in precise order. + */ +#define wmb() __asm__ __volatile__("" : : : "memory") +#define mb() __asm__ __volatile__("" : : : "memory") + +/* + * Convert an iovec element to the given type. + * + * This is a fairly ugly trick: we need to know the size of the type and + * alignment requirement to check the pointer is kosher. It's also nice to + * have the name of the type in case we report failure. + * + * Typing those three things all the time is cumbersome and error prone, so we + * have a macro which sets them all up and passes to the real function. + */ +#define convert(iov, type) \ + ((type *)_convert((iov), sizeof(type), __alignof__(type), #type)) + +static void *_convert(struct iovec *iov, size_t size, size_t align, + const char *name) +{ + if (iov->iov_len != size) + errx(1, "Bad iovec size %zu for %s", iov->iov_len, name); + if ((unsigned long)iov->iov_base % align != 0) + errx(1, "Bad alignment %p for %s", iov->iov_base, name); + return iov->iov_base; +} + +/* Wrapper for the last available index. Makes it easier to change. */ +#define lg_last_avail(vq) ((vq)->last_avail_idx) + +/* + * The virtio configuration space is defined to be little-endian. x86 is + * little-endian too, but it's nice to be explicit so we have these helpers. + */ +#define cpu_to_le16(v16) (v16) +#define cpu_to_le32(v32) (v32) +#define cpu_to_le64(v64) (v64) +#define le16_to_cpu(v16) (v16) +#define le32_to_cpu(v32) (v32) +#define le64_to_cpu(v64) (v64) + +/* Is this iovec empty? */ +static bool iov_empty(const struct iovec iov[], unsigned int num_iov) +{ + unsigned int i; + + for (i = 0; i < num_iov; i++) + if (iov[i].iov_len) + return false; + return true; +} + +/* Take len bytes from the front of this iovec. */ +static void iov_consume(struct iovec iov[], unsigned num_iov, unsigned len) +{ + unsigned int i; + + for (i = 0; i < num_iov; i++) { + unsigned int used; + + used = iov[i].iov_len < len ? iov[i].iov_len : len; + iov[i].iov_base += used; + iov[i].iov_len -= used; + len -= used; + } + assert(len == 0); +} + +/* The device virtqueue descriptors are followed by feature bitmasks. */ +static u8 *get_feature_bits(struct device *dev) +{ + return (u8 *)(dev->desc + 1) + + dev->num_vq * sizeof(struct lguest_vqconfig); +} + +/*L:100 + * The Launcher code itself takes us out into userspace, that scary place where + * pointers run wild and free! Unfortunately, like most userspace programs, + * it's quite boring (which is why everyone likes to hack on the kernel!). + * Perhaps if you make up an Lguest Drinking Game at this point, it will get + * you through this section. Or, maybe not. + * + * The Launcher sets up a big chunk of memory to be the Guest's "physical" + * memory and stores it in "guest_base". In other words, Guest physical == + * Launcher virtual with an offset. + * + * This can be tough to get your head around, but usually it just means that we + * use these trivial conversion functions when the Guest gives us its + * "physical" addresses: + */ +static void *from_guest_phys(unsigned long addr) +{ + return guest_base + addr; +} + +static unsigned long to_guest_phys(const void *addr) +{ + return (addr - guest_base); +} + +/*L:130 + * Loading the Kernel. + * + * We start with couple of simple helper routines. open_or_die() avoids + * error-checking code cluttering the callers: + */ +static int open_or_die(const char *name, int flags) +{ + int fd = open(name, flags); + if (fd < 0) + err(1, "Failed to open %s", name); + return fd; +} + +/* map_zeroed_pages() takes a number of pages. */ +static void *map_zeroed_pages(unsigned int num) +{ + int fd = open_or_die("/dev/zero", O_RDONLY); + void *addr; + + /* + * We use a private mapping (ie. if we write to the page, it will be + * copied). We allocate an extra two pages PROT_NONE to act as guard + * pages against read/write attempts that exceed allocated space. + */ + addr = mmap(NULL, getpagesize() * (num+2), + PROT_NONE, MAP_PRIVATE, fd, 0); + + if (addr == MAP_FAILED) + err(1, "Mmapping %u pages of /dev/zero", num); + + if (mprotect(addr + getpagesize(), getpagesize() * num, + PROT_READ|PROT_WRITE) == -1) + err(1, "mprotect rw %u pages failed", num); + + /* + * One neat mmap feature is that you can close the fd, and it + * stays mapped. + */ + close(fd); + + /* Return address after PROT_NONE page */ + return addr + getpagesize(); +} + +/* Get some more pages for a device. */ +static void *get_pages(unsigned int num) +{ + void *addr = from_guest_phys(guest_limit); + + guest_limit += num * getpagesize(); + if (guest_limit > guest_max) + errx(1, "Not enough memory for devices"); + return addr; +} + +/* + * This routine is used to load the kernel or initrd. It tries mmap, but if + * that fails (Plan 9's kernel file isn't nicely aligned on page boundaries), + * it falls back to reading the memory in. + */ +static void map_at(int fd, void *addr, unsigned long offset, unsigned long len) +{ + ssize_t r; + + /* + * We map writable even though for some segments are marked read-only. + * The kernel really wants to be writable: it patches its own + * instructions. + * + * MAP_PRIVATE means that the page won't be copied until a write is + * done to it. This allows us to share untouched memory between + * Guests. + */ + if (mmap(addr, len, PROT_READ|PROT_WRITE, + MAP_FIXED|MAP_PRIVATE, fd, offset) != MAP_FAILED) + return; + + /* pread does a seek and a read in one shot: saves a few lines. */ + r = pread(fd, addr, len, offset); + if (r != len) + err(1, "Reading offset %lu len %lu gave %zi", offset, len, r); +} + +/* + * This routine takes an open vmlinux image, which is in ELF, and maps it into + * the Guest memory. ELF = Embedded Linking Format, which is the format used + * by all modern binaries on Linux including the kernel. + * + * The ELF headers give *two* addresses: a physical address, and a virtual + * address. We use the physical address; the Guest will map itself to the + * virtual address. + * + * We return the starting address. + */ +static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr) +{ + Elf32_Phdr phdr[ehdr->e_phnum]; + unsigned int i; + + /* + * Sanity checks on the main ELF header: an x86 executable with a + * reasonable number of correctly-sized program headers. + */ + if (ehdr->e_type != ET_EXEC + || ehdr->e_machine != EM_386 + || ehdr->e_phentsize != sizeof(Elf32_Phdr) + || ehdr->e_phnum < 1 || ehdr->e_phnum > 65536U/sizeof(Elf32_Phdr)) + errx(1, "Malformed elf header"); + + /* + * An ELF executable contains an ELF header and a number of "program" + * headers which indicate which parts ("segments") of the program to + * load where. + */ + + /* We read in all the program headers at once: */ + if (lseek(elf_fd, ehdr->e_phoff, SEEK_SET) < 0) + err(1, "Seeking to program headers"); + if (read(elf_fd, phdr, sizeof(phdr)) != sizeof(phdr)) + err(1, "Reading program headers"); + + /* + * Try all the headers: there are usually only three. A read-only one, + * a read-write one, and a "note" section which we don't load. + */ + for (i = 0; i < ehdr->e_phnum; i++) { + /* If this isn't a loadable segment, we ignore it */ + if (phdr[i].p_type != PT_LOAD) + continue; + + verbose("Section %i: size %i addr %p\n", + i, phdr[i].p_memsz, (void *)phdr[i].p_paddr); + + /* We map this section of the file at its physical address. */ + map_at(elf_fd, from_guest_phys(phdr[i].p_paddr), + phdr[i].p_offset, phdr[i].p_filesz); + } + + /* The entry point is given in the ELF header. */ + return ehdr->e_entry; +} + +/*L:150 + * A bzImage, unlike an ELF file, is not meant to be loaded. You're supposed + * to jump into it and it will unpack itself. We used to have to perform some + * hairy magic because the unpacking code scared me. + * + * Fortunately, Jeremy Fitzhardinge convinced me it wasn't that hard and wrote + * a small patch to jump over the tricky bits in the Guest, so now we just read + * the funky header so we know where in the file to load, and away we go! + */ +static unsigned long load_bzimage(int fd) +{ + struct boot_params boot; + int r; + /* Modern bzImages get loaded at 1M. */ + void *p = from_guest_phys(0x100000); + + /* + * Go back to the start of the file and read the header. It should be + * a Linux boot header (see Documentation/x86/i386/boot.txt) + */ + lseek(fd, 0, SEEK_SET); + read(fd, &boot, sizeof(boot)); + + /* Inside the setup_hdr, we expect the magic "HdrS" */ + if (memcmp(&boot.hdr.header, "HdrS", 4) != 0) + errx(1, "This doesn't look like a bzImage to me"); + + /* Skip over the extra sectors of the header. */ + lseek(fd, (boot.hdr.setup_sects+1) * 512, SEEK_SET); + + /* Now read everything into memory. in nice big chunks. */ + while ((r = read(fd, p, 65536)) > 0) + p += r; + + /* Finally, code32_start tells us where to enter the kernel. */ + return boot.hdr.code32_start; +} + +/*L:140 + * Loading the kernel is easy when it's a "vmlinux", but most kernels + * come wrapped up in the self-decompressing "bzImage" format. With a little + * work, we can load those, too. + */ +static unsigned long load_kernel(int fd) +{ + Elf32_Ehdr hdr; + + /* Read in the first few bytes. */ + if (read(fd, &hdr, sizeof(hdr)) != sizeof(hdr)) + err(1, "Reading kernel"); + + /* If it's an ELF file, it starts with "\177ELF" */ + if (memcmp(hdr.e_ident, ELFMAG, SELFMAG) == 0) + return map_elf(fd, &hdr); + + /* Otherwise we assume it's a bzImage, and try to load it. */ + return load_bzimage(fd); +} + +/* + * This is a trivial little helper to align pages. Andi Kleen hated it because + * it calls getpagesize() twice: "it's dumb code." + * + * Kernel guys get really het up about optimization, even when it's not + * necessary. I leave this code as a reaction against that. + */ +static inline unsigned long page_align(unsigned long addr) +{ + /* Add upwards and truncate downwards. */ + return ((addr + getpagesize()-1) & ~(getpagesize()-1)); +} + +/*L:180 + * An "initial ram disk" is a disk image loaded into memory along with the + * kernel which the kernel can use to boot from without needing any drivers. + * Most distributions now use this as standard: the initrd contains the code to + * load the appropriate driver modules for the current machine. + * + * Importantly, James Morris works for RedHat, and Fedora uses initrds for its + * kernels. He sent me this (and tells me when I break it). + */ +static unsigned long load_initrd(const char *name, unsigned long mem) +{ + int ifd; + struct stat st; + unsigned long len; + + ifd = open_or_die(name, O_RDONLY); + /* fstat() is needed to get the file size. */ + if (fstat(ifd, &st) < 0) + err(1, "fstat() on initrd '%s'", name); + + /* + * We map the initrd at the top of memory, but mmap wants it to be + * page-aligned, so we round the size up for that. + */ + len = page_align(st.st_size); + map_at(ifd, from_guest_phys(mem - len), 0, st.st_size); + /* + * Once a file is mapped, you can close the file descriptor. It's a + * little odd, but quite useful. + */ + close(ifd); + verbose("mapped initrd %s size=%lu @ %p\n", name, len, (void*)mem-len); + + /* We return the initrd size. */ + return len; +} +/*:*/ + +/* + * Simple routine to roll all the commandline arguments together with spaces + * between them. + */ +static void concat(char *dst, char *args[]) +{ + unsigned int i, len = 0; + + for (i = 0; args[i]; i++) { + if (i) { + strcat(dst+len, " "); + len++; + } + strcpy(dst+len, args[i]); + len += strlen(args[i]); + } + /* In case it's empty. */ + dst[len] = '\0'; +} + +/*L:185 + * This is where we actually tell the kernel to initialize the Guest. We + * saw the arguments it expects when we looked at initialize() in lguest_user.c: + * the base of Guest "physical" memory, the top physical page to allow and the + * entry point for the Guest. + */ +static void tell_kernel(unsigned long start) +{ + unsigned long args[] = { LHREQ_INITIALIZE, + (unsigned long)guest_base, + guest_limit / getpagesize(), start }; + verbose("Guest: %p - %p (%#lx)\n", + guest_base, guest_base + guest_limit, guest_limit); + lguest_fd = open_or_die("/dev/lguest", O_RDWR); + if (write(lguest_fd, args, sizeof(args)) < 0) + err(1, "Writing to /dev/lguest"); +} +/*:*/ + +/*L:200 + * Device Handling. + * + * When the Guest gives us a buffer, it sends an array of addresses and sizes. + * We need to make sure it's not trying to reach into the Launcher itself, so + * we have a convenient routine which checks it and exits with an error message + * if something funny is going on: + */ +static void *_check_pointer(unsigned long addr, unsigned int size, + unsigned int line) +{ + /* + * Check if the requested address and size exceeds the allocated memory, + * or addr + size wraps around. + */ + if ((addr + size) > guest_limit || (addr + size) < addr) + errx(1, "%s:%i: Invalid address %#lx", __FILE__, line, addr); + /* + * We return a pointer for the caller's convenience, now we know it's + * safe to use. + */ + return from_guest_phys(addr); +} +/* A macro which transparently hands the line number to the real function. */ +#define check_pointer(addr,size) _check_pointer(addr, size, __LINE__) + +/* + * Each buffer in the virtqueues is actually a chain of descriptors. This + * function returns the next descriptor in the chain, or vq->vring.num if we're + * at the end. + */ +static unsigned next_desc(struct vring_desc *desc, + unsigned int i, unsigned int max) +{ + unsigned int next; + + /* If this descriptor says it doesn't chain, we're done. */ + if (!(desc[i].flags & VRING_DESC_F_NEXT)) + return max; + + /* Check they're not leading us off end of descriptors. */ + next = desc[i].next; + /* Make sure compiler knows to grab that: we don't want it changing! */ + wmb(); + + if (next >= max) + errx(1, "Desc next is %u", next); + + return next; +} + +/* + * This actually sends the interrupt for this virtqueue, if we've used a + * buffer. + */ +static void trigger_irq(struct virtqueue *vq) +{ + unsigned long buf[] = { LHREQ_IRQ, vq->config.irq }; + + /* Don't inform them if nothing used. */ + if (!vq->pending_used) + return; + vq->pending_used = 0; + + /* If they don't want an interrupt, don't send one... */ + if (vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT) { + /* ... unless they've asked us to force one on empty. */ + if (!vq->dev->irq_on_empty + || lg_last_avail(vq) != vq->vring.avail->idx) + return; + } + + /* Send the Guest an interrupt tell them we used something up. */ + if (write(lguest_fd, buf, sizeof(buf)) != 0) + err(1, "Triggering irq %i", vq->config.irq); +} + +/* + * This looks in the virtqueue for the first available buffer, and converts + * it to an iovec for convenient access. Since descriptors consist of some + * number of output then some number of input descriptors, it's actually two + * iovecs, but we pack them into one and note how many of each there were. + * + * This function waits if necessary, and returns the descriptor number found. + */ +static unsigned wait_for_vq_desc(struct virtqueue *vq, + struct iovec iov[], + unsigned int *out_num, unsigned int *in_num) +{ + unsigned int i, head, max; + struct vring_desc *desc; + u16 last_avail = lg_last_avail(vq); + + /* There's nothing available? */ + while (last_avail == vq->vring.avail->idx) { + u64 event; + + /* + * Since we're about to sleep, now is a good time to tell the + * Guest about what we've used up to now. + */ + trigger_irq(vq); + + /* OK, now we need to know about added descriptors. */ + vq->vring.used->flags &= ~VRING_USED_F_NO_NOTIFY; + + /* + * They could have slipped one in as we were doing that: make + * sure it's written, then check again. + */ + mb(); + if (last_avail != vq->vring.avail->idx) { + vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY; + break; + } + + /* Nothing new? Wait for eventfd to tell us they refilled. */ + if (read(vq->eventfd, &event, sizeof(event)) != sizeof(event)) + errx(1, "Event read failed?"); + + /* We don't need to be notified again. */ + vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY; + } + + /* Check it isn't doing very strange things with descriptor numbers. */ + if ((u16)(vq->vring.avail->idx - last_avail) > vq->vring.num) + errx(1, "Guest moved used index from %u to %u", + last_avail, vq->vring.avail->idx); + + /* + * Grab the next descriptor number they're advertising, and increment + * the index we've seen. + */ + head = vq->vring.avail->ring[last_avail % vq->vring.num]; + lg_last_avail(vq)++; + + /* If their number is silly, that's a fatal mistake. */ + if (head >= vq->vring.num) + errx(1, "Guest says index %u is available", head); + + /* When we start there are none of either input nor output. */ + *out_num = *in_num = 0; + + max = vq->vring.num; + desc = vq->vring.desc; + i = head; + + /* + * If this is an indirect entry, then this buffer contains a descriptor + * table which we handle as if it's any normal descriptor chain. + */ + if (desc[i].flags & VRING_DESC_F_INDIRECT) { + if (desc[i].len % sizeof(struct vring_desc)) + errx(1, "Invalid size for indirect buffer table"); + + max = desc[i].len / sizeof(struct vring_desc); + desc = check_pointer(desc[i].addr, desc[i].len); + i = 0; + } + + do { + /* Grab the first descriptor, and check it's OK. */ + iov[*out_num + *in_num].iov_len = desc[i].len; + iov[*out_num + *in_num].iov_base + = check_pointer(desc[i].addr, desc[i].len); + /* If this is an input descriptor, increment that count. */ + if (desc[i].flags & VRING_DESC_F_WRITE) + (*in_num)++; + else { + /* + * If it's an output descriptor, they're all supposed + * to come before any input descriptors. + */ + if (*in_num) + errx(1, "Descriptor has out after in"); + (*out_num)++; + } + + /* If we've got too many, that implies a descriptor loop. */ + if (*out_num + *in_num > max) + errx(1, "Looped descriptor"); + } while ((i = next_desc(desc, i, max)) != max); + + return head; +} + +/* + * After we've used one of their buffers, we tell the Guest about it. Sometime + * later we'll want to send them an interrupt using trigger_irq(); note that + * wait_for_vq_desc() does that for us if it has to wait. + */ +static void add_used(struct virtqueue *vq, unsigned int head, int len) +{ + struct vring_used_elem *used; + + /* + * The virtqueue contains a ring of used buffers. Get a pointer to the + * next entry in that used ring. + */ + used = &vq->vring.used->ring[vq->vring.used->idx % vq->vring.num]; + used->id = head; + used->len = len; + /* Make sure buffer is written before we update index. */ + wmb(); + vq->vring.used->idx++; + vq->pending_used++; +} + +/* And here's the combo meal deal. Supersize me! */ +static void add_used_and_trigger(struct virtqueue *vq, unsigned head, int len) +{ + add_used(vq, head, len); + trigger_irq(vq); +} + +/* + * The Console + * + * We associate some data with the console for our exit hack. + */ +struct console_abort { + /* How many times have they hit ^C? */ + int count; + /* When did they start? */ + struct timeval start; +}; + +/* This is the routine which handles console input (ie. stdin). */ +static void console_input(struct virtqueue *vq) +{ + int len; + unsigned int head, in_num, out_num; + struct console_abort *abort = vq->dev->priv; + struct iovec iov[vq->vring.num]; + + /* Make sure there's a descriptor available. */ + head = wait_for_vq_desc(vq, iov, &out_num, &in_num); + if (out_num) + errx(1, "Output buffers in console in queue?"); + + /* Read into it. This is where we usually wait. */ + len = readv(STDIN_FILENO, iov, in_num); + if (len <= 0) { + /* Ran out of input? */ + warnx("Failed to get console input, ignoring console."); + /* + * For simplicity, dying threads kill the whole Launcher. So + * just nap here. + */ + for (;;) + pause(); + } + + /* Tell the Guest we used a buffer. */ + add_used_and_trigger(vq, head, len); + + /* + * Three ^C within one second? Exit. + * + * This is such a hack, but works surprisingly well. Each ^C has to + * be in a buffer by itself, so they can't be too fast. But we check + * that we get three within about a second, so they can't be too + * slow. + */ + if (len != 1 || ((char *)iov[0].iov_base)[0] != 3) { + abort->count = 0; + return; + } + + abort->count++; + if (abort->count == 1) + gettimeofday(&abort->start, NULL); + else if (abort->count == 3) { + struct timeval now; + gettimeofday(&now, NULL); + /* Kill all Launcher processes with SIGINT, like normal ^C */ + if (now.tv_sec <= abort->start.tv_sec+1) + kill(0, SIGINT); + abort->count = 0; + } +} + +/* This is the routine which handles console output (ie. stdout). */ +static void console_output(struct virtqueue *vq) +{ + unsigned int head, out, in; + struct iovec iov[vq->vring.num]; + + /* We usually wait in here, for the Guest to give us something. */ + head = wait_for_vq_desc(vq, iov, &out, &in); + if (in) + errx(1, "Input buffers in console output queue?"); + + /* writev can return a partial write, so we loop here. */ + while (!iov_empty(iov, out)) { + int len = writev(STDOUT_FILENO, iov, out); + if (len <= 0) + err(1, "Write to stdout gave %i", len); + iov_consume(iov, out, len); + } + + /* + * We're finished with that buffer: if we're going to sleep, + * wait_for_vq_desc() will prod the Guest with an interrupt. + */ + add_used(vq, head, 0); +} + +/* + * The Network + * + * Handling output for network is also simple: we get all the output buffers + * and write them to /dev/net/tun. + */ +struct net_info { + int tunfd; +}; + +static void net_output(struct virtqueue *vq) +{ + struct net_info *net_info = vq->dev->priv; + unsigned int head, out, in; + struct iovec iov[vq->vring.num]; + + /* We usually wait in here for the Guest to give us a packet. */ + head = wait_for_vq_desc(vq, iov, &out, &in); + if (in) + errx(1, "Input buffers in net output queue?"); + /* + * Send the whole thing through to /dev/net/tun. It expects the exact + * same format: what a coincidence! + */ + if (writev(net_info->tunfd, iov, out) < 0) + errx(1, "Write to tun failed?"); + + /* + * Done with that one; wait_for_vq_desc() will send the interrupt if + * all packets are processed. + */ + add_used(vq, head, 0); +} + +/* + * Handling network input is a bit trickier, because I've tried to optimize it. + * + * First we have a helper routine which tells is if from this file descriptor + * (ie. the /dev/net/tun device) will block: + */ +static bool will_block(int fd) +{ + fd_set fdset; + struct timeval zero = { 0, 0 }; + FD_ZERO(&fdset); + FD_SET(fd, &fdset); + return select(fd+1, &fdset, NULL, NULL, &zero) != 1; +} + +/* + * This handles packets coming in from the tun device to our Guest. Like all + * service routines, it gets called again as soon as it returns, so you don't + * see a while(1) loop here. + */ +static void net_input(struct virtqueue *vq) +{ + int len; + unsigned int head, out, in; + struct iovec iov[vq->vring.num]; + struct net_info *net_info = vq->dev->priv; + + /* + * Get a descriptor to write an incoming packet into. This will also + * send an interrupt if they're out of descriptors. + */ + head = wait_for_vq_desc(vq, iov, &out, &in); + if (out) + errx(1, "Output buffers in net input queue?"); + + /* + * If it looks like we'll block reading from the tun device, send them + * an interrupt. + */ + if (vq->pending_used && will_block(net_info->tunfd)) + trigger_irq(vq); + + /* + * Read in the packet. This is where we normally wait (when there's no + * incoming network traffic). + */ + len = readv(net_info->tunfd, iov, in); + if (len <= 0) + err(1, "Failed to read from tun."); + + /* + * Mark that packet buffer as used, but don't interrupt here. We want + * to wait until we've done as much work as we can. + */ + add_used(vq, head, len); +} +/*:*/ + +/* This is the helper to create threads: run the service routine in a loop. */ +static int do_thread(void *_vq) +{ + struct virtqueue *vq = _vq; + + for (;;) + vq->service(vq); + return 0; +} + +/* + * When a child dies, we kill our entire process group with SIGTERM. This + * also has the side effect that the shell restores the console for us! + */ +static void kill_launcher(int signal) +{ + kill(0, SIGTERM); +} + +static void reset_device(struct device *dev) +{ + struct virtqueue *vq; + + verbose("Resetting device %s\n", dev->name); + + /* Clear any features they've acked. */ + memset(get_feature_bits(dev) + dev->feature_len, 0, dev->feature_len); + + /* We're going to be explicitly killing threads, so ignore them. */ + signal(SIGCHLD, SIG_IGN); + + /* Zero out the virtqueues, get rid of their threads */ + for (vq = dev->vq; vq; vq = vq->next) { + if (vq->thread != (pid_t)-1) { + kill(vq->thread, SIGTERM); + waitpid(vq->thread, NULL, 0); + vq->thread = (pid_t)-1; + } + memset(vq->vring.desc, 0, + vring_size(vq->config.num, LGUEST_VRING_ALIGN)); + lg_last_avail(vq) = 0; + } + dev->running = false; + + /* Now we care if threads die. */ + signal(SIGCHLD, (void *)kill_launcher); +} + +/*L:216 + * This actually creates the thread which services the virtqueue for a device. + */ +static void create_thread(struct virtqueue *vq) +{ + /* + * Create stack for thread. Since the stack grows upwards, we point + * the stack pointer to the end of this region. + */ + char *stack = malloc(32768); + unsigned long args[] = { LHREQ_EVENTFD, + vq->config.pfn*getpagesize(), 0 }; + + /* Create a zero-initialized eventfd. */ + vq->eventfd = eventfd(0, 0); + if (vq->eventfd < 0) + err(1, "Creating eventfd"); + args[2] = vq->eventfd; + + /* + * Attach an eventfd to this virtqueue: it will go off when the Guest + * does an LHCALL_NOTIFY for this vq. + */ + if (write(lguest_fd, &args, sizeof(args)) != 0) + err(1, "Attaching eventfd"); + + /* + * CLONE_VM: because it has to access the Guest memory, and SIGCHLD so + * we get a signal if it dies. + */ + vq->thread = clone(do_thread, stack + 32768, CLONE_VM | SIGCHLD, vq); + if (vq->thread == (pid_t)-1) + err(1, "Creating clone"); + + /* We close our local copy now the child has it. */ + close(vq->eventfd); +} + +static bool accepted_feature(struct device *dev, unsigned int bit) +{ + const u8 *features = get_feature_bits(dev) + dev->feature_len; + + if (dev->feature_len < bit / CHAR_BIT) + return false; + return features[bit / CHAR_BIT] & (1 << (bit % CHAR_BIT)); +} + +static void start_device(struct device *dev) +{ + unsigned int i; + struct virtqueue *vq; + + verbose("Device %s OK: offered", dev->name); + for (i = 0; i < dev->feature_len; i++) + verbose(" %02x", get_feature_bits(dev)[i]); + verbose(", accepted"); + for (i = 0; i < dev->feature_len; i++) + verbose(" %02x", get_feature_bits(dev) + [dev->feature_len+i]); + + dev->irq_on_empty = accepted_feature(dev, VIRTIO_F_NOTIFY_ON_EMPTY); + + for (vq = dev->vq; vq; vq = vq->next) { + if (vq->service) + create_thread(vq); + } + dev->running = true; +} + +static void cleanup_devices(void) +{ + struct device *dev; + + for (dev = devices.dev; dev; dev = dev->next) + reset_device(dev); + + /* If we saved off the original terminal settings, restore them now. */ + if (orig_term.c_lflag & (ISIG|ICANON|ECHO)) + tcsetattr(STDIN_FILENO, TCSANOW, &orig_term); +} + +/* When the Guest tells us they updated the status field, we handle it. */ +static void update_device_status(struct device *dev) +{ + /* A zero status is a reset, otherwise it's a set of flags. */ + if (dev->desc->status == 0) + reset_device(dev); + else if (dev->desc->status & VIRTIO_CONFIG_S_FAILED) { + warnx("Device %s configuration FAILED", dev->name); + if (dev->running) + reset_device(dev); + } else if (dev->desc->status & VIRTIO_CONFIG_S_DRIVER_OK) { + if (!dev->running) + start_device(dev); + } +} + +/*L:215 + * This is the generic routine we call when the Guest uses LHCALL_NOTIFY. In + * particular, it's used to notify us of device status changes during boot. + */ +static void handle_output(unsigned long addr) +{ + struct device *i; + + /* Check each device. */ + for (i = devices.dev; i; i = i->next) { + struct virtqueue *vq; + + /* + * Notifications to device descriptors mean they updated the + * device status. + */ + if (from_guest_phys(addr) == i->desc) { + update_device_status(i); + return; + } + + /* + * Devices *can* be used before status is set to DRIVER_OK. + * The original plan was that they would never do this: they + * would always finish setting up their status bits before + * actually touching the virtqueues. In practice, we allowed + * them to, and they do (eg. the disk probes for partition + * tables as part of initialization). + * + * If we see this, we start the device: once it's running, we + * expect the device to catch all the notifications. + */ + for (vq = i->vq; vq; vq = vq->next) { + if (addr != vq->config.pfn*getpagesize()) + continue; + if (i->running) + errx(1, "Notification on running %s", i->name); + /* This just calls create_thread() for each virtqueue */ + start_device(i); + return; + } + } + + /* + * Early console write is done using notify on a nul-terminated string + * in Guest memory. It's also great for hacking debugging messages + * into a Guest. + */ + if (addr >= guest_limit) + errx(1, "Bad NOTIFY %#lx", addr); + + write(STDOUT_FILENO, from_guest_phys(addr), + strnlen(from_guest_phys(addr), guest_limit - addr)); +} + +/*L:190 + * Device Setup + * + * All devices need a descriptor so the Guest knows it exists, and a "struct + * device" so the Launcher can keep track of it. We have common helper + * routines to allocate and manage them. + */ + +/* + * The layout of the device page is a "struct lguest_device_desc" followed by a + * number of virtqueue descriptors, then two sets of feature bits, then an + * array of configuration bytes. This routine returns the configuration + * pointer. + */ +static u8 *device_config(const struct device *dev) +{ + return (void *)(dev->desc + 1) + + dev->num_vq * sizeof(struct lguest_vqconfig) + + dev->feature_len * 2; +} + +/* + * This routine allocates a new "struct lguest_device_desc" from descriptor + * table page just above the Guest's normal memory. It returns a pointer to + * that descriptor. + */ +static struct lguest_device_desc *new_dev_desc(u16 type) +{ + struct lguest_device_desc d = { .type = type }; + void *p; + + /* Figure out where the next device config is, based on the last one. */ + if (devices.lastdev) + p = device_config(devices.lastdev) + + devices.lastdev->desc->config_len; + else + p = devices.descpage; + + /* We only have one page for all the descriptors. */ + if (p + sizeof(d) > (void *)devices.descpage + getpagesize()) + errx(1, "Too many devices"); + + /* p might not be aligned, so we memcpy in. */ + return memcpy(p, &d, sizeof(d)); +} + +/* + * Each device descriptor is followed by the description of its virtqueues. We + * specify how many descriptors the virtqueue is to have. + */ +static void add_virtqueue(struct device *dev, unsigned int num_descs, + void (*service)(struct virtqueue *)) +{ + unsigned int pages; + struct virtqueue **i, *vq = malloc(sizeof(*vq)); + void *p; + + /* First we need some memory for this virtqueue. */ + pages = (vring_size(num_descs, LGUEST_VRING_ALIGN) + getpagesize() - 1) + / getpagesize(); + p = get_pages(pages); + + /* Initialize the virtqueue */ + vq->next = NULL; + vq->last_avail_idx = 0; + vq->dev = dev; + + /* + * This is the routine the service thread will run, and its Process ID + * once it's running. + */ + vq->service = service; + vq->thread = (pid_t)-1; + + /* Initialize the configuration. */ + vq->config.num = num_descs; + vq->config.irq = devices.next_irq++; + vq->config.pfn = to_guest_phys(p) / getpagesize(); + + /* Initialize the vring. */ + vring_init(&vq->vring, num_descs, p, LGUEST_VRING_ALIGN); + + /* + * Append virtqueue to this device's descriptor. We use + * device_config() to get the end of the device's current virtqueues; + * we check that we haven't added any config or feature information + * yet, otherwise we'd be overwriting them. + */ + assert(dev->desc->config_len == 0 && dev->desc->feature_len == 0); + memcpy(device_config(dev), &vq->config, sizeof(vq->config)); + dev->num_vq++; + dev->desc->num_vq++; + + verbose("Virtqueue page %#lx\n", to_guest_phys(p)); + + /* + * Add to tail of list, so dev->vq is first vq, dev->vq->next is + * second. + */ + for (i = &dev->vq; *i; i = &(*i)->next); + *i = vq; +} + +/* + * The first half of the feature bitmask is for us to advertise features. The + * second half is for the Guest to accept features. + */ +static void add_feature(struct device *dev, unsigned bit) +{ + u8 *features = get_feature_bits(dev); + + /* We can't extend the feature bits once we've added config bytes */ + if (dev->desc->feature_len <= bit / CHAR_BIT) { + assert(dev->desc->config_len == 0); + dev->feature_len = dev->desc->feature_len = (bit/CHAR_BIT) + 1; + } + + features[bit / CHAR_BIT] |= (1 << (bit % CHAR_BIT)); +} + +/* + * This routine sets the configuration fields for an existing device's + * descriptor. It only works for the last device, but that's OK because that's + * how we use it. + */ +static void set_config(struct device *dev, unsigned len, const void *conf) +{ + /* Check we haven't overflowed our single page. */ + if (device_config(dev) + len > devices.descpage + getpagesize()) + errx(1, "Too many devices"); + + /* Copy in the config information, and store the length. */ + memcpy(device_config(dev), conf, len); + dev->desc->config_len = len; + + /* Size must fit in config_len field (8 bits)! */ + assert(dev->desc->config_len == len); +} + +/* + * This routine does all the creation and setup of a new device, including + * calling new_dev_desc() to allocate the descriptor and device memory. We + * don't actually start the service threads until later. + * + * See what I mean about userspace being boring? + */ +static struct device *new_device(const char *name, u16 type) +{ + struct device *dev = malloc(sizeof(*dev)); + + /* Now we populate the fields one at a time. */ + dev->desc = new_dev_desc(type); + dev->name = name; + dev->vq = NULL; + dev->feature_len = 0; + dev->num_vq = 0; + dev->running = false; + + /* + * Append to device list. Prepending to a single-linked list is + * easier, but the user expects the devices to be arranged on the bus + * in command-line order. The first network device on the command line + * is eth0, the first block device /dev/vda, etc. + */ + if (devices.lastdev) + devices.lastdev->next = dev; + else + devices.dev = dev; + devices.lastdev = dev; + + return dev; +} + +/* + * Our first setup routine is the console. It's a fairly simple device, but + * UNIX tty handling makes it uglier than it could be. + */ +static void setup_console(void) +{ + struct device *dev; + + /* If we can save the initial standard input settings... */ + if (tcgetattr(STDIN_FILENO, &orig_term) == 0) { + struct termios term = orig_term; + /* + * Then we turn off echo, line buffering and ^C etc: We want a + * raw input stream to the Guest. + */ + term.c_lflag &= ~(ISIG|ICANON|ECHO); + tcsetattr(STDIN_FILENO, TCSANOW, &term); + } + + dev = new_device("console", VIRTIO_ID_CONSOLE); + + /* We store the console state in dev->priv, and initialize it. */ + dev->priv = malloc(sizeof(struct console_abort)); + ((struct console_abort *)dev->priv)->count = 0; + + /* + * The console needs two virtqueues: the input then the output. When + * they put something the input queue, we make sure we're listening to + * stdin. When they put something in the output queue, we write it to + * stdout. + */ + add_virtqueue(dev, VIRTQUEUE_NUM, console_input); + add_virtqueue(dev, VIRTQUEUE_NUM, console_output); + + verbose("device %u: console\n", ++devices.device_num); +} +/*:*/ + +/*M:010 + * Inter-guest networking is an interesting area. Simplest is to have a + * --sharenet= option which opens or creates a named pipe. This can be + * used to send packets to another guest in a 1:1 manner. + * + * More sopisticated is to use one of the tools developed for project like UML + * to do networking. + * + * Faster is to do virtio bonding in kernel. Doing this 1:1 would be + * completely generic ("here's my vring, attach to your vring") and would work + * for any traffic. Of course, namespace and permissions issues need to be + * dealt with. A more sophisticated "multi-channel" virtio_net.c could hide + * multiple inter-guest channels behind one interface, although it would + * require some manner of hotplugging new virtio channels. + * + * Finally, we could implement a virtio network switch in the kernel. +:*/ + +static u32 str2ip(const char *ipaddr) +{ + unsigned int b[4]; + + if (sscanf(ipaddr, "%u.%u.%u.%u", &b[0], &b[1], &b[2], &b[3]) != 4) + errx(1, "Failed to parse IP address '%s'", ipaddr); + return (b[0] << 24) | (b[1] << 16) | (b[2] << 8) | b[3]; +} + +static void str2mac(const char *macaddr, unsigned char mac[6]) +{ + unsigned int m[6]; + if (sscanf(macaddr, "%02x:%02x:%02x:%02x:%02x:%02x", + &m[0], &m[1], &m[2], &m[3], &m[4], &m[5]) != 6) + errx(1, "Failed to parse mac address '%s'", macaddr); + mac[0] = m[0]; + mac[1] = m[1]; + mac[2] = m[2]; + mac[3] = m[3]; + mac[4] = m[4]; + mac[5] = m[5]; +} + +/* + * This code is "adapted" from libbridge: it attaches the Host end of the + * network device to the bridge device specified by the command line. + * + * This is yet another James Morris contribution (I'm an IP-level guy, so I + * dislike bridging), and I just try not to break it. + */ +static void add_to_bridge(int fd, const char *if_name, const char *br_name) +{ + int ifidx; + struct ifreq ifr; + + if (!*br_name) + errx(1, "must specify bridge name"); + + ifidx = if_nametoindex(if_name); + if (!ifidx) + errx(1, "interface %s does not exist!", if_name); + + strncpy(ifr.ifr_name, br_name, IFNAMSIZ); + ifr.ifr_name[IFNAMSIZ-1] = '\0'; + ifr.ifr_ifindex = ifidx; + if (ioctl(fd, SIOCBRADDIF, &ifr) < 0) + err(1, "can't add %s to bridge %s", if_name, br_name); +} + +/* + * This sets up the Host end of the network device with an IP address, brings + * it up so packets will flow, the copies the MAC address into the hwaddr + * pointer. + */ +static void configure_device(int fd, const char *tapif, u32 ipaddr) +{ + struct ifreq ifr; + struct sockaddr_in sin; + + memset(&ifr, 0, sizeof(ifr)); + strcpy(ifr.ifr_name, tapif); + + /* Don't read these incantations. Just cut & paste them like I did! */ + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = htonl(ipaddr); + memcpy(&ifr.ifr_addr, &sin, sizeof(sin)); + if (ioctl(fd, SIOCSIFADDR, &ifr) != 0) + err(1, "Setting %s interface address", tapif); + ifr.ifr_flags = IFF_UP; + if (ioctl(fd, SIOCSIFFLAGS, &ifr) != 0) + err(1, "Bringing interface %s up", tapif); +} + +static int get_tun_device(char tapif[IFNAMSIZ]) +{ + struct ifreq ifr; + int netfd; + + /* Start with this zeroed. Messy but sure. */ + memset(&ifr, 0, sizeof(ifr)); + + /* + * We open the /dev/net/tun device and tell it we want a tap device. A + * tap device is like a tun device, only somehow different. To tell + * the truth, I completely blundered my way through this code, but it + * works now! + */ + netfd = open_or_die("/dev/net/tun", O_RDWR); + ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_VNET_HDR; + strcpy(ifr.ifr_name, "tap%d"); + if (ioctl(netfd, TUNSETIFF, &ifr) != 0) + err(1, "configuring /dev/net/tun"); + + if (ioctl(netfd, TUNSETOFFLOAD, + TUN_F_CSUM|TUN_F_TSO4|TUN_F_TSO6|TUN_F_TSO_ECN) != 0) + err(1, "Could not set features for tun device"); + + /* + * We don't need checksums calculated for packets coming in this + * device: trust us! + */ + ioctl(netfd, TUNSETNOCSUM, 1); + + memcpy(tapif, ifr.ifr_name, IFNAMSIZ); + return netfd; +} + +/*L:195 + * Our network is a Host<->Guest network. This can either use bridging or + * routing, but the principle is the same: it uses the "tun" device to inject + * packets into the Host as if they came in from a normal network card. We + * just shunt packets between the Guest and the tun device. + */ +static void setup_tun_net(char *arg) +{ + struct device *dev; + struct net_info *net_info = malloc(sizeof(*net_info)); + int ipfd; + u32 ip = INADDR_ANY; + bool bridging = false; + char tapif[IFNAMSIZ], *p; + struct virtio_net_config conf; + + net_info->tunfd = get_tun_device(tapif); + + /* First we create a new network device. */ + dev = new_device("net", VIRTIO_ID_NET); + dev->priv = net_info; + + /* Network devices need a recv and a send queue, just like console. */ + add_virtqueue(dev, VIRTQUEUE_NUM, net_input); + add_virtqueue(dev, VIRTQUEUE_NUM, net_output); + + /* + * We need a socket to perform the magic network ioctls to bring up the + * tap interface, connect to the bridge etc. Any socket will do! + */ + ipfd = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP); + if (ipfd < 0) + err(1, "opening IP socket"); + + /* If the command line was --tunnet=bridge: do bridging. */ + if (!strncmp(BRIDGE_PFX, arg, strlen(BRIDGE_PFX))) { + arg += strlen(BRIDGE_PFX); + bridging = true; + } + + /* A mac address may follow the bridge name or IP address */ + p = strchr(arg, ':'); + if (p) { + str2mac(p+1, conf.mac); + add_feature(dev, VIRTIO_NET_F_MAC); + *p = '\0'; + } + + /* arg is now either an IP address or a bridge name */ + if (bridging) + add_to_bridge(ipfd, tapif, arg); + else + ip = str2ip(arg); + + /* Set up the tun device. */ + configure_device(ipfd, tapif, ip); + + add_feature(dev, VIRTIO_F_NOTIFY_ON_EMPTY); + /* Expect Guest to handle everything except UFO */ + add_feature(dev, VIRTIO_NET_F_CSUM); + add_feature(dev, VIRTIO_NET_F_GUEST_CSUM); + add_feature(dev, VIRTIO_NET_F_GUEST_TSO4); + add_feature(dev, VIRTIO_NET_F_GUEST_TSO6); + add_feature(dev, VIRTIO_NET_F_GUEST_ECN); + add_feature(dev, VIRTIO_NET_F_HOST_TSO4); + add_feature(dev, VIRTIO_NET_F_HOST_TSO6); + add_feature(dev, VIRTIO_NET_F_HOST_ECN); + /* We handle indirect ring entries */ + add_feature(dev, VIRTIO_RING_F_INDIRECT_DESC); + set_config(dev, sizeof(conf), &conf); + + /* We don't need the socket any more; setup is done. */ + close(ipfd); + + devices.device_num++; + + if (bridging) + verbose("device %u: tun %s attached to bridge: %s\n", + devices.device_num, tapif, arg); + else + verbose("device %u: tun %s: %s\n", + devices.device_num, tapif, arg); +} +/*:*/ + +/* This hangs off device->priv. */ +struct vblk_info { + /* The size of the file. */ + off64_t len; + + /* The file descriptor for the file. */ + int fd; + +}; + +/*L:210 + * The Disk + * + * The disk only has one virtqueue, so it only has one thread. It is really + * simple: the Guest asks for a block number and we read or write that position + * in the file. + * + * Before we serviced each virtqueue in a separate thread, that was unacceptably + * slow: the Guest waits until the read is finished before running anything + * else, even if it could have been doing useful work. + * + * We could have used async I/O, except it's reputed to suck so hard that + * characters actually go missing from your code when you try to use it. + */ +static void blk_request(struct virtqueue *vq) +{ + struct vblk_info *vblk = vq->dev->priv; + unsigned int head, out_num, in_num, wlen; + int ret; + u8 *in; + struct virtio_blk_outhdr *out; + struct iovec iov[vq->vring.num]; + off64_t off; + + /* + * Get the next request, where we normally wait. It triggers the + * interrupt to acknowledge previously serviced requests (if any). + */ + head = wait_for_vq_desc(vq, iov, &out_num, &in_num); + + /* + * Every block request should contain at least one output buffer + * (detailing the location on disk and the type of request) and one + * input buffer (to hold the result). + */ + if (out_num == 0 || in_num == 0) + errx(1, "Bad virtblk cmd %u out=%u in=%u", + head, out_num, in_num); + + out = convert(&iov[0], struct virtio_blk_outhdr); + in = convert(&iov[out_num+in_num-1], u8); + /* + * For historical reasons, block operations are expressed in 512 byte + * "sectors". + */ + off = out->sector * 512; + + /* + * In general the virtio block driver is allowed to try SCSI commands. + * It'd be nice if we supported eject, for example, but we don't. + */ + if (out->type & VIRTIO_BLK_T_SCSI_CMD) { + fprintf(stderr, "Scsi commands unsupported\n"); + *in = VIRTIO_BLK_S_UNSUPP; + wlen = sizeof(*in); + } else if (out->type & VIRTIO_BLK_T_OUT) { + /* + * Write + * + * Move to the right location in the block file. This can fail + * if they try to write past end. + */ + if (lseek64(vblk->fd, off, SEEK_SET) != off) + err(1, "Bad seek to sector %llu", out->sector); + + ret = writev(vblk->fd, iov+1, out_num-1); + verbose("WRITE to sector %llu: %i\n", out->sector, ret); + + /* + * Grr... Now we know how long the descriptor they sent was, we + * make sure they didn't try to write over the end of the block + * file (possibly extending it). + */ + if (ret > 0 && off + ret > vblk->len) { + /* Trim it back to the correct length */ + ftruncate64(vblk->fd, vblk->len); + /* Die, bad Guest, die. */ + errx(1, "Write past end %llu+%u", off, ret); + } + + wlen = sizeof(*in); + *in = (ret >= 0 ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR); + } else if (out->type & VIRTIO_BLK_T_FLUSH) { + /* Flush */ + ret = fdatasync(vblk->fd); + verbose("FLUSH fdatasync: %i\n", ret); + wlen = sizeof(*in); + *in = (ret >= 0 ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR); + } else { + /* + * Read + * + * Move to the right location in the block file. This can fail + * if they try to read past end. + */ + if (lseek64(vblk->fd, off, SEEK_SET) != off) + err(1, "Bad seek to sector %llu", out->sector); + + ret = readv(vblk->fd, iov+1, in_num-1); + verbose("READ from sector %llu: %i\n", out->sector, ret); + if (ret >= 0) { + wlen = sizeof(*in) + ret; + *in = VIRTIO_BLK_S_OK; + } else { + wlen = sizeof(*in); + *in = VIRTIO_BLK_S_IOERR; + } + } + + /* Finished that request. */ + add_used(vq, head, wlen); +} + +/*L:198 This actually sets up a virtual block device. */ +static void setup_block_file(const char *filename) +{ + struct device *dev; + struct vblk_info *vblk; + struct virtio_blk_config conf; + + /* Creat the device. */ + dev = new_device("block", VIRTIO_ID_BLOCK); + + /* The device has one virtqueue, where the Guest places requests. */ + add_virtqueue(dev, VIRTQUEUE_NUM, blk_request); + + /* Allocate the room for our own bookkeeping */ + vblk = dev->priv = malloc(sizeof(*vblk)); + + /* First we open the file and store the length. */ + vblk->fd = open_or_die(filename, O_RDWR|O_LARGEFILE); + vblk->len = lseek64(vblk->fd, 0, SEEK_END); + + /* We support FLUSH. */ + add_feature(dev, VIRTIO_BLK_F_FLUSH); + + /* Tell Guest how many sectors this device has. */ + conf.capacity = cpu_to_le64(vblk->len / 512); + + /* + * Tell Guest not to put in too many descriptors at once: two are used + * for the in and out elements. + */ + add_feature(dev, VIRTIO_BLK_F_SEG_MAX); + conf.seg_max = cpu_to_le32(VIRTQUEUE_NUM - 2); + + /* Don't try to put whole struct: we have 8 bit limit. */ + set_config(dev, offsetof(struct virtio_blk_config, geometry), &conf); + + verbose("device %u: virtblock %llu sectors\n", + ++devices.device_num, le64_to_cpu(conf.capacity)); +} + +/*L:211 + * Our random number generator device reads from /dev/random into the Guest's + * input buffers. The usual case is that the Guest doesn't want random numbers + * and so has no buffers although /dev/random is still readable, whereas + * console is the reverse. + * + * The same logic applies, however. + */ +struct rng_info { + int rfd; +}; + +static void rng_input(struct virtqueue *vq) +{ + int len; + unsigned int head, in_num, out_num, totlen = 0; + struct rng_info *rng_info = vq->dev->priv; + struct iovec iov[vq->vring.num]; + + /* First we need a buffer from the Guests's virtqueue. */ + head = wait_for_vq_desc(vq, iov, &out_num, &in_num); + if (out_num) + errx(1, "Output buffers in rng?"); + + /* + * Just like the console write, we loop to cover the whole iovec. + * In this case, short reads actually happen quite a bit. + */ + while (!iov_empty(iov, in_num)) { + len = readv(rng_info->rfd, iov, in_num); + if (len <= 0) + err(1, "Read from /dev/random gave %i", len); + iov_consume(iov, in_num, len); + totlen += len; + } + + /* Tell the Guest about the new input. */ + add_used(vq, head, totlen); +} + +/*L:199 + * This creates a "hardware" random number device for the Guest. + */ +static void setup_rng(void) +{ + struct device *dev; + struct rng_info *rng_info = malloc(sizeof(*rng_info)); + + /* Our device's privat info simply contains the /dev/random fd. */ + rng_info->rfd = open_or_die("/dev/random", O_RDONLY); + + /* Create the new device. */ + dev = new_device("rng", VIRTIO_ID_RNG); + dev->priv = rng_info; + + /* The device has one virtqueue, where the Guest places inbufs. */ + add_virtqueue(dev, VIRTQUEUE_NUM, rng_input); + + verbose("device %u: rng\n", devices.device_num++); +} +/* That's the end of device setup. */ + +/*L:230 Reboot is pretty easy: clean up and exec() the Launcher afresh. */ +static void __attribute__((noreturn)) restart_guest(void) +{ + unsigned int i; + + /* + * Since we don't track all open fds, we simply close everything beyond + * stderr. + */ + for (i = 3; i < FD_SETSIZE; i++) + close(i); + + /* Reset all the devices (kills all threads). */ + cleanup_devices(); + + execv(main_args[0], main_args); + err(1, "Could not exec %s", main_args[0]); +} + +/*L:220 + * Finally we reach the core of the Launcher which runs the Guest, serves + * its input and output, and finally, lays it to rest. + */ +static void __attribute__((noreturn)) run_guest(void) +{ + for (;;) { + unsigned long notify_addr; + int readval; + + /* We read from the /dev/lguest device to run the Guest. */ + readval = pread(lguest_fd, ¬ify_addr, + sizeof(notify_addr), cpu_id); + + /* One unsigned long means the Guest did HCALL_NOTIFY */ + if (readval == sizeof(notify_addr)) { + verbose("Notify on address %#lx\n", notify_addr); + handle_output(notify_addr); + /* ENOENT means the Guest died. Reading tells us why. */ + } else if (errno == ENOENT) { + char reason[1024] = { 0 }; + pread(lguest_fd, reason, sizeof(reason)-1, cpu_id); + errx(1, "%s", reason); + /* ERESTART means that we need to reboot the guest */ + } else if (errno == ERESTART) { + restart_guest(); + /* Anything else means a bug or incompatible change. */ + } else + err(1, "Running guest failed"); + } +} +/*L:240 + * This is the end of the Launcher. The good news: we are over halfway + * through! The bad news: the most fiendish part of the code still lies ahead + * of us. + * + * Are you ready? Take a deep breath and join me in the core of the Host, in + * "make Host". +:*/ + +static struct option opts[] = { + { "verbose", 0, NULL, 'v' }, + { "tunnet", 1, NULL, 't' }, + { "block", 1, NULL, 'b' }, + { "rng", 0, NULL, 'r' }, + { "initrd", 1, NULL, 'i' }, + { "username", 1, NULL, 'u' }, + { "chroot", 1, NULL, 'c' }, + { NULL }, +}; +static void usage(void) +{ + errx(1, "Usage: lguest [--verbose] " + "[--tunnet=(:|bridge::)\n" + "|--block=|--initrd=]...\n" + " vmlinux [args...]"); +} + +/*L:105 The main routine is where the real work begins: */ +int main(int argc, char *argv[]) +{ + /* Memory, code startpoint and size of the (optional) initrd. */ + unsigned long mem = 0, start, initrd_size = 0; + /* Two temporaries. */ + int i, c; + /* The boot information for the Guest. */ + struct boot_params *boot; + /* If they specify an initrd file to load. */ + const char *initrd_name = NULL; + + /* Password structure for initgroups/setres[gu]id */ + struct passwd *user_details = NULL; + + /* Directory to chroot to */ + char *chroot_path = NULL; + + /* Save the args: we "reboot" by execing ourselves again. */ + main_args = argv; + + /* + * First we initialize the device list. We keep a pointer to the last + * device, and the next interrupt number to use for devices (1: + * remember that 0 is used by the timer). + */ + devices.lastdev = NULL; + devices.next_irq = 1; + + /* We're CPU 0. In fact, that's the only CPU possible right now. */ + cpu_id = 0; + + /* + * We need to know how much memory so we can set up the device + * descriptor and memory pages for the devices as we parse the command + * line. So we quickly look through the arguments to find the amount + * of memory now. + */ + for (i = 1; i < argc; i++) { + if (argv[i][0] != '-') { + mem = atoi(argv[i]) * 1024 * 1024; + /* + * We start by mapping anonymous pages over all of + * guest-physical memory range. This fills it with 0, + * and ensures that the Guest won't be killed when it + * tries to access it. + */ + guest_base = map_zeroed_pages(mem / getpagesize() + + DEVICE_PAGES); + guest_limit = mem; + guest_max = mem + DEVICE_PAGES*getpagesize(); + devices.descpage = get_pages(1); + break; + } + } + + /* The options are fairly straight-forward */ + while ((c = getopt_long(argc, argv, "v", opts, NULL)) != EOF) { + switch (c) { + case 'v': + verbose = true; + break; + case 't': + setup_tun_net(optarg); + break; + case 'b': + setup_block_file(optarg); + break; + case 'r': + setup_rng(); + break; + case 'i': + initrd_name = optarg; + break; + case 'u': + user_details = getpwnam(optarg); + if (!user_details) + err(1, "getpwnam failed, incorrect username?"); + break; + case 'c': + chroot_path = optarg; + break; + default: + warnx("Unknown argument %s", argv[optind]); + usage(); + } + } + /* + * After the other arguments we expect memory and kernel image name, + * followed by command line arguments for the kernel. + */ + if (optind + 2 > argc) + usage(); + + verbose("Guest base is at %p\n", guest_base); + + /* We always have a console device */ + setup_console(); + + /* Now we load the kernel */ + start = load_kernel(open_or_die(argv[optind+1], O_RDONLY)); + + /* Boot information is stashed at physical address 0 */ + boot = from_guest_phys(0); + + /* Map the initrd image if requested (at top of physical memory) */ + if (initrd_name) { + initrd_size = load_initrd(initrd_name, mem); + /* + * These are the location in the Linux boot header where the + * start and size of the initrd are expected to be found. + */ + boot->hdr.ramdisk_image = mem - initrd_size; + boot->hdr.ramdisk_size = initrd_size; + /* The bootloader type 0xFF means "unknown"; that's OK. */ + boot->hdr.type_of_loader = 0xFF; + } + + /* + * The Linux boot header contains an "E820" memory map: ours is a + * simple, single region. + */ + boot->e820_entries = 1; + boot->e820_map[0] = ((struct e820entry) { 0, mem, E820_RAM }); + /* + * The boot header contains a command line pointer: we put the command + * line after the boot header. + */ + boot->hdr.cmd_line_ptr = to_guest_phys(boot + 1); + /* We use a simple helper to copy the arguments separated by spaces. */ + concat((char *)(boot + 1), argv+optind+2); + + /* Boot protocol version: 2.07 supports the fields for lguest. */ + boot->hdr.version = 0x207; + + /* The hardware_subarch value of "1" tells the Guest it's an lguest. */ + boot->hdr.hardware_subarch = 1; + + /* Tell the entry path not to try to reload segment registers. */ + boot->hdr.loadflags |= KEEP_SEGMENTS; + + /* + * We tell the kernel to initialize the Guest: this returns the open + * /dev/lguest file descriptor. + */ + tell_kernel(start); + + /* Ensure that we terminate if a device-servicing child dies. */ + signal(SIGCHLD, kill_launcher); + + /* If we exit via err(), this kills all the threads, restores tty. */ + atexit(cleanup_devices); + + /* If requested, chroot to a directory */ + if (chroot_path) { + if (chroot(chroot_path) != 0) + err(1, "chroot(\"%s\") failed", chroot_path); + + if (chdir("/") != 0) + err(1, "chdir(\"/\") failed"); + + verbose("chroot done\n"); + } + + /* If requested, drop privileges */ + if (user_details) { + uid_t u; + gid_t g; + + u = user_details->pw_uid; + g = user_details->pw_gid; + + if (initgroups(user_details->pw_name, g) != 0) + err(1, "initgroups failed"); + + if (setresgid(g, g, g) != 0) + err(1, "setresgid failed"); + + if (setresuid(u, u, u) != 0) + err(1, "setresuid failed"); + + verbose("Dropping privileges completed\n"); + } + + /* Finally, run the Guest. This doesn't return. */ + run_guest(); +} +/*:*/ + +/*M:999 + * Mastery is done: you now know everything I do. + * + * But surely you have seen code, features and bugs in your wanderings which + * you now yearn to attack? That is the real game, and I look forward to you + * patching and forking lguest into the Your-Name-Here-visor. + * + * Farewell, and good coding! + * Rusty Russell. + */ diff --git a/Documentation/virtual/lguest/lguest.txt b/Documentation/virtual/lguest/lguest.txt new file mode 100644 index 0000000..dad9997 --- /dev/null +++ b/Documentation/virtual/lguest/lguest.txt @@ -0,0 +1,128 @@ + __ + (___()'`; Rusty's Remarkably Unreliable Guide to Lguest + /, /` - or, A Young Coder's Illustrated Hypervisor + \\"--\\ http://lguest.ozlabs.org + +Lguest is designed to be a minimal 32-bit x86 hypervisor for the Linux kernel, +for Linux developers and users to experiment with virtualization with the +minimum of complexity. Nonetheless, it should have sufficient features to +make it useful for specific tasks, and, of course, you are encouraged to fork +and enhance it (see drivers/lguest/README). + +Features: + +- Kernel module which runs in a normal kernel. +- Simple I/O model for communication. +- Simple program to create new guests. +- Logo contains cute puppies: http://lguest.ozlabs.org + +Developer features: + +- Fun to hack on. +- No ABI: being tied to a specific kernel anyway, you can change anything. +- Many opportunities for improvement or feature implementation. + +Running Lguest: + +- The easiest way to run lguest is to use same kernel as guest and host. + You can configure them differently, but usually it's easiest not to. + + You will need to configure your kernel with the following options: + + "General setup": + "Prompt for development and/or incomplete code/drivers" = Y + (CONFIG_EXPERIMENTAL=y) + + "Processor type and features": + "Paravirtualized guest support" = Y + "Lguest guest support" = Y + "High Memory Support" = off/4GB + "Alignment value to which kernel should be aligned" = 0x100000 + (CONFIG_PARAVIRT=y, CONFIG_LGUEST_GUEST=y, CONFIG_HIGHMEM64G=n and + CONFIG_PHYSICAL_ALIGN=0x100000) + + "Device Drivers": + "Block devices" + "Virtio block driver (EXPERIMENTAL)" = M/Y + "Network device support" + "Universal TUN/TAP device driver support" = M/Y + "Virtio network driver (EXPERIMENTAL)" = M/Y + (CONFIG_VIRTIO_BLK=m, CONFIG_VIRTIO_NET=m and CONFIG_TUN=m) + + "Virtualization" + "Linux hypervisor example code" = M/Y + (CONFIG_LGUEST=m) + +- A tool called "lguest" is available in this directory: type "make" + to build it. If you didn't build your kernel in-tree, use "make + O=". + +- Create or find a root disk image. There are several useful ones + around, such as the xm-test tiny root image at + http://xm-test.xensource.com/ramdisks/initrd-1.1-i386.img + + For more serious work, I usually use a distribution ISO image and + install it under qemu, then make multiple copies: + + dd if=/dev/zero of=rootfile bs=1M count=2048 + qemu -cdrom image.iso -hda rootfile -net user -net nic -boot d + + Make sure that you install a getty on /dev/hvc0 if you want to log in on the + console! + +- "modprobe lg" if you built it as a module. + +- Run an lguest as root: + + Documentation/lguest/lguest 64 vmlinux --tunnet=192.168.19.1 --block=rootfile root=/dev/vda + + Explanation: + 64: the amount of memory to use, in MB. + + vmlinux: the kernel image found in the top of your build directory. You + can also use a standard bzImage. + + --tunnet=192.168.19.1: configures a "tap" device for networking with this + IP address. + + --block=rootfile: a file or block device which becomes /dev/vda + inside the guest. + + root=/dev/vda: this (and anything else on the command line) are + kernel boot parameters. + +- Configuring networking. I usually have the host masquerade, using + "iptables -t nat -A POSTROUTING -o eth0 -j MASQUERADE" and "echo 1 > + /proc/sys/net/ipv4/ip_forward". In this example, I would configure + eth0 inside the guest at 192.168.19.2. + + Another method is to bridge the tap device to an external interface + using --tunnet=bridge:, and perhaps run dhcp on the guest + to obtain an IP address. The bridge needs to be configured first: + this option simply adds the tap interface to it. + + A simple example on my system: + + ifconfig eth0 0.0.0.0 + brctl addbr lg0 + ifconfig lg0 up + brctl addif lg0 eth0 + dhclient lg0 + + Then use --tunnet=bridge:lg0 when launching the guest. + + See: + + http://www.linuxfoundation.org/collaborate/workgroups/networking/bridge + + for general information on how to get bridging to work. + +- Random number generation. Using the --rng option will provide a + /dev/hwrng in the guest that will read from the host's /dev/random. + Use this option in conjunction with rng-tools (see ../hw_random.txt) + to provide entropy to the guest kernel's /dev/random. + +There is a helpful mailing list at http://ozlabs.org/mailman/listinfo/lguest + +Good luck! +Rusty Russell rusty@rustcorp.com.au. diff --git a/Documentation/virtual/uml/UserModeLinux-HOWTO.txt b/Documentation/virtual/uml/UserModeLinux-HOWTO.txt new file mode 100644 index 0000000..9b7e190 --- /dev/null +++ b/Documentation/virtual/uml/UserModeLinux-HOWTO.txt @@ -0,0 +1,4579 @@ + User Mode Linux HOWTO + User Mode Linux Core Team + Mon Nov 18 14:16:16 EST 2002 + + This document describes the use and abuse of Jeff Dike's User Mode + Linux: a port of the Linux kernel as a normal Intel Linux process. + ______________________________________________________________________ + + Table of Contents + + 1. Introduction + + 1.1 How is User Mode Linux Different? + 1.2 Why Would I Want User Mode Linux? + + 2. Compiling the kernel and modules + + 2.1 Compiling the kernel + 2.2 Compiling and installing kernel modules + 2.3 Compiling and installing uml_utilities + + 3. Running UML and logging in + + 3.1 Running UML + 3.2 Logging in + 3.3 Examples + + 4. UML on 2G/2G hosts + + 4.1 Introduction + 4.2 The problem + 4.3 The solution + + 5. Setting up serial lines and consoles + + 5.1 Specifying the device + 5.2 Specifying the channel + 5.3 Examples + + 6. Setting up the network + + 6.1 General setup + 6.2 Userspace daemons + 6.3 Specifying ethernet addresses + 6.4 UML interface setup + 6.5 Multicast + 6.6 TUN/TAP with the uml_net helper + 6.7 TUN/TAP with a preconfigured tap device + 6.8 Ethertap + 6.9 The switch daemon + 6.10 Slip + 6.11 Slirp + 6.12 pcap + 6.13 Setting up the host yourself + + 7. Sharing Filesystems between Virtual Machines + + 7.1 A warning + 7.2 Using layered block devices + 7.3 Note! + 7.4 Another warning + 7.5 uml_moo : Merging a COW file with its backing file + + 8. Creating filesystems + + 8.1 Create the filesystem file + 8.2 Assign the file to a UML device + 8.3 Creating and mounting the filesystem + + 9. Host file access + + 9.1 Using hostfs + 9.2 hostfs as the root filesystem + 9.3 Building hostfs + + 10. The Management Console + 10.1 version + 10.2 halt and reboot + 10.3 config + 10.4 remove + 10.5 sysrq + 10.6 help + 10.7 cad + 10.8 stop + 10.9 go + + 11. Kernel debugging + + 11.1 Starting the kernel under gdb + 11.2 Examining sleeping processes + 11.3 Running ddd on UML + 11.4 Debugging modules + 11.5 Attaching gdb to the kernel + 11.6 Using alternate debuggers + + 12. Kernel debugging examples + + 12.1 The case of the hung fsck + 12.2 Episode 2: The case of the hung fsck + + 13. What to do when UML doesn't work + + 13.1 Strange compilation errors when you build from source + 13.2 (obsolete) + 13.3 A variety of panics and hangs with /tmp on a reiserfs filesystem + 13.4 The compile fails with errors about conflicting types for 'open', 'dup', and 'waitpid' + 13.5 UML doesn't work when /tmp is an NFS filesystem + 13.6 UML hangs on boot when compiled with gprof support + 13.7 syslogd dies with a SIGTERM on startup + 13.8 TUN/TAP networking doesn't work on a 2.4 host + 13.9 You can network to the host but not to other machines on the net + 13.10 I have no root and I want to scream + 13.11 UML build conflict between ptrace.h and ucontext.h + 13.12 The UML BogoMips is exactly half the host's BogoMips + 13.13 When you run UML, it immediately segfaults + 13.14 xterms appear, then immediately disappear + 13.15 Any other panic, hang, or strange behavior + + 14. Diagnosing Problems + + 14.1 Case 1 : Normal kernel panics + 14.2 Case 2 : Tracing thread panics + 14.3 Case 3 : Tracing thread panics caused by other threads + 14.4 Case 4 : Hangs + + 15. Thanks + + 15.1 Code and Documentation + 15.2 Flushing out bugs + 15.3 Buglets and clean-ups + 15.4 Case Studies + 15.5 Other contributions + + + ______________________________________________________________________ + + 11.. IInnttrroodduuccttiioonn + + Welcome to User Mode Linux. It's going to be fun. + + + + 11..11.. HHooww iiss UUsseerr MMooddee LLiinnuuxx DDiiffffeerreenntt?? + + Normally, the Linux Kernel talks straight to your hardware (video + card, keyboard, hard drives, etc), and any programs which run ask the + kernel to operate the hardware, like so: + + + + +-----------+-----------+----+ + | Process 1 | Process 2 | ...| + +-----------+-----------+----+ + | Linux Kernel | + +----------------------------+ + | Hardware | + +----------------------------+ + + + + + The User Mode Linux Kernel is different; instead of talking to the + hardware, it talks to a `real' Linux kernel (called the `host kernel' + from now on), like any other program. Programs can then run inside + User-Mode Linux as if they were running under a normal kernel, like + so: + + + + +----------------+ + | Process 2 | ...| + +-----------+----------------+ + | Process 1 | User-Mode Linux| + +----------------------------+ + | Linux Kernel | + +----------------------------+ + | Hardware | + +----------------------------+ + + + + + + 11..22.. WWhhyy WWoouulldd II WWaanntt UUsseerr MMooddee LLiinnuuxx?? + + + 1. If User Mode Linux crashes, your host kernel is still fine. + + 2. You can run a usermode kernel as a non-root user. + + 3. You can debug the User Mode Linux like any normal process. + + 4. You can run gprof (profiling) and gcov (coverage testing). + + 5. You can play with your kernel without breaking things. + + 6. You can use it as a sandbox for testing new apps. + + 7. You can try new development kernels safely. + + 8. You can run different distributions simultaneously. + + 9. It's extremely fun. + + + + + + 22.. CCoommppiilliinngg tthhee kkeerrnneell aanndd mmoodduulleess + + + + + 22..11.. CCoommppiilliinngg tthhee kkeerrnneell + + + Compiling the user mode kernel is just like compiling any other + kernel. Let's go through the steps, using 2.4.0-prerelease (current + as of this writing) as an example: + + + 1. Download the latest UML patch from + + the download page + . + + + 3. Make a directory and unpack the kernel into it. + + + + host% + mkdir ~/uml + + + + + + + host% + cd ~/uml + + + + + + + host% + tar -xzvf linux-2.4.0-prerelease.tar.bz2 + + + + + + + 4. Apply the patch using + + + + host% + cd ~/uml/linux + + + + host% + bzcat uml-patch-2.4.0-prerelease.bz2 | patch -p1 + + + + + + + 5. Run your favorite config; `make xconfig ARCH=um' is the most + convenient. `make config ARCH=um' and 'make menuconfig ARCH=um' + will work as well. The defaults will give you a useful kernel. If + you want to change something, go ahead, it probably won't hurt + anything. + + + Note: If the host is configured with a 2G/2G address space split + rather than the usual 3G/1G split, then the packaged UML binaries + will not run. They will immediately segfault. See ``UML on 2G/2G + hosts'' for the scoop on running UML on your system. + + + + 6. Finish with `make linux ARCH=um': the result is a file called + `linux' in the top directory of your source tree. + + Make sure that you don't build this kernel in /usr/src/linux. On some + distributions, /usr/include/asm is a link into this pool. The user- + mode build changes the other end of that link, and things that include + stop compiling. + + The sources are also available from cvs at the project's cvs page, + which has directions on getting the sources. You can also browse the + CVS pool from there. + + If you get the CVS sources, you will have to check them out into an + empty directory. You will then have to copy each file into the + corresponding directory in the appropriate kernel pool. + + If you don't have the latest kernel pool, you can get the + corresponding user-mode sources with + + + host% cvs co -r v_2_3_x linux + + + + + where 'x' is the version in your pool. Note that you will not get the + bug fixes and enhancements that have gone into subsequent releases. + + + 22..22.. CCoommppiilliinngg aanndd iinnssttaalllliinngg kkeerrnneell mmoodduulleess + + UML modules are built in the same way as the native kernel (with the + exception of the 'ARCH=um' that you always need for UML): + + + host% make modules ARCH=um + + + + + Any modules that you want to load into this kernel need to be built in + the user-mode pool. Modules from the native kernel won't work. + + You can install them by using ftp or something to copy them into the + virtual machine and dropping them into /lib/modules/`uname -r`. + + You can also get the kernel build process to install them as follows: + + 1. with the kernel not booted, mount the root filesystem in the top + level of the kernel pool: + + + host% mount root_fs mnt -o loop + + + + + + + 2. run + + + host% + make modules_install INSTALL_MOD_PATH=`pwd`/mnt ARCH=um + + + + + + + 3. unmount the filesystem + + + host% umount mnt + + + + + + + 4. boot the kernel on it + + + When the system is booted, you can use insmod as usual to get the + modules into the kernel. A number of things have been loaded into UML + as modules, especially filesystems and network protocols and filters, + so most symbols which need to be exported probably already are. + However, if you do find symbols that need exporting, let us + know, and + they'll be "taken care of". + + + + 22..33.. CCoommppiilliinngg aanndd iinnssttaalllliinngg uummll__uuttiilliittiieess + + Many features of the UML kernel require a user-space helper program, + so a uml_utilities package is distributed separately from the kernel + patch which provides these helpers. Included within this is: + + +o port-helper - Used by consoles which connect to xterms or ports + + +o tunctl - Configuration tool to create and delete tap devices + + +o uml_net - Setuid binary for automatic tap device configuration + + +o uml_switch - User-space virtual switch required for daemon + transport + + The uml_utilities tree is compiled with: + + + host# + make && make install + + + + + Note that UML kernel patches may require a specific version of the + uml_utilities distribution. If you don't keep up with the mailing + lists, ensure that you have the latest release of uml_utilities if you + are experiencing problems with your UML kernel, particularly when + dealing with consoles or command-line switches to the helper programs + + + + + + + + + 33.. RRuunnnniinngg UUMMLL aanndd llooggggiinngg iinn + + + + 33..11.. RRuunnnniinngg UUMMLL + + It runs on 2.2.15 or later, and all 2.4 kernels. + + + Booting UML is straightforward. Simply run 'linux': it will try to + mount the file `root_fs' in the current directory. You do not need to + run it as root. If your root filesystem is not named `root_fs', then + you need to put a `ubd0=root_fs_whatever' switch on the linux command + line. + + + You will need a filesystem to boot UML from. There are a number + available for download from here . There are also several tools + which can be + used to generate UML-compatible filesystem images from media. + The kernel will boot up and present you with a login prompt. + + + Note: If the host is configured with a 2G/2G address space split + rather than the usual 3G/1G split, then the packaged UML binaries will + not run. They will immediately segfault. See ``UML on 2G/2G hosts'' + for the scoop on running UML on your system. + + + + 33..22.. LLooggggiinngg iinn + + + + The prepackaged filesystems have a root account with password 'root' + and a user account with password 'user'. The login banner will + generally tell you how to log in. So, you log in and you will find + yourself inside a little virtual machine. Our filesystems have a + variety of commands and utilities installed (and it is fairly easy to + add more), so you will have a lot of tools with which to poke around + the system. + + There are a couple of other ways to log in: + + +o On a virtual console + + + + Each virtual console that is configured (i.e. the device exists in + /dev and /etc/inittab runs a getty on it) will come up in its own + xterm. If you get tired of the xterms, read ``Setting up serial + lines and consoles'' to see how to attach the consoles to + something else, like host ptys. + + + + +o Over the serial line + + + In the boot output, find a line that looks like: + + + + serial line 0 assigned pty /dev/ptyp1 + + + + + Attach your favorite terminal program to the corresponding tty. I.e. + for minicom, the command would be + + + host% minicom -o -p /dev/ttyp1 + + + + + + + +o Over the net + + + If the network is running, then you can telnet to the virtual + machine and log in to it. See ``Setting up the network'' to learn + about setting up a virtual network. + + When you're done using it, run halt, and the kernel will bring itself + down and the process will exit. + + + 33..33.. EExxaammpplleess + + Here are some examples of UML in action: + + +o A login session + + +o A virtual network + + + + + + + + 44.. UUMMLL oonn 22GG//22GG hhoossttss + + + + + 44..11.. IInnttrroodduuccttiioonn + + + Most Linux machines are configured so that the kernel occupies the + upper 1G (0xc0000000 - 0xffffffff) of the 4G address space and + processes use the lower 3G (0x00000000 - 0xbfffffff). However, some + machine are configured with a 2G/2G split, with the kernel occupying + the upper 2G (0x80000000 - 0xffffffff) and processes using the lower + 2G (0x00000000 - 0x7fffffff). + + + + + 44..22.. TThhee pprroobblleemm + + + The prebuilt UML binaries on this site will not run on 2G/2G hosts + because UML occupies the upper .5G of the 3G process address space + (0xa0000000 - 0xbfffffff). Obviously, on 2G/2G hosts, this is right + in the middle of the kernel address space, so UML won't even load - it + will immediately segfault. + + + + + 44..33.. TThhee ssoolluuttiioonn + + + The fix for this is to rebuild UML from source after enabling + CONFIG_HOST_2G_2G (under 'General Setup'). This will cause UML to + load itself in the top .5G of that smaller process address space, + where it will run fine. See ``Compiling the kernel and modules'' if + you need help building UML from source. + + + + + + + + + + + 55.. SSeettttiinngg uupp sseerriiaall lliinneess aanndd ccoonnssoolleess + + + It is possible to attach UML serial lines and consoles to many types + of host I/O channels by specifying them on the command line. + + + You can attach them to host ptys, ttys, file descriptors, and ports. + This allows you to do things like + + +o have a UML console appear on an unused host console, + + +o hook two virtual machines together by having one attach to a pty + and having the other attach to the corresponding tty + + +o make a virtual machine accessible from the net by attaching a + console to a port on the host. + + + The general format of the command line option is device=channel. + + + + 55..11.. SSppeecciiffyyiinngg tthhee ddeevviiccee + + Devices are specified with "con" or "ssl" (console or serial line, + respectively), optionally with a device number if you are talking + about a specific device. + + + Using just "con" or "ssl" describes all of the consoles or serial + lines. If you want to talk about console #3 or serial line #10, they + would be "con3" and "ssl10", respectively. + + + A specific device name will override a less general "con=" or "ssl=". + So, for example, you can assign a pty to each of the serial lines + except for the first two like this: + + + ssl=pty ssl0=tty:/dev/tty0 ssl1=tty:/dev/tty1 + + + + + The specificity of the device name is all that matters; order on the + command line is irrelevant. + + + + 55..22.. SSppeecciiffyyiinngg tthhee cchhaannnneell + + There are a number of different types of channels to attach a UML + device to, each with a different way of specifying exactly what to + attach to. + + +o pseudo-terminals - device=pty pts terminals - device=pts + + + This will cause UML to allocate a free host pseudo-terminal for the + device. The terminal that it got will be announced in the boot + log. You access it by attaching a terminal program to the + corresponding tty: + + +o screen /dev/pts/n + + +o screen /dev/ttyxx + + +o minicom -o -p /dev/ttyxx - minicom seems not able to handle pts + devices + + +o kermit - start it up, 'open' the device, then 'connect' + + + + + + +o terminals - device=tty:tty device file + + + This will make UML attach the device to the specified tty (i.e + + + con1=tty:/dev/tty3 + + + + + will attach UML's console 1 to the host's /dev/tty3). If the tty that + you specify is the slave end of a tty/pty pair, something else must + have already opened the corresponding pty in order for this to work. + + + + + + +o xterms - device=xterm + + + UML will run an xterm and the device will be attached to it. + + + + + + +o Port - device=port:port number + + + This will attach the UML devices to the specified host port. + Attaching console 1 to the host's port 9000 would be done like + this: + + + con1=port:9000 + + + + + Attaching all the serial lines to that port would be done similarly: + + + ssl=port:9000 + + + + + You access these devices by telnetting to that port. Each active tel- + net session gets a different device. If there are more telnets to a + port than UML devices attached to it, then the extra telnet sessions + will block until an existing telnet detaches, or until another device + becomes active (i.e. by being activated in /etc/inittab). + + This channel has the advantage that you can both attach multiple UML + devices to it and know how to access them without reading the UML boot + log. It is also unique in allowing access to a UML from remote + machines without requiring that the UML be networked. This could be + useful in allowing public access to UMLs because they would be + accessible from the net, but wouldn't need any kind of network + filtering or access control because they would have no network access. + + + If you attach the main console to a portal, then the UML boot will + appear to hang. In reality, it's waiting for a telnet to connect, at + which point the boot will proceed. + + + + + + +o already-existing file descriptors - device=file descriptor + + + If you set up a file descriptor on the UML command line, you can + attach a UML device to it. This is most commonly used to put the + main console back on stdin and stdout after assigning all the other + consoles to something else: + + + con0=fd:0,fd:1 con=pts + + + + + + + + + +o Nothing - device=null + + + This allows the device to be opened, in contrast to 'none', but + reads will block, and writes will succeed and the data will be + thrown out. + + + + + + +o None - device=none + + + This causes the device to disappear. + + + + You can also specify different input and output channels for a device + by putting a comma between them: + + + ssl3=tty:/dev/tty2,xterm + + + + + will cause serial line 3 to accept input on the host's /dev/tty3 and + display output on an xterm. That's a silly example - the most common + use of this syntax is to reattach the main console to stdin and stdout + as shown above. + + + If you decide to move the main console away from stdin/stdout, the + initial boot output will appear in the terminal that you're running + UML in. However, once the console driver has been officially + initialized, then the boot output will start appearing wherever you + specified that console 0 should be. That device will receive all + subsequent output. + + + + 55..33.. EExxaammpplleess + + There are a number of interesting things you can do with this + capability. + + + First, this is how you get rid of those bleeding console xterms by + attaching them to host ptys: + + + con=pty con0=fd:0,fd:1 + + + + + This will make a UML console take over an unused host virtual console, + so that when you switch to it, you will see the UML login prompt + rather than the host login prompt: + + + con1=tty:/dev/tty6 + + + + + You can attach two virtual machines together with what amounts to a + serial line as follows: + + Run one UML with a serial line attached to a pty - + + + ssl1=pty + + + + + Look at the boot log to see what pty it got (this example will assume + that it got /dev/ptyp1). + + Boot the other UML with a serial line attached to the corresponding + tty - + + + ssl1=tty:/dev/ttyp1 + + + + + Log in, make sure that it has no getty on that serial line, attach a + terminal program like minicom to it, and you should see the login + prompt of the other virtual machine. + + + 66.. SSeettttiinngg uupp tthhee nneettwwoorrkk + + + + This page describes how to set up the various transports and to + provide a UML instance with network access to the host, other machines + on the local net, and the rest of the net. + + + As of 2.4.5, UML networking has been completely redone to make it much + easier to set up, fix bugs, and add new features. + + + There is a new helper, uml_net, which does the host setup that + requires root privileges. + + + There are currently five transport types available for a UML virtual + machine to exchange packets with other hosts: + + +o ethertap + + +o TUN/TAP + + +o Multicast + + +o a switch daemon + + +o slip + + +o slirp + + +o pcap + + The TUN/TAP, ethertap, slip, and slirp transports allow a UML + instance to exchange packets with the host. They may be directed + to the host or the host may just act as a router to provide access + to other physical or virtual machines. + + + The pcap transport is a synthetic read-only interface, using the + libpcap binary to collect packets from interfaces on the host and + filter them. This is useful for building preconfigured traffic + monitors or sniffers. + + + The daemon and multicast transports provide a completely virtual + network to other virtual machines. This network is completely + disconnected from the physical network unless one of the virtual + machines on it is acting as a gateway. + + + With so many host transports, which one should you use? Here's when + you should use each one: + + +o ethertap - if you want access to the host networking and it is + running 2.2 + + +o TUN/TAP - if you want access to the host networking and it is + running 2.4. Also, the TUN/TAP transport is able to use a + preconfigured device, allowing it to avoid using the setuid uml_net + helper, which is a security advantage. + + +o Multicast - if you want a purely virtual network and you don't want + to set up anything but the UML + + +o a switch daemon - if you want a purely virtual network and you + don't mind running the daemon in order to get somewhat better + performance + + +o slip - there is no particular reason to run the slip backend unless + ethertap and TUN/TAP are just not available for some reason + + +o slirp - if you don't have root access on the host to setup + networking, or if you don't want to allocate an IP to your UML + + +o pcap - not much use for actual network connectivity, but great for + monitoring traffic on the host + + Ethertap is available on 2.4 and works fine. TUN/TAP is preferred + to it because it has better performance and ethertap is officially + considered obsolete in 2.4. Also, the root helper only needs to + run occasionally for TUN/TAP, rather than handling every packet, as + it does with ethertap. This is a slight security advantage since + it provides fewer opportunities for a nasty UML user to somehow + exploit the helper's root privileges. + + + 66..11.. GGeenneerraall sseettuupp + + First, you must have the virtual network enabled in your UML. If are + running a prebuilt kernel from this site, everything is already + enabled. If you build the kernel yourself, under the "Network device + support" menu, enable "Network device support", and then the three + transports. + + + The next step is to provide a network device to the virtual machine. + This is done by describing it on the kernel command line. + + The general format is + + + eth = , + + + + + For example, a virtual ethernet device may be attached to a host + ethertap device as follows: + + + eth0=ethertap,tap0,fe:fd:0:0:0:1,192.168.0.254 + + + + + This sets up eth0 inside the virtual machine to attach itself to the + host /dev/tap0, assigns it an ethernet address, and assigns the host + tap0 interface an IP address. + + + + Note that the IP address you assign to the host end of the tap device + must be different than the IP you assign to the eth device inside UML. + If you are short on IPs and don't want to consume two per UML, then + you can reuse the host's eth IP address for the host ends of the tap + devices. Internally, the UMLs must still get unique IPs for their eth + devices. You can also give the UMLs non-routable IPs (192.168.x.x or + 10.x.x.x) and have the host masquerade them. This will let outgoing + connections work, but incoming connections won't without more work, + such as port forwarding from the host. + Also note that when you configure the host side of an interface, it is + only acting as a gateway. It will respond to pings sent to it + locally, but is not useful to do that since it's a host interface. + You are not talking to the UML when you ping that interface and get a + response. + + + You can also add devices to a UML and remove them at runtime. See the + ``The Management Console'' page for details. + + + The sections below describe this in more detail. + + + Once you've decided how you're going to set up the devices, you boot + UML, log in, configure the UML side of the devices, and set up routes + to the outside world. At that point, you will be able to talk to any + other machines, physical or virtual, on the net. + + + If ifconfig inside UML fails and the network refuses to come up, run + tell you what went wrong. + + + + 66..22.. UUsseerrssppaaccee ddaaeemmoonnss + + You will likely need the setuid helper, or the switch daemon, or both. + They are both installed with the RPM and deb, so if you've installed + either, you can skip the rest of this section. + + + If not, then you need to check them out of CVS, build them, and + install them. The helper is uml_net, in CVS /tools/uml_net, and the + daemon is uml_switch, in CVS /tools/uml_router. They are both built + with a plain 'make'. Both need to be installed in a directory that's + in your path - /usr/bin is recommend. On top of that, uml_net needs + to be setuid root. + + + + 66..33.. SSppeecciiffyyiinngg eetthheerrnneett aaddddrreesssseess + + Below, you will see that the TUN/TAP, ethertap, and daemon interfaces + allow you to specify hardware addresses for the virtual ethernet + devices. This is generally not necessary. If you don't have a + specific reason to do it, you probably shouldn't. If one is not + specified on the command line, the driver will assign one based on the + device IP address. It will provide the address fe:fd:nn:nn:nn:nn + where nn.nn.nn.nn is the device IP address. This is nearly always + sufficient to guarantee a unique hardware address for the device. A + couple of exceptions are: + + +o Another set of virtual ethernet devices are on the same network and + they are assigned hardware addresses using a different scheme which + may conflict with the UML IP address-based scheme + + +o You aren't going to use the device for IP networking, so you don't + assign the device an IP address + + If you let the driver provide the hardware address, you should make + sure that the device IP address is known before the interface is + brought up. So, inside UML, this will guarantee that: + + + + UML# + ifconfig eth0 192.168.0.250 up + + + + + If you decide to assign the hardware address yourself, make sure that + the first byte of the address is even. Addresses with an odd first + byte are broadcast addresses, which you don't want assigned to a + device. + + + + 66..44.. UUMMLL iinntteerrffaaccee sseettuupp + + Once the network devices have been described on the command line, you + should boot UML and log in. + + + The first thing to do is bring the interface up: + + + UML# ifconfig ethn ip-address up + + + + + You should be able to ping the host at this point. + + + To reach the rest of the world, you should set a default route to the + host: + + + UML# route add default gw host ip + + + + + Again, with host ip of 192.168.0.4: + + + UML# route add default gw 192.168.0.4 + + + + + This page used to recommend setting a network route to your local net. + This is wrong, because it will cause UML to try to figure out hardware + addresses of the local machines by arping on the interface to the + host. Since that interface is basically a single strand of ethernet + with two nodes on it (UML and the host) and arp requests don't cross + networks, they will fail to elicit any responses. So, what you want + is for UML to just blindly throw all packets at the host and let it + figure out what to do with them, which is what leaving out the network + route and adding the default route does. + + + Note: If you can't communicate with other hosts on your physical + ethernet, it's probably because of a network route that's + automatically set up. If you run 'route -n' and see a route that + looks like this: + + + + + Destination Gateway Genmask Flags Metric Ref Use Iface + 192.168.0.0 0.0.0.0 255.255.255.0 U 0 0 0 eth0 + + + + + with a mask that's not 255.255.255.255, then replace it with a route + to your host: + + + UML# + route del -net 192.168.0.0 dev eth0 netmask 255.255.255.0 + + + + + + + UML# + route add -host 192.168.0.4 dev eth0 + + + + + This, plus the default route to the host, will allow UML to exchange + packets with any machine on your ethernet. + + + + 66..55.. MMuullttiiccaasstt + + The simplest way to set up a virtual network between multiple UMLs is + to use the mcast transport. This was written by Harald Welte and is + present in UML version 2.4.5-5um and later. Your system must have + multicast enabled in the kernel and there must be a multicast-capable + network device on the host. Normally, this is eth0, but if there is + no ethernet card on the host, then you will likely get strange error + messages when you bring the device up inside UML. + + + To use it, run two UMLs with + + + eth0=mcast + + + + + on their command lines. Log in, configure the ethernet device in each + machine with different IP addresses: + + + UML1# ifconfig eth0 192.168.0.254 + + + + + + + UML2# ifconfig eth0 192.168.0.253 + + + + + and they should be able to talk to each other. + + The full set of command line options for this transport are + + + + ethn=mcast,ethernet address,multicast + address,multicast port,ttl + + + + + Harald's original README is here and explains these in detail, as well as + some other issues. + + + + 66..66.. TTUUNN//TTAAPP wwiitthh tthhee uummll__nneett hheellppeerr + + TUN/TAP is the preferred mechanism on 2.4 to exchange packets with the + host. The TUN/TAP backend has been in UML since 2.4.9-3um. + + + The easiest way to get up and running is to let the setuid uml_net + helper do the host setup for you. This involves insmod-ing the tun.o + module if necessary, configuring the device, and setting up IP + forwarding, routing, and proxy arp. If you are new to UML networking, + do this first. If you're concerned about the security implications of + the setuid helper, use it to get up and running, then read the next + section to see how to have UML use a preconfigured tap device, which + avoids the use of uml_net. + + + If you specify an IP address for the host side of the device, the + uml_net helper will do all necessary setup on the host - the only + requirement is that TUN/TAP be available, either built in to the host + kernel or as the tun.o module. + + The format of the command line switch to attach a device to a TUN/TAP + device is + + + eth =tuntap,,, + + + + + For example, this argument will attach the UML's eth0 to the next + available tap device and assign an ethernet address to it based on its + IP address + + + eth0=tuntap,,,192.168.0.254 + + + + + + + Note that the IP address that must be used for the eth device inside + UML is fixed by the routing and proxy arp that is set up on the + TUN/TAP device on the host. You can use a different one, but it won't + work because reply packets won't reach the UML. This is a feature. + It prevents a nasty UML user from doing things like setting the UML IP + to the same as the network's nameserver or mail server. + + + There are a couple potential problems with running the TUN/TAP + transport on a 2.4 host kernel + + +o TUN/TAP seems not to work on 2.4.3 and earlier. Upgrade the host + kernel or use the ethertap transport. + + +o With an upgraded kernel, TUN/TAP may fail with + + + File descriptor in bad state + + + + + This is due to a header mismatch between the upgraded kernel and the + kernel that was originally installed on the machine. The fix is to + make sure that /usr/src/linux points to the headers for the running + kernel. + + These were pointed out by Tim Robinson in + name="this uml- + user post"> . + + + + 66..77.. TTUUNN//TTAAPP wwiitthh aa pprreeccoonnffiigguurreedd ttaapp ddeevviiccee + + If you prefer not to have UML use uml_net (which is somewhat + insecure), with UML 2.4.17-11, you can set up a TUN/TAP device + beforehand. The setup needs to be done as root, but once that's done, + there is no need for root assistance. Setting up the device is done + as follows: + + +o Create the device with tunctl (available from the UML utilities + tarball) + + + + + host# tunctl -u uid + + + + + where uid is the user id or username that UML will be run as. This + will tell you what device was created. + + +o Configure the device IP (change IP addresses and device name to + suit) + + + + + host# ifconfig tap0 192.168.0.254 up + + + + + + +o Set up routing and arping if desired - this is my recipe, there are + other ways of doing the same thing + + + host# + bash -c 'echo 1 > /proc/sys/net/ipv4/ip_forward' + + host# + route add -host 192.168.0.253 dev tap0 + + + + + + + host# + bash -c 'echo 1 > /proc/sys/net/ipv4/conf/tap0/proxy_arp' + + + + + + + host# + arp -Ds 192.168.0.253 eth0 pub + + + + + Note that this must be done every time the host boots - this configu- + ration is not stored across host reboots. So, it's probably a good + idea to stick it in an rc file. An even better idea would be a little + utility which reads the information from a config file and sets up + devices at boot time. + + +o Rather than using up two IPs and ARPing for one of them, you can + also provide direct access to your LAN by the UML by using a + bridge. + + + host# + brctl addbr br0 + + + + + + + host# + ifconfig eth0 0.0.0.0 promisc up + + + + + + + host# + ifconfig tap0 0.0.0.0 promisc up + + + + + + + host# + ifconfig br0 192.168.0.1 netmask 255.255.255.0 up + + + + + + + + host# + brctl stp br0 off + + + + + + + host# + brctl setfd br0 1 + + + + + + + host# + brctl sethello br0 1 + + + + + + + host# + brctl addif br0 eth0 + + + + + + + host# + brctl addif br0 tap0 + + + + + Note that 'br0' should be setup using ifconfig with the existing IP + address of eth0, as eth0 no longer has its own IP. + + +o + + + Also, the /dev/net/tun device must be writable by the user running + UML in order for the UML to use the device that's been configured + for it. The simplest thing to do is + + + host# chmod 666 /dev/net/tun + + + + + Making it world-writable looks bad, but it seems not to be + exploitable as a security hole. However, it does allow anyone to cre- + ate useless tap devices (useless because they can't configure them), + which is a DOS attack. A somewhat more secure alternative would to be + to create a group containing all the users who have preconfigured tap + devices and chgrp /dev/net/tun to that group with mode 664 or 660. + + + +o Once the device is set up, run UML with 'eth0=tuntap,device name' + (i.e. 'eth0=tuntap,tap0') on the command line (or do it with the + mconsole config command). + + +o Bring the eth device up in UML and you're in business. + + If you don't want that tap device any more, you can make it non- + persistent with + + + host# tunctl -d tap device + + + + + Finally, tunctl has a -b (for brief mode) switch which causes it to + output only the name of the tap device it created. This makes it + suitable for capture by a script: + + + host# TAP=`tunctl -u 1000 -b` + + + + + + + 66..88.. EEtthheerrttaapp + + Ethertap is the general mechanism on 2.2 for userspace processes to + exchange packets with the kernel. + + + + To use this transport, you need to describe the virtual network device + on the UML command line. The general format for this is + + + eth =ethertap, , , + + + + + So, the previous example + + + eth0=ethertap,tap0,fe:fd:0:0:0:1,192.168.0.254 + + + + + attaches the UML eth0 device to the host /dev/tap0, assigns it the + ethernet address fe:fd:0:0:0:1, and assigns the IP address + 192.168.0.254 to the tap device. + + + + The tap device is mandatory, but the others are optional. If the + ethernet address is omitted, one will be assigned to it. + + + The presence of the tap IP address will cause the helper to run and do + whatever host setup is needed to allow the virtual machine to + communicate with the outside world. If you're not sure you know what + you're doing, this is the way to go. + + + If it is absent, then you must configure the tap device and whatever + arping and routing you will need on the host. However, even in this + case, the uml_net helper still needs to be in your path and it must be + setuid root if you're not running UML as root. This is because the + tap device doesn't support SIGIO, which UML needs in order to use + something as a source of input. So, the helper is used as a + convenient asynchronous IO thread. + + If you're using the uml_net helper, you can ignore the following host + setup - uml_net will do it for you. You just need to make sure you + have ethertap available, either built in to the host kernel or + available as a module. + + + If you want to set things up yourself, you need to make sure that the + appropriate /dev entry exists. If it doesn't, become root and create + it as follows: + + + mknod /dev/tap c 36 + 16 + + + + + For example, this is how to create /dev/tap0: + + + mknod /dev/tap0 c 36 0 + 16 + + + + + You also need to make sure that the host kernel has ethertap support. + If ethertap is enabled as a module, you apparently need to insmod + ethertap once for each ethertap device you want to enable. So, + + + host# + insmod ethertap + + + + + will give you the tap0 interface. To get the tap1 interface, you need + to run + + + host# + insmod ethertap unit=1 -o ethertap1 + + + + + + + + 66..99.. TThhee sswwiittcchh ddaaeemmoonn + + NNoottee: This is the daemon formerly known as uml_router, but which was + renamed so the network weenies of the world would stop growling at me. + + + The switch daemon, uml_switch, provides a mechanism for creating a + totally virtual network. By default, it provides no connection to the + host network (but see -tap, below). + + + The first thing you need to do is run the daemon. Running it with no + arguments will make it listen on a default pair of unix domain + sockets. + + + If you want it to listen on a different pair of sockets, use + + + -unix control socket data socket + + + + + + If you want it to act as a hub rather than a switch, use + + + -hub + + + + + + If you want the switch to be connected to host networking (allowing + the umls to get access to the outside world through the host), use + + + -tap tap0 + + + + + + Note that the tap device must be preconfigured (see "TUN/TAP with a + preconfigured tap device", above). If you're using a different tap + device than tap0, specify that instead of tap0. + + + uml_switch can be backgrounded as follows + + + host% + uml_switch [ options ] < /dev/null > /dev/null + + + + + The reason it doesn't background by default is that it listens to + stdin for EOF. When it sees that, it exits. + + + The general format of the kernel command line switch is + + + + ethn=daemon,ethernet address,socket + type,control socket,data socket + + + + + You can leave off everything except the 'daemon'. You only need to + specify the ethernet address if the one that will be assigned to it + isn't acceptable for some reason. The rest of the arguments describe + how to communicate with the daemon. You should only specify them if + you told the daemon to use different sockets than the default. So, if + you ran the daemon with no arguments, running the UML on the same + machine with + eth0=daemon + + + + + will cause the eth0 driver to attach itself to the daemon correctly. + + + + 66..1100.. SSlliipp + + Slip is another, less general, mechanism for a process to communicate + with the host networking. In contrast to the ethertap interface, + which exchanges ethernet frames with the host and can be used to + transport any higher-level protocol, it can only be used to transport + IP. + + + The general format of the command line switch is + + + + ethn=slip,slip IP + + + + + The slip IP argument is the IP address that will be assigned to the + host end of the slip device. If it is specified, the helper will run + and will set up the host so that the virtual machine can reach it and + the rest of the network. + + + There are some oddities with this interface that you should be aware + of. You should only specify one slip device on a given virtual + machine, and its name inside UML will be 'umn', not 'eth0' or whatever + you specified on the command line. These problems will be fixed at + some point. + + + + 66..1111.. SSlliirrpp + + slirp uses an external program, usually /usr/bin/slirp, to provide IP + only networking connectivity through the host. This is similar to IP + masquerading with a firewall, although the translation is performed in + user-space, rather than by the kernel. As slirp does not set up any + interfaces on the host, or changes routing, slirp does not require + root access or setuid binaries on the host. + + + The general format of the command line switch for slirp is: + + + + ethn=slirp,ethernet address,slirp path + + + + + The ethernet address is optional, as UML will set up the interface + with an ethernet address based upon the initial IP address of the + interface. The slirp path is generally /usr/bin/slirp, although it + will depend on distribution. + + + The slirp program can have a number of options passed to the command + line and we can't add them to the UML command line, as they will be + parsed incorrectly. Instead, a wrapper shell script can be written or + the options inserted into the /.slirprc file. More information on + all of the slirp options can be found in its man pages. + + + The eth0 interface on UML should be set up with the IP 10.2.0.15, + although you can use anything as long as it is not used by a network + you will be connecting to. The default route on UML should be set to + use + + + UML# + route add default dev eth0 + + + + + slirp provides a number of useful IP addresses which can be used by + UML, such as 10.0.2.3 which is an alias for the DNS server specified + in /etc/resolv.conf on the host or the IP given in the 'dns' option + for slirp. + + + Even with a baudrate setting higher than 115200, the slirp connection + is limited to 115200. If you need it to go faster, the slirp binary + needs to be compiled with FULL_BOLT defined in config.h. + + + + 66..1122.. ppccaapp + + The pcap transport is attached to a UML ethernet device on the command + line or with uml_mconsole with the following syntax: + + + + ethn=pcap,host interface,filter + expression,option1,option2 + + + + + The expression and options are optional. + + + The interface is whatever network device on the host you want to + sniff. The expression is a pcap filter expression, which is also what + tcpdump uses, so if you know how to specify tcpdump filters, you will + use the same expressions here. The options are up to two of + 'promisc', control whether pcap puts the host interface into + promiscuous mode. 'optimize' and 'nooptimize' control whether the pcap + expression optimizer is used. + + + Example: + + + + eth0=pcap,eth0,tcp + + eth1=pcap,eth0,!tcp + + + + will cause the UML eth0 to emit all tcp packets on the host eth0 and + the UML eth1 to emit all non-tcp packets on the host eth0. + + + + 66..1133.. SSeettttiinngg uupp tthhee hhoosstt yyoouurrsseellff + + If you don't specify an address for the host side of the ethertap or + slip device, UML won't do any setup on the host. So this is what is + needed to get things working (the examples use a host-side IP of + 192.168.0.251 and a UML-side IP of 192.168.0.250 - adjust to suit your + own network): + + +o The device needs to be configured with its IP address. Tap devices + are also configured with an mtu of 1484. Slip devices are + configured with a point-to-point address pointing at the UML ip + address. + + + host# ifconfig tap0 arp mtu 1484 192.168.0.251 up + + + + + + + host# + ifconfig sl0 192.168.0.251 pointopoint 192.168.0.250 up + + + + + + +o If a tap device is being set up, a route is set to the UML IP. + + + UML# route add -host 192.168.0.250 gw 192.168.0.251 + + + + + + +o To allow other hosts on your network to see the virtual machine, + proxy arp is set up for it. + + + host# arp -Ds 192.168.0.250 eth0 pub + + + + + + +o Finally, the host is set up to route packets. + + + host# echo 1 > /proc/sys/net/ipv4/ip_forward + + + + + + + + + + + 77.. SShhaarriinngg FFiilleessyysstteemmss bbeettwweeeenn VViirrttuuaall MMaacchhiinneess + + + + + 77..11.. AA wwaarrnniinngg + + Don't attempt to share filesystems simply by booting two UMLs from the + same file. That's the same thing as booting two physical machines + from a shared disk. It will result in filesystem corruption. + + + + 77..22.. UUssiinngg llaayyeerreedd bblloocckk ddeevviicceess + + The way to share a filesystem between two virtual machines is to use + the copy-on-write (COW) layering capability of the ubd block driver. + As of 2.4.6-2um, the driver supports layering a read-write private + device over a read-only shared device. A machine's writes are stored + in the private device, while reads come from either device - the + private one if the requested block is valid in it, the shared one if + not. Using this scheme, the majority of data which is unchanged is + shared between an arbitrary number of virtual machines, each of which + has a much smaller file containing the changes that it has made. With + a large number of UMLs booting from a large root filesystem, this + leads to a huge disk space saving. It will also help performance, + since the host will be able to cache the shared data using a much + smaller amount of memory, so UML disk requests will be served from the + host's memory rather than its disks. + + + + + To add a copy-on-write layer to an existing block device file, simply + add the name of the COW file to the appropriate ubd switch: + + + ubd0=root_fs_cow,root_fs_debian_22 + + + + + where 'root_fs_cow' is the private COW file and 'root_fs_debian_22' is + the existing shared filesystem. The COW file need not exist. If it + doesn't, the driver will create and initialize it. Once the COW file + has been initialized, it can be used on its own on the command line: + + + ubd0=root_fs_cow + + + + + The name of the backing file is stored in the COW file header, so it + would be redundant to continue specifying it on the command line. + + + + 77..33.. NNoottee!! + + When checking the size of the COW file in order to see the gobs of + space that you're saving, make sure you use 'ls -ls' to see the actual + disk consumption rather than the length of the file. The COW file is + sparse, so the length will be very different from the disk usage. + Here is a 'ls -l' of a COW file and backing file from one boot and + shutdown: + host% ls -l cow.debian debian2.2 + -rw-r--r-- 1 jdike jdike 492504064 Aug 6 21:16 cow.debian + -rwxrw-rw- 1 jdike jdike 537919488 Aug 6 20:42 debian2.2 + + + + + Doesn't look like much saved space, does it? Well, here's 'ls -ls': + + + host% ls -ls cow.debian debian2.2 + 880 -rw-r--r-- 1 jdike jdike 492504064 Aug 6 21:16 cow.debian + 525832 -rwxrw-rw- 1 jdike jdike 537919488 Aug 6 20:42 debian2.2 + + + + + Now, you can see that the COW file has less than a meg of disk, rather + than 492 meg. + + + + 77..44.. AAnnootthheerr wwaarrnniinngg + + Once a filesystem is being used as a readonly backing file for a COW + file, do not boot directly from it or modify it in any way. Doing so + will invalidate any COW files that are using it. The mtime and size + of the backing file are stored in the COW file header at its creation, + and they must continue to match. If they don't, the driver will + refuse to use the COW file. + + + + + If you attempt to evade this restriction by changing either the + backing file or the COW header by hand, you will get a corrupted + filesystem. + + + + + Among other things, this means that upgrading the distribution in a + backing file and expecting that all of the COW files using it will see + the upgrade will not work. + + + + + 77..55.. uummll__mmoooo :: MMeerrggiinngg aa CCOOWW ffiillee wwiitthh iittss bbaacckkiinngg ffiillee + + Depending on how you use UML and COW devices, it may be advisable to + merge the changes in the COW file into the backing file every once in + a while. + + + + + The utility that does this is uml_moo. Its usage is + + + host% uml_moo COW file new backing file + + + + + There's no need to specify the backing file since that information is + already in the COW file header. If you're paranoid, boot the new + merged file, and if you're happy with it, move it over the old backing + file. + + + + + uml_moo creates a new backing file by default as a safety measure. It + also has a destructive merge option which will merge the COW file + directly into its current backing file. This is really only usable + when the backing file only has one COW file associated with it. If + there are multiple COWs associated with a backing file, a -d merge of + one of them will invalidate all of the others. However, it is + convenient if you're short of disk space, and it should also be + noticeably faster than a non-destructive merge. + + + + + uml_moo is installed with the UML deb and RPM. If you didn't install + UML from one of those packages, you can also get it from the UML + utilities tar file in tools/moo. + + + + + + + + + 88.. CCrreeaattiinngg ffiilleessyysstteemmss + + + You may want to create and mount new UML filesystems, either because + your root filesystem isn't large enough or because you want to use a + filesystem other than ext2. + + + This was written on the occasion of reiserfs being included in the + 2.4.1 kernel pool, and therefore the 2.4.1 UML, so the examples will + talk about reiserfs. This information is generic, and the examples + should be easy to translate to the filesystem of your choice. + + + 88..11.. CCrreeaattee tthhee ffiilleessyysstteemm ffiillee + + dd is your friend. All you need to do is tell dd to create an empty + file of the appropriate size. I usually make it sparse to save time + and to avoid allocating disk space until it's actually used. For + example, the following command will create a sparse 100 meg file full + of zeroes. + + + host% + dd if=/dev/zero of=new_filesystem seek=100 count=1 bs=1M + + + + + + + 88..22.. AAssssiiggnn tthhee ffiillee ttoo aa UUMMLL ddeevviiccee + + Add an argument like the following to the UML command line: + + ubd4=new_filesystem + + + + + making sure that you use an unassigned ubd device number. + + + + 88..33.. CCrreeaattiinngg aanndd mmoouunnttiinngg tthhee ffiilleessyysstteemm + + Make sure that the filesystem is available, either by being built into + the kernel, or available as a module, then boot up UML and log in. If + the root filesystem doesn't have the filesystem utilities (mkfs, fsck, + etc), then get them into UML by way of the net or hostfs. + + + Make the new filesystem on the device assigned to the new file: + + + host# mkreiserfs /dev/ubd/4 + + + <----------- MKREISERFSv2 -----------> + + ReiserFS version 3.6.25 + Block size 4096 bytes + Block count 25856 + Used blocks 8212 + Journal - 8192 blocks (18-8209), journal header is in block 8210 + Bitmaps: 17 + Root block 8211 + Hash function "r5" + ATTENTION: ALL DATA WILL BE LOST ON '/dev/ubd/4'! (y/n)y + journal size 8192 (from 18) + Initializing journal - 0%....20%....40%....60%....80%....100% + Syncing..done. + + + + + Now, mount it: + + + UML# + mount /dev/ubd/4 /mnt + + + + + and you're in business. + + + + + + + + + + 99.. HHoosstt ffiillee aacccceessss + + + If you want to access files on the host machine from inside UML, you + can treat it as a separate machine and either nfs mount directories + from the host or copy files into the virtual machine with scp or rcp. + However, since UML is running on the host, it can access those + files just like any other process and make them available inside the + virtual machine without needing to use the network. + + + This is now possible with the hostfs virtual filesystem. With it, you + can mount a host directory into the UML filesystem and access the + files contained in it just as you would on the host. + + + 99..11.. UUssiinngg hhoossttffss + + To begin with, make sure that hostfs is available inside the virtual + machine with + + + UML# cat /proc/filesystems + + + + . hostfs should be listed. If it's not, either rebuild the kernel + with hostfs configured into it or make sure that hostfs is built as a + module and available inside the virtual machine, and insmod it. + + + Now all you need to do is run mount: + + + UML# mount none /mnt/host -t hostfs + + + + + will mount the host's / on the virtual machine's /mnt/host. + + + If you don't want to mount the host root directory, then you can + specify a subdirectory to mount with the -o switch to mount: + + + UML# mount none /mnt/home -t hostfs -o /home + + + + + will mount the hosts's /home on the virtual machine's /mnt/home. + + + + 99..22.. hhoossttffss aass tthhee rroooott ffiilleessyysstteemm + + It's possible to boot from a directory hierarchy on the host using + hostfs rather than using the standard filesystem in a file. + + To start, you need that hierarchy. The easiest way is to loop mount + an existing root_fs file: + + + host# mount root_fs uml_root_dir -o loop + + + + + You need to change the filesystem type of / in etc/fstab to be + 'hostfs', so that line looks like this: + + /dev/ubd/0 / hostfs defaults 1 1 + + + + + Then you need to chown to yourself all the files in that directory + that are owned by root. This worked for me: + + + host# find . -uid 0 -exec chown jdike {} \; + + + + + Next, make sure that your UML kernel has hostfs compiled in, not as a + module. Then run UML with the boot device pointing at that directory: + + + ubd0=/path/to/uml/root/directory + + + + + UML should then boot as it does normally. + + + 99..33.. BBuuiillddiinngg hhoossttffss + + If you need to build hostfs because it's not in your kernel, you have + two choices: + + + + +o Compiling hostfs into the kernel: + + + Reconfigure the kernel and set the 'Host filesystem' option under + + + +o Compiling hostfs as a module: + + + Reconfigure the kernel and set the 'Host filesystem' option under + be in arch/um/fs/hostfs/hostfs.o. Install that in + /lib/modules/`uname -r`/fs in the virtual machine, boot it up, and + + + UML# insmod hostfs + + + + + + + + + + + + + 1100.. TThhee MMaannaaggeemmeenntt CCoonnssoollee + + + + The UML management console is a low-level interface to the kernel, + somewhat like the i386 SysRq interface. Since there is a full-blown + operating system under UML, there is much greater flexibility possible + than with the SysRq mechanism. + + + There are a number of things you can do with the mconsole interface: + + +o get the kernel version + + +o add and remove devices + + +o halt or reboot the machine + + +o Send SysRq commands + + +o Pause and resume the UML + + + You need the mconsole client (uml_mconsole) which is present in CVS + (/tools/mconsole) in 2.4.5-9um and later, and will be in the RPM in + 2.4.6. + + + You also need CONFIG_MCONSOLE (under 'General Setup') enabled in UML. + When you boot UML, you'll see a line like: + + + mconsole initialized on /home/jdike/.uml/umlNJ32yL/mconsole + + + + + If you specify a unique machine id one the UML command line, i.e. + + + umid=debian + + + + + you'll see this + + + mconsole initialized on /home/jdike/.uml/debian/mconsole + + + + + That file is the socket that uml_mconsole will use to communicate with + UML. Run it with either the umid or the full path as its argument: + + + host% uml_mconsole debian + + + + + or + + + host% uml_mconsole /home/jdike/.uml/debian/mconsole + + + + + You'll get a prompt, at which you can run one of these commands: + + +o version + + +o halt + + +o reboot + + +o config + + +o remove + + +o sysrq + + +o help + + +o cad + + +o stop + + +o go + + + 1100..11.. vveerrssiioonn + + This takes no arguments. It prints the UML version. + + + (mconsole) version + OK Linux usermode 2.4.5-9um #1 Wed Jun 20 22:47:08 EDT 2001 i686 + + + + + There are a couple actual uses for this. It's a simple no-op which + can be used to check that a UML is running. It's also a way of + sending an interrupt to the UML. This is sometimes useful on SMP + hosts, where there's a bug which causes signals to UML to be lost, + often causing it to appear to hang. Sending such a UML the mconsole + version command is a good way to 'wake it up' before networking has + been enabled, as it does not do anything to the function of the UML. + + + + 1100..22.. hhaalltt aanndd rreebboooott + + These take no arguments. They shut the machine down immediately, with + no syncing of disks and no clean shutdown of userspace. So, they are + pretty close to crashing the machine. + + + (mconsole) halt + OK + + + + + + + 1100..33.. ccoonnffiigg + + "config" adds a new device to the virtual machine. Currently the ubd + and network drivers support this. It takes one argument, which is the + device to add, with the same syntax as the kernel command line. + + + + + (mconsole) + config ubd3=/home/jdike/incoming/roots/root_fs_debian22 + + OK + (mconsole) config eth1=mcast + OK + + + + + + + 1100..44.. rreemmoovvee + + "remove" deletes a device from the system. Its argument is just the + name of the device to be removed. The device must be idle in whatever + sense the driver considers necessary. In the case of the ubd driver, + the removed block device must not be mounted, swapped on, or otherwise + open, and in the case of the network driver, the device must be down. + + + (mconsole) remove ubd3 + OK + (mconsole) remove eth1 + OK + + + + + + + 1100..55.. ssyyssrrqq + + This takes one argument, which is a single letter. It calls the + generic kernel's SysRq driver, which does whatever is called for by + that argument. See the SysRq documentation in Documentation/sysrq.txt + in your favorite kernel tree to see what letters are valid and what + they do. + + + + 1100..66.. hheellpp + + "help" returns a string listing the valid commands and what each one + does. + + + + 1100..77.. ccaadd + + This invokes the Ctl-Alt-Del action on init. What exactly this ends + up doing is up to /etc/inittab. Normally, it reboots the machine. + With UML, this is usually not desired, so if a halt would be better, + then find the section of inittab that looks like this + + + # What to do when CTRL-ALT-DEL is pressed. + ca:12345:ctrlaltdel:/sbin/shutdown -t1 -a -r now + + + + + and change the command to halt. + + + + 1100..88.. ssttoopp + + This puts the UML in a loop reading mconsole requests until a 'go' + mconsole command is received. This is very useful for making backups + of UML filesystems, as the UML can be stopped, then synced via 'sysrq + s', so that everything is written to the filesystem. You can then copy + the filesystem and then send the UML 'go' via mconsole. + + + Note that a UML running with more than one CPU will have problems + after you send the 'stop' command, as only one CPU will be held in a + mconsole loop and all others will continue as normal. This is a bug, + and will be fixed. + + + + 1100..99.. ggoo + + This resumes a UML after being paused by a 'stop' command. Note that + when the UML has resumed, TCP connections may have timed out and if + the UML is paused for a long period of time, crond might go a little + crazy, running all the jobs it didn't do earlier. + + + + + + + + + 1111.. KKeerrnneell ddeebbuuggggiinngg + + + NNoottee:: The interface that makes debugging, as described here, possible + is present in 2.4.0-test6 kernels and later. + + + Since the user-mode kernel runs as a normal Linux process, it is + possible to debug it with gdb almost like any other process. It is + slightly different because the kernel's threads are already being + ptraced for system call interception, so gdb can't ptrace them. + However, a mechanism has been added to work around that problem. + + + In order to debug the kernel, you need build it from source. See + ``Compiling the kernel and modules'' for information on doing that. + Make sure that you enable CONFIG_DEBUGSYM and CONFIG_PT_PROXY during + the config. These will compile the kernel with -g, and enable the + ptrace proxy so that gdb works with UML, respectively. + + + + + 1111..11.. SSttaarrttiinngg tthhee kkeerrnneell uunnddeerr ggddbb + + You can have the kernel running under the control of gdb from the + beginning by putting 'debug' on the command line. You will get an + xterm with gdb running inside it. The kernel will send some commands + to gdb which will leave it stopped at the beginning of start_kernel. + At this point, you can get things going with 'next', 'step', or + 'cont'. + + + There is a transcript of a debugging session here , with breakpoints being set in the scheduler and in an + interrupt handler. + 1111..22.. EExxaammiinniinngg sslleeeeppiinngg pprroocceesssseess + + Not every bug is evident in the currently running process. Sometimes, + processes hang in the kernel when they shouldn't because they've + deadlocked on a semaphore or something similar. In this case, when + you ^C gdb and get a backtrace, you will see the idle thread, which + isn't very relevant. + + + What you want is the stack of whatever process is sleeping when it + shouldn't be. You need to figure out which process that is, which is + generally fairly easy. Then you need to get its host process id, + which you can do either by looking at ps on the host or at + task.thread.extern_pid in gdb. + + + Now what you do is this: + + +o detach from the current thread + + + (UML gdb) det + + + + + + +o attach to the thread you are interested in + + + (UML gdb) att + + + + + + +o look at its stack and anything else of interest + + + (UML gdb) bt + + + + + Note that you can't do anything at this point that requires that a + process execute, e.g. calling a function + + +o when you're done looking at that process, reattach to the current + thread and continue it + + + (UML gdb) + att 1 + + + + + + + (UML gdb) + c + + + + + Here, specifying any pid which is not the process id of a UML thread + will cause gdb to reattach to the current thread. I commonly use 1, + but any other invalid pid would work. + + + + 1111..33.. RRuunnnniinngg dddddd oonn UUMMLL + + ddd works on UML, but requires a special kludge. The process goes + like this: + + +o Start ddd + + + host% ddd linux + + + + + + +o With ps, get the pid of the gdb that ddd started. You can ask the + gdb to tell you, but for some reason that confuses things and + causes a hang. + + +o run UML with 'debug=parent gdb-pid=' added to the command line + - it will just sit there after you hit return + + +o type 'att 1' to the ddd gdb and you will see something like + + + 0xa013dc51 in __kill () + + + (gdb) + + + + + + +o At this point, type 'c', UML will boot up, and you can use ddd just + as you do on any other process. + + + + 1111..44.. DDeebbuuggggiinngg mmoodduulleess + + gdb has support for debugging code which is dynamically loaded into + the process. This support is what is needed to debug kernel modules + under UML. + + + Using that support is somewhat complicated. You have to tell gdb what + object file you just loaded into UML and where in memory it is. Then, + it can read the symbol table, and figure out where all the symbols are + from the load address that you provided. It gets more interesting + when you load the module again (i.e. after an rmmod). You have to + tell gdb to forget about all its symbols, including the main UML ones + for some reason, then load then all back in again. + + + There's an easy way and a hard way to do this. The easy way is to use + the umlgdb expect script written by Chandan Kudige. It basically + automates the process for you. + + + First, you must tell it where your modules are. There is a list in + the script that looks like this: + set MODULE_PATHS { + "fat" "/usr/src/uml/linux-2.4.18/fs/fat/fat.o" + "isofs" "/usr/src/uml/linux-2.4.18/fs/isofs/isofs.o" + "minix" "/usr/src/uml/linux-2.4.18/fs/minix/minix.o" + } + + + + + You change that to list the names and paths of the modules that you + are going to debug. Then you run it from the toplevel directory of + your UML pool and it basically tells you what to do: + + + + + ******** GDB pid is 21903 ******** + Start UML as: ./linux debug gdb-pid=21903 + + + + GNU gdb 5.0rh-5 Red Hat Linux 7.1 + Copyright 2001 Free Software Foundation, Inc. + GDB is free software, covered by the GNU General Public License, and you are + welcome to change it and/or distribute copies of it under certain conditions. + Type "show copying" to see the conditions. + There is absolutely no warranty for GDB. Type "show warranty" for details. + This GDB was configured as "i386-redhat-linux"... + (gdb) b sys_init_module + Breakpoint 1 at 0xa0011923: file module.c, line 349. + (gdb) att 1 + + + + + After you run UML and it sits there doing nothing, you hit return at + the 'att 1' and continue it: + + + Attaching to program: /home/jdike/linux/2.4/um/./linux, process 1 + 0xa00f4221 in __kill () + (UML gdb) c + Continuing. + + + + + At this point, you debug normally. When you insmod something, the + expect magic will kick in and you'll see something like: + + + + + + + + + + + + + + + + + + *** Module hostfs loaded *** + Breakpoint 1, sys_init_module (name_user=0x805abb0 "hostfs", + mod_user=0x8070e00) at module.c:349 + 349 char *name, *n_name, *name_tmp = NULL; + (UML gdb) finish + Run till exit from #0 sys_init_module (name_user=0x805abb0 "hostfs", + mod_user=0x8070e00) at module.c:349 + 0xa00e2e23 in execute_syscall (r=0xa8140284) at syscall_kern.c:411 + 411 else res = EXECUTE_SYSCALL(syscall, regs); + Value returned is $1 = 0 + (UML gdb) + p/x (int)module_list + module_list->size_of_struct + + $2 = 0xa9021054 + (UML gdb) symbol-file ./linux + Load new symbol table from "./linux"? (y or n) y + Reading symbols from ./linux... + done. + (UML gdb) + add-symbol-file /home/jdike/linux/2.4/um/arch/um/fs/hostfs/hostfs.o 0xa9021054 + + add symbol table from file "/home/jdike/linux/2.4/um/arch/um/fs/hostfs/hostfs.o" at + .text_addr = 0xa9021054 + (y or n) y + + Reading symbols from /home/jdike/linux/2.4/um/arch/um/fs/hostfs/hostfs.o... + done. + (UML gdb) p *module_list + $1 = {size_of_struct = 84, next = 0xa0178720, name = 0xa9022de0 "hostfs", + size = 9016, uc = {usecount = {counter = 0}, pad = 0}, flags = 1, + nsyms = 57, ndeps = 0, syms = 0xa9023170, deps = 0x0, refs = 0x0, + init = 0xa90221f0 , cleanup = 0xa902222c , + ex_table_start = 0x0, ex_table_end = 0x0, persist_start = 0x0, + persist_end = 0x0, can_unload = 0, runsize = 0, kallsyms_start = 0x0, + kallsyms_end = 0x0, + archdata_start = 0x1b855
, + archdata_end = 0xe5890000
, + kernel_data = 0xf689c35d
} + >> Finished loading symbols for hostfs ... + + + + + That's the easy way. It's highly recommended. The hard way is + described below in case you're interested in what's going on. + + + Boot the kernel under the debugger and load the module with insmod or + modprobe. With gdb, do: + + + (UML gdb) p module_list + + + + + This is a list of modules that have been loaded into the kernel, with + the most recently loaded module first. Normally, the module you want + is at module_list. If it's not, walk down the next links, looking at + the name fields until find the module you want to debug. Take the + address of that structure, and add module.size_of_struct (which in + 2.4.10 kernels is 96 (0x60)) to it. Gdb can make this hard addition + for you :-): + + + + (UML gdb) + printf "%#x\n", (int)module_list module_list->size_of_struct + + + + + The offset from the module start occasionally changes (before 2.4.0, + it was module.size_of_struct + 4), so it's a good idea to check the + init and cleanup addresses once in a while, as describe below. Now + do: + + + (UML gdb) + add-symbol-file /path/to/module/on/host that_address + + + + + Tell gdb you really want to do it, and you're in business. + + + If there's any doubt that you got the offset right, like breakpoints + appear not to work, or they're appearing in the wrong place, you can + check it by looking at the module structure. The init and cleanup + fields should look like: + + + init = 0x588066b0 , cleanup = 0x588066c0 + + + + + with no offsets on the symbol names. If the names are right, but they + are offset, then the offset tells you how much you need to add to the + address you gave to add-symbol-file. + + + When you want to load in a new version of the module, you need to get + gdb to forget about the old one. The only way I've found to do that + is to tell gdb to forget about all symbols that it knows about: + + + (UML gdb) symbol-file + + + + + Then reload the symbols from the kernel binary: + + + (UML gdb) symbol-file /path/to/kernel + + + + + and repeat the process above. You'll also need to re-enable break- + points. They were disabled when you dumped all the symbols because + gdb couldn't figure out where they should go. + + + + 1111..55.. AAttttaacchhiinngg ggddbb ttoo tthhee kkeerrnneell + + If you don't have the kernel running under gdb, you can attach gdb to + it later by sending the tracing thread a SIGUSR1. The first line of + the console output identifies its pid: + tracing thread pid = 20093 + + + + + When you send it the signal: + + + host% kill -USR1 20093 + + + + + you will get an xterm with gdb running in it. + + + If you have the mconsole compiled into UML, then the mconsole client + can be used to start gdb: + + + (mconsole) (mconsole) config gdb=xterm + + + + + will fire up an xterm with gdb running in it. + + + + 1111..66.. UUssiinngg aalltteerrnnaattee ddeebbuuggggeerrss + + UML has support for attaching to an already running debugger rather + than starting gdb itself. This is present in CVS as of 17 Apr 2001. + I sent it to Alan for inclusion in the ac tree, and it will be in my + 2.4.4 release. + + + This is useful when gdb is a subprocess of some UI, such as emacs or + ddd. It can also be used to run debuggers other than gdb on UML. + Below is an example of using strace as an alternate debugger. + + + To do this, you need to get the pid of the debugger and pass it in + with the + + + If you are using gdb under some UI, then tell it to 'att 1', and + you'll find yourself attached to UML. + + + If you are using something other than gdb as your debugger, then + you'll need to get it to do the equivalent of 'att 1' if it doesn't do + it automatically. + + + An example of an alternate debugger is strace. You can strace the + actual kernel as follows: + + +o Run the following in a shell + + + host% + sh -c 'echo pid=$$; echo -n hit return; read x; exec strace -p 1 -o strace.out' + + + + +o Run UML with 'debug' and 'gdb-pid=' with the pid printed out + by the previous command + + +o Hit return in the shell, and UML will start running, and strace + output will start accumulating in the output file. + + Note that this is different from running + + + host% strace ./linux + + + + + That will strace only the main UML thread, the tracing thread, which + doesn't do any of the actual kernel work. It just oversees the vir- + tual machine. In contrast, using strace as described above will show + you the low-level activity of the virtual machine. + + + + + + 1122.. KKeerrnneell ddeebbuuggggiinngg eexxaammpplleess + + 1122..11.. TThhee ccaassee ooff tthhee hhuunngg ffsscckk + + When booting up the kernel, fsck failed, and dropped me into a shell + to fix things up. I ran fsck -y, which hung: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Setting hostname uml [ OK ] + Checking root filesystem + /dev/fhd0 was not cleanly unmounted, check forced. + Error reading block 86894 (Attempt to read block from filesystem resulted in short read) while reading indirect blocks of inode 19780. + + /dev/fhd0: UNEXPECTED INCONSISTENCY; RUN fsck MANUALLY. + (i.e., without -a or -p options) + [ FAILED ] + + *** An error occurred during the file system check. + *** Dropping you to a shell; the system will reboot + *** when you leave the shell. + Give root password for maintenance + (or type Control-D for normal startup): + + [root@uml /root]# fsck -y /dev/fhd0 + fsck -y /dev/fhd0 + Parallelizing fsck version 1.14 (9-Jan-1999) + e2fsck 1.14, 9-Jan-1999 for EXT2 FS 0.5b, 95/08/09 + /dev/fhd0 contains a file system with errors, check forced. + Pass 1: Checking inodes, blocks, and sizes + Error reading block 86894 (Attempt to read block from filesystem resulted in short read) while reading indirect blocks of inode 19780. Ignore error? yes + + Inode 19780, i_blocks is 1548, should be 540. Fix? yes + + Pass 2: Checking directory structure + Error reading block 49405 (Attempt to read block from filesystem resulted in short read). Ignore error? yes + + Directory inode 11858, block 0, offset 0: directory corrupted + Salvage? yes + + Missing '.' in directory inode 11858. + Fix? yes + + Missing '..' in directory inode 11858. + Fix? yes + + + + + + The standard drill in this sort of situation is to fire up gdb on the + signal thread, which, in this case, was pid 1935. In another window, + I run gdb and attach pid 1935. + + + + + ~/linux/2.3.26/um 1016: gdb linux + GNU gdb 4.17.0.11 with Linux support + Copyright 1998 Free Software Foundation, Inc. + GDB is free software, covered by the GNU General Public License, and you are + welcome to change it and/or distribute copies of it under certain conditions. + Type "show copying" to see the conditions. + There is absolutely no warranty for GDB. Type "show warranty" for details. + This GDB was configured as "i386-redhat-linux"... + + (gdb) att 1935 + Attaching to program `/home/dike/linux/2.3.26/um/linux', Pid 1935 + 0x100756d9 in __wait4 () + + + + + + + Let's see what's currently running: + + + + (gdb) p current_task.pid + $1 = 0 + + + + + + It's the idle thread, which means that fsck went to sleep for some + reason and never woke up. + + + Let's guess that the last process in the process list is fsck: + + + + (gdb) p current_task.prev_task.comm + $13 = "fsck.ext2\000\000\000\000\000\000" + + + + + + It is, so let's see what it thinks it's up to: + + + + (gdb) p current_task.prev_task.thread + $14 = {extern_pid = 1980, tracing = 0, want_tracing = 0, forking = 0, + kernel_stack_page = 0, signal_stack = 1342627840, syscall = {id = 4, args = { + 3, 134973440, 1024, 0, 1024}, have_result = 0, result = 50590720}, + request = {op = 2, u = {exec = {ip = 1350467584, sp = 2952789424}, fork = { + regs = {1350467584, 2952789424, 0 }, sigstack = 0, + pid = 0}, switch_to = 0x507e8000, thread = {proc = 0x507e8000, + arg = 0xaffffdb0, flags = 0, new_pid = 0}, input_request = { + op = 1350467584, fd = -1342177872, proc = 0, pid = 0}}}} + + + + + + The interesting things here are the fact that its .thread.syscall.id + is __NR_write (see the big switch in arch/um/kernel/syscall_kern.c or + the defines in include/asm-um/arch/unistd.h), and that it never + returned. Also, its .request.op is OP_SWITCH (see + arch/um/include/user_util.h). These mean that it went into a write, + and, for some reason, called schedule(). + + + The fact that it never returned from write means that its stack should + be fairly interesting. Its pid is 1980 (.thread.extern_pid). That + process is being ptraced by the signal thread, so it must be detached + before gdb can attach it: + + + + + + + + + + + (gdb) call detach(1980) + + Program received signal SIGSEGV, Segmentation fault. + + The program being debugged stopped while in a function called from GDB. + When the function (detach) is done executing, GDB will silently + stop (instead of continuing to evaluate the expression containing + the function call). + (gdb) call detach(1980) + $15 = 0 + + + + + + The first detach segfaults for some reason, and the second one + succeeds. + + + Now I detach from the signal thread, attach to the fsck thread, and + look at its stack: + + + (gdb) det + Detaching from program: /home/dike/linux/2.3.26/um/linux Pid 1935 + (gdb) att 1980 + Attaching to program `/home/dike/linux/2.3.26/um/linux', Pid 1980 + 0x10070451 in __kill () + (gdb) bt + #0 0x10070451 in __kill () + #1 0x10068ccd in usr1_pid (pid=1980) at process.c:30 + #2 0x1006a03f in _switch_to (prev=0x50072000, next=0x507e8000) + at process_kern.c:156 + #3 0x1006a052 in switch_to (prev=0x50072000, next=0x507e8000, last=0x50072000) + at process_kern.c:161 + #4 0x10001d12 in schedule () at sched.c:777 + #5 0x1006a744 in __down (sem=0x507d241c) at semaphore.c:71 + #6 0x1006aa10 in __down_failed () at semaphore.c:157 + #7 0x1006c5d8 in segv_handler (sc=0x5006e940) at trap_user.c:174 + #8 0x1006c5ec in kern_segv_handler (sig=11) at trap_user.c:182 + #9 + #10 0x10155404 in errno () + #11 0x1006c0aa in segv (address=1342179328, is_write=2) at trap_kern.c:50 + #12 0x1006c5d8 in segv_handler (sc=0x5006eaf8) at trap_user.c:174 + #13 0x1006c5ec in kern_segv_handler (sig=11) at trap_user.c:182 + #14 + #15 0xc0fd in ?? () + #16 0x10016647 in sys_write (fd=3, + buf=0x80b8800
, count=1024) + at read_write.c:159 + #17 0x1006d5b3 in execute_syscall (syscall=4, args=0x5006ef08) + at syscall_kern.c:254 + #18 0x1006af87 in really_do_syscall (sig=12) at syscall_user.c:35 + #19 + #20 0x400dc8b0 in ?? () + + + + + + The interesting things here are : + + +o There are two segfaults on this stack (frames 9 and 14) + + +o The first faulting address (frame 11) is 0x50000800 + + (gdb) p (void *)1342179328 + $16 = (void *) 0x50000800 + + + + + + The initial faulting address is interesting because it is on the idle + thread's stack. I had been seeing the idle thread segfault for no + apparent reason, and the cause looked like stack corruption. In hopes + of catching the culprit in the act, I had turned off all protections + to that stack while the idle thread wasn't running. This apparently + tripped that trap. + + + However, the more immediate problem is that second segfault and I'm + going to concentrate on that. First, I want to see where the fault + happened, so I have to go look at the sigcontent struct in frame 8: + + + + (gdb) up + #1 0x10068ccd in usr1_pid (pid=1980) at process.c:30 + 30 kill(pid, SIGUSR1); + (gdb) + #2 0x1006a03f in _switch_to (prev=0x50072000, next=0x507e8000) + at process_kern.c:156 + 156 usr1_pid(getpid()); + (gdb) + #3 0x1006a052 in switch_to (prev=0x50072000, next=0x507e8000, last=0x50072000) + at process_kern.c:161 + 161 _switch_to(prev, next); + (gdb) + #4 0x10001d12 in schedule () at sched.c:777 + 777 switch_to(prev, next, prev); + (gdb) + #5 0x1006a744 in __down (sem=0x507d241c) at semaphore.c:71 + 71 schedule(); + (gdb) + #6 0x1006aa10 in __down_failed () at semaphore.c:157 + 157 } + (gdb) + #7 0x1006c5d8 in segv_handler (sc=0x5006e940) at trap_user.c:174 + 174 segv(sc->cr2, sc->err & 2); + (gdb) + #8 0x1006c5ec in kern_segv_handler (sig=11) at trap_user.c:182 + 182 segv_handler(sc); + (gdb) p *sc + Cannot access memory at address 0x0. + + + + + That's not very useful, so I'll try a more manual method: + + + (gdb) p *((struct sigcontext *) (&sig + 1)) + $19 = {gs = 0, __gsh = 0, fs = 0, __fsh = 0, es = 43, __esh = 0, ds = 43, + __dsh = 0, edi = 1342179328, esi = 1350378548, ebp = 1342630440, + esp = 1342630420, ebx = 1348150624, edx = 1280, ecx = 0, eax = 0, + trapno = 14, err = 4, eip = 268480945, cs = 35, __csh = 0, eflags = 66118, + esp_at_signal = 1342630420, ss = 43, __ssh = 0, fpstate = 0x0, oldmask = 0, + cr2 = 1280} + + + + The ip is in handle_mm_fault: + + + (gdb) p (void *)268480945 + $20 = (void *) 0x1000b1b1 + (gdb) i sym $20 + handle_mm_fault + 57 in section .text + + + + + + Specifically, it's in pte_alloc: + + + (gdb) i line *$20 + Line 124 of "/home/dike/linux/2.3.26/um/include/asm/pgalloc.h" + starts at address 0x1000b1b1 + and ends at 0x1000b1b7 . + + + + + + To find where in handle_mm_fault this is, I'll jump forward in the + code until I see an address in that procedure: + + + + (gdb) i line *0x1000b1c0 + Line 126 of "/home/dike/linux/2.3.26/um/include/asm/pgalloc.h" + starts at address 0x1000b1b7 + and ends at 0x1000b1c3 . + (gdb) i line *0x1000b1d0 + Line 131 of "/home/dike/linux/2.3.26/um/include/asm/pgalloc.h" + starts at address 0x1000b1d0 + and ends at 0x1000b1da . + (gdb) i line *0x1000b1e0 + Line 61 of "/home/dike/linux/2.3.26/um/include/asm/pgalloc.h" + starts at address 0x1000b1da + and ends at 0x1000b1e1 . + (gdb) i line *0x1000b1f0 + Line 134 of "/home/dike/linux/2.3.26/um/include/asm/pgalloc.h" + starts at address 0x1000b1f0 + and ends at 0x1000b200 . + (gdb) i line *0x1000b200 + Line 135 of "/home/dike/linux/2.3.26/um/include/asm/pgalloc.h" + starts at address 0x1000b200 + and ends at 0x1000b208 . + (gdb) i line *0x1000b210 + Line 139 of "/home/dike/linux/2.3.26/um/include/asm/pgalloc.h" + starts at address 0x1000b210 + and ends at 0x1000b219 . + (gdb) i line *0x1000b220 + Line 1168 of "memory.c" starts at address 0x1000b21e + and ends at 0x1000b222 . + + + + + + Something is apparently wrong with the page tables or vma_structs, so + lets go back to frame 11 and have a look at them: + + + + #11 0x1006c0aa in segv (address=1342179328, is_write=2) at trap_kern.c:50 + 50 handle_mm_fault(current, vma, address, is_write); + (gdb) call pgd_offset_proc(vma->vm_mm, address) + $22 = (pgd_t *) 0x80a548c + + + + + + That's pretty bogus. Page tables aren't supposed to be in process + text or data areas. Let's see what's in the vma: + + + (gdb) p *vma + $23 = {vm_mm = 0x507d2434, vm_start = 0, vm_end = 134512640, + vm_next = 0x80a4f8c, vm_page_prot = {pgprot = 0}, vm_flags = 31200, + vm_avl_height = 2058, vm_avl_left = 0x80a8c94, vm_avl_right = 0x80d1000, + vm_next_share = 0xaffffdb0, vm_pprev_share = 0xaffffe63, + vm_ops = 0xaffffe7a, vm_pgoff = 2952789626, vm_file = 0xafffffec, + vm_private_data = 0x62} + (gdb) p *vma.vm_mm + $24 = {mmap = 0x507d2434, mmap_avl = 0x0, mmap_cache = 0x8048000, + pgd = 0x80a4f8c, mm_users = {counter = 0}, mm_count = {counter = 134904288}, + map_count = 134909076, mmap_sem = {count = {counter = 135073792}, + sleepers = -1342177872, wait = {lock = , + task_list = {next = 0xaffffe63, prev = 0xaffffe7a}, + __magic = -1342177670, __creator = -1342177300}, __magic = 98}, + page_table_lock = {}, context = 138, start_code = 0, end_code = 0, + start_data = 0, end_data = 0, start_brk = 0, brk = 0, start_stack = 0, + arg_start = 0, arg_end = 0, env_start = 0, env_end = 0, rss = 1350381536, + total_vm = 0, locked_vm = 0, def_flags = 0, cpu_vm_mask = 0, swap_cnt = 0, + swap_address = 0, segments = 0x0} + + + + + + This also pretty bogus. With all of the 0x80xxxxx and 0xaffffxxx + addresses, this is looking like a stack was plonked down on top of + these structures. Maybe it's a stack overflow from the next page: + + + + (gdb) p vma + $25 = (struct vm_area_struct *) 0x507d2434 + + + + + + That's towards the lower quarter of the page, so that would have to + have been pretty heavy stack overflow: + + + + + + + + + + + + + + + (gdb) x/100x $25 + 0x507d2434: 0x507d2434 0x00000000 0x08048000 0x080a4f8c + 0x507d2444: 0x00000000 0x080a79e0 0x080a8c94 0x080d1000 + 0x507d2454: 0xaffffdb0 0xaffffe63 0xaffffe7a 0xaffffe7a + 0x507d2464: 0xafffffec 0x00000062 0x0000008a 0x00000000 + 0x507d2474: 0x00000000 0x00000000 0x00000000 0x00000000 + 0x507d2484: 0x00000000 0x00000000 0x00000000 0x00000000 + 0x507d2494: 0x00000000 0x00000000 0x507d2fe0 0x00000000 + 0x507d24a4: 0x00000000 0x00000000 0x00000000 0x00000000 + 0x507d24b4: 0x00000000 0x00000000 0x00000000 0x00000000 + 0x507d24c4: 0x00000000 0x00000000 0x00000000 0x00000000 + 0x507d24d4: 0x00000000 0x00000000 0x00000000 0x00000000 + 0x507d24e4: 0x00000000 0x00000000 0x00000000 0x00000000 + 0x507d24f4: 0x00000000 0x00000000 0x00000000 0x00000000 + 0x507d2504: 0x00000000 0x00000000 0x00000000 0x00000000 + 0x507d2514: 0x00000000 0x00000000 0x00000000 0x00000000 + 0x507d2524: 0x00000000 0x00000000 0x00000000 0x00000000 + 0x507d2534: 0x00000000 0x00000000 0x507d25dc 0x00000000 + 0x507d2544: 0x00000000 0x00000000 0x00000000 0x00000000 + 0x507d2554: 0x00000000 0x00000000 0x00000000 0x00000000 + 0x507d2564: 0x00000000 0x00000000 0x00000000 0x00000000 + 0x507d2574: 0x00000000 0x00000000 0x00000000 0x00000000 + 0x507d2584: 0x00000000 0x00000000 0x00000000 0x00000000 + 0x507d2594: 0x00000000 0x00000000 0x00000000 0x00000000 + 0x507d25a4: 0x00000000 0x00000000 0x00000000 0x00000000 + 0x507d25b4: 0x00000000 0x00000000 0x00000000 0x00000000 + + + + + + It's not stack overflow. The only "stack-like" piece of this data is + the vma_struct itself. + + + At this point, I don't see any avenues to pursue, so I just have to + admit that I have no idea what's going on. What I will do, though, is + stick a trap on the segfault handler which will stop if it sees any + writes to the idle thread's stack. That was the thing that happened + first, and it may be that if I can catch it immediately, what's going + on will be somewhat clearer. + + + 1122..22.. EEppiissooddee 22:: TThhee ccaassee ooff tthhee hhuunngg ffsscckk + + After setting a trap in the SEGV handler for accesses to the signal + thread's stack, I reran the kernel. + + + fsck hung again, this time by hitting the trap: + + + + + + + + + + + + + + + + + Setting hostname uml [ OK ] + Checking root filesystem + /dev/fhd0 contains a file system with errors, check forced. + Error reading block 86894 (Attempt to read block from filesystem resulted in short read) while reading indirect blocks of inode 19780. + + /dev/fhd0: UNEXPECTED INCONSISTENCY; RUN fsck MANUALLY. + (i.e., without -a or -p options) + [ FAILED ] + + *** An error occurred during the file system check. + *** Dropping you to a shell; the system will reboot + *** when you leave the shell. + Give root password for maintenance + (or type Control-D for normal startup): + + [root@uml /root]# fsck -y /dev/fhd0 + fsck -y /dev/fhd0 + Parallelizing fsck version 1.14 (9-Jan-1999) + e2fsck 1.14, 9-Jan-1999 for EXT2 FS 0.5b, 95/08/09 + /dev/fhd0 contains a file system with errors, check forced. + Pass 1: Checking inodes, blocks, and sizes + Error reading block 86894 (Attempt to read block from filesystem resulted in short read) while reading indirect blocks of inode 19780. Ignore error? yes + + Pass 2: Checking directory structure + Error reading block 49405 (Attempt to read block from filesystem resulted in short read). Ignore error? yes + + Directory inode 11858, block 0, offset 0: directory corrupted + Salvage? yes + + Missing '.' in directory inode 11858. + Fix? yes + + Missing '..' in directory inode 11858. + Fix? yes + + Untested (4127) [100fe44c]: trap_kern.c line 31 + + + + + + I need to get the signal thread to detach from pid 4127 so that I can + attach to it with gdb. This is done by sending it a SIGUSR1, which is + caught by the signal thread, which detaches the process: + + + kill -USR1 4127 + + + + + + Now I can run gdb on it: + + + + + + + + + + + + + + ~/linux/2.3.26/um 1034: gdb linux + GNU gdb 4.17.0.11 with Linux support + Copyright 1998 Free Software Foundation, Inc. + GDB is free software, covered by the GNU General Public License, and you are + welcome to change it and/or distribute copies of it under certain conditions. + Type "show copying" to see the conditions. + There is absolutely no warranty for GDB. Type "show warranty" for details. + This GDB was configured as "i386-redhat-linux"... + (gdb) att 4127 + Attaching to program `/home/dike/linux/2.3.26/um/linux', Pid 4127 + 0x10075891 in __libc_nanosleep () + + + + + + The backtrace shows that it was in a write and that the fault address + (address in frame 3) is 0x50000800, which is right in the middle of + the signal thread's stack page: + + + (gdb) bt + #0 0x10075891 in __libc_nanosleep () + #1 0x1007584d in __sleep (seconds=1000000) + at ../sysdeps/unix/sysv/linux/sleep.c:78 + #2 0x1006ce9a in stop () at user_util.c:191 + #3 0x1006bf88 in segv (address=1342179328, is_write=2) at trap_kern.c:31 + #4 0x1006c628 in segv_handler (sc=0x5006eaf8) at trap_user.c:174 + #5 0x1006c63c in kern_segv_handler (sig=11) at trap_user.c:182 + #6 + #7 0xc0fd in ?? () + #8 0x10016647 in sys_write (fd=3, buf=0x80b8800 "R.", count=1024) + at read_write.c:159 + #9 0x1006d603 in execute_syscall (syscall=4, args=0x5006ef08) + at syscall_kern.c:254 + #10 0x1006af87 in really_do_syscall (sig=12) at syscall_user.c:35 + #11 + #12 0x400dc8b0 in ?? () + #13 + #14 0x400dc8b0 in ?? () + #15 0x80545fd in ?? () + #16 0x804daae in ?? () + #17 0x8054334 in ?? () + #18 0x804d23e in ?? () + #19 0x8049632 in ?? () + #20 0x80491d2 in ?? () + #21 0x80596b5 in ?? () + (gdb) p (void *)1342179328 + $3 = (void *) 0x50000800 + + + + + + Going up the stack to the segv_handler frame and looking at where in + the code the access happened shows that it happened near line 110 of + block_dev.c: + + + + + + + + + + (gdb) up + #1 0x1007584d in __sleep (seconds=1000000) + at ../sysdeps/unix/sysv/linux/sleep.c:78 + ../sysdeps/unix/sysv/linux/sleep.c:78: No such file or directory. + (gdb) + #2 0x1006ce9a in stop () at user_util.c:191 + 191 while(1) sleep(1000000); + (gdb) + #3 0x1006bf88 in segv (address=1342179328, is_write=2) at trap_kern.c:31 + 31 KERN_UNTESTED(); + (gdb) + #4 0x1006c628 in segv_handler (sc=0x5006eaf8) at trap_user.c:174 + 174 segv(sc->cr2, sc->err & 2); + (gdb) p *sc + $1 = {gs = 0, __gsh = 0, fs = 0, __fsh = 0, es = 43, __esh = 0, ds = 43, + __dsh = 0, edi = 1342179328, esi = 134973440, ebp = 1342631484, + esp = 1342630864, ebx = 256, edx = 0, ecx = 256, eax = 1024, trapno = 14, + err = 6, eip = 268550834, cs = 35, __csh = 0, eflags = 66070, + esp_at_signal = 1342630864, ss = 43, __ssh = 0, fpstate = 0x0, oldmask = 0, + cr2 = 1342179328} + (gdb) p (void *)268550834 + $2 = (void *) 0x1001c2b2 + (gdb) i sym $2 + block_write + 1090 in section .text + (gdb) i line *$2 + Line 209 of "/home/dike/linux/2.3.26/um/include/asm/arch/string.h" + starts at address 0x1001c2a1 + and ends at 0x1001c2bf . + (gdb) i line *0x1001c2c0 + Line 110 of "block_dev.c" starts at address 0x1001c2bf + and ends at 0x1001c2e3 . + + + + + + Looking at the source shows that the fault happened during a call to + copy_to_user to copy the data into the kernel: + + + 107 count -= chars; + 108 copy_from_user(p,buf,chars); + 109 p += chars; + 110 buf += chars; + + + + + + p is the pointer which must contain 0x50000800, since buf contains + 0x80b8800 (frame 8 above). It is defined as: + + + p = offset + bh->b_data; + + + + + + I need to figure out what bh is, and it just so happens that bh is + passed as an argument to mark_buffer_uptodate and mark_buffer_dirty a + few lines later, so I do a little disassembly: + + + + + (gdb) disas 0x1001c2bf 0x1001c2e0 + Dump of assembler code from 0x1001c2bf to 0x1001c2d0: + 0x1001c2bf : addl %eax,0xc(%ebp) + 0x1001c2c2 : movl 0xfffffdd4(%ebp),%edx + 0x1001c2c8 : btsl $0x0,0x18(%edx) + 0x1001c2cd : btsl $0x1,0x18(%edx) + 0x1001c2d2 : sbbl %ecx,%ecx + 0x1001c2d4 : testl %ecx,%ecx + 0x1001c2d6 : jne 0x1001c2e3 + 0x1001c2d8 : pushl $0x0 + 0x1001c2da : pushl %edx + 0x1001c2db : call 0x1001819c <__mark_buffer_dirty> + End of assembler dump. + + + + + + At that point, bh is in %edx (address 0x1001c2da), which is calculated + at 0x1001c2c2 as %ebp + 0xfffffdd4, so I figure exactly what that is, + taking %ebp from the sigcontext_struct above: + + + (gdb) p (void *)1342631484 + $5 = (void *) 0x5006ee3c + (gdb) p 0x5006ee3c+0xfffffdd4 + $6 = 1342630928 + (gdb) p (void *)$6 + $7 = (void *) 0x5006ec10 + (gdb) p *((void **)$7) + $8 = (void *) 0x50100200 + + + + + + Now, I look at the structure to see what's in it, and particularly, + what its b_data field contains: + + + (gdb) p *((struct buffer_head *)0x50100200) + $13 = {b_next = 0x50289380, b_blocknr = 49405, b_size = 1024, b_list = 0, + b_dev = 15872, b_count = {counter = 1}, b_rdev = 15872, b_state = 24, + b_flushtime = 0, b_next_free = 0x501001a0, b_prev_free = 0x50100260, + b_this_page = 0x501001a0, b_reqnext = 0x0, b_pprev = 0x507fcf58, + b_data = 0x50000800 "", b_page = 0x50004000, + b_end_io = 0x10017f60 , b_dev_id = 0x0, + b_rsector = 98810, b_wait = {lock = , + task_list = {next = 0x50100248, prev = 0x50100248}, __magic = 1343226448, + __creator = 0}, b_kiobuf = 0x0} + + + + + + The b_data field is indeed 0x50000800, so the question becomes how + that happened. The rest of the structure looks fine, so this probably + is not a case of data corruption. It happened on purpose somehow. + + + The b_page field is a pointer to the page_struct representing the + 0x50000000 page. Looking at it shows the kernel's idea of the state + of that page: + + + + (gdb) p *$13.b_page + $17 = {list = {next = 0x50004a5c, prev = 0x100c5174}, mapping = 0x0, + index = 0, next_hash = 0x0, count = {counter = 1}, flags = 132, lru = { + next = 0x50008460, prev = 0x50019350}, wait = { + lock = , task_list = {next = 0x50004024, + prev = 0x50004024}, __magic = 1342193708, __creator = 0}, + pprev_hash = 0x0, buffers = 0x501002c0, virtual = 1342177280, + zone = 0x100c5160} + + + + + + Some sanity-checking: the virtual field shows the "virtual" address of + this page, which in this kernel is the same as its "physical" address, + and the page_struct itself should be mem_map[0], since it represents + the first page of memory: + + + + (gdb) p (void *)1342177280 + $18 = (void *) 0x50000000 + (gdb) p mem_map + $19 = (mem_map_t *) 0x50004000 + + + + + + These check out fine. + + + Now to check out the page_struct itself. In particular, the flags + field shows whether the page is considered free or not: + + + (gdb) p (void *)132 + $21 = (void *) 0x84 + + + + + + The "reserved" bit is the high bit, which is definitely not set, so + the kernel considers the signal stack page to be free and available to + be used. + + + At this point, I jump to conclusions and start looking at my early + boot code, because that's where that page is supposed to be reserved. + + + In my setup_arch procedure, I have the following code which looks just + fine: + + + + bootmap_size = init_bootmem(start_pfn, end_pfn - start_pfn); + free_bootmem(__pa(low_physmem) + bootmap_size, high_physmem - low_physmem); + + + + + + Two stack pages have already been allocated, and low_physmem points to + the third page, which is the beginning of free memory. + The init_bootmem call declares the entire memory to the boot memory + manager, which marks it all reserved. The free_bootmem call frees up + all of it, except for the first two pages. This looks correct to me. + + + So, I decide to see init_bootmem run and make sure that it is marking + those first two pages as reserved. I never get that far. + + + Stepping into init_bootmem, and looking at bootmem_map before looking + at what it contains shows the following: + + + + (gdb) p bootmem_map + $3 = (void *) 0x50000000 + + + + + + Aha! The light dawns. That first page is doing double duty as a + stack and as the boot memory map. The last thing that the boot memory + manager does is to free the pages used by its memory map, so this page + is getting freed even its marked as reserved. + + + The fix was to initialize the boot memory manager before allocating + those two stack pages, and then allocate them through the boot memory + manager. After doing this, and fixing a couple of subsequent buglets, + the stack corruption problem disappeared. + + + + + + 1133.. WWhhaatt ttoo ddoo wwhheenn UUMMLL ddooeessnn''tt wwoorrkk + + + + + 1133..11.. SSttrraannggee ccoommppiillaattiioonn eerrrroorrss wwhheenn yyoouu bbuuiilldd ffrroomm ssoouurrccee + + As of test11, it is necessary to have "ARCH=um" in the environment or + on the make command line for all steps in building UML, including + clean, distclean, or mrproper, config, menuconfig, or xconfig, dep, + and linux. If you forget for any of them, the i386 build seems to + contaminate the UML build. If this happens, start from scratch with + + + host% + make mrproper ARCH=um + + + + + and repeat the build process with ARCH=um on all the steps. + + + See ``Compiling the kernel and modules'' for more details. + + + Another cause of strange compilation errors is building UML in + /usr/src/linux. If you do this, the first thing you need to do is + clean up the mess you made. The /usr/src/linux/asm link will now + point to /usr/src/linux/asm-um. Make it point back to + /usr/src/linux/asm-i386. Then, move your UML pool someplace else and + build it there. Also see below, where a more specific set of symptoms + is described. + + + + 1133..33.. AA vvaarriieettyy ooff ppaanniiccss aanndd hhaannggss wwiitthh //ttmmpp oonn aa rreeiisseerrffss ffiilleessyyss-- + tteemm + + I saw this on reiserfs 3.5.21 and it seems to be fixed in 3.5.27. + Panics preceded by + + + Detaching pid nnnn + + + + are diagnostic of this problem. This is a reiserfs bug which causes a + thread to occasionally read stale data from a mmapped page shared with + another thread. The fix is to upgrade the filesystem or to have /tmp + be an ext2 filesystem. + + + + 1133..44.. TThhee ccoommppiillee ffaaiillss wwiitthh eerrrroorrss aabboouutt ccoonnfflliiccttiinngg ttyyppeess ffoorr + ''ooppeenn'',, ''dduupp'',, aanndd ''wwaaiittppiidd'' + + This happens when you build in /usr/src/linux. The UML build makes + the include/asm link point to include/asm-um. /usr/include/asm points + to /usr/src/linux/include/asm, so when that link gets moved, files + which need to include the asm-i386 versions of headers get the + incompatible asm-um versions. The fix is to move the include/asm link + back to include/asm-i386 and to do UML builds someplace else. + + + + 1133..55.. UUMMLL ddooeessnn''tt wwoorrkk wwhheenn //ttmmpp iiss aann NNFFSS ffiilleessyysstteemm + + This seems to be a similar situation with the ReiserFS problem above. + Some versions of NFS seems not to handle mmap correctly, which UML + depends on. The workaround is have /tmp be a non-NFS directory. + + + 1133..66.. UUMMLL hhaannggss oonn bboooott wwhheenn ccoommppiilleedd wwiitthh ggpprrooff ssuuppppoorrtt + + If you build UML with gprof support and, early in the boot, it does + this + + + kernel BUG at page_alloc.c:100! + + + + + you have a buggy gcc. You can work around the problem by removing + UM_FASTCALL from CFLAGS in arch/um/Makefile-i386. This will open up + another bug, but that one is fairly hard to reproduce. + + + + 1133..77.. ssyyssllooggdd ddiieess wwiitthh aa SSIIGGTTEERRMM oonn ssttaarrttuupp + + The exact boot error depends on the distribution that you're booting, + but Debian produces this: + + + /etc/rc2.d/S10sysklogd: line 49: 93 Terminated + start-stop-daemon --start --quiet --exec /sbin/syslogd -- $SYSLOGD + + + + + This is a syslogd bug. There's a race between a parent process + installing a signal handler and its child sending the signal. See + this uml-devel post for the details. + + + + 1133..88.. TTUUNN//TTAAPP nneettwwoorrkkiinngg ddooeessnn''tt wwoorrkk oonn aa 22..44 hhoosstt + + There are a couple of problems which were + name="pointed + out"> by Tim Robinson + + +o It doesn't work on hosts running 2.4.7 (or thereabouts) or earlier. + The fix is to upgrade to something more recent and then read the + next item. + + +o If you see + + + File descriptor in bad state + + + + when you bring up the device inside UML, you have a header mismatch + between the original kernel and the upgraded one. Make /usr/src/linux + point at the new headers. This will only be a problem if you build + uml_net yourself. + + + + 1133..99.. YYoouu ccaann nneettwwoorrkk ttoo tthhee hhoosstt bbuutt nnoott ttoo ootthheerr mmaacchhiinneess oonn tthhee + nneett + + If you can connect to the host, and the host can connect to UML, but + you cannot connect to any other machines, then you may need to enable + IP Masquerading on the host. Usually this is only experienced when + using private IP addresses (192.168.x.x or 10.x.x.x) for host/UML + networking, rather than the public address space that your host is + connected to. UML does not enable IP Masquerading, so you will need + to create a static rule to enable it: + + + host% + iptables -t nat -A POSTROUTING -o eth0 -j MASQUERADE + + + + + Replace eth0 with the interface that you use to talk to the rest of + the world. + + + Documentation on IP Masquerading, and SNAT, can be found at + www.netfilter.org . + + + If you can reach the local net, but not the outside Internet, then + that is usually a routing problem. The UML needs a default route: + + + UML# + route add default gw gateway IP + + + + + The gateway IP can be any machine on the local net that knows how to + reach the outside world. Usually, this is the host or the local net- + work's gateway. + + + Occasionally, we hear from someone who can reach some machines, but + not others on the same net, or who can reach some ports on other + machines, but not others. These are usually caused by strange + firewalling somewhere between the UML and the other box. You track + this down by running tcpdump on every interface the packets travel + over and see where they disappear. When you find a machine that takes + the packets in, but does not send them onward, that's the culprit. + + + + 1133..1100.. II hhaavvee nnoo rroooott aanndd II wwaanntt ttoo ssccrreeaamm + + Thanks to Birgit Wahlich for telling me about this strange one. It + turns out that there's a limit of six environment variables on the + kernel command line. When that limit is reached or exceeded, argument + processing stops, which means that the 'root=' argument that UML + usually adds is not seen. So, the filesystem has no idea what the + root device is, so it panics. + + + The fix is to put less stuff on the command line. Glomming all your + setup variables into one is probably the best way to go. + + + + 1133..1111.. UUMMLL bbuuiilldd ccoonnfflliicctt bbeettwweeeenn ppttrraaccee..hh aanndd uuccoonntteexxtt..hh + + On some older systems, /usr/include/asm/ptrace.h and + /usr/include/sys/ucontext.h define the same names. So, when they're + included together, the defines from one completely mess up the parsing + of the other, producing errors like: + /usr/include/sys/ucontext.h:47: parse error before + `10' + + + + + plus a pile of warnings. + + + This is a libc botch, which has since been fixed, and I don't see any + way around it besides upgrading. + + + + 1133..1122.. TThhee UUMMLL BBooggooMMiippss iiss eexxaaccttllyy hhaallff tthhee hhoosstt''ss BBooggooMMiippss + + On i386 kernels, there are two ways of running the loop that is used + to calculate the BogoMips rating, using the TSC if it's there or using + a one-instruction loop. The TSC produces twice the BogoMips as the + loop. UML uses the loop, since it has nothing resembling a TSC, and + will get almost exactly the same BogoMips as a host using the loop. + However, on a host with a TSC, its BogoMips will be double the loop + BogoMips, and therefore double the UML BogoMips. + + + + 1133..1133.. WWhheenn yyoouu rruunn UUMMLL,, iitt iimmmmeeddiiaatteellyy sseeggffaauullttss + + If the host is configured with the 2G/2G address space split, that's + why. See ``UML on 2G/2G hosts'' for the details on getting UML to + run on your host. + + + + 1133..1144.. xxtteerrmmss aappppeeaarr,, tthheenn iimmmmeeddiiaatteellyy ddiissaappppeeaarr + + If you're running an up to date kernel with an old release of + uml_utilities, the port-helper program will not work properly, so + xterms will exit straight after they appear. The solution is to + upgrade to the latest release of uml_utilities. Usually this problem + occurs when you have installed a packaged release of UML then compiled + your own development kernel without upgrading the uml_utilities from + the source distribution. + + + + 1133..1155.. AAnnyy ootthheerr ppaanniicc,, hhaanngg,, oorr ssttrraannggee bbeehhaavviioorr + + If you're seeing truly strange behavior, such as hangs or panics that + happen in random places, or you try running the debugger to see what's + happening and it acts strangely, then it could be a problem in the + host kernel. If you're not running a stock Linus or -ac kernel, then + try that. An early version of the preemption patch and a 2.4.10 SuSE + kernel have caused very strange problems in UML. + + + Otherwise, let me know about it. Send a message to one of the UML + mailing lists - either the developer list - user-mode-linux-devel at + lists dot sourceforge dot net (subscription info) or the user list - + user-mode-linux-user at lists dot sourceforge do net (subscription + info), whichever you prefer. Don't assume that everyone knows about + it and that a fix is imminent. + + + If you want to be super-helpful, read ``Diagnosing Problems'' and + follow the instructions contained therein. + 1144.. DDiiaaggnnoossiinngg PPrroobblleemmss + + + If you get UML to crash, hang, or otherwise misbehave, you should + report this on one of the project mailing lists, either the developer + list - user-mode-linux-devel at lists dot sourceforge dot net + (subscription info) or the user list - user-mode-linux-user at lists + dot sourceforge dot net (subscription info). When you do, it is + likely that I will want more information. So, it would be helpful to + read the stuff below, do whatever is applicable in your case, and + report the results to the list. + + + For any diagnosis, you're going to need to build a debugging kernel. + The binaries from this site aren't debuggable. If you haven't done + this before, read about ``Compiling the kernel and modules'' and + ``Kernel debugging'' UML first. + + + 1144..11.. CCaassee 11 :: NNoorrmmaall kkeerrnneell ppaanniiccss + + The most common case is for a normal thread to panic. To debug this, + you will need to run it under the debugger (add 'debug' to the command + line). An xterm will start up with gdb running inside it. Continue + it when it stops in start_kernel and make it crash. Now ^C gdb and + + + If the panic was a "Kernel mode fault", then there will be a segv + frame on the stack and I'm going to want some more information. The + stack might look something like this: + + + (UML gdb) backtrace + #0 0x1009bf76 in __sigprocmask (how=1, set=0x5f347940, oset=0x0) + at ../sysdeps/unix/sysv/linux/sigprocmask.c:49 + #1 0x10091411 in change_sig (signal=10, on=1) at process.c:218 + #2 0x10094785 in timer_handler (sig=26) at time_kern.c:32 + #3 0x1009bf38 in __restore () + at ../sysdeps/unix/sysv/linux/i386/sigaction.c:125 + #4 0x1009534c in segv (address=8, ip=268849158, is_write=2, is_user=0) + at trap_kern.c:66 + #5 0x10095c04 in segv_handler (sig=11) at trap_user.c:285 + #6 0x1009bf38 in __restore () + + + + + I'm going to want to see the symbol and line information for the value + of ip in the segv frame. In this case, you would do the following: + + + (UML gdb) i sym 268849158 + + + + + and + + + (UML gdb) i line *268849158 + + + + + The reason for this is the __restore frame right above the segv_han- + dler frame is hiding the frame that actually segfaulted. So, I have + to get that information from the faulting ip. + + + 1144..22.. CCaassee 22 :: TTrraacciinngg tthhrreeaadd ppaanniiccss + + The less common and more painful case is when the tracing thread + panics. In this case, the kernel debugger will be useless because it + needs a healthy tracing thread in order to work. The first thing to + do is get a backtrace from the tracing thread. This is done by + figuring out what its pid is, firing up gdb, and attaching it to that + pid. You can figure out the tracing thread pid by looking at the + first line of the console output, which will look like this: + + + tracing thread pid = 15851 + + + + + or by running ps on the host and finding the line that looks like + this: + + + jdike 15851 4.5 0.4 132568 1104 pts/0 S 21:34 0:05 ./linux [(tracing thread)] + + + + + If the panic was 'segfault in signals', then follow the instructions + above for collecting information about the location of the seg fault. + + + If the tracing thread flaked out all by itself, then send that + backtrace in and wait for our crack debugging team to fix the problem. + + + 1144..33.. CCaassee 33 :: TTrraacciinngg tthhrreeaadd ppaanniiccss ccaauusseedd bbyy ootthheerr tthhrreeaaddss + + However, there are cases where the misbehavior of another thread + caused the problem. The most common panic of this type is: + + + wait_for_stop failed to wait for to stop with + + + + + In this case, you'll need to get a backtrace from the process men- + tioned in the panic, which is complicated by the fact that the kernel + debugger is defunct and without some fancy footwork, another gdb can't + attach to it. So, this is how the fancy footwork goes: + + In a shell: + + + host% kill -STOP pid + + + + + Run gdb on the tracing thread as described in case 2 and do: + + + (host gdb) call detach(pid) + + + If you get a segfault, do it again. It always works the second time. + + Detach from the tracing thread and attach to that other thread: + + + (host gdb) detach + + + + + + + (host gdb) attach pid + + + + + If gdb hangs when attaching to that process, go back to a shell and + do: + + + host% + kill -CONT pid + + + + + And then get the backtrace: + + + (host gdb) backtrace + + + + + + 1144..44.. CCaassee 44 :: HHaannggss + + Hangs seem to be fairly rare, but they sometimes happen. When a hang + happens, we need a backtrace from the offending process. Run the + kernel debugger as described in case 1 and get a backtrace. If the + current process is not the idle thread, then send in the backtrace. + You can tell that it's the idle thread if the stack looks like this: + + + #0 0x100b1401 in __libc_nanosleep () + #1 0x100a2885 in idle_sleep (secs=10) at time.c:122 + #2 0x100a546f in do_idle () at process_kern.c:445 + #3 0x100a5508 in cpu_idle () at process_kern.c:471 + #4 0x100ec18f in start_kernel () at init/main.c:592 + #5 0x100a3e10 in start_kernel_proc (unused=0x0) at um_arch.c:71 + #6 0x100a383f in signal_tramp (arg=0x100a3dd8) at trap_user.c:50 + + + + + If this is the case, then some other process is at fault, and went to + sleep when it shouldn't have. Run ps on the host and figure out which + process should not have gone to sleep and stayed asleep. Then attach + to it with gdb and get a backtrace as described in case 3. + + + + + + + 1155.. TThhaannkkss + + + A number of people have helped this project in various ways, and this + page gives recognition where recognition is due. + + + If you're listed here and you would prefer a real link on your name, + or no link at all, instead of the despammed email address pseudo-link, + let me know. + + + If you're not listed here and you think maybe you should be, please + let me know that as well. I try to get everyone, but sometimes my + bookkeeping lapses and I forget about contributions. + + + 1155..11.. CCooddee aanndd DDooccuummeennttaattiioonn + + Rusty Russell - + + +o wrote the HOWTO + + +o prodded me into making this project official and putting it on + SourceForge + + +o came up with the way cool UML logo + + +o redid the config process + + + Peter Moulder - Fixed my config and build + processes, and added some useful code to the block driver + + + Bill Stearns - + + +o HOWTO updates + + +o lots of bug reports + + +o lots of testing + + +o dedicated a box (uml.ists.dartmouth.edu) to support UML development + + +o wrote the mkrootfs script, which allows bootable filesystems of + RPM-based distributions to be cranked out + + +o cranked out a large number of filesystems with said script + + + Jim Leu - Wrote the virtual ethernet driver + and associated usermode tools + + Lars Brinkhoff - Contributed the ptrace + proxy from his own project to allow easier + kernel debugging + + + Andrea Arcangeli - Redid some of the early boot + code so that it would work on machines with Large File Support + + + Chris Emerson - Did + the first UML port to Linux/ppc + + + Harald Welte - Wrote the multicast + transport for the network driver + + + Jorgen Cederlof - Added special file support to hostfs + + + Greg Lonnon - Changed the ubd driver + to allow it to layer a COW file on a shared read-only filesystem and + wrote the iomem emulation support + + + Henrik Nordstrom - Provided a variety + of patches, fixes, and clues + + + Lennert Buytenhek - Contributed various patches, a rewrite of the + network driver, the first implementation of the mconsole driver, and + did the bulk of the work needed to get SMP working again. + + + Yon Uriarte - Fixed the TUN/TAP network backend while I slept. + + + Adam Heath - Made a bunch of nice cleanups to the initialization code, + plus various other small patches. + + + Matt Zimmerman - Matt volunteered to be the UML Debian maintainer and + is doing a real nice job of it. He also noticed and fixed a number of + actually and potentially exploitable security holes in uml_net. Plus + the occasional patch. I like patches. + + + James McMechan - James seems to have taken over maintenance of the ubd + driver and is doing a nice job of it. + + + Chandan Kudige - wrote the umlgdb script which automates the reloading + of module symbols. + + + Steve Schmidtke - wrote the UML slirp transport and hostaudio drivers, + enabling UML processes to access audio devices on the host. He also + submitted patches for the slip transport and lots of other things. + + + David Coulson - + + +o Set up the usermodelinux.org site, + which is a great way of keeping the UML user community on top of + UML goings-on. + + +o Site documentation and updates + + +o Nifty little UML management daemon UMLd + + + +o Lots of testing and bug reports + + + + + 1155..22.. FFlluusshhiinngg oouutt bbuuggss + + + + +o Yuri Pudgorodsky + + +o Gerald Britton + + +o Ian Wehrman + + +o Gord Lamb + + +o Eugene Koontz + + +o John H. Hartman + + +o Anders Karlsson + + +o Daniel Phillips + + +o John Fremlin + + +o Rainer Burgstaller + + +o James Stevenson + + +o Matt Clay + + +o Cliff Jefferies + + +o Geoff Hoff + + +o Lennert Buytenhek + + +o Al Viro + + +o Frank Klingenhoefer + + +o Livio Baldini Soares + + +o Jon Burgess + + +o Petru Paler + + +o Paul + + +o Chris Reahard + + +o Sverker Nilsson + + +o Gong Su + + +o johan verrept + + +o Bjorn Eriksson + + +o Lorenzo Allegrucci + + +o Muli Ben-Yehuda + + +o David Mansfield + + +o Howard Goff + + +o Mike Anderson + + +o John Byrne + + +o Sapan J. Batia + + +o Iris Huang + + +o Jan Hudec + + +o Voluspa + + + + + 1155..33.. BBuugglleettss aanndd cclleeaann--uuppss + + + + +o Dave Zarzycki + + +o Adam Lazur + + +o Boria Feigin + + +o Brian J. Murrell + + +o JS + + +o Roman Zippel + + +o Wil Cooley + + +o Ayelet Shemesh + + +o Will Dyson + + +o Sverker Nilsson + + +o dvorak + + +o v.naga srinivas + + +o Shlomi Fish + + +o Roger Binns + + +o johan verrept + + +o MrChuoi + + +o Peter Cleve + + +o Vincent Guffens + + +o Nathan Scott + + +o Patrick Caulfield + + +o jbearce + + +o Catalin Marinas + + +o Shane Spencer + + +o Zou Min + + + +o Ryan Boder + + +o Lorenzo Colitti + + +o Gwendal Grignou + + +o Andre' Breiler + + +o Tsutomu Yasuda + + + + 1155..44.. CCaassee SSttuuddiieess + + + +o Jon Wright + + +o William McEwan + + +o Michael Richardson + + + + 1155..55.. OOtthheerr ccoonnttrriibbuuttiioonnss + + + Bill Carr made the Red Hat mkrootfs script + work with RH 6.2. + + Michael Jennings sent in some material which + is now gracing the top of the index page of this site. + + SGI (and more specifically Ralf Baechle ) gave me an account on oss.sgi.com + . The bandwidth there made it possible to + produce most of the filesystems available on the project download + page. + + Laurent Bonnaud took the old grotty + Debian filesystem that I've been distributing and updated it to 2.2. + It is now available by itself here. + + Rik van Riel gave me some ftp space on ftp.nl.linux.org so I can make + releases even when Sourceforge is broken. + + Rodrigo de Castro looked at my broken pte code and told me what was + wrong with it, letting me fix a long-standing (several weeks) and + serious set of bugs. + + Chris Reahard built a specialized root filesystem for running a DNS + server jailed inside UML. It's available from the download + page in the Jail + Filesystems section. + + + + + + + + + + + + -- cgit v1.1 From 570aa13a5dbea9b905b4cd6315aa6e1b3a9fd2f8 Mon Sep 17 00:00:00 2001 From: Rob Landley Date: Fri, 6 May 2011 09:23:23 -0700 Subject: Add a 00-INDEX file to Documentation/virtual Remove uml from the top level 00-INDEX file. Signed-off-by: Rob Landley Signed-off-by: Randy Dunlap --- Documentation/00-INDEX | 2 -- Documentation/virtual/00-INDEX | 10 ++++++++++ 2 files changed, 10 insertions(+), 2 deletions(-) create mode 100644 Documentation/virtual/00-INDEX diff --git a/Documentation/00-INDEX b/Documentation/00-INDEX index c17cd4b..1b777b9 100644 --- a/Documentation/00-INDEX +++ b/Documentation/00-INDEX @@ -328,8 +328,6 @@ sysrq.txt - info on the magic SysRq key. telephony/ - directory with info on telephony (e.g. voice over IP) support. -uml/ - - directory with information about User Mode Linux. unicode.txt - info on the Unicode character/font mapping used in Linux. unshare.txt diff --git a/Documentation/virtual/00-INDEX b/Documentation/virtual/00-INDEX new file mode 100644 index 0000000..fe0251c --- /dev/null +++ b/Documentation/virtual/00-INDEX @@ -0,0 +1,10 @@ +Virtualization support in the Linux kernel. + +00-INDEX + - this file. +kvm/ + - Kernel Virtual Machine. See also http://linux-kvm.org +lguest/ + - Extremely simple hypervisor for experimental/educational use. +uml/ + - User Mode Linux, builds/runs Linux kernel as a userspace program. -- cgit v1.1 From 61516587513c84ac26e68e3ab008dc6e965d0378 Mon Sep 17 00:00:00 2001 From: Rob Landley Date: Fri, 6 May 2011 09:27:36 -0700 Subject: Correct occurrences of - Documentation/kvm/ to Documentation/virtual/kvm - Documentation/uml/ to Documentation/virtual/uml - Documentation/lguest/ to Documentation/virtual/lguest throughout the kernel source tree. Signed-off-by: Rob Landley Signed-off-by: Randy Dunlap --- Documentation/virtual/kvm/review-checklist.txt | 2 +- Documentation/virtual/lguest/lguest.txt | 3 ++- MAINTAINERS | 4 ++-- arch/x86/lguest/boot.c | 2 +- drivers/lguest/Kconfig | 6 ++++-- drivers/lguest/Makefile | 2 +- drivers/vhost/vhost.c | 2 +- 7 files changed, 12 insertions(+), 9 deletions(-) diff --git a/Documentation/virtual/kvm/review-checklist.txt b/Documentation/virtual/kvm/review-checklist.txt index 730475a..a850986 100644 --- a/Documentation/virtual/kvm/review-checklist.txt +++ b/Documentation/virtual/kvm/review-checklist.txt @@ -7,7 +7,7 @@ Review checklist for kvm patches 2. Patches should be against kvm.git master branch. 3. If the patch introduces or modifies a new userspace API: - - the API must be documented in Documentation/kvm/api.txt + - the API must be documented in Documentation/virtual/kvm/api.txt - the API must be discoverable using KVM_CHECK_EXTENSION 4. New state must include support for save/restore. diff --git a/Documentation/virtual/lguest/lguest.txt b/Documentation/virtual/lguest/lguest.txt index dad9997..bff0c55 100644 --- a/Documentation/virtual/lguest/lguest.txt +++ b/Documentation/virtual/lguest/lguest.txt @@ -74,7 +74,8 @@ Running Lguest: - Run an lguest as root: - Documentation/lguest/lguest 64 vmlinux --tunnet=192.168.19.1 --block=rootfile root=/dev/vda + Documentation/virtual/lguest/lguest 64 vmlinux --tunnet=192.168.19.1 \ + --block=rootfile root=/dev/vda Explanation: 64: the amount of memory to use, in MB. diff --git a/MAINTAINERS b/MAINTAINERS index 16a5c5f..aa9bbd1 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3807,7 +3807,7 @@ M: Rusty Russell L: lguest@lists.ozlabs.org W: http://lguest.ozlabs.org/ S: Odd Fixes -F: Documentation/lguest/ +F: Documentation/virtual/lguest/ F: arch/x86/lguest/ F: drivers/lguest/ F: include/linux/lguest*.h @@ -6618,7 +6618,7 @@ L: user-mode-linux-devel@lists.sourceforge.net L: user-mode-linux-user@lists.sourceforge.net W: http://user-mode-linux.sourceforge.net S: Maintained -F: Documentation/uml/ +F: Documentation/virtual/uml/ F: arch/um/ F: fs/hostfs/ F: fs/hppfs/ diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 1cd6089..395bf01 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -7,7 +7,7 @@ * kernel and insert a module (lg.ko) which allows us to run other Linux * kernels the same way we'd run processes. We call the first kernel the Host, * and the others the Guests. The program which sets up and configures Guests - * (such as the example in Documentation/lguest/lguest.c) is called the + * (such as the example in Documentation/virtual/lguest/lguest.c) is called the * Launcher. * * Secondly, we only run specially modified Guests, not normal kernels: setting diff --git a/drivers/lguest/Kconfig b/drivers/lguest/Kconfig index 0aaa059..34ae49d 100644 --- a/drivers/lguest/Kconfig +++ b/drivers/lguest/Kconfig @@ -5,8 +5,10 @@ config LGUEST ---help--- This is a very simple module which allows you to run multiple instances of the same Linux kernel, using the - "lguest" command found in the Documentation/lguest directory. + "lguest" command found in the Documentation/virtual/lguest + directory. + Note that "lguest" is pronounced to rhyme with "fell quest", - not "rustyvisor". See Documentation/lguest/lguest.txt. + not "rustyvisor". See Documentation/virtual/lguest/lguest.txt. If unsure, say N. If curious, say M. If masochistic, say Y. diff --git a/drivers/lguest/Makefile b/drivers/lguest/Makefile index 7d463c2..8ac947c 100644 --- a/drivers/lguest/Makefile +++ b/drivers/lguest/Makefile @@ -18,7 +18,7 @@ Mastery: PREFIX=M Beer: @for f in Preparation Guest Drivers Launcher Host Switcher Mastery; do echo "{==- $$f -==}"; make -s $$f; done; echo "{==-==}" Preparation Preparation! Guest Drivers Launcher Host Switcher Mastery: - @sh ../../Documentation/lguest/extract $(PREFIX) `find ../../* -name '*.[chS]' -wholename '*lguest*'` + @sh ../../Documentation/virtual/lguest/extract $(PREFIX) `find ../../* -name '*.[chS]' -wholename '*lguest*'` Puppy: @clear @printf " __ \n (___()'\`;\n /, /\`\n \\\\\\\"--\\\\\\ \n" diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 2ab2912..7aa4eea 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -4,7 +4,7 @@ * Author: Michael S. Tsirkin * * Inspiration, some code, and most witty comments come from - * Documentation/lguest/lguest.c, by Rusty Russell + * Documentation/virtual/lguest/lguest.c, by Rusty Russell * * This work is licensed under the terms of the GNU GPL, version 2. * -- cgit v1.1 From a294865978b701e4d0d90135672749531b9a900d Mon Sep 17 00:00:00 2001 From: Dan Rosenberg Date: Fri, 6 May 2011 03:27:18 +0000 Subject: dccp: handle invalid feature options length A length of zero (after subtracting two for the type and len fields) for the DCCPO_{CHANGE,CONFIRM}_{L,R} options will cause an underflow due to the subtraction. The subsequent code may read past the end of the options value buffer when parsing. I'm unsure of what the consequences of this might be, but it's probably not good. Signed-off-by: Dan Rosenberg Cc: stable@kernel.org Acked-by: Gerrit Renker Signed-off-by: David S. Miller --- net/dccp/options.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/dccp/options.c b/net/dccp/options.c index f06ffcf..4b2ab65 100644 --- a/net/dccp/options.c +++ b/net/dccp/options.c @@ -123,6 +123,8 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq, case DCCPO_CHANGE_L ... DCCPO_CONFIRM_R: if (pkt_type == DCCP_PKT_DATA) /* RFC 4340, 6 */ break; + if (len == 0) + goto out_invalid_option; rc = dccp_feat_parse_options(sk, dreq, mandatory, opt, *value, value + 1, len - 1); if (rc) -- cgit v1.1 From e328d410826d52e9ee348aff9064c4a207f2adb1 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Fri, 6 May 2011 08:32:53 +0000 Subject: vmxnet3: Consistently disable irqs when taking adapter->cmd_lock Using the vmxnet3 driver produces a lockdep warning because vmxnet3_set_mc(), which is called with mc->mca_lock held, takes adapter->cmd_lock. However, there are a couple of places where adapter->cmd_lock is taken with softirqs enabled, lockdep warns that a softirq that tries to take mc->mca_lock could happen while adapter->cmd_lock is held, leading to an AB-BA deadlock. I'm not sure if this is a real potential deadlock or not, but the simplest and best fix seems to be simply to make sure we take cmd_lock with spin_lock_irqsave() everywhere -- the places with plain spin_lock just look like oversights. The full enormous lockdep warning is: ========================================================= [ INFO: possible irq lock inversion dependency detected ] 2.6.39-rc6+ #1 --------------------------------------------------------- ifconfig/567 just changed the state of lock: (&(&mc->mca_lock)->rlock){+.-...}, at: [] mld_ifc_timer_expire+0xff/0x280 but this lock took another, SOFTIRQ-unsafe lock in the past: (&(&adapter->cmd_lock)->rlock){+.+...} and interrupts could create inverse lock ordering between them. other info that might help us debug this: 4 locks held by ifconfig/567: #0: (rtnl_mutex){+.+.+.}, at: [] rtnl_lock+0x17/0x20 #1: ((inetaddr_chain).rwsem){.+.+.+}, at: [] __blocking_notifier_call_chain+0x5f/0xb0 #2: (&idev->mc_ifc_timer){+.-...}, at: [] run_timer_softirq+0xeb/0x3f0 #3: (&ndev->lock){++.-..}, at: [] mld_ifc_timer_expire+0x32/0x280 the shortest dependencies between 2nd lock and 1st lock: -> (&(&adapter->cmd_lock)->rlock){+.+...} ops: 11 { HARDIRQ-ON-W at: [] __lock_acquire+0x7f6/0x1e10 [] lock_acquire+0x9d/0x130 [] _raw_spin_lock+0x36/0x70 [] vmxnet3_alloc_intr_resources+0x22/0x230 [vmxnet3] [] vmxnet3_probe_device+0x5f6/0x15c5 [vmxnet3] [] local_pci_probe+0x5f/0xd0 [] pci_device_probe+0x119/0x120 [] driver_probe_device+0x96/0x1c0 [] __driver_attach+0xab/0xb0 [] bus_for_each_dev+0x5e/0x90 [] driver_attach+0x1e/0x20 [] bus_add_driver+0xc8/0x290 [] driver_register+0x76/0x140 [] __pci_register_driver+0x66/0xe0 [] serio_raw_poll+0x3a/0x60 [serio_raw] [] do_one_initcall+0x45/0x190 [] sys_init_module+0xfb/0x250 [] system_call_fastpath+0x16/0x1b SOFTIRQ-ON-W at: [] __lock_acquire+0x827/0x1e10 [] lock_acquire+0x9d/0x130 [] _raw_spin_lock+0x36/0x70 [] vmxnet3_alloc_intr_resources+0x22/0x230 [vmxnet3] [] vmxnet3_probe_device+0x5f6/0x15c5 [vmxnet3] [] local_pci_probe+0x5f/0xd0 [] pci_device_probe+0x119/0x120 [] driver_probe_device+0x96/0x1c0 [] __driver_attach+0xab/0xb0 [] bus_for_each_dev+0x5e/0x90 [] driver_attach+0x1e/0x20 [] bus_add_driver+0xc8/0x290 [] driver_register+0x76/0x140 [] __pci_register_driver+0x66/0xe0 [] serio_raw_poll+0x3a/0x60 [serio_raw] [] do_one_initcall+0x45/0x190 [] sys_init_module+0xfb/0x250 [] system_call_fastpath+0x16/0x1b INITIAL USE at: [] __lock_acquire+0x459/0x1e10 [] lock_acquire+0x9d/0x130 [] _raw_spin_lock+0x36/0x70 [] vmxnet3_alloc_intr_resources+0x22/0x230 [vmxnet3] [] vmxnet3_probe_device+0x5f6/0x15c5 [vmxnet3] [] local_pci_probe+0x5f/0xd0 [] pci_device_probe+0x119/0x120 [] driver_probe_device+0x96/0x1c0 [] __driver_attach+0xab/0xb0 [] bus_for_each_dev+0x5e/0x90 [] driver_attach+0x1e/0x20 [] bus_add_driver+0xc8/0x290 [] driver_register+0x76/0x140 [] __pci_register_driver+0x66/0xe0 [] serio_raw_poll+0x3a/0x60 [serio_raw] [] do_one_initcall+0x45/0x190 [] sys_init_module+0xfb/0x250 [] system_call_fastpath+0x16/0x1b } ... key at: [] __key.42516+0x0/0xffffffffffffda70 [vmxnet3] ... acquired at: [] lock_acquire+0x9d/0x130 [] _raw_spin_lock_irqsave+0x55/0xa0 [] vmxnet3_set_mc+0x97/0x1a0 [vmxnet3] [] __dev_set_rx_mode+0x40/0xb0 [] dev_set_rx_mode+0x30/0x50 [] __dev_open+0xc7/0x100 [] __dev_change_flags+0xa1/0x180 [] dev_change_flags+0x28/0x70 [] devinet_ioctl+0x730/0x800 [] inet_ioctl+0x88/0xa0 [] sock_do_ioctl+0x30/0x70 [] sock_ioctl+0x79/0x2f0 [] do_vfs_ioctl+0x98/0x570 [] sys_ioctl+0x91/0xa0 [] system_call_fastpath+0x16/0x1b -> (_xmit_ETHER){+.....} ops: 6 { HARDIRQ-ON-W at: [] __lock_acquire+0x7f6/0x1e10 [] lock_acquire+0x9d/0x130 [] _raw_spin_lock_bh+0x3b/0x70 [] __dev_mc_add+0x38/0x90 [] dev_mc_add+0x10/0x20 [] igmp6_group_added+0x10e/0x1b0 [] ipv6_dev_mc_inc+0x2cd/0x430 [] ipv6_add_dev+0x357/0x450 [] addrconf_notify+0x2f7/0xb10 [] notifier_call_chain+0x8c/0xc0 [] raw_notifier_call_chain+0x16/0x20 [] call_netdevice_notifiers+0x37/0x70 [] register_netdevice+0x244/0x2d0 [] register_netdev+0x3f/0x60 [] vmxnet3_probe_device+0x760/0x15c5 [vmxnet3] [] local_pci_probe+0x5f/0xd0 [] pci_device_probe+0x119/0x120 [] driver_probe_device+0x96/0x1c0 [] __driver_attach+0xab/0xb0 [] bus_for_each_dev+0x5e/0x90 [] driver_attach+0x1e/0x20 [] bus_add_driver+0xc8/0x290 [] driver_register+0x76/0x140 [] __pci_register_driver+0x66/0xe0 [] serio_raw_poll+0x3a/0x60 [serio_raw] [] do_one_initcall+0x45/0x190 [] sys_init_module+0xfb/0x250 [] system_call_fastpath+0x16/0x1b INITIAL USE at: [] __lock_acquire+0x459/0x1e10 [] lock_acquire+0x9d/0x130 [] _raw_spin_lock_bh+0x3b/0x70 [] __dev_mc_add+0x38/0x90 [] dev_mc_add+0x10/0x20 [] igmp6_group_added+0x10e/0x1b0 [] ipv6_dev_mc_inc+0x2cd/0x430 [] ipv6_add_dev+0x357/0x450 [] addrconf_notify+0x2f7/0xb10 [] notifier_call_chain+0x8c/0xc0 [] raw_notifier_call_chain+0x16/0x20 [] call_netdevice_notifiers+0x37/0x70 [] register_netdevice+0x244/0x2d0 [] register_netdev+0x3f/0x60 [] vmxnet3_probe_device+0x760/0x15c5 [vmxnet3] [] local_pci_probe+0x5f/0xd0 [] pci_device_probe+0x119/0x120 [] driver_probe_device+0x96/0x1c0 [] __driver_attach+0xab/0xb0 [] bus_for_each_dev+0x5e/0x90 [] driver_attach+0x1e/0x20 [] bus_add_driver+0xc8/0x290 [] driver_register+0x76/0x140 [] __pci_register_driver+0x66/0xe0 [] serio_raw_poll+0x3a/0x60 [serio_raw] [] do_one_initcall+0x45/0x190 [] sys_init_module+0xfb/0x250 [] system_call_fastpath+0x16/0x1b } ... key at: [] netdev_addr_lock_key+0x8/0x1e0 ... acquired at: [] lock_acquire+0x9d/0x130 [] _raw_spin_lock_bh+0x3b/0x70 [] __dev_mc_add+0x38/0x90 [] dev_mc_add+0x10/0x20 [] igmp6_group_added+0x10e/0x1b0 [] ipv6_dev_mc_inc+0x2cd/0x430 [] ipv6_add_dev+0x357/0x450 [] addrconf_notify+0x2f7/0xb10 [] notifier_call_chain+0x8c/0xc0 [] raw_notifier_call_chain+0x16/0x20 [] call_netdevice_notifiers+0x37/0x70 [] register_netdevice+0x244/0x2d0 [] register_netdev+0x3f/0x60 [] vmxnet3_probe_device+0x760/0x15c5 [vmxnet3] [] local_pci_probe+0x5f/0xd0 [] pci_device_probe+0x119/0x120 [] driver_probe_device+0x96/0x1c0 [] __driver_attach+0xab/0xb0 [] bus_for_each_dev+0x5e/0x90 [] driver_attach+0x1e/0x20 [] bus_add_driver+0xc8/0x290 [] driver_register+0x76/0x140 [] __pci_register_driver+0x66/0xe0 [] serio_raw_poll+0x3a/0x60 [serio_raw] [] do_one_initcall+0x45/0x190 [] sys_init_module+0xfb/0x250 [] system_call_fastpath+0x16/0x1b -> (&(&mc->mca_lock)->rlock){+.-...} ops: 6 { HARDIRQ-ON-W at: [] __lock_acquire+0x7f6/0x1e10 [] lock_acquire+0x9d/0x130 [] _raw_spin_lock_bh+0x3b/0x70 [] igmp6_group_added+0x45/0x1b0 [] ipv6_dev_mc_inc+0x2cd/0x430 [] ipv6_add_dev+0x357/0x450 [] addrconf_init+0x4e/0x183 [] inet6_init+0x191/0x2a6 [] do_one_initcall+0x45/0x190 [] kernel_init+0xe3/0x168 [] kernel_thread_helper+0x4/0x10 IN-SOFTIRQ-W at: [] __lock_acquire+0x7ce/0x1e10 [] lock_acquire+0x9d/0x130 [] _raw_spin_lock_bh+0x3b/0x70 [] mld_ifc_timer_expire+0xff/0x280 [] run_timer_softirq+0x179/0x3f0 [] __do_softirq+0xc0/0x210 [] call_softirq+0x1c/0x30 [] do_softirq+0xad/0xe0 [] irq_exit+0x9e/0xb0 [] smp_apic_timer_interrupt+0x70/0x9b [] apic_timer_interrupt+0x13/0x20 [] rt_do_flush+0x87/0x2a0 [] rt_cache_flush+0x46/0x60 [] fib_disable_ip+0x40/0x60 [] fib_inetaddr_event+0xd7/0xe0 [] notifier_call_chain+0x8c/0xc0 [] __blocking_notifier_call_chain+0x78/0xb0 [] blocking_notifier_call_chain+0x16/0x20 [] __inet_del_ifa+0xf1/0x2e0 [] inet_del_ifa+0x13/0x20 [] devinet_ioctl+0x501/0x800 [] inet_ioctl+0x88/0xa0 [] sock_do_ioctl+0x30/0x70 [] sock_ioctl+0x79/0x2f0 [] do_vfs_ioctl+0x98/0x570 [] sys_ioctl+0x91/0xa0 [] system_call_fastpath+0x16/0x1b INITIAL USE at: [] __lock_acquire+0x459/0x1e10 [] lock_acquire+0x9d/0x130 [] _raw_spin_lock_bh+0x3b/0x70 [] igmp6_group_added+0x45/0x1b0 [] ipv6_dev_mc_inc+0x2cd/0x430 [] ipv6_add_dev+0x357/0x450 [] addrconf_init+0x4e/0x183 [] inet6_init+0x191/0x2a6 [] do_one_initcall+0x45/0x190 [] kernel_init+0xe3/0x168 [] kernel_thread_helper+0x4/0x10 } ... key at: [] __key.40877+0x0/0x8 ... acquired at: [] check_usage_forwards+0x9c/0x110 [] mark_lock+0x19c/0x400 [] __lock_acquire+0x7ce/0x1e10 [] lock_acquire+0x9d/0x130 [] _raw_spin_lock_bh+0x3b/0x70 [] mld_ifc_timer_expire+0xff/0x280 [] run_timer_softirq+0x179/0x3f0 [] __do_softirq+0xc0/0x210 [] call_softirq+0x1c/0x30 [] do_softirq+0xad/0xe0 [] irq_exit+0x9e/0xb0 [] smp_apic_timer_interrupt+0x70/0x9b [] apic_timer_interrupt+0x13/0x20 [] rt_do_flush+0x87/0x2a0 [] rt_cache_flush+0x46/0x60 [] fib_disable_ip+0x40/0x60 [] fib_inetaddr_event+0xd7/0xe0 [] notifier_call_chain+0x8c/0xc0 [] __blocking_notifier_call_chain+0x78/0xb0 [] blocking_notifier_call_chain+0x16/0x20 [] __inet_del_ifa+0xf1/0x2e0 [] inet_del_ifa+0x13/0x20 [] devinet_ioctl+0x501/0x800 [] inet_ioctl+0x88/0xa0 [] sock_do_ioctl+0x30/0x70 [] sock_ioctl+0x79/0x2f0 [] do_vfs_ioctl+0x98/0x570 [] sys_ioctl+0x91/0xa0 [] system_call_fastpath+0x16/0x1b stack backtrace: Pid: 567, comm: ifconfig Not tainted 2.6.39-rc6+ #1 Call Trace: [] print_irq_inversion_bug+0x146/0x170 [] ? print_irq_inversion_bug+0x170/0x170 [] check_usage_forwards+0x9c/0x110 [] mark_lock+0x19c/0x400 [] __lock_acquire+0x7ce/0x1e10 [] ? mark_lock+0x1f3/0x400 [] ? __lock_acquire+0xf07/0x1e10 [] ? native_sched_clock+0x15/0x70 [] lock_acquire+0x9d/0x130 [] ? mld_ifc_timer_expire+0xff/0x280 [] ? lock_release_holdtime+0x3d/0x1a0 [] _raw_spin_lock_bh+0x3b/0x70 [] ? mld_ifc_timer_expire+0xff/0x280 [] ? _raw_spin_unlock+0x2b/0x40 [] mld_ifc_timer_expire+0xff/0x280 [] run_timer_softirq+0x179/0x3f0 [] ? run_timer_softirq+0xeb/0x3f0 [] ? sched_clock+0x9/0x10 [] ? mld_gq_timer_expire+0x30/0x30 [] __do_softirq+0xc0/0x210 [] ? tick_program_event+0x1f/0x30 [] call_softirq+0x1c/0x30 [] do_softirq+0xad/0xe0 [] irq_exit+0x9e/0xb0 [] smp_apic_timer_interrupt+0x70/0x9b [] apic_timer_interrupt+0x13/0x20 [] ? retint_restore_args+0x13/0x13 [] ? lock_is_held+0x17/0xd0 [] rt_do_flush+0x87/0x2a0 [] rt_cache_flush+0x46/0x60 [] fib_disable_ip+0x40/0x60 [] fib_inetaddr_event+0xd7/0xe0 [] notifier_call_chain+0x8c/0xc0 [] __blocking_notifier_call_chain+0x78/0xb0 [] blocking_notifier_call_chain+0x16/0x20 [] __inet_del_ifa+0xf1/0x2e0 [] inet_del_ifa+0x13/0x20 [] devinet_ioctl+0x501/0x800 [] ? local_clock+0x6f/0x80 [] ? do_page_fault+0x268/0x560 [] inet_ioctl+0x88/0xa0 [] sock_do_ioctl+0x30/0x70 [] sock_ioctl+0x79/0x2f0 [] ? __call_rcu+0xa7/0x190 [] do_vfs_ioctl+0x98/0x570 [] ? fget_light+0x33e/0x430 [] ? retint_swapgs+0x13/0x1b [] sys_ioctl+0x91/0xa0 [] system_call_fastpath+0x16/0x1b Signed-off-by: Roland Dreier Signed-off-by: Shreyas N Bhatewara Signed-off-by: Scott J. Goldman Signed-off-by: David S. Miller --- drivers/net/vmxnet3/vmxnet3_drv.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/net/vmxnet3/vmxnet3_drv.c b/drivers/net/vmxnet3/vmxnet3_drv.c index 0d47c3a..c16ed96 100644 --- a/drivers/net/vmxnet3/vmxnet3_drv.c +++ b/drivers/net/vmxnet3/vmxnet3_drv.c @@ -178,6 +178,7 @@ static void vmxnet3_process_events(struct vmxnet3_adapter *adapter) { int i; + unsigned long flags; u32 events = le32_to_cpu(adapter->shared->ecr); if (!events) return; @@ -190,10 +191,10 @@ vmxnet3_process_events(struct vmxnet3_adapter *adapter) /* Check if there is an error on xmit/recv queues */ if (events & (VMXNET3_ECR_TQERR | VMXNET3_ECR_RQERR)) { - spin_lock(&adapter->cmd_lock); + spin_lock_irqsave(&adapter->cmd_lock, flags); VMXNET3_WRITE_BAR1_REG(adapter, VMXNET3_REG_CMD, VMXNET3_CMD_GET_QUEUE_STATUS); - spin_unlock(&adapter->cmd_lock); + spin_unlock_irqrestore(&adapter->cmd_lock, flags); for (i = 0; i < adapter->num_tx_queues; i++) if (adapter->tqd_start[i].status.stopped) @@ -2733,13 +2734,14 @@ static void vmxnet3_alloc_intr_resources(struct vmxnet3_adapter *adapter) { u32 cfg; + unsigned long flags; /* intr settings */ - spin_lock(&adapter->cmd_lock); + spin_lock_irqsave(&adapter->cmd_lock, flags); VMXNET3_WRITE_BAR1_REG(adapter, VMXNET3_REG_CMD, VMXNET3_CMD_GET_CONF_INTR); cfg = VMXNET3_READ_BAR1_REG(adapter, VMXNET3_REG_CMD); - spin_unlock(&adapter->cmd_lock); + spin_unlock_irqrestore(&adapter->cmd_lock, flags); adapter->intr.type = cfg & 0x3; adapter->intr.mask_mode = (cfg >> 2) & 0x3; -- cgit v1.1 From a3a4a5acd3bd2f6f1e102e1f1b9d2e2bb320a7fd Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Thu, 5 May 2011 23:55:18 -0400 Subject: Regression: partial revert "tracing: Remove lock_depth from event entry" This partially reverts commit e6e1e2593592a8f6f6380496655d8c6f67431266. That commit changed the structure layout of the trace structure, which in turn broke PowerTOP (1.9x generation) quite badly. I appreciate not wanting to expose the variable in question, and PowerTOP was not using it, so I've replaced the variable with just a padding field - that way if in the future a new field is needed it can just use this padding field. Signed-off-by: Arjan van de Ven Signed-off-by: Linus Torvalds --- include/linux/ftrace_event.h | 1 + kernel/trace/trace.c | 1 + kernel/trace/trace_events.c | 1 + 3 files changed, 3 insertions(+) diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 22b32af..b5a550a 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -37,6 +37,7 @@ struct trace_entry { unsigned char flags; unsigned char preempt_count; int pid; + int padding; }; #define FTRACE_MAX_EVENT \ diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d38c16a..1cb49be 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1110,6 +1110,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, entry->preempt_count = pc & 0xff; entry->pid = (tsk) ? tsk->pid : 0; + entry->padding = 0; entry->flags = #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index e88f74f..2fe1103 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -116,6 +116,7 @@ static int trace_define_common_fields(void) __common_field(unsigned char, flags); __common_field(unsigned char, preempt_count); __common_field(int, pid); + __common_field(int, padding); return ret; } -- cgit v1.1 From 5f54c8a00af20e5cf38c3e5ef2f59b6848a17cd9 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Wed, 4 May 2011 17:31:27 +0200 Subject: rtc: mxc: Initialize drvdata before registering device Commit f44f7f96a20 ("RTC: Initialize kernel state from RTC") uncovered an issue in a number of RTC drivers, where the drivers call rtc_device_register before initializing the device or platform drvdata. This frequently results in null pointer dereferences when the rtc_device_register immediately makes use of the rtc device, calling rtc_read_alarm. The solution is to ensure the drvdata is initialized prior to registering the rtc device. CC: Alessandro Zummo CC: Thomas Gleixner CC: rtc-linux@googlegroups.com Signed-off-by: Wolfram Sang [fixed up commit log -jstultz] Signed-off-by: John Stultz --- drivers/rtc/rtc-mxc.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/drivers/rtc/rtc-mxc.c b/drivers/rtc/rtc-mxc.c index 826ab64..d814417 100644 --- a/drivers/rtc/rtc-mxc.c +++ b/drivers/rtc/rtc-mxc.c @@ -418,14 +418,6 @@ static int __init mxc_rtc_probe(struct platform_device *pdev) goto exit_put_clk; } - rtc = rtc_device_register(pdev->name, &pdev->dev, &mxc_rtc_ops, - THIS_MODULE); - if (IS_ERR(rtc)) { - ret = PTR_ERR(rtc); - goto exit_put_clk; - } - - pdata->rtc = rtc; platform_set_drvdata(pdev, pdata); /* Configure and enable the RTC */ @@ -438,8 +430,19 @@ static int __init mxc_rtc_probe(struct platform_device *pdev) pdata->irq = -1; } + rtc = rtc_device_register(pdev->name, &pdev->dev, &mxc_rtc_ops, + THIS_MODULE); + if (IS_ERR(rtc)) { + ret = PTR_ERR(rtc); + goto exit_clr_drvdata; + } + + pdata->rtc = rtc; + return 0; +exit_clr_drvdata: + platform_set_drvdata(pdev, NULL); exit_put_clk: clk_disable(pdata->clk); clk_put(pdata->clk); -- cgit v1.1 From f4e708ae8e5f3eb98f4c53036c0a470717bbc709 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Thu, 5 May 2011 11:46:14 +0200 Subject: rtc: davinci: Initialize drvdata before registering device Commit f44f7f96a20 ("RTC: Initialize kernel state from RTC") uncovered an issue in a number of RTC drivers, where the drivers call rtc_device_register before initializing the device or platform drvdata. This frequently results in null pointer dereferences when the rtc_device_register immediately makes use of the rtc device, calling rtc_read_alarm. The solution is to ensure the drvdata is initialized prior to registering the rtc device. CC: Alessandro Zummo CC: Thomas Gleixner CC: rtc-linux@googlegroups.com Signed-off-by: Wolfram Sang [fixed up commit log -jstultz] Signed-off-by: John Stultz --- drivers/rtc/rtc-davinci.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/rtc/rtc-davinci.c b/drivers/rtc/rtc-davinci.c index 8d46838..755e1fe 100644 --- a/drivers/rtc/rtc-davinci.c +++ b/drivers/rtc/rtc-davinci.c @@ -524,6 +524,8 @@ static int __init davinci_rtc_probe(struct platform_device *pdev) goto fail2; } + platform_set_drvdata(pdev, davinci_rtc); + davinci_rtc->rtc = rtc_device_register(pdev->name, &pdev->dev, &davinci_rtc_ops, THIS_MODULE); if (IS_ERR(davinci_rtc->rtc)) { @@ -553,8 +555,6 @@ static int __init davinci_rtc_probe(struct platform_device *pdev) rtcss_write(davinci_rtc, PRTCSS_RTC_CCTRL_CAEN, PRTCSS_RTC_CCTRL); - platform_set_drvdata(pdev, davinci_rtc); - device_init_wakeup(&pdev->dev, 0); return 0; @@ -562,6 +562,7 @@ static int __init davinci_rtc_probe(struct platform_device *pdev) fail4: rtc_device_unregister(davinci_rtc->rtc); fail3: + platform_set_drvdata(pdev, NULL); iounmap(davinci_rtc->base); fail2: release_mem_region(davinci_rtc->pbase, davinci_rtc->base_size); -- cgit v1.1 From 92d921c5def1a7b1411bc54859c0771b2cf2c08d Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Thu, 5 May 2011 11:46:15 +0200 Subject: rtc: ep93xx: Initialize drvdata before registering device Commit f44f7f96a20 ("RTC: Initialize kernel state from RTC") uncovered an issue in a number of RTC drivers, where the drivers call rtc_device_register before initializing the device or platform drvdata. This frequently results in null pointer dereferences when the rtc_device_register immediately makes use of the rtc device, calling rtc_read_alarm. The solution is to ensure the drvdata is initialized prior to registering the rtc device. CC: Alessandro Zummo CC: Thomas Gleixner CC: rtc-linux@googlegroups.com Signed-off-by: Wolfram Sang [Fixed up commit log -jstultz] Signed-off-by: John Stultz --- drivers/rtc/rtc-ep93xx.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/rtc/rtc-ep93xx.c b/drivers/rtc/rtc-ep93xx.c index 11ae64d..335551d 100644 --- a/drivers/rtc/rtc-ep93xx.c +++ b/drivers/rtc/rtc-ep93xx.c @@ -151,6 +151,7 @@ static int __init ep93xx_rtc_probe(struct platform_device *pdev) return -ENXIO; pdev->dev.platform_data = ep93xx_rtc; + platform_set_drvdata(pdev, rtc); rtc = rtc_device_register(pdev->name, &pdev->dev, &ep93xx_rtc_ops, THIS_MODULE); @@ -159,8 +160,6 @@ static int __init ep93xx_rtc_probe(struct platform_device *pdev) goto exit; } - platform_set_drvdata(pdev, rtc); - err = sysfs_create_group(&pdev->dev.kobj, &ep93xx_rtc_sysfs_files); if (err) goto fail; @@ -168,9 +167,9 @@ static int __init ep93xx_rtc_probe(struct platform_device *pdev) return 0; fail: - platform_set_drvdata(pdev, NULL); rtc_device_unregister(rtc); exit: + platform_set_drvdata(pdev, NULL); pdev->dev.platform_data = NULL; return err; } -- cgit v1.1 From 9a281a677c1dbf25943b5bc3225de21fcb4945ae Mon Sep 17 00:00:00 2001 From: John Stultz Date: Fri, 6 May 2011 17:21:12 -0700 Subject: rtc: ds1286: Initialize drvdata before registering device Commit f44f7f96a20 ("RTC: Initialize kernel state from RTC") uncovered an issue in a number of RTC drivers, where the drivers call rtc_device_register before initializing the device or platform drvdata. This frequently results in null pointer dereferences when the rtc_device_register immediately makes use of the rtc device, calling rtc_read_alarm. The solution is to ensure the drvdata is initialized prior to registering the rtc device. CC: Wolfram Sang CC: Alessandro Zummo CC: Thomas Gleixner CC: rtc-linux@googlegroups.com Signed-off-by: John Stultz --- drivers/rtc/rtc-ds1286.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/rtc/rtc-ds1286.c b/drivers/rtc/rtc-ds1286.c index 60ce696..47e681d 100644 --- a/drivers/rtc/rtc-ds1286.c +++ b/drivers/rtc/rtc-ds1286.c @@ -355,6 +355,7 @@ static int __devinit ds1286_probe(struct platform_device *pdev) goto out; } spin_lock_init(&priv->lock); + platform_set_drvdata(pdev, priv); rtc = rtc_device_register("ds1286", &pdev->dev, &ds1286_ops, THIS_MODULE); if (IS_ERR(rtc)) { @@ -362,7 +363,6 @@ static int __devinit ds1286_probe(struct platform_device *pdev) goto out; } priv->rtc = rtc; - platform_set_drvdata(pdev, priv); return 0; out: -- cgit v1.1 From 0078bff5283d1fd6417b840eda6dab912b7a5560 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 29 Apr 2011 00:24:29 +0200 Subject: Allow setting of number of raw devices as a module parameter Allow setting of maximal number of raw devices as a module parameter. This requires changing of static array into a vmalloced one (the array is going to be too large for kmalloc). Signed-off-by: Jan Kara Signed-off-by: Greg Kroah-Hartman --- drivers/char/Kconfig | 2 +- drivers/char/raw.c | 33 +++++++++++++++++++++++++++------ 2 files changed, 28 insertions(+), 7 deletions(-) diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig index ad59b4e..49502bc 100644 --- a/drivers/char/Kconfig +++ b/drivers/char/Kconfig @@ -523,7 +523,7 @@ config RAW_DRIVER with the O_DIRECT flag. config MAX_RAW_DEVS - int "Maximum number of RAW devices to support (1-8192)" + int "Maximum number of RAW devices to support (1-65536)" depends on RAW_DRIVER default "256" help diff --git a/drivers/char/raw.c b/drivers/char/raw.c index b4b9d5a..6f9db62 100644 --- a/drivers/char/raw.c +++ b/drivers/char/raw.c @@ -21,6 +21,7 @@ #include #include #include +#include #include @@ -30,10 +31,15 @@ struct raw_device_data { }; static struct class *raw_class; -static struct raw_device_data raw_devices[MAX_RAW_MINORS]; +static struct raw_device_data *raw_devices; static DEFINE_MUTEX(raw_mutex); static const struct file_operations raw_ctl_fops; /* forward declaration */ +static int max_raw_minors = MAX_RAW_MINORS; + +module_param(max_raw_minors, int, 0); +MODULE_PARM_DESC(max_raw_minors, "Maximum number of raw devices (1-65536)"); + /* * Open/close code for raw IO. * @@ -125,7 +131,7 @@ static int bind_set(int number, u64 major, u64 minor) struct raw_device_data *rawdev; int err = 0; - if (number <= 0 || number >= MAX_RAW_MINORS) + if (number <= 0 || number >= max_raw_minors) return -EINVAL; if (MAJOR(dev) != major || MINOR(dev) != minor) @@ -312,12 +318,26 @@ static int __init raw_init(void) dev_t dev = MKDEV(RAW_MAJOR, 0); int ret; - ret = register_chrdev_region(dev, MAX_RAW_MINORS, "raw"); + if (max_raw_minors < 1 || max_raw_minors > 65536) { + printk(KERN_WARNING "raw: invalid max_raw_minors (must be" + " between 1 and 65536), using %d\n", MAX_RAW_MINORS); + max_raw_minors = MAX_RAW_MINORS; + } + + raw_devices = vmalloc(sizeof(struct raw_device_data) * max_raw_minors); + if (!raw_devices) { + printk(KERN_ERR "Not enough memory for raw device structures\n"); + ret = -ENOMEM; + goto error; + } + memset(raw_devices, 0, sizeof(struct raw_device_data) * max_raw_minors); + + ret = register_chrdev_region(dev, max_raw_minors, "raw"); if (ret) goto error; cdev_init(&raw_cdev, &raw_fops); - ret = cdev_add(&raw_cdev, dev, MAX_RAW_MINORS); + ret = cdev_add(&raw_cdev, dev, max_raw_minors); if (ret) { kobject_put(&raw_cdev.kobj); goto error_region; @@ -336,8 +356,9 @@ static int __init raw_init(void) return 0; error_region: - unregister_chrdev_region(dev, MAX_RAW_MINORS); + unregister_chrdev_region(dev, max_raw_minors); error: + vfree(raw_devices); return ret; } @@ -346,7 +367,7 @@ static void __exit raw_exit(void) device_destroy(raw_class, MKDEV(RAW_MAJOR, 0)); class_destroy(raw_class); cdev_del(&raw_cdev); - unregister_chrdev_region(MKDEV(RAW_MAJOR, 0), MAX_RAW_MINORS); + unregister_chrdev_region(MKDEV(RAW_MAJOR, 0), max_raw_minors); } module_init(raw_init); -- cgit v1.1 From aabb6e1531b0c78423dca6a39620892249fef7f9 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 6 May 2011 13:27:41 -0700 Subject: efivars: prevent oops on unload when efi is not enabled efivars_exit() should check for efi_enabled and not undo allocations when efi is not enabled. Otherwise there is an Oops during module unload: calling efivars_init+0x0/0x1000 [efivars] @ 2810 EFI Variables Facility v0.08 2004-May-17 initcall efivars_init+0x0/0x1000 [efivars] returned 0 after 5120 usecs Oops: 0000 [#1] SMP DEBUG_PAGEALLOC last sysfs file: /sys/module/firmware_class/initstate CPU 1 Modules linked in: efivars(-) af_packet tun nfsd lockd nfs_acl auth_rpcgss sunrpc ipt_REJECT nf_conntrack_ipv4 nf_defrag_ipv4 iptable_filter ip_tables ip6t_REJECT xt_tcpudp nf_conntrack_ipv6 nf_defrag_ipv6 xt_state nf_conntrack ip6table_filter ip6_tables x_tables ipv6 cpufreq_ondemand acpi_cpufreq freq_table mperf binfmt_misc dm_mirror dm_region_hash dm_log dm_multipath scsi_dh dm_mod snd_hda_codec_analog snd_hda_intel snd_hda_codec snd_hwdep mousedev snd_seq joydev snd_seq_device mac_hid evdev snd_pcm usbkbd usbmouse usbhid snd_timer hid tg3 snd sr_mod pcspkr rtc_cmos soundcore cdrom iTCO_wdt processor sg dcdbas i2c_i801 rtc_core iTCO_vendor_support intel_agp snd_page_alloc thermal_sys rtc_lib intel_gtt 8250_pnp button hwmon unix ide_pci_generic ide_core ata_generic pata_acpi ata_piix sd_mod crc_t10dif ext3 jbd mbcache uhci_hcd ohci_hcd ssb mmc_core pcmcia pcmcia_core firmware_class ehci_hcd usbcore [last unloaded: dell_rbu] Pid: 2812, comm: rmmod Not tainted 2.6.39-rc6 #1 Dell Inc. OptiPlex 745 /0TY565 RIP: 0010:[] [] unregister_efivars+0x28/0x12c [efivars] RSP: 0018:ffff88005eedde98 EFLAGS: 00010283 RAX: ffffffffa06a23fc RBX: ffffffffa06a44c0 RCX: ffff88007c227a50 RDX: 0000000000000000 RSI: 00000055ac13db78 RDI: ffffffffa06a44c0 RBP: ffff88005eeddec8 R08: 0000000000000000 R09: ffff88005eeddd78 R10: ffffffffa06a4220 R11: ffff88005eeddd78 R12: fffffffffffff7d0 R13: 00007fff5a3aaec0 R14: 0000000000000000 R15: ffffffffa06a4508 FS: 00007fa8dcc4a6f0(0000) GS:ffff88007c200000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 0000000000000000 CR3: 000000005d148000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process rmmod (pid: 2812, threadinfo ffff88005eedc000, task ffff88006754b000) Stack: ffff88005eeddec8 ffffffffa06a4220 0000000000000000 00007fff5a3aaec0 0000000000000000 0000000000000001 ffff88005eedded8 ffffffffa06a2418 ffff88005eeddf78 ffffffff810d3598 ffffffffa06a4220 0000000000000880 Call Trace: [] efivars_exit+0x1c/0xc04 [efivars] [] sys_delete_module+0x2d6/0x368 [] ? lockdep_sys_exit_thunk+0x35/0x67 [] ? audit_syscall_entry+0x172/0x1a5 [] system_call_fastpath+0x16/0x1b Code: 5c c9 c3 55 48 89 e5 41 57 41 56 41 55 41 54 53 48 83 ec 08 0f 1f 44 00 00 4c 8b 67 48 48 89 fb 4c 8d 7f 48 49 81 ec 30 08 00 00 <4d> 8b ac 24 30 08 00 00 49 81 ed 30 08 00 00 eb 59 48 89 df 48 RIP [] unregister_efivars+0x28/0x12c [efivars] RSP CR2: 0000000000000000 ---[ end trace aa99b99090f70baa ]--- Matt apparently removed such a check in 2004 (with no reason given): * 17 May 2004 - Matt Domsch * remove check for efi_enabled in exit but there have been several changes since then. Signed-off-by: Randy Dunlap Signed-off-by: Mike Waychison Tested-by: Randy Dunlap Cc: Matt Domsch Cc: Signed-off-by: Greg Kroah-Hartman --- drivers/firmware/efivars.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/firmware/efivars.c b/drivers/firmware/efivars.c index 5d1ec68..a2d2f1f 100644 --- a/drivers/firmware/efivars.c +++ b/drivers/firmware/efivars.c @@ -827,8 +827,10 @@ err_put: static void __exit efivars_exit(void) { - unregister_efivars(&__efivars); - kobject_put(efi_kobj); + if (efi_enabled) { + unregister_efivars(&__efivars); + kobject_put(efi_kobj); + } } module_init(efivars_init); -- cgit v1.1 From b50fa7c8077c625919b1e0a75fc37b825f024518 Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Thu, 5 May 2011 13:32:05 +0200 Subject: reboot: disable usermodehelper to prevent fs access In case CONFIG_UEVENT_HELPER_PATH is not set to "", which it should be on every system, the kernel forks processes during shutdown, which try to access the rootfs, even when the binary does not exist. It causes exceptions and long delays in the disk driver, which gets read requests at the time it tries to shut down the disk. This patch disables all kernel-forked processes during reboot to allow a clean poweroff. Cc: Tejun Heo Tested-By: Anton Guda Signed-off-by: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- kernel/sys.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/sys.c b/kernel/sys.c index af468ed..70c4c51 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -314,6 +314,7 @@ void kernel_restart_prepare(char *cmd) { blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); system_state = SYSTEM_RESTART; + usermodehelper_disable(); device_shutdown(); sysdev_shutdown(); syscore_shutdown(); @@ -344,6 +345,7 @@ static void kernel_shutdown_prepare(enum system_states state) blocking_notifier_call_chain(&reboot_notifier_list, (state == SYSTEM_HALT)?SYS_HALT:SYS_POWER_OFF, NULL); system_state = state; + usermodehelper_disable(); device_shutdown(); } /** -- cgit v1.1 From 9333744dc7dcd85531cff13cabf1d5d6baf18e7d Mon Sep 17 00:00:00 2001 From: "Robert P. J. Day" Date: Wed, 4 May 2011 05:19:34 -0400 Subject: RAW driver: Remove call to kobject_put(). If cdev_add() fails, there is no justification for subsequently calling kobject_put(). Signed-off-by: Robert P. J. Day Signed-off-by: Greg Kroah-Hartman --- drivers/char/raw.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/char/raw.c b/drivers/char/raw.c index 6f9db62..b33e8ea 100644 --- a/drivers/char/raw.c +++ b/drivers/char/raw.c @@ -339,7 +339,6 @@ static int __init raw_init(void) cdev_init(&raw_cdev, &raw_fops); ret = cdev_add(&raw_cdev, dev, max_raw_minors); if (ret) { - kobject_put(&raw_cdev.kobj); goto error_region; } -- cgit v1.1 From 3ccff540070b5adde7eec443676cfee1dd6b89fd Mon Sep 17 00:00:00 2001 From: Harry Wei Date: Thu, 5 May 2011 09:47:48 +0800 Subject: Translated Documentation/email-clients.txt The patch includes the translation Documentation/email-clients.txt. If anyone has other problems, please let me know. Signed-off-by: Harry Wei Signed-off-by: Greg Kroah-Hartman --- Documentation/zh_CN/email-clients.txt | 210 ++++++++++++++++++++++++++++++++++ 1 file changed, 210 insertions(+) create mode 100644 Documentation/zh_CN/email-clients.txt diff --git a/Documentation/zh_CN/email-clients.txt b/Documentation/zh_CN/email-clients.txt new file mode 100644 index 0000000..5d65e32 --- /dev/null +++ b/Documentation/zh_CN/email-clients.txt @@ -0,0 +1,210 @@ +锘?Chinese translated version of Documentation/email-clients.txt + +If you have any comment or update to the content, please contact the +original document maintainer directly. However, if you have a problem +communicating in English you can also ask the Chinese maintainer for +help. Contact the Chinese maintainer if this translation is outdated +or if there is a problem with the translation. + +Chinese maintainer: Harry Wei +--------------------------------------------------------------------- +Documentation/email-clients.txt ???涓????缈æ˜?? + +æ¿¡??????å® ??ç’烘????å­˜?版???????????瀹癸??璇风?å­˜?ヨ??绯诲?????妗g??ç¼å­˜?よ?????æ¿¡????æµ£?浣跨?ㄨ?辨?? +浜ゆ???????ä¼´?剧??ç’‡?é”›?涔????浠ュ??涓???????ç¼å­˜?よ??姹???┿??æ¿¡???????缈æ˜????å­˜?é¢???????舵?????缈? +ç’‡?瀛???ã„©??棰?é”›?璇疯??绯讳腑??????ç¼å­˜?よ????? + +涓???????ç¼å­˜?よ??é”›? ç’惧??æ¿ž? Harry Wei +涓???????缈æ˜?????é”›? ç’惧??æ¿ž? Harry Wei +涓?????????¤?????é”›? Yinglin Luan + Xiaochen Wang + yaxinsn + +浠ヤ??涓烘?f?? +--------------------------------------------------------------------- + +Linux???æµ è·º?㈡?风?????缃?淇℃?? +====================================================================== + +?????????缃? +---------------------------------------------------------------------- +Linux?????歌ˉ涓???????æ©????浠惰?????浜ょ??é”›????濂芥??ç›ãƒ¤??æµ£?涓洪??浠朵????????宓?????????????浜?ç¼å­˜?よ?? +??ユ?å •??浠讹??æµ£???????æµ å‰?????瀹规?ç…Ž??æ´?璇ユ??"text/plain"?????惰??é”›????浠朵????????涓?ç’§???????é”›? +???涓鸿??æµ¼?浣胯ˉ涓????寮???ã„©?ã„¥????ㄨ??ç’鸿??绋?涓???????寰???ä¼´?俱?? + +??ㄦ?ュ?????Linux?????歌ˉ涓???????æµ è·º?㈡?风????ã„¥?????ç›ãƒ¤????è·º??璇ュ??浜?????????????æ¿®???舵?????渚?æ¿¡?é”›? +æµ ?æµ ?涓???芥?ç‘°?????????????ã‚…?惰〃绗???????绌烘?ç¡·???????虫????ㄦ??涓?ç›????寮?澶存?????ç¼?ç俱?? + +涓?ç‘•????æ©?"format=flowed"妯″????????ç›ãƒ¤?????æ©???蜂??寮?璧蜂?????棰????浠ュ?????瀹崇?????ç›???? + +涓?ç‘•?ç’â•€????????æµ è·º?㈡?风??æ©?ç›??????ㄦ?㈣?????æ©???蜂??æµ¼???æ‘??æµ£????ç›ãƒ¤????? + +???æµ è·º?㈡?风??涓???芥?ç‘°???????????瀛?ç»—????缂??????ç‘°?????ç‘•??????????ç›ãƒ¤???????芥??ASCII??????UTF-8缂??????ç‘°??é”›? +æ¿¡????æµ£?浣跨??UTF-8缂??????ç‘°???????????浠讹????d??æµ£?ç?æµ¼???åž®??涓?浜??????è—‰????????瀛?ç»—???????棰???? + +???æµ è·º?㈡?风??æ´?璇ュ舰???骞朵??æ·‡???? References: ?????? In-Reply-To: ???棰?é”›???d?? +???浠惰??棰?çå˜??æµ¼?涓??????? + +澶???å‰??甯?(?????????ç’寸??甯?)???甯é•????ç•Œ?ㄤ??ç›ãƒ¤??é”›????涓哄?惰〃绗?æµ¼?æž????涓虹┖??笺??浣跨??xclipboard, xclip +??????xcutsel涔?ç’稿??浠ワ??æµ£???????濂芥??ç’‡?涓?涓?????????åž®??浣跨?ã„¥????å‰??甯???? + +涓?ç‘•???ㄤ娇???PGP/GPG缃æ’????????浠朵腑??????ç›ãƒ¤?????æ©???蜂??浣垮??寰?澶???????涓???借?诲??????????ㄤ??æµ£????ç›ãƒ¤????? +é”›?æ©?涓????棰?æ´?璇ユ?????浠ヤ慨澶????é”›? + +??ㄧ???????æ??æµ è·º??ç›ã„¥?????ç›ãƒ¤??涔????é”›?ç¼????宸åž?????涓?涓?ç›ãƒ¤?????涓?涓???????涓绘??é”›?æ·‡?瀛???ユ?è·º?扮?? +???浠讹??ç?ç›ãƒ¤?????'patch'??戒护???涓?é”›?æ¿¡??????????浜?é”›????ç¼??????æ??æµ è·º??ç›ã„¥???????? + + +涓?浜????æµ è·º?㈡?风?????绀? +---------------------------------------------------------------------- +æ©????ç¼???è½°??浜?ç’‡?ç¼????MUA???缃????绀猴?????浠ョ?ㄤ??ç¼?Linux?????稿?????ç›ãƒ¤?????æ©?浜?骞朵???????虫?? +?????????æž?æµ è·º?????缃???è¤????? + +璇存??é”›? +TUI = 浠ユ?????涓哄?虹???????ㄦ?锋?ュ?? +GUI = ??惧舰?????㈢?ㄦ?锋?ュ?? + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Alpine (TUI) + +???缃????椤癸?? +???"Sending Preferences"??ã„¥??é”›? + +- "Do Not Send Flowed Text"蹇?椤诲????? +- "Strip Whitespace Before Sending"蹇?椤诲?抽?? + +褰???????浠舵?讹????????æ´?璇ユ?惧?ㄨˉ涓?æµ¼???虹?扮????版?癸????è·º?????涓?CTRL-Rç¼???????é”›?浣挎??瀹???? +ç›ãƒ¤?????æµ è·º????ュ?ä¼´??浠朵腑??? + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Evolution (GUI) + +涓?浜?寮????????????????浣跨?ã„¥????????ç›ãƒ¤?? + +褰??????â•…??æµ å •??椤癸??Preformat + æµ ?Format->Heading->Preformatted (Ctrl-7)??????宸ュ?锋?? + +??è·º??浣跨??é”›? + Insert->Text File... (Alt-n x)?????ヨˉ涓????浠躲?? + +æµ£?æ©????æµ ?"diff -Nru old.c new.c | xclip"é”›???????Preformaté”›???è·º??浣跨?ㄤ腑??æ’®??æ©?ç›?ç»®?甯???? + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Kmail (GUI) + +涓?浜?寮????????????????浣跨?ã„¥????????ç›ãƒ¤????? + +榛?ç’よ?剧疆涓?涓?HTML??ç…Ž??????????????é”›?涓?ç‘•??????ã„¥????? + +褰?涔????涓?ç????æµ å‰????è·º??é”›???ã„©??椤逛?????涓?ç‘•??????â•„????ㄦ?㈣????????涓????缂虹?ç‘°æ°¨???æµ£???ã„©??浠朵腑æˆ???ョ??浠讳???????? +??戒??æµ¼?çš??????ㄦ?㈣??é”›????å§ã‚„??蹇?椤诲?ã„¥?????ç›ãƒ¤??涔?????????ㄦ?㈣????????ç» ?????????规??ç辨???????ㄨ????ㄦ?㈣????ヤ功??????浠讹?? +??è·º?????瀹?æ·‡?瀛?涓鸿??绋裤??涓????æµ£???ㄨ??绋夸腑???娆℃??寮?瀹?é”›?瀹?宸茬????ã„©?ㄨ????ㄦ?㈣??浜?é”›???d??æµ£???????浠惰?ç•Œ?舵病??? +?????â•„????ㄦ?㈣??é”›?æµ£????æ©?涓?æµ¼?澶åž?诲凡???????????ㄦ?㈣????? + +??ã„©??æµ å‰??æ´????é”›??????ヨˉ涓?涔????é”›???å¥??甯哥?ㄧ??ç›ãƒ¤??瀹????ç»—?é”›?涓?涓?æ©?瀛????(---)??? + +??è·º?????"Message"????????$??é”›??????â•‚????ユ??浠讹????ョ????????æµ£????ç›ãƒ¤?????浠躲??æ©????涓?涓?棰?澶???????椤癸??æµ£????æµ ? +???æ©?瀹????缃?æµ£???????浠跺缓绔?宸ュ?锋????????é”›?æ©????浠ュ甫涓?"insert file"??炬????? + +æµ£????浠ュ????ã„¥?ä¼´??æ©?GPG???ç’ä¼´??浠讹??æµ£???????宓?ç›ãƒ¤?????濂戒??ç‘•?浣跨??GPG???ç’æ¿??æµ ????æµ£?涓哄??宓??????????绛惧??ç›ãƒ¤??é”›? +褰?æµ ?GPG涓???????7æµ£?缂??????朵??浣夸??æµ ?????????æ‘??澶??????? + +æ¿¡????æµ£????ç‘•?浠ラ??æµ å‰??褰㈠????????ç›ãƒ¤??é”›???d??çåž?抽????ç‘°?å©š??浠讹????è·º?????涓?çž???Ñ??ç»????"Suggest automatic +display"é”›?æ©???å³°??宓????浠舵?æ‘?规??ç’â•„?æ˜???????般?? + +褰?æµ£?ç‘•?æ·‡?瀛?ç?ç‘•?????????????宓???????ç›ãƒ¤??é”›?æµ£????浠ヤ??娑???????ç›ã„§????奸????â•?????ç›ãƒ¤????????浠讹????è·º????冲?å©š????? +"save as"???æµ£????浠ヤ娇??ㄤ??涓?娌℃????å­˜?圭????????ç›ãƒ¤????????浠讹??æ¿¡????瀹????浠ユ?g‘???褰㈠??ç¼???????褰?æµ£?å§ï½‡????ã„¥?? +???宸辩??ç»???d??涓?瀵????é”›???f?舵病??????椤瑰??浠ヤ??瀛????æµ ?--宸茬?????涓?涓?æ©???风??bugçš?姹???ュ?é¢??kmail???bugzilla +骞朵??甯????æ©?ç?æµ¼?çš?澶??????????浠舵??浠ュ?????瀵规??涓???ㄦ?å³°??璇诲???????????çš?æ·‡?瀛????é”›????浠ュ?????æµ£???虫?????æµ è·º????è·º?æ¿?朵????版?癸?? +æµ£?涓?寰?涓????æµ ?æµ ????????????逛负ç¼?????????ç¿ ?????璇汇?? + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Lotus Notes (GUI) + +涓?ç‘•?浣跨?ã„¥????? + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Mutt (TUI) + +寰?澶?Linux寮????浜哄??浣跨??mutt瀹㈡?风??é”›????浠ヨ?????瀹????瀹?宸ヤ????????甯告??浜???? + +Mutt涓????甯?缂?æˆ????é”›????浠ヤ??绠′??浣跨?ㄤ??涔?缂?æˆ???ã„©?戒??æ´?璇ュ甫????????ㄦ??ç›????澶у????扮??æˆ???ã„©?藉甫??? +涓?涓?"insert file"???椤癸??瀹????浠ラ??æ©?涓???ç‘°?????æµ è·º??瀹圭????ç‘°???????ユ??浠躲?? + +'vim'æµ£?涓?mutt???缂?æˆ????é”›? + set editor="vi" + + æ¿¡????浣跨??xclipé”›???æ’?ヤ互涓???戒护 + :set paste + ???涓????涔??????????shift-insert??????浣跨?? + :r filename + +æ¿¡??????å® ?????ç›ãƒ¤??æµ£?涓哄??宓?????????? +(a)ttach宸ヤ?????寰?æ¿‚æ–¤??涓?甯????"set paste"??? + +???缃????椤癸?? +瀹?æ´?璇ヤ互榛?ç’よ?剧疆???褰㈠??宸ヤ????? +??惰??é”›????"send_charset"ç’剧疆涓?"us-ascii::utf-8"涔????涓?涓?涓???????涓绘????? + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Pine (TUI) + +Pineæ©???绘??涓?浜?绌烘?ç…Ž????????棰?é”›?æµ£????æ©?浜???æ¿?ã„¥??璇ラ?借??æ·‡?澶?浜???? + +æ¿¡???????浠ワ??璇蜂娇???alpine(pine???ç¼Ñ„?胯??) + +???缃????椤癸?? +- ???æ©?????????????ç‘•?娑???ゆ??绋??????? +- "no-strip-whitespace-before-send"???椤逛????????ç‘•??????? + + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Sylpheed (GUI) + +- ???宓??????????浠ュ??æ¿‚ç•Œ??宸ヤ??é”›???????浣跨?ã„©??浠讹????? +- ???ç’é•å¨‡??ã„¥????ㄧ??缂?æˆ???ã„£?? +- 瀵逛?????褰?æˆ?澶???å •??甯告????? +- æ¿¡???????æ©?non-SSLæ©???ワ?????娉?浣跨??TLS SMTP????????? +- ??ㄧ?????ç»???d腑???涓?涓?寰??????ㄧ??ruler bar??? +- ç¼???æ¿?????涓?娣诲????æ¿??çå˜??æµ¼?å§ï½‡â€˜???浜?瑙f?剧ãš?????? + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Thunderbird (GUI) + +榛?ç’ゆ????å…¸??é”›?thunderbird寰?瀹规??????????????é”›?æµ£????æ©????涓?浜???规?????浠ュ己??è·º?????寰???æ‘ソ??? + +- ??ㄧ?ㄦ?å³°????ç–¯?剧疆???é”›?ç¼???????瀵诲??é”›?涓?ç‘•???????"Compose messages in HTML format"??? + +- 缂?æˆ?æµ£????Thunderbird???缃?ç’剧疆??ヤ娇瀹?涓?ç‘•????ç›?浣跨??é”›?user_pref("mailnews.wraplength", 0); + +- 缂?æˆ?æµ£????Thunderbird???缃?ç’剧疆锛?浣垮??涓?ç‘•?浣跨??"format=flowed"??ç…Ž??é”›?user_pref("mailnews. + send_plaintext_flowed", false); + +- æµ£????ç‘•?æµ£?Thunderbird???涓洪???????ç…Ž????ç‘°??é”›? + æ¿¡????榛?ç’ゆ????å…¸??æµ£?涔??????????HTML??ç…Ž??é”›???d?????寰???俱??æµ ?æµ ?æµ ????棰???????涓????妗?涓???????"Preformat"??ç…Ž????? + æ¿¡????榛?ç’ゆ????å…¸??æµ£?涔??????????????????ç…Ž??é”›?æµ£?涓?寰????瀹???逛负HTML??ç…Ž??é”›?æµ ?æµ ?æµ£?涓轰??娆℃?Ñ…??é”›???ヤ功?????扮??娑????é”›? + ??è·º??寮哄?朵娇瀹??????版???????ç…Ž??é”›???????瀹?çå˜?????ç›????ç‘•?瀹???æ¿??é”›???ã„¥??淇$????炬??涓?浣跨??shift?????ヤ娇瀹????涓?HTML + ??ç…Ž??é”›???è·º?????棰???????涓????妗?涓???????"Preformat"??ç…Ž????? + +- ???ç’é•å¨‡??ã„¥????ㄧ??缂?æˆ????é”›? + ???瀵?Thunderbird???ç›ãƒ¤?????ç» ?????????规??ç辨??浣跨?ㄤ??涓?"external editor"??â•??é”›???è·º??浣跨?ㄤ????????娆㈢?? + $EDITOR??ヨ?诲???????????骞惰ˉ涓???版?????涓????ç‘•?瀹???æ¿??é”›????浠ヤ??æžè—‰è‹Ÿæ¶“?瀹?ç‘?æ©?涓???â•??é”›???è·º??娣诲??涓?涓?浣跨?ã„¥????? + ??????View->Toolbars->Customize...??????褰?æµ£?涔????淇℃???????è·º??æµ ?æµ ???ç‘°?诲??çåž??浠ヤ????? + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +TkRat (GUI) + +???浠ヤ娇??ã„¥?????浣跨??"Insert file..."??????澶???ㄧ??缂?æˆ???ã„£?? + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Gmail (Web GUI) + +涓?ç‘•?浣跨?ã„¥????????ç›ãƒ¤????? + +Gmail缃?椤é›?㈡?风???????ã„¥?版????惰〃绗?æž????涓虹┖??笺?? + +??ç•Œ?è·º?惰〃绗?æž????涓虹┖??奸??棰????浠ヨ??澶???ㄧ??æˆ???ㄨВ??ç­¹???????è·º??æ©?æµ¼?浣跨?ã„¥??æž???㈣?????姣?ç›???????涓?78涓?瀛?ç»—???? + +???涓?涓????棰????Gmailæ©?æµ¼????浠讳??涓????ASCII???瀛?ç»—????淇℃????逛负base64缂???????瀹????涓?ç‘—åž®????????娆ф床浜虹?????瀛???? + + ### -- cgit v1.1 From a015dbc110a97ed3147546a9c914f18f71d798d0 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Fri, 6 May 2011 17:24:27 -0700 Subject: rtc: m41t80: Initialize clientdata before registering device Commit f44f7f96a20 ("RTC: Initialize kernel state from RTC") uncovered an issue in a number of RTC drivers, where the drivers call rtc_device_register before initializing the clientdata. This frequently results in null pointer dereferences when the rtc_device_register immediately makes use of the rtc device, calling rtc_read_alarm. The solution is to ensure the clientdata is initialized prior to registering the rtc device. CC: Wolfram Sang CC: Alessandro Zummo CC: Thomas Gleixner CC: rtc-linux@googlegroups.com Signed-off-by: John Stultz --- drivers/rtc/rtc-m41t80.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/rtc/rtc-m41t80.c b/drivers/rtc/rtc-m41t80.c index 69fe664..eda128f 100644 --- a/drivers/rtc/rtc-m41t80.c +++ b/drivers/rtc/rtc-m41t80.c @@ -783,6 +783,9 @@ static int m41t80_probe(struct i2c_client *client, goto exit; } + clientdata->features = id->driver_data; + i2c_set_clientdata(client, clientdata); + rtc = rtc_device_register(client->name, &client->dev, &m41t80_rtc_ops, THIS_MODULE); if (IS_ERR(rtc)) { @@ -792,8 +795,6 @@ static int m41t80_probe(struct i2c_client *client, } clientdata->rtc = rtc; - clientdata->features = id->driver_data; - i2c_set_clientdata(client, clientdata); /* Make sure HT (Halt Update) bit is cleared */ rc = i2c_smbus_read_byte_data(client, M41T80_REG_ALARM_HOUR); -- cgit v1.1 From 880ffb5c6c5c8c8c6efd9efe9355317322b4603b Mon Sep 17 00:00:00 2001 From: Wanlong Gao Date: Thu, 5 May 2011 07:55:36 +0800 Subject: driver core: Add the device driver-model structures to kerneldoc Add the comments to the structure bus_type, device_driver, device, class to device.h for generating the driver-model kerneldoc. With another patch these all removed from the files in Documentation/driver-model/ since they are out of date. That will keep things up to date and provide a better way to document this stuff. Signed-off-by: Wanlong Gao Acked-by: Harry Wei Signed-off-by: Greg Kroah-Hartman --- Documentation/DocBook/device-drivers.tmpl | 6 +- include/linux/device.h | 154 +++++++++++++++++++++++++++++- 2 files changed, 154 insertions(+), 6 deletions(-) diff --git a/Documentation/DocBook/device-drivers.tmpl b/Documentation/DocBook/device-drivers.tmpl index 36f63d4..b638e50 100644 --- a/Documentation/DocBook/device-drivers.tmpl +++ b/Documentation/DocBook/device-drivers.tmpl @@ -96,10 +96,10 @@ X!Iinclude/linux/kobject.h Device drivers infrastructure + The Basic Device Driver-Model Structures +!Iinclude/linux/device.h + Device Drivers Base - !Edrivers/base/driver.c !Edrivers/base/core.c !Edrivers/base/class.c diff --git a/include/linux/device.h b/include/linux/device.h index 2215d01..4236506 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -47,6 +47,38 @@ extern int __must_check bus_create_file(struct bus_type *, struct bus_attribute *); extern void bus_remove_file(struct bus_type *, struct bus_attribute *); +/** + * struct bus_type - The bus type of the device + * + * @name: The name of the bus. + * @bus_attrs: Default attributes of the bus. + * @dev_attrs: Default attributes of the devices on the bus. + * @drv_attrs: Default attributes of the device drivers on the bus. + * @match: Called, perhaps multiple times, whenever a new device or driver + * is added for this bus. It should return a nonzero value if the + * given device can be handled by the given driver. + * @uevent: Called when a device is added, removed, or a few other things + * that generate uevents to add the environment variables. + * @probe: Called when a new device or driver add to this bus, and callback + * the specific driver's probe to initial the matched device. + * @remove: Called when a device removed from this bus. + * @shutdown: Called at shut-down time to quiesce the device. + * @suspend: Called when a device on this bus wants to go to sleep mode. + * @resume: Called to bring a device on this bus out of sleep mode. + * @pm: Power management operations of this bus, callback the specific + * device driver's pm-ops. + * @p: The private data of the driver core, only the driver core can + * touch this. + * + * A bus is a channel between the processor and one or more devices. For the + * purposes of the device model, all devices are connected via a bus, even if + * it is an internal, virtual, "platform" bus. Buses can plug into each other. + * A USB controller is usually a PCI device, for example. The device model + * represents the actual connections between buses and the devices they control. + * A bus is represented by the bus_type structure. It contains the name, the + * default attributes, the bus' methods, PM operations, and the driver core's + * private data. + */ struct bus_type { const char *name; struct bus_attribute *bus_attrs; @@ -119,6 +151,37 @@ extern int bus_unregister_notifier(struct bus_type *bus, extern struct kset *bus_get_kset(struct bus_type *bus); extern struct klist *bus_get_device_klist(struct bus_type *bus); +/** + * struct device_driver - The basic device driver structure + * @name: Name of the device driver. + * @bus: The bus which the device of this driver belongs to. + * @owner: The module owner. + * @mod_name: Used for built-in modules. + * @suppress_bind_attrs: Disables bind/unbind via sysfs. + * @of_match_table: The open firmware table. + * @probe: Called to query the existence of a specific device, + * whether this driver can work with it, and bind the driver + * to a specific device. + * @remove: Called when the device is removed from the system to + * unbind a device from this driver. + * @shutdown: Called at shut-down time to quiesce the device. + * @suspend: Called to put the device to sleep mode. Usually to a + * low power state. + * @resume: Called to bring a device from sleep mode. + * @groups: Default attributes that get created by the driver core + * automatically. + * @pm: Power management operations of the device which matched + * this driver. + * @p: Driver core's private data, no one other than the driver + * core can touch this. + * + * The device driver-model tracks all of the drivers known to the system. + * The main reason for this tracking is to enable the driver core to match + * up drivers with new devices. Once drivers are known objects within the + * system, however, a number of other things become possible. Device drivers + * can export information and configuration variables that are independent + * of any specific device. + */ struct device_driver { const char *name; struct bus_type *bus; @@ -185,8 +248,34 @@ struct device *driver_find_device(struct device_driver *drv, struct device *start, void *data, int (*match)(struct device *dev, void *data)); -/* - * device classes +/** + * struct class - device classes + * @name: Name of the class. + * @owner: The module owner. + * @class_attrs: Default attributes of this class. + * @dev_attrs: Default attributes of the devices belong to the class. + * @dev_bin_attrs: Default binary attributes of the devices belong to the class. + * @dev_kobj: The kobject that represents this class and links it into the hierarchy. + * @dev_uevent: Called when a device is added, removed from this class, or a + * few other things that generate uevents to add the environment + * variables. + * @devnode: Callback to provide the devtmpfs. + * @class_release: Called to release this class. + * @dev_release: Called to release the device. + * @suspend: Used to put the device to sleep mode, usually to a low power + * state. + * @resume: Used to bring the device from the sleep mode. + * @ns_type: Callbacks so sysfs can detemine namespaces. + * @namespace: Namespace of the device belongs to this class. + * @pm: The default device power management operations of this class. + * @p: The private data of the driver core, no one other than the + * driver core can touch this. + * + * A class is a higher-level view of a device that abstracts out low-level + * implementation details. Drivers may see a SCSI disk or an ATA disk, but, + * at the class level, they are all simply disks. Classes allow user space + * to work with devices based on what they do, rather than how they are + * connected or how they work. */ struct class { const char *name; @@ -401,6 +490,65 @@ struct device_dma_parameters { unsigned long segment_boundary_mask; }; +/** + * struct device - The basic device structure + * @parent: The device's "parent" device, the device to which it is attached. + * In most cases, a parent device is some sort of bus or host + * controller. If parent is NULL, the device, is a top-level device, + * which is not usually what you want. + * @p: Holds the private data of the driver core portions of the device. + * See the comment of the struct device_private for detail. + * @kobj: A top-level, abstract class from which other classes are derived. + * @init_name: Initial name of the device. + * @type: The type of device. + * This identifies the device type and carries type-specific + * information. + * @mutex: Mutex to synchronize calls to its driver. + * @bus: Type of bus device is on. + * @driver: Which driver has allocated this + * @platform_data: Platform data specific to the device. + * Example: For devices on custom boards, as typical of embedded + * and SOC based hardware, Linux often uses platform_data to point + * to board-specific structures describing devices and how they + * are wired. That can include what ports are available, chip + * variants, which GPIO pins act in what additional roles, and so + * on. This shrinks the "Board Support Packages" (BSPs) and + * minimizes board-specific #ifdefs in drivers. + * @power: For device power management. + * See Documentation/power/devices.txt for details. + * @pwr_domain: Provide callbacks that are executed during system suspend, + * hibernation, system resume and during runtime PM transitions + * along with subsystem-level and driver-level callbacks. + * @numa_node: NUMA node this device is close to. + * @dma_mask: Dma mask (if dma'ble device). + * @coherent_dma_mask: Like dma_mask, but for alloc_coherent mapping as not all + * hardware supports 64-bit addresses for consistent allocations + * such descriptors. + * @dma_parms: A low level driver may set these to teach IOMMU code about + * segment limitations. + * @dma_pools: Dma pools (if dma'ble device). + * @dma_mem: Internal for coherent mem override. + * @archdata: For arch-specific additions. + * @of_node: Associated device tree node. + * @of_match: Matching of_device_id from driver. + * @devt: For creating the sysfs "dev". + * @devres_lock: Spinlock to protect the resource of the device. + * @devres_head: The resources list of the device. + * @knode_class: The node used to add the device to the class list. + * @class: The class of the device. + * @groups: Optional attribute groups. + * @release: Callback to free the device after all references have + * gone away. This should be set by the allocator of the + * device (i.e. the bus driver that discovered the device). + * + * At the lowest level, every device in a Linux system is represented by an + * instance of struct device. The device structure contains the information + * that the device model core needs to model the system. Most subsystems, + * however, track additional information about the devices they host. As a + * result, it is rare for devices to be represented by bare device structures; + * instead, that structure, like kobject structures, is usually embedded within + * a higher-level representation of the device. + */ struct device { struct device *parent; @@ -611,7 +759,7 @@ extern int (*platform_notify)(struct device *dev); extern int (*platform_notify_remove)(struct device *dev); -/** +/* * get_device - atomically increment the reference count for the device. * */ -- cgit v1.1 From 63dc355a5a8cf296e2b1cc2e4192190dca221129 Mon Sep 17 00:00:00 2001 From: Wanlong Gao Date: Thu, 5 May 2011 07:55:37 +0800 Subject: driver core: remove the driver-model structures from the documentation Remove the struct bus_type, class, device, device_driver from the driver-model docs. With another patch add them to device.h, since they are out of date. That will keep things up to date and provide a better way to document this stuff. Signed-off-by: Wanlong Gao Acked-by: Harry Wei Signed-off-by: Greg Kroah-Hartman --- Documentation/driver-model/bus.txt | 19 +------- Documentation/driver-model/class.txt | 17 +------ Documentation/driver-model/device.txt | 91 +---------------------------------- Documentation/driver-model/driver.txt | 18 +------ 4 files changed, 4 insertions(+), 141 deletions(-) diff --git a/Documentation/driver-model/bus.txt b/Documentation/driver-model/bus.txt index 5001b75..6754b2d 100644 --- a/Documentation/driver-model/bus.txt +++ b/Documentation/driver-model/bus.txt @@ -3,24 +3,7 @@ Bus Types Definition ~~~~~~~~~~ - -struct bus_type { - char * name; - - struct subsystem subsys; - struct kset drivers; - struct kset devices; - - struct bus_attribute * bus_attrs; - struct device_attribute * dev_attrs; - struct driver_attribute * drv_attrs; - - int (*match)(struct device * dev, struct device_driver * drv); - int (*hotplug) (struct device *dev, char **envp, - int num_envp, char *buffer, int buffer_size); - int (*suspend)(struct device * dev, pm_message_t state); - int (*resume)(struct device * dev); -}; +See the kerneldoc for the struct bus_type. int bus_register(struct bus_type * bus); diff --git a/Documentation/driver-model/class.txt b/Documentation/driver-model/class.txt index 548505f..1fefc48 100644 --- a/Documentation/driver-model/class.txt +++ b/Documentation/driver-model/class.txt @@ -27,22 +27,7 @@ The device class structure looks like: typedef int (*devclass_add)(struct device *); typedef void (*devclass_remove)(struct device *); -struct device_class { - char * name; - rwlock_t lock; - u32 devnum; - struct list_head node; - - struct list_head drivers; - struct list_head intf_list; - - struct driver_dir_entry dir; - struct driver_dir_entry device_dir; - struct driver_dir_entry driver_dir; - - devclass_add add_device; - devclass_remove remove_device; -}; +See the kerneldoc for the struct class. A typical device class definition would look like: diff --git a/Documentation/driver-model/device.txt b/Documentation/driver-model/device.txt index a124f31..b2ff426 100644 --- a/Documentation/driver-model/device.txt +++ b/Documentation/driver-model/device.txt @@ -2,96 +2,7 @@ The Basic Device Structure ~~~~~~~~~~~~~~~~~~~~~~~~~~ -struct device { - struct list_head g_list; - struct list_head node; - struct list_head bus_list; - struct list_head driver_list; - struct list_head intf_list; - struct list_head children; - struct device * parent; - - char name[DEVICE_NAME_SIZE]; - char bus_id[BUS_ID_SIZE]; - - spinlock_t lock; - atomic_t refcount; - - struct bus_type * bus; - struct driver_dir_entry dir; - - u32 class_num; - - struct device_driver *driver; - void *driver_data; - void *platform_data; - - u32 current_state; - unsigned char *saved_state; - - void (*release)(struct device * dev); -}; - -Fields -~~~~~~ -g_list: Node in the global device list. - -node: Node in device's parent's children list. - -bus_list: Node in device's bus's devices list. - -driver_list: Node in device's driver's devices list. - -intf_list: List of intf_data. There is one structure allocated for - each interface that the device supports. - -children: List of child devices. - -parent: *** FIXME *** - -name: ASCII description of device. - Example: " 3Com Corporation 3c905 100BaseTX [Boomerang]" - -bus_id: ASCII representation of device's bus position. This - field should be a name unique across all devices on the - bus type the device belongs to. - - Example: PCI bus_ids are in the form of - :. - This name is unique across all PCI devices in the system. - -lock: Spinlock for the device. - -refcount: Reference count on the device. - -bus: Pointer to struct bus_type that device belongs to. - -dir: Device's sysfs directory. - -class_num: Class-enumerated value of the device. - -driver: Pointer to struct device_driver that controls the device. - -driver_data: Driver-specific data. - -platform_data: Platform data specific to the device. - - Example: for devices on custom boards, as typical of embedded - and SOC based hardware, Linux often uses platform_data to point - to board-specific structures describing devices and how they - are wired. That can include what ports are available, chip - variants, which GPIO pins act in what additional roles, and so - on. This shrinks the "Board Support Packages" (BSPs) and - minimizes board-specific #ifdefs in drivers. - -current_state: Current power state of the device. - -saved_state: Pointer to saved state of the device. This is usable by - the device driver controlling the device. - -release: Callback to free the device after all references have - gone away. This should be set by the allocator of the - device (i.e. the bus driver that discovered the device). +See the kerneldoc for the struct device. Programming Interface diff --git a/Documentation/driver-model/driver.txt b/Documentation/driver-model/driver.txt index d2cd6fb..4421135 100644 --- a/Documentation/driver-model/driver.txt +++ b/Documentation/driver-model/driver.txt @@ -1,23 +1,7 @@ Device Drivers -struct device_driver { - char * name; - struct bus_type * bus; - - struct completion unloaded; - struct kobject kobj; - list_t devices; - - struct module *owner; - - int (*probe) (struct device * dev); - int (*remove) (struct device * dev); - - int (*suspend) (struct device * dev, pm_message_t state); - int (*resume) (struct device * dev); -}; - +See the kerneldoc for the struct device_driver. Allocation -- cgit v1.1 From 2f5c4fe8f9811152d69ef5cd020e095a1f84ca65 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Fri, 6 May 2011 17:26:25 -0700 Subject: rtc: max8925: Initialize drvdata before registering device Commit f44f7f96a20 ("RTC: Initialize kernel state from RTC") uncovered an issue in a number of RTC drivers, where the drivers call rtc_device_register before initializing the device or platform drvdata. This frequently results in null pointer dereferences when the rtc_device_register immediately makes use of the rtc device, calling rtc_read_alarm. The solution is to ensure the drvdata is initialized prior to registering the rtc device. CC: Wolfram Sang CC: Alessandro Zummo CC: Thomas Gleixner CC: rtc-linux@googlegroups.com Signed-off-by: John Stultz --- drivers/rtc/rtc-max8925.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/rtc/rtc-max8925.c b/drivers/rtc/rtc-max8925.c index 20494b5..3bc046f 100644 --- a/drivers/rtc/rtc-max8925.c +++ b/drivers/rtc/rtc-max8925.c @@ -258,6 +258,8 @@ static int __devinit max8925_rtc_probe(struct platform_device *pdev) } dev_set_drvdata(&pdev->dev, info); + /* XXX - isn't this redundant? */ + platform_set_drvdata(pdev, info); info->rtc_dev = rtc_device_register("max8925-rtc", &pdev->dev, &max8925_rtc_ops, THIS_MODULE); @@ -267,10 +269,9 @@ static int __devinit max8925_rtc_probe(struct platform_device *pdev) goto out_rtc; } - platform_set_drvdata(pdev, info); - return 0; out_rtc: + platform_set_drvdata(pdev, NULL); free_irq(chip->irq_base + MAX8925_IRQ_RTC_ALARM0, info); out_irq: kfree(info); -- cgit v1.1 From 03cf7c477de8cb47658ba93f33dc93242985acff Mon Sep 17 00:00:00 2001 From: John Stultz Date: Fri, 6 May 2011 17:27:07 -0700 Subject: rtc: max8998: Initialize drvdata before registering device Commit f44f7f96a20 ("RTC: Initialize kernel state from RTC") uncovered an issue in a number of RTC drivers, where the drivers call rtc_device_register before initializing the device or platform drvdata. This frequently results in null pointer dereferences when the rtc_device_register immediately makes use of the rtc device, calling rtc_read_alarm. The solution is to ensure the drvdata is initialized prior to registering the rtc device. CC: Wolfram Sang CC: Alessandro Zummo CC: Thomas Gleixner CC: rtc-linux@googlegroups.com Signed-off-by: John Stultz --- drivers/rtc/rtc-max8998.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/rtc/rtc-max8998.c b/drivers/rtc/rtc-max8998.c index 3f7bc6b..2e48aa6 100644 --- a/drivers/rtc/rtc-max8998.c +++ b/drivers/rtc/rtc-max8998.c @@ -265,6 +265,8 @@ static int __devinit max8998_rtc_probe(struct platform_device *pdev) info->rtc = max8998->rtc; info->irq = max8998->irq_base + MAX8998_IRQ_ALARM0; + platform_set_drvdata(pdev, info); + info->rtc_dev = rtc_device_register("max8998-rtc", &pdev->dev, &max8998_rtc_ops, THIS_MODULE); @@ -274,8 +276,6 @@ static int __devinit max8998_rtc_probe(struct platform_device *pdev) goto out_rtc; } - platform_set_drvdata(pdev, info); - ret = request_threaded_irq(info->irq, NULL, max8998_rtc_alarm_irq, 0, "rtc-alarm0", info); @@ -293,6 +293,7 @@ static int __devinit max8998_rtc_probe(struct platform_device *pdev) return 0; out_rtc: + platform_set_drvdata(pdev, NULL); kfree(info); return ret; } -- cgit v1.1 From 93015236d92bf9ea746c0b10c3c1d9058cb11f82 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Fri, 6 May 2011 17:28:36 -0700 Subject: rtc: msm6242: Initialize drvdata before registering device Commit f44f7f96a20 ("RTC: Initialize kernel state from RTC") uncovered an issue in a number of RTC drivers, where the drivers call rtc_device_register before initializing the device or platform drvdata. This frequently results in null pointer dereferences when the rtc_device_register immediately makes use of the rtc device, calling rtc_read_alarm. The solution is to ensure the drvdata is initialized prior to registering the rtc device. CC: Wolfram Sang CC: Alessandro Zummo CC: Thomas Gleixner CC: rtc-linux@googlegroups.com Signed-off-by: John Stultz --- drivers/rtc/rtc-msm6242.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/rtc/rtc-msm6242.c b/drivers/rtc/rtc-msm6242.c index 6782062..fcb113c 100644 --- a/drivers/rtc/rtc-msm6242.c +++ b/drivers/rtc/rtc-msm6242.c @@ -214,6 +214,7 @@ static int __init msm6242_rtc_probe(struct platform_device *dev) error = -ENOMEM; goto out_free_priv; } + platform_set_drvdata(dev, priv); rtc = rtc_device_register("rtc-msm6242", &dev->dev, &msm6242_rtc_ops, THIS_MODULE); @@ -223,10 +224,10 @@ static int __init msm6242_rtc_probe(struct platform_device *dev) } priv->rtc = rtc; - platform_set_drvdata(dev, priv); return 0; out_unmap: + platform_set_drvdata(dev, NULL); iounmap(priv->regs); out_free_priv: kfree(priv); -- cgit v1.1 From 4b3687f9c18156cdb71729fe4e0c3000f7e4d7de Mon Sep 17 00:00:00 2001 From: John Stultz Date: Fri, 6 May 2011 17:30:57 -0700 Subject: rtc: pcap: Initialize drvdata before registering device Commit f44f7f96a20 ("RTC: Initialize kernel state from RTC") uncovered an issue in a number of RTC drivers, where the drivers call rtc_device_register before initializing the device or platform drvdata. This frequently results in null pointer dereferences when the rtc_device_register immediately makes use of the rtc device, calling rtc_read_alarm. The solution is to ensure the drvdata is initialized prior to registering the rtc device. CC: Wolfram Sang CC: Alessandro Zummo CC: Thomas Gleixner CC: rtc-linux@googlegroups.com Signed-off-by: John Stultz --- drivers/rtc/rtc-pcap.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/rtc/rtc-pcap.c b/drivers/rtc/rtc-pcap.c index a633abc..cd4f198 100644 --- a/drivers/rtc/rtc-pcap.c +++ b/drivers/rtc/rtc-pcap.c @@ -151,6 +151,8 @@ static int __devinit pcap_rtc_probe(struct platform_device *pdev) pcap_rtc->pcap = dev_get_drvdata(pdev->dev.parent); + platform_set_drvdata(pdev, pcap_rtc); + pcap_rtc->rtc = rtc_device_register("pcap", &pdev->dev, &pcap_rtc_ops, THIS_MODULE); if (IS_ERR(pcap_rtc->rtc)) { @@ -158,7 +160,6 @@ static int __devinit pcap_rtc_probe(struct platform_device *pdev) goto fail_rtc; } - platform_set_drvdata(pdev, pcap_rtc); timer_irq = pcap_to_irq(pcap_rtc->pcap, PCAP_IRQ_1HZ); alarm_irq = pcap_to_irq(pcap_rtc->pcap, PCAP_IRQ_TODA); @@ -177,6 +178,7 @@ fail_alarm: fail_timer: rtc_device_unregister(pcap_rtc->rtc); fail_rtc: + platform_set_drvdata(pdev, NULL); kfree(pcap_rtc); return err; } -- cgit v1.1 From 130107b270f9a8ef1b50e02140a381c44a6abd68 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Fri, 6 May 2011 17:31:20 -0700 Subject: rtc: rp5c01: Initialize drvdata before registering device Commit f44f7f96a20 ("RTC: Initialize kernel state from RTC") uncovered an issue in a number of RTC drivers, where the drivers call rtc_device_register before initializing the device or platform drvdata. This frequently results in null pointer dereferences when the rtc_device_register immediately makes use of the rtc device, calling rtc_read_alarm. The solution is to ensure the drvdata is initialized prior to registering the rtc device. CC: Wolfram Sang CC: Alessandro Zummo CC: Thomas Gleixner CC: rtc-linux@googlegroups.com Signed-off-by: John Stultz --- drivers/rtc/rtc-rp5c01.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/rtc/rtc-rp5c01.c b/drivers/rtc/rtc-rp5c01.c index 694da39..359da6d 100644 --- a/drivers/rtc/rtc-rp5c01.c +++ b/drivers/rtc/rtc-rp5c01.c @@ -249,15 +249,15 @@ static int __init rp5c01_rtc_probe(struct platform_device *dev) spin_lock_init(&priv->lock); + platform_set_drvdata(dev, priv); + rtc = rtc_device_register("rtc-rp5c01", &dev->dev, &rp5c01_rtc_ops, THIS_MODULE); if (IS_ERR(rtc)) { error = PTR_ERR(rtc); goto out_unmap; } - priv->rtc = rtc; - platform_set_drvdata(dev, priv); error = sysfs_create_bin_file(&dev->dev.kobj, &priv->nvram_attr); if (error) @@ -268,6 +268,7 @@ static int __init rp5c01_rtc_probe(struct platform_device *dev) out_unregister: rtc_device_unregister(rtc); out_unmap: + platform_set_drvdata(dev, NULL); iounmap(priv->regs); out_free_priv: kfree(priv); -- cgit v1.1 From 156229b352b999cafb86a21b50912975e39b7f44 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 6 May 2011 11:57:47 +0200 Subject: rtc: mc13xxx: Don't call rtc_device_register while holding lock MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since commit f44f7f9 (RTC: Initialize kernel state from RTC) rtc_device_register reads the programmed alarm. As reading the alarm needs to take the mc13xxx lock, release it before calling rtc_device_register. This fixes a deadlock during boot: INFO: task swapper:1 blocked for more than 120 seconds. "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. swapper D c02b175c 0 1 0 0x00000000 [] (schedule+0x304/0x4f4) from [] (__mutex_lock_slowpath+0x7c/0x110) [] (__mutex_lock_slowpath+0x7c/0x110) from [] (mc13xxx_rtc_read_time+0x1c/0x118) [] (mc13xxx_rtc_read_time+0x1c/0x118) from [] (__rtc_read_time+0x58/0x5c) [] (__rtc_read_time+0x58/0x5c) from [] (rtc_read_time+0x30/0x48) [] (rtc_read_time+0x30/0x48) from [] (__rtc_read_alarm+0x1c/0x290) [] (__rtc_read_alarm+0x1c/0x290) from [] (rtc_device_register+0x150/0x27c) [] (rtc_device_register+0x150/0x27c) from [] (mc13xxx_rtc_probe+0x128/0x17c) [] (mc13xxx_rtc_probe+0x128/0x17c) from [] (platform_drv_probe+0x1c/0x24) [] (platform_drv_probe+0x1c/0x24) from [] (driver_probe_device+0x80/0x1a8) [] (driver_probe_device+0x80/0x1a8) from [] (__driver_attach+0x8c/0x90) [] (__driver_attach+0x8c/0x90) from [] (bus_for_each_dev+0x60/0x8c) [] (bus_for_each_dev+0x60/0x8c) from [] (bus_add_driver+0x180/0x248) [] (bus_add_driver+0x180/0x248) from [] (driver_register+0x70/0x15c) [] (driver_register+0x70/0x15c) from [] (platform_driver_probe+0x18/0x98) [] (platform_driver_probe+0x18/0x98) from [] (do_one_initcall+0x2c/0x168) [] (do_one_initcall+0x2c/0x168) from [] (kernel_init+0xa0/0x150) [] (kernel_init+0xa0/0x150) from [] (kernel_thread_exit+0x0/0x8) Reported-by: Vagrant Cascadian Signed-off-by: Uwe Kleine-König Closes: http://bugs.debian.org/625804 [Tweaked commit log -jstultz] Signed-off-by: John Stultz --- drivers/rtc/rtc-mc13xxx.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/rtc/rtc-mc13xxx.c b/drivers/rtc/rtc-mc13xxx.c index c5ac037..a1a278b 100644 --- a/drivers/rtc/rtc-mc13xxx.c +++ b/drivers/rtc/rtc-mc13xxx.c @@ -349,11 +349,15 @@ static int __devinit mc13xxx_rtc_probe(struct platform_device *pdev) if (ret) goto err_alarm_irq_request; + mc13xxx_unlock(mc13xxx); + priv->rtc = rtc_device_register(pdev->name, &pdev->dev, &mc13xxx_rtc_ops, THIS_MODULE); if (IS_ERR(priv->rtc)) { ret = PTR_ERR(priv->rtc); + mc13xxx_lock(mc13xxx); + mc13xxx_irq_free(mc13xxx, MC13XXX_IRQ_TODA, priv); err_alarm_irq_request: @@ -365,12 +369,12 @@ err_reset_irq_status: mc13xxx_irq_free(mc13xxx, MC13XXX_IRQ_RTCRST, priv); err_reset_irq_request: + mc13xxx_unlock(mc13xxx); + platform_set_drvdata(pdev, NULL); kfree(priv); } - mc13xxx_unlock(mc13xxx); - return ret; } -- cgit v1.1 From 3bd2cbb95543acf44fe123eb9f038de54e655eb4 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Thu, 21 Apr 2011 21:45:08 -0400 Subject: ARM: zImage: make sure the stack is 64-bit aligned With ARMv5+ and EABI, the compiler expects a 64-bit aligned stack so instructions like STRD and LDRD can be used. Without this, mysterious boot failures were seen semi randomly with the LZMA decompressor. While at it, let's align .bss as well. Signed-off-by: Nicolas Pitre Tested-by: Shawn Guo Acked-by: Tony Lindgren CC: stable@kernel.org --- arch/arm/boot/compressed/Makefile | 2 +- arch/arm/boot/compressed/vmlinux.lds.in | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/arm/boot/compressed/Makefile b/arch/arm/boot/compressed/Makefile index 8ebbb51..0c6852d 100644 --- a/arch/arm/boot/compressed/Makefile +++ b/arch/arm/boot/compressed/Makefile @@ -74,7 +74,7 @@ ZTEXTADDR := $(CONFIG_ZBOOT_ROM_TEXT) ZBSSADDR := $(CONFIG_ZBOOT_ROM_BSS) else ZTEXTADDR := 0 -ZBSSADDR := ALIGN(4) +ZBSSADDR := ALIGN(8) endif SEDFLAGS = s/TEXT_START/$(ZTEXTADDR)/;s/BSS_START/$(ZBSSADDR)/ diff --git a/arch/arm/boot/compressed/vmlinux.lds.in b/arch/arm/boot/compressed/vmlinux.lds.in index 5309909..ea80abe 100644 --- a/arch/arm/boot/compressed/vmlinux.lds.in +++ b/arch/arm/boot/compressed/vmlinux.lds.in @@ -54,6 +54,7 @@ SECTIONS .bss : { *(.bss) } _end = .; + . = ALIGN(8); /* the stack must be 64-bit aligned */ .stack : { *(.stack) } .stab 0 : { *(.stab) } -- cgit v1.1 From 7c2527f0c4bf6bd096f58296597e1373387d69fd Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Tue, 26 Apr 2011 05:37:46 -0700 Subject: ARM: zImage: Fix bad SP address after relocating kernel Otherwise cache_clean_flush can overwrite some of the relocated area depending on where the kernel image gets loaded. This fixes booting on n900 after commit 6d7d0ae51574943bf571d269da3243257a2d15db (ARM: 6750/1: improvements to compressed/head.S). Thanks to Aaro Koskinen for debugging the address of the relocated area that gets corrupted, and to Nicolas Pitre for the other uncompress related fixes. Signed-off-by: Tony Lindgren Signed-off-by: Nicolas Pitre --- arch/arm/boot/compressed/head.S | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/arm/boot/compressed/head.S b/arch/arm/boot/compressed/head.S index 84ac4d6..55a5bcb 100644 --- a/arch/arm/boot/compressed/head.S +++ b/arch/arm/boot/compressed/head.S @@ -253,6 +253,11 @@ restart: adr r0, LC0 /* Preserve offset to relocated code. */ sub r6, r9, r6 +#ifndef CONFIG_ZBOOT_ROM + /* cache_clean_flush may use the stack, so relocate it */ + add sp, sp, r6 +#endif + bl cache_clean_flush adr r0, BSYM(restart) -- cgit v1.1 From adcc25915b98e5752d51d66774ec4a61e50af3c5 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Wed, 27 Apr 2011 16:15:11 -0400 Subject: ARM: zImage: make sure not to relocate on top of the relocation code If the zImage load address is slightly below the relocation address, there is a risk for the copied data to overwrite the copy loop or cache flush code that the relocation process requires. Always bump the relocation address by the size of that code to avoid this issue. Noticed by Tony Lindgren . While at it, let's start the copy from the restart symbol which makes the above code size computation possible by the assembler directly (same sections), given that we don't need to preserve the code before that point anyway. And therefore we don't need to carry the _start pointer in r5 anymore. Signed-off-by: Nicolas Pitre Tested-by: Tony Lindgren --- arch/arm/boot/compressed/head.S | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/arch/arm/boot/compressed/head.S b/arch/arm/boot/compressed/head.S index 55a5bcb..53dd5da 100644 --- a/arch/arm/boot/compressed/head.S +++ b/arch/arm/boot/compressed/head.S @@ -187,15 +187,14 @@ not_angel: bl cache_on restart: adr r0, LC0 - ldmia r0, {r1, r2, r3, r5, r6, r9, r11, r12} - ldr sp, [r0, #32] + ldmia r0, {r1, r2, r3, r6, r9, r11, r12} + ldr sp, [r0, #28] /* * We might be running at a different address. We need * to fix up various pointers. */ sub r0, r0, r1 @ calculate the delta offset - add r5, r5, r0 @ _start add r6, r6, r0 @ _edata #ifndef CONFIG_ZBOOT_ROM @@ -214,31 +213,39 @@ restart: adr r0, LC0 /* * Check to see if we will overwrite ourselves. * r4 = final kernel address - * r5 = start of this image * r9 = size of decompressed image * r10 = end of this image, including bss/stack/malloc space if non XIP * We basically want: * r4 >= r10 -> OK - * r4 + image length <= r5 -> OK + * r4 + image length <= current position (pc) -> OK */ cmp r4, r10 bhs wont_overwrite add r10, r4, r9 - cmp r10, r5 + ARM( cmp r10, pc ) + THUMB( mov lr, pc ) + THUMB( cmp r10, lr ) bls wont_overwrite /* * Relocate ourselves past the end of the decompressed kernel. - * r5 = start of this image * r6 = _edata * r10 = end of the decompressed kernel * Because we always copy ahead, we need to do it from the end and go * backward in case the source and destination overlap. */ - /* Round up to next 256-byte boundary. */ - add r10, r10, #256 + /* + * Bump to the next 256-byte boundary with the size of + * the relocation code added. This avoids overwriting + * ourself when the offset is small. + */ + add r10, r10, #((reloc_code_end - restart + 256) & ~255) bic r10, r10, #255 + /* Get start of code we want to copy and align it down. */ + adr r5, restart + bic r5, r5, #31 + sub r9, r6, r5 @ size to copy add r9, r9, #31 @ rounded up to a multiple bic r9, r9, #31 @ ... of 32 bytes @@ -346,7 +353,6 @@ not_relocated: mov r0, #0 LC0: .word LC0 @ r1 .word __bss_start @ r2 .word _end @ r3 - .word _start @ r5 .word _edata @ r6 .word _image_size @ r9 .word _got_start @ r11 @@ -1075,6 +1081,7 @@ memdump: mov r12, r0 #endif .ltorg +reloc_code_end: .align .section ".stack", "aw", %nobits -- cgit v1.1 From ea9df3b168e641e87dbf889afae16390119e4179 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Thu, 21 Apr 2011 22:52:06 -0400 Subject: ARM: zImage: the page table memory must be considered before relocation For correctness, the initial page table located right before the decompressed kernel should be considered when determining if relocation is required. Signed-off-by: Nicolas Pitre Tested-by: Shawn Guo Acked-by: Tony Lindgren --- arch/arm/boot/compressed/head.S | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/arm/boot/compressed/head.S b/arch/arm/boot/compressed/head.S index 53dd5da..d1fd1cf 100644 --- a/arch/arm/boot/compressed/head.S +++ b/arch/arm/boot/compressed/head.S @@ -216,9 +216,10 @@ restart: adr r0, LC0 * r9 = size of decompressed image * r10 = end of this image, including bss/stack/malloc space if non XIP * We basically want: - * r4 >= r10 -> OK + * r4 - 16k page directory >= r10 -> OK * r4 + image length <= current position (pc) -> OK */ + add r10, r10, #16384 cmp r4, r10 bhs wont_overwrite add r10, r4, r9 -- cgit v1.1 From 8fab6af2156c0100f953fd61f4e0b2f82c9776dc Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Fri, 6 May 2011 15:00:09 -0700 Subject: x86: Fix mrst sparse complaints Fix these Sparse complaints: CHECK arch/x86/platform/mrst/mrst.c arch/x86/platform/mrst/mrst.c:197:13: warning: symbol 'mrst_time_init' was not declared. Should it be static? arch/x86/platform/mrst/mrst.c:219:16: warning: symbol 'mrst_arch_setup' was not declared. Should it be static? Acked-by: Alan Cox Cc: Roman Gezikov Cc: Joonas Viskari Cc: Andrew Morton Cc: Allen Kao Signed-off-by: Luis R. Rodriguez Link: http://lkml.kernel.org/r/1304719209-26913-1-git-send-email-lrodriguez@atheros.com Signed-off-by: Ingo Molnar --- arch/x86/platform/mrst/mrst.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/platform/mrst/mrst.c b/arch/x86/platform/mrst/mrst.c index 275dbc1..7000e74b 100644 --- a/arch/x86/platform/mrst/mrst.c +++ b/arch/x86/platform/mrst/mrst.c @@ -194,7 +194,7 @@ static unsigned long __init mrst_calibrate_tsc(void) return 0; } -void __init mrst_time_init(void) +static void __init mrst_time_init(void) { sfi_table_parse(SFI_SIG_MTMR, NULL, NULL, sfi_parse_mtmr); switch (mrst_timer_options) { @@ -216,7 +216,7 @@ void __init mrst_time_init(void) apbt_time_init(); } -void __cpuinit mrst_arch_setup(void) +static void __cpuinit mrst_arch_setup(void) { if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x27) __mrst_cpu_chip = MRST_CPU_CHIP_PENWELL; -- cgit v1.1 From 174a7b1f9692acad7f0ca2b02f696894201a6d94 Mon Sep 17 00:00:00 2001 From: Lin Ming Date: Sat, 7 May 2011 12:41:14 +0800 Subject: perf tools: Makefile: Use gcc to determine ARCH The original Makefile uses "uname -m" to determine ARCH. This causes problem on x86 when compile perf tool on 32 bit userspace with a 64 bit kernel. bench/../../../arch/x86/lib/memcpy_64.S: Assembler messages: bench/../../../arch/x86/lib/memcpy_64.S:28: Error: bad register name `%rdi' This is because "uname -m" returns x86_64 and memcpy_64.S is included in 32 bit build. Reported-by: Riccardo Magliocchetti Signed-off-by: Lin Ming Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Link: http://lkml.kernel.org/r/1304743274.3132.17.camel@localhost Signed-off-by: Ingo Molnar --- tools/perf/Makefile | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/tools/perf/Makefile b/tools/perf/Makefile index 207dee5..0c54256 100644 --- a/tools/perf/Makefile +++ b/tools/perf/Makefile @@ -35,15 +35,21 @@ ARCH ?= $(shell echo $(uname_M) | sed -e s/i.86/i386/ -e s/sun4u/sparc64/ \ -e s/ppc.*/powerpc/ -e s/mips.*/mips/ \ -e s/sh[234].*/sh/ ) +CC = $(CROSS_COMPILE)gcc +AR = $(CROSS_COMPILE)ar + # Additional ARCH settings for x86 ifeq ($(ARCH),i386) ARCH := x86 endif ifeq ($(ARCH),x86_64) - RAW_ARCH := x86_64 - ARCH := x86 - ARCH_CFLAGS := -DARCH_X86_64 - ARCH_INCLUDE = ../../arch/x86/lib/memcpy_64.S + ARCH := x86 + IS_X86_64 := $(shell echo __x86_64__ | ${CC} -E -xc - | tail -n 1) + ifeq (${IS_X86_64}, 1) + RAW_ARCH := x86_64 + ARCH_CFLAGS := -DARCH_X86_64 + ARCH_INCLUDE = ../../arch/x86/lib/memcpy_64.S + endif endif # @@ -119,8 +125,6 @@ lib = lib export prefix bindir sharedir sysconfdir -CC = $(CROSS_COMPILE)gcc -AR = $(CROSS_COMPILE)ar RM = rm -f MKDIR = mkdir FIND = find -- cgit v1.1 From 1217ed1ba5c67393293dfb0f03c353b118dadeb4 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 4 May 2011 21:43:49 -0700 Subject: rcu: permit rcu_read_unlock() to be called while holding runqueue locks Avoid calling into the scheduler while holding core RCU locks. This allows rcu_read_unlock() to be called while holding the runqueue locks, but only as long as there was no chance of the RCU read-side critical section having been preempted. (Otherwise, if RCU priority boosting is enabled, rcu_read_unlock() might call into the scheduler in order to unboost itself, which might allows self-deadlock on the runqueue locks within the scheduler.) Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 44 ++++++++++------------------------ kernel/rcutree.h | 5 +--- kernel/rcutree_plugin.h | 64 ++++++++++++++++--------------------------------- 3 files changed, 34 insertions(+), 79 deletions(-) diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 54ff7eb..5616b17 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1133,22 +1133,7 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) raw_spin_unlock_irqrestore(&rnp->lock, flags); if (need_report & RCU_OFL_TASKS_EXP_GP) rcu_report_exp_rnp(rsp, rnp); - - /* - * If there are no more online CPUs for this rcu_node structure, - * kill the rcu_node structure's kthread. Otherwise, adjust its - * affinity. - */ - t = rnp->node_kthread_task; - if (t != NULL && - rnp->qsmaskinit == 0) { - raw_spin_lock_irqsave(&rnp->lock, flags); - rnp->node_kthread_task = NULL; - raw_spin_unlock_irqrestore(&rnp->lock, flags); - kthread_stop(t); - rcu_stop_boost_kthread(rnp); - } else - rcu_node_kthread_setaffinity(rnp, -1); + rcu_node_kthread_setaffinity(rnp, -1); } /* @@ -1320,8 +1305,7 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)) return; } if (rnp->qsmask == 0) { - rcu_initiate_boost(rnp); - raw_spin_unlock_irqrestore(&rnp->lock, flags); + rcu_initiate_boost(rnp, flags); /* releases rnp->lock */ continue; } cpu = rnp->grplo; @@ -1340,10 +1324,10 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)) raw_spin_unlock_irqrestore(&rnp->lock, flags); } rnp = rcu_get_root(rsp); - raw_spin_lock_irqsave(&rnp->lock, flags); - if (rnp->qsmask == 0) - rcu_initiate_boost(rnp); - raw_spin_unlock_irqrestore(&rnp->lock, flags); + if (rnp->qsmask == 0) { + raw_spin_lock_irqsave(&rnp->lock, flags); + rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */ + } } /* @@ -1497,7 +1481,8 @@ static void invoke_rcu_cpu_kthread(void) /* * Wake up the specified per-rcu_node-structure kthread. - * The caller must hold ->lock. + * Because the per-rcu_node kthreads are immortal, we don't need + * to do anything to keep them alive. */ static void invoke_rcu_node_kthread(struct rcu_node *rnp) { @@ -1546,8 +1531,8 @@ static void rcu_cpu_kthread_timer(unsigned long arg) raw_spin_lock_irqsave(&rnp->lock, flags); rnp->wakemask |= rdp->grpmask; - invoke_rcu_node_kthread(rnp); raw_spin_unlock_irqrestore(&rnp->lock, flags); + invoke_rcu_node_kthread(rnp); } /* @@ -1694,16 +1679,12 @@ static int rcu_node_kthread(void *arg) for (;;) { rnp->node_kthread_status = RCU_KTHREAD_WAITING; - wait_event_interruptible(rnp->node_wq, rnp->wakemask != 0 || - kthread_should_stop()); - if (kthread_should_stop()) - break; + wait_event_interruptible(rnp->node_wq, rnp->wakemask != 0); rnp->node_kthread_status = RCU_KTHREAD_RUNNING; raw_spin_lock_irqsave(&rnp->lock, flags); mask = rnp->wakemask; rnp->wakemask = 0; - rcu_initiate_boost(rnp); - raw_spin_unlock_irqrestore(&rnp->lock, flags); + rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */ for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) { if ((mask & 0x1) == 0) continue; @@ -1719,6 +1700,7 @@ static int rcu_node_kthread(void *arg) preempt_enable(); } } + /* NOTREACHED */ rnp->node_kthread_status = RCU_KTHREAD_STOPPED; return 0; } @@ -1738,7 +1720,7 @@ static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) int cpu; unsigned long mask = rnp->qsmaskinit; - if (rnp->node_kthread_task == NULL || mask == 0) + if (rnp->node_kthread_task == NULL) return; if (!alloc_cpumask_var(&cm, GFP_KERNEL)) return; diff --git a/kernel/rcutree.h b/kernel/rcutree.h index a6a9717..93d4a1c 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -444,15 +444,12 @@ static void rcu_preempt_send_cbs_to_online(void); static void __init __rcu_init_preempt(void); static void rcu_needs_cpu_flush(void); static void __init rcu_init_boost_waitqueue(struct rcu_node *rnp); -static void rcu_initiate_boost(struct rcu_node *rnp); +static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, cpumask_var_t cm); static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, struct rcu_node *rnp, int rnp_index); -#ifdef CONFIG_HOTPLUG_CPU -static void rcu_stop_boost_kthread(struct rcu_node *rnp); -#endif /* #ifdef CONFIG_HOTPLUG_CPU */ #endif /* #ifndef RCU_TREE_NONCORE */ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index f629479..ed339702 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -711,15 +711,17 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp) static void sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) { + unsigned long flags; int must_wait = 0; - raw_spin_lock(&rnp->lock); /* irqs already disabled */ - if (!list_empty(&rnp->blkd_tasks)) { + raw_spin_lock_irqsave(&rnp->lock, flags); + if (list_empty(&rnp->blkd_tasks)) + raw_spin_unlock_irqrestore(&rnp->lock, flags); + else { rnp->exp_tasks = rnp->blkd_tasks.next; - rcu_initiate_boost(rnp); + rcu_initiate_boost(rnp, flags); /* releases rnp->lock */ must_wait = 1; } - raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ if (!must_wait) rcu_report_exp_rnp(rsp, rnp); } @@ -1179,12 +1181,7 @@ static int rcu_boost(struct rcu_node *rnp) */ static void rcu_boost_kthread_timer(unsigned long arg) { - unsigned long flags; - struct rcu_node *rnp = (struct rcu_node *)arg; - - raw_spin_lock_irqsave(&rnp->lock, flags); - invoke_rcu_node_kthread(rnp); - raw_spin_unlock_irqrestore(&rnp->lock, flags); + invoke_rcu_node_kthread((struct rcu_node *)arg); } /* @@ -1200,10 +1197,7 @@ static int rcu_boost_kthread(void *arg) for (;;) { rnp->boost_kthread_status = RCU_KTHREAD_WAITING; wait_event_interruptible(rnp->boost_wq, rnp->boost_tasks || - rnp->exp_tasks || - kthread_should_stop()); - if (kthread_should_stop()) - break; + rnp->exp_tasks); rnp->boost_kthread_status = RCU_KTHREAD_RUNNING; more2boost = rcu_boost(rnp); if (more2boost) @@ -1215,7 +1209,7 @@ static int rcu_boost_kthread(void *arg) spincnt = 0; } } - rnp->boost_kthread_status = RCU_KTHREAD_STOPPED; + /* NOTREACHED */ return 0; } @@ -1225,14 +1219,17 @@ static int rcu_boost_kthread(void *arg) * kthread to start boosting them. If there is an expedited grace * period in progress, it is always time to boost. * - * The caller must hold rnp->lock. + * The caller must hold rnp->lock, which this function releases, + * but irqs remain disabled. The ->boost_kthread_task is immortal, + * so we don't need to worry about it going away. */ -static void rcu_initiate_boost(struct rcu_node *rnp) +static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) { struct task_struct *t; if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) { rnp->n_balk_exp_gp_tasks++; + raw_spin_unlock_irqrestore(&rnp->lock, flags); return; } if (rnp->exp_tasks != NULL || @@ -1242,11 +1239,14 @@ static void rcu_initiate_boost(struct rcu_node *rnp) ULONG_CMP_GE(jiffies, rnp->boost_time))) { if (rnp->exp_tasks == NULL) rnp->boost_tasks = rnp->gp_tasks; + raw_spin_unlock_irqrestore(&rnp->lock, flags); t = rnp->boost_kthread_task; if (t != NULL) wake_up_process(t); - } else + } else { rcu_initiate_boost_trace(rnp); + raw_spin_unlock_irqrestore(&rnp->lock, flags); + } } /* @@ -1312,27 +1312,11 @@ static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, return 0; } -#ifdef CONFIG_HOTPLUG_CPU - -static void rcu_stop_boost_kthread(struct rcu_node *rnp) -{ - unsigned long flags; - struct task_struct *t; - - raw_spin_lock_irqsave(&rnp->lock, flags); - t = rnp->boost_kthread_task; - rnp->boost_kthread_task = NULL; - raw_spin_unlock_irqrestore(&rnp->lock, flags); - if (t != NULL) - kthread_stop(t); -} - -#endif /* #ifdef CONFIG_HOTPLUG_CPU */ - #else /* #ifdef CONFIG_RCU_BOOST */ -static void rcu_initiate_boost(struct rcu_node *rnp) +static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) { + raw_spin_unlock_irqrestore(&rnp->lock, flags); } static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, @@ -1355,14 +1339,6 @@ static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, return 0; } -#ifdef CONFIG_HOTPLUG_CPU - -static void rcu_stop_boost_kthread(struct rcu_node *rnp) -{ -} - -#endif /* #ifdef CONFIG_HOTPLUG_CPU */ - #endif /* #else #ifdef CONFIG_RCU_BOOST */ #ifndef CONFIG_SMP -- cgit v1.1 From 30088ad815802f850f26114920ccf9effd4bc520 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 15 Mar 2011 17:53:46 +0800 Subject: cgroup,rcu: convert call_rcu(free_css_set_rcu) to kfree_rcu() The rcu callback free_css_set_rcu() just calls a kfree(), so we use kfree_rcu() instead of the call_rcu(free_css_set_rcu). Signed-off-by: Lai Jiangshan Acked-by: Paul Menage Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/cgroup.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 25c7eb5..d5160a8 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -326,12 +326,6 @@ static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[]) return &css_set_table[index]; } -static void free_css_set_rcu(struct rcu_head *obj) -{ - struct css_set *cg = container_of(obj, struct css_set, rcu_head); - kfree(cg); -} - /* We don't maintain the lists running through each css_set to its * task until after the first call to cgroup_iter_start(). This * reduces the fork()/exit() overhead for people who have cgroups @@ -375,7 +369,7 @@ static void __put_css_set(struct css_set *cg, int taskexit) } write_unlock(&css_set_lock); - call_rcu(&cg->rcu_head, free_css_set_rcu); + kfree_rcu(cg, rcu_head); } /* -- cgit v1.1 From f2da1c40dc003939f616f27a103b2592f1424b07 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 15 Mar 2011 17:55:16 +0800 Subject: cgroup,rcu: convert call_rcu(free_cgroup_rcu) to kfree_rcu() The rcu callback free_cgroup_rcu() just calls a kfree(), so we use kfree_rcu() instead of the call_rcu(free_cgroup_rcu). Signed-off-by: Lai Jiangshan Acked-by: Paul Menage Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/cgroup.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d5160a8..20451ce 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -806,13 +806,6 @@ static int cgroup_call_pre_destroy(struct cgroup *cgrp) return ret; } -static void free_cgroup_rcu(struct rcu_head *obj) -{ - struct cgroup *cgrp = container_of(obj, struct cgroup, rcu_head); - - kfree(cgrp); -} - static void cgroup_diput(struct dentry *dentry, struct inode *inode) { /* is dentry a directory ? if so, kfree() associated cgroup */ @@ -850,7 +843,7 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) */ BUG_ON(!list_empty(&cgrp->pidlists)); - call_rcu(&cgrp->rcu_head, free_cgroup_rcu); + kfree_rcu(cgrp, rcu_head); } iput(inode); } -- cgit v1.1 From 025cea99db3fb110ebc8ede5ff647833fab9574f Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 15 Mar 2011 17:56:10 +0800 Subject: cgroup,rcu: convert call_rcu(__free_css_id_cb) to kfree_rcu() The rcu callback __free_css_id_cb() just calls a kfree(), so we use kfree_rcu() instead of the call_rcu(__free_css_id_cb). Signed-off-by: Lai Jiangshan Acked-by: Paul Menage Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/cgroup.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 20451ce..909a355 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4610,14 +4610,6 @@ bool css_is_ancestor(struct cgroup_subsys_state *child, return ret; } -static void __free_css_id_cb(struct rcu_head *head) -{ - struct css_id *id; - - id = container_of(head, struct css_id, rcu_head); - kfree(id); -} - void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css) { struct css_id *id = css->id; @@ -4632,7 +4624,7 @@ void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css) spin_lock(&ss->id_lock); idr_remove(&ss->idr, id->id); spin_unlock(&ss->id_lock); - call_rcu(&id->rcu_head, __free_css_id_cb); + kfree_rcu(id, rcu_head); } EXPORT_SYMBOL_GPL(free_css_id); -- cgit v1.1 From f5c8593c107500979909bd51c85e74bb2eaffbaa Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 15 Mar 2011 17:57:04 +0800 Subject: net,rcu: convert call_rcu(tcf_common_free_rcu) to kfree_rcu() The rcu callback tcf_common_free_rcu() just calls a kfree(), so we use kfree_rcu() instead of the call_rcu(tcf_common_free_rcu). Signed-off-by: Lai Jiangshan Acked-by: David S. Miller Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- net/sched/act_api.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 14b42f4..a606025 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -26,11 +26,6 @@ #include #include -static void tcf_common_free_rcu(struct rcu_head *head) -{ - kfree(container_of(head, struct tcf_common, tcfc_rcu)); -} - void tcf_hash_destroy(struct tcf_common *p, struct tcf_hashinfo *hinfo) { unsigned int h = tcf_hash(p->tcfc_index, hinfo->hmask); @@ -47,7 +42,7 @@ void tcf_hash_destroy(struct tcf_common *p, struct tcf_hashinfo *hinfo) * gen_estimator est_timer() might access p->tcfc_lock * or bstats, wait a RCU grace period before freeing p */ - call_rcu(&p->tcfc_rcu, tcf_common_free_rcu); + kfree_rcu(p, tcfc_rcu); return; } } -- cgit v1.1 From 5957b1ac52c5fde2afe5d80abe2a1cb82a9b4f20 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 15 Mar 2011 17:58:00 +0800 Subject: net,rcu: convert call_rcu(tcf_police_free_rcu) to kfree_rcu() [PATCH 05/17] net,rcu: convert call_rcu(tcf_police_free_rcu) to kfree_rcu() The rcu callback tcf_police_free_rcu() just calls a kfree(), so we use kfree_rcu() instead of the call_rcu(tcf_police_free_rcu). Signed-off-by: Lai Jiangshan Acked-by: David S. Miller Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- net/sched/act_police.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/net/sched/act_police.c b/net/sched/act_police.c index 8a16307..d6bcd64 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -96,11 +96,6 @@ nla_put_failure: goto done; } -static void tcf_police_free_rcu(struct rcu_head *head) -{ - kfree(container_of(head, struct tcf_police, tcf_rcu)); -} - static void tcf_police_destroy(struct tcf_police *p) { unsigned int h = tcf_hash(p->tcf_index, POL_TAB_MASK); @@ -121,7 +116,7 @@ static void tcf_police_destroy(struct tcf_police *p) * gen_estimator est_timer() might access p->tcf_lock * or bstats, wait a RCU grace period before freeing p */ - call_rcu(&p->tcf_rcu, tcf_police_free_rcu); + kfree_rcu(p, tcf_rcu); return; } } -- cgit v1.1 From 38f57d1a4b4c13db92c7f6300a4e4ae70ff94d2b Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 15 Mar 2011 17:59:14 +0800 Subject: net,rcu: convert call_rcu(in6_dev_finish_destroy_rcu) to kfree_rcu() The rcu callback in6_dev_finish_destroy_rcu() just calls a kfree(), so we use kfree_rcu() instead of the call_rcu(in6_dev_finish_destroy_rcu). Signed-off-by: Lai Jiangshan Acked-by: David S. Miller Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- net/ipv6/addrconf.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index a7bda07..c406f56 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -317,12 +317,6 @@ static void snmp6_free_dev(struct inet6_dev *idev) /* Nobody refers to this device, we may destroy it. */ -static void in6_dev_finish_destroy_rcu(struct rcu_head *head) -{ - struct inet6_dev *idev = container_of(head, struct inet6_dev, rcu); - kfree(idev); -} - void in6_dev_finish_destroy(struct inet6_dev *idev) { struct net_device *dev = idev->dev; @@ -339,7 +333,7 @@ void in6_dev_finish_destroy(struct inet6_dev *idev) return; } snmp6_free_dev(idev); - call_rcu(&idev->rcu, in6_dev_finish_destroy_rcu); + kfree_rcu(idev, rcu); } EXPORT_SYMBOL(in6_dev_finish_destroy); -- cgit v1.1 From e57859854aaac9b7ce0db3fe4959190a1a80b60a Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 15 Mar 2011 18:00:14 +0800 Subject: net,rcu: convert call_rcu(inet6_ifa_finish_destroy_rcu) to kfree_rcu() The rcu callback inet6_ifa_finish_destroy_rcu() just calls a kfree(), so we use kfree_rcu() instead of the call_rcu(inet6_ifa_finish_destroy_rcu). Signed-off-by: Lai Jiangshan Acked-by: David S. Miller Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- net/ipv6/addrconf.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index c406f56..8f13d88 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -529,12 +529,6 @@ static int addrconf_fixup_forwarding(struct ctl_table *table, int *p, int old) } #endif -static void inet6_ifa_finish_destroy_rcu(struct rcu_head *head) -{ - struct inet6_ifaddr *ifp = container_of(head, struct inet6_ifaddr, rcu); - kfree(ifp); -} - /* Nobody refers to this ifaddr, destroy it */ void inet6_ifa_finish_destroy(struct inet6_ifaddr *ifp) { @@ -555,7 +549,7 @@ void inet6_ifa_finish_destroy(struct inet6_ifaddr *ifp) } dst_release(&ifp->rt->dst); - call_rcu(&ifp->rcu, inet6_ifa_finish_destroy_rcu); + kfree_rcu(ifp, rcu); } static void -- cgit v1.1 From 37b6b935e96e837ccc60812c03e9f92e7dce2e61 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 15 Mar 2011 18:01:42 +0800 Subject: net,rcu: convert call_rcu(listeners_free_rcu) to kfree_rcu() The rcu callback listeners_free_rcu() just calls a kfree(), so we use kfree_rcu() instead of the call_rcu(listeners_free_rcu). Signed-off-by: Lai Jiangshan Acked-by: David S. Miller Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- net/netlink/af_netlink.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index c8f35b5..5fe4f3b 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1566,12 +1566,6 @@ netlink_kernel_release(struct sock *sk) } EXPORT_SYMBOL(netlink_kernel_release); - -static void listeners_free_rcu(struct rcu_head *head) -{ - kfree(container_of(head, struct listeners, rcu)); -} - int __netlink_change_ngroups(struct sock *sk, unsigned int groups) { struct listeners *new, *old; @@ -1588,7 +1582,7 @@ int __netlink_change_ngroups(struct sock *sk, unsigned int groups) memcpy(new->masks, old->masks, NLGRPSZ(tbl->groups)); rcu_assign_pointer(tbl->listeners, new); - call_rcu(&old->rcu, listeners_free_rcu); + kfree_rcu(old, rcu); } tbl->groups = groups; -- cgit v1.1 From 1231f0baa547a541a7481119323b7f964dda4788 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 15 Mar 2011 18:05:02 +0800 Subject: net,rcu: convert call_rcu(sctp_local_addr_free) to kfree_rcu() The rcu callback sctp_local_addr_free() just calls a kfree(), so we use kfree_rcu() instead of the call_rcu(sctp_local_addr_free). Signed-off-by: Lai Jiangshan Acked-by: David S. Miller Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- include/net/sctp/sctp.h | 1 - net/sctp/bind_addr.c | 2 +- net/sctp/ipv6.c | 2 +- net/sctp/protocol.c | 9 +-------- 4 files changed, 3 insertions(+), 11 deletions(-) diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h index 505845d..01e094c 100644 --- a/include/net/sctp/sctp.h +++ b/include/net/sctp/sctp.h @@ -115,7 +115,6 @@ * sctp/protocol.c */ extern struct sock *sctp_get_ctl_sock(void); -extern void sctp_local_addr_free(struct rcu_head *head); extern int sctp_copy_local_addr_list(struct sctp_bind_addr *, sctp_scope_t, gfp_t gfp, int flags); diff --git a/net/sctp/bind_addr.c b/net/sctp/bind_addr.c index faf71d1..3c06c87 100644 --- a/net/sctp/bind_addr.c +++ b/net/sctp/bind_addr.c @@ -219,7 +219,7 @@ int sctp_del_bind_addr(struct sctp_bind_addr *bp, union sctp_addr *del_addr) } if (found) { - call_rcu(&addr->rcu, sctp_local_addr_free); + kfree_rcu(addr, rcu); SCTP_DBG_OBJCNT_DEC(addr); return 0; } diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index 865ce7b..185fe05 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -123,7 +123,7 @@ static int sctp_inet6addr_event(struct notifier_block *this, unsigned long ev, } spin_unlock_bh(&sctp_local_addr_lock); if (found) - call_rcu(&addr->rcu, sctp_local_addr_free); + kfree_rcu(addr, rcu); break; } diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index d5bf91d..065d999 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -230,13 +230,6 @@ static void sctp_free_local_addr_list(void) } } -void sctp_local_addr_free(struct rcu_head *head) -{ - struct sctp_sockaddr_entry *e = container_of(head, - struct sctp_sockaddr_entry, rcu); - kfree(e); -} - /* Copy the local addresses which are valid for 'scope' into 'bp'. */ int sctp_copy_local_addr_list(struct sctp_bind_addr *bp, sctp_scope_t scope, gfp_t gfp, int copy_flags) @@ -681,7 +674,7 @@ static int sctp_inetaddr_event(struct notifier_block *this, unsigned long ev, } spin_unlock_bh(&sctp_local_addr_lock); if (found) - call_rcu(&addr->rcu, sctp_local_addr_free); + kfree_rcu(addr, rcu); break; } -- cgit v1.1 From 217f18639bc18ba4bbb67481113037344c148938 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 15 Mar 2011 18:08:58 +0800 Subject: net,rcu: convert call_rcu(ha_rcu_free) to kfree_rcu() The rcu callback ha_rcu_free() just calls a kfree(), so we use kfree_rcu() instead of the call_rcu(ha_rcu_free). Signed-off-by: Lai Jiangshan Acked-by: David S. Miller Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- net/core/dev_addr_lists.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c index 7b39f3e..e2e6693 100644 --- a/net/core/dev_addr_lists.c +++ b/net/core/dev_addr_lists.c @@ -68,14 +68,6 @@ static int __hw_addr_add(struct netdev_hw_addr_list *list, unsigned char *addr, return __hw_addr_add_ex(list, addr, addr_len, addr_type, false); } -static void ha_rcu_free(struct rcu_head *head) -{ - struct netdev_hw_addr *ha; - - ha = container_of(head, struct netdev_hw_addr, rcu_head); - kfree(ha); -} - static int __hw_addr_del_ex(struct netdev_hw_addr_list *list, unsigned char *addr, int addr_len, unsigned char addr_type, bool global) @@ -94,7 +86,7 @@ static int __hw_addr_del_ex(struct netdev_hw_addr_list *list, if (--ha->refcount) return 0; list_del_rcu(&ha->list); - call_rcu(&ha->rcu_head, ha_rcu_free); + kfree_rcu(ha, rcu_head); list->count--; return 0; } @@ -197,7 +189,7 @@ void __hw_addr_flush(struct netdev_hw_addr_list *list) list_for_each_entry_safe(ha, tmp, &list->list, list) { list_del_rcu(&ha->list); - call_rcu(&ha->rcu_head, ha_rcu_free); + kfree_rcu(ha, rcu_head); } list->count = 0; } -- cgit v1.1 From 1e547757eca3c30eeeac526716e4ae833c2a9a2f Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 15 Mar 2011 18:10:12 +0800 Subject: net,rcu: convert call_rcu(dn_dev_free_ifa_rcu) to kfree_rcu() The rcu callback dn_dev_free_ifa_rcu() just calls a kfree(), so we use kfree_rcu() instead of the call_rcu(dn_dev_free_ifa_rcu). Signed-off-by: Lai Jiangshan Acked-by: David S. Miller Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- net/decnet/dn_dev.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c index 0dcaa90..4c27615 100644 --- a/net/decnet/dn_dev.c +++ b/net/decnet/dn_dev.c @@ -332,14 +332,9 @@ static struct dn_ifaddr *dn_dev_alloc_ifa(void) return ifa; } -static void dn_dev_free_ifa_rcu(struct rcu_head *head) -{ - kfree(container_of(head, struct dn_ifaddr, rcu)); -} - static void dn_dev_free_ifa(struct dn_ifaddr *ifa) { - call_rcu(&ifa->rcu, dn_dev_free_ifa_rcu); + kfree_rcu(ifa, rcu); } static void dn_dev_del_ifa(struct dn_dev *dn_db, struct dn_ifaddr __rcu **ifap, int destroy) -- cgit v1.1 From 75ef0368d182785c7c5c06ac11081e31257a313e Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 15 Mar 2011 18:11:46 +0800 Subject: net,act_police,rcu: remove rcu_barrier() There is no callback of this module maybe queued since we use kfree_rcu(), we can safely remove the rcu_barrier(). Signed-off-by: Lai Jiangshan Acked-by: David S. Miller Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- net/sched/act_police.c | 1 - 1 file changed, 1 deletion(-) diff --git a/net/sched/act_police.c b/net/sched/act_police.c index d6bcd64..b3b9b32 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -396,7 +396,6 @@ static void __exit police_cleanup_module(void) { tcf_unregister_action(&act_police_ops); - rcu_barrier(); /* Wait for completion of call_rcu()'s (tcf_police_free_rcu) */ } module_init(police_init_module); -- cgit v1.1 From 3acb458c32293405cf68985b7b3ac5dc0a5e7929 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 18 Mar 2011 12:11:07 +0800 Subject: security,rcu: convert call_rcu(user_update_rcu_disposal) to kfree_rcu() The rcu callback user_update_rcu_disposal() just calls a kfree(), so we use kfree_rcu() instead of the call_rcu(user_update_rcu_disposal). Signed-off-by: Lai Jiangshan Signed-off-by: Paul E. McKenney Acked-by: David Howells Reviewed-by: Josh Triplett --- security/keys/user_defined.c | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/security/keys/user_defined.c b/security/keys/user_defined.c index c6ca866..f66baf4 100644 --- a/security/keys/user_defined.c +++ b/security/keys/user_defined.c @@ -69,18 +69,6 @@ error: EXPORT_SYMBOL_GPL(user_instantiate); /* - * dispose of the old data from an updated user defined key - */ -static void user_update_rcu_disposal(struct rcu_head *rcu) -{ - struct user_key_payload *upayload; - - upayload = container_of(rcu, struct user_key_payload, rcu); - - kfree(upayload); -} - -/* * update a user defined key * - the key's semaphore is write-locked */ @@ -114,7 +102,7 @@ int user_update(struct key *key, const void *data, size_t datalen) key->expiry = 0; } - call_rcu(&zap->rcu, user_update_rcu_disposal); + kfree_rcu(zap, rcu); error: return ret; @@ -145,7 +133,7 @@ void user_revoke(struct key *key) if (upayload) { rcu_assign_pointer(key->payload.data, NULL); - call_rcu(&upayload->rcu, user_update_rcu_disposal); + kfree_rcu(upayload, rcu); } } -- cgit v1.1 From 4670994d150a86ebd53ab353a2af517c5465bfaf Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 18 Mar 2011 11:42:11 +0800 Subject: net,rcu: convert call_rcu(fc_rport_free_rcu) to kfree_rcu() The rcu callback fc_rport_free_rcu() just calls a kfree(), so we use kfree_rcu() instead of the call_rcu(fc_rport_free_rcu). Signed-off-by: Lai Jiangshan Acked-by: David S. Miller Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- net/ipv4/fib_semantics.c | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 641a5a2..33e2c35 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -141,18 +141,8 @@ const struct fib_prop fib_props[RTN_MAX + 1] = { }, }; - /* Release a nexthop info record */ -static void free_fib_info_rcu(struct rcu_head *head) -{ - struct fib_info *fi = container_of(head, struct fib_info, rcu); - - if (fi->fib_metrics != (u32 *) dst_default_metrics) - kfree(fi->fib_metrics); - kfree(fi); -} - void free_fib_info(struct fib_info *fi) { if (fi->fib_dead == 0) { @@ -166,7 +156,7 @@ void free_fib_info(struct fib_info *fi) } endfor_nexthops(fi); fib_info_cnt--; release_net(fi->fib_net); - call_rcu(&fi->rcu, free_fib_info_rcu); + kfree_rcu(fi, rcu); } void fib_release_info(struct fib_info *fi) -- cgit v1.1 From bceb0f4512d448763fe98c9f37504c98bbebbed6 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 18 Mar 2011 11:42:34 +0800 Subject: net,rcu: convert call_rcu(__leaf_info_free_rcu) to kfree_rcu() The rcu callback __leaf_info_free_rcu() just calls a kfree(), so we use kfree_rcu() instead of the call_rcu(__leaf_info_free_rcu). Signed-off-by: Lai Jiangshan Acked-by: David S. Miller Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- net/ipv4/fib_trie.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 5fe9b8b..11d4d28 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -350,14 +350,9 @@ static inline void free_leaf(struct leaf *l) call_rcu_bh(&l->rcu, __leaf_free_rcu); } -static void __leaf_info_free_rcu(struct rcu_head *head) -{ - kfree(container_of(head, struct leaf_info, rcu)); -} - static inline void free_leaf_info(struct leaf_info *leaf) { - call_rcu(&leaf->rcu, __leaf_info_free_rcu); + kfree_rcu(leaf, rcu); } static struct tnode *tnode_alloc(size_t size) -- cgit v1.1 From dad178fcd5b295f5f350a46a5eaf2f28e847bb5a Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 18 Mar 2011 11:43:26 +0800 Subject: net,rcu: convert call_rcu(__gen_kill_estimator) to kfree_rcu() The rcu callback __gen_kill_estimator() just calls a kfree(), so we use kfree_rcu() instead of the call_rcu(__gen_kill_estimator). Signed-off-by: Lai Jiangshan Acked-by: David S. Miller Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- net/core/gen_estimator.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c index 7c23733..43b03dd 100644 --- a/net/core/gen_estimator.c +++ b/net/core/gen_estimator.c @@ -249,13 +249,6 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats, } EXPORT_SYMBOL(gen_new_estimator); -static void __gen_kill_estimator(struct rcu_head *head) -{ - struct gen_estimator *e = container_of(head, - struct gen_estimator, e_rcu); - kfree(e); -} - /** * gen_kill_estimator - remove a rate estimator * @bstats: basic statistics @@ -279,7 +272,7 @@ void gen_kill_estimator(struct gnet_stats_basic_packed *bstats, write_unlock(&est_lock); list_del_rcu(&e->list); - call_rcu(&e->e_rcu, __gen_kill_estimator); + kfree_rcu(e, e_rcu); } spin_unlock_bh(&est_tree_lock); } -- cgit v1.1 From 42ea299d3f1941f8c3dbc1fe031d682f8ad45005 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 18 Mar 2011 11:44:08 +0800 Subject: net,rcu: convert call_rcu(ip_mc_list_reclaim) to kfree_rcu() The rcu callback ip_mc_list_reclaim() just calls a kfree(), so we use kfree_rcu() instead of the call_rcu(ip_mc_list_reclaim). Signed-off-by: Lai Jiangshan Acked-by: David S. Miller Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- net/ipv4/igmp.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 1fd3d9c..f22f30e 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -149,17 +149,11 @@ static void ip_mc_clear_src(struct ip_mc_list *pmc); static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode, int sfcount, __be32 *psfsrc, int delta); - -static void ip_mc_list_reclaim(struct rcu_head *head) -{ - kfree(container_of(head, struct ip_mc_list, rcu)); -} - static void ip_ma_put(struct ip_mc_list *im) { if (atomic_dec_and_test(&im->refcnt)) { in_dev_put(im->interface); - call_rcu(&im->rcu, ip_mc_list_reclaim); + kfree_rcu(im, rcu); } } -- cgit v1.1 From 7519cce48fb0a314dac473bdfba203b787435857 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 18 Mar 2011 11:44:46 +0800 Subject: net,rcu: convert call_rcu(ip_sf_socklist_reclaim) to kfree_rcu() The rcu callback ip_sf_socklist_reclaim() just calls a kfree(), so we use kfree_rcu() instead of the call_rcu(ip_sf_socklist_reclaim). Signed-off-by: Lai Jiangshan Acked-by: David S. Miller Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- net/ipv4/igmp.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index f22f30e..85a6e5d 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -1830,12 +1830,6 @@ done: } EXPORT_SYMBOL(ip_mc_join_group); -static void ip_sf_socklist_reclaim(struct rcu_head *rp) -{ - kfree(container_of(rp, struct ip_sf_socklist, rcu)); - /* sk_omem_alloc should have been decreased by the caller*/ -} - static int ip_mc_leave_src(struct sock *sk, struct ip_mc_socklist *iml, struct in_device *in_dev) { @@ -1852,7 +1846,7 @@ static int ip_mc_leave_src(struct sock *sk, struct ip_mc_socklist *iml, rcu_assign_pointer(iml->sflist, NULL); /* decrease mem now to avoid the memleak warning */ atomic_sub(IP_SFLSIZE(psf->sl_max), &sk->sk_omem_alloc); - call_rcu(&psf->rcu, ip_sf_socklist_reclaim); + kfree_rcu(psf, rcu); return err; } @@ -2020,7 +2014,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct newpsl->sl_addr[i] = psl->sl_addr[i]; /* decrease mem now to avoid the memleak warning */ atomic_sub(IP_SFLSIZE(psl->sl_max), &sk->sk_omem_alloc); - call_rcu(&psl->rcu, ip_sf_socklist_reclaim); + kfree_rcu(psl, rcu); } rcu_assign_pointer(pmc->sflist, newpsl); psl = newpsl; @@ -2121,7 +2115,7 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex) psl->sl_count, psl->sl_addr, 0); /* decrease mem now to avoid the memleak warning */ atomic_sub(IP_SFLSIZE(psl->sl_max), &sk->sk_omem_alloc); - call_rcu(&psl->rcu, ip_sf_socklist_reclaim); + kfree_rcu(psl, rcu); } else (void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode, 0, NULL, 0); -- cgit v1.1 From 10d50e748d983ff1003e0cf556ea17fa8f32c382 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 18 Mar 2011 11:45:08 +0800 Subject: net,rcu: convert call_rcu(ip_mc_socklist_reclaim) to kfree_rcu() The rcu callback ip_mc_socklist_reclaim() just calls a kfree(), so we use kfree_rcu() instead of the call_rcu(ip_mc_socklist_reclaim). Signed-off-by: Lai Jiangshan Acked-by: David S. Miller Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- net/ipv4/igmp.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 85a6e5d..8f62d66 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -1850,14 +1850,6 @@ static int ip_mc_leave_src(struct sock *sk, struct ip_mc_socklist *iml, return err; } - -static void ip_mc_socklist_reclaim(struct rcu_head *rp) -{ - kfree(container_of(rp, struct ip_mc_socklist, rcu)); - /* sk_omem_alloc should have been decreased by the caller*/ -} - - /* * Ask a socket to leave a group. */ @@ -1897,7 +1889,7 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr) rtnl_unlock(); /* decrease mem now to avoid the memleak warning */ atomic_sub(sizeof(*iml), &sk->sk_omem_alloc); - call_rcu(&iml->rcu, ip_mc_socklist_reclaim); + kfree_rcu(iml, rcu); return 0; } if (!in_dev) @@ -2312,7 +2304,7 @@ void ip_mc_drop_socket(struct sock *sk) ip_mc_dec_group(in_dev, iml->multi.imr_multiaddr.s_addr); /* decrease mem now to avoid the memleak warning */ atomic_sub(sizeof(*iml), &sk->sk_omem_alloc); - call_rcu(&iml->rcu, ip_mc_socklist_reclaim); + kfree_rcu(iml, rcu); } rtnl_unlock(); } -- cgit v1.1 From fa81c0e1d2d2176f1136c72c080c9ea4a98be347 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 18 Mar 2011 11:39:43 +0800 Subject: net,rcu: convert call_rcu(free_dm_hw_stat) to kfree_rcu() The rcu callback free_dm_hw_stat() just calls a kfree(), so we use kfree_rcu() instead of the call_rcu(free_dm_hw_stat). Signed-off-by: Lai Jiangshan Acked-by: Neil Horman Acked-by: David S. Miller Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- net/core/drop_monitor.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index 706502f..7f36b38 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -207,14 +207,6 @@ static void trace_napi_poll_hit(void *ignore, struct napi_struct *napi) rcu_read_unlock(); } - -static void free_dm_hw_stat(struct rcu_head *head) -{ - struct dm_hw_stat_delta *n; - n = container_of(head, struct dm_hw_stat_delta, rcu); - kfree(n); -} - static int set_all_monitor_traces(int state) { int rc = 0; @@ -245,7 +237,7 @@ static int set_all_monitor_traces(int state) list_for_each_entry_safe(new_stat, temp, &hw_stats_list, list) { if (new_stat->dev == NULL) { list_del_rcu(&new_stat->list); - call_rcu(&new_stat->rcu, free_dm_hw_stat); + kfree_rcu(new_stat, rcu); } } break; @@ -314,7 +306,7 @@ static int dropmon_net_event(struct notifier_block *ev_block, new_stat->dev = NULL; if (trace_state == TRACE_OFF) { list_del_rcu(&new_stat->list); - call_rcu(&new_stat->rcu, free_dm_hw_stat); + kfree_rcu(new_stat, rcu); break; } } -- cgit v1.1 From bcec8b6531d481ced35506517af69adb2399f2a4 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 18 Mar 2011 11:57:21 +0800 Subject: ixgbe,rcu: convert call_rcu(ring_free_rcu) to kfree_rcu() The rcu callback ring_free_rcu() just calls a kfree(), so we use kfree_rcu() instead of the call_rcu(ring_free_rcu). Signed-off-by: Lai Jiangshan Acked-by: David S. Miller Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- drivers/net/ixgbe/ixgbe_main.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/drivers/net/ixgbe/ixgbe_main.c b/drivers/net/ixgbe/ixgbe_main.c index 6f8adc7..e145f2c 100644 --- a/drivers/net/ixgbe/ixgbe_main.c +++ b/drivers/net/ixgbe/ixgbe_main.c @@ -5100,11 +5100,6 @@ err_set_interrupt: return err; } -static void ring_free_rcu(struct rcu_head *head) -{ - kfree(container_of(head, struct ixgbe_ring, rcu)); -} - /** * ixgbe_clear_interrupt_scheme - Clear the current interrupt scheme settings * @adapter: board private structure to clear interrupt scheme on @@ -5126,7 +5121,7 @@ void ixgbe_clear_interrupt_scheme(struct ixgbe_adapter *adapter) /* ixgbe_get_stats64() might access this ring, we must wait * a grace period before freeing it. */ - call_rcu(&ring->rcu, ring_free_rcu); + kfree_rcu(ring, rcu); adapter->rx_ring[i] = NULL; } -- cgit v1.1 From 6e070aecd9e304264a6b8655f49aa7e6db0e55f2 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 18 Mar 2011 12:00:07 +0800 Subject: macvlan,rcu: convert call_rcu(macvlan_port_rcu_free) to kfree_rcu() The rcu callback macvlan_port_rcu_free() just calls a kfree(), so we use kfree_rcu() instead of the call_rcu(macvlan_port_rcu_free). Signed-off-by: Lai Jiangshan Acked-by: David S. Miller Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- drivers/net/macvlan.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index 78e34e9..d8e4e69 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -603,21 +603,13 @@ static int macvlan_port_create(struct net_device *dev) return err; } -static void macvlan_port_rcu_free(struct rcu_head *head) -{ - struct macvlan_port *port; - - port = container_of(head, struct macvlan_port, rcu); - kfree(port); -} - static void macvlan_port_destroy(struct net_device *dev) { struct macvlan_port *port = macvlan_port_get(dev); dev->priv_flags &= ~IFF_MACVLAN_PORT; netdev_rx_handler_unregister(dev); - call_rcu(&port->rcu, macvlan_port_rcu_free); + kfree_rcu(port, rcu); } static int macvlan_validate(struct nlattr *tb[], struct nlattr *data[]) -- cgit v1.1 From e3cbf28fa61456bd2f0ba39846ffd905257eb4b0 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 18 Mar 2011 12:00:50 +0800 Subject: net,rcu: convert call_rcu(ipv6_mc_socklist_reclaim) to kfree_rcu() The rcu callback ipv6_mc_socklist_reclaim() just calls a kfree(), so we use kfree_rcu() instead of the call_rcu(ipv6_mc_socklist_reclaim). Signed-off-by: Lai Jiangshan Acked-by: David S. Miller Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- net/ipv6/mcast.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 76b8937..f2d98ca 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -201,10 +201,6 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr) return 0; } -static void ipv6_mc_socklist_reclaim(struct rcu_head *head) -{ - kfree(container_of(head, struct ipv6_mc_socklist, rcu)); -} /* * socket leave on multicast group */ @@ -239,7 +235,7 @@ int ipv6_sock_mc_drop(struct sock *sk, int ifindex, const struct in6_addr *addr) (void) ip6_mc_leave_src(sk, mc_lst, NULL); rcu_read_unlock(); atomic_sub(sizeof(*mc_lst), &sk->sk_omem_alloc); - call_rcu(&mc_lst->rcu, ipv6_mc_socklist_reclaim); + kfree_rcu(mc_lst, rcu); return 0; } } @@ -307,7 +303,7 @@ void ipv6_sock_mc_close(struct sock *sk) rcu_read_unlock(); atomic_sub(sizeof(*mc_lst), &sk->sk_omem_alloc); - call_rcu(&mc_lst->rcu, ipv6_mc_socklist_reclaim); + kfree_rcu(mc_lst, rcu); spin_lock(&ipv6_sk_mc_lock); } -- cgit v1.1 From f6f80238fab1ad19c44b4a12501528d50fc7fcd6 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 18 Mar 2011 12:01:31 +0800 Subject: net,rcu: convert call_rcu(rps_map_release) to kfree_rcu() The rcu callback rps_map_release() just calls a kfree(), so we use kfree_rcu() instead of the call_rcu(rps_map_release). Signed-off-by: Lai Jiangshan Acked-by: David S. Miller Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- net/core/net-sysfs.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 5ceb257..c410f28 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -565,13 +565,6 @@ static ssize_t show_rps_map(struct netdev_rx_queue *queue, return len; } -static void rps_map_release(struct rcu_head *rcu) -{ - struct rps_map *map = container_of(rcu, struct rps_map, rcu); - - kfree(map); -} - static ssize_t store_rps_map(struct netdev_rx_queue *queue, struct rx_queue_attribute *attribute, const char *buf, size_t len) @@ -619,7 +612,7 @@ static ssize_t store_rps_map(struct netdev_rx_queue *queue, spin_unlock(&rps_map_lock); if (old_map) - call_rcu(&old_map->rcu, rps_map_release); + kfree_rcu(old_map, rcu); free_cpumask_var(mask); return len; @@ -728,7 +721,7 @@ static void rx_queue_release(struct kobject *kobj) map = rcu_dereference_raw(queue->rps_map); if (map) { RCU_INIT_POINTER(queue->rps_map, NULL); - call_rcu(&map->rcu, rps_map_release); + kfree_rcu(map, rcu); } flow_table = rcu_dereference_raw(queue->rps_flow_table); -- cgit v1.1 From edc86d8a1c824cca1df676f47fc713232885561f Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 18 Mar 2011 12:02:20 +0800 Subject: net,rcu: convert call_rcu(xps_map_release) to kfree_rcu() The rcu callback xps_map_release() just calls a kfree(), so we use kfree_rcu() instead of the call_rcu(xps_map_release). Signed-off-by: Lai Jiangshan Acked-by: David S. Miller Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- net/core/net-sysfs.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index c410f28..48ffc21 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -891,13 +891,6 @@ static ssize_t show_xps_map(struct netdev_queue *queue, return len; } -static void xps_map_release(struct rcu_head *rcu) -{ - struct xps_map *map = container_of(rcu, struct xps_map, rcu); - - kfree(map); -} - static void xps_dev_maps_release(struct rcu_head *rcu) { struct xps_dev_maps *dev_maps = @@ -1002,7 +995,7 @@ static ssize_t store_xps_map(struct netdev_queue *queue, map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) : NULL; if (map && xmap_dereference(new_dev_maps->cpu_map[cpu]) != map) - call_rcu(&map->rcu, xps_map_release); + kfree_rcu(map, rcu); if (new_dev_maps->cpu_map[cpu]) nonempty = 1; } @@ -1077,7 +1070,7 @@ static void netdev_queue_release(struct kobject *kobj) else { RCU_INIT_POINTER(dev_maps->cpu_map[i], NULL); - call_rcu(&map->rcu, xps_map_release); + kfree_rcu(map, rcu); map = NULL; } } -- cgit v1.1 From b55071eb6011413af3b9c434ae77dea8832069c8 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 18 Mar 2011 12:02:47 +0800 Subject: net,rcu: convert call_rcu(xps_dev_maps_release) to kfree_rcu() The rcu callback xps_dev_maps_release() just calls a kfree(), so we use kfree_rcu() instead of the call_rcu(xps_dev_maps_release). Signed-off-by: Lai Jiangshan Acked-by: David S. Miller Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- net/core/net-sysfs.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 48ffc21..80b2aad 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -891,14 +891,6 @@ static ssize_t show_xps_map(struct netdev_queue *queue, return len; } -static void xps_dev_maps_release(struct rcu_head *rcu) -{ - struct xps_dev_maps *dev_maps = - container_of(rcu, struct xps_dev_maps, rcu); - - kfree(dev_maps); -} - static DEFINE_MUTEX(xps_map_mutex); #define xmap_dereference(P) \ rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex)) @@ -1008,7 +1000,7 @@ static ssize_t store_xps_map(struct netdev_queue *queue, } if (dev_maps) - call_rcu(&dev_maps->rcu, xps_dev_maps_release); + kfree_rcu(dev_maps, rcu); netdev_queue_numa_node_write(queue, (numa_node >= 0) ? numa_node : NUMA_NO_NODE); @@ -1080,7 +1072,7 @@ static void netdev_queue_release(struct kobject *kobj) if (!nonempty) { RCU_INIT_POINTER(dev->xps_maps, NULL); - call_rcu(&dev_maps->rcu, xps_dev_maps_release); + kfree_rcu(dev_maps, rcu); } } -- cgit v1.1 From 690273fc70e94a07d70044881e5e52926301bcd3 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 18 Mar 2011 12:03:19 +0800 Subject: security,rcu: convert call_rcu(sel_netif_free) to kfree_rcu() The rcu callback sel_netif_free() just calls a kfree(), so we use kfree_rcu() instead of the call_rcu(sel_netif_free). Signed-off-by: Lai Jiangshan Acked-by: David S. Miller Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- security/selinux/netif.c | 18 +----------------- 1 file changed, 1 insertion(+), 17 deletions(-) diff --git a/security/selinux/netif.c b/security/selinux/netif.c index d6095d6..58cc481 100644 --- a/security/selinux/netif.c +++ b/security/selinux/netif.c @@ -104,22 +104,6 @@ static int sel_netif_insert(struct sel_netif *netif) } /** - * sel_netif_free - Frees an interface entry - * @p: the entry's RCU field - * - * Description: - * This function is designed to be used as a callback to the call_rcu() - * function so that memory allocated to a hash table interface entry can be - * released safely. - * - */ -static void sel_netif_free(struct rcu_head *p) -{ - struct sel_netif *netif = container_of(p, struct sel_netif, rcu_head); - kfree(netif); -} - -/** * sel_netif_destroy - Remove an interface record from the table * @netif: the existing interface record * @@ -131,7 +115,7 @@ static void sel_netif_destroy(struct sel_netif *netif) { list_del_rcu(&netif->list); sel_netif_total--; - call_rcu(&netif->rcu_head, sel_netif_free); + kfree_rcu(netif, rcu_head); } /** -- cgit v1.1 From c3b494209cc6084bd76d823d185a60ee5bd40b0d Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 18 Mar 2011 12:03:56 +0800 Subject: net,rcu: convert call_rcu(netlbl_unlhsh_free_addr4) to kfree_rcu() The rcu callback netlbl_unlhsh_free_addr4() just calls a kfree(), so we use kfree_rcu() instead of the call_rcu(netlbl_unlhsh_free_addr4). Signed-off-by: Lai Jiangshan Acked-by: David S. Miller Acked-by: Paul Moore Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- net/netlabel/netlabel_unlabeled.c | 20 +------------------- 1 file changed, 1 insertion(+), 19 deletions(-) diff --git a/net/netlabel/netlabel_unlabeled.c b/net/netlabel/netlabel_unlabeled.c index e2b0a68..4e5ad90 100644 --- a/net/netlabel/netlabel_unlabeled.c +++ b/net/netlabel/netlabel_unlabeled.c @@ -153,24 +153,6 @@ static const struct nla_policy netlbl_unlabel_genl_policy[NLBL_UNLABEL_A_MAX + 1 * Unlabeled Connection Hash Table Functions */ -/** - * netlbl_unlhsh_free_addr4 - Frees an IPv4 address entry from the hash table - * @entry: the entry's RCU field - * - * Description: - * This function is designed to be used as a callback to the call_rcu() - * function so that memory allocated to a hash table address entry can be - * released safely. - * - */ -static void netlbl_unlhsh_free_addr4(struct rcu_head *entry) -{ - struct netlbl_unlhsh_addr4 *ptr; - - ptr = container_of(entry, struct netlbl_unlhsh_addr4, rcu); - kfree(ptr); -} - #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) /** * netlbl_unlhsh_free_addr6 - Frees an IPv6 address entry from the hash table @@ -568,7 +550,7 @@ static int netlbl_unlhsh_remove_addr4(struct net *net, if (entry == NULL) return -ENOENT; - call_rcu(&entry->rcu, netlbl_unlhsh_free_addr4); + kfree_rcu(entry, rcu); return 0; } -- cgit v1.1 From 6b2622328110b9c1094a4f644a6cdc9b6d5450ac Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 18 Mar 2011 12:04:50 +0800 Subject: net,rcu: convert call_rcu(netlbl_unlhsh_free_addr6) to kfree_rcu() The rcu callback netlbl_unlhsh_free_addr6() just calls a kfree(), so we use kfree_rcu() instead of the call_rcu(netlbl_unlhsh_free_addr6). Signed-off-by: Lai Jiangshan Acked-by: Paul Moore Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- net/netlabel/netlabel_unlabeled.c | 22 +--------------------- 1 file changed, 1 insertion(+), 21 deletions(-) diff --git a/net/netlabel/netlabel_unlabeled.c b/net/netlabel/netlabel_unlabeled.c index 4e5ad90..9c38658 100644 --- a/net/netlabel/netlabel_unlabeled.c +++ b/net/netlabel/netlabel_unlabeled.c @@ -153,26 +153,6 @@ static const struct nla_policy netlbl_unlabel_genl_policy[NLBL_UNLABEL_A_MAX + 1 * Unlabeled Connection Hash Table Functions */ -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) -/** - * netlbl_unlhsh_free_addr6 - Frees an IPv6 address entry from the hash table - * @entry: the entry's RCU field - * - * Description: - * This function is designed to be used as a callback to the call_rcu() - * function so that memory allocated to a hash table address entry can be - * released safely. - * - */ -static void netlbl_unlhsh_free_addr6(struct rcu_head *entry) -{ - struct netlbl_unlhsh_addr6 *ptr; - - ptr = container_of(entry, struct netlbl_unlhsh_addr6, rcu); - kfree(ptr); -} -#endif /* IPv6 */ - /** * netlbl_unlhsh_free_iface - Frees an interface entry from the hash table * @entry: the entry's RCU field @@ -611,7 +591,7 @@ static int netlbl_unlhsh_remove_addr6(struct net *net, if (entry == NULL) return -ENOENT; - call_rcu(&entry->rcu, netlbl_unlhsh_free_addr6); + kfree_rcu(entry, rcu); return 0; } #endif /* IPv6 */ -- cgit v1.1 From 04d4dfed8e64f65d672502a614a4bb9093d1affd Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 18 Mar 2011 12:06:32 +0800 Subject: net,rcu: convert call_rcu(net_generic_release) to kfree_rcu() The rcu callback net_generic_release() just calls a kfree(), so we use kfree_rcu() instead of the call_rcu(net_generic_release). Signed-off-by: Lai Jiangshan Acked-by: David S. Miller Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- net/core/net_namespace.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 3f86026..297bb92 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -27,14 +27,6 @@ EXPORT_SYMBOL(init_net); #define INITIAL_NET_GEN_PTRS 13 /* +1 for len +2 for rcu_head */ -static void net_generic_release(struct rcu_head *rcu) -{ - struct net_generic *ng; - - ng = container_of(rcu, struct net_generic, rcu); - kfree(ng); -} - static int net_assign_generic(struct net *net, int id, void *data) { struct net_generic *ng, *old_ng; @@ -68,7 +60,7 @@ static int net_assign_generic(struct net *net, int id, void *data) memcpy(&ng->ptr, &old_ng->ptr, old_ng->len * sizeof(void*)); rcu_assign_pointer(net->gen, ng); - call_rcu(&old_ng->rcu, net_generic_release); + kfree_rcu(old_ng, rcu); assign: ng->ptr[id - 1] = data; return 0; -- cgit v1.1 From 1f8d36a1869f5efae4fadf6baf01f02211040b97 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 18 Mar 2011 12:07:09 +0800 Subject: net,rcu: convert call_rcu(__nf_ct_ext_free_rcu) to kfree_rcu() The rcu callback __nf_ct_ext_free_rcu() just calls a kfree(), so we use kfree_rcu() instead of the call_rcu(__nf_ct_ext_free_rcu). Signed-off-by: Lai Jiangshan Acked-by: David S. Miller Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- net/netfilter/nf_conntrack_extend.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/net/netfilter/nf_conntrack_extend.c b/net/netfilter/nf_conntrack_extend.c index 80a23ed..05ecdc2 100644 --- a/net/netfilter/nf_conntrack_extend.c +++ b/net/netfilter/nf_conntrack_extend.c @@ -68,12 +68,6 @@ nf_ct_ext_create(struct nf_ct_ext **ext, enum nf_ct_ext_id id, gfp_t gfp) return (void *)(*ext) + off; } -static void __nf_ct_ext_free_rcu(struct rcu_head *head) -{ - struct nf_ct_ext *ext = container_of(head, struct nf_ct_ext, rcu); - kfree(ext); -} - void *__nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp) { struct nf_ct_ext *old, *new; @@ -114,7 +108,7 @@ void *__nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp) (void *)old + old->offset[i]); rcu_read_unlock(); } - call_rcu(&old->rcu, __nf_ct_ext_free_rcu); + kfree_rcu(old, rcu); ct->ext = new; } -- cgit v1.1 From cb796ff338db9f064f4ecb7d41898e8bedcad4d9 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 18 Mar 2011 12:07:41 +0800 Subject: perf,rcu: convert call_rcu(free_ctx) to kfree_rcu() The rcu callback free_ctx() just calls a kfree(), so we use kfree_rcu() instead of the call_rcu(free_ctx). Signed-off-by: Lai Jiangshan Acked-by: Peter Zijlstra Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/perf_event.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 8e81a98..17a176f 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -586,14 +586,6 @@ static void get_ctx(struct perf_event_context *ctx) WARN_ON(!atomic_inc_not_zero(&ctx->refcount)); } -static void free_ctx(struct rcu_head *head) -{ - struct perf_event_context *ctx; - - ctx = container_of(head, struct perf_event_context, rcu_head); - kfree(ctx); -} - static void put_ctx(struct perf_event_context *ctx) { if (atomic_dec_and_test(&ctx->refcount)) { @@ -601,7 +593,7 @@ static void put_ctx(struct perf_event_context *ctx) put_ctx(ctx->parent_ctx); if (ctx->task) put_task_struct(ctx->task); - call_rcu(&ctx->rcu_head, free_ctx); + kfree_rcu(ctx, rcu_head); } } -- cgit v1.1 From fa4bbc4ca5795fbf1303b98c924e51a21a920f14 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 18 Mar 2011 12:08:29 +0800 Subject: perf,rcu: convert call_rcu(swevent_hlist_release_rcu) to kfree_rcu() The rcu callback swevent_hlist_release_rcu() just calls a kfree(), so we use kfree_rcu() instead of the call_rcu(swevent_hlist_release_rcu). Signed-off-by: Lai Jiangshan Acked-by: Peter Zijlstra Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/perf_event.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 17a176f..b90d660 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -5323,14 +5323,6 @@ swevent_hlist_deref(struct swevent_htable *swhash) lockdep_is_held(&swhash->hlist_mutex)); } -static void swevent_hlist_release_rcu(struct rcu_head *rcu_head) -{ - struct swevent_hlist *hlist; - - hlist = container_of(rcu_head, struct swevent_hlist, rcu_head); - kfree(hlist); -} - static void swevent_hlist_release(struct swevent_htable *swhash) { struct swevent_hlist *hlist = swevent_hlist_deref(swhash); @@ -5339,7 +5331,7 @@ static void swevent_hlist_release(struct swevent_htable *swhash) return; rcu_assign_pointer(swhash->swevent_hlist, NULL); - call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu); + kfree_rcu(hlist, rcu_head); } static void swevent_hlist_put_cpu(struct perf_event *event, int cpu) -- cgit v1.1 From 7e113a9c759d1918fcbe456c5eb8890a4da62eaa Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 18 Mar 2011 12:09:03 +0800 Subject: net,rcu: convert call_rcu(phonet_device_rcu_free) to kfree_rcu() The rcu callback phonet_device_rcu_free() just calls a kfree(), so we use kfree_rcu() instead of the call_rcu(phonet_device_rcu_free). Signed-off-by: Lai Jiangshan Acked-by: David S. Miller Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- net/phonet/pn_dev.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/net/phonet/pn_dev.c b/net/phonet/pn_dev.c index 947038d..1566672 100644 --- a/net/phonet/pn_dev.c +++ b/net/phonet/pn_dev.c @@ -162,14 +162,6 @@ int phonet_address_add(struct net_device *dev, u8 addr) return err; } -static void phonet_device_rcu_free(struct rcu_head *head) -{ - struct phonet_device *pnd; - - pnd = container_of(head, struct phonet_device, rcu); - kfree(pnd); -} - int phonet_address_del(struct net_device *dev, u8 addr) { struct phonet_device_list *pndevs = phonet_device_list(dev_net(dev)); @@ -188,7 +180,7 @@ int phonet_address_del(struct net_device *dev, u8 addr) mutex_unlock(&pndevs->lock); if (pnd) - call_rcu(&pnd->rcu, phonet_device_rcu_free); + kfree_rcu(pnd, rcu); return err; } -- cgit v1.1 From 61845220248c2368095158420b029683fad5570a Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 18 Mar 2011 12:10:25 +0800 Subject: net,rcu: convert call_rcu(wq_free_rcu) to kfree_rcu() The rcu callback wq_free_rcu() just calls a kfree(), so we use kfree_rcu() instead of the call_rcu(wq_free_rcu). Signed-off-by: Lai Jiangshan Acked-by: David S. Miller Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- net/socket.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/net/socket.c b/net/socket.c index 310d16b..c2ed7c9 100644 --- a/net/socket.c +++ b/net/socket.c @@ -263,15 +263,6 @@ static struct inode *sock_alloc_inode(struct super_block *sb) return &ei->vfs_inode; } - - -static void wq_free_rcu(struct rcu_head *head) -{ - struct socket_wq *wq = container_of(head, struct socket_wq, rcu); - - kfree(wq); -} - static void sock_destroy_inode(struct inode *inode) { struct socket_alloc *ei; @@ -279,7 +270,7 @@ static void sock_destroy_inode(struct inode *inode) ei = container_of(inode, struct socket_alloc, vfs_inode); wq = rcu_dereference_protected(ei->socket.wq, 1); - call_rcu(&wq->rcu, wq_free_rcu); + kfree_rcu(wq, rcu); kmem_cache_free(sock_inode_cachep, ei); } -- cgit v1.1 From a74ce1425e4f6075de443274a5e917c84357eac4 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 18 Mar 2011 12:14:15 +0800 Subject: net/mac80211,rcu: convert call_rcu(work_free_rcu) to kfree_rcu() The rcu callback work_free_rcu() just calls a kfree(), so we use kfree_rcu() instead of the call_rcu(work_free_rcu). Signed-off-by: Lai Jiangshan Acked-by: "John W. Linville" Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- net/mac80211/work.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/net/mac80211/work.c b/net/mac80211/work.c index e73c8ca..ac35496 100644 --- a/net/mac80211/work.c +++ b/net/mac80211/work.c @@ -65,17 +65,9 @@ static void run_again(struct ieee80211_local *local, mod_timer(&local->work_timer, timeout); } -static void work_free_rcu(struct rcu_head *head) -{ - struct ieee80211_work *wk = - container_of(head, struct ieee80211_work, rcu_head); - - kfree(wk); -} - void free_work(struct ieee80211_work *wk) { - call_rcu(&wk->rcu_head, work_free_rcu); + kfree_rcu(wk, rcu_head); } static int ieee80211_compatible_rates(const u8 *supp_rates, int supp_rates_len, -- cgit v1.1 From 88b4a0347a81539884df5ad535e10cf9fa87d8d4 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 18 Mar 2011 12:15:02 +0800 Subject: net,rcu: convert call_rcu(xt_osf_finger_free_rcu) to kfree_rcu() The rcu callback xt_osf_finger_free_rcu() just calls a kfree(), so we use kfree_rcu() instead of the call_rcu(xt_osf_finger_free_rcu). Signed-off-by: Lai Jiangshan Acked-by: David S. Miller Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- net/netfilter/xt_osf.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/net/netfilter/xt_osf.c b/net/netfilter/xt_osf.c index 4327e10..846f895 100644 --- a/net/netfilter/xt_osf.c +++ b/net/netfilter/xt_osf.c @@ -62,13 +62,6 @@ static const struct nla_policy xt_osf_policy[OSF_ATTR_MAX + 1] = { [OSF_ATTR_FINGER] = { .len = sizeof(struct xt_osf_user_finger) }, }; -static void xt_osf_finger_free_rcu(struct rcu_head *rcu_head) -{ - struct xt_osf_finger *f = container_of(rcu_head, struct xt_osf_finger, rcu_head); - - kfree(f); -} - static int xt_osf_add_callback(struct sock *ctnl, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const osf_attrs[]) @@ -133,7 +126,7 @@ static int xt_osf_remove_callback(struct sock *ctnl, struct sk_buff *skb, * We are protected by nfnl mutex. */ list_del_rcu(&sf->finger_entry); - call_rcu(&sf->rcu_head, xt_osf_finger_free_rcu); + kfree_rcu(sf, rcu_head); err = 0; break; @@ -414,7 +407,7 @@ static void __exit xt_osf_fini(void) list_for_each_entry_rcu(f, &xt_osf_fingers[i], finger_entry) { list_del_rcu(&f->finger_entry); - call_rcu(&f->rcu_head, xt_osf_finger_free_rcu); + kfree_rcu(f, rcu_head); } } rcu_read_unlock(); -- cgit v1.1 From 0744371aeba7a5004006c2309971ee026c0b2000 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 15 Mar 2011 18:02:42 +0800 Subject: net,rcu: convert call_rcu(kfree_tid_tx) to kfree_rcu() The rcu callback kfree_tid_tx() just calls a kfree(), so we use kfree_rcu() instead of the call_rcu(kfree_tid_tx). Signed-off-by: Lai Jiangshan Signed-off-by: Paul E. McKenney Cc: "John W. Linville" Cc: Johannes Berg Reviewed-by: Josh Triplett Acked-by: "David S. Miller" --- net/mac80211/agg-tx.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c index 63d852c..53defaf 100644 --- a/net/mac80211/agg-tx.c +++ b/net/mac80211/agg-tx.c @@ -136,14 +136,6 @@ void ieee80211_send_bar(struct ieee80211_sub_if_data *sdata, u8 *ra, u16 tid, u1 ieee80211_tx_skb(sdata, skb); } -static void kfree_tid_tx(struct rcu_head *rcu_head) -{ - struct tid_ampdu_tx *tid_tx = - container_of(rcu_head, struct tid_ampdu_tx, rcu_head); - - kfree(tid_tx); -} - int ___ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid, enum ieee80211_back_parties initiator, bool tx) @@ -163,7 +155,7 @@ int ___ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid, /* not even started yet! */ rcu_assign_pointer(sta->ampdu_mlme.tid_tx[tid], NULL); spin_unlock_bh(&sta->lock); - call_rcu(&tid_tx->rcu_head, kfree_tid_tx); + kfree_rcu(tid_tx, rcu_head); return 0; } @@ -322,7 +314,7 @@ void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid) spin_unlock_bh(&sta->lock); ieee80211_wake_queue_agg(local, tid); - call_rcu(&tid_tx->rcu_head, kfree_tid_tx); + kfree_rcu(tid_tx, rcu_head); return; } @@ -701,7 +693,7 @@ void ieee80211_stop_tx_ba_cb(struct ieee80211_vif *vif, u8 *ra, u8 tid) ieee80211_agg_splice_finish(local, tid); - call_rcu(&tid_tx->rcu_head, kfree_tid_tx); + kfree_rcu(tid_tx, rcu_head); unlock_sta: spin_unlock_bh(&sta->lock); -- cgit v1.1 From eb340b2f804860a51a0b92e35fd36742b6c2d6b7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 1 May 2011 23:25:02 -0700 Subject: batman,rcu: convert call_rcu(gw_node_free_rcu) to kfree_rcu The RCU callback gw_node_free_rcu() just calls kfree(), so we can use kfree_rcu() instead of call_rcu(). Signed-off-by: Paul E. McKenney Cc: Marek Lindner Cc: Simon Wunderlich Acked-by: David S. Miller Reviewed-by: Josh Triplett Acked-by: Sven Eckelmann --- net/batman-adv/gateway_client.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c index 3cc4355..150b6ce 100644 --- a/net/batman-adv/gateway_client.c +++ b/net/batman-adv/gateway_client.c @@ -28,18 +28,10 @@ #include #include -static void gw_node_free_rcu(struct rcu_head *rcu) -{ - struct gw_node *gw_node; - - gw_node = container_of(rcu, struct gw_node, rcu); - kfree(gw_node); -} - static void gw_node_free_ref(struct gw_node *gw_node) { if (atomic_dec_and_test(&gw_node->refcount)) - call_rcu(&gw_node->rcu, gw_node_free_rcu); + kfree_rcu(gw_node, rcu); } void *gw_get_selected(struct bat_priv *bat_priv) -- cgit v1.1 From ae179ae433bb4ef6b6179c5c1c7b6cc7dc01c670 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 1 May 2011 23:27:50 -0700 Subject: batman,rcu: convert call_rcu(neigh_node_free_rcu) to kfree() The RCU callback neigh_node_free_rcu() just calls kfree(), so we can use kfree_rcu() instead of call_rcu(). Signed-off-by: Paul E. McKenney Cc: Marek Lindner Cc: Simon Wunderlich Acked-by: David S. Miller Reviewed-by: Josh Triplett Acked-by: Sven Eckelmann --- net/batman-adv/originator.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c index 0b91330..ed23a589 100644 --- a/net/batman-adv/originator.c +++ b/net/batman-adv/originator.c @@ -56,18 +56,10 @@ err: return 0; } -static void neigh_node_free_rcu(struct rcu_head *rcu) -{ - struct neigh_node *neigh_node; - - neigh_node = container_of(rcu, struct neigh_node, rcu); - kfree(neigh_node); -} - void neigh_node_free_ref(struct neigh_node *neigh_node) { if (atomic_dec_and_test(&neigh_node->refcount)) - call_rcu(&neigh_node->rcu, neigh_node_free_rcu); + kfree_rcu(neigh_node, rcu); } struct neigh_node *create_neighbor(struct orig_node *orig_node, -- cgit v1.1 From 8e3572cff70ee19a0a1f2e2dde0bca0b7c8b54dc Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 2 May 2011 00:52:23 -0700 Subject: batman,rcu: convert call_rcu(softif_neigh_free_rcu) to kfree_rcu The RCU callback softif_neigh_free_rcu() just calls kfree(), so we can use kfree_rcu() instead of call_rcu(). Signed-off-by: Paul E. McKenney Cc: Marek Lindner Cc: Simon Wunderlich Acked-by: David S. Miller Reviewed-by: Josh Triplett Acked-by: Sven Eckelmann --- net/batman-adv/soft-interface.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index 824e1f6..04efe02 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -76,18 +76,10 @@ int my_skb_head_push(struct sk_buff *skb, unsigned int len) return 0; } -static void softif_neigh_free_rcu(struct rcu_head *rcu) -{ - struct softif_neigh *softif_neigh; - - softif_neigh = container_of(rcu, struct softif_neigh, rcu); - kfree(softif_neigh); -} - static void softif_neigh_free_ref(struct softif_neigh *softif_neigh) { if (atomic_dec_and_test(&softif_neigh->refcount)) - call_rcu(&softif_neigh->rcu, softif_neigh_free_rcu); + kfree_rcu(softif_neigh, rcu); } void softif_neigh_purge(struct bat_priv *bat_priv) -- cgit v1.1 From 11c476f31a0fabc6e604da5b09a6590b57c3fb20 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 2 May 2011 00:56:57 -0700 Subject: net,rcu: convert call_rcu(prl_entry_destroy_rcu) to kfree The RCU callback prl_entry_destroy_rcu() just calls kfree(), so we can use kfree_rcu() instead of call_rcu(). Signed-off-by: Paul E. McKenney Cc: Alexey Kuznetsov Cc: "Pekka Savola (ipv6)" Cc: James Morris Cc: Hideaki YOSHIFUJI Cc: Patrick McHardy Acked-by: David S. Miller Reviewed-by: Josh Triplett --- net/ipv6/sit.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 43b3337..5f35d59 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -401,11 +401,6 @@ out: return err; } -static void prl_entry_destroy_rcu(struct rcu_head *head) -{ - kfree(container_of(head, struct ip_tunnel_prl_entry, rcu_head)); -} - static void prl_list_destroy_rcu(struct rcu_head *head) { struct ip_tunnel_prl_entry *p, *n; @@ -433,7 +428,7 @@ ipip6_tunnel_del_prl(struct ip_tunnel *t, struct ip_tunnel_prl *a) p = &x->next) { if (x->addr == a->addr) { *p = x->next; - call_rcu(&x->rcu_head, prl_entry_destroy_rcu); + kfree_rcu(x, rcu_head); t->prl_count--; goto out; } -- cgit v1.1 From 04b894553fd6e6fd7439e8440fd6bf5b6a17d9ae Mon Sep 17 00:00:00 2001 From: Lars-Peter Clausen Date: Thu, 5 May 2011 16:59:12 +0200 Subject: ASoC: SSM2602: Properly annotate i2c probe and remove functions Annotate the i2c probe and remove functions with __devinit and __devexit. Signed-off-by: Lars-Peter Clausen Acked-by: Liam Girdwood Signed-off-by: Mark Brown --- sound/soc/codecs/ssm2602.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sound/soc/codecs/ssm2602.c b/sound/soc/codecs/ssm2602.c index 2727bef..f7c1ce5 100644 --- a/sound/soc/codecs/ssm2602.c +++ b/sound/soc/codecs/ssm2602.c @@ -614,7 +614,7 @@ static struct snd_soc_codec_driver soc_codec_dev_ssm2602 = { * low = 0x1a * high = 0x1b */ -static int ssm2602_i2c_probe(struct i2c_client *i2c, +static int __devinit ssm2602_i2c_probe(struct i2c_client *i2c, const struct i2c_device_id *id) { struct ssm2602_priv *ssm2602; @@ -635,7 +635,7 @@ static int ssm2602_i2c_probe(struct i2c_client *i2c, return ret; } -static int ssm2602_i2c_remove(struct i2c_client *client) +static int __devexit ssm2602_i2c_remove(struct i2c_client *client) { snd_soc_unregister_codec(&client->dev); kfree(i2c_get_clientdata(client)); @@ -655,7 +655,7 @@ static struct i2c_driver ssm2602_i2c_driver = { .owner = THIS_MODULE, }, .probe = ssm2602_i2c_probe, - .remove = ssm2602_i2c_remove, + .remove = __devexit_p(ssm2602_i2c_remove), .id_table = ssm2602_i2c_id, }; #endif -- cgit v1.1 From 36c90ab33feabbd63da775bd92ad356e5bd5cf56 Mon Sep 17 00:00:00 2001 From: Lars-Peter Clausen Date: Thu, 5 May 2011 16:59:16 +0200 Subject: ASoC: SSM2602: Fix 'Mic Boost2' control The 'Mic Boost2' control's shift was off by one and thus was not working. Signed-off-by: Lars-Peter Clausen Acked-by: Liam Girdwood Signed-off-by: Mark Brown Cc: stable@kernel.org --- sound/soc/codecs/ssm2602.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/soc/codecs/ssm2602.c b/sound/soc/codecs/ssm2602.c index f7c1ce5..946797d 100644 --- a/sound/soc/codecs/ssm2602.c +++ b/sound/soc/codecs/ssm2602.c @@ -139,7 +139,7 @@ SOC_DOUBLE_R("Capture Volume", SSM2602_LINVOL, SSM2602_RINVOL, 0, 31, 0), SOC_DOUBLE_R("Capture Switch", SSM2602_LINVOL, SSM2602_RINVOL, 7, 1, 1), SOC_SINGLE("Mic Boost (+20dB)", SSM2602_APANA, 0, 1, 0), -SOC_SINGLE("Mic Boost2 (+20dB)", SSM2602_APANA, 7, 1, 0), +SOC_SINGLE("Mic Boost2 (+20dB)", SSM2602_APANA, 8, 1, 0), SOC_SINGLE("Mic Switch", SSM2602_APANA, 1, 1, 1), SOC_SINGLE("Sidetone Playback Volume", SSM2602_APANA, 6, 3, 1), -- cgit v1.1 From 8fc63fe9412634c72676db42649f357eaac04566 Mon Sep 17 00:00:00 2001 From: Lars-Peter Clausen Date: Thu, 5 May 2011 16:59:14 +0200 Subject: ASoC: SSM2602: Fix reg_cache_size reg_cache_size is supposed to be the number of elements in the register cache, not the size in bytes. Signed-off-by: Lars-Peter Clausen Acked-by: Liam Girdwood Signed-off-by: Mark Brown --- sound/soc/codecs/ssm2602.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/soc/codecs/ssm2602.c b/sound/soc/codecs/ssm2602.c index 946797d..b04d280 100644 --- a/sound/soc/codecs/ssm2602.c +++ b/sound/soc/codecs/ssm2602.c @@ -602,7 +602,7 @@ static struct snd_soc_codec_driver soc_codec_dev_ssm2602 = { .read = ssm2602_read_reg_cache, .write = ssm2602_write, .set_bias_level = ssm2602_set_bias_level, - .reg_cache_size = sizeof(ssm2602_reg), + .reg_cache_size = ARRAY_SIZE(ssm2602_reg), .reg_word_size = sizeof(u16), .reg_cache_default = ssm2602_reg, }; -- cgit v1.1 From bf707de21fec7bb203dace2d0a2bbd124d1b36ca Mon Sep 17 00:00:00 2001 From: Marek Belisko Date: Tue, 3 May 2011 14:46:32 +0200 Subject: ASoC: UDA134x: Remove POWER_OFF_ON_STANDBY define. Define POWER_OFF_ON_STANDBY cause trobles when trying to get some sound from codec because code for bias setup was not compiled (define wasn't defined). This define was removed in commit: cc3202f5 but again introduced by commit: f0fba2ad1 which then completely break codec functionality so remove it again. Signed-off-by: Marek Belisko Acked-by: Liam Girdwood Signed-off-by: Mark Brown Cc: stable@kernel.org --- sound/soc/codecs/uda134x.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/sound/soc/codecs/uda134x.c b/sound/soc/codecs/uda134x.c index 48ffd40..a7b8f30 100644 --- a/sound/soc/codecs/uda134x.c +++ b/sound/soc/codecs/uda134x.c @@ -601,9 +601,7 @@ static struct snd_soc_codec_driver soc_codec_dev_uda134x = { .reg_cache_step = 1, .read = uda134x_read_reg_cache, .write = uda134x_write, -#ifdef POWER_OFF_ON_STANDBY .set_bias_level = uda134x_set_bias_level, -#endif }; static int __devinit uda134x_codec_probe(struct platform_device *pdev) -- cgit v1.1 From 7a7b94ad8ce3e24d4dd97b45583911e0f03aecd6 Mon Sep 17 00:00:00 2001 From: Jimmy Rentz Date: Sun, 17 Apr 2011 16:15:09 -0400 Subject: drm/nouveau: Fix a crash at card takedown for NV40 and older cards NV40 and older cards (pre NV50) reserve a vram bo for the vga memory at card init. This bo is then freed at card shutdown. The problem is that the ttm bo vram manager was already freed. So a crash occurs when the vga bo is freed. The fix is to free the vga bo prior to freeing the ttm bo vram manager. There might be other solutions but this seemed the simplest to me. Signed-off-by: Jimmy Rentz Signed-off-by: Ben Skeggs --- drivers/gpu/drm/nouveau/nouveau_mem.c | 2 -- drivers/gpu/drm/nouveau/nouveau_state.c | 5 +++++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/nouveau/nouveau_mem.c b/drivers/gpu/drm/nouveau/nouveau_mem.c index 5045f8b..c3e953b 100644 --- a/drivers/gpu/drm/nouveau/nouveau_mem.c +++ b/drivers/gpu/drm/nouveau/nouveau_mem.c @@ -152,8 +152,6 @@ nouveau_mem_vram_fini(struct drm_device *dev) { struct drm_nouveau_private *dev_priv = dev->dev_private; - nouveau_bo_ref(NULL, &dev_priv->vga_ram); - ttm_bo_device_release(&dev_priv->ttm.bdev); nouveau_ttm_global_release(dev_priv); diff --git a/drivers/gpu/drm/nouveau/nouveau_state.c b/drivers/gpu/drm/nouveau/nouveau_state.c index a30adec..915fbce 100644 --- a/drivers/gpu/drm/nouveau/nouveau_state.c +++ b/drivers/gpu/drm/nouveau/nouveau_state.c @@ -768,6 +768,11 @@ static void nouveau_card_takedown(struct drm_device *dev) engine->mc.takedown(dev); engine->display.late_takedown(dev); + if (dev_priv->vga_ram) { + nouveau_bo_unpin(dev_priv->vga_ram); + nouveau_bo_ref(NULL, &dev_priv->vga_ram); + } + mutex_lock(&dev->struct_mutex); ttm_bo_clean_mm(&dev_priv->ttm.bdev, TTM_PL_VRAM); ttm_bo_clean_mm(&dev_priv->ttm.bdev, TTM_PL_TT); -- cgit v1.1 From 9c412942a0bb19ba18f7bd939d42eff1e132a901 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Tue, 3 May 2011 07:49:25 +0000 Subject: ipheth: Properly distinguish length and alignment in URBs and skbs The USB protocol this driver implements appears to require 2 bytes of padding in front of each received packet. This used to be equal to the value of NET_IP_ALIGN on x86, so the driver abused that constant and mostly worked, but this is no longer the case. The driver also mixed up the URB and packet lengths, resulting in 2 bytes of junk at the end of the skb. Introduce a private constant for the 2 bytes of padding; fix this confusion and check for the under-length case. Signed-off-by: Ben Hutchings Signed-off-by: David S. Miller --- drivers/net/usb/ipheth.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/drivers/net/usb/ipheth.c b/drivers/net/usb/ipheth.c index 7d42f9a..81126ff 100644 --- a/drivers/net/usb/ipheth.c +++ b/drivers/net/usb/ipheth.c @@ -65,6 +65,7 @@ #define IPHETH_USBINTF_PROTO 1 #define IPHETH_BUF_SIZE 1516 +#define IPHETH_IP_ALIGN 2 /* padding at front of URB */ #define IPHETH_TX_TIMEOUT (5 * HZ) #define IPHETH_INTFNUM 2 @@ -202,18 +203,21 @@ static void ipheth_rcvbulk_callback(struct urb *urb) return; } - len = urb->actual_length; - buf = urb->transfer_buffer; + if (urb->actual_length <= IPHETH_IP_ALIGN) { + dev->net->stats.rx_length_errors++; + return; + } + len = urb->actual_length - IPHETH_IP_ALIGN; + buf = urb->transfer_buffer + IPHETH_IP_ALIGN; - skb = dev_alloc_skb(NET_IP_ALIGN + len); + skb = dev_alloc_skb(len); if (!skb) { err("%s: dev_alloc_skb: -ENOMEM", __func__); dev->net->stats.rx_dropped++; return; } - skb_reserve(skb, NET_IP_ALIGN); - memcpy(skb_put(skb, len), buf + NET_IP_ALIGN, len - NET_IP_ALIGN); + memcpy(skb_put(skb, len), buf, len); skb->dev = dev->net; skb->protocol = eth_type_trans(skb, dev->net); -- cgit v1.1 From b9f47a3aaeabdce3b42829bbb27765fa340f76ba Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Wed, 4 May 2011 10:04:56 +0000 Subject: tcp_cubic: limit delayed_ack ratio to prevent divide error TCP Cubic keeps a metric that estimates the amount of delayed acknowledgements to use in adjusting the window. If an abnormally large number of packets are acknowledged at once, then the update could wrap and reach zero. This kind of ACK could only happen when there was a large window and huge number of ACK's were lost. This patch limits the value of delayed ack ratio. The choice of 32 is just a conservative value since normally it should be range of 1 to 4 packets. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/tcp_cubic.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index 34340c9..f376b05 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -93,6 +93,7 @@ struct bictcp { u32 ack_cnt; /* number of acks */ u32 tcp_cwnd; /* estimated tcp cwnd */ #define ACK_RATIO_SHIFT 4 +#define ACK_RATIO_LIMIT (32u << ACK_RATIO_SHIFT) u16 delayed_ack; /* estimate the ratio of Packets/ACKs << 4 */ u8 sample_cnt; /* number of samples to decide curr_rtt */ u8 found; /* the exit point is found? */ @@ -398,8 +399,12 @@ static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt_us) u32 delay; if (icsk->icsk_ca_state == TCP_CA_Open) { - cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT; - ca->delayed_ack += cnt; + u32 ratio = ca->delayed_ack; + + ratio -= ca->delayed_ack >> ACK_RATIO_SHIFT; + ratio += cnt; + + ca->delayed_ack = min(ratio, ACK_RATIO_LIMIT); } /* Some calls are for duplicates without timetamps */ -- cgit v1.1 From 58e73811c85d0c0e74b8d300547bbc9abaf40a38 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Fri, 6 May 2011 01:42:49 -0400 Subject: drm/radeon/kms: ATPX switcheroo fixes When we switch the display mux, also switch the i2c mux. Also use the start and finish methods to let the sbios know that the switch is happening. Should fix: https://bugs.freedesktop.org/show_bug.cgi?id=35398 Signed-off-by: Alex Deucher Signed-off-by: Dave Airlie --- drivers/gpu/drm/radeon/radeon_atpx_handler.c | 29 ++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/radeon/radeon_atpx_handler.c b/drivers/gpu/drm/radeon/radeon_atpx_handler.c index ed5dfe5..9d95792 100644 --- a/drivers/gpu/drm/radeon/radeon_atpx_handler.c +++ b/drivers/gpu/drm/radeon/radeon_atpx_handler.c @@ -15,6 +15,9 @@ #define ATPX_VERSION 0 #define ATPX_GPU_PWR 2 #define ATPX_MUX_SELECT 3 +#define ATPX_I2C_MUX_SELECT 4 +#define ATPX_SWITCH_START 5 +#define ATPX_SWITCH_END 6 #define ATPX_INTEGRATED 0 #define ATPX_DISCRETE 1 @@ -149,13 +152,35 @@ static int radeon_atpx_switch_mux(acpi_handle handle, int mux_id) return radeon_atpx_execute(handle, ATPX_MUX_SELECT, mux_id); } +static int radeon_atpx_switch_i2c_mux(acpi_handle handle, int mux_id) +{ + return radeon_atpx_execute(handle, ATPX_I2C_MUX_SELECT, mux_id); +} + +static int radeon_atpx_switch_start(acpi_handle handle, int gpu_id) +{ + return radeon_atpx_execute(handle, ATPX_SWITCH_START, gpu_id); +} + +static int radeon_atpx_switch_end(acpi_handle handle, int gpu_id) +{ + return radeon_atpx_execute(handle, ATPX_SWITCH_END, gpu_id); +} static int radeon_atpx_switchto(enum vga_switcheroo_client_id id) { + int gpu_id; + if (id == VGA_SWITCHEROO_IGD) - radeon_atpx_switch_mux(radeon_atpx_priv.atpx_handle, 0); + gpu_id = ATPX_INTEGRATED; else - radeon_atpx_switch_mux(radeon_atpx_priv.atpx_handle, 1); + gpu_id = ATPX_DISCRETE; + + radeon_atpx_switch_start(radeon_atpx_priv.atpx_handle, gpu_id); + radeon_atpx_switch_mux(radeon_atpx_priv.atpx_handle, gpu_id); + radeon_atpx_switch_i2c_mux(radeon_atpx_priv.atpx_handle, gpu_id); + radeon_atpx_switch_end(radeon_atpx_priv.atpx_handle, gpu_id); + return 0; } -- cgit v1.1 From 2bbd4492552867053b5a618a2474297e2b1c355d Mon Sep 17 00:00:00 2001 From: Daniel Vetter Date: Fri, 6 May 2011 23:47:53 +0200 Subject: drm: mm: fix debug output The looping helper didn't do anything due to a superficial semicolon. Furthermore one of the two dump functions suffered from copy&paste fail. While staring at the code I've also noticed that the replace helper (currently unused) is a bit broken. Signed-off-by: Daniel Vetter Signed-off-by: Dave Airlie --- drivers/gpu/drm/drm_mm.c | 6 +++--- include/drm/drm_mm.h | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/drm_mm.c b/drivers/gpu/drm/drm_mm.c index 5d00b0f..959186c 100644 --- a/drivers/gpu/drm/drm_mm.c +++ b/drivers/gpu/drm/drm_mm.c @@ -431,7 +431,7 @@ EXPORT_SYMBOL(drm_mm_search_free_in_range); void drm_mm_replace_node(struct drm_mm_node *old, struct drm_mm_node *new) { list_replace(&old->node_list, &new->node_list); - list_replace(&old->node_list, &new->hole_stack); + list_replace(&old->hole_stack, &new->hole_stack); new->hole_follows = old->hole_follows; new->mm = old->mm; new->start = old->start; @@ -699,8 +699,8 @@ int drm_mm_dump_table(struct seq_file *m, struct drm_mm *mm) entry->size); total_used += entry->size; if (entry->hole_follows) { - hole_start = drm_mm_hole_node_start(&mm->head_node); - hole_end = drm_mm_hole_node_end(&mm->head_node); + hole_start = drm_mm_hole_node_start(entry); + hole_end = drm_mm_hole_node_end(entry); hole_size = hole_end - hole_start; seq_printf(m, "0x%08lx-0x%08lx: 0x%08lx: free\n", hole_start, hole_end, hole_size); diff --git a/include/drm/drm_mm.h b/include/drm/drm_mm.h index c2f93a8..564b14a 100644 --- a/include/drm/drm_mm.h +++ b/include/drm/drm_mm.h @@ -86,7 +86,7 @@ static inline bool drm_mm_initialized(struct drm_mm *mm) } #define drm_mm_for_each_node(entry, mm) list_for_each_entry(entry, \ &(mm)->head_node.node_list, \ - node_list); + node_list) #define drm_mm_for_each_scanned_node_reverse(entry, n, mm) \ for (entry = (mm)->prev_scanned_node, \ next = entry ? list_entry(entry->node_list.next, \ -- cgit v1.1 From 45e5f6a2ee6aac20e393d44f8a6762104426c81b Mon Sep 17 00:00:00 2001 From: Ilija Hadzic Date: Wed, 4 May 2011 20:15:03 -0400 Subject: drm/radeon: fix order of doing things in radeon_crtc_cursor_set if object pin or object lookup in radeon_cursor_set fail, the function could leave inconsistent mouse width and hight values in radeon_crtc fixed by moving cursor width and height assignments after all checks have passed Signed-off-by: Ilija Hadzic Reviewed-by: Alex Deucher Signed-off-by: Dave Airlie --- drivers/gpu/drm/radeon/radeon_cursor.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/radeon/radeon_cursor.c b/drivers/gpu/drm/radeon/radeon_cursor.c index bdf2fa1..3189a7e 100644 --- a/drivers/gpu/drm/radeon/radeon_cursor.c +++ b/drivers/gpu/drm/radeon/radeon_cursor.c @@ -167,9 +167,6 @@ int radeon_crtc_cursor_set(struct drm_crtc *crtc, return -EINVAL; } - radeon_crtc->cursor_width = width; - radeon_crtc->cursor_height = height; - obj = drm_gem_object_lookup(crtc->dev, file_priv, handle); if (!obj) { DRM_ERROR("Cannot find cursor object %x for crtc %d\n", handle, radeon_crtc->crtc_id); @@ -180,6 +177,9 @@ int radeon_crtc_cursor_set(struct drm_crtc *crtc, if (ret) goto fail; + radeon_crtc->cursor_width = width; + radeon_crtc->cursor_height = height; + radeon_lock_cursor(crtc, true); /* XXX only 27 bit offset for legacy cursor */ radeon_set_cursor(crtc, obj, gpu_addr); -- cgit v1.1 From 4f87af46107499415afd238be104587b5a9d7ac3 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Wed, 4 May 2011 11:41:47 -0400 Subject: drm/radeon/kms: add pci id to acer travelmate quirk for 5730 Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=34082 Reported by: Sampo Laaksonen Signed-off-by: Alex Deucher Cc: stable@kernel.org Signed-off-by: Dave Airlie --- drivers/gpu/drm/radeon/radeon_atombios.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/radeon/radeon_atombios.c b/drivers/gpu/drm/radeon/radeon_atombios.c index f116516..dd881d0 100644 --- a/drivers/gpu/drm/radeon/radeon_atombios.c +++ b/drivers/gpu/drm/radeon/radeon_atombios.c @@ -431,7 +431,7 @@ static bool radeon_atom_apply_quirks(struct drm_device *dev, } } - /* Acer laptop (Acer TravelMate 5730G) has an HDMI port + /* Acer laptop (Acer TravelMate 5730/5730G) has an HDMI port * on the laptop and a DVI port on the docking station and * both share the same encoder, hpd pin, and ddc line. * So while the bios table is technically correct, @@ -440,7 +440,7 @@ static bool radeon_atom_apply_quirks(struct drm_device *dev, * with different crtcs which isn't possible on the hardware * side and leaves no crtcs for LVDS or VGA. */ - if ((dev->pdev->device == 0x95c4) && + if (((dev->pdev->device == 0x95c4) || (dev->pdev->device == 0x9591)) && (dev->pdev->subsystem_vendor == 0x1025) && (dev->pdev->subsystem_device == 0x013c)) { if ((*connector_type == DRM_MODE_CONNECTOR_DVII) && -- cgit v1.1 From 9fbdaeb4f4dd14a0caa9fc35c496d5440c251a3a Mon Sep 17 00:00:00 2001 From: Manoj Iyer Date: Sun, 8 May 2011 18:04:29 -0400 Subject: thinkpad-acpi: module autoloading for newer Lenovo ThinkPads. The newer Lenovo ThinkPads have HKEY HID of LEN0068 instead of IBM0068. Added new HID so that thinkpad_acpi module will auto load on these newer Lenovo ThinkPads. Acked-by: Henrique de Moraes Holschuh Cc: stable@kernel.org Signed-off-by: Manoj Iyer Signed-off-by: Andy Lutomirski Signed-off-by: Matthew Garrett --- drivers/platform/x86/thinkpad_acpi.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/platform/x86/thinkpad_acpi.c b/drivers/platform/x86/thinkpad_acpi.c index efb3b6b..562fcf0 100644 --- a/drivers/platform/x86/thinkpad_acpi.c +++ b/drivers/platform/x86/thinkpad_acpi.c @@ -128,7 +128,8 @@ enum { }; /* ACPI HIDs */ -#define TPACPI_ACPI_HKEY_HID "IBM0068" +#define TPACPI_ACPI_IBM_HKEY_HID "IBM0068" +#define TPACPI_ACPI_LENOVO_HKEY_HID "LEN0068" #define TPACPI_ACPI_EC_HID "PNP0C09" /* Input IDs */ @@ -3879,7 +3880,8 @@ errexit: } static const struct acpi_device_id ibm_htk_device_ids[] = { - {TPACPI_ACPI_HKEY_HID, 0}, + {TPACPI_ACPI_IBM_HKEY_HID, 0}, + {TPACPI_ACPI_LENOVO_HKEY_HID, 0}, {"", 0}, }; -- cgit v1.1 From 6192fa7109fb33591fa1078c8c1981e39da02d2d Mon Sep 17 00:00:00 2001 From: Mattia Dongili Date: Tue, 5 Apr 2011 23:38:36 +0900 Subject: sony-laptop: report failures on setting LCD brightness Check if we were successful in setting the requested brightness and report failure in that case. Signed-off-by: Mattia Dongili Signed-off-by: Matthew Garrett --- drivers/platform/x86/sony-laptop.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/platform/x86/sony-laptop.c b/drivers/platform/x86/sony-laptop.c index 8f709ae..9d80ae4 100644 --- a/drivers/platform/x86/sony-laptop.c +++ b/drivers/platform/x86/sony-laptop.c @@ -966,9 +966,10 @@ static int sony_nc_update_status_ng(struct backlight_device *bd) int *handle = (int *)bl_get_data(bd); value = bd->props.brightness; - sony_call_snc_handle(*handle, 0x0100 | (value << 16), &result); + if (sony_call_snc_handle(*handle, 0x0100 | (value << 16), &result)) + return -EIO; - return sony_nc_get_brightness_ng(bd); + return value; } static const struct backlight_ops sony_backlight_ops = { -- cgit v1.1 From 62d2f23e8bce3e7da4db53928e810fc8a474ce70 Mon Sep 17 00:00:00 2001 From: Mattia Dongili Date: Mon, 9 May 2011 10:20:29 -0400 Subject: [PATCH] sony-laptop: limit brightness range to DSDT provided ones The new style brightness control provides an operating range of 9 values (seems consistent over a large number of models sharing the same brightness control methods). Read and use the minimum and maximum values to limit the backlight interface between those boundaries. Signed-off-by: Mattia Dongili Signed-off-by: Matthew Garrett --- drivers/platform/x86/sony-laptop.c | 127 ++++++++++++++++++++++++++++++------- 1 file changed, 103 insertions(+), 24 deletions(-) diff --git a/drivers/platform/x86/sony-laptop.c b/drivers/platform/x86/sony-laptop.c index 9d80ae4..6fe8cd6 100644 --- a/drivers/platform/x86/sony-laptop.c +++ b/drivers/platform/x86/sony-laptop.c @@ -934,6 +934,14 @@ static ssize_t sony_nc_sysfs_store(struct device *dev, /* * Backlight device */ +struct sony_backlight_props { + struct backlight_device *dev; + int handle; + u8 offset; + u8 maxlvl; +}; +struct sony_backlight_props sony_bl_props; + static int sony_backlight_update_status(struct backlight_device *bd) { return acpi_callsetfunc(sony_nc_acpi_handle, "SBRT", @@ -954,19 +962,23 @@ static int sony_nc_get_brightness_ng(struct backlight_device *bd) { int result; int *handle = (int *)bl_get_data(bd); + struct sony_backlight_props *sdev = + (struct sony_backlight_props *)bl_get_data(bd); - sony_call_snc_handle(*handle, 0x0200, &result); + sony_call_snc_handle(sdev->handle, 0x0200, &result); - return result & 0xff; + return (result & 0xff) - sdev->offset; } static int sony_nc_update_status_ng(struct backlight_device *bd) { int value, result; int *handle = (int *)bl_get_data(bd); + struct sony_backlight_props *sdev = + (struct sony_backlight_props *)bl_get_data(bd); - value = bd->props.brightness; - if (sony_call_snc_handle(*handle, 0x0100 | (value << 16), &result)) + value = bd->props.brightness + sdev->offset; + if (sony_call_snc_handle(sdev->handle, 0x0100 | (value << 16), &result)) return -EIO; return value; @@ -982,8 +994,6 @@ static const struct backlight_ops sony_backlight_ng_ops = { .update_status = sony_nc_update_status_ng, .get_brightness = sony_nc_get_brightness_ng, }; -static int backlight_ng_handle; -static struct backlight_device *sony_backlight_device; /* * New SNC-only Vaios event mapping to driver known keys @@ -1550,6 +1560,75 @@ static void sony_nc_kbd_backlight_resume(void) &ignore); } +static void sony_nc_backlight_ng_read_limits(int handle, + struct sony_backlight_props *props) +{ + int offset; + acpi_status status; + u8 brlvl, i; + u8 min = 0xff, max = 0x00; + struct acpi_object_list params; + union acpi_object in_obj; + union acpi_object *lvl_enum; + struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; + + props->handle = handle; + props->offset = 0; + props->maxlvl = 0xff; + + offset = sony_find_snc_handle(handle); + if (offset < 0) + return; + + /* try to read the boundaries from ACPI tables, if we fail the above + * defaults should be reasonable + */ + params.count = 1; + params.pointer = &in_obj; + in_obj.type = ACPI_TYPE_INTEGER; + in_obj.integer.value = offset; + status = acpi_evaluate_object(sony_nc_acpi_handle, "SN06", ¶ms, + &buffer); + if (ACPI_FAILURE(status)) + return; + + lvl_enum = (union acpi_object *) buffer.pointer; + if (!lvl_enum) { + pr_err("No SN06 return object."); + return; + } + if (lvl_enum->type != ACPI_TYPE_BUFFER) { + pr_err("Invalid SN06 return object 0x%.2x\n", + lvl_enum->type); + goto out_invalid; + } + + /* the buffer lists brightness levels available, brightness levels are + * from 0 to 8 in the array, other values are used by ALS control. + */ + for (i = 0; i < 9 && i < lvl_enum->buffer.length; i++) { + + brlvl = *(lvl_enum->buffer.pointer + i); + dprintk("Brightness level: %d\n", brlvl); + + if (!brlvl) + break; + + if (brlvl > max) + max = brlvl; + if (brlvl < min) + min = brlvl; + } + props->offset = min; + props->maxlvl = max; + dprintk("Brightness levels: min=%d max=%d\n", props->offset, + props->maxlvl); + +out_invalid: + kfree(buffer.pointer); + return; +} + static void sony_nc_backlight_setup(void) { acpi_handle unused; @@ -1558,14 +1637,14 @@ static void sony_nc_backlight_setup(void) struct backlight_properties props; if (sony_find_snc_handle(0x12f) != -1) { - backlight_ng_handle = 0x12f; ops = &sony_backlight_ng_ops; - max_brightness = 0xff; + sony_nc_backlight_ng_read_limits(0x12f, &sony_bl_props); + max_brightness = sony_bl_props.maxlvl - sony_bl_props.offset; } else if (sony_find_snc_handle(0x137) != -1) { - backlight_ng_handle = 0x137; ops = &sony_backlight_ng_ops; - max_brightness = 0xff; + sony_nc_backlight_ng_read_limits(0x137, &sony_bl_props); + max_brightness = sony_bl_props.maxlvl - sony_bl_props.offset; } else if (ACPI_SUCCESS(acpi_get_handle(sony_nc_acpi_handle, "GBRT", &unused))) { @@ -1578,22 +1657,22 @@ static void sony_nc_backlight_setup(void) memset(&props, 0, sizeof(struct backlight_properties)); props.type = BACKLIGHT_PLATFORM; props.max_brightness = max_brightness; - sony_backlight_device = backlight_device_register("sony", NULL, - &backlight_ng_handle, - ops, &props); + sony_bl_props.dev = backlight_device_register("sony", NULL, + &sony_bl_props, + ops, &props); - if (IS_ERR(sony_backlight_device)) { - pr_warning(DRV_PFX "unable to register backlight device\n"); - sony_backlight_device = NULL; + if (IS_ERR(sony_bl_props.dev)) { + pr_warn(DRV_PFX "unable to register backlight device\n"); + sony_bl_props.dev = NULL; } else - sony_backlight_device->props.brightness = - ops->get_brightness(sony_backlight_device); + sony_bl_props.dev->props.brightness = + ops->get_brightness(sony_bl_props.dev); } static void sony_nc_backlight_cleanup(void) { - if (sony_backlight_device) - backlight_device_unregister(sony_backlight_device); + if (sony_bl_props.dev) + backlight_device_unregister(sony_bl_props.dev); } static int sony_nc_add(struct acpi_device *device) @@ -2591,7 +2670,7 @@ static long sonypi_misc_ioctl(struct file *fp, unsigned int cmd, mutex_lock(&spic_dev.lock); switch (cmd) { case SONYPI_IOCGBRT: - if (sony_backlight_device == NULL) { + if (sony_bl_props.dev == NULL) { ret = -EIO; break; } @@ -2604,7 +2683,7 @@ static long sonypi_misc_ioctl(struct file *fp, unsigned int cmd, ret = -EFAULT; break; case SONYPI_IOCSBRT: - if (sony_backlight_device == NULL) { + if (sony_bl_props.dev == NULL) { ret = -EIO; break; } @@ -2618,8 +2697,8 @@ static long sonypi_misc_ioctl(struct file *fp, unsigned int cmd, break; } /* sync the backlight device status */ - sony_backlight_device->props.brightness = - sony_backlight_get_brightness(sony_backlight_device); + sony_bl_props.dev->props.brightness = + sony_backlight_get_brightness(sony_bl_props.dev); break; case SONYPI_IOCGBAT1CAP: if (ec_read16(SONYPI_BAT1_FULL, &val16)) { -- cgit v1.1 From 14fdb152416c0fab80ecddf492c129d7da1bb8ef Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Mon, 9 May 2011 10:44:01 -0400 Subject: eeepc-laptop: Use ACPI handle to identify rfkill port The ACPI notification we get from rfkill events on these machines gives us all the information we need to identify the port that's changed. Do so rather than assuming that it's always bus 1. Signed-off-by: Matthew Garrett --- drivers/platform/x86/eeepc-laptop.c | 57 ++++++++++++++++++++++++++----------- 1 file changed, 41 insertions(+), 16 deletions(-) diff --git a/drivers/platform/x86/eeepc-laptop.c b/drivers/platform/x86/eeepc-laptop.c index 5f2dd38..2c1abf6 100644 --- a/drivers/platform/x86/eeepc-laptop.c +++ b/drivers/platform/x86/eeepc-laptop.c @@ -585,8 +585,9 @@ static bool eeepc_wlan_rfkill_blocked(struct eeepc_laptop *eeepc) return true; } -static void eeepc_rfkill_hotplug(struct eeepc_laptop *eeepc) +static void eeepc_rfkill_hotplug(struct eeepc_laptop *eeepc, acpi_handle handle) { + struct pci_dev *port; struct pci_dev *dev; struct pci_bus *bus; bool blocked = eeepc_wlan_rfkill_blocked(eeepc); @@ -599,9 +600,16 @@ static void eeepc_rfkill_hotplug(struct eeepc_laptop *eeepc) mutex_lock(&eeepc->hotplug_lock); if (eeepc->hotplug_slot) { - bus = pci_find_bus(0, 1); + port = acpi_get_pci_dev(handle); + if (!port) { + pr_warning("Unable to find port\n"); + goto out_unlock; + } + + bus = port->subordinate; + if (!bus) { - pr_warning("Unable to find PCI bus 1?\n"); + pr_warning("Unable to find PCI bus?\n"); goto out_unlock; } @@ -609,6 +617,7 @@ static void eeepc_rfkill_hotplug(struct eeepc_laptop *eeepc) pr_err("Unable to read PCI config space?\n"); goto out_unlock; } + absent = (l == 0xffffffff); if (blocked != absent) { @@ -647,6 +656,17 @@ out_unlock: mutex_unlock(&eeepc->hotplug_lock); } +static void eeepc_rfkill_hotplug_update(struct eeepc_laptop *eeepc, char *node) +{ + acpi_status status = AE_OK; + acpi_handle handle; + + status = acpi_get_handle(NULL, node, &handle); + + if (ACPI_SUCCESS(status)) + eeepc_rfkill_hotplug(eeepc, handle); +} + static void eeepc_rfkill_notify(acpi_handle handle, u32 event, void *data) { struct eeepc_laptop *eeepc = data; @@ -654,7 +674,7 @@ static void eeepc_rfkill_notify(acpi_handle handle, u32 event, void *data) if (event != ACPI_NOTIFY_BUS_CHECK) return; - eeepc_rfkill_hotplug(eeepc); + eeepc_rfkill_hotplug(eeepc, handle); } static int eeepc_register_rfkill_notifier(struct eeepc_laptop *eeepc, @@ -672,6 +692,11 @@ static int eeepc_register_rfkill_notifier(struct eeepc_laptop *eeepc, eeepc); if (ACPI_FAILURE(status)) pr_warning("Failed to register notify on %s\n", node); + /* + * Refresh pci hotplug in case the rfkill state was + * changed during setup. + */ + eeepc_rfkill_hotplug(eeepc, handle); } else return -ENODEV; @@ -693,6 +718,12 @@ static void eeepc_unregister_rfkill_notifier(struct eeepc_laptop *eeepc, if (ACPI_FAILURE(status)) pr_err("Error removing rfkill notify handler %s\n", node); + /* + * Refresh pci hotplug in case the rfkill + * state was changed after + * eeepc_unregister_rfkill_notifier() + */ + eeepc_rfkill_hotplug(eeepc, handle); } } @@ -816,11 +847,7 @@ static void eeepc_rfkill_exit(struct eeepc_laptop *eeepc) rfkill_destroy(eeepc->wlan_rfkill); eeepc->wlan_rfkill = NULL; } - /* - * Refresh pci hotplug in case the rfkill state was changed after - * eeepc_unregister_rfkill_notifier() - */ - eeepc_rfkill_hotplug(eeepc); + if (eeepc->hotplug_slot) pci_hp_deregister(eeepc->hotplug_slot); @@ -889,11 +916,6 @@ static int eeepc_rfkill_init(struct eeepc_laptop *eeepc) eeepc_register_rfkill_notifier(eeepc, "\\_SB.PCI0.P0P5"); eeepc_register_rfkill_notifier(eeepc, "\\_SB.PCI0.P0P6"); eeepc_register_rfkill_notifier(eeepc, "\\_SB.PCI0.P0P7"); - /* - * Refresh pci hotplug in case the rfkill state was changed during - * setup. - */ - eeepc_rfkill_hotplug(eeepc); exit: if (result && result != -ENODEV) @@ -928,8 +950,11 @@ static int eeepc_hotk_restore(struct device *device) struct eeepc_laptop *eeepc = dev_get_drvdata(device); /* Refresh both wlan rfkill state and pci hotplug */ - if (eeepc->wlan_rfkill) - eeepc_rfkill_hotplug(eeepc); + if (eeepc->wlan_rfkill) { + eeepc_rfkill_hotplug_update(eeepc, "\\_SB.PCI0.P0P5"); + eeepc_rfkill_hotplug_update(eeepc, "\\_SB.PCI0.P0P6"); + eeepc_rfkill_hotplug_update(eeepc, "\\_SB.PCI0.P0P7"); + } if (eeepc->bluetooth_rfkill) rfkill_set_sw_state(eeepc->bluetooth_rfkill, -- cgit v1.1 From 637b424bf8747e50bab6648ab919632d6efd6c28 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Sun, 8 May 2011 20:42:44 +0200 Subject: HPFS: Make HPFS compile on preempt and SMP Make HPFS compile on preempt and SMP Signed-off-by: Mikulas Patocka Signed-off-by: Linus Torvalds --- fs/hpfs/Kconfig | 1 - fs/hpfs/super.c | 5 ----- 2 files changed, 6 deletions(-) diff --git a/fs/hpfs/Kconfig b/fs/hpfs/Kconfig index 0c39dc3..56bd15c 100644 --- a/fs/hpfs/Kconfig +++ b/fs/hpfs/Kconfig @@ -1,7 +1,6 @@ config HPFS_FS tristate "OS/2 HPFS file system support" depends on BLOCK - depends on BROKEN || !PREEMPT help OS/2 is IBM's operating system for PC's, the same as Warp, and HPFS is the file system used for organizing files on OS/2 hard disk diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index c89b408..501ea86 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -479,11 +479,6 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent) int o; - if (num_possible_cpus() > 1) { - printk(KERN_ERR "HPFS is not SMP safe\n"); - return -EINVAL; - } - save_mount_options(s, options); sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); -- cgit v1.1 From 7dd29d8d865efdb00c0542a5d2c87af8c52ea6c7 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Sun, 8 May 2011 20:42:54 +0200 Subject: HPFS: Introduce a global mutex and lock it on every callback from VFS. Introduce a global mutex and lock it on every callback from VFS. Performance doesn't matter, reviewing the whole code for locking correctness would be too complicated, so simply lock it all. Signed-off-by: Mikulas Patocka Signed-off-by: Linus Torvalds --- fs/hpfs/buffer.c | 8 ++++++++ fs/hpfs/file.c | 27 +++++++++++++++++++-------- fs/hpfs/hpfs_fn.h | 24 +++++++++++++++--------- fs/hpfs/super.c | 10 +++++++++- 4 files changed, 51 insertions(+), 18 deletions(-) diff --git a/fs/hpfs/buffer.c b/fs/hpfs/buffer.c index 793cb9d..7cef5d5 100644 --- a/fs/hpfs/buffer.c +++ b/fs/hpfs/buffer.c @@ -32,6 +32,8 @@ void *hpfs_map_sector(struct super_block *s, unsigned secno, struct buffer_head { struct buffer_head *bh; + hpfs_lock_assert(s); + cond_resched(); *bhp = bh = sb_bread(s, secno); @@ -50,6 +52,8 @@ void *hpfs_get_sector(struct super_block *s, unsigned secno, struct buffer_head struct buffer_head *bh; /*return hpfs_map_sector(s, secno, bhp, 0);*/ + hpfs_lock_assert(s); + cond_resched(); if ((*bhp = bh = sb_getblk(s, secno)) != NULL) { @@ -70,6 +74,8 @@ void *hpfs_map_4sectors(struct super_block *s, unsigned secno, struct quad_buffe struct buffer_head *bh; char *data; + hpfs_lock_assert(s); + cond_resched(); if (secno & 3) { @@ -125,6 +131,8 @@ void *hpfs_get_4sectors(struct super_block *s, unsigned secno, { cond_resched(); + hpfs_lock_assert(s); + if (secno & 3) { printk("HPFS: hpfs_get_4sectors: unaligned read\n"); return NULL; diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c index 9b9eb69..09a642f 100644 --- a/fs/hpfs/file.c +++ b/fs/hpfs/file.c @@ -48,38 +48,46 @@ static secno hpfs_bmap(struct inode *inode, unsigned file_secno) static void hpfs_truncate(struct inode *i) { if (IS_IMMUTABLE(i)) return /*-EPERM*/; - hpfs_lock(i->i_sb); + hpfs_lock_assert(i->i_sb); + hpfs_i(i)->i_n_secs = 0; i->i_blocks = 1 + ((i->i_size + 511) >> 9); hpfs_i(i)->mmu_private = i->i_size; hpfs_truncate_btree(i->i_sb, i->i_ino, 1, ((i->i_size + 511) >> 9)); hpfs_write_inode(i); hpfs_i(i)->i_n_secs = 0; - hpfs_unlock(i->i_sb); } static int hpfs_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { + int r; secno s; + hpfs_lock(inode->i_sb); s = hpfs_bmap(inode, iblock); if (s) { map_bh(bh_result, inode->i_sb, s); - return 0; + goto ret_0; } - if (!create) return 0; + if (!create) goto ret_0; if (iblock<<9 != hpfs_i(inode)->mmu_private) { BUG(); - return -EIO; + r = -EIO; + goto ret_r; } if ((s = hpfs_add_sector_to_btree(inode->i_sb, inode->i_ino, 1, inode->i_blocks - 1)) == -1) { hpfs_truncate_btree(inode->i_sb, inode->i_ino, 1, inode->i_blocks - 1); - return -ENOSPC; + r = -ENOSPC; + goto ret_r; } inode->i_blocks++; hpfs_i(inode)->mmu_private += 512; set_buffer_new(bh_result); map_bh(bh_result, inode->i_sb, s); - return 0; + ret_0: + r = 0; + ret_r: + hpfs_unlock(inode->i_sb); + return r; } static int hpfs_writepage(struct page *page, struct writeback_control *wbc) @@ -130,8 +138,11 @@ static ssize_t hpfs_file_write(struct file *file, const char __user *buf, ssize_t retval; retval = do_sync_write(file, buf, count, ppos); - if (retval > 0) + if (retval > 0) { + hpfs_lock(file->f_path.dentry->d_sb); hpfs_i(file->f_path.dentry->d_inode)->i_dirty = 1; + hpfs_unlock(file->f_path.dentry->d_sb); + } return retval; } diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h index c15adbc..89a4714 100644 --- a/fs/hpfs/hpfs_fn.h +++ b/fs/hpfs/hpfs_fn.h @@ -63,6 +63,7 @@ struct hpfs_inode_info { }; struct hpfs_sb_info { + struct mutex hpfs_mutex; /* global hpfs lock */ ino_t sb_root; /* inode number of root dir */ unsigned sb_fs_size; /* file system size, sectors */ unsigned sb_bitmaps; /* sector number of bitmap list */ @@ -346,21 +347,26 @@ static inline time32_t gmt_to_local(struct super_block *s, time_t t) /* * Locking: * - * hpfs_lock() is a leftover from the big kernel lock. - * Right now, these functions are empty and only left - * for documentation purposes. The file system no longer - * works on SMP systems, so the lock is not needed - * any more. + * hpfs_lock() locks the whole filesystem. It must be taken + * on any method called by the VFS. * - * If someone is interested in making it work again, this - * would be the place to start by adding a per-superblock - * mutex and fixing all the bugs and performance issues - * caused by that. + * We don't do any per-file locking anymore, it is hard to + * review and HPFS is not performance-sensitive anyway. */ static inline void hpfs_lock(struct super_block *s) { + struct hpfs_sb_info *sbi = hpfs_sb(s); + mutex_lock(&sbi->hpfs_mutex); } static inline void hpfs_unlock(struct super_block *s) { + struct hpfs_sb_info *sbi = hpfs_sb(s); + mutex_unlock(&sbi->hpfs_mutex); +} + +static inline void hpfs_lock_assert(struct super_block *s) +{ + struct hpfs_sb_info *sbi = hpfs_sb(s); + WARN_ON(!mutex_is_locked(&sbi->hpfs_mutex)); } diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index 501ea86..41232c2 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -102,9 +102,12 @@ static void hpfs_put_super(struct super_block *s) { struct hpfs_sb_info *sbi = hpfs_sb(s); + hpfs_lock(s); + unmark_dirty(s); + hpfs_unlock(s); + kfree(sbi->sb_cp_table); kfree(sbi->sb_bmp_dir); - unmark_dirty(s); s->s_fs_info = NULL; kfree(sbi); } @@ -490,6 +493,9 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent) sbi->sb_bmp_dir = NULL; sbi->sb_cp_table = NULL; + mutex_init(&sbi->hpfs_mutex); + hpfs_lock(s); + mutex_init(&sbi->hpfs_creation_de); uid = current_uid(); @@ -669,6 +675,7 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent) root->i_blocks = 5; hpfs_brelse4(&qbh); } + hpfs_unlock(s); return 0; bail4: brelse(bh2); @@ -676,6 +683,7 @@ bail3: brelse(bh1); bail2: brelse(bh0); bail1: bail0: + hpfs_unlock(s); kfree(sbi->sb_bmp_dir); kfree(sbi->sb_cp_table); s->s_fs_info = NULL; -- cgit v1.1 From 7d23ce36e3f52f9b83ac8da49296b73339c8b5b8 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Sun, 8 May 2011 20:43:06 +0200 Subject: HPFS: Remove remaining locks Remove remaining locks Because of a new global per-fs lock, no other locks are needed Signed-off-by: Mikulas Patocka Signed-off-by: Linus Torvalds --- fs/hpfs/alloc.c | 50 +++++++++++------------------------------------- fs/hpfs/anode.c | 2 +- fs/hpfs/buffer.c | 16 ---------------- fs/hpfs/dnode.c | 57 ++++++++++++++++++++++++------------------------------- fs/hpfs/ea.c | 6 +++--- fs/hpfs/hpfs_fn.h | 12 +++--------- fs/hpfs/inode.c | 5 ----- fs/hpfs/namei.c | 53 +++++++-------------------------------------------- fs/hpfs/super.c | 4 ---- 9 files changed, 50 insertions(+), 155 deletions(-) diff --git a/fs/hpfs/alloc.c b/fs/hpfs/alloc.c index 5503e2c..995472d 100644 --- a/fs/hpfs/alloc.c +++ b/fs/hpfs/alloc.c @@ -8,8 +8,6 @@ #include "hpfs_fn.h" -static int hpfs_alloc_if_possible_nolock(struct super_block *s, secno sec); - /* * Check if a sector is allocated in bitmap * This is really slow. Turned on only if chk==2 @@ -75,7 +73,6 @@ static secno alloc_in_bmp(struct super_block *s, secno near, unsigned n, unsigne hpfs_error(s, "Bad allocation size: %d", n); return 0; } - lock_super(s); if (bs != ~0x3fff) { if (!(bmp = hpfs_map_bitmap(s, near >> 14, &qbh, "aib"))) goto uls; } else { @@ -143,7 +140,6 @@ static secno alloc_in_bmp(struct super_block *s, secno near, unsigned n, unsigne b: hpfs_brelse4(&qbh); uls: - unlock_super(s); return ret; } @@ -155,7 +151,7 @@ static secno alloc_in_bmp(struct super_block *s, secno near, unsigned n, unsigne * sectors */ -secno hpfs_alloc_sector(struct super_block *s, secno near, unsigned n, int forward, int lock) +secno hpfs_alloc_sector(struct super_block *s, secno near, unsigned n, int forward) { secno sec; int i; @@ -167,7 +163,6 @@ secno hpfs_alloc_sector(struct super_block *s, secno near, unsigned n, int forwa forward = -forward; f_p = 1; } - if (lock) hpfs_lock_creation(s); n_bmps = (sbi->sb_fs_size + 0x4000 - 1) >> 14; if (near && near < sbi->sb_fs_size) { if ((sec = alloc_in_bmp(s, near, n, f_p ? forward : forward/4))) goto ret; @@ -214,18 +209,17 @@ secno hpfs_alloc_sector(struct super_block *s, secno near, unsigned n, int forwa ret: if (sec && f_p) { for (i = 0; i < forward; i++) { - if (!hpfs_alloc_if_possible_nolock(s, sec + i + 1)) { + if (!hpfs_alloc_if_possible(s, sec + i + 1)) { hpfs_error(s, "Prealloc doesn't work! Wanted %d, allocated at %08x, can't allocate %d", forward, sec, i); sec = 0; break; } } } - if (lock) hpfs_unlock_creation(s); return sec; } -static secno alloc_in_dirband(struct super_block *s, secno near, int lock) +static secno alloc_in_dirband(struct super_block *s, secno near) { unsigned nr = near; secno sec; @@ -236,43 +230,29 @@ static secno alloc_in_dirband(struct super_block *s, secno near, int lock) nr = sbi->sb_dirband_start + sbi->sb_dirband_size - 4; nr -= sbi->sb_dirband_start; nr >>= 2; - if (lock) hpfs_lock_creation(s); sec = alloc_in_bmp(s, (~0x3fff) | nr, 1, 0); - if (lock) hpfs_unlock_creation(s); if (!sec) return 0; return ((sec & 0x3fff) << 2) + sbi->sb_dirband_start; } /* Alloc sector if it's free */ -static int hpfs_alloc_if_possible_nolock(struct super_block *s, secno sec) +int hpfs_alloc_if_possible(struct super_block *s, secno sec) { struct quad_buffer_head qbh; unsigned *bmp; - lock_super(s); if (!(bmp = hpfs_map_bitmap(s, sec >> 14, &qbh, "aip"))) goto end; if (bmp[(sec & 0x3fff) >> 5] & (1 << (sec & 0x1f))) { bmp[(sec & 0x3fff) >> 5] &= ~(1 << (sec & 0x1f)); hpfs_mark_4buffers_dirty(&qbh); hpfs_brelse4(&qbh); - unlock_super(s); return 1; } hpfs_brelse4(&qbh); end: - unlock_super(s); return 0; } -int hpfs_alloc_if_possible(struct super_block *s, secno sec) -{ - int r; - hpfs_lock_creation(s); - r = hpfs_alloc_if_possible_nolock(s, sec); - hpfs_unlock_creation(s); - return r; -} - /* Free sectors in bitmaps */ void hpfs_free_sectors(struct super_block *s, secno sec, unsigned n) @@ -286,26 +266,22 @@ void hpfs_free_sectors(struct super_block *s, secno sec, unsigned n) hpfs_error(s, "Trying to free reserved sector %08x", sec); return; } - lock_super(s); sbi->sb_max_fwd_alloc += n > 0xffff ? 0xffff : n; if (sbi->sb_max_fwd_alloc > 0xffffff) sbi->sb_max_fwd_alloc = 0xffffff; new_map: if (!(bmp = hpfs_map_bitmap(s, sec >> 14, &qbh, "free"))) { - unlock_super(s); return; } new_tst: if ((bmp[(sec & 0x3fff) >> 5] >> (sec & 0x1f) & 1)) { hpfs_error(s, "sector %08x not allocated", sec); hpfs_brelse4(&qbh); - unlock_super(s); return; } bmp[(sec & 0x3fff) >> 5] |= 1 << (sec & 0x1f); if (!--n) { hpfs_mark_4buffers_dirty(&qbh); hpfs_brelse4(&qbh); - unlock_super(s); return; } if (!(++sec & 0x3fff)) { @@ -381,29 +357,25 @@ void hpfs_free_dnode(struct super_block *s, dnode_secno dno) struct quad_buffer_head qbh; unsigned *bmp; unsigned ssec = (dno - hpfs_sb(s)->sb_dirband_start) / 4; - lock_super(s); if (!(bmp = hpfs_map_dnode_bitmap(s, &qbh))) { - unlock_super(s); return; } bmp[ssec >> 5] |= 1 << (ssec & 0x1f); hpfs_mark_4buffers_dirty(&qbh); hpfs_brelse4(&qbh); - unlock_super(s); } } struct dnode *hpfs_alloc_dnode(struct super_block *s, secno near, - dnode_secno *dno, struct quad_buffer_head *qbh, - int lock) + dnode_secno *dno, struct quad_buffer_head *qbh) { struct dnode *d; if (hpfs_count_one_bitmap(s, hpfs_sb(s)->sb_dmap) > FREE_DNODES_ADD) { - if (!(*dno = alloc_in_dirband(s, near, lock))) - if (!(*dno = hpfs_alloc_sector(s, near, 4, 0, lock))) return NULL; + if (!(*dno = alloc_in_dirband(s, near))) + if (!(*dno = hpfs_alloc_sector(s, near, 4, 0))) return NULL; } else { - if (!(*dno = hpfs_alloc_sector(s, near, 4, 0, lock))) - if (!(*dno = alloc_in_dirband(s, near, lock))) return NULL; + if (!(*dno = hpfs_alloc_sector(s, near, 4, 0))) + if (!(*dno = alloc_in_dirband(s, near))) return NULL; } if (!(d = hpfs_get_4sectors(s, *dno, qbh))) { hpfs_free_dnode(s, *dno); @@ -424,7 +396,7 @@ struct fnode *hpfs_alloc_fnode(struct super_block *s, secno near, fnode_secno *f struct buffer_head **bh) { struct fnode *f; - if (!(*fno = hpfs_alloc_sector(s, near, 1, FNODE_ALLOC_FWD, 1))) return NULL; + if (!(*fno = hpfs_alloc_sector(s, near, 1, FNODE_ALLOC_FWD))) return NULL; if (!(f = hpfs_get_sector(s, *fno, bh))) { hpfs_free_sectors(s, *fno, 1); return NULL; @@ -441,7 +413,7 @@ struct anode *hpfs_alloc_anode(struct super_block *s, secno near, anode_secno *a struct buffer_head **bh) { struct anode *a; - if (!(*ano = hpfs_alloc_sector(s, near, 1, ANODE_ALLOC_FWD, 1))) return NULL; + if (!(*ano = hpfs_alloc_sector(s, near, 1, ANODE_ALLOC_FWD))) return NULL; if (!(a = hpfs_get_sector(s, *ano, bh))) { hpfs_free_sectors(s, *ano, 1); return NULL; diff --git a/fs/hpfs/anode.c b/fs/hpfs/anode.c index 6a2f04b..f2a0384 100644 --- a/fs/hpfs/anode.c +++ b/fs/hpfs/anode.c @@ -115,7 +115,7 @@ secno hpfs_add_sector_to_btree(struct super_block *s, secno node, int fnod, unsi } se = !fnod ? node : (node + 16384) & ~16383; } - if (!(se = hpfs_alloc_sector(s, se, 1, fsecno*ALLOC_M>ALLOC_FWD_MAX ? ALLOC_FWD_MAX : fsecno*ALLOC_MALLOC_FWD_MAX ? ALLOC_FWD_MAX : fsecno*ALLOC_M #include "hpfs_fn.h" -void hpfs_lock_creation(struct super_block *s) -{ -#ifdef DEBUG_LOCKS - printk("lock creation\n"); -#endif - mutex_lock(&hpfs_sb(s)->hpfs_creation_de); -} - -void hpfs_unlock_creation(struct super_block *s) -{ -#ifdef DEBUG_LOCKS - printk("unlock creation\n"); -#endif - mutex_unlock(&hpfs_sb(s)->hpfs_creation_de); -} - /* Map a sector into a buffer and return pointers to it and to the buffer. */ void *hpfs_map_sector(struct super_block *s, unsigned secno, struct buffer_head **bhp, diff --git a/fs/hpfs/dnode.c b/fs/hpfs/dnode.c index 9b2ffad..07711c3 100644 --- a/fs/hpfs/dnode.c +++ b/fs/hpfs/dnode.c @@ -145,9 +145,10 @@ static void set_last_pointer(struct super_block *s, struct dnode *d, dnode_secno } } if (ptr) { - if ((d->first_free += 4) > 2048) { - hpfs_error(s,"set_last_pointer: too long dnode %08x", d->self); - d->first_free -= 4; + d->first_free = cpu_to_le32(le32_to_cpu(d->first_free) + 4); + if (le32_to_cpu(d->first_free) > 2048) { + hpfs_error(s, "set_last_pointer: too long dnode %08x", d->self); + d->first_free = cpu_to_le32(le32_to_cpu(d->first_free) - 4); return; } de->length = 36; @@ -184,7 +185,7 @@ struct hpfs_dirent *hpfs_add_de(struct super_block *s, struct dnode *d, de->not_8x3 = hpfs_is_name_long(name, namelen); de->namelen = namelen; memcpy(de->name, name, namelen); - d->first_free += d_size; + d->first_free = cpu_to_le32(le32_to_cpu(d->first_free) + d_size); return de; } @@ -197,8 +198,8 @@ static void hpfs_delete_de(struct super_block *s, struct dnode *d, hpfs_error(s, "attempt to delete last dirent in dnode %08x", d->self); return; } - d->first_free -= de->length; - memmove(de, de_next_de(de), d->first_free + (char *)d - (char *)de); + d->first_free = cpu_to_le32(le32_to_cpu(d->first_free) - de->length); + memmove(de, de_next_de(de), le32_to_cpu(d->first_free) + (char *)d - (char *)de); } static void fix_up_ptrs(struct super_block *s, struct dnode *d) @@ -262,7 +263,7 @@ static int hpfs_add_to_dnode(struct inode *i, dnode_secno dno, kfree(nname); return 1; } - if (d->first_free + de_size(namelen, down_ptr) <= 2048) { + if (le32_to_cpu(d->first_free) + de_size(namelen, down_ptr) <= 2048) { loff_t t; copy_de(de=hpfs_add_de(i->i_sb, d, name, namelen, down_ptr), new_de); t = get_pos(d, de); @@ -286,11 +287,11 @@ static int hpfs_add_to_dnode(struct inode *i, dnode_secno dno, kfree(nname); return 1; } - memcpy(nd, d, d->first_free); + memcpy(nd, d, le32_to_cpu(d->first_free)); copy_de(de = hpfs_add_de(i->i_sb, nd, name, namelen, down_ptr), new_de); for_all_poss(i, hpfs_pos_ins, get_pos(nd, de), 1); h = ((char *)dnode_last_de(nd) - (char *)nd) / 2 + 10; - if (!(ad = hpfs_alloc_dnode(i->i_sb, d->up, &adno, &qbh1, 0))) { + if (!(ad = hpfs_alloc_dnode(i->i_sb, d->up, &adno, &qbh1))) { hpfs_error(i->i_sb, "unable to alloc dnode - dnode tree will be corrupted"); hpfs_brelse4(&qbh); kfree(nd); @@ -313,9 +314,9 @@ static int hpfs_add_to_dnode(struct inode *i, dnode_secno dno, down_ptr = adno; set_last_pointer(i->i_sb, ad, de->down ? de_down_pointer(de) : 0); de = de_next_de(de); - memmove((char *)nd + 20, de, nd->first_free + (char *)nd - (char *)de); - nd->first_free -= (char *)de - (char *)nd - 20; - memcpy(d, nd, nd->first_free); + memmove((char *)nd + 20, de, le32_to_cpu(nd->first_free) + (char *)nd - (char *)de); + nd->first_free = cpu_to_le32(le32_to_cpu(nd->first_free) - (char *)de - (char *)nd - 20); + memcpy(d, nd, le32_to_cpu(nd->first_free)); for_all_poss(i, hpfs_pos_del, (loff_t)dno << 4, pos); fix_up_ptrs(i->i_sb, ad); if (!d->root_dnode) { @@ -326,7 +327,7 @@ static int hpfs_add_to_dnode(struct inode *i, dnode_secno dno, hpfs_brelse4(&qbh1); goto go_up; } - if (!(rd = hpfs_alloc_dnode(i->i_sb, d->up, &rdno, &qbh2, 0))) { + if (!(rd = hpfs_alloc_dnode(i->i_sb, d->up, &rdno, &qbh2))) { hpfs_error(i->i_sb, "unable to alloc dnode - dnode tree will be corrupted"); hpfs_brelse4(&qbh); hpfs_brelse4(&qbh1); @@ -373,7 +374,7 @@ static int hpfs_add_to_dnode(struct inode *i, dnode_secno dno, int hpfs_add_dirent(struct inode *i, const unsigned char *name, unsigned namelen, - struct hpfs_dirent *new_de, int cdepth) + struct hpfs_dirent *new_de) { struct hpfs_inode_info *hpfs_inode = hpfs_i(i); struct dnode *d; @@ -403,7 +404,6 @@ int hpfs_add_dirent(struct inode *i, } } hpfs_brelse4(&qbh); - if (!cdepth) hpfs_lock_creation(i->i_sb); if (hpfs_check_free_dnodes(i->i_sb, FREE_DNODES_ADD)) { c = 1; goto ret; @@ -411,7 +411,6 @@ int hpfs_add_dirent(struct inode *i, i->i_version++; c = hpfs_add_to_dnode(i, dno, name, namelen, new_de, 0); ret: - if (!cdepth) hpfs_unlock_creation(i->i_sb); return c; } @@ -474,7 +473,7 @@ static secno move_to_top(struct inode *i, dnode_secno from, dnode_secno to) hpfs_brelse4(&qbh); return 0; } - dnode->first_free -= 4; + dnode->first_free = cpu_to_le32(le32_to_cpu(dnode->first_free) - 4); de->length -= 4; de->down = 0; hpfs_mark_4buffers_dirty(&qbh); @@ -517,8 +516,8 @@ static void delete_empty_dnode(struct inode *i, dnode_secno dno) try_it_again: if (hpfs_stop_cycles(i->i_sb, dno, &c1, &c2, "delete_empty_dnode")) return; if (!(dnode = hpfs_map_dnode(i->i_sb, dno, &qbh))) return; - if (dnode->first_free > 56) goto end; - if (dnode->first_free == 52 || dnode->first_free == 56) { + if (le32_to_cpu(dnode->first_free) > 56) goto end; + if (le32_to_cpu(dnode->first_free) == 52 || le32_to_cpu(dnode->first_free) == 56) { struct hpfs_dirent *de_end; int root = dnode->root_dnode; up = dnode->up; @@ -571,9 +570,9 @@ static void delete_empty_dnode(struct inode *i, dnode_secno dno) if (!down) { de->down = 0; de->length -= 4; - dnode->first_free -= 4; + dnode->first_free = cpu_to_le32(le32_to_cpu(dnode->first_free) - 4); memmove(de_next_de(de), (char *)de_next_de(de) + 4, - (char *)dnode + dnode->first_free - (char *)de_next_de(de)); + (char *)dnode + le32_to_cpu(dnode->first_free) - (char *)de_next_de(de)); } else { struct dnode *d1; struct quad_buffer_head qbh1; @@ -585,7 +584,7 @@ static void delete_empty_dnode(struct inode *i, dnode_secno dno) } } } else { - hpfs_error(i->i_sb, "delete_empty_dnode: dnode %08x, first_free == %03x", dno, dnode->first_free); + hpfs_error(i->i_sb, "delete_empty_dnode: dnode %08x, first_free == %03x", dno, le32_to_cpu(dnode->first_free)); goto end; } @@ -635,7 +634,7 @@ static void delete_empty_dnode(struct inode *i, dnode_secno dno) struct hpfs_dirent *del = dnode_last_de(d1); dlp = del->down ? de_down_pointer(del) : 0; if (!dlp && down) { - if (d1->first_free > 2044) { + if (le32_to_cpu(d1->first_free) > 2044) { if (hpfs_sb(i->i_sb)->sb_chk >= 2) { printk("HPFS: warning: unbalanced dnode tree, see hpfs.txt 4 more info\n"); printk("HPFS: warning: terminating balancing operation\n"); @@ -649,12 +648,12 @@ static void delete_empty_dnode(struct inode *i, dnode_secno dno) } del->length += 4; del->down = 1; - d1->first_free += 4; + d1->first_free = cpu_to_le32(le32_to_cpu(d1->first_free) + 4); } if (dlp && !down) { del->length -= 4; del->down = 0; - d1->first_free -= 4; + d1->first_free = cpu_to_le32(le32_to_cpu(d1->first_free) - 4); } else if (down) *(dnode_secno *) ((void *) del + del->length - 4) = down; } else goto endm; @@ -670,7 +669,7 @@ static void delete_empty_dnode(struct inode *i, dnode_secno dno) if (!de_prev->down) { de_prev->length += 4; de_prev->down = 1; - dnode->first_free += 4; + dnode->first_free = cpu_to_le32(le32_to_cpu(dnode->first_free) + 4); } *(dnode_secno *) ((void *) de_prev + de_prev->length - 4) = ndown; hpfs_mark_4buffers_dirty(&qbh); @@ -701,7 +700,6 @@ int hpfs_remove_dirent(struct inode *i, dnode_secno dno, struct hpfs_dirent *de, { struct dnode *dnode = qbh->data; dnode_secno down = 0; - int lock = 0; loff_t t; if (de->first || de->last) { hpfs_error(i->i_sb, "hpfs_remove_dirent: attempt to delete first or last dirent in dnode %08x", dno); @@ -710,11 +708,8 @@ int hpfs_remove_dirent(struct inode *i, dnode_secno dno, struct hpfs_dirent *de, } if (de->down) down = de_down_pointer(de); if (depth && (de->down || (de == dnode_first_de(dnode) && de_next_de(de)->last))) { - lock = 1; - hpfs_lock_creation(i->i_sb); if (hpfs_check_free_dnodes(i->i_sb, FREE_DNODES_DEL)) { hpfs_brelse4(qbh); - hpfs_unlock_creation(i->i_sb); return 2; } } @@ -727,11 +722,9 @@ int hpfs_remove_dirent(struct inode *i, dnode_secno dno, struct hpfs_dirent *de, dnode_secno a = move_to_top(i, down, dno); for_all_poss(i, hpfs_pos_subst, 5, t); if (a) delete_empty_dnode(i, a); - if (lock) hpfs_unlock_creation(i->i_sb); return !a; } delete_empty_dnode(i, dno); - if (lock) hpfs_unlock_creation(i->i_sb); return 0; } diff --git a/fs/hpfs/ea.c b/fs/hpfs/ea.c index 45e53d9..1ac05bb 100644 --- a/fs/hpfs/ea.c +++ b/fs/hpfs/ea.c @@ -266,7 +266,7 @@ void hpfs_set_ea(struct inode *inode, struct fnode *fnode, const char *key, secno n; struct buffer_head *bh; char *data; - if (!(n = hpfs_alloc_sector(s, fno, 1, 0, 1))) return; + if (!(n = hpfs_alloc_sector(s, fno, 1, 0))) return; if (!(data = hpfs_get_sector(s, n, &bh))) { hpfs_free_sectors(s, n, 1); return; @@ -284,7 +284,7 @@ void hpfs_set_ea(struct inode *inode, struct fnode *fnode, const char *key, if (pos >= 30000) goto bail; while (((pos + 511) >> 9) > len) { if (!len) { - if (!(fnode->ea_secno = hpfs_alloc_sector(s, fno, 1, 0, 1))) + if (!(fnode->ea_secno = hpfs_alloc_sector(s, fno, 1, 0))) goto bail; fnode->ea_anode = 0; len++; @@ -312,7 +312,7 @@ void hpfs_set_ea(struct inode *inode, struct fnode *fnode, const char *key, fnode->ea_secno = a_s;*/ secno new_sec; int i; - if (!(new_sec = hpfs_alloc_sector(s, fno, 1, 1 - ((pos + 511) >> 9), 1))) + if (!(new_sec = hpfs_alloc_sector(s, fno, 1, 1 - ((pos + 511) >> 9)))) goto bail; for (i = 0; i < len; i++) { struct buffer_head *bh1, *bh2; diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h index 89a4714..860d09f 100644 --- a/fs/hpfs/hpfs_fn.h +++ b/fs/hpfs/hpfs_fn.h @@ -56,8 +56,6 @@ struct hpfs_inode_info { unsigned i_ea_uid : 1; /* file's uid is stored in ea */ unsigned i_ea_gid : 1; /* file's gid is stored in ea */ unsigned i_dirty : 1; - struct mutex i_mutex; - struct mutex i_parent_mutex; loff_t **i_rddir_off; struct inode vfs_inode; }; @@ -88,8 +86,6 @@ struct hpfs_sb_info { unsigned *sb_bmp_dir; /* main bitmap directory */ unsigned sb_c_bitmap; /* current bitmap */ unsigned sb_max_fwd_alloc; /* max forwad allocation */ - struct mutex hpfs_creation_de; /* when creating dirents, nobody else - can alloc blocks */ /*unsigned sb_mounting : 1;*/ int sb_timeshift; }; @@ -201,12 +197,12 @@ static inline unsigned tstbits(unsigned *bmp, unsigned b, unsigned n) /* alloc.c */ int hpfs_chk_sectors(struct super_block *, secno, int, char *); -secno hpfs_alloc_sector(struct super_block *, secno, unsigned, int, int); +secno hpfs_alloc_sector(struct super_block *, secno, unsigned, int); int hpfs_alloc_if_possible(struct super_block *, secno); void hpfs_free_sectors(struct super_block *, secno, unsigned); int hpfs_check_free_dnodes(struct super_block *, int); void hpfs_free_dnode(struct super_block *, secno); -struct dnode *hpfs_alloc_dnode(struct super_block *, secno, dnode_secno *, struct quad_buffer_head *, int); +struct dnode *hpfs_alloc_dnode(struct super_block *, secno, dnode_secno *, struct quad_buffer_head *); struct fnode *hpfs_alloc_fnode(struct super_block *, secno, fnode_secno *, struct buffer_head **); struct anode *hpfs_alloc_anode(struct super_block *, secno, anode_secno *, struct buffer_head **); @@ -223,8 +219,6 @@ void hpfs_remove_fnode(struct super_block *, fnode_secno fno); /* buffer.c */ -void hpfs_lock_creation(struct super_block *); -void hpfs_unlock_creation(struct super_block *); void *hpfs_map_sector(struct super_block *, unsigned, struct buffer_head **, int); void *hpfs_get_sector(struct super_block *, unsigned, struct buffer_head **); void *hpfs_map_4sectors(struct super_block *, unsigned, struct quad_buffer_head *, int); @@ -248,7 +242,7 @@ void hpfs_del_pos(struct inode *, loff_t *); struct hpfs_dirent *hpfs_add_de(struct super_block *, struct dnode *, const unsigned char *, unsigned, secno); int hpfs_add_dirent(struct inode *, const unsigned char *, unsigned, - struct hpfs_dirent *, int); + struct hpfs_dirent *); int hpfs_remove_dirent(struct inode *, dnode_secno, struct hpfs_dirent *, struct quad_buffer_head *, int); void hpfs_count_dnodes(struct super_block *, dnode_secno, int *, int *, int *); dnode_secno hpfs_de_as_down_as_possible(struct super_block *, dnode_secno dno); diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c index 87f1f78..29cf050 100644 --- a/fs/hpfs/inode.c +++ b/fs/hpfs/inode.c @@ -187,9 +187,7 @@ void hpfs_write_inode(struct inode *i) kfree(hpfs_inode->i_rddir_off); hpfs_inode->i_rddir_off = NULL; } - mutex_lock(&hpfs_inode->i_parent_mutex); if (!i->i_nlink) { - mutex_unlock(&hpfs_inode->i_parent_mutex); return; } parent = iget_locked(i->i_sb, hpfs_inode->i_parent_dir); @@ -200,14 +198,11 @@ void hpfs_write_inode(struct inode *i) hpfs_read_inode(parent); unlock_new_inode(parent); } - mutex_lock(&hpfs_inode->i_mutex); hpfs_write_inode_nolock(i); - mutex_unlock(&hpfs_inode->i_mutex); iput(parent); } else { mark_inode_dirty(i); } - mutex_unlock(&hpfs_inode->i_parent_mutex); } void hpfs_write_inode_nolock(struct inode *i) diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c index d5f8c8a..8c9f915 100644 --- a/fs/hpfs/namei.c +++ b/fs/hpfs/namei.c @@ -29,7 +29,7 @@ static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) fnode = hpfs_alloc_fnode(dir->i_sb, hpfs_i(dir)->i_dno, &fno, &bh); if (!fnode) goto bail; - dnode = hpfs_alloc_dnode(dir->i_sb, fno, &dno, &qbh0, 1); + dnode = hpfs_alloc_dnode(dir->i_sb, fno, &dno, &qbh0); if (!dnode) goto bail1; memset(&dee, 0, sizeof dee); @@ -60,8 +60,7 @@ static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) if (dee.read_only) result->i_mode &= ~0222; - mutex_lock(&hpfs_i(dir)->i_mutex); - r = hpfs_add_dirent(dir, name, len, &dee, 0); + r = hpfs_add_dirent(dir, name, len, &dee); if (r == 1) goto bail3; if (r == -1) { @@ -101,11 +100,9 @@ static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) hpfs_write_inode_nolock(result); } d_instantiate(dentry, result); - mutex_unlock(&hpfs_i(dir)->i_mutex); hpfs_unlock(dir->i_sb); return 0; bail3: - mutex_unlock(&hpfs_i(dir)->i_mutex); iput(result); bail2: hpfs_brelse4(&qbh0); @@ -168,8 +165,7 @@ static int hpfs_create(struct inode *dir, struct dentry *dentry, int mode, struc result->i_data.a_ops = &hpfs_aops; hpfs_i(result)->mmu_private = 0; - mutex_lock(&hpfs_i(dir)->i_mutex); - r = hpfs_add_dirent(dir, name, len, &dee, 0); + r = hpfs_add_dirent(dir, name, len, &dee); if (r == 1) goto bail2; if (r == -1) { @@ -193,12 +189,10 @@ static int hpfs_create(struct inode *dir, struct dentry *dentry, int mode, struc hpfs_write_inode_nolock(result); } d_instantiate(dentry, result); - mutex_unlock(&hpfs_i(dir)->i_mutex); hpfs_unlock(dir->i_sb); return 0; bail2: - mutex_unlock(&hpfs_i(dir)->i_mutex); iput(result); bail1: brelse(bh); @@ -254,8 +248,7 @@ static int hpfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t result->i_blocks = 1; init_special_inode(result, mode, rdev); - mutex_lock(&hpfs_i(dir)->i_mutex); - r = hpfs_add_dirent(dir, name, len, &dee, 0); + r = hpfs_add_dirent(dir, name, len, &dee); if (r == 1) goto bail2; if (r == -1) { @@ -271,12 +264,10 @@ static int hpfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t hpfs_write_inode_nolock(result); d_instantiate(dentry, result); - mutex_unlock(&hpfs_i(dir)->i_mutex); brelse(bh); hpfs_unlock(dir->i_sb); return 0; bail2: - mutex_unlock(&hpfs_i(dir)->i_mutex); iput(result); bail1: brelse(bh); @@ -333,8 +324,7 @@ static int hpfs_symlink(struct inode *dir, struct dentry *dentry, const char *sy result->i_op = &page_symlink_inode_operations; result->i_data.a_ops = &hpfs_symlink_aops; - mutex_lock(&hpfs_i(dir)->i_mutex); - r = hpfs_add_dirent(dir, name, len, &dee, 0); + r = hpfs_add_dirent(dir, name, len, &dee); if (r == 1) goto bail2; if (r == -1) { @@ -352,11 +342,9 @@ static int hpfs_symlink(struct inode *dir, struct dentry *dentry, const char *sy hpfs_write_inode_nolock(result); d_instantiate(dentry, result); - mutex_unlock(&hpfs_i(dir)->i_mutex); hpfs_unlock(dir->i_sb); return 0; bail2: - mutex_unlock(&hpfs_i(dir)->i_mutex); iput(result); bail1: brelse(bh); @@ -382,8 +370,6 @@ static int hpfs_unlink(struct inode *dir, struct dentry *dentry) hpfs_lock(dir->i_sb); hpfs_adjust_length(name, &len); again: - mutex_lock(&hpfs_i(inode)->i_parent_mutex); - mutex_lock(&hpfs_i(dir)->i_mutex); err = -ENOENT; de = map_dirent(dir, hpfs_i(dir)->i_dno, name, len, &dno, &qbh); if (!de) @@ -410,8 +396,6 @@ again: if (rep++) break; - mutex_unlock(&hpfs_i(dir)->i_mutex); - mutex_unlock(&hpfs_i(inode)->i_parent_mutex); dentry_unhash(dentry); if (!d_unhashed(dentry)) { dput(dentry); @@ -445,8 +429,6 @@ again: out1: hpfs_brelse4(&qbh); out: - mutex_unlock(&hpfs_i(dir)->i_mutex); - mutex_unlock(&hpfs_i(inode)->i_parent_mutex); hpfs_unlock(dir->i_sb); return err; } @@ -466,8 +448,6 @@ static int hpfs_rmdir(struct inode *dir, struct dentry *dentry) hpfs_adjust_length(name, &len); hpfs_lock(dir->i_sb); - mutex_lock(&hpfs_i(inode)->i_parent_mutex); - mutex_lock(&hpfs_i(dir)->i_mutex); err = -ENOENT; de = map_dirent(dir, hpfs_i(dir)->i_dno, name, len, &dno, &qbh); if (!de) @@ -505,8 +485,6 @@ static int hpfs_rmdir(struct inode *dir, struct dentry *dentry) out1: hpfs_brelse4(&qbh); out: - mutex_unlock(&hpfs_i(dir)->i_mutex); - mutex_unlock(&hpfs_i(inode)->i_parent_mutex); hpfs_unlock(dir->i_sb); return err; } @@ -568,12 +546,6 @@ static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry, hpfs_lock(i->i_sb); /* order doesn't matter, due to VFS exclusion */ - mutex_lock(&hpfs_i(i)->i_parent_mutex); - if (new_inode) - mutex_lock(&hpfs_i(new_inode)->i_parent_mutex); - mutex_lock(&hpfs_i(old_dir)->i_mutex); - if (new_dir != old_dir) - mutex_lock(&hpfs_i(new_dir)->i_mutex); /* Erm? Moving over the empty non-busy directory is perfectly legal */ if (new_inode && S_ISDIR(new_inode->i_mode)) { @@ -610,9 +582,7 @@ static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (new_dir == old_dir) hpfs_brelse4(&qbh); - hpfs_lock_creation(i->i_sb); - if ((r = hpfs_add_dirent(new_dir, new_name, new_len, &de, 1))) { - hpfs_unlock_creation(i->i_sb); + if ((r = hpfs_add_dirent(new_dir, new_name, new_len, &de))) { if (r == -1) hpfs_error(new_dir->i_sb, "hpfs_rename: dirent already exists!"); err = r == 1 ? -ENOSPC : -EFSERROR; if (new_dir != old_dir) hpfs_brelse4(&qbh); @@ -621,20 +591,17 @@ static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (new_dir == old_dir) if (!(dep = map_dirent(old_dir, hpfs_i(old_dir)->i_dno, old_name, old_len, &dno, &qbh))) { - hpfs_unlock_creation(i->i_sb); hpfs_error(i->i_sb, "lookup succeeded but map dirent failed at #2"); err = -ENOENT; goto end1; } if ((r = hpfs_remove_dirent(old_dir, dno, dep, &qbh, 0))) { - hpfs_unlock_creation(i->i_sb); hpfs_error(i->i_sb, "hpfs_rename: could not remove dirent"); err = r == 2 ? -ENOSPC : -EFSERROR; goto end1; } - hpfs_unlock_creation(i->i_sb); - + end: hpfs_i(i)->i_parent_dir = new_dir->i_ino; if (S_ISDIR(i->i_mode)) { @@ -652,12 +619,6 @@ static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry, hpfs_i(i)->i_conv = hpfs_sb(i->i_sb)->sb_conv; hpfs_decide_conv(i, new_name, new_len); end1: - if (old_dir != new_dir) - mutex_unlock(&hpfs_i(new_dir)->i_mutex); - mutex_unlock(&hpfs_i(old_dir)->i_mutex); - mutex_unlock(&hpfs_i(i)->i_parent_mutex); - if (new_inode) - mutex_unlock(&hpfs_i(new_inode)->i_parent_mutex); hpfs_unlock(i->i_sb); return err; } diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index 41232c2..6493377 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -191,8 +191,6 @@ static void init_once(void *foo) { struct hpfs_inode_info *ei = (struct hpfs_inode_info *) foo; - mutex_init(&ei->i_mutex); - mutex_init(&ei->i_parent_mutex); inode_init_once(&ei->vfs_inode); } @@ -496,8 +494,6 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent) mutex_init(&sbi->hpfs_mutex); hpfs_lock(s); - mutex_init(&sbi->hpfs_creation_de); - uid = current_uid(); gid = current_gid(); umask = current_umask(); -- cgit v1.1 From 0fe105aa29bed0994991462b58ef61646db0e459 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Sun, 8 May 2011 20:43:19 +0200 Subject: HPFS: Remove CR/LF conversion option Remove CR/LF conversion option It is unused anyway. It was used on 2.2 kernels or so. Signed-off-by: Mikulas Patocka Signed-off-by: Linus Torvalds --- fs/hpfs/dir.c | 2 -- fs/hpfs/hpfs_fn.h | 11 ----------- fs/hpfs/inode.c | 1 - fs/hpfs/name.c | 33 --------------------------------- fs/hpfs/namei.c | 3 --- fs/hpfs/super.c | 32 +++++++------------------------- 6 files changed, 7 insertions(+), 75 deletions(-) diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c index b3d7c0d..208f3d7 100644 --- a/fs/hpfs/dir.c +++ b/fs/hpfs/dir.c @@ -250,8 +250,6 @@ struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, struct name hpfs_result = hpfs_i(result); if (!de->directory) hpfs_result->i_parent_dir = dir->i_ino; - hpfs_decide_conv(result, name, len); - if (de->has_acl || de->has_xtd_perm) if (!(dir->i_sb->s_flags & MS_RDONLY)) { hpfs_error(result->i_sb, "ACLs or XPERM found. This is probably HPFS386. This driver doesn't support it now. Send me some info on these structures"); goto bail1; diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h index 860d09f..d101086 100644 --- a/fs/hpfs/hpfs_fn.h +++ b/fs/hpfs/hpfs_fn.h @@ -51,7 +51,6 @@ struct hpfs_inode_info { unsigned i_disk_sec; /* (files) minimalist cache of alloc info */ unsigned i_n_secs; /* (files) minimalist cache of alloc info */ unsigned i_ea_size; /* size of extended attributes */ - unsigned i_conv : 2; /* (files) crlf->newline hackery */ unsigned i_ea_mode : 1; /* file's permission is stored in ea */ unsigned i_ea_uid : 1; /* file's uid is stored in ea */ unsigned i_ea_gid : 1; /* file's gid is stored in ea */ @@ -73,7 +72,6 @@ struct hpfs_sb_info { uid_t sb_uid; /* uid from mount options */ gid_t sb_gid; /* gid from mount options */ umode_t sb_mode; /* mode from mount options */ - unsigned sb_conv : 2; /* crlf->newline hackery */ unsigned sb_eas : 2; /* eas: 0-ignore, 1-ro, 2-rw */ unsigned sb_err : 2; /* on errs: 0-cont, 1-ro, 2-panic */ unsigned sb_chk : 2; /* checks: 0-no, 1-normal, 2-strict */ @@ -90,14 +88,6 @@ struct hpfs_sb_info { int sb_timeshift; }; -/* - * conv= options - */ - -#define CONV_BINARY 0 /* no conversion */ -#define CONV_TEXT 1 /* crlf->newline */ -#define CONV_AUTO 2 /* decide based on file contents */ - /* Four 512-byte buffers and the 2k block obtained by concatenating them */ struct quad_buffer_head { @@ -298,7 +288,6 @@ int hpfs_compare_names(struct super_block *, const unsigned char *, unsigned, const unsigned char *, unsigned, int); int hpfs_is_name_long(const unsigned char *, unsigned); void hpfs_adjust_length(const unsigned char *, unsigned *); -void hpfs_decide_conv(struct inode *, const unsigned char *, unsigned); /* namei.c */ diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c index 29cf050..3b8eeb1 100644 --- a/fs/hpfs/inode.c +++ b/fs/hpfs/inode.c @@ -17,7 +17,6 @@ void hpfs_init_inode(struct inode *i) i->i_uid = hpfs_sb(sb)->sb_uid; i->i_gid = hpfs_sb(sb)->sb_gid; i->i_mode = hpfs_sb(sb)->sb_mode; - hpfs_inode->i_conv = hpfs_sb(sb)->sb_conv; i->i_size = -1; i->i_blocks = -1; diff --git a/fs/hpfs/name.c b/fs/hpfs/name.c index f24736d..9acdf33 100644 --- a/fs/hpfs/name.c +++ b/fs/hpfs/name.c @@ -8,39 +8,6 @@ #include "hpfs_fn.h" -static const char *text_postfix[]={ -".ASM", ".BAS", ".BAT", ".C", ".CC", ".CFG", ".CMD", ".CON", ".CPP", ".DEF", -".DOC", ".DPR", ".ERX", ".H", ".HPP", ".HTM", ".HTML", ".JAVA", ".LOG", ".PAS", -".RC", ".TEX", ".TXT", ".Y", ""}; - -static const char *text_prefix[]={ -"AUTOEXEC.", "CHANGES", "COPYING", "CONFIG.", "CREDITS", "FAQ", "FILE_ID.DIZ", -"MAKEFILE", "READ.ME", "README", "TERMCAP", ""}; - -void hpfs_decide_conv(struct inode *inode, const unsigned char *name, unsigned len) -{ - struct hpfs_inode_info *hpfs_inode = hpfs_i(inode); - int i; - if (hpfs_inode->i_conv != CONV_AUTO) return; - for (i = 0; *text_postfix[i]; i++) { - int l = strlen(text_postfix[i]); - if (l <= len) - if (!hpfs_compare_names(inode->i_sb, text_postfix[i], l, name + len - l, l, 0)) - goto text; - } - for (i = 0; *text_prefix[i]; i++) { - int l = strlen(text_prefix[i]); - if (l <= len) - if (!hpfs_compare_names(inode->i_sb, text_prefix[i], l, name, l, 0)) - goto text; - } - hpfs_inode->i_conv = CONV_BINARY; - return; - text: - hpfs_inode->i_conv = CONV_TEXT; - return; -} - static inline int not_allowed_char(unsigned char c) { return c<' ' || c=='"' || c=='*' || c=='/' || c==':' || c=='<' || diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c index 8c9f915..9c66f0e 100644 --- a/fs/hpfs/namei.c +++ b/fs/hpfs/namei.c @@ -151,7 +151,6 @@ static int hpfs_create(struct inode *dir, struct dentry *dentry, int mode, struc result->i_op = &hpfs_file_iops; result->i_fop = &hpfs_file_ops; result->i_nlink = 1; - hpfs_decide_conv(result, name, len); hpfs_i(result)->i_parent_dir = dir->i_ino; result->i_ctime.tv_sec = result->i_mtime.tv_sec = result->i_atime.tv_sec = local_to_gmt(dir->i_sb, dee.creation_date); result->i_ctime.tv_nsec = 0; @@ -616,8 +615,6 @@ static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry, mark_buffer_dirty(bh); brelse(bh); } - hpfs_i(i)->i_conv = hpfs_sb(i->i_sb)->sb_conv; - hpfs_decide_conv(i, new_name, new_len); end1: hpfs_unlock(i->i_sb); return err; diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index 6493377..4858ff8 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -219,7 +219,6 @@ static void destroy_inodecache(void) enum { Opt_help, Opt_uid, Opt_gid, Opt_umask, Opt_case_lower, Opt_case_asis, - Opt_conv_binary, Opt_conv_text, Opt_conv_auto, Opt_check_none, Opt_check_normal, Opt_check_strict, Opt_err_cont, Opt_err_ro, Opt_err_panic, Opt_eas_no, Opt_eas_ro, Opt_eas_rw, @@ -234,9 +233,6 @@ static const match_table_t tokens = { {Opt_umask, "umask=%o"}, {Opt_case_lower, "case=lower"}, {Opt_case_asis, "case=asis"}, - {Opt_conv_binary, "conv=binary"}, - {Opt_conv_text, "conv=text"}, - {Opt_conv_auto, "conv=auto"}, {Opt_check_none, "check=none"}, {Opt_check_normal, "check=normal"}, {Opt_check_strict, "check=strict"}, @@ -254,7 +250,7 @@ static const match_table_t tokens = { }; static int parse_opts(char *opts, uid_t *uid, gid_t *gid, umode_t *umask, - int *lowercase, int *conv, int *eas, int *chk, int *errs, + int *lowercase, int *eas, int *chk, int *errs, int *chkdsk, int *timeshift) { char *p; @@ -296,15 +292,6 @@ static int parse_opts(char *opts, uid_t *uid, gid_t *gid, umode_t *umask, case Opt_case_asis: *lowercase = 0; break; - case Opt_conv_binary: - *conv = CONV_BINARY; - break; - case Opt_conv_text: - *conv = CONV_TEXT; - break; - case Opt_conv_auto: - *conv = CONV_AUTO; - break; case Opt_check_none: *chk = 0; break; @@ -371,9 +358,6 @@ HPFS filesystem options:\n\ umask=xxx set mode of files that don't have mode specified in eas\n\ case=lower lowercase all files\n\ case=asis do not lowercase files (default)\n\ - conv=binary do not convert CR/LF -> LF (default)\n\ - conv=auto convert only files with known text extensions\n\ - conv=text convert all files\n\ check=none no fs checks - kernel may crash on corrupted filesystem\n\ check=normal do some checks - it should not crash (default)\n\ check=strict do extra time-consuming checks, used for debugging\n\ @@ -395,7 +379,7 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data) uid_t uid; gid_t gid; umode_t umask; - int lowercase, conv, eas, chk, errs, chkdsk, timeshift; + int lowercase, eas, chk, errs, chkdsk, timeshift; int o; struct hpfs_sb_info *sbi = hpfs_sb(s); char *new_opts = kstrdup(data, GFP_KERNEL); @@ -406,11 +390,11 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data) lock_super(s); uid = sbi->sb_uid; gid = sbi->sb_gid; umask = 0777 & ~sbi->sb_mode; - lowercase = sbi->sb_lowercase; conv = sbi->sb_conv; + lowercase = sbi->sb_lowercase; eas = sbi->sb_eas; chk = sbi->sb_chk; chkdsk = sbi->sb_chkdsk; errs = sbi->sb_err; timeshift = sbi->sb_timeshift; - if (!(o = parse_opts(data, &uid, &gid, &umask, &lowercase, &conv, + if (!(o = parse_opts(data, &uid, &gid, &umask, &lowercase, &eas, &chk, &errs, &chkdsk, ×hift))) { printk("HPFS: bad mount options.\n"); goto out_err; @@ -428,7 +412,7 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data) sbi->sb_uid = uid; sbi->sb_gid = gid; sbi->sb_mode = 0777 & ~umask; - sbi->sb_lowercase = lowercase; sbi->sb_conv = conv; + sbi->sb_lowercase = lowercase; sbi->sb_eas = eas; sbi->sb_chk = chk; sbi->sb_chkdsk = chkdsk; sbi->sb_err = errs; sbi->sb_timeshift = timeshift; @@ -472,7 +456,7 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent) uid_t uid; gid_t gid; umode_t umask; - int lowercase, conv, eas, chk, errs, chkdsk, timeshift; + int lowercase, eas, chk, errs, chkdsk, timeshift; dnode_secno root_dno; struct hpfs_dirent *de = NULL; @@ -498,14 +482,13 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent) gid = current_gid(); umask = current_umask(); lowercase = 0; - conv = CONV_BINARY; eas = 2; chk = 1; errs = 1; chkdsk = 1; timeshift = 0; - if (!(o = parse_opts(options, &uid, &gid, &umask, &lowercase, &conv, + if (!(o = parse_opts(options, &uid, &gid, &umask, &lowercase, &eas, &chk, &errs, &chkdsk, ×hift))) { printk("HPFS: bad mount options.\n"); goto bail0; @@ -558,7 +541,6 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent) sbi->sb_n_free = -1; sbi->sb_n_free_dnodes = -1; sbi->sb_lowercase = lowercase; - sbi->sb_conv = conv; sbi->sb_eas = eas; sbi->sb_chk = chk; sbi->sb_chkdsk = chkdsk; -- cgit v1.1 From e5d6a7dd5e0b29eee4359e817e0bee728d7c5530 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Sun, 8 May 2011 20:43:27 +0200 Subject: HPFS: Remove mark_inode_dirty Remove mark_inode_dirty HPFS doesn't use kernel's dirty inode indicator anyway because writing an inode requires directory's mutex. Signed-off-by: Mikulas Patocka Signed-off-by: Linus Torvalds --- fs/hpfs/inode.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c index 3b8eeb1..d093ce7 100644 --- a/fs/hpfs/inode.c +++ b/fs/hpfs/inode.c @@ -199,8 +199,6 @@ void hpfs_write_inode(struct inode *i) } hpfs_write_inode_nolock(i); iput(parent); - } else { - mark_inode_dirty(i); } } @@ -278,7 +276,6 @@ int hpfs_setattr(struct dentry *dentry, struct iattr *attr) } setattr_copy(inode, attr); - mark_inode_dirty(inode); hpfs_write_inode(inode); -- cgit v1.1 From d878597c2c498b63abe3e68d343459944bc358f9 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Sun, 8 May 2011 20:43:34 +0200 Subject: HPFS: Use types with defined width Use types with defined width Signed-off-by: Mikulas Patocka Signed-off-by: Linus Torvalds --- fs/hpfs/hpfs.h | 219 ++++++++++++++++++++++++++++----------------------------- 1 file changed, 107 insertions(+), 112 deletions(-) diff --git a/fs/hpfs/hpfs.h b/fs/hpfs/hpfs.h index 0e84c73..8cd5130 100644 --- a/fs/hpfs/hpfs.h +++ b/fs/hpfs/hpfs.h @@ -21,7 +21,7 @@ /* Notation */ -typedef unsigned secno; /* sector number, partition relative */ +typedef u32 secno; /* sector number, partition relative */ typedef secno dnode_secno; /* sector number of a dnode */ typedef secno fnode_secno; /* sector number of an fnode */ @@ -38,28 +38,28 @@ typedef u32 time32_t; /* 32-bit time_t type */ struct hpfs_boot_block { - unsigned char jmp[3]; - unsigned char oem_id[8]; - unsigned char bytes_per_sector[2]; /* 512 */ - unsigned char sectors_per_cluster; - unsigned char n_reserved_sectors[2]; - unsigned char n_fats; - unsigned char n_rootdir_entries[2]; - unsigned char n_sectors_s[2]; - unsigned char media_byte; - unsigned short sectors_per_fat; - unsigned short sectors_per_track; - unsigned short heads_per_cyl; - unsigned int n_hidden_sectors; - unsigned int n_sectors_l; /* size of partition */ - unsigned char drive_number; - unsigned char mbz; - unsigned char sig_28h; /* 28h */ - unsigned char vol_serno[4]; - unsigned char vol_label[11]; - unsigned char sig_hpfs[8]; /* "HPFS " */ - unsigned char pad[448]; - unsigned short magic; /* aa55 */ + u8 jmp[3]; + u8 oem_id[8]; + u8 bytes_per_sector[2]; /* 512 */ + u8 sectors_per_cluster; + u8 n_reserved_sectors[2]; + u8 n_fats; + u8 n_rootdir_entries[2]; + u8 n_sectors_s[2]; + u8 media_byte; + u16 sectors_per_fat; + u16 sectors_per_track; + u16 heads_per_cyl; + u32 n_hidden_sectors; + u32 n_sectors_l; /* size of partition */ + u8 drive_number; + u8 mbz; + u8 sig_28h; /* 28h */ + u8 vol_serno[4]; + u8 vol_label[11]; + u8 sig_hpfs[8]; /* "HPFS " */ + u8 pad[448]; + u16 magic; /* aa55 */ }; @@ -71,31 +71,30 @@ struct hpfs_boot_block struct hpfs_super_block { - unsigned magic; /* f995 e849 */ - unsigned magic1; /* fa53 e9c5, more magic? */ - /*unsigned huh202;*/ /* ?? 202 = N. of B. in 1.00390625 S.*/ - char version; /* version of a filesystem usually 2 */ - char funcversion; /* functional version - oldest version + u32 magic; /* f995 e849 */ + u32 magic1; /* fa53 e9c5, more magic? */ + u8 version; /* version of a filesystem usually 2 */ + u8 funcversion; /* functional version - oldest version of filesystem that can understand this disk */ - unsigned short int zero; /* 0 */ + u16 zero; /* 0 */ fnode_secno root; /* fnode of root directory */ secno n_sectors; /* size of filesystem */ - unsigned n_badblocks; /* number of bad blocks */ + u32 n_badblocks; /* number of bad blocks */ secno bitmaps; /* pointers to free space bit maps */ - unsigned zero1; /* 0 */ + u32 zero1; /* 0 */ secno badblocks; /* bad block list */ - unsigned zero3; /* 0 */ + u32 zero3; /* 0 */ time32_t last_chkdsk; /* date last checked, 0 if never */ - /*unsigned zero4;*/ /* 0 */ - time32_t last_optimize; /* date last optimized, 0 if never */ + /*u32 zero4;*/ /* 0 */ + time32_t last_optimize; /* date last optimized, 0 if never */ secno n_dir_band; /* number of sectors in dir band */ secno dir_band_start; /* first sector in dir band */ secno dir_band_end; /* last sector in dir band */ secno dir_band_bitmap; /* free space map, 1 dnode per bit */ - char volume_name[32]; /* not used */ + u8 volume_name[32]; /* not used */ secno user_id_table; /* 8 preallocated sectors - user id */ - unsigned zero6[103]; /* 0 */ + u32 zero6[103]; /* 0 */ }; @@ -107,11 +106,10 @@ struct hpfs_super_block struct hpfs_spare_block { - unsigned magic; /* f991 1849 */ - unsigned magic1; /* fa52 29c5, more magic? */ + u32 magic; /* f991 1849 */ + u32 magic1; /* fa52 29c5, more magic? */ unsigned dirty: 1; /* 0 clean, 1 "improperly stopped" */ - /*unsigned flag1234: 4;*/ /* unknown flags */ unsigned sparedir_used: 1; /* spare dirblks used */ unsigned hotfixes_used: 1; /* hotfixes used */ unsigned bad_sector: 1; /* bad sector, corrupted disk (???) */ @@ -126,25 +124,24 @@ struct hpfs_spare_block unsigned dce_acls_active: 1; unsigned dasd_limits_dirty: 1; unsigned flag67: 2; - unsigned char mm_contlgulty; - unsigned char unused; + u8 mm_contlgulty; + u8 unused; secno hotfix_map; /* info about remapped bad sectors */ - unsigned n_spares_used; /* number of hotfixes */ - unsigned n_spares; /* number of spares in hotfix map */ - unsigned n_dnode_spares_free; /* spare dnodes unused */ - unsigned n_dnode_spares; /* length of spare_dnodes[] list, + u32 n_spares_used; /* number of hotfixes */ + u32 n_spares; /* number of spares in hotfix map */ + u32 n_dnode_spares_free; /* spare dnodes unused */ + u32 n_dnode_spares; /* length of spare_dnodes[] list, follows in this block*/ secno code_page_dir; /* code page directory block */ - unsigned n_code_pages; /* number of code pages */ - /*unsigned large_numbers[2];*/ /* ?? */ - unsigned super_crc; /* on HPFS386 and LAN Server this is + u32 n_code_pages; /* number of code pages */ + u32 super_crc; /* on HPFS386 and LAN Server this is checksum of superblock, on normal OS/2 unused */ - unsigned spare_crc; /* on HPFS386 checksum of spareblock */ - unsigned zero1[15]; /* unused */ + u32 spare_crc; /* on HPFS386 checksum of spareblock */ + u32 zero1[15]; /* unused */ dnode_secno spare_dnodes[100]; /* emergency free dnode list */ - unsigned zero2[1]; /* room for more? */ + u32 zero2[1]; /* room for more? */ }; /* The bad block list is 4 sectors long. The first word must be zero, @@ -179,18 +176,18 @@ struct hpfs_spare_block struct code_page_directory { - unsigned magic; /* 4945 21f7 */ - unsigned n_code_pages; /* number of pointers following */ - unsigned zero1[2]; + u32 magic; /* 4945 21f7 */ + u32 n_code_pages; /* number of pointers following */ + u32 zero1[2]; struct { - unsigned short ix; /* index */ - unsigned short code_page_number; /* code page number */ - unsigned bounds; /* matches corresponding word + u16 ix; /* index */ + u16 code_page_number; /* code page number */ + u32 bounds; /* matches corresponding word in data block */ secno code_page_data; /* sector number of a code_page_data containing c.p. array */ - unsigned short index; /* index in c.p. array in that sector*/ - unsigned short unknown; /* some unknown value; usually 0; + u16 index; /* index in c.p. array in that sector*/ + u16 unknown; /* some unknown value; usually 0; 2 in Japanese version */ } array[31]; /* unknown length */ }; @@ -201,21 +198,21 @@ struct code_page_directory struct code_page_data { - unsigned magic; /* 8945 21f7 */ - unsigned n_used; /* # elements used in c_p_data[] */ - unsigned bounds[3]; /* looks a bit like + u32 magic; /* 8945 21f7 */ + u32 n_used; /* # elements used in c_p_data[] */ + u32 bounds[3]; /* looks a bit like (beg1,end1), (beg2,end2) one byte each */ - unsigned short offs[3]; /* offsets from start of sector + u16 offs[3]; /* offsets from start of sector to start of c_p_data[ix] */ struct { - unsigned short ix; /* index */ - unsigned short code_page_number; /* code page number */ - unsigned short unknown; /* the same as in cp directory */ - unsigned char map[128]; /* upcase table for chars 80..ff */ - unsigned short zero2; + u16 ix; /* index */ + u16 code_page_number; /* code page number */ + u16 unknown; /* the same as in cp directory */ + u8 map[128]; /* upcase table for chars 80..ff */ + u16 zero2; } code_page[3]; - unsigned char incognita[78]; + u8 incognita[78]; }; @@ -255,8 +252,8 @@ struct code_page_data #define DNODE_MAGIC 0x77e40aae struct dnode { - unsigned magic; /* 77e4 0aae */ - unsigned first_free; /* offset from start of dnode to + u32 magic; /* 77e4 0aae */ + u32 first_free; /* offset from start of dnode to first free dir entry */ unsigned root_dnode:1; /* Is it root dnode? */ unsigned increment_me:31; /* some kind of activity counter? @@ -265,11 +262,11 @@ struct dnode { secno up; /* (root dnode) directory's fnode (nonroot) parent dnode */ dnode_secno self; /* pointer to this dnode */ - unsigned char dirent[2028]; /* one or more dirents */ + u8 dirent[2028]; /* one or more dirents */ }; struct hpfs_dirent { - unsigned short length; /* offset to next dirent */ + u16 length; /* offset to next dirent */ unsigned first: 1; /* set on phony ^A^A (".") entry */ unsigned has_acl: 1; unsigned down: 1; /* down pointer present (after name) */ @@ -290,15 +287,15 @@ struct hpfs_dirent { unsigned flag15: 1; fnode_secno fnode; /* fnode giving allocation info */ time32_t write_date; /* mtime */ - unsigned file_size; /* file length, bytes */ + u32 file_size; /* file length, bytes */ time32_t read_date; /* atime */ time32_t creation_date; /* ctime */ - unsigned ea_size; /* total EA length, bytes */ + u32 ea_size; /* total EA length, bytes */ unsigned char no_of_acls : 3; /* number of ACL's */ unsigned char reserver : 5; - unsigned char ix; /* code page index (of filename), see + u8 ix; /* code page index (of filename), see struct code_page_data */ - unsigned char namelen, name[1]; /* file name */ + u8 namelen, name[1]; /* file name */ /* dnode_secno down; btree down pointer, if present, follows name on next word boundary, or maybe it precedes next dirent, which is on a word boundary. */ @@ -318,14 +315,14 @@ struct hpfs_dirent { struct bplus_leaf_node { - unsigned file_secno; /* first file sector in extent */ - unsigned length; /* length, sectors */ + u32 file_secno; /* first file sector in extent */ + u32 length; /* length, sectors */ secno disk_secno; /* first corresponding disk sector */ }; struct bplus_internal_node { - unsigned file_secno; /* subtree maps sectors < this */ + u32 file_secno; /* subtree maps sectors < this */ anode_secno down; /* pointer to subtree */ }; @@ -346,10 +343,10 @@ struct bplus_header unsigned binary_search: 1; /* suggest binary search (unused) */ unsigned internal: 1; /* 1 -> (internal) tree of anodes 0 -> (leaf) list of extents */ - unsigned char fill[3]; - unsigned char n_free_nodes; /* free nodes in following array */ - unsigned char n_used_nodes; /* used nodes in following array */ - unsigned short first_free; /* offset from start of header to + u8 fill[3]; + u8 n_free_nodes; /* free nodes in following array */ + u8 n_used_nodes; /* used nodes in following array */ + u16 first_free; /* offset from start of header to first free node in array */ union { struct bplus_internal_node internal[0]; /* (internal) 2-word entries giving @@ -369,19 +366,18 @@ struct bplus_header struct fnode { - unsigned magic; /* f7e4 0aae */ - unsigned zero1[2]; /* read history */ - unsigned char len, name[15]; /* true length, truncated name */ + u32 magic; /* f7e4 0aae */ + u32 zero1[2]; /* read history */ + u8 len, name[15]; /* true length, truncated name */ fnode_secno up; /* pointer to file's directory fnode */ - /*unsigned zero2[3];*/ secno acl_size_l; secno acl_secno; - unsigned short acl_size_s; - char acl_anode; - char zero2; /* history bit count */ - unsigned ea_size_l; /* length of disk-resident ea's */ + u16 acl_size_s; + u8 acl_anode; + u8 zero2; /* history bit count */ + u32 ea_size_l; /* length of disk-resident ea's */ secno ea_secno; /* first sector of disk-resident ea's*/ - unsigned short ea_size_s; /* length of fnode-resident ea's */ + u16 ea_size_s; /* length of fnode-resident ea's */ unsigned flag0: 1; unsigned ea_anode: 1; /* 1 -> ea_secno is an anode */ @@ -407,17 +403,16 @@ struct fnode struct bplus_internal_node internal[12]; } u; - unsigned file_size; /* file length, bytes */ - unsigned n_needea; /* number of EA's with NEEDEA set */ - char user_id[16]; /* unused */ - unsigned short ea_offs; /* offset from start of fnode + u32 file_size; /* file length, bytes */ + u32 n_needea; /* number of EA's with NEEDEA set */ + u8 user_id[16]; /* unused */ + u16 ea_offs; /* offset from start of fnode to first fnode-resident ea */ - char dasd_limit_treshhold; - char dasd_limit_delta; - unsigned dasd_limit; - unsigned dasd_usage; - /*unsigned zero5[2];*/ - unsigned char ea[316]; /* zero or more EA's, packed together + u8 dasd_limit_treshhold; + u8 dasd_limit_delta; + u32 dasd_limit; + u32 dasd_usage; + u8 ea[316]; /* zero or more EA's, packed together with no alignment padding. (Do not use this name, get here via fnode + ea_offs. I think.) */ @@ -430,7 +425,7 @@ struct fnode struct anode { - unsigned magic; /* 37e4 0aae */ + u32 magic; /* 37e4 0aae */ anode_secno self; /* pointer to this anode */ secno up; /* parent anode or fnode */ @@ -440,7 +435,7 @@ struct anode struct bplus_internal_node internal[60]; } u; - unsigned fill[3]; /* unused */ + u32 fill[3]; /* unused */ }; @@ -471,15 +466,15 @@ struct extended_attribute unsigned flag5: 1; unsigned flag6: 1; unsigned needea: 1; /* required ea */ - unsigned char namelen; /* length of name, bytes */ - unsigned short valuelen; /* length of value, bytes */ - unsigned char name[0]; + u8 namelen; /* length of name, bytes */ + u16 valuelen; /* length of value, bytes */ + u8 name[0]; /* - unsigned char name[namelen]; ascii attrib name - unsigned char nul; terminating '\0', not counted - unsigned char value[valuelen]; value, arbitrary + u8 name[namelen]; ascii attrib name + u8 nul; terminating '\0', not counted + u8 value[valuelen]; value, arbitrary if this.indirect, valuelen is 8 and the value is - unsigned length; real length of value, bytes + u32 length; real length of value, bytes secno secno; sector address where it starts if this.anode, the above sector number is the root of an anode tree which points to the value. -- cgit v1.1 From f73976818adeaa46515a238b21e865850b011a87 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Sun, 8 May 2011 20:43:41 +0200 Subject: HPFS: When marking or clearing the dirty bit, sync the filesystem When marking or clearing the dirty bit, sync the filesystem Signed-off-by: Mikulas Patocka Signed-off-by: Linus Torvalds --- fs/hpfs/super.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index 4858ff8..07e8d0c 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -27,6 +27,7 @@ static void mark_dirty(struct super_block *s) sb->dirty = 1; sb->old_wrote = 0; mark_buffer_dirty(bh); + sync_dirty_buffer(bh); brelse(bh); } } @@ -40,10 +41,12 @@ static void unmark_dirty(struct super_block *s) struct buffer_head *bh; struct hpfs_spare_block *sb; if (s->s_flags & MS_RDONLY) return; + sync_blockdev(s->s_bdev); if ((sb = hpfs_map_sector(s, 17, &bh, 0))) { sb->dirty = hpfs_sb(s)->sb_chkdsk > 1 - hpfs_sb(s)->sb_was_error; sb->old_wrote = hpfs_sb(s)->sb_chkdsk >= 2 && !hpfs_sb(s)->sb_was_error; mark_buffer_dirty(bh); + sync_dirty_buffer(bh); brelse(bh); } } -- cgit v1.1 From 48f10e8ce7461b393186c4c7c6d6f6634082159c Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Sun, 8 May 2011 20:44:00 +0200 Subject: HPFS: Restrict uid and gid to 16-bit values Restrict uid and gid to 16-bit values. HPFS stores only 2 bytes in the EAs. Signed-off-by: Mikulas Patocka Signed-off-by: Linus Torvalds --- fs/hpfs/inode.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c index d093ce7..bc61bb4 100644 --- a/fs/hpfs/inode.c +++ b/fs/hpfs/inode.c @@ -261,6 +261,10 @@ int hpfs_setattr(struct dentry *dentry, struct iattr *attr) hpfs_lock(inode->i_sb); if (inode->i_ino == hpfs_sb(inode->i_sb)->sb_root) goto out_unlock; + if ((attr->ia_valid & ATTR_UID) && attr->ia_uid >= 0x10000) + goto out_unlock; + if ((attr->ia_valid & ATTR_GID) && attr->ia_gid >= 0x10000) + goto out_unlock; if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size > inode->i_size) goto out_unlock; -- cgit v1.1 From dab4c82a6e7ee2c60e63737eaa2ec283f9784df6 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Sun, 8 May 2011 20:44:08 +0200 Subject: HPFS: Fix a bug that filesystem was not marked dirty when remounting it Fix a bug that filesystem was not marked dirty when remounting it Signed-off-by: Mikulas Patocka Signed-off-by: Linus Torvalds --- fs/hpfs/super.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index 07e8d0c..4a7d026 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -18,9 +18,9 @@ /* Mark the filesystem dirty, so that chkdsk checks it when os/2 booted */ -static void mark_dirty(struct super_block *s) +static void mark_dirty(struct super_block *s, int remount) { - if (hpfs_sb(s)->sb_chkdsk && !(s->s_flags & MS_RDONLY)) { + if (hpfs_sb(s)->sb_chkdsk && (remount || !(s->s_flags & MS_RDONLY))) { struct buffer_head *bh; struct hpfs_spare_block *sb; if ((sb = hpfs_map_sector(s, 17, &bh, 0))) { @@ -66,13 +66,13 @@ void hpfs_error(struct super_block *s, const char *fmt, ...) if (!hpfs_sb(s)->sb_was_error) { if (hpfs_sb(s)->sb_err == 2) { printk("; crashing the system because you wanted it\n"); - mark_dirty(s); + mark_dirty(s, 0); panic("HPFS panic"); } else if (hpfs_sb(s)->sb_err == 1) { if (s->s_flags & MS_RDONLY) printk("; already mounted read-only\n"); else { printk("; remounting read-only\n"); - mark_dirty(s); + mark_dirty(s, 0); s->s_flags |= MS_RDONLY; } } else if (s->s_flags & MS_RDONLY) printk("; going on - but anything won't be destroyed because it's read-only\n"); @@ -419,7 +419,7 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data) sbi->sb_eas = eas; sbi->sb_chk = chk; sbi->sb_chkdsk = chkdsk; sbi->sb_err = errs; sbi->sb_timeshift = timeshift; - if (!(*flags & MS_RDONLY)) mark_dirty(s); + if (!(*flags & MS_RDONLY)) mark_dirty(s, 1); replace_mount_options(s, new_opts); @@ -576,7 +576,7 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent) if (spareblock->hotfixes_used || spareblock->n_spares_used) { if (errs >= 2) { printk("HPFS: Hotfixes not supported here, try chkdsk\n"); - mark_dirty(s); + mark_dirty(s, 0); goto bail4; } hpfs_error(s, "hotfixes not supported here, try chkdsk"); @@ -586,7 +586,7 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent) if (spareblock->n_dnode_spares != spareblock->n_dnode_spares_free) { if (errs >= 2) { printk("HPFS: Spare dnodes used, try chkdsk\n"); - mark_dirty(s); + mark_dirty(s, 0); goto bail4; } hpfs_error(s, "warning: spare dnodes used, try chkdsk"); @@ -605,7 +605,7 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent) if (hpfs_chk_sectors(s, superblock->dir_band_start, superblock->n_dir_band, "dir_band") || hpfs_chk_sectors(s, superblock->dir_band_bitmap, 4, "dir_band_bitmap") || hpfs_chk_sectors(s, superblock->bitmaps, 4, "bitmaps")) { - mark_dirty(s); + mark_dirty(s, 0); goto bail4; } sbi->sb_dirband_size = a; -- cgit v1.1 From bc8728ee56bca62df269b2dd159bc60838ac8e80 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Sun, 8 May 2011 20:44:19 +0200 Subject: HPFS: Implement fsync for hpfs Implement fsync for hpfs. Signed-off-by: Mikulas Patocka Signed-off-by: Linus Torvalds --- fs/hpfs/file.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c index 09a642f..89c500e 100644 --- a/fs/hpfs/file.c +++ b/fs/hpfs/file.c @@ -20,8 +20,8 @@ static int hpfs_file_release(struct inode *inode, struct file *file) int hpfs_file_fsync(struct file *file, int datasync) { - /*return file_fsync(file, datasync);*/ - return 0; /* Don't fsync :-) */ + struct inode *inode = file->f_mapping->host; + return sync_blockdev(inode->i_sb->s_bdev); } /* -- cgit v1.1 From 0b69760be6968c528869d4aec95ecf64dbf3e8bd Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Sun, 8 May 2011 20:44:26 +0200 Subject: HPFS: Fix endianity. Make hpfs work on big-endian machines Fix endianity. Make hpfs work on big-endian machines. Signed-off-by: Mikulas Patocka Signed-off-by: Linus Torvalds --- fs/hpfs/alloc.c | 68 ++++++++--------- fs/hpfs/anode.c | 136 ++++++++++++++++----------------- fs/hpfs/dir.c | 20 ++--- fs/hpfs/dnode.c | 127 +++++++++++++++---------------- fs/hpfs/ea.c | 129 ++++++++++++++++---------------- fs/hpfs/hpfs.h | 219 +++++++++++++++++++++++++++++++++++++----------------- fs/hpfs/hpfs_fn.h | 27 ++++--- fs/hpfs/inode.c | 34 ++++----- fs/hpfs/map.c | 56 +++++++------- fs/hpfs/namei.c | 50 ++++++------- fs/hpfs/super.c | 50 ++++++------- 11 files changed, 498 insertions(+), 418 deletions(-) diff --git a/fs/hpfs/alloc.c b/fs/hpfs/alloc.c index 995472d..7a5eb2c 100644 --- a/fs/hpfs/alloc.c +++ b/fs/hpfs/alloc.c @@ -16,9 +16,9 @@ static int chk_if_allocated(struct super_block *s, secno sec, char *msg) { struct quad_buffer_head qbh; - unsigned *bmp; + u32 *bmp; if (!(bmp = hpfs_map_bitmap(s, sec >> 14, &qbh, "chk"))) goto fail; - if ((bmp[(sec & 0x3fff) >> 5] >> (sec & 0x1f)) & 1) { + if ((cpu_to_le32(bmp[(sec & 0x3fff) >> 5]) >> (sec & 0x1f)) & 1) { hpfs_error(s, "sector '%s' - %08x not allocated in bitmap", msg, sec); goto fail1; } @@ -26,7 +26,7 @@ static int chk_if_allocated(struct super_block *s, secno sec, char *msg) if (sec >= hpfs_sb(s)->sb_dirband_start && sec < hpfs_sb(s)->sb_dirband_start + hpfs_sb(s)->sb_dirband_size) { unsigned ssec = (sec - hpfs_sb(s)->sb_dirband_start) / 4; if (!(bmp = hpfs_map_dnode_bitmap(s, &qbh))) goto fail; - if ((bmp[ssec >> 5] >> (ssec & 0x1f)) & 1) { + if ((le32_to_cpu(bmp[ssec >> 5]) >> (ssec & 0x1f)) & 1) { hpfs_error(s, "sector '%s' - %08x not allocated in directory bitmap", msg, sec); goto fail1; } @@ -82,10 +82,6 @@ static secno alloc_in_bmp(struct super_block *s, secno near, unsigned n, unsigne ret = bs + nr; goto rt; } - /*if (!tstbits(bmp, nr + n, n + forward)) { - ret = bs + nr + n; - goto rt; - }*/ q = nr + n; b = 0; while ((a = tstbits(bmp, q, n + forward)) != 0) { q += a; @@ -102,14 +98,14 @@ static secno alloc_in_bmp(struct super_block *s, secno near, unsigned n, unsigne goto rt; } nr >>= 5; - /*for (i = nr + 1; i != nr; i++, i &= 0x1ff) {*/ + /*for (i = nr + 1; i != nr; i++, i &= 0x1ff) */ i = nr; do { - if (!bmp[i]) goto cont; - if (n + forward >= 0x3f && bmp[i] != -1) goto cont; + if (!le32_to_cpu(bmp[i])) goto cont; + if (n + forward >= 0x3f && le32_to_cpu(bmp[i]) != 0xffffffff) goto cont; q = i<<5; if (i > 0) { - unsigned k = bmp[i-1]; + unsigned k = le32_to_cpu(bmp[i-1]); while (k & 0x80000000) { q--; k <<= 1; } @@ -129,12 +125,12 @@ static secno alloc_in_bmp(struct super_block *s, secno near, unsigned n, unsigne } while (i != nr); rt: if (ret) { - if (hpfs_sb(s)->sb_chk && ((ret >> 14) != (bs >> 14) || (bmp[(ret & 0x3fff) >> 5] | ~(((1 << n) - 1) << (ret & 0x1f))) != 0xffffffff)) { + if (hpfs_sb(s)->sb_chk && ((ret >> 14) != (bs >> 14) || (le32_to_cpu(bmp[(ret & 0x3fff) >> 5]) | ~(((1 << n) - 1) << (ret & 0x1f))) != 0xffffffff)) { hpfs_error(s, "Allocation doesn't work! Wanted %d, allocated at %08x", n, ret); ret = 0; goto b; } - bmp[(ret & 0x3fff) >> 5] &= ~(((1 << n) - 1) << (ret & 0x1f)); + bmp[(ret & 0x3fff) >> 5] &= cpu_to_le32(~(((1 << n) - 1) << (ret & 0x1f))); hpfs_mark_4buffers_dirty(&qbh); } b: @@ -240,10 +236,10 @@ static secno alloc_in_dirband(struct super_block *s, secno near) int hpfs_alloc_if_possible(struct super_block *s, secno sec) { struct quad_buffer_head qbh; - unsigned *bmp; + u32 *bmp; if (!(bmp = hpfs_map_bitmap(s, sec >> 14, &qbh, "aip"))) goto end; - if (bmp[(sec & 0x3fff) >> 5] & (1 << (sec & 0x1f))) { - bmp[(sec & 0x3fff) >> 5] &= ~(1 << (sec & 0x1f)); + if (le32_to_cpu(bmp[(sec & 0x3fff) >> 5]) & (1 << (sec & 0x1f))) { + bmp[(sec & 0x3fff) >> 5] &= cpu_to_le32(~(1 << (sec & 0x1f))); hpfs_mark_4buffers_dirty(&qbh); hpfs_brelse4(&qbh); return 1; @@ -258,7 +254,7 @@ int hpfs_alloc_if_possible(struct super_block *s, secno sec) void hpfs_free_sectors(struct super_block *s, secno sec, unsigned n) { struct quad_buffer_head qbh; - unsigned *bmp; + u32 *bmp; struct hpfs_sb_info *sbi = hpfs_sb(s); /*printk("2 - ");*/ if (!n) return; @@ -273,12 +269,12 @@ void hpfs_free_sectors(struct super_block *s, secno sec, unsigned n) return; } new_tst: - if ((bmp[(sec & 0x3fff) >> 5] >> (sec & 0x1f) & 1)) { + if ((le32_to_cpu(bmp[(sec & 0x3fff) >> 5]) >> (sec & 0x1f) & 1)) { hpfs_error(s, "sector %08x not allocated", sec); hpfs_brelse4(&qbh); return; } - bmp[(sec & 0x3fff) >> 5] |= 1 << (sec & 0x1f); + bmp[(sec & 0x3fff) >> 5] |= cpu_to_le32(1 << (sec & 0x1f)); if (!--n) { hpfs_mark_4buffers_dirty(&qbh); hpfs_brelse4(&qbh); @@ -303,13 +299,13 @@ int hpfs_check_free_dnodes(struct super_block *s, int n) int n_bmps = (hpfs_sb(s)->sb_fs_size + 0x4000 - 1) >> 14; int b = hpfs_sb(s)->sb_c_bitmap & 0x0fffffff; int i, j; - unsigned *bmp; + u32 *bmp; struct quad_buffer_head qbh; if ((bmp = hpfs_map_dnode_bitmap(s, &qbh))) { for (j = 0; j < 512; j++) { unsigned k; - if (!bmp[j]) continue; - for (k = bmp[j]; k; k >>= 1) if (k & 1) if (!--n) { + if (!le32_to_cpu(bmp[j])) continue; + for (k = le32_to_cpu(bmp[j]); k; k >>= 1) if (k & 1) if (!--n) { hpfs_brelse4(&qbh); return 0; } @@ -328,10 +324,10 @@ int hpfs_check_free_dnodes(struct super_block *s, int n) chk_bmp: if (bmp) { for (j = 0; j < 512; j++) { - unsigned k; - if (!bmp[j]) continue; + u32 k; + if (!le32_to_cpu(bmp[j])) continue; for (k = 0xf; k; k <<= 4) - if ((bmp[j] & k) == k) { + if ((le32_to_cpu(bmp[j]) & k) == k) { if (!--n) { hpfs_brelse4(&qbh); return 0; @@ -355,12 +351,12 @@ void hpfs_free_dnode(struct super_block *s, dnode_secno dno) hpfs_free_sectors(s, dno, 4); } else { struct quad_buffer_head qbh; - unsigned *bmp; + u32 *bmp; unsigned ssec = (dno - hpfs_sb(s)->sb_dirband_start) / 4; if (!(bmp = hpfs_map_dnode_bitmap(s, &qbh))) { return; } - bmp[ssec >> 5] |= 1 << (ssec & 0x1f); + bmp[ssec >> 5] |= cpu_to_le32(1 << (ssec & 0x1f)); hpfs_mark_4buffers_dirty(&qbh); hpfs_brelse4(&qbh); } @@ -382,13 +378,13 @@ struct dnode *hpfs_alloc_dnode(struct super_block *s, secno near, return NULL; } memset(d, 0, 2048); - d->magic = DNODE_MAGIC; - d->first_free = 52; + d->magic = cpu_to_le32(DNODE_MAGIC); + d->first_free = cpu_to_le32(52); d->dirent[0] = 32; d->dirent[2] = 8; d->dirent[30] = 1; d->dirent[31] = 255; - d->self = *dno; + d->self = cpu_to_le32(*dno); return d; } @@ -402,10 +398,10 @@ struct fnode *hpfs_alloc_fnode(struct super_block *s, secno near, fnode_secno *f return NULL; } memset(f, 0, 512); - f->magic = FNODE_MAGIC; - f->ea_offs = 0xc4; + f->magic = cpu_to_le32(FNODE_MAGIC); + f->ea_offs = cpu_to_le16(0xc4); f->btree.n_free_nodes = 8; - f->btree.first_free = 8; + f->btree.first_free = cpu_to_le16(8); return f; } @@ -419,10 +415,10 @@ struct anode *hpfs_alloc_anode(struct super_block *s, secno near, anode_secno *a return NULL; } memset(a, 0, 512); - a->magic = ANODE_MAGIC; - a->self = *ano; + a->magic = cpu_to_le32(ANODE_MAGIC); + a->self = cpu_to_le32(*ano); a->btree.n_free_nodes = 40; a->btree.n_used_nodes = 0; - a->btree.first_free = 8; + a->btree.first_free = cpu_to_le16(8); return a; } diff --git a/fs/hpfs/anode.c b/fs/hpfs/anode.c index f2a0384..08b503e8 100644 --- a/fs/hpfs/anode.c +++ b/fs/hpfs/anode.c @@ -22,8 +22,8 @@ secno hpfs_bplus_lookup(struct super_block *s, struct inode *inode, if (hpfs_sb(s)->sb_chk) if (hpfs_stop_cycles(s, a, &c1, &c2, "hpfs_bplus_lookup")) return -1; if (btree->internal) { for (i = 0; i < btree->n_used_nodes; i++) - if (btree->u.internal[i].file_secno > sec) { - a = btree->u.internal[i].down; + if (le32_to_cpu(btree->u.internal[i].file_secno) > sec) { + a = le32_to_cpu(btree->u.internal[i].down); brelse(bh); if (!(anode = hpfs_map_anode(s, a, &bh))) return -1; btree = &anode->btree; @@ -34,18 +34,18 @@ secno hpfs_bplus_lookup(struct super_block *s, struct inode *inode, return -1; } for (i = 0; i < btree->n_used_nodes; i++) - if (btree->u.external[i].file_secno <= sec && - btree->u.external[i].file_secno + btree->u.external[i].length > sec) { - a = btree->u.external[i].disk_secno + sec - btree->u.external[i].file_secno; + if (le32_to_cpu(btree->u.external[i].file_secno) <= sec && + le32_to_cpu(btree->u.external[i].file_secno) + le32_to_cpu(btree->u.external[i].length) > sec) { + a = le32_to_cpu(btree->u.external[i].disk_secno) + sec - le32_to_cpu(btree->u.external[i].file_secno); if (hpfs_sb(s)->sb_chk) if (hpfs_chk_sectors(s, a, 1, "data")) { brelse(bh); return -1; } if (inode) { struct hpfs_inode_info *hpfs_inode = hpfs_i(inode); - hpfs_inode->i_file_sec = btree->u.external[i].file_secno; - hpfs_inode->i_disk_sec = btree->u.external[i].disk_secno; - hpfs_inode->i_n_secs = btree->u.external[i].length; + hpfs_inode->i_file_sec = le32_to_cpu(btree->u.external[i].file_secno); + hpfs_inode->i_disk_sec = le32_to_cpu(btree->u.external[i].disk_secno); + hpfs_inode->i_n_secs = le32_to_cpu(btree->u.external[i].length); } brelse(bh); return a; @@ -83,8 +83,8 @@ secno hpfs_add_sector_to_btree(struct super_block *s, secno node, int fnod, unsi return -1; } if (btree->internal) { - a = btree->u.internal[n].down; - btree->u.internal[n].file_secno = -1; + a = le32_to_cpu(btree->u.internal[n].down); + btree->u.internal[n].file_secno = cpu_to_le32(-1); mark_buffer_dirty(bh); brelse(bh); if (hpfs_sb(s)->sb_chk) @@ -94,15 +94,15 @@ secno hpfs_add_sector_to_btree(struct super_block *s, secno node, int fnod, unsi goto go_down; } if (n >= 0) { - if (btree->u.external[n].file_secno + btree->u.external[n].length != fsecno) { + if (le32_to_cpu(btree->u.external[n].file_secno) + le32_to_cpu(btree->u.external[n].length) != fsecno) { hpfs_error(s, "allocated size %08x, trying to add sector %08x, %cnode %08x", - btree->u.external[n].file_secno + btree->u.external[n].length, fsecno, + le32_to_cpu(btree->u.external[n].file_secno) + le32_to_cpu(btree->u.external[n].length), fsecno, fnod?'f':'a', node); brelse(bh); return -1; } - if (hpfs_alloc_if_possible(s, se = btree->u.external[n].disk_secno + btree->u.external[n].length)) { - btree->u.external[n].length++; + if (hpfs_alloc_if_possible(s, se = le32_to_cpu(btree->u.external[n].disk_secno) + le32_to_cpu(btree->u.external[n].length))) { + btree->u.external[n].length = cpu_to_le32(le32_to_cpu(btree->u.external[n].length) + 1); mark_buffer_dirty(bh); brelse(bh); return se; @@ -119,16 +119,16 @@ secno hpfs_add_sector_to_btree(struct super_block *s, secno node, int fnod, unsi brelse(bh); return -1; } - fs = n < 0 ? 0 : btree->u.external[n].file_secno + btree->u.external[n].length; + fs = n < 0 ? 0 : le32_to_cpu(btree->u.external[n].file_secno) + le32_to_cpu(btree->u.external[n].length); if (!btree->n_free_nodes) { - up = a != node ? anode->up : -1; + up = a != node ? le32_to_cpu(anode->up) : -1; if (!(anode = hpfs_alloc_anode(s, a, &na, &bh1))) { brelse(bh); hpfs_free_sectors(s, se, 1); return -1; } if (a == node && fnod) { - anode->up = node; + anode->up = cpu_to_le32(node); anode->btree.fnode_parent = 1; anode->btree.n_used_nodes = btree->n_used_nodes; anode->btree.first_free = btree->first_free; @@ -137,9 +137,9 @@ secno hpfs_add_sector_to_btree(struct super_block *s, secno node, int fnod, unsi btree->internal = 1; btree->n_free_nodes = 11; btree->n_used_nodes = 1; - btree->first_free = (char *)&(btree->u.internal[1]) - (char *)btree; - btree->u.internal[0].file_secno = -1; - btree->u.internal[0].down = na; + btree->first_free = cpu_to_le16((char *)&(btree->u.internal[1]) - (char *)btree); + btree->u.internal[0].file_secno = cpu_to_le32(-1); + btree->u.internal[0].down = cpu_to_le32(na); mark_buffer_dirty(bh); } else if (!(ranode = hpfs_alloc_anode(s, /*a*/0, &ra, &bh2))) { brelse(bh); @@ -153,15 +153,15 @@ secno hpfs_add_sector_to_btree(struct super_block *s, secno node, int fnod, unsi btree = &anode->btree; } btree->n_free_nodes--; n = btree->n_used_nodes++; - btree->first_free += 12; - btree->u.external[n].disk_secno = se; - btree->u.external[n].file_secno = fs; - btree->u.external[n].length = 1; + btree->first_free = cpu_to_le16(le16_to_cpu(btree->first_free) + 12); + btree->u.external[n].disk_secno = cpu_to_le32(se); + btree->u.external[n].file_secno = cpu_to_le32(fs); + btree->u.external[n].length = cpu_to_le32(1); mark_buffer_dirty(bh); brelse(bh); if ((a == node && fnod) || na == -1) return se; c2 = 0; - while (up != -1) { + while (up != (anode_secno)-1) { struct anode *new_anode; if (hpfs_sb(s)->sb_chk) if (hpfs_stop_cycles(s, up, &c1, &c2, "hpfs_add_sector_to_btree #2")) return -1; @@ -174,47 +174,47 @@ secno hpfs_add_sector_to_btree(struct super_block *s, secno node, int fnod, unsi } if (btree->n_free_nodes) { btree->n_free_nodes--; n = btree->n_used_nodes++; - btree->first_free += 8; - btree->u.internal[n].file_secno = -1; - btree->u.internal[n].down = na; - btree->u.internal[n-1].file_secno = fs; + btree->first_free = cpu_to_le16(le16_to_cpu(btree->first_free) + 8); + btree->u.internal[n].file_secno = cpu_to_le32(-1); + btree->u.internal[n].down = cpu_to_le32(na); + btree->u.internal[n-1].file_secno = cpu_to_le32(fs); mark_buffer_dirty(bh); brelse(bh); brelse(bh2); hpfs_free_sectors(s, ra, 1); if ((anode = hpfs_map_anode(s, na, &bh))) { - anode->up = up; + anode->up = cpu_to_le32(up); anode->btree.fnode_parent = up == node && fnod; mark_buffer_dirty(bh); brelse(bh); } return se; } - up = up != node ? anode->up : -1; - btree->u.internal[btree->n_used_nodes - 1].file_secno = /*fs*/-1; + up = up != node ? le32_to_cpu(anode->up) : -1; + btree->u.internal[btree->n_used_nodes - 1].file_secno = cpu_to_le32(/*fs*/-1); mark_buffer_dirty(bh); brelse(bh); a = na; if ((new_anode = hpfs_alloc_anode(s, a, &na, &bh))) { anode = new_anode; - /*anode->up = up != -1 ? up : ra;*/ + /*anode->up = cpu_to_le32(up != -1 ? up : ra);*/ anode->btree.internal = 1; anode->btree.n_used_nodes = 1; anode->btree.n_free_nodes = 59; - anode->btree.first_free = 16; - anode->btree.u.internal[0].down = a; - anode->btree.u.internal[0].file_secno = -1; + anode->btree.first_free = cpu_to_le16(16); + anode->btree.u.internal[0].down = cpu_to_le32(a); + anode->btree.u.internal[0].file_secno = cpu_to_le32(-1); mark_buffer_dirty(bh); brelse(bh); if ((anode = hpfs_map_anode(s, a, &bh))) { - anode->up = na; + anode->up = cpu_to_le32(na); mark_buffer_dirty(bh); brelse(bh); } } else na = a; } if ((anode = hpfs_map_anode(s, na, &bh))) { - anode->up = node; + anode->up = cpu_to_le32(node); if (fnod) anode->btree.fnode_parent = 1; mark_buffer_dirty(bh); brelse(bh); @@ -232,14 +232,14 @@ secno hpfs_add_sector_to_btree(struct super_block *s, secno node, int fnod, unsi } btree = &fnode->btree; } - ranode->up = node; - memcpy(&ranode->btree, btree, btree->first_free); + ranode->up = cpu_to_le32(node); + memcpy(&ranode->btree, btree, le16_to_cpu(btree->first_free)); if (fnod) ranode->btree.fnode_parent = 1; ranode->btree.n_free_nodes = (ranode->btree.internal ? 60 : 40) - ranode->btree.n_used_nodes; if (ranode->btree.internal) for (n = 0; n < ranode->btree.n_used_nodes; n++) { struct anode *unode; - if ((unode = hpfs_map_anode(s, ranode->u.internal[n].down, &bh1))) { - unode->up = ra; + if ((unode = hpfs_map_anode(s, le32_to_cpu(ranode->u.internal[n].down), &bh1))) { + unode->up = cpu_to_le32(ra); unode->btree.fnode_parent = 0; mark_buffer_dirty(bh1); brelse(bh1); @@ -248,11 +248,11 @@ secno hpfs_add_sector_to_btree(struct super_block *s, secno node, int fnod, unsi btree->internal = 1; btree->n_free_nodes = fnod ? 10 : 58; btree->n_used_nodes = 2; - btree->first_free = (char *)&btree->u.internal[2] - (char *)btree; - btree->u.internal[0].file_secno = fs; - btree->u.internal[0].down = ra; - btree->u.internal[1].file_secno = -1; - btree->u.internal[1].down = na; + btree->first_free = cpu_to_le16((char *)&btree->u.internal[2] - (char *)btree); + btree->u.internal[0].file_secno = cpu_to_le32(fs); + btree->u.internal[0].down = cpu_to_le32(ra); + btree->u.internal[1].file_secno = cpu_to_le32(-1); + btree->u.internal[1].down = cpu_to_le32(na); mark_buffer_dirty(bh); brelse(bh); mark_buffer_dirty(bh2); @@ -279,7 +279,7 @@ void hpfs_remove_btree(struct super_block *s, struct bplus_header *btree) go_down: d2 = 0; while (btree1->internal) { - ano = btree1->u.internal[pos].down; + ano = le32_to_cpu(btree1->u.internal[pos].down); if (level) brelse(bh); if (hpfs_sb(s)->sb_chk) if (hpfs_stop_cycles(s, ano, &d1, &d2, "hpfs_remove_btree #1")) @@ -290,7 +290,7 @@ void hpfs_remove_btree(struct super_block *s, struct bplus_header *btree) pos = 0; } for (i = 0; i < btree1->n_used_nodes; i++) - hpfs_free_sectors(s, btree1->u.external[i].disk_secno, btree1->u.external[i].length); + hpfs_free_sectors(s, le32_to_cpu(btree1->u.external[i].disk_secno), le32_to_cpu(btree1->u.external[i].length)); go_up: if (!level) return; brelse(bh); @@ -298,13 +298,13 @@ void hpfs_remove_btree(struct super_block *s, struct bplus_header *btree) if (hpfs_stop_cycles(s, ano, &c1, &c2, "hpfs_remove_btree #2")) return; hpfs_free_sectors(s, ano, 1); oano = ano; - ano = anode->up; + ano = le32_to_cpu(anode->up); if (--level) { if (!(anode = hpfs_map_anode(s, ano, &bh))) return; btree1 = &anode->btree; } else btree1 = btree; for (i = 0; i < btree1->n_used_nodes; i++) { - if (btree1->u.internal[i].down == oano) { + if (le32_to_cpu(btree1->u.internal[i].down) == oano) { if ((pos = i + 1) < btree1->n_used_nodes) goto go_down; else @@ -411,7 +411,7 @@ void hpfs_truncate_btree(struct super_block *s, secno f, int fno, unsigned secs) if (fno) { btree->n_free_nodes = 8; btree->n_used_nodes = 0; - btree->first_free = 8; + btree->first_free = cpu_to_le16(8); btree->internal = 0; mark_buffer_dirty(bh); } else hpfs_free_sectors(s, f, 1); @@ -421,22 +421,22 @@ void hpfs_truncate_btree(struct super_block *s, secno f, int fno, unsigned secs) while (btree->internal) { nodes = btree->n_used_nodes + btree->n_free_nodes; for (i = 0; i < btree->n_used_nodes; i++) - if (btree->u.internal[i].file_secno >= secs) goto f; + if (le32_to_cpu(btree->u.internal[i].file_secno) >= secs) goto f; brelse(bh); hpfs_error(s, "internal btree %08x doesn't end with -1", node); return; f: for (j = i + 1; j < btree->n_used_nodes; j++) - hpfs_ea_remove(s, btree->u.internal[j].down, 1, 0); + hpfs_ea_remove(s, le32_to_cpu(btree->u.internal[j].down), 1, 0); btree->n_used_nodes = i + 1; btree->n_free_nodes = nodes - btree->n_used_nodes; - btree->first_free = 8 + 8 * btree->n_used_nodes; + btree->first_free = cpu_to_le16(8 + 8 * btree->n_used_nodes); mark_buffer_dirty(bh); - if (btree->u.internal[i].file_secno == secs) { + if (btree->u.internal[i].file_secno == cpu_to_le32(secs)) { brelse(bh); return; } - node = btree->u.internal[i].down; + node = le32_to_cpu(btree->u.internal[i].down); brelse(bh); if (hpfs_sb(s)->sb_chk) if (hpfs_stop_cycles(s, node, &c1, &c2, "hpfs_truncate_btree")) @@ -446,25 +446,25 @@ void hpfs_truncate_btree(struct super_block *s, secno f, int fno, unsigned secs) } nodes = btree->n_used_nodes + btree->n_free_nodes; for (i = 0; i < btree->n_used_nodes; i++) - if (btree->u.external[i].file_secno + btree->u.external[i].length >= secs) goto ff; + if (le32_to_cpu(btree->u.external[i].file_secno) + le32_to_cpu(btree->u.external[i].length) >= secs) goto ff; brelse(bh); return; ff: - if (secs <= btree->u.external[i].file_secno) { + if (secs <= le32_to_cpu(btree->u.external[i].file_secno)) { hpfs_error(s, "there is an allocation error in file %08x, sector %08x", f, secs); if (i) i--; } - else if (btree->u.external[i].file_secno + btree->u.external[i].length > secs) { - hpfs_free_sectors(s, btree->u.external[i].disk_secno + secs - - btree->u.external[i].file_secno, btree->u.external[i].length - - secs + btree->u.external[i].file_secno); /* I hope gcc optimizes this :-) */ - btree->u.external[i].length = secs - btree->u.external[i].file_secno; + else if (le32_to_cpu(btree->u.external[i].file_secno) + le32_to_cpu(btree->u.external[i].length) > secs) { + hpfs_free_sectors(s, le32_to_cpu(btree->u.external[i].disk_secno) + secs - + le32_to_cpu(btree->u.external[i].file_secno), le32_to_cpu(btree->u.external[i].length) + - secs + le32_to_cpu(btree->u.external[i].file_secno)); /* I hope gcc optimizes this :-) */ + btree->u.external[i].length = cpu_to_le32(secs - le32_to_cpu(btree->u.external[i].file_secno)); } for (j = i + 1; j < btree->n_used_nodes; j++) - hpfs_free_sectors(s, btree->u.external[j].disk_secno, btree->u.external[j].length); + hpfs_free_sectors(s, le32_to_cpu(btree->u.external[j].disk_secno), le32_to_cpu(btree->u.external[j].length)); btree->n_used_nodes = i + 1; btree->n_free_nodes = nodes - btree->n_used_nodes; - btree->first_free = 8 + 12 * btree->n_used_nodes; + btree->first_free = cpu_to_le16(8 + 12 * btree->n_used_nodes); mark_buffer_dirty(bh); brelse(bh); } @@ -480,12 +480,12 @@ void hpfs_remove_fnode(struct super_block *s, fnode_secno fno) struct extended_attribute *ea_end; if (!(fnode = hpfs_map_fnode(s, fno, &bh))) return; if (!fnode->dirflag) hpfs_remove_btree(s, &fnode->btree); - else hpfs_remove_dtree(s, fnode->u.external[0].disk_secno); + else hpfs_remove_dtree(s, le32_to_cpu(fnode->u.external[0].disk_secno)); ea_end = fnode_end_ea(fnode); for (ea = fnode_ea(fnode); ea < ea_end; ea = next_ea(ea)) if (ea->indirect) hpfs_ea_remove(s, ea_sec(ea), ea->anode, ea_len(ea)); - hpfs_ea_ext_remove(s, fnode->ea_secno, fnode->ea_anode, fnode->ea_size_l); + hpfs_ea_ext_remove(s, le32_to_cpu(fnode->ea_secno), fnode->ea_anode, le32_to_cpu(fnode->ea_size_l)); brelse(bh); hpfs_free_sectors(s, fno, 1); } diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c index 208f3d7..f46ae02 100644 --- a/fs/hpfs/dir.c +++ b/fs/hpfs/dir.c @@ -88,9 +88,9 @@ static int hpfs_readdir(struct file *filp, void *dirent, filldir_t filldir) hpfs_error(inode->i_sb, "not a directory, fnode %08lx", (unsigned long)inode->i_ino); } - if (hpfs_inode->i_dno != fno->u.external[0].disk_secno) { + if (hpfs_inode->i_dno != le32_to_cpu(fno->u.external[0].disk_secno)) { e = 1; - hpfs_error(inode->i_sb, "corrupted inode: i_dno == %08x, fnode -> dnode == %08x", hpfs_inode->i_dno, fno->u.external[0].disk_secno); + hpfs_error(inode->i_sb, "corrupted inode: i_dno == %08x, fnode -> dnode == %08x", hpfs_inode->i_dno, le32_to_cpu(fno->u.external[0].disk_secno)); } brelse(bh); if (e) { @@ -156,7 +156,7 @@ static int hpfs_readdir(struct file *filp, void *dirent, filldir_t filldir) goto again; } tempname = hpfs_translate_name(inode->i_sb, de->name, de->namelen, lc, de->not_8x3); - if (filldir(dirent, tempname, de->namelen, old_pos, de->fnode, DT_UNKNOWN) < 0) { + if (filldir(dirent, tempname, de->namelen, old_pos, le32_to_cpu(de->fnode), DT_UNKNOWN) < 0) { filp->f_pos = old_pos; if (tempname != de->name) kfree(tempname); hpfs_brelse4(&qbh); @@ -221,7 +221,7 @@ struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, struct name * Get inode number, what we're after. */ - ino = de->fnode; + ino = le32_to_cpu(de->fnode); /* * Go find or make an inode. @@ -236,7 +236,7 @@ struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, struct name hpfs_init_inode(result); if (de->directory) hpfs_read_inode(result); - else if (de->ea_size && hpfs_sb(dir->i_sb)->sb_eas) + else if (le32_to_cpu(de->ea_size) && hpfs_sb(dir->i_sb)->sb_eas) hpfs_read_inode(result); else { result->i_mode |= S_IFREG; @@ -261,19 +261,19 @@ struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, struct name */ if (!result->i_ctime.tv_sec) { - if (!(result->i_ctime.tv_sec = local_to_gmt(dir->i_sb, de->creation_date))) + if (!(result->i_ctime.tv_sec = local_to_gmt(dir->i_sb, le32_to_cpu(de->creation_date)))) result->i_ctime.tv_sec = 1; result->i_ctime.tv_nsec = 0; - result->i_mtime.tv_sec = local_to_gmt(dir->i_sb, de->write_date); + result->i_mtime.tv_sec = local_to_gmt(dir->i_sb, le32_to_cpu(de->write_date)); result->i_mtime.tv_nsec = 0; - result->i_atime.tv_sec = local_to_gmt(dir->i_sb, de->read_date); + result->i_atime.tv_sec = local_to_gmt(dir->i_sb, le32_to_cpu(de->read_date)); result->i_atime.tv_nsec = 0; - hpfs_result->i_ea_size = de->ea_size; + hpfs_result->i_ea_size = le32_to_cpu(de->ea_size); if (!hpfs_result->i_ea_mode && de->read_only) result->i_mode &= ~0222; if (!de->directory) { if (result->i_size == -1) { - result->i_size = de->file_size; + result->i_size = le32_to_cpu(de->file_size); result->i_data.a_ops = &hpfs_aops; hpfs_i(result)->mmu_private = result->i_size; /* diff --git a/fs/hpfs/dnode.c b/fs/hpfs/dnode.c index 07711c3..1e0e2ac 100644 --- a/fs/hpfs/dnode.c +++ b/fs/hpfs/dnode.c @@ -14,11 +14,11 @@ static loff_t get_pos(struct dnode *d, struct hpfs_dirent *fde) struct hpfs_dirent *de_end = dnode_end_de(d); int i = 1; for (de = dnode_first_de(d); de < de_end; de = de_next_de(de)) { - if (de == fde) return ((loff_t) d->self << 4) | (loff_t)i; + if (de == fde) return ((loff_t) le32_to_cpu(d->self) << 4) | (loff_t)i; i++; } printk("HPFS: get_pos: not_found\n"); - return ((loff_t)d->self << 4) | (loff_t)1; + return ((loff_t)le32_to_cpu(d->self) << 4) | (loff_t)1; } void hpfs_add_pos(struct inode *inode, loff_t *pos) @@ -130,30 +130,30 @@ static void set_last_pointer(struct super_block *s, struct dnode *d, dnode_secno { struct hpfs_dirent *de; if (!(de = dnode_last_de(d))) { - hpfs_error(s, "set_last_pointer: empty dnode %08x", d->self); + hpfs_error(s, "set_last_pointer: empty dnode %08x", le32_to_cpu(d->self)); return; } if (hpfs_sb(s)->sb_chk) { if (de->down) { hpfs_error(s, "set_last_pointer: dnode %08x has already last pointer %08x", - d->self, de_down_pointer(de)); + le32_to_cpu(d->self), de_down_pointer(de)); return; } - if (de->length != 32) { - hpfs_error(s, "set_last_pointer: bad last dirent in dnode %08x", d->self); + if (le16_to_cpu(de->length) != 32) { + hpfs_error(s, "set_last_pointer: bad last dirent in dnode %08x", le32_to_cpu(d->self)); return; } } if (ptr) { d->first_free = cpu_to_le32(le32_to_cpu(d->first_free) + 4); if (le32_to_cpu(d->first_free) > 2048) { - hpfs_error(s, "set_last_pointer: too long dnode %08x", d->self); + hpfs_error(s, "set_last_pointer: too long dnode %08x", le32_to_cpu(d->self)); d->first_free = cpu_to_le32(le32_to_cpu(d->first_free) - 4); return; } - de->length = 36; + de->length = cpu_to_le16(36); de->down = 1; - *(dnode_secno *)((char *)de + 32) = ptr; + *(dnode_secno *)((char *)de + 32) = cpu_to_le32(ptr); } } @@ -169,7 +169,7 @@ struct hpfs_dirent *hpfs_add_de(struct super_block *s, struct dnode *d, for (de = dnode_first_de(d); de < de_end; de = de_next_de(de)) { int c = hpfs_compare_names(s, name, namelen, de->name, de->namelen, de->last); if (!c) { - hpfs_error(s, "name (%c,%d) already exists in dnode %08x", *name, namelen, d->self); + hpfs_error(s, "name (%c,%d) already exists in dnode %08x", *name, namelen, le32_to_cpu(d->self)); return NULL; } if (c < 0) break; @@ -177,11 +177,10 @@ struct hpfs_dirent *hpfs_add_de(struct super_block *s, struct dnode *d, memmove((char *)de + d_size, de, (char *)de_end - (char *)de); memset(de, 0, d_size); if (down_ptr) { - *(int *)((char *)de + d_size - 4) = down_ptr; + *(dnode_secno *)((char *)de + d_size - 4) = cpu_to_le32(down_ptr); de->down = 1; } - de->length = d_size; - if (down_ptr) de->down = 1; + de->length = cpu_to_le16(d_size); de->not_8x3 = hpfs_is_name_long(name, namelen); de->namelen = namelen; memcpy(de->name, name, namelen); @@ -195,10 +194,10 @@ static void hpfs_delete_de(struct super_block *s, struct dnode *d, struct hpfs_dirent *de) { if (de->last) { - hpfs_error(s, "attempt to delete last dirent in dnode %08x", d->self); + hpfs_error(s, "attempt to delete last dirent in dnode %08x", le32_to_cpu(d->self)); return; } - d->first_free = cpu_to_le32(le32_to_cpu(d->first_free) - de->length); + d->first_free = cpu_to_le32(le32_to_cpu(d->first_free) - le16_to_cpu(de->length)); memmove(de, de_next_de(de), le32_to_cpu(d->first_free) + (char *)d - (char *)de); } @@ -206,14 +205,14 @@ static void fix_up_ptrs(struct super_block *s, struct dnode *d) { struct hpfs_dirent *de; struct hpfs_dirent *de_end = dnode_end_de(d); - dnode_secno dno = d->self; + dnode_secno dno = le32_to_cpu(d->self); for (de = dnode_first_de(d); de < de_end; de = de_next_de(de)) if (de->down) { struct quad_buffer_head qbh; struct dnode *dd; if ((dd = hpfs_map_dnode(s, de_down_pointer(de), &qbh))) { - if (dd->up != dno || dd->root_dnode) { - dd->up = dno; + if (le32_to_cpu(dd->up) != dno || dd->root_dnode) { + dd->up = cpu_to_le32(dno); dd->root_dnode = 0; hpfs_mark_4buffers_dirty(&qbh); } @@ -291,7 +290,7 @@ static int hpfs_add_to_dnode(struct inode *i, dnode_secno dno, copy_de(de = hpfs_add_de(i->i_sb, nd, name, namelen, down_ptr), new_de); for_all_poss(i, hpfs_pos_ins, get_pos(nd, de), 1); h = ((char *)dnode_last_de(nd) - (char *)nd) / 2 + 10; - if (!(ad = hpfs_alloc_dnode(i->i_sb, d->up, &adno, &qbh1))) { + if (!(ad = hpfs_alloc_dnode(i->i_sb, le32_to_cpu(d->up), &adno, &qbh1))) { hpfs_error(i->i_sb, "unable to alloc dnode - dnode tree will be corrupted"); hpfs_brelse4(&qbh); kfree(nd); @@ -315,19 +314,20 @@ static int hpfs_add_to_dnode(struct inode *i, dnode_secno dno, set_last_pointer(i->i_sb, ad, de->down ? de_down_pointer(de) : 0); de = de_next_de(de); memmove((char *)nd + 20, de, le32_to_cpu(nd->first_free) + (char *)nd - (char *)de); - nd->first_free = cpu_to_le32(le32_to_cpu(nd->first_free) - (char *)de - (char *)nd - 20); + nd->first_free = cpu_to_le32(le32_to_cpu(nd->first_free) - ((char *)de - (char *)nd - 20)); memcpy(d, nd, le32_to_cpu(nd->first_free)); for_all_poss(i, hpfs_pos_del, (loff_t)dno << 4, pos); fix_up_ptrs(i->i_sb, ad); if (!d->root_dnode) { - dno = ad->up = d->up; + ad->up = d->up; + dno = le32_to_cpu(ad->up); hpfs_mark_4buffers_dirty(&qbh); hpfs_brelse4(&qbh); hpfs_mark_4buffers_dirty(&qbh1); hpfs_brelse4(&qbh1); goto go_up; } - if (!(rd = hpfs_alloc_dnode(i->i_sb, d->up, &rdno, &qbh2))) { + if (!(rd = hpfs_alloc_dnode(i->i_sb, le32_to_cpu(d->up), &rdno, &qbh2))) { hpfs_error(i->i_sb, "unable to alloc dnode - dnode tree will be corrupted"); hpfs_brelse4(&qbh); hpfs_brelse4(&qbh1); @@ -339,7 +339,7 @@ static int hpfs_add_to_dnode(struct inode *i, dnode_secno dno, i->i_blocks += 4; rd->root_dnode = 1; rd->up = d->up; - if (!(fnode = hpfs_map_fnode(i->i_sb, d->up, &bh))) { + if (!(fnode = hpfs_map_fnode(i->i_sb, le32_to_cpu(d->up), &bh))) { hpfs_free_dnode(i->i_sb, rdno); hpfs_brelse4(&qbh); hpfs_brelse4(&qbh1); @@ -348,10 +348,11 @@ static int hpfs_add_to_dnode(struct inode *i, dnode_secno dno, kfree(nname); return 1; } - fnode->u.external[0].disk_secno = rdno; + fnode->u.external[0].disk_secno = cpu_to_le32(rdno); mark_buffer_dirty(bh); brelse(bh); - d->up = ad->up = hpfs_i(i)->i_dno = rdno; + hpfs_i(i)->i_dno = rdno; + d->up = ad->up = cpu_to_le32(rdno); d->root_dnode = ad->root_dnode = 0; hpfs_mark_4buffers_dirty(&qbh); hpfs_brelse4(&qbh); @@ -436,9 +437,9 @@ static secno move_to_top(struct inode *i, dnode_secno from, dnode_secno to) return 0; if (!(dnode = hpfs_map_dnode(i->i_sb, dno, &qbh))) return 0; if (hpfs_sb(i->i_sb)->sb_chk) { - if (dnode->up != chk_up) { + if (le32_to_cpu(dnode->up) != chk_up) { hpfs_error(i->i_sb, "move_to_top: up pointer from %08x should be %08x, is %08x", - dno, chk_up, dnode->up); + dno, chk_up, le32_to_cpu(dnode->up)); hpfs_brelse4(&qbh); return 0; } @@ -454,7 +455,7 @@ static secno move_to_top(struct inode *i, dnode_secno from, dnode_secno to) hpfs_brelse4(&qbh); } while (!(de = dnode_pre_last_de(dnode))) { - dnode_secno up = dnode->up; + dnode_secno up = le32_to_cpu(dnode->up); hpfs_brelse4(&qbh); hpfs_free_dnode(i->i_sb, dno); i->i_size -= 2048; @@ -474,7 +475,7 @@ static secno move_to_top(struct inode *i, dnode_secno from, dnode_secno to) return 0; } dnode->first_free = cpu_to_le32(le32_to_cpu(dnode->first_free) - 4); - de->length -= 4; + de->length = cpu_to_le16(le16_to_cpu(de->length) - 4); de->down = 0; hpfs_mark_4buffers_dirty(&qbh); dno = up; @@ -482,12 +483,12 @@ static secno move_to_top(struct inode *i, dnode_secno from, dnode_secno to) t = get_pos(dnode, de); for_all_poss(i, hpfs_pos_subst, t, 4); for_all_poss(i, hpfs_pos_subst, t + 1, 5); - if (!(nde = kmalloc(de->length, GFP_NOFS))) { + if (!(nde = kmalloc(le16_to_cpu(de->length), GFP_NOFS))) { hpfs_error(i->i_sb, "out of memory for dirent - directory will be corrupted"); hpfs_brelse4(&qbh); return 0; } - memcpy(nde, de, de->length); + memcpy(nde, de, le16_to_cpu(de->length)); ddno = de->down ? de_down_pointer(de) : 0; hpfs_delete_de(i->i_sb, dnode, de); set_last_pointer(i->i_sb, dnode, ddno); @@ -520,7 +521,7 @@ static void delete_empty_dnode(struct inode *i, dnode_secno dno) if (le32_to_cpu(dnode->first_free) == 52 || le32_to_cpu(dnode->first_free) == 56) { struct hpfs_dirent *de_end; int root = dnode->root_dnode; - up = dnode->up; + up = le32_to_cpu(dnode->up); de = dnode_first_de(dnode); down = de->down ? de_down_pointer(de) : 0; if (hpfs_sb(i->i_sb)->sb_chk) if (root && !down) { @@ -544,13 +545,13 @@ static void delete_empty_dnode(struct inode *i, dnode_secno dno) return; } if ((d1 = hpfs_map_dnode(i->i_sb, down, &qbh1))) { - d1->up = up; + d1->up = cpu_to_le32(up); d1->root_dnode = 1; hpfs_mark_4buffers_dirty(&qbh1); hpfs_brelse4(&qbh1); } if ((fnode = hpfs_map_fnode(i->i_sb, up, &bh))) { - fnode->u.external[0].disk_secno = down; + fnode->u.external[0].disk_secno = cpu_to_le32(down); mark_buffer_dirty(bh); brelse(bh); } @@ -569,16 +570,16 @@ static void delete_empty_dnode(struct inode *i, dnode_secno dno) for_all_poss(i, hpfs_pos_subst, ((loff_t)dno << 4) | 1, ((loff_t)up << 4) | p); if (!down) { de->down = 0; - de->length -= 4; + de->length = cpu_to_le16(le16_to_cpu(de->length) - 4); dnode->first_free = cpu_to_le32(le32_to_cpu(dnode->first_free) - 4); memmove(de_next_de(de), (char *)de_next_de(de) + 4, (char *)dnode + le32_to_cpu(dnode->first_free) - (char *)de_next_de(de)); } else { struct dnode *d1; struct quad_buffer_head qbh1; - *(dnode_secno *) ((void *) de + de->length - 4) = down; + *(dnode_secno *) ((void *) de + le16_to_cpu(de->length) - 4) = down; if ((d1 = hpfs_map_dnode(i->i_sb, down, &qbh1))) { - d1->up = up; + d1->up = cpu_to_le32(up); hpfs_mark_4buffers_dirty(&qbh1); hpfs_brelse4(&qbh1); } @@ -595,18 +596,18 @@ static void delete_empty_dnode(struct inode *i, dnode_secno dno) struct quad_buffer_head qbh1; if (!de_next->down) goto endm; ndown = de_down_pointer(de_next); - if (!(de_cp = kmalloc(de->length, GFP_NOFS))) { + if (!(de_cp = kmalloc(le16_to_cpu(de->length), GFP_NOFS))) { printk("HPFS: out of memory for dtree balancing\n"); goto endm; } - memcpy(de_cp, de, de->length); + memcpy(de_cp, de, le16_to_cpu(de->length)); hpfs_delete_de(i->i_sb, dnode, de); hpfs_mark_4buffers_dirty(&qbh); hpfs_brelse4(&qbh); for_all_poss(i, hpfs_pos_subst, ((loff_t)up << 4) | p, 4); for_all_poss(i, hpfs_pos_del, ((loff_t)up << 4) | p, 1); if (de_cp->down) if ((d1 = hpfs_map_dnode(i->i_sb, de_down_pointer(de_cp), &qbh1))) { - d1->up = ndown; + d1->up = cpu_to_le32(ndown); hpfs_mark_4buffers_dirty(&qbh1); hpfs_brelse4(&qbh1); } @@ -646,38 +647,38 @@ static void delete_empty_dnode(struct inode *i, dnode_secno dno) printk("HPFS: warning: unbalanced dnode tree, see hpfs.txt 4 more info\n"); printk("HPFS: warning: goin'on\n"); } - del->length += 4; + del->length = cpu_to_le16(le16_to_cpu(del->length) + 4); del->down = 1; d1->first_free = cpu_to_le32(le32_to_cpu(d1->first_free) + 4); } if (dlp && !down) { - del->length -= 4; + del->length = cpu_to_le16(le16_to_cpu(del->length) - 4); del->down = 0; d1->first_free = cpu_to_le32(le32_to_cpu(d1->first_free) - 4); } else if (down) - *(dnode_secno *) ((void *) del + del->length - 4) = down; + *(dnode_secno *) ((void *) del + le16_to_cpu(del->length) - 4) = cpu_to_le32(down); } else goto endm; - if (!(de_cp = kmalloc(de_prev->length, GFP_NOFS))) { + if (!(de_cp = kmalloc(le16_to_cpu(de_prev->length), GFP_NOFS))) { printk("HPFS: out of memory for dtree balancing\n"); hpfs_brelse4(&qbh1); goto endm; } hpfs_mark_4buffers_dirty(&qbh1); hpfs_brelse4(&qbh1); - memcpy(de_cp, de_prev, de_prev->length); + memcpy(de_cp, de_prev, le16_to_cpu(de_prev->length)); hpfs_delete_de(i->i_sb, dnode, de_prev); if (!de_prev->down) { - de_prev->length += 4; + de_prev->length = cpu_to_le16(le16_to_cpu(de_prev->length) + 4); de_prev->down = 1; dnode->first_free = cpu_to_le32(le32_to_cpu(dnode->first_free) + 4); } - *(dnode_secno *) ((void *) de_prev + de_prev->length - 4) = ndown; + *(dnode_secno *) ((void *) de_prev + le16_to_cpu(de_prev->length) - 4) = cpu_to_le32(ndown); hpfs_mark_4buffers_dirty(&qbh); hpfs_brelse4(&qbh); for_all_poss(i, hpfs_pos_subst, ((loff_t)up << 4) | (p - 1), 4); for_all_poss(i, hpfs_pos_subst, ((loff_t)up << 4) | p, ((loff_t)up << 4) | (p - 1)); if (down) if ((d1 = hpfs_map_dnode(i->i_sb, de_down_pointer(de), &qbh1))) { - d1->up = ndown; + d1->up = cpu_to_le32(ndown); hpfs_mark_4buffers_dirty(&qbh1); hpfs_brelse4(&qbh1); } @@ -744,8 +745,8 @@ void hpfs_count_dnodes(struct super_block *s, dnode_secno dno, int *n_dnodes, ptr = 0; go_up: if (!(dnode = hpfs_map_dnode(s, dno, &qbh))) return; - if (hpfs_sb(s)->sb_chk) if (odno && odno != -1 && dnode->up != odno) - hpfs_error(s, "hpfs_count_dnodes: bad up pointer; dnode %08x, down %08x points to %08x", odno, dno, dnode->up); + if (hpfs_sb(s)->sb_chk) if (odno && odno != -1 && le32_to_cpu(dnode->up) != odno) + hpfs_error(s, "hpfs_count_dnodes: bad up pointer; dnode %08x, down %08x points to %08x", odno, dno, le32_to_cpu(dnode->up)); de = dnode_first_de(dnode); if (ptr) while(1) { if (de->down) if (de_down_pointer(de) == ptr) goto process_de; @@ -769,7 +770,7 @@ void hpfs_count_dnodes(struct super_block *s, dnode_secno dno, int *n_dnodes, if (!de->first && !de->last && n_items) (*n_items)++; if ((de = de_next_de(de)) < dnode_end_de(dnode)) goto next_de; ptr = dno; - dno = dnode->up; + dno = le32_to_cpu(dnode->up); if (dnode->root_dnode) { hpfs_brelse4(&qbh); return; @@ -817,8 +818,8 @@ dnode_secno hpfs_de_as_down_as_possible(struct super_block *s, dnode_secno dno) return d; if (!(de = map_nth_dirent(s, d, 1, &qbh, NULL))) return dno; if (hpfs_sb(s)->sb_chk) - if (up && ((struct dnode *)qbh.data)->up != up) - hpfs_error(s, "hpfs_de_as_down_as_possible: bad up pointer; dnode %08x, down %08x points to %08x", up, d, ((struct dnode *)qbh.data)->up); + if (up && le32_to_cpu(((struct dnode *)qbh.data)->up) != up) + hpfs_error(s, "hpfs_de_as_down_as_possible: bad up pointer; dnode %08x, down %08x points to %08x", up, d, le32_to_cpu(((struct dnode *)qbh.data)->up)); if (!de->down) { hpfs_brelse4(&qbh); return d; @@ -867,7 +868,7 @@ struct hpfs_dirent *map_pos_dirent(struct inode *inode, loff_t *posp, /* Going up */ if (dnode->root_dnode) goto bail; - if (!(up_dnode = hpfs_map_dnode(inode->i_sb, dnode->up, &qbh0))) + if (!(up_dnode = hpfs_map_dnode(inode->i_sb, le32_to_cpu(dnode->up), &qbh0))) goto bail; end_up_de = dnode_end_de(up_dnode); @@ -875,16 +876,16 @@ struct hpfs_dirent *map_pos_dirent(struct inode *inode, loff_t *posp, for (up_de = dnode_first_de(up_dnode); up_de < end_up_de; up_de = de_next_de(up_de)) { if (!(++c & 077)) hpfs_error(inode->i_sb, - "map_pos_dirent: pos crossed dnode boundary; dnode = %08x", dnode->up); + "map_pos_dirent: pos crossed dnode boundary; dnode = %08x", le32_to_cpu(dnode->up)); if (up_de->down && de_down_pointer(up_de) == dno) { - *posp = ((loff_t) dnode->up << 4) + c; + *posp = ((loff_t) le32_to_cpu(dnode->up) << 4) + c; hpfs_brelse4(&qbh0); return de; } } hpfs_error(inode->i_sb, "map_pos_dirent: pointer to dnode %08x not found in parent dnode %08x", - dno, dnode->up); + dno, le32_to_cpu(dnode->up)); hpfs_brelse4(&qbh0); bail: @@ -1010,17 +1011,17 @@ struct hpfs_dirent *map_fnode_dirent(struct super_block *s, fnode_secno fno, /*name2[15] = 0xff;*/ name1len = 15; name2len = 256; } - if (!(upf = hpfs_map_fnode(s, f->up, &bh))) { + if (!(upf = hpfs_map_fnode(s, le32_to_cpu(f->up), &bh))) { kfree(name2); return NULL; } if (!upf->dirflag) { brelse(bh); - hpfs_error(s, "fnode %08x has non-directory parent %08x", fno, f->up); + hpfs_error(s, "fnode %08x has non-directory parent %08x", fno, le32_to_cpu(f->up)); kfree(name2); return NULL; } - dno = upf->u.external[0].disk_secno; + dno = le32_to_cpu(upf->u.external[0].disk_secno); brelse(bh); go_down: downd = 0; @@ -1042,7 +1043,7 @@ struct hpfs_dirent *map_fnode_dirent(struct super_block *s, fnode_secno fno, return NULL; } next_de: - if (de->fnode == fno) { + if (le32_to_cpu(de->fnode) == fno) { kfree(name2); return de; } @@ -1058,7 +1059,7 @@ struct hpfs_dirent *map_fnode_dirent(struct super_block *s, fnode_secno fno, goto go_down; } f: - if (de->fnode == fno) { + if (le32_to_cpu(de->fnode) == fno) { kfree(name2); return de; } @@ -1067,7 +1068,7 @@ struct hpfs_dirent *map_fnode_dirent(struct super_block *s, fnode_secno fno, if ((de = de_next_de(de)) < de_end) goto next_de; if (d->root_dnode) goto not_found; downd = dno; - dno = d->up; + dno = le32_to_cpu(d->up); hpfs_brelse4(qbh); if (hpfs_sb(s)->sb_chk) if (hpfs_stop_cycles(s, downd, &d1, &d2, "map_fnode_dirent #2")) { diff --git a/fs/hpfs/ea.c b/fs/hpfs/ea.c index 1ac05bb..1bbc37d 100644 --- a/fs/hpfs/ea.c +++ b/fs/hpfs/ea.c @@ -24,7 +24,7 @@ void hpfs_ea_ext_remove(struct super_block *s, secno a, int ano, unsigned len) } if (hpfs_ea_read(s, a, ano, pos, 4, ex)) return; if (ea->indirect) { - if (ea->valuelen != 8) { + if (le16_to_cpu(ea->valuelen) != 8) { hpfs_error(s, "ea->indirect set while ea->valuelen!=8, %s %08x, pos %08x", ano ? "anode" : "sectors", a, pos); return; @@ -33,7 +33,7 @@ void hpfs_ea_ext_remove(struct super_block *s, secno a, int ano, unsigned len) return; hpfs_ea_remove(s, ea_sec(ea), ea->anode, ea_len(ea)); } - pos += ea->namelen + ea->valuelen + 5; + pos += ea->namelen + le16_to_cpu(ea->valuelen) + 5; } if (!ano) hpfs_free_sectors(s, a, (len+511) >> 9); else { @@ -82,14 +82,14 @@ int hpfs_read_ea(struct super_block *s, struct fnode *fnode, char *key, if (!strcmp(ea->name, key)) { if (ea->indirect) goto indirect; - if (ea->valuelen >= size) + if (le16_to_cpu(ea->valuelen) >= size) return -EINVAL; - memcpy(buf, ea_data(ea), ea->valuelen); - buf[ea->valuelen] = 0; + memcpy(buf, ea_data(ea), le16_to_cpu(ea->valuelen)); + buf[le16_to_cpu(ea->valuelen)] = 0; return 0; } - a = fnode->ea_secno; - len = fnode->ea_size_l; + a = le32_to_cpu(fnode->ea_secno); + len = le32_to_cpu(fnode->ea_size_l); ano = fnode->ea_anode; pos = 0; while (pos < len) { @@ -106,14 +106,14 @@ int hpfs_read_ea(struct super_block *s, struct fnode *fnode, char *key, if (!strcmp(ea->name, key)) { if (ea->indirect) goto indirect; - if (ea->valuelen >= size) + if (le16_to_cpu(ea->valuelen) >= size) return -EINVAL; - if (hpfs_ea_read(s, a, ano, pos + 4 + ea->namelen + 1, ea->valuelen, buf)) + if (hpfs_ea_read(s, a, ano, pos + 4 + ea->namelen + 1, le16_to_cpu(ea->valuelen), buf)) return -EIO; - buf[ea->valuelen] = 0; + buf[le16_to_cpu(ea->valuelen)] = 0; return 0; } - pos += ea->namelen + ea->valuelen + 5; + pos += ea->namelen + le16_to_cpu(ea->valuelen) + 5; } return -ENOENT; indirect: @@ -138,16 +138,16 @@ char *hpfs_get_ea(struct super_block *s, struct fnode *fnode, char *key, int *si if (!strcmp(ea->name, key)) { if (ea->indirect) return get_indirect_ea(s, ea->anode, ea_sec(ea), *size = ea_len(ea)); - if (!(ret = kmalloc((*size = ea->valuelen) + 1, GFP_NOFS))) { + if (!(ret = kmalloc((*size = le16_to_cpu(ea->valuelen)) + 1, GFP_NOFS))) { printk("HPFS: out of memory for EA\n"); return NULL; } - memcpy(ret, ea_data(ea), ea->valuelen); - ret[ea->valuelen] = 0; + memcpy(ret, ea_data(ea), le16_to_cpu(ea->valuelen)); + ret[le16_to_cpu(ea->valuelen)] = 0; return ret; } - a = fnode->ea_secno; - len = fnode->ea_size_l; + a = le32_to_cpu(fnode->ea_secno); + len = le32_to_cpu(fnode->ea_size_l); ano = fnode->ea_anode; pos = 0; while (pos < len) { @@ -164,18 +164,18 @@ char *hpfs_get_ea(struct super_block *s, struct fnode *fnode, char *key, int *si if (!strcmp(ea->name, key)) { if (ea->indirect) return get_indirect_ea(s, ea->anode, ea_sec(ea), *size = ea_len(ea)); - if (!(ret = kmalloc((*size = ea->valuelen) + 1, GFP_NOFS))) { + if (!(ret = kmalloc((*size = le16_to_cpu(ea->valuelen)) + 1, GFP_NOFS))) { printk("HPFS: out of memory for EA\n"); return NULL; } - if (hpfs_ea_read(s, a, ano, pos + 4 + ea->namelen + 1, ea->valuelen, ret)) { + if (hpfs_ea_read(s, a, ano, pos + 4 + ea->namelen + 1, le16_to_cpu(ea->valuelen), ret)) { kfree(ret); return NULL; } - ret[ea->valuelen] = 0; + ret[le16_to_cpu(ea->valuelen)] = 0; return ret; } - pos += ea->namelen + ea->valuelen + 5; + pos += ea->namelen + le16_to_cpu(ea->valuelen) + 5; } return NULL; } @@ -202,13 +202,13 @@ void hpfs_set_ea(struct inode *inode, struct fnode *fnode, const char *key, if (ea->indirect) { if (ea_len(ea) == size) set_indirect_ea(s, ea->anode, ea_sec(ea), data, size); - } else if (ea->valuelen == size) { + } else if (le16_to_cpu(ea->valuelen) == size) { memcpy(ea_data(ea), data, size); } return; } - a = fnode->ea_secno; - len = fnode->ea_size_l; + a = le32_to_cpu(fnode->ea_secno); + len = le32_to_cpu(fnode->ea_size_l); ano = fnode->ea_anode; pos = 0; while (pos < len) { @@ -228,41 +228,41 @@ void hpfs_set_ea(struct inode *inode, struct fnode *fnode, const char *key, set_indirect_ea(s, ea->anode, ea_sec(ea), data, size); } else { - if (ea->valuelen == size) + if (le16_to_cpu(ea->valuelen) == size) hpfs_ea_write(s, a, ano, pos + 4 + ea->namelen + 1, size, data); } return; } - pos += ea->namelen + ea->valuelen + 5; + pos += ea->namelen + le16_to_cpu(ea->valuelen) + 5; } - if (!fnode->ea_offs) { - /*if (fnode->ea_size_s) { + if (!le16_to_cpu(fnode->ea_offs)) { + /*if (le16_to_cpu(fnode->ea_size_s)) { hpfs_error(s, "fnode %08x: ea_size_s == %03x, ea_offs == 0", - inode->i_ino, fnode->ea_size_s); + inode->i_ino, le16_to_cpu(fnode->ea_size_s)); return; }*/ - fnode->ea_offs = 0xc4; + fnode->ea_offs = cpu_to_le16(0xc4); } - if (fnode->ea_offs < 0xc4 || fnode->ea_offs + fnode->acl_size_s + fnode->ea_size_s > 0x200) { + if (le16_to_cpu(fnode->ea_offs) < 0xc4 || le16_to_cpu(fnode->ea_offs) + le16_to_cpu(fnode->acl_size_s) + le16_to_cpu(fnode->ea_size_s) > 0x200) { hpfs_error(s, "fnode %08lx: ea_offs == %03x, ea_size_s == %03x", (unsigned long)inode->i_ino, - fnode->ea_offs, fnode->ea_size_s); + le32_to_cpu(fnode->ea_offs), le16_to_cpu(fnode->ea_size_s)); return; } - if ((fnode->ea_size_s || !fnode->ea_size_l) && - fnode->ea_offs + fnode->acl_size_s + fnode->ea_size_s + strlen(key) + size + 5 <= 0x200) { + if ((le16_to_cpu(fnode->ea_size_s) || !le32_to_cpu(fnode->ea_size_l)) && + le16_to_cpu(fnode->ea_offs) + le16_to_cpu(fnode->acl_size_s) + le16_to_cpu(fnode->ea_size_s) + strlen(key) + size + 5 <= 0x200) { ea = fnode_end_ea(fnode); *(char *)ea = 0; ea->namelen = strlen(key); - ea->valuelen = size; + ea->valuelen = cpu_to_le16(size); strcpy(ea->name, key); memcpy(ea_data(ea), data, size); - fnode->ea_size_s += strlen(key) + size + 5; + fnode->ea_size_s = cpu_to_le16(le16_to_cpu(fnode->ea_size_s) + strlen(key) + size + 5); goto ret; } /* Most the code here is 99.9993422% unused. I hope there are no bugs. But what .. HPFS.IFS has also bugs in ea management. */ - if (fnode->ea_size_s && !fnode->ea_size_l) { + if (le16_to_cpu(fnode->ea_size_s) && !le32_to_cpu(fnode->ea_size_l)) { secno n; struct buffer_head *bh; char *data; @@ -271,25 +271,26 @@ void hpfs_set_ea(struct inode *inode, struct fnode *fnode, const char *key, hpfs_free_sectors(s, n, 1); return; } - memcpy(data, fnode_ea(fnode), fnode->ea_size_s); - fnode->ea_size_l = fnode->ea_size_s; - fnode->ea_size_s = 0; - fnode->ea_secno = n; - fnode->ea_anode = 0; + memcpy(data, fnode_ea(fnode), le16_to_cpu(fnode->ea_size_s)); + fnode->ea_size_l = cpu_to_le32(le16_to_cpu(fnode->ea_size_s)); + fnode->ea_size_s = cpu_to_le16(0); + fnode->ea_secno = cpu_to_le32(n); + fnode->ea_anode = cpu_to_le32(0); mark_buffer_dirty(bh); brelse(bh); } - pos = fnode->ea_size_l + 5 + strlen(key) + size; - len = (fnode->ea_size_l + 511) >> 9; + pos = le32_to_cpu(fnode->ea_size_l) + 5 + strlen(key) + size; + len = (le32_to_cpu(fnode->ea_size_l) + 511) >> 9; if (pos >= 30000) goto bail; while (((pos + 511) >> 9) > len) { if (!len) { - if (!(fnode->ea_secno = hpfs_alloc_sector(s, fno, 1, 0))) - goto bail; + secno q = hpfs_alloc_sector(s, fno, 1, 0); + if (!q) goto bail; + fnode->ea_secno = cpu_to_le32(q); fnode->ea_anode = 0; len++; } else if (!fnode->ea_anode) { - if (hpfs_alloc_if_possible(s, fnode->ea_secno + len)) { + if (hpfs_alloc_if_possible(s, le32_to_cpu(fnode->ea_secno) + len)) { len++; } else { /* Aargh... don't know how to create ea anodes :-( */ @@ -298,18 +299,18 @@ void hpfs_set_ea(struct inode *inode, struct fnode *fnode, const char *key, anode_secno a_s; if (!(anode = hpfs_alloc_anode(s, fno, &a_s, &bh))) goto bail; - anode->up = fno; + anode->up = cpu_to_le32(fno); anode->btree.fnode_parent = 1; anode->btree.n_free_nodes--; anode->btree.n_used_nodes++; - anode->btree.first_free += 12; - anode->u.external[0].disk_secno = fnode->ea_secno; - anode->u.external[0].file_secno = 0; - anode->u.external[0].length = len; + anode->btree.first_free = cpu_to_le16(le16_to_cpu(anode->btree.first_free) + 12); + anode->u.external[0].disk_secno = cpu_to_le32(le32_to_cpu(fnode->ea_secno)); + anode->u.external[0].file_secno = cpu_to_le32(0); + anode->u.external[0].length = cpu_to_le32(len); mark_buffer_dirty(bh); brelse(bh); fnode->ea_anode = 1; - fnode->ea_secno = a_s;*/ + fnode->ea_secno = cpu_to_le32(a_s);*/ secno new_sec; int i; if (!(new_sec = hpfs_alloc_sector(s, fno, 1, 1 - ((pos + 511) >> 9)))) @@ -317,7 +318,7 @@ void hpfs_set_ea(struct inode *inode, struct fnode *fnode, const char *key, for (i = 0; i < len; i++) { struct buffer_head *bh1, *bh2; void *b1, *b2; - if (!(b1 = hpfs_map_sector(s, fnode->ea_secno + i, &bh1, len - i - 1))) { + if (!(b1 = hpfs_map_sector(s, le32_to_cpu(fnode->ea_secno) + i, &bh1, len - i - 1))) { hpfs_free_sectors(s, new_sec, (pos + 511) >> 9); goto bail; } @@ -331,13 +332,13 @@ void hpfs_set_ea(struct inode *inode, struct fnode *fnode, const char *key, mark_buffer_dirty(bh2); brelse(bh2); } - hpfs_free_sectors(s, fnode->ea_secno, len); - fnode->ea_secno = new_sec; + hpfs_free_sectors(s, le32_to_cpu(fnode->ea_secno), len); + fnode->ea_secno = cpu_to_le32(new_sec); len = (pos + 511) >> 9; } } if (fnode->ea_anode) { - if (hpfs_add_sector_to_btree(s, fnode->ea_secno, + if (hpfs_add_sector_to_btree(s, le32_to_cpu(fnode->ea_secno), 0, len) != -1) { len++; } else { @@ -349,17 +350,17 @@ void hpfs_set_ea(struct inode *inode, struct fnode *fnode, const char *key, h[1] = strlen(key); h[2] = size & 0xff; h[3] = size >> 8; - if (hpfs_ea_write(s, fnode->ea_secno, fnode->ea_anode, fnode->ea_size_l, 4, h)) goto bail; - if (hpfs_ea_write(s, fnode->ea_secno, fnode->ea_anode, fnode->ea_size_l + 4, h[1] + 1, key)) goto bail; - if (hpfs_ea_write(s, fnode->ea_secno, fnode->ea_anode, fnode->ea_size_l + 5 + h[1], size, data)) goto bail; - fnode->ea_size_l = pos; + if (hpfs_ea_write(s, le32_to_cpu(fnode->ea_secno), fnode->ea_anode, le32_to_cpu(fnode->ea_size_l), 4, h)) goto bail; + if (hpfs_ea_write(s, le32_to_cpu(fnode->ea_secno), fnode->ea_anode, le32_to_cpu(fnode->ea_size_l) + 4, h[1] + 1, key)) goto bail; + if (hpfs_ea_write(s, le32_to_cpu(fnode->ea_secno), fnode->ea_anode, le32_to_cpu(fnode->ea_size_l) + 5 + h[1], size, data)) goto bail; + fnode->ea_size_l = cpu_to_le32(pos); ret: hpfs_i(inode)->i_ea_size += 5 + strlen(key) + size; return; bail: - if (fnode->ea_secno) - if (fnode->ea_anode) hpfs_truncate_btree(s, fnode->ea_secno, 1, (fnode->ea_size_l + 511) >> 9); - else hpfs_free_sectors(s, fnode->ea_secno + ((fnode->ea_size_l + 511) >> 9), len - ((fnode->ea_size_l + 511) >> 9)); - else fnode->ea_secno = fnode->ea_size_l = 0; + if (le32_to_cpu(fnode->ea_secno)) + if (fnode->ea_anode) hpfs_truncate_btree(s, le32_to_cpu(fnode->ea_secno), 1, (le32_to_cpu(fnode->ea_size_l) + 511) >> 9); + else hpfs_free_sectors(s, le32_to_cpu(fnode->ea_secno) + ((le32_to_cpu(fnode->ea_size_l) + 511) >> 9), len - ((le32_to_cpu(fnode->ea_size_l) + 511) >> 9)); + else fnode->ea_secno = fnode->ea_size_l = cpu_to_le32(0); } diff --git a/fs/hpfs/hpfs.h b/fs/hpfs/hpfs.h index 8cd5130..91a6223 100644 --- a/fs/hpfs/hpfs.h +++ b/fs/hpfs/hpfs.h @@ -19,6 +19,10 @@ For definitive information on HPFS, ask somebody else -- this is guesswork. There are certain to be many mistakes. */ +#if !defined(__LITTLE_ENDIAN) && !defined(__BIG_ENDIAN) +#error unknown endian +#endif + /* Notation */ typedef u32 secno; /* sector number, partition relative */ @@ -86,7 +90,6 @@ struct hpfs_super_block secno badblocks; /* bad block list */ u32 zero3; /* 0 */ time32_t last_chkdsk; /* date last checked, 0 if never */ - /*u32 zero4;*/ /* 0 */ time32_t last_optimize; /* date last optimized, 0 if never */ secno n_dir_band; /* number of sectors in dir band */ secno dir_band_start; /* first sector in dir band */ @@ -109,21 +112,44 @@ struct hpfs_spare_block u32 magic; /* f991 1849 */ u32 magic1; /* fa52 29c5, more magic? */ - unsigned dirty: 1; /* 0 clean, 1 "improperly stopped" */ - unsigned sparedir_used: 1; /* spare dirblks used */ - unsigned hotfixes_used: 1; /* hotfixes used */ - unsigned bad_sector: 1; /* bad sector, corrupted disk (???) */ - unsigned bad_bitmap: 1; /* bad bitmap */ - unsigned fast: 1; /* partition was fast formatted */ - unsigned old_wrote: 1; /* old version wrote to partion */ - unsigned old_wrote_1: 1; /* old version wrote to partion (?) */ - unsigned install_dasd_limits: 1; /* HPFS386 flags */ - unsigned resynch_dasd_limits: 1; - unsigned dasd_limits_operational: 1; - unsigned multimedia_active: 1; - unsigned dce_acls_active: 1; - unsigned dasd_limits_dirty: 1; - unsigned flag67: 2; +#ifdef __LITTLE_ENDIAN + u8 dirty: 1; /* 0 clean, 1 "improperly stopped" */ + u8 sparedir_used: 1; /* spare dirblks used */ + u8 hotfixes_used: 1; /* hotfixes used */ + u8 bad_sector: 1; /* bad sector, corrupted disk (???) */ + u8 bad_bitmap: 1; /* bad bitmap */ + u8 fast: 1; /* partition was fast formatted */ + u8 old_wrote: 1; /* old version wrote to partion */ + u8 old_wrote_1: 1; /* old version wrote to partion (?) */ +#else + u8 old_wrote_1: 1; /* old version wrote to partion (?) */ + u8 old_wrote: 1; /* old version wrote to partion */ + u8 fast: 1; /* partition was fast formatted */ + u8 bad_bitmap: 1; /* bad bitmap */ + u8 bad_sector: 1; /* bad sector, corrupted disk (???) */ + u8 hotfixes_used: 1; /* hotfixes used */ + u8 sparedir_used: 1; /* spare dirblks used */ + u8 dirty: 1; /* 0 clean, 1 "improperly stopped" */ +#endif + +#ifdef __LITTLE_ENDIAN + u8 install_dasd_limits: 1; /* HPFS386 flags */ + u8 resynch_dasd_limits: 1; + u8 dasd_limits_operational: 1; + u8 multimedia_active: 1; + u8 dce_acls_active: 1; + u8 dasd_limits_dirty: 1; + u8 flag67: 2; +#else + u8 flag67: 2; + u8 dasd_limits_dirty: 1; + u8 dce_acls_active: 1; + u8 multimedia_active: 1; + u8 dasd_limits_operational: 1; + u8 resynch_dasd_limits: 1; + u8 install_dasd_limits: 1; /* HPFS386 flags */ +#endif + u8 mm_contlgulty; u8 unused; @@ -255,10 +281,18 @@ struct dnode { u32 magic; /* 77e4 0aae */ u32 first_free; /* offset from start of dnode to first free dir entry */ - unsigned root_dnode:1; /* Is it root dnode? */ - unsigned increment_me:31; /* some kind of activity counter? - Neither HPFS.IFS nor CHKDSK cares +#ifdef __LITTLE_ENDIAN + u8 root_dnode: 1; /* Is it root dnode? */ + u8 increment_me: 7; /* some kind of activity counter? */ + /* Neither HPFS.IFS nor CHKDSK cares if you change this word */ +#else + u8 increment_me: 7; /* some kind of activity counter? */ + /* Neither HPFS.IFS nor CHKDSK cares + if you change this word */ + u8 root_dnode: 1; /* Is it root dnode? */ +#endif + u8 increment_me2[3]; secno up; /* (root dnode) directory's fnode (nonroot) parent dnode */ dnode_secno self; /* pointer to this dnode */ @@ -266,33 +300,59 @@ struct dnode { }; struct hpfs_dirent { - u16 length; /* offset to next dirent */ - unsigned first: 1; /* set on phony ^A^A (".") entry */ - unsigned has_acl: 1; - unsigned down: 1; /* down pointer present (after name) */ - unsigned last: 1; /* set on phony \377 entry */ - unsigned has_ea: 1; /* entry has EA */ - unsigned has_xtd_perm: 1; /* has extended perm list (???) */ - unsigned has_explicit_acl: 1; - unsigned has_needea: 1; /* ?? some EA has NEEDEA set + u16 length; /* offset to next dirent */ + +#ifdef __LITTLE_ENDIAN + u8 first: 1; /* set on phony ^A^A (".") entry */ + u8 has_acl: 1; + u8 down: 1; /* down pointer present (after name) */ + u8 last: 1; /* set on phony \377 entry */ + u8 has_ea: 1; /* entry has EA */ + u8 has_xtd_perm: 1; /* has extended perm list (???) */ + u8 has_explicit_acl: 1; + u8 has_needea: 1; /* ?? some EA has NEEDEA set + I have no idea why this is + interesting in a dir entry */ +#else + u8 has_needea: 1; /* ?? some EA has NEEDEA set I have no idea why this is interesting in a dir entry */ - unsigned read_only: 1; /* dos attrib */ - unsigned hidden: 1; /* dos attrib */ - unsigned system: 1; /* dos attrib */ - unsigned flag11: 1; /* would be volume label dos attrib */ - unsigned directory: 1; /* dos attrib */ - unsigned archive: 1; /* dos attrib */ - unsigned not_8x3: 1; /* name is not 8.3 */ - unsigned flag15: 1; + u8 has_explicit_acl: 1; + u8 has_xtd_perm: 1; /* has extended perm list (???) */ + u8 has_ea: 1; /* entry has EA */ + u8 last: 1; /* set on phony \377 entry */ + u8 down: 1; /* down pointer present (after name) */ + u8 has_acl: 1; + u8 first: 1; /* set on phony ^A^A (".") entry */ +#endif + +#ifdef __LITTLE_ENDIAN + u8 read_only: 1; /* dos attrib */ + u8 hidden: 1; /* dos attrib */ + u8 system: 1; /* dos attrib */ + u8 flag11: 1; /* would be volume label dos attrib */ + u8 directory: 1; /* dos attrib */ + u8 archive: 1; /* dos attrib */ + u8 not_8x3: 1; /* name is not 8.3 */ + u8 flag15: 1; +#else + u8 flag15: 1; + u8 not_8x3: 1; /* name is not 8.3 */ + u8 archive: 1; /* dos attrib */ + u8 directory: 1; /* dos attrib */ + u8 flag11: 1; /* would be volume label dos attrib */ + u8 system: 1; /* dos attrib */ + u8 hidden: 1; /* dos attrib */ + u8 read_only: 1; /* dos attrib */ +#endif + fnode_secno fnode; /* fnode giving allocation info */ time32_t write_date; /* mtime */ u32 file_size; /* file length, bytes */ time32_t read_date; /* atime */ time32_t creation_date; /* ctime */ u32 ea_size; /* total EA length, bytes */ - unsigned char no_of_acls : 3; /* number of ACL's */ - unsigned char reserver : 5; + u8 no_of_acls; /* number of ACL's (low 3 bits) */ u8 ix; /* code page index (of filename), see struct code_page_data */ u8 namelen, name[1]; /* file name */ @@ -328,21 +388,33 @@ struct bplus_internal_node struct bplus_header { - unsigned hbff: 1; /* high bit of first free entry offset */ - unsigned flag1: 1; - unsigned flag2: 1; - unsigned flag3: 1; - unsigned flag4: 1; - unsigned fnode_parent: 1; /* ? we're pointed to by an fnode, +#ifdef __LITTLE_ENDIAN + u8 hbff: 1; /* high bit of first free entry offset */ + u8 flag1234: 4; + u8 fnode_parent: 1; /* ? we're pointed to by an fnode, the data btree or some ea or the main ea bootage pointer ea_secno */ /* also can get set in fnodes, which may be a chkdsk glitch or may mean this bit is irrelevant in fnodes, or this interpretation is all wet */ - unsigned binary_search: 1; /* suggest binary search (unused) */ - unsigned internal: 1; /* 1 -> (internal) tree of anodes + u8 binary_search: 1; /* suggest binary search (unused) */ + u8 internal: 1; /* 1 -> (internal) tree of anodes 0 -> (leaf) list of extents */ +#else + u8 internal: 1; /* 1 -> (internal) tree of anodes + 0 -> (leaf) list of extents */ + u8 binary_search: 1; /* suggest binary search (unused) */ + u8 fnode_parent: 1; /* ? we're pointed to by an fnode, + the data btree or some ea or the + main ea bootage pointer ea_secno */ + /* also can get set in fnodes, which + may be a chkdsk glitch or may mean + this bit is irrelevant in fnodes, + or this interpretation is all wet */ + u8 flag1234: 4; + u8 hbff: 1; /* high bit of first free entry offset */ +#endif u8 fill[3]; u8 n_free_nodes; /* free nodes in following array */ u8 n_used_nodes; /* used nodes in following array */ @@ -379,23 +451,25 @@ struct fnode secno ea_secno; /* first sector of disk-resident ea's*/ u16 ea_size_s; /* length of fnode-resident ea's */ - unsigned flag0: 1; - unsigned ea_anode: 1; /* 1 -> ea_secno is an anode */ - unsigned flag2: 1; - unsigned flag3: 1; - unsigned flag4: 1; - unsigned flag5: 1; - unsigned flag6: 1; - unsigned flag7: 1; - unsigned dirflag: 1; /* 1 -> directory. first & only extent +#ifdef __LITTLE_ENDIAN + u8 flag0: 1; + u8 ea_anode: 1; /* 1 -> ea_secno is an anode */ + u8 flag234567: 6; +#else + u8 flag234567: 6; + u8 ea_anode: 1; /* 1 -> ea_secno is an anode */ + u8 flag0: 1; +#endif + +#ifdef __LITTLE_ENDIAN + u8 dirflag: 1; /* 1 -> directory. first & only extent + points to dnode. */ + u8 flag9012345: 7; +#else + u8 flag9012345: 7; + u8 dirflag: 1; /* 1 -> directory. first & only extent points to dnode. */ - unsigned flag9: 1; - unsigned flag10: 1; - unsigned flag11: 1; - unsigned flag12: 1; - unsigned flag13: 1; - unsigned flag14: 1; - unsigned flag15: 1; +#endif struct bplus_header btree; /* b+ tree, 8 extents or 12 subtrees */ union { @@ -456,16 +530,21 @@ struct anode struct extended_attribute { - unsigned indirect: 1; /* 1 -> value gives sector number +#ifdef __LITTLE_ENDIAN + u8 indirect: 1; /* 1 -> value gives sector number where real value starts */ - unsigned anode: 1; /* 1 -> sector is an anode + u8 anode: 1; /* 1 -> sector is an anode that points to fragmented value */ - unsigned flag2: 1; - unsigned flag3: 1; - unsigned flag4: 1; - unsigned flag5: 1; - unsigned flag6: 1; - unsigned needea: 1; /* required ea */ + u8 flag23456: 5; + u8 needea: 1; /* required ea */ +#else + u8 needea: 1; /* required ea */ + u8 flag23456: 5; + u8 anode: 1; /* 1 -> sector is an anode + that points to fragmented value */ + u8 indirect: 1; /* 1 -> value gives sector number + where real value starts */ +#endif u8 namelen; /* length of name, bytes */ u16 valuelen; /* length of value, bytes */ u8 name[0]; diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h index d101086..f993773 100644 --- a/fs/hpfs/hpfs_fn.h +++ b/fs/hpfs/hpfs_fn.h @@ -84,7 +84,6 @@ struct hpfs_sb_info { unsigned *sb_bmp_dir; /* main bitmap directory */ unsigned sb_c_bitmap; /* current bitmap */ unsigned sb_max_fwd_alloc; /* max forwad allocation */ - /*unsigned sb_mounting : 1;*/ int sb_timeshift; }; @@ -100,7 +99,7 @@ struct quad_buffer_head { static inline dnode_secno de_down_pointer (struct hpfs_dirent *de) { CHKCOND(de->down,("HPFS: de_down_pointer: !de->down\n")); - return *(dnode_secno *) ((void *) de + de->length - 4); + return le32_to_cpu(*(dnode_secno *) ((void *) de + le16_to_cpu(de->length) - 4)); } /* The first dir entry in a dnode */ @@ -114,41 +113,41 @@ static inline struct hpfs_dirent *dnode_first_de (struct dnode *dnode) static inline struct hpfs_dirent *dnode_end_de (struct dnode *dnode) { - CHKCOND(dnode->first_free>=0x14 && dnode->first_free<=0xa00,("HPFS: dnode_end_de: dnode->first_free = %d\n",(int)dnode->first_free)); - return (void *) dnode + dnode->first_free; + CHKCOND(le32_to_cpu(dnode->first_free)>=0x14 && le32_to_cpu(dnode->first_free)<=0xa00,("HPFS: dnode_end_de: dnode->first_free = %x\n",(unsigned)le32_to_cpu(dnode->first_free))); + return (void *) dnode + le32_to_cpu(dnode->first_free); } /* The dir entry after dir entry de */ static inline struct hpfs_dirent *de_next_de (struct hpfs_dirent *de) { - CHKCOND(de->length>=0x20 && de->length<0x800,("HPFS: de_next_de: de->length = %d\n",(int)de->length)); - return (void *) de + de->length; + CHKCOND(le16_to_cpu(de->length)>=0x20 && le16_to_cpu(de->length)<0x800,("HPFS: de_next_de: de->length = %x\n",(unsigned)le16_to_cpu(de->length))); + return (void *) de + le16_to_cpu(de->length); } static inline struct extended_attribute *fnode_ea(struct fnode *fnode) { - return (struct extended_attribute *)((char *)fnode + fnode->ea_offs + fnode->acl_size_s); + return (struct extended_attribute *)((char *)fnode + le16_to_cpu(fnode->ea_offs) + le16_to_cpu(fnode->acl_size_s)); } static inline struct extended_attribute *fnode_end_ea(struct fnode *fnode) { - return (struct extended_attribute *)((char *)fnode + fnode->ea_offs + fnode->acl_size_s + fnode->ea_size_s); + return (struct extended_attribute *)((char *)fnode + le16_to_cpu(fnode->ea_offs) + le16_to_cpu(fnode->acl_size_s) + le16_to_cpu(fnode->ea_size_s)); } static inline struct extended_attribute *next_ea(struct extended_attribute *ea) { - return (struct extended_attribute *)((char *)ea + 5 + ea->namelen + ea->valuelen); + return (struct extended_attribute *)((char *)ea + 5 + ea->namelen + le16_to_cpu(ea->valuelen)); } static inline secno ea_sec(struct extended_attribute *ea) { - return *(secno *)((char *)ea + 9 + ea->namelen); + return le32_to_cpu(*((secno *)((char *)ea + 9 + ea->namelen))); } static inline secno ea_len(struct extended_attribute *ea) { - return *(secno *)((char *)ea + 5 + ea->namelen); + return le32_to_cpu(*((secno *)((char *)ea + 5 + ea->namelen))); } static inline char *ea_data(struct extended_attribute *ea) @@ -173,13 +172,13 @@ static inline void copy_de(struct hpfs_dirent *dst, struct hpfs_dirent *src) dst->not_8x3 = n; } -static inline unsigned tstbits(unsigned *bmp, unsigned b, unsigned n) +static inline unsigned tstbits(u32 *bmp, unsigned b, unsigned n) { int i; if ((b >= 0x4000) || (b + n - 1 >= 0x4000)) return n; - if (!((bmp[(b & 0x3fff) >> 5] >> (b & 0x1f)) & 1)) return 1; + if (!((le32_to_cpu(bmp[(b & 0x3fff) >> 5]) >> (b & 0x1f)) & 1)) return 1; for (i = 1; i < n; i++) - if (/*b+i < 0x4000 &&*/ !((bmp[((b+i) & 0x3fff) >> 5] >> ((b+i) & 0x1f)) & 1)) + if (!((le32_to_cpu(bmp[((b+i) & 0x3fff) >> 5]) >> ((b+i) & 0x1f)) & 1)) return i + 1; return 0; } diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c index bc61bb4..338cd83 100644 --- a/fs/hpfs/inode.c +++ b/fs/hpfs/inode.c @@ -115,8 +115,8 @@ void hpfs_read_inode(struct inode *i) i->i_mode |= S_IFDIR; i->i_op = &hpfs_dir_iops; i->i_fop = &hpfs_dir_ops; - hpfs_inode->i_parent_dir = fnode->up; - hpfs_inode->i_dno = fnode->u.external[0].disk_secno; + hpfs_inode->i_parent_dir = le32_to_cpu(fnode->up); + hpfs_inode->i_dno = le32_to_cpu(fnode->u.external[0].disk_secno); if (hpfs_sb(sb)->sb_chk >= 2) { struct buffer_head *bh0; if (hpfs_map_fnode(sb, hpfs_inode->i_parent_dir, &bh0)) brelse(bh0); @@ -132,7 +132,7 @@ void hpfs_read_inode(struct inode *i) i->i_op = &hpfs_file_iops; i->i_fop = &hpfs_file_ops; i->i_nlink = 1; - i->i_size = fnode->file_size; + i->i_size = le32_to_cpu(fnode->file_size); i->i_blocks = ((i->i_size + 511) >> 9) + 1; i->i_data.a_ops = &hpfs_aops; hpfs_i(i)->mmu_private = i->i_size; @@ -143,7 +143,7 @@ void hpfs_read_inode(struct inode *i) static void hpfs_write_inode_ea(struct inode *i, struct fnode *fnode) { struct hpfs_inode_info *hpfs_inode = hpfs_i(i); - /*if (fnode->acl_size_l || fnode->acl_size_s) { + /*if (le32_to_cpu(fnode->acl_size_l) || le16_to_cpu(fnode->acl_size_s)) { Some unknown structures like ACL may be in fnode, we'd better not overwrite them hpfs_error(i->i_sb, "fnode %08x has some unknown HPFS386 stuctures", i->i_ino); @@ -218,30 +218,30 @@ void hpfs_write_inode_nolock(struct inode *i) } } else de = NULL; if (S_ISREG(i->i_mode)) { - fnode->file_size = i->i_size; - if (de) de->file_size = i->i_size; + fnode->file_size = cpu_to_le32(i->i_size); + if (de) de->file_size = cpu_to_le32(i->i_size); } else if (S_ISDIR(i->i_mode)) { - fnode->file_size = 0; - if (de) de->file_size = 0; + fnode->file_size = cpu_to_le32(0); + if (de) de->file_size = cpu_to_le32(0); } hpfs_write_inode_ea(i, fnode); if (de) { - de->write_date = gmt_to_local(i->i_sb, i->i_mtime.tv_sec); - de->read_date = gmt_to_local(i->i_sb, i->i_atime.tv_sec); - de->creation_date = gmt_to_local(i->i_sb, i->i_ctime.tv_sec); + de->write_date = cpu_to_le32(gmt_to_local(i->i_sb, i->i_mtime.tv_sec)); + de->read_date = cpu_to_le32(gmt_to_local(i->i_sb, i->i_atime.tv_sec)); + de->creation_date = cpu_to_le32(gmt_to_local(i->i_sb, i->i_ctime.tv_sec)); de->read_only = !(i->i_mode & 0222); - de->ea_size = hpfs_inode->i_ea_size; + de->ea_size = cpu_to_le32(hpfs_inode->i_ea_size); hpfs_mark_4buffers_dirty(&qbh); hpfs_brelse4(&qbh); } if (S_ISDIR(i->i_mode)) { if ((de = map_dirent(i, hpfs_inode->i_dno, "\001\001", 2, NULL, &qbh))) { - de->write_date = gmt_to_local(i->i_sb, i->i_mtime.tv_sec); - de->read_date = gmt_to_local(i->i_sb, i->i_atime.tv_sec); - de->creation_date = gmt_to_local(i->i_sb, i->i_ctime.tv_sec); + de->write_date = cpu_to_le32(gmt_to_local(i->i_sb, i->i_mtime.tv_sec)); + de->read_date = cpu_to_le32(gmt_to_local(i->i_sb, i->i_atime.tv_sec)); + de->creation_date = cpu_to_le32(gmt_to_local(i->i_sb, i->i_ctime.tv_sec)); de->read_only = !(i->i_mode & 0222); - de->ea_size = /*hpfs_inode->i_ea_size*/0; - de->file_size = 0; + de->ea_size = cpu_to_le32(/*hpfs_inode->i_ea_size*/0); + de->file_size = cpu_to_le32(0); hpfs_mark_4buffers_dirty(&qbh); hpfs_brelse4(&qbh); } else diff --git a/fs/hpfs/map.c b/fs/hpfs/map.c index 840d033..a790821 100644 --- a/fs/hpfs/map.c +++ b/fs/hpfs/map.c @@ -21,7 +21,7 @@ unsigned int *hpfs_map_bitmap(struct super_block *s, unsigned bmp_block, hpfs_error(s, "hpfs_map_bitmap called with bad parameter: %08x at %s", bmp_block, id); return NULL; } - sec = hpfs_sb(s)->sb_bmp_dir[bmp_block]; + sec = le32_to_cpu(hpfs_sb(s)->sb_bmp_dir[bmp_block]); if (!sec || sec > hpfs_sb(s)->sb_fs_size-4) { hpfs_error(s, "invalid bitmap block pointer %08x -> %08x at %s", bmp_block, sec, id); return NULL; @@ -46,18 +46,18 @@ unsigned char *hpfs_load_code_page(struct super_block *s, secno cps) struct code_page_data *cpd; struct code_page_directory *cp = hpfs_map_sector(s, cps, &bh, 0); if (!cp) return NULL; - if (cp->magic != CP_DIR_MAGIC) { - printk("HPFS: Code page directory magic doesn't match (magic = %08x)\n", cp->magic); + if (le32_to_cpu(cp->magic) != CP_DIR_MAGIC) { + printk("HPFS: Code page directory magic doesn't match (magic = %08x)\n", le32_to_cpu(cp->magic)); brelse(bh); return NULL; } - if (!cp->n_code_pages) { + if (!le32_to_cpu(cp->n_code_pages)) { printk("HPFS: n_code_pages == 0\n"); brelse(bh); return NULL; } - cpds = cp->array[0].code_page_data; - cpi = cp->array[0].index; + cpds = le32_to_cpu(cp->array[0].code_page_data); + cpi = le16_to_cpu(cp->array[0].index); brelse(bh); if (cpi >= 3) { @@ -66,12 +66,12 @@ unsigned char *hpfs_load_code_page(struct super_block *s, secno cps) } if (!(cpd = hpfs_map_sector(s, cpds, &bh, 0))) return NULL; - if ((unsigned)cpd->offs[cpi] > 0x178) { + if (le16_to_cpu(cpd->offs[cpi]) > 0x178) { printk("HPFS: Code page index out of sector\n"); brelse(bh); return NULL; } - ptr = (unsigned char *)cpd + cpd->offs[cpi] + 6; + ptr = (unsigned char *)cpd + le16_to_cpu(cpd->offs[cpi]) + 6; if (!(cp_table = kmalloc(256, GFP_KERNEL))) { printk("HPFS: out of memory for code page table\n"); brelse(bh); @@ -125,7 +125,7 @@ struct fnode *hpfs_map_fnode(struct super_block *s, ino_t ino, struct buffer_hea if (hpfs_sb(s)->sb_chk) { struct extended_attribute *ea; struct extended_attribute *ea_end; - if (fnode->magic != FNODE_MAGIC) { + if (le32_to_cpu(fnode->magic) != FNODE_MAGIC) { hpfs_error(s, "bad magic on fnode %08lx", (unsigned long)ino); goto bail; @@ -138,7 +138,7 @@ struct fnode *hpfs_map_fnode(struct super_block *s, ino_t ino, struct buffer_hea (unsigned long)ino); goto bail; } - if (fnode->btree.first_free != + if (le16_to_cpu(fnode->btree.first_free) != 8 + fnode->btree.n_used_nodes * (fnode->btree.internal ? 8 : 12)) { hpfs_error(s, "bad first_free pointer in fnode %08lx", @@ -146,12 +146,12 @@ struct fnode *hpfs_map_fnode(struct super_block *s, ino_t ino, struct buffer_hea goto bail; } } - if (fnode->ea_size_s && ((signed int)fnode->ea_offs < 0xc4 || - (signed int)fnode->ea_offs + fnode->acl_size_s + fnode->ea_size_s > 0x200)) { + if (le16_to_cpu(fnode->ea_size_s) && (le16_to_cpu(fnode->ea_offs) < 0xc4 || + le16_to_cpu(fnode->ea_offs) + le16_to_cpu(fnode->acl_size_s) + le16_to_cpu(fnode->ea_size_s) > 0x200)) { hpfs_error(s, "bad EA info in fnode %08lx: ea_offs == %04x ea_size_s == %04x", (unsigned long)ino, - fnode->ea_offs, fnode->ea_size_s); + le16_to_cpu(fnode->ea_offs), le16_to_cpu(fnode->ea_size_s)); goto bail; } ea = fnode_ea(fnode); @@ -178,16 +178,20 @@ struct anode *hpfs_map_anode(struct super_block *s, anode_secno ano, struct buff if (hpfs_sb(s)->sb_chk) if (hpfs_chk_sectors(s, ano, 1, "anode")) return NULL; if ((anode = hpfs_map_sector(s, ano, bhp, ANODE_RD_AHEAD))) if (hpfs_sb(s)->sb_chk) { - if (anode->magic != ANODE_MAGIC || anode->self != ano) { + if (le32_to_cpu(anode->magic) != ANODE_MAGIC) { hpfs_error(s, "bad magic on anode %08x", ano); goto bail; } + if (le32_to_cpu(anode->self) != ano) { + hpfs_error(s, "self pointer invalid on anode %08x", ano); + goto bail; + } if ((unsigned)anode->btree.n_used_nodes + (unsigned)anode->btree.n_free_nodes != (anode->btree.internal ? 60 : 40)) { hpfs_error(s, "bad number of nodes in anode %08x", ano); goto bail; } - if (anode->btree.first_free != + if (le16_to_cpu(anode->btree.first_free) != 8 + anode->btree.n_used_nodes * (anode->btree.internal ? 8 : 12)) { hpfs_error(s, "bad first_free pointer in anode %08x", ano); goto bail; @@ -219,26 +223,26 @@ struct dnode *hpfs_map_dnode(struct super_block *s, unsigned secno, unsigned p, pp = 0; unsigned char *d = (unsigned char *)dnode; int b = 0; - if (dnode->magic != DNODE_MAGIC) { + if (le32_to_cpu(dnode->magic) != DNODE_MAGIC) { hpfs_error(s, "bad magic on dnode %08x", secno); goto bail; } - if (dnode->self != secno) - hpfs_error(s, "bad self pointer on dnode %08x self = %08x", secno, dnode->self); + if (le32_to_cpu(dnode->self) != secno) + hpfs_error(s, "bad self pointer on dnode %08x self = %08x", secno, le32_to_cpu(dnode->self)); /* Check dirents - bad dirents would cause infinite loops or shooting to memory */ - if (dnode->first_free > 2048/* || dnode->first_free < 84*/) { - hpfs_error(s, "dnode %08x has first_free == %08x", secno, dnode->first_free); + if (le32_to_cpu(dnode->first_free) > 2048) { + hpfs_error(s, "dnode %08x has first_free == %08x", secno, le32_to_cpu(dnode->first_free)); goto bail; } - for (p = 20; p < dnode->first_free; p += d[p] + (d[p+1] << 8)) { + for (p = 20; p < le32_to_cpu(dnode->first_free); p += d[p] + (d[p+1] << 8)) { struct hpfs_dirent *de = (struct hpfs_dirent *)((char *)dnode + p); - if (de->length > 292 || (de->length < 32) || (de->length & 3) || p + de->length > 2048) { + if (le16_to_cpu(de->length) > 292 || (le16_to_cpu(de->length) < 32) || (le16_to_cpu(de->length) & 3) || p + le16_to_cpu(de->length) > 2048) { hpfs_error(s, "bad dirent size in dnode %08x, dirent %03x, last %03x", secno, p, pp); goto bail; } - if (((31 + de->namelen + de->down*4 + 3) & ~3) != de->length) { - if (((31 + de->namelen + de->down*4 + 3) & ~3) < de->length && s->s_flags & MS_RDONLY) goto ok; + if (((31 + de->namelen + de->down*4 + 3) & ~3) != le16_to_cpu(de->length)) { + if (((31 + de->namelen + de->down*4 + 3) & ~3) < le16_to_cpu(de->length) && s->s_flags & MS_RDONLY) goto ok; hpfs_error(s, "namelen does not match dirent size in dnode %08x, dirent %03x, last %03x", secno, p, pp); goto bail; } @@ -251,7 +255,7 @@ struct dnode *hpfs_map_dnode(struct super_block *s, unsigned secno, pp = p; } - if (p != dnode->first_free) { + if (p != le32_to_cpu(dnode->first_free)) { hpfs_error(s, "size on last dirent does not match first_free; dnode %08x", secno); goto bail; } @@ -277,7 +281,7 @@ dnode_secno hpfs_fnode_dno(struct super_block *s, ino_t ino) if (!fnode) return 0; - dno = fnode->u.external[0].disk_secno; + dno = le32_to_cpu(fnode->u.external[0].disk_secno); brelse(bh); return dno; } diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c index 9c66f0e..5a8de6a 100644 --- a/fs/hpfs/namei.c +++ b/fs/hpfs/namei.c @@ -37,8 +37,8 @@ static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) if (!(mode & 0222)) dee.read_only = 1; /*dee.archive = 0;*/ dee.hidden = name[0] == '.'; - dee.fnode = fno; - dee.creation_date = dee.write_date = dee.read_date = gmt_to_local(dir->i_sb, get_seconds()); + dee.fnode = cpu_to_le32(fno); + dee.creation_date = dee.write_date = dee.read_date = cpu_to_le32(gmt_to_local(dir->i_sb, get_seconds())); result = new_inode(dir->i_sb); if (!result) goto bail2; @@ -46,7 +46,7 @@ static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) result->i_ino = fno; hpfs_i(result)->i_parent_dir = dir->i_ino; hpfs_i(result)->i_dno = dno; - result->i_ctime.tv_sec = result->i_mtime.tv_sec = result->i_atime.tv_sec = local_to_gmt(dir->i_sb, dee.creation_date); + result->i_ctime.tv_sec = result->i_mtime.tv_sec = result->i_atime.tv_sec = local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date)); result->i_ctime.tv_nsec = 0; result->i_mtime.tv_nsec = 0; result->i_atime.tv_nsec = 0; @@ -69,21 +69,21 @@ static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) } fnode->len = len; memcpy(fnode->name, name, len > 15 ? 15 : len); - fnode->up = dir->i_ino; + fnode->up = cpu_to_le32(dir->i_ino); fnode->dirflag = 1; fnode->btree.n_free_nodes = 7; fnode->btree.n_used_nodes = 1; - fnode->btree.first_free = 0x14; - fnode->u.external[0].disk_secno = dno; - fnode->u.external[0].file_secno = -1; + fnode->btree.first_free = cpu_to_le16(0x14); + fnode->u.external[0].disk_secno = cpu_to_le32(dno); + fnode->u.external[0].file_secno = cpu_to_le32(-1); dnode->root_dnode = 1; - dnode->up = fno; + dnode->up = cpu_to_le32(fno); de = hpfs_add_de(dir->i_sb, dnode, "\001\001", 2, 0); - de->creation_date = de->write_date = de->read_date = gmt_to_local(dir->i_sb, get_seconds()); + de->creation_date = de->write_date = de->read_date = cpu_to_le32(gmt_to_local(dir->i_sb, get_seconds())); if (!(mode & 0222)) de->read_only = 1; de->first = de->directory = 1; /*de->hidden = de->system = 0;*/ - de->fnode = fno; + de->fnode = cpu_to_le32(fno); mark_buffer_dirty(bh); brelse(bh); hpfs_mark_4buffers_dirty(&qbh0); @@ -137,8 +137,8 @@ static int hpfs_create(struct inode *dir, struct dentry *dentry, int mode, struc if (!(mode & 0222)) dee.read_only = 1; dee.archive = 1; dee.hidden = name[0] == '.'; - dee.fnode = fno; - dee.creation_date = dee.write_date = dee.read_date = gmt_to_local(dir->i_sb, get_seconds()); + dee.fnode = cpu_to_le32(fno); + dee.creation_date = dee.write_date = dee.read_date = cpu_to_le32(gmt_to_local(dir->i_sb, get_seconds())); result = new_inode(dir->i_sb); if (!result) @@ -152,7 +152,7 @@ static int hpfs_create(struct inode *dir, struct dentry *dentry, int mode, struc result->i_fop = &hpfs_file_ops; result->i_nlink = 1; hpfs_i(result)->i_parent_dir = dir->i_ino; - result->i_ctime.tv_sec = result->i_mtime.tv_sec = result->i_atime.tv_sec = local_to_gmt(dir->i_sb, dee.creation_date); + result->i_ctime.tv_sec = result->i_mtime.tv_sec = result->i_atime.tv_sec = local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date)); result->i_ctime.tv_nsec = 0; result->i_mtime.tv_nsec = 0; result->i_atime.tv_nsec = 0; @@ -173,7 +173,7 @@ static int hpfs_create(struct inode *dir, struct dentry *dentry, int mode, struc } fnode->len = len; memcpy(fnode->name, name, len > 15 ? 15 : len); - fnode->up = dir->i_ino; + fnode->up = cpu_to_le32(dir->i_ino); mark_buffer_dirty(bh); brelse(bh); @@ -225,8 +225,8 @@ static int hpfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t if (!(mode & 0222)) dee.read_only = 1; dee.archive = 1; dee.hidden = name[0] == '.'; - dee.fnode = fno; - dee.creation_date = dee.write_date = dee.read_date = gmt_to_local(dir->i_sb, get_seconds()); + dee.fnode = cpu_to_le32(fno); + dee.creation_date = dee.write_date = dee.read_date = cpu_to_le32(gmt_to_local(dir->i_sb, get_seconds())); result = new_inode(dir->i_sb); if (!result) @@ -235,7 +235,7 @@ static int hpfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t hpfs_init_inode(result); result->i_ino = fno; hpfs_i(result)->i_parent_dir = dir->i_ino; - result->i_ctime.tv_sec = result->i_mtime.tv_sec = result->i_atime.tv_sec = local_to_gmt(dir->i_sb, dee.creation_date); + result->i_ctime.tv_sec = result->i_mtime.tv_sec = result->i_atime.tv_sec = local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date)); result->i_ctime.tv_nsec = 0; result->i_mtime.tv_nsec = 0; result->i_atime.tv_nsec = 0; @@ -256,7 +256,7 @@ static int hpfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t } fnode->len = len; memcpy(fnode->name, name, len > 15 ? 15 : len); - fnode->up = dir->i_ino; + fnode->up = cpu_to_le32(dir->i_ino); mark_buffer_dirty(bh); insert_inode_hash(result); @@ -300,8 +300,8 @@ static int hpfs_symlink(struct inode *dir, struct dentry *dentry, const char *sy memset(&dee, 0, sizeof dee); dee.archive = 1; dee.hidden = name[0] == '.'; - dee.fnode = fno; - dee.creation_date = dee.write_date = dee.read_date = gmt_to_local(dir->i_sb, get_seconds()); + dee.fnode = cpu_to_le32(fno); + dee.creation_date = dee.write_date = dee.read_date = cpu_to_le32(gmt_to_local(dir->i_sb, get_seconds())); result = new_inode(dir->i_sb); if (!result) @@ -309,7 +309,7 @@ static int hpfs_symlink(struct inode *dir, struct dentry *dentry, const char *sy result->i_ino = fno; hpfs_init_inode(result); hpfs_i(result)->i_parent_dir = dir->i_ino; - result->i_ctime.tv_sec = result->i_mtime.tv_sec = result->i_atime.tv_sec = local_to_gmt(dir->i_sb, dee.creation_date); + result->i_ctime.tv_sec = result->i_mtime.tv_sec = result->i_atime.tv_sec = local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date)); result->i_ctime.tv_nsec = 0; result->i_mtime.tv_nsec = 0; result->i_atime.tv_nsec = 0; @@ -332,7 +332,7 @@ static int hpfs_symlink(struct inode *dir, struct dentry *dentry, const char *sy } fnode->len = len; memcpy(fnode->name, name, len > 15 ? 15 : len); - fnode->up = dir->i_ino; + fnode->up = cpu_to_le32(dir->i_ino); hpfs_set_ea(result, fnode, "SYMLINK", symlink, strlen(symlink)); mark_buffer_dirty(bh); brelse(bh); @@ -382,7 +382,7 @@ again: if (de->directory) goto out1; - fno = de->fnode; + fno = le32_to_cpu(de->fnode); r = hpfs_remove_dirent(dir, dno, de, &qbh, 1); switch (r) { case 1: @@ -465,7 +465,7 @@ static int hpfs_rmdir(struct inode *dir, struct dentry *dentry) if (n_items) goto out1; - fno = de->fnode; + fno = le32_to_cpu(de->fnode); r = hpfs_remove_dirent(dir, dno, de, &qbh, 1); switch (r) { case 1: @@ -608,7 +608,7 @@ static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry, drop_nlink(old_dir); } if ((fnode = hpfs_map_fnode(i->i_sb, i->i_ino, &bh))) { - fnode->up = new_dir->i_ino; + fnode->up = cpu_to_le32(new_dir->i_ino); fnode->len = new_len; memcpy(fnode->name, new_name, new_len>15?15:new_len); if (new_len < 15) memset(&fnode->name[new_len], 0, 15 - new_len); diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index 4a7d026..98580a3 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -135,7 +135,7 @@ static unsigned count_bitmaps(struct super_block *s) n_bands = (hpfs_sb(s)->sb_fs_size + 0x3fff) >> 14; count = 0; for (n = 0; n < n_bands; n++) - count += hpfs_count_one_bitmap(s, hpfs_sb(s)->sb_bmp_dir[n]); + count += hpfs_count_one_bitmap(s, le32_to_cpu(hpfs_sb(s)->sb_bmp_dir[n])); return count; } @@ -509,9 +509,9 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent) if (!(spareblock = hpfs_map_sector(s, 17, &bh2, 0))) goto bail3; /* Check magics */ - if (/*bootblock->magic != BB_MAGIC - ||*/ superblock->magic != SB_MAGIC - || spareblock->magic != SP_MAGIC) { + if (/*le16_to_cpu(bootblock->magic) != BB_MAGIC + ||*/ le32_to_cpu(superblock->magic) != SB_MAGIC + || le32_to_cpu(spareblock->magic) != SP_MAGIC) { if (!silent) printk("HPFS: Bad magic ... probably not HPFS\n"); goto bail4; } @@ -532,12 +532,12 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent) s->s_op = &hpfs_sops; s->s_d_op = &hpfs_dentry_operations; - sbi->sb_root = superblock->root; - sbi->sb_fs_size = superblock->n_sectors; - sbi->sb_bitmaps = superblock->bitmaps; - sbi->sb_dirband_start = superblock->dir_band_start; - sbi->sb_dirband_size = superblock->n_dir_band; - sbi->sb_dmap = superblock->dir_band_bitmap; + sbi->sb_root = le32_to_cpu(superblock->root); + sbi->sb_fs_size = le32_to_cpu(superblock->n_sectors); + sbi->sb_bitmaps = le32_to_cpu(superblock->bitmaps); + sbi->sb_dirband_start = le32_to_cpu(superblock->dir_band_start); + sbi->sb_dirband_size = le32_to_cpu(superblock->n_dir_band); + sbi->sb_dmap = le32_to_cpu(superblock->dir_band_bitmap); sbi->sb_uid = uid; sbi->sb_gid = gid; sbi->sb_mode = 0777 & ~umask; @@ -555,7 +555,7 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent) sbi->sb_max_fwd_alloc = 0xffffff; /* Load bitmap directory */ - if (!(sbi->sb_bmp_dir = hpfs_load_bitmap_directory(s, superblock->bitmaps))) + if (!(sbi->sb_bmp_dir = hpfs_load_bitmap_directory(s, le32_to_cpu(superblock->bitmaps)))) goto bail4; /* Check for general fs errors*/ @@ -573,7 +573,7 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent) mark_buffer_dirty(bh2); } - if (spareblock->hotfixes_used || spareblock->n_spares_used) { + if (le32_to_cpu(spareblock->hotfixes_used) || le32_to_cpu(spareblock->n_spares_used)) { if (errs >= 2) { printk("HPFS: Hotfixes not supported here, try chkdsk\n"); mark_dirty(s, 0); @@ -583,7 +583,7 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent) if (errs == 0) printk("HPFS: Proceeding, but your filesystem will be probably corrupted by this driver...\n"); else printk("HPFS: This driver may read bad files or crash when operating on disk with hotfixes.\n"); } - if (spareblock->n_dnode_spares != spareblock->n_dnode_spares_free) { + if (le32_to_cpu(spareblock->n_dnode_spares) != le32_to_cpu(spareblock->n_dnode_spares_free)) { if (errs >= 2) { printk("HPFS: Spare dnodes used, try chkdsk\n"); mark_dirty(s, 0); @@ -594,17 +594,17 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent) } if (chk) { unsigned a; - if (superblock->dir_band_end - superblock->dir_band_start + 1 != superblock->n_dir_band || - superblock->dir_band_end < superblock->dir_band_start || superblock->n_dir_band > 0x4000) { + if (le32_to_cpu(superblock->dir_band_end) - le32_to_cpu(superblock->dir_band_start) + 1 != le32_to_cpu(superblock->n_dir_band) || + le32_to_cpu(superblock->dir_band_end) < le32_to_cpu(superblock->dir_band_start) || le32_to_cpu(superblock->n_dir_band) > 0x4000) { hpfs_error(s, "dir band size mismatch: dir_band_start==%08x, dir_band_end==%08x, n_dir_band==%08x", - superblock->dir_band_start, superblock->dir_band_end, superblock->n_dir_band); + le32_to_cpu(superblock->dir_band_start), le32_to_cpu(superblock->dir_band_end), le32_to_cpu(superblock->n_dir_band)); goto bail4; } a = sbi->sb_dirband_size; sbi->sb_dirband_size = 0; - if (hpfs_chk_sectors(s, superblock->dir_band_start, superblock->n_dir_band, "dir_band") || - hpfs_chk_sectors(s, superblock->dir_band_bitmap, 4, "dir_band_bitmap") || - hpfs_chk_sectors(s, superblock->bitmaps, 4, "bitmaps")) { + if (hpfs_chk_sectors(s, le32_to_cpu(superblock->dir_band_start), le32_to_cpu(superblock->n_dir_band), "dir_band") || + hpfs_chk_sectors(s, le32_to_cpu(superblock->dir_band_bitmap), 4, "dir_band_bitmap") || + hpfs_chk_sectors(s, le32_to_cpu(superblock->bitmaps), 4, "bitmaps")) { mark_dirty(s, 0); goto bail4; } @@ -612,8 +612,8 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent) } else printk("HPFS: You really don't want any checks? You are crazy...\n"); /* Load code page table */ - if (spareblock->n_code_pages) - if (!(sbi->sb_cp_table = hpfs_load_code_page(s, spareblock->code_page_dir))) + if (le32_to_cpu(spareblock->n_code_pages)) + if (!(sbi->sb_cp_table = hpfs_load_code_page(s, le32_to_cpu(spareblock->code_page_dir)))) printk("HPFS: Warning: code page support is disabled\n"); brelse(bh2); @@ -642,13 +642,13 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent) if (!de) hpfs_error(s, "unable to find root dir"); else { - root->i_atime.tv_sec = local_to_gmt(s, de->read_date); + root->i_atime.tv_sec = local_to_gmt(s, le32_to_cpu(de->read_date)); root->i_atime.tv_nsec = 0; - root->i_mtime.tv_sec = local_to_gmt(s, de->write_date); + root->i_mtime.tv_sec = local_to_gmt(s, le32_to_cpu(de->write_date)); root->i_mtime.tv_nsec = 0; - root->i_ctime.tv_sec = local_to_gmt(s, de->creation_date); + root->i_ctime.tv_sec = local_to_gmt(s, le32_to_cpu(de->creation_date)); root->i_ctime.tv_nsec = 0; - hpfs_i(root)->i_ea_size = de->ea_size; + hpfs_i(root)->i_ea_size = le16_to_cpu(de->ea_size); hpfs_i(root)->i_parent_dir = root->i_ino; if (root->i_size == -1) root->i_size = 2048; -- cgit v1.1 From d0969d1949cc67a0f100f30ad69ec7ec1eca70d2 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Sun, 8 May 2011 20:44:32 +0200 Subject: HPFS: Fix some unaligned accesses Fix some unaligned accesses Signed-off-by: Mikulas Patocka Signed-off-by: Linus Torvalds --- fs/hpfs/ea.c | 41 +++++++++++++++++++++-------------------- fs/hpfs/hpfs.h | 3 ++- fs/hpfs/hpfs_fn.h | 12 +++++++++--- 3 files changed, 32 insertions(+), 24 deletions(-) diff --git a/fs/hpfs/ea.c b/fs/hpfs/ea.c index 1bbc37d..7f1d90c 100644 --- a/fs/hpfs/ea.c +++ b/fs/hpfs/ea.c @@ -24,7 +24,7 @@ void hpfs_ea_ext_remove(struct super_block *s, secno a, int ano, unsigned len) } if (hpfs_ea_read(s, a, ano, pos, 4, ex)) return; if (ea->indirect) { - if (le16_to_cpu(ea->valuelen) != 8) { + if (ea_valuelen(ea) != 8) { hpfs_error(s, "ea->indirect set while ea->valuelen!=8, %s %08x, pos %08x", ano ? "anode" : "sectors", a, pos); return; @@ -33,7 +33,7 @@ void hpfs_ea_ext_remove(struct super_block *s, secno a, int ano, unsigned len) return; hpfs_ea_remove(s, ea_sec(ea), ea->anode, ea_len(ea)); } - pos += ea->namelen + le16_to_cpu(ea->valuelen) + 5; + pos += ea->namelen + ea_valuelen(ea) + 5; } if (!ano) hpfs_free_sectors(s, a, (len+511) >> 9); else { @@ -82,10 +82,10 @@ int hpfs_read_ea(struct super_block *s, struct fnode *fnode, char *key, if (!strcmp(ea->name, key)) { if (ea->indirect) goto indirect; - if (le16_to_cpu(ea->valuelen) >= size) + if (ea_valuelen(ea) >= size) return -EINVAL; - memcpy(buf, ea_data(ea), le16_to_cpu(ea->valuelen)); - buf[le16_to_cpu(ea->valuelen)] = 0; + memcpy(buf, ea_data(ea), ea_valuelen(ea)); + buf[ea_valuelen(ea)] = 0; return 0; } a = le32_to_cpu(fnode->ea_secno); @@ -106,14 +106,14 @@ int hpfs_read_ea(struct super_block *s, struct fnode *fnode, char *key, if (!strcmp(ea->name, key)) { if (ea->indirect) goto indirect; - if (le16_to_cpu(ea->valuelen) >= size) + if (ea_valuelen(ea) >= size) return -EINVAL; - if (hpfs_ea_read(s, a, ano, pos + 4 + ea->namelen + 1, le16_to_cpu(ea->valuelen), buf)) + if (hpfs_ea_read(s, a, ano, pos + 4 + ea->namelen + 1, ea_valuelen(ea), buf)) return -EIO; - buf[le16_to_cpu(ea->valuelen)] = 0; + buf[ea_valuelen(ea)] = 0; return 0; } - pos += ea->namelen + le16_to_cpu(ea->valuelen) + 5; + pos += ea->namelen + ea_valuelen(ea) + 5; } return -ENOENT; indirect: @@ -138,12 +138,12 @@ char *hpfs_get_ea(struct super_block *s, struct fnode *fnode, char *key, int *si if (!strcmp(ea->name, key)) { if (ea->indirect) return get_indirect_ea(s, ea->anode, ea_sec(ea), *size = ea_len(ea)); - if (!(ret = kmalloc((*size = le16_to_cpu(ea->valuelen)) + 1, GFP_NOFS))) { + if (!(ret = kmalloc((*size = ea_valuelen(ea)) + 1, GFP_NOFS))) { printk("HPFS: out of memory for EA\n"); return NULL; } - memcpy(ret, ea_data(ea), le16_to_cpu(ea->valuelen)); - ret[le16_to_cpu(ea->valuelen)] = 0; + memcpy(ret, ea_data(ea), ea_valuelen(ea)); + ret[ea_valuelen(ea)] = 0; return ret; } a = le32_to_cpu(fnode->ea_secno); @@ -164,18 +164,18 @@ char *hpfs_get_ea(struct super_block *s, struct fnode *fnode, char *key, int *si if (!strcmp(ea->name, key)) { if (ea->indirect) return get_indirect_ea(s, ea->anode, ea_sec(ea), *size = ea_len(ea)); - if (!(ret = kmalloc((*size = le16_to_cpu(ea->valuelen)) + 1, GFP_NOFS))) { + if (!(ret = kmalloc((*size = ea_valuelen(ea)) + 1, GFP_NOFS))) { printk("HPFS: out of memory for EA\n"); return NULL; } - if (hpfs_ea_read(s, a, ano, pos + 4 + ea->namelen + 1, le16_to_cpu(ea->valuelen), ret)) { + if (hpfs_ea_read(s, a, ano, pos + 4 + ea->namelen + 1, ea_valuelen(ea), ret)) { kfree(ret); return NULL; } - ret[le16_to_cpu(ea->valuelen)] = 0; + ret[ea_valuelen(ea)] = 0; return ret; } - pos += ea->namelen + le16_to_cpu(ea->valuelen) + 5; + pos += ea->namelen + ea_valuelen(ea) + 5; } return NULL; } @@ -202,7 +202,7 @@ void hpfs_set_ea(struct inode *inode, struct fnode *fnode, const char *key, if (ea->indirect) { if (ea_len(ea) == size) set_indirect_ea(s, ea->anode, ea_sec(ea), data, size); - } else if (le16_to_cpu(ea->valuelen) == size) { + } else if (ea_valuelen(ea) == size) { memcpy(ea_data(ea), data, size); } return; @@ -228,12 +228,12 @@ void hpfs_set_ea(struct inode *inode, struct fnode *fnode, const char *key, set_indirect_ea(s, ea->anode, ea_sec(ea), data, size); } else { - if (le16_to_cpu(ea->valuelen) == size) + if (ea_valuelen(ea) == size) hpfs_ea_write(s, a, ano, pos + 4 + ea->namelen + 1, size, data); } return; } - pos += ea->namelen + le16_to_cpu(ea->valuelen) + 5; + pos += ea->namelen + ea_valuelen(ea) + 5; } if (!le16_to_cpu(fnode->ea_offs)) { /*if (le16_to_cpu(fnode->ea_size_s)) { @@ -254,7 +254,8 @@ void hpfs_set_ea(struct inode *inode, struct fnode *fnode, const char *key, ea = fnode_end_ea(fnode); *(char *)ea = 0; ea->namelen = strlen(key); - ea->valuelen = cpu_to_le16(size); + ea->valuelen_lo = size; + ea->valuelen_hi = size >> 8; strcpy(ea->name, key); memcpy(ea_data(ea), data, size); fnode->ea_size_s = cpu_to_le16(le16_to_cpu(fnode->ea_size_s) + strlen(key) + size + 5); diff --git a/fs/hpfs/hpfs.h b/fs/hpfs/hpfs.h index 91a6223..8b0650a 100644 --- a/fs/hpfs/hpfs.h +++ b/fs/hpfs/hpfs.h @@ -546,7 +546,8 @@ struct extended_attribute where real value starts */ #endif u8 namelen; /* length of name, bytes */ - u16 valuelen; /* length of value, bytes */ + u8 valuelen_lo; /* length of value, bytes */ + u8 valuelen_hi; /* length of value, bytes */ u8 name[0]; /* u8 name[namelen]; ascii attrib name diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h index f993773..dd552f8 100644 --- a/fs/hpfs/hpfs_fn.h +++ b/fs/hpfs/hpfs_fn.h @@ -13,6 +13,7 @@ #include #include #include +#include #include "hpfs.h" @@ -135,19 +136,24 @@ static inline struct extended_attribute *fnode_end_ea(struct fnode *fnode) return (struct extended_attribute *)((char *)fnode + le16_to_cpu(fnode->ea_offs) + le16_to_cpu(fnode->acl_size_s) + le16_to_cpu(fnode->ea_size_s)); } +static unsigned ea_valuelen(struct extended_attribute *ea) +{ + return ea->valuelen_lo + 256 * ea->valuelen_hi; +} + static inline struct extended_attribute *next_ea(struct extended_attribute *ea) { - return (struct extended_attribute *)((char *)ea + 5 + ea->namelen + le16_to_cpu(ea->valuelen)); + return (struct extended_attribute *)((char *)ea + 5 + ea->namelen + ea_valuelen(ea)); } static inline secno ea_sec(struct extended_attribute *ea) { - return le32_to_cpu(*((secno *)((char *)ea + 9 + ea->namelen))); + return le32_to_cpu(get_unaligned((secno *)((char *)ea + 9 + ea->namelen))); } static inline secno ea_len(struct extended_attribute *ea) { - return le32_to_cpu(*((secno *)((char *)ea + 5 + ea->namelen))); + return le32_to_cpu(get_unaligned((secno *)((char *)ea + 5 + ea->namelen))); } static inline char *ea_data(struct extended_attribute *ea) -- cgit v1.1 From c3514817445a5a5e6c0d0c8152f5f161a98001db Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Sun, 8 May 2011 20:44:38 +0200 Subject: HPFS: Move declaration up, so that there are no out-of-scope pointers Move declaration up, so that there are no out-of-scope pointers Reported-by: Jesper Juhl Signed-off-by: Mikulas Patocka Signed-off-by: Linus Torvalds --- fs/hpfs/ea.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/hpfs/ea.c b/fs/hpfs/ea.c index 7f1d90c..d8b84d1 100644 --- a/fs/hpfs/ea.c +++ b/fs/hpfs/ea.c @@ -76,6 +76,7 @@ int hpfs_read_ea(struct super_block *s, struct fnode *fnode, char *key, unsigned pos; int ano, len; secno a; + char ex[4 + 255 + 1 + 8]; struct extended_attribute *ea; struct extended_attribute *ea_end = fnode_end_ea(fnode); for (ea = fnode_ea(fnode); ea < ea_end; ea = next_ea(ea)) @@ -93,7 +94,6 @@ int hpfs_read_ea(struct super_block *s, struct fnode *fnode, char *key, ano = fnode->ea_anode; pos = 0; while (pos < len) { - char ex[4 + 255 + 1 + 8]; ea = (struct extended_attribute *)ex; if (pos + 4 > len) { hpfs_error(s, "EAs don't end correctly, %s %08x, len %08x", -- cgit v1.1 From 88f4e9e870c01452e57a6943c04c8d62f6a0a7a6 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Sun, 8 May 2011 20:44:46 +0200 Subject: HPFS: Remove unused variable Remove unused variable Signed-off-by: Mikulas Patocka Signed-off-by: Linus Torvalds --- fs/hpfs/namei.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c index 5a8de6a..1f05839 100644 --- a/fs/hpfs/namei.c +++ b/fs/hpfs/namei.c @@ -361,7 +361,6 @@ static int hpfs_unlink(struct inode *dir, struct dentry *dentry) struct hpfs_dirent *de; struct inode *inode = dentry->d_inode; dnode_secno dno; - fnode_secno fno; int r; int rep = 0; int err; @@ -382,7 +381,6 @@ again: if (de->directory) goto out1; - fno = le32_to_cpu(de->fnode); r = hpfs_remove_dirent(dir, dno, de, &qbh, 1); switch (r) { case 1: @@ -440,7 +438,6 @@ static int hpfs_rmdir(struct inode *dir, struct dentry *dentry) struct hpfs_dirent *de; struct inode *inode = dentry->d_inode; dnode_secno dno; - fnode_secno fno; int n_items = 0; int err; int r; @@ -465,7 +462,6 @@ static int hpfs_rmdir(struct inode *dir, struct dentry *dentry) if (n_items) goto out1; - fno = le32_to_cpu(de->fnode); r = hpfs_remove_dirent(dir, dno, de, &qbh, 1); switch (r) { case 1: -- cgit v1.1 From 49183b2818de6899383bb82bc032f9344d6791ff Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Tue, 19 Apr 2011 21:14:14 +0100 Subject: drm/i915: Only enable the plane after setting the fb base (pre-ILK) When enabling the plane, it is helpful to have already pointed that plane to valid memory or else we may incur the wrath of a PGTBL_ER. This code preserved the behaviour from the bad old days for unknown reasons... Found by assert_fb_bound_for_plane(). References: https://bugs.freedesktop.org/show_bug.cgi?id=36246 Signed-off-by: Chris Wilson Cc: Daniel Vetter Cc: Jesse Barnes Signed-off-by: Keith Packard --- drivers/gpu/drm/i915/intel_display.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c index aab06cf..967451e 100644 --- a/drivers/gpu/drm/i915/intel_display.c +++ b/drivers/gpu/drm/i915/intel_display.c @@ -5154,8 +5154,6 @@ static int intel_crtc_mode_set(struct drm_crtc *crtc, I915_WRITE(DSPCNTR(plane), dspcntr); POSTING_READ(DSPCNTR(plane)); - if (!HAS_PCH_SPLIT(dev)) - intel_enable_plane(dev_priv, plane, pipe); ret = intel_pipe_set_base(crtc, x, y, old_fb); -- cgit v1.1 From 39adb7a542db08998b4ae88f1698c4300dc39b55 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Fri, 22 Apr 2011 22:17:21 +0100 Subject: drm/i915: fix intel_crtc_clock_get pipe reads after "cleanup cleanup" Despite the fixes in 548f245ba6a31 (drm/i915: fix per-pipe reads after "cleanup"), we missed one neighbouring read that was mistakenly replaced with the reg value in 9db4a9c (drm/i915: cleanup per-pipe reg usage). This was preventing us from correctly determining the mode the BIOS left the panel in for machines that neither have an OpRegion nor access to the VBT, (e.g. the EeePC 700). Signed-off-by: Chris Wilson Cc: Jesse Barnes Cc: stable@kernel.org Reviewed-by: Jesse Barnes Signed-off-by: Keith Packard --- drivers/gpu/drm/i915/intel_display.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c index 967451e..373c2a0 100644 --- a/drivers/gpu/drm/i915/intel_display.c +++ b/drivers/gpu/drm/i915/intel_display.c @@ -5603,9 +5603,9 @@ static int intel_crtc_clock_get(struct drm_device *dev, struct drm_crtc *crtc) intel_clock_t clock; if ((dpll & DISPLAY_RATE_SELECT_FPA1) == 0) - fp = FP0(pipe); + fp = I915_READ(FP0(pipe)); else - fp = FP1(pipe); + fp = I915_READ(FP1(pipe)); clock.m1 = (fp & FP_M1_DIV_MASK) >> FP_M1_DIV_SHIFT; if (IS_PINEVIEW(dev)) { -- cgit v1.1 From 2fb4e61d9471867677c97bf11dba8f1e9dfa7f7c Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Thu, 21 Apr 2011 16:08:14 -0600 Subject: drm/i915/lvds: Only act on lid notify when the device is on If we're using vga switcheroo, the device may be turned off and poking it can return random state. This provokes an OOPS fixed separately by 8ff887c847 (drm/i915/dp: Be paranoid in case we disable a DP before it is attached). Trying to use and respond to events on a device that has been turned off by the user is in principle a silly thing to do. Signed-off-by: Alex Williamson Signed-off-by: Chris Wilson Cc: stable@kernel.org Signed-off-by: Keith Packard --- drivers/gpu/drm/i915/intel_lvds.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/i915/intel_lvds.c b/drivers/gpu/drm/i915/intel_lvds.c index a562bd2..67cb076 100644 --- a/drivers/gpu/drm/i915/intel_lvds.c +++ b/drivers/gpu/drm/i915/intel_lvds.c @@ -539,6 +539,9 @@ static int intel_lid_notify(struct notifier_block *nb, unsigned long val, struct drm_device *dev = dev_priv->dev; struct drm_connector *connector = dev_priv->int_lvds_connector; + if (dev->switch_power_state != DRM_SWITCH_POWER_ON) + return NOTIFY_OK; + /* * check and update the status of LVDS connector after receiving * the LID nofication event. -- cgit v1.1 From dcbe14b91a920657ff3a9ba0efb7c5b5562f956a Mon Sep 17 00:00:00 2001 From: Kleber Sacilotto de Souza Date: Wed, 4 May 2011 13:05:11 +0000 Subject: ehea: fix wrongly reported speed and port Currently EHEA reports to ethtool as supporting 10M, 100M, 1G and 10G and connected to FIBRE independent of the hardware configuration. However, when connected to FIBRE the only supported speed is 10G full-duplex, and the other speeds and modes are only supported when connected to twisted pair. Signed-off-by: Kleber Sacilotto de Souza Acked-by: Breno Leitao Signed-off-by: David S. Miller --- drivers/net/ehea/ehea_ethtool.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/drivers/net/ehea/ehea_ethtool.c b/drivers/net/ehea/ehea_ethtool.c index 3e2e734..f3bbdce 100644 --- a/drivers/net/ehea/ehea_ethtool.c +++ b/drivers/net/ehea/ehea_ethtool.c @@ -55,15 +55,20 @@ static int ehea_get_settings(struct net_device *dev, struct ethtool_cmd *cmd) cmd->duplex = -1; } - cmd->supported = (SUPPORTED_10000baseT_Full | SUPPORTED_1000baseT_Full - | SUPPORTED_100baseT_Full | SUPPORTED_100baseT_Half - | SUPPORTED_10baseT_Full | SUPPORTED_10baseT_Half - | SUPPORTED_Autoneg | SUPPORTED_FIBRE); - - cmd->advertising = (ADVERTISED_10000baseT_Full | ADVERTISED_Autoneg - | ADVERTISED_FIBRE); + if (cmd->speed == SPEED_10000) { + cmd->supported = (SUPPORTED_10000baseT_Full | SUPPORTED_FIBRE); + cmd->advertising = (ADVERTISED_10000baseT_Full | ADVERTISED_FIBRE); + cmd->port = PORT_FIBRE; + } else { + cmd->supported = (SUPPORTED_1000baseT_Full | SUPPORTED_100baseT_Full + | SUPPORTED_100baseT_Half | SUPPORTED_10baseT_Full + | SUPPORTED_10baseT_Half | SUPPORTED_Autoneg + | SUPPORTED_TP); + cmd->advertising = (ADVERTISED_1000baseT_Full | ADVERTISED_Autoneg + | ADVERTISED_TP); + cmd->port = PORT_TP; + } - cmd->port = PORT_FIBRE; cmd->autoneg = port->autoneg == 1 ? AUTONEG_ENABLE : AUTONEG_DISABLE; return 0; -- cgit v1.1 From 6709d9521df05c105343473ab8b147e2ef1e13d8 Mon Sep 17 00:00:00 2001 From: Somnath Kotur Date: Wed, 4 May 2011 22:40:46 +0000 Subject: be2net: Fixed bugs related to PVID. Fixed bug to make sure 'pvid' retrieval will work on big endian hosts. Fixed incorrect comparison between the Rx Completion's 16-bit VLAN TCI and the PVID. Now comparing only the relevant 12 bits corresponding to the VID. Renamed 'vid' field under Rx Completion to 'vlan_tag' to reflect accurate description. Signed-off-by: Somnath Kotur Signed-off-by: David S. Miller --- drivers/net/benet/be.h | 2 +- drivers/net/benet/be_cmds.c | 2 +- drivers/net/benet/be_main.c | 18 ++++++++++++------ 3 files changed, 14 insertions(+), 8 deletions(-) diff --git a/drivers/net/benet/be.h b/drivers/net/benet/be.h index 66823ed..2353eca 100644 --- a/drivers/net/benet/be.h +++ b/drivers/net/benet/be.h @@ -213,7 +213,7 @@ struct be_rx_stats { struct be_rx_compl_info { u32 rss_hash; - u16 vid; + u16 vlan_tag; u16 pkt_size; u16 rxq_idx; u16 mac_id; diff --git a/drivers/net/benet/be_cmds.c b/drivers/net/benet/be_cmds.c index 1e2d825..9dc9394 100644 --- a/drivers/net/benet/be_cmds.c +++ b/drivers/net/benet/be_cmds.c @@ -132,7 +132,7 @@ static void be_async_grp5_pvid_state_process(struct be_adapter *adapter, struct be_async_event_grp5_pvid_state *evt) { if (evt->enabled) - adapter->pvid = evt->tag; + adapter->pvid = le16_to_cpu(evt->tag); else adapter->pvid = 0; } diff --git a/drivers/net/benet/be_main.c b/drivers/net/benet/be_main.c index 02a0443..9187fb4 100644 --- a/drivers/net/benet/be_main.c +++ b/drivers/net/benet/be_main.c @@ -1018,7 +1018,8 @@ static void be_rx_compl_process(struct be_adapter *adapter, kfree_skb(skb); return; } - vlan_hwaccel_receive_skb(skb, adapter->vlan_grp, rxcp->vid); + vlan_hwaccel_receive_skb(skb, adapter->vlan_grp, + rxcp->vlan_tag); } else { netif_receive_skb(skb); } @@ -1076,7 +1077,8 @@ static void be_rx_compl_process_gro(struct be_adapter *adapter, if (likely(!rxcp->vlanf)) napi_gro_frags(&eq_obj->napi); else - vlan_gro_frags(&eq_obj->napi, adapter->vlan_grp, rxcp->vid); + vlan_gro_frags(&eq_obj->napi, adapter->vlan_grp, + rxcp->vlan_tag); } static void be_parse_rx_compl_v1(struct be_adapter *adapter, @@ -1102,7 +1104,8 @@ static void be_parse_rx_compl_v1(struct be_adapter *adapter, rxcp->pkt_type = AMAP_GET_BITS(struct amap_eth_rx_compl_v1, cast_enc, compl); rxcp->vtm = AMAP_GET_BITS(struct amap_eth_rx_compl_v1, vtm, compl); - rxcp->vid = AMAP_GET_BITS(struct amap_eth_rx_compl_v1, vlan_tag, compl); + rxcp->vlan_tag = AMAP_GET_BITS(struct amap_eth_rx_compl_v1, vlan_tag, + compl); } static void be_parse_rx_compl_v0(struct be_adapter *adapter, @@ -1128,7 +1131,8 @@ static void be_parse_rx_compl_v0(struct be_adapter *adapter, rxcp->pkt_type = AMAP_GET_BITS(struct amap_eth_rx_compl_v0, cast_enc, compl); rxcp->vtm = AMAP_GET_BITS(struct amap_eth_rx_compl_v0, vtm, compl); - rxcp->vid = AMAP_GET_BITS(struct amap_eth_rx_compl_v0, vlan_tag, compl); + rxcp->vlan_tag = AMAP_GET_BITS(struct amap_eth_rx_compl_v0, vlan_tag, + compl); } static struct be_rx_compl_info *be_rx_compl_get(struct be_rx_obj *rxo) @@ -1155,9 +1159,11 @@ static struct be_rx_compl_info *be_rx_compl_get(struct be_rx_obj *rxo) rxcp->vlanf = 0; if (!lancer_chip(adapter)) - rxcp->vid = swab16(rxcp->vid); + rxcp->vlan_tag = swab16(rxcp->vlan_tag); - if ((adapter->pvid == rxcp->vid) && !adapter->vlan_tag[rxcp->vid]) + if (((adapter->pvid & VLAN_VID_MASK) == + (rxcp->vlan_tag & VLAN_VID_MASK)) && + !adapter->vlan_tag[rxcp->vlan_tag]) rxcp->vlanf = 0; /* As the compl has been parsed, reset it; we wont touch it again */ -- cgit v1.1 From 057bef938896e6266ae24ec4266d24792d27c29a Mon Sep 17 00:00:00 2001 From: Matvejchikov Ilya Date: Fri, 6 May 2011 06:23:09 +0000 Subject: NET: slip, fix ldisc->open retval TTY layer expects 0 if the ldisc->open operation succeeded. Signed-off-by : Matvejchikov Ilya Acked-by: Oliver Hartkopp Acked-by: Alan Cox Signed-off-by: David S. Miller --- drivers/net/slip.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/slip.c b/drivers/net/slip.c index 86cbb9e..8ec1a9a 100644 --- a/drivers/net/slip.c +++ b/drivers/net/slip.c @@ -853,7 +853,9 @@ static int slip_open(struct tty_struct *tty) /* Done. We have linked the TTY line to a channel. */ rtnl_unlock(); tty->receive_room = 65536; /* We don't flow control */ - return sl->dev->base_addr; + + /* TTY layer expects 0 on success */ + return 0; err_free_bufs: sl_free_bufs(sl); -- cgit v1.1 From ce3dad0f74e6b240f0b1dedbd8ea268a3f298d82 Mon Sep 17 00:00:00 2001 From: Toshiharu Okada Date: Fri, 6 May 2011 02:53:51 +0000 Subject: PCH_GbE : Fixed the issue of collision detection The collision detection setting was invalid. When collision occurred, because data was not resent, there was an issue to which a transmitting throughput falls. This patch enables the collision detection. Signed-off-by: Toshiharu Okada Signed-off-by: David S. Miller --- drivers/net/pch_gbe/pch_gbe_main.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/pch_gbe/pch_gbe_main.c b/drivers/net/pch_gbe/pch_gbe_main.c index 2ef2f9c..4ebd1d4 100644 --- a/drivers/net/pch_gbe/pch_gbe_main.c +++ b/drivers/net/pch_gbe/pch_gbe_main.c @@ -43,8 +43,7 @@ const char pch_driver_version[] = DRV_VERSION; #define PCH_GBE_MAC_RGMII_CTRL_SETTING ( \ PCH_GBE_CHIP_TYPE_INTERNAL | \ - PCH_GBE_RGMII_MODE_RGMII | \ - PCH_GBE_CRS_SEL \ + PCH_GBE_RGMII_MODE_RGMII \ ) /* Ethertype field values */ -- cgit v1.1 From 5d05a04d283061b586e8dc819cfa6f4b8cfd5948 Mon Sep 17 00:00:00 2001 From: Toshiharu Okada Date: Fri, 6 May 2011 02:53:56 +0000 Subject: PCH_GbE : Fixed the issue of checksum judgment The checksum judgment was mistaken. Judgment result 0:Correct 1:Wrong This patch fixes the issue. Signed-off-by: Toshiharu Okada Signed-off-by: David S. Miller --- drivers/net/pch_gbe/pch_gbe_main.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/net/pch_gbe/pch_gbe_main.c b/drivers/net/pch_gbe/pch_gbe_main.c index 4ebd1d4..9f6c402 100644 --- a/drivers/net/pch_gbe/pch_gbe_main.c +++ b/drivers/net/pch_gbe/pch_gbe_main.c @@ -1493,12 +1493,11 @@ pch_gbe_clean_rx(struct pch_gbe_adapter *adapter, /* Write meta date of skb */ skb_put(skb, length); skb->protocol = eth_type_trans(skb, netdev); - if ((tcp_ip_status & PCH_GBE_RXD_ACC_STAT_TCPIPOK) == - PCH_GBE_RXD_ACC_STAT_TCPIPOK) { - skb->ip_summed = CHECKSUM_UNNECESSARY; - } else { + if (tcp_ip_status & PCH_GBE_RXD_ACC_STAT_TCPIPOK) skb->ip_summed = CHECKSUM_NONE; - } + else + skb->ip_summed = CHECKSUM_UNNECESSARY; + napi_gro_receive(&adapter->napi, skb); (*work_done)++; pr_debug("Receive skb->ip_summed: %d length: %d\n", -- cgit v1.1 From 2b5e8ef35bc89eee944c0627edacbb1fea5a1b84 Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Thu, 5 May 2011 15:19:42 -0400 Subject: x86, efi: Remove virtual-mode SetVirtualAddressMap call The spec says that SetVirtualAddressMap doesn't work once you're in virtual mode, so there's no point in having infrastructure for calling it from there. Signed-off-by: Matthew Garrett Link: http://lkml.kernel.org/r/1304623186-18261-1-git-send-email-mjg@redhat.com Signed-off-by: H. Peter Anvin --- arch/x86/platform/efi/efi.c | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 0fe27d7..f2e4fe9 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -145,17 +145,6 @@ static void virt_efi_reset_system(int reset_type, data_size, data); } -static efi_status_t virt_efi_set_virtual_address_map( - unsigned long memory_map_size, - unsigned long descriptor_size, - u32 descriptor_version, - efi_memory_desc_t *virtual_map) -{ - return efi_call_virt4(set_virtual_address_map, - memory_map_size, descriptor_size, - descriptor_version, virtual_map); -} - static efi_status_t __init phys_efi_set_virtual_address_map( unsigned long memory_map_size, unsigned long descriptor_size, @@ -572,7 +561,7 @@ void __init efi_enter_virtual_mode(void) efi.set_variable = virt_efi_set_variable; efi.get_next_high_mono_count = virt_efi_get_next_high_mono_count; efi.reset_system = virt_efi_reset_system; - efi.set_virtual_address_map = virt_efi_set_virtual_address_map; + efi.set_virtual_address_map = NULL; if (__supported_pte_mask & _PAGE_NX) runtime_code_page_mkexec(); early_iounmap(memmap.map, memmap.nr_map * memmap.desc_size); -- cgit v1.1 From 9cd2b07c197e3ff594fc04f5fb3d86efbeab6ad8 Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Thu, 5 May 2011 15:19:43 -0400 Subject: x86, efi: Consolidate EFI nx control The core EFI code and 64-bit EFI code currently have independent implementations of code for setting memory regions as executable or not. Let's consolidate them. Signed-off-by: Matthew Garrett Link: http://lkml.kernel.org/r/1304623186-18261-2-git-send-email-mjg@redhat.com Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/efi.h | 1 + arch/x86/platform/efi/efi.c | 21 ++++++++++++++++----- arch/x86/platform/efi/efi_64.c | 28 +++++----------------------- 3 files changed, 22 insertions(+), 28 deletions(-) diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 8e4a165..7093e4a 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -90,6 +90,7 @@ extern void __iomem *efi_ioremap(unsigned long addr, unsigned long size, #endif /* CONFIG_X86_32 */ extern int add_efi_memmap; +extern void efi_set_executable(efi_memory_desc_t *md, bool executable); extern void efi_memblock_x86_reserve_range(void); extern void efi_call_phys_prelog(void); extern void efi_call_phys_epilog(void); diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index f2e4fe9..7daae27 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -457,11 +457,25 @@ void __init efi_init(void) #endif } +void __init efi_set_executable(efi_memory_desc_t *md, bool executable) +{ + u64 addr, npages; + + addr = md->virt_addr; + npages = md->num_pages; + + memrange_efi_to_native(&addr, &npages); + + if (executable) + set_memory_x(addr, npages); + else + set_memory_nx(addr, npages); +} + static void __init runtime_code_page_mkexec(void) { efi_memory_desc_t *md; void *p; - u64 addr, npages; /* Make EFI runtime service code area executable */ for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { @@ -470,10 +484,7 @@ static void __init runtime_code_page_mkexec(void) if (md->type != EFI_RUNTIME_SERVICES_CODE) continue; - addr = md->virt_addr; - npages = md->num_pages; - memrange_efi_to_native(&addr, &npages); - set_memory_x(addr, npages); + efi_set_executable(md, true); } } diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index ac0621a..94d6b39 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -41,22 +41,7 @@ static pgd_t save_pgd __initdata; static unsigned long efi_flags __initdata; -static void __init early_mapping_set_exec(unsigned long start, - unsigned long end, - int executable) -{ - unsigned long num_pages; - - start &= PMD_MASK; - end = (end + PMD_SIZE - 1) & PMD_MASK; - num_pages = (end - start) >> PAGE_SHIFT; - if (executable) - set_memory_x((unsigned long)__va(start), num_pages); - else - set_memory_nx((unsigned long)__va(start), num_pages); -} - -static void __init early_runtime_code_mapping_set_exec(int executable) +static void __init early_code_mapping_set_exec(int executable) { efi_memory_desc_t *md; void *p; @@ -67,11 +52,8 @@ static void __init early_runtime_code_mapping_set_exec(int executable) /* Make EFI runtime service code area executable */ for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { md = p; - if (md->type == EFI_RUNTIME_SERVICES_CODE) { - unsigned long end; - end = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT); - early_mapping_set_exec(md->phys_addr, end, executable); - } + if (md->type == EFI_RUNTIME_SERVICES_CODE) + efi_set_executable(md, executable); } } @@ -79,7 +61,7 @@ void __init efi_call_phys_prelog(void) { unsigned long vaddress; - early_runtime_code_mapping_set_exec(1); + early_code_mapping_set_exec(1); local_irq_save(efi_flags); vaddress = (unsigned long)__va(0x0UL); save_pgd = *pgd_offset_k(0x0UL); @@ -95,7 +77,7 @@ void __init efi_call_phys_epilog(void) set_pgd(pgd_offset_k(0x0UL), save_pgd); __flush_tlb_all(); local_irq_restore(efi_flags); - early_runtime_code_mapping_set_exec(0); + early_code_mapping_set_exec(0); } void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size, -- cgit v1.1 From 202f9d0a41809e3424af5f61489b48b622824aed Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Thu, 5 May 2011 15:19:44 -0400 Subject: x86, efi: Merge contiguous memory regions of the same type and attribute Some firmware implementations assume that physically contiguous regions will be contiguous in virtual address space. This assumption is, obviously, entirely unjustifiable. Said firmware implementations lack the good grace to handle their failings in a measured and reasonable manner, instead tending to shit all over address space and oopsing the kernel. In an ideal universe these firmware implementations would simultaneously catch fire and cease to be a problem, but since some of them are present in attractively thin and shiny metal devices vanity wins out and some poor developer spends an extended period of time surrounded by a growing array of empty bottles until the underlying reason becomes apparent. Said developer presents this patch, which simply merges adjacent regions if they happen to be contiguous and have the same EFI memory type and caching attributes. Signed-off-by: Matthew Garrett Link: http://lkml.kernel.org/r/1304623186-18261-3-git-send-email-mjg@redhat.com Signed-off-by: H. Peter Anvin --- arch/x86/platform/efi/efi.c | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 7daae27..a46a73e 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -498,13 +498,41 @@ static void __init runtime_code_page_mkexec(void) */ void __init efi_enter_virtual_mode(void) { - efi_memory_desc_t *md; + efi_memory_desc_t *md, *prev_md = NULL; efi_status_t status; unsigned long size; u64 end, systab, addr, npages, end_pfn; void *p, *va; efi.systab = NULL; + + /* Merge contiguous regions of the same type and attribute */ + for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { + u64 prev_size; + md = p; + + if (!prev_md) { + prev_md = md; + continue; + } + + if (prev_md->type != md->type || + prev_md->attribute != md->attribute) { + prev_md = md; + continue; + } + + prev_size = prev_md->num_pages << EFI_PAGE_SHIFT; + + if (md->phys_addr == (prev_md->phys_addr + prev_size)) { + prev_md->num_pages += md->num_pages; + md->type = EFI_RESERVED_TYPE; + md->attribute = 0; + continue; + } + prev_md = md; + } + for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { md = p; if (!(md->attribute & EFI_MEMORY_RUNTIME)) -- cgit v1.1 From 7cb00b72876ea2451eb79d468da0e8fb9134aa8a Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Thu, 5 May 2011 15:19:45 -0400 Subject: x86, efi: Pass a minimal map to SetVirtualAddressMap() Experimentation with various EFI implementations has shown that functions outside runtime services will still update their pointers if SetVirtualAddressMap() is called with memory descriptors outside the runtime area. This is obviously insane, and therefore is unsurprising. Evidence from instrumenting another EFI implementation suggests that it only passes the set of descriptors covering runtime regions, so let's avoid any problems by doing the same. Runtime descriptors are copied to a separate memory map, and only that map is passed back to the firmware. Signed-off-by: Matthew Garrett Link: http://lkml.kernel.org/r/1304623186-18261-4-git-send-email-mjg@redhat.com Signed-off-by: H. Peter Anvin --- arch/x86/platform/efi/efi.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index a46a73e..b30aa26 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -502,7 +502,8 @@ void __init efi_enter_virtual_mode(void) efi_status_t status; unsigned long size; u64 end, systab, addr, npages, end_pfn; - void *p, *va; + void *p, *va, *new_memmap = NULL; + int count = 0; efi.systab = NULL; @@ -569,15 +570,21 @@ void __init efi_enter_virtual_mode(void) systab += md->virt_addr - md->phys_addr; efi.systab = (efi_system_table_t *) (unsigned long) systab; } + new_memmap = krealloc(new_memmap, + (count + 1) * memmap.desc_size, + GFP_KERNEL); + memcpy(new_memmap + (count * memmap.desc_size), md, + memmap.desc_size); + count++; } BUG_ON(!efi.systab); status = phys_efi_set_virtual_address_map( - memmap.desc_size * memmap.nr_map, + memmap.desc_size * count, memmap.desc_size, memmap.desc_version, - memmap.phys_map); + (efi_memory_desc_t *)__pa(new_memmap)); if (status != EFI_SUCCESS) { printk(KERN_ALERT "Unable to switch EFI into virtual mode " @@ -605,6 +612,7 @@ void __init efi_enter_virtual_mode(void) runtime_code_page_mkexec(); early_iounmap(memmap.map, memmap.nr_map * memmap.desc_size); memmap.map = NULL; + kfree(new_memmap); } /* -- cgit v1.1 From 935a638241b0658b9749edd060f972575f9d4a78 Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Thu, 5 May 2011 15:19:46 -0400 Subject: x86, efi: Ensure that the entirity of a region is mapped It's possible for init_memory_mapping() to fail to map the entire region if it crosses a boundary, so ensure that we complete the mapping. Signed-off-by: Matthew Garrett Link: http://lkml.kernel.org/r/1304623186-18261-5-git-send-email-mjg@redhat.com Signed-off-by: H. Peter Anvin --- arch/x86/platform/efi/efi_64.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index 94d6b39..2649426 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -89,8 +89,10 @@ void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size, return ioremap(phys_addr, size); last_map_pfn = init_memory_mapping(phys_addr, phys_addr + size); - if ((last_map_pfn << PAGE_SHIFT) < phys_addr + size) - return NULL; + if ((last_map_pfn << PAGE_SHIFT) < phys_addr + size) { + unsigned long top = last_map_pfn << PAGE_SHIFT; + efi_ioremap(top, size - (top - phys_addr), type); + } return (void __iomem *)__va(phys_addr); } -- cgit v1.1 From b0e6baf5619a6fa3eaf43b55fdb4daa362c3c916 Mon Sep 17 00:00:00 2001 From: Tomoya Date: Mon, 9 May 2011 01:19:37 +0000 Subject: pch_gbe: support ML7223 IOH Support new device OKI SEMICONDUCTOR ML7223 IOH(Input/Output Hub). The ML7223 IOH is for MP(Media Phone) use. The ML7223 is companion chip for Intel Atom E6xx series. The ML7223 is completely compatible for Intel EG20T PCH. Signed-off-by: Tomoya MORINAGA Signed-off-by: David S. Miller --- drivers/net/Kconfig | 8 +++++++- drivers/net/pch_gbe/pch_gbe_main.c | 11 +++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig index dc280bc..6c884ef 100644 --- a/drivers/net/Kconfig +++ b/drivers/net/Kconfig @@ -2536,7 +2536,7 @@ config S6GMAC source "drivers/net/stmmac/Kconfig" config PCH_GBE - tristate "PCH Gigabit Ethernet" + tristate "Intel EG20T PCH / OKI SEMICONDUCTOR ML7223 IOH GbE" depends on PCI select MII ---help--- @@ -2548,6 +2548,12 @@ config PCH_GBE to Gigabit Ethernet. This driver enables Gigabit Ethernet function. + This driver also can be used for OKI SEMICONDUCTOR IOH(Input/ + Output Hub), ML7223. + ML7223 IOH is for MP(Media Phone) use. + ML7223 is companion chip for Intel Atom E6xx series. + ML7223 is completely compatible for Intel EG20T PCH. + endif # NETDEV_1000 # diff --git a/drivers/net/pch_gbe/pch_gbe_main.c b/drivers/net/pch_gbe/pch_gbe_main.c index 9f6c402..56d049a 100644 --- a/drivers/net/pch_gbe/pch_gbe_main.c +++ b/drivers/net/pch_gbe/pch_gbe_main.c @@ -34,6 +34,10 @@ const char pch_driver_version[] = DRV_VERSION; #define PCH_GBE_COPYBREAK_DEFAULT 256 #define PCH_GBE_PCI_BAR 1 +/* Macros for ML7223 */ +#define PCI_VENDOR_ID_ROHM 0x10db +#define PCI_DEVICE_ID_ROHM_ML7223_GBE 0x8013 + #define PCH_GBE_TX_WEIGHT 64 #define PCH_GBE_RX_WEIGHT 64 #define PCH_GBE_RX_BUFFER_WRITE 16 @@ -2418,6 +2422,13 @@ static DEFINE_PCI_DEVICE_TABLE(pch_gbe_pcidev_id) = { .class = (PCI_CLASS_NETWORK_ETHERNET << 8), .class_mask = (0xFFFF00) }, + {.vendor = PCI_VENDOR_ID_ROHM, + .device = PCI_DEVICE_ID_ROHM_ML7223_GBE, + .subvendor = PCI_ANY_ID, + .subdevice = PCI_ANY_ID, + .class = (PCI_CLASS_NETWORK_ETHERNET << 8), + .class_mask = (0xFFFF00) + }, /* required last entry */ {0} }; -- cgit v1.1 From a09a79f66874c905af35d5bb5e5f2fdc7b6b894d Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Mon, 9 May 2011 13:01:09 +0200 Subject: Don't lock guardpage if the stack is growing up Linux kernel excludes guard page when performing mlock on a VMA with down-growing stack. However, some architectures have up-growing stack and locking the guard page should be excluded in this case too. This patch fixes lvm2 on PA-RISC (and possibly other architectures with up-growing stack). lvm2 calculates number of used pages when locking and when unlocking and reports an internal error if the numbers mismatch. [ Patch changed fairly extensively to also fix /proc//maps for the grows-up case, and to move things around a bit to clean it all up and share the infrstructure with the /proc bits. Tested on ia64 that has both grow-up and grow-down segments - Linus ] Signed-off-by: Mikulas Patocka Tested-by: Tony Luck Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 12 +++++++----- include/linux/mm.h | 24 +++++++++++++++++++++++- mm/memory.c | 16 +++++++--------- 3 files changed, 37 insertions(+), 15 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 2e7addf..318d865 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -214,7 +214,7 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma) int flags = vma->vm_flags; unsigned long ino = 0; unsigned long long pgoff = 0; - unsigned long start; + unsigned long start, end; dev_t dev = 0; int len; @@ -227,13 +227,15 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma) /* We don't show the stack guard page in /proc/maps */ start = vma->vm_start; - if (vma->vm_flags & VM_GROWSDOWN) - if (!vma_stack_continue(vma->vm_prev, vma->vm_start)) - start += PAGE_SIZE; + if (stack_guard_page_start(vma, start)) + start += PAGE_SIZE; + end = vma->vm_end; + if (stack_guard_page_end(vma, end)) + end -= PAGE_SIZE; seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n", start, - vma->vm_end, + end, flags & VM_READ ? 'r' : '-', flags & VM_WRITE ? 'w' : '-', flags & VM_EXEC ? 'x' : '-', diff --git a/include/linux/mm.h b/include/linux/mm.h index 2348db2..6507dde 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1011,11 +1011,33 @@ int set_page_dirty_lock(struct page *page); int clear_page_dirty_for_io(struct page *page); /* Is the vma a continuation of the stack vma above it? */ -static inline int vma_stack_continue(struct vm_area_struct *vma, unsigned long addr) +static inline int vma_growsdown(struct vm_area_struct *vma, unsigned long addr) { return vma && (vma->vm_end == addr) && (vma->vm_flags & VM_GROWSDOWN); } +static inline int stack_guard_page_start(struct vm_area_struct *vma, + unsigned long addr) +{ + return (vma->vm_flags & VM_GROWSDOWN) && + (vma->vm_start == addr) && + !vma_growsdown(vma->vm_prev, addr); +} + +/* Is the vma a continuation of the stack vma below it? */ +static inline int vma_growsup(struct vm_area_struct *vma, unsigned long addr) +{ + return vma && (vma->vm_start == addr) && (vma->vm_flags & VM_GROWSUP); +} + +static inline int stack_guard_page_end(struct vm_area_struct *vma, + unsigned long addr) +{ + return (vma->vm_flags & VM_GROWSUP) && + (vma->vm_end == addr) && + !vma_growsup(vma->vm_next, addr); +} + extern unsigned long move_page_tables(struct vm_area_struct *vma, unsigned long old_addr, struct vm_area_struct *new_vma, unsigned long new_addr, unsigned long len); diff --git a/mm/memory.c b/mm/memory.c index 27f4253..61e66f0 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1412,9 +1412,8 @@ no_page_table: static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long addr) { - return (vma->vm_flags & VM_GROWSDOWN) && - (vma->vm_start == addr) && - !vma_stack_continue(vma->vm_prev, addr); + return stack_guard_page_start(vma, addr) || + stack_guard_page_end(vma, addr+PAGE_SIZE); } /** @@ -1551,12 +1550,6 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, continue; } - /* - * For mlock, just skip the stack guard page. - */ - if ((gup_flags & FOLL_MLOCK) && stack_guard_page(vma, start)) - goto next_page; - do { struct page *page; unsigned int foll_flags = gup_flags; @@ -1573,6 +1566,11 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, int ret; unsigned int fault_flags = 0; + /* For mlock, just skip the stack guard page. */ + if (foll_flags & FOLL_MLOCK) { + if (stack_guard_page(vma, start)) + goto next_page; + } if (foll_flags & FOLL_WRITE) fault_flags |= FAULT_FLAG_WRITE; if (nonblocking) -- cgit v1.1 From 228d62dd3f74734b9801c789b5addc57fdfc208f Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 6 May 2011 02:54:04 +0000 Subject: xfs: ensure reclaim cursor is reset correctly at end of AG On a 32 bit highmem PowerPC machine, the XFS inode cache was growing without bound and exhausting low memory causing the OOM killer to be triggered. After some effort, the problem was reproduced on a 32 bit x86 highmem machine. The problem is that the per-ag inode reclaim index cursor was not getting reset to the start of the AG if the radix tree tag lookup found no more reclaimable inodes. Hence every further reclaim attempt started at the same index beyond where any reclaimable inodes lay, and no further background reclaim ever occurred from the AG. Without background inode reclaim the VM driven cache shrinker simply cannot keep up with cache growth, and OOM is the result. While the change that exposed the problem was the conversion of the inode reclaim to use work queues for background reclaim, it was not the cause of the bug. The bug was introduced when the cursor code was added, just waiting for some weird configuration to strike.... Signed-off-by: Dave Chinner Tested-By: Christian Kujau Reviewed-by: Christoph Hellwig Reviewed-by: Alex Elder (cherry picked from commit b223221956675ce8a7b436d198ced974bb388571) --- fs/xfs/linux-2.6/xfs_sync.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c index e4f9c1b..3e898a4 100644 --- a/fs/xfs/linux-2.6/xfs_sync.c +++ b/fs/xfs/linux-2.6/xfs_sync.c @@ -926,6 +926,7 @@ restart: XFS_LOOKUP_BATCH, XFS_ICI_RECLAIM_TAG); if (!nr_found) { + done = 1; rcu_read_unlock(); break; } -- cgit v1.1 From 9e7004e741de0b2daabbbadafbaf11ff1a94e00c Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 6 May 2011 02:54:05 +0000 Subject: xfs: exit AIL push work correctly when AIL is empty The recent conversion of the xfsaild functionality to a work queue introduced a hard-to-hit log space grant hang. The main cause is a regression where a work exit path fails to clear the PUSHING state and recheck the target correctly. Make both exit paths do the same PUSHING bit clearing and target checking when the "no more work to be done" condition is hit. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Alex Elder (cherry picked from commit ea35a20021f8497390d05b93271b4d675516c654) --- fs/xfs/xfs_trans_ail.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index acdb92f..226c58b 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -346,18 +346,20 @@ xfs_ail_delete( */ STATIC void xfs_ail_worker( - struct work_struct *work) + struct work_struct *work) { - struct xfs_ail *ailp = container_of(to_delayed_work(work), + struct xfs_ail *ailp = container_of(to_delayed_work(work), struct xfs_ail, xa_work); - long tout; - xfs_lsn_t target = ailp->xa_target; - xfs_lsn_t lsn; - xfs_log_item_t *lip; - int flush_log, count, stuck; - xfs_mount_t *mp = ailp->xa_mount; + xfs_mount_t *mp = ailp->xa_mount; struct xfs_ail_cursor *cur = &ailp->xa_cursors; - int push_xfsbufd = 0; + xfs_log_item_t *lip; + xfs_lsn_t lsn; + xfs_lsn_t target = ailp->xa_target; + long tout = 10; + int flush_log = 0; + int stuck = 0; + int count = 0; + int push_xfsbufd = 0; spin_lock(&ailp->xa_lock); xfs_trans_ail_cursor_init(ailp, cur); @@ -368,8 +370,7 @@ xfs_ail_worker( */ xfs_trans_ail_cursor_done(ailp, cur); spin_unlock(&ailp->xa_lock); - ailp->xa_last_pushed_lsn = 0; - return; + goto out_done; } XFS_STATS_INC(xs_push_ail); @@ -386,7 +387,6 @@ xfs_ail_worker( * lots of contention on the AIL lists. */ lsn = lip->li_lsn; - flush_log = stuck = count = 0; while ((XFS_LSN_CMP(lip->li_lsn, target) < 0)) { int lock_result; /* @@ -480,7 +480,7 @@ xfs_ail_worker( } /* assume we have more work to do in a short while */ - tout = 10; +out_done: if (!count) { /* We're past our target or empty, so idle */ ailp->xa_last_pushed_lsn = 0; -- cgit v1.1 From 50e86686dfb287d720af8b0f977202d205c04215 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 6 May 2011 02:54:06 +0000 Subject: xfs: always push the AIL to the target The recent conversion of the xfsaild functionality to a work queue introduced a hard-to-hit log space grant hang. One of the problems discovered is a target mismatch between the item pushing loop and the target itself. The push trigger checks for the target increasing (i.e. new target > current) while the push loop only pushes items that have a LSN < current. As a result, we can get the situation where the push target is X, the items at the tail of the AIL have LSN X and they don't get pushed. The push work then completes thinking it is done, and cannot be restarted until the push target increases to >= X + 1. If the push target then never increases (because the tail is not moving), then we never run the push work again and we stall. Fix it by making sure log items with a LSN that matches the target exactly are pushed during the loop. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Alex Elder (cherry picked from commit cb64026b6e8af50db598ec7c3f59d504259b00bb) --- fs/xfs/xfs_trans_ail.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 226c58b..9f427c2 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -387,7 +387,7 @@ xfs_ail_worker( * lots of contention on the AIL lists. */ lsn = lip->li_lsn; - while ((XFS_LSN_CMP(lip->li_lsn, target) < 0)) { + while ((XFS_LSN_CMP(lip->li_lsn, target) <= 0)) { int lock_result; /* * If we can lock the item without sleeping, unlock the AIL -- cgit v1.1 From fe0da767311933d1c1907cb8d326beea7a3cbd9c Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 6 May 2011 02:54:07 +0000 Subject: xfs: make AIL target updates and compares 32bit safe. The recent conversion of the xfsaild functionality to a work queue introduced a hard-to-hit log space grant hang. One of the problems noticed was that updates of the push target are not 32 bit safe as the target is a 64 bit value. We cannot copy a 64 bit LSN without the possibility of corrupting the result when racing with another updating thread. We have function to do this update safely without needing to care about 32/64 bit issues - xfs_trans_ail_copy_lsn() - so use that when updating the AIL push target. Also move the reading of the target in the push work inside the AIL lock, and use XFS_LSN_CMP() for the unlocked comparison during work termination to close read holes as well. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Alex Elder (cherry picked from commit fd5670f22fce247754243cf2ed41941e5762d990) --- fs/xfs/xfs_trans_ail.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 9f427c2..d7eebbf 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -354,7 +354,7 @@ xfs_ail_worker( struct xfs_ail_cursor *cur = &ailp->xa_cursors; xfs_log_item_t *lip; xfs_lsn_t lsn; - xfs_lsn_t target = ailp->xa_target; + xfs_lsn_t target; long tout = 10; int flush_log = 0; int stuck = 0; @@ -362,6 +362,7 @@ xfs_ail_worker( int push_xfsbufd = 0; spin_lock(&ailp->xa_lock); + target = ailp->xa_target; xfs_trans_ail_cursor_init(ailp, cur); lip = xfs_trans_ail_cursor_first(ailp, cur, ailp->xa_last_pushed_lsn); if (!lip || XFS_FORCED_SHUTDOWN(mp)) { @@ -491,7 +492,7 @@ out_done: * work to do. Wait a bit longer before starting that work. */ smp_rmb(); - if (ailp->xa_target == target) { + if (XFS_LSN_CMP(ailp->xa_target, target) == 0) { clear_bit(XFS_AIL_PUSHING_BIT, &ailp->xa_flags); return; } @@ -553,7 +554,7 @@ xfs_ail_push( * the XFS_AIL_PUSHING_BIT. */ smp_wmb(); - ailp->xa_target = threshold_lsn; + xfs_trans_ail_copy_lsn(ailp, &ailp->xa_target, &threshold_lsn); if (!test_and_set_bit(XFS_AIL_PUSHING_BIT, &ailp->xa_flags)) queue_delayed_work(xfs_syncd_wq, &ailp->xa_work, 0); } -- cgit v1.1 From 7ac956576d0ce8f97450a39c2f304db8eea01647 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 6 May 2011 02:54:08 +0000 Subject: xfs: fix race condition in AIL push trigger The recent conversion of the xfsaild functionality to a work queue introduced a hard-to-hit log space grant hang. One is caused by a race condition in determining whether there is a psh in progress or not. The XFS_AIL_PUSHING_BIT is used to determine whether a push is currently in progress. When the AIL push work completes, it checked whether the target changed and cleared the PUSHING bit to allow a new push to be requeued. The race condition is as follows: Thread 1 push work smp_wmb() smp_rmb() check ailp->xa_target unchanged update ailp->xa_target test/set PUSHING bit does not queue clear PUSHING bit does not requeue Now that the push target is updated, new attempts to push the AIL will not trigger as the push target will be the same, and hence despite trying to push the AIL we won't ever wake it again. The fix is to ensure that the AIL push work clears the PUSHING bit before it checks if the target is unchanged. As a result, both push triggers operate on the same test/set bit criteria, so even if we race in the push work and miss the target update, the thread requesting the push will still set the PUSHING bit and queue the push work to occur. For safety sake, the same queue check is done if the push work detects the target change, though only one of the two will will queue new work due to the use of test_and_set_bit() checks. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Alex Elder (cherry picked from commit e4d3c4a43b595d5124ae824d300626e6489ae857) --- fs/xfs/xfs_trans_ail.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index d7eebbf..5fc2380 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -487,15 +487,19 @@ out_done: ailp->xa_last_pushed_lsn = 0; /* - * Check for an updated push target before clearing the - * XFS_AIL_PUSHING_BIT. If the target changed, we've got more - * work to do. Wait a bit longer before starting that work. + * We clear the XFS_AIL_PUSHING_BIT first before checking + * whether the target has changed. If the target has changed, + * this pushes the requeue race directly onto the result of the + * atomic test/set bit, so we are guaranteed that either the + * the pusher that changed the target or ourselves will requeue + * the work (but not both). */ + clear_bit(XFS_AIL_PUSHING_BIT, &ailp->xa_flags); smp_rmb(); - if (XFS_LSN_CMP(ailp->xa_target, target) == 0) { - clear_bit(XFS_AIL_PUSHING_BIT, &ailp->xa_flags); + if (XFS_LSN_CMP(ailp->xa_target, target) == 0 || + test_and_set_bit(XFS_AIL_PUSHING_BIT, &ailp->xa_flags)) return; - } + tout = 50; } else if (XFS_LSN_CMP(lsn, target) >= 0) { /* -- cgit v1.1 From 42c36f63ac1366ab0ecc2d5717821362c259f517 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 9 May 2011 17:44:42 -0700 Subject: vm: fix vm_pgoff wrap in upward expansion Commit a626ca6a6564 ("vm: fix vm_pgoff wrap in stack expansion") fixed the case of an expanding mapping causing vm_pgoff wrapping when you had downward stack expansion. But there was another case where IA64 and PA-RISC expand mappings: upward expansion. This fixes that case too. Signed-off-by: Hugh Dickins Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- mm/mmap.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index e27e0cf..772140c 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1767,10 +1767,13 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) size = address - vma->vm_start; grow = (address - vma->vm_end) >> PAGE_SHIFT; - error = acct_stack_growth(vma, size, grow); - if (!error) { - vma->vm_end = address; - perf_event_mmap(vma); + error = -ENOMEM; + if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) { + error = acct_stack_growth(vma, size, grow); + if (!error) { + vma->vm_end = address; + perf_event_mmap(vma); + } } } vma_unlock_anon_vma(vma); -- cgit v1.1 From 693d92a1bbc9e42681c42ed190bd42b636ca876f Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 9 May 2011 19:33:54 -0700 Subject: Linux 2.6.39-rc7 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 28820f7..41ea6fb 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ VERSION = 2 PATCHLEVEL = 6 SUBLEVEL = 39 -EXTRAVERSION = -rc6 +EXTRAVERSION = -rc7 NAME = Flesh-Eating Bats with Fangs # *DOCUMENTATION* -- cgit v1.1 From 3cd7967825a2b3926dc96ae566d986c4420919f7 Mon Sep 17 00:00:00 2001 From: "M. Mohan Kumar" Date: Fri, 15 Apr 2011 13:59:33 +0530 Subject: net/9p: Handle get_user_pages_fast return properly Use proper data type to handle get_user_pages_fast error condition. Also do not treat EFAULT error as fatal. Signed-off-by: M. Mohan Kumar Signed-off-by: Venkateswararao Jujjuri Signed-off-by: Eric Van Hensbergen --- net/9p/client.c | 2 +- net/9p/trans_common.c | 11 +++-------- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/net/9p/client.c b/net/9p/client.c index 7736774..a9aa2dd 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -614,7 +614,7 @@ p9_client_rpc(struct p9_client *c, int8_t type, const char *fmt, ...) err = c->trans_mod->request(c, req); if (err < 0) { - if (err != -ERESTARTSYS) + if (err != -ERESTARTSYS && err != -EFAULT) c->status = Disconnected; goto reterr; } diff --git a/net/9p/trans_common.c b/net/9p/trans_common.c index e883172..9a70ebd 100644 --- a/net/9p/trans_common.c +++ b/net/9p/trans_common.c @@ -63,7 +63,7 @@ p9_payload_gup(struct p9_req_t *req, size_t *pdata_off, int *pdata_len, int nr_pages, u8 rw) { uint32_t first_page_bytes = 0; - uint32_t pdata_mapped_pages; + int32_t pdata_mapped_pages; struct trans_rpage_info *rpinfo; *pdata_off = (__force size_t)req->tc->pubuf & (PAGE_SIZE-1); @@ -75,14 +75,9 @@ p9_payload_gup(struct p9_req_t *req, size_t *pdata_off, int *pdata_len, rpinfo = req->tc->private; pdata_mapped_pages = get_user_pages_fast((unsigned long)req->tc->pubuf, nr_pages, rw, &rpinfo->rp_data[0]); + if (pdata_mapped_pages <= 0) + return pdata_mapped_pages; - if (pdata_mapped_pages < 0) { - printk(KERN_ERR "get_user_pages_fast failed:%d udata:%p" - "nr_pages:%d\n", pdata_mapped_pages, - req->tc->pubuf, nr_pages); - pdata_mapped_pages = 0; - return -EIO; - } rpinfo->rp_nr_pages = pdata_mapped_pages; if (*pdata_off) { *pdata_len = first_page_bytes; -- cgit v1.1 From 43b752daae9445a3b2b075a236840d801fce1593 Mon Sep 17 00:00:00 2001 From: "Hefty, Sean" Date: Mon, 9 May 2011 22:06:10 -0700 Subject: RDMA/cma: Fix handling of IPv6 addressing in cma_use_port cma_use_port() assumes that the sockaddr is an IPv4 address. Since IPv6 addressing is supported (and also to support other address families) make the code more generic in its address handling. Signed-off-by: Sean Hefty Signed-off-by: Roland Dreier --- drivers/infiniband/core/cma.c | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 5ed9d25..eff5e46 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -712,6 +712,21 @@ static inline int cma_any_addr(struct sockaddr *addr) return cma_zero_addr(addr) || cma_loopback_addr(addr); } +static int cma_addr_cmp(struct sockaddr *src, struct sockaddr *dst) +{ + if (src->sa_family != dst->sa_family) + return -1; + + switch (src->sa_family) { + case AF_INET: + return ((struct sockaddr_in *) src)->sin_addr.s_addr != + ((struct sockaddr_in *) dst)->sin_addr.s_addr; + default: + return ipv6_addr_cmp(&((struct sockaddr_in6 *) src)->sin6_addr, + &((struct sockaddr_in6 *) dst)->sin6_addr); + } +} + static inline __be16 cma_port(struct sockaddr *addr) { if (addr->sa_family == AF_INET) @@ -2168,13 +2183,13 @@ retry: static int cma_use_port(struct idr *ps, struct rdma_id_private *id_priv) { struct rdma_id_private *cur_id; - struct sockaddr_in *sin, *cur_sin; + struct sockaddr *addr, *cur_addr; struct rdma_bind_list *bind_list; struct hlist_node *node; unsigned short snum; - sin = (struct sockaddr_in *) &id_priv->id.route.addr.src_addr; - snum = ntohs(sin->sin_port); + addr = (struct sockaddr *) &id_priv->id.route.addr.src_addr; + snum = ntohs(cma_port(addr)); if (snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE)) return -EACCES; @@ -2186,15 +2201,15 @@ static int cma_use_port(struct idr *ps, struct rdma_id_private *id_priv) * We don't support binding to any address if anyone is bound to * a specific address on the same port. */ - if (cma_any_addr((struct sockaddr *) &id_priv->id.route.addr.src_addr)) + if (cma_any_addr(addr)) return -EADDRNOTAVAIL; hlist_for_each_entry(cur_id, node, &bind_list->owners, node) { - if (cma_any_addr((struct sockaddr *) &cur_id->id.route.addr.src_addr)) + cur_addr = (struct sockaddr *) &cur_id->id.route.addr.src_addr; + if (cma_any_addr(cur_addr)) return -EADDRNOTAVAIL; - cur_sin = (struct sockaddr_in *) &cur_id->id.route.addr.src_addr; - if (sin->sin_addr.s_addr == cur_sin->sin_addr.s_addr) + if (!cma_addr_cmp(addr, cur_addr)) return -EADDRINUSE; } -- cgit v1.1 From a9bb79128aa659f97b774b97c9bb1bdc74444595 Mon Sep 17 00:00:00 2001 From: "Hefty, Sean" Date: Mon, 9 May 2011 22:06:10 -0700 Subject: RDMA/cma: Add an ID_REUSEADDR option Lustre requires that clients bind to a privileged port number before connecting to a remote server. On larger clusters (typically more than about 1000 nodes), the number of privileged ports is exhausted, resulting in lustre being unusable. To handle this, we add support for reusable addresses to the rdma_cm. This mimics the behavior of the socket option SO_REUSEADDR. A user may set an rdma_cm_id to reuse an address before calling rdma_bind_addr() (explicitly or implicitly). If set, other rdma_cm_id's may be bound to the same address, provided that they all have reuse enabled, and there are no active listens. If rdma_listen() is called on an rdma_cm_id that has reuse enabled, it will only succeed if there are no other id's bound to that same address. The reuse option is exported to user space. The behavior of the kernel reuse implementation was verified against that given by sockets. This patch is derived from a path by Ira Weiny Signed-off-by: Sean Hefty Signed-off-by: Roland Dreier --- drivers/infiniband/core/cma.c | 190 ++++++++++++++++++++++++++--------------- drivers/infiniband/core/ucma.c | 7 ++ include/rdma/rdma_cm.h | 10 +++ include/rdma/rdma_user_cm.h | 5 +- 4 files changed, 143 insertions(+), 69 deletions(-) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index eff5e46..99dde87 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -148,6 +148,7 @@ struct rdma_id_private { u32 qp_num; u8 srq; u8 tos; + u8 reuseaddr; }; struct cma_multicast { @@ -1579,50 +1580,6 @@ static void cma_listen_on_all(struct rdma_id_private *id_priv) mutex_unlock(&lock); } -int rdma_listen(struct rdma_cm_id *id, int backlog) -{ - struct rdma_id_private *id_priv; - int ret; - - id_priv = container_of(id, struct rdma_id_private, id); - if (id_priv->state == CMA_IDLE) { - ((struct sockaddr *) &id->route.addr.src_addr)->sa_family = AF_INET; - ret = rdma_bind_addr(id, (struct sockaddr *) &id->route.addr.src_addr); - if (ret) - return ret; - } - - if (!cma_comp_exch(id_priv, CMA_ADDR_BOUND, CMA_LISTEN)) - return -EINVAL; - - id_priv->backlog = backlog; - if (id->device) { - switch (rdma_node_get_transport(id->device->node_type)) { - case RDMA_TRANSPORT_IB: - ret = cma_ib_listen(id_priv); - if (ret) - goto err; - break; - case RDMA_TRANSPORT_IWARP: - ret = cma_iw_listen(id_priv, backlog); - if (ret) - goto err; - break; - default: - ret = -ENOSYS; - goto err; - } - } else - cma_listen_on_all(id_priv); - - return 0; -err: - id_priv->backlog = 0; - cma_comp_exch(id_priv, CMA_LISTEN, CMA_ADDR_BOUND); - return ret; -} -EXPORT_SYMBOL(rdma_listen); - void rdma_set_service_type(struct rdma_cm_id *id, int tos) { struct rdma_id_private *id_priv; @@ -2105,6 +2062,25 @@ err: } EXPORT_SYMBOL(rdma_resolve_addr); +int rdma_set_reuseaddr(struct rdma_cm_id *id, int reuse) +{ + struct rdma_id_private *id_priv; + unsigned long flags; + int ret; + + id_priv = container_of(id, struct rdma_id_private, id); + spin_lock_irqsave(&id_priv->lock, flags); + if (id_priv->state == CMA_IDLE) { + id_priv->reuseaddr = reuse; + ret = 0; + } else { + ret = -EINVAL; + } + spin_unlock_irqrestore(&id_priv->lock, flags); + return ret; +} +EXPORT_SYMBOL(rdma_set_reuseaddr); + static void cma_bind_port(struct rdma_bind_list *bind_list, struct rdma_id_private *id_priv) { @@ -2180,43 +2156,73 @@ retry: return -EADDRNOTAVAIL; } -static int cma_use_port(struct idr *ps, struct rdma_id_private *id_priv) +/* + * Check that the requested port is available. This is called when trying to + * bind to a specific port, or when trying to listen on a bound port. In + * the latter case, the provided id_priv may already be on the bind_list, but + * we still need to check that it's okay to start listening. + */ +static int cma_check_port(struct rdma_bind_list *bind_list, + struct rdma_id_private *id_priv, uint8_t reuseaddr) { struct rdma_id_private *cur_id; struct sockaddr *addr, *cur_addr; - struct rdma_bind_list *bind_list; struct hlist_node *node; - unsigned short snum; addr = (struct sockaddr *) &id_priv->id.route.addr.src_addr; - snum = ntohs(cma_port(addr)); - if (snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE)) - return -EACCES; - - bind_list = idr_find(ps, snum); - if (!bind_list) - return cma_alloc_port(ps, id_priv, snum); - - /* - * We don't support binding to any address if anyone is bound to - * a specific address on the same port. - */ - if (cma_any_addr(addr)) + if (cma_any_addr(addr) && !reuseaddr) return -EADDRNOTAVAIL; hlist_for_each_entry(cur_id, node, &bind_list->owners, node) { - cur_addr = (struct sockaddr *) &cur_id->id.route.addr.src_addr; - if (cma_any_addr(cur_addr)) - return -EADDRNOTAVAIL; + if (id_priv == cur_id) + continue; - if (!cma_addr_cmp(addr, cur_addr)) - return -EADDRINUSE; - } + if ((cur_id->state == CMA_LISTEN) || + !reuseaddr || !cur_id->reuseaddr) { + cur_addr = (struct sockaddr *) &cur_id->id.route.addr.src_addr; + if (cma_any_addr(cur_addr)) + return -EADDRNOTAVAIL; - cma_bind_port(bind_list, id_priv); + if (!cma_addr_cmp(addr, cur_addr)) + return -EADDRINUSE; + } + } return 0; } +static int cma_use_port(struct idr *ps, struct rdma_id_private *id_priv) +{ + struct rdma_bind_list *bind_list; + unsigned short snum; + int ret; + + snum = ntohs(cma_port((struct sockaddr *) &id_priv->id.route.addr.src_addr)); + if (snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE)) + return -EACCES; + + bind_list = idr_find(ps, snum); + if (!bind_list) { + ret = cma_alloc_port(ps, id_priv, snum); + } else { + ret = cma_check_port(bind_list, id_priv, id_priv->reuseaddr); + if (!ret) + cma_bind_port(bind_list, id_priv); + } + return ret; +} + +static int cma_bind_listen(struct rdma_id_private *id_priv) +{ + struct rdma_bind_list *bind_list = id_priv->bind_list; + int ret = 0; + + mutex_lock(&lock); + if (bind_list->owners.first->next) + ret = cma_check_port(bind_list, id_priv, 0); + mutex_unlock(&lock); + return ret; +} + static int cma_get_port(struct rdma_id_private *id_priv) { struct idr *ps; @@ -2268,6 +2274,56 @@ static int cma_check_linklocal(struct rdma_dev_addr *dev_addr, return 0; } +int rdma_listen(struct rdma_cm_id *id, int backlog) +{ + struct rdma_id_private *id_priv; + int ret; + + id_priv = container_of(id, struct rdma_id_private, id); + if (id_priv->state == CMA_IDLE) { + ((struct sockaddr *) &id->route.addr.src_addr)->sa_family = AF_INET; + ret = rdma_bind_addr(id, (struct sockaddr *) &id->route.addr.src_addr); + if (ret) + return ret; + } + + if (!cma_comp_exch(id_priv, CMA_ADDR_BOUND, CMA_LISTEN)) + return -EINVAL; + + if (id_priv->reuseaddr) { + ret = cma_bind_listen(id_priv); + if (ret) + goto err; + } + + id_priv->backlog = backlog; + if (id->device) { + switch (rdma_node_get_transport(id->device->node_type)) { + case RDMA_TRANSPORT_IB: + ret = cma_ib_listen(id_priv); + if (ret) + goto err; + break; + case RDMA_TRANSPORT_IWARP: + ret = cma_iw_listen(id_priv, backlog); + if (ret) + goto err; + break; + default: + ret = -ENOSYS; + goto err; + } + } else + cma_listen_on_all(id_priv); + + return 0; +err: + id_priv->backlog = 0; + cma_comp_exch(id_priv, CMA_LISTEN, CMA_ADDR_BOUND); + return ret; +} +EXPORT_SYMBOL(rdma_listen); + int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr) { struct rdma_id_private *id_priv; diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c index ec1e9da..b3fa798 100644 --- a/drivers/infiniband/core/ucma.c +++ b/drivers/infiniband/core/ucma.c @@ -883,6 +883,13 @@ static int ucma_set_option_id(struct ucma_context *ctx, int optname, } rdma_set_service_type(ctx->cm_id, *((u8 *) optval)); break; + case RDMA_OPTION_ID_REUSEADDR: + if (optlen != sizeof(int)) { + ret = -EINVAL; + break; + } + ret = rdma_set_reuseaddr(ctx->cm_id, *((int *) optval) ? 1 : 0); + break; default: ret = -ENOSYS; } diff --git a/include/rdma/rdma_cm.h b/include/rdma/rdma_cm.h index 4fae903..169f7a5 100644 --- a/include/rdma/rdma_cm.h +++ b/include/rdma/rdma_cm.h @@ -329,4 +329,14 @@ void rdma_leave_multicast(struct rdma_cm_id *id, struct sockaddr *addr); */ void rdma_set_service_type(struct rdma_cm_id *id, int tos); +/** + * rdma_set_reuseaddr - Allow the reuse of local addresses when binding + * the rdma_cm_id. + * @id: Communication identifier to configure. + * @reuse: Value indicating if the bound address is reusable. + * + * Reuse must be set before an address is bound to the id. + */ +int rdma_set_reuseaddr(struct rdma_cm_id *id, int reuse); + #endif /* RDMA_CM_H */ diff --git a/include/rdma/rdma_user_cm.h b/include/rdma/rdma_user_cm.h index 1d16502..fc82c18 100644 --- a/include/rdma/rdma_user_cm.h +++ b/include/rdma/rdma_user_cm.h @@ -221,8 +221,9 @@ enum { /* Option details */ enum { - RDMA_OPTION_ID_TOS = 0, - RDMA_OPTION_IB_PATH = 1 + RDMA_OPTION_ID_TOS = 0, + RDMA_OPTION_ID_REUSEADDR = 1, + RDMA_OPTION_IB_PATH = 1 }; struct rdma_ucm_set_option { -- cgit v1.1 From 30c95c2d495c1c8d4d6a97bb9f4e4eacb91ba1d2 Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Mon, 9 May 2011 22:06:22 -0700 Subject: RDMA/cxgb4: Don't change QP state outside EP lock Concurrent ingress CLOSE and ULP ABORT operations causes a crash due to a race condition where the close path releases the EP lock and then tries to move the QP state to CLOSED. This must be done inside the EP lock to avoid the race. Signed-off-by: Steve Wise Signed-off-by: Roland Dreier --- drivers/infiniband/hw/cxgb4/cm.c | 16 +++++++--------- drivers/infiniband/hw/cxgb4/iw_cxgb4.h | 4 ++-- drivers/infiniband/hw/cxgb4/qp.c | 1 - 3 files changed, 9 insertions(+), 12 deletions(-) diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c index 9d8dcfa..d235810 100644 --- a/drivers/infiniband/hw/cxgb4/cm.c +++ b/drivers/infiniband/hw/cxgb4/cm.c @@ -1466,7 +1466,7 @@ static int peer_close(struct c4iw_dev *dev, struct sk_buff *skb) struct c4iw_qp_attributes attrs; int disconnect = 1; int release = 0; - int closing = 0; + int abort = 0; struct tid_info *t = dev->rdev.lldi.tids; unsigned int tid = GET_TID(hdr); @@ -1507,8 +1507,11 @@ static int peer_close(struct c4iw_dev *dev, struct sk_buff *skb) case FPDU_MODE: start_ep_timer(ep); __state_set(&ep->com, CLOSING); - closing = 1; + attrs.next_state = C4IW_QP_STATE_CLOSING; + abort = c4iw_modify_qp(ep->com.qp->rhp, ep->com.qp, + C4IW_QP_ATTR_NEXT_STATE, &attrs, 1); peer_close_upcall(ep); + disconnect = 1; break; case ABORTING: disconnect = 0; @@ -1536,11 +1539,6 @@ static int peer_close(struct c4iw_dev *dev, struct sk_buff *skb) BUG_ON(1); } mutex_unlock(&ep->com.mutex); - if (closing) { - attrs.next_state = C4IW_QP_STATE_CLOSING; - c4iw_modify_qp(ep->com.qp->rhp, ep->com.qp, - C4IW_QP_ATTR_NEXT_STATE, &attrs, 1); - } if (disconnect) c4iw_ep_disconnect(ep, 0, GFP_KERNEL); if (release) @@ -1710,14 +1708,14 @@ static int terminate(struct c4iw_dev *dev, struct sk_buff *skb) ep = lookup_tid(t, tid); BUG_ON(!ep); - if (ep->com.qp) { + if (ep && ep->com.qp) { printk(KERN_WARNING MOD "TERM received tid %u qpid %u\n", tid, ep->com.qp->wq.sq.qid); attrs.next_state = C4IW_QP_STATE_TERMINATE; c4iw_modify_qp(ep->com.qp->rhp, ep->com.qp, C4IW_QP_ATTR_NEXT_STATE, &attrs, 1); } else - printk(KERN_WARNING MOD "TERM received tid %u no qp\n", tid); + printk(KERN_WARNING MOD "TERM received tid %u no ep/qp\n", tid); return 0; } diff --git a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h index 9f6166f..8e16eb2 100644 --- a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h +++ b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h @@ -161,8 +161,8 @@ static inline int c4iw_wait_for_reply(struct c4iw_rdev *rdev, } } while (!wr_waitp->done); if (wr_waitp->ret) - printk(KERN_WARNING MOD "%s: FW reply %d tid %u qpid %u\n", - pci_name(rdev->lldi.pdev), wr_waitp->ret, hwtid, qpid); + PDBG("%s: FW reply %d tid %u qpid %u\n", + pci_name(rdev->lldi.pdev), wr_waitp->ret, hwtid, qpid); return wr_waitp->ret; } diff --git a/drivers/infiniband/hw/cxgb4/qp.c b/drivers/infiniband/hw/cxgb4/qp.c index 70a5a3c..a1824a5 100644 --- a/drivers/infiniband/hw/cxgb4/qp.c +++ b/drivers/infiniband/hw/cxgb4/qp.c @@ -1210,7 +1210,6 @@ int c4iw_modify_qp(struct c4iw_dev *rhp, struct c4iw_qp *qhp, if (ret) { if (internal) c4iw_get_ep(&qhp->ep->com); - disconnect = abort = 1; goto err; } break; -- cgit v1.1 From bbe9a0a2bc07cf30c5b89b51154f2c87200a5dfd Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Mon, 9 May 2011 22:06:22 -0700 Subject: RDMA/cxgb4: Initialization errors can cause crash c4iw_uld_add() must return ERR_PTR() values instead of NULL on failure. Signed-off-by: Steve Wise Signed-off-by: Roland Dreier --- drivers/infiniband/hw/cxgb4/device.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/cxgb4/device.c b/drivers/infiniband/hw/cxgb4/device.c index e29172c..8e70953 100644 --- a/drivers/infiniband/hw/cxgb4/device.c +++ b/drivers/infiniband/hw/cxgb4/device.c @@ -392,7 +392,7 @@ static struct c4iw_dev *c4iw_alloc(const struct cxgb4_lld_info *infop) devp = (struct c4iw_dev *)ib_alloc_device(sizeof(*devp)); if (!devp) { printk(KERN_ERR MOD "Cannot allocate ib device\n"); - return NULL; + return ERR_PTR(-ENOMEM); } devp->rdev.lldi = *infop; @@ -414,7 +414,7 @@ static struct c4iw_dev *c4iw_alloc(const struct cxgb4_lld_info *infop) mutex_unlock(&dev_mutex); printk(KERN_ERR MOD "Unable to open CXIO rdev err %d\n", ret); ib_dealloc_device(&devp->ibdev); - return NULL; + return ERR_PTR(ret); } idr_init(&devp->cqidr); @@ -444,7 +444,7 @@ static void *c4iw_uld_add(const struct cxgb4_lld_info *infop) DRV_VERSION); dev = c4iw_alloc(infop); - if (!dev) + if (IS_ERR(dev)) goto out; PDBG("%s found device %s nchan %u nrxq %u ntxq %u nports %u\n", -- cgit v1.1 From 85d215b0f316bee0a6936bd1a5f21abf03333eaa Mon Sep 17 00:00:00 2001 From: Roel Kluin Date: Mon, 9 May 2011 22:06:22 -0700 Subject: RDMA/cxgb4: Fix missing parentheses Parens are missing: '|' has a higher presedence than '?'. Signed-off-by: Roel Kluin Acked-by: Steve Wise Signed-off-by: Roland Dreier --- drivers/infiniband/hw/cxgb4/qp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/cxgb4/qp.c b/drivers/infiniband/hw/cxgb4/qp.c index a1824a5..3b773b0 100644 --- a/drivers/infiniband/hw/cxgb4/qp.c +++ b/drivers/infiniband/hw/cxgb4/qp.c @@ -214,7 +214,7 @@ static int create_qp(struct c4iw_rdev *rdev, struct t4_wq *wq, V_FW_RI_RES_WR_HOSTFCMODE(0) | /* no host cidx updates */ V_FW_RI_RES_WR_CPRIO(0) | /* don't keep in chip cache */ V_FW_RI_RES_WR_PCIECHN(0) | /* set by uP at ri_init time */ - t4_sq_onchip(&wq->sq) ? F_FW_RI_RES_WR_ONCHIP : 0 | + (t4_sq_onchip(&wq->sq) ? F_FW_RI_RES_WR_ONCHIP : 0) | V_FW_RI_RES_WR_IQID(scq->cqid)); res->u.sqrq.dcaen_to_eqsize = cpu_to_be32( V_FW_RI_RES_WR_DCAEN(0) | -- cgit v1.1 From d9594d990a528d4c444777d0f360bb50c6114825 Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Mon, 9 May 2011 22:06:22 -0700 Subject: RDMA/cxgb4: Reset wait condition atomically The driver was never really waiting for RDMA_WR/FINI completions because the condition variable used to determine if the completion happened was never reset, and this condition variable is reused for both connection setup and teardown. This causes various driver crashes under heavy loads due to releasing resources too early. The fix is to use atomic bits to correctly reset the condition immediately after the completion is detected. Signed-off-by: Steve Wise Signed-off-by: Roland Dreier --- drivers/infiniband/hw/cxgb4/cm.c | 30 +++++++----------------------- drivers/infiniband/hw/cxgb4/iw_cxgb4.h | 26 +++++++++++++++++++------- 2 files changed, 26 insertions(+), 30 deletions(-) diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c index d235810..d7ee70f 100644 --- a/drivers/infiniband/hw/cxgb4/cm.c +++ b/drivers/infiniband/hw/cxgb4/cm.c @@ -1198,9 +1198,7 @@ static int pass_open_rpl(struct c4iw_dev *dev, struct sk_buff *skb) } PDBG("%s ep %p status %d error %d\n", __func__, ep, rpl->status, status2errno(rpl->status)); - ep->com.wr_wait.ret = status2errno(rpl->status); - ep->com.wr_wait.done = 1; - wake_up(&ep->com.wr_wait.wait); + c4iw_wake_up(&ep->com.wr_wait, status2errno(rpl->status)); return 0; } @@ -1234,9 +1232,7 @@ static int close_listsrv_rpl(struct c4iw_dev *dev, struct sk_buff *skb) struct c4iw_listen_ep *ep = lookup_stid(t, stid); PDBG("%s ep %p\n", __func__, ep); - ep->com.wr_wait.ret = status2errno(rpl->status); - ep->com.wr_wait.done = 1; - wake_up(&ep->com.wr_wait.wait); + c4iw_wake_up(&ep->com.wr_wait, status2errno(rpl->status)); return 0; } @@ -1492,17 +1488,13 @@ static int peer_close(struct c4iw_dev *dev, struct sk_buff *skb) * in rdma connection migration (see c4iw_accept_cr()). */ __state_set(&ep->com, CLOSING); - ep->com.wr_wait.done = 1; - ep->com.wr_wait.ret = -ECONNRESET; PDBG("waking up ep %p tid %u\n", ep, ep->hwtid); - wake_up(&ep->com.wr_wait.wait); + c4iw_wake_up(&ep->com.wr_wait, -ECONNRESET); break; case MPA_REP_SENT: __state_set(&ep->com, CLOSING); - ep->com.wr_wait.done = 1; - ep->com.wr_wait.ret = -ECONNRESET; PDBG("waking up ep %p tid %u\n", ep, ep->hwtid); - wake_up(&ep->com.wr_wait.wait); + c4iw_wake_up(&ep->com.wr_wait, -ECONNRESET); break; case FPDU_MODE: start_ep_timer(ep); @@ -1579,9 +1571,7 @@ static int peer_abort(struct c4iw_dev *dev, struct sk_buff *skb) /* * Wake up any threads in rdma_init() or rdma_fini(). */ - ep->com.wr_wait.done = 1; - ep->com.wr_wait.ret = -ECONNRESET; - wake_up(&ep->com.wr_wait.wait); + c4iw_wake_up(&ep->com.wr_wait, -ECONNRESET); mutex_lock(&ep->com.mutex); switch (ep->com.state) { @@ -2294,14 +2284,8 @@ static int fw6_msg(struct c4iw_dev *dev, struct sk_buff *skb) ret = (int)((be64_to_cpu(rpl->data[0]) >> 8) & 0xff); wr_waitp = (struct c4iw_wr_wait *)(__force unsigned long) rpl->data[1]; PDBG("%s wr_waitp %p ret %u\n", __func__, wr_waitp, ret); - if (wr_waitp) { - if (ret) - wr_waitp->ret = -ret; - else - wr_waitp->ret = 0; - wr_waitp->done = 1; - wake_up(&wr_waitp->wait); - } + if (wr_waitp) + c4iw_wake_up(wr_waitp, ret ? -ret : 0); kfree_skb(skb); break; case 2: diff --git a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h index 8e16eb2..3dcfe82 100644 --- a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h +++ b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h @@ -131,42 +131,54 @@ static inline int c4iw_num_stags(struct c4iw_rdev *rdev) #define C4IW_WR_TO (10*HZ) +enum { + REPLY_READY = 0, +}; + struct c4iw_wr_wait { wait_queue_head_t wait; - int done; + unsigned long status; int ret; }; static inline void c4iw_init_wr_wait(struct c4iw_wr_wait *wr_waitp) { wr_waitp->ret = 0; - wr_waitp->done = 0; + wr_waitp->status = 0; init_waitqueue_head(&wr_waitp->wait); } +static inline void c4iw_wake_up(struct c4iw_wr_wait *wr_waitp, int ret) +{ + wr_waitp->ret = ret; + set_bit(REPLY_READY, &wr_waitp->status); + wake_up(&wr_waitp->wait); +} + static inline int c4iw_wait_for_reply(struct c4iw_rdev *rdev, struct c4iw_wr_wait *wr_waitp, u32 hwtid, u32 qpid, const char *func) { unsigned to = C4IW_WR_TO; - do { + int ret; - wait_event_timeout(wr_waitp->wait, wr_waitp->done, to); - if (!wr_waitp->done) { + do { + ret = wait_event_timeout(wr_waitp->wait, + test_and_clear_bit(REPLY_READY, &wr_waitp->status), to); + if (!ret) { printk(KERN_ERR MOD "%s - Device %s not responding - " "tid %u qpid %u\n", func, pci_name(rdev->lldi.pdev), hwtid, qpid); to = to << 2; } - } while (!wr_waitp->done); + } while (!ret); if (wr_waitp->ret) PDBG("%s: FW reply %d tid %u qpid %u\n", pci_name(rdev->lldi.pdev), wr_waitp->ret, hwtid, qpid); return wr_waitp->ret; } - struct c4iw_dev { struct ib_device ibdev; struct c4iw_rdev rdev; -- cgit v1.1 From 2f25e9a540951ebd533b9b98d2259deb44b0b476 Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Mon, 9 May 2011 22:06:23 -0700 Subject: RDMA/cxgb4: EEH errors can hang the driver A few more EEH fixes: c4iw_wait_for_reply(): detect fatal EEH condition on timeout and return an error. The iw_cxgb4 driver was only calling ib_deregister_device() on an EEH event followed by a ib_register_device() when the device was reinitialized. However, the RDMA core doesn't allow multiple iterations of register/deregister by the provider. See drivers/infiniband/core/sysfs.c: ib_device_unregister_sysfs() where the kobject ref is held until the device is deallocated in ib_deallocate_device(). Calling deregister adds this kobj reference, and then a subsequent register call will generate a WARN_ON() from the kobject subsystem because the kobject is being initialized but is already initialized with the ref held. So the provider must deregister and dealloc when resetting for an EEH event, then alloc/register to re-initialize. To do this, we cannot use the device ptr as our ULD handle since it will change with each reallocation. This commit adds a ULD context struct which is used as the ULD handle, and then contains the device pointer and other state needed. Signed-off-by: Steve Wise Signed-off-by: Roland Dreier --- drivers/infiniband/hw/cxgb4/device.c | 111 ++++++++++++++++++--------------- drivers/infiniband/hw/cxgb4/iw_cxgb4.h | 6 +- drivers/infiniband/hw/cxgb4/provider.c | 2 - 3 files changed, 66 insertions(+), 53 deletions(-) diff --git a/drivers/infiniband/hw/cxgb4/device.c b/drivers/infiniband/hw/cxgb4/device.c index 8e70953..40a13cc 100644 --- a/drivers/infiniband/hw/cxgb4/device.c +++ b/drivers/infiniband/hw/cxgb4/device.c @@ -44,7 +44,7 @@ MODULE_DESCRIPTION("Chelsio T4 RDMA Driver"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_VERSION(DRV_VERSION); -static LIST_HEAD(dev_list); +static LIST_HEAD(uld_ctx_list); static DEFINE_MUTEX(dev_mutex); static struct dentry *c4iw_debugfs_root; @@ -370,18 +370,23 @@ static void c4iw_rdev_close(struct c4iw_rdev *rdev) c4iw_destroy_resource(&rdev->resource); } -static void c4iw_remove(struct c4iw_dev *dev) +struct uld_ctx { + struct list_head entry; + struct cxgb4_lld_info lldi; + struct c4iw_dev *dev; +}; + +static void c4iw_remove(struct uld_ctx *ctx) { - PDBG("%s c4iw_dev %p\n", __func__, dev); - list_del(&dev->entry); - if (dev->registered) - c4iw_unregister_device(dev); - c4iw_rdev_close(&dev->rdev); - idr_destroy(&dev->cqidr); - idr_destroy(&dev->qpidr); - idr_destroy(&dev->mmidr); - iounmap(dev->rdev.oc_mw_kva); - ib_dealloc_device(&dev->ibdev); + PDBG("%s c4iw_dev %p\n", __func__, ctx->dev); + c4iw_unregister_device(ctx->dev); + c4iw_rdev_close(&ctx->dev->rdev); + idr_destroy(&ctx->dev->cqidr); + idr_destroy(&ctx->dev->qpidr); + idr_destroy(&ctx->dev->mmidr); + iounmap(ctx->dev->rdev.oc_mw_kva); + ib_dealloc_device(&ctx->dev->ibdev); + ctx->dev = NULL; } static struct c4iw_dev *c4iw_alloc(const struct cxgb4_lld_info *infop) @@ -402,13 +407,11 @@ static struct c4iw_dev *c4iw_alloc(const struct cxgb4_lld_info *infop) devp->rdev.oc_mw_kva = ioremap_wc(devp->rdev.oc_mw_pa, devp->rdev.lldi.vr->ocq.size); - printk(KERN_INFO MOD "ocq memory: " + PDBG(KERN_INFO MOD "ocq memory: " "hw_start 0x%x size %u mw_pa 0x%lx mw_kva %p\n", devp->rdev.lldi.vr->ocq.start, devp->rdev.lldi.vr->ocq.size, devp->rdev.oc_mw_pa, devp->rdev.oc_mw_kva); - mutex_lock(&dev_mutex); - ret = c4iw_rdev_open(&devp->rdev); if (ret) { mutex_unlock(&dev_mutex); @@ -421,8 +424,6 @@ static struct c4iw_dev *c4iw_alloc(const struct cxgb4_lld_info *infop) idr_init(&devp->qpidr); idr_init(&devp->mmidr); spin_lock_init(&devp->lock); - list_add_tail(&devp->entry, &dev_list); - mutex_unlock(&dev_mutex); if (c4iw_debugfs_root) { devp->debugfs_root = debugfs_create_dir( @@ -435,7 +436,7 @@ static struct c4iw_dev *c4iw_alloc(const struct cxgb4_lld_info *infop) static void *c4iw_uld_add(const struct cxgb4_lld_info *infop) { - struct c4iw_dev *dev; + struct uld_ctx *ctx; static int vers_printed; int i; @@ -443,25 +444,33 @@ static void *c4iw_uld_add(const struct cxgb4_lld_info *infop) printk(KERN_INFO MOD "Chelsio T4 RDMA Driver - version %s\n", DRV_VERSION); - dev = c4iw_alloc(infop); - if (IS_ERR(dev)) + ctx = kzalloc(sizeof *ctx, GFP_KERNEL); + if (!ctx) { + ctx = ERR_PTR(-ENOMEM); goto out; + } + ctx->lldi = *infop; PDBG("%s found device %s nchan %u nrxq %u ntxq %u nports %u\n", - __func__, pci_name(dev->rdev.lldi.pdev), - dev->rdev.lldi.nchan, dev->rdev.lldi.nrxq, - dev->rdev.lldi.ntxq, dev->rdev.lldi.nports); + __func__, pci_name(ctx->lldi.pdev), + ctx->lldi.nchan, ctx->lldi.nrxq, + ctx->lldi.ntxq, ctx->lldi.nports); + + mutex_lock(&dev_mutex); + list_add_tail(&ctx->entry, &uld_ctx_list); + mutex_unlock(&dev_mutex); - for (i = 0; i < dev->rdev.lldi.nrxq; i++) - PDBG("rxqid[%u] %u\n", i, dev->rdev.lldi.rxq_ids[i]); + for (i = 0; i < ctx->lldi.nrxq; i++) + PDBG("rxqid[%u] %u\n", i, ctx->lldi.rxq_ids[i]); out: - return dev; + return ctx; } static int c4iw_uld_rx_handler(void *handle, const __be64 *rsp, const struct pkt_gl *gl) { - struct c4iw_dev *dev = handle; + struct uld_ctx *ctx = handle; + struct c4iw_dev *dev = ctx->dev; struct sk_buff *skb; const struct cpl_act_establish *rpl; unsigned int opcode; @@ -503,47 +512,49 @@ nomem: static int c4iw_uld_state_change(void *handle, enum cxgb4_state new_state) { - struct c4iw_dev *dev = handle; + struct uld_ctx *ctx = handle; PDBG("%s new_state %u\n", __func__, new_state); switch (new_state) { case CXGB4_STATE_UP: - printk(KERN_INFO MOD "%s: Up\n", pci_name(dev->rdev.lldi.pdev)); - if (!dev->registered) { - int ret; - ret = c4iw_register_device(dev); - if (ret) + printk(KERN_INFO MOD "%s: Up\n", pci_name(ctx->lldi.pdev)); + if (!ctx->dev) { + int ret = 0; + + ctx->dev = c4iw_alloc(&ctx->lldi); + if (!IS_ERR(ctx->dev)) + ret = c4iw_register_device(ctx->dev); + if (IS_ERR(ctx->dev) || ret) printk(KERN_ERR MOD "%s: RDMA registration failed: %d\n", - pci_name(dev->rdev.lldi.pdev), ret); + pci_name(ctx->lldi.pdev), ret); } break; case CXGB4_STATE_DOWN: printk(KERN_INFO MOD "%s: Down\n", - pci_name(dev->rdev.lldi.pdev)); - if (dev->registered) - c4iw_unregister_device(dev); + pci_name(ctx->lldi.pdev)); + if (ctx->dev) + c4iw_remove(ctx); break; case CXGB4_STATE_START_RECOVERY: printk(KERN_INFO MOD "%s: Fatal Error\n", - pci_name(dev->rdev.lldi.pdev)); - dev->rdev.flags |= T4_FATAL_ERROR; - if (dev->registered) { + pci_name(ctx->lldi.pdev)); + if (ctx->dev) { struct ib_event event; + ctx->dev->rdev.flags |= T4_FATAL_ERROR; memset(&event, 0, sizeof event); event.event = IB_EVENT_DEVICE_FATAL; - event.device = &dev->ibdev; + event.device = &ctx->dev->ibdev; ib_dispatch_event(&event); - c4iw_unregister_device(dev); + c4iw_remove(ctx); } break; case CXGB4_STATE_DETACH: printk(KERN_INFO MOD "%s: Detach\n", - pci_name(dev->rdev.lldi.pdev)); - mutex_lock(&dev_mutex); - c4iw_remove(dev); - mutex_unlock(&dev_mutex); + pci_name(ctx->lldi.pdev)); + if (ctx->dev) + c4iw_remove(ctx); break; } return 0; @@ -576,11 +587,13 @@ static int __init c4iw_init_module(void) static void __exit c4iw_exit_module(void) { - struct c4iw_dev *dev, *tmp; + struct uld_ctx *ctx, *tmp; mutex_lock(&dev_mutex); - list_for_each_entry_safe(dev, tmp, &dev_list, entry) { - c4iw_remove(dev); + list_for_each_entry_safe(ctx, tmp, &uld_ctx_list, entry) { + if (ctx->dev) + c4iw_remove(ctx); + kfree(ctx); } mutex_unlock(&dev_mutex); cxgb4_unregister_uld(CXGB4_ULD_RDMA); diff --git a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h index 3dcfe82..35d2a5d 100644 --- a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h +++ b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h @@ -170,6 +170,10 @@ static inline int c4iw_wait_for_reply(struct c4iw_rdev *rdev, printk(KERN_ERR MOD "%s - Device %s not responding - " "tid %u qpid %u\n", func, pci_name(rdev->lldi.pdev), hwtid, qpid); + if (c4iw_fatal_error(rdev)) { + wr_waitp->ret = -EIO; + break; + } to = to << 2; } } while (!ret); @@ -187,9 +191,7 @@ struct c4iw_dev { struct idr qpidr; struct idr mmidr; spinlock_t lock; - struct list_head entry; struct dentry *debugfs_root; - u8 registered; }; static inline struct c4iw_dev *to_c4iw_dev(struct ib_device *ibdev) diff --git a/drivers/infiniband/hw/cxgb4/provider.c b/drivers/infiniband/hw/cxgb4/provider.c index f66dd8b..5b9e422 100644 --- a/drivers/infiniband/hw/cxgb4/provider.c +++ b/drivers/infiniband/hw/cxgb4/provider.c @@ -516,7 +516,6 @@ int c4iw_register_device(struct c4iw_dev *dev) if (ret) goto bail2; } - dev->registered = 1; return 0; bail2: ib_unregister_device(&dev->ibdev); @@ -535,6 +534,5 @@ void c4iw_unregister_device(struct c4iw_dev *dev) c4iw_class_attributes[i]); ib_unregister_device(&dev->ibdev); kfree(dev->ibdev.iwcm); - dev->registered = 0; return; } -- cgit v1.1 From 9f5754e34bf964a41e821b2e0b358bc7c314ca4e Mon Sep 17 00:00:00 2001 From: Mitko Haralanov Date: Mon, 9 May 2011 22:07:31 -0700 Subject: IB/qib: Prevent driver hang with unprogrammed boards The time limit test now correctly checks against current jiffies to avoid the hang. Signed-off-by: Mitko Haralanov Signed-off-by: Mike Marciniszyn Signed-off-by: Roland Dreier --- drivers/infiniband/hw/qib/qib_iba7322.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/qib/qib_iba7322.c b/drivers/infiniband/hw/qib/qib_iba7322.c index 6bab3ea..9f53e68 100644 --- a/drivers/infiniband/hw/qib/qib_iba7322.c +++ b/drivers/infiniband/hw/qib/qib_iba7322.c @@ -7534,7 +7534,8 @@ static int serdes_7322_init_new(struct qib_pportdata *ppd) ibsd_wr_allchans(ppd, 4, (1 << 10), BMASK(10, 10)); tstart = get_jiffies_64(); while (chan_done && - !time_after64(tstart, tstart + msecs_to_jiffies(500))) { + !time_after64(get_jiffies_64(), + tstart + msecs_to_jiffies(500))) { msleep(20); for (chan = 0; chan < SERDES_CHANS; ++chan) { rxcaldone = ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), -- cgit v1.1 From ec03d6777a9e2b76917ef6d3fc8a01e174bcb9b1 Mon Sep 17 00:00:00 2001 From: Sergei Shtylyov Date: Mon, 9 May 2011 22:07:31 -0700 Subject: IB/ipath: Use pci_dev->revision, again Commit 44c10138fd4b ("PCI: Change all drivers to use pci_device->revision") already converted this driver to using the revision field of struct pci_dev but commit bb9171448deb ("IB/ipath: Misc changes to prepare for IB7220 introduction") later reverted that change for some strange reason. Restore the change. Signed-off-by: Sergei Shtylyov Acked-by: Mike Marciniszyn Signed-off-by: Roland Dreier --- drivers/infiniband/hw/ipath/ipath_driver.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/drivers/infiniband/hw/ipath/ipath_driver.c b/drivers/infiniband/hw/ipath/ipath_driver.c index 58c0e41..be24ac7 100644 --- a/drivers/infiniband/hw/ipath/ipath_driver.c +++ b/drivers/infiniband/hw/ipath/ipath_driver.c @@ -398,7 +398,6 @@ static int __devinit ipath_init_one(struct pci_dev *pdev, struct ipath_devdata *dd; unsigned long long addr; u32 bar0 = 0, bar1 = 0; - u8 rev; dd = ipath_alloc_devdata(pdev); if (IS_ERR(dd)) { @@ -540,13 +539,7 @@ static int __devinit ipath_init_one(struct pci_dev *pdev, goto bail_regions; } - ret = pci_read_config_byte(pdev, PCI_REVISION_ID, &rev); - if (ret) { - ipath_dev_err(dd, "Failed to read PCI revision ID unit " - "%u: err %d\n", dd->ipath_unit, -ret); - goto bail_regions; /* shouldn't ever happen */ - } - dd->ipath_pcirev = rev; + dd->ipath_pcirev = pdev->revision; #if defined(__powerpc__) /* There isn't a generic way to specify writethrough mappings */ -- cgit v1.1 From d0c49bf391b2e230a8f3ae4486da7df440f1216d Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Mon, 9 May 2011 22:23:57 -0700 Subject: RDMA/iwcm: Get rid of enum iw_cm_event_status The IW_CM_EVENT_STATUS_xxx values were used in only a couple of places; cma.c uses -Exxx values instead, and so do the amso1100, cxgb3 and cxgb4 drivers -- only nes was using the enum values (with the mild consequence that all nes connection failures were treated as generic errors rather than reported as timeouts or rejections). We can fix this confusion by getting rid of enum iw_cm_event_status and using a plain int for struct iw_cm_event.status, and converting nes to use -Exxx as the other iWARP drivers do. This also gets rid of the warning drivers/infiniband/core/cma.c: In function 'cma_iw_handler': drivers/infiniband/core/cma.c:1333:3: warning: case value '4294967185' not in enumerated type 'enum iw_cm_event_status' drivers/infiniband/core/cma.c:1336:3: warning: case value '4294967186' not in enumerated type 'enum iw_cm_event_status' drivers/infiniband/core/cma.c:1332:3: warning: case value '4294967192' not in enumerated type 'enum iw_cm_event_status' Signed-off-by: Roland Dreier Reviewed-by: Steve Wise Reviewed-by: Sean Hefty Reviewed-by: Faisal Latif --- drivers/infiniband/core/iwcm.c | 2 +- drivers/infiniband/hw/nes/nes_cm.c | 16 ++++++++-------- drivers/infiniband/hw/nes/nes_verbs.c | 2 +- include/rdma/iw_cm.h | 11 +---------- 4 files changed, 11 insertions(+), 20 deletions(-) diff --git a/drivers/infiniband/core/iwcm.c b/drivers/infiniband/core/iwcm.c index 2a1e9ae..a9c0423 100644 --- a/drivers/infiniband/core/iwcm.c +++ b/drivers/infiniband/core/iwcm.c @@ -725,7 +725,7 @@ static int cm_conn_rep_handler(struct iwcm_id_private *cm_id_priv, */ clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); BUG_ON(cm_id_priv->state != IW_CM_STATE_CONN_SENT); - if (iw_event->status == IW_CM_EVENT_STATUS_ACCEPTED) { + if (iw_event->status == 0) { cm_id_priv->id.local_addr = iw_event->local_addr; cm_id_priv->id.remote_addr = iw_event->remote_addr; cm_id_priv->state = IW_CM_STATE_ESTABLISHED; diff --git a/drivers/infiniband/hw/nes/nes_cm.c b/drivers/infiniband/hw/nes/nes_cm.c index 33c7eed..e74cdf9 100644 --- a/drivers/infiniband/hw/nes/nes_cm.c +++ b/drivers/infiniband/hw/nes/nes_cm.c @@ -2563,7 +2563,7 @@ static int nes_cm_disconn_true(struct nes_qp *nesqp) u16 last_ae; u8 original_hw_tcp_state; u8 original_ibqp_state; - enum iw_cm_event_status disconn_status = IW_CM_EVENT_STATUS_OK; + int disconn_status = 0; int issue_disconn = 0; int issue_close = 0; int issue_flush = 0; @@ -2605,7 +2605,7 @@ static int nes_cm_disconn_true(struct nes_qp *nesqp) (last_ae == NES_AEQE_AEID_LLP_CONNECTION_RESET))) { issue_disconn = 1; if (last_ae == NES_AEQE_AEID_LLP_CONNECTION_RESET) - disconn_status = IW_CM_EVENT_STATUS_RESET; + disconn_status = -ECONNRESET; } if (((original_hw_tcp_state == NES_AEQE_TCP_STATE_CLOSED) || @@ -2666,7 +2666,7 @@ static int nes_cm_disconn_true(struct nes_qp *nesqp) cm_id->provider_data = nesqp; /* Send up the close complete event */ cm_event.event = IW_CM_EVENT_CLOSE; - cm_event.status = IW_CM_EVENT_STATUS_OK; + cm_event.status = 0; cm_event.provider_data = cm_id->provider_data; cm_event.local_addr = cm_id->local_addr; cm_event.remote_addr = cm_id->remote_addr; @@ -2966,7 +2966,7 @@ int nes_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) nes_add_ref(&nesqp->ibqp); cm_event.event = IW_CM_EVENT_ESTABLISHED; - cm_event.status = IW_CM_EVENT_STATUS_ACCEPTED; + cm_event.status = 0; cm_event.provider_data = (void *)nesqp; cm_event.local_addr = cm_id->local_addr; cm_event.remote_addr = cm_id->remote_addr; @@ -3377,7 +3377,7 @@ static void cm_event_connected(struct nes_cm_event *event) /* notify OF layer we successfully created the requested connection */ cm_event.event = IW_CM_EVENT_CONNECT_REPLY; - cm_event.status = IW_CM_EVENT_STATUS_ACCEPTED; + cm_event.status = 0; cm_event.provider_data = cm_id->provider_data; cm_event.local_addr.sin_family = AF_INET; cm_event.local_addr.sin_port = cm_id->local_addr.sin_port; @@ -3484,7 +3484,7 @@ static void cm_event_reset(struct nes_cm_event *event) nesqp->cm_id = NULL; /* cm_id->provider_data = NULL; */ cm_event.event = IW_CM_EVENT_DISCONNECT; - cm_event.status = IW_CM_EVENT_STATUS_RESET; + cm_event.status = -ECONNRESET; cm_event.provider_data = cm_id->provider_data; cm_event.local_addr = cm_id->local_addr; cm_event.remote_addr = cm_id->remote_addr; @@ -3495,7 +3495,7 @@ static void cm_event_reset(struct nes_cm_event *event) ret = cm_id->event_handler(cm_id, &cm_event); atomic_inc(&cm_closes); cm_event.event = IW_CM_EVENT_CLOSE; - cm_event.status = IW_CM_EVENT_STATUS_OK; + cm_event.status = 0; cm_event.provider_data = cm_id->provider_data; cm_event.local_addr = cm_id->local_addr; cm_event.remote_addr = cm_id->remote_addr; @@ -3534,7 +3534,7 @@ static void cm_event_mpa_req(struct nes_cm_event *event) cm_node, cm_id, jiffies); cm_event.event = IW_CM_EVENT_CONNECT_REQUEST; - cm_event.status = IW_CM_EVENT_STATUS_OK; + cm_event.status = 0; cm_event.provider_data = (void *)cm_node; cm_event.local_addr.sin_family = AF_INET; diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c index 26d8018..95ca93c 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.c +++ b/drivers/infiniband/hw/nes/nes_verbs.c @@ -1484,7 +1484,7 @@ static int nes_destroy_qp(struct ib_qp *ibqp) (nesqp->ibqp_state == IB_QPS_RTR)) && (nesqp->cm_id)) { cm_id = nesqp->cm_id; cm_event.event = IW_CM_EVENT_CONNECT_REPLY; - cm_event.status = IW_CM_EVENT_STATUS_TIMEOUT; + cm_event.status = -ETIMEDOUT; cm_event.local_addr = cm_id->local_addr; cm_event.remote_addr = cm_id->remote_addr; cm_event.private_data = NULL; diff --git a/include/rdma/iw_cm.h b/include/rdma/iw_cm.h index cbb822e..2d0191c 100644 --- a/include/rdma/iw_cm.h +++ b/include/rdma/iw_cm.h @@ -46,18 +46,9 @@ enum iw_cm_event_type { IW_CM_EVENT_CLOSE /* close complete */ }; -enum iw_cm_event_status { - IW_CM_EVENT_STATUS_OK = 0, /* request successful */ - IW_CM_EVENT_STATUS_ACCEPTED = 0, /* connect request accepted */ - IW_CM_EVENT_STATUS_REJECTED, /* connect request rejected */ - IW_CM_EVENT_STATUS_TIMEOUT, /* the operation timed out */ - IW_CM_EVENT_STATUS_RESET, /* reset from remote peer */ - IW_CM_EVENT_STATUS_EINVAL, /* asynchronous failure for bad parm */ -}; - struct iw_cm_event { enum iw_cm_event_type event; - enum iw_cm_event_status status; + int status; struct sockaddr_in local_addr; struct sockaddr_in remote_addr; void *private_data; -- cgit v1.1 From 1d44e8288a0557c28c447d7e511f50d06ff93a34 Mon Sep 17 00:00:00 2001 From: Jack Steiner Date: Mon, 9 May 2011 11:35:19 -0500 Subject: x86, UV: Fix NMI handler for UV platforms This fixes problems seen on UV systems handling NMIs from the node controller. I isolated the "dazed..." messages that I saw earlier to a bug in the BMC on our platform. It was sending NMIs w/o properly setting a register that indicated the source of NMI. So rather than _assuming_ any unhandled NMI came from the UV system maintenance console (SMC), add a check to verify that the SMC actually sent the NMI. Signed-off-by: Jack Steiner Cc: gorcunov@gmail.com Cc: dzickus@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uv/uv_hub.h | 2 ++ arch/x86/include/asm/uv/uv_mmrs.h | 16 ++++++++++++- arch/x86/kernel/apic/x2apic_uv_x.c | 48 ++++++++++++++++++++++++++++++++++---- 3 files changed, 60 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h index a501741..4298002 100644 --- a/arch/x86/include/asm/uv/uv_hub.h +++ b/arch/x86/include/asm/uv/uv_hub.h @@ -398,6 +398,8 @@ struct uv_blade_info { unsigned short nr_online_cpus; unsigned short pnode; short memory_nid; + spinlock_t nmi_lock; + unsigned long nmi_count; }; extern struct uv_blade_info *uv_blade_info; extern short *uv_node_to_blade; diff --git a/arch/x86/include/asm/uv/uv_mmrs.h b/arch/x86/include/asm/uv/uv_mmrs.h index 20cafea..f5bb64a 100644 --- a/arch/x86/include/asm/uv/uv_mmrs.h +++ b/arch/x86/include/asm/uv/uv_mmrs.h @@ -5,7 +5,7 @@ * * SGI UV MMR definitions * - * Copyright (C) 2007-2010 Silicon Graphics, Inc. All rights reserved. + * Copyright (C) 2007-2011 Silicon Graphics, Inc. All rights reserved. */ #ifndef _ASM_X86_UV_UV_MMRS_H @@ -1099,5 +1099,19 @@ union uvh_rtc1_int_config_u { } s; }; +/* ========================================================================= */ +/* UVH_SCRATCH5 */ +/* ========================================================================= */ +#define UVH_SCRATCH5 0x2d0200UL +#define UVH_SCRATCH5_32 0x00778 + +#define UVH_SCRATCH5_SCRATCH5_SHFT 0 +#define UVH_SCRATCH5_SCRATCH5_MASK 0xffffffffffffffffUL +union uvh_scratch5_u { + unsigned long v; + struct uvh_scratch5_s { + unsigned long scratch5 : 64; /* RW, W1CS */ + } s; +}; #endif /* __ASM_UV_MMRS_X86_H__ */ diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 33b10a0..7acd2d2 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -37,6 +37,13 @@ #include #include #include +#include + +/* BMC sets a bit this MMR non-zero before sending an NMI */ +#define UVH_NMI_MMR UVH_SCRATCH5 +#define UVH_NMI_MMR_CLEAR (UVH_NMI_MMR + 8) +#define UV_NMI_PENDING_MASK (1UL << 63) +DEFINE_PER_CPU(unsigned long, cpu_last_nmi_count); DEFINE_PER_CPU(int, x2apic_extra_bits); @@ -642,18 +649,46 @@ void __cpuinit uv_cpu_init(void) */ int uv_handle_nmi(struct notifier_block *self, unsigned long reason, void *data) { + unsigned long real_uv_nmi; + int bid; + if (reason != DIE_NMIUNKNOWN) return NOTIFY_OK; if (in_crash_kexec) /* do nothing if entering the crash kernel */ return NOTIFY_OK; + /* - * Use a lock so only one cpu prints at a time - * to prevent intermixed output. + * Each blade has an MMR that indicates when an NMI has been sent + * to cpus on the blade. If an NMI is detected, atomically + * clear the MMR and update a per-blade NMI count used to + * cause each cpu on the blade to notice a new NMI. + */ + bid = uv_numa_blade_id(); + real_uv_nmi = (uv_read_local_mmr(UVH_NMI_MMR) & UV_NMI_PENDING_MASK); + + if (unlikely(real_uv_nmi)) { + spin_lock(&uv_blade_info[bid].nmi_lock); + real_uv_nmi = (uv_read_local_mmr(UVH_NMI_MMR) & UV_NMI_PENDING_MASK); + if (real_uv_nmi) { + uv_blade_info[bid].nmi_count++; + uv_write_local_mmr(UVH_NMI_MMR_CLEAR, UV_NMI_PENDING_MASK); + } + spin_unlock(&uv_blade_info[bid].nmi_lock); + } + + if (likely(__get_cpu_var(cpu_last_nmi_count) == uv_blade_info[bid].nmi_count)) + return NOTIFY_DONE; + + __get_cpu_var(cpu_last_nmi_count) = uv_blade_info[bid].nmi_count; + + /* + * Use a lock so only one cpu prints at a time. + * This prevents intermixed output. */ spin_lock(&uv_nmi_lock); - pr_info("NMI stack dump cpu %u:\n", smp_processor_id()); + pr_info("UV NMI stack dump cpu %u:\n", smp_processor_id()); dump_stack(); spin_unlock(&uv_nmi_lock); @@ -661,7 +696,8 @@ int uv_handle_nmi(struct notifier_block *self, unsigned long reason, void *data) } static struct notifier_block uv_dump_stack_nmi_nb = { - .notifier_call = uv_handle_nmi + .notifier_call = uv_handle_nmi, + .priority = NMI_LOCAL_LOW_PRIOR - 1, }; void uv_register_nmi_notifier(void) @@ -720,8 +756,9 @@ void __init uv_system_init(void) printk(KERN_DEBUG "UV: Found %d blades\n", uv_num_possible_blades()); bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades(); - uv_blade_info = kmalloc(bytes, GFP_KERNEL); + uv_blade_info = kzalloc(bytes, GFP_KERNEL); BUG_ON(!uv_blade_info); + for (blade = 0; blade < uv_num_possible_blades(); blade++) uv_blade_info[blade].memory_nid = -1; @@ -747,6 +784,7 @@ void __init uv_system_init(void) uv_blade_info[blade].pnode = pnode; uv_blade_info[blade].nr_possible_cpus = 0; uv_blade_info[blade].nr_online_cpus = 0; + spin_lock_init(&uv_blade_info[blade].nmi_lock); max_pnode = max(pnode, max_pnode); blade++; } -- cgit v1.1 From 315c34dae0069d0c67abd714bb846cd466289c7f Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 21 Apr 2011 10:55:07 +0200 Subject: netfilter: ctnetlink: fix timestamp support for new conntracks This patch fixes the missing initialization of the start time if the timestamp support is enabled. libnetfilter_conntrack/utils# conntrack -E & libnetfilter_conntrack/utils# ./conntrack_create tcp 6 109 ESTABLISHED src=1.1.1.1 dst=2.2.2.2 sport=1025 dport=21 packets=0 bytes=0 [UNREPLIED] src=2.2.2.2 dst=1.1.1.1 sport=21 dport=1025 packets=0 bytes=0 mark=0 delta-time=1303296401 use=2 Signed-off-by: Pablo Neira Ayuso Signed-off-by: Patrick McHardy --- net/netfilter/nf_conntrack_netlink.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 30bf8a1..482e90c 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -1334,6 +1334,7 @@ ctnetlink_create_conntrack(struct net *net, u16 zone, struct nf_conn *ct; int err = -EINVAL; struct nf_conntrack_helper *helper; + struct nf_conn_tstamp *tstamp; ct = nf_conntrack_alloc(net, zone, otuple, rtuple, GFP_ATOMIC); if (IS_ERR(ct)) @@ -1451,6 +1452,9 @@ ctnetlink_create_conntrack(struct net *net, u16 zone, __set_bit(IPS_EXPECTED_BIT, &ct->status); ct->master = master_ct; } + tstamp = nf_conn_tstamp_find(ct); + if (tstamp) + tstamp->start = ktime_to_ns(ktime_get_real()); add_timer(&ct->timeout); nf_conntrack_hash_insert(ct); -- cgit v1.1 From 5a6351eecf8c87afed9c883bb6341d09406d74ba Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 21 Apr 2011 10:57:21 +0200 Subject: netfilter: fix ebtables compat support commit 255d0dc34068a976 (netfilter: x_table: speedup compat operations) made ebtables not working anymore. 1) xt_compat_calc_jump() is not an exact match lookup 2) compat_table_info() has a typo in xt_compat_init_offsets() call 3) compat_do_replace() misses a xt_compat_init_offsets() call Reported-by: dann frazier Signed-off-by: Eric Dumazet Signed-off-by: Patrick McHardy --- net/bridge/netfilter/ebtables.c | 3 ++- net/netfilter/x_tables.c | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 893669c..9707079 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -1766,7 +1766,7 @@ static int compat_table_info(const struct ebt_table_info *info, newinfo->entries_size = size; - xt_compat_init_offsets(AF_INET, info->nentries); + xt_compat_init_offsets(NFPROTO_BRIDGE, info->nentries); return EBT_ENTRY_ITERATE(entries, size, compat_calc_entry, info, entries, newinfo); } @@ -2240,6 +2240,7 @@ static int compat_do_replace(struct net *net, void __user *user, xt_compat_lock(NFPROTO_BRIDGE); + xt_compat_init_offsets(NFPROTO_BRIDGE, tmp.nentries); ret = compat_copy_entries(entries_tmp, tmp.entries_size, &state); if (ret < 0) goto out_unlock; diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index a9adf4c..8a025a5 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -455,6 +455,7 @@ void xt_compat_flush_offsets(u_int8_t af) vfree(xt[af].compat_tab); xt[af].compat_tab = NULL; xt[af].number = 0; + xt[af].cur = 0; } } EXPORT_SYMBOL_GPL(xt_compat_flush_offsets); @@ -473,8 +474,7 @@ int xt_compat_calc_jump(u_int8_t af, unsigned int offset) else return mid ? tmp[mid - 1].delta : 0; } - WARN_ON_ONCE(1); - return 0; + return left ? tmp[left - 1].delta : 0; } EXPORT_SYMBOL_GPL(xt_compat_calc_jump); -- cgit v1.1 From 103a9778e07bcc0cd34b5c35a87281454eec719e Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 21 Apr 2011 10:58:25 +0200 Subject: netfilter: ebtables: only call xt_compat_add_offset once per rule The optimizations in commit 255d0dc34068a976 (netfilter: x_table: speedup compat operations) assume that xt_compat_add_offset is called once per rule. ebtables however called it for each match/target found in a rule. The match/watcher/target parser already returns the needed delta, so it is sufficient to move the xt_compat_add_offset call to a more reasonable location. While at it, also get rid of the unused COMPAT iterator macros. Signed-off-by: Florian Westphal Signed-off-by: Patrick McHardy --- net/bridge/netfilter/ebtables.c | 61 ++++++----------------------------------- 1 file changed, 9 insertions(+), 52 deletions(-) diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 9707079..1a92b36 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -1882,7 +1882,7 @@ static int compat_mtw_from_user(struct compat_ebt_entry_mwt *mwt, struct xt_match *match; struct xt_target *wt; void *dst = NULL; - int off, pad = 0, ret = 0; + int off, pad = 0; unsigned int size_kern, entry_offset, match_size = mwt->match_size; strlcpy(name, mwt->u.name, sizeof(name)); @@ -1935,13 +1935,6 @@ static int compat_mtw_from_user(struct compat_ebt_entry_mwt *mwt, break; } - if (!dst) { - ret = xt_compat_add_offset(NFPROTO_BRIDGE, entry_offset, - off + ebt_compat_entry_padsize()); - if (ret < 0) - return ret; - } - state->buf_kern_offset += match_size + off; state->buf_user_offset += match_size; pad = XT_ALIGN(size_kern) - size_kern; @@ -2016,50 +2009,6 @@ static int ebt_size_mwt(struct compat_ebt_entry_mwt *match32, return growth; } -#define EBT_COMPAT_WATCHER_ITERATE(e, fn, args...) \ -({ \ - unsigned int __i; \ - int __ret = 0; \ - struct compat_ebt_entry_mwt *__watcher; \ - \ - for (__i = e->watchers_offset; \ - __i < (e)->target_offset; \ - __i += __watcher->watcher_size + \ - sizeof(struct compat_ebt_entry_mwt)) { \ - __watcher = (void *)(e) + __i; \ - __ret = fn(__watcher , ## args); \ - if (__ret != 0) \ - break; \ - } \ - if (__ret == 0) { \ - if (__i != (e)->target_offset) \ - __ret = -EINVAL; \ - } \ - __ret; \ -}) - -#define EBT_COMPAT_MATCH_ITERATE(e, fn, args...) \ -({ \ - unsigned int __i; \ - int __ret = 0; \ - struct compat_ebt_entry_mwt *__match; \ - \ - for (__i = sizeof(struct ebt_entry); \ - __i < (e)->watchers_offset; \ - __i += __match->match_size + \ - sizeof(struct compat_ebt_entry_mwt)) { \ - __match = (void *)(e) + __i; \ - __ret = fn(__match , ## args); \ - if (__ret != 0) \ - break; \ - } \ - if (__ret == 0) { \ - if (__i != (e)->watchers_offset) \ - __ret = -EINVAL; \ - } \ - __ret; \ -}) - /* called for all ebt_entry structures. */ static int size_entry_mwt(struct ebt_entry *entry, const unsigned char *base, unsigned int *total, @@ -2132,6 +2081,14 @@ static int size_entry_mwt(struct ebt_entry *entry, const unsigned char *base, } } + if (state->buf_kern_start == NULL) { + unsigned int offset = buf_start - (char *) base; + + ret = xt_compat_add_offset(NFPROTO_BRIDGE, offset, new_offset); + if (ret < 0) + return ret; + } + startoff = state->buf_user_offset - startoff; BUG_ON(*total < startoff); -- cgit v1.1 From 1ae132b0347907ac95b8bc9dba37934f59d2a508 Mon Sep 17 00:00:00 2001 From: Hans Schillstrom Date: Tue, 3 May 2011 22:09:30 +0200 Subject: IPVS: Change of socket usage to enable name space exit. If the sync daemons run in a name space while it crashes or get killed, there is no way to stop them except for a reboot. When all patches are there, ip_vs_core will handle register_pernet_(), i.e. ip_vs_sync_init() and ip_vs_sync_cleanup() will be removed. Kernel threads should not increment the use count of a socket. By calling sk_change_net() after creating a socket this is avoided. sock_release cant be used intead sk_release_kernel() should be used. Thanks Eric W Biederman for your advices. Signed-off-by: Hans Schillstrom [horms@verge.net.au: minor edit to changelog] Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_core.c | 2 +- net/netfilter/ipvs/ip_vs_sync.c | 58 ++++++++++++++++++++++++++--------------- 2 files changed, 38 insertions(+), 22 deletions(-) diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 07accf6..a0791dc 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -1896,7 +1896,7 @@ static int __net_init __ip_vs_init(struct net *net) static void __net_exit __ip_vs_cleanup(struct net *net) { - IP_VS_DBG(10, "ipvs netns %d released\n", net_ipvs(net)->gen); + IP_VS_DBG(2, "ipvs netns %d released\n", net_ipvs(net)->gen); } static struct pernet_operations ipvs_core_ops = { diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index 3e7961e..0cce953 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -1303,13 +1303,18 @@ static struct socket *make_send_sock(struct net *net) struct socket *sock; int result; - /* First create a socket */ - result = __sock_create(net, PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock, 1); + /* First create a socket move it to right name space later */ + result = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock); if (result < 0) { pr_err("Error during creation of socket; terminating\n"); return ERR_PTR(result); } - + /* + * Kernel sockets that are a part of a namespace, should not + * hold a reference to a namespace in order to allow to stop it. + * After sk_change_net should be released using sk_release_kernel. + */ + sk_change_net(sock->sk, net); result = set_mcast_if(sock->sk, ipvs->master_mcast_ifn); if (result < 0) { pr_err("Error setting outbound mcast interface\n"); @@ -1334,8 +1339,8 @@ static struct socket *make_send_sock(struct net *net) return sock; - error: - sock_release(sock); +error: + sk_release_kernel(sock->sk); return ERR_PTR(result); } @@ -1350,12 +1355,17 @@ static struct socket *make_receive_sock(struct net *net) int result; /* First create a socket */ - result = __sock_create(net, PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock, 1); + result = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock); if (result < 0) { pr_err("Error during creation of socket; terminating\n"); return ERR_PTR(result); } - + /* + * Kernel sockets that are a part of a namespace, should not + * hold a reference to a namespace in order to allow to stop it. + * After sk_change_net should be released using sk_release_kernel. + */ + sk_change_net(sock->sk, net); /* it is equivalent to the REUSEADDR option in user-space */ sock->sk->sk_reuse = 1; @@ -1377,8 +1387,8 @@ static struct socket *make_receive_sock(struct net *net) return sock; - error: - sock_release(sock); +error: + sk_release_kernel(sock->sk); return ERR_PTR(result); } @@ -1473,7 +1483,7 @@ static int sync_thread_master(void *data) ip_vs_sync_buff_release(sb); /* release the sending multicast socket */ - sock_release(tinfo->sock); + sk_release_kernel(tinfo->sock->sk); kfree(tinfo); return 0; @@ -1513,7 +1523,7 @@ static int sync_thread_backup(void *data) } /* release the sending multicast socket */ - sock_release(tinfo->sock); + sk_release_kernel(tinfo->sock->sk); kfree(tinfo->buf); kfree(tinfo); @@ -1601,7 +1611,7 @@ outtinfo: outbuf: kfree(buf); outsocket: - sock_release(sock); + sk_release_kernel(sock->sk); out: return result; } @@ -1610,6 +1620,7 @@ out: int stop_sync_thread(struct net *net, int state) { struct netns_ipvs *ipvs = net_ipvs(net); + int retc = -EINVAL; IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current)); @@ -1629,7 +1640,7 @@ int stop_sync_thread(struct net *net, int state) spin_lock_bh(&ipvs->sync_lock); ipvs->sync_state &= ~IP_VS_STATE_MASTER; spin_unlock_bh(&ipvs->sync_lock); - kthread_stop(ipvs->master_thread); + retc = kthread_stop(ipvs->master_thread); ipvs->master_thread = NULL; } else if (state == IP_VS_STATE_BACKUP) { if (!ipvs->backup_thread) @@ -1639,16 +1650,14 @@ int stop_sync_thread(struct net *net, int state) task_pid_nr(ipvs->backup_thread)); ipvs->sync_state &= ~IP_VS_STATE_BACKUP; - kthread_stop(ipvs->backup_thread); + retc = kthread_stop(ipvs->backup_thread); ipvs->backup_thread = NULL; - } else { - return -EINVAL; } /* decrease the module use count */ ip_vs_use_count_dec(); - return 0; + return retc; } /* @@ -1670,8 +1679,15 @@ static int __net_init __ip_vs_sync_init(struct net *net) static void __ip_vs_sync_cleanup(struct net *net) { - stop_sync_thread(net, IP_VS_STATE_MASTER); - stop_sync_thread(net, IP_VS_STATE_BACKUP); + int retc; + + retc = stop_sync_thread(net, IP_VS_STATE_MASTER); + if (retc && retc != -ESRCH) + pr_err("Failed to stop Master Daemon\n"); + + retc = stop_sync_thread(net, IP_VS_STATE_BACKUP); + if (retc && retc != -ESRCH) + pr_err("Failed to stop Backup Daemon\n"); } static struct pernet_operations ipvs_sync_ops = { @@ -1682,10 +1698,10 @@ static struct pernet_operations ipvs_sync_ops = { int __init ip_vs_sync_init(void) { - return register_pernet_subsys(&ipvs_sync_ops); + return register_pernet_device(&ipvs_sync_ops); } void ip_vs_sync_cleanup(void) { - unregister_pernet_subsys(&ipvs_sync_ops); + unregister_pernet_device(&ipvs_sync_ops); } -- cgit v1.1 From 7a4f0761fce32ff4918a7c23b08db564ad33092d Mon Sep 17 00:00:00 2001 From: Hans Schillstrom Date: Tue, 3 May 2011 22:09:31 +0200 Subject: IPVS: init and cleanup restructuring DESCRIPTION This patch tries to restore the initial init and cleanup sequences that was before namspace patch. Netns also requires action when net devices unregister which has never been implemented. I.e this patch also covers when a device moves into a network namespace, and has to be released. IMPLEMENTATION The number of calls to register_pernet_device have been reduced to one for the ip_vs.ko Schedulers still have their own calls. This patch adds a function __ip_vs_service_cleanup() and an enable flag for the netfilter hooks. The nf hooks will be enabled when the first service is loaded and never disabled again, except when a namespace exit starts. Signed-off-by: Hans Schillstrom Acked-by: Julian Anastasov [horms@verge.net.au: minor edit to changelog] Signed-off-by: Simon Horman --- include/net/ip_vs.h | 17 ++++++ net/netfilter/ipvs/ip_vs_app.c | 15 +---- net/netfilter/ipvs/ip_vs_conn.c | 12 +--- net/netfilter/ipvs/ip_vs_core.c | 101 +++++++++++++++++++++++++++++--- net/netfilter/ipvs/ip_vs_ctl.c | 120 ++++++++++++++++++++++++++++++++------- net/netfilter/ipvs/ip_vs_est.c | 14 +---- net/netfilter/ipvs/ip_vs_proto.c | 11 +--- net/netfilter/ipvs/ip_vs_sync.c | 13 +---- 8 files changed, 223 insertions(+), 80 deletions(-) diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index d516f00..86aefed 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -791,6 +791,7 @@ struct ip_vs_app { /* IPVS in network namespace */ struct netns_ipvs { int gen; /* Generation */ + int enable; /* enable like nf_hooks do */ /* * Hash table: for real service lookups */ @@ -1089,6 +1090,22 @@ ip_vs_control_add(struct ip_vs_conn *cp, struct ip_vs_conn *ctl_cp) atomic_inc(&ctl_cp->n_control); } +/* + * IPVS netns init & cleanup functions + */ +extern int __ip_vs_estimator_init(struct net *net); +extern int __ip_vs_control_init(struct net *net); +extern int __ip_vs_protocol_init(struct net *net); +extern int __ip_vs_app_init(struct net *net); +extern int __ip_vs_conn_init(struct net *net); +extern int __ip_vs_sync_init(struct net *net); +extern void __ip_vs_conn_cleanup(struct net *net); +extern void __ip_vs_app_cleanup(struct net *net); +extern void __ip_vs_protocol_cleanup(struct net *net); +extern void __ip_vs_control_cleanup(struct net *net); +extern void __ip_vs_estimator_cleanup(struct net *net); +extern void __ip_vs_sync_cleanup(struct net *net); +extern void __ip_vs_service_cleanup(struct net *net); /* * IPVS application functions diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c index 2dc6de1..51f3af7 100644 --- a/net/netfilter/ipvs/ip_vs_app.c +++ b/net/netfilter/ipvs/ip_vs_app.c @@ -576,7 +576,7 @@ static const struct file_operations ip_vs_app_fops = { }; #endif -static int __net_init __ip_vs_app_init(struct net *net) +int __net_init __ip_vs_app_init(struct net *net) { struct netns_ipvs *ipvs = net_ipvs(net); @@ -585,26 +585,17 @@ static int __net_init __ip_vs_app_init(struct net *net) return 0; } -static void __net_exit __ip_vs_app_cleanup(struct net *net) +void __net_exit __ip_vs_app_cleanup(struct net *net) { proc_net_remove(net, "ip_vs_app"); } -static struct pernet_operations ip_vs_app_ops = { - .init = __ip_vs_app_init, - .exit = __ip_vs_app_cleanup, -}; - int __init ip_vs_app_init(void) { - int rv; - - rv = register_pernet_subsys(&ip_vs_app_ops); - return rv; + return 0; } void ip_vs_app_cleanup(void) { - unregister_pernet_subsys(&ip_vs_app_ops); } diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index c97bd45..d3fd91b 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -1258,22 +1258,17 @@ int __net_init __ip_vs_conn_init(struct net *net) return 0; } -static void __net_exit __ip_vs_conn_cleanup(struct net *net) +void __net_exit __ip_vs_conn_cleanup(struct net *net) { /* flush all the connection entries first */ ip_vs_conn_flush(net); proc_net_remove(net, "ip_vs_conn"); proc_net_remove(net, "ip_vs_conn_sync"); } -static struct pernet_operations ipvs_conn_ops = { - .init = __ip_vs_conn_init, - .exit = __ip_vs_conn_cleanup, -}; int __init ip_vs_conn_init(void) { int idx; - int retc; /* Compute size and mask */ ip_vs_conn_tab_size = 1 << ip_vs_conn_tab_bits; @@ -1309,17 +1304,14 @@ int __init ip_vs_conn_init(void) rwlock_init(&__ip_vs_conntbl_lock_array[idx].l); } - retc = register_pernet_subsys(&ipvs_conn_ops); - /* calculate the random value for connection hash */ get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd)); - return retc; + return 0; } void ip_vs_conn_cleanup(void) { - unregister_pernet_subsys(&ipvs_conn_ops); /* Release the empty cache */ kmem_cache_destroy(ip_vs_conn_cachep); vfree(ip_vs_conn_tab); diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index a0791dc..a74dae6 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -1113,6 +1113,9 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af) return NF_ACCEPT; net = skb_net(skb); + if (!net_ipvs(net)->enable) + return NF_ACCEPT; + ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) { @@ -1343,6 +1346,7 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) return NF_ACCEPT; /* The packet looks wrong, ignore */ net = skb_net(skb); + pd = ip_vs_proto_data_get(net, cih->protocol); if (!pd) return NF_ACCEPT; @@ -1529,6 +1533,11 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) IP_VS_DBG_ADDR(af, &iph.daddr), hooknum); return NF_ACCEPT; } + /* ipvs enabled in this netns ? */ + net = skb_net(skb); + if (!net_ipvs(net)->enable) + return NF_ACCEPT; + ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); /* Bad... Do not break raw sockets */ @@ -1562,7 +1571,6 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); } - net = skb_net(skb); /* Protocol supported? */ pd = ip_vs_proto_data_get(net, iph.protocol); if (unlikely(!pd)) @@ -1588,7 +1596,6 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) } IP_VS_DBG_PKT(11, af, pp, skb, 0, "Incoming packet"); - net = skb_net(skb); ipvs = net_ipvs(net); /* Check the server status */ if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) { @@ -1743,10 +1750,16 @@ ip_vs_forward_icmp(unsigned int hooknum, struct sk_buff *skb, int (*okfn)(struct sk_buff *)) { int r; + struct net *net; if (ip_hdr(skb)->protocol != IPPROTO_ICMP) return NF_ACCEPT; + /* ipvs enabled in this netns ? */ + net = skb_net(skb); + if (!net_ipvs(net)->enable) + return NF_ACCEPT; + return ip_vs_in_icmp(skb, &r, hooknum); } @@ -1757,10 +1770,16 @@ ip_vs_forward_icmp_v6(unsigned int hooknum, struct sk_buff *skb, int (*okfn)(struct sk_buff *)) { int r; + struct net *net; if (ipv6_hdr(skb)->nexthdr != IPPROTO_ICMPV6) return NF_ACCEPT; + /* ipvs enabled in this netns ? */ + net = skb_net(skb); + if (!net_ipvs(net)->enable) + return NF_ACCEPT; + return ip_vs_in_icmp_v6(skb, &r, hooknum); } #endif @@ -1884,21 +1903,72 @@ static int __net_init __ip_vs_init(struct net *net) pr_err("%s(): no memory.\n", __func__); return -ENOMEM; } + /* Hold the beast until a service is registerd */ + ipvs->enable = 0; ipvs->net = net; /* Counters used for creating unique names */ ipvs->gen = atomic_read(&ipvs_netns_cnt); atomic_inc(&ipvs_netns_cnt); net->ipvs = ipvs; + + if (__ip_vs_estimator_init(net) < 0) + goto estimator_fail; + + if (__ip_vs_control_init(net) < 0) + goto control_fail; + + if (__ip_vs_protocol_init(net) < 0) + goto protocol_fail; + + if (__ip_vs_app_init(net) < 0) + goto app_fail; + + if (__ip_vs_conn_init(net) < 0) + goto conn_fail; + + if (__ip_vs_sync_init(net) < 0) + goto sync_fail; + printk(KERN_INFO "IPVS: Creating netns size=%zu id=%d\n", sizeof(struct netns_ipvs), ipvs->gen); return 0; +/* + * Error handling + */ + +sync_fail: + __ip_vs_conn_cleanup(net); +conn_fail: + __ip_vs_app_cleanup(net); +app_fail: + __ip_vs_protocol_cleanup(net); +protocol_fail: + __ip_vs_control_cleanup(net); +control_fail: + __ip_vs_estimator_cleanup(net); +estimator_fail: + return -ENOMEM; } static void __net_exit __ip_vs_cleanup(struct net *net) { + __ip_vs_service_cleanup(net); /* ip_vs_flush() with locks */ + __ip_vs_conn_cleanup(net); + __ip_vs_app_cleanup(net); + __ip_vs_protocol_cleanup(net); + __ip_vs_control_cleanup(net); + __ip_vs_estimator_cleanup(net); IP_VS_DBG(2, "ipvs netns %d released\n", net_ipvs(net)->gen); } +static void __net_exit __ip_vs_dev_cleanup(struct net *net) +{ + EnterFunction(2); + net_ipvs(net)->enable = 0; /* Disable packet reception */ + __ip_vs_sync_cleanup(net); + LeaveFunction(2); +} + static struct pernet_operations ipvs_core_ops = { .init = __ip_vs_init, .exit = __ip_vs_cleanup, @@ -1906,6 +1976,10 @@ static struct pernet_operations ipvs_core_ops = { .size = sizeof(struct netns_ipvs), }; +static struct pernet_operations ipvs_core_dev_ops = { + .exit = __ip_vs_dev_cleanup, +}; + /* * Initialize IP Virtual Server */ @@ -1913,10 +1987,6 @@ static int __init ip_vs_init(void) { int ret; - ret = register_pernet_subsys(&ipvs_core_ops); /* Alloc ip_vs struct */ - if (ret < 0) - return ret; - ip_vs_estimator_init(); ret = ip_vs_control_init(); if (ret < 0) { @@ -1944,15 +2014,28 @@ static int __init ip_vs_init(void) goto cleanup_conn; } + ret = register_pernet_subsys(&ipvs_core_ops); /* Alloc ip_vs struct */ + if (ret < 0) + goto cleanup_sync; + + ret = register_pernet_device(&ipvs_core_dev_ops); + if (ret < 0) + goto cleanup_sub; + ret = nf_register_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops)); if (ret < 0) { pr_err("can't register hooks.\n"); - goto cleanup_sync; + goto cleanup_dev; } pr_info("ipvs loaded.\n"); + return ret; +cleanup_dev: + unregister_pernet_device(&ipvs_core_dev_ops); +cleanup_sub: + unregister_pernet_subsys(&ipvs_core_ops); cleanup_sync: ip_vs_sync_cleanup(); cleanup_conn: @@ -1964,20 +2047,20 @@ cleanup_sync: ip_vs_control_cleanup(); cleanup_estimator: ip_vs_estimator_cleanup(); - unregister_pernet_subsys(&ipvs_core_ops); /* free ip_vs struct */ return ret; } static void __exit ip_vs_cleanup(void) { nf_unregister_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops)); + unregister_pernet_device(&ipvs_core_dev_ops); + unregister_pernet_subsys(&ipvs_core_ops); /* free ip_vs struct */ ip_vs_sync_cleanup(); ip_vs_conn_cleanup(); ip_vs_app_cleanup(); ip_vs_protocol_cleanup(); ip_vs_control_cleanup(); ip_vs_estimator_cleanup(); - unregister_pernet_subsys(&ipvs_core_ops); /* free ip_vs struct */ pr_info("ipvs unloaded.\n"); } diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index ae47090..ea72281 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -69,6 +69,11 @@ int ip_vs_get_debug_level(void) } #endif + +/* Protos */ +static void __ip_vs_del_service(struct ip_vs_service *svc); + + #ifdef CONFIG_IP_VS_IPV6 /* Taken from rt6_fill_node() in net/ipv6/route.c, is there a better way? */ static int __ip_vs_addr_is_local_v6(struct net *net, @@ -1214,6 +1219,8 @@ ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u, write_unlock_bh(&__ip_vs_svc_lock); *svc_p = svc; + /* Now there is a service - full throttle */ + ipvs->enable = 1; return 0; @@ -1472,6 +1479,84 @@ static int ip_vs_flush(struct net *net) return 0; } +/* + * Delete service by {netns} in the service table. + * Called by __ip_vs_cleanup() + */ +void __ip_vs_service_cleanup(struct net *net) +{ + EnterFunction(2); + /* Check for "full" addressed entries */ + mutex_lock(&__ip_vs_mutex); + ip_vs_flush(net); + mutex_unlock(&__ip_vs_mutex); + LeaveFunction(2); +} +/* + * Release dst hold by dst_cache + */ +static inline void +__ip_vs_dev_reset(struct ip_vs_dest *dest, struct net_device *dev) +{ + spin_lock_bh(&dest->dst_lock); + if (dest->dst_cache && dest->dst_cache->dev == dev) { + IP_VS_DBG_BUF(3, "Reset dev:%s dest %s:%u ,dest->refcnt=%d\n", + dev->name, + IP_VS_DBG_ADDR(dest->af, &dest->addr), + ntohs(dest->port), + atomic_read(&dest->refcnt)); + ip_vs_dst_reset(dest); + } + spin_unlock_bh(&dest->dst_lock); + +} +/* + * Netdev event receiver + * Currently only NETDEV_UNREGISTER is handled, i.e. if we hold a reference to + * a device that is "unregister" it must be released. + */ +static int ip_vs_dst_event(struct notifier_block *this, unsigned long event, + void *ptr) +{ + struct net_device *dev = ptr; + struct net *net = dev_net(dev); + struct ip_vs_service *svc; + struct ip_vs_dest *dest; + unsigned int idx; + + if (event != NETDEV_UNREGISTER) + return NOTIFY_DONE; + IP_VS_DBG(3, "%s() dev=%s\n", __func__, dev->name); + EnterFunction(2); + mutex_lock(&__ip_vs_mutex); + for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { + if (net_eq(svc->net, net)) { + list_for_each_entry(dest, &svc->destinations, + n_list) { + __ip_vs_dev_reset(dest, dev); + } + } + } + + list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { + if (net_eq(svc->net, net)) { + list_for_each_entry(dest, &svc->destinations, + n_list) { + __ip_vs_dev_reset(dest, dev); + } + } + + } + } + + list_for_each_entry(dest, &net_ipvs(net)->dest_trash, n_list) { + __ip_vs_dev_reset(dest, dev); + } + mutex_unlock(&__ip_vs_mutex); + LeaveFunction(2); + return NOTIFY_DONE; +} /* * Zero counters in a service or all services @@ -3588,6 +3673,10 @@ void __net_init __ip_vs_control_cleanup_sysctl(struct net *net) { } #endif +static struct notifier_block ip_vs_dst_notifier = { + .notifier_call = ip_vs_dst_event, +}; + int __net_init __ip_vs_control_init(struct net *net) { int idx; @@ -3626,7 +3715,7 @@ err: return -ENOMEM; } -static void __net_exit __ip_vs_control_cleanup(struct net *net) +void __net_exit __ip_vs_control_cleanup(struct net *net) { struct netns_ipvs *ipvs = net_ipvs(net); @@ -3639,11 +3728,6 @@ static void __net_exit __ip_vs_control_cleanup(struct net *net) free_percpu(ipvs->tot_stats.cpustats); } -static struct pernet_operations ipvs_control_ops = { - .init = __ip_vs_control_init, - .exit = __ip_vs_control_cleanup, -}; - int __init ip_vs_control_init(void) { int idx; @@ -3657,33 +3741,32 @@ int __init ip_vs_control_init(void) INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]); } - ret = register_pernet_subsys(&ipvs_control_ops); - if (ret) { - pr_err("cannot register namespace.\n"); - goto err; - } - smp_wmb(); /* Do we really need it now ? */ ret = nf_register_sockopt(&ip_vs_sockopts); if (ret) { pr_err("cannot register sockopt.\n"); - goto err_net; + goto err_sock; } ret = ip_vs_genl_register(); if (ret) { pr_err("cannot register Generic Netlink interface.\n"); - nf_unregister_sockopt(&ip_vs_sockopts); - goto err_net; + goto err_genl; } + ret = register_netdevice_notifier(&ip_vs_dst_notifier); + if (ret < 0) + goto err_notf; + LeaveFunction(2); return 0; -err_net: - unregister_pernet_subsys(&ipvs_control_ops); -err: +err_notf: + ip_vs_genl_unregister(); +err_genl: + nf_unregister_sockopt(&ip_vs_sockopts); +err_sock: return ret; } @@ -3691,7 +3774,6 @@ err: void ip_vs_control_cleanup(void) { EnterFunction(2); - unregister_pernet_subsys(&ipvs_control_ops); ip_vs_genl_unregister(); nf_unregister_sockopt(&ip_vs_sockopts); LeaveFunction(2); diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c index 8c8766c..508cce9 100644 --- a/net/netfilter/ipvs/ip_vs_est.c +++ b/net/netfilter/ipvs/ip_vs_est.c @@ -192,7 +192,7 @@ void ip_vs_read_estimator(struct ip_vs_stats_user *dst, dst->outbps = (e->outbps + 0xF) >> 5; } -static int __net_init __ip_vs_estimator_init(struct net *net) +int __net_init __ip_vs_estimator_init(struct net *net) { struct netns_ipvs *ipvs = net_ipvs(net); @@ -203,24 +203,16 @@ static int __net_init __ip_vs_estimator_init(struct net *net) return 0; } -static void __net_exit __ip_vs_estimator_exit(struct net *net) +void __net_exit __ip_vs_estimator_cleanup(struct net *net) { del_timer_sync(&net_ipvs(net)->est_timer); } -static struct pernet_operations ip_vs_app_ops = { - .init = __ip_vs_estimator_init, - .exit = __ip_vs_estimator_exit, -}; int __init ip_vs_estimator_init(void) { - int rv; - - rv = register_pernet_subsys(&ip_vs_app_ops); - return rv; + return 0; } void ip_vs_estimator_cleanup(void) { - unregister_pernet_subsys(&ip_vs_app_ops); } diff --git a/net/netfilter/ipvs/ip_vs_proto.c b/net/netfilter/ipvs/ip_vs_proto.c index 17484a4..eb86028 100644 --- a/net/netfilter/ipvs/ip_vs_proto.c +++ b/net/netfilter/ipvs/ip_vs_proto.c @@ -316,7 +316,7 @@ ip_vs_tcpudp_debug_packet(int af, struct ip_vs_protocol *pp, /* * per network name-space init */ -static int __net_init __ip_vs_protocol_init(struct net *net) +int __net_init __ip_vs_protocol_init(struct net *net) { #ifdef CONFIG_IP_VS_PROTO_TCP register_ip_vs_proto_netns(net, &ip_vs_protocol_tcp); @@ -336,7 +336,7 @@ static int __net_init __ip_vs_protocol_init(struct net *net) return 0; } -static void __net_exit __ip_vs_protocol_cleanup(struct net *net) +void __net_exit __ip_vs_protocol_cleanup(struct net *net) { struct netns_ipvs *ipvs = net_ipvs(net); struct ip_vs_proto_data *pd; @@ -349,11 +349,6 @@ static void __net_exit __ip_vs_protocol_cleanup(struct net *net) } } -static struct pernet_operations ipvs_proto_ops = { - .init = __ip_vs_protocol_init, - .exit = __ip_vs_protocol_cleanup, -}; - int __init ip_vs_protocol_init(void) { char protocols[64]; @@ -382,7 +377,6 @@ int __init ip_vs_protocol_init(void) REGISTER_PROTOCOL(&ip_vs_protocol_esp); #endif pr_info("Registered protocols (%s)\n", &protocols[2]); - return register_pernet_subsys(&ipvs_proto_ops); return 0; } @@ -393,7 +387,6 @@ void ip_vs_protocol_cleanup(void) struct ip_vs_protocol *pp; int i; - unregister_pernet_subsys(&ipvs_proto_ops); /* unregister all the ipvs protocols */ for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) { while ((pp = ip_vs_proto_table[i]) != NULL) diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index 0cce953..e292e5b 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -1663,7 +1663,7 @@ int stop_sync_thread(struct net *net, int state) /* * Initialize data struct for each netns */ -static int __net_init __ip_vs_sync_init(struct net *net) +int __net_init __ip_vs_sync_init(struct net *net) { struct netns_ipvs *ipvs = net_ipvs(net); @@ -1677,7 +1677,7 @@ static int __net_init __ip_vs_sync_init(struct net *net) return 0; } -static void __ip_vs_sync_cleanup(struct net *net) +void __ip_vs_sync_cleanup(struct net *net) { int retc; @@ -1690,18 +1690,11 @@ static void __ip_vs_sync_cleanup(struct net *net) pr_err("Failed to stop Backup Daemon\n"); } -static struct pernet_operations ipvs_sync_ops = { - .init = __ip_vs_sync_init, - .exit = __ip_vs_sync_cleanup, -}; - - int __init ip_vs_sync_init(void) { - return register_pernet_device(&ipvs_sync_ops); + return 0; } void ip_vs_sync_cleanup(void) { - unregister_pernet_device(&ipvs_sync_ops); } -- cgit v1.1 From 4319cc0cf5bb894b7368008cdf6dd20eb8868018 Mon Sep 17 00:00:00 2001 From: Fernando Luis Vazquez Cao Date: Tue, 10 May 2011 09:55:44 +0200 Subject: netfilter: IPv6: initialize TOS field in REJECT target module The IPv6 header is not zeroed out in alloc_skb so we must initialize it properly unless we want to see IPv6 packets with random TOS fields floating around. The current implementation resets the flow label but this could be changed if deemed necessary. We stumbled upon this issue when trying to apply a mangle rule to the RST packet generated by the REJECT target module. Signed-off-by: Fernando Luis Vazquez Cao Signed-off-by: Pablo Neira Ayuso --- net/ipv6/netfilter/ip6t_REJECT.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/ipv6/netfilter/ip6t_REJECT.c b/net/ipv6/netfilter/ip6t_REJECT.c index 28e7448..a5a4c5d 100644 --- a/net/ipv6/netfilter/ip6t_REJECT.c +++ b/net/ipv6/netfilter/ip6t_REJECT.c @@ -45,6 +45,8 @@ static void send_reset(struct net *net, struct sk_buff *oldskb) int tcphoff, needs_ack; const struct ipv6hdr *oip6h = ipv6_hdr(oldskb); struct ipv6hdr *ip6h; +#define DEFAULT_TOS_VALUE 0x0U + const __u8 tclass = DEFAULT_TOS_VALUE; struct dst_entry *dst = NULL; u8 proto; struct flowi6 fl6; @@ -124,7 +126,7 @@ static void send_reset(struct net *net, struct sk_buff *oldskb) skb_put(nskb, sizeof(struct ipv6hdr)); skb_reset_network_header(nskb); ip6h = ipv6_hdr(nskb); - ip6h->version = 6; + *(__be32 *)ip6h = htonl(0x60000000 | (tclass << 20)); ip6h->hop_limit = ip6_dst_hoplimit(dst); ip6h->nexthdr = IPPROTO_TCP; ipv6_addr_copy(&ip6h->saddr, &oip6h->daddr); -- cgit v1.1 From 1ed2f73d90fb49bcf5704aee7e9084adb882bfc5 Mon Sep 17 00:00:00 2001 From: Fernando Luis Vazquez Cao Date: Tue, 10 May 2011 10:00:21 +0200 Subject: netfilter: IPv6: fix DSCP mangle code The mask indicates the bits one wants to zero out, so it needs to be inverted before applying to the original TOS field. Signed-off-by: Fernando Luis Vazquez Cao Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_DSCP.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/xt_DSCP.c b/net/netfilter/xt_DSCP.c index 0a22919..ae82716 100644 --- a/net/netfilter/xt_DSCP.c +++ b/net/netfilter/xt_DSCP.c @@ -99,7 +99,7 @@ tos_tg6(struct sk_buff *skb, const struct xt_action_param *par) u_int8_t orig, nv; orig = ipv6_get_dsfield(iph); - nv = (orig & info->tos_mask) ^ info->tos_value; + nv = (orig & ~info->tos_mask) ^ info->tos_value; if (orig != nv) { if (!skb_make_writable(skb, sizeof(struct iphdr))) -- cgit v1.1 From e969687595c27e02e02be0c9363261826123ba77 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Fri, 5 Nov 2010 16:12:35 -0700 Subject: arch/x86/kernel/pci-iommu_table.c: Convert sprintf_symbol to %pS Coalesce format as well. Signed-off-by: Joe Perches Signed-off-by: Joerg Roedel --- arch/x86/kernel/pci-iommu_table.c | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/arch/x86/kernel/pci-iommu_table.c b/arch/x86/kernel/pci-iommu_table.c index 55d745e..35ccf75 100644 --- a/arch/x86/kernel/pci-iommu_table.c +++ b/arch/x86/kernel/pci-iommu_table.c @@ -50,20 +50,14 @@ void __init check_iommu_entries(struct iommu_table_entry *start, struct iommu_table_entry *finish) { struct iommu_table_entry *p, *q, *x; - char sym_p[KSYM_SYMBOL_LEN]; - char sym_q[KSYM_SYMBOL_LEN]; /* Simple cyclic dependency checker. */ for (p = start; p < finish; p++) { q = find_dependents_of(start, finish, p); x = find_dependents_of(start, finish, q); if (p == x) { - sprint_symbol(sym_p, (unsigned long)p->detect); - sprint_symbol(sym_q, (unsigned long)q->detect); - - printk(KERN_ERR "CYCLIC DEPENDENCY FOUND! %s depends" \ - " on %s and vice-versa. BREAKING IT.\n", - sym_p, sym_q); + printk(KERN_ERR "CYCLIC DEPENDENCY FOUND! %pS depends on %pS and vice-versa. BREAKING IT.\n", + p->detect, q->detect); /* Heavy handed way..*/ x->depend = 0; } @@ -72,12 +66,8 @@ void __init check_iommu_entries(struct iommu_table_entry *start, for (p = start; p < finish; p++) { q = find_dependents_of(p, finish, p); if (q && q > p) { - sprint_symbol(sym_p, (unsigned long)p->detect); - sprint_symbol(sym_q, (unsigned long)q->detect); - - printk(KERN_ERR "EXECUTION ORDER INVALID! %s "\ - "should be called before %s!\n", - sym_p, sym_q); + printk(KERN_ERR "EXECUTION ORDER INVALID! %pS should be called before %pS!\n", + p->detect, q->detect); } } } -- cgit v1.1 From 72fe00f01f9a3240a1073be27aeaf4fc476cc662 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 10 May 2011 10:50:42 +0200 Subject: x86/amd-iommu: Use threaded interupt handler Move the interupt handling for the iommu into the interupt thread to reduce latencies and prepare interupt handling for pri handling. Signed-off-by: Joerg Roedel --- arch/x86/include/asm/amd_iommu_proto.h | 1 + arch/x86/kernel/amd_iommu.c | 7 ++++++- arch/x86/kernel/amd_iommu_init.c | 9 +++++---- 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/amd_iommu_proto.h b/arch/x86/include/asm/amd_iommu_proto.h index a4ae6c3..55d95eb 100644 --- a/arch/x86/include/asm/amd_iommu_proto.h +++ b/arch/x86/include/asm/amd_iommu_proto.h @@ -23,6 +23,7 @@ extern int amd_iommu_init_dma_ops(void); extern int amd_iommu_init_passthrough(void); +extern irqreturn_t amd_iommu_int_thread(int irq, void *data); extern irqreturn_t amd_iommu_int_handler(int irq, void *data); extern void amd_iommu_apply_erratum_63(u16 devid); extern void amd_iommu_reset_cmd_buffer(struct amd_iommu *iommu); diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index dc5ddda..873e7e1 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -366,7 +366,7 @@ static void iommu_poll_events(struct amd_iommu *iommu) spin_unlock_irqrestore(&iommu->lock, flags); } -irqreturn_t amd_iommu_int_handler(int irq, void *data) +irqreturn_t amd_iommu_int_thread(int irq, void *data) { struct amd_iommu *iommu; @@ -376,6 +376,11 @@ irqreturn_t amd_iommu_int_handler(int irq, void *data) return IRQ_HANDLED; } +irqreturn_t amd_iommu_int_handler(int irq, void *data) +{ + return IRQ_WAKE_THREAD; +} + /**************************************************************************** * * IOMMU command queuing functions diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 28b0781..9179c21 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -1034,10 +1034,11 @@ static int iommu_setup_msi(struct amd_iommu *iommu) if (pci_enable_msi(iommu->dev)) return 1; - r = request_irq(iommu->dev->irq, amd_iommu_int_handler, - IRQF_SAMPLE_RANDOM, - "AMD-Vi", - NULL); + r = request_threaded_irq(iommu->dev->irq, + amd_iommu_int_handler, + amd_iommu_int_thread, + 0, "AMD-Vi", + iommu->dev); if (r) { pci_disable_msi(iommu->dev); -- cgit v1.1 From 61bf35b9a3eab961ee1249467d9b2ac11d3c34c1 Mon Sep 17 00:00:00 2001 From: Stephen Warren Date: Mon, 9 May 2011 16:32:03 -0600 Subject: ASoC: WM8903: Fix Digital Capture Volume range Increase the range of the Digital Capture Volume control to be 120 steps. Each step is 0.75dB, and the range starts at -72dB, giving a max setting of 18dB, which matches the latest datasheet, to the precision of the step size. Signed-off-by: Stephen Warren Acked-by: Liam Girdwood Signed-off-by: Mark Brown --- sound/soc/codecs/wm8903.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/soc/codecs/wm8903.c b/sound/soc/codecs/wm8903.c index f52b623..824d1c8 100644 --- a/sound/soc/codecs/wm8903.c +++ b/sound/soc/codecs/wm8903.c @@ -692,7 +692,7 @@ SOC_ENUM("DRC Smoothing Threshold", drc_smoothing), SOC_SINGLE_TLV("DRC Startup Volume", WM8903_DRC_0, 6, 18, 0, drc_tlv_startup), SOC_DOUBLE_R_TLV("Digital Capture Volume", WM8903_ADC_DIGITAL_VOLUME_LEFT, - WM8903_ADC_DIGITAL_VOLUME_RIGHT, 1, 96, 0, digital_tlv), + WM8903_ADC_DIGITAL_VOLUME_RIGHT, 1, 120, 0, digital_tlv), SOC_ENUM("ADC Companding Mode", adc_companding), SOC_SINGLE("ADC Companding Switch", WM8903_AUDIO_INTERFACE_0, 3, 1, 0), -- cgit v1.1 From 93bbce1ad0cd788190dd7d6c17d289f771fe3d0d Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 10 May 2011 12:13:36 +0200 Subject: netfilter: revert a2361c8735e07322023aedc36e4938b35af31eb0 This patch reverts a2361c8735e07322023aedc36e4938b35af31eb0: "[PATCH] netfilter: xt_conntrack: warn about use in raw table" Florian Wesphal says: "... when the packet was sent from the local machine the skb already has ->nfct attached, and -m conntrack seems to do the right thing." Acked-by: Jan Engelhardt Reported-by: Florian Wesphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_conntrack.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/net/netfilter/xt_conntrack.c b/net/netfilter/xt_conntrack.c index 481a86f..61805d7 100644 --- a/net/netfilter/xt_conntrack.c +++ b/net/netfilter/xt_conntrack.c @@ -272,11 +272,6 @@ static int conntrack_mt_check(const struct xt_mtchk_param *par) { int ret; - if (strcmp(par->table, "raw") == 0) { - pr_info("state is undetermined at the time of raw table\n"); - return -EINVAL; - } - ret = nf_ct_l3proto_try_module_get(par->family); if (ret < 0) pr_info("cannot load conntrack support for proto=%u\n", -- cgit v1.1 From 349dbc3669d043e656f3ed48c7bfe073ca1c6326 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Tue, 10 May 2011 20:59:34 +0900 Subject: nilfs2: fix infinite loop in nilfs_palloc_freev function After having applied commit 9954e7af14868b8b ("nilfs2: add free entries count only if clear bit operation succeeded"), a free routine of nilfs came to fall into an infinite loop, outputting the same message endlessly: nilfs_palloc_freev: entry number 29497 already freed nilfs_palloc_freev: entry number 29497 already freed nilfs_palloc_freev: entry number 29497 already freed nilfs_palloc_freev: entry number 29497 already freed nilfs_palloc_freev: entry number 29497 already freed ... That patch broke the routine so that a loop counter is never updated in an abnormal state. This fixes the regression. Signed-off-by: Ryusuke Konishi --- fs/nilfs2/alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c index 0a0a66d..f768448 100644 --- a/fs/nilfs2/alloc.c +++ b/fs/nilfs2/alloc.c @@ -646,7 +646,7 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems) unsigned long group, group_offset; int i, j, n, ret; - for (i = 0; i < nitems; i += n) { + for (i = 0; i < nitems; i = j) { group = nilfs_palloc_group(inode, entry_nrs[i], &group_offset); ret = nilfs_palloc_get_desc_block(inode, group, 0, &desc_bh); if (ret < 0) -- cgit v1.1 From 2b348a77981227c6b64fb9cf19f7c711a6806bc9 Mon Sep 17 00:00:00 2001 From: Lin Ming Date: Fri, 29 Apr 2011 08:41:57 +0000 Subject: perf probe: Fix the missed parameter initialization pubname_callback_param::found should be initialized to 0 in fastpath lookup, the structure is on the stack and uninitialized otherwise. Signed-off-by: Lin Ming Cc: Arnaldo Carvalho de Melo Link: http://lkml.kernel.org/r/1304066518-30420-2-git-send-email-ming.m.lin@intel.com Signed-off-by: Ingo Molnar --- tools/perf/util/probe-finder.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c index a7c7145..3b9d0b8 100644 --- a/tools/perf/util/probe-finder.c +++ b/tools/perf/util/probe-finder.c @@ -1538,6 +1538,7 @@ static int find_probes(int fd, struct probe_finder *pf) .file = pp->file, .cu_die = &pf->cu_die, .sp_die = &pf->sp_die, + .found = 0, }; struct dwarf_callback_param probe_param = { .data = pf, -- cgit v1.1 From fdb1bb157525907163e2a0c96fe8bb19fbe867a8 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 10 May 2011 17:13:37 +0200 Subject: [S390] sclp/memory hotplug: fix initial usecount of increments Fix initial usecount of attached and assigned storage increments so they can be set offline. Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- drivers/s390/char/sclp_cmd.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/s390/char/sclp_cmd.c b/drivers/s390/char/sclp_cmd.c index 4b60ede..be55fb2 100644 --- a/drivers/s390/char/sclp_cmd.c +++ b/drivers/s390/char/sclp_cmd.c @@ -518,6 +518,8 @@ static void __init insert_increment(u16 rn, int standby, int assigned) return; new_incr->rn = rn; new_incr->standby = standby; + if (!standby) + new_incr->usecount = 1; last_rn = 0; prev = &sclp_mem_list; list_for_each_entry(incr, &sclp_mem_list, list) { -- cgit v1.1 From aade6c0dfb46ff7ce7df0ed7a2ef15d2d3c47f05 Mon Sep 17 00:00:00 2001 From: Stefan Haberland Date: Tue, 10 May 2011 17:13:38 +0200 Subject: [S390] dasd: prevent IO error during reserve/release loop The termination of running CQR caused by reserve/release operations may lead to an IO error if reserve/release is done in a tight loop. Prevent this by increasing the retry counter after termination. Signed-off-by: Stefan Haberland Signed-off-by: Martin Schwidefsky --- drivers/s390/block/dasd.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c index 475e603..86b6f1c 100644 --- a/drivers/s390/block/dasd.c +++ b/drivers/s390/block/dasd.c @@ -1742,11 +1742,20 @@ int dasd_sleep_on_interruptible(struct dasd_ccw_req *cqr) static inline int _dasd_term_running_cqr(struct dasd_device *device) { struct dasd_ccw_req *cqr; + int rc; if (list_empty(&device->ccw_queue)) return 0; cqr = list_entry(device->ccw_queue.next, struct dasd_ccw_req, devlist); - return device->discipline->term_IO(cqr); + rc = device->discipline->term_IO(cqr); + if (!rc) + /* + * CQR terminated because a more important request is pending. + * Undo decreasing of retry counter because this is + * not an error case. + */ + cqr->retries++; + return rc; } int dasd_sleep_on_immediatly(struct dasd_ccw_req *cqr) -- cgit v1.1 From 8eb4bd666ffdca7171cd8118138a91842012b028 Mon Sep 17 00:00:00 2001 From: Michael Holzheu Date: Tue, 10 May 2011 17:13:39 +0200 Subject: [S390] kernel: Initialize register 14 when starting new CPU When starting a new CPU we currently jump to start_secondary() without setting register 14 (the return address) correctly. Therefore on the stack frame for start_secondary an invalid return address is stored. This leads to wrong stack back traces in kernel dumps. Example: #00 [1f33fe48] cpu_idle at 10614a #01 [1f33fe90] start_secondary at 54fa88 #02 [1f33feb8] (null) at 0 <--- invalid To fix this start_secondary() is called now with basr/brasl that sets register 14 correctly. The output of the stack backtrace looks then like the following: #00 [1f33fe48] cpu_idle at 10614a #01 [1f33fe90] start_secondary at 54fa88 #02 [1f33feb8] restart_base at 54f41e <--- correct Signed-off-by: Michael Holzheu Signed-off-by: Martin Schwidefsky --- arch/s390/kernel/entry.S | 2 +- arch/s390/kernel/entry64.S | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index 648f642..1b67fc6 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -836,7 +836,7 @@ restart_base: stosm __SF_EMPTY(%r15),0x04 # now we can turn dat on basr %r14,0 l %r14,restart_addr-.(%r14) - br %r14 # branch to start_secondary + basr %r14,%r14 # branch to start_secondary restart_addr: .long start_secondary .align 8 diff --git a/arch/s390/kernel/entry64.S b/arch/s390/kernel/entry64.S index 9d3603d..9fd8645 100644 --- a/arch/s390/kernel/entry64.S +++ b/arch/s390/kernel/entry64.S @@ -841,7 +841,7 @@ restart_base: mvc __LC_SYSTEM_TIMER(8),__TI_system_timer(%r1) xc __LC_STEAL_TIMER(8),__LC_STEAL_TIMER stosm __SF_EMPTY(%r15),0x04 # now we can turn dat on - jg start_secondary + brasl %r14,start_secondary .align 8 restart_vtime: .long 0x7fffffff,0xffffffff -- cgit v1.1 From 91d378088b104f8e31baba8c518f32a7a219d58c Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Tue, 10 May 2011 17:13:40 +0200 Subject: [S390] disassembler: handle b280/spp instruction arch/s390/kvm/sie64a.S uses the b280 instruction. Tell the builtin disassembler to handle that code. Signed-off-by: Christian Borntraeger Signed-off-by: Martin Schwidefsky --- arch/s390/kernel/dis.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/s390/kernel/dis.c b/arch/s390/kernel/dis.c index c83726c..3d4a78f 100644 --- a/arch/s390/kernel/dis.c +++ b/arch/s390/kernel/dis.c @@ -672,6 +672,7 @@ static struct insn opcode_b2[] = { { "rp", 0x77, INSTR_S_RD }, { "stcke", 0x78, INSTR_S_RD }, { "sacf", 0x79, INSTR_S_RD }, + { "spp", 0x80, INSTR_S_RD }, { "stsi", 0x7d, INSTR_S_RD }, { "srnm", 0x99, INSTR_S_RD }, { "stfpc", 0x9c, INSTR_S_RD }, -- cgit v1.1 From 83ace2701b81be549cca7af33c5b0499cb2602d6 Mon Sep 17 00:00:00 2001 From: Michael Holzheu Date: Tue, 10 May 2011 17:13:41 +0200 Subject: [S390] replace diag10() with diag10_range() function Currently the diag10() function can only release one page. For exploiters that have to call diag10 on a contiguous memory region this is suboptimal. This patch replaces the diag10() function with diag10_range() that is able to release multiple pages. In addition to that the new function now allows to release memory with addresses higher than 2047 MiB. This was due to a restriction of the diagnose implementation under z/VM prior to release 5.2. Signed-off-by: Michael Holzheu Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/diag.h | 17 +++++++++++++++-- arch/s390/kernel/diag.c | 21 --------------------- arch/s390/mm/cmm.c | 2 +- 3 files changed, 16 insertions(+), 24 deletions(-) diff --git a/arch/s390/include/asm/diag.h b/arch/s390/include/asm/diag.h index 72b2e2f..7e91c58 100644 --- a/arch/s390/include/asm/diag.h +++ b/arch/s390/include/asm/diag.h @@ -9,9 +9,22 @@ #define _ASM_S390_DIAG_H /* - * Diagnose 10: Release pages + * Diagnose 10: Release page range */ -extern void diag10(unsigned long addr); +static inline void diag10_range(unsigned long start_pfn, unsigned long num_pfn) +{ + unsigned long start_addr, end_addr; + + start_addr = start_pfn << PAGE_SHIFT; + end_addr = (start_pfn + num_pfn - 1) << PAGE_SHIFT; + + asm volatile( + "0: diag %0,%1,0x10\n" + "1:\n" + EX_TABLE(0b, 1b) + EX_TABLE(1b, 1b) + : : "a" (start_addr), "a" (end_addr)); +} /* * Diagnose 14: Input spool file manipulation diff --git a/arch/s390/kernel/diag.c b/arch/s390/kernel/diag.c index c032d11..8237fc0 100644 --- a/arch/s390/kernel/diag.c +++ b/arch/s390/kernel/diag.c @@ -9,27 +9,6 @@ #include /* - * Diagnose 10: Release pages - */ -void diag10(unsigned long addr) -{ - if (addr >= 0x7ff00000) - return; - asm volatile( -#ifdef CONFIG_64BIT - " sam31\n" - " diag %0,%0,0x10\n" - "0: sam64\n" -#else - " diag %0,%0,0x10\n" - "0:\n" -#endif - EX_TABLE(0b, 0b) - : : "a" (addr)); -} -EXPORT_SYMBOL(diag10); - -/* * Diagnose 14: Input spool file manipulation */ int diag14(unsigned long rx, unsigned long ry1, unsigned long subcode) diff --git a/arch/s390/mm/cmm.c b/arch/s390/mm/cmm.c index c66ffd8..1f1dba9 100644 --- a/arch/s390/mm/cmm.c +++ b/arch/s390/mm/cmm.c @@ -91,7 +91,7 @@ static long cmm_alloc_pages(long nr, long *counter, } else free_page((unsigned long) npa); } - diag10(addr); + diag10_range(addr >> PAGE_SHIFT, 1); pa->pages[pa->index++] = addr; (*counter)++; spin_unlock(&cmm_lock); -- cgit v1.1 From 3d8dcb3c76bb2930798f61675c33cce8945ab988 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Tue, 10 May 2011 17:13:42 +0200 Subject: [S390] oprofile: fix min/max interval query checks oprofile_min_interval and oprofile_max_interval are unsigned, checking for negative values doesn't work. Change hwsampler_query_min_interval and hwsampler_query_max_interval to return an unsigned long and check for a zero value instead. Reported-by: Nicolas Kaiser Acked-by: Robert Richter Signed-off-by: Martin Schwidefsky --- arch/s390/oprofile/hwsampler.c | 14 ++++---------- arch/s390/oprofile/hwsampler.h | 4 ++-- arch/s390/oprofile/init.c | 8 ++------ 3 files changed, 8 insertions(+), 18 deletions(-) diff --git a/arch/s390/oprofile/hwsampler.c b/arch/s390/oprofile/hwsampler.c index 4952872..33cbd37 100644 --- a/arch/s390/oprofile/hwsampler.c +++ b/arch/s390/oprofile/hwsampler.c @@ -1021,20 +1021,14 @@ deallocate_exit: return rc; } -long hwsampler_query_min_interval(void) +unsigned long hwsampler_query_min_interval(void) { - if (min_sampler_rate) - return min_sampler_rate; - else - return -EINVAL; + return min_sampler_rate; } -long hwsampler_query_max_interval(void) +unsigned long hwsampler_query_max_interval(void) { - if (max_sampler_rate) - return max_sampler_rate; - else - return -EINVAL; + return max_sampler_rate; } unsigned long hwsampler_get_sample_overflow_count(unsigned int cpu) diff --git a/arch/s390/oprofile/hwsampler.h b/arch/s390/oprofile/hwsampler.h index 8c72b59..1912f3b 100644 --- a/arch/s390/oprofile/hwsampler.h +++ b/arch/s390/oprofile/hwsampler.h @@ -102,8 +102,8 @@ int hwsampler_setup(void); int hwsampler_shutdown(void); int hwsampler_allocate(unsigned long sdbt, unsigned long sdb); int hwsampler_deallocate(void); -long hwsampler_query_min_interval(void); -long hwsampler_query_max_interval(void); +unsigned long hwsampler_query_min_interval(void); +unsigned long hwsampler_query_max_interval(void); int hwsampler_start_all(unsigned long interval); int hwsampler_stop_all(void); int hwsampler_deactivate(unsigned int cpu); diff --git a/arch/s390/oprofile/init.c b/arch/s390/oprofile/init.c index c63d7e5..5995e9b 100644 --- a/arch/s390/oprofile/init.c +++ b/arch/s390/oprofile/init.c @@ -145,15 +145,11 @@ static int oprofile_hwsampler_init(struct oprofile_operations *ops) * create hwsampler files only if hwsampler_setup() succeeds. */ oprofile_min_interval = hwsampler_query_min_interval(); - if (oprofile_min_interval < 0) { - oprofile_min_interval = 0; + if (oprofile_min_interval == 0) return -ENODEV; - } oprofile_max_interval = hwsampler_query_max_interval(); - if (oprofile_max_interval < 0) { - oprofile_max_interval = 0; + if (oprofile_max_interval == 0) return -ENODEV; - } if (oprofile_timer_init(ops)) return -ENODEV; -- cgit v1.1 From badb8bb983e9cf5b7a872e0a4f6ebeac2b1ce133 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Tue, 10 May 2011 17:13:43 +0200 Subject: [S390] fix alloc_pgste check in init_new_context Processes started with kernel_execve from a kernel thread will have current->mm==NULL. Reading current->mm->context.alloc_pgste will read a more or less random bit from lowcore in this case. If the bit turns out to be set the whole process tree started this way will allocate page table extensions although they have no need for it. Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/mmu_context.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h index a6f0e7c..8c277ca 100644 --- a/arch/s390/include/asm/mmu_context.h +++ b/arch/s390/include/asm/mmu_context.h @@ -23,7 +23,7 @@ static inline int init_new_context(struct task_struct *tsk, #ifdef CONFIG_64BIT mm->context.asce_bits |= _ASCE_TYPE_REGION3; #endif - if (current->mm->context.alloc_pgste) { + if (current->mm && current->mm->context.alloc_pgste) { /* * alloc_pgste indicates, that any NEW context will be created * with extended page tables. The old context is unchanged. The -- cgit v1.1 From fffcda1183e93df84ad73ba7eb7782a5c354e2b3 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 10 May 2011 17:22:06 +0200 Subject: x86, gart: Rename pci-gart_64.c to amd_gart_64.c This file only contains code relevant for the northbridge gart in AMD processors. This patch renames the file to represent this fact in the filename. Signed-off-by: Joerg Roedel --- Documentation/x86/x86_64/boot-options.txt | 2 +- arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/amd_gart_64.c | 898 ++++++++++++++++++++++++++++++ arch/x86/kernel/pci-gart_64.c | 898 ------------------------------ 4 files changed, 900 insertions(+), 900 deletions(-) create mode 100644 arch/x86/kernel/amd_gart_64.c delete mode 100644 arch/x86/kernel/pci-gart_64.c diff --git a/Documentation/x86/x86_64/boot-options.txt b/Documentation/x86/x86_64/boot-options.txt index 092e596..c54b4f5 100644 --- a/Documentation/x86/x86_64/boot-options.txt +++ b/Documentation/x86/x86_64/boot-options.txt @@ -206,7 +206,7 @@ IOMMU (input/output memory management unit) (e.g. because you have < 3 GB memory). Kernel boot message: "PCI-DMA: Disabling IOMMU" - 2. : AMD GART based hardware IOMMU. + 2. : AMD GART based hardware IOMMU. Kernel boot message: "PCI-DMA: using GART IOMMU" 3. : Software IOMMU implementation. Used diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 7338ef2..97ebf82e 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -117,7 +117,7 @@ obj-$(CONFIG_OF) += devicetree.o ifeq ($(CONFIG_X86_64),y) obj-$(CONFIG_AUDIT) += audit_64.o - obj-$(CONFIG_GART_IOMMU) += pci-gart_64.o aperture_64.o + obj-$(CONFIG_GART_IOMMU) += amd_gart_64.o aperture_64.o obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary_64.o tce_64.o obj-$(CONFIG_AMD_IOMMU) += amd_iommu_init.o amd_iommu.o diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c new file mode 100644 index 0000000..b117efd --- /dev/null +++ b/arch/x86/kernel/amd_gart_64.c @@ -0,0 +1,898 @@ +/* + * Dynamic DMA mapping support for AMD Hammer. + * + * Use the integrated AGP GART in the Hammer northbridge as an IOMMU for PCI. + * This allows to use PCI devices that only support 32bit addresses on systems + * with more than 4GB. + * + * See Documentation/PCI/PCI-DMA-mapping.txt for the interface specification. + * + * Copyright 2002 Andi Kleen, SuSE Labs. + * Subject to the GNU General Public License v2 only. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static unsigned long iommu_bus_base; /* GART remapping area (physical) */ +static unsigned long iommu_size; /* size of remapping area bytes */ +static unsigned long iommu_pages; /* .. and in pages */ + +static u32 *iommu_gatt_base; /* Remapping table */ + +static dma_addr_t bad_dma_addr; + +/* + * If this is disabled the IOMMU will use an optimized flushing strategy + * of only flushing when an mapping is reused. With it true the GART is + * flushed for every mapping. Problem is that doing the lazy flush seems + * to trigger bugs with some popular PCI cards, in particular 3ware (but + * has been also also seen with Qlogic at least). + */ +static int iommu_fullflush = 1; + +/* Allocation bitmap for the remapping area: */ +static DEFINE_SPINLOCK(iommu_bitmap_lock); +/* Guarded by iommu_bitmap_lock: */ +static unsigned long *iommu_gart_bitmap; + +static u32 gart_unmapped_entry; + +#define GPTE_VALID 1 +#define GPTE_COHERENT 2 +#define GPTE_ENCODE(x) \ + (((x) & 0xfffff000) | (((x) >> 32) << 4) | GPTE_VALID | GPTE_COHERENT) +#define GPTE_DECODE(x) (((x) & 0xfffff000) | (((u64)(x) & 0xff0) << 28)) + +#define EMERGENCY_PAGES 32 /* = 128KB */ + +#ifdef CONFIG_AGP +#define AGPEXTERN extern +#else +#define AGPEXTERN +#endif + +/* GART can only remap to physical addresses < 1TB */ +#define GART_MAX_PHYS_ADDR (1ULL << 40) + +/* backdoor interface to AGP driver */ +AGPEXTERN int agp_memory_reserved; +AGPEXTERN __u32 *agp_gatt_table; + +static unsigned long next_bit; /* protected by iommu_bitmap_lock */ +static bool need_flush; /* global flush state. set for each gart wrap */ + +static unsigned long alloc_iommu(struct device *dev, int size, + unsigned long align_mask) +{ + unsigned long offset, flags; + unsigned long boundary_size; + unsigned long base_index; + + base_index = ALIGN(iommu_bus_base & dma_get_seg_boundary(dev), + PAGE_SIZE) >> PAGE_SHIFT; + boundary_size = ALIGN((u64)dma_get_seg_boundary(dev) + 1, + PAGE_SIZE) >> PAGE_SHIFT; + + spin_lock_irqsave(&iommu_bitmap_lock, flags); + offset = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, next_bit, + size, base_index, boundary_size, align_mask); + if (offset == -1) { + need_flush = true; + offset = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, 0, + size, base_index, boundary_size, + align_mask); + } + if (offset != -1) { + next_bit = offset+size; + if (next_bit >= iommu_pages) { + next_bit = 0; + need_flush = true; + } + } + if (iommu_fullflush) + need_flush = true; + spin_unlock_irqrestore(&iommu_bitmap_lock, flags); + + return offset; +} + +static void free_iommu(unsigned long offset, int size) +{ + unsigned long flags; + + spin_lock_irqsave(&iommu_bitmap_lock, flags); + bitmap_clear(iommu_gart_bitmap, offset, size); + if (offset >= next_bit) + next_bit = offset + size; + spin_unlock_irqrestore(&iommu_bitmap_lock, flags); +} + +/* + * Use global flush state to avoid races with multiple flushers. + */ +static void flush_gart(void) +{ + unsigned long flags; + + spin_lock_irqsave(&iommu_bitmap_lock, flags); + if (need_flush) { + amd_flush_garts(); + need_flush = false; + } + spin_unlock_irqrestore(&iommu_bitmap_lock, flags); +} + +#ifdef CONFIG_IOMMU_LEAK +/* Debugging aid for drivers that don't free their IOMMU tables */ +static int leak_trace; +static int iommu_leak_pages = 20; + +static void dump_leak(void) +{ + static int dump; + + if (dump) + return; + dump = 1; + + show_stack(NULL, NULL); + debug_dma_dump_mappings(NULL); +} +#endif + +static void iommu_full(struct device *dev, size_t size, int dir) +{ + /* + * Ran out of IOMMU space for this operation. This is very bad. + * Unfortunately the drivers cannot handle this operation properly. + * Return some non mapped prereserved space in the aperture and + * let the Northbridge deal with it. This will result in garbage + * in the IO operation. When the size exceeds the prereserved space + * memory corruption will occur or random memory will be DMAed + * out. Hopefully no network devices use single mappings that big. + */ + + dev_err(dev, "PCI-DMA: Out of IOMMU space for %lu bytes\n", size); + + if (size > PAGE_SIZE*EMERGENCY_PAGES) { + if (dir == PCI_DMA_FROMDEVICE || dir == PCI_DMA_BIDIRECTIONAL) + panic("PCI-DMA: Memory would be corrupted\n"); + if (dir == PCI_DMA_TODEVICE || dir == PCI_DMA_BIDIRECTIONAL) + panic(KERN_ERR + "PCI-DMA: Random memory would be DMAed\n"); + } +#ifdef CONFIG_IOMMU_LEAK + dump_leak(); +#endif +} + +static inline int +need_iommu(struct device *dev, unsigned long addr, size_t size) +{ + return force_iommu || !dma_capable(dev, addr, size); +} + +static inline int +nonforced_iommu(struct device *dev, unsigned long addr, size_t size) +{ + return !dma_capable(dev, addr, size); +} + +/* Map a single continuous physical area into the IOMMU. + * Caller needs to check if the iommu is needed and flush. + */ +static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem, + size_t size, int dir, unsigned long align_mask) +{ + unsigned long npages = iommu_num_pages(phys_mem, size, PAGE_SIZE); + unsigned long iommu_page; + int i; + + if (unlikely(phys_mem + size > GART_MAX_PHYS_ADDR)) + return bad_dma_addr; + + iommu_page = alloc_iommu(dev, npages, align_mask); + if (iommu_page == -1) { + if (!nonforced_iommu(dev, phys_mem, size)) + return phys_mem; + if (panic_on_overflow) + panic("dma_map_area overflow %lu bytes\n", size); + iommu_full(dev, size, dir); + return bad_dma_addr; + } + + for (i = 0; i < npages; i++) { + iommu_gatt_base[iommu_page + i] = GPTE_ENCODE(phys_mem); + phys_mem += PAGE_SIZE; + } + return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK); +} + +/* Map a single area into the IOMMU */ +static dma_addr_t gart_map_page(struct device *dev, struct page *page, + unsigned long offset, size_t size, + enum dma_data_direction dir, + struct dma_attrs *attrs) +{ + unsigned long bus; + phys_addr_t paddr = page_to_phys(page) + offset; + + if (!dev) + dev = &x86_dma_fallback_dev; + + if (!need_iommu(dev, paddr, size)) + return paddr; + + bus = dma_map_area(dev, paddr, size, dir, 0); + flush_gart(); + + return bus; +} + +/* + * Free a DMA mapping. + */ +static void gart_unmap_page(struct device *dev, dma_addr_t dma_addr, + size_t size, enum dma_data_direction dir, + struct dma_attrs *attrs) +{ + unsigned long iommu_page; + int npages; + int i; + + if (dma_addr < iommu_bus_base + EMERGENCY_PAGES*PAGE_SIZE || + dma_addr >= iommu_bus_base + iommu_size) + return; + + iommu_page = (dma_addr - iommu_bus_base)>>PAGE_SHIFT; + npages = iommu_num_pages(dma_addr, size, PAGE_SIZE); + for (i = 0; i < npages; i++) { + iommu_gatt_base[iommu_page + i] = gart_unmapped_entry; + } + free_iommu(iommu_page, npages); +} + +/* + * Wrapper for pci_unmap_single working with scatterlists. + */ +static void gart_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, + enum dma_data_direction dir, struct dma_attrs *attrs) +{ + struct scatterlist *s; + int i; + + for_each_sg(sg, s, nents, i) { + if (!s->dma_length || !s->length) + break; + gart_unmap_page(dev, s->dma_address, s->dma_length, dir, NULL); + } +} + +/* Fallback for dma_map_sg in case of overflow */ +static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg, + int nents, int dir) +{ + struct scatterlist *s; + int i; + +#ifdef CONFIG_IOMMU_DEBUG + pr_debug("dma_map_sg overflow\n"); +#endif + + for_each_sg(sg, s, nents, i) { + unsigned long addr = sg_phys(s); + + if (nonforced_iommu(dev, addr, s->length)) { + addr = dma_map_area(dev, addr, s->length, dir, 0); + if (addr == bad_dma_addr) { + if (i > 0) + gart_unmap_sg(dev, sg, i, dir, NULL); + nents = 0; + sg[0].dma_length = 0; + break; + } + } + s->dma_address = addr; + s->dma_length = s->length; + } + flush_gart(); + + return nents; +} + +/* Map multiple scatterlist entries continuous into the first. */ +static int __dma_map_cont(struct device *dev, struct scatterlist *start, + int nelems, struct scatterlist *sout, + unsigned long pages) +{ + unsigned long iommu_start = alloc_iommu(dev, pages, 0); + unsigned long iommu_page = iommu_start; + struct scatterlist *s; + int i; + + if (iommu_start == -1) + return -1; + + for_each_sg(start, s, nelems, i) { + unsigned long pages, addr; + unsigned long phys_addr = s->dma_address; + + BUG_ON(s != start && s->offset); + if (s == start) { + sout->dma_address = iommu_bus_base; + sout->dma_address += iommu_page*PAGE_SIZE + s->offset; + sout->dma_length = s->length; + } else { + sout->dma_length += s->length; + } + + addr = phys_addr; + pages = iommu_num_pages(s->offset, s->length, PAGE_SIZE); + while (pages--) { + iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr); + addr += PAGE_SIZE; + iommu_page++; + } + } + BUG_ON(iommu_page - iommu_start != pages); + + return 0; +} + +static inline int +dma_map_cont(struct device *dev, struct scatterlist *start, int nelems, + struct scatterlist *sout, unsigned long pages, int need) +{ + if (!need) { + BUG_ON(nelems != 1); + sout->dma_address = start->dma_address; + sout->dma_length = start->length; + return 0; + } + return __dma_map_cont(dev, start, nelems, sout, pages); +} + +/* + * DMA map all entries in a scatterlist. + * Merge chunks that have page aligned sizes into a continuous mapping. + */ +static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, + enum dma_data_direction dir, struct dma_attrs *attrs) +{ + struct scatterlist *s, *ps, *start_sg, *sgmap; + int need = 0, nextneed, i, out, start; + unsigned long pages = 0; + unsigned int seg_size; + unsigned int max_seg_size; + + if (nents == 0) + return 0; + + if (!dev) + dev = &x86_dma_fallback_dev; + + out = 0; + start = 0; + start_sg = sg; + sgmap = sg; + seg_size = 0; + max_seg_size = dma_get_max_seg_size(dev); + ps = NULL; /* shut up gcc */ + + for_each_sg(sg, s, nents, i) { + dma_addr_t addr = sg_phys(s); + + s->dma_address = addr; + BUG_ON(s->length == 0); + + nextneed = need_iommu(dev, addr, s->length); + + /* Handle the previous not yet processed entries */ + if (i > start) { + /* + * Can only merge when the last chunk ends on a + * page boundary and the new one doesn't have an + * offset. + */ + if (!iommu_merge || !nextneed || !need || s->offset || + (s->length + seg_size > max_seg_size) || + (ps->offset + ps->length) % PAGE_SIZE) { + if (dma_map_cont(dev, start_sg, i - start, + sgmap, pages, need) < 0) + goto error; + out++; + + seg_size = 0; + sgmap = sg_next(sgmap); + pages = 0; + start = i; + start_sg = s; + } + } + + seg_size += s->length; + need = nextneed; + pages += iommu_num_pages(s->offset, s->length, PAGE_SIZE); + ps = s; + } + if (dma_map_cont(dev, start_sg, i - start, sgmap, pages, need) < 0) + goto error; + out++; + flush_gart(); + if (out < nents) { + sgmap = sg_next(sgmap); + sgmap->dma_length = 0; + } + return out; + +error: + flush_gart(); + gart_unmap_sg(dev, sg, out, dir, NULL); + + /* When it was forced or merged try again in a dumb way */ + if (force_iommu || iommu_merge) { + out = dma_map_sg_nonforce(dev, sg, nents, dir); + if (out > 0) + return out; + } + if (panic_on_overflow) + panic("dma_map_sg: overflow on %lu pages\n", pages); + + iommu_full(dev, pages << PAGE_SHIFT, dir); + for_each_sg(sg, s, nents, i) + s->dma_address = bad_dma_addr; + return 0; +} + +/* allocate and map a coherent mapping */ +static void * +gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr, + gfp_t flag) +{ + dma_addr_t paddr; + unsigned long align_mask; + struct page *page; + + if (force_iommu && !(flag & GFP_DMA)) { + flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32); + page = alloc_pages(flag | __GFP_ZERO, get_order(size)); + if (!page) + return NULL; + + align_mask = (1UL << get_order(size)) - 1; + paddr = dma_map_area(dev, page_to_phys(page), size, + DMA_BIDIRECTIONAL, align_mask); + + flush_gart(); + if (paddr != bad_dma_addr) { + *dma_addr = paddr; + return page_address(page); + } + __free_pages(page, get_order(size)); + } else + return dma_generic_alloc_coherent(dev, size, dma_addr, flag); + + return NULL; +} + +/* free a coherent mapping */ +static void +gart_free_coherent(struct device *dev, size_t size, void *vaddr, + dma_addr_t dma_addr) +{ + gart_unmap_page(dev, dma_addr, size, DMA_BIDIRECTIONAL, NULL); + free_pages((unsigned long)vaddr, get_order(size)); +} + +static int gart_mapping_error(struct device *dev, dma_addr_t dma_addr) +{ + return (dma_addr == bad_dma_addr); +} + +static int no_agp; + +static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size) +{ + unsigned long a; + + if (!iommu_size) { + iommu_size = aper_size; + if (!no_agp) + iommu_size /= 2; + } + + a = aper + iommu_size; + iommu_size -= round_up(a, PMD_PAGE_SIZE) - a; + + if (iommu_size < 64*1024*1024) { + pr_warning( + "PCI-DMA: Warning: Small IOMMU %luMB." + " Consider increasing the AGP aperture in BIOS\n", + iommu_size >> 20); + } + + return iommu_size; +} + +static __init unsigned read_aperture(struct pci_dev *dev, u32 *size) +{ + unsigned aper_size = 0, aper_base_32, aper_order; + u64 aper_base; + + pci_read_config_dword(dev, AMD64_GARTAPERTUREBASE, &aper_base_32); + pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &aper_order); + aper_order = (aper_order >> 1) & 7; + + aper_base = aper_base_32 & 0x7fff; + aper_base <<= 25; + + aper_size = (32 * 1024 * 1024) << aper_order; + if (aper_base + aper_size > 0x100000000UL || !aper_size) + aper_base = 0; + + *size = aper_size; + return aper_base; +} + +static void enable_gart_translations(void) +{ + int i; + + if (!amd_nb_has_feature(AMD_NB_GART)) + return; + + for (i = 0; i < amd_nb_num(); i++) { + struct pci_dev *dev = node_to_amd_nb(i)->misc; + + enable_gart_translation(dev, __pa(agp_gatt_table)); + } + + /* Flush the GART-TLB to remove stale entries */ + amd_flush_garts(); +} + +/* + * If fix_up_north_bridges is set, the north bridges have to be fixed up on + * resume in the same way as they are handled in gart_iommu_hole_init(). + */ +static bool fix_up_north_bridges; +static u32 aperture_order; +static u32 aperture_alloc; + +void set_up_gart_resume(u32 aper_order, u32 aper_alloc) +{ + fix_up_north_bridges = true; + aperture_order = aper_order; + aperture_alloc = aper_alloc; +} + +static void gart_fixup_northbridges(void) +{ + int i; + + if (!fix_up_north_bridges) + return; + + if (!amd_nb_has_feature(AMD_NB_GART)) + return; + + pr_info("PCI-DMA: Restoring GART aperture settings\n"); + + for (i = 0; i < amd_nb_num(); i++) { + struct pci_dev *dev = node_to_amd_nb(i)->misc; + + /* + * Don't enable translations just yet. That is the next + * step. Restore the pre-suspend aperture settings. + */ + gart_set_size_and_enable(dev, aperture_order); + pci_write_config_dword(dev, AMD64_GARTAPERTUREBASE, aperture_alloc >> 25); + } +} + +static void gart_resume(void) +{ + pr_info("PCI-DMA: Resuming GART IOMMU\n"); + + gart_fixup_northbridges(); + + enable_gart_translations(); +} + +static struct syscore_ops gart_syscore_ops = { + .resume = gart_resume, + +}; + +/* + * Private Northbridge GATT initialization in case we cannot use the + * AGP driver for some reason. + */ +static __init int init_amd_gatt(struct agp_kern_info *info) +{ + unsigned aper_size, gatt_size, new_aper_size; + unsigned aper_base, new_aper_base; + struct pci_dev *dev; + void *gatt; + int i; + + pr_info("PCI-DMA: Disabling AGP.\n"); + + aper_size = aper_base = info->aper_size = 0; + dev = NULL; + for (i = 0; i < amd_nb_num(); i++) { + dev = node_to_amd_nb(i)->misc; + new_aper_base = read_aperture(dev, &new_aper_size); + if (!new_aper_base) + goto nommu; + + if (!aper_base) { + aper_size = new_aper_size; + aper_base = new_aper_base; + } + if (aper_size != new_aper_size || aper_base != new_aper_base) + goto nommu; + } + if (!aper_base) + goto nommu; + + info->aper_base = aper_base; + info->aper_size = aper_size >> 20; + + gatt_size = (aper_size >> PAGE_SHIFT) * sizeof(u32); + gatt = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, + get_order(gatt_size)); + if (!gatt) + panic("Cannot allocate GATT table"); + if (set_memory_uc((unsigned long)gatt, gatt_size >> PAGE_SHIFT)) + panic("Could not set GART PTEs to uncacheable pages"); + + agp_gatt_table = gatt; + + register_syscore_ops(&gart_syscore_ops); + + flush_gart(); + + pr_info("PCI-DMA: aperture base @ %x size %u KB\n", + aper_base, aper_size>>10); + + return 0; + + nommu: + /* Should not happen anymore */ + pr_warning("PCI-DMA: More than 4GB of RAM and no IOMMU\n" + "falling back to iommu=soft.\n"); + return -1; +} + +static struct dma_map_ops gart_dma_ops = { + .map_sg = gart_map_sg, + .unmap_sg = gart_unmap_sg, + .map_page = gart_map_page, + .unmap_page = gart_unmap_page, + .alloc_coherent = gart_alloc_coherent, + .free_coherent = gart_free_coherent, + .mapping_error = gart_mapping_error, +}; + +static void gart_iommu_shutdown(void) +{ + struct pci_dev *dev; + int i; + + /* don't shutdown it if there is AGP installed */ + if (!no_agp) + return; + + if (!amd_nb_has_feature(AMD_NB_GART)) + return; + + for (i = 0; i < amd_nb_num(); i++) { + u32 ctl; + + dev = node_to_amd_nb(i)->misc; + pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl); + + ctl &= ~GARTEN; + + pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, ctl); + } +} + +int __init gart_iommu_init(void) +{ + struct agp_kern_info info; + unsigned long iommu_start; + unsigned long aper_base, aper_size; + unsigned long start_pfn, end_pfn; + unsigned long scratch; + long i; + + if (!amd_nb_has_feature(AMD_NB_GART)) + return 0; + +#ifndef CONFIG_AGP_AMD64 + no_agp = 1; +#else + /* Makefile puts PCI initialization via subsys_initcall first. */ + /* Add other AMD AGP bridge drivers here */ + no_agp = no_agp || + (agp_amd64_init() < 0) || + (agp_copy_info(agp_bridge, &info) < 0); +#endif + + if (no_iommu || + (!force_iommu && max_pfn <= MAX_DMA32_PFN) || + !gart_iommu_aperture || + (no_agp && init_amd_gatt(&info) < 0)) { + if (max_pfn > MAX_DMA32_PFN) { + pr_warning("More than 4GB of memory but GART IOMMU not available.\n"); + pr_warning("falling back to iommu=soft.\n"); + } + return 0; + } + + /* need to map that range */ + aper_size = info.aper_size << 20; + aper_base = info.aper_base; + end_pfn = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT); + + if (end_pfn > max_low_pfn_mapped) { + start_pfn = (aper_base>>PAGE_SHIFT); + init_memory_mapping(start_pfn<> PAGE_SHIFT; + + iommu_gart_bitmap = (void *) __get_free_pages(GFP_KERNEL | __GFP_ZERO, + get_order(iommu_pages/8)); + if (!iommu_gart_bitmap) + panic("Cannot allocate iommu bitmap\n"); + +#ifdef CONFIG_IOMMU_LEAK + if (leak_trace) { + int ret; + + ret = dma_debug_resize_entries(iommu_pages); + if (ret) + pr_debug("PCI-DMA: Cannot trace all the entries\n"); + } +#endif + + /* + * Out of IOMMU space handling. + * Reserve some invalid pages at the beginning of the GART. + */ + bitmap_set(iommu_gart_bitmap, 0, EMERGENCY_PAGES); + + pr_info("PCI-DMA: Reserving %luMB of IOMMU area in the AGP aperture\n", + iommu_size >> 20); + + agp_memory_reserved = iommu_size; + iommu_start = aper_size - iommu_size; + iommu_bus_base = info.aper_base + iommu_start; + bad_dma_addr = iommu_bus_base; + iommu_gatt_base = agp_gatt_table + (iommu_start>>PAGE_SHIFT); + + /* + * Unmap the IOMMU part of the GART. The alias of the page is + * always mapped with cache enabled and there is no full cache + * coherency across the GART remapping. The unmapping avoids + * automatic prefetches from the CPU allocating cache lines in + * there. All CPU accesses are done via the direct mapping to + * the backing memory. The GART address is only used by PCI + * devices. + */ + set_memory_np((unsigned long)__va(iommu_bus_base), + iommu_size >> PAGE_SHIFT); + /* + * Tricky. The GART table remaps the physical memory range, + * so the CPU wont notice potential aliases and if the memory + * is remapped to UC later on, we might surprise the PCI devices + * with a stray writeout of a cacheline. So play it sure and + * do an explicit, full-scale wbinvd() _after_ having marked all + * the pages as Not-Present: + */ + wbinvd(); + + /* + * Now all caches are flushed and we can safely enable + * GART hardware. Doing it early leaves the possibility + * of stale cache entries that can lead to GART PTE + * errors. + */ + enable_gart_translations(); + + /* + * Try to workaround a bug (thanks to BenH): + * Set unmapped entries to a scratch page instead of 0. + * Any prefetches that hit unmapped entries won't get an bus abort + * then. (P2P bridge may be prefetching on DMA reads). + */ + scratch = get_zeroed_page(GFP_KERNEL); + if (!scratch) + panic("Cannot allocate iommu scratch page"); + gart_unmapped_entry = GPTE_ENCODE(__pa(scratch)); + for (i = EMERGENCY_PAGES; i < iommu_pages; i++) + iommu_gatt_base[i] = gart_unmapped_entry; + + flush_gart(); + dma_ops = &gart_dma_ops; + x86_platform.iommu_shutdown = gart_iommu_shutdown; + swiotlb = 0; + + return 0; +} + +void __init gart_parse_options(char *p) +{ + int arg; + +#ifdef CONFIG_IOMMU_LEAK + if (!strncmp(p, "leak", 4)) { + leak_trace = 1; + p += 4; + if (*p == '=') + ++p; + if (isdigit(*p) && get_option(&p, &arg)) + iommu_leak_pages = arg; + } +#endif + if (isdigit(*p) && get_option(&p, &arg)) + iommu_size = arg; + if (!strncmp(p, "fullflush", 9)) + iommu_fullflush = 1; + if (!strncmp(p, "nofullflush", 11)) + iommu_fullflush = 0; + if (!strncmp(p, "noagp", 5)) + no_agp = 1; + if (!strncmp(p, "noaperture", 10)) + fix_aperture = 0; + /* duplicated from pci-dma.c */ + if (!strncmp(p, "force", 5)) + gart_iommu_aperture_allowed = 1; + if (!strncmp(p, "allowed", 7)) + gart_iommu_aperture_allowed = 1; + if (!strncmp(p, "memaper", 7)) { + fallback_aper_force = 1; + p += 7; + if (*p == '=') { + ++p; + if (get_option(&p, &arg)) + fallback_aper_order = arg; + } + } +} +IOMMU_INIT_POST(gart_iommu_hole_init); diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c deleted file mode 100644 index b117efd..0000000 --- a/arch/x86/kernel/pci-gart_64.c +++ /dev/null @@ -1,898 +0,0 @@ -/* - * Dynamic DMA mapping support for AMD Hammer. - * - * Use the integrated AGP GART in the Hammer northbridge as an IOMMU for PCI. - * This allows to use PCI devices that only support 32bit addresses on systems - * with more than 4GB. - * - * See Documentation/PCI/PCI-DMA-mapping.txt for the interface specification. - * - * Copyright 2002 Andi Kleen, SuSE Labs. - * Subject to the GNU General Public License v2 only. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static unsigned long iommu_bus_base; /* GART remapping area (physical) */ -static unsigned long iommu_size; /* size of remapping area bytes */ -static unsigned long iommu_pages; /* .. and in pages */ - -static u32 *iommu_gatt_base; /* Remapping table */ - -static dma_addr_t bad_dma_addr; - -/* - * If this is disabled the IOMMU will use an optimized flushing strategy - * of only flushing when an mapping is reused. With it true the GART is - * flushed for every mapping. Problem is that doing the lazy flush seems - * to trigger bugs with some popular PCI cards, in particular 3ware (but - * has been also also seen with Qlogic at least). - */ -static int iommu_fullflush = 1; - -/* Allocation bitmap for the remapping area: */ -static DEFINE_SPINLOCK(iommu_bitmap_lock); -/* Guarded by iommu_bitmap_lock: */ -static unsigned long *iommu_gart_bitmap; - -static u32 gart_unmapped_entry; - -#define GPTE_VALID 1 -#define GPTE_COHERENT 2 -#define GPTE_ENCODE(x) \ - (((x) & 0xfffff000) | (((x) >> 32) << 4) | GPTE_VALID | GPTE_COHERENT) -#define GPTE_DECODE(x) (((x) & 0xfffff000) | (((u64)(x) & 0xff0) << 28)) - -#define EMERGENCY_PAGES 32 /* = 128KB */ - -#ifdef CONFIG_AGP -#define AGPEXTERN extern -#else -#define AGPEXTERN -#endif - -/* GART can only remap to physical addresses < 1TB */ -#define GART_MAX_PHYS_ADDR (1ULL << 40) - -/* backdoor interface to AGP driver */ -AGPEXTERN int agp_memory_reserved; -AGPEXTERN __u32 *agp_gatt_table; - -static unsigned long next_bit; /* protected by iommu_bitmap_lock */ -static bool need_flush; /* global flush state. set for each gart wrap */ - -static unsigned long alloc_iommu(struct device *dev, int size, - unsigned long align_mask) -{ - unsigned long offset, flags; - unsigned long boundary_size; - unsigned long base_index; - - base_index = ALIGN(iommu_bus_base & dma_get_seg_boundary(dev), - PAGE_SIZE) >> PAGE_SHIFT; - boundary_size = ALIGN((u64)dma_get_seg_boundary(dev) + 1, - PAGE_SIZE) >> PAGE_SHIFT; - - spin_lock_irqsave(&iommu_bitmap_lock, flags); - offset = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, next_bit, - size, base_index, boundary_size, align_mask); - if (offset == -1) { - need_flush = true; - offset = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, 0, - size, base_index, boundary_size, - align_mask); - } - if (offset != -1) { - next_bit = offset+size; - if (next_bit >= iommu_pages) { - next_bit = 0; - need_flush = true; - } - } - if (iommu_fullflush) - need_flush = true; - spin_unlock_irqrestore(&iommu_bitmap_lock, flags); - - return offset; -} - -static void free_iommu(unsigned long offset, int size) -{ - unsigned long flags; - - spin_lock_irqsave(&iommu_bitmap_lock, flags); - bitmap_clear(iommu_gart_bitmap, offset, size); - if (offset >= next_bit) - next_bit = offset + size; - spin_unlock_irqrestore(&iommu_bitmap_lock, flags); -} - -/* - * Use global flush state to avoid races with multiple flushers. - */ -static void flush_gart(void) -{ - unsigned long flags; - - spin_lock_irqsave(&iommu_bitmap_lock, flags); - if (need_flush) { - amd_flush_garts(); - need_flush = false; - } - spin_unlock_irqrestore(&iommu_bitmap_lock, flags); -} - -#ifdef CONFIG_IOMMU_LEAK -/* Debugging aid for drivers that don't free their IOMMU tables */ -static int leak_trace; -static int iommu_leak_pages = 20; - -static void dump_leak(void) -{ - static int dump; - - if (dump) - return; - dump = 1; - - show_stack(NULL, NULL); - debug_dma_dump_mappings(NULL); -} -#endif - -static void iommu_full(struct device *dev, size_t size, int dir) -{ - /* - * Ran out of IOMMU space for this operation. This is very bad. - * Unfortunately the drivers cannot handle this operation properly. - * Return some non mapped prereserved space in the aperture and - * let the Northbridge deal with it. This will result in garbage - * in the IO operation. When the size exceeds the prereserved space - * memory corruption will occur or random memory will be DMAed - * out. Hopefully no network devices use single mappings that big. - */ - - dev_err(dev, "PCI-DMA: Out of IOMMU space for %lu bytes\n", size); - - if (size > PAGE_SIZE*EMERGENCY_PAGES) { - if (dir == PCI_DMA_FROMDEVICE || dir == PCI_DMA_BIDIRECTIONAL) - panic("PCI-DMA: Memory would be corrupted\n"); - if (dir == PCI_DMA_TODEVICE || dir == PCI_DMA_BIDIRECTIONAL) - panic(KERN_ERR - "PCI-DMA: Random memory would be DMAed\n"); - } -#ifdef CONFIG_IOMMU_LEAK - dump_leak(); -#endif -} - -static inline int -need_iommu(struct device *dev, unsigned long addr, size_t size) -{ - return force_iommu || !dma_capable(dev, addr, size); -} - -static inline int -nonforced_iommu(struct device *dev, unsigned long addr, size_t size) -{ - return !dma_capable(dev, addr, size); -} - -/* Map a single continuous physical area into the IOMMU. - * Caller needs to check if the iommu is needed and flush. - */ -static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem, - size_t size, int dir, unsigned long align_mask) -{ - unsigned long npages = iommu_num_pages(phys_mem, size, PAGE_SIZE); - unsigned long iommu_page; - int i; - - if (unlikely(phys_mem + size > GART_MAX_PHYS_ADDR)) - return bad_dma_addr; - - iommu_page = alloc_iommu(dev, npages, align_mask); - if (iommu_page == -1) { - if (!nonforced_iommu(dev, phys_mem, size)) - return phys_mem; - if (panic_on_overflow) - panic("dma_map_area overflow %lu bytes\n", size); - iommu_full(dev, size, dir); - return bad_dma_addr; - } - - for (i = 0; i < npages; i++) { - iommu_gatt_base[iommu_page + i] = GPTE_ENCODE(phys_mem); - phys_mem += PAGE_SIZE; - } - return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK); -} - -/* Map a single area into the IOMMU */ -static dma_addr_t gart_map_page(struct device *dev, struct page *page, - unsigned long offset, size_t size, - enum dma_data_direction dir, - struct dma_attrs *attrs) -{ - unsigned long bus; - phys_addr_t paddr = page_to_phys(page) + offset; - - if (!dev) - dev = &x86_dma_fallback_dev; - - if (!need_iommu(dev, paddr, size)) - return paddr; - - bus = dma_map_area(dev, paddr, size, dir, 0); - flush_gart(); - - return bus; -} - -/* - * Free a DMA mapping. - */ -static void gart_unmap_page(struct device *dev, dma_addr_t dma_addr, - size_t size, enum dma_data_direction dir, - struct dma_attrs *attrs) -{ - unsigned long iommu_page; - int npages; - int i; - - if (dma_addr < iommu_bus_base + EMERGENCY_PAGES*PAGE_SIZE || - dma_addr >= iommu_bus_base + iommu_size) - return; - - iommu_page = (dma_addr - iommu_bus_base)>>PAGE_SHIFT; - npages = iommu_num_pages(dma_addr, size, PAGE_SIZE); - for (i = 0; i < npages; i++) { - iommu_gatt_base[iommu_page + i] = gart_unmapped_entry; - } - free_iommu(iommu_page, npages); -} - -/* - * Wrapper for pci_unmap_single working with scatterlists. - */ -static void gart_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, - enum dma_data_direction dir, struct dma_attrs *attrs) -{ - struct scatterlist *s; - int i; - - for_each_sg(sg, s, nents, i) { - if (!s->dma_length || !s->length) - break; - gart_unmap_page(dev, s->dma_address, s->dma_length, dir, NULL); - } -} - -/* Fallback for dma_map_sg in case of overflow */ -static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg, - int nents, int dir) -{ - struct scatterlist *s; - int i; - -#ifdef CONFIG_IOMMU_DEBUG - pr_debug("dma_map_sg overflow\n"); -#endif - - for_each_sg(sg, s, nents, i) { - unsigned long addr = sg_phys(s); - - if (nonforced_iommu(dev, addr, s->length)) { - addr = dma_map_area(dev, addr, s->length, dir, 0); - if (addr == bad_dma_addr) { - if (i > 0) - gart_unmap_sg(dev, sg, i, dir, NULL); - nents = 0; - sg[0].dma_length = 0; - break; - } - } - s->dma_address = addr; - s->dma_length = s->length; - } - flush_gart(); - - return nents; -} - -/* Map multiple scatterlist entries continuous into the first. */ -static int __dma_map_cont(struct device *dev, struct scatterlist *start, - int nelems, struct scatterlist *sout, - unsigned long pages) -{ - unsigned long iommu_start = alloc_iommu(dev, pages, 0); - unsigned long iommu_page = iommu_start; - struct scatterlist *s; - int i; - - if (iommu_start == -1) - return -1; - - for_each_sg(start, s, nelems, i) { - unsigned long pages, addr; - unsigned long phys_addr = s->dma_address; - - BUG_ON(s != start && s->offset); - if (s == start) { - sout->dma_address = iommu_bus_base; - sout->dma_address += iommu_page*PAGE_SIZE + s->offset; - sout->dma_length = s->length; - } else { - sout->dma_length += s->length; - } - - addr = phys_addr; - pages = iommu_num_pages(s->offset, s->length, PAGE_SIZE); - while (pages--) { - iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr); - addr += PAGE_SIZE; - iommu_page++; - } - } - BUG_ON(iommu_page - iommu_start != pages); - - return 0; -} - -static inline int -dma_map_cont(struct device *dev, struct scatterlist *start, int nelems, - struct scatterlist *sout, unsigned long pages, int need) -{ - if (!need) { - BUG_ON(nelems != 1); - sout->dma_address = start->dma_address; - sout->dma_length = start->length; - return 0; - } - return __dma_map_cont(dev, start, nelems, sout, pages); -} - -/* - * DMA map all entries in a scatterlist. - * Merge chunks that have page aligned sizes into a continuous mapping. - */ -static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, - enum dma_data_direction dir, struct dma_attrs *attrs) -{ - struct scatterlist *s, *ps, *start_sg, *sgmap; - int need = 0, nextneed, i, out, start; - unsigned long pages = 0; - unsigned int seg_size; - unsigned int max_seg_size; - - if (nents == 0) - return 0; - - if (!dev) - dev = &x86_dma_fallback_dev; - - out = 0; - start = 0; - start_sg = sg; - sgmap = sg; - seg_size = 0; - max_seg_size = dma_get_max_seg_size(dev); - ps = NULL; /* shut up gcc */ - - for_each_sg(sg, s, nents, i) { - dma_addr_t addr = sg_phys(s); - - s->dma_address = addr; - BUG_ON(s->length == 0); - - nextneed = need_iommu(dev, addr, s->length); - - /* Handle the previous not yet processed entries */ - if (i > start) { - /* - * Can only merge when the last chunk ends on a - * page boundary and the new one doesn't have an - * offset. - */ - if (!iommu_merge || !nextneed || !need || s->offset || - (s->length + seg_size > max_seg_size) || - (ps->offset + ps->length) % PAGE_SIZE) { - if (dma_map_cont(dev, start_sg, i - start, - sgmap, pages, need) < 0) - goto error; - out++; - - seg_size = 0; - sgmap = sg_next(sgmap); - pages = 0; - start = i; - start_sg = s; - } - } - - seg_size += s->length; - need = nextneed; - pages += iommu_num_pages(s->offset, s->length, PAGE_SIZE); - ps = s; - } - if (dma_map_cont(dev, start_sg, i - start, sgmap, pages, need) < 0) - goto error; - out++; - flush_gart(); - if (out < nents) { - sgmap = sg_next(sgmap); - sgmap->dma_length = 0; - } - return out; - -error: - flush_gart(); - gart_unmap_sg(dev, sg, out, dir, NULL); - - /* When it was forced or merged try again in a dumb way */ - if (force_iommu || iommu_merge) { - out = dma_map_sg_nonforce(dev, sg, nents, dir); - if (out > 0) - return out; - } - if (panic_on_overflow) - panic("dma_map_sg: overflow on %lu pages\n", pages); - - iommu_full(dev, pages << PAGE_SHIFT, dir); - for_each_sg(sg, s, nents, i) - s->dma_address = bad_dma_addr; - return 0; -} - -/* allocate and map a coherent mapping */ -static void * -gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr, - gfp_t flag) -{ - dma_addr_t paddr; - unsigned long align_mask; - struct page *page; - - if (force_iommu && !(flag & GFP_DMA)) { - flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32); - page = alloc_pages(flag | __GFP_ZERO, get_order(size)); - if (!page) - return NULL; - - align_mask = (1UL << get_order(size)) - 1; - paddr = dma_map_area(dev, page_to_phys(page), size, - DMA_BIDIRECTIONAL, align_mask); - - flush_gart(); - if (paddr != bad_dma_addr) { - *dma_addr = paddr; - return page_address(page); - } - __free_pages(page, get_order(size)); - } else - return dma_generic_alloc_coherent(dev, size, dma_addr, flag); - - return NULL; -} - -/* free a coherent mapping */ -static void -gart_free_coherent(struct device *dev, size_t size, void *vaddr, - dma_addr_t dma_addr) -{ - gart_unmap_page(dev, dma_addr, size, DMA_BIDIRECTIONAL, NULL); - free_pages((unsigned long)vaddr, get_order(size)); -} - -static int gart_mapping_error(struct device *dev, dma_addr_t dma_addr) -{ - return (dma_addr == bad_dma_addr); -} - -static int no_agp; - -static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size) -{ - unsigned long a; - - if (!iommu_size) { - iommu_size = aper_size; - if (!no_agp) - iommu_size /= 2; - } - - a = aper + iommu_size; - iommu_size -= round_up(a, PMD_PAGE_SIZE) - a; - - if (iommu_size < 64*1024*1024) { - pr_warning( - "PCI-DMA: Warning: Small IOMMU %luMB." - " Consider increasing the AGP aperture in BIOS\n", - iommu_size >> 20); - } - - return iommu_size; -} - -static __init unsigned read_aperture(struct pci_dev *dev, u32 *size) -{ - unsigned aper_size = 0, aper_base_32, aper_order; - u64 aper_base; - - pci_read_config_dword(dev, AMD64_GARTAPERTUREBASE, &aper_base_32); - pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &aper_order); - aper_order = (aper_order >> 1) & 7; - - aper_base = aper_base_32 & 0x7fff; - aper_base <<= 25; - - aper_size = (32 * 1024 * 1024) << aper_order; - if (aper_base + aper_size > 0x100000000UL || !aper_size) - aper_base = 0; - - *size = aper_size; - return aper_base; -} - -static void enable_gart_translations(void) -{ - int i; - - if (!amd_nb_has_feature(AMD_NB_GART)) - return; - - for (i = 0; i < amd_nb_num(); i++) { - struct pci_dev *dev = node_to_amd_nb(i)->misc; - - enable_gart_translation(dev, __pa(agp_gatt_table)); - } - - /* Flush the GART-TLB to remove stale entries */ - amd_flush_garts(); -} - -/* - * If fix_up_north_bridges is set, the north bridges have to be fixed up on - * resume in the same way as they are handled in gart_iommu_hole_init(). - */ -static bool fix_up_north_bridges; -static u32 aperture_order; -static u32 aperture_alloc; - -void set_up_gart_resume(u32 aper_order, u32 aper_alloc) -{ - fix_up_north_bridges = true; - aperture_order = aper_order; - aperture_alloc = aper_alloc; -} - -static void gart_fixup_northbridges(void) -{ - int i; - - if (!fix_up_north_bridges) - return; - - if (!amd_nb_has_feature(AMD_NB_GART)) - return; - - pr_info("PCI-DMA: Restoring GART aperture settings\n"); - - for (i = 0; i < amd_nb_num(); i++) { - struct pci_dev *dev = node_to_amd_nb(i)->misc; - - /* - * Don't enable translations just yet. That is the next - * step. Restore the pre-suspend aperture settings. - */ - gart_set_size_and_enable(dev, aperture_order); - pci_write_config_dword(dev, AMD64_GARTAPERTUREBASE, aperture_alloc >> 25); - } -} - -static void gart_resume(void) -{ - pr_info("PCI-DMA: Resuming GART IOMMU\n"); - - gart_fixup_northbridges(); - - enable_gart_translations(); -} - -static struct syscore_ops gart_syscore_ops = { - .resume = gart_resume, - -}; - -/* - * Private Northbridge GATT initialization in case we cannot use the - * AGP driver for some reason. - */ -static __init int init_amd_gatt(struct agp_kern_info *info) -{ - unsigned aper_size, gatt_size, new_aper_size; - unsigned aper_base, new_aper_base; - struct pci_dev *dev; - void *gatt; - int i; - - pr_info("PCI-DMA: Disabling AGP.\n"); - - aper_size = aper_base = info->aper_size = 0; - dev = NULL; - for (i = 0; i < amd_nb_num(); i++) { - dev = node_to_amd_nb(i)->misc; - new_aper_base = read_aperture(dev, &new_aper_size); - if (!new_aper_base) - goto nommu; - - if (!aper_base) { - aper_size = new_aper_size; - aper_base = new_aper_base; - } - if (aper_size != new_aper_size || aper_base != new_aper_base) - goto nommu; - } - if (!aper_base) - goto nommu; - - info->aper_base = aper_base; - info->aper_size = aper_size >> 20; - - gatt_size = (aper_size >> PAGE_SHIFT) * sizeof(u32); - gatt = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, - get_order(gatt_size)); - if (!gatt) - panic("Cannot allocate GATT table"); - if (set_memory_uc((unsigned long)gatt, gatt_size >> PAGE_SHIFT)) - panic("Could not set GART PTEs to uncacheable pages"); - - agp_gatt_table = gatt; - - register_syscore_ops(&gart_syscore_ops); - - flush_gart(); - - pr_info("PCI-DMA: aperture base @ %x size %u KB\n", - aper_base, aper_size>>10); - - return 0; - - nommu: - /* Should not happen anymore */ - pr_warning("PCI-DMA: More than 4GB of RAM and no IOMMU\n" - "falling back to iommu=soft.\n"); - return -1; -} - -static struct dma_map_ops gart_dma_ops = { - .map_sg = gart_map_sg, - .unmap_sg = gart_unmap_sg, - .map_page = gart_map_page, - .unmap_page = gart_unmap_page, - .alloc_coherent = gart_alloc_coherent, - .free_coherent = gart_free_coherent, - .mapping_error = gart_mapping_error, -}; - -static void gart_iommu_shutdown(void) -{ - struct pci_dev *dev; - int i; - - /* don't shutdown it if there is AGP installed */ - if (!no_agp) - return; - - if (!amd_nb_has_feature(AMD_NB_GART)) - return; - - for (i = 0; i < amd_nb_num(); i++) { - u32 ctl; - - dev = node_to_amd_nb(i)->misc; - pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl); - - ctl &= ~GARTEN; - - pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, ctl); - } -} - -int __init gart_iommu_init(void) -{ - struct agp_kern_info info; - unsigned long iommu_start; - unsigned long aper_base, aper_size; - unsigned long start_pfn, end_pfn; - unsigned long scratch; - long i; - - if (!amd_nb_has_feature(AMD_NB_GART)) - return 0; - -#ifndef CONFIG_AGP_AMD64 - no_agp = 1; -#else - /* Makefile puts PCI initialization via subsys_initcall first. */ - /* Add other AMD AGP bridge drivers here */ - no_agp = no_agp || - (agp_amd64_init() < 0) || - (agp_copy_info(agp_bridge, &info) < 0); -#endif - - if (no_iommu || - (!force_iommu && max_pfn <= MAX_DMA32_PFN) || - !gart_iommu_aperture || - (no_agp && init_amd_gatt(&info) < 0)) { - if (max_pfn > MAX_DMA32_PFN) { - pr_warning("More than 4GB of memory but GART IOMMU not available.\n"); - pr_warning("falling back to iommu=soft.\n"); - } - return 0; - } - - /* need to map that range */ - aper_size = info.aper_size << 20; - aper_base = info.aper_base; - end_pfn = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT); - - if (end_pfn > max_low_pfn_mapped) { - start_pfn = (aper_base>>PAGE_SHIFT); - init_memory_mapping(start_pfn<> PAGE_SHIFT; - - iommu_gart_bitmap = (void *) __get_free_pages(GFP_KERNEL | __GFP_ZERO, - get_order(iommu_pages/8)); - if (!iommu_gart_bitmap) - panic("Cannot allocate iommu bitmap\n"); - -#ifdef CONFIG_IOMMU_LEAK - if (leak_trace) { - int ret; - - ret = dma_debug_resize_entries(iommu_pages); - if (ret) - pr_debug("PCI-DMA: Cannot trace all the entries\n"); - } -#endif - - /* - * Out of IOMMU space handling. - * Reserve some invalid pages at the beginning of the GART. - */ - bitmap_set(iommu_gart_bitmap, 0, EMERGENCY_PAGES); - - pr_info("PCI-DMA: Reserving %luMB of IOMMU area in the AGP aperture\n", - iommu_size >> 20); - - agp_memory_reserved = iommu_size; - iommu_start = aper_size - iommu_size; - iommu_bus_base = info.aper_base + iommu_start; - bad_dma_addr = iommu_bus_base; - iommu_gatt_base = agp_gatt_table + (iommu_start>>PAGE_SHIFT); - - /* - * Unmap the IOMMU part of the GART. The alias of the page is - * always mapped with cache enabled and there is no full cache - * coherency across the GART remapping. The unmapping avoids - * automatic prefetches from the CPU allocating cache lines in - * there. All CPU accesses are done via the direct mapping to - * the backing memory. The GART address is only used by PCI - * devices. - */ - set_memory_np((unsigned long)__va(iommu_bus_base), - iommu_size >> PAGE_SHIFT); - /* - * Tricky. The GART table remaps the physical memory range, - * so the CPU wont notice potential aliases and if the memory - * is remapped to UC later on, we might surprise the PCI devices - * with a stray writeout of a cacheline. So play it sure and - * do an explicit, full-scale wbinvd() _after_ having marked all - * the pages as Not-Present: - */ - wbinvd(); - - /* - * Now all caches are flushed and we can safely enable - * GART hardware. Doing it early leaves the possibility - * of stale cache entries that can lead to GART PTE - * errors. - */ - enable_gart_translations(); - - /* - * Try to workaround a bug (thanks to BenH): - * Set unmapped entries to a scratch page instead of 0. - * Any prefetches that hit unmapped entries won't get an bus abort - * then. (P2P bridge may be prefetching on DMA reads). - */ - scratch = get_zeroed_page(GFP_KERNEL); - if (!scratch) - panic("Cannot allocate iommu scratch page"); - gart_unmapped_entry = GPTE_ENCODE(__pa(scratch)); - for (i = EMERGENCY_PAGES; i < iommu_pages; i++) - iommu_gatt_base[i] = gart_unmapped_entry; - - flush_gart(); - dma_ops = &gart_dma_ops; - x86_platform.iommu_shutdown = gart_iommu_shutdown; - swiotlb = 0; - - return 0; -} - -void __init gart_parse_options(char *p) -{ - int arg; - -#ifdef CONFIG_IOMMU_LEAK - if (!strncmp(p, "leak", 4)) { - leak_trace = 1; - p += 4; - if (*p == '=') - ++p; - if (isdigit(*p) && get_option(&p, &arg)) - iommu_leak_pages = arg; - } -#endif - if (isdigit(*p) && get_option(&p, &arg)) - iommu_size = arg; - if (!strncmp(p, "fullflush", 9)) - iommu_fullflush = 1; - if (!strncmp(p, "nofullflush", 11)) - iommu_fullflush = 0; - if (!strncmp(p, "noagp", 5)) - no_agp = 1; - if (!strncmp(p, "noaperture", 10)) - fix_aperture = 0; - /* duplicated from pci-dma.c */ - if (!strncmp(p, "force", 5)) - gart_iommu_aperture_allowed = 1; - if (!strncmp(p, "allowed", 7)) - gart_iommu_aperture_allowed = 1; - if (!strncmp(p, "memaper", 7)) { - fallback_aper_force = 1; - p += 7; - if (*p == '=') { - ++p; - if (get_option(&p, &arg)) - fallback_aper_order = arg; - } - } -} -IOMMU_INIT_POST(gart_iommu_hole_init); -- cgit v1.1 From d24339059d640f108c08ba99ef30e3bafa10f8e4 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 10 May 2011 17:35:58 +0200 Subject: fuse: fix oops in revalidate when called with NULL nameidata Some cases (e.g. ecryptfs) can call ->dentry_revalidate with NULL nameidata. https://bugzilla.kernel.org/show_bug.cgi?id=34732 Tyler Hicks pointed out that this bug was introduced by commit e7c0a16786 "fuse: make fuse_dentry_revalidate() RCU aware" Reported-by: Witold Baryluk Signed-off-by: Miklos Szeredi --- fs/fuse/dir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index c6ba49b..b32eb29 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -174,7 +174,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd) if (!inode) return 0; - if (nd->flags & LOOKUP_RCU) + if (nd && (nd->flags & LOOKUP_RCU)) return -ECHILD; fc = get_fuse_conn(inode); -- cgit v1.1 From 7e186bdd0098b34c69fb8067c67340ae610ea499 Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Fri, 6 May 2011 12:27:50 +0100 Subject: xen: do not clear and mask evtchns in __xen_evtchn_do_upcall Change the irq handler of evtchns and pirqs that don't need EOI (pirqs that correspond to physical edge interrupts) to handle_edge_irq. Use handle_fasteoi_irq for pirqs that need eoi (they generally correspond to level triggered irqs), no risk in loosing interrupts because we have to EOI the irq anyway. This change has the following benefits: - it uses the very same handlers that Linux would use on native for the same irqs (handle_edge_irq for edge irqs and msis, and handle_fasteoi_irq for everything else); - it uses these handlers in the same way native code would use them: it let Linux mask\unmask and ack the irq when Linux want to mask\unmask and ack the irq; - it fixes a problem occurring when a driver calls disable_irq() in its handler: the old code was unconditionally unmasking the evtchn even if the irq is disabled when irq_eoi was called. See Documentation/DocBook/genericirq.tmpl for more informations. Signed-off-by: Stefano Stabellini [v1: Fixed space/tab issues] Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/events.c | 113 +++++++++++++++++++++++++++++++++------------------ 1 file changed, 73 insertions(+), 40 deletions(-) diff --git a/drivers/xen/events.c b/drivers/xen/events.c index 33167b4..0ae1d4d 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -118,6 +118,8 @@ static DEFINE_PER_CPU(unsigned long [NR_EVENT_CHANNELS/BITS_PER_LONG], static struct irq_chip xen_dynamic_chip; static struct irq_chip xen_percpu_chip; static struct irq_chip xen_pirq_chip; +static void enable_dynirq(struct irq_data *data); +static void disable_dynirq(struct irq_data *data); /* Get info for IRQ */ static struct irq_info *info_for_irq(unsigned irq) @@ -473,16 +475,6 @@ static void xen_free_irq(unsigned irq) irq_free_desc(irq); } -static void pirq_unmask_notify(int irq) -{ - struct physdev_eoi eoi = { .irq = pirq_from_irq(irq) }; - - if (unlikely(pirq_needs_eoi(irq))) { - int rc = HYPERVISOR_physdev_op(PHYSDEVOP_eoi, &eoi); - WARN_ON(rc); - } -} - static void pirq_query_unmask(int irq) { struct physdev_irq_status_query irq_status; @@ -506,6 +498,29 @@ static bool probing_irq(int irq) return desc && desc->action == NULL; } +static void eoi_pirq(struct irq_data *data) +{ + int evtchn = evtchn_from_irq(data->irq); + struct physdev_eoi eoi = { .irq = pirq_from_irq(data->irq) }; + int rc = 0; + + irq_move_irq(data); + + if (VALID_EVTCHN(evtchn)) + clear_evtchn(evtchn); + + if (pirq_needs_eoi(data->irq)) { + rc = HYPERVISOR_physdev_op(PHYSDEVOP_eoi, &eoi); + WARN_ON(rc); + } +} + +static void mask_ack_pirq(struct irq_data *data) +{ + disable_dynirq(data); + eoi_pirq(data); +} + static unsigned int __startup_pirq(unsigned int irq) { struct evtchn_bind_pirq bind_pirq; @@ -539,7 +554,7 @@ static unsigned int __startup_pirq(unsigned int irq) out: unmask_evtchn(evtchn); - pirq_unmask_notify(irq); + eoi_pirq(irq_get_irq_data(irq)); return 0; } @@ -579,18 +594,7 @@ static void enable_pirq(struct irq_data *data) static void disable_pirq(struct irq_data *data) { -} - -static void ack_pirq(struct irq_data *data) -{ - int evtchn = evtchn_from_irq(data->irq); - - irq_move_irq(data); - - if (VALID_EVTCHN(evtchn)) { - mask_evtchn(evtchn); - clear_evtchn(evtchn); - } + disable_dynirq(data); } static int find_irq_by_gsi(unsigned gsi) @@ -639,9 +643,6 @@ int xen_bind_pirq_gsi_to_irq(unsigned gsi, if (irq < 0) goto out; - irq_set_chip_and_handler_name(irq, &xen_pirq_chip, handle_level_irq, - name); - irq_op.irq = irq; irq_op.vector = 0; @@ -658,6 +659,32 @@ int xen_bind_pirq_gsi_to_irq(unsigned gsi, xen_irq_info_pirq_init(irq, 0, pirq, gsi, irq_op.vector, shareable ? PIRQ_SHAREABLE : 0); + pirq_query_unmask(irq); + /* We try to use the handler with the appropriate semantic for the + * type of interrupt: if the interrupt doesn't need an eoi + * (pirq_needs_eoi returns false), we treat it like an edge + * triggered interrupt so we use handle_edge_irq. + * As a matter of fact this only happens when the corresponding + * physical interrupt is edge triggered or an msi. + * + * On the other hand if the interrupt needs an eoi (pirq_needs_eoi + * returns true) we treat it like a level triggered interrupt so we + * use handle_fasteoi_irq like the native code does for this kind of + * interrupts. + * Depending on the Xen version, pirq_needs_eoi might return true + * not only for level triggered interrupts but for edge triggered + * interrupts too. In any case Xen always honors the eoi mechanism, + * not injecting any more pirqs of the same kind if the first one + * hasn't received an eoi yet. Therefore using the fasteoi handler + * is the right choice either way. + */ + if (pirq_needs_eoi(irq)) + irq_set_chip_and_handler_name(irq, &xen_pirq_chip, + handle_fasteoi_irq, name); + else + irq_set_chip_and_handler_name(irq, &xen_pirq_chip, + handle_edge_irq, name); + out: spin_unlock(&irq_mapping_update_lock); @@ -690,8 +717,8 @@ int xen_bind_pirq_msi_to_irq(struct pci_dev *dev, struct msi_desc *msidesc, if (irq == -1) goto out; - irq_set_chip_and_handler_name(irq, &xen_pirq_chip, handle_level_irq, - name); + irq_set_chip_and_handler_name(irq, &xen_pirq_chip, handle_edge_irq, + name); xen_irq_info_pirq_init(irq, 0, pirq, 0, vector, 0); ret = irq_set_msi_desc(irq, msidesc); @@ -773,7 +800,7 @@ int bind_evtchn_to_irq(unsigned int evtchn) goto out; irq_set_chip_and_handler_name(irq, &xen_dynamic_chip, - handle_fasteoi_irq, "event"); + handle_edge_irq, "event"); xen_irq_info_evtchn_init(irq, evtchn); } @@ -1179,9 +1206,6 @@ static void __xen_evtchn_do_upcall(void) port = (word_idx * BITS_PER_LONG) + bit_idx; irq = evtchn_to_irq[port]; - mask_evtchn(port); - clear_evtchn(port); - if (irq != -1) { desc = irq_to_desc(irq); if (desc) @@ -1337,10 +1361,16 @@ static void ack_dynirq(struct irq_data *data) { int evtchn = evtchn_from_irq(data->irq); - irq_move_masked_irq(data); + irq_move_irq(data); if (VALID_EVTCHN(evtchn)) - unmask_evtchn(evtchn); + clear_evtchn(evtchn); +} + +static void mask_ack_dynirq(struct irq_data *data) +{ + disable_dynirq(data); + ack_dynirq(data); } static int retrigger_dynirq(struct irq_data *data) @@ -1535,7 +1565,9 @@ static struct irq_chip xen_dynamic_chip __read_mostly = { .irq_mask = disable_dynirq, .irq_unmask = enable_dynirq, - .irq_eoi = ack_dynirq, + .irq_ack = ack_dynirq, + .irq_mask_ack = mask_ack_dynirq, + .irq_set_affinity = set_affinity_irq, .irq_retrigger = retrigger_dynirq, }; @@ -1545,14 +1577,15 @@ static struct irq_chip xen_pirq_chip __read_mostly = { .irq_startup = startup_pirq, .irq_shutdown = shutdown_pirq, - .irq_enable = enable_pirq, - .irq_unmask = enable_pirq, - .irq_disable = disable_pirq, - .irq_mask = disable_pirq, - .irq_ack = ack_pirq, + .irq_mask = disable_dynirq, + .irq_unmask = enable_dynirq, + + .irq_ack = eoi_pirq, + .irq_eoi = eoi_pirq, + .irq_mask_ack = mask_ack_pirq, .irq_set_affinity = set_affinity_irq, -- cgit v1.1 From c54794d19e61472156e37263c074225574c80df1 Mon Sep 17 00:00:00 2001 From: David Daney Date: Tue, 28 Dec 2010 13:21:37 -0800 Subject: MIPS: Mask jump target in ftrace_dyn_arch_init_insns(). The current code is abusing the uasm interface by passing jump target addresses with high bits set. Mask the addresses to avoid annoying messages at boot time. Signed-off-by: David Daney Cc: Steven Rostedt Cc: Wu Zhangjin Patchwork: https://patchwork.linux-mips.org/patch/1922/ Signed-off-by: Ralf Baechle --- arch/mips/kernel/ftrace.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/mips/kernel/ftrace.c b/arch/mips/kernel/ftrace.c index 94ca2b0..feb8021 100644 --- a/arch/mips/kernel/ftrace.c +++ b/arch/mips/kernel/ftrace.c @@ -23,6 +23,7 @@ #define JAL 0x0c000000 /* jump & link: ip --> ra, jump to target */ #define ADDR_MASK 0x03ffffff /* op_code|addr : 31...26|25 ....0 */ +#define JUMP_RANGE_MASK ((1UL << 28) - 1) #define INSN_NOP 0x00000000 /* nop */ #define INSN_JAL(addr) \ @@ -44,12 +45,12 @@ static inline void ftrace_dyn_arch_init_insns(void) /* jal (ftrace_caller + 8), jump over the first two instruction */ buf = (u32 *)&insn_jal_ftrace_caller; - uasm_i_jal(&buf, (FTRACE_ADDR + 8)); + uasm_i_jal(&buf, (FTRACE_ADDR + 8) & JUMP_RANGE_MASK); #ifdef CONFIG_FUNCTION_GRAPH_TRACER /* j ftrace_graph_caller */ buf = (u32 *)&insn_j_ftrace_graph_caller; - uasm_i_j(&buf, (unsigned long)ftrace_graph_caller); + uasm_i_j(&buf, (unsigned long)ftrace_graph_caller & JUMP_RANGE_MASK); #endif } -- cgit v1.1 From 71271aab8cbdeb9612761db3230fe8dadb9a01c3 Mon Sep 17 00:00:00 2001 From: Ralf Baechle Date: Tue, 29 Mar 2011 10:50:38 +0200 Subject: MIPS: c-r4k: Fix GCC 4.6.0 build error CC arch/mips/mm/c-r4k.o arch/mips/mm/c-r4k.c: In function 'probe_scache': arch/mips/mm/c-r4k.c:1078:6: error: variable 'tmp' set but not used [-Werror=unused-but-set-variable] cc1: all warnings being treated as errors Older GCC versions didn't warn about the unused variable tmp because it was getting initialized. Signed-off-by: Ralf Baechle --- arch/mips/mm/c-r4k.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/mips/mm/c-r4k.c b/arch/mips/mm/c-r4k.c index b4923a7..71bddf8 100644 --- a/arch/mips/mm/c-r4k.c +++ b/arch/mips/mm/c-r4k.c @@ -1075,7 +1075,6 @@ static int __cpuinit probe_scache(void) unsigned long flags, addr, begin, end, pow2; unsigned int config = read_c0_config(); struct cpuinfo_mips *c = ¤t_cpu_data; - int tmp; if (config & CONF_SC) return 0; @@ -1108,7 +1107,6 @@ static int __cpuinit probe_scache(void) /* Now search for the wrap around point. */ pow2 = (128 * 1024); - tmp = 0; for (addr = begin + (128 * 1024); addr < end; addr = begin + pow2) { cache_op(Index_Load_Tag_SD, addr); __asm__ __volatile__("nop; nop; nop; nop;"); /* hazard... */ -- cgit v1.1 From 4a9040f451c32cd62971ecda1cb5bc4aed444c78 Mon Sep 17 00:00:00 2001 From: Ralf Baechle Date: Tue, 29 Mar 2011 10:54:54 +0200 Subject: MIPS: tlbex: Fix GCC 4.6.0 build error CC arch/mips/mm/tlbex.o arch/mips/mm/tlbex.c: In function 'build_r4000_tlb_refill_handler': arch/mips/mm/tlbex.c:1155:22: error: variable 'vmalloc_mode' set but not used [-Werror=unused-but-set-variable] arch/mips/mm/tlbex.c:1154:28: error: variable 'htlb_info' set but not used [-Werror=unused-but-set-variable] cc1: all warnings being treated as errors Signed-off-by: Ralf Baechle --- arch/mips/mm/tlbex.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/mips/mm/tlbex.c b/arch/mips/mm/tlbex.c index 5ef294f..f5734c2 100644 --- a/arch/mips/mm/tlbex.c +++ b/arch/mips/mm/tlbex.c @@ -1151,8 +1151,8 @@ static void __cpuinit build_r4000_tlb_refill_handler(void) struct uasm_reloc *r = relocs; u32 *f; unsigned int final_len; - struct mips_huge_tlb_info htlb_info; - enum vmalloc64_mode vmalloc_mode; + struct mips_huge_tlb_info htlb_info __maybe_unused; + enum vmalloc64_mode vmalloc_mode __maybe_unused; memset(tlb_handler, 0, sizeof(tlb_handler)); memset(labels, 0, sizeof(labels)); -- cgit v1.1 From 6fd78fc1fa3ed1e70501c978c2d0bef94320252f Mon Sep 17 00:00:00 2001 From: Ralf Baechle Date: Tue, 29 Mar 2011 11:00:44 +0200 Subject: MIPS: IP22: Fix GCC 4.6.0 build error CC arch/mips/sgi-ip22/ip22-time.o arch/mips/sgi-ip22/ip22-time.c: In function 'dosample': arch/mips/sgi-ip22/ip22-time.c:35:10: error: variable 'lsb' set but not used [-Werror=unused-but-set-variable] cc1: all warnings being treated as errors Signed-off-by: Ralf Baechle --- arch/mips/sgi-ip22/ip22-time.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/mips/sgi-ip22/ip22-time.c b/arch/mips/sgi-ip22/ip22-time.c index 603fc91..1a94c98 100644 --- a/arch/mips/sgi-ip22/ip22-time.c +++ b/arch/mips/sgi-ip22/ip22-time.c @@ -32,7 +32,7 @@ static unsigned long dosample(void) { u32 ct0, ct1; - u8 msb, lsb; + u8 msb; /* Start the counter. */ sgint->tcword = (SGINT_TCWORD_CNT2 | SGINT_TCWORD_CALL | @@ -46,7 +46,7 @@ static unsigned long dosample(void) /* Latch and spin until top byte of counter2 is zero */ do { writeb(SGINT_TCWORD_CNT2 | SGINT_TCWORD_CLAT, &sgint->tcword); - lsb = readb(&sgint->tcnt2); + (void) readb(&sgint->tcnt2); msb = readb(&sgint->tcnt2); ct1 = read_c0_count(); } while (msb); -- cgit v1.1 From 3be1afc8f64742552325d9f03c2b96339e822f9e Mon Sep 17 00:00:00 2001 From: Ralf Baechle Date: Tue, 29 Mar 2011 11:06:49 +0200 Subject: MIPS: IP22: Fix GCC 4.6.0 build error CC arch/mips/sgi-ip22/ip22-platform.o arch/mips/sgi-ip22/ip22-platform.c: In function 'sgiseeq_devinit': arch/mips/sgi-ip22/ip22-platform.c:135:15: error: variable 'tmp' set but not used [-Werror=unused-but-set-variable] cc1: all warnings being treated as errors While at it rename the variable to pbdma for readability; there is a local variable tmp of different type being used in two nested blocks. Signed-off-by: Ralf Baechle --- arch/mips/sgi-ip22/ip22-platform.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/mips/sgi-ip22/ip22-platform.c b/arch/mips/sgi-ip22/ip22-platform.c index deddbf0..698904d 100644 --- a/arch/mips/sgi-ip22/ip22-platform.c +++ b/arch/mips/sgi-ip22/ip22-platform.c @@ -132,7 +132,7 @@ static struct platform_device eth1_device = { */ static int __init sgiseeq_devinit(void) { - unsigned int tmp; + unsigned int pbdma __maybe_unused; int res, i; eth0_pd.hpc = hpc3c0; @@ -151,7 +151,7 @@ static int __init sgiseeq_devinit(void) /* Second HPC is missing? */ if (ip22_is_fullhouse() || - get_dbe(tmp, (unsigned int *)&hpc3c1->pbdma[1])) + get_dbe(pbdma, (unsigned int *)&hpc3c1->pbdma[1])) return 0; sgimc->giopar |= SGIMC_GIOPAR_MASTEREXP1 | SGIMC_GIOPAR_EXP164 | -- cgit v1.1 From af3a1f6f4813907e143f87030cde67a9971db533 Mon Sep 17 00:00:00 2001 From: Ralf Baechle Date: Tue, 29 Mar 2011 11:43:19 +0200 Subject: MIPS: Malta: Fix GCC 4.6.0 build error CC arch/mips/mti-malta/malta-init.o arch/mips/mti-malta/malta-init.c: In function 'prom_init': arch/mips/mti-malta/malta-init.c:196:6: error: variable 'result' set but not used [-Werror=unused-but-set-variable] cc1: all warnings being treated as errors Signed-off-by: Ralf Baechle --- arch/mips/mti-malta/malta-init.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/arch/mips/mti-malta/malta-init.c b/arch/mips/mti-malta/malta-init.c index 414f0c9..31180c3 100644 --- a/arch/mips/mti-malta/malta-init.c +++ b/arch/mips/mti-malta/malta-init.c @@ -193,8 +193,6 @@ extern struct plat_smp_ops msmtc_smp_ops; void __init prom_init(void) { - int result; - prom_argc = fw_arg0; _prom_argv = (int *) fw_arg1; _prom_envp = (int *) fw_arg2; @@ -360,20 +358,14 @@ void __init prom_init(void) #ifdef CONFIG_SERIAL_8250_CONSOLE console_config(); #endif - /* Early detection of CMP support */ - result = gcmp_probe(GCMP_BASE_ADDR, GCMP_ADDRSPACE_SZ); - #ifdef CONFIG_MIPS_CMP - if (result) + /* Early detection of CMP support */ + if (gcmp_probe(GCMP_BASE_ADDR, GCMP_ADDRSPACE_SZ)) register_smp_ops(&cmp_smp_ops); + else #endif #ifdef CONFIG_MIPS_MT_SMP -#ifdef CONFIG_MIPS_CMP - if (!result) register_smp_ops(&vsmp_smp_ops); -#else - register_smp_ops(&vsmp_smp_ops); -#endif #endif #ifdef CONFIG_MIPS_MT_SMTC register_smp_ops(&msmtc_smp_ops); -- cgit v1.1 From 6be63bbbdab66b9185dc6f67c8b1bacb6f37f946 Mon Sep 17 00:00:00 2001 From: Ralf Baechle Date: Tue, 29 Mar 2011 11:48:22 +0200 Subject: MIPS: Malta: Fix GCC 4.6.0 build error CC arch/mips/mti-malta/malta-int.o arch/mips/mti-malta/malta-int.c: In function 'mips_pcibios_iack': arch/mips/mti-malta/malta-int.c:59:6: error: variable 'dummy' set but not used [-Werror=unused-but-set-variable] cc1: all warnings being treated as errors Signed-off-by: Ralf Baechle --- arch/mips/mti-malta/malta-int.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/mips/mti-malta/malta-int.c b/arch/mips/mti-malta/malta-int.c index 9027061..e85c977 100644 --- a/arch/mips/mti-malta/malta-int.c +++ b/arch/mips/mti-malta/malta-int.c @@ -56,7 +56,6 @@ static DEFINE_RAW_SPINLOCK(mips_irq_lock); static inline int mips_pcibios_iack(void) { int irq; - u32 dummy; /* * Determine highest priority pending interrupt by performing @@ -83,7 +82,7 @@ static inline int mips_pcibios_iack(void) BONITO_PCIMAP_CFG = 0x20000; /* Flush Bonito register block */ - dummy = BONITO_PCIMAP_CFG; + (void) BONITO_PCIMAP_CFG; iob(); /* sync */ irq = __raw_readl((u32 *)_pcictrl_bonito_pcicfg); -- cgit v1.1 From 11b9d0eca559d087f3d49282033f2865cceacedd Mon Sep 17 00:00:00 2001 From: Ralf Baechle Date: Tue, 29 Mar 2011 11:57:11 +0200 Subject: MIPS: SNI: Fix GCC 4.6.0 build error CC arch/mips/sni/time.o arch/mips/sni/time.c: In function 'dosample': arch/mips/sni/time.c:98:19: error: variable 'lsb' set but not used [-Werror=unused-but-set-variable] cc1: all warnings being treated as errors Signed-off-by: Ralf Baechle --- arch/mips/sni/time.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/mips/sni/time.c b/arch/mips/sni/time.c index c76151b..0904d4d 100644 --- a/arch/mips/sni/time.c +++ b/arch/mips/sni/time.c @@ -95,7 +95,7 @@ static void __init sni_a20r_timer_setup(void) static __init unsigned long dosample(void) { u32 ct0, ct1; - volatile u8 msb, lsb; + volatile u8 msb; /* Start the counter. */ outb_p(0x34, 0x43); @@ -108,7 +108,7 @@ static __init unsigned long dosample(void) /* Latch and spin until top byte of counter0 is zero */ do { outb(0x00, 0x43); - lsb = inb(0x40); + (void) inb(0x40); msb = inb(0x40); ct1 = read_c0_count(); } while (msb); -- cgit v1.1 From 84d3b0dbac103fc1b3aff1e71cb723b5456a849c Mon Sep 17 00:00:00 2001 From: Ralf Baechle Date: Tue, 29 Mar 2011 12:09:51 +0200 Subject: MIPS: Jazz: Fix GCC 4.6.0 build error CC arch/mips/jazz/jazzdma.o arch/mips/jazz/jazzdma.c: In function 'vdma_remap': arch/mips/jazz/jazzdma.c:214:20: error: variable 'npages' set but not used [-Werror=unused-but-set-variable] cc1: all warnings being treated as errors Signed-off-by: Ralf Baechle --- arch/mips/jazz/jazzdma.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/mips/jazz/jazzdma.c b/arch/mips/jazz/jazzdma.c index 9ce9f64..2d8e447 100644 --- a/arch/mips/jazz/jazzdma.c +++ b/arch/mips/jazz/jazzdma.c @@ -211,7 +211,7 @@ EXPORT_SYMBOL(vdma_free); */ int vdma_remap(unsigned long laddr, unsigned long paddr, unsigned long size) { - int first, pages, npages; + int first, pages; if (laddr > 0xffffff) { if (vdma_debug) @@ -228,8 +228,7 @@ int vdma_remap(unsigned long laddr, unsigned long paddr, unsigned long size) return -EINVAL; /* invalid physical address */ } - npages = pages = - (((paddr & (VDMA_PAGESIZE - 1)) + size) >> 12) + 1; + pages = (((paddr & (VDMA_PAGESIZE - 1)) + size) >> 12) + 1; first = laddr >> 12; if (vdma_debug) printk("vdma_remap: first=%x, pages=%x\n", first, pages); -- cgit v1.1 From c87444af6fc853dd5571a830efff7e07c46a544e Mon Sep 17 00:00:00 2001 From: Ralf Baechle Date: Tue, 29 Mar 2011 12:32:55 +0200 Subject: MIPS: Loongson: Fix GCC 2.6.0 build error. CC arch/mips/loongson/common/env.o arch/mips/loongson/common/env.c: In function 'prom_init_env': arch/mips/loongson/common/env.c:50:12: error: variable 'ret' set but not used [-Werror=unused-but-set-variable] arch/mips/loongson/common/env.c:51:12: error: variable 'ret' set but not used [-Werror=unused-but-set-variable] arch/mips/loongson/common/env.c:52:12: error: variable 'ret' set but not used [-Werror=unused-but-set-variable] arch/mips/loongson/common/env.c:53:12: error: variable 'ret' set but not used [-Werror=unused-but-set-variable] cc1: all warnings being treated as errors Signed-off-by: Ralf Baechle --- arch/mips/loongson/common/env.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/mips/loongson/common/env.c b/arch/mips/loongson/common/env.c index 11b193f..d93830a 100644 --- a/arch/mips/loongson/common/env.c +++ b/arch/mips/loongson/common/env.c @@ -29,9 +29,10 @@ unsigned long memsize, highmemsize; #define parse_even_earlier(res, option, p) \ do { \ - int ret; \ + unsigned int tmp __maybe_unused; \ + \ if (strncmp(option, (char *)p, strlen(option)) == 0) \ - ret = strict_strtol((char *)p + strlen(option"="), 10, &res); \ + tmp = strict_strtol((char *)p + strlen(option"="), 10, &res); \ } while (0) void __init prom_init_env(void) -- cgit v1.1 From 088a42acc4f0e28fc6d8b823cafb03a00ff61aec Mon Sep 17 00:00:00 2001 From: Yoichi Yuasa Date: Tue, 29 Mar 2011 15:53:56 +0900 Subject: MIPS: MSP71xx: Fix typo in msp_per_irq_controller CC arch/mips/pmc-sierra/msp71xx/msp_irq_per.o arch/mips/pmc-sierra/msp71xx/msp_irq_per.c:101:2: error: expected identifier before '.' token make[2]: *** [arch/mips/pmc-sierra/msp71xx/msp_irq_per.o] Error 1 Signed-off-by: Yoichi Yuasa Patchwork: https://patchwork.linux-mips.org/patch/2246/ Cc: linux-mips Signed-off-by: Ralf Baechle --- arch/mips/pmc-sierra/msp71xx/msp_irq_per.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/mips/pmc-sierra/msp71xx/msp_irq_per.c b/arch/mips/pmc-sierra/msp71xx/msp_irq_per.c index f9b9dcd..98fd009 100644 --- a/arch/mips/pmc-sierra/msp71xx/msp_irq_per.c +++ b/arch/mips/pmc-sierra/msp71xx/msp_irq_per.c @@ -97,7 +97,7 @@ static int msp_per_irq_set_affinity(struct irq_data *d, static struct irq_chip msp_per_irq_controller = { .name = "MSP_PER", - .irq_enable = unmask_per_irq. + .irq_enable = unmask_per_irq, .irq_disable = mask_per_irq, .irq_ack = msp_per_irq_ack, #ifdef CONFIG_SMP -- cgit v1.1 From 866d7f5622cf5830b085a4471e67d4ed9106cb2e Mon Sep 17 00:00:00 2001 From: Ralf Baechle Date: Tue, 29 Mar 2011 16:09:25 +0200 Subject: MIPS: MSP: Fix build error Reported and original patch by Yoichi Yuasa . Signed-off-by: Ralf Baechle --- arch/mips/include/asm/cevt-r4k.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/mips/include/asm/cevt-r4k.h b/arch/mips/include/asm/cevt-r4k.h index fa4328f..65f9bdd 100644 --- a/arch/mips/include/asm/cevt-r4k.h +++ b/arch/mips/include/asm/cevt-r4k.h @@ -14,6 +14,9 @@ #ifndef __ASM_CEVT_R4K_H #define __ASM_CEVT_R4K_H +#include +#include + DECLARE_PER_CPU(struct clock_event_device, mips_clockevent_device); void mips_event_handler(struct clock_event_device *dev); -- cgit v1.1 From f8bec75acdadd3a6597fe0acb5c3161b71cc2ea0 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Tue, 29 Mar 2011 11:40:06 +0100 Subject: MIPS: Rename .data..mostly and properly handle it in linker script Signed-off-by: Ralf Baechle --- arch/mips/include/asm/cache.h | 2 +- arch/mips/kernel/vmlinux.lds.S | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/mips/include/asm/cache.h b/arch/mips/include/asm/cache.h index 650ac9b..b4db69f 100644 --- a/arch/mips/include/asm/cache.h +++ b/arch/mips/include/asm/cache.h @@ -17,6 +17,6 @@ #define SMP_CACHE_SHIFT L1_CACHE_SHIFT #define SMP_CACHE_BYTES L1_CACHE_BYTES -#define __read_mostly __attribute__((__section__(".data.read_mostly"))) +#define __read_mostly __attribute__((__section__(".data..read_mostly"))) #endif /* _ASM_CACHE_H */ diff --git a/arch/mips/kernel/vmlinux.lds.S b/arch/mips/kernel/vmlinux.lds.S index 832afbb..e4b0b0b 100644 --- a/arch/mips/kernel/vmlinux.lds.S +++ b/arch/mips/kernel/vmlinux.lds.S @@ -74,6 +74,7 @@ SECTIONS INIT_TASK_DATA(PAGE_SIZE) NOSAVE_DATA CACHELINE_ALIGNED_DATA(1 << CONFIG_MIPS_L1_CACHE_SHIFT) + READ_MOSTLY_DATA(1 << CONFIG_MIPS_L1_CACHE_SHIFT) DATA_DATA CONSTRUCTORS } -- cgit v1.1 From e3fb3f27a7600982478e1ec415bf265c744d2ae4 Mon Sep 17 00:00:00 2001 From: David Daney Date: Thu, 17 Feb 2011 14:04:33 -0800 Subject: MIPS: Octeon: Cleanup Kconfig IRQ_CPU* symbols. Octeon doesn't use IRQ_CPU, so don't select it. IRQ_CPU_OCTEON is a completely unused symbol, remove it completely. Signed-off-by: David Daney To: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/2086/ Signed-off-by: Ralf Baechle --- arch/mips/Kconfig | 5 ----- 1 file changed, 5 deletions(-) diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 8e256cc..351c80f 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -997,9 +997,6 @@ config IRQ_GT641XX config IRQ_GIC bool -config IRQ_CPU_OCTEON - bool - config MIPS_BOARDS_GEN bool @@ -1359,8 +1356,6 @@ config CPU_SB1 config CPU_CAVIUM_OCTEON bool "Cavium Octeon processor" depends on SYS_HAS_CPU_CAVIUM_OCTEON - select IRQ_CPU - select IRQ_CPU_OCTEON select CPU_HAS_PREFETCH select CPU_SUPPORTS_64BIT_KERNEL select SYS_SUPPORTS_SMP -- cgit v1.1 From 23a271ecdf463e5b0198f78b0a0d5763598972b1 Mon Sep 17 00:00:00 2001 From: David Daney Date: Thu, 17 Feb 2011 18:23:32 -0800 Subject: MIPS: Octeon: Guard the Kconfig body with CPU_CAVIUM_OCTEON Instead of making each Octeon specific option depend on CPU_CAVIUM_OCTEON, gate the body of the entire file with CPU_CAVIUM_OCTEON. With this change, CAVIUM_OCTEON_SPECIFIC_OPTIONS becomes useless, so get rid of it as well. Signed-off-by: David Daney To: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/2091/ Signed-off-by: Ralf Baechle --- arch/mips/cavium-octeon/Kconfig | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/arch/mips/cavium-octeon/Kconfig b/arch/mips/cavium-octeon/Kconfig index caae228..cad555e 100644 --- a/arch/mips/cavium-octeon/Kconfig +++ b/arch/mips/cavium-octeon/Kconfig @@ -1,11 +1,7 @@ -config CAVIUM_OCTEON_SPECIFIC_OPTIONS - bool "Enable Octeon specific options" - depends on CPU_CAVIUM_OCTEON - default "y" +if CPU_CAVIUM_OCTEON config CAVIUM_CN63XXP1 bool "Enable CN63XXP1 errata worarounds" - depends on CAVIUM_OCTEON_SPECIFIC_OPTIONS default "n" help The CN63XXP1 chip requires build time workarounds to @@ -16,7 +12,6 @@ config CAVIUM_CN63XXP1 config CAVIUM_OCTEON_2ND_KERNEL bool "Build the kernel to be used as a 2nd kernel on the same chip" - depends on CAVIUM_OCTEON_SPECIFIC_OPTIONS default "n" help This option configures this kernel to be linked at a different @@ -26,7 +21,6 @@ config CAVIUM_OCTEON_2ND_KERNEL config CAVIUM_OCTEON_HW_FIX_UNALIGNED bool "Enable hardware fixups of unaligned loads and stores" - depends on CAVIUM_OCTEON_SPECIFIC_OPTIONS default "y" help Configure the Octeon hardware to automatically fix unaligned loads @@ -38,7 +32,6 @@ config CAVIUM_OCTEON_HW_FIX_UNALIGNED config CAVIUM_OCTEON_CVMSEG_SIZE int "Number of L1 cache lines reserved for CVMSEG memory" - depends on CAVIUM_OCTEON_SPECIFIC_OPTIONS range 0 54 default 1 help @@ -50,7 +43,6 @@ config CAVIUM_OCTEON_CVMSEG_SIZE config CAVIUM_OCTEON_LOCK_L2 bool "Lock often used kernel code in the L2" - depends on CAVIUM_OCTEON_SPECIFIC_OPTIONS default "y" help Enable locking parts of the kernel into the L2 cache. @@ -93,7 +85,6 @@ config CAVIUM_OCTEON_LOCK_L2_MEMCPY config ARCH_SPARSEMEM_ENABLE def_bool y select SPARSEMEM_STATIC - depends on CPU_CAVIUM_OCTEON config CAVIUM_OCTEON_HELPER def_bool y @@ -107,6 +98,8 @@ config NEED_SG_DMA_LENGTH config SWIOTLB def_bool y - depends on CPU_CAVIUM_OCTEON select IOMMU_HELPER select NEED_SG_DMA_LENGTH + + +endif # CPU_CAVIUM_OCTEON -- cgit v1.1 From 7da34c1dac0db934913d0e81d2fd548e4973a326 Mon Sep 17 00:00:00 2001 From: Jonas Gorski Date: Fri, 8 Apr 2011 14:32:15 +0200 Subject: MIPS: bcm63xx: Fix header_crc comment in bcm963xx_tag.h The CRC32 actually includes the tag_version. Signed-off-by: Jonas Gorski Cc: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/2275/ Signed-off-by: Ralf Baechle --- arch/mips/include/asm/mach-bcm63xx/bcm963xx_tag.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/mips/include/asm/mach-bcm63xx/bcm963xx_tag.h b/arch/mips/include/asm/mach-bcm63xx/bcm963xx_tag.h index 32978d3..ed72e6a 100644 --- a/arch/mips/include/asm/mach-bcm63xx/bcm963xx_tag.h +++ b/arch/mips/include/asm/mach-bcm63xx/bcm963xx_tag.h @@ -88,7 +88,7 @@ struct bcm_tag { char kernel_crc[CRC_LEN]; /* 228-235: Unused at present */ char reserved1[8]; - /* 236-239: CRC32 of header excluding tagVersion */ + /* 236-239: CRC32 of header excluding last 20 bytes */ char header_crc[CRC_LEN]; /* 240-255: Unused at present */ char reserved2[16]; -- cgit v1.1 From a6ab5ca39404e04d46b1bae133cd059d84926a2d Mon Sep 17 00:00:00 2001 From: Ralf Baechle Date: Mon, 11 Apr 2011 11:37:15 +0200 Subject: MIPS: IP27: Fix GCC 4.6.0 build error. CC arch/mips/sgi-ip27/ip27-hubio.o arch/mips/sgi-ip27/ip27-hubio.c: In function 'hub_pio_map': arch/mips/sgi-ip27/ip27-hubio.c:32:20: error: variable 'junk' set but not used [-Werror=unused-but-set-variable] cc1: all warnings being treated as errors Signed-off-by: Ralf Baechle --- arch/mips/sgi-ip27/ip27-klnuma.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/mips/sgi-ip27/ip27-klnuma.c b/arch/mips/sgi-ip27/ip27-klnuma.c index c3d30a8..1d1919a 100644 --- a/arch/mips/sgi-ip27/ip27-klnuma.c +++ b/arch/mips/sgi-ip27/ip27-klnuma.c @@ -54,11 +54,8 @@ void __init setup_replication_mask(void) static __init void set_ktext_source(nasid_t client_nasid, nasid_t server_nasid) { - cnodeid_t client_cnode; kern_vars_t *kvp; - client_cnode = NASID_TO_COMPACT_NODEID(client_nasid); - kvp = &hub_data(client_nasid)->kern_vars; KERN_VARS_ADDR(client_nasid) = (unsigned long)kvp; -- cgit v1.1 From e12f47ef1680d8bd6449a8e4e98165d2590617eb Mon Sep 17 00:00:00 2001 From: Ralf Baechle Date: Mon, 11 Apr 2011 11:48:31 +0200 Subject: MIPS: IP27: Fix GCC 4.6.0 build error. CC arch/mips/sgi-ip27/ip27-hubio.o arch/mips/sgi-ip27/ip27-hubio.c: In function 'hub_pio_map': arch/mips/sgi-ip27/ip27-hubio.c:32:20: error: variable 'junk' set but not used [-Werror=unused-but-set-variable] cc1: all warnings being treated as errors Signed-off-by: Ralf Baechle --- arch/mips/sgi-ip27/ip27-hubio.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/mips/sgi-ip27/ip27-hubio.c b/arch/mips/sgi-ip27/ip27-hubio.c index a1fa4ab..cd0d5b0 100644 --- a/arch/mips/sgi-ip27/ip27-hubio.c +++ b/arch/mips/sgi-ip27/ip27-hubio.c @@ -29,7 +29,6 @@ unsigned long hub_pio_map(cnodeid_t cnode, xwidgetnum_t widget, unsigned long xtalk_addr, size_t size) { nasid_t nasid = COMPACT_TO_NASID_NODEID(cnode); - volatile hubreg_t junk; unsigned i; /* use small-window mapping if possible */ @@ -64,7 +63,7 @@ unsigned long hub_pio_map(cnodeid_t cnode, xwidgetnum_t widget, * after we write it. */ IIO_ITTE_PUT(nasid, i, HUB_PIO_MAP_TO_MEM, widget, xtalk_addr); - junk = HUB_L(IIO_ITTE_GET(nasid, i)); + (void) HUB_L(IIO_ITTE_GET(nasid, i)); return NODE_BWIN_BASE(nasid, widget) + (xtalk_addr % BWIN_SIZE); } -- cgit v1.1 From 8bdd51429da5aec173ab6f0e431b13ee6782a888 Mon Sep 17 00:00:00 2001 From: Ralf Baechle Date: Wed, 13 Apr 2011 20:50:46 +0200 Subject: MIPS: Document former use of timerfd(2) syscall number. Signed-off-by: Ralf Baechle --- arch/mips/kernel/scall32-o32.S | 2 +- arch/mips/kernel/scall64-64.S | 2 +- arch/mips/kernel/scall64-n32.S | 2 +- arch/mips/kernel/scall64-o32.S | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/mips/kernel/scall32-o32.S b/arch/mips/kernel/scall32-o32.S index 7f5468b..7f1377e 100644 --- a/arch/mips/kernel/scall32-o32.S +++ b/arch/mips/kernel/scall32-o32.S @@ -565,7 +565,7 @@ einval: li v0, -ENOSYS sys sys_ioprio_get 2 /* 4315 */ sys sys_utimensat 4 sys sys_signalfd 3 - sys sys_ni_syscall 0 + sys sys_ni_syscall 0 /* was timerfd */ sys sys_eventfd 1 sys sys_fallocate 6 /* 4320 */ sys sys_timerfd_create 2 diff --git a/arch/mips/kernel/scall64-64.S b/arch/mips/kernel/scall64-64.S index a2e1fcb..7c0ef7f 100644 --- a/arch/mips/kernel/scall64-64.S +++ b/arch/mips/kernel/scall64-64.S @@ -404,7 +404,7 @@ sys_call_table: PTR sys_ioprio_get PTR sys_utimensat /* 5275 */ PTR sys_signalfd - PTR sys_ni_syscall + PTR sys_ni_syscall /* was timerfd */ PTR sys_eventfd PTR sys_fallocate PTR sys_timerfd_create /* 5280 */ diff --git a/arch/mips/kernel/scall64-n32.S b/arch/mips/kernel/scall64-n32.S index b2c7624..de6c556 100644 --- a/arch/mips/kernel/scall64-n32.S +++ b/arch/mips/kernel/scall64-n32.S @@ -403,7 +403,7 @@ EXPORT(sysn32_call_table) PTR sys_ioprio_get PTR compat_sys_utimensat PTR compat_sys_signalfd /* 6280 */ - PTR sys_ni_syscall + PTR sys_ni_syscall /* was timerfd */ PTR sys_eventfd PTR sys_fallocate PTR sys_timerfd_create diff --git a/arch/mips/kernel/scall64-o32.S b/arch/mips/kernel/scall64-o32.S index 049a9c8..b0541dd 100644 --- a/arch/mips/kernel/scall64-o32.S +++ b/arch/mips/kernel/scall64-o32.S @@ -522,7 +522,7 @@ sys_call_table: PTR sys_ioprio_get /* 4315 */ PTR compat_sys_utimensat PTR compat_sys_signalfd - PTR sys_ni_syscall + PTR sys_ni_syscall /* was timerfd */ PTR sys_eventfd PTR sys32_fallocate /* 4320 */ PTR sys_timerfd_create -- cgit v1.1 From 403fbdff96057ad312b672408ec676782a802b74 Mon Sep 17 00:00:00 2001 From: Ralf Baechle Date: Wed, 13 Apr 2011 21:15:09 +0200 Subject: MIPS: Alchemy: Fix GCC 4.6.0 build error. CC arch/mips/alchemy/devboards/db1x00/board_setup.o arch/mips/alchemy/devboards/db1x00/board_setup.c: In function 'board_setup': arch/mips/alchemy/devboards/db1x00/board_setup.c:130:6: error: variable 'pin_func' set but not used [-Werror=unused-but-set-variable] Signed-off-by: Ralf Baechle --- arch/mips/alchemy/devboards/db1x00/board_setup.c | 61 +++++++++++++----------- 1 file changed, 33 insertions(+), 28 deletions(-) diff --git a/arch/mips/alchemy/devboards/db1x00/board_setup.c b/arch/mips/alchemy/devboards/db1x00/board_setup.c index 05f120f..5c956fe 100644 --- a/arch/mips/alchemy/devboards/db1x00/board_setup.c +++ b/arch/mips/alchemy/devboards/db1x00/board_setup.c @@ -127,13 +127,10 @@ const char *get_system_type(void) void __init board_setup(void) { unsigned long bcsr1, bcsr2; - u32 pin_func; bcsr1 = DB1000_BCSR_PHYS_ADDR; bcsr2 = DB1000_BCSR_PHYS_ADDR + DB1000_BCSR_HEXLED_OFS; - pin_func = 0; - #ifdef CONFIG_MIPS_DB1000 printk(KERN_INFO "AMD Alchemy Au1000/Db1000 Board\n"); #endif @@ -164,12 +161,16 @@ void __init board_setup(void) /* Not valid for Au1550 */ #if defined(CONFIG_IRDA) && \ (defined(CONFIG_SOC_AU1000) || defined(CONFIG_SOC_AU1100)) - /* Set IRFIRSEL instead of GPIO15 */ - pin_func = au_readl(SYS_PINFUNC) | SYS_PF_IRF; - au_writel(pin_func, SYS_PINFUNC); - /* Power off until the driver is in use */ - bcsr_mod(BCSR_RESETS, BCSR_RESETS_IRDA_MODE_MASK, - BCSR_RESETS_IRDA_MODE_OFF); + { + u32 pin_func; + + /* Set IRFIRSEL instead of GPIO15 */ + pin_func = au_readl(SYS_PINFUNC) | SYS_PF_IRF; + au_writel(pin_func, SYS_PINFUNC); + /* Power off until the driver is in use */ + bcsr_mod(BCSR_RESETS, BCSR_RESETS_IRDA_MODE_MASK, + BCSR_RESETS_IRDA_MODE_OFF); + } #endif bcsr_write(BCSR_PCMCIA, 0); /* turn off PCMCIA power */ @@ -177,31 +178,35 @@ void __init board_setup(void) alchemy_gpio1_input_enable(); #ifdef CONFIG_MIPS_MIRAGE - /* GPIO[20] is output */ - alchemy_gpio_direction_output(20, 0); + { + u32 pin_func; - /* Set GPIO[210:208] instead of SSI_0 */ - pin_func = au_readl(SYS_PINFUNC) | SYS_PF_S0; + /* GPIO[20] is output */ + alchemy_gpio_direction_output(20, 0); - /* Set GPIO[215:211] for LEDs */ - pin_func |= 5 << 2; + /* Set GPIO[210:208] instead of SSI_0 */ + pin_func = au_readl(SYS_PINFUNC) | SYS_PF_S0; - /* Set GPIO[214:213] for more LEDs */ - pin_func |= 5 << 12; + /* Set GPIO[215:211] for LEDs */ + pin_func |= 5 << 2; - /* Set GPIO[207:200] instead of PCMCIA/LCD */ - pin_func |= SYS_PF_LCD | SYS_PF_PC; - au_writel(pin_func, SYS_PINFUNC); + /* Set GPIO[214:213] for more LEDs */ + pin_func |= 5 << 12; - /* - * Enable speaker amplifier. This should - * be part of the audio driver. - */ - alchemy_gpio_direction_output(209, 1); + /* Set GPIO[207:200] instead of PCMCIA/LCD */ + pin_func |= SYS_PF_LCD | SYS_PF_PC; + au_writel(pin_func, SYS_PINFUNC); - pm_power_off = mirage_power_off; - _machine_halt = mirage_power_off; - _machine_restart = (void(*)(char *))mips_softreset; + /* + * Enable speaker amplifier. This should + * be part of the audio driver. + */ + alchemy_gpio_direction_output(209, 1); + + pm_power_off = mirage_power_off; + _machine_halt = mirage_power_off; + _machine_restart = (void(*)(char *))mips_softreset; + } #endif #ifdef CONFIG_MIPS_BOSPORUS -- cgit v1.1 From 893d20fbae483913250a5d8bd9b4ce861a3adf2a Mon Sep 17 00:00:00 2001 From: Ralf Baechle Date: Wed, 13 Apr 2011 21:49:54 +0200 Subject: MIPS: Fix calc_vmlinuz_load_addr build warnings. HOSTCC arch/mips/boot/compressed/calc_vmlinuz_load_addr arch/mips/boot/compressed/calc_vmlinuz_load_addr.c: In function 'main': arch/mips/boot/compressed/calc_vmlinuz_load_addr.c:35:2: warning: format '%llx' expects type 'long long unsigned int *', but argument 3 has type 'uint64_t *' arch/mips/boot/compressed/calc_vmlinuz_load_addr.c:54:2: warning: format '%llx' expects type 'long long unsigned int', but argument 2 has type 'uint64_t' Signed-off-by: Ralf Baechle --- arch/mips/boot/compressed/calc_vmlinuz_load_addr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/mips/boot/compressed/calc_vmlinuz_load_addr.c b/arch/mips/boot/compressed/calc_vmlinuz_load_addr.c index 88c9d963..9a62436 100644 --- a/arch/mips/boot/compressed/calc_vmlinuz_load_addr.c +++ b/arch/mips/boot/compressed/calc_vmlinuz_load_addr.c @@ -16,8 +16,8 @@ int main(int argc, char *argv[]) { + unsigned long long vmlinux_size, vmlinux_load_addr, vmlinuz_load_addr; struct stat sb; - uint64_t vmlinux_size, vmlinux_load_addr, vmlinuz_load_addr; if (argc != 3) { fprintf(stderr, "Usage: %s \n", -- cgit v1.1 From b20bff02b21ac7b725fd09590d5724d306552529 Mon Sep 17 00:00:00 2001 From: Ralf Baechle Date: Wed, 13 Apr 2011 23:51:23 +0200 Subject: MIPS: Audit: Fix success success argument pass to audit_syscall_exit Signed-off-by: Ralf Baechle --- arch/mips/kernel/ptrace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c index d21c388..584e6b5 100644 --- a/arch/mips/kernel/ptrace.c +++ b/arch/mips/kernel/ptrace.c @@ -540,8 +540,8 @@ asmlinkage void do_syscall_trace(struct pt_regs *regs, int entryexit) secure_computing(regs->regs[2]); if (unlikely(current->audit_context) && entryexit) - audit_syscall_exit(AUDITSC_RESULT(regs->regs[2]), - regs->regs[2]); + audit_syscall_exit(AUDITSC_RESULT(regs->regs[7]), + -regs->regs[2]); if (!(current->ptrace & PT_PTRACED)) goto out; -- cgit v1.1 From f1b6a5054c5c5c1770863b781de9b721fc99c3e3 Mon Sep 17 00:00:00 2001 From: Ralf Baechle Date: Mon, 18 Apr 2011 11:16:42 +0100 Subject: MIPS: JZ4740: Fix GCC 4.6.0 build error. CC arch/mips/jz4740/dma.o arch/mips/jz4740/dma.c: In function 'jz4740_dma_chan_irq': arch/mips/jz4740/dma.c:245:11: error: variable 'status' set but not used [-Werro r=unused-but-set-variable] Signed-off-by: Ralf Baechle --- arch/mips/jz4740/dma.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/mips/jz4740/dma.c b/arch/mips/jz4740/dma.c index 5ebe75a..d7feb89 100644 --- a/arch/mips/jz4740/dma.c +++ b/arch/mips/jz4740/dma.c @@ -242,9 +242,7 @@ EXPORT_SYMBOL_GPL(jz4740_dma_get_residue); static void jz4740_dma_chan_irq(struct jz4740_dma_chan *dma) { - uint32_t status; - - status = jz4740_dma_read(JZ_REG_DMA_STATUS_CTRL(dma->id)); + (void) jz4740_dma_read(JZ_REG_DMA_STATUS_CTRL(dma->id)); jz4740_dma_write_mask(JZ_REG_DMA_STATUS_CTRL(dma->id), 0, JZ_DMA_STATUS_CTRL_ENABLE | JZ_DMA_STATUS_CTRL_TRANSFER_DONE); -- cgit v1.1 From aa7ce1c3038814801c5d7712f7403b15fea5d77d Mon Sep 17 00:00:00 2001 From: Ralf Baechle Date: Mon, 18 Apr 2011 11:19:32 +0100 Subject: MIPS: JZ4740: Export symbols to the watchdog driver module MODPOST 356 modules ERROR: "jz4740_timer_disable_watchdog" [drivers/watchdog/jz4740_wdt.ko] undefine d! ERROR: "jz4740_timer_enable_watchdog" [drivers/watchdog/jz4740_wdt.ko] undefined ! make[1]: *** [__modpost] Error 1 Signed-off-by: Ralf Baechle --- arch/mips/jz4740/timer.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/mips/jz4740/timer.c b/arch/mips/jz4740/timer.c index b2c0151..654d5c3 100644 --- a/arch/mips/jz4740/timer.c +++ b/arch/mips/jz4740/timer.c @@ -27,11 +27,13 @@ void jz4740_timer_enable_watchdog(void) { writel(BIT(16), jz4740_timer_base + JZ_REG_TIMER_STOP_CLEAR); } +EXPORT_SYMBOL_GPL(jz4740_timer_enable_watchdog); void jz4740_timer_disable_watchdog(void) { writel(BIT(16), jz4740_timer_base + JZ_REG_TIMER_STOP_SET); } +EXPORT_SYMBOL_GPL(jz4740_timer_disable_watchdog); void __init jz4740_timer_init(void) { -- cgit v1.1 From 1e2bbde4afd97b8d6a3f1f6c7bf3b6a9d226ba2e Mon Sep 17 00:00:00 2001 From: Lars-Peter Clausen Date: Thu, 31 Mar 2011 20:52:20 +0200 Subject: MIPS: JZ4740: Set one-shot feature flag for the clockevent The code for supporting one-shot mode for the clockevent is already there, only the feature flag was not set. Setting the one-shot flag allows the kernel to run in tickless mode. Signed-off-by: Lars-Peter Clausen Cc: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/2261/ Signed-off-by: Ralf Baechle --- arch/mips/jz4740/time.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/mips/jz4740/time.c b/arch/mips/jz4740/time.c index fe01678..eaa853a 100644 --- a/arch/mips/jz4740/time.c +++ b/arch/mips/jz4740/time.c @@ -89,7 +89,7 @@ static int jz4740_clockevent_set_next(unsigned long evt, static struct clock_event_device jz4740_clockevent = { .name = "jz4740-timer", - .features = CLOCK_EVT_FEAT_PERIODIC, + .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT, .set_next_event = jz4740_clockevent_set_next, .set_mode = jz4740_clockevent_set_mode, .rating = 200, -- cgit v1.1 From f850548ef88e5ff9e40bae9e1a7140bef0653e6b Mon Sep 17 00:00:00 2001 From: Wu Zhangjin Date: Sun, 24 Apr 2011 05:56:59 +0800 Subject: MIPS: Hibernation: Fixes for PAGE_SIZE >= 64kb PAGE_SIZE >= 64kb (1 << 16) is too big to be the immediate of the addiu/daddiu instruction, so, use addu/daddu instruction instead. The following compiling error is fixed: AS arch/mips/power/hibernate.o arch/mips/power/hibernate.S: Assembler messages: arch/mips/power/hibernate.S:38: Error: expression out of range make[2]: *** [arch/mips/power/hibernate.o] Error 1 make[1]: *** [arch/mips/power] Error 2 Reported-by: Roman Mamedov Signed-off-by: Wu Zhangjin To: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/2313/ Signed-off-by: Ralf Baechle --- arch/mips/power/hibernate.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/mips/power/hibernate.S b/arch/mips/power/hibernate.S index dbb5c7b..f8a751c 100644 --- a/arch/mips/power/hibernate.S +++ b/arch/mips/power/hibernate.S @@ -35,7 +35,7 @@ LEAF(swsusp_arch_resume) 0: PTR_L t1, PBE_ADDRESS(t0) /* source */ PTR_L t2, PBE_ORIG_ADDRESS(t0) /* destination */ - PTR_ADDIU t3, t1, PAGE_SIZE + PTR_ADDU t3, t1, PAGE_SIZE 1: REG_L t8, (t1) REG_S t8, (t2) -- cgit v1.1 From 310f1303390758ee7688e350e117a7b50ba5fa05 Mon Sep 17 00:00:00 2001 From: David Daney Date: Wed, 27 Apr 2011 16:39:28 -0700 Subject: MIPS: Invalidate old TLB mappings when updating huge page PTEs. Without this, stale Icache or TLB entries may be used. Signed-off-by: David Daney To: linux-mips@linux-mips.org https://patchwork.linux-mips.org/patch/2318/ Signed-off-by: Ralf Baechle --- arch/mips/include/asm/hugetlb.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/mips/include/asm/hugetlb.h b/arch/mips/include/asm/hugetlb.h index f5e85601..c565b7c 100644 --- a/arch/mips/include/asm/hugetlb.h +++ b/arch/mips/include/asm/hugetlb.h @@ -70,6 +70,7 @@ static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { + flush_tlb_mm(vma->vm_mm); } static inline int huge_pte_none(pte_t pte) -- cgit v1.1 From 780914c3cf691a75a0b7fe89f4466eeff8058165 Mon Sep 17 00:00:00 2001 From: Manuel Lauss Date: Sat, 7 May 2011 13:55:19 +0200 Subject: MIPS: Alchemy: fix xxs1500 build error This fixes: alchemy/xxs1500/init.c: In function 'prom_init': alchemy/xxs1500/init.c:57:17: error: ignoring return value of 'kstrtoul', declared with attribute warn_unused_result Signed-off-by: Manuel Lauss Cc: Linux-MIPS Patchwork: https://patchwork.linux-mips.org/patch/2340/ Signed-off-by: Ralf Baechle --- arch/mips/alchemy/xxs1500/init.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/mips/alchemy/xxs1500/init.c b/arch/mips/alchemy/xxs1500/init.c index 15125c2..34a90a4 100644 --- a/arch/mips/alchemy/xxs1500/init.c +++ b/arch/mips/alchemy/xxs1500/init.c @@ -51,10 +51,9 @@ void __init prom_init(void) prom_init_cmdline(); memsize_str = prom_getenv("memsize"); - if (!memsize_str) + if (!memsize_str || strict_strtoul(memsize_str, 0, &memsize)) memsize = 0x04000000; - else - strict_strtoul(memsize_str, 0, &memsize); + add_memory_region(0, memsize, BOOT_MEM_RAM); } -- cgit v1.1 From 5db1c07ced19b2eec3a149a3c624d88e02e246ae Mon Sep 17 00:00:00 2001 From: Luciano Coelho Date: Tue, 3 May 2011 21:40:08 +0300 Subject: mac80211: don't start the dynamic ps timer if not associated When we are disconnecting, we set PS off, but this happens before we send the deauth/disassoc request. When the deauth/disassoc frames are sent, we trigger the dynamic ps timer, which then times out and turns PS back on. Thus, PS remains on after disconnecting, causing problems when associating again. This can be fixed by preventing the timer to start when we're not associated anymore. Signed-off-by: Luciano Coelho Signed-off-by: John W. Linville --- net/mac80211/tx.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index ce4596e..bd1224f 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -237,6 +237,10 @@ ieee80211_tx_h_dynamic_ps(struct ieee80211_tx_data *tx) &local->dynamic_ps_disable_work); } + /* Don't restart the timer if we're not disassociated */ + if (!ifmgd->associated) + return TX_CONTINUE; + mod_timer(&local->dynamic_ps_timer, jiffies + msecs_to_jiffies(local->hw.conf.dynamic_ps_timeout)); -- cgit v1.1 From 99aa55b66e3553e6f7212ec1104e0fac06cc558e Mon Sep 17 00:00:00 2001 From: Mohammed Shafi Shajakhan Date: Fri, 6 May 2011 20:43:11 +0530 Subject: ath9k: Fix a warning due to a queued work during S3 state during suspend/S3 state drv_flush is called from mac80211 irrespective of interface count. In ath9k we queue a work in ath9k_flush which we expect to be cancelled in the drv_stop call back. during suspend process mac80211 calls drv_stop only when the interface count(local->count) is non-zero. unfortunately when the network manager is enabled, drv_flush is called while drv_stop is not called as local->count reaches '0'. So fix this by simply checking for the device presence in the drv_flush call back in the driver before queueing work or anything else. this patch fixes the following WARNING Call Trace: [] warn_slowpath_common+0x72/0xa0 [] ? ieee80211_can_queue_work+0x39/0x50 [mac80211] [] ? ieee80211_can_queue_work+0x39/0x50 [mac80211] [] warn_slowpath_fmt+0x2b/0x30 [] ieee80211_can_queue_work+0x39/0x50 [mac80211] [] ieee80211_queue_delayed_work+0x21/0x50 [mac80211] [] ath_tx_complete_poll_work+0xb2/0x100 [ath9k] [] run_workqueue+0x8e/0x150 [] ? ath_tx_complete_poll_work+0x0/0x100 [ath9k] [] worker_thread+0x84/0xe0 [] ? autoremove_wake_function+0x0/0x50 [] ? worker_thread+0x0/0xe0 [] kthread+0x74/0x80 [] ? kthread+0x0/0x80 [] kernel_thread_helper+0x7/0x10 ---[ end trace 2aff81010df9215b ]--- Signed-off-by: Rajkumar Manoharan Signed-off-by: Mohammed Shafi Shajakhan Signed-off-by: John W. Linville --- drivers/net/wireless/ath/ath9k/main.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/net/wireless/ath/ath9k/main.c b/drivers/net/wireless/ath/ath9k/main.c index 17d04ff..1482fa6 100644 --- a/drivers/net/wireless/ath/ath9k/main.c +++ b/drivers/net/wireless/ath/ath9k/main.c @@ -2141,6 +2141,8 @@ static void ath9k_set_coverage_class(struct ieee80211_hw *hw, u8 coverage_class) static void ath9k_flush(struct ieee80211_hw *hw, bool drop) { struct ath_softc *sc = hw->priv; + struct ath_hw *ah = sc->sc_ah; + struct ath_common *common = ath9k_hw_common(ah); int timeout = 200; /* ms */ int i, j; @@ -2149,6 +2151,12 @@ static void ath9k_flush(struct ieee80211_hw *hw, bool drop) cancel_delayed_work_sync(&sc->tx_complete_work); + if (sc->sc_flags & SC_OP_INVALID) { + ath_dbg(common, ATH_DBG_ANY, "Device not present\n"); + mutex_unlock(&sc->mutex); + return; + } + if (drop) timeout = 1; -- cgit v1.1 From eb85de3f84868ca85703a23617b4079ce79a801e Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Sat, 7 May 2011 17:46:21 +0200 Subject: iwlegacy: fix IBSS mode crashes We should not switch to non-IBSS channels when working in IBSS mode, otherwise there are microcode errors, and after some time system crashes. This bug is only observable when software scan is used in IBSS mode, so should be considered as regression after: commit 0263aa45293838b514b8af674a03faf040991a90 Author: Stanislaw Gruszka Date: Tue Mar 29 11:24:21 2011 +0200 iwl3945: disable hw scan by default However IBSS mode check, which this patch add again, was removed by commit b2f30e8bdd8ef5f3b5a7ef9146509585a15347d3 Author: Johannes Berg Date: Thu Jan 21 07:32:20 2010 -0800 iwlwifi: remove IBSS channel sanity check That commit claim that mac80211 will not use non-IBSS channel in IBSS mode, what definitely is not true. Bug probably should be fixed in mac80211, but that will require more work, so better to apply that patch temporally, and provide proper mac80211 fix latter. Resolves: https://bugzilla.kernel.org/show_bug.cgi?id=34452 Reported-and-tested-by: Mikko Rapeli Cc: stable@kernel.org # 2.6.38.5+ Signed-off-by: Stanislaw Gruszka Signed-off-by: John W. Linville --- drivers/net/wireless/iwlegacy/iwl-core.c | 7 +++++++ drivers/net/wireless/iwlegacy/iwl-dev.h | 6 ++++++ 2 files changed, 13 insertions(+) diff --git a/drivers/net/wireless/iwlegacy/iwl-core.c b/drivers/net/wireless/iwlegacy/iwl-core.c index 2b08efb..dcbb2ef 100644 --- a/drivers/net/wireless/iwlegacy/iwl-core.c +++ b/drivers/net/wireless/iwlegacy/iwl-core.c @@ -2155,6 +2155,13 @@ int iwl_legacy_mac_config(struct ieee80211_hw *hw, u32 changed) goto set_ch_out; } + if (priv->iw_mode == NL80211_IFTYPE_ADHOC && + !iwl_legacy_is_channel_ibss(ch_info)) { + IWL_DEBUG_MAC80211(priv, "leave - not IBSS channel\n"); + ret = -EINVAL; + goto set_ch_out; + } + spin_lock_irqsave(&priv->lock, flags); for_each_context(priv, ctx) { diff --git a/drivers/net/wireless/iwlegacy/iwl-dev.h b/drivers/net/wireless/iwlegacy/iwl-dev.h index 9ee849d..f43ac1e 100644 --- a/drivers/net/wireless/iwlegacy/iwl-dev.h +++ b/drivers/net/wireless/iwlegacy/iwl-dev.h @@ -1411,6 +1411,12 @@ iwl_legacy_is_channel_passive(const struct iwl_channel_info *ch) return (!(ch->flags & EEPROM_CHANNEL_ACTIVE)) ? 1 : 0; } +static inline int +iwl_legacy_is_channel_ibss(const struct iwl_channel_info *ch) +{ + return (ch->flags & EEPROM_CHANNEL_IBSS) ? 1 : 0; +} + static inline void __iwl_legacy_free_pages(struct iwl_priv *priv, struct page *page) { -- cgit v1.1 From 2ae1b8b35faba31a59b153cbad07f9c15de99740 Mon Sep 17 00:00:00 2001 From: Paul Fox Date: Mon, 9 May 2011 10:40:42 +0100 Subject: libertas: fix cmdpendingq locking We occasionally see list corruption using libertas. While we haven't been able to diagnose this precisely, we have spotted a possible cause: cmdpendingq is generally modified with driver_lock held. However, there are a couple of points where this is not the case. Fix up those operations to execute under the lock, it seems like the correct thing to do and will hopefully improve the situation. Signed-off-by: Paul Fox Signed-off-by: Daniel Drake Acked-by: Dan Williams Cc: stable@kernel.org Signed-off-by: John W. Linville --- drivers/net/wireless/libertas/cmd.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/libertas/cmd.c b/drivers/net/wireless/libertas/cmd.c index 7e8a658..f3ac624 100644 --- a/drivers/net/wireless/libertas/cmd.c +++ b/drivers/net/wireless/libertas/cmd.c @@ -1339,8 +1339,8 @@ int lbs_execute_next_command(struct lbs_private *priv) cpu_to_le16(PS_MODE_ACTION_EXIT_PS)) { lbs_deb_host( "EXEC_NEXT_CMD: ignore ENTER_PS cmd\n"); - list_del(&cmdnode->list); spin_lock_irqsave(&priv->driver_lock, flags); + list_del(&cmdnode->list); lbs_complete_command(priv, cmdnode, 0); spin_unlock_irqrestore(&priv->driver_lock, flags); @@ -1352,8 +1352,8 @@ int lbs_execute_next_command(struct lbs_private *priv) (priv->psstate == PS_STATE_PRE_SLEEP)) { lbs_deb_host( "EXEC_NEXT_CMD: ignore EXIT_PS cmd in sleep\n"); - list_del(&cmdnode->list); spin_lock_irqsave(&priv->driver_lock, flags); + list_del(&cmdnode->list); lbs_complete_command(priv, cmdnode, 0); spin_unlock_irqrestore(&priv->driver_lock, flags); priv->needtowakeup = 1; @@ -1366,7 +1366,9 @@ int lbs_execute_next_command(struct lbs_private *priv) "EXEC_NEXT_CMD: sending EXIT_PS\n"); } } + spin_lock_irqsave(&priv->driver_lock, flags); list_del(&cmdnode->list); + spin_unlock_irqrestore(&priv->driver_lock, flags); lbs_deb_host("EXEC_NEXT_CMD: sending command 0x%04x\n", le16_to_cpu(cmd->command)); lbs_submit_command(priv, cmdnode); -- cgit v1.1 From 1f8e1cdac616e510eeb2dc2a9226bf597bc6cfd6 Mon Sep 17 00:00:00 2001 From: "Robert P. J. Day" Date: Sat, 7 May 2011 17:18:20 -0400 Subject: SYSFS: Fix erroneous comments for sysfs_update_group(). Fix what is clearly a simple copy-and-paste error in commenting the sysfs_update_group() routine. Signed-off-by: Robert P. J. Day Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/group.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c index c8769dc2..194414f 100644 --- a/fs/sysfs/group.c +++ b/fs/sysfs/group.c @@ -101,9 +101,9 @@ int sysfs_create_group(struct kobject *kobj, } /** - * sysfs_update_group - given a directory kobject, create an attribute group - * @kobj: The kobject to create the group on - * @grp: The attribute group to create + * sysfs_update_group - given a directory kobject, update an attribute group + * @kobj: The kobject to update the group on + * @grp: The attribute group to update * * This function updates an attribute group. Unlike * sysfs_create_group(), it will explicitly not warn or error if any -- cgit v1.1 From 55aee10dec477254241e4f72968f92e0543b33ad Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 10 May 2011 12:22:54 -0700 Subject: vlan: fix GVRP at dismantle time MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ip link add link eth2 eth2.103 type vlan id 103 gvrp on loose_binding on ip link set eth2.103 up rmmod tg3 # driver providing eth2 BUG: unable to handle kernel NULL pointer dereference at (null) IP: [] garp_request_leave+0x3e/0xc0 [garp] PGD 11d251067 PUD 11b9e0067 PMD 0 Oops: 0000 [#1] SMP last sysfs file: /sys/devices/virtual/net/eth2.104/ifindex CPU 0 Modules linked in: tg3(-) 8021q garp nfsd lockd auth_rpcgss sunrpc libphy sg [last unloaded: x_tables] Pid: 11494, comm: rmmod Tainted: G W 2.6.39-rc6-00261-gfd71257-dirty #580 HP ProLiant BL460c G6 RIP: 0010:[] [] garp_request_leave+0x3e/0xc0 [garp] RSP: 0018:ffff88007a19bae8 EFLAGS: 00010286 RAX: 0000000000000000 RBX: ffff88011b5e2000 RCX: 0000000000000002 RDX: 0000000000000000 RSI: 0000000000000175 RDI: ffffffffa0030d5b RBP: ffff88007a19bb18 R08: 0000000000000001 R09: ffff88011bd64a00 R10: ffff88011d34ec00 R11: 0000000000000000 R12: 0000000000000002 R13: ffff88007a19bc48 R14: ffff88007a19bb88 R15: 0000000000000001 FS: 0000000000000000(0000) GS:ffff88011fc00000(0063) knlGS:00000000f77d76c0 CS: 0010 DS: 002b ES: 002b CR0: 000000008005003b CR2: 0000000000000000 CR3: 000000011a675000 CR4: 00000000000006f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process rmmod (pid: 11494, threadinfo ffff88007a19a000, task ffff8800798595c0) Stack: ffff88007a19bb36 ffff88011c84b800 ffff88011b5e2000 ffff88007a19bc48 ffff88007a19bb88 0000000000000006 ffff88007a19bb38 ffffffffa003a5f6 ffff88007a19bb38 670088007a19bba8 ffff88007a19bb58 ffffffffa00397e7 Call Trace: [] vlan_gvrp_request_leave+0x46/0x50 [8021q] [] vlan_dev_stop+0xb7/0xc0 [8021q] [] __dev_close_many+0x87/0xe0 [] dev_close_many+0x87/0x110 [] rollback_registered_many+0xa0/0x240 [] unregister_netdevice_many+0x19/0x60 [] vlan_device_event+0x53b/0x550 [8021q] [] ? ip6mr_device_event+0xa8/0xd0 [] notifier_call_chain+0x53/0x80 [] __raw_notifier_call_chain+0x9/0x10 [] raw_notifier_call_chain+0x11/0x20 [] call_netdevice_notifiers+0x32/0x60 [] rollback_registered_many+0x10f/0x240 [] rollback_registered+0x2f/0x40 [] unregister_netdevice_queue+0x58/0x90 [] unregister_netdev+0x1b/0x30 [] tg3_remove_one+0x6f/0x10b [tg3] We should call vlan_gvrp_request_leave() from unregister_vlan_dev(), not from vlan_dev_stop(), because vlan_gvrp_uninit_applicant() is called right after unregister_netdevice_queue(). In batch mode, unregister_netdevice_queue() doesn’t immediately call vlan_dev_stop(). Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/8021q/vlan.c | 3 +++ net/8021q/vlan_dev.c | 3 --- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index 7850412..0eb1a88 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -124,6 +124,9 @@ void unregister_vlan_dev(struct net_device *dev, struct list_head *head) grp->nr_vlans--; + if (vlan->flags & VLAN_FLAG_GVRP) + vlan_gvrp_request_leave(dev); + vlan_group_set_device(grp, vlan_id, NULL); if (!grp->killall) synchronize_net(); diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index e34ea9e..b2ff6c8 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -487,9 +487,6 @@ static int vlan_dev_stop(struct net_device *dev) struct vlan_dev_info *vlan = vlan_dev_info(dev); struct net_device *real_dev = vlan->real_dev; - if (vlan->flags & VLAN_FLAG_GVRP) - vlan_gvrp_request_leave(dev); - dev_mc_unsync(real_dev, dev); dev_uc_unsync(real_dev, dev); if (dev->flags & IFF_ALLMULTI) -- cgit v1.1 From e14a599335427f81bbb0008963e59aa9c6449dce Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 10 May 2011 12:26:06 -0700 Subject: net: dev_close() should check IFF_UP Commit 443457242beb (factorize sync-rcu call in unregister_netdevice_many) mistakenly removed one test from dev_close() Following actions trigger a BUG : modprobe bonding modprobe dummy ifconfig bond0 up ifenslave bond0 dummy0 rmmod dummy dev_close() must not close a non IFF_UP device. With help from Frank Blaschka and Einar EL Lueck Reported-by: Frank Blaschka Reported-by: Einar EL Lueck Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index 856b6ee..9200944 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1284,11 +1284,13 @@ static int dev_close_many(struct list_head *head) */ int dev_close(struct net_device *dev) { - LIST_HEAD(single); + if (dev->flags & IFF_UP) { + LIST_HEAD(single); - list_add(&dev->unreg_list, &single); - dev_close_many(&single); - list_del(&single); + list_add(&dev->unreg_list, &single); + dev_close_many(&single); + list_del(&single); + } return 0; } EXPORT_SYMBOL(dev_close); -- cgit v1.1 From 43a4dea4c9d44baae38ddc14b9b6d86fde4c8b88 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Mon, 9 May 2011 19:36:38 +0000 Subject: xfrm: Assign the inner mode output function to the dst entry As it is, we assign the outer modes output function to the dst entry when we create the xfrm bundle. This leads to two problems on interfamily scenarios. We might insert ipv4 packets into ip6_fragment when called from xfrm6_output. The system crashes if we try to fragment an ipv4 packet with ip6_fragment. This issue was introduced with git commit ad0081e4 (ipv6: Fragment locally generated tunnel-mode IPSec6 packets as needed). The second issue is, that we might insert ipv4 packets in netfilter6 and vice versa on interfamily scenarios. With this patch we assign the inner mode output function to the dst entry when we create the xfrm bundle. So xfrm4_output/xfrm6_output from the inner mode is used and the right fragmentation and netfilter functions are called. We switch then to outer mode with the output_finish functions. Signed-off-by: Steffen Klassert Signed-off-by: David S. Miller --- include/net/xfrm.h | 3 +++ net/ipv4/xfrm4_output.c | 8 ++++++-- net/ipv4/xfrm4_state.c | 1 + net/ipv6/xfrm6_output.c | 6 +++--- net/ipv6/xfrm6_state.c | 1 + net/xfrm/xfrm_policy.c | 14 +++++++++++++- 6 files changed, 27 insertions(+), 6 deletions(-) diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 6ae4bc5..20afeaa 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -324,6 +324,7 @@ struct xfrm_state_afinfo { int (*tmpl_sort)(struct xfrm_tmpl **dst, struct xfrm_tmpl **src, int n); int (*state_sort)(struct xfrm_state **dst, struct xfrm_state **src, int n); int (*output)(struct sk_buff *skb); + int (*output_finish)(struct sk_buff *skb); int (*extract_input)(struct xfrm_state *x, struct sk_buff *skb); int (*extract_output)(struct xfrm_state *x, @@ -1454,6 +1455,7 @@ static inline int xfrm4_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi) extern int xfrm4_extract_output(struct xfrm_state *x, struct sk_buff *skb); extern int xfrm4_prepare_output(struct xfrm_state *x, struct sk_buff *skb); extern int xfrm4_output(struct sk_buff *skb); +extern int xfrm4_output_finish(struct sk_buff *skb); extern int xfrm4_tunnel_register(struct xfrm_tunnel *handler, unsigned short family); extern int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler, unsigned short family); extern int xfrm6_extract_header(struct sk_buff *skb); @@ -1470,6 +1472,7 @@ extern __be32 xfrm6_tunnel_spi_lookup(struct net *net, xfrm_address_t *saddr); extern int xfrm6_extract_output(struct xfrm_state *x, struct sk_buff *skb); extern int xfrm6_prepare_output(struct xfrm_state *x, struct sk_buff *skb); extern int xfrm6_output(struct sk_buff *skb); +extern int xfrm6_output_finish(struct sk_buff *skb); extern int xfrm6_find_1stfragopt(struct xfrm_state *x, struct sk_buff *skb, u8 **prevhdr); diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c index 571aa96..2d51840 100644 --- a/net/ipv4/xfrm4_output.c +++ b/net/ipv4/xfrm4_output.c @@ -69,7 +69,7 @@ int xfrm4_prepare_output(struct xfrm_state *x, struct sk_buff *skb) } EXPORT_SYMBOL(xfrm4_prepare_output); -static int xfrm4_output_finish(struct sk_buff *skb) +int xfrm4_output_finish(struct sk_buff *skb) { #ifdef CONFIG_NETFILTER if (!skb_dst(skb)->xfrm) { @@ -86,7 +86,11 @@ static int xfrm4_output_finish(struct sk_buff *skb) int xfrm4_output(struct sk_buff *skb) { + struct dst_entry *dst = skb_dst(skb); + struct xfrm_state *x = dst->xfrm; + return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, - NULL, skb_dst(skb)->dev, xfrm4_output_finish, + NULL, dst->dev, + x->outer_mode->afinfo->output_finish, !(IPCB(skb)->flags & IPSKB_REROUTED)); } diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c index 1717c64..805d63e 100644 --- a/net/ipv4/xfrm4_state.c +++ b/net/ipv4/xfrm4_state.c @@ -78,6 +78,7 @@ static struct xfrm_state_afinfo xfrm4_state_afinfo = { .init_tempsel = __xfrm4_init_tempsel, .init_temprop = xfrm4_init_temprop, .output = xfrm4_output, + .output_finish = xfrm4_output_finish, .extract_input = xfrm4_extract_input, .extract_output = xfrm4_extract_output, .transport_finish = xfrm4_transport_finish, diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c index 8e688b3..49a91c5f 100644 --- a/net/ipv6/xfrm6_output.c +++ b/net/ipv6/xfrm6_output.c @@ -79,7 +79,7 @@ int xfrm6_prepare_output(struct xfrm_state *x, struct sk_buff *skb) } EXPORT_SYMBOL(xfrm6_prepare_output); -static int xfrm6_output_finish(struct sk_buff *skb) +int xfrm6_output_finish(struct sk_buff *skb) { #ifdef CONFIG_NETFILTER IP6CB(skb)->flags |= IP6SKB_XFRM_TRANSFORMED; @@ -97,9 +97,9 @@ static int __xfrm6_output(struct sk_buff *skb) if ((x && x->props.mode == XFRM_MODE_TUNNEL) && ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) || dst_allfrag(skb_dst(skb)))) { - return ip6_fragment(skb, xfrm6_output_finish); + return ip6_fragment(skb, x->outer_mode->afinfo->output_finish); } - return xfrm6_output_finish(skb); + return x->outer_mode->afinfo->output_finish(skb); } int xfrm6_output(struct sk_buff *skb) diff --git a/net/ipv6/xfrm6_state.c b/net/ipv6/xfrm6_state.c index afe941e..248f0b2 100644 --- a/net/ipv6/xfrm6_state.c +++ b/net/ipv6/xfrm6_state.c @@ -178,6 +178,7 @@ static struct xfrm_state_afinfo xfrm6_state_afinfo = { .tmpl_sort = __xfrm6_tmpl_sort, .state_sort = __xfrm6_state_sort, .output = xfrm6_output, + .output_finish = xfrm6_output_finish, .extract_input = xfrm6_extract_input, .extract_output = xfrm6_extract_output, .transport_finish = xfrm6_transport_finish, diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 15792d8..b4d745e 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1406,6 +1406,7 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy, struct net *net = xp_net(policy); unsigned long now = jiffies; struct net_device *dev; + struct xfrm_mode *inner_mode; struct dst_entry *dst_prev = NULL; struct dst_entry *dst0 = NULL; int i = 0; @@ -1436,6 +1437,17 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy, goto put_states; } + if (xfrm[i]->sel.family == AF_UNSPEC) { + inner_mode = xfrm_ip2inner_mode(xfrm[i], + xfrm_af2proto(family)); + if (!inner_mode) { + err = -EAFNOSUPPORT; + dst_release(dst); + goto put_states; + } + } else + inner_mode = xfrm[i]->inner_mode; + if (!dst_prev) dst0 = dst1; else { @@ -1464,7 +1476,7 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy, dst1->lastuse = now; dst1->input = dst_discard; - dst1->output = xfrm[i]->outer_mode->afinfo->output; + dst1->output = inner_mode->afinfo->output; dst1->next = dst_prev; dst_prev = dst1; -- cgit v1.1 From 6fa5ddcc675b937f94d05628e8997c07a80c6cb9 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Mon, 9 May 2011 19:43:05 +0000 Subject: xfrm: Don't allow esn with disabled anti replay detection Unlike the standard case, disabled anti replay detection needs some nontrivial extra treatment on ESN. RFC 4303 states: Note: If a receiver chooses to not enable anti-replay for an SA, then the receiver SHOULD NOT negotiate ESN in an SA management protocol. Use of ESN creates a need for the receiver to manage the anti-replay window (in order to determine the correct value for the high-order bits of the ESN, which are employed in the ICV computation), which is generally contrary to the notion of disabling anti-replay for an SA. So return an error if an ESN state with disabled anti replay detection is inserted for now and add the extra treatment later if we need it. Signed-off-by: Steffen Klassert Signed-off-by: David S. Miller --- net/xfrm/xfrm_replay.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/xfrm/xfrm_replay.c b/net/xfrm/xfrm_replay.c index e8a7814..47f1b86 100644 --- a/net/xfrm/xfrm_replay.c +++ b/net/xfrm/xfrm_replay.c @@ -535,6 +535,9 @@ int xfrm_init_replay(struct xfrm_state *x) replay_esn->bmp_len * sizeof(__u32) * 8) return -EINVAL; + if ((x->props.flags & XFRM_STATE_ESN) && replay_esn->replay_window == 0) + return -EINVAL; + if ((x->props.flags & XFRM_STATE_ESN) && x->replay_esn) x->repl = &xfrm_replay_esn; else -- cgit v1.1 From aae1e743fee2b5523fb31ee050295f062cb26a31 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 9 May 2011 07:43:20 +0000 Subject: net/usb: mark LG VL600 LTE modem ethernet interface as WWAN Like other mobile broadband device ethernet interfaces, mark the LG VL600 with the 'wwan' devtype so userspace knows it needs additional configuration via the AT port before the interface can be used. Signed-off-by: Dan Williams Signed-off-by: David S. Miller --- drivers/net/usb/cdc_ether.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/usb/cdc_ether.c b/drivers/net/usb/cdc_ether.c index a301479..c924ea2 100644 --- a/drivers/net/usb/cdc_ether.c +++ b/drivers/net/usb/cdc_ether.c @@ -567,7 +567,7 @@ static const struct usb_device_id products [] = { { USB_DEVICE_AND_INTERFACE_INFO(0x1004, 0x61aa, USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), - .driver_info = 0, + .driver_info = (unsigned long)&wwan_info, }, /* -- cgit v1.1 From 0d4420a90b51abdea71585f571bad6d789ff8eb7 Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Tue, 10 May 2011 13:12:30 -0700 Subject: slcan: fix ldisc->open retval TTY layer expects 0 if the ldisc->open operation succeeded. Reported-by: Matvejchikov Ilya Signed-off-by: Oliver Hartkopp Signed-off-by: David S. Miller --- drivers/net/can/slcan.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/can/slcan.c b/drivers/net/can/slcan.c index b423965..1b49df6 100644 --- a/drivers/net/can/slcan.c +++ b/drivers/net/can/slcan.c @@ -583,7 +583,9 @@ static int slcan_open(struct tty_struct *tty) /* Done. We have linked the TTY line to a channel. */ rtnl_unlock(); tty->receive_room = 65536; /* We don't flow control */ - return sl->dev->base_addr; + + /* TTY layer expects 0 on success */ + return 0; err_free_chan: sl->tty = NULL; -- cgit v1.1 From 21a43e397e7f66d3be44e09b54045f1a67838cc0 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 10 May 2011 17:08:54 -0700 Subject: slub: Revert "[PARISC] slub: fix panic with DISCONTIGMEM" This reverts commit 4a5fa3590f09, which did not allow SLUB to be used on architectures that use DISCONTIGMEM without compiling NUMA support without CONFIG_BROKEN also set. The slub panic that it was intended to prevent is addressed by d9b41e0b54fd ("[PARISC] set memory ranges in N_NORMAL_MEMORY when onlined") on parisc so there is no further slub issues with such a configuration. The reverts allows SLUB now to be used on such architectures since there haven't been any reports of additional errors. Cc: James Bottomley Signed-off-by: David Rientjes Signed-off-by: Linus Torvalds --- init/Kconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/init/Kconfig b/init/Kconfig index d886b1e..7a71e0a 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1226,7 +1226,6 @@ config SLAB per cpu and per node queues. config SLUB - depends on BROKEN || NUMA || !DISCONTIGMEM bool "SLUB (Unqueued Allocator)" help SLUB is a slab allocator that minimizes cache line usage -- cgit v1.1 From 285e042dcd9fe9a9b7fbfdc215e4d076791ded8e Mon Sep 17 00:00:00 2001 From: Dave Airlie Date: Mon, 9 May 2011 14:54:33 +1000 Subject: drm/radeon: fix cayman struct accessors. We are accessing totally the wrong struct in this case, and putting uninitialised values into the GPU, which it doesn't like unsurprisingly. Signed-off-by: Dave Airlie --- drivers/gpu/drm/radeon/ni.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/radeon/ni.c b/drivers/gpu/drm/radeon/ni.c index 7aade20..e9e45ea 100644 --- a/drivers/gpu/drm/radeon/ni.c +++ b/drivers/gpu/drm/radeon/ni.c @@ -871,7 +871,7 @@ static void cayman_gpu_init(struct radeon_device *rdev) smx_dc_ctl0 = RREG32(SMX_DC_CTL0); smx_dc_ctl0 &= ~NUMBER_OF_SETS(0x1ff); - smx_dc_ctl0 |= NUMBER_OF_SETS(rdev->config.evergreen.sx_num_of_sets); + smx_dc_ctl0 |= NUMBER_OF_SETS(rdev->config.cayman.sx_num_of_sets); WREG32(SMX_DC_CTL0, smx_dc_ctl0); WREG32(SPI_CONFIG_CNTL_1, VTX_DONE_DELAY(4) | CRC_SIMD_ID_WADDR_DISABLE); @@ -887,20 +887,20 @@ static void cayman_gpu_init(struct radeon_device *rdev) WREG32(TA_CNTL_AUX, DISABLE_CUBE_ANISO); - WREG32(SX_EXPORT_BUFFER_SIZES, (COLOR_BUFFER_SIZE((rdev->config.evergreen.sx_max_export_size / 4) - 1) | - POSITION_BUFFER_SIZE((rdev->config.evergreen.sx_max_export_pos_size / 4) - 1) | - SMX_BUFFER_SIZE((rdev->config.evergreen.sx_max_export_smx_size / 4) - 1))); + WREG32(SX_EXPORT_BUFFER_SIZES, (COLOR_BUFFER_SIZE((rdev->config.cayman.sx_max_export_size / 4) - 1) | + POSITION_BUFFER_SIZE((rdev->config.cayman.sx_max_export_pos_size / 4) - 1) | + SMX_BUFFER_SIZE((rdev->config.cayman.sx_max_export_smx_size / 4) - 1))); - WREG32(PA_SC_FIFO_SIZE, (SC_PRIM_FIFO_SIZE(rdev->config.evergreen.sc_prim_fifo_size) | - SC_HIZ_TILE_FIFO_SIZE(rdev->config.evergreen.sc_hiz_tile_fifo_size) | - SC_EARLYZ_TILE_FIFO_SIZE(rdev->config.evergreen.sc_earlyz_tile_fifo_size))); + WREG32(PA_SC_FIFO_SIZE, (SC_PRIM_FIFO_SIZE(rdev->config.cayman.sc_prim_fifo_size) | + SC_HIZ_TILE_FIFO_SIZE(rdev->config.cayman.sc_hiz_tile_fifo_size) | + SC_EARLYZ_TILE_FIFO_SIZE(rdev->config.cayman.sc_earlyz_tile_fifo_size))); WREG32(VGT_NUM_INSTANCES, 1); WREG32(CP_PERFMON_CNTL, 0); - WREG32(SQ_MS_FIFO_SIZES, (CACHE_FIFO_SIZE(16 * rdev->config.evergreen.sq_num_cf_insts) | + WREG32(SQ_MS_FIFO_SIZES, (CACHE_FIFO_SIZE(16 * rdev->config.cayman.sq_num_cf_insts) | FETCH_FIFO_HIWATER(0x4) | DONE_FIFO_HIWATER(0xe0) | ALU_UPDATE_FIFO_HIWATER(0x8))); -- cgit v1.1 From 1f03128251b77bfc68d1578a4f11316eb3806238 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Tue, 10 May 2011 02:14:52 +0000 Subject: drm/radeon/kms: fix cayman acceleration The TCC disable setup was incorrect. This prevents the GPU from hanging when draw commands are issued. Signed-off-by: Alex Deucher Signed-off-by: Dave Airlie --- drivers/gpu/drm/radeon/ni.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/radeon/ni.c b/drivers/gpu/drm/radeon/ni.c index e9e45ea..3d8a763 100644 --- a/drivers/gpu/drm/radeon/ni.c +++ b/drivers/gpu/drm/radeon/ni.c @@ -674,7 +674,7 @@ static void cayman_gpu_init(struct radeon_device *rdev) cc_rb_backend_disable = RREG32(CC_RB_BACKEND_DISABLE); cc_gc_shader_pipe_config = RREG32(CC_GC_SHADER_PIPE_CONFIG); - cgts_tcc_disable = RREG32(CGTS_TCC_DISABLE); + cgts_tcc_disable = 0xff000000; gc_user_rb_backend_disable = RREG32(GC_USER_RB_BACKEND_DISABLE); gc_user_shader_pipe_config = RREG32(GC_USER_SHADER_PIPE_CONFIG); cgts_user_tcc_disable = RREG32(CGTS_USER_TCC_DISABLE); -- cgit v1.1 From 03a80665341bbb9a57064c2ddeca13b554d56893 Mon Sep 17 00:00:00 2001 From: Dave Airlie Date: Mon, 9 May 2011 02:24:04 +0000 Subject: drm/radeon/nouveau: fix build regression on alpha due to Xen changes. The Xen changes were using DMA_ERROR_CODE which isn't defined on a few platforms, however we reverted the Xen patch that caused use to try and use this code path earlier in 2.6.39 cycle, so for now lets just force the code to never take this path and allow it to build again on alpha. The proper long term answer is probably to store if the dma_addr has been assigned to alongside the dma_addr in the higher level code, though I think Thomas wanted to rewrite most of this anyways properly. Acked-by: Konrad Rzeszutek Wilk Signed-off-by: Dave Airlie --- drivers/gpu/drm/nouveau/nouveau_sgdma.c | 3 ++- drivers/gpu/drm/radeon/radeon_gart.c | 6 +++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/nouveau/nouveau_sgdma.c b/drivers/gpu/drm/nouveau/nouveau_sgdma.c index 4bce801..c77111e 100644 --- a/drivers/gpu/drm/nouveau/nouveau_sgdma.c +++ b/drivers/gpu/drm/nouveau/nouveau_sgdma.c @@ -42,7 +42,8 @@ nouveau_sgdma_populate(struct ttm_backend *be, unsigned long num_pages, nvbe->nr_pages = 0; while (num_pages--) { - if (dma_addrs[nvbe->nr_pages] != DMA_ERROR_CODE) { + /* this code path isn't called and is incorrect anyways */ + if (0) { /*dma_addrs[nvbe->nr_pages] != DMA_ERROR_CODE)*/ nvbe->pages[nvbe->nr_pages] = dma_addrs[nvbe->nr_pages]; nvbe->ttm_alloced[nvbe->nr_pages] = true; diff --git a/drivers/gpu/drm/radeon/radeon_gart.c b/drivers/gpu/drm/radeon/radeon_gart.c index 8a955bb..a533f52 100644 --- a/drivers/gpu/drm/radeon/radeon_gart.c +++ b/drivers/gpu/drm/radeon/radeon_gart.c @@ -181,9 +181,9 @@ int radeon_gart_bind(struct radeon_device *rdev, unsigned offset, p = t / (PAGE_SIZE / RADEON_GPU_PAGE_SIZE); for (i = 0; i < pages; i++, p++) { - /* On TTM path, we only use the DMA API if TTM_PAGE_FLAG_DMA32 - * is requested. */ - if (dma_addr[i] != DMA_ERROR_CODE) { + /* we reverted the patch using dma_addr in TTM for now but this + * code stops building on alpha so just comment it out for now */ + if (0) { /*dma_addr[i] != DMA_ERROR_CODE) */ rdev->gart.ttm_alloced[p] = true; rdev->gart.pages_addr[p] = dma_addr[i]; } else { -- cgit v1.1 From 557f447f21621de9c5447c8702c33b53279822ce Mon Sep 17 00:00:00 2001 From: Juergen Kilb Date: Thu, 14 Apr 2011 09:31:43 +0200 Subject: mfd: Fixed gpio polarity of omap-usb gpio USB-phy reset With commit 19403165 a main part of ehci-omap.c moved to drivers/mfd/omap-usb-host.c created by commit 17cdd29d. Due to this reorganisation the polarity used to reset the external USB phy changed and USB host doesn't recognize any devices. Signed-off-by: Juergen Kilb Acked-by: Felipe Balbi Tested-by: Steve Sakoman Signed-off-by: Samuel Ortiz --- drivers/mfd/omap-usb-host.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/mfd/omap-usb-host.c b/drivers/mfd/omap-usb-host.c index 2e16511..3ab9ffa 100644 --- a/drivers/mfd/omap-usb-host.c +++ b/drivers/mfd/omap-usb-host.c @@ -717,14 +717,14 @@ static int usbhs_enable(struct device *dev) gpio_request(pdata->ehci_data->reset_gpio_port[0], "USB1 PHY reset"); gpio_direction_output - (pdata->ehci_data->reset_gpio_port[0], 1); + (pdata->ehci_data->reset_gpio_port[0], 0); } if (gpio_is_valid(pdata->ehci_data->reset_gpio_port[1])) { gpio_request(pdata->ehci_data->reset_gpio_port[1], "USB2 PHY reset"); gpio_direction_output - (pdata->ehci_data->reset_gpio_port[1], 1); + (pdata->ehci_data->reset_gpio_port[1], 0); } /* Hold the PHY in RESET for enough time till DIR is high */ @@ -904,11 +904,11 @@ static int usbhs_enable(struct device *dev) if (gpio_is_valid(pdata->ehci_data->reset_gpio_port[0])) gpio_set_value - (pdata->ehci_data->reset_gpio_port[0], 0); + (pdata->ehci_data->reset_gpio_port[0], 1); if (gpio_is_valid(pdata->ehci_data->reset_gpio_port[1])) gpio_set_value - (pdata->ehci_data->reset_gpio_port[1], 0); + (pdata->ehci_data->reset_gpio_port[1], 1); } end_count: -- cgit v1.1 From a09aee8b636a3b2b7b10ad57d60d91e9272e771d Mon Sep 17 00:00:00 2001 From: Axel Lin Date: Thu, 14 Apr 2011 22:43:47 +0800 Subject: mfd: Fix asic3 build error Fix below compile error: CC drivers/mfd/asic3.o drivers/mfd/asic3.c: In function 'asic3_irq_demux': drivers/mfd/asic3.c:147: error: 'irq_data' undeclared (first use in this function) drivers/mfd/asic3.c:147: error: (Each undeclared identifier is reported only once drivers/mfd/asic3.c:147: error: for each function it appears in.) Signed-off-by: Axel Lin Signed-off-by: Samuel Ortiz --- drivers/mfd/asic3.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mfd/asic3.c b/drivers/mfd/asic3.c index d4a851c..0b4d5b2 100644 --- a/drivers/mfd/asic3.c +++ b/drivers/mfd/asic3.c @@ -144,7 +144,7 @@ static void asic3_irq_demux(unsigned int irq, struct irq_desc *desc) int iter, i; unsigned long flags; - data->chip->irq_ack(irq_data); + data->chip->irq_ack(data); for (iter = 0 ; iter < MAX_ASIC_ISR_LOOPS; iter++) { u32 status; -- cgit v1.1 From c62dd365e248222903e6e3e3a8f5d8587e7e3345 Mon Sep 17 00:00:00 2001 From: Lesly A M Date: Thu, 14 Apr 2011 17:57:49 +0530 Subject: mfd: Fix for the TWL4030 PM sleep/wakeup sequence Only configure sleep script when the flag is TWL4030_SLEEP_SCRIPT. Adding the missing brackets for fixing the issue. Signed-off-by: Lesly A M Cc: Nishanth Menon Cc: David Derrick Signed-off-by: Samuel Ortiz --- drivers/mfd/twl4030-power.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/mfd/twl4030-power.c b/drivers/mfd/twl4030-power.c index 16422de0..2c0d4d1 100644 --- a/drivers/mfd/twl4030-power.c +++ b/drivers/mfd/twl4030-power.c @@ -447,12 +447,13 @@ static int __init load_twl4030_script(struct twl4030_script *tscript, if (err) goto out; } - if (tscript->flags & TWL4030_SLEEP_SCRIPT) + if (tscript->flags & TWL4030_SLEEP_SCRIPT) { if (order) pr_warning("TWL4030: Bad order of scripts (sleep "\ "script before wakeup) Leads to boot"\ "failure on some boards\n"); err = twl4030_config_sleep_sequence(address); + } out: return err; } -- cgit v1.1 From 9bbeacf52f66d165739a4bbe9c018d17493a74b5 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 11 May 2011 13:06:13 +0200 Subject: kprobes, x86: Disable irqs during optimized callback Disable irqs during optimized callback, so we dont miss any in-irq kprobes. The following commands: # cd /debug/tracing/ # echo "p mutex_unlock" >> kprobe_events # echo "p _raw_spin_lock" >> kprobe_events # echo "p smp_apic_timer_interrupt" >> ./kprobe_events # echo 1 > events/enable Cause the optimized kprobes to be missed. None is missed with the fix applied. Signed-off-by: Jiri Olsa Acked-by: Masami Hiramatsu Link: http://lkml.kernel.org/r/20110511110613.GB2390@jolsa.brq.redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/kprobes.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index c969fd9..f1a6244d 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -1183,12 +1183,13 @@ static void __kprobes optimized_callback(struct optimized_kprobe *op, struct pt_regs *regs) { struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + unsigned long flags; /* This is possible if op is under delayed unoptimizing */ if (kprobe_disabled(&op->kp)) return; - preempt_disable(); + local_irq_save(flags); if (kprobe_running()) { kprobes_inc_nmissed_count(&op->kp); } else { @@ -1207,7 +1208,7 @@ static void __kprobes optimized_callback(struct optimized_kprobe *op, opt_pre_handler(&op->kp, regs); __this_cpu_write(current_kprobe, NULL); } - preempt_enable_no_resched(); + local_irq_restore(flags); } static int __kprobes copy_optimized_instructions(u8 *dest, u8 *src) -- cgit v1.1 From 7d8e18a69d9ebb8bf51748842929f8cc1ad61d49 Mon Sep 17 00:00:00 2001 From: Henry C Chang Date: Wed, 11 May 2011 10:29:52 +0000 Subject: ceph: print debug message before put mds session The mds session, s, could be freed during ceph_put_mds_session. Move dout before ceph_put_mds_session. Signed-off-by: Henry C Chang Signed-off-by: Sage Weil --- fs/ceph/mds_client.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index f60b07b..d0fae4c 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -3304,8 +3304,8 @@ static void con_put(struct ceph_connection *con) { struct ceph_mds_session *s = con->private; + dout("mdsc con_put %p (%d)\n", s, atomic_read(&s->s_ref) - 1); ceph_put_mds_session(s); - dout("mdsc con_put %p (%d)\n", s, atomic_read(&s->s_ref)); } /* -- cgit v1.1 From a26a185d27b49e1656b335ef8ad1a32f7a0e7d7f Mon Sep 17 00:00:00 2001 From: Henry C Chang Date: Wed, 11 May 2011 10:29:53 +0000 Subject: ceph: fix list_add in ceph_put_snap_realm Signed-off-by: Henry C Chang Signed-off-by: Sage Weil --- fs/ceph/snap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index e86ec11..24067d6 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -206,7 +206,7 @@ void ceph_put_snap_realm(struct ceph_mds_client *mdsc, up_write(&mdsc->snap_rwsem); } else { spin_lock(&mdsc->snap_empty_lock); - list_add(&mdsc->snap_empty, &realm->empty_item); + list_add(&realm->empty_item, &mdsc->snap_empty); spin_unlock(&mdsc->snap_empty_lock); } } -- cgit v1.1 From d3d0720d4a7a46e93e055e5b0f1a8bd612743ed6 Mon Sep 17 00:00:00 2001 From: Henry C Chang Date: Wed, 11 May 2011 10:29:54 +0000 Subject: ceph: do not use i_wrbuffer_ref as refcount for Fb cap We increments i_wrbuffer_ref when taking the Fb cap. This breaks the dirty page accounting and causes looping in __ceph_do_pending_vmtruncate, and ceph client hangs. This bug can be reproduced occasionally by running blogbench. Add a new field i_wb_ref to inode and dedicate it to Fb reference counting. Signed-off-by: Henry C Chang Signed-off-by: Sage Weil --- fs/ceph/caps.c | 16 ++++++++-------- fs/ceph/inode.c | 1 + fs/ceph/super.h | 2 +- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 9fa0866..2a5404c 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -819,7 +819,7 @@ int __ceph_caps_used(struct ceph_inode_info *ci) used |= CEPH_CAP_FILE_CACHE; if (ci->i_wr_ref) used |= CEPH_CAP_FILE_WR; - if (ci->i_wrbuffer_ref) + if (ci->i_wb_ref || ci->i_wrbuffer_ref) used |= CEPH_CAP_FILE_BUFFER; return used; } @@ -1990,11 +1990,11 @@ static void __take_cap_refs(struct ceph_inode_info *ci, int got) if (got & CEPH_CAP_FILE_WR) ci->i_wr_ref++; if (got & CEPH_CAP_FILE_BUFFER) { - if (ci->i_wrbuffer_ref == 0) + if (ci->i_wb_ref == 0) ihold(&ci->vfs_inode); - ci->i_wrbuffer_ref++; - dout("__take_cap_refs %p wrbuffer %d -> %d (?)\n", - &ci->vfs_inode, ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref); + ci->i_wb_ref++; + dout("__take_cap_refs %p wb %d -> %d (?)\n", + &ci->vfs_inode, ci->i_wb_ref-1, ci->i_wb_ref); } } @@ -2169,12 +2169,12 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had) if (--ci->i_rdcache_ref == 0) last++; if (had & CEPH_CAP_FILE_BUFFER) { - if (--ci->i_wrbuffer_ref == 0) { + if (--ci->i_wb_ref == 0) { last++; put++; } - dout("put_cap_refs %p wrbuffer %d -> %d (?)\n", - inode, ci->i_wrbuffer_ref+1, ci->i_wrbuffer_ref); + dout("put_cap_refs %p wb %d -> %d (?)\n", + inode, ci->i_wb_ref+1, ci->i_wb_ref); } if (had & CEPH_CAP_FILE_WR) if (--ci->i_wr_ref == 0) { diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 03d6daf..70b6a48 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -355,6 +355,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb) ci->i_rd_ref = 0; ci->i_rdcache_ref = 0; ci->i_wr_ref = 0; + ci->i_wb_ref = 0; ci->i_wrbuffer_ref = 0; ci->i_wrbuffer_ref_head = 0; ci->i_shared_gen = 0; diff --git a/fs/ceph/super.h b/fs/ceph/super.h index b1f1b8b..f5cabef 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -293,7 +293,7 @@ struct ceph_inode_info { /* held references to caps */ int i_pin_ref; - int i_rd_ref, i_rdcache_ref, i_wr_ref; + int i_rd_ref, i_rdcache_ref, i_wr_ref, i_wb_ref; int i_wrbuffer_ref, i_wrbuffer_ref_head; u32 i_shared_gen; /* increment each time we get FILE_SHARED */ u32 i_rdcache_gen; /* incremented each time we get FILE_CACHE. */ -- cgit v1.1 From c56b2ddd5ff4352cdb0df07eefba8068d043382e Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Tue, 10 May 2011 16:56:46 +0200 Subject: omap: iommu: Return IRQ_HANDLED in fault handler when no fault occured Commit d594f1f31afe13edd8c02f3854a65cc58cfb3b74 (omap: IOMMU: add support to callback during fault handling) broke interrupt line sharing between the OMAP3 ISP and its IOMMU. Because of this, every interrupt generated by the OMAP3 ISP is handled by the IOMMU driver instead of being passed to the OMAP3 ISP driver. Signed-off-by: Laurent Pinchart Acked-by: Hiroshi DOYU Signed-off-by: Tony Lindgren --- arch/arm/plat-omap/iommu.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm/plat-omap/iommu.c b/arch/arm/plat-omap/iommu.c index 8a51fd5..34fc31e 100644 --- a/arch/arm/plat-omap/iommu.c +++ b/arch/arm/plat-omap/iommu.c @@ -793,6 +793,8 @@ static irqreturn_t iommu_fault_handler(int irq, void *data) clk_enable(obj->clk); errs = iommu_report_fault(obj, &da); clk_disable(obj->clk); + if (errs == 0) + return IRQ_HANDLED; /* Fault callback or TLB/PTE Dynamic loading */ if (obj->isr && !obj->isr(obj, da, errs, obj->isr_priv)) -- cgit v1.1 From a8a4ae3a899a6c0b4771cc57884800d8b76a6996 Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Tue, 3 May 2011 13:43:03 -0400 Subject: NFSv41: Resend on NFS4ERR_RETRY_UNCACHED_REP Free the slot and resend the RPC with new session . For nfs4_async_handle_error, return -EAGAIN and set the task->tk_status to 0 to restart the async rpc in the rpc_restart_call_prepare state which resets the slot. For nfs4_handle_exception, retrying a call that uses nfs4_call_sync will reset the slot via nfs41_call_sync_prepare. For open/close/lock/locku/delegreturn/layoutcommit/unlink/rename/write cachethis is true, so these operations will not trigger an NFS4ERR_RETRY_UNCACHED_REP. Signed-off-by: Andy Adamson Signed-off-by: Trond Myklebust --- fs/nfs/nfs4filelayout.c | 2 ++ fs/nfs/nfs4proc.c | 6 ++++++ 2 files changed, 8 insertions(+) diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index 6f8192f..7841ea6 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -117,6 +117,8 @@ static int filelayout_async_handle_error(struct rpc_task *task, case -EKEYEXPIRED: rpc_delay(task, FILELAYOUT_POLL_RETRY_MAX); break; + case -NFS4ERR_RETRY_UNCACHED_REP: + break; default: dprintk("%s DS error. Retry through MDS %d\n", __func__, task->tk_status); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 69c0f3c..cf1b339 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -300,6 +300,7 @@ static int nfs4_handle_exception(struct nfs_server *server, int errorcode, struc ret = nfs4_delay(server->client, &exception->timeout); if (ret != 0) break; + case -NFS4ERR_RETRY_UNCACHED_REP: case -NFS4ERR_OLD_STATEID: exception->retry = 1; break; @@ -3695,6 +3696,7 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, rpc_delay(task, NFS4_POLL_RETRY_MAX); task->tk_status = 0; return -EAGAIN; + case -NFS4ERR_RETRY_UNCACHED_REP: case -NFS4ERR_OLD_STATEID: task->tk_status = 0; return -EAGAIN; @@ -4844,6 +4846,8 @@ static void nfs4_get_lease_time_done(struct rpc_task *task, void *calldata) dprintk("%s Retry: tk_status %d\n", __func__, task->tk_status); rpc_delay(task, NFS4_POLL_RETRY_MIN); task->tk_status = 0; + /* fall through */ + case -NFS4ERR_RETRY_UNCACHED_REP: nfs_restart_rpc(task, data->clp); return; } @@ -5479,6 +5483,8 @@ static int nfs41_reclaim_complete_handle_errors(struct rpc_task *task, struct nf break; case -NFS4ERR_DELAY: rpc_delay(task, NFS4_POLL_RETRY_MAX); + /* fall through */ + case -NFS4ERR_RETRY_UNCACHED_REP: return -EAGAIN; default: nfs4_schedule_lease_recovery(clp); -- cgit v1.1 From 2887fe45522843149ccf72e01f43813be4fb36c5 Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Wed, 11 May 2011 01:19:58 -0400 Subject: NFSv4.1: remove pnfs_layout_hdr from pnfs_destroy_all_layouts tmp_list Prevents an infinite loop as list was never emptied. Signed-off-by: Andy Adamson Signed-off-by: Trond Myklebust --- fs/nfs/pnfs.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index ff681ab..65455f5 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -383,6 +383,7 @@ pnfs_destroy_all_layouts(struct nfs_client *clp) plh_layouts); dprintk("%s freeing layout for inode %lu\n", __func__, lo->plh_inode->i_ino); + list_del_init(&lo->plh_layouts); pnfs_destroy_layout(NFS_I(lo->plh_inode)); } } -- cgit v1.1 From 87186475a402391a1ca7d42a675c9b35a18dc348 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 10 May 2011 21:09:53 +0200 Subject: PM: Fix warning in pm_restrict_gfp_mask() during SNAPSHOT_S2RAM ioctl A warning is printed by pm_restrict_gfp_mask() while the SNAPSHOT_S2RAM ioctl is being executed after creating a hibernation image, because pm_restrict_gfp_mask() has been called once already before the image creation and suspend_devices_and_enter() calls it once again. This happens after commit 452aa6999e6703ffbddd7f6ea124d3 (mm/pm: force GFP_NOIO during suspend/hibernation and resume). To avoid this issue, move pm_restrict_gfp_mask() and pm_restore_gfp_mask() from suspend_devices_and_enter() to its caller in kernel/power/suspend.c. Reported-by: Alexandre Felipe Muller de Souza Signed-off-by: Rafael J. Wysocki Cc: stable@kernel.org --- kernel/power/suspend.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 8935369..6275970 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -216,7 +216,6 @@ int suspend_devices_and_enter(suspend_state_t state) goto Close; } suspend_console(); - pm_restrict_gfp_mask(); suspend_test_start(); error = dpm_suspend_start(PMSG_SUSPEND); if (error) { @@ -233,7 +232,6 @@ int suspend_devices_and_enter(suspend_state_t state) suspend_test_start(); dpm_resume_end(PMSG_RESUME); suspend_test_finish("resume devices"); - pm_restore_gfp_mask(); resume_console(); Close: if (suspend_ops->end) @@ -294,7 +292,9 @@ int enter_state(suspend_state_t state) goto Finish; pr_debug("PM: Entering %s sleep\n", pm_states[state]); + pm_restrict_gfp_mask(); error = suspend_devices_and_enter(state); + pm_restore_gfp_mask(); Finish: pr_debug("PM: Finishing wakeup.\n"); -- cgit v1.1 From 9744997a8a2280e67984d4bffd87221d24f3b6b1 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 10 May 2011 21:10:01 +0200 Subject: PM / Hibernate: Make snapshot_release() restore GFP mask If the process using the hibernate user space interface closes /dev/snapshot after creating a hibernation image without thawing tasks, snapshot_release() should call pm_restore_gfp_mask() to restore the GFP mask used before the creation of the image. Make that happen. Tested-by: Alexandre Felipe Muller de Souza Signed-off-by: Rafael J. Wysocki Cc: stable@kernel.org --- kernel/power/user.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/power/user.c b/kernel/power/user.c index c36c3b9..6522be9 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -135,8 +135,10 @@ static int snapshot_release(struct inode *inode, struct file *filp) free_basic_memory_bitmaps(); data = filp->private_data; free_all_swap_pages(data->swap); - if (data->frozen) + if (data->frozen) { + pm_restore_gfp_mask(); thaw_processes(); + } pm_notifier_call_chain(data->mode == O_RDONLY ? PM_POST_HIBERNATION : PM_POST_RESTORE); atomic_inc(&snapshot_device_available); -- cgit v1.1 From 36cb7035ea0c11ef2c7fa2bbe0cd181b23569b29 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 10 May 2011 21:10:13 +0200 Subject: PM / Hibernate: Fix ioctl SNAPSHOT_S2RAM The SNAPSHOT_S2RAM ioctl used for implementing the feature allowing one to suspend to RAM after creating a hibernation image is currently broken, because it doesn't clear the "ready" flag in the struct snapshot_data object handled by it. As a result, the SNAPSHOT_UNFREEZE doesn't work correctly after SNAPSHOT_S2RAM has returned and the user space hibernate task cannot thaw the other processes as appropriate. Make SNAPSHOT_S2RAM clear data->ready to fix this problem. Tested-by: Alexandre Felipe Muller de Souza Signed-off-by: Rafael J. Wysocki Cc: stable@kernel.org --- kernel/power/user.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/power/user.c b/kernel/power/user.c index 6522be9..7d02d33 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -381,6 +381,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, * PM_HIBERNATION_PREPARE */ error = suspend_devices_and_enter(PM_SUSPEND_MEM); + data->ready = 0; break; case SNAPSHOT_PLATFORM_SUPPORT: -- cgit v1.1 From f25f4f522a9d2e18595da9df0bf1b6f282040e08 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 26 Apr 2011 19:14:35 +0200 Subject: PM / AVR32: Use struct syscore_ops instead of sysdevs for PM Convert some AVR32 architecture's code to using struct syscore_ops objects for power management instead of sysdev classes and sysdevs. This simplifies the code and reduces the kernel's memory footprint. It also is necessary for removing sysdevs from the kernel entirely in the future. Signed-off-by: Rafael J. Wysocki Acked-by: Greg Kroah-Hartman Acked-by: Hans-Christian Egtvedt --- arch/avr32/mach-at32ap/intc.c | 38 ++++++++++++-------------------------- 1 file changed, 12 insertions(+), 26 deletions(-) diff --git a/arch/avr32/mach-at32ap/intc.c b/arch/avr32/mach-at32ap/intc.c index 21ce35f..3e36461 100644 --- a/arch/avr32/mach-at32ap/intc.c +++ b/arch/avr32/mach-at32ap/intc.c @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include @@ -21,7 +21,6 @@ struct intc { void __iomem *regs; struct irq_chip chip; - struct sys_device sysdev; #ifdef CONFIG_PM unsigned long suspend_ipr; unsigned long saved_ipr[64]; @@ -146,9 +145,8 @@ void intc_set_suspend_handler(unsigned long offset) intc0.suspend_ipr = offset; } -static int intc_suspend(struct sys_device *sdev, pm_message_t state) +static int intc_suspend(void) { - struct intc *intc = container_of(sdev, struct intc, sysdev); int i; if (unlikely(!irqs_disabled())) { @@ -156,28 +154,25 @@ static int intc_suspend(struct sys_device *sdev, pm_message_t state) return -EINVAL; } - if (unlikely(!intc->suspend_ipr)) { + if (unlikely(!intc0.suspend_ipr)) { pr_err("intc_suspend: suspend_ipr not initialized\n"); return -EINVAL; } for (i = 0; i < 64; i++) { - intc->saved_ipr[i] = intc_readl(intc, INTPR0 + 4 * i); - intc_writel(intc, INTPR0 + 4 * i, intc->suspend_ipr); + intc0.saved_ipr[i] = intc_readl(&intc0, INTPR0 + 4 * i); + intc_writel(&intc0, INTPR0 + 4 * i, intc0.suspend_ipr); } return 0; } -static int intc_resume(struct sys_device *sdev) +static int intc_resume(void) { - struct intc *intc = container_of(sdev, struct intc, sysdev); int i; - WARN_ON(!irqs_disabled()); - for (i = 0; i < 64; i++) - intc_writel(intc, INTPR0 + 4 * i, intc->saved_ipr[i]); + intc_writel(&intc0, INTPR0 + 4 * i, intc0.saved_ipr[i]); return 0; } @@ -186,27 +181,18 @@ static int intc_resume(struct sys_device *sdev) #define intc_resume NULL #endif -static struct sysdev_class intc_class = { - .name = "intc", +static struct syscore_ops intc_syscore_ops = { .suspend = intc_suspend, .resume = intc_resume, }; -static int __init intc_init_sysdev(void) +static int __init intc_init_syscore(void) { - int ret; - - ret = sysdev_class_register(&intc_class); - if (ret) - return ret; + register_syscore_ops(&intc_syscore_ops); - intc0.sysdev.id = 0; - intc0.sysdev.cls = &intc_class; - ret = sysdev_register(&intc0.sysdev); - - return ret; + return 0; } -device_initcall(intc_init_sysdev); +device_initcall(intc_init_syscore); unsigned long intc_get_pending(unsigned int group) { -- cgit v1.1 From f98bf4aa39ecce96020631cba910be094c87ac3c Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 26 Apr 2011 19:14:47 +0200 Subject: PM / UNICORE32: Use struct syscore_ops instead of sysdevs for PM Make some UNICORE32 architecture's code use struct syscore_ops objects for power management instead of sysdev classes and sysdevs. This simplifies the code and reduces the kernel's memory footprint. It also is necessary for removing sysdevs from the kernel entirely in the future. Signed-off-by: Rafael J. Wysocki Acked-by: Greg Kroah-Hartman Acked-by: Guan Xuetao --- arch/unicore32/kernel/irq.c | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/arch/unicore32/kernel/irq.c b/arch/unicore32/kernel/irq.c index 2aa30a3..d4efa7d 100644 --- a/arch/unicore32/kernel/irq.c +++ b/arch/unicore32/kernel/irq.c @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include #include @@ -237,7 +237,7 @@ static struct puv3_irq_state { unsigned int iccr; } puv3_irq_state; -static int puv3_irq_suspend(struct sys_device *dev, pm_message_t state) +static int puv3_irq_suspend(void) { struct puv3_irq_state *st = &puv3_irq_state; @@ -265,7 +265,7 @@ static int puv3_irq_suspend(struct sys_device *dev, pm_message_t state) return 0; } -static int puv3_irq_resume(struct sys_device *dev) +static void puv3_irq_resume(void) { struct puv3_irq_state *st = &puv3_irq_state; @@ -278,27 +278,20 @@ static int puv3_irq_resume(struct sys_device *dev) writel(st->icmr, INTC_ICMR); } - return 0; } -static struct sysdev_class puv3_irq_sysclass = { - .name = "pkunity-irq", +static struct syscore_ops puv3_irq_syscore_ops = { .suspend = puv3_irq_suspend, .resume = puv3_irq_resume, }; -static struct sys_device puv3_irq_device = { - .id = 0, - .cls = &puv3_irq_sysclass, -}; - -static int __init puv3_irq_init_devicefs(void) +static int __init puv3_irq_init_syscore(void) { - sysdev_class_register(&puv3_irq_sysclass); - return sysdev_register(&puv3_irq_device); + register_syscore_ops(&puv3_irq_syscore_ops); + return 0; } -device_initcall(puv3_irq_init_devicefs); +device_initcall(puv3_irq_init_syscore); void __init init_IRQ(void) { -- cgit v1.1 From f5a592f7d74e38c5007876c731e6bf5580072e63 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 26 Apr 2011 19:14:57 +0200 Subject: PM / PowerPC: Use struct syscore_ops instead of sysdevs for PM Make some PowerPC architecture's code use struct syscore_ops objects for power management instead of sysdev classes and sysdevs. This simplifies the code and reduces the kernel's memory footprint. It also is necessary for removing sysdevs from the kernel entirely in the future. Signed-off-by: Rafael J. Wysocki Acked-by: Greg Kroah-Hartman --- arch/powerpc/include/asm/mpic.h | 3 --- arch/powerpc/platforms/cell/spu_base.c | 28 +++++++++++++------- arch/powerpc/platforms/powermac/pic.c | 42 ++++++++--------------------- arch/powerpc/sysdev/ipic.c | 36 +++++++------------------ arch/powerpc/sysdev/mpic.c | 48 ++++++++++++++++++---------------- 5 files changed, 64 insertions(+), 93 deletions(-) diff --git a/arch/powerpc/include/asm/mpic.h b/arch/powerpc/include/asm/mpic.h index 7005ee0..49baddc 100644 --- a/arch/powerpc/include/asm/mpic.h +++ b/arch/powerpc/include/asm/mpic.h @@ -3,7 +3,6 @@ #ifdef __KERNEL__ #include -#include #include #include @@ -320,8 +319,6 @@ struct mpic /* link */ struct mpic *next; - struct sys_device sysdev; - #ifdef CONFIG_PM struct mpic_irq_save *save_data; #endif diff --git a/arch/powerpc/platforms/cell/spu_base.c b/arch/powerpc/platforms/cell/spu_base.c index acfacce..3675da7 100644 --- a/arch/powerpc/platforms/cell/spu_base.c +++ b/arch/powerpc/platforms/cell/spu_base.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -521,18 +522,8 @@ void spu_init_channels(struct spu *spu) } EXPORT_SYMBOL_GPL(spu_init_channels); -static int spu_shutdown(struct sys_device *sysdev) -{ - struct spu *spu = container_of(sysdev, struct spu, sysdev); - - spu_free_irqs(spu); - spu_destroy_spu(spu); - return 0; -} - static struct sysdev_class spu_sysdev_class = { .name = "spu", - .shutdown = spu_shutdown, }; int spu_add_sysdev_attr(struct sysdev_attribute *attr) @@ -797,6 +788,22 @@ static inline void crash_register_spus(struct list_head *list) } #endif +static void spu_shutdown(void) +{ + struct spu *spu; + + mutex_lock(&spu_full_list_mutex); + list_for_each_entry(spu, &spu_full_list, full_list) { + spu_free_irqs(spu); + spu_destroy_spu(spu); + } + mutex_unlock(&spu_full_list_mutex); +} + +static struct syscore_ops spu_syscore_ops = { + .shutdown = spu_shutdown, +}; + static int __init init_spu_base(void) { int i, ret = 0; @@ -830,6 +837,7 @@ static int __init init_spu_base(void) crash_register_spus(&spu_full_list); mutex_unlock(&spu_full_list_mutex); spu_add_sysdev_attr(&attr_stat); + register_syscore_ops(&spu_syscore_ops); spu_init_affinity(); diff --git a/arch/powerpc/platforms/powermac/pic.c b/arch/powerpc/platforms/powermac/pic.c index 023f240..7c18a16 100644 --- a/arch/powerpc/platforms/powermac/pic.c +++ b/arch/powerpc/platforms/powermac/pic.c @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #include #include @@ -677,7 +677,7 @@ not_found: return viaint; } -static int pmacpic_suspend(struct sys_device *sysdev, pm_message_t state) +static int pmacpic_suspend(void) { int viaint = pmacpic_find_viaint(); @@ -698,7 +698,7 @@ static int pmacpic_suspend(struct sys_device *sysdev, pm_message_t state) return 0; } -static int pmacpic_resume(struct sys_device *sysdev) +static void pmacpic_resume(void) { int i; @@ -709,39 +709,19 @@ static int pmacpic_resume(struct sys_device *sysdev) for (i = 0; i < max_real_irqs; ++i) if (test_bit(i, sleep_save_mask)) pmac_unmask_irq(irq_get_irq_data(i)); - - return 0; } -#endif /* CONFIG_PM && CONFIG_PPC32 */ - -static struct sysdev_class pmacpic_sysclass = { - .name = "pmac_pic", +static struct syscore_ops pmacpic_syscore_ops = { + .suspend = pmacpic_suspend, + .resume = pmacpic_resume, }; -static struct sys_device device_pmacpic = { - .id = 0, - .cls = &pmacpic_sysclass, -}; - -static struct sysdev_driver driver_pmacpic = { -#if defined(CONFIG_PM) && defined(CONFIG_PPC32) - .suspend = &pmacpic_suspend, - .resume = &pmacpic_resume, -#endif /* CONFIG_PM && CONFIG_PPC32 */ -}; - -static int __init init_pmacpic_sysfs(void) +static int __init init_pmacpic_syscore(void) { -#ifdef CONFIG_PPC32 - if (max_irqs == 0) - return -ENODEV; -#endif - printk(KERN_DEBUG "Registering pmac pic with sysfs...\n"); - sysdev_class_register(&pmacpic_sysclass); - sysdev_register(&device_pmacpic); - sysdev_driver_register(&pmacpic_sysclass, &driver_pmacpic); + register_syscore_ops(&pmacpic_syscore_ops); return 0; } -machine_subsys_initcall(powermac, init_pmacpic_sysfs); +machine_subsys_initcall(powermac, init_pmacpic_syscore); + +#endif /* CONFIG_PM && CONFIG_PPC32 */ diff --git a/arch/powerpc/sysdev/ipic.c b/arch/powerpc/sysdev/ipic.c index fa438be..596554a 100644 --- a/arch/powerpc/sysdev/ipic.c +++ b/arch/powerpc/sysdev/ipic.c @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include #include #include @@ -902,7 +902,7 @@ static struct { u32 sercr; } ipic_saved_state; -static int ipic_suspend(struct sys_device *sdev, pm_message_t state) +static int ipic_suspend(void) { struct ipic *ipic = primary_ipic; @@ -933,7 +933,7 @@ static int ipic_suspend(struct sys_device *sdev, pm_message_t state) return 0; } -static int ipic_resume(struct sys_device *sdev) +static void ipic_resume(void) { struct ipic *ipic = primary_ipic; @@ -949,44 +949,26 @@ static int ipic_resume(struct sys_device *sdev) ipic_write(ipic->regs, IPIC_SECNR, ipic_saved_state.secnr); ipic_write(ipic->regs, IPIC_SERMR, ipic_saved_state.sermr); ipic_write(ipic->regs, IPIC_SERCR, ipic_saved_state.sercr); - - return 0; } #else #define ipic_suspend NULL #define ipic_resume NULL #endif -static struct sysdev_class ipic_sysclass = { - .name = "ipic", +static struct syscore_ops ipic_syscore_ops = { .suspend = ipic_suspend, .resume = ipic_resume, }; -static struct sys_device device_ipic = { - .id = 0, - .cls = &ipic_sysclass, -}; - -static int __init init_ipic_sysfs(void) +static int __init init_ipic_syscore(void) { - int rc; - if (!primary_ipic || !primary_ipic->regs) return -ENODEV; - printk(KERN_DEBUG "Registering ipic with sysfs...\n"); - rc = sysdev_class_register(&ipic_sysclass); - if (rc) { - printk(KERN_ERR "Failed registering ipic sys class\n"); - return -ENODEV; - } - rc = sysdev_register(&device_ipic); - if (rc) { - printk(KERN_ERR "Failed registering ipic sys device\n"); - return -ENODEV; - } + printk(KERN_DEBUG "Registering ipic system core operations\n"); + register_syscore_ops(&ipic_syscore_ops); + return 0; } -subsys_initcall(init_ipic_sysfs); +subsys_initcall(init_ipic_syscore); diff --git a/arch/powerpc/sysdev/mpic.c b/arch/powerpc/sysdev/mpic.c index f91c065..7e5dc8f 100644 --- a/arch/powerpc/sysdev/mpic.c +++ b/arch/powerpc/sysdev/mpic.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include @@ -1702,9 +1703,8 @@ void mpic_reset_core(int cpu) #endif /* CONFIG_SMP */ #ifdef CONFIG_PM -static int mpic_suspend(struct sys_device *dev, pm_message_t state) +static void mpic_suspend_one(struct mpic *mpic) { - struct mpic *mpic = container_of(dev, struct mpic, sysdev); int i; for (i = 0; i < mpic->num_sources; i++) { @@ -1713,13 +1713,22 @@ static int mpic_suspend(struct sys_device *dev, pm_message_t state) mpic->save_data[i].dest = mpic_irq_read(i, MPIC_INFO(IRQ_DESTINATION)); } +} + +static int mpic_suspend(void) +{ + struct mpic *mpic = mpics; + + while (mpic) { + mpic_suspend_one(mpic); + mpic = mpic->next; + } return 0; } -static int mpic_resume(struct sys_device *dev) +static void mpic_resume_one(struct mpic *mpic) { - struct mpic *mpic = container_of(dev, struct mpic, sysdev); int i; for (i = 0; i < mpic->num_sources; i++) { @@ -1746,33 +1755,28 @@ static int mpic_resume(struct sys_device *dev) } #endif } /* end for loop */ +} - return 0; +static void mpic_resume(void) +{ + struct mpic *mpic = mpics; + + while (mpic) { + mpic_resume_one(mpic); + mpic = mpic->next; + } } -#endif -static struct sysdev_class mpic_sysclass = { -#ifdef CONFIG_PM +static struct syscore_ops mpic_syscore_ops = { .resume = mpic_resume, .suspend = mpic_suspend, -#endif - .name = "mpic", }; static int mpic_init_sys(void) { - struct mpic *mpic = mpics; - int error, id = 0; - - error = sysdev_class_register(&mpic_sysclass); - - while (mpic && !error) { - mpic->sysdev.cls = &mpic_sysclass; - mpic->sysdev.id = id++; - error = sysdev_register(&mpic->sysdev); - mpic = mpic->next; - } - return error; + register_syscore_ops(&mpic_syscore_ops); + return 0; } device_initcall(mpic_init_sys); +#endif -- cgit v1.1 From 2e711c04dbbf7a7732a3f7073b1fc285d12b369d Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 26 Apr 2011 19:15:07 +0200 Subject: PM: Remove sysdev suspend, resume and shutdown operations Since suspend, resume and shutdown operations in struct sysdev_class and struct sysdev_driver are not used any more, remove them. Also drop sysdev_suspend(), sysdev_resume() and sysdev_shutdown() used for executing those operations and modify all of their users accordingly. This reduces kernel code size quite a bit and reduces its complexity. Signed-off-by: Rafael J. Wysocki Acked-by: Greg Kroah-Hartman --- arch/sh/Kconfig | 1 - arch/x86/Kconfig | 1 - arch/x86/kernel/apm_32.c | 4 - drivers/base/Kconfig | 7 -- drivers/base/base.h | 2 - drivers/base/sys.c | 202 +---------------------------------------------- drivers/xen/manage.c | 8 +- include/linux/device.h | 7 -- include/linux/pm.h | 8 -- include/linux/sysdev.h | 11 --- kernel/kexec.c | 9 +-- kernel/power/hibernate.c | 18 +---- kernel/power/suspend.c | 8 +- kernel/sys.c | 3 - 14 files changed, 7 insertions(+), 282 deletions(-) diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index 4b89da2..bc439de 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -24,7 +24,6 @@ config SUPERH select RTC_LIB select GENERIC_ATOMIC64 select GENERIC_IRQ_SHOW - select ARCH_NO_SYSDEV_OPS help The SuperH is a RISC processor targeted for use in embedded systems and consumer electronics; it was also used in the Sega Dreamcast diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index cc6c53a..140e254 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -71,7 +71,6 @@ config X86 select GENERIC_IRQ_SHOW select IRQ_FORCED_THREADING select USE_GENERIC_SMP_HELPERS if SMP - select ARCH_NO_SYSDEV_OPS config INSTRUCTION_DECODER def_bool (KPROBES || PERF_EVENTS) diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index adee12e..3bfa022 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -1238,7 +1238,6 @@ static int suspend(int vetoable) dpm_suspend_noirq(PMSG_SUSPEND); local_irq_disable(); - sysdev_suspend(PMSG_SUSPEND); syscore_suspend(); local_irq_enable(); @@ -1258,7 +1257,6 @@ static int suspend(int vetoable) err = (err == APM_SUCCESS) ? 0 : -EIO; syscore_resume(); - sysdev_resume(); local_irq_enable(); dpm_resume_noirq(PMSG_RESUME); @@ -1282,7 +1280,6 @@ static void standby(void) dpm_suspend_noirq(PMSG_SUSPEND); local_irq_disable(); - sysdev_suspend(PMSG_SUSPEND); syscore_suspend(); local_irq_enable(); @@ -1292,7 +1289,6 @@ static void standby(void) local_irq_disable(); syscore_resume(); - sysdev_resume(); local_irq_enable(); dpm_resume_noirq(PMSG_RESUME); diff --git a/drivers/base/Kconfig b/drivers/base/Kconfig index e9e5238..d57e8d0 100644 --- a/drivers/base/Kconfig +++ b/drivers/base/Kconfig @@ -168,11 +168,4 @@ config SYS_HYPERVISOR bool default n -config ARCH_NO_SYSDEV_OPS - bool - ---help--- - To be selected by architectures that don't use sysdev class or - sysdev driver power management (suspend/resume) and shutdown - operations. - endmenu diff --git a/drivers/base/base.h b/drivers/base/base.h index 19f49e4..a34dca0 100644 --- a/drivers/base/base.h +++ b/drivers/base/base.h @@ -111,8 +111,6 @@ static inline int driver_match_device(struct device_driver *drv, return drv->bus->match ? drv->bus->match(dev, drv) : 1; } -extern void sysdev_shutdown(void); - extern char *make_class_name(const char *name, struct kobject *kobj); extern int devres_release_all(struct device *dev); diff --git a/drivers/base/sys.c b/drivers/base/sys.c index acde9b5..9dff77b 100644 --- a/drivers/base/sys.c +++ b/drivers/base/sys.c @@ -328,203 +328,8 @@ void sysdev_unregister(struct sys_device *sysdev) kobject_put(&sysdev->kobj); } - -#ifndef CONFIG_ARCH_NO_SYSDEV_OPS -/** - * sysdev_shutdown - Shut down all system devices. - * - * Loop over each class of system devices, and the devices in each - * of those classes. For each device, we call the shutdown method for - * each driver registered for the device - the auxiliaries, - * and the class driver. - * - * Note: The list is iterated in reverse order, so that we shut down - * child devices before we shut down their parents. The list ordering - * is guaranteed by virtue of the fact that child devices are registered - * after their parents. - */ -void sysdev_shutdown(void) -{ - struct sysdev_class *cls; - - pr_debug("Shutting Down System Devices\n"); - - mutex_lock(&sysdev_drivers_lock); - list_for_each_entry_reverse(cls, &system_kset->list, kset.kobj.entry) { - struct sys_device *sysdev; - - pr_debug("Shutting down type '%s':\n", - kobject_name(&cls->kset.kobj)); - - list_for_each_entry(sysdev, &cls->kset.list, kobj.entry) { - struct sysdev_driver *drv; - pr_debug(" %s\n", kobject_name(&sysdev->kobj)); - - /* Call auxiliary drivers first */ - list_for_each_entry(drv, &cls->drivers, entry) { - if (drv->shutdown) - drv->shutdown(sysdev); - } - - /* Now call the generic one */ - if (cls->shutdown) - cls->shutdown(sysdev); - } - } - mutex_unlock(&sysdev_drivers_lock); -} - -static void __sysdev_resume(struct sys_device *dev) -{ - struct sysdev_class *cls = dev->cls; - struct sysdev_driver *drv; - - /* First, call the class-specific one */ - if (cls->resume) - cls->resume(dev); - WARN_ONCE(!irqs_disabled(), - "Interrupts enabled after %pF\n", cls->resume); - - /* Call auxiliary drivers next. */ - list_for_each_entry(drv, &cls->drivers, entry) { - if (drv->resume) - drv->resume(dev); - WARN_ONCE(!irqs_disabled(), - "Interrupts enabled after %pF\n", drv->resume); - } -} - -/** - * sysdev_suspend - Suspend all system devices. - * @state: Power state to enter. - * - * We perform an almost identical operation as sysdev_shutdown() - * above, though calling ->suspend() instead. Interrupts are disabled - * when this called. Devices are responsible for both saving state and - * quiescing or powering down the device. - * - * This is only called by the device PM core, so we let them handle - * all synchronization. - */ -int sysdev_suspend(pm_message_t state) -{ - struct sysdev_class *cls; - struct sys_device *sysdev, *err_dev; - struct sysdev_driver *drv, *err_drv; - int ret; - - pr_debug("Checking wake-up interrupts\n"); - - /* Return error code if there are any wake-up interrupts pending */ - ret = check_wakeup_irqs(); - if (ret) - return ret; - - WARN_ONCE(!irqs_disabled(), - "Interrupts enabled while suspending system devices\n"); - - pr_debug("Suspending System Devices\n"); - - list_for_each_entry_reverse(cls, &system_kset->list, kset.kobj.entry) { - pr_debug("Suspending type '%s':\n", - kobject_name(&cls->kset.kobj)); - - list_for_each_entry(sysdev, &cls->kset.list, kobj.entry) { - pr_debug(" %s\n", kobject_name(&sysdev->kobj)); - - /* Call auxiliary drivers first */ - list_for_each_entry(drv, &cls->drivers, entry) { - if (drv->suspend) { - ret = drv->suspend(sysdev, state); - if (ret) - goto aux_driver; - } - WARN_ONCE(!irqs_disabled(), - "Interrupts enabled after %pF\n", - drv->suspend); - } - - /* Now call the generic one */ - if (cls->suspend) { - ret = cls->suspend(sysdev, state); - if (ret) - goto cls_driver; - WARN_ONCE(!irqs_disabled(), - "Interrupts enabled after %pF\n", - cls->suspend); - } - } - } - return 0; - /* resume current sysdev */ -cls_driver: - drv = NULL; - printk(KERN_ERR "Class suspend failed for %s: %d\n", - kobject_name(&sysdev->kobj), ret); - -aux_driver: - if (drv) - printk(KERN_ERR "Class driver suspend failed for %s: %d\n", - kobject_name(&sysdev->kobj), ret); - list_for_each_entry(err_drv, &cls->drivers, entry) { - if (err_drv == drv) - break; - if (err_drv->resume) - err_drv->resume(sysdev); - } - - /* resume other sysdevs in current class */ - list_for_each_entry(err_dev, &cls->kset.list, kobj.entry) { - if (err_dev == sysdev) - break; - pr_debug(" %s\n", kobject_name(&err_dev->kobj)); - __sysdev_resume(err_dev); - } - - /* resume other classes */ - list_for_each_entry_continue(cls, &system_kset->list, kset.kobj.entry) { - list_for_each_entry(err_dev, &cls->kset.list, kobj.entry) { - pr_debug(" %s\n", kobject_name(&err_dev->kobj)); - __sysdev_resume(err_dev); - } - } - return ret; -} -EXPORT_SYMBOL_GPL(sysdev_suspend); - -/** - * sysdev_resume - Bring system devices back to life. - * - * Similar to sysdev_suspend(), but we iterate the list forwards - * to guarantee that parent devices are resumed before their children. - * - * Note: Interrupts are disabled when called. - */ -int sysdev_resume(void) -{ - struct sysdev_class *cls; - - WARN_ONCE(!irqs_disabled(), - "Interrupts enabled while resuming system devices\n"); - - pr_debug("Resuming System Devices\n"); - - list_for_each_entry(cls, &system_kset->list, kset.kobj.entry) { - struct sys_device *sysdev; - - pr_debug("Resuming type '%s':\n", - kobject_name(&cls->kset.kobj)); - - list_for_each_entry(sysdev, &cls->kset.list, kobj.entry) { - pr_debug(" %s\n", kobject_name(&sysdev->kobj)); - - __sysdev_resume(sysdev); - } - } - return 0; -} -EXPORT_SYMBOL_GPL(sysdev_resume); -#endif /* CONFIG_ARCH_NO_SYSDEV_OPS */ +EXPORT_SYMBOL_GPL(sysdev_register); +EXPORT_SYMBOL_GPL(sysdev_unregister); int __init system_bus_init(void) { @@ -534,9 +339,6 @@ int __init system_bus_init(void) return 0; } -EXPORT_SYMBOL_GPL(sysdev_register); -EXPORT_SYMBOL_GPL(sysdev_unregister); - #define to_ext_attr(x) container_of(x, struct sysdev_ext_attribute, attr) ssize_t sysdev_store_ulong(struct sys_device *sysdev, diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c index a2eee57..0b5366b 100644 --- a/drivers/xen/manage.c +++ b/drivers/xen/manage.c @@ -70,12 +70,7 @@ static int xen_suspend(void *data) BUG_ON(!irqs_disabled()); - err = sysdev_suspend(PMSG_FREEZE); - if (!err) { - err = syscore_suspend(); - if (err) - sysdev_resume(); - } + err = syscore_suspend(); if (err) { printk(KERN_ERR "xen_suspend: system core suspend failed: %d\n", err); @@ -102,7 +97,6 @@ static int xen_suspend(void *data) } syscore_resume(); - sysdev_resume(); return 0; } diff --git a/include/linux/device.h b/include/linux/device.h index ab8dfc0..ea9db9b 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -633,13 +633,6 @@ static inline int devtmpfs_mount(const char *mountpoint) { return 0; } /* drivers/base/power/shutdown.c */ extern void device_shutdown(void); -#ifndef CONFIG_ARCH_NO_SYSDEV_OPS -/* drivers/base/sys.c */ -extern void sysdev_shutdown(void); -#else -static inline void sysdev_shutdown(void) { } -#endif - /* debugging and troubleshooting/diagnostic helpers. */ extern const char *dev_driver_string(const struct device *dev); diff --git a/include/linux/pm.h b/include/linux/pm.h index 512e091..3c053e2 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -529,14 +529,6 @@ struct dev_power_domain { */ #ifdef CONFIG_PM_SLEEP -#ifndef CONFIG_ARCH_NO_SYSDEV_OPS -extern int sysdev_suspend(pm_message_t state); -extern int sysdev_resume(void); -#else -static inline int sysdev_suspend(pm_message_t state) { return 0; } -static inline int sysdev_resume(void) { return 0; } -#endif - extern void device_pm_lock(void); extern void dpm_resume_noirq(pm_message_t state); extern void dpm_resume_end(pm_message_t state); diff --git a/include/linux/sysdev.h b/include/linux/sysdev.h index dfb078d..d35e783 100644 --- a/include/linux/sysdev.h +++ b/include/linux/sysdev.h @@ -34,12 +34,6 @@ struct sysdev_class { struct list_head drivers; struct sysdev_class_attribute **attrs; struct kset kset; -#ifndef CONFIG_ARCH_NO_SYSDEV_OPS - /* Default operations for these types of devices */ - int (*shutdown)(struct sys_device *); - int (*suspend)(struct sys_device *, pm_message_t state); - int (*resume)(struct sys_device *); -#endif }; struct sysdev_class_attribute { @@ -77,11 +71,6 @@ struct sysdev_driver { struct list_head entry; int (*add)(struct sys_device *); int (*remove)(struct sys_device *); -#ifndef CONFIG_ARCH_NO_SYSDEV_OPS - int (*shutdown)(struct sys_device *); - int (*suspend)(struct sys_device *, pm_message_t state); - int (*resume)(struct sys_device *); -#endif }; diff --git a/kernel/kexec.c b/kernel/kexec.c index 87b77de..8d814cb 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1531,13 +1531,7 @@ int kernel_kexec(void) if (error) goto Enable_cpus; local_irq_disable(); - /* Suspend system devices */ - error = sysdev_suspend(PMSG_FREEZE); - if (!error) { - error = syscore_suspend(); - if (error) - sysdev_resume(); - } + error = syscore_suspend(); if (error) goto Enable_irqs; } else @@ -1553,7 +1547,6 @@ int kernel_kexec(void) #ifdef CONFIG_KEXEC_JUMP if (kexec_image->preserve_context) { syscore_resume(); - sysdev_resume(); Enable_irqs: local_irq_enable(); Enable_cpus: diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 50aae66..554d3b0 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -272,12 +272,7 @@ static int create_image(int platform_mode) local_irq_disable(); - error = sysdev_suspend(PMSG_FREEZE); - if (!error) { - error = syscore_suspend(); - if (error) - sysdev_resume(); - } + error = syscore_suspend(); if (error) { printk(KERN_ERR "PM: Some system devices failed to power down, " "aborting hibernation\n"); @@ -302,7 +297,6 @@ static int create_image(int platform_mode) Power_up: syscore_resume(); - sysdev_resume(); /* NOTE: dpm_resume_noirq() is just a resume() for devices * that suspended with irqs off ... no overall powerup. */ @@ -409,12 +403,7 @@ static int resume_target_kernel(bool platform_mode) local_irq_disable(); - error = sysdev_suspend(PMSG_QUIESCE); - if (!error) { - error = syscore_suspend(); - if (error) - sysdev_resume(); - } + error = syscore_suspend(); if (error) goto Enable_irqs; @@ -442,7 +431,6 @@ static int resume_target_kernel(bool platform_mode) touch_softlockup_watchdog(); syscore_resume(); - sysdev_resume(); Enable_irqs: local_irq_enable(); @@ -528,7 +516,6 @@ int hibernation_platform_enter(void) goto Platform_finish; local_irq_disable(); - sysdev_suspend(PMSG_HIBERNATE); syscore_suspend(); if (pm_wakeup_pending()) { error = -EAGAIN; @@ -541,7 +528,6 @@ int hibernation_platform_enter(void) Power_up: syscore_resume(); - sysdev_resume(); local_irq_enable(); enable_nonboot_cpus(); diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 8935369..732d77a 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -163,19 +163,13 @@ static int suspend_enter(suspend_state_t state) arch_suspend_disable_irqs(); BUG_ON(!irqs_disabled()); - error = sysdev_suspend(PMSG_SUSPEND); - if (!error) { - error = syscore_suspend(); - if (error) - sysdev_resume(); - } + error = syscore_suspend(); if (!error) { if (!(suspend_test(TEST_CORE) || pm_wakeup_pending())) { error = suspend_ops->enter(state); events_check_enabled = false; } syscore_resume(); - sysdev_resume(); } arch_suspend_enable_irqs(); diff --git a/kernel/sys.c b/kernel/sys.c index af468ed..f0c1038 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -315,7 +315,6 @@ void kernel_restart_prepare(char *cmd) blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); system_state = SYSTEM_RESTART; device_shutdown(); - sysdev_shutdown(); syscore_shutdown(); } @@ -354,7 +353,6 @@ static void kernel_shutdown_prepare(enum system_states state) void kernel_halt(void) { kernel_shutdown_prepare(SYSTEM_HALT); - sysdev_shutdown(); syscore_shutdown(); printk(KERN_EMERG "System halted.\n"); kmsg_dump(KMSG_DUMP_HALT); @@ -374,7 +372,6 @@ void kernel_power_off(void) if (pm_power_off_prepare) pm_power_off_prepare(); disable_nonboot_cpus(); - sysdev_shutdown(); syscore_shutdown(); printk(KERN_EMERG "Power down.\n"); kmsg_dump(KMSG_DUMP_POWEROFF); -- cgit v1.1 From 5409d2cd841cf2c76396470e566500f6505f8d2a Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Wed, 11 May 2011 17:25:14 +1000 Subject: memory hotplug: Speed up add/remove when blocks are larger than PAGES_PER_SECTION On ppc64 the minimum memory section for hotplug is 16MB but most recent machines have a memory block size of 256MB. This means memory_block_change_state does 16 separate calls to memory_section_action. This also means we call the notifiers 16 times and the hook in the ehea network driver is quite costly. To offline one 256MB region takes: # time echo offline > /sys/devices/system/memory/memory32/state 7.9s This patch removes the loop and calls online_pages or remove_memory once for the entire region and in doing so makes the logic simpler since we don't have to back out if things fail part way through. The same test to offline one region now takes: # time echo online > /sys/devices/system/memory/memory32/state 0.67s Over 11 times faster. Signed-off-by: Anton Blanchard Signed-off-by: Greg Kroah-Hartman --- drivers/base/memory.c | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 3e9aa3d..c4c443d 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -229,10 +229,11 @@ int memory_isolate_notify(unsigned long val, void *v) * OK to have direct references to sparsemem variables in here. */ static int -memory_section_action(unsigned long phys_index, unsigned long action) +memory_block_action(unsigned long phys_index, unsigned long action) { int i; unsigned long start_pfn, start_paddr; + unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block; struct page *first_page; int ret; @@ -244,7 +245,7 @@ memory_section_action(unsigned long phys_index, unsigned long action) * that way. */ if (action == MEM_ONLINE) { - for (i = 0; i < PAGES_PER_SECTION; i++) { + for (i = 0; i < nr_pages; i++) { if (PageReserved(first_page+i)) continue; @@ -258,12 +259,12 @@ memory_section_action(unsigned long phys_index, unsigned long action) switch (action) { case MEM_ONLINE: start_pfn = page_to_pfn(first_page); - ret = online_pages(start_pfn, PAGES_PER_SECTION); + ret = online_pages(start_pfn, nr_pages); break; case MEM_OFFLINE: start_paddr = page_to_pfn(first_page) << PAGE_SHIFT; ret = remove_memory(start_paddr, - PAGES_PER_SECTION << PAGE_SHIFT); + nr_pages << PAGE_SHIFT); break; default: WARN(1, KERN_WARNING "%s(%ld, %ld) unknown action: " @@ -289,20 +290,11 @@ static int memory_block_change_state(struct memory_block *mem, if (to_state == MEM_OFFLINE) mem->state = MEM_GOING_OFFLINE; - for (i = 0; i < sections_per_block; i++) { - ret = memory_section_action(mem->start_section_nr + i, - to_state); - if (ret) - break; - } - - if (ret) { - for (i = 0; i < sections_per_block; i++) - memory_section_action(mem->start_section_nr + i, - from_state_req); + ret = memory_block_action(mem->start_section_nr, to_state); + if (ret) mem->state = from_state_req; - } else + else mem->state = to_state; out: -- cgit v1.1 From 52cd4e5c620af9e21b5298bf01844b98573505a7 Mon Sep 17 00:00:00 2001 From: Ben Dooks Date: Wed, 11 May 2011 15:13:28 -0700 Subject: drivers/rtc/rtc-s3c.c: fixup wake support for rtc The driver is not balancing set_irq and disable_irq_wake() calls, so ensure that it keeps track of whether the wake is enabled. The fixes the following error on S3C6410 devices: WARNING: at kernel/irq/manage.c:382 set_irq_wake+0x84/0xec() Unbalanced IRQ 92 wake disable Signed-off-by: Ben Dooks Signed-off-by: Mark Brown Cc: Alessandro Zummo Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-s3c.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/drivers/rtc/rtc-s3c.c b/drivers/rtc/rtc-s3c.c index b3466c4..16512ec 100644 --- a/drivers/rtc/rtc-s3c.c +++ b/drivers/rtc/rtc-s3c.c @@ -46,6 +46,7 @@ static struct clk *rtc_clk; static void __iomem *s3c_rtc_base; static int s3c_rtc_alarmno = NO_IRQ; static int s3c_rtc_tickno = NO_IRQ; +static bool wake_en; static enum s3c_cpu_type s3c_rtc_cpu_type; static DEFINE_SPINLOCK(s3c_rtc_pie_lock); @@ -562,8 +563,12 @@ static int s3c_rtc_suspend(struct platform_device *pdev, pm_message_t state) } s3c_rtc_enable(pdev, 0); - if (device_may_wakeup(&pdev->dev)) - enable_irq_wake(s3c_rtc_alarmno); + if (device_may_wakeup(&pdev->dev) && !wake_en) { + if (enable_irq_wake(s3c_rtc_alarmno) == 0) + wake_en = true; + else + dev_err(&pdev->dev, "enable_irq_wake failed\n"); + } return 0; } @@ -579,8 +584,10 @@ static int s3c_rtc_resume(struct platform_device *pdev) writew(tmp | ticnt_en_save, s3c_rtc_base + S3C2410_RTCCON); } - if (device_may_wakeup(&pdev->dev)) + if (device_may_wakeup(&pdev->dev) && wake_en) { disable_irq_wake(s3c_rtc_alarmno); + wake_en = false; + } return 0; } -- cgit v1.1 From bad49d9c89d8755a1289d68e6d0127a6ee79e119 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 11 May 2011 15:13:30 -0700 Subject: mm: check PageUnevictable in lru_deactivate_fn() The lru_deactivate_fn should not move page which in on unevictable lru into inactive list. Otherwise, we can meet BUG when we use isolate_lru_pages as __isolate_lru_page could return -EINVAL. Reported-by: Ying Han Tested-by: Ying Han Signed-off-by: Minchan Kim Reviewed-by: KOSAKI Motohiro Reviewed-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/swap.c b/mm/swap.c index a448db3..5602f1a 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -396,6 +396,9 @@ static void lru_deactivate_fn(struct page *page, void *arg) if (!PageLRU(page)) return; + if (PageUnevictable(page)) + return; + /* Some processes are using the page */ if (page_mapped(page)) return; -- cgit v1.1 From 8f389a99b652aab5b42297280bd94d95933ad12f Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 11 May 2011 15:13:32 -0700 Subject: mm: use alloc_bootmem_node_nopanic() on really needed path Stefan found nobootmem does not work on his system that has only 8M of RAM. This causes an early panic: BIOS-provided physical RAM map: BIOS-88: 0000000000000000 - 000000000009f000 (usable) BIOS-88: 0000000000100000 - 0000000000840000 (usable) bootconsole [earlyser0] enabled Notice: NX (Execute Disable) protection missing in CPU or disabled in BIOS! DMI not present or invalid. last_pfn = 0x840 max_arch_pfn = 0x100000 init_memory_mapping: 0000000000000000-0000000000840000 8MB LOWMEM available. mapped low ram: 0 - 00840000 low ram: 0 - 00840000 Zone PFN ranges: DMA 0x00000001 -> 0x00001000 Normal empty Movable zone start PFN for each node early_node_map[2] active PFN ranges 0: 0x00000001 -> 0x0000009f 0: 0x00000100 -> 0x00000840 BUG: Int 6: CR2 (null) EDI c034663c ESI (null) EBP c0329f38 ESP c0329ef4 EBX c0346380 EDX 00000006 ECX ffffffff EAX fffffff4 err (null) EIP c0353191 CS c0320060 flg 00010082 Stack: (null) c030c533 000007cd (null) c030c533 00000001 (null) (null) 00000003 0000083f 00000018 00000002 00000002 c0329f6c c03534d6 (null) (null) 00000100 00000840 (null) c0329f64 00000001 00001000 (null) Pid: 0, comm: swapper Not tainted 2.6.36 #5 Call Trace: [] ? 0xc02e3707 [] 0xc035e6e5 [] ? 0xc0353191 [] 0xc03534d6 [] 0xc034f1cd [] 0xc034a824 [] ? 0xc03513cb [] 0xc0349432 [] 0xc0349066 It turns out that we should ignore the low limit of 16M. Use alloc_bootmem_node_nopanic() in this case. [akpm@linux-foundation.org: less mess] Signed-off-by: Yinghai LU Reported-by: Stefan Hellermann Tested-by: Stefan Hellermann Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Thomas Gleixner Cc: [2.6.34+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bootmem.h | 2 ++ mm/page_alloc.c | 7 ++++--- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h index b8613e8..01eca17 100644 --- a/include/linux/bootmem.h +++ b/include/linux/bootmem.h @@ -111,6 +111,8 @@ extern void *__alloc_bootmem_low_node(pg_data_t *pgdat, __alloc_bootmem_nopanic(x, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)) #define alloc_bootmem_node(pgdat, x) \ __alloc_bootmem_node(pgdat, x, SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS)) +#define alloc_bootmem_node_nopanic(pgdat, x) \ + __alloc_bootmem_node_nopanic(pgdat, x, SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS)) #define alloc_bootmem_pages_node(pgdat, x) \ __alloc_bootmem_node(pgdat, x, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)) #define alloc_bootmem_pages_node_nopanic(pgdat, x) \ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 9f8a97b..454191a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3564,7 +3564,7 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) if (!slab_is_available()) { zone->wait_table = (wait_queue_head_t *) - alloc_bootmem_node(pgdat, alloc_size); + alloc_bootmem_node_nopanic(pgdat, alloc_size); } else { /* * This case means that a zone whose size was 0 gets new memory @@ -4141,7 +4141,8 @@ static void __init setup_usemap(struct pglist_data *pgdat, unsigned long usemapsize = usemap_size(zonesize); zone->pageblock_flags = NULL; if (usemapsize) - zone->pageblock_flags = alloc_bootmem_node(pgdat, usemapsize); + zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat, + usemapsize); } #else static inline void setup_usemap(struct pglist_data *pgdat, @@ -4307,7 +4308,7 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat) size = (end - start) * sizeof(struct page); map = alloc_remap(pgdat->node_id, size); if (!map) - map = alloc_bootmem_node(pgdat, size); + map = alloc_bootmem_node_nopanic(pgdat, size); pgdat->node_mem_map = map + (pgdat->node_start_pfn - start); } #ifndef CONFIG_NEED_MULTIPLE_NODES -- cgit v1.1 From 71a6d0af5b031d27029fda64fbab9b9d953d2b33 Mon Sep 17 00:00:00 2001 From: Harry Wei Date: Wed, 11 May 2011 15:13:33 -0700 Subject: MAINTAINERS: fix sorting Take alphabetical orders for MAINTAINERS file. Signed-off-by: Harry Wei Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 80 ++++++++++++++++++++++++++++++------------------------------- 1 file changed, 40 insertions(+), 40 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 16a5c5f..69f19f1 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2813,38 +2813,19 @@ F: Documentation/gpio.txt F: drivers/gpio/ F: include/linux/gpio* +GRE DEMULTIPLEXER DRIVER +M: Dmitry Kozlov +L: netdev@vger.kernel.org +S: Maintained +F: net/ipv4/gre.c +F: include/net/gre.h + GRETH 10/100/1G Ethernet MAC device driver M: Kristoffer Glembo L: netdev@vger.kernel.org S: Maintained F: drivers/net/greth* -HARD DRIVE ACTIVE PROTECTION SYSTEM (HDAPS) DRIVER -M: Frank Seidel -L: platform-driver-x86@vger.kernel.org -W: http://www.kernel.org/pub/linux/kernel/people/fseidel/hdaps/ -S: Maintained -F: drivers/platform/x86/hdaps.c - -HWPOISON MEMORY FAILURE HANDLING -M: Andi Kleen -L: linux-mm@kvack.org -T: git git://git.kernel.org/pub/scm/linux/kernel/git/ak/linux-mce-2.6.git hwpoison -S: Maintained -F: mm/memory-failure.c -F: mm/hwpoison-inject.c - -HYPERVISOR VIRTUAL CONSOLE DRIVER -L: linuxppc-dev@lists.ozlabs.org -S: Odd Fixes -F: drivers/tty/hvc/ - -iSCSI BOOT FIRMWARE TABLE (iBFT) DRIVER -M: Peter Jones -M: Konrad Rzeszutek Wilk -S: Maintained -F: drivers/firmware/iscsi_ibft* - GSPCA FINEPIX SUBDRIVER M: Frank Zago L: linux-media@vger.kernel.org @@ -2895,6 +2876,26 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/mchehab/linux-2.6.git S: Maintained F: drivers/media/video/gspca/ +HARD DRIVE ACTIVE PROTECTION SYSTEM (HDAPS) DRIVER +M: Frank Seidel +L: platform-driver-x86@vger.kernel.org +W: http://www.kernel.org/pub/linux/kernel/people/fseidel/hdaps/ +S: Maintained +F: drivers/platform/x86/hdaps.c + +HWPOISON MEMORY FAILURE HANDLING +M: Andi Kleen +L: linux-mm@kvack.org +T: git git://git.kernel.org/pub/scm/linux/kernel/git/ak/linux-mce-2.6.git hwpoison +S: Maintained +F: mm/memory-failure.c +F: mm/hwpoison-inject.c + +HYPERVISOR VIRTUAL CONSOLE DRIVER +L: linuxppc-dev@lists.ozlabs.org +S: Odd Fixes +F: drivers/tty/hvc/ + HARDWARE MONITORING M: Jean Delvare M: Guenter Roeck @@ -3478,6 +3479,12 @@ F: Documentation/isapnp.txt F: drivers/pnp/isapnp/ F: include/linux/isapnp.h +iSCSI BOOT FIRMWARE TABLE (iBFT) DRIVER +M: Peter Jones +M: Konrad Rzeszutek Wilk +S: Maintained +F: drivers/firmware/iscsi_ibft* + ISCSI M: Mike Christie L: open-iscsi@googlegroups.com @@ -4989,6 +4996,13 @@ F: Documentation/pps/ F: drivers/pps/ F: include/linux/pps*.h +PPTP DRIVER +M: Dmitry Kozlov +L: netdev@vger.kernel.org +S: Maintained +F: drivers/net/pptp.c +W: http://sourceforge.net/projects/accel-pptp + PREEMPTIBLE KERNEL M: Robert Love L: kpreempt-tech@lists.sourceforge.net @@ -7024,20 +7038,6 @@ M: "Maciej W. Rozycki" S: Maintained F: drivers/tty/serial/zs.* -GRE DEMULTIPLEXER DRIVER -M: Dmitry Kozlov -L: netdev@vger.kernel.org -S: Maintained -F: net/ipv4/gre.c -F: include/net/gre.h - -PPTP DRIVER -M: Dmitry Kozlov -L: netdev@vger.kernel.org -S: Maintained -F: drivers/net/pptp.c -W: http://sourceforge.net/projects/accel-pptp - THE REST M: Linus Torvalds L: linux-kernel@vger.kernel.org -- cgit v1.1 From ee85c2e1454603ebb9f8d87223ac79dcdc87fa32 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 11 May 2011 15:13:34 -0700 Subject: mm: add alloc_pages_exact_nid() Add a alloc_pages_exact_nid() that allocates on a specific node. The naming is quite broken, but fixing that would need a larger renaming action. [akpm@linux-foundation.org: coding-style fixes] [akpm@linux-foundation.org: tweak comment] Signed-off-by: Andi Kleen Cc: Michal Hocko Cc: Balbir Singh Cc: KOSAKI Motohiro Cc: Dave Hansen Cc: David Rientjes Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 2 ++ mm/page_alloc.c | 49 +++++++++++++++++++++++++++++++++++++------------ 2 files changed, 39 insertions(+), 12 deletions(-) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index bfb8f93..56d8fc8 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -353,6 +353,8 @@ extern unsigned long get_zeroed_page(gfp_t gfp_mask); void *alloc_pages_exact(size_t size, gfp_t gfp_mask); void free_pages_exact(void *virt, size_t size); +/* This is different from alloc_pages_exact_node !!! */ +void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask); #define __get_free_page(gfp_mask) \ __get_free_pages((gfp_mask), 0) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 454191a..570d944 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2317,6 +2317,21 @@ void free_pages(unsigned long addr, unsigned int order) EXPORT_SYMBOL(free_pages); +static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size) +{ + if (addr) { + unsigned long alloc_end = addr + (PAGE_SIZE << order); + unsigned long used = addr + PAGE_ALIGN(size); + + split_page(virt_to_page((void *)addr), order); + while (used < alloc_end) { + free_page(used); + used += PAGE_SIZE; + } + } + return (void *)addr; +} + /** * alloc_pages_exact - allocate an exact number physically-contiguous pages. * @size: the number of bytes to allocate @@ -2336,22 +2351,32 @@ void *alloc_pages_exact(size_t size, gfp_t gfp_mask) unsigned long addr; addr = __get_free_pages(gfp_mask, order); - if (addr) { - unsigned long alloc_end = addr + (PAGE_SIZE << order); - unsigned long used = addr + PAGE_ALIGN(size); - - split_page(virt_to_page((void *)addr), order); - while (used < alloc_end) { - free_page(used); - used += PAGE_SIZE; - } - } - - return (void *)addr; + return make_alloc_exact(addr, order, size); } EXPORT_SYMBOL(alloc_pages_exact); /** + * alloc_pages_exact_nid - allocate an exact number of physically-contiguous + * pages on a node. + * @size: the number of bytes to allocate + * @gfp_mask: GFP flags for the allocation + * + * Like alloc_pages_exact(), but try to allocate on node nid first before falling + * back. + * Note this is not alloc_pages_exact_node() which allocates on a specific node, + * but is not exact. + */ +void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) +{ + unsigned order = get_order(size); + struct page *p = alloc_pages_node(nid, gfp_mask, order); + if (!p) + return NULL; + return make_alloc_exact((unsigned long)page_address(p), order, size); +} +EXPORT_SYMBOL(alloc_pages_exact_nid); + +/** * free_pages_exact - release memory allocated via alloc_pages_exact() * @virt: the value returned by alloc_pages_exact. * @size: size of allocation, same value as passed to alloc_pages_exact(). -- cgit v1.1 From 21a3c9646873ae0919415d635b671d6a58758ede Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 11 May 2011 15:13:35 -0700 Subject: memcg: allocate memory cgroup structures in local nodes Commit dde79e005a769 ("page_cgroup: reduce allocation overhead for page_cgroup array for CONFIG_SPARSEMEM") added a regression that the memory cgroup data structures all end up in node 0 because the first attempt at allocating them would not pass in a node hint. Since the initialization runs on CPU #0 it would all end up node 0. This is a problem on large memory systems, where node 0 would lose a lot of memory. Change the alloc_pages_exact() to alloc_pages_exact_nid(). This will still fall back to other nodes if not enough memory is available. [ RED-PEN: right now it would fall back first before trying vmalloc_node. Probably not the best strategy ... But I left it like that for now. ] Signed-off-by: Andi Kleen Reported-by: Doug Nelson Cc: David Rientjes Reviewed-by: Michal Hocko Cc: Dave Hansen Acked-by: Balbir Singh Acked-by: Johannes Weiner Reviewed-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index 9905501..2daadc3 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c @@ -134,7 +134,7 @@ static void *__init_refok alloc_page_cgroup(size_t size, int nid) { void *addr = NULL; - addr = alloc_pages_exact(size, GFP_KERNEL | __GFP_NOWARN); + addr = alloc_pages_exact_nid(nid, size, GFP_KERNEL | __GFP_NOWARN); if (addr) return addr; -- cgit v1.1 From b1dea800ac39599301d4bb8dcf2b1d29c2558211 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 11 May 2011 15:13:36 -0700 Subject: tmpfs: fix race between umount and writepage Konstanin Khlebnikov reports that a dangerous race between umount and shmem_writepage can be reproduced by this script: for i in {1..300} ; do mkdir $i while true ; do mount -t tmpfs none $i dd if=/dev/zero of=$i/test bs=1M count=$(($RANDOM % 100)) umount $i done & done on a 6xCPU node with 8Gb RAM: kernel very unstable after this accident. =) Kernel log: VFS: Busy inodes after unmount of tmpfs. Self-destruct in 5 seconds. Have a nice day... WARNING: at lib/list_debug.c:53 __list_del_entry+0x8d/0x98() list_del corruption. prev->next should be ffff880222fdaac8, but was (null) Pid: 11222, comm: mount.tmpfs Not tainted 2.6.39-rc2+ #4 Call Trace: warn_slowpath_common+0x80/0x98 warn_slowpath_fmt+0x41/0x43 __list_del_entry+0x8d/0x98 evict+0x50/0x113 iput+0x138/0x141 ... BUG: unable to handle kernel paging request at ffffffffffffffff IP: shmem_free_blocks+0x18/0x4c Pid: 10422, comm: dd Tainted: G W 2.6.39-rc2+ #4 Call Trace: shmem_recalc_inode+0x61/0x66 shmem_writepage+0xba/0x1dc pageout+0x13c/0x24c shrink_page_list+0x28e/0x4be shrink_inactive_list+0x21f/0x382 ... shmem_writepage() calls igrab() on the inode for the page which came from page reclaim, to add it later into shmem_swaplist for swapoff operation. This igrab() can race with super-block deactivating process: shrink_inactive_list() deactivate_super() pageout() tmpfs_fs_type->kill_sb() shmem_writepage() kill_litter_super() generic_shutdown_super() evict_inodes() igrab() atomic_read(&inode->i_count) skip-inode iput() if (!list_empty(&sb->s_inodes)) printk("VFS: Busy inodes after... This igrap-iput pair was added in commit 1b1b32f2c6f6 "tmpfs: fix shmem_swaplist races" based on incorrect assumptions: igrab() protects the inode from concurrent eviction by deletion, but it does nothing to protect it from concurrent unmounting, which goes ahead despite the raised i_count. So this use of igrab() was wrong all along, but the race made much worse in 2.6.37 when commit 63997e98a3be "split invalidate_inodes()" replaced two attempts at invalidate_inodes() by a single evict_inodes(). Konstantin posted a plausible patch, raising sb->s_active too: I'm unsure whether it was correct or not; but burnt once by igrab(), I am sure that we don't want to rely more deeply upon externals here. Fix it by adding the inode to shmem_swaplist earlier, while the page lock on page in page cache still secures the inode against eviction, without artifically raising i_count. It was originally added later because shmem_unuse_inode() is liable to remove an inode from the list while it's unswapped; but we can guard against that by taking spinlock before dropping mutex. Reported-by: Konstantin Khlebnikov Signed-off-by: Hugh Dickins Tested-by: Konstantin Khlebnikov Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 8fa27e4..262d711 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1039,6 +1039,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) struct address_space *mapping; unsigned long index; struct inode *inode; + bool unlock_mutex = false; BUG_ON(!PageLocked(page)); mapping = page->mapping; @@ -1064,7 +1065,26 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) else swap.val = 0; + /* + * Add inode to shmem_unuse()'s list of swapped-out inodes, + * if it's not already there. Do it now because we cannot take + * mutex while holding spinlock, and must do so before the page + * is moved to swap cache, when its pagelock no longer protects + * the inode from eviction. But don't unlock the mutex until + * we've taken the spinlock, because shmem_unuse_inode() will + * prune a !swapped inode from the swaplist under both locks. + */ + if (swap.val && list_empty(&info->swaplist)) { + mutex_lock(&shmem_swaplist_mutex); + /* move instead of add in case we're racing */ + list_move_tail(&info->swaplist, &shmem_swaplist); + unlock_mutex = true; + } + spin_lock(&info->lock); + if (unlock_mutex) + mutex_unlock(&shmem_swaplist_mutex); + if (index >= info->next_index) { BUG_ON(!(info->flags & SHMEM_TRUNCATE)); goto unlock; @@ -1084,21 +1104,10 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) delete_from_page_cache(page); shmem_swp_set(info, entry, swap.val); shmem_swp_unmap(entry); - if (list_empty(&info->swaplist)) - inode = igrab(inode); - else - inode = NULL; spin_unlock(&info->lock); swap_shmem_alloc(swap); BUG_ON(page_mapped(page)); swap_writepage(page, wbc); - if (inode) { - mutex_lock(&shmem_swaplist_mutex); - /* move instead of add in case we're racing */ - list_move_tail(&info->swaplist, &shmem_swaplist); - mutex_unlock(&shmem_swaplist_mutex); - iput(inode); - } return 0; } -- cgit v1.1 From 778dd893ae785c5fd505dac30b5fc40aae188bf1 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 11 May 2011 15:13:37 -0700 Subject: tmpfs: fix race between umount and swapoff The use of igrab() in swapoff's shmem_unuse_inode() is just as vulnerable to umount as that in shmem_writepage(). Fix this instance by extending the protection of shmem_swaplist_mutex right across shmem_unuse_inode(): while it's on the list, the inode cannot be evicted (and the filesystem cannot be unmounted) without shmem_evict_inode() taking that mutex to remove it from the list. But since shmem_writepage() might take that mutex, we should avoid making memory allocations or memcg charges while holding it: prepare them at the outer level in shmem_unuse(). When mem_cgroup_cache_charge() was originally placed, we didn't know until that point that the page from swap was actually a shmem page; but nowadays it's noted in the swap_map, so we're safe to charge upfront. For the radix_tree, do as is done in shmem_getpage(): preload upfront, but don't pin to the cpu; so we make a habit of refreshing the node pool, but might dip into GFP_NOWAIT reserves on occasion if subsequently preempted. With the allocation and charge moved out from shmem_unuse_inode(), we can also hold index map and info->lock over from finding the entry. Signed-off-by: Hugh Dickins Cc: Konstantin Khlebnikov Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 88 ++++++++++++++++++++++++++++++-------------------------------- 1 file changed, 43 insertions(+), 45 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 262d711..dc17551 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -852,7 +852,7 @@ static inline int shmem_find_swp(swp_entry_t entry, swp_entry_t *dir, swp_entry_ static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, struct page *page) { - struct inode *inode; + struct address_space *mapping; unsigned long idx; unsigned long size; unsigned long limit; @@ -875,8 +875,10 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, s if (size > SHMEM_NR_DIRECT) size = SHMEM_NR_DIRECT; offset = shmem_find_swp(entry, ptr, ptr+size); - if (offset >= 0) + if (offset >= 0) { + shmem_swp_balance_unmap(); goto found; + } if (!info->i_indirect) goto lost2; @@ -914,11 +916,11 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, s if (size > ENTRIES_PER_PAGE) size = ENTRIES_PER_PAGE; offset = shmem_find_swp(entry, ptr, ptr+size); - shmem_swp_unmap(ptr); if (offset >= 0) { shmem_dir_unmap(dir); goto found; } + shmem_swp_unmap(ptr); } } lost1: @@ -928,8 +930,7 @@ lost2: return 0; found: idx += offset; - inode = igrab(&info->vfs_inode); - spin_unlock(&info->lock); + ptr += offset; /* * Move _head_ to start search for next from here. @@ -940,37 +941,18 @@ found: */ if (shmem_swaplist.next != &info->swaplist) list_move_tail(&shmem_swaplist, &info->swaplist); - mutex_unlock(&shmem_swaplist_mutex); - error = 1; - if (!inode) - goto out; /* - * Charge page using GFP_KERNEL while we can wait. - * Charged back to the user(not to caller) when swap account is used. - * add_to_page_cache() will be called with GFP_NOWAIT. + * We rely on shmem_swaplist_mutex, not only to protect the swaplist, + * but also to hold up shmem_evict_inode(): so inode cannot be freed + * beneath us (pagelock doesn't help until the page is in pagecache). */ - error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL); - if (error) - goto out; - error = radix_tree_preload(GFP_KERNEL); - if (error) { - mem_cgroup_uncharge_cache_page(page); - goto out; - } - error = 1; - - spin_lock(&info->lock); - ptr = shmem_swp_entry(info, idx, NULL); - if (ptr && ptr->val == entry.val) { - error = add_to_page_cache_locked(page, inode->i_mapping, - idx, GFP_NOWAIT); - /* does mem_cgroup_uncharge_cache_page on error */ - } else /* we must compensate for our precharge above */ - mem_cgroup_uncharge_cache_page(page); + mapping = info->vfs_inode.i_mapping; + error = add_to_page_cache_locked(page, mapping, idx, GFP_NOWAIT); + /* which does mem_cgroup_uncharge_cache_page on error */ if (error == -EEXIST) { - struct page *filepage = find_get_page(inode->i_mapping, idx); + struct page *filepage = find_get_page(mapping, idx); error = 1; if (filepage) { /* @@ -990,14 +972,8 @@ found: swap_free(entry); error = 1; /* not an error, but entry was found */ } - if (ptr) - shmem_swp_unmap(ptr); + shmem_swp_unmap(ptr); spin_unlock(&info->lock); - radix_tree_preload_end(); -out: - unlock_page(page); - page_cache_release(page); - iput(inode); /* allows for NULL */ return error; } @@ -1009,6 +985,26 @@ int shmem_unuse(swp_entry_t entry, struct page *page) struct list_head *p, *next; struct shmem_inode_info *info; int found = 0; + int error; + + /* + * Charge page using GFP_KERNEL while we can wait, before taking + * the shmem_swaplist_mutex which might hold up shmem_writepage(). + * Charged back to the user (not to caller) when swap account is used. + * add_to_page_cache() will be called with GFP_NOWAIT. + */ + error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL); + if (error) + goto out; + /* + * Try to preload while we can wait, to not make a habit of + * draining atomic reserves; but don't latch on to this cpu, + * it's okay if sometimes we get rescheduled after this. + */ + error = radix_tree_preload(GFP_KERNEL); + if (error) + goto uncharge; + radix_tree_preload_end(); mutex_lock(&shmem_swaplist_mutex); list_for_each_safe(p, next, &shmem_swaplist) { @@ -1016,17 +1012,19 @@ int shmem_unuse(swp_entry_t entry, struct page *page) found = shmem_unuse_inode(info, entry, page); cond_resched(); if (found) - goto out; + break; } mutex_unlock(&shmem_swaplist_mutex); - /* - * Can some race bring us here? We've been holding page lock, - * so I think not; but would rather try again later than BUG() - */ + +uncharge: + if (!found) + mem_cgroup_uncharge_cache_page(page); + if (found < 0) + error = found; +out: unlock_page(page); page_cache_release(page); -out: - return (found < 0) ? found : 0; + return error; } /* -- cgit v1.1 From 59a16ead572330deb38e5848151d30ed1af754bc Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 11 May 2011 15:13:38 -0700 Subject: tmpfs: fix spurious ENOSPC when racing with unswap Testing the shmem_swaplist replacements for igrab() revealed another bug: writes to /dev/loop0 on a tmpfs file which fills its filesystem were sometimes failing with "Buffer I/O error"s. These came from ENOSPC failures of shmem_getpage(), when racing with swapoff: the same could happen when racing with another shmem_getpage(), pulling the page in from swap in between our find_lock_page() and our taking the info->lock (though not in the single-threaded loop case). This is unacceptable, and surprising that I've not noticed it before: it dates back many years, but (presumably) was made a lot easier to reproduce in 2.6.36, which sited a page preallocation in the race window. Fix it by rechecking the page cache before settling on an ENOSPC error. Signed-off-by: Hugh Dickins Cc: Konstantin Khlebnikov Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index dc17551..9e755c1 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1407,20 +1407,14 @@ repeat: if (sbinfo->max_blocks) { if (percpu_counter_compare(&sbinfo->used_blocks, sbinfo->max_blocks) >= 0 || - shmem_acct_block(info->flags)) { - spin_unlock(&info->lock); - error = -ENOSPC; - goto failed; - } + shmem_acct_block(info->flags)) + goto nospace; percpu_counter_inc(&sbinfo->used_blocks); spin_lock(&inode->i_lock); inode->i_blocks += BLOCKS_PER_PAGE; spin_unlock(&inode->i_lock); - } else if (shmem_acct_block(info->flags)) { - spin_unlock(&info->lock); - error = -ENOSPC; - goto failed; - } + } else if (shmem_acct_block(info->flags)) + goto nospace; if (!filepage) { int ret; @@ -1500,6 +1494,24 @@ done: error = 0; goto out; +nospace: + /* + * Perhaps the page was brought in from swap between find_lock_page + * and taking info->lock? We allow for that at add_to_page_cache_lru, + * but must also avoid reporting a spurious ENOSPC while working on a + * full tmpfs. (When filepage has been passed in to shmem_getpage, it + * is already in page cache, which prevents this race from occurring.) + */ + if (!filepage) { + struct page *page = find_get_page(mapping, idx); + if (page) { + spin_unlock(&info->lock); + page_cache_release(page); + goto repeat; + } + } + spin_unlock(&info->lock); + error = -ENOSPC; failed: if (*pagep != filepage) { unlock_page(filepage); -- cgit v1.1 From 1d929b7a84438ad9012c5826f5617d79a3efcef1 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 11 May 2011 15:13:39 -0700 Subject: mm: tracing: add missing GFP flags to tracing include/linux/gfp.h and include/trace/events/gfpflags.h are out of sync. When tracing is enabled, certain flags are not recognised and the text output is less useful as a result. Add the missing flags. Signed-off-by: Mel Gorman Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/gfpflags.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/include/trace/events/gfpflags.h b/include/trace/events/gfpflags.h index e3615c0..9fe3a36 100644 --- a/include/trace/events/gfpflags.h +++ b/include/trace/events/gfpflags.h @@ -10,6 +10,7 @@ */ #define show_gfp_flags(flags) \ (flags) ? __print_flags(flags, "|", \ + {(unsigned long)GFP_TRANSHUGE, "GFP_TRANSHUGE"}, \ {(unsigned long)GFP_HIGHUSER_MOVABLE, "GFP_HIGHUSER_MOVABLE"}, \ {(unsigned long)GFP_HIGHUSER, "GFP_HIGHUSER"}, \ {(unsigned long)GFP_USER, "GFP_USER"}, \ @@ -32,6 +33,9 @@ {(unsigned long)__GFP_HARDWALL, "GFP_HARDWALL"}, \ {(unsigned long)__GFP_THISNODE, "GFP_THISNODE"}, \ {(unsigned long)__GFP_RECLAIMABLE, "GFP_RECLAIMABLE"}, \ - {(unsigned long)__GFP_MOVABLE, "GFP_MOVABLE"} \ + {(unsigned long)__GFP_MOVABLE, "GFP_MOVABLE"}, \ + {(unsigned long)__GFP_NOTRACK, "GFP_NOTRACK"}, \ + {(unsigned long)__GFP_NO_KSWAPD, "GFP_NO_KSWAPD"}, \ + {(unsigned long)__GFP_OTHER_NODE, "GFP_OTHER_NODE"} \ ) : "GFP_NOWAIT" -- cgit v1.1 From ce8453776d68982cfe93bcb28191af8ccad01f45 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 11 May 2011 18:58:16 -0700 Subject: Revert "Bluetooth: fix shutdown on SCO sockets" This reverts commit f21ca5fff6e548833fa5ee8867239a8378623150. Quoth Gustavo F. Padovan: "Commit f21ca5fff6e548833fa5ee8867239a8378623150 can cause a NULL dereference if we call shutdown in a bluetooth SCO socket and doesn't wait the shutdown completion to call close(). Please revert it. I may have a fix for it soon, but we don't have time anymore, so revert is the way to go. ;)" Requested-by: Gustavo F. Padovan Signed-off-by: Linus Torvalds --- net/bluetooth/sco.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index 94954c7..42fdffd 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -369,15 +369,6 @@ static void __sco_sock_close(struct sock *sk) case BT_CONNECTED: case BT_CONFIG: - if (sco_pi(sk)->conn) { - sk->sk_state = BT_DISCONN; - sco_sock_set_timer(sk, SCO_DISCONN_TIMEOUT); - hci_conn_put(sco_pi(sk)->conn->hcon); - sco_pi(sk)->conn = NULL; - } else - sco_chan_del(sk, ECONNRESET); - break; - case BT_CONNECT: case BT_DISCONN: sco_chan_del(sk, ECONNRESET); -- cgit v1.1 From a75b9df9d3bfc3cd1083974c045ae31ce5f3434f Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 11 May 2011 18:00:51 -0400 Subject: NFSv4.1: Ensure that layoutget uses the correct gfp modes Currently, writebacks may end up recursing back into the filesystem due to GFP_KERNEL direct reclaims in the pnfs subsystem. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4filelayout.c | 25 ++++++++++++++----------- fs/nfs/nfs4filelayout.h | 2 +- fs/nfs/nfs4filelayoutdev.c | 34 +++++++++++++++++----------------- fs/nfs/pnfs.c | 33 +++++++++++++++++++-------------- fs/nfs/pnfs.h | 6 +++--- fs/nfs/read.c | 4 ++-- fs/nfs/write.c | 4 ++-- include/linux/nfs_xdr.h | 1 + 8 files changed, 59 insertions(+), 50 deletions(-) diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index 7841ea6..be79dc9 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -418,7 +418,8 @@ static int filelayout_check_layout(struct pnfs_layout_hdr *lo, struct nfs4_filelayout_segment *fl, struct nfs4_layoutget_res *lgr, - struct nfs4_deviceid *id) + struct nfs4_deviceid *id, + gfp_t gfp_flags) { struct nfs4_file_layout_dsaddr *dsaddr; int status = -EINVAL; @@ -441,7 +442,7 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo, /* find and reference the deviceid */ dsaddr = nfs4_fl_find_get_deviceid(id); if (dsaddr == NULL) { - dsaddr = get_device_info(lo->plh_inode, id); + dsaddr = get_device_info(lo->plh_inode, id, gfp_flags); if (dsaddr == NULL) goto out; } @@ -502,7 +503,8 @@ static int filelayout_decode_layout(struct pnfs_layout_hdr *flo, struct nfs4_filelayout_segment *fl, struct nfs4_layoutget_res *lgr, - struct nfs4_deviceid *id) + struct nfs4_deviceid *id, + gfp_t gfp_flags) { struct xdr_stream stream; struct xdr_buf buf = { @@ -518,7 +520,7 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo, dprintk("%s: set_layout_map Begin\n", __func__); - scratch = alloc_page(GFP_KERNEL); + scratch = alloc_page(gfp_flags); if (!scratch) return -ENOMEM; @@ -556,13 +558,13 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo, goto out_err; fl->fh_array = kzalloc(fl->num_fh * sizeof(struct nfs_fh *), - GFP_KERNEL); + gfp_flags); if (!fl->fh_array) goto out_err; for (i = 0; i < fl->num_fh; i++) { /* Do we want to use a mempool here? */ - fl->fh_array[i] = kmalloc(sizeof(struct nfs_fh), GFP_KERNEL); + fl->fh_array[i] = kmalloc(sizeof(struct nfs_fh), gfp_flags); if (!fl->fh_array[i]) goto out_err_free; @@ -607,19 +609,20 @@ filelayout_free_lseg(struct pnfs_layout_segment *lseg) static struct pnfs_layout_segment * filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid, - struct nfs4_layoutget_res *lgr) + struct nfs4_layoutget_res *lgr, + gfp_t gfp_flags) { struct nfs4_filelayout_segment *fl; int rc; struct nfs4_deviceid id; dprintk("--> %s\n", __func__); - fl = kzalloc(sizeof(*fl), GFP_KERNEL); + fl = kzalloc(sizeof(*fl), gfp_flags); if (!fl) return NULL; - rc = filelayout_decode_layout(layoutid, fl, lgr, &id); - if (rc != 0 || filelayout_check_layout(layoutid, fl, lgr, &id)) { + rc = filelayout_decode_layout(layoutid, fl, lgr, &id, gfp_flags); + if (rc != 0 || filelayout_check_layout(layoutid, fl, lgr, &id, gfp_flags)) { _filelayout_free_lseg(fl); return NULL; } @@ -635,7 +638,7 @@ filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid, int size = (fl->stripe_type == STRIPE_SPARSE) ? fl->dsaddr->ds_num : fl->dsaddr->stripe_count; - fl->commit_buckets = kcalloc(size, sizeof(struct list_head), GFP_KERNEL); + fl->commit_buckets = kcalloc(size, sizeof(struct list_head), gfp_flags); if (!fl->commit_buckets) { filelayout_free_lseg(&fl->generic_hdr); return NULL; diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h index 7c44579..2b461d7 100644 --- a/fs/nfs/nfs4filelayout.h +++ b/fs/nfs/nfs4filelayout.h @@ -104,6 +104,6 @@ extern struct nfs4_file_layout_dsaddr * nfs4_fl_find_get_deviceid(struct nfs4_deviceid *dev_id); extern void nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr); struct nfs4_file_layout_dsaddr * -get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id); +get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id, gfp_t gfp_flags); #endif /* FS_NFS_NFS4FILELAYOUT_H */ diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c index de5350f..db07c7a 100644 --- a/fs/nfs/nfs4filelayoutdev.c +++ b/fs/nfs/nfs4filelayoutdev.c @@ -225,11 +225,11 @@ nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr) } static struct nfs4_pnfs_ds * -nfs4_pnfs_ds_add(struct inode *inode, u32 ip_addr, u32 port) +nfs4_pnfs_ds_add(struct inode *inode, u32 ip_addr, u32 port, gfp_t gfp_flags) { struct nfs4_pnfs_ds *tmp_ds, *ds; - ds = kzalloc(sizeof(*tmp_ds), GFP_KERNEL); + ds = kzalloc(sizeof(*tmp_ds), gfp_flags); if (!ds) goto out; @@ -261,7 +261,7 @@ out: * Currently only support ipv4, and one multi-path address. */ static struct nfs4_pnfs_ds * -decode_and_add_ds(struct xdr_stream *streamp, struct inode *inode) +decode_and_add_ds(struct xdr_stream *streamp, struct inode *inode, gfp_t gfp_flags) { struct nfs4_pnfs_ds *ds = NULL; char *buf; @@ -303,7 +303,7 @@ decode_and_add_ds(struct xdr_stream *streamp, struct inode *inode) rlen); goto out_err; } - buf = kmalloc(rlen + 1, GFP_KERNEL); + buf = kmalloc(rlen + 1, gfp_flags); if (!buf) { dprintk("%s: Not enough memory\n", __func__); goto out_err; @@ -333,7 +333,7 @@ decode_and_add_ds(struct xdr_stream *streamp, struct inode *inode) sscanf(pstr, "-%d-%d", &tmp[0], &tmp[1]); port = htons((tmp[0] << 8) | (tmp[1])); - ds = nfs4_pnfs_ds_add(inode, ip_addr, port); + ds = nfs4_pnfs_ds_add(inode, ip_addr, port, gfp_flags); dprintk("%s: Decoded address and port %s\n", __func__, buf); out_free: kfree(buf); @@ -343,7 +343,7 @@ out_err: /* Decode opaque device data and return the result */ static struct nfs4_file_layout_dsaddr* -decode_device(struct inode *ino, struct pnfs_device *pdev) +decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags) { int i; u32 cnt, num; @@ -362,7 +362,7 @@ decode_device(struct inode *ino, struct pnfs_device *pdev) struct page *scratch; /* set up xdr stream */ - scratch = alloc_page(GFP_KERNEL); + scratch = alloc_page(gfp_flags); if (!scratch) goto out_err; @@ -384,7 +384,7 @@ decode_device(struct inode *ino, struct pnfs_device *pdev) } /* read stripe indices */ - stripe_indices = kcalloc(cnt, sizeof(u8), GFP_KERNEL); + stripe_indices = kcalloc(cnt, sizeof(u8), gfp_flags); if (!stripe_indices) goto out_err_free_scratch; @@ -423,7 +423,7 @@ decode_device(struct inode *ino, struct pnfs_device *pdev) dsaddr = kzalloc(sizeof(*dsaddr) + (sizeof(struct nfs4_pnfs_ds *) * (num - 1)), - GFP_KERNEL); + gfp_flags); if (!dsaddr) goto out_err_free_stripe_indices; @@ -452,7 +452,7 @@ decode_device(struct inode *ino, struct pnfs_device *pdev) for (j = 0; j < mp_count; j++) { if (j == 0) { dsaddr->ds_list[i] = decode_and_add_ds(&stream, - ino); + ino, gfp_flags); if (dsaddr->ds_list[i] == NULL) goto out_err_free_deviceid; } else { @@ -503,12 +503,12 @@ out_err: * available devices. */ static struct nfs4_file_layout_dsaddr * -decode_and_add_device(struct inode *inode, struct pnfs_device *dev) +decode_and_add_device(struct inode *inode, struct pnfs_device *dev, gfp_t gfp_flags) { struct nfs4_file_layout_dsaddr *d, *new; long hash; - new = decode_device(inode, dev); + new = decode_device(inode, dev, gfp_flags); if (!new) { printk(KERN_WARNING "%s: Could not decode or add device\n", __func__); @@ -537,7 +537,7 @@ decode_and_add_device(struct inode *inode, struct pnfs_device *dev) * of available devices, and return it. */ struct nfs4_file_layout_dsaddr * -get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id) +get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id, gfp_t gfp_flags) { struct pnfs_device *pdev = NULL; u32 max_resp_sz; @@ -556,17 +556,17 @@ get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id) dprintk("%s inode %p max_resp_sz %u max_pages %d\n", __func__, inode, max_resp_sz, max_pages); - pdev = kzalloc(sizeof(struct pnfs_device), GFP_KERNEL); + pdev = kzalloc(sizeof(struct pnfs_device), gfp_flags); if (pdev == NULL) return NULL; - pages = kzalloc(max_pages * sizeof(struct page *), GFP_KERNEL); + pages = kzalloc(max_pages * sizeof(struct page *), gfp_flags); if (pages == NULL) { kfree(pdev); return NULL; } for (i = 0; i < max_pages; i++) { - pages[i] = alloc_page(GFP_KERNEL); + pages[i] = alloc_page(gfp_flags); if (!pages[i]) goto out_free; } @@ -587,7 +587,7 @@ get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id) * Found new device, need to decode it and then add it to the * list of known devices for this mountpoint. */ - dsaddr = decode_and_add_device(inode, pdev); + dsaddr = decode_and_add_device(inode, pdev, gfp_flags); out_free: for (i = 0; i < max_pages; i++) __free_page(pages[i]); diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 65455f5..f57f528 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -467,7 +467,8 @@ pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo, static struct pnfs_layout_segment * send_layoutget(struct pnfs_layout_hdr *lo, struct nfs_open_context *ctx, - u32 iomode) + u32 iomode, + gfp_t gfp_flags) { struct inode *ino = lo->plh_inode; struct nfs_server *server = NFS_SERVER(ino); @@ -480,7 +481,7 @@ send_layoutget(struct pnfs_layout_hdr *lo, dprintk("--> %s\n", __func__); BUG_ON(ctx == NULL); - lgp = kzalloc(sizeof(*lgp), GFP_KERNEL); + lgp = kzalloc(sizeof(*lgp), gfp_flags); if (lgp == NULL) return NULL; @@ -488,12 +489,12 @@ send_layoutget(struct pnfs_layout_hdr *lo, max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz; max_pages = max_resp_sz >> PAGE_SHIFT; - pages = kzalloc(max_pages * sizeof(struct page *), GFP_KERNEL); + pages = kzalloc(max_pages * sizeof(struct page *), gfp_flags); if (!pages) goto out_err_free; for (i = 0; i < max_pages; i++) { - pages[i] = alloc_page(GFP_KERNEL); + pages[i] = alloc_page(gfp_flags); if (!pages[i]) goto out_err_free; } @@ -509,6 +510,7 @@ send_layoutget(struct pnfs_layout_hdr *lo, lgp->args.layout.pages = pages; lgp->args.layout.pglen = max_pages * PAGE_SIZE; lgp->lsegpp = &lseg; + lgp->gfp_flags = gfp_flags; /* Synchronously retrieve layout information from server and * store in lseg. @@ -666,11 +668,11 @@ pnfs_insert_layout(struct pnfs_layout_hdr *lo, } static struct pnfs_layout_hdr * -alloc_init_layout_hdr(struct inode *ino) +alloc_init_layout_hdr(struct inode *ino, gfp_t gfp_flags) { struct pnfs_layout_hdr *lo; - lo = kzalloc(sizeof(struct pnfs_layout_hdr), GFP_KERNEL); + lo = kzalloc(sizeof(struct pnfs_layout_hdr), gfp_flags); if (!lo) return NULL; atomic_set(&lo->plh_refcount, 1); @@ -682,7 +684,7 @@ alloc_init_layout_hdr(struct inode *ino) } static struct pnfs_layout_hdr * -pnfs_find_alloc_layout(struct inode *ino) +pnfs_find_alloc_layout(struct inode *ino, gfp_t gfp_flags) { struct nfs_inode *nfsi = NFS_I(ino); struct pnfs_layout_hdr *new = NULL; @@ -697,7 +699,7 @@ pnfs_find_alloc_layout(struct inode *ino) return nfsi->layout; } spin_unlock(&ino->i_lock); - new = alloc_init_layout_hdr(ino); + new = alloc_init_layout_hdr(ino, gfp_flags); spin_lock(&ino->i_lock); if (likely(nfsi->layout == NULL)) /* Won the race? */ @@ -757,7 +759,8 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo, u32 iomode) struct pnfs_layout_segment * pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, - enum pnfs_iomode iomode) + enum pnfs_iomode iomode, + gfp_t gfp_flags) { struct nfs_inode *nfsi = NFS_I(ino); struct nfs_client *clp = NFS_SERVER(ino)->nfs_client; @@ -768,7 +771,7 @@ pnfs_update_layout(struct inode *ino, if (!pnfs_enabled_sb(NFS_SERVER(ino))) return NULL; spin_lock(&ino->i_lock); - lo = pnfs_find_alloc_layout(ino); + lo = pnfs_find_alloc_layout(ino, gfp_flags); if (lo == NULL) { dprintk("%s ERROR: can't get pnfs_layout_hdr\n", __func__); goto out_unlock; @@ -808,7 +811,7 @@ pnfs_update_layout(struct inode *ino, spin_unlock(&clp->cl_lock); } - lseg = send_layoutget(lo, ctx, iomode); + lseg = send_layoutget(lo, ctx, iomode, gfp_flags); if (!lseg && first) { spin_lock(&clp->cl_lock); list_del_init(&lo->plh_layouts); @@ -847,7 +850,7 @@ pnfs_layout_process(struct nfs4_layoutget *lgp) goto out; } /* Inject layout blob into I/O device driver */ - lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res); + lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res, lgp->gfp_flags); if (!lseg || IS_ERR(lseg)) { if (!lseg) status = -ENOMEM; @@ -900,7 +903,8 @@ static int pnfs_read_pg_test(struct nfs_pageio_descriptor *pgio, /* This is first coelesce call for a series of nfs_pages */ pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, prev->wb_context, - IOMODE_READ); + IOMODE_READ, + GFP_KERNEL); } return NFS_SERVER(pgio->pg_inode)->pnfs_curr_ld->pg_test(pgio, prev, req); } @@ -922,7 +926,8 @@ static int pnfs_write_pg_test(struct nfs_pageio_descriptor *pgio, /* This is first coelesce call for a series of nfs_pages */ pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, prev->wb_context, - IOMODE_RW); + IOMODE_RW, + GFP_NOFS); } return NFS_SERVER(pgio->pg_inode)->pnfs_curr_ld->pg_test(pgio, prev, req); } diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index bc48272..0c015ba 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -70,7 +70,7 @@ struct pnfs_layoutdriver_type { const u32 id; const char *name; struct module *owner; - struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr); + struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr, gfp_t gfp_flags); void (*free_lseg) (struct pnfs_layout_segment *lseg); /* test for nfs page cache coalescing */ @@ -126,7 +126,7 @@ void get_layout_hdr(struct pnfs_layout_hdr *lo); void put_lseg(struct pnfs_layout_segment *lseg); struct pnfs_layout_segment * pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, - enum pnfs_iomode access_type); + enum pnfs_iomode access_type, gfp_t gfp_flags); void set_pnfs_layoutdriver(struct nfs_server *, u32 id); void unset_pnfs_layoutdriver(struct nfs_server *); enum pnfs_try_status pnfs_try_to_write_data(struct nfs_write_data *, @@ -245,7 +245,7 @@ static inline void put_lseg(struct pnfs_layout_segment *lseg) static inline struct pnfs_layout_segment * pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, - enum pnfs_iomode access_type) + enum pnfs_iomode access_type, gfp_t gfp_flags) { return NULL; } diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 7cded2b..2bcf0dc 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -288,7 +288,7 @@ static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc) atomic_set(&req->wb_complete, requests); BUG_ON(desc->pg_lseg != NULL); - lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_READ); + lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_READ, GFP_KERNEL); ClearPageError(page); offset = 0; nbytes = desc->pg_count; @@ -351,7 +351,7 @@ static int nfs_pagein_one(struct nfs_pageio_descriptor *desc) } req = nfs_list_entry(data->pages.next); if ((!lseg) && list_is_singular(&data->pages)) - lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_READ); + lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_READ, GFP_KERNEL); ret = nfs_read_rpcsetup(req, data, &nfs_read_full_ops, desc->pg_count, 0, lseg); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 3bd5d7e..49c715b 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -939,7 +939,7 @@ static int nfs_flush_multi(struct nfs_pageio_descriptor *desc) atomic_set(&req->wb_complete, requests); BUG_ON(desc->pg_lseg); - lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_RW); + lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_RW, GFP_NOFS); ClearPageError(page); offset = 0; nbytes = desc->pg_count; @@ -1013,7 +1013,7 @@ static int nfs_flush_one(struct nfs_pageio_descriptor *desc) } req = nfs_list_entry(data->pages.next); if ((!lseg) && list_is_singular(&data->pages)) - lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_RW); + lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_RW, GFP_NOFS); if ((desc->pg_ioflags & FLUSH_COND_STABLE) && (desc->pg_moreio || NFS_I(desc->pg_inode)->ncommit)) diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 890dce2..7e371f7 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -233,6 +233,7 @@ struct nfs4_layoutget { struct nfs4_layoutget_args args; struct nfs4_layoutget_res res; struct pnfs_layout_segment **lsegpp; + gfp_t gfp_flags; }; struct nfs4_getdeviceinfo_args { -- cgit v1.1 From 21ccc7936dac5ca9b3e2838bbc112a60f34e18b3 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Tue, 10 May 2011 16:17:10 +0000 Subject: ehea: Fix memory hotplug oops The ehea driver oopses during memory hotplug if the ports are not up. A simple testcase: # ifconfig ethX down # echo offline > /sys/devices/system/memory/memory32/state Oops: Kernel access of bad area, sig: 11 [#1] last sysfs file: /sys/devices/system/memory/memory32/state REGS: c000000709393110 TRAP: 0300 Not tainted (2.6.39-rc2-01385-g7ef73bc-dirty) DAR: 0000000000000000, DSISR: 40000000 ... NIP [c000000000067c98] .__wake_up_common+0x48/0xf0 LR [c00000000006d034] .__wake_up+0x54/0x90 Call Trace: [c00000000006d034] .__wake_up+0x54/0x90 [d000000006bb6270] .ehea_rereg_mrs+0x140/0x730 [ehea] [d000000006bb69c4] .ehea_mem_notifier+0x164/0x170 [ehea] [c0000000006fc8a8] .notifier_call_chain+0x78/0xf0 [c0000000000b3d70] .__blocking_notifier_call_chain+0x70/0xb0 [c000000000458d78] .memory_notify+0x28/0x40 [c0000000001871d8] .remove_memory+0x208/0x6d0 [c000000000458264] .memory_section_action+0x94/0x140 [c0000000004583ec] .memory_block_change_state+0xdc/0x1d0 [c0000000004585cc] .store_mem_state+0xec/0x160 [c00000000044768c] .sysdev_store+0x3c/0x50 [c00000000020b48c] .sysfs_write_file+0xec/0x1f0 [c00000000018f86c] .vfs_write+0xec/0x1e0 [c00000000018fa88] .SyS_write+0x58/0xd0 To fix this, initialise the waitqueues during port probe instead of port open. Signed-off-by: Anton Blanchard Cc: stable@kernel.org Acked-by: Breno Leitao Signed-off-by: David S. Miller --- drivers/net/ehea/ehea_main.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ehea/ehea_main.c b/drivers/net/ehea/ehea_main.c index 53c0f04..cf79cf7 100644 --- a/drivers/net/ehea/ehea_main.c +++ b/drivers/net/ehea/ehea_main.c @@ -2688,9 +2688,6 @@ static int ehea_open(struct net_device *dev) netif_start_queue(dev); } - init_waitqueue_head(&port->swqe_avail_wq); - init_waitqueue_head(&port->restart_wq); - mutex_unlock(&port->port_lock); return ret; @@ -3276,6 +3273,9 @@ struct ehea_port *ehea_setup_single_port(struct ehea_adapter *adapter, INIT_WORK(&port->reset_task, ehea_reset_port); + init_waitqueue_head(&port->swqe_avail_wq); + init_waitqueue_head(&port->restart_wq); + ret = register_netdev(dev); if (ret) { pr_err("register_netdev failed. ret=%d\n", ret); -- cgit v1.1 From b1054282d752c5a026e2c0450616ebf37fc0413e Mon Sep 17 00:00:00 2001 From: Tkhai Kirill Date: Tue, 10 May 2011 02:31:41 +0000 Subject: sparc32: Fixed unaligned memory copying in function __csum_partial_copy_sparc_generic When we are in the label cc_dword_align, registers %o0 and %o1 have the same last 2 bits, but it's not guaranteed one of them is zero. So we can get unaligned memory access in label ccte. Example of parameters which lead to this: %o0=0x7ff183e9, %o1=0x8e709e7d, %g1=3 With the parameters I had a memory corruption, when the additional 5 bytes were rewritten. This patch corrects the error. One comment to the patch. We don't care about the third bit in %o1, because cc_end_cruft stores word or less. Signed-off-by: Tkhai Kirill Signed-off-by: David S. Miller --- arch/sparc/lib/checksum_32.S | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/arch/sparc/lib/checksum_32.S b/arch/sparc/lib/checksum_32.S index 3632cb3..0084c33 100644 --- a/arch/sparc/lib/checksum_32.S +++ b/arch/sparc/lib/checksum_32.S @@ -289,10 +289,16 @@ cc_end_cruft: /* Also, handle the alignment code out of band. */ cc_dword_align: - cmp %g1, 6 - bl,a ccte + cmp %g1, 16 + bge 1f + srl %g1, 1, %o3 +2: cmp %o3, 0 + be,a ccte andcc %g1, 0xf, %o3 - andcc %o0, 0x1, %g0 + andcc %o3, %o0, %g0 ! Check %o0 only (%o1 has the same last 2 bits) + be,a 2b + srl %o3, 1, %o3 +1: andcc %o0, 0x1, %g0 bne ccslow andcc %o0, 0x2, %g0 be 1f -- cgit v1.1 From 3e51e3edfd81bfd9853ad7de91167e4ce33d0fe7 Mon Sep 17 00:00:00 2001 From: Samir Bellabes Date: Wed, 11 May 2011 18:18:05 +0200 Subject: sched: Remove unused parameters from sched_fork() and wake_up_new_task() sched_fork() and wake_up_new_task() are defined with a parameter 'unsigned long clone_flags', which is unused. This patch removes the parameters. Signed-off-by: Samir Bellabes Acked-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1305130685-1047-1-git-send-email-sam@synack.fr Signed-off-by: Ingo Molnar --- include/linux/sched.h | 5 ++--- kernel/fork.c | 4 ++-- kernel/sched.c | 4 ++-- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 6b4280b..12211e1 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2051,14 +2051,13 @@ extern void xtime_update(unsigned long ticks); extern int wake_up_state(struct task_struct *tsk, unsigned int state); extern int wake_up_process(struct task_struct *tsk); -extern void wake_up_new_task(struct task_struct *tsk, - unsigned long clone_flags); +extern void wake_up_new_task(struct task_struct *tsk); #ifdef CONFIG_SMP extern void kick_process(struct task_struct *tsk); #else static inline void kick_process(struct task_struct *tsk) { } #endif -extern void sched_fork(struct task_struct *p, int clone_flags); +extern void sched_fork(struct task_struct *p); extern void sched_dead(struct task_struct *p); extern void proc_caches_init(void); diff --git a/kernel/fork.c b/kernel/fork.c index aca6287..2b44d82 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1152,7 +1152,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, #endif /* Perform scheduler related setup. Assign this task to a CPU. */ - sched_fork(p, clone_flags); + sched_fork(p); retval = perf_event_init_task(p); if (retval) @@ -1463,7 +1463,7 @@ long do_fork(unsigned long clone_flags, */ p->flags &= ~PF_STARTING; - wake_up_new_task(p, clone_flags); + wake_up_new_task(p); tracehook_report_clone_complete(trace, regs, clone_flags, nr, p); diff --git a/kernel/sched.c b/kernel/sched.c index da93381..f9778c0 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2741,7 +2741,7 @@ static void __sched_fork(struct task_struct *p) /* * fork()/clone()-time setup: */ -void sched_fork(struct task_struct *p, int clone_flags) +void sched_fork(struct task_struct *p) { unsigned long flags; int cpu = get_cpu(); @@ -2823,7 +2823,7 @@ void sched_fork(struct task_struct *p, int clone_flags) * that must be done for every newly created context, then puts the task * on the runqueue and wakes it. */ -void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) +void wake_up_new_task(struct task_struct *p) { unsigned long flags; struct rq *rq; -- cgit v1.1 From 9af386c8dc5a9dce56f36b484647ad6401758c85 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 28 Apr 2011 18:44:31 +0100 Subject: ARM: 6890/1: memmap: only free allocated memmap entries when using SPARSEMEM The SPARSEMEM code allocates memmap entries only for sections which are present (i.e. those which contain some valid memory). The membank checks in free_unused_memmap do not take this into account and can incorrectly attempt to free memory which is not allocated, resulting in a BUG() in the bootmem code. However, if memory is configured as follows: |<----section---->|<----hole---->|<----section---->| +--------+--------+--------------+--------+--------+ | bank 0 | unused | | bank 1 | unused | +--------+--------+--------------+--------+--------+ where a bank only occupies part of a section, the memmap allocated for the remainder of the section *can* be freed. This patch modifies the checks in free_unused_memmap so that only valid memmap entries are considered for removal. Acked-by: Catalin Marinas Signed-off-by: Will Deacon Signed-off-by: Russell King --- arch/arm/mm/init.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c index e5f6fc4..e591513 100644 --- a/arch/arm/mm/init.c +++ b/arch/arm/mm/init.c @@ -392,7 +392,7 @@ free_memmap(unsigned long start_pfn, unsigned long end_pfn) * Convert start_pfn/end_pfn to a struct page pointer. */ start_pg = pfn_to_page(start_pfn - 1) + 1; - end_pg = pfn_to_page(end_pfn); + end_pg = pfn_to_page(end_pfn - 1) + 1; /* * Convert to physical addresses, and @@ -426,6 +426,14 @@ static void __init free_unused_memmap(struct meminfo *mi) bank_start = bank_pfn_start(bank); +#ifdef CONFIG_SPARSEMEM + /* + * Take care not to free memmap entries that don't exist + * due to SPARSEMEM sections which aren't present. + */ + bank_start = min(bank_start, + ALIGN(prev_bank_end, PAGES_PER_SECTION)); +#endif /* * If we had a previous bank, and there is a space * between the current bank and the previous, free it. @@ -440,6 +448,12 @@ static void __init free_unused_memmap(struct meminfo *mi) */ prev_bank_end = ALIGN(bank_pfn_end(bank), MAX_ORDER_NR_PAGES); } + +#ifdef CONFIG_SPARSEMEM + if (!IS_ALIGNED(prev_bank_end, PAGES_PER_SECTION)) + free_memmap(prev_bank_end, + ALIGN(prev_bank_end, PAGES_PER_SECTION)); +#endif } static void __init free_highpages(void) -- cgit v1.1 From 2af68df02fe5ccd644f4312ba2401996f52faab3 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 3 May 2011 18:32:55 +0100 Subject: ARM: 6892/1: handle ptrace requests to change PC during interrupted system calls GDB's interrupt.exp test cases currenly fail on ARM. The problem is how do_signal handled restarting interrupted system calls: The entry.S assembler code determines that we come from a system call; and that information is passed as "syscall" parameter to do_signal. That routine then calls get_signal_to_deliver [*] and if a signal is to be delivered, calls into handle_signal. If a system call is to be restarted either after the signal handler returns, or if no handler is to be called in the first place, the PC is updated after the get_signal_to_deliver call, either in handle_signal (if we have a handler) or at the end of do_signal (otherwise). Now the problem is that during [*], the call to get_signal_to_deliver, a ptrace intercept may happen. During this intercept, the debugger may change registers, including the PC. This is done by GDB if it wants to execute an "inferior call", i.e. the execution of some code in the debugged program triggered by GDB. To this purpose, GDB will save all registers, allocate a stack frame, set up PC and arguments as appropriate for the call, and point the link register to a dummy breakpoint instruction. Once the process is restarted, it will execute the call and then trap back to the debugger, at which point GDB will restore all registers and continue original execution. This generally works fine. However, now consider what happens when GDB attempts to do exactly that while the process was interrupted during execution of a to-be- restarted system call: do_signal is called with the syscall flag set; it calls get_signal_to_deliver, at which point the debugger takes over and changes the PC to point to a completely different place. Now get_signal_to_deliver returns without a signal to deliver; but now do_signal decides it should be restarting a system call, and decrements the PC by 2 or 4 -- so it now points to 2 or 4 bytes before the function GDB wants to call -- which leads to a subsequent crash. To fix this problem, two things need to be supported: - do_signal must be able to recognize that get_signal_to_deliver changed the PC to a different location, and skip the restart-syscall sequence - once the debugger has restored all registers at the end of the inferior call sequence, do_signal must recognize that *now* it needs to restart the pending system call, even though it was now entered from a breakpoint instead of an actual svc instruction This set of issues is solved on other platforms, usually by one of two mechanisms: - The status information "do_signal is handling a system call that may need restarting" is itself carried in some register that can be accessed via ptrace. This is e.g. on Intel the "orig_eax" register; on Sparc the kernel defines a magic extra bit in the flags register for this purpose. This allows GDB to manage that state: reset it when doing an inferior call, and restore it after the call is finished. - On s390, do_signal transparently handles this problem without requiring GDB interaction, by performing system call restarting in the following way: first, adjust the PC as necessary for restarting the call. Then, call get_signal_to_deliver; and finally just continue execution at the PC. This way, if GDB does not change the PC, everything is as before. If GDB *does* change the PC, execution will simply continue there -- and once GDB restores the PC it saved at that point, it will automatically point to the *restarted* system call. (There is the minor twist how to handle system calls that do *not* need restarting -- do_signal will undo the PC change in this case, after get_signal_to_deliver has returned, and only if ptrace did not change the PC during that call.) Because there does not appear to be any obvious register to carry the syscall-restart information on ARM, we'd either have to introduce a new artificial ptrace register just for that purpose, or else handle the issue transparently like on s390. The patch below implements the second option; using this patch makes the interrupt.exp test cases pass on ARM, with no regression in the GDB test suite otherwise. Cc: patches@linaro.org Signed-off-by: Ulrich Weigand Signed-off-by: Arnd Bergmann Signed-off-by: Russell King --- arch/arm/kernel/signal.c | 90 ++++++++++++++++++++++++++++-------------------- 1 file changed, 53 insertions(+), 37 deletions(-) diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c index cb83983..0340224 100644 --- a/arch/arm/kernel/signal.c +++ b/arch/arm/kernel/signal.c @@ -597,19 +597,13 @@ setup_rt_frame(int usig, struct k_sigaction *ka, siginfo_t *info, return err; } -static inline void setup_syscall_restart(struct pt_regs *regs) -{ - regs->ARM_r0 = regs->ARM_ORIG_r0; - regs->ARM_pc -= thumb_mode(regs) ? 2 : 4; -} - /* * OK, we're invoking a handler */ static int handle_signal(unsigned long sig, struct k_sigaction *ka, siginfo_t *info, sigset_t *oldset, - struct pt_regs * regs, int syscall) + struct pt_regs * regs) { struct thread_info *thread = current_thread_info(); struct task_struct *tsk = current; @@ -617,26 +611,6 @@ handle_signal(unsigned long sig, struct k_sigaction *ka, int ret; /* - * If we were from a system call, check for system call restarting... - */ - if (syscall) { - switch (regs->ARM_r0) { - case -ERESTART_RESTARTBLOCK: - case -ERESTARTNOHAND: - regs->ARM_r0 = -EINTR; - break; - case -ERESTARTSYS: - if (!(ka->sa.sa_flags & SA_RESTART)) { - regs->ARM_r0 = -EINTR; - break; - } - /* fallthrough */ - case -ERESTARTNOINTR: - setup_syscall_restart(regs); - } - } - - /* * translate the signal */ if (usig < 32 && thread->exec_domain && thread->exec_domain->signal_invmap) @@ -685,6 +659,7 @@ handle_signal(unsigned long sig, struct k_sigaction *ka, */ static void do_signal(struct pt_regs *regs, int syscall) { + unsigned int retval = 0, continue_addr = 0, restart_addr = 0; struct k_sigaction ka; siginfo_t info; int signr; @@ -698,18 +673,61 @@ static void do_signal(struct pt_regs *regs, int syscall) if (!user_mode(regs)) return; + /* + * If we were from a system call, check for system call restarting... + */ + if (syscall) { + continue_addr = regs->ARM_pc; + restart_addr = continue_addr - (thumb_mode(regs) ? 2 : 4); + retval = regs->ARM_r0; + + /* + * Prepare for system call restart. We do this here so that a + * debugger will see the already changed PSW. + */ + switch (retval) { + case -ERESTARTNOHAND: + case -ERESTARTSYS: + case -ERESTARTNOINTR: + regs->ARM_r0 = regs->ARM_ORIG_r0; + regs->ARM_pc = restart_addr; + break; + case -ERESTART_RESTARTBLOCK: + regs->ARM_r0 = -EINTR; + break; + } + } + if (try_to_freeze()) goto no_signal; + /* + * Get the signal to deliver. When running under ptrace, at this + * point the debugger may change all our registers ... + */ signr = get_signal_to_deliver(&info, &ka, regs, NULL); if (signr > 0) { sigset_t *oldset; + /* + * Depending on the signal settings we may need to revert the + * decision to restart the system call. But skip this if a + * debugger has chosen to restart at a different PC. + */ + if (regs->ARM_pc == restart_addr) { + if (retval == -ERESTARTNOHAND + || (retval == -ERESTARTSYS + && !(ka.sa.sa_flags & SA_RESTART))) { + regs->ARM_r0 = -EINTR; + regs->ARM_pc = continue_addr; + } + } + if (test_thread_flag(TIF_RESTORE_SIGMASK)) oldset = ¤t->saved_sigmask; else oldset = ¤t->blocked; - if (handle_signal(signr, &ka, &info, oldset, regs, syscall) == 0) { + if (handle_signal(signr, &ka, &info, oldset, regs) == 0) { /* * A signal was successfully delivered; the saved * sigmask will have been stored in the signal frame, @@ -723,11 +741,14 @@ static void do_signal(struct pt_regs *regs, int syscall) } no_signal: - /* - * No signal to deliver to the process - restart the syscall. - */ if (syscall) { - if (regs->ARM_r0 == -ERESTART_RESTARTBLOCK) { + /* + * Handle restarting a different system call. As above, + * if a debugger has chosen to restart at a different PC, + * ignore the restart. + */ + if (retval == -ERESTART_RESTARTBLOCK + && regs->ARM_pc == continue_addr) { if (thumb_mode(regs)) { regs->ARM_r7 = __NR_restart_syscall - __NR_SYSCALL_BASE; regs->ARM_pc -= 2; @@ -750,11 +771,6 @@ static void do_signal(struct pt_regs *regs, int syscall) #endif } } - if (regs->ARM_r0 == -ERESTARTNOHAND || - regs->ARM_r0 == -ERESTARTSYS || - regs->ARM_r0 == -ERESTARTNOINTR) { - setup_syscall_restart(regs); - } /* If there's no signal to deliver, we just put the saved sigmask * back. -- cgit v1.1 From a904f5f9eb7a55baacb2f4c1423cac8a8eb78a3a Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Wed, 6 Apr 2011 16:18:47 +0100 Subject: ARM: 6870/1: The mandatory barrier rmb() must be a dsb() in for device accesses Since mandatory barriers may be used (explicitly or implicitly via readl etc.) to ensure the ordering between Device and Normal memory accesses, a DMB is not enough. This patch converts it to a DSB. Cc: Colin Cross Signed-off-by: Catalin Marinas Signed-off-by: Russell King --- arch/arm/include/asm/system.h | 2 +- arch/arm/mach-realview/include/mach/barriers.h | 2 +- arch/arm/mach-tegra/include/mach/barriers.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm/include/asm/system.h b/arch/arm/include/asm/system.h index 885be09..832888d 100644 --- a/arch/arm/include/asm/system.h +++ b/arch/arm/include/asm/system.h @@ -159,7 +159,7 @@ extern unsigned int user_debug; #include #elif defined(CONFIG_ARM_DMA_MEM_BUFFERABLE) || defined(CONFIG_SMP) #define mb() do { dsb(); outer_sync(); } while (0) -#define rmb() dmb() +#define rmb() dsb() #define wmb() mb() #else #include diff --git a/arch/arm/mach-realview/include/mach/barriers.h b/arch/arm/mach-realview/include/mach/barriers.h index 0c5d749..9a73219 100644 --- a/arch/arm/mach-realview/include/mach/barriers.h +++ b/arch/arm/mach-realview/include/mach/barriers.h @@ -4,5 +4,5 @@ * operation to deadlock the system. */ #define mb() dsb() -#define rmb() dmb() +#define rmb() dsb() #define wmb() mb() diff --git a/arch/arm/mach-tegra/include/mach/barriers.h b/arch/arm/mach-tegra/include/mach/barriers.h index cc11517..425b42e 100644 --- a/arch/arm/mach-tegra/include/mach/barriers.h +++ b/arch/arm/mach-tegra/include/mach/barriers.h @@ -23,7 +23,7 @@ #include -#define rmb() dmb() +#define rmb() dsb() #define wmb() do { dsb(); outer_sync(); } while (0) #define mb() wmb() -- cgit v1.1 From 5db1256a5131d3b133946fa02ac9770a784e6eb2 Mon Sep 17 00:00:00 2001 From: Milton Miller Date: Thu, 12 May 2011 04:13:54 -0500 Subject: seqlock: Don't smp_rmb in seqlock reader spin loop Move the smp_rmb after cpu_relax loop in read_seqlock and add ACCESS_ONCE to make sure the test and return are consistent. A multi-threaded core in the lab didn't like the update from 2.6.35 to 2.6.36, to the point it would hang during boot when multiple threads were active. Bisection showed af5ab277ded04bd9bc6b048c5a2f0e7d70ef0867 (clockevents: Remove the per cpu tick skew) as the culprit and it is supported with stack traces showing xtime_lock waits including tick_do_update_jiffies64 and/or update_vsyscall. Experimentation showed the combination of cpu_relax and smp_rmb was significantly slowing the progress of other threads sharing the core, and this patch is effective in avoiding the hang. A theory is the rmb is affecting the whole core while the cpu_relax is causing a resource rebalance flush, together they cause an interfernce cadance that is unbroken when the seqlock reader has interrupts disabled. At first I was confused why the refactor in 3c22cd5709e8143444a6d08682a87f4c57902df3 (kernel: optimise seqlock) didn't affect this patch application, but after some study that affected seqcount not seqlock. The new seqcount was not factored back into the seqlock. I defer that the future. While the removal of the timer interrupt offset created contention for the xtime lock while a cpu does the additonal work to update the system clock, the seqlock implementation with the tight rmb spin loop goes back much further, and is just waiting for the right trigger. Cc: Signed-off-by: Milton Miller Cc: Cc: Linus Torvalds Cc: Andi Kleen Cc: Nick Piggin Cc: Benjamin Herrenschmidt Cc: Anton Blanchard Cc: Paul McKenney Acked-by: Eric Dumazet Link: http://lkml.kernel.org/r/%3Cseqlock-rmb%40mdm.bga.com%3E Signed-off-by: Thomas Gleixner --- include/linux/seqlock.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index e98cd2e..06d6964 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -88,12 +88,12 @@ static __always_inline unsigned read_seqbegin(const seqlock_t *sl) unsigned ret; repeat: - ret = sl->sequence; - smp_rmb(); + ret = ACCESS_ONCE(sl->sequence); if (unlikely(ret & 1)) { cpu_relax(); goto repeat; } + smp_rmb(); return ret; } -- cgit v1.1 From 449a66fd1fa75d36dca917704827c40c8f416bca Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Thu, 12 May 2011 15:11:12 +0200 Subject: x86: Remove warning and warning_symbol from struct stacktrace_ops Both warning and warning_symbol are nowhere used. Let's get rid of them. Signed-off-by: Richard Weinberger Cc: Oleg Nesterov Cc: Andrew Morton Cc: Huang Ying Cc: Soeren Sandmann Pedersen Cc: Namhyung Kim Cc: x86 Cc: H. Peter Anvin Cc: Thomas Gleixner Cc: Robert Richter Cc: Paul Mundt Link: http://lkml.kernel.org/r/1305205872-10321-2-git-send-email-richard@nod.at Signed-off-by: Frederic Weisbecker --- arch/x86/include/asm/stacktrace.h | 3 --- arch/x86/kernel/cpu/perf_event.c | 13 ------------- arch/x86/kernel/dumpstack.c | 16 ---------------- arch/x86/kernel/stacktrace.c | 13 ------------- arch/x86/oprofile/backtrace.c | 13 ------------- 5 files changed, 58 deletions(-) diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h index d7e89c8..70bbe39 100644 --- a/arch/x86/include/asm/stacktrace.h +++ b/arch/x86/include/asm/stacktrace.h @@ -37,9 +37,6 @@ print_context_stack_bp(struct thread_info *tinfo, /* Generic stack tracer with callbacks */ struct stacktrace_ops { - void (*warning)(void *data, char *msg); - /* msg must contain %s for the symbol */ - void (*warning_symbol)(void *data, char *msg, unsigned long symbol); void (*address)(void *data, unsigned long address, int reliable); /* On negative return stop dumping */ int (*stack)(void *data, char *name); diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 0de6b2b..3a0338b 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1773,17 +1773,6 @@ static struct pmu pmu = { * callchain support */ -static void -backtrace_warning_symbol(void *data, char *msg, unsigned long symbol) -{ - /* Ignore warnings */ -} - -static void backtrace_warning(void *data, char *msg) -{ - /* Ignore warnings */ -} - static int backtrace_stack(void *data, char *name) { return 0; @@ -1797,8 +1786,6 @@ static void backtrace_address(void *data, unsigned long addr, int reliable) } static const struct stacktrace_ops backtrace_ops = { - .warning = backtrace_warning, - .warning_symbol = backtrace_warning_symbol, .stack = backtrace_stack, .address = backtrace_address, .walk_stack = print_context_stack_bp, diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index e2a3f06..f478ff6 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -135,20 +135,6 @@ print_context_stack_bp(struct thread_info *tinfo, } EXPORT_SYMBOL_GPL(print_context_stack_bp); - -static void -print_trace_warning_symbol(void *data, char *msg, unsigned long symbol) -{ - printk(data); - print_symbol(msg, symbol); - printk("\n"); -} - -static void print_trace_warning(void *data, char *msg) -{ - printk("%s%s\n", (char *)data, msg); -} - static int print_trace_stack(void *data, char *name) { printk("%s <%s> ", (char *)data, name); @@ -166,8 +152,6 @@ static void print_trace_address(void *data, unsigned long addr, int reliable) } static const struct stacktrace_ops print_trace_ops = { - .warning = print_trace_warning, - .warning_symbol = print_trace_warning_symbol, .stack = print_trace_stack, .address = print_trace_address, .walk_stack = print_context_stack, diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index 6515733..55d9bc0 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -9,15 +9,6 @@ #include #include -static void save_stack_warning(void *data, char *msg) -{ -} - -static void -save_stack_warning_symbol(void *data, char *msg, unsigned long symbol) -{ -} - static int save_stack_stack(void *data, char *name) { return 0; @@ -53,16 +44,12 @@ save_stack_address_nosched(void *data, unsigned long addr, int reliable) } static const struct stacktrace_ops save_stack_ops = { - .warning = save_stack_warning, - .warning_symbol = save_stack_warning_symbol, .stack = save_stack_stack, .address = save_stack_address, .walk_stack = print_context_stack, }; static const struct stacktrace_ops save_stack_ops_nosched = { - .warning = save_stack_warning, - .warning_symbol = save_stack_warning_symbol, .stack = save_stack_stack, .address = save_stack_address_nosched, .walk_stack = print_context_stack, diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c index 2d49d4e..a5b64ab 100644 --- a/arch/x86/oprofile/backtrace.c +++ b/arch/x86/oprofile/backtrace.c @@ -16,17 +16,6 @@ #include #include -static void backtrace_warning_symbol(void *data, char *msg, - unsigned long symbol) -{ - /* Ignore warnings */ -} - -static void backtrace_warning(void *data, char *msg) -{ - /* Ignore warnings */ -} - static int backtrace_stack(void *data, char *name) { /* Yes, we want all stacks */ @@ -42,8 +31,6 @@ static void backtrace_address(void *data, unsigned long addr, int reliable) } static struct stacktrace_ops backtrace_ops = { - .warning = backtrace_warning, - .warning_symbol = backtrace_warning_symbol, .stack = backtrace_stack, .address = backtrace_address, .walk_stack = print_context_stack, -- cgit v1.1 From 747df2258b1b9a2e25929ef496262c339c380009 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Wed, 11 May 2011 17:41:18 +0100 Subject: sfc: Always map MCDI shared memory as uncacheable We enabled write-combining for memory-mapped registers in commit 65f0b417dee94f779ce9b77102b7d73c93723b39, but inhibited it for the MCDI shared memory where this is not supported. However, write-combining mappings also allow read-reordering, which may also be a problem. I found that when an SFC9000-family controller is connected to an Intel 3000 chipset, and write-combining is enabled, the controller stops responding to PCIe read requests during driver initialisation while the driver is polling for completion of an MCDI command. This results in an NMI and system hang. Adding read memory barriers between all reads to the shared memory area appears to reduce but not eliminate the probability of this. We have not yet established whether this is a bug in our BIU or in the PCIe bridge. For now, work around by mapping the shared memory area separately. Signed-off-by: Ben Hutchings --- drivers/net/sfc/mcdi.c | 49 ++++++++++++++++++++++++++++++------------------- drivers/net/sfc/nic.h | 2 ++ drivers/net/sfc/siena.c | 25 +++++++++++++++++++++---- 3 files changed, 53 insertions(+), 23 deletions(-) diff --git a/drivers/net/sfc/mcdi.c b/drivers/net/sfc/mcdi.c index d984790..3dd45ed 100644 --- a/drivers/net/sfc/mcdi.c +++ b/drivers/net/sfc/mcdi.c @@ -50,6 +50,20 @@ static inline struct efx_mcdi_iface *efx_mcdi(struct efx_nic *efx) return &nic_data->mcdi; } +static inline void +efx_mcdi_readd(struct efx_nic *efx, efx_dword_t *value, unsigned reg) +{ + struct siena_nic_data *nic_data = efx->nic_data; + value->u32[0] = (__force __le32)__raw_readl(nic_data->mcdi_smem + reg); +} + +static inline void +efx_mcdi_writed(struct efx_nic *efx, const efx_dword_t *value, unsigned reg) +{ + struct siena_nic_data *nic_data = efx->nic_data; + __raw_writel((__force u32)value->u32[0], nic_data->mcdi_smem + reg); +} + void efx_mcdi_init(struct efx_nic *efx) { struct efx_mcdi_iface *mcdi; @@ -70,8 +84,8 @@ static void efx_mcdi_copyin(struct efx_nic *efx, unsigned cmd, const u8 *inbuf, size_t inlen) { struct efx_mcdi_iface *mcdi = efx_mcdi(efx); - unsigned pdu = FR_CZ_MC_TREG_SMEM + MCDI_PDU(efx); - unsigned doorbell = FR_CZ_MC_TREG_SMEM + MCDI_DOORBELL(efx); + unsigned pdu = MCDI_PDU(efx); + unsigned doorbell = MCDI_DOORBELL(efx); unsigned int i; efx_dword_t hdr; u32 xflags, seqno; @@ -92,30 +106,28 @@ static void efx_mcdi_copyin(struct efx_nic *efx, unsigned cmd, MCDI_HEADER_SEQ, seqno, MCDI_HEADER_XFLAGS, xflags); - efx_writed(efx, &hdr, pdu); + efx_mcdi_writed(efx, &hdr, pdu); - for (i = 0; i < inlen; i += 4) { - _efx_writed(efx, *((__le32 *)(inbuf + i)), pdu + 4 + i); - /* use wmb() within loop to inhibit write combining */ - wmb(); - } + for (i = 0; i < inlen; i += 4) + efx_mcdi_writed(efx, (const efx_dword_t *)(inbuf + i), + pdu + 4 + i); /* ring the doorbell with a distinctive value */ - _efx_writed(efx, (__force __le32) 0x45789abc, doorbell); - wmb(); + EFX_POPULATE_DWORD_1(hdr, EFX_DWORD_0, 0x45789abc); + efx_mcdi_writed(efx, &hdr, doorbell); } static void efx_mcdi_copyout(struct efx_nic *efx, u8 *outbuf, size_t outlen) { struct efx_mcdi_iface *mcdi = efx_mcdi(efx); - unsigned int pdu = FR_CZ_MC_TREG_SMEM + MCDI_PDU(efx); + unsigned int pdu = MCDI_PDU(efx); int i; BUG_ON(atomic_read(&mcdi->state) == MCDI_STATE_QUIESCENT); BUG_ON(outlen & 3 || outlen >= 0x100); for (i = 0; i < outlen; i += 4) - *((__le32 *)(outbuf + i)) = _efx_readd(efx, pdu + 4 + i); + efx_mcdi_readd(efx, (efx_dword_t *)(outbuf + i), pdu + 4 + i); } static int efx_mcdi_poll(struct efx_nic *efx) @@ -123,7 +135,7 @@ static int efx_mcdi_poll(struct efx_nic *efx) struct efx_mcdi_iface *mcdi = efx_mcdi(efx); unsigned int time, finish; unsigned int respseq, respcmd, error; - unsigned int pdu = FR_CZ_MC_TREG_SMEM + MCDI_PDU(efx); + unsigned int pdu = MCDI_PDU(efx); unsigned int rc, spins; efx_dword_t reg; @@ -149,8 +161,7 @@ static int efx_mcdi_poll(struct efx_nic *efx) time = get_seconds(); - rmb(); - efx_readd(efx, ®, pdu); + efx_mcdi_readd(efx, ®, pdu); /* All 1's indicates that shared memory is in reset (and is * not a valid header). Wait for it to come out reset before @@ -177,7 +188,7 @@ static int efx_mcdi_poll(struct efx_nic *efx) respseq, mcdi->seqno); rc = EIO; } else if (error) { - efx_readd(efx, ®, pdu + 4); + efx_mcdi_readd(efx, ®, pdu + 4); switch (EFX_DWORD_FIELD(reg, EFX_DWORD_0)) { #define TRANSLATE_ERROR(name) \ case MC_CMD_ERR_ ## name: \ @@ -211,21 +222,21 @@ out: /* Test and clear MC-rebooted flag for this port/function */ int efx_mcdi_poll_reboot(struct efx_nic *efx) { - unsigned int addr = FR_CZ_MC_TREG_SMEM + MCDI_REBOOT_FLAG(efx); + unsigned int addr = MCDI_REBOOT_FLAG(efx); efx_dword_t reg; uint32_t value; if (efx_nic_rev(efx) < EFX_REV_SIENA_A0) return false; - efx_readd(efx, ®, addr); + efx_mcdi_readd(efx, ®, addr); value = EFX_DWORD_FIELD(reg, EFX_DWORD_0); if (value == 0) return 0; EFX_ZERO_DWORD(reg); - efx_writed(efx, ®, addr); + efx_mcdi_writed(efx, ®, addr); if (value == MC_STATUS_DWORD_ASSERT) return -EINTR; diff --git a/drivers/net/sfc/nic.h b/drivers/net/sfc/nic.h index a42db6e..d91701a 100644 --- a/drivers/net/sfc/nic.h +++ b/drivers/net/sfc/nic.h @@ -143,10 +143,12 @@ static inline struct falcon_board *falcon_board(struct efx_nic *efx) /** * struct siena_nic_data - Siena NIC state * @mcdi: Management-Controller-to-Driver Interface + * @mcdi_smem: MCDI shared memory mapping. The mapping is always uncacheable. * @wol_filter_id: Wake-on-LAN packet filter id */ struct siena_nic_data { struct efx_mcdi_iface mcdi; + void __iomem *mcdi_smem; int wol_filter_id; }; diff --git a/drivers/net/sfc/siena.c b/drivers/net/sfc/siena.c index e4dd898..837869b 100644 --- a/drivers/net/sfc/siena.c +++ b/drivers/net/sfc/siena.c @@ -220,12 +220,26 @@ static int siena_probe_nic(struct efx_nic *efx) efx_reado(efx, ®, FR_AZ_CS_DEBUG); efx->net_dev->dev_id = EFX_OWORD_FIELD(reg, FRF_CZ_CS_PORT_NUM) - 1; + /* Initialise MCDI */ + nic_data->mcdi_smem = ioremap_nocache(efx->membase_phys + + FR_CZ_MC_TREG_SMEM, + FR_CZ_MC_TREG_SMEM_STEP * + FR_CZ_MC_TREG_SMEM_ROWS); + if (!nic_data->mcdi_smem) { + netif_err(efx, probe, efx->net_dev, + "could not map MCDI at %llx+%x\n", + (unsigned long long)efx->membase_phys + + FR_CZ_MC_TREG_SMEM, + FR_CZ_MC_TREG_SMEM_STEP * FR_CZ_MC_TREG_SMEM_ROWS); + rc = -ENOMEM; + goto fail1; + } efx_mcdi_init(efx); /* Recover from a failed assertion before probing */ rc = efx_mcdi_handle_assertion(efx); if (rc) - goto fail1; + goto fail2; /* Let the BMC know that the driver is now in charge of link and * filter settings. We must do this before we reset the NIC */ @@ -280,6 +294,7 @@ fail4: fail3: efx_mcdi_drv_attach(efx, false, NULL); fail2: + iounmap(nic_data->mcdi_smem); fail1: kfree(efx->nic_data); return rc; @@ -359,6 +374,8 @@ static int siena_init_nic(struct efx_nic *efx) static void siena_remove_nic(struct efx_nic *efx) { + struct siena_nic_data *nic_data = efx->nic_data; + efx_nic_free_buffer(efx, &efx->irq_status); siena_reset_hw(efx, RESET_TYPE_ALL); @@ -368,7 +385,8 @@ static void siena_remove_nic(struct efx_nic *efx) efx_mcdi_drv_attach(efx, false, NULL); /* Tear down the private nic state */ - kfree(efx->nic_data); + iounmap(nic_data->mcdi_smem); + kfree(nic_data); efx->nic_data = NULL; } @@ -606,8 +624,7 @@ struct efx_nic_type siena_a0_nic_type = { .default_mac_ops = &efx_mcdi_mac_operations, .revision = EFX_REV_SIENA_A0, - .mem_map_size = (FR_CZ_MC_TREG_SMEM + - FR_CZ_MC_TREG_SMEM_STEP * FR_CZ_MC_TREG_SMEM_ROWS), + .mem_map_size = FR_CZ_MC_TREG_SMEM, /* MC_TREG_SMEM mapped separately */ .txd_ptr_tbl_base = FR_BZ_TX_DESC_PTR_TBL, .rxd_ptr_tbl_base = FR_BZ_RX_DESC_PTR_TBL, .buf_tbl_base = FR_BZ_BUF_FULL_TBL, -- cgit v1.1 From 698b368275c3fa98261159253cfc79653f9dffc6 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 11 May 2011 14:49:36 -0700 Subject: fbcon: add lifetime refcount to opened frame buffers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This just adds the refcount and the new registration lock logic. It does not (for example) actually change the read/write/ioctl routines to actually use the frame buffer that was opened: those function still end up alway susing whatever the current frame buffer is at the time of the call. Without this, if something holds the frame buffer open over a framebuffer switch, the close() operation after the switch will access a fb_info that has been free'd by the unregistering of the old frame buffer. (The read/write/ioctl operations will normally not cause problems, because they will - illogically - pick up the new fbcon instead. But a switch that happens just as one of those is going on might see problems too, the window is just much smaller: one individual op rather than the whole open-close sequence.) This use-after-free is apparently fairly easily triggered by the Ubuntu 11.04 boot sequence. Acked-by: Tim Gardner Tested-by: Daniel J Blueman Tested-by: Anca Emanuel Cc: Bruno Prémont Cc: Alan Cox Cc: Paul Mundt Cc: Dave Airlie Cc: Andy Whitcroft Signed-off-by: Linus Torvalds --- drivers/video/fbmem.c | 56 ++++++++++++++++++++++++++++++++++++++++++--------- include/linux/fb.h | 1 + 2 files changed, 47 insertions(+), 10 deletions(-) diff --git a/drivers/video/fbmem.c b/drivers/video/fbmem.c index e0c22849..eec14d2 100644 --- a/drivers/video/fbmem.c +++ b/drivers/video/fbmem.c @@ -42,9 +42,34 @@ #define FBPIXMAPSIZE (1024 * 8) +static DEFINE_MUTEX(registration_lock); struct fb_info *registered_fb[FB_MAX] __read_mostly; int num_registered_fb __read_mostly; +static struct fb_info *get_fb_info(unsigned int idx) +{ + struct fb_info *fb_info; + + if (idx >= FB_MAX) + return ERR_PTR(-ENODEV); + + mutex_lock(®istration_lock); + fb_info = registered_fb[idx]; + if (fb_info) + atomic_inc(&fb_info->count); + mutex_unlock(®istration_lock); + + return fb_info; +} + +static void put_fb_info(struct fb_info *fb_info) +{ + if (!atomic_dec_and_test(&fb_info->count)) + return; + if (fb_info->fbops->fb_destroy) + fb_info->fbops->fb_destroy(fb_info); +} + int lock_fb_info(struct fb_info *info) { mutex_lock(&info->lock); @@ -647,6 +672,7 @@ int fb_show_logo(struct fb_info *info, int rotate) { return 0; } static void *fb_seq_start(struct seq_file *m, loff_t *pos) { + mutex_lock(®istration_lock); return (*pos < FB_MAX) ? pos : NULL; } @@ -658,6 +684,7 @@ static void *fb_seq_next(struct seq_file *m, void *v, loff_t *pos) static void fb_seq_stop(struct seq_file *m, void *v) { + mutex_unlock(®istration_lock); } static int fb_seq_show(struct seq_file *m, void *v) @@ -1361,14 +1388,16 @@ __releases(&info->lock) struct fb_info *info; int res = 0; - if (fbidx >= FB_MAX) - return -ENODEV; - info = registered_fb[fbidx]; - if (!info) + info = get_fb_info(fbidx); + if (!info) { request_module("fb%d", fbidx); - info = registered_fb[fbidx]; - if (!info) - return -ENODEV; + info = get_fb_info(fbidx); + if (!info) + return -ENODEV; + } + if (IS_ERR(info)) + return PTR_ERR(info); + mutex_lock(&info->lock); if (!try_module_get(info->fbops->owner)) { res = -ENODEV; @@ -1386,6 +1415,8 @@ __releases(&info->lock) #endif out: mutex_unlock(&info->lock); + if (res) + put_fb_info(info); return res; } @@ -1401,6 +1432,7 @@ __releases(&info->lock) info->fbops->fb_release(info,1); module_put(info->fbops->owner); mutex_unlock(&info->lock); + put_fb_info(info); return 0; } @@ -1542,11 +1574,13 @@ register_framebuffer(struct fb_info *fb_info) remove_conflicting_framebuffers(fb_info->apertures, fb_info->fix.id, fb_is_primary_device(fb_info)); + mutex_lock(®istration_lock); num_registered_fb++; for (i = 0 ; i < FB_MAX; i++) if (!registered_fb[i]) break; fb_info->node = i; + atomic_set(&fb_info->count, 1); mutex_init(&fb_info->lock); mutex_init(&fb_info->mm_lock); @@ -1583,6 +1617,7 @@ register_framebuffer(struct fb_info *fb_info) fb_var_to_videomode(&mode, &fb_info->var); fb_add_videomode(&mode, &fb_info->modelist); registered_fb[i] = fb_info; + mutex_unlock(®istration_lock); event.info = fb_info; if (!lock_fb_info(fb_info)) @@ -1616,6 +1651,7 @@ unregister_framebuffer(struct fb_info *fb_info) struct fb_event event; int i, ret = 0; + mutex_lock(®istration_lock); i = fb_info->node; if (!registered_fb[i]) { ret = -EINVAL; @@ -1638,7 +1674,7 @@ unregister_framebuffer(struct fb_info *fb_info) (fb_info->pixmap.flags & FB_PIXMAP_DEFAULT)) kfree(fb_info->pixmap.addr); fb_destroy_modelist(&fb_info->modelist); - registered_fb[i]=NULL; + registered_fb[i] = NULL; num_registered_fb--; fb_cleanup_device(fb_info); device_destroy(fb_class, MKDEV(FB_MAJOR, i)); @@ -1646,9 +1682,9 @@ unregister_framebuffer(struct fb_info *fb_info) fb_notifier_call_chain(FB_EVENT_FB_UNREGISTERED, &event); /* this may free fb info */ - if (fb_info->fbops->fb_destroy) - fb_info->fbops->fb_destroy(fb_info); + put_fb_info(fb_info); done: + mutex_unlock(®istration_lock); return ret; } diff --git a/include/linux/fb.h b/include/linux/fb.h index df728c1..6a82748 100644 --- a/include/linux/fb.h +++ b/include/linux/fb.h @@ -832,6 +832,7 @@ struct fb_tile_ops { #define FBINFO_CAN_FORCE_OUTPUT 0x200000 struct fb_info { + atomic_t count; int node; int flags; struct mutex lock; /* Lock for open/release/ioctl funcs */ -- cgit v1.1 From c47747fde931c02455683bd00ea43eaa62f35b0e Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 11 May 2011 14:58:34 -0700 Subject: fbmem: make read/write/ioctl use the frame buffer at open time MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit read/write/ioctl on a fbcon file descriptor has traditionally used the fbcon not when it was opened, but as it was at the time of the call. That makes no sense, but the lack of sense is much more obvious now that we properly ref-count the usage - it means that the ref-counting doesn't actually protect operations we do on the frame buffer. This changes it to look at the fb_info that we got at open time, but in order to avoid using a frame buffer long after it has been unregistered, we do verify that it is still current, and return -ENODEV if not. Acked-by: Tim Gardner Tested-by: Daniel J Blueman Tested-by: Anca Emanuel Cc: Bruno Prémont Cc: Alan Cox Cc: Paul Mundt Cc: Dave Airlie Cc: Andy Whitcroft Signed-off-by: Linus Torvalds --- drivers/video/fbmem.c | 50 ++++++++++++++++++++++++++++++++++---------------- 1 file changed, 34 insertions(+), 16 deletions(-) diff --git a/drivers/video/fbmem.c b/drivers/video/fbmem.c index eec14d2..ea16e65 100644 --- a/drivers/video/fbmem.c +++ b/drivers/video/fbmem.c @@ -717,13 +717,30 @@ static const struct file_operations fb_proc_fops = { .release = seq_release, }; -static ssize_t -fb_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) +/* + * We hold a reference to the fb_info in file->private_data, + * but if the current registered fb has changed, we don't + * actually want to use it. + * + * So look up the fb_info using the inode minor number, + * and just verify it against the reference we have. + */ +static struct fb_info *file_fb_info(struct file *file) { - unsigned long p = *ppos; struct inode *inode = file->f_path.dentry->d_inode; int fbidx = iminor(inode); struct fb_info *info = registered_fb[fbidx]; + + if (info != file->private_data) + info = NULL; + return info; +} + +static ssize_t +fb_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) +{ + unsigned long p = *ppos; + struct fb_info *info = file_fb_info(file); u8 *buffer, *dst; u8 __iomem *src; int c, cnt = 0, err = 0; @@ -788,9 +805,7 @@ static ssize_t fb_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { unsigned long p = *ppos; - struct inode *inode = file->f_path.dentry->d_inode; - int fbidx = iminor(inode); - struct fb_info *info = registered_fb[fbidx]; + struct fb_info *info = file_fb_info(file); u8 *buffer, *src; u8 __iomem *dst; int c, cnt = 0, err = 0; @@ -1168,10 +1183,10 @@ static long do_fb_ioctl(struct fb_info *info, unsigned int cmd, static long fb_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { - struct inode *inode = file->f_path.dentry->d_inode; - int fbidx = iminor(inode); - struct fb_info *info = registered_fb[fbidx]; + struct fb_info *info = file_fb_info(file); + if (!info) + return -ENODEV; return do_fb_ioctl(info, cmd, arg); } @@ -1292,12 +1307,13 @@ static int fb_get_fscreeninfo(struct fb_info *info, unsigned int cmd, static long fb_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { - struct inode *inode = file->f_path.dentry->d_inode; - int fbidx = iminor(inode); - struct fb_info *info = registered_fb[fbidx]; - struct fb_ops *fb = info->fbops; + struct fb_info *info = file_fb_info(file); + struct fb_ops *fb; long ret = -ENOIOCTLCMD; + if (!info) + return -ENODEV; + fb = info->fbops; switch(cmd) { case FBIOGET_VSCREENINFO: case FBIOPUT_VSCREENINFO: @@ -1330,16 +1346,18 @@ static long fb_compat_ioctl(struct file *file, unsigned int cmd, static int fb_mmap(struct file *file, struct vm_area_struct * vma) { - int fbidx = iminor(file->f_path.dentry->d_inode); - struct fb_info *info = registered_fb[fbidx]; - struct fb_ops *fb = info->fbops; + struct fb_info *info = file_fb_info(file); + struct fb_ops *fb; unsigned long off; unsigned long start; u32 len; + if (!info) + return -ENODEV; if (vma->vm_pgoff > (~0UL >> PAGE_SHIFT)) return -EINVAL; off = vma->vm_pgoff << PAGE_SHIFT; + fb = info->fbops; if (!fb) return -ENODEV; mutex_lock(&info->mm_lock); -- cgit v1.1 From 1c65335714e99864a438b0d757c8736d3c6e7d79 Mon Sep 17 00:00:00 2001 From: Sergei Shtylyov Date: Mon, 9 May 2011 22:07:31 -0700 Subject: IB/qib: Use pci_dev->revision The driver reads PCI revision ID from the PCI configuration register while it's already stored by PCI subsystem in the revision field of struct pci_dev. Signed-off-by: Sergei Shtylyov Acked-by: Mike Marciniszyn Signed-off-by: Roland Dreier --- drivers/infiniband/hw/qib/qib_pcie.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/infiniband/hw/qib/qib_pcie.c b/drivers/infiniband/hw/qib/qib_pcie.c index 48b6674..891cc2f 100644 --- a/drivers/infiniband/hw/qib/qib_pcie.c +++ b/drivers/infiniband/hw/qib/qib_pcie.c @@ -526,11 +526,8 @@ static int qib_tune_pcie_coalesce(struct qib_devdata *dd) */ devid = parent->device; if (devid >= 0x25e2 && devid <= 0x25fa) { - u8 rev; - /* 5000 P/V/X/Z */ - pci_read_config_byte(parent, PCI_REVISION_ID, &rev); - if (rev <= 0xb2) + if (parent->revision <= 0xb2) bits = 1U << 10; else bits = 7U << 10; -- cgit v1.1 From 92bdaef7b2c5d3cb8abc902faa1f7670a183dcdc Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 5 May 2011 13:50:43 -0400 Subject: Revert "xen/mmu: Add workaround "x86-64, mm: Put early page table high"" This reverts commit a38647837a411f7df79623128421eef2118b5884. It does not work with certain AMD machines. last_pfn = 0x100000 max_arch_pfn = 0x400000000 initial memory mapped : 0 - 02c3a000 Base memory trampoline at [ffff88000009b000] 9b000 size 20480 init_memory_mapping: 0000000000000000-0000000100000000 0000000000 - 0100000000 page 4k kernel direct mapping tables up to 100000000 @ ff7fb000-100000000 init_memory_mapping: 0000000100000000-00000001e0800000 0100000000 - 01e0800000 page 4k kernel direct mapping tables up to 1e0800000 @ 1df0f3000-1e0000000 xen: setting RW the range fffdc000 - 100000000 RAMDISK: 0203b000 - 02c3a000 No NUMA configuration found Faking a node at 0000000000000000-00000001e0800000 NUMA: Using 63 for the hash shift. Initmem setup node 0 0000000000000000-00000001e0800000 NODE_DATA [00000001dfffb000 - 00000001dfffffff] BUG: unable to handle kernel NULL pointer dereference at (null) IP: [] setup_node_bootmem+0x18a/0x1ea PGD 0 Oops: 0003 [#1] SMP last sysfs file: CPU 0 Modules linked in: Pid: 0, comm: swapper Not tainted 2.6.39-0-virtual #6~smb1 RIP: e030:[] [] setup_node_bootmem+0x18a/0x1ea RSP: e02b:ffffffff81c01e38 EFLAGS: 00010046 RAX: 0000000000000000 RBX: 00000001e0800000 RCX: 0000000000001040 RDX: 0000000000004100 RSI: 0000000000000000 RDI: ffff8801dfffb000 RBP: ffffffff81c01e58 R08: 0000000000000020 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000001 R12: 0000000000000000 R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000bfe400 FS: 0000000000000000(0000) GS:ffffffff81cca000(0000) knlGS:0000000000000000 CS: e033 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 0000000001c03000 CR4: 0000000000000660 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process swapper (pid: 0, threadinfo ffffffff81c00000, task ffffffff81c0b020) Stack: 0000000000000040 0000000000000001 0000000000000000 ffffffffffffffff ffffffff81c01e88 ffffffff81cf6c25 0000000000000000 0000000000000000 ffffffff81cf687f 0000000000000000 ffffffff81c01ea8 ffffffff81cf6e45 Call Trace: [] numa_register_memblks.constprop.3+0x150/0x181 [] ? numa_add_memblk+0x7c/0x7c [] numa_init.part.2+0x1c/0x7c [] ? numa_add_memblk+0x7c/0x7c [] numa_init+0x6c/0x70 [] initmem_init+0x39/0x3b [] setup_arch+0x64e/0x769 [] ? printk+0x51/0x53 [] start_kernel+0xd4/0x3f3 [] x86_64_start_reservations+0x132/0x136 [] xen_start_kernel+0x588/0x58f Code: 41 00 00 48 8b 3c c5 a0 24 cc 81 31 c0 40 f6 c7 01 74 05 aa 66 ba ff 40 40 f6 c7 02 74 05 66 ab 83 ea 02 89 d1 c1 e9 02 f6 c2 02 ab 74 02 66 ab 80 e2 01 74 01 aa 49 63 c4 48 c1 eb 0c 44 89 RIP [] setup_node_bootmem+0x18a/0x1ea RSP CR2: 0000000000000000 ---[ end trace a7919e7f17c0a725 ]--- Kernel panic - not syncing: Attempted to kill the idle task! Pid: 0, comm: swapper Tainted: G D 2.6.39-0-virtual #6~smb1 Reported-by: Stefan Bader Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/mmu.c | 123 ----------------------------------------------------- 1 file changed, 123 deletions(-) diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 55c965b..cf4ef61 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1463,119 +1463,6 @@ static int xen_pgd_alloc(struct mm_struct *mm) return ret; } -#ifdef CONFIG_X86_64 -static __initdata u64 __last_pgt_set_rw = 0; -static __initdata u64 __pgt_buf_start = 0; -static __initdata u64 __pgt_buf_end = 0; -static __initdata u64 __pgt_buf_top = 0; -/* - * As a consequence of the commit: - * - * commit 4b239f458c229de044d6905c2b0f9fe16ed9e01e - * Author: Yinghai Lu - * Date: Fri Dec 17 16:58:28 2010 -0800 - * - * x86-64, mm: Put early page table high - * - * at some point init_memory_mapping is going to reach the pagetable pages - * area and map those pages too (mapping them as normal memory that falls - * in the range of addresses passed to init_memory_mapping as argument). - * Some of those pages are already pagetable pages (they are in the range - * pgt_buf_start-pgt_buf_end) therefore they are going to be mapped RO and - * everything is fine. - * Some of these pages are not pagetable pages yet (they fall in the range - * pgt_buf_end-pgt_buf_top; for example the page at pgt_buf_end) so they - * are going to be mapped RW. When these pages become pagetable pages and - * are hooked into the pagetable, xen will find that the guest has already - * a RW mapping of them somewhere and fail the operation. - * The reason Xen requires pagetables to be RO is that the hypervisor needs - * to verify that the pagetables are valid before using them. The validation - * operations are called "pinning". - * - * In order to fix the issue we mark all the pages in the entire range - * pgt_buf_start-pgt_buf_top as RO, however when the pagetable allocation - * is completed only the range pgt_buf_start-pgt_buf_end is reserved by - * init_memory_mapping. Hence the kernel is going to crash as soon as one - * of the pages in the range pgt_buf_end-pgt_buf_top is reused (b/c those - * ranges are RO). - * - * For this reason, 'mark_rw_past_pgt' is introduced which is called _after_ - * the init_memory_mapping has completed (in a perfect world we would - * call this function from init_memory_mapping, but lets ignore that). - * - * Because we are called _after_ init_memory_mapping the pgt_buf_[start, - * end,top] have all changed to new values (b/c init_memory_mapping - * is called and setting up another new page-table). Hence, the first time - * we enter this function, we save away the pgt_buf_start value and update - * the pgt_buf_[end,top]. - * - * When we detect that the "old" pgt_buf_start through pgt_buf_end - * PFNs have been reserved (so memblock_x86_reserve_range has been called), - * we immediately set out to RW the "old" pgt_buf_end through pgt_buf_top. - * - * And then we update those "old" pgt_buf_[end|top] with the new ones - * so that we can redo this on the next pagetable. - */ -static __init void mark_rw_past_pgt(void) { - - if (pgt_buf_end > pgt_buf_start) { - u64 addr, size; - - /* Save it away. */ - if (!__pgt_buf_start) { - __pgt_buf_start = pgt_buf_start; - __pgt_buf_end = pgt_buf_end; - __pgt_buf_top = pgt_buf_top; - return; - } - /* If we get the range that starts at __pgt_buf_end that means - * the range is reserved, and that in 'init_memory_mapping' - * the 'memblock_x86_reserve_range' has been called with the - * outdated __pgt_buf_start, __pgt_buf_end (the "new" - * pgt_buf_[start|end|top] refer now to a new pagetable. - * Note: we are called _after_ the pgt_buf_[..] have been - * updated.*/ - - addr = memblock_x86_find_in_range_size(PFN_PHYS(__pgt_buf_start), - &size, PAGE_SIZE); - - /* Still not reserved, meaning 'memblock_x86_reserve_range' - * hasn't been called yet. Update the _end and _top.*/ - if (addr == PFN_PHYS(__pgt_buf_start)) { - __pgt_buf_end = pgt_buf_end; - __pgt_buf_top = pgt_buf_top; - return; - } - - /* OK, the area is reserved, meaning it is time for us to - * set RW for the old end->top PFNs. */ - - /* ..unless we had already done this. */ - if (__pgt_buf_end == __last_pgt_set_rw) - return; - - addr = PFN_PHYS(__pgt_buf_end); - - /* set as RW the rest */ - printk(KERN_DEBUG "xen: setting RW the range %llx - %llx\n", - PFN_PHYS(__pgt_buf_end), PFN_PHYS(__pgt_buf_top)); - - while (addr < PFN_PHYS(__pgt_buf_top)) { - make_lowmem_page_readwrite(__va(addr)); - addr += PAGE_SIZE; - } - /* And update everything so that we are ready for the next - * pagetable (the one created for regions past 4GB) */ - __last_pgt_set_rw = __pgt_buf_end; - __pgt_buf_start = pgt_buf_start; - __pgt_buf_end = pgt_buf_end; - __pgt_buf_top = pgt_buf_top; - } - return; -} -#else -static __init void mark_rw_past_pgt(void) { } -#endif static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd) { #ifdef CONFIG_X86_64 @@ -1602,14 +1489,6 @@ static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte) unsigned long pfn = pte_pfn(pte); /* - * A bit of optimization. We do not need to call the workaround - * when xen_set_pte_init is called with a PTE with 0 as PFN. - * That is b/c the pagetable at that point are just being populated - * with empty values and we can save some cycles by not calling - * the 'memblock' code.*/ - if (pfn) - mark_rw_past_pgt(); - /* * If the new pfn is within the range of the newly allocated * kernel pagetable, and it isn't being mapped into an * early_ioremap fixmap slot as a freshly allocated page, make sure @@ -2118,8 +1997,6 @@ __init void xen_ident_map_ISA(void) static __init void xen_post_allocator_init(void) { - mark_rw_past_pgt(); - #ifdef CONFIG_XEN_DEBUG pv_mmu_ops.make_pte = PV_CALLEE_SAVE(xen_make_pte_debug); #endif -- cgit v1.1 From 279b706bf800b5967037f492dbe4fc5081ad5d0f Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Thu, 14 Apr 2011 15:49:41 +0100 Subject: x86,xen: introduce x86_init.mapping.pagetable_reserve Introduce a new x86_init hook called pagetable_reserve that at the end of init_memory_mapping is used to reserve a range of memory addresses for the kernel pagetable pages we used and free the other ones. On native it just calls memblock_x86_reserve_range while on xen it also takes care of setting the spare memory previously allocated for kernel pagetable pages from RO to RW, so that it can be used for other purposes. A detailed explanation of the reason why this hook is needed follows. As a consequence of the commit: commit 4b239f458c229de044d6905c2b0f9fe16ed9e01e Author: Yinghai Lu Date: Fri Dec 17 16:58:28 2010 -0800 x86-64, mm: Put early page table high at some point init_memory_mapping is going to reach the pagetable pages area and map those pages too (mapping them as normal memory that falls in the range of addresses passed to init_memory_mapping as argument). Some of those pages are already pagetable pages (they are in the range pgt_buf_start-pgt_buf_end) therefore they are going to be mapped RO and everything is fine. Some of these pages are not pagetable pages yet (they fall in the range pgt_buf_end-pgt_buf_top; for example the page at pgt_buf_end) so they are going to be mapped RW. When these pages become pagetable pages and are hooked into the pagetable, xen will find that the guest has already a RW mapping of them somewhere and fail the operation. The reason Xen requires pagetables to be RO is that the hypervisor needs to verify that the pagetables are valid before using them. The validation operations are called "pinning" (more details in arch/x86/xen/mmu.c). In order to fix the issue we mark all the pages in the entire range pgt_buf_start-pgt_buf_top as RO, however when the pagetable allocation is completed only the range pgt_buf_start-pgt_buf_end is reserved by init_memory_mapping. Hence the kernel is going to crash as soon as one of the pages in the range pgt_buf_end-pgt_buf_top is reused (b/c those ranges are RO). For this reason we need a hook to reserve the kernel pagetable pages we used and free the other ones so that they can be reused for other purposes. On native it just means calling memblock_x86_reserve_range, on Xen it also means marking RW the pagetable pages that we allocated before but that haven't been used before. Another way to fix this is without using the hook is by adding a 'if (xen_pv_domain)' in the 'init_memory_mapping' code and calling the Xen counterpart, but that is just nasty. Signed-off-by: Stefano Stabellini Acked-by: Yinghai Lu Acked-by: H. Peter Anvin Cc: Ingo Molnar Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/include/asm/pgtable_types.h | 1 + arch/x86/include/asm/x86_init.h | 12 ++++++++++++ arch/x86/kernel/x86_init.c | 4 ++++ arch/x86/mm/init.c | 24 ++++++++++++++++++++++-- arch/x86/xen/mmu.c | 15 +++++++++++++++ 5 files changed, 54 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 7db7723..d56187c 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -299,6 +299,7 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, /* Install a pte for a particular vaddr in kernel space. */ void set_pte_vaddr(unsigned long vaddr, pte_t pte); +extern void native_pagetable_reserve(u64 start, u64 end); #ifdef CONFIG_X86_32 extern void native_pagetable_setup_start(pgd_t *base); extern void native_pagetable_setup_done(pgd_t *base); diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 643ebf2..d3d8590 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -68,6 +68,17 @@ struct x86_init_oem { }; /** + * struct x86_init_mapping - platform specific initial kernel pagetable setup + * @pagetable_reserve: reserve a range of addresses for kernel pagetable usage + * + * For more details on the purpose of this hook, look in + * init_memory_mapping and the commit that added it. + */ +struct x86_init_mapping { + void (*pagetable_reserve)(u64 start, u64 end); +}; + +/** * struct x86_init_paging - platform specific paging functions * @pagetable_setup_start: platform specific pre paging_init() call * @pagetable_setup_done: platform specific post paging_init() call @@ -123,6 +134,7 @@ struct x86_init_ops { struct x86_init_mpparse mpparse; struct x86_init_irqs irqs; struct x86_init_oem oem; + struct x86_init_mapping mapping; struct x86_init_paging paging; struct x86_init_timers timers; struct x86_init_iommu iommu; diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index c11514e9..75ef4b1 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -61,6 +61,10 @@ struct x86_init_ops x86_init __initdata = { .banner = default_banner, }, + .mapping = { + .pagetable_reserve = native_pagetable_reserve, + }, + .paging = { .pagetable_setup_start = native_pagetable_setup_start, .pagetable_setup_done = native_pagetable_setup_done, diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 286d289..722a4c3 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -81,6 +81,11 @@ static void __init find_early_table_space(unsigned long end, int use_pse, end, pgt_buf_start << PAGE_SHIFT, pgt_buf_top << PAGE_SHIFT); } +void native_pagetable_reserve(u64 start, u64 end) +{ + memblock_x86_reserve_range(start, end, "PGTABLE"); +} + struct map_range { unsigned long start; unsigned long end; @@ -272,9 +277,24 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, __flush_tlb_all(); + /* + * Reserve the kernel pagetable pages we used (pgt_buf_start - + * pgt_buf_end) and free the other ones (pgt_buf_end - pgt_buf_top) + * so that they can be reused for other purposes. + * + * On native it just means calling memblock_x86_reserve_range, on Xen it + * also means marking RW the pagetable pages that we allocated before + * but that haven't been used. + * + * In fact on xen we mark RO the whole range pgt_buf_start - + * pgt_buf_top, because we have to make sure that when + * init_memory_mapping reaches the pagetable pages area, it maps + * RO all the pagetable pages, including the ones that are beyond + * pgt_buf_end at that time. + */ if (!after_bootmem && pgt_buf_end > pgt_buf_start) - memblock_x86_reserve_range(pgt_buf_start << PAGE_SHIFT, - pgt_buf_end << PAGE_SHIFT, "PGTABLE"); + x86_init.mapping.pagetable_reserve(PFN_PHYS(pgt_buf_start), + PFN_PHYS(pgt_buf_end)); if (!after_bootmem) early_memtest(start, end); diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index cf4ef61..0684f3c 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1275,6 +1275,20 @@ static __init void xen_pagetable_setup_start(pgd_t *base) { } +static __init void xen_mapping_pagetable_reserve(u64 start, u64 end) +{ + /* reserve the range used */ + native_pagetable_reserve(start, end); + + /* set as RW the rest */ + printk(KERN_DEBUG "xen: setting RW the range %llx - %llx\n", end, + PFN_PHYS(pgt_buf_top)); + while (end < PFN_PHYS(pgt_buf_top)) { + make_lowmem_page_readwrite(__va(end)); + end += PAGE_SIZE; + } +} + static void xen_post_allocator_init(void); static __init void xen_pagetable_setup_done(pgd_t *base) @@ -2105,6 +2119,7 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = { void __init xen_init_mmu_ops(void) { + x86_init.mapping.pagetable_reserve = xen_mapping_pagetable_reserve; x86_init.paging.pagetable_setup_start = xen_pagetable_setup_start; x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done; pv_mmu_ops = xen_mmu_ops; -- cgit v1.1 From 53f8023febf9b3e18d8fb0d99c55010e473ce53d Mon Sep 17 00:00:00 2001 From: Sedat Dilek Date: Sun, 17 Apr 2011 16:17:34 +0200 Subject: x86/mm: Fix section mismatch derived from native_pagetable_reserve() With CONFIG_DEBUG_SECTION_MISMATCH=y I see these warnings in next-20110415: LD vmlinux.o MODPOST vmlinux.o WARNING: vmlinux.o(.text+0x1ba48): Section mismatch in reference from the function native_pagetable_reserve() to the function .init.text:memblock_x86_reserve_range() The function native_pagetable_reserve() references the function __init memblock_x86_reserve_range(). This is often because native_pagetable_reserve lacks a __init annotation or the annotation of memblock_x86_reserve_range is wrong. This patch fixes the issue. Thanks to pipacs from PaX project for help on IRC. Acked-by: "H. Peter Anvin" Signed-off-by: Sedat Dilek Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/mm/init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 722a4c3..37b8b0f 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -81,7 +81,7 @@ static void __init find_early_table_space(unsigned long end, int use_pse, end, pgt_buf_start << PAGE_SHIFT, pgt_buf_top << PAGE_SHIFT); } -void native_pagetable_reserve(u64 start, u64 end) +void __init native_pagetable_reserve(u64 start, u64 end) { memblock_x86_reserve_range(start, end, "PGTABLE"); } -- cgit v1.1 From 7899891c7d161752f29abcc9bc0a9c6c3a3af26c Mon Sep 17 00:00:00 2001 From: "Tian, Kevin" Date: Thu, 12 May 2011 10:56:08 +0800 Subject: xen mmu: fix a race window causing leave_mm BUG() There's a race window in xen_drop_mm_ref, where remote cpu may exit dirty bitmap between the check on this cpu and the point where remote cpu handles drop request. So in drop_other_mm_ref we need check whether TLB state is still lazy before calling into leave_mm. This bug is rarely observed in earlier kernel, but exaggerated by the commit 831d52bc153971b70e64eccfbed2b232394f22f8 ("x86, mm: avoid possible bogus tlb entries by clearing prev mm_cpumask after switching mm") which clears bitmap after changing the TLB state. the call trace is as below: --------------------------------- kernel BUG at arch/x86/mm/tlb.c:61! invalid opcode: 0000 [#1] SMP last sysfs file: /sys/devices/system/xen_memory/xen_memory0/info/current_kb CPU 1 Modules linked in: 8021q garp xen_netback xen_blkback blktap blkback_pagemap nbd bridge stp llc autofs4 ipmi_devintf ipmi_si ipmi_msghandler lockd sunrpc bonding ipv6 xenfs dm_multipath video output sbs sbshc parport_pc lp parport ses enclosure snd_seq_dummy snd_seq_oss snd_seq_midi_event snd_seq snd_seq_device serio_raw bnx2 snd_pcm_oss snd_mixer_oss snd_pcm snd_timer iTCO_wdt snd soundcore snd_page_alloc i2c_i801 iTCO_vendor_support i2c_core pcs pkr pata_acpi ata_generic ata_piix shpchp mptsas mptscsih mptbase [last unloaded: freq_table] Pid: 25581, comm: khelper Not tainted 2.6.32.36fixxen #1 Tecal RH2285 RIP: e030:[] [] leave_mm+0x15/0x46 RSP: e02b:ffff88002805be48 EFLAGS: 00010046 RAX: 0000000000000000 RBX: 0000000000000001 RCX: ffff88015f8e2da0 RDX: ffff88002805be78 RSI: 0000000000000000 RDI: 0000000000000001 RBP: ffff88002805be48 R08: ffff88009d662000 R09: dead000000200200 R10: dead000000100100 R11: ffffffff814472b2 R12: ffff88009bfc1880 R13: ffff880028063020 R14: 00000000000004f6 R15: 0000000000000000 FS: 00007f62362d66e0(0000) GS:ffff880028058000(0000) knlGS:0000000000000000 CS: e033 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 0000003aabc11909 CR3: 000000009b8ca000 CR4: 0000000000002660 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 00000000000000 00 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process khelper (pid: 25581, threadinfo ffff88007691e000, task ffff88009b92db40) Stack: ffff88002805be68 ffffffff8100e4ae 0000000000000001 ffff88009d733b88 <0> ffff88002805be98 ffffffff81087224 ffff88002805be78 ffff88002805be78 <0> ffff88015f808360 00000000000004f6 ffff88002805bea8 ffffffff81010108 Call Trace: [] drop_other_mm_ref+0x2a/0x53 [] generic_smp_call_function_single_interrupt+0xd8/0xfc [] xen_call_function_single_interrupt+0x13/0x28 [] handle_IRQ_event+0x66/0x120 [] handle_percpu_irq+0x41/0x6e [] __xen_evtchn_do_upcall+0x1ab/0x27d [] xen_evtchn_do_upcall+0x33/0x46 [] xen_do_hyper visor_callback+0x1e/0x30 [] ? _spin_unlock_irqrestore+0x15/0x17 [] ? xen_restore_fl_direct_end+0x0/0x1 [] ? flush_old_exec+0x3ac/0x500 [] ? load_elf_binary+0x0/0x17ef [] ? load_elf_binary+0x0/0x17ef [] ? load_elf_binary+0x398/0x17ef [] ? need_resched+0x23/0x2d [] ? process_measurement+0xc0/0xd7 [] ? load_elf_binary+0x0/0x17ef [] ? search_binary_handler+0xc8/0x255 [] ? do_execve+0x1c3/0x29e [] ? sys_execve+0x43/0x5d [] ? __call_usermodehelper+0x0/0x6f [] ? kernel_execve+0x68/0xd0 [] ? __call_usermodehelper+0x0/0x6f [] ? xen_restore_fl_direct_end+0x0/0x1 [] ? ____call_usermodehelper+0x113/0x11e [] ? child_rip+0xa/0x20 [] ? __call_usermodehelper+0x0/0x6f [] ? int_ret_from_sys_call+0x7/0x1b [] ? retint_restore_args+0x5/0x6 [] ? child_rip+0x0/0x20 Code: 41 5e 41 5f c9 c3 55 48 89 e5 0f 1f 44 00 00 e8 17 ff ff ff c9 c3 55 48 89 e5 0f 1f 44 00 00 65 8b 04 25 c8 55 01 00 ff c8 75 04 <0f> 0b eb fe 65 48 8b 34 25 c0 55 01 00 48 81 c6 b8 02 00 00 e8 RIP [] leave_mm+0x15/0x46 RSP ---[ end trace ce9cee6832a9c503 ]--- Tested-by: Maoxiaoyun Signed-off-by: Kevin Tian [v1: Fleshed out the git description a bit] Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 5e92b61..4fd7387 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1140,7 +1140,7 @@ static void drop_other_mm_ref(void *info) active_mm = percpu_read(cpu_tlbstate.active_mm); - if (active_mm == mm) + if (active_mm == mm && percpu_read(cpu_tlbstate.state) != TLBSTATE_OK) leave_mm(smp_processor_id()); /* If this cpu still has a stale cr3 reference, then make sure -- cgit v1.1 From 15bfc094517db2ddf38ca7ed47f3a1c0ad24f7c4 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Tue, 12 Apr 2011 07:57:15 -0400 Subject: xen/setup: Ignore E820_UNUSABLE when setting 1-1 mappings. When we parse the raw E820, the Xen hypervisor can set "E820_RAM" to "E820_UNUSABLE" if the mem=X argument is used. As such we should _not_ consider the E820_UNUSABLE as an 1-1 identity mapping, but instead use the same case as for E820_RAM. Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/setup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 90bac0a..fba4a6c 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -166,7 +166,7 @@ static unsigned long __init xen_set_identity(const struct e820entry *list, if (last > end) continue; - if (entry->type == E820_RAM) { + if ((entry->type == E820_RAM) || (entry->type == E820_UNUSABLE)) { if (start > start_pci) identity += set_phys_range_identity( PFN_UP(start_pci), PFN_DOWN(start)); -- cgit v1.1 From 0f16d0dfcdb5aab97d9e368f008b070b5b3ec6d3 Mon Sep 17 00:00:00 2001 From: Daniel Kiper Date: Wed, 11 May 2011 22:34:38 +0200 Subject: xen/setup: Fix for incorrect xen_extra_mem_start initialization under 32-bit git commit 24bdb0b62cc82120924762ae6bc85afc8c3f2b26 (xen: do not create the extra e820 region at an addr lower than 4G) does not take into account that ifdef CONFIG_X86_32 instead of e820_end_of_low_ram_pfn() find_low_pfn_range() is called (both calls are from arch/x86/kernel/setup.c). find_low_pfn_range() behaves correctly and does not require change in xen_extra_mem_start initialization. Additionally, if xen_extra_mem_start is initialized in the same way as ifdef CONFIG_X86_64 then memory hotplug support for Xen balloon driver (under development) is broken. Signed-off-by: Daniel Kiper Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/setup.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index fba4a6c..ca6297b 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -227,7 +227,11 @@ char * __init xen_memory_setup(void) memcpy(map_raw, map, sizeof(map)); e820.nr_map = 0; +#ifdef CONFIG_X86_32 + xen_extra_mem_start = mem_end; +#else xen_extra_mem_start = max((1ULL << 32), mem_end); +#endif for (i = 0; i < memmap.nr_entries; i++) { unsigned long long end; -- cgit v1.1 From 8c5950881c3b5e6e350e4b0438a8ccc513d90df9 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Fri, 1 Apr 2011 15:18:48 -0400 Subject: xen/p2m: Create entries in the P2M_MFN trees's to track 1-1 mappings .. when applicable. We need to track in the p2m_mfn and p2m_mfn_p the MFNs and pointers, respectivly, for the P2M entries that are allocated for the identity mappings. Without this, a PV domain with an E820 that triggers the 1-1 mapping to kick in, won't be able to be restored as the P2M won't have the identity mappings. Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/p2m.c | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 141eb0d..a01e653 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -522,11 +522,20 @@ static bool __init __early_alloc_p2m(unsigned long pfn) /* Boundary cross-over for the edges: */ if (idx) { unsigned long *p2m = extend_brk(PAGE_SIZE, PAGE_SIZE); + unsigned long *mid_mfn_p; p2m_init(p2m); p2m_top[topidx][mididx] = p2m; + /* For save/restore we need to MFN of the P2M saved */ + + mid_mfn_p = p2m_top_mfn_p[topidx]; + WARN(mid_mfn_p[mididx] != virt_to_mfn(p2m_missing), + "P2M_TOP_P[%d][%d] != MFN of p2m_missing!\n", + topidx, mididx); + mid_mfn_p[mididx] = virt_to_mfn(p2m); + } return idx != 0; } @@ -549,12 +558,29 @@ unsigned long __init set_phys_range_identity(unsigned long pfn_s, pfn += P2M_MID_PER_PAGE * P2M_PER_PAGE) { unsigned topidx = p2m_top_index(pfn); - if (p2m_top[topidx] == p2m_mid_missing) { - unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE); + unsigned long *mid_mfn_p; + unsigned long **mid; + + mid = p2m_top[topidx]; + mid_mfn_p = p2m_top_mfn_p[topidx]; + if (mid == p2m_mid_missing) { + mid = extend_brk(PAGE_SIZE, PAGE_SIZE); p2m_mid_init(mid); p2m_top[topidx] = mid; + + BUG_ON(mid_mfn_p != p2m_mid_missing_mfn); + } + /* And the save/restore P2M tables.. */ + if (mid_mfn_p == p2m_mid_missing_mfn) { + mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_mid_mfn_init(mid_mfn_p); + + p2m_top_mfn_p[topidx] = mid_mfn_p; + p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p); + /* Note: we don't set mid_mfn_p[midix] here, + * look in __early_alloc_p2m */ } } -- cgit v1.1 From 982b2035d9d7033f63db187bac55e9d8998b0266 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 12 May 2011 12:19:43 -0700 Subject: Revert "drm/i915: Only enable the plane after setting the fb base (pre-ILK)" This reverts commit 49183b2818de6899383bb82bc032f9344d6791ff. Quoth Franz Melchior: "This patch introduces a bug on my infamous "Acer Travelmate 5735Z-452G32Mnss": when KMS takes over, the frame buffer contents get completely garbled up on screen, with colored stripes and unreadable text (photo on request). Only when X11 is started, the screen gets restored again. Closing and re-opening the lid partly cures the mess, too: it makes the font readable, though horizontally stretched." Acked-by: Keith Packard Cc: Chris Wilson Cc: Daniel Vetter Cc: Jesse Barnes Signed-off-by: Linus Torvalds --- drivers/gpu/drm/i915/intel_display.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c index 373c2a0..2166ee0 100644 --- a/drivers/gpu/drm/i915/intel_display.c +++ b/drivers/gpu/drm/i915/intel_display.c @@ -5154,6 +5154,8 @@ static int intel_crtc_mode_set(struct drm_crtc *crtc, I915_WRITE(DSPCNTR(plane), dspcntr); POSTING_READ(DSPCNTR(plane)); + if (!HAS_PCH_SPLIT(dev)) + intel_enable_plane(dev_priv, plane, pipe); ret = intel_pipe_set_base(crtc, x, y, old_fb); -- cgit v1.1 From 93826c092c385549c04af184fbebd43f36995c69 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Thu, 7 Apr 2011 14:46:59 -0400 Subject: SELinux: delete debugging printks from filename_trans rule processing The filename_trans rule processing has some printk(KERN_ERR ) messages which were intended as debug aids in creating the code but weren't removed before it was submitted. Remove them. Reported-by: Paul Bolle Signed-off-by: Eric Paris --- security/selinux/ss/policydb.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/security/selinux/ss/policydb.c b/security/selinux/ss/policydb.c index e6e7ce0..7102457 100644 --- a/security/selinux/ss/policydb.c +++ b/security/selinux/ss/policydb.c @@ -1819,8 +1819,6 @@ static int filename_trans_read(struct policydb *p, void *fp) goto out; nel = le32_to_cpu(buf[0]); - printk(KERN_ERR "%s: nel=%d\n", __func__, nel); - last = p->filename_trans; while (last && last->next) last = last->next; @@ -1857,8 +1855,6 @@ static int filename_trans_read(struct policydb *p, void *fp) goto out; name[len] = 0; - printk(KERN_ERR "%s: ft=%p ft->name=%p ft->name=%s\n", __func__, ft, ft->name, ft->name); - rc = next_entry(buf, fp, sizeof(u32) * 4); if (rc) goto out; -- cgit v1.1 From a236c71766a5f69edf189e2eaeb0aa587c8c5684 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 12 May 2011 12:45:31 -0700 Subject: drivers/base/memory.c: fix warning due to "memory hotplug: Speed up add/remove when blocks are larger than PAGES_PER_SECTION" drivers/base/memory.c: In function 'memory_block_change_state': drivers/base/memory.c:281: warning: unused variable 'i' less beer, more testing Cc: Anton Blanchard Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- drivers/base/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/base/memory.c b/drivers/base/memory.c index c4c443d..0a134a4 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -278,7 +278,7 @@ memory_block_action(unsigned long phys_index, unsigned long action) static int memory_block_change_state(struct memory_block *mem, unsigned long to_state, unsigned long from_state_req) { - int i, ret = 0; + int ret = 0; mutex_lock(&mem->state_mutex); -- cgit v1.1 From cf7e032fc87d59c475df26c4d40bf45d401b2adb Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 12 May 2011 09:11:38 +0000 Subject: zorro8390: Fix regression caused during net_device_ops conversion Changeset b6114794a1c394534659f4a17420e48cf23aa922 ("zorro8390: convert to net_device_ops") broke zorro8390 by adding 8390.o to the link. That meant that lib8390.c was included twice, once in zorro8390.c and once in 8390.c, subject to different macros. This patch reverts that by avoiding the wrappers in 8390.c. Fix based on commits 217cbfa856dc1cbc2890781626c4032d9e3ec59f ("mac8390: fix regression caused during net_device_ops conversion") and 4e0168fa4842e27795a75b205a510f25b62181d9 ("mac8390: fix build with NET_POLL_CONTROLLER"). Reported-by: Christian T. Steigies Suggested-by: Finn Thain Signed-off-by: Geert Uytterhoeven Tested-by: Christian T. Steigies Cc: stable@kernel.org Signed-off-by: David S. Miller --- drivers/net/Makefile | 2 +- drivers/net/zorro8390.c | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/net/Makefile b/drivers/net/Makefile index 01b604a..c64675f 100644 --- a/drivers/net/Makefile +++ b/drivers/net/Makefile @@ -219,7 +219,7 @@ obj-$(CONFIG_SC92031) += sc92031.o obj-$(CONFIG_LP486E) += lp486e.o obj-$(CONFIG_ETH16I) += eth16i.o -obj-$(CONFIG_ZORRO8390) += zorro8390.o 8390.o +obj-$(CONFIG_ZORRO8390) += zorro8390.o obj-$(CONFIG_HPLANCE) += hplance.o 7990.o obj-$(CONFIG_MVME147_NET) += mvme147.o 7990.o obj-$(CONFIG_EQUALIZER) += eql.o diff --git a/drivers/net/zorro8390.c b/drivers/net/zorro8390.c index b78a38d..8c7c522 100644 --- a/drivers/net/zorro8390.c +++ b/drivers/net/zorro8390.c @@ -126,7 +126,7 @@ static int __devinit zorro8390_init_one(struct zorro_dev *z, board = z->resource.start; ioaddr = board+cards[i].offset; - dev = alloc_ei_netdev(); + dev = ____alloc_ei_netdev(0); if (!dev) return -ENOMEM; if (!request_mem_region(ioaddr, NE_IO_EXTENT*2, DRV_NAME)) { @@ -146,15 +146,15 @@ static int __devinit zorro8390_init_one(struct zorro_dev *z, static const struct net_device_ops zorro8390_netdev_ops = { .ndo_open = zorro8390_open, .ndo_stop = zorro8390_close, - .ndo_start_xmit = ei_start_xmit, - .ndo_tx_timeout = ei_tx_timeout, - .ndo_get_stats = ei_get_stats, - .ndo_set_multicast_list = ei_set_multicast_list, + .ndo_start_xmit = __ei_start_xmit, + .ndo_tx_timeout = __ei_tx_timeout, + .ndo_get_stats = __ei_get_stats, + .ndo_set_multicast_list = __ei_set_multicast_list, .ndo_validate_addr = eth_validate_addr, .ndo_set_mac_address = eth_mac_addr, .ndo_change_mtu = eth_change_mtu, #ifdef CONFIG_NET_POLL_CONTROLLER - .ndo_poll_controller = ei_poll, + .ndo_poll_controller = __ei_poll, #endif }; -- cgit v1.1 From 0b25e0157dfa236a0629c16c8ad6f222f633f682 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 12 May 2011 09:11:39 +0000 Subject: hydra: Fix regression caused during net_device_ops conversion Changeset 5618f0d1193d6b051da9b59b0e32ad24397f06a4 ("hydra: convert to net_device_ops") broke hydra by adding 8390.o to the link. That meant that lib8390.c was included twice, once in hydra.c and once in 8390.c, subject to different macros. This patch reverts that by avoiding the wrappers in 8390.c. Fix based on commits 217cbfa856dc1cbc2890781626c4032d9e3ec59f ("mac8390: fix regression caused during net_device_ops conversion") and 4e0168fa4842e27795a75b205a510f25b62181d9 ("mac8390: fix build with NET_POLL_CONTROLLER"). Signed-off-by: Geert Uytterhoeven Cc: stable@kernel.org Signed-off-by: David S. Miller --- drivers/net/Makefile | 2 +- drivers/net/hydra.c | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/net/Makefile b/drivers/net/Makefile index c64675f..4d2f094 100644 --- a/drivers/net/Makefile +++ b/drivers/net/Makefile @@ -231,7 +231,7 @@ obj-$(CONFIG_SGI_IOC3_ETH) += ioc3-eth.o obj-$(CONFIG_DECLANCE) += declance.o obj-$(CONFIG_ATARILANCE) += atarilance.o obj-$(CONFIG_A2065) += a2065.o -obj-$(CONFIG_HYDRA) += hydra.o 8390.o +obj-$(CONFIG_HYDRA) += hydra.o obj-$(CONFIG_ARIADNE) += ariadne.o obj-$(CONFIG_CS89x0) += cs89x0.o obj-$(CONFIG_MACSONIC) += macsonic.o diff --git a/drivers/net/hydra.c b/drivers/net/hydra.c index c5ef62c..1cd481c 100644 --- a/drivers/net/hydra.c +++ b/drivers/net/hydra.c @@ -98,15 +98,15 @@ static const struct net_device_ops hydra_netdev_ops = { .ndo_open = hydra_open, .ndo_stop = hydra_close, - .ndo_start_xmit = ei_start_xmit, - .ndo_tx_timeout = ei_tx_timeout, - .ndo_get_stats = ei_get_stats, - .ndo_set_multicast_list = ei_set_multicast_list, + .ndo_start_xmit = __ei_start_xmit, + .ndo_tx_timeout = __ei_tx_timeout, + .ndo_get_stats = __ei_get_stats, + .ndo_set_multicast_list = __ei_set_multicast_list, .ndo_validate_addr = eth_validate_addr, - .ndo_set_mac_address = eth_mac_addr, + .ndo_set_mac_address = eth_mac_addr, .ndo_change_mtu = eth_change_mtu, #ifdef CONFIG_NET_POLL_CONTROLLER - .ndo_poll_controller = ei_poll, + .ndo_poll_controller = __ei_poll, #endif }; @@ -125,7 +125,7 @@ static int __devinit hydra_init(struct zorro_dev *z) 0x10, 0x12, 0x14, 0x16, 0x18, 0x1a, 0x1c, 0x1e, }; - dev = alloc_ei_netdev(); + dev = ____alloc_ei_netdev(0); if (!dev) return -ENOMEM; -- cgit v1.1 From 2592a7354092afd304a8c067319b15ab1e441e35 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 12 May 2011 09:11:40 +0000 Subject: ne-h8300: Fix regression caused during net_device_ops conversion Changeset dcd39c90290297f6e6ed8a04bb20da7ac2b043c5 ("ne-h8300: convert to net_device_ops") broke ne-h8300 by adding 8390.o to the link. That meant that lib8390.c was included twice, once in ne-h8300.c and once in 8390.c, subject to different macros. This patch reverts that by avoiding the wrappers in 8390.c. Fix based on commits 217cbfa856dc1cbc2890781626c4032d9e3ec59f ("mac8390: fix regression caused during net_device_ops conversion") and 4e0168fa4842e27795a75b205a510f25b62181d9 ("mac8390: fix build with NET_POLL_CONTROLLER"). Signed-off-by: Geert Uytterhoeven Cc: stable@kernel.org Signed-off-by: David S. Miller --- drivers/net/Makefile | 2 +- drivers/net/ne-h8300.c | 16 ++++++++-------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/net/Makefile b/drivers/net/Makefile index 4d2f094..e5a7375 100644 --- a/drivers/net/Makefile +++ b/drivers/net/Makefile @@ -144,7 +144,7 @@ obj-$(CONFIG_NE3210) += ne3210.o 8390.o obj-$(CONFIG_SB1250_MAC) += sb1250-mac.o obj-$(CONFIG_B44) += b44.o obj-$(CONFIG_FORCEDETH) += forcedeth.o -obj-$(CONFIG_NE_H8300) += ne-h8300.o 8390.o +obj-$(CONFIG_NE_H8300) += ne-h8300.o obj-$(CONFIG_AX88796) += ax88796.o obj-$(CONFIG_BCM63XX_ENET) += bcm63xx_enet.o obj-$(CONFIG_FTMAC100) += ftmac100.o diff --git a/drivers/net/ne-h8300.c b/drivers/net/ne-h8300.c index 30be8c6..7298a34 100644 --- a/drivers/net/ne-h8300.c +++ b/drivers/net/ne-h8300.c @@ -167,7 +167,7 @@ static void cleanup_card(struct net_device *dev) #ifndef MODULE struct net_device * __init ne_probe(int unit) { - struct net_device *dev = alloc_ei_netdev(); + struct net_device *dev = ____alloc_ei_netdev(0); int err; if (!dev) @@ -197,15 +197,15 @@ static const struct net_device_ops ne_netdev_ops = { .ndo_open = ne_open, .ndo_stop = ne_close, - .ndo_start_xmit = ei_start_xmit, - .ndo_tx_timeout = ei_tx_timeout, - .ndo_get_stats = ei_get_stats, - .ndo_set_multicast_list = ei_set_multicast_list, + .ndo_start_xmit = __ei_start_xmit, + .ndo_tx_timeout = __ei_tx_timeout, + .ndo_get_stats = __ei_get_stats, + .ndo_set_multicast_list = __ei_set_multicast_list, .ndo_validate_addr = eth_validate_addr, - .ndo_set_mac_address = eth_mac_addr, + .ndo_set_mac_address = eth_mac_addr, .ndo_change_mtu = eth_change_mtu, #ifdef CONFIG_NET_POLL_CONTROLLER - .ndo_poll_controller = ei_poll, + .ndo_poll_controller = __ei_poll, #endif }; @@ -637,7 +637,7 @@ int init_module(void) int err; for (this_dev = 0; this_dev < MAX_NE_CARDS; this_dev++) { - struct net_device *dev = alloc_ei_netdev(); + struct net_device *dev = ____alloc_ei_netdev(0); if (!dev) break; if (io[this_dev]) { -- cgit v1.1 From bdf516748ed73f6219f5e4065a8fe2f333520687 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Mon, 4 Apr 2011 13:39:53 +0100 Subject: xen: tidy up whitespace in drivers/xen/Makefile Various merges over time have led to rather a mish-mash of indentation. Signed-off-by: Ian Campbell Cc: Jeremy Fitzhardinge Cc: xen-devel@lists.xensource.com Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/Makefile | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile index f420f1f..4781f80 100644 --- a/drivers/xen/Makefile +++ b/drivers/xen/Makefile @@ -4,21 +4,21 @@ obj-y += xenbus/ nostackp := $(call cc-option, -fno-stack-protector) CFLAGS_features.o := $(nostackp) -obj-$(CONFIG_BLOCK) += biomerge.o -obj-$(CONFIG_HOTPLUG_CPU) += cpu_hotplug.o -obj-$(CONFIG_XEN_XENCOMM) += xencomm.o -obj-$(CONFIG_XEN_BALLOON) += xen-balloon.o -obj-$(CONFIG_XEN_DEV_EVTCHN) += xen-evtchn.o -obj-$(CONFIG_XEN_GNTDEV) += xen-gntdev.o +obj-$(CONFIG_BLOCK) += biomerge.o +obj-$(CONFIG_HOTPLUG_CPU) += cpu_hotplug.o +obj-$(CONFIG_XEN_XENCOMM) += xencomm.o +obj-$(CONFIG_XEN_BALLOON) += xen-balloon.o +obj-$(CONFIG_XEN_DEV_EVTCHN) += xen-evtchn.o +obj-$(CONFIG_XEN_GNTDEV) += xen-gntdev.o obj-$(CONFIG_XEN_GRANT_DEV_ALLOC) += xen-gntalloc.o -obj-$(CONFIG_XENFS) += xenfs/ +obj-$(CONFIG_XENFS) += xenfs/ obj-$(CONFIG_XEN_SYS_HYPERVISOR) += sys-hypervisor.o -obj-$(CONFIG_XEN_PLATFORM_PCI) += xen-platform-pci.o -obj-$(CONFIG_SWIOTLB_XEN) += swiotlb-xen.o -obj-$(CONFIG_XEN_DOM0) += pci.o +obj-$(CONFIG_XEN_PLATFORM_PCI) += xen-platform-pci.o +obj-$(CONFIG_SWIOTLB_XEN) += swiotlb-xen.o +obj-$(CONFIG_XEN_DOM0) += pci.o -xen-evtchn-y := evtchn.o +xen-evtchn-y := evtchn.o xen-gntdev-y := gntdev.o xen-gntalloc-y := gntalloc.o -xen-platform-pci-y := platform-pci.o +xen-platform-pci-y := platform-pci.o -- cgit v1.1 From 251511a18dcfe1868e3edfdc490777dcfaa1f851 Mon Sep 17 00:00:00 2001 From: Daniel Kiper Date: Wed, 4 May 2011 20:16:07 +0200 Subject: arch/x86/xen/irq: Cleanup code/data sections definitions Cleanup code/data sections definitions accordingly to include/linux/init.h. Signed-off-by: Daniel Kiper Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/irq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c index 6a6fe89..8bbb465 100644 --- a/arch/x86/xen/irq.c +++ b/arch/x86/xen/irq.c @@ -113,7 +113,7 @@ static void xen_halt(void) xen_safe_halt(); } -static const struct pv_irq_ops xen_irq_ops __initdata = { +static const struct pv_irq_ops xen_irq_ops __initconst = { .save_fl = PV_CALLEE_SAVE(xen_save_fl), .restore_fl = PV_CALLEE_SAVE(xen_restore_fl), .irq_disable = PV_CALLEE_SAVE(xen_irq_disable), -- cgit v1.1 From ad3062a0f438a5f436dae267f795c0a9686f11d2 Mon Sep 17 00:00:00 2001 From: Daniel Kiper Date: Wed, 4 May 2011 20:15:18 +0200 Subject: arch/x86/xen/enlighten: Cleanup code/data sections definitions Cleanup code/data sections definitions accordingly to include/linux/init.h. Signed-off-by: Daniel Kiper Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/enlighten.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index e3c6a06..dd7b88f 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -235,7 +235,7 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx, *dx &= maskedx; } -static __init void xen_init_cpuid_mask(void) +static void __init xen_init_cpuid_mask(void) { unsigned int ax, bx, cx, dx; unsigned int xsave_mask; @@ -400,7 +400,7 @@ static void xen_load_gdt(const struct desc_ptr *dtr) /* * load_gdt for early boot, when the gdt is only mapped once */ -static __init void xen_load_gdt_boot(const struct desc_ptr *dtr) +static void __init xen_load_gdt_boot(const struct desc_ptr *dtr) { unsigned long va = dtr->address; unsigned int size = dtr->size + 1; @@ -662,7 +662,7 @@ static void xen_write_gdt_entry(struct desc_struct *dt, int entry, * Version of write_gdt_entry for use at early boot-time needed to * update an entry as simply as possible. */ -static __init void xen_write_gdt_entry_boot(struct desc_struct *dt, int entry, +static void __init xen_write_gdt_entry_boot(struct desc_struct *dt, int entry, const void *desc, int type) { switch (type) { @@ -933,18 +933,18 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf, return ret; } -static const struct pv_info xen_info __initdata = { +static const struct pv_info xen_info __initconst = { .paravirt_enabled = 1, .shared_kernel_pmd = 0, .name = "Xen", }; -static const struct pv_init_ops xen_init_ops __initdata = { +static const struct pv_init_ops xen_init_ops __initconst = { .patch = xen_patch, }; -static const struct pv_cpu_ops xen_cpu_ops __initdata = { +static const struct pv_cpu_ops xen_cpu_ops __initconst = { .cpuid = xen_cpuid, .set_debugreg = xen_set_debugreg, @@ -1004,7 +1004,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = { .end_context_switch = xen_end_context_switch, }; -static const struct pv_apic_ops xen_apic_ops __initdata = { +static const struct pv_apic_ops xen_apic_ops __initconst = { #ifdef CONFIG_X86_LOCAL_APIC .startup_ipi_hook = paravirt_nop, #endif @@ -1055,7 +1055,7 @@ int xen_panic_handler_init(void) return 0; } -static const struct machine_ops __initdata xen_machine_ops = { +static const struct machine_ops xen_machine_ops __initconst = { .restart = xen_restart, .halt = xen_machine_halt, .power_off = xen_machine_halt, @@ -1332,7 +1332,7 @@ static int __cpuinit xen_hvm_cpu_notify(struct notifier_block *self, return NOTIFY_OK; } -static struct notifier_block __cpuinitdata xen_hvm_cpu_notifier = { +static struct notifier_block xen_hvm_cpu_notifier __cpuinitdata = { .notifier_call = xen_hvm_cpu_notify, }; @@ -1381,7 +1381,7 @@ bool xen_hvm_need_lapic(void) } EXPORT_SYMBOL_GPL(xen_hvm_need_lapic); -const __refconst struct hypervisor_x86 x86_hyper_xen_hvm = { +const struct hypervisor_x86 x86_hyper_xen_hvm __refconst = { .name = "Xen HVM", .detect = xen_hvm_platform, .init_platform = xen_hvm_guest_init, -- cgit v1.1 From ae15a3b4d1374b733016ce4b4148b2ba42bbeb0f Mon Sep 17 00:00:00 2001 From: Daniel Kiper Date: Wed, 4 May 2011 20:17:21 +0200 Subject: arch/x86/xen/setup: Cleanup code/data sections definitions Cleanup code/data sections definitions accordingly to include/linux/init.h. Signed-off-by: Daniel Kiper Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/setup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 90bac0a..d3663df 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -50,7 +50,7 @@ phys_addr_t xen_extra_mem_start, xen_extra_mem_size; */ #define EXTRA_MEM_RATIO (10) -static __init void xen_add_extra_mem(unsigned long pages) +static void __init xen_add_extra_mem(unsigned long pages) { unsigned long pfn; @@ -336,7 +336,7 @@ static void __init fiddle_vdso(void) #endif } -static __cpuinit int register_callback(unsigned type, const void *func) +static int __cpuinit register_callback(unsigned type, const void *func) { struct callback_register callback = { .type = type, -- cgit v1.1 From 77ed23f8d995a01cd8101d84351b567bf5177a30 Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Tue, 10 May 2011 08:26:43 -0500 Subject: x86: Fix UV BAU for non-consecutive nasids This is a fix for the SGI Altix-UV Broadcast Assist Unit code, which is used for TLB flushing. Certain hardware configurations (that customers are ordering) cause nasids (numa address space id's) to be non-consecutive. Specifically, once you have more than 4 blades in a IRU (Individual Rack Unit - or 1/2 rack) but less than the maximum of 16, the nasid numbering becomes non-consecutive. This currently results in a 'catastrophic error' (CATERR) detected by the firmware during OS boot. The BAU is generating an 'INTD' request that is targeting a non-existent nasid value. Such configurations may also occur when a blade is configured off because of hardware errors. (There is one UV hub per blade.) This patch is required to support such configurations. The problem with the tlb_uv.c code is that is using the consecutive hub numbers as indices to the BAU distribution bit map. These are simply the ordinal position of the hub or blade within its partition. It should be using physical node numbers (pnodes), which correspond to the physical nasid values. Use of the hub number only works as long as the nasids in the partition are consecutive and increase with a stride of 1. This patch changes the index to be the pnode number, thus allowing nasids to be non-consecutive. It also provides a table in local memory for each cpu to translate target cpu number to target pnode and nasid. And it improves naming to properly reflect 'node' and 'uvhub' versus 'nasid'. Signed-off-by: Cliff Wickman Cc: Link: http://lkml.kernel.org/r/E1QJmxX-0002Mz-Fk@eag09.americas.sgi.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uv/uv_bau.h | 17 ++++++-- arch/x86/platform/uv/tlb_uv.c | 92 +++++++++++++++++++++++++++------------- 2 files changed, 76 insertions(+), 33 deletions(-) diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h index 3e094af..130f1ee 100644 --- a/arch/x86/include/asm/uv/uv_bau.h +++ b/arch/x86/include/asm/uv/uv_bau.h @@ -94,6 +94,8 @@ /* after this # consecutive successes, bump up the throttle if it was lowered */ #define COMPLETE_THRESHOLD 5 +#define UV_LB_SUBNODEID 0x10 + /* * number of entries in the destination side payload queue */ @@ -124,7 +126,7 @@ * The distribution specification (32 bytes) is interpreted as a 256-bit * distribution vector. Adjacent bits correspond to consecutive even numbered * nodeIDs. The result of adding the index of a given bit to the 15-bit - * 'base_dest_nodeid' field of the header corresponds to the + * 'base_dest_nasid' field of the header corresponds to the * destination nodeID associated with that specified bit. */ struct bau_target_uvhubmask { @@ -176,7 +178,7 @@ struct bau_msg_payload { struct bau_msg_header { unsigned int dest_subnodeid:6; /* must be 0x10, for the LB */ /* bits 5:0 */ - unsigned int base_dest_nodeid:15; /* nasid of the */ + unsigned int base_dest_nasid:15; /* nasid of the */ /* bits 20:6 */ /* first bit in uvhub map */ unsigned int command:8; /* message type */ /* bits 28:21 */ @@ -378,6 +380,10 @@ struct ptc_stats { unsigned long d_rcanceled; /* number of messages canceled by resets */ }; +struct hub_and_pnode { + short uvhub; + short pnode; +}; /* * one per-cpu; to locate the software tables */ @@ -399,10 +405,12 @@ struct bau_control { int baudisabled; int set_bau_off; short cpu; + short osnode; short uvhub_cpu; short uvhub; short cpus_in_socket; short cpus_in_uvhub; + short partition_base_pnode; unsigned short message_number; unsigned short uvhub_quiesce; short socket_acknowledge_count[DEST_Q_SIZE]; @@ -422,15 +430,16 @@ struct bau_control { int congested_period; cycles_t period_time; long period_requests; + struct hub_and_pnode *target_hub_and_pnode; }; static inline int bau_uvhub_isset(int uvhub, struct bau_target_uvhubmask *dstp) { return constant_test_bit(uvhub, &dstp->bits[0]); } -static inline void bau_uvhub_set(int uvhub, struct bau_target_uvhubmask *dstp) +static inline void bau_uvhub_set(int pnode, struct bau_target_uvhubmask *dstp) { - __set_bit(uvhub, &dstp->bits[0]); + __set_bit(pnode, &dstp->bits[0]); } static inline void bau_uvhubs_clear(struct bau_target_uvhubmask *dstp, int nbits) diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index 7cb6424..c58e0ea 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -699,16 +699,17 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, struct mm_struct *mm, unsigned long va, unsigned int cpu) { - int tcpu; - int uvhub; int locals = 0; int remotes = 0; int hubs = 0; + int tcpu; + int tpnode; struct bau_desc *bau_desc; struct cpumask *flush_mask; struct ptc_stats *stat; struct bau_control *bcp; struct bau_control *tbcp; + struct hub_and_pnode *hpp; /* kernel was booted 'nobau' */ if (nobau) @@ -750,11 +751,18 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, bau_desc += UV_ITEMS_PER_DESCRIPTOR * bcp->uvhub_cpu; bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE); - /* cpu statistics */ for_each_cpu(tcpu, flush_mask) { - uvhub = uv_cpu_to_blade_id(tcpu); - bau_uvhub_set(uvhub, &bau_desc->distribution); - if (uvhub == bcp->uvhub) + /* + * The distribution vector is a bit map of pnodes, relative + * to the partition base pnode (and the partition base nasid + * in the header). + * Translate cpu to pnode and hub using an array stored + * in local memory. + */ + hpp = &bcp->socket_master->target_hub_and_pnode[tcpu]; + tpnode = hpp->pnode - bcp->partition_base_pnode; + bau_uvhub_set(tpnode, &bau_desc->distribution); + if (hpp->uvhub == bcp->uvhub) locals++; else remotes++; @@ -855,7 +863,7 @@ void uv_bau_message_interrupt(struct pt_regs *regs) * an interrupt, but causes an error message to be returned to * the sender. */ -static void uv_enable_timeouts(void) +static void __init uv_enable_timeouts(void) { int uvhub; int nuvhubs; @@ -1326,10 +1334,10 @@ static int __init uv_ptc_init(void) } /* - * initialize the sending side's sending buffers + * Initialize the sending side's sending buffers. */ static void -uv_activation_descriptor_init(int node, int pnode) +uv_activation_descriptor_init(int node, int pnode, int base_pnode) { int i; int cpu; @@ -1352,11 +1360,11 @@ uv_activation_descriptor_init(int node, int pnode) n = pa >> uv_nshift; m = pa & uv_mmask; + /* the 14-bit pnode */ uv_write_global_mmr64(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE, (n << UV_DESC_BASE_PNODE_SHIFT | m)); - /* - * initializing all 8 (UV_ITEMS_PER_DESCRIPTOR) descriptors for each + * Initializing all 8 (UV_ITEMS_PER_DESCRIPTOR) descriptors for each * cpu even though we only use the first one; one descriptor can * describe a broadcast to 256 uv hubs. */ @@ -1365,12 +1373,13 @@ uv_activation_descriptor_init(int node, int pnode) memset(bd2, 0, sizeof(struct bau_desc)); bd2->header.sw_ack_flag = 1; /* - * base_dest_nodeid is the nasid of the first uvhub - * in the partition. The bit map will indicate uvhub numbers, - * which are 0-N in a partition. Pnodes are unique system-wide. + * The base_dest_nasid set in the message header is the nasid + * of the first uvhub in the partition. The bit map will + * indicate destination pnode numbers relative to that base. + * They may not be consecutive if nasid striding is being used. */ - bd2->header.base_dest_nodeid = UV_PNODE_TO_NASID(uv_partition_base_pnode); - bd2->header.dest_subnodeid = 0x10; /* the LB */ + bd2->header.base_dest_nasid = UV_PNODE_TO_NASID(base_pnode); + bd2->header.dest_subnodeid = UV_LB_SUBNODEID; bd2->header.command = UV_NET_ENDPOINT_INTD; bd2->header.int_both = 1; /* @@ -1442,7 +1451,7 @@ uv_payload_queue_init(int node, int pnode) /* * Initialization of each UV hub's structures */ -static void __init uv_init_uvhub(int uvhub, int vector) +static void __init uv_init_uvhub(int uvhub, int vector, int base_pnode) { int node; int pnode; @@ -1450,11 +1459,11 @@ static void __init uv_init_uvhub(int uvhub, int vector) node = uvhub_to_first_node(uvhub); pnode = uv_blade_to_pnode(uvhub); - uv_activation_descriptor_init(node, pnode); + uv_activation_descriptor_init(node, pnode, base_pnode); uv_payload_queue_init(node, pnode); /* - * the below initialization can't be in firmware because the - * messaging IRQ will be determined by the OS + * The below initialization can't be in firmware because the + * messaging IRQ will be determined by the OS. */ apicid = uvhub_to_first_apicid(uvhub) | uv_apicid_hibits; uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG, @@ -1491,10 +1500,11 @@ calculate_destination_timeout(void) /* * initialize the bau_control structure for each cpu */ -static int __init uv_init_per_cpu(int nuvhubs) +static int __init uv_init_per_cpu(int nuvhubs, int base_part_pnode) { int i; int cpu; + int tcpu; int pnode; int uvhub; int have_hmaster; @@ -1528,6 +1538,15 @@ static int __init uv_init_per_cpu(int nuvhubs) bcp = &per_cpu(bau_control, cpu); memset(bcp, 0, sizeof(struct bau_control)); pnode = uv_cpu_hub_info(cpu)->pnode; + if ((pnode - base_part_pnode) >= UV_DISTRIBUTION_SIZE) { + printk(KERN_EMERG + "cpu %d pnode %d-%d beyond %d; BAU disabled\n", + cpu, pnode, base_part_pnode, + UV_DISTRIBUTION_SIZE); + return 1; + } + bcp->osnode = cpu_to_node(cpu); + bcp->partition_base_pnode = uv_partition_base_pnode; uvhub = uv_cpu_hub_info(cpu)->numa_blade_id; *(uvhub_mask + (uvhub/8)) |= (1 << (uvhub%8)); bdp = &uvhub_descs[uvhub]; @@ -1536,7 +1555,7 @@ static int __init uv_init_per_cpu(int nuvhubs) bdp->pnode = pnode; /* kludge: 'assuming' one node per socket, and assuming that disabling a socket just leaves a gap in node numbers */ - socket = (cpu_to_node(cpu) & 1); + socket = bcp->osnode & 1; bdp->socket_mask |= (1 << socket); sdp = &bdp->socket[socket]; sdp->cpu_number[sdp->num_cpus] = cpu; @@ -1585,6 +1604,20 @@ static int __init uv_init_per_cpu(int nuvhubs) nextsocket: socket++; socket_mask = (socket_mask >> 1); + /* each socket gets a local array of pnodes/hubs */ + bcp = smaster; + bcp->target_hub_and_pnode = kmalloc_node( + sizeof(struct hub_and_pnode) * + num_possible_cpus(), GFP_KERNEL, bcp->osnode); + memset(bcp->target_hub_and_pnode, 0, + sizeof(struct hub_and_pnode) * + num_possible_cpus()); + for_each_present_cpu(tcpu) { + bcp->target_hub_and_pnode[tcpu].pnode = + uv_cpu_hub_info(tcpu)->pnode; + bcp->target_hub_and_pnode[tcpu].uvhub = + uv_cpu_hub_info(tcpu)->numa_blade_id; + } } } kfree(uvhub_descs); @@ -1637,21 +1670,22 @@ static int __init uv_bau_init(void) spin_lock_init(&disable_lock); congested_cycles = microsec_2_cycles(congested_response_us); - if (uv_init_per_cpu(nuvhubs)) { - nobau = 1; - return 0; - } - uv_partition_base_pnode = 0x7fffffff; - for (uvhub = 0; uvhub < nuvhubs; uvhub++) + for (uvhub = 0; uvhub < nuvhubs; uvhub++) { if (uv_blade_nr_possible_cpus(uvhub) && (uv_blade_to_pnode(uvhub) < uv_partition_base_pnode)) uv_partition_base_pnode = uv_blade_to_pnode(uvhub); + } + + if (uv_init_per_cpu(nuvhubs, uv_partition_base_pnode)) { + nobau = 1; + return 0; + } vector = UV_BAU_MESSAGE; for_each_possible_blade(uvhub) if (uv_blade_nr_possible_cpus(uvhub)) - uv_init_uvhub(uvhub, vector); + uv_init_uvhub(uvhub, vector, uv_partition_base_pnode); uv_enable_timeouts(); alloc_intr_gate(vector, uv_bau_message_intr1); -- cgit v1.1 From 1b0bcbcf62884959fa7214eb16c44cff445691c6 Mon Sep 17 00:00:00 2001 From: Pedro Scarapicchia Junior Date: Mon, 9 May 2011 14:10:49 +0000 Subject: net/9p/protocol.c: Fix a memory leak When p9pdu_readf() is called with "s" attribute, it allocates a pointer that will store a string. In p9dirent_read(), this pointer is not being released, leading to out of memory errors. This patch releases this pointer after string is copyed to dirent->d_name. Signed-off-by: Pedro Scarapicchia Junior Signed-off-by: Eric Van Hensbergen --- net/9p/protocol.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/9p/protocol.c b/net/9p/protocol.c index b58a501..a873277 100644 --- a/net/9p/protocol.c +++ b/net/9p/protocol.c @@ -674,6 +674,7 @@ int p9dirent_read(char *buf, int len, struct p9_dirent *dirent, } strcpy(dirent->d_name, nameptr); + kfree(nameptr); out: return fake_pdu.offset; -- cgit v1.1 From 411f05f123cbd7f8aa1edcae86970755a6e2a9d9 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 12 May 2011 23:00:28 +0200 Subject: vsprintf: Turn kptr_restrict off by default kptr_restrict has been triggering bugs in apps such as perf, and it also makes the system less useful by default, so turn it off by default. This is how we generally handle security features that remove functionality, such as firewall code or SELinux - they have to be configured and activated from user-space. Distributions can turn kptr_restrict on again via this line in /etc/sysctrl.conf: kernel.kptr_restrict = 1 ( Also mark the variable __read_mostly while at it, as it's typically modified only once per bootup, or not at all. ) Signed-off-by: Ingo Molnar Acked-by: David S. Miller Signed-off-by: Linus Torvalds --- lib/vsprintf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/vsprintf.c b/lib/vsprintf.c index bc0ac6b..dfd6019 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -797,7 +797,7 @@ char *uuid_string(char *buf, char *end, const u8 *addr, return string(buf, end, uuid, spec); } -int kptr_restrict = 1; +int kptr_restrict __read_mostly; /* * Show a '%p' thing. A kernel extension is that the '%p' is followed -- cgit v1.1 From ca06707022d6ba4744198a8ebbe4994786b0c613 Mon Sep 17 00:00:00 2001 From: "Steinar H. Gunderson" Date: Fri, 6 May 2011 23:44:46 +0000 Subject: ipv6: restore correct ECN handling on TCP xmit Since commit e9df2e8fd8fbc9 (Use appropriate sock tclass setting for routing lookup) we lost ability to properly add ECN codemarks to ipv6 TCP frames. It seems like TCP_ECN_send() calls INET_ECN_xmit(), which only sets the ECN bit in the IPv4 ToS field (inet_sk(sk)->tos), but after the patch, what's checked is inet6_sk(sk)->tclass, which is a completely different field. Close bug https://bugzilla.kernel.org/show_bug.cgi?id=34322 [Eric Dumazet] : added the INET_ECN_dontxmit() fix and replace macros by inline functions for clarity. Signed-off-by: Steinar H. Gunderson Signed-off-by: Eric Dumazet Cc: YOSHIFUJI Hideaki Cc: Andrew Morton Signed-off-by: David S. Miller --- include/net/inet_ecn.h | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/include/net/inet_ecn.h b/include/net/inet_ecn.h index 88bdd01..2fa8d13 100644 --- a/include/net/inet_ecn.h +++ b/include/net/inet_ecn.h @@ -38,9 +38,19 @@ static inline __u8 INET_ECN_encapsulate(__u8 outer, __u8 inner) return outer; } -#define INET_ECN_xmit(sk) do { inet_sk(sk)->tos |= INET_ECN_ECT_0; } while (0) -#define INET_ECN_dontxmit(sk) \ - do { inet_sk(sk)->tos &= ~INET_ECN_MASK; } while (0) +static inline void INET_ECN_xmit(struct sock *sk) +{ + inet_sk(sk)->tos |= INET_ECN_ECT_0; + if (inet6_sk(sk) != NULL) + inet6_sk(sk)->tclass |= INET_ECN_ECT_0; +} + +static inline void INET_ECN_dontxmit(struct sock *sk) +{ + inet_sk(sk)->tos &= ~INET_ECN_MASK; + if (inet6_sk(sk) != NULL) + inet6_sk(sk)->tclass &= ~INET_ECN_MASK; +} #define IP6_ECN_flow_init(label) do { \ (label) &= ~htonl(INET_ECN_MASK << 20); \ -- cgit v1.1 From 9ddabb055d73c63037878bb9346e52c7f2e07e96 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Fri, 29 Apr 2011 15:30:02 +0200 Subject: i2c: pnx: Fix crash due to wrong init of timer->data alg_data is already a pointer which must be passed directly. Reported-by: Dieter Ripp Signed-off-by: Wolfram Sang Cc: Russell King Cc: Ben Dooks Signed-off-by: Ben Dooks --- drivers/i2c/busses/i2c-pnx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/i2c/busses/i2c-pnx.c b/drivers/i2c/busses/i2c-pnx.c index a97e3fe..04be9f82 100644 --- a/drivers/i2c/busses/i2c-pnx.c +++ b/drivers/i2c/busses/i2c-pnx.c @@ -65,7 +65,7 @@ static inline void i2c_pnx_arm_timer(struct i2c_pnx_algo_data *alg_data) jiffies, expires); timer->expires = jiffies + expires; - timer->data = (unsigned long)&alg_data; + timer->data = (unsigned long)alg_data; add_timer(timer); } -- cgit v1.1 From 11f770027b5c0de16544f3ec82b5c6f9f8d5a644 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 12 May 2011 16:13:54 -0700 Subject: rbd: fix leak of ops struct The ops vector must be freed by the rbd_do_request caller. Signed-off-by: Sage Weil --- drivers/block/rbd.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 3e90471..2146cab 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -996,6 +996,8 @@ static int rbd_do_op(struct request *rq, ops, num_reply, rbd_req_cb, 0, NULL); + + rbd_destroy_ops(ops); done: kfree(seg_name); return ret; @@ -1063,7 +1065,9 @@ static int rbd_req_sync_notify_ack(struct rbd_device *dev, { struct ceph_osd_req_op *ops; struct page **pages = NULL; - int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_NOTIFY_ACK, 0); + int ret; + + ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_NOTIFY_ACK, 0); if (ret < 0) return ret; -- cgit v1.1 From d9282fca8a763be574a2fc20b2edcc6e132cbf90 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Wed, 11 May 2011 03:15:24 -0400 Subject: drm/radeon/kms: fix tiling reg on fusion The location of MC_ARB_RAMCFG changed on fusion. I've diffed all the other regs in evergreend.h and this is the only other reg that changed. Signed-off-by: Alex Deucher Signed-off-by: Dave Airlie --- drivers/gpu/drm/radeon/evergreen.c | 5 ++++- drivers/gpu/drm/radeon/evergreend.h | 1 + 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/radeon/evergreen.c b/drivers/gpu/drm/radeon/evergreen.c index c20eac3..9073e3b 100644 --- a/drivers/gpu/drm/radeon/evergreen.c +++ b/drivers/gpu/drm/radeon/evergreen.c @@ -1780,7 +1780,10 @@ static void evergreen_gpu_init(struct radeon_device *rdev) mc_shared_chmap = RREG32(MC_SHARED_CHMAP); - mc_arb_ramcfg = RREG32(MC_ARB_RAMCFG); + if (rdev->flags & RADEON_IS_IGP) + mc_arb_ramcfg = RREG32(FUS_MC_ARB_RAMCFG); + else + mc_arb_ramcfg = RREG32(MC_ARB_RAMCFG); switch (rdev->config.evergreen.max_tile_pipes) { case 1: diff --git a/drivers/gpu/drm/radeon/evergreend.h b/drivers/gpu/drm/radeon/evergreend.h index 9453384..fc40e0c 100644 --- a/drivers/gpu/drm/radeon/evergreend.h +++ b/drivers/gpu/drm/radeon/evergreend.h @@ -200,6 +200,7 @@ #define BURSTLENGTH_SHIFT 9 #define BURSTLENGTH_MASK 0x00000200 #define CHANSIZE_OVERRIDE (1 << 11) +#define FUS_MC_ARB_RAMCFG 0x2768 #define MC_VM_AGP_TOP 0x2028 #define MC_VM_AGP_BOT 0x202C #define MC_VM_AGP_BASE 0x2030 -- cgit v1.1 From 05fa7ea7d23980de0014417a0e0af2048a0f9fc1 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Wed, 11 May 2011 14:02:07 -0400 Subject: drm/radeon/kms: fix extended lvds info parsing On rev <= 1.1 tables, the offset is absolute, on newer tables, it's relative. Fixes: https://bugzilla.redhat.com/show_bug.cgi?id=700326 Signed-off-by: Alex Deucher Reviewed-by: Jerome Glisse Cc: stable@kernel.org Signed-off-by: Dave Airlie --- drivers/gpu/drm/radeon/radeon_atombios.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/radeon/radeon_atombios.c b/drivers/gpu/drm/radeon/radeon_atombios.c index dd881d0..90dfb2b 100644 --- a/drivers/gpu/drm/radeon/radeon_atombios.c +++ b/drivers/gpu/drm/radeon/radeon_atombios.c @@ -1574,9 +1574,17 @@ struct radeon_encoder_atom_dig *radeon_atombios_get_lvds_info(struct ATOM_FAKE_EDID_PATCH_RECORD *fake_edid_record; ATOM_PANEL_RESOLUTION_PATCH_RECORD *panel_res_record; bool bad_record = false; - u8 *record = (u8 *)(mode_info->atom_context->bios + - data_offset + - le16_to_cpu(lvds_info->info.usModePatchTableOffset)); + u8 *record; + + if ((frev == 1) && (crev < 2)) + /* absolute */ + record = (u8 *)(mode_info->atom_context->bios + + le16_to_cpu(lvds_info->info.usModePatchTableOffset)); + else + /* relative */ + record = (u8 *)(mode_info->atom_context->bios + + data_offset + + le16_to_cpu(lvds_info->info.usModePatchTableOffset)); while (*record != ATOM_RECORD_END_TYPE) { switch (*record) { case LCD_MODE_PATCH_RECORD_MODE_TYPE: -- cgit v1.1 From 3a8ab79eae4500e6ac618a92a90cee63d6e804a8 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 12 May 2011 21:15:15 -0400 Subject: drm/radeon/kms: add some evergreen/ni safe regs need to programmed from the userspace drivers. Signed-off-by: Alex Deucher Signed-off-by: Dave Airlie --- drivers/gpu/drm/radeon/reg_srcs/cayman | 1 + drivers/gpu/drm/radeon/reg_srcs/evergreen | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/gpu/drm/radeon/reg_srcs/cayman b/drivers/gpu/drm/radeon/reg_srcs/cayman index 6334f8a..0aa8e85 100644 --- a/drivers/gpu/drm/radeon/reg_srcs/cayman +++ b/drivers/gpu/drm/radeon/reg_srcs/cayman @@ -33,6 +33,7 @@ cayman 0x9400 0x00008E48 SQ_EX_ALLOC_TABLE_SLOTS 0x00009100 SPI_CONFIG_CNTL 0x0000913C SPI_CONFIG_CNTL_1 +0x00009508 TA_CNTL_AUX 0x00009830 DB_DEBUG 0x00009834 DB_DEBUG2 0x00009838 DB_DEBUG3 diff --git a/drivers/gpu/drm/radeon/reg_srcs/evergreen b/drivers/gpu/drm/radeon/reg_srcs/evergreen index 7e16371..0e28cae 100644 --- a/drivers/gpu/drm/radeon/reg_srcs/evergreen +++ b/drivers/gpu/drm/radeon/reg_srcs/evergreen @@ -46,6 +46,7 @@ evergreen 0x9400 0x00008E48 SQ_EX_ALLOC_TABLE_SLOTS 0x00009100 SPI_CONFIG_CNTL 0x0000913C SPI_CONFIG_CNTL_1 +0x00009508 TA_CNTL_AUX 0x00009700 VC_CNTL 0x00009714 VC_ENHANCE 0x00009830 DB_DEBUG -- cgit v1.1 From 5fd2a84ab3c8b87176e25db1d98c5cc34043a669 Mon Sep 17 00:00:00 2001 From: "Avinash H.M" Date: Mon, 9 May 2011 12:29:40 +0000 Subject: OMAP3: set the core dpll clk rate in its set_rate function The debug l3_ick/rate is not displaying the actual rate of the clock in hardware. This is because, the core dpll set_rate function doesn't update the clk.rate. After fixing, the l3_ick/rate is displaying proper values. Signed-off-by: Shweta Gulati Signed-off-by: Avinash.H.M Cc: Rajendra Nayak Cc: Paul Wamsley Acked-by: Paul Walmsley Signed-off-by: Tony Lindgren --- arch/arm/mach-omap2/clkt34xx_dpll3m2.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm/mach-omap2/clkt34xx_dpll3m2.c b/arch/arm/mach-omap2/clkt34xx_dpll3m2.c index b2b1e37..d6e34dd 100644 --- a/arch/arm/mach-omap2/clkt34xx_dpll3m2.c +++ b/arch/arm/mach-omap2/clkt34xx_dpll3m2.c @@ -115,6 +115,7 @@ int omap3_core_dpll_m2_set_rate(struct clk *clk, unsigned long rate) sdrc_cs0->rfr_ctrl, sdrc_cs0->actim_ctrla, sdrc_cs0->actim_ctrlb, sdrc_cs0->mr, 0, 0, 0, 0); + clk->rate = rate; return 0; } -- cgit v1.1 From d9a5ac9ef306eb5cc874f285185a15c303c50009 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Fri, 13 May 2011 15:52:09 +0200 Subject: x86, mce, AMD: Fix leaving freed data in a list b may be added to a list, but is not removed before being freed in the case of an error. This is done in the corresponding deallocation function, so the code here has been changed to follow that. The sematic match that finds this problem is as follows: (http://coccinelle.lip6.fr/) // @@ expression E,E1,E2; identifier l; @@ *list_add(&E->l,E1); ... when != E1 when != list_del(&E->l) when != list_del_init(&E->l) when != E = E2 *kfree(E);// Signed-off-by: Julia Lawall Cc: Borislav Petkov Cc: Robert Richter Cc: Yinghai Lu Cc: Andreas Herrmann Cc: Link: http://lkml.kernel.org/r/1305294731-12127-1-git-send-email-julia@diku.dk Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce_amd.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 167f97b..bb0adad 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -509,6 +509,7 @@ recurse: out_free: if (b) { kobject_put(&b->kobj); + list_del(&b->miscj); kfree(b); } return err; -- cgit v1.1 From 5d44670facd3205212f8fe89eb422e3b5f309612 Mon Sep 17 00:00:00 2001 From: Marcus Meissner Date: Thu, 5 May 2011 10:44:11 -0700 Subject: ocfs2: Initialize data_ac (might be used uninitialized) CLANG found that there is a path that has data_ac uninitialized, this place 2917 /* This gets us the dx_root */ 2918 ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac); 2919 if (ret) { 3 Taking true branch 2920 mlog_errno(ret); 2921 goto out; 4 Control jumps to line 3168 2922 } Goes to the out: label without data_ac being initialized. Ciao, Marcus Signed-Off-By: Marcus Meissner Signed-off-by: Mark Fasheh Signed-off-by: Joel Becker --- fs/ocfs2/dir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index 9fe5b8fd..8582e3f 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -2868,7 +2868,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, bytes = blocks_wanted << sb->s_blocksize_bits; struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); struct ocfs2_inode_info *oi = OCFS2_I(dir); - struct ocfs2_alloc_context *data_ac; + struct ocfs2_alloc_context *data_ac = NULL; struct ocfs2_alloc_context *meta_ac = NULL; struct buffer_head *dirdata_bh = NULL; struct buffer_head *dx_root_bh = NULL; -- cgit v1.1 From 9a790ba1ec02bbae0933e7ebd576c0bc329e9796 Mon Sep 17 00:00:00 2001 From: Tristan Ye Date: Thu, 12 May 2011 20:47:07 +0800 Subject: ocfs2: skip existing hole when removing the last extent_rec in punching-hole codes. In the case of removing a partial extent record which covers a hole, current punching-hole logic will try to remove more than the length of whole extent record, which leads to the failure of following assert(fs/ocfs2/alloc.c): 5507 BUG_ON(cpos < le32_to_cpu(rec->e_cpos) || trunc_range > rec_range); This patch tries to skip existing hole at the last attempt of removing a partial extent record, what's more, it also adds some necessary comments for better understanding of punching-hole codes. Signed-off-by: Tristan Ye Signed-off-by: Joel Becker --- fs/ocfs2/file.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 41565ae..89659d6 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1607,6 +1607,9 @@ static void ocfs2_calc_trunc_pos(struct inode *inode, range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec); if (le32_to_cpu(rec->e_cpos) >= trunc_start) { + /* + * remove an entire extent record. + */ *trunc_cpos = le32_to_cpu(rec->e_cpos); /* * Skip holes if any. @@ -1617,7 +1620,16 @@ static void ocfs2_calc_trunc_pos(struct inode *inode, *blkno = le64_to_cpu(rec->e_blkno); *trunc_end = le32_to_cpu(rec->e_cpos); } else if (range > trunc_start) { + /* + * remove a partial extent record, which means we're + * removing the last extent record. + */ *trunc_cpos = trunc_start; + /* + * skip hole if any. + */ + if (range < *trunc_end) + *trunc_end = range; *trunc_len = *trunc_end - trunc_start; coff = trunc_start - le32_to_cpu(rec->e_cpos); *blkno = le64_to_cpu(rec->e_blkno) + -- cgit v1.1 From 4da6dc293604f55d156148b8f60b94053e3195fc Mon Sep 17 00:00:00 2001 From: Sunil Mushran Date: Wed, 4 May 2011 10:27:10 -0700 Subject: ocfs2/dlm: Use negotiated o2dlm protocol version Patch fixes a bug in the o2dlm protocol negotiation in that it is using the builtin version rather than the negotiated version during the domain join. This causes join errors when a node having kernel >= 2.6.37 joins a cluster with nodes having kernels < 2.6.37. This only affects the o2cb cluster stack. Signed-off-by: Sunil Mushran Reported-by: Jacek Stepniewski Acked-by: Mark Fasheh Signed-off-by: Joel Becker --- fs/ocfs2/dlm/dlmdomain.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 7540a49..3b179d6 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -1614,7 +1614,8 @@ static int dlm_try_to_join_domain(struct dlm_ctxt *dlm) spin_unlock(&dlm->spinlock); /* Support for global heartbeat and node info was added in 1.1 */ - if (dlm_protocol.pv_major > 1 || dlm_protocol.pv_minor > 0) { + if (dlm->dlm_locking_proto.pv_major > 1 || + dlm->dlm_locking_proto.pv_minor > 0) { status = dlm_send_nodeinfo(dlm, ctxt->yes_resp_map); if (status) { mlog_errno(status); -- cgit v1.1 From 76d9fc2954d057b19bf5d7b854df2b621b00fdec Mon Sep 17 00:00:00 2001 From: Sunil Mushran Date: Wed, 4 May 2011 10:28:00 -0700 Subject: ocfs2/cluster: Increase the live threshold for global heartbeat We have seen isolated cases (very few, I might add) of o2hb not detecting all live nodes on startup. One plausible reasoning for it is that other node had a hb io delay at the same time. The live threshold set at 2 (as low as it can be) could be increased to ameliorate the situation. But increasing the threshold directly affects mount time. Currently it takes around 5 secs to mount a volume in o2cb cluster with local heartbeat. Increasing the threshold will make mounts even slower. As the issue itself is rare, we have left things as they are for the local heartbeat mode. However we can improve the situation for global heartbeat mode as in that mode, we start the heartbeat much before the mount. This patch doubles the live threshold for the start of the first region in global heartbeat mode. Addresses internal Oracle bug#10635585. Signed-off-by: Sunil Mushran Acked-by: Mark Fasheh Signed-off-by: Joel Becker --- fs/ocfs2/cluster/heartbeat.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 6437202..1d28505 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -1690,6 +1690,7 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg, struct file *filp = NULL; struct inode *inode = NULL; ssize_t ret = -EINVAL; + int live_threshold; if (reg->hr_bdev) goto out; @@ -1766,8 +1767,18 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg, * A node is considered live after it has beat LIVE_THRESHOLD * times. We're not steady until we've given them a chance * _after_ our first read. + * The default threshold is bare minimum so as to limit the delay + * during mounts. For global heartbeat, the threshold doubled for the + * first region. */ - atomic_set(®->hr_steady_iterations, O2HB_LIVE_THRESHOLD + 1); + live_threshold = O2HB_LIVE_THRESHOLD; + if (o2hb_global_heartbeat_active()) { + spin_lock(&o2hb_live_lock); + if (o2hb_pop_count(&o2hb_region_bitmap, O2NM_MAX_REGIONS) == 1) + live_threshold <<= 1; + spin_unlock(&o2hb_live_lock); + } + atomic_set(®->hr_steady_iterations, live_threshold + 1); hb_task = kthread_run(o2hb_thread, reg, "o2hb-%s", reg->hr_item.ci_name); -- cgit v1.1 From 33c12a5436464f8d4f56d68e5e79e24a3a1f11aa Mon Sep 17 00:00:00 2001 From: Sunil Mushran Date: Wed, 4 May 2011 10:28:01 -0700 Subject: ocfs2/cluster: Heartbeat mismatch message improved If o2hb finds unexpected values in the heartbeat slot, it prints a message "ERROR: Device "dm-6": another node is heartbeating in our slot!" This message could be misleading. This patch adds two more messages to help users better diagnose the problem. Signed-off-by: Sunil Mushran Acked-by: Mark Fasheh Signed-off-by: Joel Becker --- fs/ocfs2/cluster/heartbeat.c | 48 ++++++++++++++++++++++++++++---------------- 1 file changed, 31 insertions(+), 17 deletions(-) diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 1d28505..9a3e6bb 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -539,25 +539,41 @@ static int o2hb_verify_crc(struct o2hb_region *reg, /* We want to make sure that nobody is heartbeating on top of us -- * this will help detect an invalid configuration. */ -static int o2hb_check_last_timestamp(struct o2hb_region *reg) +static void o2hb_check_last_timestamp(struct o2hb_region *reg) { - int node_num, ret; struct o2hb_disk_slot *slot; struct o2hb_disk_heartbeat_block *hb_block; + char *errstr; - node_num = o2nm_this_node(); - - ret = 1; - slot = ®->hr_slots[node_num]; + slot = ®->hr_slots[o2nm_this_node()]; /* Don't check on our 1st timestamp */ - if (slot->ds_last_time) { - hb_block = slot->ds_raw_block; + if (!slot->ds_last_time) + return; - if (le64_to_cpu(hb_block->hb_seq) != slot->ds_last_time) - ret = 0; - } + hb_block = slot->ds_raw_block; + if (le64_to_cpu(hb_block->hb_seq) == slot->ds_last_time && + le64_to_cpu(hb_block->hb_generation) == slot->ds_last_generation && + hb_block->hb_node == slot->ds_node_num) + return; - return ret; +#define ERRSTR1 "Another node is heartbeating on device" +#define ERRSTR2 "Heartbeat generation mismatch on device" +#define ERRSTR3 "Heartbeat sequence mismatch on device" + + if (hb_block->hb_node != slot->ds_node_num) + errstr = ERRSTR1; + else if (le64_to_cpu(hb_block->hb_generation) != + slot->ds_last_generation) + errstr = ERRSTR2; + else + errstr = ERRSTR3; + + mlog(ML_ERROR, "%s (%s): expected(%u:0x%llx, 0x%llx), " + "ondisk(%u:0x%llx, 0x%llx)\n", errstr, reg->hr_dev_name, + slot->ds_node_num, (unsigned long long)slot->ds_last_generation, + (unsigned long long)slot->ds_last_time, hb_block->hb_node, + (unsigned long long)le64_to_cpu(hb_block->hb_generation), + (unsigned long long)le64_to_cpu(hb_block->hb_seq)); } static inline void o2hb_prepare_block(struct o2hb_region *reg, @@ -983,9 +999,7 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) /* With an up to date view of the slots, we can check that no * other node has been improperly configured to heartbeat in * our slot. */ - if (!o2hb_check_last_timestamp(reg)) - mlog(ML_ERROR, "Device \"%s\": another node is heartbeating " - "in our slot!\n", reg->hr_dev_name); + o2hb_check_last_timestamp(reg); /* fill in the proper info for our next heartbeat */ o2hb_prepare_block(reg, reg->hr_generation); @@ -999,8 +1013,8 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) } i = -1; - while((i = find_next_bit(configured_nodes, O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) { - + while((i = find_next_bit(configured_nodes, + O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) { change |= o2hb_check_slot(reg, ®->hr_slots[i]); } -- cgit v1.1 From 10b3dd76117a327557b8cb898b41c18afd08dc86 Mon Sep 17 00:00:00 2001 From: Sunil Mushran Date: Wed, 4 May 2011 10:28:02 -0700 Subject: ocfs2: Skip mount recovery for hard-ro mounts Patch skips mount recovery for hard-ro mounts which otherwise leads to an oops. Signed-off-by: Sunil Mushran Acked-by: Mark Fasheh Signed-off-by: Joel Becker --- fs/ocfs2/journal.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index b141a44..295d564 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -1260,6 +1260,9 @@ void ocfs2_complete_mount_recovery(struct ocfs2_super *osb) { struct ocfs2_journal *journal = osb->journal; + if (ocfs2_is_hard_readonly(osb)) + return; + /* No need to queue up our truncate_log as regular cleanup will catch * that */ ocfs2_queue_recovery_completion(journal, osb->slot_num, -- cgit v1.1 From df016c665b10ae80d8db67ec8103b50c5c234e5c Mon Sep 17 00:00:00 2001 From: Sunil Mushran Date: Wed, 4 May 2011 10:28:07 -0700 Subject: ocfs2/dlm: Target node death during resource migration leads to thread spin During resource migration, if the target node were to die, the thread doing the migration spins until the target node is not removed from the domain map. This patch slows the spin by making the thread wait for the recovery to kick in. Signed-off-by: Sunil Mushran Signed-off-by: Joel Becker --- fs/ocfs2/dlm/dlmmaster.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index fede57e..84d1663 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -2574,6 +2574,9 @@ fail: res->state &= ~DLM_LOCK_RES_MIGRATING; wake = 1; spin_unlock(&res->spinlock); + if (dlm_is_host_down(ret)) + dlm_wait_for_node_death(dlm, target, + DLM_NODE_DEATH_WAIT_MAX); goto leave; } -- cgit v1.1 From 47a150edc2ae734c0f4bf50aa19499e23b9a46f8 Mon Sep 17 00:00:00 2001 From: "Serge E. Hallyn" Date: Fri, 13 May 2011 04:27:54 +0100 Subject: Cache user_ns in struct cred If !CONFIG_USERNS, have current_user_ns() defined to (&init_user_ns). Get rid of _current_user_ns. This requires nsown_capable() to be defined in capability.c rather than as static inline in capability.h, so do that. Request_key needs init_user_ns defined at current_user_ns if !CONFIG_USERNS, so forward-declare that in cred.h if !CONFIG_USERNS at current_user_ns() define. Compile-tested with and without CONFIG_USERNS. Signed-off-by: Serge E. Hallyn [ This makes a huge performance difference for acl_permission_check(), up to 30%. And that is one of the hottest kernel functions for loads that are pathname-lookup heavy. ] Signed-off-by: Linus Torvalds --- include/linux/capability.h | 13 +------------ include/linux/cred.h | 10 ++++++++-- kernel/capability.c | 12 ++++++++++++ kernel/cred.c | 12 ++++++------ 4 files changed, 27 insertions(+), 20 deletions(-) diff --git a/include/linux/capability.h b/include/linux/capability.h index 16ee8b4..d4675af 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -546,18 +546,7 @@ extern bool has_capability_noaudit(struct task_struct *t, int cap); extern bool capable(int cap); extern bool ns_capable(struct user_namespace *ns, int cap); extern bool task_ns_capable(struct task_struct *t, int cap); - -/** - * nsown_capable - Check superior capability to one's own user_ns - * @cap: The capability in question - * - * Return true if the current task has the given superior capability - * targeted at its own user namespace. - */ -static inline bool nsown_capable(int cap) -{ - return ns_capable(current_user_ns(), cap); -} +extern bool nsown_capable(int cap); /* audit system wants to get cap info from files as well */ extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps); diff --git a/include/linux/cred.h b/include/linux/cred.h index 9aeeb0b..be16b61 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -146,6 +146,7 @@ struct cred { void *security; /* subjective LSM security */ #endif struct user_struct *user; /* real user ID subscription */ + struct user_namespace *user_ns; /* cached user->user_ns */ struct group_info *group_info; /* supplementary groups for euid/fsgid */ struct rcu_head rcu; /* RCU deletion hook */ }; @@ -354,10 +355,15 @@ static inline void put_cred(const struct cred *_cred) #define current_fsgid() (current_cred_xxx(fsgid)) #define current_cap() (current_cred_xxx(cap_effective)) #define current_user() (current_cred_xxx(user)) -#define _current_user_ns() (current_cred_xxx(user)->user_ns) #define current_security() (current_cred_xxx(security)) -extern struct user_namespace *current_user_ns(void); +#ifdef CONFIG_USER_NS +#define current_user_ns() (current_cred_xxx(user_ns)) +#else +extern struct user_namespace init_user_ns; +#define current_user_ns() (&init_user_ns) +#endif + #define current_uid_gid(_uid, _gid) \ do { \ diff --git a/kernel/capability.c b/kernel/capability.c index bf0c734..32a80e0 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -399,3 +399,15 @@ bool task_ns_capable(struct task_struct *t, int cap) return ns_capable(task_cred_xxx(t, user)->user_ns, cap); } EXPORT_SYMBOL(task_ns_capable); + +/** + * nsown_capable - Check superior capability to one's own user_ns + * @cap: The capability in question + * + * Return true if the current task has the given superior capability + * targeted at its own user namespace. + */ +bool nsown_capable(int cap) +{ + return ns_capable(current_user_ns(), cap); +} diff --git a/kernel/cred.c b/kernel/cred.c index 5557b55..8093c16 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -54,6 +54,7 @@ struct cred init_cred = { .cap_effective = CAP_INIT_EFF_SET, .cap_bset = CAP_INIT_BSET, .user = INIT_USER, + .user_ns = &init_user_ns, .group_info = &init_groups, #ifdef CONFIG_KEYS .tgcred = &init_tgcred, @@ -410,6 +411,11 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) goto error_put; } + /* cache user_ns in cred. Doesn't need a refcount because it will + * stay pinned by cred->user + */ + new->user_ns = new->user->user_ns; + #ifdef CONFIG_KEYS /* new threads get their own thread keyrings if their parent already * had one */ @@ -741,12 +747,6 @@ int set_create_files_as(struct cred *new, struct inode *inode) } EXPORT_SYMBOL(set_create_files_as); -struct user_namespace *current_user_ns(void) -{ - return _current_user_ns(); -} -EXPORT_SYMBOL(current_user_ns); - #ifdef CONFIG_DEBUG_CREDENTIALS bool creds_are_invalid(const struct cred *cred) -- cgit v1.1 From 26cf46be954a2dd391d32eeaf7d07c3a953dcc5a Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 13 May 2011 11:51:01 -0700 Subject: vfs: micro-optimize acl_permission_check() It's a hot function, and we're better off not mixing types in the mask calculations. The compiler just ends up mixing 16-bit and 32-bit operations, for no good reason. So do everything in 'unsigned int' rather than mixing 'unsigned int' masking with a 'umode_t' (16-bit) mode variable. This, together with the parent commit (47a150edc2ae: "Cache user_ns in struct cred") makes acl_permission_check() much nicer. Signed-off-by: Linus Torvalds --- fs/namei.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/namei.c b/fs/namei.c index 54fc993..e3c4f11 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -179,7 +179,7 @@ EXPORT_SYMBOL(putname); static int acl_permission_check(struct inode *inode, int mask, unsigned int flags, int (*check_acl)(struct inode *inode, int mask, unsigned int flags)) { - umode_t mode = inode->i_mode; + unsigned int mode = inode->i_mode; mask &= MAY_READ | MAY_WRITE | MAY_EXEC; -- cgit v1.1 From a10e14667635dde504ed9e7ee851494c2cf2ae8e Mon Sep 17 00:00:00 2001 From: Vitalii Demianets Date: Thu, 12 May 2011 23:04:29 +0000 Subject: bonding,llc: Fix structure sizeof incompatibility for some PDUs With some combinations of arch/compiler (e.g. arm-linux-gcc) the sizeof operator on structure returns value greater than expected. In cases when the structure is used for mapping PDU fields it may lead to unexpected results (such as holes and alignment problems in skb data). __packed prevents this undesired behavior. Signed-off-by: Vitalii Demianets Signed-off-by: David S. Miller --- drivers/net/bonding/bond_3ad.h | 10 +++++----- include/net/llc_pdu.h | 8 ++++---- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/net/bonding/bond_3ad.h b/drivers/net/bonding/bond_3ad.h index b28baff..01b8a6a 100644 --- a/drivers/net/bonding/bond_3ad.h +++ b/drivers/net/bonding/bond_3ad.h @@ -39,7 +39,7 @@ typedef struct mac_addr { u8 mac_addr_value[ETH_ALEN]; -} mac_addr_t; +} __packed mac_addr_t; enum { BOND_AD_STABLE = 0, @@ -134,12 +134,12 @@ typedef struct lacpdu { u8 tlv_type_terminator; // = terminator u8 terminator_length; // = 0 u8 reserved_50[50]; // = 0 -} lacpdu_t; +} __packed lacpdu_t; typedef struct lacpdu_header { struct ethhdr hdr; struct lacpdu lacpdu; -} lacpdu_header_t; +} __packed lacpdu_header_t; // Marker Protocol Data Unit(PDU) structure(43.5.3.2 in the 802.3ad standard) typedef struct bond_marker { @@ -155,12 +155,12 @@ typedef struct bond_marker { u8 tlv_type_terminator; // = 0x00 u8 terminator_length; // = 0x00 u8 reserved_90[90]; // = 0 -} bond_marker_t; +} __packed bond_marker_t; typedef struct bond_marker_header { struct ethhdr hdr; struct bond_marker marker; -} bond_marker_header_t; +} __packed bond_marker_header_t; #pragma pack() diff --git a/include/net/llc_pdu.h b/include/net/llc_pdu.h index 75b8e29..f57e7d4 100644 --- a/include/net/llc_pdu.h +++ b/include/net/llc_pdu.h @@ -199,7 +199,7 @@ struct llc_pdu_sn { u8 ssap; u8 ctrl_1; u8 ctrl_2; -}; +} __packed; static inline struct llc_pdu_sn *llc_pdu_sn_hdr(struct sk_buff *skb) { @@ -211,7 +211,7 @@ struct llc_pdu_un { u8 dsap; u8 ssap; u8 ctrl_1; -}; +} __packed; static inline struct llc_pdu_un *llc_pdu_un_hdr(struct sk_buff *skb) { @@ -359,7 +359,7 @@ struct llc_xid_info { u8 fmt_id; /* always 0x81 for LLC */ u8 type; /* different if NULL/non-NULL LSAP */ u8 rw; /* sender receive window */ -}; +} __packed; /** * llc_pdu_init_as_xid_cmd - sets bytes 3, 4 & 5 of LLC header as XID @@ -415,7 +415,7 @@ struct llc_frmr_info { u8 curr_ssv; /* current send state variable val */ u8 curr_rsv; /* current receive state variable */ u8 ind_bits; /* indicator bits set with macro */ -}; +} __packed; extern void llc_pdu_set_cmd_rsp(struct sk_buff *skb, u8 type); extern void llc_pdu_set_pf_bit(struct sk_buff *skb, u8 bit_value); -- cgit v1.1 From 087fbc9962e10a65fb0b542ecfc116ebf6cf1735 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 13 May 2011 12:14:54 -0400 Subject: drm/i915: Revert i915.semaphore=1 default from i915 merge My Q67 / i7-2600 box has rev09 Sandy Bridge graphics. It hangs instantly when GNOME loads and it hangs so hard the reset button doesn't work. Setting i915.semaphore=0 fixes it. Semaphores were disabled in a1656b9090f7 ("drm/i915: Disable GPU semaphores by default") in 2.6.38 but were then re-enabled (by mistake?) by the merge 47ae63e0c2e5 ("Merge branch 'drm-intel-fixes' into drm-intel-next"). (It's worth noting that the offending change is i915_drv.c, which was not marked as a conflict - although a 'git show --cc' on the merge does show that neither parent had it set to 1) Signed-off-by: Andy Lutomirski Signed-off-by: Linus Torvalds --- drivers/gpu/drm/i915/i915_drv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c index c34a8dd..32d1b3e 100644 --- a/drivers/gpu/drm/i915/i915_drv.c +++ b/drivers/gpu/drm/i915/i915_drv.c @@ -49,7 +49,7 @@ module_param_named(panel_ignore_lid, i915_panel_ignore_lid, int, 0600); unsigned int i915_powersave = 1; module_param_named(powersave, i915_powersave, int, 0600); -unsigned int i915_semaphores = 1; +unsigned int i915_semaphores = 0; module_param_named(semaphores, i915_semaphores, int, 0600); unsigned int i915_enable_rc6 = 0; -- cgit v1.1 From cb68552858c64db302771469b1202ea09e696329 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Fri, 13 May 2011 16:03:24 -0400 Subject: bridge: fix forwarding of IPv6 The commit 6b1e960fdbd75dcd9bcc3ba5ff8898ff1ad30b6e bridge: Reset IPCB when entering IP stack on NF_FORWARD broke forwarding of IPV6 packets in bridge because it would call bp_parse_ip_options with an IPV6 packet. Reported-by: Noah Meyerhans Signed-off-by: Stephen Hemminger Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/bridge/br_netfilter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c index f3bc322..74ef4d4 100644 --- a/net/bridge/br_netfilter.c +++ b/net/bridge/br_netfilter.c @@ -737,7 +737,7 @@ static unsigned int br_nf_forward_ip(unsigned int hook, struct sk_buff *skb, nf_bridge->mask |= BRNF_PKT_TYPE; } - if (br_parse_ip_options(skb)) + if (pf == PF_INET && br_parse_ip_options(skb)) return NF_DROP; /* The physdev module checks on this */ -- cgit v1.1 From 1fec70932d867416ffe620dd17005f168cc84eb5 Mon Sep 17 00:00:00 2001 From: Yehuda Sadeh Date: Fri, 13 May 2011 13:52:56 -0700 Subject: rbd: fix split bio handling The rbd driver currently splits bios when they span an object boundary. However, the blk_end_request expects the completions to roll up the results in block device order, and the split rbd/ceph ops can complete in any order. This patch adds a struct rbd_req_coll to track completion of split requests and ensures that the results are passed back up to the block layer in order. This fixes errors where the file system gets completion of a read operation that spans an object boundary before the data has actually arrived. The bug is easily reproduced with iozone with a working set larger than available RAM. Reported-by: Fyodor Ustinov Signed-off-by: Yehuda Sadeh Signed-off-by: Sage Weil --- drivers/block/rbd.c | 171 ++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 151 insertions(+), 20 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 2146cab..9712fad 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -92,6 +92,8 @@ struct rbd_client { struct list_head node; }; +struct rbd_req_coll; + /* * a single io request */ @@ -100,6 +102,24 @@ struct rbd_request { struct bio *bio; /* cloned bio */ struct page **pages; /* list of used pages */ u64 len; + int coll_index; + struct rbd_req_coll *coll; +}; + +struct rbd_req_status { + int done; + int rc; + u64 bytes; +}; + +/* + * a collection of requests + */ +struct rbd_req_coll { + int total; + int num_done; + struct kref kref; + struct rbd_req_status status[0]; }; struct rbd_snap { @@ -416,6 +436,17 @@ static void rbd_put_client(struct rbd_device *rbd_dev) rbd_dev->client = NULL; } +/* + * Destroy requests collection + */ +static void rbd_coll_release(struct kref *kref) +{ + struct rbd_req_coll *coll = + container_of(kref, struct rbd_req_coll, kref); + + dout("rbd_coll_release %p\n", coll); + kfree(coll); +} /* * Create a new header structure, translate header format from the on-disk @@ -590,6 +621,14 @@ static u64 rbd_get_segment(struct rbd_image_header *header, return len; } +static int rbd_get_num_segments(struct rbd_image_header *header, + u64 ofs, u64 len) +{ + u64 start_seg = ofs >> header->obj_order; + u64 end_seg = (ofs + len - 1) >> header->obj_order; + return end_seg - start_seg + 1; +} + /* * bio helpers */ @@ -735,6 +774,50 @@ static void rbd_destroy_ops(struct ceph_osd_req_op *ops) kfree(ops); } +static void rbd_coll_end_req_index(struct request *rq, + struct rbd_req_coll *coll, + int index, + int ret, u64 len) +{ + struct request_queue *q; + int min, max, i; + + dout("rbd_coll_end_req_index %p index %d ret %d len %lld\n", + coll, index, ret, len); + + if (!rq) + return; + + if (!coll) { + blk_end_request(rq, ret, len); + return; + } + + q = rq->q; + + spin_lock_irq(q->queue_lock); + coll->status[index].done = 1; + coll->status[index].rc = ret; + coll->status[index].bytes = len; + max = min = coll->num_done; + while (max < coll->total && coll->status[max].done) + max++; + + for (i = min; istatus[i].rc, + coll->status[i].bytes); + coll->num_done++; + kref_put(&coll->kref, rbd_coll_release); + } + spin_unlock_irq(q->queue_lock); +} + +static void rbd_coll_end_req(struct rbd_request *req, + int ret, u64 len) +{ + rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len); +} + /* * Send ceph osd request */ @@ -749,6 +832,8 @@ static int rbd_do_request(struct request *rq, int flags, struct ceph_osd_req_op *ops, int num_reply, + struct rbd_req_coll *coll, + int coll_index, void (*rbd_cb)(struct ceph_osd_request *req, struct ceph_msg *msg), struct ceph_osd_request **linger_req, @@ -763,12 +848,20 @@ static int rbd_do_request(struct request *rq, struct ceph_osd_request_head *reqhead; struct rbd_image_header *header = &dev->header; - ret = -ENOMEM; req_data = kzalloc(sizeof(*req_data), GFP_NOIO); - if (!req_data) - goto done; + if (!req_data) { + if (coll) + rbd_coll_end_req_index(rq, coll, coll_index, + -ENOMEM, len); + return -ENOMEM; + } + + if (coll) { + req_data->coll = coll; + req_data->coll_index = coll_index; + } - dout("rbd_do_request len=%lld ofs=%lld\n", len, ofs); + dout("rbd_do_request obj=%s ofs=%lld len=%lld\n", obj, len, ofs); down_read(&header->snap_rwsem); @@ -828,7 +921,8 @@ static int rbd_do_request(struct request *rq, ret = ceph_osdc_wait_request(&dev->client->osdc, req); if (ver) *ver = le64_to_cpu(req->r_reassert_version.version); - dout("reassert_ver=%lld\n", le64_to_cpu(req->r_reassert_version.version)); + dout("reassert_ver=%lld\n", + le64_to_cpu(req->r_reassert_version.version)); ceph_osdc_put_request(req); } return ret; @@ -837,10 +931,8 @@ done_err: bio_chain_put(req_data->bio); ceph_osdc_put_request(req); done_pages: + rbd_coll_end_req(req_data, ret, len); kfree(req_data); -done: - if (rq) - blk_end_request(rq, ret, len); return ret; } @@ -874,7 +966,7 @@ static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg) bytes = req_data->len; } - blk_end_request(req_data->rq, rc, bytes); + rbd_coll_end_req(req_data, rc, bytes); if (req_data->bio) bio_chain_put(req_data->bio); @@ -934,6 +1026,7 @@ static int rbd_req_sync_op(struct rbd_device *dev, flags, ops, 2, + NULL, 0, NULL, linger_req, ver); if (ret < 0) @@ -959,7 +1052,9 @@ static int rbd_do_op(struct request *rq, u64 snapid, int opcode, int flags, int num_reply, u64 ofs, u64 len, - struct bio *bio) + struct bio *bio, + struct rbd_req_coll *coll, + int coll_index) { char *seg_name; u64 seg_ofs; @@ -995,6 +1090,7 @@ static int rbd_do_op(struct request *rq, flags, ops, num_reply, + coll, coll_index, rbd_req_cb, 0, NULL); rbd_destroy_ops(ops); @@ -1010,13 +1106,15 @@ static int rbd_req_write(struct request *rq, struct rbd_device *rbd_dev, struct ceph_snap_context *snapc, u64 ofs, u64 len, - struct bio *bio) + struct bio *bio, + struct rbd_req_coll *coll, + int coll_index) { return rbd_do_op(rq, rbd_dev, snapc, CEPH_NOSNAP, CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, 2, - ofs, len, bio); + ofs, len, bio, coll, coll_index); } /* @@ -1026,14 +1124,16 @@ static int rbd_req_read(struct request *rq, struct rbd_device *rbd_dev, u64 snapid, u64 ofs, u64 len, - struct bio *bio) + struct bio *bio, + struct rbd_req_coll *coll, + int coll_index) { return rbd_do_op(rq, rbd_dev, NULL, (snapid ? snapid : CEPH_NOSNAP), CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, 2, - ofs, len, bio); + ofs, len, bio, coll, coll_index); } /* @@ -1081,6 +1181,7 @@ static int rbd_req_sync_notify_ack(struct rbd_device *dev, CEPH_OSD_FLAG_READ, ops, 1, + NULL, 0, rbd_simple_req_cb, 0, NULL); rbd_destroy_ops(ops); @@ -1278,6 +1379,20 @@ static int rbd_req_sync_exec(struct rbd_device *dev, return ret; } +static struct rbd_req_coll *rbd_alloc_coll(int num_reqs) +{ + struct rbd_req_coll *coll = + kzalloc(sizeof(struct rbd_req_coll) + + sizeof(struct rbd_req_status) * num_reqs, + GFP_ATOMIC); + + if (!coll) + return NULL; + coll->total = num_reqs; + kref_init(&coll->kref); + return coll; +} + /* * block device queue callback */ @@ -1295,6 +1410,8 @@ static void rbd_rq_fn(struct request_queue *q) bool do_write; int size, op_size = 0; u64 ofs; + int num_segs, cur_seg = 0; + struct rbd_req_coll *coll; /* peek at request from block layer */ if (!rq) @@ -1325,6 +1442,14 @@ static void rbd_rq_fn(struct request_queue *q) do_write ? "write" : "read", size, blk_rq_pos(rq) * 512ULL); + num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size); + coll = rbd_alloc_coll(num_segs); + if (!coll) { + spin_lock_irq(q->queue_lock); + __blk_end_request_all(rq, -ENOMEM); + goto next; + } + do { /* a bio clone to be passed down to OSD req */ dout("rq->bio->bi_vcnt=%d\n", rq->bio->bi_vcnt); @@ -1332,35 +1457,41 @@ static void rbd_rq_fn(struct request_queue *q) rbd_dev->header.block_name, ofs, size, NULL, NULL); + kref_get(&coll->kref); bio = bio_chain_clone(&rq_bio, &next_bio, &bp, op_size, GFP_ATOMIC); if (!bio) { - spin_lock_irq(q->queue_lock); - __blk_end_request_all(rq, -ENOMEM); - goto next; + rbd_coll_end_req_index(rq, coll, cur_seg, + -ENOMEM, op_size); + goto next_seg; } + /* init OSD command: write or read */ if (do_write) rbd_req_write(rq, rbd_dev, rbd_dev->header.snapc, ofs, - op_size, bio); + op_size, bio, + coll, cur_seg); else rbd_req_read(rq, rbd_dev, cur_snap_id(rbd_dev), ofs, - op_size, bio); + op_size, bio, + coll, cur_seg); +next_seg: size -= op_size; ofs += op_size; + cur_seg++; rq_bio = next_bio; } while (size > 0); + kref_put(&coll->kref, rbd_coll_release); if (bp) bio_pair_release(bp); - spin_lock_irq(q->queue_lock); next: rq = blk_fetch_request(q); -- cgit v1.1 From 82a3242e11d9e63c8195be46c954efaefee35e22 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 12 May 2011 16:01:02 -0700 Subject: sysfs: remove "last sysfs file:" line from the oops messages On some arches (x86, sh, arm, unicore, powerpc) the oops message would print out the last sysfs file accessed. This was very useful in finding a number of sysfs and driver core bugs in the 2.5 and early 2.6 development days, but it has been a number of years since this file has actually helped in debugging anything that couldn't also be trivially determined from the stack traceback. So it's time to delete the line. This is good as we need all the space we can get for oops messages at times on consoles. Acked-by: Phil Carmody Acked-by: Ingo Molnar Cc: Andrew Morton Cc: Thomas Gleixner Signed-off-by: Greg Kroah-Hartman --- arch/arm/kernel/traps.c | 1 - arch/powerpc/kernel/traps.c | 1 - arch/sh/kernel/traps_32.c | 1 - arch/unicore32/kernel/traps.c | 1 - arch/x86/kernel/dumpstack.c | 1 - fs/sysfs/file.c | 12 ------------ include/linux/sysfs.h | 5 ----- 7 files changed, 22 deletions(-) diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c index 3b54ad1..d52eec2 100644 --- a/arch/arm/kernel/traps.c +++ b/arch/arm/kernel/traps.c @@ -234,7 +234,6 @@ static int __die(const char *str, int err, struct thread_info *thread, struct pt printk(KERN_EMERG "Internal error: %s: %x [#%d]" S_PREEMPT S_SMP "\n", str, err, ++die_counter); - sysfs_printk_last_file(); /* trap and error numbers are mostly meaningless on ARM */ ret = notify_die(DIE_OOPS, str, regs, err, tsk->thread.trap_no, SIGSEGV); diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index 5ddb801..d782cd7 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -143,7 +143,6 @@ int die(const char *str, struct pt_regs *regs, long err) #endif printk("%s\n", ppc_md.name ? ppc_md.name : ""); - sysfs_printk_last_file(); if (notify_die(DIE_OOPS, str, regs, err, 255, SIGSEGV) == NOTIFY_STOP) return 1; diff --git a/arch/sh/kernel/traps_32.c b/arch/sh/kernel/traps_32.c index 3484c2f..b51a171 100644 --- a/arch/sh/kernel/traps_32.c +++ b/arch/sh/kernel/traps_32.c @@ -87,7 +87,6 @@ void die(const char * str, struct pt_regs * regs, long err) bust_spinlocks(1); printk("%s: %04lx [#%d]\n", str, err & 0xffff, ++die_counter); - sysfs_printk_last_file(); print_modules(); show_regs(regs); diff --git a/arch/unicore32/kernel/traps.c b/arch/unicore32/kernel/traps.c index 254e36f..b9a2646 100644 --- a/arch/unicore32/kernel/traps.c +++ b/arch/unicore32/kernel/traps.c @@ -192,7 +192,6 @@ static int __die(const char *str, int err, struct thread_info *thread, printk(KERN_EMERG "Internal error: %s: %x [#%d]\n", str, err, ++die_counter); - sysfs_printk_last_file(); /* trap and error numbers are mostly meaningless on UniCore */ ret = notify_die(DIE_OOPS, str, regs, err, tsk->thread.trap_no, \ diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index e2a3f06..f72e719 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -279,7 +279,6 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err) printk("DEBUG_PAGEALLOC"); #endif printk("\n"); - sysfs_printk_last_file(); if (notify_die(DIE_OOPS, str, regs, err, current->thread.trap_no, SIGSEGV) == NOTIFY_STOP) return 1; diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index da3fefe..1ad8c93 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -24,13 +24,6 @@ #include "sysfs.h" -/* used in crash dumps to help with debugging */ -static char last_sysfs_file[PATH_MAX]; -void sysfs_printk_last_file(void) -{ - printk(KERN_EMERG "last sysfs file: %s\n", last_sysfs_file); -} - /* * There's one sysfs_buffer for each open file and one * sysfs_open_dirent for each sysfs_dirent with one or more open @@ -337,11 +330,6 @@ static int sysfs_open_file(struct inode *inode, struct file *file) struct sysfs_buffer *buffer; const struct sysfs_ops *ops; int error = -EACCES; - char *p; - - p = d_path(&file->f_path, last_sysfs_file, sizeof(last_sysfs_file)); - if (!IS_ERR(p)) - memmove(last_sysfs_file, p, strlen(p) + 1); /* need attr_sd for attr and ops, its parent for kobj */ if (!sysfs_get_active(attr_sd)) diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h index 30b8815..c3acda6 100644 --- a/include/linux/sysfs.h +++ b/include/linux/sysfs.h @@ -176,7 +176,6 @@ struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd, const unsigned char *name); struct sysfs_dirent *sysfs_get(struct sysfs_dirent *sd); void sysfs_put(struct sysfs_dirent *sd); -void sysfs_printk_last_file(void); /* Called to clear a ns tag when it is no longer valid */ void sysfs_exit_ns(enum kobj_ns_type type, const void *tag); @@ -348,10 +347,6 @@ static inline int __must_check sysfs_init(void) return 0; } -static inline void sysfs_printk_last_file(void) -{ -} - #endif /* CONFIG_SYSFS */ #endif /* _SYSFS_H_ */ -- cgit v1.1 From c42d2237143fcf35cff642cefe2bcf7786aae312 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Thu, 12 May 2011 16:50:07 -0700 Subject: debugfs: Silence DEBUG_STRICT_USER_COPY_CHECKS=y warning Enabling DEBUG_STRICT_USER_COPY_CHECKS causes the following warning: In file included from arch/x86/include/asm/uaccess.h:573, from include/linux/uaccess.h:5, from include/linux/highmem.h:7, from include/linux/pagemap.h:10, from fs/debugfs/file.c:18: In function 'copy_from_user', inlined from 'write_file_bool' at fs/debugfs/file.c:435: arch/x86/include/asm/uaccess_64.h:65: warning: call to 'copy_from_user_overflow' declared with attribute warning: copy_from_user() buffer size is not provably correct presumably due to buf_size being signed causing GCC to fail to see that buf_size can't become negative. Signed-off-by: Stephen Boyd Signed-off-by: Greg Kroah-Hartman --- fs/debugfs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index 568304d..90f7657 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -428,7 +428,7 @@ static ssize_t write_file_bool(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { char buf[32]; - int buf_size; + size_t buf_size; bool bv; u32 *val = file->private_data; -- cgit v1.1 From f550806a7fbca06b487238442546aceb7ecbb0c9 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 15 Feb 2011 22:34:49 -0800 Subject: alpha: convert to clocksource_register_hz Converts alpha to use clocksource_register_hz. Signed-off-by: John Stultz CC: Richard Henderson CC: Ivan Kokshaysky CC: Thomas Gleixner Signed-off-by: Matt Turner --- arch/alpha/kernel/time.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/alpha/kernel/time.c b/arch/alpha/kernel/time.c index 918e8e0..818e74e 100644 --- a/arch/alpha/kernel/time.c +++ b/arch/alpha/kernel/time.c @@ -375,8 +375,7 @@ static struct clocksource clocksource_rpcc = { static inline void register_rpcc_clocksource(long cycle_freq) { - clocksource_calc_mult_shift(&clocksource_rpcc, cycle_freq, 4); - clocksource_register(&clocksource_rpcc); + clocksource_register_hz(&clocksource_rpcc, cycle_freq); } #else /* !CONFIG_SMP */ static inline void register_rpcc_clocksource(long cycle_freq) -- cgit v1.1 From 90b57f35164aa715dcc7d939a88780a23231f84e Mon Sep 17 00:00:00 2001 From: Michael Cree Date: Wed, 4 May 2011 08:14:50 +0000 Subject: alpha: Wire up syscalls new to 2.6.39 Wire up the syscalls: name_to_handle_at open_by_handle_at clock_adjtime syncfs and adjust some whitespace in the neighbourhood to align commments. Signed-off-by: Michael Cree Signed-off-by: Matt Turner --- arch/alpha/include/asm/unistd.h | 6 +++++- arch/alpha/kernel/systbls.S | 12 ++++++++---- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/arch/alpha/include/asm/unistd.h b/arch/alpha/include/asm/unistd.h index 058937b..b183416 100644 --- a/arch/alpha/include/asm/unistd.h +++ b/arch/alpha/include/asm/unistd.h @@ -452,10 +452,14 @@ #define __NR_fanotify_init 494 #define __NR_fanotify_mark 495 #define __NR_prlimit64 496 +#define __NR_name_to_handle_at 497 +#define __NR_open_by_handle_at 498 +#define __NR_clock_adjtime 499 +#define __NR_syncfs 500 #ifdef __KERNEL__ -#define NR_SYSCALLS 497 +#define NR_SYSCALLS 501 #define __ARCH_WANT_IPC_PARSE_VERSION #define __ARCH_WANT_OLD_READDIR diff --git a/arch/alpha/kernel/systbls.S b/arch/alpha/kernel/systbls.S index a6a1de9..15f999d 100644 --- a/arch/alpha/kernel/systbls.S +++ b/arch/alpha/kernel/systbls.S @@ -498,23 +498,27 @@ sys_call_table: .quad sys_ni_syscall /* sys_timerfd */ .quad sys_eventfd .quad sys_recvmmsg - .quad sys_fallocate /* 480 */ + .quad sys_fallocate /* 480 */ .quad sys_timerfd_create .quad sys_timerfd_settime .quad sys_timerfd_gettime .quad sys_signalfd4 - .quad sys_eventfd2 /* 485 */ + .quad sys_eventfd2 /* 485 */ .quad sys_epoll_create1 .quad sys_dup3 .quad sys_pipe2 .quad sys_inotify_init1 - .quad sys_preadv /* 490 */ + .quad sys_preadv /* 490 */ .quad sys_pwritev .quad sys_rt_tgsigqueueinfo .quad sys_perf_event_open .quad sys_fanotify_init - .quad sys_fanotify_mark /* 495 */ + .quad sys_fanotify_mark /* 495 */ .quad sys_prlimit64 + .quad sys_name_to_handle_at + .quad sys_open_by_handle_at + .quad sys_clock_adjtime + .quad sys_syncfs /* 500 */ .size sys_call_table, . - sys_call_table .type sys_call_table, @object -- cgit v1.1 From 89c0b8e2520e12d69dafc663dfbd39f8180438ea Mon Sep 17 00:00:00 2001 From: Russell King Date: Sun, 8 May 2011 18:47:58 +0100 Subject: clocksource: add common i8253 PIT clocksource This is based upon both arch/arm/mach-footbridge/isa-timer.c and arch/x86/kernel/i8253.c. Acked-by: John Stultz Acked-by: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Ralf Baechle Signed-off-by: Russell King --- drivers/Kconfig | 3 ++ drivers/clocksource/Kconfig | 2 + drivers/clocksource/Makefile | 1 + drivers/clocksource/i8253.c | 88 ++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 94 insertions(+) create mode 100644 drivers/clocksource/Kconfig create mode 100644 drivers/clocksource/i8253.c diff --git a/drivers/Kconfig b/drivers/Kconfig index 177c7d1..557a469 100644 --- a/drivers/Kconfig +++ b/drivers/Kconfig @@ -119,4 +119,7 @@ source "drivers/platform/Kconfig" source "drivers/clk/Kconfig" source "drivers/hwspinlock/Kconfig" + +source "drivers/clocksource/Kconfig" + endmenu diff --git a/drivers/clocksource/Kconfig b/drivers/clocksource/Kconfig new file mode 100644 index 0000000..110aeeb --- /dev/null +++ b/drivers/clocksource/Kconfig @@ -0,0 +1,2 @@ +config CLKSRC_I8253 + bool diff --git a/drivers/clocksource/Makefile b/drivers/clocksource/Makefile index be61ece..cfb6383 100644 --- a/drivers/clocksource/Makefile +++ b/drivers/clocksource/Makefile @@ -6,3 +6,4 @@ obj-$(CONFIG_CS5535_CLOCK_EVENT_SRC) += cs5535-clockevt.o obj-$(CONFIG_SH_TIMER_CMT) += sh_cmt.o obj-$(CONFIG_SH_TIMER_MTU2) += sh_mtu2.o obj-$(CONFIG_SH_TIMER_TMU) += sh_tmu.o +obj-$(CONFIG_CLKSRC_I8253) += i8253.o diff --git a/drivers/clocksource/i8253.c b/drivers/clocksource/i8253.c new file mode 100644 index 0000000..225c176 --- /dev/null +++ b/drivers/clocksource/i8253.c @@ -0,0 +1,88 @@ +/* + * i8253 PIT clocksource + */ +#include +#include +#include +#include +#include + +#include + +/* + * Since the PIT overflows every tick, its not very useful + * to just read by itself. So use jiffies to emulate a free + * running counter: + */ +static cycle_t i8253_read(struct clocksource *cs) +{ + static int old_count; + static u32 old_jifs; + unsigned long flags; + int count; + u32 jifs; + + raw_spin_lock_irqsave(&i8253_lock, flags); + /* + * Although our caller may have the read side of xtime_lock, + * this is now a seqlock, and we are cheating in this routine + * by having side effects on state that we cannot undo if + * there is a collision on the seqlock and our caller has to + * retry. (Namely, old_jifs and old_count.) So we must treat + * jiffies as volatile despite the lock. We read jiffies + * before latching the timer count to guarantee that although + * the jiffies value might be older than the count (that is, + * the counter may underflow between the last point where + * jiffies was incremented and the point where we latch the + * count), it cannot be newer. + */ + jifs = jiffies; + outb_pit(0x00, PIT_MODE); /* latch the count ASAP */ + count = inb_pit(PIT_CH0); /* read the latched count */ + count |= inb_pit(PIT_CH0) << 8; + + /* VIA686a test code... reset the latch if count > max + 1 */ + if (count > LATCH) { + outb_pit(0x34, PIT_MODE); + outb_pit(PIT_LATCH & 0xff, PIT_CH0); + outb_pit(PIT_LATCH >> 8, PIT_CH0); + count = PIT_LATCH - 1; + } + + /* + * It's possible for count to appear to go the wrong way for a + * couple of reasons: + * + * 1. The timer counter underflows, but we haven't handled the + * resulting interrupt and incremented jiffies yet. + * 2. Hardware problem with the timer, not giving us continuous time, + * the counter does small "jumps" upwards on some Pentium systems, + * (see c't 95/10 page 335 for Neptun bug.) + * + * Previous attempts to handle these cases intelligently were + * buggy, so we just do the simple thing now. + */ + if (count > old_count && jifs == old_jifs) + count = old_count; + + old_count = count; + old_jifs = jifs; + + raw_spin_unlock_irqrestore(&i8253_lock, flags); + + count = (PIT_LATCH - 1) - count; + + return (cycle_t)(jifs * PIT_LATCH) + count; +} + +static struct clocksource i8253_cs = { + .name = "pit", + .rating = 110, + .read = i8253_read, + .mask = CLOCKSOURCE_MASK(32), +}; + +int __init clocksource_i8253_init(void) +{ + return clocksource_register_hz(&i8253_cs, PIT_TICK_RATE); +} -- cgit v1.1 From 8c414ff3f4dcdde228c6a668385218290d73a265 Mon Sep 17 00:00:00 2001 From: Russell King Date: Sun, 8 May 2011 18:50:20 +0100 Subject: clocksource: convert footbridge to generic i8253 clocksource Convert the footbridge isa-timer code to use generic i8253 clocksource. Acked-by: John Stultz Acked-by: Thomas Gleixner Signed-off-by: Russell King --- arch/arm/include/asm/i8253.h | 15 ++++++++++++ arch/arm/mach-footbridge/Kconfig | 2 ++ arch/arm/mach-footbridge/isa-timer.c | 45 ++++-------------------------------- include/linux/clocksource.h | 2 ++ 4 files changed, 23 insertions(+), 41 deletions(-) create mode 100644 arch/arm/include/asm/i8253.h diff --git a/arch/arm/include/asm/i8253.h b/arch/arm/include/asm/i8253.h new file mode 100644 index 0000000..70656b6 --- /dev/null +++ b/arch/arm/include/asm/i8253.h @@ -0,0 +1,15 @@ +#ifndef __ASMARM_I8253_H +#define __ASMARM_I8253_H + +/* i8253A PIT registers */ +#define PIT_MODE 0x43 +#define PIT_CH0 0x40 + +#define PIT_LATCH ((PIT_TICK_RATE + HZ / 2) / HZ) + +extern raw_spinlock_t i8253_lock; + +#define outb_pit outb_p +#define inb_pit inb_p + +#endif diff --git a/arch/arm/mach-footbridge/Kconfig b/arch/arm/mach-footbridge/Kconfig index bdd2579..46adca0 100644 --- a/arch/arm/mach-footbridge/Kconfig +++ b/arch/arm/mach-footbridge/Kconfig @@ -4,6 +4,7 @@ menu "Footbridge Implementations" config ARCH_CATS bool "CATS" + select CLKSRC_I8253 select FOOTBRIDGE_HOST select ISA select ISA_DMA @@ -59,6 +60,7 @@ config ARCH_EBSA285_HOST config ARCH_NETWINDER bool "NetWinder" + select CLKSRC_I8253 select FOOTBRIDGE_HOST select ISA select ISA_DMA diff --git a/arch/arm/mach-footbridge/isa-timer.c b/arch/arm/mach-footbridge/isa-timer.c index 441c6ce..7020f1a 100644 --- a/arch/arm/mach-footbridge/isa-timer.c +++ b/arch/arm/mach-footbridge/isa-timer.c @@ -10,53 +10,16 @@ #include #include #include +#include #include #include - +#include #include #include "common.h" -#define PIT_MODE 0x43 -#define PIT_CH0 0x40 - -#define PIT_LATCH ((PIT_TICK_RATE + HZ / 2) / HZ) - -static cycle_t pit_read(struct clocksource *cs) -{ - unsigned long flags; - static int old_count; - static u32 old_jifs; - int count; - u32 jifs; - - raw_local_irq_save(flags); - - jifs = jiffies; - outb_p(0x00, PIT_MODE); /* latch the count */ - count = inb_p(PIT_CH0); /* read the latched count */ - count |= inb_p(PIT_CH0) << 8; - - if (count > old_count && jifs == old_jifs) - count = old_count; - - old_count = count; - old_jifs = jifs; - - raw_local_irq_restore(flags); - - count = (PIT_LATCH - 1) - count; - - return (cycle_t)(jifs * PIT_LATCH) + count; -} - -static struct clocksource pit_cs = { - .name = "pit", - .rating = 110, - .read = pit_read, - .mask = CLOCKSOURCE_MASK(32), -}; +DEFINE_RAW_SPINLOCK(i8253_lock); static void pit_set_mode(enum clock_event_mode mode, struct clock_event_device *evt) @@ -121,7 +84,7 @@ static void __init isa_timer_init(void) pit_ce.max_delta_ns = clockevent_delta2ns(0x7fff, &pit_ce); pit_ce.min_delta_ns = clockevent_delta2ns(0x000f, &pit_ce); - clocksource_register_hz(&pit_cs, PIT_TICK_RATE); + clocksource_i8253_init(); setup_irq(pit_ce.irq, &pit_timer_irq); clockevents_register_device(&pit_ce); diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index c37b21a..f13469b 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -341,4 +341,6 @@ static inline void update_vsyscall_tz(void) extern void timekeeping_notify(struct clocksource *clock); +extern int clocksource_i8253_init(void); + #endif /* _LINUX_CLOCKSOURCE_H */ -- cgit v1.1 From 82491451dd25a3abe8496ddbd04ddb3f77d285c2 Mon Sep 17 00:00:00 2001 From: Russell King Date: Sun, 8 May 2011 18:55:19 +0100 Subject: clocksource: convert x86 to generic i8253 clocksource Convert x86 i8253 clocksource code to use generic i8253 clocksource. Acked-by: John Stultz Acked-by: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Ingo Molnar Signed-off-by: Russell King --- arch/x86/Kconfig | 1 + arch/x86/include/asm/i8253.h | 2 ++ arch/x86/kernel/i8253.c | 79 +------------------------------------------- 3 files changed, 4 insertions(+), 78 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index cc6c53a..01115d5 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -8,6 +8,7 @@ config 64BIT config X86_32 def_bool !64BIT + select CLKSRC_I8253 config X86_64 def_bool 64BIT diff --git a/arch/x86/include/asm/i8253.h b/arch/x86/include/asm/i8253.h index fc1f579..65aaa91 100644 --- a/arch/x86/include/asm/i8253.h +++ b/arch/x86/include/asm/i8253.h @@ -6,6 +6,8 @@ #define PIT_CH0 0x40 #define PIT_CH2 0x42 +#define PIT_LATCH LATCH + extern raw_spinlock_t i8253_lock; extern struct clock_event_device *global_clock_event; diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c index 2dfd315..b904dfb 100644 --- a/arch/x86/kernel/i8253.c +++ b/arch/x86/kernel/i8253.c @@ -117,81 +117,6 @@ void __init setup_pit_timer(void) } #ifndef CONFIG_X86_64 -/* - * Since the PIT overflows every tick, its not very useful - * to just read by itself. So use jiffies to emulate a free - * running counter: - */ -static cycle_t pit_read(struct clocksource *cs) -{ - static int old_count; - static u32 old_jifs; - unsigned long flags; - int count; - u32 jifs; - - raw_spin_lock_irqsave(&i8253_lock, flags); - /* - * Although our caller may have the read side of xtime_lock, - * this is now a seqlock, and we are cheating in this routine - * by having side effects on state that we cannot undo if - * there is a collision on the seqlock and our caller has to - * retry. (Namely, old_jifs and old_count.) So we must treat - * jiffies as volatile despite the lock. We read jiffies - * before latching the timer count to guarantee that although - * the jiffies value might be older than the count (that is, - * the counter may underflow between the last point where - * jiffies was incremented and the point where we latch the - * count), it cannot be newer. - */ - jifs = jiffies; - outb_pit(0x00, PIT_MODE); /* latch the count ASAP */ - count = inb_pit(PIT_CH0); /* read the latched count */ - count |= inb_pit(PIT_CH0) << 8; - - /* VIA686a test code... reset the latch if count > max + 1 */ - if (count > LATCH) { - outb_pit(0x34, PIT_MODE); - outb_pit(LATCH & 0xff, PIT_CH0); - outb_pit(LATCH >> 8, PIT_CH0); - count = LATCH - 1; - } - - /* - * It's possible for count to appear to go the wrong way for a - * couple of reasons: - * - * 1. The timer counter underflows, but we haven't handled the - * resulting interrupt and incremented jiffies yet. - * 2. Hardware problem with the timer, not giving us continuous time, - * the counter does small "jumps" upwards on some Pentium systems, - * (see c't 95/10 page 335 for Neptun bug.) - * - * Previous attempts to handle these cases intelligently were - * buggy, so we just do the simple thing now. - */ - if (count > old_count && jifs == old_jifs) - count = old_count; - - old_count = count; - old_jifs = jifs; - - raw_spin_unlock_irqrestore(&i8253_lock, flags); - - count = (LATCH - 1) - count; - - return (cycle_t)(jifs * LATCH) + count; -} - -static struct clocksource pit_cs = { - .name = "pit", - .rating = 110, - .read = pit_read, - .mask = CLOCKSOURCE_MASK(32), - .mult = 0, - .shift = 20, -}; - static int __init init_pit_clocksource(void) { /* @@ -205,9 +130,7 @@ static int __init init_pit_clocksource(void) pit_ce.mode != CLOCK_EVT_MODE_PERIODIC) return 0; - pit_cs.mult = clocksource_hz2mult(CLOCK_TICK_RATE, pit_cs.shift); - - return clocksource_register(&pit_cs); + return clocksource_i8253_init(); } arch_initcall(init_pit_clocksource); -- cgit v1.1 From 798778b8653f64b7b2162ac70eca10367cff6ce8 Mon Sep 17 00:00:00 2001 From: Russell King Date: Sun, 8 May 2011 19:03:03 +0100 Subject: clocksource: convert mips to generic i8253 clocksource Convert MIPS i8253 clocksource code to use generic i8253 clocksource. Acked-by: John Stultz Acked-by: Thomas Gleixner Cc: Ralf Baechle Signed-off-by: Russell King --- arch/mips/Kconfig | 1 + arch/mips/include/asm/i8253.h | 5 +++ arch/mips/kernel/i8253.c | 78 +------------------------------------------ 3 files changed, 7 insertions(+), 77 deletions(-) diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 8e256cc..f7f6419 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -2339,6 +2339,7 @@ config MMU config I8253 bool + select CLKSRC_I8253 select MIPS_EXTERNAL_TIMER config ZONE_DMA32 diff --git a/arch/mips/include/asm/i8253.h b/arch/mips/include/asm/i8253.h index 48bb823..9ad0113 100644 --- a/arch/mips/include/asm/i8253.h +++ b/arch/mips/include/asm/i8253.h @@ -12,8 +12,13 @@ #define PIT_CH0 0x40 #define PIT_CH2 0x42 +#define PIT_LATCH LATCH + extern raw_spinlock_t i8253_lock; extern void setup_pit_timer(void); +#define inb_pit inb_p +#define outb_pit outb_p + #endif /* __ASM_I8253_H */ diff --git a/arch/mips/kernel/i8253.c b/arch/mips/kernel/i8253.c index 2392a7a2..391221b 100644 --- a/arch/mips/kernel/i8253.c +++ b/arch/mips/kernel/i8253.c @@ -125,87 +125,11 @@ void __init setup_pit_timer(void) setup_irq(0, &irq0); } -/* - * Since the PIT overflows every tick, its not very useful - * to just read by itself. So use jiffies to emulate a free - * running counter: - */ -static cycle_t pit_read(struct clocksource *cs) -{ - unsigned long flags; - int count; - u32 jifs; - static int old_count; - static u32 old_jifs; - - raw_spin_lock_irqsave(&i8253_lock, flags); - /* - * Although our caller may have the read side of xtime_lock, - * this is now a seqlock, and we are cheating in this routine - * by having side effects on state that we cannot undo if - * there is a collision on the seqlock and our caller has to - * retry. (Namely, old_jifs and old_count.) So we must treat - * jiffies as volatile despite the lock. We read jiffies - * before latching the timer count to guarantee that although - * the jiffies value might be older than the count (that is, - * the counter may underflow between the last point where - * jiffies was incremented and the point where we latch the - * count), it cannot be newer. - */ - jifs = jiffies; - outb_p(0x00, PIT_MODE); /* latch the count ASAP */ - count = inb_p(PIT_CH0); /* read the latched count */ - count |= inb_p(PIT_CH0) << 8; - - /* VIA686a test code... reset the latch if count > max + 1 */ - if (count > LATCH) { - outb_p(0x34, PIT_MODE); - outb_p(LATCH & 0xff, PIT_CH0); - outb(LATCH >> 8, PIT_CH0); - count = LATCH - 1; - } - - /* - * It's possible for count to appear to go the wrong way for a - * couple of reasons: - * - * 1. The timer counter underflows, but we haven't handled the - * resulting interrupt and incremented jiffies yet. - * 2. Hardware problem with the timer, not giving us continuous time, - * the counter does small "jumps" upwards on some Pentium systems, - * (see c't 95/10 page 335 for Neptun bug.) - * - * Previous attempts to handle these cases intelligently were - * buggy, so we just do the simple thing now. - */ - if (count > old_count && jifs == old_jifs) { - count = old_count; - } - old_count = count; - old_jifs = jifs; - - raw_spin_unlock_irqrestore(&i8253_lock, flags); - - count = (LATCH - 1) - count; - - return (cycle_t)(jifs * LATCH) + count; -} - -static struct clocksource clocksource_pit = { - .name = "pit", - .rating = 110, - .read = pit_read, - .mask = CLOCKSOURCE_MASK(32), - .mult = 0, - .shift = 20, -}; - static int __init init_pit_clocksource(void) { if (num_possible_cpus() > 1) /* PIT does not scale! */ return 0; - clocksource_pit.mult = clocksource_hz2mult(CLOCK_TICK_RATE, 20); - return clocksource_register(&clocksource_pit); + return clocksource_i8253_init(); } arch_initcall(init_pit_clocksource); -- cgit v1.1 From 712f3147aee0fbbbbed2da20b21b272c5505125e Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 13 May 2011 16:16:41 -0700 Subject: fbmem: fix remove_conflicting_framebuffers races MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a register_framebuffer() call results in us removing old conflicting framebuffers, the new registration_lock doesn't protect that situation. And we can't just add the same locking to the function, because these functions call each other: register_framebuffer() calls remove_conflicting_framebuffers, which in turn calls unregister_framebuffer for any conflicting entry. In order to fix it, this just creates wrapper functions around all three functions and makes the versions that actually do the work be called "do_xxx()", leaving just the wrapper that gets the lock and calls the worker function. So the rule becomes simply that "do_xxxx()" has to be called with the lock held, and now do_register_framebuffer() can just call do_remove_conflicting_framebuffers(), and that in turn can call _do_unregister_framebuffer(), and there is no deadlock, and we can hold the registration lock over the whole sequence, fixing the races. It also makes error cases simpler, and fixes one situation where we would return from unregister_framebuffer() without releasing the lock, pointed out by Bruno Prémont. Tested-by: Bruno Prémont Tested-by: Anca Emanuel Signed-off-by: Linus Torvalds --- drivers/video/fbmem.c | 117 +++++++++++++++++++++++++++++--------------------- 1 file changed, 68 insertions(+), 49 deletions(-) diff --git a/drivers/video/fbmem.c b/drivers/video/fbmem.c index ea16e65..46ee5e5a 100644 --- a/drivers/video/fbmem.c +++ b/drivers/video/fbmem.c @@ -1537,8 +1537,10 @@ static bool fb_do_apertures_overlap(struct apertures_struct *gena, return false; } +static int do_unregister_framebuffer(struct fb_info *fb_info); + #define VGA_FB_PHYS 0xA0000 -void remove_conflicting_framebuffers(struct apertures_struct *a, +static void do_remove_conflicting_framebuffers(struct apertures_struct *a, const char *name, bool primary) { int i; @@ -1560,24 +1562,12 @@ void remove_conflicting_framebuffers(struct apertures_struct *a, printk(KERN_INFO "fb: conflicting fb hw usage " "%s vs %s - removing generic driver\n", name, registered_fb[i]->fix.id); - unregister_framebuffer(registered_fb[i]); + do_unregister_framebuffer(registered_fb[i]); } } } -EXPORT_SYMBOL(remove_conflicting_framebuffers); - -/** - * register_framebuffer - registers a frame buffer device - * @fb_info: frame buffer info structure - * - * Registers a frame buffer device @fb_info. - * - * Returns negative errno on error, or zero for success. - * - */ -int -register_framebuffer(struct fb_info *fb_info) +static int do_register_framebuffer(struct fb_info *fb_info) { int i; struct fb_event event; @@ -1589,10 +1579,9 @@ register_framebuffer(struct fb_info *fb_info) if (fb_check_foreignness(fb_info)) return -ENOSYS; - remove_conflicting_framebuffers(fb_info->apertures, fb_info->fix.id, + do_remove_conflicting_framebuffers(fb_info->apertures, fb_info->fix.id, fb_is_primary_device(fb_info)); - mutex_lock(®istration_lock); num_registered_fb++; for (i = 0 ; i < FB_MAX; i++) if (!registered_fb[i]) @@ -1635,7 +1624,6 @@ register_framebuffer(struct fb_info *fb_info) fb_var_to_videomode(&mode, &fb_info->var); fb_add_videomode(&mode, &fb_info->modelist); registered_fb[i] = fb_info; - mutex_unlock(®istration_lock); event.info = fb_info; if (!lock_fb_info(fb_info)) @@ -1645,37 +1633,14 @@ register_framebuffer(struct fb_info *fb_info) return 0; } - -/** - * unregister_framebuffer - releases a frame buffer device - * @fb_info: frame buffer info structure - * - * Unregisters a frame buffer device @fb_info. - * - * Returns negative errno on error, or zero for success. - * - * This function will also notify the framebuffer console - * to release the driver. - * - * This is meant to be called within a driver's module_exit() - * function. If this is called outside module_exit(), ensure - * that the driver implements fb_open() and fb_release() to - * check that no processes are using the device. - */ - -int -unregister_framebuffer(struct fb_info *fb_info) +static int do_unregister_framebuffer(struct fb_info *fb_info) { struct fb_event event; int i, ret = 0; - mutex_lock(®istration_lock); i = fb_info->node; - if (!registered_fb[i]) { - ret = -EINVAL; - goto done; - } - + if (!registered_fb[i]) + return -EINVAL; if (!lock_fb_info(fb_info)) return -ENODEV; @@ -1683,10 +1648,8 @@ unregister_framebuffer(struct fb_info *fb_info) ret = fb_notifier_call_chain(FB_EVENT_FB_UNBIND, &event); unlock_fb_info(fb_info); - if (ret) { - ret = -EINVAL; - goto done; - } + if (ret) + return -EINVAL; if (fb_info->pixmap.addr && (fb_info->pixmap.flags & FB_PIXMAP_DEFAULT)) @@ -1701,8 +1664,64 @@ unregister_framebuffer(struct fb_info *fb_info) /* this may free fb info */ put_fb_info(fb_info); -done: + return 0; +} + +void remove_conflicting_framebuffers(struct apertures_struct *a, + const char *name, bool primary) +{ + mutex_lock(®istration_lock); + do_remove_conflicting_framebuffers(a, name, primary); mutex_unlock(®istration_lock); +} +EXPORT_SYMBOL(remove_conflicting_framebuffers); + +/** + * register_framebuffer - registers a frame buffer device + * @fb_info: frame buffer info structure + * + * Registers a frame buffer device @fb_info. + * + * Returns negative errno on error, or zero for success. + * + */ +int +register_framebuffer(struct fb_info *fb_info) +{ + int ret; + + mutex_lock(®istration_lock); + ret = do_register_framebuffer(fb_info); + mutex_unlock(®istration_lock); + + return ret; +} + +/** + * unregister_framebuffer - releases a frame buffer device + * @fb_info: frame buffer info structure + * + * Unregisters a frame buffer device @fb_info. + * + * Returns negative errno on error, or zero for success. + * + * This function will also notify the framebuffer console + * to release the driver. + * + * This is meant to be called within a driver's module_exit() + * function. If this is called outside module_exit(), ensure + * that the driver implements fb_open() and fb_release() to + * check that no processes are using the device. + */ +int +unregister_framebuffer(struct fb_info *fb_info) +{ + int ret; + + mutex_lock(®istration_lock); + ret = do_unregister_framebuffer(fb_info); + mutex_unlock(®istration_lock); + return ret; } -- cgit v1.1 From c590cece75728a85ea06801df3ebad2d7ad8612c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bruno=20Pr=C3=A9mont?= Date: Sat, 14 May 2011 12:24:15 +0200 Subject: Further fbcon sanity checking MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This moves the if (num_registered_fb == FB_MAX) return -ENXIO; check _AFTER_ the call to do_remove_conflicting_framebuffers() as this would (now in a safe way) allow a native driver to replace the conflicting one even if all slots in registered_fb[] are taken. This also prevents unregistering a framebuffer that is no longer registered (vga16f will unregister at module unload time even if the frame buffer had been unregistered earlier due to being found conflicting). Signed-off-by: Bruno Prémont Signed-off-by: Linus Torvalds --- drivers/video/fbmem.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/video/fbmem.c b/drivers/video/fbmem.c index 46ee5e5a..5aac00e 100644 --- a/drivers/video/fbmem.c +++ b/drivers/video/fbmem.c @@ -1573,15 +1573,15 @@ static int do_register_framebuffer(struct fb_info *fb_info) struct fb_event event; struct fb_videomode mode; - if (num_registered_fb == FB_MAX) - return -ENXIO; - if (fb_check_foreignness(fb_info)) return -ENOSYS; do_remove_conflicting_framebuffers(fb_info->apertures, fb_info->fix.id, fb_is_primary_device(fb_info)); + if (num_registered_fb == FB_MAX) + return -ENXIO; + num_registered_fb++; for (i = 0 ; i < FB_MAX; i++) if (!registered_fb[i]) @@ -1639,7 +1639,7 @@ static int do_unregister_framebuffer(struct fb_info *fb_info) int i, ret = 0; i = fb_info->node; - if (!registered_fb[i]) + if (i < 0 || i >= FB_MAX || registered_fb[i] != fb_info) return -EINVAL; if (!lock_fb_info(fb_info)) -- cgit v1.1 From 22fe9446e82f1fe4b59900db4599061384efb0ad Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 14 May 2011 12:28:04 +0200 Subject: Revert "libata: ahci_start_engine compliant to AHCI spec" This reverts commit 270dac35c26433d06a89150c51e75ca0181ca7e4. The commits causes command timeouts on AC plug/unplug. It isn't yet clear why. As the commit was for a single rather obscure controller, revert the change for now. The problem was reported and bisected by Gu Rui in bug#34692. https://bugzilla.kernel.org/show_bug.cgi?id=34692 Also, reported by Rafael and Michael in the following thread. http://thread.gmane.org/gmane.linux.kernel/1138771 Signed-off-by: Tejun Heo Reported-by: Gu Rui Reported-by: Rafael J. Wysocki Reported-by: Michael Leun Cc: Jian Peng Cc: Jeff Garzik Signed-off-by: Linus Torvalds --- drivers/ata/libahci.c | 21 --------------------- 1 file changed, 21 deletions(-) diff --git a/drivers/ata/libahci.c b/drivers/ata/libahci.c index ff9d832..d38c40f 100644 --- a/drivers/ata/libahci.c +++ b/drivers/ata/libahci.c @@ -561,27 +561,6 @@ void ahci_start_engine(struct ata_port *ap) { void __iomem *port_mmio = ahci_port_base(ap); u32 tmp; - u8 status; - - status = readl(port_mmio + PORT_TFDATA) & 0xFF; - - /* - * At end of section 10.1 of AHCI spec (rev 1.3), it states - * Software shall not set PxCMD.ST to 1 until it is determined - * that a functoinal device is present on the port as determined by - * PxTFD.STS.BSY=0, PxTFD.STS.DRQ=0 and PxSSTS.DET=3h - * - * Even though most AHCI host controllers work without this check, - * specific controller will fail under this condition - */ - if (status & (ATA_BUSY | ATA_DRQ)) - return; - else { - ahci_scr_read(&ap->link, SCR_STATUS, &tmp); - - if ((tmp & 0xf) != 0x3) - return; - } /* start DMA */ tmp = readl(port_mmio + PORT_CMD); -- cgit v1.1 From 5f6f12ccf3aa42cfc0c5bde9228df0c843dd63f7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 9 May 2011 16:04:11 +0200 Subject: libata: fix oops when LPM is used with PMP ae01b2493c (libata: Implement ATA_FLAG_NO_DIPM and apply it to mcp65) added ATA_FLAG_NO_DIPM and made ata_eh_set_lpm() check the flag. However, @ap is NULL if @link points to a PMP link and thus the unconditional @ap->flags dereference leads to the following oops. BUG: unable to handle kernel NULL pointer dereference at 0000000000000018 IP: [] ata_eh_recover+0x9a1/0x1510 ... Pid: 295, comm: scsi_eh_4 Tainted: P 2.6.38.5-core2 #1 System76, Inc. Serval Professional/Serval Professional RIP: 0010:[] [] ata_eh_recover+0x9a1/0x1510 RSP: 0018:ffff880132defbf0 EFLAGS: 00010246 RAX: 0000000000000000 RBX: ffff880132f40000 RCX: 0000000000000000 RDX: ffff88013377c000 RSI: ffff880132f40000 RDI: 0000000000000000 RBP: ffff880132defce0 R08: ffff88013377dc58 R09: ffff880132defd98 R10: 0000000000000000 R11: 00000000ffffffff R12: 0000000000000000 R13: 0000000000000000 R14: ffff88013377c000 R15: 0000000000000000 FS: 0000000000000000(0000) GS:ffff8800bf700000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 0000000000000018 CR3: 0000000001a03000 CR4: 00000000000406e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process scsi_eh_4 (pid: 295, threadinfo ffff880132dee000, task ffff880133b416c0) Stack: 0000000000000000 ffff880132defcc0 0000000000000000 ffff880132f42738 ffffffff813ee8f0 ffffffff813eefe0 ffff880132defd98 ffff88013377f190 ffffffffa00b3e30 ffffffff813ef030 0000000032defc60 ffff880100000000 Call Trace: [] sata_pmp_error_handler+0x607/0xc30 [] ahci_error_handler+0x1f/0x70 [libahci] [] ata_scsi_error+0x5be/0x900 [] scsi_error_handler+0x124/0x650 [] kthread+0x96/0xa0 [] kernel_thread_helper+0x4/0x10 Code: 8b 95 70 ff ff ff b8 00 00 00 00 48 3b 9a 10 2e 00 00 48 0f 44 c2 48 89 85 70 ff ff ff 48 8b 8d 70 ff ff ff f6 83 69 02 00 00 01 <48> 8b 41 18 0f 85 48 01 00 00 48 85 c9 74 12 48 8b 51 08 48 83 RIP [] ata_eh_recover+0x9a1/0x1510 RSP CR2: 0000000000000018 Fix it by testing @link->ap->flags instead. stable: ATA_FLAG_NO_DIPM was added during 2.6.39 cycle but was backported to 2.6.37 and 38. This is a fix for that and thus also applicable to 2.6.37 and 38. Signed-off-by: Tejun Heo Reported-by: "Nathan A. Mourey II" LKML-Reference: <1304555277.2059.2.camel@localhost.localdomain> Cc: Connor H Cc: stable@kernel.org Signed-off-by: Jeff Garzik --- drivers/ata/libata-eh.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c index f26f2fe..dad9fd6 100644 --- a/drivers/ata/libata-eh.c +++ b/drivers/ata/libata-eh.c @@ -3316,7 +3316,7 @@ static int ata_eh_set_lpm(struct ata_link *link, enum ata_lpm_policy policy, struct ata_eh_context *ehc = &link->eh_context; struct ata_device *dev, *link_dev = NULL, *lpm_dev = NULL; enum ata_lpm_policy old_policy = link->lpm_policy; - bool no_dipm = ap->flags & ATA_FLAG_NO_DIPM; + bool no_dipm = link->ap->flags & ATA_FLAG_NO_DIPM; unsigned int hints = ATA_LPM_EMPTY | ATA_LPM_HIPM; unsigned int err_mask; int rc; -- cgit v1.1 From 05bf86b4ccfd0f197da61c67bd372111d15a6620 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 14 May 2011 12:06:42 -0700 Subject: tmpfs: fix race between swapoff and writepage Shame on me! Commit b1dea800ac39 "tmpfs: fix race between umount and writepage" fixed the advertized race, but introduced another: as even its comment makes clear, we cannot safely rely on a peek at list_empty() while holding no lock - until info->swapped is set, shmem_unuse_inode() may delete any formerly-swapped inode from the shmem_swaplist, which in this case would leave a swap area impossible to swapoff. Although I don't relish taking the mutex every time, I don't care much for the alternatives either; and at least the peek at list_empty() in shmem_evict_inode() (a hotter path since most inodes would never have been swapped) remains safe, because we already truncated the whole file. Signed-off-by: Hugh Dickins Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- mm/shmem.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 9e755c1..dfc7069 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1037,7 +1037,6 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) struct address_space *mapping; unsigned long index; struct inode *inode; - bool unlock_mutex = false; BUG_ON(!PageLocked(page)); mapping = page->mapping; @@ -1072,15 +1071,14 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) * we've taken the spinlock, because shmem_unuse_inode() will * prune a !swapped inode from the swaplist under both locks. */ - if (swap.val && list_empty(&info->swaplist)) { + if (swap.val) { mutex_lock(&shmem_swaplist_mutex); - /* move instead of add in case we're racing */ - list_move_tail(&info->swaplist, &shmem_swaplist); - unlock_mutex = true; + if (list_empty(&info->swaplist)) + list_add_tail(&info->swaplist, &shmem_swaplist); } spin_lock(&info->lock); - if (unlock_mutex) + if (swap.val) mutex_unlock(&shmem_swaplist_mutex); if (index >= info->next_index) { -- cgit v1.1 From f5de93914983bf04b92a786d1d205286fc53b49b Mon Sep 17 00:00:00 2001 From: Daniel J Blueman Date: Tue, 3 May 2011 16:44:13 +0000 Subject: Prevent oopsing in posix_acl_valid() If posix_acl_from_xattr() returns an error code, a negative address is dereferenced causing an oops; fix by checking for error code first. Signed-off-by: Daniel J Blueman Reviewed-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/acl.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index a892bc2..827be9a 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -178,12 +178,13 @@ static int btrfs_xattr_acl_set(struct dentry *dentry, const char *name, if (value) { acl = posix_acl_from_xattr(value, size); + if (IS_ERR(acl)) + return PTR_ERR(acl); + if (acl) { ret = posix_acl_valid(acl); if (ret) goto out; - } else if (IS_ERR(acl)) { - return PTR_ERR(acl); } } -- cgit v1.1 From 1aba86d67f340a8001d67183ec32e8a62e3ec658 Mon Sep 17 00:00:00 2001 From: liubo Date: Fri, 8 Apr 2011 08:44:37 +0000 Subject: Btrfs: fix easily get into ENOSPC in mixed case When a btrfs disk is created by mixed data & metadata option, it will have no pure data or pure metadata space info. In btrfs's for-linus branch, commit 78b1ea13838039cd88afdd62519b40b344d6c920 (Btrfs: fix OOPS of empty filesystem after balance) initializes space infos at the very beginning. The problem is this initialization does not take the mixed case into account, which will cause btrfs will easily get into ENOSPC in mixed case. Signed-off-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 37 ++++++++++++++++++++++++++----------- 1 file changed, 26 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index cd52f7f..9ee6bd5 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -8856,23 +8856,38 @@ out: int btrfs_init_space_info(struct btrfs_fs_info *fs_info) { struct btrfs_space_info *space_info; + struct btrfs_super_block *disk_super; + u64 features; + u64 flags; + int mixed = 0; int ret; - ret = update_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM, 0, 0, - &space_info); - if (ret) - return ret; + disk_super = &fs_info->super_copy; + if (!btrfs_super_root(disk_super)) + return 1; - ret = update_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA, 0, 0, - &space_info); - if (ret) - return ret; + features = btrfs_super_incompat_flags(disk_super); + if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) + mixed = 1; - ret = update_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA, 0, 0, - &space_info); + flags = BTRFS_BLOCK_GROUP_SYSTEM; + ret = update_space_info(fs_info, flags, 0, 0, &space_info); if (ret) - return ret; + goto out; + if (mixed) { + flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA; + ret = update_space_info(fs_info, flags, 0, 0, &space_info); + } else { + flags = BTRFS_BLOCK_GROUP_METADATA; + ret = update_space_info(fs_info, flags, 0, 0, &space_info); + if (ret) + goto out; + + flags = BTRFS_BLOCK_GROUP_DATA; + ret = update_space_info(fs_info, flags, 0, 0, &space_info); + } +out: return ret; } -- cgit v1.1 From e1e8fb6a1ff3f9487e03a4cbf85b81d1316068ce Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 15 Apr 2011 03:02:49 +0000 Subject: fs: remove FS_COW_FL FS_COW_FL and FS_NOCOW_FL were newly introduced to control per file COW in btrfs, but FS_NOCOW_FL is sufficient. The fact is we don't have corresponding BTRFS_INODE_COW flag. COW is default, and FS_NOCOW_FL can be used to switch off COW for a single file. If we mount btrfs with nodatacow, a newly created file will be set with the FS_NOCOW_FL flag. So to turn on COW for it, we can just clear the FS_NOCOW_FL flag. Signed-off-by: Li Zefan Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 15 ++++++--------- include/linux/fs.h | 1 - 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index f580a3a..3240dd9 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -144,16 +144,13 @@ static int check_flags(unsigned int flags) if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \ FS_NOATIME_FL | FS_NODUMP_FL | \ FS_SYNC_FL | FS_DIRSYNC_FL | \ - FS_NOCOMP_FL | FS_COMPR_FL | \ - FS_NOCOW_FL | FS_COW_FL)) + FS_NOCOMP_FL | FS_COMPR_FL | + FS_NOCOW_FL)) return -EOPNOTSUPP; if ((flags & FS_NOCOMP_FL) && (flags & FS_COMPR_FL)) return -EINVAL; - if ((flags & FS_NOCOW_FL) && (flags & FS_COW_FL)) - return -EINVAL; - return 0; } @@ -218,6 +215,10 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) ip->flags |= BTRFS_INODE_DIRSYNC; else ip->flags &= ~BTRFS_INODE_DIRSYNC; + if (flags & FS_NOCOW_FL) + ip->flags |= BTRFS_INODE_NODATACOW; + else + ip->flags &= ~BTRFS_INODE_NODATACOW; /* * The COMPRESS flag can only be changed by users, while the NOCOMPRESS @@ -231,10 +232,6 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) ip->flags |= BTRFS_INODE_COMPRESS; ip->flags &= ~BTRFS_INODE_NOCOMPRESS; } - if (flags & FS_NOCOW_FL) - ip->flags |= BTRFS_INODE_NODATACOW; - else if (flags & FS_COW_FL) - ip->flags &= ~BTRFS_INODE_NODATACOW; trans = btrfs_join_transaction(root, 1); BUG_ON(IS_ERR(trans)); diff --git a/include/linux/fs.h b/include/linux/fs.h index de9dd81..56a4141 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -365,7 +365,6 @@ struct inodes_stat_t { #define FS_EXTENT_FL 0x00080000 /* Extents */ #define FS_DIRECTIO_FL 0x00100000 /* Use direct i/o */ #define FS_NOCOW_FL 0x00800000 /* Do not cow file */ -#define FS_COW_FL 0x02000000 /* Cow file */ #define FS_RESERVED_FL 0x80000000 /* reserved for ext2 lib */ #define FS_FL_USER_VISIBLE 0x0003DFFF /* User visible flags */ -- cgit v1.1 From d0092bdda819914b8725da76a8c33eb06eb0bd21 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 15 Apr 2011 03:03:06 +0000 Subject: Btrfs: fix FS_IOC_GETFLAGS ioctl As we've added per file compression/cow support. Signed-off-by: Li Zefan Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 3240dd9..aeabf6b 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -81,6 +81,13 @@ static unsigned int btrfs_flags_to_ioctl(unsigned int flags) iflags |= FS_NOATIME_FL; if (flags & BTRFS_INODE_DIRSYNC) iflags |= FS_DIRSYNC_FL; + if (flags & BTRFS_INODE_NODATACOW) + iflags |= FS_NOCOW_FL; + + if ((flags & BTRFS_INODE_COMPRESS) && !(flags & BTRFS_INODE_NOCOMPRESS)) + iflags |= FS_COMPR_FL; + else if (flags & BTRFS_INODE_NOCOMPRESS) + iflags |= FS_NOCOMP_FL; return iflags; } -- cgit v1.1 From ebcb904dfe31644857422e3bb62e50f76fe86255 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 15 Apr 2011 03:03:17 +0000 Subject: Btrfs: fix FS_IOC_SETFLAGS ioctl Steps to reproduce the bug: - Call FS_IOC_SETLFAGS ioctl with flags=FS_COMPR_FL - Call FS_IOC_SETFLAGS ioctl with flags=0 - Call FS_IOC_GETFLAGS ioctl, and you'll see FS_COMPR_FL is still set! Signed-off-by: Li Zefan Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index aeabf6b..3e7031d 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -238,6 +238,8 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) } else if (flags & FS_COMPR_FL) { ip->flags |= BTRFS_INODE_COMPRESS; ip->flags &= ~BTRFS_INODE_NOCOMPRESS; + } else { + ip->flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS); } trans = btrfs_join_transaction(root, 1); -- cgit v1.1 From b90194181988063266f3da0b7bf3e57268c627c8 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 25 Apr 2011 16:25:20 -0300 Subject: perf tools: Honour the cpu list parameter when also monitoring a thread list The perf_evlist__create_maps was discarding the --cpu parameter when a --pid or --tid was specified, fix that. Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Mike Galbraith Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Tom Zanussi Link: http://lkml.kernel.org/r/20110426204401.GB1746@ghostprotocols.net Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evlist.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 45da8d1..1884a7c 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -348,7 +348,7 @@ int perf_evlist__create_maps(struct perf_evlist *evlist, pid_t target_pid, if (evlist->threads == NULL) return -1; - if (target_tid != -1) + if (cpu_list == NULL && target_tid != -1) evlist->cpus = cpu_map__dummy_new(); else evlist->cpus = cpu_map__new(cpu_list); -- cgit v1.1 From aece948f5ddd70d70df2f35855c706ef9a4f62e2 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Sun, 15 May 2011 09:39:00 -0300 Subject: perf evlist: Fix per thread mmap setup The PERF_EVENT_IOC_SET_OUTPUT ioctl was returning -EINVAL when using --pid when monitoring multithreaded apps, as we can only share a ring buffer for events on the same thread if not doing per cpu. Fix it by using per thread ring buffers. Tested with: [root@felicio ~]# tuna -t 26131 -CP | nl 1 thread ctxt_switches 2 pid SCHED_ rtpri affinity voluntary nonvoluntary cmd 3 26131 OTHER 0 0,1 10814276 2397830 chromium-browse 4 642 OTHER 0 0,1 14688 0 chromium-browse 5 26148 OTHER 0 0,1 713602 115479 chromium-browse 6 26149 OTHER 0 0,1 801958 2262 chromium-browse 7 26150 OTHER 0 0,1 1271128 248 chromium-browse 8 26151 OTHER 0 0,1 3 0 chromium-browse 9 27049 OTHER 0 0,1 36796 9 chromium-browse 10 618 OTHER 0 0,1 14711 0 chromium-browse 11 661 OTHER 0 0,1 14593 0 chromium-browse 12 29048 OTHER 0 0,1 28125 0 chromium-browse 13 26143 OTHER 0 0,1 2202789 781 chromium-browse [root@felicio ~]# So 11 threads under pid 26131, then: [root@felicio ~]# perf record -F 50000 --pid 26131 [root@felicio ~]# grep perf_event /proc/`pidof perf`/maps | nl 1 7fa4a2538000-7fa4a25b9000 rwxs 00000000 00:09 4064 anon_inode:[perf_event] 2 7fa4a25b9000-7fa4a263a000 rwxs 00000000 00:09 4064 anon_inode:[perf_event] 3 7fa4a263a000-7fa4a26bb000 rwxs 00000000 00:09 4064 anon_inode:[perf_event] 4 7fa4a26bb000-7fa4a273c000 rwxs 00000000 00:09 4064 anon_inode:[perf_event] 5 7fa4a273c000-7fa4a27bd000 rwxs 00000000 00:09 4064 anon_inode:[perf_event] 6 7fa4a27bd000-7fa4a283e000 rwxs 00000000 00:09 4064 anon_inode:[perf_event] 7 7fa4a283e000-7fa4a28bf000 rwxs 00000000 00:09 4064 anon_inode:[perf_event] 8 7fa4a28bf000-7fa4a2940000 rwxs 00000000 00:09 4064 anon_inode:[perf_event] 9 7fa4a2940000-7fa4a29c1000 rwxs 00000000 00:09 4064 anon_inode:[perf_event] 10 7fa4a29c1000-7fa4a2a42000 rwxs 00000000 00:09 4064 anon_inode:[perf_event] 11 7fa4a2a42000-7fa4a2ac3000 rwxs 00000000 00:09 4064 anon_inode:[perf_event] [root@felicio ~]# 11 mmaps, one per thread since we didn't specify any CPU list, so we need one mmap per thread and: [root@felicio ~]# perf record -F 50000 --pid 26131 ^M ^C[ perf record: Woken up 79 times to write data ] [ perf record: Captured and wrote 20.614 MB perf.data (~900639 samples) ] [root@felicio ~]# perf report -D | grep PERF_RECORD_SAMPLE | cut -d/ -f2 | cut -d: -f1 | sort -n | uniq -c | sort -nr | nl 1 371310 26131 2 96516 26148 3 95694 26149 4 95203 26150 5 7291 26143 6 87 27049 7 76 661 8 60 29048 9 47 618 10 43 642 [root@felicio ~]# Ok, one of the threads, 26151 was quiescent, so no samples there, but all the others are there. Then, if I specify one CPU: [root@felicio ~]# perf record -F 50000 --pid 26131 --cpu 1 ^C[ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.680 MB perf.data (~29730 samples) ] [root@felicio ~]# perf report -D | grep PERF_RECORD_SAMPLE | cut -d/ -f2 | cut -d: -f1 | sort -n | uniq -c | sort -nr | nl 1 8444 26131 2 2584 26149 3 2518 26148 4 2324 26150 5 123 26143 6 9 661 7 9 29048 [root@felicio ~]# This machine has two cores, so fewer threads appeared on the radar, and: [root@felicio ~]# grep perf_event /proc/`pidof perf`/maps | nl 1 7f484b922000-7f484b9a3000 rwxs 00000000 00:09 4064 anon_inode:[perf_event] [root@felicio ~]# Just one mmap, as now we can use just one per-cpu buffer instead of the per-thread needed in the previous case. For global profiling: [root@felicio ~]# perf record -F 50000 -a ^C[ perf record: Woken up 26 times to write data ] [ perf record: Captured and wrote 7.128 MB perf.data (~311412 samples) ] [root@felicio ~]# grep perf_event /proc/`pidof perf`/maps | nl 1 7fb49b435000-7fb49b4b6000 rwxs 00000000 00:09 4064 anon_inode:[perf_event] 2 7fb49b4b6000-7fb49b537000 rwxs 00000000 00:09 4064 anon_inode:[perf_event] [root@felicio ~]# It uses per-cpu buffers. For just one thread: [root@felicio ~]# perf record -F 50000 --tid 26148 ^C[ perf record: Woken up 2 times to write data ] [ perf record: Captured and wrote 0.330 MB perf.data (~14426 samples) ] [root@felicio ~]# perf report -D | grep PERF_RECORD_SAMPLE | cut -d/ -f2 | cut -d: -f1 | sort -n | uniq -c | sort -nr | nl 1 9969 26148 [root@felicio ~]# [root@felicio ~]# grep perf_event /proc/`pidof perf`/maps | nl 1 7f286a51b000-7f286a59c000 rwxs 00000000 00:09 4064 anon_inode:[perf_event] [root@felicio ~]# Tested-by: David Ahern Tested-by: Lin Ming Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Mike Galbraith Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Tom Zanussi Link: http://lkml.kernel.org/r/20110426204401.GB1746@ghostprotocols.net Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-record.c | 2 +- tools/perf/builtin-test.c | 2 +- tools/perf/builtin-top.c | 8 +-- tools/perf/util/evlist.c | 151 +++++++++++++++++++++++++++++++------------- tools/perf/util/evlist.h | 3 +- tools/perf/util/python.c | 2 +- 6 files changed, 115 insertions(+), 53 deletions(-) diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 4165382..0974f95 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -427,7 +427,7 @@ static void mmap_read_all(void) { int i; - for (i = 0; i < evsel_list->cpus->nr; i++) { + for (i = 0; i < evsel_list->nr_mmaps; i++) { if (evsel_list->mmap[i].base) mmap_read(&evsel_list->mmap[i]); } diff --git a/tools/perf/builtin-test.c b/tools/perf/builtin-test.c index 11e3c84..2f9a337 100644 --- a/tools/perf/builtin-test.c +++ b/tools/perf/builtin-test.c @@ -549,7 +549,7 @@ static int test__basic_mmap(void) ++foo; } - while ((event = perf_evlist__read_on_cpu(evlist, 0)) != NULL) { + while ((event = perf_evlist__mmap_read(evlist, 0)) != NULL) { struct perf_sample sample; if (event->header.type != PERF_RECORD_SAMPLE) { diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index 7e3d6e3..ebfc7cf 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -801,12 +801,12 @@ static void perf_event__process_sample(const union perf_event *event, } } -static void perf_session__mmap_read_cpu(struct perf_session *self, int cpu) +static void perf_session__mmap_read_idx(struct perf_session *self, int idx) { struct perf_sample sample; union perf_event *event; - while ((event = perf_evlist__read_on_cpu(top.evlist, cpu)) != NULL) { + while ((event = perf_evlist__mmap_read(top.evlist, idx)) != NULL) { perf_session__parse_sample(self, event, &sample); if (event->header.type == PERF_RECORD_SAMPLE) @@ -820,8 +820,8 @@ static void perf_session__mmap_read(struct perf_session *self) { int i; - for (i = 0; i < top.evlist->cpus->nr; i++) - perf_session__mmap_read_cpu(self, i); + for (i = 0; i < top.evlist->nr_mmaps; i++) + perf_session__mmap_read_idx(self, i); } static void start_counters(struct perf_evlist *evlist) diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 1884a7c..23eb22b 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -166,11 +166,11 @@ struct perf_evsel *perf_evlist__id2evsel(struct perf_evlist *evlist, u64 id) return NULL; } -union perf_event *perf_evlist__read_on_cpu(struct perf_evlist *evlist, int cpu) +union perf_event *perf_evlist__mmap_read(struct perf_evlist *evlist, int idx) { /* XXX Move this to perf.c, making it generally available */ unsigned int page_size = sysconf(_SC_PAGE_SIZE); - struct perf_mmap *md = &evlist->mmap[cpu]; + struct perf_mmap *md = &evlist->mmap[idx]; unsigned int head = perf_mmap__read_head(md); unsigned int old = md->prev; unsigned char *data = md->base + page_size; @@ -235,31 +235,37 @@ union perf_event *perf_evlist__read_on_cpu(struct perf_evlist *evlist, int cpu) void perf_evlist__munmap(struct perf_evlist *evlist) { - int cpu; + int i; - for (cpu = 0; cpu < evlist->cpus->nr; cpu++) { - if (evlist->mmap[cpu].base != NULL) { - munmap(evlist->mmap[cpu].base, evlist->mmap_len); - evlist->mmap[cpu].base = NULL; + for (i = 0; i < evlist->nr_mmaps; i++) { + if (evlist->mmap[i].base != NULL) { + munmap(evlist->mmap[i].base, evlist->mmap_len); + evlist->mmap[i].base = NULL; } } + + free(evlist->mmap); + evlist->mmap = NULL; } int perf_evlist__alloc_mmap(struct perf_evlist *evlist) { - evlist->mmap = zalloc(evlist->cpus->nr * sizeof(struct perf_mmap)); + evlist->nr_mmaps = evlist->cpus->nr; + if (evlist->cpus->map[0] == -1) + evlist->nr_mmaps = evlist->threads->nr; + evlist->mmap = zalloc(evlist->nr_mmaps * sizeof(struct perf_mmap)); return evlist->mmap != NULL ? 0 : -ENOMEM; } static int __perf_evlist__mmap(struct perf_evlist *evlist, struct perf_evsel *evsel, - int cpu, int prot, int mask, int fd) + int idx, int prot, int mask, int fd) { - evlist->mmap[cpu].prev = 0; - evlist->mmap[cpu].mask = mask; - evlist->mmap[cpu].base = mmap(NULL, evlist->mmap_len, prot, + evlist->mmap[idx].prev = 0; + evlist->mmap[idx].mask = mask; + evlist->mmap[idx].base = mmap(NULL, evlist->mmap_len, prot, MAP_SHARED, fd, 0); - if (evlist->mmap[cpu].base == MAP_FAILED) { - if (evlist->cpus->map[cpu] == -1 && evsel->attr.inherit) + if (evlist->mmap[idx].base == MAP_FAILED) { + if (evlist->cpus->map[idx] == -1 && evsel->attr.inherit) ui__warning("Inherit is not allowed on per-task " "events using mmap.\n"); return -1; @@ -269,6 +275,86 @@ static int __perf_evlist__mmap(struct perf_evlist *evlist, struct perf_evsel *ev return 0; } +static int perf_evlist__mmap_per_cpu(struct perf_evlist *evlist, int prot, int mask) +{ + struct perf_evsel *evsel; + int cpu, thread; + + for (cpu = 0; cpu < evlist->cpus->nr; cpu++) { + int output = -1; + + for (thread = 0; thread < evlist->threads->nr; thread++) { + list_for_each_entry(evsel, &evlist->entries, node) { + int fd = FD(evsel, cpu, thread); + + if (output == -1) { + output = fd; + if (__perf_evlist__mmap(evlist, evsel, cpu, + prot, mask, output) < 0) + goto out_unmap; + } else { + if (ioctl(fd, PERF_EVENT_IOC_SET_OUTPUT, output) != 0) + goto out_unmap; + } + + if ((evsel->attr.read_format & PERF_FORMAT_ID) && + perf_evlist__id_add_fd(evlist, evsel, cpu, thread, fd) < 0) + goto out_unmap; + } + } + } + + return 0; + +out_unmap: + for (cpu = 0; cpu < evlist->cpus->nr; cpu++) { + if (evlist->mmap[cpu].base != NULL) { + munmap(evlist->mmap[cpu].base, evlist->mmap_len); + evlist->mmap[cpu].base = NULL; + } + } + return -1; +} + +static int perf_evlist__mmap_per_thread(struct perf_evlist *evlist, int prot, int mask) +{ + struct perf_evsel *evsel; + int thread; + + for (thread = 0; thread < evlist->threads->nr; thread++) { + int output = -1; + + list_for_each_entry(evsel, &evlist->entries, node) { + int fd = FD(evsel, 0, thread); + + if (output == -1) { + output = fd; + if (__perf_evlist__mmap(evlist, evsel, thread, + prot, mask, output) < 0) + goto out_unmap; + } else { + if (ioctl(fd, PERF_EVENT_IOC_SET_OUTPUT, output) != 0) + goto out_unmap; + } + + if ((evsel->attr.read_format & PERF_FORMAT_ID) && + perf_evlist__id_add_fd(evlist, evsel, 0, thread, fd) < 0) + goto out_unmap; + } + } + + return 0; + +out_unmap: + for (thread = 0; thread < evlist->threads->nr; thread++) { + if (evlist->mmap[thread].base != NULL) { + munmap(evlist->mmap[thread].base, evlist->mmap_len); + evlist->mmap[thread].base = NULL; + } + } + return -1; +} + /** perf_evlist__mmap - Create per cpu maps to receive events * * @evlist - list of events @@ -287,11 +373,11 @@ static int __perf_evlist__mmap(struct perf_evlist *evlist, struct perf_evsel *ev int perf_evlist__mmap(struct perf_evlist *evlist, int pages, bool overwrite) { unsigned int page_size = sysconf(_SC_PAGE_SIZE); - int mask = pages * page_size - 1, cpu; - struct perf_evsel *first_evsel, *evsel; + int mask = pages * page_size - 1; + struct perf_evsel *evsel; const struct cpu_map *cpus = evlist->cpus; const struct thread_map *threads = evlist->threads; - int thread, prot = PROT_READ | (overwrite ? 0 : PROT_WRITE); + int prot = PROT_READ | (overwrite ? 0 : PROT_WRITE); if (evlist->mmap == NULL && perf_evlist__alloc_mmap(evlist) < 0) return -ENOMEM; @@ -301,43 +387,18 @@ int perf_evlist__mmap(struct perf_evlist *evlist, int pages, bool overwrite) evlist->overwrite = overwrite; evlist->mmap_len = (pages + 1) * page_size; - first_evsel = list_entry(evlist->entries.next, struct perf_evsel, node); list_for_each_entry(evsel, &evlist->entries, node) { if ((evsel->attr.read_format & PERF_FORMAT_ID) && evsel->sample_id == NULL && perf_evsel__alloc_id(evsel, cpus->nr, threads->nr) < 0) return -ENOMEM; - - for (cpu = 0; cpu < cpus->nr; cpu++) { - for (thread = 0; thread < threads->nr; thread++) { - int fd = FD(evsel, cpu, thread); - - if (evsel->idx || thread) { - if (ioctl(fd, PERF_EVENT_IOC_SET_OUTPUT, - FD(first_evsel, cpu, 0)) != 0) - goto out_unmap; - } else if (__perf_evlist__mmap(evlist, evsel, cpu, - prot, mask, fd) < 0) - goto out_unmap; - - if ((evsel->attr.read_format & PERF_FORMAT_ID) && - perf_evlist__id_add_fd(evlist, evsel, cpu, thread, fd) < 0) - goto out_unmap; - } - } } - return 0; + if (evlist->cpus->map[0] == -1) + return perf_evlist__mmap_per_thread(evlist, prot, mask); -out_unmap: - for (cpu = 0; cpu < cpus->nr; cpu++) { - if (evlist->mmap[cpu].base != NULL) { - munmap(evlist->mmap[cpu].base, evlist->mmap_len); - evlist->mmap[cpu].base = NULL; - } - } - return -1; + return perf_evlist__mmap_per_cpu(evlist, prot, mask); } int perf_evlist__create_maps(struct perf_evlist *evlist, pid_t target_pid, diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h index 8b1cb7a..7109d7a 100644 --- a/tools/perf/util/evlist.h +++ b/tools/perf/util/evlist.h @@ -17,6 +17,7 @@ struct perf_evlist { struct hlist_head heads[PERF_EVLIST__HLIST_SIZE]; int nr_entries; int nr_fds; + int nr_mmaps; int mmap_len; bool overwrite; union perf_event event_copy; @@ -46,7 +47,7 @@ void perf_evlist__add_pollfd(struct perf_evlist *evlist, int fd); struct perf_evsel *perf_evlist__id2evsel(struct perf_evlist *evlist, u64 id); -union perf_event *perf_evlist__read_on_cpu(struct perf_evlist *self, int cpu); +union perf_event *perf_evlist__mmap_read(struct perf_evlist *self, int idx); int perf_evlist__alloc_mmap(struct perf_evlist *evlist); int perf_evlist__mmap(struct perf_evlist *evlist, int pages, bool overwrite); diff --git a/tools/perf/util/python.c b/tools/perf/util/python.c index f5e3845..99c7226 100644 --- a/tools/perf/util/python.c +++ b/tools/perf/util/python.c @@ -680,7 +680,7 @@ static PyObject *pyrf_evlist__read_on_cpu(struct pyrf_evlist *pevlist, &cpu, &sample_id_all)) return NULL; - event = perf_evlist__read_on_cpu(evlist, cpu); + event = perf_evlist__mmap_read(evlist, cpu); if (event != NULL) { struct perf_evsel *first; PyObject *pyevent = pyrf_event__new(event); -- cgit v1.1 From d8083deb4f1aa0977980dfb834fcc336ef38318f Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Fri, 13 May 2011 16:03:24 -0400 Subject: bridge: fix forwarding of IPv6 The commit 6b1e960fdbd75dcd9bcc3ba5ff8898ff1ad30b6e bridge: Reset IPCB when entering IP stack on NF_FORWARD broke forwarding of IPV6 packets in bridge because it would call bp_parse_ip_options with an IPV6 packet. Reported-by: Noah Meyerhans Signed-off-by: Stephen Hemminger Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller Signed-off-by: Pablo Neira Ayuso --- net/bridge/br_netfilter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c index f3bc322..74ef4d4 100644 --- a/net/bridge/br_netfilter.c +++ b/net/bridge/br_netfilter.c @@ -737,7 +737,7 @@ static unsigned int br_nf_forward_ip(unsigned int hook, struct sk_buff *skb, nf_bridge->mask |= BRNF_PKT_TYPE; } - if (br_parse_ip_options(skb)) + if (pf == PF_INET && br_parse_ip_options(skb)) return NF_DROP; /* The physdev module checks on this */ -- cgit v1.1 From 0f08190fe8af3cdb6ba19690eb0fa253ecef4bde Mon Sep 17 00:00:00 2001 From: Hans Schillstrom Date: Sun, 15 May 2011 17:20:29 +0200 Subject: IPVS: fix netns if reading ip_vs_* procfs entries Without this patch every access to ip_vs in procfs will increase the netns count i.e. an unbalanced get_net()/put_net(). (ipvsadm commands also use procfs.) The result is you can't exit a netns if reading ip_vs_* procfs entries. Signed-off-by: Hans Schillstrom Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipvs/ip_vs_app.c | 2 +- net/netfilter/ipvs/ip_vs_conn.c | 4 ++-- net/netfilter/ipvs/ip_vs_ctl.c | 6 +++--- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c index 51f3af7..059af31 100644 --- a/net/netfilter/ipvs/ip_vs_app.c +++ b/net/netfilter/ipvs/ip_vs_app.c @@ -572,7 +572,7 @@ static const struct file_operations ip_vs_app_fops = { .open = ip_vs_app_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release, + .release = seq_release_net, }; #endif diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index d3fd91b..bf28ac2 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -1046,7 +1046,7 @@ static const struct file_operations ip_vs_conn_fops = { .open = ip_vs_conn_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release, + .release = seq_release_net, }; static const char *ip_vs_origin_name(unsigned flags) @@ -1114,7 +1114,7 @@ static const struct file_operations ip_vs_conn_sync_fops = { .open = ip_vs_conn_sync_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release, + .release = seq_release_net, }; #endif diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index ea72281..37890f2 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -2066,7 +2066,7 @@ static const struct file_operations ip_vs_info_fops = { .open = ip_vs_info_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release_private, + .release = seq_release_net, }; #endif @@ -2109,7 +2109,7 @@ static const struct file_operations ip_vs_stats_fops = { .open = ip_vs_stats_seq_open, .read = seq_read, .llseek = seq_lseek, - .release = single_release, + .release = single_release_net, }; static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v) @@ -2178,7 +2178,7 @@ static const struct file_operations ip_vs_stats_percpu_fops = { .open = ip_vs_stats_percpu_seq_open, .read = seq_read, .llseek = seq_lseek, - .release = single_release, + .release = single_release_net, }; #endif -- cgit v1.1 From a67b8887ced9d54cab7759bdb19deafed37481eb Mon Sep 17 00:00:00 2001 From: Florian Mickler Date: Sun, 15 May 2011 16:32:50 +0200 Subject: vga_switcheroo: don't toggle-switch devices If the requested device is already active, ignore the request. This restores the original behaviour of the interface. The change was probably an unintended side effect of commit 66b37c6777c4 vga_switcheroo: split switching into two stages which did not take into account to duplicate the !active check in the split-off stage2. Fix this by factoring that check out of stage1 into the debugfs_write routine. References: https://bugzilla.kernel.org/show_bug.cgi?id=34252 Reported-by: Igor Murzov Tested-by: Igor Murzov Signed-off-by: Florian Mickler Signed-off-by: Dave Airlie --- drivers/gpu/vga/vga_switcheroo.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/vga/vga_switcheroo.c b/drivers/gpu/vga/vga_switcheroo.c index e01cacb..498b284 100644 --- a/drivers/gpu/vga/vga_switcheroo.c +++ b/drivers/gpu/vga/vga_switcheroo.c @@ -219,9 +219,6 @@ static int vga_switchto_stage1(struct vga_switcheroo_client *new_client) int i; struct vga_switcheroo_client *active = NULL; - if (new_client->active == true) - return 0; - for (i = 0; i < VGA_SWITCHEROO_MAX_CLIENTS; i++) { if (vgasr_priv.clients[i].active == true) { active = &vgasr_priv.clients[i]; @@ -372,6 +369,9 @@ vga_switcheroo_debugfs_write(struct file *filp, const char __user *ubuf, goto out; } + if (client->active == true) + goto out; + /* okay we want a switch - test if devices are willing to switch */ can_switch = true; for (i = 0; i < VGA_SWITCHEROO_MAX_CLIENTS; i++) { -- cgit v1.1 From 8eea1be174a1ea4b86323167bbadc8a6abdca613 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 13 May 2011 12:14:54 -0400 Subject: drm/i915: Revert i915.semaphore=1 default from 47ae63e0 My Q67 / i7-2600 box has rev09 Sandy Bridge graphics. It hangs instantly when GNOME loads and it hangs so hard the reset button doesn't work. Setting i915.semaphore=0 fixes it. Semaphores were disabled in a1656b9090f7008d2941c314f5a64724bea2ae37 in 2.6.38 and were re-enabled by commit 47ae63e0c2e5fdb582d471dc906eb29be94c732f Merge: c59a333 467cffb Author: Chris Wilson Date: Mon Mar 7 12:32:44 2011 +0000 Merge branch 'drm-intel-fixes' into drm-intel-next Apply the trivial conflicting regression fixes, but keep GPU semaphores enabled. Conflicts: drivers/gpu/drm/i915/i915_drv.h drivers/gpu/drm/i915/i915_gem_execbuffer.c (It's worth noting that the offending change is i915_drv.c, which is not a conflict.) Signed-off-by: Andy Lutomirski Acked-by: Keith Packard Signed-off-by: Dave Airlie --- drivers/gpu/drm/i915/i915_drv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c index c34a8dd..32d1b3e 100644 --- a/drivers/gpu/drm/i915/i915_drv.c +++ b/drivers/gpu/drm/i915/i915_drv.c @@ -49,7 +49,7 @@ module_param_named(panel_ignore_lid, i915_panel_ignore_lid, int, 0600); unsigned int i915_powersave = 1; module_param_named(powersave, i915_powersave, int, 0600); -unsigned int i915_semaphores = 1; +unsigned int i915_semaphores = 0; module_param_named(semaphores, i915_semaphores, int, 0600); unsigned int i915_enable_rc6 = 0; -- cgit v1.1 From 752d2635ebb12b6122ba05775f7d1ccfef14b275 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Fri, 22 Apr 2011 11:03:57 +0100 Subject: drm: Take lock around probes for drm_fb_helper_hotplug_event We need to hold the dev->mode_config.mutex whilst detecting the output status. But we also need to drop it for the call into drm_fb_helper_single_fb_probe(), which indirectly acquires the lock when attaching the fbcon. Failure to do so exposes a race with normal output probing. Detected by adding some warnings that the mutex is held to the backend detect routines: [ 17.772456] WARNING: at drivers/gpu/drm/i915/intel_crt.c:471 intel_crt_detect+0x3e/0x373 [i915]() [ 17.772458] Hardware name: Latitude E6400 [ 17.772460] Modules linked in: .... [ 17.772582] Pid: 11, comm: kworker/0:1 Tainted: G W 2.6.38.4-custom.2 #8 [ 17.772584] Call Trace: [ 17.772591] [] ? warn_slowpath_common+0x78/0x8c [ 17.772603] [] ? intel_crt_detect+0x3e/0x373 [i915] [ 17.772612] [] ? drm_helper_probe_single_connector_modes+0xbf/0x2af [drm_kms_helper] [ 17.772619] [] ? drm_fb_helper_probe_connector_modes+0x39/0x4d [drm_kms_helper] [ 17.772625] [] ? drm_fb_helper_hotplug_event+0xa5/0xc3 [drm_kms_helper] [ 17.772633] [] ? output_poll_execute+0x146/0x17c [drm_kms_helper] [ 17.772638] [] ? cfq_init_queue+0x247/0x345 [ 17.772644] [] ? output_poll_execute+0x0/0x17c [drm_kms_helper] [ 17.772648] [] ? process_one_work+0x193/0x28e [ 17.772652] [] ? worker_thread+0xef/0x172 [ 17.772655] [] ? worker_thread+0x0/0x172 [ 17.772658] [] ? worker_thread+0x0/0x172 [ 17.772663] [] ? kthread+0x7a/0x82 [ 17.772668] [] ? kernel_thread_helper+0x4/0x10 [ 17.772671] [] ? kthread+0x0/0x82 [ 17.772674] [] ? kernel_thread_helper+0x0/0x10 Reported-by: Frederik Himpe References: https://bugs.freedesktop.org/show_bug.cgi?id=36394 Signed-off-by: Chris Wilson Signed-off-by: Dave Airlie --- drivers/gpu/drm/drm_fb_helper.c | 26 ++++++++++++++++++++++---- include/drm/drm_fb_helper.h | 2 +- 2 files changed, 23 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/drm_fb_helper.c b/drivers/gpu/drm/drm_fb_helper.c index 11d7a72..140b952 100644 --- a/drivers/gpu/drm/drm_fb_helper.c +++ b/drivers/gpu/drm/drm_fb_helper.c @@ -1516,17 +1516,33 @@ bool drm_fb_helper_initial_config(struct drm_fb_helper *fb_helper, int bpp_sel) } EXPORT_SYMBOL(drm_fb_helper_initial_config); -bool drm_fb_helper_hotplug_event(struct drm_fb_helper *fb_helper) +/** + * drm_fb_helper_hotplug_event - respond to a hotplug notification by + * probing all the outputs attached to the fb. + * @fb_helper: the drm_fb_helper + * + * LOCKING: + * Called at runtime, must take mode config lock. + * + * Scan the connectors attached to the fb_helper and try to put together a + * setup after *notification of a change in output configuration. + * + * RETURNS: + * 0 on success and a non-zero error code otherwise. + */ +int drm_fb_helper_hotplug_event(struct drm_fb_helper *fb_helper) { + struct drm_device *dev = fb_helper->dev; int count = 0; u32 max_width, max_height, bpp_sel; bool bound = false, crtcs_bound = false; struct drm_crtc *crtc; if (!fb_helper->fb) - return false; + return 0; - list_for_each_entry(crtc, &fb_helper->dev->mode_config.crtc_list, head) { + mutex_lock(&dev->mode_config.mutex); + list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) { if (crtc->fb) crtcs_bound = true; if (crtc->fb == fb_helper->fb) @@ -1535,7 +1551,8 @@ bool drm_fb_helper_hotplug_event(struct drm_fb_helper *fb_helper) if (!bound && crtcs_bound) { fb_helper->delayed_hotplug = true; - return false; + mutex_unlock(&dev->mode_config.mutex); + return 0; } DRM_DEBUG_KMS("\n"); @@ -1546,6 +1563,7 @@ bool drm_fb_helper_hotplug_event(struct drm_fb_helper *fb_helper) count = drm_fb_helper_probe_connector_modes(fb_helper, max_width, max_height); drm_setup_crtcs(fb_helper); + mutex_unlock(&dev->mode_config.mutex); return drm_fb_helper_single_fb_probe(fb_helper, bpp_sel); } diff --git a/include/drm/drm_fb_helper.h b/include/drm/drm_fb_helper.h index ade09d7..c99c3d3 100644 --- a/include/drm/drm_fb_helper.h +++ b/include/drm/drm_fb_helper.h @@ -127,7 +127,7 @@ void drm_fb_helper_fill_fix(struct fb_info *info, uint32_t pitch, int drm_fb_helper_setcmap(struct fb_cmap *cmap, struct fb_info *info); -bool drm_fb_helper_hotplug_event(struct drm_fb_helper *fb_helper); +int drm_fb_helper_hotplug_event(struct drm_fb_helper *fb_helper); bool drm_fb_helper_initial_config(struct drm_fb_helper *fb_helper, int bpp_sel); int drm_fb_helper_single_add_all_connectors(struct drm_fb_helper *fb_helper); int drm_fb_helper_debug_enter(struct fb_info *info); -- cgit v1.1 From ec514c487c3d4b652943da7b0afbc094eee08cfa Mon Sep 17 00:00:00 2001 From: Cheng Xu Date: Sat, 14 May 2011 14:20:02 +0800 Subject: sched: Fix rt_rq runtime leakage bug This patch is to fix the real-time scheduler bug reported at: https://lkml.org/lkml/2011/4/26/13 That is, when running multiple real-time threads on every logical CPUs and then turning off one CPU, the kernel will bug at function __disable_runtime(). Function __disable_runtime() bugs and reports leakage of rt_rq runtime. The root cause is __disable_runtime() assumes it iterates through all the existing rt_rq's while walking rq->leaf_rt_rq_list, which actually contains only runnable rt_rq's. This problem also applies to __enable_runtime() and print_rt_stats(). The patch is based on above analysis, appears to fix the problem, but is only lightly tested. Reported-by: Paul E. McKenney Tested-by: Paul E. McKenney Signed-off-by: Cheng Xu Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/4DCE1F12.6040609@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index e7cebdc..f8fcf82 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -183,6 +183,14 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq) return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period); } +typedef struct task_group *rt_rq_iter_t; + +#define for_each_rt_rq(rt_rq, iter, rq) \ + for (iter = list_entry_rcu(task_groups.next, typeof(*iter), list); \ + (&iter->list != &task_groups) && \ + (rt_rq = iter->rt_rq[cpu_of(rq)]); \ + iter = list_entry_rcu(iter->list.next, typeof(*iter), list)) + static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq) { list_add_rcu(&rt_rq->leaf_rt_rq_list, @@ -288,6 +296,11 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq) return ktime_to_ns(def_rt_bandwidth.rt_period); } +typedef struct rt_rq *rt_rq_iter_t; + +#define for_each_rt_rq(rt_rq, iter, rq) \ + for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL) + static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq) { } @@ -402,12 +415,13 @@ next: static void __disable_runtime(struct rq *rq) { struct root_domain *rd = rq->rd; + rt_rq_iter_t iter; struct rt_rq *rt_rq; if (unlikely(!scheduler_running)) return; - for_each_leaf_rt_rq(rt_rq, rq) { + for_each_rt_rq(rt_rq, iter, rq) { struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); s64 want; int i; @@ -487,6 +501,7 @@ static void disable_runtime(struct rq *rq) static void __enable_runtime(struct rq *rq) { + rt_rq_iter_t iter; struct rt_rq *rt_rq; if (unlikely(!scheduler_running)) @@ -495,7 +510,7 @@ static void __enable_runtime(struct rq *rq) /* * Reset each runqueue's bandwidth settings */ - for_each_leaf_rt_rq(rt_rq, rq) { + for_each_rt_rq(rt_rq, iter, rq) { struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); raw_spin_lock(&rt_b->rt_runtime_lock); @@ -1796,10 +1811,11 @@ extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq); static void print_rt_stats(struct seq_file *m, int cpu) { + rt_rq_iter_t iter; struct rt_rq *rt_rq; rcu_read_lock(); - for_each_leaf_rt_rq(rt_rq, cpu_rq(cpu)) + for_each_rt_rq(rt_rq, iter, cpu_rq(cpu)) print_rt_rq(m, cpu, rt_rq); rcu_read_unlock(); } -- cgit v1.1 From 61eadef6a9bde9ea62fda724a9cb501ce9bc925a Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Fri, 29 Apr 2011 08:36:50 +0200 Subject: sched, rt: Update rq clock when unthrottling of an otherwise idle CPU If an RT task is awakened while it's rt_rq is throttled, the time between wakeup/enqueue and unthrottle/selection may be accounted as rt_time if the CPU is idle. Set rq->skip_clock_update negative upon throttle release to tell put_prev_task() that we need a clock update. Reported-by: Thomas Giesel Signed-off-by: Mike Galbraith Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1304059010.7472.1.camel@marge.simson.net Signed-off-by: Ingo Molnar --- kernel/sched.c | 6 +++--- kernel/sched_rt.c | 7 +++++++ 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index f9778c0..b8b9a7d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -466,7 +466,7 @@ struct rq { u64 nohz_stamp; unsigned char nohz_balance_kick; #endif - unsigned int skip_clock_update; + int skip_clock_update; /* capture load from *all* tasks on this cpu: */ struct load_weight load; @@ -652,7 +652,7 @@ static void update_rq_clock(struct rq *rq) { s64 delta; - if (rq->skip_clock_update) + if (rq->skip_clock_update > 0) return; delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; @@ -4127,7 +4127,7 @@ static inline void schedule_debug(struct task_struct *prev) static void put_prev_task(struct rq *rq, struct task_struct *prev) { - if (prev->on_rq) + if (prev->on_rq || rq->skip_clock_update < 0) update_rq_clock(rq); prev->sched_class->put_prev_task(rq, prev); } diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 19ecb31..0943ed7 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -562,6 +562,13 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) { rt_rq->rt_throttled = 0; enqueue = 1; + + /* + * Force a clock update if the CPU was idle, + * lest wakeup -> unthrottle time accumulate. + */ + if (rt_rq->rt_nr_running && rq->curr == rq->idle) + rq->skip_clock_update = -1; } if (rt_rq->rt_time || rt_rq->rt_nr_running) idle = 0; -- cgit v1.1 From db44fc017d5989302713ab4e7f9e922b648f4b59 Mon Sep 17 00:00:00 2001 From: Yong Zhang Date: Mon, 9 May 2011 22:07:05 +0800 Subject: sched: Avoid going ahead if ->cpus_allowed is not changed If cpumask_equal(&p->cpus_allowed, new_mask) is true, seems there is no reason to prevent set_cpus_allowed_ptr() return directly. Signed-off-by: Yong Zhang Acked-by: Hillf Danton Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110509140705.GA2219@zhy Signed-off-by: Ingo Molnar --- kernel/sched.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index b8b9a7d..70bec4f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5946,13 +5946,15 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) rq = task_rq_lock(p, &flags); + if (cpumask_equal(&p->cpus_allowed, new_mask)) + goto out; + if (!cpumask_intersects(new_mask, cpu_active_mask)) { ret = -EINVAL; goto out; } - if (unlikely((p->flags & PF_THREAD_BOUND) && p != current && - !cpumask_equal(&p->cpus_allowed, new_mask))) { + if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) { ret = -EINVAL; goto out; } -- cgit v1.1 From db670dac49b5423b39b5e523d28fe32045d71b10 Mon Sep 17 00:00:00 2001 From: Stephan Baerwolf Date: Wed, 11 May 2011 18:03:29 +0200 Subject: sched: Fix and optimise calculation of the weight-inverse If the inverse loadweight should be zero, function "calc_delta_mine" calculates the inverse of "lw->weight" (in 32bit integer ops). This calculation is actually a little bit impure (because it is inverting something around "lw-weight"+1), especially when "lw->weight" becomes smaller. The correct inverse would be 1/lw->weight multiplied by "WMULT_CONST" for fixcomma-scaling it into integers. (So WMULT_CONST/lw->weight ...) The old, impure algorithm took two divisions for inverting lw->weight, the new, more exact one only takes one and an additional unlikely-if. Signed-off-by: Stephan Baerwolf Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Link: http://lkml.kernel.org/n/tip-0pz0wnyalr4tk4ln11xwumdx@git.kernel.org [ This could explain some aritmetical issues for small shares but nothing concrete has been reported yet so we are not confident enough to queue this up in sched/urgent and for -stable backport. But if anyone finds this commit and sees it to fix some badness then we can certainly change our mind! ] Signed-off-by: Ingo Molnar --- kernel/sched.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 70bec4f..c62acf4 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1330,15 +1330,15 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight, { u64 tmp; + tmp = (u64)delta_exec * weight; + if (!lw->inv_weight) { if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST)) lw->inv_weight = 1; else - lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2) - / (lw->weight+1); + lw->inv_weight = WMULT_CONST / lw->weight; } - tmp = (u64)delta_exec * weight; /* * Check whether we'd overflow the 64-bit multiplication: */ -- cgit v1.1 From e503f9e4b092e2349a9477a333543de8f3c7f5d9 Mon Sep 17 00:00:00 2001 From: Youquan Song Date: Fri, 22 Apr 2011 00:22:43 +0800 Subject: x86, apic: Fix spurious error interrupts triggering on all non-boot APs This patch fixes a bug reported by a customer, who found that many unreasonable error interrupts reported on all non-boot CPUs (APs) during the system boot stage. According to Chapter 10 of Intel Software Developer Manual Volume 3A, Local APIC may signal an illegal vector error when an LVT entry is set as an illegal vector value (0~15) under FIXED delivery mode (bits 8-11 is 0), regardless of whether the mask bit is set or an interrupt actually happen. These errors are seen as error interrupts. The initial value of thermal LVT entries on all APs always reads 0x10000 because APs are woken up by BSP issuing INIT-SIPI-SIPI sequence to them and LVT registers are reset to 0s except for the mask bits which are set to 1s when APs receive INIT IPI. When the BIOS takes over the thermal throttling interrupt, the LVT thermal deliver mode should be SMI and it is required from the kernel to keep AP's LVT thermal monitoring register programmed as such as well. This issue happens when BIOS does not take over thermal throttling interrupt, AP's LVT thermal monitor register will be restored to 0x10000 which means vector 0 and fixed deliver mode, so all APs will signal illegal vector error interrupts. This patch check if interrupt delivery mode is not fixed mode before restoring AP's LVT thermal monitor register. Signed-off-by: Youquan Song Acked-by: Suresh Siddha Acked-by: Yong Wang Cc: hpa@linux.intel.com Cc: joe@perches.com Cc: jbaron@redhat.com Cc: trenn@suse.de Cc: kent.liu@intel.com Cc: chaohong.guo@intel.com Cc: # As far back as possible Link: http://lkml.kernel.org/r/1303402963-17738-1-git-send-email-youquan.song@intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/apicdef.h | 1 + arch/x86/kernel/cpu/mcheck/therm_throt.c | 12 +++++++----- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h index d87988b..34595d5 100644 --- a/arch/x86/include/asm/apicdef.h +++ b/arch/x86/include/asm/apicdef.h @@ -78,6 +78,7 @@ #define APIC_DEST_LOGICAL 0x00800 #define APIC_DEST_PHYSICAL 0x00000 #define APIC_DM_FIXED 0x00000 +#define APIC_DM_FIXED_MASK 0x00700 #define APIC_DM_LOWEST 0x00100 #define APIC_DM_SMI 0x00200 #define APIC_DM_REMRD 0x00300 diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 6f8c5e9..0f03446 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -446,18 +446,20 @@ void intel_init_thermal(struct cpuinfo_x86 *c) */ rdmsr(MSR_IA32_MISC_ENABLE, l, h); + h = lvtthmr_init; /* * The initial value of thermal LVT entries on all APs always reads * 0x10000 because APs are woken up by BSP issuing INIT-SIPI-SIPI * sequence to them and LVT registers are reset to 0s except for * the mask bits which are set to 1s when APs receive INIT IPI. - * Always restore the value that BIOS has programmed on AP based on - * BSP's info we saved since BIOS is always setting the same value - * for all threads/cores + * If BIOS takes over the thermal interrupt and sets its interrupt + * delivery mode to SMI (not fixed), it restores the value that the + * BIOS has programmed on AP based on BSP's info we saved since BIOS + * is always setting the same value for all threads/cores. */ - apic_write(APIC_LVTTHMR, lvtthmr_init); + if ((h & APIC_DM_FIXED_MASK) != APIC_DM_FIXED) + apic_write(APIC_LVTTHMR, lvtthmr_init); - h = lvtthmr_init; if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) { printk(KERN_DEBUG -- cgit v1.1 From 70087dc38cc77ca8f46059564c00338777734762 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Mon, 16 May 2011 15:24:08 +0200 Subject: blk-throttle: Use task_subsys_state() to determine a task's blkio_cgroup Currentlly we first map the task to cgroup and then cgroup to blkio_cgroup. There is a more direct way to get to blkio_cgroup from task using task_subsys_state(). Use that. The real reason for the fix is that it also avoids a race in generic cgroup code. During remount/umount rebind_subsystems() is called and it can do following with and rcu protection. cgrp->subsys[i] = NULL; That means if somebody got hold of cgroup under rcu and then it tried to do cgroup->subsys[] to get to blkio_cgroup, it would get NULL which is wrong. I was running into this race condition with ltp running on a upstream derived kernel and that lead to crash. So ideally we should also fix cgroup generic code to wait for rcu grace period before setting pointer to NULL. Li Zefan is not very keen on introducing synchronize_wait() as he thinks it will slow down moun/remount/umount operations. So for the time being atleast fix the kernel crash by taking a more direct route to blkio_cgroup. One tester had reported a crash while running LTP on a derived kernel and with this fix crash is no more seen while the test has been running for over 6 days. Signed-off-by: Vivek Goyal Reviewed-by: Li Zefan Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 7 +++++++ block/blk-cgroup.h | 3 +++ block/blk-throttle.c | 9 ++++----- block/cfq-iosched.c | 11 +++++------ 4 files changed, 19 insertions(+), 11 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index f0605ab..471fdcc 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -114,6 +114,13 @@ struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup) } EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup); +struct blkio_cgroup *task_blkio_cgroup(struct task_struct *tsk) +{ + return container_of(task_subsys_state(tsk, blkio_subsys_id), + struct blkio_cgroup, css); +} +EXPORT_SYMBOL_GPL(task_blkio_cgroup); + static inline void blkio_update_group_weight(struct blkio_group *blkg, unsigned int weight) { diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 10919fa..c774930 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -291,6 +291,7 @@ static inline void blkiocg_set_start_empty_time(struct blkio_group *blkg) {} #if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE) extern struct blkio_cgroup blkio_root_cgroup; extern struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup); +extern struct blkio_cgroup *task_blkio_cgroup(struct task_struct *tsk); extern void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, struct blkio_group *blkg, void *key, dev_t dev, enum blkio_policy_id plid); @@ -314,6 +315,8 @@ void blkiocg_update_io_remove_stats(struct blkio_group *blkg, struct cgroup; static inline struct blkio_cgroup * cgroup_to_blkio_cgroup(struct cgroup *cgroup) { return NULL; } +static inline struct blkio_cgroup * +task_blkio_cgroup(struct task_struct *tsk) { return NULL; } static inline void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, struct blkio_group *blkg, void *key, dev_t dev, diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 0475a22..252a81a 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -160,9 +160,8 @@ static void throtl_put_tg(struct throtl_grp *tg) } static struct throtl_grp * throtl_find_alloc_tg(struct throtl_data *td, - struct cgroup *cgroup) + struct blkio_cgroup *blkcg) { - struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup); struct throtl_grp *tg = NULL; void *key = td; struct backing_dev_info *bdi = &td->queue->backing_dev_info; @@ -229,12 +228,12 @@ done: static struct throtl_grp * throtl_get_tg(struct throtl_data *td) { - struct cgroup *cgroup; struct throtl_grp *tg = NULL; + struct blkio_cgroup *blkcg; rcu_read_lock(); - cgroup = task_cgroup(current, blkio_subsys_id); - tg = throtl_find_alloc_tg(td, cgroup); + blkcg = task_blkio_cgroup(current); + tg = throtl_find_alloc_tg(td, blkcg); if (!tg) tg = &td->root_tg; rcu_read_unlock(); diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 5b52011..ab7a9e6 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1014,10 +1014,9 @@ void cfq_update_blkio_group_weight(void *key, struct blkio_group *blkg, cfqg->needs_update = true; } -static struct cfq_group * -cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create) +static struct cfq_group * cfq_find_alloc_cfqg(struct cfq_data *cfqd, + struct blkio_cgroup *blkcg, int create) { - struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup); struct cfq_group *cfqg = NULL; void *key = cfqd; int i, j; @@ -1079,12 +1078,12 @@ done: */ static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create) { - struct cgroup *cgroup; + struct blkio_cgroup *blkcg; struct cfq_group *cfqg = NULL; rcu_read_lock(); - cgroup = task_cgroup(current, blkio_subsys_id); - cfqg = cfq_find_alloc_cfqg(cfqd, cgroup, create); + blkcg = task_blkio_cgroup(current); + cfqg = cfq_find_alloc_cfqg(cfqd, blkcg, create); if (!cfqg && create) cfqg = &cfqd->root_group; rcu_read_unlock(); -- cgit v1.1 From 86f315bbb2374f1f077500ad131dd9b71856e697 Mon Sep 17 00:00:00 2001 From: Chris Ball Date: Mon, 16 May 2011 11:32:26 -0400 Subject: Revert "mmc: fix a race between card-detect rescan and clock-gate work instances" This reverts commit 26fc8775b51484d8c0a671198639c6d5ae60533e, which has been reported to cause boot/resume-time crashes for some users: https://bbs.archlinux.org/viewtopic.php?id=118751. Signed-off-by: Chris Ball Cc: --- drivers/mmc/core/host.c | 9 +++++---- include/linux/mmc/host.h | 1 + 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/mmc/core/host.c b/drivers/mmc/core/host.c index 2b200c1..461e6a1 100644 --- a/drivers/mmc/core/host.c +++ b/drivers/mmc/core/host.c @@ -94,7 +94,7 @@ static void mmc_host_clk_gate_delayed(struct mmc_host *host) spin_unlock_irqrestore(&host->clk_lock, flags); return; } - mmc_claim_host(host); + mutex_lock(&host->clk_gate_mutex); spin_lock_irqsave(&host->clk_lock, flags); if (!host->clk_requests) { spin_unlock_irqrestore(&host->clk_lock, flags); @@ -104,7 +104,7 @@ static void mmc_host_clk_gate_delayed(struct mmc_host *host) pr_debug("%s: gated MCI clock\n", mmc_hostname(host)); } spin_unlock_irqrestore(&host->clk_lock, flags); - mmc_release_host(host); + mutex_unlock(&host->clk_gate_mutex); } /* @@ -130,7 +130,7 @@ void mmc_host_clk_ungate(struct mmc_host *host) { unsigned long flags; - mmc_claim_host(host); + mutex_lock(&host->clk_gate_mutex); spin_lock_irqsave(&host->clk_lock, flags); if (host->clk_gated) { spin_unlock_irqrestore(&host->clk_lock, flags); @@ -140,7 +140,7 @@ void mmc_host_clk_ungate(struct mmc_host *host) } host->clk_requests++; spin_unlock_irqrestore(&host->clk_lock, flags); - mmc_release_host(host); + mutex_unlock(&host->clk_gate_mutex); } /** @@ -215,6 +215,7 @@ static inline void mmc_host_clk_init(struct mmc_host *host) host->clk_gated = false; INIT_WORK(&host->clk_gate_work, mmc_host_clk_gate_work); spin_lock_init(&host->clk_lock); + mutex_init(&host->clk_gate_mutex); } /** diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index eb792cb..bcb793e 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -183,6 +183,7 @@ struct mmc_host { struct work_struct clk_gate_work; /* delayed clock gate */ unsigned int clk_old; /* old clock value cache */ spinlock_t clk_lock; /* lock for clk fields */ + struct mutex clk_gate_mutex; /* mutex for clock gating */ #endif /* host specific block data */ -- cgit v1.1 From 7c1bfd685bcdc822ab1d7411ea05c82bd2a7b260 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Mon, 16 May 2011 13:47:30 -0400 Subject: xen/pci: Fix compiler error when CONFIG_XEN_PRIVILEGED_GUEST is not set. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If we have CONFIG_XEN and the other parameters to build an Linux kernel that is non-privileged, the xen_[find|register|unregister]_ device_domain_owner functions should not be compiled. They should use the nops defined in arch/x86/include/asm/xen/pci.h instead. This fixes: arch/x86/pci/xen.c:496: error: redefinition of ‘xen_find_device_domain_owner’ arch/x86/include/asm/xen/pci.h:25: note: previous definition of ‘xen_find_device_domain_owner’ was here arch/x86/pci/xen.c:510: error: redefinition of ‘xen_register_device_domain_owner’ arch/x86/include/asm/xen/pci.h:29: note: previous definition of ‘xen_register_device_domain_owner’ was here arch/x86/pci/xen.c:532: error: redefinition of ‘xen_unregister_device_domain_owner’ arch/x86/include/asm/xen/pci.h:34: note: previous definition of ‘xen_unregister_device_domain_owner’ was here Signed-off-by: Konrad Rzeszutek Wilk Reported-by: Randy Dunlap --- arch/x86/pci/xen.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index 393981f..8214724 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -473,6 +473,7 @@ void __init xen_setup_pirqs(void) } #endif +#ifdef CONFIG_XEN_DOM0 struct xen_device_domain_owner { domid_t domain; struct pci_dev *dev; @@ -545,3 +546,4 @@ int xen_unregister_device_domain_owner(struct pci_dev *dev) return 0; } EXPORT_SYMBOL_GPL(xen_unregister_device_domain_owner); +#endif -- cgit v1.1 From 600b776eb39a13a28b090ba9efceb0c69d4508aa Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 16 May 2011 20:15:36 +0200 Subject: OMAP1 / PM: Use generic clock manipulation routines for runtime PM Convert OMAP1 to using the new generic clock manipulation routines and a device power domain for runtime PM instead of overriding the platform bus type's runtime PM callbacks. This allows us to simplify OMAP1-specific code and to share some code with other platforms (shmobile in particular). Signed-off-by: Rafael J. Wysocki Acked-by: Kevin Hilman --- arch/arm/mach-omap1/pm_bus.c | 69 ++++++++++++++------------------------------ 1 file changed, 22 insertions(+), 47 deletions(-) diff --git a/arch/arm/mach-omap1/pm_bus.c b/arch/arm/mach-omap1/pm_bus.c index 6588c22..fe31d93 100644 --- a/arch/arm/mach-omap1/pm_bus.c +++ b/arch/arm/mach-omap1/pm_bus.c @@ -24,75 +24,50 @@ #ifdef CONFIG_PM_RUNTIME static int omap1_pm_runtime_suspend(struct device *dev) { - struct clk *iclk, *fclk; - int ret = 0; + int ret; dev_dbg(dev, "%s\n", __func__); ret = pm_generic_runtime_suspend(dev); + if (ret) + return ret; - fclk = clk_get(dev, "fck"); - if (!IS_ERR(fclk)) { - clk_disable(fclk); - clk_put(fclk); - } - - iclk = clk_get(dev, "ick"); - if (!IS_ERR(iclk)) { - clk_disable(iclk); - clk_put(iclk); + ret = pm_runtime_clk_suspend(dev); + if (ret) { + pm_generic_runtime_resume(dev); + return ret; } return 0; -}; +} static int omap1_pm_runtime_resume(struct device *dev) { - struct clk *iclk, *fclk; - dev_dbg(dev, "%s\n", __func__); - iclk = clk_get(dev, "ick"); - if (!IS_ERR(iclk)) { - clk_enable(iclk); - clk_put(iclk); - } + pm_runtime_clk_resume(dev); + return pm_generic_runtime_resume(dev); +} - fclk = clk_get(dev, "fck"); - if (!IS_ERR(fclk)) { - clk_enable(fclk); - clk_put(fclk); - } +static struct dev_power_domain default_power_domain = { + .ops = { + .runtime_suspend = omap1_pm_runtime_suspend, + .runtime_resume = omap1_pm_runtime_resume, + USE_PLATFORM_PM_SLEEP_OPS + }, +}; - return pm_generic_runtime_resume(dev); +static struct pm_clk_notifier_block platform_bus_notifier = { + .pwr_domain = &default_power_domain, + .con_ids = { "ick", "fck", NULL, }, }; static int __init omap1_pm_runtime_init(void) { - const struct dev_pm_ops *pm; - struct dev_pm_ops *omap_pm; - if (!cpu_class_is_omap1()) return -ENODEV; - pm = platform_bus_get_pm_ops(); - if (!pm) { - pr_err("%s: unable to get dev_pm_ops from platform_bus\n", - __func__); - return -ENODEV; - } - - omap_pm = kmemdup(pm, sizeof(struct dev_pm_ops), GFP_KERNEL); - if (!omap_pm) { - pr_err("%s: unable to alloc memory for new dev_pm_ops\n", - __func__); - return -ENOMEM; - } - - omap_pm->runtime_suspend = omap1_pm_runtime_suspend; - omap_pm->runtime_resume = omap1_pm_runtime_resume; - - platform_bus_set_pm_ops(omap_pm); + pm_runtime_clk_add_notifier(&platform_bus_type, &platform_bus_notifier); return 0; } -- cgit v1.1 From 2064af917b3ba7589070064ca4ed12cecd99a63c Mon Sep 17 00:00:00 2001 From: Kevin Hilman Date: Fri, 29 Apr 2011 00:37:26 +0200 Subject: PM: Revert "driver core: platform_bus: allow runtime override of dev_pm_ops" The platform_bus_set_pm_ops() operation is deprecated in favor of the new device power domain infrastructre implemented in commit 7538e3db6e015e890825fbd9f8659952896ddd5b (PM: add support for device power domains) Signed-off-by: Kevin Hilman Signed-off-by: Rafael J. Wysocki --- drivers/base/platform.c | 35 ----------------------------------- include/linux/platform_device.h | 3 --- 2 files changed, 38 deletions(-) diff --git a/drivers/base/platform.c b/drivers/base/platform.c index 079c18a..48425f1 100644 --- a/drivers/base/platform.c +++ b/drivers/base/platform.c @@ -916,41 +916,6 @@ struct bus_type platform_bus_type = { }; EXPORT_SYMBOL_GPL(platform_bus_type); -/** - * platform_bus_get_pm_ops() - return pointer to busses dev_pm_ops - * - * This function can be used by platform code to get the current - * set of dev_pm_ops functions used by the platform_bus_type. - */ -const struct dev_pm_ops * __init platform_bus_get_pm_ops(void) -{ - return platform_bus_type.pm; -} - -/** - * platform_bus_set_pm_ops() - update dev_pm_ops for the platform_bus_type - * - * @pm: pointer to new dev_pm_ops struct to be used for platform_bus_type - * - * Platform code can override the dev_pm_ops methods of - * platform_bus_type by using this function. It is expected that - * platform code will first do a platform_bus_get_pm_ops(), then - * kmemdup it, then customize selected methods and pass a pointer to - * the new struct dev_pm_ops to this function. - * - * Since platform-specific code is customizing methods for *all* - * devices (not just platform-specific devices) it is expected that - * any custom overrides of these functions will keep existing behavior - * and simply extend it. For example, any customization of the - * runtime PM methods should continue to call the pm_generic_* - * functions as the default ones do in addition to the - * platform-specific behavior. - */ -void __init platform_bus_set_pm_ops(const struct dev_pm_ops *pm) -{ - platform_bus_type.pm = pm; -} - int __init platform_bus_init(void) { int error; diff --git a/include/linux/platform_device.h b/include/linux/platform_device.h index e0093e0..ede1a80 100644 --- a/include/linux/platform_device.h +++ b/include/linux/platform_device.h @@ -150,9 +150,6 @@ extern struct platform_device *platform_create_bundle(struct platform_driver *dr struct resource *res, unsigned int n_res, const void *data, size_t size); -extern const struct dev_pm_ops * platform_bus_get_pm_ops(void); -extern void platform_bus_set_pm_ops(const struct dev_pm_ops *pm); - /* early platform driver interface */ struct early_platform_driver { const char *class_str; -- cgit v1.1 From 72874daa5e9064c4e8d689e6a04b1e96f687f872 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 3 May 2011 19:45:32 +0200 Subject: PM: Fix build issue in clock_ops.c for CONFIG_PM_RUNTIME unset Fix a build issue in drivers/base/power/clock_ops.c occuring when CONFIG_PM_RUNTIME is not set. Signed-off-by: Rafael J. Wysocki --- drivers/base/power/clock_ops.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/base/power/clock_ops.c b/drivers/base/power/clock_ops.c index d74abf3..c0dd09d 100644 --- a/drivers/base/power/clock_ops.c +++ b/drivers/base/power/clock_ops.c @@ -380,6 +380,7 @@ static int pm_runtime_clk_notify(struct notifier_block *nb, { struct pm_clk_notifier_block *clknb; struct device *dev = data; + char *con_id; dev_dbg(dev, "%s() %ld\n", __func__, action); -- cgit v1.1 From 50e7534427283afd997d58481778c07bea79eb63 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 16 May 2011 15:39:46 +0200 Subject: x86, AMD, cacheinfo: Fix fallout caused by max3 conversion 732eacc0542d0aa48797f675888b85d6065af837 converted code around the kernel using nested max() macros to use the new max3 macro but forgot to remove the old line in intel_cacheinfo.c. Fix it. Cc: Hagen Paul Pfeifer Cc: Frank Arnold Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/1305553188-21061-2-git-send-email-bp@amd64.org Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/intel_cacheinfo.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 1ce1af28..31590a0 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -327,7 +327,6 @@ static void __cpuinit amd_calc_l3_indices(struct amd_l3_cache *l3) l3->subcaches[2] = sc2 = !(val & BIT(8)) + !(val & BIT(9)); l3->subcaches[3] = sc3 = !(val & BIT(12)) + !(val & BIT(13)); - l3->indices = (max(max(max(sc0, sc1), sc2), sc3) << 10) - 1; l3->indices = (max(max3(sc0, sc1, sc2), sc3) << 10) - 1; } -- cgit v1.1 From 42be450565b0fc4607fae3e3a7da038d367a23ed Mon Sep 17 00:00:00 2001 From: Frank Arnold Date: Mon, 16 May 2011 15:39:47 +0200 Subject: x86, AMD, cacheinfo: Fix L3 cache index disable checks We provide two slots to disable cache indices, and have a check to prevent both slots to be used for the same index. If the user disables the same index on different subcaches, both slots will hold the same index, e.g. $ echo 2047 > /sys/devices/system/cpu/cpu0/cache/index3/cache_disable_0 $ cat /sys/devices/system/cpu/cpu0/cache/index3/cache_disable_0 2047 $ echo 1050623 > /sys/devices/system/cpu/cpu0/cache/index3/cache_disable_1 $ cat /sys/devices/system/cpu/cpu0/cache/index3/cache_disable_1 2047 due to the fact that the check was looking only at index bits [11:0] and was ignoring writes to bits outside that range. The more correct fix is to simply check whether the index is within the bounds of [0..l3->indices]. While at it, cleanup comments and drop now-unused local macros. Signed-off-by: Frank Arnold Link: http://lkml.kernel.org/r/1305553188-21061-3-git-send-email-bp@amd64.org Signed-off-by: Borislav Petkov Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/intel_cacheinfo.c | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 31590a0..c105c53 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -453,27 +453,16 @@ int amd_set_l3_disable_slot(struct amd_l3_cache *l3, int cpu, unsigned slot, { int ret = 0; -#define SUBCACHE_MASK (3UL << 20) -#define SUBCACHE_INDEX 0xfff - - /* - * check whether this slot is already used or - * the index is already disabled - */ + /* check if @slot is already used or the index is already disabled */ ret = amd_get_l3_disable_slot(l3, slot); if (ret >= 0) return -EINVAL; - /* - * check whether the other slot has disabled the - * same index already - */ - if (index == amd_get_l3_disable_slot(l3, !slot)) + if (index > l3->indices) return -EINVAL; - /* do not allow writes outside of allowed bits */ - if ((index & ~(SUBCACHE_MASK | SUBCACHE_INDEX)) || - ((index & SUBCACHE_INDEX) > l3->indices)) + /* check whether the other slot has disabled the same index already */ + if (index == amd_get_l3_disable_slot(l3, !slot)) return -EINVAL; amd_l3_disable_index(l3, cpu, slot, index); -- cgit v1.1 From eecaaba5b2e4ae762b4726fae2e3b22630e137ec Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 16 May 2011 15:39:48 +0200 Subject: Documentation, ABI: Update L3 cache index disable text Change contact person to AMD kernel mailing list, update text and external references, drop "Users:" tag. Cc: Randy Dunlap Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/1305553188-21061-4-git-send-email-bp@amd64.org Acked-by: Greg Kroah-Hartman Signed-off-by: H. Peter Anvin --- Documentation/ABI/testing/sysfs-devices-system-cpu | 34 +++++++++++----------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu index 7564e88..e7be75b 100644 --- a/Documentation/ABI/testing/sysfs-devices-system-cpu +++ b/Documentation/ABI/testing/sysfs-devices-system-cpu @@ -183,21 +183,21 @@ Description: Discover and change clock speed of CPUs to learn how to control the knobs. -What: /sys/devices/system/cpu/cpu*/cache/index*/cache_disable_X -Date: August 2008 +What: /sys/devices/system/cpu/cpu*/cache/index3/cache_disable_{0,1} +Date: August 2008 KernelVersion: 2.6.27 -Contact: mark.langsdorf@amd.com -Description: These files exist in every cpu's cache index directories. - There are currently 2 cache_disable_# files in each - directory. Reading from these files on a supported - processor will return that cache disable index value - for that processor and node. Writing to one of these - files will cause the specificed cache index to be disabled. - - Currently, only AMD Family 10h Processors support cache index - disable, and only for their L3 caches. See the BIOS and - Kernel Developer's Guide at - http://support.amd.com/us/Embedded_TechDocs/31116-Public-GH-BKDG_3-28_5-28-09.pdf - for formatting information and other details on the - cache index disable. -Users: joachim.deguara@amd.com +Contact: discuss@x86-64.org +Description: Disable L3 cache indices + + These files exist in every CPU's cache/index3 directory. Each + cache_disable_{0,1} file corresponds to one disable slot which + can be used to disable a cache index. Reading from these files + on a processor with this functionality will return the currently + disabled index for that node. There is one L3 structure per + node, or per internal node on MCM machines. Writing a valid + index to one of these files will cause the specificed cache + index to be disabled. + + All AMD processors with L3 caches provide this functionality. + For details, see BKDGs at + http://developer.amd.com/documentation/guides/Pages/default.aspx -- cgit v1.1 From dd5477ff3ba978892014ea5f988cb1bf04aa505e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 6 Apr 2011 13:21:17 -0400 Subject: ftrace/trivial: Clean up recordmcount.c to use Linux style comparisons The Linux ftrace subsystem style for comparing is: var == 1 var > 0 and not: 1 == var 0 < var It is considered that Linux developers are smart enough not to do the if (var = 1) mistake. Cc: John Reiser Link: http://lkml.kernel.org/r/20110421023737.290712238@goodmis.org Signed-off-by: Steven Rostedt --- scripts/recordmcount.c | 48 ++++++++++++++++++++++++------------------------ scripts/recordmcount.h | 16 ++++++++-------- 2 files changed, 32 insertions(+), 32 deletions(-) diff --git a/scripts/recordmcount.c b/scripts/recordmcount.c index f9f6f52..37afe0e 100644 --- a/scripts/recordmcount.c +++ b/scripts/recordmcount.c @@ -78,7 +78,7 @@ static off_t ulseek(int const fd, off_t const offset, int const whence) { off_t const w = lseek(fd, offset, whence); - if ((off_t)-1 == w) { + if (w == (off_t)-1) { perror("lseek"); fail_file(); } @@ -111,7 +111,7 @@ static void * umalloc(size_t size) { void *const addr = malloc(size); - if (0 == addr) { + if (addr == 0) { fprintf(stderr, "malloc failed: %zu bytes\n", size); fail_file(); } @@ -136,7 +136,7 @@ static void *mmap_file(char const *fname) void *addr; fd_map = open(fname, O_RDWR); - if (0 > fd_map || 0 > fstat(fd_map, &sb)) { + if (fd_map < 0 || fstat(fd_map, &sb) < 0) { perror(fname); fail_file(); } @@ -147,7 +147,7 @@ static void *mmap_file(char const *fname) addr = mmap(0, sb.st_size, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd_map, 0); mmap_failed = 0; - if (MAP_FAILED == addr) { + if (addr == MAP_FAILED) { mmap_failed = 1; addr = umalloc(sb.st_size); uread(fd_map, addr, sb.st_size); @@ -206,12 +206,12 @@ static uint32_t (*w2)(uint16_t); static int is_mcounted_section_name(char const *const txtname) { - return 0 == strcmp(".text", txtname) || - 0 == strcmp(".ref.text", txtname) || - 0 == strcmp(".sched.text", txtname) || - 0 == strcmp(".spinlock.text", txtname) || - 0 == strcmp(".irqentry.text", txtname) || - 0 == strcmp(".text.unlikely", txtname); + return strcmp(".text", txtname) == 0 || + strcmp(".ref.text", txtname) == 0 || + strcmp(".sched.text", txtname) == 0 || + strcmp(".spinlock.text", txtname) == 0 || + strcmp(".irqentry.text", txtname) == 0 || + strcmp(".text.unlikely", txtname) == 0; } /* 32 bit and 64 bit are very similar */ @@ -270,7 +270,7 @@ do_file(char const *const fname) fail_file(); } break; case ELFDATA2LSB: { - if (1 != *(unsigned char const *)&endian) { + if (*(unsigned char const *)&endian != 1) { /* main() is big endian, file.o is little endian. */ w = w4rev; w2 = w2rev; @@ -278,7 +278,7 @@ do_file(char const *const fname) } } break; case ELFDATA2MSB: { - if (0 != *(unsigned char const *)&endian) { + if (*(unsigned char const *)&endian != 0) { /* main() is little endian, file.o is big endian. */ w = w4rev; w2 = w2rev; @@ -286,9 +286,9 @@ do_file(char const *const fname) } } break; } /* end switch */ - if (0 != memcmp(ELFMAG, ehdr->e_ident, SELFMAG) - || ET_REL != w2(ehdr->e_type) - || EV_CURRENT != ehdr->e_ident[EI_VERSION]) { + if (memcmp(ELFMAG, ehdr->e_ident, SELFMAG) != 0 + || w2(ehdr->e_type) != ET_REL + || ehdr->e_ident[EI_VERSION] != EV_CURRENT) { fprintf(stderr, "unrecognized ET_REL file %s\n", fname); fail_file(); } @@ -321,15 +321,15 @@ do_file(char const *const fname) fail_file(); } break; case ELFCLASS32: { - if (sizeof(Elf32_Ehdr) != w2(ehdr->e_ehsize) - || sizeof(Elf32_Shdr) != w2(ehdr->e_shentsize)) { + if (w2(ehdr->e_ehsize) != sizeof(Elf32_Ehdr) + || w2(ehdr->e_shentsize) != sizeof(Elf32_Shdr)) { fprintf(stderr, "unrecognized ET_REL file: %s\n", fname); fail_file(); } - if (EM_S390 == w2(ehdr->e_machine)) + if (w2(ehdr->e_machine) == EM_S390) reltype = R_390_32; - if (EM_MIPS == w2(ehdr->e_machine)) { + if (w2(ehdr->e_machine) == EM_MIPS) { reltype = R_MIPS_32; is_fake_mcount32 = MIPS32_is_fake_mcount; } @@ -337,15 +337,15 @@ do_file(char const *const fname) } break; case ELFCLASS64: { Elf64_Ehdr *const ghdr = (Elf64_Ehdr *)ehdr; - if (sizeof(Elf64_Ehdr) != w2(ghdr->e_ehsize) - || sizeof(Elf64_Shdr) != w2(ghdr->e_shentsize)) { + if (w2(ghdr->e_ehsize) != sizeof(Elf64_Ehdr) + || w2(ghdr->e_shentsize) != sizeof(Elf64_Shdr)) { fprintf(stderr, "unrecognized ET_REL file: %s\n", fname); fail_file(); } - if (EM_S390 == w2(ghdr->e_machine)) + if (w2(ghdr->e_machine) == EM_S390) reltype = R_390_64; - if (EM_MIPS == w2(ghdr->e_machine)) { + if (w2(ghdr->e_machine) == EM_MIPS) { reltype = R_MIPS_64; Elf64_r_sym = MIPS64_r_sym; Elf64_r_info = MIPS64_r_info; @@ -371,7 +371,7 @@ main(int argc, char const *argv[]) } /* Process each file in turn, allowing deep failure. */ - for (--argc, ++argv; 0 < argc; --argc, ++argv) { + for (--argc, ++argv; argc > 0; --argc, ++argv) { int const sjval = setjmp(jmpenv); int len; diff --git a/scripts/recordmcount.h b/scripts/recordmcount.h index baf187b..ac7b330 100644 --- a/scripts/recordmcount.h +++ b/scripts/recordmcount.h @@ -275,12 +275,12 @@ static uint_t *sift_rel_mcount(uint_t *mlocp, Elf_Sym const *const symp = &sym0[Elf_r_sym(relp)]; char const *symname = &str0[w(symp->st_name)]; - char const *mcount = '_' == gpfx ? "_mcount" : "mcount"; + char const *mcount = gpfx == '_' ? "_mcount" : "mcount"; - if ('.' == symname[0]) + if (symname[0] == '.') ++symname; /* ppc64 hack */ - if (0 == strcmp(mcount, symname) || - (altmcount && 0 == strcmp(altmcount, symname))) + if (strcmp(mcount, symname) == 0 || + (altmcount && strcmp(altmcount, symname) == 0)) mcountsym = Elf_r_sym(relp); } @@ -290,7 +290,7 @@ static uint_t *sift_rel_mcount(uint_t *mlocp, mrelp->r_offset = _w(offbase + ((void *)mlocp - (void *)mloc0)); Elf_r_info(mrelp, recsym, reltype); - if (sizeof(Elf_Rela) == rel_entsize) { + if (rel_entsize == sizeof(Elf_Rela)) { ((Elf_Rela *)mrelp)->r_addend = addend; *mlocp++ = 0; } else @@ -354,12 +354,12 @@ __has_rel_mcount(Elf_Shdr const *const relhdr, /* is SHT_REL or SHT_RELA */ Elf_Shdr const *const txthdr = &shdr0[w(relhdr->sh_info)]; char const *const txtname = &shstrtab[w(txthdr->sh_name)]; - if (0 == strcmp("__mcount_loc", txtname)) { + if (strcmp("__mcount_loc", txtname) == 0) { fprintf(stderr, "warning: __mcount_loc already exists: %s\n", fname); succeed_file(); } - if (SHT_PROGBITS != w(txthdr->sh_type) || + if (w(txthdr->sh_type) != SHT_PROGBITS || !is_mcounted_section_name(txtname)) return NULL; return txtname; @@ -370,7 +370,7 @@ static char const *has_rel_mcount(Elf_Shdr const *const relhdr, char const *const shstrtab, char const *const fname) { - if (SHT_REL != w(relhdr->sh_type) && SHT_RELA != w(relhdr->sh_type)) + if (w(relhdr->sh_type) != SHT_REL && w(relhdr->sh_type) != SHT_RELA) return NULL; return __has_rel_mcount(relhdr, shdr0, shstrtab, fname); } -- cgit v1.1 From e90b0c8bf211958a296d60369fecd51b35864407 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 6 Apr 2011 13:32:24 -0400 Subject: ftrace/trivial: Clean up record mcount to use Linux switch style The Linux style for switch statements is: switch (var) { case x: [...] break; } Not: switch (var) { case x: { [...] } break; Cc: John Reiser Link: http://lkml.kernel.org/r/20110421023737.523968644@goodmis.org Signed-off-by: Steven Rostedt --- scripts/recordmcount.c | 43 ++++++++++++++++++++++--------------------- 1 file changed, 22 insertions(+), 21 deletions(-) diff --git a/scripts/recordmcount.c b/scripts/recordmcount.c index 37afe0e..4ebd839 100644 --- a/scripts/recordmcount.c +++ b/scripts/recordmcount.c @@ -264,27 +264,27 @@ do_file(char const *const fname) w8 = w8nat; switch (ehdr->e_ident[EI_DATA]) { static unsigned int const endian = 1; - default: { + default: fprintf(stderr, "unrecognized ELF data encoding %d: %s\n", ehdr->e_ident[EI_DATA], fname); fail_file(); - } break; - case ELFDATA2LSB: { + break; + case ELFDATA2LSB: if (*(unsigned char const *)&endian != 1) { /* main() is big endian, file.o is little endian. */ w = w4rev; w2 = w2rev; w8 = w8rev; } - } break; - case ELFDATA2MSB: { + break; + case ELFDATA2MSB: if (*(unsigned char const *)&endian != 0) { /* main() is little endian, file.o is big endian. */ w = w4rev; w2 = w2rev; w8 = w8rev; } - } break; + break; } /* end switch */ if (memcmp(ELFMAG, ehdr->e_ident, SELFMAG) != 0 || w2(ehdr->e_type) != ET_REL @@ -295,11 +295,11 @@ do_file(char const *const fname) gpfx = 0; switch (w2(ehdr->e_machine)) { - default: { + default: fprintf(stderr, "unrecognized e_machine %d %s\n", w2(ehdr->e_machine), fname); fail_file(); - } break; + break; case EM_386: reltype = R_386_32; break; case EM_ARM: reltype = R_ARM_ABS32; altmcount = "__gnu_mcount_nc"; @@ -315,12 +315,12 @@ do_file(char const *const fname) } /* end switch */ switch (ehdr->e_ident[EI_CLASS]) { - default: { + default: fprintf(stderr, "unrecognized ELF class %d %s\n", ehdr->e_ident[EI_CLASS], fname); fail_file(); - } break; - case ELFCLASS32: { + break; + case ELFCLASS32: if (w2(ehdr->e_ehsize) != sizeof(Elf32_Ehdr) || w2(ehdr->e_shentsize) != sizeof(Elf32_Shdr)) { fprintf(stderr, @@ -334,7 +334,7 @@ do_file(char const *const fname) is_fake_mcount32 = MIPS32_is_fake_mcount; } do32(ehdr, fname, reltype); - } break; + break; case ELFCLASS64: { Elf64_Ehdr *const ghdr = (Elf64_Ehdr *)ehdr; if (w2(ghdr->e_ehsize) != sizeof(Elf64_Ehdr) @@ -352,7 +352,8 @@ do_file(char const *const fname) is_fake_mcount64 = MIPS64_is_fake_mcount; } do64(ghdr, fname, reltype); - } break; + break; + } } /* end switch */ cleanup(); @@ -386,23 +387,23 @@ main(int argc, char const *argv[]) continue; switch (sjval) { - default: { + default: fprintf(stderr, "internal error: %s\n", argv[0]); exit(1); - } break; - case SJ_SETJMP: { /* normal sequence */ + break; + case SJ_SETJMP: /* normal sequence */ /* Avoid problems if early cleanup() */ fd_map = -1; ehdr_curr = NULL; mmap_failed = 1; do_file(argv[0]); - } break; - case SJ_FAIL: { /* error in do_file or below */ + break; + case SJ_FAIL: /* error in do_file or below */ ++n_error; - } break; - case SJ_SUCCEED: { /* premature success */ + break; + case SJ_SUCCEED: /* premature success */ /* do nothing */ - } break; + break; } /* end switch */ } return !!n_error; -- cgit v1.1 From 9f087e7612115b7a201d4f3392a95ac7408948ab Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 6 Apr 2011 14:10:22 -0400 Subject: ftrace: Add .kprobe.text section to whitelist for recordmcount.c The .kprobe.text section is safe to modify mcount to nop and tracing. Add it to the whitelist in recordmcount.c and recordmcount.pl. Cc: John Reiser Cc: Masami Hiramatsu Link: http://lkml.kernel.org/r/20110421023737.743350547@goodmis.org Signed-off-by: Steven Rostedt --- scripts/recordmcount.c | 1 + scripts/recordmcount.pl | 1 + 2 files changed, 2 insertions(+) diff --git a/scripts/recordmcount.c b/scripts/recordmcount.c index 4ebd839..37c5965 100644 --- a/scripts/recordmcount.c +++ b/scripts/recordmcount.c @@ -211,6 +211,7 @@ is_mcounted_section_name(char const *const txtname) strcmp(".sched.text", txtname) == 0 || strcmp(".spinlock.text", txtname) == 0 || strcmp(".irqentry.text", txtname) == 0 || + strcmp(".kprobes.text", txtname) == 0 || strcmp(".text.unlikely", txtname) == 0; } diff --git a/scripts/recordmcount.pl b/scripts/recordmcount.pl index 4be0dee..a871cd4 100755 --- a/scripts/recordmcount.pl +++ b/scripts/recordmcount.pl @@ -134,6 +134,7 @@ my %text_sections = ( ".sched.text" => 1, ".spinlock.text" => 1, ".irqentry.text" => 1, + ".kprobes.text" => 1, ".text.unlikely" => 1, ); -- cgit v1.1 From 8abd5724a7f1631ab2276954156c629d4d17149a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 13 Apr 2011 13:31:08 -0400 Subject: ftrace/recordmcount: Modify only executable sections PROGBITS is not enough to determine if the section should be modified or not. Only process sections that are marked as executable. Cc: John Reiser Link: http://lkml.kernel.org/r/20110421023737.991485123@goodmis.org Signed-off-by: Steven Rostedt --- scripts/recordmcount.h | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/recordmcount.h b/scripts/recordmcount.h index ac7b330..7f8d5c4 100644 --- a/scripts/recordmcount.h +++ b/scripts/recordmcount.h @@ -360,6 +360,7 @@ __has_rel_mcount(Elf_Shdr const *const relhdr, /* is SHT_REL or SHT_RELA */ succeed_file(); } if (w(txthdr->sh_type) != SHT_PROGBITS || + !(w(txthdr->sh_flags) & SHF_EXECINSTR) || !is_mcounted_section_name(txtname)) return NULL; return txtname; -- cgit v1.1 From ffd618fa39284f8cc343894b566dd42ec6e74e77 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 8 Apr 2011 03:58:48 -0400 Subject: ftrace/recordmcount: Make ignored mcount calls into nops at compile time There are sections that are ignored by ftrace for the function tracing because the text is in a section that can be removed without notice. The mcount calls in these sections are ignored and ftrace never sees them. The downside of this is that the functions in these sections still call mcount. Although the mcount function is defined in assembly simply as a return, this added overhead is unnecessary. The solution is to convert these callers into nops at compile time. A better solution is to add 'notrace' to the section markers, but as new sections come up all the time, it would be nice that they are delt with when they are created. Later patches will deal with finding these sections and doing the proper solution. Thanks to H. Peter Anvin for giving me the right nops to use for x86. Cc: "H. Peter Anvin" Cc: John Reiser Link: http://lkml.kernel.org/r/20110421023738.237101176@goodmis.org Signed-off-by: Steven Rostedt --- scripts/recordmcount.c | 40 ++++++++++++++++++++++-- scripts/recordmcount.h | 82 +++++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 116 insertions(+), 6 deletions(-) diff --git a/scripts/recordmcount.c b/scripts/recordmcount.c index 37c5965..78054a4 100644 --- a/scripts/recordmcount.c +++ b/scripts/recordmcount.c @@ -118,6 +118,34 @@ umalloc(size_t size) return addr; } +static unsigned char ideal_nop5_x86_64[5] = { 0x0f, 0x1f, 0x44, 0x00, 0x00 }; +static unsigned char ideal_nop5_x86_32[5] = { 0x3e, 0x8d, 0x74, 0x26, 0x00 }; +static unsigned char *ideal_nop; + +static char rel_type_nop; + +static int (*make_nop)(void *map, size_t const offset); + +static int make_nop_x86(void *map, size_t const offset) +{ + uint32_t *ptr; + unsigned char *op; + + /* Confirm we have 0xe8 0x0 0x0 0x0 0x0 */ + ptr = map + offset; + if (*ptr != 0) + return -1; + + op = map + offset - 1; + if (*op != 0xe8) + return -1; + + /* convert to nop */ + ulseek(fd_map, offset - 1, SEEK_SET); + uwrite(fd_map, ideal_nop, 5); + return 0; +} + /* * Get the whole file as a programming convenience in order to avoid * malloc+lseek+read+free of many pieces. If successful, then mmap @@ -301,7 +329,11 @@ do_file(char const *const fname) w2(ehdr->e_machine), fname); fail_file(); break; - case EM_386: reltype = R_386_32; break; + case EM_386: + reltype = R_386_32; + make_nop = make_nop_x86; + ideal_nop = ideal_nop5_x86_32; + break; case EM_ARM: reltype = R_ARM_ABS32; altmcount = "__gnu_mcount_nc"; break; @@ -312,7 +344,11 @@ do_file(char const *const fname) case EM_S390: /* reltype: e_class */ gpfx = '_'; break; case EM_SH: reltype = R_SH_DIR32; break; case EM_SPARCV9: reltype = R_SPARC_64; gpfx = '_'; break; - case EM_X86_64: reltype = R_X86_64_64; break; + case EM_X86_64: + make_nop = make_nop_x86; + ideal_nop = ideal_nop5_x86_64; + reltype = R_X86_64_64; + break; } /* end switch */ switch (ehdr->e_ident[EI_CLASS]) { diff --git a/scripts/recordmcount.h b/scripts/recordmcount.h index 7f8d5c4..657dbed 100644 --- a/scripts/recordmcount.h +++ b/scripts/recordmcount.h @@ -23,6 +23,7 @@ #undef fn_is_fake_mcount #undef MIPS_is_fake_mcount #undef sift_rel_mcount +#undef nop_mcount #undef find_secsym_ndx #undef __has_rel_mcount #undef has_rel_mcount @@ -49,6 +50,7 @@ #ifdef RECORD_MCOUNT_64 # define append_func append64 # define sift_rel_mcount sift64_rel_mcount +# define nop_mcount nop_mcount_64 # define find_secsym_ndx find64_secsym_ndx # define __has_rel_mcount __has64_rel_mcount # define has_rel_mcount has64_rel_mcount @@ -77,6 +79,7 @@ #else # define append_func append32 # define sift_rel_mcount sift32_rel_mcount +# define nop_mcount nop_mcount_32 # define find_secsym_ndx find32_secsym_ndx # define __has_rel_mcount __has32_rel_mcount # define has_rel_mcount has32_rel_mcount @@ -304,6 +307,70 @@ static uint_t *sift_rel_mcount(uint_t *mlocp, return mlocp; } +/* + * Read the relocation table again, but this time its called on sections + * that are not going to be traced. The mcount calls here will be converted + * into nops. + */ +static void nop_mcount(Elf_Shdr const *const relhdr, + Elf_Ehdr const *const ehdr) +{ + Elf_Shdr *const shdr0 = (Elf_Shdr *)(_w(ehdr->e_shoff) + + (void *)ehdr); + unsigned const symsec_sh_link = w(relhdr->sh_link); + Elf_Shdr const *const symsec = &shdr0[symsec_sh_link]; + Elf_Sym const *const sym0 = (Elf_Sym const *)(_w(symsec->sh_offset) + + (void *)ehdr); + + Elf_Shdr const *const strsec = &shdr0[w(symsec->sh_link)]; + char const *const str0 = (char const *)(_w(strsec->sh_offset) + + (void *)ehdr); + + Elf_Rel const *const rel0 = (Elf_Rel const *)(_w(relhdr->sh_offset) + + (void *)ehdr); + unsigned rel_entsize = _w(relhdr->sh_entsize); + unsigned const nrel = _w(relhdr->sh_size) / rel_entsize; + Elf_Rel const *relp = rel0; + + Elf_Shdr const *const shdr = &shdr0[w(relhdr->sh_info)]; + + unsigned mcountsym = 0; + unsigned t; + + for (t = nrel; t; --t) { + int ret = -1; + + if (!mcountsym) { + Elf_Sym const *const symp = + &sym0[Elf_r_sym(relp)]; + char const *symname = &str0[w(symp->st_name)]; + char const *mcount = gpfx == '_' ? "_mcount" : "mcount"; + + if (symname[0] == '.') + ++symname; /* ppc64 hack */ + if (strcmp(mcount, symname) == 0 || + (altmcount && strcmp(altmcount, symname) == 0)) + mcountsym = Elf_r_sym(relp); + } + + if (mcountsym == Elf_r_sym(relp) && !is_fake_mcount(relp)) + ret = make_nop((void *)ehdr, shdr->sh_offset + relp->r_offset); + + /* + * If we successfully removed the mcount, mark the relocation + * as a nop (don't do anything with it). + */ + if (!ret) { + Elf_Rel rel; + rel = *(Elf_Rel *)relp; + Elf_r_info(&rel, Elf_r_sym(relp), rel_type_nop); + ulseek(fd_map, (void *)relp - (void *)ehdr, SEEK_SET); + uwrite(fd_map, &rel, sizeof(rel)); + } + relp = (Elf_Rel const *)(rel_entsize + (void *)relp); + } +} + /* * Find a symbol in the given section, to be used as the base for relocating @@ -360,8 +427,7 @@ __has_rel_mcount(Elf_Shdr const *const relhdr, /* is SHT_REL or SHT_RELA */ succeed_file(); } if (w(txthdr->sh_type) != SHT_PROGBITS || - !(w(txthdr->sh_flags) & SHF_EXECINSTR) || - !is_mcounted_section_name(txtname)) + !(w(txthdr->sh_flags) & SHF_EXECINSTR)) return NULL; return txtname; } @@ -384,9 +450,11 @@ static unsigned tot_relsize(Elf_Shdr const *const shdr0, { unsigned totrelsz = 0; Elf_Shdr const *shdrp = shdr0; + char const *txtname; for (; nhdr; --nhdr, ++shdrp) { - if (has_rel_mcount(shdrp, shdr0, shstrtab, fname)) + txtname = has_rel_mcount(shdrp, shdr0, shstrtab, fname); + if (txtname && is_mcounted_section_name(txtname)) totrelsz += _w(shdrp->sh_size); } return totrelsz; @@ -422,7 +490,7 @@ do_func(Elf_Ehdr *const ehdr, char const *const fname, unsigned const reltype) for (relhdr = shdr0, k = nhdr; k; --k, ++relhdr) { char const *const txtname = has_rel_mcount(relhdr, shdr0, shstrtab, fname); - if (txtname) { + if (txtname && is_mcounted_section_name(txtname)) { uint_t recval = 0; unsigned const recsym = find_secsym_ndx( w(relhdr->sh_info), txtname, &recval, @@ -433,6 +501,12 @@ do_func(Elf_Ehdr *const ehdr, char const *const fname, unsigned const reltype) mlocp = sift_rel_mcount(mlocp, (void *)mlocp - (void *)mloc0, &mrelp, relhdr, ehdr, recsym, recval, reltype); + } else if (make_nop && txtname) { + /* + * This section is ignored by ftrace, but still + * has mcount calls. Convert them to nops now. + */ + nop_mcount(relhdr, ehdr); } } if (mloc0 != mlocp) { -- cgit v1.1 From dfad3d598c4bbbaf137588e22bac1ce624529f7e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 12 Apr 2011 18:53:25 -0400 Subject: ftrace/recordmcount: Add warning logic to warn on mcount not recorded There's some sections that should not have mcount recorded and should not have modifications to the that code. But currently they waste some time by calling mcount anyway (which simply returns). As the real answer should be to either whitelist the section or have gcc ignore it fully. This change adds a option to recordmcount to warn when it finds a section that is ignored by ftrace but still contains mcount callers. This is not on by default as developers may not know if the section should be completely ignored or added to the whitelist. Cc: John Reiser Link: http://lkml.kernel.org/r/20110421023738.476989377@goodmis.org Signed-off-by: Steven Rostedt --- scripts/recordmcount.c | 32 ++++++++++++++++++++++++-------- scripts/recordmcount.h | 22 +++++++++++++++++----- 2 files changed, 41 insertions(+), 13 deletions(-) diff --git a/scripts/recordmcount.c b/scripts/recordmcount.c index 78054a4..0e18975 100644 --- a/scripts/recordmcount.c +++ b/scripts/recordmcount.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -39,6 +40,7 @@ static char gpfx; /* prefix for global symbol name (sometimes '_') */ static struct stat sb; /* Remember .st_size, etc. */ static jmp_buf jmpenv; /* setjmp/longjmp per-file error escape */ static const char *altmcount; /* alternate mcount symbol name */ +static int warn_on_notrace_sect; /* warn when section has mcount not being recorded */ /* setjmp() return values */ enum { @@ -397,19 +399,33 @@ do_file(char const *const fname) } int -main(int argc, char const *argv[]) +main(int argc, char *argv[]) { const char ftrace[] = "/ftrace.o"; int ftrace_size = sizeof(ftrace) - 1; int n_error = 0; /* gcc-4.3.0 false positive complaint */ + int c; + int i; - if (argc <= 1) { - fprintf(stderr, "usage: recordmcount file.o...\n"); + while ((c = getopt(argc, argv, "w")) >= 0) { + switch (c) { + case 'w': + warn_on_notrace_sect = 1; + break; + default: + fprintf(stderr, "usage: recordmcount [-w] file.o...\n"); + return 0; + } + } + + if ((argc - optind) < 1) { + fprintf(stderr, "usage: recordmcount [-w] file.o...\n"); return 0; } /* Process each file in turn, allowing deep failure. */ - for (--argc, ++argv; argc > 0; --argc, ++argv) { + for (i = optind; i < argc; i++) { + char *file = argv[i]; int const sjval = setjmp(jmpenv); int len; @@ -418,14 +434,14 @@ main(int argc, char const *argv[]) * function but does not call it. Since ftrace.o should * not be traced anyway, we just skip it. */ - len = strlen(argv[0]); + len = strlen(file); if (len >= ftrace_size && - strcmp(argv[0] + (len - ftrace_size), ftrace) == 0) + strcmp(file + (len - ftrace_size), ftrace) == 0) continue; switch (sjval) { default: - fprintf(stderr, "internal error: %s\n", argv[0]); + fprintf(stderr, "internal error: %s\n", file); exit(1); break; case SJ_SETJMP: /* normal sequence */ @@ -433,7 +449,7 @@ main(int argc, char const *argv[]) fd_map = -1; ehdr_curr = NULL; mmap_failed = 1; - do_file(argv[0]); + do_file(file); break; case SJ_FAIL: /* error in do_file or below */ ++n_error; diff --git a/scripts/recordmcount.h b/scripts/recordmcount.h index 657dbed..22033d5 100644 --- a/scripts/recordmcount.h +++ b/scripts/recordmcount.h @@ -313,7 +313,8 @@ static uint_t *sift_rel_mcount(uint_t *mlocp, * into nops. */ static void nop_mcount(Elf_Shdr const *const relhdr, - Elf_Ehdr const *const ehdr) + Elf_Ehdr const *const ehdr, + const char *const txtname) { Elf_Shdr *const shdr0 = (Elf_Shdr *)(_w(ehdr->e_shoff) + (void *)ehdr); @@ -336,6 +337,7 @@ static void nop_mcount(Elf_Shdr const *const relhdr, unsigned mcountsym = 0; unsigned t; + int once = 0; for (t = nrel; t; --t) { int ret = -1; @@ -353,8 +355,18 @@ static void nop_mcount(Elf_Shdr const *const relhdr, mcountsym = Elf_r_sym(relp); } - if (mcountsym == Elf_r_sym(relp) && !is_fake_mcount(relp)) - ret = make_nop((void *)ehdr, shdr->sh_offset + relp->r_offset); + if (mcountsym == Elf_r_sym(relp) && !is_fake_mcount(relp)) { + if (make_nop) + ret = make_nop((void *)ehdr, shdr->sh_offset + relp->r_offset); + if (warn_on_notrace_sect && !once) { + printf("Section %s has mcount callers being ignored\n", + txtname); + once = 1; + /* just warn? */ + if (!make_nop) + return; + } + } /* * If we successfully removed the mcount, mark the relocation @@ -501,12 +513,12 @@ do_func(Elf_Ehdr *const ehdr, char const *const fname, unsigned const reltype) mlocp = sift_rel_mcount(mlocp, (void *)mlocp - (void *)mloc0, &mrelp, relhdr, ehdr, recsym, recval, reltype); - } else if (make_nop && txtname) { + } else if (txtname && (warn_on_notrace_sect || make_nop)) { /* * This section is ignored by ftrace, but still * has mcount calls. Convert them to nops now. */ - nop_mcount(relhdr, ehdr); + nop_mcount(relhdr, ehdr, txtname); } } if (mloc0 != mlocp) { -- cgit v1.1 From 85356f802225fedeee8c3e65bdd93b263ace0a8b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 12 Apr 2011 18:59:10 -0400 Subject: kbuild/recordmcount: Add RECORDMCOUNT_WARN to warn about mcount callers When mcount is called in a section that ftrace will not modify it into a nop, we want to warn about this. But not warn about this always. Now if the user builds the kernel with the option RECORDMCOUNT_WARN=1 then the build will warn about mcount callers that are ignored and will just waste execution time. Acked-by: Michal Marek Cc: linux-kbuild@vger.kernel.org Link: http://lkml.kernel.org/r/20110421023738.714956282@goodmis.org Signed-off-by: Steven Rostedt --- Makefile | 1 + scripts/Makefile.build | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 41ea6fb..e7d01ada 100644 --- a/Makefile +++ b/Makefile @@ -1268,6 +1268,7 @@ help: @echo ' make C=1 [targets] Check all c source with $$CHECK (sparse by default)' @echo ' make C=2 [targets] Force check of all c source with $$CHECK' @echo ' make W=1 [targets] Enable extra gcc checks' + @echo ' make RECORDMCOUNT_WARN=1 [targets] Warn about ignored mcount sections' @echo '' @echo 'Execute "make" or "make all" to build all targets marked with [*] ' @echo 'For further info see the ./README file' diff --git a/scripts/Makefile.build b/scripts/Makefile.build index d5f925a..fdca952 100644 --- a/scripts/Makefile.build +++ b/scripts/Makefile.build @@ -244,13 +244,16 @@ endif ifdef CONFIG_FTRACE_MCOUNT_RECORD ifdef BUILD_C_RECORDMCOUNT +ifeq ("$(origin RECORDMCOUNT_WARN)", "command line") + RECORDMCOUNT_FLAGS = -w +endif # Due to recursion, we must skip empty.o. # The empty.o file is created in the make process in order to determine # the target endianness and word size. It is made before all other C # files, including recordmcount. sub_cmd_record_mcount = \ if [ $(@) != "scripts/mod/empty.o" ]; then \ - $(objtree)/scripts/recordmcount "$(@)"; \ + $(objtree)/scripts/recordmcount $(RECORDMCOUNT_FLAGS) "$(@)"; \ fi; else sub_cmd_record_mcount = set -e ; perl $(srctree)/scripts/recordmcount.pl "$(ARCH)" \ -- cgit v1.1 From f0201738b61b1adcf6b2c4719c5c415745014c1c Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 12 Apr 2011 19:06:39 -0400 Subject: ftrace: Avoid recording mcount on .init sections directly The init and exit sections should not be traced and adding a call to mcount to them is a waste of text and instruction cache. Have the macro section attributes include notrace to ignore these functions for tracing from the build. Link: http://lkml.kernel.org/r/20110421023738.953028219@goodmis.org Signed-off-by: Steven Rostedt --- include/linux/init.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/include/linux/init.h b/include/linux/init.h index 577671c..9146f39 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -79,29 +79,29 @@ #define __exitused __used #endif -#define __exit __section(.exit.text) __exitused __cold +#define __exit __section(.exit.text) __exitused __cold notrace /* Used for HOTPLUG */ -#define __devinit __section(.devinit.text) __cold +#define __devinit __section(.devinit.text) __cold notrace #define __devinitdata __section(.devinit.data) #define __devinitconst __section(.devinit.rodata) -#define __devexit __section(.devexit.text) __exitused __cold +#define __devexit __section(.devexit.text) __exitused __cold notrace #define __devexitdata __section(.devexit.data) #define __devexitconst __section(.devexit.rodata) /* Used for HOTPLUG_CPU */ -#define __cpuinit __section(.cpuinit.text) __cold +#define __cpuinit __section(.cpuinit.text) __cold notrace #define __cpuinitdata __section(.cpuinit.data) #define __cpuinitconst __section(.cpuinit.rodata) -#define __cpuexit __section(.cpuexit.text) __exitused __cold +#define __cpuexit __section(.cpuexit.text) __exitused __cold notrace #define __cpuexitdata __section(.cpuexit.data) #define __cpuexitconst __section(.cpuexit.rodata) /* Used for MEMORY_HOTPLUG */ -#define __meminit __section(.meminit.text) __cold +#define __meminit __section(.meminit.text) __cold notrace #define __meminitdata __section(.meminit.data) #define __meminitconst __section(.meminit.rodata) -#define __memexit __section(.memexit.text) __exitused __cold +#define __memexit __section(.memexit.text) __exitused __cold notrace #define __memexitdata __section(.memexit.data) #define __memexitconst __section(.memexit.rodata) -- cgit v1.1 From 2895cd2ab81dfb7bc22637bc110857db44a30b4a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 13 Apr 2011 16:43:29 -0400 Subject: ftrace/x86: Do not trace .discard.text section The section called .discard.text has tracing attached to it and is currently ignored by ftrace. But it does include a call to the mcount stub. Adding a notrace to the code keeps gcc from adding the useless mcount caller to it. Link: http://lkml.kernel.org/r/20110421023739.243651696@goodmis.org Signed-off-by: Steven Rostedt --- arch/x86/include/asm/setup.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index db8aa19..647d8a0 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -88,7 +88,7 @@ void *extend_brk(size_t size, size_t align); * executable.) */ #define RESERVE_BRK(name,sz) \ - static void __section(.discard.text) __used \ + static void __section(.discard.text) __used notrace \ __brk_reservation_fn_##name##__(void) { \ asm volatile ( \ ".pushsection .brk_reservation,\"aw\",@nobits;" \ -- cgit v1.1 From 37762cb9977626343b3cd1aab9146313c94748c2 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 20 Apr 2011 20:47:34 -0400 Subject: ftrace/recordmcount: Remove duplicate code to find mcount symbol The code in sift_rel_mcount() and nop_mcount() to get the mcount symbol number is identical. Replace the two locations with a call to a function that does the work. Cc: John Reiser Link: http://lkml.kernel.org/r/20110421023739.488093407@goodmis.org Signed-off-by: Steven Rostedt --- scripts/recordmcount.h | 51 ++++++++++++++++++++++++++------------------------ 1 file changed, 27 insertions(+), 24 deletions(-) diff --git a/scripts/recordmcount.h b/scripts/recordmcount.h index 22033d5..deb6a51 100644 --- a/scripts/recordmcount.h +++ b/scripts/recordmcount.h @@ -28,6 +28,7 @@ #undef __has_rel_mcount #undef has_rel_mcount #undef tot_relsize +#undef get_mcountsym #undef do_func #undef Elf_Addr #undef Elf_Ehdr @@ -56,6 +57,7 @@ # define has_rel_mcount has64_rel_mcount # define tot_relsize tot64_relsize # define do_func do64 +# define get_mcountsym get_mcountsym_64 # define is_fake_mcount is_fake_mcount64 # define fn_is_fake_mcount fn_is_fake_mcount64 # define MIPS_is_fake_mcount MIPS64_is_fake_mcount @@ -85,6 +87,7 @@ # define has_rel_mcount has32_rel_mcount # define tot_relsize tot32_relsize # define do_func do32 +# define get_mcountsym get_mcountsym_32 # define is_fake_mcount is_fake_mcount32 # define fn_is_fake_mcount fn_is_fake_mcount32 # define MIPS_is_fake_mcount MIPS32_is_fake_mcount @@ -237,6 +240,26 @@ static void append_func(Elf_Ehdr *const ehdr, uwrite(fd_map, ehdr, sizeof(*ehdr)); } +static unsigned get_mcountsym(Elf_Sym const *const sym0, + Elf_Rel const *relp, + char const *const str0) +{ + unsigned mcountsym = 0; + + Elf_Sym const *const symp = + &sym0[Elf_r_sym(relp)]; + char const *symname = &str0[w(symp->st_name)]; + char const *mcount = gpfx == '_' ? "_mcount" : "mcount"; + + if (symname[0] == '.') + ++symname; /* ppc64 hack */ + if (strcmp(mcount, symname) == 0 || + (altmcount && strcmp(altmcount, symname) == 0)) + mcountsym = Elf_r_sym(relp); + + return mcountsym; +} + /* * Look at the relocations in order to find the calls to mcount. * Accumulate the section offsets that are found, and their relocation info, @@ -274,18 +297,8 @@ static uint_t *sift_rel_mcount(uint_t *mlocp, unsigned t; for (t = nrel; t; --t) { - if (!mcountsym) { - Elf_Sym const *const symp = - &sym0[Elf_r_sym(relp)]; - char const *symname = &str0[w(symp->st_name)]; - char const *mcount = gpfx == '_' ? "_mcount" : "mcount"; - - if (symname[0] == '.') - ++symname; /* ppc64 hack */ - if (strcmp(mcount, symname) == 0 || - (altmcount && strcmp(altmcount, symname) == 0)) - mcountsym = Elf_r_sym(relp); - } + if (!mcountsym) + mcountsym = get_mcountsym(sym0, relp, str0); if (mcountsym == Elf_r_sym(relp) && !is_fake_mcount(relp)) { uint_t const addend = _w(_w(relp->r_offset) - recval); @@ -342,18 +355,8 @@ static void nop_mcount(Elf_Shdr const *const relhdr, for (t = nrel; t; --t) { int ret = -1; - if (!mcountsym) { - Elf_Sym const *const symp = - &sym0[Elf_r_sym(relp)]; - char const *symname = &str0[w(symp->st_name)]; - char const *mcount = gpfx == '_' ? "_mcount" : "mcount"; - - if (symname[0] == '.') - ++symname; /* ppc64 hack */ - if (strcmp(mcount, symname) == 0 || - (altmcount && strcmp(altmcount, symname) == 0)) - mcountsym = Elf_r_sym(relp); - } + if (!mcountsym) + mcountsym = get_mcountsym(sym0, relp, str0); if (mcountsym == Elf_r_sym(relp) && !is_fake_mcount(relp)) { if (make_nop) -- cgit v1.1 From 41b402a201a12efdff4acc990e023a89a409cd41 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 20 Apr 2011 21:13:06 -0400 Subject: ftrace/recordmcount: Add helper function get_sym_str_and_relp() The code to get the symbol, string, and relp pointers in the two functions sift_rel_mcount() and nop_mcount() are identical and also non-trivial. Moving this duplicate code into a single helper function makes the code easier to read and more maintainable. Cc: John Reiser Link: http://lkml.kernel.org/r/20110421023739.723658553@goodmis.org Signed-off-by: Steven Rostedt --- scripts/recordmcount.h | 67 ++++++++++++++++++++++++++++---------------------- 1 file changed, 37 insertions(+), 30 deletions(-) diff --git a/scripts/recordmcount.h b/scripts/recordmcount.h index deb6a51..3c00fab 100644 --- a/scripts/recordmcount.h +++ b/scripts/recordmcount.h @@ -29,6 +29,7 @@ #undef has_rel_mcount #undef tot_relsize #undef get_mcountsym +#undef get_sym_str_and_relp #undef do_func #undef Elf_Addr #undef Elf_Ehdr @@ -56,6 +57,7 @@ # define __has_rel_mcount __has64_rel_mcount # define has_rel_mcount has64_rel_mcount # define tot_relsize tot64_relsize +# define get_sym_str_and_relp get_sym_str_and_relp_64 # define do_func do64 # define get_mcountsym get_mcountsym_64 # define is_fake_mcount is_fake_mcount64 @@ -86,6 +88,7 @@ # define __has_rel_mcount __has32_rel_mcount # define has_rel_mcount has32_rel_mcount # define tot_relsize tot32_relsize +# define get_sym_str_and_relp get_sym_str_and_relp_32 # define do_func do32 # define get_mcountsym get_mcountsym_32 # define is_fake_mcount is_fake_mcount32 @@ -260,6 +263,29 @@ static unsigned get_mcountsym(Elf_Sym const *const sym0, return mcountsym; } +static void get_sym_str_and_relp(Elf_Shdr const *const relhdr, + Elf_Ehdr const *const ehdr, + Elf_Sym const **sym0, + char const **str0, + Elf_Rel const **relp) +{ + Elf_Shdr *const shdr0 = (Elf_Shdr *)(_w(ehdr->e_shoff) + + (void *)ehdr); + unsigned const symsec_sh_link = w(relhdr->sh_link); + Elf_Shdr const *const symsec = &shdr0[symsec_sh_link]; + Elf_Shdr const *const strsec = &shdr0[w(symsec->sh_link)]; + Elf_Rel const *const rel0 = (Elf_Rel const *)(_w(relhdr->sh_offset) + + (void *)ehdr); + + *sym0 = (Elf_Sym const *)(_w(symsec->sh_offset) + + (void *)ehdr); + + *str0 = (char const *)(_w(strsec->sh_offset) + + (void *)ehdr); + + *relp = rel0; +} + /* * Look at the relocations in order to find the calls to mcount. * Accumulate the section offsets that are found, and their relocation info, @@ -276,26 +302,16 @@ static uint_t *sift_rel_mcount(uint_t *mlocp, { uint_t *const mloc0 = mlocp; Elf_Rel *mrelp = *mrelpp; - Elf_Shdr *const shdr0 = (Elf_Shdr *)(_w(ehdr->e_shoff) - + (void *)ehdr); - unsigned const symsec_sh_link = w(relhdr->sh_link); - Elf_Shdr const *const symsec = &shdr0[symsec_sh_link]; - Elf_Sym const *const sym0 = (Elf_Sym const *)(_w(symsec->sh_offset) - + (void *)ehdr); - - Elf_Shdr const *const strsec = &shdr0[w(symsec->sh_link)]; - char const *const str0 = (char const *)(_w(strsec->sh_offset) - + (void *)ehdr); - - Elf_Rel const *const rel0 = (Elf_Rel const *)(_w(relhdr->sh_offset) - + (void *)ehdr); + Elf_Sym const *sym0; + char const *str0; + Elf_Rel const *relp; unsigned rel_entsize = _w(relhdr->sh_entsize); unsigned const nrel = _w(relhdr->sh_size) / rel_entsize; - Elf_Rel const *relp = rel0; - unsigned mcountsym = 0; unsigned t; + get_sym_str_and_relp(relhdr, ehdr, &sym0, &str0, &relp); + for (t = nrel; t; --t) { if (!mcountsym) mcountsym = get_mcountsym(sym0, relp, str0); @@ -331,27 +347,18 @@ static void nop_mcount(Elf_Shdr const *const relhdr, { Elf_Shdr *const shdr0 = (Elf_Shdr *)(_w(ehdr->e_shoff) + (void *)ehdr); - unsigned const symsec_sh_link = w(relhdr->sh_link); - Elf_Shdr const *const symsec = &shdr0[symsec_sh_link]; - Elf_Sym const *const sym0 = (Elf_Sym const *)(_w(symsec->sh_offset) - + (void *)ehdr); - - Elf_Shdr const *const strsec = &shdr0[w(symsec->sh_link)]; - char const *const str0 = (char const *)(_w(strsec->sh_offset) - + (void *)ehdr); - - Elf_Rel const *const rel0 = (Elf_Rel const *)(_w(relhdr->sh_offset) - + (void *)ehdr); + Elf_Sym const *sym0; + char const *str0; + Elf_Rel const *relp; + Elf_Shdr const *const shdr = &shdr0[w(relhdr->sh_info)]; unsigned rel_entsize = _w(relhdr->sh_entsize); unsigned const nrel = _w(relhdr->sh_size) / rel_entsize; - Elf_Rel const *relp = rel0; - - Elf_Shdr const *const shdr = &shdr0[w(relhdr->sh_info)]; - unsigned mcountsym = 0; unsigned t; int once = 0; + get_sym_str_and_relp(relhdr, ehdr, &sym0, &str0, &relp); + for (t = nrel; t; --t) { int ret = -1; -- cgit v1.1 From 07d8b595f367f4604e6027ad4cba33cbe3f55e10 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Tue, 10 May 2011 10:10:40 +0200 Subject: ftrace/recordmcount: mcount address adjustment Introduce mcount_adjust{,_32,_64} to the C implementation of recordmcount analog to $mcount_adjust in the perl script. The adjustment is added to the address of the relocations against the mcount symbol. If this adjustment is done by recordmcount at compile time the ftrace_call_adjust function can be turned into a nop. Cc: John Reiser Signed-off-by: Martin Schwidefsky Signed-off-by: Steven Rostedt --- scripts/recordmcount.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/scripts/recordmcount.h b/scripts/recordmcount.h index 3c00fab..4be6036 100644 --- a/scripts/recordmcount.h +++ b/scripts/recordmcount.h @@ -22,6 +22,7 @@ #undef is_fake_mcount #undef fn_is_fake_mcount #undef MIPS_is_fake_mcount +#undef mcount_adjust #undef sift_rel_mcount #undef nop_mcount #undef find_secsym_ndx @@ -63,6 +64,7 @@ # define is_fake_mcount is_fake_mcount64 # define fn_is_fake_mcount fn_is_fake_mcount64 # define MIPS_is_fake_mcount MIPS64_is_fake_mcount +# define mcount_adjust mcount_adjust_64 # define Elf_Addr Elf64_Addr # define Elf_Ehdr Elf64_Ehdr # define Elf_Shdr Elf64_Shdr @@ -94,6 +96,7 @@ # define is_fake_mcount is_fake_mcount32 # define fn_is_fake_mcount fn_is_fake_mcount32 # define MIPS_is_fake_mcount MIPS32_is_fake_mcount +# define mcount_adjust mcount_adjust_32 # define Elf_Addr Elf32_Addr # define Elf_Ehdr Elf32_Ehdr # define Elf_Shdr Elf32_Shdr @@ -132,6 +135,8 @@ static void fn_ELF_R_INFO(Elf_Rel *const rp, unsigned sym, unsigned type) } static void (*Elf_r_info)(Elf_Rel *const rp, unsigned sym, unsigned type) = fn_ELF_R_INFO; +static int mcount_adjust = 0; + /* * MIPS mcount long call has 2 _mcount symbols, only the position of the 1st * _mcount symbol is needed for dynamic function tracer, with it, to disable @@ -317,8 +322,8 @@ static uint_t *sift_rel_mcount(uint_t *mlocp, mcountsym = get_mcountsym(sym0, relp, str0); if (mcountsym == Elf_r_sym(relp) && !is_fake_mcount(relp)) { - uint_t const addend = _w(_w(relp->r_offset) - recval); - + uint_t const addend = + _w(_w(relp->r_offset) - recval + mcount_adjust); mrelp->r_offset = _w(offbase + ((void *)mlocp - (void *)mloc0)); Elf_r_info(mrelp, recsym, reltype); -- cgit v1.1 From 521ccb5c4aece609311bfa7157910a8f0c942af5 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Tue, 10 May 2011 10:10:41 +0200 Subject: ftrace/x86: mcount offset calculation Do the mcount offset adjustment in the recordmcount.pl/recordmcount.[ch] at compile time and not in ftrace_call_adjust at run time. Signed-off-by: Martin Schwidefsky Signed-off-by: Steven Rostedt --- arch/x86/include/asm/ftrace.h | 7 +++---- scripts/recordmcount.c | 2 ++ scripts/recordmcount.pl | 2 ++ 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index db24c22..268c783 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -38,11 +38,10 @@ extern void mcount(void); static inline unsigned long ftrace_call_adjust(unsigned long addr) { /* - * call mcount is "e8 <4 byte offset>" - * The addr points to the 4 byte offset and the caller of this - * function wants the pointer to e8. Simply subtract one. + * addr is the address of the mcount call instruction. + * recordmcount does the necessary offset calculation. */ - return addr - 1; + return addr; } #ifdef CONFIG_DYNAMIC_FTRACE diff --git a/scripts/recordmcount.c b/scripts/recordmcount.c index 0e18975..7648a5d 100644 --- a/scripts/recordmcount.c +++ b/scripts/recordmcount.c @@ -335,6 +335,7 @@ do_file(char const *const fname) reltype = R_386_32; make_nop = make_nop_x86; ideal_nop = ideal_nop5_x86_32; + mcount_adjust_32 = -1; break; case EM_ARM: reltype = R_ARM_ABS32; altmcount = "__gnu_mcount_nc"; @@ -350,6 +351,7 @@ do_file(char const *const fname) make_nop = make_nop_x86; ideal_nop = ideal_nop5_x86_64; reltype = R_X86_64_64; + mcount_adjust_64 = -1; break; } /* end switch */ diff --git a/scripts/recordmcount.pl b/scripts/recordmcount.pl index a871cd4..414e7f5 100755 --- a/scripts/recordmcount.pl +++ b/scripts/recordmcount.pl @@ -223,6 +223,7 @@ if ($arch eq "x86_64") { $mcount_regex = "^\\s*([0-9a-fA-F]+):.*\\smcount([+-]0x[0-9a-zA-Z]+)?\$"; $type = ".quad"; $alignment = 8; + $mcount_adjust = -1; # force flags for this arch $ld .= " -m elf_x86_64"; @@ -232,6 +233,7 @@ if ($arch eq "x86_64") { } elsif ($arch eq "i386") { $alignment = 4; + $mcount_adjust = -1; # force flags for this arch $ld .= " -m elf_i386"; -- cgit v1.1 From f29638868280534ed7e2fdd93b31557232597940 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Tue, 10 May 2011 10:10:43 +0200 Subject: ftrace/s390: mcount offset calculation Do the mcount offset adjustment in the recordmcount.pl/recordmcount.[ch] at compile time and not in ftrace_call_adjust at run time. Signed-off-by: Martin Schwidefsky Signed-off-by: Steven Rostedt --- arch/s390/include/asm/ftrace.h | 4 +--- scripts/recordmcount.c | 8 ++++++-- scripts/recordmcount.pl | 2 ++ 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/arch/s390/include/asm/ftrace.h b/arch/s390/include/asm/ftrace.h index 3c29be4..b7931fa 100644 --- a/arch/s390/include/asm/ftrace.h +++ b/arch/s390/include/asm/ftrace.h @@ -11,15 +11,13 @@ struct dyn_arch_ftrace { }; #ifdef CONFIG_64BIT #define MCOUNT_INSN_SIZE 12 -#define MCOUNT_OFFSET 8 #else #define MCOUNT_INSN_SIZE 20 -#define MCOUNT_OFFSET 4 #endif static inline unsigned long ftrace_call_adjust(unsigned long addr) { - return addr - MCOUNT_OFFSET; + return addr; } #endif /* __ASSEMBLY__ */ diff --git a/scripts/recordmcount.c b/scripts/recordmcount.c index 7648a5d..ee52cb8 100644 --- a/scripts/recordmcount.c +++ b/scripts/recordmcount.c @@ -368,8 +368,10 @@ do_file(char const *const fname) "unrecognized ET_REL file: %s\n", fname); fail_file(); } - if (w2(ehdr->e_machine) == EM_S390) + if (w2(ehdr->e_machine) == EM_S390) { reltype = R_390_32; + mcount_adjust_32 = -4; + } if (w2(ehdr->e_machine) == EM_MIPS) { reltype = R_MIPS_32; is_fake_mcount32 = MIPS32_is_fake_mcount; @@ -384,8 +386,10 @@ do_file(char const *const fname) "unrecognized ET_REL file: %s\n", fname); fail_file(); } - if (w2(ghdr->e_machine) == EM_S390) + if (w2(ghdr->e_machine) == EM_S390) { reltype = R_390_64; + mcount_adjust_64 = -8; + } if (w2(ghdr->e_machine) == EM_MIPS) { reltype = R_MIPS_64; Elf64_r_sym = MIPS64_r_sym; diff --git a/scripts/recordmcount.pl b/scripts/recordmcount.pl index 414e7f5..858966a 100755 --- a/scripts/recordmcount.pl +++ b/scripts/recordmcount.pl @@ -243,12 +243,14 @@ if ($arch eq "x86_64") { } elsif ($arch eq "s390" && $bits == 32) { $mcount_regex = "^\\s*([0-9a-fA-F]+):\\s*R_390_32\\s+_mcount\$"; + $mcount_adjust = -4; $alignment = 4; $ld .= " -m elf_s390"; $cc .= " -m31"; } elsif ($arch eq "s390" && $bits == 64) { $mcount_regex = "^\\s*([0-9a-fA-F]+):\\s*R_390_(PC|PLT)32DBL\\s+_mcount\\+0x2\$"; + $mcount_adjust = -8; $alignment = 8; $type = ".quad"; $ld .= " -m elf64_s390"; -- cgit v1.1 From 867955f5682f7157fdafe8670804b9f8ea077bc7 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Mon, 16 May 2011 06:13:49 +0000 Subject: sfc: Fix oops in register dump after mapping change Commit 747df2258b1b9a2e25929ef496262c339c380009 ('sfc: Always map MCDI shared memory as uncacheable') introduced a separate mapping for the MCDI shared memory (MC_TREG_SMEM). This means we can no longer easily include it in the register dump. Since it is not particularly useful in debugging, substitute a recognisable dummy value. Signed-off-by: Ben Hutchings Signed-off-by: David S. Miller --- drivers/net/sfc/nic.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/net/sfc/nic.c b/drivers/net/sfc/nic.c index 10f1cb7..9b29a8d 100644 --- a/drivers/net/sfc/nic.c +++ b/drivers/net/sfc/nic.c @@ -1937,6 +1937,13 @@ void efx_nic_get_regs(struct efx_nic *efx, void *buf) size = min_t(size_t, table->step, 16); + if (table->offset >= efx->type->mem_map_size) { + /* No longer mapped; return dummy data */ + memcpy(buf, "\xde\xc0\xad\xde", 4); + buf += table->rows * size; + continue; + } + for (i = 0; i < table->rows; i++) { switch (table->step) { case 4: /* 32-bit register or SRAM */ -- cgit v1.1 From ebde6f8acba92abfc203585198a54f47e83e2cd0 Mon Sep 17 00:00:00 2001 From: Thomas Jarosch Date: Mon, 16 May 2011 06:28:15 +0000 Subject: vmxnet3: Fix inconsistent LRO state after initialization During initialization of vmxnet3, the state of LRO gets out of sync with netdev->features. This leads to very poor TCP performance in a IP forwarding setup and is hitting many VMware users. Simplified call sequence: 1. vmxnet3_declare_features() initializes "adapter->lro" to true. 2. The kernel automatically disables LRO if IP forwarding is enabled, so vmxnet3_set_flags() gets called. This also updates netdev->features. 3. Now vmxnet3_setup_driver_shared() is called. "adapter->lro" is still set to true and LRO gets enabled again, even though netdev->features shows it's disabled. Fix it by updating "adapter->lro", too. The private vmxnet3 adapter flags are scheduled for removal in net-next, see commit a0d2730c9571aeba793cb5d3009094ee1d8fda35 "net: vmxnet3: convert to hw_features". Patch applies to 2.6.37 / 2.6.38 and 2.6.39-rc6. Please CC: comments. Signed-off-by: Thomas Jarosch Acked-by: Stephen Hemminger Signed-off-by: David S. Miller --- drivers/net/vmxnet3/vmxnet3_ethtool.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/vmxnet3/vmxnet3_ethtool.c b/drivers/net/vmxnet3/vmxnet3_ethtool.c index 51f2ef1..9764672 100644 --- a/drivers/net/vmxnet3/vmxnet3_ethtool.c +++ b/drivers/net/vmxnet3/vmxnet3_ethtool.c @@ -311,6 +311,9 @@ vmxnet3_set_flags(struct net_device *netdev, u32 data) /* toggle the LRO feature*/ netdev->features ^= NETIF_F_LRO; + /* Update private LRO flag */ + adapter->lro = lro_requested; + /* update harware LRO capability accordingly */ if (lro_requested) adapter->shared->devRead.misc.uptFeatures |= -- cgit v1.1 From 6f404e441d169afc90929ef5e451ec9779c1f11a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Mon, 16 May 2011 15:14:21 -0400 Subject: net: Change netdev_fix_features messages loglevel MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Those reduced to DEBUG can possibly be triggered by unprivileged processes and are nothing exceptional. Illegal checksum combinations can only be caused by driver bug, so promote those messages to WARN. Since GSO without SG will now only cause DEBUG message from netdev_fix_features(), remove the workaround from register_netdevice(). Signed-off-by: MichaÅ‚ MirosÅ‚aw Signed-off-by: David S. Miller --- net/core/dev.c | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index 9200944..b624fe4 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5186,27 +5186,27 @@ u32 netdev_fix_features(struct net_device *dev, u32 features) /* Fix illegal checksum combinations */ if ((features & NETIF_F_HW_CSUM) && (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) { - netdev_info(dev, "mixed HW and IP checksum settings.\n"); + netdev_warn(dev, "mixed HW and IP checksum settings.\n"); features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM); } if ((features & NETIF_F_NO_CSUM) && (features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) { - netdev_info(dev, "mixed no checksumming and other settings.\n"); + netdev_warn(dev, "mixed no checksumming and other settings.\n"); features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM); } /* Fix illegal SG+CSUM combinations. */ if ((features & NETIF_F_SG) && !(features & NETIF_F_ALL_CSUM)) { - netdev_info(dev, - "Dropping NETIF_F_SG since no checksum feature.\n"); + netdev_dbg(dev, + "Dropping NETIF_F_SG since no checksum feature.\n"); features &= ~NETIF_F_SG; } /* TSO requires that SG is present as well. */ if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) { - netdev_info(dev, "Dropping TSO features since no SG feature.\n"); + netdev_dbg(dev, "Dropping TSO features since no SG feature.\n"); features &= ~NETIF_F_ALL_TSO; } @@ -5216,7 +5216,7 @@ u32 netdev_fix_features(struct net_device *dev, u32 features) /* Software GSO depends on SG. */ if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) { - netdev_info(dev, "Dropping NETIF_F_GSO since no SG feature.\n"); + netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n"); features &= ~NETIF_F_GSO; } @@ -5226,13 +5226,13 @@ u32 netdev_fix_features(struct net_device *dev, u32 features) if (!((features & NETIF_F_GEN_CSUM) || (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM)) == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) { - netdev_info(dev, + netdev_dbg(dev, "Dropping NETIF_F_UFO since no checksum offload features.\n"); features &= ~NETIF_F_UFO; } if (!(features & NETIF_F_SG)) { - netdev_info(dev, + netdev_dbg(dev, "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n"); features &= ~NETIF_F_UFO; } @@ -5414,12 +5414,6 @@ int register_netdevice(struct net_device *dev) dev->features |= NETIF_F_SOFT_FEATURES; dev->wanted_features = dev->features & dev->hw_features; - /* Avoid warning from netdev_fix_features() for GSO without SG */ - if (!(dev->wanted_features & NETIF_F_SG)) { - dev->wanted_features &= ~NETIF_F_GSO; - dev->features &= ~NETIF_F_GSO; - } - /* Enable GRO and NETIF_F_HIGHDMA for vlans by default, * vlan_dev_init() will do the dev->features check, so these features * are enabled only if supported by underlying device. -- cgit v1.1 From 865be7a81071a77014c83cd01536c989eed362b4 Mon Sep 17 00:00:00 2001 From: Ondrej Zary Date: Mon, 16 May 2011 21:38:08 +0200 Subject: x86, cpu: Fix detection of Celeron Covington stepping A1 and B0 Steppings A1 and B0 of Celeron Covington are currently misdetected as Pentium II (Dixon). Fix it by removing the stepping check. [ hpa: this fixes this specific bug... the CPUID documentation specifies that the L2 cache size can disambiguate additional CPUs; this patch does not fix that. ] Signed-off-by: Ondrej Zary Link: http://lkml.kernel.org/r/201105162138.15416.linux@rainbow-software.org Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/intel.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index df86bc8..32e86aa 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -400,12 +400,10 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) switch (c->x86_model) { case 5: - if (c->x86_mask == 0) { - if (l2 == 0) - p = "Celeron (Covington)"; - else if (l2 == 256) - p = "Mobile Pentium II (Dixon)"; - } + if (l2 == 0) + p = "Celeron (Covington)"; + else if (l2 == 256) + p = "Mobile Pentium II (Dixon)"; break; case 6: -- cgit v1.1 From dc382fd5bcca7098a984705ed6ac880f539d068e Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 16 May 2011 13:54:10 -0700 Subject: x86, mm: Allow ZONE_DMA to be configurable ZONE_DMA is unnecessary for a large number of machines that do not require less than 32-bit DMA addressing, e.g. ISA legacy DMA or PCI cards with a restricted DMA address mask. This patch allows users to disable ZONE_DMA for x86 if they know they will not be using such devices with their kernel. This prevents the VM from unnecessarily reserving a ratio of memory (defaulting to 1/256th of system capacity) with lowmem_reserve_ratio for such allocations when it will never be used. Signed-off-by: David Rientjes Link: http://lkml.kernel.org/r/alpine.DEB.2.00.1105161353560.4353@chino.kir.corp.google.com Signed-off-by: H. Peter Anvin --- arch/x86/Kconfig | 9 ++++++++- arch/x86/mm/init_32.c | 2 ++ arch/x86/mm/init_64.c | 2 ++ 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 648fca4..0eb801a 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -112,7 +112,14 @@ config MMU def_bool y config ZONE_DMA - def_bool y + bool "DMA memory allocation support" if EXPERT + default y + help + DMA memory allocation support allows devices with less than 32-bit + addressing to allocate within the first 16MB of address space. + Disable if no such devices will be used. + + If unsure, say Y. config SBUS bool diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 2cde0a34..29f7c6d9 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -678,8 +678,10 @@ static void __init zone_sizes_init(void) { unsigned long max_zone_pfns[MAX_NR_ZONES]; memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); +#ifdef CONFIG_ZONE_DMA max_zone_pfns[ZONE_DMA] = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; +#endif max_zone_pfns[ZONE_NORMAL] = max_low_pfn; #ifdef CONFIG_HIGHMEM max_zone_pfns[ZONE_HIGHMEM] = highend_pfn; diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 0404bb3..d865c4ae 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -616,7 +616,9 @@ void __init paging_init(void) unsigned long max_zone_pfns[MAX_NR_ZONES]; memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); +#ifdef CONFIG_ZONE_DMA max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; +#endif max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; max_zone_pfns[ZONE_NORMAL] = max_pfn; -- cgit v1.1 From 07f4beb0b5bbfaf36a64aa00d59e670ec578a95a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 16 May 2011 11:07:48 +0200 Subject: tick: Clear broadcast active bit when switching to oneshot The first cpu which switches from periodic to oneshot mode switches also the broadcast device into oneshot mode. The broadcast device serves as a backup for per cpu timers which stop in deeper C-states. To avoid starvation of the cpus which might be in idle and depend on broadcast mode it marks the other cpus as broadcast active and sets the brodcast expiry value of those cpus to the next tick. The oneshot mode broadcast bit for the other cpus is sticky and gets only cleared when those cpus exit idle. If a cpu was not idle while the bit got set in consequence the bit prevents that the broadcast device is armed on behalf of that cpu when it enters idle for the first time after it switched to oneshot mode. In most cases that goes unnoticed as one of the other cpus has usually a timer pending which keeps the broadcast device armed with a short timeout. Now if the only cpu which has a short timer active has the bit set then the broadcast device will not be armed on behalf of that cpu and will fire way after the expected timer expiry. In the case of Christians bug report it took ~145 seconds which is about half of the wrap around time of HPET (the limit for that device) due to the fact that all other cpus had no timers armed which expired before the 145 seconds timeframe. The solution is simply to clear the broadcast active bit unconditionally when a cpu switches to oneshot mode after the first cpu switched the broadcast device over. It's not idle at that point otherwise it would not be executing that code. [ I fundamentally hate that broadcast crap. Why the heck thought some folks that when going into deep idle it's a brilliant concept to switch off the last device which brings the cpu back from that state? ] Thanks to Christian for providing all the valuable debug information! Reported-and-tested-by: Christian Hoffmann Cc: John Stultz Link: http://lkml.kernel.org/r/%3Calpine.LFD.2.02.1105161105170.3078%40ionos%3E Cc: stable@kernel.org Signed-off-by: Thomas Gleixner --- kernel/time/tick-broadcast.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index da800ff..723c763 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -522,10 +522,11 @@ static void tick_broadcast_init_next_event(struct cpumask *mask, */ void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { + int cpu = smp_processor_id(); + /* Set it up only once ! */ if (bc->event_handler != tick_handle_oneshot_broadcast) { int was_periodic = bc->mode == CLOCK_EVT_MODE_PERIODIC; - int cpu = smp_processor_id(); bc->event_handler = tick_handle_oneshot_broadcast; clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); @@ -551,6 +552,15 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc) tick_broadcast_set_event(tick_next_period, 1); } else bc->next_event.tv64 = KTIME_MAX; + } else { + /* + * The first cpu which switches to oneshot mode sets + * the bit for all other cpus which are in the general + * (periodic) broadcast mask. So the bit is set and + * would prevent the first broadcast enter after this + * to program the bc device. + */ + tick_broadcast_clear_oneshot(cpu); } } -- cgit v1.1 From 93d2175d3d31f11ba04fcfa0e9a496a1b4bc8b34 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 13 May 2011 18:06:17 -0700 Subject: PCI: Clear bridge resource flags if requested size is 0 During pci remove/rescan testing found: pci 0000:c0:03.0: PCI bridge to [bus c4-c9] pci 0000:c0:03.0: bridge window [io 0x1000-0x0fff] pci 0000:c0:03.0: bridge window [mem 0xf0000000-0xf00fffff] pci 0000:c0:03.0: bridge window [mem 0xfc180000000-0xfc197ffffff 64bit pref] pci 0000:c0:03.0: device not available (can't reserve [io 0x1000-0x0fff]) pci 0000:c0:03.0: Error enabling bridge (-22), continuing pci 0000:c0:03.0: enabling bus mastering pci 0000:c0:03.0: setting latency timer to 64 pcieport 0000:c0:03.0: device not available (can't reserve [io 0x1000-0x0fff]) pcieport: probe of 0000:c0:03.0 failed with error -22 This bug was caused by commit c8adf9a3e873 ("PCI: pre-allocate additional resources to devices only after successful allocation of essential resources.") After that commit, pci_hotplug_io_size is changed to additional_io_size from minium size. So it will not go through resource_size(res) != 0 path, and will not be reset. The root cause is: pci_bridge_check_ranges will set RESOURCE_IO flag for pci bridge, and later if children do not need IO resource. those bridge resources will not need to be allocated. but flags is still there. that will confuse the the pci_enable_bridges later. related code: static void assign_requested_resources_sorted(struct resource_list *head, struct resource_list_x *fail_head) { struct resource *res; struct resource_list *list; int idx; for (list = head->next; list; list = list->next) { res = list->res; idx = res - &list->dev->resource[0]; if (resource_size(res) && pci_assign_resource(list->dev, idx)) { ... reset_resource(res); } } } At last, We have to clear the flags in pbus_size_mem/io when requested size == 0 and !add_head. becasue this case it will not go through adjust_resources_sorted(). Just make size1 = size0 when !add_head. it will make flags get cleared. At the same time when requested size == 0, add_size != 0, will still have in head and add_list. because we do not clear the flags for it. After this, we will get right result: pci 0000:c0:03.0: PCI bridge to [bus c4-c9] pci 0000:c0:03.0: bridge window [io disabled] pci 0000:c0:03.0: bridge window [mem 0xf0000000-0xf00fffff] pci 0000:c0:03.0: bridge window [mem 0xfc180000000-0xfc197ffffff 64bit pref] pci 0000:c0:03.0: enabling bus mastering pci 0000:c0:03.0: setting latency timer to 64 pcieport 0000:c0:03.0: setting latency timer to 64 pcieport 0000:c0:03.0: irq 160 for MSI/MSI-X pcieport 0000:c0:03.0: Signaling PME through PCIe PME interrupt pci 0000:c4:00.0: Signaling PME through PCIe PME interrupt pcie_pme 0000:c0:03.0:pcie01: service driver pcie_pme loaded aer 0000:c0:03.0:pcie02: service driver aer loaded pciehp 0000:c0:03.0:pcie04: Hotplug Controller: v3: more simple fix. also fix one typo in pbus_size_mem Signed-off-by: Yinghai Lu Reviewed-by: Ram Pai Cc: Jesse Barnes Cc: Bjorn Helgaas Signed-off-by: Linus Torvalds --- drivers/pci/setup-bus.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index ebf51ad..a806cb3 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -579,7 +579,7 @@ static void pbus_size_io(struct pci_bus *bus, resource_size_t min_size, } size0 = calculate_iosize(size, min_size, size1, resource_size(b_res), 4096); - size1 = !add_size? size0: + size1 = (!add_head || (add_head && !add_size)) ? size0 : calculate_iosize(size, min_size+add_size, size1, resource_size(b_res), 4096); if (!size0 && !size1) { @@ -677,7 +677,7 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask, align += aligns[order]; } size0 = calculate_memsize(size, min_size, 0, resource_size(b_res), min_align); - size1 = !add_size ? size : + size1 = (!add_head || (add_head && !add_size)) ? size0 : calculate_memsize(size, min_size+add_size, 0, resource_size(b_res), min_align); if (!size0 && !size1) { -- cgit v1.1 From b5e6ab589d570ac79cc939517fab05c87a23c262 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 16 May 2011 13:16:54 -0700 Subject: mm: fix kernel-doc warning in page_alloc.c Fix new kernel-doc warning in mm/page_alloc.c: Warning(mm/page_alloc.c:2370): No description found for parameter 'nid' Signed-off-by: Randy Dunlap Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 570d944..3f8bce2 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2358,6 +2358,7 @@ EXPORT_SYMBOL(alloc_pages_exact); /** * alloc_pages_exact_nid - allocate an exact number of physically-contiguous * pages on a node. + * @nid: the preferred node ID where memory should be allocated * @size: the number of bytes to allocate * @gfp_mask: GFP flags for the allocation * -- cgit v1.1 From 9937a5e2f32892db0dbeefc2b3bc74b3ae3ea9c7 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 17 May 2011 11:04:44 +0200 Subject: scsi: remove performance regression due to async queue run Commit c21e6beb removed our queue request_fn re-enter protection, and defaulted to always running the queues from kblockd to be safe. This was a known potential slow down, but should be safe. Unfortunately this is causing big performance regressions for some, so we need to improve this logic. Looking into the details of the re-enter, the real issue is on requeue of requests. Requeue of requests upon seeing a BUSY condition from the device ends up re-running the queue, causing traces like this: scsi_request_fn() scsi_dispatch_cmd() scsi_queue_insert() __scsi_queue_insert() scsi_run_queue() scsi_request_fn() ... potentially causing the issue we want to avoid. So special case the requeue re-run of the queue, but improve it to offload the entire run of local queue and starved queue from a single workqueue callback. This is a lot better than potentially kicking off a workqueue run for each device seen. This also fixes the issue of the local device going into recursion, since the above mentioned commit never moved that queue run out of line. Signed-off-by: Jens Axboe --- drivers/scsi/scsi_lib.c | 20 ++++++++++++++++---- drivers/scsi/scsi_scan.c | 2 ++ include/scsi/scsi_device.h | 1 + 3 files changed, 19 insertions(+), 4 deletions(-) diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index e9901b8..01e4e51 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -74,8 +74,6 @@ struct kmem_cache *scsi_sdb_cache; */ #define SCSI_QUEUE_DELAY 3 -static void scsi_run_queue(struct request_queue *q); - /* * Function: scsi_unprep_request() * @@ -161,7 +159,7 @@ static int __scsi_queue_insert(struct scsi_cmnd *cmd, int reason, int unbusy) blk_requeue_request(q, cmd->request); spin_unlock_irqrestore(q->queue_lock, flags); - scsi_run_queue(q); + kblockd_schedule_work(q, &device->requeue_work); return 0; } @@ -433,7 +431,11 @@ static void scsi_run_queue(struct request_queue *q) continue; } - blk_run_queue_async(sdev->request_queue); + spin_unlock(shost->host_lock); + spin_lock(sdev->request_queue->queue_lock); + __blk_run_queue(sdev->request_queue); + spin_unlock(sdev->request_queue->queue_lock); + spin_lock(shost->host_lock); } /* put any unprocessed entries back */ list_splice(&starved_list, &shost->starved_list); @@ -442,6 +444,16 @@ static void scsi_run_queue(struct request_queue *q) blk_run_queue(q); } +void scsi_requeue_run_queue(struct work_struct *work) +{ + struct scsi_device *sdev; + struct request_queue *q; + + sdev = container_of(work, struct scsi_device, requeue_work); + q = sdev->request_queue; + scsi_run_queue(q); +} + /* * Function: scsi_requeue_command() * diff --git a/drivers/scsi/scsi_scan.c b/drivers/scsi/scsi_scan.c index 087821f..58584dc 100644 --- a/drivers/scsi/scsi_scan.c +++ b/drivers/scsi/scsi_scan.c @@ -242,6 +242,7 @@ static struct scsi_device *scsi_alloc_sdev(struct scsi_target *starget, int display_failure_msg = 1, ret; struct Scsi_Host *shost = dev_to_shost(starget->dev.parent); extern void scsi_evt_thread(struct work_struct *work); + extern void scsi_requeue_run_queue(struct work_struct *work); sdev = kzalloc(sizeof(*sdev) + shost->transportt->device_size, GFP_ATOMIC); @@ -264,6 +265,7 @@ static struct scsi_device *scsi_alloc_sdev(struct scsi_target *starget, INIT_LIST_HEAD(&sdev->event_list); spin_lock_init(&sdev->list_lock); INIT_WORK(&sdev->event_work, scsi_evt_thread); + INIT_WORK(&sdev->requeue_work, scsi_requeue_run_queue); sdev->sdev_gendev.parent = get_device(&starget->dev); sdev->sdev_target = starget; diff --git a/include/scsi/scsi_device.h b/include/scsi/scsi_device.h index 2d3ec50..dd82e02 100644 --- a/include/scsi/scsi_device.h +++ b/include/scsi/scsi_device.h @@ -169,6 +169,7 @@ struct scsi_device { sdev_dev; struct execute_work ew; /* used to get process context on put */ + struct work_struct requeue_work; struct scsi_dh_data *scsi_dh_data; enum scsi_device_state sdev_state; -- cgit v1.1 From 328935e6348c6a7cb34798a68c326f4b8372e68a Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 17 May 2011 14:55:18 +0200 Subject: Revert "x86, AMD: Fix APIC timer erratum 400 affecting K8 Rev.A-E processors" This reverts commit e20a2d205c05cef6b5783df339a7d54adeb50962, as it crashes certain boxes with specific AMD CPU models. Moving the lower endpoint of the Erratum 400 check to accomodate earlier K8 revisions (A-E) opens a can of worms which is simply not worth to fix properly by tweaking the errata checking framework: * missing IntPenging MSR on revisions < CG cause #GP: http://marc.info/?l=linux-kernel&m=130541471818831 * makes earlier revisions use the LAPIC timer instead of the C1E idle routine which switches to HPET, thus not waking up in deeper C-states: http://lkml.org/lkml/2011/4/24/20 Therefore, leave the original boundary starting with K8-revF. Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/amd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index bb9eb29..3532d3b 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -698,7 +698,7 @@ cpu_dev_register(amd_cpu_dev); */ const int amd_erratum_400[] = - AMD_OSVW_ERRATUM(1, AMD_MODEL_RANGE(0x0f, 0x4, 0x2, 0xff, 0xf), + AMD_OSVW_ERRATUM(1, AMD_MODEL_RANGE(0xf, 0x41, 0x2, 0xff, 0xf), AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0xff, 0xf)); EXPORT_SYMBOL_GPL(amd_erratum_400); -- cgit v1.1 From 14fb57dccb6e1defe9f89a66f548fcb24c374c1d Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 17 May 2011 14:55:19 +0200 Subject: x86, AMD: Fix ARAT feature setting again Trying to enable the local APIC timer on early K8 revisions uncovers a number of other issues with it, in conjunction with the C1E enter path on AMD. Fixing those causes much more churn and troubles than the benefit of using that timer brings so don't enable it on K8 at all, falling back to the original functionality the kernel had wrt to that. Reported-and-bisected-by: Nick Bowler Cc: Boris Ostrovsky Cc: Andreas Herrmann Cc: Greg Kroah-Hartman Cc: Hans Rosenfeld Cc: Nick Bowler Cc: Joerg-Volker-Peetz Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/1305636919-31165-3-git-send-email-bp@amd64.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/amd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 3532d3b..6f9d1f6 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -613,7 +613,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) #endif /* As a rule processors have APIC timer running in deep C states */ - if (c->x86 >= 0xf && !cpu_has_amd_erratum(amd_erratum_400)) + if (c->x86 > 0xf && !cpu_has_amd_erratum(amd_erratum_400)) set_cpu_cap(c, X86_FEATURE_ARAT); /* -- cgit v1.1 From 94692349c4fc1bc74c19a28f9379509361a06a3b Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Tue, 17 May 2011 15:36:19 +0200 Subject: perf: Fix multi-event parsing bug This patch fixes an issue with event parsing. The following commit appears to have broken the ability to specify a comma separated list of events: commit ceb53fbf6dbb1df26d38379a262c6981fe73dd36 Author: Ingo Molnar Date: Wed Apr 27 04:06:33 2011 +0200 perf stat: Fail more clearly when an invalid modifier is specified This patch fixes this while preserving the desired effect: $ perf stat -e instructions:u,instructions:k ls /dev/null /dev/null Performance counter stats for 'ls /dev/null': 365956 instructions:u # 0.00 insns per cycle 731806 instructions:k # 0.00 insns per cycle 0.001108862 seconds time elapsed $ perf stat -e task-clock-msecs true invalid event modifier: '-msecs' Run 'perf list' for a list of valid events and modifiers Signed-off-by: Stephane Eranian Cc: acme@redhat.com Cc: peterz@infradead.org Cc: fweisbec@gmail.com Link: http://lkml.kernel.org/r/20110517133619.GA6999@quad Signed-off-by: Ingo Molnar --- tools/perf/util/parse-events.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index ffa493a..41982c3 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -734,6 +734,9 @@ parse_event_modifier(const char **strp, struct perf_event_attr *attr) if (!*str) return 0; + if (*str == ',') + return 0; + if (*str++ != ':') return -1; -- cgit v1.1 From 221d1d797202984cb874e3ed9f1388593d34ee22 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 17 May 2011 06:40:30 -0400 Subject: cifs: add fallback in is_path_accessible for old servers The is_path_accessible check uses a QPathInfo call, which isn't supported by ancient win9x era servers. Fall back to an older SMBQueryInfo call if it fails with the magic error codes. Cc: stable@kernel.org Reported-and-Tested-by: Sandro Bonazzola Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/connect.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 05f1dcf..277262a 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -2673,6 +2673,11 @@ is_path_accessible(int xid, struct cifsTconInfo *tcon, 0 /* not legacy */, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + + if (rc == -EOPNOTSUPP || rc == -EINVAL) + rc = SMBQueryInformation(xid, tcon, full_path, pfile_info, + cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); kfree(pfile_info); return rc; } -- cgit v1.1 From 2494b030ba9334c7dd7df9b9f7abe4eacc950ec5 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Tue, 17 May 2011 12:33:26 -0700 Subject: x86, cpufeature: Fix cpuid leaf 7 feature detection CPUID leaf 7, subleaf 0 returns the maximum subleaf in EAX, not the number of subleaves. Since so far only subleaf 0 is defined (and only the EBX bitfield) we do not need to qualify the test. Signed-off-by: Fenghua Yu Link: http://lkml.kernel.org/r/1305660806-17519-1-git-send-email-fenghua.yu@intel.com Signed-off-by: H. Peter Anvin Cc: 2.6.36..39 --- arch/x86/kernel/cpu/common.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index e2ced00..173f3a3 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -565,8 +565,7 @@ void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c) cpuid_count(0x00000007, 0, &eax, &ebx, &ecx, &edx); - if (eax > 0) - c->x86_capability[9] = ebx; + c->x86_capability[9] = ebx; } /* AMD-defined flags: level 0x80000001 */ -- cgit v1.1 From 11379b5e33950048ad66825da7f462b0d0da9d73 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 17 May 2011 15:28:21 -0400 Subject: cifs: fix cifsConvertToUCS() for the mapchars case As Metze pointed out, commit 84cdf74e broke mapchars option: Commit "cifs: fix unaligned accesses in cifsConvertToUCS" (84cdf74e8096a10dd6acbb870dd404b92f07a756) does multiple steps in just one commit (moving the function and changing it without testing). put_unaligned_le16(temp, &target[j]); is never called for any codepoint the goes via the 'default' switch statement. As a result we put just zero (or maybe uninitialized) bytes into the target buffer. His proposed patch looks correct, but doesn't apply to the current head of the tree. This patch should also fix it. Cc: # .38.x: 581ade4: cifs: clean up various nits in unicode routines (try #2) Reported-by: Stefan Metzmacher Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifs_unicode.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c index 23d43cd..1b2e180 100644 --- a/fs/cifs/cifs_unicode.c +++ b/fs/cifs/cifs_unicode.c @@ -277,6 +277,7 @@ cifsConvertToUCS(__le16 *target, const char *source, int srclen, for (i = 0, j = 0; i < srclen; j++) { src_char = source[i]; + charlen = 1; switch (src_char) { case 0: put_unaligned(0, &target[j]); @@ -316,16 +317,13 @@ cifsConvertToUCS(__le16 *target, const char *source, int srclen, dst_char = cpu_to_le16(0x003f); charlen = 1; } - /* - * character may take more than one byte in the source - * string, but will take exactly two bytes in the - * target string - */ - i += charlen; - continue; } + /* + * character may take more than one byte in the source string, + * but will take exactly two bytes in the target string + */ + i += charlen; put_unaligned(dst_char, &target[j]); - i++; /* move to next char in source string */ } ctoUCS_out: -- cgit v1.1 From 3c431936087e93d2219a184a8e19eaa68077e379 Mon Sep 17 00:00:00 2001 From: MyungJoo Ham Date: Fri, 22 Apr 2011 22:00:54 +0200 Subject: PM / Suspend: Do not ignore error codes returned by suspend_enter() The current implementation of suspend-to-RAM returns 0 if there is an error from suspend_enter(), because suspend_devices_and_enter() ignores the return value from suspend_enter(). This patch addresses this issue and properly keep the error return from suspend_enter() and let suspend_devices_and_enter relay the error return. Signed-off-by: MyungJoo Ham Signed-off-by: Kyungmin Park Signed-off-by: Rafael J. Wysocki --- kernel/power/suspend.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 6275970..fa9fabd 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -226,7 +226,7 @@ int suspend_devices_and_enter(suspend_state_t state) if (suspend_test(TEST_DEVICES)) goto Recover_platform; - suspend_enter(state); + error = suspend_enter(state); Resume_devices: suspend_test_start(); -- cgit v1.1 From ee940d8dccd899aa1777ea84da3d9cd04b1d2e8e Mon Sep 17 00:00:00 2001 From: Mike Frysinger Date: Mon, 25 Apr 2011 12:33:15 +0200 Subject: Freezer: Use SMP barriers The freezer processes are dealing with multiple threads running simultaneously, and on a UP system, the memory reads/writes do not need barriers to keep things in sync. These are only needed on SMP systems, so use SMP barriers instead. Signed-off-by: Mike Frysinger Acked-by: Pavel Machek Signed-off-by: Rafael J. Wysocki --- kernel/freezer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/freezer.c b/kernel/freezer.c index 66ecd2e..7b01de98 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -17,7 +17,7 @@ static inline void frozen_process(void) { if (!unlikely(current->flags & PF_NOFREEZE)) { current->flags |= PF_FROZEN; - wmb(); + smp_wmb(); } clear_freeze_flag(current); } @@ -93,7 +93,7 @@ bool freeze_task(struct task_struct *p, bool sig_only) * the task as frozen and next clears its TIF_FREEZE. */ if (!freezing(p)) { - rmb(); + smp_rmb(); if (frozen(p)) return false; -- cgit v1.1 From e1866b33b1e89f077b7132daae3dfd9a594e9a1a Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 29 Apr 2011 00:33:45 +0200 Subject: PM / Runtime: Rework runtime PM handling during driver removal The driver core tries to prevent race conditions between runtime PM and driver removal from happening by incrementing the runtime PM usage counter of the device and executing pm_runtime_barrier() before running the bus notifier and the ->remove() callbacks provided by the device's subsystem or driver. This guarantees that, if a future runtime suspend of the device has been scheduled or a runtime resume or idle request has been queued up right before the driver removal, it will be canceled or waited for to complete and no other asynchronous runtime suspend or idle requests for the device will be put into the PM workqueue until the ->remove() callback returns. However, it doesn't prevent resume requests from being queued up after pm_runtime_barrier() has been called and it doesn't prevent pm_runtime_resume() from executing the device subsystem's runtime resume callback. Morever, it prevents the device's subsystem or driver from putting the device into the suspended state by calling pm_runtime_suspend() from its ->remove() routine. This turns out to be a major inconvenience for some subsystems and drivers that want to leave the devices they handle in the suspended state. To really prevent runtime PM callbacks from racing with the bus notifier callback in __device_release_driver(), which is necessary, because the notifier is used by some subsystems to carry out operations affecting the runtime PM functionality, use pm_runtime_get_sync() instead of the combination of pm_runtime_get_noresume() and pm_runtime_barrier(). This will resume the device if it's in the suspended state and will prevent it from being suspended again until pm_runtime_put_*() is called. To allow subsystems and drivers to put devices into the suspended state by calling pm_runtime_suspend() from their ->remove() routines, execute pm_runtime_put_sync() after running the bus notifier in __device_release_driver(). This will require subsystems and drivers to make their ->remove() callbacks avoid races with runtime PM directly, but it will allow of more flexibility in the handling of devices during the removal of their drivers. Signed-off-by: Rafael J. Wysocki --- drivers/base/dd.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/base/dd.c b/drivers/base/dd.c index da57ee9..29917c7 100644 --- a/drivers/base/dd.c +++ b/drivers/base/dd.c @@ -316,8 +316,7 @@ static void __device_release_driver(struct device *dev) drv = dev->driver; if (drv) { - pm_runtime_get_noresume(dev); - pm_runtime_barrier(dev); + pm_runtime_get_sync(dev); driver_sysfs_remove(dev); @@ -326,6 +325,8 @@ static void __device_release_driver(struct device *dev) BUS_NOTIFY_UNBIND_DRIVER, dev); + pm_runtime_put_sync(dev); + if (dev->bus && dev->bus->remove) dev->bus->remove(dev); else if (drv->remove) @@ -338,7 +339,6 @@ static void __device_release_driver(struct device *dev) BUS_NOTIFY_UNBOUND_DRIVER, dev); - pm_runtime_put_sync(dev); } } -- cgit v1.1 From a144c6a6c924aa1da04dd77fb84b89927354fdff Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 6 May 2011 20:09:42 +0200 Subject: PM: Print a warning if firmware is requested when tasks are frozen Some drivers erroneously use request_firmware() from their ->resume() (or ->thaw(), or ->restore()) callbacks, which is not going to work unless the firmware has been built in. This causes system resume to stall until the firmware-loading timeout expires, which makes users think that the resume has failed and reboot their machines unnecessarily. For this reason, make _request_firmware() print a warning and return immediately with error code if it has been called when tasks are frozen and it's impossible to start any new usermode helpers. Signed-off-by: Rafael J. Wysocki Acked-by: Greg Kroah-Hartman Reviewed-by: Valdis Kletnieks --- drivers/base/firmware_class.c | 5 +++++ include/linux/kmod.h | 5 +++++ kernel/kmod.c | 9 +++++++++ 3 files changed, 19 insertions(+) diff --git a/drivers/base/firmware_class.c b/drivers/base/firmware_class.c index 8c798ef..bbb03e6 100644 --- a/drivers/base/firmware_class.c +++ b/drivers/base/firmware_class.c @@ -521,6 +521,11 @@ static int _request_firmware(const struct firmware **firmware_p, if (!firmware_p) return -EINVAL; + if (WARN_ON(usermodehelper_is_disabled())) { + dev_err(device, "firmware: %s will not be loaded\n", name); + return -EBUSY; + } + *firmware_p = firmware = kzalloc(sizeof(*firmware), GFP_KERNEL); if (!firmware) { dev_err(device, "%s: kmalloc(struct firmware) failed\n", diff --git a/include/linux/kmod.h b/include/linux/kmod.h index 6efd7a7..7f3dbcb 100644 --- a/include/linux/kmod.h +++ b/include/linux/kmod.h @@ -111,7 +111,12 @@ call_usermodehelper(char *path, char **argv, char **envp, enum umh_wait wait) extern void usermodehelper_init(void); +#ifdef CONFIG_PM_SLEEP extern int usermodehelper_disable(void); extern void usermodehelper_enable(void); +extern bool usermodehelper_is_disabled(void); +#else +static inline bool usermodehelper_is_disabled(void) { return false; } +#endif #endif /* __LINUX_KMOD_H__ */ diff --git a/kernel/kmod.c b/kernel/kmod.c index 9cd0591..9ab513b 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -301,6 +301,15 @@ void usermodehelper_enable(void) usermodehelper_disabled = 0; } +/** + * usermodehelper_is_disabled - check if new helpers are allowed to be started + */ +bool usermodehelper_is_disabled(void) +{ + return usermodehelper_disabled; +} +EXPORT_SYMBOL_GPL(usermodehelper_is_disabled); + static void helper_lock(void) { atomic_inc(&running_helpers); -- cgit v1.1 From e762318baae3002944d68220910aef7caffcd065 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sat, 7 May 2011 18:11:52 +0200 Subject: PM / Wakeup: Fix build warning related to the "wakeup" sysfs file The "wakeup" device sysfs file is only created if CONFIG_PM_SLEEP is set, so put it under CONFIG_PM_SLEEP and make a build warning related to it go away. Signed-off-by: Rafael J. Wysocki Acked-by: Greg Kroah-Hartman --- drivers/base/power/sysfs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/base/power/sysfs.c b/drivers/base/power/sysfs.c index fff49be..a9f5b89 100644 --- a/drivers/base/power/sysfs.c +++ b/drivers/base/power/sysfs.c @@ -212,8 +212,9 @@ static ssize_t autosuspend_delay_ms_store(struct device *dev, static DEVICE_ATTR(autosuspend_delay_ms, 0644, autosuspend_delay_ms_show, autosuspend_delay_ms_store); -#endif +#endif /* CONFIG_PM_RUNTIME */ +#ifdef CONFIG_PM_SLEEP static ssize_t wake_show(struct device * dev, struct device_attribute *attr, char * buf) { @@ -248,7 +249,6 @@ wake_store(struct device * dev, struct device_attribute *attr, static DEVICE_ATTR(wakeup, 0644, wake_show, wake_store); -#ifdef CONFIG_PM_SLEEP static ssize_t wakeup_count_show(struct device *dev, struct device_attribute *attr, char *buf) { -- cgit v1.1 From c3b0795c98c08351567464150db66d11e05d7611 Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Tue, 10 May 2011 21:09:38 +0200 Subject: PM / ACPI: Remove acpi_sleep=s4_nonvs acpi_sleep=s4_nonvs is superseded by acpi_sleep=nonvs, so remove it. Signed-off-by: WANG Cong Acked-by: Pavel Machek Acked-by: Len Brown Signed-off-by: Rafael J. Wysocki --- Documentation/feature-removal-schedule.txt | 8 -------- Documentation/kernel-parameters.txt | 2 +- arch/x86/kernel/acpi/sleep.c | 5 ----- 3 files changed, 1 insertion(+), 14 deletions(-) diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index 492e81d..f6a24e8 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt @@ -460,14 +460,6 @@ Who: Thomas Gleixner ---------------------------- -What: The acpi_sleep=s4_nonvs command line option -When: 2.6.37 -Files: arch/x86/kernel/acpi/sleep.c -Why: superseded by acpi_sleep=nonvs -Who: Rafael J. Wysocki - ----------------------------- - What: PCI DMA unmap state API When: August 2012 Why: PCI DMA unmap state API (include/linux/pci-dma.h) was replaced diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index cc85a92..259037b 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -245,7 +245,7 @@ bytes respectively. Such letter suffixes can also be entirely omitted. acpi_sleep= [HW,ACPI] Sleep options Format: { s3_bios, s3_mode, s3_beep, s4_nohwsig, - old_ordering, s4_nonvs, sci_force_enable } + old_ordering, nonvs, sci_force_enable } See Documentation/power/video.txt for information on s3_bios and s3_mode. s3_beep is for debugging; it makes the PC's speaker beep diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index ff93bc1..18a857b 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -112,11 +112,6 @@ static int __init acpi_sleep_setup(char *str) #ifdef CONFIG_HIBERNATION if (strncmp(str, "s4_nohwsig", 10) == 0) acpi_no_s4_hw_signature(); - if (strncmp(str, "s4_nonvs", 8) == 0) { - pr_warning("ACPI: acpi_sleep=s4_nonvs is deprecated, " - "please use acpi_sleep=nonvs instead"); - acpi_nvs_nosave(); - } #endif if (strncmp(str, "nonvs", 5) == 0) acpi_nvs_nosave(); -- cgit v1.1 From 13d53f8775c6a00b070a3eef6833795412eb7fcd Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Tue, 10 May 2011 21:27:34 +0200 Subject: kmod: always provide usermodehelper_disable() We need to prevent kernel-forked processes during system poweroff. Such processes try to access the filesystem whose disks we are trying to shutdown at the same time. This causes delays and exceptions in the storage drivers. A follow-up patch will add these calls and need usermodehelper_disable() also on systems without suspend support. Signed-off-by: Kay Sievers Signed-off-by: Rafael J. Wysocki --- include/linux/kmod.h | 4 ---- kernel/kmod.c | 7 ------- 2 files changed, 11 deletions(-) diff --git a/include/linux/kmod.h b/include/linux/kmod.h index 7f3dbcb..3102318 100644 --- a/include/linux/kmod.h +++ b/include/linux/kmod.h @@ -111,12 +111,8 @@ call_usermodehelper(char *path, char **argv, char **envp, enum umh_wait wait) extern void usermodehelper_init(void); -#ifdef CONFIG_PM_SLEEP extern int usermodehelper_disable(void); extern void usermodehelper_enable(void); extern bool usermodehelper_is_disabled(void); -#else -static inline bool usermodehelper_is_disabled(void) { return false; } -#endif #endif /* __LINUX_KMOD_H__ */ diff --git a/kernel/kmod.c b/kernel/kmod.c index 9ab513b..5ae0ff3 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -245,7 +245,6 @@ static void __call_usermodehelper(struct work_struct *work) } } -#ifdef CONFIG_PM_SLEEP /* * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY * (used for preventing user land processes from being created after the user @@ -321,12 +320,6 @@ static void helper_unlock(void) if (atomic_dec_and_test(&running_helpers)) wake_up(&running_helpers_waitq); } -#else /* CONFIG_PM_SLEEP */ -#define usermodehelper_disabled 0 - -static inline void helper_lock(void) {} -static inline void helper_unlock(void) {} -#endif /* CONFIG_PM_SLEEP */ /** * call_usermodehelper_setup - prepare to call a usermode helper -- cgit v1.1 From 13e381365614855bf14c8ad68f9b65e3afd3dd2c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 11 May 2011 22:40:45 +0200 Subject: PM / Wakeup: Remove useless synchronize_rcu() call wakeup_source_add() adds an item into wakeup_sources list. There is no need to call synchronize_rcu() at this point. Its only needed in wakeup_source_remove() Signed-off-by: Eric Dumazet Signed-off-by: Rafael J. Wysocki --- drivers/base/power/wakeup.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/base/power/wakeup.c b/drivers/base/power/wakeup.c index abbbd33..84f7c7d 100644 --- a/drivers/base/power/wakeup.c +++ b/drivers/base/power/wakeup.c @@ -110,7 +110,6 @@ void wakeup_source_add(struct wakeup_source *ws) spin_lock_irq(&events_lock); list_add_rcu(&ws->entry, &wakeup_sources); spin_unlock_irq(&events_lock); - synchronize_rcu(); } EXPORT_SYMBOL_GPL(wakeup_source_add); -- cgit v1.1 From ddeb648708108091a641adad0a438ec4fd8bf190 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 15 May 2011 11:38:48 +0200 Subject: PM / Hibernate: Add sysfs knob to control size of memory for drivers Martin reports that on his system hibernation occasionally fails due to the lack of memory, because the radeon driver apparently allocates too much of it during the device freeze stage. It turns out that the amount of memory allocated by radeon during hibernation (and presumably during system suspend too) depends on the utilization of the GPU (e.g. hibernating while there are two KDE 4 sessions with compositing enabled causes radeon to allocate more memory than for one KDE 4 session). In principle it should be possible to use image_size to make the memory preallocation mechanism free enough memory for the radeon driver, but in practice it is not easy to guess the right value because of the way the preallocation code uses image_size. For this reason, it seems reasonable to allow users to control the amount of memory reserved for driver allocations made after the hibernate preallocation, which currently is constant and amounts to 1 MB. Introduce a new sysfs file, /sys/power/reserved_size, whose value will be used as the amount of memory to reserve for the post-preallocation reservations made by device drivers, in bytes. For backwards compatibility, set its default (and initial) value to the currently used number (1 MB). References: https://bugzilla.kernel.org/show_bug.cgi?id=34102 Reported-and-tested-by: Martin Steigerwald Signed-off-by: Rafael J. Wysocki --- Documentation/ABI/testing/sysfs-power | 14 ++++++++++++++ kernel/power/hibernate.c | 23 +++++++++++++++++++++++ kernel/power/main.c | 1 + kernel/power/power.h | 4 ++++ kernel/power/snapshot.c | 25 ++++++++++++++++++++----- 5 files changed, 62 insertions(+), 5 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-power b/Documentation/ABI/testing/sysfs-power index 194ca44..b464d1276 100644 --- a/Documentation/ABI/testing/sysfs-power +++ b/Documentation/ABI/testing/sysfs-power @@ -158,3 +158,17 @@ Description: successful, will make the kernel abort a subsequent transition to a sleep state if any wakeup events are reported after the write has returned. + +What: /sys/power/reserved_size +Date: May 2011 +Contact: Rafael J. Wysocki +Description: + The /sys/power/reserved_size file allows user space to control + the amount of memory reserved for allocations made by device + drivers during the "device freeze" stage of hibernation. It can + be written a string representing a non-negative integer that + will be used as the amount of memory to reserve for allocations + made by device drivers' "freeze" callbacks, in bytes. + + Reading from this file will display the current value, which is + set to 1 MB by default. diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 50aae66..4317213 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -982,10 +982,33 @@ static ssize_t image_size_store(struct kobject *kobj, struct kobj_attribute *att power_attr(image_size); +static ssize_t reserved_size_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%lu\n", reserved_size); +} + +static ssize_t reserved_size_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t n) +{ + unsigned long size; + + if (sscanf(buf, "%lu", &size) == 1) { + reserved_size = size; + return n; + } + + return -EINVAL; +} + +power_attr(reserved_size); + static struct attribute * g[] = { &disk_attr.attr, &resume_attr.attr, &image_size_attr.attr, + &reserved_size_attr.attr, NULL, }; diff --git a/kernel/power/main.c b/kernel/power/main.c index de9aef8..2981af4 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -337,6 +337,7 @@ static int __init pm_init(void) if (error) return error; hibernate_image_size_init(); + hibernate_reserved_size_init(); power_kobj = kobject_create_and_add("power", NULL); if (!power_kobj) return -ENOMEM; diff --git a/kernel/power/power.h b/kernel/power/power.h index 03634be..9a00a0a 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -15,6 +15,7 @@ struct swsusp_info { #ifdef CONFIG_HIBERNATION /* kernel/power/snapshot.c */ +extern void __init hibernate_reserved_size_init(void); extern void __init hibernate_image_size_init(void); #ifdef CONFIG_ARCH_HIBERNATION_HEADER @@ -55,6 +56,7 @@ extern int hibernation_platform_enter(void); #else /* !CONFIG_HIBERNATION */ +static inline void hibernate_reserved_size_init(void) {} static inline void hibernate_image_size_init(void) {} #endif /* !CONFIG_HIBERNATION */ @@ -72,6 +74,8 @@ static struct kobj_attribute _name##_attr = { \ /* Preferred image size in bytes (default 500 MB) */ extern unsigned long image_size; +/* Size of memory reserved for drivers (default SPARE_PAGES x PAGE_SIZE) */ +extern unsigned long reserved_size; extern int in_suspend; extern dev_t swsusp_resume_device; extern sector_t swsusp_resume_block; diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index ca0aacc..d69e332 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -41,6 +41,18 @@ static void swsusp_set_page_forbidden(struct page *); static void swsusp_unset_page_forbidden(struct page *); /* + * Number of bytes to reserve for memory allocations made by device drivers + * from their ->freeze() and ->freeze_noirq() callbacks so that they don't + * cause image creation to fail (tunable via /sys/power/reserved_size). + */ +unsigned long reserved_size; + +void __init hibernate_reserved_size_init(void) +{ + reserved_size = SPARE_PAGES * PAGE_SIZE; +} + +/* * Preferred image size in bytes (tunable via /sys/power/image_size). * When it is set to N, the image creating code will do its best to * ensure the image size will not exceed N bytes, but if that is @@ -1263,11 +1275,13 @@ static unsigned long minimum_image_size(unsigned long saveable) * frame in use. We also need a number of page frames to be free during * hibernation for allocations made while saving the image and for device * drivers, in case they need to allocate memory from their hibernation - * callbacks (these two numbers are given by PAGES_FOR_IO and SPARE_PAGES, - * respectively, both of which are rough estimates). To make this happen, we - * compute the total number of available page frames and allocate at least + * callbacks (these two numbers are given by PAGES_FOR_IO (which is a rough + * estimate) and reserverd_size divided by PAGE_SIZE (which is tunable through + * /sys/power/reserved_size, respectively). To make this happen, we compute the + * total number of available page frames and allocate at least * - * ([page frames total] + PAGES_FOR_IO + [metadata pages]) / 2 + 2 * SPARE_PAGES + * ([page frames total] + PAGES_FOR_IO + [metadata pages]) / 2 + * + 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE) * * of them, which corresponds to the maximum size of a hibernation image. * @@ -1322,7 +1336,8 @@ int hibernate_preallocate_memory(void) count -= totalreserve_pages; /* Compute the maximum number of saveable pages to leave in memory. */ - max_size = (count - (size + PAGES_FOR_IO)) / 2 - 2 * SPARE_PAGES; + max_size = (count - (size + PAGES_FOR_IO)) / 2 + - 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE); /* Compute the desired number of image pages specified by image_size. */ size = DIV_ROUND_UP(image_size, PAGE_SIZE); if (size > max_size) -- cgit v1.1 From 1c1be3a949a61427a962771c85a347c822aeb991 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 15 May 2011 11:39:48 +0200 Subject: Revert "PM / Hibernate: Reduce autotuned default image size" This reverts commit bea3864fb627d110933cfb8babe048b63c4fc76e (PM / Hibernate: Reduce autotuned default image size), because users are now able to resolve the issue this commit was supposed to address in a different way (i.e. by using the new /sys/power/reserved_size interface). Signed-off-by: Rafael J. Wysocki --- kernel/power/snapshot.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index d69e332..ace5588 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -54,15 +54,15 @@ void __init hibernate_reserved_size_init(void) /* * Preferred image size in bytes (tunable via /sys/power/image_size). - * When it is set to N, the image creating code will do its best to - * ensure the image size will not exceed N bytes, but if that is - * impossible, it will try to create the smallest image possible. + * When it is set to N, swsusp will do its best to ensure the image + * size will not exceed N bytes, but if that is impossible, it will + * try to create the smallest image possible. */ unsigned long image_size; void __init hibernate_image_size_init(void) { - image_size = (totalram_pages / 3) * PAGE_SIZE; + image_size = ((totalram_pages * 2) / 5) * PAGE_SIZE; } /* List of PBEs needed for restoring the pages that were allocated before -- cgit v1.1 From c650da23d59d2c82307380414606774c6d49b8bd Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 17 May 2011 23:25:10 +0200 Subject: PM: Remove CONFIG_PM_VERBOSE Now that we have CONFIG_DYNAMIC_DEBUG there is no need for yet another flag causing dev_dbg() and pr_debug() statements in the core PM code to produce output. Moreover, CONFIG_PM_VERBOSE causes so much output to be generated that it's not really useful and almost no one sets it. References: https://bugzilla.kernel.org/show_bug.cgi?id=23182 Signed-off-by: Rafael J. Wysocki --- arch/sh/configs/apsh4ad0a_defconfig | 1 - arch/sh/configs/sdk7786_defconfig | 1 - drivers/base/power/Makefile | 3 +-- kernel/power/Kconfig | 6 ------ 4 files changed, 1 insertion(+), 10 deletions(-) diff --git a/arch/sh/configs/apsh4ad0a_defconfig b/arch/sh/configs/apsh4ad0a_defconfig index e71a531..77ec0e7 100644 --- a/arch/sh/configs/apsh4ad0a_defconfig +++ b/arch/sh/configs/apsh4ad0a_defconfig @@ -48,7 +48,6 @@ CONFIG_PREEMPT=y CONFIG_BINFMT_MISC=y CONFIG_PM=y CONFIG_PM_DEBUG=y -CONFIG_PM_VERBOSE=y CONFIG_PM_RUNTIME=y CONFIG_CPU_IDLE=y CONFIG_NET=y diff --git a/arch/sh/configs/sdk7786_defconfig b/arch/sh/configs/sdk7786_defconfig index dc4a2eb..c416505 100644 --- a/arch/sh/configs/sdk7786_defconfig +++ b/arch/sh/configs/sdk7786_defconfig @@ -83,7 +83,6 @@ CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y CONFIG_BINFMT_MISC=y CONFIG_PM=y CONFIG_PM_DEBUG=y -CONFIG_PM_VERBOSE=y CONFIG_PM_RUNTIME=y CONFIG_CPU_IDLE=y CONFIG_NET=y diff --git a/drivers/base/power/Makefile b/drivers/base/power/Makefile index 06a7073..3647e11 100644 --- a/drivers/base/power/Makefile +++ b/drivers/base/power/Makefile @@ -5,5 +5,4 @@ obj-$(CONFIG_PM_TRACE_RTC) += trace.o obj-$(CONFIG_PM_OPP) += opp.o obj-$(CONFIG_HAVE_CLK) += clock_ops.o -ccflags-$(CONFIG_DEBUG_DRIVER) := -DDEBUG -ccflags-$(CONFIG_PM_VERBOSE) += -DDEBUG +ccflags-$(CONFIG_DEBUG_DRIVER) := -DDEBUG \ No newline at end of file diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index d74ad4a..87f4d24 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -125,12 +125,6 @@ config PM_DEBUG code. This is helpful when debugging and reporting PM bugs, like suspend support. -config PM_VERBOSE - bool "Verbose Power Management debugging" - depends on PM_DEBUG - ---help--- - This option enables verbose messages from the Power Management code. - config PM_ADVANCED_DEBUG bool "Extra PM attributes in sysfs for low-level debugging/testing" depends on PM_DEBUG -- cgit v1.1 From 91e7c75ba93c48a82670d630b9daac92ff70095d Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 17 May 2011 23:26:00 +0200 Subject: PM: Allow drivers to allocate memory from .prepare() callbacks safely If device drivers allocate substantial amounts of memory (above 1 MB) in their hibernate .freeze() callbacks (or in their legacy suspend callbcks during hibernation), the subsequent creation of hibernate image may fail due to the lack of memory. This is the case, because the drivers' .freeze() callbacks are executed after the hibernate memory preallocation has been carried out and the preallocated amount of memory may be too small to cover the new driver allocations. Unfortunately, the drivers' .prepare() callbacks also are executed after the hibernate memory preallocation has completed, so they are not suitable for allocating additional memory either. Thus the only way a driver can safely allocate memory during hibernation is to use a hibernate/suspend notifier. However, the notifiers are called before the freezing of user space and the drivers wanting to use them for allocating additional memory may not know how much memory needs to be allocated at that point. To let device drivers overcome this difficulty rework the hibernation sequence so that the memory preallocation is carried out after the drivers' .prepare() callbacks have been executed, so that the .prepare() callbacks can be used for allocating additional memory to be used by the drivers' .freeze() callbacks. Update documentation to match the new behavior of the code. Signed-off-by: Rafael J. Wysocki --- Documentation/power/devices.txt | 14 +++++++---- Documentation/power/notifiers.txt | 51 ++++++++++++++++++--------------------- drivers/base/power/main.c | 18 +++++++++----- include/linux/pm.h | 4 +++ kernel/power/hibernate.c | 17 ++++++++++--- 5 files changed, 61 insertions(+), 43 deletions(-) diff --git a/Documentation/power/devices.txt b/Documentation/power/devices.txt index 1971bcf..8888083 100644 --- a/Documentation/power/devices.txt +++ b/Documentation/power/devices.txt @@ -279,11 +279,15 @@ When the system goes into the standby or memory sleep state, the phases are: time.) Unlike the other suspend-related phases, during the prepare phase the device tree is traversed top-down. - The prepare phase uses only a bus callback. After the callback method - returns, no new children may be registered below the device. The method - may also prepare the device or driver in some way for the upcoming - system power transition, but it should not put the device into a - low-power state. + In addition to that, if device drivers need to allocate additional + memory to be able to hadle device suspend correctly, that should be + done in the prepare phase. + + After the prepare callback method returns, no new children may be + registered below the device. The method may also prepare the device or + driver in some way for the upcoming system power transition (for + example, by allocating additional memory required for this purpose), but + it should not put the device into a low-power state. 2. The suspend methods should quiesce the device to stop it from performing I/O. They also may save the device registers and put it into the diff --git a/Documentation/power/notifiers.txt b/Documentation/power/notifiers.txt index cf98070..c2a4a34 100644 --- a/Documentation/power/notifiers.txt +++ b/Documentation/power/notifiers.txt @@ -1,46 +1,41 @@ Suspend notifiers - (C) 2007 Rafael J. Wysocki , GPL - -There are some operations that device drivers may want to carry out in their -.suspend() routines, but shouldn't, because they can cause the hibernation or -suspend to fail. For example, a driver may want to allocate a substantial amount -of memory (like 50 MB) in .suspend(), but that shouldn't be done after the -swsusp's memory shrinker has run. - -Also, there may be some operations, that subsystems want to carry out before a -hibernation/suspend or after a restore/resume, requiring the system to be fully -functional, so the drivers' .suspend() and .resume() routines are not suitable -for this purpose. For example, device drivers may want to upload firmware to -their devices after a restore from a hibernation image, but they cannot do it by -calling request_firmware() from their .resume() routines (user land processes -are frozen at this point). The solution may be to load the firmware into -memory before processes are frozen and upload it from there in the .resume() -routine. Of course, a hibernation notifier may be used for this purpose. - -The subsystems that have such needs can register suspend notifiers that will be -called upon the following events by the suspend core: + (C) 2007-2011 Rafael J. Wysocki , GPL + +There are some operations that subsystems or drivers may want to carry out +before hibernation/suspend or after restore/resume, but they require the system +to be fully functional, so the drivers' and subsystems' .suspend() and .resume() +or even .prepare() and .complete() callbacks are not suitable for this purpose. +For example, device drivers may want to upload firmware to their devices after +resume/restore, but they cannot do it by calling request_firmware() from their +.resume() or .complete() routines (user land processes are frozen at these +points). The solution may be to load the firmware into memory before processes +are frozen and upload it from there in the .resume() routine. +A suspend/hibernation notifier may be used for this purpose. + +The subsystems or drivers having such needs can register suspend notifiers that +will be called upon the following events by the PM core: PM_HIBERNATION_PREPARE The system is going to hibernate or suspend, tasks will be frozen immediately. PM_POST_HIBERNATION The system memory state has been restored from a - hibernation image or an error occurred during the - hibernation. Device drivers' .resume() callbacks have + hibernation image or an error occurred during + hibernation. Device drivers' restore callbacks have been executed and tasks have been thawed. PM_RESTORE_PREPARE The system is going to restore a hibernation image. - If all goes well the restored kernel will issue a + If all goes well, the restored kernel will issue a PM_POST_HIBERNATION notification. -PM_POST_RESTORE An error occurred during the hibernation restore. - Device drivers' .resume() callbacks have been executed +PM_POST_RESTORE An error occurred during restore from hibernation. + Device drivers' restore callbacks have been executed and tasks have been thawed. -PM_SUSPEND_PREPARE The system is preparing for a suspend. +PM_SUSPEND_PREPARE The system is preparing for suspend. PM_POST_SUSPEND The system has just resumed or an error occurred during - the suspend. Device drivers' .resume() callbacks have - been executed and tasks have been thawed. + suspend. Device drivers' resume callbacks have been + executed and tasks have been thawed. It is generally assumed that whatever the notifiers do for PM_HIBERNATION_PREPARE, should be undone for PM_POST_HIBERNATION. Analogously, diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index 3b35456..aa632020 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -579,11 +579,13 @@ static bool is_async(struct device *dev) * Execute the appropriate "resume" callback for all devices whose status * indicates that they are suspended. */ -static void dpm_resume(pm_message_t state) +void dpm_resume(pm_message_t state) { struct device *dev; ktime_t starttime = ktime_get(); + might_sleep(); + mutex_lock(&dpm_list_mtx); pm_transition = state; async_error = 0; @@ -656,10 +658,12 @@ static void device_complete(struct device *dev, pm_message_t state) * Execute the ->complete() callbacks for all devices whose PM status is not * DPM_ON (this allows new devices to be registered). */ -static void dpm_complete(pm_message_t state) +void dpm_complete(pm_message_t state) { struct list_head list; + might_sleep(); + INIT_LIST_HEAD(&list); mutex_lock(&dpm_list_mtx); while (!list_empty(&dpm_prepared_list)) { @@ -688,7 +692,6 @@ static void dpm_complete(pm_message_t state) */ void dpm_resume_end(pm_message_t state) { - might_sleep(); dpm_resume(state); dpm_complete(state); } @@ -912,11 +915,13 @@ static int device_suspend(struct device *dev) * dpm_suspend - Execute "suspend" callbacks for all non-sysdev devices. * @state: PM transition of the system being carried out. */ -static int dpm_suspend(pm_message_t state) +int dpm_suspend(pm_message_t state) { ktime_t starttime = ktime_get(); int error = 0; + might_sleep(); + mutex_lock(&dpm_list_mtx); pm_transition = state; async_error = 0; @@ -1003,10 +1008,12 @@ static int device_prepare(struct device *dev, pm_message_t state) * * Execute the ->prepare() callback(s) for all devices. */ -static int dpm_prepare(pm_message_t state) +int dpm_prepare(pm_message_t state) { int error = 0; + might_sleep(); + mutex_lock(&dpm_list_mtx); while (!list_empty(&dpm_list)) { struct device *dev = to_device(dpm_list.next); @@ -1055,7 +1062,6 @@ int dpm_suspend_start(pm_message_t state) { int error; - might_sleep(); error = dpm_prepare(state); if (!error) error = dpm_suspend(state); diff --git a/include/linux/pm.h b/include/linux/pm.h index 3cc3e7e..dce7c71 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -533,10 +533,14 @@ struct dev_power_domain { extern void device_pm_lock(void); extern void dpm_resume_noirq(pm_message_t state); extern void dpm_resume_end(pm_message_t state); +extern void dpm_resume(pm_message_t state); +extern void dpm_complete(pm_message_t state); extern void device_pm_unlock(void); extern int dpm_suspend_noirq(pm_message_t state); extern int dpm_suspend_start(pm_message_t state); +extern int dpm_suspend(pm_message_t state); +extern int dpm_prepare(pm_message_t state); extern void __suspend_report_result(const char *function, void *fn, int ret); diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 95a2ac4..f9bec56 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -327,20 +327,25 @@ static int create_image(int platform_mode) int hibernation_snapshot(int platform_mode) { + pm_message_t msg = PMSG_RECOVER; int error; error = platform_begin(platform_mode); if (error) goto Close; + error = dpm_prepare(PMSG_FREEZE); + if (error) + goto Complete_devices; + /* Preallocate image memory before shutting down devices. */ error = hibernate_preallocate_memory(); if (error) - goto Close; + goto Complete_devices; suspend_console(); pm_restrict_gfp_mask(); - error = dpm_suspend_start(PMSG_FREEZE); + error = dpm_suspend(PMSG_FREEZE); if (error) goto Recover_platform; @@ -358,13 +363,17 @@ int hibernation_snapshot(int platform_mode) if (error || !in_suspend) swsusp_free(); - dpm_resume_end(in_suspend ? - (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); + msg = in_suspend ? (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE; + dpm_resume(msg); if (error || !in_suspend) pm_restore_gfp_mask(); resume_console(); + + Complete_devices: + dpm_complete(msg); + Close: platform_end(platform_mode); return error; -- cgit v1.1 From 6538df80194e305f1b78cafb556f4bb442f808b3 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 17 May 2011 23:26:21 +0200 Subject: PM: Introduce generic prepare and complete callbacks for subsystems Introduce generic .prepare() and .complete() power management callbacks, currently missing, that can be used by subsystems and power domains and export them. Provide NULL definitions of all the generic system sleep callbacks for CONFIG_PM_SLEEP unset. Signed-off-by: Rafael J. Wysocki --- drivers/base/power/generic_ops.c | 39 +++++++++++++++++++++++++++++++++++++++ include/linux/pm.h | 26 +++++++++++++++++++------- 2 files changed, 58 insertions(+), 7 deletions(-) diff --git a/drivers/base/power/generic_ops.c b/drivers/base/power/generic_ops.c index 42f97f9..cb3bb36 100644 --- a/drivers/base/power/generic_ops.c +++ b/drivers/base/power/generic_ops.c @@ -74,6 +74,23 @@ EXPORT_SYMBOL_GPL(pm_generic_runtime_resume); #ifdef CONFIG_PM_SLEEP /** + * pm_generic_prepare - Generic routine preparing a device for power transition. + * @dev: Device to prepare. + * + * Prepare a device for a system-wide power transition. + */ +int pm_generic_prepare(struct device *dev) +{ + struct device_driver *drv = dev->driver; + int ret = 0; + + if (drv && drv->pm && drv->pm->prepare) + ret = drv->pm->prepare(dev); + + return ret; +} + +/** * __pm_generic_call - Generic suspend/freeze/poweroff/thaw subsystem callback. * @dev: Device to handle. * @event: PM transition of the system under way. @@ -213,16 +230,38 @@ int pm_generic_restore(struct device *dev) return __pm_generic_resume(dev, PM_EVENT_RESTORE); } EXPORT_SYMBOL_GPL(pm_generic_restore); + +/** + * pm_generic_complete - Generic routine competing a device power transition. + * @dev: Device to handle. + * + * Complete a device power transition during a system-wide power transition. + */ +void pm_generic_complete(struct device *dev) +{ + struct device_driver *drv = dev->driver; + + if (drv && drv->pm && drv->pm->complete) + drv->pm->complete(dev); + + /* + * Let runtime PM try to suspend devices that haven't been in use before + * going into the system-wide sleep state we're resuming from. + */ + pm_runtime_idle(dev); +} #endif /* CONFIG_PM_SLEEP */ struct dev_pm_ops generic_subsys_pm_ops = { #ifdef CONFIG_PM_SLEEP + .prepare = pm_generic_prepare, .suspend = pm_generic_suspend, .resume = pm_generic_resume, .freeze = pm_generic_freeze, .thaw = pm_generic_thaw, .poweroff = pm_generic_poweroff, .restore = pm_generic_restore, + .complete = pm_generic_complete, #endif #ifdef CONFIG_PM_RUNTIME .runtime_suspend = pm_generic_runtime_suspend, diff --git a/include/linux/pm.h b/include/linux/pm.h index dce7c71..3160648 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -550,6 +550,16 @@ extern void __suspend_report_result(const char *function, void *fn, int ret); } while (0) extern int device_pm_wait_for_dev(struct device *sub, struct device *dev); + +extern int pm_generic_prepare(struct device *dev); +extern int pm_generic_suspend(struct device *dev); +extern int pm_generic_resume(struct device *dev); +extern int pm_generic_freeze(struct device *dev); +extern int pm_generic_thaw(struct device *dev); +extern int pm_generic_restore(struct device *dev); +extern int pm_generic_poweroff(struct device *dev); +extern void pm_generic_complete(struct device *dev); + #else /* !CONFIG_PM_SLEEP */ #define device_pm_lock() do {} while (0) @@ -566,6 +576,15 @@ static inline int device_pm_wait_for_dev(struct device *a, struct device *b) { return 0; } + +#define pm_generic_prepare NULL +#define pm_generic_suspend NULL +#define pm_generic_resume NULL +#define pm_generic_freeze NULL +#define pm_generic_thaw NULL +#define pm_generic_restore NULL +#define pm_generic_poweroff NULL +#define pm_generic_complete NULL #endif /* !CONFIG_PM_SLEEP */ /* How to reorder dpm_list after device_move() */ @@ -576,11 +595,4 @@ enum dpm_order { DPM_ORDER_DEV_LAST, }; -extern int pm_generic_suspend(struct device *dev); -extern int pm_generic_resume(struct device *dev); -extern int pm_generic_freeze(struct device *dev); -extern int pm_generic_thaw(struct device *dev); -extern int pm_generic_restore(struct device *dev); -extern int pm_generic_poweroff(struct device *dev); - #endif /* _LINUX_PM_H */ -- cgit v1.1 From 724a92ee45c04cb9d82884a856b03b1e594d9de1 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Tue, 17 May 2011 15:29:10 -0700 Subject: x86, cpufeature: Add CPU feature bit for enhanced REP MOVSB/STOSB Intel processors are adding enhancements to REP MOVSB/STOSB and the use of REP MOVSB/STOSB for optimal memcpy/memset or similar functions is recommended. Enhancement availability is indicated by CPUID.7.0.EBX[9] (Enhanced REP MOVSB/ STOSB). Signed-off-by: Fenghua Yu Link: http://lkml.kernel.org/r/1305671358-14478-2-git-send-email-fenghua.yu@intel.com Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/cpufeature.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 91f3e087..7f2f7b1 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -195,6 +195,7 @@ /* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */ #define X86_FEATURE_FSGSBASE (9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/ +#define X86_FEATURE_ERMS (9*32+ 9) /* Enhanced REP MOVSB/STOSB */ #if defined(__KERNEL__) && !defined(__ASSEMBLY__) -- cgit v1.1 From 161ec53c702ce9df2f439804dfb9331807066daa Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Tue, 17 May 2011 15:29:11 -0700 Subject: x86, mem, intel: Initialize Enhanced REP MOVSB/STOSB If kernel intends to use enhanced REP MOVSB/STOSB, it must ensure IA32_MISC_ENABLE.Fast_String_Enable (bit 0) is set and CPUID.(EAX=07H, ECX=0H): EBX[bit 9] also reports 1. Signed-off-by: Fenghua Yu Link: http://lkml.kernel.org/r/1305671358-14478-3-git-send-email-fenghua.yu@intel.com Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/intel.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index df86bc8..fc73a34 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -29,10 +29,10 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) { + u64 misc_enable; + /* Unmask CPUID levels if masked: */ if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) { - u64 misc_enable; - rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable); if (misc_enable & MSR_IA32_MISC_ENABLE_LIMIT_CPUID) { @@ -118,8 +118,6 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) * (model 2) with the same problem. */ if (c->x86 == 15) { - u64 misc_enable; - rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable); if (misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING) { @@ -130,6 +128,19 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) } } #endif + + /* + * If fast string is not enabled in IA32_MISC_ENABLE for any reason, + * clear the fast string and enhanced fast string CPU capabilities. + */ + if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) { + rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable); + if (!(misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING)) { + printk(KERN_INFO "Disabled fast string operations\n"); + setup_clear_cpu_cap(X86_FEATURE_REP_GOOD); + setup_clear_cpu_cap(X86_FEATURE_ERMS); + } + } } #ifdef CONFIG_X86_32 -- cgit v1.1 From 509731336313b3799cf03071d72c64fa6383895e Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Tue, 17 May 2011 15:29:12 -0700 Subject: x86, alternative, doc: Add comment for applying alternatives order Some string operation functions may be patched twice, e.g. on enhanced REP MOVSB /STOSB processors, memcpy is patched first by fast string alternative function, then it is patched by enhanced REP MOVSB/STOSB alternative function. Add comment for applying alternatives order to warn people who may change the applying alternatives order for any reason. [ Documentation-only patch ] Signed-off-by: Fenghua Yu Link: http://lkml.kernel.org/r/1305671358-14478-4-git-send-email-fenghua.yu@intel.com Signed-off-by: H. Peter Anvin --- arch/x86/kernel/alternative.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 4a23467..f4fe15d 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -210,6 +210,15 @@ void __init_or_module apply_alternatives(struct alt_instr *start, u8 insnbuf[MAX_PATCH_LEN]; DPRINTK("%s: alt table %p -> %p\n", __func__, start, end); + /* + * The scan order should be from start to end. A later scanned + * alternative code can overwrite a previous scanned alternative code. + * Some kernel functions (e.g. memcpy, memset, etc) use this order to + * patch code. + * + * So be careful if you want to change the scan order to any other + * order. + */ for (a = start; a < end; a++) { u8 *instr = a->instr; BUG_ON(a->replacementlen > a->instrlen); -- cgit v1.1 From 9072d11da15a71e086eab3b5085184f2c1d06913 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Tue, 17 May 2011 15:29:13 -0700 Subject: x86, alternative: Add altinstruction_entry macro Add altinstruction_entry macro to generate .altinstructions section entries from assembly code. This should be less failure-prone than open-coding. Signed-off-by: Fenghua Yu Link: http://lkml.kernel.org/r/1305671358-14478-5-git-send-email-fenghua.yu@intel.com Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/alternative-asm.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h index a63a68b..94d420b 100644 --- a/arch/x86/include/asm/alternative-asm.h +++ b/arch/x86/include/asm/alternative-asm.h @@ -15,4 +15,13 @@ .endm #endif +.macro altinstruction_entry orig alt feature orig_len alt_len + .align 8 + .quad \orig + .quad \alt + .word \feature + .byte \orig_len + .byte \alt_len +.endm + #endif /* __ASSEMBLY__ */ -- cgit v1.1 From e365c9df2f2f001450decf9512412d2d5bd1cdef Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Tue, 17 May 2011 15:29:14 -0700 Subject: x86, mem: clear_page_64.S: Support clear_page() with enhanced REP MOVSB/STOSB Intel processors are adding enhancements to REP MOVSB/STOSB and the use of REP MOVSB/STOSB for optimal memcpy/memset or similar functions is recommended. Enhancement availability is indicated by CPUID.7.0.EBX[9] (Enhanced REP MOVSB/ STOSB). Support clear_page() with rep stosb for processor supporting enhanced REP MOVSB /STOSB. On processors supporting enhanced REP MOVSB/STOSB, the alternative clear_page_c_e function using enhanced REP STOSB overrides the original function and the fast string function. Signed-off-by: Fenghua Yu Link: http://lkml.kernel.org/r/1305671358-14478-6-git-send-email-fenghua.yu@intel.com Signed-off-by: H. Peter Anvin --- arch/x86/lib/clear_page_64.S | 33 ++++++++++++++++++++++++--------- 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S index aa4326b..f2145cf 100644 --- a/arch/x86/lib/clear_page_64.S +++ b/arch/x86/lib/clear_page_64.S @@ -1,5 +1,6 @@ #include #include +#include /* * Zero a page. @@ -14,6 +15,15 @@ ENTRY(clear_page_c) CFI_ENDPROC ENDPROC(clear_page_c) +ENTRY(clear_page_c_e) + CFI_STARTPROC + movl $4096,%ecx + xorl %eax,%eax + rep stosb + ret + CFI_ENDPROC +ENDPROC(clear_page_c_e) + ENTRY(clear_page) CFI_STARTPROC xorl %eax,%eax @@ -38,21 +48,26 @@ ENTRY(clear_page) .Lclear_page_end: ENDPROC(clear_page) - /* Some CPUs run faster using the string instructions. - It is also a lot simpler. Use this when possible */ + /* + * Some CPUs support enhanced REP MOVSB/STOSB instructions. + * It is recommended to use this when possible. + * If enhanced REP MOVSB/STOSB is not available, try to use fast string. + * Otherwise, use original function. + * + */ #include .section .altinstr_replacement,"ax" 1: .byte 0xeb /* jmp */ .byte (clear_page_c - clear_page) - (2f - 1b) /* offset */ -2: +2: .byte 0xeb /* jmp */ + .byte (clear_page_c_e - clear_page) - (3f - 2b) /* offset */ +3: .previous .section .altinstructions,"a" - .align 8 - .quad clear_page - .quad 1b - .word X86_FEATURE_REP_GOOD - .byte .Lclear_page_end - clear_page - .byte 2b - 1b + altinstruction_entry clear_page,1b,X86_FEATURE_REP_GOOD,\ + .Lclear_page_end-clear_page, 2b-1b + altinstruction_entry clear_page,2b,X86_FEATURE_ERMS, \ + .Lclear_page_end-clear_page,3b-2b .previous -- cgit v1.1 From 4307bec9344aed83f8107c3eb4285bd9d218fc10 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Tue, 17 May 2011 15:29:15 -0700 Subject: x86, mem: copy_user_64.S: Support copy_to/from_user by enhanced REP MOVSB/STOSB Support copy_to_user/copy_from_user() by enhanced REP MOVSB/STOSB. On processors supporting enhanced REP MOVSB/STOSB, the alternative copy_user_enhanced_fast_string function using enhanced rep movsb overrides the original function and the fast string function. Signed-off-by: Fenghua Yu Link: http://lkml.kernel.org/r/1305671358-14478-7-git-send-email-fenghua.yu@intel.com Signed-off-by: H. Peter Anvin --- arch/x86/lib/copy_user_64.S | 65 ++++++++++++++++++++++++++++++++++++++------- 1 file changed, 55 insertions(+), 10 deletions(-) diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index 99e4826..d17a117 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -15,23 +15,30 @@ #include #include #include +#include - .macro ALTERNATIVE_JUMP feature,orig,alt +/* + * By placing feature2 after feature1 in altinstructions section, we logically + * implement: + * If CPU has feature2, jmp to alt2 is used + * else if CPU has feature1, jmp to alt1 is used + * else jmp to orig is used. + */ + .macro ALTERNATIVE_JUMP feature1,feature2,orig,alt1,alt2 0: .byte 0xe9 /* 32bit jump */ .long \orig-1f /* by default jump to orig */ 1: .section .altinstr_replacement,"ax" 2: .byte 0xe9 /* near jump with 32bit immediate */ - .long \alt-1b /* offset */ /* or alternatively to alt */ + .long \alt1-1b /* offset */ /* or alternatively to alt1 */ +3: .byte 0xe9 /* near jump with 32bit immediate */ + .long \alt2-1b /* offset */ /* or alternatively to alt2 */ .previous + .section .altinstructions,"a" - .align 8 - .quad 0b - .quad 2b - .word \feature /* when feature is set */ - .byte 5 - .byte 5 + altinstruction_entry 0b,2b,\feature1,5,5 + altinstruction_entry 0b,3b,\feature2,5,5 .previous .endm @@ -73,7 +80,9 @@ ENTRY(_copy_to_user) jc bad_to_user cmpq TI_addr_limit(%rax),%rcx jae bad_to_user - ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string + ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,X86_FEATURE_ERMS, \ + copy_user_generic_unrolled,copy_user_generic_string, \ + copy_user_enhanced_fast_string CFI_ENDPROC ENDPROC(_copy_to_user) @@ -86,7 +95,9 @@ ENTRY(_copy_from_user) jc bad_from_user cmpq TI_addr_limit(%rax),%rcx jae bad_from_user - ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string + ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,X86_FEATURE_ERMS, \ + copy_user_generic_unrolled,copy_user_generic_string, \ + copy_user_enhanced_fast_string CFI_ENDPROC ENDPROC(_copy_from_user) @@ -255,3 +266,37 @@ ENTRY(copy_user_generic_string) .previous CFI_ENDPROC ENDPROC(copy_user_generic_string) + +/* + * Some CPUs are adding enhanced REP MOVSB/STOSB instructions. + * It's recommended to use enhanced REP MOVSB/STOSB if it's enabled. + * + * Input: + * rdi destination + * rsi source + * rdx count + * + * Output: + * eax uncopied bytes or 0 if successful. + */ +ENTRY(copy_user_enhanced_fast_string) + CFI_STARTPROC + andl %edx,%edx + jz 2f + movl %edx,%ecx +1: rep + movsb +2: xorl %eax,%eax + ret + + .section .fixup,"ax" +12: movl %ecx,%edx /* ecx is zerorest also */ + jmp copy_user_handle_tail + .previous + + .section __ex_table,"a" + .align 8 + .quad 1b,12b + .previous + CFI_ENDPROC +ENDPROC(copy_user_enhanced_fast_string) -- cgit v1.1 From 101068c1f4a947ffa08f2782c78e40097300754d Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Tue, 17 May 2011 15:29:16 -0700 Subject: x86, mem: memcpy_64.S: Optimize memcpy by enhanced REP MOVSB/STOSB Support memcpy() with enhanced rep movsb. On processors supporting enhanced rep movsb, the alternative memcpy() function using enhanced rep movsb overrides the original function and the fast string function. Signed-off-by: Fenghua Yu Link: http://lkml.kernel.org/r/1305671358-14478-8-git-send-email-fenghua.yu@intel.com Signed-off-by: H. Peter Anvin --- arch/x86/lib/memcpy_64.S | 45 ++++++++++++++++++++++++++++++++------------- 1 file changed, 32 insertions(+), 13 deletions(-) diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S index 75ef61e..daab21d 100644 --- a/arch/x86/lib/memcpy_64.S +++ b/arch/x86/lib/memcpy_64.S @@ -4,6 +4,7 @@ #include #include +#include /* * memcpy - Copy a memory block. @@ -37,6 +38,23 @@ .Lmemcpy_e: .previous +/* + * memcpy_c_e() - enhanced fast string memcpy. This is faster and simpler than + * memcpy_c. Use memcpy_c_e when possible. + * + * This gets patched over the unrolled variant (below) via the + * alternative instructions framework: + */ + .section .altinstr_replacement, "ax", @progbits +.Lmemcpy_c_e: + movq %rdi, %rax + + movl %edx, %ecx + rep movsb + ret +.Lmemcpy_e_e: + .previous + ENTRY(__memcpy) ENTRY(memcpy) CFI_STARTPROC @@ -171,21 +189,22 @@ ENDPROC(memcpy) ENDPROC(__memcpy) /* - * Some CPUs run faster using the string copy instructions. - * It is also a lot simpler. Use this when possible: - */ - - .section .altinstructions, "a" - .align 8 - .quad memcpy - .quad .Lmemcpy_c - .word X86_FEATURE_REP_GOOD - - /* + * Some CPUs are adding enhanced REP MOVSB/STOSB feature + * If the feature is supported, memcpy_c_e() is the first choice. + * If enhanced rep movsb copy is not available, use fast string copy + * memcpy_c() when possible. This is faster and code is simpler than + * original memcpy(). + * Otherwise, original memcpy() is used. + * In .altinstructions section, ERMS feature is placed after REG_GOOD + * feature to implement the right patch order. + * * Replace only beginning, memcpy is used to apply alternatives, * so it is silly to overwrite itself with nops - reboot is the * only outcome... */ - .byte .Lmemcpy_e - .Lmemcpy_c - .byte .Lmemcpy_e - .Lmemcpy_c + .section .altinstructions, "a" + altinstruction_entry memcpy,.Lmemcpy_c,X86_FEATURE_REP_GOOD,\ + .Lmemcpy_e-.Lmemcpy_c,.Lmemcpy_e-.Lmemcpy_c + altinstruction_entry memcpy,.Lmemcpy_c_e,X86_FEATURE_ERMS, \ + .Lmemcpy_e_e-.Lmemcpy_c_e,.Lmemcpy_e_e-.Lmemcpy_c_e .previous -- cgit v1.1 From 057e05c1d6440117875f455e59da8691e08f65d5 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Tue, 17 May 2011 15:29:17 -0700 Subject: x86, mem: memmove_64.S: Optimize memmove by enhanced REP MOVSB/STOSB Support memmove() by enhanced rep movsb. On processors supporting enhanced REP MOVSB/STOSB, the alternative memmove() function using enhanced rep movsb overrides the original function. The patch doesn't change the backward memmove case to use enhanced rep movsb. Signed-off-by: Fenghua Yu Link: http://lkml.kernel.org/r/1305671358-14478-9-git-send-email-fenghua.yu@intel.com Signed-off-by: H. Peter Anvin --- arch/x86/lib/memmove_64.S | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S index 0ecb843..d0ec9c2 100644 --- a/arch/x86/lib/memmove_64.S +++ b/arch/x86/lib/memmove_64.S @@ -8,6 +8,7 @@ #define _STRING_C #include #include +#include #undef memmove @@ -24,6 +25,7 @@ */ ENTRY(memmove) CFI_STARTPROC + /* Handle more 32bytes in loop */ mov %rdi, %rax cmp $0x20, %rdx @@ -31,8 +33,13 @@ ENTRY(memmove) /* Decide forward/backward copy mode */ cmp %rdi, %rsi - jb 2f + jge .Lmemmove_begin_forward + mov %rsi, %r8 + add %rdx, %r8 + cmp %rdi, %r8 + jg 2f +.Lmemmove_begin_forward: /* * movsq instruction have many startup latency * so we handle small size by general register. @@ -78,6 +85,8 @@ ENTRY(memmove) rep movsq movq %r11, (%r10) jmp 13f +.Lmemmove_end_forward: + /* * Handle data backward by movsq. */ @@ -194,4 +203,22 @@ ENTRY(memmove) 13: retq CFI_ENDPROC + + .section .altinstr_replacement,"ax" +.Lmemmove_begin_forward_efs: + /* Forward moving data. */ + movq %rdx, %rcx + rep movsb + retq +.Lmemmove_end_forward_efs: + .previous + + .section .altinstructions,"a" + .align 8 + .quad .Lmemmove_begin_forward + .quad .Lmemmove_begin_forward_efs + .word X86_FEATURE_ERMS + .byte .Lmemmove_end_forward-.Lmemmove_begin_forward + .byte .Lmemmove_end_forward_efs-.Lmemmove_begin_forward_efs + .previous ENDPROC(memmove) -- cgit v1.1 From 2f19e06ac30771c7cb96fd61d8aeacfa74dac21c Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Tue, 17 May 2011 15:29:18 -0700 Subject: x86, mem: memset_64.S: Optimize memset by enhanced REP MOVSB/STOSB Support memset() with enhanced rep stosb. On processors supporting enhanced REP MOVSB/STOSB, the alternative memset_c_e function using enhanced rep stosb overrides the fast string alternative memset_c and the original function. Signed-off-by: Fenghua Yu Link: http://lkml.kernel.org/r/1305671358-14478-10-git-send-email-fenghua.yu@intel.com Signed-off-by: H. Peter Anvin --- arch/x86/lib/memset_64.S | 54 +++++++++++++++++++++++++++++++++++++----------- 1 file changed, 42 insertions(+), 12 deletions(-) diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S index 09d3442..79bd454 100644 --- a/arch/x86/lib/memset_64.S +++ b/arch/x86/lib/memset_64.S @@ -2,9 +2,13 @@ #include #include +#include +#include /* - * ISO C memset - set a memory block to a byte value. + * ISO C memset - set a memory block to a byte value. This function uses fast + * string to get better performance than the original function. The code is + * simpler and shorter than the orignal function as well. * * rdi destination * rsi value (char) @@ -31,6 +35,28 @@ .Lmemset_e: .previous +/* + * ISO C memset - set a memory block to a byte value. This function uses + * enhanced rep stosb to override the fast string function. + * The code is simpler and shorter than the fast string function as well. + * + * rdi destination + * rsi value (char) + * rdx count (bytes) + * + * rax original destination + */ + .section .altinstr_replacement, "ax", @progbits +.Lmemset_c_e: + movq %rdi,%r9 + movb %sil,%al + movl %edx,%ecx + rep stosb + movq %r9,%rax + ret +.Lmemset_e_e: + .previous + ENTRY(memset) ENTRY(__memset) CFI_STARTPROC @@ -112,16 +138,20 @@ ENTRY(__memset) ENDPROC(memset) ENDPROC(__memset) - /* Some CPUs run faster using the string instructions. - It is also a lot simpler. Use this when possible */ - -#include - + /* Some CPUs support enhanced REP MOVSB/STOSB feature. + * It is recommended to use this when possible. + * + * If enhanced REP MOVSB/STOSB feature is not available, use fast string + * instructions. + * + * Otherwise, use original memset function. + * + * In .altinstructions section, ERMS feature is placed after REG_GOOD + * feature to implement the right patch order. + */ .section .altinstructions,"a" - .align 8 - .quad memset - .quad .Lmemset_c - .word X86_FEATURE_REP_GOOD - .byte .Lfinal - memset - .byte .Lmemset_e - .Lmemset_c + altinstruction_entry memset,.Lmemset_c,X86_FEATURE_REP_GOOD,\ + .Lfinal-memset,.Lmemset_e-.Lmemset_c + altinstruction_entry memset,.Lmemset_c_e,X86_FEATURE_ERMS, \ + .Lfinal-memset,.Lmemset_e_e-.Lmemset_c_e .previous -- cgit v1.1 From d0281a257f370b09c410e466571858b4e12869c9 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Tue, 17 May 2011 18:44:26 -0700 Subject: x86, cpufeature: Add cpufeature flag for SMEP Add support for newly documented SMEP (Supervisor Mode Execution Protection) CPU feature flag. SMEP prevents the CPU in kernel-mode to jump to an executable page that has the user flag set in the PTE. This prevents the kernel from executing user-space code accidentally or maliciously, so it for example prevents kernel exploits from jumping to specially prepared user-mode shell code. [ hpa: added better description by Ingo Molnar ] Signed-off-by: Fenghua Yu LKML-Reference: <1305683069-25394-2-git-send-email-fenghua.yu@intel.com> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/cpufeature.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 7f2f7b1..8808cdb 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -195,6 +195,7 @@ /* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */ #define X86_FEATURE_FSGSBASE (9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/ +#define X86_FEATURE_SMEP (9*32+ 7) /* Supervisor Mode Execution Protection */ #define X86_FEATURE_ERMS (9*32+ 9) /* Enhanced REP MOVSB/STOSB */ #if defined(__KERNEL__) && !defined(__ASSEMBLY__) -- cgit v1.1 From dc23c0bccf5eea171c87b3db285d032b9a5f06c4 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Tue, 17 May 2011 18:44:27 -0700 Subject: x86, cpu: Add SMEP CPU feature in CR4 Add support for newly documented SMEP (Supervisor Mode Execution Protection) CPU feature in CR4. Signed-off-by: Fenghua Yu LKML-Reference: <1305683069-25394-3-git-send-email-fenghua.yu@intel.com> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/processor-flags.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h index a898a2b..59ab4df 100644 --- a/arch/x86/include/asm/processor-flags.h +++ b/arch/x86/include/asm/processor-flags.h @@ -60,6 +60,7 @@ #define X86_CR4_OSXMMEXCPT 0x00000400 /* enable unmasked SSE exceptions */ #define X86_CR4_VMXE 0x00002000 /* enable VMX virtualization */ #define X86_CR4_OSXSAVE 0x00040000 /* enable xsave and xrestore */ +#define X86_CR4_SMEP 0x00100000 /* enable SMEP support */ /* * x86-64 Task Priority Register, CR8 -- cgit v1.1 From de5397ad5b9ad22e2401c4dacdf1bb3b19c05679 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Wed, 11 May 2011 16:51:05 -0700 Subject: x86, cpu: Enable/disable Supervisor Mode Execution Protection Enable/disable newly documented SMEP (Supervisor Mode Execution Protection) CPU feature in kernel. CR4.SMEP (bit 20) is 0 at power-on. If the feature is supported by CPU (X86_FEATURE_SMEP), enable SMEP by setting CR4.SMEP. New kernel option nosmep disables the feature even if the feature is supported by CPU. [ hpa: moved the call to setup_smep() until after the vendor-specific initialization; that ensures that CPUID features are unmasked. We will still run it before we have userspace (never mind uncontrolled userspace). ] Signed-off-by: Fenghua Yu LKML-Reference: <1305157865-31727-1-git-send-email-fenghua.yu@intel.com> Signed-off-by: H. Peter Anvin --- Documentation/kernel-parameters.txt | 4 ++++ arch/x86/kernel/cpu/common.c | 23 +++++++++++++++++++++++ 2 files changed, 27 insertions(+) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index cc85a92..56fb8c1 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1664,6 +1664,10 @@ bytes respectively. Such letter suffixes can also be entirely omitted. noexec=on: enable non-executable mappings (default) noexec=off: disable non-executable mappings + nosmep [X86] + Disable SMEP (Supervisor Mode Execution Protection) + even if it is supported by processor. + noexec32 [X86-64] This affects only 32-bit executables. noexec32=on: enable non-executable mappings (default) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 173f3a3..cbc70a2 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -254,6 +254,25 @@ static inline void squash_the_stupid_serial_number(struct cpuinfo_x86 *c) } #endif +static int disable_smep __initdata; +static __init int setup_disable_smep(char *arg) +{ + disable_smep = 1; + return 1; +} +__setup("nosmep", setup_disable_smep); + +static __init void setup_smep(struct cpuinfo_x86 *c) +{ + if (cpu_has(c, X86_FEATURE_SMEP)) { + if (unlikely(disable_smep)) { + setup_clear_cpu_cap(X86_FEATURE_SMEP); + clear_in_cr4(X86_CR4_SMEP); + } else + set_in_cr4(X86_CR4_SMEP); + } +} + /* * Some CPU features depend on higher CPUID levels, which may not always * be available due to CPUID level capping or broken virtualization @@ -667,6 +686,8 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) c->cpu_index = 0; #endif filter_cpuid_features(c, false); + + setup_smep(c); } void __init early_cpu_init(void) @@ -752,6 +773,8 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 *c) #endif } + setup_smep(c); + get_model_name(c); /* Default name */ detect_nopl(c); -- cgit v1.1 From 0bf2461fdd9008290cf429e50e4f362dafab4249 Mon Sep 17 00:00:00 2001 From: Alexandre Bounine Date: Tue, 17 May 2011 15:44:08 -0700 Subject: rapidio: fix default routing initialization Fix switch initialization to ensure that all switches have default routing disabled. This guarantees that no unexpected RapidIO packets arrive to the default port set by reset and there is no default routing destination until it is properly configured by software. This update also unifies handling of unmapped destinations by tsi57x, IDT Gen1 and IDT Gen2 switches. Signed-off-by: Alexandre Bounine Cc: Kumar Gala Cc: Matt Porter Cc: Li Yang Cc: Thomas Moll Cc: [2.6.37+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rapidio/switches/idt_gen2.c | 9 +++++++++ drivers/rapidio/switches/idtcps.c | 6 ++++++ drivers/rapidio/switches/tsi57x.c | 6 ++++++ 3 files changed, 21 insertions(+) diff --git a/drivers/rapidio/switches/idt_gen2.c b/drivers/rapidio/switches/idt_gen2.c index ac2701b..043ee31 100644 --- a/drivers/rapidio/switches/idt_gen2.c +++ b/drivers/rapidio/switches/idt_gen2.c @@ -95,6 +95,9 @@ idtg2_route_add_entry(struct rio_mport *mport, u16 destid, u8 hopcount, else table++; + if (route_port == RIO_INVALID_ROUTE) + route_port = IDT_DEFAULT_ROUTE; + rio_mport_write_config_32(mport, destid, hopcount, LOCAL_RTE_CONF_DESTID_SEL, table); @@ -411,6 +414,12 @@ static int idtg2_switch_init(struct rio_dev *rdev, int do_enum) rdev->rswitch->em_handle = idtg2_em_handler; rdev->rswitch->sw_sysfs = idtg2_sysfs; + if (do_enum) { + /* Ensure that default routing is disabled on startup */ + rio_write_config_32(rdev, + RIO_STD_RTE_DEFAULT_PORT, IDT_NO_ROUTE); + } + return 0; } diff --git a/drivers/rapidio/switches/idtcps.c b/drivers/rapidio/switches/idtcps.c index 3a97107..d06ee2d 100644 --- a/drivers/rapidio/switches/idtcps.c +++ b/drivers/rapidio/switches/idtcps.c @@ -26,6 +26,9 @@ idtcps_route_add_entry(struct rio_mport *mport, u16 destid, u8 hopcount, { u32 result; + if (route_port == RIO_INVALID_ROUTE) + route_port = CPS_DEFAULT_ROUTE; + if (table == RIO_GLOBAL_TABLE) { rio_mport_write_config_32(mport, destid, hopcount, RIO_STD_RTE_CONF_DESTID_SEL_CSR, route_destid); @@ -130,6 +133,9 @@ static int idtcps_switch_init(struct rio_dev *rdev, int do_enum) /* set TVAL = ~50us */ rio_write_config_32(rdev, rdev->phys_efptr + RIO_PORT_LINKTO_CTL_CSR, 0x8e << 8); + /* Ensure that default routing is disabled on startup */ + rio_write_config_32(rdev, + RIO_STD_RTE_DEFAULT_PORT, CPS_NO_ROUTE); } return 0; diff --git a/drivers/rapidio/switches/tsi57x.c b/drivers/rapidio/switches/tsi57x.c index 1a62934..db8b802 100644 --- a/drivers/rapidio/switches/tsi57x.c +++ b/drivers/rapidio/switches/tsi57x.c @@ -303,6 +303,12 @@ static int tsi57x_switch_init(struct rio_dev *rdev, int do_enum) rdev->rswitch->em_init = tsi57x_em_init; rdev->rswitch->em_handle = tsi57x_em_handler; + if (do_enum) { + /* Ensure that default routing is disabled on startup */ + rio_write_config_32(rdev, RIO_STD_RTE_DEFAULT_PORT, + RIO_INVALID_ROUTE); + } + return 0; } -- cgit v1.1 From d5f33d45e4c0e306e8d16b4573891a65d9ad544f Mon Sep 17 00:00:00 2001 From: Axel Lin Date: Tue, 17 May 2011 15:44:09 -0700 Subject: drivers/leds/leds-lm3530.c: add MODULE_DEVICE_TABLE Adding the necessary MODULE_DEVICE_TABLE() information allows the driver to be automatically loaded by udev. Signed-off-by: Axel Lin Cc: Shreshtha Kumar SAHU Cc: Richard Purdie Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/leds/leds-lm3530.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/leds/leds-lm3530.c b/drivers/leds/leds-lm3530.c index e7089a1..b37e618 100644 --- a/drivers/leds/leds-lm3530.c +++ b/drivers/leds/leds-lm3530.c @@ -349,6 +349,7 @@ static const struct i2c_device_id lm3530_id[] = { {LM3530_NAME, 0}, {} }; +MODULE_DEVICE_TABLE(i2c, lm3530_id); static struct i2c_driver lm3530_i2c_driver = { .probe = lm3530_probe, -- cgit v1.1 From d6c438b6cd733834a3cec55af8577a8fc3548016 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Tue, 17 May 2011 15:44:10 -0700 Subject: memcg: fix zone congestion ZONE_CONGESTED should be a state of global memory reclaim. If not, a busy memcg sets this and give unnecessary throttoling in wait_iff_congested() against memory recalim in other contexts. This makes system performance bad. I'll think about "memcg is congested!" flag is required or not, later. But this fix is required first. Signed-off-by: KAMEZAWA Hiroyuki Reviewed-by: Minchan Kim Cc: Daisuke Nishimura Acked-by: Ying Han Cc: Balbir Singh Cc: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index f6b435c..8bfd450 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -937,7 +937,7 @@ keep_lumpy: * back off and wait for congestion to clear because further reclaim * will encounter the same problem */ - if (nr_dirty == nr_congested && nr_dirty != 0) + if (nr_dirty && nr_dirty == nr_congested && scanning_global_lru(sc)) zone_set_flag(zone, ZONE_CONGESTED); free_page_list(&free_pages); -- cgit v1.1 From b2db21997f43907f54500edaf063253ca2a186f9 Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Tue, 17 May 2011 15:44:11 -0700 Subject: um: fix abort os_dump_core() uses abort() to terminate UML in case of an fatal error. glibc's abort() calls raise(SIGABRT) which makes use of tgkill(). tgkill() has no effect within UML's kernel threads because they are not pthreads. As fallback abort() executes an invalid instruction to terminate the process. Therefore UML gets killed by SIGSEGV and leaves a ugly log entry in the host's kernel ring buffer. To get rid of this we use our own abort routine. Signed-off-by: Richard Weinberger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/um/os-Linux/util.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/arch/um/os-Linux/util.c b/arch/um/os-Linux/util.c index 6ea7797..42827ca 100644 --- a/arch/um/os-Linux/util.c +++ b/arch/um/os-Linux/util.c @@ -5,6 +5,7 @@ #include #include +#include #include #include #include @@ -75,6 +76,26 @@ void setup_hostinfo(char *buf, int len) host.release, host.version, host.machine); } +/* + * We cannot use glibc's abort(). It makes use of tgkill() which + * has no effect within UML's kernel threads. + * After that glibc would execute an invalid instruction to kill + * the calling process and UML crashes with SIGSEGV. + */ +static inline void __attribute__ ((noreturn)) uml_abort(void) +{ + sigset_t sig; + + fflush(NULL); + + if (!sigemptyset(&sig) && !sigaddset(&sig, SIGABRT)) + sigprocmask(SIG_UNBLOCK, &sig, 0); + + for (;;) + if (kill(getpid(), SIGABRT) < 0) + exit(127); +} + void os_dump_core(void) { int pid; @@ -116,5 +137,5 @@ void os_dump_core(void) while ((pid = waitpid(-1, NULL, WNOHANG | __WALL)) > 0) os_kill_ptraced_process(pid, 0); - abort(); + uml_abort(); } -- cgit v1.1 From f12a20fc9bfba4218ecbc4e40c8e08dc2a85dc99 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 17 May 2011 15:44:12 -0700 Subject: procfs: add stub for proc_mkdir_mode() Provide a stub for proc_mkdir_mode() when CONFIG_PROC_FS is not enabled, just like the stub for proc_mkdir(). Fixes this linux-next build error: drivers/net/wireless/airo.c:4504: error: implicit declaration of function 'proc_mkdir_mode' Signed-off-by: Randy Dunlap Cc: Stephen Rothwell Cc: Alexey Dobriyan Cc: "John W. Linville" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/proc_fs.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index 838c114..eaf4350 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -208,6 +208,8 @@ static inline struct proc_dir_entry *proc_symlink(const char *name, struct proc_dir_entry *parent,const char *dest) {return NULL;} static inline struct proc_dir_entry *proc_mkdir(const char *name, struct proc_dir_entry *parent) {return NULL;} +static inline struct proc_dir_entry *proc_mkdir_mode(const char *name, + mode_t mode, struct proc_dir_entry *parent) { return NULL; } static inline struct proc_dir_entry *create_proc_read_entry(const char *name, mode_t mode, struct proc_dir_entry *base, -- cgit v1.1 From 3ec717b7ca4ee1d75d77e4f6286430d8f01d1dbd Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 18 May 2011 11:22:43 +0200 Subject: block: don't delay blk_run_queue_async Let's check a scenario: 1. blk_delay_queue(q, SCSI_QUEUE_DELAY); 2. blk_run_queue_async(); the second one will became a noop, because q->delay_work already has WORK_STRUCT_PENDING_BIT set, so the delayed work will still run after SCSI_QUEUE_DELAY. But blk_run_queue_async actually hopes the delayed work runs immediately. Fix this by doing a cancel on potentially pending delayed work before queuing an immediate run of the workqueue. Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- block/blk-core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/block/blk-core.c b/block/blk-core.c index a2e58ee..3fe00a1 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -316,8 +316,10 @@ EXPORT_SYMBOL(__blk_run_queue); */ void blk_run_queue_async(struct request_queue *q) { - if (likely(!blk_queue_stopped(q))) + if (likely(!blk_queue_stopped(q))) { + __cancel_delayed_work(&q->delay_work); queue_delayed_work(kblockd_workqueue, &q->delay_work, 0); + } } EXPORT_SYMBOL(blk_run_queue_async); -- cgit v1.1 From df7f99670a4c76f269ae57ce91876b309417a316 Mon Sep 17 00:00:00 2001 From: Joel Becker Date: Tue, 22 Feb 2011 01:09:49 -0800 Subject: configfs: Don't try to d_delete() negative dentries. When configfs is faking mkdir() on its subsystem or default group objects, it starts by adding a negative dentry. It then tries to instantiate the group. If that should fail, it must clean up after itself. I was using d_delete() here, but configfs_attach_group() promises to return an empty dentry on error. d_delete() explodes with the entry dentry. Let's try d_drop() instead. The unhashing is what we want for our dentry. Signed-off-by: Joel Becker --- fs/configfs/dir.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index 3313dd1..b11d734 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -689,7 +689,8 @@ static int create_default_group(struct config_group *parent_group, sd = child->d_fsdata; sd->s_type |= CONFIGFS_USET_DEFAULT; } else { - d_delete(child); + BUG_ON(child->d_inode); + d_drop(child); dput(child); } } @@ -1683,7 +1684,8 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys) err = configfs_attach_group(sd->s_element, &group->cg_item, dentry); if (err) { - d_delete(dentry); + BUG_ON(dentry->d_inode); + d_drop(dentry); dput(dentry); } else { spin_lock(&configfs_dirent_lock); -- cgit v1.1 From 26afb7c661080ae3f1f13ddf7f0c58c4f931c22b Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 12 May 2011 16:30:30 +0200 Subject: x86, 64-bit: Fix copy_[to/from]_user() checks for the userspace address limit As reported in BZ #30352: https://bugzilla.kernel.org/show_bug.cgi?id=30352 there's a kernel bug related to reading the last allowed page on x86_64. The _copy_to_user() and _copy_from_user() functions use the following check for address limit: if (buf + size >= limit) fail(); while it should be more permissive: if (buf + size > limit) fail(); That's because the size represents the number of bytes being read/write from/to buf address AND including the buf address. So the copy function will actually never touch the limit address even if "buf + size == limit". Following program fails to use the last page as buffer due to the wrong limit check: #include #include #include #define PAGE_SIZE (4096) #define LAST_PAGE ((void*)(0x7fffffffe000)) int main() { int fds[2], err; void * ptr = mmap(LAST_PAGE, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, -1, 0); assert(ptr == LAST_PAGE); err = socketpair(AF_LOCAL, SOCK_STREAM, 0, fds); assert(err == 0); err = send(fds[0], ptr, PAGE_SIZE, 0); perror("send"); assert(err == PAGE_SIZE); err = recv(fds[1], ptr, PAGE_SIZE, MSG_WAITALL); perror("recv"); assert(err == PAGE_SIZE); return 0; } The other place checking the addr limit is the access_ok() function, which is working properly. There's just a misleading comment for the __range_not_ok() macro - which this patch fixes as well. The last page of the user-space address range is a guard page and Brian Gerst observed that the guard page itself due to an erratum on K8 cpus (#121 Sequential Execution Across Non-Canonical Boundary Causes Processor Hang). However, the test code is using the last valid page before the guard page. The bug is that the last byte before the guard page can't be read because of the off-by-one error. The guard page is left in place. This bug would normally not show up because the last page is part of the process stack and never accessed via syscalls. Signed-off-by: Jiri Olsa Acked-by: Brian Gerst Acked-by: Linus Torvalds Cc: Link: http://lkml.kernel.org/r/1305210630-7136-1-git-send-email-jolsa@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uaccess.h | 2 +- arch/x86/lib/copy_user_64.S | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index abd3e0e..99f0ad7 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -42,7 +42,7 @@ * Returns 0 if the range is valid, nonzero otherwise. * * This is equivalent to the following test: - * (u33)addr + (u33)size >= (u33)current->addr_limit.seg (u65 for x86_64) + * (u33)addr + (u33)size > (u33)current->addr_limit.seg (u65 for x86_64) * * This needs 33-bit (65-bit for x86_64) arithmetic. We have a carry... */ diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index d17a117..0248402 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -79,7 +79,7 @@ ENTRY(_copy_to_user) addq %rdx,%rcx jc bad_to_user cmpq TI_addr_limit(%rax),%rcx - jae bad_to_user + ja bad_to_user ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,X86_FEATURE_ERMS, \ copy_user_generic_unrolled,copy_user_generic_string, \ copy_user_enhanced_fast_string @@ -94,7 +94,7 @@ ENTRY(_copy_from_user) addq %rdx,%rcx jc bad_from_user cmpq TI_addr_limit(%rax),%rcx - jae bad_from_user + ja bad_from_user ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,X86_FEATURE_ERMS, \ copy_user_generic_unrolled,copy_user_generic_string, \ copy_user_enhanced_fast_string -- cgit v1.1 From 24307aa1e707b31613be92deaba7990e16bc1aec Mon Sep 17 00:00:00 2001 From: Joel Becker Date: Wed, 18 May 2011 04:08:16 -0700 Subject: configfs: Fix race between configfs_readdir() and configfs_d_iput() configfs_readdir() will use the existing inode numbers of inodes in the dcache, but it makes them up for attribute files that aren't currently instantiated. There is a race where a closing attribute file can be tearing down at the same time as configfs_readdir() is trying to get its inode number. We want to get the inode number of open attribute files, because they should match while instantiated. We can't lock down the transition where dentry->d_inode is set to NULL, so we just check for NULL there. We can, however, ensure that an inode we find isn't iput() in configfs_d_iput() until after we've accessed it. Signed-off-by: Joel Becker --- fs/configfs/dir.c | 33 ++++++++++++++++++++++++++++----- 1 file changed, 28 insertions(+), 5 deletions(-) diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index b11d734..9a37a9b 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -53,11 +53,14 @@ DEFINE_SPINLOCK(configfs_dirent_lock); static void configfs_d_iput(struct dentry * dentry, struct inode * inode) { - struct configfs_dirent * sd = dentry->d_fsdata; + struct configfs_dirent *sd = dentry->d_fsdata; if (sd) { BUG_ON(sd->s_dentry != dentry); + /* Coordinate with configfs_readdir */ + spin_lock(&configfs_dirent_lock); sd->s_dentry = NULL; + spin_unlock(&configfs_dirent_lock); configfs_put(sd); } iput(inode); @@ -1546,7 +1549,7 @@ static int configfs_readdir(struct file * filp, void * dirent, filldir_t filldir struct configfs_dirent * parent_sd = dentry->d_fsdata; struct configfs_dirent *cursor = filp->private_data; struct list_head *p, *q = &cursor->s_sibling; - ino_t ino; + ino_t ino = 0; int i = filp->f_pos; switch (i) { @@ -1574,6 +1577,7 @@ static int configfs_readdir(struct file * filp, void * dirent, filldir_t filldir struct configfs_dirent *next; const char * name; int len; + struct inode *inode = NULL; next = list_entry(p, struct configfs_dirent, s_sibling); @@ -1582,9 +1586,28 @@ static int configfs_readdir(struct file * filp, void * dirent, filldir_t filldir name = configfs_get_name(next); len = strlen(name); - if (next->s_dentry) - ino = next->s_dentry->d_inode->i_ino; - else + + /* + * We'll have a dentry and an inode for + * PINNED items and for open attribute + * files. We lock here to prevent a race + * with configfs_d_iput() clearing + * s_dentry before calling iput(). + * + * Why do we go to the trouble? If + * someone has an attribute file open, + * the inode number should match until + * they close it. Beyond that, we don't + * care. + */ + spin_lock(&configfs_dirent_lock); + dentry = next->s_dentry; + if (dentry) + inode = dentry->d_inode; + if (inode) + ino = inode->i_ino; + spin_unlock(&configfs_dirent_lock); + if (!inode) ino = iunique(configfs_sb, 2); if (filldir(dirent, name, len, filp->f_pos, ino, -- cgit v1.1 From fe0514348452f5b0ad7e842b0d71b8322b1297de Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 18 May 2011 12:53:03 +0200 Subject: genirq: Remove pointless ifdefs kernel/irq/ is only built when CONFIG_GENERIC_HARDIRQS=y. So making code inside of kernel/irq/ conditional on CONFIG_GENERIC_HARDIRQS is pointless. Signed-off-by: Thomas Gleixner --- kernel/irq/irqdesc.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 2c039c9..e07b975 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -22,7 +22,7 @@ */ static struct lock_class_key irq_desc_lock_class; -#if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS) +#if defined(CONFIG_SMP) static void __init init_irq_default_affinity(void) { alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT); @@ -430,7 +430,6 @@ unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) *per_cpu_ptr(desc->kstat_irqs, cpu) : 0; } -#ifdef CONFIG_GENERIC_HARDIRQS unsigned int kstat_irqs(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); @@ -443,4 +442,3 @@ unsigned int kstat_irqs(unsigned int irq) sum += *per_cpu_ptr(desc->kstat_irqs, cpu); return sum; } -#endif /* CONFIG_GENERIC_HARDIRQS */ -- cgit v1.1 From fe12bc2c996d3e492b2920e32ac79f7bbae3e15d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 18 May 2011 12:48:00 +0200 Subject: genirq: Uninline and sanity check generic_handle_irq() generic_handle_irq() is missing a NULL pointer check for the result of irq_to_desc. This was a not a big problem, but we want to expose it to drivers, so we better have sanity checks in place. Add a return value as well, which indicates that the irq number was valid and the handler was invoked. Based on the pure code move from Jonathan Cameron. Signed-off-by: Thomas Gleixner Cc: Jonathan Cameron --- include/linux/irqdesc.h | 5 +---- kernel/irq/irqdesc.c | 15 +++++++++++++++ 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index c70b1aa..2d921b3 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -111,10 +111,7 @@ static inline void generic_handle_irq_desc(unsigned int irq, struct irq_desc *de desc->handle_irq(irq, desc); } -static inline void generic_handle_irq(unsigned int irq) -{ - generic_handle_irq_desc(irq, irq_to_desc(irq)); -} +int generic_handle_irq(unsigned int irq); /* Test to see if a driver has successfully requested an irq */ static inline int irq_has_action(unsigned int irq) diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index e07b975..9f65b02 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -290,6 +290,21 @@ static int irq_expand_nr_irqs(unsigned int nr) #endif /* !CONFIG_SPARSE_IRQ */ +/** + * generic_handle_irq - Invoke the handler for a particular irq + * @irq: The irq number to handle + * + */ +int generic_handle_irq(unsigned int irq) +{ + struct irq_desc *desc = irq_to_desc(irq); + + if (!desc) + return -EINVAL; + generic_handle_irq_desc(irq, desc); + return 0; +} + /* Dynamic interrupt handling */ /** -- cgit v1.1 From edf76f8307c350bcb81f0c760118a991b3e62956 Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Wed, 18 May 2011 10:39:04 +0100 Subject: irq: Export functions to allow modular irq drivers Export handle_simple_irq, irq_modify_status, irq_alloc_descs, irq_free_descs and generic_handle_irq to allow their usage in modules. First user is IIO, which wants to be built modular, but needs to be able to create irq chips, allocate and configure interrupt descriptors and handle demultiplexing interrupts. [ tglx: Moved the uninlinig of generic_handle_irq to a separate patch ] Signed-off-by: Jonathan Cameron Link: http://lkml.kernel.org/r/%3C1305711544-505-1-git-send-email-jic23%40cam.ac.uk%3E Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 2 ++ kernel/irq/irqdesc.c | 3 +++ 2 files changed, 5 insertions(+) diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 52d856d..d5a3009 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -310,6 +310,7 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc) out_unlock: raw_spin_unlock(&desc->lock); } +EXPORT_SYMBOL_GPL(handle_simple_irq); /** * handle_level_irq - Level type irq handler @@ -613,6 +614,7 @@ void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) irq_put_desc_unlock(desc, flags); } +EXPORT_SYMBOL_GPL(irq_modify_status); /** * irq_cpu_online - Invoke all irq_cpu_online functions. diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 9f65b02..886e803 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -304,6 +304,7 @@ int generic_handle_irq(unsigned int irq) generic_handle_irq_desc(irq, desc); return 0; } +EXPORT_SYMBOL_GPL(generic_handle_irq); /* Dynamic interrupt handling */ @@ -326,6 +327,7 @@ void irq_free_descs(unsigned int from, unsigned int cnt) bitmap_clear(allocated_irqs, from, cnt); mutex_unlock(&sparse_irq_lock); } +EXPORT_SYMBOL_GPL(irq_free_descs); /** * irq_alloc_descs - allocate and initialize a range of irq descriptors @@ -366,6 +368,7 @@ err: mutex_unlock(&sparse_irq_lock); return ret; } +EXPORT_SYMBOL_GPL(irq_alloc_descs); /** * irq_reserve_irqs - mark irqs allocated -- cgit v1.1 From 3436830af53c38b7674097c00b02b7a4064476f2 Mon Sep 17 00:00:00 2001 From: Ralf Baechle Date: Thu, 12 May 2011 13:55:48 +0100 Subject: MIPS: RB532: Fix iomap resource size miscalculation. This is the MIPS portion of Joe Perches 's https://patchwork.linux-mips.org/patch/2172/ which seems to have been lost in time and space. Signed-off-by: Ralf Baechle --- arch/mips/rb532/gpio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/mips/rb532/gpio.c b/arch/mips/rb532/gpio.c index 37de05d..6c47dfe 100644 --- a/arch/mips/rb532/gpio.c +++ b/arch/mips/rb532/gpio.c @@ -185,7 +185,7 @@ int __init rb532_gpio_init(void) struct resource *r; r = rb532_gpio_reg0_res; - rb532_gpio_chip->regbase = ioremap_nocache(r->start, r->end - r->start); + rb532_gpio_chip->regbase = ioremap_nocache(r->start, resource_size(r)); if (!rb532_gpio_chip->regbase) { printk(KERN_ERR "rb532: cannot remap GPIO register 0\n"); -- cgit v1.1 From 10423c91ffc8e59d4f99d401f7beb3115cdc117a Mon Sep 17 00:00:00 2001 From: Ralf Baechle Date: Fri, 13 May 2011 10:33:28 +0100 Subject: MIPS: Fix duplicate invocation of notify_die. Initial patch by Yury Polyanskiy . Signed-off-by: Ralf Baechle Patchwork: https://patchwork.linux-mips.org/patch/2373/ --- arch/mips/kernel/traps.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c index 71350f7..e9b3af2 100644 --- a/arch/mips/kernel/traps.c +++ b/arch/mips/kernel/traps.c @@ -374,7 +374,8 @@ void __noreturn die(const char *str, struct pt_regs *regs) unsigned long dvpret = dvpe(); #endif /* CONFIG_MIPS_MT_SMTC */ - notify_die(DIE_OOPS, str, regs, 0, regs_to_trapnr(regs), SIGSEGV); + if (notify_die(DIE_OOPS, str, regs, 0, regs_to_trapnr(regs), SIGSEGV) == NOTIFY_STOP) + sig = 0; console_verbose(); spin_lock_irq(&die_lock); @@ -383,9 +384,6 @@ void __noreturn die(const char *str, struct pt_regs *regs) mips_mt_regdump(dvpret); #endif /* CONFIG_MIPS_MT_SMTC */ - if (notify_die(DIE_OOPS, str, regs, 0, regs_to_trapnr(regs), SIGSEGV) == NOTIFY_STOP) - sig = 0; - printk("%s[#%d]:\n", str, ++die_counter); show_registers(regs); add_taint(TAINT_DIE); -- cgit v1.1 From 3e9957b4866f3767f19bf0e543b322ad7906c564 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Fri, 13 May 2011 17:41:21 +0200 Subject: MIPS: AR7: Fix GPIO register size for Titan variant. The 'size' variable contains the correct register size for both AR7 and Titan, but we never used it to ioremap the correct register size. This problem only shows up on Titan. [ralf@linux-mips.org: Fixed the fix. The original patch as in patchwork recognizes the problem correctly then fails to fix it ...] Reported-by: Alexander Clouter Signed-off-by: Florian Fainelli Patchwork: https://patchwork.linux-mips.org/patch/2380/ Signed-off-by: Ralf Baechle --- arch/mips/ar7/gpio.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/mips/ar7/gpio.c b/arch/mips/ar7/gpio.c index 425dfa5..bb571bc 100644 --- a/arch/mips/ar7/gpio.c +++ b/arch/mips/ar7/gpio.c @@ -325,9 +325,7 @@ int __init ar7_gpio_init(void) size = 0x1f; } - gpch->regs = ioremap_nocache(AR7_REGS_GPIO, - AR7_REGS_GPIO + 0x10); - + gpch->regs = ioremap_nocache(AR7_REGS_GPIO, size); if (!gpch->regs) { printk(KERN_ERR "%s: failed to ioremap regs\n", gpch->chip.label); -- cgit v1.1 From a5602a3273774c720aaf165ff670e5b85e5910a5 Mon Sep 17 00:00:00 2001 From: Ralf Baechle Date: Wed, 18 May 2011 13:14:36 +0100 Subject: MIPS: Kludge IP27 build for 2.6.39. Signed-off-by: Ralf Baechle --- arch/mips/include/asm/dma-mapping.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/mips/include/asm/dma-mapping.h b/arch/mips/include/asm/dma-mapping.h index 655f849..7aa37dd 100644 --- a/arch/mips/include/asm/dma-mapping.h +++ b/arch/mips/include/asm/dma-mapping.h @@ -5,7 +5,9 @@ #include #include +#ifndef CONFIG_SGI_IP27 /* Kludge to fix 2.6.39 build for IP27 */ #include +#endif extern struct dma_map_ops *mips_dma_map_ops; -- cgit v1.1 From 01294d82622d6d9d64bde8e4530c7e2c6dbb6ee6 Mon Sep 17 00:00:00 2001 From: Milton Miller Date: Wed, 18 May 2011 10:27:39 -0500 Subject: of: fix race when matching drivers If two drivers are probing devices at the same time, both will write their match table result to the dev->of_match cache at the same time. Only write the result if the device matches. In a thread titled "SBus devices sometimes detected, sometimes not", Meelis reported his SBus hme was not detected about 50% of the time. From the debug suggested by Grant it was obvious another driver matched some devices between the call to match the hme and the hme discovery failling. Reported-by: Meelis Roos Signed-off-by: Milton Miller [grant.likely: modified to only call of_match_device() once] Signed-off-by: Grant Likely --- include/linux/of_device.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/include/linux/of_device.h b/include/linux/of_device.h index 8bfe6c1..b33d688 100644 --- a/include/linux/of_device.h +++ b/include/linux/of_device.h @@ -21,8 +21,12 @@ extern void of_device_make_bus_id(struct device *dev); static inline int of_driver_match_device(struct device *dev, const struct device_driver *drv) { - dev->of_match = of_match_device(drv->of_match_table, dev); - return dev->of_match != NULL; + const struct of_device_id *match; + + match = of_match_device(drv->of_match_table, dev); + if (match) + dev->of_match = match; + return match != NULL; } extern struct platform_device *of_dev_get(struct platform_device *dev); -- cgit v1.1 From b1608d69cb804e414d0887140ba08a9398e4e638 Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Wed, 18 May 2011 11:19:24 -0600 Subject: drivercore: revert addition of of_match to struct device Commit b826291c, "drivercore/dt: add a match table pointer to struct device" added an of_match pointer to struct device to cache the of_match_table entry discovered at driver match time. This was unsafe because matching is not an atomic operation with probing a driver. If two or more drivers are attempted to be matched to a driver at the same time, then the cached matching entry pointer could get overwritten. This patch reverts the of_match cache pointer and reworks all users to call of_match_device() directly instead. Signed-off-by: Grant Likely --- arch/powerpc/platforms/83xx/suspend.c | 7 +++++-- arch/powerpc/sysdev/fsl_msi.c | 7 +++++-- arch/sparc/kernel/pci_sabre.c | 5 ++++- arch/sparc/kernel/pci_schizo.c | 8 ++++++-- drivers/atm/fore200e.c | 7 +++++-- drivers/char/hw_random/n2-drv.c | 7 +++++-- drivers/char/ipmi/ipmi_si_intf.c | 7 +++++-- drivers/char/xilinx_hwicap/xilinx_hwicap.c | 14 +++++++++----- drivers/edac/ppc4xx_edac.c | 2 +- drivers/i2c/busses/i2c-mpc.c | 9 ++++++--- drivers/mmc/host/sdhci-of-core.c | 7 +++++-- drivers/mtd/maps/physmap_of.c | 7 +++++-- drivers/net/can/mscan/mpc5xxx_can.c | 7 +++++-- drivers/net/fs_enet/fs_enet-main.c | 9 ++++++--- drivers/net/fs_enet/mii-fec.c | 7 +++++-- drivers/net/sunhme.c | 7 +++++-- drivers/scsi/qlogicpti.c | 7 +++++-- drivers/tty/serial/of_serial.c | 7 +++++-- drivers/usb/gadget/fsl_qe_udc.c | 7 +++++-- drivers/watchdog/mpc8xxx_wdt.c | 7 +++++-- include/linux/device.h | 1 - include/linux/of_device.h | 12 ++++++------ 22 files changed, 108 insertions(+), 50 deletions(-) diff --git a/arch/powerpc/platforms/83xx/suspend.c b/arch/powerpc/platforms/83xx/suspend.c index 1882729..104faa8 100644 --- a/arch/powerpc/platforms/83xx/suspend.c +++ b/arch/powerpc/platforms/83xx/suspend.c @@ -318,17 +318,20 @@ static const struct platform_suspend_ops mpc83xx_suspend_ops = { .end = mpc83xx_suspend_end, }; +static struct of_device_id pmc_match[]; static int pmc_probe(struct platform_device *ofdev) { + const struct of_device_id *match; struct device_node *np = ofdev->dev.of_node; struct resource res; struct pmc_type *type; int ret = 0; - if (!ofdev->dev.of_match) + match = of_match_device(pmc_match, &ofdev->dev); + if (!match) return -EINVAL; - type = ofdev->dev.of_match->data; + type = match->data; if (!of_device_is_available(np)) return -ENODEV; diff --git a/arch/powerpc/sysdev/fsl_msi.c b/arch/powerpc/sysdev/fsl_msi.c index d5679dc..01cd2f0 100644 --- a/arch/powerpc/sysdev/fsl_msi.c +++ b/arch/powerpc/sysdev/fsl_msi.c @@ -304,8 +304,10 @@ static int __devinit fsl_msi_setup_hwirq(struct fsl_msi *msi, return 0; } +static const struct of_device_id fsl_of_msi_ids[]; static int __devinit fsl_of_msi_probe(struct platform_device *dev) { + const struct of_device_id *match; struct fsl_msi *msi; struct resource res; int err, i, j, irq_index, count; @@ -316,9 +318,10 @@ static int __devinit fsl_of_msi_probe(struct platform_device *dev) u32 offset; static const u32 all_avail[] = { 0, NR_MSI_IRQS }; - if (!dev->dev.of_match) + match = of_match_device(fsl_of_msi_ids, &dev->dev); + if (!match) return -EINVAL; - features = dev->dev.of_match->data; + features = match->data; printk(KERN_DEBUG "Setting up Freescale MSI support\n"); diff --git a/arch/sparc/kernel/pci_sabre.c b/arch/sparc/kernel/pci_sabre.c index 948068a..d1840db 100644 --- a/arch/sparc/kernel/pci_sabre.c +++ b/arch/sparc/kernel/pci_sabre.c @@ -452,8 +452,10 @@ static void __devinit sabre_pbm_init(struct pci_pbm_info *pbm, sabre_scan_bus(pbm, &op->dev); } +static const struct of_device_id sabre_match[]; static int __devinit sabre_probe(struct platform_device *op) { + const struct of_device_id *match; const struct linux_prom64_registers *pr_regs; struct device_node *dp = op->dev.of_node; struct pci_pbm_info *pbm; @@ -463,7 +465,8 @@ static int __devinit sabre_probe(struct platform_device *op) const u32 *vdma; u64 clear_irq; - hummingbird_p = op->dev.of_match && (op->dev.of_match->data != NULL); + match = of_match_device(sabre_match, &op->dev); + hummingbird_p = match && (match->data != NULL); if (!hummingbird_p) { struct device_node *cpu_dp; diff --git a/arch/sparc/kernel/pci_schizo.c b/arch/sparc/kernel/pci_schizo.c index fecfcb2..283fbc3 100644 --- a/arch/sparc/kernel/pci_schizo.c +++ b/arch/sparc/kernel/pci_schizo.c @@ -1458,11 +1458,15 @@ out_err: return err; } +static const struct of_device_id schizo_match[]; static int __devinit schizo_probe(struct platform_device *op) { - if (!op->dev.of_match) + const struct of_device_id *match; + + match = of_match_device(schizo_match, &op->dev); + if (!match) return -EINVAL; - return __schizo_init(op, (unsigned long) op->dev.of_match->data); + return __schizo_init(op, (unsigned long)match->data); } /* The ordering of this table is very important. Some Tomatillo diff --git a/drivers/atm/fore200e.c b/drivers/atm/fore200e.c index bdd2719..bc9e702 100644 --- a/drivers/atm/fore200e.c +++ b/drivers/atm/fore200e.c @@ -2643,16 +2643,19 @@ fore200e_init(struct fore200e* fore200e, struct device *parent) } #ifdef CONFIG_SBUS +static const struct of_device_id fore200e_sba_match[]; static int __devinit fore200e_sba_probe(struct platform_device *op) { + const struct of_device_id *match; const struct fore200e_bus *bus; struct fore200e *fore200e; static int index = 0; int err; - if (!op->dev.of_match) + match = of_match_device(fore200e_sba_match, &op->dev); + if (!match) return -EINVAL; - bus = op->dev.of_match->data; + bus = match->data; fore200e = kzalloc(sizeof(struct fore200e), GFP_KERNEL); if (!fore200e) diff --git a/drivers/char/hw_random/n2-drv.c b/drivers/char/hw_random/n2-drv.c index 43ac619..ac6739e 100644 --- a/drivers/char/hw_random/n2-drv.c +++ b/drivers/char/hw_random/n2-drv.c @@ -619,15 +619,18 @@ static void __devinit n2rng_driver_version(void) pr_info("%s", version); } +static const struct of_device_id n2rng_match[]; static int __devinit n2rng_probe(struct platform_device *op) { + const struct of_device_id *match; int victoria_falls; int err = -ENOMEM; struct n2rng *np; - if (!op->dev.of_match) + match = of_match_device(n2rng_match, &op->dev); + if (!match) return -EINVAL; - victoria_falls = (op->dev.of_match->data != NULL); + victoria_falls = (match->data != NULL); n2rng_driver_version(); np = kzalloc(sizeof(*np), GFP_KERNEL); diff --git a/drivers/char/ipmi/ipmi_si_intf.c b/drivers/char/ipmi/ipmi_si_intf.c index cc6c9b2..64c6b85 100644 --- a/drivers/char/ipmi/ipmi_si_intf.c +++ b/drivers/char/ipmi/ipmi_si_intf.c @@ -2554,9 +2554,11 @@ static struct pci_driver ipmi_pci_driver = { }; #endif /* CONFIG_PCI */ +static struct of_device_id ipmi_match[]; static int __devinit ipmi_probe(struct platform_device *dev) { #ifdef CONFIG_OF + const struct of_device_id *match; struct smi_info *info; struct resource resource; const __be32 *regsize, *regspacing, *regshift; @@ -2566,7 +2568,8 @@ static int __devinit ipmi_probe(struct platform_device *dev) dev_info(&dev->dev, "probing via device tree\n"); - if (!dev->dev.of_match) + match = of_match_device(ipmi_match, &dev->dev); + if (!match) return -EINVAL; ret = of_address_to_resource(np, 0, &resource); @@ -2601,7 +2604,7 @@ static int __devinit ipmi_probe(struct platform_device *dev) return -ENOMEM; } - info->si_type = (enum si_type) dev->dev.of_match->data; + info->si_type = (enum si_type) match->data; info->addr_source = SI_DEVICETREE; info->irq_setup = std_irq_setup; diff --git a/drivers/char/xilinx_hwicap/xilinx_hwicap.c b/drivers/char/xilinx_hwicap/xilinx_hwicap.c index d6412c1..39ccdea 100644 --- a/drivers/char/xilinx_hwicap/xilinx_hwicap.c +++ b/drivers/char/xilinx_hwicap/xilinx_hwicap.c @@ -715,13 +715,13 @@ static int __devexit hwicap_remove(struct device *dev) } #ifdef CONFIG_OF -static int __devinit hwicap_of_probe(struct platform_device *op) +static int __devinit hwicap_of_probe(struct platform_device *op, + const struct hwicap_driver_config *config) { struct resource res; const unsigned int *id; const char *family; int rc; - const struct hwicap_driver_config *config = op->dev.of_match->data; const struct config_registers *regs; @@ -751,20 +751,24 @@ static int __devinit hwicap_of_probe(struct platform_device *op) regs); } #else -static inline int hwicap_of_probe(struct platform_device *op) +static inline int hwicap_of_probe(struct platform_device *op, + const struct hwicap_driver_config *config) { return -EINVAL; } #endif /* CONFIG_OF */ +static const struct of_device_id __devinitconst hwicap_of_match[]; static int __devinit hwicap_drv_probe(struct platform_device *pdev) { + const struct of_device_id *match; struct resource *res; const struct config_registers *regs; const char *family; - if (pdev->dev.of_match) - return hwicap_of_probe(pdev); + match = of_match_device(hwicap_of_match, &pdev->dev); + if (match) + return hwicap_of_probe(pdev, match->data); res = platform_get_resource(pdev, IORESOURCE_MEM, 0); if (!res) diff --git a/drivers/edac/ppc4xx_edac.c b/drivers/edac/ppc4xx_edac.c index c1f0045..af8e7b1 100644 --- a/drivers/edac/ppc4xx_edac.c +++ b/drivers/edac/ppc4xx_edac.c @@ -1019,7 +1019,7 @@ ppc4xx_edac_mc_init(struct mem_ctl_info *mci, struct ppc4xx_edac_pdata *pdata = NULL; const struct device_node *np = op->dev.of_node; - if (op->dev.of_match == NULL) + if (of_match_device(ppc4xx_edac_match, &op->dev) == NULL) return -EINVAL; /* Initial driver pointers and private data */ diff --git a/drivers/i2c/busses/i2c-mpc.c b/drivers/i2c/busses/i2c-mpc.c index 75b984c..107397a 100644 --- a/drivers/i2c/busses/i2c-mpc.c +++ b/drivers/i2c/busses/i2c-mpc.c @@ -560,15 +560,18 @@ static struct i2c_adapter mpc_ops = { .timeout = HZ, }; +static const struct of_device_id mpc_i2c_of_match[]; static int __devinit fsl_i2c_probe(struct platform_device *op) { + const struct of_device_id *match; struct mpc_i2c *i2c; const u32 *prop; u32 clock = MPC_I2C_CLOCK_LEGACY; int result = 0; int plen; - if (!op->dev.of_match) + match = of_match_device(mpc_i2c_of_match, &op->dev); + if (!match) return -EINVAL; i2c = kzalloc(sizeof(*i2c), GFP_KERNEL); @@ -605,8 +608,8 @@ static int __devinit fsl_i2c_probe(struct platform_device *op) clock = *prop; } - if (op->dev.of_match->data) { - struct mpc_i2c_data *data = op->dev.of_match->data; + if (match->data) { + struct mpc_i2c_data *data = match->data; data->setup(op->dev.of_node, i2c, clock, data->prescaler); } else { /* Backwards compatibility */ diff --git a/drivers/mmc/host/sdhci-of-core.c b/drivers/mmc/host/sdhci-of-core.c index f9b611f..60e4186 100644 --- a/drivers/mmc/host/sdhci-of-core.c +++ b/drivers/mmc/host/sdhci-of-core.c @@ -124,8 +124,10 @@ static bool __devinit sdhci_of_wp_inverted(struct device_node *np) #endif } +static const struct of_device_id sdhci_of_match[]; static int __devinit sdhci_of_probe(struct platform_device *ofdev) { + const struct of_device_id *match; struct device_node *np = ofdev->dev.of_node; struct sdhci_of_data *sdhci_of_data; struct sdhci_host *host; @@ -134,9 +136,10 @@ static int __devinit sdhci_of_probe(struct platform_device *ofdev) int size; int ret; - if (!ofdev->dev.of_match) + match = of_match_device(sdhci_of_match, &ofdev->dev); + if (!match) return -EINVAL; - sdhci_of_data = ofdev->dev.of_match->data; + sdhci_of_data = match->data; if (!of_device_is_available(np)) return -ENODEV; diff --git a/drivers/mtd/maps/physmap_of.c b/drivers/mtd/maps/physmap_of.c index bd483f0..c1d3346 100644 --- a/drivers/mtd/maps/physmap_of.c +++ b/drivers/mtd/maps/physmap_of.c @@ -214,11 +214,13 @@ static void __devinit of_free_probes(const char **probes) } #endif +static struct of_device_id of_flash_match[]; static int __devinit of_flash_probe(struct platform_device *dev) { #ifdef CONFIG_MTD_PARTITIONS const char **part_probe_types; #endif + const struct of_device_id *match; struct device_node *dp = dev->dev.of_node; struct resource res; struct of_flash *info; @@ -232,9 +234,10 @@ static int __devinit of_flash_probe(struct platform_device *dev) struct mtd_info **mtd_list = NULL; resource_size_t res_size; - if (!dev->dev.of_match) + match = of_match_device(of_flash_match, &dev->dev); + if (!match) return -EINVAL; - probe_type = dev->dev.of_match->data; + probe_type = match->data; reg_tuple_size = (of_n_addr_cells(dp) + of_n_size_cells(dp)) * sizeof(u32); diff --git a/drivers/net/can/mscan/mpc5xxx_can.c b/drivers/net/can/mscan/mpc5xxx_can.c index bd1d811..5fedc33 100644 --- a/drivers/net/can/mscan/mpc5xxx_can.c +++ b/drivers/net/can/mscan/mpc5xxx_can.c @@ -247,8 +247,10 @@ static u32 __devinit mpc512x_can_get_clock(struct platform_device *ofdev, } #endif /* CONFIG_PPC_MPC512x */ +static struct of_device_id mpc5xxx_can_table[]; static int __devinit mpc5xxx_can_probe(struct platform_device *ofdev) { + const struct of_device_id *match; struct mpc5xxx_can_data *data; struct device_node *np = ofdev->dev.of_node; struct net_device *dev; @@ -258,9 +260,10 @@ static int __devinit mpc5xxx_can_probe(struct platform_device *ofdev) int irq, mscan_clksrc = 0; int err = -ENOMEM; - if (!ofdev->dev.of_match) + match = of_match_device(mpc5xxx_can_table, &ofdev->dev); + if (!match) return -EINVAL; - data = (struct mpc5xxx_can_data *)ofdev->dev.of_match->data; + data = match->data; base = of_iomap(np, 0); if (!base) { diff --git a/drivers/net/fs_enet/fs_enet-main.c b/drivers/net/fs_enet/fs_enet-main.c index 24cb953..5131e61 100644 --- a/drivers/net/fs_enet/fs_enet-main.c +++ b/drivers/net/fs_enet/fs_enet-main.c @@ -998,8 +998,10 @@ static const struct net_device_ops fs_enet_netdev_ops = { #endif }; +static struct of_device_id fs_enet_match[]; static int __devinit fs_enet_probe(struct platform_device *ofdev) { + const struct of_device_id *match; struct net_device *ndev; struct fs_enet_private *fep; struct fs_platform_info *fpi; @@ -1007,14 +1009,15 @@ static int __devinit fs_enet_probe(struct platform_device *ofdev) const u8 *mac_addr; int privsize, len, ret = -ENODEV; - if (!ofdev->dev.of_match) + match = of_match_device(fs_enet_match, &ofdev->dev); + if (!match) return -EINVAL; fpi = kzalloc(sizeof(*fpi), GFP_KERNEL); if (!fpi) return -ENOMEM; - if (!IS_FEC(ofdev->dev.of_match)) { + if (!IS_FEC(match)) { data = of_get_property(ofdev->dev.of_node, "fsl,cpm-command", &len); if (!data || len != 4) goto out_free_fpi; @@ -1049,7 +1052,7 @@ static int __devinit fs_enet_probe(struct platform_device *ofdev) fep->dev = &ofdev->dev; fep->ndev = ndev; fep->fpi = fpi; - fep->ops = ofdev->dev.of_match->data; + fep->ops = match->data; ret = fep->ops->setup_data(ndev); if (ret) diff --git a/drivers/net/fs_enet/mii-fec.c b/drivers/net/fs_enet/mii-fec.c index 7e840d3..6a2e150 100644 --- a/drivers/net/fs_enet/mii-fec.c +++ b/drivers/net/fs_enet/mii-fec.c @@ -101,17 +101,20 @@ static int fs_enet_fec_mii_reset(struct mii_bus *bus) return 0; } +static struct of_device_id fs_enet_mdio_fec_match[]; static int __devinit fs_enet_mdio_probe(struct platform_device *ofdev) { + const struct of_device_id *match; struct resource res; struct mii_bus *new_bus; struct fec_info *fec; int (*get_bus_freq)(struct device_node *); int ret = -ENOMEM, clock, speed; - if (!ofdev->dev.of_match) + match = of_match_device(fs_enet_mdio_fec_match, &ofdev->dev); + if (!match) return -EINVAL; - get_bus_freq = ofdev->dev.of_match->data; + get_bus_freq = match->data; new_bus = mdiobus_alloc(); if (!new_bus) diff --git a/drivers/net/sunhme.c b/drivers/net/sunhme.c index eb4f59f..bff2f79 100644 --- a/drivers/net/sunhme.c +++ b/drivers/net/sunhme.c @@ -3237,15 +3237,18 @@ static void happy_meal_pci_exit(void) #endif #ifdef CONFIG_SBUS +static const struct of_device_id hme_sbus_match[]; static int __devinit hme_sbus_probe(struct platform_device *op) { + const struct of_device_id *match; struct device_node *dp = op->dev.of_node; const char *model = of_get_property(dp, "model", NULL); int is_qfe; - if (!op->dev.of_match) + match = of_match_device(hme_sbus_match, &op->dev); + if (!match) return -EINVAL; - is_qfe = (op->dev.of_match->data != NULL); + is_qfe = (match->data != NULL); if (!is_qfe && model && !strcmp(model, "SUNW,sbus-qfe")) is_qfe = 1; diff --git a/drivers/scsi/qlogicpti.c b/drivers/scsi/qlogicpti.c index e2d45c9..9689d41c 100644 --- a/drivers/scsi/qlogicpti.c +++ b/drivers/scsi/qlogicpti.c @@ -1292,8 +1292,10 @@ static struct scsi_host_template qpti_template = { .use_clustering = ENABLE_CLUSTERING, }; +static const struct of_device_id qpti_match[]; static int __devinit qpti_sbus_probe(struct platform_device *op) { + const struct of_device_id *match; struct scsi_host_template *tpnt; struct device_node *dp = op->dev.of_node; struct Scsi_Host *host; @@ -1301,9 +1303,10 @@ static int __devinit qpti_sbus_probe(struct platform_device *op) static int nqptis; const char *fcode; - if (!op->dev.of_match) + match = of_match_device(qpti_match, &op->dev); + if (!match) return -EINVAL; - tpnt = op->dev.of_match->data; + tpnt = match->data; /* Sometimes Antares cards come up not completely * setup, and we get a report of a zero IRQ. diff --git a/drivers/tty/serial/of_serial.c b/drivers/tty/serial/of_serial.c index 0e8eec5..c911b24 100644 --- a/drivers/tty/serial/of_serial.c +++ b/drivers/tty/serial/of_serial.c @@ -80,14 +80,17 @@ static int __devinit of_platform_serial_setup(struct platform_device *ofdev, /* * Try to register a serial port */ +static struct of_device_id of_platform_serial_table[]; static int __devinit of_platform_serial_probe(struct platform_device *ofdev) { + const struct of_device_id *match; struct of_serial_info *info; struct uart_port port; int port_type; int ret; - if (!ofdev->dev.of_match) + match = of_match_device(of_platform_serial_table, &ofdev->dev); + if (!match) return -EINVAL; if (of_find_property(ofdev->dev.of_node, "used-by-rtas", NULL)) @@ -97,7 +100,7 @@ static int __devinit of_platform_serial_probe(struct platform_device *ofdev) if (info == NULL) return -ENOMEM; - port_type = (unsigned long)ofdev->dev.of_match->data; + port_type = (unsigned long)match->data; ret = of_platform_serial_setup(ofdev, port_type, &port); if (ret) goto out; diff --git a/drivers/usb/gadget/fsl_qe_udc.c b/drivers/usb/gadget/fsl_qe_udc.c index 36613b3..3a68e09 100644 --- a/drivers/usb/gadget/fsl_qe_udc.c +++ b/drivers/usb/gadget/fsl_qe_udc.c @@ -2539,15 +2539,18 @@ static void qe_udc_release(struct device *dev) } /* Driver probe functions */ +static const struct of_device_id qe_udc_match[]; static int __devinit qe_udc_probe(struct platform_device *ofdev) { + const struct of_device_id *match; struct device_node *np = ofdev->dev.of_node; struct qe_ep *ep; unsigned int ret = 0; unsigned int i; const void *prop; - if (!ofdev->dev.of_match) + match = of_match_device(qe_udc_match, &ofdev->dev); + if (!match) return -EINVAL; prop = of_get_property(np, "mode", NULL); @@ -2561,7 +2564,7 @@ static int __devinit qe_udc_probe(struct platform_device *ofdev) return -ENOMEM; } - udc_controller->soc_type = (unsigned long)ofdev->dev.of_match->data; + udc_controller->soc_type = (unsigned long)match->data; udc_controller->usb_regs = of_iomap(np, 0); if (!udc_controller->usb_regs) { ret = -ENOMEM; diff --git a/drivers/watchdog/mpc8xxx_wdt.c b/drivers/watchdog/mpc8xxx_wdt.c index 528bceb..eed5436f 100644 --- a/drivers/watchdog/mpc8xxx_wdt.c +++ b/drivers/watchdog/mpc8xxx_wdt.c @@ -185,17 +185,20 @@ static struct miscdevice mpc8xxx_wdt_miscdev = { .fops = &mpc8xxx_wdt_fops, }; +static const struct of_device_id mpc8xxx_wdt_match[]; static int __devinit mpc8xxx_wdt_probe(struct platform_device *ofdev) { int ret; + const struct of_device_id *match; struct device_node *np = ofdev->dev.of_node; struct mpc8xxx_wdt_type *wdt_type; u32 freq = fsl_get_sys_freq(); bool enabled; - if (!ofdev->dev.of_match) + match = of_match_device(mpc8xxx_wdt_match, &ofdev->dev); + if (!match) return -EINVAL; - wdt_type = ofdev->dev.of_match->data; + wdt_type = match->data; if (!freq || freq == -1) return -EINVAL; diff --git a/include/linux/device.h b/include/linux/device.h index ab8dfc0..d08399d 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -442,7 +442,6 @@ struct device { struct dev_archdata archdata; struct device_node *of_node; /* associated device tree node */ - const struct of_device_id *of_match; /* matching of_device_id from driver */ dev_t devt; /* dev_t, creates the sysfs "dev" */ diff --git a/include/linux/of_device.h b/include/linux/of_device.h index b33d688..ae56384 100644 --- a/include/linux/of_device.h +++ b/include/linux/of_device.h @@ -21,12 +21,7 @@ extern void of_device_make_bus_id(struct device *dev); static inline int of_driver_match_device(struct device *dev, const struct device_driver *drv) { - const struct of_device_id *match; - - match = of_match_device(drv->of_match_table, dev); - if (match) - dev->of_match = match; - return match != NULL; + return of_match_device(drv->of_match_table, dev) != NULL; } extern struct platform_device *of_dev_get(struct platform_device *dev); @@ -62,6 +57,11 @@ static inline int of_device_uevent(struct device *dev, static inline void of_device_node_put(struct device *dev) { } +static inline const struct of_device_id *of_match_device( + const struct of_device_id *matches, const struct device *dev) +{ + return NULL; +} #endif /* CONFIG_OF_DEVICE */ #endif /* _LINUX_OF_DEVICE_H */ -- cgit v1.1 From b313207286a78abac19f1dd2721292eae598b0f5 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 18 May 2011 21:00:44 +0200 Subject: perf bench, x86: Add alternatives-asm.h wrapper perf bench needs this to build the kernel's memcpy routine: In file included from bench/mem-memcpy-x86-64-asm.S:2:0: bench/../../../arch/x86/lib/memcpy_64.S:7:33: fatal error: asm/alternative-asm.h: No such file or directory Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: Mike Galbraith Cc: Steven Rostedt Link: http://lkml.kernel.org/n/tip-c5d41xibgullk8h2280q4gv0@git.kernel.org Signed-off-by: Ingo Molnar --- tools/perf/util/include/asm/alternative-asm.h | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 tools/perf/util/include/asm/alternative-asm.h diff --git a/tools/perf/util/include/asm/alternative-asm.h b/tools/perf/util/include/asm/alternative-asm.h new file mode 100644 index 0000000..6789d788 --- /dev/null +++ b/tools/perf/util/include/asm/alternative-asm.h @@ -0,0 +1,8 @@ +#ifndef _PERF_ASM_ALTERNATIVE_ASM_H +#define _PERF_ASM_ALTERNATIVE_ASM_H + +/* Just disable it so we can build arch/x86/lib/memcpy_64.S for perf bench: */ + +#define altinstruction_entry # + +#endif -- cgit v1.1 From b448c4e3ae6d20108dba1d7833f2c0d3dbad87ce Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 29 Apr 2011 15:12:32 -0400 Subject: ftrace: Replace FTRACE_FL_NOTRACE flag with a hash of ignored functions To prepare for the accounting system that will allow multiple users of the function tracer, having the FTRACE_FL_NOTRACE as a flag in the dyn_trace record does not make sense. All ftrace_ops will soon have a hash of functions they should trace and not trace. By making a global hash of functions not to trace makes this easier for the transition. Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 1 - kernel/trace/ftrace.c | 176 +++++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 150 insertions(+), 27 deletions(-) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 3204744..fe0a90a 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -149,7 +149,6 @@ enum { FTRACE_FL_FREE = (1 << 0), FTRACE_FL_FILTER = (1 << 1), FTRACE_FL_ENABLED = (1 << 2), - FTRACE_FL_NOTRACE = (1 << 3), }; struct dyn_ftrace { diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index d340634..04c002a 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -57,6 +57,7 @@ /* hash bits for specific function selection */ #define FTRACE_HASH_BITS 7 #define FTRACE_FUNC_HASHSIZE (1 << FTRACE_HASH_BITS) +#define FTRACE_HASH_MAX_BITS 10 /* ftrace_enabled is a method to turn ftrace on or off */ int ftrace_enabled __read_mostly; @@ -865,6 +866,22 @@ enum { FTRACE_START_FUNC_RET = (1 << 3), FTRACE_STOP_FUNC_RET = (1 << 4), }; +struct ftrace_func_entry { + struct hlist_node hlist; + unsigned long ip; +}; + +struct ftrace_hash { + unsigned long size_bits; + struct hlist_head *buckets; + unsigned long count; +}; + +static struct hlist_head notrace_buckets[1 << FTRACE_HASH_MAX_BITS]; +static struct ftrace_hash notrace_hash = { + .size_bits = FTRACE_HASH_MAX_BITS, + .buckets = notrace_buckets, +}; static int ftrace_filtered; @@ -889,6 +906,79 @@ static struct ftrace_page *ftrace_pages; static struct dyn_ftrace *ftrace_free_records; +static struct ftrace_func_entry * +ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip) +{ + unsigned long key; + struct ftrace_func_entry *entry; + struct hlist_head *hhd; + struct hlist_node *n; + + if (!hash->count) + return NULL; + + if (hash->size_bits > 0) + key = hash_long(ip, hash->size_bits); + else + key = 0; + + hhd = &hash->buckets[key]; + + hlist_for_each_entry_rcu(entry, n, hhd, hlist) { + if (entry->ip == ip) + return entry; + } + return NULL; +} + +static int add_hash_entry(struct ftrace_hash *hash, unsigned long ip) +{ + struct ftrace_func_entry *entry; + struct hlist_head *hhd; + unsigned long key; + + entry = kmalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + return -ENOMEM; + + if (hash->size_bits) + key = hash_long(ip, hash->size_bits); + else + key = 0; + + entry->ip = ip; + hhd = &hash->buckets[key]; + hlist_add_head(&entry->hlist, hhd); + hash->count++; + + return 0; +} + +static void +remove_hash_entry(struct ftrace_hash *hash, + struct ftrace_func_entry *entry) +{ + hlist_del(&entry->hlist); + kfree(entry); + hash->count--; +} + +static void ftrace_hash_clear(struct ftrace_hash *hash) +{ + struct hlist_head *hhd; + struct hlist_node *tp, *tn; + struct ftrace_func_entry *entry; + int size = 1 << hash->size_bits; + int i; + + for (i = 0; i < size; i++) { + hhd = &hash->buckets[i]; + hlist_for_each_entry_safe(entry, tp, tn, hhd, hlist) + remove_hash_entry(hash, entry); + } + FTRACE_WARN_ON(hash->count); +} + /* * This is a double for. Do not use 'break' to break out of the loop, * you must use a goto. @@ -1032,7 +1122,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) * If we want to enable it and filtering is on, enable it only if * it's filtered */ - if (enable && !(rec->flags & FTRACE_FL_NOTRACE)) { + if (enable && !ftrace_lookup_ip(¬race_hash, rec->ip)) { if (!ftrace_filtered || (rec->flags & FTRACE_FL_FILTER)) flag = FTRACE_FL_ENABLED; } @@ -1465,7 +1555,7 @@ t_next(struct seq_file *m, void *v, loff_t *pos) !(rec->flags & FTRACE_FL_FILTER)) || ((iter->flags & FTRACE_ITER_NOTRACE) && - !(rec->flags & FTRACE_FL_NOTRACE))) { + !ftrace_lookup_ip(¬race_hash, rec->ip))) { rec = NULL; goto retry; } @@ -1609,14 +1699,15 @@ static void ftrace_filter_reset(int enable) { struct ftrace_page *pg; struct dyn_ftrace *rec; - unsigned long type = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; mutex_lock(&ftrace_lock); - if (enable) + if (enable) { ftrace_filtered = 0; - do_for_each_ftrace_rec(pg, rec) { - rec->flags &= ~type; - } while_for_each_ftrace_rec(); + do_for_each_ftrace_rec(pg, rec) { + rec->flags &= ~FTRACE_FL_FILTER; + } while_for_each_ftrace_rec(); + } else + ftrace_hash_clear(¬race_hash); mutex_unlock(&ftrace_lock); } @@ -1716,13 +1807,36 @@ static int ftrace_match(char *str, char *regex, int len, int type) return matched; } -static void -update_record(struct dyn_ftrace *rec, unsigned long flag, int not) +static int +update_record(struct dyn_ftrace *rec, int enable, int not) { - if (not) - rec->flags &= ~flag; - else - rec->flags |= flag; + struct ftrace_func_entry *entry; + struct ftrace_hash *hash = ¬race_hash; + int ret = 0; + + if (enable) { + if (not) + rec->flags &= ~FTRACE_FL_FILTER; + else + rec->flags |= FTRACE_FL_FILTER; + } else { + if (not) { + /* Do nothing if it doesn't exist */ + entry = ftrace_lookup_ip(hash, rec->ip); + if (!entry) + return 0; + + remove_hash_entry(hash, entry); + } else { + /* Do nothing if it exists */ + entry = ftrace_lookup_ip(hash, rec->ip); + if (entry) + return 0; + + ret = add_hash_entry(hash, rec->ip); + } + } + return ret; } static int @@ -1754,16 +1868,14 @@ static int match_records(char *buff, int len, char *mod, int enable, int not) struct dyn_ftrace *rec; int type = MATCH_FULL; char *search = buff; - unsigned long flag; int found = 0; + int ret; if (len) { type = filter_parse_regex(buff, len, &search, ¬); search_len = strlen(search); } - flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; - mutex_lock(&ftrace_lock); if (unlikely(ftrace_disabled)) @@ -1772,7 +1884,11 @@ static int match_records(char *buff, int len, char *mod, int enable, int not) do_for_each_ftrace_rec(pg, rec) { if (ftrace_match_record(rec, mod, search, search_len, type)) { - update_record(rec, flag, not); + ret = update_record(rec, enable, not); + if (ret < 0) { + found = ret; + goto out_unlock; + } found = 1; } /* @@ -1821,6 +1937,7 @@ static int ftrace_mod_callback(char *func, char *cmd, char *param, int enable) { char *mod; + int ret = -EINVAL; /* * cmd == 'mod' because we only registered this func @@ -1832,15 +1949,19 @@ ftrace_mod_callback(char *func, char *cmd, char *param, int enable) /* we must have a module name */ if (!param) - return -EINVAL; + return ret; mod = strsep(¶m, ":"); if (!strlen(mod)) - return -EINVAL; + return ret; - if (ftrace_match_module_records(func, mod, enable)) - return 0; - return -EINVAL; + ret = ftrace_match_module_records(func, mod, enable); + if (!ret) + ret = -EINVAL; + if (ret < 0) + return ret; + + return 0; } static struct ftrace_func_command ftrace_mod_cmd = { @@ -2132,14 +2253,17 @@ static int ftrace_process_regex(char *buff, int len, int enable) { char *func, *command, *next = buff; struct ftrace_func_command *p; - int ret = -EINVAL; + int ret; func = strsep(&next, ":"); if (!next) { - if (ftrace_match_records(func, len, enable)) - return 0; - return ret; + ret = ftrace_match_records(func, len, enable); + if (!ret) + ret = -EINVAL; + if (ret < 0) + return ret; + return 0; } /* command found */ -- cgit v1.1 From 1cf41dd79993389b012e4542ab502ce36ae7343f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 29 Apr 2011 20:59:51 -0400 Subject: ftrace: Use hash instead for FTRACE_FL_FILTER When multiple users are allowed to have their own set of functions to trace, having the FTRACE_FL_FILTER flag will not be enough to handle the accounting of those users. Each user will need their own set of functions. Replace the FTRACE_FL_FILTER with a filter_hash instead. This is temporary until the rest of the function filtering accounting gets in. Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 3 +- kernel/trace/ftrace.c | 151 ++++++++++++++++++++++--------------------------- 2 files changed, 70 insertions(+), 84 deletions(-) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index fe0a90a..52fc5d4 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -147,8 +147,7 @@ extern int ftrace_text_reserved(void *start, void *end); enum { FTRACE_FL_FREE = (1 << 0), - FTRACE_FL_FILTER = (1 << 1), - FTRACE_FL_ENABLED = (1 << 2), + FTRACE_FL_ENABLED = (1 << 1), }; struct dyn_ftrace { diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 04c002a..222eca4 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -883,7 +883,11 @@ static struct ftrace_hash notrace_hash = { .buckets = notrace_buckets, }; -static int ftrace_filtered; +static struct hlist_head filter_buckets[1 << FTRACE_HASH_MAX_BITS]; +static struct ftrace_hash filter_hash = { + .size_bits = FTRACE_HASH_MAX_BITS, + .buckets = filter_buckets, +}; static struct dyn_ftrace *ftrace_new_addrs; @@ -1123,7 +1127,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) * it's filtered */ if (enable && !ftrace_lookup_ip(¬race_hash, rec->ip)) { - if (!ftrace_filtered || (rec->flags & FTRACE_FL_FILTER)) + if (!filter_hash.count || ftrace_lookup_ip(&filter_hash, rec->ip)) flag = FTRACE_FL_ENABLED; } @@ -1430,6 +1434,7 @@ struct ftrace_iterator { struct dyn_ftrace *func; struct ftrace_func_probe *probe; struct trace_parser parser; + struct ftrace_hash *hash; int hidx; int idx; unsigned flags; @@ -1552,7 +1557,7 @@ t_next(struct seq_file *m, void *v, loff_t *pos) if ((rec->flags & FTRACE_FL_FREE) || ((iter->flags & FTRACE_ITER_FILTER) && - !(rec->flags & FTRACE_FL_FILTER)) || + !(ftrace_lookup_ip(&filter_hash, rec->ip))) || ((iter->flags & FTRACE_ITER_NOTRACE) && !ftrace_lookup_ip(¬race_hash, rec->ip))) { @@ -1598,7 +1603,7 @@ static void *t_start(struct seq_file *m, loff_t *pos) * off, we can short cut and just print out that all * functions are enabled. */ - if (iter->flags & FTRACE_ITER_FILTER && !ftrace_filtered) { + if (iter->flags & FTRACE_ITER_FILTER && !filter_hash.count) { if (*pos > 0) return t_hash_start(m, pos); iter->flags |= FTRACE_ITER_PRINTALL; @@ -1695,24 +1700,16 @@ ftrace_avail_open(struct inode *inode, struct file *file) return ret; } -static void ftrace_filter_reset(int enable) +static void ftrace_filter_reset(struct ftrace_hash *hash) { - struct ftrace_page *pg; - struct dyn_ftrace *rec; - mutex_lock(&ftrace_lock); - if (enable) { - ftrace_filtered = 0; - do_for_each_ftrace_rec(pg, rec) { - rec->flags &= ~FTRACE_FL_FILTER; - } while_for_each_ftrace_rec(); - } else - ftrace_hash_clear(¬race_hash); + ftrace_hash_clear(hash); mutex_unlock(&ftrace_lock); } static int -ftrace_regex_open(struct inode *inode, struct file *file, int enable) +ftrace_regex_open(struct ftrace_hash *hash, int flag, + struct inode *inode, struct file *file) { struct ftrace_iterator *iter; int ret = 0; @@ -1729,15 +1726,16 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable) return -ENOMEM; } + iter->hash = hash; + mutex_lock(&ftrace_regex_lock); if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) - ftrace_filter_reset(enable); + ftrace_filter_reset(hash); if (file->f_mode & FMODE_READ) { iter->pg = ftrace_pages_start; - iter->flags = enable ? FTRACE_ITER_FILTER : - FTRACE_ITER_NOTRACE; + iter->flags = flag; ret = seq_open(file, &show_ftrace_seq_ops); if (!ret) { @@ -1757,13 +1755,15 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable) static int ftrace_filter_open(struct inode *inode, struct file *file) { - return ftrace_regex_open(inode, file, 1); + return ftrace_regex_open(&filter_hash, FTRACE_ITER_FILTER, + inode, file); } static int ftrace_notrace_open(struct inode *inode, struct file *file) { - return ftrace_regex_open(inode, file, 0); + return ftrace_regex_open(¬race_hash, FTRACE_ITER_NOTRACE, + inode, file); } static loff_t @@ -1808,33 +1808,24 @@ static int ftrace_match(char *str, char *regex, int len, int type) } static int -update_record(struct dyn_ftrace *rec, int enable, int not) +enter_record(struct ftrace_hash *hash, struct dyn_ftrace *rec, int not) { struct ftrace_func_entry *entry; - struct ftrace_hash *hash = ¬race_hash; int ret = 0; - if (enable) { - if (not) - rec->flags &= ~FTRACE_FL_FILTER; - else - rec->flags |= FTRACE_FL_FILTER; - } else { - if (not) { - /* Do nothing if it doesn't exist */ - entry = ftrace_lookup_ip(hash, rec->ip); - if (!entry) - return 0; + entry = ftrace_lookup_ip(hash, rec->ip); + if (not) { + /* Do nothing if it doesn't exist */ + if (!entry) + return 0; - remove_hash_entry(hash, entry); - } else { - /* Do nothing if it exists */ - entry = ftrace_lookup_ip(hash, rec->ip); - if (entry) - return 0; + remove_hash_entry(hash, entry); + } else { + /* Do nothing if it exists */ + if (entry) + return 0; - ret = add_hash_entry(hash, rec->ip); - } + ret = add_hash_entry(hash, rec->ip); } return ret; } @@ -1861,7 +1852,9 @@ ftrace_match_record(struct dyn_ftrace *rec, char *mod, return ftrace_match(str, regex, len, type); } -static int match_records(char *buff, int len, char *mod, int enable, int not) +static int +match_records(struct ftrace_hash *hash, char *buff, + int len, char *mod, int not) { unsigned search_len = 0; struct ftrace_page *pg; @@ -1884,20 +1877,13 @@ static int match_records(char *buff, int len, char *mod, int enable, int not) do_for_each_ftrace_rec(pg, rec) { if (ftrace_match_record(rec, mod, search, search_len, type)) { - ret = update_record(rec, enable, not); + ret = enter_record(hash, rec, not); if (ret < 0) { found = ret; goto out_unlock; } found = 1; } - /* - * Only enable filtering if we have a function that - * is filtered on. - */ - if (enable && (rec->flags & FTRACE_FL_FILTER)) - ftrace_filtered = 1; - } while_for_each_ftrace_rec(); out_unlock: mutex_unlock(&ftrace_lock); @@ -1906,12 +1892,13 @@ static int match_records(char *buff, int len, char *mod, int enable, int not) } static int -ftrace_match_records(char *buff, int len, int enable) +ftrace_match_records(struct ftrace_hash *hash, char *buff, int len) { - return match_records(buff, len, NULL, enable, 0); + return match_records(hash, buff, len, NULL, 0); } -static int ftrace_match_module_records(char *buff, char *mod, int enable) +static int +ftrace_match_module_records(struct ftrace_hash *hash, char *buff, char *mod) { int not = 0; @@ -1925,7 +1912,7 @@ static int ftrace_match_module_records(char *buff, char *mod, int enable) not = 1; } - return match_records(buff, strlen(buff), mod, enable, not); + return match_records(hash, buff, strlen(buff), mod, not); } /* @@ -1936,6 +1923,7 @@ static int ftrace_match_module_records(char *buff, char *mod, int enable) static int ftrace_mod_callback(char *func, char *cmd, char *param, int enable) { + struct ftrace_hash *hash; char *mod; int ret = -EINVAL; @@ -1955,7 +1943,12 @@ ftrace_mod_callback(char *func, char *cmd, char *param, int enable) if (!strlen(mod)) return ret; - ret = ftrace_match_module_records(func, mod, enable); + if (enable) + hash = &filter_hash; + else + hash = ¬race_hash; + + ret = ftrace_match_module_records(hash, func, mod); if (!ret) ret = -EINVAL; if (ret < 0) @@ -2253,12 +2246,18 @@ static int ftrace_process_regex(char *buff, int len, int enable) { char *func, *command, *next = buff; struct ftrace_func_command *p; + struct ftrace_hash *hash; int ret; + if (enable) + hash = &filter_hash; + else + hash = ¬race_hash; + func = strsep(&next, ":"); if (!next) { - ret = ftrace_match_records(func, len, enable); + ret = ftrace_match_records(hash, func, len); if (!ret) ret = -EINVAL; if (ret < 0) @@ -2340,16 +2339,16 @@ ftrace_notrace_write(struct file *file, const char __user *ubuf, } static void -ftrace_set_regex(unsigned char *buf, int len, int reset, int enable) +ftrace_set_regex(struct ftrace_hash *hash, unsigned char *buf, int len, int reset) { if (unlikely(ftrace_disabled)) return; mutex_lock(&ftrace_regex_lock); if (reset) - ftrace_filter_reset(enable); + ftrace_filter_reset(hash); if (buf) - ftrace_match_records(buf, len, enable); + ftrace_match_records(hash, buf, len); mutex_unlock(&ftrace_regex_lock); } @@ -2364,7 +2363,7 @@ ftrace_set_regex(unsigned char *buf, int len, int reset, int enable) */ void ftrace_set_filter(unsigned char *buf, int len, int reset) { - ftrace_set_regex(buf, len, reset, 1); + ftrace_set_regex(&filter_hash, buf, len, reset); } /** @@ -2379,7 +2378,7 @@ void ftrace_set_filter(unsigned char *buf, int len, int reset) */ void ftrace_set_notrace(unsigned char *buf, int len, int reset) { - ftrace_set_regex(buf, len, reset, 0); + ftrace_set_regex(¬race_hash, buf, len, reset); } /* @@ -2431,22 +2430,22 @@ static void __init set_ftrace_early_graph(char *buf) } #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ -static void __init set_ftrace_early_filter(char *buf, int enable) +static void __init set_ftrace_early_filter(struct ftrace_hash *hash, char *buf) { char *func; while (buf) { func = strsep(&buf, ","); - ftrace_set_regex(func, strlen(func), 0, enable); + ftrace_set_regex(hash, func, strlen(func), 0); } } static void __init set_ftrace_early_filters(void) { if (ftrace_filter_buf[0]) - set_ftrace_early_filter(ftrace_filter_buf, 1); + set_ftrace_early_filter(&filter_hash, ftrace_filter_buf); if (ftrace_notrace_buf[0]) - set_ftrace_early_filter(ftrace_notrace_buf, 0); + set_ftrace_early_filter(¬race_hash, ftrace_notrace_buf); #ifdef CONFIG_FUNCTION_GRAPH_TRACER if (ftrace_graph_buf[0]) set_ftrace_early_graph(ftrace_graph_buf); @@ -2454,7 +2453,7 @@ static void __init set_ftrace_early_filters(void) } static int -ftrace_regex_release(struct inode *inode, struct file *file, int enable) +ftrace_regex_release(struct inode *inode, struct file *file) { struct seq_file *m = (struct seq_file *)file->private_data; struct ftrace_iterator *iter; @@ -2471,7 +2470,7 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable) parser = &iter->parser; if (trace_parser_loaded(parser)) { parser->buffer[parser->idx] = 0; - ftrace_match_records(parser->buffer, parser->idx, enable); + ftrace_match_records(iter->hash, parser->buffer, parser->idx); } trace_parser_put(parser); @@ -2488,18 +2487,6 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable) return 0; } -static int -ftrace_filter_release(struct inode *inode, struct file *file) -{ - return ftrace_regex_release(inode, file, 1); -} - -static int -ftrace_notrace_release(struct inode *inode, struct file *file) -{ - return ftrace_regex_release(inode, file, 0); -} - static const struct file_operations ftrace_avail_fops = { .open = ftrace_avail_open, .read = seq_read, @@ -2512,7 +2499,7 @@ static const struct file_operations ftrace_filter_fops = { .read = seq_read, .write = ftrace_filter_write, .llseek = ftrace_regex_lseek, - .release = ftrace_filter_release, + .release = ftrace_regex_release, }; static const struct file_operations ftrace_notrace_fops = { @@ -2520,7 +2507,7 @@ static const struct file_operations ftrace_notrace_fops = { .read = seq_read, .write = ftrace_notrace_write, .llseek = ftrace_regex_lseek, - .release = ftrace_notrace_release, + .release = ftrace_regex_release, }; #ifdef CONFIG_FUNCTION_GRAPH_TRACER -- cgit v1.1 From f45948e898e7bc76a73a468796d2ce80dd040058 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 2 May 2011 12:29:25 -0400 Subject: ftrace: Create a global_ops to hold the filter and notrace hashes Combine the filter and notrace hashes to be accessed by a single entity, the global_ops. The global_ops is a ftrace_ops structure that is passed to different functions that can read or modify the filtering of the function tracer. The ftrace_ops structure was modified to hold a filter and notrace hashes so that later patches may allow each ftrace_ops to have its own set of rules to what functions may be filtered. Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 10 ++++++-- kernel/trace/ftrace.c | 65 +++++++++++++++++++++++++++++++++++--------------- 2 files changed, 54 insertions(+), 21 deletions(-) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 52fc5d4..6658a51 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -29,9 +29,15 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, typedef void (*ftrace_func_t)(unsigned long ip, unsigned long parent_ip); +struct ftrace_hash; + struct ftrace_ops { - ftrace_func_t func; - struct ftrace_ops *next; + ftrace_func_t func; + struct ftrace_ops *next; +#ifdef CONFIG_DYNAMIC_FTRACE + struct ftrace_hash *notrace_hash; + struct ftrace_hash *filter_hash; +#endif }; extern int function_trace_stop; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 222eca4..a517a6c 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -889,6 +889,12 @@ static struct ftrace_hash filter_hash = { .buckets = filter_buckets, }; +struct ftrace_ops global_ops = { + .func = ftrace_stub, + .notrace_hash = ¬race_hash, + .filter_hash = &filter_hash, +}; + static struct dyn_ftrace *ftrace_new_addrs; static DEFINE_MUTEX(ftrace_regex_lock); @@ -1112,6 +1118,7 @@ int ftrace_text_reserved(void *start, void *end) static int __ftrace_replace_code(struct dyn_ftrace *rec, int enable) { + struct ftrace_ops *ops = &global_ops; unsigned long ftrace_addr; unsigned long flag = 0UL; @@ -1126,8 +1133,9 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) * If we want to enable it and filtering is on, enable it only if * it's filtered */ - if (enable && !ftrace_lookup_ip(¬race_hash, rec->ip)) { - if (!filter_hash.count || ftrace_lookup_ip(&filter_hash, rec->ip)) + if (enable && !ftrace_lookup_ip(ops->notrace_hash, rec->ip)) { + if (!ops->filter_hash->count || + ftrace_lookup_ip(ops->filter_hash, rec->ip)) flag = FTRACE_FL_ENABLED; } @@ -1531,6 +1539,7 @@ static void * t_next(struct seq_file *m, void *v, loff_t *pos) { struct ftrace_iterator *iter = m->private; + struct ftrace_ops *ops = &global_ops; struct dyn_ftrace *rec = NULL; if (unlikely(ftrace_disabled)) @@ -1557,10 +1566,10 @@ t_next(struct seq_file *m, void *v, loff_t *pos) if ((rec->flags & FTRACE_FL_FREE) || ((iter->flags & FTRACE_ITER_FILTER) && - !(ftrace_lookup_ip(&filter_hash, rec->ip))) || + !(ftrace_lookup_ip(ops->filter_hash, rec->ip))) || ((iter->flags & FTRACE_ITER_NOTRACE) && - !ftrace_lookup_ip(¬race_hash, rec->ip))) { + !ftrace_lookup_ip(ops->notrace_hash, rec->ip))) { rec = NULL; goto retry; } @@ -1584,6 +1593,7 @@ static void reset_iter_read(struct ftrace_iterator *iter) static void *t_start(struct seq_file *m, loff_t *pos) { struct ftrace_iterator *iter = m->private; + struct ftrace_ops *ops = &global_ops; void *p = NULL; loff_t l; @@ -1603,7 +1613,7 @@ static void *t_start(struct seq_file *m, loff_t *pos) * off, we can short cut and just print out that all * functions are enabled. */ - if (iter->flags & FTRACE_ITER_FILTER && !filter_hash.count) { + if (iter->flags & FTRACE_ITER_FILTER && !ops->filter_hash->count) { if (*pos > 0) return t_hash_start(m, pos); iter->flags |= FTRACE_ITER_PRINTALL; @@ -1708,10 +1718,11 @@ static void ftrace_filter_reset(struct ftrace_hash *hash) } static int -ftrace_regex_open(struct ftrace_hash *hash, int flag, +ftrace_regex_open(struct ftrace_ops *ops, int flag, struct inode *inode, struct file *file) { struct ftrace_iterator *iter; + struct ftrace_hash *hash; int ret = 0; if (unlikely(ftrace_disabled)) @@ -1726,6 +1737,11 @@ ftrace_regex_open(struct ftrace_hash *hash, int flag, return -ENOMEM; } + if (flag & FTRACE_ITER_NOTRACE) + hash = ops->notrace_hash; + else + hash = ops->filter_hash; + iter->hash = hash; mutex_lock(&ftrace_regex_lock); @@ -1755,14 +1771,14 @@ ftrace_regex_open(struct ftrace_hash *hash, int flag, static int ftrace_filter_open(struct inode *inode, struct file *file) { - return ftrace_regex_open(&filter_hash, FTRACE_ITER_FILTER, + return ftrace_regex_open(&global_ops, FTRACE_ITER_FILTER, inode, file); } static int ftrace_notrace_open(struct inode *inode, struct file *file) { - return ftrace_regex_open(¬race_hash, FTRACE_ITER_NOTRACE, + return ftrace_regex_open(&global_ops, FTRACE_ITER_NOTRACE, inode, file); } @@ -1923,6 +1939,7 @@ ftrace_match_module_records(struct ftrace_hash *hash, char *buff, char *mod) static int ftrace_mod_callback(char *func, char *cmd, char *param, int enable) { + struct ftrace_ops *ops = &global_ops; struct ftrace_hash *hash; char *mod; int ret = -EINVAL; @@ -1944,9 +1961,9 @@ ftrace_mod_callback(char *func, char *cmd, char *param, int enable) return ret; if (enable) - hash = &filter_hash; + hash = ops->filter_hash; else - hash = ¬race_hash; + hash = ops->notrace_hash; ret = ftrace_match_module_records(hash, func, mod); if (!ret) @@ -2245,14 +2262,15 @@ int unregister_ftrace_command(struct ftrace_func_command *cmd) static int ftrace_process_regex(char *buff, int len, int enable) { char *func, *command, *next = buff; + struct ftrace_ops *ops = &global_ops; struct ftrace_func_command *p; struct ftrace_hash *hash; int ret; if (enable) - hash = &filter_hash; + hash = ops->filter_hash; else - hash = ¬race_hash; + hash = ops->notrace_hash; func = strsep(&next, ":"); @@ -2339,11 +2357,19 @@ ftrace_notrace_write(struct file *file, const char __user *ubuf, } static void -ftrace_set_regex(struct ftrace_hash *hash, unsigned char *buf, int len, int reset) +ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, + int reset, int enable) { + struct ftrace_hash *hash; + if (unlikely(ftrace_disabled)) return; + if (enable) + hash = ops->filter_hash; + else + hash = ops->notrace_hash; + mutex_lock(&ftrace_regex_lock); if (reset) ftrace_filter_reset(hash); @@ -2363,7 +2389,7 @@ ftrace_set_regex(struct ftrace_hash *hash, unsigned char *buf, int len, int rese */ void ftrace_set_filter(unsigned char *buf, int len, int reset) { - ftrace_set_regex(&filter_hash, buf, len, reset); + ftrace_set_regex(&global_ops, buf, len, reset, 1); } /** @@ -2378,7 +2404,7 @@ void ftrace_set_filter(unsigned char *buf, int len, int reset) */ void ftrace_set_notrace(unsigned char *buf, int len, int reset) { - ftrace_set_regex(¬race_hash, buf, len, reset); + ftrace_set_regex(&global_ops, buf, len, reset, 0); } /* @@ -2430,22 +2456,23 @@ static void __init set_ftrace_early_graph(char *buf) } #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ -static void __init set_ftrace_early_filter(struct ftrace_hash *hash, char *buf) +static void __init +set_ftrace_early_filter(struct ftrace_ops *ops, char *buf, int enable) { char *func; while (buf) { func = strsep(&buf, ","); - ftrace_set_regex(hash, func, strlen(func), 0); + ftrace_set_regex(ops, func, strlen(func), 0, enable); } } static void __init set_ftrace_early_filters(void) { if (ftrace_filter_buf[0]) - set_ftrace_early_filter(&filter_hash, ftrace_filter_buf); + set_ftrace_early_filter(&global_ops, ftrace_filter_buf, 1); if (ftrace_notrace_buf[0]) - set_ftrace_early_filter(¬race_hash, ftrace_notrace_buf); + set_ftrace_early_filter(&global_ops, ftrace_notrace_buf, 0); #ifdef CONFIG_FUNCTION_GRAPH_TRACER if (ftrace_graph_buf[0]) set_ftrace_early_graph(ftrace_graph_buf); -- cgit v1.1 From 33dc9b1267d59cef46ff0bd6bc043190845dc919 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 2 May 2011 17:34:47 -0400 Subject: ftrace: Separate hash allocation and assignment When filtering, allocate a hash to insert the function records. After the filtering is complete, assign it to the ftrace_ops structure. This allows the ftrace_ops structure to have a much smaller array of hash buckets instead of wasting a lot of memory. A read only empty_hash is created to be the minimum size that any ftrace_ops can point to. When a new hash is created, it has the following steps: o Allocate a default hash. o Walk the function records assigning the filtered records to the hash o Allocate a new hash with the appropriate size buckets o Move the entries from the default hash to the new hash. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 275 ++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 233 insertions(+), 42 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index a517a6c..46f0826 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -57,7 +57,8 @@ /* hash bits for specific function selection */ #define FTRACE_HASH_BITS 7 #define FTRACE_FUNC_HASHSIZE (1 << FTRACE_HASH_BITS) -#define FTRACE_HASH_MAX_BITS 10 +#define FTRACE_HASH_DEFAULT_BITS 10 +#define FTRACE_HASH_MAX_BITS 12 /* ftrace_enabled is a method to turn ftrace on or off */ int ftrace_enabled __read_mostly; @@ -877,22 +878,22 @@ struct ftrace_hash { unsigned long count; }; -static struct hlist_head notrace_buckets[1 << FTRACE_HASH_MAX_BITS]; -static struct ftrace_hash notrace_hash = { - .size_bits = FTRACE_HASH_MAX_BITS, - .buckets = notrace_buckets, -}; - -static struct hlist_head filter_buckets[1 << FTRACE_HASH_MAX_BITS]; -static struct ftrace_hash filter_hash = { - .size_bits = FTRACE_HASH_MAX_BITS, - .buckets = filter_buckets, +/* + * We make these constant because no one should touch them, + * but they are used as the default "empty hash", to avoid allocating + * it all the time. These are in a read only section such that if + * anyone does try to modify it, it will cause an exception. + */ +static const struct hlist_head empty_buckets[1]; +static const struct ftrace_hash empty_hash = { + .buckets = (struct hlist_head *)empty_buckets, }; +#define EMPTY_HASH ((struct ftrace_hash *)&empty_hash) struct ftrace_ops global_ops = { .func = ftrace_stub, - .notrace_hash = ¬race_hash, - .filter_hash = &filter_hash, + .notrace_hash = EMPTY_HASH, + .filter_hash = EMPTY_HASH, }; static struct dyn_ftrace *ftrace_new_addrs; @@ -941,31 +942,38 @@ ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip) return NULL; } -static int add_hash_entry(struct ftrace_hash *hash, unsigned long ip) +static void __add_hash_entry(struct ftrace_hash *hash, + struct ftrace_func_entry *entry) { - struct ftrace_func_entry *entry; struct hlist_head *hhd; unsigned long key; - entry = kmalloc(sizeof(*entry), GFP_KERNEL); - if (!entry) - return -ENOMEM; - if (hash->size_bits) - key = hash_long(ip, hash->size_bits); + key = hash_long(entry->ip, hash->size_bits); else key = 0; - entry->ip = ip; hhd = &hash->buckets[key]; hlist_add_head(&entry->hlist, hhd); hash->count++; +} + +static int add_hash_entry(struct ftrace_hash *hash, unsigned long ip) +{ + struct ftrace_func_entry *entry; + + entry = kmalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + return -ENOMEM; + + entry->ip = ip; + __add_hash_entry(hash, entry); return 0; } static void -remove_hash_entry(struct ftrace_hash *hash, +free_hash_entry(struct ftrace_hash *hash, struct ftrace_func_entry *entry) { hlist_del(&entry->hlist); @@ -973,6 +981,14 @@ remove_hash_entry(struct ftrace_hash *hash, hash->count--; } +static void +remove_hash_entry(struct ftrace_hash *hash, + struct ftrace_func_entry *entry) +{ + hlist_del(&entry->hlist); + hash->count--; +} + static void ftrace_hash_clear(struct ftrace_hash *hash) { struct hlist_head *hhd; @@ -981,14 +997,156 @@ static void ftrace_hash_clear(struct ftrace_hash *hash) int size = 1 << hash->size_bits; int i; + if (!hash->count) + return; + for (i = 0; i < size; i++) { hhd = &hash->buckets[i]; hlist_for_each_entry_safe(entry, tp, tn, hhd, hlist) - remove_hash_entry(hash, entry); + free_hash_entry(hash, entry); } FTRACE_WARN_ON(hash->count); } +static void free_ftrace_hash(struct ftrace_hash *hash) +{ + if (!hash || hash == EMPTY_HASH) + return; + ftrace_hash_clear(hash); + kfree(hash->buckets); + kfree(hash); +} + +static struct ftrace_hash *alloc_ftrace_hash(int size_bits) +{ + struct ftrace_hash *hash; + int size; + + hash = kzalloc(sizeof(*hash), GFP_KERNEL); + if (!hash) + return NULL; + + size = 1 << size_bits; + hash->buckets = kzalloc(sizeof(*hash->buckets) * size, GFP_KERNEL); + + if (!hash->buckets) { + kfree(hash); + return NULL; + } + + hash->size_bits = size_bits; + + return hash; +} + +static struct ftrace_hash * +alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash) +{ + struct ftrace_func_entry *entry; + struct ftrace_hash *new_hash; + struct hlist_node *tp; + int size; + int ret; + int i; + + new_hash = alloc_ftrace_hash(size_bits); + if (!new_hash) + return NULL; + + /* Empty hash? */ + if (!hash || !hash->count) + return new_hash; + + size = 1 << hash->size_bits; + for (i = 0; i < size; i++) { + hlist_for_each_entry(entry, tp, &hash->buckets[i], hlist) { + ret = add_hash_entry(new_hash, entry->ip); + if (ret < 0) + goto free_hash; + } + } + + FTRACE_WARN_ON(new_hash->count != hash->count); + + return new_hash; + + free_hash: + free_ftrace_hash(new_hash); + return NULL; +} + +static int +ftrace_hash_move(struct ftrace_hash **dst, struct ftrace_hash *src) +{ + struct ftrace_func_entry *entry; + struct hlist_node *tp, *tn; + struct hlist_head *hhd; + struct ftrace_hash *hash = *dst; + unsigned long key; + int size = src->count; + int bits = 0; + int i; + + /* + * If the new source is empty, just free dst and assign it + * the empty_hash. + */ + if (!src->count) { + free_ftrace_hash(*dst); + *dst = EMPTY_HASH; + return 0; + } + + ftrace_hash_clear(hash); + + /* + * Make the hash size about 1/2 the # found + */ + for (size /= 2; size; size >>= 1) + bits++; + + /* Don't allocate too much */ + if (bits > FTRACE_HASH_MAX_BITS) + bits = FTRACE_HASH_MAX_BITS; + + /* We can't modify the empty_hash */ + if (hash == EMPTY_HASH) { + /* Create a new hash */ + *dst = alloc_ftrace_hash(bits); + if (!*dst) { + *dst = EMPTY_HASH; + return -ENOMEM; + } + hash = *dst; + } else { + size = 1 << bits; + + /* Use the old hash, but create new buckets */ + hhd = kzalloc(sizeof(*hhd) * size, GFP_KERNEL); + if (!hhd) + return -ENOMEM; + + kfree(hash->buckets); + hash->buckets = hhd; + hash->size_bits = bits; + } + + size = 1 << src->size_bits; + for (i = 0; i < size; i++) { + hhd = &src->buckets[i]; + hlist_for_each_entry_safe(entry, tp, tn, hhd, hlist) { + if (bits > 0) + key = hash_long(entry->ip, bits); + else + key = 0; + remove_hash_entry(src, entry); + __add_hash_entry(hash, entry); + } + } + + return 0; +} + /* * This is a double for. Do not use 'break' to break out of the loop, * you must use a goto. @@ -1443,6 +1601,7 @@ struct ftrace_iterator { struct ftrace_func_probe *probe; struct trace_parser parser; struct ftrace_hash *hash; + struct ftrace_ops *ops; int hidx; int idx; unsigned flags; @@ -1742,22 +1901,37 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag, else hash = ops->filter_hash; - iter->hash = hash; + iter->ops = ops; + iter->flags = flag; + + if (file->f_mode & FMODE_WRITE) { + mutex_lock(&ftrace_lock); + iter->hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, hash); + mutex_unlock(&ftrace_lock); + + if (!iter->hash) { + trace_parser_put(&iter->parser); + kfree(iter); + return -ENOMEM; + } + } mutex_lock(&ftrace_regex_lock); + if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) - ftrace_filter_reset(hash); + ftrace_filter_reset(iter->hash); if (file->f_mode & FMODE_READ) { iter->pg = ftrace_pages_start; - iter->flags = flag; ret = seq_open(file, &show_ftrace_seq_ops); if (!ret) { struct seq_file *m = file->private_data; m->private = iter; } else { + /* Failed */ + free_ftrace_hash(iter->hash); trace_parser_put(&iter->parser); kfree(iter); } @@ -1835,7 +2009,7 @@ enter_record(struct ftrace_hash *hash, struct dyn_ftrace *rec, int not) if (!entry) return 0; - remove_hash_entry(hash, entry); + free_hash_entry(hash, entry); } else { /* Do nothing if it exists */ if (entry) @@ -2259,19 +2433,13 @@ int unregister_ftrace_command(struct ftrace_func_command *cmd) return ret; } -static int ftrace_process_regex(char *buff, int len, int enable) +static int ftrace_process_regex(struct ftrace_hash *hash, + char *buff, int len, int enable) { char *func, *command, *next = buff; - struct ftrace_ops *ops = &global_ops; struct ftrace_func_command *p; - struct ftrace_hash *hash; int ret; - if (enable) - hash = ops->filter_hash; - else - hash = ops->notrace_hash; - func = strsep(&next, ":"); if (!next) { @@ -2328,7 +2496,7 @@ ftrace_regex_write(struct file *file, const char __user *ubuf, if (read >= 0 && trace_parser_loaded(parser) && !trace_parser_cont(parser)) { - ret = ftrace_process_regex(parser->buffer, + ret = ftrace_process_regex(iter->hash, parser->buffer, parser->idx, enable); trace_parser_clear(parser); if (ret) @@ -2356,26 +2524,40 @@ ftrace_notrace_write(struct file *file, const char __user *ubuf, return ftrace_regex_write(file, ubuf, cnt, ppos, 0); } -static void +static int ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, int reset, int enable) { + struct ftrace_hash **orig_hash; struct ftrace_hash *hash; + int ret; if (unlikely(ftrace_disabled)) - return; + return -ENODEV; if (enable) - hash = ops->filter_hash; + orig_hash = &ops->filter_hash; else - hash = ops->notrace_hash; + orig_hash = &ops->notrace_hash; + + hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash); + if (!hash) + return -ENOMEM; mutex_lock(&ftrace_regex_lock); if (reset) ftrace_filter_reset(hash); if (buf) ftrace_match_records(hash, buf, len); + + mutex_lock(&ftrace_lock); + ret = ftrace_hash_move(orig_hash, hash); + mutex_unlock(&ftrace_lock); + mutex_unlock(&ftrace_regex_lock); + + free_ftrace_hash(hash); + return ret; } /** @@ -2484,7 +2666,9 @@ ftrace_regex_release(struct inode *inode, struct file *file) { struct seq_file *m = (struct seq_file *)file->private_data; struct ftrace_iterator *iter; + struct ftrace_hash **orig_hash; struct trace_parser *parser; + int ret; mutex_lock(&ftrace_regex_lock); if (file->f_mode & FMODE_READ) { @@ -2501,14 +2685,21 @@ ftrace_regex_release(struct inode *inode, struct file *file) } trace_parser_put(parser); - kfree(iter); if (file->f_mode & FMODE_WRITE) { + if (iter->flags & FTRACE_ITER_NOTRACE) + orig_hash = &iter->ops->notrace_hash; + else + orig_hash = &iter->ops->filter_hash; + mutex_lock(&ftrace_lock); - if (ftrace_start_up && ftrace_enabled) + ret = ftrace_hash_move(orig_hash, iter->hash); + if (!ret && ftrace_start_up && ftrace_enabled) ftrace_run_update_code(FTRACE_ENABLE_CALLS); mutex_unlock(&ftrace_lock); } + free_ftrace_hash(iter->hash); + kfree(iter); mutex_unlock(&ftrace_regex_lock); return 0; -- cgit v1.1 From ed926f9b35cda0988234c356e16a7cb30f4e5338 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 3 May 2011 13:25:24 -0400 Subject: ftrace: Use counters to enable functions to trace Every function has its own record that stores the instruction pointer and flags for the function to be traced. There are only two flags: enabled and free. The enabled flag states that tracing for the function has been enabled (actively traced), and the free flag states that the record no longer points to a function and can be used by new functions (loaded modules). These flags are now moved to the MSB of the flags (actually just the top 32bits). The rest of the bits (30 bits) are now used as a ref counter. Everytime a tracer register functions to trace, those functions will have its counter incremented. When tracing is enabled, to determine if a function should be traced, the counter is examined, and if it is non-zero it is set to trace. When a ftrace_ops is registered to trace functions, its hashes are examined. If the ftrace_ops filter_hash count is zero, then all functions are set to be traced, otherwise only the functions in the hash are to be traced. The exception to this is if a function is also in the ftrace_ops notrace_hash. Then that function's counter is not incremented for this ftrace_ops. Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 8 ++- kernel/trace/ftrace.c | 158 ++++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 148 insertions(+), 18 deletions(-) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 6658a51..ab1c46e 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -37,6 +37,7 @@ struct ftrace_ops { #ifdef CONFIG_DYNAMIC_FTRACE struct ftrace_hash *notrace_hash; struct ftrace_hash *filter_hash; + unsigned long flags; #endif }; @@ -152,10 +153,13 @@ extern void unregister_ftrace_function_probe_all(char *glob); extern int ftrace_text_reserved(void *start, void *end); enum { - FTRACE_FL_FREE = (1 << 0), - FTRACE_FL_ENABLED = (1 << 1), + FTRACE_FL_ENABLED = (1 << 30), + FTRACE_FL_FREE = (1 << 31), }; +#define FTRACE_FL_MASK (0x3UL << 30) +#define FTRACE_REF_MAX ((1 << 30) - 1) + struct dyn_ftrace { union { unsigned long ip; /* address of mcount call-site */ diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 46f0826..5dd332cc 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -890,6 +890,10 @@ static const struct ftrace_hash empty_hash = { }; #define EMPTY_HASH ((struct ftrace_hash *)&empty_hash) +enum { + FTRACE_OPS_FL_ENABLED = 1, +}; + struct ftrace_ops global_ops = { .func = ftrace_stub, .notrace_hash = EMPTY_HASH, @@ -1161,6 +1165,105 @@ ftrace_hash_move(struct ftrace_hash **dst, struct ftrace_hash *src) } \ } +static void __ftrace_hash_rec_update(struct ftrace_ops *ops, + int filter_hash, + bool inc) +{ + struct ftrace_hash *hash; + struct ftrace_hash *other_hash; + struct ftrace_page *pg; + struct dyn_ftrace *rec; + int count = 0; + int all = 0; + + /* Only update if the ops has been registered */ + if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) + return; + + /* + * In the filter_hash case: + * If the count is zero, we update all records. + * Otherwise we just update the items in the hash. + * + * In the notrace_hash case: + * We enable the update in the hash. + * As disabling notrace means enabling the tracing, + * and enabling notrace means disabling, the inc variable + * gets inversed. + */ + if (filter_hash) { + hash = ops->filter_hash; + other_hash = ops->notrace_hash; + if (!hash->count) + all = 1; + } else { + inc = !inc; + hash = ops->notrace_hash; + other_hash = ops->filter_hash; + /* + * If the notrace hash has no items, + * then there's nothing to do. + */ + if (!hash->count) + return; + } + + do_for_each_ftrace_rec(pg, rec) { + int in_other_hash = 0; + int in_hash = 0; + int match = 0; + + if (all) { + /* + * Only the filter_hash affects all records. + * Update if the record is not in the notrace hash. + */ + if (!ftrace_lookup_ip(other_hash, rec->ip)) + match = 1; + } else { + in_hash = !!ftrace_lookup_ip(hash, rec->ip); + in_other_hash = !!ftrace_lookup_ip(other_hash, rec->ip); + + /* + * + */ + if (filter_hash && in_hash && !in_other_hash) + match = 1; + else if (!filter_hash && in_hash && + (in_other_hash || !other_hash->count)) + match = 1; + } + if (!match) + continue; + + if (inc) { + rec->flags++; + if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == FTRACE_REF_MAX)) + return; + } else { + if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == 0)) + return; + rec->flags--; + } + count++; + /* Shortcut, if we handled all records, we are done. */ + if (!all && count == hash->count) + return; + } while_for_each_ftrace_rec(); +} + +static void ftrace_hash_rec_disable(struct ftrace_ops *ops, + int filter_hash) +{ + __ftrace_hash_rec_update(ops, filter_hash, 0); +} + +static void ftrace_hash_rec_enable(struct ftrace_ops *ops, + int filter_hash) +{ + __ftrace_hash_rec_update(ops, filter_hash, 1); +} + static void ftrace_free_rec(struct dyn_ftrace *rec) { rec->freelist = ftrace_free_records; @@ -1276,26 +1379,24 @@ int ftrace_text_reserved(void *start, void *end) static int __ftrace_replace_code(struct dyn_ftrace *rec, int enable) { - struct ftrace_ops *ops = &global_ops; unsigned long ftrace_addr; unsigned long flag = 0UL; ftrace_addr = (unsigned long)FTRACE_ADDR; /* - * If this record is not to be traced or we want to disable it, - * then disable it. + * If we are enabling tracing: * - * If we want to enable it and filtering is off, then enable it. + * If the record has a ref count, then we need to enable it + * because someone is using it. * - * If we want to enable it and filtering is on, enable it only if - * it's filtered + * Otherwise we make sure its disabled. + * + * If we are disabling tracing, then disable all records that + * are enabled. */ - if (enable && !ftrace_lookup_ip(ops->notrace_hash, rec->ip)) { - if (!ops->filter_hash->count || - ftrace_lookup_ip(ops->filter_hash, rec->ip)) - flag = FTRACE_FL_ENABLED; - } + if (enable && (rec->flags & ~FTRACE_FL_MASK)) + flag = FTRACE_FL_ENABLED; /* If the state of this record hasn't changed, then do nothing */ if ((rec->flags & FTRACE_FL_ENABLED) == flag) @@ -1423,17 +1524,25 @@ static void ftrace_startup_enable(int command) static void ftrace_startup(int command) { + struct ftrace_ops *ops = &global_ops; + if (unlikely(ftrace_disabled)) return; ftrace_start_up++; command |= FTRACE_ENABLE_CALLS; + ops->flags |= FTRACE_OPS_FL_ENABLED; + if (ftrace_start_up == 1) + ftrace_hash_rec_enable(ops, 1); + ftrace_startup_enable(command); } static void ftrace_shutdown(int command) { + struct ftrace_ops *ops = &global_ops; + if (unlikely(ftrace_disabled)) return; @@ -1446,7 +1555,12 @@ static void ftrace_shutdown(int command) WARN_ON_ONCE(ftrace_start_up < 0); if (!ftrace_start_up) + ftrace_hash_rec_disable(ops, 1); + + if (!ftrace_start_up) { command |= FTRACE_DISABLE_CALLS; + ops->flags &= ~FTRACE_OPS_FL_ENABLED; + } if (saved_ftrace_func != ftrace_trace_function) { saved_ftrace_func = ftrace_trace_function; @@ -2668,6 +2782,7 @@ ftrace_regex_release(struct inode *inode, struct file *file) struct ftrace_iterator *iter; struct ftrace_hash **orig_hash; struct trace_parser *parser; + int filter_hash; int ret; mutex_lock(&ftrace_regex_lock); @@ -2687,15 +2802,26 @@ ftrace_regex_release(struct inode *inode, struct file *file) trace_parser_put(parser); if (file->f_mode & FMODE_WRITE) { - if (iter->flags & FTRACE_ITER_NOTRACE) - orig_hash = &iter->ops->notrace_hash; - else + filter_hash = !!(iter->flags & FTRACE_ITER_FILTER); + + if (filter_hash) orig_hash = &iter->ops->filter_hash; + else + orig_hash = &iter->ops->notrace_hash; mutex_lock(&ftrace_lock); + /* + * Remove the current set, update the hash and add + * them back. + */ + ftrace_hash_rec_disable(iter->ops, filter_hash); ret = ftrace_hash_move(orig_hash, iter->hash); - if (!ret && ftrace_start_up && ftrace_enabled) - ftrace_run_update_code(FTRACE_ENABLE_CALLS); + if (!ret) { + ftrace_hash_rec_enable(iter->ops, filter_hash); + if (iter->ops->flags & FTRACE_OPS_FL_ENABLED + && ftrace_enabled) + ftrace_run_update_code(FTRACE_ENABLE_CALLS); + } mutex_unlock(&ftrace_lock); } free_ftrace_hash(iter->hash); -- cgit v1.1 From 647bcd03d5b2fb44fd9c9ef1a4f50c2eee8f779a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 3 May 2011 14:39:21 -0400 Subject: ftrace: Add enabled_functions file Add the enabled_functions file that is used to show all the functions that have been enabled for tracing as well as their ref counts. This helps seeing if any function has been registered and what functions are being traced. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 49 insertions(+), 2 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 5dd332cc..065f1e6 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1703,6 +1703,7 @@ enum { FTRACE_ITER_NOTRACE = (1 << 1), FTRACE_ITER_PRINTALL = (1 << 2), FTRACE_ITER_HASH = (1 << 3), + FTRACE_ITER_ENABLED = (1 << 4), }; #define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ @@ -1842,7 +1843,11 @@ t_next(struct seq_file *m, void *v, loff_t *pos) !(ftrace_lookup_ip(ops->filter_hash, rec->ip))) || ((iter->flags & FTRACE_ITER_NOTRACE) && - !ftrace_lookup_ip(ops->notrace_hash, rec->ip))) { + !ftrace_lookup_ip(ops->notrace_hash, rec->ip)) || + + ((iter->flags & FTRACE_ITER_ENABLED) && + !(rec->flags & ~FTRACE_FL_MASK))) { + rec = NULL; goto retry; } @@ -1944,7 +1949,11 @@ static int t_show(struct seq_file *m, void *v) if (!rec) return 0; - seq_printf(m, "%ps\n", (void *)rec->ip); + seq_printf(m, "%ps", (void *)rec->ip); + if (iter->flags & FTRACE_ITER_ENABLED) + seq_printf(m, " (%ld)", + rec->flags & ~FTRACE_FL_MASK); + seq_printf(m, "\n"); return 0; } @@ -1983,6 +1992,34 @@ ftrace_avail_open(struct inode *inode, struct file *file) return ret; } +static int +ftrace_enabled_open(struct inode *inode, struct file *file) +{ + struct ftrace_iterator *iter; + int ret; + + if (unlikely(ftrace_disabled)) + return -ENODEV; + + iter = kzalloc(sizeof(*iter), GFP_KERNEL); + if (!iter) + return -ENOMEM; + + iter->pg = ftrace_pages_start; + iter->flags = FTRACE_ITER_ENABLED; + + ret = seq_open(file, &show_ftrace_seq_ops); + if (!ret) { + struct seq_file *m = file->private_data; + + m->private = iter; + } else { + kfree(iter); + } + + return ret; +} + static void ftrace_filter_reset(struct ftrace_hash *hash) { mutex_lock(&ftrace_lock); @@ -2838,6 +2875,13 @@ static const struct file_operations ftrace_avail_fops = { .release = seq_release_private, }; +static const struct file_operations ftrace_enabled_fops = { + .open = ftrace_enabled_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + static const struct file_operations ftrace_filter_fops = { .open = ftrace_filter_open, .read = seq_read, @@ -3069,6 +3113,9 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) trace_create_file("available_filter_functions", 0444, d_tracer, NULL, &ftrace_avail_fops); + trace_create_file("enabled_functions", 0444, + d_tracer, NULL, &ftrace_enabled_fops); + trace_create_file("set_ftrace_filter", 0644, d_tracer, NULL, &ftrace_filter_fops); -- cgit v1.1 From bd69c30b1d08032d97ab0dabd7a1eb7fb73ca2b2 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 3 May 2011 21:55:54 -0400 Subject: ftrace: Add ops parameter to ftrace_startup/shutdown functions In order to allow different ops to enable different functions, the ftrace_startup() and ftrace_shutdown() functions need the ops parameter passed to them. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 065f1e6..8fef1d9 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1522,10 +1522,8 @@ static void ftrace_startup_enable(int command) ftrace_run_update_code(command); } -static void ftrace_startup(int command) +static void ftrace_startup(struct ftrace_ops *ops, int command) { - struct ftrace_ops *ops = &global_ops; - if (unlikely(ftrace_disabled)) return; @@ -1539,10 +1537,8 @@ static void ftrace_startup(int command) ftrace_startup_enable(command); } -static void ftrace_shutdown(int command) +static void ftrace_shutdown(struct ftrace_ops *ops, int command) { - struct ftrace_ops *ops = &global_ops; - if (unlikely(ftrace_disabled)) return; @@ -2362,7 +2358,7 @@ static void __enable_ftrace_function_probe(void) return; __register_ftrace_function(&trace_probe_ops); - ftrace_startup(0); + ftrace_startup(&global_ops, 0); ftrace_probe_registered = 1; } @@ -2381,7 +2377,7 @@ static void __disable_ftrace_function_probe(void) /* no more funcs left */ __unregister_ftrace_function(&trace_probe_ops); - ftrace_shutdown(0); + ftrace_shutdown(&global_ops, 0); ftrace_probe_registered = 0; } @@ -3267,6 +3263,10 @@ void __init ftrace_init(void) #else +struct ftrace_ops global_ops = { + .func = ftrace_stub, +}; + static int __init ftrace_nodyn_init(void) { ftrace_enabled = 1; @@ -3277,8 +3277,8 @@ device_initcall(ftrace_nodyn_init); static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; } static inline void ftrace_startup_enable(int command) { } /* Keep as macros so we do not need to define the commands */ -# define ftrace_startup(command) do { } while (0) -# define ftrace_shutdown(command) do { } while (0) +# define ftrace_startup(ops, command) do { } while (0) +# define ftrace_shutdown(ops, command) do { } while (0) # define ftrace_startup_sysctl() do { } while (0) # define ftrace_shutdown_sysctl() do { } while (0) #endif /* CONFIG_DYNAMIC_FTRACE */ @@ -3583,7 +3583,7 @@ int register_ftrace_function(struct ftrace_ops *ops) goto out_unlock; ret = __register_ftrace_function(ops); - ftrace_startup(0); + ftrace_startup(&global_ops, 0); out_unlock: mutex_unlock(&ftrace_lock); @@ -3602,7 +3602,7 @@ int unregister_ftrace_function(struct ftrace_ops *ops) mutex_lock(&ftrace_lock); ret = __unregister_ftrace_function(ops); - ftrace_shutdown(0); + ftrace_shutdown(&global_ops, 0); mutex_unlock(&ftrace_lock); return ret; @@ -3825,7 +3825,7 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc, ftrace_graph_return = retfunc; ftrace_graph_entry = entryfunc; - ftrace_startup(FTRACE_START_FUNC_RET); + ftrace_startup(&global_ops, FTRACE_START_FUNC_RET); out: mutex_unlock(&ftrace_lock); @@ -3842,7 +3842,7 @@ void unregister_ftrace_graph(void) ftrace_graph_active--; ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; ftrace_graph_entry = ftrace_graph_entry_stub; - ftrace_shutdown(FTRACE_STOP_FUNC_RET); + ftrace_shutdown(&global_ops, FTRACE_STOP_FUNC_RET); unregister_pm_notifier(&ftrace_suspend_notifier); unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); -- cgit v1.1 From 2b499381bc50ede01b3d8eab164ca2fad00655f0 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 3 May 2011 22:49:52 -0400 Subject: ftrace: Have global_ops store the functions that are to be traced This is a step towards each ops structure defining its own set of functions to trace. As the current code with pid's and such are specific to the global_ops, it is restructured to be used with the global ops. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 69 +++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 53 insertions(+), 16 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 8fef1d9..dcce0bf 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -91,6 +91,7 @@ static struct ftrace_ops *ftrace_list __read_mostly = &ftrace_list_end; ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub; ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; +static struct ftrace_ops global_ops; /* * Traverse the ftrace_list, invoking all entries. The reason that we @@ -153,7 +154,7 @@ static void ftrace_test_stop_func(unsigned long ip, unsigned long parent_ip) } #endif -static void update_ftrace_function(void) +static void update_global_ops(void) { ftrace_func_t func; @@ -173,6 +174,18 @@ static void update_ftrace_function(void) set_ftrace_pid_function(func); func = ftrace_pid_func; } + + global_ops.func = func; +} + +static void update_ftrace_function(void) +{ + ftrace_func_t func; + + update_global_ops(); + + func = global_ops.func; + #ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST ftrace_trace_function = func; #else @@ -181,24 +194,19 @@ static void update_ftrace_function(void) #endif } -static int __register_ftrace_function(struct ftrace_ops *ops) +static void add_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) { - ops->next = ftrace_list; + ops->next = *list; /* * We are entering ops into the ftrace_list but another * CPU might be walking that list. We need to make sure * the ops->next pointer is valid before another CPU sees * the ops pointer included into the ftrace_list. */ - rcu_assign_pointer(ftrace_list, ops); - - if (ftrace_enabled) - update_ftrace_function(); - - return 0; + rcu_assign_pointer(*list, ops); } -static int __unregister_ftrace_function(struct ftrace_ops *ops) +static int remove_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) { struct ftrace_ops **p; @@ -206,13 +214,12 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) * If we are removing the last function, then simply point * to the ftrace_stub. */ - if (ftrace_list == ops && ops->next == &ftrace_list_end) { - ftrace_trace_function = ftrace_stub; - ftrace_list = &ftrace_list_end; + if (*list == ops && ops->next == &ftrace_list_end) { + *list = &ftrace_list_end; return 0; } - for (p = &ftrace_list; *p != &ftrace_list_end; p = &(*p)->next) + for (p = list; *p != &ftrace_list_end; p = &(*p)->next) if (*p == ops) break; @@ -220,7 +227,37 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) return -1; *p = (*p)->next; + return 0; +} + +static int __register_ftrace_function(struct ftrace_ops *ops) +{ + if (ftrace_disabled) + return -ENODEV; + + if (FTRACE_WARN_ON(ops == &global_ops)) + return -EINVAL; + + add_ftrace_ops(&ftrace_list, ops); + if (ftrace_enabled) + update_ftrace_function(); + + return 0; +} +static int __unregister_ftrace_function(struct ftrace_ops *ops) +{ + int ret; + + if (ftrace_disabled) + return -ENODEV; + + if (FTRACE_WARN_ON(ops == &global_ops)) + return -EINVAL; + + ret = remove_ftrace_ops(&ftrace_list, ops); + if (ret < 0) + return ret; if (ftrace_enabled) update_ftrace_function(); @@ -894,7 +931,7 @@ enum { FTRACE_OPS_FL_ENABLED = 1, }; -struct ftrace_ops global_ops = { +static struct ftrace_ops global_ops = { .func = ftrace_stub, .notrace_hash = EMPTY_HASH, .filter_hash = EMPTY_HASH, @@ -3263,7 +3300,7 @@ void __init ftrace_init(void) #else -struct ftrace_ops global_ops = { +static struct ftrace_ops global_ops = { .func = ftrace_stub, }; -- cgit v1.1 From 07fd5515f3b5c20704707f63e7f4485b534508a8 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 5 May 2011 18:03:47 -0400 Subject: ftrace: Free hash with call_rcu_sched() When a hash is modified and might be in use, we need to perform a schedule RCU operation on it, as the hashes will soon be used directly in the function tracer callback. Cc: Paul E. McKenney Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 55 ++++++++++++++++++++++++++------------------------- 1 file changed, 28 insertions(+), 27 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index dcce0bf..92b6fdf 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -913,6 +913,7 @@ struct ftrace_hash { unsigned long size_bits; struct hlist_head *buckets; unsigned long count; + struct rcu_head rcu; }; /* @@ -1058,6 +1059,21 @@ static void free_ftrace_hash(struct ftrace_hash *hash) kfree(hash); } +static void __free_ftrace_hash_rcu(struct rcu_head *rcu) +{ + struct ftrace_hash *hash; + + hash = container_of(rcu, struct ftrace_hash, rcu); + free_ftrace_hash(hash); +} + +static void free_ftrace_hash_rcu(struct ftrace_hash *hash) +{ + if (!hash || hash == EMPTY_HASH) + return; + call_rcu_sched(&hash->rcu, __free_ftrace_hash_rcu); +} + static struct ftrace_hash *alloc_ftrace_hash(int size_bits) { struct ftrace_hash *hash; @@ -1122,7 +1138,8 @@ ftrace_hash_move(struct ftrace_hash **dst, struct ftrace_hash *src) struct ftrace_func_entry *entry; struct hlist_node *tp, *tn; struct hlist_head *hhd; - struct ftrace_hash *hash = *dst; + struct ftrace_hash *old_hash; + struct ftrace_hash *new_hash; unsigned long key; int size = src->count; int bits = 0; @@ -1133,13 +1150,11 @@ ftrace_hash_move(struct ftrace_hash **dst, struct ftrace_hash *src) * the empty_hash. */ if (!src->count) { - free_ftrace_hash(*dst); - *dst = EMPTY_HASH; + free_ftrace_hash_rcu(*dst); + rcu_assign_pointer(*dst, EMPTY_HASH); return 0; } - ftrace_hash_clear(hash); - /* * Make the hash size about 1/2 the # found */ @@ -1150,27 +1165,9 @@ ftrace_hash_move(struct ftrace_hash **dst, struct ftrace_hash *src) if (bits > FTRACE_HASH_MAX_BITS) bits = FTRACE_HASH_MAX_BITS; - /* We can't modify the empty_hash */ - if (hash == EMPTY_HASH) { - /* Create a new hash */ - *dst = alloc_ftrace_hash(bits); - if (!*dst) { - *dst = EMPTY_HASH; - return -ENOMEM; - } - hash = *dst; - } else { - size = 1 << bits; - - /* Use the old hash, but create new buckets */ - hhd = kzalloc(sizeof(*hhd) * size, GFP_KERNEL); - if (!hhd) - return -ENOMEM; - - kfree(hash->buckets); - hash->buckets = hhd; - hash->size_bits = bits; - } + new_hash = alloc_ftrace_hash(bits); + if (!new_hash) + return -ENOMEM; size = 1 << src->size_bits; for (i = 0; i < size; i++) { @@ -1181,10 +1178,14 @@ ftrace_hash_move(struct ftrace_hash **dst, struct ftrace_hash *src) else key = 0; remove_hash_entry(src, entry); - __add_hash_entry(hash, entry); + __add_hash_entry(new_hash, entry); } } + old_hash = *dst; + rcu_assign_pointer(*dst, new_hash); + free_ftrace_hash_rcu(old_hash); + return 0; } -- cgit v1.1 From b848914ce39589d89ee0078a6d1ef452b464729e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 4 May 2011 09:27:52 -0400 Subject: ftrace: Implement separate user function filtering ftrace_ops that are registered to trace functions can now be agnostic to each other in respect to what functions they trace. Each ops has their own hash of the functions they want to trace and a hash to what they do not want to trace. A empty hash for the functions they want to trace denotes all functions should be traced that are not in the notrace hash. Cc: Paul E. McKenney Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 7 +- kernel/trace/ftrace.c | 193 ++++++++++++++++++++++++++++++-------- kernel/trace/trace_functions.c | 2 + kernel/trace/trace_irqsoff.c | 1 + kernel/trace/trace_sched_wakeup.c | 1 + kernel/trace/trace_stack.c | 1 + 6 files changed, 166 insertions(+), 39 deletions(-) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index ab1c46e..4609c0e 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -31,13 +31,18 @@ typedef void (*ftrace_func_t)(unsigned long ip, unsigned long parent_ip); struct ftrace_hash; +enum { + FTRACE_OPS_FL_ENABLED = 1 << 0, + FTRACE_OPS_FL_GLOBAL = 1 << 1, +}; + struct ftrace_ops { ftrace_func_t func; struct ftrace_ops *next; + unsigned long flags; #ifdef CONFIG_DYNAMIC_FTRACE struct ftrace_hash *notrace_hash; struct ftrace_hash *filter_hash; - unsigned long flags; #endif }; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 92b6fdf..6c7e1df 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -87,24 +87,29 @@ static struct ftrace_ops ftrace_list_end __read_mostly = .func = ftrace_stub, }; -static struct ftrace_ops *ftrace_list __read_mostly = &ftrace_list_end; +static struct ftrace_ops *ftrace_global_list __read_mostly = &ftrace_list_end; +static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end; ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub; ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; static struct ftrace_ops global_ops; +static void +ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip); + /* - * Traverse the ftrace_list, invoking all entries. The reason that we + * Traverse the ftrace_global_list, invoking all entries. The reason that we * can use rcu_dereference_raw() is that elements removed from this list * are simply leaked, so there is no need to interact with a grace-period * mechanism. The rcu_dereference_raw() calls are needed to handle - * concurrent insertions into the ftrace_list. + * concurrent insertions into the ftrace_global_list. * * Silly Alpha and silly pointer-speculation compiler optimizations! */ -static void ftrace_list_func(unsigned long ip, unsigned long parent_ip) +static void ftrace_global_list_func(unsigned long ip, + unsigned long parent_ip) { - struct ftrace_ops *op = rcu_dereference_raw(ftrace_list); /*see above*/ + struct ftrace_ops *op = rcu_dereference_raw(ftrace_global_list); /*see above*/ while (op != &ftrace_list_end) { op->func(ip, parent_ip); @@ -163,11 +168,11 @@ static void update_global_ops(void) * function directly. Otherwise, we need to iterate over the * registered callers. */ - if (ftrace_list == &ftrace_list_end || - ftrace_list->next == &ftrace_list_end) - func = ftrace_list->func; + if (ftrace_global_list == &ftrace_list_end || + ftrace_global_list->next == &ftrace_list_end) + func = ftrace_global_list->func; else - func = ftrace_list_func; + func = ftrace_global_list_func; /* If we filter on pids, update to use the pid function */ if (!list_empty(&ftrace_pids)) { @@ -184,7 +189,11 @@ static void update_ftrace_function(void) update_global_ops(); - func = global_ops.func; + if (ftrace_ops_list == &ftrace_list_end || + ftrace_ops_list->next == &ftrace_list_end) + func = ftrace_ops_list->func; + else + func = ftrace_ops_list_func; #ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST ftrace_trace_function = func; @@ -198,10 +207,10 @@ static void add_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) { ops->next = *list; /* - * We are entering ops into the ftrace_list but another + * We are entering ops into the list but another * CPU might be walking that list. We need to make sure * the ops->next pointer is valid before another CPU sees - * the ops pointer included into the ftrace_list. + * the ops pointer included into the list. */ rcu_assign_pointer(*list, ops); } @@ -238,7 +247,18 @@ static int __register_ftrace_function(struct ftrace_ops *ops) if (FTRACE_WARN_ON(ops == &global_ops)) return -EINVAL; - add_ftrace_ops(&ftrace_list, ops); + if (WARN_ON(ops->flags & FTRACE_OPS_FL_ENABLED)) + return -EBUSY; + + if (ops->flags & FTRACE_OPS_FL_GLOBAL) { + int first = ftrace_global_list == &ftrace_list_end; + add_ftrace_ops(&ftrace_global_list, ops); + ops->flags |= FTRACE_OPS_FL_ENABLED; + if (first) + add_ftrace_ops(&ftrace_ops_list, &global_ops); + } else + add_ftrace_ops(&ftrace_ops_list, ops); + if (ftrace_enabled) update_ftrace_function(); @@ -252,12 +272,24 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) if (ftrace_disabled) return -ENODEV; + if (WARN_ON(!(ops->flags & FTRACE_OPS_FL_ENABLED))) + return -EBUSY; + if (FTRACE_WARN_ON(ops == &global_ops)) return -EINVAL; - ret = remove_ftrace_ops(&ftrace_list, ops); + if (ops->flags & FTRACE_OPS_FL_GLOBAL) { + ret = remove_ftrace_ops(&ftrace_global_list, ops); + if (!ret && ftrace_global_list == &ftrace_list_end) + ret = remove_ftrace_ops(&ftrace_ops_list, &global_ops); + if (!ret) + ops->flags &= ~FTRACE_OPS_FL_ENABLED; + } else + ret = remove_ftrace_ops(&ftrace_ops_list, ops); + if (ret < 0) return ret; + if (ftrace_enabled) update_ftrace_function(); @@ -928,10 +960,6 @@ static const struct ftrace_hash empty_hash = { }; #define EMPTY_HASH ((struct ftrace_hash *)&empty_hash) -enum { - FTRACE_OPS_FL_ENABLED = 1, -}; - static struct ftrace_ops global_ops = { .func = ftrace_stub, .notrace_hash = EMPTY_HASH, @@ -1190,6 +1218,40 @@ ftrace_hash_move(struct ftrace_hash **dst, struct ftrace_hash *src) } /* + * Test the hashes for this ops to see if we want to call + * the ops->func or not. + * + * It's a match if the ip is in the ops->filter_hash or + * the filter_hash does not exist or is empty, + * AND + * the ip is not in the ops->notrace_hash. + */ +static int +ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip) +{ + struct ftrace_hash *filter_hash; + struct ftrace_hash *notrace_hash; + int ret; + + /* The hashes are freed with call_rcu_sched() */ + preempt_disable_notrace(); + + filter_hash = rcu_dereference_raw(ops->filter_hash); + notrace_hash = rcu_dereference_raw(ops->notrace_hash); + + if ((!filter_hash || !filter_hash->count || + ftrace_lookup_ip(filter_hash, ip)) && + (!notrace_hash || !notrace_hash->count || + !ftrace_lookup_ip(notrace_hash, ip))) + ret = 1; + else + ret = 0; + preempt_enable_notrace(); + + return ret; +} + +/* * This is a double for. Do not use 'break' to break out of the loop, * you must use a goto. */ @@ -1232,7 +1294,7 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, if (filter_hash) { hash = ops->filter_hash; other_hash = ops->notrace_hash; - if (!hash->count) + if (!hash || !hash->count) all = 1; } else { inc = !inc; @@ -1242,7 +1304,7 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, * If the notrace hash has no items, * then there's nothing to do. */ - if (!hash->count) + if (hash && !hash->count) return; } @@ -1256,11 +1318,11 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, * Only the filter_hash affects all records. * Update if the record is not in the notrace hash. */ - if (!ftrace_lookup_ip(other_hash, rec->ip)) + if (!other_hash || !ftrace_lookup_ip(other_hash, rec->ip)) match = 1; } else { - in_hash = !!ftrace_lookup_ip(hash, rec->ip); - in_other_hash = !!ftrace_lookup_ip(other_hash, rec->ip); + in_hash = hash && !!ftrace_lookup_ip(hash, rec->ip); + in_other_hash = other_hash && !!ftrace_lookup_ip(other_hash, rec->ip); /* * @@ -1546,6 +1608,7 @@ static void ftrace_run_update_code(int command) static ftrace_func_t saved_ftrace_func; static int ftrace_start_up; +static int global_start_up; static void ftrace_startup_enable(int command) { @@ -1562,14 +1625,25 @@ static void ftrace_startup_enable(int command) static void ftrace_startup(struct ftrace_ops *ops, int command) { + bool hash_enable = true; + if (unlikely(ftrace_disabled)) return; ftrace_start_up++; command |= FTRACE_ENABLE_CALLS; + /* ops marked global share the filter hashes */ + if (ops->flags & FTRACE_OPS_FL_GLOBAL) { + ops = &global_ops; + /* Don't update hash if global is already set */ + if (global_start_up) + hash_enable = false; + global_start_up++; + } + ops->flags |= FTRACE_OPS_FL_ENABLED; - if (ftrace_start_up == 1) + if (hash_enable) ftrace_hash_rec_enable(ops, 1); ftrace_startup_enable(command); @@ -1577,6 +1651,8 @@ static void ftrace_startup(struct ftrace_ops *ops, int command) static void ftrace_shutdown(struct ftrace_ops *ops, int command) { + bool hash_disable = true; + if (unlikely(ftrace_disabled)) return; @@ -1588,13 +1664,25 @@ static void ftrace_shutdown(struct ftrace_ops *ops, int command) */ WARN_ON_ONCE(ftrace_start_up < 0); - if (!ftrace_start_up) + if (ops->flags & FTRACE_OPS_FL_GLOBAL) { + ops = &global_ops; + global_start_up--; + WARN_ON_ONCE(global_start_up < 0); + /* Don't update hash if global still has users */ + if (global_start_up) { + WARN_ON_ONCE(!ftrace_start_up); + hash_disable = false; + } + } + + if (hash_disable) ftrace_hash_rec_disable(ops, 1); - if (!ftrace_start_up) { - command |= FTRACE_DISABLE_CALLS; + if (ops != &global_ops || !global_start_up) ops->flags &= ~FTRACE_OPS_FL_ENABLED; - } + + if (!ftrace_start_up) + command |= FTRACE_DISABLE_CALLS; if (saved_ftrace_func != ftrace_trace_function) { saved_ftrace_func = ftrace_trace_function; @@ -2381,6 +2469,7 @@ static int ftrace_probe_registered; static void __enable_ftrace_function_probe(void) { + int ret; int i; if (ftrace_probe_registered) @@ -2395,13 +2484,16 @@ static void __enable_ftrace_function_probe(void) if (i == FTRACE_FUNC_HASHSIZE) return; - __register_ftrace_function(&trace_probe_ops); - ftrace_startup(&global_ops, 0); + ret = __register_ftrace_function(&trace_probe_ops); + if (!ret) + ftrace_startup(&trace_probe_ops, 0); + ftrace_probe_registered = 1; } static void __disable_ftrace_function_probe(void) { + int ret; int i; if (!ftrace_probe_registered) @@ -2414,8 +2506,10 @@ static void __disable_ftrace_function_probe(void) } /* no more funcs left */ - __unregister_ftrace_function(&trace_probe_ops); - ftrace_shutdown(&global_ops, 0); + ret = __unregister_ftrace_function(&trace_probe_ops); + if (!ret) + ftrace_shutdown(&trace_probe_ops, 0); + ftrace_probe_registered = 0; } @@ -3319,8 +3413,28 @@ static inline void ftrace_startup_enable(int command) { } # define ftrace_shutdown(ops, command) do { } while (0) # define ftrace_startup_sysctl() do { } while (0) # define ftrace_shutdown_sysctl() do { } while (0) + +static inline int +ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip) +{ + return 1; +} + #endif /* CONFIG_DYNAMIC_FTRACE */ +static void +ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip) +{ + /* see comment above ftrace_global_list_func */ + struct ftrace_ops *op = rcu_dereference_raw(ftrace_ops_list); + + while (op != &ftrace_list_end) { + if (ftrace_ops_test(op, ip)) + op->func(ip, parent_ip); + op = rcu_dereference_raw(op->next); + }; +} + static void clear_ftrace_swapper(void) { struct task_struct *p; @@ -3621,7 +3735,9 @@ int register_ftrace_function(struct ftrace_ops *ops) goto out_unlock; ret = __register_ftrace_function(ops); - ftrace_startup(&global_ops, 0); + if (!ret) + ftrace_startup(ops, 0); + out_unlock: mutex_unlock(&ftrace_lock); @@ -3640,7 +3756,8 @@ int unregister_ftrace_function(struct ftrace_ops *ops) mutex_lock(&ftrace_lock); ret = __unregister_ftrace_function(ops); - ftrace_shutdown(&global_ops, 0); + if (!ret) + ftrace_shutdown(ops, 0); mutex_unlock(&ftrace_lock); return ret; @@ -3670,11 +3787,11 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, ftrace_startup_sysctl(); /* we are starting ftrace again */ - if (ftrace_list != &ftrace_list_end) { - if (ftrace_list->next == &ftrace_list_end) - ftrace_trace_function = ftrace_list->func; + if (ftrace_ops_list != &ftrace_list_end) { + if (ftrace_ops_list->next == &ftrace_list_end) + ftrace_trace_function = ftrace_ops_list->func; else - ftrace_trace_function = ftrace_list_func; + ftrace_trace_function = ftrace_ops_list_func; } } else { diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 16aee4d..8d0e1cc 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -149,11 +149,13 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip) static struct ftrace_ops trace_ops __read_mostly = { .func = function_trace_call, + .flags = FTRACE_OPS_FL_GLOBAL, }; static struct ftrace_ops trace_stack_ops __read_mostly = { .func = function_stack_trace_call, + .flags = FTRACE_OPS_FL_GLOBAL, }; /* Our two options */ diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index a4969b4..c77424b 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -153,6 +153,7 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip) static struct ftrace_ops trace_ops __read_mostly = { .func = irqsoff_tracer_call, + .flags = FTRACE_OPS_FL_GLOBAL, }; #endif /* CONFIG_FUNCTION_TRACER */ diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 7319559..f029dd4 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -129,6 +129,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) static struct ftrace_ops trace_ops __read_mostly = { .func = wakeup_tracer_call, + .flags = FTRACE_OPS_FL_GLOBAL, }; #endif /* CONFIG_FUNCTION_TRACER */ diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 4c5dead..b0b53b8 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -133,6 +133,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip) static struct ftrace_ops trace_ops __read_mostly = { .func = stack_trace_call, + .flags = FTRACE_OPS_FL_GLOBAL, }; static ssize_t -- cgit v1.1 From cdbe61bfe70440939e457fb4a8d0995eaaed17de Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 5 May 2011 21:14:55 -0400 Subject: ftrace: Allow dynamically allocated function tracers Now that functions may be selected individually, it only makes sense that we should allow dynamically allocated trace structures to be traced. This will allow perf to allocate a ftrace_ops structure at runtime and use it to pick and choose which functions that structure will trace. Note, a dynamically allocated ftrace_ops will always be called indirectly instead of being called directly from the mcount in entry.S. This is because there's no safe way to prevent mcount from being preempted before calling the function, unless we modify every entry.S to do so (not likely). Thus, dynamically allocated functions will now be called by the ftrace_ops_list_func() that loops through the ops that are allocated if there are more than one op allocated at a time. This loop is protected with a preempt_disable. To determine if an ftrace_ops structure is allocated or not, a new util function was added to the kernel/extable.c called core_kernel_data(), which returns 1 if the address is between _sdata and _edata. Cc: Paul E. McKenney Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 1 + include/linux/kernel.h | 1 + kernel/extable.c | 8 ++++++++ kernel/trace/ftrace.c | 37 ++++++++++++++++++++++++++++++------- 4 files changed, 40 insertions(+), 7 deletions(-) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 4609c0e..caba694 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -34,6 +34,7 @@ struct ftrace_hash; enum { FTRACE_OPS_FL_ENABLED = 1 << 0, FTRACE_OPS_FL_GLOBAL = 1 << 1, + FTRACE_OPS_FL_DYNAMIC = 1 << 2, }; struct ftrace_ops { diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 00cec4d..f37ba71 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -283,6 +283,7 @@ extern char *get_options(const char *str, int nints, int *ints); extern unsigned long long memparse(const char *ptr, char **retptr); extern int core_kernel_text(unsigned long addr); +extern int core_kernel_data(unsigned long addr); extern int __kernel_text_address(unsigned long addr); extern int kernel_text_address(unsigned long addr); extern int func_ptr_is_kernel_text(void *ptr); diff --git a/kernel/extable.c b/kernel/extable.c index 7f8f263..c2d625f 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -72,6 +72,14 @@ int core_kernel_text(unsigned long addr) return 0; } +int core_kernel_data(unsigned long addr) +{ + if (addr >= (unsigned long)_sdata && + addr < (unsigned long)_edata) + return 1; + return 0; +} + int __kernel_text_address(unsigned long addr) { if (core_kernel_text(addr)) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 6c7e1df..5b3ee04 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -189,8 +189,14 @@ static void update_ftrace_function(void) update_global_ops(); + /* + * If we are at the end of the list and this ops is + * not dynamic, then have the mcount trampoline call + * the function directly + */ if (ftrace_ops_list == &ftrace_list_end || - ftrace_ops_list->next == &ftrace_list_end) + (ftrace_ops_list->next == &ftrace_list_end && + !(ftrace_ops_list->flags & FTRACE_OPS_FL_DYNAMIC))) func = ftrace_ops_list->func; else func = ftrace_ops_list_func; @@ -250,6 +256,9 @@ static int __register_ftrace_function(struct ftrace_ops *ops) if (WARN_ON(ops->flags & FTRACE_OPS_FL_ENABLED)) return -EBUSY; + if (!core_kernel_data((unsigned long)ops)) + ops->flags |= FTRACE_OPS_FL_DYNAMIC; + if (ops->flags & FTRACE_OPS_FL_GLOBAL) { int first = ftrace_global_list == &ftrace_list_end; add_ftrace_ops(&ftrace_global_list, ops); @@ -293,6 +302,13 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) if (ftrace_enabled) update_ftrace_function(); + /* + * Dynamic ops may be freed, we must make sure that all + * callers are done before leaving this function. + */ + if (ops->flags & FTRACE_OPS_FL_DYNAMIC) + synchronize_sched(); + return 0; } @@ -1225,6 +1241,9 @@ ftrace_hash_move(struct ftrace_hash **dst, struct ftrace_hash *src) * the filter_hash does not exist or is empty, * AND * the ip is not in the ops->notrace_hash. + * + * This needs to be called with preemption disabled as + * the hashes are freed with call_rcu_sched(). */ static int ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip) @@ -1233,9 +1252,6 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip) struct ftrace_hash *notrace_hash; int ret; - /* The hashes are freed with call_rcu_sched() */ - preempt_disable_notrace(); - filter_hash = rcu_dereference_raw(ops->filter_hash); notrace_hash = rcu_dereference_raw(ops->notrace_hash); @@ -1246,7 +1262,6 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip) ret = 1; else ret = 0; - preempt_enable_notrace(); return ret; } @@ -3425,14 +3440,20 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip) static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip) { - /* see comment above ftrace_global_list_func */ - struct ftrace_ops *op = rcu_dereference_raw(ftrace_ops_list); + struct ftrace_ops *op; + /* + * Some of the ops may be dynamically allocated, + * they must be freed after a synchronize_sched(). + */ + preempt_disable_notrace(); + op = rcu_dereference_raw(ftrace_ops_list); while (op != &ftrace_list_end) { if (ftrace_ops_test(op, ip)) op->func(ip, parent_ip); op = rcu_dereference_raw(op->next); }; + preempt_enable_notrace(); } static void clear_ftrace_swapper(void) @@ -3743,6 +3764,7 @@ int register_ftrace_function(struct ftrace_ops *ops) mutex_unlock(&ftrace_lock); return ret; } +EXPORT_SYMBOL_GPL(register_ftrace_function); /** * unregister_ftrace_function - unregister a function for profiling. @@ -3762,6 +3784,7 @@ int unregister_ftrace_function(struct ftrace_ops *ops) return ret; } +EXPORT_SYMBOL_GPL(unregister_ftrace_function); int ftrace_enable_sysctl(struct ctl_table *table, int write, -- cgit v1.1 From 936e074b286ae779f134312178dbab139ee7ea52 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 5 May 2011 22:54:01 -0400 Subject: ftrace: Modify ftrace_set_filter/notrace to take ops Since users of the function tracer can now pick and choose which functions they want to trace agnostically from other users of the function tracer, we need to pass the ops struct to the ftrace_set_filter() functions. The functions ftrace_set_global_filter() and ftrace_set_global_notrace() is added to keep the old filter functions which are used to modify the generic function tracers. Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 7 ++++++- kernel/trace/ftrace.c | 46 +++++++++++++++++++++++++++++++++++++++++-- kernel/trace/trace_selftest.c | 4 ++-- 3 files changed, 52 insertions(+), 5 deletions(-) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index caba694..9d88e1c 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -179,7 +179,12 @@ struct dyn_ftrace { }; int ftrace_force_update(void); -void ftrace_set_filter(unsigned char *buf, int len, int reset); +void ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf, + int len, int reset); +void ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf, + int len, int reset); +void ftrace_set_global_filter(unsigned char *buf, int len, int reset); +void ftrace_set_global_notrace(unsigned char *buf, int len, int reset); int register_ftrace_command(struct ftrace_func_command *cmd); int unregister_ftrace_command(struct ftrace_func_command *cmd); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 5b3ee04..d017c2c 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2826,6 +2826,10 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, struct ftrace_hash *hash; int ret; + /* All global ops uses the global ops filters */ + if (ops->flags & FTRACE_OPS_FL_GLOBAL) + ops = &global_ops; + if (unlikely(ftrace_disabled)) return -ENODEV; @@ -2856,6 +2860,41 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, /** * ftrace_set_filter - set a function to filter on in ftrace + * @ops - the ops to set the filter with + * @buf - the string that holds the function filter text. + * @len - the length of the string. + * @reset - non zero to reset all filters before applying this filter. + * + * Filters denote which functions should be enabled when tracing is enabled. + * If @buf is NULL and reset is set, all functions will be enabled for tracing. + */ +void ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf, + int len, int reset) +{ + ftrace_set_regex(ops, buf, len, reset, 1); +} +EXPORT_SYMBOL_GPL(ftrace_set_filter); + +/** + * ftrace_set_notrace - set a function to not trace in ftrace + * @ops - the ops to set the notrace filter with + * @buf - the string that holds the function notrace text. + * @len - the length of the string. + * @reset - non zero to reset all filters before applying this filter. + * + * Notrace Filters denote which functions should not be enabled when tracing + * is enabled. If @buf is NULL and reset is set, all functions will be enabled + * for tracing. + */ +void ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf, + int len, int reset) +{ + ftrace_set_regex(ops, buf, len, reset, 0); +} +EXPORT_SYMBOL_GPL(ftrace_set_notrace); +/** + * ftrace_set_filter - set a function to filter on in ftrace + * @ops - the ops to set the filter with * @buf - the string that holds the function filter text. * @len - the length of the string. * @reset - non zero to reset all filters before applying this filter. @@ -2863,13 +2902,15 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, * Filters denote which functions should be enabled when tracing is enabled. * If @buf is NULL and reset is set, all functions will be enabled for tracing. */ -void ftrace_set_filter(unsigned char *buf, int len, int reset) +void ftrace_set_global_filter(unsigned char *buf, int len, int reset) { ftrace_set_regex(&global_ops, buf, len, reset, 1); } +EXPORT_SYMBOL_GPL(ftrace_set_global_filter); /** * ftrace_set_notrace - set a function to not trace in ftrace + * @ops - the ops to set the notrace filter with * @buf - the string that holds the function notrace text. * @len - the length of the string. * @reset - non zero to reset all filters before applying this filter. @@ -2878,10 +2919,11 @@ void ftrace_set_filter(unsigned char *buf, int len, int reset) * is enabled. If @buf is NULL and reset is set, all functions will be enabled * for tracing. */ -void ftrace_set_notrace(unsigned char *buf, int len, int reset) +void ftrace_set_global_notrace(unsigned char *buf, int len, int reset) { ftrace_set_regex(&global_ops, buf, len, reset, 0); } +EXPORT_SYMBOL_GPL(ftrace_set_global_notrace); /* * command line interface to allow users to set filters on boot up. diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 659732e..0fa2db3 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -131,7 +131,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, func_name = "*" __stringify(DYN_FTRACE_TEST_NAME); /* filter only on our function */ - ftrace_set_filter(func_name, strlen(func_name), 1); + ftrace_set_global_filter(func_name, strlen(func_name), 1); /* enable tracing */ ret = tracer_init(trace, tr); @@ -181,7 +181,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, tracer_enabled = save_tracer_enabled; /* Enable tracing on all functions again */ - ftrace_set_filter(NULL, 0, 1); + ftrace_set_global_filter(NULL, 0, 1); return ret; } -- cgit v1.1 From 95950c2ecb31314ef827428e43ff771cf3b037e5 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 6 May 2011 00:08:51 -0400 Subject: ftrace: Add self-tests for multiple function trace users Add some basic sanity tests for multiple users of the function tracer at startup. Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 2 + kernel/trace/trace_selftest.c | 210 +++++++++++++++++++++++++++++++++- kernel/trace/trace_selftest_dynamic.c | 6 + 3 files changed, 217 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 5e9dfc6..6b69c4b 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -419,6 +419,8 @@ extern void trace_find_cmdline(int pid, char comm[]); extern unsigned long ftrace_update_tot_cnt; #define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func extern int DYN_FTRACE_TEST_NAME(void); +#define DYN_FTRACE_TEST_NAME2 trace_selftest_dynamic_test_func2 +extern int DYN_FTRACE_TEST_NAME2(void); #endif extern int ring_buffer_expanded; diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 0fa2db3..288541f 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -101,6 +101,206 @@ static inline void warn_failed_init_tracer(struct tracer *trace, int init_ret) #ifdef CONFIG_DYNAMIC_FTRACE +static int trace_selftest_test_probe1_cnt; +static void trace_selftest_test_probe1_func(unsigned long ip, + unsigned long pip) +{ + trace_selftest_test_probe1_cnt++; +} + +static int trace_selftest_test_probe2_cnt; +static void trace_selftest_test_probe2_func(unsigned long ip, + unsigned long pip) +{ + trace_selftest_test_probe2_cnt++; +} + +static int trace_selftest_test_probe3_cnt; +static void trace_selftest_test_probe3_func(unsigned long ip, + unsigned long pip) +{ + trace_selftest_test_probe3_cnt++; +} + +static int trace_selftest_test_global_cnt; +static void trace_selftest_test_global_func(unsigned long ip, + unsigned long pip) +{ + trace_selftest_test_global_cnt++; +} + +static int trace_selftest_test_dyn_cnt; +static void trace_selftest_test_dyn_func(unsigned long ip, + unsigned long pip) +{ + trace_selftest_test_dyn_cnt++; +} + +static struct ftrace_ops test_probe1 = { + .func = trace_selftest_test_probe1_func, +}; + +static struct ftrace_ops test_probe2 = { + .func = trace_selftest_test_probe2_func, +}; + +static struct ftrace_ops test_probe3 = { + .func = trace_selftest_test_probe3_func, +}; + +static struct ftrace_ops test_global = { + .func = trace_selftest_test_global_func, + .flags = FTRACE_OPS_FL_GLOBAL, +}; + +static void print_counts(void) +{ + printk("(%d %d %d %d %d) ", + trace_selftest_test_probe1_cnt, + trace_selftest_test_probe2_cnt, + trace_selftest_test_probe3_cnt, + trace_selftest_test_global_cnt, + trace_selftest_test_dyn_cnt); +} + +static void reset_counts(void) +{ + trace_selftest_test_probe1_cnt = 0; + trace_selftest_test_probe2_cnt = 0; + trace_selftest_test_probe3_cnt = 0; + trace_selftest_test_global_cnt = 0; + trace_selftest_test_dyn_cnt = 0; +} + +static int trace_selftest_ops(int cnt) +{ + int save_ftrace_enabled = ftrace_enabled; + struct ftrace_ops *dyn_ops; + char *func1_name; + char *func2_name; + int len1; + int len2; + int ret = -1; + + printk(KERN_CONT "PASSED\n"); + pr_info("Testing dynamic ftrace ops #%d: ", cnt); + + ftrace_enabled = 1; + reset_counts(); + + /* Handle PPC64 '.' name */ + func1_name = "*" __stringify(DYN_FTRACE_TEST_NAME); + func2_name = "*" __stringify(DYN_FTRACE_TEST_NAME2); + len1 = strlen(func1_name); + len2 = strlen(func2_name); + + /* + * Probe 1 will trace function 1. + * Probe 2 will trace function 2. + * Probe 3 will trace functions 1 and 2. + */ + ftrace_set_filter(&test_probe1, func1_name, len1, 1); + ftrace_set_filter(&test_probe2, func2_name, len2, 1); + ftrace_set_filter(&test_probe3, func1_name, len1, 1); + ftrace_set_filter(&test_probe3, func2_name, len2, 0); + + register_ftrace_function(&test_probe1); + register_ftrace_function(&test_probe2); + register_ftrace_function(&test_probe3); + register_ftrace_function(&test_global); + + DYN_FTRACE_TEST_NAME(); + + print_counts(); + + if (trace_selftest_test_probe1_cnt != 1) + goto out; + if (trace_selftest_test_probe2_cnt != 0) + goto out; + if (trace_selftest_test_probe3_cnt != 1) + goto out; + if (trace_selftest_test_global_cnt == 0) + goto out; + + DYN_FTRACE_TEST_NAME2(); + + print_counts(); + + if (trace_selftest_test_probe1_cnt != 1) + goto out; + if (trace_selftest_test_probe2_cnt != 1) + goto out; + if (trace_selftest_test_probe3_cnt != 2) + goto out; + + /* Add a dynamic probe */ + dyn_ops = kzalloc(sizeof(*dyn_ops), GFP_KERNEL); + if (!dyn_ops) { + printk("MEMORY ERROR "); + goto out; + } + + dyn_ops->func = trace_selftest_test_dyn_func; + + register_ftrace_function(dyn_ops); + + trace_selftest_test_global_cnt = 0; + + DYN_FTRACE_TEST_NAME(); + + print_counts(); + + if (trace_selftest_test_probe1_cnt != 2) + goto out_free; + if (trace_selftest_test_probe2_cnt != 1) + goto out_free; + if (trace_selftest_test_probe3_cnt != 3) + goto out_free; + if (trace_selftest_test_global_cnt == 0) + goto out; + if (trace_selftest_test_dyn_cnt == 0) + goto out_free; + + DYN_FTRACE_TEST_NAME2(); + + print_counts(); + + if (trace_selftest_test_probe1_cnt != 2) + goto out_free; + if (trace_selftest_test_probe2_cnt != 2) + goto out_free; + if (trace_selftest_test_probe3_cnt != 4) + goto out_free; + + ret = 0; + out_free: + unregister_ftrace_function(dyn_ops); + kfree(dyn_ops); + + out: + /* Purposely unregister in the same order */ + unregister_ftrace_function(&test_probe1); + unregister_ftrace_function(&test_probe2); + unregister_ftrace_function(&test_probe3); + unregister_ftrace_function(&test_global); + + /* Make sure everything is off */ + reset_counts(); + DYN_FTRACE_TEST_NAME(); + DYN_FTRACE_TEST_NAME(); + + if (trace_selftest_test_probe1_cnt || + trace_selftest_test_probe2_cnt || + trace_selftest_test_probe3_cnt || + trace_selftest_test_global_cnt || + trace_selftest_test_dyn_cnt) + ret = -1; + + ftrace_enabled = save_ftrace_enabled; + + return ret; +} + /* Test dynamic code modification and ftrace filters */ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, struct trace_array *tr, @@ -166,16 +366,20 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, /* check the trace buffer */ ret = trace_test_buffer(tr, &count); - trace->reset(tr); tracing_start(); /* we should only have one item */ if (!ret && count != 1) { + trace->reset(tr); printk(KERN_CONT ".. filter failed count=%ld ..", count); ret = -1; goto out; } + /* Test the ops with global tracing running */ + ret = trace_selftest_ops(1); + trace->reset(tr); + out: ftrace_enabled = save_ftrace_enabled; tracer_enabled = save_tracer_enabled; @@ -183,6 +387,10 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, /* Enable tracing on all functions again */ ftrace_set_global_filter(NULL, 0, 1); + /* Test the ops with global tracing off */ + if (!ret) + ret = trace_selftest_ops(2); + return ret; } #else diff --git a/kernel/trace/trace_selftest_dynamic.c b/kernel/trace/trace_selftest_dynamic.c index 54dd77c..b4c475a 100644 --- a/kernel/trace/trace_selftest_dynamic.c +++ b/kernel/trace/trace_selftest_dynamic.c @@ -5,3 +5,9 @@ int DYN_FTRACE_TEST_NAME(void) /* used to call mcount */ return 0; } + +int DYN_FTRACE_TEST_NAME2(void) +{ + /* used to call mcount */ + return 0; +} -- cgit v1.1 From 61c4f2c81c61f73549928dfd9f3e8f26aa36a8cf Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 18 May 2011 21:06:34 -0700 Subject: Linux 2.6.39 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 41ea6fb..123d858 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ VERSION = 2 PATCHLEVEL = 6 SUBLEVEL = 39 -EXTRAVERSION = -rc7 +EXTRAVERSION = NAME = Flesh-Eating Bats with Fangs # *DOCUMENTATION* -- cgit v1.1 From b4bc842802db3314f9a657094da0450a903ea619 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Mon, 7 Feb 2011 16:02:25 -0800 Subject: module: deal with alignment issues in built-in module versions On m68k natural alignment is 2-byte boundary but we are trying to align structures in __modver section on sizeof(void *) boundary. This causes trouble when we try to access elements in this section in array-like fashion when create "version" attributes for built-in modules. Moreover, as DaveM said, we can't reliably put structures into independent objects, put them into a special section, and then expect array access over them (via the section boundaries) after linking the objects together to just "work" due to variable alignment choices in different situations. The only solution that seems to work reliably is to make an array of plain pointers to the objects in question and put those pointers in the special section. Reported-by: Geert Uytterhoeven Signed-off-by: Dmitry Torokhov Signed-off-by: Rusty Russell --- include/linux/module.h | 10 +++++----- kernel/params.c | 9 ++++++--- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/include/linux/module.h b/include/linux/module.h index 5de4204..4cfebc2 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -174,10 +174,7 @@ extern struct module __this_module; #define MODULE_VERSION(_version) \ extern ssize_t __modver_version_show(struct module_attribute *, \ struct module *, char *); \ - static struct module_version_attribute __modver_version_attr \ - __used \ - __attribute__ ((__section__ ("__modver"),aligned(sizeof(void *)))) \ - = { \ + static struct module_version_attribute ___modver_attr = { \ .mattr = { \ .attr = { \ .name = "version", \ @@ -187,7 +184,10 @@ extern struct module __this_module; }, \ .module_name = KBUILD_MODNAME, \ .version = _version, \ - } + }; \ + static const struct module_version_attribute \ + __used __attribute__ ((__section__ ("__modver"))) \ + * __moduleparam_const __modver_attr = &___modver_attr #endif /* Optional firmware file (or files) needed by the module diff --git a/kernel/params.c b/kernel/params.c index 7ab388a..28c5d5c 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -821,15 +821,18 @@ ssize_t __modver_version_show(struct module_attribute *mattr, return sprintf(buf, "%s\n", vattr->version); } -extern struct module_version_attribute __start___modver[], __stop___modver[]; +extern const struct module_version_attribute *__start___modver[]; +extern const struct module_version_attribute *__stop___modver[]; static void __init version_sysfs_builtin(void) { - const struct module_version_attribute *vattr; + const struct module_version_attribute **p; struct module_kobject *mk; int err; - for (vattr = __start___modver; vattr < __stop___modver; vattr++) { + for (p = __start___modver; p < __stop___modver; p++) { + const struct module_version_attribute *vattr = *p; + mk = locate_module_kobject(vattr->module_name); if (mk) { err = sysfs_create_file(&mk->kobj, &vattr->mattr.attr); -- cgit v1.1 From 9b73a5840c7d5f77e5766626716df13787cb258c Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Mon, 7 Feb 2011 16:02:27 -0800 Subject: module: do not hide __modver_version_show declaration behind ifdef Doing so prevents the following warning from sparse: CHECK kernel/params.c kernel/params.c:817:9: warning: symbol '__modver_version_show' was not declared. Should it be static? since kernel/params.c is never compiled with MODULE being set. Signed-off-by: Dmitry Torokhov Signed-off-by: Rusty Russell --- include/linux/module.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/include/linux/module.h b/include/linux/module.h index 4cfebc2..23996ad 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -64,6 +64,9 @@ struct module_version_attribute { const char *version; } __attribute__ ((__aligned__(sizeof(void *)))); +extern ssize_t __modver_version_show(struct module_attribute *, + struct module *, char *); + struct module_kobject { struct kobject kobj; @@ -172,8 +175,6 @@ extern struct module __this_module; #define MODULE_VERSION(_version) MODULE_INFO(version, _version) #else #define MODULE_VERSION(_version) \ - extern ssize_t __modver_version_show(struct module_attribute *, \ - struct module *, char *); \ static struct module_version_attribute ___modver_attr = { \ .mattr = { \ .attr = { \ -- cgit v1.1 From a288bd651f4180c224cfddf837a0416157a36661 Mon Sep 17 00:00:00 2001 From: Richard Kennedy Date: Thu, 19 May 2011 16:55:25 -0600 Subject: module: remove 64 bit alignment padding from struct module with CONFIG_TRACE* Reorder struct module to remove 24 bytes of alignment padding on 64 bit builds when the CONFIG_TRACE options are selected. This allows the structure to fit into one fewer cache lines, and its size drops from 592 to 568 on x86_64. Signed-off-by: Richard Kennedy Signed-off-by: Rusty Russell --- include/linux/module.h | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/include/linux/module.h b/include/linux/module.h index 23996ad..65cc6cc 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -368,34 +368,35 @@ struct module struct module_notes_attrs *notes_attrs; #endif + /* The command line arguments (may be mangled). People like + keeping pointers to this stuff */ + char *args; + #ifdef CONFIG_SMP /* Per-cpu data. */ void __percpu *percpu; unsigned int percpu_size; #endif - /* The command line arguments (may be mangled). People like - keeping pointers to this stuff */ - char *args; #ifdef CONFIG_TRACEPOINTS - struct tracepoint * const *tracepoints_ptrs; unsigned int num_tracepoints; + struct tracepoint * const *tracepoints_ptrs; #endif #ifdef HAVE_JUMP_LABEL struct jump_entry *jump_entries; unsigned int num_jump_entries; #endif #ifdef CONFIG_TRACING - const char **trace_bprintk_fmt_start; unsigned int num_trace_bprintk_fmt; + const char **trace_bprintk_fmt_start; #endif #ifdef CONFIG_EVENT_TRACING struct ftrace_event_call **trace_events; unsigned int num_trace_events; #endif #ifdef CONFIG_FTRACE_MCOUNT_RECORD - unsigned long *ftrace_callsites; unsigned int num_ftrace_callsites; + unsigned long *ftrace_callsites; #endif #ifdef CONFIG_MODULE_UNLOAD -- cgit v1.1 From c5be0b2eb1ca05e0cd747f9c0ba552c6ee8827a0 Mon Sep 17 00:00:00 2001 From: Richard Kennedy Date: Thu, 19 May 2011 16:55:25 -0600 Subject: module: reorder kparam_array to remove alignment padding on 64 bit builds Reorder structure kparam_array to remove 8 bytes of alignment padding on 64 bit builds, dropping its size from 40 to 32 bytes. Also update the macro module_param_array_named to initialise the structure using its member names to allow it to be changed without touching all its call sites. 'git grep' finds module_param_array in 1037 places so this patch will save a small amount of data space across many modules. Signed-off-by: Richard Kennedy Signed-off-by: Rusty Russell --- include/linux/moduleparam.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h index 07b4195..ddaae98 100644 --- a/include/linux/moduleparam.h +++ b/include/linux/moduleparam.h @@ -67,9 +67,9 @@ struct kparam_string { struct kparam_array { unsigned int max; + unsigned int elemsize; unsigned int *num; const struct kernel_param_ops *ops; - unsigned int elemsize; void *elem; }; @@ -371,8 +371,9 @@ extern int param_get_invbool(char *buffer, const struct kernel_param *kp); */ #define module_param_array_named(name, array, type, nump, perm) \ static const struct kparam_array __param_arr_##name \ - = { ARRAY_SIZE(array), nump, ¶m_ops_##type, \ - sizeof(array[0]), array }; \ + = { .max = ARRAY_SIZE(array), .num = nump, \ + .ops = ¶m_ops_##type, \ + .elemsize = sizeof(array[0]), .elem = array }; \ __module_param_call(MODULE_PARAM_PREFIX, name, \ ¶m_array_ops, \ .arr = &__param_arr_##name, \ -- cgit v1.1 From 5d05c70849f760ac8f4ed3ebfeefb92689858834 Mon Sep 17 00:00:00 2001 From: Daniel J Blueman Date: Tue, 8 Mar 2011 22:01:47 +0800 Subject: minor ANSI prototype sparse fix Fix function prototype to be ANSI-C compliant, consistent with other function prototypes, addressing a sparse warning. Signed-off-by: Daniel J Blueman Signed-off-by: Rusty Russell --- kernel/module.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/module.c b/kernel/module.c index d5938a5..523c40b71 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1627,7 +1627,7 @@ void unset_section_ro_nx(struct module *mod, void *module_region) } /* Iterate through all modules and set each module's text as RW */ -void set_all_modules_text_rw() +void set_all_modules_text_rw(void) { struct module *mod; @@ -1648,7 +1648,7 @@ void set_all_modules_text_rw() } /* Iterate through all modules and set each module's text as RO */ -void set_all_modules_text_ro() +void set_all_modules_text_ro(void) { struct module *mod; -- cgit v1.1 From 4d10380e720a3ce19dbe88d0133f66ded07b6a8f Mon Sep 17 00:00:00 2001 From: Jan Glauber Date: Thu, 19 May 2011 16:55:25 -0600 Subject: module: zero mod->init_ro_size after init is freed. Reset mod->init_ro_size to zero after the init part of a module is unloaded. Otherwise we need to check if module->init is NULL in the unprotect functions in the next patch. Signed-off-by: Jan Glauber Signed-off-by: Rusty Russell --- kernel/module.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/module.c b/kernel/module.c index 523c40b71..92112c9 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2935,6 +2935,7 @@ SYSCALL_DEFINE3(init_module, void __user *, umod, module_free(mod, mod->module_init); mod->module_init = NULL; mod->init_size = 0; + mod->init_ro_size = 0; mod->init_text_size = 0; mutex_unlock(&module_mutex); -- cgit v1.1 From 448694a1d50432be63aafccb42d6f54d8cf3d02c Mon Sep 17 00:00:00 2001 From: Jan Glauber Date: Thu, 19 May 2011 16:55:26 -0600 Subject: module: undo module RONX protection correctly. While debugging I stumbled over two problems in the code that protects module pages. First issue is that disabling the protection before freeing init or unload of a module is not symmetric with the enablement. For instance, if pages are set to RO the page range from module_core to module_core + core_ro_size is protected. If a module is unloaded the page range from module_core to module_core + core_size is set back to RW. So pages that were not set to RO are also changed to RW. This is not critical but IMHO it should be symmetric. Second issue is that while set_memory_rw & set_memory_ro are used for RO/RW changes only set_memory_nx is involved for NX/X. One would await that the inverse function is called when the NX protection should be removed, which is not the case here, unless I'm missing something. Signed-off-by: Jan Glauber Signed-off-by: Rusty Russell --- arch/s390/include/asm/cacheflush.h | 1 + arch/s390/mm/pageattr.c | 5 +++++ kernel/module.c | 25 +++++++++++++------------ 3 files changed, 19 insertions(+), 12 deletions(-) diff --git a/arch/s390/include/asm/cacheflush.h b/arch/s390/include/asm/cacheflush.h index 43a5c78..3e20383 100644 --- a/arch/s390/include/asm/cacheflush.h +++ b/arch/s390/include/asm/cacheflush.h @@ -11,5 +11,6 @@ void kernel_map_pages(struct page *page, int numpages, int enable); int set_memory_ro(unsigned long addr, int numpages); int set_memory_rw(unsigned long addr, int numpages); int set_memory_nx(unsigned long addr, int numpages); +int set_memory_x(unsigned long addr, int numpages); #endif /* _S390_CACHEFLUSH_H */ diff --git a/arch/s390/mm/pageattr.c b/arch/s390/mm/pageattr.c index 0607e4b..f05edcc 100644 --- a/arch/s390/mm/pageattr.c +++ b/arch/s390/mm/pageattr.c @@ -54,3 +54,8 @@ int set_memory_nx(unsigned long addr, int numpages) return 0; } EXPORT_SYMBOL_GPL(set_memory_nx); + +int set_memory_x(unsigned long addr, int numpages) +{ + return 0; +} diff --git a/kernel/module.c b/kernel/module.c index 92112c9..b99dceb 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1607,22 +1607,23 @@ static void set_section_ro_nx(void *base, } } -/* Setting memory back to RW+NX before releasing it */ +/* Setting memory back to W+X before releasing it */ void unset_section_ro_nx(struct module *mod, void *module_region) { - unsigned long total_pages; - if (mod->module_core == module_region) { - /* Set core as NX+RW */ - total_pages = MOD_NUMBER_OF_PAGES(mod->module_core, mod->core_size); - set_memory_nx((unsigned long)mod->module_core, total_pages); - set_memory_rw((unsigned long)mod->module_core, total_pages); - + set_page_attributes(mod->module_core + mod->core_text_size, + mod->module_core + mod->core_size, + set_memory_x); + set_page_attributes(mod->module_core, + mod->module_core + mod->core_ro_size, + set_memory_rw); } else if (mod->module_init == module_region) { - /* Set init as NX+RW */ - total_pages = MOD_NUMBER_OF_PAGES(mod->module_init, mod->init_size); - set_memory_nx((unsigned long)mod->module_init, total_pages); - set_memory_rw((unsigned long)mod->module_init, total_pages); + set_page_attributes(mod->module_init + mod->init_text_size, + mod->module_init + mod->init_size, + set_memory_x); + set_page_attributes(mod->module_init, + mod->module_init + mod->init_ro_size, + set_memory_rw); } } -- cgit v1.1 From 01526ed0830643bd53a8434c3068e4c077e1b09d Mon Sep 17 00:00:00 2001 From: Jan Glauber Date: Thu, 19 May 2011 16:55:26 -0600 Subject: module: split unset_section_ro_nx function. Split the unprotect function into a function per section to make the code more readable and add the missing static declaration. Signed-off-by: Jan Glauber Signed-off-by: Rusty Russell --- kernel/module.c | 45 +++++++++++++++++++++++---------------------- 1 file changed, 23 insertions(+), 22 deletions(-) diff --git a/kernel/module.c b/kernel/module.c index b99dceb..0e6f97f 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1607,24 +1607,24 @@ static void set_section_ro_nx(void *base, } } -/* Setting memory back to W+X before releasing it */ -void unset_section_ro_nx(struct module *mod, void *module_region) -{ - if (mod->module_core == module_region) { - set_page_attributes(mod->module_core + mod->core_text_size, - mod->module_core + mod->core_size, - set_memory_x); - set_page_attributes(mod->module_core, - mod->module_core + mod->core_ro_size, - set_memory_rw); - } else if (mod->module_init == module_region) { - set_page_attributes(mod->module_init + mod->init_text_size, - mod->module_init + mod->init_size, - set_memory_x); - set_page_attributes(mod->module_init, - mod->module_init + mod->init_ro_size, - set_memory_rw); - } +static void unset_module_core_ro_nx(struct module *mod) +{ + set_page_attributes(mod->module_core + mod->core_text_size, + mod->module_core + mod->core_size, + set_memory_x); + set_page_attributes(mod->module_core, + mod->module_core + mod->core_ro_size, + set_memory_rw); +} + +static void unset_module_init_ro_nx(struct module *mod) +{ + set_page_attributes(mod->module_init + mod->init_text_size, + mod->module_init + mod->init_size, + set_memory_x); + set_page_attributes(mod->module_init, + mod->module_init + mod->init_ro_size, + set_memory_rw); } /* Iterate through all modules and set each module's text as RW */ @@ -1670,7 +1670,8 @@ void set_all_modules_text_ro(void) } #else static inline void set_section_ro_nx(void *base, unsigned long text_size, unsigned long ro_size, unsigned long total_size) { } -static inline void unset_section_ro_nx(struct module *mod, void *module_region) { } +static void unset_module_core_ro_nx(struct module *mod) { } +static void unset_module_init_ro_nx(struct module *mod) { } #endif /* Free a module, remove from lists, etc. */ @@ -1697,7 +1698,7 @@ static void free_module(struct module *mod) destroy_params(mod->kp, mod->num_kp); /* This may be NULL, but that's OK */ - unset_section_ro_nx(mod, mod->module_init); + unset_module_init_ro_nx(mod); module_free(mod, mod->module_init); kfree(mod->args); percpu_modfree(mod); @@ -1706,7 +1707,7 @@ static void free_module(struct module *mod) lockdep_free_key_range(mod->module_core, mod->core_size); /* Finally, free the core (containing the module structure) */ - unset_section_ro_nx(mod, mod->module_core); + unset_module_core_ro_nx(mod); module_free(mod, mod->module_core); #ifdef CONFIG_MPU @@ -2932,7 +2933,7 @@ SYSCALL_DEFINE3(init_module, void __user *, umod, mod->symtab = mod->core_symtab; mod->strtab = mod->core_strtab; #endif - unset_section_ro_nx(mod, mod->module_init); + unset_module_init_ro_nx(mod); module_free(mod, mod->module_init); mod->module_init = NULL; mod->init_size = 0; -- cgit v1.1 From de4d8d53465483168d6a627d409ee2d09d8e3308 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 19 Apr 2011 21:49:58 +0200 Subject: module: each_symbol_section instead of each_symbol Instead of having a callback function for each symbol in the kernel, have a callback for each array of symbols. This eases the logic when we move to sorted symbols and binary search. Signed-off-by: Rusty Russell Signed-off-by: Alessio Igor Bogani --- include/linux/module.h | 5 +++-- kernel/module.c | 42 +++++++++++++++++++++++++++--------------- 2 files changed, 30 insertions(+), 17 deletions(-) diff --git a/include/linux/module.h b/include/linux/module.h index 65cc6cc..49f4ad0 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -477,8 +477,9 @@ const struct kernel_symbol *find_symbol(const char *name, bool warn); /* Walk the exported symbol table */ -bool each_symbol(bool (*fn)(const struct symsearch *arr, struct module *owner, - unsigned int symnum, void *data), void *data); +bool each_symbol_section(bool (*fn)(const struct symsearch *arr, + struct module *owner, + void *data), void *data); /* Returns 0 and fills in value, defined and namebuf, or -ERANGE if symnum out of range. */ diff --git a/kernel/module.c b/kernel/module.c index 0e6f97f..e8aa462 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -240,23 +240,24 @@ static bool each_symbol_in_section(const struct symsearch *arr, struct module *owner, bool (*fn)(const struct symsearch *syms, struct module *owner, - unsigned int symnum, void *data), + void *data), void *data) { - unsigned int i, j; + unsigned int j; for (j = 0; j < arrsize; j++) { - for (i = 0; i < arr[j].stop - arr[j].start; i++) - if (fn(&arr[j], owner, i, data)) - return true; + if (fn(&arr[j], owner, data)) + return true; } return false; } /* Returns true as soon as fn returns true, otherwise false. */ -bool each_symbol(bool (*fn)(const struct symsearch *arr, struct module *owner, - unsigned int symnum, void *data), void *data) +bool each_symbol_section(bool (*fn)(const struct symsearch *arr, + struct module *owner, + void *data), + void *data) { struct module *mod; static const struct symsearch arr[] = { @@ -309,7 +310,7 @@ bool each_symbol(bool (*fn)(const struct symsearch *arr, struct module *owner, } return false; } -EXPORT_SYMBOL_GPL(each_symbol); +EXPORT_SYMBOL_GPL(each_symbol_section); struct find_symbol_arg { /* Input */ @@ -323,15 +324,12 @@ struct find_symbol_arg { const struct kernel_symbol *sym; }; -static bool find_symbol_in_section(const struct symsearch *syms, - struct module *owner, - unsigned int symnum, void *data) +static bool check_symbol(const struct symsearch *syms, + struct module *owner, + unsigned int symnum, void *data) { struct find_symbol_arg *fsa = data; - if (strcmp(syms->start[symnum].name, fsa->name) != 0) - return false; - if (!fsa->gplok) { if (syms->licence == GPL_ONLY) return false; @@ -365,6 +363,20 @@ static bool find_symbol_in_section(const struct symsearch *syms, return true; } +static bool find_symbol_in_section(const struct symsearch *syms, + struct module *owner, + void *data) +{ + struct find_symbol_arg *fsa = data; + unsigned int i; + + for (i = 0; i < syms->stop - syms->start; i++) { + if (strcmp(syms->start[i].name, fsa->name) == 0) + return check_symbol(syms, owner, i, data); + } + return false; +} + /* Find a symbol and return it, along with, (optional) crc and * (optional) module which owns it. Needs preempt disabled or module_mutex. */ const struct kernel_symbol *find_symbol(const char *name, @@ -379,7 +391,7 @@ const struct kernel_symbol *find_symbol(const char *name, fsa.gplok = gplok; fsa.warn = warn; - if (each_symbol(find_symbol_in_section, &fsa)) { + if (each_symbol_section(find_symbol_in_section, &fsa)) { if (owner) *owner = fsa.owner; if (crc) -- cgit v1.1 From f02e8a6596b7dc9b2171f7ff5654039ef0950cdc Mon Sep 17 00:00:00 2001 From: Alessio Igor Bogani Date: Thu, 14 Apr 2011 14:59:39 +0200 Subject: module: Sort exported symbols This patch places every exported symbol in its own section (i.e. "___ksymtab+printk"). Thus the linker will use its SORT() directive to sort and finally merge all symbol in the right and final section (i.e. "__ksymtab"). The symbol prefixed archs use an underscore as prefix for symbols. To avoid collision we use a different character to create the temporary section names. This work was supported by a hardware donation from the CE Linux Forum. Signed-off-by: Alessio Igor Bogani Signed-off-by: Rusty Russell (folded in '+' fixup) Tested-by: Dirk Behme --- include/asm-generic/vmlinux.lds.h | 20 ++++++++++---------- include/linux/module.h | 4 ++-- scripts/module-common.lds | 11 +++++++++++ 3 files changed, 23 insertions(+), 12 deletions(-) diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index bd297a2..b27445e 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -274,70 +274,70 @@ /* Kernel symbol table: Normal symbols */ \ __ksymtab : AT(ADDR(__ksymtab) - LOAD_OFFSET) { \ VMLINUX_SYMBOL(__start___ksymtab) = .; \ - *(__ksymtab) \ + *(SORT(___ksymtab+*)) \ VMLINUX_SYMBOL(__stop___ksymtab) = .; \ } \ \ /* Kernel symbol table: GPL-only symbols */ \ __ksymtab_gpl : AT(ADDR(__ksymtab_gpl) - LOAD_OFFSET) { \ VMLINUX_SYMBOL(__start___ksymtab_gpl) = .; \ - *(__ksymtab_gpl) \ + *(SORT(___ksymtab_gpl+*)) \ VMLINUX_SYMBOL(__stop___ksymtab_gpl) = .; \ } \ \ /* Kernel symbol table: Normal unused symbols */ \ __ksymtab_unused : AT(ADDR(__ksymtab_unused) - LOAD_OFFSET) { \ VMLINUX_SYMBOL(__start___ksymtab_unused) = .; \ - *(__ksymtab_unused) \ + *(SORT(___ksymtab_unused+*)) \ VMLINUX_SYMBOL(__stop___ksymtab_unused) = .; \ } \ \ /* Kernel symbol table: GPL-only unused symbols */ \ __ksymtab_unused_gpl : AT(ADDR(__ksymtab_unused_gpl) - LOAD_OFFSET) { \ VMLINUX_SYMBOL(__start___ksymtab_unused_gpl) = .; \ - *(__ksymtab_unused_gpl) \ + *(SORT(___ksymtab_unused_gpl+*)) \ VMLINUX_SYMBOL(__stop___ksymtab_unused_gpl) = .; \ } \ \ /* Kernel symbol table: GPL-future-only symbols */ \ __ksymtab_gpl_future : AT(ADDR(__ksymtab_gpl_future) - LOAD_OFFSET) { \ VMLINUX_SYMBOL(__start___ksymtab_gpl_future) = .; \ - *(__ksymtab_gpl_future) \ + *(SORT(___ksymtab_gpl_future+*)) \ VMLINUX_SYMBOL(__stop___ksymtab_gpl_future) = .; \ } \ \ /* Kernel symbol table: Normal symbols */ \ __kcrctab : AT(ADDR(__kcrctab) - LOAD_OFFSET) { \ VMLINUX_SYMBOL(__start___kcrctab) = .; \ - *(__kcrctab) \ + *(SORT(___kcrctab+*)) \ VMLINUX_SYMBOL(__stop___kcrctab) = .; \ } \ \ /* Kernel symbol table: GPL-only symbols */ \ __kcrctab_gpl : AT(ADDR(__kcrctab_gpl) - LOAD_OFFSET) { \ VMLINUX_SYMBOL(__start___kcrctab_gpl) = .; \ - *(__kcrctab_gpl) \ + *(SORT(___kcrctab_gpl+*)) \ VMLINUX_SYMBOL(__stop___kcrctab_gpl) = .; \ } \ \ /* Kernel symbol table: Normal unused symbols */ \ __kcrctab_unused : AT(ADDR(__kcrctab_unused) - LOAD_OFFSET) { \ VMLINUX_SYMBOL(__start___kcrctab_unused) = .; \ - *(__kcrctab_unused) \ + *(SORT(___kcrctab_unused+*)) \ VMLINUX_SYMBOL(__stop___kcrctab_unused) = .; \ } \ \ /* Kernel symbol table: GPL-only unused symbols */ \ __kcrctab_unused_gpl : AT(ADDR(__kcrctab_unused_gpl) - LOAD_OFFSET) { \ VMLINUX_SYMBOL(__start___kcrctab_unused_gpl) = .; \ - *(__kcrctab_unused_gpl) \ + *(SORT(___kcrctab_unused_gpl+*)) \ VMLINUX_SYMBOL(__stop___kcrctab_unused_gpl) = .; \ } \ \ /* Kernel symbol table: GPL-future-only symbols */ \ __kcrctab_gpl_future : AT(ADDR(__kcrctab_gpl_future) - LOAD_OFFSET) { \ VMLINUX_SYMBOL(__start___kcrctab_gpl_future) = .; \ - *(__kcrctab_gpl_future) \ + *(SORT(___kcrctab_gpl_future+*)) \ VMLINUX_SYMBOL(__stop___kcrctab_gpl_future) = .; \ } \ \ diff --git a/include/linux/module.h b/include/linux/module.h index 49f4ad0..d9ca2d5 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -224,7 +224,7 @@ struct module_use { extern void *__crc_##sym __attribute__((weak)); \ static const unsigned long __kcrctab_##sym \ __used \ - __attribute__((section("__kcrctab" sec), unused)) \ + __attribute__((section("___kcrctab" sec "+" #sym), unused)) \ = (unsigned long) &__crc_##sym; #else #define __CRC_SYMBOL(sym, sec) @@ -239,7 +239,7 @@ struct module_use { = MODULE_SYMBOL_PREFIX #sym; \ static const struct kernel_symbol __ksymtab_##sym \ __used \ - __attribute__((section("__ksymtab" sec), unused)) \ + __attribute__((section("___ksymtab" sec "+" #sym), unused)) \ = { (unsigned long)&sym, __kstrtab_##sym } #define EXPORT_SYMBOL(sym) \ diff --git a/scripts/module-common.lds b/scripts/module-common.lds index 47a1f9a..0865b3e 100644 --- a/scripts/module-common.lds +++ b/scripts/module-common.lds @@ -5,4 +5,15 @@ */ SECTIONS { /DISCARD/ : { *(.discard) } + + __ksymtab : { *(SORT(___ksymtab+*)) } + __ksymtab_gpl : { *(SORT(___ksymtab_gpl+*)) } + __ksymtab_unused : { *(SORT(___ksymtab_unused+*)) } + __ksymtab_unused_gpl : { *(SORT(___ksymtab_unused_gpl+*)) } + __ksymtab_gpl_future : { *(SORT(___ksymtab_gpl_future+*)) } + __kcrctab : { *(SORT(___kcrctab+*)) } + __kcrctab_gpl : { *(SORT(___kcrctab_gpl+*)) } + __kcrctab_unused : { *(SORT(___kcrctab_unused+*)) } + __kcrctab_unused_gpl : { *(SORT(___kcrctab_unused_gpl+*)) } + __kcrctab_gpl_future : { *(SORT(___kcrctab_gpl_future+*)) } } -- cgit v1.1 From 1a94dc35bc5c166d89913dc01a49d27a3c21a455 Mon Sep 17 00:00:00 2001 From: Tim Abbott Date: Thu, 14 Apr 2011 20:00:19 +0200 Subject: lib: Add generic binary search function to the kernel. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There a large number hand-coded binary searches in the kernel (run "git grep search | grep binary" to find many of them). Since in my experience, hand-coding binary searches can be error-prone, it seems worth cleaning this up by providing a generic binary search function. This generic binary search implementation comes from Ksplice. It has the same basic API as the C library bsearch() function. Ksplice uses it in half a dozen places with 4 different comparison functions, and I think our code is substantially cleaner because of this. Signed-off-by: Tim Abbott Extra-bikeshedding-by: Alan Jenkins Extra-bikeshedding-by: André Goddard Rosa Extra-bikeshedding-by: Rusty Russell Signed-off-by: Rusty Russell Signed-off-by: Alessio Igor Bogani Signed-off-by: Rusty Russell --- include/linux/bsearch.h | 9 +++++++++ lib/Makefile | 3 ++- lib/bsearch.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 64 insertions(+), 1 deletion(-) create mode 100644 include/linux/bsearch.h create mode 100644 lib/bsearch.c diff --git a/include/linux/bsearch.h b/include/linux/bsearch.h new file mode 100644 index 0000000..90b1aa8 --- /dev/null +++ b/include/linux/bsearch.h @@ -0,0 +1,9 @@ +#ifndef _LINUX_BSEARCH_H +#define _LINUX_BSEARCH_H + +#include + +void *bsearch(const void *key, const void *base, size_t num, size_t size, + int (*cmp)(const void *key, const void *elt)); + +#endif /* _LINUX_BSEARCH_H */ diff --git a/lib/Makefile b/lib/Makefile index ef0f285..4b49a24 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -21,7 +21,8 @@ lib-y += kobject.o kref.o klist.o obj-y += bcd.o div64.o sort.o parser.o halfmd4.o debug_locks.o random32.o \ bust_spinlocks.o hexdump.o kasprintf.o bitmap.o scatterlist.o \ - string_helpers.o gcd.o lcm.o list_sort.o uuid.o flex_array.o + string_helpers.o gcd.o lcm.o list_sort.o uuid.o flex_array.o \ + bsearch.o obj-y += kstrtox.o obj-$(CONFIG_TEST_KSTRTOX) += test-kstrtox.o diff --git a/lib/bsearch.c b/lib/bsearch.c new file mode 100644 index 0000000..5b54758 --- /dev/null +++ b/lib/bsearch.c @@ -0,0 +1,53 @@ +/* + * A generic implementation of binary search for the Linux kernel + * + * Copyright (C) 2008-2009 Ksplice, Inc. + * Author: Tim Abbott + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; version 2. + */ + +#include +#include + +/* + * bsearch - binary search an array of elements + * @key: pointer to item being searched for + * @base: pointer to first element to search + * @num: number of elements + * @size: size of each element + * @cmp: pointer to comparison function + * + * This function does a binary search on the given array. The + * contents of the array should already be in ascending sorted order + * under the provided comparison function. + * + * Note that the key need not have the same type as the elements in + * the array, e.g. key could be a string and the comparison function + * could compare the string with the struct's name field. However, if + * the key and elements in the array are of the same type, you can use + * the same comparison function for both sort() and bsearch(). + */ +void *bsearch(const void *key, const void *base, size_t num, size_t size, + int (*cmp)(const void *key, const void *elt)) +{ + size_t start = 0, end = num; + int result; + + while (start < end) { + size_t mid = start + (end - start) / 2; + + result = cmp(key, base + mid * size); + if (result < 0) + end = mid; + else if (result > 0) + start = mid + 1; + else + return (void *)base + mid * size; + } + + return NULL; +} +EXPORT_SYMBOL(bsearch); -- cgit v1.1 From 403ed27846aa126ecf0b842b5b179c506b9d989c Mon Sep 17 00:00:00 2001 From: Alessio Igor Bogani Date: Wed, 20 Apr 2011 11:10:52 +0200 Subject: module: Use the binary search for symbols resolution Takes advantage of the order and locates symbols using binary search. This work was supported by a hardware donation from the CE Linux Forum. Signed-off-by: Alessio Igor Bogani Signed-off-by: Rusty Russell Tested-by: Dirk Behme --- kernel/module.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/kernel/module.c b/kernel/module.c index e8aa462..d1db8eb 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -57,6 +57,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include @@ -363,17 +364,27 @@ static bool check_symbol(const struct symsearch *syms, return true; } +static int cmp_name(const void *va, const void *vb) +{ + const char *a; + const struct kernel_symbol *b; + a = va; b = vb; + return strcmp(a, b->name); +} + static bool find_symbol_in_section(const struct symsearch *syms, struct module *owner, void *data) { struct find_symbol_arg *fsa = data; - unsigned int i; + struct kernel_symbol *sym; + + sym = bsearch(fsa->name, syms->start, syms->stop - syms->start, + sizeof(struct kernel_symbol), cmp_name); + + if (sym != NULL && check_symbol(syms, owner, sym - syms->start, data)) + return true; - for (i = 0; i < syms->stop - syms->start; i++) { - if (strcmp(syms->start[i].name, fsa->name) == 0) - return check_symbol(syms, owner, i, data); - } return false; } -- cgit v1.1 From 9d63487f86115b1d3ef69670043bcf2b83c4d227 Mon Sep 17 00:00:00 2001 From: Alessio Igor Bogani Date: Wed, 18 May 2011 22:35:59 +0200 Subject: module: Use binary search in lookup_symbol() The function is_exported() with its helper function lookup_symbol() are used to verify if a provided symbol is effectively exported by the kernel or by the modules. Now that both have their symbols sorted we can replace a linear search with a binary search which provide a considerably speed-up. This work was supported by a hardware donation from the CE Linux Forum. Signed-off-by: Alessio Igor Bogani Acked-by: Greg Kroah-Hartman Signed-off-by: Rusty Russell --- kernel/module.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/kernel/module.c b/kernel/module.c index d1db8eb..2287972 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2055,11 +2055,8 @@ static const struct kernel_symbol *lookup_symbol(const char *name, const struct kernel_symbol *start, const struct kernel_symbol *stop) { - const struct kernel_symbol *ks = start; - for (; ks < stop; ks++) - if (strcmp(ks->name, name) == 0) - return ks; - return NULL; + return bsearch(name, start, stop - start, + sizeof(struct kernel_symbol), cmp_name); } static int is_exported(const char *name, unsigned long value, -- cgit v1.1 From 6845756b29e4c4e7db41e2d75cafa9d091bc1c07 Mon Sep 17 00:00:00 2001 From: Anders Kaseorg Date: Thu, 19 May 2011 16:55:27 -0600 Subject: modpost: Update 64k section support for binutils 2.18.50 Binutils 2.18.50 made a backwards-incompatible change in the way it writes ELF objects with over 65280 sections, to improve conformance with the ELF specification and interoperability with other ELF tools. Specifically, it no longer adds 256 to section indices SHN_LORESERVE and higher to skip over the reserved range SHN_LORESERVE through SHN_HIRESERVE; those values are only considered special in the st_shndx field, and not in other places where section indices are stored. See: http://sourceware.org/bugzilla/show_bug.cgi?id=5900 http://groups.google.com/group/generic-abi/browse_thread/thread/e8bb63714b072e67/6c63738f12cc8a17 Signed-off-by: Anders Kaseorg Signed-off-by: Rusty Russell --- scripts/mod/modpost.c | 16 ++++++---------- scripts/mod/modpost.h | 27 ++++++++------------------- 2 files changed, 14 insertions(+), 29 deletions(-) diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index cd104af..413c536 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -420,11 +420,10 @@ static int parse_elf(struct elf_info *info, const char *filename) return 0; } - if (hdr->e_shnum == 0) { + if (hdr->e_shnum == SHN_UNDEF) { /* * There are more than 64k sections, * read count from .sh_size. - * note: it doesn't need shndx2secindex() */ info->num_sections = TO_NATIVE(sechdrs[0].sh_size); } @@ -432,8 +431,7 @@ static int parse_elf(struct elf_info *info, const char *filename) info->num_sections = hdr->e_shnum; } if (hdr->e_shstrndx == SHN_XINDEX) { - info->secindex_strings = - shndx2secindex(TO_NATIVE(sechdrs[0].sh_link)); + info->secindex_strings = TO_NATIVE(sechdrs[0].sh_link); } else { info->secindex_strings = hdr->e_shstrndx; @@ -489,7 +487,7 @@ static int parse_elf(struct elf_info *info, const char *filename) sechdrs[i].sh_offset; info->symtab_stop = (void *)hdr + sechdrs[i].sh_offset + sechdrs[i].sh_size; - sh_link_idx = shndx2secindex(sechdrs[i].sh_link); + sh_link_idx = sechdrs[i].sh_link; info->strtab = (void *)hdr + sechdrs[sh_link_idx].sh_offset; } @@ -516,11 +514,9 @@ static int parse_elf(struct elf_info *info, const char *filename) if (symtab_shndx_idx != ~0U) { Elf32_Word *p; - if (symtab_idx != - shndx2secindex(sechdrs[symtab_shndx_idx].sh_link)) + if (symtab_idx != sechdrs[symtab_shndx_idx].sh_link) fatal("%s: SYMTAB_SHNDX has bad sh_link: %u!=%u\n", - filename, - shndx2secindex(sechdrs[symtab_shndx_idx].sh_link), + filename, sechdrs[symtab_shndx_idx].sh_link, symtab_idx); /* Fix endianness */ for (p = info->symtab_shndx_start; p < info->symtab_shndx_stop; @@ -1446,7 +1442,7 @@ static unsigned int *reloc_location(struct elf_info *elf, Elf_Shdr *sechdr, Elf_Rela *r) { Elf_Shdr *sechdrs = elf->sechdrs; - int section = shndx2secindex(sechdr->sh_info); + int section = sechdr->sh_info; return (void *)elf->hdr + sechdrs[section].sh_offset + r->r_offset; diff --git a/scripts/mod/modpost.h b/scripts/mod/modpost.h index 0388cfcc..2031119 100644 --- a/scripts/mod/modpost.h +++ b/scripts/mod/modpost.h @@ -145,33 +145,22 @@ static inline int is_shndx_special(unsigned int i) return i != SHN_XINDEX && i >= SHN_LORESERVE && i <= SHN_HIRESERVE; } -/* shndx is in [0..SHN_LORESERVE) U (SHN_HIRESERVE, 0xfffffff], thus: - * shndx == 0 <=> sechdrs[0] - * ...... - * shndx == SHN_LORESERVE-1 <=> sechdrs[SHN_LORESERVE-1] - * shndx == SHN_HIRESERVE+1 <=> sechdrs[SHN_LORESERVE] - * shndx == SHN_HIRESERVE+2 <=> sechdrs[SHN_LORESERVE+1] - * ...... - * fyi: sym->st_shndx is uint16, SHN_LORESERVE = ff00, SHN_HIRESERVE = ffff, - * so basically we map 0000..feff -> 0000..feff - * ff00..ffff -> (you are a bad boy, dont do it) - * 10000..xxxx -> ff00..(xxxx-0x100) +/* + * Move reserved section indices SHN_LORESERVE..SHN_HIRESERVE out of + * the way to -256..-1, to avoid conflicting with real section + * indices. */ -static inline unsigned int shndx2secindex(unsigned int i) -{ - if (i <= SHN_HIRESERVE) - return i; - return i - (SHN_HIRESERVE + 1 - SHN_LORESERVE); -} +#define SPECIAL(i) ((i) - (SHN_HIRESERVE + 1)) /* Accessor for sym->st_shndx, hides ugliness of "64k sections" */ static inline unsigned int get_secindex(const struct elf_info *info, const Elf_Sym *sym) { + if (is_shndx_special(sym->st_shndx)) + return SPECIAL(sym->st_shndx); if (sym->st_shndx != SHN_XINDEX) return sym->st_shndx; - return shndx2secindex(info->symtab_shndx_start[sym - - info->symtab_start]); + return info->symtab_shndx_start[sym - info->symtab_start]; } /* file2alias.c */ -- cgit v1.1 From d0f1fed29e6e73d9d17f4c91a5896a4ce3938d45 Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Tue, 19 Apr 2011 12:43:45 +0100 Subject: Add a strtobool function matching semantics of existing in kernel equivalents This is a rename of the usr_strtobool proposal, which was a renamed, relocated and fixed version of previous kstrtobool RFC Signed-off-by: Jonathan Cameron Signed-off-by: Rusty Russell --- include/linux/string.h | 1 + lib/string.c | 29 +++++++++++++++++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/include/linux/string.h b/include/linux/string.h index a716ee2..a176db2 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -123,6 +123,7 @@ extern char **argv_split(gfp_t gfp, const char *str, int *argcp); extern void argv_free(char **argv); extern bool sysfs_streq(const char *s1, const char *s2); +extern int strtobool(const char *s, bool *res); #ifdef CONFIG_BINARY_PRINTF int vbin_printf(u32 *bin_buf, size_t size, const char *fmt, va_list args); diff --git a/lib/string.c b/lib/string.c index f71bead..01fad9b 100644 --- a/lib/string.c +++ b/lib/string.c @@ -535,6 +535,35 @@ bool sysfs_streq(const char *s1, const char *s2) } EXPORT_SYMBOL(sysfs_streq); +/** + * strtobool - convert common user inputs into boolean values + * @s: input string + * @res: result + * + * This routine returns 0 iff the first character is one of 'Yy1Nn0'. + * Otherwise it will return -EINVAL. Value pointed to by res is + * updated upon finding a match. + */ +int strtobool(const char *s, bool *res) +{ + switch (s[0]) { + case 'y': + case 'Y': + case '1': + *res = true; + break; + case 'n': + case 'N': + case '0': + *res = false; + break; + default: + return -EINVAL; + } + return 0; +} +EXPORT_SYMBOL(strtobool); + #ifndef __HAVE_ARCH_MEMSET /** * memset - Fill a region of memory with the given value -- cgit v1.1 From a0374396375d06398c419ebb6857fb5809cff81f Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Tue, 19 Apr 2011 12:43:46 +0100 Subject: debugfs: move to new strtobool No functional changes requires that we eat errors from strtobool. If people want to not do this, then it should be fixed at a later date. V2: Simplification suggested by Rusty Russell removes the need for additional variable ret. Signed-off-by: Jonathan Cameron Signed-off-by: Rusty Russell --- fs/debugfs/file.c | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index 89d394d..568304d 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -429,25 +429,16 @@ static ssize_t write_file_bool(struct file *file, const char __user *user_buf, { char buf[32]; int buf_size; + bool bv; u32 *val = file->private_data; buf_size = min(count, (sizeof(buf)-1)); if (copy_from_user(buf, user_buf, buf_size)) return -EFAULT; - switch (buf[0]) { - case 'y': - case 'Y': - case '1': - *val = 1; - break; - case 'n': - case 'N': - case '0': - *val = 0; - break; - } - + if (strtobool(buf, &bv) == 0) + *val = bv; + return count; } -- cgit v1.1 From f721a465cddbe7f03e6cd2272008da558cf93818 Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Tue, 19 Apr 2011 12:43:47 +0100 Subject: params.c: Use new strtobool function to process boolean inputs Signed-off-by: Jonathan Cameron Signed-off-by: Rusty Russell --- kernel/params.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/kernel/params.c b/kernel/params.c index 28c5d5c..ed72e13 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -297,21 +297,15 @@ EXPORT_SYMBOL(param_ops_charp); int param_set_bool(const char *val, const struct kernel_param *kp) { bool v; + int ret; /* No equals means "set"... */ if (!val) val = "1"; /* One of =[yYnN01] */ - switch (val[0]) { - case 'y': case 'Y': case '1': - v = true; - break; - case 'n': case 'N': case '0': - v = false; - break; - default: - return -EINVAL; - } + ret = strtobool(val, &v); + if (ret) + return ret; if (kp->flags & KPARAM_ISBOOL) *(bool *)kp->arg = v; -- cgit v1.1 From a7117c6bddcbfff2fa237a14a853b32cb94bf59a Mon Sep 17 00:00:00 2001 From: Jayachandran C Date: Wed, 11 May 2011 12:04:58 +0530 Subject: MIPS: Netlogic XLR/XLS processor IDs. Add Netlogic Microsystems company ID and processor IDs for XLR and XLS processors for CPU probe. Add CPU_XLR to cpu_type_enum. Signed-off-by: Jayachandran C Cc: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/2367/ Signed-off-by: Ralf Baechle --- arch/mips/include/asm/cpu.h | 27 +++++++++++++++++++++ arch/mips/kernel/cpu-probe.c | 56 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 83 insertions(+) diff --git a/arch/mips/include/asm/cpu.h b/arch/mips/include/asm/cpu.h index 8687753..34c0d3c 100644 --- a/arch/mips/include/asm/cpu.h +++ b/arch/mips/include/asm/cpu.h @@ -33,6 +33,7 @@ #define PRID_COMP_TOSHIBA 0x070000 #define PRID_COMP_LSI 0x080000 #define PRID_COMP_LEXRA 0x0b0000 +#define PRID_COMP_NETLOGIC 0x0c0000 #define PRID_COMP_CAVIUM 0x0d0000 #define PRID_COMP_INGENIC 0xd00000 @@ -142,6 +143,31 @@ #define PRID_IMP_JZRISC 0x0200 /* + * These are the PRID's for when 23:16 == PRID_COMP_NETLOGIC + */ +#define PRID_IMP_NETLOGIC_XLR732 0x0000 +#define PRID_IMP_NETLOGIC_XLR716 0x0200 +#define PRID_IMP_NETLOGIC_XLR532 0x0900 +#define PRID_IMP_NETLOGIC_XLR308 0x0600 +#define PRID_IMP_NETLOGIC_XLR532C 0x0800 +#define PRID_IMP_NETLOGIC_XLR516C 0x0a00 +#define PRID_IMP_NETLOGIC_XLR508C 0x0b00 +#define PRID_IMP_NETLOGIC_XLR308C 0x0f00 +#define PRID_IMP_NETLOGIC_XLS608 0x8000 +#define PRID_IMP_NETLOGIC_XLS408 0x8800 +#define PRID_IMP_NETLOGIC_XLS404 0x8c00 +#define PRID_IMP_NETLOGIC_XLS208 0x8e00 +#define PRID_IMP_NETLOGIC_XLS204 0x8f00 +#define PRID_IMP_NETLOGIC_XLS108 0xce00 +#define PRID_IMP_NETLOGIC_XLS104 0xcf00 +#define PRID_IMP_NETLOGIC_XLS616B 0x4000 +#define PRID_IMP_NETLOGIC_XLS608B 0x4a00 +#define PRID_IMP_NETLOGIC_XLS416B 0x4400 +#define PRID_IMP_NETLOGIC_XLS412B 0x4c00 +#define PRID_IMP_NETLOGIC_XLS408B 0x4e00 +#define PRID_IMP_NETLOGIC_XLS404B 0x4f00 + +/* * Definitions for 7:0 on legacy processors */ @@ -234,6 +260,7 @@ enum cpu_type_enum { */ CPU_5KC, CPU_20KC, CPU_25KF, CPU_SB1, CPU_SB1A, CPU_LOONGSON2, CPU_CAVIUM_OCTEON, CPU_CAVIUM_OCTEON_PLUS, CPU_CAVIUM_OCTEON2, + CPU_XLR, CPU_LAST }; diff --git a/arch/mips/kernel/cpu-probe.c b/arch/mips/kernel/cpu-probe.c index f65d4c8..c7b7eb2 100644 --- a/arch/mips/kernel/cpu-probe.c +++ b/arch/mips/kernel/cpu-probe.c @@ -988,6 +988,59 @@ static inline void cpu_probe_ingenic(struct cpuinfo_mips *c, unsigned int cpu) } } +static inline void cpu_probe_netlogic(struct cpuinfo_mips *c, int cpu) +{ + decode_configs(c); + + c->options = (MIPS_CPU_TLB | + MIPS_CPU_4KEX | + MIPS_CPU_COUNTER | + MIPS_CPU_DIVEC | + MIPS_CPU_WATCH | + MIPS_CPU_EJTAG | + MIPS_CPU_LLSC); + + switch (c->processor_id & 0xff00) { + case PRID_IMP_NETLOGIC_XLR732: + case PRID_IMP_NETLOGIC_XLR716: + case PRID_IMP_NETLOGIC_XLR532: + case PRID_IMP_NETLOGIC_XLR308: + case PRID_IMP_NETLOGIC_XLR532C: + case PRID_IMP_NETLOGIC_XLR516C: + case PRID_IMP_NETLOGIC_XLR508C: + case PRID_IMP_NETLOGIC_XLR308C: + c->cputype = CPU_XLR; + __cpu_name[cpu] = "Netlogic XLR"; + break; + + case PRID_IMP_NETLOGIC_XLS608: + case PRID_IMP_NETLOGIC_XLS408: + case PRID_IMP_NETLOGIC_XLS404: + case PRID_IMP_NETLOGIC_XLS208: + case PRID_IMP_NETLOGIC_XLS204: + case PRID_IMP_NETLOGIC_XLS108: + case PRID_IMP_NETLOGIC_XLS104: + case PRID_IMP_NETLOGIC_XLS616B: + case PRID_IMP_NETLOGIC_XLS608B: + case PRID_IMP_NETLOGIC_XLS416B: + case PRID_IMP_NETLOGIC_XLS412B: + case PRID_IMP_NETLOGIC_XLS408B: + case PRID_IMP_NETLOGIC_XLS404B: + c->cputype = CPU_XLR; + __cpu_name[cpu] = "Netlogic XLS"; + break; + + default: + printk(KERN_INFO "Unknown Netlogic chip id [%02x]!\n", + c->processor_id); + c->cputype = CPU_XLR; + break; + } + + c->isa_level = MIPS_CPU_ISA_M64R1; + c->tlbsize = ((read_c0_config1() >> 25) & 0x3f) + 1; +} + #ifdef CONFIG_64BIT /* For use by uaccess.h */ u64 __ua_limit; @@ -1035,6 +1088,9 @@ __cpuinit void cpu_probe(void) case PRID_COMP_INGENIC: cpu_probe_ingenic(c, cpu); break; + case PRID_COMP_NETLOGIC: + cpu_probe_netlogic(c, cpu); + break; } BUG_ON(!__cpu_name[cpu]); -- cgit v1.1 From 3c595a515dbb61ae96e8f5607d895820aa06e870 Mon Sep 17 00:00:00 2001 From: Jayachandran C Date: Sat, 7 May 2011 01:36:05 +0530 Subject: MIPS: Netlogic: mach-netlogic include files Add war.h and irq.h with XLR/XLS definitions. Signed-off-by: Jayachandran C To: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/2331/ Signed-off-by: Ralf Baechle --- .../asm/mach-netlogic/cpu-feature-overrides.h | 47 ++++++++++++++++++++++ arch/mips/include/asm/mach-netlogic/irq.h | 14 +++++++ arch/mips/include/asm/mach-netlogic/war.h | 26 ++++++++++++ 3 files changed, 87 insertions(+) create mode 100644 arch/mips/include/asm/mach-netlogic/cpu-feature-overrides.h create mode 100644 arch/mips/include/asm/mach-netlogic/irq.h create mode 100644 arch/mips/include/asm/mach-netlogic/war.h diff --git a/arch/mips/include/asm/mach-netlogic/cpu-feature-overrides.h b/arch/mips/include/asm/mach-netlogic/cpu-feature-overrides.h new file mode 100644 index 0000000..3b72827 --- /dev/null +++ b/arch/mips/include/asm/mach-netlogic/cpu-feature-overrides.h @@ -0,0 +1,47 @@ +/* + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * Copyright (C) 2011 Netlogic Microsystems + * Copyright (C) 2003 Ralf Baechle + */ +#ifndef __ASM_MACH_NETLOGIC_CPU_FEATURE_OVERRIDES_H +#define __ASM_MACH_NETLOGIC_CPU_FEATURE_OVERRIDES_H + +#define cpu_has_4kex 1 +#define cpu_has_4k_cache 1 +#define cpu_has_watch 1 +#define cpu_has_mips16 0 +#define cpu_has_counter 1 +#define cpu_has_divec 1 +#define cpu_has_vce 0 +#define cpu_has_cache_cdex_p 0 +#define cpu_has_cache_cdex_s 0 +#define cpu_has_prefetch 1 +#define cpu_has_mcheck 1 +#define cpu_has_ejtag 1 + +#define cpu_has_llsc 1 +#define cpu_has_vtag_icache 0 +#define cpu_has_dc_aliases 0 +#define cpu_has_ic_fills_f_dc 0 +#define cpu_has_dsp 0 +#define cpu_has_mipsmt 0 +#define cpu_has_userlocal 0 +#define cpu_icache_snoops_remote_store 0 + +#define cpu_has_nofpuex 0 +#define cpu_has_64bits 1 + +#define cpu_has_mips32r1 1 +#define cpu_has_mips32r2 0 +#define cpu_has_mips64r1 1 +#define cpu_has_mips64r2 0 + +#define cpu_has_inclusive_pcaches 0 + +#define cpu_dcache_line_size() 32 +#define cpu_icache_line_size() 32 + +#endif /* __ASM_MACH_NETLOGIC_CPU_FEATURE_OVERRIDES_H */ diff --git a/arch/mips/include/asm/mach-netlogic/irq.h b/arch/mips/include/asm/mach-netlogic/irq.h new file mode 100644 index 0000000..b590245 --- /dev/null +++ b/arch/mips/include/asm/mach-netlogic/irq.h @@ -0,0 +1,14 @@ +/* + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * Copyright (C) 2011 Netlogic Microsystems. + */ +#ifndef __ASM_NETLOGIC_IRQ_H +#define __ASM_NETLOGIC_IRQ_H + +#define NR_IRQS 64 +#define MIPS_CPU_IRQ_BASE 0 + +#endif /* __ASM_NETLOGIC_IRQ_H */ diff --git a/arch/mips/include/asm/mach-netlogic/war.h b/arch/mips/include/asm/mach-netlogic/war.h new file mode 100644 index 0000000..22da893 --- /dev/null +++ b/arch/mips/include/asm/mach-netlogic/war.h @@ -0,0 +1,26 @@ +/* + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * Copyright (C) 2011 Netlogic Microsystems. + * Copyright (C) 2002, 2004, 2007 by Ralf Baechle + */ +#ifndef __ASM_MIPS_MACH_NLM_WAR_H +#define __ASM_MIPS_MACH_NLM_WAR_H + +#define R4600_V1_INDEX_ICACHEOP_WAR 0 +#define R4600_V1_HIT_CACHEOP_WAR 0 +#define R4600_V2_HIT_CACHEOP_WAR 0 +#define R5432_CP0_INTERRUPT_WAR 0 +#define BCM1250_M3_WAR 0 +#define SIBYTE_1956_WAR 0 +#define MIPS4K_ICACHE_REFILL_WAR 0 +#define MIPS_CACHE_SYNC_WAR 0 +#define TX49XX_ICACHE_INDEX_INV_WAR 0 +#define RM9000_CDEX_SMP_WAR 0 +#define ICACHE_REFILLS_WORKAROUND_WAR 0 +#define R10000_LLSC_WAR 0 +#define MIPS34K_MISSED_ITLB_WAR 0 + +#endif /* __ASM_MIPS_MACH_NLM_WAR_H */ -- cgit v1.1 From efa0f81c11021c95b1e72c65868115b6fb4ecc6a Mon Sep 17 00:00:00 2001 From: Jayachandran C Date: Sat, 7 May 2011 01:36:21 +0530 Subject: MIPS: Netlogic: Cache, TLB support and feature overrides for XLR CPU_XLR case added to mm/tlbex.c CPU_XLR case added to mm/c-r4k.c for PINDEX attribute Feature overrides for XLR cpu. Signed-off-by: Jayachandran C To: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/2333/ Signed-off-by: Ralf Baechle --- arch/mips/include/asm/module.h | 2 ++ arch/mips/mm/c-r4k.c | 1 + arch/mips/mm/tlbex.c | 1 + 3 files changed, 4 insertions(+) diff --git a/arch/mips/include/asm/module.h b/arch/mips/include/asm/module.h index d94085a..bc01a02 100644 --- a/arch/mips/include/asm/module.h +++ b/arch/mips/include/asm/module.h @@ -118,6 +118,8 @@ search_module_dbetables(unsigned long addr) #define MODULE_PROC_FAMILY "LOONGSON2 " #elif defined CONFIG_CPU_CAVIUM_OCTEON #define MODULE_PROC_FAMILY "OCTEON " +#elif defined CONFIG_CPU_XLR +#define MODULE_PROC_FAMILY "XLR " #else #error MODULE_PROC_FAMILY undefined for your processor configuration #endif diff --git a/arch/mips/mm/c-r4k.c b/arch/mips/mm/c-r4k.c index 71bddf8..d9bc5d3 100644 --- a/arch/mips/mm/c-r4k.c +++ b/arch/mips/mm/c-r4k.c @@ -1006,6 +1006,7 @@ static void __cpuinit probe_pcache(void) case CPU_25KF: case CPU_SB1: case CPU_SB1A: + case CPU_XLR: c->dcache.flags |= MIPS_CACHE_PINDEX; break; diff --git a/arch/mips/mm/tlbex.c b/arch/mips/mm/tlbex.c index f5734c2..424ed4b 100644 --- a/arch/mips/mm/tlbex.c +++ b/arch/mips/mm/tlbex.c @@ -404,6 +404,7 @@ static void __cpuinit build_tlb_write_entry(u32 **p, struct uasm_label **l, case CPU_5KC: case CPU_TX49XX: case CPU_PR4450: + case CPU_XLR: uasm_i_nop(p); tlbw(p); break; -- cgit v1.1 From 5c642506740ecbf20fb7a9e482287e4e5c639e5c Mon Sep 17 00:00:00 2001 From: Jayachandran C Date: Sat, 7 May 2011 01:36:40 +0530 Subject: MIPS: Platform files for XLR/XLS processor support * include/asm/netlogic added with files common for all Netlogic processors (common with XLP which will be added later) * include/asm/netlogic/xlr for XLR/XLS chip specific files * netlogic/xlr for XLR/XLS platform files Signed-off-by: Jayachandran C To: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/2334/ Signed-off-by: Ralf Baechle --- arch/mips/include/asm/netlogic/interrupt.h | 45 +++++ arch/mips/include/asm/netlogic/mips-extns.h | 76 +++++++++ arch/mips/include/asm/netlogic/psb-bootinfo.h | 109 ++++++++++++ arch/mips/include/asm/netlogic/xlr/gpio.h | 73 ++++++++ arch/mips/include/asm/netlogic/xlr/iomap.h | 131 +++++++++++++++ arch/mips/include/asm/netlogic/xlr/pic.h | 231 ++++++++++++++++++++++++++ arch/mips/include/asm/netlogic/xlr/xlr.h | 54 ++++++ arch/mips/netlogic/xlr/irq.c | 216 ++++++++++++++++++++++++ arch/mips/netlogic/xlr/platform.c | 98 +++++++++++ arch/mips/netlogic/xlr/setup.c | 188 +++++++++++++++++++++ arch/mips/netlogic/xlr/smp.c | 225 +++++++++++++++++++++++++ arch/mips/netlogic/xlr/smpboot.S | 94 +++++++++++ arch/mips/netlogic/xlr/time.c | 51 ++++++ arch/mips/netlogic/xlr/xlr_console.c | 46 +++++ 14 files changed, 1637 insertions(+) create mode 100644 arch/mips/include/asm/netlogic/interrupt.h create mode 100644 arch/mips/include/asm/netlogic/mips-extns.h create mode 100644 arch/mips/include/asm/netlogic/psb-bootinfo.h create mode 100644 arch/mips/include/asm/netlogic/xlr/gpio.h create mode 100644 arch/mips/include/asm/netlogic/xlr/iomap.h create mode 100644 arch/mips/include/asm/netlogic/xlr/pic.h create mode 100644 arch/mips/include/asm/netlogic/xlr/xlr.h create mode 100644 arch/mips/netlogic/xlr/irq.c create mode 100644 arch/mips/netlogic/xlr/platform.c create mode 100644 arch/mips/netlogic/xlr/setup.c create mode 100644 arch/mips/netlogic/xlr/smp.c create mode 100644 arch/mips/netlogic/xlr/smpboot.S create mode 100644 arch/mips/netlogic/xlr/time.c create mode 100644 arch/mips/netlogic/xlr/xlr_console.c diff --git a/arch/mips/include/asm/netlogic/interrupt.h b/arch/mips/include/asm/netlogic/interrupt.h new file mode 100644 index 0000000..a85aadb --- /dev/null +++ b/arch/mips/include/asm/netlogic/interrupt.h @@ -0,0 +1,45 @@ +/* + * Copyright 2003-2011 NetLogic Microsystems, Inc. (NetLogic). All rights + * reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the NetLogic + * license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETLOGIC ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETLOGIC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN + * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _ASM_NLM_INTERRUPT_H +#define _ASM_NLM_INTERRUPT_H + +/* Defines for the IRQ numbers */ + +#define IRQ_IPI_SMP_FUNCTION 3 +#define IRQ_IPI_SMP_RESCHEDULE 4 +#define IRQ_MSGRING 6 +#define IRQ_TIMER 7 + +#endif diff --git a/arch/mips/include/asm/netlogic/mips-extns.h b/arch/mips/include/asm/netlogic/mips-extns.h new file mode 100644 index 0000000..8c53d0b --- /dev/null +++ b/arch/mips/include/asm/netlogic/mips-extns.h @@ -0,0 +1,76 @@ +/* + * Copyright 2003-2011 NetLogic Microsystems, Inc. (NetLogic). All rights + * reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the NetLogic + * license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETLOGIC ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETLOGIC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN + * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _ASM_NLM_MIPS_EXTS_H +#define _ASM_NLM_MIPS_EXTS_H + +/* + * XLR and XLP interrupt request and interrupt mask registers + */ +#define read_c0_eirr() __read_64bit_c0_register($9, 6) +#define read_c0_eimr() __read_64bit_c0_register($9, 7) +#define write_c0_eirr(val) __write_64bit_c0_register($9, 6, val) + +/* + * Writing EIMR in 32 bit is a special case, the lower 8 bit of the + * EIMR is shadowed in the status register, so we cannot save and + * restore status register for split read. + */ +#define write_c0_eimr(val) \ +do { \ + if (sizeof(unsigned long) == 4) { \ + unsigned long __flags; \ + \ + local_irq_save(__flags); \ + __asm__ __volatile__( \ + ".set\tmips64\n\t" \ + "dsll\t%L0, %L0, 32\n\t" \ + "dsrl\t%L0, %L0, 32\n\t" \ + "dsll\t%M0, %M0, 32\n\t" \ + "or\t%L0, %L0, %M0\n\t" \ + "dmtc0\t%L0, $9, 7\n\t" \ + ".set\tmips0" \ + : : "r" (val)); \ + __flags = (__flags & 0xffff00ff) | (((val) & 0xff) << 8);\ + local_irq_restore(__flags); \ + } else \ + __write_64bit_c0_register($9, 7, (val)); \ +} while (0) + +static inline int hard_smp_processor_id(void) +{ + return __read_32bit_c0_register($15, 1) & 0x3ff; +} + +#endif /*_ASM_NLM_MIPS_EXTS_H */ diff --git a/arch/mips/include/asm/netlogic/psb-bootinfo.h b/arch/mips/include/asm/netlogic/psb-bootinfo.h new file mode 100644 index 0000000..6878307 --- /dev/null +++ b/arch/mips/include/asm/netlogic/psb-bootinfo.h @@ -0,0 +1,109 @@ +/* + * Copyright 2003-2011 NetLogic Microsystems, Inc. (NetLogic). All rights + * reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the NetLogic + * license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETLOGIC ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETLOGIC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN + * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _ASM_NETLOGIC_BOOTINFO_H +#define _ASM_NETLOGIC_BOOTINFO_H + +struct psb_info { + uint64_t boot_level; + uint64_t io_base; + uint64_t output_device; + uint64_t uart_print; + uint64_t led_output; + uint64_t init; + uint64_t exit; + uint64_t warm_reset; + uint64_t wakeup; + uint64_t online_cpu_map; + uint64_t master_reentry_sp; + uint64_t master_reentry_gp; + uint64_t master_reentry_fn; + uint64_t slave_reentry_fn; + uint64_t magic_dword; + uint64_t uart_putchar; + uint64_t size; + uint64_t uart_getchar; + uint64_t nmi_handler; + uint64_t psb_version; + uint64_t mac_addr; + uint64_t cpu_frequency; + uint64_t board_version; + uint64_t malloc; + uint64_t free; + uint64_t global_shmem_addr; + uint64_t global_shmem_size; + uint64_t psb_os_cpu_map; + uint64_t userapp_cpu_map; + uint64_t wakeup_os; + uint64_t psb_mem_map; + uint64_t board_major_version; + uint64_t board_minor_version; + uint64_t board_manf_revision; + uint64_t board_serial_number; + uint64_t psb_physaddr_map; + uint64_t xlr_loaderip_config; + uint64_t bldr_envp; + uint64_t avail_mem_map; +}; + +enum { + NETLOGIC_IO_SPACE = 0x10, + PCIX_IO_SPACE, + PCIX_CFG_SPACE, + PCIX_MEMORY_SPACE, + HT_IO_SPACE, + HT_CFG_SPACE, + HT_MEMORY_SPACE, + SRAM_SPACE, + FLASH_CONTROLLER_SPACE +}; + +#define NLM_MAX_ARGS 64 +#define NLM_MAX_ENVS 32 + +/* This is what netlboot passes and linux boot_mem_map is subtly different */ +#define NLM_BOOT_MEM_MAP_MAX 32 +struct nlm_boot_mem_map { + int nr_map; + struct nlm_boot_mem_map_entry { + uint64_t addr; /* start of memory segment */ + uint64_t size; /* size of memory segment */ + uint32_t type; /* type of memory segment */ + } map[NLM_BOOT_MEM_MAP_MAX]; +}; + +/* Pointer to saved boot loader info */ +extern struct psb_info nlm_prom_info; + +#endif diff --git a/arch/mips/include/asm/netlogic/xlr/gpio.h b/arch/mips/include/asm/netlogic/xlr/gpio.h new file mode 100644 index 0000000..51f6ad4 --- /dev/null +++ b/arch/mips/include/asm/netlogic/xlr/gpio.h @@ -0,0 +1,73 @@ +/* + * Copyright 2003-2011 NetLogic Microsystems, Inc. (NetLogic). All rights + * reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the NetLogic + * license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETLOGIC ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETLOGIC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN + * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _ASM_NLM_GPIO_H +#define _ASM_NLM_GPIO_H + +#define NETLOGIC_GPIO_INT_EN_REG 0 +#define NETLOGIC_GPIO_INPUT_INVERSION_REG 1 +#define NETLOGIC_GPIO_IO_DIR_REG 2 +#define NETLOGIC_GPIO_IO_DATA_WR_REG 3 +#define NETLOGIC_GPIO_IO_DATA_RD_REG 4 + +#define NETLOGIC_GPIO_SWRESET_REG 8 +#define NETLOGIC_GPIO_DRAM1_CNTRL_REG 9 +#define NETLOGIC_GPIO_DRAM1_RATIO_REG 10 +#define NETLOGIC_GPIO_DRAM1_RESET_REG 11 +#define NETLOGIC_GPIO_DRAM1_STATUS_REG 12 +#define NETLOGIC_GPIO_DRAM2_CNTRL_REG 13 +#define NETLOGIC_GPIO_DRAM2_RATIO_REG 14 +#define NETLOGIC_GPIO_DRAM2_RESET_REG 15 +#define NETLOGIC_GPIO_DRAM2_STATUS_REG 16 + +#define NETLOGIC_GPIO_PWRON_RESET_CFG_REG 21 +#define NETLOGIC_GPIO_BIST_ALL_GO_STATUS_REG 24 +#define NETLOGIC_GPIO_BIST_CPU_GO_STATUS_REG 25 +#define NETLOGIC_GPIO_BIST_DEV_GO_STATUS_REG 26 + +#define NETLOGIC_GPIO_FUSE_BANK_REG 35 +#define NETLOGIC_GPIO_CPU_RESET_REG 40 +#define NETLOGIC_GPIO_RNG_REG 43 + +#define NETLOGIC_PWRON_RESET_PCMCIA_BOOT 17 +#define NETLOGIC_GPIO_LED_BITMAP 0x1700000 +#define NETLOGIC_GPIO_LED_0_SHIFT 20 +#define NETLOGIC_GPIO_LED_1_SHIFT 24 + +#define NETLOGIC_GPIO_LED_OUTPUT_CODE_RESET 0x01 +#define NETLOGIC_GPIO_LED_OUTPUT_CODE_HARD_RESET 0x02 +#define NETLOGIC_GPIO_LED_OUTPUT_CODE_SOFT_RESET 0x03 +#define NETLOGIC_GPIO_LED_OUTPUT_CODE_MAIN 0x04 + +#endif diff --git a/arch/mips/include/asm/netlogic/xlr/iomap.h b/arch/mips/include/asm/netlogic/xlr/iomap.h new file mode 100644 index 0000000..2e3a4dd --- /dev/null +++ b/arch/mips/include/asm/netlogic/xlr/iomap.h @@ -0,0 +1,131 @@ +/* + * Copyright 2003-2011 NetLogic Microsystems, Inc. (NetLogic). All rights + * reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the NetLogic + * license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETLOGIC ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETLOGIC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN + * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _ASM_NLM_IOMAP_H +#define _ASM_NLM_IOMAP_H + +#define DEFAULT_NETLOGIC_IO_BASE CKSEG1ADDR(0x1ef00000) +#define NETLOGIC_IO_DDR2_CHN0_OFFSET 0x01000 +#define NETLOGIC_IO_DDR2_CHN1_OFFSET 0x02000 +#define NETLOGIC_IO_DDR2_CHN2_OFFSET 0x03000 +#define NETLOGIC_IO_DDR2_CHN3_OFFSET 0x04000 +#define NETLOGIC_IO_PIC_OFFSET 0x08000 +#define NETLOGIC_IO_UART_0_OFFSET 0x14000 +#define NETLOGIC_IO_UART_1_OFFSET 0x15100 + +#define NETLOGIC_IO_SIZE 0x1000 + +#define NETLOGIC_IO_BRIDGE_OFFSET 0x00000 + +#define NETLOGIC_IO_RLD2_CHN0_OFFSET 0x05000 +#define NETLOGIC_IO_RLD2_CHN1_OFFSET 0x06000 + +#define NETLOGIC_IO_SRAM_OFFSET 0x07000 + +#define NETLOGIC_IO_PCIX_OFFSET 0x09000 +#define NETLOGIC_IO_HT_OFFSET 0x0A000 + +#define NETLOGIC_IO_SECURITY_OFFSET 0x0B000 + +#define NETLOGIC_IO_GMAC_0_OFFSET 0x0C000 +#define NETLOGIC_IO_GMAC_1_OFFSET 0x0D000 +#define NETLOGIC_IO_GMAC_2_OFFSET 0x0E000 +#define NETLOGIC_IO_GMAC_3_OFFSET 0x0F000 + +/* XLS devices */ +#define NETLOGIC_IO_GMAC_4_OFFSET 0x20000 +#define NETLOGIC_IO_GMAC_5_OFFSET 0x21000 +#define NETLOGIC_IO_GMAC_6_OFFSET 0x22000 +#define NETLOGIC_IO_GMAC_7_OFFSET 0x23000 + +#define NETLOGIC_IO_PCIE_0_OFFSET 0x1E000 +#define NETLOGIC_IO_PCIE_1_OFFSET 0x1F000 +#define NETLOGIC_IO_SRIO_0_OFFSET 0x1E000 +#define NETLOGIC_IO_SRIO_1_OFFSET 0x1F000 + +#define NETLOGIC_IO_USB_0_OFFSET 0x24000 +#define NETLOGIC_IO_USB_1_OFFSET 0x25000 + +#define NETLOGIC_IO_COMP_OFFSET 0x1D000 +/* end XLS devices */ + +/* XLR devices */ +#define NETLOGIC_IO_SPI4_0_OFFSET 0x10000 +#define NETLOGIC_IO_XGMAC_0_OFFSET 0x11000 +#define NETLOGIC_IO_SPI4_1_OFFSET 0x12000 +#define NETLOGIC_IO_XGMAC_1_OFFSET 0x13000 +/* end XLR devices */ + +#define NETLOGIC_IO_I2C_0_OFFSET 0x16000 +#define NETLOGIC_IO_I2C_1_OFFSET 0x17000 + +#define NETLOGIC_IO_GPIO_OFFSET 0x18000 +#define NETLOGIC_IO_FLASH_OFFSET 0x19000 +#define NETLOGIC_IO_TB_OFFSET 0x1C000 + +#define NETLOGIC_CPLD_OFFSET KSEG1ADDR(0x1d840000) + +/* + * Base Address (Virtual) of the PCI Config address space + * For now, choose 256M phys in kseg1 = 0xA0000000 + (1<<28) + * Config space spans 256 (num of buses) * 256 (num functions) * 256 bytes + * ie 1<<24 = 16M + */ +#define DEFAULT_PCI_CONFIG_BASE 0x18000000 +#define DEFAULT_HT_TYPE0_CFG_BASE 0x16000000 +#define DEFAULT_HT_TYPE1_CFG_BASE 0x17000000 + +#ifndef __ASSEMBLY__ +#include +#include + +typedef volatile __u32 nlm_reg_t; +extern unsigned long netlogic_io_base; + +/* FIXME read once in write_reg */ +#ifdef CONFIG_CPU_LITTLE_ENDIAN +#define netlogic_read_reg(base, offset) ((base)[(offset)]) +#define netlogic_write_reg(base, offset, value) ((base)[(offset)] = (value)) +#else +#define netlogic_read_reg(base, offset) (be32_to_cpu((base)[(offset)])) +#define netlogic_write_reg(base, offset, value) \ + ((base)[(offset)] = cpu_to_be32((value))) +#endif + +#define netlogic_read_reg_le32(base, offset) (le32_to_cpu((base)[(offset)])) +#define netlogic_write_reg_le32(base, offset, value) \ + ((base)[(offset)] = cpu_to_le32((value))) +#define netlogic_io_mmio(offset) ((nlm_reg_t *)(netlogic_io_base+(offset))) +#endif /* __ASSEMBLY__ */ +#endif diff --git a/arch/mips/include/asm/netlogic/xlr/pic.h b/arch/mips/include/asm/netlogic/xlr/pic.h new file mode 100644 index 0000000..5cceb74 --- /dev/null +++ b/arch/mips/include/asm/netlogic/xlr/pic.h @@ -0,0 +1,231 @@ +/* + * Copyright 2003-2011 NetLogic Microsystems, Inc. (NetLogic). All rights + * reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the NetLogic + * license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETLOGIC ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETLOGIC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN + * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _ASM_NLM_XLR_PIC_H +#define _ASM_NLM_XLR_PIC_H + +#define PIC_CLKS_PER_SEC 66666666ULL +/* PIC hardware interrupt numbers */ +#define PIC_IRT_WD_INDEX 0 +#define PIC_IRT_TIMER_0_INDEX 1 +#define PIC_IRT_TIMER_1_INDEX 2 +#define PIC_IRT_TIMER_2_INDEX 3 +#define PIC_IRT_TIMER_3_INDEX 4 +#define PIC_IRT_TIMER_4_INDEX 5 +#define PIC_IRT_TIMER_5_INDEX 6 +#define PIC_IRT_TIMER_6_INDEX 7 +#define PIC_IRT_TIMER_7_INDEX 8 +#define PIC_IRT_CLOCK_INDEX PIC_IRT_TIMER_7_INDEX +#define PIC_IRT_UART_0_INDEX 9 +#define PIC_IRT_UART_1_INDEX 10 +#define PIC_IRT_I2C_0_INDEX 11 +#define PIC_IRT_I2C_1_INDEX 12 +#define PIC_IRT_PCMCIA_INDEX 13 +#define PIC_IRT_GPIO_INDEX 14 +#define PIC_IRT_HYPER_INDEX 15 +#define PIC_IRT_PCIX_INDEX 16 +/* XLS */ +#define PIC_IRT_CDE_INDEX 15 +#define PIC_IRT_BRIDGE_TB_XLS_INDEX 16 +/* XLS */ +#define PIC_IRT_GMAC0_INDEX 17 +#define PIC_IRT_GMAC1_INDEX 18 +#define PIC_IRT_GMAC2_INDEX 19 +#define PIC_IRT_GMAC3_INDEX 20 +#define PIC_IRT_XGS0_INDEX 21 +#define PIC_IRT_XGS1_INDEX 22 +#define PIC_IRT_HYPER_FATAL_INDEX 23 +#define PIC_IRT_PCIX_FATAL_INDEX 24 +#define PIC_IRT_BRIDGE_AERR_INDEX 25 +#define PIC_IRT_BRIDGE_BERR_INDEX 26 +#define PIC_IRT_BRIDGE_TB_XLR_INDEX 27 +#define PIC_IRT_BRIDGE_AERR_NMI_INDEX 28 +/* XLS */ +#define PIC_IRT_GMAC4_INDEX 21 +#define PIC_IRT_GMAC5_INDEX 22 +#define PIC_IRT_GMAC6_INDEX 23 +#define PIC_IRT_GMAC7_INDEX 24 +#define PIC_IRT_BRIDGE_ERR_INDEX 25 +#define PIC_IRT_PCIE_LINK0_INDEX 26 +#define PIC_IRT_PCIE_LINK1_INDEX 27 +#define PIC_IRT_PCIE_LINK2_INDEX 23 +#define PIC_IRT_PCIE_LINK3_INDEX 24 +#define PIC_IRT_PCIE_XLSB0_LINK2_INDEX 28 +#define PIC_IRT_PCIE_XLSB0_LINK3_INDEX 29 +#define PIC_IRT_SRIO_LINK0_INDEX 26 +#define PIC_IRT_SRIO_LINK1_INDEX 27 +#define PIC_IRT_SRIO_LINK2_INDEX 28 +#define PIC_IRT_SRIO_LINK3_INDEX 29 +#define PIC_IRT_PCIE_INT_INDEX 28 +#define PIC_IRT_PCIE_FATAL_INDEX 29 +#define PIC_IRT_GPIO_B_INDEX 30 +#define PIC_IRT_USB_INDEX 31 +/* XLS */ +#define PIC_NUM_IRTS 32 + + +#define PIC_CLOCK_TIMER 7 + +/* PIC Registers */ +#define PIC_CTRL 0x00 +#define PIC_IPI 0x04 +#define PIC_INT_ACK 0x06 + +#define WD_MAX_VAL_0 0x08 +#define WD_MAX_VAL_1 0x09 +#define WD_MASK_0 0x0a +#define WD_MASK_1 0x0b +#define WD_HEARBEAT_0 0x0c +#define WD_HEARBEAT_1 0x0d + +#define PIC_IRT_0_BASE 0x40 +#define PIC_IRT_1_BASE 0x80 +#define PIC_TIMER_MAXVAL_0_BASE 0x100 +#define PIC_TIMER_MAXVAL_1_BASE 0x110 +#define PIC_TIMER_COUNT_0_BASE 0x120 +#define PIC_TIMER_COUNT_1_BASE 0x130 + +#define PIC_IRT_0(picintr) (PIC_IRT_0_BASE + (picintr)) +#define PIC_IRT_1(picintr) (PIC_IRT_1_BASE + (picintr)) + +#define PIC_TIMER_MAXVAL_0(i) (PIC_TIMER_MAXVAL_0_BASE + (i)) +#define PIC_TIMER_MAXVAL_1(i) (PIC_TIMER_MAXVAL_1_BASE + (i)) +#define PIC_TIMER_COUNT_0(i) (PIC_TIMER_COUNT_0_BASE + (i)) +#define PIC_TIMER_COUNT_1(i) (PIC_TIMER_COUNT_0_BASE + (i)) + +/* + * Mapping between hardware interrupt numbers and IRQs on CPU + * we use a simple scheme to map PIC interrupts 0-31 to IRQs + * 8-39. This leaves the IRQ 0-7 for cpu interrupts like + * count/compare and FMN + */ +#define PIC_IRQ_BASE 8 +#define PIC_INTR_TO_IRQ(i) (PIC_IRQ_BASE + (i)) +#define PIC_IRQ_TO_INTR(i) ((i) - PIC_IRQ_BASE) + +#define PIC_IRT_FIRST_IRQ PIC_IRQ_BASE +#define PIC_WD_IRQ PIC_INTR_TO_IRQ(PIC_IRT_WD_INDEX) +#define PIC_TIMER_0_IRQ PIC_INTR_TO_IRQ(PIC_IRT_TIMER_0_INDEX) +#define PIC_TIMER_1_IRQ PIC_INTR_TO_IRQ(PIC_IRT_TIMER_1_INDEX) +#define PIC_TIMER_2_IRQ PIC_INTR_TO_IRQ(PIC_IRT_TIMER_2_INDEX) +#define PIC_TIMER_3_IRQ PIC_INTR_TO_IRQ(PIC_IRT_TIMER_3_INDEX) +#define PIC_TIMER_4_IRQ PIC_INTR_TO_IRQ(PIC_IRT_TIMER_4_INDEX) +#define PIC_TIMER_5_IRQ PIC_INTR_TO_IRQ(PIC_IRT_TIMER_5_INDEX) +#define PIC_TIMER_6_IRQ PIC_INTR_TO_IRQ(PIC_IRT_TIMER_6_INDEX) +#define PIC_TIMER_7_IRQ PIC_INTR_TO_IRQ(PIC_IRT_TIMER_7_INDEX) +#define PIC_CLOCK_IRQ (PIC_TIMER_7_IRQ) +#define PIC_UART_0_IRQ PIC_INTR_TO_IRQ(PIC_IRT_UART_0_INDEX) +#define PIC_UART_1_IRQ PIC_INTR_TO_IRQ(PIC_IRT_UART_1_INDEX) +#define PIC_I2C_0_IRQ PIC_INTR_TO_IRQ(PIC_IRT_I2C_0_INDEX) +#define PIC_I2C_1_IRQ PIC_INTR_TO_IRQ(PIC_IRT_I2C_1_INDEX) +#define PIC_PCMCIA_IRQ PIC_INTR_TO_IRQ(PIC_IRT_PCMCIA_INDEX) +#define PIC_GPIO_IRQ PIC_INTR_TO_IRQ(PIC_IRT_GPIO_INDEX) +#define PIC_HYPER_IRQ PIC_INTR_TO_IRQ(PIC_IRT_HYPER_INDEX) +#define PIC_PCIX_IRQ PIC_INTR_TO_IRQ(PIC_IRT_PCIX_INDEX) +/* XLS */ +#define PIC_CDE_IRQ PIC_INTR_TO_IRQ(PIC_IRT_CDE_INDEX) +#define PIC_BRIDGE_TB_XLS_IRQ PIC_INTR_TO_IRQ(PIC_IRT_BRIDGE_TB_XLS_INDEX) +/* end XLS */ +#define PIC_GMAC_0_IRQ PIC_INTR_TO_IRQ(PIC_IRT_GMAC0_INDEX) +#define PIC_GMAC_1_IRQ PIC_INTR_TO_IRQ(PIC_IRT_GMAC1_INDEX) +#define PIC_GMAC_2_IRQ PIC_INTR_TO_IRQ(PIC_IRT_GMAC2_INDEX) +#define PIC_GMAC_3_IRQ PIC_INTR_TO_IRQ(PIC_IRT_GMAC3_INDEX) +#define PIC_XGS_0_IRQ PIC_INTR_TO_IRQ(PIC_IRT_XGS0_INDEX) +#define PIC_XGS_1_IRQ PIC_INTR_TO_IRQ(PIC_IRT_XGS1_INDEX) +#define PIC_HYPER_FATAL_IRQ PIC_INTR_TO_IRQ(PIC_IRT_HYPER_FATAL_INDEX) +#define PIC_PCIX_FATAL_IRQ PIC_INTR_TO_IRQ(PIC_IRT_PCIX_FATAL_INDEX) +#define PIC_BRIDGE_AERR_IRQ PIC_INTR_TO_IRQ(PIC_IRT_BRIDGE_AERR_INDEX) +#define PIC_BRIDGE_BERR_IRQ PIC_INTR_TO_IRQ(PIC_IRT_BRIDGE_BERR_INDEX) +#define PIC_BRIDGE_TB_XLR_IRQ PIC_INTR_TO_IRQ(PIC_IRT_BRIDGE_TB_XLR_INDEX) +#define PIC_BRIDGE_AERR_NMI_IRQ PIC_INTR_TO_IRQ(PIC_IRT_BRIDGE_AERR_NMI_INDEX) +/* XLS defines */ +#define PIC_GMAC_4_IRQ PIC_INTR_TO_IRQ(PIC_IRT_GMAC4_INDEX) +#define PIC_GMAC_5_IRQ PIC_INTR_TO_IRQ(PIC_IRT_GMAC5_INDEX) +#define PIC_GMAC_6_IRQ PIC_INTR_TO_IRQ(PIC_IRT_GMAC6_INDEX) +#define PIC_GMAC_7_IRQ PIC_INTR_TO_IRQ(PIC_IRT_GMAC7_INDEX) +#define PIC_BRIDGE_ERR_IRQ PIC_INTR_TO_IRQ(PIC_IRT_BRIDGE_ERR_INDEX) +#define PIC_PCIE_LINK0_IRQ PIC_INTR_TO_IRQ(PIC_IRT_PCIE_LINK0_INDEX) +#define PIC_PCIE_LINK1_IRQ PIC_INTR_TO_IRQ(PIC_IRT_PCIE_LINK1_INDEX) +#define PIC_PCIE_LINK2_IRQ PIC_INTR_TO_IRQ(PIC_IRT_PCIE_LINK2_INDEX) +#define PIC_PCIE_LINK3_IRQ PIC_INTR_TO_IRQ(PIC_IRT_PCIE_LINK3_INDEX) +#define PIC_PCIE_XLSB0_LINK2_IRQ PIC_INTR_TO_IRQ(PIC_IRT_PCIE_XLSB0_LINK2_INDEX) +#define PIC_PCIE_XLSB0_LINK3_IRQ PIC_INTR_TO_IRQ(PIC_IRT_PCIE_XLSB0_LINK3_INDEX) +#define PIC_SRIO_LINK0_IRQ PIC_INTR_TO_IRQ(PIC_IRT_SRIO_LINK0_INDEX) +#define PIC_SRIO_LINK1_IRQ PIC_INTR_TO_IRQ(PIC_IRT_SRIO_LINK1_INDEX) +#define PIC_SRIO_LINK2_IRQ PIC_INTR_TO_IRQ(PIC_IRT_SRIO_LINK2_INDEX) +#define PIC_SRIO_LINK3_IRQ PIC_INTR_TO_IRQ(PIC_IRT_SRIO_LINK3_INDEX) +#define PIC_PCIE_INT_IRQ PIC_INTR_TO_IRQ(PIC_IRT_PCIE_INT__INDEX) +#define PIC_PCIE_FATAL_IRQ PIC_INTR_TO_IRQ(PIC_IRT_PCIE_FATAL_INDEX) +#define PIC_GPIO_B_IRQ PIC_INTR_TO_IRQ(PIC_IRT_GPIO_B_INDEX) +#define PIC_USB_IRQ PIC_INTR_TO_IRQ(PIC_IRT_USB_INDEX) +#define PIC_IRT_LAST_IRQ PIC_USB_IRQ +/* end XLS */ + +#ifndef __ASSEMBLY__ +static inline void pic_send_ipi(u32 ipi) +{ + nlm_reg_t *mmio = netlogic_io_mmio(NETLOGIC_IO_PIC_OFFSET); + + netlogic_write_reg(mmio, PIC_IPI, ipi); +} + +static inline u32 pic_read_control(void) +{ + nlm_reg_t *mmio = netlogic_io_mmio(NETLOGIC_IO_PIC_OFFSET); + + return netlogic_read_reg(mmio, PIC_CTRL); +} + +static inline void pic_write_control(u32 control) +{ + nlm_reg_t *mmio = netlogic_io_mmio(NETLOGIC_IO_PIC_OFFSET); + + netlogic_write_reg(mmio, PIC_CTRL, control); +} + +static inline void pic_update_control(u32 control) +{ + nlm_reg_t *mmio = netlogic_io_mmio(NETLOGIC_IO_PIC_OFFSET); + + netlogic_write_reg(mmio, PIC_CTRL, + (control | netlogic_read_reg(mmio, PIC_CTRL))); +} + +#define PIC_IRQ_IS_EDGE_TRIGGERED(irq) (((irq) >= PIC_TIMER_0_IRQ) && \ + ((irq) <= PIC_TIMER_7_IRQ)) +#define PIC_IRQ_IS_IRT(irq) (((irq) >= PIC_IRT_FIRST_IRQ) && \ + ((irq) <= PIC_IRT_LAST_IRQ)) +#endif + +#endif /* _ASM_NLM_XLR_PIC_H */ diff --git a/arch/mips/include/asm/netlogic/xlr/xlr.h b/arch/mips/include/asm/netlogic/xlr/xlr.h new file mode 100644 index 0000000..454c236 --- /dev/null +++ b/arch/mips/include/asm/netlogic/xlr/xlr.h @@ -0,0 +1,54 @@ +/* + * Copyright 2003-2011 NetLogic Microsystems, Inc. (NetLogic). All rights + * reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the NetLogic + * license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETLOGIC ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETLOGIC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN + * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _ASM_NLM_XLR_H +#define _ASM_NLM_XLR_H + +/* Platform UART functions */ +struct uart_port; +unsigned int nlm_xlr_uart_in(struct uart_port *, int); +void nlm_xlr_uart_out(struct uart_port *, int, int); + +/* SMP support functions */ +void nlm_smp_function_ipi_handler(unsigned int irq, struct irq_desc *desc); +void nlm_smp_resched_ipi_handler(unsigned int irq, struct irq_desc *desc); +int nlm_wakeup_secondary_cpus(u32 wakeup_mask); +void nlm_smp_irq_init(void); +void nlm_boot_smp_nmi(void); +void prom_pre_boot_secondary_cpus(void); + +extern struct plat_smp_ops nlm_smp_ops; +extern unsigned long nlm_common_ebase; + +#endif /* _ASM_NLM_XLR_H */ diff --git a/arch/mips/netlogic/xlr/irq.c b/arch/mips/netlogic/xlr/irq.c new file mode 100644 index 0000000..2033f56 --- /dev/null +++ b/arch/mips/netlogic/xlr/irq.c @@ -0,0 +1,216 @@ +/* + * Copyright 2003-2011 NetLogic Microsystems, Inc. (NetLogic). All rights + * reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the NetLogic + * license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETLOGIC ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETLOGIC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN + * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +#include +#include + +static u64 nlm_irq_mask; +static DEFINE_SPINLOCK(nlm_pic_lock); + +static void xlr_pic_enable(struct irq_data *d) +{ + nlm_reg_t *mmio = netlogic_io_mmio(NETLOGIC_IO_PIC_OFFSET); + unsigned long flags; + nlm_reg_t reg; + int irq = d->irq; + + WARN(!PIC_IRQ_IS_IRT(irq), "Bad irq %d", irq); + + spin_lock_irqsave(&nlm_pic_lock, flags); + reg = netlogic_read_reg(mmio, PIC_IRT_1_BASE + irq - PIC_IRQ_BASE); + netlogic_write_reg(mmio, PIC_IRT_1_BASE + irq - PIC_IRQ_BASE, + reg | (1 << 6) | (1 << 30) | (1 << 31)); + spin_unlock_irqrestore(&nlm_pic_lock, flags); +} + +static void xlr_pic_mask(struct irq_data *d) +{ + nlm_reg_t *mmio = netlogic_io_mmio(NETLOGIC_IO_PIC_OFFSET); + unsigned long flags; + nlm_reg_t reg; + int irq = d->irq; + + WARN(!PIC_IRQ_IS_IRT(irq), "Bad irq %d", irq); + + spin_lock_irqsave(&nlm_pic_lock, flags); + reg = netlogic_read_reg(mmio, PIC_IRT_1_BASE + irq - PIC_IRQ_BASE); + netlogic_write_reg(mmio, PIC_IRT_1_BASE + irq - PIC_IRQ_BASE, + reg | (1 << 6) | (1 << 30) | (0 << 31)); + spin_unlock_irqrestore(&nlm_pic_lock, flags); +} + +static void xlr_pic_ack(struct irq_data *d) +{ + unsigned long flags; + nlm_reg_t *mmio; + int irq = d->irq; + + WARN(!PIC_IRQ_IS_IRT(irq), "Bad irq %d", irq); + + mmio = netlogic_io_mmio(NETLOGIC_IO_PIC_OFFSET); + spin_lock_irqsave(&nlm_pic_lock, flags); + netlogic_write_reg(mmio, PIC_INT_ACK, (1 << (irq - PIC_IRQ_BASE))); + spin_unlock_irqrestore(&nlm_pic_lock, flags); +} + +/* + * This chip definition handles interrupts routed thru the XLR + * hardware PIC, currently IRQs 8-39 are mapped to hardware intr + * 0-31 wired the XLR PIC + */ +static struct irq_chip xlr_pic = { + .name = "XLR-PIC", + .irq_enable = xlr_pic_enable, + .irq_mask = xlr_pic_mask, + .irq_ack = xlr_pic_ack, +}; + +static void rsvd_irq_handler(struct irq_data *d) +{ + WARN(d->irq >= PIC_IRQ_BASE, "Bad irq %d", d->irq); +} + +/* + * Chip definition for CPU originated interrupts(timer, msg) and + * IPIs + */ +struct irq_chip nlm_cpu_intr = { + .name = "XLR-CPU-INTR", + .irq_enable = rsvd_irq_handler, + .irq_mask = rsvd_irq_handler, + .irq_ack = rsvd_irq_handler, +}; + +void __init init_xlr_irqs(void) +{ + nlm_reg_t *mmio = netlogic_io_mmio(NETLOGIC_IO_PIC_OFFSET); + uint32_t thread_mask = 1; + int level, i; + + pr_info("Interrupt thread mask [%x]\n", thread_mask); + for (i = 0; i < PIC_NUM_IRTS; i++) { + level = PIC_IRQ_IS_EDGE_TRIGGERED(i); + + /* Bind all PIC irqs to boot cpu */ + netlogic_write_reg(mmio, PIC_IRT_0_BASE + i, thread_mask); + + /* + * Use local scheduling and high polarity for all IRTs + * Invalidate all IRTs, by default + */ + netlogic_write_reg(mmio, PIC_IRT_1_BASE + i, + (level << 30) | (1 << 6) | (PIC_IRQ_BASE + i)); + } + + /* Make all IRQs as level triggered by default */ + for (i = 0; i < NR_IRQS; i++) { + if (PIC_IRQ_IS_IRT(i)) + irq_set_chip_and_handler(i, &xlr_pic, handle_level_irq); + else + irq_set_chip_and_handler(i, &nlm_cpu_intr, + handle_level_irq); + } +#ifdef CONFIG_SMP + irq_set_chip_and_handler(IRQ_IPI_SMP_FUNCTION, &nlm_cpu_intr, + nlm_smp_function_ipi_handler); + irq_set_chip_and_handler(IRQ_IPI_SMP_RESCHEDULE, &nlm_cpu_intr, + nlm_smp_resched_ipi_handler); + nlm_irq_mask |= + ((1ULL << IRQ_IPI_SMP_FUNCTION) | (1ULL << IRQ_IPI_SMP_RESCHEDULE)); +#endif + /* unmask all PIC related interrupts. If no handler is installed by the + * drivers, it'll just ack the interrupt and return + */ + for (i = PIC_IRT_FIRST_IRQ; i <= PIC_IRT_LAST_IRQ; i++) + nlm_irq_mask |= (1ULL << i); + + nlm_irq_mask |= (1ULL << IRQ_TIMER); +} + +void __init arch_init_irq(void) +{ + /* Initialize the irq descriptors */ + init_xlr_irqs(); + write_c0_eimr(nlm_irq_mask); +} + +void __cpuinit nlm_smp_irq_init(void) +{ + /* set interrupt mask for non-zero cpus */ + write_c0_eimr(nlm_irq_mask); +} + +asmlinkage void plat_irq_dispatch(void) +{ + uint64_t eirr; + int i; + + eirr = read_c0_eirr() & read_c0_eimr(); + if (!eirr) + return; + + /* no need of EIRR here, writing compare clears interrupt */ + if (eirr & (1 << IRQ_TIMER)) { + do_IRQ(IRQ_TIMER); + return; + } + + /* TODO use dcltz: optimize below code */ + for (i = 63; i != -1; i--) { + if (eirr & (1ULL << i)) + break; + } + if (i == -1) { + pr_err("no interrupt !!\n"); + return; + } + + /* Ack eirr */ + write_c0_eirr(1ULL << i); + + do_IRQ(i); +} diff --git a/arch/mips/netlogic/xlr/platform.c b/arch/mips/netlogic/xlr/platform.c new file mode 100644 index 0000000..609ec25 --- /dev/null +++ b/arch/mips/netlogic/xlr/platform.c @@ -0,0 +1,98 @@ +/* + * Copyright 2011, Netlogic Microsystems. + * Copyright 2004, Matt Porter + * + * This file is licensed under the terms of the GNU General Public + * License version 2. This program is licensed "as is" without any + * warranty of any kind, whether express or implied. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +unsigned int nlm_xlr_uart_in(struct uart_port *p, int offset) +{ + nlm_reg_t *mmio; + unsigned int value; + + /* XLR uart does not need any mapping of regs */ + mmio = (nlm_reg_t *)(p->membase + (offset << p->regshift)); + value = netlogic_read_reg(mmio, 0); + + /* See XLR/XLS errata */ + if (offset == UART_MSR) + value ^= 0xF0; + else if (offset == UART_MCR) + value ^= 0x3; + + return value; +} + +void nlm_xlr_uart_out(struct uart_port *p, int offset, int value) +{ + nlm_reg_t *mmio; + + /* XLR uart does not need any mapping of regs */ + mmio = (nlm_reg_t *)(p->membase + (offset << p->regshift)); + + /* See XLR/XLS errata */ + if (offset == UART_MSR) + value ^= 0xF0; + else if (offset == UART_MCR) + value ^= 0x3; + + netlogic_write_reg(mmio, 0, value); +} + +#define PORT(_irq) \ + { \ + .irq = _irq, \ + .regshift = 2, \ + .iotype = UPIO_MEM32, \ + .flags = (UPF_SKIP_TEST | \ + UPF_FIXED_TYPE | UPF_BOOT_AUTOCONF),\ + .uartclk = PIC_CLKS_PER_SEC, \ + .type = PORT_16550A, \ + .serial_in = nlm_xlr_uart_in, \ + .serial_out = nlm_xlr_uart_out, \ + } + +static struct plat_serial8250_port xlr_uart_data[] = { + PORT(PIC_UART_0_IRQ), + PORT(PIC_UART_1_IRQ), + {}, +}; + +static struct platform_device uart_device = { + .name = "serial8250", + .id = PLAT8250_DEV_PLATFORM, + .dev = { + .platform_data = xlr_uart_data, + }, +}; + +static int __init nlm_uart_init(void) +{ + nlm_reg_t *mmio; + + mmio = netlogic_io_mmio(NETLOGIC_IO_UART_0_OFFSET); + xlr_uart_data[0].membase = (void __iomem *)mmio; + xlr_uart_data[0].mapbase = CPHYSADDR((unsigned long)mmio); + + mmio = netlogic_io_mmio(NETLOGIC_IO_UART_1_OFFSET); + xlr_uart_data[1].membase = (void __iomem *)mmio; + xlr_uart_data[1].mapbase = CPHYSADDR((unsigned long)mmio); + + return platform_device_register(&uart_device); +} + +arch_initcall(nlm_uart_init); diff --git a/arch/mips/netlogic/xlr/setup.c b/arch/mips/netlogic/xlr/setup.c new file mode 100644 index 0000000..4828025 --- /dev/null +++ b/arch/mips/netlogic/xlr/setup.c @@ -0,0 +1,188 @@ +/* + * Copyright 2003-2011 NetLogic Microsystems, Inc. (NetLogic). All rights + * reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the NetLogic + * license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETLOGIC ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETLOGIC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN + * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include + +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include + +unsigned long netlogic_io_base = (unsigned long)(DEFAULT_NETLOGIC_IO_BASE); +unsigned long nlm_common_ebase = 0x0; +struct psb_info nlm_prom_info; + +static void nlm_early_serial_setup(void) +{ + struct uart_port s; + nlm_reg_t *uart_base; + + uart_base = netlogic_io_mmio(NETLOGIC_IO_UART_0_OFFSET); + memset(&s, 0, sizeof(s)); + s.flags = ASYNC_BOOT_AUTOCONF | ASYNC_SKIP_TEST; + s.iotype = UPIO_MEM32; + s.regshift = 2; + s.irq = PIC_UART_0_IRQ; + s.uartclk = PIC_CLKS_PER_SEC; + s.serial_in = nlm_xlr_uart_in; + s.serial_out = nlm_xlr_uart_out; + s.mapbase = (unsigned long)uart_base; + s.membase = (unsigned char __iomem *)uart_base; + early_serial_setup(&s); +} + +static void nlm_linux_exit(void) +{ + nlm_reg_t *mmio; + + mmio = netlogic_io_mmio(NETLOGIC_IO_GPIO_OFFSET); + /* trigger a chip reset by writing 1 to GPIO_SWRESET_REG */ + netlogic_write_reg(mmio, NETLOGIC_GPIO_SWRESET_REG, 1); + for ( ; ; ) + cpu_wait(); +} + +void __init plat_mem_setup(void) +{ + panic_timeout = 5; + _machine_restart = (void (*)(char *))nlm_linux_exit; + _machine_halt = nlm_linux_exit; + pm_power_off = nlm_linux_exit; +} + +const char *get_system_type(void) +{ + return "Netlogic XLR/XLS Series"; +} + +void __init prom_free_prom_memory(void) +{ + /* Nothing yet */ +} + +static void build_arcs_cmdline(int *argv) +{ + int i, remain, len; + char *arg; + + remain = sizeof(arcs_cmdline) - 1; + arcs_cmdline[0] = '\0'; + for (i = 0; argv[i] != 0; i++) { + arg = (char *)(long)argv[i]; + len = strlen(arg); + if (len + 1 > remain) + break; + strcat(arcs_cmdline, arg); + strcat(arcs_cmdline, " "); + remain -= len + 1; + } + + /* Add the default options here */ + if ((strstr(arcs_cmdline, "console=")) == NULL) { + arg = "console=ttyS0,38400 "; + len = strlen(arg); + if (len > remain) + goto fail; + strcat(arcs_cmdline, arg); + remain -= len; + } +#ifdef CONFIG_BLK_DEV_INITRD + if ((strstr(arcs_cmdline, "rdinit=")) == NULL) { + arg = "rdinit=/sbin/init "; + len = strlen(arg); + if (len > remain) + goto fail; + strcat(arcs_cmdline, arg); + remain -= len; + } +#endif + return; +fail: + panic("Cannot add %s, command line too big!", arg); +} + +static void prom_add_memory(void) +{ + struct nlm_boot_mem_map *bootm; + u64 start, size; + u64 pref_backup = 512; /* avoid pref walking beyond end */ + int i; + + bootm = (void *)(long)nlm_prom_info.psb_mem_map; + for (i = 0; i < bootm->nr_map; i++) { + if (bootm->map[i].type != BOOT_MEM_RAM) + continue; + start = bootm->map[i].addr; + size = bootm->map[i].size; + + /* Work around for using bootloader mem */ + if (i == 0 && start == 0 && size == 0x0c000000) + size = 0x0ff00000; + + add_memory_region(start, size - pref_backup, BOOT_MEM_RAM); + } +} + +void __init prom_init(void) +{ + int *argv, *envp; /* passed as 32 bit ptrs */ + struct psb_info *prom_infop; + + /* truncate to 32 bit and sign extend all args */ + argv = (int *)(long)(int)fw_arg1; + envp = (int *)(long)(int)fw_arg2; + prom_infop = (struct psb_info *)(long)(int)fw_arg3; + + nlm_prom_info = *prom_infop; + + nlm_early_serial_setup(); + build_arcs_cmdline(argv); + nlm_common_ebase = read_c0_ebase() & (~((1 << 12) - 1)); + prom_add_memory(); + +#ifdef CONFIG_SMP + nlm_wakeup_secondary_cpus(nlm_prom_info.online_cpu_map); + register_smp_ops(&nlm_smp_ops); +#endif +} diff --git a/arch/mips/netlogic/xlr/smp.c b/arch/mips/netlogic/xlr/smp.c new file mode 100644 index 0000000..b495a7f --- /dev/null +++ b/arch/mips/netlogic/xlr/smp.c @@ -0,0 +1,225 @@ +/* + * Copyright 2003-2011 NetLogic Microsystems, Inc. (NetLogic). All rights + * reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the NetLogic + * license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETLOGIC ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETLOGIC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN + * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include + +#include + +#include +#include + +#include +#include +#include + +void core_send_ipi(int logical_cpu, unsigned int action) +{ + int cpu = cpu_logical_map(logical_cpu); + u32 tid = cpu & 0x3; + u32 pid = (cpu >> 2) & 0x07; + u32 ipi = (tid << 16) | (pid << 20); + + if (action & SMP_CALL_FUNCTION) + ipi |= IRQ_IPI_SMP_FUNCTION; + else if (action & SMP_RESCHEDULE_YOURSELF) + ipi |= IRQ_IPI_SMP_RESCHEDULE; + else + return; + + pic_send_ipi(ipi); +} + +void nlm_send_ipi_single(int cpu, unsigned int action) +{ + core_send_ipi(cpu, action); +} + +void nlm_send_ipi_mask(const struct cpumask *mask, unsigned int action) +{ + int cpu; + + for_each_cpu(cpu, mask) { + core_send_ipi(cpu, action); + } +} + +/* IRQ_IPI_SMP_FUNCTION Handler */ +void nlm_smp_function_ipi_handler(unsigned int irq, struct irq_desc *desc) +{ + smp_call_function_interrupt(); +} + +/* IRQ_IPI_SMP_RESCHEDULE handler */ +void nlm_smp_resched_ipi_handler(unsigned int irq, struct irq_desc *desc) +{ + set_need_resched(); +} + +void nlm_common_ipi_handler(int irq, struct pt_regs *regs) +{ + if (irq == IRQ_IPI_SMP_FUNCTION) { + smp_call_function_interrupt(); + } else { + /* Announce that we are for reschduling */ + set_need_resched(); + } +} + +/* + * Called before going into mips code, early cpu init + */ +void nlm_early_init_secondary(void) +{ + write_c0_ebase((uint32_t)nlm_common_ebase); + /* TLB partition here later */ +} + +/* + * Code to run on secondary just after probing the CPU + */ +static void __cpuinit nlm_init_secondary(void) +{ + nlm_smp_irq_init(); +} + +void nlm_smp_finish(void) +{ +#ifdef notyet + nlm_common_msgring_cpu_init(); +#endif +} + +void nlm_cpus_done(void) +{ +} + +/* + * Boot all other cpus in the system, initialize them, and bring them into + * the boot function + */ +int nlm_cpu_unblock[NR_CPUS]; +int nlm_cpu_ready[NR_CPUS]; +unsigned long nlm_next_gp; +unsigned long nlm_next_sp; +cpumask_t phys_cpu_present_map; + +void nlm_boot_secondary(int logical_cpu, struct task_struct *idle) +{ + unsigned long gp = (unsigned long)task_thread_info(idle); + unsigned long sp = (unsigned long)__KSTK_TOS(idle); + int cpu = cpu_logical_map(logical_cpu); + + nlm_next_sp = sp; + nlm_next_gp = gp; + + /* barrier */ + __sync(); + nlm_cpu_unblock[cpu] = 1; +} + +void __init nlm_smp_setup(void) +{ + unsigned int boot_cpu; + int num_cpus, i; + + boot_cpu = hard_smp_processor_id(); + cpus_clear(phys_cpu_present_map); + + cpu_set(boot_cpu, phys_cpu_present_map); + __cpu_number_map[boot_cpu] = 0; + __cpu_logical_map[0] = boot_cpu; + cpu_set(0, cpu_possible_map); + + num_cpus = 1; + for (i = 0; i < NR_CPUS; i++) { + if (nlm_cpu_ready[i]) { + cpu_set(i, phys_cpu_present_map); + __cpu_number_map[i] = num_cpus; + __cpu_logical_map[num_cpus] = i; + cpu_set(num_cpus, cpu_possible_map); + ++num_cpus; + } + } + + pr_info("Phys CPU present map: %lx, possible map %lx\n", + (unsigned long)phys_cpu_present_map.bits[0], + (unsigned long)cpu_possible_map.bits[0]); + + pr_info("Detected %i Slave CPU(s)\n", num_cpus); +} + +void nlm_prepare_cpus(unsigned int max_cpus) +{ +} + +struct plat_smp_ops nlm_smp_ops = { + .send_ipi_single = nlm_send_ipi_single, + .send_ipi_mask = nlm_send_ipi_mask, + .init_secondary = nlm_init_secondary, + .smp_finish = nlm_smp_finish, + .cpus_done = nlm_cpus_done, + .boot_secondary = nlm_boot_secondary, + .smp_setup = nlm_smp_setup, + .prepare_cpus = nlm_prepare_cpus, +}; + +unsigned long secondary_entry_point; + +int nlm_wakeup_secondary_cpus(u32 wakeup_mask) +{ + unsigned int tid, pid, ipi, i, boot_cpu; + void *reset_vec; + + secondary_entry_point = (unsigned long)prom_pre_boot_secondary_cpus; + reset_vec = (void *)CKSEG1ADDR(0x1fc00000); + memcpy(reset_vec, nlm_boot_smp_nmi, 0x80); + boot_cpu = hard_smp_processor_id(); + + for (i = 0; i < NR_CPUS; i++) { + if (i == boot_cpu) + continue; + if (wakeup_mask & (1u << i)) { + tid = i & 0x3; + pid = (i >> 2) & 0x7; + ipi = (tid << 16) | (pid << 20) | (1 << 8); + pic_send_ipi(ipi); + } + } + + return 0; +} diff --git a/arch/mips/netlogic/xlr/smpboot.S b/arch/mips/netlogic/xlr/smpboot.S new file mode 100644 index 0000000..b8e0744 --- /dev/null +++ b/arch/mips/netlogic/xlr/smpboot.S @@ -0,0 +1,94 @@ +/* + * Copyright 2003-2011 NetLogic Microsystems, Inc. (NetLogic). All rights + * reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the NetLogic + * license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETLOGIC ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETLOGIC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN + * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#include + + +/* Don't jump to linux function from Bootloader stack. Change it + * here. Kernel might allocate bootloader memory before all the CPUs are + * brought up (eg: Inode cache region) and we better don't overwrite this + * memory + */ +NESTED(prom_pre_boot_secondary_cpus, 16, sp) + .set mips64 + mfc0 t0, $15, 1 # read ebase + andi t0, 0x1f # t0 has the processor_id() + sll t0, 2 # offset in cpu array + + PTR_LA t1, nlm_cpu_ready # mark CPU ready + PTR_ADDU t1, t0 + li t2, 1 + sw t2, 0(t1) + + PTR_LA t1, nlm_cpu_unblock + PTR_ADDU t1, t0 +1: lw t2, 0(t1) # wait till unblocked + beqz t2, 1b + nop + + PTR_LA t1, nlm_next_sp + PTR_L sp, 0(t1) + PTR_LA t1, nlm_next_gp + PTR_L gp, 0(t1) + + PTR_LA t0, nlm_early_init_secondary + jalr t0 + nop + + PTR_LA t0, smp_bootstrap + jr t0 + nop +END(prom_pre_boot_secondary_cpus) + +NESTED(nlm_boot_smp_nmi, 0, sp) + .set push + .set noat + .set mips64 + .set noreorder + + /* Clear the NMI and BEV bits */ + MFC0 k0, CP0_STATUS + li k1, 0xffb7ffff + and k0, k0, k1 + MTC0 k0, CP0_STATUS + + PTR_LA k1, secondary_entry_point + PTR_L k0, 0(k1) + jr k0 + nop + .set pop +END(nlm_boot_smp_nmi) diff --git a/arch/mips/netlogic/xlr/time.c b/arch/mips/netlogic/xlr/time.c new file mode 100644 index 0000000..0d81b26 --- /dev/null +++ b/arch/mips/netlogic/xlr/time.c @@ -0,0 +1,51 @@ +/* + * Copyright 2003-2011 NetLogic Microsystems, Inc. (NetLogic). All rights + * reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the NetLogic + * license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETLOGIC ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETLOGIC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN + * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include + +#include +#include +#include + +unsigned int __cpuinit get_c0_compare_int(void) +{ + return IRQ_TIMER; +} + +void __init plat_time_init(void) +{ + mips_hpt_frequency = nlm_prom_info.cpu_frequency; + pr_info("MIPS counter frequency [%ld]\n", + (unsigned long)mips_hpt_frequency); +} diff --git a/arch/mips/netlogic/xlr/xlr_console.c b/arch/mips/netlogic/xlr/xlr_console.c new file mode 100644 index 0000000..759df06 --- /dev/null +++ b/arch/mips/netlogic/xlr/xlr_console.c @@ -0,0 +1,46 @@ +/* + * Copyright 2003-2011 NetLogic Microsystems, Inc. (NetLogic). All rights + * reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the NetLogic + * license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETLOGIC ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETLOGIC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN + * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include + +void prom_putchar(char c) +{ + nlm_reg_t *mmio; + + mmio = netlogic_io_mmio(NETLOGIC_IO_UART_0_OFFSET); + while (netlogic_read_reg(mmio, 0x5) == 0) + ; + netlogic_write_reg(mmio, 0x0, c); +} -- cgit v1.1 From 7f058e852b229ec77b37676b2b78baf2e78ffee8 Mon Sep 17 00:00:00 2001 From: Jayachandran C Date: Sat, 7 May 2011 01:36:57 +0530 Subject: MIPS: Kconfig and Makefile update for Netlogic XLR/XLS Add NLM_XLR_BOARD, CPU_XLR and other config options Makefile updates, mostly based on r4k Signed-off-by: Jayachandran C To: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/2334/ Signed-off-by: Ralf Baechle --- arch/mips/Kconfig | 42 +++++++++++++++++++++++++++++++++++++++++ arch/mips/Makefile | 12 ++++++++++++ arch/mips/kernel/Makefile | 1 + arch/mips/lib/Makefile | 1 + arch/mips/mm/Makefile | 1 + arch/mips/netlogic/Kconfig | 5 +++++ arch/mips/netlogic/xlr/Makefile | 5 +++++ 7 files changed, 67 insertions(+) create mode 100644 arch/mips/netlogic/Kconfig create mode 100644 arch/mips/netlogic/xlr/Makefile diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 351c80f..5016caa 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -736,6 +736,33 @@ config CAVIUM_OCTEON_REFERENCE_BOARD Hikari Say Y here for most Octeon reference boards. +config NLM_XLR_BOARD + bool "Netlogic XLR/XLS based systems" + depends on EXPERIMENTAL + select BOOT_ELF32 + select NLM_COMMON + select NLM_XLR + select SYS_HAS_CPU_XLR + select SYS_SUPPORTS_SMP + select HW_HAS_PCI + select SWAP_IO_SPACE + select SYS_SUPPORTS_32BIT_KERNEL + select SYS_SUPPORTS_64BIT_KERNEL + select 64BIT_PHYS_ADDR + select SYS_SUPPORTS_BIG_ENDIAN + select SYS_SUPPORTS_HIGHMEM + select DMA_COHERENT + select NR_CPUS_DEFAULT_32 + select CEVT_R4K + select CSRC_R4K + select IRQ_CPU + select ZONE_DMA if 64BIT + select SYNC_R4K + select SYS_HAS_EARLY_PRINTK + help + Support for systems based on Netlogic XLR and XLS processors. + Say Y here if you have a XLR or XLS based board. + endchoice source "arch/mips/alchemy/Kconfig" @@ -752,6 +779,7 @@ source "arch/mips/txx9/Kconfig" source "arch/mips/vr41xx/Kconfig" source "arch/mips/cavium-octeon/Kconfig" source "arch/mips/loongson/Kconfig" +source "arch/mips/netlogic/Kconfig" endmenu @@ -1420,6 +1448,17 @@ config CPU_BMIPS5000 help Broadcom BMIPS5000 processors. +config CPU_XLR + bool "Netlogic XLR SoC" + depends on SYS_HAS_CPU_XLR + select CPU_SUPPORTS_32BIT_KERNEL + select CPU_SUPPORTS_64BIT_KERNEL + select CPU_SUPPORTS_HIGHMEM + select WEAK_ORDERING + select WEAK_REORDERING_BEYOND_LLSC + select CPU_SUPPORTS_HUGEPAGES + help + Netlogic Microsystems XLR/XLS processors. endchoice if CPU_LOONGSON2F @@ -1550,6 +1589,9 @@ config SYS_HAS_CPU_BMIPS4380 config SYS_HAS_CPU_BMIPS5000 bool +config SYS_HAS_CPU_XLR + bool + # # CPU may reorder R->R, R->W, W->R, W->W # Reordering beyond LL and SC is handled in WEAK_REORDERING_BEYOND_LLSC diff --git a/arch/mips/Makefile b/arch/mips/Makefile index 53e3514..884819c 100644 --- a/arch/mips/Makefile +++ b/arch/mips/Makefile @@ -191,6 +191,18 @@ endif # include $(srctree)/arch/mips/Kbuild.platforms +# +# NETLOGIC SOC Common (common) +# +cflags-$(CONFIG_NLM_COMMON) += -I$(srctree)/arch/mips/include/asm/mach-netlogic +cflags-$(CONFIG_NLM_COMMON) += -I$(srctree)/arch/mips/include/asm/netlogic + +# +# NETLOGIC XLR/XLS SoC, Simulator and boards +# +core-$(CONFIG_NLM_XLR) += arch/mips/netlogic/xlr/ +load-$(CONFIG_NLM_XLR_BOARD) += 0xffffffff84000000 + cflags-y += -I$(srctree)/arch/mips/include/asm/mach-generic drivers-$(CONFIG_PCI) += arch/mips/pci/ diff --git a/arch/mips/kernel/Makefile b/arch/mips/kernel/Makefile index cedee2b..83bba33 100644 --- a/arch/mips/kernel/Makefile +++ b/arch/mips/kernel/Makefile @@ -52,6 +52,7 @@ obj-$(CONFIG_CPU_TX39XX) += r2300_fpu.o r2300_switch.o obj-$(CONFIG_CPU_TX49XX) += r4k_fpu.o r4k_switch.o obj-$(CONFIG_CPU_VR41XX) += r4k_fpu.o r4k_switch.o obj-$(CONFIG_CPU_CAVIUM_OCTEON) += octeon_switch.o +obj-$(CONFIG_CPU_XLR) += r4k_fpu.o r4k_switch.o obj-$(CONFIG_SMP) += smp.o obj-$(CONFIG_SMP_UP) += smp-up.o diff --git a/arch/mips/lib/Makefile b/arch/mips/lib/Makefile index 2adead5..b2cad4f 100644 --- a/arch/mips/lib/Makefile +++ b/arch/mips/lib/Makefile @@ -28,6 +28,7 @@ obj-$(CONFIG_CPU_TX39XX) += r3k_dump_tlb.o obj-$(CONFIG_CPU_TX49XX) += dump_tlb.o obj-$(CONFIG_CPU_VR41XX) += dump_tlb.o obj-$(CONFIG_CPU_CAVIUM_OCTEON) += dump_tlb.o +obj-$(CONFIG_CPU_XLR) += dump_tlb.o # libgcc-style stuff needed in the kernel obj-y += ashldi3.o ashrdi3.o cmpdi2.o lshrdi3.o ucmpdi2.o diff --git a/arch/mips/mm/Makefile b/arch/mips/mm/Makefile index d679c77..eb44636 100644 --- a/arch/mips/mm/Makefile +++ b/arch/mips/mm/Makefile @@ -29,6 +29,7 @@ obj-$(CONFIG_CPU_TX39XX) += c-tx39.o tlb-r3k.o obj-$(CONFIG_CPU_TX49XX) += c-r4k.o cex-gen.o tlb-r4k.o obj-$(CONFIG_CPU_VR41XX) += c-r4k.o cex-gen.o tlb-r4k.o obj-$(CONFIG_CPU_CAVIUM_OCTEON) += c-octeon.o cex-oct.o tlb-r4k.o +obj-$(CONFIG_CPU_XLR) += c-r4k.o tlb-r4k.o cex-gen.o obj-$(CONFIG_IP22_CPU_SCACHE) += sc-ip22.o obj-$(CONFIG_R5000_CPU_SCACHE) += sc-r5k.o diff --git a/arch/mips/netlogic/Kconfig b/arch/mips/netlogic/Kconfig new file mode 100644 index 0000000..a5ca743 --- /dev/null +++ b/arch/mips/netlogic/Kconfig @@ -0,0 +1,5 @@ +config NLM_COMMON + bool + +config NLM_XLR + bool diff --git a/arch/mips/netlogic/xlr/Makefile b/arch/mips/netlogic/xlr/Makefile new file mode 100644 index 0000000..9bd3f73 --- /dev/null +++ b/arch/mips/netlogic/xlr/Makefile @@ -0,0 +1,5 @@ +obj-y += setup.o platform.o irq.o setup.o time.o +obj-$(CONFIG_SMP) += smp.o smpboot.o +obj-$(CONFIG_EARLY_PRINTK) += xlr_console.o + +EXTRA_CFLAGS += -Werror -- cgit v1.1 From f9cab74fd9b0cf19f52a989694e7a1d8213af3a1 Mon Sep 17 00:00:00 2001 From: Jayachandran C Date: Sat, 7 May 2011 01:37:14 +0530 Subject: MIPS: Add default configuration for XLR/XLS processors Enable XLR CPU support, SMP, initramfs based root filesystem etc. [ralf@linux-mips.org: shrink the defconfig file through make savedefconfig.] Signed-off-by: Jayachandran C To: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/2338/ Signed-off-by: Ralf Baechle --- arch/mips/configs/nlm_xlr_defconfig | 574 ++++++++++++++++++++++++++++++++++++ 1 file changed, 574 insertions(+) create mode 100644 arch/mips/configs/nlm_xlr_defconfig diff --git a/arch/mips/configs/nlm_xlr_defconfig b/arch/mips/configs/nlm_xlr_defconfig new file mode 100644 index 0000000..e4b399f --- /dev/null +++ b/arch/mips/configs/nlm_xlr_defconfig @@ -0,0 +1,574 @@ +CONFIG_NLM_XLR_BOARD=y +CONFIG_HIGHMEM=y +CONFIG_KSM=y +CONFIG_DEFAULT_MMAP_MIN_ADDR=65536 +CONFIG_SMP=y +CONFIG_NO_HZ=y +CONFIG_HIGH_RES_TIMERS=y +CONFIG_PREEMPT_VOLUNTARY=y +CONFIG_KEXEC=y +CONFIG_EXPERIMENTAL=y +CONFIG_CROSS_COMPILE="mips64-unknown-linux-gnu-" +# CONFIG_LOCALVERSION_AUTO is not set +CONFIG_SYSVIPC=y +CONFIG_POSIX_MQUEUE=y +CONFIG_BSD_PROCESS_ACCT=y +CONFIG_BSD_PROCESS_ACCT_V3=y +CONFIG_TASKSTATS=y +CONFIG_TASK_DELAY_ACCT=y +CONFIG_TASK_XACCT=y +CONFIG_TASK_IO_ACCOUNTING=y +CONFIG_AUDIT=y +CONFIG_NAMESPACES=y +CONFIG_SCHED_AUTOGROUP=y +CONFIG_BLK_DEV_INITRD=y +CONFIG_INITRAMFS_SOURCE="usr/dev_file_list usr/rootfs" +CONFIG_RD_BZIP2=y +CONFIG_RD_LZMA=y +CONFIG_INITRAMFS_COMPRESSION_GZIP=y +# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set +CONFIG_EXPERT=y +CONFIG_KALLSYMS_ALL=y +# CONFIG_ELF_CORE is not set +# CONFIG_PCSPKR_PLATFORM is not set +# CONFIG_PERF_EVENTS is not set +# CONFIG_COMPAT_BRK is not set +CONFIG_PROFILING=y +CONFIG_MODULES=y +CONFIG_MODULE_UNLOAD=y +CONFIG_MODVERSIONS=y +CONFIG_MODULE_SRCVERSION_ALL=y +CONFIG_BLK_DEV_INTEGRITY=y +CONFIG_BINFMT_MISC=m +CONFIG_PM_RUNTIME=y +CONFIG_PM_DEBUG=y +CONFIG_PACKET=y +CONFIG_UNIX=y +CONFIG_XFRM_USER=m +CONFIG_NET_KEY=m +CONFIG_INET=y +CONFIG_IP_MULTICAST=y +CONFIG_IP_ADVANCED_ROUTER=y +CONFIG_IP_MULTIPLE_TABLES=y +CONFIG_IP_ROUTE_MULTIPATH=y +CONFIG_IP_ROUTE_VERBOSE=y +CONFIG_NET_IPIP=m +CONFIG_IP_MROUTE=y +CONFIG_IP_PIMSM_V1=y +CONFIG_IP_PIMSM_V2=y +CONFIG_SYN_COOKIES=y +CONFIG_INET_AH=m +CONFIG_INET_ESP=m +CONFIG_INET_IPCOMP=m +CONFIG_INET_XFRM_MODE_TRANSPORT=m +CONFIG_INET_XFRM_MODE_TUNNEL=m +CONFIG_INET_XFRM_MODE_BEET=m +CONFIG_TCP_CONG_ADVANCED=y +CONFIG_TCP_CONG_HSTCP=m +CONFIG_TCP_CONG_HYBLA=m +CONFIG_TCP_CONG_SCALABLE=m +CONFIG_TCP_CONG_LP=m +CONFIG_TCP_CONG_VENO=m +CONFIG_TCP_CONG_YEAH=m +CONFIG_TCP_CONG_ILLINOIS=m +CONFIG_TCP_MD5SIG=y +CONFIG_IPV6=y +CONFIG_IPV6_PRIVACY=y +CONFIG_INET6_AH=m +CONFIG_INET6_ESP=m +CONFIG_INET6_IPCOMP=m +CONFIG_INET6_XFRM_MODE_TRANSPORT=m +CONFIG_INET6_XFRM_MODE_TUNNEL=m +CONFIG_INET6_XFRM_MODE_BEET=m +CONFIG_INET6_XFRM_MODE_ROUTEOPTIMIZATION=m +CONFIG_IPV6_SIT=m +CONFIG_IPV6_TUNNEL=m +CONFIG_IPV6_MULTIPLE_TABLES=y +CONFIG_NETLABEL=y +CONFIG_NETFILTER=y +CONFIG_NF_CONNTRACK=m +CONFIG_NF_CONNTRACK_SECMARK=y +CONFIG_NF_CONNTRACK_EVENTS=y +CONFIG_NF_CT_PROTO_UDPLITE=m +CONFIG_NF_CONNTRACK_AMANDA=m +CONFIG_NF_CONNTRACK_FTP=m +CONFIG_NF_CONNTRACK_H323=m +CONFIG_NF_CONNTRACK_IRC=m +CONFIG_NF_CONNTRACK_NETBIOS_NS=m +CONFIG_NF_CONNTRACK_PPTP=m +CONFIG_NF_CONNTRACK_SANE=m +CONFIG_NF_CONNTRACK_SIP=m +CONFIG_NF_CONNTRACK_TFTP=m +CONFIG_NF_CT_NETLINK=m +CONFIG_NETFILTER_TPROXY=m +CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m +CONFIG_NETFILTER_XT_TARGET_CONNMARK=m +CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=m +CONFIG_NETFILTER_XT_TARGET_DSCP=m +CONFIG_NETFILTER_XT_TARGET_MARK=m +CONFIG_NETFILTER_XT_TARGET_NFLOG=m +CONFIG_NETFILTER_XT_TARGET_NFQUEUE=m +CONFIG_NETFILTER_XT_TARGET_NOTRACK=m +CONFIG_NETFILTER_XT_TARGET_TPROXY=m +CONFIG_NETFILTER_XT_TARGET_TRACE=m +CONFIG_NETFILTER_XT_TARGET_SECMARK=m +CONFIG_NETFILTER_XT_TARGET_TCPMSS=m +CONFIG_NETFILTER_XT_MATCH_CLUSTER=m +CONFIG_NETFILTER_XT_MATCH_COMMENT=m +CONFIG_NETFILTER_XT_MATCH_CONNBYTES=m +CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=m +CONFIG_NETFILTER_XT_MATCH_CONNMARK=m +CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m +CONFIG_NETFILTER_XT_MATCH_DSCP=m +CONFIG_NETFILTER_XT_MATCH_ESP=m +CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=m +CONFIG_NETFILTER_XT_MATCH_HELPER=m +CONFIG_NETFILTER_XT_MATCH_IPRANGE=m +CONFIG_NETFILTER_XT_MATCH_LENGTH=m +CONFIG_NETFILTER_XT_MATCH_LIMIT=m +CONFIG_NETFILTER_XT_MATCH_MAC=m +CONFIG_NETFILTER_XT_MATCH_MARK=m +CONFIG_NETFILTER_XT_MATCH_MULTIPORT=m +CONFIG_NETFILTER_XT_MATCH_OSF=m +CONFIG_NETFILTER_XT_MATCH_OWNER=m +CONFIG_NETFILTER_XT_MATCH_POLICY=m +CONFIG_NETFILTER_XT_MATCH_PHYSDEV=m +CONFIG_NETFILTER_XT_MATCH_PKTTYPE=m +CONFIG_NETFILTER_XT_MATCH_QUOTA=m +CONFIG_NETFILTER_XT_MATCH_RATEEST=m +CONFIG_NETFILTER_XT_MATCH_REALM=m +CONFIG_NETFILTER_XT_MATCH_RECENT=m +CONFIG_NETFILTER_XT_MATCH_SOCKET=m +CONFIG_NETFILTER_XT_MATCH_STATE=m +CONFIG_NETFILTER_XT_MATCH_STATISTIC=m +CONFIG_NETFILTER_XT_MATCH_STRING=m +CONFIG_NETFILTER_XT_MATCH_TCPMSS=m +CONFIG_NETFILTER_XT_MATCH_TIME=m +CONFIG_NETFILTER_XT_MATCH_U32=m +CONFIG_IP_VS=m +CONFIG_IP_VS_IPV6=y +CONFIG_IP_VS_PROTO_TCP=y +CONFIG_IP_VS_PROTO_UDP=y +CONFIG_IP_VS_PROTO_ESP=y +CONFIG_IP_VS_PROTO_AH=y +CONFIG_IP_VS_RR=m +CONFIG_IP_VS_WRR=m +CONFIG_IP_VS_LC=m +CONFIG_IP_VS_WLC=m +CONFIG_IP_VS_LBLC=m +CONFIG_IP_VS_LBLCR=m +CONFIG_IP_VS_DH=m +CONFIG_IP_VS_SH=m +CONFIG_IP_VS_SED=m +CONFIG_IP_VS_NQ=m +CONFIG_IP_VS_FTP=m +CONFIG_NF_CONNTRACK_IPV4=m +CONFIG_IP_NF_QUEUE=m +CONFIG_IP_NF_IPTABLES=m +CONFIG_IP_NF_MATCH_AH=m +CONFIG_IP_NF_MATCH_ECN=m +CONFIG_IP_NF_MATCH_TTL=m +CONFIG_IP_NF_FILTER=m +CONFIG_IP_NF_TARGET_REJECT=m +CONFIG_IP_NF_TARGET_LOG=m +CONFIG_IP_NF_TARGET_ULOG=m +CONFIG_NF_NAT=m +CONFIG_IP_NF_TARGET_MASQUERADE=m +CONFIG_IP_NF_TARGET_NETMAP=m +CONFIG_IP_NF_TARGET_REDIRECT=m +CONFIG_IP_NF_MANGLE=m +CONFIG_IP_NF_TARGET_CLUSTERIP=m +CONFIG_IP_NF_TARGET_ECN=m +CONFIG_IP_NF_TARGET_TTL=m +CONFIG_IP_NF_RAW=m +CONFIG_IP_NF_SECURITY=m +CONFIG_IP_NF_ARPTABLES=m +CONFIG_IP_NF_ARPFILTER=m +CONFIG_IP_NF_ARP_MANGLE=m +CONFIG_NF_CONNTRACK_IPV6=m +CONFIG_IP6_NF_QUEUE=m +CONFIG_IP6_NF_IPTABLES=m +CONFIG_IP6_NF_MATCH_AH=m +CONFIG_IP6_NF_MATCH_EUI64=m +CONFIG_IP6_NF_MATCH_FRAG=m +CONFIG_IP6_NF_MATCH_OPTS=m +CONFIG_IP6_NF_MATCH_HL=m +CONFIG_IP6_NF_MATCH_IPV6HEADER=m +CONFIG_IP6_NF_MATCH_MH=m +CONFIG_IP6_NF_MATCH_RT=m +CONFIG_IP6_NF_TARGET_HL=m +CONFIG_IP6_NF_TARGET_LOG=m +CONFIG_IP6_NF_FILTER=m +CONFIG_IP6_NF_TARGET_REJECT=m +CONFIG_IP6_NF_MANGLE=m +CONFIG_IP6_NF_RAW=m +CONFIG_IP6_NF_SECURITY=m +CONFIG_DECNET_NF_GRABULATOR=m +CONFIG_BRIDGE_NF_EBTABLES=m +CONFIG_BRIDGE_EBT_BROUTE=m +CONFIG_BRIDGE_EBT_T_FILTER=m +CONFIG_BRIDGE_EBT_T_NAT=m +CONFIG_BRIDGE_EBT_802_3=m +CONFIG_BRIDGE_EBT_AMONG=m +CONFIG_BRIDGE_EBT_ARP=m +CONFIG_BRIDGE_EBT_IP=m +CONFIG_BRIDGE_EBT_IP6=m +CONFIG_BRIDGE_EBT_LIMIT=m +CONFIG_BRIDGE_EBT_MARK=m +CONFIG_BRIDGE_EBT_PKTTYPE=m +CONFIG_BRIDGE_EBT_STP=m +CONFIG_BRIDGE_EBT_VLAN=m +CONFIG_BRIDGE_EBT_ARPREPLY=m +CONFIG_BRIDGE_EBT_DNAT=m +CONFIG_BRIDGE_EBT_MARK_T=m +CONFIG_BRIDGE_EBT_REDIRECT=m +CONFIG_BRIDGE_EBT_SNAT=m +CONFIG_BRIDGE_EBT_LOG=m +CONFIG_BRIDGE_EBT_ULOG=m +CONFIG_BRIDGE_EBT_NFLOG=m +CONFIG_IP_DCCP=m +CONFIG_RDS=m +CONFIG_RDS_TCP=m +CONFIG_TIPC=m +CONFIG_ATM=m +CONFIG_ATM_CLIP=m +CONFIG_ATM_LANE=m +CONFIG_ATM_MPOA=m +CONFIG_ATM_BR2684=m +CONFIG_BRIDGE=m +CONFIG_VLAN_8021Q=m +CONFIG_VLAN_8021Q_GVRP=y +CONFIG_DECNET=m +CONFIG_LLC2=m +CONFIG_IPX=m +CONFIG_ATALK=m +CONFIG_DEV_APPLETALK=m +CONFIG_IPDDP=m +CONFIG_IPDDP_ENCAP=y +CONFIG_IPDDP_DECAP=y +CONFIG_X25=m +CONFIG_LAPB=m +CONFIG_ECONET=m +CONFIG_ECONET_AUNUDP=y +CONFIG_ECONET_NATIVE=y +CONFIG_WAN_ROUTER=m +CONFIG_PHONET=m +CONFIG_IEEE802154=m +CONFIG_NET_SCHED=y +CONFIG_NET_SCH_CBQ=m +CONFIG_NET_SCH_HTB=m +CONFIG_NET_SCH_HFSC=m +CONFIG_NET_SCH_ATM=m +CONFIG_NET_SCH_PRIO=m +CONFIG_NET_SCH_MULTIQ=m +CONFIG_NET_SCH_RED=m +CONFIG_NET_SCH_SFQ=m +CONFIG_NET_SCH_TEQL=m +CONFIG_NET_SCH_TBF=m +CONFIG_NET_SCH_GRED=m +CONFIG_NET_SCH_DSMARK=m +CONFIG_NET_SCH_NETEM=m +CONFIG_NET_SCH_DRR=m +CONFIG_NET_SCH_INGRESS=m +CONFIG_NET_CLS_BASIC=m +CONFIG_NET_CLS_TCINDEX=m +CONFIG_NET_CLS_ROUTE4=m +CONFIG_NET_CLS_FW=m +CONFIG_NET_CLS_U32=m +CONFIG_CLS_U32_MARK=y +CONFIG_NET_CLS_RSVP=m +CONFIG_NET_CLS_RSVP6=m +CONFIG_NET_CLS_FLOW=m +CONFIG_NET_EMATCH=y +CONFIG_NET_EMATCH_CMP=m +CONFIG_NET_EMATCH_NBYTE=m +CONFIG_NET_EMATCH_U32=m +CONFIG_NET_EMATCH_META=m +CONFIG_NET_EMATCH_TEXT=m +CONFIG_NET_CLS_ACT=y +CONFIG_NET_ACT_POLICE=m +CONFIG_NET_ACT_GACT=m +CONFIG_GACT_PROB=y +CONFIG_NET_ACT_MIRRED=m +CONFIG_NET_ACT_IPT=m +CONFIG_NET_ACT_NAT=m +CONFIG_NET_ACT_PEDIT=m +CONFIG_NET_ACT_SIMP=m +CONFIG_NET_ACT_SKBEDIT=m +CONFIG_DCB=y +CONFIG_NET_PKTGEN=m +# CONFIG_WIRELESS is not set +CONFIG_DEVTMPFS=y +CONFIG_DEVTMPFS_MOUNT=y +# CONFIG_STANDALONE is not set +CONFIG_CONNECTOR=y +CONFIG_MTD=m +CONFIG_BLK_DEV_LOOP=y +CONFIG_BLK_DEV_CRYPTOLOOP=m +CONFIG_BLK_DEV_NBD=m +CONFIG_BLK_DEV_OSD=m +CONFIG_BLK_DEV_RAM=y +CONFIG_BLK_DEV_RAM_SIZE=65536 +CONFIG_CDROM_PKTCDVD=y +CONFIG_MISC_DEVICES=y +CONFIG_RAID_ATTRS=m +CONFIG_SCSI=y +CONFIG_SCSI_TGT=m +CONFIG_BLK_DEV_SD=y +CONFIG_CHR_DEV_ST=m +CONFIG_CHR_DEV_OSST=m +CONFIG_BLK_DEV_SR=y +CONFIG_CHR_DEV_SG=y +CONFIG_CHR_DEV_SCH=m +CONFIG_SCSI_MULTI_LUN=y +CONFIG_SCSI_CONSTANTS=y +CONFIG_SCSI_LOGGING=y +CONFIG_SCSI_SCAN_ASYNC=y +CONFIG_SCSI_SPI_ATTRS=m +CONFIG_SCSI_FC_TGT_ATTRS=y +CONFIG_SCSI_SAS_LIBSAS=m +CONFIG_SCSI_SRP_ATTRS=m +CONFIG_SCSI_SRP_TGT_ATTRS=y +CONFIG_ISCSI_TCP=m +CONFIG_LIBFCOE=m +CONFIG_SCSI_DEBUG=m +CONFIG_SCSI_DH=y +CONFIG_SCSI_DH_RDAC=m +CONFIG_SCSI_DH_HP_SW=m +CONFIG_SCSI_DH_EMC=m +CONFIG_SCSI_DH_ALUA=m +CONFIG_SCSI_OSD_INITIATOR=m +CONFIG_SCSI_OSD_ULD=m +# CONFIG_INPUT_MOUSEDEV is not set +CONFIG_INPUT_EVDEV=y +CONFIG_INPUT_EVBUG=m +# CONFIG_INPUT_KEYBOARD is not set +# CONFIG_INPUT_MOUSE is not set +# CONFIG_SERIO_I8042 is not set +CONFIG_SERIO_SERPORT=m +CONFIG_SERIO_LIBPS2=y +CONFIG_SERIO_RAW=m +CONFIG_VT_HW_CONSOLE_BINDING=y +CONFIG_DEVPTS_MULTIPLE_INSTANCES=y +CONFIG_LEGACY_PTY_COUNT=0 +CONFIG_SERIAL_NONSTANDARD=y +CONFIG_N_HDLC=m +# CONFIG_DEVKMEM is not set +CONFIG_STALDRV=y +CONFIG_SERIAL_8250=y +CONFIG_SERIAL_8250_CONSOLE=y +CONFIG_SERIAL_8250_NR_UARTS=48 +CONFIG_SERIAL_8250_EXTENDED=y +CONFIG_SERIAL_8250_MANY_PORTS=y +CONFIG_SERIAL_8250_SHARE_IRQ=y +CONFIG_SERIAL_8250_RSA=y +CONFIG_HW_RANDOM=y +CONFIG_HW_RANDOM_TIMERIOMEM=m +CONFIG_RAW_DRIVER=m +# CONFIG_HWMON is not set +# CONFIG_VGA_CONSOLE is not set +# CONFIG_HID_SUPPORT is not set +# CONFIG_USB_SUPPORT is not set +CONFIG_UIO=y +CONFIG_UIO_PDRV=m +CONFIG_UIO_PDRV_GENIRQ=m +CONFIG_EXT2_FS=y +CONFIG_EXT2_FS_XATTR=y +CONFIG_EXT2_FS_POSIX_ACL=y +CONFIG_EXT2_FS_SECURITY=y +CONFIG_EXT3_FS=y +CONFIG_EXT3_FS_POSIX_ACL=y +CONFIG_EXT3_FS_SECURITY=y +CONFIG_EXT4_FS=y +CONFIG_EXT4_FS_POSIX_ACL=y +CONFIG_EXT4_FS_SECURITY=y +CONFIG_GFS2_FS=m +CONFIG_GFS2_FS_LOCKING_DLM=y +CONFIG_OCFS2_FS=m +CONFIG_BTRFS_FS=m +CONFIG_BTRFS_FS_POSIX_ACL=y +CONFIG_NILFS2_FS=m +CONFIG_QUOTA_NETLINK_INTERFACE=y +# CONFIG_PRINT_QUOTA_WARNING is not set +CONFIG_QFMT_V1=m +CONFIG_QFMT_V2=m +CONFIG_AUTOFS4_FS=m +CONFIG_FUSE_FS=y +CONFIG_CUSE=m +CONFIG_FSCACHE=m +CONFIG_FSCACHE_STATS=y +CONFIG_FSCACHE_HISTOGRAM=y +CONFIG_CACHEFILES=m +CONFIG_ISO9660_FS=m +CONFIG_JOLIET=y +CONFIG_ZISOFS=y +CONFIG_UDF_FS=m +CONFIG_MSDOS_FS=m +CONFIG_VFAT_FS=m +CONFIG_NTFS_FS=m +CONFIG_PROC_KCORE=y +CONFIG_TMPFS=y +CONFIG_TMPFS_POSIX_ACL=y +CONFIG_CONFIGFS_FS=y +CONFIG_ADFS_FS=m +CONFIG_AFFS_FS=m +CONFIG_ECRYPT_FS=y +CONFIG_HFS_FS=m +CONFIG_HFSPLUS_FS=m +CONFIG_BEFS_FS=m +CONFIG_BFS_FS=m +CONFIG_EFS_FS=m +CONFIG_CRAMFS=m +CONFIG_SQUASHFS=m +CONFIG_VXFS_FS=m +CONFIG_MINIX_FS=m +CONFIG_OMFS_FS=m +CONFIG_HPFS_FS=m +CONFIG_QNX4FS_FS=m +CONFIG_ROMFS_FS=m +CONFIG_SYSV_FS=m +CONFIG_UFS_FS=m +CONFIG_EXOFS_FS=m +CONFIG_NFS_FS=m +CONFIG_NFS_V3=y +CONFIG_NFS_V3_ACL=y +CONFIG_NFS_V4=y +CONFIG_NFS_FSCACHE=y +CONFIG_NFSD=m +CONFIG_NFSD_V3_ACL=y +CONFIG_NFSD_V4=y +CONFIG_CIFS=m +CONFIG_CIFS_WEAK_PW_HASH=y +CONFIG_CIFS_UPCALL=y +CONFIG_CIFS_XATTR=y +CONFIG_CIFS_POSIX=y +CONFIG_CIFS_DFS_UPCALL=y +CONFIG_CIFS_EXPERIMENTAL=y +CONFIG_NCP_FS=m +CONFIG_NCPFS_PACKET_SIGNING=y +CONFIG_NCPFS_IOCTL_LOCKING=y +CONFIG_NCPFS_STRONG=y +CONFIG_NCPFS_NFS_NS=y +CONFIG_NCPFS_OS2_NS=y +CONFIG_NCPFS_NLS=y +CONFIG_NCPFS_EXTRAS=y +CONFIG_CODA_FS=m +CONFIG_AFS_FS=m +CONFIG_PARTITION_ADVANCED=y +CONFIG_ACORN_PARTITION=y +CONFIG_ACORN_PARTITION_ICS=y +CONFIG_ACORN_PARTITION_RISCIX=y +CONFIG_OSF_PARTITION=y +CONFIG_AMIGA_PARTITION=y +CONFIG_ATARI_PARTITION=y +CONFIG_MAC_PARTITION=y +CONFIG_BSD_DISKLABEL=y +CONFIG_MINIX_SUBPARTITION=y +CONFIG_SOLARIS_X86_PARTITION=y +CONFIG_UNIXWARE_DISKLABEL=y +CONFIG_LDM_PARTITION=y +CONFIG_SGI_PARTITION=y +CONFIG_ULTRIX_PARTITION=y +CONFIG_SUN_PARTITION=y +CONFIG_KARMA_PARTITION=y +CONFIG_EFI_PARTITION=y +CONFIG_SYSV68_PARTITION=y +CONFIG_NLS=y +CONFIG_NLS_DEFAULT="cp437" +CONFIG_NLS_CODEPAGE_437=m +CONFIG_NLS_CODEPAGE_737=m +CONFIG_NLS_CODEPAGE_775=m +CONFIG_NLS_CODEPAGE_850=m +CONFIG_NLS_CODEPAGE_852=m +CONFIG_NLS_CODEPAGE_855=m +CONFIG_NLS_CODEPAGE_857=m +CONFIG_NLS_CODEPAGE_860=m +CONFIG_NLS_CODEPAGE_861=m +CONFIG_NLS_CODEPAGE_862=m +CONFIG_NLS_CODEPAGE_863=m +CONFIG_NLS_CODEPAGE_864=m +CONFIG_NLS_CODEPAGE_865=m +CONFIG_NLS_CODEPAGE_866=m +CONFIG_NLS_CODEPAGE_869=m +CONFIG_NLS_CODEPAGE_936=m +CONFIG_NLS_CODEPAGE_950=m +CONFIG_NLS_CODEPAGE_932=m +CONFIG_NLS_CODEPAGE_949=m +CONFIG_NLS_CODEPAGE_874=m +CONFIG_NLS_ISO8859_8=m +CONFIG_NLS_CODEPAGE_1250=m +CONFIG_NLS_CODEPAGE_1251=m +CONFIG_NLS_ASCII=m +CONFIG_NLS_ISO8859_1=m +CONFIG_NLS_ISO8859_2=m +CONFIG_NLS_ISO8859_3=m +CONFIG_NLS_ISO8859_4=m +CONFIG_NLS_ISO8859_5=m +CONFIG_NLS_ISO8859_6=m +CONFIG_NLS_ISO8859_7=m +CONFIG_NLS_ISO8859_9=m +CONFIG_NLS_ISO8859_13=m +CONFIG_NLS_ISO8859_14=m +CONFIG_NLS_ISO8859_15=m +CONFIG_NLS_KOI8_R=m +CONFIG_NLS_KOI8_U=m +CONFIG_PRINTK_TIME=y +# CONFIG_ENABLE_WARN_DEPRECATED is not set +# CONFIG_ENABLE_MUST_CHECK is not set +CONFIG_UNUSED_SYMBOLS=y +CONFIG_DEBUG_KERNEL=y +CONFIG_DETECT_HUNG_TASK=y +CONFIG_SCHEDSTATS=y +CONFIG_TIMER_STATS=y +CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_MEMORY_INIT=y +CONFIG_SYSCTL_SYSCALL_CHECK=y +CONFIG_SCHED_TRACER=y +CONFIG_BLK_DEV_IO_TRACE=y +CONFIG_KGDB=y +CONFIG_SECURITY=y +CONFIG_SECURITY_NETWORK=y +CONFIG_LSM_MMAP_MIN_ADDR=0 +CONFIG_SECURITY_SELINUX=y +CONFIG_SECURITY_SELINUX_BOOTPARAM=y +CONFIG_SECURITY_SELINUX_BOOTPARAM_VALUE=0 +CONFIG_SECURITY_SELINUX_DISABLE=y +CONFIG_SECURITY_SMACK=y +CONFIG_SECURITY_TOMOYO=y +CONFIG_CRYPTO_NULL=m +CONFIG_CRYPTO_CRYPTD=m +CONFIG_CRYPTO_TEST=m +CONFIG_CRYPTO_CCM=m +CONFIG_CRYPTO_GCM=m +CONFIG_CRYPTO_CTS=m +CONFIG_CRYPTO_LRW=m +CONFIG_CRYPTO_PCBC=m +CONFIG_CRYPTO_XTS=m +CONFIG_CRYPTO_HMAC=y +CONFIG_CRYPTO_XCBC=m +CONFIG_CRYPTO_VMAC=m +CONFIG_CRYPTO_MICHAEL_MIC=m +CONFIG_CRYPTO_RMD128=m +CONFIG_CRYPTO_RMD160=m +CONFIG_CRYPTO_RMD256=m +CONFIG_CRYPTO_RMD320=m +CONFIG_CRYPTO_SHA256=m +CONFIG_CRYPTO_SHA512=m +CONFIG_CRYPTO_TGR192=m +CONFIG_CRYPTO_WP512=m +CONFIG_CRYPTO_ANUBIS=m +CONFIG_CRYPTO_BLOWFISH=m +CONFIG_CRYPTO_CAMELLIA=m +CONFIG_CRYPTO_CAST5=m +CONFIG_CRYPTO_CAST6=m +CONFIG_CRYPTO_FCRYPT=m +CONFIG_CRYPTO_KHAZAD=m +CONFIG_CRYPTO_SALSA20=m +CONFIG_CRYPTO_SEED=m +CONFIG_CRYPTO_SERPENT=m +CONFIG_CRYPTO_TEA=m +CONFIG_CRYPTO_TWOFISH=m +CONFIG_CRYPTO_ZLIB=m +CONFIG_CRYPTO_LZO=m +CONFIG_CRC_CCITT=m +CONFIG_CRC7=m -- cgit v1.1 From 9b130f8004e51c65b20b0f0e17cdee073a719047 Mon Sep 17 00:00:00 2001 From: Jayachandran C Date: Sat, 7 May 2011 01:37:31 +0530 Subject: MIPS: XLR, XLS: Add PCI support. Adds pci/pci-xlr.c to support for XLR PCI/PCI-X interface and XLS PCIe interface. Update irq.c to ack PCI interrupts, use irq handler data to do the PCI/PCIe bus ack. Signed-off-by: Jayachandran C Cc: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/2337/ Signed-off-by: Ralf Baechle --- arch/mips/include/asm/netlogic/xlr/xlr.h | 21 +++ arch/mips/netlogic/xlr/irq.c | 86 ++++++++++++- arch/mips/pci/Makefile | 1 + arch/mips/pci/pci-xlr.c | 214 +++++++++++++++++++++++++++++++ 4 files changed, 321 insertions(+), 1 deletion(-) create mode 100644 arch/mips/pci/pci-xlr.c diff --git a/arch/mips/include/asm/netlogic/xlr/xlr.h b/arch/mips/include/asm/netlogic/xlr/xlr.h index 454c236..3e63726 100644 --- a/arch/mips/include/asm/netlogic/xlr/xlr.h +++ b/arch/mips/include/asm/netlogic/xlr/xlr.h @@ -41,6 +41,7 @@ unsigned int nlm_xlr_uart_in(struct uart_port *, int); void nlm_xlr_uart_out(struct uart_port *, int, int); /* SMP support functions */ +struct irq_desc; void nlm_smp_function_ipi_handler(unsigned int irq, struct irq_desc *desc); void nlm_smp_resched_ipi_handler(unsigned int irq, struct irq_desc *desc); int nlm_wakeup_secondary_cpus(u32 wakeup_mask); @@ -51,4 +52,24 @@ void prom_pre_boot_secondary_cpus(void); extern struct plat_smp_ops nlm_smp_ops; extern unsigned long nlm_common_ebase; +/* XLS B silicon "Rook" */ +static inline unsigned int nlm_chip_is_xls_b(void) +{ + uint32_t prid = read_c0_prid(); + + return ((prid & 0xf000) == 0x4000); +} + +/* + * XLR chip types + */ + /* The XLS product line has chip versions 0x[48c]? */ +static inline unsigned int nlm_chip_is_xls(void) +{ + uint32_t prid = read_c0_prid(); + + return ((prid & 0xf000) == 0x8000 || (prid & 0xf000) == 0x4000 || + (prid & 0xf000) == 0xc000); +} + #endif /* _ASM_NLM_XLR_H */ diff --git a/arch/mips/netlogic/xlr/irq.c b/arch/mips/netlogic/xlr/irq.c index 2033f56..1446d58e 100644 --- a/arch/mips/netlogic/xlr/irq.c +++ b/arch/mips/netlogic/xlr/irq.c @@ -83,14 +83,71 @@ static void xlr_pic_mask(struct irq_data *d) spin_unlock_irqrestore(&nlm_pic_lock, flags); } +#ifdef CONFIG_PCI +/* Extra ACK needed for XLR on chip PCI controller */ +static void xlr_pci_ack(struct irq_data *d) +{ + nlm_reg_t *pci_mmio = netlogic_io_mmio(NETLOGIC_IO_PCIX_OFFSET); + + netlogic_read_reg(pci_mmio, (0x140 >> 2)); +} + +/* Extra ACK needed for XLS on chip PCIe controller */ +static void xls_pcie_ack(struct irq_data *d) +{ + nlm_reg_t *pcie_mmio_le = netlogic_io_mmio(NETLOGIC_IO_PCIE_1_OFFSET); + + switch (d->irq) { + case PIC_PCIE_LINK0_IRQ: + netlogic_write_reg(pcie_mmio_le, (0x90 >> 2), 0xffffffff); + break; + case PIC_PCIE_LINK1_IRQ: + netlogic_write_reg(pcie_mmio_le, (0x94 >> 2), 0xffffffff); + break; + case PIC_PCIE_LINK2_IRQ: + netlogic_write_reg(pcie_mmio_le, (0x190 >> 2), 0xffffffff); + break; + case PIC_PCIE_LINK3_IRQ: + netlogic_write_reg(pcie_mmio_le, (0x194 >> 2), 0xffffffff); + break; + } +} + +/* For XLS B silicon, the 3,4 PCI interrupts are different */ +static void xls_pcie_ack_b(struct irq_data *d) +{ + nlm_reg_t *pcie_mmio_le = netlogic_io_mmio(NETLOGIC_IO_PCIE_1_OFFSET); + + switch (d->irq) { + case PIC_PCIE_LINK0_IRQ: + netlogic_write_reg(pcie_mmio_le, (0x90 >> 2), 0xffffffff); + break; + case PIC_PCIE_LINK1_IRQ: + netlogic_write_reg(pcie_mmio_le, (0x94 >> 2), 0xffffffff); + break; + case PIC_PCIE_XLSB0_LINK2_IRQ: + netlogic_write_reg(pcie_mmio_le, (0x190 >> 2), 0xffffffff); + break; + case PIC_PCIE_XLSB0_LINK3_IRQ: + netlogic_write_reg(pcie_mmio_le, (0x194 >> 2), 0xffffffff); + break; + } +} +#endif + static void xlr_pic_ack(struct irq_data *d) { unsigned long flags; nlm_reg_t *mmio; int irq = d->irq; + void *hd = irq_data_get_irq_handler_data(d); WARN(!PIC_IRQ_IS_IRT(irq), "Bad irq %d", irq); + if (hd) { + void (*extra_ack)(void *) = hd; + extra_ack(d); + } mmio = netlogic_io_mmio(NETLOGIC_IO_PIC_OFFSET); spin_lock_irqsave(&nlm_pic_lock, flags); netlogic_write_reg(mmio, PIC_INT_ACK, (1 << (irq - PIC_IRQ_BASE))); @@ -162,6 +219,33 @@ void __init init_xlr_irqs(void) nlm_irq_mask |= ((1ULL << IRQ_IPI_SMP_FUNCTION) | (1ULL << IRQ_IPI_SMP_RESCHEDULE)); #endif + +#ifdef CONFIG_PCI + /* + * For PCI interrupts, we need to ack the PIC controller too, overload + * irq handler data to do this + */ + if (nlm_chip_is_xls()) { + if (nlm_chip_is_xls_b()) { + irq_set_handler_data(PIC_PCIE_LINK0_IRQ, + xls_pcie_ack_b); + irq_set_handler_data(PIC_PCIE_LINK1_IRQ, + xls_pcie_ack_b); + irq_set_handler_data(PIC_PCIE_XLSB0_LINK2_IRQ, + xls_pcie_ack_b); + irq_set_handler_data(PIC_PCIE_XLSB0_LINK3_IRQ, + xls_pcie_ack_b); + } else { + irq_set_handler_data(PIC_PCIE_LINK0_IRQ, xls_pcie_ack); + irq_set_handler_data(PIC_PCIE_LINK1_IRQ, xls_pcie_ack); + irq_set_handler_data(PIC_PCIE_LINK2_IRQ, xls_pcie_ack); + irq_set_handler_data(PIC_PCIE_LINK3_IRQ, xls_pcie_ack); + } + } else { + /* XLR PCI controller ACK */ + irq_set_handler_data(PIC_PCIE_XLSB0_LINK3_IRQ, xlr_pci_ack); + } +#endif /* unmask all PIC related interrupts. If no handler is installed by the * drivers, it'll just ack the interrupt and return */ @@ -199,7 +283,7 @@ asmlinkage void plat_irq_dispatch(void) return; } - /* TODO use dcltz: optimize below code */ + /* use dcltz: optimize below code */ for (i = 63; i != -1; i--) { if (eirr & (1ULL << i)) break; diff --git a/arch/mips/pci/Makefile b/arch/mips/pci/Makefile index c9209ca..f0d5329 100644 --- a/arch/mips/pci/Makefile +++ b/arch/mips/pci/Makefile @@ -55,6 +55,7 @@ obj-$(CONFIG_ZAO_CAPCELLA) += fixup-capcella.o obj-$(CONFIG_WR_PPMC) += fixup-wrppmc.o obj-$(CONFIG_MIKROTIK_RB532) += pci-rc32434.o ops-rc32434.o fixup-rc32434.o obj-$(CONFIG_CPU_CAVIUM_OCTEON) += pci-octeon.o pcie-octeon.o +obj-$(CONFIG_NLM_XLR) += pci-xlr.o ifdef CONFIG_PCI_MSI obj-$(CONFIG_CPU_CAVIUM_OCTEON) += msi-octeon.o diff --git a/arch/mips/pci/pci-xlr.c b/arch/mips/pci/pci-xlr.c new file mode 100644 index 0000000..38fece1 --- /dev/null +++ b/arch/mips/pci/pci-xlr.c @@ -0,0 +1,214 @@ +/* + * Copyright 2003-2011 NetLogic Microsystems, Inc. (NetLogic). All rights + * reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the NetLogic + * license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETLOGIC ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETLOGIC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN + * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + +static void *pci_config_base; + +#define pci_cfg_addr(bus, devfn, off) (((bus) << 16) | ((devfn) << 8) | (off)) + +/* PCI ops */ +static inline u32 pci_cfg_read_32bit(struct pci_bus *bus, unsigned int devfn, + int where) +{ + u32 data; + u32 *cfgaddr; + + cfgaddr = (u32 *)(pci_config_base + + pci_cfg_addr(bus->number, devfn, where & ~3)); + data = *cfgaddr; + return cpu_to_le32(data); +} + +static inline void pci_cfg_write_32bit(struct pci_bus *bus, unsigned int devfn, + int where, u32 data) +{ + u32 *cfgaddr; + + cfgaddr = (u32 *)(pci_config_base + + pci_cfg_addr(bus->number, devfn, where & ~3)); + *cfgaddr = cpu_to_le32(data); +} + +static int nlm_pcibios_read(struct pci_bus *bus, unsigned int devfn, + int where, int size, u32 *val) +{ + u32 data; + + if ((size == 2) && (where & 1)) + return PCIBIOS_BAD_REGISTER_NUMBER; + else if ((size == 4) && (where & 3)) + return PCIBIOS_BAD_REGISTER_NUMBER; + + data = pci_cfg_read_32bit(bus, devfn, where); + + if (size == 1) + *val = (data >> ((where & 3) << 3)) & 0xff; + else if (size == 2) + *val = (data >> ((where & 3) << 3)) & 0xffff; + else + *val = data; + + return PCIBIOS_SUCCESSFUL; +} + + +static int nlm_pcibios_write(struct pci_bus *bus, unsigned int devfn, + int where, int size, u32 val) +{ + u32 data; + + if ((size == 2) && (where & 1)) + return PCIBIOS_BAD_REGISTER_NUMBER; + else if ((size == 4) && (where & 3)) + return PCIBIOS_BAD_REGISTER_NUMBER; + + data = pci_cfg_read_32bit(bus, devfn, where); + + if (size == 1) + data = (data & ~(0xff << ((where & 3) << 3))) | + (val << ((where & 3) << 3)); + else if (size == 2) + data = (data & ~(0xffff << ((where & 3) << 3))) | + (val << ((where & 3) << 3)); + else + data = val; + + pci_cfg_write_32bit(bus, devfn, where, data); + + return PCIBIOS_SUCCESSFUL; +} + +struct pci_ops nlm_pci_ops = { + .read = nlm_pcibios_read, + .write = nlm_pcibios_write +}; + +static struct resource nlm_pci_mem_resource = { + .name = "XLR PCI MEM", + .start = 0xd0000000UL, /* 256MB PCI mem @ 0xd000_0000 */ + .end = 0xdfffffffUL, + .flags = IORESOURCE_MEM, +}; + +static struct resource nlm_pci_io_resource = { + .name = "XLR IO MEM", + .start = 0x10000000UL, /* 16MB PCI IO @ 0x1000_0000 */ + .end = 0x100fffffUL, + .flags = IORESOURCE_IO, +}; + +struct pci_controller nlm_pci_controller = { + .index = 0, + .pci_ops = &nlm_pci_ops, + .mem_resource = &nlm_pci_mem_resource, + .mem_offset = 0x00000000UL, + .io_resource = &nlm_pci_io_resource, + .io_offset = 0x00000000UL, +}; + +int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +{ + if (!nlm_chip_is_xls()) + return PIC_PCIX_IRQ; /* for XLR just one IRQ*/ + + /* + * For XLS PCIe, there is an IRQ per Link, find out which + * link the device is on to assign interrupts + */ + if (dev->bus->self == NULL) + return 0; + + switch (dev->bus->self->devfn) { + case 0x0: + return PIC_PCIE_LINK0_IRQ; + case 0x8: + return PIC_PCIE_LINK1_IRQ; + case 0x10: + if (nlm_chip_is_xls_b()) + return PIC_PCIE_XLSB0_LINK2_IRQ; + else + return PIC_PCIE_LINK2_IRQ; + case 0x18: + if (nlm_chip_is_xls_b()) + return PIC_PCIE_XLSB0_LINK3_IRQ; + else + return PIC_PCIE_LINK3_IRQ; + } + WARN(1, "Unexpected devfn %d\n", dev->bus->self->devfn); + return 0; +} + +/* Do platform specific device initialization at pci_enable_device() time */ +int pcibios_plat_dev_init(struct pci_dev *dev) +{ + return 0; +} + +static int __init pcibios_init(void) +{ + /* PSB assigns PCI resources */ + pci_probe_only = 1; + pci_config_base = ioremap(DEFAULT_PCI_CONFIG_BASE, 16 << 20); + + /* Extend IO port for memory mapped io */ + ioport_resource.start = 0; + ioport_resource.end = ~0; + + set_io_port_base(CKSEG1); + nlm_pci_controller.io_map_base = CKSEG1; + + pr_info("Registering XLR/XLS PCIX/PCIE Controller.\n"); + register_pci_controller(&nlm_pci_controller); + + return 0; +} + +arch_initcall(pcibios_init); + +struct pci_fixup pcibios_fixups[] = { + {0} +}; -- cgit v1.1 From c0a5afb9bcf6b5aa5685e4fcf1282cad5fab3d91 Mon Sep 17 00:00:00 2001 From: Maxin John Date: Tue, 29 Mar 2011 00:15:55 +0300 Subject: MIPS: Enable kmemleak for MIPS Signed-off-by: Maxin B. John To: Catalin Marinas Cc: Daniel Baluta Cc: naveen yadav Cc: linux-mips@linux-mips.org Cc: linux-kernel@vger.kernel.org Cc: linux-mm@kvack.org Patchwork: https://patchwork.linux-mips.org/patch/2244/ Signed-off-by: Ralf Baechle --- arch/mips/kernel/vmlinux.lds.S | 1 + lib/Kconfig.debug | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/mips/kernel/vmlinux.lds.S b/arch/mips/kernel/vmlinux.lds.S index e4b0b0b..cd2ca54 100644 --- a/arch/mips/kernel/vmlinux.lds.S +++ b/arch/mips/kernel/vmlinux.lds.S @@ -68,6 +68,7 @@ SECTIONS RODATA /* writeable */ + _sdata = .; /* Start of data section */ .data : { /* Data */ . = . + DATAOFFSET; /* for CONFIG_MAPPED_KERNEL */ diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index c768bcd..f0aa00b 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -398,7 +398,7 @@ config SLUB_STATS config DEBUG_KMEMLEAK bool "Kernel memory leak detector" depends on DEBUG_KERNEL && EXPERIMENTAL && !MEMORY_HOTPLUG && \ - (X86 || ARM || PPC || S390 || SPARC64 || SUPERH || MICROBLAZE || TILE) + (X86 || ARM || PPC || MIPS || S390 || SPARC64 || SUPERH || MICROBLAZE || TILE) select DEBUG_FS if SYSFS select STACKTRACE if STACKTRACE_SUPPORT -- cgit v1.1 From 171bb2f19ed6f3627f4f783f658f2f475b2fbd50 Mon Sep 17 00:00:00 2001 From: John Crispin Date: Wed, 30 Mar 2011 09:27:47 +0200 Subject: MIPS: Lantiq: Add initial support for Lantiq SoCs Add initial support for Mips based SoCs made by Lantiq. This series will add support for the XWAY family. The series allows booting a minimal system using a initramfs or NOR. Missing drivers and support for Amazon and GPON family will be provided in a later series. [Ralf: Remove some cargo cult programming and fixed formatting.] Signed-off-by: John Crispin Signed-off-by: Ralph Hempel Signed-off-by: David Daney Cc: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/2252/ Patchwork: https://patchwork.linux-mips.org/patch/2371/ Signed-off-by: Ralf Baechle --- arch/mips/Kbuild.platforms | 1 + arch/mips/Kconfig | 17 ++ arch/mips/include/asm/mach-lantiq/lantiq.h | 63 ++++++ arch/mips/include/asm/mach-lantiq/war.h | 24 +++ arch/mips/lantiq/Makefile | 9 + arch/mips/lantiq/Platform | 7 + arch/mips/lantiq/clk.c | 140 +++++++++++++ arch/mips/lantiq/clk.h | 18 ++ arch/mips/lantiq/early_printk.c | 33 +++ arch/mips/lantiq/irq.c | 326 +++++++++++++++++++++++++++++ arch/mips/lantiq/prom.c | 71 +++++++ arch/mips/lantiq/prom.h | 24 +++ arch/mips/lantiq/setup.c | 41 ++++ 13 files changed, 774 insertions(+) create mode 100644 arch/mips/include/asm/mach-lantiq/lantiq.h create mode 100644 arch/mips/include/asm/mach-lantiq/war.h create mode 100644 arch/mips/lantiq/Makefile create mode 100644 arch/mips/lantiq/Platform create mode 100644 arch/mips/lantiq/clk.c create mode 100644 arch/mips/lantiq/clk.h create mode 100644 arch/mips/lantiq/early_printk.c create mode 100644 arch/mips/lantiq/irq.c create mode 100644 arch/mips/lantiq/prom.c create mode 100644 arch/mips/lantiq/prom.h create mode 100644 arch/mips/lantiq/setup.c diff --git a/arch/mips/Kbuild.platforms b/arch/mips/Kbuild.platforms index 7ff9b54..aef6c91 100644 --- a/arch/mips/Kbuild.platforms +++ b/arch/mips/Kbuild.platforms @@ -11,6 +11,7 @@ platforms += dec platforms += emma platforms += jazz platforms += jz4740 +platforms += lantiq platforms += lasat platforms += loongson platforms += mipssim diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 5016caa..1787572 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -212,6 +212,23 @@ config MACH_JZ4740 select HAVE_PWM select HAVE_CLK +config LANTIQ + bool "Lantiq based platforms" + select DMA_NONCOHERENT + select IRQ_CPU + select CEVT_R4K + select CSRC_R4K + select SYS_HAS_CPU_MIPS32_R1 + select SYS_HAS_CPU_MIPS32_R2 + select SYS_SUPPORTS_BIG_ENDIAN + select SYS_SUPPORTS_32BIT_KERNEL + select SYS_SUPPORTS_MULTITHREADING + select SYS_HAS_EARLY_PRINTK + select ARCH_REQUIRE_GPIOLIB + select SWAP_IO_SPACE + select BOOT_RAW + select HAVE_CLK + config LASAT bool "LASAT Networks platforms" select CEVT_R4K diff --git a/arch/mips/include/asm/mach-lantiq/lantiq.h b/arch/mips/include/asm/mach-lantiq/lantiq.h new file mode 100644 index 0000000..ce2f029 --- /dev/null +++ b/arch/mips/include/asm/mach-lantiq/lantiq.h @@ -0,0 +1,63 @@ +/* + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * Copyright (C) 2010 John Crispin + */ +#ifndef _LANTIQ_H__ +#define _LANTIQ_H__ + +#include + +/* generic reg access functions */ +#define ltq_r32(reg) __raw_readl(reg) +#define ltq_w32(val, reg) __raw_writel(val, reg) +#define ltq_w32_mask(clear, set, reg) \ + ltq_w32((ltq_r32(reg) & ~(clear)) | (set), reg) +#define ltq_r8(reg) __raw_readb(reg) +#define ltq_w8(val, reg) __raw_writeb(val, reg) + +/* register access macros for EBU and CGU */ +#define ltq_ebu_w32(x, y) ltq_w32((x), ltq_ebu_membase + (y)) +#define ltq_ebu_r32(x) ltq_r32(ltq_ebu_membase + (x)) +#define ltq_cgu_w32(x, y) ltq_w32((x), ltq_cgu_membase + (y)) +#define ltq_cgu_r32(x) ltq_r32(ltq_cgu_membase + (x)) + +extern __iomem void *ltq_ebu_membase; +extern __iomem void *ltq_cgu_membase; + +extern unsigned int ltq_get_cpu_ver(void); +extern unsigned int ltq_get_soc_type(void); + +/* clock speeds */ +#define CLOCK_60M 60000000 +#define CLOCK_83M 83333333 +#define CLOCK_111M 111111111 +#define CLOCK_133M 133333333 +#define CLOCK_167M 166666667 +#define CLOCK_200M 200000000 +#define CLOCK_266M 266666666 +#define CLOCK_333M 333333333 +#define CLOCK_400M 400000000 + +/* spinlock all ebu i/o */ +extern spinlock_t ebu_lock; + +/* some irq helpers */ +extern void ltq_disable_irq(struct irq_data *data); +extern void ltq_mask_and_ack_irq(struct irq_data *data); +extern void ltq_enable_irq(struct irq_data *data); + +/* find out what caused the last cpu reset */ +extern int ltq_reset_cause(void); +#define LTQ_RST_CAUSE_WDTRST 0x20 + +#define IOPORT_RESOURCE_START 0x10000000 +#define IOPORT_RESOURCE_END 0xffffffff +#define IOMEM_RESOURCE_START 0x10000000 +#define IOMEM_RESOURCE_END 0xffffffff +#define LTQ_FLASH_START 0x10000000 +#define LTQ_FLASH_MAX 0x04000000 + +#endif diff --git a/arch/mips/include/asm/mach-lantiq/war.h b/arch/mips/include/asm/mach-lantiq/war.h new file mode 100644 index 0000000..01b08ef --- /dev/null +++ b/arch/mips/include/asm/mach-lantiq/war.h @@ -0,0 +1,24 @@ +/* + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + */ +#ifndef __ASM_MIPS_MACH_LANTIQ_WAR_H +#define __ASM_MIPS_MACH_LANTIQ_WAR_H + +#define R4600_V1_INDEX_ICACHEOP_WAR 0 +#define R4600_V1_HIT_CACHEOP_WAR 0 +#define R4600_V2_HIT_CACHEOP_WAR 0 +#define R5432_CP0_INTERRUPT_WAR 0 +#define BCM1250_M3_WAR 0 +#define SIBYTE_1956_WAR 0 +#define MIPS4K_ICACHE_REFILL_WAR 0 +#define MIPS_CACHE_SYNC_WAR 0 +#define TX49XX_ICACHE_INDEX_INV_WAR 0 +#define RM9000_CDEX_SMP_WAR 0 +#define ICACHE_REFILLS_WORKAROUND_WAR 0 +#define R10000_LLSC_WAR 0 +#define MIPS34K_MISSED_ITLB_WAR 0 + +#endif diff --git a/arch/mips/lantiq/Makefile b/arch/mips/lantiq/Makefile new file mode 100644 index 0000000..6a30de6 --- /dev/null +++ b/arch/mips/lantiq/Makefile @@ -0,0 +1,9 @@ +# Copyright (C) 2010 John Crispin +# +# This program is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License version 2 as published +# by the Free Software Foundation. + +obj-y := irq.o setup.o clk.o prom.o + +obj-$(CONFIG_EARLY_PRINTK) += early_printk.o diff --git a/arch/mips/lantiq/Platform b/arch/mips/lantiq/Platform new file mode 100644 index 0000000..eef587f --- /dev/null +++ b/arch/mips/lantiq/Platform @@ -0,0 +1,7 @@ +# +# Lantiq +# + +platform-$(CONFIG_LANTIQ) += lantiq/ +cflags-$(CONFIG_LANTIQ) += -I$(srctree)/arch/mips/include/asm/mach-lantiq +load-$(CONFIG_LANTIQ) = 0xffffffff80002000 diff --git a/arch/mips/lantiq/clk.c b/arch/mips/lantiq/clk.c new file mode 100644 index 0000000..9456089 --- /dev/null +++ b/arch/mips/lantiq/clk.c @@ -0,0 +1,140 @@ +/* + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * Copyright (C) 2010 Thomas Langer + * Copyright (C) 2010 John Crispin + */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +#include "clk.h" + +struct clk { + const char *name; + unsigned long rate; + unsigned long (*get_rate) (void); +}; + +static struct clk *cpu_clk; +static int cpu_clk_cnt; + +/* lantiq socs have 3 static clocks */ +static struct clk cpu_clk_generic[] = { + { + .name = "cpu", + .get_rate = ltq_get_cpu_hz, + }, { + .name = "fpi", + .get_rate = ltq_get_fpi_hz, + }, { + .name = "io", + .get_rate = ltq_get_io_region_clock, + }, +}; + +static struct resource ltq_cgu_resource = { + .name = "cgu", + .start = LTQ_CGU_BASE_ADDR, + .end = LTQ_CGU_BASE_ADDR + LTQ_CGU_SIZE - 1, + .flags = IORESOURCE_MEM, +}; + +/* remapped clock register range */ +void __iomem *ltq_cgu_membase; + +void clk_init(void) +{ + cpu_clk = cpu_clk_generic; + cpu_clk_cnt = ARRAY_SIZE(cpu_clk_generic); +} + +static inline int clk_good(struct clk *clk) +{ + return clk && !IS_ERR(clk); +} + +unsigned long clk_get_rate(struct clk *clk) +{ + if (unlikely(!clk_good(clk))) + return 0; + + if (clk->rate != 0) + return clk->rate; + + if (clk->get_rate != NULL) + return clk->get_rate(); + + return 0; +} +EXPORT_SYMBOL(clk_get_rate); + +struct clk *clk_get(struct device *dev, const char *id) +{ + int i; + + for (i = 0; i < cpu_clk_cnt; i++) + if (!strcmp(id, cpu_clk[i].name)) + return &cpu_clk[i]; + BUG(); + return ERR_PTR(-ENOENT); +} +EXPORT_SYMBOL(clk_get); + +void clk_put(struct clk *clk) +{ + /* not used */ +} +EXPORT_SYMBOL(clk_put); + +static inline u32 ltq_get_counter_resolution(void) +{ + u32 res; + + __asm__ __volatile__( + ".set push\n" + ".set mips32r2\n" + "rdhwr %0, $3\n" + ".set pop\n" + : "=&r" (res) + : /* no input */ + : "memory"); + + return res; +} + +void __init plat_time_init(void) +{ + struct clk *clk; + + if (insert_resource(&iomem_resource, <q_cgu_resource) < 0) + panic("Failed to insert cgu memory\n"); + + if (request_mem_region(ltq_cgu_resource.start, + resource_size(<q_cgu_resource), "cgu") < 0) + panic("Failed to request cgu memory\n"); + + ltq_cgu_membase = ioremap_nocache(ltq_cgu_resource.start, + resource_size(<q_cgu_resource)); + if (!ltq_cgu_membase) { + pr_err("Failed to remap cgu memory\n"); + unreachable(); + } + clk = clk_get(0, "cpu"); + mips_hpt_frequency = clk_get_rate(clk) / ltq_get_counter_resolution(); + write_c0_compare(read_c0_count()); + clk_put(clk); +} diff --git a/arch/mips/lantiq/clk.h b/arch/mips/lantiq/clk.h new file mode 100644 index 0000000..3328925 --- /dev/null +++ b/arch/mips/lantiq/clk.h @@ -0,0 +1,18 @@ +/* + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * Copyright (C) 2010 John Crispin + */ + +#ifndef _LTQ_CLK_H__ +#define _LTQ_CLK_H__ + +extern void clk_init(void); + +extern unsigned long ltq_get_cpu_hz(void); +extern unsigned long ltq_get_fpi_hz(void); +extern unsigned long ltq_get_io_region_clock(void); + +#endif diff --git a/arch/mips/lantiq/early_printk.c b/arch/mips/lantiq/early_printk.c new file mode 100644 index 0000000..972e05f --- /dev/null +++ b/arch/mips/lantiq/early_printk.c @@ -0,0 +1,33 @@ +/* + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * Copyright (C) 2010 John Crispin + */ + +#include +#include + +#include +#include + +/* no ioremap possible at this early stage, lets use KSEG1 instead */ +#define LTQ_ASC_BASE KSEG1ADDR(LTQ_ASC1_BASE_ADDR) +#define ASC_BUF 1024 +#define LTQ_ASC_FSTAT ((u32 *)(LTQ_ASC_BASE + 0x0048)) +#define LTQ_ASC_TBUF ((u32 *)(LTQ_ASC_BASE + 0x0020)) +#define TXMASK 0x3F00 +#define TXOFFSET 8 + +void prom_putchar(char c) +{ + unsigned long flags; + + local_irq_save(flags); + do { } while ((ltq_r32(LTQ_ASC_FSTAT) & TXMASK) >> TXOFFSET); + if (c == '\n') + ltq_w32('\r', LTQ_ASC_TBUF); + ltq_w32(c, LTQ_ASC_TBUF); + local_irq_restore(flags); +} diff --git a/arch/mips/lantiq/irq.c b/arch/mips/lantiq/irq.c new file mode 100644 index 0000000..fc89795 --- /dev/null +++ b/arch/mips/lantiq/irq.c @@ -0,0 +1,326 @@ +/* + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * Copyright (C) 2010 John Crispin + * Copyright (C) 2010 Thomas Langer + */ + +#include +#include + +#include +#include + +#include +#include + +/* register definitions */ +#define LTQ_ICU_IM0_ISR 0x0000 +#define LTQ_ICU_IM0_IER 0x0008 +#define LTQ_ICU_IM0_IOSR 0x0010 +#define LTQ_ICU_IM0_IRSR 0x0018 +#define LTQ_ICU_IM0_IMR 0x0020 +#define LTQ_ICU_IM1_ISR 0x0028 +#define LTQ_ICU_OFFSET (LTQ_ICU_IM1_ISR - LTQ_ICU_IM0_ISR) + +#define LTQ_EIU_EXIN_C 0x0000 +#define LTQ_EIU_EXIN_INIC 0x0004 +#define LTQ_EIU_EXIN_INEN 0x000C + +/* irq numbers used by the external interrupt unit (EIU) */ +#define LTQ_EIU_IR0 (INT_NUM_IM4_IRL0 + 30) +#define LTQ_EIU_IR1 (INT_NUM_IM3_IRL0 + 31) +#define LTQ_EIU_IR2 (INT_NUM_IM1_IRL0 + 26) +#define LTQ_EIU_IR3 INT_NUM_IM1_IRL0 +#define LTQ_EIU_IR4 (INT_NUM_IM1_IRL0 + 1) +#define LTQ_EIU_IR5 (INT_NUM_IM1_IRL0 + 2) +#define LTQ_EIU_IR6 (INT_NUM_IM2_IRL0 + 30) + +#define MAX_EIU 6 + +/* irqs generated by device attached to the EBU need to be acked in + * a special manner + */ +#define LTQ_ICU_EBU_IRQ 22 + +#define ltq_icu_w32(x, y) ltq_w32((x), ltq_icu_membase + (y)) +#define ltq_icu_r32(x) ltq_r32(ltq_icu_membase + (x)) + +#define ltq_eiu_w32(x, y) ltq_w32((x), ltq_eiu_membase + (y)) +#define ltq_eiu_r32(x) ltq_r32(ltq_eiu_membase + (x)) + +static unsigned short ltq_eiu_irq[MAX_EIU] = { + LTQ_EIU_IR0, + LTQ_EIU_IR1, + LTQ_EIU_IR2, + LTQ_EIU_IR3, + LTQ_EIU_IR4, + LTQ_EIU_IR5, +}; + +static struct resource ltq_icu_resource = { + .name = "icu", + .start = LTQ_ICU_BASE_ADDR, + .end = LTQ_ICU_BASE_ADDR + LTQ_ICU_SIZE - 1, + .flags = IORESOURCE_MEM, +}; + +static struct resource ltq_eiu_resource = { + .name = "eiu", + .start = LTQ_EIU_BASE_ADDR, + .end = LTQ_EIU_BASE_ADDR + LTQ_ICU_SIZE - 1, + .flags = IORESOURCE_MEM, +}; + +static void __iomem *ltq_icu_membase; +static void __iomem *ltq_eiu_membase; + +void ltq_disable_irq(struct irq_data *d) +{ + u32 ier = LTQ_ICU_IM0_IER; + int irq_nr = d->irq - INT_NUM_IRQ0; + + ier += LTQ_ICU_OFFSET * (irq_nr / INT_NUM_IM_OFFSET); + irq_nr %= INT_NUM_IM_OFFSET; + ltq_icu_w32(ltq_icu_r32(ier) & ~(1 << irq_nr), ier); +} + +void ltq_mask_and_ack_irq(struct irq_data *d) +{ + u32 ier = LTQ_ICU_IM0_IER; + u32 isr = LTQ_ICU_IM0_ISR; + int irq_nr = d->irq - INT_NUM_IRQ0; + + ier += LTQ_ICU_OFFSET * (irq_nr / INT_NUM_IM_OFFSET); + isr += LTQ_ICU_OFFSET * (irq_nr / INT_NUM_IM_OFFSET); + irq_nr %= INT_NUM_IM_OFFSET; + ltq_icu_w32(ltq_icu_r32(ier) & ~(1 << irq_nr), ier); + ltq_icu_w32((1 << irq_nr), isr); +} + +static void ltq_ack_irq(struct irq_data *d) +{ + u32 isr = LTQ_ICU_IM0_ISR; + int irq_nr = d->irq - INT_NUM_IRQ0; + + isr += LTQ_ICU_OFFSET * (irq_nr / INT_NUM_IM_OFFSET); + irq_nr %= INT_NUM_IM_OFFSET; + ltq_icu_w32((1 << irq_nr), isr); +} + +void ltq_enable_irq(struct irq_data *d) +{ + u32 ier = LTQ_ICU_IM0_IER; + int irq_nr = d->irq - INT_NUM_IRQ0; + + ier += LTQ_ICU_OFFSET * (irq_nr / INT_NUM_IM_OFFSET); + irq_nr %= INT_NUM_IM_OFFSET; + ltq_icu_w32(ltq_icu_r32(ier) | (1 << irq_nr), ier); +} + +static unsigned int ltq_startup_eiu_irq(struct irq_data *d) +{ + int i; + int irq_nr = d->irq - INT_NUM_IRQ0; + + ltq_enable_irq(d); + for (i = 0; i < MAX_EIU; i++) { + if (irq_nr == ltq_eiu_irq[i]) { + /* low level - we should really handle set_type */ + ltq_eiu_w32(ltq_eiu_r32(LTQ_EIU_EXIN_C) | + (0x6 << (i * 4)), LTQ_EIU_EXIN_C); + /* clear all pending */ + ltq_eiu_w32(ltq_eiu_r32(LTQ_EIU_EXIN_INIC) & ~(1 << i), + LTQ_EIU_EXIN_INIC); + /* enable */ + ltq_eiu_w32(ltq_eiu_r32(LTQ_EIU_EXIN_INEN) | (1 << i), + LTQ_EIU_EXIN_INEN); + break; + } + } + + return 0; +} + +static void ltq_shutdown_eiu_irq(struct irq_data *d) +{ + int i; + int irq_nr = d->irq - INT_NUM_IRQ0; + + ltq_disable_irq(d); + for (i = 0; i < MAX_EIU; i++) { + if (irq_nr == ltq_eiu_irq[i]) { + /* disable */ + ltq_eiu_w32(ltq_eiu_r32(LTQ_EIU_EXIN_INEN) & ~(1 << i), + LTQ_EIU_EXIN_INEN); + break; + } + } +} + +static struct irq_chip ltq_irq_type = { + "icu", + .irq_enable = ltq_enable_irq, + .irq_disable = ltq_disable_irq, + .irq_unmask = ltq_enable_irq, + .irq_ack = ltq_ack_irq, + .irq_mask = ltq_disable_irq, + .irq_mask_ack = ltq_mask_and_ack_irq, +}; + +static struct irq_chip ltq_eiu_type = { + "eiu", + .irq_startup = ltq_startup_eiu_irq, + .irq_shutdown = ltq_shutdown_eiu_irq, + .irq_enable = ltq_enable_irq, + .irq_disable = ltq_disable_irq, + .irq_unmask = ltq_enable_irq, + .irq_ack = ltq_ack_irq, + .irq_mask = ltq_disable_irq, + .irq_mask_ack = ltq_mask_and_ack_irq, +}; + +static void ltq_hw_irqdispatch(int module) +{ + u32 irq; + + irq = ltq_icu_r32(LTQ_ICU_IM0_IOSR + (module * LTQ_ICU_OFFSET)); + if (irq == 0) + return; + + /* silicon bug causes only the msb set to 1 to be valid. all + * other bits might be bogus + */ + irq = __fls(irq); + do_IRQ((int)irq + INT_NUM_IM0_IRL0 + (INT_NUM_IM_OFFSET * module)); + + /* if this is a EBU irq, we need to ack it or get a deadlock */ + if ((irq == LTQ_ICU_EBU_IRQ) && (module == 0)) + ltq_ebu_w32(ltq_ebu_r32(LTQ_EBU_PCC_ISTAT) | 0x10, + LTQ_EBU_PCC_ISTAT); +} + +#define DEFINE_HWx_IRQDISPATCH(x) \ + static void ltq_hw ## x ## _irqdispatch(void) \ + { \ + ltq_hw_irqdispatch(x); \ + } +DEFINE_HWx_IRQDISPATCH(0) +DEFINE_HWx_IRQDISPATCH(1) +DEFINE_HWx_IRQDISPATCH(2) +DEFINE_HWx_IRQDISPATCH(3) +DEFINE_HWx_IRQDISPATCH(4) + +static void ltq_hw5_irqdispatch(void) +{ + do_IRQ(MIPS_CPU_TIMER_IRQ); +} + +asmlinkage void plat_irq_dispatch(void) +{ + unsigned int pending = read_c0_status() & read_c0_cause() & ST0_IM; + unsigned int i; + + if (pending & CAUSEF_IP7) { + do_IRQ(MIPS_CPU_TIMER_IRQ); + goto out; + } else { + for (i = 0; i < 5; i++) { + if (pending & (CAUSEF_IP2 << i)) { + ltq_hw_irqdispatch(i); + goto out; + } + } + } + pr_alert("Spurious IRQ: CAUSE=0x%08x\n", read_c0_status()); + +out: + return; +} + +static struct irqaction cascade = { + .handler = no_action, + .flags = IRQF_DISABLED, + .name = "cascade", +}; + +void __init arch_init_irq(void) +{ + int i; + + if (insert_resource(&iomem_resource, <q_icu_resource) < 0) + panic("Failed to insert icu memory\n"); + + if (request_mem_region(ltq_icu_resource.start, + resource_size(<q_icu_resource), "icu") < 0) + panic("Failed to request icu memory\n"); + + ltq_icu_membase = ioremap_nocache(ltq_icu_resource.start, + resource_size(<q_icu_resource)); + if (!ltq_icu_membase) + panic("Failed to remap icu memory\n"); + + if (insert_resource(&iomem_resource, <q_eiu_resource) < 0) + panic("Failed to insert eiu memory\n"); + + if (request_mem_region(ltq_eiu_resource.start, + resource_size(<q_eiu_resource), "eiu") < 0) + panic("Failed to request eiu memory\n"); + + ltq_eiu_membase = ioremap_nocache(ltq_eiu_resource.start, + resource_size(<q_eiu_resource)); + if (!ltq_eiu_membase) + panic("Failed to remap eiu memory\n"); + + /* make sure all irqs are turned off by default */ + for (i = 0; i < 5; i++) + ltq_icu_w32(0, LTQ_ICU_IM0_IER + (i * LTQ_ICU_OFFSET)); + + /* clear all possibly pending interrupts */ + ltq_icu_w32(~0, LTQ_ICU_IM0_ISR + (i * LTQ_ICU_OFFSET)); + + mips_cpu_irq_init(); + + for (i = 2; i <= 6; i++) + setup_irq(i, &cascade); + + if (cpu_has_vint) { + pr_info("Setting up vectored interrupts\n"); + set_vi_handler(2, ltq_hw0_irqdispatch); + set_vi_handler(3, ltq_hw1_irqdispatch); + set_vi_handler(4, ltq_hw2_irqdispatch); + set_vi_handler(5, ltq_hw3_irqdispatch); + set_vi_handler(6, ltq_hw4_irqdispatch); + set_vi_handler(7, ltq_hw5_irqdispatch); + } + + for (i = INT_NUM_IRQ0; + i <= (INT_NUM_IRQ0 + (5 * INT_NUM_IM_OFFSET)); i++) + if ((i == LTQ_EIU_IR0) || (i == LTQ_EIU_IR1) || + (i == LTQ_EIU_IR2)) + irq_set_chip_and_handler(i, <q_eiu_type, + handle_level_irq); + /* EIU3-5 only exist on ar9 and vr9 */ + else if (((i == LTQ_EIU_IR3) || (i == LTQ_EIU_IR4) || + (i == LTQ_EIU_IR5)) && (ltq_is_ar9() || ltq_is_vr9())) + irq_set_chip_and_handler(i, <q_eiu_type, + handle_level_irq); + else + irq_set_chip_and_handler(i, <q_irq_type, + handle_level_irq); + +#if !defined(CONFIG_MIPS_MT_SMP) && !defined(CONFIG_MIPS_MT_SMTC) + set_c0_status(IE_IRQ0 | IE_IRQ1 | IE_IRQ2 | + IE_IRQ3 | IE_IRQ4 | IE_IRQ5); +#else + set_c0_status(IE_SW0 | IE_SW1 | IE_IRQ0 | IE_IRQ1 | + IE_IRQ2 | IE_IRQ3 | IE_IRQ4 | IE_IRQ5); +#endif +} + +unsigned int __cpuinit get_c0_compare_int(void) +{ + return CP0_LEGACY_COMPARE_IRQ; +} diff --git a/arch/mips/lantiq/prom.c b/arch/mips/lantiq/prom.c new file mode 100644 index 0000000..56ba007 --- /dev/null +++ b/arch/mips/lantiq/prom.c @@ -0,0 +1,71 @@ +/* + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * Copyright (C) 2010 John Crispin + */ + +#include +#include +#include +#include + +#include + +#include "prom.h" +#include "clk.h" + +static struct ltq_soc_info soc_info; + +unsigned int ltq_get_cpu_ver(void) +{ + return soc_info.rev; +} +EXPORT_SYMBOL(ltq_get_cpu_ver); + +unsigned int ltq_get_soc_type(void) +{ + return soc_info.type; +} +EXPORT_SYMBOL(ltq_get_soc_type); + +const char *get_system_type(void) +{ + return soc_info.sys_type; +} + +void prom_free_prom_memory(void) +{ +} + +static void __init prom_init_cmdline(void) +{ + int argc = fw_arg0; + char **argv = (char **) KSEG1ADDR(fw_arg1); + int i; + + for (i = 0; i < argc; i++) { + char *p = (char *) KSEG1ADDR(argv[i]); + + if (p && *p) { + strlcat(arcs_cmdline, p, sizeof(arcs_cmdline)); + strlcat(arcs_cmdline, " ", sizeof(arcs_cmdline)); + } + } +} + +void __init prom_init(void) +{ + struct clk *clk; + + ltq_soc_detect(&soc_info); + clk_init(); + clk = clk_get(0, "cpu"); + snprintf(soc_info.sys_type, LTQ_SYS_TYPE_LEN - 1, "%s rev1.%d", + soc_info.name, soc_info.rev); + clk_put(clk); + soc_info.sys_type[LTQ_SYS_TYPE_LEN - 1] = '\0'; + pr_info("SoC: %s\n", soc_info.sys_type); + prom_init_cmdline(); +} diff --git a/arch/mips/lantiq/prom.h b/arch/mips/lantiq/prom.h new file mode 100644 index 0000000..4165ad1 --- /dev/null +++ b/arch/mips/lantiq/prom.h @@ -0,0 +1,24 @@ +/* + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * Copyright (C) 2010 John Crispin + */ + +#ifndef _LTQ_PROM_H__ +#define _LTQ_PROM_H__ + +#define LTQ_SYS_TYPE_LEN 0x100 + +struct ltq_soc_info { + unsigned char *name; + unsigned int rev; + unsigned int partnum; + unsigned int type; + unsigned char sys_type[LTQ_SYS_TYPE_LEN]; +}; + +extern void ltq_soc_detect(struct ltq_soc_info *i); + +#endif diff --git a/arch/mips/lantiq/setup.c b/arch/mips/lantiq/setup.c new file mode 100644 index 0000000..79a2b0c --- /dev/null +++ b/arch/mips/lantiq/setup.c @@ -0,0 +1,41 @@ +/* + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * Copyright (C) 2010 John Crispin + */ + +#include +#include +#include +#include +#include + +#include + +void __init plat_mem_setup(void) +{ + /* assume 16M as default incase uboot fails to pass proper ramsize */ + unsigned long memsize = 16; + char **envp = (char **) KSEG1ADDR(fw_arg2); + + ioport_resource.start = IOPORT_RESOURCE_START; + ioport_resource.end = IOPORT_RESOURCE_END; + iomem_resource.start = IOMEM_RESOURCE_START; + iomem_resource.end = IOMEM_RESOURCE_END; + + set_io_port_base((unsigned long) KSEG1); + + while (*envp) { + char *e = (char *)KSEG1ADDR(*envp); + if (!strncmp(e, "memsize=", 8)) { + e += 8; + if (strict_strtoul(e, 0, &memsize)) + pr_warn("bad memsize specified\n"); + } + envp++; + } + memsize *= 1024 * 1024; + add_memory_region(0x00000000, memsize, BOOT_MEM_RAM); +} -- cgit v1.1 From 8ec6d93508f705dacafd5fcd058c69ef405002f9 Mon Sep 17 00:00:00 2001 From: John Crispin Date: Wed, 30 Mar 2011 09:27:48 +0200 Subject: MIPS: Lantiq: add SoC specific code for XWAY family Add support for the Lantiq XWAY family of Mips24KEc SoCs. * Danube (PSB50702) * Twinpass (PSB4000) * AR9 (PSB50802) * Amazon SE (PSB5061) The Amazon SE is a lightweight SoC and has no PCI as well as a different clock. We split the code out into seperate files to handle this. The GPIO pins on the SoCs are multi function and there are several bits we can use to configure the pins. To be as compatible as possible to GPIOLIB we add a function int lq_gpio_request(unsigned int pin, unsigned int alt0, unsigned int alt1, unsigned int dir, const char *name); which lets you configure the 2 "alternate function" bits. This way drivers like PCI can make use of GPIOLIB without a cubersome wrapper. The PLL code inside arch/mips/lantiq/xway/clk-xway.c is voodoo to me. It was taken from a 2.4.20 source tree and was never really changed by me since then. Signed-off-by: John Crispin Signed-off-by: Ralph Hempel Cc: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/2249/ Signed-off-by: Ralf Baechle --- arch/mips/Kconfig | 1 + arch/mips/include/asm/mach-lantiq/xway/irq.h | 18 ++ .../mips/include/asm/mach-lantiq/xway/lantiq_irq.h | 66 ++++++ .../mips/include/asm/mach-lantiq/xway/lantiq_soc.h | 140 +++++++++++++ arch/mips/lantiq/Kconfig | 21 ++ arch/mips/lantiq/Makefile | 2 + arch/mips/lantiq/Platform | 1 + arch/mips/lantiq/xway/Makefile | 4 + arch/mips/lantiq/xway/clk-ase.c | 48 +++++ arch/mips/lantiq/xway/clk-xway.c | 223 +++++++++++++++++++++ arch/mips/lantiq/xway/ebu.c | 53 +++++ arch/mips/lantiq/xway/gpio.c | 195 ++++++++++++++++++ arch/mips/lantiq/xway/pmu.c | 70 +++++++ arch/mips/lantiq/xway/prom-ase.c | 39 ++++ arch/mips/lantiq/xway/prom-xway.c | 54 +++++ arch/mips/lantiq/xway/reset.c | 91 +++++++++ 16 files changed, 1026 insertions(+) create mode 100644 arch/mips/include/asm/mach-lantiq/xway/irq.h create mode 100644 arch/mips/include/asm/mach-lantiq/xway/lantiq_irq.h create mode 100644 arch/mips/include/asm/mach-lantiq/xway/lantiq_soc.h create mode 100644 arch/mips/lantiq/Kconfig create mode 100644 arch/mips/lantiq/xway/Makefile create mode 100644 arch/mips/lantiq/xway/clk-ase.c create mode 100644 arch/mips/lantiq/xway/clk-xway.c create mode 100644 arch/mips/lantiq/xway/ebu.c create mode 100644 arch/mips/lantiq/xway/gpio.c create mode 100644 arch/mips/lantiq/xway/pmu.c create mode 100644 arch/mips/lantiq/xway/prom-ase.c create mode 100644 arch/mips/lantiq/xway/prom-xway.c create mode 100644 arch/mips/lantiq/xway/reset.c diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 1787572..b3b4999 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -787,6 +787,7 @@ source "arch/mips/ath79/Kconfig" source "arch/mips/bcm63xx/Kconfig" source "arch/mips/jazz/Kconfig" source "arch/mips/jz4740/Kconfig" +source "arch/mips/lantiq/Kconfig" source "arch/mips/lasat/Kconfig" source "arch/mips/pmc-sierra/Kconfig" source "arch/mips/powertv/Kconfig" diff --git a/arch/mips/include/asm/mach-lantiq/xway/irq.h b/arch/mips/include/asm/mach-lantiq/xway/irq.h new file mode 100644 index 0000000..a1471d2 --- /dev/null +++ b/arch/mips/include/asm/mach-lantiq/xway/irq.h @@ -0,0 +1,18 @@ +/* + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * Copyright (C) 2010 John Crispin + */ + +#ifndef __LANTIQ_IRQ_H +#define __LANTIQ_IRQ_H + +#include + +#define NR_IRQS 256 + +#include_next + +#endif diff --git a/arch/mips/include/asm/mach-lantiq/xway/lantiq_irq.h b/arch/mips/include/asm/mach-lantiq/xway/lantiq_irq.h new file mode 100644 index 0000000..b4465a8 --- /dev/null +++ b/arch/mips/include/asm/mach-lantiq/xway/lantiq_irq.h @@ -0,0 +1,66 @@ +/* + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * Copyright (C) 2010 John Crispin + */ + +#ifndef _LANTIQ_XWAY_IRQ_H__ +#define _LANTIQ_XWAY_IRQ_H__ + +#define INT_NUM_IRQ0 8 +#define INT_NUM_IM0_IRL0 (INT_NUM_IRQ0 + 0) +#define INT_NUM_IM1_IRL0 (INT_NUM_IRQ0 + 32) +#define INT_NUM_IM2_IRL0 (INT_NUM_IRQ0 + 64) +#define INT_NUM_IM3_IRL0 (INT_NUM_IRQ0 + 96) +#define INT_NUM_IM4_IRL0 (INT_NUM_IRQ0 + 128) +#define INT_NUM_IM_OFFSET (INT_NUM_IM1_IRL0 - INT_NUM_IM0_IRL0) + +#define LTQ_ASC_TIR(x) (INT_NUM_IM3_IRL0 + (x * 8)) +#define LTQ_ASC_RIR(x) (INT_NUM_IM3_IRL0 + (x * 8) + 1) +#define LTQ_ASC_EIR(x) (INT_NUM_IM3_IRL0 + (x * 8) + 2) + +#define LTQ_ASC_ASE_TIR INT_NUM_IM2_IRL0 +#define LTQ_ASC_ASE_RIR (INT_NUM_IM2_IRL0 + 2) +#define LTQ_ASC_ASE_EIR (INT_NUM_IM2_IRL0 + 3) + +#define LTQ_SSC_TIR (INT_NUM_IM0_IRL0 + 15) +#define LTQ_SSC_RIR (INT_NUM_IM0_IRL0 + 14) +#define LTQ_SSC_EIR (INT_NUM_IM0_IRL0 + 16) + +#define LTQ_MEI_DYING_GASP_INT (INT_NUM_IM1_IRL0 + 21) +#define LTQ_MEI_INT (INT_NUM_IM1_IRL0 + 23) + +#define LTQ_TIMER6_INT (INT_NUM_IM1_IRL0 + 23) +#define LTQ_USB_INT (INT_NUM_IM1_IRL0 + 22) +#define LTQ_USB_OC_INT (INT_NUM_IM4_IRL0 + 23) + +#define MIPS_CPU_TIMER_IRQ 7 + +#define LTQ_DMA_CH0_INT (INT_NUM_IM2_IRL0) +#define LTQ_DMA_CH1_INT (INT_NUM_IM2_IRL0 + 1) +#define LTQ_DMA_CH2_INT (INT_NUM_IM2_IRL0 + 2) +#define LTQ_DMA_CH3_INT (INT_NUM_IM2_IRL0 + 3) +#define LTQ_DMA_CH4_INT (INT_NUM_IM2_IRL0 + 4) +#define LTQ_DMA_CH5_INT (INT_NUM_IM2_IRL0 + 5) +#define LTQ_DMA_CH6_INT (INT_NUM_IM2_IRL0 + 6) +#define LTQ_DMA_CH7_INT (INT_NUM_IM2_IRL0 + 7) +#define LTQ_DMA_CH8_INT (INT_NUM_IM2_IRL0 + 8) +#define LTQ_DMA_CH9_INT (INT_NUM_IM2_IRL0 + 9) +#define LTQ_DMA_CH10_INT (INT_NUM_IM2_IRL0 + 10) +#define LTQ_DMA_CH11_INT (INT_NUM_IM2_IRL0 + 11) +#define LTQ_DMA_CH12_INT (INT_NUM_IM2_IRL0 + 25) +#define LTQ_DMA_CH13_INT (INT_NUM_IM2_IRL0 + 26) +#define LTQ_DMA_CH14_INT (INT_NUM_IM2_IRL0 + 27) +#define LTQ_DMA_CH15_INT (INT_NUM_IM2_IRL0 + 28) +#define LTQ_DMA_CH16_INT (INT_NUM_IM2_IRL0 + 29) +#define LTQ_DMA_CH17_INT (INT_NUM_IM2_IRL0 + 30) +#define LTQ_DMA_CH18_INT (INT_NUM_IM2_IRL0 + 16) +#define LTQ_DMA_CH19_INT (INT_NUM_IM2_IRL0 + 21) + +#define LTQ_PPE_MBOX_INT (INT_NUM_IM2_IRL0 + 24) + +#define INT_NUM_IM4_IRL14 (INT_NUM_IM4_IRL0 + 14) + +#endif diff --git a/arch/mips/include/asm/mach-lantiq/xway/lantiq_soc.h b/arch/mips/include/asm/mach-lantiq/xway/lantiq_soc.h new file mode 100644 index 0000000..343e82cb --- /dev/null +++ b/arch/mips/include/asm/mach-lantiq/xway/lantiq_soc.h @@ -0,0 +1,140 @@ +/* + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * Copyright (C) 2010 John Crispin + */ + +#ifndef _LTQ_XWAY_H__ +#define _LTQ_XWAY_H__ + +#ifdef CONFIG_SOC_TYPE_XWAY + +#include + +/* Chip IDs */ +#define SOC_ID_DANUBE1 0x129 +#define SOC_ID_DANUBE2 0x12B +#define SOC_ID_TWINPASS 0x12D +#define SOC_ID_AMAZON_SE 0x152 +#define SOC_ID_ARX188 0x16C +#define SOC_ID_ARX168 0x16D +#define SOC_ID_ARX182 0x16F + +/* SoC Types */ +#define SOC_TYPE_DANUBE 0x01 +#define SOC_TYPE_TWINPASS 0x02 +#define SOC_TYPE_AR9 0x03 +#define SOC_TYPE_VR9 0x04 +#define SOC_TYPE_AMAZON_SE 0x05 + +/* ASC0/1 - serial port */ +#define LTQ_ASC0_BASE_ADDR 0x1E100400 +#define LTQ_ASC1_BASE_ADDR 0x1E100C00 +#define LTQ_ASC_SIZE 0x400 + +/* RCU - reset control unit */ +#define LTQ_RCU_BASE_ADDR 0x1F203000 +#define LTQ_RCU_SIZE 0x1000 + +/* GPTU - general purpose timer unit */ +#define LTQ_GPTU_BASE_ADDR 0x18000300 +#define LTQ_GPTU_SIZE 0x100 + +/* EBU - external bus unit */ +#define LTQ_EBU_GPIO_START 0x14000000 +#define LTQ_EBU_GPIO_SIZE 0x1000 + +#define LTQ_EBU_BASE_ADDR 0x1E105300 +#define LTQ_EBU_SIZE 0x100 + +#define LTQ_EBU_BUSCON0 0x0060 +#define LTQ_EBU_PCC_CON 0x0090 +#define LTQ_EBU_PCC_IEN 0x00A4 +#define LTQ_EBU_PCC_ISTAT 0x00A0 +#define LTQ_EBU_BUSCON1 0x0064 +#define LTQ_EBU_ADDRSEL1 0x0024 +#define EBU_WRDIS 0x80000000 + +/* CGU - clock generation unit */ +#define LTQ_CGU_BASE_ADDR 0x1F103000 +#define LTQ_CGU_SIZE 0x1000 + +/* ICU - interrupt control unit */ +#define LTQ_ICU_BASE_ADDR 0x1F880200 +#define LTQ_ICU_SIZE 0x100 + +/* EIU - external interrupt unit */ +#define LTQ_EIU_BASE_ADDR 0x1F101000 +#define LTQ_EIU_SIZE 0x1000 + +/* PMU - power management unit */ +#define LTQ_PMU_BASE_ADDR 0x1F102000 +#define LTQ_PMU_SIZE 0x1000 + +#define PMU_DMA 0x0020 +#define PMU_USB 0x8041 +#define PMU_LED 0x0800 +#define PMU_GPT 0x1000 +#define PMU_PPE 0x2000 +#define PMU_FPI 0x4000 +#define PMU_SWITCH 0x10000000 + +/* ETOP - ethernet */ +#define LTQ_PPE32_BASE_ADDR 0xBE180000 +#define LTQ_PPE32_SIZE 0x40000 + +/* DMA */ +#define LTQ_DMA_BASE_ADDR 0xBE104100 + +/* PCI */ +#define PCI_CR_BASE_ADDR 0x1E105400 +#define PCI_CR_SIZE 0x400 + +/* WDT */ +#define LTQ_WDT_BASE_ADDR 0x1F8803F0 +#define LTQ_WDT_SIZE 0x10 + +/* STP - serial to parallel conversion unit */ +#define LTQ_STP_BASE_ADDR 0x1E100BB0 +#define LTQ_STP_SIZE 0x40 + +/* GPIO */ +#define LTQ_GPIO0_BASE_ADDR 0x1E100B10 +#define LTQ_GPIO1_BASE_ADDR 0x1E100B40 +#define LTQ_GPIO2_BASE_ADDR 0x1E100B70 +#define LTQ_GPIO_SIZE 0x30 + +/* SSC */ +#define LTQ_SSC_BASE_ADDR 0x1e100800 +#define LTQ_SSC_SIZE 0x100 + +/* MEI - dsl core */ +#define LTQ_MEI_BASE_ADDR 0x1E116000 + +/* DEU - data encryption unit */ +#define LTQ_DEU_BASE_ADDR 0x1E103100 + +/* MPS - multi processor unit (voice) */ +#define LTQ_MPS_BASE_ADDR (KSEG1 + 0x1F107000) +#define LTQ_MPS_CHIPID ((u32 *)(LTQ_MPS_BASE_ADDR + 0x0344)) + +/* request a non-gpio and set the PIO config */ +extern int ltq_gpio_request(unsigned int pin, unsigned int alt0, + unsigned int alt1, unsigned int dir, const char *name); +extern void ltq_pmu_enable(unsigned int module); +extern void ltq_pmu_disable(unsigned int module); + +static inline int ltq_is_ar9(void) +{ + return (ltq_get_soc_type() == SOC_TYPE_AR9); +} + +static inline int ltq_is_vr9(void) +{ + return (ltq_get_soc_type() == SOC_TYPE_VR9); +} + +#endif /* CONFIG_SOC_TYPE_XWAY */ +#endif /* _LTQ_XWAY_H__ */ diff --git a/arch/mips/lantiq/Kconfig b/arch/mips/lantiq/Kconfig new file mode 100644 index 0000000..2780461 --- /dev/null +++ b/arch/mips/lantiq/Kconfig @@ -0,0 +1,21 @@ +if LANTIQ + +config SOC_TYPE_XWAY + bool + default n + +choice + prompt "SoC Type" + default SOC_XWAY + +config SOC_AMAZON_SE + bool "Amazon SE" + select SOC_TYPE_XWAY + +config SOC_XWAY + bool "XWAY" + select SOC_TYPE_XWAY + select HW_HAS_PCI +endchoice + +endif diff --git a/arch/mips/lantiq/Makefile b/arch/mips/lantiq/Makefile index 6a30de6..a268391 100644 --- a/arch/mips/lantiq/Makefile +++ b/arch/mips/lantiq/Makefile @@ -7,3 +7,5 @@ obj-y := irq.o setup.o clk.o prom.o obj-$(CONFIG_EARLY_PRINTK) += early_printk.o + +obj-$(CONFIG_SOC_TYPE_XWAY) += xway/ diff --git a/arch/mips/lantiq/Platform b/arch/mips/lantiq/Platform index eef587f..f3dff05 100644 --- a/arch/mips/lantiq/Platform +++ b/arch/mips/lantiq/Platform @@ -5,3 +5,4 @@ platform-$(CONFIG_LANTIQ) += lantiq/ cflags-$(CONFIG_LANTIQ) += -I$(srctree)/arch/mips/include/asm/mach-lantiq load-$(CONFIG_LANTIQ) = 0xffffffff80002000 +cflags-$(CONFIG_SOC_TYPE_XWAY) += -I$(srctree)/arch/mips/include/asm/mach-lantiq/xway diff --git a/arch/mips/lantiq/xway/Makefile b/arch/mips/lantiq/xway/Makefile new file mode 100644 index 0000000..9c85ff9 --- /dev/null +++ b/arch/mips/lantiq/xway/Makefile @@ -0,0 +1,4 @@ +obj-y := pmu.o ebu.o reset.o gpio.o + +obj-$(CONFIG_SOC_XWAY) += clk-xway.o prom-xway.o +obj-$(CONFIG_SOC_AMAZON_SE) += clk-ase.o prom-ase.o diff --git a/arch/mips/lantiq/xway/clk-ase.c b/arch/mips/lantiq/xway/clk-ase.c new file mode 100644 index 0000000..22d823a --- /dev/null +++ b/arch/mips/lantiq/xway/clk-ase.c @@ -0,0 +1,48 @@ +/* + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * Copyright (C) 2011 John Crispin + */ + +#include +#include +#include +#include + +#include +#include +#include + +#include + +/* cgu registers */ +#define LTQ_CGU_SYS 0x0010 + +unsigned int ltq_get_io_region_clock(void) +{ + return CLOCK_133M; +} +EXPORT_SYMBOL(ltq_get_io_region_clock); + +unsigned int ltq_get_fpi_bus_clock(int fpi) +{ + return CLOCK_133M; +} +EXPORT_SYMBOL(ltq_get_fpi_bus_clock); + +unsigned int ltq_get_cpu_hz(void) +{ + if (ltq_cgu_r32(LTQ_CGU_SYS) & (1 << 5)) + return CLOCK_266M; + else + return CLOCK_133M; +} +EXPORT_SYMBOL(ltq_get_cpu_hz); + +unsigned int ltq_get_fpi_hz(void) +{ + return CLOCK_133M; +} +EXPORT_SYMBOL(ltq_get_fpi_hz); diff --git a/arch/mips/lantiq/xway/clk-xway.c b/arch/mips/lantiq/xway/clk-xway.c new file mode 100644 index 0000000..ddd3959 --- /dev/null +++ b/arch/mips/lantiq/xway/clk-xway.c @@ -0,0 +1,223 @@ +/* + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * Copyright (C) 2010 John Crispin + */ + +#include +#include +#include +#include + +#include +#include +#include + +#include + +static unsigned int ltq_ram_clocks[] = { + CLOCK_167M, CLOCK_133M, CLOCK_111M, CLOCK_83M }; +#define DDR_HZ ltq_ram_clocks[ltq_cgu_r32(LTQ_CGU_SYS) & 0x3] + +#define BASIC_FREQUENCY_1 35328000 +#define BASIC_FREQUENCY_2 36000000 +#define BASIS_REQUENCY_USB 12000000 + +#define GET_BITS(x, msb, lsb) \ + (((x) & ((1 << ((msb) + 1)) - 1)) >> (lsb)) + +#define LTQ_CGU_PLL0_CFG 0x0004 +#define LTQ_CGU_PLL1_CFG 0x0008 +#define LTQ_CGU_PLL2_CFG 0x000C +#define LTQ_CGU_SYS 0x0010 +#define LTQ_CGU_UPDATE 0x0014 +#define LTQ_CGU_IF_CLK 0x0018 +#define LTQ_CGU_OSC_CON 0x001C +#define LTQ_CGU_SMD 0x0020 +#define LTQ_CGU_CT1SR 0x0028 +#define LTQ_CGU_CT2SR 0x002C +#define LTQ_CGU_PCMCR 0x0030 +#define LTQ_CGU_PCI_CR 0x0034 +#define LTQ_CGU_PD_PC 0x0038 +#define LTQ_CGU_FMR 0x003C + +#define CGU_PLL0_PHASE_DIVIDER_ENABLE \ + (ltq_cgu_r32(LTQ_CGU_PLL0_CFG) & (1 << 31)) +#define CGU_PLL0_BYPASS \ + (ltq_cgu_r32(LTQ_CGU_PLL0_CFG) & (1 << 30)) +#define CGU_PLL0_CFG_DSMSEL \ + (ltq_cgu_r32(LTQ_CGU_PLL0_CFG) & (1 << 28)) +#define CGU_PLL0_CFG_FRAC_EN \ + (ltq_cgu_r32(LTQ_CGU_PLL0_CFG) & (1 << 27)) +#define CGU_PLL1_SRC \ + (ltq_cgu_r32(LTQ_CGU_PLL1_CFG) & (1 << 31)) +#define CGU_PLL2_PHASE_DIVIDER_ENABLE \ + (ltq_cgu_r32(LTQ_CGU_PLL2_CFG) & (1 << 20)) +#define CGU_SYS_FPI_SEL (1 << 6) +#define CGU_SYS_DDR_SEL 0x3 +#define CGU_PLL0_SRC (1 << 29) + +#define CGU_PLL0_CFG_PLLK GET_BITS(ltq_cgu_r32(LTQ_CGU_PLL0_CFG), 26, 17) +#define CGU_PLL0_CFG_PLLN GET_BITS(ltq_cgu_r32(LTQ_CGU_PLL0_CFG), 12, 6) +#define CGU_PLL0_CFG_PLLM GET_BITS(ltq_cgu_r32(LTQ_CGU_PLL0_CFG), 5, 2) +#define CGU_PLL2_SRC GET_BITS(ltq_cgu_r32(LTQ_CGU_PLL2_CFG), 18, 17) +#define CGU_PLL2_CFG_INPUT_DIV GET_BITS(ltq_cgu_r32(LTQ_CGU_PLL2_CFG), 16, 13) + +static unsigned int ltq_get_pll0_fdiv(void); + +static inline unsigned int get_input_clock(int pll) +{ + switch (pll) { + case 0: + if (ltq_cgu_r32(LTQ_CGU_PLL0_CFG) & CGU_PLL0_SRC) + return BASIS_REQUENCY_USB; + else if (CGU_PLL0_PHASE_DIVIDER_ENABLE) + return BASIC_FREQUENCY_1; + else + return BASIC_FREQUENCY_2; + case 1: + if (CGU_PLL1_SRC) + return BASIS_REQUENCY_USB; + else if (CGU_PLL0_PHASE_DIVIDER_ENABLE) + return BASIC_FREQUENCY_1; + else + return BASIC_FREQUENCY_2; + case 2: + switch (CGU_PLL2_SRC) { + case 0: + return ltq_get_pll0_fdiv(); + case 1: + return CGU_PLL2_PHASE_DIVIDER_ENABLE ? + BASIC_FREQUENCY_1 : + BASIC_FREQUENCY_2; + case 2: + return BASIS_REQUENCY_USB; + } + default: + return 0; + } +} + +static inline unsigned int cal_dsm(int pll, unsigned int num, unsigned int den) +{ + u64 res, clock = get_input_clock(pll); + + res = num * clock; + do_div(res, den); + return res; +} + +static inline unsigned int mash_dsm(int pll, unsigned int M, unsigned int N, + unsigned int K) +{ + unsigned int num = ((N + 1) << 10) + K; + unsigned int den = (M + 1) << 10; + + return cal_dsm(pll, num, den); +} + +static inline unsigned int ssff_dsm_1(int pll, unsigned int M, unsigned int N, + unsigned int K) +{ + unsigned int num = ((N + 1) << 11) + K + 512; + unsigned int den = (M + 1) << 11; + + return cal_dsm(pll, num, den); +} + +static inline unsigned int ssff_dsm_2(int pll, unsigned int M, unsigned int N, + unsigned int K) +{ + unsigned int num = K >= 512 ? + ((N + 1) << 12) + K - 512 : ((N + 1) << 12) + K + 3584; + unsigned int den = (M + 1) << 12; + + return cal_dsm(pll, num, den); +} + +static inline unsigned int dsm(int pll, unsigned int M, unsigned int N, + unsigned int K, unsigned int dsmsel, unsigned int phase_div_en) +{ + if (!dsmsel) + return mash_dsm(pll, M, N, K); + else if (!phase_div_en) + return mash_dsm(pll, M, N, K); + else + return ssff_dsm_2(pll, M, N, K); +} + +static inline unsigned int ltq_get_pll0_fosc(void) +{ + if (CGU_PLL0_BYPASS) + return get_input_clock(0); + else + return !CGU_PLL0_CFG_FRAC_EN + ? dsm(0, CGU_PLL0_CFG_PLLM, CGU_PLL0_CFG_PLLN, 0, + CGU_PLL0_CFG_DSMSEL, + CGU_PLL0_PHASE_DIVIDER_ENABLE) + : dsm(0, CGU_PLL0_CFG_PLLM, CGU_PLL0_CFG_PLLN, + CGU_PLL0_CFG_PLLK, CGU_PLL0_CFG_DSMSEL, + CGU_PLL0_PHASE_DIVIDER_ENABLE); +} + +static unsigned int ltq_get_pll0_fdiv(void) +{ + unsigned int div = CGU_PLL2_CFG_INPUT_DIV + 1; + + return (ltq_get_pll0_fosc() + (div >> 1)) / div; +} + +unsigned int ltq_get_io_region_clock(void) +{ + unsigned int ret = ltq_get_pll0_fosc(); + + switch (ltq_cgu_r32(LTQ_CGU_PLL2_CFG) & CGU_SYS_DDR_SEL) { + default: + case 0: + return (ret + 1) / 2; + case 1: + return (ret * 2 + 2) / 5; + case 2: + return (ret + 1) / 3; + case 3: + return (ret + 2) / 4; + } +} +EXPORT_SYMBOL(ltq_get_io_region_clock); + +unsigned int ltq_get_fpi_bus_clock(int fpi) +{ + unsigned int ret = ltq_get_io_region_clock(); + + if ((fpi == 2) && (ltq_cgu_r32(LTQ_CGU_SYS) & CGU_SYS_FPI_SEL)) + ret >>= 1; + return ret; +} +EXPORT_SYMBOL(ltq_get_fpi_bus_clock); + +unsigned int ltq_get_cpu_hz(void) +{ + switch (ltq_cgu_r32(LTQ_CGU_SYS) & 0xc) { + case 0: + return CLOCK_333M; + case 4: + return DDR_HZ; + case 8: + return DDR_HZ << 1; + default: + return DDR_HZ >> 1; + } +} +EXPORT_SYMBOL(ltq_get_cpu_hz); + +unsigned int ltq_get_fpi_hz(void) +{ + unsigned int ddr_clock = DDR_HZ; + + if (ltq_cgu_r32(LTQ_CGU_SYS) & 0x40) + return ddr_clock >> 1; + return ddr_clock; +} +EXPORT_SYMBOL(ltq_get_fpi_hz); diff --git a/arch/mips/lantiq/xway/ebu.c b/arch/mips/lantiq/xway/ebu.c new file mode 100644 index 0000000..66eb52f --- /dev/null +++ b/arch/mips/lantiq/xway/ebu.c @@ -0,0 +1,53 @@ +/* + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * EBU - the external bus unit attaches PCI, NOR and NAND + * + * Copyright (C) 2010 John Crispin + */ + +#include +#include +#include +#include + +#include + +/* all access to the ebu must be locked */ +DEFINE_SPINLOCK(ebu_lock); +EXPORT_SYMBOL_GPL(ebu_lock); + +static struct resource ltq_ebu_resource = { + .name = "ebu", + .start = LTQ_EBU_BASE_ADDR, + .end = LTQ_EBU_BASE_ADDR + LTQ_EBU_SIZE - 1, + .flags = IORESOURCE_MEM, +}; + +/* remapped base addr of the clock unit and external bus unit */ +void __iomem *ltq_ebu_membase; + +static int __init lantiq_ebu_init(void) +{ + /* insert and request the memory region */ + if (insert_resource(&iomem_resource, <q_ebu_resource) < 0) + panic("Failed to insert ebu memory\n"); + + if (request_mem_region(ltq_ebu_resource.start, + resource_size(<q_ebu_resource), "ebu") < 0) + panic("Failed to request ebu memory\n"); + + /* remap ebu register range */ + ltq_ebu_membase = ioremap_nocache(ltq_ebu_resource.start, + resource_size(<q_ebu_resource)); + if (!ltq_ebu_membase) + panic("Failed to remap ebu memory\n"); + + /* make sure to unprotect the memory region where flash is located */ + ltq_ebu_w32(ltq_ebu_r32(LTQ_EBU_BUSCON0) & ~EBU_WRDIS, LTQ_EBU_BUSCON0); + return 0; +} + +postcore_initcall(lantiq_ebu_init); diff --git a/arch/mips/lantiq/xway/gpio.c b/arch/mips/lantiq/xway/gpio.c new file mode 100644 index 0000000..a321451 --- /dev/null +++ b/arch/mips/lantiq/xway/gpio.c @@ -0,0 +1,195 @@ +/* + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * Copyright (C) 2010 John Crispin + */ + +#include +#include +#include +#include +#include +#include + +#include + +#define LTQ_GPIO_OUT 0x00 +#define LTQ_GPIO_IN 0x04 +#define LTQ_GPIO_DIR 0x08 +#define LTQ_GPIO_ALTSEL0 0x0C +#define LTQ_GPIO_ALTSEL1 0x10 +#define LTQ_GPIO_OD 0x14 + +#define PINS_PER_PORT 16 +#define MAX_PORTS 3 + +#define ltq_gpio_getbit(m, r, p) (!!(ltq_r32(m + r) & (1 << p))) +#define ltq_gpio_setbit(m, r, p) ltq_w32_mask(0, (1 << p), m + r) +#define ltq_gpio_clearbit(m, r, p) ltq_w32_mask((1 << p), 0, m + r) + +struct ltq_gpio { + void __iomem *membase; + struct gpio_chip chip; +}; + +static struct ltq_gpio ltq_gpio_port[MAX_PORTS]; + +int gpio_to_irq(unsigned int gpio) +{ + return -EINVAL; +} +EXPORT_SYMBOL(gpio_to_irq); + +int irq_to_gpio(unsigned int gpio) +{ + return -EINVAL; +} +EXPORT_SYMBOL(irq_to_gpio); + +int ltq_gpio_request(unsigned int pin, unsigned int alt0, + unsigned int alt1, unsigned int dir, const char *name) +{ + int id = 0; + + if (pin >= (MAX_PORTS * PINS_PER_PORT)) + return -EINVAL; + if (gpio_request(pin, name)) { + pr_err("failed to setup lantiq gpio: %s\n", name); + return -EBUSY; + } + if (dir) + gpio_direction_output(pin, 1); + else + gpio_direction_input(pin); + while (pin >= PINS_PER_PORT) { + pin -= PINS_PER_PORT; + id++; + } + if (alt0) + ltq_gpio_setbit(ltq_gpio_port[id].membase, + LTQ_GPIO_ALTSEL0, pin); + else + ltq_gpio_clearbit(ltq_gpio_port[id].membase, + LTQ_GPIO_ALTSEL0, pin); + if (alt1) + ltq_gpio_setbit(ltq_gpio_port[id].membase, + LTQ_GPIO_ALTSEL1, pin); + else + ltq_gpio_clearbit(ltq_gpio_port[id].membase, + LTQ_GPIO_ALTSEL1, pin); + return 0; +} +EXPORT_SYMBOL(ltq_gpio_request); + +static void ltq_gpio_set(struct gpio_chip *chip, unsigned int offset, int value) +{ + struct ltq_gpio *ltq_gpio = container_of(chip, struct ltq_gpio, chip); + + if (value) + ltq_gpio_setbit(ltq_gpio->membase, LTQ_GPIO_OUT, offset); + else + ltq_gpio_clearbit(ltq_gpio->membase, LTQ_GPIO_OUT, offset); +} + +static int ltq_gpio_get(struct gpio_chip *chip, unsigned int offset) +{ + struct ltq_gpio *ltq_gpio = container_of(chip, struct ltq_gpio, chip); + + return ltq_gpio_getbit(ltq_gpio->membase, LTQ_GPIO_IN, offset); +} + +static int ltq_gpio_direction_input(struct gpio_chip *chip, unsigned int offset) +{ + struct ltq_gpio *ltq_gpio = container_of(chip, struct ltq_gpio, chip); + + ltq_gpio_clearbit(ltq_gpio->membase, LTQ_GPIO_OD, offset); + ltq_gpio_clearbit(ltq_gpio->membase, LTQ_GPIO_DIR, offset); + + return 0; +} + +static int ltq_gpio_direction_output(struct gpio_chip *chip, + unsigned int offset, int value) +{ + struct ltq_gpio *ltq_gpio = container_of(chip, struct ltq_gpio, chip); + + ltq_gpio_setbit(ltq_gpio->membase, LTQ_GPIO_OD, offset); + ltq_gpio_setbit(ltq_gpio->membase, LTQ_GPIO_DIR, offset); + ltq_gpio_set(chip, offset, value); + + return 0; +} + +static int ltq_gpio_req(struct gpio_chip *chip, unsigned offset) +{ + struct ltq_gpio *ltq_gpio = container_of(chip, struct ltq_gpio, chip); + + ltq_gpio_clearbit(ltq_gpio->membase, LTQ_GPIO_ALTSEL0, offset); + ltq_gpio_clearbit(ltq_gpio->membase, LTQ_GPIO_ALTSEL1, offset); + return 0; +} + +static int ltq_gpio_probe(struct platform_device *pdev) +{ + struct resource *res; + + if (pdev->id >= MAX_PORTS) { + dev_err(&pdev->dev, "invalid gpio port %d\n", + pdev->id); + return -EINVAL; + } + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (!res) { + dev_err(&pdev->dev, "failed to get memory for gpio port %d\n", + pdev->id); + return -ENOENT; + } + res = devm_request_mem_region(&pdev->dev, res->start, + resource_size(res), dev_name(&pdev->dev)); + if (!res) { + dev_err(&pdev->dev, + "failed to request memory for gpio port %d\n", + pdev->id); + return -EBUSY; + } + ltq_gpio_port[pdev->id].membase = devm_ioremap_nocache(&pdev->dev, + res->start, resource_size(res)); + if (!ltq_gpio_port[pdev->id].membase) { + dev_err(&pdev->dev, "failed to remap memory for gpio port %d\n", + pdev->id); + return -ENOMEM; + } + ltq_gpio_port[pdev->id].chip.label = "ltq_gpio"; + ltq_gpio_port[pdev->id].chip.direction_input = ltq_gpio_direction_input; + ltq_gpio_port[pdev->id].chip.direction_output = + ltq_gpio_direction_output; + ltq_gpio_port[pdev->id].chip.get = ltq_gpio_get; + ltq_gpio_port[pdev->id].chip.set = ltq_gpio_set; + ltq_gpio_port[pdev->id].chip.request = ltq_gpio_req; + ltq_gpio_port[pdev->id].chip.base = PINS_PER_PORT * pdev->id; + ltq_gpio_port[pdev->id].chip.ngpio = PINS_PER_PORT; + platform_set_drvdata(pdev, <q_gpio_port[pdev->id]); + return gpiochip_add(<q_gpio_port[pdev->id].chip); +} + +static struct platform_driver +ltq_gpio_driver = { + .probe = ltq_gpio_probe, + .driver = { + .name = "ltq_gpio", + .owner = THIS_MODULE, + }, +}; + +int __init ltq_gpio_init(void) +{ + int ret = platform_driver_register(<q_gpio_driver); + + if (ret) + pr_info("ltq_gpio : Error registering platfom driver!"); + return ret; +} + +postcore_initcall(ltq_gpio_init); diff --git a/arch/mips/lantiq/xway/pmu.c b/arch/mips/lantiq/xway/pmu.c new file mode 100644 index 0000000..9d69f01e --- /dev/null +++ b/arch/mips/lantiq/xway/pmu.c @@ -0,0 +1,70 @@ +/* + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * Copyright (C) 2010 John Crispin + */ + +#include +#include +#include +#include + +#include + +/* PMU - the power management unit allows us to turn part of the core + * on and off + */ + +/* the enable / disable registers */ +#define LTQ_PMU_PWDCR 0x1C +#define LTQ_PMU_PWDSR 0x20 + +#define ltq_pmu_w32(x, y) ltq_w32((x), ltq_pmu_membase + (y)) +#define ltq_pmu_r32(x) ltq_r32(ltq_pmu_membase + (x)) + +static struct resource ltq_pmu_resource = { + .name = "pmu", + .start = LTQ_PMU_BASE_ADDR, + .end = LTQ_PMU_BASE_ADDR + LTQ_PMU_SIZE - 1, + .flags = IORESOURCE_MEM, +}; + +static void __iomem *ltq_pmu_membase; + +void ltq_pmu_enable(unsigned int module) +{ + int err = 1000000; + + ltq_pmu_w32(ltq_pmu_r32(LTQ_PMU_PWDCR) & ~module, LTQ_PMU_PWDCR); + do {} while (--err && (ltq_pmu_r32(LTQ_PMU_PWDSR) & module)); + + if (!err) + panic("activating PMU module failed!\n"); +} +EXPORT_SYMBOL(ltq_pmu_enable); + +void ltq_pmu_disable(unsigned int module) +{ + ltq_pmu_w32(ltq_pmu_r32(LTQ_PMU_PWDCR) | module, LTQ_PMU_PWDCR); +} +EXPORT_SYMBOL(ltq_pmu_disable); + +int __init ltq_pmu_init(void) +{ + if (insert_resource(&iomem_resource, <q_pmu_resource) < 0) + panic("Failed to insert pmu memory\n"); + + if (request_mem_region(ltq_pmu_resource.start, + resource_size(<q_pmu_resource), "pmu") < 0) + panic("Failed to request pmu memory\n"); + + ltq_pmu_membase = ioremap_nocache(ltq_pmu_resource.start, + resource_size(<q_pmu_resource)); + if (!ltq_pmu_membase) + panic("Failed to remap pmu memory\n"); + return 0; +} + +core_initcall(ltq_pmu_init); diff --git a/arch/mips/lantiq/xway/prom-ase.c b/arch/mips/lantiq/xway/prom-ase.c new file mode 100644 index 0000000..abe49f4 --- /dev/null +++ b/arch/mips/lantiq/xway/prom-ase.c @@ -0,0 +1,39 @@ +/* + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * Copyright (C) 2010 John Crispin + */ + +#include +#include +#include +#include + +#include + +#include "../prom.h" + +#define SOC_AMAZON_SE "Amazon_SE" + +#define PART_SHIFT 12 +#define PART_MASK 0x0FFFFFFF +#define REV_SHIFT 28 +#define REV_MASK 0xF0000000 + +void __init ltq_soc_detect(struct ltq_soc_info *i) +{ + i->partnum = (ltq_r32(LTQ_MPS_CHIPID) & PART_MASK) >> PART_SHIFT; + i->rev = (ltq_r32(LTQ_MPS_CHIPID) & REV_MASK) >> REV_SHIFT; + switch (i->partnum) { + case SOC_ID_AMAZON_SE: + i->name = SOC_AMAZON_SE; + i->type = SOC_TYPE_AMAZON_SE; + break; + + default: + unreachable(); + break; + } +} diff --git a/arch/mips/lantiq/xway/prom-xway.c b/arch/mips/lantiq/xway/prom-xway.c new file mode 100644 index 0000000..1686692a --- /dev/null +++ b/arch/mips/lantiq/xway/prom-xway.c @@ -0,0 +1,54 @@ +/* + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * Copyright (C) 2010 John Crispin + */ + +#include +#include +#include +#include + +#include + +#include "../prom.h" + +#define SOC_DANUBE "Danube" +#define SOC_TWINPASS "Twinpass" +#define SOC_AR9 "AR9" + +#define PART_SHIFT 12 +#define PART_MASK 0x0FFFFFFF +#define REV_SHIFT 28 +#define REV_MASK 0xF0000000 + +void __init ltq_soc_detect(struct ltq_soc_info *i) +{ + i->partnum = (ltq_r32(LTQ_MPS_CHIPID) & PART_MASK) >> PART_SHIFT; + i->rev = (ltq_r32(LTQ_MPS_CHIPID) & REV_MASK) >> REV_SHIFT; + switch (i->partnum) { + case SOC_ID_DANUBE1: + case SOC_ID_DANUBE2: + i->name = SOC_DANUBE; + i->type = SOC_TYPE_DANUBE; + break; + + case SOC_ID_TWINPASS: + i->name = SOC_TWINPASS; + i->type = SOC_TYPE_DANUBE; + break; + + case SOC_ID_ARX188: + case SOC_ID_ARX168: + case SOC_ID_ARX182: + i->name = SOC_AR9; + i->type = SOC_TYPE_AR9; + break; + + default: + unreachable(); + break; + } +} diff --git a/arch/mips/lantiq/xway/reset.c b/arch/mips/lantiq/xway/reset.c new file mode 100644 index 0000000..a1be36d --- /dev/null +++ b/arch/mips/lantiq/xway/reset.c @@ -0,0 +1,91 @@ +/* + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * Copyright (C) 2010 John Crispin + */ + +#include +#include +#include +#include +#include +#include + +#include + +#define ltq_rcu_w32(x, y) ltq_w32((x), ltq_rcu_membase + (y)) +#define ltq_rcu_r32(x) ltq_r32(ltq_rcu_membase + (x)) + +/* register definitions */ +#define LTQ_RCU_RST 0x0010 +#define LTQ_RCU_RST_ALL 0x40000000 + +#define LTQ_RCU_RST_STAT 0x0014 +#define LTQ_RCU_STAT_SHIFT 26 + +static struct resource ltq_rcu_resource = { + .name = "rcu", + .start = LTQ_RCU_BASE_ADDR, + .end = LTQ_RCU_BASE_ADDR + LTQ_RCU_SIZE - 1, + .flags = IORESOURCE_MEM, +}; + +/* remapped base addr of the reset control unit */ +static void __iomem *ltq_rcu_membase; + +/* This function is used by the watchdog driver */ +int ltq_reset_cause(void) +{ + u32 val = ltq_rcu_r32(LTQ_RCU_RST_STAT); + return val >> LTQ_RCU_STAT_SHIFT; +} +EXPORT_SYMBOL_GPL(ltq_reset_cause); + +static void ltq_machine_restart(char *command) +{ + pr_notice("System restart\n"); + local_irq_disable(); + ltq_rcu_w32(ltq_rcu_r32(LTQ_RCU_RST) | LTQ_RCU_RST_ALL, LTQ_RCU_RST); + unreachable(); +} + +static void ltq_machine_halt(void) +{ + pr_notice("System halted.\n"); + local_irq_disable(); + unreachable(); +} + +static void ltq_machine_power_off(void) +{ + pr_notice("Please turn off the power now.\n"); + local_irq_disable(); + unreachable(); +} + +static int __init mips_reboot_setup(void) +{ + /* insert and request the memory region */ + if (insert_resource(&iomem_resource, <q_rcu_resource) < 0) + panic("Failed to insert rcu memory\n"); + + if (request_mem_region(ltq_rcu_resource.start, + resource_size(<q_rcu_resource), "rcu") < 0) + panic("Failed to request rcu memory\n"); + + /* remap rcu register range */ + ltq_rcu_membase = ioremap_nocache(ltq_rcu_resource.start, + resource_size(<q_rcu_resource)); + if (!ltq_rcu_membase) + panic("Failed to remap rcu memory\n"); + + _machine_restart = ltq_machine_restart; + _machine_halt = ltq_machine_halt; + pm_power_off = ltq_machine_power_off; + + return 0; +} + +arch_initcall(mips_reboot_setup); -- cgit v1.1 From e47d488935ed0b2dd3d59d3ba4e13956ff6849c0 Mon Sep 17 00:00:00 2001 From: John Crispin Date: Wed, 30 Mar 2011 09:27:49 +0200 Subject: MIPS: Lantiq: Add PCI controller support. The Lantiq family of SoCs have a EBU (External Bus Unit). This patch adds the driver that allows us to use the EBU as a PCI controller. In order for PCI to work the EBU is set to endianess swap all the data. In addition we need to make use of SWAP_IO_SPACE for device->host DMA to work. The clock of the PCI works in several modes (internal/external). If this is not configured correctly the SoC will hang. Signed-off-by: John Crispin Signed-off-by: Ralph Hempel Cc: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/2250/ Signed-off-by: Ralf Baechle --- .../mips/include/asm/mach-lantiq/lantiq_platform.h | 46 ++++ arch/mips/pci/Makefile | 1 + arch/mips/pci/ops-lantiq.c | 116 ++++++++ arch/mips/pci/pci-lantiq.c | 297 +++++++++++++++++++++ arch/mips/pci/pci-lantiq.h | 18 ++ 5 files changed, 478 insertions(+) create mode 100644 arch/mips/include/asm/mach-lantiq/lantiq_platform.h create mode 100644 arch/mips/pci/ops-lantiq.c create mode 100644 arch/mips/pci/pci-lantiq.c create mode 100644 arch/mips/pci/pci-lantiq.h diff --git a/arch/mips/include/asm/mach-lantiq/lantiq_platform.h b/arch/mips/include/asm/mach-lantiq/lantiq_platform.h new file mode 100644 index 0000000..1f1dba6 --- /dev/null +++ b/arch/mips/include/asm/mach-lantiq/lantiq_platform.h @@ -0,0 +1,46 @@ +/* + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * Copyright (C) 2010 John Crispin + */ + +#ifndef _LANTIQ_PLATFORM_H__ +#define _LANTIQ_PLATFORM_H__ + +#include + +/* struct used to pass info to the pci core */ +enum { + PCI_CLOCK_INT = 0, + PCI_CLOCK_EXT +}; + +#define PCI_EXIN0 0x0001 +#define PCI_EXIN1 0x0002 +#define PCI_EXIN2 0x0004 +#define PCI_EXIN3 0x0008 +#define PCI_EXIN4 0x0010 +#define PCI_EXIN5 0x0020 +#define PCI_EXIN_MAX 6 + +#define PCI_GNT1 0x0040 +#define PCI_GNT2 0x0080 +#define PCI_GNT3 0x0100 +#define PCI_GNT4 0x0200 + +#define PCI_REQ1 0x0400 +#define PCI_REQ2 0x0800 +#define PCI_REQ3 0x1000 +#define PCI_REQ4 0x2000 +#define PCI_REQ_SHIFT 10 +#define PCI_REQ_MASK 0xf + +struct ltq_pci_data { + int clock; + int gpio; + int irq[16]; +}; + +#endif diff --git a/arch/mips/pci/Makefile b/arch/mips/pci/Makefile index f0d5329..4df8799 100644 --- a/arch/mips/pci/Makefile +++ b/arch/mips/pci/Makefile @@ -41,6 +41,7 @@ obj-$(CONFIG_SIBYTE_SB1250) += fixup-sb1250.o pci-sb1250.o obj-$(CONFIG_SIBYTE_BCM112X) += fixup-sb1250.o pci-sb1250.o obj-$(CONFIG_SIBYTE_BCM1x80) += pci-bcm1480.o pci-bcm1480ht.o obj-$(CONFIG_SNI_RM) += fixup-sni.o ops-sni.o +obj-$(CONFIG_SOC_XWAY) += pci-lantiq.o ops-lantiq.o obj-$(CONFIG_TANBAC_TB0219) += fixup-tb0219.o obj-$(CONFIG_TANBAC_TB0226) += fixup-tb0226.o obj-$(CONFIG_TANBAC_TB0287) += fixup-tb0287.o diff --git a/arch/mips/pci/ops-lantiq.c b/arch/mips/pci/ops-lantiq.c new file mode 100644 index 0000000..1f2afb5 --- /dev/null +++ b/arch/mips/pci/ops-lantiq.c @@ -0,0 +1,116 @@ +/* + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * Copyright (C) 2010 John Crispin + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "pci-lantiq.h" + +#define LTQ_PCI_CFG_BUSNUM_SHF 16 +#define LTQ_PCI_CFG_DEVNUM_SHF 11 +#define LTQ_PCI_CFG_FUNNUM_SHF 8 + +#define PCI_ACCESS_READ 0 +#define PCI_ACCESS_WRITE 1 + +static int ltq_pci_config_access(unsigned char access_type, struct pci_bus *bus, + unsigned int devfn, unsigned int where, u32 *data) +{ + unsigned long cfg_base; + unsigned long flags; + u32 temp; + + /* we support slot from 0 to 15 dev_fn & 0x68 (AD29) is the + SoC itself */ + if ((bus->number != 0) || ((devfn & 0xf8) > 0x78) + || ((devfn & 0xf8) == 0) || ((devfn & 0xf8) == 0x68)) + return 1; + + spin_lock_irqsave(&ebu_lock, flags); + + cfg_base = (unsigned long) ltq_pci_mapped_cfg; + cfg_base |= (bus->number << LTQ_PCI_CFG_BUSNUM_SHF) | (devfn << + LTQ_PCI_CFG_FUNNUM_SHF) | (where & ~0x3); + + /* Perform access */ + if (access_type == PCI_ACCESS_WRITE) { + ltq_w32(swab32(*data), ((u32 *)cfg_base)); + } else { + *data = ltq_r32(((u32 *)(cfg_base))); + *data = swab32(*data); + } + wmb(); + + /* clean possible Master abort */ + cfg_base = (unsigned long) ltq_pci_mapped_cfg; + cfg_base |= (0x0 << LTQ_PCI_CFG_FUNNUM_SHF) + 4; + temp = ltq_r32(((u32 *)(cfg_base))); + temp = swab32(temp); + cfg_base = (unsigned long) ltq_pci_mapped_cfg; + cfg_base |= (0x68 << LTQ_PCI_CFG_FUNNUM_SHF) + 4; + ltq_w32(temp, ((u32 *)cfg_base)); + + spin_unlock_irqrestore(&ebu_lock, flags); + + if (((*data) == 0xffffffff) && (access_type == PCI_ACCESS_READ)) + return 1; + + return 0; +} + +int ltq_pci_read_config_dword(struct pci_bus *bus, unsigned int devfn, + int where, int size, u32 *val) +{ + u32 data = 0; + + if (ltq_pci_config_access(PCI_ACCESS_READ, bus, devfn, where, &data)) + return PCIBIOS_DEVICE_NOT_FOUND; + + if (size == 1) + *val = (data >> ((where & 3) << 3)) & 0xff; + else if (size == 2) + *val = (data >> ((where & 3) << 3)) & 0xffff; + else + *val = data; + + return PCIBIOS_SUCCESSFUL; +} + +int ltq_pci_write_config_dword(struct pci_bus *bus, unsigned int devfn, + int where, int size, u32 val) +{ + u32 data = 0; + + if (size == 4) { + data = val; + } else { + if (ltq_pci_config_access(PCI_ACCESS_READ, bus, + devfn, where, &data)) + return PCIBIOS_DEVICE_NOT_FOUND; + + if (size == 1) + data = (data & ~(0xff << ((where & 3) << 3))) | + (val << ((where & 3) << 3)); + else if (size == 2) + data = (data & ~(0xffff << ((where & 3) << 3))) | + (val << ((where & 3) << 3)); + } + + if (ltq_pci_config_access(PCI_ACCESS_WRITE, bus, devfn, where, &data)) + return PCIBIOS_DEVICE_NOT_FOUND; + + return PCIBIOS_SUCCESSFUL; +} diff --git a/arch/mips/pci/pci-lantiq.c b/arch/mips/pci/pci-lantiq.c new file mode 100644 index 0000000..603d749 --- /dev/null +++ b/arch/mips/pci/pci-lantiq.c @@ -0,0 +1,297 @@ +/* + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * Copyright (C) 2010 John Crispin + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include + +#include "pci-lantiq.h" + +#define LTQ_PCI_CFG_BASE 0x17000000 +#define LTQ_PCI_CFG_SIZE 0x00008000 +#define LTQ_PCI_MEM_BASE 0x18000000 +#define LTQ_PCI_MEM_SIZE 0x02000000 +#define LTQ_PCI_IO_BASE 0x1AE00000 +#define LTQ_PCI_IO_SIZE 0x00200000 + +#define PCI_CR_FCI_ADDR_MAP0 0x00C0 +#define PCI_CR_FCI_ADDR_MAP1 0x00C4 +#define PCI_CR_FCI_ADDR_MAP2 0x00C8 +#define PCI_CR_FCI_ADDR_MAP3 0x00CC +#define PCI_CR_FCI_ADDR_MAP4 0x00D0 +#define PCI_CR_FCI_ADDR_MAP5 0x00D4 +#define PCI_CR_FCI_ADDR_MAP6 0x00D8 +#define PCI_CR_FCI_ADDR_MAP7 0x00DC +#define PCI_CR_CLK_CTRL 0x0000 +#define PCI_CR_PCI_MOD 0x0030 +#define PCI_CR_PC_ARB 0x0080 +#define PCI_CR_FCI_ADDR_MAP11hg 0x00E4 +#define PCI_CR_BAR11MASK 0x0044 +#define PCI_CR_BAR12MASK 0x0048 +#define PCI_CR_BAR13MASK 0x004C +#define PCI_CS_BASE_ADDR1 0x0010 +#define PCI_CR_PCI_ADDR_MAP11 0x0064 +#define PCI_CR_FCI_BURST_LENGTH 0x00E8 +#define PCI_CR_PCI_EOI 0x002C +#define PCI_CS_STS_CMD 0x0004 + +#define PCI_MASTER0_REQ_MASK_2BITS 8 +#define PCI_MASTER1_REQ_MASK_2BITS 10 +#define PCI_MASTER2_REQ_MASK_2BITS 12 +#define INTERNAL_ARB_ENABLE_BIT 0 + +#define LTQ_CGU_IFCCR 0x0018 +#define LTQ_CGU_PCICR 0x0034 + +#define ltq_pci_w32(x, y) ltq_w32((x), ltq_pci_membase + (y)) +#define ltq_pci_r32(x) ltq_r32(ltq_pci_membase + (x)) + +#define ltq_pci_cfg_w32(x, y) ltq_w32((x), ltq_pci_mapped_cfg + (y)) +#define ltq_pci_cfg_r32(x) ltq_r32(ltq_pci_mapped_cfg + (x)) + +struct ltq_pci_gpio_map { + int pin; + int alt0; + int alt1; + int dir; + char *name; +}; + +/* the pci core can make use of the following gpios */ +static struct ltq_pci_gpio_map ltq_pci_gpio_map[] = { + { 0, 1, 0, 0, "pci-exin0" }, + { 1, 1, 0, 0, "pci-exin1" }, + { 2, 1, 0, 0, "pci-exin2" }, + { 39, 1, 0, 0, "pci-exin3" }, + { 10, 1, 0, 0, "pci-exin4" }, + { 9, 1, 0, 0, "pci-exin5" }, + { 30, 1, 0, 1, "pci-gnt1" }, + { 23, 1, 0, 1, "pci-gnt2" }, + { 19, 1, 0, 1, "pci-gnt3" }, + { 38, 1, 0, 1, "pci-gnt4" }, + { 29, 1, 0, 0, "pci-req1" }, + { 31, 1, 0, 0, "pci-req2" }, + { 3, 1, 0, 0, "pci-req3" }, + { 37, 1, 0, 0, "pci-req4" }, +}; + +__iomem void *ltq_pci_mapped_cfg; +static __iomem void *ltq_pci_membase; + +int (*ltqpci_plat_dev_init)(struct pci_dev *dev) = NULL; + +/* Since the PCI REQ pins can be reused for other functionality, make it + possible to exclude those from interpretation by the PCI controller */ +static int ltq_pci_req_mask = 0xf; + +static int *ltq_pci_irq_map; + +struct pci_ops ltq_pci_ops = { + .read = ltq_pci_read_config_dword, + .write = ltq_pci_write_config_dword +}; + +static struct resource pci_io_resource = { + .name = "pci io space", + .start = LTQ_PCI_IO_BASE, + .end = LTQ_PCI_IO_BASE + LTQ_PCI_IO_SIZE - 1, + .flags = IORESOURCE_IO +}; + +static struct resource pci_mem_resource = { + .name = "pci memory space", + .start = LTQ_PCI_MEM_BASE, + .end = LTQ_PCI_MEM_BASE + LTQ_PCI_MEM_SIZE - 1, + .flags = IORESOURCE_MEM +}; + +static struct pci_controller ltq_pci_controller = { + .pci_ops = <q_pci_ops, + .mem_resource = &pci_mem_resource, + .mem_offset = 0x00000000UL, + .io_resource = &pci_io_resource, + .io_offset = 0x00000000UL, +}; + +int pcibios_plat_dev_init(struct pci_dev *dev) +{ + if (ltqpci_plat_dev_init) + return ltqpci_plat_dev_init(dev); + + return 0; +} + +static u32 ltq_calc_bar11mask(void) +{ + u32 mem, bar11mask; + + /* BAR11MASK value depends on available memory on system. */ + mem = num_physpages * PAGE_SIZE; + bar11mask = (0x0ffffff0 & ~((1 << (fls(mem) - 1)) - 1)) | 8; + + return bar11mask; +} + +static void ltq_pci_setup_gpio(int gpio) +{ + int i; + for (i = 0; i < ARRAY_SIZE(ltq_pci_gpio_map); i++) { + if (gpio & (1 << i)) { + ltq_gpio_request(ltq_pci_gpio_map[i].pin, + ltq_pci_gpio_map[i].alt0, + ltq_pci_gpio_map[i].alt1, + ltq_pci_gpio_map[i].dir, + ltq_pci_gpio_map[i].name); + } + } + ltq_gpio_request(21, 0, 0, 1, "pci-reset"); + ltq_pci_req_mask = (gpio >> PCI_REQ_SHIFT) & PCI_REQ_MASK; +} + +static int __devinit ltq_pci_startup(struct ltq_pci_data *conf) +{ + u32 temp_buffer; + + /* set clock to 33Mhz */ + ltq_cgu_w32(ltq_cgu_r32(LTQ_CGU_IFCCR) & ~0xf00000, LTQ_CGU_IFCCR); + ltq_cgu_w32(ltq_cgu_r32(LTQ_CGU_IFCCR) | 0x800000, LTQ_CGU_IFCCR); + + /* external or internal clock ? */ + if (conf->clock) { + ltq_cgu_w32(ltq_cgu_r32(LTQ_CGU_IFCCR) & ~(1 << 16), + LTQ_CGU_IFCCR); + ltq_cgu_w32((1 << 30), LTQ_CGU_PCICR); + } else { + ltq_cgu_w32(ltq_cgu_r32(LTQ_CGU_IFCCR) | (1 << 16), + LTQ_CGU_IFCCR); + ltq_cgu_w32((1 << 31) | (1 << 30), LTQ_CGU_PCICR); + } + + /* setup pci clock and gpis used by pci */ + ltq_pci_setup_gpio(conf->gpio); + + /* enable auto-switching between PCI and EBU */ + ltq_pci_w32(0xa, PCI_CR_CLK_CTRL); + + /* busy, i.e. configuration is not done, PCI access has to be retried */ + ltq_pci_w32(ltq_pci_r32(PCI_CR_PCI_MOD) & ~(1 << 24), PCI_CR_PCI_MOD); + wmb(); + /* BUS Master/IO/MEM access */ + ltq_pci_cfg_w32(ltq_pci_cfg_r32(PCI_CS_STS_CMD) | 7, PCI_CS_STS_CMD); + + /* enable external 2 PCI masters */ + temp_buffer = ltq_pci_r32(PCI_CR_PC_ARB); + temp_buffer &= (~(ltq_pci_req_mask << 16)); + /* enable internal arbiter */ + temp_buffer |= (1 << INTERNAL_ARB_ENABLE_BIT); + /* enable internal PCI master reqest */ + temp_buffer &= (~(3 << PCI_MASTER0_REQ_MASK_2BITS)); + + /* enable EBU request */ + temp_buffer &= (~(3 << PCI_MASTER1_REQ_MASK_2BITS)); + + /* enable all external masters request */ + temp_buffer &= (~(3 << PCI_MASTER2_REQ_MASK_2BITS)); + ltq_pci_w32(temp_buffer, PCI_CR_PC_ARB); + wmb(); + + /* setup BAR memory regions */ + ltq_pci_w32(0x18000000, PCI_CR_FCI_ADDR_MAP0); + ltq_pci_w32(0x18400000, PCI_CR_FCI_ADDR_MAP1); + ltq_pci_w32(0x18800000, PCI_CR_FCI_ADDR_MAP2); + ltq_pci_w32(0x18c00000, PCI_CR_FCI_ADDR_MAP3); + ltq_pci_w32(0x19000000, PCI_CR_FCI_ADDR_MAP4); + ltq_pci_w32(0x19400000, PCI_CR_FCI_ADDR_MAP5); + ltq_pci_w32(0x19800000, PCI_CR_FCI_ADDR_MAP6); + ltq_pci_w32(0x19c00000, PCI_CR_FCI_ADDR_MAP7); + ltq_pci_w32(0x1ae00000, PCI_CR_FCI_ADDR_MAP11hg); + ltq_pci_w32(ltq_calc_bar11mask(), PCI_CR_BAR11MASK); + ltq_pci_w32(0, PCI_CR_PCI_ADDR_MAP11); + ltq_pci_w32(0, PCI_CS_BASE_ADDR1); + /* both TX and RX endian swap are enabled */ + ltq_pci_w32(ltq_pci_r32(PCI_CR_PCI_EOI) | 3, PCI_CR_PCI_EOI); + wmb(); + ltq_pci_w32(ltq_pci_r32(PCI_CR_BAR12MASK) | 0x80000000, + PCI_CR_BAR12MASK); + ltq_pci_w32(ltq_pci_r32(PCI_CR_BAR13MASK) | 0x80000000, + PCI_CR_BAR13MASK); + /*use 8 dw burst length */ + ltq_pci_w32(0x303, PCI_CR_FCI_BURST_LENGTH); + ltq_pci_w32(ltq_pci_r32(PCI_CR_PCI_MOD) | (1 << 24), PCI_CR_PCI_MOD); + wmb(); + + /* setup irq line */ + ltq_ebu_w32(ltq_ebu_r32(LTQ_EBU_PCC_CON) | 0xc, LTQ_EBU_PCC_CON); + ltq_ebu_w32(ltq_ebu_r32(LTQ_EBU_PCC_IEN) | 0x10, LTQ_EBU_PCC_IEN); + + /* toggle reset pin */ + __gpio_set_value(21, 0); + wmb(); + mdelay(1); + __gpio_set_value(21, 1); + return 0; +} + +int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +{ + if (ltq_pci_irq_map[slot]) + return ltq_pci_irq_map[slot]; + printk(KERN_ERR "lq_pci: trying to map irq for unknown slot %d\n", + slot); + + return 0; +} + +static int __devinit ltq_pci_probe(struct platform_device *pdev) +{ + struct ltq_pci_data *ltq_pci_data = + (struct ltq_pci_data *) pdev->dev.platform_data; + pci_probe_only = 0; + ltq_pci_irq_map = ltq_pci_data->irq; + ltq_pci_membase = ioremap_nocache(PCI_CR_BASE_ADDR, PCI_CR_SIZE); + ltq_pci_mapped_cfg = + ioremap_nocache(LTQ_PCI_CFG_BASE, LTQ_PCI_CFG_BASE); + ltq_pci_controller.io_map_base = + (unsigned long)ioremap(LTQ_PCI_IO_BASE, LTQ_PCI_IO_SIZE - 1); + ltq_pci_startup(ltq_pci_data); + register_pci_controller(<q_pci_controller); + + return 0; +} + +static struct platform_driver +ltq_pci_driver = { + .probe = ltq_pci_probe, + .driver = { + .name = "ltq_pci", + .owner = THIS_MODULE, + }, +}; + +int __init pcibios_init(void) +{ + int ret = platform_driver_register(<q_pci_driver); + if (ret) + printk(KERN_INFO "ltq_pci: Error registering platfom driver!"); + return ret; +} + +arch_initcall(pcibios_init); diff --git a/arch/mips/pci/pci-lantiq.h b/arch/mips/pci/pci-lantiq.h new file mode 100644 index 0000000..66bf6cd --- /dev/null +++ b/arch/mips/pci/pci-lantiq.h @@ -0,0 +1,18 @@ +/* + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * Copyright (C) 2010 John Crispin + */ + +#ifndef _LTQ_PCI_H__ +#define _LTQ_PCI_H__ + +extern __iomem void *ltq_pci_mapped_cfg; +extern int ltq_pci_read_config_dword(struct pci_bus *bus, + unsigned int devfn, int where, int size, u32 *val); +extern int ltq_pci_write_config_dword(struct pci_bus *bus, + unsigned int devfn, int where, int size, u32 val); + +#endif -- cgit v1.1 From 3c5447390c3e1a462913e327a016a55cae501580 Mon Sep 17 00:00:00 2001 From: John Crispin Date: Tue, 12 Apr 2011 18:10:01 +0200 Subject: MIPS: Lantiq: Add NOR flash support This patch adds the driver/map for NOR devices attached to the SoC via the External Bus Unit (EBU). Signed-off-by: John Crispin Signed-off-by: Ralph Hempel Cc: David Woodhouse Cc: Daniel Schwierzeck Cc: linux-mips@linux-mips.org Cc: linux-mtd@lists.infradead.org Acked-by: Artem Bityutskiy Patchwork: https://patchwork.linux-mips.org/patch/2285/ Signed-off-by: Ralf Baechle --- drivers/mtd/maps/Kconfig | 7 ++ drivers/mtd/maps/Makefile | 1 + drivers/mtd/maps/lantiq-flash.c | 251 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 259 insertions(+) create mode 100644 drivers/mtd/maps/lantiq-flash.c diff --git a/drivers/mtd/maps/Kconfig b/drivers/mtd/maps/Kconfig index 44b1f46..5069111 100644 --- a/drivers/mtd/maps/Kconfig +++ b/drivers/mtd/maps/Kconfig @@ -260,6 +260,13 @@ config MTD_BCM963XX Support for parsing CFE image tag and creating MTD partitions on Broadcom BCM63xx boards. +config MTD_LANTIQ + tristate "Lantiq SoC NOR support" + depends on LANTIQ + select MTD_PARTITIONS + help + Support for NOR flash attached to the Lantiq SoC's External Bus Unit. + config MTD_DILNETPC tristate "CFI Flash device mapped on DIL/Net PC" depends on X86 && MTD_PARTITIONS && MTD_CFI_INTELEXT && BROKEN diff --git a/drivers/mtd/maps/Makefile b/drivers/mtd/maps/Makefile index 08533bd..6adf4c9 100644 --- a/drivers/mtd/maps/Makefile +++ b/drivers/mtd/maps/Makefile @@ -60,3 +60,4 @@ obj-$(CONFIG_MTD_VMU) += vmu-flash.o obj-$(CONFIG_MTD_GPIO_ADDR) += gpio-addr-flash.o obj-$(CONFIG_MTD_BCM963XX) += bcm963xx-flash.o obj-$(CONFIG_MTD_LATCH_ADDR) += latch-addr-flash.o +obj-$(CONFIG_MTD_LANTIQ) += lantiq-flash.o diff --git a/drivers/mtd/maps/lantiq-flash.c b/drivers/mtd/maps/lantiq-flash.c new file mode 100644 index 0000000..a90cabd --- /dev/null +++ b/drivers/mtd/maps/lantiq-flash.c @@ -0,0 +1,251 @@ +/* + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * Copyright (C) 2004 Liu Peng Infineon IFAP DC COM CPE + * Copyright (C) 2010 John Crispin + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +/* + * The NOR flash is connected to the same external bus unit (EBU) as PCI. + * To make PCI work we need to enable the endianness swapping for the address + * written to the EBU. This endianness swapping works for PCI correctly but + * fails for attached NOR devices. To workaround this we need to use a complex + * map. The workaround involves swapping all addresses whilst probing the chip. + * Once probing is complete we stop swapping the addresses but swizzle the + * unlock addresses to ensure that access to the NOR device works correctly. + */ + +enum { + LTQ_NOR_PROBING, + LTQ_NOR_NORMAL +}; + +struct ltq_mtd { + struct resource *res; + struct mtd_info *mtd; + struct map_info *map; +}; + +static char ltq_map_name[] = "ltq_nor"; + +static map_word +ltq_read16(struct map_info *map, unsigned long adr) +{ + unsigned long flags; + map_word temp; + + if (map->map_priv_1 == LTQ_NOR_PROBING) + adr ^= 2; + spin_lock_irqsave(&ebu_lock, flags); + temp.x[0] = *(u16 *)(map->virt + adr); + spin_unlock_irqrestore(&ebu_lock, flags); + return temp; +} + +static void +ltq_write16(struct map_info *map, map_word d, unsigned long adr) +{ + unsigned long flags; + + if (map->map_priv_1 == LTQ_NOR_PROBING) + adr ^= 2; + spin_lock_irqsave(&ebu_lock, flags); + *(u16 *)(map->virt + adr) = d.x[0]; + spin_unlock_irqrestore(&ebu_lock, flags); +} + +/* + * The following 2 functions copy data between iomem and a cached memory + * section. As memcpy() makes use of pre-fetching we cannot use it here. + * The normal alternative of using memcpy_{to,from}io also makes use of + * memcpy() on MIPS so it is not applicable either. We are therefore stuck + * with having to use our own loop. + */ +static void +ltq_copy_from(struct map_info *map, void *to, + unsigned long from, ssize_t len) +{ + unsigned char *f = (unsigned char *)map->virt + from; + unsigned char *t = (unsigned char *)to; + unsigned long flags; + + spin_lock_irqsave(&ebu_lock, flags); + while (len--) + *t++ = *f++; + spin_unlock_irqrestore(&ebu_lock, flags); +} + +static void +ltq_copy_to(struct map_info *map, unsigned long to, + const void *from, ssize_t len) +{ + unsigned char *f = (unsigned char *)from; + unsigned char *t = (unsigned char *)map->virt + to; + unsigned long flags; + + spin_lock_irqsave(&ebu_lock, flags); + while (len--) + *t++ = *f++; + spin_unlock_irqrestore(&ebu_lock, flags); +} + +static const char const *part_probe_types[] = { "cmdlinepart", NULL }; + +static int __init +ltq_mtd_probe(struct platform_device *pdev) +{ + struct physmap_flash_data *ltq_mtd_data = dev_get_platdata(&pdev->dev); + struct ltq_mtd *ltq_mtd; + struct mtd_partition *parts; + struct resource *res; + int nr_parts = 0; + struct cfi_private *cfi; + int err; + + ltq_mtd = kzalloc(sizeof(struct ltq_mtd), GFP_KERNEL); + platform_set_drvdata(pdev, ltq_mtd); + + ltq_mtd->res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (!ltq_mtd->res) { + dev_err(&pdev->dev, "failed to get memory resource"); + err = -ENOENT; + goto err_out; + } + + res = devm_request_mem_region(&pdev->dev, ltq_mtd->res->start, + resource_size(ltq_mtd->res), dev_name(&pdev->dev)); + if (!ltq_mtd->res) { + dev_err(&pdev->dev, "failed to request mem resource"); + err = -EBUSY; + goto err_out; + } + + ltq_mtd->map = kzalloc(sizeof(struct map_info), GFP_KERNEL); + ltq_mtd->map->phys = res->start; + ltq_mtd->map->size = resource_size(res); + ltq_mtd->map->virt = devm_ioremap_nocache(&pdev->dev, + ltq_mtd->map->phys, ltq_mtd->map->size); + if (!ltq_mtd->map->virt) { + dev_err(&pdev->dev, "failed to ioremap!\n"); + err = -ENOMEM; + goto err_free; + } + + ltq_mtd->map->name = ltq_map_name; + ltq_mtd->map->bankwidth = 2; + ltq_mtd->map->read = ltq_read16; + ltq_mtd->map->write = ltq_write16; + ltq_mtd->map->copy_from = ltq_copy_from; + ltq_mtd->map->copy_to = ltq_copy_to; + + ltq_mtd->map->map_priv_1 = LTQ_NOR_PROBING; + ltq_mtd->mtd = do_map_probe("cfi_probe", ltq_mtd->map); + ltq_mtd->map->map_priv_1 = LTQ_NOR_NORMAL; + + if (!ltq_mtd->mtd) { + dev_err(&pdev->dev, "probing failed\n"); + err = -ENXIO; + goto err_unmap; + } + + ltq_mtd->mtd->owner = THIS_MODULE; + + cfi = ltq_mtd->map->fldrv_priv; + cfi->addr_unlock1 ^= 1; + cfi->addr_unlock2 ^= 1; + + nr_parts = parse_mtd_partitions(ltq_mtd->mtd, + part_probe_types, &parts, 0); + if (nr_parts > 0) { + dev_info(&pdev->dev, + "using %d partitions from cmdline", nr_parts); + } else { + nr_parts = ltq_mtd_data->nr_parts; + parts = ltq_mtd_data->parts; + } + + err = add_mtd_partitions(ltq_mtd->mtd, parts, nr_parts); + if (err) { + dev_err(&pdev->dev, "failed to add partitions\n"); + goto err_destroy; + } + + return 0; + +err_destroy: + map_destroy(ltq_mtd->mtd); +err_unmap: + iounmap(ltq_mtd->map->virt); +err_free: + kfree(ltq_mtd->map); +err_out: + kfree(ltq_mtd); + return err; +} + +static int __devexit +ltq_mtd_remove(struct platform_device *pdev) +{ + struct ltq_mtd *ltq_mtd = platform_get_drvdata(pdev); + + if (ltq_mtd) { + if (ltq_mtd->mtd) { + del_mtd_partitions(ltq_mtd->mtd); + map_destroy(ltq_mtd->mtd); + } + if (ltq_mtd->map->virt) + iounmap(ltq_mtd->map->virt); + kfree(ltq_mtd->map); + kfree(ltq_mtd); + } + return 0; +} + +static struct platform_driver ltq_mtd_driver = { + .remove = __devexit_p(ltq_mtd_remove), + .driver = { + .name = "ltq_nor", + .owner = THIS_MODULE, + }, +}; + +static int __init +init_ltq_mtd(void) +{ + int ret = platform_driver_probe(<q_mtd_driver, ltq_mtd_probe); + + if (ret) + pr_err("ltq_nor: error registering platform driver"); + return ret; +} + +static void __exit +exit_ltq_mtd(void) +{ + platform_driver_unregister(<q_mtd_driver); +} + +module_init(init_ltq_mtd); +module_exit(exit_ltq_mtd); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("John Crispin "); +MODULE_DESCRIPTION("Lantiq SoC NOR"); -- cgit v1.1 From 24aff71fa8df0d6a73dab17f3f2285a24b8f658f Mon Sep 17 00:00:00 2001 From: John Crispin Date: Wed, 30 Mar 2011 09:27:53 +0200 Subject: MIPS: Lantiq: Add platform device support This patch adds the wrappers for registering our platform devices. Signed-off-by: John Crispin Signed-off-by: Ralph Hempel Cc: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/2254/ Patchwork: https://patchwork.linux-mips.org/patch/2360/ Patchwork: https://patchwork.linux-mips.org/patch/2359/ Signed-off-by: Ralf Baechle --- arch/mips/lantiq/Makefile | 2 +- arch/mips/lantiq/devices.c | 122 ++++++++++++++++++++++++++++++++++++++++ arch/mips/lantiq/devices.h | 23 ++++++++ arch/mips/lantiq/xway/Makefile | 2 +- arch/mips/lantiq/xway/devices.c | 98 ++++++++++++++++++++++++++++++++ arch/mips/lantiq/xway/devices.h | 18 ++++++ 6 files changed, 263 insertions(+), 2 deletions(-) create mode 100644 arch/mips/lantiq/devices.c create mode 100644 arch/mips/lantiq/devices.h create mode 100644 arch/mips/lantiq/xway/devices.c create mode 100644 arch/mips/lantiq/xway/devices.h diff --git a/arch/mips/lantiq/Makefile b/arch/mips/lantiq/Makefile index a268391..e5dae0e 100644 --- a/arch/mips/lantiq/Makefile +++ b/arch/mips/lantiq/Makefile @@ -4,7 +4,7 @@ # under the terms of the GNU General Public License version 2 as published # by the Free Software Foundation. -obj-y := irq.o setup.o clk.o prom.o +obj-y := irq.o setup.o clk.o prom.o devices.o obj-$(CONFIG_EARLY_PRINTK) += early_printk.o diff --git a/arch/mips/lantiq/devices.c b/arch/mips/lantiq/devices.c new file mode 100644 index 0000000..7b82c34 --- /dev/null +++ b/arch/mips/lantiq/devices.c @@ -0,0 +1,122 @@ +/* + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * Copyright (C) 2010 John Crispin + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +#include "devices.h" + +/* nor flash */ +static struct resource ltq_nor_resource = { + .name = "nor", + .start = LTQ_FLASH_START, + .end = LTQ_FLASH_START + LTQ_FLASH_MAX - 1, + .flags = IORESOURCE_MEM, +}; + +static struct platform_device ltq_nor = { + .name = "ltq_nor", + .resource = <q_nor_resource, + .num_resources = 1, +}; + +void __init ltq_register_nor(struct physmap_flash_data *data) +{ + ltq_nor.dev.platform_data = data; + platform_device_register(<q_nor); +} + +/* watchdog */ +static struct resource ltq_wdt_resource = { + .name = "watchdog", + .start = LTQ_WDT_BASE_ADDR, + .end = LTQ_WDT_BASE_ADDR + LTQ_WDT_SIZE - 1, + .flags = IORESOURCE_MEM, +}; + +void __init ltq_register_wdt(void) +{ + platform_device_register_simple("ltq_wdt", 0, <q_wdt_resource, 1); +} + +/* asc ports */ +static struct resource ltq_asc0_resources[] = { + { + .name = "asc0", + .start = LTQ_ASC0_BASE_ADDR, + .end = LTQ_ASC0_BASE_ADDR + LTQ_ASC_SIZE - 1, + .flags = IORESOURCE_MEM, + }, + IRQ_RES(tx, LTQ_ASC_TIR(0)), + IRQ_RES(rx, LTQ_ASC_RIR(0)), + IRQ_RES(err, LTQ_ASC_EIR(0)), +}; + +static struct resource ltq_asc1_resources[] = { + { + .name = "asc1", + .start = LTQ_ASC1_BASE_ADDR, + .end = LTQ_ASC1_BASE_ADDR + LTQ_ASC_SIZE - 1, + .flags = IORESOURCE_MEM, + }, + IRQ_RES(tx, LTQ_ASC_TIR(1)), + IRQ_RES(rx, LTQ_ASC_RIR(1)), + IRQ_RES(err, LTQ_ASC_EIR(1)), +}; + +void __init ltq_register_asc(int port) +{ + switch (port) { + case 0: + platform_device_register_simple("ltq_asc", 0, + ltq_asc0_resources, ARRAY_SIZE(ltq_asc0_resources)); + break; + case 1: + platform_device_register_simple("ltq_asc", 1, + ltq_asc1_resources, ARRAY_SIZE(ltq_asc1_resources)); + break; + default: + break; + } +} + +#ifdef CONFIG_PCI +/* pci */ +static struct platform_device ltq_pci = { + .name = "ltq_pci", + .num_resources = 0, +}; + +void __init ltq_register_pci(struct ltq_pci_data *data) +{ + ltq_pci.dev.platform_data = data; + platform_device_register(<q_pci); +} +#else +void __init ltq_register_pci(struct ltq_pci_data *data) +{ + pr_err("kernel is compiled without PCI support\n"); +} +#endif diff --git a/arch/mips/lantiq/devices.h b/arch/mips/lantiq/devices.h new file mode 100644 index 0000000..2947bb1 --- /dev/null +++ b/arch/mips/lantiq/devices.h @@ -0,0 +1,23 @@ +/* + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * Copyright (C) 2010 John Crispin + */ + +#ifndef _LTQ_DEVICES_H__ +#define _LTQ_DEVICES_H__ + +#include +#include + +#define IRQ_RES(resname, irq) \ + {.name = #resname, .start = (irq), .flags = IORESOURCE_IRQ} + +extern void ltq_register_nor(struct physmap_flash_data *data); +extern void ltq_register_wdt(void); +extern void ltq_register_asc(int port); +extern void ltq_register_pci(struct ltq_pci_data *data); + +#endif diff --git a/arch/mips/lantiq/xway/Makefile b/arch/mips/lantiq/xway/Makefile index 9c85ff9..74ce438 100644 --- a/arch/mips/lantiq/xway/Makefile +++ b/arch/mips/lantiq/xway/Makefile @@ -1,4 +1,4 @@ -obj-y := pmu.o ebu.o reset.o gpio.o +obj-y := pmu.o ebu.o reset.o gpio.o devices.o obj-$(CONFIG_SOC_XWAY) += clk-xway.o prom-xway.o obj-$(CONFIG_SOC_AMAZON_SE) += clk-ase.o prom-ase.o diff --git a/arch/mips/lantiq/xway/devices.c b/arch/mips/lantiq/xway/devices.c new file mode 100644 index 0000000..a71b3b5 --- /dev/null +++ b/arch/mips/lantiq/xway/devices.c @@ -0,0 +1,98 @@ +/* + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * Copyright (C) 2010 John Crispin + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +#include "devices.h" + +/* gpio */ +static struct resource ltq_gpio_resource[] = { + { + .name = "gpio0", + .start = LTQ_GPIO0_BASE_ADDR, + .end = LTQ_GPIO0_BASE_ADDR + LTQ_GPIO_SIZE - 1, + .flags = IORESOURCE_MEM, + }, { + .name = "gpio1", + .start = LTQ_GPIO1_BASE_ADDR, + .end = LTQ_GPIO1_BASE_ADDR + LTQ_GPIO_SIZE - 1, + .flags = IORESOURCE_MEM, + }, { + .name = "gpio2", + .start = LTQ_GPIO2_BASE_ADDR, + .end = LTQ_GPIO2_BASE_ADDR + LTQ_GPIO_SIZE - 1, + .flags = IORESOURCE_MEM, + } +}; + +void __init ltq_register_gpio(void) +{ + platform_device_register_simple("ltq_gpio", 0, + <q_gpio_resource[0], 1); + platform_device_register_simple("ltq_gpio", 1, + <q_gpio_resource[1], 1); + + /* AR9 and VR9 have an extra gpio block */ + if (ltq_is_ar9() || ltq_is_vr9()) { + platform_device_register_simple("ltq_gpio", 2, + <q_gpio_resource[2], 1); + } +} + +/* serial to parallel conversion */ +static struct resource ltq_stp_resource = { + .name = "stp", + .start = LTQ_STP_BASE_ADDR, + .end = LTQ_STP_BASE_ADDR + LTQ_STP_SIZE - 1, + .flags = IORESOURCE_MEM, +}; + +void __init ltq_register_gpio_stp(void) +{ + platform_device_register_simple("ltq_stp", 0, <q_stp_resource, 1); +} + +/* asc ports - amazon se has its own serial mapping */ +static struct resource ltq_ase_asc_resources[] = { + { + .name = "asc0", + .start = LTQ_ASC1_BASE_ADDR, + .end = LTQ_ASC1_BASE_ADDR + LTQ_ASC_SIZE - 1, + .flags = IORESOURCE_MEM, + }, + IRQ_RES(tx, LTQ_ASC_ASE_TIR), + IRQ_RES(rx, LTQ_ASC_ASE_RIR), + IRQ_RES(err, LTQ_ASC_ASE_EIR), +}; + +void __init ltq_register_ase_asc(void) +{ + platform_device_register_simple("ltq_asc", 0, + ltq_ase_asc_resources, ARRAY_SIZE(ltq_ase_asc_resources)); +} diff --git a/arch/mips/lantiq/xway/devices.h b/arch/mips/lantiq/xway/devices.h new file mode 100644 index 0000000..51f56b5 --- /dev/null +++ b/arch/mips/lantiq/xway/devices.h @@ -0,0 +1,18 @@ +/* + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * Copyright (C) 2010 John Crispin + */ + +#ifndef _LTQ_DEVICES_XWAY_H__ +#define _LTQ_DEVICES_XWAY_H__ + +#include "../devices.h" + +extern void ltq_register_gpio(void); +extern void ltq_register_gpio_stp(void); +extern void ltq_register_ase_asc(void); + +#endif -- cgit v1.1 From a053ac17024561f3a2fd02424b5f92823282b5ad Mon Sep 17 00:00:00 2001 From: John Crispin Date: Wed, 30 Mar 2011 09:27:54 +0200 Subject: MIPS: Lantiq: Add mips_machine support This patch adds support for Gabor's mips_machine patch. Signed-off-by: John Crispin Signed-off-by: Ralph Hempel Cc: Gabor Juhos Cc: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/2251/ Patchwork: https://patchwork.linux-mips.org/patch/2358/ Signed-off-by: Ralf Baechle --- arch/mips/Kconfig | 1 + arch/mips/lantiq/machtypes.h | 18 ++++++++++++++++++ arch/mips/lantiq/prom.h | 1 + arch/mips/lantiq/setup.c | 25 +++++++++++++++++++++++++ arch/mips/lantiq/xway/Makefile | 4 ++-- arch/mips/lantiq/xway/setup-ase.c | 19 +++++++++++++++++++ arch/mips/lantiq/xway/setup-xway.c | 20 ++++++++++++++++++++ 7 files changed, 86 insertions(+), 2 deletions(-) create mode 100644 arch/mips/lantiq/machtypes.h create mode 100644 arch/mips/lantiq/xway/setup-ase.c create mode 100644 arch/mips/lantiq/xway/setup-xway.c diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index b3b4999..2d1cf97 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -228,6 +228,7 @@ config LANTIQ select SWAP_IO_SPACE select BOOT_RAW select HAVE_CLK + select MIPS_MACHINE config LASAT bool "LASAT Networks platforms" diff --git a/arch/mips/lantiq/machtypes.h b/arch/mips/lantiq/machtypes.h new file mode 100644 index 0000000..ffcacfc --- /dev/null +++ b/arch/mips/lantiq/machtypes.h @@ -0,0 +1,18 @@ +/* + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * Copyright (C) 2010 John Crispin + */ + +#ifndef _LANTIQ_MACH_H__ +#define _LANTIQ_MACH_H__ + +#include + +enum lantiq_mach_type { + LTQ_MACH_GENERIC = 0, +}; + +#endif diff --git a/arch/mips/lantiq/prom.h b/arch/mips/lantiq/prom.h index 4165ad1..b4229d9 100644 --- a/arch/mips/lantiq/prom.h +++ b/arch/mips/lantiq/prom.h @@ -20,5 +20,6 @@ struct ltq_soc_info { }; extern void ltq_soc_detect(struct ltq_soc_info *i); +extern void ltq_soc_setup(void); #endif diff --git a/arch/mips/lantiq/setup.c b/arch/mips/lantiq/setup.c index 79a2b0c..9b8af77 100644 --- a/arch/mips/lantiq/setup.c +++ b/arch/mips/lantiq/setup.c @@ -14,6 +14,10 @@ #include +#include "machtypes.h" +#include "devices.h" +#include "prom.h" + void __init plat_mem_setup(void) { /* assume 16M as default incase uboot fails to pass proper ramsize */ @@ -39,3 +43,24 @@ void __init plat_mem_setup(void) memsize *= 1024 * 1024; add_memory_region(0x00000000, memsize, BOOT_MEM_RAM); } + +static int __init +lantiq_setup(void) +{ + ltq_soc_setup(); + mips_machine_setup(); + return 0; +} + +arch_initcall(lantiq_setup); + +static void __init +lantiq_generic_init(void) +{ + /* Nothing to do */ +} + +MIPS_MACHINE(LTQ_MACH_GENERIC, + "Generic", + "Generic Lantiq based board", + lantiq_generic_init); diff --git a/arch/mips/lantiq/xway/Makefile b/arch/mips/lantiq/xway/Makefile index 74ce438..8c06a97 100644 --- a/arch/mips/lantiq/xway/Makefile +++ b/arch/mips/lantiq/xway/Makefile @@ -1,4 +1,4 @@ obj-y := pmu.o ebu.o reset.o gpio.o devices.o -obj-$(CONFIG_SOC_XWAY) += clk-xway.o prom-xway.o -obj-$(CONFIG_SOC_AMAZON_SE) += clk-ase.o prom-ase.o +obj-$(CONFIG_SOC_XWAY) += clk-xway.o prom-xway.o setup-xway.o +obj-$(CONFIG_SOC_AMAZON_SE) += clk-ase.o prom-ase.o setup-ase.o diff --git a/arch/mips/lantiq/xway/setup-ase.c b/arch/mips/lantiq/xway/setup-ase.c new file mode 100644 index 0000000..f6f3267 --- /dev/null +++ b/arch/mips/lantiq/xway/setup-ase.c @@ -0,0 +1,19 @@ +/* + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * Copyright (C) 2011 John Crispin + */ + +#include + +#include "../prom.h" +#include "devices.h" + +void __init ltq_soc_setup(void) +{ + ltq_register_ase_asc(); + ltq_register_gpio(); + ltq_register_wdt(); +} diff --git a/arch/mips/lantiq/xway/setup-xway.c b/arch/mips/lantiq/xway/setup-xway.c new file mode 100644 index 0000000..c292f64 --- /dev/null +++ b/arch/mips/lantiq/xway/setup-xway.c @@ -0,0 +1,20 @@ +/* + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * Copyright (C) 2011 John Crispin + */ + +#include + +#include "../prom.h" +#include "devices.h" + +void __init ltq_soc_setup(void) +{ + ltq_register_asc(0); + ltq_register_asc(1); + ltq_register_gpio(); + ltq_register_wdt(); +} -- cgit v1.1 From 973c32eb7f2d5c45d0e68b0083ead9ee763d9a6f Mon Sep 17 00:00:00 2001 From: John Crispin Date: Wed, 30 Mar 2011 09:27:55 +0200 Subject: MIPS: Lantiq: Add machtypes for lantiq eval kits This patch adds mach specific code for the Lantiq EASY50712/50601 evaluation boards Signed-off-by: John Crispin Signed-off-by: Ralph Hempel Cc: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/2255/ Patchwork: https://patchwork.linux-mips.org/patch/2361/ Signed-off-by: Ralf Baechle --- arch/mips/lantiq/Kconfig | 2 + arch/mips/lantiq/machtypes.h | 2 + arch/mips/lantiq/xway/Kconfig | 23 ++++++++++++ arch/mips/lantiq/xway/Makefile | 3 ++ arch/mips/lantiq/xway/mach-easy50601.c | 57 ++++++++++++++++++++++++++++ arch/mips/lantiq/xway/mach-easy50712.c | 68 ++++++++++++++++++++++++++++++++++ 6 files changed, 155 insertions(+) create mode 100644 arch/mips/lantiq/xway/Kconfig create mode 100644 arch/mips/lantiq/xway/mach-easy50601.c create mode 100644 arch/mips/lantiq/xway/mach-easy50712.c diff --git a/arch/mips/lantiq/Kconfig b/arch/mips/lantiq/Kconfig index 2780461..3fccf21 100644 --- a/arch/mips/lantiq/Kconfig +++ b/arch/mips/lantiq/Kconfig @@ -18,4 +18,6 @@ config SOC_XWAY select HW_HAS_PCI endchoice +source "arch/mips/lantiq/xway/Kconfig" + endif diff --git a/arch/mips/lantiq/machtypes.h b/arch/mips/lantiq/machtypes.h index ffcacfc..7e01b8c 100644 --- a/arch/mips/lantiq/machtypes.h +++ b/arch/mips/lantiq/machtypes.h @@ -13,6 +13,8 @@ enum lantiq_mach_type { LTQ_MACH_GENERIC = 0, + LTQ_MACH_EASY50712, /* Danube evaluation board */ + LTQ_MACH_EASY50601, /* Amazon SE evaluation board */ }; #endif diff --git a/arch/mips/lantiq/xway/Kconfig b/arch/mips/lantiq/xway/Kconfig new file mode 100644 index 0000000..2b857de --- /dev/null +++ b/arch/mips/lantiq/xway/Kconfig @@ -0,0 +1,23 @@ +if SOC_XWAY + +menu "MIPS Machine" + +config LANTIQ_MACH_EASY50712 + bool "Easy50712 - Danube" + default y + +endmenu + +endif + +if SOC_AMAZON_SE + +menu "MIPS Machine" + +config LANTIQ_MACH_EASY50601 + bool "Easy50601 - Amazon SE" + default y + +endmenu + +endif diff --git a/arch/mips/lantiq/xway/Makefile b/arch/mips/lantiq/xway/Makefile index 8c06a97..b1d3640 100644 --- a/arch/mips/lantiq/xway/Makefile +++ b/arch/mips/lantiq/xway/Makefile @@ -2,3 +2,6 @@ obj-y := pmu.o ebu.o reset.o gpio.o devices.o obj-$(CONFIG_SOC_XWAY) += clk-xway.o prom-xway.o setup-xway.o obj-$(CONFIG_SOC_AMAZON_SE) += clk-ase.o prom-ase.o setup-ase.o + +obj-$(CONFIG_LANTIQ_MACH_EASY50712) += mach-easy50712.o +obj-$(CONFIG_LANTIQ_MACH_EASY50601) += mach-easy50601.o diff --git a/arch/mips/lantiq/xway/mach-easy50601.c b/arch/mips/lantiq/xway/mach-easy50601.c new file mode 100644 index 0000000..d5aaf63 --- /dev/null +++ b/arch/mips/lantiq/xway/mach-easy50601.c @@ -0,0 +1,57 @@ +/* + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * Copyright (C) 2010 John Crispin + */ + +#include +#include +#include +#include +#include +#include + +#include + +#include "../machtypes.h" +#include "devices.h" + +static struct mtd_partition easy50601_partitions[] = { + { + .name = "uboot", + .offset = 0x0, + .size = 0x10000, + }, + { + .name = "uboot_env", + .offset = 0x10000, + .size = 0x10000, + }, + { + .name = "linux", + .offset = 0x20000, + .size = 0xE0000, + }, + { + .name = "rootfs", + .offset = 0x100000, + .size = 0x300000, + }, +}; + +static struct physmap_flash_data easy50601_flash_data = { + .nr_parts = ARRAY_SIZE(easy50601_partitions), + .parts = easy50601_partitions, +}; + +static void __init easy50601_init(void) +{ + ltq_register_nor(&easy50601_flash_data); +} + +MIPS_MACHINE(LTQ_MACH_EASY50601, + "EASY50601", + "EASY50601 Eval Board", + easy50601_init); diff --git a/arch/mips/lantiq/xway/mach-easy50712.c b/arch/mips/lantiq/xway/mach-easy50712.c new file mode 100644 index 0000000..e5e7e09 --- /dev/null +++ b/arch/mips/lantiq/xway/mach-easy50712.c @@ -0,0 +1,68 @@ +/* + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * Copyright (C) 2010 John Crispin + */ + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "../machtypes.h" +#include "devices.h" + +static struct mtd_partition easy50712_partitions[] = { + { + .name = "uboot", + .offset = 0x0, + .size = 0x10000, + }, + { + .name = "uboot_env", + .offset = 0x10000, + .size = 0x10000, + }, + { + .name = "linux", + .offset = 0x20000, + .size = 0xe0000, + }, + { + .name = "rootfs", + .offset = 0x100000, + .size = 0x300000, + }, +}; + +static struct physmap_flash_data easy50712_flash_data = { + .nr_parts = ARRAY_SIZE(easy50712_partitions), + .parts = easy50712_partitions, +}; + +static struct ltq_pci_data ltq_pci_data = { + .clock = PCI_CLOCK_INT, + .gpio = PCI_GNT1 | PCI_REQ1, + .irq = { + [14] = INT_NUM_IM0_IRL0 + 22, + }, +}; + +static void __init easy50712_init(void) +{ + ltq_register_gpio_stp(); + ltq_register_nor(&easy50712_flash_data); + ltq_register_pci(<q_pci_data); +} + +MIPS_MACHINE(LTQ_MACH_EASY50712, + "EASY50712", + "EASY50712 Eval Board", + easy50712_init); -- cgit v1.1 From 935c500c377d8e414bbe08e0e169f6c85d2a4273 Mon Sep 17 00:00:00 2001 From: John Crispin Date: Wed, 30 Mar 2011 09:27:56 +0200 Subject: MIPS: Lantiq: Add more gpio drivers The XWAY family allows to extend the number of gpios by using shift registers or latches. This patch adds the 2 drivers needed for this. The extended gpios are output only. [ralf@linux-mips.org: Fixed ltq_stp_probe section() attributes.] Signed-off-by: John Crispin Signed-off-by: Ralph Hempel Cc: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/2258/ Signed-off-by: Ralf Baechle --- arch/mips/lantiq/xway/Makefile | 2 +- arch/mips/lantiq/xway/gpio_ebu.c | 126 +++++++++++++++++++++++++++++++ arch/mips/lantiq/xway/gpio_stp.c | 157 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 284 insertions(+), 1 deletion(-) create mode 100644 arch/mips/lantiq/xway/gpio_ebu.c create mode 100644 arch/mips/lantiq/xway/gpio_stp.c diff --git a/arch/mips/lantiq/xway/Makefile b/arch/mips/lantiq/xway/Makefile index b1d3640..6b5e07e 100644 --- a/arch/mips/lantiq/xway/Makefile +++ b/arch/mips/lantiq/xway/Makefile @@ -1,4 +1,4 @@ -obj-y := pmu.o ebu.o reset.o gpio.o devices.o +obj-y := pmu.o ebu.o reset.o gpio.o gpio_stp.o gpio_ebu.o devices.o obj-$(CONFIG_SOC_XWAY) += clk-xway.o prom-xway.o setup-xway.o obj-$(CONFIG_SOC_AMAZON_SE) += clk-ase.o prom-ase.o setup-ase.o diff --git a/arch/mips/lantiq/xway/gpio_ebu.c b/arch/mips/lantiq/xway/gpio_ebu.c new file mode 100644 index 0000000..a479355 --- /dev/null +++ b/arch/mips/lantiq/xway/gpio_ebu.c @@ -0,0 +1,126 @@ +/* + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * Copyright (C) 2010 John Crispin + */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +/* + * By attaching hardware latches to the EBU it is possible to create output + * only gpios. This driver configures a special memory address, which when + * written to outputs 16 bit to the latches. + */ + +#define LTQ_EBU_BUSCON 0x1e7ff /* 16 bit access, slowest timing */ +#define LTQ_EBU_WP 0x80000000 /* write protect bit */ + +/* we keep a shadow value of the last value written to the ebu */ +static int ltq_ebu_gpio_shadow = 0x0; +static void __iomem *ltq_ebu_gpio_membase; + +static void ltq_ebu_apply(void) +{ + unsigned long flags; + + spin_lock_irqsave(&ebu_lock, flags); + ltq_ebu_w32(LTQ_EBU_BUSCON, LTQ_EBU_BUSCON1); + *((__u16 *)ltq_ebu_gpio_membase) = ltq_ebu_gpio_shadow; + ltq_ebu_w32(LTQ_EBU_BUSCON | LTQ_EBU_WP, LTQ_EBU_BUSCON1); + spin_unlock_irqrestore(&ebu_lock, flags); +} + +static void ltq_ebu_set(struct gpio_chip *chip, unsigned offset, int value) +{ + if (value) + ltq_ebu_gpio_shadow |= (1 << offset); + else + ltq_ebu_gpio_shadow &= ~(1 << offset); + ltq_ebu_apply(); +} + +static int ltq_ebu_direction_output(struct gpio_chip *chip, unsigned offset, + int value) +{ + ltq_ebu_set(chip, offset, value); + + return 0; +} + +static struct gpio_chip ltq_ebu_chip = { + .label = "ltq_ebu", + .direction_output = ltq_ebu_direction_output, + .set = ltq_ebu_set, + .base = 72, + .ngpio = 16, + .can_sleep = 1, + .owner = THIS_MODULE, +}; + +static int ltq_ebu_probe(struct platform_device *pdev) +{ + int ret = 0; + struct resource *res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + + if (!res) { + dev_err(&pdev->dev, "failed to get memory resource\n"); + return -ENOENT; + } + + res = devm_request_mem_region(&pdev->dev, res->start, + resource_size(res), dev_name(&pdev->dev)); + if (!res) { + dev_err(&pdev->dev, "failed to request memory resource\n"); + return -EBUSY; + } + + ltq_ebu_gpio_membase = devm_ioremap_nocache(&pdev->dev, res->start, + resource_size(res)); + if (!ltq_ebu_gpio_membase) { + dev_err(&pdev->dev, "Failed to ioremap mem region\n"); + return -ENOMEM; + } + + /* grab the default shadow value passed form the platform code */ + ltq_ebu_gpio_shadow = (unsigned int) pdev->dev.platform_data; + + /* tell the ebu controller which memory address we will be using */ + ltq_ebu_w32(pdev->resource->start | 0x1, LTQ_EBU_ADDRSEL1); + + /* write protect the region */ + ltq_ebu_w32(LTQ_EBU_BUSCON | LTQ_EBU_WP, LTQ_EBU_BUSCON1); + + ret = gpiochip_add(<q_ebu_chip); + if (!ret) + ltq_ebu_apply(); + return ret; +} + +static struct platform_driver ltq_ebu_driver = { + .probe = ltq_ebu_probe, + .driver = { + .name = "ltq_ebu", + .owner = THIS_MODULE, + }, +}; + +static int __init ltq_ebu_init(void) +{ + int ret = platform_driver_register(<q_ebu_driver); + + if (ret) + pr_info("ltq_ebu : Error registering platfom driver!"); + return ret; +} + +postcore_initcall(ltq_ebu_init); diff --git a/arch/mips/lantiq/xway/gpio_stp.c b/arch/mips/lantiq/xway/gpio_stp.c new file mode 100644 index 0000000..67d59d6 --- /dev/null +++ b/arch/mips/lantiq/xway/gpio_stp.c @@ -0,0 +1,157 @@ +/* + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * Copyright (C) 2007 John Crispin + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define LTQ_STP_CON0 0x00 +#define LTQ_STP_CON1 0x04 +#define LTQ_STP_CPU0 0x08 +#define LTQ_STP_CPU1 0x0C +#define LTQ_STP_AR 0x10 + +#define LTQ_STP_CON_SWU (1 << 31) +#define LTQ_STP_2HZ 0 +#define LTQ_STP_4HZ (1 << 23) +#define LTQ_STP_8HZ (2 << 23) +#define LTQ_STP_10HZ (3 << 23) +#define LTQ_STP_SPEED_MASK (0xf << 23) +#define LTQ_STP_UPD_FPI (1 << 31) +#define LTQ_STP_UPD_MASK (3 << 30) +#define LTQ_STP_ADSL_SRC (3 << 24) + +#define LTQ_STP_GROUP0 (1 << 0) + +#define LTQ_STP_RISING 0 +#define LTQ_STP_FALLING (1 << 26) +#define LTQ_STP_EDGE_MASK (1 << 26) + +#define ltq_stp_r32(reg) __raw_readl(ltq_stp_membase + reg) +#define ltq_stp_w32(val, reg) __raw_writel(val, ltq_stp_membase + reg) +#define ltq_stp_w32_mask(clear, set, reg) \ + ltq_w32((ltq_r32(ltq_stp_membase + reg) & ~(clear)) | (set), \ + ltq_stp_membase + (reg)) + +static int ltq_stp_shadow = 0xffff; +static void __iomem *ltq_stp_membase; + +static void ltq_stp_set(struct gpio_chip *chip, unsigned offset, int value) +{ + if (value) + ltq_stp_shadow |= (1 << offset); + else + ltq_stp_shadow &= ~(1 << offset); + ltq_stp_w32(ltq_stp_shadow, LTQ_STP_CPU0); +} + +static int ltq_stp_direction_output(struct gpio_chip *chip, unsigned offset, + int value) +{ + ltq_stp_set(chip, offset, value); + + return 0; +} + +static struct gpio_chip ltq_stp_chip = { + .label = "ltq_stp", + .direction_output = ltq_stp_direction_output, + .set = ltq_stp_set, + .base = 48, + .ngpio = 24, + .can_sleep = 1, + .owner = THIS_MODULE, +}; + +static int ltq_stp_hw_init(void) +{ + /* the 3 pins used to control the external stp */ + ltq_gpio_request(4, 1, 0, 1, "stp-st"); + ltq_gpio_request(5, 1, 0, 1, "stp-d"); + ltq_gpio_request(6, 1, 0, 1, "stp-sh"); + + /* sane defaults */ + ltq_stp_w32(0, LTQ_STP_AR); + ltq_stp_w32(0, LTQ_STP_CPU0); + ltq_stp_w32(0, LTQ_STP_CPU1); + ltq_stp_w32(LTQ_STP_CON_SWU, LTQ_STP_CON0); + ltq_stp_w32(0, LTQ_STP_CON1); + + /* rising or falling edge */ + ltq_stp_w32_mask(LTQ_STP_EDGE_MASK, LTQ_STP_FALLING, LTQ_STP_CON0); + + /* per default stp 15-0 are set */ + ltq_stp_w32_mask(0, LTQ_STP_GROUP0, LTQ_STP_CON1); + + /* stp are update periodically by the FPI bus */ + ltq_stp_w32_mask(LTQ_STP_UPD_MASK, LTQ_STP_UPD_FPI, LTQ_STP_CON1); + + /* set stp update speed */ + ltq_stp_w32_mask(LTQ_STP_SPEED_MASK, LTQ_STP_8HZ, LTQ_STP_CON1); + + /* tell the hardware that pin (led) 0 and 1 are controlled + * by the dsl arc + */ + ltq_stp_w32_mask(0, LTQ_STP_ADSL_SRC, LTQ_STP_CON0); + + ltq_pmu_enable(PMU_LED); + return 0; +} + +static int __devinit ltq_stp_probe(struct platform_device *pdev) +{ + struct resource *res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + int ret = 0; + + if (!res) + return -ENOENT; + res = devm_request_mem_region(&pdev->dev, res->start, + resource_size(res), dev_name(&pdev->dev)); + if (!res) { + dev_err(&pdev->dev, "failed to request STP memory\n"); + return -EBUSY; + } + ltq_stp_membase = devm_ioremap_nocache(&pdev->dev, res->start, + resource_size(res)); + if (!ltq_stp_membase) { + dev_err(&pdev->dev, "failed to remap STP memory\n"); + return -ENOMEM; + } + ret = gpiochip_add(<q_stp_chip); + if (!ret) + ret = ltq_stp_hw_init(); + + return ret; +} + +static struct platform_driver ltq_stp_driver = { + .probe = ltq_stp_probe, + .driver = { + .name = "ltq_stp", + .owner = THIS_MODULE, + }, +}; + +int __init ltq_stp_init(void) +{ + int ret = platform_driver_register(<q_stp_driver); + + if (ret) + pr_info("ltq_stp: error registering platfom driver"); + return ret; +} + +postcore_initcall(ltq_stp_init); -- cgit v1.1 From 2f0fc4159a6abc20b13569522c545150b99485cf Mon Sep 17 00:00:00 2001 From: John Crispin Date: Tue, 5 Apr 2011 14:10:57 +0200 Subject: SERIAL: Lantiq: Add driver for MIPS Lantiq SOCs. Signed-off-by: John Crispin Signed-off-by: Ralph Hempel Signed-off-by: Felix Fietkau Cc: alan@lxorguk.ukuu.org.uk Cc: linux-mips@linux-mips.org Cc: linux-serial@vger.kernel.org Patchwork: https://patchwork.linux-mips.org/patch/2269/ Acked-by: Alan Cox Signed-off-by: Ralf Baechle --- drivers/tty/serial/Kconfig | 8 + drivers/tty/serial/Makefile | 1 + drivers/tty/serial/lantiq.c | 756 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 765 insertions(+) create mode 100644 drivers/tty/serial/lantiq.c diff --git a/drivers/tty/serial/Kconfig b/drivers/tty/serial/Kconfig index 80484af..b1f0f83 100644 --- a/drivers/tty/serial/Kconfig +++ b/drivers/tty/serial/Kconfig @@ -1391,6 +1391,14 @@ config SERIAL_OF_PLATFORM_NWPSERIAL_CONSOLE help Support for Console on the NWP serial ports. +config SERIAL_LANTIQ + bool "Lantiq serial driver" + depends on LANTIQ + select SERIAL_CORE + select SERIAL_CORE_CONSOLE + help + Support for console and UART on Lantiq SoCs. + config SERIAL_QE tristate "Freescale QUICC Engine serial port support" depends on QUICC_ENGINE diff --git a/drivers/tty/serial/Makefile b/drivers/tty/serial/Makefile index fee0690..3527604 100644 --- a/drivers/tty/serial/Makefile +++ b/drivers/tty/serial/Makefile @@ -94,3 +94,4 @@ obj-$(CONFIG_SERIAL_IFX6X60) += ifx6x60.o obj-$(CONFIG_SERIAL_PCH_UART) += pch_uart.o obj-$(CONFIG_SERIAL_MSM_SMD) += msm_smd_tty.o obj-$(CONFIG_SERIAL_MXS_AUART) += mxs-auart.o +obj-$(CONFIG_SERIAL_LANTIQ) += lantiq.o diff --git a/drivers/tty/serial/lantiq.c b/drivers/tty/serial/lantiq.c new file mode 100644 index 0000000..58cf279e --- /dev/null +++ b/drivers/tty/serial/lantiq.c @@ -0,0 +1,756 @@ +/* + * Based on drivers/char/serial.c, by Linus Torvalds, Theodore Ts'o. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Copyright (C) 2004 Infineon IFAP DC COM CPE + * Copyright (C) 2007 Felix Fietkau + * Copyright (C) 2007 John Crispin + * Copyright (C) 2010 Thomas Langer, + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define PORT_LTQ_ASC 111 +#define MAXPORTS 2 +#define UART_DUMMY_UER_RX 1 +#define DRVNAME "ltq_asc" +#ifdef __BIG_ENDIAN +#define LTQ_ASC_TBUF (0x0020 + 3) +#define LTQ_ASC_RBUF (0x0024 + 3) +#else +#define LTQ_ASC_TBUF 0x0020 +#define LTQ_ASC_RBUF 0x0024 +#endif +#define LTQ_ASC_FSTAT 0x0048 +#define LTQ_ASC_WHBSTATE 0x0018 +#define LTQ_ASC_STATE 0x0014 +#define LTQ_ASC_IRNCR 0x00F8 +#define LTQ_ASC_CLC 0x0000 +#define LTQ_ASC_ID 0x0008 +#define LTQ_ASC_PISEL 0x0004 +#define LTQ_ASC_TXFCON 0x0044 +#define LTQ_ASC_RXFCON 0x0040 +#define LTQ_ASC_CON 0x0010 +#define LTQ_ASC_BG 0x0050 +#define LTQ_ASC_IRNREN 0x00F4 + +#define ASC_IRNREN_TX 0x1 +#define ASC_IRNREN_RX 0x2 +#define ASC_IRNREN_ERR 0x4 +#define ASC_IRNREN_TX_BUF 0x8 +#define ASC_IRNCR_TIR 0x1 +#define ASC_IRNCR_RIR 0x2 +#define ASC_IRNCR_EIR 0x4 + +#define ASCOPT_CSIZE 0x3 +#define TXFIFO_FL 1 +#define RXFIFO_FL 1 +#define ASCCLC_DISS 0x2 +#define ASCCLC_RMCMASK 0x0000FF00 +#define ASCCLC_RMCOFFSET 8 +#define ASCCON_M_8ASYNC 0x0 +#define ASCCON_M_7ASYNC 0x2 +#define ASCCON_ODD 0x00000020 +#define ASCCON_STP 0x00000080 +#define ASCCON_BRS 0x00000100 +#define ASCCON_FDE 0x00000200 +#define ASCCON_R 0x00008000 +#define ASCCON_FEN 0x00020000 +#define ASCCON_ROEN 0x00080000 +#define ASCCON_TOEN 0x00100000 +#define ASCSTATE_PE 0x00010000 +#define ASCSTATE_FE 0x00020000 +#define ASCSTATE_ROE 0x00080000 +#define ASCSTATE_ANY (ASCSTATE_ROE|ASCSTATE_PE|ASCSTATE_FE) +#define ASCWHBSTATE_CLRREN 0x00000001 +#define ASCWHBSTATE_SETREN 0x00000002 +#define ASCWHBSTATE_CLRPE 0x00000004 +#define ASCWHBSTATE_CLRFE 0x00000008 +#define ASCWHBSTATE_CLRROE 0x00000020 +#define ASCTXFCON_TXFEN 0x0001 +#define ASCTXFCON_TXFFLU 0x0002 +#define ASCTXFCON_TXFITLMASK 0x3F00 +#define ASCTXFCON_TXFITLOFF 8 +#define ASCRXFCON_RXFEN 0x0001 +#define ASCRXFCON_RXFFLU 0x0002 +#define ASCRXFCON_RXFITLMASK 0x3F00 +#define ASCRXFCON_RXFITLOFF 8 +#define ASCFSTAT_RXFFLMASK 0x003F +#define ASCFSTAT_TXFFLMASK 0x3F00 +#define ASCFSTAT_TXFREEMASK 0x3F000000 +#define ASCFSTAT_TXFREEOFF 24 + +static void lqasc_tx_chars(struct uart_port *port); +static struct ltq_uart_port *lqasc_port[MAXPORTS]; +static struct uart_driver lqasc_reg; +static DEFINE_SPINLOCK(ltq_asc_lock); + +struct ltq_uart_port { + struct uart_port port; + struct clk *clk; + unsigned int tx_irq; + unsigned int rx_irq; + unsigned int err_irq; +}; + +static inline struct +ltq_uart_port *to_ltq_uart_port(struct uart_port *port) +{ + return container_of(port, struct ltq_uart_port, port); +} + +static void +lqasc_stop_tx(struct uart_port *port) +{ + return; +} + +static void +lqasc_start_tx(struct uart_port *port) +{ + unsigned long flags; + spin_lock_irqsave(<q_asc_lock, flags); + lqasc_tx_chars(port); + spin_unlock_irqrestore(<q_asc_lock, flags); + return; +} + +static void +lqasc_stop_rx(struct uart_port *port) +{ + ltq_w32(ASCWHBSTATE_CLRREN, port->membase + LTQ_ASC_WHBSTATE); +} + +static void +lqasc_enable_ms(struct uart_port *port) +{ +} + +static int +lqasc_rx_chars(struct uart_port *port) +{ + struct tty_struct *tty = tty_port_tty_get(&port->state->port); + unsigned int ch = 0, rsr = 0, fifocnt; + + if (!tty) { + dev_dbg(port->dev, "%s:tty is busy now", __func__); + return -EBUSY; + } + fifocnt = + ltq_r32(port->membase + LTQ_ASC_FSTAT) & ASCFSTAT_RXFFLMASK; + while (fifocnt--) { + u8 flag = TTY_NORMAL; + ch = ltq_r8(port->membase + LTQ_ASC_RBUF); + rsr = (ltq_r32(port->membase + LTQ_ASC_STATE) + & ASCSTATE_ANY) | UART_DUMMY_UER_RX; + tty_flip_buffer_push(tty); + port->icount.rx++; + + /* + * Note that the error handling code is + * out of the main execution path + */ + if (rsr & ASCSTATE_ANY) { + if (rsr & ASCSTATE_PE) { + port->icount.parity++; + ltq_w32_mask(0, ASCWHBSTATE_CLRPE, + port->membase + LTQ_ASC_WHBSTATE); + } else if (rsr & ASCSTATE_FE) { + port->icount.frame++; + ltq_w32_mask(0, ASCWHBSTATE_CLRFE, + port->membase + LTQ_ASC_WHBSTATE); + } + if (rsr & ASCSTATE_ROE) { + port->icount.overrun++; + ltq_w32_mask(0, ASCWHBSTATE_CLRROE, + port->membase + LTQ_ASC_WHBSTATE); + } + + rsr &= port->read_status_mask; + + if (rsr & ASCSTATE_PE) + flag = TTY_PARITY; + else if (rsr & ASCSTATE_FE) + flag = TTY_FRAME; + } + + if ((rsr & port->ignore_status_mask) == 0) + tty_insert_flip_char(tty, ch, flag); + + if (rsr & ASCSTATE_ROE) + /* + * Overrun is special, since it's reported + * immediately, and doesn't affect the current + * character + */ + tty_insert_flip_char(tty, 0, TTY_OVERRUN); + } + if (ch != 0) + tty_flip_buffer_push(tty); + tty_kref_put(tty); + return 0; +} + +static void +lqasc_tx_chars(struct uart_port *port) +{ + struct circ_buf *xmit = &port->state->xmit; + if (uart_tx_stopped(port)) { + lqasc_stop_tx(port); + return; + } + + while (((ltq_r32(port->membase + LTQ_ASC_FSTAT) & + ASCFSTAT_TXFREEMASK) >> ASCFSTAT_TXFREEOFF) != 0) { + if (port->x_char) { + ltq_w8(port->x_char, port->membase + LTQ_ASC_TBUF); + port->icount.tx++; + port->x_char = 0; + continue; + } + + if (uart_circ_empty(xmit)) + break; + + ltq_w8(port->state->xmit.buf[port->state->xmit.tail], + port->membase + LTQ_ASC_TBUF); + xmit->tail = (xmit->tail + 1) & (UART_XMIT_SIZE - 1); + port->icount.tx++; + } + + if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS) + uart_write_wakeup(port); +} + +static irqreturn_t +lqasc_tx_int(int irq, void *_port) +{ + unsigned long flags; + struct uart_port *port = (struct uart_port *)_port; + spin_lock_irqsave(<q_asc_lock, flags); + ltq_w32(ASC_IRNCR_TIR, port->membase + LTQ_ASC_IRNCR); + spin_unlock_irqrestore(<q_asc_lock, flags); + lqasc_start_tx(port); + return IRQ_HANDLED; +} + +static irqreturn_t +lqasc_err_int(int irq, void *_port) +{ + unsigned long flags; + struct uart_port *port = (struct uart_port *)_port; + spin_lock_irqsave(<q_asc_lock, flags); + /* clear any pending interrupts */ + ltq_w32_mask(0, ASCWHBSTATE_CLRPE | ASCWHBSTATE_CLRFE | + ASCWHBSTATE_CLRROE, port->membase + LTQ_ASC_WHBSTATE); + spin_unlock_irqrestore(<q_asc_lock, flags); + return IRQ_HANDLED; +} + +static irqreturn_t +lqasc_rx_int(int irq, void *_port) +{ + unsigned long flags; + struct uart_port *port = (struct uart_port *)_port; + spin_lock_irqsave(<q_asc_lock, flags); + ltq_w32(ASC_IRNCR_RIR, port->membase + LTQ_ASC_IRNCR); + lqasc_rx_chars(port); + spin_unlock_irqrestore(<q_asc_lock, flags); + return IRQ_HANDLED; +} + +static unsigned int +lqasc_tx_empty(struct uart_port *port) +{ + int status; + status = ltq_r32(port->membase + LTQ_ASC_FSTAT) & ASCFSTAT_TXFFLMASK; + return status ? 0 : TIOCSER_TEMT; +} + +static unsigned int +lqasc_get_mctrl(struct uart_port *port) +{ + return TIOCM_CTS | TIOCM_CAR | TIOCM_DSR; +} + +static void +lqasc_set_mctrl(struct uart_port *port, u_int mctrl) +{ +} + +static void +lqasc_break_ctl(struct uart_port *port, int break_state) +{ +} + +static int +lqasc_startup(struct uart_port *port) +{ + struct ltq_uart_port *ltq_port = to_ltq_uart_port(port); + int retval; + + port->uartclk = clk_get_rate(ltq_port->clk); + + ltq_w32_mask(ASCCLC_DISS | ASCCLC_RMCMASK, (1 << ASCCLC_RMCOFFSET), + port->membase + LTQ_ASC_CLC); + + ltq_w32(0, port->membase + LTQ_ASC_PISEL); + ltq_w32( + ((TXFIFO_FL << ASCTXFCON_TXFITLOFF) & ASCTXFCON_TXFITLMASK) | + ASCTXFCON_TXFEN | ASCTXFCON_TXFFLU, + port->membase + LTQ_ASC_TXFCON); + ltq_w32( + ((RXFIFO_FL << ASCRXFCON_RXFITLOFF) & ASCRXFCON_RXFITLMASK) + | ASCRXFCON_RXFEN | ASCRXFCON_RXFFLU, + port->membase + LTQ_ASC_RXFCON); + /* make sure other settings are written to hardware before + * setting enable bits + */ + wmb(); + ltq_w32_mask(0, ASCCON_M_8ASYNC | ASCCON_FEN | ASCCON_TOEN | + ASCCON_ROEN, port->membase + LTQ_ASC_CON); + + retval = request_irq(ltq_port->tx_irq, lqasc_tx_int, + IRQF_DISABLED, "asc_tx", port); + if (retval) { + pr_err("failed to request lqasc_tx_int\n"); + return retval; + } + + retval = request_irq(ltq_port->rx_irq, lqasc_rx_int, + IRQF_DISABLED, "asc_rx", port); + if (retval) { + pr_err("failed to request lqasc_rx_int\n"); + goto err1; + } + + retval = request_irq(ltq_port->err_irq, lqasc_err_int, + IRQF_DISABLED, "asc_err", port); + if (retval) { + pr_err("failed to request lqasc_err_int\n"); + goto err2; + } + + ltq_w32(ASC_IRNREN_RX | ASC_IRNREN_ERR | ASC_IRNREN_TX, + port->membase + LTQ_ASC_IRNREN); + return 0; + +err2: + free_irq(ltq_port->rx_irq, port); +err1: + free_irq(ltq_port->tx_irq, port); + return retval; +} + +static void +lqasc_shutdown(struct uart_port *port) +{ + struct ltq_uart_port *ltq_port = to_ltq_uart_port(port); + free_irq(ltq_port->tx_irq, port); + free_irq(ltq_port->rx_irq, port); + free_irq(ltq_port->err_irq, port); + + ltq_w32(0, port->membase + LTQ_ASC_CON); + ltq_w32_mask(ASCRXFCON_RXFEN, ASCRXFCON_RXFFLU, + port->membase + LTQ_ASC_RXFCON); + ltq_w32_mask(ASCTXFCON_TXFEN, ASCTXFCON_TXFFLU, + port->membase + LTQ_ASC_TXFCON); +} + +static void +lqasc_set_termios(struct uart_port *port, + struct ktermios *new, struct ktermios *old) +{ + unsigned int cflag; + unsigned int iflag; + unsigned int divisor; + unsigned int baud; + unsigned int con = 0; + unsigned long flags; + + cflag = new->c_cflag; + iflag = new->c_iflag; + + switch (cflag & CSIZE) { + case CS7: + con = ASCCON_M_7ASYNC; + break; + + case CS5: + case CS6: + default: + new->c_cflag &= ~ CSIZE; + new->c_cflag |= CS8; + con = ASCCON_M_8ASYNC; + break; + } + + cflag &= ~CMSPAR; /* Mark/Space parity is not supported */ + + if (cflag & CSTOPB) + con |= ASCCON_STP; + + if (cflag & PARENB) { + if (!(cflag & PARODD)) + con &= ~ASCCON_ODD; + else + con |= ASCCON_ODD; + } + + port->read_status_mask = ASCSTATE_ROE; + if (iflag & INPCK) + port->read_status_mask |= ASCSTATE_FE | ASCSTATE_PE; + + port->ignore_status_mask = 0; + if (iflag & IGNPAR) + port->ignore_status_mask |= ASCSTATE_FE | ASCSTATE_PE; + + if (iflag & IGNBRK) { + /* + * If we're ignoring parity and break indicators, + * ignore overruns too (for real raw support). + */ + if (iflag & IGNPAR) + port->ignore_status_mask |= ASCSTATE_ROE; + } + + if ((cflag & CREAD) == 0) + port->ignore_status_mask |= UART_DUMMY_UER_RX; + + /* set error signals - framing, parity and overrun, enable receiver */ + con |= ASCCON_FEN | ASCCON_TOEN | ASCCON_ROEN; + + spin_lock_irqsave(<q_asc_lock, flags); + + /* set up CON */ + ltq_w32_mask(0, con, port->membase + LTQ_ASC_CON); + + /* Set baud rate - take a divider of 2 into account */ + baud = uart_get_baud_rate(port, new, old, 0, port->uartclk / 16); + divisor = uart_get_divisor(port, baud); + divisor = divisor / 2 - 1; + + /* disable the baudrate generator */ + ltq_w32_mask(ASCCON_R, 0, port->membase + LTQ_ASC_CON); + + /* make sure the fractional divider is off */ + ltq_w32_mask(ASCCON_FDE, 0, port->membase + LTQ_ASC_CON); + + /* set up to use divisor of 2 */ + ltq_w32_mask(ASCCON_BRS, 0, port->membase + LTQ_ASC_CON); + + /* now we can write the new baudrate into the register */ + ltq_w32(divisor, port->membase + LTQ_ASC_BG); + + /* turn the baudrate generator back on */ + ltq_w32_mask(0, ASCCON_R, port->membase + LTQ_ASC_CON); + + /* enable rx */ + ltq_w32(ASCWHBSTATE_SETREN, port->membase + LTQ_ASC_WHBSTATE); + + spin_unlock_irqrestore(<q_asc_lock, flags); + + /* Don't rewrite B0 */ + if (tty_termios_baud_rate(new)) + tty_termios_encode_baud_rate(new, baud, baud); +} + +static const char* +lqasc_type(struct uart_port *port) +{ + if (port->type == PORT_LTQ_ASC) + return DRVNAME; + else + return NULL; +} + +static void +lqasc_release_port(struct uart_port *port) +{ + if (port->flags & UPF_IOREMAP) { + iounmap(port->membase); + port->membase = NULL; + } +} + +static int +lqasc_request_port(struct uart_port *port) +{ + struct platform_device *pdev = to_platform_device(port->dev); + struct resource *res; + int size; + + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (!res) { + dev_err(&pdev->dev, "cannot obtain I/O memory region"); + return -ENODEV; + } + size = resource_size(res); + + res = devm_request_mem_region(&pdev->dev, res->start, + size, dev_name(&pdev->dev)); + if (!res) { + dev_err(&pdev->dev, "cannot request I/O memory region"); + return -EBUSY; + } + + if (port->flags & UPF_IOREMAP) { + port->membase = devm_ioremap_nocache(&pdev->dev, + port->mapbase, size); + if (port->membase == NULL) + return -ENOMEM; + } + return 0; +} + +static void +lqasc_config_port(struct uart_port *port, int flags) +{ + if (flags & UART_CONFIG_TYPE) { + port->type = PORT_LTQ_ASC; + lqasc_request_port(port); + } +} + +static int +lqasc_verify_port(struct uart_port *port, + struct serial_struct *ser) +{ + int ret = 0; + if (ser->type != PORT_UNKNOWN && ser->type != PORT_LTQ_ASC) + ret = -EINVAL; + if (ser->irq < 0 || ser->irq >= NR_IRQS) + ret = -EINVAL; + if (ser->baud_base < 9600) + ret = -EINVAL; + return ret; +} + +static struct uart_ops lqasc_pops = { + .tx_empty = lqasc_tx_empty, + .set_mctrl = lqasc_set_mctrl, + .get_mctrl = lqasc_get_mctrl, + .stop_tx = lqasc_stop_tx, + .start_tx = lqasc_start_tx, + .stop_rx = lqasc_stop_rx, + .enable_ms = lqasc_enable_ms, + .break_ctl = lqasc_break_ctl, + .startup = lqasc_startup, + .shutdown = lqasc_shutdown, + .set_termios = lqasc_set_termios, + .type = lqasc_type, + .release_port = lqasc_release_port, + .request_port = lqasc_request_port, + .config_port = lqasc_config_port, + .verify_port = lqasc_verify_port, +}; + +static void +lqasc_console_putchar(struct uart_port *port, int ch) +{ + int fifofree; + + if (!port->membase) + return; + + do { + fifofree = (ltq_r32(port->membase + LTQ_ASC_FSTAT) + & ASCFSTAT_TXFREEMASK) >> ASCFSTAT_TXFREEOFF; + } while (fifofree == 0); + ltq_w8(ch, port->membase + LTQ_ASC_TBUF); +} + + +static void +lqasc_console_write(struct console *co, const char *s, u_int count) +{ + struct ltq_uart_port *ltq_port; + struct uart_port *port; + unsigned long flags; + + if (co->index >= MAXPORTS) + return; + + ltq_port = lqasc_port[co->index]; + if (!ltq_port) + return; + + port = <q_port->port; + + spin_lock_irqsave(<q_asc_lock, flags); + uart_console_write(port, s, count, lqasc_console_putchar); + spin_unlock_irqrestore(<q_asc_lock, flags); +} + +static int __init +lqasc_console_setup(struct console *co, char *options) +{ + struct ltq_uart_port *ltq_port; + struct uart_port *port; + int baud = 115200; + int bits = 8; + int parity = 'n'; + int flow = 'n'; + + if (co->index >= MAXPORTS) + return -ENODEV; + + ltq_port = lqasc_port[co->index]; + if (!ltq_port) + return -ENODEV; + + port = <q_port->port; + + port->uartclk = clk_get_rate(ltq_port->clk); + + if (options) + uart_parse_options(options, &baud, &parity, &bits, &flow); + return uart_set_options(port, co, baud, parity, bits, flow); +} + +static struct console lqasc_console = { + .name = "ttyLTQ", + .write = lqasc_console_write, + .device = uart_console_device, + .setup = lqasc_console_setup, + .flags = CON_PRINTBUFFER, + .index = -1, + .data = &lqasc_reg, +}; + +static int __init +lqasc_console_init(void) +{ + register_console(&lqasc_console); + return 0; +} +console_initcall(lqasc_console_init); + +static struct uart_driver lqasc_reg = { + .owner = THIS_MODULE, + .driver_name = DRVNAME, + .dev_name = "ttyLTQ", + .major = 0, + .minor = 0, + .nr = MAXPORTS, + .cons = &lqasc_console, +}; + +static int __init +lqasc_probe(struct platform_device *pdev) +{ + struct ltq_uart_port *ltq_port; + struct uart_port *port; + struct resource *mmres, *irqres; + int tx_irq, rx_irq, err_irq; + struct clk *clk; + int ret; + + mmres = platform_get_resource(pdev, IORESOURCE_MEM, 0); + irqres = platform_get_resource(pdev, IORESOURCE_IRQ, 0); + if (!mmres || !irqres) + return -ENODEV; + + if (pdev->id >= MAXPORTS) + return -EBUSY; + + if (lqasc_port[pdev->id] != NULL) + return -EBUSY; + + clk = clk_get(&pdev->dev, "fpi"); + if (IS_ERR(clk)) { + pr_err("failed to get fpi clk\n"); + return -ENOENT; + } + + tx_irq = platform_get_irq_byname(pdev, "tx"); + rx_irq = platform_get_irq_byname(pdev, "rx"); + err_irq = platform_get_irq_byname(pdev, "err"); + if ((tx_irq < 0) | (rx_irq < 0) | (err_irq < 0)) + return -ENODEV; + + ltq_port = kzalloc(sizeof(struct ltq_uart_port), GFP_KERNEL); + if (!ltq_port) + return -ENOMEM; + + port = <q_port->port; + + port->iotype = SERIAL_IO_MEM; + port->flags = ASYNC_BOOT_AUTOCONF | UPF_IOREMAP; + port->ops = &lqasc_pops; + port->fifosize = 16; + port->type = PORT_LTQ_ASC, + port->line = pdev->id; + port->dev = &pdev->dev; + + port->irq = tx_irq; /* unused, just to be backward-compatibe */ + port->mapbase = mmres->start; + + ltq_port->clk = clk; + + ltq_port->tx_irq = tx_irq; + ltq_port->rx_irq = rx_irq; + ltq_port->err_irq = err_irq; + + lqasc_port[pdev->id] = ltq_port; + platform_set_drvdata(pdev, ltq_port); + + ret = uart_add_one_port(&lqasc_reg, port); + + return ret; +} + +static struct platform_driver lqasc_driver = { + .driver = { + .name = DRVNAME, + .owner = THIS_MODULE, + }, +}; + +int __init +init_lqasc(void) +{ + int ret; + + ret = uart_register_driver(&lqasc_reg); + if (ret != 0) + return ret; + + ret = platform_driver_probe(&lqasc_driver, lqasc_probe); + if (ret != 0) + uart_unregister_driver(&lqasc_reg); + + return ret; +} + +module_init(init_lqasc); + +MODULE_DESCRIPTION("Lantiq serial port driver"); +MODULE_LICENSE("GPL"); -- cgit v1.1 From dfec1a827d2bdc35d0990afd100f79a685ec0985 Mon Sep 17 00:00:00 2001 From: John Crispin Date: Fri, 6 May 2011 00:10:00 +0200 Subject: MIPS: Lantiq: Add DMA support This patch adds support for the DMA engine found inside the XWAY family of SoCs. The engine has 5 ports and 20 channels. Signed-off-by: John Crispin Signed-off-by: Ralph Hempel Cc: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/2355/ Signed-off-by: Ralf Baechle --- .../mips/include/asm/mach-lantiq/xway/lantiq_soc.h | 3 +- arch/mips/include/asm/mach-lantiq/xway/xway_dma.h | 60 +++++ arch/mips/lantiq/xway/Makefile | 2 +- arch/mips/lantiq/xway/devices.h | 1 + arch/mips/lantiq/xway/dma.c | 253 +++++++++++++++++++++ 5 files changed, 317 insertions(+), 2 deletions(-) create mode 100644 arch/mips/include/asm/mach-lantiq/xway/xway_dma.h create mode 100644 arch/mips/lantiq/xway/dma.c diff --git a/arch/mips/include/asm/mach-lantiq/xway/lantiq_soc.h b/arch/mips/include/asm/mach-lantiq/xway/lantiq_soc.h index 343e82cb..4827afb 100644 --- a/arch/mips/include/asm/mach-lantiq/xway/lantiq_soc.h +++ b/arch/mips/include/asm/mach-lantiq/xway/lantiq_soc.h @@ -86,7 +86,8 @@ #define LTQ_PPE32_SIZE 0x40000 /* DMA */ -#define LTQ_DMA_BASE_ADDR 0xBE104100 +#define LTQ_DMA_BASE_ADDR 0x1E104100 +#define LTQ_DMA_SIZE 0x800 /* PCI */ #define PCI_CR_BASE_ADDR 0x1E105400 diff --git a/arch/mips/include/asm/mach-lantiq/xway/xway_dma.h b/arch/mips/include/asm/mach-lantiq/xway/xway_dma.h new file mode 100644 index 0000000..872943a --- /dev/null +++ b/arch/mips/include/asm/mach-lantiq/xway/xway_dma.h @@ -0,0 +1,60 @@ +/* + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) 2011 John Crispin + */ + +#ifndef LTQ_DMA_H__ +#define LTQ_DMA_H__ + +#define LTQ_DESC_SIZE 0x08 /* each descriptor is 64bit */ +#define LTQ_DESC_NUM 0x40 /* 64 descriptors / channel */ + +#define LTQ_DMA_OWN BIT(31) /* owner bit */ +#define LTQ_DMA_C BIT(30) /* complete bit */ +#define LTQ_DMA_SOP BIT(29) /* start of packet */ +#define LTQ_DMA_EOP BIT(28) /* end of packet */ +#define LTQ_DMA_TX_OFFSET(x) ((x & 0x1f) << 23) /* data bytes offset */ +#define LTQ_DMA_RX_OFFSET(x) ((x & 0x7) << 23) /* data bytes offset */ +#define LTQ_DMA_SIZE_MASK (0xffff) /* the size field is 16 bit */ + +struct ltq_dma_desc { + u32 ctl; + u32 addr; +}; + +struct ltq_dma_channel { + int nr; /* the channel number */ + int irq; /* the mapped irq */ + int desc; /* the current descriptor */ + struct ltq_dma_desc *desc_base; /* the descriptor base */ + int phys; /* physical addr */ +}; + +enum { + DMA_PORT_ETOP = 0, + DMA_PORT_DEU, +}; + +extern void ltq_dma_enable_irq(struct ltq_dma_channel *ch); +extern void ltq_dma_disable_irq(struct ltq_dma_channel *ch); +extern void ltq_dma_ack_irq(struct ltq_dma_channel *ch); +extern void ltq_dma_open(struct ltq_dma_channel *ch); +extern void ltq_dma_close(struct ltq_dma_channel *ch); +extern void ltq_dma_alloc_tx(struct ltq_dma_channel *ch); +extern void ltq_dma_alloc_rx(struct ltq_dma_channel *ch); +extern void ltq_dma_free(struct ltq_dma_channel *ch); +extern void ltq_dma_init_port(int p); + +#endif diff --git a/arch/mips/lantiq/xway/Makefile b/arch/mips/lantiq/xway/Makefile index 6b5e07e..c517f2e 100644 --- a/arch/mips/lantiq/xway/Makefile +++ b/arch/mips/lantiq/xway/Makefile @@ -1,4 +1,4 @@ -obj-y := pmu.o ebu.o reset.o gpio.o gpio_stp.o gpio_ebu.o devices.o +obj-y := pmu.o ebu.o reset.o gpio.o gpio_stp.o gpio_ebu.o devices.o dma.o obj-$(CONFIG_SOC_XWAY) += clk-xway.o prom-xway.o setup-xway.o obj-$(CONFIG_SOC_AMAZON_SE) += clk-ase.o prom-ase.o setup-ase.o diff --git a/arch/mips/lantiq/xway/devices.h b/arch/mips/lantiq/xway/devices.h index 51f56b5..d573084 100644 --- a/arch/mips/lantiq/xway/devices.h +++ b/arch/mips/lantiq/xway/devices.h @@ -10,6 +10,7 @@ #define _LTQ_DEVICES_XWAY_H__ #include "../devices.h" +#include extern void ltq_register_gpio(void); extern void ltq_register_gpio_stp(void); diff --git a/arch/mips/lantiq/xway/dma.c b/arch/mips/lantiq/xway/dma.c new file mode 100644 index 0000000..4278a45 --- /dev/null +++ b/arch/mips/lantiq/xway/dma.c @@ -0,0 +1,253 @@ +/* + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) 2011 John Crispin + */ + +#include +#include +#include +#include + +#include +#include + +#define LTQ_DMA_CTRL 0x10 +#define LTQ_DMA_CPOLL 0x14 +#define LTQ_DMA_CS 0x18 +#define LTQ_DMA_CCTRL 0x1C +#define LTQ_DMA_CDBA 0x20 +#define LTQ_DMA_CDLEN 0x24 +#define LTQ_DMA_CIS 0x28 +#define LTQ_DMA_CIE 0x2C +#define LTQ_DMA_PS 0x40 +#define LTQ_DMA_PCTRL 0x44 +#define LTQ_DMA_IRNEN 0xf4 + +#define DMA_DESCPT BIT(3) /* descriptor complete irq */ +#define DMA_TX BIT(8) /* TX channel direction */ +#define DMA_CHAN_ON BIT(0) /* channel on / off bit */ +#define DMA_PDEN BIT(6) /* enable packet drop */ +#define DMA_CHAN_RST BIT(1) /* channel on / off bit */ +#define DMA_RESET BIT(0) /* channel on / off bit */ +#define DMA_IRQ_ACK 0x7e /* IRQ status register */ +#define DMA_POLL BIT(31) /* turn on channel polling */ +#define DMA_CLK_DIV4 BIT(6) /* polling clock divider */ +#define DMA_2W_BURST BIT(1) /* 2 word burst length */ +#define DMA_MAX_CHANNEL 20 /* the soc has 20 channels */ +#define DMA_ETOP_ENDIANESS (0xf << 8) /* endianess swap etop channels */ +#define DMA_WEIGHT (BIT(17) | BIT(16)) /* default channel wheight */ + +#define ltq_dma_r32(x) ltq_r32(ltq_dma_membase + (x)) +#define ltq_dma_w32(x, y) ltq_w32(x, ltq_dma_membase + (y)) +#define ltq_dma_w32_mask(x, y, z) ltq_w32_mask(x, y, \ + ltq_dma_membase + (z)) + +static struct resource ltq_dma_resource = { + .name = "dma", + .start = LTQ_DMA_BASE_ADDR, + .end = LTQ_DMA_BASE_ADDR + LTQ_DMA_SIZE - 1, + .flags = IORESOURCE_MEM, +}; + +static void __iomem *ltq_dma_membase; + +void +ltq_dma_enable_irq(struct ltq_dma_channel *ch) +{ + unsigned long flags; + + local_irq_save(flags); + ltq_dma_w32(ch->nr, LTQ_DMA_CS); + ltq_dma_w32_mask(0, 1 << ch->nr, LTQ_DMA_IRNEN); + local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(ltq_dma_enable_irq); + +void +ltq_dma_disable_irq(struct ltq_dma_channel *ch) +{ + unsigned long flags; + + local_irq_save(flags); + ltq_dma_w32(ch->nr, LTQ_DMA_CS); + ltq_dma_w32_mask(1 << ch->nr, 0, LTQ_DMA_IRNEN); + local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(ltq_dma_disable_irq); + +void +ltq_dma_ack_irq(struct ltq_dma_channel *ch) +{ + unsigned long flags; + + local_irq_save(flags); + ltq_dma_w32(ch->nr, LTQ_DMA_CS); + ltq_dma_w32(DMA_IRQ_ACK, LTQ_DMA_CIS); + local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(ltq_dma_ack_irq); + +void +ltq_dma_open(struct ltq_dma_channel *ch) +{ + unsigned long flag; + + local_irq_save(flag); + ltq_dma_w32(ch->nr, LTQ_DMA_CS); + ltq_dma_w32_mask(0, DMA_CHAN_ON, LTQ_DMA_CCTRL); + ltq_dma_enable_irq(ch); + local_irq_restore(flag); +} +EXPORT_SYMBOL_GPL(ltq_dma_open); + +void +ltq_dma_close(struct ltq_dma_channel *ch) +{ + unsigned long flag; + + local_irq_save(flag); + ltq_dma_w32(ch->nr, LTQ_DMA_CS); + ltq_dma_w32_mask(DMA_CHAN_ON, 0, LTQ_DMA_CCTRL); + ltq_dma_disable_irq(ch); + local_irq_restore(flag); +} +EXPORT_SYMBOL_GPL(ltq_dma_close); + +static void +ltq_dma_alloc(struct ltq_dma_channel *ch) +{ + unsigned long flags; + + ch->desc = 0; + ch->desc_base = dma_alloc_coherent(NULL, + LTQ_DESC_NUM * LTQ_DESC_SIZE, + &ch->phys, GFP_ATOMIC); + memset(ch->desc_base, 0, LTQ_DESC_NUM * LTQ_DESC_SIZE); + + local_irq_save(flags); + ltq_dma_w32(ch->nr, LTQ_DMA_CS); + ltq_dma_w32(ch->phys, LTQ_DMA_CDBA); + ltq_dma_w32(LTQ_DESC_NUM, LTQ_DMA_CDLEN); + ltq_dma_w32_mask(DMA_CHAN_ON, 0, LTQ_DMA_CCTRL); + wmb(); + ltq_dma_w32_mask(0, DMA_CHAN_RST, LTQ_DMA_CCTRL); + while (ltq_dma_r32(LTQ_DMA_CCTRL) & DMA_CHAN_RST) + ; + local_irq_restore(flags); +} + +void +ltq_dma_alloc_tx(struct ltq_dma_channel *ch) +{ + unsigned long flags; + + ltq_dma_alloc(ch); + + local_irq_save(flags); + ltq_dma_w32(DMA_DESCPT, LTQ_DMA_CIE); + ltq_dma_w32_mask(0, 1 << ch->nr, LTQ_DMA_IRNEN); + ltq_dma_w32(DMA_WEIGHT | DMA_TX, LTQ_DMA_CCTRL); + local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(ltq_dma_alloc_tx); + +void +ltq_dma_alloc_rx(struct ltq_dma_channel *ch) +{ + unsigned long flags; + + ltq_dma_alloc(ch); + + local_irq_save(flags); + ltq_dma_w32(DMA_DESCPT, LTQ_DMA_CIE); + ltq_dma_w32_mask(0, 1 << ch->nr, LTQ_DMA_IRNEN); + ltq_dma_w32(DMA_WEIGHT, LTQ_DMA_CCTRL); + local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(ltq_dma_alloc_rx); + +void +ltq_dma_free(struct ltq_dma_channel *ch) +{ + if (!ch->desc_base) + return; + ltq_dma_close(ch); + dma_free_coherent(NULL, LTQ_DESC_NUM * LTQ_DESC_SIZE, + ch->desc_base, ch->phys); +} +EXPORT_SYMBOL_GPL(ltq_dma_free); + +void +ltq_dma_init_port(int p) +{ + ltq_dma_w32(p, LTQ_DMA_PS); + switch (p) { + case DMA_PORT_ETOP: + /* + * Tell the DMA engine to swap the endianess of data frames and + * drop packets if the channel arbitration fails. + */ + ltq_dma_w32_mask(0, DMA_ETOP_ENDIANESS | DMA_PDEN, + LTQ_DMA_PCTRL); + break; + + case DMA_PORT_DEU: + ltq_dma_w32((DMA_2W_BURST << 4) | (DMA_2W_BURST << 2), + LTQ_DMA_PCTRL); + break; + + default: + break; + } +} +EXPORT_SYMBOL_GPL(ltq_dma_init_port); + +int __init +ltq_dma_init(void) +{ + int i; + + /* insert and request the memory region */ + if (insert_resource(&iomem_resource, <q_dma_resource) < 0) + panic("Failed to insert dma memory\n"); + + if (request_mem_region(ltq_dma_resource.start, + resource_size(<q_dma_resource), "dma") < 0) + panic("Failed to request dma memory\n"); + + /* remap dma register range */ + ltq_dma_membase = ioremap_nocache(ltq_dma_resource.start, + resource_size(<q_dma_resource)); + if (!ltq_dma_membase) + panic("Failed to remap dma memory\n"); + + /* power up and reset the dma engine */ + ltq_pmu_enable(PMU_DMA); + ltq_dma_w32_mask(0, DMA_RESET, LTQ_DMA_CTRL); + + /* disable all interrupts */ + ltq_dma_w32(0, LTQ_DMA_IRNEN); + + /* reset/configure each channel */ + for (i = 0; i < DMA_MAX_CHANNEL; i++) { + ltq_dma_w32(i, LTQ_DMA_CS); + ltq_dma_w32(DMA_CHAN_RST, LTQ_DMA_CCTRL); + ltq_dma_w32(DMA_POLL | DMA_CLK_DIV4, LTQ_DMA_CPOLL); + ltq_dma_w32_mask(DMA_CHAN_ON, 0, LTQ_DMA_CCTRL); + } + return 0; +} + +postcore_initcall(ltq_dma_init); -- cgit v1.1 From 504d4721ee8e432af4b5f196a08af38bc4dac5fe Mon Sep 17 00:00:00 2001 From: John Crispin Date: Fri, 6 May 2011 00:10:01 +0200 Subject: MIPS: Lantiq: Add ethernet driver This patch adds the driver for the ETOP Packet Processing Engine (PPE32) found inside the XWAY family of Lantiq MIPS SoCs. This driver makes 100MBit ethernet work. Support for all 8 dma channels, gbit and the embedded switch found on the ar9/vr9 still needs to be implemented. Signed-off-by: John Crispin Signed-off-by: Ralph Hempel Cc: linux-mips@linux-mips.org Cc: netdev@vger.kernel.org Patchwork: https://patchwork.linux-mips.org/patch/2357/ Acked-by: David S. Miller Signed-off-by: Ralf Baechle --- .../mips/include/asm/mach-lantiq/lantiq_platform.h | 7 + .../mips/include/asm/mach-lantiq/xway/lantiq_soc.h | 4 +- arch/mips/lantiq/xway/devices.c | 23 + arch/mips/lantiq/xway/devices.h | 1 + drivers/net/Kconfig | 7 + drivers/net/Makefile | 1 + drivers/net/lantiq_etop.c | 805 +++++++++++++++++++++ 7 files changed, 846 insertions(+), 2 deletions(-) create mode 100644 drivers/net/lantiq_etop.c diff --git a/arch/mips/include/asm/mach-lantiq/lantiq_platform.h b/arch/mips/include/asm/mach-lantiq/lantiq_platform.h index 1f1dba6..a305f1d 100644 --- a/arch/mips/include/asm/mach-lantiq/lantiq_platform.h +++ b/arch/mips/include/asm/mach-lantiq/lantiq_platform.h @@ -10,6 +10,7 @@ #define _LANTIQ_PLATFORM_H__ #include +#include /* struct used to pass info to the pci core */ enum { @@ -43,4 +44,10 @@ struct ltq_pci_data { int irq[16]; }; +/* struct used to pass info to network drivers */ +struct ltq_eth_data { + struct sockaddr mac; + int mii_mode; +}; + #endif diff --git a/arch/mips/include/asm/mach-lantiq/xway/lantiq_soc.h b/arch/mips/include/asm/mach-lantiq/xway/lantiq_soc.h index 4827afb..8a3c6be 100644 --- a/arch/mips/include/asm/mach-lantiq/xway/lantiq_soc.h +++ b/arch/mips/include/asm/mach-lantiq/xway/lantiq_soc.h @@ -82,8 +82,8 @@ #define PMU_SWITCH 0x10000000 /* ETOP - ethernet */ -#define LTQ_PPE32_BASE_ADDR 0xBE180000 -#define LTQ_PPE32_SIZE 0x40000 +#define LTQ_ETOP_BASE_ADDR 0x1E180000 +#define LTQ_ETOP_SIZE 0x40000 /* DMA */ #define LTQ_DMA_BASE_ADDR 0x1E104100 diff --git a/arch/mips/lantiq/xway/devices.c b/arch/mips/lantiq/xway/devices.c index a71b3b5..e09e789 100644 --- a/arch/mips/lantiq/xway/devices.c +++ b/arch/mips/lantiq/xway/devices.c @@ -96,3 +96,26 @@ void __init ltq_register_ase_asc(void) platform_device_register_simple("ltq_asc", 0, ltq_ase_asc_resources, ARRAY_SIZE(ltq_ase_asc_resources)); } + +/* ethernet */ +static struct resource ltq_etop_resources = { + .name = "etop", + .start = LTQ_ETOP_BASE_ADDR, + .end = LTQ_ETOP_BASE_ADDR + LTQ_ETOP_SIZE - 1, + .flags = IORESOURCE_MEM, +}; + +static struct platform_device ltq_etop = { + .name = "ltq_etop", + .resource = <q_etop_resources, + .num_resources = 1, +}; + +void __init +ltq_register_etop(struct ltq_eth_data *eth) +{ + if (eth) { + ltq_etop.dev.platform_data = eth; + platform_device_register(<q_etop); + } +} diff --git a/arch/mips/lantiq/xway/devices.h b/arch/mips/lantiq/xway/devices.h index d573084..e904934 100644 --- a/arch/mips/lantiq/xway/devices.h +++ b/arch/mips/lantiq/xway/devices.h @@ -15,5 +15,6 @@ extern void ltq_register_gpio(void); extern void ltq_register_gpio_stp(void); extern void ltq_register_ase_asc(void); +extern void ltq_register_etop(struct ltq_eth_data *eth); #endif diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig index 6c884ef..19f04a3 100644 --- a/drivers/net/Kconfig +++ b/drivers/net/Kconfig @@ -2017,6 +2017,13 @@ config FTMAC100 from Faraday. It is used on Faraday A320, Andes AG101 and some other ARM/NDS32 SoC's. +config LANTIQ_ETOP + tristate "Lantiq SoC ETOP driver" + depends on SOC_TYPE_XWAY + help + Support for the MII0 inside the Lantiq SoC + + source "drivers/net/fs_enet/Kconfig" source "drivers/net/octeon/Kconfig" diff --git a/drivers/net/Makefile b/drivers/net/Makefile index e5a7375..209fbb7 100644 --- a/drivers/net/Makefile +++ b/drivers/net/Makefile @@ -259,6 +259,7 @@ obj-$(CONFIG_MLX4_CORE) += mlx4/ obj-$(CONFIG_ENC28J60) += enc28j60.o obj-$(CONFIG_ETHOC) += ethoc.o obj-$(CONFIG_GRETH) += greth.o +obj-$(CONFIG_LANTIQ_ETOP) += lantiq_etop.o obj-$(CONFIG_XTENSA_XT2000_SONIC) += xtsonic.o diff --git a/drivers/net/lantiq_etop.c b/drivers/net/lantiq_etop.c new file mode 100644 index 0000000..45f252b --- /dev/null +++ b/drivers/net/lantiq_etop.c @@ -0,0 +1,805 @@ +/* + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) 2011 John Crispin + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +#define LTQ_ETOP_MDIO 0x11804 +#define MDIO_REQUEST 0x80000000 +#define MDIO_READ 0x40000000 +#define MDIO_ADDR_MASK 0x1f +#define MDIO_ADDR_OFFSET 0x15 +#define MDIO_REG_MASK 0x1f +#define MDIO_REG_OFFSET 0x10 +#define MDIO_VAL_MASK 0xffff + +#define PPE32_CGEN 0x800 +#define LQ_PPE32_ENET_MAC_CFG 0x1840 + +#define LTQ_ETOP_ENETS0 0x11850 +#define LTQ_ETOP_MAC_DA0 0x1186C +#define LTQ_ETOP_MAC_DA1 0x11870 +#define LTQ_ETOP_CFG 0x16020 +#define LTQ_ETOP_IGPLEN 0x16080 + +#define MAX_DMA_CHAN 0x8 +#define MAX_DMA_CRC_LEN 0x4 +#define MAX_DMA_DATA_LEN 0x600 + +#define ETOP_FTCU BIT(28) +#define ETOP_MII_MASK 0xf +#define ETOP_MII_NORMAL 0xd +#define ETOP_MII_REVERSE 0xe +#define ETOP_PLEN_UNDER 0x40 +#define ETOP_CGEN 0x800 + +/* use 2 static channels for TX/RX */ +#define LTQ_ETOP_TX_CHANNEL 1 +#define LTQ_ETOP_RX_CHANNEL 6 +#define IS_TX(x) (x == LTQ_ETOP_TX_CHANNEL) +#define IS_RX(x) (x == LTQ_ETOP_RX_CHANNEL) + +#define ltq_etop_r32(x) ltq_r32(ltq_etop_membase + (x)) +#define ltq_etop_w32(x, y) ltq_w32(x, ltq_etop_membase + (y)) +#define ltq_etop_w32_mask(x, y, z) \ + ltq_w32_mask(x, y, ltq_etop_membase + (z)) + +#define DRV_VERSION "1.0" + +static void __iomem *ltq_etop_membase; + +struct ltq_etop_chan { + int idx; + int tx_free; + struct net_device *netdev; + struct napi_struct napi; + struct ltq_dma_channel dma; + struct sk_buff *skb[LTQ_DESC_NUM]; +}; + +struct ltq_etop_priv { + struct net_device *netdev; + struct ltq_eth_data *pldata; + struct resource *res; + + struct mii_bus *mii_bus; + struct phy_device *phydev; + + struct ltq_etop_chan ch[MAX_DMA_CHAN]; + int tx_free[MAX_DMA_CHAN >> 1]; + + spinlock_t lock; +}; + +static int +ltq_etop_alloc_skb(struct ltq_etop_chan *ch) +{ + ch->skb[ch->dma.desc] = dev_alloc_skb(MAX_DMA_DATA_LEN); + if (!ch->skb[ch->dma.desc]) + return -ENOMEM; + ch->dma.desc_base[ch->dma.desc].addr = dma_map_single(NULL, + ch->skb[ch->dma.desc]->data, MAX_DMA_DATA_LEN, + DMA_FROM_DEVICE); + ch->dma.desc_base[ch->dma.desc].addr = + CPHYSADDR(ch->skb[ch->dma.desc]->data); + ch->dma.desc_base[ch->dma.desc].ctl = + LTQ_DMA_OWN | LTQ_DMA_RX_OFFSET(NET_IP_ALIGN) | + MAX_DMA_DATA_LEN; + skb_reserve(ch->skb[ch->dma.desc], NET_IP_ALIGN); + return 0; +} + +static void +ltq_etop_hw_receive(struct ltq_etop_chan *ch) +{ + struct ltq_etop_priv *priv = netdev_priv(ch->netdev); + struct ltq_dma_desc *desc = &ch->dma.desc_base[ch->dma.desc]; + struct sk_buff *skb = ch->skb[ch->dma.desc]; + int len = (desc->ctl & LTQ_DMA_SIZE_MASK) - MAX_DMA_CRC_LEN; + unsigned long flags; + + spin_lock_irqsave(&priv->lock, flags); + if (ltq_etop_alloc_skb(ch)) { + netdev_err(ch->netdev, + "failed to allocate new rx buffer, stopping DMA\n"); + ltq_dma_close(&ch->dma); + } + ch->dma.desc++; + ch->dma.desc %= LTQ_DESC_NUM; + spin_unlock_irqrestore(&priv->lock, flags); + + skb_put(skb, len); + skb->dev = ch->netdev; + skb->protocol = eth_type_trans(skb, ch->netdev); + netif_receive_skb(skb); +} + +static int +ltq_etop_poll_rx(struct napi_struct *napi, int budget) +{ + struct ltq_etop_chan *ch = container_of(napi, + struct ltq_etop_chan, napi); + int rx = 0; + int complete = 0; + + while ((rx < budget) && !complete) { + struct ltq_dma_desc *desc = &ch->dma.desc_base[ch->dma.desc]; + + if ((desc->ctl & (LTQ_DMA_OWN | LTQ_DMA_C)) == LTQ_DMA_C) { + ltq_etop_hw_receive(ch); + rx++; + } else { + complete = 1; + } + } + if (complete || !rx) { + napi_complete(&ch->napi); + ltq_dma_ack_irq(&ch->dma); + } + return rx; +} + +static int +ltq_etop_poll_tx(struct napi_struct *napi, int budget) +{ + struct ltq_etop_chan *ch = + container_of(napi, struct ltq_etop_chan, napi); + struct ltq_etop_priv *priv = netdev_priv(ch->netdev); + struct netdev_queue *txq = + netdev_get_tx_queue(ch->netdev, ch->idx >> 1); + unsigned long flags; + + spin_lock_irqsave(&priv->lock, flags); + while ((ch->dma.desc_base[ch->tx_free].ctl & + (LTQ_DMA_OWN | LTQ_DMA_C)) == LTQ_DMA_C) { + dev_kfree_skb_any(ch->skb[ch->tx_free]); + ch->skb[ch->tx_free] = NULL; + memset(&ch->dma.desc_base[ch->tx_free], 0, + sizeof(struct ltq_dma_desc)); + ch->tx_free++; + ch->tx_free %= LTQ_DESC_NUM; + } + spin_unlock_irqrestore(&priv->lock, flags); + + if (netif_tx_queue_stopped(txq)) + netif_tx_start_queue(txq); + napi_complete(&ch->napi); + ltq_dma_ack_irq(&ch->dma); + return 1; +} + +static irqreturn_t +ltq_etop_dma_irq(int irq, void *_priv) +{ + struct ltq_etop_priv *priv = _priv; + int ch = irq - LTQ_DMA_CH0_INT; + + napi_schedule(&priv->ch[ch].napi); + return IRQ_HANDLED; +} + +static void +ltq_etop_free_channel(struct net_device *dev, struct ltq_etop_chan *ch) +{ + struct ltq_etop_priv *priv = netdev_priv(dev); + + ltq_dma_free(&ch->dma); + if (ch->dma.irq) + free_irq(ch->dma.irq, priv); + if (IS_RX(ch->idx)) { + int desc; + for (desc = 0; desc < LTQ_DESC_NUM; desc++) + dev_kfree_skb_any(ch->skb[ch->dma.desc]); + } +} + +static void +ltq_etop_hw_exit(struct net_device *dev) +{ + struct ltq_etop_priv *priv = netdev_priv(dev); + int i; + + ltq_pmu_disable(PMU_PPE); + for (i = 0; i < MAX_DMA_CHAN; i++) + if (IS_TX(i) || IS_RX(i)) + ltq_etop_free_channel(dev, &priv->ch[i]); +} + +static int +ltq_etop_hw_init(struct net_device *dev) +{ + struct ltq_etop_priv *priv = netdev_priv(dev); + int i; + + ltq_pmu_enable(PMU_PPE); + + switch (priv->pldata->mii_mode) { + case PHY_INTERFACE_MODE_RMII: + ltq_etop_w32_mask(ETOP_MII_MASK, + ETOP_MII_REVERSE, LTQ_ETOP_CFG); + break; + + case PHY_INTERFACE_MODE_MII: + ltq_etop_w32_mask(ETOP_MII_MASK, + ETOP_MII_NORMAL, LTQ_ETOP_CFG); + break; + + default: + netdev_err(dev, "unknown mii mode %d\n", + priv->pldata->mii_mode); + return -ENOTSUPP; + } + + /* enable crc generation */ + ltq_etop_w32(PPE32_CGEN, LQ_PPE32_ENET_MAC_CFG); + + ltq_dma_init_port(DMA_PORT_ETOP); + + for (i = 0; i < MAX_DMA_CHAN; i++) { + int irq = LTQ_DMA_CH0_INT + i; + struct ltq_etop_chan *ch = &priv->ch[i]; + + ch->idx = ch->dma.nr = i; + + if (IS_TX(i)) { + ltq_dma_alloc_tx(&ch->dma); + request_irq(irq, ltq_etop_dma_irq, IRQF_DISABLED, + "etop_tx", priv); + } else if (IS_RX(i)) { + ltq_dma_alloc_rx(&ch->dma); + for (ch->dma.desc = 0; ch->dma.desc < LTQ_DESC_NUM; + ch->dma.desc++) + if (ltq_etop_alloc_skb(ch)) + return -ENOMEM; + ch->dma.desc = 0; + request_irq(irq, ltq_etop_dma_irq, IRQF_DISABLED, + "etop_rx", priv); + } + ch->dma.irq = irq; + } + return 0; +} + +static void +ltq_etop_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info) +{ + strcpy(info->driver, "Lantiq ETOP"); + strcpy(info->bus_info, "internal"); + strcpy(info->version, DRV_VERSION); +} + +static int +ltq_etop_get_settings(struct net_device *dev, struct ethtool_cmd *cmd) +{ + struct ltq_etop_priv *priv = netdev_priv(dev); + + return phy_ethtool_gset(priv->phydev, cmd); +} + +static int +ltq_etop_set_settings(struct net_device *dev, struct ethtool_cmd *cmd) +{ + struct ltq_etop_priv *priv = netdev_priv(dev); + + return phy_ethtool_sset(priv->phydev, cmd); +} + +static int +ltq_etop_nway_reset(struct net_device *dev) +{ + struct ltq_etop_priv *priv = netdev_priv(dev); + + return phy_start_aneg(priv->phydev); +} + +static const struct ethtool_ops ltq_etop_ethtool_ops = { + .get_drvinfo = ltq_etop_get_drvinfo, + .get_settings = ltq_etop_get_settings, + .set_settings = ltq_etop_set_settings, + .nway_reset = ltq_etop_nway_reset, +}; + +static int +ltq_etop_mdio_wr(struct mii_bus *bus, int phy_addr, int phy_reg, u16 phy_data) +{ + u32 val = MDIO_REQUEST | + ((phy_addr & MDIO_ADDR_MASK) << MDIO_ADDR_OFFSET) | + ((phy_reg & MDIO_REG_MASK) << MDIO_REG_OFFSET) | + phy_data; + + while (ltq_etop_r32(LTQ_ETOP_MDIO) & MDIO_REQUEST) + ; + ltq_etop_w32(val, LTQ_ETOP_MDIO); + return 0; +} + +static int +ltq_etop_mdio_rd(struct mii_bus *bus, int phy_addr, int phy_reg) +{ + u32 val = MDIO_REQUEST | MDIO_READ | + ((phy_addr & MDIO_ADDR_MASK) << MDIO_ADDR_OFFSET) | + ((phy_reg & MDIO_REG_MASK) << MDIO_REG_OFFSET); + + while (ltq_etop_r32(LTQ_ETOP_MDIO) & MDIO_REQUEST) + ; + ltq_etop_w32(val, LTQ_ETOP_MDIO); + while (ltq_etop_r32(LTQ_ETOP_MDIO) & MDIO_REQUEST) + ; + val = ltq_etop_r32(LTQ_ETOP_MDIO) & MDIO_VAL_MASK; + return val; +} + +static void +ltq_etop_mdio_link(struct net_device *dev) +{ + /* nothing to do */ +} + +static int +ltq_etop_mdio_probe(struct net_device *dev) +{ + struct ltq_etop_priv *priv = netdev_priv(dev); + struct phy_device *phydev = NULL; + int phy_addr; + + for (phy_addr = 0; phy_addr < PHY_MAX_ADDR; phy_addr++) { + if (priv->mii_bus->phy_map[phy_addr]) { + phydev = priv->mii_bus->phy_map[phy_addr]; + break; + } + } + + if (!phydev) { + netdev_err(dev, "no PHY found\n"); + return -ENODEV; + } + + phydev = phy_connect(dev, dev_name(&phydev->dev), <q_etop_mdio_link, + 0, priv->pldata->mii_mode); + + if (IS_ERR(phydev)) { + netdev_err(dev, "Could not attach to PHY\n"); + return PTR_ERR(phydev); + } + + phydev->supported &= (SUPPORTED_10baseT_Half + | SUPPORTED_10baseT_Full + | SUPPORTED_100baseT_Half + | SUPPORTED_100baseT_Full + | SUPPORTED_Autoneg + | SUPPORTED_MII + | SUPPORTED_TP); + + phydev->advertising = phydev->supported; + priv->phydev = phydev; + pr_info("%s: attached PHY [%s] (phy_addr=%s, irq=%d)\n", + dev->name, phydev->drv->name, + dev_name(&phydev->dev), phydev->irq); + + return 0; +} + +static int +ltq_etop_mdio_init(struct net_device *dev) +{ + struct ltq_etop_priv *priv = netdev_priv(dev); + int i; + int err; + + priv->mii_bus = mdiobus_alloc(); + if (!priv->mii_bus) { + netdev_err(dev, "failed to allocate mii bus\n"); + err = -ENOMEM; + goto err_out; + } + + priv->mii_bus->priv = dev; + priv->mii_bus->read = ltq_etop_mdio_rd; + priv->mii_bus->write = ltq_etop_mdio_wr; + priv->mii_bus->name = "ltq_mii"; + snprintf(priv->mii_bus->id, MII_BUS_ID_SIZE, "%x", 0); + priv->mii_bus->irq = kmalloc(sizeof(int) * PHY_MAX_ADDR, GFP_KERNEL); + if (!priv->mii_bus->irq) { + err = -ENOMEM; + goto err_out_free_mdiobus; + } + + for (i = 0; i < PHY_MAX_ADDR; ++i) + priv->mii_bus->irq[i] = PHY_POLL; + + if (mdiobus_register(priv->mii_bus)) { + err = -ENXIO; + goto err_out_free_mdio_irq; + } + + if (ltq_etop_mdio_probe(dev)) { + err = -ENXIO; + goto err_out_unregister_bus; + } + return 0; + +err_out_unregister_bus: + mdiobus_unregister(priv->mii_bus); +err_out_free_mdio_irq: + kfree(priv->mii_bus->irq); +err_out_free_mdiobus: + mdiobus_free(priv->mii_bus); +err_out: + return err; +} + +static void +ltq_etop_mdio_cleanup(struct net_device *dev) +{ + struct ltq_etop_priv *priv = netdev_priv(dev); + + phy_disconnect(priv->phydev); + mdiobus_unregister(priv->mii_bus); + kfree(priv->mii_bus->irq); + mdiobus_free(priv->mii_bus); +} + +static int +ltq_etop_open(struct net_device *dev) +{ + struct ltq_etop_priv *priv = netdev_priv(dev); + int i; + + for (i = 0; i < MAX_DMA_CHAN; i++) { + struct ltq_etop_chan *ch = &priv->ch[i]; + + if (!IS_TX(i) && (!IS_RX(i))) + continue; + ltq_dma_open(&ch->dma); + napi_enable(&ch->napi); + } + phy_start(priv->phydev); + netif_tx_start_all_queues(dev); + return 0; +} + +static int +ltq_etop_stop(struct net_device *dev) +{ + struct ltq_etop_priv *priv = netdev_priv(dev); + int i; + + netif_tx_stop_all_queues(dev); + phy_stop(priv->phydev); + for (i = 0; i < MAX_DMA_CHAN; i++) { + struct ltq_etop_chan *ch = &priv->ch[i]; + + if (!IS_RX(i) && !IS_TX(i)) + continue; + napi_disable(&ch->napi); + ltq_dma_close(&ch->dma); + } + return 0; +} + +static int +ltq_etop_tx(struct sk_buff *skb, struct net_device *dev) +{ + int queue = skb_get_queue_mapping(skb); + struct netdev_queue *txq = netdev_get_tx_queue(dev, queue); + struct ltq_etop_priv *priv = netdev_priv(dev); + struct ltq_etop_chan *ch = &priv->ch[(queue << 1) | 1]; + struct ltq_dma_desc *desc = &ch->dma.desc_base[ch->dma.desc]; + int len; + unsigned long flags; + u32 byte_offset; + + len = skb->len < ETH_ZLEN ? ETH_ZLEN : skb->len; + + if ((desc->ctl & (LTQ_DMA_OWN | LTQ_DMA_C)) || ch->skb[ch->dma.desc]) { + dev_kfree_skb_any(skb); + netdev_err(dev, "tx ring full\n"); + netif_tx_stop_queue(txq); + return NETDEV_TX_BUSY; + } + + /* dma needs to start on a 16 byte aligned address */ + byte_offset = CPHYSADDR(skb->data) % 16; + ch->skb[ch->dma.desc] = skb; + + dev->trans_start = jiffies; + + spin_lock_irqsave(&priv->lock, flags); + desc->addr = ((unsigned int) dma_map_single(NULL, skb->data, len, + DMA_TO_DEVICE)) - byte_offset; + wmb(); + desc->ctl = LTQ_DMA_OWN | LTQ_DMA_SOP | LTQ_DMA_EOP | + LTQ_DMA_TX_OFFSET(byte_offset) | (len & LTQ_DMA_SIZE_MASK); + ch->dma.desc++; + ch->dma.desc %= LTQ_DESC_NUM; + spin_unlock_irqrestore(&priv->lock, flags); + + if (ch->dma.desc_base[ch->dma.desc].ctl & LTQ_DMA_OWN) + netif_tx_stop_queue(txq); + + return NETDEV_TX_OK; +} + +static int +ltq_etop_change_mtu(struct net_device *dev, int new_mtu) +{ + int ret = eth_change_mtu(dev, new_mtu); + + if (!ret) { + struct ltq_etop_priv *priv = netdev_priv(dev); + unsigned long flags; + + spin_lock_irqsave(&priv->lock, flags); + ltq_etop_w32((ETOP_PLEN_UNDER << 16) | new_mtu, + LTQ_ETOP_IGPLEN); + spin_unlock_irqrestore(&priv->lock, flags); + } + return ret; +} + +static int +ltq_etop_ioctl(struct net_device *dev, struct ifreq *rq, int cmd) +{ + struct ltq_etop_priv *priv = netdev_priv(dev); + + /* TODO: mii-toll reports "No MII transceiver present!." ?!*/ + return phy_mii_ioctl(priv->phydev, rq, cmd); +} + +static int +ltq_etop_set_mac_address(struct net_device *dev, void *p) +{ + int ret = eth_mac_addr(dev, p); + + if (!ret) { + struct ltq_etop_priv *priv = netdev_priv(dev); + unsigned long flags; + + /* store the mac for the unicast filter */ + spin_lock_irqsave(&priv->lock, flags); + ltq_etop_w32(*((u32 *)dev->dev_addr), LTQ_ETOP_MAC_DA0); + ltq_etop_w32(*((u16 *)&dev->dev_addr[4]) << 16, + LTQ_ETOP_MAC_DA1); + spin_unlock_irqrestore(&priv->lock, flags); + } + return ret; +} + +static void +ltq_etop_set_multicast_list(struct net_device *dev) +{ + struct ltq_etop_priv *priv = netdev_priv(dev); + unsigned long flags; + + /* ensure that the unicast filter is not enabled in promiscious mode */ + spin_lock_irqsave(&priv->lock, flags); + if ((dev->flags & IFF_PROMISC) || (dev->flags & IFF_ALLMULTI)) + ltq_etop_w32_mask(ETOP_FTCU, 0, LTQ_ETOP_ENETS0); + else + ltq_etop_w32_mask(0, ETOP_FTCU, LTQ_ETOP_ENETS0); + spin_unlock_irqrestore(&priv->lock, flags); +} + +static u16 +ltq_etop_select_queue(struct net_device *dev, struct sk_buff *skb) +{ + /* we are currently only using the first queue */ + return 0; +} + +static int +ltq_etop_init(struct net_device *dev) +{ + struct ltq_etop_priv *priv = netdev_priv(dev); + struct sockaddr mac; + int err; + + ether_setup(dev); + dev->watchdog_timeo = 10 * HZ; + err = ltq_etop_hw_init(dev); + if (err) + goto err_hw; + ltq_etop_change_mtu(dev, 1500); + + memcpy(&mac, &priv->pldata->mac, sizeof(struct sockaddr)); + if (!is_valid_ether_addr(mac.sa_data)) { + pr_warn("etop: invalid MAC, using random\n"); + random_ether_addr(mac.sa_data); + } + + err = ltq_etop_set_mac_address(dev, &mac); + if (err) + goto err_netdev; + ltq_etop_set_multicast_list(dev); + err = ltq_etop_mdio_init(dev); + if (err) + goto err_netdev; + return 0; + +err_netdev: + unregister_netdev(dev); + free_netdev(dev); +err_hw: + ltq_etop_hw_exit(dev); + return err; +} + +static void +ltq_etop_tx_timeout(struct net_device *dev) +{ + int err; + + ltq_etop_hw_exit(dev); + err = ltq_etop_hw_init(dev); + if (err) + goto err_hw; + dev->trans_start = jiffies; + netif_wake_queue(dev); + return; + +err_hw: + ltq_etop_hw_exit(dev); + netdev_err(dev, "failed to restart etop after TX timeout\n"); +} + +static const struct net_device_ops ltq_eth_netdev_ops = { + .ndo_open = ltq_etop_open, + .ndo_stop = ltq_etop_stop, + .ndo_start_xmit = ltq_etop_tx, + .ndo_change_mtu = ltq_etop_change_mtu, + .ndo_do_ioctl = ltq_etop_ioctl, + .ndo_set_mac_address = ltq_etop_set_mac_address, + .ndo_validate_addr = eth_validate_addr, + .ndo_set_multicast_list = ltq_etop_set_multicast_list, + .ndo_select_queue = ltq_etop_select_queue, + .ndo_init = ltq_etop_init, + .ndo_tx_timeout = ltq_etop_tx_timeout, +}; + +static int __init +ltq_etop_probe(struct platform_device *pdev) +{ + struct net_device *dev; + struct ltq_etop_priv *priv; + struct resource *res; + int err; + int i; + + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (!res) { + dev_err(&pdev->dev, "failed to get etop resource\n"); + err = -ENOENT; + goto err_out; + } + + res = devm_request_mem_region(&pdev->dev, res->start, + resource_size(res), dev_name(&pdev->dev)); + if (!res) { + dev_err(&pdev->dev, "failed to request etop resource\n"); + err = -EBUSY; + goto err_out; + } + + ltq_etop_membase = devm_ioremap_nocache(&pdev->dev, + res->start, resource_size(res)); + if (!ltq_etop_membase) { + dev_err(&pdev->dev, "failed to remap etop engine %d\n", + pdev->id); + err = -ENOMEM; + goto err_out; + } + + dev = alloc_etherdev_mq(sizeof(struct ltq_etop_priv), 4); + strcpy(dev->name, "eth%d"); + dev->netdev_ops = <q_eth_netdev_ops; + dev->ethtool_ops = <q_etop_ethtool_ops; + priv = netdev_priv(dev); + priv->res = res; + priv->pldata = dev_get_platdata(&pdev->dev); + priv->netdev = dev; + spin_lock_init(&priv->lock); + + for (i = 0; i < MAX_DMA_CHAN; i++) { + if (IS_TX(i)) + netif_napi_add(dev, &priv->ch[i].napi, + ltq_etop_poll_tx, 8); + else if (IS_RX(i)) + netif_napi_add(dev, &priv->ch[i].napi, + ltq_etop_poll_rx, 32); + priv->ch[i].netdev = dev; + } + + err = register_netdev(dev); + if (err) + goto err_free; + + platform_set_drvdata(pdev, dev); + return 0; + +err_free: + kfree(dev); +err_out: + return err; +} + +static int __devexit +ltq_etop_remove(struct platform_device *pdev) +{ + struct net_device *dev = platform_get_drvdata(pdev); + + if (dev) { + netif_tx_stop_all_queues(dev); + ltq_etop_hw_exit(dev); + ltq_etop_mdio_cleanup(dev); + unregister_netdev(dev); + } + return 0; +} + +static struct platform_driver ltq_mii_driver = { + .remove = __devexit_p(ltq_etop_remove), + .driver = { + .name = "ltq_etop", + .owner = THIS_MODULE, + }, +}; + +int __init +init_ltq_etop(void) +{ + int ret = platform_driver_probe(<q_mii_driver, ltq_etop_probe); + + if (ret) + pr_err("ltq_etop: Error registering platfom driver!"); + return ret; +} + +static void __exit +exit_ltq_etop(void) +{ + platform_driver_unregister(<q_mii_driver); +} + +module_init(init_ltq_etop); +module_exit(exit_ltq_etop); + +MODULE_AUTHOR("John Crispin "); +MODULE_DESCRIPTION("Lantiq SoC ETOP"); +MODULE_LICENSE("GPL"); -- cgit v1.1 From f1f0ceaada9d040a41023017c87abb1d651b44af Mon Sep 17 00:00:00 2001 From: John Crispin Date: Fri, 6 May 2011 00:10:02 +0200 Subject: MIPS: Lantiq: Add etop board support Register the etop platform device inside the machtype specific init code. Signed-off-by: John Crispin Signed-off-by: Ralph Hempel Signed-off-by: David Daney Cc: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/2356/ Patchwork: https://patchwork.linux-mips.org/patch/2370/ Signed-off-by: Ralf Baechle --- arch/mips/lantiq/xway/mach-easy50712.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/mips/lantiq/xway/mach-easy50712.c b/arch/mips/lantiq/xway/mach-easy50712.c index e5e7e09..ea5027b 100644 --- a/arch/mips/lantiq/xway/mach-easy50712.c +++ b/arch/mips/lantiq/xway/mach-easy50712.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -55,11 +56,16 @@ static struct ltq_pci_data ltq_pci_data = { }, }; +static struct ltq_eth_data ltq_eth_data = { + .mii_mode = PHY_INTERFACE_MODE_MII, +}; + static void __init easy50712_init(void) { ltq_register_gpio_stp(); ltq_register_nor(&easy50712_flash_data); ltq_register_pci(<q_pci_data); + ltq_register_etop(<q_eth_data); } MIPS_MACHINE(LTQ_MACH_EASY50712, -- cgit v1.1 From 2f58b8d04e680ec13157ba6eee44455438c56d5f Mon Sep 17 00:00:00 2001 From: John Crispin Date: Thu, 5 May 2011 23:00:23 +0200 Subject: MIPS: Lantiq: Add watchdog support This patch adds the driver for the watchdog found inside the Lantiq SoC family. Signed-off-by: John Crispin Signed-off-by: Ralph Hempel Cc: Wim Van Sebroeck Cc: linux-mips@linux-mips.org Cc: linux-watchdog@vger.kernel.org Patchwork: https://patchwork.linux-mips.org/patch/2327/ Signed-off-by: Ralf Baechle --- drivers/watchdog/Kconfig | 6 + drivers/watchdog/Makefile | 1 + drivers/watchdog/lantiq_wdt.c | 261 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 268 insertions(+) create mode 100644 drivers/watchdog/lantiq_wdt.c diff --git a/drivers/watchdog/Kconfig b/drivers/watchdog/Kconfig index 1b0f98b..022f9eb 100644 --- a/drivers/watchdog/Kconfig +++ b/drivers/watchdog/Kconfig @@ -990,6 +990,12 @@ config BCM63XX_WDT To compile this driver as a loadable module, choose M here. The module will be called bcm63xx_wdt. +config LANTIQ_WDT + tristate "Lantiq SoC watchdog" + depends on LANTIQ + help + Hardware driver for the Lantiq SoC Watchdog Timer. + # PARISC Architecture # POWERPC Architecture diff --git a/drivers/watchdog/Makefile b/drivers/watchdog/Makefile index 3f8608b..ed26f70 100644 --- a/drivers/watchdog/Makefile +++ b/drivers/watchdog/Makefile @@ -123,6 +123,7 @@ obj-$(CONFIG_AR7_WDT) += ar7_wdt.o obj-$(CONFIG_TXX9_WDT) += txx9wdt.o obj-$(CONFIG_OCTEON_WDT) += octeon-wdt.o octeon-wdt-y := octeon-wdt-main.o octeon-wdt-nmi.o +obj-$(CONFIG_LANTIQ_WDT) += lantiq_wdt.o # PARISC Architecture diff --git a/drivers/watchdog/lantiq_wdt.c b/drivers/watchdog/lantiq_wdt.c new file mode 100644 index 0000000..7d82ada --- /dev/null +++ b/drivers/watchdog/lantiq_wdt.c @@ -0,0 +1,261 @@ +/* + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * Copyright (C) 2010 John Crispin + * Based on EP93xx wdt driver + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +/* Section 3.4 of the datasheet + * The password sequence protects the WDT control register from unintended + * write actions, which might cause malfunction of the WDT. + * + * essentially the following two magic passwords need to be written to allow + * IO access to the WDT core + */ +#define LTQ_WDT_PW1 0x00BE0000 +#define LTQ_WDT_PW2 0x00DC0000 + +#define LTQ_WDT_CR 0x0 /* watchdog control register */ +#define LTQ_WDT_SR 0x8 /* watchdog status register */ + +#define LTQ_WDT_SR_EN (0x1 << 31) /* enable bit */ +#define LTQ_WDT_SR_PWD (0x3 << 26) /* turn on power */ +#define LTQ_WDT_SR_CLKDIV (0x3 << 24) /* turn on clock and set */ + /* divider to 0x40000 */ +#define LTQ_WDT_DIVIDER 0x40000 +#define LTQ_MAX_TIMEOUT ((1 << 16) - 1) /* the reload field is 16 bit */ + +static int nowayout = WATCHDOG_NOWAYOUT; + +static void __iomem *ltq_wdt_membase; +static unsigned long ltq_io_region_clk_rate; + +static unsigned long ltq_wdt_bootstatus; +static unsigned long ltq_wdt_in_use; +static int ltq_wdt_timeout = 30; +static int ltq_wdt_ok_to_close; + +static void +ltq_wdt_enable(void) +{ + ltq_wdt_timeout = ltq_wdt_timeout * + (ltq_io_region_clk_rate / LTQ_WDT_DIVIDER) + 0x1000; + if (ltq_wdt_timeout > LTQ_MAX_TIMEOUT) + ltq_wdt_timeout = LTQ_MAX_TIMEOUT; + + /* write the first password magic */ + ltq_w32(LTQ_WDT_PW1, ltq_wdt_membase + LTQ_WDT_CR); + /* write the second magic plus the configuration and new timeout */ + ltq_w32(LTQ_WDT_SR_EN | LTQ_WDT_SR_PWD | LTQ_WDT_SR_CLKDIV | + LTQ_WDT_PW2 | ltq_wdt_timeout, ltq_wdt_membase + LTQ_WDT_CR); +} + +static void +ltq_wdt_disable(void) +{ + /* write the first password magic */ + ltq_w32(LTQ_WDT_PW1, ltq_wdt_membase + LTQ_WDT_CR); + /* write the second password magic with no config + * this turns the watchdog off + */ + ltq_w32(LTQ_WDT_PW2, ltq_wdt_membase + LTQ_WDT_CR); +} + +static ssize_t +ltq_wdt_write(struct file *file, const char __user *data, + size_t len, loff_t *ppos) +{ + if (len) { + if (!nowayout) { + size_t i; + + ltq_wdt_ok_to_close = 0; + for (i = 0; i != len; i++) { + char c; + + if (get_user(c, data + i)) + return -EFAULT; + if (c == 'V') + ltq_wdt_ok_to_close = 1; + else + ltq_wdt_ok_to_close = 0; + } + } + ltq_wdt_enable(); + } + + return len; +} + +static struct watchdog_info ident = { + .options = WDIOF_MAGICCLOSE | WDIOF_SETTIMEOUT | WDIOF_KEEPALIVEPING | + WDIOF_CARDRESET, + .identity = "ltq_wdt", +}; + +static long +ltq_wdt_ioctl(struct file *file, + unsigned int cmd, unsigned long arg) +{ + int ret = -ENOTTY; + + switch (cmd) { + case WDIOC_GETSUPPORT: + ret = copy_to_user((struct watchdog_info __user *)arg, &ident, + sizeof(ident)) ? -EFAULT : 0; + break; + + case WDIOC_GETBOOTSTATUS: + ret = put_user(ltq_wdt_bootstatus, (int __user *)arg); + break; + + case WDIOC_GETSTATUS: + ret = put_user(0, (int __user *)arg); + break; + + case WDIOC_SETTIMEOUT: + ret = get_user(ltq_wdt_timeout, (int __user *)arg); + if (!ret) + ltq_wdt_enable(); + /* intentional drop through */ + case WDIOC_GETTIMEOUT: + ret = put_user(ltq_wdt_timeout, (int __user *)arg); + break; + + case WDIOC_KEEPALIVE: + ltq_wdt_enable(); + ret = 0; + break; + } + return ret; +} + +static int +ltq_wdt_open(struct inode *inode, struct file *file) +{ + if (test_and_set_bit(0, <q_wdt_in_use)) + return -EBUSY; + ltq_wdt_in_use = 1; + ltq_wdt_enable(); + + return nonseekable_open(inode, file); +} + +static int +ltq_wdt_release(struct inode *inode, struct file *file) +{ + if (ltq_wdt_ok_to_close) + ltq_wdt_disable(); + else + pr_err("ltq_wdt: watchdog closed without warning\n"); + ltq_wdt_ok_to_close = 0; + clear_bit(0, <q_wdt_in_use); + + return 0; +} + +static const struct file_operations ltq_wdt_fops = { + .owner = THIS_MODULE, + .write = ltq_wdt_write, + .unlocked_ioctl = ltq_wdt_ioctl, + .open = ltq_wdt_open, + .release = ltq_wdt_release, + .llseek = no_llseek, +}; + +static struct miscdevice ltq_wdt_miscdev = { + .minor = WATCHDOG_MINOR, + .name = "watchdog", + .fops = <q_wdt_fops, +}; + +static int __init +ltq_wdt_probe(struct platform_device *pdev) +{ + struct resource *res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + struct clk *clk; + + if (!res) { + dev_err(&pdev->dev, "cannot obtain I/O memory region"); + return -ENOENT; + } + res = devm_request_mem_region(&pdev->dev, res->start, + resource_size(res), dev_name(&pdev->dev)); + if (!res) { + dev_err(&pdev->dev, "cannot request I/O memory region"); + return -EBUSY; + } + ltq_wdt_membase = devm_ioremap_nocache(&pdev->dev, res->start, + resource_size(res)); + if (!ltq_wdt_membase) { + dev_err(&pdev->dev, "cannot remap I/O memory region\n"); + return -ENOMEM; + } + + /* we do not need to enable the clock as it is always running */ + clk = clk_get(&pdev->dev, "io"); + WARN_ON(!clk); + ltq_io_region_clk_rate = clk_get_rate(clk); + clk_put(clk); + + if (ltq_reset_cause() == LTQ_RST_CAUSE_WDTRST) + ltq_wdt_bootstatus = WDIOF_CARDRESET; + + return misc_register(<q_wdt_miscdev); +} + +static int __devexit +ltq_wdt_remove(struct platform_device *pdev) +{ + misc_deregister(<q_wdt_miscdev); + + if (ltq_wdt_membase) + iounmap(ltq_wdt_membase); + + return 0; +} + + +static struct platform_driver ltq_wdt_driver = { + .remove = __devexit_p(ltq_wdt_remove), + .driver = { + .name = "ltq_wdt", + .owner = THIS_MODULE, + }, +}; + +static int __init +init_ltq_wdt(void) +{ + return platform_driver_probe(<q_wdt_driver, ltq_wdt_probe); +} + +static void __exit +exit_ltq_wdt(void) +{ + return platform_driver_unregister(<q_wdt_driver); +} + +module_init(init_ltq_wdt); +module_exit(exit_ltq_wdt); + +module_param(nowayout, int, 0); +MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once started"); + +MODULE_AUTHOR("John Crispin "); +MODULE_DESCRIPTION("Lantiq SoC Watchdog"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_MISCDEV(WATCHDOG_MINOR); -- cgit v1.1 From 4f0ad950880a33df792b1e63649e29f8784b0163 Mon Sep 17 00:00:00 2001 From: Ralf Baechle Date: Thu, 19 May 2011 09:21:28 +0100 Subject: MIPS: IP27: Remove pointless switch statement. Signed-off-by: Ralf Baechle --- arch/mips/sgi-ip27/ip27-timer.c | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/arch/mips/sgi-ip27/ip27-timer.c b/arch/mips/sgi-ip27/ip27-timer.c index a152538..3f810c9 100644 --- a/arch/mips/sgi-ip27/ip27-timer.c +++ b/arch/mips/sgi-ip27/ip27-timer.c @@ -66,18 +66,7 @@ static int rt_next_event(unsigned long delta, struct clock_event_device *evt) static void rt_set_mode(enum clock_event_mode mode, struct clock_event_device *evt) { - switch (mode) { - case CLOCK_EVT_MODE_ONESHOT: - /* The only mode supported */ - break; - - case CLOCK_EVT_MODE_PERIODIC: - case CLOCK_EVT_MODE_UNUSED: - case CLOCK_EVT_MODE_SHUTDOWN: - case CLOCK_EVT_MODE_RESUME: - /* Nothing to do */ - break; - } + /* Nothing to do ... */ } int rt_timer_irq; -- cgit v1.1 From c19c20ac6338435469a2c222ef5dc55e0469a6dc Mon Sep 17 00:00:00 2001 From: Ralf Baechle Date: Thu, 19 May 2011 09:21:28 +0100 Subject: MIPS: Use single define for pending work on syscall exit Signed-off-by: Ralf Baechle --- arch/mips/include/asm/thread_info.h | 3 +++ arch/mips/kernel/entry.S | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/mips/include/asm/thread_info.h b/arch/mips/include/asm/thread_info.h index d71160d..97f8bf6 100644 --- a/arch/mips/include/asm/thread_info.h +++ b/arch/mips/include/asm/thread_info.h @@ -149,6 +149,9 @@ register struct thread_info *__current_thread_info __asm__("$28"); #define _TIF_FPUBOUND (1< Date: Thu, 19 May 2011 09:21:29 +0100 Subject: MIPS: Split do_syscall_trace into two functions. Signed-off-by: Ralf Baechle --- arch/mips/include/asm/ptrace.h | 3 ++- arch/mips/kernel/entry.S | 5 ++--- arch/mips/kernel/ptrace.c | 43 ++++++++++++++++++++++++++++++++++-------- arch/mips/kernel/scall32-o32.S | 3 +-- arch/mips/kernel/scall64-64.S | 3 +-- arch/mips/kernel/scall64-n32.S | 3 +-- arch/mips/kernel/scall64-o32.S | 3 +-- 7 files changed, 43 insertions(+), 20 deletions(-) diff --git a/arch/mips/include/asm/ptrace.h b/arch/mips/include/asm/ptrace.h index 9f1b8db..de39b1f 100644 --- a/arch/mips/include/asm/ptrace.h +++ b/arch/mips/include/asm/ptrace.h @@ -141,7 +141,8 @@ extern int ptrace_set_watch_regs(struct task_struct *child, #define instruction_pointer(regs) ((regs)->cp0_epc) #define profile_pc(regs) instruction_pointer(regs) -extern asmlinkage void do_syscall_trace(struct pt_regs *regs, int entryexit); +extern asmlinkage void syscall_trace_enter(struct pt_regs *regs); +extern asmlinkage void syscall_trace_leave(struct pt_regs *regs); extern NORET_TYPE void die(const char *, struct pt_regs *) ATTRIB_NORET; diff --git a/arch/mips/kernel/entry.S b/arch/mips/kernel/entry.S index a917e34..37acfa0 100644 --- a/arch/mips/kernel/entry.S +++ b/arch/mips/kernel/entry.S @@ -170,11 +170,10 @@ syscall_exit_work: li t0, _TIF_WORK_SYSCALL_EXIT and t0, a2 # a2 is preloaded with TI_FLAGS beqz t0, work_pending # trace bit set? - local_irq_enable # could let do_syscall_trace() + local_irq_enable # could let syscall_trace_leave() # call schedule() instead move a0, sp - li a1, 1 - jal do_syscall_trace + jal syscall_trace_leave b resume_userspace #if defined(CONFIG_CPU_MIPSR2) || defined(CONFIG_MIPS_MT) diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c index 584e6b5..4e6ea1f 100644 --- a/arch/mips/kernel/ptrace.c +++ b/arch/mips/kernel/ptrace.c @@ -533,15 +533,10 @@ static inline int audit_arch(void) * Notification of system call entry/exit * - triggered by current->work.syscall_trace */ -asmlinkage void do_syscall_trace(struct pt_regs *regs, int entryexit) +asmlinkage void syscall_trace_enter(struct pt_regs *regs) { /* do the secure computing check first */ - if (!entryexit) - secure_computing(regs->regs[2]); - - if (unlikely(current->audit_context) && entryexit) - audit_syscall_exit(AUDITSC_RESULT(regs->regs[7]), - -regs->regs[2]); + secure_computing(regs->regs[2]); if (!(current->ptrace & PT_PTRACED)) goto out; @@ -565,8 +560,40 @@ asmlinkage void do_syscall_trace(struct pt_regs *regs, int entryexit) } out: - if (unlikely(current->audit_context) && !entryexit) + if (unlikely(current->audit_context)) audit_syscall_entry(audit_arch(), regs->regs[2], regs->regs[4], regs->regs[5], regs->regs[6], regs->regs[7]); } + +/* + * Notification of system call entry/exit + * - triggered by current->work.syscall_trace + */ +asmlinkage void syscall_trace_leave(struct pt_regs *regs) +{ + if (unlikely(current->audit_context)) + audit_syscall_exit(AUDITSC_RESULT(regs->regs[7]), + -regs->regs[2]); + + if (!(current->ptrace & PT_PTRACED)) + return; + + if (!test_thread_flag(TIF_SYSCALL_TRACE)) + return; + + /* The 0x80 provides a way for the tracing parent to distinguish + between a syscall stop and SIGTRAP delivery */ + ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD) ? + 0x80 : 0)); + + /* + * this isn't the same as continuing with a signal, but it will do + * for normal use. strace only continues with a signal if the + * stopping signal is not SIGTRAP. -brl + */ + if (current->exit_code) { + send_sig(current->exit_code, current, 1); + current->exit_code = 0; + } +} diff --git a/arch/mips/kernel/scall32-o32.S b/arch/mips/kernel/scall32-o32.S index 7f1377e..7a8e1dd 100644 --- a/arch/mips/kernel/scall32-o32.S +++ b/arch/mips/kernel/scall32-o32.S @@ -88,8 +88,7 @@ syscall_trace_entry: SAVE_STATIC move s0, t2 move a0, sp - li a1, 0 - jal do_syscall_trace + jal syscall_trace_enter move t0, s0 RESTORE_STATIC diff --git a/arch/mips/kernel/scall64-64.S b/arch/mips/kernel/scall64-64.S index 7c0ef7f..2d31c83 100644 --- a/arch/mips/kernel/scall64-64.S +++ b/arch/mips/kernel/scall64-64.S @@ -91,8 +91,7 @@ syscall_trace_entry: SAVE_STATIC move s0, t2 move a0, sp - li a1, 0 - jal do_syscall_trace + jal syscall_trace_enter move t0, s0 RESTORE_STATIC diff --git a/arch/mips/kernel/scall64-n32.S b/arch/mips/kernel/scall64-n32.S index de6c556..38a0503 100644 --- a/arch/mips/kernel/scall64-n32.S +++ b/arch/mips/kernel/scall64-n32.S @@ -89,8 +89,7 @@ n32_syscall_trace_entry: SAVE_STATIC move s0, t2 move a0, sp - li a1, 0 - jal do_syscall_trace + jal syscall_trace_enter move t0, s0 RESTORE_STATIC diff --git a/arch/mips/kernel/scall64-o32.S b/arch/mips/kernel/scall64-o32.S index b0541dd..91ea5e4 100644 --- a/arch/mips/kernel/scall64-o32.S +++ b/arch/mips/kernel/scall64-o32.S @@ -123,8 +123,7 @@ trace_a_syscall: move s0, t2 # Save syscall pointer move a0, sp - li a1, 0 - jal do_syscall_trace + jal syscall_trace_enter move t0, s0 RESTORE_STATIC -- cgit v1.1 From 0591128066bdfe07e0ef0ab7f877f794d8ba071d Mon Sep 17 00:00:00 2001 From: Manuel Lauss Date: Sun, 8 May 2011 10:42:12 +0200 Subject: MIPS: DB1200: Set Config[OD] for improved stability. Setting Config[OD] gets rid of a _LOT_ of spurious CPLD interrupts, but also decreases overall performance a bit. Signed-off-by: Manuel Lauss To: Linux-MIPS Cc: Florian Fainelli Cc: Wolfgang Grandegger Patchwork: https://patchwork.linux-mips.org/patch/2347/ Signed-off-by: Ralf Baechle --- arch/mips/alchemy/common/setup.c | 4 ++-- arch/mips/alchemy/devboards/db1200/setup.c | 7 +++++++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/arch/mips/alchemy/common/setup.c b/arch/mips/alchemy/common/setup.c index 561e5da..1b887c8 100644 --- a/arch/mips/alchemy/common/setup.c +++ b/arch/mips/alchemy/common/setup.c @@ -52,8 +52,6 @@ void __init plat_mem_setup(void) /* this is faster than wasting cycles trying to approximate it */ preset_lpj = (est_freq >> 1) / HZ; - board_setup(); /* board specific setup */ - if (au1xxx_cpu_needs_config_od()) /* Various early Au1xx0 errata corrected by this */ set_c0_config(1 << 19); /* Set Config[OD] */ @@ -61,6 +59,8 @@ void __init plat_mem_setup(void) /* Clear to obtain best system bus performance */ clear_c0_config(1 << 19); /* Clear Config[OD] */ + board_setup(); /* board specific setup */ + /* IO/MEM resources. */ set_io_port_base(0); ioport_resource.start = IOPORT_RESOURCE_START; diff --git a/arch/mips/alchemy/devboards/db1200/setup.c b/arch/mips/alchemy/devboards/db1200/setup.c index 4a89800..1dac4f2 100644 --- a/arch/mips/alchemy/devboards/db1200/setup.c +++ b/arch/mips/alchemy/devboards/db1200/setup.c @@ -23,6 +23,13 @@ void __init board_setup(void) unsigned long freq0, clksrc, div, pfc; unsigned short whoami; + /* Set Config[OD] (disable overlapping bus transaction): + * This gets rid of a _lot_ of spurious interrupts (especially + * wrt. IDE); but incurs ~10% performance hit in some + * cpu-bound applications. + */ + set_c0_config(1 << 19); + bcsr_init(DB1200_BCSR_PHYS_ADDR, DB1200_BCSR_PHYS_ADDR + DB1200_BCSR_HEXLED_OFS); -- cgit v1.1 From c1e58a3129bc327f7e0eb06fd4fe5ebf2af5d8ef Mon Sep 17 00:00:00 2001 From: Manuel Lauss Date: Sun, 8 May 2011 10:42:13 +0200 Subject: MIPS: Alchemy: update inlinable GPIO API This fixes a build failure with gpio_keys and CONFIG_GPIOLIB=n (mtx1): CC drivers/input/keyboard/gpio_keys.o gpio_keys.c: In function 'gpio_keys_report_event': gpio_keys.c:325:2: error: implicit declaration of function 'gpio_get_value_cansleep' gpio_keys.c: In function 'gpio_keys_setup_key': gpio_keys.c:390:3: error: implicit declaration of function 'gpio_set_debounce' Also add stubs for the other new functions. Signed-off-by: Manuel Lauss To: Linux-MIPS Cc: Florian Fainelli Cc: Wolfgang Grandegger Patchwork: https://patchwork.linux-mips.org/patch/2346/ Signed-off-by: Ralf Baechle --- arch/mips/include/asm/mach-au1x00/gpio-au1000.h | 51 +++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/arch/mips/include/asm/mach-au1x00/gpio-au1000.h b/arch/mips/include/asm/mach-au1x00/gpio-au1000.h index 62d2f13..8f8c1c5 100644 --- a/arch/mips/include/asm/mach-au1x00/gpio-au1000.h +++ b/arch/mips/include/asm/mach-au1x00/gpio-au1000.h @@ -24,6 +24,7 @@ #define MAKE_IRQ(intc, off) (AU1000_INTC##intc##_INT_BASE + (off)) +struct gpio; static inline int au1000_gpio1_to_irq(int gpio) { @@ -556,6 +557,16 @@ static inline void gpio_set_value(int gpio, int v) alchemy_gpio_set_value(gpio, v); } +static inline int gpio_get_value_cansleep(unsigned gpio) +{ + return gpio_get_value(gpio); +} + +static inline void gpio_set_value_cansleep(unsigned gpio, int value) +{ + gpio_set_value(gpio, value); +} + static inline int gpio_is_valid(int gpio) { return alchemy_gpio_is_valid(gpio); @@ -581,10 +592,50 @@ static inline int gpio_request(unsigned gpio, const char *label) return 0; } +static inline int gpio_request_one(unsigned gpio, + unsigned long flags, const char *label) +{ + return 0; +} + +static inline int gpio_request_array(struct gpio *array, size_t num) +{ + return 0; +} + static inline void gpio_free(unsigned gpio) { } +static inline void gpio_free_array(struct gpio *array, size_t num) +{ +} + +static inline int gpio_set_debounce(unsigned gpio, unsigned debounce) +{ + return -ENOSYS; +} + +static inline int gpio_export(unsigned gpio, bool direction_may_change) +{ + return -ENOSYS; +} + +static inline int gpio_export_link(struct device *dev, const char *name, + unsigned gpio) +{ + return -ENOSYS; +} + +static inline int gpio_sysfs_set_active_low(unsigned gpio, int value) +{ + return -ENOSYS; +} + +static inline void gpio_unexport(unsigned gpio) +{ +} + #endif /* !CONFIG_ALCHEMY_GPIO_INDIRECT */ -- cgit v1.1 From dca7587185b3a499a09a9e2755316eee31c49c7f Mon Sep 17 00:00:00 2001 From: Manuel Lauss Date: Sun, 8 May 2011 10:42:14 +0200 Subject: MIPS: Alchemy: irq code and constant cleanup replace au_readl/au_writel with __raw_readl/__raw_writel, and clean up IC-related stuff from the headers. Signed-off-by: Manuel Lauss To: Linux-MIPS Cc: Florian Fainelli Cc: Wolfgang Grandegger Patchwork: https://patchwork.linux-mips.org/patch/2354/ Signed-off-by: Ralf Baechle --- arch/mips/alchemy/common/irq.c | 250 ++++++++++++++++------------- arch/mips/include/asm/mach-au1x00/au1000.h | 121 +------------- 2 files changed, 140 insertions(+), 231 deletions(-) diff --git a/arch/mips/alchemy/common/irq.c b/arch/mips/alchemy/common/irq.c index 55dd7c8..b72e128 100644 --- a/arch/mips/alchemy/common/irq.c +++ b/arch/mips/alchemy/common/irq.c @@ -39,6 +39,36 @@ #include #endif +/* Interrupt Controller register offsets */ +#define IC_CFG0RD 0x40 +#define IC_CFG0SET 0x40 +#define IC_CFG0CLR 0x44 +#define IC_CFG1RD 0x48 +#define IC_CFG1SET 0x48 +#define IC_CFG1CLR 0x4C +#define IC_CFG2RD 0x50 +#define IC_CFG2SET 0x50 +#define IC_CFG2CLR 0x54 +#define IC_REQ0INT 0x54 +#define IC_SRCRD 0x58 +#define IC_SRCSET 0x58 +#define IC_SRCCLR 0x5C +#define IC_REQ1INT 0x5C +#define IC_ASSIGNRD 0x60 +#define IC_ASSIGNSET 0x60 +#define IC_ASSIGNCLR 0x64 +#define IC_WAKERD 0x68 +#define IC_WAKESET 0x68 +#define IC_WAKECLR 0x6C +#define IC_MASKRD 0x70 +#define IC_MASKSET 0x70 +#define IC_MASKCLR 0x74 +#define IC_RISINGRD 0x78 +#define IC_RISINGCLR 0x78 +#define IC_FALLINGRD 0x7C +#define IC_FALLINGCLR 0x7C +#define IC_TESTBIT 0x80 + static int au1x_ic_settype(struct irq_data *d, unsigned int flow_type); /* NOTE on interrupt priorities: The original writers of this code said: @@ -221,89 +251,101 @@ struct au1xxx_irqmap au1200_irqmap[] __initdata = { static void au1x_ic0_unmask(struct irq_data *d) { unsigned int bit = d->irq - AU1000_INTC0_INT_BASE; - au_writel(1 << bit, IC0_MASKSET); - au_writel(1 << bit, IC0_WAKESET); - au_sync(); + void __iomem *base = (void __iomem *)KSEG1ADDR(AU1000_IC0_PHYS_ADDR); + + __raw_writel(1 << bit, base + IC_MASKSET); + __raw_writel(1 << bit, base + IC_WAKESET); + wmb(); } static void au1x_ic1_unmask(struct irq_data *d) { unsigned int bit = d->irq - AU1000_INTC1_INT_BASE; - au_writel(1 << bit, IC1_MASKSET); - au_writel(1 << bit, IC1_WAKESET); + void __iomem *base = (void __iomem *)KSEG1ADDR(AU1000_IC1_PHYS_ADDR); + + __raw_writel(1 << bit, base + IC_MASKSET); + __raw_writel(1 << bit, base + IC_WAKESET); /* very hacky. does the pb1000 cpld auto-disable this int? * nowhere in the current kernel sources is it disabled. --mlau */ #if defined(CONFIG_MIPS_PB1000) if (d->irq == AU1000_GPIO15_INT) - au_writel(0x4000, PB1000_MDR); /* enable int */ + __raw_writel(0x4000, (void __iomem *)PB1000_MDR); /* enable int */ #endif - au_sync(); + wmb(); } static void au1x_ic0_mask(struct irq_data *d) { unsigned int bit = d->irq - AU1000_INTC0_INT_BASE; - au_writel(1 << bit, IC0_MASKCLR); - au_writel(1 << bit, IC0_WAKECLR); - au_sync(); + void __iomem *base = (void __iomem *)KSEG1ADDR(AU1000_IC0_PHYS_ADDR); + + __raw_writel(1 << bit, base + IC_MASKCLR); + __raw_writel(1 << bit, base + IC_WAKECLR); + wmb(); } static void au1x_ic1_mask(struct irq_data *d) { unsigned int bit = d->irq - AU1000_INTC1_INT_BASE; - au_writel(1 << bit, IC1_MASKCLR); - au_writel(1 << bit, IC1_WAKECLR); - au_sync(); + void __iomem *base = (void __iomem *)KSEG1ADDR(AU1000_IC1_PHYS_ADDR); + + __raw_writel(1 << bit, base + IC_MASKCLR); + __raw_writel(1 << bit, base + IC_WAKECLR); + wmb(); } static void au1x_ic0_ack(struct irq_data *d) { unsigned int bit = d->irq - AU1000_INTC0_INT_BASE; + void __iomem *base = (void __iomem *)KSEG1ADDR(AU1000_IC0_PHYS_ADDR); /* * This may assume that we don't get interrupts from * both edges at once, or if we do, that we don't care. */ - au_writel(1 << bit, IC0_FALLINGCLR); - au_writel(1 << bit, IC0_RISINGCLR); - au_sync(); + __raw_writel(1 << bit, base + IC_FALLINGCLR); + __raw_writel(1 << bit, base + IC_RISINGCLR); + wmb(); } static void au1x_ic1_ack(struct irq_data *d) { unsigned int bit = d->irq - AU1000_INTC1_INT_BASE; + void __iomem *base = (void __iomem *)KSEG1ADDR(AU1000_IC1_PHYS_ADDR); /* * This may assume that we don't get interrupts from * both edges at once, or if we do, that we don't care. */ - au_writel(1 << bit, IC1_FALLINGCLR); - au_writel(1 << bit, IC1_RISINGCLR); - au_sync(); + __raw_writel(1 << bit, base + IC_FALLINGCLR); + __raw_writel(1 << bit, base + IC_RISINGCLR); + wmb(); } static void au1x_ic0_maskack(struct irq_data *d) { unsigned int bit = d->irq - AU1000_INTC0_INT_BASE; + void __iomem *base = (void __iomem *)KSEG1ADDR(AU1000_IC0_PHYS_ADDR); - au_writel(1 << bit, IC0_WAKECLR); - au_writel(1 << bit, IC0_MASKCLR); - au_writel(1 << bit, IC0_RISINGCLR); - au_writel(1 << bit, IC0_FALLINGCLR); - au_sync(); + __raw_writel(1 << bit, base + IC_WAKECLR); + __raw_writel(1 << bit, base + IC_MASKCLR); + __raw_writel(1 << bit, base + IC_RISINGCLR); + __raw_writel(1 << bit, base + IC_FALLINGCLR); + wmb(); } static void au1x_ic1_maskack(struct irq_data *d) { unsigned int bit = d->irq - AU1000_INTC1_INT_BASE; + void __iomem *base = (void __iomem *)KSEG1ADDR(AU1000_IC1_PHYS_ADDR); - au_writel(1 << bit, IC1_WAKECLR); - au_writel(1 << bit, IC1_MASKCLR); - au_writel(1 << bit, IC1_RISINGCLR); - au_writel(1 << bit, IC1_FALLINGCLR); - au_sync(); + __raw_writel(1 << bit, base + IC_WAKECLR); + __raw_writel(1 << bit, base + IC_MASKCLR); + __raw_writel(1 << bit, base + IC_RISINGCLR); + __raw_writel(1 << bit, base + IC_FALLINGCLR); + wmb(); } static int au1x_ic1_setwake(struct irq_data *d, unsigned int on) @@ -318,13 +360,13 @@ static int au1x_ic1_setwake(struct irq_data *d, unsigned int on) return -EINVAL; local_irq_save(flags); - wakemsk = au_readl(SYS_WAKEMSK); + wakemsk = __raw_readl((void __iomem *)SYS_WAKEMSK); if (on) wakemsk |= 1 << bit; else wakemsk &= ~(1 << bit); - au_writel(wakemsk, SYS_WAKEMSK); - au_sync(); + __raw_writel(wakemsk, (void __iomem *)SYS_WAKEMSK); + wmb(); local_irq_restore(flags); return 0; @@ -356,81 +398,74 @@ static struct irq_chip au1x_ic1_chip = { static int au1x_ic_settype(struct irq_data *d, unsigned int flow_type) { struct irq_chip *chip; - unsigned long icr[6]; - unsigned int bit, ic, irq = d->irq; + unsigned int bit, irq = d->irq; irq_flow_handler_t handler = NULL; unsigned char *name = NULL; + void __iomem *base; int ret; if (irq >= AU1000_INTC1_INT_BASE) { bit = irq - AU1000_INTC1_INT_BASE; chip = &au1x_ic1_chip; - ic = 1; + base = (void __iomem *)KSEG1ADDR(AU1000_IC1_PHYS_ADDR); } else { bit = irq - AU1000_INTC0_INT_BASE; chip = &au1x_ic0_chip; - ic = 0; + base = (void __iomem *)KSEG1ADDR(AU1000_IC0_PHYS_ADDR); } if (bit > 31) return -EINVAL; - icr[0] = ic ? IC1_CFG0SET : IC0_CFG0SET; - icr[1] = ic ? IC1_CFG1SET : IC0_CFG1SET; - icr[2] = ic ? IC1_CFG2SET : IC0_CFG2SET; - icr[3] = ic ? IC1_CFG0CLR : IC0_CFG0CLR; - icr[4] = ic ? IC1_CFG1CLR : IC0_CFG1CLR; - icr[5] = ic ? IC1_CFG2CLR : IC0_CFG2CLR; - ret = 0; switch (flow_type) { /* cfgregs 2:1:0 */ case IRQ_TYPE_EDGE_RISING: /* 0:0:1 */ - au_writel(1 << bit, icr[5]); - au_writel(1 << bit, icr[4]); - au_writel(1 << bit, icr[0]); + __raw_writel(1 << bit, base + IC_CFG2CLR); + __raw_writel(1 << bit, base + IC_CFG1CLR); + __raw_writel(1 << bit, base + IC_CFG0SET); handler = handle_edge_irq; name = "riseedge"; break; case IRQ_TYPE_EDGE_FALLING: /* 0:1:0 */ - au_writel(1 << bit, icr[5]); - au_writel(1 << bit, icr[1]); - au_writel(1 << bit, icr[3]); + __raw_writel(1 << bit, base + IC_CFG2CLR); + __raw_writel(1 << bit, base + IC_CFG1SET); + __raw_writel(1 << bit, base + IC_CFG0CLR); handler = handle_edge_irq; name = "falledge"; break; case IRQ_TYPE_EDGE_BOTH: /* 0:1:1 */ - au_writel(1 << bit, icr[5]); - au_writel(1 << bit, icr[1]); - au_writel(1 << bit, icr[0]); + __raw_writel(1 << bit, base + IC_CFG2CLR); + __raw_writel(1 << bit, base + IC_CFG1SET); + __raw_writel(1 << bit, base + IC_CFG0SET); handler = handle_edge_irq; name = "bothedge"; break; case IRQ_TYPE_LEVEL_HIGH: /* 1:0:1 */ - au_writel(1 << bit, icr[2]); - au_writel(1 << bit, icr[4]); - au_writel(1 << bit, icr[0]); + __raw_writel(1 << bit, base + IC_CFG2SET); + __raw_writel(1 << bit, base + IC_CFG1CLR); + __raw_writel(1 << bit, base + IC_CFG0SET); handler = handle_level_irq; name = "hilevel"; break; case IRQ_TYPE_LEVEL_LOW: /* 1:1:0 */ - au_writel(1 << bit, icr[2]); - au_writel(1 << bit, icr[1]); - au_writel(1 << bit, icr[3]); + __raw_writel(1 << bit, base + IC_CFG2SET); + __raw_writel(1 << bit, base + IC_CFG1SET); + __raw_writel(1 << bit, base + IC_CFG0CLR); handler = handle_level_irq; name = "lowlevel"; break; case IRQ_TYPE_NONE: /* 0:0:0 */ - au_writel(1 << bit, icr[5]); - au_writel(1 << bit, icr[4]); - au_writel(1 << bit, icr[3]); + __raw_writel(1 << bit, base + IC_CFG2CLR); + __raw_writel(1 << bit, base + IC_CFG1CLR); + __raw_writel(1 << bit, base + IC_CFG0CLR); break; default: ret = -EINVAL; } __irq_set_chip_handler_name_locked(d->irq, chip, handler, name); - au_sync(); + wmb(); return ret; } @@ -444,21 +479,21 @@ asmlinkage void plat_irq_dispatch(void) off = MIPS_CPU_IRQ_BASE + 7; goto handle; } else if (pending & CAUSEF_IP2) { - s = IC0_REQ0INT; + s = KSEG1ADDR(AU1000_IC0_PHYS_ADDR) + IC_REQ0INT; off = AU1000_INTC0_INT_BASE; } else if (pending & CAUSEF_IP3) { - s = IC0_REQ1INT; + s = KSEG1ADDR(AU1000_IC0_PHYS_ADDR) + IC_REQ1INT; off = AU1000_INTC0_INT_BASE; } else if (pending & CAUSEF_IP4) { - s = IC1_REQ0INT; + s = KSEG1ADDR(AU1000_IC1_PHYS_ADDR) + IC_REQ0INT; off = AU1000_INTC1_INT_BASE; } else if (pending & CAUSEF_IP5) { - s = IC1_REQ1INT; + s = KSEG1ADDR(AU1000_IC1_PHYS_ADDR) + IC_REQ1INT; off = AU1000_INTC1_INT_BASE; } else goto spurious; - s = au_readl(s); + s = __raw_readl((void __iomem *)s); if (unlikely(!s)) { spurious: spurious_interrupt(); @@ -469,48 +504,42 @@ handle: do_IRQ(off); } + +static inline void ic_init(void __iomem *base) +{ + /* initialize interrupt controller to a safe state */ + __raw_writel(0xffffffff, base + IC_CFG0CLR); + __raw_writel(0xffffffff, base + IC_CFG1CLR); + __raw_writel(0xffffffff, base + IC_CFG2CLR); + __raw_writel(0xffffffff, base + IC_MASKCLR); + __raw_writel(0xffffffff, base + IC_ASSIGNCLR); + __raw_writel(0xffffffff, base + IC_WAKECLR); + __raw_writel(0xffffffff, base + IC_SRCSET); + __raw_writel(0xffffffff, base + IC_FALLINGCLR); + __raw_writel(0xffffffff, base + IC_RISINGCLR); + __raw_writel(0x00000000, base + IC_TESTBIT); + wmb(); +} + static void __init au1000_init_irq(struct au1xxx_irqmap *map) { unsigned int bit, irq_nr; - int i; - - /* - * Initialize interrupt controllers to a safe state. - */ - au_writel(0xffffffff, IC0_CFG0CLR); - au_writel(0xffffffff, IC0_CFG1CLR); - au_writel(0xffffffff, IC0_CFG2CLR); - au_writel(0xffffffff, IC0_MASKCLR); - au_writel(0xffffffff, IC0_ASSIGNCLR); - au_writel(0xffffffff, IC0_WAKECLR); - au_writel(0xffffffff, IC0_SRCSET); - au_writel(0xffffffff, IC0_FALLINGCLR); - au_writel(0xffffffff, IC0_RISINGCLR); - au_writel(0x00000000, IC0_TESTBIT); - - au_writel(0xffffffff, IC1_CFG0CLR); - au_writel(0xffffffff, IC1_CFG1CLR); - au_writel(0xffffffff, IC1_CFG2CLR); - au_writel(0xffffffff, IC1_MASKCLR); - au_writel(0xffffffff, IC1_ASSIGNCLR); - au_writel(0xffffffff, IC1_WAKECLR); - au_writel(0xffffffff, IC1_SRCSET); - au_writel(0xffffffff, IC1_FALLINGCLR); - au_writel(0xffffffff, IC1_RISINGCLR); - au_writel(0x00000000, IC1_TESTBIT); + void __iomem *base; + ic_init((void __iomem *)KSEG1ADDR(AU1000_IC0_PHYS_ADDR)); + ic_init((void __iomem *)KSEG1ADDR(AU1000_IC1_PHYS_ADDR)); mips_cpu_irq_init(); /* register all 64 possible IC0+IC1 irq sources as type "none". * Use set_irq_type() to set edge/level behaviour at runtime. */ - for (i = AU1000_INTC0_INT_BASE; - (i < AU1000_INTC0_INT_BASE + 32); i++) - au1x_ic_settype(irq_get_irq_data(i), IRQ_TYPE_NONE); + for (irq_nr = AU1000_INTC0_INT_BASE; + (irq_nr < AU1000_INTC0_INT_BASE + 32); irq_nr++) + au1x_ic_settype(irq_get_irq_data(irq_nr), IRQ_TYPE_NONE); - for (i = AU1000_INTC1_INT_BASE; - (i < AU1000_INTC1_INT_BASE + 32); i++) - au1x_ic_settype(irq_get_irq_data(i), IRQ_TYPE_NONE); + for (irq_nr = AU1000_INTC1_INT_BASE; + (irq_nr < AU1000_INTC1_INT_BASE + 32); irq_nr++) + au1x_ic_settype(irq_get_irq_data(irq_nr), IRQ_TYPE_NONE); /* * Initialize IC0, which is fixed per processor. @@ -520,13 +549,13 @@ static void __init au1000_init_irq(struct au1xxx_irqmap *map) if (irq_nr >= AU1000_INTC1_INT_BASE) { bit = irq_nr - AU1000_INTC1_INT_BASE; - if (map->im_request) - au_writel(1 << bit, IC1_ASSIGNSET); + base = (void __iomem *)KSEG1ADDR(AU1000_IC1_PHYS_ADDR); } else { bit = irq_nr - AU1000_INTC0_INT_BASE; - if (map->im_request) - au_writel(1 << bit, IC0_ASSIGNSET); + base = (void __iomem *)KSEG1ADDR(AU1000_IC0_PHYS_ADDR); } + if (map->im_request) + __raw_writel(1 << bit, base + IC_ASSIGNSET); au1x_ic_settype(irq_get_irq_data(irq_nr), map->im_type); ++map; @@ -583,17 +612,8 @@ static int alchemy_ic_resume(struct sys_device *dev) struct alchemy_ic_sysdev *icdev = container_of(dev, struct alchemy_ic_sysdev, sysdev); - __raw_writel(0xffffffff, icdev->base + IC_MASKCLR); - __raw_writel(0xffffffff, icdev->base + IC_CFG0CLR); - __raw_writel(0xffffffff, icdev->base + IC_CFG1CLR); - __raw_writel(0xffffffff, icdev->base + IC_CFG2CLR); - __raw_writel(0xffffffff, icdev->base + IC_SRCCLR); - __raw_writel(0xffffffff, icdev->base + IC_ASSIGNCLR); - __raw_writel(0xffffffff, icdev->base + IC_WAKECLR); - __raw_writel(0xffffffff, icdev->base + IC_RISINGCLR); - __raw_writel(0xffffffff, icdev->base + IC_FALLINGCLR); - __raw_writel(0x00000000, icdev->base + IC_TESTBIT); - wmb(); + ic_init(icdev->base); + __raw_writel(icdev->pmdata[0], icdev->base + IC_CFG0SET); __raw_writel(icdev->pmdata[1], icdev->base + IC_CFG1SET); __raw_writel(icdev->pmdata[2], icdev->base + IC_CFG2SET); @@ -617,7 +637,7 @@ static struct sysdev_class alchemy_ic_sysdev_class = { static int __init alchemy_ic_sysdev_init(void) { struct alchemy_ic_sysdev *icdev; - unsigned long icbase[2] = { IC0_PHYS_ADDR, IC1_PHYS_ADDR }; + unsigned long icbase[2] = { AU1000_IC0_PHYS_ADDR, AU1000_IC1_PHYS_ADDR }; int err, i; err = sysdev_class_register(&alchemy_ic_sysdev_class); diff --git a/arch/mips/include/asm/mach-au1x00/au1000.h b/arch/mips/include/asm/mach-au1x00/au1000.h index a697661..66cfcdc 100644 --- a/arch/mips/include/asm/mach-au1x00/au1000.h +++ b/arch/mips/include/asm/mach-au1x00/au1000.h @@ -630,8 +630,13 @@ enum soc_au1200_ints { /* * Physical base addresses for integrated peripherals + * 0..au1000 1..au1500 2..au1100 3..au1550 4..au1200 */ +#define AU1000_IC0_PHYS_ADDR 0x10400000 /* 01234 */ +#define AU1000_IC1_PHYS_ADDR 0x11800000 /* 01234 */ + + #ifdef CONFIG_SOC_AU1000 #define MEM_PHYS_ADDR 0x14000000 #define STATIC_MEM_PHYS_ADDR 0x14001000 @@ -643,8 +648,6 @@ enum soc_au1200_ints { #define DMA5_PHYS_ADDR 0x14002500 #define DMA6_PHYS_ADDR 0x14002600 #define DMA7_PHYS_ADDR 0x14002700 -#define IC0_PHYS_ADDR 0x10400000 -#define IC1_PHYS_ADDR 0x11800000 #define AC97_PHYS_ADDR 0x10000000 #define USBH_PHYS_ADDR 0x10100000 #define USBD_PHYS_ADDR 0x10200000 @@ -680,8 +683,6 @@ enum soc_au1200_ints { #define DMA5_PHYS_ADDR 0x14002500 #define DMA6_PHYS_ADDR 0x14002600 #define DMA7_PHYS_ADDR 0x14002700 -#define IC0_PHYS_ADDR 0x10400000 -#define IC1_PHYS_ADDR 0x11800000 #define AC97_PHYS_ADDR 0x10000000 #define USBH_PHYS_ADDR 0x10100000 #define USBD_PHYS_ADDR 0x10200000 @@ -718,10 +719,8 @@ enum soc_au1200_ints { #define DMA5_PHYS_ADDR 0x14002500 #define DMA6_PHYS_ADDR 0x14002600 #define DMA7_PHYS_ADDR 0x14002700 -#define IC0_PHYS_ADDR 0x10400000 #define SD0_PHYS_ADDR 0x10600000 #define SD1_PHYS_ADDR 0x10680000 -#define IC1_PHYS_ADDR 0x11800000 #define AC97_PHYS_ADDR 0x10000000 #define USBH_PHYS_ADDR 0x10100000 #define USBD_PHYS_ADDR 0x10200000 @@ -749,8 +748,6 @@ enum soc_au1200_ints { #ifdef CONFIG_SOC_AU1550 #define MEM_PHYS_ADDR 0x14000000 #define STATIC_MEM_PHYS_ADDR 0x14001000 -#define IC0_PHYS_ADDR 0x10400000 -#define IC1_PHYS_ADDR 0x11800000 #define USBH_PHYS_ADDR 0x14020000 #define USBD_PHYS_ADDR 0x10200000 #define PCI_PHYS_ADDR 0x14005000 @@ -786,8 +783,6 @@ enum soc_au1200_ints { #define STATIC_MEM_PHYS_ADDR 0x14001000 #define AES_PHYS_ADDR 0x10300000 #define CIM_PHYS_ADDR 0x14004000 -#define IC0_PHYS_ADDR 0x10400000 -#define IC1_PHYS_ADDR 0x11800000 #define USBM_PHYS_ADDR 0x14020000 #define USBH_PHYS_ADDR 0x14020100 #define UART0_PHYS_ADDR 0x11100000 @@ -835,112 +830,6 @@ enum soc_au1200_ints { #endif -/* Interrupt Controller register offsets */ -#define IC_CFG0RD 0x40 -#define IC_CFG0SET 0x40 -#define IC_CFG0CLR 0x44 -#define IC_CFG1RD 0x48 -#define IC_CFG1SET 0x48 -#define IC_CFG1CLR 0x4C -#define IC_CFG2RD 0x50 -#define IC_CFG2SET 0x50 -#define IC_CFG2CLR 0x54 -#define IC_REQ0INT 0x54 -#define IC_SRCRD 0x58 -#define IC_SRCSET 0x58 -#define IC_SRCCLR 0x5C -#define IC_REQ1INT 0x5C -#define IC_ASSIGNRD 0x60 -#define IC_ASSIGNSET 0x60 -#define IC_ASSIGNCLR 0x64 -#define IC_WAKERD 0x68 -#define IC_WAKESET 0x68 -#define IC_WAKECLR 0x6C -#define IC_MASKRD 0x70 -#define IC_MASKSET 0x70 -#define IC_MASKCLR 0x74 -#define IC_RISINGRD 0x78 -#define IC_RISINGCLR 0x78 -#define IC_FALLINGRD 0x7C -#define IC_FALLINGCLR 0x7C -#define IC_TESTBIT 0x80 - - -/* Interrupt Controller 0 */ -#define IC0_CFG0RD 0xB0400040 -#define IC0_CFG0SET 0xB0400040 -#define IC0_CFG0CLR 0xB0400044 - -#define IC0_CFG1RD 0xB0400048 -#define IC0_CFG1SET 0xB0400048 -#define IC0_CFG1CLR 0xB040004C - -#define IC0_CFG2RD 0xB0400050 -#define IC0_CFG2SET 0xB0400050 -#define IC0_CFG2CLR 0xB0400054 - -#define IC0_REQ0INT 0xB0400054 -#define IC0_SRCRD 0xB0400058 -#define IC0_SRCSET 0xB0400058 -#define IC0_SRCCLR 0xB040005C -#define IC0_REQ1INT 0xB040005C - -#define IC0_ASSIGNRD 0xB0400060 -#define IC0_ASSIGNSET 0xB0400060 -#define IC0_ASSIGNCLR 0xB0400064 - -#define IC0_WAKERD 0xB0400068 -#define IC0_WAKESET 0xB0400068 -#define IC0_WAKECLR 0xB040006C - -#define IC0_MASKRD 0xB0400070 -#define IC0_MASKSET 0xB0400070 -#define IC0_MASKCLR 0xB0400074 - -#define IC0_RISINGRD 0xB0400078 -#define IC0_RISINGCLR 0xB0400078 -#define IC0_FALLINGRD 0xB040007C -#define IC0_FALLINGCLR 0xB040007C - -#define IC0_TESTBIT 0xB0400080 - -/* Interrupt Controller 1 */ -#define IC1_CFG0RD 0xB1800040 -#define IC1_CFG0SET 0xB1800040 -#define IC1_CFG0CLR 0xB1800044 - -#define IC1_CFG1RD 0xB1800048 -#define IC1_CFG1SET 0xB1800048 -#define IC1_CFG1CLR 0xB180004C - -#define IC1_CFG2RD 0xB1800050 -#define IC1_CFG2SET 0xB1800050 -#define IC1_CFG2CLR 0xB1800054 - -#define IC1_REQ0INT 0xB1800054 -#define IC1_SRCRD 0xB1800058 -#define IC1_SRCSET 0xB1800058 -#define IC1_SRCCLR 0xB180005C -#define IC1_REQ1INT 0xB180005C - -#define IC1_ASSIGNRD 0xB1800060 -#define IC1_ASSIGNSET 0xB1800060 -#define IC1_ASSIGNCLR 0xB1800064 - -#define IC1_WAKERD 0xB1800068 -#define IC1_WAKESET 0xB1800068 -#define IC1_WAKECLR 0xB180006C - -#define IC1_MASKRD 0xB1800070 -#define IC1_MASKSET 0xB1800070 -#define IC1_MASKCLR 0xB1800074 - -#define IC1_RISINGRD 0xB1800078 -#define IC1_RISINGCLR 0xB1800078 -#define IC1_FALLINGRD 0xB180007C -#define IC1_FALLINGCLR 0xB180007C - -#define IC1_TESTBIT 0xB1800080 /* Au1000 */ -- cgit v1.1 From 4b5c82b5e57ac6cb919e7e74984e28b312bdf10c Mon Sep 17 00:00:00 2001 From: Manuel Lauss Date: Sun, 8 May 2011 10:42:15 +0200 Subject: MIPS: Alchemy: Convert irq.c to syscore_ops. Convert the PM sysdev to use syscore_ops instead. Signed-off-by: Manuel Lauss To: Linux-MIPS Cc: Florian Fainelli Cc: Wolfgang Grandegger Patchwork: https://patchwork.linux-mips.org/patch/2350/ Signed-off-by: Ralf Baechle --- arch/mips/alchemy/common/irq.c | 101 +++++++++++++++++------------------------ 1 file changed, 41 insertions(+), 60 deletions(-) diff --git a/arch/mips/alchemy/common/irq.c b/arch/mips/alchemy/common/irq.c index b72e128..8b60ba0 100644 --- a/arch/mips/alchemy/common/irq.c +++ b/arch/mips/alchemy/common/irq.c @@ -30,7 +30,7 @@ #include #include #include -#include +#include #include #include @@ -585,81 +585,62 @@ void __init arch_init_irq(void) } } -struct alchemy_ic_sysdev { - struct sys_device sysdev; - void __iomem *base; - unsigned long pmdata[7]; -}; - -static int alchemy_ic_suspend(struct sys_device *dev, pm_message_t state) -{ - struct alchemy_ic_sysdev *icdev = - container_of(dev, struct alchemy_ic_sysdev, sysdev); - icdev->pmdata[0] = __raw_readl(icdev->base + IC_CFG0RD); - icdev->pmdata[1] = __raw_readl(icdev->base + IC_CFG1RD); - icdev->pmdata[2] = __raw_readl(icdev->base + IC_CFG2RD); - icdev->pmdata[3] = __raw_readl(icdev->base + IC_SRCRD); - icdev->pmdata[4] = __raw_readl(icdev->base + IC_ASSIGNRD); - icdev->pmdata[5] = __raw_readl(icdev->base + IC_WAKERD); - icdev->pmdata[6] = __raw_readl(icdev->base + IC_MASKRD); +static unsigned long alchemy_ic_pmdata[7 * 2]; - return 0; +static inline void alchemy_ic_suspend_one(void __iomem *base, unsigned long *d) +{ + d[0] = __raw_readl(base + IC_CFG0RD); + d[1] = __raw_readl(base + IC_CFG1RD); + d[2] = __raw_readl(base + IC_CFG2RD); + d[3] = __raw_readl(base + IC_SRCRD); + d[4] = __raw_readl(base + IC_ASSIGNRD); + d[5] = __raw_readl(base + IC_WAKERD); + d[6] = __raw_readl(base + IC_MASKRD); + ic_init(base); /* shut it up too while at it */ } -static int alchemy_ic_resume(struct sys_device *dev) +static inline void alchemy_ic_resume_one(void __iomem *base, unsigned long *d) { - struct alchemy_ic_sysdev *icdev = - container_of(dev, struct alchemy_ic_sysdev, sysdev); - - ic_init(icdev->base); - - __raw_writel(icdev->pmdata[0], icdev->base + IC_CFG0SET); - __raw_writel(icdev->pmdata[1], icdev->base + IC_CFG1SET); - __raw_writel(icdev->pmdata[2], icdev->base + IC_CFG2SET); - __raw_writel(icdev->pmdata[3], icdev->base + IC_SRCSET); - __raw_writel(icdev->pmdata[4], icdev->base + IC_ASSIGNSET); - __raw_writel(icdev->pmdata[5], icdev->base + IC_WAKESET); + ic_init(base); + + __raw_writel(d[0], base + IC_CFG0SET); + __raw_writel(d[1], base + IC_CFG1SET); + __raw_writel(d[2], base + IC_CFG2SET); + __raw_writel(d[3], base + IC_SRCSET); + __raw_writel(d[4], base + IC_ASSIGNSET); + __raw_writel(d[5], base + IC_WAKESET); wmb(); - __raw_writel(icdev->pmdata[6], icdev->base + IC_MASKSET); + __raw_writel(d[6], base + IC_MASKSET); wmb(); +} +static int alchemy_ic_suspend(void) +{ + alchemy_ic_suspend_one((void __iomem *)KSEG1ADDR(AU1000_IC0_PHYS_ADDR), + alchemy_ic_pmdata); + alchemy_ic_suspend_one((void __iomem *)KSEG1ADDR(AU1000_IC1_PHYS_ADDR), + &alchemy_ic_pmdata[7]); return 0; } -static struct sysdev_class alchemy_ic_sysdev_class = { - .name = "ic", +static void alchemy_ic_resume(void) +{ + alchemy_ic_resume_one((void __iomem *)KSEG1ADDR(AU1000_IC1_PHYS_ADDR), + &alchemy_ic_pmdata[7]); + alchemy_ic_resume_one((void __iomem *)KSEG1ADDR(AU1000_IC0_PHYS_ADDR), + alchemy_ic_pmdata); +} + +static struct syscore_ops alchemy_ic_syscore_ops = { .suspend = alchemy_ic_suspend, .resume = alchemy_ic_resume, }; -static int __init alchemy_ic_sysdev_init(void) +static int __init alchemy_ic_pm_init(void) { - struct alchemy_ic_sysdev *icdev; - unsigned long icbase[2] = { AU1000_IC0_PHYS_ADDR, AU1000_IC1_PHYS_ADDR }; - int err, i; - - err = sysdev_class_register(&alchemy_ic_sysdev_class); - if (err) - return err; - - for (i = 0; i < 2; i++) { - icdev = kzalloc(sizeof(struct alchemy_ic_sysdev), GFP_KERNEL); - if (!icdev) - return -ENOMEM; - - icdev->base = ioremap(icbase[i], 0x1000); - - icdev->sysdev.id = i; - icdev->sysdev.cls = &alchemy_ic_sysdev_class; - err = sysdev_register(&icdev->sysdev); - if (err) { - kfree(icdev); - return err; - } - } - + register_syscore_ops(&alchemy_ic_syscore_ops); return 0; } -device_initcall(alchemy_ic_sysdev_init); +device_initcall(alchemy_ic_pm_init); -- cgit v1.1 From adcb86279f1e4d7a1a9f267b49441aecf4a5110a Mon Sep 17 00:00:00 2001 From: Manuel Lauss Date: Sun, 8 May 2011 10:42:16 +0200 Subject: MIPS: Alchemy: Convert dbdma.c to syscore_ops Convert the PM sysdev to syscore_ops and clean up the ddma addresses a bit. Signed-off-by: Manuel Lauss To: Linux-MIPS Cc: Florian Fainelli Cc: Wolfgang Grandegger Patchwork: https://patchwork.linux-mips.org/patch/2351/ Signed-off-by: Ralf Baechle --- arch/mips/alchemy/common/dbdma.c | 123 +++++++++-------------- arch/mips/include/asm/mach-au1x00/au1000.h | 4 +- arch/mips/include/asm/mach-au1x00/au1xxx_dbdma.h | 8 -- 3 files changed, 47 insertions(+), 88 deletions(-) diff --git a/arch/mips/alchemy/common/dbdma.c b/arch/mips/alchemy/common/dbdma.c index ca0506a..3a5abb5 100644 --- a/arch/mips/alchemy/common/dbdma.c +++ b/arch/mips/alchemy/common/dbdma.c @@ -36,7 +36,7 @@ #include #include #include -#include +#include #include #include @@ -58,7 +58,8 @@ static DEFINE_SPINLOCK(au1xxx_dbdma_spin_lock); /* I couldn't find a macro that did this... */ #define ALIGN_ADDR(x, a) ((((u32)(x)) + (a-1)) & ~(a-1)) -static dbdma_global_t *dbdma_gptr = (dbdma_global_t *)DDMA_GLOBAL_BASE; +static dbdma_global_t *dbdma_gptr = + (dbdma_global_t *)KSEG1ADDR(AU1550_DBDMA_CONF_PHYS_ADDR); static int dbdma_initialized; static dbdev_tab_t dbdev_tab[] = { @@ -299,7 +300,7 @@ u32 au1xxx_dbdma_chan_alloc(u32 srcid, u32 destid, if (ctp != NULL) { memset(ctp, 0, sizeof(chan_tab_t)); ctp->chan_index = chan = i; - dcp = DDMA_CHANNEL_BASE; + dcp = KSEG1ADDR(AU1550_DBDMA_PHYS_ADDR); dcp += (0x0100 * chan); ctp->chan_ptr = (au1x_dma_chan_t *)dcp; cp = (au1x_dma_chan_t *)dcp; @@ -958,105 +959,75 @@ u32 au1xxx_dbdma_put_dscr(u32 chanid, au1x_ddma_desc_t *dscr) } -struct alchemy_dbdma_sysdev { - struct sys_device sysdev; - u32 pm_regs[NUM_DBDMA_CHANS + 1][6]; -}; +static unsigned long alchemy_dbdma_pm_data[NUM_DBDMA_CHANS + 1][6]; -static int alchemy_dbdma_suspend(struct sys_device *dev, - pm_message_t state) +static int alchemy_dbdma_suspend(void) { - struct alchemy_dbdma_sysdev *sdev = - container_of(dev, struct alchemy_dbdma_sysdev, sysdev); int i; - u32 addr; + void __iomem *addr; - addr = DDMA_GLOBAL_BASE; - sdev->pm_regs[0][0] = au_readl(addr + 0x00); - sdev->pm_regs[0][1] = au_readl(addr + 0x04); - sdev->pm_regs[0][2] = au_readl(addr + 0x08); - sdev->pm_regs[0][3] = au_readl(addr + 0x0c); + addr = (void __iomem *)KSEG1ADDR(AU1550_DBDMA_CONF_PHYS_ADDR); + alchemy_dbdma_pm_data[0][0] = __raw_readl(addr + 0x00); + alchemy_dbdma_pm_data[0][1] = __raw_readl(addr + 0x04); + alchemy_dbdma_pm_data[0][2] = __raw_readl(addr + 0x08); + alchemy_dbdma_pm_data[0][3] = __raw_readl(addr + 0x0c); /* save channel configurations */ - for (i = 1, addr = DDMA_CHANNEL_BASE; i <= NUM_DBDMA_CHANS; i++) { - sdev->pm_regs[i][0] = au_readl(addr + 0x00); - sdev->pm_regs[i][1] = au_readl(addr + 0x04); - sdev->pm_regs[i][2] = au_readl(addr + 0x08); - sdev->pm_regs[i][3] = au_readl(addr + 0x0c); - sdev->pm_regs[i][4] = au_readl(addr + 0x10); - sdev->pm_regs[i][5] = au_readl(addr + 0x14); + addr = (void __iomem *)KSEG1ADDR(AU1550_DBDMA_PHYS_ADDR); + for (i = 1; i <= NUM_DBDMA_CHANS; i++) { + alchemy_dbdma_pm_data[i][0] = __raw_readl(addr + 0x00); + alchemy_dbdma_pm_data[i][1] = __raw_readl(addr + 0x04); + alchemy_dbdma_pm_data[i][2] = __raw_readl(addr + 0x08); + alchemy_dbdma_pm_data[i][3] = __raw_readl(addr + 0x0c); + alchemy_dbdma_pm_data[i][4] = __raw_readl(addr + 0x10); + alchemy_dbdma_pm_data[i][5] = __raw_readl(addr + 0x14); /* halt channel */ - au_writel(sdev->pm_regs[i][0] & ~1, addr + 0x00); - au_sync(); - while (!(au_readl(addr + 0x14) & 1)) - au_sync(); + __raw_writel(alchemy_dbdma_pm_data[i][0] & ~1, addr + 0x00); + wmb(); + while (!(__raw_readl(addr + 0x14) & 1)) + wmb(); addr += 0x100; /* next channel base */ } /* disable channel interrupts */ - au_writel(0, DDMA_GLOBAL_BASE + 0x0c); - au_sync(); + addr = (void __iomem *)KSEG1ADDR(AU1550_DBDMA_CONF_PHYS_ADDR); + __raw_writel(0, addr + 0x0c); + wmb(); return 0; } -static int alchemy_dbdma_resume(struct sys_device *dev) +static void alchemy_dbdma_resume(void) { - struct alchemy_dbdma_sysdev *sdev = - container_of(dev, struct alchemy_dbdma_sysdev, sysdev); int i; - u32 addr; + void __iomem *addr; - addr = DDMA_GLOBAL_BASE; - au_writel(sdev->pm_regs[0][0], addr + 0x00); - au_writel(sdev->pm_regs[0][1], addr + 0x04); - au_writel(sdev->pm_regs[0][2], addr + 0x08); - au_writel(sdev->pm_regs[0][3], addr + 0x0c); + addr = (void __iomem *)KSEG1ADDR(AU1550_DBDMA_CONF_PHYS_ADDR); + __raw_writel(alchemy_dbdma_pm_data[0][0], addr + 0x00); + __raw_writel(alchemy_dbdma_pm_data[0][1], addr + 0x04); + __raw_writel(alchemy_dbdma_pm_data[0][2], addr + 0x08); + __raw_writel(alchemy_dbdma_pm_data[0][3], addr + 0x0c); /* restore channel configurations */ - for (i = 1, addr = DDMA_CHANNEL_BASE; i <= NUM_DBDMA_CHANS; i++) { - au_writel(sdev->pm_regs[i][0], addr + 0x00); - au_writel(sdev->pm_regs[i][1], addr + 0x04); - au_writel(sdev->pm_regs[i][2], addr + 0x08); - au_writel(sdev->pm_regs[i][3], addr + 0x0c); - au_writel(sdev->pm_regs[i][4], addr + 0x10); - au_writel(sdev->pm_regs[i][5], addr + 0x14); - au_sync(); + addr = (void __iomem *)KSEG1ADDR(AU1550_DBDMA_PHYS_ADDR); + for (i = 1; i <= NUM_DBDMA_CHANS; i++) { + __raw_writel(alchemy_dbdma_pm_data[i][0], addr + 0x00); + __raw_writel(alchemy_dbdma_pm_data[i][1], addr + 0x04); + __raw_writel(alchemy_dbdma_pm_data[i][2], addr + 0x08); + __raw_writel(alchemy_dbdma_pm_data[i][3], addr + 0x0c); + __raw_writel(alchemy_dbdma_pm_data[i][4], addr + 0x10); + __raw_writel(alchemy_dbdma_pm_data[i][5], addr + 0x14); + wmb(); addr += 0x100; /* next channel base */ } - - return 0; } -static struct sysdev_class alchemy_dbdma_sysdev_class = { - .name = "dbdma", +static struct syscore_ops alchemy_dbdma_syscore_ops = { .suspend = alchemy_dbdma_suspend, .resume = alchemy_dbdma_resume, }; -static int __init alchemy_dbdma_sysdev_init(void) -{ - struct alchemy_dbdma_sysdev *sdev; - int ret; - - ret = sysdev_class_register(&alchemy_dbdma_sysdev_class); - if (ret) - return ret; - - sdev = kzalloc(sizeof(struct alchemy_dbdma_sysdev), GFP_KERNEL); - if (!sdev) - return -ENOMEM; - - sdev->sysdev.id = -1; - sdev->sysdev.cls = &alchemy_dbdma_sysdev_class; - ret = sysdev_register(&sdev->sysdev); - if (ret) - kfree(sdev); - - return ret; -} - static int __init au1xxx_dbdma_init(void) { int irq_nr, ret; @@ -1084,11 +1055,7 @@ static int __init au1xxx_dbdma_init(void) else { dbdma_initialized = 1; printk(KERN_INFO "Alchemy DBDMA initialized\n"); - ret = alchemy_dbdma_sysdev_init(); - if (ret) { - printk(KERN_ERR "DBDMA PM init failed\n"); - ret = 0; - } + register_syscore_ops(&alchemy_dbdma_syscore_ops); } return ret; diff --git a/arch/mips/include/asm/mach-au1x00/au1000.h b/arch/mips/include/asm/mach-au1x00/au1000.h index 66cfcdc..eb8f103 100644 --- a/arch/mips/include/asm/mach-au1x00/au1000.h +++ b/arch/mips/include/asm/mach-au1x00/au1000.h @@ -635,6 +635,8 @@ enum soc_au1200_ints { #define AU1000_IC0_PHYS_ADDR 0x10400000 /* 01234 */ #define AU1000_IC1_PHYS_ADDR 0x11800000 /* 01234 */ +#define AU1550_DBDMA_PHYS_ADDR 0x14002000 /* 34 */ +#define AU1550_DBDMA_CONF_PHYS_ADDR 0x14003000 /* 34 */ #ifdef CONFIG_SOC_AU1000 @@ -761,7 +763,6 @@ enum soc_au1200_ints { #define UART3_PHYS_ADDR 0x11400000 #define GPIO2_PHYS_ADDR 0x11700000 #define SYS_PHYS_ADDR 0x11900000 -#define DDMA_PHYS_ADDR 0x14002000 #define PE_PHYS_ADDR 0x14008000 #define PSC0_PHYS_ADDR 0x11A00000 #define PSC1_PHYS_ADDR 0x11B00000 @@ -789,7 +790,6 @@ enum soc_au1200_ints { #define UART1_PHYS_ADDR 0x11200000 #define GPIO2_PHYS_ADDR 0x11700000 #define SYS_PHYS_ADDR 0x11900000 -#define DDMA_PHYS_ADDR 0x14002000 #define PSC0_PHYS_ADDR 0x11A00000 #define PSC1_PHYS_ADDR 0x11B00000 #define SD0_PHYS_ADDR 0x10600000 diff --git a/arch/mips/include/asm/mach-au1x00/au1xxx_dbdma.h b/arch/mips/include/asm/mach-au1x00/au1xxx_dbdma.h index c8a553a3..2fdacfe 100644 --- a/arch/mips/include/asm/mach-au1x00/au1xxx_dbdma.h +++ b/arch/mips/include/asm/mach-au1x00/au1xxx_dbdma.h @@ -37,14 +37,6 @@ #ifndef _LANGUAGE_ASSEMBLY -/* - * The DMA base addresses. - * The channels are every 256 bytes (0x0100) from the channel 0 base. - * Interrupt status/enable is bits 15:0 for channels 15 to zero. - */ -#define DDMA_GLOBAL_BASE 0xb4003000 -#define DDMA_CHANNEL_BASE 0xb4002000 - typedef volatile struct dbdma_global { u32 ddma_config; u32 ddma_intstat; -- cgit v1.1 From 80130204b43ce9c3b50924e4c2d44e9f2881f8c3 Mon Sep 17 00:00:00 2001 From: Manuel Lauss Date: Sun, 8 May 2011 10:42:17 +0200 Subject: MIPS: Alchemy: Rewrite UART setup and constants. Detect CPU type at runtime and setup uarts accordingly; also clean up the uart base address mess in the process as far as possible. Signed-off-by: Manuel Lauss To: Linux-MIPS Cc: Florian Fainelli Cc: Wolfgang Grandegger Patchwork: https://patchwork.linux-mips.org/patch/2352/ Signed-off-by: Ralf Baechle membase + UART_MOD_CNTRL) & 3) != 3) { - /* power-on sequence as suggested in the databooks */ - __raw_writel(0, port->membase + UART_MOD_CNTRL); - wmb(); - __raw_writel(1, port->membase + UART_MOD_CNTRL); - wmb(); - } - __raw_writel(3, port->membase + UART_MOD_CNTRL); /* full on */ - wmb(); + alchemy_uart_enable(CPHYSADDR(port->membase)); serial8250_do_pm(port, state, old_state); break; case 3: /* power off */ serial8250_do_pm(port, state, old_state); - __raw_writel(0, port->membase + UART_MOD_CNTRL); - wmb(); + alchemy_uart_disable(CPHYSADDR(port->membase)); break; default: serial8250_do_pm(port, state, old_state); @@ -65,38 +56,60 @@ static void alchemy_8250_pm(struct uart_port *port, unsigned int state, .pm = alchemy_8250_pm, \ } -static struct plat_serial8250_port au1x00_uart_data[] = { -#if defined(CONFIG_SOC_AU1000) - PORT(UART0_PHYS_ADDR, AU1000_UART0_INT), - PORT(UART1_PHYS_ADDR, AU1000_UART1_INT), - PORT(UART2_PHYS_ADDR, AU1000_UART2_INT), - PORT(UART3_PHYS_ADDR, AU1000_UART3_INT), -#elif defined(CONFIG_SOC_AU1500) - PORT(UART0_PHYS_ADDR, AU1500_UART0_INT), - PORT(UART3_PHYS_ADDR, AU1500_UART3_INT), -#elif defined(CONFIG_SOC_AU1100) - PORT(UART0_PHYS_ADDR, AU1100_UART0_INT), - PORT(UART1_PHYS_ADDR, AU1100_UART1_INT), - PORT(UART3_PHYS_ADDR, AU1100_UART3_INT), -#elif defined(CONFIG_SOC_AU1550) - PORT(UART0_PHYS_ADDR, AU1550_UART0_INT), - PORT(UART1_PHYS_ADDR, AU1550_UART1_INT), - PORT(UART3_PHYS_ADDR, AU1550_UART3_INT), -#elif defined(CONFIG_SOC_AU1200) - PORT(UART0_PHYS_ADDR, AU1200_UART0_INT), - PORT(UART1_PHYS_ADDR, AU1200_UART1_INT), -#endif - { }, +static struct plat_serial8250_port au1x00_uart_data[][4] __initdata = { + [ALCHEMY_CPU_AU1000] = { + PORT(AU1000_UART0_PHYS_ADDR, AU1000_UART0_INT), + PORT(AU1000_UART1_PHYS_ADDR, AU1000_UART1_INT), + PORT(AU1000_UART2_PHYS_ADDR, AU1000_UART2_INT), + PORT(AU1000_UART3_PHYS_ADDR, AU1000_UART3_INT), + }, + [ALCHEMY_CPU_AU1500] = { + PORT(AU1000_UART0_PHYS_ADDR, AU1500_UART0_INT), + PORT(AU1000_UART3_PHYS_ADDR, AU1500_UART3_INT), + }, + [ALCHEMY_CPU_AU1100] = { + PORT(AU1000_UART0_PHYS_ADDR, AU1100_UART0_INT), + PORT(AU1000_UART1_PHYS_ADDR, AU1100_UART1_INT), + PORT(AU1000_UART3_PHYS_ADDR, AU1100_UART3_INT), + }, + [ALCHEMY_CPU_AU1550] = { + PORT(AU1000_UART0_PHYS_ADDR, AU1550_UART0_INT), + PORT(AU1000_UART1_PHYS_ADDR, AU1550_UART1_INT), + PORT(AU1000_UART3_PHYS_ADDR, AU1550_UART3_INT), + }, + [ALCHEMY_CPU_AU1200] = { + PORT(AU1000_UART0_PHYS_ADDR, AU1200_UART0_INT), + PORT(AU1000_UART1_PHYS_ADDR, AU1200_UART1_INT), + }, }; static struct platform_device au1xx0_uart_device = { .name = "serial8250", .id = PLAT8250_DEV_AU1X00, - .dev = { - .platform_data = au1x00_uart_data, - }, }; +static void __init alchemy_setup_uarts(int ctype) +{ + unsigned int uartclk = get_au1x00_uart_baud_base() * 16; + int s = sizeof(struct plat_serial8250_port); + int c = alchemy_get_uarts(ctype); + struct plat_serial8250_port *ports; + + ports = kzalloc(s * (c + 1), GFP_KERNEL); + if (!ports) { + printk(KERN_INFO "Alchemy: no memory for UART data\n"); + return; + } + memcpy(ports, au1x00_uart_data[ctype], s * c); + au1xx0_uart_device.dev.platform_data = ports; + + /* Fill up uartclk. */ + for (s = 0; s < c; s++) + ports[s].uartclk = uartclk; + if (platform_device_register(&au1xx0_uart_device)) + printk(KERN_INFO "Alchemy: failed to register UARTs\n"); +} + /* OHCI (USB full speed host controller) */ static struct resource au1xxx_usb_ohci_resources[] = { [0] = { @@ -442,7 +455,6 @@ void __init au1xxx_override_eth_cfg(unsigned int port, } static struct platform_device *au1xxx_platform_devices[] __initdata = { - &au1xx0_uart_device, &au1xxx_usb_ohci_device, #ifdef CONFIG_FB_AU1100 &au1100_lcd_device, @@ -465,13 +477,10 @@ static struct platform_device *au1xxx_platform_devices[] __initdata = { static int __init au1xxx_platform_init(void) { - unsigned int uartclk = get_au1x00_uart_baud_base() * 16; - int err, i; + int err, i, ctype = alchemy_get_cputype(); unsigned char ethaddr[6]; - /* Fill up uartclk. */ - for (i = 0; au1x00_uart_data[i].flags; i++) - au1x00_uart_data[i].uartclk = uartclk; + alchemy_setup_uarts(ctype); /* use firmware-provided mac addr if available and necessary */ i = prom_get_ethernet_addr(ethaddr); diff --git a/arch/mips/alchemy/devboards/prom.c b/arch/mips/alchemy/devboards/prom.c index baeb213..e5306b5 100644 --- a/arch/mips/alchemy/devboards/prom.c +++ b/arch/mips/alchemy/devboards/prom.c @@ -62,5 +62,5 @@ void __init prom_init(void) void prom_putchar(unsigned char c) { - alchemy_uart_putchar(UART0_PHYS_ADDR, c); + alchemy_uart_putchar(AU1000_UART0_PHYS_ADDR, c); } diff --git a/arch/mips/alchemy/gpr/board_setup.c b/arch/mips/alchemy/gpr/board_setup.c index ad2e3f1..5f8f069 100644 --- a/arch/mips/alchemy/gpr/board_setup.c +++ b/arch/mips/alchemy/gpr/board_setup.c @@ -36,9 +36,6 @@ #include -#define UART1_ADDR KSEG1ADDR(UART1_PHYS_ADDR) -#define UART3_ADDR KSEG1ADDR(UART3_PHYS_ADDR) - char irq_tab_alchemy[][5] __initdata = { [0] = { -1, AU1500_PCI_INTA, AU1500_PCI_INTB, 0xff, 0xff }, }; @@ -67,18 +64,15 @@ static void gpr_power_off(void) void __init board_setup(void) { - printk(KERN_INFO "Tarpeze ITS GPR board\n"); + printk(KERN_INFO "Trapeze ITS GPR board\n"); pm_power_off = gpr_power_off; _machine_halt = gpr_power_off; _machine_restart = gpr_reset; - /* Enable UART3 */ - au_writel(0x1, UART3_ADDR + UART_MOD_CNTRL);/* clock enable (CE) */ - au_writel(0x3, UART3_ADDR + UART_MOD_CNTRL); /* CE and "enable" */ - /* Enable UART1 */ - au_writel(0x1, UART1_ADDR + UART_MOD_CNTRL); /* clock enable (CE) */ - au_writel(0x3, UART1_ADDR + UART_MOD_CNTRL); /* CE and "enable" */ + /* Enable UART1/3 */ + alchemy_uart_enable(AU1000_UART3_PHYS_ADDR); + alchemy_uart_enable(AU1000_UART1_PHYS_ADDR); /* Take away Reset of UMTS-card */ alchemy_gpio_direction_output(215, 1); diff --git a/arch/mips/alchemy/gpr/init.c b/arch/mips/alchemy/gpr/init.c index f044f4c54..229aafa 100644 --- a/arch/mips/alchemy/gpr/init.c +++ b/arch/mips/alchemy/gpr/init.c @@ -59,5 +59,5 @@ void __init prom_init(void) void prom_putchar(unsigned char c) { - alchemy_uart_putchar(UART0_PHYS_ADDR, c); + alchemy_uart_putchar(AU1000_UART0_PHYS_ADDR, c); } diff --git a/arch/mips/alchemy/mtx-1/init.c b/arch/mips/alchemy/mtx-1/init.c index f8d2557..2e81cc7 100644 --- a/arch/mips/alchemy/mtx-1/init.c +++ b/arch/mips/alchemy/mtx-1/init.c @@ -62,5 +62,5 @@ void __init prom_init(void) void prom_putchar(unsigned char c) { - alchemy_uart_putchar(UART0_PHYS_ADDR, c); + alchemy_uart_putchar(AU1000_UART0_PHYS_ADDR, c); } diff --git a/arch/mips/alchemy/xxs1500/board_setup.c b/arch/mips/alchemy/xxs1500/board_setup.c index febfb0fb..81e57fa 100644 --- a/arch/mips/alchemy/xxs1500/board_setup.c +++ b/arch/mips/alchemy/xxs1500/board_setup.c @@ -66,13 +66,10 @@ void __init board_setup(void) au_writel(pin_func, SYS_PINFUNC); /* Enable UART */ - au_writel(0x01, UART3_ADDR + UART_MOD_CNTRL); /* clock enable (CE) */ - mdelay(10); - au_writel(0x03, UART3_ADDR + UART_MOD_CNTRL); /* CE and "enable" */ - mdelay(10); - - /* Enable DTR = USB power up */ - au_writel(0x01, UART3_ADDR + UART_MCR); /* UART_MCR_DTR is 0x01??? */ + alchemy_uart_enable(AU1000_UART3_PHYS_ADDR); + /* Enable DTR (MCR bit 0) = USB power up */ + __raw_writel(1, (void __iomem *)KSEG1ADDR(AU1000_UART3_PHYS_ADDR + 0x18)); + wmb(); #ifdef CONFIG_PCI #if defined(__MIPSEB__) diff --git a/arch/mips/alchemy/xxs1500/init.c b/arch/mips/alchemy/xxs1500/init.c index 34a90a4..0ee02cf 100644 --- a/arch/mips/alchemy/xxs1500/init.c +++ b/arch/mips/alchemy/xxs1500/init.c @@ -59,5 +59,5 @@ void __init prom_init(void) void prom_putchar(unsigned char c) { - alchemy_uart_putchar(UART0_PHYS_ADDR, c); + alchemy_uart_putchar(AU1000_UART0_PHYS_ADDR, c); } diff --git a/arch/mips/boot/compressed/uart-alchemy.c b/arch/mips/boot/compressed/uart-alchemy.c index 1bff22f..eb063e6 100644 --- a/arch/mips/boot/compressed/uart-alchemy.c +++ b/arch/mips/boot/compressed/uart-alchemy.c @@ -3,5 +3,5 @@ void putc(char c) { /* all current (Jan. 2010) in-kernel boards */ - alchemy_uart_putchar(UART0_PHYS_ADDR, c); + alchemy_uart_putchar(AU1000_UART0_PHYS_ADDR, c); } diff --git a/arch/mips/include/asm/mach-au1x00/au1000.h b/arch/mips/include/asm/mach-au1x00/au1000.h index eb8f103..c4ffb20 100644 --- a/arch/mips/include/asm/mach-au1x00/au1000.h +++ b/arch/mips/include/asm/mach-au1x00/au1000.h @@ -161,6 +161,45 @@ static inline int alchemy_get_cputype(void) return ALCHEMY_CPU_UNKNOWN; } +/* return number of uarts on a given cputype */ +static inline int alchemy_get_uarts(int type) +{ + switch (type) { + case ALCHEMY_CPU_AU1000: + return 4; + case ALCHEMY_CPU_AU1500: + case ALCHEMY_CPU_AU1200: + return 2; + case ALCHEMY_CPU_AU1100: + case ALCHEMY_CPU_AU1550: + return 3; + } + return 0; +} + +/* enable an UART block if it isn't already */ +static inline void alchemy_uart_enable(u32 uart_phys) +{ + void __iomem *addr = (void __iomem *)KSEG1ADDR(uart_phys); + + /* reset, enable clock, deassert reset */ + if ((__raw_readl(addr + 0x100) & 3) != 3) { + __raw_writel(0, addr + 0x100); + wmb(); + __raw_writel(1, addr + 0x100); + wmb(); + } + __raw_writel(3, addr + 0x100); + wmb(); +} + +static inline void alchemy_uart_disable(u32 uart_phys) +{ + void __iomem *addr = (void __iomem *)KSEG1ADDR(uart_phys); + __raw_writel(0, addr + 0x100); /* UART_MOD_CNTRL */ + wmb(); +} + static inline void alchemy_uart_putchar(u32 uart_phys, u8 c) { void __iomem *base = (void __iomem *)KSEG1ADDR(uart_phys); @@ -634,6 +673,10 @@ enum soc_au1200_ints { */ #define AU1000_IC0_PHYS_ADDR 0x10400000 /* 01234 */ +#define AU1000_UART0_PHYS_ADDR 0x11100000 /* 01234 */ +#define AU1000_UART1_PHYS_ADDR 0x11200000 /* 0234 */ +#define AU1000_UART2_PHYS_ADDR 0x11300000 /* 0 */ +#define AU1000_UART3_PHYS_ADDR 0x11400000 /* 0123 */ #define AU1000_IC1_PHYS_ADDR 0x11800000 /* 01234 */ #define AU1550_DBDMA_PHYS_ADDR 0x14002000 /* 34 */ #define AU1550_DBDMA_CONF_PHYS_ADDR 0x14003000 /* 34 */ @@ -660,10 +703,6 @@ enum soc_au1200_ints { #define MACDMA0_PHYS_ADDR 0x14004000 #define MACDMA1_PHYS_ADDR 0x14004200 #define I2S_PHYS_ADDR 0x11000000 -#define UART0_PHYS_ADDR 0x11100000 -#define UART1_PHYS_ADDR 0x11200000 -#define UART2_PHYS_ADDR 0x11300000 -#define UART3_PHYS_ADDR 0x11400000 #define SSI0_PHYS_ADDR 0x11600000 #define SSI1_PHYS_ADDR 0x11680000 #define SYS_PHYS_ADDR 0x11900000 @@ -695,8 +734,6 @@ enum soc_au1200_ints { #define MACDMA0_PHYS_ADDR 0x14004000 #define MACDMA1_PHYS_ADDR 0x14004200 #define I2S_PHYS_ADDR 0x11000000 -#define UART0_PHYS_ADDR 0x11100000 -#define UART3_PHYS_ADDR 0x11400000 #define GPIO2_PHYS_ADDR 0x11700000 #define SYS_PHYS_ADDR 0x11900000 #define PCI_MEM_PHYS_ADDR 0x400000000ULL @@ -732,9 +769,6 @@ enum soc_au1200_ints { #define MACDMA0_PHYS_ADDR 0x14004000 #define MACDMA1_PHYS_ADDR 0x14004200 #define I2S_PHYS_ADDR 0x11000000 -#define UART0_PHYS_ADDR 0x11100000 -#define UART1_PHYS_ADDR 0x11200000 -#define UART3_PHYS_ADDR 0x11400000 #define SSI0_PHYS_ADDR 0x11600000 #define SSI1_PHYS_ADDR 0x11680000 #define GPIO2_PHYS_ADDR 0x11700000 @@ -758,9 +792,6 @@ enum soc_au1200_ints { #define MACEN_PHYS_ADDR 0x10520000 #define MACDMA0_PHYS_ADDR 0x14004000 #define MACDMA1_PHYS_ADDR 0x14004200 -#define UART0_PHYS_ADDR 0x11100000 -#define UART1_PHYS_ADDR 0x11200000 -#define UART3_PHYS_ADDR 0x11400000 #define GPIO2_PHYS_ADDR 0x11700000 #define SYS_PHYS_ADDR 0x11900000 #define PE_PHYS_ADDR 0x14008000 @@ -786,8 +817,6 @@ enum soc_au1200_ints { #define CIM_PHYS_ADDR 0x14004000 #define USBM_PHYS_ADDR 0x14020000 #define USBH_PHYS_ADDR 0x14020100 -#define UART0_PHYS_ADDR 0x11100000 -#define UART1_PHYS_ADDR 0x11200000 #define GPIO2_PHYS_ADDR 0x11700000 #define SYS_PHYS_ADDR 0x11900000 #define PSC0_PHYS_ADDR 0x11A00000 -- cgit v1.1 From 40d8bc281711d188f35f035f28d94b111b735484 Mon Sep 17 00:00:00 2001 From: Manuel Lauss Date: Sun, 8 May 2011 10:42:18 +0200 Subject: MIPS: Alchemy: Rewrite ethernet platform setup Rewrite ethernet setup to use runtime cpu detection, and also clean up the ethernet base address mess as far as possible. Signed-off-by: Manuel Lauss To: Linux-MIPS Cc: Florian Fainelli Cc: Wolfgang Grandegger Patchwork: https://patchwork.linux-mips.org/patch/2353/ Signed-off-by: Ralf Baechle #include +#include #include #include -#include +#include #include #include @@ -372,15 +373,16 @@ static struct platform_device pbdb_smbus_device = { #endif /* Macro to help defining the Ethernet MAC resources */ +#define MAC_RES_COUNT 3 /* MAC regs base, MAC enable reg, MAC INT */ #define MAC_RES(_base, _enable, _irq) \ { \ - .start = CPHYSADDR(_base), \ - .end = CPHYSADDR(_base + 0xffff), \ + .start = _base, \ + .end = _base + 0xffff, \ .flags = IORESOURCE_MEM, \ }, \ { \ - .start = CPHYSADDR(_enable), \ - .end = CPHYSADDR(_enable + 0x3), \ + .start = _enable, \ + .end = _enable + 0x3, \ .flags = IORESOURCE_MEM, \ }, \ { \ @@ -389,19 +391,29 @@ static struct platform_device pbdb_smbus_device = { .flags = IORESOURCE_IRQ \ } -static struct resource au1xxx_eth0_resources[] = { -#if defined(CONFIG_SOC_AU1000) - MAC_RES(AU1000_ETH0_BASE, AU1000_MAC0_ENABLE, AU1000_MAC0_DMA_INT), -#elif defined(CONFIG_SOC_AU1100) - MAC_RES(AU1100_ETH0_BASE, AU1100_MAC0_ENABLE, AU1100_MAC0_DMA_INT), -#elif defined(CONFIG_SOC_AU1550) - MAC_RES(AU1550_ETH0_BASE, AU1550_MAC0_ENABLE, AU1550_MAC0_DMA_INT), -#elif defined(CONFIG_SOC_AU1500) - MAC_RES(AU1500_ETH0_BASE, AU1500_MAC0_ENABLE, AU1500_MAC0_DMA_INT), -#endif +static struct resource au1xxx_eth0_resources[][MAC_RES_COUNT] __initdata = { + [ALCHEMY_CPU_AU1000] = { + MAC_RES(AU1000_MAC0_PHYS_ADDR, + AU1000_MACEN_PHYS_ADDR, + AU1000_MAC0_DMA_INT) + }, + [ALCHEMY_CPU_AU1500] = { + MAC_RES(AU1500_MAC0_PHYS_ADDR, + AU1500_MACEN_PHYS_ADDR, + AU1500_MAC0_DMA_INT) + }, + [ALCHEMY_CPU_AU1100] = { + MAC_RES(AU1000_MAC0_PHYS_ADDR, + AU1000_MACEN_PHYS_ADDR, + AU1100_MAC0_DMA_INT) + }, + [ALCHEMY_CPU_AU1550] = { + MAC_RES(AU1000_MAC0_PHYS_ADDR, + AU1000_MACEN_PHYS_ADDR, + AU1550_MAC0_DMA_INT) + }, }; - static struct au1000_eth_platform_data au1xxx_eth0_platform_data = { .phy1_search_mac0 = 1, }; @@ -409,20 +421,26 @@ static struct au1000_eth_platform_data au1xxx_eth0_platform_data = { static struct platform_device au1xxx_eth0_device = { .name = "au1000-eth", .id = 0, - .num_resources = ARRAY_SIZE(au1xxx_eth0_resources), - .resource = au1xxx_eth0_resources, + .num_resources = MAC_RES_COUNT, .dev.platform_data = &au1xxx_eth0_platform_data, }; -#ifndef CONFIG_SOC_AU1100 -static struct resource au1xxx_eth1_resources[] = { -#if defined(CONFIG_SOC_AU1000) - MAC_RES(AU1000_ETH1_BASE, AU1000_MAC1_ENABLE, AU1000_MAC1_DMA_INT), -#elif defined(CONFIG_SOC_AU1550) - MAC_RES(AU1550_ETH1_BASE, AU1550_MAC1_ENABLE, AU1550_MAC1_DMA_INT), -#elif defined(CONFIG_SOC_AU1500) - MAC_RES(AU1500_ETH1_BASE, AU1500_MAC1_ENABLE, AU1500_MAC1_DMA_INT), -#endif +static struct resource au1xxx_eth1_resources[][MAC_RES_COUNT] __initdata = { + [ALCHEMY_CPU_AU1000] = { + MAC_RES(AU1000_MAC1_PHYS_ADDR, + AU1000_MACEN_PHYS_ADDR + 4, + AU1000_MAC1_DMA_INT) + }, + [ALCHEMY_CPU_AU1500] = { + MAC_RES(AU1500_MAC1_PHYS_ADDR, + AU1500_MACEN_PHYS_ADDR + 4, + AU1500_MAC1_DMA_INT) + }, + [ALCHEMY_CPU_AU1550] = { + MAC_RES(AU1000_MAC1_PHYS_ADDR, + AU1000_MACEN_PHYS_ADDR + 4, + AU1550_MAC1_DMA_INT) + }, }; static struct au1000_eth_platform_data au1xxx_eth1_platform_data = { @@ -432,11 +450,9 @@ static struct au1000_eth_platform_data au1xxx_eth1_platform_data = { static struct platform_device au1xxx_eth1_device = { .name = "au1000-eth", .id = 1, - .num_resources = ARRAY_SIZE(au1xxx_eth1_resources), - .resource = au1xxx_eth1_resources, + .num_resources = MAC_RES_COUNT, .dev.platform_data = &au1xxx_eth1_platform_data, }; -#endif void __init au1xxx_override_eth_cfg(unsigned int port, struct au1000_eth_platform_data *eth_data) @@ -447,11 +463,62 @@ void __init au1xxx_override_eth_cfg(unsigned int port, if (port == 0) memcpy(&au1xxx_eth0_platform_data, eth_data, sizeof(struct au1000_eth_platform_data)); -#ifndef CONFIG_SOC_AU1100 else memcpy(&au1xxx_eth1_platform_data, eth_data, sizeof(struct au1000_eth_platform_data)); -#endif +} + +static void __init alchemy_setup_macs(int ctype) +{ + int ret, i; + unsigned char ethaddr[6]; + struct resource *macres; + + /* Handle 1st MAC */ + if (alchemy_get_macs(ctype) < 1) + return; + + macres = kmalloc(sizeof(struct resource) * MAC_RES_COUNT, GFP_KERNEL); + if (!macres) { + printk(KERN_INFO "Alchemy: no memory for MAC0 resources\n"); + return; + } + memcpy(macres, au1xxx_eth0_resources[ctype], + sizeof(struct resource) * MAC_RES_COUNT); + au1xxx_eth0_device.resource = macres; + + i = prom_get_ethernet_addr(ethaddr); + if (!i && !is_valid_ether_addr(au1xxx_eth0_platform_data.mac)) + memcpy(au1xxx_eth0_platform_data.mac, ethaddr, 6); + + ret = platform_device_register(&au1xxx_eth0_device); + if (!ret) + printk(KERN_INFO "Alchemy: failed to register MAC0\n"); + + + /* Handle 2nd MAC */ + if (alchemy_get_macs(ctype) < 2) + return; + + macres = kmalloc(sizeof(struct resource) * MAC_RES_COUNT, GFP_KERNEL); + if (!macres) { + printk(KERN_INFO "Alchemy: no memory for MAC1 resources\n"); + return; + } + memcpy(macres, au1xxx_eth1_resources[ctype], + sizeof(struct resource) * MAC_RES_COUNT); + au1xxx_eth1_device.resource = macres; + + ethaddr[5] += 1; /* next addr for 2nd MAC */ + if (!i && !is_valid_ether_addr(au1xxx_eth1_platform_data.mac)) + memcpy(au1xxx_eth1_platform_data.mac, ethaddr, 6); + + /* Register second MAC if enabled in pinfunc */ + if (!(au_readl(SYS_PINFUNC) & (u32)SYS_PF_NI2)) { + ret = platform_device_register(&au1xxx_eth1_device); + if (ret) + printk(KERN_INFO "Alchemy: failed to register MAC1\n"); + } } static struct platform_device *au1xxx_platform_devices[] __initdata = { @@ -472,33 +539,17 @@ static struct platform_device *au1xxx_platform_devices[] __initdata = { #ifdef SMBUS_PSC_BASE &pbdb_smbus_device, #endif - &au1xxx_eth0_device, }; static int __init au1xxx_platform_init(void) { - int err, i, ctype = alchemy_get_cputype(); - unsigned char ethaddr[6]; + int err, ctype = alchemy_get_cputype(); alchemy_setup_uarts(ctype); - - /* use firmware-provided mac addr if available and necessary */ - i = prom_get_ethernet_addr(ethaddr); - if (!i && !is_valid_ether_addr(au1xxx_eth0_platform_data.mac)) - memcpy(au1xxx_eth0_platform_data.mac, ethaddr, 6); + alchemy_setup_macs(ctype); err = platform_add_devices(au1xxx_platform_devices, ARRAY_SIZE(au1xxx_platform_devices)); -#ifndef CONFIG_SOC_AU1100 - ethaddr[5] += 1; /* next addr for 2nd MAC */ - if (!i && !is_valid_ether_addr(au1xxx_eth1_platform_data.mac)) - memcpy(au1xxx_eth1_platform_data.mac, ethaddr, 6); - - /* Register second MAC if enabled in pinfunc */ - if (!err && !(au_readl(SYS_PINFUNC) & (u32)SYS_PF_NI2)) - err = platform_device_register(&au1xxx_eth1_device); -#endif - return err; } diff --git a/arch/mips/include/asm/mach-au1x00/au1000.h b/arch/mips/include/asm/mach-au1x00/au1000.h index c4ffb20..415d287 100644 --- a/arch/mips/include/asm/mach-au1x00/au1000.h +++ b/arch/mips/include/asm/mach-au1x00/au1000.h @@ -219,6 +219,20 @@ static inline void alchemy_uart_putchar(u32 uart_phys, u8 c) wmb(); } +/* return number of ethernet MACs on a given cputype */ +static inline int alchemy_get_macs(int type) +{ + switch (type) { + case ALCHEMY_CPU_AU1000: + case ALCHEMY_CPU_AU1500: + case ALCHEMY_CPU_AU1550: + return 2; + case ALCHEMY_CPU_AU1100: + return 1; + } + return 0; +} + /* arch/mips/au1000/common/clocks.c */ extern void set_au1x00_speed(unsigned int new_freq); extern unsigned int get_au1x00_speed(void); @@ -673,6 +687,12 @@ enum soc_au1200_ints { */ #define AU1000_IC0_PHYS_ADDR 0x10400000 /* 01234 */ +#define AU1000_MAC0_PHYS_ADDR 0x10500000 /* 023 */ +#define AU1000_MAC1_PHYS_ADDR 0x10510000 /* 023 */ +#define AU1000_MACEN_PHYS_ADDR 0x10520000 /* 023 */ +#define AU1500_MAC0_PHYS_ADDR 0x11500000 /* 1 */ +#define AU1500_MAC1_PHYS_ADDR 0x11510000 /* 1 */ +#define AU1500_MACEN_PHYS_ADDR 0x11520000 /* 1 */ #define AU1000_UART0_PHYS_ADDR 0x11100000 /* 01234 */ #define AU1000_UART1_PHYS_ADDR 0x11200000 /* 0234 */ #define AU1000_UART2_PHYS_ADDR 0x11300000 /* 0 */ @@ -680,6 +700,8 @@ enum soc_au1200_ints { #define AU1000_IC1_PHYS_ADDR 0x11800000 /* 01234 */ #define AU1550_DBDMA_PHYS_ADDR 0x14002000 /* 34 */ #define AU1550_DBDMA_CONF_PHYS_ADDR 0x14003000 /* 34 */ +#define AU1000_MACDMA0_PHYS_ADDR 0x14004000 /* 0123 */ +#define AU1000_MACDMA1_PHYS_ADDR 0x14004200 /* 0123 */ #ifdef CONFIG_SOC_AU1000 @@ -697,11 +719,6 @@ enum soc_au1200_ints { #define USBH_PHYS_ADDR 0x10100000 #define USBD_PHYS_ADDR 0x10200000 #define IRDA_PHYS_ADDR 0x10300000 -#define MAC0_PHYS_ADDR 0x10500000 -#define MAC1_PHYS_ADDR 0x10510000 -#define MACEN_PHYS_ADDR 0x10520000 -#define MACDMA0_PHYS_ADDR 0x14004000 -#define MACDMA1_PHYS_ADDR 0x14004200 #define I2S_PHYS_ADDR 0x11000000 #define SSI0_PHYS_ADDR 0x11600000 #define SSI1_PHYS_ADDR 0x11680000 @@ -728,11 +745,6 @@ enum soc_au1200_ints { #define USBH_PHYS_ADDR 0x10100000 #define USBD_PHYS_ADDR 0x10200000 #define PCI_PHYS_ADDR 0x14005000 -#define MAC0_PHYS_ADDR 0x11500000 -#define MAC1_PHYS_ADDR 0x11510000 -#define MACEN_PHYS_ADDR 0x11520000 -#define MACDMA0_PHYS_ADDR 0x14004000 -#define MACDMA1_PHYS_ADDR 0x14004200 #define I2S_PHYS_ADDR 0x11000000 #define GPIO2_PHYS_ADDR 0x11700000 #define SYS_PHYS_ADDR 0x11900000 @@ -764,10 +776,6 @@ enum soc_au1200_ints { #define USBH_PHYS_ADDR 0x10100000 #define USBD_PHYS_ADDR 0x10200000 #define IRDA_PHYS_ADDR 0x10300000 -#define MAC0_PHYS_ADDR 0x10500000 -#define MACEN_PHYS_ADDR 0x10520000 -#define MACDMA0_PHYS_ADDR 0x14004000 -#define MACDMA1_PHYS_ADDR 0x14004200 #define I2S_PHYS_ADDR 0x11000000 #define SSI0_PHYS_ADDR 0x11600000 #define SSI1_PHYS_ADDR 0x11680000 @@ -787,11 +795,6 @@ enum soc_au1200_ints { #define USBH_PHYS_ADDR 0x14020000 #define USBD_PHYS_ADDR 0x10200000 #define PCI_PHYS_ADDR 0x14005000 -#define MAC0_PHYS_ADDR 0x10500000 -#define MAC1_PHYS_ADDR 0x10510000 -#define MACEN_PHYS_ADDR 0x10520000 -#define MACDMA0_PHYS_ADDR 0x14004000 -#define MACDMA1_PHYS_ADDR 0x14004200 #define GPIO2_PHYS_ADDR 0x11700000 #define SYS_PHYS_ADDR 0x11900000 #define PE_PHYS_ADDR 0x14008000 @@ -870,12 +873,6 @@ enum soc_au1200_ints { #define USB_OHCI_BASE 0x10100000 /* phys addr for ioremap */ #define USB_HOST_CONFIG 0xB017FFFC #define FOR_PLATFORM_C_USB_HOST_INT AU1000_USB_HOST_INT - -#define AU1000_ETH0_BASE 0xB0500000 -#define AU1000_ETH1_BASE 0xB0510000 -#define AU1000_MAC0_ENABLE 0xB0520000 -#define AU1000_MAC1_ENABLE 0xB0520004 -#define NUM_ETH_INTERFACES 2 #endif /* CONFIG_SOC_AU1000 */ /* Au1500 */ @@ -887,12 +884,6 @@ enum soc_au1200_ints { #define USB_OHCI_BASE 0x10100000 /* phys addr for ioremap */ #define USB_HOST_CONFIG 0xB017fffc #define FOR_PLATFORM_C_USB_HOST_INT AU1500_USB_HOST_INT - -#define AU1500_ETH0_BASE 0xB1500000 -#define AU1500_ETH1_BASE 0xB1510000 -#define AU1500_MAC0_ENABLE 0xB1520000 -#define AU1500_MAC1_ENABLE 0xB1520004 -#define NUM_ETH_INTERFACES 2 #endif /* CONFIG_SOC_AU1500 */ /* Au1100 */ @@ -904,10 +895,6 @@ enum soc_au1200_ints { #define USB_OHCI_BASE 0x10100000 /* phys addr for ioremap */ #define USB_HOST_CONFIG 0xB017FFFC #define FOR_PLATFORM_C_USB_HOST_INT AU1100_USB_HOST_INT - -#define AU1100_ETH0_BASE 0xB0500000 -#define AU1100_MAC0_ENABLE 0xB0520000 -#define NUM_ETH_INTERFACES 1 #endif /* CONFIG_SOC_AU1100 */ #ifdef CONFIG_SOC_AU1550 @@ -917,12 +904,6 @@ enum soc_au1200_ints { #define USB_OHCI_LEN 0x00060000 #define USB_HOST_CONFIG 0xB4027ffc #define FOR_PLATFORM_C_USB_HOST_INT AU1550_USB_HOST_INT - -#define AU1550_ETH0_BASE 0xB0500000 -#define AU1550_ETH1_BASE 0xB0510000 -#define AU1550_MAC0_ENABLE 0xB0520000 -#define AU1550_MAC1_ENABLE 0xB0520004 -#define NUM_ETH_INTERFACES 2 #endif /* CONFIG_SOC_AU1550 */ -- cgit v1.1 From 5d4ddcb4279672e69136e746d6de6f01b501b853 Mon Sep 17 00:00:00 2001 From: Manuel Lauss Date: Sun, 8 May 2011 10:42:19 +0200 Subject: MIPS: Alchemy: Cleanup DMA addresses According to the databooks, the Au1000 DMA engine must be programmed with the physical FIFO addresses. This patch does that; furthermore this opened the possibility to get rid of a lot of now unnecessary address defines. Signed-off-by: Manuel Lauss To: Linux-MIPS Cc: Florian Fainelli Cc: Wolfgang Grandegger Patchwork: https://patchwork.linux-mips.org/patch/2348/ Signed-off-by: Ralf Baechle io = DMA_CHANNEL_BASE + i * DMA_CHANNEL_LEN; + chan->io = KSEG1ADDR(AU1000_DMA_PHYS_ADDR) + i * DMA_CHANNEL_LEN; chan->dev_id = dev_id; chan->dev_str = dev_str; chan->fifo_addr = dev->fifo_addr; diff --git a/arch/mips/alchemy/common/platform.c b/arch/mips/alchemy/common/platform.c index 541fff2..3b2c18b 100644 --- a/arch/mips/alchemy/common/platform.c +++ b/arch/mips/alchemy/common/platform.c @@ -283,8 +283,8 @@ extern struct au1xmmc_platform_data au1xmmc_platdata[2]; static struct resource au1200_mmc0_resources[] = { [0] = { - .start = SD0_PHYS_ADDR, - .end = SD0_PHYS_ADDR + 0x7ffff, + .start = AU1100_SD0_PHYS_ADDR, + .end = AU1100_SD0_PHYS_ADDR + 0xfff, .flags = IORESOURCE_MEM, }, [1] = { @@ -319,8 +319,8 @@ static struct platform_device au1200_mmc0_device = { #ifndef CONFIG_MIPS_DB1200 static struct resource au1200_mmc1_resources[] = { [0] = { - .start = SD1_PHYS_ADDR, - .end = SD1_PHYS_ADDR + 0x7ffff, + .start = AU1100_SD1_PHYS_ADDR, + .end = AU1100_SD1_PHYS_ADDR + 0xfff, .flags = IORESOURCE_MEM, }, [1] = { diff --git a/arch/mips/include/asm/mach-au1x00/au1000.h b/arch/mips/include/asm/mach-au1x00/au1000.h index 415d287..2dfff4f 100644 --- a/arch/mips/include/asm/mach-au1x00/au1000.h +++ b/arch/mips/include/asm/mach-au1x00/au1000.h @@ -686,10 +686,15 @@ enum soc_au1200_ints { * 0..au1000 1..au1500 2..au1100 3..au1550 4..au1200 */ +#define AU1000_AC97_PHYS_ADDR 0x10000000 /* 012 */ +#define AU1000_USBD_PHYS_ADDR 0x10200000 /* 0123 */ #define AU1000_IC0_PHYS_ADDR 0x10400000 /* 01234 */ #define AU1000_MAC0_PHYS_ADDR 0x10500000 /* 023 */ #define AU1000_MAC1_PHYS_ADDR 0x10510000 /* 023 */ #define AU1000_MACEN_PHYS_ADDR 0x10520000 /* 023 */ +#define AU1100_SD0_PHYS_ADDR 0x10600000 /* 24 */ +#define AU1100_SD1_PHYS_ADDR 0x10680000 /* 24 */ +#define AU1000_I2S_PHYS_ADDR 0x11000000 /* 02 */ #define AU1500_MAC0_PHYS_ADDR 0x11500000 /* 1 */ #define AU1500_MAC1_PHYS_ADDR 0x11510000 /* 1 */ #define AU1500_MACEN_PHYS_ADDR 0x11520000 /* 1 */ @@ -698,6 +703,7 @@ enum soc_au1200_ints { #define AU1000_UART2_PHYS_ADDR 0x11300000 /* 0 */ #define AU1000_UART3_PHYS_ADDR 0x11400000 /* 0123 */ #define AU1000_IC1_PHYS_ADDR 0x11800000 /* 01234 */ +#define AU1000_DMA_PHYS_ADDR 0x14002000 /* 012 */ #define AU1550_DBDMA_PHYS_ADDR 0x14002000 /* 34 */ #define AU1550_DBDMA_CONF_PHYS_ADDR 0x14003000 /* 34 */ #define AU1000_MACDMA0_PHYS_ADDR 0x14004000 /* 0123 */ @@ -707,19 +713,8 @@ enum soc_au1200_ints { #ifdef CONFIG_SOC_AU1000 #define MEM_PHYS_ADDR 0x14000000 #define STATIC_MEM_PHYS_ADDR 0x14001000 -#define DMA0_PHYS_ADDR 0x14002000 -#define DMA1_PHYS_ADDR 0x14002100 -#define DMA2_PHYS_ADDR 0x14002200 -#define DMA3_PHYS_ADDR 0x14002300 -#define DMA4_PHYS_ADDR 0x14002400 -#define DMA5_PHYS_ADDR 0x14002500 -#define DMA6_PHYS_ADDR 0x14002600 -#define DMA7_PHYS_ADDR 0x14002700 -#define AC97_PHYS_ADDR 0x10000000 #define USBH_PHYS_ADDR 0x10100000 -#define USBD_PHYS_ADDR 0x10200000 #define IRDA_PHYS_ADDR 0x10300000 -#define I2S_PHYS_ADDR 0x11000000 #define SSI0_PHYS_ADDR 0x11600000 #define SSI1_PHYS_ADDR 0x11680000 #define SYS_PHYS_ADDR 0x11900000 @@ -733,19 +728,8 @@ enum soc_au1200_ints { #ifdef CONFIG_SOC_AU1500 #define MEM_PHYS_ADDR 0x14000000 #define STATIC_MEM_PHYS_ADDR 0x14001000 -#define DMA0_PHYS_ADDR 0x14002000 -#define DMA1_PHYS_ADDR 0x14002100 -#define DMA2_PHYS_ADDR 0x14002200 -#define DMA3_PHYS_ADDR 0x14002300 -#define DMA4_PHYS_ADDR 0x14002400 -#define DMA5_PHYS_ADDR 0x14002500 -#define DMA6_PHYS_ADDR 0x14002600 -#define DMA7_PHYS_ADDR 0x14002700 -#define AC97_PHYS_ADDR 0x10000000 #define USBH_PHYS_ADDR 0x10100000 -#define USBD_PHYS_ADDR 0x10200000 #define PCI_PHYS_ADDR 0x14005000 -#define I2S_PHYS_ADDR 0x11000000 #define GPIO2_PHYS_ADDR 0x11700000 #define SYS_PHYS_ADDR 0x11900000 #define PCI_MEM_PHYS_ADDR 0x400000000ULL @@ -762,21 +746,8 @@ enum soc_au1200_ints { #ifdef CONFIG_SOC_AU1100 #define MEM_PHYS_ADDR 0x14000000 #define STATIC_MEM_PHYS_ADDR 0x14001000 -#define DMA0_PHYS_ADDR 0x14002000 -#define DMA1_PHYS_ADDR 0x14002100 -#define DMA2_PHYS_ADDR 0x14002200 -#define DMA3_PHYS_ADDR 0x14002300 -#define DMA4_PHYS_ADDR 0x14002400 -#define DMA5_PHYS_ADDR 0x14002500 -#define DMA6_PHYS_ADDR 0x14002600 -#define DMA7_PHYS_ADDR 0x14002700 -#define SD0_PHYS_ADDR 0x10600000 -#define SD1_PHYS_ADDR 0x10680000 -#define AC97_PHYS_ADDR 0x10000000 #define USBH_PHYS_ADDR 0x10100000 -#define USBD_PHYS_ADDR 0x10200000 #define IRDA_PHYS_ADDR 0x10300000 -#define I2S_PHYS_ADDR 0x11000000 #define SSI0_PHYS_ADDR 0x11600000 #define SSI1_PHYS_ADDR 0x11680000 #define GPIO2_PHYS_ADDR 0x11700000 @@ -793,7 +764,6 @@ enum soc_au1200_ints { #define MEM_PHYS_ADDR 0x14000000 #define STATIC_MEM_PHYS_ADDR 0x14001000 #define USBH_PHYS_ADDR 0x14020000 -#define USBD_PHYS_ADDR 0x10200000 #define PCI_PHYS_ADDR 0x14005000 #define GPIO2_PHYS_ADDR 0x11700000 #define SYS_PHYS_ADDR 0x11900000 @@ -824,8 +794,6 @@ enum soc_au1200_ints { #define SYS_PHYS_ADDR 0x11900000 #define PSC0_PHYS_ADDR 0x11A00000 #define PSC1_PHYS_ADDR 0x11B00000 -#define SD0_PHYS_ADDR 0x10600000 -#define SD1_PHYS_ADDR 0x10680000 #define LCD_PHYS_ADDR 0x15000000 #define SWCNT_PHYS_ADDR 0x1110010C #define MAEFE_PHYS_ADDR 0x14012000 @@ -867,9 +835,6 @@ enum soc_au1200_ints { /* Au1000 */ #ifdef CONFIG_SOC_AU1000 -#define UART0_ADDR 0xB1100000 -#define UART3_ADDR 0xB1400000 - #define USB_OHCI_BASE 0x10100000 /* phys addr for ioremap */ #define USB_HOST_CONFIG 0xB017FFFC #define FOR_PLATFORM_C_USB_HOST_INT AU1000_USB_HOST_INT @@ -878,9 +843,6 @@ enum soc_au1200_ints { /* Au1500 */ #ifdef CONFIG_SOC_AU1500 -#define UART0_ADDR 0xB1100000 -#define UART3_ADDR 0xB1400000 - #define USB_OHCI_BASE 0x10100000 /* phys addr for ioremap */ #define USB_HOST_CONFIG 0xB017fffc #define FOR_PLATFORM_C_USB_HOST_INT AU1500_USB_HOST_INT @@ -889,16 +851,12 @@ enum soc_au1200_ints { /* Au1100 */ #ifdef CONFIG_SOC_AU1100 -#define UART0_ADDR 0xB1100000 -#define UART3_ADDR 0xB1400000 - #define USB_OHCI_BASE 0x10100000 /* phys addr for ioremap */ #define USB_HOST_CONFIG 0xB017FFFC #define FOR_PLATFORM_C_USB_HOST_INT AU1100_USB_HOST_INT #endif /* CONFIG_SOC_AU1100 */ #ifdef CONFIG_SOC_AU1550 -#define UART0_ADDR 0xB1100000 #define USB_OHCI_BASE 0x14020000 /* phys addr for ioremap */ #define USB_OHCI_LEN 0x00060000 @@ -909,8 +867,6 @@ enum soc_au1200_ints { #ifdef CONFIG_SOC_AU1200 -#define UART0_ADDR 0xB1100000 - #define USB_UOC_BASE 0x14020020 #define USB_UOC_LEN 0x20 #define USB_OHCI_BASE 0x14020100 @@ -1534,12 +1490,6 @@ enum soc_au1200_ints { # define AC97C_RS (1 << 1) # define AC97C_CE (1 << 0) -/* Secure Digital (SD) Controller */ -#define SD0_XMIT_FIFO 0xB0600000 -#define SD0_RECV_FIFO 0xB0600004 -#define SD1_XMIT_FIFO 0xB0680000 -#define SD1_RECV_FIFO 0xB0680004 - #if defined(CONFIG_SOC_AU1500) || defined(CONFIG_SOC_AU1550) /* Au1500 PCI Controller */ #define Au1500_CFG_BASE 0xB4005000 /* virtual, KSEG1 addr */ diff --git a/arch/mips/include/asm/mach-au1x00/au1000_dma.h b/arch/mips/include/asm/mach-au1x00/au1000_dma.h index c333b4e..59f5b55 100644 --- a/arch/mips/include/asm/mach-au1x00/au1000_dma.h +++ b/arch/mips/include/asm/mach-au1x00/au1000_dma.h @@ -37,10 +37,6 @@ #define NUM_AU1000_DMA_CHANNELS 8 -/* DMA Channel Base Addresses */ -#define DMA_CHANNEL_BASE 0xB4002000 -#define DMA_CHANNEL_LEN 0x00000100 - /* DMA Channel Register Offsets */ #define DMA_MODE_SET 0x00000000 #define DMA_MODE_READ DMA_MODE_SET -- cgit v1.1 From b7f720d68c0042cc8ce496e31a61df79a77f1b48 Mon Sep 17 00:00:00 2001 From: Manuel Lauss Date: Sun, 8 May 2011 10:42:20 +0200 Subject: MIPS: Alchemy: Clean up GPIO registers and accessors remove au_readl/au_writel, remove the predefined GPIO1/2 KSEG1 register addresses and fix the fallout in all boards and drivers. This also fixes a bug in the mtx-1_wdt driver which was introduced by commit 6ea8115bb6f359df4f45152f2b40e1d4d1891392 ("Convert mtx1 wdt to be a platform device and use generic GPIO API") before this patch mtx-1_wdt only modified GPIO215, the patch then used the gpio resource information as bit index into the GPIO2 register but the conversion to the GPIO API didn't realize that. With this patch the drivers original behaviour is restored and GPIO15 is left alone. Signed-off-by: Manuel Lauss Cc: Florian Fainelli To: Linux-MIPS Cc: linux-watchdog@vger.kernel.org Cc: Wim Van Sebroeck Patchwork: https://patchwork.linux-mips.org/patch/2381/ Signed-off-by: Ralf Baechle +#include #include #include #include @@ -470,7 +471,7 @@ static int __init au1xxx_nand_init(void) #ifdef CONFIG_MIPS_PB1550 /* set gpio206 high */ - au_writel(au_readl(GPIO2_DIR) & ~(1 << 6), GPIO2_DIR); + gpio_direction_input(206); boot_swapboot = (au_readl(MEM_STSTAT) & (0x7 << 1)) | ((bcsr_read(BCSR_STATUS) >> 6) & 0x1); diff --git a/drivers/watchdog/mtx-1_wdt.c b/drivers/watchdog/mtx-1_wdt.c index 5ec5ac1..1479dc4 100644 --- a/drivers/watchdog/mtx-1_wdt.c +++ b/drivers/watchdog/mtx-1_wdt.c @@ -66,6 +66,7 @@ static struct { int default_ticks; unsigned long inuse; unsigned gpio; + int gstate; } mtx1_wdt_device; static void mtx1_wdt_trigger(unsigned long unused) @@ -75,13 +76,13 @@ static void mtx1_wdt_trigger(unsigned long unused) spin_lock(&mtx1_wdt_device.lock); if (mtx1_wdt_device.running) ticks--; - /* - * toggle GPIO2_15 - */ - tmp = au_readl(GPIO2_DIR); - tmp = (tmp & ~(1 << mtx1_wdt_device.gpio)) | - ((~tmp) & (1 << mtx1_wdt_device.gpio)); - au_writel(tmp, GPIO2_DIR); + + /* toggle wdt gpio */ + mtx1_wdt_device.gstate = ~mtx1_wdt_device.gstate; + if (mtx1_wdt_device.gstate) + gpio_direction_output(mtx1_wdt_device.gpio, 1); + else + gpio_direction_input(mtx1_wdt_device.gpio); if (mtx1_wdt_device.queue && ticks) mod_timer(&mtx1_wdt_device.timer, jiffies + MTX1_WDT_INTERVAL); @@ -103,7 +104,8 @@ static void mtx1_wdt_start(void) spin_lock_irqsave(&mtx1_wdt_device.lock, flags); if (!mtx1_wdt_device.queue) { mtx1_wdt_device.queue = 1; - gpio_set_value(mtx1_wdt_device.gpio, 1); + mtx1_wdt_device.gstate = 1; + gpio_direction_output(mtx1_wdt_device.gpio, 1); mod_timer(&mtx1_wdt_device.timer, jiffies + MTX1_WDT_INTERVAL); } mtx1_wdt_device.running++; @@ -117,7 +119,8 @@ static int mtx1_wdt_stop(void) spin_lock_irqsave(&mtx1_wdt_device.lock, flags); if (mtx1_wdt_device.queue) { mtx1_wdt_device.queue = 0; - gpio_set_value(mtx1_wdt_device.gpio, 0); + mtx1_wdt_device.gstate = 0; + gpio_direction_output(mtx1_wdt_device.gpio, 0); } ticks = mtx1_wdt_device.default_ticks; spin_unlock_irqrestore(&mtx1_wdt_device.lock, flags); -- cgit v1.1 From b3ae52b6b0335eba547221aad2cb3c50902e3d2d Mon Sep 17 00:00:00 2001 From: Hauke Mehrtens Date: Tue, 10 May 2011 23:31:30 +0200 Subject: SSB: Change fallback sprom to callback mechanism. Some embedded devices like the Netgear WNDR3300 have two SSB based cards without an own sprom on the pci bus. We have to provide two different fallback sproms for these and this was not possible with the old solution. In the bcm47xx architecture the sprom data is stored in the nvram in the main flash storage. The architecture code will be able to fill the sprom with the stored data based on the bus where the device was found. The bcm63xx code should do the same thing as before, just using the new API. Acked-by: Michael Buesch Cc: netdev@vger.kernel.org Cc: linux-wireless@vger.kernel.org Cc: Florian Fainelli Signed-off-by: Hauke Mehrtens Cc: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/2362/ Signed-off-by: Ralf Baechle --- arch/mips/bcm63xx/boards/board_bcm963xx.c | 16 ++++++++++-- drivers/ssb/pci.c | 16 ++++++++---- drivers/ssb/sprom.c | 43 ++++++++++++++++++------------- drivers/ssb/ssb_private.h | 3 ++- include/linux/ssb/ssb.h | 4 ++- 5 files changed, 55 insertions(+), 27 deletions(-) diff --git a/arch/mips/bcm63xx/boards/board_bcm963xx.c b/arch/mips/bcm63xx/boards/board_bcm963xx.c index 8dba8cf..40b223b 100644 --- a/arch/mips/bcm63xx/boards/board_bcm963xx.c +++ b/arch/mips/bcm63xx/boards/board_bcm963xx.c @@ -643,6 +643,17 @@ static struct ssb_sprom bcm63xx_sprom = { .boardflags_lo = 0x2848, .boardflags_hi = 0x0000, }; + +int bcm63xx_get_fallback_sprom(struct ssb_bus *bus, struct ssb_sprom *out) +{ + if (bus->bustype == SSB_BUSTYPE_PCI) { + memcpy(out, &bcm63xx_sprom, sizeof(struct ssb_sprom)); + return 0; + } else { + printk(KERN_ERR PFX "unable to fill SPROM for given bustype.\n"); + return -EINVAL; + } +} #endif /* @@ -793,8 +804,9 @@ void __init board_prom_init(void) if (!board_get_mac_address(bcm63xx_sprom.il0mac)) { memcpy(bcm63xx_sprom.et0mac, bcm63xx_sprom.il0mac, ETH_ALEN); memcpy(bcm63xx_sprom.et1mac, bcm63xx_sprom.il0mac, ETH_ALEN); - if (ssb_arch_set_fallback_sprom(&bcm63xx_sprom) < 0) - printk(KERN_ERR "failed to register fallback SPROM\n"); + if (ssb_arch_register_fallback_sprom( + &bcm63xx_get_fallback_sprom) < 0) + printk(KERN_ERR PFX "failed to register fallback SPROM\n"); } #endif } diff --git a/drivers/ssb/pci.c b/drivers/ssb/pci.c index 6f34963..7ad4858 100644 --- a/drivers/ssb/pci.c +++ b/drivers/ssb/pci.c @@ -662,7 +662,6 @@ static int sprom_extract(struct ssb_bus *bus, struct ssb_sprom *out, static int ssb_pci_sprom_get(struct ssb_bus *bus, struct ssb_sprom *sprom) { - const struct ssb_sprom *fallback; int err; u16 *buf; @@ -707,10 +706,17 @@ static int ssb_pci_sprom_get(struct ssb_bus *bus, if (err) { /* All CRC attempts failed. * Maybe there is no SPROM on the device? - * If we have a fallback, use that. */ - fallback = ssb_get_fallback_sprom(); - if (fallback) { - memcpy(sprom, fallback, sizeof(*sprom)); + * Now we ask the arch code if there is some sprom + * available for this device in some other storage */ + err = ssb_fill_sprom_with_fallback(bus, sprom); + if (err) { + ssb_printk(KERN_WARNING PFX "WARNING: Using" + " fallback SPROM failed (err %d)\n", + err); + } else { + ssb_dprintk(KERN_DEBUG PFX "Using SPROM" + " revision %d provided by" + " platform.\n", sprom->revision); err = 0; goto out_free; } diff --git a/drivers/ssb/sprom.c b/drivers/ssb/sprom.c index 5f34d7a..45ff0e3 100644 --- a/drivers/ssb/sprom.c +++ b/drivers/ssb/sprom.c @@ -17,7 +17,7 @@ #include -static const struct ssb_sprom *fallback_sprom; +static int(*get_fallback_sprom)(struct ssb_bus *dev, struct ssb_sprom *out); static int sprom2hex(const u16 *sprom, char *buf, size_t buf_len, @@ -145,36 +145,43 @@ out: } /** - * ssb_arch_set_fallback_sprom - Set a fallback SPROM for use if no SPROM is found. + * ssb_arch_register_fallback_sprom - Registers a method providing a + * fallback SPROM if no SPROM is found. * - * @sprom: The SPROM data structure to register. + * @sprom_callback: The callback function. * - * With this function the architecture implementation may register a fallback - * SPROM data structure. The fallback is only used for PCI based SSB devices, - * where no valid SPROM can be found in the shadow registers. + * With this function the architecture implementation may register a + * callback handler which fills the SPROM data structure. The fallback is + * only used for PCI based SSB devices, where no valid SPROM can be found + * in the shadow registers. * - * This function is useful for weird architectures that have a half-assed SSB device - * hardwired to their PCI bus. + * This function is useful for weird architectures that have a half-assed + * SSB device hardwired to their PCI bus. * - * Note that it does only work with PCI attached SSB devices. PCMCIA devices currently - * don't use this fallback. - * Architectures must provide the SPROM for native SSB devices anyway, - * so the fallback also isn't used for native devices. + * Note that it does only work with PCI attached SSB devices. PCMCIA + * devices currently don't use this fallback. + * Architectures must provide the SPROM for native SSB devices anyway, so + * the fallback also isn't used for native devices. * - * This function is available for architecture code, only. So it is not exported. + * This function is available for architecture code, only. So it is not + * exported. */ -int ssb_arch_set_fallback_sprom(const struct ssb_sprom *sprom) +int ssb_arch_register_fallback_sprom(int (*sprom_callback)(struct ssb_bus *bus, + struct ssb_sprom *out)) { - if (fallback_sprom) + if (get_fallback_sprom) return -EEXIST; - fallback_sprom = sprom; + get_fallback_sprom = sprom_callback; return 0; } -const struct ssb_sprom *ssb_get_fallback_sprom(void) +int ssb_fill_sprom_with_fallback(struct ssb_bus *bus, struct ssb_sprom *out) { - return fallback_sprom; + if (!get_fallback_sprom) + return -ENOENT; + + return get_fallback_sprom(bus, out); } /* http://bcm-v4.sipsolutions.net/802.11/IsSpromAvailable */ diff --git a/drivers/ssb/ssb_private.h b/drivers/ssb/ssb_private.h index 0331139..7765301 100644 --- a/drivers/ssb/ssb_private.h +++ b/drivers/ssb/ssb_private.h @@ -171,7 +171,8 @@ ssize_t ssb_attr_sprom_store(struct ssb_bus *bus, const char *buf, size_t count, int (*sprom_check_crc)(const u16 *sprom, size_t size), int (*sprom_write)(struct ssb_bus *bus, const u16 *sprom)); -extern const struct ssb_sprom *ssb_get_fallback_sprom(void); +extern int ssb_fill_sprom_with_fallback(struct ssb_bus *bus, + struct ssb_sprom *out); /* core.c */ diff --git a/include/linux/ssb/ssb.h b/include/linux/ssb/ssb.h index 9659eff..045f72a 100644 --- a/include/linux/ssb/ssb.h +++ b/include/linux/ssb/ssb.h @@ -404,7 +404,9 @@ extern bool ssb_is_sprom_available(struct ssb_bus *bus); /* Set a fallback SPROM. * See kdoc at the function definition for complete documentation. */ -extern int ssb_arch_set_fallback_sprom(const struct ssb_sprom *sprom); +extern int ssb_arch_register_fallback_sprom( + int (*sprom_callback)(struct ssb_bus *bus, + struct ssb_sprom *out)); /* Suspend a SSB bus. * Call this from the parent bus suspend routine. */ -- cgit v1.1 From a7c62f8564357532872e106f0fa383728cf886cc Mon Sep 17 00:00:00 2001 From: Hauke Mehrtens Date: Tue, 10 May 2011 23:31:31 +0200 Subject: MIPS: BCM47xx: Extend bcm47xx_fill_sprom with prefix. When an other SSB based device without an own SPROM is attached, using the PCI bus to the main SSB based device, the data normally found in the SPROM will be stored in the NVRAM on modern devices. The keys, to load the data from the NVRAM, are all using some sort of prefix like pci/1/1/, pci/1/3/ or sb/1/ before the actual key. This patch extends bcm47xx_fill_sprom() to make it possible to read out these values when some prefix was used. The keys for the SPROM data used on the main chip does not have a prefix. Signed-off-by: Hauke Mehrtens Cc: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/2363/ Signed-off-by: Ralf Baechle --- arch/mips/bcm47xx/setup.c | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/arch/mips/bcm47xx/setup.c b/arch/mips/bcm47xx/setup.c index c95f90b..bbfcf9b 100644 --- a/arch/mips/bcm47xx/setup.c +++ b/arch/mips/bcm47xx/setup.c @@ -57,10 +57,23 @@ static void bcm47xx_machine_halt(void) } #define READ_FROM_NVRAM(_outvar, name, buf) \ - if (nvram_getenv(name, buf, sizeof(buf)) >= 0)\ + if (nvram_getprefix(prefix, name, buf, sizeof(buf)) >= 0)\ sprom->_outvar = simple_strtoul(buf, NULL, 0); -static void bcm47xx_fill_sprom(struct ssb_sprom *sprom) +static inline int nvram_getprefix(const char *prefix, char *name, + char *buf, int len) +{ + if (prefix) { + char key[100]; + + snprintf(key, sizeof(key), "%s%s", prefix, name); + return nvram_getenv(key, buf, len); + } + + return nvram_getenv(name, buf, len); +} + +static void bcm47xx_fill_sprom(struct ssb_sprom *sprom, const char *prefix) { char buf[100]; u32 boardflags; @@ -69,11 +82,11 @@ static void bcm47xx_fill_sprom(struct ssb_sprom *sprom) sprom->revision = 1; /* Fallback: Old hardware does not define this. */ READ_FROM_NVRAM(revision, "sromrev", buf); - if (nvram_getenv("il0macaddr", buf, sizeof(buf)) >= 0) + if (nvram_getprefix(prefix, "il0macaddr", buf, sizeof(buf)) >= 0) nvram_parse_macaddr(buf, sprom->il0mac); - if (nvram_getenv("et0macaddr", buf, sizeof(buf)) >= 0) + if (nvram_getprefix(prefix, "et0macaddr", buf, sizeof(buf)) >= 0) nvram_parse_macaddr(buf, sprom->et0mac); - if (nvram_getenv("et1macaddr", buf, sizeof(buf)) >= 0) + if (nvram_getprefix(prefix, "et1macaddr", buf, sizeof(buf)) >= 0) nvram_parse_macaddr(buf, sprom->et1mac); READ_FROM_NVRAM(et0phyaddr, "et0phyaddr", buf); READ_FROM_NVRAM(et1phyaddr, "et1phyaddr", buf); @@ -125,14 +138,14 @@ static void bcm47xx_fill_sprom(struct ssb_sprom *sprom) READ_FROM_NVRAM(ofdm5gpo, "ofdm5gpo", buf); READ_FROM_NVRAM(ofdm5ghpo, "ofdm5ghpo", buf); - if (nvram_getenv("boardflags", buf, sizeof(buf)) >= 0) { + if (nvram_getprefix(prefix, "boardflags", buf, sizeof(buf)) >= 0) { boardflags = simple_strtoul(buf, NULL, 0); if (boardflags) { sprom->boardflags_lo = (boardflags & 0x0000FFFFU); sprom->boardflags_hi = (boardflags & 0xFFFF0000U) >> 16; } } - if (nvram_getenv("boardflags2", buf, sizeof(buf)) >= 0) { + if (nvram_getprefix(prefix, "boardflags2", buf, sizeof(buf)) >= 0) { boardflags = simple_strtoul(buf, NULL, 0); if (boardflags) { sprom->boardflags2_lo = (boardflags & 0x0000FFFFU); @@ -158,7 +171,7 @@ static int bcm47xx_get_invariants(struct ssb_bus *bus, if (nvram_getenv("boardrev", buf, sizeof(buf)) >= 0) iv->boardinfo.rev = (u16)simple_strtoul(buf, NULL, 0); - bcm47xx_fill_sprom(&iv->sprom); + bcm47xx_fill_sprom(&iv->sprom, NULL); if (nvram_getenv("cardbus", buf, sizeof(buf)) >= 0) iv->has_cardbus_slot = !!simple_strtoul(buf, NULL, 10); -- cgit v1.1 From fe6f3642ac70d21004ddbe7242bd4548c35f1c10 Mon Sep 17 00:00:00 2001 From: Hauke Mehrtens Date: Tue, 10 May 2011 23:31:32 +0200 Subject: MIPS: BCM47xx: Register SSB fallback sprom callback We are generating the prefix based on the PCI bus address the device is on. This is done like Broadcom does it in their code expect that the the bus number is increased by one. In the SB bus implementation used by Broadcom the SB bus emulates a PCI bus so the kernel sees one PCI bus more then in our implementation. We do not handle prefixes like sb/1/ yet as they are only used on the new bus which is not implemented yet. Signed-off-by: Hauke Mehrtens Cc: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/2364/ Signed-off-by: Ralf Baechle --- arch/mips/bcm47xx/nvram.c | 3 ++- arch/mips/bcm47xx/setup.c | 22 ++++++++++++++++++++++ 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/arch/mips/bcm47xx/nvram.c b/arch/mips/bcm47xx/nvram.c index e5b6615..54db815 100644 --- a/arch/mips/bcm47xx/nvram.c +++ b/arch/mips/bcm47xx/nvram.c @@ -3,6 +3,7 @@ * * Copyright (C) 2005 Broadcom Corporation * Copyright (C) 2006 Felix Fietkau + * Copyright (C) 2010-2011 Hauke Mehrtens * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the @@ -23,7 +24,7 @@ static char nvram_buf[NVRAM_SPACE]; /* Probe for NVRAM header */ -static void __init early_nvram_init(void) +static void early_nvram_init(void) { struct ssb_mipscore *mcore = &ssb_bcm47xx.mipscore; struct nvram_header *header; diff --git a/arch/mips/bcm47xx/setup.c b/arch/mips/bcm47xx/setup.c index bbfcf9b..258ffcf 100644 --- a/arch/mips/bcm47xx/setup.c +++ b/arch/mips/bcm47xx/setup.c @@ -3,6 +3,7 @@ * Copyright (C) 2006 Felix Fietkau * Copyright (C) 2006 Michael Buesch * Copyright (C) 2010 Waldemar Brodkorb + * Copyright (C) 2010-2011 Hauke Mehrtens * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the @@ -154,6 +155,22 @@ static void bcm47xx_fill_sprom(struct ssb_sprom *sprom, const char *prefix) } } +int bcm47xx_get_sprom(struct ssb_bus *bus, struct ssb_sprom *out) +{ + char prefix[10]; + + if (bus->bustype == SSB_BUSTYPE_PCI) { + snprintf(prefix, sizeof(prefix), "pci/%u/%u/", + bus->host_pci->bus->number + 1, + PCI_SLOT(bus->host_pci->devfn)); + bcm47xx_fill_sprom(out, prefix); + return 0; + } else { + printk(KERN_WARNING "bcm47xx: unable to fill SPROM for given bustype.\n"); + return -EINVAL; + } +} + static int bcm47xx_get_invariants(struct ssb_bus *bus, struct ssb_init_invariants *iv) { @@ -185,6 +202,11 @@ void __init plat_mem_setup(void) char buf[100]; struct ssb_mipscore *mcore; + err = ssb_arch_register_fallback_sprom(&bcm47xx_get_sprom); + if (err) + printk(KERN_WARNING "bcm47xx: someone else already registered" + " a ssb SPROM callback handler (err %d)\n", err); + err = ssb_bus_ssbbus_register(&ssb_bcm47xx, SSB_ENUM_BASE, bcm47xx_get_invariants); if (err) -- cgit v1.1 From 41790fd51f71f3744a5d142cc5369eebab8817a0 Mon Sep 17 00:00:00 2001 From: Hauke Mehrtens Date: Tue, 10 May 2011 23:31:33 +0200 Subject: MIPS: BCM47xx: Extend the filling of SPROM from NVRAM Some members of the struct ssb_sprom where not filled with data available in the NVRAM. Some attribute names in the NVRAM changed from SPROM version 3 to version 4. This patch was done by analyzing the the pci sprom parser in the ssb code and some open source parts of the braodcom wireless driver used on embedded devices. Signed-off-by: Hauke Mehrtens Cc: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/2365/ Signed-off-by: Ralf Baechle --- arch/mips/bcm47xx/setup.c | 81 ++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 66 insertions(+), 15 deletions(-) diff --git a/arch/mips/bcm47xx/setup.c b/arch/mips/bcm47xx/setup.c index 258ffcf..73b529b 100644 --- a/arch/mips/bcm47xx/setup.c +++ b/arch/mips/bcm47xx/setup.c @@ -61,6 +61,11 @@ static void bcm47xx_machine_halt(void) if (nvram_getprefix(prefix, name, buf, sizeof(buf)) >= 0)\ sprom->_outvar = simple_strtoul(buf, NULL, 0); +#define READ_FROM_NVRAM2(_outvar, name1, name2, buf) \ + if (nvram_getprefix(prefix, name1, buf, sizeof(buf)) >= 0 || \ + nvram_getprefix(prefix, name2, buf, sizeof(buf)) >= 0)\ + sprom->_outvar = simple_strtoul(buf, NULL, 0); + static inline int nvram_getprefix(const char *prefix, char *name, char *buf, int len) { @@ -74,6 +79,27 @@ static inline int nvram_getprefix(const char *prefix, char *name, return nvram_getenv(name, buf, len); } +static u32 nvram_getu32(const char *name, char *buf, int len) +{ + int rv; + char key[100]; + u16 var0, var1; + + snprintf(key, sizeof(key), "%s0", name); + rv = nvram_getenv(key, buf, len); + /* return 0 here so this looks like unset */ + if (rv < 0) + return 0; + var0 = simple_strtoul(buf, NULL, 0); + + snprintf(key, sizeof(key), "%s1", name); + rv = nvram_getenv(key, buf, len); + if (rv < 0) + return 0; + var1 = simple_strtoul(buf, NULL, 0); + return var1 << 16 | var0; +} + static void bcm47xx_fill_sprom(struct ssb_sprom *sprom, const char *prefix) { char buf[100]; @@ -83,7 +109,8 @@ static void bcm47xx_fill_sprom(struct ssb_sprom *sprom, const char *prefix) sprom->revision = 1; /* Fallback: Old hardware does not define this. */ READ_FROM_NVRAM(revision, "sromrev", buf); - if (nvram_getprefix(prefix, "il0macaddr", buf, sizeof(buf)) >= 0) + if (nvram_getprefix(prefix, "il0macaddr", buf, sizeof(buf)) >= 0 || + nvram_getprefix(prefix, "macaddr", buf, sizeof(buf)) >= 0) nvram_parse_macaddr(buf, sprom->il0mac); if (nvram_getprefix(prefix, "et0macaddr", buf, sizeof(buf)) >= 0) nvram_parse_macaddr(buf, sprom->et0mac); @@ -109,20 +136,36 @@ static void bcm47xx_fill_sprom(struct ssb_sprom *sprom, const char *prefix) READ_FROM_NVRAM(pa1hib0, "pa1hib0", buf); READ_FROM_NVRAM(pa1hib2, "pa1hib1", buf); READ_FROM_NVRAM(pa1hib1, "pa1hib2", buf); - READ_FROM_NVRAM(gpio0, "wl0gpio0", buf); - READ_FROM_NVRAM(gpio1, "wl0gpio1", buf); - READ_FROM_NVRAM(gpio2, "wl0gpio2", buf); - READ_FROM_NVRAM(gpio3, "wl0gpio3", buf); - READ_FROM_NVRAM(maxpwr_bg, "pa0maxpwr", buf); - READ_FROM_NVRAM(maxpwr_al, "pa1lomaxpwr", buf); - READ_FROM_NVRAM(maxpwr_a, "pa1maxpwr", buf); - READ_FROM_NVRAM(maxpwr_ah, "pa1himaxpwr", buf); - READ_FROM_NVRAM(itssi_a, "pa1itssit", buf); - READ_FROM_NVRAM(itssi_bg, "pa0itssit", buf); + READ_FROM_NVRAM2(gpio0, "ledbh0", "wl0gpio0", buf); + READ_FROM_NVRAM2(gpio1, "ledbh1", "wl0gpio1", buf); + READ_FROM_NVRAM2(gpio2, "ledbh2", "wl0gpio2", buf); + READ_FROM_NVRAM2(gpio3, "ledbh3", "wl0gpio3", buf); + READ_FROM_NVRAM2(maxpwr_bg, "maxp2ga0", "pa0maxpwr", buf); + READ_FROM_NVRAM2(maxpwr_al, "maxp5gla0", "pa1lomaxpwr", buf); + READ_FROM_NVRAM2(maxpwr_a, "maxp5ga0", "pa1maxpwr", buf); + READ_FROM_NVRAM2(maxpwr_ah, "maxp5gha0", "pa1himaxpwr", buf); + READ_FROM_NVRAM2(itssi_bg, "itt5ga0", "pa0itssit", buf); + READ_FROM_NVRAM2(itssi_a, "itt2ga0", "pa1itssit", buf); READ_FROM_NVRAM(tri2g, "tri2g", buf); READ_FROM_NVRAM(tri5gl, "tri5gl", buf); READ_FROM_NVRAM(tri5g, "tri5g", buf); READ_FROM_NVRAM(tri5gh, "tri5gh", buf); + READ_FROM_NVRAM(txpid2g[0], "txpid2ga0", buf); + READ_FROM_NVRAM(txpid2g[1], "txpid2ga1", buf); + READ_FROM_NVRAM(txpid2g[2], "txpid2ga2", buf); + READ_FROM_NVRAM(txpid2g[3], "txpid2ga3", buf); + READ_FROM_NVRAM(txpid5g[0], "txpid5ga0", buf); + READ_FROM_NVRAM(txpid5g[1], "txpid5ga1", buf); + READ_FROM_NVRAM(txpid5g[2], "txpid5ga2", buf); + READ_FROM_NVRAM(txpid5g[3], "txpid5ga3", buf); + READ_FROM_NVRAM(txpid5gl[0], "txpid5gla0", buf); + READ_FROM_NVRAM(txpid5gl[1], "txpid5gla1", buf); + READ_FROM_NVRAM(txpid5gl[2], "txpid5gla2", buf); + READ_FROM_NVRAM(txpid5gl[3], "txpid5gla3", buf); + READ_FROM_NVRAM(txpid5gh[0], "txpid5gha0", buf); + READ_FROM_NVRAM(txpid5gh[1], "txpid5gha1", buf); + READ_FROM_NVRAM(txpid5gh[2], "txpid5gha2", buf); + READ_FROM_NVRAM(txpid5gh[3], "txpid5gha3", buf); READ_FROM_NVRAM(rxpo2g, "rxpo2g", buf); READ_FROM_NVRAM(rxpo5g, "rxpo5g", buf); READ_FROM_NVRAM(rssisav2g, "rssisav2g", buf); @@ -134,10 +177,18 @@ static void bcm47xx_fill_sprom(struct ssb_sprom *sprom, const char *prefix) READ_FROM_NVRAM(rssismf5g, "rssismf5g", buf); READ_FROM_NVRAM(bxa5g, "bxa5g", buf); READ_FROM_NVRAM(cck2gpo, "cck2gpo", buf); - READ_FROM_NVRAM(ofdm2gpo, "ofdm2gpo", buf); - READ_FROM_NVRAM(ofdm5glpo, "ofdm5glpo", buf); - READ_FROM_NVRAM(ofdm5gpo, "ofdm5gpo", buf); - READ_FROM_NVRAM(ofdm5ghpo, "ofdm5ghpo", buf); + + sprom->ofdm2gpo = nvram_getu32("ofdm2gpo", buf, sizeof(buf)); + sprom->ofdm5glpo = nvram_getu32("ofdm5glpo", buf, sizeof(buf)); + sprom->ofdm5gpo = nvram_getu32("ofdm5gpo", buf, sizeof(buf)); + sprom->ofdm5ghpo = nvram_getu32("ofdm5ghpo", buf, sizeof(buf)); + + READ_FROM_NVRAM(antenna_gain.ghz24.a0, "ag0", buf); + READ_FROM_NVRAM(antenna_gain.ghz24.a1, "ag1", buf); + READ_FROM_NVRAM(antenna_gain.ghz24.a2, "ag2", buf); + READ_FROM_NVRAM(antenna_gain.ghz24.a3, "ag3", buf); + memcpy(&sprom->antenna_gain.ghz5, &sprom->antenna_gain.ghz24, + sizeof(sprom->antenna_gain.ghz5)); if (nvram_getprefix(prefix, "boardflags", buf, sizeof(buf)) >= 0) { boardflags = simple_strtoul(buf, NULL, 0); -- cgit v1.1 From 9cbda726bb283d60cd4f34a3a9da8b5b48a46b0f Mon Sep 17 00:00:00 2001 From: Hauke Mehrtens Date: Tue, 10 May 2011 23:31:34 +0200 Subject: MIPS: BCM47xx: Fix MAC address parsing. Some devices like the Netgear WGT634u are using minuses between the blocks of the MAC address and other devices are using colons to separate them. Signed-off-by: Hauke Mehrtens Cc: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/2366/ Signed-off-by: Ralf Baechle --- arch/mips/include/asm/mach-bcm47xx/nvram.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/arch/mips/include/asm/mach-bcm47xx/nvram.h b/arch/mips/include/asm/mach-bcm47xx/nvram.h index 9759588..184d5ec 100644 --- a/arch/mips/include/asm/mach-bcm47xx/nvram.h +++ b/arch/mips/include/asm/mach-bcm47xx/nvram.h @@ -39,8 +39,16 @@ extern int nvram_getenv(char *name, char *val, size_t val_len); static inline void nvram_parse_macaddr(char *buf, u8 *macaddr) { - sscanf(buf, "%hhx:%hhx:%hhx:%hhx:%hhx:%hhx", &macaddr[0], &macaddr[1], - &macaddr[2], &macaddr[3], &macaddr[4], &macaddr[5]); + if (strchr(buf, ':')) + sscanf(buf, "%hhx:%hhx:%hhx:%hhx:%hhx:%hhx", &macaddr[0], + &macaddr[1], &macaddr[2], &macaddr[3], &macaddr[4], + &macaddr[5]); + else if (strchr(buf, '-')) + sscanf(buf, "%hhx-%hhx-%hhx-%hhx-%hhx-%hhx", &macaddr[0], + &macaddr[1], &macaddr[2], &macaddr[3], &macaddr[4], + &macaddr[5]); + else + printk(KERN_WARNING "Can not parse mac address: %s\n", buf); } #endif -- cgit v1.1 From 6edde0247644db475f68f25dcb1bf72260600081 Mon Sep 17 00:00:00 2001 From: Maarten ter Huurne Date: Mon, 2 May 2011 11:47:00 +0200 Subject: MIPS: JZ4740: setup: Autodetect physical memory. Assume that the boot loader knows the physical memory of the system and deduce that information from the contents of the SDRAM control register. It is still possible to override with with the "mem=" parameter, but we have a sensible default now. Signed-off-by: Maarten ter Huurne Acked-by: Lars-Peter Clausen Cc: linux-kernel@vger.kernel.org Cc: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/2319/ Signed-off-by: Ralf Baechle --- arch/mips/jz4740/setup.c | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/arch/mips/jz4740/setup.c b/arch/mips/jz4740/setup.c index 6a9e14d..d97cfbf 100644 --- a/arch/mips/jz4740/setup.c +++ b/arch/mips/jz4740/setup.c @@ -1,5 +1,6 @@ /* * Copyright (C) 2009-2010, Lars-Peter Clausen + * Copyright (C) 2011, Maarten ter Huurne * JZ4740 setup code * * This program is free software; you can redistribute it and/or modify it @@ -14,13 +15,44 @@ */ #include +#include #include +#include + +#include + #include "reset.h" + +#define JZ4740_EMC_SDRAM_CTRL 0x80 + + +static void __init jz4740_detect_mem(void) +{ + void __iomem *jz_emc_base; + u32 ctrl, bus, bank, rows, cols; + phys_t size; + + jz_emc_base = ioremap(JZ4740_EMC_BASE_ADDR, 0x100); + ctrl = readl(jz_emc_base + JZ4740_EMC_SDRAM_CTRL); + bus = 2 - ((ctrl >> 31) & 1); + bank = 1 + ((ctrl >> 19) & 1); + cols = 8 + ((ctrl >> 26) & 7); + rows = 11 + ((ctrl >> 20) & 3); + printk(KERN_DEBUG + "SDRAM preconfigured: bus:%u bank:%u rows:%u cols:%u\n", + bus, bank, rows, cols); + iounmap(jz_emc_base); + + size = 1 << (bus + bank + cols + rows); + add_memory_region(0, size, BOOT_MEM_RAM); +} + void __init plat_mem_setup(void) { jz4740_reset_init(); + jz4740_detect_mem(); } const char *get_system_type(void) -- cgit v1.1 From c094c99e659efedcbb05a0f75b8f77145d8ec539 Mon Sep 17 00:00:00 2001 From: Robert Millan Date: Mon, 18 Apr 2011 11:37:55 -0700 Subject: MIPS: Introduce set_elf_platform() helper function Replace these sequences: if (cpu == 0) __elf_platform = "foo"; with a trivial inline function. Signed-off-by: Robert Millan Signed-off-by: Kevin Cernekee Signed-off-by: David Daney Cc: wu zhangjin Cc: Aurelien Jarno Cc: linux-mips@linux-mips.org Cc: linux-kernel@vger.kernel.org Patchwork: https://patchwork.linux-mips.org/patch/2304/ Patchwork: https://patchwork.linux-mips.org/patch/2374/ Signed-off-by: Ralf Baechle --- arch/mips/kernel/cpu-probe.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/arch/mips/kernel/cpu-probe.c b/arch/mips/kernel/cpu-probe.c index c7b7eb2..aa86250 100644 --- a/arch/mips/kernel/cpu-probe.c +++ b/arch/mips/kernel/cpu-probe.c @@ -291,6 +291,12 @@ static inline int cpu_has_confreg(void) #endif } +static inline void set_elf_platform(int cpu, const char *plat) +{ + if (cpu == 0) + __elf_platform = plat; +} + /* * Get the FPU Implementation/Revision. */ @@ -956,14 +962,12 @@ static inline void cpu_probe_cavium(struct cpuinfo_mips *c, unsigned int cpu) c->cputype = CPU_CAVIUM_OCTEON_PLUS; __cpu_name[cpu] = "Cavium Octeon+"; platform: - if (cpu == 0) - __elf_platform = "octeon"; + set_elf_platform(cpu, "octeon"); break; case PRID_IMP_CAVIUM_CN63XX: c->cputype = CPU_CAVIUM_OCTEON2; __cpu_name[cpu] = "Cavium Octeon II"; - if (cpu == 0) - __elf_platform = "octeon2"; + set_elf_platform(cpu, "octeon2"); break; default: printk(KERN_INFO "Unknown Octeon chip!\n"); -- cgit v1.1 From 06785df09b18e9127d16893039b64ae118c53cb4 Mon Sep 17 00:00:00 2001 From: Kevin Cernekee Date: Sat, 16 Apr 2011 11:29:28 -0700 Subject: MIPS: Set ELF AT_PLATFORM string for BMIPS processors Signed-off-by: Kevin Cernekee Cc: Robert Millan Cc: David Daney Cc: wu zhangjin Cc: Aurelien Jarno Cc: linux-mips@linux-mips.org Cc: linux-kernel@vger.kernel.org Patchwork: https://patchwork.linux-mips.org/patch/2300/ Signed-off-by: Ralf Baechle --- arch/mips/kernel/cpu-probe.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/mips/kernel/cpu-probe.c b/arch/mips/kernel/cpu-probe.c index aa86250..7da8617 100644 --- a/arch/mips/kernel/cpu-probe.c +++ b/arch/mips/kernel/cpu-probe.c @@ -917,12 +917,14 @@ static inline void cpu_probe_broadcom(struct cpuinfo_mips *c, unsigned int cpu) case PRID_IMP_BMIPS32_REV8: c->cputype = CPU_BMIPS32; __cpu_name[cpu] = "Broadcom BMIPS32"; + set_elf_platform(cpu, "bmips32"); break; case PRID_IMP_BMIPS3300: case PRID_IMP_BMIPS3300_ALT: case PRID_IMP_BMIPS3300_BUG: c->cputype = CPU_BMIPS3300; __cpu_name[cpu] = "Broadcom BMIPS3300"; + set_elf_platform(cpu, "bmips3300"); break; case PRID_IMP_BMIPS43XX: { int rev = c->processor_id & 0xff; @@ -931,15 +933,18 @@ static inline void cpu_probe_broadcom(struct cpuinfo_mips *c, unsigned int cpu) rev <= PRID_REV_BMIPS4380_HI) { c->cputype = CPU_BMIPS4380; __cpu_name[cpu] = "Broadcom BMIPS4380"; + set_elf_platform(cpu, "bmips4380"); } else { c->cputype = CPU_BMIPS4350; __cpu_name[cpu] = "Broadcom BMIPS4350"; + set_elf_platform(cpu, "bmips4350"); } break; } case PRID_IMP_BMIPS5000: c->cputype = CPU_BMIPS5000; __cpu_name[cpu] = "Broadcom BMIPS5000"; + set_elf_platform(cpu, "bmips5000"); c->options |= MIPS_CPU_ULRI; break; } -- cgit v1.1 From 5aac1e8a381d52a977b5050369a82a547c446ee2 Mon Sep 17 00:00:00 2001 From: Robert Millan Date: Sat, 16 Apr 2011 11:29:29 -0700 Subject: MIPS: Set ELF AT_PLATFORM string for Loongson2 processors Signed-off-by: Robert Millan Acked-by: David Daney Signed-off-by: Kevin Cernekee Cc: David Daney Cc: wu zhangjin Cc: Aurelien Jarno Cc: linux-mips@linux-mips.org Cc: linux-kernel@vger.kernel.org Patchwork: https://patchwork.linux-mips.org/patch/2302/ Signed-off-by: Ralf Baechle --- arch/mips/kernel/cpu-probe.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/arch/mips/kernel/cpu-probe.c b/arch/mips/kernel/cpu-probe.c index 7da8617..bb133d1 100644 --- a/arch/mips/kernel/cpu-probe.c +++ b/arch/mips/kernel/cpu-probe.c @@ -620,6 +620,16 @@ static inline void cpu_probe_legacy(struct cpuinfo_mips *c, unsigned int cpu) case PRID_IMP_LOONGSON2: c->cputype = CPU_LOONGSON2; __cpu_name[cpu] = "ICT Loongson-2"; + + switch (c->processor_id & PRID_REV_MASK) { + case PRID_REV_LOONGSON2E: + set_elf_platform(cpu, "loongson2e"); + break; + case PRID_REV_LOONGSON2F: + set_elf_platform(cpu, "loongson2f"); + break; + } + c->isa_level = MIPS_CPU_ISA_III; c->options = R4K_OPTS | MIPS_CPU_FPU | MIPS_CPU_LLSC | -- cgit v1.1 From 1c8da7a1107a46c94b21cc176aaf95c819aab3db Mon Sep 17 00:00:00 2001 From: Wanlong Gao Date: Sun, 10 Apr 2011 01:42:17 +0800 Subject: MIPS: Lemote 2F, Malta: Fix build warning Since 5ada28bf76752e33dce3d807bf0dfbe6d1b943ad ["led-class: always implement blinking"] LEDS_CLASS=m is no longer valid so change the setting from m to y. Signed-off-by: Wanlong Gao To: david.woodhouse@intel.com To: akpm@linux-foundation.org To: mingo@elte.hu Cc: linux-mips@linux-mips.org Cc: linux-kernel@vger.kernel.org Patchwork: https://patchwork.linux-mips.org/patch/2276/ Signed-off-by: Ralf Baechle --- arch/mips/configs/lemote2f_defconfig | 2 +- arch/mips/configs/malta_defconfig | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/mips/configs/lemote2f_defconfig b/arch/mips/configs/lemote2f_defconfig index 167c1d0..cb2c5ea 100644 --- a/arch/mips/configs/lemote2f_defconfig +++ b/arch/mips/configs/lemote2f_defconfig @@ -329,7 +329,7 @@ CONFIG_USB_LED=m CONFIG_USB_GADGET=m CONFIG_USB_GADGET_M66592=y CONFIG_MMC=m -CONFIG_LEDS_CLASS=m +CONFIG_LEDS_CLASS=y CONFIG_STAGING=y # CONFIG_STAGING_EXCLUDE_BUILD is not set CONFIG_FB_SM7XX=y diff --git a/arch/mips/configs/malta_defconfig b/arch/mips/configs/malta_defconfig index 7270f31..5527abb 100644 --- a/arch/mips/configs/malta_defconfig +++ b/arch/mips/configs/malta_defconfig @@ -374,7 +374,7 @@ CONFIG_FB_CIRRUS=y # CONFIG_VGA_CONSOLE is not set CONFIG_FRAMEBUFFER_CONSOLE=y CONFIG_HID=m -CONFIG_LEDS_CLASS=m +CONFIG_LEDS_CLASS=y CONFIG_LEDS_TRIGGER_TIMER=m CONFIG_LEDS_TRIGGER_IDE_DISK=y CONFIG_LEDS_TRIGGER_HEARTBEAT=m -- cgit v1.1 From b32ee693eb106172f89639acff88dc8fee8ba3e2 Mon Sep 17 00:00:00 2001 From: Wanlong Gao Date: Sun, 10 Apr 2011 03:04:18 +0800 Subject: MIPS: Fix build warnings on defconfigs Since d45dcef77019012fc6769e657fc2f1a5d681bbbb ["Bluetooth: Fix BT_L2CAP and BT_SCO in Kconfig"] BT_L2CAP=m and BT_SCO=m are no longer valid so change the settings from m to y. [ralf@linux-mips.org: Merging only the MIPS parts of this patch.] Signed-off-by: Wanlong Gao To: akpm@linux-foundation.org To: manuel.lauss@googlemail.com Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Cc: linux-mips@linux-mips.org Cc: linuxppc-dev@lists.ozlabs.org Patchwork: https://patchwork.linux-mips.org/patch/2277/ Signed-off-by: Ralf Baechle --- arch/mips/configs/lemote2f_defconfig | 4 ++-- arch/mips/configs/mtx1_defconfig | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/mips/configs/lemote2f_defconfig b/arch/mips/configs/lemote2f_defconfig index cb2c5ea..b6acd2f 100644 --- a/arch/mips/configs/lemote2f_defconfig +++ b/arch/mips/configs/lemote2f_defconfig @@ -86,8 +86,8 @@ CONFIG_NET_SCHED=y CONFIG_NET_EMATCH=y CONFIG_NET_CLS_ACT=y CONFIG_BT=m -CONFIG_BT_L2CAP=m -CONFIG_BT_SCO=m +CONFIG_BT_L2CAP=y +CONFIG_BT_SCO=y CONFIG_BT_RFCOMM=m CONFIG_BT_RFCOMM_TTY=y CONFIG_BT_BNEP=m diff --git a/arch/mips/configs/mtx1_defconfig b/arch/mips/configs/mtx1_defconfig index a97a42c6..37862b2 100644 --- a/arch/mips/configs/mtx1_defconfig +++ b/arch/mips/configs/mtx1_defconfig @@ -225,8 +225,8 @@ CONFIG_TOSHIBA_FIR=m CONFIG_VLSI_FIR=m CONFIG_MCS_FIR=m CONFIG_BT=m -CONFIG_BT_L2CAP=m -CONFIG_BT_SCO=m +CONFIG_BT_L2CAP=y +CONFIG_BT_SCO=y CONFIG_BT_RFCOMM=m CONFIG_BT_RFCOMM_TTY=y CONFIG_BT_BNEP=m -- cgit v1.1 From 7716e6548abed1582a7759666e79d5c612a906c7 Mon Sep 17 00:00:00 2001 From: Chandrakala Chavva Date: Thu, 17 Feb 2011 13:57:52 -0800 Subject: Octeon: Fix interrupt irq settings for performance counters. Octeon uses different interrupt irq for timer and performance counters. Set CvmCtl[IPPCI] to correct irq value very early. Signed-off-by: Chandrakala Chavva Signed-off-by: David Daney To: linux-mips@linux-mips.org Cc: Chandrakala Chavva Patchwork: https://patchwork.linux-mips.org/patch/2085/ Signed-off-by: Ralf Baechle --- arch/mips/cavium-octeon/setup.c | 7 ------- arch/mips/include/asm/mach-cavium-octeon/kernel-entry-init.h | 5 +++++ 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/arch/mips/cavium-octeon/setup.c b/arch/mips/cavium-octeon/setup.c index 0707fae..2d9028f 100644 --- a/arch/mips/cavium-octeon/setup.c +++ b/arch/mips/cavium-octeon/setup.c @@ -288,7 +288,6 @@ void octeon_user_io_init(void) union octeon_cvmemctl cvmmemctl; union cvmx_iob_fau_timeout fau_timeout; union cvmx_pow_nw_tim nm_tim; - uint64_t cvmctl; /* Get the current settings for CP0_CVMMEMCTL_REG */ cvmmemctl.u64 = read_c0_cvmmemctl(); @@ -392,12 +391,6 @@ void octeon_user_io_init(void) CONFIG_CAVIUM_OCTEON_CVMSEG_SIZE, CONFIG_CAVIUM_OCTEON_CVMSEG_SIZE * 128); - /* Move the performance counter interrupts to IRQ 6 */ - cvmctl = read_c0_cvmctl(); - cvmctl &= ~(7 << 7); - cvmctl |= 6 << 7; - write_c0_cvmctl(cvmctl); - /* Set a default for the hardware timeouts */ fau_timeout.u64 = 0; fau_timeout.s.tout_val = 0xfff; diff --git a/arch/mips/include/asm/mach-cavium-octeon/kernel-entry-init.h b/arch/mips/include/asm/mach-cavium-octeon/kernel-entry-init.h index 0b2b5eb..dedef7d 100644 --- a/arch/mips/include/asm/mach-cavium-octeon/kernel-entry-init.h +++ b/arch/mips/include/asm/mach-cavium-octeon/kernel-entry-init.h @@ -63,6 +63,11 @@ # CN30XX Disable instruction prefetching or v0, v0, 0x2000 skip: + # First clear off CvmCtl[IPPCI] bit and move the performance + # counters interrupt to IRQ 6 + li v1, ~(7 << 7) + and v0, v0, v1 + ori v0, v0, (6 << 7) # Write the cavium control register dmtc0 v0, CP0_CVMCTL_REG sync -- cgit v1.1 From e650ce0f083ff9354a10ad66e6bf8c193e8a2755 Mon Sep 17 00:00:00 2001 From: David Daney Date: Thu, 17 Feb 2011 14:47:52 -0800 Subject: MIPS: Octeon: Don't request interrupts for unused IPI mailbox bits. We only use the three low-order mailbox bits. Leave the upper bits alone for possible use by drivers and other software. Signed-off-by: David Daney To: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/2090/ Signed-off-by: Ralf Baechle --- arch/mips/cavium-octeon/smp.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/arch/mips/cavium-octeon/smp.c b/arch/mips/cavium-octeon/smp.c index ba78b21..716fae6 100644 --- a/arch/mips/cavium-octeon/smp.c +++ b/arch/mips/cavium-octeon/smp.c @@ -37,7 +37,7 @@ static irqreturn_t mailbox_interrupt(int irq, void *dev_id) uint64_t action; /* Load the mailbox register to figure out what we're supposed to do */ - action = cvmx_read_csr(CVMX_CIU_MBOX_CLRX(coreid)); + action = cvmx_read_csr(CVMX_CIU_MBOX_CLRX(coreid)) & 0xffff; /* Clear the mailbox to clear the interrupt */ cvmx_write_csr(CVMX_CIU_MBOX_CLRX(coreid), action); @@ -200,16 +200,15 @@ void octeon_prepare_cpus(unsigned int max_cpus) if (labi->labi_signature != LABI_SIGNATURE) panic("The bootloader version on this board is incorrect."); #endif - - cvmx_write_csr(CVMX_CIU_MBOX_CLRX(cvmx_get_core_num()), 0xffffffff); + /* + * Only the low order mailbox bits are used for IPIs, leave + * the other bits alone. + */ + cvmx_write_csr(CVMX_CIU_MBOX_CLRX(cvmx_get_core_num()), 0xffff); if (request_irq(OCTEON_IRQ_MBOX0, mailbox_interrupt, IRQF_DISABLED, - "mailbox0", mailbox_interrupt)) { + "SMP-IPI", mailbox_interrupt)) { panic("Cannot request_irq(OCTEON_IRQ_MBOX0)\n"); } - if (request_irq(OCTEON_IRQ_MBOX1, mailbox_interrupt, IRQF_DISABLED, - "mailbox1", mailbox_interrupt)) { - panic("Cannot request_irq(OCTEON_IRQ_MBOX1)\n"); - } } /** -- cgit v1.1 From 9c1e8a9138ff92a4ff816ea8a1884ad2461a993a Mon Sep 17 00:00:00 2001 From: Ralf Baechle Date: Tue, 17 May 2011 16:18:09 +0100 Subject: MIPS: Cleanup arch_get_unmapped_area As noticed by Kevin Cernekee in http://www.linux-mips.org/cgi-bin/extract-mesg.cgi?a=linux-mips&m=2011-05&i=BANLkTikq04wuK%3Dbz%2BLieavmm3oDtoYWKxg%40mail.gmail.com Patchwork: https://patchwork.linux-mips.org/patch/2387/ Signed-off-by: Ralf Baechle --- arch/mips/kernel/syscall.c | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/arch/mips/kernel/syscall.c b/arch/mips/kernel/syscall.c index 58beabf..0c207e8 100644 --- a/arch/mips/kernel/syscall.c +++ b/arch/mips/kernel/syscall.c @@ -79,20 +79,13 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, { struct vm_area_struct * vmm; int do_color_align; - unsigned long task_size; -#ifdef CONFIG_32BIT - task_size = TASK_SIZE; -#else /* Must be CONFIG_64BIT*/ - task_size = test_thread_flag(TIF_32BIT_ADDR) ? TASK_SIZE32 : TASK_SIZE; -#endif - - if (len > task_size) + if (len > TASK_SIZE) return -ENOMEM; if (flags & MAP_FIXED) { - /* Even MAP_FIXED mappings must reside within task_size. */ - if (task_size - len < addr) + /* Even MAP_FIXED mappings must reside within TASK_SIZE. */ + if (TASK_SIZE - len < addr) return -EINVAL; /* @@ -114,7 +107,7 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, else addr = PAGE_ALIGN(addr); vmm = find_vma(current->mm, addr); - if (task_size - len >= addr && + if (TASK_SIZE - len >= addr && (!vmm || addr + len <= vmm->vm_start)) return addr; } @@ -126,7 +119,7 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, for (vmm = find_vma(current->mm, addr); ; vmm = vmm->vm_next) { /* At this point: (!vmm || addr < vmm->vm_end). */ - if (task_size - len < addr) + if (TASK_SIZE - len < addr) return -ENOMEM; if (!vmm || addr + len <= vmm->vm_start) return addr; -- cgit v1.1 From 6f6c3c33c027f2c83d53e8562cd9daa73fe8108b Mon Sep 17 00:00:00 2001 From: Ralf Baechle Date: Thu, 19 May 2011 09:21:33 +0100 Subject: MIPS: Move arch_get_unmapped_area and gang to new file. It never really belonged into syscall.c and it's about to become well more complex. Signed-off-by: Ralf Baechle --- arch/mips/kernel/syscall.c | 113 ----------------------------------------- arch/mips/mm/Makefile | 3 +- arch/mips/mm/mmap.c | 122 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 124 insertions(+), 114 deletions(-) create mode 100644 arch/mips/mm/mmap.c diff --git a/arch/mips/kernel/syscall.c b/arch/mips/kernel/syscall.c index 0c207e8..d027657 100644 --- a/arch/mips/kernel/syscall.c +++ b/arch/mips/kernel/syscall.c @@ -10,12 +10,9 @@ #include #include #include -#include #include #include -#include #include -#include #include #include #include @@ -25,11 +22,9 @@ #include #include #include -#include #include #include #include -#include #include #include @@ -66,114 +61,6 @@ out: return res; } -unsigned long shm_align_mask = PAGE_SIZE - 1; /* Sane caches */ - -EXPORT_SYMBOL(shm_align_mask); - -#define COLOUR_ALIGN(addr,pgoff) \ - ((((addr) + shm_align_mask) & ~shm_align_mask) + \ - (((pgoff) << PAGE_SHIFT) & shm_align_mask)) - -unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, - unsigned long len, unsigned long pgoff, unsigned long flags) -{ - struct vm_area_struct * vmm; - int do_color_align; - - if (len > TASK_SIZE) - return -ENOMEM; - - if (flags & MAP_FIXED) { - /* Even MAP_FIXED mappings must reside within TASK_SIZE. */ - if (TASK_SIZE - len < addr) - return -EINVAL; - - /* - * We do not accept a shared mapping if it would violate - * cache aliasing constraints. - */ - if ((flags & MAP_SHARED) && - ((addr - (pgoff << PAGE_SHIFT)) & shm_align_mask)) - return -EINVAL; - return addr; - } - - do_color_align = 0; - if (filp || (flags & MAP_SHARED)) - do_color_align = 1; - if (addr) { - if (do_color_align) - addr = COLOUR_ALIGN(addr, pgoff); - else - addr = PAGE_ALIGN(addr); - vmm = find_vma(current->mm, addr); - if (TASK_SIZE - len >= addr && - (!vmm || addr + len <= vmm->vm_start)) - return addr; - } - addr = current->mm->mmap_base; - if (do_color_align) - addr = COLOUR_ALIGN(addr, pgoff); - else - addr = PAGE_ALIGN(addr); - - for (vmm = find_vma(current->mm, addr); ; vmm = vmm->vm_next) { - /* At this point: (!vmm || addr < vmm->vm_end). */ - if (TASK_SIZE - len < addr) - return -ENOMEM; - if (!vmm || addr + len <= vmm->vm_start) - return addr; - addr = vmm->vm_end; - if (do_color_align) - addr = COLOUR_ALIGN(addr, pgoff); - } -} - -void arch_pick_mmap_layout(struct mm_struct *mm) -{ - unsigned long random_factor = 0UL; - - if (current->flags & PF_RANDOMIZE) { - random_factor = get_random_int(); - random_factor = random_factor << PAGE_SHIFT; - if (TASK_IS_32BIT_ADDR) - random_factor &= 0xfffffful; - else - random_factor &= 0xffffffful; - } - - mm->mmap_base = TASK_UNMAPPED_BASE + random_factor; - mm->get_unmapped_area = arch_get_unmapped_area; - mm->unmap_area = arch_unmap_area; -} - -static inline unsigned long brk_rnd(void) -{ - unsigned long rnd = get_random_int(); - - rnd = rnd << PAGE_SHIFT; - /* 8MB for 32bit, 256MB for 64bit */ - if (TASK_IS_32BIT_ADDR) - rnd = rnd & 0x7ffffful; - else - rnd = rnd & 0xffffffful; - - return rnd; -} - -unsigned long arch_randomize_brk(struct mm_struct *mm) -{ - unsigned long base = mm->brk; - unsigned long ret; - - ret = PAGE_ALIGN(base + brk_rnd()); - - if (ret < mm->brk) - return mm->brk; - - return ret; -} - SYSCALL_DEFINE6(mips_mmap, unsigned long, addr, unsigned long, len, unsigned long, prot, unsigned long, flags, unsigned long, fd, off_t, offset) diff --git a/arch/mips/mm/Makefile b/arch/mips/mm/Makefile index eb44636..4d8c162 100644 --- a/arch/mips/mm/Makefile +++ b/arch/mips/mm/Makefile @@ -3,7 +3,8 @@ # obj-y += cache.o dma-default.o extable.o fault.o \ - init.o tlbex.o tlbex-fault.o uasm.o page.o + init.o mmap.o tlbex.o tlbex-fault.o uasm.o \ + page.o obj-$(CONFIG_32BIT) += ioremap.o pgtable-32.o obj-$(CONFIG_64BIT) += pgtable-64.o diff --git a/arch/mips/mm/mmap.c b/arch/mips/mm/mmap.c new file mode 100644 index 0000000..ae3c20a --- /dev/null +++ b/arch/mips/mm/mmap.c @@ -0,0 +1,122 @@ +/* + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * Copyright (C) 2011 Wind River Systems, + * written by Ralf Baechle + */ +#include +#include +#include +#include +#include +#include + +unsigned long shm_align_mask = PAGE_SIZE - 1; /* Sane caches */ + +EXPORT_SYMBOL(shm_align_mask); + +#define COLOUR_ALIGN(addr,pgoff) \ + ((((addr) + shm_align_mask) & ~shm_align_mask) + \ + (((pgoff) << PAGE_SHIFT) & shm_align_mask)) + +unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, + unsigned long len, unsigned long pgoff, unsigned long flags) +{ + struct vm_area_struct * vmm; + int do_color_align; + + if (len > TASK_SIZE) + return -ENOMEM; + + if (flags & MAP_FIXED) { + /* Even MAP_FIXED mappings must reside within TASK_SIZE. */ + if (TASK_SIZE - len < addr) + return -EINVAL; + + /* + * We do not accept a shared mapping if it would violate + * cache aliasing constraints. + */ + if ((flags & MAP_SHARED) && + ((addr - (pgoff << PAGE_SHIFT)) & shm_align_mask)) + return -EINVAL; + return addr; + } + + do_color_align = 0; + if (filp || (flags & MAP_SHARED)) + do_color_align = 1; + if (addr) { + if (do_color_align) + addr = COLOUR_ALIGN(addr, pgoff); + else + addr = PAGE_ALIGN(addr); + vmm = find_vma(current->mm, addr); + if (TASK_SIZE - len >= addr && + (!vmm || addr + len <= vmm->vm_start)) + return addr; + } + addr = current->mm->mmap_base; + if (do_color_align) + addr = COLOUR_ALIGN(addr, pgoff); + else + addr = PAGE_ALIGN(addr); + + for (vmm = find_vma(current->mm, addr); ; vmm = vmm->vm_next) { + /* At this point: (!vmm || addr < vmm->vm_end). */ + if (TASK_SIZE - len < addr) + return -ENOMEM; + if (!vmm || addr + len <= vmm->vm_start) + return addr; + addr = vmm->vm_end; + if (do_color_align) + addr = COLOUR_ALIGN(addr, pgoff); + } +} + +void arch_pick_mmap_layout(struct mm_struct *mm) +{ + unsigned long random_factor = 0UL; + + if (current->flags & PF_RANDOMIZE) { + random_factor = get_random_int(); + random_factor = random_factor << PAGE_SHIFT; + if (TASK_IS_32BIT_ADDR) + random_factor &= 0xfffffful; + else + random_factor &= 0xffffffful; + } + + mm->mmap_base = TASK_UNMAPPED_BASE + random_factor; + mm->get_unmapped_area = arch_get_unmapped_area; + mm->unmap_area = arch_unmap_area; +} + +static inline unsigned long brk_rnd(void) +{ + unsigned long rnd = get_random_int(); + + rnd = rnd << PAGE_SHIFT; + /* 8MB for 32bit, 256MB for 64bit */ + if (TASK_IS_32BIT_ADDR) + rnd = rnd & 0x7ffffful; + else + rnd = rnd & 0xffffffful; + + return rnd; +} + +unsigned long arch_randomize_brk(struct mm_struct *mm) +{ + unsigned long base = mm->brk; + unsigned long ret; + + ret = PAGE_ALIGN(base + brk_rnd()); + + if (ret < mm->brk) + return mm->brk; + + return ret; +} -- cgit v1.1 From d6971822c288ce5547190c6f812cc978d6520777 Mon Sep 17 00:00:00 2001 From: Michal Marek Date: Tue, 17 May 2011 15:36:46 +0200 Subject: ftrace/kbuild: Add recordmcount files to force full build Modifications to recordmcount must be performed on all object files to stay consistent with what the kernel code may expect. Add the recordmcount files to the main dependencies to make sure any change to them causes a full recompile. Signed-off-by: Michal Marek Link: http://lkml.kernel.org/r/20110517133646.GP13293@sepie.suse.cz Signed-off-by: Steven Rostedt --- scripts/Makefile.build | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/scripts/Makefile.build b/scripts/Makefile.build index fdca952..6165622 100644 --- a/scripts/Makefile.build +++ b/scripts/Makefile.build @@ -255,6 +255,8 @@ sub_cmd_record_mcount = \ if [ $(@) != "scripts/mod/empty.o" ]; then \ $(objtree)/scripts/recordmcount $(RECORDMCOUNT_FLAGS) "$(@)"; \ fi; +recordmcount_source := $(srctree)/scripts/recordmcount.c \ + $(srctree)/scripts/recordmcount.h else sub_cmd_record_mcount = set -e ; perl $(srctree)/scripts/recordmcount.pl "$(ARCH)" \ "$(if $(CONFIG_CPU_BIG_ENDIAN),big,little)" \ @@ -262,6 +264,7 @@ sub_cmd_record_mcount = set -e ; perl $(srctree)/scripts/recordmcount.pl "$(ARCH "$(OBJDUMP)" "$(OBJCOPY)" "$(CC) $(KBUILD_CFLAGS)" \ "$(LD)" "$(NM)" "$(RM)" "$(MV)" \ "$(if $(part-of-module),1,0)" "$(@)"; +recordmcount_source := $(srctree)/scripts/recordmcount.pl endif cmd_record_mcount = \ if [ "$(findstring -pg,$(_c_flags))" = "-pg" ]; then \ @@ -282,13 +285,13 @@ define rule_cc_o_c endef # Built-in and composite module parts -$(obj)/%.o: $(src)/%.c FORCE +$(obj)/%.o: $(src)/%.c $(recordmcount_source) FORCE $(call cmd,force_checksrc) $(call if_changed_rule,cc_o_c) # Single-part modules are special since we need to mark them in $(MODVERDIR) -$(single-used-m): $(obj)/%.o: $(src)/%.c FORCE +$(single-used-m): $(obj)/%.o: $(src)/%.c $(recordmcount_source) FORCE $(call cmd,force_checksrc) $(call if_changed_rule,cc_o_c) @{ echo $(@:.o=.ko); echo $@; } > $(MODVERDIR)/$(@F:.o=.mod) -- cgit v1.1 From 369db4c9524b7487faf1ff89646eee396c1363e1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 18 May 2011 21:33:40 +0000 Subject: clocksource: Restructure clocksource struct members Group the hot path members of struct clocksource together so we have a better cache line footprint. Make it cacheline aligned. Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Eric Dumazet Reviewed-by: Ingo Molnar Link: http://lkml.kernel.org/r/%3C20110518210136.003081882%40linutronix.de%3E --- include/linux/clocksource.h | 32 ++++++++++++++------------------ 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index 0fb0b7e..c918fbd 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -159,42 +159,38 @@ extern u64 timecounter_cyc2time(struct timecounter *tc, */ struct clocksource { /* - * First part of structure is read mostly + * Hotpath data, fits in a single cache line when the + * clocksource itself is cacheline aligned. */ - const char *name; - struct list_head list; - int rating; cycle_t (*read)(struct clocksource *cs); - int (*enable)(struct clocksource *cs); - void (*disable)(struct clocksource *cs); + cycle_t cycle_last; cycle_t mask; u32 mult; u32 shift; u64 max_idle_ns; - unsigned long flags; - cycle_t (*vread)(void); - void (*suspend)(struct clocksource *cs); - void (*resume)(struct clocksource *cs); + #ifdef CONFIG_IA64 void *fsys_mmio; /* used by fsyscall asm code */ #define CLKSRC_FSYS_MMIO_SET(mmio, addr) ((mmio) = (addr)) #else #define CLKSRC_FSYS_MMIO_SET(mmio, addr) do { } while (0) #endif - - /* - * Second part is written at each timer interrupt - * Keep it in a different cache line to dirty no - * more than one cache line. - */ - cycle_t cycle_last ____cacheline_aligned_in_smp; + const char *name; + struct list_head list; + int rating; + cycle_t (*vread)(void); + int (*enable)(struct clocksource *cs); + void (*disable)(struct clocksource *cs); + unsigned long flags; + void (*suspend)(struct clocksource *cs); + void (*resume)(struct clocksource *cs); #ifdef CONFIG_CLOCKSOURCE_WATCHDOG /* Watchdog related data, used by the framework */ struct list_head wd_list; cycle_t wd_last; #endif -}; +} ____cacheline_aligned; /* * Clock source flags bits:: -- cgit v1.1 From 724ed53e8ac2c5278af8955673049714c1073464 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 18 May 2011 21:33:40 +0000 Subject: clocksource: Get rid of the hardcoded 5 seconds sleep time limit Slow clocksources can have a way longer sleep time than 5 seconds and even fast ones can easily cope with 600 seconds and still maintain proper accuracy. Signed-off-by: Thomas Gleixner Cc: John Stultz Reviewed-by: Ingo Molnar Link: http://lkml.kernel.org/r/%3C20110518210136.109811585%40linutronix.de%3E --- kernel/time/clocksource.c | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 6519cf62..6dbbbb1 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -626,19 +626,6 @@ static void clocksource_enqueue(struct clocksource *cs) list_add(&cs->list, entry); } - -/* - * Maximum time we expect to go between ticks. This includes idle - * tickless time. It provides the trade off between selecting a - * mult/shift pair that is very precise but can only handle a short - * period of time, vs. a mult/shift pair that can handle long periods - * of time but isn't as precise. - * - * This is a subsystem constant, and actual hardware limitations - * may override it (ie: clocksources that wrap every 3 seconds). - */ -#define MAX_UPDATE_LENGTH 5 /* Seconds */ - /** * __clocksource_updatefreq_scale - Used update clocksource with new freq * @t: clocksource to be registered @@ -652,15 +639,28 @@ static void clocksource_enqueue(struct clocksource *cs) */ void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq) { + unsigned long sec; + /* - * Ideally we want to use some of the limits used in - * clocksource_max_deferment, to provide a more informed - * MAX_UPDATE_LENGTH. But for now this just gets the - * register interface working properly. + * Calc the maximum number of seconds which we can run before + * wrapping around. For clocksources which have a mask > 32bit + * we need to limit the max sleep time to have a good + * conversion precision. 10 minutes is still a reasonable + * amount. That results in a shift value of 24 for a + * clocksource with mask >= 40bit and f >= 4GHz. That maps to + * ~ 0.06ppm granularity for NTP. We apply the same 12.5% + * margin as we do in clocksource_max_deferment() */ + sec = (cs->mask - (cs->mask >> 5)); + do_div(sec, freq); + do_div(sec, scale); + if (!sec) + sec = 1; + else if (sec > 600 && cs->mask > UINT_MAX) + sec = 600; + clocks_calc_mult_shift(&cs->mult, &cs->shift, freq, - NSEC_PER_SEC/scale, - MAX_UPDATE_LENGTH*scale); + NSEC_PER_SEC / scale, sec * scale); cs->max_idle_ns = clocksource_max_deferment(cs); } EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale); -- cgit v1.1 From 847b2f42be203f3cff7f243fdd3ee50c1e06c882 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 18 May 2011 21:33:41 +0000 Subject: clockevents: Restructure clock_event_device members Group the hot path members of struct clock_event_device together so we have a better cache line footprint. Make it cacheline aligned. Signed-off-by: Thomas Gleixner Cc: John Stultz Reviewed-by: Ingo Molnar Link: http://lkml.kernel.org/r/%3C20110518210136.223607682%40linutronix.de%3E --- include/linux/clockchips.h | 45 +++++++++++++++++++++++---------------------- 1 file changed, 23 insertions(+), 22 deletions(-) diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h index fc53492..9466eeb 100644 --- a/include/linux/clockchips.h +++ b/include/linux/clockchips.h @@ -56,46 +56,47 @@ enum clock_event_nofitiers { /** * struct clock_event_device - clock event device descriptor - * @name: ptr to clock event name - * @features: features + * @event_handler: Assigned by the framework to be called by the low + * level handler of the event source + * @set_next_event: set next event function + * @next_event: local storage for the next event in oneshot mode * @max_delta_ns: maximum delta value in ns * @min_delta_ns: minimum delta value in ns * @mult: nanosecond to cycles multiplier * @shift: nanoseconds to cycles divisor (power of two) + * @mode: operating mode assigned by the management code + * @features: features + * @retries: number of forced programming retries + * @set_mode: set mode function + * @broadcast: function to broadcast events + * @name: ptr to clock event name * @rating: variable to rate clock event devices * @irq: IRQ number (only for non CPU local devices) * @cpumask: cpumask to indicate for which CPUs this device works - * @set_next_event: set next event function - * @set_mode: set mode function - * @event_handler: Assigned by the framework to be called by the low - * level handler of the event source - * @broadcast: function to broadcast events * @list: list head for the management code - * @mode: operating mode assigned by the management code - * @next_event: local storage for the next event in oneshot mode - * @retries: number of forced programming retries */ struct clock_event_device { - const char *name; - unsigned int features; + void (*event_handler)(struct clock_event_device *); + int (*set_next_event)(unsigned long evt, + struct clock_event_device *); + ktime_t next_event; u64 max_delta_ns; u64 min_delta_ns; u32 mult; u32 shift; + enum clock_event_mode mode; + unsigned int features; + unsigned long retries; + + void (*broadcast)(const struct cpumask *mask); + void (*set_mode)(enum clock_event_mode mode, + struct clock_event_device *); + const char *name; int rating; int irq; const struct cpumask *cpumask; - int (*set_next_event)(unsigned long evt, - struct clock_event_device *); - void (*set_mode)(enum clock_event_mode mode, - struct clock_event_device *); - void (*event_handler)(struct clock_event_device *); - void (*broadcast)(const struct cpumask *mask); struct list_head list; - enum clock_event_mode mode; - ktime_t next_event; - unsigned long retries; -}; +} ____cacheline_aligned; /* * Calculate a multiplication factor for scaled math, which is used to convert -- cgit v1.1 From 57f0fcbe1dea8a36c9d1673086326059991c5f81 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 18 May 2011 21:33:41 +0000 Subject: clockevents: Provide combined configure and register function All clockevent devices have the same open coded initialization functions. Provide an interface which does all necessary initialization in the core code. Signed-off-by: Thomas Gleixner Cc: John Stultz Reviewed-by: Ingo Molnar Link: http://lkml.kernel.org/r/%3C20110518210136.331975870%40linutronix.de%3E --- include/linux/clockchips.h | 9 +++++++++ kernel/time/clockevents.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+) diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h index 9466eeb..80acc79e 100644 --- a/include/linux/clockchips.h +++ b/include/linux/clockchips.h @@ -69,6 +69,8 @@ enum clock_event_nofitiers { * @retries: number of forced programming retries * @set_mode: set mode function * @broadcast: function to broadcast events + * @min_delta_ticks: minimum delta value in ticks stored for reconfiguration + * @max_delta_ticks: maximum delta value in ticks stored for reconfiguration * @name: ptr to clock event name * @rating: variable to rate clock event devices * @irq: IRQ number (only for non CPU local devices) @@ -91,6 +93,9 @@ struct clock_event_device { void (*broadcast)(const struct cpumask *mask); void (*set_mode)(enum clock_event_mode mode, struct clock_event_device *); + unsigned long min_delta_ticks; + unsigned long max_delta_ticks; + const char *name; int rating; int irq; @@ -123,6 +128,10 @@ extern u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt); extern void clockevents_register_device(struct clock_event_device *dev); +extern void clockevents_config_and_register(struct clock_event_device *dev, + u32 freq, unsigned long min_delta, + unsigned long max_delta); + extern void clockevents_exchange_device(struct clock_event_device *old, struct clock_event_device *new); extern void clockevents_set_mode(struct clock_event_device *dev, diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 0d74b9b..c69e88c 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -194,6 +194,50 @@ void clockevents_register_device(struct clock_event_device *dev) } EXPORT_SYMBOL_GPL(clockevents_register_device); +static void clockevents_config(struct clock_event_device *dev, + u32 freq) +{ + unsigned long sec; + + if (!(dev->features & CLOCK_EVT_FEAT_ONESHOT)) + return; + + /* + * Calculate the maximum number of seconds we can sleep. Limit + * to 10 minutes for hardware which can program more than + * 32bit ticks so we still get reasonable conversion values. + */ + sec = dev->max_delta_ticks; + do_div(sec, freq); + if (!sec) + sec = 1; + else if (sec > 600 && dev->max_delta_ticks > UINT_MAX) + sec = 600; + + clockevents_calc_mult_shift(dev, freq, sec); + dev->min_delta_ns = clockevent_delta2ns(dev->min_delta_ticks, dev); + dev->max_delta_ns = clockevent_delta2ns(dev->max_delta_ticks, dev); +} + +/** + * clockevents_config_and_register - Configure and register a clock event device + * @dev: device to register + * @freq: The clock frequency + * @min_delta: The minimum clock ticks to program in oneshot mode + * @max_delta: The maximum clock ticks to program in oneshot mode + * + * min/max_delta can be 0 for devices which do not support oneshot mode. + */ +void clockevents_config_and_register(struct clock_event_device *dev, + u32 freq, unsigned long min_delta, + unsigned long max_delta) +{ + dev->min_delta_ticks = min_delta; + dev->max_delta_ticks = max_delta; + clockevents_config(dev, freq); + clockevents_register_device(dev); +} + /* * Noop handler when we shut down an event device */ -- cgit v1.1 From 80b816b736cfa5b9582279127099b20a479ab7d9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 18 May 2011 21:33:42 +0000 Subject: clockevents: Provide interface to reconfigure an active clock event device Some ARM SoCs have clock event devices which have their frequency modified due to frequency scaling. Provide an interface which allows to reconfigure an active device. After reconfiguration reprogram the current pending event. Signed-off-by: Thomas Gleixner Cc: LAK Cc: John Stultz Acked-by: Linus Walleij Reviewed-by: Ingo Molnar Link: http://lkml.kernel.org/r/%3C20110518210136.437459958%40linutronix.de%3E --- include/linux/clockchips.h | 2 ++ kernel/time/clockevents.c | 20 ++++++++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h index 80acc79e..d6733e2 100644 --- a/include/linux/clockchips.h +++ b/include/linux/clockchips.h @@ -132,6 +132,8 @@ extern void clockevents_config_and_register(struct clock_event_device *dev, u32 freq, unsigned long min_delta, unsigned long max_delta); +extern int clockevents_update_freq(struct clock_event_device *ce, u32 freq); + extern void clockevents_exchange_device(struct clock_event_device *old, struct clock_event_device *new); extern void clockevents_set_mode(struct clock_event_device *dev, diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index c69e88c..22a9da9 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -238,6 +238,26 @@ void clockevents_config_and_register(struct clock_event_device *dev, clockevents_register_device(dev); } +/** + * clockevents_update_freq - Update frequency and reprogram a clock event device. + * @dev: device to modify + * @freq: new device frequency + * + * Reconfigure and reprogram a clock event device in oneshot + * mode. Must be called on the cpu for which the device delivers per + * cpu timer events with interrupts disabled! Returns 0 on success, + * -ETIME when the event is in the past. + */ +int clockevents_update_freq(struct clock_event_device *dev, u32 freq) +{ + clockevents_config(dev, freq); + + if (dev->mode != CLOCK_EVT_MODE_ONESHOT) + return 0; + + return clockevents_program_event(dev, dev->next_event, ktime_get()); +} + /* * Noop handler when we shut down an event device */ -- cgit v1.1 From 61ee9a4ba05f0a4163d43a33dee7a0651e080b98 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 18 May 2011 21:33:42 +0000 Subject: x86: Convert PIT to clockevents_config_and_register() Let the core do the work. Signed-off-by: Thomas Gleixner Cc: John Stultz Reviewed-by: Ingo Molnar Link: http://lkml.kernel.org/r/%3C20110518210136.545615675%40linutronix.de%3E --- arch/x86/kernel/i8253.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c index 577e90c..fb66dc9 100644 --- a/arch/x86/kernel/i8253.c +++ b/arch/x86/kernel/i8253.c @@ -93,7 +93,6 @@ static struct clock_event_device pit_ce = { .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT, .set_mode = init_pit_timer, .set_next_event = pit_next_event, - .shift = 32, .irq = 0, }; @@ -108,11 +107,8 @@ void __init setup_pit_timer(void) * IO_APIC has been initialized. */ pit_ce.cpumask = cpumask_of(smp_processor_id()); - pit_ce.mult = div_sc(CLOCK_TICK_RATE, NSEC_PER_SEC, pit_ce.shift); - pit_ce.max_delta_ns = clockevent_delta2ns(0x7FFF, &pit_ce); - pit_ce.min_delta_ns = clockevent_delta2ns(0xF, &pit_ce); - clockevents_register_device(&pit_ce); + clockevents_config_and_register(&pit_ce, CLOCK_TICK_RATE, 0xF, 0x7FFF); global_clock_event = &pit_ce; } -- cgit v1.1 From ab0e08f15d23628dd8d50bf6ce1a935a8840c7dc Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 18 May 2011 21:33:43 +0000 Subject: x86: hpet: Cleanup the clockevents init and register code No need to recalculate the frequency and the conversion factors over and over. Calculate the frequency once and use the new config/register interface and let the core code do the math. Signed-off-by: Thomas Gleixner Cc: John Stultz Reviewed-by: Ingo Molnar Link: http://lkml.kernel.org/r/%3C20110518210136.646482357%40linutronix.de%3E --- arch/x86/kernel/hpet.c | 72 +++++++++++--------------------------------------- 1 file changed, 16 insertions(+), 56 deletions(-) diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index bfe8f72..6781765 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -217,7 +217,7 @@ static void hpet_reserve_platform_timers(unsigned int id) { } /* * Common hpet info */ -static unsigned long hpet_period; +static unsigned long hpet_freq; static void hpet_legacy_set_mode(enum clock_event_mode mode, struct clock_event_device *evt); @@ -232,7 +232,6 @@ static struct clock_event_device hpet_clockevent = { .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT, .set_mode = hpet_legacy_set_mode, .set_next_event = hpet_legacy_next_event, - .shift = 32, .irq = 0, .rating = 50, }; @@ -290,28 +289,12 @@ static void hpet_legacy_clockevent_register(void) hpet_enable_legacy_int(); /* - * The mult factor is defined as (include/linux/clockchips.h) - * mult/2^shift = cyc/ns (in contrast to ns/cyc in clocksource.h) - * hpet_period is in units of femtoseconds (per cycle), so - * mult/2^shift = cyc/ns = 10^6/hpet_period - * mult = (10^6 * 2^shift)/hpet_period - * mult = (FSEC_PER_NSEC << hpet_clockevent.shift)/hpet_period - */ - hpet_clockevent.mult = div_sc((unsigned long) FSEC_PER_NSEC, - hpet_period, hpet_clockevent.shift); - /* Calculate the min / max delta */ - hpet_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF, - &hpet_clockevent); - /* Setup minimum reprogramming delta. */ - hpet_clockevent.min_delta_ns = clockevent_delta2ns(HPET_MIN_PROG_DELTA, - &hpet_clockevent); - - /* * Start hpet with the boot cpu mask and make it * global after the IO_APIC has been initialized. */ hpet_clockevent.cpumask = cpumask_of(smp_processor_id()); - clockevents_register_device(&hpet_clockevent); + clockevents_config_and_register(&hpet_clockevent, hpet_freq, + HPET_MIN_PROG_DELTA, 0x7FFFFFFF); global_clock_event = &hpet_clockevent; printk(KERN_DEBUG "hpet clockevent registered\n"); } @@ -549,7 +532,6 @@ static int hpet_setup_irq(struct hpet_dev *dev) static void init_one_hpet_msi_clockevent(struct hpet_dev *hdev, int cpu) { struct clock_event_device *evt = &hdev->evt; - uint64_t hpet_freq; WARN_ON(cpu != smp_processor_id()); if (!(hdev->flags & HPET_DEV_VALID)) @@ -571,24 +553,10 @@ static void init_one_hpet_msi_clockevent(struct hpet_dev *hdev, int cpu) evt->set_mode = hpet_msi_set_mode; evt->set_next_event = hpet_msi_next_event; - evt->shift = 32; - - /* - * The period is a femto seconds value. We need to calculate the - * scaled math multiplication factor for nanosecond to hpet tick - * conversion. - */ - hpet_freq = FSEC_PER_SEC; - do_div(hpet_freq, hpet_period); - evt->mult = div_sc((unsigned long) hpet_freq, - NSEC_PER_SEC, evt->shift); - /* Calculate the max delta */ - evt->max_delta_ns = clockevent_delta2ns(0x7FFFFFFF, evt); - /* 5 usec minimum reprogramming delta. */ - evt->min_delta_ns = 5000; - evt->cpumask = cpumask_of(hdev->cpu); - clockevents_register_device(evt); + + clockevents_config_and_register(evt, hpet_freq, HPET_MIN_PROG_DELTA, + 0x7FFFFFFF); } #ifdef CONFIG_HPET @@ -792,7 +760,6 @@ static struct clocksource clocksource_hpet = { static int hpet_clocksource_register(void) { u64 start, now; - u64 hpet_freq; cycle_t t1; /* Start the counter */ @@ -819,24 +786,7 @@ static int hpet_clocksource_register(void) return -ENODEV; } - /* - * The definition of mult is (include/linux/clocksource.h) - * mult/2^shift = ns/cyc and hpet_period is in units of fsec/cyc - * so we first need to convert hpet_period to ns/cyc units: - * mult/2^shift = ns/cyc = hpet_period/10^6 - * mult = (hpet_period * 2^shift)/10^6 - * mult = (hpet_period << shift)/FSEC_PER_NSEC - */ - - /* Need to convert hpet_period (fsec/cyc) to cyc/sec: - * - * cyc/sec = FSEC_PER_SEC/hpet_period(fsec/cyc) - * cyc/sec = (FSEC_PER_NSEC * NSEC_PER_SEC)/hpet_period - */ - hpet_freq = FSEC_PER_SEC; - do_div(hpet_freq, hpet_period); clocksource_register_hz(&clocksource_hpet, (u32)hpet_freq); - return 0; } @@ -845,7 +795,9 @@ static int hpet_clocksource_register(void) */ int __init hpet_enable(void) { + unsigned long hpet_period; unsigned int id; + u64 freq; int i; if (!is_hpet_capable()) @@ -884,6 +836,14 @@ int __init hpet_enable(void) goto out_nohpet; /* + * The period is a femto seconds value. Convert it to a + * frequency. + */ + freq = FSEC_PER_SEC; + do_div(freq, hpet_period); + hpet_freq = freq; + + /* * Read the HPET ID register to retrieve the IRQ routing * information and the number of channels */ -- cgit v1.1 From 2cba3ffb9a9db3874304a1739002d053d53c738b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 19 May 2011 13:30:56 +0200 Subject: perf stat: Add -d -d and -d -d -d options to show more CPU events Print even more detailed statistics if requested via perf stat -d: -d: detailed events, L1 and LLC data cache -d -d: more detailed events, dTLB and iTLB events -d -d -d: very detailed events, adding prefetch events Full output looks like this now: Performance counter stats for '/home/mingo/hackbench 10' (5 runs): 1703.674707 task-clock # 8.709 CPUs utilized ( +- 4.19% ) 49,068 context-switches # 0.029 M/sec ( +- 16.66% ) 8,303 CPU-migrations # 0.005 M/sec ( +- 24.90% ) 17,397 page-faults # 0.010 M/sec ( +- 0.46% ) 2,345,389,239 cycles # 1.377 GHz ( +- 4.61% ) [55.90%] 1,884,503,527 stalled-cycles-frontend # 80.35% frontend cycles idle ( +- 5.67% ) [50.39%] 743,919,737 stalled-cycles-backend # 31.72% backend cycles idle ( +- 8.75% ) [49.91%] 1,314,416,379 instructions # 0.56 insns per cycle # 1.43 stalled cycles per insn ( +- 2.53% ) [60.87%] 272,592,567 branches # 160.003 M/sec ( +- 1.74% ) [56.56%] 3,794,846 branch-misses # 1.39% of all branches ( +- 6.59% ) [58.50%] 449,982,778 L1-dcache-loads # 264.125 M/sec ( +- 2.47% ) [49.88%] 22,404,961 L1-dcache-load-misses # 4.98% of all L1-dcache hits ( +- 6.08% ) [55.05%] 6,204,750 LLC-loads # 3.642 M/sec ( +- 8.91% ) [43.75%] 1,837,411 LLC-load-misses # 1.078 M/sec ( +- 7.27% ) [12.07%] 411,440,421 L1-icache-loads # 241.502 M/sec ( +- 5.60% ) [36.52%] 27,556,832 L1-icache-load-misses # 16.175 M/sec ( +- 7.46% ) [46.72%] 464,067,627 dTLB-loads # 272.392 M/sec ( +- 4.46% ) [54.17%] 10,765,648 dTLB-load-misses # 6.319 M/sec ( +- 3.18% ) [48.68%] 1,273,080,386 iTLB-loads # 747.256 M/sec ( +- 3.38% ) [47.53%] 117,481 iTLB-load-misses # 0.069 M/sec ( +- 14.99% ) [47.01%] 4,590,653 L1-dcache-prefetches # 2.695 M/sec ( +- 4.49% ) [46.19%] 1,712,660 L1-dcache-prefetch-misses # 1.005 M/sec ( +- 3.75% ) [44.82%] 0.195622057 seconds time elapsed ( +- 6.84% ) Also clean up the attribute construction code to be appending, and factor it out into add_default_attributes(). Tweak the coverage percentage printout a bit, so that it's easier to view it alongside the +- sttddev colum. Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: Mike Galbraith Cc: Steven Rostedt Link: http://lkml.kernel.org/n/tip-to3kgu04449s64062val8b62@git.kernel.org Signed-off-by: Ingo Molnar --- tools/perf/builtin-stat.c | 209 ++++++++++++++++++++++++++++++++++------------ 1 file changed, 154 insertions(+), 55 deletions(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 602c3c9..a89fc08 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -6,24 +6,28 @@ * * Sample output: - $ perf stat ~/hackbench 10 - Time: 0.104 + $ perf stat ./hackbench 10 - Performance counter stats for '/home/mingo/hackbench': + Time: 0.118 - 1255.538611 task clock ticks # 10.143 CPU utilization factor - 54011 context switches # 0.043 M/sec - 385 CPU migrations # 0.000 M/sec - 17755 pagefaults # 0.014 M/sec - 3808323185 CPU cycles # 3033.219 M/sec - 1575111190 instructions # 1254.530 M/sec - 17367895 cache references # 13.833 M/sec - 7674421 cache misses # 6.112 M/sec + Performance counter stats for './hackbench 10': - Wall-clock time elapsed: 123.786620 msecs + 1708.761321 task-clock # 11.037 CPUs utilized + 41,190 context-switches # 0.024 M/sec + 6,735 CPU-migrations # 0.004 M/sec + 17,318 page-faults # 0.010 M/sec + 5,205,202,243 cycles # 3.046 GHz + 3,856,436,920 stalled-cycles-frontend # 74.09% frontend cycles idle + 1,600,790,871 stalled-cycles-backend # 30.75% backend cycles idle + 2,603,501,247 instructions # 0.50 insns per cycle + # 1.48 stalled cycles per insn + 484,357,498 branches # 283.455 M/sec + 6,388,934 branch-misses # 1.32% of all branches + + 0.154822978 seconds time elapsed * - * Copyright (C) 2008, Red Hat Inc, Ingo Molnar + * Copyright (C) 2008-2011, Red Hat Inc, Ingo Molnar * * Improvements and fixes by: * @@ -75,22 +79,10 @@ static struct perf_event_attr default_attrs[] = { }; /* - * Detailed stats: + * Detailed stats (-d), covering the L1 and last level data caches: */ static struct perf_event_attr detailed_attrs[] = { - { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_TASK_CLOCK }, - { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CONTEXT_SWITCHES }, - { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CPU_MIGRATIONS }, - { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_PAGE_FAULTS }, - - { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES }, - { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_STALLED_CYCLES_FRONTEND }, - { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_STALLED_CYCLES_BACKEND }, - { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_INSTRUCTIONS }, - { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_INSTRUCTIONS }, - { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_MISSES }, - { .type = PERF_TYPE_HW_CACHE, .config = PERF_COUNT_HW_CACHE_L1D << 0 | @@ -116,6 +108,69 @@ static struct perf_event_attr detailed_attrs[] = { (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) }, }; +/* + * Very detailed stats (-d -d), covering the instruction cache and the TLB caches: + */ +static struct perf_event_attr very_detailed_attrs[] = { + + { .type = PERF_TYPE_HW_CACHE, + .config = + PERF_COUNT_HW_CACHE_L1I << 0 | + (PERF_COUNT_HW_CACHE_OP_READ << 8) | + (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) }, + + { .type = PERF_TYPE_HW_CACHE, + .config = + PERF_COUNT_HW_CACHE_L1I << 0 | + (PERF_COUNT_HW_CACHE_OP_READ << 8) | + (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) }, + + { .type = PERF_TYPE_HW_CACHE, + .config = + PERF_COUNT_HW_CACHE_DTLB << 0 | + (PERF_COUNT_HW_CACHE_OP_READ << 8) | + (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) }, + + { .type = PERF_TYPE_HW_CACHE, + .config = + PERF_COUNT_HW_CACHE_DTLB << 0 | + (PERF_COUNT_HW_CACHE_OP_READ << 8) | + (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) }, + + { .type = PERF_TYPE_HW_CACHE, + .config = + PERF_COUNT_HW_CACHE_ITLB << 0 | + (PERF_COUNT_HW_CACHE_OP_READ << 8) | + (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) }, + + { .type = PERF_TYPE_HW_CACHE, + .config = + PERF_COUNT_HW_CACHE_ITLB << 0 | + (PERF_COUNT_HW_CACHE_OP_READ << 8) | + (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) }, + +}; + +/* + * Very, very detailed stats (-d -d -d), adding prefetch events: + */ +static struct perf_event_attr very_very_detailed_attrs[] = { + + { .type = PERF_TYPE_HW_CACHE, + .config = + PERF_COUNT_HW_CACHE_L1D << 0 | + (PERF_COUNT_HW_CACHE_OP_PREFETCH << 8) | + (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) }, + + { .type = PERF_TYPE_HW_CACHE, + .config = + PERF_COUNT_HW_CACHE_L1D << 0 | + (PERF_COUNT_HW_CACHE_OP_PREFETCH << 8) | + (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) }, +}; + + + struct perf_evlist *evsel_list; static bool system_wide = false; @@ -129,7 +184,7 @@ static pid_t target_pid = -1; static pid_t target_tid = -1; static pid_t child_pid = -1; static bool null_run = false; -static bool detailed_run = false; +static int detailed_run = 0; static bool sync_run = false; static bool big_num = true; static int big_num_opt = -1; @@ -464,7 +519,7 @@ static void nsec_printout(int cpu, struct perf_evsel *evsel, double avg) { double msecs = avg / 1e6; char cpustr[16] = { '\0', }; - const char *fmt = csv_output ? "%s%.6f%s%s" : "%s%18.6f%s%-24s"; + const char *fmt = csv_output ? "%s%.6f%s%s" : "%s%18.6f%s%-25s"; if (no_aggr) sprintf(cpustr, "CPU%*d%s", @@ -584,9 +639,9 @@ static void abs_printout(int cpu, struct perf_evsel *evsel, double avg) if (csv_output) fmt = "%s%.0f%s%s"; else if (big_num) - fmt = "%s%'18.0f%s%-24s"; + fmt = "%s%'18.0f%s%-25s"; else - fmt = "%s%18.0f%s%-24s"; + fmt = "%s%18.0f%s%-25s"; if (no_aggr) sprintf(cpustr, "CPU%*d%s", @@ -616,7 +671,7 @@ static void abs_printout(int cpu, struct perf_evsel *evsel, double avg) if (total && avg) { ratio = total / avg; - fprintf(stderr, "\n # %5.2f stalled cycles per insn", ratio); + fprintf(stderr, "\n # %5.2f stalled cycles per insn", ratio); } } else if (perf_evsel__match(evsel, HARDWARE, HW_BRANCH_MISSES) && @@ -704,7 +759,7 @@ static void print_counter_aggr(struct perf_evsel *counter) avg_enabled = avg_stats(&ps->res_stats[1]); avg_running = avg_stats(&ps->res_stats[2]); - fprintf(stderr, " (%.2f%%)", 100 * avg_running / avg_enabled); + fprintf(stderr, " [%5.2f%%]", 100 * avg_running / avg_enabled); } fprintf(stderr, "\n"); } @@ -854,7 +909,7 @@ static const struct option options[] = { "repeat command and print average + stddev (max: 100)"), OPT_BOOLEAN('n', "null", &null_run, "null run - dont start any counters"), - OPT_BOOLEAN('d', "detailed", &detailed_run, + OPT_INCR('d', "detailed", &detailed_run, "detailed run - start a lot of events"), OPT_BOOLEAN('S', "sync", &sync_run, "call sync() before starting a run"), @@ -873,6 +928,70 @@ static const struct option options[] = { OPT_END() }; +/* + * Add default attributes, if there were no attributes specified or + * if -d/--detailed, -d -d or -d -d -d is used: + */ +static int add_default_attributes(void) +{ + struct perf_evsel *pos; + size_t attr_nr = 0; + size_t c; + + /* Set attrs if no event is selected and !null_run: */ + if (null_run) + return 0; + + if (!evsel_list->nr_entries) { + for (c = 0; c < ARRAY_SIZE(default_attrs); c++) { + pos = perf_evsel__new(default_attrs + c, c + attr_nr); + if (pos == NULL) + return -1; + perf_evlist__add(evsel_list, pos); + } + attr_nr += c; + } + + /* Detailed events get appended to the event list: */ + + if (detailed_run < 1) + return 0; + + /* Append detailed run extra attributes: */ + for (c = 0; c < ARRAY_SIZE(detailed_attrs); c++) { + pos = perf_evsel__new(detailed_attrs + c, c + attr_nr); + if (pos == NULL) + return -1; + perf_evlist__add(evsel_list, pos); + } + attr_nr += c; + + if (detailed_run < 2) + return 0; + + /* Append very detailed run extra attributes: */ + for (c = 0; c < ARRAY_SIZE(very_detailed_attrs); c++) { + pos = perf_evsel__new(very_detailed_attrs + c, c + attr_nr); + if (pos == NULL) + return -1; + perf_evlist__add(evsel_list, pos); + } + + if (detailed_run < 3) + return 0; + + /* Append very, very detailed run extra attributes: */ + for (c = 0; c < ARRAY_SIZE(very_very_detailed_attrs); c++) { + pos = perf_evsel__new(very_very_detailed_attrs + c, c + attr_nr); + if (pos == NULL) + return -1; + perf_evlist__add(evsel_list, pos); + } + + + return 0; +} + int cmd_stat(int argc, const char **argv, const char *prefix __used) { struct perf_evsel *pos; @@ -918,28 +1037,8 @@ int cmd_stat(int argc, const char **argv, const char *prefix __used) usage_with_options(stat_usage, options); } - /* Set attrs and nr_counters if no event is selected and !null_run */ - if (detailed_run) { - size_t c; - - for (c = 0; c < ARRAY_SIZE(detailed_attrs); ++c) { - pos = perf_evsel__new(&detailed_attrs[c], c); - if (pos == NULL) - goto out; - perf_evlist__add(evsel_list, pos); - } - } - /* Set attrs and nr_counters if no event is selected and !null_run */ - if (!detailed_run && !null_run && !evsel_list->nr_entries) { - size_t c; - - for (c = 0; c < ARRAY_SIZE(default_attrs); ++c) { - pos = perf_evsel__new(&default_attrs[c], c); - if (pos == NULL) - goto out; - perf_evlist__add(evsel_list, pos); - } - } + if (add_default_attributes()) + goto out; if (target_pid != -1) target_tid = target_pid; -- cgit v1.1 From c3305257cd4df63e03e21e331a0140ae9c0faccc Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 19 May 2011 14:01:42 +0200 Subject: perf stat: Add more cache-miss percentage printouts Print out the cache-miss percentage as well if the cache refs were collected, for all the generic cache event types. Before: 11,103,723,230 dTLB-loads # 622.471 M/sec ( +- 0.30% ) 87,065,337 dTLB-load-misses # 4.881 M/sec ( +- 0.90% ) After: 11,353,713,242 dTLB-loads # 626.020 M/sec ( +- 0.35% ) 113,393,472 dTLB-load-misses # 1.00% of all dTLB cache hits ( +- 0.49% ) Also ASCII color highlight too high percentages, them when it's executed on the console. Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: Mike Galbraith Cc: Steven Rostedt Link: http://lkml.kernel.org/n/tip-lkhwxsevdbd9a8nymx0vxc3y@git.kernel.org Signed-off-by: Ingo Molnar --- tools/perf/builtin-stat.c | 138 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 136 insertions(+), 2 deletions(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index a89fc08..a9f0671 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -261,6 +261,10 @@ struct stats runtime_stalled_cycles_back_stats[MAX_NR_CPUS]; struct stats runtime_branches_stats[MAX_NR_CPUS]; struct stats runtime_cacherefs_stats[MAX_NR_CPUS]; struct stats runtime_l1_dcache_stats[MAX_NR_CPUS]; +struct stats runtime_l1_icache_stats[MAX_NR_CPUS]; +struct stats runtime_ll_cache_stats[MAX_NR_CPUS]; +struct stats runtime_itlb_cache_stats[MAX_NR_CPUS]; +struct stats runtime_dtlb_cache_stats[MAX_NR_CPUS]; struct stats walltime_nsecs_stats; static int create_perf_stat_counter(struct perf_evsel *evsel) @@ -317,6 +321,14 @@ static void update_shadow_stats(struct perf_evsel *counter, u64 *count) update_stats(&runtime_cacherefs_stats[0], count[0]); else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_L1D)) update_stats(&runtime_l1_dcache_stats[0], count[0]); + else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_L1I)) + update_stats(&runtime_l1_icache_stats[0], count[0]); + else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_LL)) + update_stats(&runtime_ll_cache_stats[0], count[0]); + else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_DTLB)) + update_stats(&runtime_dtlb_cache_stats[0], count[0]); + else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_ITLB)) + update_stats(&runtime_itlb_cache_stats[0], count[0]); } /* @@ -630,6 +642,98 @@ static void print_l1_dcache_misses(int cpu, struct perf_evsel *evsel __used, dou fprintf(stderr, " of all L1-dcache hits "); } +static void print_l1_icache_misses(int cpu, struct perf_evsel *evsel __used, double avg) +{ + double total, ratio = 0.0; + const char *color; + + total = avg_stats(&runtime_l1_icache_stats[cpu]); + + if (total) + ratio = avg / total * 100.0; + + color = PERF_COLOR_NORMAL; + if (ratio > 20.0) + color = PERF_COLOR_RED; + else if (ratio > 10.0) + color = PERF_COLOR_MAGENTA; + else if (ratio > 5.0) + color = PERF_COLOR_YELLOW; + + fprintf(stderr, " # "); + color_fprintf(stderr, color, "%6.2f%%", ratio); + fprintf(stderr, " of all L1-icache hits "); +} + +static void print_dtlb_cache_misses(int cpu, struct perf_evsel *evsel __used, double avg) +{ + double total, ratio = 0.0; + const char *color; + + total = avg_stats(&runtime_dtlb_cache_stats[cpu]); + + if (total) + ratio = avg / total * 100.0; + + color = PERF_COLOR_NORMAL; + if (ratio > 20.0) + color = PERF_COLOR_RED; + else if (ratio > 10.0) + color = PERF_COLOR_MAGENTA; + else if (ratio > 5.0) + color = PERF_COLOR_YELLOW; + + fprintf(stderr, " # "); + color_fprintf(stderr, color, "%6.2f%%", ratio); + fprintf(stderr, " of all dTLB cache hits "); +} + +static void print_itlb_cache_misses(int cpu, struct perf_evsel *evsel __used, double avg) +{ + double total, ratio = 0.0; + const char *color; + + total = avg_stats(&runtime_itlb_cache_stats[cpu]); + + if (total) + ratio = avg / total * 100.0; + + color = PERF_COLOR_NORMAL; + if (ratio > 20.0) + color = PERF_COLOR_RED; + else if (ratio > 10.0) + color = PERF_COLOR_MAGENTA; + else if (ratio > 5.0) + color = PERF_COLOR_YELLOW; + + fprintf(stderr, " # "); + color_fprintf(stderr, color, "%6.2f%%", ratio); + fprintf(stderr, " of all iTLB cache hits "); +} + +static void print_ll_cache_misses(int cpu, struct perf_evsel *evsel __used, double avg) +{ + double total, ratio = 0.0; + const char *color; + + total = avg_stats(&runtime_ll_cache_stats[cpu]); + + if (total) + ratio = avg / total * 100.0; + + color = PERF_COLOR_NORMAL; + if (ratio > 20.0) + color = PERF_COLOR_RED; + else if (ratio > 10.0) + color = PERF_COLOR_MAGENTA; + else if (ratio > 5.0) + color = PERF_COLOR_YELLOW; + + fprintf(stderr, " # "); + color_fprintf(stderr, color, "%6.2f%%", ratio); + fprintf(stderr, " of all LL-cache hits "); +} + static void abs_printout(int cpu, struct perf_evsel *evsel, double avg) { double total, ratio = 0.0; @@ -684,6 +788,34 @@ static void abs_printout(int cpu, struct perf_evsel *evsel, double avg) ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) && runtime_l1_dcache_stats[cpu].n != 0) { print_l1_dcache_misses(cpu, evsel, avg); + } else if ( + evsel->attr.type == PERF_TYPE_HW_CACHE && + evsel->attr.config == ( PERF_COUNT_HW_CACHE_L1I | + ((PERF_COUNT_HW_CACHE_OP_READ) << 8) | + ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) && + runtime_l1_icache_stats[cpu].n != 0) { + print_l1_icache_misses(cpu, evsel, avg); + } else if ( + evsel->attr.type == PERF_TYPE_HW_CACHE && + evsel->attr.config == ( PERF_COUNT_HW_CACHE_DTLB | + ((PERF_COUNT_HW_CACHE_OP_READ) << 8) | + ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) && + runtime_dtlb_cache_stats[cpu].n != 0) { + print_dtlb_cache_misses(cpu, evsel, avg); + } else if ( + evsel->attr.type == PERF_TYPE_HW_CACHE && + evsel->attr.config == ( PERF_COUNT_HW_CACHE_ITLB | + ((PERF_COUNT_HW_CACHE_OP_READ) << 8) | + ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) && + runtime_itlb_cache_stats[cpu].n != 0) { + print_itlb_cache_misses(cpu, evsel, avg); + } else if ( + evsel->attr.type == PERF_TYPE_HW_CACHE && + evsel->attr.config == ( PERF_COUNT_HW_CACHE_LL | + ((PERF_COUNT_HW_CACHE_OP_READ) << 8) | + ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) && + runtime_ll_cache_stats[cpu].n != 0) { + print_ll_cache_misses(cpu, evsel, avg); } else if (perf_evsel__match(evsel, HARDWARE, HW_CACHE_MISSES) && runtime_cacherefs_stats[cpu].n != 0) { total = avg_stats(&runtime_cacherefs_stats[cpu]); @@ -842,10 +974,12 @@ static void print_stat(int argc, const char **argv) } if (!csv_output) { - fprintf(stderr, "\n"); - fprintf(stderr, " %18.9f seconds time elapsed", + if (!null_run) + fprintf(stderr, "\n"); + fprintf(stderr, " %17.9f seconds time elapsed", avg_stats(&walltime_nsecs_stats)/1e9); if (run_count > 1) { + fprintf(stderr, " "); print_noise_pct(stddev_stats(&walltime_nsecs_stats), avg_stats(&walltime_nsecs_stats)); } -- cgit v1.1 From b87ba87ca26e226b2277a2d5613ed596f408e96d Mon Sep 17 00:00:00 2001 From: "Tian, Kevin" Date: Fri, 6 May 2011 14:43:36 +0800 Subject: x86: Skip migrating IRQF_PER_CPU irqs in fixup_irqs() IRQF_PER_CPU means that the irq cannot be moved away from a given cpu. So it must not be migrated when the cpu goes offline. [ tglx: massaged changelog ] Signed-off-by: Fengzhe Zhang Signed-off-by: Kevin Tian Cc: Ian Campbell Cc: Jan Beulich Cc: "xen-devel@lists.xensource.com" Link: http://lkml.kernel.org/r/%3C625BA99ED14B2D499DC4E29D8138F1505C8ED7F7E2%40shsmsx502.ccr.corp.intel.com%3E Signed-off-by: Thomas Gleixner --- arch/x86/kernel/irq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 1cb0b9f..544efe2 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -249,7 +249,7 @@ void fixup_irqs(void) data = irq_desc_get_irq_data(desc); affinity = data->affinity; - if (!irq_has_action(irq) || + if (!irq_has_action(irq) || irqd_is_per_cpu(data) || cpumask_subset(affinity, cpu_online_mask)) { raw_spin_unlock(&desc->lock); continue; -- cgit v1.1 From 983bbf1af0664b78689612b247acb514300f62c7 Mon Sep 17 00:00:00 2001 From: "Tian, Kevin" Date: Fri, 6 May 2011 14:43:56 +0800 Subject: x86: Don't unmask disabled irqs when migrating them It doesn't make sense to unconditionally unmask a disabled irq when migrating it from offlined cpu to another. If the irq triggers then it will be disabled in the interrupt handler anyway. So we can just avoid unmasking it. [ tglx: Made masking unconditional again and fixed the changelog ] Signed-off-by: Fengzhe Zhang Signed-off-by: Kevin Tian Cc: Ian Campbell Cc: Jan Beulich Cc: "xen-devel@lists.xensource.com" Link: http://lkml.kernel.org/r/%3C625BA99ED14B2D499DC4E29D8138F1505C8ED7F7E3%40shsmsx502.ccr.corp.intel.com%3E Signed-off-by: Thomas Gleixner --- arch/x86/kernel/irq.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 544efe2..6c0802e 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -276,7 +276,8 @@ void fixup_irqs(void) else if (!(warned++)) set_affinity = 0; - if (!irqd_can_move_in_process_context(data) && chip->irq_unmask) + if (!irqd_can_move_in_process_context(data) && + !irqd_irq_disabled(data) && chip->irq_unmask) chip->irq_unmask(data); raw_spin_unlock(&desc->lock); -- cgit v1.1 From 3f508953dd2ebcbcd32d28d0b6aad4d76980e722 Mon Sep 17 00:00:00 2001 From: Daniel Kiper Date: Thu, 12 May 2011 17:19:53 -0400 Subject: arch/x86/xen/mmu: Cleanup code/data sections definitions Cleanup code/data sections definitions accordingly to include/linux/init.h. Signed-off-by: Daniel Kiper [v1: Rebased on top of latest linus's to include fixes in mmu.c] Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/mmu.c | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 0684f3c..b5f776f 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1054,7 +1054,7 @@ void xen_mm_pin_all(void) * that's before we have page structures to store the bits. So do all * the book-keeping now. */ -static __init int xen_mark_pinned(struct mm_struct *mm, struct page *page, +static int __init xen_mark_pinned(struct mm_struct *mm, struct page *page, enum pt_level level) { SetPagePinned(page); @@ -1271,7 +1271,7 @@ void xen_exit_mmap(struct mm_struct *mm) spin_unlock(&mm->page_table_lock); } -static __init void xen_pagetable_setup_start(pgd_t *base) +static void __init xen_pagetable_setup_start(pgd_t *base) { } @@ -1291,7 +1291,7 @@ static __init void xen_mapping_pagetable_reserve(u64 start, u64 end) static void xen_post_allocator_init(void); -static __init void xen_pagetable_setup_done(pgd_t *base) +static void __init xen_pagetable_setup_done(pgd_t *base) { xen_setup_shared_info(); xen_post_allocator_init(); @@ -1488,7 +1488,7 @@ static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd) } #ifdef CONFIG_X86_32 -static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte) +static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte) { /* If there's an existing pte, then don't allow _PAGE_RW to be set */ if (pte_val_ma(*ptep) & _PAGE_PRESENT) @@ -1498,7 +1498,7 @@ static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte) return pte; } #else /* CONFIG_X86_64 */ -static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte) +static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte) { unsigned long pfn = pte_pfn(pte); @@ -1519,7 +1519,7 @@ static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte) /* Init-time set_pte while constructing initial pagetables, which doesn't allow RO pagetable pages to be remapped RW */ -static __init void xen_set_pte_init(pte_t *ptep, pte_t pte) +static void __init xen_set_pte_init(pte_t *ptep, pte_t pte) { pte = mask_rw_pte(ptep, pte); @@ -1537,7 +1537,7 @@ static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn) /* Early in boot, while setting up the initial pagetable, assume everything is pinned. */ -static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn) +static void __init xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn) { #ifdef CONFIG_FLATMEM BUG_ON(mem_map); /* should only be used early */ @@ -1547,7 +1547,7 @@ static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn) } /* Used for pmd and pud */ -static __init void xen_alloc_pmd_init(struct mm_struct *mm, unsigned long pfn) +static void __init xen_alloc_pmd_init(struct mm_struct *mm, unsigned long pfn) { #ifdef CONFIG_FLATMEM BUG_ON(mem_map); /* should only be used early */ @@ -1557,13 +1557,13 @@ static __init void xen_alloc_pmd_init(struct mm_struct *mm, unsigned long pfn) /* Early release_pte assumes that all pts are pinned, since there's only init_mm and anything attached to that is pinned. */ -static __init void xen_release_pte_init(unsigned long pfn) +static void __init xen_release_pte_init(unsigned long pfn) { pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn); make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); } -static __init void xen_release_pmd_init(unsigned long pfn) +static void __init xen_release_pmd_init(unsigned long pfn) { make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); } @@ -1689,7 +1689,7 @@ static void set_page_prot(void *addr, pgprot_t prot) BUG(); } -static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn) +static void __init xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn) { unsigned pmdidx, pteidx; unsigned ident_pte; @@ -1772,7 +1772,7 @@ static void convert_pfn_mfn(void *v) * of the physical mapping once some sort of allocator has been set * up. */ -__init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, +pgd_t * __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) { pud_t *l3; @@ -1843,7 +1843,7 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, static RESERVE_BRK_ARRAY(pmd_t, initial_kernel_pmd, PTRS_PER_PMD); static RESERVE_BRK_ARRAY(pmd_t, swapper_kernel_pmd, PTRS_PER_PMD); -static __init void xen_write_cr3_init(unsigned long cr3) +static void __init xen_write_cr3_init(unsigned long cr3) { unsigned long pfn = PFN_DOWN(__pa(swapper_pg_dir)); @@ -1880,7 +1880,7 @@ static __init void xen_write_cr3_init(unsigned long cr3) pv_mmu_ops.write_cr3 = &xen_write_cr3; } -__init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, +pgd_t * __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) { pmd_t *kernel_pmd; @@ -1986,7 +1986,7 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) #endif } -__init void xen_ident_map_ISA(void) +void __init xen_ident_map_ISA(void) { unsigned long pa; @@ -2009,7 +2009,7 @@ __init void xen_ident_map_ISA(void) xen_flush_tlb(); } -static __init void xen_post_allocator_init(void) +static void __init xen_post_allocator_init(void) { #ifdef CONFIG_XEN_DEBUG pv_mmu_ops.make_pte = PV_CALLEE_SAVE(xen_make_pte_debug); @@ -2046,7 +2046,7 @@ static void xen_leave_lazy_mmu(void) preempt_enable(); } -static const struct pv_mmu_ops xen_mmu_ops __initdata = { +static const struct pv_mmu_ops xen_mmu_ops __initconst = { .read_cr2 = xen_read_cr2, .write_cr2 = xen_write_cr2, -- cgit v1.1 From ad7ba09e658f8e890797ed168dfb3f84be934b09 Mon Sep 17 00:00:00 2001 From: Daniel Kiper Date: Wed, 4 May 2011 20:19:15 +0200 Subject: arch/x86/xen/xen-ops: Cleanup code/data sections definitions Cleanup code/data sections definitions accordingly to include/linux/init.h. Signed-off-by: Daniel Kiper Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/xen-ops.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 3112f55..97dfdc8 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -74,7 +74,7 @@ static inline void xen_hvm_smp_init(void) {} #ifdef CONFIG_PARAVIRT_SPINLOCKS void __init xen_init_spinlocks(void); -__cpuinit void xen_init_lock_cpu(int cpu); +void __cpuinit xen_init_lock_cpu(int cpu); void xen_uninit_lock_cpu(int cpu); #else static inline void xen_init_spinlocks(void) -- cgit v1.1 From fb6ce5dea4bb704bfcd9fda7e6b1354da66f4d2f Mon Sep 17 00:00:00 2001 From: Daniel Kiper Date: Wed, 4 May 2011 20:18:45 +0200 Subject: arch/x86/xen/time: Cleanup code/data sections definitions Cleanup code/data sections definitions accordingly to include/linux/init.h. Signed-off-by: Daniel Kiper Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/time.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 2e2d370..bd4ffd7 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -439,11 +439,11 @@ void xen_timer_resume(void) } } -static const struct pv_time_ops xen_time_ops __initdata = { +static const struct pv_time_ops xen_time_ops __initconst = { .sched_clock = xen_clocksource_read, }; -static __init void xen_time_init(void) +static void __init xen_time_init(void) { int cpu = smp_processor_id(); struct timespec tp; @@ -468,7 +468,7 @@ static __init void xen_time_init(void) xen_setup_cpu_clockevents(); } -__init void xen_init_time_ops(void) +void __init xen_init_time_ops(void) { pv_time_ops = xen_time_ops; @@ -490,7 +490,7 @@ static void xen_hvm_setup_cpu_clockevents(void) xen_setup_cpu_clockevents(); } -__init void xen_hvm_init_time_ops(void) +void __init xen_hvm_init_time_ops(void) { /* vector callback is needed otherwise we cannot receive interrupts * on cpu > 0 and at this point we don't know how many cpus are -- cgit v1.1 From b53cedebd74918237176520f9157deb7ae066b71 Mon Sep 17 00:00:00 2001 From: Daniel Kiper Date: Wed, 4 May 2011 20:18:05 +0200 Subject: arch/x86/xen/smp: Cleanup code/data sections definitions Cleanup code/data sections definitions accordingly to include/linux/init.h. Signed-off-by: Daniel Kiper Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/smp.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 3061244..194a3ed 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -57,7 +57,7 @@ static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } -static __cpuinit void cpu_bringup(void) +static void __cpuinit cpu_bringup(void) { int cpu = smp_processor_id(); @@ -85,7 +85,7 @@ static __cpuinit void cpu_bringup(void) wmb(); /* make sure everything is out */ } -static __cpuinit void cpu_bringup_and_idle(void) +static void __cpuinit cpu_bringup_and_idle(void) { cpu_bringup(); cpu_idle(); @@ -242,7 +242,7 @@ static void __init xen_smp_prepare_cpus(unsigned int max_cpus) } } -static __cpuinit int +static int __cpuinit cpu_initialize_context(unsigned int cpu, struct task_struct *idle) { struct vcpu_guest_context *ctxt; @@ -486,7 +486,7 @@ static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } -static const struct smp_ops xen_smp_ops __initdata = { +static const struct smp_ops xen_smp_ops __initconst = { .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu, .smp_prepare_cpus = xen_smp_prepare_cpus, .smp_cpus_done = xen_smp_cpus_done, -- cgit v1.1 From 887cb45694f77d59de19674cb73146fec72fadbb Mon Sep 17 00:00:00 2001 From: Daniel Kiper Date: Wed, 4 May 2011 20:19:49 +0200 Subject: drivers/xen/sys-hypervisor: Cleanup code/data sections definitions Cleanup code/data sections definitions accordingly to include/linux/init.h. Signed-off-by: Daniel Kiper Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/sys-hypervisor.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/xen/sys-hypervisor.c b/drivers/xen/sys-hypervisor.c index 60f1827..1e0fe01 100644 --- a/drivers/xen/sys-hypervisor.c +++ b/drivers/xen/sys-hypervisor.c @@ -215,7 +215,7 @@ static struct attribute_group xen_compilation_group = { .attrs = xen_compile_attrs, }; -int __init static xen_compilation_init(void) +static int __init xen_compilation_init(void) { return sysfs_create_group(hypervisor_kobj, &xen_compilation_group); } -- cgit v1.1 From 359c47ea71be21c1105ae474e8c90ec3e988bbdf Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Sun, 3 Apr 2011 13:32:00 +0200 Subject: m68k: bitops - offset == ((long)p - (long)vaddr) * 8 Hence use "offset" in find_next_{,zero_}bit(), like is already done for find_next_{,zero_}bit_le() Signed-off-by: Geert Uytterhoeven Cc: Akinobu Mita Cc: Andreas Schwab Signed-off-by: Geert Uytterhoeven --- arch/m68k/include/asm/bitops_mm.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/m68k/include/asm/bitops_mm.h b/arch/m68k/include/asm/bitops_mm.h index 9d69f6e..5bd3afa 100644 --- a/arch/m68k/include/asm/bitops_mm.h +++ b/arch/m68k/include/asm/bitops_mm.h @@ -220,8 +220,7 @@ static inline int find_next_zero_bit(const unsigned long *vaddr, int size, offset += 32; } /* No zero yet, search remaining full bytes for a zero */ - res = find_first_zero_bit(p, size - ((long)p - (long)vaddr) * 8); - return offset + res; + return offset + find_first_zero_bit(p, size - offset); } static inline int find_first_bit(const unsigned long *vaddr, unsigned size) @@ -267,8 +266,7 @@ static inline int find_next_bit(const unsigned long *vaddr, int size, offset += 32; } /* No one yet, search remaining full bytes for a one */ - res = find_first_bit(p, size - ((long)p - (long)vaddr) * 8); - return offset + res; + return offset + find_first_bit(p, size - offset); } /* -- cgit v1.1 From f82a519f1262963d6ab30fa238721463fad2e0c8 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Sun, 3 Apr 2011 14:00:10 +0200 Subject: m68k: bitops - Never step beyond the end of the bitmap find_next bitops on m68k (find_next_zero_bit, find_next_bit, and find_next_bit_le) may cause out of bounds memory access when the bitmap size in bits % 32 != 0 and offset (the bitnumber to start searching at) is very close to the bitmap size. For example, unsigned long bitmap[2] = { 0, 0 }; find_next_bit(bitmap, 63, 62); 1. find_next_bit() tries to find any set bits in bitmap[1], but no bits set. 2. Then find_first_bit(bimap + 2, -1) 3. Unfortunately find_first_bit() takes unsigned int as the size argument. 4. find_first_bit will access bitmap[2~] until it find any set bits. Add missing tests for stepping beyond the end of the bitmap to all find_{first,next}_*() functions, and make sure they never return a value larger than the bitmap size. Reported-by: Akinobu Mita Cc: Andreas Schwab Signed-off-by: Geert Uytterhoeven --- arch/m68k/include/asm/bitops_mm.h | 81 ++++++++++++++++++++++++++------------- 1 file changed, 55 insertions(+), 26 deletions(-) diff --git a/arch/m68k/include/asm/bitops_mm.h b/arch/m68k/include/asm/bitops_mm.h index 5bd3afa..e9020f8 100644 --- a/arch/m68k/include/asm/bitops_mm.h +++ b/arch/m68k/include/asm/bitops_mm.h @@ -181,14 +181,15 @@ static inline int find_first_zero_bit(const unsigned long *vaddr, { const unsigned long *p = vaddr; int res = 32; + unsigned int words; unsigned long num; if (!size) return 0; - size = (size + 31) >> 5; + words = (size + 31) >> 5; while (!(num = ~*p++)) { - if (!--size) + if (!--words) goto out; } @@ -196,7 +197,8 @@ static inline int find_first_zero_bit(const unsigned long *vaddr, : "=d" (res) : "d" (num & -num)); res ^= 31; out: - return ((long)p - (long)vaddr - 4) * 8 + res; + res += ((long)p - (long)vaddr - 4) * 8; + return res < size ? res : size; } static inline int find_next_zero_bit(const unsigned long *vaddr, int size, @@ -215,9 +217,14 @@ static inline int find_next_zero_bit(const unsigned long *vaddr, int size, /* Look for zero in first longword */ __asm__ __volatile__ ("bfffo %1{#0,#0},%0" : "=d" (res) : "d" (num & -num)); - if (res < 32) - return offset + (res ^ 31); + if (res < 32) { + offset += res ^ 31; + return offset < size ? offset : size; + } offset += 32; + + if (offset >= size) + return size; } /* No zero yet, search remaining full bytes for a zero */ return offset + find_first_zero_bit(p, size - offset); @@ -227,14 +234,15 @@ static inline int find_first_bit(const unsigned long *vaddr, unsigned size) { const unsigned long *p = vaddr; int res = 32; + unsigned int words; unsigned long num; if (!size) return 0; - size = (size + 31) >> 5; + words = (size + 31) >> 5; while (!(num = *p++)) { - if (!--size) + if (!--words) goto out; } @@ -242,7 +250,8 @@ static inline int find_first_bit(const unsigned long *vaddr, unsigned size) : "=d" (res) : "d" (num & -num)); res ^= 31; out: - return ((long)p - (long)vaddr - 4) * 8 + res; + res += ((long)p - (long)vaddr - 4) * 8; + return res < size ? res : size; } static inline int find_next_bit(const unsigned long *vaddr, int size, @@ -261,9 +270,14 @@ static inline int find_next_bit(const unsigned long *vaddr, int size, /* Look for one in first longword */ __asm__ __volatile__ ("bfffo %1{#0,#0},%0" : "=d" (res) : "d" (num & -num)); - if (res < 32) - return offset + (res ^ 31); + if (res < 32) { + offset += res ^ 31; + return offset < size ? offset : size; + } offset += 32; + + if (offset >= size) + return size; } /* No one yet, search remaining full bytes for a one */ return offset + find_first_bit(p, size - offset); @@ -364,23 +378,25 @@ static inline int test_bit_le(int nr, const void *vaddr) static inline int find_first_zero_bit_le(const void *vaddr, unsigned size) { const unsigned long *p = vaddr, *addr = vaddr; - int res; + int res = 0; + unsigned int words; if (!size) return 0; - size = (size >> 5) + ((size & 31) > 0); - while (*p++ == ~0UL) - { - if (--size == 0) - return (p - addr) << 5; + words = (size >> 5) + ((size & 31) > 0); + while (*p++ == ~0UL) { + if (--words == 0) + goto out; } --p; for (res = 0; res < 32; res++) if (!test_bit_le(res, p)) break; - return (p - addr) * 32 + res; +out: + res += (p - addr) * 32; + return res < size ? res : size; } static inline unsigned long find_next_zero_bit_le(const void *addr, @@ -398,10 +414,15 @@ static inline unsigned long find_next_zero_bit_le(const void *addr, offset -= bit; /* Look for zero in first longword */ for (res = bit; res < 32; res++) - if (!test_bit_le(res, p)) - return offset + res; + if (!test_bit_le(res, p)) { + offset += res; + return offset < size ? offset : size; + } p++; offset += 32; + + if (offset >= size) + return size; } /* No zero yet, search remaining full bytes for a zero */ return offset + find_first_zero_bit_le(p, size - offset); @@ -410,22 +431,25 @@ static inline unsigned long find_next_zero_bit_le(const void *addr, static inline int find_first_bit_le(const void *vaddr, unsigned size) { const unsigned long *p = vaddr, *addr = vaddr; - int res; + int res = 0; + unsigned int words; if (!size) return 0; - size = (size >> 5) + ((size & 31) > 0); + words = (size >> 5) + ((size & 31) > 0); while (*p++ == 0UL) { - if (--size == 0) - return (p - addr) << 5; + if (--words == 0) + goto out; } --p; for (res = 0; res < 32; res++) if (test_bit_le(res, p)) break; - return (p - addr) * 32 + res; +out: + res += (p - addr) * 32; + return res < size ? res : size; } static inline unsigned long find_next_bit_le(const void *addr, @@ -443,10 +467,15 @@ static inline unsigned long find_next_bit_le(const void *addr, offset -= bit; /* Look for one in first longword */ for (res = bit; res < 32; res++) - if (test_bit_le(res, p)) - return offset + res; + if (test_bit_le(res, p)) { + offset += res; + return offset < size ? offset : size; + } p++; offset += 32; + + if (offset >= size) + return size; } /* No set bit yet, search remaining full bytes for a set bit */ return offset + find_first_bit_le(p, size - offset); -- cgit v1.1 From 6cf515e113fc1938b3cc9812bd8519a4c6155ef9 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Sun, 24 Apr 2011 10:32:49 +0200 Subject: MAINTAINERS: Roman Zippel has been MIA for several years. Hence make AFFS and HFS orphans, and remove him as an m68k maintainer. Signed-off-by: Geert Uytterhoeven --- MAINTAINERS | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 69f19f1..a01796e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -405,8 +405,8 @@ S: Maintained F: sound/oss/aedsp16.c AFFS FILE SYSTEM -M: Roman Zippel -S: Maintained +L: linux-fsdevel@vger.kernel.org +S: Orphan F: Documentation/filesystems/affs.txt F: fs/affs/ @@ -2946,8 +2946,8 @@ F: drivers/block/cciss* F: include/linux/cciss_ioctl.h HFS FILESYSTEM -M: Roman Zippel -S: Maintained +L: linux-fsdevel@vger.kernel.org +S: Orphan F: Documentation/filesystems/hfs.txt F: fs/hfs/ @@ -4001,7 +4001,6 @@ F: arch/m32r/ M68K ARCHITECTURE M: Geert Uytterhoeven -M: Roman Zippel L: linux-m68k@lists.linux-m68k.org W: http://www.linux-m68k.org/ T: git git://git.kernel.org/pub/scm/linux/kernel/git/geert/linux-m68k.git -- cgit v1.1 From c4245c9d6535f3d02fda7f6eb9adcec9f09e8fe3 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 6 Apr 2011 22:12:53 +0200 Subject: m68k: Merge mmu and non-mmu versions of sys_call_table Impact for nommu: - Store table in .rodata instead of .text, - Let kernel/sys_ni.c handle the stubbing of MMU-only syscalls, - Implement sys_mremap and sys_nfsservct, - Remove unused padding at the end of the table. Impact for mmu: - Store table in .rodata instead of .data. Signed-off-by: Geert Uytterhoeven Acked-by: Greg Ungerer --- arch/m68k/kernel/Makefile_mm | 2 +- arch/m68k/kernel/entry_mm.S | 348 ---------------------------------------- arch/m68k/kernel/syscalltable.S | 191 +++++++++++----------- 3 files changed, 96 insertions(+), 445 deletions(-) diff --git a/arch/m68k/kernel/Makefile_mm b/arch/m68k/kernel/Makefile_mm index 55d5d6b..aced678 100644 --- a/arch/m68k/kernel/Makefile_mm +++ b/arch/m68k/kernel/Makefile_mm @@ -10,7 +10,7 @@ endif extra-y += vmlinux.lds obj-y := entry.o process.o traps.o ints.o signal.o ptrace.o module.o \ - sys_m68k.o time.o setup.o m68k_ksyms.o devres.o + sys_m68k.o time.o setup.o m68k_ksyms.o devres.o syscalltable.o devres-y = ../../../kernel/irq/devres.o diff --git a/arch/m68k/kernel/entry_mm.S b/arch/m68k/kernel/entry_mm.S index 1359ee6..bd0ec05 100644 --- a/arch/m68k/kernel/entry_mm.S +++ b/arch/m68k/kernel/entry_mm.S @@ -407,351 +407,3 @@ resume: rts -.data -ALIGN -sys_call_table: - .long sys_restart_syscall /* 0 - old "setup()" system call, used for restarting */ - .long sys_exit - .long sys_fork - .long sys_read - .long sys_write - .long sys_open /* 5 */ - .long sys_close - .long sys_waitpid - .long sys_creat - .long sys_link - .long sys_unlink /* 10 */ - .long sys_execve - .long sys_chdir - .long sys_time - .long sys_mknod - .long sys_chmod /* 15 */ - .long sys_chown16 - .long sys_ni_syscall /* old break syscall holder */ - .long sys_stat - .long sys_lseek - .long sys_getpid /* 20 */ - .long sys_mount - .long sys_oldumount - .long sys_setuid16 - .long sys_getuid16 - .long sys_stime /* 25 */ - .long sys_ptrace - .long sys_alarm - .long sys_fstat - .long sys_pause - .long sys_utime /* 30 */ - .long sys_ni_syscall /* old stty syscall holder */ - .long sys_ni_syscall /* old gtty syscall holder */ - .long sys_access - .long sys_nice - .long sys_ni_syscall /* 35 */ /* old ftime syscall holder */ - .long sys_sync - .long sys_kill - .long sys_rename - .long sys_mkdir - .long sys_rmdir /* 40 */ - .long sys_dup - .long sys_pipe - .long sys_times - .long sys_ni_syscall /* old prof syscall holder */ - .long sys_brk /* 45 */ - .long sys_setgid16 - .long sys_getgid16 - .long sys_signal - .long sys_geteuid16 - .long sys_getegid16 /* 50 */ - .long sys_acct - .long sys_umount /* recycled never used phys() */ - .long sys_ni_syscall /* old lock syscall holder */ - .long sys_ioctl - .long sys_fcntl /* 55 */ - .long sys_ni_syscall /* old mpx syscall holder */ - .long sys_setpgid - .long sys_ni_syscall /* old ulimit syscall holder */ - .long sys_ni_syscall - .long sys_umask /* 60 */ - .long sys_chroot - .long sys_ustat - .long sys_dup2 - .long sys_getppid - .long sys_getpgrp /* 65 */ - .long sys_setsid - .long sys_sigaction - .long sys_sgetmask - .long sys_ssetmask - .long sys_setreuid16 /* 70 */ - .long sys_setregid16 - .long sys_sigsuspend - .long sys_sigpending - .long sys_sethostname - .long sys_setrlimit /* 75 */ - .long sys_old_getrlimit - .long sys_getrusage - .long sys_gettimeofday - .long sys_settimeofday - .long sys_getgroups16 /* 80 */ - .long sys_setgroups16 - .long sys_old_select - .long sys_symlink - .long sys_lstat - .long sys_readlink /* 85 */ - .long sys_uselib - .long sys_swapon - .long sys_reboot - .long sys_old_readdir - .long sys_old_mmap /* 90 */ - .long sys_munmap - .long sys_truncate - .long sys_ftruncate - .long sys_fchmod - .long sys_fchown16 /* 95 */ - .long sys_getpriority - .long sys_setpriority - .long sys_ni_syscall /* old profil syscall holder */ - .long sys_statfs - .long sys_fstatfs /* 100 */ - .long sys_ni_syscall /* ioperm for i386 */ - .long sys_socketcall - .long sys_syslog - .long sys_setitimer - .long sys_getitimer /* 105 */ - .long sys_newstat - .long sys_newlstat - .long sys_newfstat - .long sys_ni_syscall - .long sys_ni_syscall /* 110 */ /* iopl for i386 */ - .long sys_vhangup - .long sys_ni_syscall /* obsolete idle() syscall */ - .long sys_ni_syscall /* vm86old for i386 */ - .long sys_wait4 - .long sys_swapoff /* 115 */ - .long sys_sysinfo - .long sys_ipc - .long sys_fsync - .long sys_sigreturn - .long sys_clone /* 120 */ - .long sys_setdomainname - .long sys_newuname - .long sys_cacheflush /* modify_ldt for i386 */ - .long sys_adjtimex - .long sys_mprotect /* 125 */ - .long sys_sigprocmask - .long sys_ni_syscall /* old "create_module" */ - .long sys_init_module - .long sys_delete_module - .long sys_ni_syscall /* 130 - old "get_kernel_syms" */ - .long sys_quotactl - .long sys_getpgid - .long sys_fchdir - .long sys_bdflush - .long sys_sysfs /* 135 */ - .long sys_personality - .long sys_ni_syscall /* for afs_syscall */ - .long sys_setfsuid16 - .long sys_setfsgid16 - .long sys_llseek /* 140 */ - .long sys_getdents - .long sys_select - .long sys_flock - .long sys_msync - .long sys_readv /* 145 */ - .long sys_writev - .long sys_getsid - .long sys_fdatasync - .long sys_sysctl - .long sys_mlock /* 150 */ - .long sys_munlock - .long sys_mlockall - .long sys_munlockall - .long sys_sched_setparam - .long sys_sched_getparam /* 155 */ - .long sys_sched_setscheduler - .long sys_sched_getscheduler - .long sys_sched_yield - .long sys_sched_get_priority_max - .long sys_sched_get_priority_min /* 160 */ - .long sys_sched_rr_get_interval - .long sys_nanosleep - .long sys_mremap - .long sys_setresuid16 - .long sys_getresuid16 /* 165 */ - .long sys_getpagesize - .long sys_ni_syscall /* old sys_query_module */ - .long sys_poll - .long sys_nfsservctl - .long sys_setresgid16 /* 170 */ - .long sys_getresgid16 - .long sys_prctl - .long sys_rt_sigreturn - .long sys_rt_sigaction - .long sys_rt_sigprocmask /* 175 */ - .long sys_rt_sigpending - .long sys_rt_sigtimedwait - .long sys_rt_sigqueueinfo - .long sys_rt_sigsuspend - .long sys_pread64 /* 180 */ - .long sys_pwrite64 - .long sys_lchown16; - .long sys_getcwd - .long sys_capget - .long sys_capset /* 185 */ - .long sys_sigaltstack - .long sys_sendfile - .long sys_ni_syscall /* streams1 */ - .long sys_ni_syscall /* streams2 */ - .long sys_vfork /* 190 */ - .long sys_getrlimit - .long sys_mmap2 - .long sys_truncate64 - .long sys_ftruncate64 - .long sys_stat64 /* 195 */ - .long sys_lstat64 - .long sys_fstat64 - .long sys_chown - .long sys_getuid - .long sys_getgid /* 200 */ - .long sys_geteuid - .long sys_getegid - .long sys_setreuid - .long sys_setregid - .long sys_getgroups /* 205 */ - .long sys_setgroups - .long sys_fchown - .long sys_setresuid - .long sys_getresuid - .long sys_setresgid /* 210 */ - .long sys_getresgid - .long sys_lchown - .long sys_setuid - .long sys_setgid - .long sys_setfsuid /* 215 */ - .long sys_setfsgid - .long sys_pivot_root - .long sys_ni_syscall - .long sys_ni_syscall - .long sys_getdents64 /* 220 */ - .long sys_gettid - .long sys_tkill - .long sys_setxattr - .long sys_lsetxattr - .long sys_fsetxattr /* 225 */ - .long sys_getxattr - .long sys_lgetxattr - .long sys_fgetxattr - .long sys_listxattr - .long sys_llistxattr /* 230 */ - .long sys_flistxattr - .long sys_removexattr - .long sys_lremovexattr - .long sys_fremovexattr - .long sys_futex /* 235 */ - .long sys_sendfile64 - .long sys_mincore - .long sys_madvise - .long sys_fcntl64 - .long sys_readahead /* 240 */ - .long sys_io_setup - .long sys_io_destroy - .long sys_io_getevents - .long sys_io_submit - .long sys_io_cancel /* 245 */ - .long sys_fadvise64 - .long sys_exit_group - .long sys_lookup_dcookie - .long sys_epoll_create - .long sys_epoll_ctl /* 250 */ - .long sys_epoll_wait - .long sys_remap_file_pages - .long sys_set_tid_address - .long sys_timer_create - .long sys_timer_settime /* 255 */ - .long sys_timer_gettime - .long sys_timer_getoverrun - .long sys_timer_delete - .long sys_clock_settime - .long sys_clock_gettime /* 260 */ - .long sys_clock_getres - .long sys_clock_nanosleep - .long sys_statfs64 - .long sys_fstatfs64 - .long sys_tgkill /* 265 */ - .long sys_utimes - .long sys_fadvise64_64 - .long sys_mbind - .long sys_get_mempolicy - .long sys_set_mempolicy /* 270 */ - .long sys_mq_open - .long sys_mq_unlink - .long sys_mq_timedsend - .long sys_mq_timedreceive - .long sys_mq_notify /* 275 */ - .long sys_mq_getsetattr - .long sys_waitid - .long sys_ni_syscall /* for sys_vserver */ - .long sys_add_key - .long sys_request_key /* 280 */ - .long sys_keyctl - .long sys_ioprio_set - .long sys_ioprio_get - .long sys_inotify_init - .long sys_inotify_add_watch /* 285 */ - .long sys_inotify_rm_watch - .long sys_migrate_pages - .long sys_openat - .long sys_mkdirat - .long sys_mknodat /* 290 */ - .long sys_fchownat - .long sys_futimesat - .long sys_fstatat64 - .long sys_unlinkat - .long sys_renameat /* 295 */ - .long sys_linkat - .long sys_symlinkat - .long sys_readlinkat - .long sys_fchmodat - .long sys_faccessat /* 300 */ - .long sys_ni_syscall /* Reserved for pselect6 */ - .long sys_ni_syscall /* Reserved for ppoll */ - .long sys_unshare - .long sys_set_robust_list - .long sys_get_robust_list /* 305 */ - .long sys_splice - .long sys_sync_file_range - .long sys_tee - .long sys_vmsplice - .long sys_move_pages /* 310 */ - .long sys_sched_setaffinity - .long sys_sched_getaffinity - .long sys_kexec_load - .long sys_getcpu - .long sys_epoll_pwait /* 315 */ - .long sys_utimensat - .long sys_signalfd - .long sys_timerfd_create - .long sys_eventfd - .long sys_fallocate /* 320 */ - .long sys_timerfd_settime - .long sys_timerfd_gettime - .long sys_signalfd4 - .long sys_eventfd2 - .long sys_epoll_create1 /* 325 */ - .long sys_dup3 - .long sys_pipe2 - .long sys_inotify_init1 - .long sys_preadv - .long sys_pwritev /* 330 */ - .long sys_rt_tgsigqueueinfo - .long sys_perf_event_open - .long sys_get_thread_area - .long sys_set_thread_area - .long sys_atomic_cmpxchg_32 /* 335 */ - .long sys_atomic_barrier - .long sys_fanotify_init - .long sys_fanotify_mark - .long sys_prlimit64 - .long sys_name_to_handle_at /* 340 */ - .long sys_open_by_handle_at - .long sys_clock_adjtime - .long sys_syncfs - diff --git a/arch/m68k/kernel/syscalltable.S b/arch/m68k/kernel/syscalltable.S index 9b8393d..0284192 100644 --- a/arch/m68k/kernel/syscalltable.S +++ b/arch/m68k/kernel/syscalltable.S @@ -1,6 +1,4 @@ /* - * linux/arch/m68knommu/kernel/syscalltable.S - * * Copyright (C) 2002, Greg Ungerer (gerg@snapgear.com) * * Based on older entry.S files, the following copyrights apply: @@ -9,171 +7,176 @@ * Kenneth Albanowski , * Copyright (C) 2000 Lineo Inc. (www.lineo.com) * Copyright (C) 1991, 1992 Linus Torvalds + * + * Linux/m68k support by Hamish Macdonald */ #include #include -#include -.text +#ifndef CONFIG_MMU +#define sys_mmap2 sys_mmap_pgoff +#endif + +.section .rodata ALIGN ENTRY(sys_call_table) - .long sys_restart_syscall /* 0 - old "setup()" system call */ + .long sys_restart_syscall /* 0 - old "setup()" system call, used for restarting */ .long sys_exit .long sys_fork .long sys_read .long sys_write - .long sys_open /* 5 */ + .long sys_open /* 5 */ .long sys_close .long sys_waitpid .long sys_creat .long sys_link - .long sys_unlink /* 10 */ + .long sys_unlink /* 10 */ .long sys_execve .long sys_chdir .long sys_time .long sys_mknod - .long sys_chmod /* 15 */ + .long sys_chmod /* 15 */ .long sys_chown16 - .long sys_ni_syscall /* old break syscall holder */ + .long sys_ni_syscall /* old break syscall holder */ .long sys_stat .long sys_lseek - .long sys_getpid /* 20 */ + .long sys_getpid /* 20 */ .long sys_mount .long sys_oldumount .long sys_setuid16 .long sys_getuid16 - .long sys_stime /* 25 */ + .long sys_stime /* 25 */ .long sys_ptrace .long sys_alarm .long sys_fstat .long sys_pause - .long sys_utime /* 30 */ - .long sys_ni_syscall /* old stty syscall holder */ - .long sys_ni_syscall /* old gtty syscall holder */ + .long sys_utime /* 30 */ + .long sys_ni_syscall /* old stty syscall holder */ + .long sys_ni_syscall /* old gtty syscall holder */ .long sys_access .long sys_nice - .long sys_ni_syscall /* 35 */ /* old ftime syscall holder */ + .long sys_ni_syscall /* 35 - old ftime syscall holder */ .long sys_sync .long sys_kill .long sys_rename .long sys_mkdir - .long sys_rmdir /* 40 */ + .long sys_rmdir /* 40 */ .long sys_dup .long sys_pipe .long sys_times - .long sys_ni_syscall /* old prof syscall holder */ - .long sys_brk /* 45 */ + .long sys_ni_syscall /* old prof syscall holder */ + .long sys_brk /* 45 */ .long sys_setgid16 .long sys_getgid16 .long sys_signal .long sys_geteuid16 - .long sys_getegid16 /* 50 */ + .long sys_getegid16 /* 50 */ .long sys_acct - .long sys_umount /* recycled never used phys() */ - .long sys_ni_syscall /* old lock syscall holder */ + .long sys_umount /* recycled never used phys() */ + .long sys_ni_syscall /* old lock syscall holder */ .long sys_ioctl - .long sys_fcntl /* 55 */ - .long sys_ni_syscall /* old mpx syscall holder */ + .long sys_fcntl /* 55 */ + .long sys_ni_syscall /* old mpx syscall holder */ .long sys_setpgid - .long sys_ni_syscall /* old ulimit syscall holder */ + .long sys_ni_syscall /* old ulimit syscall holder */ .long sys_ni_syscall - .long sys_umask /* 60 */ + .long sys_umask /* 60 */ .long sys_chroot .long sys_ustat .long sys_dup2 .long sys_getppid - .long sys_getpgrp /* 65 */ + .long sys_getpgrp /* 65 */ .long sys_setsid .long sys_sigaction .long sys_sgetmask .long sys_ssetmask - .long sys_setreuid16 /* 70 */ + .long sys_setreuid16 /* 70 */ .long sys_setregid16 .long sys_sigsuspend .long sys_sigpending .long sys_sethostname - .long sys_setrlimit /* 75 */ + .long sys_setrlimit /* 75 */ .long sys_old_getrlimit .long sys_getrusage .long sys_gettimeofday .long sys_settimeofday - .long sys_getgroups16 /* 80 */ + .long sys_getgroups16 /* 80 */ .long sys_setgroups16 .long sys_old_select .long sys_symlink .long sys_lstat - .long sys_readlink /* 85 */ + .long sys_readlink /* 85 */ .long sys_uselib - .long sys_ni_syscall /* sys_swapon */ + .long sys_swapon .long sys_reboot .long sys_old_readdir - .long sys_old_mmap /* 90 */ + .long sys_old_mmap /* 90 */ .long sys_munmap .long sys_truncate .long sys_ftruncate .long sys_fchmod - .long sys_fchown16 /* 95 */ + .long sys_fchown16 /* 95 */ .long sys_getpriority .long sys_setpriority - .long sys_ni_syscall /* old profil syscall holder */ + .long sys_ni_syscall /* old profil syscall holder */ .long sys_statfs - .long sys_fstatfs /* 100 */ - .long sys_ni_syscall /* ioperm for i386 */ + .long sys_fstatfs /* 100 */ + .long sys_ni_syscall /* ioperm for i386 */ .long sys_socketcall .long sys_syslog .long sys_setitimer - .long sys_getitimer /* 105 */ + .long sys_getitimer /* 105 */ .long sys_newstat .long sys_newlstat .long sys_newfstat .long sys_ni_syscall - .long sys_ni_syscall /* iopl for i386 */ /* 110 */ + .long sys_ni_syscall /* 110 - iopl for i386 */ .long sys_vhangup - .long sys_ni_syscall /* obsolete idle() syscall */ - .long sys_ni_syscall /* vm86old for i386 */ + .long sys_ni_syscall /* obsolete idle() syscall */ + .long sys_ni_syscall /* vm86old for i386 */ .long sys_wait4 - .long sys_ni_syscall /* 115 */ /* sys_swapoff */ + .long sys_swapoff /* 115 */ .long sys_sysinfo .long sys_ipc .long sys_fsync .long sys_sigreturn - .long sys_clone /* 120 */ + .long sys_clone /* 120 */ .long sys_setdomainname .long sys_newuname - .long sys_cacheflush /* modify_ldt for i386 */ + .long sys_cacheflush /* modify_ldt for i386 */ .long sys_adjtimex - .long sys_ni_syscall /* 125 */ /* sys_mprotect */ + .long sys_mprotect /* 125 */ .long sys_sigprocmask - .long sys_ni_syscall /* old "creat_module" */ + .long sys_ni_syscall /* old "create_module" */ .long sys_init_module .long sys_delete_module - .long sys_ni_syscall /* 130: old "get_kernel_syms" */ + .long sys_ni_syscall /* 130 - old "get_kernel_syms" */ .long sys_quotactl .long sys_getpgid .long sys_fchdir .long sys_bdflush - .long sys_sysfs /* 135 */ + .long sys_sysfs /* 135 */ .long sys_personality - .long sys_ni_syscall /* for afs_syscall */ + .long sys_ni_syscall /* for afs_syscall */ .long sys_setfsuid16 .long sys_setfsgid16 - .long sys_llseek /* 140 */ + .long sys_llseek /* 140 */ .long sys_getdents .long sys_select .long sys_flock - .long sys_ni_syscall /* sys_msync */ - .long sys_readv /* 145 */ + .long sys_msync + .long sys_readv /* 145 */ .long sys_writev .long sys_getsid .long sys_fdatasync .long sys_sysctl - .long sys_ni_syscall /* 150 */ /* sys_mlock */ - .long sys_ni_syscall /* sys_munlock */ - .long sys_ni_syscall /* sys_mlockall */ - .long sys_ni_syscall /* sys_munlockall */ + .long sys_mlock /* 150 */ + .long sys_munlock + .long sys_mlockall + .long sys_munlockall .long sys_sched_setparam - .long sys_sched_getparam /* 155 */ + .long sys_sched_getparam /* 155 */ .long sys_sched_setscheduler .long sys_sched_getscheduler .long sys_sched_yield @@ -181,124 +184,124 @@ ENTRY(sys_call_table) .long sys_sched_get_priority_min /* 160 */ .long sys_sched_rr_get_interval .long sys_nanosleep - .long sys_ni_syscall /* sys_mremap */ + .long sys_mremap .long sys_setresuid16 - .long sys_getresuid16 /* 165 */ - .long sys_getpagesize /* sys_getpagesize */ - .long sys_ni_syscall /* old "query_module" */ + .long sys_getresuid16 /* 165 */ + .long sys_getpagesize + .long sys_ni_syscall /* old "query_module" */ .long sys_poll - .long sys_ni_syscall /* sys_nfsservctl */ - .long sys_setresgid16 /* 170 */ + .long sys_nfsservctl + .long sys_setresgid16 /* 170 */ .long sys_getresgid16 .long sys_prctl .long sys_rt_sigreturn .long sys_rt_sigaction - .long sys_rt_sigprocmask /* 175 */ + .long sys_rt_sigprocmask /* 175 */ .long sys_rt_sigpending .long sys_rt_sigtimedwait .long sys_rt_sigqueueinfo .long sys_rt_sigsuspend - .long sys_pread64 /* 180 */ + .long sys_pread64 /* 180 */ .long sys_pwrite64 .long sys_lchown16 .long sys_getcwd .long sys_capget - .long sys_capset /* 185 */ + .long sys_capset /* 185 */ .long sys_sigaltstack .long sys_sendfile - .long sys_ni_syscall /* streams1 */ - .long sys_ni_syscall /* streams2 */ - .long sys_vfork /* 190 */ + .long sys_ni_syscall /* streams1 */ + .long sys_ni_syscall /* streams2 */ + .long sys_vfork /* 190 */ .long sys_getrlimit - .long sys_mmap_pgoff + .long sys_mmap2 .long sys_truncate64 .long sys_ftruncate64 - .long sys_stat64 /* 195 */ + .long sys_stat64 /* 195 */ .long sys_lstat64 .long sys_fstat64 .long sys_chown .long sys_getuid - .long sys_getgid /* 200 */ + .long sys_getgid /* 200 */ .long sys_geteuid .long sys_getegid .long sys_setreuid .long sys_setregid - .long sys_getgroups /* 205 */ + .long sys_getgroups /* 205 */ .long sys_setgroups .long sys_fchown .long sys_setresuid .long sys_getresuid - .long sys_setresgid /* 210 */ + .long sys_setresgid /* 210 */ .long sys_getresgid .long sys_lchown .long sys_setuid .long sys_setgid - .long sys_setfsuid /* 215 */ + .long sys_setfsuid /* 215 */ .long sys_setfsgid .long sys_pivot_root .long sys_ni_syscall .long sys_ni_syscall - .long sys_getdents64 /* 220 */ + .long sys_getdents64 /* 220 */ .long sys_gettid .long sys_tkill .long sys_setxattr .long sys_lsetxattr - .long sys_fsetxattr /* 225 */ + .long sys_fsetxattr /* 225 */ .long sys_getxattr .long sys_lgetxattr .long sys_fgetxattr .long sys_listxattr - .long sys_llistxattr /* 230 */ + .long sys_llistxattr /* 230 */ .long sys_flistxattr .long sys_removexattr .long sys_lremovexattr .long sys_fremovexattr - .long sys_futex /* 235 */ + .long sys_futex /* 235 */ .long sys_sendfile64 - .long sys_ni_syscall /* sys_mincore */ - .long sys_ni_syscall /* sys_madvise */ + .long sys_mincore + .long sys_madvise .long sys_fcntl64 - .long sys_readahead /* 240 */ + .long sys_readahead /* 240 */ .long sys_io_setup .long sys_io_destroy .long sys_io_getevents .long sys_io_submit - .long sys_io_cancel /* 245 */ + .long sys_io_cancel /* 245 */ .long sys_fadvise64 .long sys_exit_group .long sys_lookup_dcookie .long sys_epoll_create - .long sys_epoll_ctl /* 250 */ + .long sys_epoll_ctl /* 250 */ .long sys_epoll_wait - .long sys_ni_syscall /* sys_remap_file_pages */ + .long sys_remap_file_pages .long sys_set_tid_address .long sys_timer_create - .long sys_timer_settime /* 255 */ + .long sys_timer_settime /* 255 */ .long sys_timer_gettime .long sys_timer_getoverrun .long sys_timer_delete .long sys_clock_settime - .long sys_clock_gettime /* 260 */ + .long sys_clock_gettime /* 260 */ .long sys_clock_getres .long sys_clock_nanosleep .long sys_statfs64 .long sys_fstatfs64 - .long sys_tgkill /* 265 */ + .long sys_tgkill /* 265 */ .long sys_utimes .long sys_fadvise64_64 - .long sys_mbind + .long sys_mbind .long sys_get_mempolicy - .long sys_set_mempolicy /* 270 */ + .long sys_set_mempolicy /* 270 */ .long sys_mq_open .long sys_mq_unlink .long sys_mq_timedsend .long sys_mq_timedreceive - .long sys_mq_notify /* 275 */ + .long sys_mq_notify /* 275 */ .long sys_mq_getsetattr .long sys_waitid - .long sys_ni_syscall /* for sys_vserver */ + .long sys_ni_syscall /* for sys_vserver */ .long sys_add_key - .long sys_request_key /* 280 */ + .long sys_request_key /* 280 */ .long sys_keyctl .long sys_ioprio_set .long sys_ioprio_get @@ -363,7 +366,3 @@ ENTRY(sys_call_table) .long sys_clock_adjtime .long sys_syncfs - .rept NR_syscalls-(.-sys_call_table)/4 - .long sys_ni_syscall - .endr - -- cgit v1.1 From d6d42bb2f85d875dc0c421699de5a1401b2af6a6 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Fri, 6 May 2011 20:57:11 +0200 Subject: m68k: Really wire up sys_pselect6 and sys_ppoll We reserved the numbers a long time ago, but never wired them up in the syscall table as they need TIF_RESTORE_SIGMASK, which we only got last year in commit cb6831d5d3099e772a510eb3e1ed0760ccffb45e ("m68k: Switch to saner sigsuspend()") Signed-off-by: Geert Uytterhoeven Acked-by: Greg Ungerer Cc: stable@kernel.org --- arch/m68k/kernel/syscalltable.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/m68k/kernel/syscalltable.S b/arch/m68k/kernel/syscalltable.S index 0284192..5909e39 100644 --- a/arch/m68k/kernel/syscalltable.S +++ b/arch/m68k/kernel/syscalltable.S @@ -322,8 +322,8 @@ ENTRY(sys_call_table) .long sys_readlinkat .long sys_fchmodat .long sys_faccessat /* 300 */ - .long sys_ni_syscall /* Reserved for pselect6 */ - .long sys_ni_syscall /* Reserved for ppoll */ + .long sys_pselect6 + .long sys_ppoll .long sys_unshare .long sys_set_robust_list .long sys_get_robust_list /* 305 */ -- cgit v1.1 From 1fc74ac61229edfe053fb87e8939ae9ca3794389 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 5 May 2011 20:33:02 +0200 Subject: m68k: unistd - Comment out definitions for unimplemented syscalls Suggested-by: Arnd Bergmann Signed-off-by: Geert Uytterhoeven --- arch/m68k/include/asm/unistd.h | 46 ++++++++++++++++++++++-------------------- 1 file changed, 24 insertions(+), 22 deletions(-) diff --git a/arch/m68k/include/asm/unistd.h b/arch/m68k/include/asm/unistd.h index 29e1790..f3b649d 100644 --- a/arch/m68k/include/asm/unistd.h +++ b/arch/m68k/include/asm/unistd.h @@ -22,7 +22,7 @@ #define __NR_mknod 14 #define __NR_chmod 15 #define __NR_chown 16 -#define __NR_break 17 +/*#define __NR_break 17*/ #define __NR_oldstat 18 #define __NR_lseek 19 #define __NR_getpid 20 @@ -36,11 +36,11 @@ #define __NR_oldfstat 28 #define __NR_pause 29 #define __NR_utime 30 -#define __NR_stty 31 -#define __NR_gtty 32 +/*#define __NR_stty 31*/ +/*#define __NR_gtty 32*/ #define __NR_access 33 #define __NR_nice 34 -#define __NR_ftime 35 +/*#define __NR_ftime 35*/ #define __NR_sync 36 #define __NR_kill 37 #define __NR_rename 38 @@ -49,7 +49,7 @@ #define __NR_dup 41 #define __NR_pipe 42 #define __NR_times 43 -#define __NR_prof 44 +/*#define __NR_prof 44*/ #define __NR_brk 45 #define __NR_setgid 46 #define __NR_getgid 47 @@ -58,13 +58,13 @@ #define __NR_getegid 50 #define __NR_acct 51 #define __NR_umount2 52 -#define __NR_lock 53 +/*#define __NR_lock 53*/ #define __NR_ioctl 54 #define __NR_fcntl 55 -#define __NR_mpx 56 +/*#define __NR_mpx 56*/ #define __NR_setpgid 57 -#define __NR_ulimit 58 -#define __NR_oldolduname 59 +/*#define __NR_ulimit 58*/ +/*#define __NR_oldolduname 59*/ #define __NR_umask 60 #define __NR_chroot 61 #define __NR_ustat 62 @@ -103,10 +103,10 @@ #define __NR_fchown 95 #define __NR_getpriority 96 #define __NR_setpriority 97 -#define __NR_profil 98 +/*#define __NR_profil 98*/ #define __NR_statfs 99 #define __NR_fstatfs 100 -#define __NR_ioperm 101 +/*#define __NR_ioperm 101*/ #define __NR_socketcall 102 #define __NR_syslog 103 #define __NR_setitimer 104 @@ -114,11 +114,11 @@ #define __NR_stat 106 #define __NR_lstat 107 #define __NR_fstat 108 -#define __NR_olduname 109 -#define __NR_iopl /* 110 */ not supported +/*#define __NR_olduname 109*/ +/*#define __NR_iopl 110*/ /* not supported */ #define __NR_vhangup 111 -#define __NR_idle /* 112 */ Obsolete -#define __NR_vm86 /* 113 */ not supported +/*#define __NR_idle 112*/ /* Obsolete */ +/*#define __NR_vm86 113*/ /* not supported */ #define __NR_wait4 114 #define __NR_swapoff 115 #define __NR_sysinfo 116 @@ -132,17 +132,17 @@ #define __NR_adjtimex 124 #define __NR_mprotect 125 #define __NR_sigprocmask 126 -#define __NR_create_module 127 +/*#define __NR_create_module 127*/ #define __NR_init_module 128 #define __NR_delete_module 129 -#define __NR_get_kernel_syms 130 +/*#define __NR_get_kernel_syms 130*/ #define __NR_quotactl 131 #define __NR_getpgid 132 #define __NR_fchdir 133 #define __NR_bdflush 134 #define __NR_sysfs 135 #define __NR_personality 136 -#define __NR_afs_syscall 137 /* Syscall for Andrew File System */ +/*#define __NR_afs_syscall 137*/ /* Syscall for Andrew File System */ #define __NR_setfsuid 138 #define __NR_setfsgid 139 #define __NR__llseek 140 @@ -172,7 +172,7 @@ #define __NR_setresuid 164 #define __NR_getresuid 165 #define __NR_getpagesize 166 -#define __NR_query_module 167 +/*#define __NR_query_module 167*/ #define __NR_poll 168 #define __NR_nfsservctl 169 #define __NR_setresgid 170 @@ -193,8 +193,8 @@ #define __NR_capset 185 #define __NR_sigaltstack 186 #define __NR_sendfile 187 -#define __NR_getpmsg 188 /* some people actually want streams */ -#define __NR_putpmsg 189 /* some people actually want streams */ +/*#define __NR_getpmsg 188*/ /* some people actually want streams */ +/*#define __NR_putpmsg 189*/ /* some people actually want streams */ #define __NR_vfork 190 #define __NR_ugetrlimit 191 #define __NR_mmap2 192 @@ -223,6 +223,8 @@ #define __NR_setfsuid32 215 #define __NR_setfsgid32 216 #define __NR_pivot_root 217 +/* 218*/ +/* 219*/ #define __NR_getdents64 220 #define __NR_gettid 221 #define __NR_tkill 222 @@ -281,7 +283,7 @@ #define __NR_mq_notify 275 #define __NR_mq_getsetattr 276 #define __NR_waitid 277 -#define __NR_vserver 278 +/*#define __NR_vserver 278*/ #define __NR_add_key 279 #define __NR_request_key 280 #define __NR_keyctl 281 -- cgit v1.1 From 79abeed6ee93231d494c191a9251c0845bd71fdd Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 4 May 2011 14:55:41 +0200 Subject: m68k/atari: Do not use "/" in interrupt names It may trigger a warning in fs/proc/generic.c:__xlate_proc_name() when trying to add an entry for the interrupt handler to sysfs. Signed-off-by: Geert Uytterhoeven --- arch/m68k/atari/atakeyb.c | 2 +- arch/m68k/atari/stdma.c | 2 +- drivers/net/atarilance.c | 2 +- drivers/video/atafb.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/m68k/atari/atakeyb.c b/arch/m68k/atari/atakeyb.c index b995513..db26f0e 100644 --- a/arch/m68k/atari/atakeyb.c +++ b/arch/m68k/atari/atakeyb.c @@ -575,7 +575,7 @@ int atari_keyb_init(void) kb_state.len = 0; error = request_irq(IRQ_MFP_ACIA, atari_keyboard_interrupt, - IRQ_TYPE_SLOW, "keyboard/mouse/MIDI", + IRQ_TYPE_SLOW, "keyboard,mouse,MIDI", atari_keyboard_interrupt); if (error) return error; diff --git a/arch/m68k/atari/stdma.c b/arch/m68k/atari/stdma.c index 604329f..ddbf43c 100644 --- a/arch/m68k/atari/stdma.c +++ b/arch/m68k/atari/stdma.c @@ -180,7 +180,7 @@ void __init stdma_init(void) { stdma_isr = NULL; if (request_irq(IRQ_MFP_FDC, stdma_int, IRQ_TYPE_SLOW | IRQF_SHARED, - "ST-DMA: floppy/ACSI/IDE/Falcon-SCSI", stdma_int)) + "ST-DMA floppy,ACSI,IDE,Falcon-SCSI", stdma_int)) pr_err("Couldn't register ST-DMA interrupt\n"); } diff --git a/drivers/net/atarilance.c b/drivers/net/atarilance.c index ce0091e..1264d78 100644 --- a/drivers/net/atarilance.c +++ b/drivers/net/atarilance.c @@ -554,7 +554,7 @@ static unsigned long __init lance_probe1( struct net_device *dev, memaddr == (unsigned short *)0xffe00000) { /* PAMs card and Riebl on ST use level 5 autovector */ if (request_irq(IRQ_AUTO_5, lance_interrupt, IRQ_TYPE_PRIO, - "PAM/Riebl-ST Ethernet", dev)) { + "PAM,Riebl-ST Ethernet", dev)) { printk( "Lance: request for irq %d failed\n", IRQ_AUTO_5 ); return 0; } diff --git a/drivers/video/atafb.c b/drivers/video/atafb.c index 5b2b5ef..64e41f5 100644 --- a/drivers/video/atafb.c +++ b/drivers/video/atafb.c @@ -3117,7 +3117,7 @@ int __init atafb_init(void) atafb_ops.fb_setcolreg = &falcon_setcolreg; error = request_irq(IRQ_AUTO_4, falcon_vbl_switcher, IRQ_TYPE_PRIO, - "framebuffer/modeswitch", + "framebuffer:modeswitch", falcon_vbl_switcher); if (error) return error; -- cgit v1.1 From 7786908c3c1bb38dcc5cd2c037251c05507eef16 Mon Sep 17 00:00:00 2001 From: Michael Schmitz Date: Tue, 16 Dec 2008 21:26:03 +0100 Subject: input/atari: Use the correct mouse interrupt hook The Atari keyboard driver calls atari_mouse_interrupt_hook if it's set, not atari_input_mouse_interrupt_hook. Fix below. [geert] Killed off atari_mouse_interrupt_hook completely, after fixing another incorrect assignment in atarimouse.c. Signed-off-by: Michael Schmitz Signed-off-by: Geert Uytterhoeven --- arch/m68k/atari/atakeyb.c | 7 ++----- arch/m68k/include/asm/atarikb.h | 2 -- drivers/input/mouse/atarimouse.c | 2 +- 3 files changed, 3 insertions(+), 8 deletions(-) diff --git a/arch/m68k/atari/atakeyb.c b/arch/m68k/atari/atakeyb.c index db26f0e..95022b0 100644 --- a/arch/m68k/atari/atakeyb.c +++ b/arch/m68k/atari/atakeyb.c @@ -36,13 +36,10 @@ /* Hook for MIDI serial driver */ void (*atari_MIDI_interrupt_hook) (void); -/* Hook for mouse driver */ -void (*atari_mouse_interrupt_hook) (char *); /* Hook for keyboard inputdev driver */ void (*atari_input_keyboard_interrupt_hook) (unsigned char, char); /* Hook for mouse inputdev driver */ void (*atari_input_mouse_interrupt_hook) (char *); -EXPORT_SYMBOL(atari_mouse_interrupt_hook); EXPORT_SYMBOL(atari_input_keyboard_interrupt_hook); EXPORT_SYMBOL(atari_input_mouse_interrupt_hook); @@ -263,8 +260,8 @@ repeat: kb_state.buf[kb_state.len++] = scancode; if (kb_state.len == 3) { kb_state.state = KEYBOARD; - if (atari_mouse_interrupt_hook) - atari_mouse_interrupt_hook(kb_state.buf); + if (atari_input_mouse_interrupt_hook) + atari_input_mouse_interrupt_hook(kb_state.buf); } break; diff --git a/arch/m68k/include/asm/atarikb.h b/arch/m68k/include/asm/atarikb.h index 546e7da..68f3622 100644 --- a/arch/m68k/include/asm/atarikb.h +++ b/arch/m68k/include/asm/atarikb.h @@ -34,8 +34,6 @@ void ikbd_joystick_disable(void); /* Hook for MIDI serial driver */ extern void (*atari_MIDI_interrupt_hook) (void); -/* Hook for mouse driver */ -extern void (*atari_mouse_interrupt_hook) (char *); /* Hook for keyboard inputdev driver */ extern void (*atari_input_keyboard_interrupt_hook) (unsigned char, char); /* Hook for mouse inputdev driver */ diff --git a/drivers/input/mouse/atarimouse.c b/drivers/input/mouse/atarimouse.c index adf45b3..a57143c 100644 --- a/drivers/input/mouse/atarimouse.c +++ b/drivers/input/mouse/atarimouse.c @@ -108,7 +108,7 @@ static int atamouse_open(struct input_dev *dev) static void atamouse_close(struct input_dev *dev) { ikbd_mouse_disable(); - atari_mouse_interrupt_hook = NULL; + atari_input_mouse_interrupt_hook = NULL; } static int __init atamouse_init(void) -- cgit v1.1 From 186f200a95cbd13c291cdd3ddeb07aad0a3782cc Mon Sep 17 00:00:00 2001 From: Michael Schmitz Date: Sun, 28 Dec 2008 23:00:45 +0100 Subject: input/atari: Fix atarimouse init Atarimouse fails to load as a module (with ENODEV), due to a brown paper bag bug, misinterpreting the semantics of atari_keyb_init(). [geert] Propagate the return value of atari_keyb_init() everywhere Signed-off-by: Michael Schmitz Signed-off-by: Geert Uytterhoeven --- drivers/input/keyboard/atakbd.c | 5 +++-- drivers/input/mouse/atarimouse.c | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/input/keyboard/atakbd.c b/drivers/input/keyboard/atakbd.c index 1839194..10bcd4a 100644 --- a/drivers/input/keyboard/atakbd.c +++ b/drivers/input/keyboard/atakbd.c @@ -223,8 +223,9 @@ static int __init atakbd_init(void) return -ENODEV; // need to init core driver if not already done so - if (atari_keyb_init()) - return -ENODEV; + error = atari_keyb_init(); + if (error) + return error; atakbd_dev = input_allocate_device(); if (!atakbd_dev) diff --git a/drivers/input/mouse/atarimouse.c b/drivers/input/mouse/atarimouse.c index a57143c..16a9d0f 100644 --- a/drivers/input/mouse/atarimouse.c +++ b/drivers/input/mouse/atarimouse.c @@ -118,8 +118,9 @@ static int __init atamouse_init(void) if (!MACH_IS_ATARI || !ATARIHW_PRESENT(ST_MFP)) return -ENODEV; - if (!atari_keyb_init()) - return -ENODEV; + error = atari_keyb_init(); + if (error) + return error; atamouse_dev = input_allocate_device(); if (!atamouse_dev) -- cgit v1.1 From 659e6ed55ff8d617c895c10288644e3e6107834e Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 31 Jan 2011 20:15:04 +0100 Subject: input/atari: Fix mouse movement and button mapping Up and down movements were reversed, left and right buttons were swapped. Signed-off-by: Geert Uytterhoeven --- drivers/input/mouse/atarimouse.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/input/mouse/atarimouse.c b/drivers/input/mouse/atarimouse.c index 16a9d0f..5c4a692 100644 --- a/drivers/input/mouse/atarimouse.c +++ b/drivers/input/mouse/atarimouse.c @@ -77,15 +77,15 @@ static void atamouse_interrupt(char *buf) #endif /* only relative events get here */ - dx = buf[1]; - dy = -buf[2]; + dx = buf[1]; + dy = buf[2]; input_report_rel(atamouse_dev, REL_X, dx); input_report_rel(atamouse_dev, REL_Y, dy); - input_report_key(atamouse_dev, BTN_LEFT, buttons & 0x1); + input_report_key(atamouse_dev, BTN_LEFT, buttons & 0x4); input_report_key(atamouse_dev, BTN_MIDDLE, buttons & 0x2); - input_report_key(atamouse_dev, BTN_RIGHT, buttons & 0x4); + input_report_key(atamouse_dev, BTN_RIGHT, buttons & 0x1); input_sync(atamouse_dev); -- cgit v1.1 From 52c3ce4ec5601ee383a14f1485f6bac7b278896e Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Wed, 27 Apr 2011 16:44:26 +0100 Subject: kmemleak: Do not return a pointer to an object that kmemleak did not get The kmemleak_seq_next() function tries to get an object (and increment its use count) before returning it. If it could not get the last object during list traversal (because it may have been freed), the function should return NULL rather than a pointer to such object that it did not get. Signed-off-by: Catalin Marinas Reported-by: Phil Carmody Acked-by: Phil Carmody Cc: --- mm/kmemleak.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/mm/kmemleak.c b/mm/kmemleak.c index c1d5867..aacee45 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -1414,9 +1414,12 @@ static void *kmemleak_seq_next(struct seq_file *seq, void *v, loff_t *pos) ++(*pos); list_for_each_continue_rcu(n, &object_list) { - next_obj = list_entry(n, struct kmemleak_object, object_list); - if (get_object(next_obj)) + struct kmemleak_object *obj = + list_entry(n, struct kmemleak_object, object_list); + if (get_object(obj)) { + next_obj = obj; break; + } } put_object(prev_obj); -- cgit v1.1 From 79e0d9bd262bdd36009e8092e57e34dc5e22a1c7 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Wed, 27 Apr 2011 17:06:19 +0100 Subject: kmemleak: Select DEBUG_FS unconditionally in DEBUG_KMEMLEAK In the past DEBUG_FS used to depend on SYSFS and DEBUG_KMEMLEAK selected it conditionally. This is no longer the case, so always select DEBUG_FS via DEBUG_KMEMLEAK. Signed-off-by: Catalin Marinas --- lib/Kconfig.debug | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index c768bcd..4eebb90 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -400,7 +400,7 @@ config DEBUG_KMEMLEAK depends on DEBUG_KERNEL && EXPERIMENTAL && !MEMORY_HOTPLUG && \ (X86 || ARM || PPC || S390 || SPARC64 || SUPERH || MICROBLAZE || TILE) - select DEBUG_FS if SYSFS + select DEBUG_FS select STACKTRACE if STACKTRACE_SUPPORT select KALLSYMS select CRC32 -- cgit v1.1 From 9b090f2da85bd0df5e1a1ecfe4120b7b50358f48 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Thu, 19 May 2011 16:25:30 +0100 Subject: kmemleak: Initialise kmemleak after debug_objects_mem_init() Kmemleak frees objects via RCU and when CONFIG_DEBUG_OBJECTS_RCU_HEAD is enabled, the RCU callback triggers a call to free_object() in lib/debugobjects.c. Since kmemleak is initialised before debug objects initialisation, it may result in a kernel panic during booting. This patch moves the kmemleak_init() call after debug_objects_mem_init(). Reported-by: Marcin Slusarz Tested-by: Tejun Heo Signed-off-by: Catalin Marinas Cc: --- init/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/init/main.c b/init/main.c index 4a9479e..48df882 100644 --- a/init/main.c +++ b/init/main.c @@ -580,8 +580,8 @@ asmlinkage void __init start_kernel(void) #endif page_cgroup_init(); enable_debug_pagealloc(); - kmemleak_init(); debug_objects_mem_init(); + kmemleak_init(); setup_per_cpu_pageset(); numa_policy_init(); if (late_time_init) -- cgit v1.1 From 75d65a425c0163d3ec476ddc12b51087217a070c Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 19 May 2011 13:50:07 -0700 Subject: hlist: remove software prefetching in hlist iterators They not only increase the code footprint, they actually make things slower rather than faster. On internationally acclaimed benchmarks ("make -j16" on an already fully built kernel source tree) the hlist prefetching slows down the build by up to 1%. (Almost all of it comes from hlist_for_each_entry_rcu() as used by avc_has_perm_noaudit(), which is very hot due to all the pathname lookups to see if there is anything to do). The cause seems to be two-fold: - on at least some Intel cores, prefetch(NULL) ends up with some microarchitectural stall due to the TLB miss that it incurs. The hlist case triggers this very commonly, since the NULL pointer is the last entry in the list. - the prefetch appears to cause more D$ activity, probably because it prefetches hash list entries that are never actually used (because we ended the search early due to a hit). Regardless, the numbers clearly say that the implicit prefetching is simply a bad idea. If some _particular_ user of the hlist iterators wants to prefetch the next list entry, they can do so themselves explicitly, rather than depend on all list iterators doing so implicitly. Acked-by: Ingo Molnar Acked-by: David S. Miller Cc: linux-arch@vger.kernel.org Signed-off-by: Linus Torvalds --- include/linux/list.h | 9 ++++----- include/linux/rculist.h | 10 +++++----- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/include/linux/list.h b/include/linux/list.h index 3a54266..9ac1114 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -664,8 +664,7 @@ static inline void hlist_move_list(struct hlist_head *old, #define hlist_entry(ptr, type, member) container_of(ptr,type,member) #define hlist_for_each(pos, head) \ - for (pos = (head)->first; pos && ({ prefetch(pos->next); 1; }); \ - pos = pos->next) + for (pos = (head)->first; pos ; pos = pos->next) #define hlist_for_each_safe(pos, n, head) \ for (pos = (head)->first; pos && ({ n = pos->next; 1; }); \ @@ -680,7 +679,7 @@ static inline void hlist_move_list(struct hlist_head *old, */ #define hlist_for_each_entry(tpos, pos, head, member) \ for (pos = (head)->first; \ - pos && ({ prefetch(pos->next); 1;}) && \ + pos && \ ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \ pos = pos->next) @@ -692,7 +691,7 @@ static inline void hlist_move_list(struct hlist_head *old, */ #define hlist_for_each_entry_continue(tpos, pos, member) \ for (pos = (pos)->next; \ - pos && ({ prefetch(pos->next); 1;}) && \ + pos && \ ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \ pos = pos->next) @@ -703,7 +702,7 @@ static inline void hlist_move_list(struct hlist_head *old, * @member: the name of the hlist_node within the struct. */ #define hlist_for_each_entry_from(tpos, pos, member) \ - for (; pos && ({ prefetch(pos->next); 1;}) && \ + for (; pos && \ ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \ pos = pos->next) diff --git a/include/linux/rculist.h b/include/linux/rculist.h index 2dea94f..900a97a 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -427,7 +427,7 @@ static inline void hlist_add_after_rcu(struct hlist_node *prev, #define __hlist_for_each_rcu(pos, head) \ for (pos = rcu_dereference(hlist_first_rcu(head)); \ - pos && ({ prefetch(pos->next); 1; }); \ + pos; \ pos = rcu_dereference(hlist_next_rcu(pos))) /** @@ -443,7 +443,7 @@ static inline void hlist_add_after_rcu(struct hlist_node *prev, */ #define hlist_for_each_entry_rcu(tpos, pos, head, member) \ for (pos = rcu_dereference_raw(hlist_first_rcu(head)); \ - pos && ({ prefetch(pos->next); 1; }) && \ + pos && \ ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1; }); \ pos = rcu_dereference_raw(hlist_next_rcu(pos))) @@ -460,7 +460,7 @@ static inline void hlist_add_after_rcu(struct hlist_node *prev, */ #define hlist_for_each_entry_rcu_bh(tpos, pos, head, member) \ for (pos = rcu_dereference_bh((head)->first); \ - pos && ({ prefetch(pos->next); 1; }) && \ + pos && \ ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1; }); \ pos = rcu_dereference_bh(pos->next)) @@ -472,7 +472,7 @@ static inline void hlist_add_after_rcu(struct hlist_node *prev, */ #define hlist_for_each_entry_continue_rcu(tpos, pos, member) \ for (pos = rcu_dereference((pos)->next); \ - pos && ({ prefetch(pos->next); 1; }) && \ + pos && \ ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1; }); \ pos = rcu_dereference(pos->next)) @@ -484,7 +484,7 @@ static inline void hlist_add_after_rcu(struct hlist_node *prev, */ #define hlist_for_each_entry_continue_rcu_bh(tpos, pos, member) \ for (pos = rcu_dereference_bh((pos)->next); \ - pos && ({ prefetch(pos->next); 1; }) && \ + pos && \ ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1; }); \ pos = rcu_dereference_bh(pos->next)) -- cgit v1.1 From e66eed651fd18a961f11cda62f3b5286c8cc4f9f Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 19 May 2011 14:15:29 -0700 Subject: list: remove prefetching from regular list iterators This is removes the use of software prefetching from the regular list iterators. We don't want it. If you do want to prefetch in some iterator of yours, go right ahead. Just don't expect the iterator to do it, since normally the downsides are bigger than the upsides. It also replaces with , because the use of LIST_POISON ends up needing it. is sadly not self-contained, and including prefetch.h just happened to hide that. Suggested by David Miller (networking has a lot of regular lists that are often empty or a single entry, and prefetching is not going to do anything but add useless instructions). Acked-by: Ingo Molnar Acked-by: David S. Miller Cc: linux-arch@vger.kernel.org Signed-off-by: Linus Torvalds --- include/linux/list.h | 26 +++++++++++--------------- include/linux/rculist.h | 6 +++--- 2 files changed, 14 insertions(+), 18 deletions(-) diff --git a/include/linux/list.h b/include/linux/list.h index 9ac1114..cc6d2aa 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -4,7 +4,7 @@ #include #include #include -#include +#include /* * Simple doubly linked list implementation. @@ -367,18 +367,15 @@ static inline void list_splice_tail_init(struct list_head *list, * @head: the head for your list. */ #define list_for_each(pos, head) \ - for (pos = (head)->next; prefetch(pos->next), pos != (head); \ - pos = pos->next) + for (pos = (head)->next; pos != (head); pos = pos->next) /** * __list_for_each - iterate over a list * @pos: the &struct list_head to use as a loop cursor. * @head: the head for your list. * - * This variant differs from list_for_each() in that it's the - * simplest possible list iteration code, no prefetching is done. - * Use this for code that knows the list to be very short (empty - * or 1 entry) most of the time. + * This variant doesn't differ from list_for_each() any more. + * We don't do prefetching in either case. */ #define __list_for_each(pos, head) \ for (pos = (head)->next; pos != (head); pos = pos->next) @@ -389,8 +386,7 @@ static inline void list_splice_tail_init(struct list_head *list, * @head: the head for your list. */ #define list_for_each_prev(pos, head) \ - for (pos = (head)->prev; prefetch(pos->prev), pos != (head); \ - pos = pos->prev) + for (pos = (head)->prev; pos != (head); pos = pos->prev) /** * list_for_each_safe - iterate over a list safe against removal of list entry @@ -410,7 +406,7 @@ static inline void list_splice_tail_init(struct list_head *list, */ #define list_for_each_prev_safe(pos, n, head) \ for (pos = (head)->prev, n = pos->prev; \ - prefetch(pos->prev), pos != (head); \ + pos != (head); \ pos = n, n = pos->prev) /** @@ -421,7 +417,7 @@ static inline void list_splice_tail_init(struct list_head *list, */ #define list_for_each_entry(pos, head, member) \ for (pos = list_entry((head)->next, typeof(*pos), member); \ - prefetch(pos->member.next), &pos->member != (head); \ + &pos->member != (head); \ pos = list_entry(pos->member.next, typeof(*pos), member)) /** @@ -432,7 +428,7 @@ static inline void list_splice_tail_init(struct list_head *list, */ #define list_for_each_entry_reverse(pos, head, member) \ for (pos = list_entry((head)->prev, typeof(*pos), member); \ - prefetch(pos->member.prev), &pos->member != (head); \ + &pos->member != (head); \ pos = list_entry(pos->member.prev, typeof(*pos), member)) /** @@ -457,7 +453,7 @@ static inline void list_splice_tail_init(struct list_head *list, */ #define list_for_each_entry_continue(pos, head, member) \ for (pos = list_entry(pos->member.next, typeof(*pos), member); \ - prefetch(pos->member.next), &pos->member != (head); \ + &pos->member != (head); \ pos = list_entry(pos->member.next, typeof(*pos), member)) /** @@ -471,7 +467,7 @@ static inline void list_splice_tail_init(struct list_head *list, */ #define list_for_each_entry_continue_reverse(pos, head, member) \ for (pos = list_entry(pos->member.prev, typeof(*pos), member); \ - prefetch(pos->member.prev), &pos->member != (head); \ + &pos->member != (head); \ pos = list_entry(pos->member.prev, typeof(*pos), member)) /** @@ -483,7 +479,7 @@ static inline void list_splice_tail_init(struct list_head *list, * Iterate over list of given type, continuing from current position. */ #define list_for_each_entry_from(pos, head, member) \ - for (; prefetch(pos->member.next), &pos->member != (head); \ + for (; &pos->member != (head); \ pos = list_entry(pos->member.next, typeof(*pos), member)) /** diff --git a/include/linux/rculist.h b/include/linux/rculist.h index 900a97a..e3beb31 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -253,7 +253,7 @@ static inline void list_splice_init_rcu(struct list_head *list, */ #define list_for_each_entry_rcu(pos, head, member) \ for (pos = list_entry_rcu((head)->next, typeof(*pos), member); \ - prefetch(pos->member.next), &pos->member != (head); \ + &pos->member != (head); \ pos = list_entry_rcu(pos->member.next, typeof(*pos), member)) @@ -270,7 +270,7 @@ static inline void list_splice_init_rcu(struct list_head *list, */ #define list_for_each_continue_rcu(pos, head) \ for ((pos) = rcu_dereference_raw(list_next_rcu(pos)); \ - prefetch((pos)->next), (pos) != (head); \ + (pos) != (head); \ (pos) = rcu_dereference_raw(list_next_rcu(pos))) /** @@ -284,7 +284,7 @@ static inline void list_splice_init_rcu(struct list_head *list, */ #define list_for_each_entry_continue_rcu(pos, head, member) \ for (pos = list_entry_rcu(pos->member.next, typeof(*pos), member); \ - prefetch(pos->member.next), &pos->member != (head); \ + &pos->member != (head); \ pos = list_entry_rcu(pos->member.next, typeof(*pos), member)) /** -- cgit v1.1 From 80d02085d99039b3b7f3a73c8896226b0cb1ba07 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 12 May 2011 01:08:07 -0700 Subject: Revert "rcu: Decrease memory-barrier usage based on semi-formal proof" This reverts commit e59fb3120becfb36b22ddb8bd27d065d3cdca499. This reversion was due to (extreme) boot-time slowdowns on SPARC seen by Yinghai Lu and on x86 by Ingo . This is a non-trivial reversion due to intervening commits. Conflicts: Documentation/RCU/trace.txt kernel/rcutree.c Signed-off-by: Ingo Molnar --- Documentation/RCU/trace.txt | 17 ++++-- kernel/rcutree.c | 130 +++++++++++++++++++++++++------------------- kernel/rcutree.h | 9 ++- kernel/rcutree_plugin.h | 7 ++- kernel/rcutree_trace.c | 12 ++-- 5 files changed, 102 insertions(+), 73 deletions(-) diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt index 8173cec..c078ad4 100644 --- a/Documentation/RCU/trace.txt +++ b/Documentation/RCU/trace.txt @@ -99,11 +99,18 @@ o "qp" indicates that RCU still expects a quiescent state from o "dt" is the current value of the dyntick counter that is incremented when entering or leaving dynticks idle state, either by the - scheduler or by irq. This number is even if the CPU is in - dyntick idle mode and odd otherwise. The number after the first - "/" is the interrupt nesting depth when in dyntick-idle state, - or one greater than the interrupt-nesting depth otherwise. - The number after the second "/" is the NMI nesting depth. + scheduler or by irq. The number after the "/" is the interrupt + nesting depth when in dyntick-idle state, or one greater than + the interrupt-nesting depth otherwise. + + This field is displayed only for CONFIG_NO_HZ kernels. + +o "dn" is the current value of the dyntick counter that is incremented + when entering or leaving dynticks idle state via NMI. If both + the "dt" and "dn" values are even, then this CPU is in dynticks + idle mode and may be ignored by RCU. If either of these two + counters is odd, then RCU must be alert to the possibility of + an RCU read-side critical section running on this CPU. This field is displayed only for CONFIG_NO_HZ kernels. diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 5616b17..e486f7c 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -162,7 +162,7 @@ EXPORT_SYMBOL_GPL(rcu_note_context_switch); #ifdef CONFIG_NO_HZ DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { .dynticks_nesting = 1, - .dynticks = ATOMIC_INIT(1), + .dynticks = 1, }; #endif /* #ifdef CONFIG_NO_HZ */ @@ -321,25 +321,13 @@ void rcu_enter_nohz(void) unsigned long flags; struct rcu_dynticks *rdtp; + smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */ local_irq_save(flags); rdtp = &__get_cpu_var(rcu_dynticks); - if (--rdtp->dynticks_nesting) { - local_irq_restore(flags); - return; - } - /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ - smp_mb__before_atomic_inc(); /* See above. */ - atomic_inc(&rdtp->dynticks); - smp_mb__after_atomic_inc(); /* Force ordering with next sojourn. */ - WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); + rdtp->dynticks++; + rdtp->dynticks_nesting--; + WARN_ON_ONCE(rdtp->dynticks & 0x1); local_irq_restore(flags); - - /* If the interrupt queued a callback, get out of dyntick mode. */ - if (in_irq() && - (__get_cpu_var(rcu_sched_data).nxtlist || - __get_cpu_var(rcu_bh_data).nxtlist || - rcu_preempt_needs_cpu(smp_processor_id()))) - set_need_resched(); } /* @@ -355,16 +343,11 @@ void rcu_exit_nohz(void) local_irq_save(flags); rdtp = &__get_cpu_var(rcu_dynticks); - if (rdtp->dynticks_nesting++) { - local_irq_restore(flags); - return; - } - smp_mb__before_atomic_inc(); /* Force ordering w/previous sojourn. */ - atomic_inc(&rdtp->dynticks); - /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ - smp_mb__after_atomic_inc(); /* See above. */ - WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); + rdtp->dynticks++; + rdtp->dynticks_nesting++; + WARN_ON_ONCE(!(rdtp->dynticks & 0x1)); local_irq_restore(flags); + smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */ } /** @@ -378,15 +361,11 @@ void rcu_nmi_enter(void) { struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); - if (rdtp->dynticks_nmi_nesting == 0 && - (atomic_read(&rdtp->dynticks) & 0x1)) + if (rdtp->dynticks & 0x1) return; - rdtp->dynticks_nmi_nesting++; - smp_mb__before_atomic_inc(); /* Force delay from prior write. */ - atomic_inc(&rdtp->dynticks); - /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ - smp_mb__after_atomic_inc(); /* See above. */ - WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); + rdtp->dynticks_nmi++; + WARN_ON_ONCE(!(rdtp->dynticks_nmi & 0x1)); + smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */ } /** @@ -400,14 +379,11 @@ void rcu_nmi_exit(void) { struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); - if (rdtp->dynticks_nmi_nesting == 0 || - --rdtp->dynticks_nmi_nesting != 0) + if (rdtp->dynticks & 0x1) return; - /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ - smp_mb__before_atomic_inc(); /* See above. */ - atomic_inc(&rdtp->dynticks); - smp_mb__after_atomic_inc(); /* Force delay to next write. */ - WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); + smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */ + rdtp->dynticks_nmi++; + WARN_ON_ONCE(rdtp->dynticks_nmi & 0x1); } /** @@ -418,7 +394,13 @@ void rcu_nmi_exit(void) */ void rcu_irq_enter(void) { - rcu_exit_nohz(); + struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); + + if (rdtp->dynticks_nesting++) + return; + rdtp->dynticks++; + WARN_ON_ONCE(!(rdtp->dynticks & 0x1)); + smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */ } /** @@ -430,7 +412,18 @@ void rcu_irq_enter(void) */ void rcu_irq_exit(void) { - rcu_enter_nohz(); + struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); + + if (--rdtp->dynticks_nesting) + return; + smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */ + rdtp->dynticks++; + WARN_ON_ONCE(rdtp->dynticks & 0x1); + + /* If the interrupt queued a callback, get out of dyntick mode. */ + if (__this_cpu_read(rcu_sched_data.nxtlist) || + __this_cpu_read(rcu_bh_data.nxtlist)) + set_need_resched(); } #ifdef CONFIG_SMP @@ -442,8 +435,19 @@ void rcu_irq_exit(void) */ static int dyntick_save_progress_counter(struct rcu_data *rdp) { - rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks); - return 0; + int ret; + int snap; + int snap_nmi; + + snap = rdp->dynticks->dynticks; + snap_nmi = rdp->dynticks->dynticks_nmi; + smp_mb(); /* Order sampling of snap with end of grace period. */ + rdp->dynticks_snap = snap; + rdp->dynticks_nmi_snap = snap_nmi; + ret = ((snap & 0x1) == 0) && ((snap_nmi & 0x1) == 0); + if (ret) + rdp->dynticks_fqs++; + return ret; } /* @@ -454,11 +458,16 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp) */ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) { - unsigned long curr; - unsigned long snap; + long curr; + long curr_nmi; + long snap; + long snap_nmi; - curr = (unsigned long)atomic_add_return(0, &rdp->dynticks->dynticks); - snap = (unsigned long)rdp->dynticks_snap; + curr = rdp->dynticks->dynticks; + snap = rdp->dynticks_snap; + curr_nmi = rdp->dynticks->dynticks_nmi; + snap_nmi = rdp->dynticks_nmi_snap; + smp_mb(); /* force ordering with cpu entering/leaving dynticks. */ /* * If the CPU passed through or entered a dynticks idle phase with @@ -468,7 +477,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) * read-side critical section that started before the beginning * of the current RCU grace period. */ - if ((curr & 0x1) == 0 || ULONG_CMP_GE(curr, snap + 2)) { + if ((curr != snap || (curr & 0x1) == 0) && + (curr_nmi != snap_nmi || (curr_nmi & 0x1) == 0)) { rdp->dynticks_fqs++; return 1; } @@ -897,12 +907,6 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) unsigned long gp_duration; WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); - - /* - * Ensure that all grace-period and pre-grace-period activity - * is seen before the assignment to rsp->completed. - */ - smp_mb(); /* See above block comment. */ gp_duration = jiffies - rsp->gp_start; if (gp_duration > rsp->gp_max) rsp->gp_max = gp_duration; @@ -1450,11 +1454,25 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) */ static void rcu_process_callbacks(void) { + /* + * Memory references from any prior RCU read-side critical sections + * executed by the interrupted code must be seen before any RCU + * grace-period manipulations below. + */ + smp_mb(); /* See above block comment. */ + __rcu_process_callbacks(&rcu_sched_state, &__get_cpu_var(rcu_sched_data)); __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data)); rcu_preempt_process_callbacks(); + /* + * Memory references from any later RCU read-side critical sections + * executed by the interrupted code must be seen after any RCU + * grace-period manipulations above. + */ + smp_mb(); /* See above block comment. */ + /* If we are last CPU on way to dyntick-idle mode, accelerate it. */ rcu_needs_cpu_flush(); } diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 93d4a1c..2576648 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -84,9 +84,11 @@ * Dynticks per-CPU state. */ struct rcu_dynticks { - int dynticks_nesting; /* Track irq/process nesting level. */ - int dynticks_nmi_nesting; /* Track NMI nesting level. */ - atomic_t dynticks; /* Even value for dynticks-idle, else odd. */ + int dynticks_nesting; /* Track nesting level, sort of. */ + int dynticks; /* Even value for dynticks-idle, else odd. */ + int dynticks_nmi; /* Even value for either dynticks-idle or */ + /* not in nmi handler, else odd. So this */ + /* remains even for nmi from irq handler. */ }; /* RCU's kthread states for tracing. */ @@ -282,6 +284,7 @@ struct rcu_data { /* 3) dynticks interface. */ struct rcu_dynticks *dynticks; /* Shared per-CPU dynticks state. */ int dynticks_snap; /* Per-GP tracking for dynticks. */ + int dynticks_nmi_snap; /* Per-GP tracking for dynticks_nmi. */ #endif /* #ifdef CONFIG_NO_HZ */ /* 4) reasons this CPU needed to be kicked by force_quiescent_state */ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index ed339702..3f6559a 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -1520,6 +1520,7 @@ int rcu_needs_cpu(int cpu) { int c = 0; int snap; + int snap_nmi; int thatcpu; /* Check for being in the holdoff period. */ @@ -1530,10 +1531,10 @@ int rcu_needs_cpu(int cpu) for_each_online_cpu(thatcpu) { if (thatcpu == cpu) continue; - snap = atomic_add_return(0, &per_cpu(rcu_dynticks, - thatcpu).dynticks); + snap = per_cpu(rcu_dynticks, thatcpu).dynticks; + snap_nmi = per_cpu(rcu_dynticks, thatcpu).dynticks_nmi; smp_mb(); /* Order sampling of snap with end of grace period. */ - if ((snap & 0x1) != 0) { + if (((snap & 0x1) != 0) || ((snap_nmi & 0x1) != 0)) { per_cpu(rcu_dyntick_drain, cpu) = 0; per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; return rcu_needs_cpu_quick_check(cpu); diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index 9678cc3..aa0fd72 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -69,10 +69,10 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) rdp->passed_quiesc, rdp->passed_quiesc_completed, rdp->qs_pending); #ifdef CONFIG_NO_HZ - seq_printf(m, " dt=%d/%d/%d df=%lu", - atomic_read(&rdp->dynticks->dynticks), + seq_printf(m, " dt=%d/%d dn=%d df=%lu", + rdp->dynticks->dynticks, rdp->dynticks->dynticks_nesting, - rdp->dynticks->dynticks_nmi_nesting, + rdp->dynticks->dynticks_nmi, rdp->dynticks_fqs); #endif /* #ifdef CONFIG_NO_HZ */ seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi); @@ -141,9 +141,9 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp) rdp->qs_pending); #ifdef CONFIG_NO_HZ seq_printf(m, ",%d,%d,%d,%lu", - atomic_read(&rdp->dynticks->dynticks), + rdp->dynticks->dynticks, rdp->dynticks->dynticks_nesting, - rdp->dynticks->dynticks_nmi_nesting, + rdp->dynticks->dynticks_nmi, rdp->dynticks_fqs); #endif /* #ifdef CONFIG_NO_HZ */ seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi); @@ -167,7 +167,7 @@ static int show_rcudata_csv(struct seq_file *m, void *unused) { seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pqc\",\"pq\","); #ifdef CONFIG_NO_HZ - seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\","); + seq_puts(m, "\"dt\",\"dt nesting\",\"dn\",\"df\","); #endif /* #ifdef CONFIG_NO_HZ */ seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\",\"ci\",\"co\",\"ca\"\n"); #ifdef CONFIG_TREE_PREEMPT_RCU -- cgit v1.1 From bb0a56ecc4ba2a3db1b6ea6949c309886e3447d3 Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Thu, 19 May 2011 18:51:07 -0400 Subject: [CPUFREQ] Move x86 drivers to drivers/cpufreq/ Signed-off-by: Dave Jones --- arch/x86/Kconfig | 2 +- arch/x86/kernel/cpu/Makefile | 1 - arch/x86/kernel/cpu/cpufreq/Kconfig | 266 ---- arch/x86/kernel/cpu/cpufreq/Makefile | 21 - arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 773 ----------- arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c | 444 ------ arch/x86/kernel/cpu/cpufreq/e_powersaver.c | 367 ----- arch/x86/kernel/cpu/cpufreq/elanfreq.c | 309 ----- arch/x86/kernel/cpu/cpufreq/gx-suspmod.c | 514 ------- arch/x86/kernel/cpu/cpufreq/longhaul.c | 1024 -------------- arch/x86/kernel/cpu/cpufreq/longhaul.h | 353 ----- arch/x86/kernel/cpu/cpufreq/longrun.c | 324 ----- arch/x86/kernel/cpu/cpufreq/mperf.c | 51 - arch/x86/kernel/cpu/cpufreq/mperf.h | 9 - arch/x86/kernel/cpu/cpufreq/p4-clockmod.c | 329 ----- arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c | 621 --------- arch/x86/kernel/cpu/cpufreq/powernow-k6.c | 261 ---- arch/x86/kernel/cpu/cpufreq/powernow-k7.c | 747 ---------- arch/x86/kernel/cpu/cpufreq/powernow-k7.h | 43 - arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 1607 ---------------------- arch/x86/kernel/cpu/cpufreq/powernow-k8.h | 222 --- arch/x86/kernel/cpu/cpufreq/sc520_freq.c | 192 --- arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c | 633 --------- arch/x86/kernel/cpu/cpufreq/speedstep-ich.c | 448 ------ arch/x86/kernel/cpu/cpufreq/speedstep-lib.c | 478 ------- arch/x86/kernel/cpu/cpufreq/speedstep-lib.h | 49 - arch/x86/kernel/cpu/cpufreq/speedstep-smi.c | 464 ------- drivers/cpufreq/Kconfig | 10 +- drivers/cpufreq/Kconfig.x86 | 255 ++++ drivers/cpufreq/Makefile | 26 + drivers/cpufreq/acpi-cpufreq.c | 773 +++++++++++ drivers/cpufreq/cpufreq-nforce2.c | 444 ++++++ drivers/cpufreq/e_powersaver.c | 367 +++++ drivers/cpufreq/elanfreq.c | 309 +++++ drivers/cpufreq/gx-suspmod.c | 514 +++++++ drivers/cpufreq/longhaul.c | 1024 ++++++++++++++ drivers/cpufreq/longhaul.h | 353 +++++ drivers/cpufreq/longrun.c | 324 +++++ drivers/cpufreq/mperf.c | 51 + drivers/cpufreq/mperf.h | 9 + drivers/cpufreq/p4-clockmod.c | 329 +++++ drivers/cpufreq/pcc-cpufreq.c | 621 +++++++++ drivers/cpufreq/powernow-k6.c | 261 ++++ drivers/cpufreq/powernow-k7.c | 747 ++++++++++ drivers/cpufreq/powernow-k7.h | 43 + drivers/cpufreq/powernow-k8.c | 1607 ++++++++++++++++++++++ drivers/cpufreq/powernow-k8.h | 222 +++ drivers/cpufreq/sc520_freq.c | 192 +++ drivers/cpufreq/speedstep-centrino.c | 633 +++++++++ drivers/cpufreq/speedstep-ich.c | 448 ++++++ drivers/cpufreq/speedstep-lib.c | 478 +++++++ drivers/cpufreq/speedstep-lib.h | 49 + drivers/cpufreq/speedstep-smi.c | 464 +++++++ 53 files changed, 10553 insertions(+), 10552 deletions(-) delete mode 100644 arch/x86/kernel/cpu/cpufreq/Kconfig delete mode 100644 arch/x86/kernel/cpu/cpufreq/Makefile delete mode 100644 arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c delete mode 100644 arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c delete mode 100644 arch/x86/kernel/cpu/cpufreq/e_powersaver.c delete mode 100644 arch/x86/kernel/cpu/cpufreq/elanfreq.c delete mode 100644 arch/x86/kernel/cpu/cpufreq/gx-suspmod.c delete mode 100644 arch/x86/kernel/cpu/cpufreq/longhaul.c delete mode 100644 arch/x86/kernel/cpu/cpufreq/longhaul.h delete mode 100644 arch/x86/kernel/cpu/cpufreq/longrun.c delete mode 100644 arch/x86/kernel/cpu/cpufreq/mperf.c delete mode 100644 arch/x86/kernel/cpu/cpufreq/mperf.h delete mode 100644 arch/x86/kernel/cpu/cpufreq/p4-clockmod.c delete mode 100644 arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c delete mode 100644 arch/x86/kernel/cpu/cpufreq/powernow-k6.c delete mode 100644 arch/x86/kernel/cpu/cpufreq/powernow-k7.c delete mode 100644 arch/x86/kernel/cpu/cpufreq/powernow-k7.h delete mode 100644 arch/x86/kernel/cpu/cpufreq/powernow-k8.c delete mode 100644 arch/x86/kernel/cpu/cpufreq/powernow-k8.h delete mode 100644 arch/x86/kernel/cpu/cpufreq/sc520_freq.c delete mode 100644 arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c delete mode 100644 arch/x86/kernel/cpu/cpufreq/speedstep-ich.c delete mode 100644 arch/x86/kernel/cpu/cpufreq/speedstep-lib.c delete mode 100644 arch/x86/kernel/cpu/cpufreq/speedstep-lib.h delete mode 100644 arch/x86/kernel/cpu/cpufreq/speedstep-smi.c create mode 100644 drivers/cpufreq/Kconfig.x86 create mode 100644 drivers/cpufreq/acpi-cpufreq.c create mode 100644 drivers/cpufreq/cpufreq-nforce2.c create mode 100644 drivers/cpufreq/e_powersaver.c create mode 100644 drivers/cpufreq/elanfreq.c create mode 100644 drivers/cpufreq/gx-suspmod.c create mode 100644 drivers/cpufreq/longhaul.c create mode 100644 drivers/cpufreq/longhaul.h create mode 100644 drivers/cpufreq/longrun.c create mode 100644 drivers/cpufreq/mperf.c create mode 100644 drivers/cpufreq/mperf.h create mode 100644 drivers/cpufreq/p4-clockmod.c create mode 100644 drivers/cpufreq/pcc-cpufreq.c create mode 100644 drivers/cpufreq/powernow-k6.c create mode 100644 drivers/cpufreq/powernow-k7.c create mode 100644 drivers/cpufreq/powernow-k7.h create mode 100644 drivers/cpufreq/powernow-k8.c create mode 100644 drivers/cpufreq/powernow-k8.h create mode 100644 drivers/cpufreq/sc520_freq.c create mode 100644 drivers/cpufreq/speedstep-centrino.c create mode 100644 drivers/cpufreq/speedstep-ich.c create mode 100644 drivers/cpufreq/speedstep-lib.c create mode 100644 drivers/cpufreq/speedstep-lib.h create mode 100644 drivers/cpufreq/speedstep-smi.c diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index cc6c53a..e7f94a5 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1848,7 +1848,7 @@ config APM_ALLOW_INTS endif # APM -source "arch/x86/kernel/cpu/cpufreq/Kconfig" +source "drivers/cpufreq/Kconfig" source "drivers/cpuidle/Kconfig" diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 3f0ebe4..6042981 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -30,7 +30,6 @@ obj-$(CONFIG_PERF_EVENTS) += perf_event.o obj-$(CONFIG_X86_MCE) += mcheck/ obj-$(CONFIG_MTRR) += mtrr/ -obj-$(CONFIG_CPU_FREQ) += cpufreq/ obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o diff --git a/arch/x86/kernel/cpu/cpufreq/Kconfig b/arch/x86/kernel/cpu/cpufreq/Kconfig deleted file mode 100644 index 870e6cc..0000000 --- a/arch/x86/kernel/cpu/cpufreq/Kconfig +++ /dev/null @@ -1,266 +0,0 @@ -# -# CPU Frequency scaling -# - -menu "CPU Frequency scaling" - -source "drivers/cpufreq/Kconfig" - -if CPU_FREQ - -comment "CPUFreq processor drivers" - -config X86_PCC_CPUFREQ - tristate "Processor Clocking Control interface driver" - depends on ACPI && ACPI_PROCESSOR - help - This driver adds support for the PCC interface. - - For details, take a look at: - . - - To compile this driver as a module, choose M here: the - module will be called pcc-cpufreq. - - If in doubt, say N. - -config X86_ACPI_CPUFREQ - tristate "ACPI Processor P-States driver" - select CPU_FREQ_TABLE - depends on ACPI_PROCESSOR - help - This driver adds a CPUFreq driver which utilizes the ACPI - Processor Performance States. - This driver also supports Intel Enhanced Speedstep. - - To compile this driver as a module, choose M here: the - module will be called acpi-cpufreq. - - For details, take a look at . - - If in doubt, say N. - -config ELAN_CPUFREQ - tristate "AMD Elan SC400 and SC410" - select CPU_FREQ_TABLE - depends on X86_ELAN - ---help--- - This adds the CPUFreq driver for AMD Elan SC400 and SC410 - processors. - - You need to specify the processor maximum speed as boot - parameter: elanfreq=maxspeed (in kHz) or as module - parameter "max_freq". - - For details, take a look at . - - If in doubt, say N. - -config SC520_CPUFREQ - tristate "AMD Elan SC520" - select CPU_FREQ_TABLE - depends on X86_ELAN - ---help--- - This adds the CPUFreq driver for AMD Elan SC520 processor. - - For details, take a look at . - - If in doubt, say N. - - -config X86_POWERNOW_K6 - tristate "AMD Mobile K6-2/K6-3 PowerNow!" - select CPU_FREQ_TABLE - depends on X86_32 - help - This adds the CPUFreq driver for mobile AMD K6-2+ and mobile - AMD K6-3+ processors. - - For details, take a look at . - - If in doubt, say N. - -config X86_POWERNOW_K7 - tristate "AMD Mobile Athlon/Duron PowerNow!" - select CPU_FREQ_TABLE - depends on X86_32 - help - This adds the CPUFreq driver for mobile AMD K7 mobile processors. - - For details, take a look at . - - If in doubt, say N. - -config X86_POWERNOW_K7_ACPI - bool - depends on X86_POWERNOW_K7 && ACPI_PROCESSOR - depends on !(X86_POWERNOW_K7 = y && ACPI_PROCESSOR = m) - depends on X86_32 - default y - -config X86_POWERNOW_K8 - tristate "AMD Opteron/Athlon64 PowerNow!" - select CPU_FREQ_TABLE - depends on ACPI && ACPI_PROCESSOR - help - This adds the CPUFreq driver for K8/K10 Opteron/Athlon64 processors. - - To compile this driver as a module, choose M here: the - module will be called powernow-k8. - - For details, take a look at . - -config X86_GX_SUSPMOD - tristate "Cyrix MediaGX/NatSemi Geode Suspend Modulation" - depends on X86_32 && PCI - help - This add the CPUFreq driver for NatSemi Geode processors which - support suspend modulation. - - For details, take a look at . - - If in doubt, say N. - -config X86_SPEEDSTEP_CENTRINO - tristate "Intel Enhanced SpeedStep (deprecated)" - select CPU_FREQ_TABLE - select X86_SPEEDSTEP_CENTRINO_TABLE if X86_32 - depends on X86_32 || (X86_64 && ACPI_PROCESSOR) - help - This is deprecated and this functionality is now merged into - acpi_cpufreq (X86_ACPI_CPUFREQ). Use that driver instead of - speedstep_centrino. - This adds the CPUFreq driver for Enhanced SpeedStep enabled - mobile CPUs. This means Intel Pentium M (Centrino) CPUs - or 64bit enabled Intel Xeons. - - To compile this driver as a module, choose M here: the - module will be called speedstep-centrino. - - For details, take a look at . - - If in doubt, say N. - -config X86_SPEEDSTEP_CENTRINO_TABLE - bool "Built-in tables for Banias CPUs" - depends on X86_32 && X86_SPEEDSTEP_CENTRINO - default y - help - Use built-in tables for Banias CPUs if ACPI encoding - is not available. - - If in doubt, say N. - -config X86_SPEEDSTEP_ICH - tristate "Intel Speedstep on ICH-M chipsets (ioport interface)" - select CPU_FREQ_TABLE - depends on X86_32 - help - This adds the CPUFreq driver for certain mobile Intel Pentium III - (Coppermine), all mobile Intel Pentium III-M (Tualatin) and all - mobile Intel Pentium 4 P4-M on systems which have an Intel ICH2, - ICH3 or ICH4 southbridge. - - For details, take a look at . - - If in doubt, say N. - -config X86_SPEEDSTEP_SMI - tristate "Intel SpeedStep on 440BX/ZX/MX chipsets (SMI interface)" - select CPU_FREQ_TABLE - depends on X86_32 && EXPERIMENTAL - help - This adds the CPUFreq driver for certain mobile Intel Pentium III - (Coppermine), all mobile Intel Pentium III-M (Tualatin) - on systems which have an Intel 440BX/ZX/MX southbridge. - - For details, take a look at . - - If in doubt, say N. - -config X86_P4_CLOCKMOD - tristate "Intel Pentium 4 clock modulation" - select CPU_FREQ_TABLE - help - This adds the CPUFreq driver for Intel Pentium 4 / XEON - processors. When enabled it will lower CPU temperature by skipping - clocks. - - This driver should be only used in exceptional - circumstances when very low power is needed because it causes severe - slowdowns and noticeable latencies. Normally Speedstep should be used - instead. - - To compile this driver as a module, choose M here: the - module will be called p4-clockmod. - - For details, take a look at . - - Unless you are absolutely sure say N. - -config X86_CPUFREQ_NFORCE2 - tristate "nVidia nForce2 FSB changing" - depends on X86_32 && EXPERIMENTAL - help - This adds the CPUFreq driver for FSB changing on nVidia nForce2 - platforms. - - For details, take a look at . - - If in doubt, say N. - -config X86_LONGRUN - tristate "Transmeta LongRun" - depends on X86_32 - help - This adds the CPUFreq driver for Transmeta Crusoe and Efficeon processors - which support LongRun. - - For details, take a look at . - - If in doubt, say N. - -config X86_LONGHAUL - tristate "VIA Cyrix III Longhaul" - select CPU_FREQ_TABLE - depends on X86_32 && ACPI_PROCESSOR - help - This adds the CPUFreq driver for VIA Samuel/CyrixIII, - VIA Cyrix Samuel/C3, VIA Cyrix Ezra and VIA Cyrix Ezra-T - processors. - - For details, take a look at . - - If in doubt, say N. - -config X86_E_POWERSAVER - tristate "VIA C7 Enhanced PowerSaver (DANGEROUS)" - select CPU_FREQ_TABLE - depends on X86_32 && EXPERIMENTAL - help - This adds the CPUFreq driver for VIA C7 processors. However, this driver - does not have any safeguards to prevent operating the CPU out of spec - and is thus considered dangerous. Please use the regular ACPI cpufreq - driver, enabled by CONFIG_X86_ACPI_CPUFREQ. - - If in doubt, say N. - -comment "shared options" - -config X86_SPEEDSTEP_LIB - tristate - default (X86_SPEEDSTEP_ICH || X86_SPEEDSTEP_SMI || X86_P4_CLOCKMOD) - -config X86_SPEEDSTEP_RELAXED_CAP_CHECK - bool "Relaxed speedstep capability checks" - depends on X86_32 && (X86_SPEEDSTEP_SMI || X86_SPEEDSTEP_ICH) - help - Don't perform all checks for a speedstep capable system which would - normally be done. Some ancient or strange systems, though speedstep - capable, don't always indicate that they are speedstep capable. This - option lets the probing code bypass some of those checks if the - parameter "relaxed_check=1" is passed to the module. - -endif # CPU_FREQ - -endmenu diff --git a/arch/x86/kernel/cpu/cpufreq/Makefile b/arch/x86/kernel/cpu/cpufreq/Makefile deleted file mode 100644 index bd54bf6..0000000 --- a/arch/x86/kernel/cpu/cpufreq/Makefile +++ /dev/null @@ -1,21 +0,0 @@ -# Link order matters. K8 is preferred to ACPI because of firmware bugs in early -# K8 systems. ACPI is preferred to all other hardware-specific drivers. -# speedstep-* is preferred over p4-clockmod. - -obj-$(CONFIG_X86_POWERNOW_K8) += powernow-k8.o mperf.o -obj-$(CONFIG_X86_ACPI_CPUFREQ) += acpi-cpufreq.o mperf.o -obj-$(CONFIG_X86_PCC_CPUFREQ) += pcc-cpufreq.o -obj-$(CONFIG_X86_POWERNOW_K6) += powernow-k6.o -obj-$(CONFIG_X86_POWERNOW_K7) += powernow-k7.o -obj-$(CONFIG_X86_LONGHAUL) += longhaul.o -obj-$(CONFIG_X86_E_POWERSAVER) += e_powersaver.o -obj-$(CONFIG_ELAN_CPUFREQ) += elanfreq.o -obj-$(CONFIG_SC520_CPUFREQ) += sc520_freq.o -obj-$(CONFIG_X86_LONGRUN) += longrun.o -obj-$(CONFIG_X86_GX_SUSPMOD) += gx-suspmod.o -obj-$(CONFIG_X86_SPEEDSTEP_ICH) += speedstep-ich.o -obj-$(CONFIG_X86_SPEEDSTEP_LIB) += speedstep-lib.o -obj-$(CONFIG_X86_SPEEDSTEP_SMI) += speedstep-smi.o -obj-$(CONFIG_X86_SPEEDSTEP_CENTRINO) += speedstep-centrino.o -obj-$(CONFIG_X86_P4_CLOCKMOD) += p4-clockmod.o -obj-$(CONFIG_X86_CPUFREQ_NFORCE2) += cpufreq-nforce2.o diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c deleted file mode 100644 index 4e04e12..0000000 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ /dev/null @@ -1,773 +0,0 @@ -/* - * acpi-cpufreq.c - ACPI Processor P-States Driver - * - * Copyright (C) 2001, 2002 Andy Grover - * Copyright (C) 2001, 2002 Paul Diefenbaugh - * Copyright (C) 2002 - 2004 Dominik Brodowski - * Copyright (C) 2006 Denis Sadykov - * - * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or (at - * your option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License along - * with this program; if not, write to the Free Software Foundation, Inc., - * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. - * - * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#include - -#include -#include -#include -#include "mperf.h" - -MODULE_AUTHOR("Paul Diefenbaugh, Dominik Brodowski"); -MODULE_DESCRIPTION("ACPI Processor P-States Driver"); -MODULE_LICENSE("GPL"); - -enum { - UNDEFINED_CAPABLE = 0, - SYSTEM_INTEL_MSR_CAPABLE, - SYSTEM_IO_CAPABLE, -}; - -#define INTEL_MSR_RANGE (0xffff) - -struct acpi_cpufreq_data { - struct acpi_processor_performance *acpi_data; - struct cpufreq_frequency_table *freq_table; - unsigned int resume; - unsigned int cpu_feature; -}; - -static DEFINE_PER_CPU(struct acpi_cpufreq_data *, acfreq_data); - -/* acpi_perf_data is a pointer to percpu data. */ -static struct acpi_processor_performance __percpu *acpi_perf_data; - -static struct cpufreq_driver acpi_cpufreq_driver; - -static unsigned int acpi_pstate_strict; - -static int check_est_cpu(unsigned int cpuid) -{ - struct cpuinfo_x86 *cpu = &cpu_data(cpuid); - - return cpu_has(cpu, X86_FEATURE_EST); -} - -static unsigned extract_io(u32 value, struct acpi_cpufreq_data *data) -{ - struct acpi_processor_performance *perf; - int i; - - perf = data->acpi_data; - - for (i = 0; i < perf->state_count; i++) { - if (value == perf->states[i].status) - return data->freq_table[i].frequency; - } - return 0; -} - -static unsigned extract_msr(u32 msr, struct acpi_cpufreq_data *data) -{ - int i; - struct acpi_processor_performance *perf; - - msr &= INTEL_MSR_RANGE; - perf = data->acpi_data; - - for (i = 0; data->freq_table[i].frequency != CPUFREQ_TABLE_END; i++) { - if (msr == perf->states[data->freq_table[i].index].status) - return data->freq_table[i].frequency; - } - return data->freq_table[0].frequency; -} - -static unsigned extract_freq(u32 val, struct acpi_cpufreq_data *data) -{ - switch (data->cpu_feature) { - case SYSTEM_INTEL_MSR_CAPABLE: - return extract_msr(val, data); - case SYSTEM_IO_CAPABLE: - return extract_io(val, data); - default: - return 0; - } -} - -struct msr_addr { - u32 reg; -}; - -struct io_addr { - u16 port; - u8 bit_width; -}; - -struct drv_cmd { - unsigned int type; - const struct cpumask *mask; - union { - struct msr_addr msr; - struct io_addr io; - } addr; - u32 val; -}; - -/* Called via smp_call_function_single(), on the target CPU */ -static void do_drv_read(void *_cmd) -{ - struct drv_cmd *cmd = _cmd; - u32 h; - - switch (cmd->type) { - case SYSTEM_INTEL_MSR_CAPABLE: - rdmsr(cmd->addr.msr.reg, cmd->val, h); - break; - case SYSTEM_IO_CAPABLE: - acpi_os_read_port((acpi_io_address)cmd->addr.io.port, - &cmd->val, - (u32)cmd->addr.io.bit_width); - break; - default: - break; - } -} - -/* Called via smp_call_function_many(), on the target CPUs */ -static void do_drv_write(void *_cmd) -{ - struct drv_cmd *cmd = _cmd; - u32 lo, hi; - - switch (cmd->type) { - case SYSTEM_INTEL_MSR_CAPABLE: - rdmsr(cmd->addr.msr.reg, lo, hi); - lo = (lo & ~INTEL_MSR_RANGE) | (cmd->val & INTEL_MSR_RANGE); - wrmsr(cmd->addr.msr.reg, lo, hi); - break; - case SYSTEM_IO_CAPABLE: - acpi_os_write_port((acpi_io_address)cmd->addr.io.port, - cmd->val, - (u32)cmd->addr.io.bit_width); - break; - default: - break; - } -} - -static void drv_read(struct drv_cmd *cmd) -{ - int err; - cmd->val = 0; - - err = smp_call_function_any(cmd->mask, do_drv_read, cmd, 1); - WARN_ON_ONCE(err); /* smp_call_function_any() was buggy? */ -} - -static void drv_write(struct drv_cmd *cmd) -{ - int this_cpu; - - this_cpu = get_cpu(); - if (cpumask_test_cpu(this_cpu, cmd->mask)) - do_drv_write(cmd); - smp_call_function_many(cmd->mask, do_drv_write, cmd, 1); - put_cpu(); -} - -static u32 get_cur_val(const struct cpumask *mask) -{ - struct acpi_processor_performance *perf; - struct drv_cmd cmd; - - if (unlikely(cpumask_empty(mask))) - return 0; - - switch (per_cpu(acfreq_data, cpumask_first(mask))->cpu_feature) { - case SYSTEM_INTEL_MSR_CAPABLE: - cmd.type = SYSTEM_INTEL_MSR_CAPABLE; - cmd.addr.msr.reg = MSR_IA32_PERF_STATUS; - break; - case SYSTEM_IO_CAPABLE: - cmd.type = SYSTEM_IO_CAPABLE; - perf = per_cpu(acfreq_data, cpumask_first(mask))->acpi_data; - cmd.addr.io.port = perf->control_register.address; - cmd.addr.io.bit_width = perf->control_register.bit_width; - break; - default: - return 0; - } - - cmd.mask = mask; - drv_read(&cmd); - - pr_debug("get_cur_val = %u\n", cmd.val); - - return cmd.val; -} - -static unsigned int get_cur_freq_on_cpu(unsigned int cpu) -{ - struct acpi_cpufreq_data *data = per_cpu(acfreq_data, cpu); - unsigned int freq; - unsigned int cached_freq; - - pr_debug("get_cur_freq_on_cpu (%d)\n", cpu); - - if (unlikely(data == NULL || - data->acpi_data == NULL || data->freq_table == NULL)) { - return 0; - } - - cached_freq = data->freq_table[data->acpi_data->state].frequency; - freq = extract_freq(get_cur_val(cpumask_of(cpu)), data); - if (freq != cached_freq) { - /* - * The dreaded BIOS frequency change behind our back. - * Force set the frequency on next target call. - */ - data->resume = 1; - } - - pr_debug("cur freq = %u\n", freq); - - return freq; -} - -static unsigned int check_freqs(const struct cpumask *mask, unsigned int freq, - struct acpi_cpufreq_data *data) -{ - unsigned int cur_freq; - unsigned int i; - - for (i = 0; i < 100; i++) { - cur_freq = extract_freq(get_cur_val(mask), data); - if (cur_freq == freq) - return 1; - udelay(10); - } - return 0; -} - -static int acpi_cpufreq_target(struct cpufreq_policy *policy, - unsigned int target_freq, unsigned int relation) -{ - struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu); - struct acpi_processor_performance *perf; - struct cpufreq_freqs freqs; - struct drv_cmd cmd; - unsigned int next_state = 0; /* Index into freq_table */ - unsigned int next_perf_state = 0; /* Index into perf table */ - unsigned int i; - int result = 0; - - pr_debug("acpi_cpufreq_target %d (%d)\n", target_freq, policy->cpu); - - if (unlikely(data == NULL || - data->acpi_data == NULL || data->freq_table == NULL)) { - return -ENODEV; - } - - perf = data->acpi_data; - result = cpufreq_frequency_table_target(policy, - data->freq_table, - target_freq, - relation, &next_state); - if (unlikely(result)) { - result = -ENODEV; - goto out; - } - - next_perf_state = data->freq_table[next_state].index; - if (perf->state == next_perf_state) { - if (unlikely(data->resume)) { - pr_debug("Called after resume, resetting to P%d\n", - next_perf_state); - data->resume = 0; - } else { - pr_debug("Already at target state (P%d)\n", - next_perf_state); - goto out; - } - } - - switch (data->cpu_feature) { - case SYSTEM_INTEL_MSR_CAPABLE: - cmd.type = SYSTEM_INTEL_MSR_CAPABLE; - cmd.addr.msr.reg = MSR_IA32_PERF_CTL; - cmd.val = (u32) perf->states[next_perf_state].control; - break; - case SYSTEM_IO_CAPABLE: - cmd.type = SYSTEM_IO_CAPABLE; - cmd.addr.io.port = perf->control_register.address; - cmd.addr.io.bit_width = perf->control_register.bit_width; - cmd.val = (u32) perf->states[next_perf_state].control; - break; - default: - result = -ENODEV; - goto out; - } - - /* cpufreq holds the hotplug lock, so we are safe from here on */ - if (policy->shared_type != CPUFREQ_SHARED_TYPE_ANY) - cmd.mask = policy->cpus; - else - cmd.mask = cpumask_of(policy->cpu); - - freqs.old = perf->states[perf->state].core_frequency * 1000; - freqs.new = data->freq_table[next_state].frequency; - for_each_cpu(i, policy->cpus) { - freqs.cpu = i; - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - } - - drv_write(&cmd); - - if (acpi_pstate_strict) { - if (!check_freqs(cmd.mask, freqs.new, data)) { - pr_debug("acpi_cpufreq_target failed (%d)\n", - policy->cpu); - result = -EAGAIN; - goto out; - } - } - - for_each_cpu(i, policy->cpus) { - freqs.cpu = i; - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); - } - perf->state = next_perf_state; - -out: - return result; -} - -static int acpi_cpufreq_verify(struct cpufreq_policy *policy) -{ - struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu); - - pr_debug("acpi_cpufreq_verify\n"); - - return cpufreq_frequency_table_verify(policy, data->freq_table); -} - -static unsigned long -acpi_cpufreq_guess_freq(struct acpi_cpufreq_data *data, unsigned int cpu) -{ - struct acpi_processor_performance *perf = data->acpi_data; - - if (cpu_khz) { - /* search the closest match to cpu_khz */ - unsigned int i; - unsigned long freq; - unsigned long freqn = perf->states[0].core_frequency * 1000; - - for (i = 0; i < (perf->state_count-1); i++) { - freq = freqn; - freqn = perf->states[i+1].core_frequency * 1000; - if ((2 * cpu_khz) > (freqn + freq)) { - perf->state = i; - return freq; - } - } - perf->state = perf->state_count-1; - return freqn; - } else { - /* assume CPU is at P0... */ - perf->state = 0; - return perf->states[0].core_frequency * 1000; - } -} - -static void free_acpi_perf_data(void) -{ - unsigned int i; - - /* Freeing a NULL pointer is OK, and alloc_percpu zeroes. */ - for_each_possible_cpu(i) - free_cpumask_var(per_cpu_ptr(acpi_perf_data, i) - ->shared_cpu_map); - free_percpu(acpi_perf_data); -} - -/* - * acpi_cpufreq_early_init - initialize ACPI P-States library - * - * Initialize the ACPI P-States library (drivers/acpi/processor_perflib.c) - * in order to determine correct frequency and voltage pairings. We can - * do _PDC and _PSD and find out the processor dependency for the - * actual init that will happen later... - */ -static int __init acpi_cpufreq_early_init(void) -{ - unsigned int i; - pr_debug("acpi_cpufreq_early_init\n"); - - acpi_perf_data = alloc_percpu(struct acpi_processor_performance); - if (!acpi_perf_data) { - pr_debug("Memory allocation error for acpi_perf_data.\n"); - return -ENOMEM; - } - for_each_possible_cpu(i) { - if (!zalloc_cpumask_var_node( - &per_cpu_ptr(acpi_perf_data, i)->shared_cpu_map, - GFP_KERNEL, cpu_to_node(i))) { - - /* Freeing a NULL pointer is OK: alloc_percpu zeroes. */ - free_acpi_perf_data(); - return -ENOMEM; - } - } - - /* Do initialization in ACPI core */ - acpi_processor_preregister_performance(acpi_perf_data); - return 0; -} - -#ifdef CONFIG_SMP -/* - * Some BIOSes do SW_ANY coordination internally, either set it up in hw - * or do it in BIOS firmware and won't inform about it to OS. If not - * detected, this has a side effect of making CPU run at a different speed - * than OS intended it to run at. Detect it and handle it cleanly. - */ -static int bios_with_sw_any_bug; - -static int sw_any_bug_found(const struct dmi_system_id *d) -{ - bios_with_sw_any_bug = 1; - return 0; -} - -static const struct dmi_system_id sw_any_bug_dmi_table[] = { - { - .callback = sw_any_bug_found, - .ident = "Supermicro Server X6DLP", - .matches = { - DMI_MATCH(DMI_SYS_VENDOR, "Supermicro"), - DMI_MATCH(DMI_BIOS_VERSION, "080010"), - DMI_MATCH(DMI_PRODUCT_NAME, "X6DLP"), - }, - }, - { } -}; - -static int acpi_cpufreq_blacklist(struct cpuinfo_x86 *c) -{ - /* Intel Xeon Processor 7100 Series Specification Update - * http://www.intel.com/Assets/PDF/specupdate/314554.pdf - * AL30: A Machine Check Exception (MCE) Occurring during an - * Enhanced Intel SpeedStep Technology Ratio Change May Cause - * Both Processor Cores to Lock Up. */ - if (c->x86_vendor == X86_VENDOR_INTEL) { - if ((c->x86 == 15) && - (c->x86_model == 6) && - (c->x86_mask == 8)) { - printk(KERN_INFO "acpi-cpufreq: Intel(R) " - "Xeon(R) 7100 Errata AL30, processors may " - "lock up on frequency changes: disabling " - "acpi-cpufreq.\n"); - return -ENODEV; - } - } - return 0; -} -#endif - -static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) -{ - unsigned int i; - unsigned int valid_states = 0; - unsigned int cpu = policy->cpu; - struct acpi_cpufreq_data *data; - unsigned int result = 0; - struct cpuinfo_x86 *c = &cpu_data(policy->cpu); - struct acpi_processor_performance *perf; -#ifdef CONFIG_SMP - static int blacklisted; -#endif - - pr_debug("acpi_cpufreq_cpu_init\n"); - -#ifdef CONFIG_SMP - if (blacklisted) - return blacklisted; - blacklisted = acpi_cpufreq_blacklist(c); - if (blacklisted) - return blacklisted; -#endif - - data = kzalloc(sizeof(struct acpi_cpufreq_data), GFP_KERNEL); - if (!data) - return -ENOMEM; - - data->acpi_data = per_cpu_ptr(acpi_perf_data, cpu); - per_cpu(acfreq_data, cpu) = data; - - if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) - acpi_cpufreq_driver.flags |= CPUFREQ_CONST_LOOPS; - - result = acpi_processor_register_performance(data->acpi_data, cpu); - if (result) - goto err_free; - - perf = data->acpi_data; - policy->shared_type = perf->shared_type; - - /* - * Will let policy->cpus know about dependency only when software - * coordination is required. - */ - if (policy->shared_type == CPUFREQ_SHARED_TYPE_ALL || - policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) { - cpumask_copy(policy->cpus, perf->shared_cpu_map); - } - cpumask_copy(policy->related_cpus, perf->shared_cpu_map); - -#ifdef CONFIG_SMP - dmi_check_system(sw_any_bug_dmi_table); - if (bios_with_sw_any_bug && cpumask_weight(policy->cpus) == 1) { - policy->shared_type = CPUFREQ_SHARED_TYPE_ALL; - cpumask_copy(policy->cpus, cpu_core_mask(cpu)); - } -#endif - - /* capability check */ - if (perf->state_count <= 1) { - pr_debug("No P-States\n"); - result = -ENODEV; - goto err_unreg; - } - - if (perf->control_register.space_id != perf->status_register.space_id) { - result = -ENODEV; - goto err_unreg; - } - - switch (perf->control_register.space_id) { - case ACPI_ADR_SPACE_SYSTEM_IO: - pr_debug("SYSTEM IO addr space\n"); - data->cpu_feature = SYSTEM_IO_CAPABLE; - break; - case ACPI_ADR_SPACE_FIXED_HARDWARE: - pr_debug("HARDWARE addr space\n"); - if (!check_est_cpu(cpu)) { - result = -ENODEV; - goto err_unreg; - } - data->cpu_feature = SYSTEM_INTEL_MSR_CAPABLE; - break; - default: - pr_debug("Unknown addr space %d\n", - (u32) (perf->control_register.space_id)); - result = -ENODEV; - goto err_unreg; - } - - data->freq_table = kmalloc(sizeof(struct cpufreq_frequency_table) * - (perf->state_count+1), GFP_KERNEL); - if (!data->freq_table) { - result = -ENOMEM; - goto err_unreg; - } - - /* detect transition latency */ - policy->cpuinfo.transition_latency = 0; - for (i = 0; i < perf->state_count; i++) { - if ((perf->states[i].transition_latency * 1000) > - policy->cpuinfo.transition_latency) - policy->cpuinfo.transition_latency = - perf->states[i].transition_latency * 1000; - } - - /* Check for high latency (>20uS) from buggy BIOSes, like on T42 */ - if (perf->control_register.space_id == ACPI_ADR_SPACE_FIXED_HARDWARE && - policy->cpuinfo.transition_latency > 20 * 1000) { - policy->cpuinfo.transition_latency = 20 * 1000; - printk_once(KERN_INFO - "P-state transition latency capped at 20 uS\n"); - } - - /* table init */ - for (i = 0; i < perf->state_count; i++) { - if (i > 0 && perf->states[i].core_frequency >= - data->freq_table[valid_states-1].frequency / 1000) - continue; - - data->freq_table[valid_states].index = i; - data->freq_table[valid_states].frequency = - perf->states[i].core_frequency * 1000; - valid_states++; - } - data->freq_table[valid_states].frequency = CPUFREQ_TABLE_END; - perf->state = 0; - - result = cpufreq_frequency_table_cpuinfo(policy, data->freq_table); - if (result) - goto err_freqfree; - - if (perf->states[0].core_frequency * 1000 != policy->cpuinfo.max_freq) - printk(KERN_WARNING FW_WARN "P-state 0 is not max freq\n"); - - switch (perf->control_register.space_id) { - case ACPI_ADR_SPACE_SYSTEM_IO: - /* Current speed is unknown and not detectable by IO port */ - policy->cur = acpi_cpufreq_guess_freq(data, policy->cpu); - break; - case ACPI_ADR_SPACE_FIXED_HARDWARE: - acpi_cpufreq_driver.get = get_cur_freq_on_cpu; - policy->cur = get_cur_freq_on_cpu(cpu); - break; - default: - break; - } - - /* notify BIOS that we exist */ - acpi_processor_notify_smm(THIS_MODULE); - - /* Check for APERF/MPERF support in hardware */ - if (cpu_has(c, X86_FEATURE_APERFMPERF)) - acpi_cpufreq_driver.getavg = cpufreq_get_measured_perf; - - pr_debug("CPU%u - ACPI performance management activated.\n", cpu); - for (i = 0; i < perf->state_count; i++) - pr_debug(" %cP%d: %d MHz, %d mW, %d uS\n", - (i == perf->state ? '*' : ' '), i, - (u32) perf->states[i].core_frequency, - (u32) perf->states[i].power, - (u32) perf->states[i].transition_latency); - - cpufreq_frequency_table_get_attr(data->freq_table, policy->cpu); - - /* - * the first call to ->target() should result in us actually - * writing something to the appropriate registers. - */ - data->resume = 1; - - return result; - -err_freqfree: - kfree(data->freq_table); -err_unreg: - acpi_processor_unregister_performance(perf, cpu); -err_free: - kfree(data); - per_cpu(acfreq_data, cpu) = NULL; - - return result; -} - -static int acpi_cpufreq_cpu_exit(struct cpufreq_policy *policy) -{ - struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu); - - pr_debug("acpi_cpufreq_cpu_exit\n"); - - if (data) { - cpufreq_frequency_table_put_attr(policy->cpu); - per_cpu(acfreq_data, policy->cpu) = NULL; - acpi_processor_unregister_performance(data->acpi_data, - policy->cpu); - kfree(data->freq_table); - kfree(data); - } - - return 0; -} - -static int acpi_cpufreq_resume(struct cpufreq_policy *policy) -{ - struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu); - - pr_debug("acpi_cpufreq_resume\n"); - - data->resume = 1; - - return 0; -} - -static struct freq_attr *acpi_cpufreq_attr[] = { - &cpufreq_freq_attr_scaling_available_freqs, - NULL, -}; - -static struct cpufreq_driver acpi_cpufreq_driver = { - .verify = acpi_cpufreq_verify, - .target = acpi_cpufreq_target, - .bios_limit = acpi_processor_get_bios_limit, - .init = acpi_cpufreq_cpu_init, - .exit = acpi_cpufreq_cpu_exit, - .resume = acpi_cpufreq_resume, - .name = "acpi-cpufreq", - .owner = THIS_MODULE, - .attr = acpi_cpufreq_attr, -}; - -static int __init acpi_cpufreq_init(void) -{ - int ret; - - if (acpi_disabled) - return 0; - - pr_debug("acpi_cpufreq_init\n"); - - ret = acpi_cpufreq_early_init(); - if (ret) - return ret; - - ret = cpufreq_register_driver(&acpi_cpufreq_driver); - if (ret) - free_acpi_perf_data(); - - return ret; -} - -static void __exit acpi_cpufreq_exit(void) -{ - pr_debug("acpi_cpufreq_exit\n"); - - cpufreq_unregister_driver(&acpi_cpufreq_driver); - - free_percpu(acpi_perf_data); -} - -module_param(acpi_pstate_strict, uint, 0644); -MODULE_PARM_DESC(acpi_pstate_strict, - "value 0 or non-zero. non-zero -> strict ACPI checks are " - "performed during frequency changes."); - -late_initcall(acpi_cpufreq_init); -module_exit(acpi_cpufreq_exit); - -MODULE_ALIAS("acpi"); diff --git a/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c b/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c deleted file mode 100644 index 7bac808..0000000 --- a/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c +++ /dev/null @@ -1,444 +0,0 @@ -/* - * (C) 2004-2006 Sebastian Witt - * - * Licensed under the terms of the GNU GPL License version 2. - * Based upon reverse engineered information - * - * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous* - */ - -#include -#include -#include -#include -#include -#include -#include - -#define NFORCE2_XTAL 25 -#define NFORCE2_BOOTFSB 0x48 -#define NFORCE2_PLLENABLE 0xa8 -#define NFORCE2_PLLREG 0xa4 -#define NFORCE2_PLLADR 0xa0 -#define NFORCE2_PLL(mul, div) (0x100000 | (mul << 8) | div) - -#define NFORCE2_MIN_FSB 50 -#define NFORCE2_SAFE_DISTANCE 50 - -/* Delay in ms between FSB changes */ -/* #define NFORCE2_DELAY 10 */ - -/* - * nforce2_chipset: - * FSB is changed using the chipset - */ -static struct pci_dev *nforce2_dev; - -/* fid: - * multiplier * 10 - */ -static int fid; - -/* min_fsb, max_fsb: - * minimum and maximum FSB (= FSB at boot time) - */ -static int min_fsb; -static int max_fsb; - -MODULE_AUTHOR("Sebastian Witt "); -MODULE_DESCRIPTION("nForce2 FSB changing cpufreq driver"); -MODULE_LICENSE("GPL"); - -module_param(fid, int, 0444); -module_param(min_fsb, int, 0444); - -MODULE_PARM_DESC(fid, "CPU multiplier to use (11.5 = 115)"); -MODULE_PARM_DESC(min_fsb, - "Minimum FSB to use, if not defined: current FSB - 50"); - -#define PFX "cpufreq-nforce2: " - -/** - * nforce2_calc_fsb - calculate FSB - * @pll: PLL value - * - * Calculates FSB from PLL value - */ -static int nforce2_calc_fsb(int pll) -{ - unsigned char mul, div; - - mul = (pll >> 8) & 0xff; - div = pll & 0xff; - - if (div > 0) - return NFORCE2_XTAL * mul / div; - - return 0; -} - -/** - * nforce2_calc_pll - calculate PLL value - * @fsb: FSB - * - * Calculate PLL value for given FSB - */ -static int nforce2_calc_pll(unsigned int fsb) -{ - unsigned char xmul, xdiv; - unsigned char mul = 0, div = 0; - int tried = 0; - - /* Try to calculate multiplier and divider up to 4 times */ - while (((mul == 0) || (div == 0)) && (tried <= 3)) { - for (xdiv = 2; xdiv <= 0x80; xdiv++) - for (xmul = 1; xmul <= 0xfe; xmul++) - if (nforce2_calc_fsb(NFORCE2_PLL(xmul, xdiv)) == - fsb + tried) { - mul = xmul; - div = xdiv; - } - tried++; - } - - if ((mul == 0) || (div == 0)) - return -1; - - return NFORCE2_PLL(mul, div); -} - -/** - * nforce2_write_pll - write PLL value to chipset - * @pll: PLL value - * - * Writes new FSB PLL value to chipset - */ -static void nforce2_write_pll(int pll) -{ - int temp; - - /* Set the pll addr. to 0x00 */ - pci_write_config_dword(nforce2_dev, NFORCE2_PLLADR, 0); - - /* Now write the value in all 64 registers */ - for (temp = 0; temp <= 0x3f; temp++) - pci_write_config_dword(nforce2_dev, NFORCE2_PLLREG, pll); - - return; -} - -/** - * nforce2_fsb_read - Read FSB - * - * Read FSB from chipset - * If bootfsb != 0, return FSB at boot-time - */ -static unsigned int nforce2_fsb_read(int bootfsb) -{ - struct pci_dev *nforce2_sub5; - u32 fsb, temp = 0; - - /* Get chipset boot FSB from subdevice 5 (FSB at boot-time) */ - nforce2_sub5 = pci_get_subsys(PCI_VENDOR_ID_NVIDIA, 0x01EF, - PCI_ANY_ID, PCI_ANY_ID, NULL); - if (!nforce2_sub5) - return 0; - - pci_read_config_dword(nforce2_sub5, NFORCE2_BOOTFSB, &fsb); - fsb /= 1000000; - - /* Check if PLL register is already set */ - pci_read_config_byte(nforce2_dev, NFORCE2_PLLENABLE, (u8 *)&temp); - - if (bootfsb || !temp) - return fsb; - - /* Use PLL register FSB value */ - pci_read_config_dword(nforce2_dev, NFORCE2_PLLREG, &temp); - fsb = nforce2_calc_fsb(temp); - - return fsb; -} - -/** - * nforce2_set_fsb - set new FSB - * @fsb: New FSB - * - * Sets new FSB - */ -static int nforce2_set_fsb(unsigned int fsb) -{ - u32 temp = 0; - unsigned int tfsb; - int diff; - int pll = 0; - - if ((fsb > max_fsb) || (fsb < NFORCE2_MIN_FSB)) { - printk(KERN_ERR PFX "FSB %d is out of range!\n", fsb); - return -EINVAL; - } - - tfsb = nforce2_fsb_read(0); - if (!tfsb) { - printk(KERN_ERR PFX "Error while reading the FSB\n"); - return -EINVAL; - } - - /* First write? Then set actual value */ - pci_read_config_byte(nforce2_dev, NFORCE2_PLLENABLE, (u8 *)&temp); - if (!temp) { - pll = nforce2_calc_pll(tfsb); - - if (pll < 0) - return -EINVAL; - - nforce2_write_pll(pll); - } - - /* Enable write access */ - temp = 0x01; - pci_write_config_byte(nforce2_dev, NFORCE2_PLLENABLE, (u8)temp); - - diff = tfsb - fsb; - - if (!diff) - return 0; - - while ((tfsb != fsb) && (tfsb <= max_fsb) && (tfsb >= min_fsb)) { - if (diff < 0) - tfsb++; - else - tfsb--; - - /* Calculate the PLL reg. value */ - pll = nforce2_calc_pll(tfsb); - if (pll == -1) - return -EINVAL; - - nforce2_write_pll(pll); -#ifdef NFORCE2_DELAY - mdelay(NFORCE2_DELAY); -#endif - } - - temp = 0x40; - pci_write_config_byte(nforce2_dev, NFORCE2_PLLADR, (u8)temp); - - return 0; -} - -/** - * nforce2_get - get the CPU frequency - * @cpu: CPU number - * - * Returns the CPU frequency - */ -static unsigned int nforce2_get(unsigned int cpu) -{ - if (cpu) - return 0; - return nforce2_fsb_read(0) * fid * 100; -} - -/** - * nforce2_target - set a new CPUFreq policy - * @policy: new policy - * @target_freq: the target frequency - * @relation: how that frequency relates to achieved frequency - * (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H) - * - * Sets a new CPUFreq policy. - */ -static int nforce2_target(struct cpufreq_policy *policy, - unsigned int target_freq, unsigned int relation) -{ -/* unsigned long flags; */ - struct cpufreq_freqs freqs; - unsigned int target_fsb; - - if ((target_freq > policy->max) || (target_freq < policy->min)) - return -EINVAL; - - target_fsb = target_freq / (fid * 100); - - freqs.old = nforce2_get(policy->cpu); - freqs.new = target_fsb * fid * 100; - freqs.cpu = 0; /* Only one CPU on nForce2 platforms */ - - if (freqs.old == freqs.new) - return 0; - - pr_debug("Old CPU frequency %d kHz, new %d kHz\n", - freqs.old, freqs.new); - - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - - /* Disable IRQs */ - /* local_irq_save(flags); */ - - if (nforce2_set_fsb(target_fsb) < 0) - printk(KERN_ERR PFX "Changing FSB to %d failed\n", - target_fsb); - else - pr_debug("Changed FSB successfully to %d\n", - target_fsb); - - /* Enable IRQs */ - /* local_irq_restore(flags); */ - - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); - - return 0; -} - -/** - * nforce2_verify - verifies a new CPUFreq policy - * @policy: new policy - */ -static int nforce2_verify(struct cpufreq_policy *policy) -{ - unsigned int fsb_pol_max; - - fsb_pol_max = policy->max / (fid * 100); - - if (policy->min < (fsb_pol_max * fid * 100)) - policy->max = (fsb_pol_max + 1) * fid * 100; - - cpufreq_verify_within_limits(policy, - policy->cpuinfo.min_freq, - policy->cpuinfo.max_freq); - return 0; -} - -static int nforce2_cpu_init(struct cpufreq_policy *policy) -{ - unsigned int fsb; - unsigned int rfid; - - /* capability check */ - if (policy->cpu != 0) - return -ENODEV; - - /* Get current FSB */ - fsb = nforce2_fsb_read(0); - - if (!fsb) - return -EIO; - - /* FIX: Get FID from CPU */ - if (!fid) { - if (!cpu_khz) { - printk(KERN_WARNING PFX - "cpu_khz not set, can't calculate multiplier!\n"); - return -ENODEV; - } - - fid = cpu_khz / (fsb * 100); - rfid = fid % 5; - - if (rfid) { - if (rfid > 2) - fid += 5 - rfid; - else - fid -= rfid; - } - } - - printk(KERN_INFO PFX "FSB currently at %i MHz, FID %d.%d\n", fsb, - fid / 10, fid % 10); - - /* Set maximum FSB to FSB at boot time */ - max_fsb = nforce2_fsb_read(1); - - if (!max_fsb) - return -EIO; - - if (!min_fsb) - min_fsb = max_fsb - NFORCE2_SAFE_DISTANCE; - - if (min_fsb < NFORCE2_MIN_FSB) - min_fsb = NFORCE2_MIN_FSB; - - /* cpuinfo and default policy values */ - policy->cpuinfo.min_freq = min_fsb * fid * 100; - policy->cpuinfo.max_freq = max_fsb * fid * 100; - policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL; - policy->cur = nforce2_get(policy->cpu); - policy->min = policy->cpuinfo.min_freq; - policy->max = policy->cpuinfo.max_freq; - - return 0; -} - -static int nforce2_cpu_exit(struct cpufreq_policy *policy) -{ - return 0; -} - -static struct cpufreq_driver nforce2_driver = { - .name = "nforce2", - .verify = nforce2_verify, - .target = nforce2_target, - .get = nforce2_get, - .init = nforce2_cpu_init, - .exit = nforce2_cpu_exit, - .owner = THIS_MODULE, -}; - -/** - * nforce2_detect_chipset - detect the Southbridge which contains FSB PLL logic - * - * Detects nForce2 A2 and C1 stepping - * - */ -static int nforce2_detect_chipset(void) -{ - nforce2_dev = pci_get_subsys(PCI_VENDOR_ID_NVIDIA, - PCI_DEVICE_ID_NVIDIA_NFORCE2, - PCI_ANY_ID, PCI_ANY_ID, NULL); - - if (nforce2_dev == NULL) - return -ENODEV; - - printk(KERN_INFO PFX "Detected nForce2 chipset revision %X\n", - nforce2_dev->revision); - printk(KERN_INFO PFX - "FSB changing is maybe unstable and can lead to " - "crashes and data loss.\n"); - - return 0; -} - -/** - * nforce2_init - initializes the nForce2 CPUFreq driver - * - * Initializes the nForce2 FSB support. Returns -ENODEV on unsupported - * devices, -EINVAL on problems during initiatization, and zero on - * success. - */ -static int __init nforce2_init(void) -{ - /* TODO: do we need to detect the processor? */ - - /* detect chipset */ - if (nforce2_detect_chipset()) { - printk(KERN_INFO PFX "No nForce2 chipset.\n"); - return -ENODEV; - } - - return cpufreq_register_driver(&nforce2_driver); -} - -/** - * nforce2_exit - unregisters cpufreq module - * - * Unregisters nForce2 FSB change support. - */ -static void __exit nforce2_exit(void) -{ - cpufreq_unregister_driver(&nforce2_driver); -} - -module_init(nforce2_init); -module_exit(nforce2_exit); - diff --git a/arch/x86/kernel/cpu/cpufreq/e_powersaver.c b/arch/x86/kernel/cpu/cpufreq/e_powersaver.c deleted file mode 100644 index 35a257d..0000000 --- a/arch/x86/kernel/cpu/cpufreq/e_powersaver.c +++ /dev/null @@ -1,367 +0,0 @@ -/* - * Based on documentation provided by Dave Jones. Thanks! - * - * Licensed under the terms of the GNU GPL License version 2. - * - * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous* - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#define EPS_BRAND_C7M 0 -#define EPS_BRAND_C7 1 -#define EPS_BRAND_EDEN 2 -#define EPS_BRAND_C3 3 -#define EPS_BRAND_C7D 4 - -struct eps_cpu_data { - u32 fsb; - struct cpufreq_frequency_table freq_table[]; -}; - -static struct eps_cpu_data *eps_cpu[NR_CPUS]; - - -static unsigned int eps_get(unsigned int cpu) -{ - struct eps_cpu_data *centaur; - u32 lo, hi; - - if (cpu) - return 0; - centaur = eps_cpu[cpu]; - if (centaur == NULL) - return 0; - - /* Return current frequency */ - rdmsr(MSR_IA32_PERF_STATUS, lo, hi); - return centaur->fsb * ((lo >> 8) & 0xff); -} - -static int eps_set_state(struct eps_cpu_data *centaur, - unsigned int cpu, - u32 dest_state) -{ - struct cpufreq_freqs freqs; - u32 lo, hi; - int err = 0; - int i; - - freqs.old = eps_get(cpu); - freqs.new = centaur->fsb * ((dest_state >> 8) & 0xff); - freqs.cpu = cpu; - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - - /* Wait while CPU is busy */ - rdmsr(MSR_IA32_PERF_STATUS, lo, hi); - i = 0; - while (lo & ((1 << 16) | (1 << 17))) { - udelay(16); - rdmsr(MSR_IA32_PERF_STATUS, lo, hi); - i++; - if (unlikely(i > 64)) { - err = -ENODEV; - goto postchange; - } - } - /* Set new multiplier and voltage */ - wrmsr(MSR_IA32_PERF_CTL, dest_state & 0xffff, 0); - /* Wait until transition end */ - i = 0; - do { - udelay(16); - rdmsr(MSR_IA32_PERF_STATUS, lo, hi); - i++; - if (unlikely(i > 64)) { - err = -ENODEV; - goto postchange; - } - } while (lo & ((1 << 16) | (1 << 17))); - - /* Return current frequency */ -postchange: - rdmsr(MSR_IA32_PERF_STATUS, lo, hi); - freqs.new = centaur->fsb * ((lo >> 8) & 0xff); - -#ifdef DEBUG - { - u8 current_multiplier, current_voltage; - - /* Print voltage and multiplier */ - rdmsr(MSR_IA32_PERF_STATUS, lo, hi); - current_voltage = lo & 0xff; - printk(KERN_INFO "eps: Current voltage = %dmV\n", - current_voltage * 16 + 700); - current_multiplier = (lo >> 8) & 0xff; - printk(KERN_INFO "eps: Current multiplier = %d\n", - current_multiplier); - } -#endif - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); - return err; -} - -static int eps_target(struct cpufreq_policy *policy, - unsigned int target_freq, - unsigned int relation) -{ - struct eps_cpu_data *centaur; - unsigned int newstate = 0; - unsigned int cpu = policy->cpu; - unsigned int dest_state; - int ret; - - if (unlikely(eps_cpu[cpu] == NULL)) - return -ENODEV; - centaur = eps_cpu[cpu]; - - if (unlikely(cpufreq_frequency_table_target(policy, - &eps_cpu[cpu]->freq_table[0], - target_freq, - relation, - &newstate))) { - return -EINVAL; - } - - /* Make frequency transition */ - dest_state = centaur->freq_table[newstate].index & 0xffff; - ret = eps_set_state(centaur, cpu, dest_state); - if (ret) - printk(KERN_ERR "eps: Timeout!\n"); - return ret; -} - -static int eps_verify(struct cpufreq_policy *policy) -{ - return cpufreq_frequency_table_verify(policy, - &eps_cpu[policy->cpu]->freq_table[0]); -} - -static int eps_cpu_init(struct cpufreq_policy *policy) -{ - unsigned int i; - u32 lo, hi; - u64 val; - u8 current_multiplier, current_voltage; - u8 max_multiplier, max_voltage; - u8 min_multiplier, min_voltage; - u8 brand = 0; - u32 fsb; - struct eps_cpu_data *centaur; - struct cpuinfo_x86 *c = &cpu_data(0); - struct cpufreq_frequency_table *f_table; - int k, step, voltage; - int ret; - int states; - - if (policy->cpu != 0) - return -ENODEV; - - /* Check brand */ - printk(KERN_INFO "eps: Detected VIA "); - - switch (c->x86_model) { - case 10: - rdmsr(0x1153, lo, hi); - brand = (((lo >> 2) ^ lo) >> 18) & 3; - printk(KERN_CONT "Model A "); - break; - case 13: - rdmsr(0x1154, lo, hi); - brand = (((lo >> 4) ^ (lo >> 2))) & 0x000000ff; - printk(KERN_CONT "Model D "); - break; - } - - switch (brand) { - case EPS_BRAND_C7M: - printk(KERN_CONT "C7-M\n"); - break; - case EPS_BRAND_C7: - printk(KERN_CONT "C7\n"); - break; - case EPS_BRAND_EDEN: - printk(KERN_CONT "Eden\n"); - break; - case EPS_BRAND_C7D: - printk(KERN_CONT "C7-D\n"); - break; - case EPS_BRAND_C3: - printk(KERN_CONT "C3\n"); - return -ENODEV; - break; - } - /* Enable Enhanced PowerSaver */ - rdmsrl(MSR_IA32_MISC_ENABLE, val); - if (!(val & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) { - val |= MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP; - wrmsrl(MSR_IA32_MISC_ENABLE, val); - /* Can be locked at 0 */ - rdmsrl(MSR_IA32_MISC_ENABLE, val); - if (!(val & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) { - printk(KERN_INFO "eps: Can't enable Enhanced PowerSaver\n"); - return -ENODEV; - } - } - - /* Print voltage and multiplier */ - rdmsr(MSR_IA32_PERF_STATUS, lo, hi); - current_voltage = lo & 0xff; - printk(KERN_INFO "eps: Current voltage = %dmV\n", - current_voltage * 16 + 700); - current_multiplier = (lo >> 8) & 0xff; - printk(KERN_INFO "eps: Current multiplier = %d\n", current_multiplier); - - /* Print limits */ - max_voltage = hi & 0xff; - printk(KERN_INFO "eps: Highest voltage = %dmV\n", - max_voltage * 16 + 700); - max_multiplier = (hi >> 8) & 0xff; - printk(KERN_INFO "eps: Highest multiplier = %d\n", max_multiplier); - min_voltage = (hi >> 16) & 0xff; - printk(KERN_INFO "eps: Lowest voltage = %dmV\n", - min_voltage * 16 + 700); - min_multiplier = (hi >> 24) & 0xff; - printk(KERN_INFO "eps: Lowest multiplier = %d\n", min_multiplier); - - /* Sanity checks */ - if (current_multiplier == 0 || max_multiplier == 0 - || min_multiplier == 0) - return -EINVAL; - if (current_multiplier > max_multiplier - || max_multiplier <= min_multiplier) - return -EINVAL; - if (current_voltage > 0x1f || max_voltage > 0x1f) - return -EINVAL; - if (max_voltage < min_voltage) - return -EINVAL; - - /* Calc FSB speed */ - fsb = cpu_khz / current_multiplier; - /* Calc number of p-states supported */ - if (brand == EPS_BRAND_C7M) - states = max_multiplier - min_multiplier + 1; - else - states = 2; - - /* Allocate private data and frequency table for current cpu */ - centaur = kzalloc(sizeof(struct eps_cpu_data) - + (states + 1) * sizeof(struct cpufreq_frequency_table), - GFP_KERNEL); - if (!centaur) - return -ENOMEM; - eps_cpu[0] = centaur; - - /* Copy basic values */ - centaur->fsb = fsb; - - /* Fill frequency and MSR value table */ - f_table = ¢aur->freq_table[0]; - if (brand != EPS_BRAND_C7M) { - f_table[0].frequency = fsb * min_multiplier; - f_table[0].index = (min_multiplier << 8) | min_voltage; - f_table[1].frequency = fsb * max_multiplier; - f_table[1].index = (max_multiplier << 8) | max_voltage; - f_table[2].frequency = CPUFREQ_TABLE_END; - } else { - k = 0; - step = ((max_voltage - min_voltage) * 256) - / (max_multiplier - min_multiplier); - for (i = min_multiplier; i <= max_multiplier; i++) { - voltage = (k * step) / 256 + min_voltage; - f_table[k].frequency = fsb * i; - f_table[k].index = (i << 8) | voltage; - k++; - } - f_table[k].frequency = CPUFREQ_TABLE_END; - } - - policy->cpuinfo.transition_latency = 140000; /* 844mV -> 700mV in ns */ - policy->cur = fsb * current_multiplier; - - ret = cpufreq_frequency_table_cpuinfo(policy, ¢aur->freq_table[0]); - if (ret) { - kfree(centaur); - return ret; - } - - cpufreq_frequency_table_get_attr(¢aur->freq_table[0], policy->cpu); - return 0; -} - -static int eps_cpu_exit(struct cpufreq_policy *policy) -{ - unsigned int cpu = policy->cpu; - struct eps_cpu_data *centaur; - u32 lo, hi; - - if (eps_cpu[cpu] == NULL) - return -ENODEV; - centaur = eps_cpu[cpu]; - - /* Get max frequency */ - rdmsr(MSR_IA32_PERF_STATUS, lo, hi); - /* Set max frequency */ - eps_set_state(centaur, cpu, hi & 0xffff); - /* Bye */ - cpufreq_frequency_table_put_attr(policy->cpu); - kfree(eps_cpu[cpu]); - eps_cpu[cpu] = NULL; - return 0; -} - -static struct freq_attr *eps_attr[] = { - &cpufreq_freq_attr_scaling_available_freqs, - NULL, -}; - -static struct cpufreq_driver eps_driver = { - .verify = eps_verify, - .target = eps_target, - .init = eps_cpu_init, - .exit = eps_cpu_exit, - .get = eps_get, - .name = "e_powersaver", - .owner = THIS_MODULE, - .attr = eps_attr, -}; - -static int __init eps_init(void) -{ - struct cpuinfo_x86 *c = &cpu_data(0); - - /* This driver will work only on Centaur C7 processors with - * Enhanced SpeedStep/PowerSaver registers */ - if (c->x86_vendor != X86_VENDOR_CENTAUR - || c->x86 != 6 || c->x86_model < 10) - return -ENODEV; - if (!cpu_has(c, X86_FEATURE_EST)) - return -ENODEV; - - if (cpufreq_register_driver(&eps_driver)) - return -EINVAL; - return 0; -} - -static void __exit eps_exit(void) -{ - cpufreq_unregister_driver(&eps_driver); -} - -MODULE_AUTHOR("Rafal Bilski "); -MODULE_DESCRIPTION("Enhanced PowerSaver driver for VIA C7 CPU's."); -MODULE_LICENSE("GPL"); - -module_init(eps_init); -module_exit(eps_exit); diff --git a/arch/x86/kernel/cpu/cpufreq/elanfreq.c b/arch/x86/kernel/cpu/cpufreq/elanfreq.c deleted file mode 100644 index c587db4..0000000 --- a/arch/x86/kernel/cpu/cpufreq/elanfreq.c +++ /dev/null @@ -1,309 +0,0 @@ -/* - * elanfreq: cpufreq driver for the AMD ELAN family - * - * (c) Copyright 2002 Robert Schwebel - * - * Parts of this code are (c) Sven Geggus - * - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * 2002-02-13: - initial revision for 2.4.18-pre9 by Robert Schwebel - * - */ - -#include -#include -#include - -#include -#include - -#include -#include -#include - -#define REG_CSCIR 0x22 /* Chip Setup and Control Index Register */ -#define REG_CSCDR 0x23 /* Chip Setup and Control Data Register */ - -/* Module parameter */ -static int max_freq; - -struct s_elan_multiplier { - int clock; /* frequency in kHz */ - int val40h; /* PMU Force Mode register */ - int val80h; /* CPU Clock Speed Register */ -}; - -/* - * It is important that the frequencies - * are listed in ascending order here! - */ -static struct s_elan_multiplier elan_multiplier[] = { - {1000, 0x02, 0x18}, - {2000, 0x02, 0x10}, - {4000, 0x02, 0x08}, - {8000, 0x00, 0x00}, - {16000, 0x00, 0x02}, - {33000, 0x00, 0x04}, - {66000, 0x01, 0x04}, - {99000, 0x01, 0x05} -}; - -static struct cpufreq_frequency_table elanfreq_table[] = { - {0, 1000}, - {1, 2000}, - {2, 4000}, - {3, 8000}, - {4, 16000}, - {5, 33000}, - {6, 66000}, - {7, 99000}, - {0, CPUFREQ_TABLE_END}, -}; - - -/** - * elanfreq_get_cpu_frequency: determine current cpu speed - * - * Finds out at which frequency the CPU of the Elan SOC runs - * at the moment. Frequencies from 1 to 33 MHz are generated - * the normal way, 66 and 99 MHz are called "Hyperspeed Mode" - * and have the rest of the chip running with 33 MHz. - */ - -static unsigned int elanfreq_get_cpu_frequency(unsigned int cpu) -{ - u8 clockspeed_reg; /* Clock Speed Register */ - - local_irq_disable(); - outb_p(0x80, REG_CSCIR); - clockspeed_reg = inb_p(REG_CSCDR); - local_irq_enable(); - - if ((clockspeed_reg & 0xE0) == 0xE0) - return 0; - - /* Are we in CPU clock multiplied mode (66/99 MHz)? */ - if ((clockspeed_reg & 0xE0) == 0xC0) { - if ((clockspeed_reg & 0x01) == 0) - return 66000; - else - return 99000; - } - - /* 33 MHz is not 32 MHz... */ - if ((clockspeed_reg & 0xE0) == 0xA0) - return 33000; - - return (1<<((clockspeed_reg & 0xE0) >> 5)) * 1000; -} - - -/** - * elanfreq_set_cpu_frequency: Change the CPU core frequency - * @cpu: cpu number - * @freq: frequency in kHz - * - * This function takes a frequency value and changes the CPU frequency - * according to this. Note that the frequency has to be checked by - * elanfreq_validatespeed() for correctness! - * - * There is no return value. - */ - -static void elanfreq_set_cpu_state(unsigned int state) -{ - struct cpufreq_freqs freqs; - - freqs.old = elanfreq_get_cpu_frequency(0); - freqs.new = elan_multiplier[state].clock; - freqs.cpu = 0; /* elanfreq.c is UP only driver */ - - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - - printk(KERN_INFO "elanfreq: attempting to set frequency to %i kHz\n", - elan_multiplier[state].clock); - - - /* - * Access to the Elan's internal registers is indexed via - * 0x22: Chip Setup & Control Register Index Register (CSCI) - * 0x23: Chip Setup & Control Register Data Register (CSCD) - * - */ - - /* - * 0x40 is the Power Management Unit's Force Mode Register. - * Bit 6 enables Hyperspeed Mode (66/100 MHz core frequency) - */ - - local_irq_disable(); - outb_p(0x40, REG_CSCIR); /* Disable hyperspeed mode */ - outb_p(0x00, REG_CSCDR); - local_irq_enable(); /* wait till internal pipelines and */ - udelay(1000); /* buffers have cleaned up */ - - local_irq_disable(); - - /* now, set the CPU clock speed register (0x80) */ - outb_p(0x80, REG_CSCIR); - outb_p(elan_multiplier[state].val80h, REG_CSCDR); - - /* now, the hyperspeed bit in PMU Force Mode Register (0x40) */ - outb_p(0x40, REG_CSCIR); - outb_p(elan_multiplier[state].val40h, REG_CSCDR); - udelay(10000); - local_irq_enable(); - - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); -}; - - -/** - * elanfreq_validatespeed: test if frequency range is valid - * @policy: the policy to validate - * - * This function checks if a given frequency range in kHz is valid - * for the hardware supported by the driver. - */ - -static int elanfreq_verify(struct cpufreq_policy *policy) -{ - return cpufreq_frequency_table_verify(policy, &elanfreq_table[0]); -} - -static int elanfreq_target(struct cpufreq_policy *policy, - unsigned int target_freq, - unsigned int relation) -{ - unsigned int newstate = 0; - - if (cpufreq_frequency_table_target(policy, &elanfreq_table[0], - target_freq, relation, &newstate)) - return -EINVAL; - - elanfreq_set_cpu_state(newstate); - - return 0; -} - - -/* - * Module init and exit code - */ - -static int elanfreq_cpu_init(struct cpufreq_policy *policy) -{ - struct cpuinfo_x86 *c = &cpu_data(0); - unsigned int i; - int result; - - /* capability check */ - if ((c->x86_vendor != X86_VENDOR_AMD) || - (c->x86 != 4) || (c->x86_model != 10)) - return -ENODEV; - - /* max freq */ - if (!max_freq) - max_freq = elanfreq_get_cpu_frequency(0); - - /* table init */ - for (i = 0; (elanfreq_table[i].frequency != CPUFREQ_TABLE_END); i++) { - if (elanfreq_table[i].frequency > max_freq) - elanfreq_table[i].frequency = CPUFREQ_ENTRY_INVALID; - } - - /* cpuinfo and default policy values */ - policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL; - policy->cur = elanfreq_get_cpu_frequency(0); - - result = cpufreq_frequency_table_cpuinfo(policy, elanfreq_table); - if (result) - return result; - - cpufreq_frequency_table_get_attr(elanfreq_table, policy->cpu); - return 0; -} - - -static int elanfreq_cpu_exit(struct cpufreq_policy *policy) -{ - cpufreq_frequency_table_put_attr(policy->cpu); - return 0; -} - - -#ifndef MODULE -/** - * elanfreq_setup - elanfreq command line parameter parsing - * - * elanfreq command line parameter. Use: - * elanfreq=66000 - * to set the maximum CPU frequency to 66 MHz. Note that in - * case you do not give this boot parameter, the maximum - * frequency will fall back to _current_ CPU frequency which - * might be lower. If you build this as a module, use the - * max_freq module parameter instead. - */ -static int __init elanfreq_setup(char *str) -{ - max_freq = simple_strtoul(str, &str, 0); - printk(KERN_WARNING "You're using the deprecated elanfreq command line option. Use elanfreq.max_freq instead, please!\n"); - return 1; -} -__setup("elanfreq=", elanfreq_setup); -#endif - - -static struct freq_attr *elanfreq_attr[] = { - &cpufreq_freq_attr_scaling_available_freqs, - NULL, -}; - - -static struct cpufreq_driver elanfreq_driver = { - .get = elanfreq_get_cpu_frequency, - .verify = elanfreq_verify, - .target = elanfreq_target, - .init = elanfreq_cpu_init, - .exit = elanfreq_cpu_exit, - .name = "elanfreq", - .owner = THIS_MODULE, - .attr = elanfreq_attr, -}; - - -static int __init elanfreq_init(void) -{ - struct cpuinfo_x86 *c = &cpu_data(0); - - /* Test if we have the right hardware */ - if ((c->x86_vendor != X86_VENDOR_AMD) || - (c->x86 != 4) || (c->x86_model != 10)) { - printk(KERN_INFO "elanfreq: error: no Elan processor found!\n"); - return -ENODEV; - } - return cpufreq_register_driver(&elanfreq_driver); -} - - -static void __exit elanfreq_exit(void) -{ - cpufreq_unregister_driver(&elanfreq_driver); -} - - -module_param(max_freq, int, 0444); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Robert Schwebel , " - "Sven Geggus "); -MODULE_DESCRIPTION("cpufreq driver for AMD's Elan CPUs"); - -module_init(elanfreq_init); -module_exit(elanfreq_exit); diff --git a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c deleted file mode 100644 index ffe1f2c..0000000 --- a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c +++ /dev/null @@ -1,514 +0,0 @@ -/* - * Cyrix MediaGX and NatSemi Geode Suspend Modulation - * (C) 2002 Zwane Mwaikambo - * (C) 2002 Hiroshi Miura - * All Rights Reserved - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * version 2 as published by the Free Software Foundation - * - * The author(s) of this software shall not be held liable for damages - * of any nature resulting due to the use of this software. This - * software is provided AS-IS with no warranties. - * - * Theoretical note: - * - * (see Geode(tm) CS5530 manual (rev.4.1) page.56) - * - * CPU frequency control on NatSemi Geode GX1/GXLV processor and CS55x0 - * are based on Suspend Modulation. - * - * Suspend Modulation works by asserting and de-asserting the SUSP# pin - * to CPU(GX1/GXLV) for configurable durations. When asserting SUSP# - * the CPU enters an idle state. GX1 stops its core clock when SUSP# is - * asserted then power consumption is reduced. - * - * Suspend Modulation's OFF/ON duration are configurable - * with 'Suspend Modulation OFF Count Register' - * and 'Suspend Modulation ON Count Register'. - * These registers are 8bit counters that represent the number of - * 32us intervals which the SUSP# pin is asserted(ON)/de-asserted(OFF) - * to the processor. - * - * These counters define a ratio which is the effective frequency - * of operation of the system. - * - * OFF Count - * F_eff = Fgx * ---------------------- - * OFF Count + ON Count - * - * 0 <= On Count, Off Count <= 255 - * - * From these limits, we can get register values - * - * off_duration + on_duration <= MAX_DURATION - * on_duration = off_duration * (stock_freq - freq) / freq - * - * off_duration = (freq * DURATION) / stock_freq - * on_duration = DURATION - off_duration - * - * - *--------------------------------------------------------------------------- - * - * ChangeLog: - * Dec. 12, 2003 Hiroshi Miura - * - fix on/off register mistake - * - fix cpu_khz calc when it stops cpu modulation. - * - * Dec. 11, 2002 Hiroshi Miura - * - rewrite for Cyrix MediaGX Cx5510/5520 and - * NatSemi Geode Cs5530(A). - * - * Jul. ??, 2002 Zwane Mwaikambo - * - cs5530_mod patch for 2.4.19-rc1. - * - *--------------------------------------------------------------------------- - * - * Todo - * Test on machines with 5510, 5530, 5530A - */ - -/************************************************************************ - * Suspend Modulation - Definitions * - ************************************************************************/ - -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -/* PCI config registers, all at F0 */ -#define PCI_PMER1 0x80 /* power management enable register 1 */ -#define PCI_PMER2 0x81 /* power management enable register 2 */ -#define PCI_PMER3 0x82 /* power management enable register 3 */ -#define PCI_IRQTC 0x8c /* irq speedup timer counter register:typical 2 to 4ms */ -#define PCI_VIDTC 0x8d /* video speedup timer counter register: typical 50 to 100ms */ -#define PCI_MODOFF 0x94 /* suspend modulation OFF counter register, 1 = 32us */ -#define PCI_MODON 0x95 /* suspend modulation ON counter register */ -#define PCI_SUSCFG 0x96 /* suspend configuration register */ - -/* PMER1 bits */ -#define GPM (1<<0) /* global power management */ -#define GIT (1<<1) /* globally enable PM device idle timers */ -#define GTR (1<<2) /* globally enable IO traps */ -#define IRQ_SPDUP (1<<3) /* disable clock throttle during interrupt handling */ -#define VID_SPDUP (1<<4) /* disable clock throttle during vga video handling */ - -/* SUSCFG bits */ -#define SUSMOD (1<<0) /* enable/disable suspend modulation */ -/* the below is supported only with cs5530 (after rev.1.2)/cs5530A */ -#define SMISPDUP (1<<1) /* select how SMI re-enable suspend modulation: */ - /* IRQTC timer or read SMI speedup disable reg.(F1BAR[08-09h]) */ -#define SUSCFG (1<<2) /* enable powering down a GXLV processor. "Special 3Volt Suspend" mode */ -/* the below is supported only with cs5530A */ -#define PWRSVE_ISA (1<<3) /* stop ISA clock */ -#define PWRSVE (1<<4) /* active idle */ - -struct gxfreq_params { - u8 on_duration; - u8 off_duration; - u8 pci_suscfg; - u8 pci_pmer1; - u8 pci_pmer2; - struct pci_dev *cs55x0; -}; - -static struct gxfreq_params *gx_params; -static int stock_freq; - -/* PCI bus clock - defaults to 30.000 if cpu_khz is not available */ -static int pci_busclk; -module_param(pci_busclk, int, 0444); - -/* maximum duration for which the cpu may be suspended - * (32us * MAX_DURATION). If no parameter is given, this defaults - * to 255. - * Note that this leads to a maximum of 8 ms(!) where the CPU clock - * is suspended -- processing power is just 0.39% of what it used to be, - * though. 781.25 kHz(!) for a 200 MHz processor -- wow. */ -static int max_duration = 255; -module_param(max_duration, int, 0444); - -/* For the default policy, we want at least some processing power - * - let's say 5%. (min = maxfreq / POLICY_MIN_DIV) - */ -#define POLICY_MIN_DIV 20 - - -/** - * we can detect a core multipiler from dir0_lsb - * from GX1 datasheet p.56, - * MULT[3:0]: - * 0000 = SYSCLK multiplied by 4 (test only) - * 0001 = SYSCLK multiplied by 10 - * 0010 = SYSCLK multiplied by 4 - * 0011 = SYSCLK multiplied by 6 - * 0100 = SYSCLK multiplied by 9 - * 0101 = SYSCLK multiplied by 5 - * 0110 = SYSCLK multiplied by 7 - * 0111 = SYSCLK multiplied by 8 - * of 33.3MHz - **/ -static int gx_freq_mult[16] = { - 4, 10, 4, 6, 9, 5, 7, 8, - 0, 0, 0, 0, 0, 0, 0, 0 -}; - - -/**************************************************************** - * Low Level chipset interface * - ****************************************************************/ -static struct pci_device_id gx_chipset_tbl[] __initdata = { - { PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY), }, - { PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5520), }, - { PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5510), }, - { 0, }, -}; - -static void gx_write_byte(int reg, int value) -{ - pci_write_config_byte(gx_params->cs55x0, reg, value); -} - -/** - * gx_detect_chipset: - * - **/ -static __init struct pci_dev *gx_detect_chipset(void) -{ - struct pci_dev *gx_pci = NULL; - - /* check if CPU is a MediaGX or a Geode. */ - if ((boot_cpu_data.x86_vendor != X86_VENDOR_NSC) && - (boot_cpu_data.x86_vendor != X86_VENDOR_CYRIX)) { - pr_debug("error: no MediaGX/Geode processor found!\n"); - return NULL; - } - - /* detect which companion chip is used */ - for_each_pci_dev(gx_pci) { - if ((pci_match_id(gx_chipset_tbl, gx_pci)) != NULL) - return gx_pci; - } - - pr_debug("error: no supported chipset found!\n"); - return NULL; -} - -/** - * gx_get_cpuspeed: - * - * Finds out at which efficient frequency the Cyrix MediaGX/NatSemi - * Geode CPU runs. - */ -static unsigned int gx_get_cpuspeed(unsigned int cpu) -{ - if ((gx_params->pci_suscfg & SUSMOD) == 0) - return stock_freq; - - return (stock_freq * gx_params->off_duration) - / (gx_params->on_duration + gx_params->off_duration); -} - -/** - * gx_validate_speed: - * determine current cpu speed - * - **/ - -static unsigned int gx_validate_speed(unsigned int khz, u8 *on_duration, - u8 *off_duration) -{ - unsigned int i; - u8 tmp_on, tmp_off; - int old_tmp_freq = stock_freq; - int tmp_freq; - - *off_duration = 1; - *on_duration = 0; - - for (i = max_duration; i > 0; i--) { - tmp_off = ((khz * i) / stock_freq) & 0xff; - tmp_on = i - tmp_off; - tmp_freq = (stock_freq * tmp_off) / i; - /* if this relation is closer to khz, use this. If it's equal, - * prefer it, too - lower latency */ - if (abs(tmp_freq - khz) <= abs(old_tmp_freq - khz)) { - *on_duration = tmp_on; - *off_duration = tmp_off; - old_tmp_freq = tmp_freq; - } - } - - return old_tmp_freq; -} - - -/** - * gx_set_cpuspeed: - * set cpu speed in khz. - **/ - -static void gx_set_cpuspeed(unsigned int khz) -{ - u8 suscfg, pmer1; - unsigned int new_khz; - unsigned long flags; - struct cpufreq_freqs freqs; - - freqs.cpu = 0; - freqs.old = gx_get_cpuspeed(0); - - new_khz = gx_validate_speed(khz, &gx_params->on_duration, - &gx_params->off_duration); - - freqs.new = new_khz; - - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - local_irq_save(flags); - - - - if (new_khz != stock_freq) { - /* if new khz == 100% of CPU speed, it is special case */ - switch (gx_params->cs55x0->device) { - case PCI_DEVICE_ID_CYRIX_5530_LEGACY: - pmer1 = gx_params->pci_pmer1 | IRQ_SPDUP | VID_SPDUP; - /* FIXME: need to test other values -- Zwane,Miura */ - /* typical 2 to 4ms */ - gx_write_byte(PCI_IRQTC, 4); - /* typical 50 to 100ms */ - gx_write_byte(PCI_VIDTC, 100); - gx_write_byte(PCI_PMER1, pmer1); - - if (gx_params->cs55x0->revision < 0x10) { - /* CS5530(rev 1.2, 1.3) */ - suscfg = gx_params->pci_suscfg|SUSMOD; - } else { - /* CS5530A,B.. */ - suscfg = gx_params->pci_suscfg|SUSMOD|PWRSVE; - } - break; - case PCI_DEVICE_ID_CYRIX_5520: - case PCI_DEVICE_ID_CYRIX_5510: - suscfg = gx_params->pci_suscfg | SUSMOD; - break; - default: - local_irq_restore(flags); - pr_debug("fatal: try to set unknown chipset.\n"); - return; - } - } else { - suscfg = gx_params->pci_suscfg & ~(SUSMOD); - gx_params->off_duration = 0; - gx_params->on_duration = 0; - pr_debug("suspend modulation disabled: cpu runs 100%% speed.\n"); - } - - gx_write_byte(PCI_MODOFF, gx_params->off_duration); - gx_write_byte(PCI_MODON, gx_params->on_duration); - - gx_write_byte(PCI_SUSCFG, suscfg); - pci_read_config_byte(gx_params->cs55x0, PCI_SUSCFG, &suscfg); - - local_irq_restore(flags); - - gx_params->pci_suscfg = suscfg; - - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); - - pr_debug("suspend modulation w/ duration of ON:%d us, OFF:%d us\n", - gx_params->on_duration * 32, gx_params->off_duration * 32); - pr_debug("suspend modulation w/ clock speed: %d kHz.\n", freqs.new); -} - -/**************************************************************** - * High level functions * - ****************************************************************/ - -/* - * cpufreq_gx_verify: test if frequency range is valid - * - * This function checks if a given frequency range in kHz is valid - * for the hardware supported by the driver. - */ - -static int cpufreq_gx_verify(struct cpufreq_policy *policy) -{ - unsigned int tmp_freq = 0; - u8 tmp1, tmp2; - - if (!stock_freq || !policy) - return -EINVAL; - - policy->cpu = 0; - cpufreq_verify_within_limits(policy, (stock_freq / max_duration), - stock_freq); - - /* it needs to be assured that at least one supported frequency is - * within policy->min and policy->max. If it is not, policy->max - * needs to be increased until one freuqency is supported. - * policy->min may not be decreased, though. This way we guarantee a - * specific processing capacity. - */ - tmp_freq = gx_validate_speed(policy->min, &tmp1, &tmp2); - if (tmp_freq < policy->min) - tmp_freq += stock_freq / max_duration; - policy->min = tmp_freq; - if (policy->min > policy->max) - policy->max = tmp_freq; - tmp_freq = gx_validate_speed(policy->max, &tmp1, &tmp2); - if (tmp_freq > policy->max) - tmp_freq -= stock_freq / max_duration; - policy->max = tmp_freq; - if (policy->max < policy->min) - policy->max = policy->min; - cpufreq_verify_within_limits(policy, (stock_freq / max_duration), - stock_freq); - - return 0; -} - -/* - * cpufreq_gx_target: - * - */ -static int cpufreq_gx_target(struct cpufreq_policy *policy, - unsigned int target_freq, - unsigned int relation) -{ - u8 tmp1, tmp2; - unsigned int tmp_freq; - - if (!stock_freq || !policy) - return -EINVAL; - - policy->cpu = 0; - - tmp_freq = gx_validate_speed(target_freq, &tmp1, &tmp2); - while (tmp_freq < policy->min) { - tmp_freq += stock_freq / max_duration; - tmp_freq = gx_validate_speed(tmp_freq, &tmp1, &tmp2); - } - while (tmp_freq > policy->max) { - tmp_freq -= stock_freq / max_duration; - tmp_freq = gx_validate_speed(tmp_freq, &tmp1, &tmp2); - } - - gx_set_cpuspeed(tmp_freq); - - return 0; -} - -static int cpufreq_gx_cpu_init(struct cpufreq_policy *policy) -{ - unsigned int maxfreq, curfreq; - - if (!policy || policy->cpu != 0) - return -ENODEV; - - /* determine maximum frequency */ - if (pci_busclk) - maxfreq = pci_busclk * gx_freq_mult[getCx86(CX86_DIR1) & 0x0f]; - else if (cpu_khz) - maxfreq = cpu_khz; - else - maxfreq = 30000 * gx_freq_mult[getCx86(CX86_DIR1) & 0x0f]; - - stock_freq = maxfreq; - curfreq = gx_get_cpuspeed(0); - - pr_debug("cpu max frequency is %d.\n", maxfreq); - pr_debug("cpu current frequency is %dkHz.\n", curfreq); - - /* setup basic struct for cpufreq API */ - policy->cpu = 0; - - if (max_duration < POLICY_MIN_DIV) - policy->min = maxfreq / max_duration; - else - policy->min = maxfreq / POLICY_MIN_DIV; - policy->max = maxfreq; - policy->cur = curfreq; - policy->cpuinfo.min_freq = maxfreq / max_duration; - policy->cpuinfo.max_freq = maxfreq; - policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL; - - return 0; -} - -/* - * cpufreq_gx_init: - * MediaGX/Geode GX initialize cpufreq driver - */ -static struct cpufreq_driver gx_suspmod_driver = { - .get = gx_get_cpuspeed, - .verify = cpufreq_gx_verify, - .target = cpufreq_gx_target, - .init = cpufreq_gx_cpu_init, - .name = "gx-suspmod", - .owner = THIS_MODULE, -}; - -static int __init cpufreq_gx_init(void) -{ - int ret; - struct gxfreq_params *params; - struct pci_dev *gx_pci; - - /* Test if we have the right hardware */ - gx_pci = gx_detect_chipset(); - if (gx_pci == NULL) - return -ENODEV; - - /* check whether module parameters are sane */ - if (max_duration > 0xff) - max_duration = 0xff; - - pr_debug("geode suspend modulation available.\n"); - - params = kzalloc(sizeof(struct gxfreq_params), GFP_KERNEL); - if (params == NULL) - return -ENOMEM; - - params->cs55x0 = gx_pci; - gx_params = params; - - /* keep cs55x0 configurations */ - pci_read_config_byte(params->cs55x0, PCI_SUSCFG, &(params->pci_suscfg)); - pci_read_config_byte(params->cs55x0, PCI_PMER1, &(params->pci_pmer1)); - pci_read_config_byte(params->cs55x0, PCI_PMER2, &(params->pci_pmer2)); - pci_read_config_byte(params->cs55x0, PCI_MODON, &(params->on_duration)); - pci_read_config_byte(params->cs55x0, PCI_MODOFF, - &(params->off_duration)); - - ret = cpufreq_register_driver(&gx_suspmod_driver); - if (ret) { - kfree(params); - return ret; /* register error! */ - } - - return 0; -} - -static void __exit cpufreq_gx_exit(void) -{ - cpufreq_unregister_driver(&gx_suspmod_driver); - pci_dev_put(gx_params->cs55x0); - kfree(gx_params); -} - -MODULE_AUTHOR("Hiroshi Miura "); -MODULE_DESCRIPTION("Cpufreq driver for Cyrix MediaGX and NatSemi Geode"); -MODULE_LICENSE("GPL"); - -module_init(cpufreq_gx_init); -module_exit(cpufreq_gx_exit); - diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.c b/arch/x86/kernel/cpu/cpufreq/longhaul.c deleted file mode 100644 index f47d26e..0000000 --- a/arch/x86/kernel/cpu/cpufreq/longhaul.c +++ /dev/null @@ -1,1024 +0,0 @@ -/* - * (C) 2001-2004 Dave Jones. - * (C) 2002 Padraig Brady. - * - * Licensed under the terms of the GNU GPL License version 2. - * Based upon datasheets & sample CPUs kindly provided by VIA. - * - * VIA have currently 3 different versions of Longhaul. - * Version 1 (Longhaul) uses the BCR2 MSR at 0x1147. - * It is present only in Samuel 1 (C5A), Samuel 2 (C5B) stepping 0. - * Version 2 of longhaul is backward compatible with v1, but adds - * LONGHAUL MSR for purpose of both frequency and voltage scaling. - * Present in Samuel 2 (steppings 1-7 only) (C5B), and Ezra (C5C). - * Version 3 of longhaul got renamed to Powersaver and redesigned - * to use only the POWERSAVER MSR at 0x110a. - * It is present in Ezra-T (C5M), Nehemiah (C5X) and above. - * It's pretty much the same feature wise to longhaul v2, though - * there is provision for scaling FSB too, but this doesn't work - * too well in practice so we don't even try to use this. - * - * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous* - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include "longhaul.h" - -#define PFX "longhaul: " - -#define TYPE_LONGHAUL_V1 1 -#define TYPE_LONGHAUL_V2 2 -#define TYPE_POWERSAVER 3 - -#define CPU_SAMUEL 1 -#define CPU_SAMUEL2 2 -#define CPU_EZRA 3 -#define CPU_EZRA_T 4 -#define CPU_NEHEMIAH 5 -#define CPU_NEHEMIAH_C 6 - -/* Flags */ -#define USE_ACPI_C3 (1 << 1) -#define USE_NORTHBRIDGE (1 << 2) - -static int cpu_model; -static unsigned int numscales = 16; -static unsigned int fsb; - -static const struct mV_pos *vrm_mV_table; -static const unsigned char *mV_vrm_table; - -static unsigned int highest_speed, lowest_speed; /* kHz */ -static unsigned int minmult, maxmult; -static int can_scale_voltage; -static struct acpi_processor *pr; -static struct acpi_processor_cx *cx; -static u32 acpi_regs_addr; -static u8 longhaul_flags; -static unsigned int longhaul_index; - -/* Module parameters */ -static int scale_voltage; -static int disable_acpi_c3; -static int revid_errata; - - -/* Clock ratios multiplied by 10 */ -static int mults[32]; -static int eblcr[32]; -static int longhaul_version; -static struct cpufreq_frequency_table *longhaul_table; - -static char speedbuffer[8]; - -static char *print_speed(int speed) -{ - if (speed < 1000) { - snprintf(speedbuffer, sizeof(speedbuffer), "%dMHz", speed); - return speedbuffer; - } - - if (speed%1000 == 0) - snprintf(speedbuffer, sizeof(speedbuffer), - "%dGHz", speed/1000); - else - snprintf(speedbuffer, sizeof(speedbuffer), - "%d.%dGHz", speed/1000, (speed%1000)/100); - - return speedbuffer; -} - - -static unsigned int calc_speed(int mult) -{ - int khz; - khz = (mult/10)*fsb; - if (mult%10) - khz += fsb/2; - khz *= 1000; - return khz; -} - - -static int longhaul_get_cpu_mult(void) -{ - unsigned long invalue = 0, lo, hi; - - rdmsr(MSR_IA32_EBL_CR_POWERON, lo, hi); - invalue = (lo & (1<<22|1<<23|1<<24|1<<25))>>22; - if (longhaul_version == TYPE_LONGHAUL_V2 || - longhaul_version == TYPE_POWERSAVER) { - if (lo & (1<<27)) - invalue += 16; - } - return eblcr[invalue]; -} - -/* For processor with BCR2 MSR */ - -static void do_longhaul1(unsigned int mults_index) -{ - union msr_bcr2 bcr2; - - rdmsrl(MSR_VIA_BCR2, bcr2.val); - /* Enable software clock multiplier */ - bcr2.bits.ESOFTBF = 1; - bcr2.bits.CLOCKMUL = mults_index & 0xff; - - /* Sync to timer tick */ - safe_halt(); - /* Change frequency on next halt or sleep */ - wrmsrl(MSR_VIA_BCR2, bcr2.val); - /* Invoke transition */ - ACPI_FLUSH_CPU_CACHE(); - halt(); - - /* Disable software clock multiplier */ - local_irq_disable(); - rdmsrl(MSR_VIA_BCR2, bcr2.val); - bcr2.bits.ESOFTBF = 0; - wrmsrl(MSR_VIA_BCR2, bcr2.val); -} - -/* For processor with Longhaul MSR */ - -static void do_powersaver(int cx_address, unsigned int mults_index, - unsigned int dir) -{ - union msr_longhaul longhaul; - u32 t; - - rdmsrl(MSR_VIA_LONGHAUL, longhaul.val); - /* Setup new frequency */ - if (!revid_errata) - longhaul.bits.RevisionKey = longhaul.bits.RevisionID; - else - longhaul.bits.RevisionKey = 0; - longhaul.bits.SoftBusRatio = mults_index & 0xf; - longhaul.bits.SoftBusRatio4 = (mults_index & 0x10) >> 4; - /* Setup new voltage */ - if (can_scale_voltage) - longhaul.bits.SoftVID = (mults_index >> 8) & 0x1f; - /* Sync to timer tick */ - safe_halt(); - /* Raise voltage if necessary */ - if (can_scale_voltage && dir) { - longhaul.bits.EnableSoftVID = 1; - wrmsrl(MSR_VIA_LONGHAUL, longhaul.val); - /* Change voltage */ - if (!cx_address) { - ACPI_FLUSH_CPU_CACHE(); - halt(); - } else { - ACPI_FLUSH_CPU_CACHE(); - /* Invoke C3 */ - inb(cx_address); - /* Dummy op - must do something useless after P_LVL3 - * read */ - t = inl(acpi_gbl_FADT.xpm_timer_block.address); - } - longhaul.bits.EnableSoftVID = 0; - wrmsrl(MSR_VIA_LONGHAUL, longhaul.val); - } - - /* Change frequency on next halt or sleep */ - longhaul.bits.EnableSoftBusRatio = 1; - wrmsrl(MSR_VIA_LONGHAUL, longhaul.val); - if (!cx_address) { - ACPI_FLUSH_CPU_CACHE(); - halt(); - } else { - ACPI_FLUSH_CPU_CACHE(); - /* Invoke C3 */ - inb(cx_address); - /* Dummy op - must do something useless after P_LVL3 read */ - t = inl(acpi_gbl_FADT.xpm_timer_block.address); - } - /* Disable bus ratio bit */ - longhaul.bits.EnableSoftBusRatio = 0; - wrmsrl(MSR_VIA_LONGHAUL, longhaul.val); - - /* Reduce voltage if necessary */ - if (can_scale_voltage && !dir) { - longhaul.bits.EnableSoftVID = 1; - wrmsrl(MSR_VIA_LONGHAUL, longhaul.val); - /* Change voltage */ - if (!cx_address) { - ACPI_FLUSH_CPU_CACHE(); - halt(); - } else { - ACPI_FLUSH_CPU_CACHE(); - /* Invoke C3 */ - inb(cx_address); - /* Dummy op - must do something useless after P_LVL3 - * read */ - t = inl(acpi_gbl_FADT.xpm_timer_block.address); - } - longhaul.bits.EnableSoftVID = 0; - wrmsrl(MSR_VIA_LONGHAUL, longhaul.val); - } -} - -/** - * longhaul_set_cpu_frequency() - * @mults_index : bitpattern of the new multiplier. - * - * Sets a new clock ratio. - */ - -static void longhaul_setstate(unsigned int table_index) -{ - unsigned int mults_index; - int speed, mult; - struct cpufreq_freqs freqs; - unsigned long flags; - unsigned int pic1_mask, pic2_mask; - u16 bm_status = 0; - u32 bm_timeout = 1000; - unsigned int dir = 0; - - mults_index = longhaul_table[table_index].index; - /* Safety precautions */ - mult = mults[mults_index & 0x1f]; - if (mult == -1) - return; - speed = calc_speed(mult); - if ((speed > highest_speed) || (speed < lowest_speed)) - return; - /* Voltage transition before frequency transition? */ - if (can_scale_voltage && longhaul_index < table_index) - dir = 1; - - freqs.old = calc_speed(longhaul_get_cpu_mult()); - freqs.new = speed; - freqs.cpu = 0; /* longhaul.c is UP only driver */ - - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - - pr_debug("Setting to FSB:%dMHz Mult:%d.%dx (%s)\n", - fsb, mult/10, mult%10, print_speed(speed/1000)); -retry_loop: - preempt_disable(); - local_irq_save(flags); - - pic2_mask = inb(0xA1); - pic1_mask = inb(0x21); /* works on C3. save mask. */ - outb(0xFF, 0xA1); /* Overkill */ - outb(0xFE, 0x21); /* TMR0 only */ - - /* Wait while PCI bus is busy. */ - if (acpi_regs_addr && (longhaul_flags & USE_NORTHBRIDGE - || ((pr != NULL) && pr->flags.bm_control))) { - bm_status = inw(acpi_regs_addr); - bm_status &= 1 << 4; - while (bm_status && bm_timeout) { - outw(1 << 4, acpi_regs_addr); - bm_timeout--; - bm_status = inw(acpi_regs_addr); - bm_status &= 1 << 4; - } - } - - if (longhaul_flags & USE_NORTHBRIDGE) { - /* Disable AGP and PCI arbiters */ - outb(3, 0x22); - } else if ((pr != NULL) && pr->flags.bm_control) { - /* Disable bus master arbitration */ - acpi_write_bit_register(ACPI_BITREG_ARB_DISABLE, 1); - } - switch (longhaul_version) { - - /* - * Longhaul v1. (Samuel[C5A] and Samuel2 stepping 0[C5B]) - * Software controlled multipliers only. - */ - case TYPE_LONGHAUL_V1: - do_longhaul1(mults_index); - break; - - /* - * Longhaul v2 appears in Samuel2 Steppings 1->7 [C5B] and Ezra [C5C] - * - * Longhaul v3 (aka Powersaver). (Ezra-T [C5M] & Nehemiah [C5N]) - * Nehemiah can do FSB scaling too, but this has never been proven - * to work in practice. - */ - case TYPE_LONGHAUL_V2: - case TYPE_POWERSAVER: - if (longhaul_flags & USE_ACPI_C3) { - /* Don't allow wakeup */ - acpi_write_bit_register(ACPI_BITREG_BUS_MASTER_RLD, 0); - do_powersaver(cx->address, mults_index, dir); - } else { - do_powersaver(0, mults_index, dir); - } - break; - } - - if (longhaul_flags & USE_NORTHBRIDGE) { - /* Enable arbiters */ - outb(0, 0x22); - } else if ((pr != NULL) && pr->flags.bm_control) { - /* Enable bus master arbitration */ - acpi_write_bit_register(ACPI_BITREG_ARB_DISABLE, 0); - } - outb(pic2_mask, 0xA1); /* restore mask */ - outb(pic1_mask, 0x21); - - local_irq_restore(flags); - preempt_enable(); - - freqs.new = calc_speed(longhaul_get_cpu_mult()); - /* Check if requested frequency is set. */ - if (unlikely(freqs.new != speed)) { - printk(KERN_INFO PFX "Failed to set requested frequency!\n"); - /* Revision ID = 1 but processor is expecting revision key - * equal to 0. Jumpers at the bottom of processor will change - * multiplier and FSB, but will not change bits in Longhaul - * MSR nor enable voltage scaling. */ - if (!revid_errata) { - printk(KERN_INFO PFX "Enabling \"Ignore Revision ID\" " - "option.\n"); - revid_errata = 1; - msleep(200); - goto retry_loop; - } - /* Why ACPI C3 sometimes doesn't work is a mystery for me. - * But it does happen. Processor is entering ACPI C3 state, - * but it doesn't change frequency. I tried poking various - * bits in northbridge registers, but without success. */ - if (longhaul_flags & USE_ACPI_C3) { - printk(KERN_INFO PFX "Disabling ACPI C3 support.\n"); - longhaul_flags &= ~USE_ACPI_C3; - if (revid_errata) { - printk(KERN_INFO PFX "Disabling \"Ignore " - "Revision ID\" option.\n"); - revid_errata = 0; - } - msleep(200); - goto retry_loop; - } - /* This shouldn't happen. Longhaul ver. 2 was reported not - * working on processors without voltage scaling, but with - * RevID = 1. RevID errata will make things right. Just - * to be 100% sure. */ - if (longhaul_version == TYPE_LONGHAUL_V2) { - printk(KERN_INFO PFX "Switching to Longhaul ver. 1\n"); - longhaul_version = TYPE_LONGHAUL_V1; - msleep(200); - goto retry_loop; - } - } - /* Report true CPU frequency */ - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); - - if (!bm_timeout) - printk(KERN_INFO PFX "Warning: Timeout while waiting for " - "idle PCI bus.\n"); -} - -/* - * Centaur decided to make life a little more tricky. - * Only longhaul v1 is allowed to read EBLCR BSEL[0:1]. - * Samuel2 and above have to try and guess what the FSB is. - * We do this by assuming we booted at maximum multiplier, and interpolate - * between that value multiplied by possible FSBs and cpu_mhz which - * was calculated at boot time. Really ugly, but no other way to do this. - */ - -#define ROUNDING 0xf - -static int guess_fsb(int mult) -{ - int speed = cpu_khz / 1000; - int i; - int speeds[] = { 666, 1000, 1333, 2000 }; - int f_max, f_min; - - for (i = 0; i < 4; i++) { - f_max = ((speeds[i] * mult) + 50) / 100; - f_max += (ROUNDING / 2); - f_min = f_max - ROUNDING; - if ((speed <= f_max) && (speed >= f_min)) - return speeds[i] / 10; - } - return 0; -} - - -static int __cpuinit longhaul_get_ranges(void) -{ - unsigned int i, j, k = 0; - unsigned int ratio; - int mult; - - /* Get current frequency */ - mult = longhaul_get_cpu_mult(); - if (mult == -1) { - printk(KERN_INFO PFX "Invalid (reserved) multiplier!\n"); - return -EINVAL; - } - fsb = guess_fsb(mult); - if (fsb == 0) { - printk(KERN_INFO PFX "Invalid (reserved) FSB!\n"); - return -EINVAL; - } - /* Get max multiplier - as we always did. - * Longhaul MSR is useful only when voltage scaling is enabled. - * C3 is booting at max anyway. */ - maxmult = mult; - /* Get min multiplier */ - switch (cpu_model) { - case CPU_NEHEMIAH: - minmult = 50; - break; - case CPU_NEHEMIAH_C: - minmult = 40; - break; - default: - minmult = 30; - break; - } - - pr_debug("MinMult:%d.%dx MaxMult:%d.%dx\n", - minmult/10, minmult%10, maxmult/10, maxmult%10); - - highest_speed = calc_speed(maxmult); - lowest_speed = calc_speed(minmult); - pr_debug("FSB:%dMHz Lowest speed: %s Highest speed:%s\n", fsb, - print_speed(lowest_speed/1000), - print_speed(highest_speed/1000)); - - if (lowest_speed == highest_speed) { - printk(KERN_INFO PFX "highestspeed == lowest, aborting.\n"); - return -EINVAL; - } - if (lowest_speed > highest_speed) { - printk(KERN_INFO PFX "nonsense! lowest (%d > %d) !\n", - lowest_speed, highest_speed); - return -EINVAL; - } - - longhaul_table = kmalloc((numscales + 1) * sizeof(*longhaul_table), - GFP_KERNEL); - if (!longhaul_table) - return -ENOMEM; - - for (j = 0; j < numscales; j++) { - ratio = mults[j]; - if (ratio == -1) - continue; - if (ratio > maxmult || ratio < minmult) - continue; - longhaul_table[k].frequency = calc_speed(ratio); - longhaul_table[k].index = j; - k++; - } - if (k <= 1) { - kfree(longhaul_table); - return -ENODEV; - } - /* Sort */ - for (j = 0; j < k - 1; j++) { - unsigned int min_f, min_i; - min_f = longhaul_table[j].frequency; - min_i = j; - for (i = j + 1; i < k; i++) { - if (longhaul_table[i].frequency < min_f) { - min_f = longhaul_table[i].frequency; - min_i = i; - } - } - if (min_i != j) { - swap(longhaul_table[j].frequency, - longhaul_table[min_i].frequency); - swap(longhaul_table[j].index, - longhaul_table[min_i].index); - } - } - - longhaul_table[k].frequency = CPUFREQ_TABLE_END; - - /* Find index we are running on */ - for (j = 0; j < k; j++) { - if (mults[longhaul_table[j].index & 0x1f] == mult) { - longhaul_index = j; - break; - } - } - return 0; -} - - -static void __cpuinit longhaul_setup_voltagescaling(void) -{ - union msr_longhaul longhaul; - struct mV_pos minvid, maxvid, vid; - unsigned int j, speed, pos, kHz_step, numvscales; - int min_vid_speed; - - rdmsrl(MSR_VIA_LONGHAUL, longhaul.val); - if (!(longhaul.bits.RevisionID & 1)) { - printk(KERN_INFO PFX "Voltage scaling not supported by CPU.\n"); - return; - } - - if (!longhaul.bits.VRMRev) { - printk(KERN_INFO PFX "VRM 8.5\n"); - vrm_mV_table = &vrm85_mV[0]; - mV_vrm_table = &mV_vrm85[0]; - } else { - printk(KERN_INFO PFX "Mobile VRM\n"); - if (cpu_model < CPU_NEHEMIAH) - return; - vrm_mV_table = &mobilevrm_mV[0]; - mV_vrm_table = &mV_mobilevrm[0]; - } - - minvid = vrm_mV_table[longhaul.bits.MinimumVID]; - maxvid = vrm_mV_table[longhaul.bits.MaximumVID]; - - if (minvid.mV == 0 || maxvid.mV == 0 || minvid.mV > maxvid.mV) { - printk(KERN_INFO PFX "Bogus values Min:%d.%03d Max:%d.%03d. " - "Voltage scaling disabled.\n", - minvid.mV/1000, minvid.mV%1000, - maxvid.mV/1000, maxvid.mV%1000); - return; - } - - if (minvid.mV == maxvid.mV) { - printk(KERN_INFO PFX "Claims to support voltage scaling but " - "min & max are both %d.%03d. " - "Voltage scaling disabled\n", - maxvid.mV/1000, maxvid.mV%1000); - return; - } - - /* How many voltage steps*/ - numvscales = maxvid.pos - minvid.pos + 1; - printk(KERN_INFO PFX - "Max VID=%d.%03d " - "Min VID=%d.%03d, " - "%d possible voltage scales\n", - maxvid.mV/1000, maxvid.mV%1000, - minvid.mV/1000, minvid.mV%1000, - numvscales); - - /* Calculate max frequency at min voltage */ - j = longhaul.bits.MinMHzBR; - if (longhaul.bits.MinMHzBR4) - j += 16; - min_vid_speed = eblcr[j]; - if (min_vid_speed == -1) - return; - switch (longhaul.bits.MinMHzFSB) { - case 0: - min_vid_speed *= 13333; - break; - case 1: - min_vid_speed *= 10000; - break; - case 3: - min_vid_speed *= 6666; - break; - default: - return; - break; - } - if (min_vid_speed >= highest_speed) - return; - /* Calculate kHz for one voltage step */ - kHz_step = (highest_speed - min_vid_speed) / numvscales; - - j = 0; - while (longhaul_table[j].frequency != CPUFREQ_TABLE_END) { - speed = longhaul_table[j].frequency; - if (speed > min_vid_speed) - pos = (speed - min_vid_speed) / kHz_step + minvid.pos; - else - pos = minvid.pos; - longhaul_table[j].index |= mV_vrm_table[pos] << 8; - vid = vrm_mV_table[mV_vrm_table[pos]]; - printk(KERN_INFO PFX "f: %d kHz, index: %d, vid: %d mV\n", - speed, j, vid.mV); - j++; - } - - can_scale_voltage = 1; - printk(KERN_INFO PFX "Voltage scaling enabled.\n"); -} - - -static int longhaul_verify(struct cpufreq_policy *policy) -{ - return cpufreq_frequency_table_verify(policy, longhaul_table); -} - - -static int longhaul_target(struct cpufreq_policy *policy, - unsigned int target_freq, unsigned int relation) -{ - unsigned int table_index = 0; - unsigned int i; - unsigned int dir = 0; - u8 vid, current_vid; - - if (cpufreq_frequency_table_target(policy, longhaul_table, target_freq, - relation, &table_index)) - return -EINVAL; - - /* Don't set same frequency again */ - if (longhaul_index == table_index) - return 0; - - if (!can_scale_voltage) - longhaul_setstate(table_index); - else { - /* On test system voltage transitions exceeding single - * step up or down were turning motherboard off. Both - * "ondemand" and "userspace" are unsafe. C7 is doing - * this in hardware, C3 is old and we need to do this - * in software. */ - i = longhaul_index; - current_vid = (longhaul_table[longhaul_index].index >> 8); - current_vid &= 0x1f; - if (table_index > longhaul_index) - dir = 1; - while (i != table_index) { - vid = (longhaul_table[i].index >> 8) & 0x1f; - if (vid != current_vid) { - longhaul_setstate(i); - current_vid = vid; - msleep(200); - } - if (dir) - i++; - else - i--; - } - longhaul_setstate(table_index); - } - longhaul_index = table_index; - return 0; -} - - -static unsigned int longhaul_get(unsigned int cpu) -{ - if (cpu) - return 0; - return calc_speed(longhaul_get_cpu_mult()); -} - -static acpi_status longhaul_walk_callback(acpi_handle obj_handle, - u32 nesting_level, - void *context, void **return_value) -{ - struct acpi_device *d; - - if (acpi_bus_get_device(obj_handle, &d)) - return 0; - - *return_value = acpi_driver_data(d); - return 1; -} - -/* VIA don't support PM2 reg, but have something similar */ -static int enable_arbiter_disable(void) -{ - struct pci_dev *dev; - int status = 1; - int reg; - u8 pci_cmd; - - /* Find PLE133 host bridge */ - reg = 0x78; - dev = pci_get_device(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8601_0, - NULL); - /* Find PM133/VT8605 host bridge */ - if (dev == NULL) - dev = pci_get_device(PCI_VENDOR_ID_VIA, - PCI_DEVICE_ID_VIA_8605_0, NULL); - /* Find CLE266 host bridge */ - if (dev == NULL) { - reg = 0x76; - dev = pci_get_device(PCI_VENDOR_ID_VIA, - PCI_DEVICE_ID_VIA_862X_0, NULL); - /* Find CN400 V-Link host bridge */ - if (dev == NULL) - dev = pci_get_device(PCI_VENDOR_ID_VIA, 0x7259, NULL); - } - if (dev != NULL) { - /* Enable access to port 0x22 */ - pci_read_config_byte(dev, reg, &pci_cmd); - if (!(pci_cmd & 1<<7)) { - pci_cmd |= 1<<7; - pci_write_config_byte(dev, reg, pci_cmd); - pci_read_config_byte(dev, reg, &pci_cmd); - if (!(pci_cmd & 1<<7)) { - printk(KERN_ERR PFX - "Can't enable access to port 0x22.\n"); - status = 0; - } - } - pci_dev_put(dev); - return status; - } - return 0; -} - -static int longhaul_setup_southbridge(void) -{ - struct pci_dev *dev; - u8 pci_cmd; - - /* Find VT8235 southbridge */ - dev = pci_get_device(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8235, NULL); - if (dev == NULL) - /* Find VT8237 southbridge */ - dev = pci_get_device(PCI_VENDOR_ID_VIA, - PCI_DEVICE_ID_VIA_8237, NULL); - if (dev != NULL) { - /* Set transition time to max */ - pci_read_config_byte(dev, 0xec, &pci_cmd); - pci_cmd &= ~(1 << 2); - pci_write_config_byte(dev, 0xec, pci_cmd); - pci_read_config_byte(dev, 0xe4, &pci_cmd); - pci_cmd &= ~(1 << 7); - pci_write_config_byte(dev, 0xe4, pci_cmd); - pci_read_config_byte(dev, 0xe5, &pci_cmd); - pci_cmd |= 1 << 7; - pci_write_config_byte(dev, 0xe5, pci_cmd); - /* Get address of ACPI registers block*/ - pci_read_config_byte(dev, 0x81, &pci_cmd); - if (pci_cmd & 1 << 7) { - pci_read_config_dword(dev, 0x88, &acpi_regs_addr); - acpi_regs_addr &= 0xff00; - printk(KERN_INFO PFX "ACPI I/O at 0x%x\n", - acpi_regs_addr); - } - - pci_dev_put(dev); - return 1; - } - return 0; -} - -static int __cpuinit longhaul_cpu_init(struct cpufreq_policy *policy) -{ - struct cpuinfo_x86 *c = &cpu_data(0); - char *cpuname = NULL; - int ret; - u32 lo, hi; - - /* Check what we have on this motherboard */ - switch (c->x86_model) { - case 6: - cpu_model = CPU_SAMUEL; - cpuname = "C3 'Samuel' [C5A]"; - longhaul_version = TYPE_LONGHAUL_V1; - memcpy(mults, samuel1_mults, sizeof(samuel1_mults)); - memcpy(eblcr, samuel1_eblcr, sizeof(samuel1_eblcr)); - break; - - case 7: - switch (c->x86_mask) { - case 0: - longhaul_version = TYPE_LONGHAUL_V1; - cpu_model = CPU_SAMUEL2; - cpuname = "C3 'Samuel 2' [C5B]"; - /* Note, this is not a typo, early Samuel2's had - * Samuel1 ratios. */ - memcpy(mults, samuel1_mults, sizeof(samuel1_mults)); - memcpy(eblcr, samuel2_eblcr, sizeof(samuel2_eblcr)); - break; - case 1 ... 15: - longhaul_version = TYPE_LONGHAUL_V2; - if (c->x86_mask < 8) { - cpu_model = CPU_SAMUEL2; - cpuname = "C3 'Samuel 2' [C5B]"; - } else { - cpu_model = CPU_EZRA; - cpuname = "C3 'Ezra' [C5C]"; - } - memcpy(mults, ezra_mults, sizeof(ezra_mults)); - memcpy(eblcr, ezra_eblcr, sizeof(ezra_eblcr)); - break; - } - break; - - case 8: - cpu_model = CPU_EZRA_T; - cpuname = "C3 'Ezra-T' [C5M]"; - longhaul_version = TYPE_POWERSAVER; - numscales = 32; - memcpy(mults, ezrat_mults, sizeof(ezrat_mults)); - memcpy(eblcr, ezrat_eblcr, sizeof(ezrat_eblcr)); - break; - - case 9: - longhaul_version = TYPE_POWERSAVER; - numscales = 32; - memcpy(mults, nehemiah_mults, sizeof(nehemiah_mults)); - memcpy(eblcr, nehemiah_eblcr, sizeof(nehemiah_eblcr)); - switch (c->x86_mask) { - case 0 ... 1: - cpu_model = CPU_NEHEMIAH; - cpuname = "C3 'Nehemiah A' [C5XLOE]"; - break; - case 2 ... 4: - cpu_model = CPU_NEHEMIAH; - cpuname = "C3 'Nehemiah B' [C5XLOH]"; - break; - case 5 ... 15: - cpu_model = CPU_NEHEMIAH_C; - cpuname = "C3 'Nehemiah C' [C5P]"; - break; - } - break; - - default: - cpuname = "Unknown"; - break; - } - /* Check Longhaul ver. 2 */ - if (longhaul_version == TYPE_LONGHAUL_V2) { - rdmsr(MSR_VIA_LONGHAUL, lo, hi); - if (lo == 0 && hi == 0) - /* Looks like MSR isn't present */ - longhaul_version = TYPE_LONGHAUL_V1; - } - - printk(KERN_INFO PFX "VIA %s CPU detected. ", cpuname); - switch (longhaul_version) { - case TYPE_LONGHAUL_V1: - case TYPE_LONGHAUL_V2: - printk(KERN_CONT "Longhaul v%d supported.\n", longhaul_version); - break; - case TYPE_POWERSAVER: - printk(KERN_CONT "Powersaver supported.\n"); - break; - }; - - /* Doesn't hurt */ - longhaul_setup_southbridge(); - - /* Find ACPI data for processor */ - acpi_walk_namespace(ACPI_TYPE_PROCESSOR, ACPI_ROOT_OBJECT, - ACPI_UINT32_MAX, &longhaul_walk_callback, NULL, - NULL, (void *)&pr); - - /* Check ACPI support for C3 state */ - if (pr != NULL && longhaul_version == TYPE_POWERSAVER) { - cx = &pr->power.states[ACPI_STATE_C3]; - if (cx->address > 0 && cx->latency <= 1000) - longhaul_flags |= USE_ACPI_C3; - } - /* Disable if it isn't working */ - if (disable_acpi_c3) - longhaul_flags &= ~USE_ACPI_C3; - /* Check if northbridge is friendly */ - if (enable_arbiter_disable()) - longhaul_flags |= USE_NORTHBRIDGE; - - /* Check ACPI support for bus master arbiter disable */ - if (!(longhaul_flags & USE_ACPI_C3 - || longhaul_flags & USE_NORTHBRIDGE) - && ((pr == NULL) || !(pr->flags.bm_control))) { - printk(KERN_ERR PFX - "No ACPI support. Unsupported northbridge.\n"); - return -ENODEV; - } - - if (longhaul_flags & USE_NORTHBRIDGE) - printk(KERN_INFO PFX "Using northbridge support.\n"); - if (longhaul_flags & USE_ACPI_C3) - printk(KERN_INFO PFX "Using ACPI support.\n"); - - ret = longhaul_get_ranges(); - if (ret != 0) - return ret; - - if ((longhaul_version != TYPE_LONGHAUL_V1) && (scale_voltage != 0)) - longhaul_setup_voltagescaling(); - - policy->cpuinfo.transition_latency = 200000; /* nsec */ - policy->cur = calc_speed(longhaul_get_cpu_mult()); - - ret = cpufreq_frequency_table_cpuinfo(policy, longhaul_table); - if (ret) - return ret; - - cpufreq_frequency_table_get_attr(longhaul_table, policy->cpu); - - return 0; -} - -static int __devexit longhaul_cpu_exit(struct cpufreq_policy *policy) -{ - cpufreq_frequency_table_put_attr(policy->cpu); - return 0; -} - -static struct freq_attr *longhaul_attr[] = { - &cpufreq_freq_attr_scaling_available_freqs, - NULL, -}; - -static struct cpufreq_driver longhaul_driver = { - .verify = longhaul_verify, - .target = longhaul_target, - .get = longhaul_get, - .init = longhaul_cpu_init, - .exit = __devexit_p(longhaul_cpu_exit), - .name = "longhaul", - .owner = THIS_MODULE, - .attr = longhaul_attr, -}; - - -static int __init longhaul_init(void) -{ - struct cpuinfo_x86 *c = &cpu_data(0); - - if (c->x86_vendor != X86_VENDOR_CENTAUR || c->x86 != 6) - return -ENODEV; - -#ifdef CONFIG_SMP - if (num_online_cpus() > 1) { - printk(KERN_ERR PFX "More than 1 CPU detected, " - "longhaul disabled.\n"); - return -ENODEV; - } -#endif -#ifdef CONFIG_X86_IO_APIC - if (cpu_has_apic) { - printk(KERN_ERR PFX "APIC detected. Longhaul is currently " - "broken in this configuration.\n"); - return -ENODEV; - } -#endif - switch (c->x86_model) { - case 6 ... 9: - return cpufreq_register_driver(&longhaul_driver); - case 10: - printk(KERN_ERR PFX "Use acpi-cpufreq driver for VIA C7\n"); - default: - ; - } - - return -ENODEV; -} - - -static void __exit longhaul_exit(void) -{ - int i; - - for (i = 0; i < numscales; i++) { - if (mults[i] == maxmult) { - longhaul_setstate(i); - break; - } - } - - cpufreq_unregister_driver(&longhaul_driver); - kfree(longhaul_table); -} - -/* Even if BIOS is exporting ACPI C3 state, and it is used - * with success when CPU is idle, this state doesn't - * trigger frequency transition in some cases. */ -module_param(disable_acpi_c3, int, 0644); -MODULE_PARM_DESC(disable_acpi_c3, "Don't use ACPI C3 support"); -/* Change CPU voltage with frequency. Very useful to save - * power, but most VIA C3 processors aren't supporting it. */ -module_param(scale_voltage, int, 0644); -MODULE_PARM_DESC(scale_voltage, "Scale voltage of processor"); -/* Force revision key to 0 for processors which doesn't - * support voltage scaling, but are introducing itself as - * such. */ -module_param(revid_errata, int, 0644); -MODULE_PARM_DESC(revid_errata, "Ignore CPU Revision ID"); - -MODULE_AUTHOR("Dave Jones "); -MODULE_DESCRIPTION("Longhaul driver for VIA Cyrix processors."); -MODULE_LICENSE("GPL"); - -late_initcall(longhaul_init); -module_exit(longhaul_exit); diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.h b/arch/x86/kernel/cpu/cpufreq/longhaul.h deleted file mode 100644 index cbf48fb..0000000 --- a/arch/x86/kernel/cpu/cpufreq/longhaul.h +++ /dev/null @@ -1,353 +0,0 @@ -/* - * longhaul.h - * (C) 2003 Dave Jones. - * - * Licensed under the terms of the GNU GPL License version 2. - * - * VIA-specific information - */ - -union msr_bcr2 { - struct { - unsigned Reseved:19, // 18:0 - ESOFTBF:1, // 19 - Reserved2:3, // 22:20 - CLOCKMUL:4, // 26:23 - Reserved3:5; // 31:27 - } bits; - unsigned long val; -}; - -union msr_longhaul { - struct { - unsigned RevisionID:4, // 3:0 - RevisionKey:4, // 7:4 - EnableSoftBusRatio:1, // 8 - EnableSoftVID:1, // 9 - EnableSoftBSEL:1, // 10 - Reserved:3, // 11:13 - SoftBusRatio4:1, // 14 - VRMRev:1, // 15 - SoftBusRatio:4, // 19:16 - SoftVID:5, // 24:20 - Reserved2:3, // 27:25 - SoftBSEL:2, // 29:28 - Reserved3:2, // 31:30 - MaxMHzBR:4, // 35:32 - MaximumVID:5, // 40:36 - MaxMHzFSB:2, // 42:41 - MaxMHzBR4:1, // 43 - Reserved4:4, // 47:44 - MinMHzBR:4, // 51:48 - MinimumVID:5, // 56:52 - MinMHzFSB:2, // 58:57 - MinMHzBR4:1, // 59 - Reserved5:4; // 63:60 - } bits; - unsigned long long val; -}; - -/* - * Clock ratio tables. Div/Mod by 10 to get ratio. - * The eblcr values specify the ratio read from the CPU. - * The mults values specify what to write to the CPU. - */ - -/* - * VIA C3 Samuel 1 & Samuel 2 (stepping 0) - */ -static const int __cpuinitdata samuel1_mults[16] = { - -1, /* 0000 -> RESERVED */ - 30, /* 0001 -> 3.0x */ - 40, /* 0010 -> 4.0x */ - -1, /* 0011 -> RESERVED */ - -1, /* 0100 -> RESERVED */ - 35, /* 0101 -> 3.5x */ - 45, /* 0110 -> 4.5x */ - 55, /* 0111 -> 5.5x */ - 60, /* 1000 -> 6.0x */ - 70, /* 1001 -> 7.0x */ - 80, /* 1010 -> 8.0x */ - 50, /* 1011 -> 5.0x */ - 65, /* 1100 -> 6.5x */ - 75, /* 1101 -> 7.5x */ - -1, /* 1110 -> RESERVED */ - -1, /* 1111 -> RESERVED */ -}; - -static const int __cpuinitdata samuel1_eblcr[16] = { - 50, /* 0000 -> RESERVED */ - 30, /* 0001 -> 3.0x */ - 40, /* 0010 -> 4.0x */ - -1, /* 0011 -> RESERVED */ - 55, /* 0100 -> 5.5x */ - 35, /* 0101 -> 3.5x */ - 45, /* 0110 -> 4.5x */ - -1, /* 0111 -> RESERVED */ - -1, /* 1000 -> RESERVED */ - 70, /* 1001 -> 7.0x */ - 80, /* 1010 -> 8.0x */ - 60, /* 1011 -> 6.0x */ - -1, /* 1100 -> RESERVED */ - 75, /* 1101 -> 7.5x */ - -1, /* 1110 -> RESERVED */ - 65, /* 1111 -> 6.5x */ -}; - -/* - * VIA C3 Samuel2 Stepping 1->15 - */ -static const int __cpuinitdata samuel2_eblcr[16] = { - 50, /* 0000 -> 5.0x */ - 30, /* 0001 -> 3.0x */ - 40, /* 0010 -> 4.0x */ - 100, /* 0011 -> 10.0x */ - 55, /* 0100 -> 5.5x */ - 35, /* 0101 -> 3.5x */ - 45, /* 0110 -> 4.5x */ - 110, /* 0111 -> 11.0x */ - 90, /* 1000 -> 9.0x */ - 70, /* 1001 -> 7.0x */ - 80, /* 1010 -> 8.0x */ - 60, /* 1011 -> 6.0x */ - 120, /* 1100 -> 12.0x */ - 75, /* 1101 -> 7.5x */ - 130, /* 1110 -> 13.0x */ - 65, /* 1111 -> 6.5x */ -}; - -/* - * VIA C3 Ezra - */ -static const int __cpuinitdata ezra_mults[16] = { - 100, /* 0000 -> 10.0x */ - 30, /* 0001 -> 3.0x */ - 40, /* 0010 -> 4.0x */ - 90, /* 0011 -> 9.0x */ - 95, /* 0100 -> 9.5x */ - 35, /* 0101 -> 3.5x */ - 45, /* 0110 -> 4.5x */ - 55, /* 0111 -> 5.5x */ - 60, /* 1000 -> 6.0x */ - 70, /* 1001 -> 7.0x */ - 80, /* 1010 -> 8.0x */ - 50, /* 1011 -> 5.0x */ - 65, /* 1100 -> 6.5x */ - 75, /* 1101 -> 7.5x */ - 85, /* 1110 -> 8.5x */ - 120, /* 1111 -> 12.0x */ -}; - -static const int __cpuinitdata ezra_eblcr[16] = { - 50, /* 0000 -> 5.0x */ - 30, /* 0001 -> 3.0x */ - 40, /* 0010 -> 4.0x */ - 100, /* 0011 -> 10.0x */ - 55, /* 0100 -> 5.5x */ - 35, /* 0101 -> 3.5x */ - 45, /* 0110 -> 4.5x */ - 95, /* 0111 -> 9.5x */ - 90, /* 1000 -> 9.0x */ - 70, /* 1001 -> 7.0x */ - 80, /* 1010 -> 8.0x */ - 60, /* 1011 -> 6.0x */ - 120, /* 1100 -> 12.0x */ - 75, /* 1101 -> 7.5x */ - 85, /* 1110 -> 8.5x */ - 65, /* 1111 -> 6.5x */ -}; - -/* - * VIA C3 (Ezra-T) [C5M]. - */ -static const int __cpuinitdata ezrat_mults[32] = { - 100, /* 0000 -> 10.0x */ - 30, /* 0001 -> 3.0x */ - 40, /* 0010 -> 4.0x */ - 90, /* 0011 -> 9.0x */ - 95, /* 0100 -> 9.5x */ - 35, /* 0101 -> 3.5x */ - 45, /* 0110 -> 4.5x */ - 55, /* 0111 -> 5.5x */ - 60, /* 1000 -> 6.0x */ - 70, /* 1001 -> 7.0x */ - 80, /* 1010 -> 8.0x */ - 50, /* 1011 -> 5.0x */ - 65, /* 1100 -> 6.5x */ - 75, /* 1101 -> 7.5x */ - 85, /* 1110 -> 8.5x */ - 120, /* 1111 -> 12.0x */ - - -1, /* 0000 -> RESERVED (10.0x) */ - 110, /* 0001 -> 11.0x */ - -1, /* 0010 -> 12.0x */ - -1, /* 0011 -> RESERVED (9.0x)*/ - 105, /* 0100 -> 10.5x */ - 115, /* 0101 -> 11.5x */ - 125, /* 0110 -> 12.5x */ - 135, /* 0111 -> 13.5x */ - 140, /* 1000 -> 14.0x */ - 150, /* 1001 -> 15.0x */ - 160, /* 1010 -> 16.0x */ - 130, /* 1011 -> 13.0x */ - 145, /* 1100 -> 14.5x */ - 155, /* 1101 -> 15.5x */ - -1, /* 1110 -> RESERVED (13.0x) */ - -1, /* 1111 -> RESERVED (12.0x) */ -}; - -static const int __cpuinitdata ezrat_eblcr[32] = { - 50, /* 0000 -> 5.0x */ - 30, /* 0001 -> 3.0x */ - 40, /* 0010 -> 4.0x */ - 100, /* 0011 -> 10.0x */ - 55, /* 0100 -> 5.5x */ - 35, /* 0101 -> 3.5x */ - 45, /* 0110 -> 4.5x */ - 95, /* 0111 -> 9.5x */ - 90, /* 1000 -> 9.0x */ - 70, /* 1001 -> 7.0x */ - 80, /* 1010 -> 8.0x */ - 60, /* 1011 -> 6.0x */ - 120, /* 1100 -> 12.0x */ - 75, /* 1101 -> 7.5x */ - 85, /* 1110 -> 8.5x */ - 65, /* 1111 -> 6.5x */ - - -1, /* 0000 -> RESERVED (9.0x) */ - 110, /* 0001 -> 11.0x */ - 120, /* 0010 -> 12.0x */ - -1, /* 0011 -> RESERVED (10.0x)*/ - 135, /* 0100 -> 13.5x */ - 115, /* 0101 -> 11.5x */ - 125, /* 0110 -> 12.5x */ - 105, /* 0111 -> 10.5x */ - 130, /* 1000 -> 13.0x */ - 150, /* 1001 -> 15.0x */ - 160, /* 1010 -> 16.0x */ - 140, /* 1011 -> 14.0x */ - -1, /* 1100 -> RESERVED (12.0x) */ - 155, /* 1101 -> 15.5x */ - -1, /* 1110 -> RESERVED (13.0x) */ - 145, /* 1111 -> 14.5x */ -}; - -/* - * VIA C3 Nehemiah */ - -static const int __cpuinitdata nehemiah_mults[32] = { - 100, /* 0000 -> 10.0x */ - -1, /* 0001 -> 16.0x */ - 40, /* 0010 -> 4.0x */ - 90, /* 0011 -> 9.0x */ - 95, /* 0100 -> 9.5x */ - -1, /* 0101 -> RESERVED */ - 45, /* 0110 -> 4.5x */ - 55, /* 0111 -> 5.5x */ - 60, /* 1000 -> 6.0x */ - 70, /* 1001 -> 7.0x */ - 80, /* 1010 -> 8.0x */ - 50, /* 1011 -> 5.0x */ - 65, /* 1100 -> 6.5x */ - 75, /* 1101 -> 7.5x */ - 85, /* 1110 -> 8.5x */ - 120, /* 1111 -> 12.0x */ - -1, /* 0000 -> 10.0x */ - 110, /* 0001 -> 11.0x */ - -1, /* 0010 -> 12.0x */ - -1, /* 0011 -> 9.0x */ - 105, /* 0100 -> 10.5x */ - 115, /* 0101 -> 11.5x */ - 125, /* 0110 -> 12.5x */ - 135, /* 0111 -> 13.5x */ - 140, /* 1000 -> 14.0x */ - 150, /* 1001 -> 15.0x */ - 160, /* 1010 -> 16.0x */ - 130, /* 1011 -> 13.0x */ - 145, /* 1100 -> 14.5x */ - 155, /* 1101 -> 15.5x */ - -1, /* 1110 -> RESERVED (13.0x) */ - -1, /* 1111 -> 12.0x */ -}; - -static const int __cpuinitdata nehemiah_eblcr[32] = { - 50, /* 0000 -> 5.0x */ - 160, /* 0001 -> 16.0x */ - 40, /* 0010 -> 4.0x */ - 100, /* 0011 -> 10.0x */ - 55, /* 0100 -> 5.5x */ - -1, /* 0101 -> RESERVED */ - 45, /* 0110 -> 4.5x */ - 95, /* 0111 -> 9.5x */ - 90, /* 1000 -> 9.0x */ - 70, /* 1001 -> 7.0x */ - 80, /* 1010 -> 8.0x */ - 60, /* 1011 -> 6.0x */ - 120, /* 1100 -> 12.0x */ - 75, /* 1101 -> 7.5x */ - 85, /* 1110 -> 8.5x */ - 65, /* 1111 -> 6.5x */ - 90, /* 0000 -> 9.0x */ - 110, /* 0001 -> 11.0x */ - 120, /* 0010 -> 12.0x */ - 100, /* 0011 -> 10.0x */ - 135, /* 0100 -> 13.5x */ - 115, /* 0101 -> 11.5x */ - 125, /* 0110 -> 12.5x */ - 105, /* 0111 -> 10.5x */ - 130, /* 1000 -> 13.0x */ - 150, /* 1001 -> 15.0x */ - 160, /* 1010 -> 16.0x */ - 140, /* 1011 -> 14.0x */ - 120, /* 1100 -> 12.0x */ - 155, /* 1101 -> 15.5x */ - -1, /* 1110 -> RESERVED (13.0x) */ - 145 /* 1111 -> 14.5x */ -}; - -/* - * Voltage scales. Div/Mod by 1000 to get actual voltage. - * Which scale to use depends on the VRM type in use. - */ - -struct mV_pos { - unsigned short mV; - unsigned short pos; -}; - -static const struct mV_pos __cpuinitdata vrm85_mV[32] = { - {1250, 8}, {1200, 6}, {1150, 4}, {1100, 2}, - {1050, 0}, {1800, 30}, {1750, 28}, {1700, 26}, - {1650, 24}, {1600, 22}, {1550, 20}, {1500, 18}, - {1450, 16}, {1400, 14}, {1350, 12}, {1300, 10}, - {1275, 9}, {1225, 7}, {1175, 5}, {1125, 3}, - {1075, 1}, {1825, 31}, {1775, 29}, {1725, 27}, - {1675, 25}, {1625, 23}, {1575, 21}, {1525, 19}, - {1475, 17}, {1425, 15}, {1375, 13}, {1325, 11} -}; - -static const unsigned char __cpuinitdata mV_vrm85[32] = { - 0x04, 0x14, 0x03, 0x13, 0x02, 0x12, 0x01, 0x11, - 0x00, 0x10, 0x0f, 0x1f, 0x0e, 0x1e, 0x0d, 0x1d, - 0x0c, 0x1c, 0x0b, 0x1b, 0x0a, 0x1a, 0x09, 0x19, - 0x08, 0x18, 0x07, 0x17, 0x06, 0x16, 0x05, 0x15 -}; - -static const struct mV_pos __cpuinitdata mobilevrm_mV[32] = { - {1750, 31}, {1700, 30}, {1650, 29}, {1600, 28}, - {1550, 27}, {1500, 26}, {1450, 25}, {1400, 24}, - {1350, 23}, {1300, 22}, {1250, 21}, {1200, 20}, - {1150, 19}, {1100, 18}, {1050, 17}, {1000, 16}, - {975, 15}, {950, 14}, {925, 13}, {900, 12}, - {875, 11}, {850, 10}, {825, 9}, {800, 8}, - {775, 7}, {750, 6}, {725, 5}, {700, 4}, - {675, 3}, {650, 2}, {625, 1}, {600, 0} -}; - -static const unsigned char __cpuinitdata mV_mobilevrm[32] = { - 0x1f, 0x1e, 0x1d, 0x1c, 0x1b, 0x1a, 0x19, 0x18, - 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, - 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08, - 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00 -}; - diff --git a/arch/x86/kernel/cpu/cpufreq/longrun.c b/arch/x86/kernel/cpu/cpufreq/longrun.c deleted file mode 100644 index 34ea359..0000000 --- a/arch/x86/kernel/cpu/cpufreq/longrun.c +++ /dev/null @@ -1,324 +0,0 @@ -/* - * (C) 2002 - 2003 Dominik Brodowski - * - * Licensed under the terms of the GNU GPL License version 2. - * - * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous* - */ - -#include -#include -#include -#include -#include - -#include -#include - -static struct cpufreq_driver longrun_driver; - -/** - * longrun_{low,high}_freq is needed for the conversion of cpufreq kHz - * values into per cent values. In TMTA microcode, the following is valid: - * performance_pctg = (current_freq - low_freq)/(high_freq - low_freq) - */ -static unsigned int longrun_low_freq, longrun_high_freq; - - -/** - * longrun_get_policy - get the current LongRun policy - * @policy: struct cpufreq_policy where current policy is written into - * - * Reads the current LongRun policy by access to MSR_TMTA_LONGRUN_FLAGS - * and MSR_TMTA_LONGRUN_CTRL - */ -static void __cpuinit longrun_get_policy(struct cpufreq_policy *policy) -{ - u32 msr_lo, msr_hi; - - rdmsr(MSR_TMTA_LONGRUN_FLAGS, msr_lo, msr_hi); - pr_debug("longrun flags are %x - %x\n", msr_lo, msr_hi); - if (msr_lo & 0x01) - policy->policy = CPUFREQ_POLICY_PERFORMANCE; - else - policy->policy = CPUFREQ_POLICY_POWERSAVE; - - rdmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi); - pr_debug("longrun ctrl is %x - %x\n", msr_lo, msr_hi); - msr_lo &= 0x0000007F; - msr_hi &= 0x0000007F; - - if (longrun_high_freq <= longrun_low_freq) { - /* Assume degenerate Longrun table */ - policy->min = policy->max = longrun_high_freq; - } else { - policy->min = longrun_low_freq + msr_lo * - ((longrun_high_freq - longrun_low_freq) / 100); - policy->max = longrun_low_freq + msr_hi * - ((longrun_high_freq - longrun_low_freq) / 100); - } - policy->cpu = 0; -} - - -/** - * longrun_set_policy - sets a new CPUFreq policy - * @policy: new policy - * - * Sets a new CPUFreq policy on LongRun-capable processors. This function - * has to be called with cpufreq_driver locked. - */ -static int longrun_set_policy(struct cpufreq_policy *policy) -{ - u32 msr_lo, msr_hi; - u32 pctg_lo, pctg_hi; - - if (!policy) - return -EINVAL; - - if (longrun_high_freq <= longrun_low_freq) { - /* Assume degenerate Longrun table */ - pctg_lo = pctg_hi = 100; - } else { - pctg_lo = (policy->min - longrun_low_freq) / - ((longrun_high_freq - longrun_low_freq) / 100); - pctg_hi = (policy->max - longrun_low_freq) / - ((longrun_high_freq - longrun_low_freq) / 100); - } - - if (pctg_hi > 100) - pctg_hi = 100; - if (pctg_lo > pctg_hi) - pctg_lo = pctg_hi; - - /* performance or economy mode */ - rdmsr(MSR_TMTA_LONGRUN_FLAGS, msr_lo, msr_hi); - msr_lo &= 0xFFFFFFFE; - switch (policy->policy) { - case CPUFREQ_POLICY_PERFORMANCE: - msr_lo |= 0x00000001; - break; - case CPUFREQ_POLICY_POWERSAVE: - break; - } - wrmsr(MSR_TMTA_LONGRUN_FLAGS, msr_lo, msr_hi); - - /* lower and upper boundary */ - rdmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi); - msr_lo &= 0xFFFFFF80; - msr_hi &= 0xFFFFFF80; - msr_lo |= pctg_lo; - msr_hi |= pctg_hi; - wrmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi); - - return 0; -} - - -/** - * longrun_verify_poliy - verifies a new CPUFreq policy - * @policy: the policy to verify - * - * Validates a new CPUFreq policy. This function has to be called with - * cpufreq_driver locked. - */ -static int longrun_verify_policy(struct cpufreq_policy *policy) -{ - if (!policy) - return -EINVAL; - - policy->cpu = 0; - cpufreq_verify_within_limits(policy, - policy->cpuinfo.min_freq, - policy->cpuinfo.max_freq); - - if ((policy->policy != CPUFREQ_POLICY_POWERSAVE) && - (policy->policy != CPUFREQ_POLICY_PERFORMANCE)) - return -EINVAL; - - return 0; -} - -static unsigned int longrun_get(unsigned int cpu) -{ - u32 eax, ebx, ecx, edx; - - if (cpu) - return 0; - - cpuid(0x80860007, &eax, &ebx, &ecx, &edx); - pr_debug("cpuid eax is %u\n", eax); - - return eax * 1000; -} - -/** - * longrun_determine_freqs - determines the lowest and highest possible core frequency - * @low_freq: an int to put the lowest frequency into - * @high_freq: an int to put the highest frequency into - * - * Determines the lowest and highest possible core frequencies on this CPU. - * This is necessary to calculate the performance percentage according to - * TMTA rules: - * performance_pctg = (target_freq - low_freq)/(high_freq - low_freq) - */ -static int __cpuinit longrun_determine_freqs(unsigned int *low_freq, - unsigned int *high_freq) -{ - u32 msr_lo, msr_hi; - u32 save_lo, save_hi; - u32 eax, ebx, ecx, edx; - u32 try_hi; - struct cpuinfo_x86 *c = &cpu_data(0); - - if (!low_freq || !high_freq) - return -EINVAL; - - if (cpu_has(c, X86_FEATURE_LRTI)) { - /* if the LongRun Table Interface is present, the - * detection is a bit easier: - * For minimum frequency, read out the maximum - * level (msr_hi), write that into "currently - * selected level", and read out the frequency. - * For maximum frequency, read out level zero. - */ - /* minimum */ - rdmsr(MSR_TMTA_LRTI_READOUT, msr_lo, msr_hi); - wrmsr(MSR_TMTA_LRTI_READOUT, msr_hi, msr_hi); - rdmsr(MSR_TMTA_LRTI_VOLT_MHZ, msr_lo, msr_hi); - *low_freq = msr_lo * 1000; /* to kHz */ - - /* maximum */ - wrmsr(MSR_TMTA_LRTI_READOUT, 0, msr_hi); - rdmsr(MSR_TMTA_LRTI_VOLT_MHZ, msr_lo, msr_hi); - *high_freq = msr_lo * 1000; /* to kHz */ - - pr_debug("longrun table interface told %u - %u kHz\n", - *low_freq, *high_freq); - - if (*low_freq > *high_freq) - *low_freq = *high_freq; - return 0; - } - - /* set the upper border to the value determined during TSC init */ - *high_freq = (cpu_khz / 1000); - *high_freq = *high_freq * 1000; - pr_debug("high frequency is %u kHz\n", *high_freq); - - /* get current borders */ - rdmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi); - save_lo = msr_lo & 0x0000007F; - save_hi = msr_hi & 0x0000007F; - - /* if current perf_pctg is larger than 90%, we need to decrease the - * upper limit to make the calculation more accurate. - */ - cpuid(0x80860007, &eax, &ebx, &ecx, &edx); - /* try decreasing in 10% steps, some processors react only - * on some barrier values */ - for (try_hi = 80; try_hi > 0 && ecx > 90; try_hi -= 10) { - /* set to 0 to try_hi perf_pctg */ - msr_lo &= 0xFFFFFF80; - msr_hi &= 0xFFFFFF80; - msr_hi |= try_hi; - wrmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi); - - /* read out current core MHz and current perf_pctg */ - cpuid(0x80860007, &eax, &ebx, &ecx, &edx); - - /* restore values */ - wrmsr(MSR_TMTA_LONGRUN_CTRL, save_lo, save_hi); - } - pr_debug("percentage is %u %%, freq is %u MHz\n", ecx, eax); - - /* performance_pctg = (current_freq - low_freq)/(high_freq - low_freq) - * eqals - * low_freq * (1 - perf_pctg) = (cur_freq - high_freq * perf_pctg) - * - * high_freq * perf_pctg is stored tempoarily into "ebx". - */ - ebx = (((cpu_khz / 1000) * ecx) / 100); /* to MHz */ - - if ((ecx > 95) || (ecx == 0) || (eax < ebx)) - return -EIO; - - edx = ((eax - ebx) * 100) / (100 - ecx); - *low_freq = edx * 1000; /* back to kHz */ - - pr_debug("low frequency is %u kHz\n", *low_freq); - - if (*low_freq > *high_freq) - *low_freq = *high_freq; - - return 0; -} - - -static int __cpuinit longrun_cpu_init(struct cpufreq_policy *policy) -{ - int result = 0; - - /* capability check */ - if (policy->cpu != 0) - return -ENODEV; - - /* detect low and high frequency */ - result = longrun_determine_freqs(&longrun_low_freq, &longrun_high_freq); - if (result) - return result; - - /* cpuinfo and default policy values */ - policy->cpuinfo.min_freq = longrun_low_freq; - policy->cpuinfo.max_freq = longrun_high_freq; - policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL; - longrun_get_policy(policy); - - return 0; -} - - -static struct cpufreq_driver longrun_driver = { - .flags = CPUFREQ_CONST_LOOPS, - .verify = longrun_verify_policy, - .setpolicy = longrun_set_policy, - .get = longrun_get, - .init = longrun_cpu_init, - .name = "longrun", - .owner = THIS_MODULE, -}; - - -/** - * longrun_init - initializes the Transmeta Crusoe LongRun CPUFreq driver - * - * Initializes the LongRun support. - */ -static int __init longrun_init(void) -{ - struct cpuinfo_x86 *c = &cpu_data(0); - - if (c->x86_vendor != X86_VENDOR_TRANSMETA || - !cpu_has(c, X86_FEATURE_LONGRUN)) - return -ENODEV; - - return cpufreq_register_driver(&longrun_driver); -} - - -/** - * longrun_exit - unregisters LongRun support - */ -static void __exit longrun_exit(void) -{ - cpufreq_unregister_driver(&longrun_driver); -} - - -MODULE_AUTHOR("Dominik Brodowski "); -MODULE_DESCRIPTION("LongRun driver for Transmeta Crusoe and " - "Efficeon processors."); -MODULE_LICENSE("GPL"); - -module_init(longrun_init); -module_exit(longrun_exit); diff --git a/arch/x86/kernel/cpu/cpufreq/mperf.c b/arch/x86/kernel/cpu/cpufreq/mperf.c deleted file mode 100644 index 911e193..0000000 --- a/arch/x86/kernel/cpu/cpufreq/mperf.c +++ /dev/null @@ -1,51 +0,0 @@ -#include -#include -#include -#include -#include -#include - -#include "mperf.h" - -static DEFINE_PER_CPU(struct aperfmperf, acfreq_old_perf); - -/* Called via smp_call_function_single(), on the target CPU */ -static void read_measured_perf_ctrs(void *_cur) -{ - struct aperfmperf *am = _cur; - - get_aperfmperf(am); -} - -/* - * Return the measured active (C0) frequency on this CPU since last call - * to this function. - * Input: cpu number - * Return: Average CPU frequency in terms of max frequency (zero on error) - * - * We use IA32_MPERF and IA32_APERF MSRs to get the measured performance - * over a period of time, while CPU is in C0 state. - * IA32_MPERF counts at the rate of max advertised frequency - * IA32_APERF counts at the rate of actual CPU frequency - * Only IA32_APERF/IA32_MPERF ratio is architecturally defined and - * no meaning should be associated with absolute values of these MSRs. - */ -unsigned int cpufreq_get_measured_perf(struct cpufreq_policy *policy, - unsigned int cpu) -{ - struct aperfmperf perf; - unsigned long ratio; - unsigned int retval; - - if (smp_call_function_single(cpu, read_measured_perf_ctrs, &perf, 1)) - return 0; - - ratio = calc_aperfmperf_ratio(&per_cpu(acfreq_old_perf, cpu), &perf); - per_cpu(acfreq_old_perf, cpu) = perf; - - retval = (policy->cpuinfo.max_freq * ratio) >> APERFMPERF_SHIFT; - - return retval; -} -EXPORT_SYMBOL_GPL(cpufreq_get_measured_perf); -MODULE_LICENSE("GPL"); diff --git a/arch/x86/kernel/cpu/cpufreq/mperf.h b/arch/x86/kernel/cpu/cpufreq/mperf.h deleted file mode 100644 index 5dbf295..0000000 --- a/arch/x86/kernel/cpu/cpufreq/mperf.h +++ /dev/null @@ -1,9 +0,0 @@ -/* - * (c) 2010 Advanced Micro Devices, Inc. - * Your use of this code is subject to the terms and conditions of the - * GNU general public license version 2. See "COPYING" or - * http://www.gnu.org/licenses/gpl.html - */ - -unsigned int cpufreq_get_measured_perf(struct cpufreq_policy *policy, - unsigned int cpu); diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c deleted file mode 100644 index 6be3e07..0000000 --- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c +++ /dev/null @@ -1,329 +0,0 @@ -/* - * Pentium 4/Xeon CPU on demand clock modulation/speed scaling - * (C) 2002 - 2003 Dominik Brodowski - * (C) 2002 Zwane Mwaikambo - * (C) 2002 Arjan van de Ven - * (C) 2002 Tora T. Engstad - * All Rights Reserved - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * The author(s) of this software shall not be held liable for damages - * of any nature resulting due to the use of this software. This - * software is provided AS-IS with no warranties. - * - * Date Errata Description - * 20020525 N44, O17 12.5% or 25% DC causes lockup - * - */ - -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include "speedstep-lib.h" - -#define PFX "p4-clockmod: " - -/* - * Duty Cycle (3bits), note DC_DISABLE is not specified in - * intel docs i just use it to mean disable - */ -enum { - DC_RESV, DC_DFLT, DC_25PT, DC_38PT, DC_50PT, - DC_64PT, DC_75PT, DC_88PT, DC_DISABLE -}; - -#define DC_ENTRIES 8 - - -static int has_N44_O17_errata[NR_CPUS]; -static unsigned int stock_freq; -static struct cpufreq_driver p4clockmod_driver; -static unsigned int cpufreq_p4_get(unsigned int cpu); - -static int cpufreq_p4_setdc(unsigned int cpu, unsigned int newstate) -{ - u32 l, h; - - if (!cpu_online(cpu) || - (newstate > DC_DISABLE) || (newstate == DC_RESV)) - return -EINVAL; - - rdmsr_on_cpu(cpu, MSR_IA32_THERM_STATUS, &l, &h); - - if (l & 0x01) - pr_debug("CPU#%d currently thermal throttled\n", cpu); - - if (has_N44_O17_errata[cpu] && - (newstate == DC_25PT || newstate == DC_DFLT)) - newstate = DC_38PT; - - rdmsr_on_cpu(cpu, MSR_IA32_THERM_CONTROL, &l, &h); - if (newstate == DC_DISABLE) { - pr_debug("CPU#%d disabling modulation\n", cpu); - wrmsr_on_cpu(cpu, MSR_IA32_THERM_CONTROL, l & ~(1<<4), h); - } else { - pr_debug("CPU#%d setting duty cycle to %d%%\n", - cpu, ((125 * newstate) / 10)); - /* bits 63 - 5 : reserved - * bit 4 : enable/disable - * bits 3-1 : duty cycle - * bit 0 : reserved - */ - l = (l & ~14); - l = l | (1<<4) | ((newstate & 0x7)<<1); - wrmsr_on_cpu(cpu, MSR_IA32_THERM_CONTROL, l, h); - } - - return 0; -} - - -static struct cpufreq_frequency_table p4clockmod_table[] = { - {DC_RESV, CPUFREQ_ENTRY_INVALID}, - {DC_DFLT, 0}, - {DC_25PT, 0}, - {DC_38PT, 0}, - {DC_50PT, 0}, - {DC_64PT, 0}, - {DC_75PT, 0}, - {DC_88PT, 0}, - {DC_DISABLE, 0}, - {DC_RESV, CPUFREQ_TABLE_END}, -}; - - -static int cpufreq_p4_target(struct cpufreq_policy *policy, - unsigned int target_freq, - unsigned int relation) -{ - unsigned int newstate = DC_RESV; - struct cpufreq_freqs freqs; - int i; - - if (cpufreq_frequency_table_target(policy, &p4clockmod_table[0], - target_freq, relation, &newstate)) - return -EINVAL; - - freqs.old = cpufreq_p4_get(policy->cpu); - freqs.new = stock_freq * p4clockmod_table[newstate].index / 8; - - if (freqs.new == freqs.old) - return 0; - - /* notifiers */ - for_each_cpu(i, policy->cpus) { - freqs.cpu = i; - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - } - - /* run on each logical CPU, - * see section 13.15.3 of IA32 Intel Architecture Software - * Developer's Manual, Volume 3 - */ - for_each_cpu(i, policy->cpus) - cpufreq_p4_setdc(i, p4clockmod_table[newstate].index); - - /* notifiers */ - for_each_cpu(i, policy->cpus) { - freqs.cpu = i; - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); - } - - return 0; -} - - -static int cpufreq_p4_verify(struct cpufreq_policy *policy) -{ - return cpufreq_frequency_table_verify(policy, &p4clockmod_table[0]); -} - - -static unsigned int cpufreq_p4_get_frequency(struct cpuinfo_x86 *c) -{ - if (c->x86 == 0x06) { - if (cpu_has(c, X86_FEATURE_EST)) - printk_once(KERN_WARNING PFX "Warning: EST-capable " - "CPU detected. The acpi-cpufreq module offers " - "voltage scaling in addition to frequency " - "scaling. You should use that instead of " - "p4-clockmod, if possible.\n"); - switch (c->x86_model) { - case 0x0E: /* Core */ - case 0x0F: /* Core Duo */ - case 0x16: /* Celeron Core */ - case 0x1C: /* Atom */ - p4clockmod_driver.flags |= CPUFREQ_CONST_LOOPS; - return speedstep_get_frequency(SPEEDSTEP_CPU_PCORE); - case 0x0D: /* Pentium M (Dothan) */ - p4clockmod_driver.flags |= CPUFREQ_CONST_LOOPS; - /* fall through */ - case 0x09: /* Pentium M (Banias) */ - return speedstep_get_frequency(SPEEDSTEP_CPU_PM); - } - } - - if (c->x86 != 0xF) - return 0; - - /* on P-4s, the TSC runs with constant frequency independent whether - * throttling is active or not. */ - p4clockmod_driver.flags |= CPUFREQ_CONST_LOOPS; - - if (speedstep_detect_processor() == SPEEDSTEP_CPU_P4M) { - printk(KERN_WARNING PFX "Warning: Pentium 4-M detected. " - "The speedstep-ich or acpi cpufreq modules offer " - "voltage scaling in addition of frequency scaling. " - "You should use either one instead of p4-clockmod, " - "if possible.\n"); - return speedstep_get_frequency(SPEEDSTEP_CPU_P4M); - } - - return speedstep_get_frequency(SPEEDSTEP_CPU_P4D); -} - - - -static int cpufreq_p4_cpu_init(struct cpufreq_policy *policy) -{ - struct cpuinfo_x86 *c = &cpu_data(policy->cpu); - int cpuid = 0; - unsigned int i; - -#ifdef CONFIG_SMP - cpumask_copy(policy->cpus, cpu_sibling_mask(policy->cpu)); -#endif - - /* Errata workaround */ - cpuid = (c->x86 << 8) | (c->x86_model << 4) | c->x86_mask; - switch (cpuid) { - case 0x0f07: - case 0x0f0a: - case 0x0f11: - case 0x0f12: - has_N44_O17_errata[policy->cpu] = 1; - pr_debug("has errata -- disabling low frequencies\n"); - } - - if (speedstep_detect_processor() == SPEEDSTEP_CPU_P4D && - c->x86_model < 2) { - /* switch to maximum frequency and measure result */ - cpufreq_p4_setdc(policy->cpu, DC_DISABLE); - recalibrate_cpu_khz(); - } - /* get max frequency */ - stock_freq = cpufreq_p4_get_frequency(c); - if (!stock_freq) - return -EINVAL; - - /* table init */ - for (i = 1; (p4clockmod_table[i].frequency != CPUFREQ_TABLE_END); i++) { - if ((i < 2) && (has_N44_O17_errata[policy->cpu])) - p4clockmod_table[i].frequency = CPUFREQ_ENTRY_INVALID; - else - p4clockmod_table[i].frequency = (stock_freq * i)/8; - } - cpufreq_frequency_table_get_attr(p4clockmod_table, policy->cpu); - - /* cpuinfo and default policy values */ - - /* the transition latency is set to be 1 higher than the maximum - * transition latency of the ondemand governor */ - policy->cpuinfo.transition_latency = 10000001; - policy->cur = stock_freq; - - return cpufreq_frequency_table_cpuinfo(policy, &p4clockmod_table[0]); -} - - -static int cpufreq_p4_cpu_exit(struct cpufreq_policy *policy) -{ - cpufreq_frequency_table_put_attr(policy->cpu); - return 0; -} - -static unsigned int cpufreq_p4_get(unsigned int cpu) -{ - u32 l, h; - - rdmsr_on_cpu(cpu, MSR_IA32_THERM_CONTROL, &l, &h); - - if (l & 0x10) { - l = l >> 1; - l &= 0x7; - } else - l = DC_DISABLE; - - if (l != DC_DISABLE) - return stock_freq * l / 8; - - return stock_freq; -} - -static struct freq_attr *p4clockmod_attr[] = { - &cpufreq_freq_attr_scaling_available_freqs, - NULL, -}; - -static struct cpufreq_driver p4clockmod_driver = { - .verify = cpufreq_p4_verify, - .target = cpufreq_p4_target, - .init = cpufreq_p4_cpu_init, - .exit = cpufreq_p4_cpu_exit, - .get = cpufreq_p4_get, - .name = "p4-clockmod", - .owner = THIS_MODULE, - .attr = p4clockmod_attr, -}; - - -static int __init cpufreq_p4_init(void) -{ - struct cpuinfo_x86 *c = &cpu_data(0); - int ret; - - /* - * THERM_CONTROL is architectural for IA32 now, so - * we can rely on the capability checks - */ - if (c->x86_vendor != X86_VENDOR_INTEL) - return -ENODEV; - - if (!test_cpu_cap(c, X86_FEATURE_ACPI) || - !test_cpu_cap(c, X86_FEATURE_ACC)) - return -ENODEV; - - ret = cpufreq_register_driver(&p4clockmod_driver); - if (!ret) - printk(KERN_INFO PFX "P4/Xeon(TM) CPU On-Demand Clock " - "Modulation available\n"); - - return ret; -} - - -static void __exit cpufreq_p4_exit(void) -{ - cpufreq_unregister_driver(&p4clockmod_driver); -} - - -MODULE_AUTHOR("Zwane Mwaikambo "); -MODULE_DESCRIPTION("cpufreq driver for Pentium(TM) 4/Xeon(TM)"); -MODULE_LICENSE("GPL"); - -late_initcall(cpufreq_p4_init); -module_exit(cpufreq_p4_exit); diff --git a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c deleted file mode 100644 index 7b0603e..0000000 --- a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c +++ /dev/null @@ -1,621 +0,0 @@ -/* - * pcc-cpufreq.c - Processor Clocking Control firmware cpufreq interface - * - * Copyright (C) 2009 Red Hat, Matthew Garrett - * Copyright (C) 2009 Hewlett-Packard Development Company, L.P. - * Nagananda Chumbalkar - * - * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; version 2 of the License. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or NON - * INFRINGEMENT. See the GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License along - * with this program; if not, write to the Free Software Foundation, Inc., - * 675 Mass Ave, Cambridge, MA 02139, USA. - * - * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#include - -#define PCC_VERSION "1.10.00" -#define POLL_LOOPS 300 - -#define CMD_COMPLETE 0x1 -#define CMD_GET_FREQ 0x0 -#define CMD_SET_FREQ 0x1 - -#define BUF_SZ 4 - -struct pcc_register_resource { - u8 descriptor; - u16 length; - u8 space_id; - u8 bit_width; - u8 bit_offset; - u8 access_size; - u64 address; -} __attribute__ ((packed)); - -struct pcc_memory_resource { - u8 descriptor; - u16 length; - u8 space_id; - u8 resource_usage; - u8 type_specific; - u64 granularity; - u64 minimum; - u64 maximum; - u64 translation_offset; - u64 address_length; -} __attribute__ ((packed)); - -static struct cpufreq_driver pcc_cpufreq_driver; - -struct pcc_header { - u32 signature; - u16 length; - u8 major; - u8 minor; - u32 features; - u16 command; - u16 status; - u32 latency; - u32 minimum_time; - u32 maximum_time; - u32 nominal; - u32 throttled_frequency; - u32 minimum_frequency; -}; - -static void __iomem *pcch_virt_addr; -static struct pcc_header __iomem *pcch_hdr; - -static DEFINE_SPINLOCK(pcc_lock); - -static struct acpi_generic_address doorbell; - -static u64 doorbell_preserve; -static u64 doorbell_write; - -static u8 OSC_UUID[16] = {0x9F, 0x2C, 0x9B, 0x63, 0x91, 0x70, 0x1f, 0x49, - 0xBB, 0x4F, 0xA5, 0x98, 0x2F, 0xA1, 0xB5, 0x46}; - -struct pcc_cpu { - u32 input_offset; - u32 output_offset; -}; - -static struct pcc_cpu __percpu *pcc_cpu_info; - -static int pcc_cpufreq_verify(struct cpufreq_policy *policy) -{ - cpufreq_verify_within_limits(policy, policy->cpuinfo.min_freq, - policy->cpuinfo.max_freq); - return 0; -} - -static inline void pcc_cmd(void) -{ - u64 doorbell_value; - int i; - - acpi_read(&doorbell_value, &doorbell); - acpi_write((doorbell_value & doorbell_preserve) | doorbell_write, - &doorbell); - - for (i = 0; i < POLL_LOOPS; i++) { - if (ioread16(&pcch_hdr->status) & CMD_COMPLETE) - break; - } -} - -static inline void pcc_clear_mapping(void) -{ - if (pcch_virt_addr) - iounmap(pcch_virt_addr); - pcch_virt_addr = NULL; -} - -static unsigned int pcc_get_freq(unsigned int cpu) -{ - struct pcc_cpu *pcc_cpu_data; - unsigned int curr_freq; - unsigned int freq_limit; - u16 status; - u32 input_buffer; - u32 output_buffer; - - spin_lock(&pcc_lock); - - pr_debug("get: get_freq for CPU %d\n", cpu); - pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu); - - input_buffer = 0x1; - iowrite32(input_buffer, - (pcch_virt_addr + pcc_cpu_data->input_offset)); - iowrite16(CMD_GET_FREQ, &pcch_hdr->command); - - pcc_cmd(); - - output_buffer = - ioread32(pcch_virt_addr + pcc_cpu_data->output_offset); - - /* Clear the input buffer - we are done with the current command */ - memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ); - - status = ioread16(&pcch_hdr->status); - if (status != CMD_COMPLETE) { - pr_debug("get: FAILED: for CPU %d, status is %d\n", - cpu, status); - goto cmd_incomplete; - } - iowrite16(0, &pcch_hdr->status); - curr_freq = (((ioread32(&pcch_hdr->nominal) * (output_buffer & 0xff)) - / 100) * 1000); - - pr_debug("get: SUCCESS: (virtual) output_offset for cpu %d is " - "0x%p, contains a value of: 0x%x. Speed is: %d MHz\n", - cpu, (pcch_virt_addr + pcc_cpu_data->output_offset), - output_buffer, curr_freq); - - freq_limit = (output_buffer >> 8) & 0xff; - if (freq_limit != 0xff) { - pr_debug("get: frequency for cpu %d is being temporarily" - " capped at %d\n", cpu, curr_freq); - } - - spin_unlock(&pcc_lock); - return curr_freq; - -cmd_incomplete: - iowrite16(0, &pcch_hdr->status); - spin_unlock(&pcc_lock); - return 0; -} - -static int pcc_cpufreq_target(struct cpufreq_policy *policy, - unsigned int target_freq, - unsigned int relation) -{ - struct pcc_cpu *pcc_cpu_data; - struct cpufreq_freqs freqs; - u16 status; - u32 input_buffer; - int cpu; - - spin_lock(&pcc_lock); - cpu = policy->cpu; - pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu); - - pr_debug("target: CPU %d should go to target freq: %d " - "(virtual) input_offset is 0x%p\n", - cpu, target_freq, - (pcch_virt_addr + pcc_cpu_data->input_offset)); - - freqs.new = target_freq; - freqs.cpu = cpu; - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - - input_buffer = 0x1 | (((target_freq * 100) - / (ioread32(&pcch_hdr->nominal) * 1000)) << 8); - iowrite32(input_buffer, - (pcch_virt_addr + pcc_cpu_data->input_offset)); - iowrite16(CMD_SET_FREQ, &pcch_hdr->command); - - pcc_cmd(); - - /* Clear the input buffer - we are done with the current command */ - memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ); - - status = ioread16(&pcch_hdr->status); - if (status != CMD_COMPLETE) { - pr_debug("target: FAILED for cpu %d, with status: 0x%x\n", - cpu, status); - goto cmd_incomplete; - } - iowrite16(0, &pcch_hdr->status); - - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); - pr_debug("target: was SUCCESSFUL for cpu %d\n", cpu); - spin_unlock(&pcc_lock); - - return 0; - -cmd_incomplete: - iowrite16(0, &pcch_hdr->status); - spin_unlock(&pcc_lock); - return -EINVAL; -} - -static int pcc_get_offset(int cpu) -{ - acpi_status status; - struct acpi_buffer buffer = {ACPI_ALLOCATE_BUFFER, NULL}; - union acpi_object *pccp, *offset; - struct pcc_cpu *pcc_cpu_data; - struct acpi_processor *pr; - int ret = 0; - - pr = per_cpu(processors, cpu); - pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu); - - status = acpi_evaluate_object(pr->handle, "PCCP", NULL, &buffer); - if (ACPI_FAILURE(status)) - return -ENODEV; - - pccp = buffer.pointer; - if (!pccp || pccp->type != ACPI_TYPE_PACKAGE) { - ret = -ENODEV; - goto out_free; - }; - - offset = &(pccp->package.elements[0]); - if (!offset || offset->type != ACPI_TYPE_INTEGER) { - ret = -ENODEV; - goto out_free; - } - - pcc_cpu_data->input_offset = offset->integer.value; - - offset = &(pccp->package.elements[1]); - if (!offset || offset->type != ACPI_TYPE_INTEGER) { - ret = -ENODEV; - goto out_free; - } - - pcc_cpu_data->output_offset = offset->integer.value; - - memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ); - memset_io((pcch_virt_addr + pcc_cpu_data->output_offset), 0, BUF_SZ); - - pr_debug("pcc_get_offset: for CPU %d: pcc_cpu_data " - "input_offset: 0x%x, pcc_cpu_data output_offset: 0x%x\n", - cpu, pcc_cpu_data->input_offset, pcc_cpu_data->output_offset); -out_free: - kfree(buffer.pointer); - return ret; -} - -static int __init pcc_cpufreq_do_osc(acpi_handle *handle) -{ - acpi_status status; - struct acpi_object_list input; - struct acpi_buffer output = {ACPI_ALLOCATE_BUFFER, NULL}; - union acpi_object in_params[4]; - union acpi_object *out_obj; - u32 capabilities[2]; - u32 errors; - u32 supported; - int ret = 0; - - input.count = 4; - input.pointer = in_params; - in_params[0].type = ACPI_TYPE_BUFFER; - in_params[0].buffer.length = 16; - in_params[0].buffer.pointer = OSC_UUID; - in_params[1].type = ACPI_TYPE_INTEGER; - in_params[1].integer.value = 1; - in_params[2].type = ACPI_TYPE_INTEGER; - in_params[2].integer.value = 2; - in_params[3].type = ACPI_TYPE_BUFFER; - in_params[3].buffer.length = 8; - in_params[3].buffer.pointer = (u8 *)&capabilities; - - capabilities[0] = OSC_QUERY_ENABLE; - capabilities[1] = 0x1; - - status = acpi_evaluate_object(*handle, "_OSC", &input, &output); - if (ACPI_FAILURE(status)) - return -ENODEV; - - if (!output.length) - return -ENODEV; - - out_obj = output.pointer; - if (out_obj->type != ACPI_TYPE_BUFFER) { - ret = -ENODEV; - goto out_free; - } - - errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0); - if (errors) { - ret = -ENODEV; - goto out_free; - } - - supported = *((u32 *)(out_obj->buffer.pointer + 4)); - if (!(supported & 0x1)) { - ret = -ENODEV; - goto out_free; - } - - kfree(output.pointer); - capabilities[0] = 0x0; - capabilities[1] = 0x1; - - status = acpi_evaluate_object(*handle, "_OSC", &input, &output); - if (ACPI_FAILURE(status)) - return -ENODEV; - - if (!output.length) - return -ENODEV; - - out_obj = output.pointer; - if (out_obj->type != ACPI_TYPE_BUFFER) { - ret = -ENODEV; - goto out_free; - } - - errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0); - if (errors) { - ret = -ENODEV; - goto out_free; - } - - supported = *((u32 *)(out_obj->buffer.pointer + 4)); - if (!(supported & 0x1)) { - ret = -ENODEV; - goto out_free; - } - -out_free: - kfree(output.pointer); - return ret; -} - -static int __init pcc_cpufreq_probe(void) -{ - acpi_status status; - struct acpi_buffer output = {ACPI_ALLOCATE_BUFFER, NULL}; - struct pcc_memory_resource *mem_resource; - struct pcc_register_resource *reg_resource; - union acpi_object *out_obj, *member; - acpi_handle handle, osc_handle, pcch_handle; - int ret = 0; - - status = acpi_get_handle(NULL, "\\_SB", &handle); - if (ACPI_FAILURE(status)) - return -ENODEV; - - status = acpi_get_handle(handle, "PCCH", &pcch_handle); - if (ACPI_FAILURE(status)) - return -ENODEV; - - status = acpi_get_handle(handle, "_OSC", &osc_handle); - if (ACPI_SUCCESS(status)) { - ret = pcc_cpufreq_do_osc(&osc_handle); - if (ret) - pr_debug("probe: _OSC evaluation did not succeed\n"); - /* Firmware's use of _OSC is optional */ - ret = 0; - } - - status = acpi_evaluate_object(handle, "PCCH", NULL, &output); - if (ACPI_FAILURE(status)) - return -ENODEV; - - out_obj = output.pointer; - if (out_obj->type != ACPI_TYPE_PACKAGE) { - ret = -ENODEV; - goto out_free; - } - - member = &out_obj->package.elements[0]; - if (member->type != ACPI_TYPE_BUFFER) { - ret = -ENODEV; - goto out_free; - } - - mem_resource = (struct pcc_memory_resource *)member->buffer.pointer; - - pr_debug("probe: mem_resource descriptor: 0x%x," - " length: %d, space_id: %d, resource_usage: %d," - " type_specific: %d, granularity: 0x%llx," - " minimum: 0x%llx, maximum: 0x%llx," - " translation_offset: 0x%llx, address_length: 0x%llx\n", - mem_resource->descriptor, mem_resource->length, - mem_resource->space_id, mem_resource->resource_usage, - mem_resource->type_specific, mem_resource->granularity, - mem_resource->minimum, mem_resource->maximum, - mem_resource->translation_offset, - mem_resource->address_length); - - if (mem_resource->space_id != ACPI_ADR_SPACE_SYSTEM_MEMORY) { - ret = -ENODEV; - goto out_free; - } - - pcch_virt_addr = ioremap_nocache(mem_resource->minimum, - mem_resource->address_length); - if (pcch_virt_addr == NULL) { - pr_debug("probe: could not map shared mem region\n"); - goto out_free; - } - pcch_hdr = pcch_virt_addr; - - pr_debug("probe: PCCH header (virtual) addr: 0x%p\n", pcch_hdr); - pr_debug("probe: PCCH header is at physical address: 0x%llx," - " signature: 0x%x, length: %d bytes, major: %d, minor: %d," - " supported features: 0x%x, command field: 0x%x," - " status field: 0x%x, nominal latency: %d us\n", - mem_resource->minimum, ioread32(&pcch_hdr->signature), - ioread16(&pcch_hdr->length), ioread8(&pcch_hdr->major), - ioread8(&pcch_hdr->minor), ioread32(&pcch_hdr->features), - ioread16(&pcch_hdr->command), ioread16(&pcch_hdr->status), - ioread32(&pcch_hdr->latency)); - - pr_debug("probe: min time between commands: %d us," - " max time between commands: %d us," - " nominal CPU frequency: %d MHz," - " minimum CPU frequency: %d MHz," - " minimum CPU frequency without throttling: %d MHz\n", - ioread32(&pcch_hdr->minimum_time), - ioread32(&pcch_hdr->maximum_time), - ioread32(&pcch_hdr->nominal), - ioread32(&pcch_hdr->throttled_frequency), - ioread32(&pcch_hdr->minimum_frequency)); - - member = &out_obj->package.elements[1]; - if (member->type != ACPI_TYPE_BUFFER) { - ret = -ENODEV; - goto pcch_free; - } - - reg_resource = (struct pcc_register_resource *)member->buffer.pointer; - - doorbell.space_id = reg_resource->space_id; - doorbell.bit_width = reg_resource->bit_width; - doorbell.bit_offset = reg_resource->bit_offset; - doorbell.access_width = 64; - doorbell.address = reg_resource->address; - - pr_debug("probe: doorbell: space_id is %d, bit_width is %d, " - "bit_offset is %d, access_width is %d, address is 0x%llx\n", - doorbell.space_id, doorbell.bit_width, doorbell.bit_offset, - doorbell.access_width, reg_resource->address); - - member = &out_obj->package.elements[2]; - if (member->type != ACPI_TYPE_INTEGER) { - ret = -ENODEV; - goto pcch_free; - } - - doorbell_preserve = member->integer.value; - - member = &out_obj->package.elements[3]; - if (member->type != ACPI_TYPE_INTEGER) { - ret = -ENODEV; - goto pcch_free; - } - - doorbell_write = member->integer.value; - - pr_debug("probe: doorbell_preserve: 0x%llx," - " doorbell_write: 0x%llx\n", - doorbell_preserve, doorbell_write); - - pcc_cpu_info = alloc_percpu(struct pcc_cpu); - if (!pcc_cpu_info) { - ret = -ENOMEM; - goto pcch_free; - } - - printk(KERN_DEBUG "pcc-cpufreq: (v%s) driver loaded with frequency" - " limits: %d MHz, %d MHz\n", PCC_VERSION, - ioread32(&pcch_hdr->minimum_frequency), - ioread32(&pcch_hdr->nominal)); - kfree(output.pointer); - return ret; -pcch_free: - pcc_clear_mapping(); -out_free: - kfree(output.pointer); - return ret; -} - -static int pcc_cpufreq_cpu_init(struct cpufreq_policy *policy) -{ - unsigned int cpu = policy->cpu; - unsigned int result = 0; - - if (!pcch_virt_addr) { - result = -1; - goto out; - } - - result = pcc_get_offset(cpu); - if (result) { - pr_debug("init: PCCP evaluation failed\n"); - goto out; - } - - policy->max = policy->cpuinfo.max_freq = - ioread32(&pcch_hdr->nominal) * 1000; - policy->min = policy->cpuinfo.min_freq = - ioread32(&pcch_hdr->minimum_frequency) * 1000; - policy->cur = pcc_get_freq(cpu); - - if (!policy->cur) { - pr_debug("init: Unable to get current CPU frequency\n"); - result = -EINVAL; - goto out; - } - - pr_debug("init: policy->max is %d, policy->min is %d\n", - policy->max, policy->min); -out: - return result; -} - -static int pcc_cpufreq_cpu_exit(struct cpufreq_policy *policy) -{ - return 0; -} - -static struct cpufreq_driver pcc_cpufreq_driver = { - .flags = CPUFREQ_CONST_LOOPS, - .get = pcc_get_freq, - .verify = pcc_cpufreq_verify, - .target = pcc_cpufreq_target, - .init = pcc_cpufreq_cpu_init, - .exit = pcc_cpufreq_cpu_exit, - .name = "pcc-cpufreq", - .owner = THIS_MODULE, -}; - -static int __init pcc_cpufreq_init(void) -{ - int ret; - - if (acpi_disabled) - return 0; - - ret = pcc_cpufreq_probe(); - if (ret) { - pr_debug("pcc_cpufreq_init: PCCH evaluation failed\n"); - return ret; - } - - ret = cpufreq_register_driver(&pcc_cpufreq_driver); - - return ret; -} - -static void __exit pcc_cpufreq_exit(void) -{ - cpufreq_unregister_driver(&pcc_cpufreq_driver); - - pcc_clear_mapping(); - - free_percpu(pcc_cpu_info); -} - -MODULE_AUTHOR("Matthew Garrett, Naga Chumbalkar"); -MODULE_VERSION(PCC_VERSION); -MODULE_DESCRIPTION("Processor Clocking Control interface driver"); -MODULE_LICENSE("GPL"); - -late_initcall(pcc_cpufreq_init); -module_exit(pcc_cpufreq_exit); diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c deleted file mode 100644 index b3379d6..0000000 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c +++ /dev/null @@ -1,261 +0,0 @@ -/* - * This file was based upon code in Powertweak Linux (http://powertweak.sf.net) - * (C) 2000-2003 Dave Jones, Arjan van de Ven, Janne Pänkälä, - * Dominik Brodowski. - * - * Licensed under the terms of the GNU GPL License version 2. - * - * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous* - */ - -#include -#include -#include -#include -#include -#include -#include - -#include - -#define POWERNOW_IOPORT 0xfff0 /* it doesn't matter where, as long - as it is unused */ - -#define PFX "powernow-k6: " -static unsigned int busfreq; /* FSB, in 10 kHz */ -static unsigned int max_multiplier; - - -/* Clock ratio multiplied by 10 - see table 27 in AMD#23446 */ -static struct cpufreq_frequency_table clock_ratio[] = { - {45, /* 000 -> 4.5x */ 0}, - {50, /* 001 -> 5.0x */ 0}, - {40, /* 010 -> 4.0x */ 0}, - {55, /* 011 -> 5.5x */ 0}, - {20, /* 100 -> 2.0x */ 0}, - {30, /* 101 -> 3.0x */ 0}, - {60, /* 110 -> 6.0x */ 0}, - {35, /* 111 -> 3.5x */ 0}, - {0, CPUFREQ_TABLE_END} -}; - - -/** - * powernow_k6_get_cpu_multiplier - returns the current FSB multiplier - * - * Returns the current setting of the frequency multiplier. Core clock - * speed is frequency of the Front-Side Bus multiplied with this value. - */ -static int powernow_k6_get_cpu_multiplier(void) -{ - u64 invalue = 0; - u32 msrval; - - msrval = POWERNOW_IOPORT + 0x1; - wrmsr(MSR_K6_EPMR, msrval, 0); /* enable the PowerNow port */ - invalue = inl(POWERNOW_IOPORT + 0x8); - msrval = POWERNOW_IOPORT + 0x0; - wrmsr(MSR_K6_EPMR, msrval, 0); /* disable it again */ - - return clock_ratio[(invalue >> 5)&7].index; -} - - -/** - * powernow_k6_set_state - set the PowerNow! multiplier - * @best_i: clock_ratio[best_i] is the target multiplier - * - * Tries to change the PowerNow! multiplier - */ -static void powernow_k6_set_state(unsigned int best_i) -{ - unsigned long outvalue = 0, invalue = 0; - unsigned long msrval; - struct cpufreq_freqs freqs; - - if (clock_ratio[best_i].index > max_multiplier) { - printk(KERN_ERR PFX "invalid target frequency\n"); - return; - } - - freqs.old = busfreq * powernow_k6_get_cpu_multiplier(); - freqs.new = busfreq * clock_ratio[best_i].index; - freqs.cpu = 0; /* powernow-k6.c is UP only driver */ - - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - - /* we now need to transform best_i to the BVC format, see AMD#23446 */ - - outvalue = (1<<12) | (1<<10) | (1<<9) | (best_i<<5); - - msrval = POWERNOW_IOPORT + 0x1; - wrmsr(MSR_K6_EPMR, msrval, 0); /* enable the PowerNow port */ - invalue = inl(POWERNOW_IOPORT + 0x8); - invalue = invalue & 0xf; - outvalue = outvalue | invalue; - outl(outvalue , (POWERNOW_IOPORT + 0x8)); - msrval = POWERNOW_IOPORT + 0x0; - wrmsr(MSR_K6_EPMR, msrval, 0); /* disable it again */ - - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); - - return; -} - - -/** - * powernow_k6_verify - verifies a new CPUfreq policy - * @policy: new policy - * - * Policy must be within lowest and highest possible CPU Frequency, - * and at least one possible state must be within min and max. - */ -static int powernow_k6_verify(struct cpufreq_policy *policy) -{ - return cpufreq_frequency_table_verify(policy, &clock_ratio[0]); -} - - -/** - * powernow_k6_setpolicy - sets a new CPUFreq policy - * @policy: new policy - * @target_freq: the target frequency - * @relation: how that frequency relates to achieved frequency - * (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H) - * - * sets a new CPUFreq policy - */ -static int powernow_k6_target(struct cpufreq_policy *policy, - unsigned int target_freq, - unsigned int relation) -{ - unsigned int newstate = 0; - - if (cpufreq_frequency_table_target(policy, &clock_ratio[0], - target_freq, relation, &newstate)) - return -EINVAL; - - powernow_k6_set_state(newstate); - - return 0; -} - - -static int powernow_k6_cpu_init(struct cpufreq_policy *policy) -{ - unsigned int i, f; - int result; - - if (policy->cpu != 0) - return -ENODEV; - - /* get frequencies */ - max_multiplier = powernow_k6_get_cpu_multiplier(); - busfreq = cpu_khz / max_multiplier; - - /* table init */ - for (i = 0; (clock_ratio[i].frequency != CPUFREQ_TABLE_END); i++) { - f = clock_ratio[i].index; - if (f > max_multiplier) - clock_ratio[i].frequency = CPUFREQ_ENTRY_INVALID; - else - clock_ratio[i].frequency = busfreq * f; - } - - /* cpuinfo and default policy values */ - policy->cpuinfo.transition_latency = 200000; - policy->cur = busfreq * max_multiplier; - - result = cpufreq_frequency_table_cpuinfo(policy, clock_ratio); - if (result) - return result; - - cpufreq_frequency_table_get_attr(clock_ratio, policy->cpu); - - return 0; -} - - -static int powernow_k6_cpu_exit(struct cpufreq_policy *policy) -{ - unsigned int i; - for (i = 0; i < 8; i++) { - if (i == max_multiplier) - powernow_k6_set_state(i); - } - cpufreq_frequency_table_put_attr(policy->cpu); - return 0; -} - -static unsigned int powernow_k6_get(unsigned int cpu) -{ - unsigned int ret; - ret = (busfreq * powernow_k6_get_cpu_multiplier()); - return ret; -} - -static struct freq_attr *powernow_k6_attr[] = { - &cpufreq_freq_attr_scaling_available_freqs, - NULL, -}; - -static struct cpufreq_driver powernow_k6_driver = { - .verify = powernow_k6_verify, - .target = powernow_k6_target, - .init = powernow_k6_cpu_init, - .exit = powernow_k6_cpu_exit, - .get = powernow_k6_get, - .name = "powernow-k6", - .owner = THIS_MODULE, - .attr = powernow_k6_attr, -}; - - -/** - * powernow_k6_init - initializes the k6 PowerNow! CPUFreq driver - * - * Initializes the K6 PowerNow! support. Returns -ENODEV on unsupported - * devices, -EINVAL or -ENOMEM on problems during initiatization, and zero - * on success. - */ -static int __init powernow_k6_init(void) -{ - struct cpuinfo_x86 *c = &cpu_data(0); - - if ((c->x86_vendor != X86_VENDOR_AMD) || (c->x86 != 5) || - ((c->x86_model != 12) && (c->x86_model != 13))) - return -ENODEV; - - if (!request_region(POWERNOW_IOPORT, 16, "PowerNow!")) { - printk(KERN_INFO PFX "PowerNow IOPORT region already used.\n"); - return -EIO; - } - - if (cpufreq_register_driver(&powernow_k6_driver)) { - release_region(POWERNOW_IOPORT, 16); - return -EINVAL; - } - - return 0; -} - - -/** - * powernow_k6_exit - unregisters AMD K6-2+/3+ PowerNow! support - * - * Unregisters AMD K6-2+ / K6-3+ PowerNow! support. - */ -static void __exit powernow_k6_exit(void) -{ - cpufreq_unregister_driver(&powernow_k6_driver); - release_region(POWERNOW_IOPORT, 16); -} - - -MODULE_AUTHOR("Arjan van de Ven, Dave Jones , " - "Dominik Brodowski "); -MODULE_DESCRIPTION("PowerNow! driver for AMD K6-2+ / K6-3+ processors."); -MODULE_LICENSE("GPL"); - -module_init(powernow_k6_init); -module_exit(powernow_k6_exit); diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c deleted file mode 100644 index d71d9f3..0000000 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c +++ /dev/null @@ -1,747 +0,0 @@ -/* - * AMD K7 Powernow driver. - * (C) 2003 Dave Jones on behalf of SuSE Labs. - * (C) 2003-2004 Dave Jones - * - * Licensed under the terms of the GNU GPL License version 2. - * Based upon datasheets & sample CPUs kindly provided by AMD. - * - * Errata 5: - * CPU may fail to execute a FID/VID change in presence of interrupt. - * - We cli/sti on stepping A0 CPUs around the FID/VID transition. - * Errata 15: - * CPU with half frequency multipliers may hang upon wakeup from disconnect. - * - We disable half multipliers if ACPI is used on A0 stepping CPUs. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include /* Needed for recalibrate_cpu_khz() */ -#include -#include - -#ifdef CONFIG_X86_POWERNOW_K7_ACPI -#include -#include -#endif - -#include "powernow-k7.h" - -#define PFX "powernow: " - - -struct psb_s { - u8 signature[10]; - u8 tableversion; - u8 flags; - u16 settlingtime; - u8 reserved1; - u8 numpst; -}; - -struct pst_s { - u32 cpuid; - u8 fsbspeed; - u8 maxfid; - u8 startvid; - u8 numpstates; -}; - -#ifdef CONFIG_X86_POWERNOW_K7_ACPI -union powernow_acpi_control_t { - struct { - unsigned long fid:5, - vid:5, - sgtc:20, - res1:2; - } bits; - unsigned long val; -}; -#endif - -/* divide by 1000 to get VCore voltage in V. */ -static const int mobile_vid_table[32] = { - 2000, 1950, 1900, 1850, 1800, 1750, 1700, 1650, - 1600, 1550, 1500, 1450, 1400, 1350, 1300, 0, - 1275, 1250, 1225, 1200, 1175, 1150, 1125, 1100, - 1075, 1050, 1025, 1000, 975, 950, 925, 0, -}; - -/* divide by 10 to get FID. */ -static const int fid_codes[32] = { - 110, 115, 120, 125, 50, 55, 60, 65, - 70, 75, 80, 85, 90, 95, 100, 105, - 30, 190, 40, 200, 130, 135, 140, 210, - 150, 225, 160, 165, 170, 180, -1, -1, -}; - -/* This parameter is used in order to force ACPI instead of legacy method for - * configuration purpose. - */ - -static int acpi_force; - -static struct cpufreq_frequency_table *powernow_table; - -static unsigned int can_scale_bus; -static unsigned int can_scale_vid; -static unsigned int minimum_speed = -1; -static unsigned int maximum_speed; -static unsigned int number_scales; -static unsigned int fsb; -static unsigned int latency; -static char have_a0; - -static int check_fsb(unsigned int fsbspeed) -{ - int delta; - unsigned int f = fsb / 1000; - - delta = (fsbspeed > f) ? fsbspeed - f : f - fsbspeed; - return delta < 5; -} - -static int check_powernow(void) -{ - struct cpuinfo_x86 *c = &cpu_data(0); - unsigned int maxei, eax, ebx, ecx, edx; - - if ((c->x86_vendor != X86_VENDOR_AMD) || (c->x86 != 6)) { -#ifdef MODULE - printk(KERN_INFO PFX "This module only works with " - "AMD K7 CPUs\n"); -#endif - return 0; - } - - /* Get maximum capabilities */ - maxei = cpuid_eax(0x80000000); - if (maxei < 0x80000007) { /* Any powernow info ? */ -#ifdef MODULE - printk(KERN_INFO PFX "No powernow capabilities detected\n"); -#endif - return 0; - } - - if ((c->x86_model == 6) && (c->x86_mask == 0)) { - printk(KERN_INFO PFX "K7 660[A0] core detected, " - "enabling errata workarounds\n"); - have_a0 = 1; - } - - cpuid(0x80000007, &eax, &ebx, &ecx, &edx); - - /* Check we can actually do something before we say anything.*/ - if (!(edx & (1 << 1 | 1 << 2))) - return 0; - - printk(KERN_INFO PFX "PowerNOW! Technology present. Can scale: "); - - if (edx & 1 << 1) { - printk("frequency"); - can_scale_bus = 1; - } - - if ((edx & (1 << 1 | 1 << 2)) == 0x6) - printk(" and "); - - if (edx & 1 << 2) { - printk("voltage"); - can_scale_vid = 1; - } - - printk(".\n"); - return 1; -} - -#ifdef CONFIG_X86_POWERNOW_K7_ACPI -static void invalidate_entry(unsigned int entry) -{ - powernow_table[entry].frequency = CPUFREQ_ENTRY_INVALID; -} -#endif - -static int get_ranges(unsigned char *pst) -{ - unsigned int j; - unsigned int speed; - u8 fid, vid; - - powernow_table = kzalloc((sizeof(struct cpufreq_frequency_table) * - (number_scales + 1)), GFP_KERNEL); - if (!powernow_table) - return -ENOMEM; - - for (j = 0 ; j < number_scales; j++) { - fid = *pst++; - - powernow_table[j].frequency = (fsb * fid_codes[fid]) / 10; - powernow_table[j].index = fid; /* lower 8 bits */ - - speed = powernow_table[j].frequency; - - if ((fid_codes[fid] % 10) == 5) { -#ifdef CONFIG_X86_POWERNOW_K7_ACPI - if (have_a0 == 1) - invalidate_entry(j); -#endif - } - - if (speed < minimum_speed) - minimum_speed = speed; - if (speed > maximum_speed) - maximum_speed = speed; - - vid = *pst++; - powernow_table[j].index |= (vid << 8); /* upper 8 bits */ - - pr_debug(" FID: 0x%x (%d.%dx [%dMHz]) " - "VID: 0x%x (%d.%03dV)\n", fid, fid_codes[fid] / 10, - fid_codes[fid] % 10, speed/1000, vid, - mobile_vid_table[vid]/1000, - mobile_vid_table[vid]%1000); - } - powernow_table[number_scales].frequency = CPUFREQ_TABLE_END; - powernow_table[number_scales].index = 0; - - return 0; -} - - -static void change_FID(int fid) -{ - union msr_fidvidctl fidvidctl; - - rdmsrl(MSR_K7_FID_VID_CTL, fidvidctl.val); - if (fidvidctl.bits.FID != fid) { - fidvidctl.bits.SGTC = latency; - fidvidctl.bits.FID = fid; - fidvidctl.bits.VIDC = 0; - fidvidctl.bits.FIDC = 1; - wrmsrl(MSR_K7_FID_VID_CTL, fidvidctl.val); - } -} - - -static void change_VID(int vid) -{ - union msr_fidvidctl fidvidctl; - - rdmsrl(MSR_K7_FID_VID_CTL, fidvidctl.val); - if (fidvidctl.bits.VID != vid) { - fidvidctl.bits.SGTC = latency; - fidvidctl.bits.VID = vid; - fidvidctl.bits.FIDC = 0; - fidvidctl.bits.VIDC = 1; - wrmsrl(MSR_K7_FID_VID_CTL, fidvidctl.val); - } -} - - -static void change_speed(unsigned int index) -{ - u8 fid, vid; - struct cpufreq_freqs freqs; - union msr_fidvidstatus fidvidstatus; - int cfid; - - /* fid are the lower 8 bits of the index we stored into - * the cpufreq frequency table in powernow_decode_bios, - * vid are the upper 8 bits. - */ - - fid = powernow_table[index].index & 0xFF; - vid = (powernow_table[index].index & 0xFF00) >> 8; - - freqs.cpu = 0; - - rdmsrl(MSR_K7_FID_VID_STATUS, fidvidstatus.val); - cfid = fidvidstatus.bits.CFID; - freqs.old = fsb * fid_codes[cfid] / 10; - - freqs.new = powernow_table[index].frequency; - - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - - /* Now do the magic poking into the MSRs. */ - - if (have_a0 == 1) /* A0 errata 5 */ - local_irq_disable(); - - if (freqs.old > freqs.new) { - /* Going down, so change FID first */ - change_FID(fid); - change_VID(vid); - } else { - /* Going up, so change VID first */ - change_VID(vid); - change_FID(fid); - } - - - if (have_a0 == 1) - local_irq_enable(); - - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); -} - - -#ifdef CONFIG_X86_POWERNOW_K7_ACPI - -static struct acpi_processor_performance *acpi_processor_perf; - -static int powernow_acpi_init(void) -{ - int i; - int retval = 0; - union powernow_acpi_control_t pc; - - if (acpi_processor_perf != NULL && powernow_table != NULL) { - retval = -EINVAL; - goto err0; - } - - acpi_processor_perf = kzalloc(sizeof(struct acpi_processor_performance), - GFP_KERNEL); - if (!acpi_processor_perf) { - retval = -ENOMEM; - goto err0; - } - - if (!zalloc_cpumask_var(&acpi_processor_perf->shared_cpu_map, - GFP_KERNEL)) { - retval = -ENOMEM; - goto err05; - } - - if (acpi_processor_register_performance(acpi_processor_perf, 0)) { - retval = -EIO; - goto err1; - } - - if (acpi_processor_perf->control_register.space_id != - ACPI_ADR_SPACE_FIXED_HARDWARE) { - retval = -ENODEV; - goto err2; - } - - if (acpi_processor_perf->status_register.space_id != - ACPI_ADR_SPACE_FIXED_HARDWARE) { - retval = -ENODEV; - goto err2; - } - - number_scales = acpi_processor_perf->state_count; - - if (number_scales < 2) { - retval = -ENODEV; - goto err2; - } - - powernow_table = kzalloc((sizeof(struct cpufreq_frequency_table) * - (number_scales + 1)), GFP_KERNEL); - if (!powernow_table) { - retval = -ENOMEM; - goto err2; - } - - pc.val = (unsigned long) acpi_processor_perf->states[0].control; - for (i = 0; i < number_scales; i++) { - u8 fid, vid; - struct acpi_processor_px *state = - &acpi_processor_perf->states[i]; - unsigned int speed, speed_mhz; - - pc.val = (unsigned long) state->control; - pr_debug("acpi: P%d: %d MHz %d mW %d uS control %08x SGTC %d\n", - i, - (u32) state->core_frequency, - (u32) state->power, - (u32) state->transition_latency, - (u32) state->control, - pc.bits.sgtc); - - vid = pc.bits.vid; - fid = pc.bits.fid; - - powernow_table[i].frequency = fsb * fid_codes[fid] / 10; - powernow_table[i].index = fid; /* lower 8 bits */ - powernow_table[i].index |= (vid << 8); /* upper 8 bits */ - - speed = powernow_table[i].frequency; - speed_mhz = speed / 1000; - - /* processor_perflib will multiply the MHz value by 1000 to - * get a KHz value (e.g. 1266000). However, powernow-k7 works - * with true KHz values (e.g. 1266768). To ensure that all - * powernow frequencies are available, we must ensure that - * ACPI doesn't restrict them, so we round up the MHz value - * to ensure that perflib's computed KHz value is greater than - * or equal to powernow's KHz value. - */ - if (speed % 1000 > 0) - speed_mhz++; - - if ((fid_codes[fid] % 10) == 5) { - if (have_a0 == 1) - invalidate_entry(i); - } - - pr_debug(" FID: 0x%x (%d.%dx [%dMHz]) " - "VID: 0x%x (%d.%03dV)\n", fid, fid_codes[fid] / 10, - fid_codes[fid] % 10, speed_mhz, vid, - mobile_vid_table[vid]/1000, - mobile_vid_table[vid]%1000); - - if (state->core_frequency != speed_mhz) { - state->core_frequency = speed_mhz; - pr_debug(" Corrected ACPI frequency to %d\n", - speed_mhz); - } - - if (latency < pc.bits.sgtc) - latency = pc.bits.sgtc; - - if (speed < minimum_speed) - minimum_speed = speed; - if (speed > maximum_speed) - maximum_speed = speed; - } - - powernow_table[i].frequency = CPUFREQ_TABLE_END; - powernow_table[i].index = 0; - - /* notify BIOS that we exist */ - acpi_processor_notify_smm(THIS_MODULE); - - return 0; - -err2: - acpi_processor_unregister_performance(acpi_processor_perf, 0); -err1: - free_cpumask_var(acpi_processor_perf->shared_cpu_map); -err05: - kfree(acpi_processor_perf); -err0: - printk(KERN_WARNING PFX "ACPI perflib can not be used on " - "this platform\n"); - acpi_processor_perf = NULL; - return retval; -} -#else -static int powernow_acpi_init(void) -{ - printk(KERN_INFO PFX "no support for ACPI processor found." - " Please recompile your kernel with ACPI processor\n"); - return -EINVAL; -} -#endif - -static void print_pst_entry(struct pst_s *pst, unsigned int j) -{ - pr_debug("PST:%d (@%p)\n", j, pst); - pr_debug(" cpuid: 0x%x fsb: %d maxFID: 0x%x startvid: 0x%x\n", - pst->cpuid, pst->fsbspeed, pst->maxfid, pst->startvid); -} - -static int powernow_decode_bios(int maxfid, int startvid) -{ - struct psb_s *psb; - struct pst_s *pst; - unsigned int i, j; - unsigned char *p; - unsigned int etuple; - unsigned int ret; - - etuple = cpuid_eax(0x80000001); - - for (i = 0xC0000; i < 0xffff0 ; i += 16) { - - p = phys_to_virt(i); - - if (memcmp(p, "AMDK7PNOW!", 10) == 0) { - pr_debug("Found PSB header at %p\n", p); - psb = (struct psb_s *) p; - pr_debug("Table version: 0x%x\n", psb->tableversion); - if (psb->tableversion != 0x12) { - printk(KERN_INFO PFX "Sorry, only v1.2 tables" - " supported right now\n"); - return -ENODEV; - } - - pr_debug("Flags: 0x%x\n", psb->flags); - if ((psb->flags & 1) == 0) - pr_debug("Mobile voltage regulator\n"); - else - pr_debug("Desktop voltage regulator\n"); - - latency = psb->settlingtime; - if (latency < 100) { - printk(KERN_INFO PFX "BIOS set settling time " - "to %d microseconds. " - "Should be at least 100. " - "Correcting.\n", latency); - latency = 100; - } - pr_debug("Settling Time: %d microseconds.\n", - psb->settlingtime); - pr_debug("Has %d PST tables. (Only dumping ones " - "relevant to this CPU).\n", - psb->numpst); - - p += sizeof(struct psb_s); - - pst = (struct pst_s *) p; - - for (j = 0; j < psb->numpst; j++) { - pst = (struct pst_s *) p; - number_scales = pst->numpstates; - - if ((etuple == pst->cpuid) && - check_fsb(pst->fsbspeed) && - (maxfid == pst->maxfid) && - (startvid == pst->startvid)) { - print_pst_entry(pst, j); - p = (char *)pst + sizeof(struct pst_s); - ret = get_ranges(p); - return ret; - } else { - unsigned int k; - p = (char *)pst + sizeof(struct pst_s); - for (k = 0; k < number_scales; k++) - p += 2; - } - } - printk(KERN_INFO PFX "No PST tables match this cpuid " - "(0x%x)\n", etuple); - printk(KERN_INFO PFX "This is indicative of a broken " - "BIOS.\n"); - - return -EINVAL; - } - p++; - } - - return -ENODEV; -} - - -static int powernow_target(struct cpufreq_policy *policy, - unsigned int target_freq, - unsigned int relation) -{ - unsigned int newstate; - - if (cpufreq_frequency_table_target(policy, powernow_table, target_freq, - relation, &newstate)) - return -EINVAL; - - change_speed(newstate); - - return 0; -} - - -static int powernow_verify(struct cpufreq_policy *policy) -{ - return cpufreq_frequency_table_verify(policy, powernow_table); -} - -/* - * We use the fact that the bus frequency is somehow - * a multiple of 100000/3 khz, then we compute sgtc according - * to this multiple. - * That way, we match more how AMD thinks all of that work. - * We will then get the same kind of behaviour already tested under - * the "well-known" other OS. - */ -static int __cpuinit fixup_sgtc(void) -{ - unsigned int sgtc; - unsigned int m; - - m = fsb / 3333; - if ((m % 10) >= 5) - m += 5; - - m /= 10; - - sgtc = 100 * m * latency; - sgtc = sgtc / 3; - if (sgtc > 0xfffff) { - printk(KERN_WARNING PFX "SGTC too large %d\n", sgtc); - sgtc = 0xfffff; - } - return sgtc; -} - -static unsigned int powernow_get(unsigned int cpu) -{ - union msr_fidvidstatus fidvidstatus; - unsigned int cfid; - - if (cpu) - return 0; - rdmsrl(MSR_K7_FID_VID_STATUS, fidvidstatus.val); - cfid = fidvidstatus.bits.CFID; - - return fsb * fid_codes[cfid] / 10; -} - - -static int __cpuinit acer_cpufreq_pst(const struct dmi_system_id *d) -{ - printk(KERN_WARNING PFX - "%s laptop with broken PST tables in BIOS detected.\n", - d->ident); - printk(KERN_WARNING PFX - "You need to downgrade to 3A21 (09/09/2002), or try a newer " - "BIOS than 3A71 (01/20/2003)\n"); - printk(KERN_WARNING PFX - "cpufreq scaling has been disabled as a result of this.\n"); - return 0; -} - -/* - * Some Athlon laptops have really fucked PST tables. - * A BIOS update is all that can save them. - * Mention this, and disable cpufreq. - */ -static struct dmi_system_id __cpuinitdata powernow_dmi_table[] = { - { - .callback = acer_cpufreq_pst, - .ident = "Acer Aspire", - .matches = { - DMI_MATCH(DMI_SYS_VENDOR, "Insyde Software"), - DMI_MATCH(DMI_BIOS_VERSION, "3A71"), - }, - }, - { } -}; - -static int __cpuinit powernow_cpu_init(struct cpufreq_policy *policy) -{ - union msr_fidvidstatus fidvidstatus; - int result; - - if (policy->cpu != 0) - return -ENODEV; - - rdmsrl(MSR_K7_FID_VID_STATUS, fidvidstatus.val); - - recalibrate_cpu_khz(); - - fsb = (10 * cpu_khz) / fid_codes[fidvidstatus.bits.CFID]; - if (!fsb) { - printk(KERN_WARNING PFX "can not determine bus frequency\n"); - return -EINVAL; - } - pr_debug("FSB: %3dMHz\n", fsb/1000); - - if (dmi_check_system(powernow_dmi_table) || acpi_force) { - printk(KERN_INFO PFX "PSB/PST known to be broken. " - "Trying ACPI instead\n"); - result = powernow_acpi_init(); - } else { - result = powernow_decode_bios(fidvidstatus.bits.MFID, - fidvidstatus.bits.SVID); - if (result) { - printk(KERN_INFO PFX "Trying ACPI perflib\n"); - maximum_speed = 0; - minimum_speed = -1; - latency = 0; - result = powernow_acpi_init(); - if (result) { - printk(KERN_INFO PFX - "ACPI and legacy methods failed\n"); - } - } else { - /* SGTC use the bus clock as timer */ - latency = fixup_sgtc(); - printk(KERN_INFO PFX "SGTC: %d\n", latency); - } - } - - if (result) - return result; - - printk(KERN_INFO PFX "Minimum speed %d MHz. Maximum speed %d MHz.\n", - minimum_speed/1000, maximum_speed/1000); - - policy->cpuinfo.transition_latency = - cpufreq_scale(2000000UL, fsb, latency); - - policy->cur = powernow_get(0); - - cpufreq_frequency_table_get_attr(powernow_table, policy->cpu); - - return cpufreq_frequency_table_cpuinfo(policy, powernow_table); -} - -static int powernow_cpu_exit(struct cpufreq_policy *policy) -{ - cpufreq_frequency_table_put_attr(policy->cpu); - -#ifdef CONFIG_X86_POWERNOW_K7_ACPI - if (acpi_processor_perf) { - acpi_processor_unregister_performance(acpi_processor_perf, 0); - free_cpumask_var(acpi_processor_perf->shared_cpu_map); - kfree(acpi_processor_perf); - } -#endif - - kfree(powernow_table); - return 0; -} - -static struct freq_attr *powernow_table_attr[] = { - &cpufreq_freq_attr_scaling_available_freqs, - NULL, -}; - -static struct cpufreq_driver powernow_driver = { - .verify = powernow_verify, - .target = powernow_target, - .get = powernow_get, -#ifdef CONFIG_X86_POWERNOW_K7_ACPI - .bios_limit = acpi_processor_get_bios_limit, -#endif - .init = powernow_cpu_init, - .exit = powernow_cpu_exit, - .name = "powernow-k7", - .owner = THIS_MODULE, - .attr = powernow_table_attr, -}; - -static int __init powernow_init(void) -{ - if (check_powernow() == 0) - return -ENODEV; - return cpufreq_register_driver(&powernow_driver); -} - - -static void __exit powernow_exit(void) -{ - cpufreq_unregister_driver(&powernow_driver); -} - -module_param(acpi_force, int, 0444); -MODULE_PARM_DESC(acpi_force, "Force ACPI to be used."); - -MODULE_AUTHOR("Dave Jones "); -MODULE_DESCRIPTION("Powernow driver for AMD K7 processors."); -MODULE_LICENSE("GPL"); - -late_initcall(powernow_init); -module_exit(powernow_exit); - diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.h b/arch/x86/kernel/cpu/cpufreq/powernow-k7.h deleted file mode 100644 index 35fb4ea..0000000 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.h +++ /dev/null @@ -1,43 +0,0 @@ -/* - * (C) 2003 Dave Jones. - * - * Licensed under the terms of the GNU GPL License version 2. - * - * AMD-specific information - * - */ - -union msr_fidvidctl { - struct { - unsigned FID:5, // 4:0 - reserved1:3, // 7:5 - VID:5, // 12:8 - reserved2:3, // 15:13 - FIDC:1, // 16 - VIDC:1, // 17 - reserved3:2, // 19:18 - FIDCHGRATIO:1, // 20 - reserved4:11, // 31-21 - SGTC:20, // 32:51 - reserved5:12; // 63:52 - } bits; - unsigned long long val; -}; - -union msr_fidvidstatus { - struct { - unsigned CFID:5, // 4:0 - reserved1:3, // 7:5 - SFID:5, // 12:8 - reserved2:3, // 15:13 - MFID:5, // 20:16 - reserved3:11, // 31:21 - CVID:5, // 36:32 - reserved4:3, // 39:37 - SVID:5, // 44:40 - reserved5:3, // 47:45 - MVID:5, // 52:48 - reserved6:11; // 63:53 - } bits; - unsigned long long val; -}; diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c deleted file mode 100644 index 83479b6..0000000 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ /dev/null @@ -1,1607 +0,0 @@ -/* - * (c) 2003-2010 Advanced Micro Devices, Inc. - * Your use of this code is subject to the terms and conditions of the - * GNU general public license version 2. See "COPYING" or - * http://www.gnu.org/licenses/gpl.html - * - * Support : mark.langsdorf@amd.com - * - * Based on the powernow-k7.c module written by Dave Jones. - * (C) 2003 Dave Jones on behalf of SuSE Labs - * (C) 2004 Dominik Brodowski - * (C) 2004 Pavel Machek - * Licensed under the terms of the GNU GPL License version 2. - * Based upon datasheets & sample CPUs kindly provided by AMD. - * - * Valuable input gratefully received from Dave Jones, Pavel Machek, - * Dominik Brodowski, Jacob Shin, and others. - * Originally developed by Paul Devriendt. - * Processor information obtained from Chapter 9 (Power and Thermal Management) - * of the "BIOS and Kernel Developer's Guide for the AMD Athlon 64 and AMD - * Opteron Processors" available for download from www.amd.com - * - * Tables for specific CPUs can be inferred from - * http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/30430.pdf - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include /* for current / set_cpus_allowed() */ -#include -#include - -#include - -#include -#include -#include - -#define PFX "powernow-k8: " -#define VERSION "version 2.20.00" -#include "powernow-k8.h" -#include "mperf.h" - -/* serialize freq changes */ -static DEFINE_MUTEX(fidvid_mutex); - -static DEFINE_PER_CPU(struct powernow_k8_data *, powernow_data); - -static int cpu_family = CPU_OPTERON; - -/* core performance boost */ -static bool cpb_capable, cpb_enabled; -static struct msr __percpu *msrs; - -static struct cpufreq_driver cpufreq_amd64_driver; - -#ifndef CONFIG_SMP -static inline const struct cpumask *cpu_core_mask(int cpu) -{ - return cpumask_of(0); -} -#endif - -/* Return a frequency in MHz, given an input fid */ -static u32 find_freq_from_fid(u32 fid) -{ - return 800 + (fid * 100); -} - -/* Return a frequency in KHz, given an input fid */ -static u32 find_khz_freq_from_fid(u32 fid) -{ - return 1000 * find_freq_from_fid(fid); -} - -static u32 find_khz_freq_from_pstate(struct cpufreq_frequency_table *data, - u32 pstate) -{ - return data[pstate].frequency; -} - -/* Return the vco fid for an input fid - * - * Each "low" fid has corresponding "high" fid, and you can get to "low" fids - * only from corresponding high fids. This returns "high" fid corresponding to - * "low" one. - */ -static u32 convert_fid_to_vco_fid(u32 fid) -{ - if (fid < HI_FID_TABLE_BOTTOM) - return 8 + (2 * fid); - else - return fid; -} - -/* - * Return 1 if the pending bit is set. Unless we just instructed the processor - * to transition to a new state, seeing this bit set is really bad news. - */ -static int pending_bit_stuck(void) -{ - u32 lo, hi; - - if (cpu_family == CPU_HW_PSTATE) - return 0; - - rdmsr(MSR_FIDVID_STATUS, lo, hi); - return lo & MSR_S_LO_CHANGE_PENDING ? 1 : 0; -} - -/* - * Update the global current fid / vid values from the status msr. - * Returns 1 on error. - */ -static int query_current_values_with_pending_wait(struct powernow_k8_data *data) -{ - u32 lo, hi; - u32 i = 0; - - if (cpu_family == CPU_HW_PSTATE) { - rdmsr(MSR_PSTATE_STATUS, lo, hi); - i = lo & HW_PSTATE_MASK; - data->currpstate = i; - - /* - * a workaround for family 11h erratum 311 might cause - * an "out-of-range Pstate if the core is in Pstate-0 - */ - if ((boot_cpu_data.x86 == 0x11) && (i >= data->numps)) - data->currpstate = HW_PSTATE_0; - - return 0; - } - do { - if (i++ > 10000) { - pr_debug("detected change pending stuck\n"); - return 1; - } - rdmsr(MSR_FIDVID_STATUS, lo, hi); - } while (lo & MSR_S_LO_CHANGE_PENDING); - - data->currvid = hi & MSR_S_HI_CURRENT_VID; - data->currfid = lo & MSR_S_LO_CURRENT_FID; - - return 0; -} - -/* the isochronous relief time */ -static void count_off_irt(struct powernow_k8_data *data) -{ - udelay((1 << data->irt) * 10); - return; -} - -/* the voltage stabilization time */ -static void count_off_vst(struct powernow_k8_data *data) -{ - udelay(data->vstable * VST_UNITS_20US); - return; -} - -/* need to init the control msr to a safe value (for each cpu) */ -static void fidvid_msr_init(void) -{ - u32 lo, hi; - u8 fid, vid; - - rdmsr(MSR_FIDVID_STATUS, lo, hi); - vid = hi & MSR_S_HI_CURRENT_VID; - fid = lo & MSR_S_LO_CURRENT_FID; - lo = fid | (vid << MSR_C_LO_VID_SHIFT); - hi = MSR_C_HI_STP_GNT_BENIGN; - pr_debug("cpu%d, init lo 0x%x, hi 0x%x\n", smp_processor_id(), lo, hi); - wrmsr(MSR_FIDVID_CTL, lo, hi); -} - -/* write the new fid value along with the other control fields to the msr */ -static int write_new_fid(struct powernow_k8_data *data, u32 fid) -{ - u32 lo; - u32 savevid = data->currvid; - u32 i = 0; - - if ((fid & INVALID_FID_MASK) || (data->currvid & INVALID_VID_MASK)) { - printk(KERN_ERR PFX "internal error - overflow on fid write\n"); - return 1; - } - - lo = fid; - lo |= (data->currvid << MSR_C_LO_VID_SHIFT); - lo |= MSR_C_LO_INIT_FID_VID; - - pr_debug("writing fid 0x%x, lo 0x%x, hi 0x%x\n", - fid, lo, data->plllock * PLL_LOCK_CONVERSION); - - do { - wrmsr(MSR_FIDVID_CTL, lo, data->plllock * PLL_LOCK_CONVERSION); - if (i++ > 100) { - printk(KERN_ERR PFX - "Hardware error - pending bit very stuck - " - "no further pstate changes possible\n"); - return 1; - } - } while (query_current_values_with_pending_wait(data)); - - count_off_irt(data); - - if (savevid != data->currvid) { - printk(KERN_ERR PFX - "vid change on fid trans, old 0x%x, new 0x%x\n", - savevid, data->currvid); - return 1; - } - - if (fid != data->currfid) { - printk(KERN_ERR PFX - "fid trans failed, fid 0x%x, curr 0x%x\n", fid, - data->currfid); - return 1; - } - - return 0; -} - -/* Write a new vid to the hardware */ -static int write_new_vid(struct powernow_k8_data *data, u32 vid) -{ - u32 lo; - u32 savefid = data->currfid; - int i = 0; - - if ((data->currfid & INVALID_FID_MASK) || (vid & INVALID_VID_MASK)) { - printk(KERN_ERR PFX "internal error - overflow on vid write\n"); - return 1; - } - - lo = data->currfid; - lo |= (vid << MSR_C_LO_VID_SHIFT); - lo |= MSR_C_LO_INIT_FID_VID; - - pr_debug("writing vid 0x%x, lo 0x%x, hi 0x%x\n", - vid, lo, STOP_GRANT_5NS); - - do { - wrmsr(MSR_FIDVID_CTL, lo, STOP_GRANT_5NS); - if (i++ > 100) { - printk(KERN_ERR PFX "internal error - pending bit " - "very stuck - no further pstate " - "changes possible\n"); - return 1; - } - } while (query_current_values_with_pending_wait(data)); - - if (savefid != data->currfid) { - printk(KERN_ERR PFX "fid changed on vid trans, old " - "0x%x new 0x%x\n", - savefid, data->currfid); - return 1; - } - - if (vid != data->currvid) { - printk(KERN_ERR PFX "vid trans failed, vid 0x%x, " - "curr 0x%x\n", - vid, data->currvid); - return 1; - } - - return 0; -} - -/* - * Reduce the vid by the max of step or reqvid. - * Decreasing vid codes represent increasing voltages: - * vid of 0 is 1.550V, vid of 0x1e is 0.800V, vid of VID_OFF is off. - */ -static int decrease_vid_code_by_step(struct powernow_k8_data *data, - u32 reqvid, u32 step) -{ - if ((data->currvid - reqvid) > step) - reqvid = data->currvid - step; - - if (write_new_vid(data, reqvid)) - return 1; - - count_off_vst(data); - - return 0; -} - -/* Change hardware pstate by single MSR write */ -static int transition_pstate(struct powernow_k8_data *data, u32 pstate) -{ - wrmsr(MSR_PSTATE_CTRL, pstate, 0); - data->currpstate = pstate; - return 0; -} - -/* Change Opteron/Athlon64 fid and vid, by the 3 phases. */ -static int transition_fid_vid(struct powernow_k8_data *data, - u32 reqfid, u32 reqvid) -{ - if (core_voltage_pre_transition(data, reqvid, reqfid)) - return 1; - - if (core_frequency_transition(data, reqfid)) - return 1; - - if (core_voltage_post_transition(data, reqvid)) - return 1; - - if (query_current_values_with_pending_wait(data)) - return 1; - - if ((reqfid != data->currfid) || (reqvid != data->currvid)) { - printk(KERN_ERR PFX "failed (cpu%d): req 0x%x 0x%x, " - "curr 0x%x 0x%x\n", - smp_processor_id(), - reqfid, reqvid, data->currfid, data->currvid); - return 1; - } - - pr_debug("transitioned (cpu%d): new fid 0x%x, vid 0x%x\n", - smp_processor_id(), data->currfid, data->currvid); - - return 0; -} - -/* Phase 1 - core voltage transition ... setup voltage */ -static int core_voltage_pre_transition(struct powernow_k8_data *data, - u32 reqvid, u32 reqfid) -{ - u32 rvosteps = data->rvo; - u32 savefid = data->currfid; - u32 maxvid, lo, rvomult = 1; - - pr_debug("ph1 (cpu%d): start, currfid 0x%x, currvid 0x%x, " - "reqvid 0x%x, rvo 0x%x\n", - smp_processor_id(), - data->currfid, data->currvid, reqvid, data->rvo); - - if ((savefid < LO_FID_TABLE_TOP) && (reqfid < LO_FID_TABLE_TOP)) - rvomult = 2; - rvosteps *= rvomult; - rdmsr(MSR_FIDVID_STATUS, lo, maxvid); - maxvid = 0x1f & (maxvid >> 16); - pr_debug("ph1 maxvid=0x%x\n", maxvid); - if (reqvid < maxvid) /* lower numbers are higher voltages */ - reqvid = maxvid; - - while (data->currvid > reqvid) { - pr_debug("ph1: curr 0x%x, req vid 0x%x\n", - data->currvid, reqvid); - if (decrease_vid_code_by_step(data, reqvid, data->vidmvs)) - return 1; - } - - while ((rvosteps > 0) && - ((rvomult * data->rvo + data->currvid) > reqvid)) { - if (data->currvid == maxvid) { - rvosteps = 0; - } else { - pr_debug("ph1: changing vid for rvo, req 0x%x\n", - data->currvid - 1); - if (decrease_vid_code_by_step(data, data->currvid-1, 1)) - return 1; - rvosteps--; - } - } - - if (query_current_values_with_pending_wait(data)) - return 1; - - if (savefid != data->currfid) { - printk(KERN_ERR PFX "ph1 err, currfid changed 0x%x\n", - data->currfid); - return 1; - } - - pr_debug("ph1 complete, currfid 0x%x, currvid 0x%x\n", - data->currfid, data->currvid); - - return 0; -} - -/* Phase 2 - core frequency transition */ -static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid) -{ - u32 vcoreqfid, vcocurrfid, vcofiddiff; - u32 fid_interval, savevid = data->currvid; - - if (data->currfid == reqfid) { - printk(KERN_ERR PFX "ph2 null fid transition 0x%x\n", - data->currfid); - return 0; - } - - pr_debug("ph2 (cpu%d): starting, currfid 0x%x, currvid 0x%x, " - "reqfid 0x%x\n", - smp_processor_id(), - data->currfid, data->currvid, reqfid); - - vcoreqfid = convert_fid_to_vco_fid(reqfid); - vcocurrfid = convert_fid_to_vco_fid(data->currfid); - vcofiddiff = vcocurrfid > vcoreqfid ? vcocurrfid - vcoreqfid - : vcoreqfid - vcocurrfid; - - if ((reqfid <= LO_FID_TABLE_TOP) && (data->currfid <= LO_FID_TABLE_TOP)) - vcofiddiff = 0; - - while (vcofiddiff > 2) { - (data->currfid & 1) ? (fid_interval = 1) : (fid_interval = 2); - - if (reqfid > data->currfid) { - if (data->currfid > LO_FID_TABLE_TOP) { - if (write_new_fid(data, - data->currfid + fid_interval)) - return 1; - } else { - if (write_new_fid - (data, - 2 + convert_fid_to_vco_fid(data->currfid))) - return 1; - } - } else { - if (write_new_fid(data, data->currfid - fid_interval)) - return 1; - } - - vcocurrfid = convert_fid_to_vco_fid(data->currfid); - vcofiddiff = vcocurrfid > vcoreqfid ? vcocurrfid - vcoreqfid - : vcoreqfid - vcocurrfid; - } - - if (write_new_fid(data, reqfid)) - return 1; - - if (query_current_values_with_pending_wait(data)) - return 1; - - if (data->currfid != reqfid) { - printk(KERN_ERR PFX - "ph2: mismatch, failed fid transition, " - "curr 0x%x, req 0x%x\n", - data->currfid, reqfid); - return 1; - } - - if (savevid != data->currvid) { - printk(KERN_ERR PFX "ph2: vid changed, save 0x%x, curr 0x%x\n", - savevid, data->currvid); - return 1; - } - - pr_debug("ph2 complete, currfid 0x%x, currvid 0x%x\n", - data->currfid, data->currvid); - - return 0; -} - -/* Phase 3 - core voltage transition flow ... jump to the final vid. */ -static int core_voltage_post_transition(struct powernow_k8_data *data, - u32 reqvid) -{ - u32 savefid = data->currfid; - u32 savereqvid = reqvid; - - pr_debug("ph3 (cpu%d): starting, currfid 0x%x, currvid 0x%x\n", - smp_processor_id(), - data->currfid, data->currvid); - - if (reqvid != data->currvid) { - if (write_new_vid(data, reqvid)) - return 1; - - if (savefid != data->currfid) { - printk(KERN_ERR PFX - "ph3: bad fid change, save 0x%x, curr 0x%x\n", - savefid, data->currfid); - return 1; - } - - if (data->currvid != reqvid) { - printk(KERN_ERR PFX - "ph3: failed vid transition\n, " - "req 0x%x, curr 0x%x", - reqvid, data->currvid); - return 1; - } - } - - if (query_current_values_with_pending_wait(data)) - return 1; - - if (savereqvid != data->currvid) { - pr_debug("ph3 failed, currvid 0x%x\n", data->currvid); - return 1; - } - - if (savefid != data->currfid) { - pr_debug("ph3 failed, currfid changed 0x%x\n", - data->currfid); - return 1; - } - - pr_debug("ph3 complete, currfid 0x%x, currvid 0x%x\n", - data->currfid, data->currvid); - - return 0; -} - -static void check_supported_cpu(void *_rc) -{ - u32 eax, ebx, ecx, edx; - int *rc = _rc; - - *rc = -ENODEV; - - if (__this_cpu_read(cpu_info.x86_vendor) != X86_VENDOR_AMD) - return; - - eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE); - if (((eax & CPUID_XFAM) != CPUID_XFAM_K8) && - ((eax & CPUID_XFAM) < CPUID_XFAM_10H)) - return; - - if ((eax & CPUID_XFAM) == CPUID_XFAM_K8) { - if (((eax & CPUID_USE_XFAM_XMOD) != CPUID_USE_XFAM_XMOD) || - ((eax & CPUID_XMOD) > CPUID_XMOD_REV_MASK)) { - printk(KERN_INFO PFX - "Processor cpuid %x not supported\n", eax); - return; - } - - eax = cpuid_eax(CPUID_GET_MAX_CAPABILITIES); - if (eax < CPUID_FREQ_VOLT_CAPABILITIES) { - printk(KERN_INFO PFX - "No frequency change capabilities detected\n"); - return; - } - - cpuid(CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, &edx); - if ((edx & P_STATE_TRANSITION_CAPABLE) - != P_STATE_TRANSITION_CAPABLE) { - printk(KERN_INFO PFX - "Power state transitions not supported\n"); - return; - } - } else { /* must be a HW Pstate capable processor */ - cpuid(CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, &edx); - if ((edx & USE_HW_PSTATE) == USE_HW_PSTATE) - cpu_family = CPU_HW_PSTATE; - else - return; - } - - *rc = 0; -} - -static int check_pst_table(struct powernow_k8_data *data, struct pst_s *pst, - u8 maxvid) -{ - unsigned int j; - u8 lastfid = 0xff; - - for (j = 0; j < data->numps; j++) { - if (pst[j].vid > LEAST_VID) { - printk(KERN_ERR FW_BUG PFX "vid %d invalid : 0x%x\n", - j, pst[j].vid); - return -EINVAL; - } - if (pst[j].vid < data->rvo) { - /* vid + rvo >= 0 */ - printk(KERN_ERR FW_BUG PFX "0 vid exceeded with pstate" - " %d\n", j); - return -ENODEV; - } - if (pst[j].vid < maxvid + data->rvo) { - /* vid + rvo >= maxvid */ - printk(KERN_ERR FW_BUG PFX "maxvid exceeded with pstate" - " %d\n", j); - return -ENODEV; - } - if (pst[j].fid > MAX_FID) { - printk(KERN_ERR FW_BUG PFX "maxfid exceeded with pstate" - " %d\n", j); - return -ENODEV; - } - if (j && (pst[j].fid < HI_FID_TABLE_BOTTOM)) { - /* Only first fid is allowed to be in "low" range */ - printk(KERN_ERR FW_BUG PFX "two low fids - %d : " - "0x%x\n", j, pst[j].fid); - return -EINVAL; - } - if (pst[j].fid < lastfid) - lastfid = pst[j].fid; - } - if (lastfid & 1) { - printk(KERN_ERR FW_BUG PFX "lastfid invalid\n"); - return -EINVAL; - } - if (lastfid > LO_FID_TABLE_TOP) - printk(KERN_INFO FW_BUG PFX - "first fid not from lo freq table\n"); - - return 0; -} - -static void invalidate_entry(struct cpufreq_frequency_table *powernow_table, - unsigned int entry) -{ - powernow_table[entry].frequency = CPUFREQ_ENTRY_INVALID; -} - -static void print_basics(struct powernow_k8_data *data) -{ - int j; - for (j = 0; j < data->numps; j++) { - if (data->powernow_table[j].frequency != - CPUFREQ_ENTRY_INVALID) { - if (cpu_family == CPU_HW_PSTATE) { - printk(KERN_INFO PFX - " %d : pstate %d (%d MHz)\n", j, - data->powernow_table[j].index, - data->powernow_table[j].frequency/1000); - } else { - printk(KERN_INFO PFX - "fid 0x%x (%d MHz), vid 0x%x\n", - data->powernow_table[j].index & 0xff, - data->powernow_table[j].frequency/1000, - data->powernow_table[j].index >> 8); - } - } - } - if (data->batps) - printk(KERN_INFO PFX "Only %d pstates on battery\n", - data->batps); -} - -static u32 freq_from_fid_did(u32 fid, u32 did) -{ - u32 mhz = 0; - - if (boot_cpu_data.x86 == 0x10) - mhz = (100 * (fid + 0x10)) >> did; - else if (boot_cpu_data.x86 == 0x11) - mhz = (100 * (fid + 8)) >> did; - else - BUG(); - - return mhz * 1000; -} - -static int fill_powernow_table(struct powernow_k8_data *data, - struct pst_s *pst, u8 maxvid) -{ - struct cpufreq_frequency_table *powernow_table; - unsigned int j; - - if (data->batps) { - /* use ACPI support to get full speed on mains power */ - printk(KERN_WARNING PFX - "Only %d pstates usable (use ACPI driver for full " - "range\n", data->batps); - data->numps = data->batps; - } - - for (j = 1; j < data->numps; j++) { - if (pst[j-1].fid >= pst[j].fid) { - printk(KERN_ERR PFX "PST out of sequence\n"); - return -EINVAL; - } - } - - if (data->numps < 2) { - printk(KERN_ERR PFX "no p states to transition\n"); - return -ENODEV; - } - - if (check_pst_table(data, pst, maxvid)) - return -EINVAL; - - powernow_table = kmalloc((sizeof(struct cpufreq_frequency_table) - * (data->numps + 1)), GFP_KERNEL); - if (!powernow_table) { - printk(KERN_ERR PFX "powernow_table memory alloc failure\n"); - return -ENOMEM; - } - - for (j = 0; j < data->numps; j++) { - int freq; - powernow_table[j].index = pst[j].fid; /* lower 8 bits */ - powernow_table[j].index |= (pst[j].vid << 8); /* upper 8 bits */ - freq = find_khz_freq_from_fid(pst[j].fid); - powernow_table[j].frequency = freq; - } - powernow_table[data->numps].frequency = CPUFREQ_TABLE_END; - powernow_table[data->numps].index = 0; - - if (query_current_values_with_pending_wait(data)) { - kfree(powernow_table); - return -EIO; - } - - pr_debug("cfid 0x%x, cvid 0x%x\n", data->currfid, data->currvid); - data->powernow_table = powernow_table; - if (cpumask_first(cpu_core_mask(data->cpu)) == data->cpu) - print_basics(data); - - for (j = 0; j < data->numps; j++) - if ((pst[j].fid == data->currfid) && - (pst[j].vid == data->currvid)) - return 0; - - pr_debug("currfid/vid do not match PST, ignoring\n"); - return 0; -} - -/* Find and validate the PSB/PST table in BIOS. */ -static int find_psb_table(struct powernow_k8_data *data) -{ - struct psb_s *psb; - unsigned int i; - u32 mvs; - u8 maxvid; - u32 cpst = 0; - u32 thiscpuid; - - for (i = 0xc0000; i < 0xffff0; i += 0x10) { - /* Scan BIOS looking for the signature. */ - /* It can not be at ffff0 - it is too big. */ - - psb = phys_to_virt(i); - if (memcmp(psb, PSB_ID_STRING, PSB_ID_STRING_LEN) != 0) - continue; - - pr_debug("found PSB header at 0x%p\n", psb); - - pr_debug("table vers: 0x%x\n", psb->tableversion); - if (psb->tableversion != PSB_VERSION_1_4) { - printk(KERN_ERR FW_BUG PFX "PSB table is not v1.4\n"); - return -ENODEV; - } - - pr_debug("flags: 0x%x\n", psb->flags1); - if (psb->flags1) { - printk(KERN_ERR FW_BUG PFX "unknown flags\n"); - return -ENODEV; - } - - data->vstable = psb->vstable; - pr_debug("voltage stabilization time: %d(*20us)\n", - data->vstable); - - pr_debug("flags2: 0x%x\n", psb->flags2); - data->rvo = psb->flags2 & 3; - data->irt = ((psb->flags2) >> 2) & 3; - mvs = ((psb->flags2) >> 4) & 3; - data->vidmvs = 1 << mvs; - data->batps = ((psb->flags2) >> 6) & 3; - - pr_debug("ramp voltage offset: %d\n", data->rvo); - pr_debug("isochronous relief time: %d\n", data->irt); - pr_debug("maximum voltage step: %d - 0x%x\n", mvs, data->vidmvs); - - pr_debug("numpst: 0x%x\n", psb->num_tables); - cpst = psb->num_tables; - if ((psb->cpuid == 0x00000fc0) || - (psb->cpuid == 0x00000fe0)) { - thiscpuid = cpuid_eax(CPUID_PROCESSOR_SIGNATURE); - if ((thiscpuid == 0x00000fc0) || - (thiscpuid == 0x00000fe0)) - cpst = 1; - } - if (cpst != 1) { - printk(KERN_ERR FW_BUG PFX "numpst must be 1\n"); - return -ENODEV; - } - - data->plllock = psb->plllocktime; - pr_debug("plllocktime: 0x%x (units 1us)\n", psb->plllocktime); - pr_debug("maxfid: 0x%x\n", psb->maxfid); - pr_debug("maxvid: 0x%x\n", psb->maxvid); - maxvid = psb->maxvid; - - data->numps = psb->numps; - pr_debug("numpstates: 0x%x\n", data->numps); - return fill_powernow_table(data, - (struct pst_s *)(psb+1), maxvid); - } - /* - * If you see this message, complain to BIOS manufacturer. If - * he tells you "we do not support Linux" or some similar - * nonsense, remember that Windows 2000 uses the same legacy - * mechanism that the old Linux PSB driver uses. Tell them it - * is broken with Windows 2000. - * - * The reference to the AMD documentation is chapter 9 in the - * BIOS and Kernel Developer's Guide, which is available on - * www.amd.com - */ - printk(KERN_ERR FW_BUG PFX "No PSB or ACPI _PSS objects\n"); - printk(KERN_ERR PFX "Make sure that your BIOS is up to date" - " and Cool'N'Quiet support is enabled in BIOS setup\n"); - return -ENODEV; -} - -static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, - unsigned int index) -{ - u64 control; - - if (!data->acpi_data.state_count || (cpu_family == CPU_HW_PSTATE)) - return; - - control = data->acpi_data.states[index].control; - data->irt = (control >> IRT_SHIFT) & IRT_MASK; - data->rvo = (control >> RVO_SHIFT) & RVO_MASK; - data->exttype = (control >> EXT_TYPE_SHIFT) & EXT_TYPE_MASK; - data->plllock = (control >> PLL_L_SHIFT) & PLL_L_MASK; - data->vidmvs = 1 << ((control >> MVS_SHIFT) & MVS_MASK); - data->vstable = (control >> VST_SHIFT) & VST_MASK; -} - -static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) -{ - struct cpufreq_frequency_table *powernow_table; - int ret_val = -ENODEV; - u64 control, status; - - if (acpi_processor_register_performance(&data->acpi_data, data->cpu)) { - pr_debug("register performance failed: bad ACPI data\n"); - return -EIO; - } - - /* verify the data contained in the ACPI structures */ - if (data->acpi_data.state_count <= 1) { - pr_debug("No ACPI P-States\n"); - goto err_out; - } - - control = data->acpi_data.control_register.space_id; - status = data->acpi_data.status_register.space_id; - - if ((control != ACPI_ADR_SPACE_FIXED_HARDWARE) || - (status != ACPI_ADR_SPACE_FIXED_HARDWARE)) { - pr_debug("Invalid control/status registers (%llx - %llx)\n", - control, status); - goto err_out; - } - - /* fill in data->powernow_table */ - powernow_table = kmalloc((sizeof(struct cpufreq_frequency_table) - * (data->acpi_data.state_count + 1)), GFP_KERNEL); - if (!powernow_table) { - pr_debug("powernow_table memory alloc failure\n"); - goto err_out; - } - - /* fill in data */ - data->numps = data->acpi_data.state_count; - powernow_k8_acpi_pst_values(data, 0); - - if (cpu_family == CPU_HW_PSTATE) - ret_val = fill_powernow_table_pstate(data, powernow_table); - else - ret_val = fill_powernow_table_fidvid(data, powernow_table); - if (ret_val) - goto err_out_mem; - - powernow_table[data->acpi_data.state_count].frequency = - CPUFREQ_TABLE_END; - powernow_table[data->acpi_data.state_count].index = 0; - data->powernow_table = powernow_table; - - if (cpumask_first(cpu_core_mask(data->cpu)) == data->cpu) - print_basics(data); - - /* notify BIOS that we exist */ - acpi_processor_notify_smm(THIS_MODULE); - - if (!zalloc_cpumask_var(&data->acpi_data.shared_cpu_map, GFP_KERNEL)) { - printk(KERN_ERR PFX - "unable to alloc powernow_k8_data cpumask\n"); - ret_val = -ENOMEM; - goto err_out_mem; - } - - return 0; - -err_out_mem: - kfree(powernow_table); - -err_out: - acpi_processor_unregister_performance(&data->acpi_data, data->cpu); - - /* data->acpi_data.state_count informs us at ->exit() - * whether ACPI was used */ - data->acpi_data.state_count = 0; - - return ret_val; -} - -static int fill_powernow_table_pstate(struct powernow_k8_data *data, - struct cpufreq_frequency_table *powernow_table) -{ - int i; - u32 hi = 0, lo = 0; - rdmsr(MSR_PSTATE_CUR_LIMIT, lo, hi); - data->max_hw_pstate = (lo & HW_PSTATE_MAX_MASK) >> HW_PSTATE_MAX_SHIFT; - - for (i = 0; i < data->acpi_data.state_count; i++) { - u32 index; - - index = data->acpi_data.states[i].control & HW_PSTATE_MASK; - if (index > data->max_hw_pstate) { - printk(KERN_ERR PFX "invalid pstate %d - " - "bad value %d.\n", i, index); - printk(KERN_ERR PFX "Please report to BIOS " - "manufacturer\n"); - invalidate_entry(powernow_table, i); - continue; - } - rdmsr(MSR_PSTATE_DEF_BASE + index, lo, hi); - if (!(hi & HW_PSTATE_VALID_MASK)) { - pr_debug("invalid pstate %d, ignoring\n", index); - invalidate_entry(powernow_table, i); - continue; - } - - powernow_table[i].index = index; - - /* Frequency may be rounded for these */ - if ((boot_cpu_data.x86 == 0x10 && boot_cpu_data.x86_model < 10) - || boot_cpu_data.x86 == 0x11) { - powernow_table[i].frequency = - freq_from_fid_did(lo & 0x3f, (lo >> 6) & 7); - } else - powernow_table[i].frequency = - data->acpi_data.states[i].core_frequency * 1000; - } - return 0; -} - -static int fill_powernow_table_fidvid(struct powernow_k8_data *data, - struct cpufreq_frequency_table *powernow_table) -{ - int i; - - for (i = 0; i < data->acpi_data.state_count; i++) { - u32 fid; - u32 vid; - u32 freq, index; - u64 status, control; - - if (data->exttype) { - status = data->acpi_data.states[i].status; - fid = status & EXT_FID_MASK; - vid = (status >> VID_SHIFT) & EXT_VID_MASK; - } else { - control = data->acpi_data.states[i].control; - fid = control & FID_MASK; - vid = (control >> VID_SHIFT) & VID_MASK; - } - - pr_debug(" %d : fid 0x%x, vid 0x%x\n", i, fid, vid); - - index = fid | (vid<<8); - powernow_table[i].index = index; - - freq = find_khz_freq_from_fid(fid); - powernow_table[i].frequency = freq; - - /* verify frequency is OK */ - if ((freq > (MAX_FREQ * 1000)) || (freq < (MIN_FREQ * 1000))) { - pr_debug("invalid freq %u kHz, ignoring\n", freq); - invalidate_entry(powernow_table, i); - continue; - } - - /* verify voltage is OK - - * BIOSs are using "off" to indicate invalid */ - if (vid == VID_OFF) { - pr_debug("invalid vid %u, ignoring\n", vid); - invalidate_entry(powernow_table, i); - continue; - } - - if (freq != (data->acpi_data.states[i].core_frequency * 1000)) { - printk(KERN_INFO PFX "invalid freq entries " - "%u kHz vs. %u kHz\n", freq, - (unsigned int) - (data->acpi_data.states[i].core_frequency - * 1000)); - invalidate_entry(powernow_table, i); - continue; - } - } - return 0; -} - -static void powernow_k8_cpu_exit_acpi(struct powernow_k8_data *data) -{ - if (data->acpi_data.state_count) - acpi_processor_unregister_performance(&data->acpi_data, - data->cpu); - free_cpumask_var(data->acpi_data.shared_cpu_map); -} - -static int get_transition_latency(struct powernow_k8_data *data) -{ - int max_latency = 0; - int i; - for (i = 0; i < data->acpi_data.state_count; i++) { - int cur_latency = data->acpi_data.states[i].transition_latency - + data->acpi_data.states[i].bus_master_latency; - if (cur_latency > max_latency) - max_latency = cur_latency; - } - if (max_latency == 0) { - /* - * Fam 11h and later may return 0 as transition latency. This - * is intended and means "very fast". While cpufreq core and - * governors currently can handle that gracefully, better set it - * to 1 to avoid problems in the future. - */ - if (boot_cpu_data.x86 < 0x11) - printk(KERN_ERR FW_WARN PFX "Invalid zero transition " - "latency\n"); - max_latency = 1; - } - /* value in usecs, needs to be in nanoseconds */ - return 1000 * max_latency; -} - -/* Take a frequency, and issue the fid/vid transition command */ -static int transition_frequency_fidvid(struct powernow_k8_data *data, - unsigned int index) -{ - u32 fid = 0; - u32 vid = 0; - int res, i; - struct cpufreq_freqs freqs; - - pr_debug("cpu %d transition to index %u\n", smp_processor_id(), index); - - /* fid/vid correctness check for k8 */ - /* fid are the lower 8 bits of the index we stored into - * the cpufreq frequency table in find_psb_table, vid - * are the upper 8 bits. - */ - fid = data->powernow_table[index].index & 0xFF; - vid = (data->powernow_table[index].index & 0xFF00) >> 8; - - pr_debug("table matched fid 0x%x, giving vid 0x%x\n", fid, vid); - - if (query_current_values_with_pending_wait(data)) - return 1; - - if ((data->currvid == vid) && (data->currfid == fid)) { - pr_debug("target matches current values (fid 0x%x, vid 0x%x)\n", - fid, vid); - return 0; - } - - pr_debug("cpu %d, changing to fid 0x%x, vid 0x%x\n", - smp_processor_id(), fid, vid); - freqs.old = find_khz_freq_from_fid(data->currfid); - freqs.new = find_khz_freq_from_fid(fid); - - for_each_cpu(i, data->available_cores) { - freqs.cpu = i; - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - } - - res = transition_fid_vid(data, fid, vid); - freqs.new = find_khz_freq_from_fid(data->currfid); - - for_each_cpu(i, data->available_cores) { - freqs.cpu = i; - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); - } - return res; -} - -/* Take a frequency, and issue the hardware pstate transition command */ -static int transition_frequency_pstate(struct powernow_k8_data *data, - unsigned int index) -{ - u32 pstate = 0; - int res, i; - struct cpufreq_freqs freqs; - - pr_debug("cpu %d transition to index %u\n", smp_processor_id(), index); - - /* get MSR index for hardware pstate transition */ - pstate = index & HW_PSTATE_MASK; - if (pstate > data->max_hw_pstate) - return 0; - freqs.old = find_khz_freq_from_pstate(data->powernow_table, - data->currpstate); - freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate); - - for_each_cpu(i, data->available_cores) { - freqs.cpu = i; - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - } - - res = transition_pstate(data, pstate); - freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate); - - for_each_cpu(i, data->available_cores) { - freqs.cpu = i; - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); - } - return res; -} - -/* Driver entry point to switch to the target frequency */ -static int powernowk8_target(struct cpufreq_policy *pol, - unsigned targfreq, unsigned relation) -{ - cpumask_var_t oldmask; - struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu); - u32 checkfid; - u32 checkvid; - unsigned int newstate; - int ret = -EIO; - - if (!data) - return -EINVAL; - - checkfid = data->currfid; - checkvid = data->currvid; - - /* only run on specific CPU from here on. */ - /* This is poor form: use a workqueue or smp_call_function_single */ - if (!alloc_cpumask_var(&oldmask, GFP_KERNEL)) - return -ENOMEM; - - cpumask_copy(oldmask, tsk_cpus_allowed(current)); - set_cpus_allowed_ptr(current, cpumask_of(pol->cpu)); - - if (smp_processor_id() != pol->cpu) { - printk(KERN_ERR PFX "limiting to cpu %u failed\n", pol->cpu); - goto err_out; - } - - if (pending_bit_stuck()) { - printk(KERN_ERR PFX "failing targ, change pending bit set\n"); - goto err_out; - } - - pr_debug("targ: cpu %d, %d kHz, min %d, max %d, relation %d\n", - pol->cpu, targfreq, pol->min, pol->max, relation); - - if (query_current_values_with_pending_wait(data)) - goto err_out; - - if (cpu_family != CPU_HW_PSTATE) { - pr_debug("targ: curr fid 0x%x, vid 0x%x\n", - data->currfid, data->currvid); - - if ((checkvid != data->currvid) || - (checkfid != data->currfid)) { - printk(KERN_INFO PFX - "error - out of sync, fix 0x%x 0x%x, " - "vid 0x%x 0x%x\n", - checkfid, data->currfid, - checkvid, data->currvid); - } - } - - if (cpufreq_frequency_table_target(pol, data->powernow_table, - targfreq, relation, &newstate)) - goto err_out; - - mutex_lock(&fidvid_mutex); - - powernow_k8_acpi_pst_values(data, newstate); - - if (cpu_family == CPU_HW_PSTATE) - ret = transition_frequency_pstate(data, newstate); - else - ret = transition_frequency_fidvid(data, newstate); - if (ret) { - printk(KERN_ERR PFX "transition frequency failed\n"); - ret = 1; - mutex_unlock(&fidvid_mutex); - goto err_out; - } - mutex_unlock(&fidvid_mutex); - - if (cpu_family == CPU_HW_PSTATE) - pol->cur = find_khz_freq_from_pstate(data->powernow_table, - newstate); - else - pol->cur = find_khz_freq_from_fid(data->currfid); - ret = 0; - -err_out: - set_cpus_allowed_ptr(current, oldmask); - free_cpumask_var(oldmask); - return ret; -} - -/* Driver entry point to verify the policy and range of frequencies */ -static int powernowk8_verify(struct cpufreq_policy *pol) -{ - struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu); - - if (!data) - return -EINVAL; - - return cpufreq_frequency_table_verify(pol, data->powernow_table); -} - -struct init_on_cpu { - struct powernow_k8_data *data; - int rc; -}; - -static void __cpuinit powernowk8_cpu_init_on_cpu(void *_init_on_cpu) -{ - struct init_on_cpu *init_on_cpu = _init_on_cpu; - - if (pending_bit_stuck()) { - printk(KERN_ERR PFX "failing init, change pending bit set\n"); - init_on_cpu->rc = -ENODEV; - return; - } - - if (query_current_values_with_pending_wait(init_on_cpu->data)) { - init_on_cpu->rc = -ENODEV; - return; - } - - if (cpu_family == CPU_OPTERON) - fidvid_msr_init(); - - init_on_cpu->rc = 0; -} - -/* per CPU init entry point to the driver */ -static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol) -{ - static const char ACPI_PSS_BIOS_BUG_MSG[] = - KERN_ERR FW_BUG PFX "No compatible ACPI _PSS objects found.\n" - FW_BUG PFX "Try again with latest BIOS.\n"; - struct powernow_k8_data *data; - struct init_on_cpu init_on_cpu; - int rc; - struct cpuinfo_x86 *c = &cpu_data(pol->cpu); - - if (!cpu_online(pol->cpu)) - return -ENODEV; - - smp_call_function_single(pol->cpu, check_supported_cpu, &rc, 1); - if (rc) - return -ENODEV; - - data = kzalloc(sizeof(struct powernow_k8_data), GFP_KERNEL); - if (!data) { - printk(KERN_ERR PFX "unable to alloc powernow_k8_data"); - return -ENOMEM; - } - - data->cpu = pol->cpu; - data->currpstate = HW_PSTATE_INVALID; - - if (powernow_k8_cpu_init_acpi(data)) { - /* - * Use the PSB BIOS structure. This is only available on - * an UP version, and is deprecated by AMD. - */ - if (num_online_cpus() != 1) { - printk_once(ACPI_PSS_BIOS_BUG_MSG); - goto err_out; - } - if (pol->cpu != 0) { - printk(KERN_ERR FW_BUG PFX "No ACPI _PSS objects for " - "CPU other than CPU0. Complain to your BIOS " - "vendor.\n"); - goto err_out; - } - rc = find_psb_table(data); - if (rc) - goto err_out; - - /* Take a crude guess here. - * That guess was in microseconds, so multiply with 1000 */ - pol->cpuinfo.transition_latency = ( - ((data->rvo + 8) * data->vstable * VST_UNITS_20US) + - ((1 << data->irt) * 30)) * 1000; - } else /* ACPI _PSS objects available */ - pol->cpuinfo.transition_latency = get_transition_latency(data); - - /* only run on specific CPU from here on */ - init_on_cpu.data = data; - smp_call_function_single(data->cpu, powernowk8_cpu_init_on_cpu, - &init_on_cpu, 1); - rc = init_on_cpu.rc; - if (rc != 0) - goto err_out_exit_acpi; - - if (cpu_family == CPU_HW_PSTATE) - cpumask_copy(pol->cpus, cpumask_of(pol->cpu)); - else - cpumask_copy(pol->cpus, cpu_core_mask(pol->cpu)); - data->available_cores = pol->cpus; - - if (cpu_family == CPU_HW_PSTATE) - pol->cur = find_khz_freq_from_pstate(data->powernow_table, - data->currpstate); - else - pol->cur = find_khz_freq_from_fid(data->currfid); - pr_debug("policy current frequency %d kHz\n", pol->cur); - - /* min/max the cpu is capable of */ - if (cpufreq_frequency_table_cpuinfo(pol, data->powernow_table)) { - printk(KERN_ERR FW_BUG PFX "invalid powernow_table\n"); - powernow_k8_cpu_exit_acpi(data); - kfree(data->powernow_table); - kfree(data); - return -EINVAL; - } - - /* Check for APERF/MPERF support in hardware */ - if (cpu_has(c, X86_FEATURE_APERFMPERF)) - cpufreq_amd64_driver.getavg = cpufreq_get_measured_perf; - - cpufreq_frequency_table_get_attr(data->powernow_table, pol->cpu); - - if (cpu_family == CPU_HW_PSTATE) - pr_debug("cpu_init done, current pstate 0x%x\n", - data->currpstate); - else - pr_debug("cpu_init done, current fid 0x%x, vid 0x%x\n", - data->currfid, data->currvid); - - per_cpu(powernow_data, pol->cpu) = data; - - return 0; - -err_out_exit_acpi: - powernow_k8_cpu_exit_acpi(data); - -err_out: - kfree(data); - return -ENODEV; -} - -static int __devexit powernowk8_cpu_exit(struct cpufreq_policy *pol) -{ - struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu); - - if (!data) - return -EINVAL; - - powernow_k8_cpu_exit_acpi(data); - - cpufreq_frequency_table_put_attr(pol->cpu); - - kfree(data->powernow_table); - kfree(data); - per_cpu(powernow_data, pol->cpu) = NULL; - - return 0; -} - -static void query_values_on_cpu(void *_err) -{ - int *err = _err; - struct powernow_k8_data *data = __this_cpu_read(powernow_data); - - *err = query_current_values_with_pending_wait(data); -} - -static unsigned int powernowk8_get(unsigned int cpu) -{ - struct powernow_k8_data *data = per_cpu(powernow_data, cpu); - unsigned int khz = 0; - int err; - - if (!data) - return 0; - - smp_call_function_single(cpu, query_values_on_cpu, &err, true); - if (err) - goto out; - - if (cpu_family == CPU_HW_PSTATE) - khz = find_khz_freq_from_pstate(data->powernow_table, - data->currpstate); - else - khz = find_khz_freq_from_fid(data->currfid); - - -out: - return khz; -} - -static void _cpb_toggle_msrs(bool t) -{ - int cpu; - - get_online_cpus(); - - rdmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs); - - for_each_cpu(cpu, cpu_online_mask) { - struct msr *reg = per_cpu_ptr(msrs, cpu); - if (t) - reg->l &= ~BIT(25); - else - reg->l |= BIT(25); - } - wrmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs); - - put_online_cpus(); -} - -/* - * Switch on/off core performance boosting. - * - * 0=disable - * 1=enable. - */ -static void cpb_toggle(bool t) -{ - if (!cpb_capable) - return; - - if (t && !cpb_enabled) { - cpb_enabled = true; - _cpb_toggle_msrs(t); - printk(KERN_INFO PFX "Core Boosting enabled.\n"); - } else if (!t && cpb_enabled) { - cpb_enabled = false; - _cpb_toggle_msrs(t); - printk(KERN_INFO PFX "Core Boosting disabled.\n"); - } -} - -static ssize_t store_cpb(struct cpufreq_policy *policy, const char *buf, - size_t count) -{ - int ret = -EINVAL; - unsigned long val = 0; - - ret = strict_strtoul(buf, 10, &val); - if (!ret && (val == 0 || val == 1) && cpb_capable) - cpb_toggle(val); - else - return -EINVAL; - - return count; -} - -static ssize_t show_cpb(struct cpufreq_policy *policy, char *buf) -{ - return sprintf(buf, "%u\n", cpb_enabled); -} - -#define define_one_rw(_name) \ -static struct freq_attr _name = \ -__ATTR(_name, 0644, show_##_name, store_##_name) - -define_one_rw(cpb); - -static struct freq_attr *powernow_k8_attr[] = { - &cpufreq_freq_attr_scaling_available_freqs, - &cpb, - NULL, -}; - -static struct cpufreq_driver cpufreq_amd64_driver = { - .verify = powernowk8_verify, - .target = powernowk8_target, - .bios_limit = acpi_processor_get_bios_limit, - .init = powernowk8_cpu_init, - .exit = __devexit_p(powernowk8_cpu_exit), - .get = powernowk8_get, - .name = "powernow-k8", - .owner = THIS_MODULE, - .attr = powernow_k8_attr, -}; - -/* - * Clear the boost-disable flag on the CPU_DOWN path so that this cpu - * cannot block the remaining ones from boosting. On the CPU_UP path we - * simply keep the boost-disable flag in sync with the current global - * state. - */ -static int cpb_notify(struct notifier_block *nb, unsigned long action, - void *hcpu) -{ - unsigned cpu = (long)hcpu; - u32 lo, hi; - - switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - - if (!cpb_enabled) { - rdmsr_on_cpu(cpu, MSR_K7_HWCR, &lo, &hi); - lo |= BIT(25); - wrmsr_on_cpu(cpu, MSR_K7_HWCR, lo, hi); - } - break; - - case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: - rdmsr_on_cpu(cpu, MSR_K7_HWCR, &lo, &hi); - lo &= ~BIT(25); - wrmsr_on_cpu(cpu, MSR_K7_HWCR, lo, hi); - break; - - default: - break; - } - - return NOTIFY_OK; -} - -static struct notifier_block cpb_nb = { - .notifier_call = cpb_notify, -}; - -/* driver entry point for init */ -static int __cpuinit powernowk8_init(void) -{ - unsigned int i, supported_cpus = 0, cpu; - int rv; - - for_each_online_cpu(i) { - int rc; - smp_call_function_single(i, check_supported_cpu, &rc, 1); - if (rc == 0) - supported_cpus++; - } - - if (supported_cpus != num_online_cpus()) - return -ENODEV; - - printk(KERN_INFO PFX "Found %d %s (%d cpu cores) (" VERSION ")\n", - num_online_nodes(), boot_cpu_data.x86_model_id, supported_cpus); - - if (boot_cpu_has(X86_FEATURE_CPB)) { - - cpb_capable = true; - - msrs = msrs_alloc(); - if (!msrs) { - printk(KERN_ERR "%s: Error allocating msrs!\n", __func__); - return -ENOMEM; - } - - register_cpu_notifier(&cpb_nb); - - rdmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs); - - for_each_cpu(cpu, cpu_online_mask) { - struct msr *reg = per_cpu_ptr(msrs, cpu); - cpb_enabled |= !(!!(reg->l & BIT(25))); - } - - printk(KERN_INFO PFX "Core Performance Boosting: %s.\n", - (cpb_enabled ? "on" : "off")); - } - - rv = cpufreq_register_driver(&cpufreq_amd64_driver); - if (rv < 0 && boot_cpu_has(X86_FEATURE_CPB)) { - unregister_cpu_notifier(&cpb_nb); - msrs_free(msrs); - msrs = NULL; - } - return rv; -} - -/* driver entry point for term */ -static void __exit powernowk8_exit(void) -{ - pr_debug("exit\n"); - - if (boot_cpu_has(X86_FEATURE_CPB)) { - msrs_free(msrs); - msrs = NULL; - - unregister_cpu_notifier(&cpb_nb); - } - - cpufreq_unregister_driver(&cpufreq_amd64_driver); -} - -MODULE_AUTHOR("Paul Devriendt and " - "Mark Langsdorf "); -MODULE_DESCRIPTION("AMD Athlon 64 and Opteron processor frequency driver."); -MODULE_LICENSE("GPL"); - -late_initcall(powernowk8_init); -module_exit(powernowk8_exit); diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h deleted file mode 100644 index 3744d26..0000000 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h +++ /dev/null @@ -1,222 +0,0 @@ -/* - * (c) 2003-2006 Advanced Micro Devices, Inc. - * Your use of this code is subject to the terms and conditions of the - * GNU general public license version 2. See "COPYING" or - * http://www.gnu.org/licenses/gpl.html - */ - -enum pstate { - HW_PSTATE_INVALID = 0xff, - HW_PSTATE_0 = 0, - HW_PSTATE_1 = 1, - HW_PSTATE_2 = 2, - HW_PSTATE_3 = 3, - HW_PSTATE_4 = 4, - HW_PSTATE_5 = 5, - HW_PSTATE_6 = 6, - HW_PSTATE_7 = 7, -}; - -struct powernow_k8_data { - unsigned int cpu; - - u32 numps; /* number of p-states */ - u32 batps; /* number of p-states supported on battery */ - u32 max_hw_pstate; /* maximum legal hardware pstate */ - - /* these values are constant when the PSB is used to determine - * vid/fid pairings, but are modified during the ->target() call - * when ACPI is used */ - u32 rvo; /* ramp voltage offset */ - u32 irt; /* isochronous relief time */ - u32 vidmvs; /* usable value calculated from mvs */ - u32 vstable; /* voltage stabilization time, units 20 us */ - u32 plllock; /* pll lock time, units 1 us */ - u32 exttype; /* extended interface = 1 */ - - /* keep track of the current fid / vid or pstate */ - u32 currvid; - u32 currfid; - enum pstate currpstate; - - /* the powernow_table includes all frequency and vid/fid pairings: - * fid are the lower 8 bits of the index, vid are the upper 8 bits. - * frequency is in kHz */ - struct cpufreq_frequency_table *powernow_table; - - /* the acpi table needs to be kept. it's only available if ACPI was - * used to determine valid frequency/vid/fid states */ - struct acpi_processor_performance acpi_data; - - /* we need to keep track of associated cores, but let cpufreq - * handle hotplug events - so just point at cpufreq pol->cpus - * structure */ - struct cpumask *available_cores; -}; - -/* processor's cpuid instruction support */ -#define CPUID_PROCESSOR_SIGNATURE 1 /* function 1 */ -#define CPUID_XFAM 0x0ff00000 /* extended family */ -#define CPUID_XFAM_K8 0 -#define CPUID_XMOD 0x000f0000 /* extended model */ -#define CPUID_XMOD_REV_MASK 0x000c0000 -#define CPUID_XFAM_10H 0x00100000 /* family 0x10 */ -#define CPUID_USE_XFAM_XMOD 0x00000f00 -#define CPUID_GET_MAX_CAPABILITIES 0x80000000 -#define CPUID_FREQ_VOLT_CAPABILITIES 0x80000007 -#define P_STATE_TRANSITION_CAPABLE 6 - -/* Model Specific Registers for p-state transitions. MSRs are 64-bit. For */ -/* writes (wrmsr - opcode 0f 30), the register number is placed in ecx, and */ -/* the value to write is placed in edx:eax. For reads (rdmsr - opcode 0f 32), */ -/* the register number is placed in ecx, and the data is returned in edx:eax. */ - -#define MSR_FIDVID_CTL 0xc0010041 -#define MSR_FIDVID_STATUS 0xc0010042 - -/* Field definitions within the FID VID Low Control MSR : */ -#define MSR_C_LO_INIT_FID_VID 0x00010000 -#define MSR_C_LO_NEW_VID 0x00003f00 -#define MSR_C_LO_NEW_FID 0x0000003f -#define MSR_C_LO_VID_SHIFT 8 - -/* Field definitions within the FID VID High Control MSR : */ -#define MSR_C_HI_STP_GNT_TO 0x000fffff - -/* Field definitions within the FID VID Low Status MSR : */ -#define MSR_S_LO_CHANGE_PENDING 0x80000000 /* cleared when completed */ -#define MSR_S_LO_MAX_RAMP_VID 0x3f000000 -#define MSR_S_LO_MAX_FID 0x003f0000 -#define MSR_S_LO_START_FID 0x00003f00 -#define MSR_S_LO_CURRENT_FID 0x0000003f - -/* Field definitions within the FID VID High Status MSR : */ -#define MSR_S_HI_MIN_WORKING_VID 0x3f000000 -#define MSR_S_HI_MAX_WORKING_VID 0x003f0000 -#define MSR_S_HI_START_VID 0x00003f00 -#define MSR_S_HI_CURRENT_VID 0x0000003f -#define MSR_C_HI_STP_GNT_BENIGN 0x00000001 - - -/* Hardware Pstate _PSS and MSR definitions */ -#define USE_HW_PSTATE 0x00000080 -#define HW_PSTATE_MASK 0x00000007 -#define HW_PSTATE_VALID_MASK 0x80000000 -#define HW_PSTATE_MAX_MASK 0x000000f0 -#define HW_PSTATE_MAX_SHIFT 4 -#define MSR_PSTATE_DEF_BASE 0xc0010064 /* base of Pstate MSRs */ -#define MSR_PSTATE_STATUS 0xc0010063 /* Pstate Status MSR */ -#define MSR_PSTATE_CTRL 0xc0010062 /* Pstate control MSR */ -#define MSR_PSTATE_CUR_LIMIT 0xc0010061 /* pstate current limit MSR */ - -/* define the two driver architectures */ -#define CPU_OPTERON 0 -#define CPU_HW_PSTATE 1 - - -/* - * There are restrictions frequencies have to follow: - * - only 1 entry in the low fid table ( <=1.4GHz ) - * - lowest entry in the high fid table must be >= 2 * the entry in the - * low fid table - * - lowest entry in the high fid table must be a <= 200MHz + 2 * the entry - * in the low fid table - * - the parts can only step at <= 200 MHz intervals, odd fid values are - * supported in revision G and later revisions. - * - lowest frequency must be >= interprocessor hypertransport link speed - * (only applies to MP systems obviously) - */ - -/* fids (frequency identifiers) are arranged in 2 tables - lo and hi */ -#define LO_FID_TABLE_TOP 7 /* fid values marking the boundary */ -#define HI_FID_TABLE_BOTTOM 8 /* between the low and high tables */ - -#define LO_VCOFREQ_TABLE_TOP 1400 /* corresponding vco frequency values */ -#define HI_VCOFREQ_TABLE_BOTTOM 1600 - -#define MIN_FREQ_RESOLUTION 200 /* fids jump by 2 matching freq jumps by 200 */ - -#define MAX_FID 0x2a /* Spec only gives FID values as far as 5 GHz */ -#define LEAST_VID 0x3e /* Lowest (numerically highest) useful vid value */ - -#define MIN_FREQ 800 /* Min and max freqs, per spec */ -#define MAX_FREQ 5000 - -#define INVALID_FID_MASK 0xffffffc0 /* not a valid fid if these bits are set */ -#define INVALID_VID_MASK 0xffffffc0 /* not a valid vid if these bits are set */ - -#define VID_OFF 0x3f - -#define STOP_GRANT_5NS 1 /* min poss memory access latency for voltage change */ - -#define PLL_LOCK_CONVERSION (1000/5) /* ms to ns, then divide by clock period */ - -#define MAXIMUM_VID_STEPS 1 /* Current cpus only allow a single step of 25mV */ -#define VST_UNITS_20US 20 /* Voltage Stabilization Time is in units of 20us */ - -/* - * Most values of interest are encoded in a single field of the _PSS - * entries: the "control" value. - */ - -#define IRT_SHIFT 30 -#define RVO_SHIFT 28 -#define EXT_TYPE_SHIFT 27 -#define PLL_L_SHIFT 20 -#define MVS_SHIFT 18 -#define VST_SHIFT 11 -#define VID_SHIFT 6 -#define IRT_MASK 3 -#define RVO_MASK 3 -#define EXT_TYPE_MASK 1 -#define PLL_L_MASK 0x7f -#define MVS_MASK 3 -#define VST_MASK 0x7f -#define VID_MASK 0x1f -#define FID_MASK 0x1f -#define EXT_VID_MASK 0x3f -#define EXT_FID_MASK 0x3f - - -/* - * Version 1.4 of the PSB table. This table is constructed by BIOS and is - * to tell the OS's power management driver which VIDs and FIDs are - * supported by this particular processor. - * If the data in the PSB / PST is wrong, then this driver will program the - * wrong values into hardware, which is very likely to lead to a crash. - */ - -#define PSB_ID_STRING "AMDK7PNOW!" -#define PSB_ID_STRING_LEN 10 - -#define PSB_VERSION_1_4 0x14 - -struct psb_s { - u8 signature[10]; - u8 tableversion; - u8 flags1; - u16 vstable; - u8 flags2; - u8 num_tables; - u32 cpuid; - u8 plllocktime; - u8 maxfid; - u8 maxvid; - u8 numps; -}; - -/* Pairs of fid/vid values are appended to the version 1.4 PSB table. */ -struct pst_s { - u8 fid; - u8 vid; -}; - -static int core_voltage_pre_transition(struct powernow_k8_data *data, - u32 reqvid, u32 regfid); -static int core_voltage_post_transition(struct powernow_k8_data *data, u32 reqvid); -static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid); - -static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, unsigned int index); - -static int fill_powernow_table_pstate(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table); -static int fill_powernow_table_fidvid(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table); diff --git a/arch/x86/kernel/cpu/cpufreq/sc520_freq.c b/arch/x86/kernel/cpu/cpufreq/sc520_freq.c deleted file mode 100644 index 1e205e6..0000000 --- a/arch/x86/kernel/cpu/cpufreq/sc520_freq.c +++ /dev/null @@ -1,192 +0,0 @@ -/* - * sc520_freq.c: cpufreq driver for the AMD Elan sc520 - * - * Copyright (C) 2005 Sean Young - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Based on elanfreq.c - * - * 2005-03-30: - initial revision - */ - -#include -#include -#include - -#include -#include -#include -#include - -#include - -#define MMCR_BASE 0xfffef000 /* The default base address */ -#define OFFS_CPUCTL 0x2 /* CPU Control Register */ - -static __u8 __iomem *cpuctl; - -#define PFX "sc520_freq: " - -static struct cpufreq_frequency_table sc520_freq_table[] = { - {0x01, 100000}, - {0x02, 133000}, - {0, CPUFREQ_TABLE_END}, -}; - -static unsigned int sc520_freq_get_cpu_frequency(unsigned int cpu) -{ - u8 clockspeed_reg = *cpuctl; - - switch (clockspeed_reg & 0x03) { - default: - printk(KERN_ERR PFX "error: cpuctl register has unexpected " - "value %02x\n", clockspeed_reg); - case 0x01: - return 100000; - case 0x02: - return 133000; - } -} - -static void sc520_freq_set_cpu_state(unsigned int state) -{ - - struct cpufreq_freqs freqs; - u8 clockspeed_reg; - - freqs.old = sc520_freq_get_cpu_frequency(0); - freqs.new = sc520_freq_table[state].frequency; - freqs.cpu = 0; /* AMD Elan is UP */ - - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - - pr_debug("attempting to set frequency to %i kHz\n", - sc520_freq_table[state].frequency); - - local_irq_disable(); - - clockspeed_reg = *cpuctl & ~0x03; - *cpuctl = clockspeed_reg | sc520_freq_table[state].index; - - local_irq_enable(); - - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); -}; - -static int sc520_freq_verify(struct cpufreq_policy *policy) -{ - return cpufreq_frequency_table_verify(policy, &sc520_freq_table[0]); -} - -static int sc520_freq_target(struct cpufreq_policy *policy, - unsigned int target_freq, - unsigned int relation) -{ - unsigned int newstate = 0; - - if (cpufreq_frequency_table_target(policy, sc520_freq_table, - target_freq, relation, &newstate)) - return -EINVAL; - - sc520_freq_set_cpu_state(newstate); - - return 0; -} - - -/* - * Module init and exit code - */ - -static int sc520_freq_cpu_init(struct cpufreq_policy *policy) -{ - struct cpuinfo_x86 *c = &cpu_data(0); - int result; - - /* capability check */ - if (c->x86_vendor != X86_VENDOR_AMD || - c->x86 != 4 || c->x86_model != 9) - return -ENODEV; - - /* cpuinfo and default policy values */ - policy->cpuinfo.transition_latency = 1000000; /* 1ms */ - policy->cur = sc520_freq_get_cpu_frequency(0); - - result = cpufreq_frequency_table_cpuinfo(policy, sc520_freq_table); - if (result) - return result; - - cpufreq_frequency_table_get_attr(sc520_freq_table, policy->cpu); - - return 0; -} - - -static int sc520_freq_cpu_exit(struct cpufreq_policy *policy) -{ - cpufreq_frequency_table_put_attr(policy->cpu); - return 0; -} - - -static struct freq_attr *sc520_freq_attr[] = { - &cpufreq_freq_attr_scaling_available_freqs, - NULL, -}; - - -static struct cpufreq_driver sc520_freq_driver = { - .get = sc520_freq_get_cpu_frequency, - .verify = sc520_freq_verify, - .target = sc520_freq_target, - .init = sc520_freq_cpu_init, - .exit = sc520_freq_cpu_exit, - .name = "sc520_freq", - .owner = THIS_MODULE, - .attr = sc520_freq_attr, -}; - - -static int __init sc520_freq_init(void) -{ - struct cpuinfo_x86 *c = &cpu_data(0); - int err; - - /* Test if we have the right hardware */ - if (c->x86_vendor != X86_VENDOR_AMD || - c->x86 != 4 || c->x86_model != 9) { - pr_debug("no Elan SC520 processor found!\n"); - return -ENODEV; - } - cpuctl = ioremap((unsigned long)(MMCR_BASE + OFFS_CPUCTL), 1); - if (!cpuctl) { - printk(KERN_ERR "sc520_freq: error: failed to remap memory\n"); - return -ENOMEM; - } - - err = cpufreq_register_driver(&sc520_freq_driver); - if (err) - iounmap(cpuctl); - - return err; -} - - -static void __exit sc520_freq_exit(void) -{ - cpufreq_unregister_driver(&sc520_freq_driver); - iounmap(cpuctl); -} - - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Sean Young "); -MODULE_DESCRIPTION("cpufreq driver for AMD's Elan sc520 CPU"); - -module_init(sc520_freq_init); -module_exit(sc520_freq_exit); - diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c deleted file mode 100644 index 6ea3455..0000000 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c +++ /dev/null @@ -1,633 +0,0 @@ -/* - * cpufreq driver for Enhanced SpeedStep, as found in Intel's Pentium - * M (part of the Centrino chipset). - * - * Since the original Pentium M, most new Intel CPUs support Enhanced - * SpeedStep. - * - * Despite the "SpeedStep" in the name, this is almost entirely unlike - * traditional SpeedStep. - * - * Modelled on speedstep.c - * - * Copyright (C) 2003 Jeremy Fitzhardinge - */ - -#include -#include -#include -#include -#include /* current */ -#include -#include -#include - -#include -#include -#include - -#define PFX "speedstep-centrino: " -#define MAINTAINER "cpufreq@vger.kernel.org" - -#define INTEL_MSR_RANGE (0xffff) - -struct cpu_id -{ - __u8 x86; /* CPU family */ - __u8 x86_model; /* model */ - __u8 x86_mask; /* stepping */ -}; - -enum { - CPU_BANIAS, - CPU_DOTHAN_A1, - CPU_DOTHAN_A2, - CPU_DOTHAN_B0, - CPU_MP4HT_D0, - CPU_MP4HT_E0, -}; - -static const struct cpu_id cpu_ids[] = { - [CPU_BANIAS] = { 6, 9, 5 }, - [CPU_DOTHAN_A1] = { 6, 13, 1 }, - [CPU_DOTHAN_A2] = { 6, 13, 2 }, - [CPU_DOTHAN_B0] = { 6, 13, 6 }, - [CPU_MP4HT_D0] = {15, 3, 4 }, - [CPU_MP4HT_E0] = {15, 4, 1 }, -}; -#define N_IDS ARRAY_SIZE(cpu_ids) - -struct cpu_model -{ - const struct cpu_id *cpu_id; - const char *model_name; - unsigned max_freq; /* max clock in kHz */ - - struct cpufreq_frequency_table *op_points; /* clock/voltage pairs */ -}; -static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c, - const struct cpu_id *x); - -/* Operating points for current CPU */ -static DEFINE_PER_CPU(struct cpu_model *, centrino_model); -static DEFINE_PER_CPU(const struct cpu_id *, centrino_cpu); - -static struct cpufreq_driver centrino_driver; - -#ifdef CONFIG_X86_SPEEDSTEP_CENTRINO_TABLE - -/* Computes the correct form for IA32_PERF_CTL MSR for a particular - frequency/voltage operating point; frequency in MHz, volts in mV. - This is stored as "index" in the structure. */ -#define OP(mhz, mv) \ - { \ - .frequency = (mhz) * 1000, \ - .index = (((mhz)/100) << 8) | ((mv - 700) / 16) \ - } - -/* - * These voltage tables were derived from the Intel Pentium M - * datasheet, document 25261202.pdf, Table 5. I have verified they - * are consistent with my IBM ThinkPad X31, which has a 1.3GHz Pentium - * M. - */ - -/* Ultra Low Voltage Intel Pentium M processor 900MHz (Banias) */ -static struct cpufreq_frequency_table banias_900[] = -{ - OP(600, 844), - OP(800, 988), - OP(900, 1004), - { .frequency = CPUFREQ_TABLE_END } -}; - -/* Ultra Low Voltage Intel Pentium M processor 1000MHz (Banias) */ -static struct cpufreq_frequency_table banias_1000[] = -{ - OP(600, 844), - OP(800, 972), - OP(900, 988), - OP(1000, 1004), - { .frequency = CPUFREQ_TABLE_END } -}; - -/* Low Voltage Intel Pentium M processor 1.10GHz (Banias) */ -static struct cpufreq_frequency_table banias_1100[] = -{ - OP( 600, 956), - OP( 800, 1020), - OP( 900, 1100), - OP(1000, 1164), - OP(1100, 1180), - { .frequency = CPUFREQ_TABLE_END } -}; - - -/* Low Voltage Intel Pentium M processor 1.20GHz (Banias) */ -static struct cpufreq_frequency_table banias_1200[] = -{ - OP( 600, 956), - OP( 800, 1004), - OP( 900, 1020), - OP(1000, 1100), - OP(1100, 1164), - OP(1200, 1180), - { .frequency = CPUFREQ_TABLE_END } -}; - -/* Intel Pentium M processor 1.30GHz (Banias) */ -static struct cpufreq_frequency_table banias_1300[] = -{ - OP( 600, 956), - OP( 800, 1260), - OP(1000, 1292), - OP(1200, 1356), - OP(1300, 1388), - { .frequency = CPUFREQ_TABLE_END } -}; - -/* Intel Pentium M processor 1.40GHz (Banias) */ -static struct cpufreq_frequency_table banias_1400[] = -{ - OP( 600, 956), - OP( 800, 1180), - OP(1000, 1308), - OP(1200, 1436), - OP(1400, 1484), - { .frequency = CPUFREQ_TABLE_END } -}; - -/* Intel Pentium M processor 1.50GHz (Banias) */ -static struct cpufreq_frequency_table banias_1500[] = -{ - OP( 600, 956), - OP( 800, 1116), - OP(1000, 1228), - OP(1200, 1356), - OP(1400, 1452), - OP(1500, 1484), - { .frequency = CPUFREQ_TABLE_END } -}; - -/* Intel Pentium M processor 1.60GHz (Banias) */ -static struct cpufreq_frequency_table banias_1600[] = -{ - OP( 600, 956), - OP( 800, 1036), - OP(1000, 1164), - OP(1200, 1276), - OP(1400, 1420), - OP(1600, 1484), - { .frequency = CPUFREQ_TABLE_END } -}; - -/* Intel Pentium M processor 1.70GHz (Banias) */ -static struct cpufreq_frequency_table banias_1700[] = -{ - OP( 600, 956), - OP( 800, 1004), - OP(1000, 1116), - OP(1200, 1228), - OP(1400, 1308), - OP(1700, 1484), - { .frequency = CPUFREQ_TABLE_END } -}; -#undef OP - -#define _BANIAS(cpuid, max, name) \ -{ .cpu_id = cpuid, \ - .model_name = "Intel(R) Pentium(R) M processor " name "MHz", \ - .max_freq = (max)*1000, \ - .op_points = banias_##max, \ -} -#define BANIAS(max) _BANIAS(&cpu_ids[CPU_BANIAS], max, #max) - -/* CPU models, their operating frequency range, and freq/voltage - operating points */ -static struct cpu_model models[] = -{ - _BANIAS(&cpu_ids[CPU_BANIAS], 900, " 900"), - BANIAS(1000), - BANIAS(1100), - BANIAS(1200), - BANIAS(1300), - BANIAS(1400), - BANIAS(1500), - BANIAS(1600), - BANIAS(1700), - - /* NULL model_name is a wildcard */ - { &cpu_ids[CPU_DOTHAN_A1], NULL, 0, NULL }, - { &cpu_ids[CPU_DOTHAN_A2], NULL, 0, NULL }, - { &cpu_ids[CPU_DOTHAN_B0], NULL, 0, NULL }, - { &cpu_ids[CPU_MP4HT_D0], NULL, 0, NULL }, - { &cpu_ids[CPU_MP4HT_E0], NULL, 0, NULL }, - - { NULL, } -}; -#undef _BANIAS -#undef BANIAS - -static int centrino_cpu_init_table(struct cpufreq_policy *policy) -{ - struct cpuinfo_x86 *cpu = &cpu_data(policy->cpu); - struct cpu_model *model; - - for(model = models; model->cpu_id != NULL; model++) - if (centrino_verify_cpu_id(cpu, model->cpu_id) && - (model->model_name == NULL || - strcmp(cpu->x86_model_id, model->model_name) == 0)) - break; - - if (model->cpu_id == NULL) { - /* No match at all */ - pr_debug("no support for CPU model \"%s\": " - "send /proc/cpuinfo to " MAINTAINER "\n", - cpu->x86_model_id); - return -ENOENT; - } - - if (model->op_points == NULL) { - /* Matched a non-match */ - pr_debug("no table support for CPU model \"%s\"\n", - cpu->x86_model_id); - pr_debug("try using the acpi-cpufreq driver\n"); - return -ENOENT; - } - - per_cpu(centrino_model, policy->cpu) = model; - - pr_debug("found \"%s\": max frequency: %dkHz\n", - model->model_name, model->max_freq); - - return 0; -} - -#else -static inline int centrino_cpu_init_table(struct cpufreq_policy *policy) -{ - return -ENODEV; -} -#endif /* CONFIG_X86_SPEEDSTEP_CENTRINO_TABLE */ - -static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c, - const struct cpu_id *x) -{ - if ((c->x86 == x->x86) && - (c->x86_model == x->x86_model) && - (c->x86_mask == x->x86_mask)) - return 1; - return 0; -} - -/* To be called only after centrino_model is initialized */ -static unsigned extract_clock(unsigned msr, unsigned int cpu, int failsafe) -{ - int i; - - /* - * Extract clock in kHz from PERF_CTL value - * for centrino, as some DSDTs are buggy. - * Ideally, this can be done using the acpi_data structure. - */ - if ((per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_BANIAS]) || - (per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_DOTHAN_A1]) || - (per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_DOTHAN_B0])) { - msr = (msr >> 8) & 0xff; - return msr * 100000; - } - - if ((!per_cpu(centrino_model, cpu)) || - (!per_cpu(centrino_model, cpu)->op_points)) - return 0; - - msr &= 0xffff; - for (i = 0; - per_cpu(centrino_model, cpu)->op_points[i].frequency - != CPUFREQ_TABLE_END; - i++) { - if (msr == per_cpu(centrino_model, cpu)->op_points[i].index) - return per_cpu(centrino_model, cpu)-> - op_points[i].frequency; - } - if (failsafe) - return per_cpu(centrino_model, cpu)->op_points[i-1].frequency; - else - return 0; -} - -/* Return the current CPU frequency in kHz */ -static unsigned int get_cur_freq(unsigned int cpu) -{ - unsigned l, h; - unsigned clock_freq; - - rdmsr_on_cpu(cpu, MSR_IA32_PERF_STATUS, &l, &h); - clock_freq = extract_clock(l, cpu, 0); - - if (unlikely(clock_freq == 0)) { - /* - * On some CPUs, we can see transient MSR values (which are - * not present in _PSS), while CPU is doing some automatic - * P-state transition (like TM2). Get the last freq set - * in PERF_CTL. - */ - rdmsr_on_cpu(cpu, MSR_IA32_PERF_CTL, &l, &h); - clock_freq = extract_clock(l, cpu, 1); - } - return clock_freq; -} - - -static int centrino_cpu_init(struct cpufreq_policy *policy) -{ - struct cpuinfo_x86 *cpu = &cpu_data(policy->cpu); - unsigned freq; - unsigned l, h; - int ret; - int i; - - /* Only Intel makes Enhanced Speedstep-capable CPUs */ - if (cpu->x86_vendor != X86_VENDOR_INTEL || - !cpu_has(cpu, X86_FEATURE_EST)) - return -ENODEV; - - if (cpu_has(cpu, X86_FEATURE_CONSTANT_TSC)) - centrino_driver.flags |= CPUFREQ_CONST_LOOPS; - - if (policy->cpu != 0) - return -ENODEV; - - for (i = 0; i < N_IDS; i++) - if (centrino_verify_cpu_id(cpu, &cpu_ids[i])) - break; - - if (i != N_IDS) - per_cpu(centrino_cpu, policy->cpu) = &cpu_ids[i]; - - if (!per_cpu(centrino_cpu, policy->cpu)) { - pr_debug("found unsupported CPU with " - "Enhanced SpeedStep: send /proc/cpuinfo to " - MAINTAINER "\n"); - return -ENODEV; - } - - if (centrino_cpu_init_table(policy)) { - return -ENODEV; - } - - /* Check to see if Enhanced SpeedStep is enabled, and try to - enable it if not. */ - rdmsr(MSR_IA32_MISC_ENABLE, l, h); - - if (!(l & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) { - l |= MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP; - pr_debug("trying to enable Enhanced SpeedStep (%x)\n", l); - wrmsr(MSR_IA32_MISC_ENABLE, l, h); - - /* check to see if it stuck */ - rdmsr(MSR_IA32_MISC_ENABLE, l, h); - if (!(l & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) { - printk(KERN_INFO PFX - "couldn't enable Enhanced SpeedStep\n"); - return -ENODEV; - } - } - - freq = get_cur_freq(policy->cpu); - policy->cpuinfo.transition_latency = 10000; - /* 10uS transition latency */ - policy->cur = freq; - - pr_debug("centrino_cpu_init: cur=%dkHz\n", policy->cur); - - ret = cpufreq_frequency_table_cpuinfo(policy, - per_cpu(centrino_model, policy->cpu)->op_points); - if (ret) - return (ret); - - cpufreq_frequency_table_get_attr( - per_cpu(centrino_model, policy->cpu)->op_points, policy->cpu); - - return 0; -} - -static int centrino_cpu_exit(struct cpufreq_policy *policy) -{ - unsigned int cpu = policy->cpu; - - if (!per_cpu(centrino_model, cpu)) - return -ENODEV; - - cpufreq_frequency_table_put_attr(cpu); - - per_cpu(centrino_model, cpu) = NULL; - - return 0; -} - -/** - * centrino_verify - verifies a new CPUFreq policy - * @policy: new policy - * - * Limit must be within this model's frequency range at least one - * border included. - */ -static int centrino_verify (struct cpufreq_policy *policy) -{ - return cpufreq_frequency_table_verify(policy, - per_cpu(centrino_model, policy->cpu)->op_points); -} - -/** - * centrino_setpolicy - set a new CPUFreq policy - * @policy: new policy - * @target_freq: the target frequency - * @relation: how that frequency relates to achieved frequency - * (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H) - * - * Sets a new CPUFreq policy. - */ -static int centrino_target (struct cpufreq_policy *policy, - unsigned int target_freq, - unsigned int relation) -{ - unsigned int newstate = 0; - unsigned int msr, oldmsr = 0, h = 0, cpu = policy->cpu; - struct cpufreq_freqs freqs; - int retval = 0; - unsigned int j, k, first_cpu, tmp; - cpumask_var_t covered_cpus; - - if (unlikely(!zalloc_cpumask_var(&covered_cpus, GFP_KERNEL))) - return -ENOMEM; - - if (unlikely(per_cpu(centrino_model, cpu) == NULL)) { - retval = -ENODEV; - goto out; - } - - if (unlikely(cpufreq_frequency_table_target(policy, - per_cpu(centrino_model, cpu)->op_points, - target_freq, - relation, - &newstate))) { - retval = -EINVAL; - goto out; - } - - first_cpu = 1; - for_each_cpu(j, policy->cpus) { - int good_cpu; - - /* cpufreq holds the hotplug lock, so we are safe here */ - if (!cpu_online(j)) - continue; - - /* - * Support for SMP systems. - * Make sure we are running on CPU that wants to change freq - */ - if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) - good_cpu = cpumask_any_and(policy->cpus, - cpu_online_mask); - else - good_cpu = j; - - if (good_cpu >= nr_cpu_ids) { - pr_debug("couldn't limit to CPUs in this domain\n"); - retval = -EAGAIN; - if (first_cpu) { - /* We haven't started the transition yet. */ - goto out; - } - break; - } - - msr = per_cpu(centrino_model, cpu)->op_points[newstate].index; - - if (first_cpu) { - rdmsr_on_cpu(good_cpu, MSR_IA32_PERF_CTL, &oldmsr, &h); - if (msr == (oldmsr & 0xffff)) { - pr_debug("no change needed - msr was and needs " - "to be %x\n", oldmsr); - retval = 0; - goto out; - } - - freqs.old = extract_clock(oldmsr, cpu, 0); - freqs.new = extract_clock(msr, cpu, 0); - - pr_debug("target=%dkHz old=%d new=%d msr=%04x\n", - target_freq, freqs.old, freqs.new, msr); - - for_each_cpu(k, policy->cpus) { - if (!cpu_online(k)) - continue; - freqs.cpu = k; - cpufreq_notify_transition(&freqs, - CPUFREQ_PRECHANGE); - } - - first_cpu = 0; - /* all but 16 LSB are reserved, treat them with care */ - oldmsr &= ~0xffff; - msr &= 0xffff; - oldmsr |= msr; - } - - wrmsr_on_cpu(good_cpu, MSR_IA32_PERF_CTL, oldmsr, h); - if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) - break; - - cpumask_set_cpu(j, covered_cpus); - } - - for_each_cpu(k, policy->cpus) { - if (!cpu_online(k)) - continue; - freqs.cpu = k; - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); - } - - if (unlikely(retval)) { - /* - * We have failed halfway through the frequency change. - * We have sent callbacks to policy->cpus and - * MSRs have already been written on coverd_cpus. - * Best effort undo.. - */ - - for_each_cpu(j, covered_cpus) - wrmsr_on_cpu(j, MSR_IA32_PERF_CTL, oldmsr, h); - - tmp = freqs.new; - freqs.new = freqs.old; - freqs.old = tmp; - for_each_cpu(j, policy->cpus) { - if (!cpu_online(j)) - continue; - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); - } - } - retval = 0; - -out: - free_cpumask_var(covered_cpus); - return retval; -} - -static struct freq_attr* centrino_attr[] = { - &cpufreq_freq_attr_scaling_available_freqs, - NULL, -}; - -static struct cpufreq_driver centrino_driver = { - .name = "centrino", /* should be speedstep-centrino, - but there's a 16 char limit */ - .init = centrino_cpu_init, - .exit = centrino_cpu_exit, - .verify = centrino_verify, - .target = centrino_target, - .get = get_cur_freq, - .attr = centrino_attr, - .owner = THIS_MODULE, -}; - - -/** - * centrino_init - initializes the Enhanced SpeedStep CPUFreq driver - * - * Initializes the Enhanced SpeedStep support. Returns -ENODEV on - * unsupported devices, -ENOENT if there's no voltage table for this - * particular CPU model, -EINVAL on problems during initiatization, - * and zero on success. - * - * This is quite picky. Not only does the CPU have to advertise the - * "est" flag in the cpuid capability flags, we look for a specific - * CPU model and stepping, and we need to have the exact model name in - * our voltage tables. That is, be paranoid about not releasing - * someone's valuable magic smoke. - */ -static int __init centrino_init(void) -{ - struct cpuinfo_x86 *cpu = &cpu_data(0); - - if (!cpu_has(cpu, X86_FEATURE_EST)) - return -ENODEV; - - return cpufreq_register_driver(¢rino_driver); -} - -static void __exit centrino_exit(void) -{ - cpufreq_unregister_driver(¢rino_driver); -} - -MODULE_AUTHOR ("Jeremy Fitzhardinge "); -MODULE_DESCRIPTION ("Enhanced SpeedStep driver for Intel Pentium M processors."); -MODULE_LICENSE ("GPL"); - -late_initcall(centrino_init); -module_exit(centrino_exit); diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c deleted file mode 100644 index a748ce7..0000000 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c +++ /dev/null @@ -1,448 +0,0 @@ -/* - * (C) 2001 Dave Jones, Arjan van de ven. - * (C) 2002 - 2003 Dominik Brodowski - * - * Licensed under the terms of the GNU GPL License version 2. - * Based upon reverse engineered information, and on Intel documentation - * for chipsets ICH2-M and ICH3-M. - * - * Many thanks to Ducrot Bruno for finding and fixing the last - * "missing link" for ICH2-M/ICH3-M support, and to Thomas Winkler - * for extensive testing. - * - * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous* - */ - - -/********************************************************************* - * SPEEDSTEP - DEFINITIONS * - *********************************************************************/ - -#include -#include -#include -#include -#include -#include - -#include "speedstep-lib.h" - - -/* speedstep_chipset: - * It is necessary to know which chipset is used. As accesses to - * this device occur at various places in this module, we need a - * static struct pci_dev * pointing to that device. - */ -static struct pci_dev *speedstep_chipset_dev; - - -/* speedstep_processor - */ -static enum speedstep_processor speedstep_processor; - -static u32 pmbase; - -/* - * There are only two frequency states for each processor. Values - * are in kHz for the time being. - */ -static struct cpufreq_frequency_table speedstep_freqs[] = { - {SPEEDSTEP_HIGH, 0}, - {SPEEDSTEP_LOW, 0}, - {0, CPUFREQ_TABLE_END}, -}; - - -/** - * speedstep_find_register - read the PMBASE address - * - * Returns: -ENODEV if no register could be found - */ -static int speedstep_find_register(void) -{ - if (!speedstep_chipset_dev) - return -ENODEV; - - /* get PMBASE */ - pci_read_config_dword(speedstep_chipset_dev, 0x40, &pmbase); - if (!(pmbase & 0x01)) { - printk(KERN_ERR "speedstep-ich: could not find speedstep register\n"); - return -ENODEV; - } - - pmbase &= 0xFFFFFFFE; - if (!pmbase) { - printk(KERN_ERR "speedstep-ich: could not find speedstep register\n"); - return -ENODEV; - } - - pr_debug("pmbase is 0x%x\n", pmbase); - return 0; -} - -/** - * speedstep_set_state - set the SpeedStep state - * @state: new processor frequency state (SPEEDSTEP_LOW or SPEEDSTEP_HIGH) - * - * Tries to change the SpeedStep state. Can be called from - * smp_call_function_single. - */ -static void speedstep_set_state(unsigned int state) -{ - u8 pm2_blk; - u8 value; - unsigned long flags; - - if (state > 0x1) - return; - - /* Disable IRQs */ - local_irq_save(flags); - - /* read state */ - value = inb(pmbase + 0x50); - - pr_debug("read at pmbase 0x%x + 0x50 returned 0x%x\n", pmbase, value); - - /* write new state */ - value &= 0xFE; - value |= state; - - pr_debug("writing 0x%x to pmbase 0x%x + 0x50\n", value, pmbase); - - /* Disable bus master arbitration */ - pm2_blk = inb(pmbase + 0x20); - pm2_blk |= 0x01; - outb(pm2_blk, (pmbase + 0x20)); - - /* Actual transition */ - outb(value, (pmbase + 0x50)); - - /* Restore bus master arbitration */ - pm2_blk &= 0xfe; - outb(pm2_blk, (pmbase + 0x20)); - - /* check if transition was successful */ - value = inb(pmbase + 0x50); - - /* Enable IRQs */ - local_irq_restore(flags); - - pr_debug("read at pmbase 0x%x + 0x50 returned 0x%x\n", pmbase, value); - - if (state == (value & 0x1)) - pr_debug("change to %u MHz succeeded\n", - speedstep_get_frequency(speedstep_processor) / 1000); - else - printk(KERN_ERR "cpufreq: change failed - I/O error\n"); - - return; -} - -/* Wrapper for smp_call_function_single. */ -static void _speedstep_set_state(void *_state) -{ - speedstep_set_state(*(unsigned int *)_state); -} - -/** - * speedstep_activate - activate SpeedStep control in the chipset - * - * Tries to activate the SpeedStep status and control registers. - * Returns -EINVAL on an unsupported chipset, and zero on success. - */ -static int speedstep_activate(void) -{ - u16 value = 0; - - if (!speedstep_chipset_dev) - return -EINVAL; - - pci_read_config_word(speedstep_chipset_dev, 0x00A0, &value); - if (!(value & 0x08)) { - value |= 0x08; - pr_debug("activating SpeedStep (TM) registers\n"); - pci_write_config_word(speedstep_chipset_dev, 0x00A0, value); - } - - return 0; -} - - -/** - * speedstep_detect_chipset - detect the Southbridge which contains SpeedStep logic - * - * Detects ICH2-M, ICH3-M and ICH4-M so far. The pci_dev points to - * the LPC bridge / PM module which contains all power-management - * functions. Returns the SPEEDSTEP_CHIPSET_-number for the detected - * chipset, or zero on failure. - */ -static unsigned int speedstep_detect_chipset(void) -{ - speedstep_chipset_dev = pci_get_subsys(PCI_VENDOR_ID_INTEL, - PCI_DEVICE_ID_INTEL_82801DB_12, - PCI_ANY_ID, PCI_ANY_ID, - NULL); - if (speedstep_chipset_dev) - return 4; /* 4-M */ - - speedstep_chipset_dev = pci_get_subsys(PCI_VENDOR_ID_INTEL, - PCI_DEVICE_ID_INTEL_82801CA_12, - PCI_ANY_ID, PCI_ANY_ID, - NULL); - if (speedstep_chipset_dev) - return 3; /* 3-M */ - - - speedstep_chipset_dev = pci_get_subsys(PCI_VENDOR_ID_INTEL, - PCI_DEVICE_ID_INTEL_82801BA_10, - PCI_ANY_ID, PCI_ANY_ID, - NULL); - if (speedstep_chipset_dev) { - /* speedstep.c causes lockups on Dell Inspirons 8000 and - * 8100 which use a pretty old revision of the 82815 - * host brige. Abort on these systems. - */ - static struct pci_dev *hostbridge; - - hostbridge = pci_get_subsys(PCI_VENDOR_ID_INTEL, - PCI_DEVICE_ID_INTEL_82815_MC, - PCI_ANY_ID, PCI_ANY_ID, - NULL); - - if (!hostbridge) - return 2; /* 2-M */ - - if (hostbridge->revision < 5) { - pr_debug("hostbridge does not support speedstep\n"); - speedstep_chipset_dev = NULL; - pci_dev_put(hostbridge); - return 0; - } - - pci_dev_put(hostbridge); - return 2; /* 2-M */ - } - - return 0; -} - -static void get_freq_data(void *_speed) -{ - unsigned int *speed = _speed; - - *speed = speedstep_get_frequency(speedstep_processor); -} - -static unsigned int speedstep_get(unsigned int cpu) -{ - unsigned int speed; - - /* You're supposed to ensure CPU is online. */ - if (smp_call_function_single(cpu, get_freq_data, &speed, 1) != 0) - BUG(); - - pr_debug("detected %u kHz as current frequency\n", speed); - return speed; -} - -/** - * speedstep_target - set a new CPUFreq policy - * @policy: new policy - * @target_freq: the target frequency - * @relation: how that frequency relates to achieved frequency - * (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H) - * - * Sets a new CPUFreq policy. - */ -static int speedstep_target(struct cpufreq_policy *policy, - unsigned int target_freq, - unsigned int relation) -{ - unsigned int newstate = 0, policy_cpu; - struct cpufreq_freqs freqs; - int i; - - if (cpufreq_frequency_table_target(policy, &speedstep_freqs[0], - target_freq, relation, &newstate)) - return -EINVAL; - - policy_cpu = cpumask_any_and(policy->cpus, cpu_online_mask); - freqs.old = speedstep_get(policy_cpu); - freqs.new = speedstep_freqs[newstate].frequency; - freqs.cpu = policy->cpu; - - pr_debug("transiting from %u to %u kHz\n", freqs.old, freqs.new); - - /* no transition necessary */ - if (freqs.old == freqs.new) - return 0; - - for_each_cpu(i, policy->cpus) { - freqs.cpu = i; - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - } - - smp_call_function_single(policy_cpu, _speedstep_set_state, &newstate, - true); - - for_each_cpu(i, policy->cpus) { - freqs.cpu = i; - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); - } - - return 0; -} - - -/** - * speedstep_verify - verifies a new CPUFreq policy - * @policy: new policy - * - * Limit must be within speedstep_low_freq and speedstep_high_freq, with - * at least one border included. - */ -static int speedstep_verify(struct cpufreq_policy *policy) -{ - return cpufreq_frequency_table_verify(policy, &speedstep_freqs[0]); -} - -struct get_freqs { - struct cpufreq_policy *policy; - int ret; -}; - -static void get_freqs_on_cpu(void *_get_freqs) -{ - struct get_freqs *get_freqs = _get_freqs; - - get_freqs->ret = - speedstep_get_freqs(speedstep_processor, - &speedstep_freqs[SPEEDSTEP_LOW].frequency, - &speedstep_freqs[SPEEDSTEP_HIGH].frequency, - &get_freqs->policy->cpuinfo.transition_latency, - &speedstep_set_state); -} - -static int speedstep_cpu_init(struct cpufreq_policy *policy) -{ - int result; - unsigned int policy_cpu, speed; - struct get_freqs gf; - - /* only run on CPU to be set, or on its sibling */ -#ifdef CONFIG_SMP - cpumask_copy(policy->cpus, cpu_sibling_mask(policy->cpu)); -#endif - policy_cpu = cpumask_any_and(policy->cpus, cpu_online_mask); - - /* detect low and high frequency and transition latency */ - gf.policy = policy; - smp_call_function_single(policy_cpu, get_freqs_on_cpu, &gf, 1); - if (gf.ret) - return gf.ret; - - /* get current speed setting */ - speed = speedstep_get(policy_cpu); - if (!speed) - return -EIO; - - pr_debug("currently at %s speed setting - %i MHz\n", - (speed == speedstep_freqs[SPEEDSTEP_LOW].frequency) - ? "low" : "high", - (speed / 1000)); - - /* cpuinfo and default policy values */ - policy->cur = speed; - - result = cpufreq_frequency_table_cpuinfo(policy, speedstep_freqs); - if (result) - return result; - - cpufreq_frequency_table_get_attr(speedstep_freqs, policy->cpu); - - return 0; -} - - -static int speedstep_cpu_exit(struct cpufreq_policy *policy) -{ - cpufreq_frequency_table_put_attr(policy->cpu); - return 0; -} - -static struct freq_attr *speedstep_attr[] = { - &cpufreq_freq_attr_scaling_available_freqs, - NULL, -}; - - -static struct cpufreq_driver speedstep_driver = { - .name = "speedstep-ich", - .verify = speedstep_verify, - .target = speedstep_target, - .init = speedstep_cpu_init, - .exit = speedstep_cpu_exit, - .get = speedstep_get, - .owner = THIS_MODULE, - .attr = speedstep_attr, -}; - - -/** - * speedstep_init - initializes the SpeedStep CPUFreq driver - * - * Initializes the SpeedStep support. Returns -ENODEV on unsupported - * devices, -EINVAL on problems during initiatization, and zero on - * success. - */ -static int __init speedstep_init(void) -{ - /* detect processor */ - speedstep_processor = speedstep_detect_processor(); - if (!speedstep_processor) { - pr_debug("Intel(R) SpeedStep(TM) capable processor " - "not found\n"); - return -ENODEV; - } - - /* detect chipset */ - if (!speedstep_detect_chipset()) { - pr_debug("Intel(R) SpeedStep(TM) for this chipset not " - "(yet) available.\n"); - return -ENODEV; - } - - /* activate speedstep support */ - if (speedstep_activate()) { - pci_dev_put(speedstep_chipset_dev); - return -EINVAL; - } - - if (speedstep_find_register()) - return -ENODEV; - - return cpufreq_register_driver(&speedstep_driver); -} - - -/** - * speedstep_exit - unregisters SpeedStep support - * - * Unregisters SpeedStep support. - */ -static void __exit speedstep_exit(void) -{ - pci_dev_put(speedstep_chipset_dev); - cpufreq_unregister_driver(&speedstep_driver); -} - - -MODULE_AUTHOR("Dave Jones , " - "Dominik Brodowski "); -MODULE_DESCRIPTION("Speedstep driver for Intel mobile processors on chipsets " - "with ICH-M southbridges."); -MODULE_LICENSE("GPL"); - -module_init(speedstep_init); -module_exit(speedstep_exit); diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c deleted file mode 100644 index 8af2d2f..0000000 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c +++ /dev/null @@ -1,478 +0,0 @@ -/* - * (C) 2002 - 2003 Dominik Brodowski - * - * Licensed under the terms of the GNU GPL License version 2. - * - * Library for common functions for Intel SpeedStep v.1 and v.2 support - * - * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous* - */ - -#include -#include -#include -#include -#include - -#include -#include -#include "speedstep-lib.h" - -#define PFX "speedstep-lib: " - -#ifdef CONFIG_X86_SPEEDSTEP_RELAXED_CAP_CHECK -static int relaxed_check; -#else -#define relaxed_check 0 -#endif - -/********************************************************************* - * GET PROCESSOR CORE SPEED IN KHZ * - *********************************************************************/ - -static unsigned int pentium3_get_frequency(enum speedstep_processor processor) -{ - /* See table 14 of p3_ds.pdf and table 22 of 29834003.pdf */ - struct { - unsigned int ratio; /* Frequency Multiplier (x10) */ - u8 bitmap; /* power on configuration bits - [27, 25:22] (in MSR 0x2a) */ - } msr_decode_mult[] = { - { 30, 0x01 }, - { 35, 0x05 }, - { 40, 0x02 }, - { 45, 0x06 }, - { 50, 0x00 }, - { 55, 0x04 }, - { 60, 0x0b }, - { 65, 0x0f }, - { 70, 0x09 }, - { 75, 0x0d }, - { 80, 0x0a }, - { 85, 0x26 }, - { 90, 0x20 }, - { 100, 0x2b }, - { 0, 0xff } /* error or unknown value */ - }; - - /* PIII(-M) FSB settings: see table b1-b of 24547206.pdf */ - struct { - unsigned int value; /* Front Side Bus speed in MHz */ - u8 bitmap; /* power on configuration bits [18: 19] - (in MSR 0x2a) */ - } msr_decode_fsb[] = { - { 66, 0x0 }, - { 100, 0x2 }, - { 133, 0x1 }, - { 0, 0xff} - }; - - u32 msr_lo, msr_tmp; - int i = 0, j = 0; - - /* read MSR 0x2a - we only need the low 32 bits */ - rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_tmp); - pr_debug("P3 - MSR_IA32_EBL_CR_POWERON: 0x%x 0x%x\n", msr_lo, msr_tmp); - msr_tmp = msr_lo; - - /* decode the FSB */ - msr_tmp &= 0x00c0000; - msr_tmp >>= 18; - while (msr_tmp != msr_decode_fsb[i].bitmap) { - if (msr_decode_fsb[i].bitmap == 0xff) - return 0; - i++; - } - - /* decode the multiplier */ - if (processor == SPEEDSTEP_CPU_PIII_C_EARLY) { - pr_debug("workaround for early PIIIs\n"); - msr_lo &= 0x03c00000; - } else - msr_lo &= 0x0bc00000; - msr_lo >>= 22; - while (msr_lo != msr_decode_mult[j].bitmap) { - if (msr_decode_mult[j].bitmap == 0xff) - return 0; - j++; - } - - pr_debug("speed is %u\n", - (msr_decode_mult[j].ratio * msr_decode_fsb[i].value * 100)); - - return msr_decode_mult[j].ratio * msr_decode_fsb[i].value * 100; -} - - -static unsigned int pentiumM_get_frequency(void) -{ - u32 msr_lo, msr_tmp; - - rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_tmp); - pr_debug("PM - MSR_IA32_EBL_CR_POWERON: 0x%x 0x%x\n", msr_lo, msr_tmp); - - /* see table B-2 of 24547212.pdf */ - if (msr_lo & 0x00040000) { - printk(KERN_DEBUG PFX "PM - invalid FSB: 0x%x 0x%x\n", - msr_lo, msr_tmp); - return 0; - } - - msr_tmp = (msr_lo >> 22) & 0x1f; - pr_debug("bits 22-26 are 0x%x, speed is %u\n", - msr_tmp, (msr_tmp * 100 * 1000)); - - return msr_tmp * 100 * 1000; -} - -static unsigned int pentium_core_get_frequency(void) -{ - u32 fsb = 0; - u32 msr_lo, msr_tmp; - int ret; - - rdmsr(MSR_FSB_FREQ, msr_lo, msr_tmp); - /* see table B-2 of 25366920.pdf */ - switch (msr_lo & 0x07) { - case 5: - fsb = 100000; - break; - case 1: - fsb = 133333; - break; - case 3: - fsb = 166667; - break; - case 2: - fsb = 200000; - break; - case 0: - fsb = 266667; - break; - case 4: - fsb = 333333; - break; - default: - printk(KERN_ERR "PCORE - MSR_FSB_FREQ undefined value"); - } - - rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_tmp); - pr_debug("PCORE - MSR_IA32_EBL_CR_POWERON: 0x%x 0x%x\n", - msr_lo, msr_tmp); - - msr_tmp = (msr_lo >> 22) & 0x1f; - pr_debug("bits 22-26 are 0x%x, speed is %u\n", - msr_tmp, (msr_tmp * fsb)); - - ret = (msr_tmp * fsb); - return ret; -} - - -static unsigned int pentium4_get_frequency(void) -{ - struct cpuinfo_x86 *c = &boot_cpu_data; - u32 msr_lo, msr_hi, mult; - unsigned int fsb = 0; - unsigned int ret; - u8 fsb_code; - - /* Pentium 4 Model 0 and 1 do not have the Core Clock Frequency - * to System Bus Frequency Ratio Field in the Processor Frequency - * Configuration Register of the MSR. Therefore the current - * frequency cannot be calculated and has to be measured. - */ - if (c->x86_model < 2) - return cpu_khz; - - rdmsr(0x2c, msr_lo, msr_hi); - - pr_debug("P4 - MSR_EBC_FREQUENCY_ID: 0x%x 0x%x\n", msr_lo, msr_hi); - - /* decode the FSB: see IA-32 Intel (C) Architecture Software - * Developer's Manual, Volume 3: System Prgramming Guide, - * revision #12 in Table B-1: MSRs in the Pentium 4 and - * Intel Xeon Processors, on page B-4 and B-5. - */ - fsb_code = (msr_lo >> 16) & 0x7; - switch (fsb_code) { - case 0: - fsb = 100 * 1000; - break; - case 1: - fsb = 13333 * 10; - break; - case 2: - fsb = 200 * 1000; - break; - } - - if (!fsb) - printk(KERN_DEBUG PFX "couldn't detect FSB speed. " - "Please send an e-mail to \n"); - - /* Multiplier. */ - mult = msr_lo >> 24; - - pr_debug("P4 - FSB %u kHz; Multiplier %u; Speed %u kHz\n", - fsb, mult, (fsb * mult)); - - ret = (fsb * mult); - return ret; -} - - -/* Warning: may get called from smp_call_function_single. */ -unsigned int speedstep_get_frequency(enum speedstep_processor processor) -{ - switch (processor) { - case SPEEDSTEP_CPU_PCORE: - return pentium_core_get_frequency(); - case SPEEDSTEP_CPU_PM: - return pentiumM_get_frequency(); - case SPEEDSTEP_CPU_P4D: - case SPEEDSTEP_CPU_P4M: - return pentium4_get_frequency(); - case SPEEDSTEP_CPU_PIII_T: - case SPEEDSTEP_CPU_PIII_C: - case SPEEDSTEP_CPU_PIII_C_EARLY: - return pentium3_get_frequency(processor); - default: - return 0; - }; - return 0; -} -EXPORT_SYMBOL_GPL(speedstep_get_frequency); - - -/********************************************************************* - * DETECT SPEEDSTEP-CAPABLE PROCESSOR * - *********************************************************************/ - -unsigned int speedstep_detect_processor(void) -{ - struct cpuinfo_x86 *c = &cpu_data(0); - u32 ebx, msr_lo, msr_hi; - - pr_debug("x86: %x, model: %x\n", c->x86, c->x86_model); - - if ((c->x86_vendor != X86_VENDOR_INTEL) || - ((c->x86 != 6) && (c->x86 != 0xF))) - return 0; - - if (c->x86 == 0xF) { - /* Intel Mobile Pentium 4-M - * or Intel Mobile Pentium 4 with 533 MHz FSB */ - if (c->x86_model != 2) - return 0; - - ebx = cpuid_ebx(0x00000001); - ebx &= 0x000000FF; - - pr_debug("ebx value is %x, x86_mask is %x\n", ebx, c->x86_mask); - - switch (c->x86_mask) { - case 4: - /* - * B-stepping [M-P4-M] - * sample has ebx = 0x0f, production has 0x0e. - */ - if ((ebx == 0x0e) || (ebx == 0x0f)) - return SPEEDSTEP_CPU_P4M; - break; - case 7: - /* - * C-stepping [M-P4-M] - * needs to have ebx=0x0e, else it's a celeron: - * cf. 25130917.pdf / page 7, footnote 5 even - * though 25072120.pdf / page 7 doesn't say - * samples are only of B-stepping... - */ - if (ebx == 0x0e) - return SPEEDSTEP_CPU_P4M; - break; - case 9: - /* - * D-stepping [M-P4-M or M-P4/533] - * - * this is totally strange: CPUID 0x0F29 is - * used by M-P4-M, M-P4/533 and(!) Celeron CPUs. - * The latter need to be sorted out as they don't - * support speedstep. - * Celerons with CPUID 0x0F29 may have either - * ebx=0x8 or 0xf -- 25130917.pdf doesn't say anything - * specific. - * M-P4-Ms may have either ebx=0xe or 0xf [see above] - * M-P4/533 have either ebx=0xe or 0xf. [25317607.pdf] - * also, M-P4M HTs have ebx=0x8, too - * For now, they are distinguished by the model_id - * string - */ - if ((ebx == 0x0e) || - (strstr(c->x86_model_id, - "Mobile Intel(R) Pentium(R) 4") != NULL)) - return SPEEDSTEP_CPU_P4M; - break; - default: - break; - } - return 0; - } - - switch (c->x86_model) { - case 0x0B: /* Intel PIII [Tualatin] */ - /* cpuid_ebx(1) is 0x04 for desktop PIII, - * 0x06 for mobile PIII-M */ - ebx = cpuid_ebx(0x00000001); - pr_debug("ebx is %x\n", ebx); - - ebx &= 0x000000FF; - - if (ebx != 0x06) - return 0; - - /* So far all PIII-M processors support SpeedStep. See - * Intel's 24540640.pdf of June 2003 - */ - return SPEEDSTEP_CPU_PIII_T; - - case 0x08: /* Intel PIII [Coppermine] */ - - /* all mobile PIII Coppermines have FSB 100 MHz - * ==> sort out a few desktop PIIIs. */ - rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_hi); - pr_debug("Coppermine: MSR_IA32_EBL_CR_POWERON is 0x%x, 0x%x\n", - msr_lo, msr_hi); - msr_lo &= 0x00c0000; - if (msr_lo != 0x0080000) - return 0; - - /* - * If the processor is a mobile version, - * platform ID has bit 50 set - * it has SpeedStep technology if either - * bit 56 or 57 is set - */ - rdmsr(MSR_IA32_PLATFORM_ID, msr_lo, msr_hi); - pr_debug("Coppermine: MSR_IA32_PLATFORM ID is 0x%x, 0x%x\n", - msr_lo, msr_hi); - if ((msr_hi & (1<<18)) && - (relaxed_check ? 1 : (msr_hi & (3<<24)))) { - if (c->x86_mask == 0x01) { - pr_debug("early PIII version\n"); - return SPEEDSTEP_CPU_PIII_C_EARLY; - } else - return SPEEDSTEP_CPU_PIII_C; - } - - default: - return 0; - } -} -EXPORT_SYMBOL_GPL(speedstep_detect_processor); - - -/********************************************************************* - * DETECT SPEEDSTEP SPEEDS * - *********************************************************************/ - -unsigned int speedstep_get_freqs(enum speedstep_processor processor, - unsigned int *low_speed, - unsigned int *high_speed, - unsigned int *transition_latency, - void (*set_state) (unsigned int state)) -{ - unsigned int prev_speed; - unsigned int ret = 0; - unsigned long flags; - struct timeval tv1, tv2; - - if ((!processor) || (!low_speed) || (!high_speed) || (!set_state)) - return -EINVAL; - - pr_debug("trying to determine both speeds\n"); - - /* get current speed */ - prev_speed = speedstep_get_frequency(processor); - if (!prev_speed) - return -EIO; - - pr_debug("previous speed is %u\n", prev_speed); - - local_irq_save(flags); - - /* switch to low state */ - set_state(SPEEDSTEP_LOW); - *low_speed = speedstep_get_frequency(processor); - if (!*low_speed) { - ret = -EIO; - goto out; - } - - pr_debug("low speed is %u\n", *low_speed); - - /* start latency measurement */ - if (transition_latency) - do_gettimeofday(&tv1); - - /* switch to high state */ - set_state(SPEEDSTEP_HIGH); - - /* end latency measurement */ - if (transition_latency) - do_gettimeofday(&tv2); - - *high_speed = speedstep_get_frequency(processor); - if (!*high_speed) { - ret = -EIO; - goto out; - } - - pr_debug("high speed is %u\n", *high_speed); - - if (*low_speed == *high_speed) { - ret = -ENODEV; - goto out; - } - - /* switch to previous state, if necessary */ - if (*high_speed != prev_speed) - set_state(SPEEDSTEP_LOW); - - if (transition_latency) { - *transition_latency = (tv2.tv_sec - tv1.tv_sec) * USEC_PER_SEC + - tv2.tv_usec - tv1.tv_usec; - pr_debug("transition latency is %u uSec\n", *transition_latency); - - /* convert uSec to nSec and add 20% for safety reasons */ - *transition_latency *= 1200; - - /* check if the latency measurement is too high or too low - * and set it to a safe value (500uSec) in that case - */ - if (*transition_latency > 10000000 || - *transition_latency < 50000) { - printk(KERN_WARNING PFX "frequency transition " - "measured seems out of range (%u " - "nSec), falling back to a safe one of" - "%u nSec.\n", - *transition_latency, 500000); - *transition_latency = 500000; - } - } - -out: - local_irq_restore(flags); - return ret; -} -EXPORT_SYMBOL_GPL(speedstep_get_freqs); - -#ifdef CONFIG_X86_SPEEDSTEP_RELAXED_CAP_CHECK -module_param(relaxed_check, int, 0444); -MODULE_PARM_DESC(relaxed_check, - "Don't do all checks for speedstep capability."); -#endif - -MODULE_AUTHOR("Dominik Brodowski "); -MODULE_DESCRIPTION("Library for Intel SpeedStep 1 or 2 cpufreq drivers."); -MODULE_LICENSE("GPL"); diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h deleted file mode 100644 index 70d9cea..0000000 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h +++ /dev/null @@ -1,49 +0,0 @@ -/* - * (C) 2002 - 2003 Dominik Brodowski - * - * Licensed under the terms of the GNU GPL License version 2. - * - * Library for common functions for Intel SpeedStep v.1 and v.2 support - * - * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous* - */ - - - -/* processors */ -enum speedstep_processor { - SPEEDSTEP_CPU_PIII_C_EARLY = 0x00000001, /* Coppermine core */ - SPEEDSTEP_CPU_PIII_C = 0x00000002, /* Coppermine core */ - SPEEDSTEP_CPU_PIII_T = 0x00000003, /* Tualatin core */ - SPEEDSTEP_CPU_P4M = 0x00000004, /* P4-M */ -/* the following processors are not speedstep-capable and are not auto-detected - * in speedstep_detect_processor(). However, their speed can be detected using - * the speedstep_get_frequency() call. */ - SPEEDSTEP_CPU_PM = 0xFFFFFF03, /* Pentium M */ - SPEEDSTEP_CPU_P4D = 0xFFFFFF04, /* desktop P4 */ - SPEEDSTEP_CPU_PCORE = 0xFFFFFF05, /* Core */ -}; - -/* speedstep states -- only two of them */ - -#define SPEEDSTEP_HIGH 0x00000000 -#define SPEEDSTEP_LOW 0x00000001 - - -/* detect a speedstep-capable processor */ -extern enum speedstep_processor speedstep_detect_processor(void); - -/* detect the current speed (in khz) of the processor */ -extern unsigned int speedstep_get_frequency(enum speedstep_processor processor); - - -/* detect the low and high speeds of the processor. The callback - * set_state"'s first argument is either SPEEDSTEP_HIGH or - * SPEEDSTEP_LOW; the second argument is zero so that no - * cpufreq_notify_transition calls are initiated. - */ -extern unsigned int speedstep_get_freqs(enum speedstep_processor processor, - unsigned int *low_speed, - unsigned int *high_speed, - unsigned int *transition_latency, - void (*set_state) (unsigned int state)); diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c b/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c deleted file mode 100644 index c76ead3..0000000 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c +++ /dev/null @@ -1,464 +0,0 @@ -/* - * Intel SpeedStep SMI driver. - * - * (C) 2003 Hiroshi Miura - * - * Licensed under the terms of the GNU GPL License version 2. - * - */ - - -/********************************************************************* - * SPEEDSTEP - DEFINITIONS * - *********************************************************************/ - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "speedstep-lib.h" - -/* speedstep system management interface port/command. - * - * These parameters are got from IST-SMI BIOS call. - * If user gives it, these are used. - * - */ -static int smi_port; -static int smi_cmd; -static unsigned int smi_sig; - -/* info about the processor */ -static enum speedstep_processor speedstep_processor; - -/* - * There are only two frequency states for each processor. Values - * are in kHz for the time being. - */ -static struct cpufreq_frequency_table speedstep_freqs[] = { - {SPEEDSTEP_HIGH, 0}, - {SPEEDSTEP_LOW, 0}, - {0, CPUFREQ_TABLE_END}, -}; - -#define GET_SPEEDSTEP_OWNER 0 -#define GET_SPEEDSTEP_STATE 1 -#define SET_SPEEDSTEP_STATE 2 -#define GET_SPEEDSTEP_FREQS 4 - -/* how often shall the SMI call be tried if it failed, e.g. because - * of DMA activity going on? */ -#define SMI_TRIES 5 - -/** - * speedstep_smi_ownership - */ -static int speedstep_smi_ownership(void) -{ - u32 command, result, magic, dummy; - u32 function = GET_SPEEDSTEP_OWNER; - unsigned char magic_data[] = "Copyright (c) 1999 Intel Corporation"; - - command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff); - magic = virt_to_phys(magic_data); - - pr_debug("trying to obtain ownership with command %x at port %x\n", - command, smi_port); - - __asm__ __volatile__( - "push %%ebp\n" - "out %%al, (%%dx)\n" - "pop %%ebp\n" - : "=D" (result), - "=a" (dummy), "=b" (dummy), "=c" (dummy), "=d" (dummy), - "=S" (dummy) - : "a" (command), "b" (function), "c" (0), "d" (smi_port), - "D" (0), "S" (magic) - : "memory" - ); - - pr_debug("result is %x\n", result); - - return result; -} - -/** - * speedstep_smi_get_freqs - get SpeedStep preferred & current freq. - * @low: the low frequency value is placed here - * @high: the high frequency value is placed here - * - * Only available on later SpeedStep-enabled systems, returns false results or - * even hangs [cf. bugme.osdl.org # 1422] on earlier systems. Empirical testing - * shows that the latter occurs if !(ist_info.event & 0xFFFF). - */ -static int speedstep_smi_get_freqs(unsigned int *low, unsigned int *high) -{ - u32 command, result = 0, edi, high_mhz, low_mhz, dummy; - u32 state = 0; - u32 function = GET_SPEEDSTEP_FREQS; - - if (!(ist_info.event & 0xFFFF)) { - pr_debug("bug #1422 -- can't read freqs from BIOS\n"); - return -ENODEV; - } - - command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff); - - pr_debug("trying to determine frequencies with command %x at port %x\n", - command, smi_port); - - __asm__ __volatile__( - "push %%ebp\n" - "out %%al, (%%dx)\n" - "pop %%ebp" - : "=a" (result), - "=b" (high_mhz), - "=c" (low_mhz), - "=d" (state), "=D" (edi), "=S" (dummy) - : "a" (command), - "b" (function), - "c" (state), - "d" (smi_port), "S" (0), "D" (0) - ); - - pr_debug("result %x, low_freq %u, high_freq %u\n", - result, low_mhz, high_mhz); - - /* abort if results are obviously incorrect... */ - if ((high_mhz + low_mhz) < 600) - return -EINVAL; - - *high = high_mhz * 1000; - *low = low_mhz * 1000; - - return result; -} - -/** - * speedstep_get_state - set the SpeedStep state - * @state: processor frequency state (SPEEDSTEP_LOW or SPEEDSTEP_HIGH) - * - */ -static int speedstep_get_state(void) -{ - u32 function = GET_SPEEDSTEP_STATE; - u32 result, state, edi, command, dummy; - - command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff); - - pr_debug("trying to determine current setting with command %x " - "at port %x\n", command, smi_port); - - __asm__ __volatile__( - "push %%ebp\n" - "out %%al, (%%dx)\n" - "pop %%ebp\n" - : "=a" (result), - "=b" (state), "=D" (edi), - "=c" (dummy), "=d" (dummy), "=S" (dummy) - : "a" (command), "b" (function), "c" (0), - "d" (smi_port), "S" (0), "D" (0) - ); - - pr_debug("state is %x, result is %x\n", state, result); - - return state & 1; -} - - -/** - * speedstep_set_state - set the SpeedStep state - * @state: new processor frequency state (SPEEDSTEP_LOW or SPEEDSTEP_HIGH) - * - */ -static void speedstep_set_state(unsigned int state) -{ - unsigned int result = 0, command, new_state, dummy; - unsigned long flags; - unsigned int function = SET_SPEEDSTEP_STATE; - unsigned int retry = 0; - - if (state > 0x1) - return; - - /* Disable IRQs */ - local_irq_save(flags); - - command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff); - - pr_debug("trying to set frequency to state %u " - "with command %x at port %x\n", - state, command, smi_port); - - do { - if (retry) { - pr_debug("retry %u, previous result %u, waiting...\n", - retry, result); - mdelay(retry * 50); - } - retry++; - __asm__ __volatile__( - "push %%ebp\n" - "out %%al, (%%dx)\n" - "pop %%ebp" - : "=b" (new_state), "=D" (result), - "=c" (dummy), "=a" (dummy), - "=d" (dummy), "=S" (dummy) - : "a" (command), "b" (function), "c" (state), - "d" (smi_port), "S" (0), "D" (0) - ); - } while ((new_state != state) && (retry <= SMI_TRIES)); - - /* enable IRQs */ - local_irq_restore(flags); - - if (new_state == state) - pr_debug("change to %u MHz succeeded after %u tries " - "with result %u\n", - (speedstep_freqs[new_state].frequency / 1000), - retry, result); - else - printk(KERN_ERR "cpufreq: change to state %u " - "failed with new_state %u and result %u\n", - state, new_state, result); - - return; -} - - -/** - * speedstep_target - set a new CPUFreq policy - * @policy: new policy - * @target_freq: new freq - * @relation: - * - * Sets a new CPUFreq policy/freq. - */ -static int speedstep_target(struct cpufreq_policy *policy, - unsigned int target_freq, unsigned int relation) -{ - unsigned int newstate = 0; - struct cpufreq_freqs freqs; - - if (cpufreq_frequency_table_target(policy, &speedstep_freqs[0], - target_freq, relation, &newstate)) - return -EINVAL; - - freqs.old = speedstep_freqs[speedstep_get_state()].frequency; - freqs.new = speedstep_freqs[newstate].frequency; - freqs.cpu = 0; /* speedstep.c is UP only driver */ - - if (freqs.old == freqs.new) - return 0; - - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - speedstep_set_state(newstate); - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); - - return 0; -} - - -/** - * speedstep_verify - verifies a new CPUFreq policy - * @policy: new policy - * - * Limit must be within speedstep_low_freq and speedstep_high_freq, with - * at least one border included. - */ -static int speedstep_verify(struct cpufreq_policy *policy) -{ - return cpufreq_frequency_table_verify(policy, &speedstep_freqs[0]); -} - - -static int speedstep_cpu_init(struct cpufreq_policy *policy) -{ - int result; - unsigned int speed, state; - unsigned int *low, *high; - - /* capability check */ - if (policy->cpu != 0) - return -ENODEV; - - result = speedstep_smi_ownership(); - if (result) { - pr_debug("fails in acquiring ownership of a SMI interface.\n"); - return -EINVAL; - } - - /* detect low and high frequency */ - low = &speedstep_freqs[SPEEDSTEP_LOW].frequency; - high = &speedstep_freqs[SPEEDSTEP_HIGH].frequency; - - result = speedstep_smi_get_freqs(low, high); - if (result) { - /* fall back to speedstep_lib.c dection mechanism: - * try both states out */ - pr_debug("could not detect low and high frequencies " - "by SMI call.\n"); - result = speedstep_get_freqs(speedstep_processor, - low, high, - NULL, - &speedstep_set_state); - - if (result) { - pr_debug("could not detect two different speeds" - " -- aborting.\n"); - return result; - } else - pr_debug("workaround worked.\n"); - } - - /* get current speed setting */ - state = speedstep_get_state(); - speed = speedstep_freqs[state].frequency; - - pr_debug("currently at %s speed setting - %i MHz\n", - (speed == speedstep_freqs[SPEEDSTEP_LOW].frequency) - ? "low" : "high", - (speed / 1000)); - - /* cpuinfo and default policy values */ - policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL; - policy->cur = speed; - - result = cpufreq_frequency_table_cpuinfo(policy, speedstep_freqs); - if (result) - return result; - - cpufreq_frequency_table_get_attr(speedstep_freqs, policy->cpu); - - return 0; -} - -static int speedstep_cpu_exit(struct cpufreq_policy *policy) -{ - cpufreq_frequency_table_put_attr(policy->cpu); - return 0; -} - -static unsigned int speedstep_get(unsigned int cpu) -{ - if (cpu) - return -ENODEV; - return speedstep_get_frequency(speedstep_processor); -} - - -static int speedstep_resume(struct cpufreq_policy *policy) -{ - int result = speedstep_smi_ownership(); - - if (result) - pr_debug("fails in re-acquiring ownership of a SMI interface.\n"); - - return result; -} - -static struct freq_attr *speedstep_attr[] = { - &cpufreq_freq_attr_scaling_available_freqs, - NULL, -}; - -static struct cpufreq_driver speedstep_driver = { - .name = "speedstep-smi", - .verify = speedstep_verify, - .target = speedstep_target, - .init = speedstep_cpu_init, - .exit = speedstep_cpu_exit, - .get = speedstep_get, - .resume = speedstep_resume, - .owner = THIS_MODULE, - .attr = speedstep_attr, -}; - -/** - * speedstep_init - initializes the SpeedStep CPUFreq driver - * - * Initializes the SpeedStep support. Returns -ENODEV on unsupported - * BIOS, -EINVAL on problems during initiatization, and zero on - * success. - */ -static int __init speedstep_init(void) -{ - speedstep_processor = speedstep_detect_processor(); - - switch (speedstep_processor) { - case SPEEDSTEP_CPU_PIII_T: - case SPEEDSTEP_CPU_PIII_C: - case SPEEDSTEP_CPU_PIII_C_EARLY: - break; - default: - speedstep_processor = 0; - } - - if (!speedstep_processor) { - pr_debug("No supported Intel CPU detected.\n"); - return -ENODEV; - } - - pr_debug("signature:0x%.8ulx, command:0x%.8ulx, " - "event:0x%.8ulx, perf_level:0x%.8ulx.\n", - ist_info.signature, ist_info.command, - ist_info.event, ist_info.perf_level); - - /* Error if no IST-SMI BIOS or no PARM - sig= 'ISGE' aka 'Intel Speedstep Gate E' */ - if ((ist_info.signature != 0x47534943) && ( - (smi_port == 0) || (smi_cmd == 0))) - return -ENODEV; - - if (smi_sig == 1) - smi_sig = 0x47534943; - else - smi_sig = ist_info.signature; - - /* setup smi_port from MODLULE_PARM or BIOS */ - if ((smi_port > 0xff) || (smi_port < 0)) - return -EINVAL; - else if (smi_port == 0) - smi_port = ist_info.command & 0xff; - - if ((smi_cmd > 0xff) || (smi_cmd < 0)) - return -EINVAL; - else if (smi_cmd == 0) - smi_cmd = (ist_info.command >> 16) & 0xff; - - return cpufreq_register_driver(&speedstep_driver); -} - - -/** - * speedstep_exit - unregisters SpeedStep support - * - * Unregisters SpeedStep support. - */ -static void __exit speedstep_exit(void) -{ - cpufreq_unregister_driver(&speedstep_driver); -} - -module_param(smi_port, int, 0444); -module_param(smi_cmd, int, 0444); -module_param(smi_sig, uint, 0444); - -MODULE_PARM_DESC(smi_port, "Override the BIOS-given IST port with this value " - "-- Intel's default setting is 0xb2"); -MODULE_PARM_DESC(smi_cmd, "Override the BIOS-given IST command with this value " - "-- Intel's default setting is 0x82"); -MODULE_PARM_DESC(smi_sig, "Set to 1 to fake the IST signature when using the " - "SMI interface."); - -MODULE_AUTHOR("Hiroshi Miura"); -MODULE_DESCRIPTION("Speedstep driver for IST applet SMI interface."); -MODULE_LICENSE("GPL"); - -module_init(speedstep_init); -module_exit(speedstep_exit); diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig index b78baa54..9fb8485 100644 --- a/drivers/cpufreq/Kconfig +++ b/drivers/cpufreq/Kconfig @@ -1,3 +1,5 @@ +menu "CPU Frequency scaling" + config CPU_FREQ bool "CPU Frequency scaling" help @@ -177,4 +179,10 @@ config CPU_FREQ_GOV_CONSERVATIVE If in doubt, say N. -endif # CPU_FREQ +menu "x86 CPU frequency scaling drivers" +depends on X86 +source "drivers/cpufreq/Kconfig.x86" +endmenu + +endif +endmenu diff --git a/drivers/cpufreq/Kconfig.x86 b/drivers/cpufreq/Kconfig.x86 new file mode 100644 index 0000000..343f847 --- /dev/null +++ b/drivers/cpufreq/Kconfig.x86 @@ -0,0 +1,255 @@ +# +# x86 CPU Frequency scaling drivers +# + +config X86_PCC_CPUFREQ + tristate "Processor Clocking Control interface driver" + depends on ACPI && ACPI_PROCESSOR + help + This driver adds support for the PCC interface. + + For details, take a look at: + . + + To compile this driver as a module, choose M here: the + module will be called pcc-cpufreq. + + If in doubt, say N. + +config X86_ACPI_CPUFREQ + tristate "ACPI Processor P-States driver" + select CPU_FREQ_TABLE + depends on ACPI_PROCESSOR + help + This driver adds a CPUFreq driver which utilizes the ACPI + Processor Performance States. + This driver also supports Intel Enhanced Speedstep. + + To compile this driver as a module, choose M here: the + module will be called acpi-cpufreq. + + For details, take a look at . + + If in doubt, say N. + +config ELAN_CPUFREQ + tristate "AMD Elan SC400 and SC410" + select CPU_FREQ_TABLE + depends on X86_ELAN + ---help--- + This adds the CPUFreq driver for AMD Elan SC400 and SC410 + processors. + + You need to specify the processor maximum speed as boot + parameter: elanfreq=maxspeed (in kHz) or as module + parameter "max_freq". + + For details, take a look at . + + If in doubt, say N. + +config SC520_CPUFREQ + tristate "AMD Elan SC520" + select CPU_FREQ_TABLE + depends on X86_ELAN + ---help--- + This adds the CPUFreq driver for AMD Elan SC520 processor. + + For details, take a look at . + + If in doubt, say N. + + +config X86_POWERNOW_K6 + tristate "AMD Mobile K6-2/K6-3 PowerNow!" + select CPU_FREQ_TABLE + depends on X86_32 + help + This adds the CPUFreq driver for mobile AMD K6-2+ and mobile + AMD K6-3+ processors. + + For details, take a look at . + + If in doubt, say N. + +config X86_POWERNOW_K7 + tristate "AMD Mobile Athlon/Duron PowerNow!" + select CPU_FREQ_TABLE + depends on X86_32 + help + This adds the CPUFreq driver for mobile AMD K7 mobile processors. + + For details, take a look at . + + If in doubt, say N. + +config X86_POWERNOW_K7_ACPI + bool + depends on X86_POWERNOW_K7 && ACPI_PROCESSOR + depends on !(X86_POWERNOW_K7 = y && ACPI_PROCESSOR = m) + depends on X86_32 + default y + +config X86_POWERNOW_K8 + tristate "AMD Opteron/Athlon64 PowerNow!" + select CPU_FREQ_TABLE + depends on ACPI && ACPI_PROCESSOR + help + This adds the CPUFreq driver for K8/K10 Opteron/Athlon64 processors. + + To compile this driver as a module, choose M here: the + module will be called powernow-k8. + + For details, take a look at . + +config X86_GX_SUSPMOD + tristate "Cyrix MediaGX/NatSemi Geode Suspend Modulation" + depends on X86_32 && PCI + help + This add the CPUFreq driver for NatSemi Geode processors which + support suspend modulation. + + For details, take a look at . + + If in doubt, say N. + +config X86_SPEEDSTEP_CENTRINO + tristate "Intel Enhanced SpeedStep (deprecated)" + select CPU_FREQ_TABLE + select X86_SPEEDSTEP_CENTRINO_TABLE if X86_32 + depends on X86_32 || (X86_64 && ACPI_PROCESSOR) + help + This is deprecated and this functionality is now merged into + acpi_cpufreq (X86_ACPI_CPUFREQ). Use that driver instead of + speedstep_centrino. + This adds the CPUFreq driver for Enhanced SpeedStep enabled + mobile CPUs. This means Intel Pentium M (Centrino) CPUs + or 64bit enabled Intel Xeons. + + To compile this driver as a module, choose M here: the + module will be called speedstep-centrino. + + For details, take a look at . + + If in doubt, say N. + +config X86_SPEEDSTEP_CENTRINO_TABLE + bool "Built-in tables for Banias CPUs" + depends on X86_32 && X86_SPEEDSTEP_CENTRINO + default y + help + Use built-in tables for Banias CPUs if ACPI encoding + is not available. + + If in doubt, say N. + +config X86_SPEEDSTEP_ICH + tristate "Intel Speedstep on ICH-M chipsets (ioport interface)" + select CPU_FREQ_TABLE + depends on X86_32 + help + This adds the CPUFreq driver for certain mobile Intel Pentium III + (Coppermine), all mobile Intel Pentium III-M (Tualatin) and all + mobile Intel Pentium 4 P4-M on systems which have an Intel ICH2, + ICH3 or ICH4 southbridge. + + For details, take a look at . + + If in doubt, say N. + +config X86_SPEEDSTEP_SMI + tristate "Intel SpeedStep on 440BX/ZX/MX chipsets (SMI interface)" + select CPU_FREQ_TABLE + depends on X86_32 && EXPERIMENTAL + help + This adds the CPUFreq driver for certain mobile Intel Pentium III + (Coppermine), all mobile Intel Pentium III-M (Tualatin) + on systems which have an Intel 440BX/ZX/MX southbridge. + + For details, take a look at . + + If in doubt, say N. + +config X86_P4_CLOCKMOD + tristate "Intel Pentium 4 clock modulation" + select CPU_FREQ_TABLE + help + This adds the CPUFreq driver for Intel Pentium 4 / XEON + processors. When enabled it will lower CPU temperature by skipping + clocks. + + This driver should be only used in exceptional + circumstances when very low power is needed because it causes severe + slowdowns and noticeable latencies. Normally Speedstep should be used + instead. + + To compile this driver as a module, choose M here: the + module will be called p4-clockmod. + + For details, take a look at . + + Unless you are absolutely sure say N. + +config X86_CPUFREQ_NFORCE2 + tristate "nVidia nForce2 FSB changing" + depends on X86_32 && EXPERIMENTAL + help + This adds the CPUFreq driver for FSB changing on nVidia nForce2 + platforms. + + For details, take a look at . + + If in doubt, say N. + +config X86_LONGRUN + tristate "Transmeta LongRun" + depends on X86_32 + help + This adds the CPUFreq driver for Transmeta Crusoe and Efficeon processors + which support LongRun. + + For details, take a look at . + + If in doubt, say N. + +config X86_LONGHAUL + tristate "VIA Cyrix III Longhaul" + select CPU_FREQ_TABLE + depends on X86_32 && ACPI_PROCESSOR + help + This adds the CPUFreq driver for VIA Samuel/CyrixIII, + VIA Cyrix Samuel/C3, VIA Cyrix Ezra and VIA Cyrix Ezra-T + processors. + + For details, take a look at . + + If in doubt, say N. + +config X86_E_POWERSAVER + tristate "VIA C7 Enhanced PowerSaver (DANGEROUS)" + select CPU_FREQ_TABLE + depends on X86_32 && EXPERIMENTAL + help + This adds the CPUFreq driver for VIA C7 processors. However, this driver + does not have any safeguards to prevent operating the CPU out of spec + and is thus considered dangerous. Please use the regular ACPI cpufreq + driver, enabled by CONFIG_X86_ACPI_CPUFREQ. + + If in doubt, say N. + +comment "shared options" + +config X86_SPEEDSTEP_LIB + tristate + default (X86_SPEEDSTEP_ICH || X86_SPEEDSTEP_SMI || X86_P4_CLOCKMOD) + +config X86_SPEEDSTEP_RELAXED_CAP_CHECK + bool "Relaxed speedstep capability checks" + depends on X86_32 && (X86_SPEEDSTEP_SMI || X86_SPEEDSTEP_ICH) + help + Don't perform all checks for a speedstep capable system which would + normally be done. Some ancient or strange systems, though speedstep + capable, don't always indicate that they are speedstep capable. This + option lets the probing code bypass some of those checks if the + parameter "relaxed_check=1" is passed to the module. + diff --git a/drivers/cpufreq/Makefile b/drivers/cpufreq/Makefile index 71fc3b4..c7f1a6f 100644 --- a/drivers/cpufreq/Makefile +++ b/drivers/cpufreq/Makefile @@ -13,3 +13,29 @@ obj-$(CONFIG_CPU_FREQ_GOV_CONSERVATIVE) += cpufreq_conservative.o # CPUfreq cross-arch helpers obj-$(CONFIG_CPU_FREQ_TABLE) += freq_table.o +##################################################################################d +# x86 drivers. +# Link order matters. K8 is preferred to ACPI because of firmware bugs in early +# K8 systems. ACPI is preferred to all other hardware-specific drivers. +# speedstep-* is preferred over p4-clockmod. + +obj-$(CONFIG_X86_POWERNOW_K8) += powernow-k8.o mperf.o +obj-$(CONFIG_X86_ACPI_CPUFREQ) += acpi-cpufreq.o mperf.o +obj-$(CONFIG_X86_PCC_CPUFREQ) += pcc-cpufreq.o +obj-$(CONFIG_X86_POWERNOW_K6) += powernow-k6.o +obj-$(CONFIG_X86_POWERNOW_K7) += powernow-k7.o +obj-$(CONFIG_X86_LONGHAUL) += longhaul.o +obj-$(CONFIG_X86_E_POWERSAVER) += e_powersaver.o +obj-$(CONFIG_ELAN_CPUFREQ) += elanfreq.o +obj-$(CONFIG_SC520_CPUFREQ) += sc520_freq.o +obj-$(CONFIG_X86_LONGRUN) += longrun.o +obj-$(CONFIG_X86_GX_SUSPMOD) += gx-suspmod.o +obj-$(CONFIG_X86_SPEEDSTEP_ICH) += speedstep-ich.o +obj-$(CONFIG_X86_SPEEDSTEP_LIB) += speedstep-lib.o +obj-$(CONFIG_X86_SPEEDSTEP_SMI) += speedstep-smi.o +obj-$(CONFIG_X86_SPEEDSTEP_CENTRINO) += speedstep-centrino.o +obj-$(CONFIG_X86_P4_CLOCKMOD) += p4-clockmod.o +obj-$(CONFIG_X86_CPUFREQ_NFORCE2) += cpufreq-nforce2.o + +##################################################################################d + diff --git a/drivers/cpufreq/acpi-cpufreq.c b/drivers/cpufreq/acpi-cpufreq.c new file mode 100644 index 0000000..4e04e12 --- /dev/null +++ b/drivers/cpufreq/acpi-cpufreq.c @@ -0,0 +1,773 @@ +/* + * acpi-cpufreq.c - ACPI Processor P-States Driver + * + * Copyright (C) 2001, 2002 Andy Grover + * Copyright (C) 2001, 2002 Paul Diefenbaugh + * Copyright (C) 2002 - 2004 Dominik Brodowski + * Copyright (C) 2006 Denis Sadykov + * + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or (at + * your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. + * + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +#include +#include +#include +#include "mperf.h" + +MODULE_AUTHOR("Paul Diefenbaugh, Dominik Brodowski"); +MODULE_DESCRIPTION("ACPI Processor P-States Driver"); +MODULE_LICENSE("GPL"); + +enum { + UNDEFINED_CAPABLE = 0, + SYSTEM_INTEL_MSR_CAPABLE, + SYSTEM_IO_CAPABLE, +}; + +#define INTEL_MSR_RANGE (0xffff) + +struct acpi_cpufreq_data { + struct acpi_processor_performance *acpi_data; + struct cpufreq_frequency_table *freq_table; + unsigned int resume; + unsigned int cpu_feature; +}; + +static DEFINE_PER_CPU(struct acpi_cpufreq_data *, acfreq_data); + +/* acpi_perf_data is a pointer to percpu data. */ +static struct acpi_processor_performance __percpu *acpi_perf_data; + +static struct cpufreq_driver acpi_cpufreq_driver; + +static unsigned int acpi_pstate_strict; + +static int check_est_cpu(unsigned int cpuid) +{ + struct cpuinfo_x86 *cpu = &cpu_data(cpuid); + + return cpu_has(cpu, X86_FEATURE_EST); +} + +static unsigned extract_io(u32 value, struct acpi_cpufreq_data *data) +{ + struct acpi_processor_performance *perf; + int i; + + perf = data->acpi_data; + + for (i = 0; i < perf->state_count; i++) { + if (value == perf->states[i].status) + return data->freq_table[i].frequency; + } + return 0; +} + +static unsigned extract_msr(u32 msr, struct acpi_cpufreq_data *data) +{ + int i; + struct acpi_processor_performance *perf; + + msr &= INTEL_MSR_RANGE; + perf = data->acpi_data; + + for (i = 0; data->freq_table[i].frequency != CPUFREQ_TABLE_END; i++) { + if (msr == perf->states[data->freq_table[i].index].status) + return data->freq_table[i].frequency; + } + return data->freq_table[0].frequency; +} + +static unsigned extract_freq(u32 val, struct acpi_cpufreq_data *data) +{ + switch (data->cpu_feature) { + case SYSTEM_INTEL_MSR_CAPABLE: + return extract_msr(val, data); + case SYSTEM_IO_CAPABLE: + return extract_io(val, data); + default: + return 0; + } +} + +struct msr_addr { + u32 reg; +}; + +struct io_addr { + u16 port; + u8 bit_width; +}; + +struct drv_cmd { + unsigned int type; + const struct cpumask *mask; + union { + struct msr_addr msr; + struct io_addr io; + } addr; + u32 val; +}; + +/* Called via smp_call_function_single(), on the target CPU */ +static void do_drv_read(void *_cmd) +{ + struct drv_cmd *cmd = _cmd; + u32 h; + + switch (cmd->type) { + case SYSTEM_INTEL_MSR_CAPABLE: + rdmsr(cmd->addr.msr.reg, cmd->val, h); + break; + case SYSTEM_IO_CAPABLE: + acpi_os_read_port((acpi_io_address)cmd->addr.io.port, + &cmd->val, + (u32)cmd->addr.io.bit_width); + break; + default: + break; + } +} + +/* Called via smp_call_function_many(), on the target CPUs */ +static void do_drv_write(void *_cmd) +{ + struct drv_cmd *cmd = _cmd; + u32 lo, hi; + + switch (cmd->type) { + case SYSTEM_INTEL_MSR_CAPABLE: + rdmsr(cmd->addr.msr.reg, lo, hi); + lo = (lo & ~INTEL_MSR_RANGE) | (cmd->val & INTEL_MSR_RANGE); + wrmsr(cmd->addr.msr.reg, lo, hi); + break; + case SYSTEM_IO_CAPABLE: + acpi_os_write_port((acpi_io_address)cmd->addr.io.port, + cmd->val, + (u32)cmd->addr.io.bit_width); + break; + default: + break; + } +} + +static void drv_read(struct drv_cmd *cmd) +{ + int err; + cmd->val = 0; + + err = smp_call_function_any(cmd->mask, do_drv_read, cmd, 1); + WARN_ON_ONCE(err); /* smp_call_function_any() was buggy? */ +} + +static void drv_write(struct drv_cmd *cmd) +{ + int this_cpu; + + this_cpu = get_cpu(); + if (cpumask_test_cpu(this_cpu, cmd->mask)) + do_drv_write(cmd); + smp_call_function_many(cmd->mask, do_drv_write, cmd, 1); + put_cpu(); +} + +static u32 get_cur_val(const struct cpumask *mask) +{ + struct acpi_processor_performance *perf; + struct drv_cmd cmd; + + if (unlikely(cpumask_empty(mask))) + return 0; + + switch (per_cpu(acfreq_data, cpumask_first(mask))->cpu_feature) { + case SYSTEM_INTEL_MSR_CAPABLE: + cmd.type = SYSTEM_INTEL_MSR_CAPABLE; + cmd.addr.msr.reg = MSR_IA32_PERF_STATUS; + break; + case SYSTEM_IO_CAPABLE: + cmd.type = SYSTEM_IO_CAPABLE; + perf = per_cpu(acfreq_data, cpumask_first(mask))->acpi_data; + cmd.addr.io.port = perf->control_register.address; + cmd.addr.io.bit_width = perf->control_register.bit_width; + break; + default: + return 0; + } + + cmd.mask = mask; + drv_read(&cmd); + + pr_debug("get_cur_val = %u\n", cmd.val); + + return cmd.val; +} + +static unsigned int get_cur_freq_on_cpu(unsigned int cpu) +{ + struct acpi_cpufreq_data *data = per_cpu(acfreq_data, cpu); + unsigned int freq; + unsigned int cached_freq; + + pr_debug("get_cur_freq_on_cpu (%d)\n", cpu); + + if (unlikely(data == NULL || + data->acpi_data == NULL || data->freq_table == NULL)) { + return 0; + } + + cached_freq = data->freq_table[data->acpi_data->state].frequency; + freq = extract_freq(get_cur_val(cpumask_of(cpu)), data); + if (freq != cached_freq) { + /* + * The dreaded BIOS frequency change behind our back. + * Force set the frequency on next target call. + */ + data->resume = 1; + } + + pr_debug("cur freq = %u\n", freq); + + return freq; +} + +static unsigned int check_freqs(const struct cpumask *mask, unsigned int freq, + struct acpi_cpufreq_data *data) +{ + unsigned int cur_freq; + unsigned int i; + + for (i = 0; i < 100; i++) { + cur_freq = extract_freq(get_cur_val(mask), data); + if (cur_freq == freq) + return 1; + udelay(10); + } + return 0; +} + +static int acpi_cpufreq_target(struct cpufreq_policy *policy, + unsigned int target_freq, unsigned int relation) +{ + struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu); + struct acpi_processor_performance *perf; + struct cpufreq_freqs freqs; + struct drv_cmd cmd; + unsigned int next_state = 0; /* Index into freq_table */ + unsigned int next_perf_state = 0; /* Index into perf table */ + unsigned int i; + int result = 0; + + pr_debug("acpi_cpufreq_target %d (%d)\n", target_freq, policy->cpu); + + if (unlikely(data == NULL || + data->acpi_data == NULL || data->freq_table == NULL)) { + return -ENODEV; + } + + perf = data->acpi_data; + result = cpufreq_frequency_table_target(policy, + data->freq_table, + target_freq, + relation, &next_state); + if (unlikely(result)) { + result = -ENODEV; + goto out; + } + + next_perf_state = data->freq_table[next_state].index; + if (perf->state == next_perf_state) { + if (unlikely(data->resume)) { + pr_debug("Called after resume, resetting to P%d\n", + next_perf_state); + data->resume = 0; + } else { + pr_debug("Already at target state (P%d)\n", + next_perf_state); + goto out; + } + } + + switch (data->cpu_feature) { + case SYSTEM_INTEL_MSR_CAPABLE: + cmd.type = SYSTEM_INTEL_MSR_CAPABLE; + cmd.addr.msr.reg = MSR_IA32_PERF_CTL; + cmd.val = (u32) perf->states[next_perf_state].control; + break; + case SYSTEM_IO_CAPABLE: + cmd.type = SYSTEM_IO_CAPABLE; + cmd.addr.io.port = perf->control_register.address; + cmd.addr.io.bit_width = perf->control_register.bit_width; + cmd.val = (u32) perf->states[next_perf_state].control; + break; + default: + result = -ENODEV; + goto out; + } + + /* cpufreq holds the hotplug lock, so we are safe from here on */ + if (policy->shared_type != CPUFREQ_SHARED_TYPE_ANY) + cmd.mask = policy->cpus; + else + cmd.mask = cpumask_of(policy->cpu); + + freqs.old = perf->states[perf->state].core_frequency * 1000; + freqs.new = data->freq_table[next_state].frequency; + for_each_cpu(i, policy->cpus) { + freqs.cpu = i; + cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); + } + + drv_write(&cmd); + + if (acpi_pstate_strict) { + if (!check_freqs(cmd.mask, freqs.new, data)) { + pr_debug("acpi_cpufreq_target failed (%d)\n", + policy->cpu); + result = -EAGAIN; + goto out; + } + } + + for_each_cpu(i, policy->cpus) { + freqs.cpu = i; + cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); + } + perf->state = next_perf_state; + +out: + return result; +} + +static int acpi_cpufreq_verify(struct cpufreq_policy *policy) +{ + struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu); + + pr_debug("acpi_cpufreq_verify\n"); + + return cpufreq_frequency_table_verify(policy, data->freq_table); +} + +static unsigned long +acpi_cpufreq_guess_freq(struct acpi_cpufreq_data *data, unsigned int cpu) +{ + struct acpi_processor_performance *perf = data->acpi_data; + + if (cpu_khz) { + /* search the closest match to cpu_khz */ + unsigned int i; + unsigned long freq; + unsigned long freqn = perf->states[0].core_frequency * 1000; + + for (i = 0; i < (perf->state_count-1); i++) { + freq = freqn; + freqn = perf->states[i+1].core_frequency * 1000; + if ((2 * cpu_khz) > (freqn + freq)) { + perf->state = i; + return freq; + } + } + perf->state = perf->state_count-1; + return freqn; + } else { + /* assume CPU is at P0... */ + perf->state = 0; + return perf->states[0].core_frequency * 1000; + } +} + +static void free_acpi_perf_data(void) +{ + unsigned int i; + + /* Freeing a NULL pointer is OK, and alloc_percpu zeroes. */ + for_each_possible_cpu(i) + free_cpumask_var(per_cpu_ptr(acpi_perf_data, i) + ->shared_cpu_map); + free_percpu(acpi_perf_data); +} + +/* + * acpi_cpufreq_early_init - initialize ACPI P-States library + * + * Initialize the ACPI P-States library (drivers/acpi/processor_perflib.c) + * in order to determine correct frequency and voltage pairings. We can + * do _PDC and _PSD and find out the processor dependency for the + * actual init that will happen later... + */ +static int __init acpi_cpufreq_early_init(void) +{ + unsigned int i; + pr_debug("acpi_cpufreq_early_init\n"); + + acpi_perf_data = alloc_percpu(struct acpi_processor_performance); + if (!acpi_perf_data) { + pr_debug("Memory allocation error for acpi_perf_data.\n"); + return -ENOMEM; + } + for_each_possible_cpu(i) { + if (!zalloc_cpumask_var_node( + &per_cpu_ptr(acpi_perf_data, i)->shared_cpu_map, + GFP_KERNEL, cpu_to_node(i))) { + + /* Freeing a NULL pointer is OK: alloc_percpu zeroes. */ + free_acpi_perf_data(); + return -ENOMEM; + } + } + + /* Do initialization in ACPI core */ + acpi_processor_preregister_performance(acpi_perf_data); + return 0; +} + +#ifdef CONFIG_SMP +/* + * Some BIOSes do SW_ANY coordination internally, either set it up in hw + * or do it in BIOS firmware and won't inform about it to OS. If not + * detected, this has a side effect of making CPU run at a different speed + * than OS intended it to run at. Detect it and handle it cleanly. + */ +static int bios_with_sw_any_bug; + +static int sw_any_bug_found(const struct dmi_system_id *d) +{ + bios_with_sw_any_bug = 1; + return 0; +} + +static const struct dmi_system_id sw_any_bug_dmi_table[] = { + { + .callback = sw_any_bug_found, + .ident = "Supermicro Server X6DLP", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Supermicro"), + DMI_MATCH(DMI_BIOS_VERSION, "080010"), + DMI_MATCH(DMI_PRODUCT_NAME, "X6DLP"), + }, + }, + { } +}; + +static int acpi_cpufreq_blacklist(struct cpuinfo_x86 *c) +{ + /* Intel Xeon Processor 7100 Series Specification Update + * http://www.intel.com/Assets/PDF/specupdate/314554.pdf + * AL30: A Machine Check Exception (MCE) Occurring during an + * Enhanced Intel SpeedStep Technology Ratio Change May Cause + * Both Processor Cores to Lock Up. */ + if (c->x86_vendor == X86_VENDOR_INTEL) { + if ((c->x86 == 15) && + (c->x86_model == 6) && + (c->x86_mask == 8)) { + printk(KERN_INFO "acpi-cpufreq: Intel(R) " + "Xeon(R) 7100 Errata AL30, processors may " + "lock up on frequency changes: disabling " + "acpi-cpufreq.\n"); + return -ENODEV; + } + } + return 0; +} +#endif + +static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) +{ + unsigned int i; + unsigned int valid_states = 0; + unsigned int cpu = policy->cpu; + struct acpi_cpufreq_data *data; + unsigned int result = 0; + struct cpuinfo_x86 *c = &cpu_data(policy->cpu); + struct acpi_processor_performance *perf; +#ifdef CONFIG_SMP + static int blacklisted; +#endif + + pr_debug("acpi_cpufreq_cpu_init\n"); + +#ifdef CONFIG_SMP + if (blacklisted) + return blacklisted; + blacklisted = acpi_cpufreq_blacklist(c); + if (blacklisted) + return blacklisted; +#endif + + data = kzalloc(sizeof(struct acpi_cpufreq_data), GFP_KERNEL); + if (!data) + return -ENOMEM; + + data->acpi_data = per_cpu_ptr(acpi_perf_data, cpu); + per_cpu(acfreq_data, cpu) = data; + + if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) + acpi_cpufreq_driver.flags |= CPUFREQ_CONST_LOOPS; + + result = acpi_processor_register_performance(data->acpi_data, cpu); + if (result) + goto err_free; + + perf = data->acpi_data; + policy->shared_type = perf->shared_type; + + /* + * Will let policy->cpus know about dependency only when software + * coordination is required. + */ + if (policy->shared_type == CPUFREQ_SHARED_TYPE_ALL || + policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) { + cpumask_copy(policy->cpus, perf->shared_cpu_map); + } + cpumask_copy(policy->related_cpus, perf->shared_cpu_map); + +#ifdef CONFIG_SMP + dmi_check_system(sw_any_bug_dmi_table); + if (bios_with_sw_any_bug && cpumask_weight(policy->cpus) == 1) { + policy->shared_type = CPUFREQ_SHARED_TYPE_ALL; + cpumask_copy(policy->cpus, cpu_core_mask(cpu)); + } +#endif + + /* capability check */ + if (perf->state_count <= 1) { + pr_debug("No P-States\n"); + result = -ENODEV; + goto err_unreg; + } + + if (perf->control_register.space_id != perf->status_register.space_id) { + result = -ENODEV; + goto err_unreg; + } + + switch (perf->control_register.space_id) { + case ACPI_ADR_SPACE_SYSTEM_IO: + pr_debug("SYSTEM IO addr space\n"); + data->cpu_feature = SYSTEM_IO_CAPABLE; + break; + case ACPI_ADR_SPACE_FIXED_HARDWARE: + pr_debug("HARDWARE addr space\n"); + if (!check_est_cpu(cpu)) { + result = -ENODEV; + goto err_unreg; + } + data->cpu_feature = SYSTEM_INTEL_MSR_CAPABLE; + break; + default: + pr_debug("Unknown addr space %d\n", + (u32) (perf->control_register.space_id)); + result = -ENODEV; + goto err_unreg; + } + + data->freq_table = kmalloc(sizeof(struct cpufreq_frequency_table) * + (perf->state_count+1), GFP_KERNEL); + if (!data->freq_table) { + result = -ENOMEM; + goto err_unreg; + } + + /* detect transition latency */ + policy->cpuinfo.transition_latency = 0; + for (i = 0; i < perf->state_count; i++) { + if ((perf->states[i].transition_latency * 1000) > + policy->cpuinfo.transition_latency) + policy->cpuinfo.transition_latency = + perf->states[i].transition_latency * 1000; + } + + /* Check for high latency (>20uS) from buggy BIOSes, like on T42 */ + if (perf->control_register.space_id == ACPI_ADR_SPACE_FIXED_HARDWARE && + policy->cpuinfo.transition_latency > 20 * 1000) { + policy->cpuinfo.transition_latency = 20 * 1000; + printk_once(KERN_INFO + "P-state transition latency capped at 20 uS\n"); + } + + /* table init */ + for (i = 0; i < perf->state_count; i++) { + if (i > 0 && perf->states[i].core_frequency >= + data->freq_table[valid_states-1].frequency / 1000) + continue; + + data->freq_table[valid_states].index = i; + data->freq_table[valid_states].frequency = + perf->states[i].core_frequency * 1000; + valid_states++; + } + data->freq_table[valid_states].frequency = CPUFREQ_TABLE_END; + perf->state = 0; + + result = cpufreq_frequency_table_cpuinfo(policy, data->freq_table); + if (result) + goto err_freqfree; + + if (perf->states[0].core_frequency * 1000 != policy->cpuinfo.max_freq) + printk(KERN_WARNING FW_WARN "P-state 0 is not max freq\n"); + + switch (perf->control_register.space_id) { + case ACPI_ADR_SPACE_SYSTEM_IO: + /* Current speed is unknown and not detectable by IO port */ + policy->cur = acpi_cpufreq_guess_freq(data, policy->cpu); + break; + case ACPI_ADR_SPACE_FIXED_HARDWARE: + acpi_cpufreq_driver.get = get_cur_freq_on_cpu; + policy->cur = get_cur_freq_on_cpu(cpu); + break; + default: + break; + } + + /* notify BIOS that we exist */ + acpi_processor_notify_smm(THIS_MODULE); + + /* Check for APERF/MPERF support in hardware */ + if (cpu_has(c, X86_FEATURE_APERFMPERF)) + acpi_cpufreq_driver.getavg = cpufreq_get_measured_perf; + + pr_debug("CPU%u - ACPI performance management activated.\n", cpu); + for (i = 0; i < perf->state_count; i++) + pr_debug(" %cP%d: %d MHz, %d mW, %d uS\n", + (i == perf->state ? '*' : ' '), i, + (u32) perf->states[i].core_frequency, + (u32) perf->states[i].power, + (u32) perf->states[i].transition_latency); + + cpufreq_frequency_table_get_attr(data->freq_table, policy->cpu); + + /* + * the first call to ->target() should result in us actually + * writing something to the appropriate registers. + */ + data->resume = 1; + + return result; + +err_freqfree: + kfree(data->freq_table); +err_unreg: + acpi_processor_unregister_performance(perf, cpu); +err_free: + kfree(data); + per_cpu(acfreq_data, cpu) = NULL; + + return result; +} + +static int acpi_cpufreq_cpu_exit(struct cpufreq_policy *policy) +{ + struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu); + + pr_debug("acpi_cpufreq_cpu_exit\n"); + + if (data) { + cpufreq_frequency_table_put_attr(policy->cpu); + per_cpu(acfreq_data, policy->cpu) = NULL; + acpi_processor_unregister_performance(data->acpi_data, + policy->cpu); + kfree(data->freq_table); + kfree(data); + } + + return 0; +} + +static int acpi_cpufreq_resume(struct cpufreq_policy *policy) +{ + struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu); + + pr_debug("acpi_cpufreq_resume\n"); + + data->resume = 1; + + return 0; +} + +static struct freq_attr *acpi_cpufreq_attr[] = { + &cpufreq_freq_attr_scaling_available_freqs, + NULL, +}; + +static struct cpufreq_driver acpi_cpufreq_driver = { + .verify = acpi_cpufreq_verify, + .target = acpi_cpufreq_target, + .bios_limit = acpi_processor_get_bios_limit, + .init = acpi_cpufreq_cpu_init, + .exit = acpi_cpufreq_cpu_exit, + .resume = acpi_cpufreq_resume, + .name = "acpi-cpufreq", + .owner = THIS_MODULE, + .attr = acpi_cpufreq_attr, +}; + +static int __init acpi_cpufreq_init(void) +{ + int ret; + + if (acpi_disabled) + return 0; + + pr_debug("acpi_cpufreq_init\n"); + + ret = acpi_cpufreq_early_init(); + if (ret) + return ret; + + ret = cpufreq_register_driver(&acpi_cpufreq_driver); + if (ret) + free_acpi_perf_data(); + + return ret; +} + +static void __exit acpi_cpufreq_exit(void) +{ + pr_debug("acpi_cpufreq_exit\n"); + + cpufreq_unregister_driver(&acpi_cpufreq_driver); + + free_percpu(acpi_perf_data); +} + +module_param(acpi_pstate_strict, uint, 0644); +MODULE_PARM_DESC(acpi_pstate_strict, + "value 0 or non-zero. non-zero -> strict ACPI checks are " + "performed during frequency changes."); + +late_initcall(acpi_cpufreq_init); +module_exit(acpi_cpufreq_exit); + +MODULE_ALIAS("acpi"); diff --git a/drivers/cpufreq/cpufreq-nforce2.c b/drivers/cpufreq/cpufreq-nforce2.c new file mode 100644 index 0000000..7bac808 --- /dev/null +++ b/drivers/cpufreq/cpufreq-nforce2.c @@ -0,0 +1,444 @@ +/* + * (C) 2004-2006 Sebastian Witt + * + * Licensed under the terms of the GNU GPL License version 2. + * Based upon reverse engineered information + * + * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous* + */ + +#include +#include +#include +#include +#include +#include +#include + +#define NFORCE2_XTAL 25 +#define NFORCE2_BOOTFSB 0x48 +#define NFORCE2_PLLENABLE 0xa8 +#define NFORCE2_PLLREG 0xa4 +#define NFORCE2_PLLADR 0xa0 +#define NFORCE2_PLL(mul, div) (0x100000 | (mul << 8) | div) + +#define NFORCE2_MIN_FSB 50 +#define NFORCE2_SAFE_DISTANCE 50 + +/* Delay in ms between FSB changes */ +/* #define NFORCE2_DELAY 10 */ + +/* + * nforce2_chipset: + * FSB is changed using the chipset + */ +static struct pci_dev *nforce2_dev; + +/* fid: + * multiplier * 10 + */ +static int fid; + +/* min_fsb, max_fsb: + * minimum and maximum FSB (= FSB at boot time) + */ +static int min_fsb; +static int max_fsb; + +MODULE_AUTHOR("Sebastian Witt "); +MODULE_DESCRIPTION("nForce2 FSB changing cpufreq driver"); +MODULE_LICENSE("GPL"); + +module_param(fid, int, 0444); +module_param(min_fsb, int, 0444); + +MODULE_PARM_DESC(fid, "CPU multiplier to use (11.5 = 115)"); +MODULE_PARM_DESC(min_fsb, + "Minimum FSB to use, if not defined: current FSB - 50"); + +#define PFX "cpufreq-nforce2: " + +/** + * nforce2_calc_fsb - calculate FSB + * @pll: PLL value + * + * Calculates FSB from PLL value + */ +static int nforce2_calc_fsb(int pll) +{ + unsigned char mul, div; + + mul = (pll >> 8) & 0xff; + div = pll & 0xff; + + if (div > 0) + return NFORCE2_XTAL * mul / div; + + return 0; +} + +/** + * nforce2_calc_pll - calculate PLL value + * @fsb: FSB + * + * Calculate PLL value for given FSB + */ +static int nforce2_calc_pll(unsigned int fsb) +{ + unsigned char xmul, xdiv; + unsigned char mul = 0, div = 0; + int tried = 0; + + /* Try to calculate multiplier and divider up to 4 times */ + while (((mul == 0) || (div == 0)) && (tried <= 3)) { + for (xdiv = 2; xdiv <= 0x80; xdiv++) + for (xmul = 1; xmul <= 0xfe; xmul++) + if (nforce2_calc_fsb(NFORCE2_PLL(xmul, xdiv)) == + fsb + tried) { + mul = xmul; + div = xdiv; + } + tried++; + } + + if ((mul == 0) || (div == 0)) + return -1; + + return NFORCE2_PLL(mul, div); +} + +/** + * nforce2_write_pll - write PLL value to chipset + * @pll: PLL value + * + * Writes new FSB PLL value to chipset + */ +static void nforce2_write_pll(int pll) +{ + int temp; + + /* Set the pll addr. to 0x00 */ + pci_write_config_dword(nforce2_dev, NFORCE2_PLLADR, 0); + + /* Now write the value in all 64 registers */ + for (temp = 0; temp <= 0x3f; temp++) + pci_write_config_dword(nforce2_dev, NFORCE2_PLLREG, pll); + + return; +} + +/** + * nforce2_fsb_read - Read FSB + * + * Read FSB from chipset + * If bootfsb != 0, return FSB at boot-time + */ +static unsigned int nforce2_fsb_read(int bootfsb) +{ + struct pci_dev *nforce2_sub5; + u32 fsb, temp = 0; + + /* Get chipset boot FSB from subdevice 5 (FSB at boot-time) */ + nforce2_sub5 = pci_get_subsys(PCI_VENDOR_ID_NVIDIA, 0x01EF, + PCI_ANY_ID, PCI_ANY_ID, NULL); + if (!nforce2_sub5) + return 0; + + pci_read_config_dword(nforce2_sub5, NFORCE2_BOOTFSB, &fsb); + fsb /= 1000000; + + /* Check if PLL register is already set */ + pci_read_config_byte(nforce2_dev, NFORCE2_PLLENABLE, (u8 *)&temp); + + if (bootfsb || !temp) + return fsb; + + /* Use PLL register FSB value */ + pci_read_config_dword(nforce2_dev, NFORCE2_PLLREG, &temp); + fsb = nforce2_calc_fsb(temp); + + return fsb; +} + +/** + * nforce2_set_fsb - set new FSB + * @fsb: New FSB + * + * Sets new FSB + */ +static int nforce2_set_fsb(unsigned int fsb) +{ + u32 temp = 0; + unsigned int tfsb; + int diff; + int pll = 0; + + if ((fsb > max_fsb) || (fsb < NFORCE2_MIN_FSB)) { + printk(KERN_ERR PFX "FSB %d is out of range!\n", fsb); + return -EINVAL; + } + + tfsb = nforce2_fsb_read(0); + if (!tfsb) { + printk(KERN_ERR PFX "Error while reading the FSB\n"); + return -EINVAL; + } + + /* First write? Then set actual value */ + pci_read_config_byte(nforce2_dev, NFORCE2_PLLENABLE, (u8 *)&temp); + if (!temp) { + pll = nforce2_calc_pll(tfsb); + + if (pll < 0) + return -EINVAL; + + nforce2_write_pll(pll); + } + + /* Enable write access */ + temp = 0x01; + pci_write_config_byte(nforce2_dev, NFORCE2_PLLENABLE, (u8)temp); + + diff = tfsb - fsb; + + if (!diff) + return 0; + + while ((tfsb != fsb) && (tfsb <= max_fsb) && (tfsb >= min_fsb)) { + if (diff < 0) + tfsb++; + else + tfsb--; + + /* Calculate the PLL reg. value */ + pll = nforce2_calc_pll(tfsb); + if (pll == -1) + return -EINVAL; + + nforce2_write_pll(pll); +#ifdef NFORCE2_DELAY + mdelay(NFORCE2_DELAY); +#endif + } + + temp = 0x40; + pci_write_config_byte(nforce2_dev, NFORCE2_PLLADR, (u8)temp); + + return 0; +} + +/** + * nforce2_get - get the CPU frequency + * @cpu: CPU number + * + * Returns the CPU frequency + */ +static unsigned int nforce2_get(unsigned int cpu) +{ + if (cpu) + return 0; + return nforce2_fsb_read(0) * fid * 100; +} + +/** + * nforce2_target - set a new CPUFreq policy + * @policy: new policy + * @target_freq: the target frequency + * @relation: how that frequency relates to achieved frequency + * (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H) + * + * Sets a new CPUFreq policy. + */ +static int nforce2_target(struct cpufreq_policy *policy, + unsigned int target_freq, unsigned int relation) +{ +/* unsigned long flags; */ + struct cpufreq_freqs freqs; + unsigned int target_fsb; + + if ((target_freq > policy->max) || (target_freq < policy->min)) + return -EINVAL; + + target_fsb = target_freq / (fid * 100); + + freqs.old = nforce2_get(policy->cpu); + freqs.new = target_fsb * fid * 100; + freqs.cpu = 0; /* Only one CPU on nForce2 platforms */ + + if (freqs.old == freqs.new) + return 0; + + pr_debug("Old CPU frequency %d kHz, new %d kHz\n", + freqs.old, freqs.new); + + cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); + + /* Disable IRQs */ + /* local_irq_save(flags); */ + + if (nforce2_set_fsb(target_fsb) < 0) + printk(KERN_ERR PFX "Changing FSB to %d failed\n", + target_fsb); + else + pr_debug("Changed FSB successfully to %d\n", + target_fsb); + + /* Enable IRQs */ + /* local_irq_restore(flags); */ + + cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); + + return 0; +} + +/** + * nforce2_verify - verifies a new CPUFreq policy + * @policy: new policy + */ +static int nforce2_verify(struct cpufreq_policy *policy) +{ + unsigned int fsb_pol_max; + + fsb_pol_max = policy->max / (fid * 100); + + if (policy->min < (fsb_pol_max * fid * 100)) + policy->max = (fsb_pol_max + 1) * fid * 100; + + cpufreq_verify_within_limits(policy, + policy->cpuinfo.min_freq, + policy->cpuinfo.max_freq); + return 0; +} + +static int nforce2_cpu_init(struct cpufreq_policy *policy) +{ + unsigned int fsb; + unsigned int rfid; + + /* capability check */ + if (policy->cpu != 0) + return -ENODEV; + + /* Get current FSB */ + fsb = nforce2_fsb_read(0); + + if (!fsb) + return -EIO; + + /* FIX: Get FID from CPU */ + if (!fid) { + if (!cpu_khz) { + printk(KERN_WARNING PFX + "cpu_khz not set, can't calculate multiplier!\n"); + return -ENODEV; + } + + fid = cpu_khz / (fsb * 100); + rfid = fid % 5; + + if (rfid) { + if (rfid > 2) + fid += 5 - rfid; + else + fid -= rfid; + } + } + + printk(KERN_INFO PFX "FSB currently at %i MHz, FID %d.%d\n", fsb, + fid / 10, fid % 10); + + /* Set maximum FSB to FSB at boot time */ + max_fsb = nforce2_fsb_read(1); + + if (!max_fsb) + return -EIO; + + if (!min_fsb) + min_fsb = max_fsb - NFORCE2_SAFE_DISTANCE; + + if (min_fsb < NFORCE2_MIN_FSB) + min_fsb = NFORCE2_MIN_FSB; + + /* cpuinfo and default policy values */ + policy->cpuinfo.min_freq = min_fsb * fid * 100; + policy->cpuinfo.max_freq = max_fsb * fid * 100; + policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL; + policy->cur = nforce2_get(policy->cpu); + policy->min = policy->cpuinfo.min_freq; + policy->max = policy->cpuinfo.max_freq; + + return 0; +} + +static int nforce2_cpu_exit(struct cpufreq_policy *policy) +{ + return 0; +} + +static struct cpufreq_driver nforce2_driver = { + .name = "nforce2", + .verify = nforce2_verify, + .target = nforce2_target, + .get = nforce2_get, + .init = nforce2_cpu_init, + .exit = nforce2_cpu_exit, + .owner = THIS_MODULE, +}; + +/** + * nforce2_detect_chipset - detect the Southbridge which contains FSB PLL logic + * + * Detects nForce2 A2 and C1 stepping + * + */ +static int nforce2_detect_chipset(void) +{ + nforce2_dev = pci_get_subsys(PCI_VENDOR_ID_NVIDIA, + PCI_DEVICE_ID_NVIDIA_NFORCE2, + PCI_ANY_ID, PCI_ANY_ID, NULL); + + if (nforce2_dev == NULL) + return -ENODEV; + + printk(KERN_INFO PFX "Detected nForce2 chipset revision %X\n", + nforce2_dev->revision); + printk(KERN_INFO PFX + "FSB changing is maybe unstable and can lead to " + "crashes and data loss.\n"); + + return 0; +} + +/** + * nforce2_init - initializes the nForce2 CPUFreq driver + * + * Initializes the nForce2 FSB support. Returns -ENODEV on unsupported + * devices, -EINVAL on problems during initiatization, and zero on + * success. + */ +static int __init nforce2_init(void) +{ + /* TODO: do we need to detect the processor? */ + + /* detect chipset */ + if (nforce2_detect_chipset()) { + printk(KERN_INFO PFX "No nForce2 chipset.\n"); + return -ENODEV; + } + + return cpufreq_register_driver(&nforce2_driver); +} + +/** + * nforce2_exit - unregisters cpufreq module + * + * Unregisters nForce2 FSB change support. + */ +static void __exit nforce2_exit(void) +{ + cpufreq_unregister_driver(&nforce2_driver); +} + +module_init(nforce2_init); +module_exit(nforce2_exit); + diff --git a/drivers/cpufreq/e_powersaver.c b/drivers/cpufreq/e_powersaver.c new file mode 100644 index 0000000..35a257d --- /dev/null +++ b/drivers/cpufreq/e_powersaver.c @@ -0,0 +1,367 @@ +/* + * Based on documentation provided by Dave Jones. Thanks! + * + * Licensed under the terms of the GNU GPL License version 2. + * + * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous* + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#define EPS_BRAND_C7M 0 +#define EPS_BRAND_C7 1 +#define EPS_BRAND_EDEN 2 +#define EPS_BRAND_C3 3 +#define EPS_BRAND_C7D 4 + +struct eps_cpu_data { + u32 fsb; + struct cpufreq_frequency_table freq_table[]; +}; + +static struct eps_cpu_data *eps_cpu[NR_CPUS]; + + +static unsigned int eps_get(unsigned int cpu) +{ + struct eps_cpu_data *centaur; + u32 lo, hi; + + if (cpu) + return 0; + centaur = eps_cpu[cpu]; + if (centaur == NULL) + return 0; + + /* Return current frequency */ + rdmsr(MSR_IA32_PERF_STATUS, lo, hi); + return centaur->fsb * ((lo >> 8) & 0xff); +} + +static int eps_set_state(struct eps_cpu_data *centaur, + unsigned int cpu, + u32 dest_state) +{ + struct cpufreq_freqs freqs; + u32 lo, hi; + int err = 0; + int i; + + freqs.old = eps_get(cpu); + freqs.new = centaur->fsb * ((dest_state >> 8) & 0xff); + freqs.cpu = cpu; + cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); + + /* Wait while CPU is busy */ + rdmsr(MSR_IA32_PERF_STATUS, lo, hi); + i = 0; + while (lo & ((1 << 16) | (1 << 17))) { + udelay(16); + rdmsr(MSR_IA32_PERF_STATUS, lo, hi); + i++; + if (unlikely(i > 64)) { + err = -ENODEV; + goto postchange; + } + } + /* Set new multiplier and voltage */ + wrmsr(MSR_IA32_PERF_CTL, dest_state & 0xffff, 0); + /* Wait until transition end */ + i = 0; + do { + udelay(16); + rdmsr(MSR_IA32_PERF_STATUS, lo, hi); + i++; + if (unlikely(i > 64)) { + err = -ENODEV; + goto postchange; + } + } while (lo & ((1 << 16) | (1 << 17))); + + /* Return current frequency */ +postchange: + rdmsr(MSR_IA32_PERF_STATUS, lo, hi); + freqs.new = centaur->fsb * ((lo >> 8) & 0xff); + +#ifdef DEBUG + { + u8 current_multiplier, current_voltage; + + /* Print voltage and multiplier */ + rdmsr(MSR_IA32_PERF_STATUS, lo, hi); + current_voltage = lo & 0xff; + printk(KERN_INFO "eps: Current voltage = %dmV\n", + current_voltage * 16 + 700); + current_multiplier = (lo >> 8) & 0xff; + printk(KERN_INFO "eps: Current multiplier = %d\n", + current_multiplier); + } +#endif + cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); + return err; +} + +static int eps_target(struct cpufreq_policy *policy, + unsigned int target_freq, + unsigned int relation) +{ + struct eps_cpu_data *centaur; + unsigned int newstate = 0; + unsigned int cpu = policy->cpu; + unsigned int dest_state; + int ret; + + if (unlikely(eps_cpu[cpu] == NULL)) + return -ENODEV; + centaur = eps_cpu[cpu]; + + if (unlikely(cpufreq_frequency_table_target(policy, + &eps_cpu[cpu]->freq_table[0], + target_freq, + relation, + &newstate))) { + return -EINVAL; + } + + /* Make frequency transition */ + dest_state = centaur->freq_table[newstate].index & 0xffff; + ret = eps_set_state(centaur, cpu, dest_state); + if (ret) + printk(KERN_ERR "eps: Timeout!\n"); + return ret; +} + +static int eps_verify(struct cpufreq_policy *policy) +{ + return cpufreq_frequency_table_verify(policy, + &eps_cpu[policy->cpu]->freq_table[0]); +} + +static int eps_cpu_init(struct cpufreq_policy *policy) +{ + unsigned int i; + u32 lo, hi; + u64 val; + u8 current_multiplier, current_voltage; + u8 max_multiplier, max_voltage; + u8 min_multiplier, min_voltage; + u8 brand = 0; + u32 fsb; + struct eps_cpu_data *centaur; + struct cpuinfo_x86 *c = &cpu_data(0); + struct cpufreq_frequency_table *f_table; + int k, step, voltage; + int ret; + int states; + + if (policy->cpu != 0) + return -ENODEV; + + /* Check brand */ + printk(KERN_INFO "eps: Detected VIA "); + + switch (c->x86_model) { + case 10: + rdmsr(0x1153, lo, hi); + brand = (((lo >> 2) ^ lo) >> 18) & 3; + printk(KERN_CONT "Model A "); + break; + case 13: + rdmsr(0x1154, lo, hi); + brand = (((lo >> 4) ^ (lo >> 2))) & 0x000000ff; + printk(KERN_CONT "Model D "); + break; + } + + switch (brand) { + case EPS_BRAND_C7M: + printk(KERN_CONT "C7-M\n"); + break; + case EPS_BRAND_C7: + printk(KERN_CONT "C7\n"); + break; + case EPS_BRAND_EDEN: + printk(KERN_CONT "Eden\n"); + break; + case EPS_BRAND_C7D: + printk(KERN_CONT "C7-D\n"); + break; + case EPS_BRAND_C3: + printk(KERN_CONT "C3\n"); + return -ENODEV; + break; + } + /* Enable Enhanced PowerSaver */ + rdmsrl(MSR_IA32_MISC_ENABLE, val); + if (!(val & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) { + val |= MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP; + wrmsrl(MSR_IA32_MISC_ENABLE, val); + /* Can be locked at 0 */ + rdmsrl(MSR_IA32_MISC_ENABLE, val); + if (!(val & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) { + printk(KERN_INFO "eps: Can't enable Enhanced PowerSaver\n"); + return -ENODEV; + } + } + + /* Print voltage and multiplier */ + rdmsr(MSR_IA32_PERF_STATUS, lo, hi); + current_voltage = lo & 0xff; + printk(KERN_INFO "eps: Current voltage = %dmV\n", + current_voltage * 16 + 700); + current_multiplier = (lo >> 8) & 0xff; + printk(KERN_INFO "eps: Current multiplier = %d\n", current_multiplier); + + /* Print limits */ + max_voltage = hi & 0xff; + printk(KERN_INFO "eps: Highest voltage = %dmV\n", + max_voltage * 16 + 700); + max_multiplier = (hi >> 8) & 0xff; + printk(KERN_INFO "eps: Highest multiplier = %d\n", max_multiplier); + min_voltage = (hi >> 16) & 0xff; + printk(KERN_INFO "eps: Lowest voltage = %dmV\n", + min_voltage * 16 + 700); + min_multiplier = (hi >> 24) & 0xff; + printk(KERN_INFO "eps: Lowest multiplier = %d\n", min_multiplier); + + /* Sanity checks */ + if (current_multiplier == 0 || max_multiplier == 0 + || min_multiplier == 0) + return -EINVAL; + if (current_multiplier > max_multiplier + || max_multiplier <= min_multiplier) + return -EINVAL; + if (current_voltage > 0x1f || max_voltage > 0x1f) + return -EINVAL; + if (max_voltage < min_voltage) + return -EINVAL; + + /* Calc FSB speed */ + fsb = cpu_khz / current_multiplier; + /* Calc number of p-states supported */ + if (brand == EPS_BRAND_C7M) + states = max_multiplier - min_multiplier + 1; + else + states = 2; + + /* Allocate private data and frequency table for current cpu */ + centaur = kzalloc(sizeof(struct eps_cpu_data) + + (states + 1) * sizeof(struct cpufreq_frequency_table), + GFP_KERNEL); + if (!centaur) + return -ENOMEM; + eps_cpu[0] = centaur; + + /* Copy basic values */ + centaur->fsb = fsb; + + /* Fill frequency and MSR value table */ + f_table = ¢aur->freq_table[0]; + if (brand != EPS_BRAND_C7M) { + f_table[0].frequency = fsb * min_multiplier; + f_table[0].index = (min_multiplier << 8) | min_voltage; + f_table[1].frequency = fsb * max_multiplier; + f_table[1].index = (max_multiplier << 8) | max_voltage; + f_table[2].frequency = CPUFREQ_TABLE_END; + } else { + k = 0; + step = ((max_voltage - min_voltage) * 256) + / (max_multiplier - min_multiplier); + for (i = min_multiplier; i <= max_multiplier; i++) { + voltage = (k * step) / 256 + min_voltage; + f_table[k].frequency = fsb * i; + f_table[k].index = (i << 8) | voltage; + k++; + } + f_table[k].frequency = CPUFREQ_TABLE_END; + } + + policy->cpuinfo.transition_latency = 140000; /* 844mV -> 700mV in ns */ + policy->cur = fsb * current_multiplier; + + ret = cpufreq_frequency_table_cpuinfo(policy, ¢aur->freq_table[0]); + if (ret) { + kfree(centaur); + return ret; + } + + cpufreq_frequency_table_get_attr(¢aur->freq_table[0], policy->cpu); + return 0; +} + +static int eps_cpu_exit(struct cpufreq_policy *policy) +{ + unsigned int cpu = policy->cpu; + struct eps_cpu_data *centaur; + u32 lo, hi; + + if (eps_cpu[cpu] == NULL) + return -ENODEV; + centaur = eps_cpu[cpu]; + + /* Get max frequency */ + rdmsr(MSR_IA32_PERF_STATUS, lo, hi); + /* Set max frequency */ + eps_set_state(centaur, cpu, hi & 0xffff); + /* Bye */ + cpufreq_frequency_table_put_attr(policy->cpu); + kfree(eps_cpu[cpu]); + eps_cpu[cpu] = NULL; + return 0; +} + +static struct freq_attr *eps_attr[] = { + &cpufreq_freq_attr_scaling_available_freqs, + NULL, +}; + +static struct cpufreq_driver eps_driver = { + .verify = eps_verify, + .target = eps_target, + .init = eps_cpu_init, + .exit = eps_cpu_exit, + .get = eps_get, + .name = "e_powersaver", + .owner = THIS_MODULE, + .attr = eps_attr, +}; + +static int __init eps_init(void) +{ + struct cpuinfo_x86 *c = &cpu_data(0); + + /* This driver will work only on Centaur C7 processors with + * Enhanced SpeedStep/PowerSaver registers */ + if (c->x86_vendor != X86_VENDOR_CENTAUR + || c->x86 != 6 || c->x86_model < 10) + return -ENODEV; + if (!cpu_has(c, X86_FEATURE_EST)) + return -ENODEV; + + if (cpufreq_register_driver(&eps_driver)) + return -EINVAL; + return 0; +} + +static void __exit eps_exit(void) +{ + cpufreq_unregister_driver(&eps_driver); +} + +MODULE_AUTHOR("Rafal Bilski "); +MODULE_DESCRIPTION("Enhanced PowerSaver driver for VIA C7 CPU's."); +MODULE_LICENSE("GPL"); + +module_init(eps_init); +module_exit(eps_exit); diff --git a/drivers/cpufreq/elanfreq.c b/drivers/cpufreq/elanfreq.c new file mode 100644 index 0000000..c587db4 --- /dev/null +++ b/drivers/cpufreq/elanfreq.c @@ -0,0 +1,309 @@ +/* + * elanfreq: cpufreq driver for the AMD ELAN family + * + * (c) Copyright 2002 Robert Schwebel + * + * Parts of this code are (c) Sven Geggus + * + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * 2002-02-13: - initial revision for 2.4.18-pre9 by Robert Schwebel + * + */ + +#include +#include +#include + +#include +#include + +#include +#include +#include + +#define REG_CSCIR 0x22 /* Chip Setup and Control Index Register */ +#define REG_CSCDR 0x23 /* Chip Setup and Control Data Register */ + +/* Module parameter */ +static int max_freq; + +struct s_elan_multiplier { + int clock; /* frequency in kHz */ + int val40h; /* PMU Force Mode register */ + int val80h; /* CPU Clock Speed Register */ +}; + +/* + * It is important that the frequencies + * are listed in ascending order here! + */ +static struct s_elan_multiplier elan_multiplier[] = { + {1000, 0x02, 0x18}, + {2000, 0x02, 0x10}, + {4000, 0x02, 0x08}, + {8000, 0x00, 0x00}, + {16000, 0x00, 0x02}, + {33000, 0x00, 0x04}, + {66000, 0x01, 0x04}, + {99000, 0x01, 0x05} +}; + +static struct cpufreq_frequency_table elanfreq_table[] = { + {0, 1000}, + {1, 2000}, + {2, 4000}, + {3, 8000}, + {4, 16000}, + {5, 33000}, + {6, 66000}, + {7, 99000}, + {0, CPUFREQ_TABLE_END}, +}; + + +/** + * elanfreq_get_cpu_frequency: determine current cpu speed + * + * Finds out at which frequency the CPU of the Elan SOC runs + * at the moment. Frequencies from 1 to 33 MHz are generated + * the normal way, 66 and 99 MHz are called "Hyperspeed Mode" + * and have the rest of the chip running with 33 MHz. + */ + +static unsigned int elanfreq_get_cpu_frequency(unsigned int cpu) +{ + u8 clockspeed_reg; /* Clock Speed Register */ + + local_irq_disable(); + outb_p(0x80, REG_CSCIR); + clockspeed_reg = inb_p(REG_CSCDR); + local_irq_enable(); + + if ((clockspeed_reg & 0xE0) == 0xE0) + return 0; + + /* Are we in CPU clock multiplied mode (66/99 MHz)? */ + if ((clockspeed_reg & 0xE0) == 0xC0) { + if ((clockspeed_reg & 0x01) == 0) + return 66000; + else + return 99000; + } + + /* 33 MHz is not 32 MHz... */ + if ((clockspeed_reg & 0xE0) == 0xA0) + return 33000; + + return (1<<((clockspeed_reg & 0xE0) >> 5)) * 1000; +} + + +/** + * elanfreq_set_cpu_frequency: Change the CPU core frequency + * @cpu: cpu number + * @freq: frequency in kHz + * + * This function takes a frequency value and changes the CPU frequency + * according to this. Note that the frequency has to be checked by + * elanfreq_validatespeed() for correctness! + * + * There is no return value. + */ + +static void elanfreq_set_cpu_state(unsigned int state) +{ + struct cpufreq_freqs freqs; + + freqs.old = elanfreq_get_cpu_frequency(0); + freqs.new = elan_multiplier[state].clock; + freqs.cpu = 0; /* elanfreq.c is UP only driver */ + + cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); + + printk(KERN_INFO "elanfreq: attempting to set frequency to %i kHz\n", + elan_multiplier[state].clock); + + + /* + * Access to the Elan's internal registers is indexed via + * 0x22: Chip Setup & Control Register Index Register (CSCI) + * 0x23: Chip Setup & Control Register Data Register (CSCD) + * + */ + + /* + * 0x40 is the Power Management Unit's Force Mode Register. + * Bit 6 enables Hyperspeed Mode (66/100 MHz core frequency) + */ + + local_irq_disable(); + outb_p(0x40, REG_CSCIR); /* Disable hyperspeed mode */ + outb_p(0x00, REG_CSCDR); + local_irq_enable(); /* wait till internal pipelines and */ + udelay(1000); /* buffers have cleaned up */ + + local_irq_disable(); + + /* now, set the CPU clock speed register (0x80) */ + outb_p(0x80, REG_CSCIR); + outb_p(elan_multiplier[state].val80h, REG_CSCDR); + + /* now, the hyperspeed bit in PMU Force Mode Register (0x40) */ + outb_p(0x40, REG_CSCIR); + outb_p(elan_multiplier[state].val40h, REG_CSCDR); + udelay(10000); + local_irq_enable(); + + cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); +}; + + +/** + * elanfreq_validatespeed: test if frequency range is valid + * @policy: the policy to validate + * + * This function checks if a given frequency range in kHz is valid + * for the hardware supported by the driver. + */ + +static int elanfreq_verify(struct cpufreq_policy *policy) +{ + return cpufreq_frequency_table_verify(policy, &elanfreq_table[0]); +} + +static int elanfreq_target(struct cpufreq_policy *policy, + unsigned int target_freq, + unsigned int relation) +{ + unsigned int newstate = 0; + + if (cpufreq_frequency_table_target(policy, &elanfreq_table[0], + target_freq, relation, &newstate)) + return -EINVAL; + + elanfreq_set_cpu_state(newstate); + + return 0; +} + + +/* + * Module init and exit code + */ + +static int elanfreq_cpu_init(struct cpufreq_policy *policy) +{ + struct cpuinfo_x86 *c = &cpu_data(0); + unsigned int i; + int result; + + /* capability check */ + if ((c->x86_vendor != X86_VENDOR_AMD) || + (c->x86 != 4) || (c->x86_model != 10)) + return -ENODEV; + + /* max freq */ + if (!max_freq) + max_freq = elanfreq_get_cpu_frequency(0); + + /* table init */ + for (i = 0; (elanfreq_table[i].frequency != CPUFREQ_TABLE_END); i++) { + if (elanfreq_table[i].frequency > max_freq) + elanfreq_table[i].frequency = CPUFREQ_ENTRY_INVALID; + } + + /* cpuinfo and default policy values */ + policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL; + policy->cur = elanfreq_get_cpu_frequency(0); + + result = cpufreq_frequency_table_cpuinfo(policy, elanfreq_table); + if (result) + return result; + + cpufreq_frequency_table_get_attr(elanfreq_table, policy->cpu); + return 0; +} + + +static int elanfreq_cpu_exit(struct cpufreq_policy *policy) +{ + cpufreq_frequency_table_put_attr(policy->cpu); + return 0; +} + + +#ifndef MODULE +/** + * elanfreq_setup - elanfreq command line parameter parsing + * + * elanfreq command line parameter. Use: + * elanfreq=66000 + * to set the maximum CPU frequency to 66 MHz. Note that in + * case you do not give this boot parameter, the maximum + * frequency will fall back to _current_ CPU frequency which + * might be lower. If you build this as a module, use the + * max_freq module parameter instead. + */ +static int __init elanfreq_setup(char *str) +{ + max_freq = simple_strtoul(str, &str, 0); + printk(KERN_WARNING "You're using the deprecated elanfreq command line option. Use elanfreq.max_freq instead, please!\n"); + return 1; +} +__setup("elanfreq=", elanfreq_setup); +#endif + + +static struct freq_attr *elanfreq_attr[] = { + &cpufreq_freq_attr_scaling_available_freqs, + NULL, +}; + + +static struct cpufreq_driver elanfreq_driver = { + .get = elanfreq_get_cpu_frequency, + .verify = elanfreq_verify, + .target = elanfreq_target, + .init = elanfreq_cpu_init, + .exit = elanfreq_cpu_exit, + .name = "elanfreq", + .owner = THIS_MODULE, + .attr = elanfreq_attr, +}; + + +static int __init elanfreq_init(void) +{ + struct cpuinfo_x86 *c = &cpu_data(0); + + /* Test if we have the right hardware */ + if ((c->x86_vendor != X86_VENDOR_AMD) || + (c->x86 != 4) || (c->x86_model != 10)) { + printk(KERN_INFO "elanfreq: error: no Elan processor found!\n"); + return -ENODEV; + } + return cpufreq_register_driver(&elanfreq_driver); +} + + +static void __exit elanfreq_exit(void) +{ + cpufreq_unregister_driver(&elanfreq_driver); +} + + +module_param(max_freq, int, 0444); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Robert Schwebel , " + "Sven Geggus "); +MODULE_DESCRIPTION("cpufreq driver for AMD's Elan CPUs"); + +module_init(elanfreq_init); +module_exit(elanfreq_exit); diff --git a/drivers/cpufreq/gx-suspmod.c b/drivers/cpufreq/gx-suspmod.c new file mode 100644 index 0000000..ffe1f2c --- /dev/null +++ b/drivers/cpufreq/gx-suspmod.c @@ -0,0 +1,514 @@ +/* + * Cyrix MediaGX and NatSemi Geode Suspend Modulation + * (C) 2002 Zwane Mwaikambo + * (C) 2002 Hiroshi Miura + * All Rights Reserved + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation + * + * The author(s) of this software shall not be held liable for damages + * of any nature resulting due to the use of this software. This + * software is provided AS-IS with no warranties. + * + * Theoretical note: + * + * (see Geode(tm) CS5530 manual (rev.4.1) page.56) + * + * CPU frequency control on NatSemi Geode GX1/GXLV processor and CS55x0 + * are based on Suspend Modulation. + * + * Suspend Modulation works by asserting and de-asserting the SUSP# pin + * to CPU(GX1/GXLV) for configurable durations. When asserting SUSP# + * the CPU enters an idle state. GX1 stops its core clock when SUSP# is + * asserted then power consumption is reduced. + * + * Suspend Modulation's OFF/ON duration are configurable + * with 'Suspend Modulation OFF Count Register' + * and 'Suspend Modulation ON Count Register'. + * These registers are 8bit counters that represent the number of + * 32us intervals which the SUSP# pin is asserted(ON)/de-asserted(OFF) + * to the processor. + * + * These counters define a ratio which is the effective frequency + * of operation of the system. + * + * OFF Count + * F_eff = Fgx * ---------------------- + * OFF Count + ON Count + * + * 0 <= On Count, Off Count <= 255 + * + * From these limits, we can get register values + * + * off_duration + on_duration <= MAX_DURATION + * on_duration = off_duration * (stock_freq - freq) / freq + * + * off_duration = (freq * DURATION) / stock_freq + * on_duration = DURATION - off_duration + * + * + *--------------------------------------------------------------------------- + * + * ChangeLog: + * Dec. 12, 2003 Hiroshi Miura + * - fix on/off register mistake + * - fix cpu_khz calc when it stops cpu modulation. + * + * Dec. 11, 2002 Hiroshi Miura + * - rewrite for Cyrix MediaGX Cx5510/5520 and + * NatSemi Geode Cs5530(A). + * + * Jul. ??, 2002 Zwane Mwaikambo + * - cs5530_mod patch for 2.4.19-rc1. + * + *--------------------------------------------------------------------------- + * + * Todo + * Test on machines with 5510, 5530, 5530A + */ + +/************************************************************************ + * Suspend Modulation - Definitions * + ************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +/* PCI config registers, all at F0 */ +#define PCI_PMER1 0x80 /* power management enable register 1 */ +#define PCI_PMER2 0x81 /* power management enable register 2 */ +#define PCI_PMER3 0x82 /* power management enable register 3 */ +#define PCI_IRQTC 0x8c /* irq speedup timer counter register:typical 2 to 4ms */ +#define PCI_VIDTC 0x8d /* video speedup timer counter register: typical 50 to 100ms */ +#define PCI_MODOFF 0x94 /* suspend modulation OFF counter register, 1 = 32us */ +#define PCI_MODON 0x95 /* suspend modulation ON counter register */ +#define PCI_SUSCFG 0x96 /* suspend configuration register */ + +/* PMER1 bits */ +#define GPM (1<<0) /* global power management */ +#define GIT (1<<1) /* globally enable PM device idle timers */ +#define GTR (1<<2) /* globally enable IO traps */ +#define IRQ_SPDUP (1<<3) /* disable clock throttle during interrupt handling */ +#define VID_SPDUP (1<<4) /* disable clock throttle during vga video handling */ + +/* SUSCFG bits */ +#define SUSMOD (1<<0) /* enable/disable suspend modulation */ +/* the below is supported only with cs5530 (after rev.1.2)/cs5530A */ +#define SMISPDUP (1<<1) /* select how SMI re-enable suspend modulation: */ + /* IRQTC timer or read SMI speedup disable reg.(F1BAR[08-09h]) */ +#define SUSCFG (1<<2) /* enable powering down a GXLV processor. "Special 3Volt Suspend" mode */ +/* the below is supported only with cs5530A */ +#define PWRSVE_ISA (1<<3) /* stop ISA clock */ +#define PWRSVE (1<<4) /* active idle */ + +struct gxfreq_params { + u8 on_duration; + u8 off_duration; + u8 pci_suscfg; + u8 pci_pmer1; + u8 pci_pmer2; + struct pci_dev *cs55x0; +}; + +static struct gxfreq_params *gx_params; +static int stock_freq; + +/* PCI bus clock - defaults to 30.000 if cpu_khz is not available */ +static int pci_busclk; +module_param(pci_busclk, int, 0444); + +/* maximum duration for which the cpu may be suspended + * (32us * MAX_DURATION). If no parameter is given, this defaults + * to 255. + * Note that this leads to a maximum of 8 ms(!) where the CPU clock + * is suspended -- processing power is just 0.39% of what it used to be, + * though. 781.25 kHz(!) for a 200 MHz processor -- wow. */ +static int max_duration = 255; +module_param(max_duration, int, 0444); + +/* For the default policy, we want at least some processing power + * - let's say 5%. (min = maxfreq / POLICY_MIN_DIV) + */ +#define POLICY_MIN_DIV 20 + + +/** + * we can detect a core multipiler from dir0_lsb + * from GX1 datasheet p.56, + * MULT[3:0]: + * 0000 = SYSCLK multiplied by 4 (test only) + * 0001 = SYSCLK multiplied by 10 + * 0010 = SYSCLK multiplied by 4 + * 0011 = SYSCLK multiplied by 6 + * 0100 = SYSCLK multiplied by 9 + * 0101 = SYSCLK multiplied by 5 + * 0110 = SYSCLK multiplied by 7 + * 0111 = SYSCLK multiplied by 8 + * of 33.3MHz + **/ +static int gx_freq_mult[16] = { + 4, 10, 4, 6, 9, 5, 7, 8, + 0, 0, 0, 0, 0, 0, 0, 0 +}; + + +/**************************************************************** + * Low Level chipset interface * + ****************************************************************/ +static struct pci_device_id gx_chipset_tbl[] __initdata = { + { PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY), }, + { PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5520), }, + { PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5510), }, + { 0, }, +}; + +static void gx_write_byte(int reg, int value) +{ + pci_write_config_byte(gx_params->cs55x0, reg, value); +} + +/** + * gx_detect_chipset: + * + **/ +static __init struct pci_dev *gx_detect_chipset(void) +{ + struct pci_dev *gx_pci = NULL; + + /* check if CPU is a MediaGX or a Geode. */ + if ((boot_cpu_data.x86_vendor != X86_VENDOR_NSC) && + (boot_cpu_data.x86_vendor != X86_VENDOR_CYRIX)) { + pr_debug("error: no MediaGX/Geode processor found!\n"); + return NULL; + } + + /* detect which companion chip is used */ + for_each_pci_dev(gx_pci) { + if ((pci_match_id(gx_chipset_tbl, gx_pci)) != NULL) + return gx_pci; + } + + pr_debug("error: no supported chipset found!\n"); + return NULL; +} + +/** + * gx_get_cpuspeed: + * + * Finds out at which efficient frequency the Cyrix MediaGX/NatSemi + * Geode CPU runs. + */ +static unsigned int gx_get_cpuspeed(unsigned int cpu) +{ + if ((gx_params->pci_suscfg & SUSMOD) == 0) + return stock_freq; + + return (stock_freq * gx_params->off_duration) + / (gx_params->on_duration + gx_params->off_duration); +} + +/** + * gx_validate_speed: + * determine current cpu speed + * + **/ + +static unsigned int gx_validate_speed(unsigned int khz, u8 *on_duration, + u8 *off_duration) +{ + unsigned int i; + u8 tmp_on, tmp_off; + int old_tmp_freq = stock_freq; + int tmp_freq; + + *off_duration = 1; + *on_duration = 0; + + for (i = max_duration; i > 0; i--) { + tmp_off = ((khz * i) / stock_freq) & 0xff; + tmp_on = i - tmp_off; + tmp_freq = (stock_freq * tmp_off) / i; + /* if this relation is closer to khz, use this. If it's equal, + * prefer it, too - lower latency */ + if (abs(tmp_freq - khz) <= abs(old_tmp_freq - khz)) { + *on_duration = tmp_on; + *off_duration = tmp_off; + old_tmp_freq = tmp_freq; + } + } + + return old_tmp_freq; +} + + +/** + * gx_set_cpuspeed: + * set cpu speed in khz. + **/ + +static void gx_set_cpuspeed(unsigned int khz) +{ + u8 suscfg, pmer1; + unsigned int new_khz; + unsigned long flags; + struct cpufreq_freqs freqs; + + freqs.cpu = 0; + freqs.old = gx_get_cpuspeed(0); + + new_khz = gx_validate_speed(khz, &gx_params->on_duration, + &gx_params->off_duration); + + freqs.new = new_khz; + + cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); + local_irq_save(flags); + + + + if (new_khz != stock_freq) { + /* if new khz == 100% of CPU speed, it is special case */ + switch (gx_params->cs55x0->device) { + case PCI_DEVICE_ID_CYRIX_5530_LEGACY: + pmer1 = gx_params->pci_pmer1 | IRQ_SPDUP | VID_SPDUP; + /* FIXME: need to test other values -- Zwane,Miura */ + /* typical 2 to 4ms */ + gx_write_byte(PCI_IRQTC, 4); + /* typical 50 to 100ms */ + gx_write_byte(PCI_VIDTC, 100); + gx_write_byte(PCI_PMER1, pmer1); + + if (gx_params->cs55x0->revision < 0x10) { + /* CS5530(rev 1.2, 1.3) */ + suscfg = gx_params->pci_suscfg|SUSMOD; + } else { + /* CS5530A,B.. */ + suscfg = gx_params->pci_suscfg|SUSMOD|PWRSVE; + } + break; + case PCI_DEVICE_ID_CYRIX_5520: + case PCI_DEVICE_ID_CYRIX_5510: + suscfg = gx_params->pci_suscfg | SUSMOD; + break; + default: + local_irq_restore(flags); + pr_debug("fatal: try to set unknown chipset.\n"); + return; + } + } else { + suscfg = gx_params->pci_suscfg & ~(SUSMOD); + gx_params->off_duration = 0; + gx_params->on_duration = 0; + pr_debug("suspend modulation disabled: cpu runs 100%% speed.\n"); + } + + gx_write_byte(PCI_MODOFF, gx_params->off_duration); + gx_write_byte(PCI_MODON, gx_params->on_duration); + + gx_write_byte(PCI_SUSCFG, suscfg); + pci_read_config_byte(gx_params->cs55x0, PCI_SUSCFG, &suscfg); + + local_irq_restore(flags); + + gx_params->pci_suscfg = suscfg; + + cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); + + pr_debug("suspend modulation w/ duration of ON:%d us, OFF:%d us\n", + gx_params->on_duration * 32, gx_params->off_duration * 32); + pr_debug("suspend modulation w/ clock speed: %d kHz.\n", freqs.new); +} + +/**************************************************************** + * High level functions * + ****************************************************************/ + +/* + * cpufreq_gx_verify: test if frequency range is valid + * + * This function checks if a given frequency range in kHz is valid + * for the hardware supported by the driver. + */ + +static int cpufreq_gx_verify(struct cpufreq_policy *policy) +{ + unsigned int tmp_freq = 0; + u8 tmp1, tmp2; + + if (!stock_freq || !policy) + return -EINVAL; + + policy->cpu = 0; + cpufreq_verify_within_limits(policy, (stock_freq / max_duration), + stock_freq); + + /* it needs to be assured that at least one supported frequency is + * within policy->min and policy->max. If it is not, policy->max + * needs to be increased until one freuqency is supported. + * policy->min may not be decreased, though. This way we guarantee a + * specific processing capacity. + */ + tmp_freq = gx_validate_speed(policy->min, &tmp1, &tmp2); + if (tmp_freq < policy->min) + tmp_freq += stock_freq / max_duration; + policy->min = tmp_freq; + if (policy->min > policy->max) + policy->max = tmp_freq; + tmp_freq = gx_validate_speed(policy->max, &tmp1, &tmp2); + if (tmp_freq > policy->max) + tmp_freq -= stock_freq / max_duration; + policy->max = tmp_freq; + if (policy->max < policy->min) + policy->max = policy->min; + cpufreq_verify_within_limits(policy, (stock_freq / max_duration), + stock_freq); + + return 0; +} + +/* + * cpufreq_gx_target: + * + */ +static int cpufreq_gx_target(struct cpufreq_policy *policy, + unsigned int target_freq, + unsigned int relation) +{ + u8 tmp1, tmp2; + unsigned int tmp_freq; + + if (!stock_freq || !policy) + return -EINVAL; + + policy->cpu = 0; + + tmp_freq = gx_validate_speed(target_freq, &tmp1, &tmp2); + while (tmp_freq < policy->min) { + tmp_freq += stock_freq / max_duration; + tmp_freq = gx_validate_speed(tmp_freq, &tmp1, &tmp2); + } + while (tmp_freq > policy->max) { + tmp_freq -= stock_freq / max_duration; + tmp_freq = gx_validate_speed(tmp_freq, &tmp1, &tmp2); + } + + gx_set_cpuspeed(tmp_freq); + + return 0; +} + +static int cpufreq_gx_cpu_init(struct cpufreq_policy *policy) +{ + unsigned int maxfreq, curfreq; + + if (!policy || policy->cpu != 0) + return -ENODEV; + + /* determine maximum frequency */ + if (pci_busclk) + maxfreq = pci_busclk * gx_freq_mult[getCx86(CX86_DIR1) & 0x0f]; + else if (cpu_khz) + maxfreq = cpu_khz; + else + maxfreq = 30000 * gx_freq_mult[getCx86(CX86_DIR1) & 0x0f]; + + stock_freq = maxfreq; + curfreq = gx_get_cpuspeed(0); + + pr_debug("cpu max frequency is %d.\n", maxfreq); + pr_debug("cpu current frequency is %dkHz.\n", curfreq); + + /* setup basic struct for cpufreq API */ + policy->cpu = 0; + + if (max_duration < POLICY_MIN_DIV) + policy->min = maxfreq / max_duration; + else + policy->min = maxfreq / POLICY_MIN_DIV; + policy->max = maxfreq; + policy->cur = curfreq; + policy->cpuinfo.min_freq = maxfreq / max_duration; + policy->cpuinfo.max_freq = maxfreq; + policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL; + + return 0; +} + +/* + * cpufreq_gx_init: + * MediaGX/Geode GX initialize cpufreq driver + */ +static struct cpufreq_driver gx_suspmod_driver = { + .get = gx_get_cpuspeed, + .verify = cpufreq_gx_verify, + .target = cpufreq_gx_target, + .init = cpufreq_gx_cpu_init, + .name = "gx-suspmod", + .owner = THIS_MODULE, +}; + +static int __init cpufreq_gx_init(void) +{ + int ret; + struct gxfreq_params *params; + struct pci_dev *gx_pci; + + /* Test if we have the right hardware */ + gx_pci = gx_detect_chipset(); + if (gx_pci == NULL) + return -ENODEV; + + /* check whether module parameters are sane */ + if (max_duration > 0xff) + max_duration = 0xff; + + pr_debug("geode suspend modulation available.\n"); + + params = kzalloc(sizeof(struct gxfreq_params), GFP_KERNEL); + if (params == NULL) + return -ENOMEM; + + params->cs55x0 = gx_pci; + gx_params = params; + + /* keep cs55x0 configurations */ + pci_read_config_byte(params->cs55x0, PCI_SUSCFG, &(params->pci_suscfg)); + pci_read_config_byte(params->cs55x0, PCI_PMER1, &(params->pci_pmer1)); + pci_read_config_byte(params->cs55x0, PCI_PMER2, &(params->pci_pmer2)); + pci_read_config_byte(params->cs55x0, PCI_MODON, &(params->on_duration)); + pci_read_config_byte(params->cs55x0, PCI_MODOFF, + &(params->off_duration)); + + ret = cpufreq_register_driver(&gx_suspmod_driver); + if (ret) { + kfree(params); + return ret; /* register error! */ + } + + return 0; +} + +static void __exit cpufreq_gx_exit(void) +{ + cpufreq_unregister_driver(&gx_suspmod_driver); + pci_dev_put(gx_params->cs55x0); + kfree(gx_params); +} + +MODULE_AUTHOR("Hiroshi Miura "); +MODULE_DESCRIPTION("Cpufreq driver for Cyrix MediaGX and NatSemi Geode"); +MODULE_LICENSE("GPL"); + +module_init(cpufreq_gx_init); +module_exit(cpufreq_gx_exit); + diff --git a/drivers/cpufreq/longhaul.c b/drivers/cpufreq/longhaul.c new file mode 100644 index 0000000..f47d26e --- /dev/null +++ b/drivers/cpufreq/longhaul.c @@ -0,0 +1,1024 @@ +/* + * (C) 2001-2004 Dave Jones. + * (C) 2002 Padraig Brady. + * + * Licensed under the terms of the GNU GPL License version 2. + * Based upon datasheets & sample CPUs kindly provided by VIA. + * + * VIA have currently 3 different versions of Longhaul. + * Version 1 (Longhaul) uses the BCR2 MSR at 0x1147. + * It is present only in Samuel 1 (C5A), Samuel 2 (C5B) stepping 0. + * Version 2 of longhaul is backward compatible with v1, but adds + * LONGHAUL MSR for purpose of both frequency and voltage scaling. + * Present in Samuel 2 (steppings 1-7 only) (C5B), and Ezra (C5C). + * Version 3 of longhaul got renamed to Powersaver and redesigned + * to use only the POWERSAVER MSR at 0x110a. + * It is present in Ezra-T (C5M), Nehemiah (C5X) and above. + * It's pretty much the same feature wise to longhaul v2, though + * there is provision for scaling FSB too, but this doesn't work + * too well in practice so we don't even try to use this. + * + * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous* + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "longhaul.h" + +#define PFX "longhaul: " + +#define TYPE_LONGHAUL_V1 1 +#define TYPE_LONGHAUL_V2 2 +#define TYPE_POWERSAVER 3 + +#define CPU_SAMUEL 1 +#define CPU_SAMUEL2 2 +#define CPU_EZRA 3 +#define CPU_EZRA_T 4 +#define CPU_NEHEMIAH 5 +#define CPU_NEHEMIAH_C 6 + +/* Flags */ +#define USE_ACPI_C3 (1 << 1) +#define USE_NORTHBRIDGE (1 << 2) + +static int cpu_model; +static unsigned int numscales = 16; +static unsigned int fsb; + +static const struct mV_pos *vrm_mV_table; +static const unsigned char *mV_vrm_table; + +static unsigned int highest_speed, lowest_speed; /* kHz */ +static unsigned int minmult, maxmult; +static int can_scale_voltage; +static struct acpi_processor *pr; +static struct acpi_processor_cx *cx; +static u32 acpi_regs_addr; +static u8 longhaul_flags; +static unsigned int longhaul_index; + +/* Module parameters */ +static int scale_voltage; +static int disable_acpi_c3; +static int revid_errata; + + +/* Clock ratios multiplied by 10 */ +static int mults[32]; +static int eblcr[32]; +static int longhaul_version; +static struct cpufreq_frequency_table *longhaul_table; + +static char speedbuffer[8]; + +static char *print_speed(int speed) +{ + if (speed < 1000) { + snprintf(speedbuffer, sizeof(speedbuffer), "%dMHz", speed); + return speedbuffer; + } + + if (speed%1000 == 0) + snprintf(speedbuffer, sizeof(speedbuffer), + "%dGHz", speed/1000); + else + snprintf(speedbuffer, sizeof(speedbuffer), + "%d.%dGHz", speed/1000, (speed%1000)/100); + + return speedbuffer; +} + + +static unsigned int calc_speed(int mult) +{ + int khz; + khz = (mult/10)*fsb; + if (mult%10) + khz += fsb/2; + khz *= 1000; + return khz; +} + + +static int longhaul_get_cpu_mult(void) +{ + unsigned long invalue = 0, lo, hi; + + rdmsr(MSR_IA32_EBL_CR_POWERON, lo, hi); + invalue = (lo & (1<<22|1<<23|1<<24|1<<25))>>22; + if (longhaul_version == TYPE_LONGHAUL_V2 || + longhaul_version == TYPE_POWERSAVER) { + if (lo & (1<<27)) + invalue += 16; + } + return eblcr[invalue]; +} + +/* For processor with BCR2 MSR */ + +static void do_longhaul1(unsigned int mults_index) +{ + union msr_bcr2 bcr2; + + rdmsrl(MSR_VIA_BCR2, bcr2.val); + /* Enable software clock multiplier */ + bcr2.bits.ESOFTBF = 1; + bcr2.bits.CLOCKMUL = mults_index & 0xff; + + /* Sync to timer tick */ + safe_halt(); + /* Change frequency on next halt or sleep */ + wrmsrl(MSR_VIA_BCR2, bcr2.val); + /* Invoke transition */ + ACPI_FLUSH_CPU_CACHE(); + halt(); + + /* Disable software clock multiplier */ + local_irq_disable(); + rdmsrl(MSR_VIA_BCR2, bcr2.val); + bcr2.bits.ESOFTBF = 0; + wrmsrl(MSR_VIA_BCR2, bcr2.val); +} + +/* For processor with Longhaul MSR */ + +static void do_powersaver(int cx_address, unsigned int mults_index, + unsigned int dir) +{ + union msr_longhaul longhaul; + u32 t; + + rdmsrl(MSR_VIA_LONGHAUL, longhaul.val); + /* Setup new frequency */ + if (!revid_errata) + longhaul.bits.RevisionKey = longhaul.bits.RevisionID; + else + longhaul.bits.RevisionKey = 0; + longhaul.bits.SoftBusRatio = mults_index & 0xf; + longhaul.bits.SoftBusRatio4 = (mults_index & 0x10) >> 4; + /* Setup new voltage */ + if (can_scale_voltage) + longhaul.bits.SoftVID = (mults_index >> 8) & 0x1f; + /* Sync to timer tick */ + safe_halt(); + /* Raise voltage if necessary */ + if (can_scale_voltage && dir) { + longhaul.bits.EnableSoftVID = 1; + wrmsrl(MSR_VIA_LONGHAUL, longhaul.val); + /* Change voltage */ + if (!cx_address) { + ACPI_FLUSH_CPU_CACHE(); + halt(); + } else { + ACPI_FLUSH_CPU_CACHE(); + /* Invoke C3 */ + inb(cx_address); + /* Dummy op - must do something useless after P_LVL3 + * read */ + t = inl(acpi_gbl_FADT.xpm_timer_block.address); + } + longhaul.bits.EnableSoftVID = 0; + wrmsrl(MSR_VIA_LONGHAUL, longhaul.val); + } + + /* Change frequency on next halt or sleep */ + longhaul.bits.EnableSoftBusRatio = 1; + wrmsrl(MSR_VIA_LONGHAUL, longhaul.val); + if (!cx_address) { + ACPI_FLUSH_CPU_CACHE(); + halt(); + } else { + ACPI_FLUSH_CPU_CACHE(); + /* Invoke C3 */ + inb(cx_address); + /* Dummy op - must do something useless after P_LVL3 read */ + t = inl(acpi_gbl_FADT.xpm_timer_block.address); + } + /* Disable bus ratio bit */ + longhaul.bits.EnableSoftBusRatio = 0; + wrmsrl(MSR_VIA_LONGHAUL, longhaul.val); + + /* Reduce voltage if necessary */ + if (can_scale_voltage && !dir) { + longhaul.bits.EnableSoftVID = 1; + wrmsrl(MSR_VIA_LONGHAUL, longhaul.val); + /* Change voltage */ + if (!cx_address) { + ACPI_FLUSH_CPU_CACHE(); + halt(); + } else { + ACPI_FLUSH_CPU_CACHE(); + /* Invoke C3 */ + inb(cx_address); + /* Dummy op - must do something useless after P_LVL3 + * read */ + t = inl(acpi_gbl_FADT.xpm_timer_block.address); + } + longhaul.bits.EnableSoftVID = 0; + wrmsrl(MSR_VIA_LONGHAUL, longhaul.val); + } +} + +/** + * longhaul_set_cpu_frequency() + * @mults_index : bitpattern of the new multiplier. + * + * Sets a new clock ratio. + */ + +static void longhaul_setstate(unsigned int table_index) +{ + unsigned int mults_index; + int speed, mult; + struct cpufreq_freqs freqs; + unsigned long flags; + unsigned int pic1_mask, pic2_mask; + u16 bm_status = 0; + u32 bm_timeout = 1000; + unsigned int dir = 0; + + mults_index = longhaul_table[table_index].index; + /* Safety precautions */ + mult = mults[mults_index & 0x1f]; + if (mult == -1) + return; + speed = calc_speed(mult); + if ((speed > highest_speed) || (speed < lowest_speed)) + return; + /* Voltage transition before frequency transition? */ + if (can_scale_voltage && longhaul_index < table_index) + dir = 1; + + freqs.old = calc_speed(longhaul_get_cpu_mult()); + freqs.new = speed; + freqs.cpu = 0; /* longhaul.c is UP only driver */ + + cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); + + pr_debug("Setting to FSB:%dMHz Mult:%d.%dx (%s)\n", + fsb, mult/10, mult%10, print_speed(speed/1000)); +retry_loop: + preempt_disable(); + local_irq_save(flags); + + pic2_mask = inb(0xA1); + pic1_mask = inb(0x21); /* works on C3. save mask. */ + outb(0xFF, 0xA1); /* Overkill */ + outb(0xFE, 0x21); /* TMR0 only */ + + /* Wait while PCI bus is busy. */ + if (acpi_regs_addr && (longhaul_flags & USE_NORTHBRIDGE + || ((pr != NULL) && pr->flags.bm_control))) { + bm_status = inw(acpi_regs_addr); + bm_status &= 1 << 4; + while (bm_status && bm_timeout) { + outw(1 << 4, acpi_regs_addr); + bm_timeout--; + bm_status = inw(acpi_regs_addr); + bm_status &= 1 << 4; + } + } + + if (longhaul_flags & USE_NORTHBRIDGE) { + /* Disable AGP and PCI arbiters */ + outb(3, 0x22); + } else if ((pr != NULL) && pr->flags.bm_control) { + /* Disable bus master arbitration */ + acpi_write_bit_register(ACPI_BITREG_ARB_DISABLE, 1); + } + switch (longhaul_version) { + + /* + * Longhaul v1. (Samuel[C5A] and Samuel2 stepping 0[C5B]) + * Software controlled multipliers only. + */ + case TYPE_LONGHAUL_V1: + do_longhaul1(mults_index); + break; + + /* + * Longhaul v2 appears in Samuel2 Steppings 1->7 [C5B] and Ezra [C5C] + * + * Longhaul v3 (aka Powersaver). (Ezra-T [C5M] & Nehemiah [C5N]) + * Nehemiah can do FSB scaling too, but this has never been proven + * to work in practice. + */ + case TYPE_LONGHAUL_V2: + case TYPE_POWERSAVER: + if (longhaul_flags & USE_ACPI_C3) { + /* Don't allow wakeup */ + acpi_write_bit_register(ACPI_BITREG_BUS_MASTER_RLD, 0); + do_powersaver(cx->address, mults_index, dir); + } else { + do_powersaver(0, mults_index, dir); + } + break; + } + + if (longhaul_flags & USE_NORTHBRIDGE) { + /* Enable arbiters */ + outb(0, 0x22); + } else if ((pr != NULL) && pr->flags.bm_control) { + /* Enable bus master arbitration */ + acpi_write_bit_register(ACPI_BITREG_ARB_DISABLE, 0); + } + outb(pic2_mask, 0xA1); /* restore mask */ + outb(pic1_mask, 0x21); + + local_irq_restore(flags); + preempt_enable(); + + freqs.new = calc_speed(longhaul_get_cpu_mult()); + /* Check if requested frequency is set. */ + if (unlikely(freqs.new != speed)) { + printk(KERN_INFO PFX "Failed to set requested frequency!\n"); + /* Revision ID = 1 but processor is expecting revision key + * equal to 0. Jumpers at the bottom of processor will change + * multiplier and FSB, but will not change bits in Longhaul + * MSR nor enable voltage scaling. */ + if (!revid_errata) { + printk(KERN_INFO PFX "Enabling \"Ignore Revision ID\" " + "option.\n"); + revid_errata = 1; + msleep(200); + goto retry_loop; + } + /* Why ACPI C3 sometimes doesn't work is a mystery for me. + * But it does happen. Processor is entering ACPI C3 state, + * but it doesn't change frequency. I tried poking various + * bits in northbridge registers, but without success. */ + if (longhaul_flags & USE_ACPI_C3) { + printk(KERN_INFO PFX "Disabling ACPI C3 support.\n"); + longhaul_flags &= ~USE_ACPI_C3; + if (revid_errata) { + printk(KERN_INFO PFX "Disabling \"Ignore " + "Revision ID\" option.\n"); + revid_errata = 0; + } + msleep(200); + goto retry_loop; + } + /* This shouldn't happen. Longhaul ver. 2 was reported not + * working on processors without voltage scaling, but with + * RevID = 1. RevID errata will make things right. Just + * to be 100% sure. */ + if (longhaul_version == TYPE_LONGHAUL_V2) { + printk(KERN_INFO PFX "Switching to Longhaul ver. 1\n"); + longhaul_version = TYPE_LONGHAUL_V1; + msleep(200); + goto retry_loop; + } + } + /* Report true CPU frequency */ + cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); + + if (!bm_timeout) + printk(KERN_INFO PFX "Warning: Timeout while waiting for " + "idle PCI bus.\n"); +} + +/* + * Centaur decided to make life a little more tricky. + * Only longhaul v1 is allowed to read EBLCR BSEL[0:1]. + * Samuel2 and above have to try and guess what the FSB is. + * We do this by assuming we booted at maximum multiplier, and interpolate + * between that value multiplied by possible FSBs and cpu_mhz which + * was calculated at boot time. Really ugly, but no other way to do this. + */ + +#define ROUNDING 0xf + +static int guess_fsb(int mult) +{ + int speed = cpu_khz / 1000; + int i; + int speeds[] = { 666, 1000, 1333, 2000 }; + int f_max, f_min; + + for (i = 0; i < 4; i++) { + f_max = ((speeds[i] * mult) + 50) / 100; + f_max += (ROUNDING / 2); + f_min = f_max - ROUNDING; + if ((speed <= f_max) && (speed >= f_min)) + return speeds[i] / 10; + } + return 0; +} + + +static int __cpuinit longhaul_get_ranges(void) +{ + unsigned int i, j, k = 0; + unsigned int ratio; + int mult; + + /* Get current frequency */ + mult = longhaul_get_cpu_mult(); + if (mult == -1) { + printk(KERN_INFO PFX "Invalid (reserved) multiplier!\n"); + return -EINVAL; + } + fsb = guess_fsb(mult); + if (fsb == 0) { + printk(KERN_INFO PFX "Invalid (reserved) FSB!\n"); + return -EINVAL; + } + /* Get max multiplier - as we always did. + * Longhaul MSR is useful only when voltage scaling is enabled. + * C3 is booting at max anyway. */ + maxmult = mult; + /* Get min multiplier */ + switch (cpu_model) { + case CPU_NEHEMIAH: + minmult = 50; + break; + case CPU_NEHEMIAH_C: + minmult = 40; + break; + default: + minmult = 30; + break; + } + + pr_debug("MinMult:%d.%dx MaxMult:%d.%dx\n", + minmult/10, minmult%10, maxmult/10, maxmult%10); + + highest_speed = calc_speed(maxmult); + lowest_speed = calc_speed(minmult); + pr_debug("FSB:%dMHz Lowest speed: %s Highest speed:%s\n", fsb, + print_speed(lowest_speed/1000), + print_speed(highest_speed/1000)); + + if (lowest_speed == highest_speed) { + printk(KERN_INFO PFX "highestspeed == lowest, aborting.\n"); + return -EINVAL; + } + if (lowest_speed > highest_speed) { + printk(KERN_INFO PFX "nonsense! lowest (%d > %d) !\n", + lowest_speed, highest_speed); + return -EINVAL; + } + + longhaul_table = kmalloc((numscales + 1) * sizeof(*longhaul_table), + GFP_KERNEL); + if (!longhaul_table) + return -ENOMEM; + + for (j = 0; j < numscales; j++) { + ratio = mults[j]; + if (ratio == -1) + continue; + if (ratio > maxmult || ratio < minmult) + continue; + longhaul_table[k].frequency = calc_speed(ratio); + longhaul_table[k].index = j; + k++; + } + if (k <= 1) { + kfree(longhaul_table); + return -ENODEV; + } + /* Sort */ + for (j = 0; j < k - 1; j++) { + unsigned int min_f, min_i; + min_f = longhaul_table[j].frequency; + min_i = j; + for (i = j + 1; i < k; i++) { + if (longhaul_table[i].frequency < min_f) { + min_f = longhaul_table[i].frequency; + min_i = i; + } + } + if (min_i != j) { + swap(longhaul_table[j].frequency, + longhaul_table[min_i].frequency); + swap(longhaul_table[j].index, + longhaul_table[min_i].index); + } + } + + longhaul_table[k].frequency = CPUFREQ_TABLE_END; + + /* Find index we are running on */ + for (j = 0; j < k; j++) { + if (mults[longhaul_table[j].index & 0x1f] == mult) { + longhaul_index = j; + break; + } + } + return 0; +} + + +static void __cpuinit longhaul_setup_voltagescaling(void) +{ + union msr_longhaul longhaul; + struct mV_pos minvid, maxvid, vid; + unsigned int j, speed, pos, kHz_step, numvscales; + int min_vid_speed; + + rdmsrl(MSR_VIA_LONGHAUL, longhaul.val); + if (!(longhaul.bits.RevisionID & 1)) { + printk(KERN_INFO PFX "Voltage scaling not supported by CPU.\n"); + return; + } + + if (!longhaul.bits.VRMRev) { + printk(KERN_INFO PFX "VRM 8.5\n"); + vrm_mV_table = &vrm85_mV[0]; + mV_vrm_table = &mV_vrm85[0]; + } else { + printk(KERN_INFO PFX "Mobile VRM\n"); + if (cpu_model < CPU_NEHEMIAH) + return; + vrm_mV_table = &mobilevrm_mV[0]; + mV_vrm_table = &mV_mobilevrm[0]; + } + + minvid = vrm_mV_table[longhaul.bits.MinimumVID]; + maxvid = vrm_mV_table[longhaul.bits.MaximumVID]; + + if (minvid.mV == 0 || maxvid.mV == 0 || minvid.mV > maxvid.mV) { + printk(KERN_INFO PFX "Bogus values Min:%d.%03d Max:%d.%03d. " + "Voltage scaling disabled.\n", + minvid.mV/1000, minvid.mV%1000, + maxvid.mV/1000, maxvid.mV%1000); + return; + } + + if (minvid.mV == maxvid.mV) { + printk(KERN_INFO PFX "Claims to support voltage scaling but " + "min & max are both %d.%03d. " + "Voltage scaling disabled\n", + maxvid.mV/1000, maxvid.mV%1000); + return; + } + + /* How many voltage steps*/ + numvscales = maxvid.pos - minvid.pos + 1; + printk(KERN_INFO PFX + "Max VID=%d.%03d " + "Min VID=%d.%03d, " + "%d possible voltage scales\n", + maxvid.mV/1000, maxvid.mV%1000, + minvid.mV/1000, minvid.mV%1000, + numvscales); + + /* Calculate max frequency at min voltage */ + j = longhaul.bits.MinMHzBR; + if (longhaul.bits.MinMHzBR4) + j += 16; + min_vid_speed = eblcr[j]; + if (min_vid_speed == -1) + return; + switch (longhaul.bits.MinMHzFSB) { + case 0: + min_vid_speed *= 13333; + break; + case 1: + min_vid_speed *= 10000; + break; + case 3: + min_vid_speed *= 6666; + break; + default: + return; + break; + } + if (min_vid_speed >= highest_speed) + return; + /* Calculate kHz for one voltage step */ + kHz_step = (highest_speed - min_vid_speed) / numvscales; + + j = 0; + while (longhaul_table[j].frequency != CPUFREQ_TABLE_END) { + speed = longhaul_table[j].frequency; + if (speed > min_vid_speed) + pos = (speed - min_vid_speed) / kHz_step + minvid.pos; + else + pos = minvid.pos; + longhaul_table[j].index |= mV_vrm_table[pos] << 8; + vid = vrm_mV_table[mV_vrm_table[pos]]; + printk(KERN_INFO PFX "f: %d kHz, index: %d, vid: %d mV\n", + speed, j, vid.mV); + j++; + } + + can_scale_voltage = 1; + printk(KERN_INFO PFX "Voltage scaling enabled.\n"); +} + + +static int longhaul_verify(struct cpufreq_policy *policy) +{ + return cpufreq_frequency_table_verify(policy, longhaul_table); +} + + +static int longhaul_target(struct cpufreq_policy *policy, + unsigned int target_freq, unsigned int relation) +{ + unsigned int table_index = 0; + unsigned int i; + unsigned int dir = 0; + u8 vid, current_vid; + + if (cpufreq_frequency_table_target(policy, longhaul_table, target_freq, + relation, &table_index)) + return -EINVAL; + + /* Don't set same frequency again */ + if (longhaul_index == table_index) + return 0; + + if (!can_scale_voltage) + longhaul_setstate(table_index); + else { + /* On test system voltage transitions exceeding single + * step up or down were turning motherboard off. Both + * "ondemand" and "userspace" are unsafe. C7 is doing + * this in hardware, C3 is old and we need to do this + * in software. */ + i = longhaul_index; + current_vid = (longhaul_table[longhaul_index].index >> 8); + current_vid &= 0x1f; + if (table_index > longhaul_index) + dir = 1; + while (i != table_index) { + vid = (longhaul_table[i].index >> 8) & 0x1f; + if (vid != current_vid) { + longhaul_setstate(i); + current_vid = vid; + msleep(200); + } + if (dir) + i++; + else + i--; + } + longhaul_setstate(table_index); + } + longhaul_index = table_index; + return 0; +} + + +static unsigned int longhaul_get(unsigned int cpu) +{ + if (cpu) + return 0; + return calc_speed(longhaul_get_cpu_mult()); +} + +static acpi_status longhaul_walk_callback(acpi_handle obj_handle, + u32 nesting_level, + void *context, void **return_value) +{ + struct acpi_device *d; + + if (acpi_bus_get_device(obj_handle, &d)) + return 0; + + *return_value = acpi_driver_data(d); + return 1; +} + +/* VIA don't support PM2 reg, but have something similar */ +static int enable_arbiter_disable(void) +{ + struct pci_dev *dev; + int status = 1; + int reg; + u8 pci_cmd; + + /* Find PLE133 host bridge */ + reg = 0x78; + dev = pci_get_device(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8601_0, + NULL); + /* Find PM133/VT8605 host bridge */ + if (dev == NULL) + dev = pci_get_device(PCI_VENDOR_ID_VIA, + PCI_DEVICE_ID_VIA_8605_0, NULL); + /* Find CLE266 host bridge */ + if (dev == NULL) { + reg = 0x76; + dev = pci_get_device(PCI_VENDOR_ID_VIA, + PCI_DEVICE_ID_VIA_862X_0, NULL); + /* Find CN400 V-Link host bridge */ + if (dev == NULL) + dev = pci_get_device(PCI_VENDOR_ID_VIA, 0x7259, NULL); + } + if (dev != NULL) { + /* Enable access to port 0x22 */ + pci_read_config_byte(dev, reg, &pci_cmd); + if (!(pci_cmd & 1<<7)) { + pci_cmd |= 1<<7; + pci_write_config_byte(dev, reg, pci_cmd); + pci_read_config_byte(dev, reg, &pci_cmd); + if (!(pci_cmd & 1<<7)) { + printk(KERN_ERR PFX + "Can't enable access to port 0x22.\n"); + status = 0; + } + } + pci_dev_put(dev); + return status; + } + return 0; +} + +static int longhaul_setup_southbridge(void) +{ + struct pci_dev *dev; + u8 pci_cmd; + + /* Find VT8235 southbridge */ + dev = pci_get_device(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8235, NULL); + if (dev == NULL) + /* Find VT8237 southbridge */ + dev = pci_get_device(PCI_VENDOR_ID_VIA, + PCI_DEVICE_ID_VIA_8237, NULL); + if (dev != NULL) { + /* Set transition time to max */ + pci_read_config_byte(dev, 0xec, &pci_cmd); + pci_cmd &= ~(1 << 2); + pci_write_config_byte(dev, 0xec, pci_cmd); + pci_read_config_byte(dev, 0xe4, &pci_cmd); + pci_cmd &= ~(1 << 7); + pci_write_config_byte(dev, 0xe4, pci_cmd); + pci_read_config_byte(dev, 0xe5, &pci_cmd); + pci_cmd |= 1 << 7; + pci_write_config_byte(dev, 0xe5, pci_cmd); + /* Get address of ACPI registers block*/ + pci_read_config_byte(dev, 0x81, &pci_cmd); + if (pci_cmd & 1 << 7) { + pci_read_config_dword(dev, 0x88, &acpi_regs_addr); + acpi_regs_addr &= 0xff00; + printk(KERN_INFO PFX "ACPI I/O at 0x%x\n", + acpi_regs_addr); + } + + pci_dev_put(dev); + return 1; + } + return 0; +} + +static int __cpuinit longhaul_cpu_init(struct cpufreq_policy *policy) +{ + struct cpuinfo_x86 *c = &cpu_data(0); + char *cpuname = NULL; + int ret; + u32 lo, hi; + + /* Check what we have on this motherboard */ + switch (c->x86_model) { + case 6: + cpu_model = CPU_SAMUEL; + cpuname = "C3 'Samuel' [C5A]"; + longhaul_version = TYPE_LONGHAUL_V1; + memcpy(mults, samuel1_mults, sizeof(samuel1_mults)); + memcpy(eblcr, samuel1_eblcr, sizeof(samuel1_eblcr)); + break; + + case 7: + switch (c->x86_mask) { + case 0: + longhaul_version = TYPE_LONGHAUL_V1; + cpu_model = CPU_SAMUEL2; + cpuname = "C3 'Samuel 2' [C5B]"; + /* Note, this is not a typo, early Samuel2's had + * Samuel1 ratios. */ + memcpy(mults, samuel1_mults, sizeof(samuel1_mults)); + memcpy(eblcr, samuel2_eblcr, sizeof(samuel2_eblcr)); + break; + case 1 ... 15: + longhaul_version = TYPE_LONGHAUL_V2; + if (c->x86_mask < 8) { + cpu_model = CPU_SAMUEL2; + cpuname = "C3 'Samuel 2' [C5B]"; + } else { + cpu_model = CPU_EZRA; + cpuname = "C3 'Ezra' [C5C]"; + } + memcpy(mults, ezra_mults, sizeof(ezra_mults)); + memcpy(eblcr, ezra_eblcr, sizeof(ezra_eblcr)); + break; + } + break; + + case 8: + cpu_model = CPU_EZRA_T; + cpuname = "C3 'Ezra-T' [C5M]"; + longhaul_version = TYPE_POWERSAVER; + numscales = 32; + memcpy(mults, ezrat_mults, sizeof(ezrat_mults)); + memcpy(eblcr, ezrat_eblcr, sizeof(ezrat_eblcr)); + break; + + case 9: + longhaul_version = TYPE_POWERSAVER; + numscales = 32; + memcpy(mults, nehemiah_mults, sizeof(nehemiah_mults)); + memcpy(eblcr, nehemiah_eblcr, sizeof(nehemiah_eblcr)); + switch (c->x86_mask) { + case 0 ... 1: + cpu_model = CPU_NEHEMIAH; + cpuname = "C3 'Nehemiah A' [C5XLOE]"; + break; + case 2 ... 4: + cpu_model = CPU_NEHEMIAH; + cpuname = "C3 'Nehemiah B' [C5XLOH]"; + break; + case 5 ... 15: + cpu_model = CPU_NEHEMIAH_C; + cpuname = "C3 'Nehemiah C' [C5P]"; + break; + } + break; + + default: + cpuname = "Unknown"; + break; + } + /* Check Longhaul ver. 2 */ + if (longhaul_version == TYPE_LONGHAUL_V2) { + rdmsr(MSR_VIA_LONGHAUL, lo, hi); + if (lo == 0 && hi == 0) + /* Looks like MSR isn't present */ + longhaul_version = TYPE_LONGHAUL_V1; + } + + printk(KERN_INFO PFX "VIA %s CPU detected. ", cpuname); + switch (longhaul_version) { + case TYPE_LONGHAUL_V1: + case TYPE_LONGHAUL_V2: + printk(KERN_CONT "Longhaul v%d supported.\n", longhaul_version); + break; + case TYPE_POWERSAVER: + printk(KERN_CONT "Powersaver supported.\n"); + break; + }; + + /* Doesn't hurt */ + longhaul_setup_southbridge(); + + /* Find ACPI data for processor */ + acpi_walk_namespace(ACPI_TYPE_PROCESSOR, ACPI_ROOT_OBJECT, + ACPI_UINT32_MAX, &longhaul_walk_callback, NULL, + NULL, (void *)&pr); + + /* Check ACPI support for C3 state */ + if (pr != NULL && longhaul_version == TYPE_POWERSAVER) { + cx = &pr->power.states[ACPI_STATE_C3]; + if (cx->address > 0 && cx->latency <= 1000) + longhaul_flags |= USE_ACPI_C3; + } + /* Disable if it isn't working */ + if (disable_acpi_c3) + longhaul_flags &= ~USE_ACPI_C3; + /* Check if northbridge is friendly */ + if (enable_arbiter_disable()) + longhaul_flags |= USE_NORTHBRIDGE; + + /* Check ACPI support for bus master arbiter disable */ + if (!(longhaul_flags & USE_ACPI_C3 + || longhaul_flags & USE_NORTHBRIDGE) + && ((pr == NULL) || !(pr->flags.bm_control))) { + printk(KERN_ERR PFX + "No ACPI support. Unsupported northbridge.\n"); + return -ENODEV; + } + + if (longhaul_flags & USE_NORTHBRIDGE) + printk(KERN_INFO PFX "Using northbridge support.\n"); + if (longhaul_flags & USE_ACPI_C3) + printk(KERN_INFO PFX "Using ACPI support.\n"); + + ret = longhaul_get_ranges(); + if (ret != 0) + return ret; + + if ((longhaul_version != TYPE_LONGHAUL_V1) && (scale_voltage != 0)) + longhaul_setup_voltagescaling(); + + policy->cpuinfo.transition_latency = 200000; /* nsec */ + policy->cur = calc_speed(longhaul_get_cpu_mult()); + + ret = cpufreq_frequency_table_cpuinfo(policy, longhaul_table); + if (ret) + return ret; + + cpufreq_frequency_table_get_attr(longhaul_table, policy->cpu); + + return 0; +} + +static int __devexit longhaul_cpu_exit(struct cpufreq_policy *policy) +{ + cpufreq_frequency_table_put_attr(policy->cpu); + return 0; +} + +static struct freq_attr *longhaul_attr[] = { + &cpufreq_freq_attr_scaling_available_freqs, + NULL, +}; + +static struct cpufreq_driver longhaul_driver = { + .verify = longhaul_verify, + .target = longhaul_target, + .get = longhaul_get, + .init = longhaul_cpu_init, + .exit = __devexit_p(longhaul_cpu_exit), + .name = "longhaul", + .owner = THIS_MODULE, + .attr = longhaul_attr, +}; + + +static int __init longhaul_init(void) +{ + struct cpuinfo_x86 *c = &cpu_data(0); + + if (c->x86_vendor != X86_VENDOR_CENTAUR || c->x86 != 6) + return -ENODEV; + +#ifdef CONFIG_SMP + if (num_online_cpus() > 1) { + printk(KERN_ERR PFX "More than 1 CPU detected, " + "longhaul disabled.\n"); + return -ENODEV; + } +#endif +#ifdef CONFIG_X86_IO_APIC + if (cpu_has_apic) { + printk(KERN_ERR PFX "APIC detected. Longhaul is currently " + "broken in this configuration.\n"); + return -ENODEV; + } +#endif + switch (c->x86_model) { + case 6 ... 9: + return cpufreq_register_driver(&longhaul_driver); + case 10: + printk(KERN_ERR PFX "Use acpi-cpufreq driver for VIA C7\n"); + default: + ; + } + + return -ENODEV; +} + + +static void __exit longhaul_exit(void) +{ + int i; + + for (i = 0; i < numscales; i++) { + if (mults[i] == maxmult) { + longhaul_setstate(i); + break; + } + } + + cpufreq_unregister_driver(&longhaul_driver); + kfree(longhaul_table); +} + +/* Even if BIOS is exporting ACPI C3 state, and it is used + * with success when CPU is idle, this state doesn't + * trigger frequency transition in some cases. */ +module_param(disable_acpi_c3, int, 0644); +MODULE_PARM_DESC(disable_acpi_c3, "Don't use ACPI C3 support"); +/* Change CPU voltage with frequency. Very useful to save + * power, but most VIA C3 processors aren't supporting it. */ +module_param(scale_voltage, int, 0644); +MODULE_PARM_DESC(scale_voltage, "Scale voltage of processor"); +/* Force revision key to 0 for processors which doesn't + * support voltage scaling, but are introducing itself as + * such. */ +module_param(revid_errata, int, 0644); +MODULE_PARM_DESC(revid_errata, "Ignore CPU Revision ID"); + +MODULE_AUTHOR("Dave Jones "); +MODULE_DESCRIPTION("Longhaul driver for VIA Cyrix processors."); +MODULE_LICENSE("GPL"); + +late_initcall(longhaul_init); +module_exit(longhaul_exit); diff --git a/drivers/cpufreq/longhaul.h b/drivers/cpufreq/longhaul.h new file mode 100644 index 0000000..cbf48fb --- /dev/null +++ b/drivers/cpufreq/longhaul.h @@ -0,0 +1,353 @@ +/* + * longhaul.h + * (C) 2003 Dave Jones. + * + * Licensed under the terms of the GNU GPL License version 2. + * + * VIA-specific information + */ + +union msr_bcr2 { + struct { + unsigned Reseved:19, // 18:0 + ESOFTBF:1, // 19 + Reserved2:3, // 22:20 + CLOCKMUL:4, // 26:23 + Reserved3:5; // 31:27 + } bits; + unsigned long val; +}; + +union msr_longhaul { + struct { + unsigned RevisionID:4, // 3:0 + RevisionKey:4, // 7:4 + EnableSoftBusRatio:1, // 8 + EnableSoftVID:1, // 9 + EnableSoftBSEL:1, // 10 + Reserved:3, // 11:13 + SoftBusRatio4:1, // 14 + VRMRev:1, // 15 + SoftBusRatio:4, // 19:16 + SoftVID:5, // 24:20 + Reserved2:3, // 27:25 + SoftBSEL:2, // 29:28 + Reserved3:2, // 31:30 + MaxMHzBR:4, // 35:32 + MaximumVID:5, // 40:36 + MaxMHzFSB:2, // 42:41 + MaxMHzBR4:1, // 43 + Reserved4:4, // 47:44 + MinMHzBR:4, // 51:48 + MinimumVID:5, // 56:52 + MinMHzFSB:2, // 58:57 + MinMHzBR4:1, // 59 + Reserved5:4; // 63:60 + } bits; + unsigned long long val; +}; + +/* + * Clock ratio tables. Div/Mod by 10 to get ratio. + * The eblcr values specify the ratio read from the CPU. + * The mults values specify what to write to the CPU. + */ + +/* + * VIA C3 Samuel 1 & Samuel 2 (stepping 0) + */ +static const int __cpuinitdata samuel1_mults[16] = { + -1, /* 0000 -> RESERVED */ + 30, /* 0001 -> 3.0x */ + 40, /* 0010 -> 4.0x */ + -1, /* 0011 -> RESERVED */ + -1, /* 0100 -> RESERVED */ + 35, /* 0101 -> 3.5x */ + 45, /* 0110 -> 4.5x */ + 55, /* 0111 -> 5.5x */ + 60, /* 1000 -> 6.0x */ + 70, /* 1001 -> 7.0x */ + 80, /* 1010 -> 8.0x */ + 50, /* 1011 -> 5.0x */ + 65, /* 1100 -> 6.5x */ + 75, /* 1101 -> 7.5x */ + -1, /* 1110 -> RESERVED */ + -1, /* 1111 -> RESERVED */ +}; + +static const int __cpuinitdata samuel1_eblcr[16] = { + 50, /* 0000 -> RESERVED */ + 30, /* 0001 -> 3.0x */ + 40, /* 0010 -> 4.0x */ + -1, /* 0011 -> RESERVED */ + 55, /* 0100 -> 5.5x */ + 35, /* 0101 -> 3.5x */ + 45, /* 0110 -> 4.5x */ + -1, /* 0111 -> RESERVED */ + -1, /* 1000 -> RESERVED */ + 70, /* 1001 -> 7.0x */ + 80, /* 1010 -> 8.0x */ + 60, /* 1011 -> 6.0x */ + -1, /* 1100 -> RESERVED */ + 75, /* 1101 -> 7.5x */ + -1, /* 1110 -> RESERVED */ + 65, /* 1111 -> 6.5x */ +}; + +/* + * VIA C3 Samuel2 Stepping 1->15 + */ +static const int __cpuinitdata samuel2_eblcr[16] = { + 50, /* 0000 -> 5.0x */ + 30, /* 0001 -> 3.0x */ + 40, /* 0010 -> 4.0x */ + 100, /* 0011 -> 10.0x */ + 55, /* 0100 -> 5.5x */ + 35, /* 0101 -> 3.5x */ + 45, /* 0110 -> 4.5x */ + 110, /* 0111 -> 11.0x */ + 90, /* 1000 -> 9.0x */ + 70, /* 1001 -> 7.0x */ + 80, /* 1010 -> 8.0x */ + 60, /* 1011 -> 6.0x */ + 120, /* 1100 -> 12.0x */ + 75, /* 1101 -> 7.5x */ + 130, /* 1110 -> 13.0x */ + 65, /* 1111 -> 6.5x */ +}; + +/* + * VIA C3 Ezra + */ +static const int __cpuinitdata ezra_mults[16] = { + 100, /* 0000 -> 10.0x */ + 30, /* 0001 -> 3.0x */ + 40, /* 0010 -> 4.0x */ + 90, /* 0011 -> 9.0x */ + 95, /* 0100 -> 9.5x */ + 35, /* 0101 -> 3.5x */ + 45, /* 0110 -> 4.5x */ + 55, /* 0111 -> 5.5x */ + 60, /* 1000 -> 6.0x */ + 70, /* 1001 -> 7.0x */ + 80, /* 1010 -> 8.0x */ + 50, /* 1011 -> 5.0x */ + 65, /* 1100 -> 6.5x */ + 75, /* 1101 -> 7.5x */ + 85, /* 1110 -> 8.5x */ + 120, /* 1111 -> 12.0x */ +}; + +static const int __cpuinitdata ezra_eblcr[16] = { + 50, /* 0000 -> 5.0x */ + 30, /* 0001 -> 3.0x */ + 40, /* 0010 -> 4.0x */ + 100, /* 0011 -> 10.0x */ + 55, /* 0100 -> 5.5x */ + 35, /* 0101 -> 3.5x */ + 45, /* 0110 -> 4.5x */ + 95, /* 0111 -> 9.5x */ + 90, /* 1000 -> 9.0x */ + 70, /* 1001 -> 7.0x */ + 80, /* 1010 -> 8.0x */ + 60, /* 1011 -> 6.0x */ + 120, /* 1100 -> 12.0x */ + 75, /* 1101 -> 7.5x */ + 85, /* 1110 -> 8.5x */ + 65, /* 1111 -> 6.5x */ +}; + +/* + * VIA C3 (Ezra-T) [C5M]. + */ +static const int __cpuinitdata ezrat_mults[32] = { + 100, /* 0000 -> 10.0x */ + 30, /* 0001 -> 3.0x */ + 40, /* 0010 -> 4.0x */ + 90, /* 0011 -> 9.0x */ + 95, /* 0100 -> 9.5x */ + 35, /* 0101 -> 3.5x */ + 45, /* 0110 -> 4.5x */ + 55, /* 0111 -> 5.5x */ + 60, /* 1000 -> 6.0x */ + 70, /* 1001 -> 7.0x */ + 80, /* 1010 -> 8.0x */ + 50, /* 1011 -> 5.0x */ + 65, /* 1100 -> 6.5x */ + 75, /* 1101 -> 7.5x */ + 85, /* 1110 -> 8.5x */ + 120, /* 1111 -> 12.0x */ + + -1, /* 0000 -> RESERVED (10.0x) */ + 110, /* 0001 -> 11.0x */ + -1, /* 0010 -> 12.0x */ + -1, /* 0011 -> RESERVED (9.0x)*/ + 105, /* 0100 -> 10.5x */ + 115, /* 0101 -> 11.5x */ + 125, /* 0110 -> 12.5x */ + 135, /* 0111 -> 13.5x */ + 140, /* 1000 -> 14.0x */ + 150, /* 1001 -> 15.0x */ + 160, /* 1010 -> 16.0x */ + 130, /* 1011 -> 13.0x */ + 145, /* 1100 -> 14.5x */ + 155, /* 1101 -> 15.5x */ + -1, /* 1110 -> RESERVED (13.0x) */ + -1, /* 1111 -> RESERVED (12.0x) */ +}; + +static const int __cpuinitdata ezrat_eblcr[32] = { + 50, /* 0000 -> 5.0x */ + 30, /* 0001 -> 3.0x */ + 40, /* 0010 -> 4.0x */ + 100, /* 0011 -> 10.0x */ + 55, /* 0100 -> 5.5x */ + 35, /* 0101 -> 3.5x */ + 45, /* 0110 -> 4.5x */ + 95, /* 0111 -> 9.5x */ + 90, /* 1000 -> 9.0x */ + 70, /* 1001 -> 7.0x */ + 80, /* 1010 -> 8.0x */ + 60, /* 1011 -> 6.0x */ + 120, /* 1100 -> 12.0x */ + 75, /* 1101 -> 7.5x */ + 85, /* 1110 -> 8.5x */ + 65, /* 1111 -> 6.5x */ + + -1, /* 0000 -> RESERVED (9.0x) */ + 110, /* 0001 -> 11.0x */ + 120, /* 0010 -> 12.0x */ + -1, /* 0011 -> RESERVED (10.0x)*/ + 135, /* 0100 -> 13.5x */ + 115, /* 0101 -> 11.5x */ + 125, /* 0110 -> 12.5x */ + 105, /* 0111 -> 10.5x */ + 130, /* 1000 -> 13.0x */ + 150, /* 1001 -> 15.0x */ + 160, /* 1010 -> 16.0x */ + 140, /* 1011 -> 14.0x */ + -1, /* 1100 -> RESERVED (12.0x) */ + 155, /* 1101 -> 15.5x */ + -1, /* 1110 -> RESERVED (13.0x) */ + 145, /* 1111 -> 14.5x */ +}; + +/* + * VIA C3 Nehemiah */ + +static const int __cpuinitdata nehemiah_mults[32] = { + 100, /* 0000 -> 10.0x */ + -1, /* 0001 -> 16.0x */ + 40, /* 0010 -> 4.0x */ + 90, /* 0011 -> 9.0x */ + 95, /* 0100 -> 9.5x */ + -1, /* 0101 -> RESERVED */ + 45, /* 0110 -> 4.5x */ + 55, /* 0111 -> 5.5x */ + 60, /* 1000 -> 6.0x */ + 70, /* 1001 -> 7.0x */ + 80, /* 1010 -> 8.0x */ + 50, /* 1011 -> 5.0x */ + 65, /* 1100 -> 6.5x */ + 75, /* 1101 -> 7.5x */ + 85, /* 1110 -> 8.5x */ + 120, /* 1111 -> 12.0x */ + -1, /* 0000 -> 10.0x */ + 110, /* 0001 -> 11.0x */ + -1, /* 0010 -> 12.0x */ + -1, /* 0011 -> 9.0x */ + 105, /* 0100 -> 10.5x */ + 115, /* 0101 -> 11.5x */ + 125, /* 0110 -> 12.5x */ + 135, /* 0111 -> 13.5x */ + 140, /* 1000 -> 14.0x */ + 150, /* 1001 -> 15.0x */ + 160, /* 1010 -> 16.0x */ + 130, /* 1011 -> 13.0x */ + 145, /* 1100 -> 14.5x */ + 155, /* 1101 -> 15.5x */ + -1, /* 1110 -> RESERVED (13.0x) */ + -1, /* 1111 -> 12.0x */ +}; + +static const int __cpuinitdata nehemiah_eblcr[32] = { + 50, /* 0000 -> 5.0x */ + 160, /* 0001 -> 16.0x */ + 40, /* 0010 -> 4.0x */ + 100, /* 0011 -> 10.0x */ + 55, /* 0100 -> 5.5x */ + -1, /* 0101 -> RESERVED */ + 45, /* 0110 -> 4.5x */ + 95, /* 0111 -> 9.5x */ + 90, /* 1000 -> 9.0x */ + 70, /* 1001 -> 7.0x */ + 80, /* 1010 -> 8.0x */ + 60, /* 1011 -> 6.0x */ + 120, /* 1100 -> 12.0x */ + 75, /* 1101 -> 7.5x */ + 85, /* 1110 -> 8.5x */ + 65, /* 1111 -> 6.5x */ + 90, /* 0000 -> 9.0x */ + 110, /* 0001 -> 11.0x */ + 120, /* 0010 -> 12.0x */ + 100, /* 0011 -> 10.0x */ + 135, /* 0100 -> 13.5x */ + 115, /* 0101 -> 11.5x */ + 125, /* 0110 -> 12.5x */ + 105, /* 0111 -> 10.5x */ + 130, /* 1000 -> 13.0x */ + 150, /* 1001 -> 15.0x */ + 160, /* 1010 -> 16.0x */ + 140, /* 1011 -> 14.0x */ + 120, /* 1100 -> 12.0x */ + 155, /* 1101 -> 15.5x */ + -1, /* 1110 -> RESERVED (13.0x) */ + 145 /* 1111 -> 14.5x */ +}; + +/* + * Voltage scales. Div/Mod by 1000 to get actual voltage. + * Which scale to use depends on the VRM type in use. + */ + +struct mV_pos { + unsigned short mV; + unsigned short pos; +}; + +static const struct mV_pos __cpuinitdata vrm85_mV[32] = { + {1250, 8}, {1200, 6}, {1150, 4}, {1100, 2}, + {1050, 0}, {1800, 30}, {1750, 28}, {1700, 26}, + {1650, 24}, {1600, 22}, {1550, 20}, {1500, 18}, + {1450, 16}, {1400, 14}, {1350, 12}, {1300, 10}, + {1275, 9}, {1225, 7}, {1175, 5}, {1125, 3}, + {1075, 1}, {1825, 31}, {1775, 29}, {1725, 27}, + {1675, 25}, {1625, 23}, {1575, 21}, {1525, 19}, + {1475, 17}, {1425, 15}, {1375, 13}, {1325, 11} +}; + +static const unsigned char __cpuinitdata mV_vrm85[32] = { + 0x04, 0x14, 0x03, 0x13, 0x02, 0x12, 0x01, 0x11, + 0x00, 0x10, 0x0f, 0x1f, 0x0e, 0x1e, 0x0d, 0x1d, + 0x0c, 0x1c, 0x0b, 0x1b, 0x0a, 0x1a, 0x09, 0x19, + 0x08, 0x18, 0x07, 0x17, 0x06, 0x16, 0x05, 0x15 +}; + +static const struct mV_pos __cpuinitdata mobilevrm_mV[32] = { + {1750, 31}, {1700, 30}, {1650, 29}, {1600, 28}, + {1550, 27}, {1500, 26}, {1450, 25}, {1400, 24}, + {1350, 23}, {1300, 22}, {1250, 21}, {1200, 20}, + {1150, 19}, {1100, 18}, {1050, 17}, {1000, 16}, + {975, 15}, {950, 14}, {925, 13}, {900, 12}, + {875, 11}, {850, 10}, {825, 9}, {800, 8}, + {775, 7}, {750, 6}, {725, 5}, {700, 4}, + {675, 3}, {650, 2}, {625, 1}, {600, 0} +}; + +static const unsigned char __cpuinitdata mV_mobilevrm[32] = { + 0x1f, 0x1e, 0x1d, 0x1c, 0x1b, 0x1a, 0x19, 0x18, + 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, + 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08, + 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00 +}; + diff --git a/drivers/cpufreq/longrun.c b/drivers/cpufreq/longrun.c new file mode 100644 index 0000000..34ea359 --- /dev/null +++ b/drivers/cpufreq/longrun.c @@ -0,0 +1,324 @@ +/* + * (C) 2002 - 2003 Dominik Brodowski + * + * Licensed under the terms of the GNU GPL License version 2. + * + * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous* + */ + +#include +#include +#include +#include +#include + +#include +#include + +static struct cpufreq_driver longrun_driver; + +/** + * longrun_{low,high}_freq is needed for the conversion of cpufreq kHz + * values into per cent values. In TMTA microcode, the following is valid: + * performance_pctg = (current_freq - low_freq)/(high_freq - low_freq) + */ +static unsigned int longrun_low_freq, longrun_high_freq; + + +/** + * longrun_get_policy - get the current LongRun policy + * @policy: struct cpufreq_policy where current policy is written into + * + * Reads the current LongRun policy by access to MSR_TMTA_LONGRUN_FLAGS + * and MSR_TMTA_LONGRUN_CTRL + */ +static void __cpuinit longrun_get_policy(struct cpufreq_policy *policy) +{ + u32 msr_lo, msr_hi; + + rdmsr(MSR_TMTA_LONGRUN_FLAGS, msr_lo, msr_hi); + pr_debug("longrun flags are %x - %x\n", msr_lo, msr_hi); + if (msr_lo & 0x01) + policy->policy = CPUFREQ_POLICY_PERFORMANCE; + else + policy->policy = CPUFREQ_POLICY_POWERSAVE; + + rdmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi); + pr_debug("longrun ctrl is %x - %x\n", msr_lo, msr_hi); + msr_lo &= 0x0000007F; + msr_hi &= 0x0000007F; + + if (longrun_high_freq <= longrun_low_freq) { + /* Assume degenerate Longrun table */ + policy->min = policy->max = longrun_high_freq; + } else { + policy->min = longrun_low_freq + msr_lo * + ((longrun_high_freq - longrun_low_freq) / 100); + policy->max = longrun_low_freq + msr_hi * + ((longrun_high_freq - longrun_low_freq) / 100); + } + policy->cpu = 0; +} + + +/** + * longrun_set_policy - sets a new CPUFreq policy + * @policy: new policy + * + * Sets a new CPUFreq policy on LongRun-capable processors. This function + * has to be called with cpufreq_driver locked. + */ +static int longrun_set_policy(struct cpufreq_policy *policy) +{ + u32 msr_lo, msr_hi; + u32 pctg_lo, pctg_hi; + + if (!policy) + return -EINVAL; + + if (longrun_high_freq <= longrun_low_freq) { + /* Assume degenerate Longrun table */ + pctg_lo = pctg_hi = 100; + } else { + pctg_lo = (policy->min - longrun_low_freq) / + ((longrun_high_freq - longrun_low_freq) / 100); + pctg_hi = (policy->max - longrun_low_freq) / + ((longrun_high_freq - longrun_low_freq) / 100); + } + + if (pctg_hi > 100) + pctg_hi = 100; + if (pctg_lo > pctg_hi) + pctg_lo = pctg_hi; + + /* performance or economy mode */ + rdmsr(MSR_TMTA_LONGRUN_FLAGS, msr_lo, msr_hi); + msr_lo &= 0xFFFFFFFE; + switch (policy->policy) { + case CPUFREQ_POLICY_PERFORMANCE: + msr_lo |= 0x00000001; + break; + case CPUFREQ_POLICY_POWERSAVE: + break; + } + wrmsr(MSR_TMTA_LONGRUN_FLAGS, msr_lo, msr_hi); + + /* lower and upper boundary */ + rdmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi); + msr_lo &= 0xFFFFFF80; + msr_hi &= 0xFFFFFF80; + msr_lo |= pctg_lo; + msr_hi |= pctg_hi; + wrmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi); + + return 0; +} + + +/** + * longrun_verify_poliy - verifies a new CPUFreq policy + * @policy: the policy to verify + * + * Validates a new CPUFreq policy. This function has to be called with + * cpufreq_driver locked. + */ +static int longrun_verify_policy(struct cpufreq_policy *policy) +{ + if (!policy) + return -EINVAL; + + policy->cpu = 0; + cpufreq_verify_within_limits(policy, + policy->cpuinfo.min_freq, + policy->cpuinfo.max_freq); + + if ((policy->policy != CPUFREQ_POLICY_POWERSAVE) && + (policy->policy != CPUFREQ_POLICY_PERFORMANCE)) + return -EINVAL; + + return 0; +} + +static unsigned int longrun_get(unsigned int cpu) +{ + u32 eax, ebx, ecx, edx; + + if (cpu) + return 0; + + cpuid(0x80860007, &eax, &ebx, &ecx, &edx); + pr_debug("cpuid eax is %u\n", eax); + + return eax * 1000; +} + +/** + * longrun_determine_freqs - determines the lowest and highest possible core frequency + * @low_freq: an int to put the lowest frequency into + * @high_freq: an int to put the highest frequency into + * + * Determines the lowest and highest possible core frequencies on this CPU. + * This is necessary to calculate the performance percentage according to + * TMTA rules: + * performance_pctg = (target_freq - low_freq)/(high_freq - low_freq) + */ +static int __cpuinit longrun_determine_freqs(unsigned int *low_freq, + unsigned int *high_freq) +{ + u32 msr_lo, msr_hi; + u32 save_lo, save_hi; + u32 eax, ebx, ecx, edx; + u32 try_hi; + struct cpuinfo_x86 *c = &cpu_data(0); + + if (!low_freq || !high_freq) + return -EINVAL; + + if (cpu_has(c, X86_FEATURE_LRTI)) { + /* if the LongRun Table Interface is present, the + * detection is a bit easier: + * For minimum frequency, read out the maximum + * level (msr_hi), write that into "currently + * selected level", and read out the frequency. + * For maximum frequency, read out level zero. + */ + /* minimum */ + rdmsr(MSR_TMTA_LRTI_READOUT, msr_lo, msr_hi); + wrmsr(MSR_TMTA_LRTI_READOUT, msr_hi, msr_hi); + rdmsr(MSR_TMTA_LRTI_VOLT_MHZ, msr_lo, msr_hi); + *low_freq = msr_lo * 1000; /* to kHz */ + + /* maximum */ + wrmsr(MSR_TMTA_LRTI_READOUT, 0, msr_hi); + rdmsr(MSR_TMTA_LRTI_VOLT_MHZ, msr_lo, msr_hi); + *high_freq = msr_lo * 1000; /* to kHz */ + + pr_debug("longrun table interface told %u - %u kHz\n", + *low_freq, *high_freq); + + if (*low_freq > *high_freq) + *low_freq = *high_freq; + return 0; + } + + /* set the upper border to the value determined during TSC init */ + *high_freq = (cpu_khz / 1000); + *high_freq = *high_freq * 1000; + pr_debug("high frequency is %u kHz\n", *high_freq); + + /* get current borders */ + rdmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi); + save_lo = msr_lo & 0x0000007F; + save_hi = msr_hi & 0x0000007F; + + /* if current perf_pctg is larger than 90%, we need to decrease the + * upper limit to make the calculation more accurate. + */ + cpuid(0x80860007, &eax, &ebx, &ecx, &edx); + /* try decreasing in 10% steps, some processors react only + * on some barrier values */ + for (try_hi = 80; try_hi > 0 && ecx > 90; try_hi -= 10) { + /* set to 0 to try_hi perf_pctg */ + msr_lo &= 0xFFFFFF80; + msr_hi &= 0xFFFFFF80; + msr_hi |= try_hi; + wrmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi); + + /* read out current core MHz and current perf_pctg */ + cpuid(0x80860007, &eax, &ebx, &ecx, &edx); + + /* restore values */ + wrmsr(MSR_TMTA_LONGRUN_CTRL, save_lo, save_hi); + } + pr_debug("percentage is %u %%, freq is %u MHz\n", ecx, eax); + + /* performance_pctg = (current_freq - low_freq)/(high_freq - low_freq) + * eqals + * low_freq * (1 - perf_pctg) = (cur_freq - high_freq * perf_pctg) + * + * high_freq * perf_pctg is stored tempoarily into "ebx". + */ + ebx = (((cpu_khz / 1000) * ecx) / 100); /* to MHz */ + + if ((ecx > 95) || (ecx == 0) || (eax < ebx)) + return -EIO; + + edx = ((eax - ebx) * 100) / (100 - ecx); + *low_freq = edx * 1000; /* back to kHz */ + + pr_debug("low frequency is %u kHz\n", *low_freq); + + if (*low_freq > *high_freq) + *low_freq = *high_freq; + + return 0; +} + + +static int __cpuinit longrun_cpu_init(struct cpufreq_policy *policy) +{ + int result = 0; + + /* capability check */ + if (policy->cpu != 0) + return -ENODEV; + + /* detect low and high frequency */ + result = longrun_determine_freqs(&longrun_low_freq, &longrun_high_freq); + if (result) + return result; + + /* cpuinfo and default policy values */ + policy->cpuinfo.min_freq = longrun_low_freq; + policy->cpuinfo.max_freq = longrun_high_freq; + policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL; + longrun_get_policy(policy); + + return 0; +} + + +static struct cpufreq_driver longrun_driver = { + .flags = CPUFREQ_CONST_LOOPS, + .verify = longrun_verify_policy, + .setpolicy = longrun_set_policy, + .get = longrun_get, + .init = longrun_cpu_init, + .name = "longrun", + .owner = THIS_MODULE, +}; + + +/** + * longrun_init - initializes the Transmeta Crusoe LongRun CPUFreq driver + * + * Initializes the LongRun support. + */ +static int __init longrun_init(void) +{ + struct cpuinfo_x86 *c = &cpu_data(0); + + if (c->x86_vendor != X86_VENDOR_TRANSMETA || + !cpu_has(c, X86_FEATURE_LONGRUN)) + return -ENODEV; + + return cpufreq_register_driver(&longrun_driver); +} + + +/** + * longrun_exit - unregisters LongRun support + */ +static void __exit longrun_exit(void) +{ + cpufreq_unregister_driver(&longrun_driver); +} + + +MODULE_AUTHOR("Dominik Brodowski "); +MODULE_DESCRIPTION("LongRun driver for Transmeta Crusoe and " + "Efficeon processors."); +MODULE_LICENSE("GPL"); + +module_init(longrun_init); +module_exit(longrun_exit); diff --git a/drivers/cpufreq/mperf.c b/drivers/cpufreq/mperf.c new file mode 100644 index 0000000..911e193 --- /dev/null +++ b/drivers/cpufreq/mperf.c @@ -0,0 +1,51 @@ +#include +#include +#include +#include +#include +#include + +#include "mperf.h" + +static DEFINE_PER_CPU(struct aperfmperf, acfreq_old_perf); + +/* Called via smp_call_function_single(), on the target CPU */ +static void read_measured_perf_ctrs(void *_cur) +{ + struct aperfmperf *am = _cur; + + get_aperfmperf(am); +} + +/* + * Return the measured active (C0) frequency on this CPU since last call + * to this function. + * Input: cpu number + * Return: Average CPU frequency in terms of max frequency (zero on error) + * + * We use IA32_MPERF and IA32_APERF MSRs to get the measured performance + * over a period of time, while CPU is in C0 state. + * IA32_MPERF counts at the rate of max advertised frequency + * IA32_APERF counts at the rate of actual CPU frequency + * Only IA32_APERF/IA32_MPERF ratio is architecturally defined and + * no meaning should be associated with absolute values of these MSRs. + */ +unsigned int cpufreq_get_measured_perf(struct cpufreq_policy *policy, + unsigned int cpu) +{ + struct aperfmperf perf; + unsigned long ratio; + unsigned int retval; + + if (smp_call_function_single(cpu, read_measured_perf_ctrs, &perf, 1)) + return 0; + + ratio = calc_aperfmperf_ratio(&per_cpu(acfreq_old_perf, cpu), &perf); + per_cpu(acfreq_old_perf, cpu) = perf; + + retval = (policy->cpuinfo.max_freq * ratio) >> APERFMPERF_SHIFT; + + return retval; +} +EXPORT_SYMBOL_GPL(cpufreq_get_measured_perf); +MODULE_LICENSE("GPL"); diff --git a/drivers/cpufreq/mperf.h b/drivers/cpufreq/mperf.h new file mode 100644 index 0000000..5dbf295 --- /dev/null +++ b/drivers/cpufreq/mperf.h @@ -0,0 +1,9 @@ +/* + * (c) 2010 Advanced Micro Devices, Inc. + * Your use of this code is subject to the terms and conditions of the + * GNU general public license version 2. See "COPYING" or + * http://www.gnu.org/licenses/gpl.html + */ + +unsigned int cpufreq_get_measured_perf(struct cpufreq_policy *policy, + unsigned int cpu); diff --git a/drivers/cpufreq/p4-clockmod.c b/drivers/cpufreq/p4-clockmod.c new file mode 100644 index 0000000..6be3e07 --- /dev/null +++ b/drivers/cpufreq/p4-clockmod.c @@ -0,0 +1,329 @@ +/* + * Pentium 4/Xeon CPU on demand clock modulation/speed scaling + * (C) 2002 - 2003 Dominik Brodowski + * (C) 2002 Zwane Mwaikambo + * (C) 2002 Arjan van de Ven + * (C) 2002 Tora T. Engstad + * All Rights Reserved + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * The author(s) of this software shall not be held liable for damages + * of any nature resulting due to the use of this software. This + * software is provided AS-IS with no warranties. + * + * Date Errata Description + * 20020525 N44, O17 12.5% or 25% DC causes lockup + * + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "speedstep-lib.h" + +#define PFX "p4-clockmod: " + +/* + * Duty Cycle (3bits), note DC_DISABLE is not specified in + * intel docs i just use it to mean disable + */ +enum { + DC_RESV, DC_DFLT, DC_25PT, DC_38PT, DC_50PT, + DC_64PT, DC_75PT, DC_88PT, DC_DISABLE +}; + +#define DC_ENTRIES 8 + + +static int has_N44_O17_errata[NR_CPUS]; +static unsigned int stock_freq; +static struct cpufreq_driver p4clockmod_driver; +static unsigned int cpufreq_p4_get(unsigned int cpu); + +static int cpufreq_p4_setdc(unsigned int cpu, unsigned int newstate) +{ + u32 l, h; + + if (!cpu_online(cpu) || + (newstate > DC_DISABLE) || (newstate == DC_RESV)) + return -EINVAL; + + rdmsr_on_cpu(cpu, MSR_IA32_THERM_STATUS, &l, &h); + + if (l & 0x01) + pr_debug("CPU#%d currently thermal throttled\n", cpu); + + if (has_N44_O17_errata[cpu] && + (newstate == DC_25PT || newstate == DC_DFLT)) + newstate = DC_38PT; + + rdmsr_on_cpu(cpu, MSR_IA32_THERM_CONTROL, &l, &h); + if (newstate == DC_DISABLE) { + pr_debug("CPU#%d disabling modulation\n", cpu); + wrmsr_on_cpu(cpu, MSR_IA32_THERM_CONTROL, l & ~(1<<4), h); + } else { + pr_debug("CPU#%d setting duty cycle to %d%%\n", + cpu, ((125 * newstate) / 10)); + /* bits 63 - 5 : reserved + * bit 4 : enable/disable + * bits 3-1 : duty cycle + * bit 0 : reserved + */ + l = (l & ~14); + l = l | (1<<4) | ((newstate & 0x7)<<1); + wrmsr_on_cpu(cpu, MSR_IA32_THERM_CONTROL, l, h); + } + + return 0; +} + + +static struct cpufreq_frequency_table p4clockmod_table[] = { + {DC_RESV, CPUFREQ_ENTRY_INVALID}, + {DC_DFLT, 0}, + {DC_25PT, 0}, + {DC_38PT, 0}, + {DC_50PT, 0}, + {DC_64PT, 0}, + {DC_75PT, 0}, + {DC_88PT, 0}, + {DC_DISABLE, 0}, + {DC_RESV, CPUFREQ_TABLE_END}, +}; + + +static int cpufreq_p4_target(struct cpufreq_policy *policy, + unsigned int target_freq, + unsigned int relation) +{ + unsigned int newstate = DC_RESV; + struct cpufreq_freqs freqs; + int i; + + if (cpufreq_frequency_table_target(policy, &p4clockmod_table[0], + target_freq, relation, &newstate)) + return -EINVAL; + + freqs.old = cpufreq_p4_get(policy->cpu); + freqs.new = stock_freq * p4clockmod_table[newstate].index / 8; + + if (freqs.new == freqs.old) + return 0; + + /* notifiers */ + for_each_cpu(i, policy->cpus) { + freqs.cpu = i; + cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); + } + + /* run on each logical CPU, + * see section 13.15.3 of IA32 Intel Architecture Software + * Developer's Manual, Volume 3 + */ + for_each_cpu(i, policy->cpus) + cpufreq_p4_setdc(i, p4clockmod_table[newstate].index); + + /* notifiers */ + for_each_cpu(i, policy->cpus) { + freqs.cpu = i; + cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); + } + + return 0; +} + + +static int cpufreq_p4_verify(struct cpufreq_policy *policy) +{ + return cpufreq_frequency_table_verify(policy, &p4clockmod_table[0]); +} + + +static unsigned int cpufreq_p4_get_frequency(struct cpuinfo_x86 *c) +{ + if (c->x86 == 0x06) { + if (cpu_has(c, X86_FEATURE_EST)) + printk_once(KERN_WARNING PFX "Warning: EST-capable " + "CPU detected. The acpi-cpufreq module offers " + "voltage scaling in addition to frequency " + "scaling. You should use that instead of " + "p4-clockmod, if possible.\n"); + switch (c->x86_model) { + case 0x0E: /* Core */ + case 0x0F: /* Core Duo */ + case 0x16: /* Celeron Core */ + case 0x1C: /* Atom */ + p4clockmod_driver.flags |= CPUFREQ_CONST_LOOPS; + return speedstep_get_frequency(SPEEDSTEP_CPU_PCORE); + case 0x0D: /* Pentium M (Dothan) */ + p4clockmod_driver.flags |= CPUFREQ_CONST_LOOPS; + /* fall through */ + case 0x09: /* Pentium M (Banias) */ + return speedstep_get_frequency(SPEEDSTEP_CPU_PM); + } + } + + if (c->x86 != 0xF) + return 0; + + /* on P-4s, the TSC runs with constant frequency independent whether + * throttling is active or not. */ + p4clockmod_driver.flags |= CPUFREQ_CONST_LOOPS; + + if (speedstep_detect_processor() == SPEEDSTEP_CPU_P4M) { + printk(KERN_WARNING PFX "Warning: Pentium 4-M detected. " + "The speedstep-ich or acpi cpufreq modules offer " + "voltage scaling in addition of frequency scaling. " + "You should use either one instead of p4-clockmod, " + "if possible.\n"); + return speedstep_get_frequency(SPEEDSTEP_CPU_P4M); + } + + return speedstep_get_frequency(SPEEDSTEP_CPU_P4D); +} + + + +static int cpufreq_p4_cpu_init(struct cpufreq_policy *policy) +{ + struct cpuinfo_x86 *c = &cpu_data(policy->cpu); + int cpuid = 0; + unsigned int i; + +#ifdef CONFIG_SMP + cpumask_copy(policy->cpus, cpu_sibling_mask(policy->cpu)); +#endif + + /* Errata workaround */ + cpuid = (c->x86 << 8) | (c->x86_model << 4) | c->x86_mask; + switch (cpuid) { + case 0x0f07: + case 0x0f0a: + case 0x0f11: + case 0x0f12: + has_N44_O17_errata[policy->cpu] = 1; + pr_debug("has errata -- disabling low frequencies\n"); + } + + if (speedstep_detect_processor() == SPEEDSTEP_CPU_P4D && + c->x86_model < 2) { + /* switch to maximum frequency and measure result */ + cpufreq_p4_setdc(policy->cpu, DC_DISABLE); + recalibrate_cpu_khz(); + } + /* get max frequency */ + stock_freq = cpufreq_p4_get_frequency(c); + if (!stock_freq) + return -EINVAL; + + /* table init */ + for (i = 1; (p4clockmod_table[i].frequency != CPUFREQ_TABLE_END); i++) { + if ((i < 2) && (has_N44_O17_errata[policy->cpu])) + p4clockmod_table[i].frequency = CPUFREQ_ENTRY_INVALID; + else + p4clockmod_table[i].frequency = (stock_freq * i)/8; + } + cpufreq_frequency_table_get_attr(p4clockmod_table, policy->cpu); + + /* cpuinfo and default policy values */ + + /* the transition latency is set to be 1 higher than the maximum + * transition latency of the ondemand governor */ + policy->cpuinfo.transition_latency = 10000001; + policy->cur = stock_freq; + + return cpufreq_frequency_table_cpuinfo(policy, &p4clockmod_table[0]); +} + + +static int cpufreq_p4_cpu_exit(struct cpufreq_policy *policy) +{ + cpufreq_frequency_table_put_attr(policy->cpu); + return 0; +} + +static unsigned int cpufreq_p4_get(unsigned int cpu) +{ + u32 l, h; + + rdmsr_on_cpu(cpu, MSR_IA32_THERM_CONTROL, &l, &h); + + if (l & 0x10) { + l = l >> 1; + l &= 0x7; + } else + l = DC_DISABLE; + + if (l != DC_DISABLE) + return stock_freq * l / 8; + + return stock_freq; +} + +static struct freq_attr *p4clockmod_attr[] = { + &cpufreq_freq_attr_scaling_available_freqs, + NULL, +}; + +static struct cpufreq_driver p4clockmod_driver = { + .verify = cpufreq_p4_verify, + .target = cpufreq_p4_target, + .init = cpufreq_p4_cpu_init, + .exit = cpufreq_p4_cpu_exit, + .get = cpufreq_p4_get, + .name = "p4-clockmod", + .owner = THIS_MODULE, + .attr = p4clockmod_attr, +}; + + +static int __init cpufreq_p4_init(void) +{ + struct cpuinfo_x86 *c = &cpu_data(0); + int ret; + + /* + * THERM_CONTROL is architectural for IA32 now, so + * we can rely on the capability checks + */ + if (c->x86_vendor != X86_VENDOR_INTEL) + return -ENODEV; + + if (!test_cpu_cap(c, X86_FEATURE_ACPI) || + !test_cpu_cap(c, X86_FEATURE_ACC)) + return -ENODEV; + + ret = cpufreq_register_driver(&p4clockmod_driver); + if (!ret) + printk(KERN_INFO PFX "P4/Xeon(TM) CPU On-Demand Clock " + "Modulation available\n"); + + return ret; +} + + +static void __exit cpufreq_p4_exit(void) +{ + cpufreq_unregister_driver(&p4clockmod_driver); +} + + +MODULE_AUTHOR("Zwane Mwaikambo "); +MODULE_DESCRIPTION("cpufreq driver for Pentium(TM) 4/Xeon(TM)"); +MODULE_LICENSE("GPL"); + +late_initcall(cpufreq_p4_init); +module_exit(cpufreq_p4_exit); diff --git a/drivers/cpufreq/pcc-cpufreq.c b/drivers/cpufreq/pcc-cpufreq.c new file mode 100644 index 0000000..7b0603e --- /dev/null +++ b/drivers/cpufreq/pcc-cpufreq.c @@ -0,0 +1,621 @@ +/* + * pcc-cpufreq.c - Processor Clocking Control firmware cpufreq interface + * + * Copyright (C) 2009 Red Hat, Matthew Garrett + * Copyright (C) 2009 Hewlett-Packard Development Company, L.P. + * Nagananda Chumbalkar + * + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or NON + * INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 675 Mass Ave, Cambridge, MA 02139, USA. + * + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +#define PCC_VERSION "1.10.00" +#define POLL_LOOPS 300 + +#define CMD_COMPLETE 0x1 +#define CMD_GET_FREQ 0x0 +#define CMD_SET_FREQ 0x1 + +#define BUF_SZ 4 + +struct pcc_register_resource { + u8 descriptor; + u16 length; + u8 space_id; + u8 bit_width; + u8 bit_offset; + u8 access_size; + u64 address; +} __attribute__ ((packed)); + +struct pcc_memory_resource { + u8 descriptor; + u16 length; + u8 space_id; + u8 resource_usage; + u8 type_specific; + u64 granularity; + u64 minimum; + u64 maximum; + u64 translation_offset; + u64 address_length; +} __attribute__ ((packed)); + +static struct cpufreq_driver pcc_cpufreq_driver; + +struct pcc_header { + u32 signature; + u16 length; + u8 major; + u8 minor; + u32 features; + u16 command; + u16 status; + u32 latency; + u32 minimum_time; + u32 maximum_time; + u32 nominal; + u32 throttled_frequency; + u32 minimum_frequency; +}; + +static void __iomem *pcch_virt_addr; +static struct pcc_header __iomem *pcch_hdr; + +static DEFINE_SPINLOCK(pcc_lock); + +static struct acpi_generic_address doorbell; + +static u64 doorbell_preserve; +static u64 doorbell_write; + +static u8 OSC_UUID[16] = {0x9F, 0x2C, 0x9B, 0x63, 0x91, 0x70, 0x1f, 0x49, + 0xBB, 0x4F, 0xA5, 0x98, 0x2F, 0xA1, 0xB5, 0x46}; + +struct pcc_cpu { + u32 input_offset; + u32 output_offset; +}; + +static struct pcc_cpu __percpu *pcc_cpu_info; + +static int pcc_cpufreq_verify(struct cpufreq_policy *policy) +{ + cpufreq_verify_within_limits(policy, policy->cpuinfo.min_freq, + policy->cpuinfo.max_freq); + return 0; +} + +static inline void pcc_cmd(void) +{ + u64 doorbell_value; + int i; + + acpi_read(&doorbell_value, &doorbell); + acpi_write((doorbell_value & doorbell_preserve) | doorbell_write, + &doorbell); + + for (i = 0; i < POLL_LOOPS; i++) { + if (ioread16(&pcch_hdr->status) & CMD_COMPLETE) + break; + } +} + +static inline void pcc_clear_mapping(void) +{ + if (pcch_virt_addr) + iounmap(pcch_virt_addr); + pcch_virt_addr = NULL; +} + +static unsigned int pcc_get_freq(unsigned int cpu) +{ + struct pcc_cpu *pcc_cpu_data; + unsigned int curr_freq; + unsigned int freq_limit; + u16 status; + u32 input_buffer; + u32 output_buffer; + + spin_lock(&pcc_lock); + + pr_debug("get: get_freq for CPU %d\n", cpu); + pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu); + + input_buffer = 0x1; + iowrite32(input_buffer, + (pcch_virt_addr + pcc_cpu_data->input_offset)); + iowrite16(CMD_GET_FREQ, &pcch_hdr->command); + + pcc_cmd(); + + output_buffer = + ioread32(pcch_virt_addr + pcc_cpu_data->output_offset); + + /* Clear the input buffer - we are done with the current command */ + memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ); + + status = ioread16(&pcch_hdr->status); + if (status != CMD_COMPLETE) { + pr_debug("get: FAILED: for CPU %d, status is %d\n", + cpu, status); + goto cmd_incomplete; + } + iowrite16(0, &pcch_hdr->status); + curr_freq = (((ioread32(&pcch_hdr->nominal) * (output_buffer & 0xff)) + / 100) * 1000); + + pr_debug("get: SUCCESS: (virtual) output_offset for cpu %d is " + "0x%p, contains a value of: 0x%x. Speed is: %d MHz\n", + cpu, (pcch_virt_addr + pcc_cpu_data->output_offset), + output_buffer, curr_freq); + + freq_limit = (output_buffer >> 8) & 0xff; + if (freq_limit != 0xff) { + pr_debug("get: frequency for cpu %d is being temporarily" + " capped at %d\n", cpu, curr_freq); + } + + spin_unlock(&pcc_lock); + return curr_freq; + +cmd_incomplete: + iowrite16(0, &pcch_hdr->status); + spin_unlock(&pcc_lock); + return 0; +} + +static int pcc_cpufreq_target(struct cpufreq_policy *policy, + unsigned int target_freq, + unsigned int relation) +{ + struct pcc_cpu *pcc_cpu_data; + struct cpufreq_freqs freqs; + u16 status; + u32 input_buffer; + int cpu; + + spin_lock(&pcc_lock); + cpu = policy->cpu; + pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu); + + pr_debug("target: CPU %d should go to target freq: %d " + "(virtual) input_offset is 0x%p\n", + cpu, target_freq, + (pcch_virt_addr + pcc_cpu_data->input_offset)); + + freqs.new = target_freq; + freqs.cpu = cpu; + cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); + + input_buffer = 0x1 | (((target_freq * 100) + / (ioread32(&pcch_hdr->nominal) * 1000)) << 8); + iowrite32(input_buffer, + (pcch_virt_addr + pcc_cpu_data->input_offset)); + iowrite16(CMD_SET_FREQ, &pcch_hdr->command); + + pcc_cmd(); + + /* Clear the input buffer - we are done with the current command */ + memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ); + + status = ioread16(&pcch_hdr->status); + if (status != CMD_COMPLETE) { + pr_debug("target: FAILED for cpu %d, with status: 0x%x\n", + cpu, status); + goto cmd_incomplete; + } + iowrite16(0, &pcch_hdr->status); + + cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); + pr_debug("target: was SUCCESSFUL for cpu %d\n", cpu); + spin_unlock(&pcc_lock); + + return 0; + +cmd_incomplete: + iowrite16(0, &pcch_hdr->status); + spin_unlock(&pcc_lock); + return -EINVAL; +} + +static int pcc_get_offset(int cpu) +{ + acpi_status status; + struct acpi_buffer buffer = {ACPI_ALLOCATE_BUFFER, NULL}; + union acpi_object *pccp, *offset; + struct pcc_cpu *pcc_cpu_data; + struct acpi_processor *pr; + int ret = 0; + + pr = per_cpu(processors, cpu); + pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu); + + status = acpi_evaluate_object(pr->handle, "PCCP", NULL, &buffer); + if (ACPI_FAILURE(status)) + return -ENODEV; + + pccp = buffer.pointer; + if (!pccp || pccp->type != ACPI_TYPE_PACKAGE) { + ret = -ENODEV; + goto out_free; + }; + + offset = &(pccp->package.elements[0]); + if (!offset || offset->type != ACPI_TYPE_INTEGER) { + ret = -ENODEV; + goto out_free; + } + + pcc_cpu_data->input_offset = offset->integer.value; + + offset = &(pccp->package.elements[1]); + if (!offset || offset->type != ACPI_TYPE_INTEGER) { + ret = -ENODEV; + goto out_free; + } + + pcc_cpu_data->output_offset = offset->integer.value; + + memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ); + memset_io((pcch_virt_addr + pcc_cpu_data->output_offset), 0, BUF_SZ); + + pr_debug("pcc_get_offset: for CPU %d: pcc_cpu_data " + "input_offset: 0x%x, pcc_cpu_data output_offset: 0x%x\n", + cpu, pcc_cpu_data->input_offset, pcc_cpu_data->output_offset); +out_free: + kfree(buffer.pointer); + return ret; +} + +static int __init pcc_cpufreq_do_osc(acpi_handle *handle) +{ + acpi_status status; + struct acpi_object_list input; + struct acpi_buffer output = {ACPI_ALLOCATE_BUFFER, NULL}; + union acpi_object in_params[4]; + union acpi_object *out_obj; + u32 capabilities[2]; + u32 errors; + u32 supported; + int ret = 0; + + input.count = 4; + input.pointer = in_params; + in_params[0].type = ACPI_TYPE_BUFFER; + in_params[0].buffer.length = 16; + in_params[0].buffer.pointer = OSC_UUID; + in_params[1].type = ACPI_TYPE_INTEGER; + in_params[1].integer.value = 1; + in_params[2].type = ACPI_TYPE_INTEGER; + in_params[2].integer.value = 2; + in_params[3].type = ACPI_TYPE_BUFFER; + in_params[3].buffer.length = 8; + in_params[3].buffer.pointer = (u8 *)&capabilities; + + capabilities[0] = OSC_QUERY_ENABLE; + capabilities[1] = 0x1; + + status = acpi_evaluate_object(*handle, "_OSC", &input, &output); + if (ACPI_FAILURE(status)) + return -ENODEV; + + if (!output.length) + return -ENODEV; + + out_obj = output.pointer; + if (out_obj->type != ACPI_TYPE_BUFFER) { + ret = -ENODEV; + goto out_free; + } + + errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0); + if (errors) { + ret = -ENODEV; + goto out_free; + } + + supported = *((u32 *)(out_obj->buffer.pointer + 4)); + if (!(supported & 0x1)) { + ret = -ENODEV; + goto out_free; + } + + kfree(output.pointer); + capabilities[0] = 0x0; + capabilities[1] = 0x1; + + status = acpi_evaluate_object(*handle, "_OSC", &input, &output); + if (ACPI_FAILURE(status)) + return -ENODEV; + + if (!output.length) + return -ENODEV; + + out_obj = output.pointer; + if (out_obj->type != ACPI_TYPE_BUFFER) { + ret = -ENODEV; + goto out_free; + } + + errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0); + if (errors) { + ret = -ENODEV; + goto out_free; + } + + supported = *((u32 *)(out_obj->buffer.pointer + 4)); + if (!(supported & 0x1)) { + ret = -ENODEV; + goto out_free; + } + +out_free: + kfree(output.pointer); + return ret; +} + +static int __init pcc_cpufreq_probe(void) +{ + acpi_status status; + struct acpi_buffer output = {ACPI_ALLOCATE_BUFFER, NULL}; + struct pcc_memory_resource *mem_resource; + struct pcc_register_resource *reg_resource; + union acpi_object *out_obj, *member; + acpi_handle handle, osc_handle, pcch_handle; + int ret = 0; + + status = acpi_get_handle(NULL, "\\_SB", &handle); + if (ACPI_FAILURE(status)) + return -ENODEV; + + status = acpi_get_handle(handle, "PCCH", &pcch_handle); + if (ACPI_FAILURE(status)) + return -ENODEV; + + status = acpi_get_handle(handle, "_OSC", &osc_handle); + if (ACPI_SUCCESS(status)) { + ret = pcc_cpufreq_do_osc(&osc_handle); + if (ret) + pr_debug("probe: _OSC evaluation did not succeed\n"); + /* Firmware's use of _OSC is optional */ + ret = 0; + } + + status = acpi_evaluate_object(handle, "PCCH", NULL, &output); + if (ACPI_FAILURE(status)) + return -ENODEV; + + out_obj = output.pointer; + if (out_obj->type != ACPI_TYPE_PACKAGE) { + ret = -ENODEV; + goto out_free; + } + + member = &out_obj->package.elements[0]; + if (member->type != ACPI_TYPE_BUFFER) { + ret = -ENODEV; + goto out_free; + } + + mem_resource = (struct pcc_memory_resource *)member->buffer.pointer; + + pr_debug("probe: mem_resource descriptor: 0x%x," + " length: %d, space_id: %d, resource_usage: %d," + " type_specific: %d, granularity: 0x%llx," + " minimum: 0x%llx, maximum: 0x%llx," + " translation_offset: 0x%llx, address_length: 0x%llx\n", + mem_resource->descriptor, mem_resource->length, + mem_resource->space_id, mem_resource->resource_usage, + mem_resource->type_specific, mem_resource->granularity, + mem_resource->minimum, mem_resource->maximum, + mem_resource->translation_offset, + mem_resource->address_length); + + if (mem_resource->space_id != ACPI_ADR_SPACE_SYSTEM_MEMORY) { + ret = -ENODEV; + goto out_free; + } + + pcch_virt_addr = ioremap_nocache(mem_resource->minimum, + mem_resource->address_length); + if (pcch_virt_addr == NULL) { + pr_debug("probe: could not map shared mem region\n"); + goto out_free; + } + pcch_hdr = pcch_virt_addr; + + pr_debug("probe: PCCH header (virtual) addr: 0x%p\n", pcch_hdr); + pr_debug("probe: PCCH header is at physical address: 0x%llx," + " signature: 0x%x, length: %d bytes, major: %d, minor: %d," + " supported features: 0x%x, command field: 0x%x," + " status field: 0x%x, nominal latency: %d us\n", + mem_resource->minimum, ioread32(&pcch_hdr->signature), + ioread16(&pcch_hdr->length), ioread8(&pcch_hdr->major), + ioread8(&pcch_hdr->minor), ioread32(&pcch_hdr->features), + ioread16(&pcch_hdr->command), ioread16(&pcch_hdr->status), + ioread32(&pcch_hdr->latency)); + + pr_debug("probe: min time between commands: %d us," + " max time between commands: %d us," + " nominal CPU frequency: %d MHz," + " minimum CPU frequency: %d MHz," + " minimum CPU frequency without throttling: %d MHz\n", + ioread32(&pcch_hdr->minimum_time), + ioread32(&pcch_hdr->maximum_time), + ioread32(&pcch_hdr->nominal), + ioread32(&pcch_hdr->throttled_frequency), + ioread32(&pcch_hdr->minimum_frequency)); + + member = &out_obj->package.elements[1]; + if (member->type != ACPI_TYPE_BUFFER) { + ret = -ENODEV; + goto pcch_free; + } + + reg_resource = (struct pcc_register_resource *)member->buffer.pointer; + + doorbell.space_id = reg_resource->space_id; + doorbell.bit_width = reg_resource->bit_width; + doorbell.bit_offset = reg_resource->bit_offset; + doorbell.access_width = 64; + doorbell.address = reg_resource->address; + + pr_debug("probe: doorbell: space_id is %d, bit_width is %d, " + "bit_offset is %d, access_width is %d, address is 0x%llx\n", + doorbell.space_id, doorbell.bit_width, doorbell.bit_offset, + doorbell.access_width, reg_resource->address); + + member = &out_obj->package.elements[2]; + if (member->type != ACPI_TYPE_INTEGER) { + ret = -ENODEV; + goto pcch_free; + } + + doorbell_preserve = member->integer.value; + + member = &out_obj->package.elements[3]; + if (member->type != ACPI_TYPE_INTEGER) { + ret = -ENODEV; + goto pcch_free; + } + + doorbell_write = member->integer.value; + + pr_debug("probe: doorbell_preserve: 0x%llx," + " doorbell_write: 0x%llx\n", + doorbell_preserve, doorbell_write); + + pcc_cpu_info = alloc_percpu(struct pcc_cpu); + if (!pcc_cpu_info) { + ret = -ENOMEM; + goto pcch_free; + } + + printk(KERN_DEBUG "pcc-cpufreq: (v%s) driver loaded with frequency" + " limits: %d MHz, %d MHz\n", PCC_VERSION, + ioread32(&pcch_hdr->minimum_frequency), + ioread32(&pcch_hdr->nominal)); + kfree(output.pointer); + return ret; +pcch_free: + pcc_clear_mapping(); +out_free: + kfree(output.pointer); + return ret; +} + +static int pcc_cpufreq_cpu_init(struct cpufreq_policy *policy) +{ + unsigned int cpu = policy->cpu; + unsigned int result = 0; + + if (!pcch_virt_addr) { + result = -1; + goto out; + } + + result = pcc_get_offset(cpu); + if (result) { + pr_debug("init: PCCP evaluation failed\n"); + goto out; + } + + policy->max = policy->cpuinfo.max_freq = + ioread32(&pcch_hdr->nominal) * 1000; + policy->min = policy->cpuinfo.min_freq = + ioread32(&pcch_hdr->minimum_frequency) * 1000; + policy->cur = pcc_get_freq(cpu); + + if (!policy->cur) { + pr_debug("init: Unable to get current CPU frequency\n"); + result = -EINVAL; + goto out; + } + + pr_debug("init: policy->max is %d, policy->min is %d\n", + policy->max, policy->min); +out: + return result; +} + +static int pcc_cpufreq_cpu_exit(struct cpufreq_policy *policy) +{ + return 0; +} + +static struct cpufreq_driver pcc_cpufreq_driver = { + .flags = CPUFREQ_CONST_LOOPS, + .get = pcc_get_freq, + .verify = pcc_cpufreq_verify, + .target = pcc_cpufreq_target, + .init = pcc_cpufreq_cpu_init, + .exit = pcc_cpufreq_cpu_exit, + .name = "pcc-cpufreq", + .owner = THIS_MODULE, +}; + +static int __init pcc_cpufreq_init(void) +{ + int ret; + + if (acpi_disabled) + return 0; + + ret = pcc_cpufreq_probe(); + if (ret) { + pr_debug("pcc_cpufreq_init: PCCH evaluation failed\n"); + return ret; + } + + ret = cpufreq_register_driver(&pcc_cpufreq_driver); + + return ret; +} + +static void __exit pcc_cpufreq_exit(void) +{ + cpufreq_unregister_driver(&pcc_cpufreq_driver); + + pcc_clear_mapping(); + + free_percpu(pcc_cpu_info); +} + +MODULE_AUTHOR("Matthew Garrett, Naga Chumbalkar"); +MODULE_VERSION(PCC_VERSION); +MODULE_DESCRIPTION("Processor Clocking Control interface driver"); +MODULE_LICENSE("GPL"); + +late_initcall(pcc_cpufreq_init); +module_exit(pcc_cpufreq_exit); diff --git a/drivers/cpufreq/powernow-k6.c b/drivers/cpufreq/powernow-k6.c new file mode 100644 index 0000000..b3379d6 --- /dev/null +++ b/drivers/cpufreq/powernow-k6.c @@ -0,0 +1,261 @@ +/* + * This file was based upon code in Powertweak Linux (http://powertweak.sf.net) + * (C) 2000-2003 Dave Jones, Arjan van de Ven, Janne Pänkälä, + * Dominik Brodowski. + * + * Licensed under the terms of the GNU GPL License version 2. + * + * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous* + */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +#define POWERNOW_IOPORT 0xfff0 /* it doesn't matter where, as long + as it is unused */ + +#define PFX "powernow-k6: " +static unsigned int busfreq; /* FSB, in 10 kHz */ +static unsigned int max_multiplier; + + +/* Clock ratio multiplied by 10 - see table 27 in AMD#23446 */ +static struct cpufreq_frequency_table clock_ratio[] = { + {45, /* 000 -> 4.5x */ 0}, + {50, /* 001 -> 5.0x */ 0}, + {40, /* 010 -> 4.0x */ 0}, + {55, /* 011 -> 5.5x */ 0}, + {20, /* 100 -> 2.0x */ 0}, + {30, /* 101 -> 3.0x */ 0}, + {60, /* 110 -> 6.0x */ 0}, + {35, /* 111 -> 3.5x */ 0}, + {0, CPUFREQ_TABLE_END} +}; + + +/** + * powernow_k6_get_cpu_multiplier - returns the current FSB multiplier + * + * Returns the current setting of the frequency multiplier. Core clock + * speed is frequency of the Front-Side Bus multiplied with this value. + */ +static int powernow_k6_get_cpu_multiplier(void) +{ + u64 invalue = 0; + u32 msrval; + + msrval = POWERNOW_IOPORT + 0x1; + wrmsr(MSR_K6_EPMR, msrval, 0); /* enable the PowerNow port */ + invalue = inl(POWERNOW_IOPORT + 0x8); + msrval = POWERNOW_IOPORT + 0x0; + wrmsr(MSR_K6_EPMR, msrval, 0); /* disable it again */ + + return clock_ratio[(invalue >> 5)&7].index; +} + + +/** + * powernow_k6_set_state - set the PowerNow! multiplier + * @best_i: clock_ratio[best_i] is the target multiplier + * + * Tries to change the PowerNow! multiplier + */ +static void powernow_k6_set_state(unsigned int best_i) +{ + unsigned long outvalue = 0, invalue = 0; + unsigned long msrval; + struct cpufreq_freqs freqs; + + if (clock_ratio[best_i].index > max_multiplier) { + printk(KERN_ERR PFX "invalid target frequency\n"); + return; + } + + freqs.old = busfreq * powernow_k6_get_cpu_multiplier(); + freqs.new = busfreq * clock_ratio[best_i].index; + freqs.cpu = 0; /* powernow-k6.c is UP only driver */ + + cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); + + /* we now need to transform best_i to the BVC format, see AMD#23446 */ + + outvalue = (1<<12) | (1<<10) | (1<<9) | (best_i<<5); + + msrval = POWERNOW_IOPORT + 0x1; + wrmsr(MSR_K6_EPMR, msrval, 0); /* enable the PowerNow port */ + invalue = inl(POWERNOW_IOPORT + 0x8); + invalue = invalue & 0xf; + outvalue = outvalue | invalue; + outl(outvalue , (POWERNOW_IOPORT + 0x8)); + msrval = POWERNOW_IOPORT + 0x0; + wrmsr(MSR_K6_EPMR, msrval, 0); /* disable it again */ + + cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); + + return; +} + + +/** + * powernow_k6_verify - verifies a new CPUfreq policy + * @policy: new policy + * + * Policy must be within lowest and highest possible CPU Frequency, + * and at least one possible state must be within min and max. + */ +static int powernow_k6_verify(struct cpufreq_policy *policy) +{ + return cpufreq_frequency_table_verify(policy, &clock_ratio[0]); +} + + +/** + * powernow_k6_setpolicy - sets a new CPUFreq policy + * @policy: new policy + * @target_freq: the target frequency + * @relation: how that frequency relates to achieved frequency + * (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H) + * + * sets a new CPUFreq policy + */ +static int powernow_k6_target(struct cpufreq_policy *policy, + unsigned int target_freq, + unsigned int relation) +{ + unsigned int newstate = 0; + + if (cpufreq_frequency_table_target(policy, &clock_ratio[0], + target_freq, relation, &newstate)) + return -EINVAL; + + powernow_k6_set_state(newstate); + + return 0; +} + + +static int powernow_k6_cpu_init(struct cpufreq_policy *policy) +{ + unsigned int i, f; + int result; + + if (policy->cpu != 0) + return -ENODEV; + + /* get frequencies */ + max_multiplier = powernow_k6_get_cpu_multiplier(); + busfreq = cpu_khz / max_multiplier; + + /* table init */ + for (i = 0; (clock_ratio[i].frequency != CPUFREQ_TABLE_END); i++) { + f = clock_ratio[i].index; + if (f > max_multiplier) + clock_ratio[i].frequency = CPUFREQ_ENTRY_INVALID; + else + clock_ratio[i].frequency = busfreq * f; + } + + /* cpuinfo and default policy values */ + policy->cpuinfo.transition_latency = 200000; + policy->cur = busfreq * max_multiplier; + + result = cpufreq_frequency_table_cpuinfo(policy, clock_ratio); + if (result) + return result; + + cpufreq_frequency_table_get_attr(clock_ratio, policy->cpu); + + return 0; +} + + +static int powernow_k6_cpu_exit(struct cpufreq_policy *policy) +{ + unsigned int i; + for (i = 0; i < 8; i++) { + if (i == max_multiplier) + powernow_k6_set_state(i); + } + cpufreq_frequency_table_put_attr(policy->cpu); + return 0; +} + +static unsigned int powernow_k6_get(unsigned int cpu) +{ + unsigned int ret; + ret = (busfreq * powernow_k6_get_cpu_multiplier()); + return ret; +} + +static struct freq_attr *powernow_k6_attr[] = { + &cpufreq_freq_attr_scaling_available_freqs, + NULL, +}; + +static struct cpufreq_driver powernow_k6_driver = { + .verify = powernow_k6_verify, + .target = powernow_k6_target, + .init = powernow_k6_cpu_init, + .exit = powernow_k6_cpu_exit, + .get = powernow_k6_get, + .name = "powernow-k6", + .owner = THIS_MODULE, + .attr = powernow_k6_attr, +}; + + +/** + * powernow_k6_init - initializes the k6 PowerNow! CPUFreq driver + * + * Initializes the K6 PowerNow! support. Returns -ENODEV on unsupported + * devices, -EINVAL or -ENOMEM on problems during initiatization, and zero + * on success. + */ +static int __init powernow_k6_init(void) +{ + struct cpuinfo_x86 *c = &cpu_data(0); + + if ((c->x86_vendor != X86_VENDOR_AMD) || (c->x86 != 5) || + ((c->x86_model != 12) && (c->x86_model != 13))) + return -ENODEV; + + if (!request_region(POWERNOW_IOPORT, 16, "PowerNow!")) { + printk(KERN_INFO PFX "PowerNow IOPORT region already used.\n"); + return -EIO; + } + + if (cpufreq_register_driver(&powernow_k6_driver)) { + release_region(POWERNOW_IOPORT, 16); + return -EINVAL; + } + + return 0; +} + + +/** + * powernow_k6_exit - unregisters AMD K6-2+/3+ PowerNow! support + * + * Unregisters AMD K6-2+ / K6-3+ PowerNow! support. + */ +static void __exit powernow_k6_exit(void) +{ + cpufreq_unregister_driver(&powernow_k6_driver); + release_region(POWERNOW_IOPORT, 16); +} + + +MODULE_AUTHOR("Arjan van de Ven, Dave Jones , " + "Dominik Brodowski "); +MODULE_DESCRIPTION("PowerNow! driver for AMD K6-2+ / K6-3+ processors."); +MODULE_LICENSE("GPL"); + +module_init(powernow_k6_init); +module_exit(powernow_k6_exit); diff --git a/drivers/cpufreq/powernow-k7.c b/drivers/cpufreq/powernow-k7.c new file mode 100644 index 0000000..d71d9f3 --- /dev/null +++ b/drivers/cpufreq/powernow-k7.c @@ -0,0 +1,747 @@ +/* + * AMD K7 Powernow driver. + * (C) 2003 Dave Jones on behalf of SuSE Labs. + * (C) 2003-2004 Dave Jones + * + * Licensed under the terms of the GNU GPL License version 2. + * Based upon datasheets & sample CPUs kindly provided by AMD. + * + * Errata 5: + * CPU may fail to execute a FID/VID change in presence of interrupt. + * - We cli/sti on stepping A0 CPUs around the FID/VID transition. + * Errata 15: + * CPU with half frequency multipliers may hang upon wakeup from disconnect. + * - We disable half multipliers if ACPI is used on A0 stepping CPUs. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include /* Needed for recalibrate_cpu_khz() */ +#include +#include + +#ifdef CONFIG_X86_POWERNOW_K7_ACPI +#include +#include +#endif + +#include "powernow-k7.h" + +#define PFX "powernow: " + + +struct psb_s { + u8 signature[10]; + u8 tableversion; + u8 flags; + u16 settlingtime; + u8 reserved1; + u8 numpst; +}; + +struct pst_s { + u32 cpuid; + u8 fsbspeed; + u8 maxfid; + u8 startvid; + u8 numpstates; +}; + +#ifdef CONFIG_X86_POWERNOW_K7_ACPI +union powernow_acpi_control_t { + struct { + unsigned long fid:5, + vid:5, + sgtc:20, + res1:2; + } bits; + unsigned long val; +}; +#endif + +/* divide by 1000 to get VCore voltage in V. */ +static const int mobile_vid_table[32] = { + 2000, 1950, 1900, 1850, 1800, 1750, 1700, 1650, + 1600, 1550, 1500, 1450, 1400, 1350, 1300, 0, + 1275, 1250, 1225, 1200, 1175, 1150, 1125, 1100, + 1075, 1050, 1025, 1000, 975, 950, 925, 0, +}; + +/* divide by 10 to get FID. */ +static const int fid_codes[32] = { + 110, 115, 120, 125, 50, 55, 60, 65, + 70, 75, 80, 85, 90, 95, 100, 105, + 30, 190, 40, 200, 130, 135, 140, 210, + 150, 225, 160, 165, 170, 180, -1, -1, +}; + +/* This parameter is used in order to force ACPI instead of legacy method for + * configuration purpose. + */ + +static int acpi_force; + +static struct cpufreq_frequency_table *powernow_table; + +static unsigned int can_scale_bus; +static unsigned int can_scale_vid; +static unsigned int minimum_speed = -1; +static unsigned int maximum_speed; +static unsigned int number_scales; +static unsigned int fsb; +static unsigned int latency; +static char have_a0; + +static int check_fsb(unsigned int fsbspeed) +{ + int delta; + unsigned int f = fsb / 1000; + + delta = (fsbspeed > f) ? fsbspeed - f : f - fsbspeed; + return delta < 5; +} + +static int check_powernow(void) +{ + struct cpuinfo_x86 *c = &cpu_data(0); + unsigned int maxei, eax, ebx, ecx, edx; + + if ((c->x86_vendor != X86_VENDOR_AMD) || (c->x86 != 6)) { +#ifdef MODULE + printk(KERN_INFO PFX "This module only works with " + "AMD K7 CPUs\n"); +#endif + return 0; + } + + /* Get maximum capabilities */ + maxei = cpuid_eax(0x80000000); + if (maxei < 0x80000007) { /* Any powernow info ? */ +#ifdef MODULE + printk(KERN_INFO PFX "No powernow capabilities detected\n"); +#endif + return 0; + } + + if ((c->x86_model == 6) && (c->x86_mask == 0)) { + printk(KERN_INFO PFX "K7 660[A0] core detected, " + "enabling errata workarounds\n"); + have_a0 = 1; + } + + cpuid(0x80000007, &eax, &ebx, &ecx, &edx); + + /* Check we can actually do something before we say anything.*/ + if (!(edx & (1 << 1 | 1 << 2))) + return 0; + + printk(KERN_INFO PFX "PowerNOW! Technology present. Can scale: "); + + if (edx & 1 << 1) { + printk("frequency"); + can_scale_bus = 1; + } + + if ((edx & (1 << 1 | 1 << 2)) == 0x6) + printk(" and "); + + if (edx & 1 << 2) { + printk("voltage"); + can_scale_vid = 1; + } + + printk(".\n"); + return 1; +} + +#ifdef CONFIG_X86_POWERNOW_K7_ACPI +static void invalidate_entry(unsigned int entry) +{ + powernow_table[entry].frequency = CPUFREQ_ENTRY_INVALID; +} +#endif + +static int get_ranges(unsigned char *pst) +{ + unsigned int j; + unsigned int speed; + u8 fid, vid; + + powernow_table = kzalloc((sizeof(struct cpufreq_frequency_table) * + (number_scales + 1)), GFP_KERNEL); + if (!powernow_table) + return -ENOMEM; + + for (j = 0 ; j < number_scales; j++) { + fid = *pst++; + + powernow_table[j].frequency = (fsb * fid_codes[fid]) / 10; + powernow_table[j].index = fid; /* lower 8 bits */ + + speed = powernow_table[j].frequency; + + if ((fid_codes[fid] % 10) == 5) { +#ifdef CONFIG_X86_POWERNOW_K7_ACPI + if (have_a0 == 1) + invalidate_entry(j); +#endif + } + + if (speed < minimum_speed) + minimum_speed = speed; + if (speed > maximum_speed) + maximum_speed = speed; + + vid = *pst++; + powernow_table[j].index |= (vid << 8); /* upper 8 bits */ + + pr_debug(" FID: 0x%x (%d.%dx [%dMHz]) " + "VID: 0x%x (%d.%03dV)\n", fid, fid_codes[fid] / 10, + fid_codes[fid] % 10, speed/1000, vid, + mobile_vid_table[vid]/1000, + mobile_vid_table[vid]%1000); + } + powernow_table[number_scales].frequency = CPUFREQ_TABLE_END; + powernow_table[number_scales].index = 0; + + return 0; +} + + +static void change_FID(int fid) +{ + union msr_fidvidctl fidvidctl; + + rdmsrl(MSR_K7_FID_VID_CTL, fidvidctl.val); + if (fidvidctl.bits.FID != fid) { + fidvidctl.bits.SGTC = latency; + fidvidctl.bits.FID = fid; + fidvidctl.bits.VIDC = 0; + fidvidctl.bits.FIDC = 1; + wrmsrl(MSR_K7_FID_VID_CTL, fidvidctl.val); + } +} + + +static void change_VID(int vid) +{ + union msr_fidvidctl fidvidctl; + + rdmsrl(MSR_K7_FID_VID_CTL, fidvidctl.val); + if (fidvidctl.bits.VID != vid) { + fidvidctl.bits.SGTC = latency; + fidvidctl.bits.VID = vid; + fidvidctl.bits.FIDC = 0; + fidvidctl.bits.VIDC = 1; + wrmsrl(MSR_K7_FID_VID_CTL, fidvidctl.val); + } +} + + +static void change_speed(unsigned int index) +{ + u8 fid, vid; + struct cpufreq_freqs freqs; + union msr_fidvidstatus fidvidstatus; + int cfid; + + /* fid are the lower 8 bits of the index we stored into + * the cpufreq frequency table in powernow_decode_bios, + * vid are the upper 8 bits. + */ + + fid = powernow_table[index].index & 0xFF; + vid = (powernow_table[index].index & 0xFF00) >> 8; + + freqs.cpu = 0; + + rdmsrl(MSR_K7_FID_VID_STATUS, fidvidstatus.val); + cfid = fidvidstatus.bits.CFID; + freqs.old = fsb * fid_codes[cfid] / 10; + + freqs.new = powernow_table[index].frequency; + + cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); + + /* Now do the magic poking into the MSRs. */ + + if (have_a0 == 1) /* A0 errata 5 */ + local_irq_disable(); + + if (freqs.old > freqs.new) { + /* Going down, so change FID first */ + change_FID(fid); + change_VID(vid); + } else { + /* Going up, so change VID first */ + change_VID(vid); + change_FID(fid); + } + + + if (have_a0 == 1) + local_irq_enable(); + + cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); +} + + +#ifdef CONFIG_X86_POWERNOW_K7_ACPI + +static struct acpi_processor_performance *acpi_processor_perf; + +static int powernow_acpi_init(void) +{ + int i; + int retval = 0; + union powernow_acpi_control_t pc; + + if (acpi_processor_perf != NULL && powernow_table != NULL) { + retval = -EINVAL; + goto err0; + } + + acpi_processor_perf = kzalloc(sizeof(struct acpi_processor_performance), + GFP_KERNEL); + if (!acpi_processor_perf) { + retval = -ENOMEM; + goto err0; + } + + if (!zalloc_cpumask_var(&acpi_processor_perf->shared_cpu_map, + GFP_KERNEL)) { + retval = -ENOMEM; + goto err05; + } + + if (acpi_processor_register_performance(acpi_processor_perf, 0)) { + retval = -EIO; + goto err1; + } + + if (acpi_processor_perf->control_register.space_id != + ACPI_ADR_SPACE_FIXED_HARDWARE) { + retval = -ENODEV; + goto err2; + } + + if (acpi_processor_perf->status_register.space_id != + ACPI_ADR_SPACE_FIXED_HARDWARE) { + retval = -ENODEV; + goto err2; + } + + number_scales = acpi_processor_perf->state_count; + + if (number_scales < 2) { + retval = -ENODEV; + goto err2; + } + + powernow_table = kzalloc((sizeof(struct cpufreq_frequency_table) * + (number_scales + 1)), GFP_KERNEL); + if (!powernow_table) { + retval = -ENOMEM; + goto err2; + } + + pc.val = (unsigned long) acpi_processor_perf->states[0].control; + for (i = 0; i < number_scales; i++) { + u8 fid, vid; + struct acpi_processor_px *state = + &acpi_processor_perf->states[i]; + unsigned int speed, speed_mhz; + + pc.val = (unsigned long) state->control; + pr_debug("acpi: P%d: %d MHz %d mW %d uS control %08x SGTC %d\n", + i, + (u32) state->core_frequency, + (u32) state->power, + (u32) state->transition_latency, + (u32) state->control, + pc.bits.sgtc); + + vid = pc.bits.vid; + fid = pc.bits.fid; + + powernow_table[i].frequency = fsb * fid_codes[fid] / 10; + powernow_table[i].index = fid; /* lower 8 bits */ + powernow_table[i].index |= (vid << 8); /* upper 8 bits */ + + speed = powernow_table[i].frequency; + speed_mhz = speed / 1000; + + /* processor_perflib will multiply the MHz value by 1000 to + * get a KHz value (e.g. 1266000). However, powernow-k7 works + * with true KHz values (e.g. 1266768). To ensure that all + * powernow frequencies are available, we must ensure that + * ACPI doesn't restrict them, so we round up the MHz value + * to ensure that perflib's computed KHz value is greater than + * or equal to powernow's KHz value. + */ + if (speed % 1000 > 0) + speed_mhz++; + + if ((fid_codes[fid] % 10) == 5) { + if (have_a0 == 1) + invalidate_entry(i); + } + + pr_debug(" FID: 0x%x (%d.%dx [%dMHz]) " + "VID: 0x%x (%d.%03dV)\n", fid, fid_codes[fid] / 10, + fid_codes[fid] % 10, speed_mhz, vid, + mobile_vid_table[vid]/1000, + mobile_vid_table[vid]%1000); + + if (state->core_frequency != speed_mhz) { + state->core_frequency = speed_mhz; + pr_debug(" Corrected ACPI frequency to %d\n", + speed_mhz); + } + + if (latency < pc.bits.sgtc) + latency = pc.bits.sgtc; + + if (speed < minimum_speed) + minimum_speed = speed; + if (speed > maximum_speed) + maximum_speed = speed; + } + + powernow_table[i].frequency = CPUFREQ_TABLE_END; + powernow_table[i].index = 0; + + /* notify BIOS that we exist */ + acpi_processor_notify_smm(THIS_MODULE); + + return 0; + +err2: + acpi_processor_unregister_performance(acpi_processor_perf, 0); +err1: + free_cpumask_var(acpi_processor_perf->shared_cpu_map); +err05: + kfree(acpi_processor_perf); +err0: + printk(KERN_WARNING PFX "ACPI perflib can not be used on " + "this platform\n"); + acpi_processor_perf = NULL; + return retval; +} +#else +static int powernow_acpi_init(void) +{ + printk(KERN_INFO PFX "no support for ACPI processor found." + " Please recompile your kernel with ACPI processor\n"); + return -EINVAL; +} +#endif + +static void print_pst_entry(struct pst_s *pst, unsigned int j) +{ + pr_debug("PST:%d (@%p)\n", j, pst); + pr_debug(" cpuid: 0x%x fsb: %d maxFID: 0x%x startvid: 0x%x\n", + pst->cpuid, pst->fsbspeed, pst->maxfid, pst->startvid); +} + +static int powernow_decode_bios(int maxfid, int startvid) +{ + struct psb_s *psb; + struct pst_s *pst; + unsigned int i, j; + unsigned char *p; + unsigned int etuple; + unsigned int ret; + + etuple = cpuid_eax(0x80000001); + + for (i = 0xC0000; i < 0xffff0 ; i += 16) { + + p = phys_to_virt(i); + + if (memcmp(p, "AMDK7PNOW!", 10) == 0) { + pr_debug("Found PSB header at %p\n", p); + psb = (struct psb_s *) p; + pr_debug("Table version: 0x%x\n", psb->tableversion); + if (psb->tableversion != 0x12) { + printk(KERN_INFO PFX "Sorry, only v1.2 tables" + " supported right now\n"); + return -ENODEV; + } + + pr_debug("Flags: 0x%x\n", psb->flags); + if ((psb->flags & 1) == 0) + pr_debug("Mobile voltage regulator\n"); + else + pr_debug("Desktop voltage regulator\n"); + + latency = psb->settlingtime; + if (latency < 100) { + printk(KERN_INFO PFX "BIOS set settling time " + "to %d microseconds. " + "Should be at least 100. " + "Correcting.\n", latency); + latency = 100; + } + pr_debug("Settling Time: %d microseconds.\n", + psb->settlingtime); + pr_debug("Has %d PST tables. (Only dumping ones " + "relevant to this CPU).\n", + psb->numpst); + + p += sizeof(struct psb_s); + + pst = (struct pst_s *) p; + + for (j = 0; j < psb->numpst; j++) { + pst = (struct pst_s *) p; + number_scales = pst->numpstates; + + if ((etuple == pst->cpuid) && + check_fsb(pst->fsbspeed) && + (maxfid == pst->maxfid) && + (startvid == pst->startvid)) { + print_pst_entry(pst, j); + p = (char *)pst + sizeof(struct pst_s); + ret = get_ranges(p); + return ret; + } else { + unsigned int k; + p = (char *)pst + sizeof(struct pst_s); + for (k = 0; k < number_scales; k++) + p += 2; + } + } + printk(KERN_INFO PFX "No PST tables match this cpuid " + "(0x%x)\n", etuple); + printk(KERN_INFO PFX "This is indicative of a broken " + "BIOS.\n"); + + return -EINVAL; + } + p++; + } + + return -ENODEV; +} + + +static int powernow_target(struct cpufreq_policy *policy, + unsigned int target_freq, + unsigned int relation) +{ + unsigned int newstate; + + if (cpufreq_frequency_table_target(policy, powernow_table, target_freq, + relation, &newstate)) + return -EINVAL; + + change_speed(newstate); + + return 0; +} + + +static int powernow_verify(struct cpufreq_policy *policy) +{ + return cpufreq_frequency_table_verify(policy, powernow_table); +} + +/* + * We use the fact that the bus frequency is somehow + * a multiple of 100000/3 khz, then we compute sgtc according + * to this multiple. + * That way, we match more how AMD thinks all of that work. + * We will then get the same kind of behaviour already tested under + * the "well-known" other OS. + */ +static int __cpuinit fixup_sgtc(void) +{ + unsigned int sgtc; + unsigned int m; + + m = fsb / 3333; + if ((m % 10) >= 5) + m += 5; + + m /= 10; + + sgtc = 100 * m * latency; + sgtc = sgtc / 3; + if (sgtc > 0xfffff) { + printk(KERN_WARNING PFX "SGTC too large %d\n", sgtc); + sgtc = 0xfffff; + } + return sgtc; +} + +static unsigned int powernow_get(unsigned int cpu) +{ + union msr_fidvidstatus fidvidstatus; + unsigned int cfid; + + if (cpu) + return 0; + rdmsrl(MSR_K7_FID_VID_STATUS, fidvidstatus.val); + cfid = fidvidstatus.bits.CFID; + + return fsb * fid_codes[cfid] / 10; +} + + +static int __cpuinit acer_cpufreq_pst(const struct dmi_system_id *d) +{ + printk(KERN_WARNING PFX + "%s laptop with broken PST tables in BIOS detected.\n", + d->ident); + printk(KERN_WARNING PFX + "You need to downgrade to 3A21 (09/09/2002), or try a newer " + "BIOS than 3A71 (01/20/2003)\n"); + printk(KERN_WARNING PFX + "cpufreq scaling has been disabled as a result of this.\n"); + return 0; +} + +/* + * Some Athlon laptops have really fucked PST tables. + * A BIOS update is all that can save them. + * Mention this, and disable cpufreq. + */ +static struct dmi_system_id __cpuinitdata powernow_dmi_table[] = { + { + .callback = acer_cpufreq_pst, + .ident = "Acer Aspire", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Insyde Software"), + DMI_MATCH(DMI_BIOS_VERSION, "3A71"), + }, + }, + { } +}; + +static int __cpuinit powernow_cpu_init(struct cpufreq_policy *policy) +{ + union msr_fidvidstatus fidvidstatus; + int result; + + if (policy->cpu != 0) + return -ENODEV; + + rdmsrl(MSR_K7_FID_VID_STATUS, fidvidstatus.val); + + recalibrate_cpu_khz(); + + fsb = (10 * cpu_khz) / fid_codes[fidvidstatus.bits.CFID]; + if (!fsb) { + printk(KERN_WARNING PFX "can not determine bus frequency\n"); + return -EINVAL; + } + pr_debug("FSB: %3dMHz\n", fsb/1000); + + if (dmi_check_system(powernow_dmi_table) || acpi_force) { + printk(KERN_INFO PFX "PSB/PST known to be broken. " + "Trying ACPI instead\n"); + result = powernow_acpi_init(); + } else { + result = powernow_decode_bios(fidvidstatus.bits.MFID, + fidvidstatus.bits.SVID); + if (result) { + printk(KERN_INFO PFX "Trying ACPI perflib\n"); + maximum_speed = 0; + minimum_speed = -1; + latency = 0; + result = powernow_acpi_init(); + if (result) { + printk(KERN_INFO PFX + "ACPI and legacy methods failed\n"); + } + } else { + /* SGTC use the bus clock as timer */ + latency = fixup_sgtc(); + printk(KERN_INFO PFX "SGTC: %d\n", latency); + } + } + + if (result) + return result; + + printk(KERN_INFO PFX "Minimum speed %d MHz. Maximum speed %d MHz.\n", + minimum_speed/1000, maximum_speed/1000); + + policy->cpuinfo.transition_latency = + cpufreq_scale(2000000UL, fsb, latency); + + policy->cur = powernow_get(0); + + cpufreq_frequency_table_get_attr(powernow_table, policy->cpu); + + return cpufreq_frequency_table_cpuinfo(policy, powernow_table); +} + +static int powernow_cpu_exit(struct cpufreq_policy *policy) +{ + cpufreq_frequency_table_put_attr(policy->cpu); + +#ifdef CONFIG_X86_POWERNOW_K7_ACPI + if (acpi_processor_perf) { + acpi_processor_unregister_performance(acpi_processor_perf, 0); + free_cpumask_var(acpi_processor_perf->shared_cpu_map); + kfree(acpi_processor_perf); + } +#endif + + kfree(powernow_table); + return 0; +} + +static struct freq_attr *powernow_table_attr[] = { + &cpufreq_freq_attr_scaling_available_freqs, + NULL, +}; + +static struct cpufreq_driver powernow_driver = { + .verify = powernow_verify, + .target = powernow_target, + .get = powernow_get, +#ifdef CONFIG_X86_POWERNOW_K7_ACPI + .bios_limit = acpi_processor_get_bios_limit, +#endif + .init = powernow_cpu_init, + .exit = powernow_cpu_exit, + .name = "powernow-k7", + .owner = THIS_MODULE, + .attr = powernow_table_attr, +}; + +static int __init powernow_init(void) +{ + if (check_powernow() == 0) + return -ENODEV; + return cpufreq_register_driver(&powernow_driver); +} + + +static void __exit powernow_exit(void) +{ + cpufreq_unregister_driver(&powernow_driver); +} + +module_param(acpi_force, int, 0444); +MODULE_PARM_DESC(acpi_force, "Force ACPI to be used."); + +MODULE_AUTHOR("Dave Jones "); +MODULE_DESCRIPTION("Powernow driver for AMD K7 processors."); +MODULE_LICENSE("GPL"); + +late_initcall(powernow_init); +module_exit(powernow_exit); + diff --git a/drivers/cpufreq/powernow-k7.h b/drivers/cpufreq/powernow-k7.h new file mode 100644 index 0000000..35fb4ea --- /dev/null +++ b/drivers/cpufreq/powernow-k7.h @@ -0,0 +1,43 @@ +/* + * (C) 2003 Dave Jones. + * + * Licensed under the terms of the GNU GPL License version 2. + * + * AMD-specific information + * + */ + +union msr_fidvidctl { + struct { + unsigned FID:5, // 4:0 + reserved1:3, // 7:5 + VID:5, // 12:8 + reserved2:3, // 15:13 + FIDC:1, // 16 + VIDC:1, // 17 + reserved3:2, // 19:18 + FIDCHGRATIO:1, // 20 + reserved4:11, // 31-21 + SGTC:20, // 32:51 + reserved5:12; // 63:52 + } bits; + unsigned long long val; +}; + +union msr_fidvidstatus { + struct { + unsigned CFID:5, // 4:0 + reserved1:3, // 7:5 + SFID:5, // 12:8 + reserved2:3, // 15:13 + MFID:5, // 20:16 + reserved3:11, // 31:21 + CVID:5, // 36:32 + reserved4:3, // 39:37 + SVID:5, // 44:40 + reserved5:3, // 47:45 + MVID:5, // 52:48 + reserved6:11; // 63:53 + } bits; + unsigned long long val; +}; diff --git a/drivers/cpufreq/powernow-k8.c b/drivers/cpufreq/powernow-k8.c new file mode 100644 index 0000000..83479b6 --- /dev/null +++ b/drivers/cpufreq/powernow-k8.c @@ -0,0 +1,1607 @@ +/* + * (c) 2003-2010 Advanced Micro Devices, Inc. + * Your use of this code is subject to the terms and conditions of the + * GNU general public license version 2. See "COPYING" or + * http://www.gnu.org/licenses/gpl.html + * + * Support : mark.langsdorf@amd.com + * + * Based on the powernow-k7.c module written by Dave Jones. + * (C) 2003 Dave Jones on behalf of SuSE Labs + * (C) 2004 Dominik Brodowski + * (C) 2004 Pavel Machek + * Licensed under the terms of the GNU GPL License version 2. + * Based upon datasheets & sample CPUs kindly provided by AMD. + * + * Valuable input gratefully received from Dave Jones, Pavel Machek, + * Dominik Brodowski, Jacob Shin, and others. + * Originally developed by Paul Devriendt. + * Processor information obtained from Chapter 9 (Power and Thermal Management) + * of the "BIOS and Kernel Developer's Guide for the AMD Athlon 64 and AMD + * Opteron Processors" available for download from www.amd.com + * + * Tables for specific CPUs can be inferred from + * http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/30430.pdf + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include /* for current / set_cpus_allowed() */ +#include +#include + +#include + +#include +#include +#include + +#define PFX "powernow-k8: " +#define VERSION "version 2.20.00" +#include "powernow-k8.h" +#include "mperf.h" + +/* serialize freq changes */ +static DEFINE_MUTEX(fidvid_mutex); + +static DEFINE_PER_CPU(struct powernow_k8_data *, powernow_data); + +static int cpu_family = CPU_OPTERON; + +/* core performance boost */ +static bool cpb_capable, cpb_enabled; +static struct msr __percpu *msrs; + +static struct cpufreq_driver cpufreq_amd64_driver; + +#ifndef CONFIG_SMP +static inline const struct cpumask *cpu_core_mask(int cpu) +{ + return cpumask_of(0); +} +#endif + +/* Return a frequency in MHz, given an input fid */ +static u32 find_freq_from_fid(u32 fid) +{ + return 800 + (fid * 100); +} + +/* Return a frequency in KHz, given an input fid */ +static u32 find_khz_freq_from_fid(u32 fid) +{ + return 1000 * find_freq_from_fid(fid); +} + +static u32 find_khz_freq_from_pstate(struct cpufreq_frequency_table *data, + u32 pstate) +{ + return data[pstate].frequency; +} + +/* Return the vco fid for an input fid + * + * Each "low" fid has corresponding "high" fid, and you can get to "low" fids + * only from corresponding high fids. This returns "high" fid corresponding to + * "low" one. + */ +static u32 convert_fid_to_vco_fid(u32 fid) +{ + if (fid < HI_FID_TABLE_BOTTOM) + return 8 + (2 * fid); + else + return fid; +} + +/* + * Return 1 if the pending bit is set. Unless we just instructed the processor + * to transition to a new state, seeing this bit set is really bad news. + */ +static int pending_bit_stuck(void) +{ + u32 lo, hi; + + if (cpu_family == CPU_HW_PSTATE) + return 0; + + rdmsr(MSR_FIDVID_STATUS, lo, hi); + return lo & MSR_S_LO_CHANGE_PENDING ? 1 : 0; +} + +/* + * Update the global current fid / vid values from the status msr. + * Returns 1 on error. + */ +static int query_current_values_with_pending_wait(struct powernow_k8_data *data) +{ + u32 lo, hi; + u32 i = 0; + + if (cpu_family == CPU_HW_PSTATE) { + rdmsr(MSR_PSTATE_STATUS, lo, hi); + i = lo & HW_PSTATE_MASK; + data->currpstate = i; + + /* + * a workaround for family 11h erratum 311 might cause + * an "out-of-range Pstate if the core is in Pstate-0 + */ + if ((boot_cpu_data.x86 == 0x11) && (i >= data->numps)) + data->currpstate = HW_PSTATE_0; + + return 0; + } + do { + if (i++ > 10000) { + pr_debug("detected change pending stuck\n"); + return 1; + } + rdmsr(MSR_FIDVID_STATUS, lo, hi); + } while (lo & MSR_S_LO_CHANGE_PENDING); + + data->currvid = hi & MSR_S_HI_CURRENT_VID; + data->currfid = lo & MSR_S_LO_CURRENT_FID; + + return 0; +} + +/* the isochronous relief time */ +static void count_off_irt(struct powernow_k8_data *data) +{ + udelay((1 << data->irt) * 10); + return; +} + +/* the voltage stabilization time */ +static void count_off_vst(struct powernow_k8_data *data) +{ + udelay(data->vstable * VST_UNITS_20US); + return; +} + +/* need to init the control msr to a safe value (for each cpu) */ +static void fidvid_msr_init(void) +{ + u32 lo, hi; + u8 fid, vid; + + rdmsr(MSR_FIDVID_STATUS, lo, hi); + vid = hi & MSR_S_HI_CURRENT_VID; + fid = lo & MSR_S_LO_CURRENT_FID; + lo = fid | (vid << MSR_C_LO_VID_SHIFT); + hi = MSR_C_HI_STP_GNT_BENIGN; + pr_debug("cpu%d, init lo 0x%x, hi 0x%x\n", smp_processor_id(), lo, hi); + wrmsr(MSR_FIDVID_CTL, lo, hi); +} + +/* write the new fid value along with the other control fields to the msr */ +static int write_new_fid(struct powernow_k8_data *data, u32 fid) +{ + u32 lo; + u32 savevid = data->currvid; + u32 i = 0; + + if ((fid & INVALID_FID_MASK) || (data->currvid & INVALID_VID_MASK)) { + printk(KERN_ERR PFX "internal error - overflow on fid write\n"); + return 1; + } + + lo = fid; + lo |= (data->currvid << MSR_C_LO_VID_SHIFT); + lo |= MSR_C_LO_INIT_FID_VID; + + pr_debug("writing fid 0x%x, lo 0x%x, hi 0x%x\n", + fid, lo, data->plllock * PLL_LOCK_CONVERSION); + + do { + wrmsr(MSR_FIDVID_CTL, lo, data->plllock * PLL_LOCK_CONVERSION); + if (i++ > 100) { + printk(KERN_ERR PFX + "Hardware error - pending bit very stuck - " + "no further pstate changes possible\n"); + return 1; + } + } while (query_current_values_with_pending_wait(data)); + + count_off_irt(data); + + if (savevid != data->currvid) { + printk(KERN_ERR PFX + "vid change on fid trans, old 0x%x, new 0x%x\n", + savevid, data->currvid); + return 1; + } + + if (fid != data->currfid) { + printk(KERN_ERR PFX + "fid trans failed, fid 0x%x, curr 0x%x\n", fid, + data->currfid); + return 1; + } + + return 0; +} + +/* Write a new vid to the hardware */ +static int write_new_vid(struct powernow_k8_data *data, u32 vid) +{ + u32 lo; + u32 savefid = data->currfid; + int i = 0; + + if ((data->currfid & INVALID_FID_MASK) || (vid & INVALID_VID_MASK)) { + printk(KERN_ERR PFX "internal error - overflow on vid write\n"); + return 1; + } + + lo = data->currfid; + lo |= (vid << MSR_C_LO_VID_SHIFT); + lo |= MSR_C_LO_INIT_FID_VID; + + pr_debug("writing vid 0x%x, lo 0x%x, hi 0x%x\n", + vid, lo, STOP_GRANT_5NS); + + do { + wrmsr(MSR_FIDVID_CTL, lo, STOP_GRANT_5NS); + if (i++ > 100) { + printk(KERN_ERR PFX "internal error - pending bit " + "very stuck - no further pstate " + "changes possible\n"); + return 1; + } + } while (query_current_values_with_pending_wait(data)); + + if (savefid != data->currfid) { + printk(KERN_ERR PFX "fid changed on vid trans, old " + "0x%x new 0x%x\n", + savefid, data->currfid); + return 1; + } + + if (vid != data->currvid) { + printk(KERN_ERR PFX "vid trans failed, vid 0x%x, " + "curr 0x%x\n", + vid, data->currvid); + return 1; + } + + return 0; +} + +/* + * Reduce the vid by the max of step or reqvid. + * Decreasing vid codes represent increasing voltages: + * vid of 0 is 1.550V, vid of 0x1e is 0.800V, vid of VID_OFF is off. + */ +static int decrease_vid_code_by_step(struct powernow_k8_data *data, + u32 reqvid, u32 step) +{ + if ((data->currvid - reqvid) > step) + reqvid = data->currvid - step; + + if (write_new_vid(data, reqvid)) + return 1; + + count_off_vst(data); + + return 0; +} + +/* Change hardware pstate by single MSR write */ +static int transition_pstate(struct powernow_k8_data *data, u32 pstate) +{ + wrmsr(MSR_PSTATE_CTRL, pstate, 0); + data->currpstate = pstate; + return 0; +} + +/* Change Opteron/Athlon64 fid and vid, by the 3 phases. */ +static int transition_fid_vid(struct powernow_k8_data *data, + u32 reqfid, u32 reqvid) +{ + if (core_voltage_pre_transition(data, reqvid, reqfid)) + return 1; + + if (core_frequency_transition(data, reqfid)) + return 1; + + if (core_voltage_post_transition(data, reqvid)) + return 1; + + if (query_current_values_with_pending_wait(data)) + return 1; + + if ((reqfid != data->currfid) || (reqvid != data->currvid)) { + printk(KERN_ERR PFX "failed (cpu%d): req 0x%x 0x%x, " + "curr 0x%x 0x%x\n", + smp_processor_id(), + reqfid, reqvid, data->currfid, data->currvid); + return 1; + } + + pr_debug("transitioned (cpu%d): new fid 0x%x, vid 0x%x\n", + smp_processor_id(), data->currfid, data->currvid); + + return 0; +} + +/* Phase 1 - core voltage transition ... setup voltage */ +static int core_voltage_pre_transition(struct powernow_k8_data *data, + u32 reqvid, u32 reqfid) +{ + u32 rvosteps = data->rvo; + u32 savefid = data->currfid; + u32 maxvid, lo, rvomult = 1; + + pr_debug("ph1 (cpu%d): start, currfid 0x%x, currvid 0x%x, " + "reqvid 0x%x, rvo 0x%x\n", + smp_processor_id(), + data->currfid, data->currvid, reqvid, data->rvo); + + if ((savefid < LO_FID_TABLE_TOP) && (reqfid < LO_FID_TABLE_TOP)) + rvomult = 2; + rvosteps *= rvomult; + rdmsr(MSR_FIDVID_STATUS, lo, maxvid); + maxvid = 0x1f & (maxvid >> 16); + pr_debug("ph1 maxvid=0x%x\n", maxvid); + if (reqvid < maxvid) /* lower numbers are higher voltages */ + reqvid = maxvid; + + while (data->currvid > reqvid) { + pr_debug("ph1: curr 0x%x, req vid 0x%x\n", + data->currvid, reqvid); + if (decrease_vid_code_by_step(data, reqvid, data->vidmvs)) + return 1; + } + + while ((rvosteps > 0) && + ((rvomult * data->rvo + data->currvid) > reqvid)) { + if (data->currvid == maxvid) { + rvosteps = 0; + } else { + pr_debug("ph1: changing vid for rvo, req 0x%x\n", + data->currvid - 1); + if (decrease_vid_code_by_step(data, data->currvid-1, 1)) + return 1; + rvosteps--; + } + } + + if (query_current_values_with_pending_wait(data)) + return 1; + + if (savefid != data->currfid) { + printk(KERN_ERR PFX "ph1 err, currfid changed 0x%x\n", + data->currfid); + return 1; + } + + pr_debug("ph1 complete, currfid 0x%x, currvid 0x%x\n", + data->currfid, data->currvid); + + return 0; +} + +/* Phase 2 - core frequency transition */ +static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid) +{ + u32 vcoreqfid, vcocurrfid, vcofiddiff; + u32 fid_interval, savevid = data->currvid; + + if (data->currfid == reqfid) { + printk(KERN_ERR PFX "ph2 null fid transition 0x%x\n", + data->currfid); + return 0; + } + + pr_debug("ph2 (cpu%d): starting, currfid 0x%x, currvid 0x%x, " + "reqfid 0x%x\n", + smp_processor_id(), + data->currfid, data->currvid, reqfid); + + vcoreqfid = convert_fid_to_vco_fid(reqfid); + vcocurrfid = convert_fid_to_vco_fid(data->currfid); + vcofiddiff = vcocurrfid > vcoreqfid ? vcocurrfid - vcoreqfid + : vcoreqfid - vcocurrfid; + + if ((reqfid <= LO_FID_TABLE_TOP) && (data->currfid <= LO_FID_TABLE_TOP)) + vcofiddiff = 0; + + while (vcofiddiff > 2) { + (data->currfid & 1) ? (fid_interval = 1) : (fid_interval = 2); + + if (reqfid > data->currfid) { + if (data->currfid > LO_FID_TABLE_TOP) { + if (write_new_fid(data, + data->currfid + fid_interval)) + return 1; + } else { + if (write_new_fid + (data, + 2 + convert_fid_to_vco_fid(data->currfid))) + return 1; + } + } else { + if (write_new_fid(data, data->currfid - fid_interval)) + return 1; + } + + vcocurrfid = convert_fid_to_vco_fid(data->currfid); + vcofiddiff = vcocurrfid > vcoreqfid ? vcocurrfid - vcoreqfid + : vcoreqfid - vcocurrfid; + } + + if (write_new_fid(data, reqfid)) + return 1; + + if (query_current_values_with_pending_wait(data)) + return 1; + + if (data->currfid != reqfid) { + printk(KERN_ERR PFX + "ph2: mismatch, failed fid transition, " + "curr 0x%x, req 0x%x\n", + data->currfid, reqfid); + return 1; + } + + if (savevid != data->currvid) { + printk(KERN_ERR PFX "ph2: vid changed, save 0x%x, curr 0x%x\n", + savevid, data->currvid); + return 1; + } + + pr_debug("ph2 complete, currfid 0x%x, currvid 0x%x\n", + data->currfid, data->currvid); + + return 0; +} + +/* Phase 3 - core voltage transition flow ... jump to the final vid. */ +static int core_voltage_post_transition(struct powernow_k8_data *data, + u32 reqvid) +{ + u32 savefid = data->currfid; + u32 savereqvid = reqvid; + + pr_debug("ph3 (cpu%d): starting, currfid 0x%x, currvid 0x%x\n", + smp_processor_id(), + data->currfid, data->currvid); + + if (reqvid != data->currvid) { + if (write_new_vid(data, reqvid)) + return 1; + + if (savefid != data->currfid) { + printk(KERN_ERR PFX + "ph3: bad fid change, save 0x%x, curr 0x%x\n", + savefid, data->currfid); + return 1; + } + + if (data->currvid != reqvid) { + printk(KERN_ERR PFX + "ph3: failed vid transition\n, " + "req 0x%x, curr 0x%x", + reqvid, data->currvid); + return 1; + } + } + + if (query_current_values_with_pending_wait(data)) + return 1; + + if (savereqvid != data->currvid) { + pr_debug("ph3 failed, currvid 0x%x\n", data->currvid); + return 1; + } + + if (savefid != data->currfid) { + pr_debug("ph3 failed, currfid changed 0x%x\n", + data->currfid); + return 1; + } + + pr_debug("ph3 complete, currfid 0x%x, currvid 0x%x\n", + data->currfid, data->currvid); + + return 0; +} + +static void check_supported_cpu(void *_rc) +{ + u32 eax, ebx, ecx, edx; + int *rc = _rc; + + *rc = -ENODEV; + + if (__this_cpu_read(cpu_info.x86_vendor) != X86_VENDOR_AMD) + return; + + eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE); + if (((eax & CPUID_XFAM) != CPUID_XFAM_K8) && + ((eax & CPUID_XFAM) < CPUID_XFAM_10H)) + return; + + if ((eax & CPUID_XFAM) == CPUID_XFAM_K8) { + if (((eax & CPUID_USE_XFAM_XMOD) != CPUID_USE_XFAM_XMOD) || + ((eax & CPUID_XMOD) > CPUID_XMOD_REV_MASK)) { + printk(KERN_INFO PFX + "Processor cpuid %x not supported\n", eax); + return; + } + + eax = cpuid_eax(CPUID_GET_MAX_CAPABILITIES); + if (eax < CPUID_FREQ_VOLT_CAPABILITIES) { + printk(KERN_INFO PFX + "No frequency change capabilities detected\n"); + return; + } + + cpuid(CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, &edx); + if ((edx & P_STATE_TRANSITION_CAPABLE) + != P_STATE_TRANSITION_CAPABLE) { + printk(KERN_INFO PFX + "Power state transitions not supported\n"); + return; + } + } else { /* must be a HW Pstate capable processor */ + cpuid(CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, &edx); + if ((edx & USE_HW_PSTATE) == USE_HW_PSTATE) + cpu_family = CPU_HW_PSTATE; + else + return; + } + + *rc = 0; +} + +static int check_pst_table(struct powernow_k8_data *data, struct pst_s *pst, + u8 maxvid) +{ + unsigned int j; + u8 lastfid = 0xff; + + for (j = 0; j < data->numps; j++) { + if (pst[j].vid > LEAST_VID) { + printk(KERN_ERR FW_BUG PFX "vid %d invalid : 0x%x\n", + j, pst[j].vid); + return -EINVAL; + } + if (pst[j].vid < data->rvo) { + /* vid + rvo >= 0 */ + printk(KERN_ERR FW_BUG PFX "0 vid exceeded with pstate" + " %d\n", j); + return -ENODEV; + } + if (pst[j].vid < maxvid + data->rvo) { + /* vid + rvo >= maxvid */ + printk(KERN_ERR FW_BUG PFX "maxvid exceeded with pstate" + " %d\n", j); + return -ENODEV; + } + if (pst[j].fid > MAX_FID) { + printk(KERN_ERR FW_BUG PFX "maxfid exceeded with pstate" + " %d\n", j); + return -ENODEV; + } + if (j && (pst[j].fid < HI_FID_TABLE_BOTTOM)) { + /* Only first fid is allowed to be in "low" range */ + printk(KERN_ERR FW_BUG PFX "two low fids - %d : " + "0x%x\n", j, pst[j].fid); + return -EINVAL; + } + if (pst[j].fid < lastfid) + lastfid = pst[j].fid; + } + if (lastfid & 1) { + printk(KERN_ERR FW_BUG PFX "lastfid invalid\n"); + return -EINVAL; + } + if (lastfid > LO_FID_TABLE_TOP) + printk(KERN_INFO FW_BUG PFX + "first fid not from lo freq table\n"); + + return 0; +} + +static void invalidate_entry(struct cpufreq_frequency_table *powernow_table, + unsigned int entry) +{ + powernow_table[entry].frequency = CPUFREQ_ENTRY_INVALID; +} + +static void print_basics(struct powernow_k8_data *data) +{ + int j; + for (j = 0; j < data->numps; j++) { + if (data->powernow_table[j].frequency != + CPUFREQ_ENTRY_INVALID) { + if (cpu_family == CPU_HW_PSTATE) { + printk(KERN_INFO PFX + " %d : pstate %d (%d MHz)\n", j, + data->powernow_table[j].index, + data->powernow_table[j].frequency/1000); + } else { + printk(KERN_INFO PFX + "fid 0x%x (%d MHz), vid 0x%x\n", + data->powernow_table[j].index & 0xff, + data->powernow_table[j].frequency/1000, + data->powernow_table[j].index >> 8); + } + } + } + if (data->batps) + printk(KERN_INFO PFX "Only %d pstates on battery\n", + data->batps); +} + +static u32 freq_from_fid_did(u32 fid, u32 did) +{ + u32 mhz = 0; + + if (boot_cpu_data.x86 == 0x10) + mhz = (100 * (fid + 0x10)) >> did; + else if (boot_cpu_data.x86 == 0x11) + mhz = (100 * (fid + 8)) >> did; + else + BUG(); + + return mhz * 1000; +} + +static int fill_powernow_table(struct powernow_k8_data *data, + struct pst_s *pst, u8 maxvid) +{ + struct cpufreq_frequency_table *powernow_table; + unsigned int j; + + if (data->batps) { + /* use ACPI support to get full speed on mains power */ + printk(KERN_WARNING PFX + "Only %d pstates usable (use ACPI driver for full " + "range\n", data->batps); + data->numps = data->batps; + } + + for (j = 1; j < data->numps; j++) { + if (pst[j-1].fid >= pst[j].fid) { + printk(KERN_ERR PFX "PST out of sequence\n"); + return -EINVAL; + } + } + + if (data->numps < 2) { + printk(KERN_ERR PFX "no p states to transition\n"); + return -ENODEV; + } + + if (check_pst_table(data, pst, maxvid)) + return -EINVAL; + + powernow_table = kmalloc((sizeof(struct cpufreq_frequency_table) + * (data->numps + 1)), GFP_KERNEL); + if (!powernow_table) { + printk(KERN_ERR PFX "powernow_table memory alloc failure\n"); + return -ENOMEM; + } + + for (j = 0; j < data->numps; j++) { + int freq; + powernow_table[j].index = pst[j].fid; /* lower 8 bits */ + powernow_table[j].index |= (pst[j].vid << 8); /* upper 8 bits */ + freq = find_khz_freq_from_fid(pst[j].fid); + powernow_table[j].frequency = freq; + } + powernow_table[data->numps].frequency = CPUFREQ_TABLE_END; + powernow_table[data->numps].index = 0; + + if (query_current_values_with_pending_wait(data)) { + kfree(powernow_table); + return -EIO; + } + + pr_debug("cfid 0x%x, cvid 0x%x\n", data->currfid, data->currvid); + data->powernow_table = powernow_table; + if (cpumask_first(cpu_core_mask(data->cpu)) == data->cpu) + print_basics(data); + + for (j = 0; j < data->numps; j++) + if ((pst[j].fid == data->currfid) && + (pst[j].vid == data->currvid)) + return 0; + + pr_debug("currfid/vid do not match PST, ignoring\n"); + return 0; +} + +/* Find and validate the PSB/PST table in BIOS. */ +static int find_psb_table(struct powernow_k8_data *data) +{ + struct psb_s *psb; + unsigned int i; + u32 mvs; + u8 maxvid; + u32 cpst = 0; + u32 thiscpuid; + + for (i = 0xc0000; i < 0xffff0; i += 0x10) { + /* Scan BIOS looking for the signature. */ + /* It can not be at ffff0 - it is too big. */ + + psb = phys_to_virt(i); + if (memcmp(psb, PSB_ID_STRING, PSB_ID_STRING_LEN) != 0) + continue; + + pr_debug("found PSB header at 0x%p\n", psb); + + pr_debug("table vers: 0x%x\n", psb->tableversion); + if (psb->tableversion != PSB_VERSION_1_4) { + printk(KERN_ERR FW_BUG PFX "PSB table is not v1.4\n"); + return -ENODEV; + } + + pr_debug("flags: 0x%x\n", psb->flags1); + if (psb->flags1) { + printk(KERN_ERR FW_BUG PFX "unknown flags\n"); + return -ENODEV; + } + + data->vstable = psb->vstable; + pr_debug("voltage stabilization time: %d(*20us)\n", + data->vstable); + + pr_debug("flags2: 0x%x\n", psb->flags2); + data->rvo = psb->flags2 & 3; + data->irt = ((psb->flags2) >> 2) & 3; + mvs = ((psb->flags2) >> 4) & 3; + data->vidmvs = 1 << mvs; + data->batps = ((psb->flags2) >> 6) & 3; + + pr_debug("ramp voltage offset: %d\n", data->rvo); + pr_debug("isochronous relief time: %d\n", data->irt); + pr_debug("maximum voltage step: %d - 0x%x\n", mvs, data->vidmvs); + + pr_debug("numpst: 0x%x\n", psb->num_tables); + cpst = psb->num_tables; + if ((psb->cpuid == 0x00000fc0) || + (psb->cpuid == 0x00000fe0)) { + thiscpuid = cpuid_eax(CPUID_PROCESSOR_SIGNATURE); + if ((thiscpuid == 0x00000fc0) || + (thiscpuid == 0x00000fe0)) + cpst = 1; + } + if (cpst != 1) { + printk(KERN_ERR FW_BUG PFX "numpst must be 1\n"); + return -ENODEV; + } + + data->plllock = psb->plllocktime; + pr_debug("plllocktime: 0x%x (units 1us)\n", psb->plllocktime); + pr_debug("maxfid: 0x%x\n", psb->maxfid); + pr_debug("maxvid: 0x%x\n", psb->maxvid); + maxvid = psb->maxvid; + + data->numps = psb->numps; + pr_debug("numpstates: 0x%x\n", data->numps); + return fill_powernow_table(data, + (struct pst_s *)(psb+1), maxvid); + } + /* + * If you see this message, complain to BIOS manufacturer. If + * he tells you "we do not support Linux" or some similar + * nonsense, remember that Windows 2000 uses the same legacy + * mechanism that the old Linux PSB driver uses. Tell them it + * is broken with Windows 2000. + * + * The reference to the AMD documentation is chapter 9 in the + * BIOS and Kernel Developer's Guide, which is available on + * www.amd.com + */ + printk(KERN_ERR FW_BUG PFX "No PSB or ACPI _PSS objects\n"); + printk(KERN_ERR PFX "Make sure that your BIOS is up to date" + " and Cool'N'Quiet support is enabled in BIOS setup\n"); + return -ENODEV; +} + +static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, + unsigned int index) +{ + u64 control; + + if (!data->acpi_data.state_count || (cpu_family == CPU_HW_PSTATE)) + return; + + control = data->acpi_data.states[index].control; + data->irt = (control >> IRT_SHIFT) & IRT_MASK; + data->rvo = (control >> RVO_SHIFT) & RVO_MASK; + data->exttype = (control >> EXT_TYPE_SHIFT) & EXT_TYPE_MASK; + data->plllock = (control >> PLL_L_SHIFT) & PLL_L_MASK; + data->vidmvs = 1 << ((control >> MVS_SHIFT) & MVS_MASK); + data->vstable = (control >> VST_SHIFT) & VST_MASK; +} + +static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) +{ + struct cpufreq_frequency_table *powernow_table; + int ret_val = -ENODEV; + u64 control, status; + + if (acpi_processor_register_performance(&data->acpi_data, data->cpu)) { + pr_debug("register performance failed: bad ACPI data\n"); + return -EIO; + } + + /* verify the data contained in the ACPI structures */ + if (data->acpi_data.state_count <= 1) { + pr_debug("No ACPI P-States\n"); + goto err_out; + } + + control = data->acpi_data.control_register.space_id; + status = data->acpi_data.status_register.space_id; + + if ((control != ACPI_ADR_SPACE_FIXED_HARDWARE) || + (status != ACPI_ADR_SPACE_FIXED_HARDWARE)) { + pr_debug("Invalid control/status registers (%llx - %llx)\n", + control, status); + goto err_out; + } + + /* fill in data->powernow_table */ + powernow_table = kmalloc((sizeof(struct cpufreq_frequency_table) + * (data->acpi_data.state_count + 1)), GFP_KERNEL); + if (!powernow_table) { + pr_debug("powernow_table memory alloc failure\n"); + goto err_out; + } + + /* fill in data */ + data->numps = data->acpi_data.state_count; + powernow_k8_acpi_pst_values(data, 0); + + if (cpu_family == CPU_HW_PSTATE) + ret_val = fill_powernow_table_pstate(data, powernow_table); + else + ret_val = fill_powernow_table_fidvid(data, powernow_table); + if (ret_val) + goto err_out_mem; + + powernow_table[data->acpi_data.state_count].frequency = + CPUFREQ_TABLE_END; + powernow_table[data->acpi_data.state_count].index = 0; + data->powernow_table = powernow_table; + + if (cpumask_first(cpu_core_mask(data->cpu)) == data->cpu) + print_basics(data); + + /* notify BIOS that we exist */ + acpi_processor_notify_smm(THIS_MODULE); + + if (!zalloc_cpumask_var(&data->acpi_data.shared_cpu_map, GFP_KERNEL)) { + printk(KERN_ERR PFX + "unable to alloc powernow_k8_data cpumask\n"); + ret_val = -ENOMEM; + goto err_out_mem; + } + + return 0; + +err_out_mem: + kfree(powernow_table); + +err_out: + acpi_processor_unregister_performance(&data->acpi_data, data->cpu); + + /* data->acpi_data.state_count informs us at ->exit() + * whether ACPI was used */ + data->acpi_data.state_count = 0; + + return ret_val; +} + +static int fill_powernow_table_pstate(struct powernow_k8_data *data, + struct cpufreq_frequency_table *powernow_table) +{ + int i; + u32 hi = 0, lo = 0; + rdmsr(MSR_PSTATE_CUR_LIMIT, lo, hi); + data->max_hw_pstate = (lo & HW_PSTATE_MAX_MASK) >> HW_PSTATE_MAX_SHIFT; + + for (i = 0; i < data->acpi_data.state_count; i++) { + u32 index; + + index = data->acpi_data.states[i].control & HW_PSTATE_MASK; + if (index > data->max_hw_pstate) { + printk(KERN_ERR PFX "invalid pstate %d - " + "bad value %d.\n", i, index); + printk(KERN_ERR PFX "Please report to BIOS " + "manufacturer\n"); + invalidate_entry(powernow_table, i); + continue; + } + rdmsr(MSR_PSTATE_DEF_BASE + index, lo, hi); + if (!(hi & HW_PSTATE_VALID_MASK)) { + pr_debug("invalid pstate %d, ignoring\n", index); + invalidate_entry(powernow_table, i); + continue; + } + + powernow_table[i].index = index; + + /* Frequency may be rounded for these */ + if ((boot_cpu_data.x86 == 0x10 && boot_cpu_data.x86_model < 10) + || boot_cpu_data.x86 == 0x11) { + powernow_table[i].frequency = + freq_from_fid_did(lo & 0x3f, (lo >> 6) & 7); + } else + powernow_table[i].frequency = + data->acpi_data.states[i].core_frequency * 1000; + } + return 0; +} + +static int fill_powernow_table_fidvid(struct powernow_k8_data *data, + struct cpufreq_frequency_table *powernow_table) +{ + int i; + + for (i = 0; i < data->acpi_data.state_count; i++) { + u32 fid; + u32 vid; + u32 freq, index; + u64 status, control; + + if (data->exttype) { + status = data->acpi_data.states[i].status; + fid = status & EXT_FID_MASK; + vid = (status >> VID_SHIFT) & EXT_VID_MASK; + } else { + control = data->acpi_data.states[i].control; + fid = control & FID_MASK; + vid = (control >> VID_SHIFT) & VID_MASK; + } + + pr_debug(" %d : fid 0x%x, vid 0x%x\n", i, fid, vid); + + index = fid | (vid<<8); + powernow_table[i].index = index; + + freq = find_khz_freq_from_fid(fid); + powernow_table[i].frequency = freq; + + /* verify frequency is OK */ + if ((freq > (MAX_FREQ * 1000)) || (freq < (MIN_FREQ * 1000))) { + pr_debug("invalid freq %u kHz, ignoring\n", freq); + invalidate_entry(powernow_table, i); + continue; + } + + /* verify voltage is OK - + * BIOSs are using "off" to indicate invalid */ + if (vid == VID_OFF) { + pr_debug("invalid vid %u, ignoring\n", vid); + invalidate_entry(powernow_table, i); + continue; + } + + if (freq != (data->acpi_data.states[i].core_frequency * 1000)) { + printk(KERN_INFO PFX "invalid freq entries " + "%u kHz vs. %u kHz\n", freq, + (unsigned int) + (data->acpi_data.states[i].core_frequency + * 1000)); + invalidate_entry(powernow_table, i); + continue; + } + } + return 0; +} + +static void powernow_k8_cpu_exit_acpi(struct powernow_k8_data *data) +{ + if (data->acpi_data.state_count) + acpi_processor_unregister_performance(&data->acpi_data, + data->cpu); + free_cpumask_var(data->acpi_data.shared_cpu_map); +} + +static int get_transition_latency(struct powernow_k8_data *data) +{ + int max_latency = 0; + int i; + for (i = 0; i < data->acpi_data.state_count; i++) { + int cur_latency = data->acpi_data.states[i].transition_latency + + data->acpi_data.states[i].bus_master_latency; + if (cur_latency > max_latency) + max_latency = cur_latency; + } + if (max_latency == 0) { + /* + * Fam 11h and later may return 0 as transition latency. This + * is intended and means "very fast". While cpufreq core and + * governors currently can handle that gracefully, better set it + * to 1 to avoid problems in the future. + */ + if (boot_cpu_data.x86 < 0x11) + printk(KERN_ERR FW_WARN PFX "Invalid zero transition " + "latency\n"); + max_latency = 1; + } + /* value in usecs, needs to be in nanoseconds */ + return 1000 * max_latency; +} + +/* Take a frequency, and issue the fid/vid transition command */ +static int transition_frequency_fidvid(struct powernow_k8_data *data, + unsigned int index) +{ + u32 fid = 0; + u32 vid = 0; + int res, i; + struct cpufreq_freqs freqs; + + pr_debug("cpu %d transition to index %u\n", smp_processor_id(), index); + + /* fid/vid correctness check for k8 */ + /* fid are the lower 8 bits of the index we stored into + * the cpufreq frequency table in find_psb_table, vid + * are the upper 8 bits. + */ + fid = data->powernow_table[index].index & 0xFF; + vid = (data->powernow_table[index].index & 0xFF00) >> 8; + + pr_debug("table matched fid 0x%x, giving vid 0x%x\n", fid, vid); + + if (query_current_values_with_pending_wait(data)) + return 1; + + if ((data->currvid == vid) && (data->currfid == fid)) { + pr_debug("target matches current values (fid 0x%x, vid 0x%x)\n", + fid, vid); + return 0; + } + + pr_debug("cpu %d, changing to fid 0x%x, vid 0x%x\n", + smp_processor_id(), fid, vid); + freqs.old = find_khz_freq_from_fid(data->currfid); + freqs.new = find_khz_freq_from_fid(fid); + + for_each_cpu(i, data->available_cores) { + freqs.cpu = i; + cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); + } + + res = transition_fid_vid(data, fid, vid); + freqs.new = find_khz_freq_from_fid(data->currfid); + + for_each_cpu(i, data->available_cores) { + freqs.cpu = i; + cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); + } + return res; +} + +/* Take a frequency, and issue the hardware pstate transition command */ +static int transition_frequency_pstate(struct powernow_k8_data *data, + unsigned int index) +{ + u32 pstate = 0; + int res, i; + struct cpufreq_freqs freqs; + + pr_debug("cpu %d transition to index %u\n", smp_processor_id(), index); + + /* get MSR index for hardware pstate transition */ + pstate = index & HW_PSTATE_MASK; + if (pstate > data->max_hw_pstate) + return 0; + freqs.old = find_khz_freq_from_pstate(data->powernow_table, + data->currpstate); + freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate); + + for_each_cpu(i, data->available_cores) { + freqs.cpu = i; + cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); + } + + res = transition_pstate(data, pstate); + freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate); + + for_each_cpu(i, data->available_cores) { + freqs.cpu = i; + cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); + } + return res; +} + +/* Driver entry point to switch to the target frequency */ +static int powernowk8_target(struct cpufreq_policy *pol, + unsigned targfreq, unsigned relation) +{ + cpumask_var_t oldmask; + struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu); + u32 checkfid; + u32 checkvid; + unsigned int newstate; + int ret = -EIO; + + if (!data) + return -EINVAL; + + checkfid = data->currfid; + checkvid = data->currvid; + + /* only run on specific CPU from here on. */ + /* This is poor form: use a workqueue or smp_call_function_single */ + if (!alloc_cpumask_var(&oldmask, GFP_KERNEL)) + return -ENOMEM; + + cpumask_copy(oldmask, tsk_cpus_allowed(current)); + set_cpus_allowed_ptr(current, cpumask_of(pol->cpu)); + + if (smp_processor_id() != pol->cpu) { + printk(KERN_ERR PFX "limiting to cpu %u failed\n", pol->cpu); + goto err_out; + } + + if (pending_bit_stuck()) { + printk(KERN_ERR PFX "failing targ, change pending bit set\n"); + goto err_out; + } + + pr_debug("targ: cpu %d, %d kHz, min %d, max %d, relation %d\n", + pol->cpu, targfreq, pol->min, pol->max, relation); + + if (query_current_values_with_pending_wait(data)) + goto err_out; + + if (cpu_family != CPU_HW_PSTATE) { + pr_debug("targ: curr fid 0x%x, vid 0x%x\n", + data->currfid, data->currvid); + + if ((checkvid != data->currvid) || + (checkfid != data->currfid)) { + printk(KERN_INFO PFX + "error - out of sync, fix 0x%x 0x%x, " + "vid 0x%x 0x%x\n", + checkfid, data->currfid, + checkvid, data->currvid); + } + } + + if (cpufreq_frequency_table_target(pol, data->powernow_table, + targfreq, relation, &newstate)) + goto err_out; + + mutex_lock(&fidvid_mutex); + + powernow_k8_acpi_pst_values(data, newstate); + + if (cpu_family == CPU_HW_PSTATE) + ret = transition_frequency_pstate(data, newstate); + else + ret = transition_frequency_fidvid(data, newstate); + if (ret) { + printk(KERN_ERR PFX "transition frequency failed\n"); + ret = 1; + mutex_unlock(&fidvid_mutex); + goto err_out; + } + mutex_unlock(&fidvid_mutex); + + if (cpu_family == CPU_HW_PSTATE) + pol->cur = find_khz_freq_from_pstate(data->powernow_table, + newstate); + else + pol->cur = find_khz_freq_from_fid(data->currfid); + ret = 0; + +err_out: + set_cpus_allowed_ptr(current, oldmask); + free_cpumask_var(oldmask); + return ret; +} + +/* Driver entry point to verify the policy and range of frequencies */ +static int powernowk8_verify(struct cpufreq_policy *pol) +{ + struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu); + + if (!data) + return -EINVAL; + + return cpufreq_frequency_table_verify(pol, data->powernow_table); +} + +struct init_on_cpu { + struct powernow_k8_data *data; + int rc; +}; + +static void __cpuinit powernowk8_cpu_init_on_cpu(void *_init_on_cpu) +{ + struct init_on_cpu *init_on_cpu = _init_on_cpu; + + if (pending_bit_stuck()) { + printk(KERN_ERR PFX "failing init, change pending bit set\n"); + init_on_cpu->rc = -ENODEV; + return; + } + + if (query_current_values_with_pending_wait(init_on_cpu->data)) { + init_on_cpu->rc = -ENODEV; + return; + } + + if (cpu_family == CPU_OPTERON) + fidvid_msr_init(); + + init_on_cpu->rc = 0; +} + +/* per CPU init entry point to the driver */ +static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol) +{ + static const char ACPI_PSS_BIOS_BUG_MSG[] = + KERN_ERR FW_BUG PFX "No compatible ACPI _PSS objects found.\n" + FW_BUG PFX "Try again with latest BIOS.\n"; + struct powernow_k8_data *data; + struct init_on_cpu init_on_cpu; + int rc; + struct cpuinfo_x86 *c = &cpu_data(pol->cpu); + + if (!cpu_online(pol->cpu)) + return -ENODEV; + + smp_call_function_single(pol->cpu, check_supported_cpu, &rc, 1); + if (rc) + return -ENODEV; + + data = kzalloc(sizeof(struct powernow_k8_data), GFP_KERNEL); + if (!data) { + printk(KERN_ERR PFX "unable to alloc powernow_k8_data"); + return -ENOMEM; + } + + data->cpu = pol->cpu; + data->currpstate = HW_PSTATE_INVALID; + + if (powernow_k8_cpu_init_acpi(data)) { + /* + * Use the PSB BIOS structure. This is only available on + * an UP version, and is deprecated by AMD. + */ + if (num_online_cpus() != 1) { + printk_once(ACPI_PSS_BIOS_BUG_MSG); + goto err_out; + } + if (pol->cpu != 0) { + printk(KERN_ERR FW_BUG PFX "No ACPI _PSS objects for " + "CPU other than CPU0. Complain to your BIOS " + "vendor.\n"); + goto err_out; + } + rc = find_psb_table(data); + if (rc) + goto err_out; + + /* Take a crude guess here. + * That guess was in microseconds, so multiply with 1000 */ + pol->cpuinfo.transition_latency = ( + ((data->rvo + 8) * data->vstable * VST_UNITS_20US) + + ((1 << data->irt) * 30)) * 1000; + } else /* ACPI _PSS objects available */ + pol->cpuinfo.transition_latency = get_transition_latency(data); + + /* only run on specific CPU from here on */ + init_on_cpu.data = data; + smp_call_function_single(data->cpu, powernowk8_cpu_init_on_cpu, + &init_on_cpu, 1); + rc = init_on_cpu.rc; + if (rc != 0) + goto err_out_exit_acpi; + + if (cpu_family == CPU_HW_PSTATE) + cpumask_copy(pol->cpus, cpumask_of(pol->cpu)); + else + cpumask_copy(pol->cpus, cpu_core_mask(pol->cpu)); + data->available_cores = pol->cpus; + + if (cpu_family == CPU_HW_PSTATE) + pol->cur = find_khz_freq_from_pstate(data->powernow_table, + data->currpstate); + else + pol->cur = find_khz_freq_from_fid(data->currfid); + pr_debug("policy current frequency %d kHz\n", pol->cur); + + /* min/max the cpu is capable of */ + if (cpufreq_frequency_table_cpuinfo(pol, data->powernow_table)) { + printk(KERN_ERR FW_BUG PFX "invalid powernow_table\n"); + powernow_k8_cpu_exit_acpi(data); + kfree(data->powernow_table); + kfree(data); + return -EINVAL; + } + + /* Check for APERF/MPERF support in hardware */ + if (cpu_has(c, X86_FEATURE_APERFMPERF)) + cpufreq_amd64_driver.getavg = cpufreq_get_measured_perf; + + cpufreq_frequency_table_get_attr(data->powernow_table, pol->cpu); + + if (cpu_family == CPU_HW_PSTATE) + pr_debug("cpu_init done, current pstate 0x%x\n", + data->currpstate); + else + pr_debug("cpu_init done, current fid 0x%x, vid 0x%x\n", + data->currfid, data->currvid); + + per_cpu(powernow_data, pol->cpu) = data; + + return 0; + +err_out_exit_acpi: + powernow_k8_cpu_exit_acpi(data); + +err_out: + kfree(data); + return -ENODEV; +} + +static int __devexit powernowk8_cpu_exit(struct cpufreq_policy *pol) +{ + struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu); + + if (!data) + return -EINVAL; + + powernow_k8_cpu_exit_acpi(data); + + cpufreq_frequency_table_put_attr(pol->cpu); + + kfree(data->powernow_table); + kfree(data); + per_cpu(powernow_data, pol->cpu) = NULL; + + return 0; +} + +static void query_values_on_cpu(void *_err) +{ + int *err = _err; + struct powernow_k8_data *data = __this_cpu_read(powernow_data); + + *err = query_current_values_with_pending_wait(data); +} + +static unsigned int powernowk8_get(unsigned int cpu) +{ + struct powernow_k8_data *data = per_cpu(powernow_data, cpu); + unsigned int khz = 0; + int err; + + if (!data) + return 0; + + smp_call_function_single(cpu, query_values_on_cpu, &err, true); + if (err) + goto out; + + if (cpu_family == CPU_HW_PSTATE) + khz = find_khz_freq_from_pstate(data->powernow_table, + data->currpstate); + else + khz = find_khz_freq_from_fid(data->currfid); + + +out: + return khz; +} + +static void _cpb_toggle_msrs(bool t) +{ + int cpu; + + get_online_cpus(); + + rdmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs); + + for_each_cpu(cpu, cpu_online_mask) { + struct msr *reg = per_cpu_ptr(msrs, cpu); + if (t) + reg->l &= ~BIT(25); + else + reg->l |= BIT(25); + } + wrmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs); + + put_online_cpus(); +} + +/* + * Switch on/off core performance boosting. + * + * 0=disable + * 1=enable. + */ +static void cpb_toggle(bool t) +{ + if (!cpb_capable) + return; + + if (t && !cpb_enabled) { + cpb_enabled = true; + _cpb_toggle_msrs(t); + printk(KERN_INFO PFX "Core Boosting enabled.\n"); + } else if (!t && cpb_enabled) { + cpb_enabled = false; + _cpb_toggle_msrs(t); + printk(KERN_INFO PFX "Core Boosting disabled.\n"); + } +} + +static ssize_t store_cpb(struct cpufreq_policy *policy, const char *buf, + size_t count) +{ + int ret = -EINVAL; + unsigned long val = 0; + + ret = strict_strtoul(buf, 10, &val); + if (!ret && (val == 0 || val == 1) && cpb_capable) + cpb_toggle(val); + else + return -EINVAL; + + return count; +} + +static ssize_t show_cpb(struct cpufreq_policy *policy, char *buf) +{ + return sprintf(buf, "%u\n", cpb_enabled); +} + +#define define_one_rw(_name) \ +static struct freq_attr _name = \ +__ATTR(_name, 0644, show_##_name, store_##_name) + +define_one_rw(cpb); + +static struct freq_attr *powernow_k8_attr[] = { + &cpufreq_freq_attr_scaling_available_freqs, + &cpb, + NULL, +}; + +static struct cpufreq_driver cpufreq_amd64_driver = { + .verify = powernowk8_verify, + .target = powernowk8_target, + .bios_limit = acpi_processor_get_bios_limit, + .init = powernowk8_cpu_init, + .exit = __devexit_p(powernowk8_cpu_exit), + .get = powernowk8_get, + .name = "powernow-k8", + .owner = THIS_MODULE, + .attr = powernow_k8_attr, +}; + +/* + * Clear the boost-disable flag on the CPU_DOWN path so that this cpu + * cannot block the remaining ones from boosting. On the CPU_UP path we + * simply keep the boost-disable flag in sync with the current global + * state. + */ +static int cpb_notify(struct notifier_block *nb, unsigned long action, + void *hcpu) +{ + unsigned cpu = (long)hcpu; + u32 lo, hi; + + switch (action) { + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + + if (!cpb_enabled) { + rdmsr_on_cpu(cpu, MSR_K7_HWCR, &lo, &hi); + lo |= BIT(25); + wrmsr_on_cpu(cpu, MSR_K7_HWCR, lo, hi); + } + break; + + case CPU_DOWN_PREPARE: + case CPU_DOWN_PREPARE_FROZEN: + rdmsr_on_cpu(cpu, MSR_K7_HWCR, &lo, &hi); + lo &= ~BIT(25); + wrmsr_on_cpu(cpu, MSR_K7_HWCR, lo, hi); + break; + + default: + break; + } + + return NOTIFY_OK; +} + +static struct notifier_block cpb_nb = { + .notifier_call = cpb_notify, +}; + +/* driver entry point for init */ +static int __cpuinit powernowk8_init(void) +{ + unsigned int i, supported_cpus = 0, cpu; + int rv; + + for_each_online_cpu(i) { + int rc; + smp_call_function_single(i, check_supported_cpu, &rc, 1); + if (rc == 0) + supported_cpus++; + } + + if (supported_cpus != num_online_cpus()) + return -ENODEV; + + printk(KERN_INFO PFX "Found %d %s (%d cpu cores) (" VERSION ")\n", + num_online_nodes(), boot_cpu_data.x86_model_id, supported_cpus); + + if (boot_cpu_has(X86_FEATURE_CPB)) { + + cpb_capable = true; + + msrs = msrs_alloc(); + if (!msrs) { + printk(KERN_ERR "%s: Error allocating msrs!\n", __func__); + return -ENOMEM; + } + + register_cpu_notifier(&cpb_nb); + + rdmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs); + + for_each_cpu(cpu, cpu_online_mask) { + struct msr *reg = per_cpu_ptr(msrs, cpu); + cpb_enabled |= !(!!(reg->l & BIT(25))); + } + + printk(KERN_INFO PFX "Core Performance Boosting: %s.\n", + (cpb_enabled ? "on" : "off")); + } + + rv = cpufreq_register_driver(&cpufreq_amd64_driver); + if (rv < 0 && boot_cpu_has(X86_FEATURE_CPB)) { + unregister_cpu_notifier(&cpb_nb); + msrs_free(msrs); + msrs = NULL; + } + return rv; +} + +/* driver entry point for term */ +static void __exit powernowk8_exit(void) +{ + pr_debug("exit\n"); + + if (boot_cpu_has(X86_FEATURE_CPB)) { + msrs_free(msrs); + msrs = NULL; + + unregister_cpu_notifier(&cpb_nb); + } + + cpufreq_unregister_driver(&cpufreq_amd64_driver); +} + +MODULE_AUTHOR("Paul Devriendt and " + "Mark Langsdorf "); +MODULE_DESCRIPTION("AMD Athlon 64 and Opteron processor frequency driver."); +MODULE_LICENSE("GPL"); + +late_initcall(powernowk8_init); +module_exit(powernowk8_exit); diff --git a/drivers/cpufreq/powernow-k8.h b/drivers/cpufreq/powernow-k8.h new file mode 100644 index 0000000..3744d26 --- /dev/null +++ b/drivers/cpufreq/powernow-k8.h @@ -0,0 +1,222 @@ +/* + * (c) 2003-2006 Advanced Micro Devices, Inc. + * Your use of this code is subject to the terms and conditions of the + * GNU general public license version 2. See "COPYING" or + * http://www.gnu.org/licenses/gpl.html + */ + +enum pstate { + HW_PSTATE_INVALID = 0xff, + HW_PSTATE_0 = 0, + HW_PSTATE_1 = 1, + HW_PSTATE_2 = 2, + HW_PSTATE_3 = 3, + HW_PSTATE_4 = 4, + HW_PSTATE_5 = 5, + HW_PSTATE_6 = 6, + HW_PSTATE_7 = 7, +}; + +struct powernow_k8_data { + unsigned int cpu; + + u32 numps; /* number of p-states */ + u32 batps; /* number of p-states supported on battery */ + u32 max_hw_pstate; /* maximum legal hardware pstate */ + + /* these values are constant when the PSB is used to determine + * vid/fid pairings, but are modified during the ->target() call + * when ACPI is used */ + u32 rvo; /* ramp voltage offset */ + u32 irt; /* isochronous relief time */ + u32 vidmvs; /* usable value calculated from mvs */ + u32 vstable; /* voltage stabilization time, units 20 us */ + u32 plllock; /* pll lock time, units 1 us */ + u32 exttype; /* extended interface = 1 */ + + /* keep track of the current fid / vid or pstate */ + u32 currvid; + u32 currfid; + enum pstate currpstate; + + /* the powernow_table includes all frequency and vid/fid pairings: + * fid are the lower 8 bits of the index, vid are the upper 8 bits. + * frequency is in kHz */ + struct cpufreq_frequency_table *powernow_table; + + /* the acpi table needs to be kept. it's only available if ACPI was + * used to determine valid frequency/vid/fid states */ + struct acpi_processor_performance acpi_data; + + /* we need to keep track of associated cores, but let cpufreq + * handle hotplug events - so just point at cpufreq pol->cpus + * structure */ + struct cpumask *available_cores; +}; + +/* processor's cpuid instruction support */ +#define CPUID_PROCESSOR_SIGNATURE 1 /* function 1 */ +#define CPUID_XFAM 0x0ff00000 /* extended family */ +#define CPUID_XFAM_K8 0 +#define CPUID_XMOD 0x000f0000 /* extended model */ +#define CPUID_XMOD_REV_MASK 0x000c0000 +#define CPUID_XFAM_10H 0x00100000 /* family 0x10 */ +#define CPUID_USE_XFAM_XMOD 0x00000f00 +#define CPUID_GET_MAX_CAPABILITIES 0x80000000 +#define CPUID_FREQ_VOLT_CAPABILITIES 0x80000007 +#define P_STATE_TRANSITION_CAPABLE 6 + +/* Model Specific Registers for p-state transitions. MSRs are 64-bit. For */ +/* writes (wrmsr - opcode 0f 30), the register number is placed in ecx, and */ +/* the value to write is placed in edx:eax. For reads (rdmsr - opcode 0f 32), */ +/* the register number is placed in ecx, and the data is returned in edx:eax. */ + +#define MSR_FIDVID_CTL 0xc0010041 +#define MSR_FIDVID_STATUS 0xc0010042 + +/* Field definitions within the FID VID Low Control MSR : */ +#define MSR_C_LO_INIT_FID_VID 0x00010000 +#define MSR_C_LO_NEW_VID 0x00003f00 +#define MSR_C_LO_NEW_FID 0x0000003f +#define MSR_C_LO_VID_SHIFT 8 + +/* Field definitions within the FID VID High Control MSR : */ +#define MSR_C_HI_STP_GNT_TO 0x000fffff + +/* Field definitions within the FID VID Low Status MSR : */ +#define MSR_S_LO_CHANGE_PENDING 0x80000000 /* cleared when completed */ +#define MSR_S_LO_MAX_RAMP_VID 0x3f000000 +#define MSR_S_LO_MAX_FID 0x003f0000 +#define MSR_S_LO_START_FID 0x00003f00 +#define MSR_S_LO_CURRENT_FID 0x0000003f + +/* Field definitions within the FID VID High Status MSR : */ +#define MSR_S_HI_MIN_WORKING_VID 0x3f000000 +#define MSR_S_HI_MAX_WORKING_VID 0x003f0000 +#define MSR_S_HI_START_VID 0x00003f00 +#define MSR_S_HI_CURRENT_VID 0x0000003f +#define MSR_C_HI_STP_GNT_BENIGN 0x00000001 + + +/* Hardware Pstate _PSS and MSR definitions */ +#define USE_HW_PSTATE 0x00000080 +#define HW_PSTATE_MASK 0x00000007 +#define HW_PSTATE_VALID_MASK 0x80000000 +#define HW_PSTATE_MAX_MASK 0x000000f0 +#define HW_PSTATE_MAX_SHIFT 4 +#define MSR_PSTATE_DEF_BASE 0xc0010064 /* base of Pstate MSRs */ +#define MSR_PSTATE_STATUS 0xc0010063 /* Pstate Status MSR */ +#define MSR_PSTATE_CTRL 0xc0010062 /* Pstate control MSR */ +#define MSR_PSTATE_CUR_LIMIT 0xc0010061 /* pstate current limit MSR */ + +/* define the two driver architectures */ +#define CPU_OPTERON 0 +#define CPU_HW_PSTATE 1 + + +/* + * There are restrictions frequencies have to follow: + * - only 1 entry in the low fid table ( <=1.4GHz ) + * - lowest entry in the high fid table must be >= 2 * the entry in the + * low fid table + * - lowest entry in the high fid table must be a <= 200MHz + 2 * the entry + * in the low fid table + * - the parts can only step at <= 200 MHz intervals, odd fid values are + * supported in revision G and later revisions. + * - lowest frequency must be >= interprocessor hypertransport link speed + * (only applies to MP systems obviously) + */ + +/* fids (frequency identifiers) are arranged in 2 tables - lo and hi */ +#define LO_FID_TABLE_TOP 7 /* fid values marking the boundary */ +#define HI_FID_TABLE_BOTTOM 8 /* between the low and high tables */ + +#define LO_VCOFREQ_TABLE_TOP 1400 /* corresponding vco frequency values */ +#define HI_VCOFREQ_TABLE_BOTTOM 1600 + +#define MIN_FREQ_RESOLUTION 200 /* fids jump by 2 matching freq jumps by 200 */ + +#define MAX_FID 0x2a /* Spec only gives FID values as far as 5 GHz */ +#define LEAST_VID 0x3e /* Lowest (numerically highest) useful vid value */ + +#define MIN_FREQ 800 /* Min and max freqs, per spec */ +#define MAX_FREQ 5000 + +#define INVALID_FID_MASK 0xffffffc0 /* not a valid fid if these bits are set */ +#define INVALID_VID_MASK 0xffffffc0 /* not a valid vid if these bits are set */ + +#define VID_OFF 0x3f + +#define STOP_GRANT_5NS 1 /* min poss memory access latency for voltage change */ + +#define PLL_LOCK_CONVERSION (1000/5) /* ms to ns, then divide by clock period */ + +#define MAXIMUM_VID_STEPS 1 /* Current cpus only allow a single step of 25mV */ +#define VST_UNITS_20US 20 /* Voltage Stabilization Time is in units of 20us */ + +/* + * Most values of interest are encoded in a single field of the _PSS + * entries: the "control" value. + */ + +#define IRT_SHIFT 30 +#define RVO_SHIFT 28 +#define EXT_TYPE_SHIFT 27 +#define PLL_L_SHIFT 20 +#define MVS_SHIFT 18 +#define VST_SHIFT 11 +#define VID_SHIFT 6 +#define IRT_MASK 3 +#define RVO_MASK 3 +#define EXT_TYPE_MASK 1 +#define PLL_L_MASK 0x7f +#define MVS_MASK 3 +#define VST_MASK 0x7f +#define VID_MASK 0x1f +#define FID_MASK 0x1f +#define EXT_VID_MASK 0x3f +#define EXT_FID_MASK 0x3f + + +/* + * Version 1.4 of the PSB table. This table is constructed by BIOS and is + * to tell the OS's power management driver which VIDs and FIDs are + * supported by this particular processor. + * If the data in the PSB / PST is wrong, then this driver will program the + * wrong values into hardware, which is very likely to lead to a crash. + */ + +#define PSB_ID_STRING "AMDK7PNOW!" +#define PSB_ID_STRING_LEN 10 + +#define PSB_VERSION_1_4 0x14 + +struct psb_s { + u8 signature[10]; + u8 tableversion; + u8 flags1; + u16 vstable; + u8 flags2; + u8 num_tables; + u32 cpuid; + u8 plllocktime; + u8 maxfid; + u8 maxvid; + u8 numps; +}; + +/* Pairs of fid/vid values are appended to the version 1.4 PSB table. */ +struct pst_s { + u8 fid; + u8 vid; +}; + +static int core_voltage_pre_transition(struct powernow_k8_data *data, + u32 reqvid, u32 regfid); +static int core_voltage_post_transition(struct powernow_k8_data *data, u32 reqvid); +static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid); + +static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, unsigned int index); + +static int fill_powernow_table_pstate(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table); +static int fill_powernow_table_fidvid(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table); diff --git a/drivers/cpufreq/sc520_freq.c b/drivers/cpufreq/sc520_freq.c new file mode 100644 index 0000000..1e205e6 --- /dev/null +++ b/drivers/cpufreq/sc520_freq.c @@ -0,0 +1,192 @@ +/* + * sc520_freq.c: cpufreq driver for the AMD Elan sc520 + * + * Copyright (C) 2005 Sean Young + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Based on elanfreq.c + * + * 2005-03-30: - initial revision + */ + +#include +#include +#include + +#include +#include +#include +#include + +#include + +#define MMCR_BASE 0xfffef000 /* The default base address */ +#define OFFS_CPUCTL 0x2 /* CPU Control Register */ + +static __u8 __iomem *cpuctl; + +#define PFX "sc520_freq: " + +static struct cpufreq_frequency_table sc520_freq_table[] = { + {0x01, 100000}, + {0x02, 133000}, + {0, CPUFREQ_TABLE_END}, +}; + +static unsigned int sc520_freq_get_cpu_frequency(unsigned int cpu) +{ + u8 clockspeed_reg = *cpuctl; + + switch (clockspeed_reg & 0x03) { + default: + printk(KERN_ERR PFX "error: cpuctl register has unexpected " + "value %02x\n", clockspeed_reg); + case 0x01: + return 100000; + case 0x02: + return 133000; + } +} + +static void sc520_freq_set_cpu_state(unsigned int state) +{ + + struct cpufreq_freqs freqs; + u8 clockspeed_reg; + + freqs.old = sc520_freq_get_cpu_frequency(0); + freqs.new = sc520_freq_table[state].frequency; + freqs.cpu = 0; /* AMD Elan is UP */ + + cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); + + pr_debug("attempting to set frequency to %i kHz\n", + sc520_freq_table[state].frequency); + + local_irq_disable(); + + clockspeed_reg = *cpuctl & ~0x03; + *cpuctl = clockspeed_reg | sc520_freq_table[state].index; + + local_irq_enable(); + + cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); +}; + +static int sc520_freq_verify(struct cpufreq_policy *policy) +{ + return cpufreq_frequency_table_verify(policy, &sc520_freq_table[0]); +} + +static int sc520_freq_target(struct cpufreq_policy *policy, + unsigned int target_freq, + unsigned int relation) +{ + unsigned int newstate = 0; + + if (cpufreq_frequency_table_target(policy, sc520_freq_table, + target_freq, relation, &newstate)) + return -EINVAL; + + sc520_freq_set_cpu_state(newstate); + + return 0; +} + + +/* + * Module init and exit code + */ + +static int sc520_freq_cpu_init(struct cpufreq_policy *policy) +{ + struct cpuinfo_x86 *c = &cpu_data(0); + int result; + + /* capability check */ + if (c->x86_vendor != X86_VENDOR_AMD || + c->x86 != 4 || c->x86_model != 9) + return -ENODEV; + + /* cpuinfo and default policy values */ + policy->cpuinfo.transition_latency = 1000000; /* 1ms */ + policy->cur = sc520_freq_get_cpu_frequency(0); + + result = cpufreq_frequency_table_cpuinfo(policy, sc520_freq_table); + if (result) + return result; + + cpufreq_frequency_table_get_attr(sc520_freq_table, policy->cpu); + + return 0; +} + + +static int sc520_freq_cpu_exit(struct cpufreq_policy *policy) +{ + cpufreq_frequency_table_put_attr(policy->cpu); + return 0; +} + + +static struct freq_attr *sc520_freq_attr[] = { + &cpufreq_freq_attr_scaling_available_freqs, + NULL, +}; + + +static struct cpufreq_driver sc520_freq_driver = { + .get = sc520_freq_get_cpu_frequency, + .verify = sc520_freq_verify, + .target = sc520_freq_target, + .init = sc520_freq_cpu_init, + .exit = sc520_freq_cpu_exit, + .name = "sc520_freq", + .owner = THIS_MODULE, + .attr = sc520_freq_attr, +}; + + +static int __init sc520_freq_init(void) +{ + struct cpuinfo_x86 *c = &cpu_data(0); + int err; + + /* Test if we have the right hardware */ + if (c->x86_vendor != X86_VENDOR_AMD || + c->x86 != 4 || c->x86_model != 9) { + pr_debug("no Elan SC520 processor found!\n"); + return -ENODEV; + } + cpuctl = ioremap((unsigned long)(MMCR_BASE + OFFS_CPUCTL), 1); + if (!cpuctl) { + printk(KERN_ERR "sc520_freq: error: failed to remap memory\n"); + return -ENOMEM; + } + + err = cpufreq_register_driver(&sc520_freq_driver); + if (err) + iounmap(cpuctl); + + return err; +} + + +static void __exit sc520_freq_exit(void) +{ + cpufreq_unregister_driver(&sc520_freq_driver); + iounmap(cpuctl); +} + + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Sean Young "); +MODULE_DESCRIPTION("cpufreq driver for AMD's Elan sc520 CPU"); + +module_init(sc520_freq_init); +module_exit(sc520_freq_exit); + diff --git a/drivers/cpufreq/speedstep-centrino.c b/drivers/cpufreq/speedstep-centrino.c new file mode 100644 index 0000000..6ea3455 --- /dev/null +++ b/drivers/cpufreq/speedstep-centrino.c @@ -0,0 +1,633 @@ +/* + * cpufreq driver for Enhanced SpeedStep, as found in Intel's Pentium + * M (part of the Centrino chipset). + * + * Since the original Pentium M, most new Intel CPUs support Enhanced + * SpeedStep. + * + * Despite the "SpeedStep" in the name, this is almost entirely unlike + * traditional SpeedStep. + * + * Modelled on speedstep.c + * + * Copyright (C) 2003 Jeremy Fitzhardinge + */ + +#include +#include +#include +#include +#include /* current */ +#include +#include +#include + +#include +#include +#include + +#define PFX "speedstep-centrino: " +#define MAINTAINER "cpufreq@vger.kernel.org" + +#define INTEL_MSR_RANGE (0xffff) + +struct cpu_id +{ + __u8 x86; /* CPU family */ + __u8 x86_model; /* model */ + __u8 x86_mask; /* stepping */ +}; + +enum { + CPU_BANIAS, + CPU_DOTHAN_A1, + CPU_DOTHAN_A2, + CPU_DOTHAN_B0, + CPU_MP4HT_D0, + CPU_MP4HT_E0, +}; + +static const struct cpu_id cpu_ids[] = { + [CPU_BANIAS] = { 6, 9, 5 }, + [CPU_DOTHAN_A1] = { 6, 13, 1 }, + [CPU_DOTHAN_A2] = { 6, 13, 2 }, + [CPU_DOTHAN_B0] = { 6, 13, 6 }, + [CPU_MP4HT_D0] = {15, 3, 4 }, + [CPU_MP4HT_E0] = {15, 4, 1 }, +}; +#define N_IDS ARRAY_SIZE(cpu_ids) + +struct cpu_model +{ + const struct cpu_id *cpu_id; + const char *model_name; + unsigned max_freq; /* max clock in kHz */ + + struct cpufreq_frequency_table *op_points; /* clock/voltage pairs */ +}; +static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c, + const struct cpu_id *x); + +/* Operating points for current CPU */ +static DEFINE_PER_CPU(struct cpu_model *, centrino_model); +static DEFINE_PER_CPU(const struct cpu_id *, centrino_cpu); + +static struct cpufreq_driver centrino_driver; + +#ifdef CONFIG_X86_SPEEDSTEP_CENTRINO_TABLE + +/* Computes the correct form for IA32_PERF_CTL MSR for a particular + frequency/voltage operating point; frequency in MHz, volts in mV. + This is stored as "index" in the structure. */ +#define OP(mhz, mv) \ + { \ + .frequency = (mhz) * 1000, \ + .index = (((mhz)/100) << 8) | ((mv - 700) / 16) \ + } + +/* + * These voltage tables were derived from the Intel Pentium M + * datasheet, document 25261202.pdf, Table 5. I have verified they + * are consistent with my IBM ThinkPad X31, which has a 1.3GHz Pentium + * M. + */ + +/* Ultra Low Voltage Intel Pentium M processor 900MHz (Banias) */ +static struct cpufreq_frequency_table banias_900[] = +{ + OP(600, 844), + OP(800, 988), + OP(900, 1004), + { .frequency = CPUFREQ_TABLE_END } +}; + +/* Ultra Low Voltage Intel Pentium M processor 1000MHz (Banias) */ +static struct cpufreq_frequency_table banias_1000[] = +{ + OP(600, 844), + OP(800, 972), + OP(900, 988), + OP(1000, 1004), + { .frequency = CPUFREQ_TABLE_END } +}; + +/* Low Voltage Intel Pentium M processor 1.10GHz (Banias) */ +static struct cpufreq_frequency_table banias_1100[] = +{ + OP( 600, 956), + OP( 800, 1020), + OP( 900, 1100), + OP(1000, 1164), + OP(1100, 1180), + { .frequency = CPUFREQ_TABLE_END } +}; + + +/* Low Voltage Intel Pentium M processor 1.20GHz (Banias) */ +static struct cpufreq_frequency_table banias_1200[] = +{ + OP( 600, 956), + OP( 800, 1004), + OP( 900, 1020), + OP(1000, 1100), + OP(1100, 1164), + OP(1200, 1180), + { .frequency = CPUFREQ_TABLE_END } +}; + +/* Intel Pentium M processor 1.30GHz (Banias) */ +static struct cpufreq_frequency_table banias_1300[] = +{ + OP( 600, 956), + OP( 800, 1260), + OP(1000, 1292), + OP(1200, 1356), + OP(1300, 1388), + { .frequency = CPUFREQ_TABLE_END } +}; + +/* Intel Pentium M processor 1.40GHz (Banias) */ +static struct cpufreq_frequency_table banias_1400[] = +{ + OP( 600, 956), + OP( 800, 1180), + OP(1000, 1308), + OP(1200, 1436), + OP(1400, 1484), + { .frequency = CPUFREQ_TABLE_END } +}; + +/* Intel Pentium M processor 1.50GHz (Banias) */ +static struct cpufreq_frequency_table banias_1500[] = +{ + OP( 600, 956), + OP( 800, 1116), + OP(1000, 1228), + OP(1200, 1356), + OP(1400, 1452), + OP(1500, 1484), + { .frequency = CPUFREQ_TABLE_END } +}; + +/* Intel Pentium M processor 1.60GHz (Banias) */ +static struct cpufreq_frequency_table banias_1600[] = +{ + OP( 600, 956), + OP( 800, 1036), + OP(1000, 1164), + OP(1200, 1276), + OP(1400, 1420), + OP(1600, 1484), + { .frequency = CPUFREQ_TABLE_END } +}; + +/* Intel Pentium M processor 1.70GHz (Banias) */ +static struct cpufreq_frequency_table banias_1700[] = +{ + OP( 600, 956), + OP( 800, 1004), + OP(1000, 1116), + OP(1200, 1228), + OP(1400, 1308), + OP(1700, 1484), + { .frequency = CPUFREQ_TABLE_END } +}; +#undef OP + +#define _BANIAS(cpuid, max, name) \ +{ .cpu_id = cpuid, \ + .model_name = "Intel(R) Pentium(R) M processor " name "MHz", \ + .max_freq = (max)*1000, \ + .op_points = banias_##max, \ +} +#define BANIAS(max) _BANIAS(&cpu_ids[CPU_BANIAS], max, #max) + +/* CPU models, their operating frequency range, and freq/voltage + operating points */ +static struct cpu_model models[] = +{ + _BANIAS(&cpu_ids[CPU_BANIAS], 900, " 900"), + BANIAS(1000), + BANIAS(1100), + BANIAS(1200), + BANIAS(1300), + BANIAS(1400), + BANIAS(1500), + BANIAS(1600), + BANIAS(1700), + + /* NULL model_name is a wildcard */ + { &cpu_ids[CPU_DOTHAN_A1], NULL, 0, NULL }, + { &cpu_ids[CPU_DOTHAN_A2], NULL, 0, NULL }, + { &cpu_ids[CPU_DOTHAN_B0], NULL, 0, NULL }, + { &cpu_ids[CPU_MP4HT_D0], NULL, 0, NULL }, + { &cpu_ids[CPU_MP4HT_E0], NULL, 0, NULL }, + + { NULL, } +}; +#undef _BANIAS +#undef BANIAS + +static int centrino_cpu_init_table(struct cpufreq_policy *policy) +{ + struct cpuinfo_x86 *cpu = &cpu_data(policy->cpu); + struct cpu_model *model; + + for(model = models; model->cpu_id != NULL; model++) + if (centrino_verify_cpu_id(cpu, model->cpu_id) && + (model->model_name == NULL || + strcmp(cpu->x86_model_id, model->model_name) == 0)) + break; + + if (model->cpu_id == NULL) { + /* No match at all */ + pr_debug("no support for CPU model \"%s\": " + "send /proc/cpuinfo to " MAINTAINER "\n", + cpu->x86_model_id); + return -ENOENT; + } + + if (model->op_points == NULL) { + /* Matched a non-match */ + pr_debug("no table support for CPU model \"%s\"\n", + cpu->x86_model_id); + pr_debug("try using the acpi-cpufreq driver\n"); + return -ENOENT; + } + + per_cpu(centrino_model, policy->cpu) = model; + + pr_debug("found \"%s\": max frequency: %dkHz\n", + model->model_name, model->max_freq); + + return 0; +} + +#else +static inline int centrino_cpu_init_table(struct cpufreq_policy *policy) +{ + return -ENODEV; +} +#endif /* CONFIG_X86_SPEEDSTEP_CENTRINO_TABLE */ + +static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c, + const struct cpu_id *x) +{ + if ((c->x86 == x->x86) && + (c->x86_model == x->x86_model) && + (c->x86_mask == x->x86_mask)) + return 1; + return 0; +} + +/* To be called only after centrino_model is initialized */ +static unsigned extract_clock(unsigned msr, unsigned int cpu, int failsafe) +{ + int i; + + /* + * Extract clock in kHz from PERF_CTL value + * for centrino, as some DSDTs are buggy. + * Ideally, this can be done using the acpi_data structure. + */ + if ((per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_BANIAS]) || + (per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_DOTHAN_A1]) || + (per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_DOTHAN_B0])) { + msr = (msr >> 8) & 0xff; + return msr * 100000; + } + + if ((!per_cpu(centrino_model, cpu)) || + (!per_cpu(centrino_model, cpu)->op_points)) + return 0; + + msr &= 0xffff; + for (i = 0; + per_cpu(centrino_model, cpu)->op_points[i].frequency + != CPUFREQ_TABLE_END; + i++) { + if (msr == per_cpu(centrino_model, cpu)->op_points[i].index) + return per_cpu(centrino_model, cpu)-> + op_points[i].frequency; + } + if (failsafe) + return per_cpu(centrino_model, cpu)->op_points[i-1].frequency; + else + return 0; +} + +/* Return the current CPU frequency in kHz */ +static unsigned int get_cur_freq(unsigned int cpu) +{ + unsigned l, h; + unsigned clock_freq; + + rdmsr_on_cpu(cpu, MSR_IA32_PERF_STATUS, &l, &h); + clock_freq = extract_clock(l, cpu, 0); + + if (unlikely(clock_freq == 0)) { + /* + * On some CPUs, we can see transient MSR values (which are + * not present in _PSS), while CPU is doing some automatic + * P-state transition (like TM2). Get the last freq set + * in PERF_CTL. + */ + rdmsr_on_cpu(cpu, MSR_IA32_PERF_CTL, &l, &h); + clock_freq = extract_clock(l, cpu, 1); + } + return clock_freq; +} + + +static int centrino_cpu_init(struct cpufreq_policy *policy) +{ + struct cpuinfo_x86 *cpu = &cpu_data(policy->cpu); + unsigned freq; + unsigned l, h; + int ret; + int i; + + /* Only Intel makes Enhanced Speedstep-capable CPUs */ + if (cpu->x86_vendor != X86_VENDOR_INTEL || + !cpu_has(cpu, X86_FEATURE_EST)) + return -ENODEV; + + if (cpu_has(cpu, X86_FEATURE_CONSTANT_TSC)) + centrino_driver.flags |= CPUFREQ_CONST_LOOPS; + + if (policy->cpu != 0) + return -ENODEV; + + for (i = 0; i < N_IDS; i++) + if (centrino_verify_cpu_id(cpu, &cpu_ids[i])) + break; + + if (i != N_IDS) + per_cpu(centrino_cpu, policy->cpu) = &cpu_ids[i]; + + if (!per_cpu(centrino_cpu, policy->cpu)) { + pr_debug("found unsupported CPU with " + "Enhanced SpeedStep: send /proc/cpuinfo to " + MAINTAINER "\n"); + return -ENODEV; + } + + if (centrino_cpu_init_table(policy)) { + return -ENODEV; + } + + /* Check to see if Enhanced SpeedStep is enabled, and try to + enable it if not. */ + rdmsr(MSR_IA32_MISC_ENABLE, l, h); + + if (!(l & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) { + l |= MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP; + pr_debug("trying to enable Enhanced SpeedStep (%x)\n", l); + wrmsr(MSR_IA32_MISC_ENABLE, l, h); + + /* check to see if it stuck */ + rdmsr(MSR_IA32_MISC_ENABLE, l, h); + if (!(l & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) { + printk(KERN_INFO PFX + "couldn't enable Enhanced SpeedStep\n"); + return -ENODEV; + } + } + + freq = get_cur_freq(policy->cpu); + policy->cpuinfo.transition_latency = 10000; + /* 10uS transition latency */ + policy->cur = freq; + + pr_debug("centrino_cpu_init: cur=%dkHz\n", policy->cur); + + ret = cpufreq_frequency_table_cpuinfo(policy, + per_cpu(centrino_model, policy->cpu)->op_points); + if (ret) + return (ret); + + cpufreq_frequency_table_get_attr( + per_cpu(centrino_model, policy->cpu)->op_points, policy->cpu); + + return 0; +} + +static int centrino_cpu_exit(struct cpufreq_policy *policy) +{ + unsigned int cpu = policy->cpu; + + if (!per_cpu(centrino_model, cpu)) + return -ENODEV; + + cpufreq_frequency_table_put_attr(cpu); + + per_cpu(centrino_model, cpu) = NULL; + + return 0; +} + +/** + * centrino_verify - verifies a new CPUFreq policy + * @policy: new policy + * + * Limit must be within this model's frequency range at least one + * border included. + */ +static int centrino_verify (struct cpufreq_policy *policy) +{ + return cpufreq_frequency_table_verify(policy, + per_cpu(centrino_model, policy->cpu)->op_points); +} + +/** + * centrino_setpolicy - set a new CPUFreq policy + * @policy: new policy + * @target_freq: the target frequency + * @relation: how that frequency relates to achieved frequency + * (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H) + * + * Sets a new CPUFreq policy. + */ +static int centrino_target (struct cpufreq_policy *policy, + unsigned int target_freq, + unsigned int relation) +{ + unsigned int newstate = 0; + unsigned int msr, oldmsr = 0, h = 0, cpu = policy->cpu; + struct cpufreq_freqs freqs; + int retval = 0; + unsigned int j, k, first_cpu, tmp; + cpumask_var_t covered_cpus; + + if (unlikely(!zalloc_cpumask_var(&covered_cpus, GFP_KERNEL))) + return -ENOMEM; + + if (unlikely(per_cpu(centrino_model, cpu) == NULL)) { + retval = -ENODEV; + goto out; + } + + if (unlikely(cpufreq_frequency_table_target(policy, + per_cpu(centrino_model, cpu)->op_points, + target_freq, + relation, + &newstate))) { + retval = -EINVAL; + goto out; + } + + first_cpu = 1; + for_each_cpu(j, policy->cpus) { + int good_cpu; + + /* cpufreq holds the hotplug lock, so we are safe here */ + if (!cpu_online(j)) + continue; + + /* + * Support for SMP systems. + * Make sure we are running on CPU that wants to change freq + */ + if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) + good_cpu = cpumask_any_and(policy->cpus, + cpu_online_mask); + else + good_cpu = j; + + if (good_cpu >= nr_cpu_ids) { + pr_debug("couldn't limit to CPUs in this domain\n"); + retval = -EAGAIN; + if (first_cpu) { + /* We haven't started the transition yet. */ + goto out; + } + break; + } + + msr = per_cpu(centrino_model, cpu)->op_points[newstate].index; + + if (first_cpu) { + rdmsr_on_cpu(good_cpu, MSR_IA32_PERF_CTL, &oldmsr, &h); + if (msr == (oldmsr & 0xffff)) { + pr_debug("no change needed - msr was and needs " + "to be %x\n", oldmsr); + retval = 0; + goto out; + } + + freqs.old = extract_clock(oldmsr, cpu, 0); + freqs.new = extract_clock(msr, cpu, 0); + + pr_debug("target=%dkHz old=%d new=%d msr=%04x\n", + target_freq, freqs.old, freqs.new, msr); + + for_each_cpu(k, policy->cpus) { + if (!cpu_online(k)) + continue; + freqs.cpu = k; + cpufreq_notify_transition(&freqs, + CPUFREQ_PRECHANGE); + } + + first_cpu = 0; + /* all but 16 LSB are reserved, treat them with care */ + oldmsr &= ~0xffff; + msr &= 0xffff; + oldmsr |= msr; + } + + wrmsr_on_cpu(good_cpu, MSR_IA32_PERF_CTL, oldmsr, h); + if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) + break; + + cpumask_set_cpu(j, covered_cpus); + } + + for_each_cpu(k, policy->cpus) { + if (!cpu_online(k)) + continue; + freqs.cpu = k; + cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); + } + + if (unlikely(retval)) { + /* + * We have failed halfway through the frequency change. + * We have sent callbacks to policy->cpus and + * MSRs have already been written on coverd_cpus. + * Best effort undo.. + */ + + for_each_cpu(j, covered_cpus) + wrmsr_on_cpu(j, MSR_IA32_PERF_CTL, oldmsr, h); + + tmp = freqs.new; + freqs.new = freqs.old; + freqs.old = tmp; + for_each_cpu(j, policy->cpus) { + if (!cpu_online(j)) + continue; + cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); + cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); + } + } + retval = 0; + +out: + free_cpumask_var(covered_cpus); + return retval; +} + +static struct freq_attr* centrino_attr[] = { + &cpufreq_freq_attr_scaling_available_freqs, + NULL, +}; + +static struct cpufreq_driver centrino_driver = { + .name = "centrino", /* should be speedstep-centrino, + but there's a 16 char limit */ + .init = centrino_cpu_init, + .exit = centrino_cpu_exit, + .verify = centrino_verify, + .target = centrino_target, + .get = get_cur_freq, + .attr = centrino_attr, + .owner = THIS_MODULE, +}; + + +/** + * centrino_init - initializes the Enhanced SpeedStep CPUFreq driver + * + * Initializes the Enhanced SpeedStep support. Returns -ENODEV on + * unsupported devices, -ENOENT if there's no voltage table for this + * particular CPU model, -EINVAL on problems during initiatization, + * and zero on success. + * + * This is quite picky. Not only does the CPU have to advertise the + * "est" flag in the cpuid capability flags, we look for a specific + * CPU model and stepping, and we need to have the exact model name in + * our voltage tables. That is, be paranoid about not releasing + * someone's valuable magic smoke. + */ +static int __init centrino_init(void) +{ + struct cpuinfo_x86 *cpu = &cpu_data(0); + + if (!cpu_has(cpu, X86_FEATURE_EST)) + return -ENODEV; + + return cpufreq_register_driver(¢rino_driver); +} + +static void __exit centrino_exit(void) +{ + cpufreq_unregister_driver(¢rino_driver); +} + +MODULE_AUTHOR ("Jeremy Fitzhardinge "); +MODULE_DESCRIPTION ("Enhanced SpeedStep driver for Intel Pentium M processors."); +MODULE_LICENSE ("GPL"); + +late_initcall(centrino_init); +module_exit(centrino_exit); diff --git a/drivers/cpufreq/speedstep-ich.c b/drivers/cpufreq/speedstep-ich.c new file mode 100644 index 0000000..a748ce7 --- /dev/null +++ b/drivers/cpufreq/speedstep-ich.c @@ -0,0 +1,448 @@ +/* + * (C) 2001 Dave Jones, Arjan van de ven. + * (C) 2002 - 2003 Dominik Brodowski + * + * Licensed under the terms of the GNU GPL License version 2. + * Based upon reverse engineered information, and on Intel documentation + * for chipsets ICH2-M and ICH3-M. + * + * Many thanks to Ducrot Bruno for finding and fixing the last + * "missing link" for ICH2-M/ICH3-M support, and to Thomas Winkler + * for extensive testing. + * + * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous* + */ + + +/********************************************************************* + * SPEEDSTEP - DEFINITIONS * + *********************************************************************/ + +#include +#include +#include +#include +#include +#include + +#include "speedstep-lib.h" + + +/* speedstep_chipset: + * It is necessary to know which chipset is used. As accesses to + * this device occur at various places in this module, we need a + * static struct pci_dev * pointing to that device. + */ +static struct pci_dev *speedstep_chipset_dev; + + +/* speedstep_processor + */ +static enum speedstep_processor speedstep_processor; + +static u32 pmbase; + +/* + * There are only two frequency states for each processor. Values + * are in kHz for the time being. + */ +static struct cpufreq_frequency_table speedstep_freqs[] = { + {SPEEDSTEP_HIGH, 0}, + {SPEEDSTEP_LOW, 0}, + {0, CPUFREQ_TABLE_END}, +}; + + +/** + * speedstep_find_register - read the PMBASE address + * + * Returns: -ENODEV if no register could be found + */ +static int speedstep_find_register(void) +{ + if (!speedstep_chipset_dev) + return -ENODEV; + + /* get PMBASE */ + pci_read_config_dword(speedstep_chipset_dev, 0x40, &pmbase); + if (!(pmbase & 0x01)) { + printk(KERN_ERR "speedstep-ich: could not find speedstep register\n"); + return -ENODEV; + } + + pmbase &= 0xFFFFFFFE; + if (!pmbase) { + printk(KERN_ERR "speedstep-ich: could not find speedstep register\n"); + return -ENODEV; + } + + pr_debug("pmbase is 0x%x\n", pmbase); + return 0; +} + +/** + * speedstep_set_state - set the SpeedStep state + * @state: new processor frequency state (SPEEDSTEP_LOW or SPEEDSTEP_HIGH) + * + * Tries to change the SpeedStep state. Can be called from + * smp_call_function_single. + */ +static void speedstep_set_state(unsigned int state) +{ + u8 pm2_blk; + u8 value; + unsigned long flags; + + if (state > 0x1) + return; + + /* Disable IRQs */ + local_irq_save(flags); + + /* read state */ + value = inb(pmbase + 0x50); + + pr_debug("read at pmbase 0x%x + 0x50 returned 0x%x\n", pmbase, value); + + /* write new state */ + value &= 0xFE; + value |= state; + + pr_debug("writing 0x%x to pmbase 0x%x + 0x50\n", value, pmbase); + + /* Disable bus master arbitration */ + pm2_blk = inb(pmbase + 0x20); + pm2_blk |= 0x01; + outb(pm2_blk, (pmbase + 0x20)); + + /* Actual transition */ + outb(value, (pmbase + 0x50)); + + /* Restore bus master arbitration */ + pm2_blk &= 0xfe; + outb(pm2_blk, (pmbase + 0x20)); + + /* check if transition was successful */ + value = inb(pmbase + 0x50); + + /* Enable IRQs */ + local_irq_restore(flags); + + pr_debug("read at pmbase 0x%x + 0x50 returned 0x%x\n", pmbase, value); + + if (state == (value & 0x1)) + pr_debug("change to %u MHz succeeded\n", + speedstep_get_frequency(speedstep_processor) / 1000); + else + printk(KERN_ERR "cpufreq: change failed - I/O error\n"); + + return; +} + +/* Wrapper for smp_call_function_single. */ +static void _speedstep_set_state(void *_state) +{ + speedstep_set_state(*(unsigned int *)_state); +} + +/** + * speedstep_activate - activate SpeedStep control in the chipset + * + * Tries to activate the SpeedStep status and control registers. + * Returns -EINVAL on an unsupported chipset, and zero on success. + */ +static int speedstep_activate(void) +{ + u16 value = 0; + + if (!speedstep_chipset_dev) + return -EINVAL; + + pci_read_config_word(speedstep_chipset_dev, 0x00A0, &value); + if (!(value & 0x08)) { + value |= 0x08; + pr_debug("activating SpeedStep (TM) registers\n"); + pci_write_config_word(speedstep_chipset_dev, 0x00A0, value); + } + + return 0; +} + + +/** + * speedstep_detect_chipset - detect the Southbridge which contains SpeedStep logic + * + * Detects ICH2-M, ICH3-M and ICH4-M so far. The pci_dev points to + * the LPC bridge / PM module which contains all power-management + * functions. Returns the SPEEDSTEP_CHIPSET_-number for the detected + * chipset, or zero on failure. + */ +static unsigned int speedstep_detect_chipset(void) +{ + speedstep_chipset_dev = pci_get_subsys(PCI_VENDOR_ID_INTEL, + PCI_DEVICE_ID_INTEL_82801DB_12, + PCI_ANY_ID, PCI_ANY_ID, + NULL); + if (speedstep_chipset_dev) + return 4; /* 4-M */ + + speedstep_chipset_dev = pci_get_subsys(PCI_VENDOR_ID_INTEL, + PCI_DEVICE_ID_INTEL_82801CA_12, + PCI_ANY_ID, PCI_ANY_ID, + NULL); + if (speedstep_chipset_dev) + return 3; /* 3-M */ + + + speedstep_chipset_dev = pci_get_subsys(PCI_VENDOR_ID_INTEL, + PCI_DEVICE_ID_INTEL_82801BA_10, + PCI_ANY_ID, PCI_ANY_ID, + NULL); + if (speedstep_chipset_dev) { + /* speedstep.c causes lockups on Dell Inspirons 8000 and + * 8100 which use a pretty old revision of the 82815 + * host brige. Abort on these systems. + */ + static struct pci_dev *hostbridge; + + hostbridge = pci_get_subsys(PCI_VENDOR_ID_INTEL, + PCI_DEVICE_ID_INTEL_82815_MC, + PCI_ANY_ID, PCI_ANY_ID, + NULL); + + if (!hostbridge) + return 2; /* 2-M */ + + if (hostbridge->revision < 5) { + pr_debug("hostbridge does not support speedstep\n"); + speedstep_chipset_dev = NULL; + pci_dev_put(hostbridge); + return 0; + } + + pci_dev_put(hostbridge); + return 2; /* 2-M */ + } + + return 0; +} + +static void get_freq_data(void *_speed) +{ + unsigned int *speed = _speed; + + *speed = speedstep_get_frequency(speedstep_processor); +} + +static unsigned int speedstep_get(unsigned int cpu) +{ + unsigned int speed; + + /* You're supposed to ensure CPU is online. */ + if (smp_call_function_single(cpu, get_freq_data, &speed, 1) != 0) + BUG(); + + pr_debug("detected %u kHz as current frequency\n", speed); + return speed; +} + +/** + * speedstep_target - set a new CPUFreq policy + * @policy: new policy + * @target_freq: the target frequency + * @relation: how that frequency relates to achieved frequency + * (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H) + * + * Sets a new CPUFreq policy. + */ +static int speedstep_target(struct cpufreq_policy *policy, + unsigned int target_freq, + unsigned int relation) +{ + unsigned int newstate = 0, policy_cpu; + struct cpufreq_freqs freqs; + int i; + + if (cpufreq_frequency_table_target(policy, &speedstep_freqs[0], + target_freq, relation, &newstate)) + return -EINVAL; + + policy_cpu = cpumask_any_and(policy->cpus, cpu_online_mask); + freqs.old = speedstep_get(policy_cpu); + freqs.new = speedstep_freqs[newstate].frequency; + freqs.cpu = policy->cpu; + + pr_debug("transiting from %u to %u kHz\n", freqs.old, freqs.new); + + /* no transition necessary */ + if (freqs.old == freqs.new) + return 0; + + for_each_cpu(i, policy->cpus) { + freqs.cpu = i; + cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); + } + + smp_call_function_single(policy_cpu, _speedstep_set_state, &newstate, + true); + + for_each_cpu(i, policy->cpus) { + freqs.cpu = i; + cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); + } + + return 0; +} + + +/** + * speedstep_verify - verifies a new CPUFreq policy + * @policy: new policy + * + * Limit must be within speedstep_low_freq and speedstep_high_freq, with + * at least one border included. + */ +static int speedstep_verify(struct cpufreq_policy *policy) +{ + return cpufreq_frequency_table_verify(policy, &speedstep_freqs[0]); +} + +struct get_freqs { + struct cpufreq_policy *policy; + int ret; +}; + +static void get_freqs_on_cpu(void *_get_freqs) +{ + struct get_freqs *get_freqs = _get_freqs; + + get_freqs->ret = + speedstep_get_freqs(speedstep_processor, + &speedstep_freqs[SPEEDSTEP_LOW].frequency, + &speedstep_freqs[SPEEDSTEP_HIGH].frequency, + &get_freqs->policy->cpuinfo.transition_latency, + &speedstep_set_state); +} + +static int speedstep_cpu_init(struct cpufreq_policy *policy) +{ + int result; + unsigned int policy_cpu, speed; + struct get_freqs gf; + + /* only run on CPU to be set, or on its sibling */ +#ifdef CONFIG_SMP + cpumask_copy(policy->cpus, cpu_sibling_mask(policy->cpu)); +#endif + policy_cpu = cpumask_any_and(policy->cpus, cpu_online_mask); + + /* detect low and high frequency and transition latency */ + gf.policy = policy; + smp_call_function_single(policy_cpu, get_freqs_on_cpu, &gf, 1); + if (gf.ret) + return gf.ret; + + /* get current speed setting */ + speed = speedstep_get(policy_cpu); + if (!speed) + return -EIO; + + pr_debug("currently at %s speed setting - %i MHz\n", + (speed == speedstep_freqs[SPEEDSTEP_LOW].frequency) + ? "low" : "high", + (speed / 1000)); + + /* cpuinfo and default policy values */ + policy->cur = speed; + + result = cpufreq_frequency_table_cpuinfo(policy, speedstep_freqs); + if (result) + return result; + + cpufreq_frequency_table_get_attr(speedstep_freqs, policy->cpu); + + return 0; +} + + +static int speedstep_cpu_exit(struct cpufreq_policy *policy) +{ + cpufreq_frequency_table_put_attr(policy->cpu); + return 0; +} + +static struct freq_attr *speedstep_attr[] = { + &cpufreq_freq_attr_scaling_available_freqs, + NULL, +}; + + +static struct cpufreq_driver speedstep_driver = { + .name = "speedstep-ich", + .verify = speedstep_verify, + .target = speedstep_target, + .init = speedstep_cpu_init, + .exit = speedstep_cpu_exit, + .get = speedstep_get, + .owner = THIS_MODULE, + .attr = speedstep_attr, +}; + + +/** + * speedstep_init - initializes the SpeedStep CPUFreq driver + * + * Initializes the SpeedStep support. Returns -ENODEV on unsupported + * devices, -EINVAL on problems during initiatization, and zero on + * success. + */ +static int __init speedstep_init(void) +{ + /* detect processor */ + speedstep_processor = speedstep_detect_processor(); + if (!speedstep_processor) { + pr_debug("Intel(R) SpeedStep(TM) capable processor " + "not found\n"); + return -ENODEV; + } + + /* detect chipset */ + if (!speedstep_detect_chipset()) { + pr_debug("Intel(R) SpeedStep(TM) for this chipset not " + "(yet) available.\n"); + return -ENODEV; + } + + /* activate speedstep support */ + if (speedstep_activate()) { + pci_dev_put(speedstep_chipset_dev); + return -EINVAL; + } + + if (speedstep_find_register()) + return -ENODEV; + + return cpufreq_register_driver(&speedstep_driver); +} + + +/** + * speedstep_exit - unregisters SpeedStep support + * + * Unregisters SpeedStep support. + */ +static void __exit speedstep_exit(void) +{ + pci_dev_put(speedstep_chipset_dev); + cpufreq_unregister_driver(&speedstep_driver); +} + + +MODULE_AUTHOR("Dave Jones , " + "Dominik Brodowski "); +MODULE_DESCRIPTION("Speedstep driver for Intel mobile processors on chipsets " + "with ICH-M southbridges."); +MODULE_LICENSE("GPL"); + +module_init(speedstep_init); +module_exit(speedstep_exit); diff --git a/drivers/cpufreq/speedstep-lib.c b/drivers/cpufreq/speedstep-lib.c new file mode 100644 index 0000000..8af2d2f --- /dev/null +++ b/drivers/cpufreq/speedstep-lib.c @@ -0,0 +1,478 @@ +/* + * (C) 2002 - 2003 Dominik Brodowski + * + * Licensed under the terms of the GNU GPL License version 2. + * + * Library for common functions for Intel SpeedStep v.1 and v.2 support + * + * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous* + */ + +#include +#include +#include +#include +#include + +#include +#include +#include "speedstep-lib.h" + +#define PFX "speedstep-lib: " + +#ifdef CONFIG_X86_SPEEDSTEP_RELAXED_CAP_CHECK +static int relaxed_check; +#else +#define relaxed_check 0 +#endif + +/********************************************************************* + * GET PROCESSOR CORE SPEED IN KHZ * + *********************************************************************/ + +static unsigned int pentium3_get_frequency(enum speedstep_processor processor) +{ + /* See table 14 of p3_ds.pdf and table 22 of 29834003.pdf */ + struct { + unsigned int ratio; /* Frequency Multiplier (x10) */ + u8 bitmap; /* power on configuration bits + [27, 25:22] (in MSR 0x2a) */ + } msr_decode_mult[] = { + { 30, 0x01 }, + { 35, 0x05 }, + { 40, 0x02 }, + { 45, 0x06 }, + { 50, 0x00 }, + { 55, 0x04 }, + { 60, 0x0b }, + { 65, 0x0f }, + { 70, 0x09 }, + { 75, 0x0d }, + { 80, 0x0a }, + { 85, 0x26 }, + { 90, 0x20 }, + { 100, 0x2b }, + { 0, 0xff } /* error or unknown value */ + }; + + /* PIII(-M) FSB settings: see table b1-b of 24547206.pdf */ + struct { + unsigned int value; /* Front Side Bus speed in MHz */ + u8 bitmap; /* power on configuration bits [18: 19] + (in MSR 0x2a) */ + } msr_decode_fsb[] = { + { 66, 0x0 }, + { 100, 0x2 }, + { 133, 0x1 }, + { 0, 0xff} + }; + + u32 msr_lo, msr_tmp; + int i = 0, j = 0; + + /* read MSR 0x2a - we only need the low 32 bits */ + rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_tmp); + pr_debug("P3 - MSR_IA32_EBL_CR_POWERON: 0x%x 0x%x\n", msr_lo, msr_tmp); + msr_tmp = msr_lo; + + /* decode the FSB */ + msr_tmp &= 0x00c0000; + msr_tmp >>= 18; + while (msr_tmp != msr_decode_fsb[i].bitmap) { + if (msr_decode_fsb[i].bitmap == 0xff) + return 0; + i++; + } + + /* decode the multiplier */ + if (processor == SPEEDSTEP_CPU_PIII_C_EARLY) { + pr_debug("workaround for early PIIIs\n"); + msr_lo &= 0x03c00000; + } else + msr_lo &= 0x0bc00000; + msr_lo >>= 22; + while (msr_lo != msr_decode_mult[j].bitmap) { + if (msr_decode_mult[j].bitmap == 0xff) + return 0; + j++; + } + + pr_debug("speed is %u\n", + (msr_decode_mult[j].ratio * msr_decode_fsb[i].value * 100)); + + return msr_decode_mult[j].ratio * msr_decode_fsb[i].value * 100; +} + + +static unsigned int pentiumM_get_frequency(void) +{ + u32 msr_lo, msr_tmp; + + rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_tmp); + pr_debug("PM - MSR_IA32_EBL_CR_POWERON: 0x%x 0x%x\n", msr_lo, msr_tmp); + + /* see table B-2 of 24547212.pdf */ + if (msr_lo & 0x00040000) { + printk(KERN_DEBUG PFX "PM - invalid FSB: 0x%x 0x%x\n", + msr_lo, msr_tmp); + return 0; + } + + msr_tmp = (msr_lo >> 22) & 0x1f; + pr_debug("bits 22-26 are 0x%x, speed is %u\n", + msr_tmp, (msr_tmp * 100 * 1000)); + + return msr_tmp * 100 * 1000; +} + +static unsigned int pentium_core_get_frequency(void) +{ + u32 fsb = 0; + u32 msr_lo, msr_tmp; + int ret; + + rdmsr(MSR_FSB_FREQ, msr_lo, msr_tmp); + /* see table B-2 of 25366920.pdf */ + switch (msr_lo & 0x07) { + case 5: + fsb = 100000; + break; + case 1: + fsb = 133333; + break; + case 3: + fsb = 166667; + break; + case 2: + fsb = 200000; + break; + case 0: + fsb = 266667; + break; + case 4: + fsb = 333333; + break; + default: + printk(KERN_ERR "PCORE - MSR_FSB_FREQ undefined value"); + } + + rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_tmp); + pr_debug("PCORE - MSR_IA32_EBL_CR_POWERON: 0x%x 0x%x\n", + msr_lo, msr_tmp); + + msr_tmp = (msr_lo >> 22) & 0x1f; + pr_debug("bits 22-26 are 0x%x, speed is %u\n", + msr_tmp, (msr_tmp * fsb)); + + ret = (msr_tmp * fsb); + return ret; +} + + +static unsigned int pentium4_get_frequency(void) +{ + struct cpuinfo_x86 *c = &boot_cpu_data; + u32 msr_lo, msr_hi, mult; + unsigned int fsb = 0; + unsigned int ret; + u8 fsb_code; + + /* Pentium 4 Model 0 and 1 do not have the Core Clock Frequency + * to System Bus Frequency Ratio Field in the Processor Frequency + * Configuration Register of the MSR. Therefore the current + * frequency cannot be calculated and has to be measured. + */ + if (c->x86_model < 2) + return cpu_khz; + + rdmsr(0x2c, msr_lo, msr_hi); + + pr_debug("P4 - MSR_EBC_FREQUENCY_ID: 0x%x 0x%x\n", msr_lo, msr_hi); + + /* decode the FSB: see IA-32 Intel (C) Architecture Software + * Developer's Manual, Volume 3: System Prgramming Guide, + * revision #12 in Table B-1: MSRs in the Pentium 4 and + * Intel Xeon Processors, on page B-4 and B-5. + */ + fsb_code = (msr_lo >> 16) & 0x7; + switch (fsb_code) { + case 0: + fsb = 100 * 1000; + break; + case 1: + fsb = 13333 * 10; + break; + case 2: + fsb = 200 * 1000; + break; + } + + if (!fsb) + printk(KERN_DEBUG PFX "couldn't detect FSB speed. " + "Please send an e-mail to \n"); + + /* Multiplier. */ + mult = msr_lo >> 24; + + pr_debug("P4 - FSB %u kHz; Multiplier %u; Speed %u kHz\n", + fsb, mult, (fsb * mult)); + + ret = (fsb * mult); + return ret; +} + + +/* Warning: may get called from smp_call_function_single. */ +unsigned int speedstep_get_frequency(enum speedstep_processor processor) +{ + switch (processor) { + case SPEEDSTEP_CPU_PCORE: + return pentium_core_get_frequency(); + case SPEEDSTEP_CPU_PM: + return pentiumM_get_frequency(); + case SPEEDSTEP_CPU_P4D: + case SPEEDSTEP_CPU_P4M: + return pentium4_get_frequency(); + case SPEEDSTEP_CPU_PIII_T: + case SPEEDSTEP_CPU_PIII_C: + case SPEEDSTEP_CPU_PIII_C_EARLY: + return pentium3_get_frequency(processor); + default: + return 0; + }; + return 0; +} +EXPORT_SYMBOL_GPL(speedstep_get_frequency); + + +/********************************************************************* + * DETECT SPEEDSTEP-CAPABLE PROCESSOR * + *********************************************************************/ + +unsigned int speedstep_detect_processor(void) +{ + struct cpuinfo_x86 *c = &cpu_data(0); + u32 ebx, msr_lo, msr_hi; + + pr_debug("x86: %x, model: %x\n", c->x86, c->x86_model); + + if ((c->x86_vendor != X86_VENDOR_INTEL) || + ((c->x86 != 6) && (c->x86 != 0xF))) + return 0; + + if (c->x86 == 0xF) { + /* Intel Mobile Pentium 4-M + * or Intel Mobile Pentium 4 with 533 MHz FSB */ + if (c->x86_model != 2) + return 0; + + ebx = cpuid_ebx(0x00000001); + ebx &= 0x000000FF; + + pr_debug("ebx value is %x, x86_mask is %x\n", ebx, c->x86_mask); + + switch (c->x86_mask) { + case 4: + /* + * B-stepping [M-P4-M] + * sample has ebx = 0x0f, production has 0x0e. + */ + if ((ebx == 0x0e) || (ebx == 0x0f)) + return SPEEDSTEP_CPU_P4M; + break; + case 7: + /* + * C-stepping [M-P4-M] + * needs to have ebx=0x0e, else it's a celeron: + * cf. 25130917.pdf / page 7, footnote 5 even + * though 25072120.pdf / page 7 doesn't say + * samples are only of B-stepping... + */ + if (ebx == 0x0e) + return SPEEDSTEP_CPU_P4M; + break; + case 9: + /* + * D-stepping [M-P4-M or M-P4/533] + * + * this is totally strange: CPUID 0x0F29 is + * used by M-P4-M, M-P4/533 and(!) Celeron CPUs. + * The latter need to be sorted out as they don't + * support speedstep. + * Celerons with CPUID 0x0F29 may have either + * ebx=0x8 or 0xf -- 25130917.pdf doesn't say anything + * specific. + * M-P4-Ms may have either ebx=0xe or 0xf [see above] + * M-P4/533 have either ebx=0xe or 0xf. [25317607.pdf] + * also, M-P4M HTs have ebx=0x8, too + * For now, they are distinguished by the model_id + * string + */ + if ((ebx == 0x0e) || + (strstr(c->x86_model_id, + "Mobile Intel(R) Pentium(R) 4") != NULL)) + return SPEEDSTEP_CPU_P4M; + break; + default: + break; + } + return 0; + } + + switch (c->x86_model) { + case 0x0B: /* Intel PIII [Tualatin] */ + /* cpuid_ebx(1) is 0x04 for desktop PIII, + * 0x06 for mobile PIII-M */ + ebx = cpuid_ebx(0x00000001); + pr_debug("ebx is %x\n", ebx); + + ebx &= 0x000000FF; + + if (ebx != 0x06) + return 0; + + /* So far all PIII-M processors support SpeedStep. See + * Intel's 24540640.pdf of June 2003 + */ + return SPEEDSTEP_CPU_PIII_T; + + case 0x08: /* Intel PIII [Coppermine] */ + + /* all mobile PIII Coppermines have FSB 100 MHz + * ==> sort out a few desktop PIIIs. */ + rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_hi); + pr_debug("Coppermine: MSR_IA32_EBL_CR_POWERON is 0x%x, 0x%x\n", + msr_lo, msr_hi); + msr_lo &= 0x00c0000; + if (msr_lo != 0x0080000) + return 0; + + /* + * If the processor is a mobile version, + * platform ID has bit 50 set + * it has SpeedStep technology if either + * bit 56 or 57 is set + */ + rdmsr(MSR_IA32_PLATFORM_ID, msr_lo, msr_hi); + pr_debug("Coppermine: MSR_IA32_PLATFORM ID is 0x%x, 0x%x\n", + msr_lo, msr_hi); + if ((msr_hi & (1<<18)) && + (relaxed_check ? 1 : (msr_hi & (3<<24)))) { + if (c->x86_mask == 0x01) { + pr_debug("early PIII version\n"); + return SPEEDSTEP_CPU_PIII_C_EARLY; + } else + return SPEEDSTEP_CPU_PIII_C; + } + + default: + return 0; + } +} +EXPORT_SYMBOL_GPL(speedstep_detect_processor); + + +/********************************************************************* + * DETECT SPEEDSTEP SPEEDS * + *********************************************************************/ + +unsigned int speedstep_get_freqs(enum speedstep_processor processor, + unsigned int *low_speed, + unsigned int *high_speed, + unsigned int *transition_latency, + void (*set_state) (unsigned int state)) +{ + unsigned int prev_speed; + unsigned int ret = 0; + unsigned long flags; + struct timeval tv1, tv2; + + if ((!processor) || (!low_speed) || (!high_speed) || (!set_state)) + return -EINVAL; + + pr_debug("trying to determine both speeds\n"); + + /* get current speed */ + prev_speed = speedstep_get_frequency(processor); + if (!prev_speed) + return -EIO; + + pr_debug("previous speed is %u\n", prev_speed); + + local_irq_save(flags); + + /* switch to low state */ + set_state(SPEEDSTEP_LOW); + *low_speed = speedstep_get_frequency(processor); + if (!*low_speed) { + ret = -EIO; + goto out; + } + + pr_debug("low speed is %u\n", *low_speed); + + /* start latency measurement */ + if (transition_latency) + do_gettimeofday(&tv1); + + /* switch to high state */ + set_state(SPEEDSTEP_HIGH); + + /* end latency measurement */ + if (transition_latency) + do_gettimeofday(&tv2); + + *high_speed = speedstep_get_frequency(processor); + if (!*high_speed) { + ret = -EIO; + goto out; + } + + pr_debug("high speed is %u\n", *high_speed); + + if (*low_speed == *high_speed) { + ret = -ENODEV; + goto out; + } + + /* switch to previous state, if necessary */ + if (*high_speed != prev_speed) + set_state(SPEEDSTEP_LOW); + + if (transition_latency) { + *transition_latency = (tv2.tv_sec - tv1.tv_sec) * USEC_PER_SEC + + tv2.tv_usec - tv1.tv_usec; + pr_debug("transition latency is %u uSec\n", *transition_latency); + + /* convert uSec to nSec and add 20% for safety reasons */ + *transition_latency *= 1200; + + /* check if the latency measurement is too high or too low + * and set it to a safe value (500uSec) in that case + */ + if (*transition_latency > 10000000 || + *transition_latency < 50000) { + printk(KERN_WARNING PFX "frequency transition " + "measured seems out of range (%u " + "nSec), falling back to a safe one of" + "%u nSec.\n", + *transition_latency, 500000); + *transition_latency = 500000; + } + } + +out: + local_irq_restore(flags); + return ret; +} +EXPORT_SYMBOL_GPL(speedstep_get_freqs); + +#ifdef CONFIG_X86_SPEEDSTEP_RELAXED_CAP_CHECK +module_param(relaxed_check, int, 0444); +MODULE_PARM_DESC(relaxed_check, + "Don't do all checks for speedstep capability."); +#endif + +MODULE_AUTHOR("Dominik Brodowski "); +MODULE_DESCRIPTION("Library for Intel SpeedStep 1 or 2 cpufreq drivers."); +MODULE_LICENSE("GPL"); diff --git a/drivers/cpufreq/speedstep-lib.h b/drivers/cpufreq/speedstep-lib.h new file mode 100644 index 0000000..70d9cea --- /dev/null +++ b/drivers/cpufreq/speedstep-lib.h @@ -0,0 +1,49 @@ +/* + * (C) 2002 - 2003 Dominik Brodowski + * + * Licensed under the terms of the GNU GPL License version 2. + * + * Library for common functions for Intel SpeedStep v.1 and v.2 support + * + * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous* + */ + + + +/* processors */ +enum speedstep_processor { + SPEEDSTEP_CPU_PIII_C_EARLY = 0x00000001, /* Coppermine core */ + SPEEDSTEP_CPU_PIII_C = 0x00000002, /* Coppermine core */ + SPEEDSTEP_CPU_PIII_T = 0x00000003, /* Tualatin core */ + SPEEDSTEP_CPU_P4M = 0x00000004, /* P4-M */ +/* the following processors are not speedstep-capable and are not auto-detected + * in speedstep_detect_processor(). However, their speed can be detected using + * the speedstep_get_frequency() call. */ + SPEEDSTEP_CPU_PM = 0xFFFFFF03, /* Pentium M */ + SPEEDSTEP_CPU_P4D = 0xFFFFFF04, /* desktop P4 */ + SPEEDSTEP_CPU_PCORE = 0xFFFFFF05, /* Core */ +}; + +/* speedstep states -- only two of them */ + +#define SPEEDSTEP_HIGH 0x00000000 +#define SPEEDSTEP_LOW 0x00000001 + + +/* detect a speedstep-capable processor */ +extern enum speedstep_processor speedstep_detect_processor(void); + +/* detect the current speed (in khz) of the processor */ +extern unsigned int speedstep_get_frequency(enum speedstep_processor processor); + + +/* detect the low and high speeds of the processor. The callback + * set_state"'s first argument is either SPEEDSTEP_HIGH or + * SPEEDSTEP_LOW; the second argument is zero so that no + * cpufreq_notify_transition calls are initiated. + */ +extern unsigned int speedstep_get_freqs(enum speedstep_processor processor, + unsigned int *low_speed, + unsigned int *high_speed, + unsigned int *transition_latency, + void (*set_state) (unsigned int state)); diff --git a/drivers/cpufreq/speedstep-smi.c b/drivers/cpufreq/speedstep-smi.c new file mode 100644 index 0000000..c76ead3 --- /dev/null +++ b/drivers/cpufreq/speedstep-smi.c @@ -0,0 +1,464 @@ +/* + * Intel SpeedStep SMI driver. + * + * (C) 2003 Hiroshi Miura + * + * Licensed under the terms of the GNU GPL License version 2. + * + */ + + +/********************************************************************* + * SPEEDSTEP - DEFINITIONS * + *********************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "speedstep-lib.h" + +/* speedstep system management interface port/command. + * + * These parameters are got from IST-SMI BIOS call. + * If user gives it, these are used. + * + */ +static int smi_port; +static int smi_cmd; +static unsigned int smi_sig; + +/* info about the processor */ +static enum speedstep_processor speedstep_processor; + +/* + * There are only two frequency states for each processor. Values + * are in kHz for the time being. + */ +static struct cpufreq_frequency_table speedstep_freqs[] = { + {SPEEDSTEP_HIGH, 0}, + {SPEEDSTEP_LOW, 0}, + {0, CPUFREQ_TABLE_END}, +}; + +#define GET_SPEEDSTEP_OWNER 0 +#define GET_SPEEDSTEP_STATE 1 +#define SET_SPEEDSTEP_STATE 2 +#define GET_SPEEDSTEP_FREQS 4 + +/* how often shall the SMI call be tried if it failed, e.g. because + * of DMA activity going on? */ +#define SMI_TRIES 5 + +/** + * speedstep_smi_ownership + */ +static int speedstep_smi_ownership(void) +{ + u32 command, result, magic, dummy; + u32 function = GET_SPEEDSTEP_OWNER; + unsigned char magic_data[] = "Copyright (c) 1999 Intel Corporation"; + + command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff); + magic = virt_to_phys(magic_data); + + pr_debug("trying to obtain ownership with command %x at port %x\n", + command, smi_port); + + __asm__ __volatile__( + "push %%ebp\n" + "out %%al, (%%dx)\n" + "pop %%ebp\n" + : "=D" (result), + "=a" (dummy), "=b" (dummy), "=c" (dummy), "=d" (dummy), + "=S" (dummy) + : "a" (command), "b" (function), "c" (0), "d" (smi_port), + "D" (0), "S" (magic) + : "memory" + ); + + pr_debug("result is %x\n", result); + + return result; +} + +/** + * speedstep_smi_get_freqs - get SpeedStep preferred & current freq. + * @low: the low frequency value is placed here + * @high: the high frequency value is placed here + * + * Only available on later SpeedStep-enabled systems, returns false results or + * even hangs [cf. bugme.osdl.org # 1422] on earlier systems. Empirical testing + * shows that the latter occurs if !(ist_info.event & 0xFFFF). + */ +static int speedstep_smi_get_freqs(unsigned int *low, unsigned int *high) +{ + u32 command, result = 0, edi, high_mhz, low_mhz, dummy; + u32 state = 0; + u32 function = GET_SPEEDSTEP_FREQS; + + if (!(ist_info.event & 0xFFFF)) { + pr_debug("bug #1422 -- can't read freqs from BIOS\n"); + return -ENODEV; + } + + command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff); + + pr_debug("trying to determine frequencies with command %x at port %x\n", + command, smi_port); + + __asm__ __volatile__( + "push %%ebp\n" + "out %%al, (%%dx)\n" + "pop %%ebp" + : "=a" (result), + "=b" (high_mhz), + "=c" (low_mhz), + "=d" (state), "=D" (edi), "=S" (dummy) + : "a" (command), + "b" (function), + "c" (state), + "d" (smi_port), "S" (0), "D" (0) + ); + + pr_debug("result %x, low_freq %u, high_freq %u\n", + result, low_mhz, high_mhz); + + /* abort if results are obviously incorrect... */ + if ((high_mhz + low_mhz) < 600) + return -EINVAL; + + *high = high_mhz * 1000; + *low = low_mhz * 1000; + + return result; +} + +/** + * speedstep_get_state - set the SpeedStep state + * @state: processor frequency state (SPEEDSTEP_LOW or SPEEDSTEP_HIGH) + * + */ +static int speedstep_get_state(void) +{ + u32 function = GET_SPEEDSTEP_STATE; + u32 result, state, edi, command, dummy; + + command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff); + + pr_debug("trying to determine current setting with command %x " + "at port %x\n", command, smi_port); + + __asm__ __volatile__( + "push %%ebp\n" + "out %%al, (%%dx)\n" + "pop %%ebp\n" + : "=a" (result), + "=b" (state), "=D" (edi), + "=c" (dummy), "=d" (dummy), "=S" (dummy) + : "a" (command), "b" (function), "c" (0), + "d" (smi_port), "S" (0), "D" (0) + ); + + pr_debug("state is %x, result is %x\n", state, result); + + return state & 1; +} + + +/** + * speedstep_set_state - set the SpeedStep state + * @state: new processor frequency state (SPEEDSTEP_LOW or SPEEDSTEP_HIGH) + * + */ +static void speedstep_set_state(unsigned int state) +{ + unsigned int result = 0, command, new_state, dummy; + unsigned long flags; + unsigned int function = SET_SPEEDSTEP_STATE; + unsigned int retry = 0; + + if (state > 0x1) + return; + + /* Disable IRQs */ + local_irq_save(flags); + + command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff); + + pr_debug("trying to set frequency to state %u " + "with command %x at port %x\n", + state, command, smi_port); + + do { + if (retry) { + pr_debug("retry %u, previous result %u, waiting...\n", + retry, result); + mdelay(retry * 50); + } + retry++; + __asm__ __volatile__( + "push %%ebp\n" + "out %%al, (%%dx)\n" + "pop %%ebp" + : "=b" (new_state), "=D" (result), + "=c" (dummy), "=a" (dummy), + "=d" (dummy), "=S" (dummy) + : "a" (command), "b" (function), "c" (state), + "d" (smi_port), "S" (0), "D" (0) + ); + } while ((new_state != state) && (retry <= SMI_TRIES)); + + /* enable IRQs */ + local_irq_restore(flags); + + if (new_state == state) + pr_debug("change to %u MHz succeeded after %u tries " + "with result %u\n", + (speedstep_freqs[new_state].frequency / 1000), + retry, result); + else + printk(KERN_ERR "cpufreq: change to state %u " + "failed with new_state %u and result %u\n", + state, new_state, result); + + return; +} + + +/** + * speedstep_target - set a new CPUFreq policy + * @policy: new policy + * @target_freq: new freq + * @relation: + * + * Sets a new CPUFreq policy/freq. + */ +static int speedstep_target(struct cpufreq_policy *policy, + unsigned int target_freq, unsigned int relation) +{ + unsigned int newstate = 0; + struct cpufreq_freqs freqs; + + if (cpufreq_frequency_table_target(policy, &speedstep_freqs[0], + target_freq, relation, &newstate)) + return -EINVAL; + + freqs.old = speedstep_freqs[speedstep_get_state()].frequency; + freqs.new = speedstep_freqs[newstate].frequency; + freqs.cpu = 0; /* speedstep.c is UP only driver */ + + if (freqs.old == freqs.new) + return 0; + + cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); + speedstep_set_state(newstate); + cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); + + return 0; +} + + +/** + * speedstep_verify - verifies a new CPUFreq policy + * @policy: new policy + * + * Limit must be within speedstep_low_freq and speedstep_high_freq, with + * at least one border included. + */ +static int speedstep_verify(struct cpufreq_policy *policy) +{ + return cpufreq_frequency_table_verify(policy, &speedstep_freqs[0]); +} + + +static int speedstep_cpu_init(struct cpufreq_policy *policy) +{ + int result; + unsigned int speed, state; + unsigned int *low, *high; + + /* capability check */ + if (policy->cpu != 0) + return -ENODEV; + + result = speedstep_smi_ownership(); + if (result) { + pr_debug("fails in acquiring ownership of a SMI interface.\n"); + return -EINVAL; + } + + /* detect low and high frequency */ + low = &speedstep_freqs[SPEEDSTEP_LOW].frequency; + high = &speedstep_freqs[SPEEDSTEP_HIGH].frequency; + + result = speedstep_smi_get_freqs(low, high); + if (result) { + /* fall back to speedstep_lib.c dection mechanism: + * try both states out */ + pr_debug("could not detect low and high frequencies " + "by SMI call.\n"); + result = speedstep_get_freqs(speedstep_processor, + low, high, + NULL, + &speedstep_set_state); + + if (result) { + pr_debug("could not detect two different speeds" + " -- aborting.\n"); + return result; + } else + pr_debug("workaround worked.\n"); + } + + /* get current speed setting */ + state = speedstep_get_state(); + speed = speedstep_freqs[state].frequency; + + pr_debug("currently at %s speed setting - %i MHz\n", + (speed == speedstep_freqs[SPEEDSTEP_LOW].frequency) + ? "low" : "high", + (speed / 1000)); + + /* cpuinfo and default policy values */ + policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL; + policy->cur = speed; + + result = cpufreq_frequency_table_cpuinfo(policy, speedstep_freqs); + if (result) + return result; + + cpufreq_frequency_table_get_attr(speedstep_freqs, policy->cpu); + + return 0; +} + +static int speedstep_cpu_exit(struct cpufreq_policy *policy) +{ + cpufreq_frequency_table_put_attr(policy->cpu); + return 0; +} + +static unsigned int speedstep_get(unsigned int cpu) +{ + if (cpu) + return -ENODEV; + return speedstep_get_frequency(speedstep_processor); +} + + +static int speedstep_resume(struct cpufreq_policy *policy) +{ + int result = speedstep_smi_ownership(); + + if (result) + pr_debug("fails in re-acquiring ownership of a SMI interface.\n"); + + return result; +} + +static struct freq_attr *speedstep_attr[] = { + &cpufreq_freq_attr_scaling_available_freqs, + NULL, +}; + +static struct cpufreq_driver speedstep_driver = { + .name = "speedstep-smi", + .verify = speedstep_verify, + .target = speedstep_target, + .init = speedstep_cpu_init, + .exit = speedstep_cpu_exit, + .get = speedstep_get, + .resume = speedstep_resume, + .owner = THIS_MODULE, + .attr = speedstep_attr, +}; + +/** + * speedstep_init - initializes the SpeedStep CPUFreq driver + * + * Initializes the SpeedStep support. Returns -ENODEV on unsupported + * BIOS, -EINVAL on problems during initiatization, and zero on + * success. + */ +static int __init speedstep_init(void) +{ + speedstep_processor = speedstep_detect_processor(); + + switch (speedstep_processor) { + case SPEEDSTEP_CPU_PIII_T: + case SPEEDSTEP_CPU_PIII_C: + case SPEEDSTEP_CPU_PIII_C_EARLY: + break; + default: + speedstep_processor = 0; + } + + if (!speedstep_processor) { + pr_debug("No supported Intel CPU detected.\n"); + return -ENODEV; + } + + pr_debug("signature:0x%.8ulx, command:0x%.8ulx, " + "event:0x%.8ulx, perf_level:0x%.8ulx.\n", + ist_info.signature, ist_info.command, + ist_info.event, ist_info.perf_level); + + /* Error if no IST-SMI BIOS or no PARM + sig= 'ISGE' aka 'Intel Speedstep Gate E' */ + if ((ist_info.signature != 0x47534943) && ( + (smi_port == 0) || (smi_cmd == 0))) + return -ENODEV; + + if (smi_sig == 1) + smi_sig = 0x47534943; + else + smi_sig = ist_info.signature; + + /* setup smi_port from MODLULE_PARM or BIOS */ + if ((smi_port > 0xff) || (smi_port < 0)) + return -EINVAL; + else if (smi_port == 0) + smi_port = ist_info.command & 0xff; + + if ((smi_cmd > 0xff) || (smi_cmd < 0)) + return -EINVAL; + else if (smi_cmd == 0) + smi_cmd = (ist_info.command >> 16) & 0xff; + + return cpufreq_register_driver(&speedstep_driver); +} + + +/** + * speedstep_exit - unregisters SpeedStep support + * + * Unregisters SpeedStep support. + */ +static void __exit speedstep_exit(void) +{ + cpufreq_unregister_driver(&speedstep_driver); +} + +module_param(smi_port, int, 0444); +module_param(smi_cmd, int, 0444); +module_param(smi_sig, uint, 0444); + +MODULE_PARM_DESC(smi_port, "Override the BIOS-given IST port with this value " + "-- Intel's default setting is 0xb2"); +MODULE_PARM_DESC(smi_cmd, "Override the BIOS-given IST command with this value " + "-- Intel's default setting is 0x82"); +MODULE_PARM_DESC(smi_sig, "Set to 1 to fake the IST signature when using the " + "SMI interface."); + +MODULE_AUTHOR("Hiroshi Miura"); +MODULE_DESCRIPTION("Speedstep driver for IST applet SMI interface."); +MODULE_LICENSE("GPL"); + +module_init(speedstep_init); +module_exit(speedstep_exit); -- cgit v1.1 From 1477fcc290b3d5c2614bde98bf3b1154c538860d Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Fri, 20 May 2011 11:11:53 +1000 Subject: signal.h need a definition of struct task_struct This fixes these build errors on powerpc: In file included from arch/powerpc/mm/fault.c:18: include/linux/signal.h:239: error: 'struct task_struct' declared inside parameter list include/linux/signal.h:239: error: its scope is only this definition or declaration, which is probably not what you want include/linux/signal.h:240: error: 'struct task_struct' declared inside parameter list .. Exposed by commit e66eed651fd1 ("list: remove prefetching from regular list iterators"), which removed the include of from . Without that, linux/signal.h no longer accidentally got the declaration of 'struct task_struct'. Fix by properly declaring the struct, rather than introducing any new header file dependency. Signed-off-by: Stephen Rothwell Signed-off-by: Linus Torvalds --- include/linux/signal.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/signal.h b/include/linux/signal.h index fcd2b14..29a68ac 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -7,6 +7,8 @@ #ifdef __KERNEL__ #include +struct task_struct; + /* for sysctl */ extern int print_fatal_signals; /* -- cgit v1.1 From 044aea9b83614948c98564000db07d1d32b2d29b Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 19 May 2011 18:59:47 -0700 Subject: selinux: de-crapify avc cache stat code generation You can turn off the avc cache stats, but distributions seem to not do that (perhaps because several performance tuning how-to's talk about the avc cache statistics). Which is sad, because the code it generates is truly horrendous, with the statistics update being sandwitched between get_cpu/put_cpu which in turn causes preemption disables etc. We're talking ten+ instructions just to increment a per-cpu variable in some pretty hot code. Fix the craziness by just using 'this_cpu_inc()' instead. Suddenly we only need a single 'inc' instruction to increment the statistics. This is quite noticeable in the incredibly hot avc_has_perm_noaudit() function (which triggers all the statistics by virtue of doing an avc_lookup() call). Signed-off-by: Linus Torvalds --- security/selinux/avc.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/security/selinux/avc.c b/security/selinux/avc.c index 1d027e2..5971e30e 100644 --- a/security/selinux/avc.c +++ b/security/selinux/avc.c @@ -38,11 +38,7 @@ #define AVC_CACHE_RECLAIM 16 #ifdef CONFIG_SECURITY_SELINUX_AVC_STATS -#define avc_cache_stats_incr(field) \ -do { \ - per_cpu(avc_cache_stats, get_cpu()).field++; \ - put_cpu(); \ -} while (0) +#define avc_cache_stats_incr(field) this_cpu_inc(avc_cache_stats.field) #else #define avc_cache_stats_incr(field) do {} while (0) #endif -- cgit v1.1 From 257313b2a87795e07a0bdf58d0fffbdba8b31051 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 19 May 2011 21:22:53 -0700 Subject: selinux: avoid unnecessary avc cache stat hit count There is no point in counting hits - we can calculate it from the number of lookups and misses. This makes the avc statistics a bit smaller, and makes the code generation better too. Signed-off-by: Linus Torvalds --- security/selinux/avc.c | 9 ++++----- security/selinux/include/avc.h | 1 - security/selinux/selinuxfs.c | 10 +++++++--- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/security/selinux/avc.c b/security/selinux/avc.c index 5971e30e..3d2715f 100644 --- a/security/selinux/avc.c +++ b/security/selinux/avc.c @@ -343,11 +343,10 @@ static struct avc_node *avc_lookup(u32 ssid, u32 tsid, u16 tclass) node = avc_search_node(ssid, tsid, tclass); if (node) - avc_cache_stats_incr(hits); - else - avc_cache_stats_incr(misses); + return node; - return node; + avc_cache_stats_incr(misses); + return NULL; } static int avc_latest_notif_update(int seqno, int is_insert) @@ -765,7 +764,7 @@ int avc_has_perm_noaudit(u32 ssid, u32 tsid, rcu_read_lock(); node = avc_lookup(ssid, tsid, tclass); - if (!node) { + if (unlikely(!node)) { rcu_read_unlock(); if (in_avd) diff --git a/security/selinux/include/avc.h b/security/selinux/include/avc.h index e77b2ac..47fda96 100644 --- a/security/selinux/include/avc.h +++ b/security/selinux/include/avc.h @@ -41,7 +41,6 @@ struct sk_buff; */ struct avc_cache_stats { unsigned int lookups; - unsigned int hits; unsigned int misses; unsigned int allocations; unsigned int reclaims; diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c index ea39cb7..c0e1a0f 100644 --- a/security/selinux/selinuxfs.c +++ b/security/selinux/selinuxfs.c @@ -1380,10 +1380,14 @@ static int sel_avc_stats_seq_show(struct seq_file *seq, void *v) if (v == SEQ_START_TOKEN) seq_printf(seq, "lookups hits misses allocations reclaims " "frees\n"); - else - seq_printf(seq, "%u %u %u %u %u %u\n", st->lookups, - st->hits, st->misses, st->allocations, + else { + unsigned int lookups = st->lookups; + unsigned int misses = st->misses; + unsigned int hits = lookups - misses; + seq_printf(seq, "%u %u %u %u %u %u\n", lookups, + hits, misses, st->allocations, st->reclaims, st->frees); + } return 0; } -- cgit v1.1 From c0e299b1a91cbdb21ae08e382a4176200398bc36 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 20 May 2011 10:50:52 +0200 Subject: clockevents/source: Use u64 to make 32bit happy unsigned long is not 64bit on 32bit machine. Signed-off-by: Thomas Gleixner --- kernel/time/clockevents.c | 2 +- kernel/time/clocksource.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 22a9da9..c027d4f 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -197,7 +197,7 @@ EXPORT_SYMBOL_GPL(clockevents_register_device); static void clockevents_config(struct clock_event_device *dev, u32 freq) { - unsigned long sec; + u64 sec; if (!(dev->features & CLOCK_EVT_FEAT_ONESHOT)) return; diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index d9d5f8c..1c95fd6 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -639,7 +639,7 @@ static void clocksource_enqueue(struct clocksource *cs) */ void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq) { - unsigned long sec; + u64 sec; /* * Calc the maximum number of seconds which we can run before -- cgit v1.1 From bbe7b8bef48c567f5ff3f6041c1fb011292e8f12 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 20 May 2011 11:38:24 +0200 Subject: MAINTAINERS: Add drivers/clocksource to TIMEKEEPING It's not a random dump ground and we care about it. Signed-off-by: Thomas Gleixner Cc: Ralf Baechle --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 8df8d2d..e20c974 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5431,6 +5431,7 @@ F: include/linux/timex.h F: kernel/time/clocksource.c F: kernel/time/time*.c F: kernel/time/ntp.c +F: drivers/clocksource TLG2300 VIDEO4LINUX-2 DRIVER M: Huang Shijie -- cgit v1.1