Merge branch 'for-linus' of git://git390.marist.edu/pub/scm/linux-2.6
* 'for-linus' of git://git390.marist.edu/pub/scm/linux-2.6:
[S390] signal: use set_restore_sigmask() helper
[S390] smp: remove pointless comments in startup_secondary()
[S390] qdio: Use kstrtoul_from_user
[S390] sclp_async: Use kstrtoul_from_user
[S390] exec: remove redundant set_fs(USER_DS)
[S390] cpu hotplug: on cpu start wait until being marked active
[S390] signal: convert to use set_current_blocked()
[S390] asm offsets: fix coding style
[S390] Add support for IBM zEnterprise 114
[S390] dasd: check if raw track access is supported
[S390] Use diagnose 308 for system reset
[S390] Export store_status() function
[S390] dasd: use vmalloc for statistics input buffer
[S390] Add PSW restart shutdown trigger
[S390] missing return in page_table_alloc_pgste
[S390] qdio: 2nd stage retry on SIGA-W busy conditions
diff --git a/Documentation/acpi/apei/einj.txt b/Documentation/acpi/apei/einj.txt
index dfab718..5cc699b 100644
--- a/Documentation/acpi/apei/einj.txt
+++ b/Documentation/acpi/apei/einj.txt
@@ -48,12 +48,19 @@
- param1
This file is used to set the first error parameter value. Effect of
parameter depends on error_type specified. For memory error, this is
- physical memory address.
+ physical memory address. Only available if param_extension module
+ parameter is specified.
- param2
This file is used to set the second error parameter value. Effect of
parameter depends on error_type specified. For memory error, this is
- physical memory address mask.
+ physical memory address mask. Only available if param_extension
+ module parameter is specified.
+
+Injecting parameter support is a BIOS version specific extension, that
+is, it only works on some BIOS version. If you want to use it, please
+make sure your BIOS version has the proper support and specify
+"param_extension=y" in module parameter.
For more information about EINJ, please refer to ACPI specification
version 4.0, section 17.5.
diff --git a/Documentation/devicetree/bindings/gpio/gpio_keys.txt b/Documentation/devicetree/bindings/gpio/gpio_keys.txt
index 7190c99..5c2c021 100644
--- a/Documentation/devicetree/bindings/gpio/gpio_keys.txt
+++ b/Documentation/devicetree/bindings/gpio/gpio_keys.txt
@@ -10,7 +10,7 @@
Each button (key) is represented as a sub-node of "gpio-keys":
Subnode properties:
- - gpios: OF devcie-tree gpio specificatin.
+ - gpios: OF device-tree gpio specification.
- label: Descriptive name of the key.
- linux,code: Keycode to emit.
diff --git a/Documentation/devicetree/bindings/input/fsl-mma8450.txt b/Documentation/devicetree/bindings/input/fsl-mma8450.txt
new file mode 100644
index 0000000..a00c94c
--- /dev/null
+++ b/Documentation/devicetree/bindings/input/fsl-mma8450.txt
@@ -0,0 +1,11 @@
+* Freescale MMA8450 3-Axis Accelerometer
+
+Required properties:
+- compatible : "fsl,mma8450".
+
+Example:
+
+accelerometer: mma8450@1c {
+ compatible = "fsl,mma8450";
+ reg = <0x1c>;
+};
diff --git a/Documentation/fault-injection/fault-injection.txt b/Documentation/fault-injection/fault-injection.txt
index 7be15e4..82a5d25 100644
--- a/Documentation/fault-injection/fault-injection.txt
+++ b/Documentation/fault-injection/fault-injection.txt
@@ -143,8 +143,7 @@
failslab, fail_page_alloc, and fail_make_request use this way.
Helper functions:
- init_fault_attr_dentries(entries, attr, name);
- void cleanup_fault_attr_dentries(entries);
+ fault_create_debugfs_attr(name, parent, attr);
- module parameters
diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index ea0bace..43f4809 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -296,15 +296,6 @@
---------------------------
-What: CONFIG_THERMAL_HWMON
-When: January 2009
-Why: This option was introduced just to allow older lm-sensors userspace
- to keep working over the upgrade to 2.6.26. At the scheduled time of
- removal fixed lm-sensors (2.x or 3.x) should be readily available.
-Who: Rene Herman <rene.herman@gmail.com>
-
----------------------------
-
What: Code that is now under CONFIG_WIRELESS_EXT_SYSFS
(in net/core/net-sysfs.c)
When: After the only user (hal) has seen a release with the patches
diff --git a/Documentation/frv/booting.txt b/Documentation/frv/booting.txt
index ace200b..37c4d84 100644
--- a/Documentation/frv/booting.txt
+++ b/Documentation/frv/booting.txt
@@ -106,13 +106,20 @@
To use the first on-chip serial port at baud rate 115200, no parity, 8
bits, and no flow control.
- (*) root=/dev/<xxxx>
+ (*) root=<xxxx>
- This specifies the device upon which the root filesystem resides. For
- example:
+ This specifies the device upon which the root filesystem resides. It
+ may be specified by major and minor number, device path, or even
+ partition uuid, if supported. For example:
/dev/nfs NFS root filesystem
/dev/mtdblock3 Fourth RedBoot partition on the System Flash
+ PARTUUID=00112233-4455-6677-8899-AABBCCDDEEFF/PARTNROFF=1
+ first partition after the partition with the given UUID
+ 253:0 Device with major 253 and minor 0
+
+ Authoritative information can be found in
+ "Documentation/kernel-parameters.txt".
(*) rw
diff --git a/Documentation/ioctl/ioctl-number.txt b/Documentation/ioctl/ioctl-number.txt
index 72ba8d5..845a191 100644
--- a/Documentation/ioctl/ioctl-number.txt
+++ b/Documentation/ioctl/ioctl-number.txt
@@ -292,6 +292,7 @@
<mailto:buk@buks.ipn.de>
0xA0 all linux/sdp/sdp.h Industrial Device Project
<mailto:kenji@bitgate.com>
+0xA2 00-0F arch/tile/include/asm/hardwall.h
0xA3 80-8F Port ACL in development:
<mailto:tlewis@mindspring.com>
0xA3 90-9F linux/dtlk.h
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 26a8374..e279b72 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -163,6 +163,11 @@
See also Documentation/power/pm.txt, pci=noacpi
+ acpi_rsdp= [ACPI,EFI,KEXEC]
+ Pass the RSDP address to the kernel, mostly used
+ on machines running EFI runtime service to boot the
+ second kernel for kdump.
+
acpi_apic_instance= [ACPI, IOAPIC]
Format: <int>
2: use 2nd APIC table, if available
@@ -546,6 +551,9 @@
/proc/<pid>/coredump_filter.
See also Documentation/filesystems/proc.txt.
+ cpuidle.off=1 [CPU_IDLE]
+ disable the cpuidle sub-system
+
cpcihp_generic= [HW,PCI] Generic port I/O CompactPCI driver
Format:
<first_slot>,<last_slot>,<port>,<enum_bit>[,<debug>]
@@ -2240,6 +2248,7 @@
ro [KNL] Mount root device read-only on boot
root= [KNL] Root filesystem
+ See name_to_dev_t comment in init/do_mounts.c.
rootdelay= [KNL] Delay (in seconds) to pause before attempting to
mount the root filesystem
diff --git a/Documentation/m68k/kernel-options.txt b/Documentation/m68k/kernel-options.txt
index c93bed6..97d45f2 100644
--- a/Documentation/m68k/kernel-options.txt
+++ b/Documentation/m68k/kernel-options.txt
@@ -129,6 +129,20 @@
the first of these. You can find out all valid major numbers by
looking into include/linux/major.h.
+In addition to major and minor numbers, if the device containing your
+root partition uses a partition table format with unique partition
+identifiers, then you may use them. For instance,
+"root=PARTUUID=00112233-4455-6677-8899-AABBCCDDEEFF". It is also
+possible to reference another partition on the same device using a
+known partition UUID as the starting point. For example,
+if partition 5 of the device has the UUID of
+00112233-4455-6677-8899-AABBCCDDEEFF then partition 3 may be found as
+follows:
+ PARTUUID=00112233-4455-6677-8899-AABBCCDDEEFF/PARTNROFF=-2
+
+Authoritative information can be found in
+"Documentation/kernel-parameters.txt".
+
2.2) ro, rw
-----------
diff --git a/MAINTAINERS b/MAINTAINERS
index 0d2fcda..07cfd8d 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3367,6 +3367,12 @@
F: drivers/net/ixgbe/
F: drivers/net/ixgbevf/
+INTEL MRST PMU DRIVER
+M: Len Brown <len.brown@intel.com>
+L: linux-pm@lists.linux-foundation.org
+S: Supported
+F: arch/x86/platform/mrst/pmu.*
+
INTEL PRO/WIRELESS 2100 NETWORK CONNECTION SUPPORT
L: linux-wireless@vger.kernel.org
S: Orphan
@@ -6319,6 +6325,7 @@
TARGET SUBSYSTEM
M: Nicholas A. Bellinger <nab@linux-iscsi.org>
L: linux-scsi@vger.kernel.org
+L: target-devel@vger.kernel.org
L: http://groups.google.com/group/linux-iscsi-target-dev
W: http://www.linux-iscsi.org
T: git git://git.kernel.org/pub/scm/linux/kernel/git/nab/lio-core-2.6.git master
diff --git a/arch/Kconfig b/arch/Kconfig
index 26b0e23..4b0669c 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -178,4 +178,7 @@
config HAVE_RCU_TABLE_FREE
bool
+config ARCH_HAVE_NMI_SAFE_CMPXCHG
+ bool
+
source "kernel/gcov/Kconfig"
diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig
index ca2da8d..60cde53 100644
--- a/arch/alpha/Kconfig
+++ b/arch/alpha/Kconfig
@@ -14,6 +14,7 @@
select AUTO_IRQ_AFFINITY if SMP
select GENERIC_IRQ_SHOW
select ARCH_WANT_OPTIONAL_GPIOLIB
+ select ARCH_HAVE_NMI_SAFE_CMPXCHG
help
The Alpha is a 64-bit general-purpose processor designed and
marketed by the Digital Equipment Corporation of blessed memory,
diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c
index 5e1e541..d7ee0d4 100644
--- a/arch/arm/kernel/process.c
+++ b/arch/arm/kernel/process.c
@@ -30,6 +30,7 @@
#include <linux/uaccess.h>
#include <linux/random.h>
#include <linux/hw_breakpoint.h>
+#include <linux/cpuidle.h>
#include <asm/cacheflush.h>
#include <asm/leds.h>
@@ -196,7 +197,8 @@
cpu_relax();
} else {
stop_critical_timings();
- pm_idle();
+ if (cpuidle_call_idle())
+ pm_idle();
start_critical_timings();
/*
* This will eventually be removed - pm_idle
diff --git a/arch/avr32/Kconfig b/arch/avr32/Kconfig
index e9d689b..197e96f 100644
--- a/arch/avr32/Kconfig
+++ b/arch/avr32/Kconfig
@@ -10,6 +10,7 @@
select GENERIC_IRQ_PROBE
select HARDIRQS_SW_RESEND
select GENERIC_IRQ_SHOW
+ select ARCH_HAVE_NMI_SAFE_CMPXCHG
help
AVR32 is a high-performance 32-bit RISC microprocessor core,
designed for cost-sensitive embedded applications, with particular
diff --git a/arch/cris/arch-v10/drivers/sync_serial.c b/arch/cris/arch-v10/drivers/sync_serial.c
index 85026537..466af40 100644
--- a/arch/cris/arch-v10/drivers/sync_serial.c
+++ b/arch/cris/arch-v10/drivers/sync_serial.c
@@ -158,7 +158,7 @@
static int sync_serial_release(struct inode *inode, struct file *file);
static unsigned int sync_serial_poll(struct file *filp, poll_table *wait);
-static int sync_serial_ioctl(struct file *file,
+static long sync_serial_ioctl(struct file *file,
unsigned int cmd, unsigned long arg);
static ssize_t sync_serial_write(struct file *file, const char *buf,
size_t count, loff_t *ppos);
@@ -625,11 +625,11 @@
*R_IRQ_MASK1_SET = 1 << port->data_avail_bit;
DEBUG(printk(KERN_DEBUG "sser%d rec started\n", dev));
}
- ret = 0;
+ err = 0;
out:
mutex_unlock(&sync_serial_mutex);
- return ret;
+ return err;
}
static int sync_serial_release(struct inode *inode, struct file *file)
diff --git a/arch/cris/arch-v10/kernel/irq.c b/arch/cris/arch-v10/kernel/irq.c
index 907cfb5..ba0e596 100644
--- a/arch/cris/arch-v10/kernel/irq.c
+++ b/arch/cris/arch-v10/kernel/irq.c
@@ -20,6 +20,9 @@
#define crisv10_mask_irq(irq_nr) (*R_VECT_MASK_CLR = 1 << (irq_nr));
#define crisv10_unmask_irq(irq_nr) (*R_VECT_MASK_SET = 1 << (irq_nr));
+extern void kgdb_init(void);
+extern void breakpoint(void);
+
/* don't use set_int_vector, it bypasses the linux interrupt handlers. it is
* global just so that the kernel gdb can use it.
*/
diff --git a/arch/cris/include/asm/thread_info.h b/arch/cris/include/asm/thread_info.h
index 29b74a1..332f19c 100644
--- a/arch/cris/include/asm/thread_info.h
+++ b/arch/cris/include/asm/thread_info.h
@@ -11,8 +11,6 @@
#ifdef __KERNEL__
-#define __HAVE_ARCH_THREAD_INFO_ALLOCATOR
-
#ifndef __ASSEMBLY__
#include <asm/types.h>
#include <asm/processor.h>
@@ -67,8 +65,10 @@
#define init_thread_info (init_thread_union.thread_info)
+#define __HAVE_ARCH_THREAD_INFO_ALLOCATOR
/* thread information allocation */
-#define alloc_thread_info(tsk, node) ((struct thread_info *) __get_free_pages(GFP_KERNEL,1))
+#define alloc_thread_info_node(tsk, node) \
+ ((struct thread_info *) __get_free_pages(GFP_KERNEL, 1))
#define free_thread_info(ti) free_pages((unsigned long) (ti), 1)
#endif /* !__ASSEMBLY__ */
diff --git a/arch/frv/Kconfig b/arch/frv/Kconfig
index cb884e4..bad27a6 100644
--- a/arch/frv/Kconfig
+++ b/arch/frv/Kconfig
@@ -7,6 +7,7 @@
select HAVE_PERF_EVENTS
select HAVE_GENERIC_HARDIRQS
select GENERIC_IRQ_SHOW
+ select ARCH_HAVE_NMI_SAFE_CMPXCHG
config ZONE_DMA
bool
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index 64c7ab7..1248547 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -28,6 +28,7 @@
select IRQ_PER_CPU
select GENERIC_IRQ_SHOW
select ARCH_WANT_OPTIONAL_GPIOLIB
+ select ARCH_HAVE_NMI_SAFE_CMPXCHG
default y
help
The Itanium Processor Family is Intel's 64-bit successor to
diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig
index 284cd37..9e8ee9d 100644
--- a/arch/m68k/Kconfig
+++ b/arch/m68k/Kconfig
@@ -6,6 +6,7 @@
select GENERIC_ATOMIC64 if MMU
select HAVE_GENERIC_HARDIRQS if !MMU
select GENERIC_IRQ_SHOW if !MMU
+ select ARCH_HAVE_NMI_SAFE_CMPXCHG if RMW_INSNS
config RWSEM_GENERIC_SPINLOCK
bool
diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig
index 65adc86..e077b0b 100644
--- a/arch/parisc/Kconfig
+++ b/arch/parisc/Kconfig
@@ -15,6 +15,7 @@
select HAVE_GENERIC_HARDIRQS
select GENERIC_IRQ_PROBE
select IRQ_PER_CPU
+ select ARCH_HAVE_NMI_SAFE_CMPXCHG
help
The PA-RISC microprocessor is designed by Hewlett-Packard and used
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 374c475..6926b61 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -136,6 +136,7 @@
select HAVE_SYSCALL_TRACEPOINTS
select HAVE_BPF_JIT if (PPC64 && NET)
select HAVE_ARCH_JUMP_LABEL
+ select ARCH_HAVE_NMI_SAFE_CMPXCHG
config EARLY_PRINTK
bool
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 17dbb43..ed5cb5a 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -81,6 +81,7 @@
select INIT_ALL_POSSIBLE
select HAVE_IRQ_WORK
select HAVE_PERF_EVENTS
+ select ARCH_HAVE_NMI_SAFE_CMPXCHG
select HAVE_KERNEL_GZIP
select HAVE_KERNEL_BZIP2
select HAVE_KERNEL_LZMA
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 748ff19..ff9177c 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -11,6 +11,7 @@
select HAVE_DMA_ATTRS
select HAVE_IRQ_WORK
select HAVE_PERF_EVENTS
+ select ARCH_HAVE_NMI_SAFE_CMPXCHG if (GUSA_RB || CPU_SH4A)
select PERF_USE_VMALLOC
select HAVE_KERNEL_GZIP
select HAVE_KERNEL_BZIP2
diff --git a/arch/sh/kernel/idle.c b/arch/sh/kernel/idle.c
index 84db0d6..3c45de1 100644
--- a/arch/sh/kernel/idle.c
+++ b/arch/sh/kernel/idle.c
@@ -16,12 +16,13 @@
#include <linux/thread_info.h>
#include <linux/irqflags.h>
#include <linux/smp.h>
+#include <linux/cpuidle.h>
#include <asm/pgalloc.h>
#include <asm/system.h>
#include <linux/atomic.h>
#include <asm/smp.h>
-void (*pm_idle)(void) = NULL;
+static void (*pm_idle)(void);
static int hlt_counter;
@@ -100,7 +101,8 @@
local_irq_disable();
/* Don't trace irqs off for idle */
stop_critical_timings();
- pm_idle();
+ if (cpuidle_call_idle())
+ pm_idle();
/*
* Sanity check to ensure that pm_idle() returns
* with IRQs enabled
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index 1074ddd..42c67be 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -54,6 +54,7 @@
select HAVE_PERF_EVENTS
select PERF_USE_VMALLOC
select IRQ_PREFLOW_FASTEOI
+ select ARCH_HAVE_NMI_SAFE_CMPXCHG
config ARCH_DEFCONFIG
string
diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig
index 0249b8b..b30f71a 100644
--- a/arch/tile/Kconfig
+++ b/arch/tile/Kconfig
@@ -12,6 +12,7 @@
select GENERIC_PENDING_IRQ if SMP
select GENERIC_IRQ_SHOW
select SYS_HYPERVISOR
+ select ARCH_HAVE_NMI_SAFE_CMPXCHG if !M386
# FIXME: investigate whether we need/want these options.
# select HAVE_IOREMAP_PROT
diff --git a/arch/tile/include/asm/Kbuild b/arch/tile/include/asm/Kbuild
index 849ab2f..aec60dc 100644
--- a/arch/tile/include/asm/Kbuild
+++ b/arch/tile/include/asm/Kbuild
@@ -2,3 +2,41 @@
header-y += ucontext.h
header-y += hardwall.h
+
+generic-y += bug.h
+generic-y += bugs.h
+generic-y += cputime.h
+generic-y += device.h
+generic-y += div64.h
+generic-y += emergency-restart.h
+generic-y += errno.h
+generic-y += fb.h
+generic-y += fcntl.h
+generic-y += ioctl.h
+generic-y += ioctls.h
+generic-y += ipc.h
+generic-y += ipcbuf.h
+generic-y += irq_regs.h
+generic-y += kdebug.h
+generic-y += local.h
+generic-y += module.h
+generic-y += msgbuf.h
+generic-y += mutex.h
+generic-y += param.h
+generic-y += parport.h
+generic-y += poll.h
+generic-y += posix_types.h
+generic-y += resource.h
+generic-y += scatterlist.h
+generic-y += sembuf.h
+generic-y += serial.h
+generic-y += shmbuf.h
+generic-y += shmparam.h
+generic-y += socket.h
+generic-y += sockios.h
+generic-y += statfs.h
+generic-y += termbits.h
+generic-y += termios.h
+generic-y += types.h
+generic-y += ucontext.h
+generic-y += xor.h
diff --git a/arch/tile/include/asm/bug.h b/arch/tile/include/asm/bug.h
deleted file mode 100644
index b12fd89..0000000
--- a/arch/tile/include/asm/bug.h
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/bug.h>
diff --git a/arch/tile/include/asm/bugs.h b/arch/tile/include/asm/bugs.h
deleted file mode 100644
index 61791e1..0000000
--- a/arch/tile/include/asm/bugs.h
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/bugs.h>
diff --git a/arch/tile/include/asm/cputime.h b/arch/tile/include/asm/cputime.h
deleted file mode 100644
index 6d68ad7..0000000
--- a/arch/tile/include/asm/cputime.h
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/cputime.h>
diff --git a/arch/tile/include/asm/device.h b/arch/tile/include/asm/device.h
deleted file mode 100644
index f0a4c25..0000000
--- a/arch/tile/include/asm/device.h
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/device.h>
diff --git a/arch/tile/include/asm/div64.h b/arch/tile/include/asm/div64.h
deleted file mode 100644
index 6cd978c..0000000
--- a/arch/tile/include/asm/div64.h
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/div64.h>
diff --git a/arch/tile/include/asm/emergency-restart.h b/arch/tile/include/asm/emergency-restart.h
deleted file mode 100644
index 3711bd9..0000000
--- a/arch/tile/include/asm/emergency-restart.h
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/emergency-restart.h>
diff --git a/arch/tile/include/asm/errno.h b/arch/tile/include/asm/errno.h
deleted file mode 100644
index 4c82b50..0000000
--- a/arch/tile/include/asm/errno.h
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/errno.h>
diff --git a/arch/tile/include/asm/fb.h b/arch/tile/include/asm/fb.h
deleted file mode 100644
index 3a4988e..0000000
--- a/arch/tile/include/asm/fb.h
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/fb.h>
diff --git a/arch/tile/include/asm/fcntl.h b/arch/tile/include/asm/fcntl.h
deleted file mode 100644
index 46ab12d..0000000
--- a/arch/tile/include/asm/fcntl.h
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/fcntl.h>
diff --git a/arch/tile/include/asm/fixmap.h b/arch/tile/include/asm/fixmap.h
index 51537ff..c66f793 100644
--- a/arch/tile/include/asm/fixmap.h
+++ b/arch/tile/include/asm/fixmap.h
@@ -75,12 +75,6 @@
#define set_fixmap(idx, phys) \
__set_fixmap(idx, phys, PAGE_KERNEL)
-/*
- * Some hardware wants to get fixmapped without caching.
- */
-#define set_fixmap_nocache(idx, phys) \
- __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE)
-
#define clear_fixmap(idx) \
__set_fixmap(idx, 0, __pgprot(0))
diff --git a/arch/tile/include/asm/ioctl.h b/arch/tile/include/asm/ioctl.h
deleted file mode 100644
index b279fe0..0000000
--- a/arch/tile/include/asm/ioctl.h
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/ioctl.h>
diff --git a/arch/tile/include/asm/ioctls.h b/arch/tile/include/asm/ioctls.h
deleted file mode 100644
index ec34c76..0000000
--- a/arch/tile/include/asm/ioctls.h
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/ioctls.h>
diff --git a/arch/tile/include/asm/ipc.h b/arch/tile/include/asm/ipc.h
deleted file mode 100644
index a46e3d9..0000000
--- a/arch/tile/include/asm/ipc.h
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/ipc.h>
diff --git a/arch/tile/include/asm/ipcbuf.h b/arch/tile/include/asm/ipcbuf.h
deleted file mode 100644
index 84c7e51..0000000
--- a/arch/tile/include/asm/ipcbuf.h
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/ipcbuf.h>
diff --git a/arch/tile/include/asm/irq_regs.h b/arch/tile/include/asm/irq_regs.h
deleted file mode 100644
index 3dd9c0b..0000000
--- a/arch/tile/include/asm/irq_regs.h
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/irq_regs.h>
diff --git a/arch/tile/include/asm/kdebug.h b/arch/tile/include/asm/kdebug.h
deleted file mode 100644
index 6ece1b0..0000000
--- a/arch/tile/include/asm/kdebug.h
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/kdebug.h>
diff --git a/arch/tile/include/asm/local.h b/arch/tile/include/asm/local.h
deleted file mode 100644
index c11c530..0000000
--- a/arch/tile/include/asm/local.h
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/local.h>
diff --git a/arch/tile/include/asm/module.h b/arch/tile/include/asm/module.h
deleted file mode 100644
index 1e4b79f..0000000
--- a/arch/tile/include/asm/module.h
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/module.h>
diff --git a/arch/tile/include/asm/msgbuf.h b/arch/tile/include/asm/msgbuf.h
deleted file mode 100644
index 809134c..0000000
--- a/arch/tile/include/asm/msgbuf.h
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/msgbuf.h>
diff --git a/arch/tile/include/asm/mutex.h b/arch/tile/include/asm/mutex.h
deleted file mode 100644
index ff6101a..0000000
--- a/arch/tile/include/asm/mutex.h
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/mutex-dec.h>
diff --git a/arch/tile/include/asm/param.h b/arch/tile/include/asm/param.h
deleted file mode 100644
index 965d454..0000000
--- a/arch/tile/include/asm/param.h
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/param.h>
diff --git a/arch/tile/include/asm/parport.h b/arch/tile/include/asm/parport.h
deleted file mode 100644
index cf252af..0000000
--- a/arch/tile/include/asm/parport.h
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/parport.h>
diff --git a/arch/tile/include/asm/poll.h b/arch/tile/include/asm/poll.h
deleted file mode 100644
index c98509d..0000000
--- a/arch/tile/include/asm/poll.h
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/poll.h>
diff --git a/arch/tile/include/asm/posix_types.h b/arch/tile/include/asm/posix_types.h
deleted file mode 100644
index 22cae62..0000000
--- a/arch/tile/include/asm/posix_types.h
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/posix_types.h>
diff --git a/arch/tile/include/asm/resource.h b/arch/tile/include/asm/resource.h
deleted file mode 100644
index 04bc4db..0000000
--- a/arch/tile/include/asm/resource.h
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/resource.h>
diff --git a/arch/tile/include/asm/scatterlist.h b/arch/tile/include/asm/scatterlist.h
deleted file mode 100644
index 35d786f..0000000
--- a/arch/tile/include/asm/scatterlist.h
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/scatterlist.h>
diff --git a/arch/tile/include/asm/sembuf.h b/arch/tile/include/asm/sembuf.h
deleted file mode 100644
index 7673b83..0000000
--- a/arch/tile/include/asm/sembuf.h
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/sembuf.h>
diff --git a/arch/tile/include/asm/serial.h b/arch/tile/include/asm/serial.h
deleted file mode 100644
index a0cb0caf..0000000
--- a/arch/tile/include/asm/serial.h
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/serial.h>
diff --git a/arch/tile/include/asm/shmbuf.h b/arch/tile/include/asm/shmbuf.h
deleted file mode 100644
index 83c05fc..0000000
--- a/arch/tile/include/asm/shmbuf.h
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/shmbuf.h>
diff --git a/arch/tile/include/asm/shmparam.h b/arch/tile/include/asm/shmparam.h
deleted file mode 100644
index 93f30de..0000000
--- a/arch/tile/include/asm/shmparam.h
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/shmparam.h>
diff --git a/arch/tile/include/asm/socket.h b/arch/tile/include/asm/socket.h
deleted file mode 100644
index 6b71384..0000000
--- a/arch/tile/include/asm/socket.h
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/socket.h>
diff --git a/arch/tile/include/asm/sockios.h b/arch/tile/include/asm/sockios.h
deleted file mode 100644
index def6d47..0000000
--- a/arch/tile/include/asm/sockios.h
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/sockios.h>
diff --git a/arch/tile/include/asm/statfs.h b/arch/tile/include/asm/statfs.h
deleted file mode 100644
index 0b91fe1..0000000
--- a/arch/tile/include/asm/statfs.h
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/statfs.h>
diff --git a/arch/tile/include/asm/termbits.h b/arch/tile/include/asm/termbits.h
deleted file mode 100644
index 3935b10..0000000
--- a/arch/tile/include/asm/termbits.h
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/termbits.h>
diff --git a/arch/tile/include/asm/termios.h b/arch/tile/include/asm/termios.h
deleted file mode 100644
index 280d78a..0000000
--- a/arch/tile/include/asm/termios.h
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/termios.h>
diff --git a/arch/tile/include/asm/types.h b/arch/tile/include/asm/types.h
deleted file mode 100644
index b9e79bc..0000000
--- a/arch/tile/include/asm/types.h
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/types.h>
diff --git a/arch/tile/include/asm/ucontext.h b/arch/tile/include/asm/ucontext.h
deleted file mode 100644
index 9bc07b9..0000000
--- a/arch/tile/include/asm/ucontext.h
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/ucontext.h>
diff --git a/arch/tile/include/asm/xor.h b/arch/tile/include/asm/xor.h
deleted file mode 100644
index c82eb12..0000000
--- a/arch/tile/include/asm/xor.h
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/xor.h>
diff --git a/arch/tile/include/hv/drv_srom_intf.h b/arch/tile/include/hv/drv_srom_intf.h
new file mode 100644
index 0000000..6395faa
--- /dev/null
+++ b/arch/tile/include/hv/drv_srom_intf.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright 2011 Tilera Corporation. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for
+ * more details.
+ */
+
+/**
+ * @file drv_srom_intf.h
+ * Interface definitions for the SPI Flash ROM driver.
+ */
+
+#ifndef _SYS_HV_INCLUDE_DRV_SROM_INTF_H
+#define _SYS_HV_INCLUDE_DRV_SROM_INTF_H
+
+/** Read this offset to get the total device size. */
+#define SROM_TOTAL_SIZE_OFF 0xF0000000
+
+/** Read this offset to get the device sector size. */
+#define SROM_SECTOR_SIZE_OFF 0xF0000004
+
+/** Read this offset to get the device page size. */
+#define SROM_PAGE_SIZE_OFF 0xF0000008
+
+/** Write this offset to flush any pending writes. */
+#define SROM_FLUSH_OFF 0xF1000000
+
+/** Write this offset, plus the byte offset of the start of a sector, to
+ * erase a sector. Any write data is ignored, but there must be at least
+ * one byte of write data. Only applies when the driver is in MTD mode.
+ */
+#define SROM_ERASE_OFF 0xF2000000
+
+#endif /* _SYS_HV_INCLUDE_DRV_SROM_INTF_H */
diff --git a/arch/tile/kernel/time.c b/arch/tile/kernel/time.c
index c4be58c..f6f50f2a 100644
--- a/arch/tile/kernel/time.c
+++ b/arch/tile/kernel/time.c
@@ -78,7 +78,6 @@
.rating = 300,
.read = clocksource_get_cycles,
.mask = CLOCKSOURCE_MASK(64),
- .shift = 22, /* typical value, e.g. x86 tsc uses this */
.flags = CLOCK_SOURCE_IS_CONTINUOUS,
};
@@ -91,8 +90,6 @@
cycles_per_sec = hv_sysconf(HV_SYSCONF_CPU_SPEED);
sched_clock_mult =
clocksource_hz2mult(cycles_per_sec, SCHED_CLOCK_SHIFT);
- cycle_counter_cs.mult =
- clocksource_hz2mult(cycles_per_sec, cycle_counter_cs.shift);
}
void __init calibrate_delay(void)
@@ -107,7 +104,7 @@
void __init time_init(void)
{
/* Initialize and register the clock source. */
- clocksource_register(&cycle_counter_cs);
+ clocksource_register_hz(&cycle_counter_cs, cycles_per_sec);
/* Start up the tile-timer interrupt source on the boot cpu. */
setup_tile_timer();
diff --git a/arch/tile/mm/init.c b/arch/tile/mm/init.c
index 4e10c40..7309988c 100644
--- a/arch/tile/mm/init.c
+++ b/arch/tile/mm/init.c
@@ -836,8 +836,7 @@
#endif
#ifdef CONFIG_FLATMEM
- if (!mem_map)
- BUG();
+ BUG_ON(!mem_map);
#endif
#ifdef CONFIG_HIGHMEM
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 7cf916f..6a47bb2 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -72,6 +72,7 @@
select USE_GENERIC_SMP_HELPERS if SMP
select HAVE_BPF_JIT if (X86_64 && NET)
select CLKEVT_I8253
+ select ARCH_HAVE_NMI_SAFE_CMPXCHG
config INSTRUCTION_DECODER
def_bool (KPROBES || PERF_EVENTS)
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index d02804d..d8e8eef 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -40,8 +40,6 @@
#include <linux/compiler.h>
#include <asm/page.h>
-#include <xen/xen.h>
-
#define build_mmio_read(name, size, type, reg, barrier) \
static inline type name(const volatile void __iomem *addr) \
{ type ret; asm volatile("mov" size " %1,%0":reg (ret) \
@@ -334,6 +332,7 @@
extern bool is_early_ioremap_ptep(pte_t *ptep);
#ifdef CONFIG_XEN
+#include <xen/xen.h>
struct bio_vec;
extern bool xen_biovec_phys_mergeable(const struct bio_vec *vec1,
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 2193715..0d1171c 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -751,8 +751,6 @@
:: "a" (eax), "c" (ecx));
}
-extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx);
-
extern void select_idle_routine(const struct cpuinfo_x86 *c);
extern void init_amd_e400_c1e_mask(void);
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c
index 5812404..f50e7fb 100644
--- a/arch/x86/kernel/acpi/cstate.c
+++ b/arch/x86/kernel/acpi/cstate.c
@@ -149,6 +149,29 @@
}
EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_probe);
+/*
+ * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
+ * which can obviate IPI to trigger checking of need_resched.
+ * We execute MONITOR against need_resched and enter optimized wait state
+ * through MWAIT. Whenever someone changes need_resched, we would be woken
+ * up from MWAIT (without an IPI).
+ *
+ * New with Core Duo processors, MWAIT can take some hints based on CPU
+ * capability.
+ */
+void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
+{
+ if (!need_resched()) {
+ if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR))
+ clflush((void *)¤t_thread_info()->flags);
+
+ __monitor((void *)¤t_thread_info()->flags, 0, 0);
+ smp_mb();
+ if (!need_resched())
+ __mwait(ax, cx);
+ }
+}
+
void acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cx)
{
unsigned int cpu = smp_processor_id();
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index e1ba8cb2..e7e3b01 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -438,29 +438,6 @@
}
EXPORT_SYMBOL_GPL(cpu_idle_wait);
-/*
- * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
- * which can obviate IPI to trigger checking of need_resched.
- * We execute MONITOR against need_resched and enter optimized wait state
- * through MWAIT. Whenever someone changes need_resched, we would be woken
- * up from MWAIT (without an IPI).
- *
- * New with Core Duo processors, MWAIT can take some hints based on CPU
- * capability.
- */
-void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
-{
- if (!need_resched()) {
- if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR))
- clflush((void *)¤t_thread_info()->flags);
-
- __monitor((void *)¤t_thread_info()->flags, 0, 0);
- smp_mb();
- if (!need_resched())
- __mwait(ax, cx);
- }
-}
-
/* Default MONITOR/MWAIT with no hints, used for default C1 state */
static void mwait_idle(void)
{
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index a3d0dc5..7a3b651 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -38,6 +38,7 @@
#include <linux/uaccess.h>
#include <linux/io.h>
#include <linux/kdebug.h>
+#include <linux/cpuidle.h>
#include <asm/pgtable.h>
#include <asm/system.h>
@@ -109,7 +110,8 @@
local_irq_disable();
/* Don't trace irqs off for idle */
stop_critical_timings();
- pm_idle();
+ if (cpuidle_idle_call())
+ pm_idle();
start_critical_timings();
}
tick_nohz_restart_sched_tick();
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index ca6f7ab..f693e44 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -37,6 +37,7 @@
#include <linux/uaccess.h>
#include <linux/io.h>
#include <linux/ftrace.h>
+#include <linux/cpuidle.h>
#include <asm/pgtable.h>
#include <asm/system.h>
@@ -136,7 +137,8 @@
enter_idle();
/* Don't trace irqs off for idle */
stop_critical_timings();
- pm_idle();
+ if (cpuidle_idle_call())
+ pm_idle();
start_critical_timings();
/* In many cases the interrupt that ended idle
diff --git a/arch/x86/platform/mrst/Makefile b/arch/x86/platform/mrst/Makefile
index f61ccdd..1ea3877 100644
--- a/arch/x86/platform/mrst/Makefile
+++ b/arch/x86/platform/mrst/Makefile
@@ -1,3 +1,4 @@
obj-$(CONFIG_X86_MRST) += mrst.o
obj-$(CONFIG_X86_MRST) += vrtc.o
obj-$(CONFIG_EARLY_PRINTK_MRST) += early_printk_mrst.o
+obj-$(CONFIG_X86_MRST) += pmu.o
diff --git a/arch/x86/platform/mrst/pmu.c b/arch/x86/platform/mrst/pmu.c
new file mode 100644
index 0000000..9281da7
--- /dev/null
+++ b/arch/x86/platform/mrst/pmu.c
@@ -0,0 +1,817 @@
+/*
+ * mrst/pmu.c - driver for MRST Power Management Unit
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <linux/cpuidle.h>
+#include <linux/debugfs.h>
+#include <linux/delay.h>
+#include <linux/interrupt.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/seq_file.h>
+#include <linux/sfi.h>
+#include <asm/intel_scu_ipc.h>
+#include "pmu.h"
+
+#define IPCMSG_FW_REVISION 0xF4
+
+struct mrst_device {
+ u16 pci_dev_num; /* DEBUG only */
+ u16 lss;
+ u16 latest_request;
+ unsigned int pci_state_counts[PCI_D3cold + 1]; /* DEBUG only */
+};
+
+/*
+ * comlete list of MRST PCI devices
+ */
+static struct mrst_device mrst_devs[] = {
+/* 0 */ { 0x0800, LSS_SPI0 }, /* Moorestown SPI Ctrl 0 */
+/* 1 */ { 0x0801, LSS_SPI1 }, /* Moorestown SPI Ctrl 1 */
+/* 2 */ { 0x0802, LSS_I2C0 }, /* Moorestown I2C 0 */
+/* 3 */ { 0x0803, LSS_I2C1 }, /* Moorestown I2C 1 */
+/* 4 */ { 0x0804, LSS_I2C2 }, /* Moorestown I2C 2 */
+/* 5 */ { 0x0805, LSS_KBD }, /* Moorestown Keyboard Ctrl */
+/* 6 */ { 0x0806, LSS_USB_HC }, /* Moorestown USB Ctrl */
+/* 7 */ { 0x0807, LSS_SD_HC0 }, /* Moorestown SD Host Ctrl 0 */
+/* 8 */ { 0x0808, LSS_SD_HC1 }, /* Moorestown SD Host Ctrl 1 */
+/* 9 */ { 0x0809, LSS_NAND }, /* Moorestown NAND Ctrl */
+/* 10 */ { 0x080a, LSS_AUDIO }, /* Moorestown Audio Ctrl */
+/* 11 */ { 0x080b, LSS_IMAGING }, /* Moorestown ISP */
+/* 12 */ { 0x080c, LSS_SECURITY }, /* Moorestown Security Controller */
+/* 13 */ { 0x080d, LSS_DISPLAY }, /* Moorestown External Displays */
+/* 14 */ { 0x080e, 0 }, /* Moorestown SCU IPC */
+/* 15 */ { 0x080f, LSS_GPIO }, /* Moorestown GPIO Controller */
+/* 16 */ { 0x0810, 0 }, /* Moorestown Power Management Unit */
+/* 17 */ { 0x0811, LSS_USB_OTG }, /* Moorestown OTG Ctrl */
+/* 18 */ { 0x0812, LSS_SPI2 }, /* Moorestown SPI Ctrl 2 */
+/* 19 */ { 0x0813, 0 }, /* Moorestown SC DMA */
+/* 20 */ { 0x0814, LSS_AUDIO_LPE }, /* Moorestown LPE DMA */
+/* 21 */ { 0x0815, LSS_AUDIO_SSP }, /* Moorestown SSP0 */
+
+/* 22 */ { 0x084F, LSS_SD_HC2 }, /* Moorestown SD Host Ctrl 2 */
+
+/* 23 */ { 0x4102, 0 }, /* Lincroft */
+/* 24 */ { 0x4110, 0 }, /* Lincroft */
+};
+
+/* n.b. We ignore PCI-id 0x815 in LSS9 b/c MeeGo has no driver for it */
+static u16 mrst_lss9_pci_ids[] = {0x080a, 0x0814, 0};
+static u16 mrst_lss10_pci_ids[] = {0x0800, 0x0801, 0x0802, 0x0803,
+ 0x0804, 0x0805, 0x080f, 0};
+
+/* handle concurrent SMP invokations of pmu_pci_set_power_state() */
+static spinlock_t mrst_pmu_power_state_lock;
+
+static unsigned int wake_counters[MRST_NUM_LSS]; /* DEBUG only */
+static unsigned int pmu_irq_stats[INT_INVALID + 1]; /* DEBUG only */
+
+static int graphics_is_off;
+static int lss_s0i3_enabled;
+static bool mrst_pmu_s0i3_enable;
+
+/* debug counters */
+static u32 pmu_wait_ready_calls;
+static u32 pmu_wait_ready_udelays;
+static u32 pmu_wait_ready_udelays_max;
+static u32 pmu_wait_done_calls;
+static u32 pmu_wait_done_udelays;
+static u32 pmu_wait_done_udelays_max;
+static u32 pmu_set_power_state_entry;
+static u32 pmu_set_power_state_send_cmd;
+
+static struct mrst_device *pci_id_2_mrst_dev(u16 pci_dev_num)
+{
+ int index = 0;
+
+ if ((pci_dev_num >= 0x0800) && (pci_dev_num <= 0x815))
+ index = pci_dev_num - 0x800;
+ else if (pci_dev_num == 0x084F)
+ index = 22;
+ else if (pci_dev_num == 0x4102)
+ index = 23;
+ else if (pci_dev_num == 0x4110)
+ index = 24;
+
+ if (pci_dev_num != mrst_devs[index].pci_dev_num) {
+ WARN_ONCE(1, FW_BUG "Unknown PCI device 0x%04X\n", pci_dev_num);
+ return 0;
+ }
+
+ return &mrst_devs[index];
+}
+
+/**
+ * mrst_pmu_validate_cstates
+ * @dev: cpuidle_device
+ *
+ * Certain states are not appropriate for governor to pick in some cases.
+ * This function will be called as cpuidle_device's prepare callback and
+ * thus tells governor to ignore such states when selecting the next state
+ * to enter.
+ */
+
+#define IDLE_STATE4_IS_C6 4
+#define IDLE_STATE5_IS_S0I3 5
+
+int mrst_pmu_invalid_cstates(void)
+{
+ int cpu = smp_processor_id();
+
+ /*
+ * Demote to C4 if the PMU is busy.
+ * Since LSS changes leave the busy bit clear...
+ * busy means either the PMU is waiting for an ACK-C6 that
+ * isn't coming due to an MWAIT that returned immediately;
+ * or we returned from S0i3 successfully, and the PMU
+ * is not done sending us interrupts.
+ */
+ if (pmu_read_busy_status())
+ return 1 << IDLE_STATE4_IS_C6 | 1 << IDLE_STATE5_IS_S0I3;
+
+ /*
+ * Disallow S0i3 if: PMU is not initialized, or CPU1 is active,
+ * or if device LSS is insufficient, or the GPU is active,
+ * or if it has been explicitly disabled.
+ */
+ if (!pmu_reg || !cpumask_equal(cpu_online_mask, cpumask_of(cpu)) ||
+ !lss_s0i3_enabled || !graphics_is_off || !mrst_pmu_s0i3_enable)
+ return 1 << IDLE_STATE5_IS_S0I3;
+ else
+ return 0;
+}
+
+/*
+ * pmu_update_wake_counters(): read PM_WKS, update wake_counters[]
+ * DEBUG only.
+ */
+static void pmu_update_wake_counters(void)
+{
+ int lss;
+ u32 wake_status;
+
+ wake_status = pmu_read_wks();
+
+ for (lss = 0; lss < MRST_NUM_LSS; ++lss) {
+ if (wake_status & (1 << lss))
+ wake_counters[lss]++;
+ }
+}
+
+int mrst_pmu_s0i3_entry(void)
+{
+ int status;
+
+ /* Clear any possible error conditions */
+ pmu_write_ics(0x300);
+
+ /* set wake control to current D-states */
+ pmu_write_wssc(S0I3_SSS_TARGET);
+
+ status = mrst_s0i3_entry(PM_S0I3_COMMAND, &pmu_reg->pm_cmd);
+ pmu_update_wake_counters();
+ return status;
+}
+
+/* poll for maximum of 5ms for busy bit to clear */
+static int pmu_wait_ready(void)
+{
+ int udelays;
+
+ pmu_wait_ready_calls++;
+
+ for (udelays = 0; udelays < 500; ++udelays) {
+ if (udelays > pmu_wait_ready_udelays_max)
+ pmu_wait_ready_udelays_max = udelays;
+
+ if (pmu_read_busy_status() == 0)
+ return 0;
+
+ udelay(10);
+ pmu_wait_ready_udelays++;
+ }
+
+ /*
+ * if this fires, observe
+ * /sys/kernel/debug/mrst_pmu_wait_ready_calls
+ * /sys/kernel/debug/mrst_pmu_wait_ready_udelays
+ */
+ WARN_ONCE(1, "SCU not ready for 5ms");
+ return -EBUSY;
+}
+/* poll for maximum of 50ms us for busy bit to clear */
+static int pmu_wait_done(void)
+{
+ int udelays;
+
+ pmu_wait_done_calls++;
+
+ for (udelays = 0; udelays < 500; ++udelays) {
+ if (udelays > pmu_wait_done_udelays_max)
+ pmu_wait_done_udelays_max = udelays;
+
+ if (pmu_read_busy_status() == 0)
+ return 0;
+
+ udelay(100);
+ pmu_wait_done_udelays++;
+ }
+
+ /*
+ * if this fires, observe
+ * /sys/kernel/debug/mrst_pmu_wait_done_calls
+ * /sys/kernel/debug/mrst_pmu_wait_done_udelays
+ */
+ WARN_ONCE(1, "SCU not done for 50ms");
+ return -EBUSY;
+}
+
+u32 mrst_pmu_msi_is_disabled(void)
+{
+ return pmu_msi_is_disabled();
+}
+
+void mrst_pmu_enable_msi(void)
+{
+ pmu_msi_enable();
+}
+
+/**
+ * pmu_irq - pmu driver interrupt handler
+ * Context: interrupt context
+ */
+static irqreturn_t pmu_irq(int irq, void *dummy)
+{
+ union pmu_pm_ics pmu_ics;
+
+ pmu_ics.value = pmu_read_ics();
+
+ if (!pmu_ics.bits.pending)
+ return IRQ_NONE;
+
+ switch (pmu_ics.bits.cause) {
+ case INT_SPURIOUS:
+ case INT_CMD_DONE:
+ case INT_CMD_ERR:
+ case INT_WAKE_RX:
+ case INT_SS_ERROR:
+ case INT_S0IX_MISS:
+ case INT_NO_ACKC6:
+ pmu_irq_stats[pmu_ics.bits.cause]++;
+ break;
+ default:
+ pmu_irq_stats[INT_INVALID]++;
+ }
+
+ pmu_write_ics(pmu_ics.value); /* Clear pending interrupt */
+
+ return IRQ_HANDLED;
+}
+
+/*
+ * Translate PCI power management to MRST LSS D-states
+ */
+static int pci_2_mrst_state(int lss, pci_power_t pci_state)
+{
+ switch (pci_state) {
+ case PCI_D0:
+ if (SSMSK(D0i1, lss) & D0I1_ACG_SSS_TARGET)
+ return D0i1;
+ else
+ return D0;
+ case PCI_D1:
+ return D0i1;
+ case PCI_D2:
+ return D0i2;
+ case PCI_D3hot:
+ case PCI_D3cold:
+ return D0i3;
+ default:
+ WARN(1, "pci_state %d\n", pci_state);
+ return 0;
+ }
+}
+
+static int pmu_issue_command(u32 pm_ssc)
+{
+ union pmu_pm_set_cfg_cmd_t command;
+
+ if (pmu_read_busy_status()) {
+ pr_debug("pmu is busy, Operation not permitted\n");
+ return -1;
+ }
+
+ /*
+ * enable interrupts in PMU so that interrupts are
+ * propagated when ioc bit for a particular set
+ * command is set
+ */
+
+ pmu_irq_enable();
+
+ /* Configure the sub systems for pmu2 */
+
+ pmu_write_ssc(pm_ssc);
+
+ /*
+ * Send the set config command for pmu its configured
+ * for mode CM_IMMEDIATE & hence with No Trigger
+ */
+
+ command.pmu2_params.d_param.cfg_mode = CM_IMMEDIATE;
+ command.pmu2_params.d_param.cfg_delay = 0;
+ command.pmu2_params.d_param.rsvd = 0;
+
+ /* construct the command to send SET_CFG to particular PMU */
+ command.pmu2_params.d_param.cmd = SET_CFG_CMD;
+ command.pmu2_params.d_param.ioc = 0;
+ command.pmu2_params.d_param.mode_id = 0;
+ command.pmu2_params.d_param.sys_state = SYS_STATE_S0I0;
+
+ /* write the value of PM_CMD into particular PMU */
+ pr_debug("pmu command being written %x\n",
+ command.pmu_pm_set_cfg_cmd_value);
+
+ pmu_write_cmd(command.pmu_pm_set_cfg_cmd_value);
+
+ return 0;
+}
+
+static u16 pmu_min_lss_pci_req(u16 *ids, u16 pci_state)
+{
+ u16 existing_request;
+ int i;
+
+ for (i = 0; ids[i]; ++i) {
+ struct mrst_device *mrst_dev;
+
+ mrst_dev = pci_id_2_mrst_dev(ids[i]);
+ if (unlikely(!mrst_dev))
+ continue;
+
+ existing_request = mrst_dev->latest_request;
+ if (existing_request < pci_state)
+ pci_state = existing_request;
+ }
+ return pci_state;
+}
+
+/**
+ * pmu_pci_set_power_state - Callback function is used by all the PCI devices
+ * for a platform specific device power on/shutdown.
+ */
+
+int pmu_pci_set_power_state(struct pci_dev *pdev, pci_power_t pci_state)
+{
+ u32 old_sss, new_sss;
+ int status = 0;
+ struct mrst_device *mrst_dev;
+
+ pmu_set_power_state_entry++;
+
+ BUG_ON(pdev->vendor != PCI_VENDOR_ID_INTEL);
+ BUG_ON(pci_state < PCI_D0 || pci_state > PCI_D3cold);
+
+ mrst_dev = pci_id_2_mrst_dev(pdev->device);
+ if (unlikely(!mrst_dev))
+ return -ENODEV;
+
+ mrst_dev->pci_state_counts[pci_state]++; /* count invocations */
+
+ /* PMU driver calls self as part of PCI initialization, ignore */
+ if (pdev->device == PCI_DEV_ID_MRST_PMU)
+ return 0;
+
+ BUG_ON(!pmu_reg); /* SW bug if called before initialized */
+
+ spin_lock(&mrst_pmu_power_state_lock);
+
+ if (pdev->d3_delay) {
+ dev_dbg(&pdev->dev, "d3_delay %d, should be 0\n",
+ pdev->d3_delay);
+ pdev->d3_delay = 0;
+ }
+ /*
+ * If Lincroft graphics, simply remember state
+ */
+ if ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY
+ && !((pdev->class & PCI_SUB_CLASS_MASK) >> 8)) {
+ if (pci_state == PCI_D0)
+ graphics_is_off = 0;
+ else
+ graphics_is_off = 1;
+ goto ret;
+ }
+
+ if (!mrst_dev->lss)
+ goto ret; /* device with no LSS */
+
+ if (mrst_dev->latest_request == pci_state)
+ goto ret; /* no change */
+
+ mrst_dev->latest_request = pci_state; /* record latest request */
+
+ /*
+ * LSS9 and LSS10 contain multiple PCI devices.
+ * Use the lowest numbered (highest power) state in the LSS
+ */
+ if (mrst_dev->lss == 9)
+ pci_state = pmu_min_lss_pci_req(mrst_lss9_pci_ids, pci_state);
+ else if (mrst_dev->lss == 10)
+ pci_state = pmu_min_lss_pci_req(mrst_lss10_pci_ids, pci_state);
+
+ status = pmu_wait_ready();
+ if (status)
+ goto ret;
+
+ old_sss = pmu_read_sss();
+ new_sss = old_sss & ~SSMSK(3, mrst_dev->lss);
+ new_sss |= SSMSK(pci_2_mrst_state(mrst_dev->lss, pci_state),
+ mrst_dev->lss);
+
+ if (new_sss == old_sss)
+ goto ret; /* nothing to do */
+
+ pmu_set_power_state_send_cmd++;
+
+ status = pmu_issue_command(new_sss);
+
+ if (unlikely(status != 0)) {
+ dev_err(&pdev->dev, "Failed to Issue a PM command\n");
+ goto ret;
+ }
+
+ if (pmu_wait_done())
+ goto ret;
+
+ lss_s0i3_enabled =
+ ((pmu_read_sss() & S0I3_SSS_TARGET) == S0I3_SSS_TARGET);
+ret:
+ spin_unlock(&mrst_pmu_power_state_lock);
+ return status;
+}
+
+#ifdef CONFIG_DEBUG_FS
+static char *d0ix_names[] = {"D0", "D0i1", "D0i2", "D0i3"};
+
+static inline const char *d0ix_name(int state)
+{
+ return d0ix_names[(int) state];
+}
+
+static int debug_mrst_pmu_show(struct seq_file *s, void *unused)
+{
+ struct pci_dev *pdev = NULL;
+ u32 cur_pmsss;
+ int lss;
+
+ seq_printf(s, "0x%08X D0I1_ACG_SSS_TARGET\n", D0I1_ACG_SSS_TARGET);
+
+ cur_pmsss = pmu_read_sss();
+
+ seq_printf(s, "0x%08X S0I3_SSS_TARGET\n", S0I3_SSS_TARGET);
+
+ seq_printf(s, "0x%08X Current SSS ", cur_pmsss);
+ seq_printf(s, lss_s0i3_enabled ? "\n" : "[BLOCKS s0i3]\n");
+
+ if (cpumask_equal(cpu_online_mask, cpumask_of(0)))
+ seq_printf(s, "cpu0 is only cpu online\n");
+ else
+ seq_printf(s, "cpu0 is NOT only cpu online [BLOCKS S0i3]\n");
+
+ seq_printf(s, "GFX: %s\n", graphics_is_off ? "" : "[BLOCKS s0i3]");
+
+
+ for_each_pci_dev(pdev) {
+ int pos;
+ u16 pmcsr;
+ struct mrst_device *mrst_dev;
+ int i;
+
+ mrst_dev = pci_id_2_mrst_dev(pdev->device);
+
+ seq_printf(s, "%s %04x/%04X %-16.16s ",
+ dev_name(&pdev->dev),
+ pdev->vendor, pdev->device,
+ dev_driver_string(&pdev->dev));
+
+ if (unlikely (!mrst_dev)) {
+ seq_printf(s, " UNKNOWN\n");
+ continue;
+ }
+
+ if (mrst_dev->lss)
+ seq_printf(s, "LSS %2d %-4s ", mrst_dev->lss,
+ d0ix_name(((cur_pmsss >>
+ (mrst_dev->lss * 2)) & 0x3)));
+ else
+ seq_printf(s, " ");
+
+ /* PCI PM config space setting */
+ pos = pci_find_capability(pdev, PCI_CAP_ID_PM);
+ if (pos != 0) {
+ pci_read_config_word(pdev, pos + PCI_PM_CTRL, &pmcsr);
+ seq_printf(s, "PCI-%-4s",
+ pci_power_name(pmcsr & PCI_PM_CTRL_STATE_MASK));
+ } else {
+ seq_printf(s, " ");
+ }
+
+ seq_printf(s, " %s ", pci_power_name(mrst_dev->latest_request));
+ for (i = 0; i <= PCI_D3cold; ++i)
+ seq_printf(s, "%d ", mrst_dev->pci_state_counts[i]);
+
+ if (mrst_dev->lss) {
+ unsigned int lssmask;
+
+ lssmask = SSMSK(D0i3, mrst_dev->lss);
+
+ if ((lssmask & S0I3_SSS_TARGET) &&
+ ((lssmask & cur_pmsss) !=
+ (lssmask & S0I3_SSS_TARGET)))
+ seq_printf(s , "[BLOCKS s0i3]");
+ }
+
+ seq_printf(s, "\n");
+ }
+ seq_printf(s, "Wake Counters:\n");
+ for (lss = 0; lss < MRST_NUM_LSS; ++lss)
+ seq_printf(s, "LSS%d %d\n", lss, wake_counters[lss]);
+
+ seq_printf(s, "Interrupt Counters:\n");
+ seq_printf(s,
+ "INT_SPURIOUS \t%8u\n" "INT_CMD_DONE \t%8u\n"
+ "INT_CMD_ERR \t%8u\n" "INT_WAKE_RX \t%8u\n"
+ "INT_SS_ERROR \t%8u\n" "INT_S0IX_MISS\t%8u\n"
+ "INT_NO_ACKC6 \t%8u\n" "INT_INVALID \t%8u\n",
+ pmu_irq_stats[INT_SPURIOUS], pmu_irq_stats[INT_CMD_DONE],
+ pmu_irq_stats[INT_CMD_ERR], pmu_irq_stats[INT_WAKE_RX],
+ pmu_irq_stats[INT_SS_ERROR], pmu_irq_stats[INT_S0IX_MISS],
+ pmu_irq_stats[INT_NO_ACKC6], pmu_irq_stats[INT_INVALID]);
+
+ seq_printf(s, "mrst_pmu_wait_ready_calls %8d\n",
+ pmu_wait_ready_calls);
+ seq_printf(s, "mrst_pmu_wait_ready_udelays %8d\n",
+ pmu_wait_ready_udelays);
+ seq_printf(s, "mrst_pmu_wait_ready_udelays_max %8d\n",
+ pmu_wait_ready_udelays_max);
+ seq_printf(s, "mrst_pmu_wait_done_calls %8d\n",
+ pmu_wait_done_calls);
+ seq_printf(s, "mrst_pmu_wait_done_udelays %8d\n",
+ pmu_wait_done_udelays);
+ seq_printf(s, "mrst_pmu_wait_done_udelays_max %8d\n",
+ pmu_wait_done_udelays_max);
+ seq_printf(s, "mrst_pmu_set_power_state_entry %8d\n",
+ pmu_set_power_state_entry);
+ seq_printf(s, "mrst_pmu_set_power_state_send_cmd %8d\n",
+ pmu_set_power_state_send_cmd);
+ seq_printf(s, "SCU busy: %d\n", pmu_read_busy_status());
+
+ return 0;
+}
+
+static int debug_mrst_pmu_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, debug_mrst_pmu_show, NULL);
+}
+
+static const struct file_operations devices_state_operations = {
+ .open = debug_mrst_pmu_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+#endif /* DEBUG_FS */
+
+/*
+ * Validate SCU PCI shim PCI vendor capability byte
+ * against LSS hard-coded in mrst_devs[] above.
+ * DEBUG only.
+ */
+static void pmu_scu_firmware_debug(void)
+{
+ struct pci_dev *pdev = NULL;
+
+ for_each_pci_dev(pdev) {
+ struct mrst_device *mrst_dev;
+ u8 pci_config_lss;
+ int pos;
+
+ mrst_dev = pci_id_2_mrst_dev(pdev->device);
+ if (unlikely(!mrst_dev)) {
+ printk(KERN_ERR FW_BUG "pmu: Unknown "
+ "PCI device 0x%04X\n", pdev->device);
+ continue;
+ }
+
+ if (mrst_dev->lss == 0)
+ continue; /* no LSS in our table */
+
+ pos = pci_find_capability(pdev, PCI_CAP_ID_VNDR);
+ if (!pos != 0) {
+ printk(KERN_ERR FW_BUG "pmu: 0x%04X "
+ "missing PCI Vendor Capability\n",
+ pdev->device);
+ continue;
+ }
+ pci_read_config_byte(pdev, pos + 4, &pci_config_lss);
+ if (!(pci_config_lss & PCI_VENDOR_CAP_LOG_SS_MASK)) {
+ printk(KERN_ERR FW_BUG "pmu: 0x%04X "
+ "invalid PCI Vendor Capability 0x%x "
+ " expected LSS 0x%X\n",
+ pdev->device, pci_config_lss, mrst_dev->lss);
+ continue;
+ }
+ pci_config_lss &= PCI_VENDOR_CAP_LOG_ID_MASK;
+
+ if (mrst_dev->lss == pci_config_lss)
+ continue;
+
+ printk(KERN_ERR FW_BUG "pmu: 0x%04X LSS = %d, expected %d\n",
+ pdev->device, pci_config_lss, mrst_dev->lss);
+ }
+}
+
+/**
+ * pmu_probe
+ */
+static int __devinit pmu_probe(struct pci_dev *pdev,
+ const struct pci_device_id *pci_id)
+{
+ int ret;
+ struct mrst_pmu_reg *pmu;
+
+ /* Init the device */
+ ret = pci_enable_device(pdev);
+ if (ret) {
+ dev_err(&pdev->dev, "Unable to Enable PCI device\n");
+ return ret;
+ }
+
+ ret = pci_request_regions(pdev, MRST_PMU_DRV_NAME);
+ if (ret < 0) {
+ dev_err(&pdev->dev, "Cannot obtain PCI resources, aborting\n");
+ goto out_err1;
+ }
+
+ /* Map the memory of PMU reg base */
+ pmu = pci_iomap(pdev, 0, 0);
+ if (!pmu) {
+ dev_err(&pdev->dev, "Unable to map the PMU address space\n");
+ ret = -ENOMEM;
+ goto out_err2;
+ }
+
+#ifdef CONFIG_DEBUG_FS
+ /* /sys/kernel/debug/mrst_pmu */
+ (void) debugfs_create_file("mrst_pmu", S_IFREG | S_IRUGO,
+ NULL, NULL, &devices_state_operations);
+#endif
+ pmu_reg = pmu; /* success */
+
+ if (request_irq(pdev->irq, pmu_irq, 0, MRST_PMU_DRV_NAME, NULL)) {
+ dev_err(&pdev->dev, "Registering isr has failed\n");
+ ret = -1;
+ goto out_err3;
+ }
+
+ pmu_scu_firmware_debug();
+
+ pmu_write_wkc(S0I3_WAKE_SOURCES); /* Enable S0i3 wakeup sources */
+
+ pmu_wait_ready();
+
+ pmu_write_ssc(D0I1_ACG_SSS_TARGET); /* Enable Auto-Clock_Gating */
+ pmu_write_cmd(0x201);
+
+ spin_lock_init(&mrst_pmu_power_state_lock);
+
+ /* Enable the hardware interrupt */
+ pmu_irq_enable();
+ return 0;
+
+out_err3:
+ free_irq(pdev->irq, NULL);
+ pci_iounmap(pdev, pmu_reg);
+ pmu_reg = NULL;
+out_err2:
+ pci_release_region(pdev, 0);
+out_err1:
+ pci_disable_device(pdev);
+ return ret;
+}
+
+static void __devexit pmu_remove(struct pci_dev *pdev)
+{
+ dev_err(&pdev->dev, "Mid PM pmu_remove called\n");
+
+ /* Freeing up the irq */
+ free_irq(pdev->irq, NULL);
+
+ pci_iounmap(pdev, pmu_reg);
+ pmu_reg = NULL;
+
+ /* disable the current PCI device */
+ pci_release_region(pdev, 0);
+ pci_disable_device(pdev);
+}
+
+static DEFINE_PCI_DEVICE_TABLE(pmu_pci_ids) = {
+ { PCI_VDEVICE(INTEL, PCI_DEV_ID_MRST_PMU), 0 },
+ { }
+};
+
+MODULE_DEVICE_TABLE(pci, pmu_pci_ids);
+
+static struct pci_driver driver = {
+ .name = MRST_PMU_DRV_NAME,
+ .id_table = pmu_pci_ids,
+ .probe = pmu_probe,
+ .remove = __devexit_p(pmu_remove),
+};
+
+/**
+ * pmu_pci_register - register the PMU driver as PCI device
+ */
+static int __init pmu_pci_register(void)
+{
+ return pci_register_driver(&driver);
+}
+
+/* Register and probe via fs_initcall() to preceed device_initcall() */
+fs_initcall(pmu_pci_register);
+
+static void __exit mid_pci_cleanup(void)
+{
+ pci_unregister_driver(&driver);
+}
+
+static int ia_major;
+static int ia_minor;
+
+static int pmu_sfi_parse_oem(struct sfi_table_header *table)
+{
+ struct sfi_table_simple *sb;
+
+ sb = (struct sfi_table_simple *)table;
+ ia_major = (sb->pentry[1] >> 0) & 0xFFFF;
+ ia_minor = (sb->pentry[1] >> 16) & 0xFFFF;
+ printk(KERN_INFO "mrst_pmu: IA FW version v%x.%x\n",
+ ia_major, ia_minor);
+
+ return 0;
+}
+
+static int __init scu_fw_check(void)
+{
+ int ret;
+ u32 fw_version;
+
+ if (!pmu_reg)
+ return 0; /* this driver didn't probe-out */
+
+ sfi_table_parse("OEMB", NULL, NULL, pmu_sfi_parse_oem);
+
+ if (ia_major < 0x6005 || ia_minor < 0x1525) {
+ WARN(1, "mrst_pmu: IA FW version too old\n");
+ return -1;
+ }
+
+ ret = intel_scu_ipc_command(IPCMSG_FW_REVISION, 0, NULL, 0,
+ &fw_version, 1);
+
+ if (ret) {
+ WARN(1, "mrst_pmu: IPC FW version? %d\n", ret);
+ } else {
+ int scu_major = (fw_version >> 8) & 0xFF;
+ int scu_minor = (fw_version >> 0) & 0xFF;
+
+ printk(KERN_INFO "mrst_pmu: firmware v%x\n", fw_version);
+
+ if ((scu_major >= 0xC0) && (scu_minor >= 0x49)) {
+ printk(KERN_INFO "mrst_pmu: enabling S0i3\n");
+ mrst_pmu_s0i3_enable = true;
+ } else {
+ WARN(1, "mrst_pmu: S0i3 disabled, old firmware %X.%X",
+ scu_major, scu_minor);
+ }
+ }
+ return 0;
+}
+late_initcall(scu_fw_check);
+module_exit(mid_pci_cleanup);
diff --git a/arch/x86/platform/mrst/pmu.h b/arch/x86/platform/mrst/pmu.h
new file mode 100644
index 0000000..bfbfe64
--- /dev/null
+++ b/arch/x86/platform/mrst/pmu.h
@@ -0,0 +1,234 @@
+/*
+ * mrst/pmu.h - private definitions for MRST Power Management Unit mrst/pmu.c
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#ifndef _MRST_PMU_H_
+#define _MRST_PMU_H_
+
+#define PCI_DEV_ID_MRST_PMU 0x0810
+#define MRST_PMU_DRV_NAME "mrst_pmu"
+#define PCI_SUB_CLASS_MASK 0xFF00
+
+#define PCI_VENDOR_CAP_LOG_ID_MASK 0x7F
+#define PCI_VENDOR_CAP_LOG_SS_MASK 0x80
+
+#define SUB_SYS_ALL_D0I1 0x01155555
+#define S0I3_WAKE_SOURCES 0x00001FFF
+
+#define PM_S0I3_COMMAND \
+ ((0 << 31) | /* Reserved */ \
+ (0 << 30) | /* Core must be idle */ \
+ (0xc2 << 22) | /* ACK C6 trigger */ \
+ (3 << 19) | /* Trigger on DMI message */ \
+ (3 << 16) | /* Enter S0i3 */ \
+ (0 << 13) | /* Numeric mode ID (sw) */ \
+ (3 << 9) | /* Trigger mode */ \
+ (0 << 8) | /* Do not interrupt */ \
+ (1 << 0)) /* Set configuration */
+
+#define LSS_DMI 0
+#define LSS_SD_HC0 1
+#define LSS_SD_HC1 2
+#define LSS_NAND 3
+#define LSS_IMAGING 4
+#define LSS_SECURITY 5
+#define LSS_DISPLAY 6
+#define LSS_USB_HC 7
+#define LSS_USB_OTG 8
+#define LSS_AUDIO 9
+#define LSS_AUDIO_LPE 9
+#define LSS_AUDIO_SSP 9
+#define LSS_I2C0 10
+#define LSS_I2C1 10
+#define LSS_I2C2 10
+#define LSS_KBD 10
+#define LSS_SPI0 10
+#define LSS_SPI1 10
+#define LSS_SPI2 10
+#define LSS_GPIO 10
+#define LSS_SRAM 11 /* used by SCU, do not touch */
+#define LSS_SD_HC2 12
+/* LSS hardware bits 15,14,13 are hardwired to 0, thus unusable */
+#define MRST_NUM_LSS 13
+
+#define MIN(a, b) (((a) < (b)) ? (a) : (b))
+
+#define SSMSK(mask, lss) ((mask) << ((lss) * 2))
+#define D0 0
+#define D0i1 1
+#define D0i2 2
+#define D0i3 3
+
+#define S0I3_SSS_TARGET ( \
+ SSMSK(D0i1, LSS_DMI) | \
+ SSMSK(D0i3, LSS_SD_HC0) | \
+ SSMSK(D0i3, LSS_SD_HC1) | \
+ SSMSK(D0i3, LSS_NAND) | \
+ SSMSK(D0i3, LSS_SD_HC2) | \
+ SSMSK(D0i3, LSS_IMAGING) | \
+ SSMSK(D0i3, LSS_SECURITY) | \
+ SSMSK(D0i3, LSS_DISPLAY) | \
+ SSMSK(D0i3, LSS_USB_HC) | \
+ SSMSK(D0i3, LSS_USB_OTG) | \
+ SSMSK(D0i3, LSS_AUDIO) | \
+ SSMSK(D0i1, LSS_I2C0))
+
+/*
+ * D0i1 on Langwell is Autonomous Clock Gating (ACG).
+ * Enable ACG on every LSS except camera and audio
+ */
+#define D0I1_ACG_SSS_TARGET \
+ (SUB_SYS_ALL_D0I1 & ~SSMSK(D0i1, LSS_IMAGING) & ~SSMSK(D0i1, LSS_AUDIO))
+
+enum cm_mode {
+ CM_NOP, /* ignore the config mode value */
+ CM_IMMEDIATE,
+ CM_DELAY,
+ CM_TRIGGER,
+ CM_INVALID
+};
+
+enum sys_state {
+ SYS_STATE_S0I0,
+ SYS_STATE_S0I1,
+ SYS_STATE_S0I2,
+ SYS_STATE_S0I3,
+ SYS_STATE_S3,
+ SYS_STATE_S5
+};
+
+#define SET_CFG_CMD 1
+
+enum int_status {
+ INT_SPURIOUS = 0,
+ INT_CMD_DONE = 1,
+ INT_CMD_ERR = 2,
+ INT_WAKE_RX = 3,
+ INT_SS_ERROR = 4,
+ INT_S0IX_MISS = 5,
+ INT_NO_ACKC6 = 6,
+ INT_INVALID = 7,
+};
+
+/* PMU register interface */
+static struct mrst_pmu_reg {
+ u32 pm_sts; /* 0x00 */
+ u32 pm_cmd; /* 0x04 */
+ u32 pm_ics; /* 0x08 */
+ u32 _resv1; /* 0x0C */
+ u32 pm_wkc[2]; /* 0x10 */
+ u32 pm_wks[2]; /* 0x18 */
+ u32 pm_ssc[4]; /* 0x20 */
+ u32 pm_sss[4]; /* 0x30 */
+ u32 pm_wssc[4]; /* 0x40 */
+ u32 pm_c3c4; /* 0x50 */
+ u32 pm_c5c6; /* 0x54 */
+ u32 pm_msi_disable; /* 0x58 */
+} *pmu_reg;
+
+static inline u32 pmu_read_sts(void) { return readl(&pmu_reg->pm_sts); }
+static inline u32 pmu_read_ics(void) { return readl(&pmu_reg->pm_ics); }
+static inline u32 pmu_read_wks(void) { return readl(&pmu_reg->pm_wks[0]); }
+static inline u32 pmu_read_sss(void) { return readl(&pmu_reg->pm_sss[0]); }
+
+static inline void pmu_write_cmd(u32 arg) { writel(arg, &pmu_reg->pm_cmd); }
+static inline void pmu_write_ics(u32 arg) { writel(arg, &pmu_reg->pm_ics); }
+static inline void pmu_write_wkc(u32 arg) { writel(arg, &pmu_reg->pm_wkc[0]); }
+static inline void pmu_write_ssc(u32 arg) { writel(arg, &pmu_reg->pm_ssc[0]); }
+static inline void pmu_write_wssc(u32 arg)
+ { writel(arg, &pmu_reg->pm_wssc[0]); }
+
+static inline void pmu_msi_enable(void) { writel(0, &pmu_reg->pm_msi_disable); }
+static inline u32 pmu_msi_is_disabled(void)
+ { return readl(&pmu_reg->pm_msi_disable); }
+
+union pmu_pm_ics {
+ struct {
+ u32 cause:8;
+ u32 enable:1;
+ u32 pending:1;
+ u32 reserved:22;
+ } bits;
+ u32 value;
+};
+
+static inline void pmu_irq_enable(void)
+{
+ union pmu_pm_ics pmu_ics;
+
+ pmu_ics.value = pmu_read_ics();
+ pmu_ics.bits.enable = 1;
+ pmu_write_ics(pmu_ics.value);
+}
+
+union pmu_pm_status {
+ struct {
+ u32 pmu_rev:8;
+ u32 pmu_busy:1;
+ u32 mode_id:4;
+ u32 Reserved:19;
+ } pmu_status_parts;
+ u32 pmu_status_value;
+};
+
+static inline int pmu_read_busy_status(void)
+{
+ union pmu_pm_status result;
+
+ result.pmu_status_value = pmu_read_sts();
+
+ return result.pmu_status_parts.pmu_busy;
+}
+
+/* pmu set config parameters */
+struct cfg_delay_param_t {
+ u32 cmd:8;
+ u32 ioc:1;
+ u32 cfg_mode:4;
+ u32 mode_id:3;
+ u32 sys_state:3;
+ u32 cfg_delay:8;
+ u32 rsvd:5;
+};
+
+struct cfg_trig_param_t {
+ u32 cmd:8;
+ u32 ioc:1;
+ u32 cfg_mode:4;
+ u32 mode_id:3;
+ u32 sys_state:3;
+ u32 cfg_trig_type:3;
+ u32 cfg_trig_val:8;
+ u32 cmbi:1;
+ u32 rsvd1:1;
+};
+
+union pmu_pm_set_cfg_cmd_t {
+ union {
+ struct cfg_delay_param_t d_param;
+ struct cfg_trig_param_t t_param;
+ } pmu2_params;
+ u32 pmu_pm_set_cfg_cmd_value;
+};
+
+#ifdef FUTURE_PATCH
+extern int mrst_s0i3_entry(u32 regval, u32 *regaddr);
+#else
+static inline int mrst_s0i3_entry(u32 regval, u32 *regaddr) { return -1; }
+#endif
+#endif
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 60aeeb5..a9627e2 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -9,6 +9,7 @@
#include <linux/mm.h>
#include <linux/pm.h>
#include <linux/memblock.h>
+#include <linux/cpuidle.h>
#include <asm/elf.h>
#include <asm/vdso.h>
@@ -426,7 +427,7 @@
#ifdef CONFIG_X86_32
boot_cpu_data.hlt_works_ok = 1;
#endif
- pm_idle = default_idle;
+ disable_cpuidle();
boot_option_idle_override = IDLE_HALT;
fiddle_vdso();
diff --git a/block/blk-core.c b/block/blk-core.c
index b850bed..b627558 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1368,8 +1368,10 @@
static int __init fail_make_request_debugfs(void)
{
- return init_fault_attr_dentries(&fail_make_request,
- "fail_make_request");
+ struct dentry *dir = fault_create_debugfs_attr("fail_make_request",
+ NULL, &fail_make_request);
+
+ return IS_ERR(dir) ? PTR_ERR(dir) : 0;
}
late_initcall(fail_make_request_debugfs);
diff --git a/block/blk-timeout.c b/block/blk-timeout.c
index 4f0c06c..7803548 100644
--- a/block/blk-timeout.c
+++ b/block/blk-timeout.c
@@ -28,7 +28,10 @@
static int __init fail_io_timeout_debugfs(void)
{
- return init_fault_attr_dentries(&fail_io_timeout, "fail_io_timeout");
+ struct dentry *dir = fault_create_debugfs_attr("fail_io_timeout",
+ NULL, &fail_io_timeout);
+
+ return IS_ERR(dir) ? PTR_ERR(dir) : 0;
}
late_initcall(fail_io_timeout_debugfs);
diff --git a/drivers/acpi/acpica/acglobal.h b/drivers/acpi/acpica/acglobal.h
index 73863d86..76dc02f 100644
--- a/drivers/acpi/acpica/acglobal.h
+++ b/drivers/acpi/acpica/acglobal.h
@@ -126,6 +126,12 @@
*/
u8 ACPI_INIT_GLOBAL(acpi_gbl_truncate_io_addresses, FALSE);
+/*
+ * Disable runtime checking and repair of values returned by control methods.
+ * Use only if the repair is causing a problem on a particular machine.
+ */
+u8 ACPI_INIT_GLOBAL(acpi_gbl_disable_auto_repair, FALSE);
+
/* acpi_gbl_FADT is a local copy of the FADT, converted to a common format. */
struct acpi_table_fadt acpi_gbl_FADT;
diff --git a/drivers/acpi/acpica/aclocal.h b/drivers/acpi/acpica/aclocal.h
index c7f743c..5552125 100644
--- a/drivers/acpi/acpica/aclocal.h
+++ b/drivers/acpi/acpica/aclocal.h
@@ -357,6 +357,7 @@
char *pathname;
const union acpi_predefined_info *predefined;
union acpi_operand_object *parent_package;
+ struct acpi_namespace_node *node;
u32 flags;
u8 node_flags;
};
diff --git a/drivers/acpi/acpica/acpredef.h b/drivers/acpi/acpica/acpredef.h
index 94e73c9..c445cca 100644
--- a/drivers/acpi/acpica/acpredef.h
+++ b/drivers/acpi/acpica/acpredef.h
@@ -468,6 +468,7 @@
{{"_SWS", 0, ACPI_RTYPE_INTEGER}},
{{"_TC1", 0, ACPI_RTYPE_INTEGER}},
{{"_TC2", 0, ACPI_RTYPE_INTEGER}},
+ {{"_TDL", 0, ACPI_RTYPE_INTEGER}},
{{"_TIP", 1, ACPI_RTYPE_INTEGER}},
{{"_TIV", 1, ACPI_RTYPE_INTEGER}},
{{"_TMP", 0, ACPI_RTYPE_INTEGER}},
diff --git a/drivers/acpi/acpica/nspredef.c b/drivers/acpi/acpica/nspredef.c
index 9fb03fa..c845c80 100644
--- a/drivers/acpi/acpica/nspredef.c
+++ b/drivers/acpi/acpica/nspredef.c
@@ -193,14 +193,20 @@
}
/*
- * 1) We have a return value, but if one wasn't expected, just exit, this is
- * not a problem. For example, if the "Implicit Return" feature is
- * enabled, methods will always return a value.
+ * Return value validation and possible repair.
*
- * 2) If the return value can be of any type, then we cannot perform any
- * validation, exit.
+ * 1) Don't perform return value validation/repair if this feature
+ * has been disabled via a global option.
+ *
+ * 2) We have a return value, but if one wasn't expected, just exit,
+ * this is not a problem. For example, if the "Implicit Return"
+ * feature is enabled, methods will always return a value.
+ *
+ * 3) If the return value can be of any type, then we cannot perform
+ * any validation, just exit.
*/
- if ((!predefined->info.expected_btypes) ||
+ if (acpi_gbl_disable_auto_repair ||
+ (!predefined->info.expected_btypes) ||
(predefined->info.expected_btypes == ACPI_RTYPE_ALL)) {
goto cleanup;
}
@@ -212,6 +218,7 @@
goto cleanup;
}
data->predefined = predefined;
+ data->node = node;
data->node_flags = node->flags;
data->pathname = pathname;
diff --git a/drivers/acpi/acpica/nsrepair2.c b/drivers/acpi/acpica/nsrepair2.c
index 973883b..024c4f2 100644
--- a/drivers/acpi/acpica/nsrepair2.c
+++ b/drivers/acpi/acpica/nsrepair2.c
@@ -503,6 +503,21 @@
{
union acpi_operand_object *return_object = *return_object_ptr;
acpi_status status;
+ struct acpi_namespace_node *node;
+
+ /*
+ * We can only sort the _TSS return package if there is no _PSS in the
+ * same scope. This is because if _PSS is present, the ACPI specification
+ * dictates that the _TSS Power Dissipation field is to be ignored, and
+ * therefore some BIOSs leave garbage values in the _TSS Power field(s).
+ * In this case, it is best to just return the _TSS package as-is.
+ * (May, 2011)
+ */
+ status =
+ acpi_ns_get_node(data->node, "^_PSS", ACPI_NS_NO_UPSEARCH, &node);
+ if (ACPI_SUCCESS(status)) {
+ return (AE_OK);
+ }
status = acpi_ns_check_sorted_list(data, return_object, 5, 1,
ACPI_SORT_DESCENDING,
diff --git a/drivers/acpi/acpica/tbinstal.c b/drivers/acpi/acpica/tbinstal.c
index 48db094..62365f6 100644
--- a/drivers/acpi/acpica/tbinstal.c
+++ b/drivers/acpi/acpica/tbinstal.c
@@ -126,12 +126,29 @@
}
/*
- * Originally, we checked the table signature for "SSDT" or "PSDT" here.
- * Next, we added support for OEMx tables, signature "OEM".
- * Valid tables were encountered with a null signature, so we've just
- * given up on validating the signature, since it seems to be a waste
- * of code. The original code was removed (05/2008).
+ * Validate the incoming table signature.
+ *
+ * 1) Originally, we checked the table signature for "SSDT" or "PSDT".
+ * 2) We added support for OEMx tables, signature "OEM".
+ * 3) Valid tables were encountered with a null signature, so we just
+ * gave up on validating the signature, (05/2008).
+ * 4) We encountered non-AML tables such as the MADT, which caused
+ * interpreter errors and kernel faults. So now, we once again allow
+ * only "SSDT", "OEMx", and now, also a null signature. (05/2011).
*/
+ if ((table_desc->pointer->signature[0] != 0x00) &&
+ (!ACPI_COMPARE_NAME(table_desc->pointer->signature, ACPI_SIG_SSDT))
+ && (ACPI_STRNCMP(table_desc->pointer->signature, "OEM", 3))) {
+ ACPI_ERROR((AE_INFO,
+ "Table has invalid signature [%4.4s] (0x%8.8X), must be SSDT or OEMx",
+ acpi_ut_valid_acpi_name(*(u32 *)table_desc->
+ pointer->
+ signature) ? table_desc->
+ pointer->signature : "????",
+ *(u32 *)table_desc->pointer->signature));
+
+ return_ACPI_STATUS(AE_BAD_SIGNATURE);
+ }
(void)acpi_ut_acquire_mutex(ACPI_MTX_TABLES);
diff --git a/drivers/acpi/apei/Kconfig b/drivers/acpi/apei/Kconfig
index f739a70..c34aa51 100644
--- a/drivers/acpi/apei/Kconfig
+++ b/drivers/acpi/apei/Kconfig
@@ -10,9 +10,11 @@
error injection.
config ACPI_APEI_GHES
- tristate "APEI Generic Hardware Error Source"
+ bool "APEI Generic Hardware Error Source"
depends on ACPI_APEI && X86
select ACPI_HED
+ select LLIST
+ select GENERIC_ALLOCATOR
help
Generic Hardware Error Source provides a way to report
platform hardware errors (such as that from chipset). It
@@ -30,6 +32,13 @@
PCIe AER errors may be reported via APEI firmware first mode.
Turn on this option to enable the corresponding support.
+config ACPI_APEI_MEMORY_FAILURE
+ bool "APEI memory error recovering support"
+ depends on ACPI_APEI && MEMORY_FAILURE
+ help
+ Memory errors may be reported via APEI firmware first mode.
+ Turn on this option to enable the memory recovering support.
+
config ACPI_APEI_EINJ
tristate "APEI Error INJection (EINJ)"
depends on ACPI_APEI && DEBUG_FS
diff --git a/drivers/acpi/apei/apei-base.c b/drivers/acpi/apei/apei-base.c
index 4a904a4..8041248 100644
--- a/drivers/acpi/apei/apei-base.c
+++ b/drivers/acpi/apei/apei-base.c
@@ -157,9 +157,10 @@
* Interpret the specified action. Go through whole action table,
* execute all instructions belong to the action.
*/
-int apei_exec_run(struct apei_exec_context *ctx, u8 action)
+int __apei_exec_run(struct apei_exec_context *ctx, u8 action,
+ bool optional)
{
- int rc;
+ int rc = -ENOENT;
u32 i, ip;
struct acpi_whea_header *entry;
apei_exec_ins_func_t run;
@@ -198,9 +199,9 @@
goto rewind;
}
- return 0;
+ return !optional && rc < 0 ? rc : 0;
}
-EXPORT_SYMBOL_GPL(apei_exec_run);
+EXPORT_SYMBOL_GPL(__apei_exec_run);
typedef int (*apei_exec_entry_func_t)(struct apei_exec_context *ctx,
struct acpi_whea_header *entry,
@@ -603,3 +604,29 @@
return dapei;
}
EXPORT_SYMBOL_GPL(apei_get_debugfs_dir);
+
+int apei_osc_setup(void)
+{
+ static u8 whea_uuid_str[] = "ed855e0c-6c90-47bf-a62a-26de0fc5ad5c";
+ acpi_handle handle;
+ u32 capbuf[3];
+ struct acpi_osc_context context = {
+ .uuid_str = whea_uuid_str,
+ .rev = 1,
+ .cap.length = sizeof(capbuf),
+ .cap.pointer = capbuf,
+ };
+
+ capbuf[OSC_QUERY_TYPE] = OSC_QUERY_ENABLE;
+ capbuf[OSC_SUPPORT_TYPE] = 0;
+ capbuf[OSC_CONTROL_TYPE] = 0;
+
+ if (ACPI_FAILURE(acpi_get_handle(NULL, "\\_SB", &handle))
+ || ACPI_FAILURE(acpi_run_osc(handle, &context)))
+ return -EIO;
+ else {
+ kfree(context.ret.pointer);
+ return 0;
+ }
+}
+EXPORT_SYMBOL_GPL(apei_osc_setup);
diff --git a/drivers/acpi/apei/apei-internal.h b/drivers/acpi/apei/apei-internal.h
index ef0581f..f57050e 100644
--- a/drivers/acpi/apei/apei-internal.h
+++ b/drivers/acpi/apei/apei-internal.h
@@ -50,7 +50,18 @@
return ctx->value;
}
-int apei_exec_run(struct apei_exec_context *ctx, u8 action);
+int __apei_exec_run(struct apei_exec_context *ctx, u8 action, bool optional);
+
+static inline int apei_exec_run(struct apei_exec_context *ctx, u8 action)
+{
+ return __apei_exec_run(ctx, action, 0);
+}
+
+/* It is optional whether the firmware provides the action */
+static inline int apei_exec_run_optional(struct apei_exec_context *ctx, u8 action)
+{
+ return __apei_exec_run(ctx, action, 1);
+}
/* Common instruction implementation */
@@ -113,4 +124,6 @@
const struct acpi_hest_generic_status *estatus);
int apei_estatus_check_header(const struct acpi_hest_generic_status *estatus);
int apei_estatus_check(const struct acpi_hest_generic_status *estatus);
+
+int apei_osc_setup(void);
#endif
diff --git a/drivers/acpi/apei/einj.c b/drivers/acpi/apei/einj.c
index f74b2ea..589b96c 100644
--- a/drivers/acpi/apei/einj.c
+++ b/drivers/acpi/apei/einj.c
@@ -46,7 +46,8 @@
* Some BIOSes allow parameters to the SET_ERROR_TYPE entries in the
* EINJ table through an unpublished extension. Use with caution as
* most will ignore the parameter and make their own choice of address
- * for error injection.
+ * for error injection. This extension is used only if
+ * param_extension module parameter is specified.
*/
struct einj_parameter {
u64 type;
@@ -65,6 +66,9 @@
((struct acpi_whea_header *)((char *)(tab) + \
sizeof(struct acpi_table_einj)))
+static bool param_extension;
+module_param(param_extension, bool, 0);
+
static struct acpi_table_einj *einj_tab;
static struct apei_resources einj_resources;
@@ -285,7 +289,7 @@
einj_exec_ctx_init(&ctx);
- rc = apei_exec_run(&ctx, ACPI_EINJ_BEGIN_OPERATION);
+ rc = apei_exec_run_optional(&ctx, ACPI_EINJ_BEGIN_OPERATION);
if (rc)
return rc;
apei_exec_ctx_set_input(&ctx, type);
@@ -323,7 +327,7 @@
rc = __einj_error_trigger(trigger_paddr);
if (rc)
return rc;
- rc = apei_exec_run(&ctx, ACPI_EINJ_END_OPERATION);
+ rc = apei_exec_run_optional(&ctx, ACPI_EINJ_END_OPERATION);
return rc;
}
@@ -489,14 +493,6 @@
einj_debug_dir, NULL, &error_type_fops);
if (!fentry)
goto err_cleanup;
- fentry = debugfs_create_x64("param1", S_IRUSR | S_IWUSR,
- einj_debug_dir, &error_param1);
- if (!fentry)
- goto err_cleanup;
- fentry = debugfs_create_x64("param2", S_IRUSR | S_IWUSR,
- einj_debug_dir, &error_param2);
- if (!fentry)
- goto err_cleanup;
fentry = debugfs_create_file("error_inject", S_IWUSR,
einj_debug_dir, NULL, &error_inject_fops);
if (!fentry)
@@ -513,12 +509,23 @@
rc = apei_exec_pre_map_gars(&ctx);
if (rc)
goto err_release;
- param_paddr = einj_get_parameter_address();
- if (param_paddr) {
- einj_param = ioremap(param_paddr, sizeof(*einj_param));
- rc = -ENOMEM;
- if (!einj_param)
- goto err_unmap;
+ if (param_extension) {
+ param_paddr = einj_get_parameter_address();
+ if (param_paddr) {
+ einj_param = ioremap(param_paddr, sizeof(*einj_param));
+ rc = -ENOMEM;
+ if (!einj_param)
+ goto err_unmap;
+ fentry = debugfs_create_x64("param1", S_IRUSR | S_IWUSR,
+ einj_debug_dir, &error_param1);
+ if (!fentry)
+ goto err_unmap;
+ fentry = debugfs_create_x64("param2", S_IRUSR | S_IWUSR,
+ einj_debug_dir, &error_param2);
+ if (!fentry)
+ goto err_unmap;
+ } else
+ pr_warn(EINJ_PFX "Parameter extension is not supported.\n");
}
pr_info(EINJ_PFX "Error INJection is initialized.\n");
@@ -526,6 +533,8 @@
return 0;
err_unmap:
+ if (einj_param)
+ iounmap(einj_param);
apei_exec_post_unmap_gars(&ctx);
err_release:
apei_resources_release(&einj_resources);
diff --git a/drivers/acpi/apei/erst-dbg.c b/drivers/acpi/apei/erst-dbg.c
index a4cfb64..903549d 100644
--- a/drivers/acpi/apei/erst-dbg.c
+++ b/drivers/acpi/apei/erst-dbg.c
@@ -33,7 +33,7 @@
#define ERST_DBG_PFX "ERST DBG: "
-#define ERST_DBG_RECORD_LEN_MAX 4096
+#define ERST_DBG_RECORD_LEN_MAX 0x4000
static void *erst_dbg_buf;
static unsigned int erst_dbg_buf_len;
@@ -213,6 +213,10 @@
static __init int erst_dbg_init(void)
{
+ if (erst_disable) {
+ pr_info(ERST_DBG_PFX "ERST support is disabled.\n");
+ return -ENODEV;
+ }
return misc_register(&erst_dbg_dev);
}
diff --git a/drivers/acpi/apei/erst.c b/drivers/acpi/apei/erst.c
index 6053f47..2ca59dc 100644
--- a/drivers/acpi/apei/erst.c
+++ b/drivers/acpi/apei/erst.c
@@ -642,7 +642,7 @@
int rc;
erst_exec_ctx_init(&ctx);
- rc = apei_exec_run(&ctx, ACPI_ERST_BEGIN_WRITE);
+ rc = apei_exec_run_optional(&ctx, ACPI_ERST_BEGIN_WRITE);
if (rc)
return rc;
apei_exec_ctx_set_input(&ctx, offset);
@@ -666,7 +666,7 @@
if (rc)
return rc;
val = apei_exec_ctx_get_output(&ctx);
- rc = apei_exec_run(&ctx, ACPI_ERST_END);
+ rc = apei_exec_run_optional(&ctx, ACPI_ERST_END);
if (rc)
return rc;
@@ -681,7 +681,7 @@
int rc;
erst_exec_ctx_init(&ctx);
- rc = apei_exec_run(&ctx, ACPI_ERST_BEGIN_READ);
+ rc = apei_exec_run_optional(&ctx, ACPI_ERST_BEGIN_READ);
if (rc)
return rc;
apei_exec_ctx_set_input(&ctx, offset);
@@ -709,7 +709,7 @@
if (rc)
return rc;
val = apei_exec_ctx_get_output(&ctx);
- rc = apei_exec_run(&ctx, ACPI_ERST_END);
+ rc = apei_exec_run_optional(&ctx, ACPI_ERST_END);
if (rc)
return rc;
@@ -724,7 +724,7 @@
int rc;
erst_exec_ctx_init(&ctx);
- rc = apei_exec_run(&ctx, ACPI_ERST_BEGIN_CLEAR);
+ rc = apei_exec_run_optional(&ctx, ACPI_ERST_BEGIN_CLEAR);
if (rc)
return rc;
apei_exec_ctx_set_input(&ctx, record_id);
@@ -748,7 +748,7 @@
if (rc)
return rc;
val = apei_exec_ctx_get_output(&ctx);
- rc = apei_exec_run(&ctx, ACPI_ERST_END);
+ rc = apei_exec_run_optional(&ctx, ACPI_ERST_END);
if (rc)
return rc;
diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index f703b28..0784f99 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -12,7 +12,7 @@
* For more information about Generic Hardware Error Source, please
* refer to ACPI Specification version 4.0, section 17.3.2.6
*
- * Copyright 2010 Intel Corp.
+ * Copyright 2010,2011 Intel Corp.
* Author: Huang Ying <ying.huang@intel.com>
*
* This program is free software; you can redistribute it and/or
@@ -42,6 +42,9 @@
#include <linux/mutex.h>
#include <linux/ratelimit.h>
#include <linux/vmalloc.h>
+#include <linux/irq_work.h>
+#include <linux/llist.h>
+#include <linux/genalloc.h>
#include <acpi/apei.h>
#include <acpi/atomicio.h>
#include <acpi/hed.h>
@@ -53,6 +56,30 @@
#define GHES_PFX "GHES: "
#define GHES_ESTATUS_MAX_SIZE 65536
+#define GHES_ESOURCE_PREALLOC_MAX_SIZE 65536
+
+#define GHES_ESTATUS_POOL_MIN_ALLOC_ORDER 3
+
+/* This is just an estimation for memory pool allocation */
+#define GHES_ESTATUS_CACHE_AVG_SIZE 512
+
+#define GHES_ESTATUS_CACHES_SIZE 4
+
+#define GHES_ESTATUS_IN_CACHE_MAX_NSEC 10000000000ULL
+/* Prevent too many caches are allocated because of RCU */
+#define GHES_ESTATUS_CACHE_ALLOCED_MAX (GHES_ESTATUS_CACHES_SIZE * 3 / 2)
+
+#define GHES_ESTATUS_CACHE_LEN(estatus_len) \
+ (sizeof(struct ghes_estatus_cache) + (estatus_len))
+#define GHES_ESTATUS_FROM_CACHE(estatus_cache) \
+ ((struct acpi_hest_generic_status *) \
+ ((struct ghes_estatus_cache *)(estatus_cache) + 1))
+
+#define GHES_ESTATUS_NODE_LEN(estatus_len) \
+ (sizeof(struct ghes_estatus_node) + (estatus_len))
+#define GHES_ESTATUS_FROM_NODE(estatus_node) \
+ ((struct acpi_hest_generic_status *) \
+ ((struct ghes_estatus_node *)(estatus_node) + 1))
/*
* One struct ghes is created for each generic hardware error source.
@@ -77,6 +104,22 @@
};
};
+struct ghes_estatus_node {
+ struct llist_node llnode;
+ struct acpi_hest_generic *generic;
+};
+
+struct ghes_estatus_cache {
+ u32 estatus_len;
+ atomic_t count;
+ struct acpi_hest_generic *generic;
+ unsigned long long time_in;
+ struct rcu_head rcu;
+};
+
+int ghes_disable;
+module_param_named(disable, ghes_disable, bool, 0);
+
static int ghes_panic_timeout __read_mostly = 30;
/*
@@ -121,6 +164,22 @@
static DEFINE_RAW_SPINLOCK(ghes_ioremap_lock_nmi);
static DEFINE_SPINLOCK(ghes_ioremap_lock_irq);
+/*
+ * printk is not safe in NMI context. So in NMI handler, we allocate
+ * required memory from lock-less memory allocator
+ * (ghes_estatus_pool), save estatus into it, put them into lock-less
+ * list (ghes_estatus_llist), then delay printk into IRQ context via
+ * irq_work (ghes_proc_irq_work). ghes_estatus_size_request record
+ * required pool size by all NMI error source.
+ */
+static struct gen_pool *ghes_estatus_pool;
+static unsigned long ghes_estatus_pool_size_request;
+static struct llist_head ghes_estatus_llist;
+static struct irq_work ghes_proc_irq_work;
+
+struct ghes_estatus_cache *ghes_estatus_caches[GHES_ESTATUS_CACHES_SIZE];
+static atomic_t ghes_estatus_cache_alloced;
+
static int ghes_ioremap_init(void)
{
ghes_ioremap_area = __get_vm_area(PAGE_SIZE * GHES_IOREMAP_PAGES,
@@ -180,6 +239,55 @@
__flush_tlb_one(vaddr);
}
+static int ghes_estatus_pool_init(void)
+{
+ ghes_estatus_pool = gen_pool_create(GHES_ESTATUS_POOL_MIN_ALLOC_ORDER, -1);
+ if (!ghes_estatus_pool)
+ return -ENOMEM;
+ return 0;
+}
+
+static void ghes_estatus_pool_free_chunk_page(struct gen_pool *pool,
+ struct gen_pool_chunk *chunk,
+ void *data)
+{
+ free_page(chunk->start_addr);
+}
+
+static void ghes_estatus_pool_exit(void)
+{
+ gen_pool_for_each_chunk(ghes_estatus_pool,
+ ghes_estatus_pool_free_chunk_page, NULL);
+ gen_pool_destroy(ghes_estatus_pool);
+}
+
+static int ghes_estatus_pool_expand(unsigned long len)
+{
+ unsigned long i, pages, size, addr;
+ int ret;
+
+ ghes_estatus_pool_size_request += PAGE_ALIGN(len);
+ size = gen_pool_size(ghes_estatus_pool);
+ if (size >= ghes_estatus_pool_size_request)
+ return 0;
+ pages = (ghes_estatus_pool_size_request - size) / PAGE_SIZE;
+ for (i = 0; i < pages; i++) {
+ addr = __get_free_page(GFP_KERNEL);
+ if (!addr)
+ return -ENOMEM;
+ ret = gen_pool_add(ghes_estatus_pool, addr, PAGE_SIZE, -1);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static void ghes_estatus_pool_shrink(unsigned long len)
+{
+ ghes_estatus_pool_size_request -= PAGE_ALIGN(len);
+}
+
static struct ghes *ghes_new(struct acpi_hest_generic *generic)
{
struct ghes *ghes;
@@ -341,43 +449,196 @@
ghes->flags &= ~GHES_TO_CLEAR;
}
-static void ghes_do_proc(struct ghes *ghes)
+static void ghes_do_proc(const struct acpi_hest_generic_status *estatus)
{
- int sev, processed = 0;
+ int sev, sec_sev;
struct acpi_hest_generic_data *gdata;
- sev = ghes_severity(ghes->estatus->error_severity);
- apei_estatus_for_each_section(ghes->estatus, gdata) {
-#ifdef CONFIG_X86_MCE
+ sev = ghes_severity(estatus->error_severity);
+ apei_estatus_for_each_section(estatus, gdata) {
+ sec_sev = ghes_severity(gdata->error_severity);
if (!uuid_le_cmp(*(uuid_le *)gdata->section_type,
CPER_SEC_PLATFORM_MEM)) {
- apei_mce_report_mem_error(
- sev == GHES_SEV_CORRECTED,
- (struct cper_sec_mem_err *)(gdata+1));
- processed = 1;
- }
+ struct cper_sec_mem_err *mem_err;
+ mem_err = (struct cper_sec_mem_err *)(gdata+1);
+#ifdef CONFIG_X86_MCE
+ apei_mce_report_mem_error(sev == GHES_SEV_CORRECTED,
+ mem_err);
#endif
+#ifdef CONFIG_ACPI_APEI_MEMORY_FAILURE
+ if (sev == GHES_SEV_RECOVERABLE &&
+ sec_sev == GHES_SEV_RECOVERABLE &&
+ mem_err->validation_bits & CPER_MEM_VALID_PHYSICAL_ADDRESS) {
+ unsigned long pfn;
+ pfn = mem_err->physical_addr >> PAGE_SHIFT;
+ memory_failure_queue(pfn, 0, 0);
+ }
+#endif
+ }
}
}
-static void ghes_print_estatus(const char *pfx, struct ghes *ghes)
+static void __ghes_print_estatus(const char *pfx,
+ const struct acpi_hest_generic *generic,
+ const struct acpi_hest_generic_status *estatus)
{
- /* Not more than 2 messages every 5 seconds */
- static DEFINE_RATELIMIT_STATE(ratelimit, 5*HZ, 2);
-
if (pfx == NULL) {
- if (ghes_severity(ghes->estatus->error_severity) <=
+ if (ghes_severity(estatus->error_severity) <=
GHES_SEV_CORRECTED)
pfx = KERN_WARNING HW_ERR;
else
pfx = KERN_ERR HW_ERR;
}
- if (__ratelimit(&ratelimit)) {
- printk(
- "%s""Hardware error from APEI Generic Hardware Error Source: %d\n",
- pfx, ghes->generic->header.source_id);
- apei_estatus_print(pfx, ghes->estatus);
+ printk("%s""Hardware error from APEI Generic Hardware Error Source: %d\n",
+ pfx, generic->header.source_id);
+ apei_estatus_print(pfx, estatus);
+}
+
+static int ghes_print_estatus(const char *pfx,
+ const struct acpi_hest_generic *generic,
+ const struct acpi_hest_generic_status *estatus)
+{
+ /* Not more than 2 messages every 5 seconds */
+ static DEFINE_RATELIMIT_STATE(ratelimit_corrected, 5*HZ, 2);
+ static DEFINE_RATELIMIT_STATE(ratelimit_uncorrected, 5*HZ, 2);
+ struct ratelimit_state *ratelimit;
+
+ if (ghes_severity(estatus->error_severity) <= GHES_SEV_CORRECTED)
+ ratelimit = &ratelimit_corrected;
+ else
+ ratelimit = &ratelimit_uncorrected;
+ if (__ratelimit(ratelimit)) {
+ __ghes_print_estatus(pfx, generic, estatus);
+ return 1;
}
+ return 0;
+}
+
+/*
+ * GHES error status reporting throttle, to report more kinds of
+ * errors, instead of just most frequently occurred errors.
+ */
+static int ghes_estatus_cached(struct acpi_hest_generic_status *estatus)
+{
+ u32 len;
+ int i, cached = 0;
+ unsigned long long now;
+ struct ghes_estatus_cache *cache;
+ struct acpi_hest_generic_status *cache_estatus;
+
+ len = apei_estatus_len(estatus);
+ rcu_read_lock();
+ for (i = 0; i < GHES_ESTATUS_CACHES_SIZE; i++) {
+ cache = rcu_dereference(ghes_estatus_caches[i]);
+ if (cache == NULL)
+ continue;
+ if (len != cache->estatus_len)
+ continue;
+ cache_estatus = GHES_ESTATUS_FROM_CACHE(cache);
+ if (memcmp(estatus, cache_estatus, len))
+ continue;
+ atomic_inc(&cache->count);
+ now = sched_clock();
+ if (now - cache->time_in < GHES_ESTATUS_IN_CACHE_MAX_NSEC)
+ cached = 1;
+ break;
+ }
+ rcu_read_unlock();
+ return cached;
+}
+
+static struct ghes_estatus_cache *ghes_estatus_cache_alloc(
+ struct acpi_hest_generic *generic,
+ struct acpi_hest_generic_status *estatus)
+{
+ int alloced;
+ u32 len, cache_len;
+ struct ghes_estatus_cache *cache;
+ struct acpi_hest_generic_status *cache_estatus;
+
+ alloced = atomic_add_return(1, &ghes_estatus_cache_alloced);
+ if (alloced > GHES_ESTATUS_CACHE_ALLOCED_MAX) {
+ atomic_dec(&ghes_estatus_cache_alloced);
+ return NULL;
+ }
+ len = apei_estatus_len(estatus);
+ cache_len = GHES_ESTATUS_CACHE_LEN(len);
+ cache = (void *)gen_pool_alloc(ghes_estatus_pool, cache_len);
+ if (!cache) {
+ atomic_dec(&ghes_estatus_cache_alloced);
+ return NULL;
+ }
+ cache_estatus = GHES_ESTATUS_FROM_CACHE(cache);
+ memcpy(cache_estatus, estatus, len);
+ cache->estatus_len = len;
+ atomic_set(&cache->count, 0);
+ cache->generic = generic;
+ cache->time_in = sched_clock();
+ return cache;
+}
+
+static void ghes_estatus_cache_free(struct ghes_estatus_cache *cache)
+{
+ u32 len;
+
+ len = apei_estatus_len(GHES_ESTATUS_FROM_CACHE(cache));
+ len = GHES_ESTATUS_CACHE_LEN(len);
+ gen_pool_free(ghes_estatus_pool, (unsigned long)cache, len);
+ atomic_dec(&ghes_estatus_cache_alloced);
+}
+
+static void ghes_estatus_cache_rcu_free(struct rcu_head *head)
+{
+ struct ghes_estatus_cache *cache;
+
+ cache = container_of(head, struct ghes_estatus_cache, rcu);
+ ghes_estatus_cache_free(cache);
+}
+
+static void ghes_estatus_cache_add(
+ struct acpi_hest_generic *generic,
+ struct acpi_hest_generic_status *estatus)
+{
+ int i, slot = -1, count;
+ unsigned long long now, duration, period, max_period = 0;
+ struct ghes_estatus_cache *cache, *slot_cache = NULL, *new_cache;
+
+ new_cache = ghes_estatus_cache_alloc(generic, estatus);
+ if (new_cache == NULL)
+ return;
+ rcu_read_lock();
+ now = sched_clock();
+ for (i = 0; i < GHES_ESTATUS_CACHES_SIZE; i++) {
+ cache = rcu_dereference(ghes_estatus_caches[i]);
+ if (cache == NULL) {
+ slot = i;
+ slot_cache = NULL;
+ break;
+ }
+ duration = now - cache->time_in;
+ if (duration >= GHES_ESTATUS_IN_CACHE_MAX_NSEC) {
+ slot = i;
+ slot_cache = cache;
+ break;
+ }
+ count = atomic_read(&cache->count);
+ period = duration;
+ do_div(period, (count + 1));
+ if (period > max_period) {
+ max_period = period;
+ slot = i;
+ slot_cache = cache;
+ }
+ }
+ /* new_cache must be put into array after its contents are written */
+ smp_wmb();
+ if (slot != -1 && cmpxchg(ghes_estatus_caches + slot,
+ slot_cache, new_cache) == slot_cache) {
+ if (slot_cache)
+ call_rcu(&slot_cache->rcu, ghes_estatus_cache_rcu_free);
+ } else
+ ghes_estatus_cache_free(new_cache);
+ rcu_read_unlock();
}
static int ghes_proc(struct ghes *ghes)
@@ -387,9 +648,11 @@
rc = ghes_read_estatus(ghes, 0);
if (rc)
goto out;
- ghes_print_estatus(NULL, ghes);
- ghes_do_proc(ghes);
-
+ if (!ghes_estatus_cached(ghes->estatus)) {
+ if (ghes_print_estatus(NULL, ghes->generic, ghes->estatus))
+ ghes_estatus_cache_add(ghes->generic, ghes->estatus);
+ }
+ ghes_do_proc(ghes->estatus);
out:
ghes_clear_estatus(ghes);
return 0;
@@ -447,6 +710,45 @@
return ret;
}
+static void ghes_proc_in_irq(struct irq_work *irq_work)
+{
+ struct llist_node *llnode, *next, *tail = NULL;
+ struct ghes_estatus_node *estatus_node;
+ struct acpi_hest_generic *generic;
+ struct acpi_hest_generic_status *estatus;
+ u32 len, node_len;
+
+ /*
+ * Because the time order of estatus in list is reversed,
+ * revert it back to proper order.
+ */
+ llnode = llist_del_all(&ghes_estatus_llist);
+ while (llnode) {
+ next = llnode->next;
+ llnode->next = tail;
+ tail = llnode;
+ llnode = next;
+ }
+ llnode = tail;
+ while (llnode) {
+ next = llnode->next;
+ estatus_node = llist_entry(llnode, struct ghes_estatus_node,
+ llnode);
+ estatus = GHES_ESTATUS_FROM_NODE(estatus_node);
+ len = apei_estatus_len(estatus);
+ node_len = GHES_ESTATUS_NODE_LEN(len);
+ ghes_do_proc(estatus);
+ if (!ghes_estatus_cached(estatus)) {
+ generic = estatus_node->generic;
+ if (ghes_print_estatus(NULL, generic, estatus))
+ ghes_estatus_cache_add(generic, estatus);
+ }
+ gen_pool_free(ghes_estatus_pool, (unsigned long)estatus_node,
+ node_len);
+ llnode = next;
+ }
+}
+
static int ghes_notify_nmi(struct notifier_block *this,
unsigned long cmd, void *data)
{
@@ -476,7 +778,8 @@
if (sev_global >= GHES_SEV_PANIC) {
oops_begin();
- ghes_print_estatus(KERN_EMERG HW_ERR, ghes_global);
+ __ghes_print_estatus(KERN_EMERG HW_ERR, ghes_global->generic,
+ ghes_global->estatus);
/* reboot to log the error! */
if (panic_timeout == 0)
panic_timeout = ghes_panic_timeout;
@@ -484,12 +787,34 @@
}
list_for_each_entry_rcu(ghes, &ghes_nmi, list) {
+#ifdef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG
+ u32 len, node_len;
+ struct ghes_estatus_node *estatus_node;
+ struct acpi_hest_generic_status *estatus;
+#endif
if (!(ghes->flags & GHES_TO_CLEAR))
continue;
- /* Do not print estatus because printk is not NMI safe */
- ghes_do_proc(ghes);
+#ifdef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG
+ if (ghes_estatus_cached(ghes->estatus))
+ goto next;
+ /* Save estatus for further processing in IRQ context */
+ len = apei_estatus_len(ghes->estatus);
+ node_len = GHES_ESTATUS_NODE_LEN(len);
+ estatus_node = (void *)gen_pool_alloc(ghes_estatus_pool,
+ node_len);
+ if (estatus_node) {
+ estatus_node->generic = ghes->generic;
+ estatus = GHES_ESTATUS_FROM_NODE(estatus_node);
+ memcpy(estatus, ghes->estatus, len);
+ llist_add(&estatus_node->llnode, &ghes_estatus_llist);
+ }
+next:
+#endif
ghes_clear_estatus(ghes);
}
+#ifdef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG
+ irq_work_queue(&ghes_proc_irq_work);
+#endif
out:
raw_spin_unlock(&ghes_nmi_lock);
@@ -504,10 +829,26 @@
.notifier_call = ghes_notify_nmi,
};
+static unsigned long ghes_esource_prealloc_size(
+ const struct acpi_hest_generic *generic)
+{
+ unsigned long block_length, prealloc_records, prealloc_size;
+
+ block_length = min_t(unsigned long, generic->error_block_length,
+ GHES_ESTATUS_MAX_SIZE);
+ prealloc_records = max_t(unsigned long,
+ generic->records_to_preallocate, 1);
+ prealloc_size = min_t(unsigned long, block_length * prealloc_records,
+ GHES_ESOURCE_PREALLOC_MAX_SIZE);
+
+ return prealloc_size;
+}
+
static int __devinit ghes_probe(struct platform_device *ghes_dev)
{
struct acpi_hest_generic *generic;
struct ghes *ghes = NULL;
+ unsigned long len;
int rc = -EINVAL;
generic = *(struct acpi_hest_generic **)ghes_dev->dev.platform_data;
@@ -573,6 +914,8 @@
mutex_unlock(&ghes_list_mutex);
break;
case ACPI_HEST_NOTIFY_NMI:
+ len = ghes_esource_prealloc_size(generic);
+ ghes_estatus_pool_expand(len);
mutex_lock(&ghes_list_mutex);
if (list_empty(&ghes_nmi))
register_die_notifier(&ghes_notifier_nmi);
@@ -597,6 +940,7 @@
{
struct ghes *ghes;
struct acpi_hest_generic *generic;
+ unsigned long len;
ghes = platform_get_drvdata(ghes_dev);
generic = ghes->generic;
@@ -627,6 +971,8 @@
* freed after NMI handler finishes.
*/
synchronize_rcu();
+ len = ghes_esource_prealloc_size(generic);
+ ghes_estatus_pool_shrink(len);
break;
default:
BUG();
@@ -662,15 +1008,43 @@
return -EINVAL;
}
+ if (ghes_disable) {
+ pr_info(GHES_PFX "GHES is not enabled!\n");
+ return -EINVAL;
+ }
+
+ init_irq_work(&ghes_proc_irq_work, ghes_proc_in_irq);
+
rc = ghes_ioremap_init();
if (rc)
goto err;
- rc = platform_driver_register(&ghes_platform_driver);
+ rc = ghes_estatus_pool_init();
if (rc)
goto err_ioremap_exit;
+ rc = ghes_estatus_pool_expand(GHES_ESTATUS_CACHE_AVG_SIZE *
+ GHES_ESTATUS_CACHE_ALLOCED_MAX);
+ if (rc)
+ goto err_pool_exit;
+
+ rc = platform_driver_register(&ghes_platform_driver);
+ if (rc)
+ goto err_pool_exit;
+
+ rc = apei_osc_setup();
+ if (rc == 0 && osc_sb_apei_support_acked)
+ pr_info(GHES_PFX "APEI firmware first mode is enabled by APEI bit and WHEA _OSC.\n");
+ else if (rc == 0 && !osc_sb_apei_support_acked)
+ pr_info(GHES_PFX "APEI firmware first mode is enabled by WHEA _OSC.\n");
+ else if (rc && osc_sb_apei_support_acked)
+ pr_info(GHES_PFX "APEI firmware first mode is enabled by APEI bit.\n");
+ else
+ pr_info(GHES_PFX "Failed to enable APEI firmware first mode.\n");
+
return 0;
+err_pool_exit:
+ ghes_estatus_pool_exit();
err_ioremap_exit:
ghes_ioremap_exit();
err:
@@ -680,6 +1054,7 @@
static void __exit ghes_exit(void)
{
platform_driver_unregister(&ghes_platform_driver);
+ ghes_estatus_pool_exit();
ghes_ioremap_exit();
}
diff --git a/drivers/acpi/apei/hest.c b/drivers/acpi/apei/hest.c
index 181bc2f..05fee06 100644
--- a/drivers/acpi/apei/hest.c
+++ b/drivers/acpi/apei/hest.c
@@ -231,16 +231,17 @@
goto err;
}
- rc = apei_hest_parse(hest_parse_ghes_count, &ghes_count);
- if (rc)
- goto err;
-
- rc = hest_ghes_dev_register(ghes_count);
- if (!rc) {
- pr_info(HEST_PFX "Table parsing has been initialized.\n");
- return;
+ if (!ghes_disable) {
+ rc = apei_hest_parse(hest_parse_ghes_count, &ghes_count);
+ if (rc)
+ goto err;
+ rc = hest_ghes_dev_register(ghes_count);
+ if (rc)
+ goto err;
}
+ pr_info(HEST_PFX "Table parsing has been initialized.\n");
+ return;
err:
hest_disable = 1;
}
diff --git a/drivers/acpi/battery.c b/drivers/acpi/battery.c
index 2c66135..87c0a8d 100644
--- a/drivers/acpi/battery.c
+++ b/drivers/acpi/battery.c
@@ -55,6 +55,9 @@
#define ACPI_BATTERY_NOTIFY_INFO 0x81
#define ACPI_BATTERY_NOTIFY_THRESHOLD 0x82
+/* Battery power unit: 0 means mW, 1 means mA */
+#define ACPI_BATTERY_POWER_UNIT_MA 1
+
#define _COMPONENT ACPI_BATTERY_COMPONENT
ACPI_MODULE_NAME("battery");
@@ -91,11 +94,6 @@
enum {
ACPI_BATTERY_ALARM_PRESENT,
ACPI_BATTERY_XINFO_PRESENT,
- /* For buggy DSDTs that report negative 16-bit values for either
- * charging or discharging current and/or report 0 as 65536
- * due to bad math.
- */
- ACPI_BATTERY_QUIRK_SIGNED16_CURRENT,
ACPI_BATTERY_QUIRK_PERCENTAGE_CAPACITY,
};
@@ -301,7 +299,8 @@
#ifdef CONFIG_ACPI_PROCFS_POWER
inline char *acpi_battery_units(struct acpi_battery *battery)
{
- return (battery->power_unit)?"mA":"mW";
+ return (battery->power_unit == ACPI_BATTERY_POWER_UNIT_MA) ?
+ "mA" : "mW";
}
#endif
@@ -461,9 +460,17 @@
battery->update_time = jiffies;
kfree(buffer.pointer);
- if (test_bit(ACPI_BATTERY_QUIRK_SIGNED16_CURRENT, &battery->flags) &&
- battery->rate_now != -1)
+ /* For buggy DSDTs that report negative 16-bit values for either
+ * charging or discharging current and/or report 0 as 65536
+ * due to bad math.
+ */
+ if (battery->power_unit == ACPI_BATTERY_POWER_UNIT_MA &&
+ battery->rate_now != ACPI_BATTERY_VALUE_UNKNOWN &&
+ (s16)(battery->rate_now) < 0) {
battery->rate_now = abs((s16)battery->rate_now);
+ printk_once(KERN_WARNING FW_BUG "battery: (dis)charge rate"
+ " invalid.\n");
+ }
if (test_bit(ACPI_BATTERY_QUIRK_PERCENTAGE_CAPACITY, &battery->flags)
&& battery->capacity_now >= 0 && battery->capacity_now <= 100)
@@ -544,7 +551,7 @@
{
int result;
- if (battery->power_unit) {
+ if (battery->power_unit == ACPI_BATTERY_POWER_UNIT_MA) {
battery->bat.properties = charge_battery_props;
battery->bat.num_properties =
ARRAY_SIZE(charge_battery_props);
@@ -566,18 +573,16 @@
static void sysfs_remove_battery(struct acpi_battery *battery)
{
- if (!battery->bat.dev)
+ mutex_lock(&battery->lock);
+ if (!battery->bat.dev) {
+ mutex_unlock(&battery->lock);
return;
+ }
+
device_remove_file(battery->bat.dev, &alarm_attr);
power_supply_unregister(&battery->bat);
battery->bat.dev = NULL;
-}
-
-static void acpi_battery_quirks(struct acpi_battery *battery)
-{
- if (dmi_name_in_vendors("Acer") && battery->power_unit) {
- set_bit(ACPI_BATTERY_QUIRK_SIGNED16_CURRENT, &battery->flags);
- }
+ mutex_unlock(&battery->lock);
}
/*
@@ -592,7 +597,7 @@
*
* Handle this correctly so that they won't break userspace.
*/
-static void acpi_battery_quirks2(struct acpi_battery *battery)
+static void acpi_battery_quirks(struct acpi_battery *battery)
{
if (test_bit(ACPI_BATTERY_QUIRK_PERCENTAGE_CAPACITY, &battery->flags))
return ;
@@ -623,13 +628,15 @@
result = acpi_battery_get_info(battery);
if (result)
return result;
- acpi_battery_quirks(battery);
acpi_battery_init_alarm(battery);
}
- if (!battery->bat.dev)
- sysfs_add_battery(battery);
+ if (!battery->bat.dev) {
+ result = sysfs_add_battery(battery);
+ if (result)
+ return result;
+ }
result = acpi_battery_get_state(battery);
- acpi_battery_quirks2(battery);
+ acpi_battery_quirks(battery);
return result;
}
@@ -863,7 +870,7 @@
}, \
}
-static struct battery_file {
+static const struct battery_file {
struct file_operations ops;
mode_t mode;
const char *name;
@@ -948,9 +955,12 @@
struct acpi_battery *battery = container_of(nb, struct acpi_battery,
pm_nb);
switch (mode) {
+ case PM_POST_HIBERNATION:
case PM_POST_SUSPEND:
- sysfs_remove_battery(battery);
- sysfs_add_battery(battery);
+ if (battery->bat.dev) {
+ sysfs_remove_battery(battery);
+ sysfs_add_battery(battery);
+ }
break;
}
@@ -975,25 +985,33 @@
if (ACPI_SUCCESS(acpi_get_handle(battery->device->handle,
"_BIX", &handle)))
set_bit(ACPI_BATTERY_XINFO_PRESENT, &battery->flags);
- acpi_battery_update(battery);
+ result = acpi_battery_update(battery);
+ if (result)
+ goto fail;
#ifdef CONFIG_ACPI_PROCFS_POWER
result = acpi_battery_add_fs(device);
#endif
- if (!result) {
- printk(KERN_INFO PREFIX "%s Slot [%s] (battery %s)\n",
- ACPI_BATTERY_DEVICE_NAME, acpi_device_bid(device),
- device->status.battery_present ? "present" : "absent");
- } else {
+ if (result) {
#ifdef CONFIG_ACPI_PROCFS_POWER
acpi_battery_remove_fs(device);
#endif
- kfree(battery);
+ goto fail;
}
+ printk(KERN_INFO PREFIX "%s Slot [%s] (battery %s)\n",
+ ACPI_BATTERY_DEVICE_NAME, acpi_device_bid(device),
+ device->status.battery_present ? "present" : "absent");
+
battery->pm_nb.notifier_call = battery_notify;
register_pm_notifier(&battery->pm_nb);
return result;
+
+fail:
+ sysfs_remove_battery(battery);
+ mutex_destroy(&battery->lock);
+ kfree(battery);
+ return result;
}
static int acpi_battery_remove(struct acpi_device *device, int type)
diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c
index d1e06c1..437ddbf 100644
--- a/drivers/acpi/bus.c
+++ b/drivers/acpi/bus.c
@@ -39,6 +39,7 @@
#include <linux/pci.h>
#include <acpi/acpi_bus.h>
#include <acpi/acpi_drivers.h>
+#include <acpi/apei.h>
#include <linux/dmi.h>
#include <linux/suspend.h>
@@ -519,6 +520,7 @@
}
EXPORT_SYMBOL(acpi_run_osc);
+bool osc_sb_apei_support_acked;
static u8 sb_uuid_str[] = "0811B06E-4A27-44F9-8D60-3CBBC22E7B48";
static void acpi_bus_osc_support(void)
{
@@ -541,11 +543,19 @@
#if defined(CONFIG_ACPI_PROCESSOR) || defined(CONFIG_ACPI_PROCESSOR_MODULE)
capbuf[OSC_SUPPORT_TYPE] |= OSC_SB_PPC_OST_SUPPORT;
#endif
+
+ if (!ghes_disable)
+ capbuf[OSC_SUPPORT_TYPE] |= OSC_SB_APEI_SUPPORT;
if (ACPI_FAILURE(acpi_get_handle(NULL, "\\_SB", &handle)))
return;
- if (ACPI_SUCCESS(acpi_run_osc(handle, &context)))
+ if (ACPI_SUCCESS(acpi_run_osc(handle, &context))) {
+ u32 *capbuf_ret = context.ret.pointer;
+ if (context.ret.length > OSC_SUPPORT_TYPE)
+ osc_sb_apei_support_acked =
+ capbuf_ret[OSC_SUPPORT_TYPE] & OSC_SB_APEI_SUPPORT;
kfree(context.ret.pointer);
- /* do we need to check the returned cap? Sounds no */
+ }
+ /* do we need to check other returned cap? Sounds no */
}
/* --------------------------------------------------------------------------
diff --git a/drivers/acpi/dock.c b/drivers/acpi/dock.c
index 1864ad3..19a6113 100644
--- a/drivers/acpi/dock.c
+++ b/drivers/acpi/dock.c
@@ -77,7 +77,7 @@
struct list_head list;
struct list_head hotplug_list;
acpi_handle handle;
- struct acpi_dock_ops *ops;
+ const struct acpi_dock_ops *ops;
void *context;
};
@@ -589,7 +589,7 @@
* the dock driver after _DCK is executed.
*/
int
-register_hotplug_dock_device(acpi_handle handle, struct acpi_dock_ops *ops,
+register_hotplug_dock_device(acpi_handle handle, const struct acpi_dock_ops *ops,
void *context)
{
struct dock_dependent_device *dd;
diff --git a/drivers/acpi/ec_sys.c b/drivers/acpi/ec_sys.c
index 05b4420..22f918b 100644
--- a/drivers/acpi/ec_sys.c
+++ b/drivers/acpi/ec_sys.c
@@ -92,7 +92,7 @@
return count;
}
-static struct file_operations acpi_ec_io_ops = {
+static const struct file_operations acpi_ec_io_ops = {
.owner = THIS_MODULE,
.open = acpi_ec_open_io,
.read = acpi_ec_read_io,
diff --git a/drivers/acpi/fan.c b/drivers/acpi/fan.c
index 467479f..0f0356c 100644
--- a/drivers/acpi/fan.c
+++ b/drivers/acpi/fan.c
@@ -110,7 +110,7 @@
return result;
}
-static struct thermal_cooling_device_ops fan_cooling_ops = {
+static const struct thermal_cooling_device_ops fan_cooling_ops = {
.get_max_state = fan_get_max_state,
.get_cur_state = fan_get_cur_state,
.set_cur_state = fan_set_cur_state,
diff --git a/drivers/acpi/osl.c b/drivers/acpi/osl.c
index 372f9b7..fa32f58 100644
--- a/drivers/acpi/osl.c
+++ b/drivers/acpi/osl.c
@@ -155,7 +155,7 @@
{
if (!strcmp("Linux", interface)) {
- printk(KERN_NOTICE FW_BUG PREFIX
+ printk_once(KERN_NOTICE FW_BUG PREFIX
"BIOS _OSI(Linux) query %s%s\n",
osi_linux.enable ? "honored" : "ignored",
osi_linux.cmdline ? " via cmdline" :
@@ -237,8 +237,23 @@
#endif
}
+#ifdef CONFIG_KEXEC
+static unsigned long acpi_rsdp;
+static int __init setup_acpi_rsdp(char *arg)
+{
+ acpi_rsdp = simple_strtoul(arg, NULL, 16);
+ return 0;
+}
+early_param("acpi_rsdp", setup_acpi_rsdp);
+#endif
+
acpi_physical_address __init acpi_os_get_root_pointer(void)
{
+#ifdef CONFIG_KEXEC
+ if (acpi_rsdp)
+ return acpi_rsdp;
+#endif
+
if (efi_enabled) {
if (efi.acpi20 != EFI_INVALID_TABLE_ADDR)
return efi.acpi20;
@@ -1083,7 +1098,13 @@
bool enable;
};
-static struct osi_setup_entry __initdata osi_setup_entries[OSI_STRING_ENTRIES_MAX];
+static struct osi_setup_entry __initdata
+ osi_setup_entries[OSI_STRING_ENTRIES_MAX] = {
+ {"Module Device", true},
+ {"Processor Device", true},
+ {"3.0 _SCP Extensions", true},
+ {"Processor Aggregator Device", true},
+};
void __init acpi_osi_setup(char *str)
{
diff --git a/drivers/acpi/pci_irq.c b/drivers/acpi/pci_irq.c
index f907cfb..7f9eba9 100644
--- a/drivers/acpi/pci_irq.c
+++ b/drivers/acpi/pci_irq.c
@@ -303,6 +303,61 @@
/* --------------------------------------------------------------------------
PCI Interrupt Routing Support
-------------------------------------------------------------------------- */
+#ifdef CONFIG_X86_IO_APIC
+extern int noioapicquirk;
+extern int noioapicreroute;
+
+static int bridge_has_boot_interrupt_variant(struct pci_bus *bus)
+{
+ struct pci_bus *bus_it;
+
+ for (bus_it = bus ; bus_it ; bus_it = bus_it->parent) {
+ if (!bus_it->self)
+ return 0;
+ if (bus_it->self->irq_reroute_variant)
+ return bus_it->self->irq_reroute_variant;
+ }
+ return 0;
+}
+
+/*
+ * Some chipsets (e.g. Intel 6700PXH) generate a legacy INTx when the IRQ
+ * entry in the chipset's IO-APIC is masked (as, e.g. the RT kernel does
+ * during interrupt handling). When this INTx generation cannot be disabled,
+ * we reroute these interrupts to their legacy equivalent to get rid of
+ * spurious interrupts.
+ */
+static int acpi_reroute_boot_interrupt(struct pci_dev *dev,
+ struct acpi_prt_entry *entry)
+{
+ if (noioapicquirk || noioapicreroute) {
+ return 0;
+ } else {
+ switch (bridge_has_boot_interrupt_variant(dev->bus)) {
+ case 0:
+ /* no rerouting necessary */
+ return 0;
+ case INTEL_IRQ_REROUTE_VARIANT:
+ /*
+ * Remap according to INTx routing table in 6700PXH
+ * specs, intel order number 302628-002, section
+ * 2.15.2. Other chipsets (80332, ...) have the same
+ * mapping and are handled here as well.
+ */
+ dev_info(&dev->dev, "PCI IRQ %d -> rerouted to legacy "
+ "IRQ %d\n", entry->index,
+ (entry->index % 4) + 16);
+ entry->index = (entry->index % 4) + 16;
+ return 1;
+ default:
+ dev_warn(&dev->dev, "Cannot reroute IRQ %d to legacy "
+ "IRQ: unknown mapping\n", entry->index);
+ return -1;
+ }
+ }
+}
+#endif /* CONFIG_X86_IO_APIC */
+
static struct acpi_prt_entry *acpi_pci_irq_lookup(struct pci_dev *dev, int pin)
{
struct acpi_prt_entry *entry;
@@ -311,6 +366,9 @@
entry = acpi_pci_irq_find_prt_entry(dev, pin);
if (entry) {
+#ifdef CONFIG_X86_IO_APIC
+ acpi_reroute_boot_interrupt(dev, entry);
+#endif /* CONFIG_X86_IO_APIC */
ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Found %s[%c] _PRT entry\n",
pci_name(dev), pin_name(pin)));
return entry;
diff --git a/drivers/acpi/pci_root.c b/drivers/acpi/pci_root.c
index d06078d..2672c79 100644
--- a/drivers/acpi/pci_root.c
+++ b/drivers/acpi/pci_root.c
@@ -485,7 +485,8 @@
root->secondary.end = 0xFF;
printk(KERN_WARNING FW_BUG PREFIX
"no secondary bus range in _CRS\n");
- status = acpi_evaluate_integer(device->handle, METHOD_NAME__BBN, NULL, &bus);
+ status = acpi_evaluate_integer(device->handle, METHOD_NAME__BBN,
+ NULL, &bus);
if (ACPI_SUCCESS(status))
root->secondary.start = bus;
else if (status == AE_NOT_FOUND)
diff --git a/drivers/acpi/processor_thermal.c b/drivers/acpi/processor_thermal.c
index 79cb653..870550d 100644
--- a/drivers/acpi/processor_thermal.c
+++ b/drivers/acpi/processor_thermal.c
@@ -244,7 +244,7 @@
return result;
}
-struct thermal_cooling_device_ops processor_cooling_ops = {
+const struct thermal_cooling_device_ops processor_cooling_ops = {
.get_max_state = processor_get_max_state,
.get_cur_state = processor_get_cur_state,
.set_cur_state = processor_set_cur_state,
diff --git a/drivers/acpi/sbs.c b/drivers/acpi/sbs.c
index 50658ff..6e36d0c 100644
--- a/drivers/acpi/sbs.c
+++ b/drivers/acpi/sbs.c
@@ -130,6 +130,9 @@
#define to_acpi_sbs(x) container_of(x, struct acpi_sbs, charger)
+static int acpi_sbs_remove(struct acpi_device *device, int type);
+static int acpi_battery_get_state(struct acpi_battery *battery);
+
static inline int battery_scale(int log)
{
int scale = 1;
@@ -195,6 +198,8 @@
if ((!battery->present) && psp != POWER_SUPPLY_PROP_PRESENT)
return -ENODEV;
+
+ acpi_battery_get_state(battery);
switch (psp) {
case POWER_SUPPLY_PROP_STATUS:
if (battery->rate_now < 0)
@@ -225,11 +230,17 @@
case POWER_SUPPLY_PROP_POWER_NOW:
val->intval = abs(battery->rate_now) *
acpi_battery_ipscale(battery) * 1000;
+ val->intval *= (acpi_battery_mode(battery)) ?
+ (battery->voltage_now *
+ acpi_battery_vscale(battery) / 1000) : 1;
break;
case POWER_SUPPLY_PROP_CURRENT_AVG:
case POWER_SUPPLY_PROP_POWER_AVG:
val->intval = abs(battery->rate_avg) *
acpi_battery_ipscale(battery) * 1000;
+ val->intval *= (acpi_battery_mode(battery)) ?
+ (battery->voltage_now *
+ acpi_battery_vscale(battery) / 1000) : 1;
break;
case POWER_SUPPLY_PROP_CAPACITY:
val->intval = battery->state_of_charge;
@@ -903,8 +914,6 @@
}
}
-static int acpi_sbs_remove(struct acpi_device *device, int type);
-
static int acpi_sbs_add(struct acpi_device *device)
{
struct acpi_sbs *sbs;
diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c
index 6c94960..3ed80b2 100644
--- a/drivers/acpi/sleep.c
+++ b/drivers/acpi/sleep.c
@@ -428,6 +428,22 @@
DMI_MATCH(DMI_PRODUCT_NAME, "1000 Series"),
},
},
+ {
+ .callback = init_old_suspend_ordering,
+ .ident = "Asus A8N-SLI DELUXE",
+ .matches = {
+ DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."),
+ DMI_MATCH(DMI_BOARD_NAME, "A8N-SLI DELUXE"),
+ },
+ },
+ {
+ .callback = init_old_suspend_ordering,
+ .ident = "Asus A8N-SLI Premium",
+ .matches = {
+ DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."),
+ DMI_MATCH(DMI_BOARD_NAME, "A8N-SLI Premium"),
+ },
+ },
{},
};
#endif /* CONFIG_SUSPEND */
diff --git a/drivers/acpi/sysfs.c b/drivers/acpi/sysfs.c
index 77255f2..c538d0e 100644
--- a/drivers/acpi/sysfs.c
+++ b/drivers/acpi/sysfs.c
@@ -149,12 +149,12 @@
return result;
}
-static struct kernel_param_ops param_ops_debug_layer = {
+static const struct kernel_param_ops param_ops_debug_layer = {
.set = param_set_uint,
.get = param_get_debug_layer,
};
-static struct kernel_param_ops param_ops_debug_level = {
+static const struct kernel_param_ops param_ops_debug_level = {
.set = param_set_uint,
.get = param_get_debug_level,
};
diff --git a/drivers/acpi/thermal.c b/drivers/acpi/thermal.c
index 2607e17..48fbc64 100644
--- a/drivers/acpi/thermal.c
+++ b/drivers/acpi/thermal.c
@@ -812,7 +812,7 @@
thermal_zone_unbind_cooling_device);
}
-static struct thermal_zone_device_ops acpi_thermal_zone_ops = {
+static const struct thermal_zone_device_ops acpi_thermal_zone_ops = {
.bind = acpi_thermal_bind_cooling_device,
.unbind = acpi_thermal_unbind_cooling_device,
.get_temp = thermal_get_temp,
diff --git a/drivers/acpi/video.c b/drivers/acpi/video.c
index ada4b4d..08a44b5 100644
--- a/drivers/acpi/video.c
+++ b/drivers/acpi/video.c
@@ -307,7 +307,7 @@
return acpi_video_device_lcd_set_level(video, level);
}
-static struct thermal_cooling_device_ops video_cooling_ops = {
+static const struct thermal_cooling_device_ops video_cooling_ops = {
.get_max_state = video_get_max_state,
.get_cur_state = video_get_cur_state,
.set_cur_state = video_set_cur_state,
diff --git a/drivers/ata/libata-acpi.c b/drivers/ata/libata-acpi.c
index e0a5b55..bb7c5f1 100644
--- a/drivers/ata/libata-acpi.c
+++ b/drivers/ata/libata-acpi.c
@@ -218,12 +218,12 @@
ata_acpi_uevent(dev->link->ap, dev, event);
}
-static struct acpi_dock_ops ata_acpi_dev_dock_ops = {
+static const struct acpi_dock_ops ata_acpi_dev_dock_ops = {
.handler = ata_acpi_dev_notify_dock,
.uevent = ata_acpi_dev_uevent,
};
-static struct acpi_dock_ops ata_acpi_ap_dock_ops = {
+static const struct acpi_dock_ops ata_acpi_ap_dock_ops = {
.handler = ata_acpi_ap_notify_dock,
.uevent = ata_acpi_ap_uevent,
};
diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig
index 49502bc..423fd56 100644
--- a/drivers/char/Kconfig
+++ b/drivers/char/Kconfig
@@ -616,5 +616,16 @@
Enables userspace clients to read and write to some packet SMD
ports via device interface for MSM chipset.
+config TILE_SROM
+ bool "Character-device access via hypervisor to the Tilera SPI ROM"
+ depends on TILE
+ default y
+ ---help---
+ This device provides character-level read-write access
+ to the SROM, typically via the "0", "1", and "2" devices
+ in /dev/srom/. The Tilera hypervisor makes the flash
+ device appear much like a simple EEPROM, and knows
+ how to partition a single ROM for multiple purposes.
+
endmenu
diff --git a/drivers/char/Makefile b/drivers/char/Makefile
index 7a00672..32762ba 100644
--- a/drivers/char/Makefile
+++ b/drivers/char/Makefile
@@ -63,3 +63,5 @@
obj-$(CONFIG_JS_RTC) += js-rtc.o
js-rtc-y = rtc.o
+
+obj-$(CONFIG_TILE_SROM) += tile-srom.o
diff --git a/drivers/char/ramoops.c b/drivers/char/ramoops.c
index fca0c51..810aff9 100644
--- a/drivers/char/ramoops.c
+++ b/drivers/char/ramoops.c
@@ -147,6 +147,14 @@
cxt->phys_addr = pdata->mem_address;
cxt->record_size = pdata->record_size;
cxt->dump_oops = pdata->dump_oops;
+ /*
+ * Update the module parameter variables as well so they are visible
+ * through /sys/module/ramoops/parameters/
+ */
+ mem_size = pdata->mem_size;
+ mem_address = pdata->mem_address;
+ record_size = pdata->record_size;
+ dump_oops = pdata->dump_oops;
if (!request_mem_region(cxt->phys_addr, cxt->size, "ramoops")) {
pr_err("request mem region failed\n");
diff --git a/drivers/char/tile-srom.c b/drivers/char/tile-srom.c
new file mode 100644
index 0000000..cf3ee00
--- /dev/null
+++ b/drivers/char/tile-srom.c
@@ -0,0 +1,481 @@
+/*
+ * Copyright 2011 Tilera Corporation. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for
+ * more details.
+ *
+ * SPI Flash ROM driver
+ *
+ * This source code is derived from code provided in "Linux Device
+ * Drivers, Third Edition", by Jonathan Corbet, Alessandro Rubini, and
+ * Greg Kroah-Hartman, published by O'Reilly Media, Inc.
+ */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/init.h>
+#include <linux/kernel.h> /* printk() */
+#include <linux/slab.h> /* kmalloc() */
+#include <linux/fs.h> /* everything... */
+#include <linux/errno.h> /* error codes */
+#include <linux/types.h> /* size_t */
+#include <linux/proc_fs.h>
+#include <linux/fcntl.h> /* O_ACCMODE */
+#include <linux/aio.h>
+#include <linux/pagemap.h>
+#include <linux/hugetlb.h>
+#include <linux/uaccess.h>
+#include <linux/platform_device.h>
+#include <hv/hypervisor.h>
+#include <linux/ioctl.h>
+#include <linux/cdev.h>
+#include <linux/delay.h>
+#include <hv/drv_srom_intf.h>
+
+/*
+ * Size of our hypervisor I/O requests. We break up large transfers
+ * so that we don't spend large uninterrupted spans of time in the
+ * hypervisor. Erasing an SROM sector takes a significant fraction of
+ * a second, so if we allowed the user to, say, do one I/O to write the
+ * entire ROM, we'd get soft lockup timeouts, or worse.
+ */
+#define SROM_CHUNK_SIZE ((size_t)4096)
+
+/*
+ * When hypervisor is busy (e.g. erasing), poll the status periodically.
+ */
+
+/*
+ * Interval to poll the state in msec
+ */
+#define SROM_WAIT_TRY_INTERVAL 20
+
+/*
+ * Maximum times to poll the state
+ */
+#define SROM_MAX_WAIT_TRY_TIMES 1000
+
+struct srom_dev {
+ int hv_devhdl; /* Handle for hypervisor device */
+ u32 total_size; /* Size of this device */
+ u32 sector_size; /* Size of a sector */
+ u32 page_size; /* Size of a page */
+ struct mutex lock; /* Allow only one accessor at a time */
+};
+
+static int srom_major; /* Dynamic major by default */
+module_param(srom_major, int, 0);
+MODULE_AUTHOR("Tilera Corporation");
+MODULE_LICENSE("GPL");
+
+static int srom_devs; /* Number of SROM partitions */
+static struct cdev srom_cdev;
+static struct class *srom_class;
+static struct srom_dev *srom_devices;
+
+/*
+ * Handle calling the hypervisor and managing EAGAIN/EBUSY.
+ */
+
+static ssize_t _srom_read(int hv_devhdl, void *buf,
+ loff_t off, size_t count)
+{
+ int retval, retries = SROM_MAX_WAIT_TRY_TIMES;
+ for (;;) {
+ retval = hv_dev_pread(hv_devhdl, 0, (HV_VirtAddr)buf,
+ count, off);
+ if (retval >= 0)
+ return retval;
+ if (retval == HV_EAGAIN)
+ continue;
+ if (retval == HV_EBUSY && --retries > 0) {
+ msleep(SROM_WAIT_TRY_INTERVAL);
+ continue;
+ }
+ pr_err("_srom_read: error %d\n", retval);
+ return -EIO;
+ }
+}
+
+static ssize_t _srom_write(int hv_devhdl, const void *buf,
+ loff_t off, size_t count)
+{
+ int retval, retries = SROM_MAX_WAIT_TRY_TIMES;
+ for (;;) {
+ retval = hv_dev_pwrite(hv_devhdl, 0, (HV_VirtAddr)buf,
+ count, off);
+ if (retval >= 0)
+ return retval;
+ if (retval == HV_EAGAIN)
+ continue;
+ if (retval == HV_EBUSY && --retries > 0) {
+ msleep(SROM_WAIT_TRY_INTERVAL);
+ continue;
+ }
+ pr_err("_srom_write: error %d\n", retval);
+ return -EIO;
+ }
+}
+
+/**
+ * srom_open() - Device open routine.
+ * @inode: Inode for this device.
+ * @filp: File for this specific open of the device.
+ *
+ * Returns zero, or an error code.
+ */
+static int srom_open(struct inode *inode, struct file *filp)
+{
+ filp->private_data = &srom_devices[iminor(inode)];
+ return 0;
+}
+
+
+/**
+ * srom_release() - Device release routine.
+ * @inode: Inode for this device.
+ * @filp: File for this specific open of the device.
+ *
+ * Returns zero, or an error code.
+ */
+static int srom_release(struct inode *inode, struct file *filp)
+{
+ struct srom_dev *srom = filp->private_data;
+ char dummy;
+
+ /* Make sure we've flushed anything written to the ROM. */
+ mutex_lock(&srom->lock);
+ if (srom->hv_devhdl >= 0)
+ _srom_write(srom->hv_devhdl, &dummy, SROM_FLUSH_OFF, 1);
+ mutex_unlock(&srom->lock);
+
+ filp->private_data = NULL;
+
+ return 0;
+}
+
+
+/**
+ * srom_read() - Read data from the device.
+ * @filp: File for this specific open of the device.
+ * @buf: User's data buffer.
+ * @count: Number of bytes requested.
+ * @f_pos: File position.
+ *
+ * Returns number of bytes read, or an error code.
+ */
+static ssize_t srom_read(struct file *filp, char __user *buf,
+ size_t count, loff_t *f_pos)
+{
+ int retval = 0;
+ void *kernbuf;
+ struct srom_dev *srom = filp->private_data;
+
+ kernbuf = kmalloc(SROM_CHUNK_SIZE, GFP_KERNEL);
+ if (!kernbuf)
+ return -ENOMEM;
+
+ if (mutex_lock_interruptible(&srom->lock)) {
+ retval = -ERESTARTSYS;
+ kfree(kernbuf);
+ return retval;
+ }
+
+ while (count) {
+ int hv_retval;
+ int bytes_this_pass = min(count, SROM_CHUNK_SIZE);
+
+ hv_retval = _srom_read(srom->hv_devhdl, kernbuf,
+ *f_pos, bytes_this_pass);
+ if (hv_retval > 0) {
+ if (copy_to_user(buf, kernbuf, hv_retval) != 0) {
+ retval = -EFAULT;
+ break;
+ }
+ } else if (hv_retval <= 0) {
+ if (retval == 0)
+ retval = hv_retval;
+ break;
+ }
+
+ retval += hv_retval;
+ *f_pos += hv_retval;
+ buf += hv_retval;
+ count -= hv_retval;
+ }
+
+ mutex_unlock(&srom->lock);
+ kfree(kernbuf);
+
+ return retval;
+}
+
+/**
+ * srom_write() - Write data to the device.
+ * @filp: File for this specific open of the device.
+ * @buf: User's data buffer.
+ * @count: Number of bytes requested.
+ * @f_pos: File position.
+ *
+ * Returns number of bytes written, or an error code.
+ */
+static ssize_t srom_write(struct file *filp, const char __user *buf,
+ size_t count, loff_t *f_pos)
+{
+ int retval = 0;
+ void *kernbuf;
+ struct srom_dev *srom = filp->private_data;
+
+ kernbuf = kmalloc(SROM_CHUNK_SIZE, GFP_KERNEL);
+ if (!kernbuf)
+ return -ENOMEM;
+
+ if (mutex_lock_interruptible(&srom->lock)) {
+ retval = -ERESTARTSYS;
+ kfree(kernbuf);
+ return retval;
+ }
+
+ while (count) {
+ int hv_retval;
+ int bytes_this_pass = min(count, SROM_CHUNK_SIZE);
+
+ if (copy_from_user(kernbuf, buf, bytes_this_pass) != 0) {
+ retval = -EFAULT;
+ break;
+ }
+
+ hv_retval = _srom_write(srom->hv_devhdl, kernbuf,
+ *f_pos, bytes_this_pass);
+ if (hv_retval <= 0) {
+ if (retval == 0)
+ retval = hv_retval;
+ break;
+ }
+
+ retval += hv_retval;
+ *f_pos += hv_retval;
+ buf += hv_retval;
+ count -= hv_retval;
+ }
+
+ mutex_unlock(&srom->lock);
+ kfree(kernbuf);
+
+ return retval;
+}
+
+/* Provide our own implementation so we can use srom->total_size. */
+loff_t srom_llseek(struct file *filp, loff_t offset, int origin)
+{
+ struct srom_dev *srom = filp->private_data;
+
+ if (mutex_lock_interruptible(&srom->lock))
+ return -ERESTARTSYS;
+
+ switch (origin) {
+ case SEEK_END:
+ offset += srom->total_size;
+ break;
+ case SEEK_CUR:
+ offset += filp->f_pos;
+ break;
+ }
+
+ if (offset < 0 || offset > srom->total_size) {
+ offset = -EINVAL;
+ } else {
+ filp->f_pos = offset;
+ filp->f_version = 0;
+ }
+
+ mutex_unlock(&srom->lock);
+
+ return offset;
+}
+
+static ssize_t total_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct srom_dev *srom = dev_get_drvdata(dev);
+ return sprintf(buf, "%u\n", srom->total_size);
+}
+
+static ssize_t sector_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct srom_dev *srom = dev_get_drvdata(dev);
+ return sprintf(buf, "%u\n", srom->sector_size);
+}
+
+static ssize_t page_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct srom_dev *srom = dev_get_drvdata(dev);
+ return sprintf(buf, "%u\n", srom->page_size);
+}
+
+static struct device_attribute srom_dev_attrs[] = {
+ __ATTR(total_size, S_IRUGO, total_show, NULL),
+ __ATTR(sector_size, S_IRUGO, sector_show, NULL),
+ __ATTR(page_size, S_IRUGO, page_show, NULL),
+ __ATTR_NULL
+};
+
+static char *srom_devnode(struct device *dev, mode_t *mode)
+{
+ *mode = S_IRUGO | S_IWUSR;
+ return kasprintf(GFP_KERNEL, "srom/%s", dev_name(dev));
+}
+
+/*
+ * The fops
+ */
+static const struct file_operations srom_fops = {
+ .owner = THIS_MODULE,
+ .llseek = srom_llseek,
+ .read = srom_read,
+ .write = srom_write,
+ .open = srom_open,
+ .release = srom_release,
+};
+
+/**
+ * srom_setup_minor() - Initialize per-minor information.
+ * @srom: Per-device SROM state.
+ * @index: Device to set up.
+ */
+static int srom_setup_minor(struct srom_dev *srom, int index)
+{
+ struct device *dev;
+ int devhdl = srom->hv_devhdl;
+
+ mutex_init(&srom->lock);
+
+ if (_srom_read(devhdl, &srom->total_size,
+ SROM_TOTAL_SIZE_OFF, sizeof(srom->total_size)) < 0)
+ return -EIO;
+ if (_srom_read(devhdl, &srom->sector_size,
+ SROM_SECTOR_SIZE_OFF, sizeof(srom->sector_size)) < 0)
+ return -EIO;
+ if (_srom_read(devhdl, &srom->page_size,
+ SROM_PAGE_SIZE_OFF, sizeof(srom->page_size)) < 0)
+ return -EIO;
+
+ dev = device_create(srom_class, &platform_bus,
+ MKDEV(srom_major, index), srom, "%d", index);
+ return IS_ERR(dev) ? PTR_ERR(dev) : 0;
+}
+
+/** srom_init() - Initialize the driver's module. */
+static int srom_init(void)
+{
+ int result, i;
+ dev_t dev = MKDEV(srom_major, 0);
+
+ /*
+ * Start with a plausible number of partitions; the krealloc() call
+ * below will yield about log(srom_devs) additional allocations.
+ */
+ srom_devices = kzalloc(4 * sizeof(struct srom_dev), GFP_KERNEL);
+
+ /* Discover the number of srom partitions. */
+ for (i = 0; ; i++) {
+ int devhdl;
+ char buf[20];
+ struct srom_dev *new_srom_devices =
+ krealloc(srom_devices, (i+1) * sizeof(struct srom_dev),
+ GFP_KERNEL | __GFP_ZERO);
+ if (!new_srom_devices) {
+ result = -ENOMEM;
+ goto fail_mem;
+ }
+ srom_devices = new_srom_devices;
+ sprintf(buf, "srom/0/%d", i);
+ devhdl = hv_dev_open((HV_VirtAddr)buf, 0);
+ if (devhdl < 0) {
+ if (devhdl != HV_ENODEV)
+ pr_notice("srom/%d: hv_dev_open failed: %d.\n",
+ i, devhdl);
+ break;
+ }
+ srom_devices[i].hv_devhdl = devhdl;
+ }
+ srom_devs = i;
+
+ /* Bail out early if we have no partitions at all. */
+ if (srom_devs == 0) {
+ result = -ENODEV;
+ goto fail_mem;
+ }
+
+ /* Register our major, and accept a dynamic number. */
+ if (srom_major)
+ result = register_chrdev_region(dev, srom_devs, "srom");
+ else {
+ result = alloc_chrdev_region(&dev, 0, srom_devs, "srom");
+ srom_major = MAJOR(dev);
+ }
+ if (result < 0)
+ goto fail_mem;
+
+ /* Register a character device. */
+ cdev_init(&srom_cdev, &srom_fops);
+ srom_cdev.owner = THIS_MODULE;
+ srom_cdev.ops = &srom_fops;
+ result = cdev_add(&srom_cdev, dev, srom_devs);
+ if (result < 0)
+ goto fail_chrdev;
+
+ /* Create a sysfs class. */
+ srom_class = class_create(THIS_MODULE, "srom");
+ if (IS_ERR(srom_class)) {
+ result = PTR_ERR(srom_class);
+ goto fail_cdev;
+ }
+ srom_class->dev_attrs = srom_dev_attrs;
+ srom_class->devnode = srom_devnode;
+
+ /* Do per-partition initialization */
+ for (i = 0; i < srom_devs; i++) {
+ result = srom_setup_minor(srom_devices + i, i);
+ if (result < 0)
+ goto fail_class;
+ }
+
+ return 0;
+
+fail_class:
+ for (i = 0; i < srom_devs; i++)
+ device_destroy(srom_class, MKDEV(srom_major, i));
+ class_destroy(srom_class);
+fail_cdev:
+ cdev_del(&srom_cdev);
+fail_chrdev:
+ unregister_chrdev_region(dev, srom_devs);
+fail_mem:
+ kfree(srom_devices);
+ return result;
+}
+
+/** srom_cleanup() - Clean up the driver's module. */
+static void srom_cleanup(void)
+{
+ int i;
+ for (i = 0; i < srom_devs; i++)
+ device_destroy(srom_class, MKDEV(srom_major, i));
+ class_destroy(srom_class);
+ cdev_del(&srom_cdev);
+ unregister_chrdev_region(MKDEV(srom_major, 0), srom_devs);
+ kfree(srom_devices);
+}
+
+module_init(srom_init);
+module_exit(srom_cleanup);
diff --git a/drivers/char/tpm/tpm_tis.c b/drivers/char/tpm/tpm_tis.c
index 7fc2f10..3f4051a 100644
--- a/drivers/char/tpm/tpm_tis.c
+++ b/drivers/char/tpm/tpm_tis.c
@@ -80,7 +80,7 @@
static LIST_HEAD(tis_chips);
static DEFINE_SPINLOCK(tis_lock);
-#ifdef CONFIG_PNP
+#if defined(CONFIG_PNP) && defined(CONFIG_ACPI)
static int is_itpm(struct pnp_dev *dev)
{
struct acpi_device *acpi = pnp_acpi_device(dev);
@@ -93,6 +93,11 @@
return 0;
}
+#else
+static inline int is_itpm(struct pnp_dev *dev)
+{
+ return 0;
+}
#endif
static int check_locality(struct tpm_chip *chip, int l)
diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index bf50924..d4c5423 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -25,9 +25,19 @@
DEFINE_MUTEX(cpuidle_lock);
LIST_HEAD(cpuidle_detected_devices);
-static void (*pm_idle_old)(void);
static int enabled_devices;
+static int off __read_mostly;
+static int initialized __read_mostly;
+
+int cpuidle_disabled(void)
+{
+ return off;
+}
+void disable_cpuidle(void)
+{
+ off = 1;
+}
#if defined(CONFIG_ARCH_HAS_CPU_IDLE_WAIT)
static void cpuidle_kick_cpus(void)
@@ -46,25 +56,23 @@
* cpuidle_idle_call - the main idle loop
*
* NOTE: no locks or semaphores should be used here
+ * return non-zero on failure
*/
-static void cpuidle_idle_call(void)
+int cpuidle_idle_call(void)
{
struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices);
struct cpuidle_state *target_state;
int next_state;
+ if (off)
+ return -ENODEV;
+
+ if (!initialized)
+ return -ENODEV;
+
/* check if the device is ready */
- if (!dev || !dev->enabled) {
- if (pm_idle_old)
- pm_idle_old();
- else
-#if defined(CONFIG_ARCH_HAS_DEFAULT_IDLE)
- default_idle();
-#else
- local_irq_enable();
-#endif
- return;
- }
+ if (!dev || !dev->enabled)
+ return -EBUSY;
#if 0
/* shows regressions, re-enable for 2.6.29 */
@@ -89,7 +97,7 @@
next_state = cpuidle_curr_governor->select(dev);
if (need_resched()) {
local_irq_enable();
- return;
+ return 0;
}
target_state = &dev->states[next_state];
@@ -114,6 +122,8 @@
/* give the governor an opportunity to reflect on the outcome */
if (cpuidle_curr_governor->reflect)
cpuidle_curr_governor->reflect(dev);
+
+ return 0;
}
/**
@@ -121,10 +131,10 @@
*/
void cpuidle_install_idle_handler(void)
{
- if (enabled_devices && (pm_idle != cpuidle_idle_call)) {
+ if (enabled_devices) {
/* Make sure all changes finished before we switch to new idle */
smp_wmb();
- pm_idle = cpuidle_idle_call;
+ initialized = 1;
}
}
@@ -133,8 +143,8 @@
*/
void cpuidle_uninstall_idle_handler(void)
{
- if (enabled_devices && pm_idle_old && (pm_idle != pm_idle_old)) {
- pm_idle = pm_idle_old;
+ if (enabled_devices) {
+ initialized = 0;
cpuidle_kick_cpus();
}
}
@@ -427,7 +437,8 @@
{
int ret;
- pm_idle_old = pm_idle;
+ if (cpuidle_disabled())
+ return -ENODEV;
ret = cpuidle_add_class_sysfs(&cpu_sysdev_class);
if (ret)
@@ -438,4 +449,5 @@
return 0;
}
+module_param(off, int, 0444);
core_initcall(cpuidle_init);
diff --git a/drivers/cpuidle/cpuidle.h b/drivers/cpuidle/cpuidle.h
index 33e50d5..38c3fd8 100644
--- a/drivers/cpuidle/cpuidle.h
+++ b/drivers/cpuidle/cpuidle.h
@@ -13,6 +13,7 @@
extern struct list_head cpuidle_detected_devices;
extern struct mutex cpuidle_lock;
extern spinlock_t cpuidle_driver_lock;
+extern int cpuidle_disabled(void);
/* idle loop */
extern void cpuidle_install_idle_handler(void);
diff --git a/drivers/cpuidle/driver.c b/drivers/cpuidle/driver.c
index fd1601e..3f7e3ce 100644
--- a/drivers/cpuidle/driver.c
+++ b/drivers/cpuidle/driver.c
@@ -26,6 +26,9 @@
if (!drv)
return -EINVAL;
+ if (cpuidle_disabled())
+ return -ENODEV;
+
spin_lock(&cpuidle_driver_lock);
if (cpuidle_curr_driver) {
spin_unlock(&cpuidle_driver_lock);
diff --git a/drivers/cpuidle/governor.c b/drivers/cpuidle/governor.c
index 724c164..ea2f8e7 100644
--- a/drivers/cpuidle/governor.c
+++ b/drivers/cpuidle/governor.c
@@ -81,6 +81,9 @@
if (!gov || !gov->select)
return -EINVAL;
+ if (cpuidle_disabled())
+ return -ENODEV;
+
mutex_lock(&cpuidle_lock);
if (__cpuidle_find_governor(gov->name) == NULL) {
ret = 0;
diff --git a/drivers/eisa/pci_eisa.c b/drivers/eisa/pci_eisa.c
index 30da70d..cdae207 100644
--- a/drivers/eisa/pci_eisa.c
+++ b/drivers/eisa/pci_eisa.c
@@ -45,13 +45,13 @@
return 0;
}
-static struct pci_device_id __initdata pci_eisa_pci_tbl[] = {
+static struct pci_device_id pci_eisa_pci_tbl[] = {
{ PCI_ANY_ID, PCI_ANY_ID, PCI_ANY_ID, PCI_ANY_ID,
PCI_CLASS_BRIDGE_EISA << 8, 0xffff00, 0 },
{ 0, }
};
-static struct pci_driver __initdata pci_eisa_driver = {
+static struct pci_driver __refdata pci_eisa_driver = {
.name = "pci_eisa",
.id_table = pci_eisa_pci_tbl,
.probe = pci_eisa_init,
diff --git a/drivers/firmware/efivars.c b/drivers/firmware/efivars.c
index eacb05e..eb80b54 100644
--- a/drivers/firmware/efivars.c
+++ b/drivers/firmware/efivars.c
@@ -157,7 +157,7 @@
return length;
}
-static unsigned long
+static inline unsigned long
utf16_strlen(efi_char16_t *s)
{
return utf16_strnlen(s, ~0UL);
@@ -580,8 +580,8 @@
return -1;
}
-static u64 efi_pstore_write(enum pstore_type_id type, int part, size_t size,
- struct pstore_info *psi)
+static u64 efi_pstore_write(enum pstore_type_id type, unsigned int part,
+ size_t size, struct pstore_info *psi)
{
return 0;
}
diff --git a/drivers/input/keyboard/gpio_keys.c b/drivers/input/keyboard/gpio_keys.c
index ce281d1..67df91a 100644
--- a/drivers/input/keyboard/gpio_keys.c
+++ b/drivers/input/keyboard/gpio_keys.c
@@ -483,7 +483,7 @@
buttons = kzalloc(pdata->nbuttons * (sizeof *buttons), GFP_KERNEL);
if (!buttons)
- return -ENODEV;
+ return -ENOMEM;
pp = NULL;
i = 0;
diff --git a/drivers/input/keyboard/lm8323.c b/drivers/input/keyboard/lm8323.c
index ab0acaf..756348a 100644
--- a/drivers/input/keyboard/lm8323.c
+++ b/drivers/input/keyboard/lm8323.c
@@ -754,8 +754,11 @@
device_remove_file(&client->dev, &dev_attr_disable_kp);
fail2:
while (--pwm >= 0)
- if (lm->pwm[pwm].enabled)
+ if (lm->pwm[pwm].enabled) {
+ device_remove_file(lm->pwm[pwm].cdev.dev,
+ &dev_attr_time);
led_classdev_unregister(&lm->pwm[pwm].cdev);
+ }
fail1:
input_free_device(idev);
kfree(lm);
@@ -775,8 +778,10 @@
device_remove_file(&lm->client->dev, &dev_attr_disable_kp);
for (i = 0; i < 3; i++)
- if (lm->pwm[i].enabled)
+ if (lm->pwm[i].enabled) {
+ device_remove_file(lm->pwm[i].cdev.dev, &dev_attr_time);
led_classdev_unregister(&lm->pwm[i].cdev);
+ }
kfree(lm);
diff --git a/drivers/input/keyboard/tegra-kbc.c b/drivers/input/keyboard/tegra-kbc.c
index da3828f..f270447 100644
--- a/drivers/input/keyboard/tegra-kbc.c
+++ b/drivers/input/keyboard/tegra-kbc.c
@@ -19,6 +19,7 @@
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/
+#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/input.h>
#include <linux/platform_device.h>
@@ -37,7 +38,7 @@
#define KBC_ROW_SCAN_DLY 5
/* KBC uses a 32KHz clock so a cycle = 1/32Khz */
-#define KBC_CYCLE_USEC 32
+#define KBC_CYCLE_MS 32
/* KBC Registers */
@@ -647,7 +648,7 @@
debounce_cnt = min(pdata->debounce_cnt, KBC_MAX_DEBOUNCE_CNT);
scan_time_rows = (KBC_ROW_SCAN_TIME + debounce_cnt) * num_rows;
kbc->repoll_dly = KBC_ROW_SCAN_DLY + scan_time_rows + pdata->repeat_cnt;
- kbc->repoll_dly = ((kbc->repoll_dly * KBC_CYCLE_USEC) + 999) / 1000;
+ kbc->repoll_dly = DIV_ROUND_UP(kbc->repoll_dly, KBC_CYCLE_MS);
input_dev->name = pdev->name;
input_dev->id.bustype = BUS_HOST;
diff --git a/drivers/input/misc/kxtj9.c b/drivers/input/misc/kxtj9.c
index c456f63..783597a 100644
--- a/drivers/input/misc/kxtj9.c
+++ b/drivers/input/misc/kxtj9.c
@@ -21,6 +21,7 @@
#include <linux/i2c.h>
#include <linux/input.h>
#include <linux/interrupt.h>
+#include <linux/module.h>
#include <linux/slab.h>
#include <linux/input/kxtj9.h>
#include <linux/input-polldev.h>
diff --git a/drivers/input/misc/mma8450.c b/drivers/input/misc/mma8450.c
index 20f8f92..6c76cf7 100644
--- a/drivers/input/misc/mma8450.c
+++ b/drivers/input/misc/mma8450.c
@@ -24,6 +24,7 @@
#include <linux/delay.h>
#include <linux/i2c.h>
#include <linux/input-polldev.h>
+#include <linux/of_device.h>
#define MMA8450_DRV_NAME "mma8450"
@@ -229,10 +230,17 @@
};
MODULE_DEVICE_TABLE(i2c, mma8450_id);
+static const struct of_device_id mma8450_dt_ids[] = {
+ { .compatible = "fsl,mma8450", },
+ { /* sentinel */ }
+};
+MODULE_DEVICE_TABLE(i2c, mma8450_dt_ids);
+
static struct i2c_driver mma8450_driver = {
.driver = {
.name = MMA8450_DRV_NAME,
.owner = THIS_MODULE,
+ .of_match_table = mma8450_dt_ids,
},
.probe = mma8450_probe,
.remove = __devexit_p(mma8450_remove),
diff --git a/drivers/input/mouse/hgpk.c b/drivers/input/mouse/hgpk.c
index 95577c1..4d17d9f 100644
--- a/drivers/input/mouse/hgpk.c
+++ b/drivers/input/mouse/hgpk.c
@@ -32,6 +32,7 @@
#define DEBUG
#include <linux/slab.h>
#include <linux/input.h>
+#include <linux/module.h>
#include <linux/serio.h>
#include <linux/libps2.h>
#include <linux/delay.h>
diff --git a/drivers/input/touchscreen/ad7879.c b/drivers/input/touchscreen/ad7879.c
index bc3b518..131f9d1c 100644
--- a/drivers/input/touchscreen/ad7879.c
+++ b/drivers/input/touchscreen/ad7879.c
@@ -249,12 +249,14 @@
static void __ad7879_disable(struct ad7879 *ts)
{
+ u16 reg = (ts->cmd_crtl2 & ~AD7879_PM(-1)) |
+ AD7879_PM(AD7879_PM_SHUTDOWN);
disable_irq(ts->irq);
if (del_timer_sync(&ts->timer))
ad7879_ts_event_release(ts);
- ad7879_write(ts, AD7879_REG_CTRL2, AD7879_PM(AD7879_PM_SHUTDOWN));
+ ad7879_write(ts, AD7879_REG_CTRL2, reg);
}
diff --git a/drivers/of/base.c b/drivers/of/base.c
index 3ff22e3..fb28b5a 100644
--- a/drivers/of/base.c
+++ b/drivers/of/base.c
@@ -17,14 +17,39 @@
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
+#include <linux/ctype.h>
#include <linux/module.h>
#include <linux/of.h>
#include <linux/spinlock.h>
#include <linux/slab.h>
#include <linux/proc_fs.h>
+/**
+ * struct alias_prop - Alias property in 'aliases' node
+ * @link: List node to link the structure in aliases_lookup list
+ * @alias: Alias property name
+ * @np: Pointer to device_node that the alias stands for
+ * @id: Index value from end of alias name
+ * @stem: Alias string without the index
+ *
+ * The structure represents one alias property of 'aliases' node as
+ * an entry in aliases_lookup list.
+ */
+struct alias_prop {
+ struct list_head link;
+ const char *alias;
+ struct device_node *np;
+ int id;
+ char stem[0];
+};
+
+static LIST_HEAD(aliases_lookup);
+
struct device_node *allnodes;
struct device_node *of_chosen;
+struct device_node *of_aliases;
+
+static DEFINE_MUTEX(of_aliases_mutex);
/* use when traversing tree through the allnext, child, sibling,
* or parent members of struct device_node.
@@ -988,3 +1013,108 @@
}
#endif /* defined(CONFIG_OF_DYNAMIC) */
+static void of_alias_add(struct alias_prop *ap, struct device_node *np,
+ int id, const char *stem, int stem_len)
+{
+ ap->id = id;
+ ap->np = np;
+ strncpy(ap->stem, stem, stem_len);
+ ap->stem[stem_len] = 0;
+ list_add_tail(&ap->link, &aliases_lookup);
+ pr_debug("adding DT alias:%s: stem=%s id=%i node=%s\n",
+ ap->alias, ap->stem, ap->id, np ? np->full_name : NULL);
+}
+
+/**
+ * of_alias_scan() - Scan all properties of 'aliases' node
+ *
+ * The function scans all the properties of 'aliases' node and populate
+ * the global lookup table with the properties. It returns the
+ * number of alias_prop found, or error code in error case.
+ */
+__init void of_alias_scan(void)
+{
+ struct property *pp;
+
+ if (!of_aliases)
+ return;
+
+ for_each_property(pp, of_aliases->properties) {
+ const char *start = pp->name;
+ const char *end = start + strlen(start);
+ struct device_node *np;
+ struct alias_prop *ap;
+ int id, len;
+
+ /* Skip those we do not want to proceed */
+ if (!strcmp(pp->name, "name") ||
+ !strcmp(pp->name, "phandle") ||
+ !strcmp(pp->name, "linux,phandle"))
+ continue;
+
+ np = of_find_node_by_path(pp->value);
+ if (!np)
+ continue;
+
+ /* walk alias backwards to extract the id and 'stem' string */
+ while (isdigit(*(end-1)) && end > start)
+ end--;
+ len = end - start;
+ id = strlen(end) ? simple_strtoul(end, NULL, 10) : -1;
+
+ /* Allocate an alias_prop with enough space for the stem */
+ ap = early_init_dt_alloc_memory_arch(sizeof(*ap) + len + 1, 4);
+ if (!ap)
+ continue;
+ ap->alias = start;
+ of_alias_add(ap, np, id, start, len);
+ }
+}
+
+/**
+ * of_alias_get_id() - Get alias id for the given device_node
+ * @np: Pointer to the given device_node
+ * @stem: Alias stem of the given device_node
+ *
+ * The function travels the lookup table to get alias id for the given
+ * device_node and alias stem. It returns the alias id if find it.
+ * If not, dynamically creates one in the lookup table and returns it,
+ * or returns error code if fail to create.
+ */
+int of_alias_get_id(struct device_node *np, const char *stem)
+{
+ struct alias_prop *app;
+ int id = 0;
+ bool found = false;
+
+ mutex_lock(&of_aliases_mutex);
+ list_for_each_entry(app, &aliases_lookup, link) {
+ if (strcmp(app->stem, stem) != 0)
+ continue;
+
+ if (np == app->np) {
+ found = true;
+ id = app->id;
+ break;
+ }
+
+ if (id <= app->id)
+ id = app->id + 1;
+ }
+
+ /* If an id is not found, then allocate a new one */
+ if (!found) {
+ app = kzalloc(sizeof(*app) + strlen(stem) + 1, 4);
+ if (!app) {
+ id = -ENODEV;
+ goto out;
+ }
+ of_alias_add(app, np, id, stem, strlen(stem));
+ }
+
+ out:
+ mutex_unlock(&of_aliases_mutex);
+
+ return id;
+}
+EXPORT_SYMBOL_GPL(of_alias_get_id);
diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c
index 65200af..13d6d3a 100644
--- a/drivers/of/fdt.c
+++ b/drivers/of/fdt.c
@@ -707,10 +707,12 @@
__unflatten_device_tree(initial_boot_params, &allnodes,
early_init_dt_alloc_memory_arch);
- /* Get pointer to OF "/chosen" node for use everywhere */
+ /* Get pointer to "/chosen" and "/aliasas" nodes for use everywhere */
of_chosen = of_find_node_by_path("/chosen");
if (of_chosen == NULL)
of_chosen = of_find_node_by_path("/chosen@0");
+ of_aliases = of_find_node_by_path("/aliases");
+ of_alias_scan();
}
#endif /* CONFIG_OF_EARLY_FLATTREE */
diff --git a/drivers/pci/hotplug/acpiphp_glue.c b/drivers/pci/hotplug/acpiphp_glue.c
index a70fa89..2202857 100644
--- a/drivers/pci/hotplug/acpiphp_glue.c
+++ b/drivers/pci/hotplug/acpiphp_glue.c
@@ -110,7 +110,7 @@
}
-static struct acpi_dock_ops acpiphp_dock_ops = {
+static const struct acpi_dock_ops acpiphp_dock_ops = {
.handler = handle_hotplug_event_func,
};
diff --git a/drivers/rtc/rtc-omap.c b/drivers/rtc/rtc-omap.c
index bcae8dd..7789002 100644
--- a/drivers/rtc/rtc-omap.c
+++ b/drivers/rtc/rtc-omap.c
@@ -368,7 +368,7 @@
pr_info("%s: already running\n", pdev->name);
/* force to 24 hour mode */
- new_ctrl = reg & ~(OMAP_RTC_CTRL_SPLIT|OMAP_RTC_CTRL_AUTO_COMP);
+ new_ctrl = reg & (OMAP_RTC_CTRL_SPLIT|OMAP_RTC_CTRL_AUTO_COMP);
new_ctrl |= OMAP_RTC_CTRL_STOP;
/* BOARD-SPECIFIC CUSTOMIZATION CAN GO HERE:
diff --git a/drivers/target/iscsi/Kconfig b/drivers/target/iscsi/Kconfig
index 564ff4e..8345fb4 100644
--- a/drivers/target/iscsi/Kconfig
+++ b/drivers/target/iscsi/Kconfig
@@ -1,5 +1,6 @@
config ISCSI_TARGET
tristate "Linux-iSCSI.org iSCSI Target Mode Stack"
+ depends on NET
select CRYPTO
select CRYPTO_CRC32C
select CRYPTO_CRC32C_INTEL if X86
diff --git a/drivers/target/iscsi/iscsi_target.c b/drivers/target/iscsi/iscsi_target.c
index 14c81c4..c24fb10 100644
--- a/drivers/target/iscsi/iscsi_target.c
+++ b/drivers/target/iscsi/iscsi_target.c
@@ -120,7 +120,7 @@
struct iscsi_tiqn *tiqn = NULL;
int ret;
- if (strlen(buf) > ISCSI_IQN_LEN) {
+ if (strlen(buf) >= ISCSI_IQN_LEN) {
pr_err("Target IQN exceeds %d bytes\n",
ISCSI_IQN_LEN);
return ERR_PTR(-EINVAL);
@@ -1857,7 +1857,7 @@
char *text_ptr, *text_in;
int cmdsn_ret, niov = 0, rx_got, rx_size;
u32 checksum = 0, data_crc = 0, payload_length;
- u32 padding = 0, text_length = 0;
+ u32 padding = 0, pad_bytes = 0, text_length = 0;
struct iscsi_cmd *cmd;
struct kvec iov[3];
struct iscsi_text *hdr;
@@ -1896,7 +1896,7 @@
padding = ((-payload_length) & 3);
if (padding != 0) {
- iov[niov].iov_base = cmd->pad_bytes;
+ iov[niov].iov_base = &pad_bytes;
iov[niov++].iov_len = padding;
rx_size += padding;
pr_debug("Receiving %u additional bytes"
@@ -1917,7 +1917,7 @@
if (conn->conn_ops->DataDigest) {
iscsit_do_crypto_hash_buf(&conn->conn_rx_hash,
text_in, text_length,
- padding, cmd->pad_bytes,
+ padding, (u8 *)&pad_bytes,
(u8 *)&data_crc);
if (checksum != data_crc) {
@@ -3468,7 +3468,12 @@
}
#else
-#define iscsit_thread_get_cpumask(X) ({})
+
+void iscsit_thread_get_cpumask(struct iscsi_conn *conn)
+{
+ return;
+}
+
#define iscsit_thread_check_cpumask(X, Y, Z) ({})
#endif /* CONFIG_SMP */
diff --git a/drivers/target/iscsi/iscsi_target_configfs.c b/drivers/target/iscsi/iscsi_target_configfs.c
index 32bb92c..f095e65 100644
--- a/drivers/target/iscsi/iscsi_target_configfs.c
+++ b/drivers/target/iscsi/iscsi_target_configfs.c
@@ -181,7 +181,7 @@
return ERR_PTR(-EOVERFLOW);
}
memset(buf, 0, MAX_PORTAL_LEN + 1);
- snprintf(buf, MAX_PORTAL_LEN, "%s", name);
+ snprintf(buf, MAX_PORTAL_LEN + 1, "%s", name);
memset(&sockaddr, 0, sizeof(struct __kernel_sockaddr_storage));
diff --git a/drivers/target/iscsi/iscsi_target_nego.c b/drivers/target/iscsi/iscsi_target_nego.c
index 713a4d2..4d087ac 100644
--- a/drivers/target/iscsi/iscsi_target_nego.c
+++ b/drivers/target/iscsi/iscsi_target_nego.c
@@ -978,7 +978,7 @@
pr_err("Unable to allocate memory for struct iscsi_login.\n");
iscsit_tx_login_rsp(conn, ISCSI_STATUS_CLS_TARGET_ERR,
ISCSI_LOGIN_STATUS_NO_RESOURCES);
- goto out;
+ return NULL;
}
login->req = kzalloc(ISCSI_HDR_LEN, GFP_KERNEL);
diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c
index c75a01a..8976032 100644
--- a/drivers/target/target_core_transport.c
+++ b/drivers/target/target_core_transport.c
@@ -1747,6 +1747,8 @@
}
EXPORT_SYMBOL(transport_generic_handle_cdb);
+static void transport_generic_request_failure(struct se_cmd *,
+ struct se_device *, int, int);
/*
* Used by fabric module frontends to queue tasks directly.
* Many only be used from process context only
@@ -1754,6 +1756,8 @@
int transport_handle_cdb_direct(
struct se_cmd *cmd)
{
+ int ret;
+
if (!cmd->se_lun) {
dump_stack();
pr_err("cmd->se_lun is NULL\n");
@@ -1765,8 +1769,31 @@
" from interrupt context\n");
return -EINVAL;
}
-
- return transport_generic_new_cmd(cmd);
+ /*
+ * Set TRANSPORT_NEW_CMD state and cmd->t_transport_active=1 following
+ * transport_generic_handle_cdb*() -> transport_add_cmd_to_queue()
+ * in existing usage to ensure that outstanding descriptors are handled
+ * correctly during shutdown via transport_generic_wait_for_tasks()
+ *
+ * Also, we don't take cmd->t_state_lock here as we only expect
+ * this to be called for initial descriptor submission.
+ */
+ cmd->t_state = TRANSPORT_NEW_CMD;
+ atomic_set(&cmd->t_transport_active, 1);
+ /*
+ * transport_generic_new_cmd() is already handling QUEUE_FULL,
+ * so follow TRANSPORT_NEW_CMD processing thread context usage
+ * and call transport_generic_request_failure() if necessary..
+ */
+ ret = transport_generic_new_cmd(cmd);
+ if (ret == -EAGAIN)
+ return 0;
+ else if (ret < 0) {
+ cmd->transport_error_status = ret;
+ transport_generic_request_failure(cmd, NULL, 0,
+ (cmd->data_direction != DMA_TO_DEVICE));
+ }
+ return 0;
}
EXPORT_SYMBOL(transport_handle_cdb_direct);
@@ -3324,7 +3351,7 @@
goto out_invalid_cdb_field;
}
- cmd->t_task_lba = get_unaligned_be16(&cdb[2]);
+ cmd->t_task_lba = get_unaligned_be64(&cdb[2]);
passthrough = (dev->transport->transport_type ==
TRANSPORT_PLUGIN_PHBA_PDEV);
/*
diff --git a/drivers/target/tcm_fc/tcm_fc.h b/drivers/target/tcm_fc/tcm_fc.h
index f7fff7e..bd4fe21 100644
--- a/drivers/target/tcm_fc/tcm_fc.h
+++ b/drivers/target/tcm_fc/tcm_fc.h
@@ -187,4 +187,9 @@
ssize_t ft_format_wwn(char *, size_t, u64);
+/*
+ * Underlying HW specific helper function
+ */
+void ft_invl_hw_context(struct ft_cmd *);
+
#endif /* __TCM_FC_H__ */
diff --git a/drivers/target/tcm_fc/tfc_cmd.c b/drivers/target/tcm_fc/tfc_cmd.c
index 09df38b..5654dc2 100644
--- a/drivers/target/tcm_fc/tfc_cmd.c
+++ b/drivers/target/tcm_fc/tfc_cmd.c
@@ -320,6 +320,7 @@
default:
pr_debug("%s: unhandled frame r_ctl %x\n",
__func__, fh->fh_r_ctl);
+ ft_invl_hw_context(cmd);
fc_frame_free(fp);
transport_generic_free_cmd(&cmd->se_cmd, 0, 0);
break;
diff --git a/drivers/target/tcm_fc/tfc_io.c b/drivers/target/tcm_fc/tfc_io.c
index 8e2a46d..c37f4cd 100644
--- a/drivers/target/tcm_fc/tfc_io.c
+++ b/drivers/target/tcm_fc/tfc_io.c
@@ -213,62 +213,49 @@
if (!(ntoh24(fh->fh_f_ctl) & FC_FC_REL_OFF))
goto drop;
- /*
- * Doesn't expect even single byte of payload. Payload
- * is expected to be copied directly to user buffers
- * due to DDP (Large Rx offload) feature, hence
- * BUG_ON if BUF is non-NULL
- */
- buf = fc_frame_payload_get(fp, 1);
- if (cmd->was_ddp_setup && buf) {
- pr_debug("%s: When DDP was setup, not expected to"
- "receive frame with payload, Payload shall be"
- "copied directly to buffer instead of coming "
- "via. legacy receive queues\n", __func__);
- BUG_ON(buf);
+ f_ctl = ntoh24(fh->fh_f_ctl);
+ ep = fc_seq_exch(seq);
+ lport = ep->lp;
+ if (cmd->was_ddp_setup) {
+ BUG_ON(!ep);
+ BUG_ON(!lport);
}
/*
- * If ft_cmd indicated 'ddp_setup', in that case only the last frame
- * should come with 'TSI bit being set'. If 'TSI bit is not set and if
- * data frame appears here, means error condition. In both the cases
- * release the DDP context (ddp_put) and in error case, as well
- * initiate error recovery mechanism.
+ * Doesn't expect payload if DDP is setup. Payload
+ * is expected to be copied directly to user buffers
+ * due to DDP (Large Rx offload),
*/
- ep = fc_seq_exch(seq);
- if (cmd->was_ddp_setup) {
- BUG_ON(!ep);
- lport = ep->lp;
- BUG_ON(!lport);
- }
- if (cmd->was_ddp_setup && ep->xid != FC_XID_UNKNOWN) {
- f_ctl = ntoh24(fh->fh_f_ctl);
- /*
- * If TSI bit set in f_ctl, means last write data frame is
- * received successfully where payload is posted directly
- * to user buffer and only the last frame's header is posted
- * in legacy receive queue
- */
- if (f_ctl & FC_FC_SEQ_INIT) { /* TSI bit set in FC frame */
- cmd->write_data_len = lport->tt.ddp_done(lport,
- ep->xid);
- goto last_frame;
- } else {
- /*
- * Updating the write_data_len may be meaningless at
- * this point, but just in case if required in future
- * for debugging or any other purpose
- */
- pr_err("%s: Received frame with TSI bit not"
- " being SET, dropping the frame, "
- "cmd->sg <%p>, cmd->sg_cnt <0x%x>\n",
- __func__, cmd->sg, cmd->sg_cnt);
- cmd->write_data_len = lport->tt.ddp_done(lport,
- ep->xid);
- lport->tt.seq_exch_abort(cmd->seq, 0);
- goto drop;
- }
- }
+ buf = fc_frame_payload_get(fp, 1);
+ if (buf)
+ pr_err("%s: xid 0x%x, f_ctl 0x%x, cmd->sg %p, "
+ "cmd->sg_cnt 0x%x. DDP was setup"
+ " hence not expected to receive frame with "
+ "payload, Frame will be dropped if "
+ "'Sequence Initiative' bit in f_ctl is "
+ "not set\n", __func__, ep->xid, f_ctl,
+ cmd->sg, cmd->sg_cnt);
+ /*
+ * Invalidate HW DDP context if it was setup for respective
+ * command. Invalidation of HW DDP context is requited in both
+ * situation (success and error).
+ */
+ ft_invl_hw_context(cmd);
+
+ /*
+ * If "Sequence Initiative (TSI)" bit set in f_ctl, means last
+ * write data frame is received successfully where payload is
+ * posted directly to user buffer and only the last frame's
+ * header is posted in receive queue.
+ *
+ * If "Sequence Initiative (TSI)" bit is not set, means error
+ * condition w.r.t. DDP, hence drop the packet and let explict
+ * ABORTS from other end of exchange timer trigger the recovery.
+ */
+ if (f_ctl & FC_FC_SEQ_INIT)
+ goto last_frame;
+ else
+ goto drop;
rel_off = ntohl(fh->fh_parm_offset);
frame_len = fr_len(fp);
@@ -331,3 +318,39 @@
drop:
fc_frame_free(fp);
}
+
+/*
+ * Handle and cleanup any HW specific resources if
+ * received ABORTS, errors, timeouts.
+ */
+void ft_invl_hw_context(struct ft_cmd *cmd)
+{
+ struct fc_seq *seq = cmd->seq;
+ struct fc_exch *ep = NULL;
+ struct fc_lport *lport = NULL;
+
+ BUG_ON(!cmd);
+
+ /* Cleanup the DDP context in HW if DDP was setup */
+ if (cmd->was_ddp_setup && seq) {
+ ep = fc_seq_exch(seq);
+ if (ep) {
+ lport = ep->lp;
+ if (lport && (ep->xid <= lport->lro_xid))
+ /*
+ * "ddp_done" trigger invalidation of HW
+ * specific DDP context
+ */
+ cmd->write_data_len = lport->tt.ddp_done(lport,
+ ep->xid);
+
+ /*
+ * Resetting same variable to indicate HW's
+ * DDP context has been invalidated to avoid
+ * re_invalidation of same context (context is
+ * identified using ep->xid)
+ */
+ cmd->was_ddp_setup = 0;
+ }
+ }
+}
diff --git a/drivers/thermal/Kconfig b/drivers/thermal/Kconfig
index bf7c687..f7f71b2 100644
--- a/drivers/thermal/Kconfig
+++ b/drivers/thermal/Kconfig
@@ -14,11 +14,7 @@
If you want this support, you should say Y or M here.
config THERMAL_HWMON
- bool "Hardware monitoring support"
+ bool
depends on THERMAL
depends on HWMON=y || HWMON=THERMAL
- help
- The generic thermal sysfs driver's hardware monitoring support
- requires a 2.10.7/3.0.2 or later lm-sensors userspace.
-
- Say Y if your user-space is new enough.
+ default y
diff --git a/drivers/thermal/thermal_sys.c b/drivers/thermal/thermal_sys.c
index 0b1c82a..708f8e9 100644
--- a/drivers/thermal/thermal_sys.c
+++ b/drivers/thermal/thermal_sys.c
@@ -420,6 +420,29 @@
/* hwmon sys I/F */
#include <linux/hwmon.h>
+
+/* thermal zone devices with the same type share one hwmon device */
+struct thermal_hwmon_device {
+ char type[THERMAL_NAME_LENGTH];
+ struct device *device;
+ int count;
+ struct list_head tz_list;
+ struct list_head node;
+};
+
+struct thermal_hwmon_attr {
+ struct device_attribute attr;
+ char name[16];
+};
+
+/* one temperature input for each thermal zone */
+struct thermal_hwmon_temp {
+ struct list_head hwmon_node;
+ struct thermal_zone_device *tz;
+ struct thermal_hwmon_attr temp_input; /* hwmon sys attr */
+ struct thermal_hwmon_attr temp_crit; /* hwmon sys attr */
+};
+
static LIST_HEAD(thermal_hwmon_list);
static ssize_t
@@ -437,9 +460,10 @@
int ret;
struct thermal_hwmon_attr *hwmon_attr
= container_of(attr, struct thermal_hwmon_attr, attr);
- struct thermal_zone_device *tz
- = container_of(hwmon_attr, struct thermal_zone_device,
+ struct thermal_hwmon_temp *temp
+ = container_of(hwmon_attr, struct thermal_hwmon_temp,
temp_input);
+ struct thermal_zone_device *tz = temp->tz;
ret = tz->ops->get_temp(tz, &temperature);
@@ -455,9 +479,10 @@
{
struct thermal_hwmon_attr *hwmon_attr
= container_of(attr, struct thermal_hwmon_attr, attr);
- struct thermal_zone_device *tz
- = container_of(hwmon_attr, struct thermal_zone_device,
+ struct thermal_hwmon_temp *temp
+ = container_of(hwmon_attr, struct thermal_hwmon_temp,
temp_crit);
+ struct thermal_zone_device *tz = temp->tz;
long temperature;
int ret;
@@ -469,22 +494,54 @@
}
-static int
-thermal_add_hwmon_sysfs(struct thermal_zone_device *tz)
+static struct thermal_hwmon_device *
+thermal_hwmon_lookup_by_type(const struct thermal_zone_device *tz)
{
struct thermal_hwmon_device *hwmon;
- int new_hwmon_device = 1;
- int result;
mutex_lock(&thermal_list_lock);
list_for_each_entry(hwmon, &thermal_hwmon_list, node)
if (!strcmp(hwmon->type, tz->type)) {
- new_hwmon_device = 0;
mutex_unlock(&thermal_list_lock);
- goto register_sys_interface;
+ return hwmon;
}
mutex_unlock(&thermal_list_lock);
+ return NULL;
+}
+
+/* Find the temperature input matching a given thermal zone */
+static struct thermal_hwmon_temp *
+thermal_hwmon_lookup_temp(const struct thermal_hwmon_device *hwmon,
+ const struct thermal_zone_device *tz)
+{
+ struct thermal_hwmon_temp *temp;
+
+ mutex_lock(&thermal_list_lock);
+ list_for_each_entry(temp, &hwmon->tz_list, hwmon_node)
+ if (temp->tz == tz) {
+ mutex_unlock(&thermal_list_lock);
+ return temp;
+ }
+ mutex_unlock(&thermal_list_lock);
+
+ return NULL;
+}
+
+static int
+thermal_add_hwmon_sysfs(struct thermal_zone_device *tz)
+{
+ struct thermal_hwmon_device *hwmon;
+ struct thermal_hwmon_temp *temp;
+ int new_hwmon_device = 1;
+ int result;
+
+ hwmon = thermal_hwmon_lookup_by_type(tz);
+ if (hwmon) {
+ new_hwmon_device = 0;
+ goto register_sys_interface;
+ }
+
hwmon = kzalloc(sizeof(struct thermal_hwmon_device), GFP_KERNEL);
if (!hwmon)
return -ENOMEM;
@@ -502,30 +559,36 @@
goto free_mem;
register_sys_interface:
- tz->hwmon = hwmon;
+ temp = kzalloc(sizeof(struct thermal_hwmon_temp), GFP_KERNEL);
+ if (!temp) {
+ result = -ENOMEM;
+ goto unregister_name;
+ }
+
+ temp->tz = tz;
hwmon->count++;
- snprintf(tz->temp_input.name, THERMAL_NAME_LENGTH,
+ snprintf(temp->temp_input.name, THERMAL_NAME_LENGTH,
"temp%d_input", hwmon->count);
- tz->temp_input.attr.attr.name = tz->temp_input.name;
- tz->temp_input.attr.attr.mode = 0444;
- tz->temp_input.attr.show = temp_input_show;
- sysfs_attr_init(&tz->temp_input.attr.attr);
- result = device_create_file(hwmon->device, &tz->temp_input.attr);
+ temp->temp_input.attr.attr.name = temp->temp_input.name;
+ temp->temp_input.attr.attr.mode = 0444;
+ temp->temp_input.attr.show = temp_input_show;
+ sysfs_attr_init(&temp->temp_input.attr.attr);
+ result = device_create_file(hwmon->device, &temp->temp_input.attr);
if (result)
- goto unregister_name;
+ goto free_temp_mem;
if (tz->ops->get_crit_temp) {
unsigned long temperature;
if (!tz->ops->get_crit_temp(tz, &temperature)) {
- snprintf(tz->temp_crit.name, THERMAL_NAME_LENGTH,
+ snprintf(temp->temp_crit.name, THERMAL_NAME_LENGTH,
"temp%d_crit", hwmon->count);
- tz->temp_crit.attr.attr.name = tz->temp_crit.name;
- tz->temp_crit.attr.attr.mode = 0444;
- tz->temp_crit.attr.show = temp_crit_show;
- sysfs_attr_init(&tz->temp_crit.attr.attr);
+ temp->temp_crit.attr.attr.name = temp->temp_crit.name;
+ temp->temp_crit.attr.attr.mode = 0444;
+ temp->temp_crit.attr.show = temp_crit_show;
+ sysfs_attr_init(&temp->temp_crit.attr.attr);
result = device_create_file(hwmon->device,
- &tz->temp_crit.attr);
+ &temp->temp_crit.attr);
if (result)
goto unregister_input;
}
@@ -534,13 +597,15 @@
mutex_lock(&thermal_list_lock);
if (new_hwmon_device)
list_add_tail(&hwmon->node, &thermal_hwmon_list);
- list_add_tail(&tz->hwmon_node, &hwmon->tz_list);
+ list_add_tail(&temp->hwmon_node, &hwmon->tz_list);
mutex_unlock(&thermal_list_lock);
return 0;
unregister_input:
- device_remove_file(hwmon->device, &tz->temp_input.attr);
+ device_remove_file(hwmon->device, &temp->temp_input.attr);
+ free_temp_mem:
+ kfree(temp);
unregister_name:
if (new_hwmon_device) {
device_remove_file(hwmon->device, &dev_attr_name);
@@ -556,15 +621,30 @@
static void
thermal_remove_hwmon_sysfs(struct thermal_zone_device *tz)
{
- struct thermal_hwmon_device *hwmon = tz->hwmon;
+ struct thermal_hwmon_device *hwmon;
+ struct thermal_hwmon_temp *temp;
- tz->hwmon = NULL;
- device_remove_file(hwmon->device, &tz->temp_input.attr);
+ hwmon = thermal_hwmon_lookup_by_type(tz);
+ if (unlikely(!hwmon)) {
+ /* Should never happen... */
+ dev_dbg(&tz->device, "hwmon device lookup failed!\n");
+ return;
+ }
+
+ temp = thermal_hwmon_lookup_temp(hwmon, tz);
+ if (unlikely(!temp)) {
+ /* Should never happen... */
+ dev_dbg(&tz->device, "temperature input lookup failed!\n");
+ return;
+ }
+
+ device_remove_file(hwmon->device, &temp->temp_input.attr);
if (tz->ops->get_crit_temp)
- device_remove_file(hwmon->device, &tz->temp_crit.attr);
+ device_remove_file(hwmon->device, &temp->temp_crit.attr);
mutex_lock(&thermal_list_lock);
- list_del(&tz->hwmon_node);
+ list_del(&temp->hwmon_node);
+ kfree(temp);
if (!list_empty(&hwmon->tz_list)) {
mutex_unlock(&thermal_list_lock);
return;
diff --git a/drivers/video/backlight/Kconfig b/drivers/video/backlight/Kconfig
index 69407e7..278aeaa 100644
--- a/drivers/video/backlight/Kconfig
+++ b/drivers/video/backlight/Kconfig
@@ -336,7 +336,7 @@
enable its driver.
config BACKLIGHT_AAT2870
- bool "AnalogicTech AAT2870 Backlight"
+ tristate "AnalogicTech AAT2870 Backlight"
depends on BACKLIGHT_CLASS_DEVICE && MFD_AAT2870_CORE
help
If you have a AnalogicTech AAT2870 say Y to enable the
diff --git a/drivers/video/backlight/aat2870_bl.c b/drivers/video/backlight/aat2870_bl.c
index 4952a61..331f1ef 100644
--- a/drivers/video/backlight/aat2870_bl.c
+++ b/drivers/video/backlight/aat2870_bl.c
@@ -44,7 +44,7 @@
struct backlight_device *bd = aat2870_bl->bd;
int val;
- val = brightness * aat2870_bl->max_current;
+ val = brightness * (aat2870_bl->max_current - 1);
val /= bd->props.max_brightness;
return val;
@@ -158,10 +158,10 @@
props.type = BACKLIGHT_RAW;
bd = backlight_device_register("aat2870-backlight", &pdev->dev,
aat2870_bl, &aat2870_bl_ops, &props);
- if (!bd) {
+ if (IS_ERR(bd)) {
dev_err(&pdev->dev,
"Failed allocate memory for backlight device\n");
- ret = -ENOMEM;
+ ret = PTR_ERR(bd);
goto out_kfree;
}
@@ -175,7 +175,7 @@
else
aat2870_bl->channels = AAT2870_BL_CH_ALL;
- if (pdata->max_brightness > 0)
+ if (pdata->max_current > 0)
aat2870_bl->max_current = pdata->max_current;
else
aat2870_bl->max_current = AAT2870_CURRENT_27_9;
diff --git a/fs/Kconfig b/fs/Kconfig
index 19891aa..9fe0b34 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -127,14 +127,21 @@
select TMPFS_XATTR
select GENERIC_ACL
help
- POSIX Access Control Lists (ACLs) support permissions for users and
- groups beyond the owner/group/world scheme.
+ POSIX Access Control Lists (ACLs) support additional access rights
+ for users and groups beyond the standard owner/group/world scheme,
+ and this option selects support for ACLs specifically for tmpfs
+ filesystems.
+
+ If you've selected TMPFS, it's possible that you'll also need
+ this option as there are a number of Linux distros that require
+ POSIX ACL support under /dev for certain features to work properly.
+ For example, some distros need this feature for ALSA-related /dev
+ files for sound to work properly. In short, if you're not sure,
+ say Y.
To learn more about Access Control Lists, visit the POSIX ACLs for
Linux website <http://acl.bestbits.at/>.
- If you don't know what Access Control Lists are, say N.
-
config TMPFS_XATTR
bool "Tmpfs extended attributes"
depends on TMPFS
diff --git a/fs/dcache.c b/fs/dcache.c
index 2347cdb..c83cae1 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -795,6 +795,7 @@
/**
* prune_dcache_sb - shrink the dcache
+ * @sb: superblock
* @nr_to_scan: number of entries to try to free
*
* Attempt to shrink the superblock dcache LRU by @nr_to_scan entries. This is
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index e2d88ba..4687fea 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -124,7 +124,7 @@
{
void *ret;
- ret = kmalloc(size, flags);
+ ret = kzalloc(size, flags);
if (!ret)
ret = __vmalloc(size, flags | __GFP_ZERO, PAGE_KERNEL);
return ret;
diff --git a/fs/stack.c b/fs/stack.c
index 4a6f7f4..b4f2ab48 100644
--- a/fs/stack.c
+++ b/fs/stack.c
@@ -29,10 +29,7 @@
*
* We don't actually know what locking is used at the lower level;
* but if it's a filesystem that supports quotas, it will be using
- * i_lock as in inode_add_bytes(). tmpfs uses other locking, and
- * its 32-bit is (just) able to exceed 2TB i_size with the aid of
- * holes; but its i_blocks cannot carry into the upper long without
- * almost 2TB swap - let's ignore that case.
+ * i_lock as in inode_add_bytes().
*/
if (sizeof(i_blocks) > sizeof(long))
spin_lock(&src->i_lock);
diff --git a/include/acpi/acpi_drivers.h b/include/acpi/acpi_drivers.h
index 3090471..e49c36d 100644
--- a/include/acpi/acpi_drivers.h
+++ b/include/acpi/acpi_drivers.h
@@ -128,7 +128,7 @@
extern int register_dock_notifier(struct notifier_block *nb);
extern void unregister_dock_notifier(struct notifier_block *nb);
extern int register_hotplug_dock_device(acpi_handle handle,
- struct acpi_dock_ops *ops,
+ const struct acpi_dock_ops *ops,
void *context);
extern void unregister_hotplug_dock_device(acpi_handle handle);
#else
diff --git a/include/acpi/acpixf.h b/include/acpi/acpixf.h
index 2ed0a84..f554a93 100644
--- a/include/acpi/acpixf.h
+++ b/include/acpi/acpixf.h
@@ -47,7 +47,7 @@
/* Current ACPICA subsystem version in YYYYMMDD format */
-#define ACPI_CA_VERSION 0x20110413
+#define ACPI_CA_VERSION 0x20110623
#include "actypes.h"
#include "actbl.h"
@@ -69,6 +69,7 @@
extern u32 acpi_gbl_enable_aml_debug_object;
extern u8 acpi_gbl_copy_dsdt_locally;
extern u8 acpi_gbl_truncate_io_addresses;
+extern u8 acpi_gbl_disable_auto_repair;
extern u32 acpi_current_gpe_count;
extern struct acpi_table_fadt acpi_gbl_FADT;
diff --git a/include/acpi/apei.h b/include/acpi/apei.h
index e67b523..51a527d 100644
--- a/include/acpi/apei.h
+++ b/include/acpi/apei.h
@@ -18,6 +18,11 @@
extern int hest_disable;
extern int erst_disable;
+#ifdef CONFIG_ACPI_APEI_GHES
+extern int ghes_disable;
+#else
+#define ghes_disable 1
+#endif
#ifdef CONFIG_ACPI_APEI
void __init acpi_hest_init(void);
diff --git a/include/acpi/processor.h b/include/acpi/processor.h
index ba4928c..67055f1 100644
--- a/include/acpi/processor.h
+++ b/include/acpi/processor.h
@@ -337,7 +337,7 @@
/* in processor_thermal.c */
int acpi_processor_get_limit_info(struct acpi_processor *pr);
-extern struct thermal_cooling_device_ops processor_cooling_ops;
+extern const struct thermal_cooling_device_ops processor_cooling_ops;
#ifdef CONFIG_CPU_FREQ
void acpi_thermal_cpufreq_init(void);
void acpi_thermal_cpufreq_exit(void);
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 1deb2a7..6001b4da 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -238,7 +238,6 @@
extern int pnpacpi_disabled;
#define PXM_INVAL (-1)
-#define NID_INVAL (-1)
int acpi_check_resource_conflict(const struct resource *res);
@@ -280,6 +279,8 @@
#define OSC_SB_CPUHP_OST_SUPPORT 8
#define OSC_SB_APEI_SUPPORT 16
+extern bool osc_sb_apei_support_acked;
+
/* PCI defined _OSC bits */
/* _OSC DW1 Definition (OS Support Fields) */
#define OSC_EXT_PCI_CONFIG_SUPPORT 1
diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
index 3bac44c..7ad6345 100644
--- a/include/linux/bitmap.h
+++ b/include/linux/bitmap.h
@@ -146,6 +146,7 @@
extern void bitmap_copy_le(void *dst, const unsigned long *src, int nbits);
extern int bitmap_ord_to_pos(const unsigned long *bitmap, int n, int bits);
+#define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) % BITS_PER_LONG))
#define BITMAP_LAST_WORD_MASK(nbits) \
( \
((nbits) % BITS_PER_LONG) ? \
diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h
index 36719ea..b51629e 100644
--- a/include/linux/cpuidle.h
+++ b/include/linux/cpuidle.h
@@ -122,6 +122,8 @@
};
#ifdef CONFIG_CPU_IDLE
+extern void disable_cpuidle(void);
+extern int cpuidle_idle_call(void);
extern int cpuidle_register_driver(struct cpuidle_driver *drv);
struct cpuidle_driver *cpuidle_get_driver(void);
@@ -135,6 +137,8 @@
extern void cpuidle_disable_device(struct cpuidle_device *dev);
#else
+static inline void disable_cpuidle(void) { }
+static inline int cpuidle_idle_call(void) { return -ENODEV; }
static inline int cpuidle_register_driver(struct cpuidle_driver *drv)
{return -ENODEV; }
diff --git a/include/linux/fault-inject.h b/include/linux/fault-inject.h
index 3ff060a..c6f996f 100644
--- a/include/linux/fault-inject.h
+++ b/include/linux/fault-inject.h
@@ -25,10 +25,6 @@
unsigned long reject_end;
unsigned long count;
-
-#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
- struct dentry *dir;
-#endif
};
#define FAULT_ATTR_INITIALIZER { \
@@ -45,19 +41,15 @@
#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
-int init_fault_attr_dentries(struct fault_attr *attr, const char *name);
-void cleanup_fault_attr_dentries(struct fault_attr *attr);
+struct dentry *fault_create_debugfs_attr(const char *name,
+ struct dentry *parent, struct fault_attr *attr);
#else /* CONFIG_FAULT_INJECTION_DEBUG_FS */
-static inline int init_fault_attr_dentries(struct fault_attr *attr,
- const char *name)
+static inline struct dentry *fault_create_debugfs_attr(const char *name,
+ struct dentry *parent, struct fault_attr *attr)
{
- return -ENODEV;
-}
-
-static inline void cleanup_fault_attr_dentries(struct fault_attr *attr)
-{
+ return ERR_PTR(-ENODEV);
}
#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
diff --git a/include/linux/genalloc.h b/include/linux/genalloc.h
index 5bbebda..5e98eeb 100644
--- a/include/linux/genalloc.h
+++ b/include/linux/genalloc.h
@@ -1,8 +1,26 @@
/*
- * Basic general purpose allocator for managing special purpose memory
- * not managed by the regular kmalloc/kfree interface.
- * Uses for this includes on-device special memory, uncached memory
- * etc.
+ * Basic general purpose allocator for managing special purpose
+ * memory, for example, memory that is not managed by the regular
+ * kmalloc/kfree interface. Uses for this includes on-device special
+ * memory, uncached memory etc.
+ *
+ * It is safe to use the allocator in NMI handlers and other special
+ * unblockable contexts that could otherwise deadlock on locks. This
+ * is implemented by using atomic operations and retries on any
+ * conflicts. The disadvantage is that there may be livelocks in
+ * extreme cases. For better scalability, one allocator can be used
+ * for each CPU.
+ *
+ * The lockless operation only works if there is enough memory
+ * available. If new memory is added to the pool a lock has to be
+ * still taken. So any user relying on locklessness has to ensure
+ * that sufficient memory is preallocated.
+ *
+ * The basic atomic operation of this allocator is cmpxchg on long.
+ * On architectures that don't have NMI-safe cmpxchg implementation,
+ * the allocator can NOT be used in NMI handler. So code uses the
+ * allocator in NMI handler should depend on
+ * CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG.
*
* This source code is licensed under the GNU General Public License,
* Version 2. See the file COPYING for more details.
@@ -15,7 +33,7 @@
* General purpose special memory pool descriptor.
*/
struct gen_pool {
- rwlock_t lock;
+ spinlock_t lock;
struct list_head chunks; /* list of chunks in this pool */
int min_alloc_order; /* minimum allocation order */
};
@@ -24,8 +42,8 @@
* General purpose special memory pool chunk descriptor.
*/
struct gen_pool_chunk {
- spinlock_t lock;
struct list_head next_chunk; /* next chunk in pool */
+ atomic_t avail;
phys_addr_t phys_addr; /* physical starting address of memory chunk */
unsigned long start_addr; /* starting address of memory chunk */
unsigned long end_addr; /* ending address of memory chunk */
@@ -56,4 +74,8 @@
extern void gen_pool_destroy(struct gen_pool *);
extern unsigned long gen_pool_alloc(struct gen_pool *, size_t);
extern void gen_pool_free(struct gen_pool *, unsigned long, size_t);
+extern void gen_pool_for_each_chunk(struct gen_pool *,
+ void (*)(struct gen_pool *, struct gen_pool_chunk *, void *), void *);
+extern size_t gen_pool_avail(struct gen_pool *);
+extern size_t gen_pool_size(struct gen_pool *);
#endif /* __GENALLOC_H__ */
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index cb40892..3a76faf 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -92,7 +92,7 @@
*/
#define __GFP_NOTRACK_FALSE_POSITIVE (__GFP_NOTRACK)
-#define __GFP_BITS_SHIFT 23 /* Room for 23 __GFP_FOO bits */
+#define __GFP_BITS_SHIFT 24 /* Room for N __GFP_FOO bits */
#define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
/* This equals 0, but use constants in case they ever change */
diff --git a/include/linux/idr.h b/include/linux/idr.h
index 13a801f..255491c 100644
--- a/include/linux/idr.h
+++ b/include/linux/idr.h
@@ -146,6 +146,10 @@
void ida_destroy(struct ida *ida);
void ida_init(struct ida *ida);
+int ida_simple_get(struct ida *ida, unsigned int start, unsigned int end,
+ gfp_t gfp_mask);
+void ida_simple_remove(struct ida *ida, unsigned int id);
+
void __init idr_init_cache(void);
#endif /* __IDR_H__ */
diff --git a/include/linux/llist.h b/include/linux/llist.h
new file mode 100644
index 0000000..aa0c8b5
--- /dev/null
+++ b/include/linux/llist.h
@@ -0,0 +1,126 @@
+#ifndef LLIST_H
+#define LLIST_H
+/*
+ * Lock-less NULL terminated single linked list
+ *
+ * If there are multiple producers and multiple consumers, llist_add
+ * can be used in producers and llist_del_all can be used in
+ * consumers. They can work simultaneously without lock. But
+ * llist_del_first can not be used here. Because llist_del_first
+ * depends on list->first->next does not changed if list->first is not
+ * changed during its operation, but llist_del_first, llist_add,
+ * llist_add (or llist_del_all, llist_add, llist_add) sequence in
+ * another consumer may violate that.
+ *
+ * If there are multiple producers and one consumer, llist_add can be
+ * used in producers and llist_del_all or llist_del_first can be used
+ * in the consumer.
+ *
+ * This can be summarized as follow:
+ *
+ * | add | del_first | del_all
+ * add | - | - | -
+ * del_first | | L | L
+ * del_all | | | -
+ *
+ * Where "-" stands for no lock is needed, while "L" stands for lock
+ * is needed.
+ *
+ * The list entries deleted via llist_del_all can be traversed with
+ * traversing function such as llist_for_each etc. But the list
+ * entries can not be traversed safely before deleted from the list.
+ * The order of deleted entries is from the newest to the oldest added
+ * one. If you want to traverse from the oldest to the newest, you
+ * must reverse the order by yourself before traversing.
+ *
+ * The basic atomic operation of this list is cmpxchg on long. On
+ * architectures that don't have NMI-safe cmpxchg implementation, the
+ * list can NOT be used in NMI handler. So code uses the list in NMI
+ * handler should depend on CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG.
+ */
+
+struct llist_head {
+ struct llist_node *first;
+};
+
+struct llist_node {
+ struct llist_node *next;
+};
+
+#define LLIST_HEAD_INIT(name) { NULL }
+#define LLIST_HEAD(name) struct llist_head name = LLIST_HEAD_INIT(name)
+
+/**
+ * init_llist_head - initialize lock-less list head
+ * @head: the head for your lock-less list
+ */
+static inline void init_llist_head(struct llist_head *list)
+{
+ list->first = NULL;
+}
+
+/**
+ * llist_entry - get the struct of this entry
+ * @ptr: the &struct llist_node pointer.
+ * @type: the type of the struct this is embedded in.
+ * @member: the name of the llist_node within the struct.
+ */
+#define llist_entry(ptr, type, member) \
+ container_of(ptr, type, member)
+
+/**
+ * llist_for_each - iterate over some deleted entries of a lock-less list
+ * @pos: the &struct llist_node to use as a loop cursor
+ * @node: the first entry of deleted list entries
+ *
+ * In general, some entries of the lock-less list can be traversed
+ * safely only after being deleted from list, so start with an entry
+ * instead of list head.
+ *
+ * If being used on entries deleted from lock-less list directly, the
+ * traverse order is from the newest to the oldest added entry. If
+ * you want to traverse from the oldest to the newest, you must
+ * reverse the order by yourself before traversing.
+ */
+#define llist_for_each(pos, node) \
+ for ((pos) = (node); pos; (pos) = (pos)->next)
+
+/**
+ * llist_for_each_entry - iterate over some deleted entries of lock-less list of given type
+ * @pos: the type * to use as a loop cursor.
+ * @node: the fist entry of deleted list entries.
+ * @member: the name of the llist_node with the struct.
+ *
+ * In general, some entries of the lock-less list can be traversed
+ * safely only after being removed from list, so start with an entry
+ * instead of list head.
+ *
+ * If being used on entries deleted from lock-less list directly, the
+ * traverse order is from the newest to the oldest added entry. If
+ * you want to traverse from the oldest to the newest, you must
+ * reverse the order by yourself before traversing.
+ */
+#define llist_for_each_entry(pos, node, member) \
+ for ((pos) = llist_entry((node), typeof(*(pos)), member); \
+ &(pos)->member != NULL; \
+ (pos) = llist_entry((pos)->member.next, typeof(*(pos)), member))
+
+/**
+ * llist_empty - tests whether a lock-less list is empty
+ * @head: the list to test
+ *
+ * Not guaranteed to be accurate or up to date. Just a quick way to
+ * test whether the list is empty without deleting something from the
+ * list.
+ */
+static inline int llist_empty(const struct llist_head *head)
+{
+ return ACCESS_ONCE(head->first) == NULL;
+}
+
+void llist_add(struct llist_node *new, struct llist_head *head);
+void llist_add_batch(struct llist_node *new_first, struct llist_node *new_last,
+ struct llist_head *head);
+struct llist_node *llist_del_first(struct llist_head *head);
+struct llist_node *llist_del_all(struct llist_head *head);
+#endif /* LLIST_H */
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index b966007..3b535db 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -86,8 +86,6 @@
extern void mem_cgroup_uncharge_page(struct page *page);
extern void mem_cgroup_uncharge_cache_page(struct page *page);
-extern int mem_cgroup_shmem_charge_fallback(struct page *page,
- struct mm_struct *mm, gfp_t gfp_mask);
extern void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask);
int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem);
@@ -225,12 +223,6 @@
{
}
-static inline int mem_cgroup_shmem_charge_fallback(struct page *page,
- struct mm_struct *mm, gfp_t gfp_mask)
-{
- return 0;
-}
-
static inline void mem_cgroup_add_lru_list(struct page *page, int lru)
{
}
diff --git a/include/linux/mfd/aat2870.h b/include/linux/mfd/aat2870.h
index 89212df..f7316c2 100644
--- a/include/linux/mfd/aat2870.h
+++ b/include/linux/mfd/aat2870.h
@@ -89,7 +89,7 @@
/* Backlight current magnitude (mA) */
enum aat2870_current {
- AAT2870_CURRENT_0_45,
+ AAT2870_CURRENT_0_45 = 1,
AAT2870_CURRENT_0_90,
AAT2870_CURRENT_1_80,
AAT2870_CURRENT_2_70,
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 3172a1c..f2690cf 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1600,6 +1600,7 @@
};
extern void memory_failure(unsigned long pfn, int trapno);
extern int __memory_failure(unsigned long pfn, int trapno, int flags);
+extern void memory_failure_queue(unsigned long pfn, int trapno, int flags);
extern int unpoison_memory(unsigned long pfn);
extern int sysctl_memory_failure_early_kill;
extern int sysctl_memory_failure_recovery;
diff --git a/include/linux/of.h b/include/linux/of.h
index 0085bb0..bc3dc63 100644
--- a/include/linux/of.h
+++ b/include/linux/of.h
@@ -68,6 +68,7 @@
/* Pointer for first entry in chain of all nodes. */
extern struct device_node *allnodes;
extern struct device_node *of_chosen;
+extern struct device_node *of_aliases;
extern rwlock_t devtree_lock;
static inline bool of_have_populated_dt(void)
@@ -209,6 +210,9 @@
extern const void *of_get_property(const struct device_node *node,
const char *name,
int *lenp);
+#define for_each_property(pp, properties) \
+ for (pp = properties; pp != NULL; pp = pp->next)
+
extern int of_n_addr_cells(struct device_node *np);
extern int of_n_size_cells(struct device_node *np);
extern const struct of_device_id *of_match_node(
@@ -221,6 +225,10 @@
const char *list_name, const char *cells_name, int index,
struct device_node **out_node, const void **out_args);
+extern void *early_init_dt_alloc_memory_arch(u64 size, u64 align);
+extern void of_alias_scan(void);
+extern int of_alias_get_id(struct device_node *np, const char *stem);
+
extern int of_machine_is_compatible(const char *compat);
extern int prom_add_property(struct device_node* np, struct property* prop);
diff --git a/include/linux/of_fdt.h b/include/linux/of_fdt.h
index c84d900..b74b74f 100644
--- a/include/linux/of_fdt.h
+++ b/include/linux/of_fdt.h
@@ -97,7 +97,6 @@
extern int early_init_dt_scan_memory(unsigned long node, const char *uname,
int depth, void *data);
extern void early_init_dt_add_memory_arch(u64 base, u64 size);
-extern void * early_init_dt_alloc_memory_arch(u64 size, u64 align);
extern u64 dt_mem_next_cell(int s, __be32 **cellp);
/*
diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index 23241c2..9d4539c 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -39,7 +39,15 @@
* when it is shrunk, before we rcu free the node. See shrink code for
* details.
*/
-#define RADIX_TREE_INDIRECT_PTR 1
+#define RADIX_TREE_INDIRECT_PTR 1
+/*
+ * A common use of the radix tree is to store pointers to struct pages;
+ * but shmem/tmpfs needs also to store swap entries in the same tree:
+ * those are marked as exceptional entries to distinguish them.
+ * EXCEPTIONAL_ENTRY tests the bit, EXCEPTIONAL_SHIFT shifts content past it.
+ */
+#define RADIX_TREE_EXCEPTIONAL_ENTRY 2
+#define RADIX_TREE_EXCEPTIONAL_SHIFT 2
#define radix_tree_indirect_to_ptr(ptr) \
radix_tree_indirect_to_ptr((void __force *)(ptr))
@@ -174,6 +182,28 @@
}
/**
+ * radix_tree_exceptional_entry - radix_tree_deref_slot gave exceptional entry?
+ * @arg: value returned by radix_tree_deref_slot
+ * Returns: 0 if well-aligned pointer, non-0 if exceptional entry.
+ */
+static inline int radix_tree_exceptional_entry(void *arg)
+{
+ /* Not unlikely because radix_tree_exception often tested first */
+ return (unsigned long)arg & RADIX_TREE_EXCEPTIONAL_ENTRY;
+}
+
+/**
+ * radix_tree_exception - radix_tree_deref_slot returned either exception?
+ * @arg: value returned by radix_tree_deref_slot
+ * Returns: 0 if well-aligned pointer, non-0 if either kind of exception.
+ */
+static inline int radix_tree_exception(void *arg)
+{
+ return unlikely((unsigned long)arg &
+ (RADIX_TREE_INDIRECT_PTR | RADIX_TREE_EXCEPTIONAL_ENTRY));
+}
+
+/**
* radix_tree_replace_slot - replace item in a slot
* @pslot: pointer to slot, returned by radix_tree_lookup_slot
* @item: new item to store in the slot.
@@ -194,8 +224,8 @@
unsigned int
radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
unsigned long first_index, unsigned int max_items);
-unsigned int
-radix_tree_gang_lookup_slot(struct radix_tree_root *root, void ***results,
+unsigned int radix_tree_gang_lookup_slot(struct radix_tree_root *root,
+ void ***results, unsigned long *indices,
unsigned long first_index, unsigned int max_items);
unsigned long radix_tree_next_hole(struct radix_tree_root *root,
unsigned long index, unsigned long max_scan);
@@ -222,6 +252,7 @@
unsigned long nr_to_tag,
unsigned int fromtag, unsigned int totag);
int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag);
+unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item);
static inline void radix_tree_preload_end(void)
{
diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index aa08fa8..9291ac3 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -8,22 +8,15 @@
/* inode in-kernel data */
-#define SHMEM_NR_DIRECT 16
-
-#define SHMEM_SYMLINK_INLINE_LEN (SHMEM_NR_DIRECT * sizeof(swp_entry_t))
-
struct shmem_inode_info {
spinlock_t lock;
unsigned long flags;
unsigned long alloced; /* data pages alloced to file */
- unsigned long swapped; /* subtotal assigned to swap */
- unsigned long next_index; /* highest alloced index + 1 */
- struct shared_policy policy; /* NUMA memory alloc policy */
- struct page *i_indirect; /* top indirect blocks page */
union {
- swp_entry_t i_direct[SHMEM_NR_DIRECT]; /* first blocks */
- char inline_symlink[SHMEM_SYMLINK_INLINE_LEN];
+ unsigned long swapped; /* subtotal assigned to swap */
+ char *symlink; /* unswappable short symlink */
};
+ struct shared_policy policy; /* NUMA memory alloc policy */
struct list_head swaplist; /* chain of maybes on swap */
struct list_head xattr_list; /* list of shmem_xattr */
struct inode vfs_inode;
@@ -49,7 +42,7 @@
/*
* Functions in mm/shmem.c called directly from elsewhere:
*/
-extern int init_tmpfs(void);
+extern int shmem_init(void);
extern int shmem_fill_super(struct super_block *sb, void *data, int silent);
extern struct file *shmem_file_setup(const char *name,
loff_t size, unsigned long flags);
@@ -59,8 +52,6 @@
pgoff_t index, gfp_t gfp_mask);
extern void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end);
extern int shmem_unuse(swp_entry_t entry, struct page *page);
-extern void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t pgoff,
- struct page **pagep, swp_entry_t *ent);
static inline struct page *shmem_read_mapping_page(
struct address_space *mapping, pgoff_t index)
diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index cd42e30..2189d3f 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -1,3 +1,8 @@
+#ifndef _LINUX_SWAPOPS_H
+#define _LINUX_SWAPOPS_H
+
+#include <linux/radix-tree.h>
+
/*
* swapcache pages are stored in the swapper_space radix tree. We want to
* get good packing density in that tree, so the index should be dense in
@@ -76,6 +81,22 @@
return __swp_entry_to_pte(arch_entry);
}
+static inline swp_entry_t radix_to_swp_entry(void *arg)
+{
+ swp_entry_t entry;
+
+ entry.val = (unsigned long)arg >> RADIX_TREE_EXCEPTIONAL_SHIFT;
+ return entry;
+}
+
+static inline void *swp_to_radix_entry(swp_entry_t entry)
+{
+ unsigned long value;
+
+ value = entry.val << RADIX_TREE_EXCEPTIONAL_SHIFT;
+ return (void *)(value | RADIX_TREE_EXCEPTIONAL_ENTRY);
+}
+
#ifdef CONFIG_MIGRATION
static inline swp_entry_t make_migration_entry(struct page *page, int write)
{
@@ -169,3 +190,5 @@
return 0;
}
#endif
+
+#endif /* _LINUX_SWAPOPS_H */
diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index d3ec89f..47b4a27 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -85,22 +85,6 @@
((long)t-2732+5)/10 : ((long)t-2732-5)/10)
#define CELSIUS_TO_KELVIN(t) ((t)*10+2732)
-#if defined(CONFIG_THERMAL_HWMON)
-/* thermal zone devices with the same type share one hwmon device */
-struct thermal_hwmon_device {
- char type[THERMAL_NAME_LENGTH];
- struct device *device;
- int count;
- struct list_head tz_list;
- struct list_head node;
-};
-
-struct thermal_hwmon_attr {
- struct device_attribute attr;
- char name[16];
-};
-#endif
-
struct thermal_zone_device {
int id;
char type[THERMAL_NAME_LENGTH];
@@ -120,12 +104,6 @@
struct mutex lock; /* protect cooling devices list */
struct list_head node;
struct delayed_work poll_queue;
-#if defined(CONFIG_THERMAL_HWMON)
- struct list_head hwmon_node;
- struct thermal_hwmon_device *hwmon;
- struct thermal_hwmon_attr temp_input; /* hwmon sys attr */
- struct thermal_hwmon_attr temp_crit; /* hwmon sys attr */
-#endif
};
/* Adding event notification support elements */
#define THERMAL_GENL_FAMILY_NAME "thermal_event"
diff --git a/init/main.c b/init/main.c
index d7211fa..9c51ee7 100644
--- a/init/main.c
+++ b/init/main.c
@@ -369,9 +369,12 @@
init_idle_bootup_task(current);
preempt_enable_no_resched();
schedule();
- preempt_disable();
+
+ /* At this point, we can enable user mode helper functionality */
+ usermodehelper_enable();
/* Call into cpu_idle with preempt disabled */
+ preempt_disable();
cpu_idle();
}
@@ -715,7 +718,7 @@
{
cpuset_init_smp();
usermodehelper_init();
- init_tmpfs();
+ shmem_init();
driver_init();
init_irq_proc();
do_ctors();
diff --git a/ipc/shm.c b/ipc/shm.c
index 9fb044f3b..b5bae9d 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -294,7 +294,7 @@
void shm_destroy_orphaned(struct ipc_namespace *ns)
{
down_write(&shm_ids(ns).rw_mutex);
- if (&shm_ids(ns).in_use)
+ if (shm_ids(ns).in_use)
idr_for_each(&shm_ids(ns).ipcs_idr, &shm_try_destroy_orphaned, ns);
up_write(&shm_ids(ns).rw_mutex);
}
@@ -304,9 +304,12 @@
{
struct ipc_namespace *ns = task->nsproxy->ipc_ns;
+ if (shm_ids(ns).in_use == 0)
+ return;
+
/* Destroy all already created segments, but not mapped yet */
down_write(&shm_ids(ns).rw_mutex);
- if (&shm_ids(ns).in_use)
+ if (shm_ids(ns).in_use)
idr_for_each(&shm_ids(ns).ipcs_idr, &shm_try_destroy_current, ns);
up_write(&shm_ids(ns).rw_mutex);
}
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 47613df..ddc7644 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -274,7 +274,7 @@
* (used for preventing user land processes from being created after the user
* land has been frozen during a system-wide hibernation or suspend operation).
*/
-static int usermodehelper_disabled;
+static int usermodehelper_disabled = 1;
/* Number of helpers running */
static atomic_t running_helpers = ATOMIC_INIT(0);
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index d1db288..e19ce14 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -291,30 +291,28 @@
if (!cpumask_subset(mask, cpu_possible_mask))
return -EINVAL;
- s = NULL;
if (isadd == REGISTER) {
for_each_cpu(cpu, mask) {
- if (!s)
- s = kmalloc_node(sizeof(struct listener),
- GFP_KERNEL, cpu_to_node(cpu));
+ s = kmalloc_node(sizeof(struct listener),
+ GFP_KERNEL, cpu_to_node(cpu));
if (!s)
goto cleanup;
+
s->pid = pid;
- INIT_LIST_HEAD(&s->list);
s->valid = 1;
listeners = &per_cpu(listener_array, cpu);
down_write(&listeners->sem);
- list_for_each_entry_safe(s2, tmp, &listeners->list, list) {
- if (s2->pid == pid)
- goto next_cpu;
+ list_for_each_entry(s2, &listeners->list, list) {
+ if (s2->pid == pid && s2->valid)
+ goto exists;
}
list_add(&s->list, &listeners->list);
s = NULL;
-next_cpu:
+exists:
up_write(&listeners->sem);
+ kfree(s); /* nop if NULL */
}
- kfree(s);
return 0;
}
diff --git a/lib/Kconfig b/lib/Kconfig
index 32f3e5a..6c695ff 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -276,4 +276,7 @@
so its calculations are in fixed point. Modules can select this
when they require this function. Module will be called cordic.
+config LLIST
+ bool
+
endmenu
diff --git a/lib/Makefile b/lib/Makefile
index 892f4e2..6457af4a 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -115,6 +115,8 @@
obj-$(CONFIG_CORDIC) += cordic.o
+obj-$(CONFIG_LLIST) += llist.o
+
hostprogs-y := gen_crc32table
clean-files := crc32table.h
diff --git a/lib/bitmap.c b/lib/bitmap.c
index 37ef4b0..2f4412e 100644
--- a/lib/bitmap.c
+++ b/lib/bitmap.c
@@ -271,8 +271,6 @@
}
EXPORT_SYMBOL(__bitmap_weight);
-#define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) % BITS_PER_LONG))
-
void bitmap_set(unsigned long *map, int start, int nr)
{
unsigned long *p = map + BIT_WORD(start);
diff --git a/lib/fault-inject.c b/lib/fault-inject.c
index 2577b12..f193b77 100644
--- a/lib/fault-inject.c
+++ b/lib/fault-inject.c
@@ -197,21 +197,15 @@
return debugfs_create_file(name, mode, parent, value, &fops_atomic_t);
}
-void cleanup_fault_attr_dentries(struct fault_attr *attr)
-{
- debugfs_remove_recursive(attr->dir);
-}
-
-int init_fault_attr_dentries(struct fault_attr *attr, const char *name)
+struct dentry *fault_create_debugfs_attr(const char *name,
+ struct dentry *parent, struct fault_attr *attr)
{
mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
struct dentry *dir;
- dir = debugfs_create_dir(name, NULL);
+ dir = debugfs_create_dir(name, parent);
if (!dir)
- return -ENOMEM;
-
- attr->dir = dir;
+ return ERR_PTR(-ENOMEM);
if (!debugfs_create_ul("probability", mode, dir, &attr->probability))
goto fail;
@@ -243,11 +237,11 @@
#endif /* CONFIG_FAULT_INJECTION_STACKTRACE_FILTER */
- return 0;
+ return dir;
fail:
- debugfs_remove_recursive(attr->dir);
+ debugfs_remove_recursive(dir);
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
}
#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
diff --git a/lib/genalloc.c b/lib/genalloc.c
index 577ddf8..f352cc4 100644
--- a/lib/genalloc.c
+++ b/lib/genalloc.c
@@ -1,8 +1,26 @@
/*
- * Basic general purpose allocator for managing special purpose memory
- * not managed by the regular kmalloc/kfree interface.
- * Uses for this includes on-device special memory, uncached memory
- * etc.
+ * Basic general purpose allocator for managing special purpose
+ * memory, for example, memory that is not managed by the regular
+ * kmalloc/kfree interface. Uses for this includes on-device special
+ * memory, uncached memory etc.
+ *
+ * It is safe to use the allocator in NMI handlers and other special
+ * unblockable contexts that could otherwise deadlock on locks. This
+ * is implemented by using atomic operations and retries on any
+ * conflicts. The disadvantage is that there may be livelocks in
+ * extreme cases. For better scalability, one allocator can be used
+ * for each CPU.
+ *
+ * The lockless operation only works if there is enough memory
+ * available. If new memory is added to the pool a lock has to be
+ * still taken. So any user relying on locklessness has to ensure
+ * that sufficient memory is preallocated.
+ *
+ * The basic atomic operation of this allocator is cmpxchg on long.
+ * On architectures that don't have NMI-safe cmpxchg implementation,
+ * the allocator can NOT be used in NMI handler. So code uses the
+ * allocator in NMI handler should depend on
+ * CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG.
*
* Copyright 2005 (C) Jes Sorensen <jes@trained-monkey.org>
*
@@ -13,8 +31,109 @@
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/bitmap.h>
+#include <linux/rculist.h>
+#include <linux/interrupt.h>
#include <linux/genalloc.h>
+static int set_bits_ll(unsigned long *addr, unsigned long mask_to_set)
+{
+ unsigned long val, nval;
+
+ nval = *addr;
+ do {
+ val = nval;
+ if (val & mask_to_set)
+ return -EBUSY;
+ cpu_relax();
+ } while ((nval = cmpxchg(addr, val, val | mask_to_set)) != val);
+
+ return 0;
+}
+
+static int clear_bits_ll(unsigned long *addr, unsigned long mask_to_clear)
+{
+ unsigned long val, nval;
+
+ nval = *addr;
+ do {
+ val = nval;
+ if ((val & mask_to_clear) != mask_to_clear)
+ return -EBUSY;
+ cpu_relax();
+ } while ((nval = cmpxchg(addr, val, val & ~mask_to_clear)) != val);
+
+ return 0;
+}
+
+/*
+ * bitmap_set_ll - set the specified number of bits at the specified position
+ * @map: pointer to a bitmap
+ * @start: a bit position in @map
+ * @nr: number of bits to set
+ *
+ * Set @nr bits start from @start in @map lock-lessly. Several users
+ * can set/clear the same bitmap simultaneously without lock. If two
+ * users set the same bit, one user will return remain bits, otherwise
+ * return 0.
+ */
+static int bitmap_set_ll(unsigned long *map, int start, int nr)
+{
+ unsigned long *p = map + BIT_WORD(start);
+ const int size = start + nr;
+ int bits_to_set = BITS_PER_LONG - (start % BITS_PER_LONG);
+ unsigned long mask_to_set = BITMAP_FIRST_WORD_MASK(start);
+
+ while (nr - bits_to_set >= 0) {
+ if (set_bits_ll(p, mask_to_set))
+ return nr;
+ nr -= bits_to_set;
+ bits_to_set = BITS_PER_LONG;
+ mask_to_set = ~0UL;
+ p++;
+ }
+ if (nr) {
+ mask_to_set &= BITMAP_LAST_WORD_MASK(size);
+ if (set_bits_ll(p, mask_to_set))
+ return nr;
+ }
+
+ return 0;
+}
+
+/*
+ * bitmap_clear_ll - clear the specified number of bits at the specified position
+ * @map: pointer to a bitmap
+ * @start: a bit position in @map
+ * @nr: number of bits to set
+ *
+ * Clear @nr bits start from @start in @map lock-lessly. Several users
+ * can set/clear the same bitmap simultaneously without lock. If two
+ * users clear the same bit, one user will return remain bits,
+ * otherwise return 0.
+ */
+static int bitmap_clear_ll(unsigned long *map, int start, int nr)
+{
+ unsigned long *p = map + BIT_WORD(start);
+ const int size = start + nr;
+ int bits_to_clear = BITS_PER_LONG - (start % BITS_PER_LONG);
+ unsigned long mask_to_clear = BITMAP_FIRST_WORD_MASK(start);
+
+ while (nr - bits_to_clear >= 0) {
+ if (clear_bits_ll(p, mask_to_clear))
+ return nr;
+ nr -= bits_to_clear;
+ bits_to_clear = BITS_PER_LONG;
+ mask_to_clear = ~0UL;
+ p++;
+ }
+ if (nr) {
+ mask_to_clear &= BITMAP_LAST_WORD_MASK(size);
+ if (clear_bits_ll(p, mask_to_clear))
+ return nr;
+ }
+
+ return 0;
+}
/**
* gen_pool_create - create a new special memory pool
@@ -30,7 +149,7 @@
pool = kmalloc_node(sizeof(struct gen_pool), GFP_KERNEL, nid);
if (pool != NULL) {
- rwlock_init(&pool->lock);
+ spin_lock_init(&pool->lock);
INIT_LIST_HEAD(&pool->chunks);
pool->min_alloc_order = min_alloc_order;
}
@@ -63,14 +182,14 @@
if (unlikely(chunk == NULL))
return -ENOMEM;
- spin_lock_init(&chunk->lock);
chunk->phys_addr = phys;
chunk->start_addr = virt;
chunk->end_addr = virt + size;
+ atomic_set(&chunk->avail, size);
- write_lock(&pool->lock);
- list_add(&chunk->next_chunk, &pool->chunks);
- write_unlock(&pool->lock);
+ spin_lock(&pool->lock);
+ list_add_rcu(&chunk->next_chunk, &pool->chunks);
+ spin_unlock(&pool->lock);
return 0;
}
@@ -85,19 +204,19 @@
*/
phys_addr_t gen_pool_virt_to_phys(struct gen_pool *pool, unsigned long addr)
{
- struct list_head *_chunk;
struct gen_pool_chunk *chunk;
+ phys_addr_t paddr = -1;
- read_lock(&pool->lock);
- list_for_each(_chunk, &pool->chunks) {
- chunk = list_entry(_chunk, struct gen_pool_chunk, next_chunk);
-
- if (addr >= chunk->start_addr && addr < chunk->end_addr)
- return chunk->phys_addr + addr - chunk->start_addr;
+ rcu_read_lock();
+ list_for_each_entry_rcu(chunk, &pool->chunks, next_chunk) {
+ if (addr >= chunk->start_addr && addr < chunk->end_addr) {
+ paddr = chunk->phys_addr + (addr - chunk->start_addr);
+ break;
+ }
}
- read_unlock(&pool->lock);
+ rcu_read_unlock();
- return -1;
+ return paddr;
}
EXPORT_SYMBOL(gen_pool_virt_to_phys);
@@ -115,7 +234,6 @@
int order = pool->min_alloc_order;
int bit, end_bit;
-
list_for_each_safe(_chunk, _next_chunk, &pool->chunks) {
chunk = list_entry(_chunk, struct gen_pool_chunk, next_chunk);
list_del(&chunk->next_chunk);
@@ -137,44 +255,50 @@
* @size: number of bytes to allocate from the pool
*
* Allocate the requested number of bytes from the specified pool.
- * Uses a first-fit algorithm.
+ * Uses a first-fit algorithm. Can not be used in NMI handler on
+ * architectures without NMI-safe cmpxchg implementation.
*/
unsigned long gen_pool_alloc(struct gen_pool *pool, size_t size)
{
- struct list_head *_chunk;
struct gen_pool_chunk *chunk;
- unsigned long addr, flags;
+ unsigned long addr = 0;
int order = pool->min_alloc_order;
- int nbits, start_bit, end_bit;
+ int nbits, start_bit = 0, end_bit, remain;
+
+#ifndef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG
+ BUG_ON(in_nmi());
+#endif
if (size == 0)
return 0;
nbits = (size + (1UL << order) - 1) >> order;
-
- read_lock(&pool->lock);
- list_for_each(_chunk, &pool->chunks) {
- chunk = list_entry(_chunk, struct gen_pool_chunk, next_chunk);
+ rcu_read_lock();
+ list_for_each_entry_rcu(chunk, &pool->chunks, next_chunk) {
+ if (size > atomic_read(&chunk->avail))
+ continue;
end_bit = (chunk->end_addr - chunk->start_addr) >> order;
-
- spin_lock_irqsave(&chunk->lock, flags);
- start_bit = bitmap_find_next_zero_area(chunk->bits, end_bit, 0,
- nbits, 0);
- if (start_bit >= end_bit) {
- spin_unlock_irqrestore(&chunk->lock, flags);
+retry:
+ start_bit = bitmap_find_next_zero_area(chunk->bits, end_bit,
+ start_bit, nbits, 0);
+ if (start_bit >= end_bit)
continue;
+ remain = bitmap_set_ll(chunk->bits, start_bit, nbits);
+ if (remain) {
+ remain = bitmap_clear_ll(chunk->bits, start_bit,
+ nbits - remain);
+ BUG_ON(remain);
+ goto retry;
}
addr = chunk->start_addr + ((unsigned long)start_bit << order);
-
- bitmap_set(chunk->bits, start_bit, nbits);
- spin_unlock_irqrestore(&chunk->lock, flags);
- read_unlock(&pool->lock);
- return addr;
+ size = nbits << order;
+ atomic_sub(size, &chunk->avail);
+ break;
}
- read_unlock(&pool->lock);
- return 0;
+ rcu_read_unlock();
+ return addr;
}
EXPORT_SYMBOL(gen_pool_alloc);
@@ -184,33 +308,95 @@
* @addr: starting address of memory to free back to pool
* @size: size in bytes of memory to free
*
- * Free previously allocated special memory back to the specified pool.
+ * Free previously allocated special memory back to the specified
+ * pool. Can not be used in NMI handler on architectures without
+ * NMI-safe cmpxchg implementation.
*/
void gen_pool_free(struct gen_pool *pool, unsigned long addr, size_t size)
{
- struct list_head *_chunk;
struct gen_pool_chunk *chunk;
- unsigned long flags;
int order = pool->min_alloc_order;
- int bit, nbits;
+ int start_bit, nbits, remain;
+
+#ifndef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG
+ BUG_ON(in_nmi());
+#endif
nbits = (size + (1UL << order) - 1) >> order;
-
- read_lock(&pool->lock);
- list_for_each(_chunk, &pool->chunks) {
- chunk = list_entry(_chunk, struct gen_pool_chunk, next_chunk);
-
+ rcu_read_lock();
+ list_for_each_entry_rcu(chunk, &pool->chunks, next_chunk) {
if (addr >= chunk->start_addr && addr < chunk->end_addr) {
BUG_ON(addr + size > chunk->end_addr);
- spin_lock_irqsave(&chunk->lock, flags);
- bit = (addr - chunk->start_addr) >> order;
- while (nbits--)
- __clear_bit(bit++, chunk->bits);
- spin_unlock_irqrestore(&chunk->lock, flags);
- break;
+ start_bit = (addr - chunk->start_addr) >> order;
+ remain = bitmap_clear_ll(chunk->bits, start_bit, nbits);
+ BUG_ON(remain);
+ size = nbits << order;
+ atomic_add(size, &chunk->avail);
+ rcu_read_unlock();
+ return;
}
}
- BUG_ON(nbits > 0);
- read_unlock(&pool->lock);
+ rcu_read_unlock();
+ BUG();
}
EXPORT_SYMBOL(gen_pool_free);
+
+/**
+ * gen_pool_for_each_chunk - call func for every chunk of generic memory pool
+ * @pool: the generic memory pool
+ * @func: func to call
+ * @data: additional data used by @func
+ *
+ * Call @func for every chunk of generic memory pool. The @func is
+ * called with rcu_read_lock held.
+ */
+void gen_pool_for_each_chunk(struct gen_pool *pool,
+ void (*func)(struct gen_pool *pool, struct gen_pool_chunk *chunk, void *data),
+ void *data)
+{
+ struct gen_pool_chunk *chunk;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(chunk, &(pool)->chunks, next_chunk)
+ func(pool, chunk, data);
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL(gen_pool_for_each_chunk);
+
+/**
+ * gen_pool_avail - get available free space of the pool
+ * @pool: pool to get available free space
+ *
+ * Return available free space of the specified pool.
+ */
+size_t gen_pool_avail(struct gen_pool *pool)
+{
+ struct gen_pool_chunk *chunk;
+ size_t avail = 0;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(chunk, &pool->chunks, next_chunk)
+ avail += atomic_read(&chunk->avail);
+ rcu_read_unlock();
+ return avail;
+}
+EXPORT_SYMBOL_GPL(gen_pool_avail);
+
+/**
+ * gen_pool_size - get size in bytes of memory managed by the pool
+ * @pool: pool to get size
+ *
+ * Return size in bytes of memory managed by the pool.
+ */
+size_t gen_pool_size(struct gen_pool *pool)
+{
+ struct gen_pool_chunk *chunk;
+ size_t size = 0;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(chunk, &pool->chunks, next_chunk)
+ size += chunk->end_addr - chunk->start_addr;
+ rcu_read_unlock();
+ return size;
+}
+EXPORT_SYMBOL_GPL(gen_pool_size);
diff --git a/lib/idr.c b/lib/idr.c
index e15502e..db040ce 100644
--- a/lib/idr.c
+++ b/lib/idr.c
@@ -34,8 +34,10 @@
#include <linux/err.h>
#include <linux/string.h>
#include <linux/idr.h>
+#include <linux/spinlock.h>
static struct kmem_cache *idr_layer_cache;
+static DEFINE_SPINLOCK(simple_ida_lock);
static struct idr_layer *get_from_free_list(struct idr *idp)
{
@@ -926,6 +928,71 @@
EXPORT_SYMBOL(ida_destroy);
/**
+ * ida_simple_get - get a new id.
+ * @ida: the (initialized) ida.
+ * @start: the minimum id (inclusive, < 0x8000000)
+ * @end: the maximum id (exclusive, < 0x8000000 or 0)
+ * @gfp_mask: memory allocation flags
+ *
+ * Allocates an id in the range start <= id < end, or returns -ENOSPC.
+ * On memory allocation failure, returns -ENOMEM.
+ *
+ * Use ida_simple_remove() to get rid of an id.
+ */
+int ida_simple_get(struct ida *ida, unsigned int start, unsigned int end,
+ gfp_t gfp_mask)
+{
+ int ret, id;
+ unsigned int max;
+
+ BUG_ON((int)start < 0);
+ BUG_ON((int)end < 0);
+
+ if (end == 0)
+ max = 0x80000000;
+ else {
+ BUG_ON(end < start);
+ max = end - 1;
+ }
+
+again:
+ if (!ida_pre_get(ida, gfp_mask))
+ return -ENOMEM;
+
+ spin_lock(&simple_ida_lock);
+ ret = ida_get_new_above(ida, start, &id);
+ if (!ret) {
+ if (id > max) {
+ ida_remove(ida, id);
+ ret = -ENOSPC;
+ } else {
+ ret = id;
+ }
+ }
+ spin_unlock(&simple_ida_lock);
+
+ if (unlikely(ret == -EAGAIN))
+ goto again;
+
+ return ret;
+}
+EXPORT_SYMBOL(ida_simple_get);
+
+/**
+ * ida_simple_remove - remove an allocated id.
+ * @ida: the (initialized) ida.
+ * @id: the id returned by ida_simple_get.
+ */
+void ida_simple_remove(struct ida *ida, unsigned int id)
+{
+ BUG_ON((int)id < 0);
+ spin_lock(&simple_ida_lock);
+ ida_remove(ida, id);
+ spin_unlock(&simple_ida_lock);
+}
+EXPORT_SYMBOL(ida_simple_remove);
+
+/**
* ida_init - initialize ida handle
* @ida: ida handle
*
diff --git a/lib/llist.c b/lib/llist.c
new file mode 100644
index 0000000..da44572
--- /dev/null
+++ b/lib/llist.c
@@ -0,0 +1,129 @@
+/*
+ * Lock-less NULL terminated single linked list
+ *
+ * The basic atomic operation of this list is cmpxchg on long. On
+ * architectures that don't have NMI-safe cmpxchg implementation, the
+ * list can NOT be used in NMI handler. So code uses the list in NMI
+ * handler should depend on CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG.
+ *
+ * Copyright 2010,2011 Intel Corp.
+ * Author: Huang Ying <ying.huang@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation;
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/interrupt.h>
+#include <linux/llist.h>
+
+#include <asm/system.h>
+
+/**
+ * llist_add - add a new entry
+ * @new: new entry to be added
+ * @head: the head for your lock-less list
+ */
+void llist_add(struct llist_node *new, struct llist_head *head)
+{
+ struct llist_node *entry, *old_entry;
+
+#ifndef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG
+ BUG_ON(in_nmi());
+#endif
+
+ entry = head->first;
+ do {
+ old_entry = entry;
+ new->next = entry;
+ cpu_relax();
+ } while ((entry = cmpxchg(&head->first, old_entry, new)) != old_entry);
+}
+EXPORT_SYMBOL_GPL(llist_add);
+
+/**
+ * llist_add_batch - add several linked entries in batch
+ * @new_first: first entry in batch to be added
+ * @new_last: last entry in batch to be added
+ * @head: the head for your lock-less list
+ */
+void llist_add_batch(struct llist_node *new_first, struct llist_node *new_last,
+ struct llist_head *head)
+{
+ struct llist_node *entry, *old_entry;
+
+#ifndef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG
+ BUG_ON(in_nmi());
+#endif
+
+ entry = head->first;
+ do {
+ old_entry = entry;
+ new_last->next = entry;
+ cpu_relax();
+ } while ((entry = cmpxchg(&head->first, old_entry, new_first)) != old_entry);
+}
+EXPORT_SYMBOL_GPL(llist_add_batch);
+
+/**
+ * llist_del_first - delete the first entry of lock-less list
+ * @head: the head for your lock-less list
+ *
+ * If list is empty, return NULL, otherwise, return the first entry
+ * deleted, this is the newest added one.
+ *
+ * Only one llist_del_first user can be used simultaneously with
+ * multiple llist_add users without lock. Because otherwise
+ * llist_del_first, llist_add, llist_add (or llist_del_all, llist_add,
+ * llist_add) sequence in another user may change @head->first->next,
+ * but keep @head->first. If multiple consumers are needed, please
+ * use llist_del_all or use lock between consumers.
+ */
+struct llist_node *llist_del_first(struct llist_head *head)
+{
+ struct llist_node *entry, *old_entry, *next;
+
+#ifndef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG
+ BUG_ON(in_nmi());
+#endif
+
+ entry = head->first;
+ do {
+ if (entry == NULL)
+ return NULL;
+ old_entry = entry;
+ next = entry->next;
+ cpu_relax();
+ } while ((entry = cmpxchg(&head->first, old_entry, next)) != old_entry);
+
+ return entry;
+}
+EXPORT_SYMBOL_GPL(llist_del_first);
+
+/**
+ * llist_del_all - delete all entries from lock-less list
+ * @head: the head of lock-less list to delete all entries
+ *
+ * If list is empty, return NULL, otherwise, delete all entries and
+ * return the pointer to the first entry. The order of entries
+ * deleted is from the newest to the oldest added one.
+ */
+struct llist_node *llist_del_all(struct llist_head *head)
+{
+#ifndef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG
+ BUG_ON(in_nmi());
+#endif
+
+ return xchg(&head->first, NULL);
+}
+EXPORT_SYMBOL_GPL(llist_del_all);
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index 7ea2e03..a2f9da5 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -823,8 +823,8 @@
EXPORT_SYMBOL(radix_tree_prev_hole);
static unsigned int
-__lookup(struct radix_tree_node *slot, void ***results, unsigned long index,
- unsigned int max_items, unsigned long *next_index)
+__lookup(struct radix_tree_node *slot, void ***results, unsigned long *indices,
+ unsigned long index, unsigned int max_items, unsigned long *next_index)
{
unsigned int nr_found = 0;
unsigned int shift, height;
@@ -857,12 +857,16 @@
/* Bottom level: grab some items */
for (i = index & RADIX_TREE_MAP_MASK; i < RADIX_TREE_MAP_SIZE; i++) {
- index++;
if (slot->slots[i]) {
- results[nr_found++] = &(slot->slots[i]);
- if (nr_found == max_items)
+ results[nr_found] = &(slot->slots[i]);
+ if (indices)
+ indices[nr_found] = index;
+ if (++nr_found == max_items) {
+ index++;
goto out;
+ }
}
+ index++;
}
out:
*next_index = index;
@@ -918,8 +922,8 @@
if (cur_index > max_index)
break;
- slots_found = __lookup(node, (void ***)results + ret, cur_index,
- max_items - ret, &next_index);
+ slots_found = __lookup(node, (void ***)results + ret, NULL,
+ cur_index, max_items - ret, &next_index);
nr_found = 0;
for (i = 0; i < slots_found; i++) {
struct radix_tree_node *slot;
@@ -944,6 +948,7 @@
* radix_tree_gang_lookup_slot - perform multiple slot lookup on radix tree
* @root: radix tree root
* @results: where the results of the lookup are placed
+ * @indices: where their indices should be placed (but usually NULL)
* @first_index: start the lookup from this key
* @max_items: place up to this many items at *results
*
@@ -958,7 +963,8 @@
* protection, radix_tree_deref_slot may fail requiring a retry.
*/
unsigned int
-radix_tree_gang_lookup_slot(struct radix_tree_root *root, void ***results,
+radix_tree_gang_lookup_slot(struct radix_tree_root *root,
+ void ***results, unsigned long *indices,
unsigned long first_index, unsigned int max_items)
{
unsigned long max_index;
@@ -974,6 +980,8 @@
if (first_index > 0)
return 0;
results[0] = (void **)&root->rnode;
+ if (indices)
+ indices[0] = 0;
return 1;
}
node = indirect_to_ptr(node);
@@ -987,8 +995,9 @@
if (cur_index > max_index)
break;
- slots_found = __lookup(node, results + ret, cur_index,
- max_items - ret, &next_index);
+ slots_found = __lookup(node, results + ret,
+ indices ? indices + ret : NULL,
+ cur_index, max_items - ret, &next_index);
ret += slots_found;
if (next_index == 0)
break;
@@ -1194,6 +1203,98 @@
}
EXPORT_SYMBOL(radix_tree_gang_lookup_tag_slot);
+#if defined(CONFIG_SHMEM) && defined(CONFIG_SWAP)
+#include <linux/sched.h> /* for cond_resched() */
+
+/*
+ * This linear search is at present only useful to shmem_unuse_inode().
+ */
+static unsigned long __locate(struct radix_tree_node *slot, void *item,
+ unsigned long index, unsigned long *found_index)
+{
+ unsigned int shift, height;
+ unsigned long i;
+
+ height = slot->height;
+ shift = (height-1) * RADIX_TREE_MAP_SHIFT;
+
+ for ( ; height > 1; height--) {
+ i = (index >> shift) & RADIX_TREE_MAP_MASK;
+ for (;;) {
+ if (slot->slots[i] != NULL)
+ break;
+ index &= ~((1UL << shift) - 1);
+ index += 1UL << shift;
+ if (index == 0)
+ goto out; /* 32-bit wraparound */
+ i++;
+ if (i == RADIX_TREE_MAP_SIZE)
+ goto out;
+ }
+
+ shift -= RADIX_TREE_MAP_SHIFT;
+ slot = rcu_dereference_raw(slot->slots[i]);
+ if (slot == NULL)
+ goto out;
+ }
+
+ /* Bottom level: check items */
+ for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) {
+ if (slot->slots[i] == item) {
+ *found_index = index + i;
+ index = 0;
+ goto out;
+ }
+ }
+ index += RADIX_TREE_MAP_SIZE;
+out:
+ return index;
+}
+
+/**
+ * radix_tree_locate_item - search through radix tree for item
+ * @root: radix tree root
+ * @item: item to be found
+ *
+ * Returns index where item was found, or -1 if not found.
+ * Caller must hold no lock (since this time-consuming function needs
+ * to be preemptible), and must check afterwards if item is still there.
+ */
+unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item)
+{
+ struct radix_tree_node *node;
+ unsigned long max_index;
+ unsigned long cur_index = 0;
+ unsigned long found_index = -1;
+
+ do {
+ rcu_read_lock();
+ node = rcu_dereference_raw(root->rnode);
+ if (!radix_tree_is_indirect_ptr(node)) {
+ rcu_read_unlock();
+ if (node == item)
+ found_index = 0;
+ break;
+ }
+
+ node = indirect_to_ptr(node);
+ max_index = radix_tree_maxindex(node->height);
+ if (cur_index > max_index)
+ break;
+
+ cur_index = __locate(node, item, cur_index, &found_index);
+ rcu_read_unlock();
+ cond_resched();
+ } while (cur_index != 0 && cur_index <= max_index);
+
+ return found_index;
+}
+#else
+unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item)
+{
+ return -1;
+}
+#endif /* CONFIG_SHMEM && CONFIG_SWAP */
/**
* radix_tree_shrink - shrink height of a radix tree to minimal
diff --git a/mm/failslab.c b/mm/failslab.c
index 1ce58c2..0dd7b8f 100644
--- a/mm/failslab.c
+++ b/mm/failslab.c
@@ -34,23 +34,23 @@
#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
static int __init failslab_debugfs_init(void)
{
+ struct dentry *dir;
mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
- int err;
- err = init_fault_attr_dentries(&failslab.attr, "failslab");
- if (err)
- return err;
+ dir = fault_create_debugfs_attr("failslab", NULL, &failslab.attr);
+ if (IS_ERR(dir))
+ return PTR_ERR(dir);
- if (!debugfs_create_bool("ignore-gfp-wait", mode, failslab.attr.dir,
+ if (!debugfs_create_bool("ignore-gfp-wait", mode, dir,
&failslab.ignore_gfp_wait))
goto fail;
- if (!debugfs_create_bool("cache-filter", mode, failslab.attr.dir,
+ if (!debugfs_create_bool("cache-filter", mode, dir,
&failslab.cache_filter))
goto fail;
return 0;
fail:
- cleanup_fault_attr_dentries(&failslab.attr);
+ debugfs_remove_recursive(dir);
return -ENOMEM;
}
diff --git a/mm/filemap.c b/mm/filemap.c
index 867d402..645a080 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -33,7 +33,6 @@
#include <linux/cpuset.h>
#include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
#include <linux/memcontrol.h>
-#include <linux/mm_inline.h> /* for page_is_file_cache() */
#include <linux/cleancache.h>
#include "internal.h"
@@ -462,6 +461,7 @@
int error;
VM_BUG_ON(!PageLocked(page));
+ VM_BUG_ON(PageSwapBacked(page));
error = mem_cgroup_cache_charge(page, current->mm,
gfp_mask & GFP_RECLAIM_MASK);
@@ -479,8 +479,6 @@
if (likely(!error)) {
mapping->nrpages++;
__inc_zone_page_state(page, NR_FILE_PAGES);
- if (PageSwapBacked(page))
- __inc_zone_page_state(page, NR_SHMEM);
spin_unlock_irq(&mapping->tree_lock);
} else {
page->mapping = NULL;
@@ -502,22 +500,9 @@
{
int ret;
- /*
- * Splice_read and readahead add shmem/tmpfs pages into the page cache
- * before shmem_readpage has a chance to mark them as SwapBacked: they
- * need to go on the anon lru below, and mem_cgroup_cache_charge
- * (called in add_to_page_cache) needs to know where they're going too.
- */
- if (mapping_cap_swap_backed(mapping))
- SetPageSwapBacked(page);
-
ret = add_to_page_cache(page, mapping, offset, gfp_mask);
- if (ret == 0) {
- if (page_is_file_cache(page))
- lru_cache_add_file(page);
- else
- lru_cache_add_anon(page);
- }
+ if (ret == 0)
+ lru_cache_add_file(page);
return ret;
}
EXPORT_SYMBOL_GPL(add_to_page_cache_lru);
@@ -714,9 +699,16 @@
page = radix_tree_deref_slot(pagep);
if (unlikely(!page))
goto out;
- if (radix_tree_deref_retry(page))
- goto repeat;
-
+ if (radix_tree_exception(page)) {
+ if (radix_tree_deref_retry(page))
+ goto repeat;
+ /*
+ * Otherwise, shmem/tmpfs must be storing a swap entry
+ * here as an exceptional entry: so return it without
+ * attempting to raise page count.
+ */
+ goto out;
+ }
if (!page_cache_get_speculative(page))
goto repeat;
@@ -753,7 +745,7 @@
repeat:
page = find_get_page(mapping, offset);
- if (page) {
+ if (page && !radix_tree_exception(page)) {
lock_page(page);
/* Has the page been truncated? */
if (unlikely(page->mapping != mapping)) {
@@ -840,7 +832,7 @@
rcu_read_lock();
restart:
nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
- (void ***)pages, start, nr_pages);
+ (void ***)pages, NULL, start, nr_pages);
ret = 0;
for (i = 0; i < nr_found; i++) {
struct page *page;
@@ -849,13 +841,22 @@
if (unlikely(!page))
continue;
- /*
- * This can only trigger when the entry at index 0 moves out
- * of or back to the root: none yet gotten, safe to restart.
- */
- if (radix_tree_deref_retry(page)) {
- WARN_ON(start | i);
- goto restart;
+ if (radix_tree_exception(page)) {
+ if (radix_tree_deref_retry(page)) {
+ /*
+ * Transient condition which can only trigger
+ * when entry at index 0 moves out of or back
+ * to root: none yet gotten, safe to restart.
+ */
+ WARN_ON(start | i);
+ goto restart;
+ }
+ /*
+ * Otherwise, shmem/tmpfs must be storing a swap entry
+ * here as an exceptional entry: so skip over it -
+ * we only reach this from invalidate_mapping_pages().
+ */
+ continue;
}
if (!page_cache_get_speculative(page))
@@ -903,7 +904,7 @@
rcu_read_lock();
restart:
nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
- (void ***)pages, index, nr_pages);
+ (void ***)pages, NULL, index, nr_pages);
ret = 0;
for (i = 0; i < nr_found; i++) {
struct page *page;
@@ -912,12 +913,22 @@
if (unlikely(!page))
continue;
- /*
- * This can only trigger when the entry at index 0 moves out
- * of or back to the root: none yet gotten, safe to restart.
- */
- if (radix_tree_deref_retry(page))
- goto restart;
+ if (radix_tree_exception(page)) {
+ if (radix_tree_deref_retry(page)) {
+ /*
+ * Transient condition which can only trigger
+ * when entry at index 0 moves out of or back
+ * to root: none yet gotten, safe to restart.
+ */
+ goto restart;
+ }
+ /*
+ * Otherwise, shmem/tmpfs must be storing a swap entry
+ * here as an exceptional entry: so stop looking for
+ * contiguous pages.
+ */
+ break;
+ }
if (!page_cache_get_speculative(page))
goto repeat;
@@ -977,12 +988,21 @@
if (unlikely(!page))
continue;
- /*
- * This can only trigger when the entry at index 0 moves out
- * of or back to the root: none yet gotten, safe to restart.
- */
- if (radix_tree_deref_retry(page))
- goto restart;
+ if (radix_tree_exception(page)) {
+ if (radix_tree_deref_retry(page)) {
+ /*
+ * Transient condition which can only trigger
+ * when entry at index 0 moves out of or back
+ * to root: none yet gotten, safe to restart.
+ */
+ goto restart;
+ }
+ /*
+ * This function is never used on a shmem/tmpfs
+ * mapping, so a swap entry won't be found here.
+ */
+ BUG();
+ }
if (!page_cache_get_speculative(page))
goto repeat;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 5f84d23..f4ec4e7 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -35,7 +35,6 @@
#include <linux/limits.h>
#include <linux/mutex.h>
#include <linux/rbtree.h>
-#include <linux/shmem_fs.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/swapops.h>
@@ -2873,30 +2872,6 @@
return 0;
if (PageCompound(page))
return 0;
- /*
- * Corner case handling. This is called from add_to_page_cache()
- * in usual. But some FS (shmem) precharges this page before calling it
- * and call add_to_page_cache() with GFP_NOWAIT.
- *
- * For GFP_NOWAIT case, the page may be pre-charged before calling
- * add_to_page_cache(). (See shmem.c) check it here and avoid to call
- * charge twice. (It works but has to pay a bit larger cost.)
- * And when the page is SwapCache, it should take swap information
- * into account. This is under lock_page() now.
- */
- if (!(gfp_mask & __GFP_WAIT)) {
- struct page_cgroup *pc;
-
- pc = lookup_page_cgroup(page);
- if (!pc)
- return 0;
- lock_page_cgroup(pc);
- if (PageCgroupUsed(pc)) {
- unlock_page_cgroup(pc);
- return 0;
- }
- unlock_page_cgroup(pc);
- }
if (unlikely(!mm))
mm = &init_mm;
@@ -3486,31 +3461,6 @@
cgroup_release_and_wakeup_rmdir(&mem->css);
}
-/*
- * A call to try to shrink memory usage on charge failure at shmem's swapin.
- * Calling hierarchical_reclaim is not enough because we should update
- * last_oom_jiffies to prevent pagefault_out_of_memory from invoking global OOM.
- * Moreover considering hierarchy, we should reclaim from the mem_over_limit,
- * not from the memcg which this page would be charged to.
- * try_charge_swapin does all of these works properly.
- */
-int mem_cgroup_shmem_charge_fallback(struct page *page,
- struct mm_struct *mm,
- gfp_t gfp_mask)
-{
- struct mem_cgroup *mem;
- int ret;
-
- if (mem_cgroup_disabled())
- return 0;
-
- ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem);
- if (!ret)
- mem_cgroup_cancel_charge_swapin(mem); /* it does !mem check */
-
- return ret;
-}
-
#ifdef CONFIG_DEBUG_VM
static struct page_cgroup *lookup_page_cgroup_used(struct page *page)
{
@@ -5330,15 +5280,17 @@
pgoff = pte_to_pgoff(ptent);
/* page is moved even if it's not RSS of this task(page-faulted). */
- if (!mapping_cap_swap_backed(mapping)) { /* normal file */
- page = find_get_page(mapping, pgoff);
- } else { /* shmem/tmpfs file. we should take account of swap too. */
- swp_entry_t ent;
- mem_cgroup_get_shmem_target(inode, pgoff, &page, &ent);
- if (do_swap_account)
- entry->val = ent.val;
- }
+ page = find_get_page(mapping, pgoff);
+#ifdef CONFIG_SWAP
+ /* shmem/tmpfs may report page out on swap: account for that too. */
+ if (radix_tree_exceptional_entry(page)) {
+ swp_entry_t swap = radix_to_swp_entry(page);
+ if (do_swap_account)
+ *entry = swap;
+ page = find_get_page(&swapper_space, swap.val);
+ }
+#endif
return page;
}
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 740c4f5..2b43ba0 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -53,6 +53,7 @@
#include <linux/hugetlb.h>
#include <linux/memory_hotplug.h>
#include <linux/mm_inline.h>
+#include <linux/kfifo.h>
#include "internal.h"
int sysctl_memory_failure_early_kill __read_mostly = 0;
@@ -1178,6 +1179,97 @@
__memory_failure(pfn, trapno, 0);
}
+#define MEMORY_FAILURE_FIFO_ORDER 4
+#define MEMORY_FAILURE_FIFO_SIZE (1 << MEMORY_FAILURE_FIFO_ORDER)
+
+struct memory_failure_entry {
+ unsigned long pfn;
+ int trapno;
+ int flags;
+};
+
+struct memory_failure_cpu {
+ DECLARE_KFIFO(fifo, struct memory_failure_entry,
+ MEMORY_FAILURE_FIFO_SIZE);
+ spinlock_t lock;
+ struct work_struct work;
+};
+
+static DEFINE_PER_CPU(struct memory_failure_cpu, memory_failure_cpu);
+
+/**
+ * memory_failure_queue - Schedule handling memory failure of a page.
+ * @pfn: Page Number of the corrupted page
+ * @trapno: Trap number reported in the signal to user space.
+ * @flags: Flags for memory failure handling
+ *
+ * This function is called by the low level hardware error handler
+ * when it detects hardware memory corruption of a page. It schedules
+ * the recovering of error page, including dropping pages, killing
+ * processes etc.
+ *
+ * The function is primarily of use for corruptions that
+ * happen outside the current execution context (e.g. when
+ * detected by a background scrubber)
+ *
+ * Can run in IRQ context.
+ */
+void memory_failure_queue(unsigned long pfn, int trapno, int flags)
+{
+ struct memory_failure_cpu *mf_cpu;
+ unsigned long proc_flags;
+ struct memory_failure_entry entry = {
+ .pfn = pfn,
+ .trapno = trapno,
+ .flags = flags,
+ };
+
+ mf_cpu = &get_cpu_var(memory_failure_cpu);
+ spin_lock_irqsave(&mf_cpu->lock, proc_flags);
+ if (kfifo_put(&mf_cpu->fifo, &entry))
+ schedule_work_on(smp_processor_id(), &mf_cpu->work);
+ else
+ pr_err("Memory failure: buffer overflow when queuing memory failure at 0x%#lx\n",
+ pfn);
+ spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
+ put_cpu_var(memory_failure_cpu);
+}
+EXPORT_SYMBOL_GPL(memory_failure_queue);
+
+static void memory_failure_work_func(struct work_struct *work)
+{
+ struct memory_failure_cpu *mf_cpu;
+ struct memory_failure_entry entry = { 0, };
+ unsigned long proc_flags;
+ int gotten;
+
+ mf_cpu = &__get_cpu_var(memory_failure_cpu);
+ for (;;) {
+ spin_lock_irqsave(&mf_cpu->lock, proc_flags);
+ gotten = kfifo_get(&mf_cpu->fifo, &entry);
+ spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
+ if (!gotten)
+ break;
+ __memory_failure(entry.pfn, entry.trapno, entry.flags);
+ }
+}
+
+static int __init memory_failure_init(void)
+{
+ struct memory_failure_cpu *mf_cpu;
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ mf_cpu = &per_cpu(memory_failure_cpu, cpu);
+ spin_lock_init(&mf_cpu->lock);
+ INIT_KFIFO(mf_cpu->fifo);
+ INIT_WORK(&mf_cpu->work, memory_failure_work_func);
+ }
+
+ return 0;
+}
+core_initcall(memory_failure_init);
+
/**
* unpoison_memory - Unpoison a previously poisoned page
* @pfn: Page number of the to be unpoisoned page
diff --git a/mm/mincore.c b/mm/mincore.c
index a4e6b9d..636a868 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -69,12 +69,15 @@
* file will not get a swp_entry_t in its pte, but rather it is like
* any other file mapping (ie. marked !present and faulted in with
* tmpfs's .fault). So swapped out tmpfs mappings are tested here.
- *
- * However when tmpfs moves the page from pagecache and into swapcache,
- * it is still in core, but the find_get_page below won't find it.
- * No big deal, but make a note of it.
*/
page = find_get_page(mapping, pgoff);
+#ifdef CONFIG_SWAP
+ /* shmem/tmpfs may return swap: account for swapcache page too. */
+ if (radix_tree_exceptional_entry(page)) {
+ swp_entry_t swap = radix_to_swp_entry(page);
+ page = find_get_page(&swapper_space, swap.val);
+ }
+#endif
if (page) {
present = PageUptodate(page);
page_cache_release(page);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 1dbcf88..6e8ecb6 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1409,14 +1409,11 @@
{
mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
struct dentry *dir;
- int err;
- err = init_fault_attr_dentries(&fail_page_alloc.attr,
- "fail_page_alloc");
- if (err)
- return err;
-
- dir = fail_page_alloc.attr.dir;
+ dir = fault_create_debugfs_attr("fail_page_alloc", NULL,
+ &fail_page_alloc.attr);
+ if (IS_ERR(dir))
+ return PTR_ERR(dir);
if (!debugfs_create_bool("ignore-gfp-wait", mode, dir,
&fail_page_alloc.ignore_gfp_wait))
@@ -1430,7 +1427,7 @@
return 0;
fail:
- cleanup_fault_attr_dentries(&fail_page_alloc.attr);
+ debugfs_remove_recursive(dir);
return -ENOMEM;
}
diff --git a/mm/shmem.c b/mm/shmem.c
index 5cc21f8..32f6763 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -6,7 +6,8 @@
* 2000-2001 Christoph Rohland
* 2000-2001 SAP AG
* 2002 Red Hat Inc.
- * Copyright (C) 2002-2005 Hugh Dickins.
+ * Copyright (C) 2002-2011 Hugh Dickins.
+ * Copyright (C) 2011 Google Inc.
* Copyright (C) 2002-2005 VERITAS Software Corporation.
* Copyright (C) 2004 Andi Kleen, SuSE Labs
*
@@ -28,7 +29,6 @@
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/module.h>
-#include <linux/percpu_counter.h>
#include <linux/swap.h>
static struct vfsmount *shm_mnt;
@@ -51,6 +51,8 @@
#include <linux/shmem_fs.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
+#include <linux/pagevec.h>
+#include <linux/percpu_counter.h>
#include <linux/splice.h>
#include <linux/security.h>
#include <linux/swapops.h>
@@ -63,43 +65,17 @@
#include <linux/magic.h>
#include <asm/uaccess.h>
-#include <asm/div64.h>
#include <asm/pgtable.h>
-/*
- * The maximum size of a shmem/tmpfs file is limited by the maximum size of
- * its triple-indirect swap vector - see illustration at shmem_swp_entry().
- *
- * With 4kB page size, maximum file size is just over 2TB on a 32-bit kernel,
- * but one eighth of that on a 64-bit kernel. With 8kB page size, maximum
- * file size is just over 4TB on a 64-bit kernel, but 16TB on a 32-bit kernel,
- * MAX_LFS_FILESIZE being then more restrictive than swap vector layout.
- *
- * We use / and * instead of shifts in the definitions below, so that the swap
- * vector can be tested with small even values (e.g. 20) for ENTRIES_PER_PAGE.
- */
-#define ENTRIES_PER_PAGE (PAGE_CACHE_SIZE/sizeof(unsigned long))
-#define ENTRIES_PER_PAGEPAGE ((unsigned long long)ENTRIES_PER_PAGE*ENTRIES_PER_PAGE)
-
-#define SHMSWP_MAX_INDEX (SHMEM_NR_DIRECT + (ENTRIES_PER_PAGEPAGE/2) * (ENTRIES_PER_PAGE+1))
-#define SHMSWP_MAX_BYTES (SHMSWP_MAX_INDEX << PAGE_CACHE_SHIFT)
-
-#define SHMEM_MAX_BYTES min_t(unsigned long long, SHMSWP_MAX_BYTES, MAX_LFS_FILESIZE)
-#define SHMEM_MAX_INDEX ((unsigned long)((SHMEM_MAX_BYTES+1) >> PAGE_CACHE_SHIFT))
-
#define BLOCKS_PER_PAGE (PAGE_CACHE_SIZE/512)
#define VM_ACCT(size) (PAGE_CACHE_ALIGN(size) >> PAGE_SHIFT)
-/* info->flags needs VM_flags to handle pagein/truncate races efficiently */
-#define SHMEM_PAGEIN VM_READ
-#define SHMEM_TRUNCATE VM_WRITE
-
-/* Definition to limit shmem_truncate's steps between cond_rescheds */
-#define LATENCY_LIMIT 64
-
/* Pretend that each entry is of this size in directory's i_size */
#define BOGO_DIRENT_SIZE 20
+/* Symlink up to this size is kmalloc'ed instead of using a swappable page */
+#define SHORT_SYMLINK_LEN 128
+
struct shmem_xattr {
struct list_head list; /* anchored by shmem_inode_info->xattr_list */
char *name; /* xattr name */
@@ -107,7 +83,7 @@
char value[0];
};
-/* Flag allocation requirements to shmem_getpage and shmem_swp_alloc */
+/* Flag allocation requirements to shmem_getpage */
enum sgp_type {
SGP_READ, /* don't exceed i_size, don't allocate page */
SGP_CACHE, /* don't exceed i_size, may allocate page */
@@ -137,56 +113,6 @@
mapping_gfp_mask(inode->i_mapping), fault_type);
}
-static inline struct page *shmem_dir_alloc(gfp_t gfp_mask)
-{
- /*
- * The above definition of ENTRIES_PER_PAGE, and the use of
- * BLOCKS_PER_PAGE on indirect pages, assume PAGE_CACHE_SIZE:
- * might be reconsidered if it ever diverges from PAGE_SIZE.
- *
- * Mobility flags are masked out as swap vectors cannot move
- */
- return alloc_pages((gfp_mask & ~GFP_MOVABLE_MASK) | __GFP_ZERO,
- PAGE_CACHE_SHIFT-PAGE_SHIFT);
-}
-
-static inline void shmem_dir_free(struct page *page)
-{
- __free_pages(page, PAGE_CACHE_SHIFT-PAGE_SHIFT);
-}
-
-static struct page **shmem_dir_map(struct page *page)
-{
- return (struct page **)kmap_atomic(page, KM_USER0);
-}
-
-static inline void shmem_dir_unmap(struct page **dir)
-{
- kunmap_atomic(dir, KM_USER0);
-}
-
-static swp_entry_t *shmem_swp_map(struct page *page)
-{
- return (swp_entry_t *)kmap_atomic(page, KM_USER1);
-}
-
-static inline void shmem_swp_balance_unmap(void)
-{
- /*
- * When passing a pointer to an i_direct entry, to code which
- * also handles indirect entries and so will shmem_swp_unmap,
- * we must arrange for the preempt count to remain in balance.
- * What kmap_atomic of a lowmem page does depends on config
- * and architecture, so pretend to kmap_atomic some lowmem page.
- */
- (void) kmap_atomic(ZERO_PAGE(0), KM_USER1);
-}
-
-static inline void shmem_swp_unmap(swp_entry_t *entry)
-{
- kunmap_atomic(entry, KM_USER1);
-}
-
static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
{
return sb->s_fs_info;
@@ -244,15 +170,6 @@
static LIST_HEAD(shmem_swaplist);
static DEFINE_MUTEX(shmem_swaplist_mutex);
-static void shmem_free_blocks(struct inode *inode, long pages)
-{
- struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
- if (sbinfo->max_blocks) {
- percpu_counter_add(&sbinfo->used_blocks, -pages);
- inode->i_blocks -= pages*BLOCKS_PER_PAGE;
- }
-}
-
static int shmem_reserve_inode(struct super_block *sb)
{
struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
@@ -279,7 +196,7 @@
}
/**
- * shmem_recalc_inode - recalculate the size of an inode
+ * shmem_recalc_inode - recalculate the block usage of an inode
* @inode: inode to recalc
*
* We have to calculate the free blocks since the mm can drop
@@ -297,474 +214,297 @@
freed = info->alloced - info->swapped - inode->i_mapping->nrpages;
if (freed > 0) {
+ struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
+ if (sbinfo->max_blocks)
+ percpu_counter_add(&sbinfo->used_blocks, -freed);
info->alloced -= freed;
+ inode->i_blocks -= freed * BLOCKS_PER_PAGE;
shmem_unacct_blocks(info->flags, freed);
- shmem_free_blocks(inode, freed);
}
}
-/**
- * shmem_swp_entry - find the swap vector position in the info structure
- * @info: info structure for the inode
- * @index: index of the page to find
- * @page: optional page to add to the structure. Has to be preset to
- * all zeros
- *
- * If there is no space allocated yet it will return NULL when
- * page is NULL, else it will use the page for the needed block,
- * setting it to NULL on return to indicate that it has been used.
- *
- * The swap vector is organized the following way:
- *
- * There are SHMEM_NR_DIRECT entries directly stored in the
- * shmem_inode_info structure. So small files do not need an addional
- * allocation.
- *
- * For pages with index > SHMEM_NR_DIRECT there is the pointer
- * i_indirect which points to a page which holds in the first half
- * doubly indirect blocks, in the second half triple indirect blocks:
- *
- * For an artificial ENTRIES_PER_PAGE = 4 this would lead to the
- * following layout (for SHMEM_NR_DIRECT == 16):
- *
- * i_indirect -> dir --> 16-19
- * | +-> 20-23
- * |
- * +-->dir2 --> 24-27
- * | +-> 28-31
- * | +-> 32-35
- * | +-> 36-39
- * |
- * +-->dir3 --> 40-43
- * +-> 44-47
- * +-> 48-51
- * +-> 52-55
+/*
+ * Replace item expected in radix tree by a new item, while holding tree lock.
*/
-static swp_entry_t *shmem_swp_entry(struct shmem_inode_info *info, unsigned long index, struct page **page)
+static int shmem_radix_tree_replace(struct address_space *mapping,
+ pgoff_t index, void *expected, void *replacement)
{
- unsigned long offset;
- struct page **dir;
- struct page *subdir;
+ void **pslot;
+ void *item = NULL;
- if (index < SHMEM_NR_DIRECT) {
- shmem_swp_balance_unmap();
- return info->i_direct+index;
- }
- if (!info->i_indirect) {
- if (page) {
- info->i_indirect = *page;
- *page = NULL;
+ VM_BUG_ON(!expected);
+ pslot = radix_tree_lookup_slot(&mapping->page_tree, index);
+ if (pslot)
+ item = radix_tree_deref_slot_protected(pslot,
+ &mapping->tree_lock);
+ if (item != expected)
+ return -ENOENT;
+ if (replacement)
+ radix_tree_replace_slot(pslot, replacement);
+ else
+ radix_tree_delete(&mapping->page_tree, index);
+ return 0;
+}
+
+/*
+ * Like add_to_page_cache_locked, but error if expected item has gone.
+ */
+static int shmem_add_to_page_cache(struct page *page,
+ struct address_space *mapping,
+ pgoff_t index, gfp_t gfp, void *expected)
+{
+ int error = 0;
+
+ VM_BUG_ON(!PageLocked(page));
+ VM_BUG_ON(!PageSwapBacked(page));
+
+ if (!expected)
+ error = radix_tree_preload(gfp & GFP_RECLAIM_MASK);
+ if (!error) {
+ page_cache_get(page);
+ page->mapping = mapping;
+ page->index = index;
+
+ spin_lock_irq(&mapping->tree_lock);
+ if (!expected)
+ error = radix_tree_insert(&mapping->page_tree,
+ index, page);
+ else
+ error = shmem_radix_tree_replace(mapping, index,
+ expected, page);
+ if (!error) {
+ mapping->nrpages++;
+ __inc_zone_page_state(page, NR_FILE_PAGES);
+ __inc_zone_page_state(page, NR_SHMEM);
+ spin_unlock_irq(&mapping->tree_lock);
+ } else {
+ page->mapping = NULL;
+ spin_unlock_irq(&mapping->tree_lock);
+ page_cache_release(page);
}
- return NULL; /* need another page */
+ if (!expected)
+ radix_tree_preload_end();
}
+ if (error)
+ mem_cgroup_uncharge_cache_page(page);
+ return error;
+}
- index -= SHMEM_NR_DIRECT;
- offset = index % ENTRIES_PER_PAGE;
- index /= ENTRIES_PER_PAGE;
- dir = shmem_dir_map(info->i_indirect);
+/*
+ * Like delete_from_page_cache, but substitutes swap for page.
+ */
+static void shmem_delete_from_page_cache(struct page *page, void *radswap)
+{
+ struct address_space *mapping = page->mapping;
+ int error;
- if (index >= ENTRIES_PER_PAGE/2) {
- index -= ENTRIES_PER_PAGE/2;
- dir += ENTRIES_PER_PAGE/2 + index/ENTRIES_PER_PAGE;
- index %= ENTRIES_PER_PAGE;
- subdir = *dir;
- if (!subdir) {
- if (page) {
- *dir = *page;
- *page = NULL;
+ spin_lock_irq(&mapping->tree_lock);
+ error = shmem_radix_tree_replace(mapping, page->index, page, radswap);
+ page->mapping = NULL;
+ mapping->nrpages--;
+ __dec_zone_page_state(page, NR_FILE_PAGES);
+ __dec_zone_page_state(page, NR_SHMEM);
+ spin_unlock_irq(&mapping->tree_lock);
+ page_cache_release(page);
+ BUG_ON(error);
+}
+
+/*
+ * Like find_get_pages, but collecting swap entries as well as pages.
+ */
+static unsigned shmem_find_get_pages_and_swap(struct address_space *mapping,
+ pgoff_t start, unsigned int nr_pages,
+ struct page **pages, pgoff_t *indices)
+{
+ unsigned int i;
+ unsigned int ret;
+ unsigned int nr_found;
+
+ rcu_read_lock();
+restart:
+ nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
+ (void ***)pages, indices, start, nr_pages);
+ ret = 0;
+ for (i = 0; i < nr_found; i++) {
+ struct page *page;
+repeat:
+ page = radix_tree_deref_slot((void **)pages[i]);
+ if (unlikely(!page))
+ continue;
+ if (radix_tree_exception(page)) {
+ if (radix_tree_deref_retry(page))
+ goto restart;
+ /*
+ * Otherwise, we must be storing a swap entry
+ * here as an exceptional entry: so return it
+ * without attempting to raise page count.
+ */
+ goto export;
+ }
+ if (!page_cache_get_speculative(page))
+ goto repeat;
+
+ /* Has the page moved? */
+ if (unlikely(page != *((void **)pages[i]))) {
+ page_cache_release(page);
+ goto repeat;
+ }
+export:
+ indices[ret] = indices[i];
+ pages[ret] = page;
+ ret++;
+ }
+ if (unlikely(!ret && nr_found))
+ goto restart;
+ rcu_read_unlock();
+ return ret;
+}
+
+/*
+ * Remove swap entry from radix tree, free the swap and its page cache.
+ */
+static int shmem_free_swap(struct address_space *mapping,
+ pgoff_t index, void *radswap)
+{
+ int error;
+
+ spin_lock_irq(&mapping->tree_lock);
+ error = shmem_radix_tree_replace(mapping, index, radswap, NULL);
+ spin_unlock_irq(&mapping->tree_lock);
+ if (!error)
+ free_swap_and_cache(radix_to_swp_entry(radswap));
+ return error;
+}
+
+/*
+ * Pagevec may contain swap entries, so shuffle up pages before releasing.
+ */
+static void shmem_pagevec_release(struct pagevec *pvec)
+{
+ int i, j;
+
+ for (i = 0, j = 0; i < pagevec_count(pvec); i++) {
+ struct page *page = pvec->pages[i];
+ if (!radix_tree_exceptional_entry(page))
+ pvec->pages[j++] = page;
+ }
+ pvec->nr = j;
+ pagevec_release(pvec);
+}
+
+/*
+ * Remove range of pages and swap entries from radix tree, and free them.
+ */
+void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
+{
+ struct address_space *mapping = inode->i_mapping;
+ struct shmem_inode_info *info = SHMEM_I(inode);
+ pgoff_t start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
+ pgoff_t end = (lend >> PAGE_CACHE_SHIFT);
+ struct pagevec pvec;
+ pgoff_t indices[PAGEVEC_SIZE];
+ long nr_swaps_freed = 0;
+ pgoff_t index;
+ int i;
+
+ BUG_ON((lend & (PAGE_CACHE_SIZE - 1)) != (PAGE_CACHE_SIZE - 1));
+
+ pagevec_init(&pvec, 0);
+ index = start;
+ while (index <= end) {
+ pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
+ min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
+ pvec.pages, indices);
+ if (!pvec.nr)
+ break;
+ mem_cgroup_uncharge_start();
+ for (i = 0; i < pagevec_count(&pvec); i++) {
+ struct page *page = pvec.pages[i];
+
+ index = indices[i];
+ if (index > end)
+ break;
+
+ if (radix_tree_exceptional_entry(page)) {
+ nr_swaps_freed += !shmem_free_swap(mapping,
+ index, page);
+ continue;
}
- shmem_dir_unmap(dir);
- return NULL; /* need another page */
+
+ if (!trylock_page(page))
+ continue;
+ if (page->mapping == mapping) {
+ VM_BUG_ON(PageWriteback(page));
+ truncate_inode_page(mapping, page);
+ }
+ unlock_page(page);
}
- shmem_dir_unmap(dir);
- dir = shmem_dir_map(subdir);
+ shmem_pagevec_release(&pvec);
+ mem_cgroup_uncharge_end();
+ cond_resched();
+ index++;
}
- dir += index;
- subdir = *dir;
- if (!subdir) {
- if (!page || !(subdir = *page)) {
- shmem_dir_unmap(dir);
- return NULL; /* need a page */
+ if (partial) {
+ struct page *page = NULL;
+ shmem_getpage(inode, start - 1, &page, SGP_READ, NULL);
+ if (page) {
+ zero_user_segment(page, partial, PAGE_CACHE_SIZE);
+ set_page_dirty(page);
+ unlock_page(page);
+ page_cache_release(page);
}
- *dir = subdir;
- *page = NULL;
}
- shmem_dir_unmap(dir);
- return shmem_swp_map(subdir) + offset;
-}
-static void shmem_swp_set(struct shmem_inode_info *info, swp_entry_t *entry, unsigned long value)
-{
- long incdec = value? 1: -1;
-
- entry->val = value;
- info->swapped += incdec;
- if ((unsigned long)(entry - info->i_direct) >= SHMEM_NR_DIRECT) {
- struct page *page = kmap_atomic_to_page(entry);
- set_page_private(page, page_private(page) + incdec);
- }
-}
-
-/**
- * shmem_swp_alloc - get the position of the swap entry for the page.
- * @info: info structure for the inode
- * @index: index of the page to find
- * @sgp: check and recheck i_size? skip allocation?
- * @gfp: gfp mask to use for any page allocation
- *
- * If the entry does not exist, allocate it.
- */
-static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info,
- unsigned long index, enum sgp_type sgp, gfp_t gfp)
-{
- struct inode *inode = &info->vfs_inode;
- struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
- struct page *page = NULL;
- swp_entry_t *entry;
-
- if (sgp != SGP_WRITE &&
- ((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode))
- return ERR_PTR(-EINVAL);
-
- while (!(entry = shmem_swp_entry(info, index, &page))) {
- if (sgp == SGP_READ)
- return shmem_swp_map(ZERO_PAGE(0));
- /*
- * Test used_blocks against 1 less max_blocks, since we have 1 data
- * page (and perhaps indirect index pages) yet to allocate:
- * a waste to allocate index if we cannot allocate data.
- */
- if (sbinfo->max_blocks) {
- if (percpu_counter_compare(&sbinfo->used_blocks,
- sbinfo->max_blocks - 1) >= 0)
- return ERR_PTR(-ENOSPC);
- percpu_counter_inc(&sbinfo->used_blocks);
- inode->i_blocks += BLOCKS_PER_PAGE;
+ index = start;
+ for ( ; ; ) {
+ cond_resched();
+ pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
+ min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
+ pvec.pages, indices);
+ if (!pvec.nr) {
+ if (index == start)
+ break;
+ index = start;
+ continue;
}
-
- spin_unlock(&info->lock);
- page = shmem_dir_alloc(gfp);
- spin_lock(&info->lock);
-
- if (!page) {
- shmem_free_blocks(inode, 1);
- return ERR_PTR(-ENOMEM);
- }
- if (sgp != SGP_WRITE &&
- ((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
- entry = ERR_PTR(-EINVAL);
+ if (index == start && indices[0] > end) {
+ shmem_pagevec_release(&pvec);
break;
}
- if (info->next_index <= index)
- info->next_index = index + 1;
- }
- if (page) {
- /* another task gave its page, or truncated the file */
- shmem_free_blocks(inode, 1);
- shmem_dir_free(page);
- }
- if (info->next_index <= index && !IS_ERR(entry))
- info->next_index = index + 1;
- return entry;
-}
+ mem_cgroup_uncharge_start();
+ for (i = 0; i < pagevec_count(&pvec); i++) {
+ struct page *page = pvec.pages[i];
-/**
- * shmem_free_swp - free some swap entries in a directory
- * @dir: pointer to the directory
- * @edir: pointer after last entry of the directory
- * @punch_lock: pointer to spinlock when needed for the holepunch case
- */
-static int shmem_free_swp(swp_entry_t *dir, swp_entry_t *edir,
- spinlock_t *punch_lock)
-{
- spinlock_t *punch_unlock = NULL;
- swp_entry_t *ptr;
- int freed = 0;
+ index = indices[i];
+ if (index > end)
+ break;
- for (ptr = dir; ptr < edir; ptr++) {
- if (ptr->val) {
- if (unlikely(punch_lock)) {
- punch_unlock = punch_lock;
- punch_lock = NULL;
- spin_lock(punch_unlock);
- if (!ptr->val)
- continue;
+ if (radix_tree_exceptional_entry(page)) {
+ nr_swaps_freed += !shmem_free_swap(mapping,
+ index, page);
+ continue;
}
- free_swap_and_cache(*ptr);
- *ptr = (swp_entry_t){0};
- freed++;
- }
- }
- if (punch_unlock)
- spin_unlock(punch_unlock);
- return freed;
-}
-static int shmem_map_and_free_swp(struct page *subdir, int offset,
- int limit, struct page ***dir, spinlock_t *punch_lock)
-{
- swp_entry_t *ptr;
- int freed = 0;
-
- ptr = shmem_swp_map(subdir);
- for (; offset < limit; offset += LATENCY_LIMIT) {
- int size = limit - offset;
- if (size > LATENCY_LIMIT)
- size = LATENCY_LIMIT;
- freed += shmem_free_swp(ptr+offset, ptr+offset+size,
- punch_lock);
- if (need_resched()) {
- shmem_swp_unmap(ptr);
- if (*dir) {
- shmem_dir_unmap(*dir);
- *dir = NULL;
+ lock_page(page);
+ if (page->mapping == mapping) {
+ VM_BUG_ON(PageWriteback(page));
+ truncate_inode_page(mapping, page);
}
- cond_resched();
- ptr = shmem_swp_map(subdir);
+ unlock_page(page);
}
- }
- shmem_swp_unmap(ptr);
- return freed;
-}
-
-static void shmem_free_pages(struct list_head *next)
-{
- struct page *page;
- int freed = 0;
-
- do {
- page = container_of(next, struct page, lru);
- next = next->next;
- shmem_dir_free(page);
- freed++;
- if (freed >= LATENCY_LIMIT) {
- cond_resched();
- freed = 0;
- }
- } while (next);
-}
-
-void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end)
-{
- struct shmem_inode_info *info = SHMEM_I(inode);
- unsigned long idx;
- unsigned long size;
- unsigned long limit;
- unsigned long stage;
- unsigned long diroff;
- struct page **dir;
- struct page *topdir;
- struct page *middir;
- struct page *subdir;
- swp_entry_t *ptr;
- LIST_HEAD(pages_to_free);
- long nr_pages_to_free = 0;
- long nr_swaps_freed = 0;
- int offset;
- int freed;
- int punch_hole;
- spinlock_t *needs_lock;
- spinlock_t *punch_lock;
- unsigned long upper_limit;
-
- truncate_inode_pages_range(inode->i_mapping, start, end);
-
- inode->i_ctime = inode->i_mtime = CURRENT_TIME;
- idx = (start + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
- if (idx >= info->next_index)
- return;
-
- spin_lock(&info->lock);
- info->flags |= SHMEM_TRUNCATE;
- if (likely(end == (loff_t) -1)) {
- limit = info->next_index;
- upper_limit = SHMEM_MAX_INDEX;
- info->next_index = idx;
- needs_lock = NULL;
- punch_hole = 0;
- } else {
- if (end + 1 >= inode->i_size) { /* we may free a little more */
- limit = (inode->i_size + PAGE_CACHE_SIZE - 1) >>
- PAGE_CACHE_SHIFT;
- upper_limit = SHMEM_MAX_INDEX;
- } else {
- limit = (end + 1) >> PAGE_CACHE_SHIFT;
- upper_limit = limit;
- }
- needs_lock = &info->lock;
- punch_hole = 1;
- }
-
- topdir = info->i_indirect;
- if (topdir && idx <= SHMEM_NR_DIRECT && !punch_hole) {
- info->i_indirect = NULL;
- nr_pages_to_free++;
- list_add(&topdir->lru, &pages_to_free);
- }
- spin_unlock(&info->lock);
-
- if (info->swapped && idx < SHMEM_NR_DIRECT) {
- ptr = info->i_direct;
- size = limit;
- if (size > SHMEM_NR_DIRECT)
- size = SHMEM_NR_DIRECT;
- nr_swaps_freed = shmem_free_swp(ptr+idx, ptr+size, needs_lock);
- }
-
- /*
- * If there are no indirect blocks or we are punching a hole
- * below indirect blocks, nothing to be done.
- */
- if (!topdir || limit <= SHMEM_NR_DIRECT)
- goto done2;
-
- /*
- * The truncation case has already dropped info->lock, and we're safe
- * because i_size and next_index have already been lowered, preventing
- * access beyond. But in the punch_hole case, we still need to take
- * the lock when updating the swap directory, because there might be
- * racing accesses by shmem_getpage(SGP_CACHE), shmem_unuse_inode or
- * shmem_writepage. However, whenever we find we can remove a whole
- * directory page (not at the misaligned start or end of the range),
- * we first NULLify its pointer in the level above, and then have no
- * need to take the lock when updating its contents: needs_lock and
- * punch_lock (either pointing to info->lock or NULL) manage this.
- */
-
- upper_limit -= SHMEM_NR_DIRECT;
- limit -= SHMEM_NR_DIRECT;
- idx = (idx > SHMEM_NR_DIRECT)? (idx - SHMEM_NR_DIRECT): 0;
- offset = idx % ENTRIES_PER_PAGE;
- idx -= offset;
-
- dir = shmem_dir_map(topdir);
- stage = ENTRIES_PER_PAGEPAGE/2;
- if (idx < ENTRIES_PER_PAGEPAGE/2) {
- middir = topdir;
- diroff = idx/ENTRIES_PER_PAGE;
- } else {
- dir += ENTRIES_PER_PAGE/2;
- dir += (idx - ENTRIES_PER_PAGEPAGE/2)/ENTRIES_PER_PAGEPAGE;
- while (stage <= idx)
- stage += ENTRIES_PER_PAGEPAGE;
- middir = *dir;
- if (*dir) {
- diroff = ((idx - ENTRIES_PER_PAGEPAGE/2) %
- ENTRIES_PER_PAGEPAGE) / ENTRIES_PER_PAGE;
- if (!diroff && !offset && upper_limit >= stage) {
- if (needs_lock) {
- spin_lock(needs_lock);
- *dir = NULL;
- spin_unlock(needs_lock);
- needs_lock = NULL;
- } else
- *dir = NULL;
- nr_pages_to_free++;
- list_add(&middir->lru, &pages_to_free);
- }
- shmem_dir_unmap(dir);
- dir = shmem_dir_map(middir);
- } else {
- diroff = 0;
- offset = 0;
- idx = stage;
- }
- }
-
- for (; idx < limit; idx += ENTRIES_PER_PAGE, diroff++) {
- if (unlikely(idx == stage)) {
- shmem_dir_unmap(dir);
- dir = shmem_dir_map(topdir) +
- ENTRIES_PER_PAGE/2 + idx/ENTRIES_PER_PAGEPAGE;
- while (!*dir) {
- dir++;
- idx += ENTRIES_PER_PAGEPAGE;
- if (idx >= limit)
- goto done1;
- }
- stage = idx + ENTRIES_PER_PAGEPAGE;
- middir = *dir;
- if (punch_hole)
- needs_lock = &info->lock;
- if (upper_limit >= stage) {
- if (needs_lock) {
- spin_lock(needs_lock);
- *dir = NULL;
- spin_unlock(needs_lock);
- needs_lock = NULL;
- } else
- *dir = NULL;
- nr_pages_to_free++;
- list_add(&middir->lru, &pages_to_free);
- }
- shmem_dir_unmap(dir);
- cond_resched();
- dir = shmem_dir_map(middir);
- diroff = 0;
- }
- punch_lock = needs_lock;
- subdir = dir[diroff];
- if (subdir && !offset && upper_limit-idx >= ENTRIES_PER_PAGE) {
- if (needs_lock) {
- spin_lock(needs_lock);
- dir[diroff] = NULL;
- spin_unlock(needs_lock);
- punch_lock = NULL;
- } else
- dir[diroff] = NULL;
- nr_pages_to_free++;
- list_add(&subdir->lru, &pages_to_free);
- }
- if (subdir && page_private(subdir) /* has swap entries */) {
- size = limit - idx;
- if (size > ENTRIES_PER_PAGE)
- size = ENTRIES_PER_PAGE;
- freed = shmem_map_and_free_swp(subdir,
- offset, size, &dir, punch_lock);
- if (!dir)
- dir = shmem_dir_map(middir);
- nr_swaps_freed += freed;
- if (offset || punch_lock) {
- spin_lock(&info->lock);
- set_page_private(subdir,
- page_private(subdir) - freed);
- spin_unlock(&info->lock);
- } else
- BUG_ON(page_private(subdir) != freed);
- }
- offset = 0;
- }
-done1:
- shmem_dir_unmap(dir);
-done2:
- if (inode->i_mapping->nrpages && (info->flags & SHMEM_PAGEIN)) {
- /*
- * Call truncate_inode_pages again: racing shmem_unuse_inode
- * may have swizzled a page in from swap since
- * truncate_pagecache or generic_delete_inode did it, before we
- * lowered next_index. Also, though shmem_getpage checks
- * i_size before adding to cache, no recheck after: so fix the
- * narrow window there too.
- */
- truncate_inode_pages_range(inode->i_mapping, start, end);
+ shmem_pagevec_release(&pvec);
+ mem_cgroup_uncharge_end();
+ index++;
}
spin_lock(&info->lock);
- info->flags &= ~SHMEM_TRUNCATE;
info->swapped -= nr_swaps_freed;
- if (nr_pages_to_free)
- shmem_free_blocks(inode, nr_pages_to_free);
shmem_recalc_inode(inode);
spin_unlock(&info->lock);
- /*
- * Empty swap vector directory pages to be freed?
- */
- if (!list_empty(&pages_to_free)) {
- pages_to_free.prev->next = NULL;
- shmem_free_pages(pages_to_free.next);
- }
+ inode->i_ctime = inode->i_mtime = CURRENT_TIME;
}
EXPORT_SYMBOL_GPL(shmem_truncate_range);
@@ -780,37 +520,7 @@
if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
loff_t oldsize = inode->i_size;
loff_t newsize = attr->ia_size;
- struct page *page = NULL;
- if (newsize < oldsize) {
- /*
- * If truncating down to a partial page, then
- * if that page is already allocated, hold it
- * in memory until the truncation is over, so
- * truncate_partial_page cannot miss it were
- * it assigned to swap.
- */
- if (newsize & (PAGE_CACHE_SIZE-1)) {
- (void) shmem_getpage(inode,
- newsize >> PAGE_CACHE_SHIFT,
- &page, SGP_READ, NULL);
- if (page)
- unlock_page(page);
- }
- /*
- * Reset SHMEM_PAGEIN flag so that shmem_truncate can
- * detect if any pages might have been added to cache
- * after truncate_inode_pages. But we needn't bother
- * if it's being fully truncated to zero-length: the
- * nrpages check is efficient enough in that case.
- */
- if (newsize) {
- struct shmem_inode_info *info = SHMEM_I(inode);
- spin_lock(&info->lock);
- info->flags &= ~SHMEM_PAGEIN;
- spin_unlock(&info->lock);
- }
- }
if (newsize != oldsize) {
i_size_write(inode, newsize);
inode->i_ctime = inode->i_mtime = CURRENT_TIME;
@@ -822,8 +532,6 @@
/* unmap again to remove racily COWed private pages */
unmap_mapping_range(inode->i_mapping, holebegin, 0, 1);
}
- if (page)
- page_cache_release(page);
}
setattr_copy(inode, attr);
@@ -848,7 +556,8 @@
list_del_init(&info->swaplist);
mutex_unlock(&shmem_swaplist_mutex);
}
- }
+ } else
+ kfree(info->symlink);
list_for_each_entry_safe(xattr, nxattr, &info->xattr_list, list) {
kfree(xattr->name);
@@ -859,106 +568,27 @@
end_writeback(inode);
}
-static inline int shmem_find_swp(swp_entry_t entry, swp_entry_t *dir, swp_entry_t *edir)
+/*
+ * If swap found in inode, free it and move page from swapcache to filecache.
+ */
+static int shmem_unuse_inode(struct shmem_inode_info *info,
+ swp_entry_t swap, struct page *page)
{
- swp_entry_t *ptr;
-
- for (ptr = dir; ptr < edir; ptr++) {
- if (ptr->val == entry.val)
- return ptr - dir;
- }
- return -1;
-}
-
-static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, struct page *page)
-{
- struct address_space *mapping;
- unsigned long idx;
- unsigned long size;
- unsigned long limit;
- unsigned long stage;
- struct page **dir;
- struct page *subdir;
- swp_entry_t *ptr;
- int offset;
+ struct address_space *mapping = info->vfs_inode.i_mapping;
+ void *radswap;
+ pgoff_t index;
int error;
- idx = 0;
- ptr = info->i_direct;
- spin_lock(&info->lock);
- if (!info->swapped) {
- list_del_init(&info->swaplist);
- goto lost2;
- }
- limit = info->next_index;
- size = limit;
- if (size > SHMEM_NR_DIRECT)
- size = SHMEM_NR_DIRECT;
- offset = shmem_find_swp(entry, ptr, ptr+size);
- if (offset >= 0) {
- shmem_swp_balance_unmap();
- goto found;
- }
- if (!info->i_indirect)
- goto lost2;
-
- dir = shmem_dir_map(info->i_indirect);
- stage = SHMEM_NR_DIRECT + ENTRIES_PER_PAGEPAGE/2;
-
- for (idx = SHMEM_NR_DIRECT; idx < limit; idx += ENTRIES_PER_PAGE, dir++) {
- if (unlikely(idx == stage)) {
- shmem_dir_unmap(dir-1);
- if (cond_resched_lock(&info->lock)) {
- /* check it has not been truncated */
- if (limit > info->next_index) {
- limit = info->next_index;
- if (idx >= limit)
- goto lost2;
- }
- }
- dir = shmem_dir_map(info->i_indirect) +
- ENTRIES_PER_PAGE/2 + idx/ENTRIES_PER_PAGEPAGE;
- while (!*dir) {
- dir++;
- idx += ENTRIES_PER_PAGEPAGE;
- if (idx >= limit)
- goto lost1;
- }
- stage = idx + ENTRIES_PER_PAGEPAGE;
- subdir = *dir;
- shmem_dir_unmap(dir);
- dir = shmem_dir_map(subdir);
- }
- subdir = *dir;
- if (subdir && page_private(subdir)) {
- ptr = shmem_swp_map(subdir);
- size = limit - idx;
- if (size > ENTRIES_PER_PAGE)
- size = ENTRIES_PER_PAGE;
- offset = shmem_find_swp(entry, ptr, ptr+size);
- shmem_swp_unmap(ptr);
- if (offset >= 0) {
- shmem_dir_unmap(dir);
- ptr = shmem_swp_map(subdir);
- goto found;
- }
- }
- }
-lost1:
- shmem_dir_unmap(dir-1);
-lost2:
- spin_unlock(&info->lock);
- return 0;
-found:
- idx += offset;
- ptr += offset;
+ radswap = swp_to_radix_entry(swap);
+ index = radix_tree_locate_item(&mapping->page_tree, radswap);
+ if (index == -1)
+ return 0;
/*
* Move _head_ to start search for next from here.
* But be careful: shmem_evict_inode checks list_empty without taking
* mutex, and there's an instant in list_move_tail when info->swaplist
- * would appear empty, if it were the only one on shmem_swaplist. We
- * could avoid doing it if inode NULL; or use this minor optimization.
+ * would appear empty, if it were the only one on shmem_swaplist.
*/
if (shmem_swaplist.next != &info->swaplist)
list_move_tail(&shmem_swaplist, &info->swaplist);
@@ -968,29 +598,34 @@
* but also to hold up shmem_evict_inode(): so inode cannot be freed
* beneath us (pagelock doesn't help until the page is in pagecache).
*/
- mapping = info->vfs_inode.i_mapping;
- error = add_to_page_cache_locked(page, mapping, idx, GFP_NOWAIT);
+ error = shmem_add_to_page_cache(page, mapping, index,
+ GFP_NOWAIT, radswap);
/* which does mem_cgroup_uncharge_cache_page on error */
if (error != -ENOMEM) {
+ /*
+ * Truncation and eviction use free_swap_and_cache(), which
+ * only does trylock page: if we raced, best clean up here.
+ */
delete_from_swap_cache(page);
set_page_dirty(page);
- info->flags |= SHMEM_PAGEIN;
- shmem_swp_set(info, ptr, 0);
- swap_free(entry);
+ if (!error) {
+ spin_lock(&info->lock);
+ info->swapped--;
+ spin_unlock(&info->lock);
+ swap_free(swap);
+ }
error = 1; /* not an error, but entry was found */
}
- shmem_swp_unmap(ptr);
- spin_unlock(&info->lock);
return error;
}
/*
- * shmem_unuse() search for an eventually swapped out shmem page.
+ * Search through swapped inodes to find and replace swap by page.
*/
-int shmem_unuse(swp_entry_t entry, struct page *page)
+int shmem_unuse(swp_entry_t swap, struct page *page)
{
- struct list_head *p, *next;
+ struct list_head *this, *next;
struct shmem_inode_info *info;
int found = 0;
int error;
@@ -999,32 +634,25 @@
* Charge page using GFP_KERNEL while we can wait, before taking
* the shmem_swaplist_mutex which might hold up shmem_writepage().
* Charged back to the user (not to caller) when swap account is used.
- * add_to_page_cache() will be called with GFP_NOWAIT.
*/
error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL);
if (error)
goto out;
- /*
- * Try to preload while we can wait, to not make a habit of
- * draining atomic reserves; but don't latch on to this cpu,
- * it's okay if sometimes we get rescheduled after this.
- */
- error = radix_tree_preload(GFP_KERNEL);
- if (error)
- goto uncharge;
- radix_tree_preload_end();
+ /* No radix_tree_preload: swap entry keeps a place for page in tree */
mutex_lock(&shmem_swaplist_mutex);
- list_for_each_safe(p, next, &shmem_swaplist) {
- info = list_entry(p, struct shmem_inode_info, swaplist);
- found = shmem_unuse_inode(info, entry, page);
+ list_for_each_safe(this, next, &shmem_swaplist) {
+ info = list_entry(this, struct shmem_inode_info, swaplist);
+ if (info->swapped)
+ found = shmem_unuse_inode(info, swap, page);
+ else
+ list_del_init(&info->swaplist);
cond_resched();
if (found)
break;
}
mutex_unlock(&shmem_swaplist_mutex);
-uncharge:
if (!found)
mem_cgroup_uncharge_cache_page(page);
if (found < 0)
@@ -1041,10 +669,10 @@
static int shmem_writepage(struct page *page, struct writeback_control *wbc)
{
struct shmem_inode_info *info;
- swp_entry_t *entry, swap;
struct address_space *mapping;
- unsigned long index;
struct inode *inode;
+ swp_entry_t swap;
+ pgoff_t index;
BUG_ON(!PageLocked(page));
mapping = page->mapping;
@@ -1073,50 +701,32 @@
/*
* Add inode to shmem_unuse()'s list of swapped-out inodes,
- * if it's not already there. Do it now because we cannot take
- * mutex while holding spinlock, and must do so before the page
- * is moved to swap cache, when its pagelock no longer protects
+ * if it's not already there. Do it now before the page is
+ * moved to swap cache, when its pagelock no longer protects
* the inode from eviction. But don't unlock the mutex until
- * we've taken the spinlock, because shmem_unuse_inode() will
- * prune a !swapped inode from the swaplist under both locks.
+ * we've incremented swapped, because shmem_unuse_inode() will
+ * prune a !swapped inode from the swaplist under this mutex.
*/
mutex_lock(&shmem_swaplist_mutex);
if (list_empty(&info->swaplist))
list_add_tail(&info->swaplist, &shmem_swaplist);
- spin_lock(&info->lock);
- mutex_unlock(&shmem_swaplist_mutex);
-
- if (index >= info->next_index) {
- BUG_ON(!(info->flags & SHMEM_TRUNCATE));
- goto unlock;
- }
- entry = shmem_swp_entry(info, index, NULL);
- if (entry->val) {
- WARN_ON_ONCE(1); /* Still happens? Tell us about it! */
- free_swap_and_cache(*entry);
- shmem_swp_set(info, entry, 0);
- }
- shmem_recalc_inode(inode);
-
if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) {
- delete_from_page_cache(page);
- shmem_swp_set(info, entry, swap.val);
- shmem_swp_unmap(entry);
swap_shmem_alloc(swap);
+ shmem_delete_from_page_cache(page, swp_to_radix_entry(swap));
+
+ spin_lock(&info->lock);
+ info->swapped++;
+ shmem_recalc_inode(inode);
spin_unlock(&info->lock);
+
+ mutex_unlock(&shmem_swaplist_mutex);
BUG_ON(page_mapped(page));
swap_writepage(page, wbc);
return 0;
}
- shmem_swp_unmap(entry);
-unlock:
- spin_unlock(&info->lock);
- /*
- * add_to_swap_cache() doesn't return -EEXIST, so we can safely
- * clear SWAP_HAS_CACHE flag.
- */
+ mutex_unlock(&shmem_swaplist_mutex);
swapcache_free(swap, NULL);
redirty:
set_page_dirty(page);
@@ -1153,35 +763,33 @@
}
#endif /* CONFIG_TMPFS */
-static struct page *shmem_swapin(swp_entry_t entry, gfp_t gfp,
- struct shmem_inode_info *info, unsigned long idx)
+static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
+ struct shmem_inode_info *info, pgoff_t index)
{
struct mempolicy mpol, *spol;
struct vm_area_struct pvma;
- struct page *page;
spol = mpol_cond_copy(&mpol,
- mpol_shared_policy_lookup(&info->policy, idx));
+ mpol_shared_policy_lookup(&info->policy, index));
/* Create a pseudo vma that just contains the policy */
pvma.vm_start = 0;
- pvma.vm_pgoff = idx;
+ pvma.vm_pgoff = index;
pvma.vm_ops = NULL;
pvma.vm_policy = spol;
- page = swapin_readahead(entry, gfp, &pvma, 0);
- return page;
+ return swapin_readahead(swap, gfp, &pvma, 0);
}
static struct page *shmem_alloc_page(gfp_t gfp,
- struct shmem_inode_info *info, unsigned long idx)
+ struct shmem_inode_info *info, pgoff_t index)
{
struct vm_area_struct pvma;
/* Create a pseudo vma that just contains the policy */
pvma.vm_start = 0;
- pvma.vm_pgoff = idx;
+ pvma.vm_pgoff = index;
pvma.vm_ops = NULL;
- pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx);
+ pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index);
/*
* alloc_page_vma() will drop the shared policy reference
@@ -1190,19 +798,19 @@
}
#else /* !CONFIG_NUMA */
#ifdef CONFIG_TMPFS
-static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *p)
+static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
{
}
#endif /* CONFIG_TMPFS */
-static inline struct page *shmem_swapin(swp_entry_t entry, gfp_t gfp,
- struct shmem_inode_info *info, unsigned long idx)
+static inline struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
+ struct shmem_inode_info *info, pgoff_t index)
{
- return swapin_readahead(entry, gfp, NULL, 0);
+ return swapin_readahead(swap, gfp, NULL, 0);
}
static inline struct page *shmem_alloc_page(gfp_t gfp,
- struct shmem_inode_info *info, unsigned long idx)
+ struct shmem_inode_info *info, pgoff_t index)
{
return alloc_page(gfp);
}
@@ -1222,243 +830,190 @@
* vm. If we swap it in we mark it dirty since we also free the swap
* entry since a page cannot live in both the swap and page cache
*/
-static int shmem_getpage_gfp(struct inode *inode, pgoff_t idx,
+static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type)
{
struct address_space *mapping = inode->i_mapping;
- struct shmem_inode_info *info = SHMEM_I(inode);
+ struct shmem_inode_info *info;
struct shmem_sb_info *sbinfo;
struct page *page;
- struct page *prealloc_page = NULL;
- swp_entry_t *entry;
swp_entry_t swap;
int error;
- int ret;
+ int once = 0;
- if (idx >= SHMEM_MAX_INDEX)
+ if (index > (MAX_LFS_FILESIZE >> PAGE_CACHE_SHIFT))
return -EFBIG;
repeat:
- page = find_lock_page(mapping, idx);
- if (page) {
+ swap.val = 0;
+ page = find_lock_page(mapping, index);
+ if (radix_tree_exceptional_entry(page)) {
+ swap = radix_to_swp_entry(page);
+ page = NULL;
+ }
+
+ if (sgp != SGP_WRITE &&
+ ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
+ error = -EINVAL;
+ goto failed;
+ }
+
+ if (page || (sgp == SGP_READ && !swap.val)) {
/*
* Once we can get the page lock, it must be uptodate:
* if there were an error in reading back from swap,
* the page would not be inserted into the filecache.
*/
- BUG_ON(!PageUptodate(page));
- goto done;
+ BUG_ON(page && !PageUptodate(page));
+ *pagep = page;
+ return 0;
}
/*
- * Try to preload while we can wait, to not make a habit of
- * draining atomic reserves; but don't latch on to this cpu.
+ * Fast cache lookup did not find it:
+ * bring it back from swap or allocate.
*/
- error = radix_tree_preload(gfp & GFP_RECLAIM_MASK);
- if (error)
- goto out;
- radix_tree_preload_end();
-
- if (sgp != SGP_READ && !prealloc_page) {
- prealloc_page = shmem_alloc_page(gfp, info, idx);
- if (prealloc_page) {
- SetPageSwapBacked(prealloc_page);
- if (mem_cgroup_cache_charge(prealloc_page,
- current->mm, GFP_KERNEL)) {
- page_cache_release(prealloc_page);
- prealloc_page = NULL;
- }
- }
- }
-
- spin_lock(&info->lock);
- shmem_recalc_inode(inode);
- entry = shmem_swp_alloc(info, idx, sgp, gfp);
- if (IS_ERR(entry)) {
- spin_unlock(&info->lock);
- error = PTR_ERR(entry);
- goto out;
- }
- swap = *entry;
+ info = SHMEM_I(inode);
+ sbinfo = SHMEM_SB(inode->i_sb);
if (swap.val) {
/* Look it up and read it in.. */
page = lookup_swap_cache(swap);
if (!page) {
- shmem_swp_unmap(entry);
- spin_unlock(&info->lock);
/* here we actually do the io */
if (fault_type)
*fault_type |= VM_FAULT_MAJOR;
- page = shmem_swapin(swap, gfp, info, idx);
+ page = shmem_swapin(swap, gfp, info, index);
if (!page) {
- spin_lock(&info->lock);
- entry = shmem_swp_alloc(info, idx, sgp, gfp);
- if (IS_ERR(entry))
- error = PTR_ERR(entry);
- else {
- if (entry->val == swap.val)
- error = -ENOMEM;
- shmem_swp_unmap(entry);
- }
- spin_unlock(&info->lock);
- if (error)
- goto out;
- goto repeat;
+ error = -ENOMEM;
+ goto failed;
}
- wait_on_page_locked(page);
- page_cache_release(page);
- goto repeat;
}
/* We have to do this with page locked to prevent races */
- if (!trylock_page(page)) {
- shmem_swp_unmap(entry);
- spin_unlock(&info->lock);
- wait_on_page_locked(page);
- page_cache_release(page);
- goto repeat;
- }
- if (PageWriteback(page)) {
- shmem_swp_unmap(entry);
- spin_unlock(&info->lock);
- wait_on_page_writeback(page);
- unlock_page(page);
- page_cache_release(page);
- goto repeat;
- }
+ lock_page(page);
if (!PageUptodate(page)) {
- shmem_swp_unmap(entry);
- spin_unlock(&info->lock);
- unlock_page(page);
- page_cache_release(page);
error = -EIO;
- goto out;
+ goto failed;
+ }
+ wait_on_page_writeback(page);
+
+ /* Someone may have already done it for us */
+ if (page->mapping) {
+ if (page->mapping == mapping &&
+ page->index == index)
+ goto done;
+ error = -EEXIST;
+ goto failed;
}
- error = add_to_page_cache_locked(page, mapping,
- idx, GFP_NOWAIT);
- if (error) {
- shmem_swp_unmap(entry);
- spin_unlock(&info->lock);
- if (error == -ENOMEM) {
- /*
- * reclaim from proper memory cgroup and
- * call memcg's OOM if needed.
- */
- error = mem_cgroup_shmem_charge_fallback(
- page, current->mm, gfp);
- if (error) {
- unlock_page(page);
- page_cache_release(page);
- goto out;
- }
- }
- unlock_page(page);
- page_cache_release(page);
- goto repeat;
- }
+ error = mem_cgroup_cache_charge(page, current->mm,
+ gfp & GFP_RECLAIM_MASK);
+ if (!error)
+ error = shmem_add_to_page_cache(page, mapping, index,
+ gfp, swp_to_radix_entry(swap));
+ if (error)
+ goto failed;
- info->flags |= SHMEM_PAGEIN;
- shmem_swp_set(info, entry, 0);
- shmem_swp_unmap(entry);
- delete_from_swap_cache(page);
+ spin_lock(&info->lock);
+ info->swapped--;
+ shmem_recalc_inode(inode);
spin_unlock(&info->lock);
+
+ delete_from_swap_cache(page);
set_page_dirty(page);
swap_free(swap);
- } else if (sgp == SGP_READ) {
- shmem_swp_unmap(entry);
- page = find_get_page(mapping, idx);
- if (page && !trylock_page(page)) {
- spin_unlock(&info->lock);
- wait_on_page_locked(page);
- page_cache_release(page);
- goto repeat;
+ } else {
+ if (shmem_acct_block(info->flags)) {
+ error = -ENOSPC;
+ goto failed;
}
- spin_unlock(&info->lock);
-
- } else if (prealloc_page) {
- shmem_swp_unmap(entry);
- sbinfo = SHMEM_SB(inode->i_sb);
if (sbinfo->max_blocks) {
if (percpu_counter_compare(&sbinfo->used_blocks,
- sbinfo->max_blocks) >= 0 ||
- shmem_acct_block(info->flags))
- goto nospace;
+ sbinfo->max_blocks) >= 0) {
+ error = -ENOSPC;
+ goto unacct;
+ }
percpu_counter_inc(&sbinfo->used_blocks);
- inode->i_blocks += BLOCKS_PER_PAGE;
- } else if (shmem_acct_block(info->flags))
- goto nospace;
-
- page = prealloc_page;
- prealloc_page = NULL;
-
- entry = shmem_swp_alloc(info, idx, sgp, gfp);
- if (IS_ERR(entry))
- error = PTR_ERR(entry);
- else {
- swap = *entry;
- shmem_swp_unmap(entry);
- }
- ret = error || swap.val;
- if (ret)
- mem_cgroup_uncharge_cache_page(page);
- else
- ret = add_to_page_cache_lru(page, mapping,
- idx, GFP_NOWAIT);
- /*
- * At add_to_page_cache_lru() failure,
- * uncharge will be done automatically.
- */
- if (ret) {
- shmem_unacct_blocks(info->flags, 1);
- shmem_free_blocks(inode, 1);
- spin_unlock(&info->lock);
- page_cache_release(page);
- if (error)
- goto out;
- goto repeat;
}
- info->flags |= SHMEM_PAGEIN;
+ page = shmem_alloc_page(gfp, info, index);
+ if (!page) {
+ error = -ENOMEM;
+ goto decused;
+ }
+
+ SetPageSwapBacked(page);
+ __set_page_locked(page);
+ error = mem_cgroup_cache_charge(page, current->mm,
+ gfp & GFP_RECLAIM_MASK);
+ if (!error)
+ error = shmem_add_to_page_cache(page, mapping, index,
+ gfp, NULL);
+ if (error)
+ goto decused;
+ lru_cache_add_anon(page);
+
+ spin_lock(&info->lock);
info->alloced++;
+ inode->i_blocks += BLOCKS_PER_PAGE;
+ shmem_recalc_inode(inode);
spin_unlock(&info->lock);
+
clear_highpage(page);
flush_dcache_page(page);
SetPageUptodate(page);
if (sgp == SGP_DIRTY)
set_page_dirty(page);
-
- } else {
- spin_unlock(&info->lock);
- error = -ENOMEM;
- goto out;
}
done:
- *pagep = page;
- error = 0;
-out:
- if (prealloc_page) {
- mem_cgroup_uncharge_cache_page(prealloc_page);
- page_cache_release(prealloc_page);
+ /* Perhaps the file has been truncated since we checked */
+ if (sgp != SGP_WRITE &&
+ ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
+ error = -EINVAL;
+ goto trunc;
}
- return error;
+ *pagep = page;
+ return 0;
-nospace:
/*
- * Perhaps the page was brought in from swap between find_lock_page
- * and taking info->lock? We allow for that at add_to_page_cache_lru,
- * but must also avoid reporting a spurious ENOSPC while working on a
- * full tmpfs.
+ * Error recovery.
*/
- page = find_get_page(mapping, idx);
+trunc:
+ ClearPageDirty(page);
+ delete_from_page_cache(page);
+ spin_lock(&info->lock);
+ info->alloced--;
+ inode->i_blocks -= BLOCKS_PER_PAGE;
spin_unlock(&info->lock);
+decused:
+ if (sbinfo->max_blocks)
+ percpu_counter_add(&sbinfo->used_blocks, -1);
+unacct:
+ shmem_unacct_blocks(info->flags, 1);
+failed:
+ if (swap.val && error != -EINVAL) {
+ struct page *test = find_get_page(mapping, index);
+ if (test && !radix_tree_exceptional_entry(test))
+ page_cache_release(test);
+ /* Have another try if the entry has changed */
+ if (test != swp_to_radix_entry(swap))
+ error = -EEXIST;
+ }
if (page) {
+ unlock_page(page);
page_cache_release(page);
+ }
+ if (error == -ENOSPC && !once++) {
+ info = SHMEM_I(inode);
+ spin_lock(&info->lock);
+ shmem_recalc_inode(inode);
+ spin_unlock(&info->lock);
goto repeat;
}
- error = -ENOSPC;
- goto out;
+ if (error == -EEXIST)
+ goto repeat;
+ return error;
}
static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
@@ -1467,9 +1022,6 @@
int error;
int ret = VM_FAULT_LOCKED;
- if (((loff_t)vmf->pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode))
- return VM_FAULT_SIGBUS;
-
error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret);
if (error)
return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS);
@@ -1482,20 +1034,20 @@
}
#ifdef CONFIG_NUMA
-static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *new)
+static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol)
{
- struct inode *i = vma->vm_file->f_path.dentry->d_inode;
- return mpol_set_shared_policy(&SHMEM_I(i)->policy, vma, new);
+ struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
+ return mpol_set_shared_policy(&SHMEM_I(inode)->policy, vma, mpol);
}
static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma,
unsigned long addr)
{
- struct inode *i = vma->vm_file->f_path.dentry->d_inode;
- unsigned long idx;
+ struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
+ pgoff_t index;
- idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
- return mpol_shared_policy_lookup(&SHMEM_I(i)->policy, idx);
+ index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
+ return mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, index);
}
#endif
@@ -1593,7 +1145,7 @@
#ifdef CONFIG_TMPFS
static const struct inode_operations shmem_symlink_inode_operations;
-static const struct inode_operations shmem_symlink_inline_operations;
+static const struct inode_operations shmem_short_symlink_operations;
static int
shmem_write_begin(struct file *file, struct address_space *mapping,
@@ -1626,7 +1178,8 @@
{
struct inode *inode = filp->f_path.dentry->d_inode;
struct address_space *mapping = inode->i_mapping;
- unsigned long index, offset;
+ pgoff_t index;
+ unsigned long offset;
enum sgp_type sgp = SGP_READ;
/*
@@ -1642,7 +1195,8 @@
for (;;) {
struct page *page = NULL;
- unsigned long end_index, nr, ret;
+ pgoff_t end_index;
+ unsigned long nr, ret;
loff_t i_size = i_size_read(inode);
end_index = i_size >> PAGE_CACHE_SHIFT;
@@ -1880,8 +1434,9 @@
buf->f_namelen = NAME_MAX;
if (sbinfo->max_blocks) {
buf->f_blocks = sbinfo->max_blocks;
- buf->f_bavail = buf->f_bfree =
- sbinfo->max_blocks - percpu_counter_sum(&sbinfo->used_blocks);
+ buf->f_bavail =
+ buf->f_bfree = sbinfo->max_blocks -
+ percpu_counter_sum(&sbinfo->used_blocks);
}
if (sbinfo->max_inodes) {
buf->f_files = sbinfo->max_inodes;
@@ -2055,10 +1610,13 @@
info = SHMEM_I(inode);
inode->i_size = len-1;
- if (len <= SHMEM_SYMLINK_INLINE_LEN) {
- /* do it inline */
- memcpy(info->inline_symlink, symname, len);
- inode->i_op = &shmem_symlink_inline_operations;
+ if (len <= SHORT_SYMLINK_LEN) {
+ info->symlink = kmemdup(symname, len, GFP_KERNEL);
+ if (!info->symlink) {
+ iput(inode);
+ return -ENOMEM;
+ }
+ inode->i_op = &shmem_short_symlink_operations;
} else {
error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL);
if (error) {
@@ -2081,17 +1639,17 @@
return 0;
}
-static void *shmem_follow_link_inline(struct dentry *dentry, struct nameidata *nd)
+static void *shmem_follow_short_symlink(struct dentry *dentry, struct nameidata *nd)
{
- nd_set_link(nd, SHMEM_I(dentry->d_inode)->inline_symlink);
+ nd_set_link(nd, SHMEM_I(dentry->d_inode)->symlink);
return NULL;
}
static void *shmem_follow_link(struct dentry *dentry, struct nameidata *nd)
{
struct page *page = NULL;
- int res = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ, NULL);
- nd_set_link(nd, res ? ERR_PTR(res) : kmap(page));
+ int error = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ, NULL);
+ nd_set_link(nd, error ? ERR_PTR(error) : kmap(page));
if (page)
unlock_page(page);
return page;
@@ -2202,7 +1760,6 @@
return err;
}
-
static const struct xattr_handler *shmem_xattr_handlers[] = {
#ifdef CONFIG_TMPFS_POSIX_ACL
&generic_acl_access_handler,
@@ -2332,9 +1889,9 @@
}
#endif /* CONFIG_TMPFS_XATTR */
-static const struct inode_operations shmem_symlink_inline_operations = {
+static const struct inode_operations shmem_short_symlink_operations = {
.readlink = generic_readlink,
- .follow_link = shmem_follow_link_inline,
+ .follow_link = shmem_follow_short_symlink,
#ifdef CONFIG_TMPFS_XATTR
.setxattr = shmem_setxattr,
.getxattr = shmem_getxattr,
@@ -2534,8 +2091,7 @@
if (config.max_inodes < inodes)
goto out;
/*
- * Those tests also disallow limited->unlimited while any are in
- * use, so i_blocks will always be zero when max_blocks is zero;
+ * Those tests disallow limited->unlimited while any are in use;
* but we must separately disallow unlimited->limited, because
* in that case we have no record of how much is already in use.
*/
@@ -2627,7 +2183,7 @@
goto failed;
sbinfo->free_inodes = sbinfo->max_inodes;
- sb->s_maxbytes = SHMEM_MAX_BYTES;
+ sb->s_maxbytes = MAX_LFS_FILESIZE;
sb->s_blocksize = PAGE_CACHE_SIZE;
sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
sb->s_magic = TMPFS_MAGIC;
@@ -2662,14 +2218,14 @@
static struct inode *shmem_alloc_inode(struct super_block *sb)
{
- struct shmem_inode_info *p;
- p = (struct shmem_inode_info *)kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL);
- if (!p)
+ struct shmem_inode_info *info;
+ info = kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL);
+ if (!info)
return NULL;
- return &p->vfs_inode;
+ return &info->vfs_inode;
}
-static void shmem_i_callback(struct rcu_head *head)
+static void shmem_destroy_callback(struct rcu_head *head)
{
struct inode *inode = container_of(head, struct inode, i_rcu);
INIT_LIST_HEAD(&inode->i_dentry);
@@ -2678,29 +2234,26 @@
static void shmem_destroy_inode(struct inode *inode)
{
- if ((inode->i_mode & S_IFMT) == S_IFREG) {
- /* only struct inode is valid if it's an inline symlink */
+ if ((inode->i_mode & S_IFMT) == S_IFREG)
mpol_free_shared_policy(&SHMEM_I(inode)->policy);
- }
- call_rcu(&inode->i_rcu, shmem_i_callback);
+ call_rcu(&inode->i_rcu, shmem_destroy_callback);
}
-static void init_once(void *foo)
+static void shmem_init_inode(void *foo)
{
- struct shmem_inode_info *p = (struct shmem_inode_info *) foo;
-
- inode_init_once(&p->vfs_inode);
+ struct shmem_inode_info *info = foo;
+ inode_init_once(&info->vfs_inode);
}
-static int init_inodecache(void)
+static int shmem_init_inodecache(void)
{
shmem_inode_cachep = kmem_cache_create("shmem_inode_cache",
sizeof(struct shmem_inode_info),
- 0, SLAB_PANIC, init_once);
+ 0, SLAB_PANIC, shmem_init_inode);
return 0;
}
-static void destroy_inodecache(void)
+static void shmem_destroy_inodecache(void)
{
kmem_cache_destroy(shmem_inode_cachep);
}
@@ -2797,21 +2350,20 @@
#endif
};
-
static struct dentry *shmem_mount(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data)
{
return mount_nodev(fs_type, flags, data, shmem_fill_super);
}
-static struct file_system_type tmpfs_fs_type = {
+static struct file_system_type shmem_fs_type = {
.owner = THIS_MODULE,
.name = "tmpfs",
.mount = shmem_mount,
.kill_sb = kill_litter_super,
};
-int __init init_tmpfs(void)
+int __init shmem_init(void)
{
int error;
@@ -2819,18 +2371,18 @@
if (error)
goto out4;
- error = init_inodecache();
+ error = shmem_init_inodecache();
if (error)
goto out3;
- error = register_filesystem(&tmpfs_fs_type);
+ error = register_filesystem(&shmem_fs_type);
if (error) {
printk(KERN_ERR "Could not register tmpfs\n");
goto out2;
}
- shm_mnt = vfs_kern_mount(&tmpfs_fs_type, MS_NOUSER,
- tmpfs_fs_type.name, NULL);
+ shm_mnt = vfs_kern_mount(&shmem_fs_type, MS_NOUSER,
+ shmem_fs_type.name, NULL);
if (IS_ERR(shm_mnt)) {
error = PTR_ERR(shm_mnt);
printk(KERN_ERR "Could not kern_mount tmpfs\n");
@@ -2839,9 +2391,9 @@
return 0;
out1:
- unregister_filesystem(&tmpfs_fs_type);
+ unregister_filesystem(&shmem_fs_type);
out2:
- destroy_inodecache();
+ shmem_destroy_inodecache();
out3:
bdi_destroy(&shmem_backing_dev_info);
out4:
@@ -2849,45 +2401,6 @@
return error;
}
-#ifdef CONFIG_CGROUP_MEM_RES_CTLR
-/**
- * mem_cgroup_get_shmem_target - find a page or entry assigned to the shmem file
- * @inode: the inode to be searched
- * @pgoff: the offset to be searched
- * @pagep: the pointer for the found page to be stored
- * @ent: the pointer for the found swap entry to be stored
- *
- * If a page is found, refcount of it is incremented. Callers should handle
- * these refcount.
- */
-void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t pgoff,
- struct page **pagep, swp_entry_t *ent)
-{
- swp_entry_t entry = { .val = 0 }, *ptr;
- struct page *page = NULL;
- struct shmem_inode_info *info = SHMEM_I(inode);
-
- if ((pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode))
- goto out;
-
- spin_lock(&info->lock);
- ptr = shmem_swp_entry(info, pgoff, NULL);
-#ifdef CONFIG_SWAP
- if (ptr && ptr->val) {
- entry.val = ptr->val;
- page = find_get_page(&swapper_space, entry.val);
- } else
-#endif
- page = find_get_page(inode->i_mapping, pgoff);
- if (ptr)
- shmem_swp_unmap(ptr);
- spin_unlock(&info->lock);
-out:
- *pagep = page;
- *ent = entry;
-}
-#endif
-
#else /* !CONFIG_SHMEM */
/*
@@ -2901,23 +2414,23 @@
#include <linux/ramfs.h>
-static struct file_system_type tmpfs_fs_type = {
+static struct file_system_type shmem_fs_type = {
.name = "tmpfs",
.mount = ramfs_mount,
.kill_sb = kill_litter_super,
};
-int __init init_tmpfs(void)
+int __init shmem_init(void)
{
- BUG_ON(register_filesystem(&tmpfs_fs_type) != 0);
+ BUG_ON(register_filesystem(&shmem_fs_type) != 0);
- shm_mnt = kern_mount(&tmpfs_fs_type);
+ shm_mnt = kern_mount(&shmem_fs_type);
BUG_ON(IS_ERR(shm_mnt));
return 0;
}
-int shmem_unuse(swp_entry_t entry, struct page *page)
+int shmem_unuse(swp_entry_t swap, struct page *page)
{
return 0;
}
@@ -2927,43 +2440,17 @@
return 0;
}
-void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end)
+void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
{
- truncate_inode_pages_range(inode->i_mapping, start, end);
+ truncate_inode_pages_range(inode->i_mapping, lstart, lend);
}
EXPORT_SYMBOL_GPL(shmem_truncate_range);
-#ifdef CONFIG_CGROUP_MEM_RES_CTLR
-/**
- * mem_cgroup_get_shmem_target - find a page or entry assigned to the shmem file
- * @inode: the inode to be searched
- * @pgoff: the offset to be searched
- * @pagep: the pointer for the found page to be stored
- * @ent: the pointer for the found swap entry to be stored
- *
- * If a page is found, refcount of it is incremented. Callers should handle
- * these refcount.
- */
-void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t pgoff,
- struct page **pagep, swp_entry_t *ent)
-{
- struct page *page = NULL;
-
- if ((pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode))
- goto out;
- page = find_get_page(inode->i_mapping, pgoff);
-out:
- *pagep = page;
- *ent = (swp_entry_t){ .val = 0 };
-}
-#endif
-
#define shmem_vm_ops generic_file_vm_ops
#define shmem_file_operations ramfs_file_operations
#define shmem_get_inode(sb, dir, mode, dev, flags) ramfs_get_inode(sb, dir, mode, dev)
#define shmem_acct_size(flags, size) 0
#define shmem_unacct_size(flags, size) do {} while (0)
-#define SHMEM_MAX_BYTES MAX_LFS_FILESIZE
#endif /* CONFIG_SHMEM */
@@ -2987,7 +2474,7 @@
if (IS_ERR(shm_mnt))
return (void *)shm_mnt;
- if (size < 0 || size > SHMEM_MAX_BYTES)
+ if (size < 0 || size > MAX_LFS_FILESIZE)
return ERR_PTR(-EINVAL);
if (shmem_acct_size(flags, size))
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 1b8c339..17bc224 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1924,20 +1924,24 @@
/*
* Find out how many pages are allowed for a single swap
- * device. There are two limiting factors: 1) the number of
- * bits for the swap offset in the swp_entry_t type and
- * 2) the number of bits in the a swap pte as defined by
- * the different architectures. In order to find the
- * largest possible bit mask a swap entry with swap type 0
+ * device. There are three limiting factors: 1) the number
+ * of bits for the swap offset in the swp_entry_t type, and
+ * 2) the number of bits in the swap pte as defined by the
+ * the different architectures, and 3) the number of free bits
+ * in an exceptional radix_tree entry. In order to find the
+ * largest possible bit mask, a swap entry with swap type 0
* and swap offset ~0UL is created, encoded to a swap pte,
- * decoded to a swp_entry_t again and finally the swap
+ * decoded to a swp_entry_t again, and finally the swap
* offset is extracted. This will mask all the bits from
* the initial ~0UL mask that can't be encoded in either
* the swp_entry_t or the architecture definition of a
- * swap pte.
+ * swap pte. Then the same is done for a radix_tree entry.
*/
maxpages = swp_offset(pte_to_swp_entry(
- swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1;
+ swp_entry_to_pte(swp_entry(0, ~0UL))));
+ maxpages = swp_offset(radix_to_swp_entry(
+ swp_to_radix_entry(swp_entry(0, maxpages)))) + 1;
+
if (maxpages > swap_header->info.last_page) {
maxpages = swap_header->info.last_page + 1;
/* p->max is an unsigned int: don't overflow it */
diff --git a/mm/truncate.c b/mm/truncate.c
index 232eb27..b40ac6d 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -336,6 +336,14 @@
unsigned long count = 0;
int i;
+ /*
+ * Note: this function may get called on a shmem/tmpfs mapping:
+ * pagevec_lookup() might then return 0 prematurely (because it
+ * got a gangful of swap entries); but it's hardly worth worrying
+ * about - it can rarely have anything to free from such a mapping
+ * (most pages are dirty), and already skips over any difficulties.
+ */
+
pagevec_init(&pvec, 0);
while (index <= end && pagevec_lookup(&pvec, mapping, index,
min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c
index 6d8ef4a..8b2d37b 100644
--- a/tools/power/x86/turbostat/turbostat.c
+++ b/tools/power/x86/turbostat/turbostat.c
@@ -128,34 +128,34 @@
void print_header(void)
{
if (show_pkg)
- fprintf(stderr, "pkg ");
+ fprintf(stderr, "pk");
if (show_core)
- fprintf(stderr, "core");
+ fprintf(stderr, " cr");
if (show_cpu)
fprintf(stderr, " CPU");
if (do_nhm_cstates)
- fprintf(stderr, " %%c0 ");
+ fprintf(stderr, " %%c0 ");
if (has_aperf)
- fprintf(stderr, " GHz");
+ fprintf(stderr, " GHz");
fprintf(stderr, " TSC");
if (do_nhm_cstates)
- fprintf(stderr, " %%c1 ");
+ fprintf(stderr, " %%c1");
if (do_nhm_cstates)
- fprintf(stderr, " %%c3 ");
+ fprintf(stderr, " %%c3");
if (do_nhm_cstates)
- fprintf(stderr, " %%c6 ");
+ fprintf(stderr, " %%c6");
if (do_snb_cstates)
- fprintf(stderr, " %%c7 ");
+ fprintf(stderr, " %%c7");
if (do_snb_cstates)
- fprintf(stderr, " %%pc2 ");
+ fprintf(stderr, " %%pc2");
if (do_nhm_cstates)
- fprintf(stderr, " %%pc3 ");
+ fprintf(stderr, " %%pc3");
if (do_nhm_cstates)
- fprintf(stderr, " %%pc6 ");
+ fprintf(stderr, " %%pc6");
if (do_snb_cstates)
- fprintf(stderr, " %%pc7 ");
+ fprintf(stderr, " %%pc7");
if (extra_msr_offset)
- fprintf(stderr, " MSR 0x%x ", extra_msr_offset);
+ fprintf(stderr, " MSR 0x%x ", extra_msr_offset);
putc('\n', stderr);
}
@@ -194,14 +194,14 @@
/* topology columns, print blanks on 1st (average) line */
if (p == cnt_average) {
if (show_pkg)
- fprintf(stderr, " ");
+ fprintf(stderr, " ");
if (show_core)
fprintf(stderr, " ");
if (show_cpu)
fprintf(stderr, " ");
} else {
if (show_pkg)
- fprintf(stderr, "%4d", p->pkg);
+ fprintf(stderr, "%d", p->pkg);
if (show_core)
fprintf(stderr, "%4d", p->core);
if (show_cpu)
@@ -241,22 +241,22 @@
if (!skip_c1)
fprintf(stderr, "%7.2f", 100.0 * p->c1/p->tsc);
else
- fprintf(stderr, " ****");
+ fprintf(stderr, " ****");
}
if (do_nhm_cstates)
- fprintf(stderr, "%7.2f", 100.0 * p->c3/p->tsc);
+ fprintf(stderr, " %6.2f", 100.0 * p->c3/p->tsc);
if (do_nhm_cstates)
- fprintf(stderr, "%7.2f", 100.0 * p->c6/p->tsc);
+ fprintf(stderr, " %6.2f", 100.0 * p->c6/p->tsc);
if (do_snb_cstates)
- fprintf(stderr, "%7.2f", 100.0 * p->c7/p->tsc);
+ fprintf(stderr, " %6.2f", 100.0 * p->c7/p->tsc);
if (do_snb_cstates)
- fprintf(stderr, "%7.2f", 100.0 * p->pc2/p->tsc);
+ fprintf(stderr, " %5.2f", 100.0 * p->pc2/p->tsc);
if (do_nhm_cstates)
- fprintf(stderr, "%7.2f", 100.0 * p->pc3/p->tsc);
+ fprintf(stderr, " %5.2f", 100.0 * p->pc3/p->tsc);
if (do_nhm_cstates)
- fprintf(stderr, "%7.2f", 100.0 * p->pc6/p->tsc);
+ fprintf(stderr, " %5.2f", 100.0 * p->pc6/p->tsc);
if (do_snb_cstates)
- fprintf(stderr, "%7.2f", 100.0 * p->pc7/p->tsc);
+ fprintf(stderr, " %5.2f", 100.0 * p->pc7/p->tsc);
if (extra_msr_offset)
fprintf(stderr, " 0x%016llx", p->extra_msr);
putc('\n', stderr);
diff --git a/tools/power/x86/x86_energy_perf_policy/x86_energy_perf_policy.c b/tools/power/x86/x86_energy_perf_policy/x86_energy_perf_policy.c
index 2618ef2..33c5c7e 100644
--- a/tools/power/x86/x86_energy_perf_policy/x86_energy_perf_policy.c
+++ b/tools/power/x86/x86_energy_perf_policy/x86_energy_perf_policy.c
@@ -137,7 +137,6 @@
void validate_cpuid(void)
{
unsigned int eax, ebx, ecx, edx, max_level;
- char brand[16];
unsigned int fms, family, model, stepping;
eax = ebx = ecx = edx = 0;
@@ -160,8 +159,8 @@
model += ((fms >> 16) & 0xf) << 4;
if (verbose > 1)
- printf("CPUID %s %d levels family:model:stepping "
- "0x%x:%x:%x (%d:%d:%d)\n", brand, max_level,
+ printf("CPUID %d levels family:model:stepping "
+ "0x%x:%x:%x (%d:%d:%d)\n", max_level,
family, model, stepping, family, model, stepping);
if (!(edx & (1 << 5))) {