Implement file posix capabilities

Implement file posix capabilities.  This allows programs to be given a
subset of root's powers regardless of who runs them, without having to use
setuid and giving the binary all of root's powers.

This version works with Kaigai Kohei's userspace tools, found at
http://www.kaigai.gr.jp/index.php.  For more information on how to use this
patch, Chris Friedhoff has posted a nice page at
http://www.friedhoff.org/fscaps.html.

Changelog:
	Nov 27:
	Incorporate fixes from Andrew Morton
	(security-introduce-file-caps-tweaks and
	security-introduce-file-caps-warning-fix)
	Fix Kconfig dependency.
	Fix change signaling behavior when file caps are not compiled in.

	Nov 13:
	Integrate comments from Alexey: Remove CONFIG_ ifdef from
	capability.h, and use %zd for printing a size_t.

	Nov 13:
	Fix endianness warnings by sparse as suggested by Alexey
	Dobriyan.

	Nov 09:
	Address warnings of unused variables at cap_bprm_set_security
	when file capabilities are disabled, and simultaneously clean
	up the code a little, by pulling the new code into a helper
	function.

	Nov 08:
	For pointers to required userspace tools and how to use
	them, see http://www.friedhoff.org/fscaps.html.

	Nov 07:
	Fix the calculation of the highest bit checked in
	check_cap_sanity().

	Nov 07:
	Allow file caps to be enabled without CONFIG_SECURITY, since
	capabilities are the default.
	Hook cap_task_setscheduler when !CONFIG_SECURITY.
	Move capable(TASK_KILL) to end of cap_task_kill to reduce
	audit messages.

	Nov 05:
	Add secondary calls in selinux/hooks.c to task_setioprio and
	task_setscheduler so that selinux and capabilities with file
	cap support can be stacked.

	Sep 05:
	As Seth Arnold points out, uid checks are out of place
	for capability code.

	Sep 01:
	Define task_setscheduler, task_setioprio, cap_task_kill, and
	task_setnice to make sure a user cannot affect a process in which
	they called a program with some fscaps.

	One remaining question is the note under task_setscheduler: are we
	ok with CAP_SYS_NICE being sufficient to confine a process to a
	cpuset?

	It is a semantic change, as without fsccaps, attach_task doesn't
	allow CAP_SYS_NICE to override the uid equivalence check.  But since
	it uses security_task_setscheduler, which elsewhere is used where
	CAP_SYS_NICE can be used to override the uid equivalence check,
	fixing it might be tough.

	     task_setscheduler
		 note: this also controls cpuset:attach_task.  Are we ok with
		     CAP_SYS_NICE being used to confine to a cpuset?
	     task_setioprio
	     task_setnice
		 sys_setpriority uses this (through set_one_prio) for another
		 process.  Need same checks as setrlimit

	Aug 21:
	Updated secureexec implementation to reflect the fact that
	euid and uid might be the same and nonzero, but the process
	might still have elevated caps.

	Aug 15:
	Handle endianness of xattrs.
	Enforce capability version match between kernel and disk.
	Enforce that no bits beyond the known max capability are
	set, else return -EPERM.
	With this extra processing, it may be worth reconsidering
	doing all the work at bprm_set_security rather than
	d_instantiate.

	Aug 10:
	Always call getxattr at bprm_set_security, rather than
	caching it at d_instantiate.

[morgan@kernel.org: file-caps clean up for linux/capability.h]
[bunk@kernel.org: unexport cap_inode_killpriv]
Signed-off-by: Serge E. Hallyn <serue@us.ibm.com>
Cc: Stephen Smalley <sds@tycho.nsa.gov>
Cc: James Morris <jmorris@namei.org>
Cc: Chris Wright <chrisw@sous-sol.org>
Cc: Andrew Morgan <morgan@kernel.org>
Signed-off-by: Andrew Morgan <morgan@kernel.org>
Signed-off-by: Adrian Bunk <bunk@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
index 3a6512f..b7fc55e 100644
--- a/include/linux/binfmts.h
+++ b/include/linux/binfmts.h
@@ -37,7 +37,8 @@
 	int sh_bang;
 	struct file * file;
 	int e_uid, e_gid;
-	kernel_cap_t cap_inheritable, cap_permitted, cap_effective;
+	kernel_cap_t cap_inheritable, cap_permitted;
+	bool cap_effective;
 	void *security;
 	int argc, envc;
 	char * filename;	/* Name of binary as seen by procps */
diff --git a/include/linux/capability.h b/include/linux/capability.h
index 2dfa585..8961e7f 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -1,14 +1,14 @@
 /*
  * This is <linux/capability.h>
  *
- * Andrew G. Morgan <morgan@transmeta.com>
+ * Andrew G. Morgan <morgan@kernel.org>
  * Alexander Kjeldaas <astor@guardian.no>
  * with help from Aleph1, Roland Buresund and Andrew Main.
  *
  * See here for the libcap library ("POSIX draft" compliance):
  *
- * ftp://linux.kernel.org/pub/linux/libs/security/linux-privs/kernel-2.2/
- */ 
+ * ftp://linux.kernel.org/pub/linux/libs/security/linux-privs/kernel-2.6/
+ */
 
 #ifndef _LINUX_CAPABILITY_H
 #define _LINUX_CAPABILITY_H
@@ -28,23 +28,41 @@
    following structure to such a composite is better handled in a user
    library since the draft standard requires the use of malloc/free
    etc.. */
- 
+
 #define _LINUX_CAPABILITY_VERSION  0x19980330
 
 typedef struct __user_cap_header_struct {
 	__u32 version;
 	int pid;
 } __user *cap_user_header_t;
- 
+
 typedef struct __user_cap_data_struct {
         __u32 effective;
         __u32 permitted;
         __u32 inheritable;
 } __user *cap_user_data_t;
-  
-#ifdef __KERNEL__
 
-#include <asm/current.h>
+#define XATTR_CAPS_SUFFIX "capability"
+#define XATTR_NAME_CAPS XATTR_SECURITY_PREFIX XATTR_CAPS_SUFFIX
+
+#define XATTR_CAPS_SZ (3*sizeof(__le32))
+#define VFS_CAP_REVISION_MASK	0xFF000000
+#define VFS_CAP_REVISION_1	0x01000000
+
+#define VFS_CAP_REVISION	VFS_CAP_REVISION_1
+
+#define VFS_CAP_FLAGS_MASK	~VFS_CAP_REVISION_MASK
+#define VFS_CAP_FLAGS_EFFECTIVE	0x000001
+
+struct vfs_cap_data {
+	__u32 magic_etc;  /* Little endian */
+	struct {
+		__u32 permitted;    /* Little endian */
+		__u32 inheritable;  /* Little endian */
+	} data[1];
+};
+
+#ifdef __KERNEL__
 
 /* #define STRICT_CAP_T_TYPECHECKS */
 
@@ -59,7 +77,7 @@
 typedef __u32 kernel_cap_t;
 
 #endif
-  
+
 #define _USER_CAP_HEADER_SIZE  (2*sizeof(__u32))
 #define _KERNEL_CAP_T_SIZE     (sizeof(kernel_cap_t))
 
@@ -67,7 +85,7 @@
 
 
 /**
- ** POSIX-draft defined capabilities. 
+ ** POSIX-draft defined capabilities.
  **/
 
 /* In a system with the [_POSIX_CHOWN_RESTRICTED] option defined, this
@@ -87,7 +105,7 @@
    defined. Excluding DAC access covered by CAP_LINUX_IMMUTABLE. */
 
 #define CAP_DAC_READ_SEARCH  2
-    
+
 /* Overrides all restrictions about allowed operations on files, where
    file owner ID must be equal to the user ID, except where CAP_FSETID
    is applicable. It doesn't override MAC and DAC restrictions. */
@@ -257,7 +275,7 @@
 /* Override reserved space on ext2 filesystem */
 /* Modify data journaling mode on ext3 filesystem (uses journaling
    resources) */
-/* NOTE: ext2 honors fsuid when checking for resource overrides, so 
+/* NOTE: ext2 honors fsuid when checking for resource overrides, so
    you can override using fsuid too */
 /* Override size restrictions on IPC message queues */
 /* Allow more than 64hz interrupts from the real-time clock */
@@ -289,8 +307,10 @@
 
 #define CAP_AUDIT_CONTROL    30
 
+#define CAP_SETFCAP	     31
+
 #ifdef __KERNEL__
-/* 
+/*
  * Bounding set
  */
 extern kernel_cap_t cap_bset;
@@ -298,7 +318,7 @@
 /*
  * Internal kernel functions only
  */
- 
+
 #ifdef STRICT_CAP_T_TYPECHECKS
 
 #define to_cap_t(x) { x }
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 365586a4..e3fc5db 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -329,6 +329,7 @@
 #define ATTR_KILL_SUID	2048
 #define ATTR_KILL_SGID	4096
 #define ATTR_FILE	8192
+#define ATTR_KILL_PRIV	16384
 
 /*
  * This is the Inode Attributes structure, used for notify_change().  It
diff --git a/include/linux/security.h b/include/linux/security.h
index a300a3f..df591d2 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -51,8 +51,14 @@
 extern int cap_bprm_secureexec(struct linux_binprm *bprm);
 extern int cap_inode_setxattr(struct dentry *dentry, char *name, void *value, size_t size, int flags);
 extern int cap_inode_removexattr(struct dentry *dentry, char *name);
+extern int cap_inode_need_killpriv(struct dentry *dentry);
+extern int cap_inode_killpriv(struct dentry *dentry);
 extern int cap_task_post_setuid (uid_t old_ruid, uid_t old_euid, uid_t old_suid, int flags);
 extern void cap_task_reparent_to_init (struct task_struct *p);
+extern int cap_task_kill(struct task_struct *p, struct siginfo *info, int sig, u32 secid);
+extern int cap_task_setscheduler (struct task_struct *p, int policy, struct sched_param *lp);
+extern int cap_task_setioprio (struct task_struct *p, int ioprio);
+extern int cap_task_setnice (struct task_struct *p, int nice);
 extern int cap_syslog (int type);
 extern int cap_vm_enough_memory(struct mm_struct *mm, long pages);
 
@@ -413,6 +419,18 @@
  *	is specified by @buffer_size.  @buffer may be NULL to request
  *	the size of the buffer required.
  *	Returns number of bytes used/required on success.
+ * @inode_need_killpriv:
+ *	Called when an inode has been changed.
+ *	@dentry is the dentry being changed.
+ *	Return <0 on error to abort the inode change operation.
+ *	Return 0 if inode_killpriv does not need to be called.
+ *	Return >0 if inode_killpriv does need to be called.
+ * @inode_killpriv:
+ *	The setuid bit is being removed.  Remove similar security labels.
+ *	Called with the dentry->d_inode->i_mutex held.
+ *	@dentry is the dentry being changed.
+ *	Return 0 on success.  If error is returned, then the operation
+ *	causing setuid bit removal is failed.
  *
  * Security hooks for file operations
  *
@@ -1239,6 +1257,8 @@
 	int (*inode_getxattr) (struct dentry *dentry, char *name);
 	int (*inode_listxattr) (struct dentry *dentry);
 	int (*inode_removexattr) (struct dentry *dentry, char *name);
+	int (*inode_need_killpriv) (struct dentry *dentry);
+	int (*inode_killpriv) (struct dentry *dentry);
 	const char *(*inode_xattr_getsuffix) (void);
   	int (*inode_getsecurity)(const struct inode *inode, const char *name, void *buffer, size_t size, int err);
   	int (*inode_setsecurity)(struct inode *inode, const char *name, const void *value, size_t size, int flags);
@@ -1496,6 +1516,8 @@
 int security_inode_getxattr(struct dentry *dentry, char *name);
 int security_inode_listxattr(struct dentry *dentry);
 int security_inode_removexattr(struct dentry *dentry, char *name);
+int security_inode_need_killpriv(struct dentry *dentry);
+int security_inode_killpriv(struct dentry *dentry);
 const char *security_inode_xattr_getsuffix(void);
 int security_inode_getsecurity(const struct inode *inode, const char *name, void *buffer, size_t size, int err);
 int security_inode_setsecurity(struct inode *inode, const char *name, const void *value, size_t size, int flags);
@@ -1891,6 +1913,16 @@
 	return cap_inode_removexattr(dentry, name);
 }
 
+static inline int security_inode_need_killpriv(struct dentry *dentry)
+{
+	return cap_inode_need_killpriv(dentry);
+}
+
+static inline int security_inode_killpriv(struct dentry *dentry)
+{
+	return cap_inode_killpriv(dentry);
+}
+
 static inline const char *security_inode_xattr_getsuffix (void)
 {
 	return NULL ;
@@ -2035,12 +2067,12 @@
 
 static inline int security_task_setnice (struct task_struct *p, int nice)
 {
-	return 0;
+	return cap_task_setnice(p, nice);
 }
 
 static inline int security_task_setioprio (struct task_struct *p, int ioprio)
 {
-	return 0;
+	return cap_task_setioprio(p, ioprio);
 }
 
 static inline int security_task_getioprio (struct task_struct *p)
@@ -2058,7 +2090,7 @@
 					      int policy,
 					      struct sched_param *lp)
 {
-	return 0;
+	return cap_task_setscheduler(p, policy, lp);
 }
 
 static inline int security_task_getscheduler (struct task_struct *p)
@@ -2075,7 +2107,7 @@
 				      struct siginfo *info, int sig,
 				      u32 secid)
 {
-	return 0;
+	return cap_task_kill(p, info, sig, secid);
 }
 
 static inline int security_task_wait (struct task_struct *p)