Merge branch 'akpm' (incoming from Andrew)
Merge more incoming from Andrew Morton:
- Various fixes which were stalled or which I picked up recently
- A large rotorooting of the AIO code. Allegedly to improve
performance but I don't really have good performance numbers (I might
have lost the email) and I can't raise Kent today. I held this out
of 3.9 and we could give it another cycle if it's all too late/scary.
I ended up taking only the first two thirds of the AIO rotorooting. I
left the percpu parts and the batch completion for later. - Linus
* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (33 commits)
aio: don't include aio.h in sched.h
aio: kill ki_retry
aio: kill ki_key
aio: give shared kioctx fields their own cachelines
aio: kill struct aio_ring_info
aio: kill batch allocation
aio: change reqs_active to include unreaped completions
aio: use cancellation list lazily
aio: use flush_dcache_page()
aio: make aio_read_evt() more efficient, convert to hrtimers
wait: add wait_event_hrtimeout()
aio: refcounting cleanup
aio: make aio_put_req() lockless
aio: do fget() after aio_get_req()
aio: dprintk() -> pr_debug()
aio: move private stuff out of aio.h
aio: add kiocb_cancel()
aio: kill return value of aio_complete()
char: add aio_{read,write} to /dev/{null,zero}
aio: remove retry-based AIO
...
diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt
index 09027a9..ddf4f93 100644
--- a/Documentation/cgroups/memory.txt
+++ b/Documentation/cgroups/memory.txt
@@ -480,7 +480,9 @@
# per-memory cgroup local status
cache - # of bytes of page cache memory.
-rss - # of bytes of anonymous and swap cache memory.
+rss - # of bytes of anonymous and swap cache memory (includes
+ transparent hugepages).
+rss_huge - # of bytes of anonymous transparent hugepages.
mapped_file - # of bytes of mapped file (includes tmpfs/shmem)
pgpgin - # of charging events to the memory cgroup. The charging
event happens each time a page is accounted as either mapped
diff --git a/arch/s390/hypfs/inode.c b/arch/s390/hypfs/inode.c
index 5f7d7ba..7a539f4 100644
--- a/arch/s390/hypfs/inode.c
+++ b/arch/s390/hypfs/inode.c
@@ -21,6 +21,7 @@
#include <linux/module.h>
#include <linux/seq_file.h>
#include <linux/mount.h>
+#include <linux/aio.h>
#include <asm/ebcdic.h>
#include "hypfs.h"
diff --git a/arch/sparc/kernel/leon_smp.c b/arch/sparc/kernel/leon_smp.c
index 9b40c9c..6cfc1b0 100644
--- a/arch/sparc/kernel/leon_smp.c
+++ b/arch/sparc/kernel/leon_smp.c
@@ -253,24 +253,15 @@
/* Free unneeded trap tables */
if (!cpu_present(1)) {
- ClearPageReserved(virt_to_page(&trapbase_cpu1));
- init_page_count(virt_to_page(&trapbase_cpu1));
- free_page((unsigned long)&trapbase_cpu1);
- totalram_pages++;
+ free_reserved_page(virt_to_page(&trapbase_cpu1));
num_physpages++;
}
if (!cpu_present(2)) {
- ClearPageReserved(virt_to_page(&trapbase_cpu2));
- init_page_count(virt_to_page(&trapbase_cpu2));
- free_page((unsigned long)&trapbase_cpu2);
- totalram_pages++;
+ free_reserved_page(virt_to_page(&trapbase_cpu2));
num_physpages++;
}
if (!cpu_present(3)) {
- ClearPageReserved(virt_to_page(&trapbase_cpu3));
- init_page_count(virt_to_page(&trapbase_cpu3));
- free_page((unsigned long)&trapbase_cpu3);
- totalram_pages++;
+ free_reserved_page(virt_to_page(&trapbase_cpu3));
num_physpages++;
}
/* Ok, they are spinning and ready to go. */
diff --git a/arch/sparc/mm/init_32.c b/arch/sparc/mm/init_32.c
index 4490c39..af472cf 100644
--- a/arch/sparc/mm/init_32.c
+++ b/arch/sparc/mm/init_32.c
@@ -366,45 +366,14 @@
void free_initmem (void)
{
- unsigned long addr;
- unsigned long freed;
-
- addr = (unsigned long)(&__init_begin);
- freed = (unsigned long)(&__init_end) - addr;
- for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) {
- struct page *p;
-
- memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE);
- p = virt_to_page(addr);
-
- ClearPageReserved(p);
- init_page_count(p);
- __free_page(p);
- totalram_pages++;
- num_physpages++;
- }
- printk(KERN_INFO "Freeing unused kernel memory: %ldk freed\n",
- freed >> 10);
+ num_physpages += free_initmem_default(POISON_FREE_INITMEM);
}
#ifdef CONFIG_BLK_DEV_INITRD
void free_initrd_mem(unsigned long start, unsigned long end)
{
- if (start < end)
- printk(KERN_INFO "Freeing initrd memory: %ldk freed\n",
- (end - start) >> 10);
- for (; start < end; start += PAGE_SIZE) {
- struct page *p;
-
- memset((void *)start, POISON_FREE_INITMEM, PAGE_SIZE);
- p = virt_to_page(start);
-
- ClearPageReserved(p);
- init_page_count(p);
- __free_page(p);
- totalram_pages++;
- num_physpages++;
- }
+ num_physpages += free_reserved_area(start, end, POISON_FREE_INITMEM,
+ "initrd");
}
#endif
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index cf72a8a..a717199 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -2059,8 +2059,7 @@
/* We subtract one to account for the mem_map_zero page
* allocated below.
*/
- totalram_pages -= 1;
- num_physpages = totalram_pages;
+ num_physpages = totalram_pages - 1;
/*
* Set up the zero page, mark it reserved, so that page count
@@ -2071,7 +2070,7 @@
prom_printf("paging_init: Cannot alloc zero page.\n");
prom_halt();
}
- SetPageReserved(mem_map_zero);
+ mark_page_reserved(mem_map_zero);
codepages = (((unsigned long) _etext) - ((unsigned long) _start));
codepages = PAGE_ALIGN(codepages) >> PAGE_SHIFT;
@@ -2111,37 +2110,22 @@
initend = (unsigned long)(__init_end) & PAGE_MASK;
for (; addr < initend; addr += PAGE_SIZE) {
unsigned long page;
- struct page *p;
page = (addr +
((unsigned long) __va(kern_base)) -
((unsigned long) KERNBASE));
memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE);
- if (do_free) {
- p = virt_to_page(page);
-
- ClearPageReserved(p);
- init_page_count(p);
- __free_page(p);
- totalram_pages++;
- }
+ if (do_free)
+ free_reserved_page(virt_to_page(page));
}
}
#ifdef CONFIG_BLK_DEV_INITRD
void free_initrd_mem(unsigned long start, unsigned long end)
{
- if (start < end)
- printk ("Freeing initrd memory: %ldk freed\n", (end - start) >> 10);
- for (; start < end; start += PAGE_SIZE) {
- struct page *p = virt_to_page(start);
-
- ClearPageReserved(p);
- init_page_count(p);
- __free_page(p);
- totalram_pages++;
- }
+ num_physpages += free_reserved_area(start, end, POISON_FREE_INITMEM,
+ "initrd");
}
#endif
diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index 9a87daa..a5ffcc9 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -27,6 +27,7 @@
#include <linux/ratelimit.h>
#include <linux/slab.h>
#include <linux/times.h>
+#include <linux/uio.h>
#include <asm/uaccess.h>
#include <scsi/scsi.h>
diff --git a/drivers/char/mem.c b/drivers/char/mem.c
index 2c644af..1ccbe94 100644
--- a/drivers/char/mem.c
+++ b/drivers/char/mem.c
@@ -28,6 +28,7 @@
#include <linux/pfn.h>
#include <linux/export.h>
#include <linux/io.h>
+#include <linux/aio.h>
#include <asm/uaccess.h>
@@ -627,6 +628,18 @@
return count;
}
+static ssize_t aio_read_null(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos)
+{
+ return 0;
+}
+
+static ssize_t aio_write_null(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos)
+{
+ return iov_length(iov, nr_segs);
+}
+
static int pipe_to_null(struct pipe_inode_info *info, struct pipe_buffer *buf,
struct splice_desc *sd)
{
@@ -670,6 +683,24 @@
return written ? written : -EFAULT;
}
+static ssize_t aio_read_zero(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos)
+{
+ size_t written = 0;
+ unsigned long i;
+ ssize_t ret;
+
+ for (i = 0; i < nr_segs; i++) {
+ ret = read_zero(iocb->ki_filp, iov[i].iov_base, iov[i].iov_len,
+ &pos);
+ if (ret < 0)
+ break;
+ written += ret;
+ }
+
+ return written ? written : -EFAULT;
+}
+
static int mmap_zero(struct file *file, struct vm_area_struct *vma)
{
#ifndef CONFIG_MMU
@@ -738,6 +769,7 @@
#define full_lseek null_lseek
#define write_zero write_null
#define read_full read_zero
+#define aio_write_zero aio_write_null
#define open_mem open_port
#define open_kmem open_mem
#define open_oldmem open_mem
@@ -766,6 +798,8 @@
.llseek = null_lseek,
.read = read_null,
.write = write_null,
+ .aio_read = aio_read_null,
+ .aio_write = aio_write_null,
.splice_write = splice_write_null,
};
@@ -782,6 +816,8 @@
.llseek = zero_lseek,
.read = read_zero,
.write = write_zero,
+ .aio_read = aio_read_zero,
+ .aio_write = aio_write_zero,
.mmap = mmap_zero,
};
diff --git a/drivers/infiniband/hw/cxgb3/cxio_resource.c b/drivers/infiniband/hw/cxgb3/cxio_resource.c
index 31f9201..c40088e 100644
--- a/drivers/infiniband/hw/cxgb3/cxio_resource.c
+++ b/drivers/infiniband/hw/cxgb3/cxio_resource.c
@@ -62,13 +62,13 @@
kfifo_in(fifo, (unsigned char *) &entry, sizeof(u32));
if (random) {
j = 0;
- random_bytes = random32();
+ random_bytes = prandom_u32();
for (i = 0; i < RANDOM_SIZE; i++)
rarray[i] = i + skip_low;
for (i = skip_low + RANDOM_SIZE; i < nr - skip_high; i++) {
if (j >= RANDOM_SIZE) {
j = 0;
- random_bytes = random32();
+ random_bytes = prandom_u32();
}
idx = (random_bytes >> (j * 2)) & 0xF;
kfifo_in(fifo,
diff --git a/drivers/infiniband/hw/cxgb4/id_table.c b/drivers/infiniband/hw/cxgb4/id_table.c
index f95e5df..0161ae6 100644
--- a/drivers/infiniband/hw/cxgb4/id_table.c
+++ b/drivers/infiniband/hw/cxgb4/id_table.c
@@ -54,7 +54,7 @@
if (obj < alloc->max) {
if (alloc->flags & C4IW_ID_TABLE_F_RANDOM)
- alloc->last += random32() % RANDOM_SKIP;
+ alloc->last += prandom_u32() % RANDOM_SKIP;
else
alloc->last = obj + 1;
if (alloc->last >= alloc->max)
@@ -88,7 +88,7 @@
alloc->start = start;
alloc->flags = flags;
if (flags & C4IW_ID_TABLE_F_RANDOM)
- alloc->last = random32() % RANDOM_SKIP;
+ alloc->last = prandom_u32() % RANDOM_SKIP;
else
alloc->last = 0;
alloc->max = num;
diff --git a/drivers/infiniband/hw/ipath/ipath_file_ops.c b/drivers/infiniband/hw/ipath/ipath_file_ops.c
index aed8afe..6d7f453 100644
--- a/drivers/infiniband/hw/ipath/ipath_file_ops.c
+++ b/drivers/infiniband/hw/ipath/ipath_file_ops.c
@@ -40,6 +40,7 @@
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/io.h>
+#include <linux/aio.h>
#include <linux/jiffies.h>
#include <linux/cpu.h>
#include <asm/pgtable.h>
diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c
index 934792c..4d599ce 100644
--- a/drivers/infiniband/hw/mlx4/mad.c
+++ b/drivers/infiniband/hw/mlx4/mad.c
@@ -93,7 +93,7 @@
__be64 mlx4_ib_gen_node_guid(void)
{
#define NODE_GUID_HI ((u64) (((u64)IB_OPENIB_OUI) << 40))
- return cpu_to_be64(NODE_GUID_HI | random32());
+ return cpu_to_be64(NODE_GUID_HI | prandom_u32());
}
__be64 mlx4_ib_get_new_demux_tid(struct mlx4_ib_demux_ctx *ctx)
diff --git a/drivers/infiniband/hw/qib/qib_file_ops.c b/drivers/infiniband/hw/qib/qib_file_ops.c
index 4f7aa30..b56c942 100644
--- a/drivers/infiniband/hw/qib/qib_file_ops.c
+++ b/drivers/infiniband/hw/qib/qib_file_ops.c
@@ -39,7 +39,7 @@
#include <linux/vmalloc.h>
#include <linux/highmem.h>
#include <linux/io.h>
-#include <linux/uio.h>
+#include <linux/aio.h>
#include <linux/jiffies.h>
#include <asm/pgtable.h>
#include <linux/delay.h>
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
index 1ef880d..3eceb61 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
@@ -460,7 +460,7 @@
goto err_qp;
}
- psn = random32() & 0xffffff;
+ psn = prandom_u32() & 0xffffff;
ret = ipoib_cm_modify_rx_qp(dev, cm_id, p->qp, psn);
if (ret)
goto err_modify;
diff --git a/drivers/net/ethernet/broadcom/cnic.c b/drivers/net/ethernet/broadcom/cnic.c
index 40649a8..6b0dc13 100644
--- a/drivers/net/ethernet/broadcom/cnic.c
+++ b/drivers/net/ethernet/broadcom/cnic.c
@@ -4085,7 +4085,7 @@
if (!cp->csk_tbl)
return -ENOMEM;
- port_id = random32();
+ port_id = prandom_u32();
port_id %= CNIC_LOCAL_PORT_RANGE;
if (cnic_init_id_tbl(&cp->csk_port_tbl, CNIC_LOCAL_PORT_RANGE,
CNIC_LOCAL_PORT_MIN, port_id)) {
@@ -4145,7 +4145,7 @@
{
u32 seed;
- seed = random32();
+ seed = prandom_u32();
cnic_ctx_wr(dev, 45, 0, seed);
return 0;
}
diff --git a/drivers/net/hamradio/baycom_epp.c b/drivers/net/hamradio/baycom_epp.c
index 49b8b58..484f77e 100644
--- a/drivers/net/hamradio/baycom_epp.c
+++ b/drivers/net/hamradio/baycom_epp.c
@@ -449,7 +449,7 @@
if ((--bc->hdlctx.slotcnt) > 0)
return 0;
bc->hdlctx.slotcnt = bc->ch_params.slottime;
- if ((random32() % 256) > bc->ch_params.ppersist)
+ if ((prandom_u32() % 256) > bc->ch_params.ppersist)
return 0;
}
}
diff --git a/drivers/net/hamradio/hdlcdrv.c b/drivers/net/hamradio/hdlcdrv.c
index a4a3516..3169252 100644
--- a/drivers/net/hamradio/hdlcdrv.c
+++ b/drivers/net/hamradio/hdlcdrv.c
@@ -389,7 +389,7 @@
if ((--s->hdlctx.slotcnt) > 0)
return;
s->hdlctx.slotcnt = s->ch_params.slottime;
- if ((random32() % 256) > s->ch_params.ppersist)
+ if ((prandom_u32() % 256) > s->ch_params.ppersist)
return;
start_tx(dev, s);
}
diff --git a/drivers/net/hamradio/yam.c b/drivers/net/hamradio/yam.c
index b2d863f..0721e72 100644
--- a/drivers/net/hamradio/yam.c
+++ b/drivers/net/hamradio/yam.c
@@ -638,7 +638,7 @@
yp->slotcnt = yp->slot / 10;
/* is random > persist ? */
- if ((random32() % 256) > yp->pers)
+ if ((prandom_u32() % 256) > yp->pers)
return;
yam_start_tx(dev, yp);
diff --git a/drivers/net/team/team_mode_random.c b/drivers/net/team/team_mode_random.c
index 9eabfaa..5ca14d4 100644
--- a/drivers/net/team/team_mode_random.c
+++ b/drivers/net/team/team_mode_random.c
@@ -18,7 +18,7 @@
static u32 random_N(unsigned int N)
{
- return reciprocal_divide(random32(), N);
+ return reciprocal_divide(prandom_u32(), N);
}
static bool rnd_transmit(struct team *team, struct sk_buff *skb)
diff --git a/drivers/net/wireless/brcm80211/brcmfmac/p2p.c b/drivers/net/wireless/brcm80211/brcmfmac/p2p.c
index 2b90da0..e7a1a47 100644
--- a/drivers/net/wireless/brcm80211/brcmfmac/p2p.c
+++ b/drivers/net/wireless/brcm80211/brcmfmac/p2p.c
@@ -1117,7 +1117,7 @@
if (afx_hdl->is_listen && afx_hdl->my_listen_chan)
/* 100ms ~ 300ms */
err = brcmf_p2p_discover_listen(p2p, afx_hdl->my_listen_chan,
- 100 * (1 + (random32() % 3)));
+ 100 * (1 + prandom_u32() % 3));
else
err = brcmf_p2p_act_frm_search(p2p, afx_hdl->peer_listen_chan);
diff --git a/drivers/net/wireless/mwifiex/cfg80211.c b/drivers/net/wireless/mwifiex/cfg80211.c
index a0cb077..d3c8ece 100644
--- a/drivers/net/wireless/mwifiex/cfg80211.c
+++ b/drivers/net/wireless/mwifiex/cfg80211.c
@@ -216,7 +216,7 @@
mwifiex_form_mgmt_frame(skb, buf, len);
mwifiex_queue_tx_pkt(priv, skb);
- *cookie = random32() | 1;
+ *cookie = prandom_u32() | 1;
cfg80211_mgmt_tx_status(wdev, *cookie, buf, len, true, GFP_ATOMIC);
wiphy_dbg(wiphy, "info: management frame transmitted\n");
@@ -271,7 +271,7 @@
duration);
if (!ret) {
- *cookie = random32() | 1;
+ *cookie = prandom_u32() | 1;
priv->roc_cfg.cookie = *cookie;
priv->roc_cfg.chan = *chan;
diff --git a/drivers/rtc/rtc-rs5c372.c b/drivers/rtc/rtc-rs5c372.c
index 224d634..ccf54f0 100644
--- a/drivers/rtc/rtc-rs5c372.c
+++ b/drivers/rtc/rtc-rs5c372.c
@@ -68,6 +68,7 @@
enum rtc_type {
rtc_undef = 0,
rtc_r2025sd,
+ rtc_r2221tl,
rtc_rs5c372a,
rtc_rs5c372b,
rtc_rv5c386,
@@ -76,6 +77,7 @@
static const struct i2c_device_id rs5c372_id[] = {
{ "r2025sd", rtc_r2025sd },
+ { "r2221tl", rtc_r2221tl },
{ "rs5c372a", rtc_rs5c372a },
{ "rs5c372b", rtc_rs5c372b },
{ "rv5c386", rtc_rv5c386 },
@@ -529,6 +531,7 @@
rs5c372->time24 = 1;
break;
case rtc_r2025sd:
+ case rtc_r2221tl:
case rtc_rv5c386:
case rtc_rv5c387a:
buf[0] |= RV5C387_CTRL1_24;
@@ -609,6 +612,7 @@
rs5c372->time24 = 1;
break;
case rtc_r2025sd:
+ case rtc_r2221tl:
case rtc_rv5c386:
case rtc_rv5c387a:
if (rs5c372->regs[RS5C_REG_CTRL1] & RV5C387_CTRL1_24)
@@ -640,6 +644,7 @@
dev_info(&client->dev, "%s found, %s, driver version " DRV_VERSION "\n",
({ char *s; switch (rs5c372->type) {
case rtc_r2025sd: s = "r2025sd"; break;
+ case rtc_r2221tl: s = "r2221tl"; break;
case rtc_rs5c372a: s = "rs5c372a"; break;
case rtc_rs5c372b: s = "rs5c372b"; break;
case rtc_rv5c386: s = "rv5c386"; break;
diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c
index 9f0c465..df5e961 100644
--- a/drivers/scsi/sg.c
+++ b/drivers/scsi/sg.c
@@ -35,6 +35,7 @@
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/mm.h>
+#include <linux/aio.h>
#include <linux/errno.h>
#include <linux/mtio.h>
#include <linux/ioctl.h>
diff --git a/drivers/staging/android/logger.c b/drivers/staging/android/logger.c
index b14a557..b040200 100644
--- a/drivers/staging/android/logger.c
+++ b/drivers/staging/android/logger.c
@@ -28,6 +28,7 @@
#include <linux/slab.h>
#include <linux/time.h>
#include <linux/vmalloc.h>
+#include <linux/aio.h>
#include "logger.h"
#include <asm/ioctls.h>
diff --git a/drivers/usb/gadget/inode.c b/drivers/usb/gadget/inode.c
index dda0dc4..570c005 100644
--- a/drivers/usb/gadget/inode.c
+++ b/drivers/usb/gadget/inode.c
@@ -24,6 +24,8 @@
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/poll.h>
+#include <linux/mmu_context.h>
+#include <linux/aio.h>
#include <linux/device.h>
#include <linux/moduleparam.h>
@@ -513,6 +515,9 @@
struct kiocb_priv {
struct usb_request *req;
struct ep_data *epdata;
+ struct kiocb *iocb;
+ struct mm_struct *mm;
+ struct work_struct work;
void *buf;
const struct iovec *iv;
unsigned long nr_segs;
@@ -528,7 +533,6 @@
local_irq_disable();
epdata = priv->epdata;
// spin_lock(&epdata->dev->lock);
- kiocbSetCancelled(iocb);
if (likely(epdata && epdata->ep && priv->req))
value = usb_ep_dequeue (epdata->ep, priv->req);
else
@@ -540,15 +544,12 @@
return value;
}
-static ssize_t ep_aio_read_retry(struct kiocb *iocb)
+static ssize_t ep_copy_to_user(struct kiocb_priv *priv)
{
- struct kiocb_priv *priv = iocb->private;
ssize_t len, total;
void *to_copy;
int i;
- /* we "retry" to get the right mm context for this: */
-
/* copy stuff into user buffers */
total = priv->actual;
len = 0;
@@ -568,9 +569,26 @@
if (total == 0)
break;
}
+
+ return len;
+}
+
+static void ep_user_copy_worker(struct work_struct *work)
+{
+ struct kiocb_priv *priv = container_of(work, struct kiocb_priv, work);
+ struct mm_struct *mm = priv->mm;
+ struct kiocb *iocb = priv->iocb;
+ size_t ret;
+
+ use_mm(mm);
+ ret = ep_copy_to_user(priv);
+ unuse_mm(mm);
+
+ /* completing the iocb can drop the ctx and mm, don't touch mm after */
+ aio_complete(iocb, ret, ret);
+
kfree(priv->buf);
kfree(priv);
- return len;
}
static void ep_aio_complete(struct usb_ep *ep, struct usb_request *req)
@@ -596,14 +614,14 @@
aio_complete(iocb, req->actual ? req->actual : req->status,
req->status);
} else {
- /* retry() won't report both; so we hide some faults */
+ /* ep_copy_to_user() won't report both; we hide some faults */
if (unlikely(0 != req->status))
DBG(epdata->dev, "%s fault %d len %d\n",
ep->name, req->status, req->actual);
priv->buf = req->buf;
priv->actual = req->actual;
- kick_iocb(iocb);
+ schedule_work(&priv->work);
}
spin_unlock(&epdata->dev->lock);
@@ -633,8 +651,10 @@
return value;
}
iocb->private = priv;
+ priv->iocb = iocb;
priv->iv = iv;
priv->nr_segs = nr_segs;
+ INIT_WORK(&priv->work, ep_user_copy_worker);
value = get_ready_ep(iocb->ki_filp->f_flags, epdata);
if (unlikely(value < 0)) {
@@ -642,10 +662,11 @@
goto fail;
}
- iocb->ki_cancel = ep_aio_cancel;
+ kiocb_set_cancel_fn(iocb, ep_aio_cancel);
get_ep(epdata);
priv->epdata = epdata;
priv->actual = 0;
+ priv->mm = current->mm; /* mm teardown waits for iocbs in exit_aio() */
/* each kiocb is coupled to one usb_request, but we can't
* allocate or submit those if the host disconnected.
@@ -674,7 +695,7 @@
kfree(priv);
put_ep(epdata);
} else
- value = (iv ? -EIOCBRETRY : -EIOCBQUEUED);
+ value = -EIOCBQUEUED;
return value;
}
@@ -692,7 +713,6 @@
if (unlikely(!buf))
return -ENOMEM;
- iocb->ki_retry = ep_aio_read_retry;
return ep_aio_rwtail(iocb, buf, iocb->ki_left, epdata, iov, nr_segs);
}
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index 0ad61c6..055562c 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -33,6 +33,7 @@
#include <linux/pagemap.h>
#include <linux/idr.h>
#include <linux/sched.h>
+#include <linux/aio.h>
#include <net/9p/9p.h>
#include <net/9p/client.h>
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 7e03ead..a890db4 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -14,6 +14,7 @@
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/pagevec.h>
+#include <linux/aio.h>
#include "internal.h"
static int afs_write_back_from_locked_page(struct afs_writeback *wb,
diff --git a/fs/aio.c b/fs/aio.c
index 351afe7..c5b1a8c 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -8,6 +8,8 @@
*
* See ../COPYING for licensing terms.
*/
+#define pr_fmt(fmt) "%s: " fmt, __func__
+
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/errno.h>
@@ -18,8 +20,6 @@
#include <linux/backing-dev.h>
#include <linux/uio.h>
-#define DEBUG 0
-
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/file.h>
@@ -39,11 +39,76 @@
#include <asm/kmap_types.h>
#include <asm/uaccess.h>
-#if DEBUG > 1
-#define dprintk printk
-#else
-#define dprintk(x...) do { ; } while (0)
-#endif
+#define AIO_RING_MAGIC 0xa10a10a1
+#define AIO_RING_COMPAT_FEATURES 1
+#define AIO_RING_INCOMPAT_FEATURES 0
+struct aio_ring {
+ unsigned id; /* kernel internal index number */
+ unsigned nr; /* number of io_events */
+ unsigned head;
+ unsigned tail;
+
+ unsigned magic;
+ unsigned compat_features;
+ unsigned incompat_features;
+ unsigned header_length; /* size of aio_ring */
+
+
+ struct io_event io_events[0];
+}; /* 128 bytes + ring size */
+
+#define AIO_RING_PAGES 8
+
+struct kioctx {
+ atomic_t users;
+ atomic_t dead;
+
+ /* This needs improving */
+ unsigned long user_id;
+ struct hlist_node list;
+
+ /*
+ * This is what userspace passed to io_setup(), it's not used for
+ * anything but counting against the global max_reqs quota.
+ *
+ * The real limit is nr_events - 1, which will be larger (see
+ * aio_setup_ring())
+ */
+ unsigned max_reqs;
+
+ /* Size of ringbuffer, in units of struct io_event */
+ unsigned nr_events;
+
+ unsigned long mmap_base;
+ unsigned long mmap_size;
+
+ struct page **ring_pages;
+ long nr_pages;
+
+ struct rcu_head rcu_head;
+ struct work_struct rcu_work;
+
+ struct {
+ atomic_t reqs_active;
+ } ____cacheline_aligned_in_smp;
+
+ struct {
+ spinlock_t ctx_lock;
+ struct list_head active_reqs; /* used for cancellation */
+ } ____cacheline_aligned_in_smp;
+
+ struct {
+ struct mutex ring_lock;
+ wait_queue_head_t wait;
+ } ____cacheline_aligned_in_smp;
+
+ struct {
+ unsigned tail;
+ spinlock_t completion_lock;
+ } ____cacheline_aligned_in_smp;
+
+ struct page *internal_pages[AIO_RING_PAGES];
+};
/*------ sysctl variables----*/
static DEFINE_SPINLOCK(aio_nr_lock);
@@ -54,11 +119,6 @@
static struct kmem_cache *kiocb_cachep;
static struct kmem_cache *kioctx_cachep;
-static struct workqueue_struct *aio_wq;
-
-static void aio_kick_handler(struct work_struct *);
-static void aio_queue_work(struct kioctx *);
-
/* aio_setup
* Creates the slab caches used by the aio routines, panic on
* failure as this is done early during the boot sequence.
@@ -68,10 +128,7 @@
kiocb_cachep = KMEM_CACHE(kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC);
kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC);
- aio_wq = alloc_workqueue("aio", 0, 1); /* used to limit concurrency */
- BUG_ON(!aio_wq);
-
- pr_debug("aio_setup: sizeof(struct page) = %d\n", (int)sizeof(struct page));
+ pr_debug("sizeof(struct page) = %zu\n", sizeof(struct page));
return 0;
}
@@ -79,28 +136,23 @@
static void aio_free_ring(struct kioctx *ctx)
{
- struct aio_ring_info *info = &ctx->ring_info;
long i;
- for (i=0; i<info->nr_pages; i++)
- put_page(info->ring_pages[i]);
+ for (i = 0; i < ctx->nr_pages; i++)
+ put_page(ctx->ring_pages[i]);
- if (info->mmap_size) {
- BUG_ON(ctx->mm != current->mm);
- vm_munmap(info->mmap_base, info->mmap_size);
- }
+ if (ctx->mmap_size)
+ vm_munmap(ctx->mmap_base, ctx->mmap_size);
- if (info->ring_pages && info->ring_pages != info->internal_pages)
- kfree(info->ring_pages);
- info->ring_pages = NULL;
- info->nr = 0;
+ if (ctx->ring_pages && ctx->ring_pages != ctx->internal_pages)
+ kfree(ctx->ring_pages);
}
static int aio_setup_ring(struct kioctx *ctx)
{
struct aio_ring *ring;
- struct aio_ring_info *info = &ctx->ring_info;
unsigned nr_events = ctx->max_reqs;
+ struct mm_struct *mm = current->mm;
unsigned long size, populate;
int nr_pages;
@@ -116,46 +168,44 @@
nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) / sizeof(struct io_event);
- info->nr = 0;
- info->ring_pages = info->internal_pages;
+ ctx->nr_events = 0;
+ ctx->ring_pages = ctx->internal_pages;
if (nr_pages > AIO_RING_PAGES) {
- info->ring_pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
- if (!info->ring_pages)
+ ctx->ring_pages = kcalloc(nr_pages, sizeof(struct page *),
+ GFP_KERNEL);
+ if (!ctx->ring_pages)
return -ENOMEM;
}
- info->mmap_size = nr_pages * PAGE_SIZE;
- dprintk("attempting mmap of %lu bytes\n", info->mmap_size);
- down_write(&ctx->mm->mmap_sem);
- info->mmap_base = do_mmap_pgoff(NULL, 0, info->mmap_size,
- PROT_READ|PROT_WRITE,
- MAP_ANONYMOUS|MAP_PRIVATE, 0,
- &populate);
- if (IS_ERR((void *)info->mmap_base)) {
- up_write(&ctx->mm->mmap_sem);
- info->mmap_size = 0;
+ ctx->mmap_size = nr_pages * PAGE_SIZE;
+ pr_debug("attempting mmap of %lu bytes\n", ctx->mmap_size);
+ down_write(&mm->mmap_sem);
+ ctx->mmap_base = do_mmap_pgoff(NULL, 0, ctx->mmap_size,
+ PROT_READ|PROT_WRITE,
+ MAP_ANONYMOUS|MAP_PRIVATE, 0, &populate);
+ if (IS_ERR((void *)ctx->mmap_base)) {
+ up_write(&mm->mmap_sem);
+ ctx->mmap_size = 0;
aio_free_ring(ctx);
return -EAGAIN;
}
- dprintk("mmap address: 0x%08lx\n", info->mmap_base);
- info->nr_pages = get_user_pages(current, ctx->mm,
- info->mmap_base, nr_pages,
- 1, 0, info->ring_pages, NULL);
- up_write(&ctx->mm->mmap_sem);
+ pr_debug("mmap address: 0x%08lx\n", ctx->mmap_base);
+ ctx->nr_pages = get_user_pages(current, mm, ctx->mmap_base, nr_pages,
+ 1, 0, ctx->ring_pages, NULL);
+ up_write(&mm->mmap_sem);
- if (unlikely(info->nr_pages != nr_pages)) {
+ if (unlikely(ctx->nr_pages != nr_pages)) {
aio_free_ring(ctx);
return -EAGAIN;
}
if (populate)
- mm_populate(info->mmap_base, populate);
+ mm_populate(ctx->mmap_base, populate);
- ctx->user_id = info->mmap_base;
+ ctx->user_id = ctx->mmap_base;
+ ctx->nr_events = nr_events; /* trusted copy */
- info->nr = nr_events; /* trusted copy */
-
- ring = kmap_atomic(info->ring_pages[0]);
+ ring = kmap_atomic(ctx->ring_pages[0]);
ring->nr = nr_events; /* user copy */
ring->id = ctx->user_id;
ring->head = ring->tail = 0;
@@ -164,72 +214,133 @@
ring->incompat_features = AIO_RING_INCOMPAT_FEATURES;
ring->header_length = sizeof(struct aio_ring);
kunmap_atomic(ring);
+ flush_dcache_page(ctx->ring_pages[0]);
return 0;
}
-
-/* aio_ring_event: returns a pointer to the event at the given index from
- * kmap_atomic(). Release the pointer with put_aio_ring_event();
- */
#define AIO_EVENTS_PER_PAGE (PAGE_SIZE / sizeof(struct io_event))
#define AIO_EVENTS_FIRST_PAGE ((PAGE_SIZE - sizeof(struct aio_ring)) / sizeof(struct io_event))
#define AIO_EVENTS_OFFSET (AIO_EVENTS_PER_PAGE - AIO_EVENTS_FIRST_PAGE)
-#define aio_ring_event(info, nr) ({ \
- unsigned pos = (nr) + AIO_EVENTS_OFFSET; \
- struct io_event *__event; \
- __event = kmap_atomic( \
- (info)->ring_pages[pos / AIO_EVENTS_PER_PAGE]); \
- __event += pos % AIO_EVENTS_PER_PAGE; \
- __event; \
-})
+void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel)
+{
+ struct kioctx *ctx = req->ki_ctx;
+ unsigned long flags;
-#define put_aio_ring_event(event) do { \
- struct io_event *__event = (event); \
- (void)__event; \
- kunmap_atomic((void *)((unsigned long)__event & PAGE_MASK)); \
-} while(0)
+ spin_lock_irqsave(&ctx->ctx_lock, flags);
-static void ctx_rcu_free(struct rcu_head *head)
+ if (!req->ki_list.next)
+ list_add(&req->ki_list, &ctx->active_reqs);
+
+ req->ki_cancel = cancel;
+
+ spin_unlock_irqrestore(&ctx->ctx_lock, flags);
+}
+EXPORT_SYMBOL(kiocb_set_cancel_fn);
+
+static int kiocb_cancel(struct kioctx *ctx, struct kiocb *kiocb,
+ struct io_event *res)
+{
+ kiocb_cancel_fn *old, *cancel;
+ int ret = -EINVAL;
+
+ /*
+ * Don't want to set kiocb->ki_cancel = KIOCB_CANCELLED unless it
+ * actually has a cancel function, hence the cmpxchg()
+ */
+
+ cancel = ACCESS_ONCE(kiocb->ki_cancel);
+ do {
+ if (!cancel || cancel == KIOCB_CANCELLED)
+ return ret;
+
+ old = cancel;
+ cancel = cmpxchg(&kiocb->ki_cancel, old, KIOCB_CANCELLED);
+ } while (cancel != old);
+
+ atomic_inc(&kiocb->ki_users);
+ spin_unlock_irq(&ctx->ctx_lock);
+
+ memset(res, 0, sizeof(*res));
+ res->obj = (u64)(unsigned long)kiocb->ki_obj.user;
+ res->data = kiocb->ki_user_data;
+ ret = cancel(kiocb, res);
+
+ spin_lock_irq(&ctx->ctx_lock);
+
+ return ret;
+}
+
+static void free_ioctx_rcu(struct rcu_head *head)
{
struct kioctx *ctx = container_of(head, struct kioctx, rcu_head);
kmem_cache_free(kioctx_cachep, ctx);
}
-/* __put_ioctx
- * Called when the last user of an aio context has gone away,
- * and the struct needs to be freed.
+/*
+ * When this function runs, the kioctx has been removed from the "hash table"
+ * and ctx->users has dropped to 0, so we know no more kiocbs can be submitted -
+ * now it's safe to cancel any that need to be.
*/
-static void __put_ioctx(struct kioctx *ctx)
+static void free_ioctx(struct kioctx *ctx)
{
- unsigned nr_events = ctx->max_reqs;
- BUG_ON(ctx->reqs_active);
+ struct aio_ring *ring;
+ struct io_event res;
+ struct kiocb *req;
+ unsigned head, avail;
- cancel_delayed_work_sync(&ctx->wq);
- aio_free_ring(ctx);
- mmdrop(ctx->mm);
- ctx->mm = NULL;
- if (nr_events) {
- spin_lock(&aio_nr_lock);
- BUG_ON(aio_nr - nr_events > aio_nr);
- aio_nr -= nr_events;
- spin_unlock(&aio_nr_lock);
+ spin_lock_irq(&ctx->ctx_lock);
+
+ while (!list_empty(&ctx->active_reqs)) {
+ req = list_first_entry(&ctx->active_reqs,
+ struct kiocb, ki_list);
+
+ list_del_init(&req->ki_list);
+ kiocb_cancel(ctx, req, &res);
}
- pr_debug("__put_ioctx: freeing %p\n", ctx);
- call_rcu(&ctx->rcu_head, ctx_rcu_free);
+
+ spin_unlock_irq(&ctx->ctx_lock);
+
+ ring = kmap_atomic(ctx->ring_pages[0]);
+ head = ring->head;
+ kunmap_atomic(ring);
+
+ while (atomic_read(&ctx->reqs_active) > 0) {
+ wait_event(ctx->wait, head != ctx->tail);
+
+ avail = (head <= ctx->tail ? ctx->tail : ctx->nr_events) - head;
+
+ atomic_sub(avail, &ctx->reqs_active);
+ head += avail;
+ head %= ctx->nr_events;
+ }
+
+ WARN_ON(atomic_read(&ctx->reqs_active) < 0);
+
+ aio_free_ring(ctx);
+
+ spin_lock(&aio_nr_lock);
+ BUG_ON(aio_nr - ctx->max_reqs > aio_nr);
+ aio_nr -= ctx->max_reqs;
+ spin_unlock(&aio_nr_lock);
+
+ pr_debug("freeing %p\n", ctx);
+
+ /*
+ * Here the call_rcu() is between the wait_event() for reqs_active to
+ * hit 0, and freeing the ioctx.
+ *
+ * aio_complete() decrements reqs_active, but it has to touch the ioctx
+ * after to issue a wakeup so we use rcu.
+ */
+ call_rcu(&ctx->rcu_head, free_ioctx_rcu);
}
-static inline int try_get_ioctx(struct kioctx *kioctx)
+static void put_ioctx(struct kioctx *ctx)
{
- return atomic_inc_not_zero(&kioctx->users);
-}
-
-static inline void put_ioctx(struct kioctx *kioctx)
-{
- BUG_ON(atomic_read(&kioctx->users) <= 0);
- if (unlikely(atomic_dec_and_test(&kioctx->users)))
- __put_ioctx(kioctx);
+ if (unlikely(atomic_dec_and_test(&ctx->users)))
+ free_ioctx(ctx);
}
/* ioctx_alloc
@@ -237,7 +348,7 @@
*/
static struct kioctx *ioctx_alloc(unsigned nr_events)
{
- struct mm_struct *mm;
+ struct mm_struct *mm = current->mm;
struct kioctx *ctx;
int err = -ENOMEM;
@@ -256,17 +367,15 @@
return ERR_PTR(-ENOMEM);
ctx->max_reqs = nr_events;
- mm = ctx->mm = current->mm;
- atomic_inc(&mm->mm_count);
atomic_set(&ctx->users, 2);
+ atomic_set(&ctx->dead, 0);
spin_lock_init(&ctx->ctx_lock);
- spin_lock_init(&ctx->ring_info.ring_lock);
+ spin_lock_init(&ctx->completion_lock);
+ mutex_init(&ctx->ring_lock);
init_waitqueue_head(&ctx->wait);
INIT_LIST_HEAD(&ctx->active_reqs);
- INIT_LIST_HEAD(&ctx->run_list);
- INIT_DELAYED_WORK(&ctx->wq, aio_kick_handler);
if (aio_setup_ring(ctx) < 0)
goto out_freectx;
@@ -286,64 +395,56 @@
hlist_add_head_rcu(&ctx->list, &mm->ioctx_list);
spin_unlock(&mm->ioctx_lock);
- dprintk("aio: allocated ioctx %p[%ld]: mm=%p mask=0x%x\n",
- ctx, ctx->user_id, current->mm, ctx->ring_info.nr);
+ pr_debug("allocated ioctx %p[%ld]: mm=%p mask=0x%x\n",
+ ctx, ctx->user_id, mm, ctx->nr_events);
return ctx;
out_cleanup:
err = -EAGAIN;
aio_free_ring(ctx);
out_freectx:
- mmdrop(mm);
kmem_cache_free(kioctx_cachep, ctx);
- dprintk("aio: error allocating ioctx %d\n", err);
+ pr_debug("error allocating ioctx %d\n", err);
return ERR_PTR(err);
}
-/* kill_ctx
- * Cancels all outstanding aio requests on an aio context. Used
- * when the processes owning a context have all exited to encourage
+static void kill_ioctx_work(struct work_struct *work)
+{
+ struct kioctx *ctx = container_of(work, struct kioctx, rcu_work);
+
+ wake_up_all(&ctx->wait);
+ put_ioctx(ctx);
+}
+
+static void kill_ioctx_rcu(struct rcu_head *head)
+{
+ struct kioctx *ctx = container_of(head, struct kioctx, rcu_head);
+
+ INIT_WORK(&ctx->rcu_work, kill_ioctx_work);
+ schedule_work(&ctx->rcu_work);
+}
+
+/* kill_ioctx
+ * Cancels all outstanding aio requests on an aio context. Used
+ * when the processes owning a context have all exited to encourage
* the rapid destruction of the kioctx.
*/
-static void kill_ctx(struct kioctx *ctx)
+static void kill_ioctx(struct kioctx *ctx)
{
- int (*cancel)(struct kiocb *, struct io_event *);
- struct task_struct *tsk = current;
- DECLARE_WAITQUEUE(wait, tsk);
- struct io_event res;
+ if (!atomic_xchg(&ctx->dead, 1)) {
+ hlist_del_rcu(&ctx->list);
+ /* Between hlist_del_rcu() and dropping the initial ref */
+ synchronize_rcu();
- spin_lock_irq(&ctx->ctx_lock);
- ctx->dead = 1;
- while (!list_empty(&ctx->active_reqs)) {
- struct list_head *pos = ctx->active_reqs.next;
- struct kiocb *iocb = list_kiocb(pos);
- list_del_init(&iocb->ki_list);
- cancel = iocb->ki_cancel;
- kiocbSetCancelled(iocb);
- if (cancel) {
- iocb->ki_users++;
- spin_unlock_irq(&ctx->ctx_lock);
- cancel(iocb, &res);
- spin_lock_irq(&ctx->ctx_lock);
- }
+ /*
+ * We can't punt to workqueue here because put_ioctx() ->
+ * free_ioctx() will unmap the ringbuffer, and that has to be
+ * done in the original process's context. kill_ioctx_rcu/work()
+ * exist for exit_aio(), as in that path free_ioctx() won't do
+ * the unmap.
+ */
+ kill_ioctx_work(&ctx->rcu_work);
}
-
- if (!ctx->reqs_active)
- goto out;
-
- add_wait_queue(&ctx->wait, &wait);
- set_task_state(tsk, TASK_UNINTERRUPTIBLE);
- while (ctx->reqs_active) {
- spin_unlock_irq(&ctx->ctx_lock);
- io_schedule();
- set_task_state(tsk, TASK_UNINTERRUPTIBLE);
- spin_lock_irq(&ctx->ctx_lock);
- }
- __set_task_state(tsk, TASK_RUNNING);
- remove_wait_queue(&ctx->wait, &wait);
-
-out:
- spin_unlock_irq(&ctx->ctx_lock);
}
/* wait_on_sync_kiocb:
@@ -351,9 +452,9 @@
*/
ssize_t wait_on_sync_kiocb(struct kiocb *iocb)
{
- while (iocb->ki_users) {
+ while (atomic_read(&iocb->ki_users)) {
set_current_state(TASK_UNINTERRUPTIBLE);
- if (!iocb->ki_users)
+ if (!atomic_read(&iocb->ki_users))
break;
io_schedule();
}
@@ -362,28 +463,26 @@
}
EXPORT_SYMBOL(wait_on_sync_kiocb);
-/* exit_aio: called when the last user of mm goes away. At this point,
- * there is no way for any new requests to be submited or any of the
- * io_* syscalls to be called on the context. However, there may be
- * outstanding requests which hold references to the context; as they
- * go away, they will call put_ioctx and release any pinned memory
- * associated with the request (held via struct page * references).
+/*
+ * exit_aio: called when the last user of mm goes away. At this point, there is
+ * no way for any new requests to be submited or any of the io_* syscalls to be
+ * called on the context.
+ *
+ * There may be outstanding kiocbs, but free_ioctx() will explicitly wait on
+ * them.
*/
void exit_aio(struct mm_struct *mm)
{
struct kioctx *ctx;
+ struct hlist_node *n;
- while (!hlist_empty(&mm->ioctx_list)) {
- ctx = hlist_entry(mm->ioctx_list.first, struct kioctx, list);
- hlist_del_rcu(&ctx->list);
-
- kill_ctx(ctx);
-
+ hlist_for_each_entry_safe(ctx, n, &mm->ioctx_list, list) {
if (1 != atomic_read(&ctx->users))
printk(KERN_DEBUG
"exit_aio:ioctx still alive: %d %d %d\n",
- atomic_read(&ctx->users), ctx->dead,
- ctx->reqs_active);
+ atomic_read(&ctx->users),
+ atomic_read(&ctx->dead),
+ atomic_read(&ctx->reqs_active));
/*
* We don't need to bother with munmap() here -
* exit_mmap(mm) is coming and it'll unmap everything.
@@ -391,150 +490,53 @@
* as indicator that it needs to unmap the area,
* just set it to 0; aio_free_ring() is the only
* place that uses ->mmap_size, so it's safe.
- * That way we get all munmap done to current->mm -
- * all other callers have ctx->mm == current->mm.
*/
- ctx->ring_info.mmap_size = 0;
- put_ioctx(ctx);
+ ctx->mmap_size = 0;
+
+ if (!atomic_xchg(&ctx->dead, 1)) {
+ hlist_del_rcu(&ctx->list);
+ call_rcu(&ctx->rcu_head, kill_ioctx_rcu);
+ }
}
}
/* aio_get_req
- * Allocate a slot for an aio request. Increments the users count
+ * Allocate a slot for an aio request. Increments the ki_users count
* of the kioctx so that the kioctx stays around until all requests are
* complete. Returns NULL if no requests are free.
*
- * Returns with kiocb->users set to 2. The io submit code path holds
+ * Returns with kiocb->ki_users set to 2. The io submit code path holds
* an extra reference while submitting the i/o.
* This prevents races between the aio code path referencing the
* req (after submitting it) and aio_complete() freeing the req.
*/
-static struct kiocb *__aio_get_req(struct kioctx *ctx)
-{
- struct kiocb *req = NULL;
-
- req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL);
- if (unlikely(!req))
- return NULL;
-
- req->ki_flags = 0;
- req->ki_users = 2;
- req->ki_key = 0;
- req->ki_ctx = ctx;
- req->ki_cancel = NULL;
- req->ki_retry = NULL;
- req->ki_dtor = NULL;
- req->private = NULL;
- req->ki_iovec = NULL;
- INIT_LIST_HEAD(&req->ki_run_list);
- req->ki_eventfd = NULL;
-
- return req;
-}
-
-/*
- * struct kiocb's are allocated in batches to reduce the number of
- * times the ctx lock is acquired and released.
- */
-#define KIOCB_BATCH_SIZE 32L
-struct kiocb_batch {
- struct list_head head;
- long count; /* number of requests left to allocate */
-};
-
-static void kiocb_batch_init(struct kiocb_batch *batch, long total)
-{
- INIT_LIST_HEAD(&batch->head);
- batch->count = total;
-}
-
-static void kiocb_batch_free(struct kioctx *ctx, struct kiocb_batch *batch)
-{
- struct kiocb *req, *n;
-
- if (list_empty(&batch->head))
- return;
-
- spin_lock_irq(&ctx->ctx_lock);
- list_for_each_entry_safe(req, n, &batch->head, ki_batch) {
- list_del(&req->ki_batch);
- list_del(&req->ki_list);
- kmem_cache_free(kiocb_cachep, req);
- ctx->reqs_active--;
- }
- if (unlikely(!ctx->reqs_active && ctx->dead))
- wake_up_all(&ctx->wait);
- spin_unlock_irq(&ctx->ctx_lock);
-}
-
-/*
- * Allocate a batch of kiocbs. This avoids taking and dropping the
- * context lock a lot during setup.
- */
-static int kiocb_batch_refill(struct kioctx *ctx, struct kiocb_batch *batch)
-{
- unsigned short allocated, to_alloc;
- long avail;
- struct kiocb *req, *n;
- struct aio_ring *ring;
-
- to_alloc = min(batch->count, KIOCB_BATCH_SIZE);
- for (allocated = 0; allocated < to_alloc; allocated++) {
- req = __aio_get_req(ctx);
- if (!req)
- /* allocation failed, go with what we've got */
- break;
- list_add(&req->ki_batch, &batch->head);
- }
-
- if (allocated == 0)
- goto out;
-
- spin_lock_irq(&ctx->ctx_lock);
- ring = kmap_atomic(ctx->ring_info.ring_pages[0]);
-
- avail = aio_ring_avail(&ctx->ring_info, ring) - ctx->reqs_active;
- BUG_ON(avail < 0);
- if (avail < allocated) {
- /* Trim back the number of requests. */
- list_for_each_entry_safe(req, n, &batch->head, ki_batch) {
- list_del(&req->ki_batch);
- kmem_cache_free(kiocb_cachep, req);
- if (--allocated <= avail)
- break;
- }
- }
-
- batch->count -= allocated;
- list_for_each_entry(req, &batch->head, ki_batch) {
- list_add(&req->ki_list, &ctx->active_reqs);
- ctx->reqs_active++;
- }
-
- kunmap_atomic(ring);
- spin_unlock_irq(&ctx->ctx_lock);
-
-out:
- return allocated;
-}
-
-static inline struct kiocb *aio_get_req(struct kioctx *ctx,
- struct kiocb_batch *batch)
+static inline struct kiocb *aio_get_req(struct kioctx *ctx)
{
struct kiocb *req;
- if (list_empty(&batch->head))
- if (kiocb_batch_refill(ctx, batch) == 0)
- return NULL;
- req = list_first_entry(&batch->head, struct kiocb, ki_batch);
- list_del(&req->ki_batch);
+ if (atomic_read(&ctx->reqs_active) >= ctx->nr_events)
+ return NULL;
+
+ if (atomic_inc_return(&ctx->reqs_active) > ctx->nr_events - 1)
+ goto out_put;
+
+ req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL|__GFP_ZERO);
+ if (unlikely(!req))
+ goto out_put;
+
+ atomic_set(&req->ki_users, 2);
+ req->ki_ctx = ctx;
+
return req;
+out_put:
+ atomic_dec(&ctx->reqs_active);
+ return NULL;
}
-static inline void really_put_req(struct kioctx *ctx, struct kiocb *req)
+static void kiocb_free(struct kiocb *req)
{
- assert_spin_locked(&ctx->ctx_lock);
-
+ if (req->ki_filp)
+ fput(req->ki_filp);
if (req->ki_eventfd != NULL)
eventfd_ctx_put(req->ki_eventfd);
if (req->ki_dtor)
@@ -542,48 +544,12 @@
if (req->ki_iovec != &req->ki_inline_vec)
kfree(req->ki_iovec);
kmem_cache_free(kiocb_cachep, req);
- ctx->reqs_active--;
-
- if (unlikely(!ctx->reqs_active && ctx->dead))
- wake_up_all(&ctx->wait);
}
-/* __aio_put_req
- * Returns true if this put was the last user of the request.
- */
-static int __aio_put_req(struct kioctx *ctx, struct kiocb *req)
+void aio_put_req(struct kiocb *req)
{
- dprintk(KERN_DEBUG "aio_put(%p): f_count=%ld\n",
- req, atomic_long_read(&req->ki_filp->f_count));
-
- assert_spin_locked(&ctx->ctx_lock);
-
- req->ki_users--;
- BUG_ON(req->ki_users < 0);
- if (likely(req->ki_users))
- return 0;
- list_del(&req->ki_list); /* remove from active_reqs */
- req->ki_cancel = NULL;
- req->ki_retry = NULL;
-
- fput(req->ki_filp);
- req->ki_filp = NULL;
- really_put_req(ctx, req);
- return 1;
-}
-
-/* aio_put_req
- * Returns true if this put was the last user of the kiocb,
- * false if the request is still in use.
- */
-int aio_put_req(struct kiocb *req)
-{
- struct kioctx *ctx = req->ki_ctx;
- int ret;
- spin_lock_irq(&ctx->ctx_lock);
- ret = __aio_put_req(ctx, req);
- spin_unlock_irq(&ctx->ctx_lock);
- return ret;
+ if (atomic_dec_and_test(&req->ki_users))
+ kiocb_free(req);
}
EXPORT_SYMBOL(aio_put_req);
@@ -595,13 +561,8 @@
rcu_read_lock();
hlist_for_each_entry_rcu(ctx, &mm->ioctx_list, list) {
- /*
- * RCU protects us against accessing freed memory but
- * we have to be careful not to get a reference when the
- * reference count already dropped to 0 (ctx->dead test
- * is unreliable because of races).
- */
- if (ctx->user_id == ctx_id && !ctx->dead && try_get_ioctx(ctx)){
+ if (ctx->user_id == ctx_id) {
+ atomic_inc(&ctx->users);
ret = ctx;
break;
}
@@ -611,295 +572,16 @@
return ret;
}
-/*
- * Queue up a kiocb to be retried. Assumes that the kiocb
- * has already been marked as kicked, and places it on
- * the retry run list for the corresponding ioctx, if it
- * isn't already queued. Returns 1 if it actually queued
- * the kiocb (to tell the caller to activate the work
- * queue to process it), or 0, if it found that it was
- * already queued.
- */
-static inline int __queue_kicked_iocb(struct kiocb *iocb)
-{
- struct kioctx *ctx = iocb->ki_ctx;
-
- assert_spin_locked(&ctx->ctx_lock);
-
- if (list_empty(&iocb->ki_run_list)) {
- list_add_tail(&iocb->ki_run_list,
- &ctx->run_list);
- return 1;
- }
- return 0;
-}
-
-/* aio_run_iocb
- * This is the core aio execution routine. It is
- * invoked both for initial i/o submission and
- * subsequent retries via the aio_kick_handler.
- * Expects to be invoked with iocb->ki_ctx->lock
- * already held. The lock is released and reacquired
- * as needed during processing.
- *
- * Calls the iocb retry method (already setup for the
- * iocb on initial submission) for operation specific
- * handling, but takes care of most of common retry
- * execution details for a given iocb. The retry method
- * needs to be non-blocking as far as possible, to avoid
- * holding up other iocbs waiting to be serviced by the
- * retry kernel thread.
- *
- * The trickier parts in this code have to do with
- * ensuring that only one retry instance is in progress
- * for a given iocb at any time. Providing that guarantee
- * simplifies the coding of individual aio operations as
- * it avoids various potential races.
- */
-static ssize_t aio_run_iocb(struct kiocb *iocb)
-{
- struct kioctx *ctx = iocb->ki_ctx;
- ssize_t (*retry)(struct kiocb *);
- ssize_t ret;
-
- if (!(retry = iocb->ki_retry)) {
- printk("aio_run_iocb: iocb->ki_retry = NULL\n");
- return 0;
- }
-
- /*
- * We don't want the next retry iteration for this
- * operation to start until this one has returned and
- * updated the iocb state. However, wait_queue functions
- * can trigger a kick_iocb from interrupt context in the
- * meantime, indicating that data is available for the next
- * iteration. We want to remember that and enable the
- * next retry iteration _after_ we are through with
- * this one.
- *
- * So, in order to be able to register a "kick", but
- * prevent it from being queued now, we clear the kick
- * flag, but make the kick code *think* that the iocb is
- * still on the run list until we are actually done.
- * When we are done with this iteration, we check if
- * the iocb was kicked in the meantime and if so, queue
- * it up afresh.
- */
-
- kiocbClearKicked(iocb);
-
- /*
- * This is so that aio_complete knows it doesn't need to
- * pull the iocb off the run list (We can't just call
- * INIT_LIST_HEAD because we don't want a kick_iocb to
- * queue this on the run list yet)
- */
- iocb->ki_run_list.next = iocb->ki_run_list.prev = NULL;
- spin_unlock_irq(&ctx->ctx_lock);
-
- /* Quit retrying if the i/o has been cancelled */
- if (kiocbIsCancelled(iocb)) {
- ret = -EINTR;
- aio_complete(iocb, ret, 0);
- /* must not access the iocb after this */
- goto out;
- }
-
- /*
- * Now we are all set to call the retry method in async
- * context.
- */
- ret = retry(iocb);
-
- if (ret != -EIOCBRETRY && ret != -EIOCBQUEUED) {
- /*
- * There's no easy way to restart the syscall since other AIO's
- * may be already running. Just fail this IO with EINTR.
- */
- if (unlikely(ret == -ERESTARTSYS || ret == -ERESTARTNOINTR ||
- ret == -ERESTARTNOHAND || ret == -ERESTART_RESTARTBLOCK))
- ret = -EINTR;
- aio_complete(iocb, ret, 0);
- }
-out:
- spin_lock_irq(&ctx->ctx_lock);
-
- if (-EIOCBRETRY == ret) {
- /*
- * OK, now that we are done with this iteration
- * and know that there is more left to go,
- * this is where we let go so that a subsequent
- * "kick" can start the next iteration
- */
-
- /* will make __queue_kicked_iocb succeed from here on */
- INIT_LIST_HEAD(&iocb->ki_run_list);
- /* we must queue the next iteration ourselves, if it
- * has already been kicked */
- if (kiocbIsKicked(iocb)) {
- __queue_kicked_iocb(iocb);
-
- /*
- * __queue_kicked_iocb will always return 1 here, because
- * iocb->ki_run_list is empty at this point so it should
- * be safe to unconditionally queue the context into the
- * work queue.
- */
- aio_queue_work(ctx);
- }
- }
- return ret;
-}
-
-/*
- * __aio_run_iocbs:
- * Process all pending retries queued on the ioctx
- * run list.
- * Assumes it is operating within the aio issuer's mm
- * context.
- */
-static int __aio_run_iocbs(struct kioctx *ctx)
-{
- struct kiocb *iocb;
- struct list_head run_list;
-
- assert_spin_locked(&ctx->ctx_lock);
-
- list_replace_init(&ctx->run_list, &run_list);
- while (!list_empty(&run_list)) {
- iocb = list_entry(run_list.next, struct kiocb,
- ki_run_list);
- list_del(&iocb->ki_run_list);
- /*
- * Hold an extra reference while retrying i/o.
- */
- iocb->ki_users++; /* grab extra reference */
- aio_run_iocb(iocb);
- __aio_put_req(ctx, iocb);
- }
- if (!list_empty(&ctx->run_list))
- return 1;
- return 0;
-}
-
-static void aio_queue_work(struct kioctx * ctx)
-{
- unsigned long timeout;
- /*
- * if someone is waiting, get the work started right
- * away, otherwise, use a longer delay
- */
- smp_mb();
- if (waitqueue_active(&ctx->wait))
- timeout = 1;
- else
- timeout = HZ/10;
- queue_delayed_work(aio_wq, &ctx->wq, timeout);
-}
-
-/*
- * aio_run_all_iocbs:
- * Process all pending retries queued on the ioctx
- * run list, and keep running them until the list
- * stays empty.
- * Assumes it is operating within the aio issuer's mm context.
- */
-static inline void aio_run_all_iocbs(struct kioctx *ctx)
-{
- spin_lock_irq(&ctx->ctx_lock);
- while (__aio_run_iocbs(ctx))
- ;
- spin_unlock_irq(&ctx->ctx_lock);
-}
-
-/*
- * aio_kick_handler:
- * Work queue handler triggered to process pending
- * retries on an ioctx. Takes on the aio issuer's
- * mm context before running the iocbs, so that
- * copy_xxx_user operates on the issuer's address
- * space.
- * Run on aiod's context.
- */
-static void aio_kick_handler(struct work_struct *work)
-{
- struct kioctx *ctx = container_of(work, struct kioctx, wq.work);
- mm_segment_t oldfs = get_fs();
- struct mm_struct *mm;
- int requeue;
-
- set_fs(USER_DS);
- use_mm(ctx->mm);
- spin_lock_irq(&ctx->ctx_lock);
- requeue =__aio_run_iocbs(ctx);
- mm = ctx->mm;
- spin_unlock_irq(&ctx->ctx_lock);
- unuse_mm(mm);
- set_fs(oldfs);
- /*
- * we're in a worker thread already; no point using non-zero delay
- */
- if (requeue)
- queue_delayed_work(aio_wq, &ctx->wq, 0);
-}
-
-
-/*
- * Called by kick_iocb to queue the kiocb for retry
- * and if required activate the aio work queue to process
- * it
- */
-static void try_queue_kicked_iocb(struct kiocb *iocb)
-{
- struct kioctx *ctx = iocb->ki_ctx;
- unsigned long flags;
- int run = 0;
-
- spin_lock_irqsave(&ctx->ctx_lock, flags);
- /* set this inside the lock so that we can't race with aio_run_iocb()
- * testing it and putting the iocb on the run list under the lock */
- if (!kiocbTryKick(iocb))
- run = __queue_kicked_iocb(iocb);
- spin_unlock_irqrestore(&ctx->ctx_lock, flags);
- if (run)
- aio_queue_work(ctx);
-}
-
-/*
- * kick_iocb:
- * Called typically from a wait queue callback context
- * to trigger a retry of the iocb.
- * The retry is usually executed by aio workqueue
- * threads (See aio_kick_handler).
- */
-void kick_iocb(struct kiocb *iocb)
-{
- /* sync iocbs are easy: they can only ever be executing from a
- * single context. */
- if (is_sync_kiocb(iocb)) {
- kiocbSetKicked(iocb);
- wake_up_process(iocb->ki_obj.tsk);
- return;
- }
-
- try_queue_kicked_iocb(iocb);
-}
-EXPORT_SYMBOL(kick_iocb);
-
/* aio_complete
* Called when the io request on the given iocb is complete.
- * Returns true if this is the last user of the request. The
- * only other user of the request can be the cancellation code.
*/
-int aio_complete(struct kiocb *iocb, long res, long res2)
+void aio_complete(struct kiocb *iocb, long res, long res2)
{
struct kioctx *ctx = iocb->ki_ctx;
- struct aio_ring_info *info;
struct aio_ring *ring;
- struct io_event *event;
+ struct io_event *ev_page, *event;
unsigned long flags;
- unsigned long tail;
- int ret;
+ unsigned tail, pos;
/*
* Special case handling for sync iocbs:
@@ -909,61 +591,81 @@
* - the sync task helpfully left a reference to itself in the iocb
*/
if (is_sync_kiocb(iocb)) {
- BUG_ON(iocb->ki_users != 1);
+ BUG_ON(atomic_read(&iocb->ki_users) != 1);
iocb->ki_user_data = res;
- iocb->ki_users = 0;
+ atomic_set(&iocb->ki_users, 0);
wake_up_process(iocb->ki_obj.tsk);
- return 1;
+ return;
}
- info = &ctx->ring_info;
-
- /* add a completion event to the ring buffer.
- * must be done holding ctx->ctx_lock to prevent
- * other code from messing with the tail
- * pointer since we might be called from irq
- * context.
+ /*
+ * Take rcu_read_lock() in case the kioctx is being destroyed, as we
+ * need to issue a wakeup after decrementing reqs_active.
*/
- spin_lock_irqsave(&ctx->ctx_lock, flags);
+ rcu_read_lock();
- if (iocb->ki_run_list.prev && !list_empty(&iocb->ki_run_list))
- list_del_init(&iocb->ki_run_list);
+ if (iocb->ki_list.next) {
+ unsigned long flags;
+
+ spin_lock_irqsave(&ctx->ctx_lock, flags);
+ list_del(&iocb->ki_list);
+ spin_unlock_irqrestore(&ctx->ctx_lock, flags);
+ }
/*
* cancelled requests don't get events, userland was given one
* when the event got cancelled.
*/
- if (kiocbIsCancelled(iocb))
+ if (unlikely(xchg(&iocb->ki_cancel,
+ KIOCB_CANCELLED) == KIOCB_CANCELLED)) {
+ atomic_dec(&ctx->reqs_active);
+ /* Still need the wake_up in case free_ioctx is waiting */
goto put_rq;
+ }
- ring = kmap_atomic(info->ring_pages[0]);
+ /*
+ * Add a completion event to the ring buffer. Must be done holding
+ * ctx->ctx_lock to prevent other code from messing with the tail
+ * pointer since we might be called from irq context.
+ */
+ spin_lock_irqsave(&ctx->completion_lock, flags);
- tail = info->tail;
- event = aio_ring_event(info, tail);
- if (++tail >= info->nr)
+ tail = ctx->tail;
+ pos = tail + AIO_EVENTS_OFFSET;
+
+ if (++tail >= ctx->nr_events)
tail = 0;
+ ev_page = kmap_atomic(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]);
+ event = ev_page + pos % AIO_EVENTS_PER_PAGE;
+
event->obj = (u64)(unsigned long)iocb->ki_obj.user;
event->data = iocb->ki_user_data;
event->res = res;
event->res2 = res2;
- dprintk("aio_complete: %p[%lu]: %p: %p %Lx %lx %lx\n",
- ctx, tail, iocb, iocb->ki_obj.user, iocb->ki_user_data,
- res, res2);
+ kunmap_atomic(ev_page);
+ flush_dcache_page(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]);
+
+ pr_debug("%p[%u]: %p: %p %Lx %lx %lx\n",
+ ctx, tail, iocb, iocb->ki_obj.user, iocb->ki_user_data,
+ res, res2);
/* after flagging the request as done, we
* must never even look at it again
*/
smp_wmb(); /* make event visible before updating tail */
- info->tail = tail;
+ ctx->tail = tail;
+
+ ring = kmap_atomic(ctx->ring_pages[0]);
ring->tail = tail;
-
- put_aio_ring_event(event);
kunmap_atomic(ring);
+ flush_dcache_page(ctx->ring_pages[0]);
- pr_debug("added to ring %p at [%lu]\n", iocb, tail);
+ spin_unlock_irqrestore(&ctx->completion_lock, flags);
+
+ pr_debug("added to ring %p at [%u]\n", iocb, tail);
/*
* Check if the user asked us to deliver the result through an
@@ -975,7 +677,7 @@
put_rq:
/* everything turned out well, dispose of the aiocb. */
- ret = __aio_put_req(ctx, iocb);
+ aio_put_req(iocb);
/*
* We have to order our ring_info tail store above and test
@@ -988,233 +690,133 @@
if (waitqueue_active(&ctx->wait))
wake_up(&ctx->wait);
- spin_unlock_irqrestore(&ctx->ctx_lock, flags);
- return ret;
+ rcu_read_unlock();
}
EXPORT_SYMBOL(aio_complete);
-/* aio_read_evt
- * Pull an event off of the ioctx's event ring. Returns the number of
- * events fetched (0 or 1 ;-)
- * FIXME: make this use cmpxchg.
- * TODO: make the ringbuffer user mmap()able (requires FIXME).
+/* aio_read_events
+ * Pull an event off of the ioctx's event ring. Returns the number of
+ * events fetched
*/
-static int aio_read_evt(struct kioctx *ioctx, struct io_event *ent)
+static long aio_read_events_ring(struct kioctx *ctx,
+ struct io_event __user *event, long nr)
{
- struct aio_ring_info *info = &ioctx->ring_info;
struct aio_ring *ring;
- unsigned long head;
- int ret = 0;
+ unsigned head, pos;
+ long ret = 0;
+ int copy_ret;
- ring = kmap_atomic(info->ring_pages[0]);
- dprintk("in aio_read_evt h%lu t%lu m%lu\n",
- (unsigned long)ring->head, (unsigned long)ring->tail,
- (unsigned long)ring->nr);
+ mutex_lock(&ctx->ring_lock);
- if (ring->head == ring->tail)
+ ring = kmap_atomic(ctx->ring_pages[0]);
+ head = ring->head;
+ kunmap_atomic(ring);
+
+ pr_debug("h%u t%u m%u\n", head, ctx->tail, ctx->nr_events);
+
+ if (head == ctx->tail)
goto out;
- spin_lock(&info->ring_lock);
+ while (ret < nr) {
+ long avail;
+ struct io_event *ev;
+ struct page *page;
- head = ring->head % info->nr;
- if (head != ring->tail) {
- struct io_event *evp = aio_ring_event(info, head);
- *ent = *evp;
- head = (head + 1) % info->nr;
- smp_mb(); /* finish reading the event before updatng the head */
- ring->head = head;
- ret = 1;
- put_aio_ring_event(evp);
+ avail = (head <= ctx->tail ? ctx->tail : ctx->nr_events) - head;
+ if (head == ctx->tail)
+ break;
+
+ avail = min(avail, nr - ret);
+ avail = min_t(long, avail, AIO_EVENTS_PER_PAGE -
+ ((head + AIO_EVENTS_OFFSET) % AIO_EVENTS_PER_PAGE));
+
+ pos = head + AIO_EVENTS_OFFSET;
+ page = ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE];
+ pos %= AIO_EVENTS_PER_PAGE;
+
+ ev = kmap(page);
+ copy_ret = copy_to_user(event + ret, ev + pos,
+ sizeof(*ev) * avail);
+ kunmap(page);
+
+ if (unlikely(copy_ret)) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ ret += avail;
+ head += avail;
+ head %= ctx->nr_events;
}
- spin_unlock(&info->ring_lock);
-out:
- dprintk("leaving aio_read_evt: %d h%lu t%lu\n", ret,
- (unsigned long)ring->head, (unsigned long)ring->tail);
+ ring = kmap_atomic(ctx->ring_pages[0]);
+ ring->head = head;
kunmap_atomic(ring);
+ flush_dcache_page(ctx->ring_pages[0]);
+
+ pr_debug("%li h%u t%u\n", ret, head, ctx->tail);
+
+ atomic_sub(ret, &ctx->reqs_active);
+out:
+ mutex_unlock(&ctx->ring_lock);
+
return ret;
}
-struct aio_timeout {
- struct timer_list timer;
- int timed_out;
- struct task_struct *p;
-};
-
-static void timeout_func(unsigned long data)
+static bool aio_read_events(struct kioctx *ctx, long min_nr, long nr,
+ struct io_event __user *event, long *i)
{
- struct aio_timeout *to = (struct aio_timeout *)data;
+ long ret = aio_read_events_ring(ctx, event + *i, nr - *i);
- to->timed_out = 1;
- wake_up_process(to->p);
+ if (ret > 0)
+ *i += ret;
+
+ if (unlikely(atomic_read(&ctx->dead)))
+ ret = -EINVAL;
+
+ if (!*i)
+ *i = ret;
+
+ return ret < 0 || *i >= min_nr;
}
-static inline void init_timeout(struct aio_timeout *to)
-{
- setup_timer_on_stack(&to->timer, timeout_func, (unsigned long) to);
- to->timed_out = 0;
- to->p = current;
-}
-
-static inline void set_timeout(long start_jiffies, struct aio_timeout *to,
- const struct timespec *ts)
-{
- to->timer.expires = start_jiffies + timespec_to_jiffies(ts);
- if (time_after(to->timer.expires, jiffies))
- add_timer(&to->timer);
- else
- to->timed_out = 1;
-}
-
-static inline void clear_timeout(struct aio_timeout *to)
-{
- del_singleshot_timer_sync(&to->timer);
-}
-
-static int read_events(struct kioctx *ctx,
- long min_nr, long nr,
+static long read_events(struct kioctx *ctx, long min_nr, long nr,
struct io_event __user *event,
struct timespec __user *timeout)
{
- long start_jiffies = jiffies;
- struct task_struct *tsk = current;
- DECLARE_WAITQUEUE(wait, tsk);
- int ret;
- int i = 0;
- struct io_event ent;
- struct aio_timeout to;
- int retry = 0;
+ ktime_t until = { .tv64 = KTIME_MAX };
+ long ret = 0;
- /* needed to zero any padding within an entry (there shouldn't be
- * any, but C is fun!
- */
- memset(&ent, 0, sizeof(ent));
-retry:
- ret = 0;
- while (likely(i < nr)) {
- ret = aio_read_evt(ctx, &ent);
- if (unlikely(ret <= 0))
- break;
-
- dprintk("read event: %Lx %Lx %Lx %Lx\n",
- ent.data, ent.obj, ent.res, ent.res2);
-
- /* Could we split the check in two? */
- ret = -EFAULT;
- if (unlikely(copy_to_user(event, &ent, sizeof(ent)))) {
- dprintk("aio: lost an event due to EFAULT.\n");
- break;
- }
- ret = 0;
-
- /* Good, event copied to userland, update counts. */
- event ++;
- i ++;
- }
-
- if (min_nr <= i)
- return i;
- if (ret)
- return ret;
-
- /* End fast path */
-
- /* racey check, but it gets redone */
- if (!retry && unlikely(!list_empty(&ctx->run_list))) {
- retry = 1;
- aio_run_all_iocbs(ctx);
- goto retry;
- }
-
- init_timeout(&to);
if (timeout) {
struct timespec ts;
- ret = -EFAULT;
+
if (unlikely(copy_from_user(&ts, timeout, sizeof(ts))))
- goto out;
+ return -EFAULT;
- set_timeout(start_jiffies, &to, &ts);
+ until = timespec_to_ktime(ts);
}
- while (likely(i < nr)) {
- add_wait_queue_exclusive(&ctx->wait, &wait);
- do {
- set_task_state(tsk, TASK_INTERRUPTIBLE);
- ret = aio_read_evt(ctx, &ent);
- if (ret)
- break;
- if (min_nr <= i)
- break;
- if (unlikely(ctx->dead)) {
- ret = -EINVAL;
- break;
- }
- if (to.timed_out) /* Only check after read evt */
- break;
- /* Try to only show up in io wait if there are ops
- * in flight */
- if (ctx->reqs_active)
- io_schedule();
- else
- schedule();
- if (signal_pending(tsk)) {
- ret = -EINTR;
- break;
- }
- /*ret = aio_read_evt(ctx, &ent);*/
- } while (1) ;
-
- set_task_state(tsk, TASK_RUNNING);
- remove_wait_queue(&ctx->wait, &wait);
-
- if (unlikely(ret <= 0))
- break;
-
- ret = -EFAULT;
- if (unlikely(copy_to_user(event, &ent, sizeof(ent)))) {
- dprintk("aio: lost an event due to EFAULT.\n");
- break;
- }
-
- /* Good, event copied to userland, update counts. */
- event ++;
- i ++;
- }
-
- if (timeout)
- clear_timeout(&to);
-out:
- destroy_timer_on_stack(&to.timer);
- return i ? i : ret;
-}
-
-/* Take an ioctx and remove it from the list of ioctx's. Protects
- * against races with itself via ->dead.
- */
-static void io_destroy(struct kioctx *ioctx)
-{
- struct mm_struct *mm = current->mm;
- int was_dead;
-
- /* delete the entry from the list is someone else hasn't already */
- spin_lock(&mm->ioctx_lock);
- was_dead = ioctx->dead;
- ioctx->dead = 1;
- hlist_del_rcu(&ioctx->list);
- spin_unlock(&mm->ioctx_lock);
-
- dprintk("aio_release(%p)\n", ioctx);
- if (likely(!was_dead))
- put_ioctx(ioctx); /* twice for the list */
-
- kill_ctx(ioctx);
-
/*
- * Wake up any waiters. The setting of ctx->dead must be seen
- * by other CPUs at this point. Right now, we rely on the
- * locking done by the above calls to ensure this consistency.
+ * Note that aio_read_events() is being called as the conditional - i.e.
+ * we're calling it after prepare_to_wait() has set task state to
+ * TASK_INTERRUPTIBLE.
+ *
+ * But aio_read_events() can block, and if it blocks it's going to flip
+ * the task state back to TASK_RUNNING.
+ *
+ * This should be ok, provided it doesn't flip the state back to
+ * TASK_RUNNING and return 0 too much - that causes us to spin. That
+ * will only happen if the mutex_lock() call blocks, and we then find
+ * the ringbuffer empty. So in practice we should be ok, but it's
+ * something to be aware of when touching this code.
*/
- wake_up_all(&ioctx->wait);
+ wait_event_interruptible_hrtimeout(ctx->wait,
+ aio_read_events(ctx, min_nr, nr, event, &ret), until);
+
+ if (!ret && signal_pending(current))
+ ret = -EINTR;
+
+ return ret;
}
/* sys_io_setup:
@@ -1252,7 +854,7 @@
if (!IS_ERR(ioctx)) {
ret = put_user(ioctx->user_id, ctxp);
if (ret)
- io_destroy(ioctx);
+ kill_ioctx(ioctx);
put_ioctx(ioctx);
}
@@ -1270,7 +872,7 @@
{
struct kioctx *ioctx = lookup_ioctx(ctx);
if (likely(NULL != ioctx)) {
- io_destroy(ioctx);
+ kill_ioctx(ioctx);
put_ioctx(ioctx);
return 0;
}
@@ -1301,30 +903,21 @@
BUG_ON(ret > 0 && iocb->ki_left == 0);
}
-static ssize_t aio_rw_vect_retry(struct kiocb *iocb)
+typedef ssize_t (aio_rw_op)(struct kiocb *, const struct iovec *,
+ unsigned long, loff_t);
+
+static ssize_t aio_rw_vect_retry(struct kiocb *iocb, int rw, aio_rw_op *rw_op)
{
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
- ssize_t (*rw_op)(struct kiocb *, const struct iovec *,
- unsigned long, loff_t);
ssize_t ret = 0;
- unsigned short opcode;
-
- if ((iocb->ki_opcode == IOCB_CMD_PREADV) ||
- (iocb->ki_opcode == IOCB_CMD_PREAD)) {
- rw_op = file->f_op->aio_read;
- opcode = IOCB_CMD_PREADV;
- } else {
- rw_op = file->f_op->aio_write;
- opcode = IOCB_CMD_PWRITEV;
- }
/* This matches the pread()/pwrite() logic */
if (iocb->ki_pos < 0)
return -EINVAL;
- if (opcode == IOCB_CMD_PWRITEV)
+ if (rw == WRITE)
file_start_write(file);
do {
ret = rw_op(iocb, &iocb->ki_iovec[iocb->ki_cur_seg],
@@ -1336,9 +929,9 @@
/* retry all partial writes. retry partial reads as long as its a
* regular file. */
} while (ret > 0 && iocb->ki_left > 0 &&
- (opcode == IOCB_CMD_PWRITEV ||
+ (rw == WRITE ||
(!S_ISFIFO(inode->i_mode) && !S_ISSOCK(inode->i_mode))));
- if (opcode == IOCB_CMD_PWRITEV)
+ if (rw == WRITE)
file_end_write(file);
/* This means we must have transferred all that we could */
@@ -1348,81 +941,49 @@
/* If we managed to write some out we return that, rather than
* the eventual error. */
- if (opcode == IOCB_CMD_PWRITEV
- && ret < 0 && ret != -EIOCBQUEUED && ret != -EIOCBRETRY
+ if (rw == WRITE
+ && ret < 0 && ret != -EIOCBQUEUED
&& iocb->ki_nbytes - iocb->ki_left)
ret = iocb->ki_nbytes - iocb->ki_left;
return ret;
}
-static ssize_t aio_fdsync(struct kiocb *iocb)
-{
- struct file *file = iocb->ki_filp;
- ssize_t ret = -EINVAL;
-
- if (file->f_op->aio_fsync)
- ret = file->f_op->aio_fsync(iocb, 1);
- return ret;
-}
-
-static ssize_t aio_fsync(struct kiocb *iocb)
-{
- struct file *file = iocb->ki_filp;
- ssize_t ret = -EINVAL;
-
- if (file->f_op->aio_fsync)
- ret = file->f_op->aio_fsync(iocb, 0);
- return ret;
-}
-
-static ssize_t aio_setup_vectored_rw(int type, struct kiocb *kiocb, bool compat)
+static ssize_t aio_setup_vectored_rw(int rw, struct kiocb *kiocb, bool compat)
{
ssize_t ret;
+ kiocb->ki_nr_segs = kiocb->ki_nbytes;
+
#ifdef CONFIG_COMPAT
if (compat)
- ret = compat_rw_copy_check_uvector(type,
+ ret = compat_rw_copy_check_uvector(rw,
(struct compat_iovec __user *)kiocb->ki_buf,
- kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec,
+ kiocb->ki_nr_segs, 1, &kiocb->ki_inline_vec,
&kiocb->ki_iovec);
else
#endif
- ret = rw_copy_check_uvector(type,
+ ret = rw_copy_check_uvector(rw,
(struct iovec __user *)kiocb->ki_buf,
- kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec,
+ kiocb->ki_nr_segs, 1, &kiocb->ki_inline_vec,
&kiocb->ki_iovec);
if (ret < 0)
- goto out;
+ return ret;
- ret = rw_verify_area(type, kiocb->ki_filp, &kiocb->ki_pos, ret);
- if (ret < 0)
- goto out;
-
- kiocb->ki_nr_segs = kiocb->ki_nbytes;
- kiocb->ki_cur_seg = 0;
- /* ki_nbytes/left now reflect bytes instead of segs */
+ /* ki_nbytes now reflect bytes instead of segs */
kiocb->ki_nbytes = ret;
- kiocb->ki_left = ret;
-
- ret = 0;
-out:
- return ret;
+ return 0;
}
-static ssize_t aio_setup_single_vector(int type, struct file * file, struct kiocb *kiocb)
+static ssize_t aio_setup_single_vector(int rw, struct kiocb *kiocb)
{
- int bytes;
-
- bytes = rw_verify_area(type, file, &kiocb->ki_pos, kiocb->ki_left);
- if (bytes < 0)
- return bytes;
+ if (unlikely(!access_ok(!rw, kiocb->ki_buf, kiocb->ki_nbytes)))
+ return -EFAULT;
kiocb->ki_iovec = &kiocb->ki_inline_vec;
kiocb->ki_iovec->iov_base = kiocb->ki_buf;
- kiocb->ki_iovec->iov_len = bytes;
+ kiocb->ki_iovec->iov_len = kiocb->ki_nbytes;
kiocb->ki_nr_segs = 1;
- kiocb->ki_cur_seg = 0;
return 0;
}
@@ -1431,96 +992,95 @@
* Performs the initial checks and aio retry method
* setup for the kiocb at the time of io submission.
*/
-static ssize_t aio_setup_iocb(struct kiocb *kiocb, bool compat)
+static ssize_t aio_run_iocb(struct kiocb *req, bool compat)
{
- struct file *file = kiocb->ki_filp;
- ssize_t ret = 0;
+ struct file *file = req->ki_filp;
+ ssize_t ret;
+ int rw;
+ fmode_t mode;
+ aio_rw_op *rw_op;
- switch (kiocb->ki_opcode) {
+ switch (req->ki_opcode) {
case IOCB_CMD_PREAD:
- ret = -EBADF;
- if (unlikely(!(file->f_mode & FMODE_READ)))
- break;
- ret = -EFAULT;
- if (unlikely(!access_ok(VERIFY_WRITE, kiocb->ki_buf,
- kiocb->ki_left)))
- break;
- ret = aio_setup_single_vector(READ, file, kiocb);
- if (ret)
- break;
- ret = -EINVAL;
- if (file->f_op->aio_read)
- kiocb->ki_retry = aio_rw_vect_retry;
- break;
- case IOCB_CMD_PWRITE:
- ret = -EBADF;
- if (unlikely(!(file->f_mode & FMODE_WRITE)))
- break;
- ret = -EFAULT;
- if (unlikely(!access_ok(VERIFY_READ, kiocb->ki_buf,
- kiocb->ki_left)))
- break;
- ret = aio_setup_single_vector(WRITE, file, kiocb);
- if (ret)
- break;
- ret = -EINVAL;
- if (file->f_op->aio_write)
- kiocb->ki_retry = aio_rw_vect_retry;
- break;
case IOCB_CMD_PREADV:
- ret = -EBADF;
- if (unlikely(!(file->f_mode & FMODE_READ)))
- break;
- ret = aio_setup_vectored_rw(READ, kiocb, compat);
- if (ret)
- break;
- ret = -EINVAL;
- if (file->f_op->aio_read)
- kiocb->ki_retry = aio_rw_vect_retry;
- break;
+ mode = FMODE_READ;
+ rw = READ;
+ rw_op = file->f_op->aio_read;
+ goto rw_common;
+
+ case IOCB_CMD_PWRITE:
case IOCB_CMD_PWRITEV:
- ret = -EBADF;
- if (unlikely(!(file->f_mode & FMODE_WRITE)))
- break;
- ret = aio_setup_vectored_rw(WRITE, kiocb, compat);
+ mode = FMODE_WRITE;
+ rw = WRITE;
+ rw_op = file->f_op->aio_write;
+ goto rw_common;
+rw_common:
+ if (unlikely(!(file->f_mode & mode)))
+ return -EBADF;
+
+ if (!rw_op)
+ return -EINVAL;
+
+ ret = (req->ki_opcode == IOCB_CMD_PREADV ||
+ req->ki_opcode == IOCB_CMD_PWRITEV)
+ ? aio_setup_vectored_rw(rw, req, compat)
+ : aio_setup_single_vector(rw, req);
if (ret)
- break;
- ret = -EINVAL;
- if (file->f_op->aio_write)
- kiocb->ki_retry = aio_rw_vect_retry;
+ return ret;
+
+ ret = rw_verify_area(rw, file, &req->ki_pos, req->ki_nbytes);
+ if (ret < 0)
+ return ret;
+
+ req->ki_nbytes = ret;
+ req->ki_left = ret;
+
+ ret = aio_rw_vect_retry(req, rw, rw_op);
break;
+
case IOCB_CMD_FDSYNC:
- ret = -EINVAL;
- if (file->f_op->aio_fsync)
- kiocb->ki_retry = aio_fdsync;
+ if (!file->f_op->aio_fsync)
+ return -EINVAL;
+
+ ret = file->f_op->aio_fsync(req, 1);
break;
+
case IOCB_CMD_FSYNC:
- ret = -EINVAL;
- if (file->f_op->aio_fsync)
- kiocb->ki_retry = aio_fsync;
+ if (!file->f_op->aio_fsync)
+ return -EINVAL;
+
+ ret = file->f_op->aio_fsync(req, 0);
break;
+
default:
- dprintk("EINVAL: io_submit: no operation provided\n");
- ret = -EINVAL;
+ pr_debug("EINVAL: no operation provided\n");
+ return -EINVAL;
}
- if (!kiocb->ki_retry)
- return ret;
+ if (ret != -EIOCBQUEUED) {
+ /*
+ * There's no easy way to restart the syscall since other AIO's
+ * may be already running. Just fail this IO with EINTR.
+ */
+ if (unlikely(ret == -ERESTARTSYS || ret == -ERESTARTNOINTR ||
+ ret == -ERESTARTNOHAND ||
+ ret == -ERESTART_RESTARTBLOCK))
+ ret = -EINTR;
+ aio_complete(req, ret, 0);
+ }
return 0;
}
static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
- struct iocb *iocb, struct kiocb_batch *batch,
- bool compat)
+ struct iocb *iocb, bool compat)
{
struct kiocb *req;
- struct file *file;
ssize_t ret;
/* enforce forwards compatibility on users */
if (unlikely(iocb->aio_reserved1 || iocb->aio_reserved2)) {
- pr_debug("EINVAL: io_submit: reserve field set\n");
+ pr_debug("EINVAL: reserve field set\n");
return -EINVAL;
}
@@ -1534,16 +1094,16 @@
return -EINVAL;
}
- file = fget(iocb->aio_fildes);
- if (unlikely(!file))
- return -EBADF;
-
- req = aio_get_req(ctx, batch); /* returns with 2 references to req */
- if (unlikely(!req)) {
- fput(file);
+ req = aio_get_req(ctx);
+ if (unlikely(!req))
return -EAGAIN;
+
+ req->ki_filp = fget(iocb->aio_fildes);
+ if (unlikely(!req->ki_filp)) {
+ ret = -EBADF;
+ goto out_put_req;
}
- req->ki_filp = file;
+
if (iocb->aio_flags & IOCB_FLAG_RESFD) {
/*
* If the IOCB_FLAG_RESFD flag of aio_flags is set, get an
@@ -1559,9 +1119,9 @@
}
}
- ret = put_user(req->ki_key, &user_iocb->aio_key);
+ ret = put_user(KIOCB_KEY, &user_iocb->aio_key);
if (unlikely(ret)) {
- dprintk("EFAULT: aio_key\n");
+ pr_debug("EFAULT: aio_key\n");
goto out_put_req;
}
@@ -1573,41 +1133,14 @@
req->ki_left = req->ki_nbytes = iocb->aio_nbytes;
req->ki_opcode = iocb->aio_lio_opcode;
- ret = aio_setup_iocb(req, compat);
-
+ ret = aio_run_iocb(req, compat);
if (ret)
goto out_put_req;
- spin_lock_irq(&ctx->ctx_lock);
- /*
- * We could have raced with io_destroy() and are currently holding a
- * reference to ctx which should be destroyed. We cannot submit IO
- * since ctx gets freed as soon as io_submit() puts its reference. The
- * check here is reliable: io_destroy() sets ctx->dead before waiting
- * for outstanding IO and the barrier between these two is realized by
- * unlock of mm->ioctx_lock and lock of ctx->ctx_lock. Analogously we
- * increment ctx->reqs_active before checking for ctx->dead and the
- * barrier is realized by unlock and lock of ctx->ctx_lock. Thus if we
- * don't see ctx->dead set here, io_destroy() waits for our IO to
- * finish.
- */
- if (ctx->dead) {
- spin_unlock_irq(&ctx->ctx_lock);
- ret = -EINVAL;
- goto out_put_req;
- }
- aio_run_iocb(req);
- if (!list_empty(&ctx->run_list)) {
- /* drain the run list */
- while (__aio_run_iocbs(ctx))
- ;
- }
- spin_unlock_irq(&ctx->ctx_lock);
-
aio_put_req(req); /* drop extra ref to req */
return 0;
-
out_put_req:
+ atomic_dec(&ctx->reqs_active);
aio_put_req(req); /* drop extra ref to req */
aio_put_req(req); /* drop i/o ref to req */
return ret;
@@ -1620,7 +1153,6 @@
long ret = 0;
int i = 0;
struct blk_plug plug;
- struct kiocb_batch batch;
if (unlikely(nr < 0))
return -EINVAL;
@@ -1633,12 +1165,10 @@
ctx = lookup_ioctx(ctx_id);
if (unlikely(!ctx)) {
- pr_debug("EINVAL: io_submit: invalid context id\n");
+ pr_debug("EINVAL: invalid context id\n");
return -EINVAL;
}
- kiocb_batch_init(&batch, nr);
-
blk_start_plug(&plug);
/*
@@ -1659,13 +1189,12 @@
break;
}
- ret = io_submit_one(ctx, user_iocb, &tmp, &batch, compat);
+ ret = io_submit_one(ctx, user_iocb, &tmp, compat);
if (ret)
break;
}
blk_finish_plug(&plug);
- kiocb_batch_free(ctx, &batch);
put_ioctx(ctx);
return i ? i : ret;
}
@@ -1698,10 +1227,13 @@
assert_spin_locked(&ctx->ctx_lock);
+ if (key != KIOCB_KEY)
+ return NULL;
+
/* TODO: use a hash or array, this sucks. */
list_for_each(pos, &ctx->active_reqs) {
struct kiocb *kiocb = list_kiocb(pos);
- if (kiocb->ki_obj.user == iocb && kiocb->ki_key == key)
+ if (kiocb->ki_obj.user == iocb)
return kiocb;
}
return NULL;
@@ -1720,7 +1252,7 @@
SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
struct io_event __user *, result)
{
- int (*cancel)(struct kiocb *iocb, struct io_event *res);
+ struct io_event res;
struct kioctx *ctx;
struct kiocb *kiocb;
u32 key;
@@ -1735,32 +1267,22 @@
return -EINVAL;
spin_lock_irq(&ctx->ctx_lock);
- ret = -EAGAIN;
+
kiocb = lookup_kiocb(ctx, iocb, key);
- if (kiocb && kiocb->ki_cancel) {
- cancel = kiocb->ki_cancel;
- kiocb->ki_users ++;
- kiocbSetCancelled(kiocb);
- } else
- cancel = NULL;
+ if (kiocb)
+ ret = kiocb_cancel(ctx, kiocb, &res);
+ else
+ ret = -EINVAL;
+
spin_unlock_irq(&ctx->ctx_lock);
- if (NULL != cancel) {
- struct io_event tmp;
- pr_debug("calling cancel\n");
- memset(&tmp, 0, sizeof(tmp));
- tmp.obj = (u64)(unsigned long)kiocb->ki_obj.user;
- tmp.data = kiocb->ki_user_data;
- ret = cancel(kiocb, &tmp);
- if (!ret) {
- /* Cancellation succeeded -- copy the result
- * into the user's buffer.
- */
- if (copy_to_user(result, &tmp, sizeof(tmp)))
- ret = -EFAULT;
- }
- } else
- ret = -EINVAL;
+ if (!ret) {
+ /* Cancellation succeeded -- copy the result
+ * into the user's buffer.
+ */
+ if (copy_to_user(result, &res, sizeof(res)))
+ ret = -EFAULT;
+ }
put_ioctx(ctx);
diff --git a/fs/bio.c b/fs/bio.c
index b96fc6c..954d731 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -19,6 +19,7 @@
#include <linux/swap.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
+#include <linux/uio.h>
#include <linux/iocontext.h>
#include <linux/slab.h>
#include <linux/init.h>
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 3823d3f..d9871c1 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -27,6 +27,7 @@
#include <linux/namei.h>
#include <linux/log2.h>
#include <linux/cleancache.h>
+#include <linux/aio.h>
#include <asm/uaccess.h>
#include "internal.h"
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index bb8b7a0..bc4d54c 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -24,6 +24,7 @@
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/mpage.h>
+#include <linux/aio.h>
#include <linux/falloc.h>
#include <linux/swap.h>
#include <linux/writeback.h>
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 09c58a3..898da0a 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -32,6 +32,7 @@
#include <linux/writeback.h>
#include <linux/statfs.h>
#include <linux/compat.h>
+#include <linux/aio.h>
#include <linux/bit_spinlock.h>
#include <linux/xattr.h>
#include <linux/posix_acl.h>
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index d70830c..656e169 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -7,6 +7,7 @@
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/writeback.h>
+#include <linux/aio.h>
#include "super.h"
#include "mds_client.h"
diff --git a/fs/compat.c b/fs/compat.c
index 93f7d02..fc3b55d 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -47,6 +47,7 @@
#include <linux/fs_struct.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
+#include <linux/aio.h>
#include <asm/uaccess.h>
#include <asm/mmu_context.h>
diff --git a/fs/direct-io.c b/fs/direct-io.c
index cfb816d..51d16e0 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -37,6 +37,7 @@
#include <linux/uio.h>
#include <linux/atomic.h>
#include <linux/prefetch.h>
+#include <linux/aio.h>
/*
* How many user pages to map in one call to get_user_pages(). This determines
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 63b1f54..201f0a0 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -31,6 +31,7 @@
#include <linux/security.h>
#include <linux/compat.h>
#include <linux/fs_stack.h>
+#include <linux/aio.h>
#include "ecryptfs_kernel.h"
/**
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index fe60cc1..0a87bb1 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -31,6 +31,7 @@
#include <linux/mpage.h>
#include <linux/fiemap.h>
#include <linux/namei.h>
+#include <linux/aio.h>
#include "ext2.h"
#include "acl.h"
#include "xip.h"
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index d706dbf..23c7128 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -27,6 +27,7 @@
#include <linux/writeback.h>
#include <linux/mpage.h>
#include <linux/namei.h>
+#include <linux/aio.h>
#include "ext3.h"
#include "xattr.h"
#include "acl.h"
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 64848b5..4959e29 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -23,6 +23,7 @@
#include <linux/jbd2.h>
#include <linux/mount.h>
#include <linux/path.h>
+#include <linux/aio.h>
#include <linux/quotaops.h>
#include <linux/pagevec.h>
#include "ext4.h"
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 98be6f6..b8d5d35 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -20,6 +20,7 @@
* (sct@redhat.com), 1993, 1998
*/
+#include <linux/aio.h>
#include "ext4_jbd2.h"
#include "truncate.h"
#include "ext4_extents.h" /* Needed for EXT_MAX_BLOCKS */
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 793d44b..0723774 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -37,6 +37,7 @@
#include <linux/printk.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
+#include <linux/aio.h>
#include "ext4_jbd2.h"
#include "xattr.h"
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 5929cd0..19599bd 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -18,6 +18,7 @@
#include <linux/pagevec.h>
#include <linux/mpage.h>
#include <linux/namei.h>
+#include <linux/aio.h>
#include <linux/uio.h>
#include <linux/bio.h>
#include <linux/workqueue.h>
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 7bd22a2..d0ed4ba 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -12,6 +12,7 @@
#include <linux/f2fs_fs.h>
#include <linux/buffer_head.h>
#include <linux/mpage.h>
+#include <linux/aio.h>
#include <linux/writeback.h>
#include <linux/backing-dev.h>
#include <linux/blkdev.h>
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 4ff9016..dfce656 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -19,6 +19,7 @@
#include <linux/mpage.h>
#include <linux/buffer_head.h>
#include <linux/mount.h>
+#include <linux/aio.h>
#include <linux/vfs.h>
#include <linux/parser.h>
#include <linux/uio.h>
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index b3aaf7b..aef34b1 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -38,6 +38,7 @@
#include <linux/device.h>
#include <linux/file.h>
#include <linux/fs.h>
+#include <linux/aio.h>
#include <linux/kdev_t.h>
#include <linux/kthread.h>
#include <linux/list.h>
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index a6c1664..1d55f94 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -19,6 +19,7 @@
#include <linux/pipe_fs_i.h>
#include <linux/swap.h>
#include <linux/splice.h>
+#include <linux/aio.h>
MODULE_ALIAS_MISCDEV(FUSE_MINOR);
MODULE_ALIAS("devname:fuse");
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 4655e59..d1c9b85 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -15,6 +15,7 @@
#include <linux/module.h>
#include <linux/compat.h>
#include <linux/swap.h>
+#include <linux/aio.h>
static const struct file_operations fuse_direct_io_file_operations;
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 9883694..0bad69e 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -20,6 +20,7 @@
#include <linux/swap.h>
#include <linux/gfs2_ondisk.h>
#include <linux/backing-dev.h>
+#include <linux/aio.h>
#include "gfs2.h"
#include "incore.h"
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index d79c2da..acd1676 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -25,6 +25,7 @@
#include <asm/uaccess.h>
#include <linux/dlm.h>
#include <linux/dlm_plock.h>
+#include <linux/aio.h>
#include "gfs2.h"
#include "incore.h"
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index 716e1aa..f9299d8 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -14,6 +14,7 @@
#include <linux/pagemap.h>
#include <linux/mpage.h>
#include <linux/sched.h>
+#include <linux/aio.h>
#include "hfs_fs.h"
#include "btree.h"
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 7faaa96..f833d35 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -14,6 +14,7 @@
#include <linux/pagemap.h>
#include <linux/mpage.h>
#include <linux/sched.h>
+#include <linux/aio.h>
#include "hfsplus_fs.h"
#include "hfsplus_raw.h"
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 523464e..a3f868a 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -909,11 +909,8 @@
static int get_hstate_idx(int page_size_log)
{
- struct hstate *h;
+ struct hstate *h = hstate_sizelog(page_size_log);
- if (!page_size_log)
- return default_hstate_idx;
- h = size_to_hstate(1 << page_size_log);
if (!h)
return -1;
return h - hstates;
@@ -929,9 +926,12 @@
.d_dname = hugetlb_dname
};
-struct file *hugetlb_file_setup(const char *name, unsigned long addr,
- size_t size, vm_flags_t acctflag,
- struct user_struct **user,
+/*
+ * Note that size should be aligned to proper hugepage size in caller side,
+ * otherwise hugetlb_reserve_pages reserves one less hugepages than intended.
+ */
+struct file *hugetlb_file_setup(const char *name, size_t size,
+ vm_flags_t acctflag, struct user_struct **user,
int creat_flags, int page_size_log)
{
struct file *file = ERR_PTR(-ENOMEM);
@@ -939,8 +939,6 @@
struct path path;
struct super_block *sb;
struct qstr quick_string;
- struct hstate *hstate;
- unsigned long num_pages;
int hstate_idx;
hstate_idx = get_hstate_idx(page_size_log);
@@ -980,12 +978,10 @@
if (!inode)
goto out_dentry;
- hstate = hstate_inode(inode);
- size += addr & ~huge_page_mask(hstate);
- num_pages = ALIGN(size, huge_page_size(hstate)) >>
- huge_page_shift(hstate);
file = ERR_PTR(-ENOMEM);
- if (hugetlb_reserve_pages(inode, 0, num_pages, NULL, acctflag))
+ if (hugetlb_reserve_pages(inode, 0,
+ size >> huge_page_shift(hstate_inode(inode)), NULL,
+ acctflag))
goto out_inode;
d_instantiate(path.dentry, inode);
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index 77554b6..730f24e 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -23,6 +23,7 @@
#include <linux/pagemap.h>
#include <linux/quotaops.h>
#include <linux/writeback.h>
+#include <linux/aio.h>
#include "jfs_incore.h"
#include "jfs_inode.h"
#include "jfs_filsys.h"
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index cf02f55..689fb60 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -25,7 +25,7 @@
#include <linux/gfp.h>
#include <linux/mpage.h>
#include <linux/writeback.h>
-#include <linux/uio.h>
+#include <linux/aio.h>
#include "nilfs.h"
#include "btnode.h"
#include "segment.h"
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 1da4b81..c5670b8 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -27,6 +27,7 @@
#include <linux/swap.h>
#include <linux/uio.h>
#include <linux/writeback.h>
+#include <linux/aio.h>
#include <asm/page.h>
#include <asm/uaccess.h>
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index d3e118c..2778b02 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -28,6 +28,7 @@
#include <linux/quotaops.h>
#include <linux/slab.h>
#include <linux/log2.h>
+#include <linux/aio.h>
#include "aops.h"
#include "attrib.h"
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index ffb2da3..f671e49 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -22,6 +22,8 @@
#ifndef OCFS2_AOPS_H
#define OCFS2_AOPS_H
+#include <linux/aio.h>
+
handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
struct page *page,
unsigned from,
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 12ae194..3a44a64 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -2322,7 +2322,7 @@
status = __ocfs2_cluster_lock(osb, lockres, level, dlm_flags,
arg_flags, subclass, _RET_IP_);
if (status < 0) {
- if (status != -EAGAIN && status != -EIOCBRETRY)
+ if (status != -EAGAIN)
mlog_errno(status);
goto bail;
}
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 88924a3..621fc73 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -147,8 +147,6 @@
int ocfs2_mark_inode_dirty(handle_t *handle,
struct inode *inode,
struct buffer_head *bh);
-int ocfs2_aio_read(struct file *file, struct kiocb *req, struct iocb *iocb);
-int ocfs2_aio_write(struct file *file, struct kiocb *req, struct iocb *iocb);
struct buffer_head *ocfs2_bread(struct inode *inode,
int block, int *err, int reada);
diff --git a/fs/pipe.c b/fs/pipe.c
index a029a14..d2c45e1 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -21,6 +21,7 @@
#include <linux/audit.h>
#include <linux/syscalls.h>
#include <linux/fcntl.h>
+#include <linux/aio.h>
#include <asm/uaccess.h>
#include <asm/ioctls.h>
diff --git a/fs/read_write.c b/fs/read_write.c
index 90ba3b3..0343000 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -9,6 +9,7 @@
#include <linux/fcntl.h>
#include <linux/file.h>
#include <linux/uio.h>
+#include <linux/aio.h>
#include <linux/fsnotify.h>
#include <linux/security.h>
#include <linux/export.h>
@@ -329,16 +330,6 @@
return count > MAX_RW_COUNT ? MAX_RW_COUNT : count;
}
-static void wait_on_retry_sync_kiocb(struct kiocb *iocb)
-{
- set_current_state(TASK_UNINTERRUPTIBLE);
- if (!kiocbIsKicked(iocb))
- schedule();
- else
- kiocbClearKicked(iocb);
- __set_current_state(TASK_RUNNING);
-}
-
ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
{
struct iovec iov = { .iov_base = buf, .iov_len = len };
@@ -350,13 +341,7 @@
kiocb.ki_left = len;
kiocb.ki_nbytes = len;
- for (;;) {
- ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
- if (ret != -EIOCBRETRY)
- break;
- wait_on_retry_sync_kiocb(&kiocb);
- }
-
+ ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
if (-EIOCBQUEUED == ret)
ret = wait_on_sync_kiocb(&kiocb);
*ppos = kiocb.ki_pos;
@@ -406,13 +391,7 @@
kiocb.ki_left = len;
kiocb.ki_nbytes = len;
- for (;;) {
- ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
- if (ret != -EIOCBRETRY)
- break;
- wait_on_retry_sync_kiocb(&kiocb);
- }
-
+ ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
if (-EIOCBQUEUED == ret)
ret = wait_on_sync_kiocb(&kiocb);
*ppos = kiocb.ki_pos;
@@ -592,13 +571,7 @@
kiocb.ki_left = len;
kiocb.ki_nbytes = len;
- for (;;) {
- ret = fn(&kiocb, iov, nr_segs, kiocb.ki_pos);
- if (ret != -EIOCBRETRY)
- break;
- wait_on_retry_sync_kiocb(&kiocb);
- }
-
+ ret = fn(&kiocb, iov, nr_segs, kiocb.ki_pos);
if (ret == -EIOCBQUEUED)
ret = wait_on_sync_kiocb(&kiocb);
*ppos = kiocb.ki_pos;
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index ea5061fd..77d6d47 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -18,6 +18,7 @@
#include <linux/writeback.h>
#include <linux/quotaops.h>
#include <linux/swap.h>
+#include <linux/aio.h>
int reiserfs_commit_write(struct file *f, struct page *page,
unsigned from, unsigned to);
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index f12189d..1437453 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -50,6 +50,7 @@
*/
#include "ubifs.h"
+#include <linux/aio.h>
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/slab.h>
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 7a12e48..b6d15d3 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -38,6 +38,7 @@
#include <linux/slab.h>
#include <linux/crc-itu-t.h>
#include <linux/mpage.h>
+#include <linux/aio.h>
#include "udf_i.h"
#include "udf_sb.h"
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 3244c98..2b2691b 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -31,6 +31,7 @@
#include "xfs_vnodeops.h"
#include "xfs_trace.h"
#include "xfs_bmap.h"
+#include <linux/aio.h>
#include <linux/gfp.h>
#include <linux/mpage.h>
#include <linux/pagevec.h>
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 054d60c..a5f2042 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -36,6 +36,7 @@
#include "xfs_ioctl.h"
#include "xfs_trace.h"
+#include <linux/aio.h>
#include <linux/dcache.h>
#include <linux/falloc.h>
#include <linux/pagevec.h>
diff --git a/include/linux/aio.h b/include/linux/aio.h
index 31ff6db..1bdf965 100644
--- a/include/linux/aio.h
+++ b/include/linux/aio.h
@@ -9,91 +9,32 @@
#include <linux/atomic.h>
-#define AIO_MAXSEGS 4
-#define AIO_KIOGRP_NR_ATOMIC 8
-
struct kioctx;
+struct kiocb;
-/* Notes on cancelling a kiocb:
- * If a kiocb is cancelled, aio_complete may return 0 to indicate
- * that cancel has not yet disposed of the kiocb. All cancel
- * operations *must* call aio_put_req to dispose of the kiocb
- * to guard against races with the completion code.
- */
-#define KIOCB_C_CANCELLED 0x01
-#define KIOCB_C_COMPLETE 0x02
+#define KIOCB_KEY 0
-#define KIOCB_SYNC_KEY (~0U)
-
-/* ki_flags bits */
/*
- * This may be used for cancel/retry serialization in the future, but
- * for now it's unused and we probably don't want modules to even
- * think they can use it.
+ * We use ki_cancel == KIOCB_CANCELLED to indicate that a kiocb has been either
+ * cancelled or completed (this makes a certain amount of sense because
+ * successful cancellation - io_cancel() - does deliver the completion to
+ * userspace).
+ *
+ * And since most things don't implement kiocb cancellation and we'd really like
+ * kiocb completion to be lockless when possible, we use ki_cancel to
+ * synchronize cancellation and completion - we only set it to KIOCB_CANCELLED
+ * with xchg() or cmpxchg(), see batch_complete_aio() and kiocb_cancel().
*/
-/* #define KIF_LOCKED 0 */
-#define KIF_KICKED 1
-#define KIF_CANCELLED 2
+#define KIOCB_CANCELLED ((void *) (~0ULL))
-#define kiocbTryLock(iocb) test_and_set_bit(KIF_LOCKED, &(iocb)->ki_flags)
-#define kiocbTryKick(iocb) test_and_set_bit(KIF_KICKED, &(iocb)->ki_flags)
+typedef int (kiocb_cancel_fn)(struct kiocb *, struct io_event *);
-#define kiocbSetLocked(iocb) set_bit(KIF_LOCKED, &(iocb)->ki_flags)
-#define kiocbSetKicked(iocb) set_bit(KIF_KICKED, &(iocb)->ki_flags)
-#define kiocbSetCancelled(iocb) set_bit(KIF_CANCELLED, &(iocb)->ki_flags)
-
-#define kiocbClearLocked(iocb) clear_bit(KIF_LOCKED, &(iocb)->ki_flags)
-#define kiocbClearKicked(iocb) clear_bit(KIF_KICKED, &(iocb)->ki_flags)
-#define kiocbClearCancelled(iocb) clear_bit(KIF_CANCELLED, &(iocb)->ki_flags)
-
-#define kiocbIsLocked(iocb) test_bit(KIF_LOCKED, &(iocb)->ki_flags)
-#define kiocbIsKicked(iocb) test_bit(KIF_KICKED, &(iocb)->ki_flags)
-#define kiocbIsCancelled(iocb) test_bit(KIF_CANCELLED, &(iocb)->ki_flags)
-
-/* is there a better place to document function pointer methods? */
-/**
- * ki_retry - iocb forward progress callback
- * @kiocb: The kiocb struct to advance by performing an operation.
- *
- * This callback is called when the AIO core wants a given AIO operation
- * to make forward progress. The kiocb argument describes the operation
- * that is to be performed. As the operation proceeds, perhaps partially,
- * ki_retry is expected to update the kiocb with progress made. Typically
- * ki_retry is set in the AIO core and it itself calls file_operations
- * helpers.
- *
- * ki_retry's return value determines when the AIO operation is completed
- * and an event is generated in the AIO event ring. Except the special
- * return values described below, the value that is returned from ki_retry
- * is transferred directly into the completion ring as the operation's
- * resulting status. Once this has happened ki_retry *MUST NOT* reference
- * the kiocb pointer again.
- *
- * If ki_retry returns -EIOCBQUEUED it has made a promise that aio_complete()
- * will be called on the kiocb pointer in the future. The AIO core will
- * not ask the method again -- ki_retry must ensure forward progress.
- * aio_complete() must be called once and only once in the future, multiple
- * calls may result in undefined behaviour.
- *
- * If ki_retry returns -EIOCBRETRY it has made a promise that kick_iocb()
- * will be called on the kiocb pointer in the future. This may happen
- * through generic helpers that associate kiocb->ki_wait with a wait
- * queue head that ki_retry uses via current->io_wait. It can also happen
- * with custom tracking and manual calls to kick_iocb(), though that is
- * discouraged. In either case, kick_iocb() must be called once and only
- * once. ki_retry must ensure forward progress, the AIO core will wait
- * indefinitely for kick_iocb() to be called.
- */
struct kiocb {
- struct list_head ki_run_list;
- unsigned long ki_flags;
- int ki_users;
- unsigned ki_key; /* id of this request */
+ atomic_t ki_users;
struct file *ki_filp;
- struct kioctx *ki_ctx; /* may be NULL for sync ops */
- int (*ki_cancel)(struct kiocb *, struct io_event *);
- ssize_t (*ki_retry)(struct kiocb *);
+ struct kioctx *ki_ctx; /* NULL for sync ops */
+ kiocb_cancel_fn *ki_cancel;
void (*ki_dtor)(struct kiocb *);
union {
@@ -117,7 +58,6 @@
struct list_head ki_list; /* the aio core uses this
* for cancellation */
- struct list_head ki_batch; /* batch allocation */
/*
* If the aio_resfd field of the userspace iocb is not zero,
@@ -128,106 +68,40 @@
static inline bool is_sync_kiocb(struct kiocb *kiocb)
{
- return kiocb->ki_key == KIOCB_SYNC_KEY;
+ return kiocb->ki_ctx == NULL;
}
static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp)
{
*kiocb = (struct kiocb) {
- .ki_users = 1,
- .ki_key = KIOCB_SYNC_KEY,
+ .ki_users = ATOMIC_INIT(1),
+ .ki_ctx = NULL,
.ki_filp = filp,
.ki_obj.tsk = current,
};
}
-#define AIO_RING_MAGIC 0xa10a10a1
-#define AIO_RING_COMPAT_FEATURES 1
-#define AIO_RING_INCOMPAT_FEATURES 0
-struct aio_ring {
- unsigned id; /* kernel internal index number */
- unsigned nr; /* number of io_events */
- unsigned head;
- unsigned tail;
-
- unsigned magic;
- unsigned compat_features;
- unsigned incompat_features;
- unsigned header_length; /* size of aio_ring */
-
-
- struct io_event io_events[0];
-}; /* 128 bytes + ring size */
-
-#define AIO_RING_PAGES 8
-struct aio_ring_info {
- unsigned long mmap_base;
- unsigned long mmap_size;
-
- struct page **ring_pages;
- spinlock_t ring_lock;
- long nr_pages;
-
- unsigned nr, tail;
-
- struct page *internal_pages[AIO_RING_PAGES];
-};
-
-static inline unsigned aio_ring_avail(struct aio_ring_info *info,
- struct aio_ring *ring)
-{
- return (ring->head + info->nr - 1 - ring->tail) % info->nr;
-}
-
-struct kioctx {
- atomic_t users;
- int dead;
- struct mm_struct *mm;
-
- /* This needs improving */
- unsigned long user_id;
- struct hlist_node list;
-
- wait_queue_head_t wait;
-
- spinlock_t ctx_lock;
-
- int reqs_active;
- struct list_head active_reqs; /* used for cancellation */
- struct list_head run_list; /* used for kicked reqs */
-
- /* sys_io_setup currently limits this to an unsigned int */
- unsigned max_reqs;
-
- struct aio_ring_info ring_info;
-
- struct delayed_work wq;
-
- struct rcu_head rcu_head;
-};
-
/* prototypes */
-extern unsigned aio_max_size;
-
#ifdef CONFIG_AIO
extern ssize_t wait_on_sync_kiocb(struct kiocb *iocb);
-extern int aio_put_req(struct kiocb *iocb);
-extern void kick_iocb(struct kiocb *iocb);
-extern int aio_complete(struct kiocb *iocb, long res, long res2);
+extern void aio_put_req(struct kiocb *iocb);
+extern void aio_complete(struct kiocb *iocb, long res, long res2);
struct mm_struct;
extern void exit_aio(struct mm_struct *mm);
extern long do_io_submit(aio_context_t ctx_id, long nr,
struct iocb __user *__user *iocbpp, bool compat);
+void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel);
#else
static inline ssize_t wait_on_sync_kiocb(struct kiocb *iocb) { return 0; }
-static inline int aio_put_req(struct kiocb *iocb) { return 0; }
-static inline void kick_iocb(struct kiocb *iocb) { }
-static inline int aio_complete(struct kiocb *iocb, long res, long res2) { return 0; }
+static inline void aio_put_req(struct kiocb *iocb) { }
+static inline void aio_complete(struct kiocb *iocb, long res, long res2) { }
struct mm_struct;
static inline void exit_aio(struct mm_struct *mm) { }
static inline long do_io_submit(aio_context_t ctx_id, long nr,
struct iocb __user * __user *iocbpp,
bool compat) { return 0; }
+static inline void kiocb_set_cancel_fn(struct kiocb *req,
+ kiocb_cancel_fn *cancel) { }
#endif /* CONFIG_AIO */
static inline struct kiocb *list_kiocb(struct list_head *h)
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 3bff9ce..5047355 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -28,6 +28,7 @@
struct inode;
struct cgroup;
struct css_id;
+struct eventfd_ctx;
extern int cgroup_init_early(void);
extern int cgroup_init(void);
diff --git a/include/linux/errno.h b/include/linux/errno.h
index f6bf082..89627b9 100644
--- a/include/linux/errno.h
+++ b/include/linux/errno.h
@@ -28,6 +28,5 @@
#define EBADTYPE 527 /* Type not supported by server */
#define EJUKEBOX 528 /* Request initiated, but will not complete before timeout */
#define EIOCBQUEUED 529 /* iocb queued, will get completion event */
-#define EIOCBRETRY 530 /* iocb queued, will trigger a retry */
#endif
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 3a62df3..6b4890f 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -189,8 +189,7 @@
extern const struct file_operations hugetlbfs_file_operations;
extern const struct vm_operations_struct hugetlb_vm_ops;
-struct file *hugetlb_file_setup(const char *name, unsigned long addr,
- size_t size, vm_flags_t acct,
+struct file *hugetlb_file_setup(const char *name, size_t size, vm_flags_t acct,
struct user_struct **user, int creat_flags,
int page_size_log);
@@ -209,8 +208,8 @@
#define is_file_hugepages(file) 0
static inline struct file *
-hugetlb_file_setup(const char *name, unsigned long addr, size_t size,
- vm_flags_t acctflag, struct user_struct **user, int creat_flags,
+hugetlb_file_setup(const char *name, size_t size, vm_flags_t acctflag,
+ struct user_struct **user, int creat_flags,
int page_size_log)
{
return ERR_PTR(-ENOSYS);
@@ -288,6 +287,13 @@
return hstate_inode(file_inode(f));
}
+static inline struct hstate *hstate_sizelog(int page_size_log)
+{
+ if (!page_size_log)
+ return &default_hstate;
+ return size_to_hstate(1 << page_size_log);
+}
+
static inline struct hstate *hstate_vma(struct vm_area_struct *vma)
{
return hstate_file(vma->vm_file);
@@ -352,11 +358,12 @@
return h - hstates;
}
-#else
+#else /* CONFIG_HUGETLB_PAGE */
struct hstate {};
#define alloc_huge_page_node(h, nid) NULL
#define alloc_bootmem_huge_page(h) NULL
#define hstate_file(f) NULL
+#define hstate_sizelog(s) NULL
#define hstate_vma(v) NULL
#define hstate_inode(i) NULL
#define huge_page_size(h) PAGE_SIZE
@@ -371,6 +378,6 @@
}
#define hstate_index_to_shift(index) 0
#define hstate_index(h) 0
-#endif
+#endif /* CONFIG_HUGETLB_PAGE */
#endif /* _LINUX_HUGETLB_H */
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 1a7f19e..e0c8528 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -951,13 +951,19 @@
* (see walk_page_range for more details)
*/
struct mm_walk {
- int (*pgd_entry)(pgd_t *, unsigned long, unsigned long, struct mm_walk *);
- int (*pud_entry)(pud_t *, unsigned long, unsigned long, struct mm_walk *);
- int (*pmd_entry)(pmd_t *, unsigned long, unsigned long, struct mm_walk *);
- int (*pte_entry)(pte_t *, unsigned long, unsigned long, struct mm_walk *);
- int (*pte_hole)(unsigned long, unsigned long, struct mm_walk *);
- int (*hugetlb_entry)(pte_t *, unsigned long,
- unsigned long, unsigned long, struct mm_walk *);
+ int (*pgd_entry)(pgd_t *pgd, unsigned long addr,
+ unsigned long next, struct mm_walk *walk);
+ int (*pud_entry)(pud_t *pud, unsigned long addr,
+ unsigned long next, struct mm_walk *walk);
+ int (*pmd_entry)(pmd_t *pmd, unsigned long addr,
+ unsigned long next, struct mm_walk *walk);
+ int (*pte_entry)(pte_t *pte, unsigned long addr,
+ unsigned long next, struct mm_walk *walk);
+ int (*pte_hole)(unsigned long addr, unsigned long next,
+ struct mm_walk *walk);
+ int (*hugetlb_entry)(pte_t *pte, unsigned long hmask,
+ unsigned long addr, unsigned long next,
+ struct mm_walk *walk);
struct mm_struct *mm;
void *private;
};
diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
index 731e4ec..e277266 100644
--- a/include/linux/pid_namespace.h
+++ b/include/linux/pid_namespace.h
@@ -4,6 +4,7 @@
#include <linux/sched.h>
#include <linux/bug.h>
#include <linux/mm.h>
+#include <linux/workqueue.h>
#include <linux/threads.h>
#include <linux/nsproxy.h>
#include <linux/kref.h>
diff --git a/include/linux/random.h b/include/linux/random.h
index 347ce55..3b9377d 100644
--- a/include/linux/random.h
+++ b/include/linux/random.h
@@ -29,13 +29,6 @@
void prandom_bytes(void *buf, int nbytes);
void prandom_seed(u32 seed);
-/*
- * These macros are preserved for backward compatibility and should be
- * removed as soon as a transition is finished.
- */
-#define random32() prandom_u32()
-#define srandom32(seed) prandom_seed(seed)
-
u32 prandom_u32_state(struct rnd_state *);
void prandom_bytes_state(struct rnd_state *state, void *buf, int nbytes);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4800e9d..022c085 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -313,8 +313,6 @@
struct nsproxy;
struct user_namespace;
-#include <linux/aio.h>
-
#ifdef CONFIG_MMU
extern void arch_pick_mmap_layout(struct mm_struct *mm);
extern unsigned long
diff --git a/include/linux/wait.h b/include/linux/wait.h
index 7cb64d4..ac38be2 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -330,6 +330,92 @@
__ret; \
})
+#define __wait_event_hrtimeout(wq, condition, timeout, state) \
+({ \
+ int __ret = 0; \
+ DEFINE_WAIT(__wait); \
+ struct hrtimer_sleeper __t; \
+ \
+ hrtimer_init_on_stack(&__t.timer, CLOCK_MONOTONIC, \
+ HRTIMER_MODE_REL); \
+ hrtimer_init_sleeper(&__t, current); \
+ if ((timeout).tv64 != KTIME_MAX) \
+ hrtimer_start_range_ns(&__t.timer, timeout, \
+ current->timer_slack_ns, \
+ HRTIMER_MODE_REL); \
+ \
+ for (;;) { \
+ prepare_to_wait(&wq, &__wait, state); \
+ if (condition) \
+ break; \
+ if (state == TASK_INTERRUPTIBLE && \
+ signal_pending(current)) { \
+ __ret = -ERESTARTSYS; \
+ break; \
+ } \
+ if (!__t.task) { \
+ __ret = -ETIME; \
+ break; \
+ } \
+ schedule(); \
+ } \
+ \
+ hrtimer_cancel(&__t.timer); \
+ destroy_hrtimer_on_stack(&__t.timer); \
+ finish_wait(&wq, &__wait); \
+ __ret; \
+})
+
+/**
+ * wait_event_hrtimeout - sleep until a condition gets true or a timeout elapses
+ * @wq: the waitqueue to wait on
+ * @condition: a C expression for the event to wait for
+ * @timeout: timeout, as a ktime_t
+ *
+ * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the
+ * @condition evaluates to true or a signal is received.
+ * The @condition is checked each time the waitqueue @wq is woken up.
+ *
+ * wake_up() has to be called after changing any variable that could
+ * change the result of the wait condition.
+ *
+ * The function returns 0 if @condition became true, or -ETIME if the timeout
+ * elapsed.
+ */
+#define wait_event_hrtimeout(wq, condition, timeout) \
+({ \
+ int __ret = 0; \
+ if (!(condition)) \
+ __ret = __wait_event_hrtimeout(wq, condition, timeout, \
+ TASK_UNINTERRUPTIBLE); \
+ __ret; \
+})
+
+/**
+ * wait_event_interruptible_hrtimeout - sleep until a condition gets true or a timeout elapses
+ * @wq: the waitqueue to wait on
+ * @condition: a C expression for the event to wait for
+ * @timeout: timeout, as a ktime_t
+ *
+ * The process is put to sleep (TASK_INTERRUPTIBLE) until the
+ * @condition evaluates to true or a signal is received.
+ * The @condition is checked each time the waitqueue @wq is woken up.
+ *
+ * wake_up() has to be called after changing any variable that could
+ * change the result of the wait condition.
+ *
+ * The function returns 0 if @condition became true, -ERESTARTSYS if it was
+ * interrupted by a signal, or -ETIME if the timeout elapsed.
+ */
+#define wait_event_interruptible_hrtimeout(wq, condition, timeout) \
+({ \
+ long __ret = 0; \
+ if (!(condition)) \
+ __ret = __wait_event_hrtimeout(wq, condition, timeout, \
+ TASK_INTERRUPTIBLE); \
+ __ret; \
+})
+
#define __wait_event_interruptible_exclusive(wq, condition, ret) \
do { \
DEFINE_WAIT(__wait); \
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 9a9367c..579a500 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -5,6 +5,7 @@
#define WRITEBACK_H
#include <linux/sched.h>
+#include <linux/workqueue.h>
#include <linux/fs.h>
DECLARE_PER_CPU(int, dirty_throttle_leaks);
diff --git a/ipc/shm.c b/ipc/shm.c
index 8247c49..34af1fe 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -491,10 +491,14 @@
sprintf (name, "SYSV%08x", key);
if (shmflg & SHM_HUGETLB) {
+ struct hstate *hs = hstate_sizelog((shmflg >> SHM_HUGE_SHIFT)
+ & SHM_HUGE_MASK);
+ size_t hugesize = ALIGN(size, huge_page_size(hs));
+
/* hugetlb_file_setup applies strict accounting */
if (shmflg & SHM_NORESERVE)
acctflag = VM_NORESERVE;
- file = hugetlb_file_setup(name, 0, size, acctflag,
+ file = hugetlb_file_setup(name, hugesize, acctflag,
&shp->mlock_user, HUGETLB_SHMFS_INODE,
(shmflg >> SHM_HUGE_SHIFT) & SHM_HUGE_MASK);
} else {
diff --git a/kernel/fork.c b/kernel/fork.c
index 7d40687..c509cc4 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -70,6 +70,7 @@
#include <linux/khugepaged.h>
#include <linux/signalfd.h>
#include <linux/uprobes.h>
+#include <linux/aio.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
diff --git a/kernel/printk.c b/kernel/printk.c
index 96dcfcd..fa36e14 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -32,6 +32,7 @@
#include <linux/security.h>
#include <linux/bootmem.h>
#include <linux/memblock.h>
+#include <linux/aio.h>
#include <linux/syscalls.h>
#include <linux/kexec.h>
#include <linux/kdb.h>
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 17ae54d..aed981a 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -17,6 +17,7 @@
#include <linux/ptrace.h>
#include <linux/security.h>
#include <linux/signal.h>
+#include <linux/uio.h>
#include <linux/audit.h>
#include <linux/pid_namespace.h>
#include <linux/syscalls.h>
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 0f1d921..cb1c9de 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -92,16 +92,18 @@
/*
* For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
*/
- MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */
- MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */
- MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */
- MEM_CGROUP_STAT_SWAP, /* # of pages, swapped out */
+ MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */
+ MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */
+ MEM_CGROUP_STAT_RSS_HUGE, /* # of pages charged as anon huge */
+ MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */
+ MEM_CGROUP_STAT_SWAP, /* # of pages, swapped out */
MEM_CGROUP_STAT_NSTATS,
};
static const char * const mem_cgroup_stat_names[] = {
"cache",
"rss",
+ "rss_huge",
"mapped_file",
"swap",
};
@@ -917,6 +919,7 @@
}
static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
+ struct page *page,
bool anon, int nr_pages)
{
preempt_disable();
@@ -932,6 +935,10 @@
__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],
nr_pages);
+ if (PageTransHuge(page))
+ __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
+ nr_pages);
+
/* pagein of a big page is an event. So, ignore page size */
if (nr_pages > 0)
__this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]);
@@ -2914,7 +2921,7 @@
else
anon = false;
- mem_cgroup_charge_statistics(memcg, anon, nr_pages);
+ mem_cgroup_charge_statistics(memcg, page, anon, nr_pages);
unlock_page_cgroup(pc);
/*
@@ -3708,16 +3715,21 @@
{
struct page_cgroup *head_pc = lookup_page_cgroup(head);
struct page_cgroup *pc;
+ struct mem_cgroup *memcg;
int i;
if (mem_cgroup_disabled())
return;
+
+ memcg = head_pc->mem_cgroup;
for (i = 1; i < HPAGE_PMD_NR; i++) {
pc = head_pc + i;
- pc->mem_cgroup = head_pc->mem_cgroup;
+ pc->mem_cgroup = memcg;
smp_wmb();/* see __commit_charge() */
pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT;
}
+ __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
+ HPAGE_PMD_NR);
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
@@ -3773,11 +3785,11 @@
__this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
preempt_enable();
}
- mem_cgroup_charge_statistics(from, anon, -nr_pages);
+ mem_cgroup_charge_statistics(from, page, anon, -nr_pages);
/* caller should have done css_get */
pc->mem_cgroup = to;
- mem_cgroup_charge_statistics(to, anon, nr_pages);
+ mem_cgroup_charge_statistics(to, page, anon, nr_pages);
move_unlock_mem_cgroup(from, &flags);
ret = 0;
unlock:
@@ -4152,7 +4164,7 @@
break;
}
- mem_cgroup_charge_statistics(memcg, anon, -nr_pages);
+ mem_cgroup_charge_statistics(memcg, page, anon, -nr_pages);
ClearPageCgroupUsed(pc);
/*
@@ -4502,7 +4514,7 @@
lock_page_cgroup(pc);
if (PageCgroupUsed(pc)) {
memcg = pc->mem_cgroup;
- mem_cgroup_charge_statistics(memcg, false, -1);
+ mem_cgroup_charge_statistics(memcg, oldpage, false, -1);
ClearPageCgroupUsed(pc);
}
unlock_page_cgroup(pc);
@@ -5030,6 +5042,10 @@
return res_counter_read_u64(&memcg->memsw, RES_USAGE);
}
+ /*
+ * Transparent hugepages are still accounted for in MEM_CGROUP_STAT_RSS
+ * as well as in MEM_CGROUP_STAT_RSS_HUGE.
+ */
val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE);
val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS);
diff --git a/mm/mmap.c b/mm/mmap.c
index da3e9c0..1ae21d6 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1363,15 +1363,20 @@
file = fget(fd);
if (!file)
goto out;
+ if (is_file_hugepages(file))
+ len = ALIGN(len, huge_page_size(hstate_file(file)));
} else if (flags & MAP_HUGETLB) {
struct user_struct *user = NULL;
+
+ len = ALIGN(len, huge_page_size(hstate_sizelog(
+ (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK)));
/*
* VM_NORESERVE is used because the reservations will be
* taken when vm_ops->mmap() is called
* A dummy user value is used because we are not locking
* memory so no accounting is necessary
*/
- file = hugetlb_file_setup(HUGETLB_ANON_FILE, addr, len,
+ file = hugetlb_file_setup(HUGETLB_ANON_FILE, len,
VM_NORESERVE,
&user, HUGETLB_ANONHUGE_INODE,
(flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
diff --git a/mm/mmu_context.c b/mm/mmu_context.c
index 3dcfaf4..8a8cd02 100644
--- a/mm/mmu_context.c
+++ b/mm/mmu_context.c
@@ -14,9 +14,6 @@
* use_mm
* Makes the calling kernel thread take on the specified
* mm context.
- * Called by the retry thread execute retries within the
- * iocb issuer's mm context, so that copy_from/to_user
- * operations work seamlessly for aio.
* (Note: this routine is intended to be called only
* from a kernel thread context)
*/
diff --git a/mm/page_io.c b/mm/page_io.c
index bb5d752..06a8842 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -20,6 +20,7 @@
#include <linux/buffer_head.h>
#include <linux/writeback.h>
#include <linux/frontswap.h>
+#include <linux/aio.h>
#include <asm/pgtable.h>
static struct bio *get_swap_bio(gfp_t gfp_flags,
diff --git a/mm/shmem.c b/mm/shmem.c
index 39b2a0b..5e6a842 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -31,6 +31,7 @@
#include <linux/mm.h>
#include <linux/export.h>
#include <linux/swap.h>
+#include <linux/aio.h>
static struct vfsmount *shm_mnt;
diff --git a/mm/swap.c b/mm/swap.c
index acd40bf..dfd7d71 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -30,6 +30,7 @@
#include <linux/backing-dev.h>
#include <linux/memcontrol.h>
#include <linux/gfp.h>
+#include <linux/uio.h>
#include "internal.h"
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index b12fd86..d365724 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1522,6 +1522,8 @@
* Must not be called in NMI context (strictly speaking, only if we don't
* have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling
* conventions for vfree() arch-depenedent would be a really bad idea)
+ *
+ * NOTE: assumes that the object at *addr has a size >= sizeof(llist_node)
*
*/
void vfree(const void *addr)
diff --git a/security/keys/internal.h b/security/keys/internal.h
index 8bbefc3..d4f1468 100644
--- a/security/keys/internal.h
+++ b/security/keys/internal.h
@@ -16,6 +16,8 @@
#include <linux/key-type.h>
#include <linux/task_work.h>
+struct iovec;
+
#ifdef __KDEBUG
#define kenter(FMT, ...) \
printk(KERN_DEBUG "==> %s("FMT")\n", __func__, ##__VA_ARGS__)
diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c
index 4b5c948..33cfd27 100644
--- a/security/keys/keyctl.c
+++ b/security/keys/keyctl.c
@@ -22,6 +22,7 @@
#include <linux/err.h>
#include <linux/vmalloc.h>
#include <linux/security.h>
+#include <linux/uio.h>
#include <asm/uaccess.h>
#include "internal.h"
diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c
index 23e3c46..ccfa383 100644
--- a/sound/core/pcm_native.c
+++ b/sound/core/pcm_native.c
@@ -25,7 +25,7 @@
#include <linux/slab.h>
#include <linux/time.h>
#include <linux/pm_qos.h>
-#include <linux/uio.h>
+#include <linux/aio.h>
#include <linux/dma-mapping.h>
#include <sound/core.h>
#include <sound/control.h>