From 0134ed4fb9e78672ee9f7b18007114404c81e63f Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Fri, 10 Mar 2017 13:24:22 -0700 Subject: device-dax: fix pmd/pte fault fallback handling Jeff Moyer reports: With a device dax alignment of 4KB or 2MB, I get sigbus when running the attached fio job file for the current kernel (4.11.0-rc1+). If I specify an alignment of 1GB, it works. I turned on debug output, and saw that it was failing in the huge fault code. dax dax1.0: dax_open dax dax1.0: dax_mmap dax dax1.0: dax_dev_huge_fault: fio: write (0x7f08f0a00000 - dax dax1.0: __dax_dev_pud_fault: phys_to_pgoff(0xffffffffcf60 dax dax1.0: dax_release fio config for reproduce: [global] ioengine=dev-dax direct=0 filename=/dev/dax0.0 bs=2m [write] rw=write [read] stonewall rw=read The driver fails to fallback when taking a fault that is larger than the device alignment, or handling a larger fault when a smaller mapping is already established. While we could support larger mappings for a device with a smaller alignment, that change is too large for the immediate fix. The simplest change is to force fallback until the fault size matches the alignment. Fixes: dee410792419 ("/dev/dax, core: file operations and dax-mmap") Cc: Reported-by: Jeff Moyer Signed-off-by: Dave Jiang Signed-off-by: Dan Williams --- drivers/dax/dax.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'drivers/dax') diff --git a/drivers/dax/dax.c b/drivers/dax/dax.c index 8d9829ff2a78..a284dc532e46 100644 --- a/drivers/dax/dax.c +++ b/drivers/dax/dax.c @@ -427,6 +427,7 @@ static int __dax_dev_pte_fault(struct dax_dev *dax_dev, struct vm_fault *vmf) int rc = VM_FAULT_SIGBUS; phys_addr_t phys; pfn_t pfn; + unsigned int fault_size = PAGE_SIZE; if (check_vma(dax_dev, vmf->vma, __func__)) return VM_FAULT_SIGBUS; @@ -437,6 +438,9 @@ static int __dax_dev_pte_fault(struct dax_dev *dax_dev, struct vm_fault *vmf) return VM_FAULT_SIGBUS; } + if (fault_size != dax_region->align) + return VM_FAULT_SIGBUS; + phys = pgoff_to_phys(dax_dev, vmf->pgoff, PAGE_SIZE); if (phys == -1) { dev_dbg(dev, "%s: phys_to_pgoff(%#lx) failed\n", __func__, @@ -464,6 +468,7 @@ static int __dax_dev_pmd_fault(struct dax_dev *dax_dev, struct vm_fault *vmf) phys_addr_t phys; pgoff_t pgoff; pfn_t pfn; + unsigned int fault_size = PMD_SIZE; if (check_vma(dax_dev, vmf->vma, __func__)) return VM_FAULT_SIGBUS; @@ -480,6 +485,16 @@ static int __dax_dev_pmd_fault(struct dax_dev *dax_dev, struct vm_fault *vmf) return VM_FAULT_SIGBUS; } + if (fault_size < dax_region->align) + return VM_FAULT_SIGBUS; + else if (fault_size > dax_region->align) + return VM_FAULT_FALLBACK; + + /* if we are outside of the VMA */ + if (pmd_addr < vmf->vma->vm_start || + (pmd_addr + PMD_SIZE) > vmf->vma->vm_end) + return VM_FAULT_SIGBUS; + pgoff = linear_page_index(vmf->vma, pmd_addr); phys = pgoff_to_phys(dax_dev, pgoff, PMD_SIZE); if (phys == -1) { -- cgit v1.2.3 From 70b085b06c4560a69e95607f77bb4c2b2e41943c Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Fri, 10 Mar 2017 13:24:27 -0700 Subject: device-dax: fix pud fault fallback handling Jeff Moyer reports: With a device dax alignment of 4KB or 2MB, I get sigbus when running the attached fio job file for the current kernel (4.11.0-rc1+). If I specify an alignment of 1GB, it works. I turned on debug output, and saw that it was failing in the huge fault code. dax dax1.0: dax_open dax dax1.0: dax_mmap dax dax1.0: dax_dev_huge_fault: fio: write (0x7f08f0a00000 - dax dax1.0: __dax_dev_pud_fault: phys_to_pgoff(0xffffffffcf60) dax dax1.0: dax_release fio config for reproduce: [global] ioengine=dev-dax direct=0 filename=/dev/dax0.0 bs=2m [write] rw=write [read] stonewall rw=read The driver fails to fallback when taking a fault that is larger than the device alignment, or handling a larger fault when a smaller mapping is already established. While we could support larger mappings for a device with a smaller alignment, that change is too large for the immediate fix. The simplest change is to force fallback until the fault size matches the alignment. Reported-by: Jeff Moyer Signed-off-by: Dave Jiang Signed-off-by: Dan Williams --- drivers/dax/dax.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'drivers/dax') diff --git a/drivers/dax/dax.c b/drivers/dax/dax.c index a284dc532e46..523fecec7bda 100644 --- a/drivers/dax/dax.c +++ b/drivers/dax/dax.c @@ -518,6 +518,8 @@ static int __dax_dev_pud_fault(struct dax_dev *dax_dev, struct vm_fault *vmf) phys_addr_t phys; pgoff_t pgoff; pfn_t pfn; + unsigned int fault_size = PUD_SIZE; + if (check_vma(dax_dev, vmf->vma, __func__)) return VM_FAULT_SIGBUS; @@ -534,6 +536,16 @@ static int __dax_dev_pud_fault(struct dax_dev *dax_dev, struct vm_fault *vmf) return VM_FAULT_SIGBUS; } + if (fault_size < dax_region->align) + return VM_FAULT_SIGBUS; + else if (fault_size > dax_region->align) + return VM_FAULT_FALLBACK; + + /* if we are outside of the VMA */ + if (pud_addr < vmf->vma->vm_start || + (pud_addr + PUD_SIZE) > vmf->vma->vm_end) + return VM_FAULT_SIGBUS; + pgoff = linear_page_index(vmf->vma, pud_addr); phys = pgoff_to_phys(dax_dev, pgoff, PUD_SIZE); if (phys == -1) { -- cgit v1.2.3 From 52084f89b38cdd896b59627c629915ef1a7bf615 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 9 Mar 2017 16:56:01 -0700 Subject: device-dax: fix debug output typo The debug output for return the return data of pgoff_to_phys() in the fault handlers has 'phys' and 'pgoff' incorrectly swapped. Reported-by: Jeff Moyer Signed-off-by: Dave Jiang Signed-off-by: Dan Williams --- drivers/dax/dax.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'drivers/dax') diff --git a/drivers/dax/dax.c b/drivers/dax/dax.c index 523fecec7bda..80c6db279ae1 100644 --- a/drivers/dax/dax.c +++ b/drivers/dax/dax.c @@ -443,7 +443,7 @@ static int __dax_dev_pte_fault(struct dax_dev *dax_dev, struct vm_fault *vmf) phys = pgoff_to_phys(dax_dev, vmf->pgoff, PAGE_SIZE); if (phys == -1) { - dev_dbg(dev, "%s: phys_to_pgoff(%#lx) failed\n", __func__, + dev_dbg(dev, "%s: pgoff_to_phys(%#lx) failed\n", __func__, vmf->pgoff); return VM_FAULT_SIGBUS; } @@ -498,7 +498,7 @@ static int __dax_dev_pmd_fault(struct dax_dev *dax_dev, struct vm_fault *vmf) pgoff = linear_page_index(vmf->vma, pmd_addr); phys = pgoff_to_phys(dax_dev, pgoff, PMD_SIZE); if (phys == -1) { - dev_dbg(dev, "%s: phys_to_pgoff(%#lx) failed\n", __func__, + dev_dbg(dev, "%s: pgoff_to_phys(%#lx) failed\n", __func__, pgoff); return VM_FAULT_SIGBUS; } @@ -549,7 +549,7 @@ static int __dax_dev_pud_fault(struct dax_dev *dax_dev, struct vm_fault *vmf) pgoff = linear_page_index(vmf->vma, pud_addr); phys = pgoff_to_phys(dax_dev, pgoff, PUD_SIZE); if (phys == -1) { - dev_dbg(dev, "%s: phys_to_pgoff(%#lx) failed\n", __func__, + dev_dbg(dev, "%s: pgoff_to_phys(%#lx) failed\n", __func__, pgoff); return VM_FAULT_SIGBUS; } -- cgit v1.2.3 From 956a4cd2c957acf638ff29951aabaa9d8e92bbc2 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 7 Apr 2017 16:42:08 -0700 Subject: device-dax: switch to srcu, fix rcu_read_lock() vs pte allocation The following warning triggers with a new unit test that stresses the device-dax interface. =============================== [ ERR: suspicious RCU usage. ] 4.11.0-rc4+ #1049 Tainted: G O ------------------------------- ./include/linux/rcupdate.h:521 Illegal context switch in RCU read-side critical section! other info that might help us debug this: rcu_scheduler_active = 2, debug_locks = 0 2 locks held by fio/9070: #0: (&mm->mmap_sem){++++++}, at: [] __do_page_fault+0x167/0x4f0 #1: (rcu_read_lock){......}, at: [] dax_dev_huge_fault+0x32/0x620 [dax] Call Trace: dump_stack+0x86/0xc3 lockdep_rcu_suspicious+0xd7/0x110 ___might_sleep+0xac/0x250 __might_sleep+0x4a/0x80 __alloc_pages_nodemask+0x23a/0x360 alloc_pages_current+0xa1/0x1f0 pte_alloc_one+0x17/0x80 __pte_alloc+0x1e/0x120 __get_locked_pte+0x1bf/0x1d0 insert_pfn.isra.70+0x3a/0x100 ? lookup_memtype+0xa6/0xd0 vm_insert_mixed+0x64/0x90 dax_dev_huge_fault+0x520/0x620 [dax] ? dax_dev_huge_fault+0x32/0x620 [dax] dax_dev_fault+0x10/0x20 [dax] __do_fault+0x1e/0x140 __handle_mm_fault+0x9af/0x10d0 handle_mm_fault+0x16d/0x370 ? handle_mm_fault+0x47/0x370 __do_page_fault+0x28c/0x4f0 trace_do_page_fault+0x58/0x2a0 do_async_page_fault+0x1a/0xa0 async_page_fault+0x28/0x30 Inserting a page table entry may trigger an allocation while we are holding a read lock to keep the device instance alive for the duration of the fault. Use srcu for this keep-alive protection. Fixes: dee410792419 ("/dev/dax, core: file operations and dax-mmap") Cc: Signed-off-by: Dan Williams --- drivers/dax/Kconfig | 1 + drivers/dax/dax.c | 13 +++++++------ 2 files changed, 8 insertions(+), 6 deletions(-) (limited to 'drivers/dax') diff --git a/drivers/dax/Kconfig b/drivers/dax/Kconfig index 3e2ab3b14eea..9e95bf94eb13 100644 --- a/drivers/dax/Kconfig +++ b/drivers/dax/Kconfig @@ -2,6 +2,7 @@ menuconfig DEV_DAX tristate "DAX: direct access to differentiated memory" default m if NVDIMM_DAX depends on TRANSPARENT_HUGEPAGE + select SRCU help Support raw access to differentiated (persistence, bandwidth, latency...) memory via an mmap(2) capable character diff --git a/drivers/dax/dax.c b/drivers/dax/dax.c index 80c6db279ae1..806f180c80d8 100644 --- a/drivers/dax/dax.c +++ b/drivers/dax/dax.c @@ -25,6 +25,7 @@ #include "dax.h" static dev_t dax_devt; +DEFINE_STATIC_SRCU(dax_srcu); static struct class *dax_class; static DEFINE_IDA(dax_minor_ida); static int nr_dax = CONFIG_NR_DEV_DAX; @@ -60,7 +61,7 @@ struct dax_region { * @region - parent region * @dev - device backing the character device * @cdev - core chardev data - * @alive - !alive + rcu grace period == no new mappings can be established + * @alive - !alive + srcu grace period == no new mappings can be established * @id - child id in the region * @num_resources - number of physical address extents in this device * @res - array of physical address ranges @@ -569,7 +570,7 @@ static int __dax_dev_pud_fault(struct dax_dev *dax_dev, struct vm_fault *vmf) static int dax_dev_huge_fault(struct vm_fault *vmf, enum page_entry_size pe_size) { - int rc; + int rc, id; struct file *filp = vmf->vma->vm_file; struct dax_dev *dax_dev = filp->private_data; @@ -578,7 +579,7 @@ static int dax_dev_huge_fault(struct vm_fault *vmf, ? "write" : "read", vmf->vma->vm_start, vmf->vma->vm_end); - rcu_read_lock(); + id = srcu_read_lock(&dax_srcu); switch (pe_size) { case PE_SIZE_PTE: rc = __dax_dev_pte_fault(dax_dev, vmf); @@ -592,7 +593,7 @@ static int dax_dev_huge_fault(struct vm_fault *vmf, default: return VM_FAULT_FALLBACK; } - rcu_read_unlock(); + srcu_read_unlock(&dax_srcu, id); return rc; } @@ -713,11 +714,11 @@ static void unregister_dax_dev(void *dev) * Note, rcu is not protecting the liveness of dax_dev, rcu is * ensuring that any fault handlers that might have seen * dax_dev->alive == true, have completed. Any fault handlers - * that start after synchronize_rcu() has started will abort + * that start after synchronize_srcu() has started will abort * upon seeing dax_dev->alive == false. */ dax_dev->alive = false; - synchronize_rcu(); + synchronize_srcu(&dax_srcu); unmap_mapping_range(dax_dev->inode->i_mapping, 0, 0, 1); cdev_del(cdev); device_unregister(dev); -- cgit v1.2.3