From b1c10bea620f79109b5cc9935267bea4f6f29ac6 Mon Sep 17 00:00:00 2001
From: Hillf Danton
Date: Tue, 22 Nov 2011 14:38:03 +0000
Subject: MIPS: Add fast get_user_pages

gup is used in a few places, futex being one example. This work is
derived from the x86 version, with the pte and pmd operations adapted
to the MIPS definitions in a straightforward manner.

[ralf@linux-mips.org: Fixed up reject in arch/mips/mm/Makefile due to
whitespace formatting differences. Fixed build error in gup.c due to
conflicting changes elsewhere in the kernel.]

Signed-off-by: Hillf Danton
Cc: David Daney
Cc: linux-mips@linux-mips.org
Patchwork: https://patchwork.linux-mips.org/patch/2859/
Signed-off-by: Ralf Baechle
---
 arch/mips/mm/Makefile |   4 +-
 arch/mips/mm/gup.c    | 315 ++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 317 insertions(+), 2 deletions(-)
 create mode 100644 arch/mips/mm/gup.c

(limited to 'arch/mips/mm')

diff --git a/arch/mips/mm/Makefile b/arch/mips/mm/Makefile
index 4d8c1623eee..3ca2a065cf7 100644
--- a/arch/mips/mm/Makefile
+++ b/arch/mips/mm/Makefile
@@ -3,8 +3,8 @@
 #
 
 obj-y				+= cache.o dma-default.o extable.o fault.o \
-				   init.o mmap.o tlbex.o tlbex-fault.o uasm.o \
-				   page.o
+				   gup.o init.o mmap.o page.o tlbex.o \
+				   tlbex-fault.o uasm.o
 
 obj-$(CONFIG_32BIT)		+= ioremap.o pgtable-32.o
 obj-$(CONFIG_64BIT)		+= pgtable-64.o

diff --git a/arch/mips/mm/gup.c b/arch/mips/mm/gup.c
new file mode 100644
index 00000000000..33aadbcf170
--- /dev/null
+++ b/arch/mips/mm/gup.c
@@ -0,0 +1,315 @@
+/*
+ * Lockless get_user_pages_fast for MIPS
+ *
+ * Copyright (C) 2008 Nick Piggin
+ * Copyright (C) 2008 Novell Inc.
+ * Copyright (C) 2011 Ralf Baechle
+ */
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+static inline pte_t gup_get_pte(pte_t *ptep)
+{
+#if defined(CONFIG_64BIT_PHYS_ADDR) && defined(CONFIG_CPU_MIPS32)
+	pte_t pte;
+
+retry:
+	pte.pte_low = ptep->pte_low;
+	smp_rmb();
+	pte.pte_high = ptep->pte_high;
+	smp_rmb();
+	if (unlikely(pte.pte_low != ptep->pte_low))
+		goto retry;
+
+	return pte;
+#else
+	return ACCESS_ONCE(*ptep);
+#endif
+}
+
+static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
+			int write, struct page **pages, int *nr)
+{
+	pte_t *ptep = pte_offset_map(&pmd, addr);
+	do {
+		pte_t pte = gup_get_pte(ptep);
+		struct page *page;
+
+		if (!pte_present(pte) ||
+		    pte_special(pte) || (write && !pte_write(pte))) {
+			pte_unmap(ptep);
+			return 0;
+		}
+		VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
+		page = pte_page(pte);
+		get_page(page);
+		SetPageReferenced(page);
+		pages[*nr] = page;
+		(*nr)++;
+
+	} while (ptep++, addr += PAGE_SIZE, addr != end);
+
+	pte_unmap(ptep - 1);
+	return 1;
+}
+
+static inline void get_head_page_multiple(struct page *page, int nr)
+{
+	VM_BUG_ON(page != compound_head(page));
+	VM_BUG_ON(page_count(page) == 0);
+	atomic_add(nr, &page->_count);
+	SetPageReferenced(page);
+}
+
+static int gup_huge_pmd(pmd_t pmd, unsigned long addr, unsigned long end,
+			int write, struct page **pages, int *nr)
+{
+	pte_t pte = *(pte_t *)&pmd;
+	struct page *head, *page;
+	int refs;
+
+	if (write && !pte_write(pte))
+		return 0;
+	/* hugepages are never "special" */
+	VM_BUG_ON(pte_special(pte));
+	VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
+
+	refs = 0;
+	head = pte_page(pte);
+	page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
+	do {
+		VM_BUG_ON(compound_head(page) != head);
+		pages[*nr] = page;
+		if (PageTail(page))
+			get_huge_page_tail(page);
+		(*nr)++;
+		page++;
+		refs++;
+	} while (addr += PAGE_SIZE, addr != end);
+
+	get_head_page_multiple(head, refs);
+	return 1;
+}
+
+static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
+			int write, struct page **pages, int *nr)
+{
+	unsigned long next;
+	pmd_t *pmdp;
+
+	pmdp = pmd_offset(&pud, addr);
+	do {
+		pmd_t pmd = *pmdp;
+
+		next = pmd_addr_end(addr, end);
+		/*
+		 * The pmd_trans_splitting() check below explains why
+		 * pmdp_splitting_flush has to flush the tlb, to stop
+		 * this gup-fast code from running while we set the
+		 * splitting bit in the pmd. Returning zero will take
+		 * the slow path that will call wait_split_huge_page()
+		 * if the pmd is still in splitting state. gup-fast
+		 * can't because it has irq disabled and
+		 * wait_split_huge_page() would never return as the
+		 * tlb flush IPI wouldn't run.
+		 */
+		if (pmd_none(pmd) || pmd_trans_splitting(pmd))
+			return 0;
+		if (unlikely(pmd_huge(pmd))) {
+			if (!gup_huge_pmd(pmd, addr, next, write, pages, nr))
+				return 0;
+		} else {
+			if (!gup_pte_range(pmd, addr, next, write, pages, nr))
+				return 0;
+		}
+	} while (pmdp++, addr = next, addr != end);
+
+	return 1;
+}
+
+static int gup_huge_pud(pud_t pud, unsigned long addr, unsigned long end,
+			int write, struct page **pages, int *nr)
+{
+	pte_t pte = *(pte_t *)&pud;
+	struct page *head, *page;
+	int refs;
+
+	if (write && !pte_write(pte))
+		return 0;
+	/* hugepages are never "special" */
+	VM_BUG_ON(pte_special(pte));
+	VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
+
+	refs = 0;
+	head = pte_page(pte);
+	page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
+	do {
+		VM_BUG_ON(compound_head(page) != head);
+		pages[*nr] = page;
+		(*nr)++;
+		page++;
+		refs++;
+	} while (addr += PAGE_SIZE, addr != end);
+
+	get_head_page_multiple(head, refs);
+	return 1;
+}
+
+static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
+			int write, struct page **pages, int *nr)
+{
+	unsigned long next;
+	pud_t *pudp;
+
+	pudp = pud_offset(&pgd, addr);
+	do {
+		pud_t pud = *pudp;
+
+		next = pud_addr_end(addr, end);
+		if (pud_none(pud))
+			return 0;
+		if (unlikely(pud_huge(pud))) {
+			if (!gup_huge_pud(pud, addr, next, write, pages, nr))
+				return 0;
+		} else {
+			if (!gup_pmd_range(pud, addr, next, write, pages, nr))
+				return 0;
+		}
+	} while (pudp++, addr = next, addr != end);
+
+	return 1;
+}
+
+/*
+ * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall
+ * back to the regular GUP.
+ */
+int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
+			  struct page **pages)
+{
+	struct mm_struct *mm = current->mm;
+	unsigned long addr, len, end;
+	unsigned long next;
+	unsigned long flags;
+	pgd_t *pgdp;
+	int nr = 0;
+
+	start &= PAGE_MASK;
+	addr = start;
+	len = (unsigned long) nr_pages << PAGE_SHIFT;
+	end = start + len;
+	if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
+					(void __user *)start, len)))
+		return 0;
+
+	/*
+	 * XXX: batch / limit 'nr', to avoid large irq off latency;
+	 * needs some instrumenting to determine the common sizes used by
+	 * important workloads (eg. DB2), and whether limiting the batch
+	 * size will decrease performance.
+	 *
+	 * It seems like we're in the clear for the moment. Direct-IO is
+	 * the main guy that batches up lots of get_user_pages, and even
+	 * they are limited to 64-at-a-time which is not so many.
+	 */
+	/*
+	 * This doesn't prevent pagetable teardown, but does prevent
+	 * the pagetables and pages from being freed.
+	 *
+	 * So long as we atomically load page table pointers versus teardown,
+	 * we can follow the address down to the page and take a ref on it.
+	 */
+	local_irq_save(flags);
+	pgdp = pgd_offset(mm, addr);
+	do {
+		pgd_t pgd = *pgdp;
+
+		next = pgd_addr_end(addr, end);
+		if (pgd_none(pgd))
+			break;
+		if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
+			break;
+	} while (pgdp++, addr = next, addr != end);
+	local_irq_restore(flags);
+
+	return nr;
+}
+
+/**
+ * get_user_pages_fast() - pin user pages in memory
+ * @start:	starting user address
+ * @nr_pages:	number of pages from start to pin
+ * @write:	whether pages will be written to
+ * @pages:	array that receives pointers to the pages pinned.
+ *		Should be at least nr_pages long.
+ *
+ * Attempt to pin user pages in memory without taking mm->mmap_sem.
+ * If not successful, it will fall back to taking the lock and
+ * calling get_user_pages().
+ *
+ * Returns number of pages pinned. This may be fewer than the number
+ * requested. If nr_pages is 0 or negative, returns 0. If no pages
+ * were pinned, returns -errno.
+ */
+int get_user_pages_fast(unsigned long start, int nr_pages, int write,
+			struct page **pages)
+{
+	struct mm_struct *mm = current->mm;
+	unsigned long addr, len, end;
+	unsigned long next;
+	pgd_t *pgdp;
+	int ret, nr = 0;
+
+	start &= PAGE_MASK;
+	addr = start;
+	len = (unsigned long) nr_pages << PAGE_SHIFT;
+
+	end = start + len;
+	if (end < start)
+		goto slow_irqon;
+
+	/* XXX: batch / limit 'nr' */
+	local_irq_disable();
+	pgdp = pgd_offset(mm, addr);
+	do {
+		pgd_t pgd = *pgdp;
+
+		next = pgd_addr_end(addr, end);
+		if (pgd_none(pgd))
+			goto slow;
+		if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
+			goto slow;
+	} while (pgdp++, addr = next, addr != end);
+	local_irq_enable();
+
+	VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT);
+	return nr;
+slow:
+	local_irq_enable();
+
+slow_irqon:
+	/* Try to get the remaining pages with get_user_pages */
+	start += nr << PAGE_SHIFT;
+	pages += nr;
+
+	down_read(&mm->mmap_sem);
+	ret = get_user_pages(current, mm, start,
+			     (end - start) >> PAGE_SHIFT,
+			     write, 0, pages, NULL);
+	up_read(&mm->mmap_sem);
+
+	/* Have to be a bit careful with return values */
+	if (nr > 0) {
+		if (ret < 0)
+			ret = nr;
+		else
+			ret += nr;
+	}
+	return ret;
+}
--
cgit v1.2.3
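A note on the gup_get_pte() helper in the patch above: with
CONFIG_64BIT_PHYS_ADDR on a CONFIG_CPU_MIPS32 kernel a PTE is two 32-bit
words, so the low half is re-read after the high half and the pair is
retried if it changed underneath. The stand-alone user-space sketch below
shows only that retry shape; the pte64 type and the writer protocol are
invented for illustration, and the real ordering contract depends on how
the kernel's set_pte() publishes the two halves.

	#include <stdint.h>
	#include <stdatomic.h>

	/* Invented stand-in for a split 64-bit PTE; not the kernel's layout. */
	struct pte64 {
		_Atomic uint32_t low;
		_Atomic uint32_t high;
	};

	/* Writer publishes the low word first, then the high word. */
	static void pte64_write(struct pte64 *p, uint32_t lo, uint32_t hi)
	{
		atomic_store_explicit(&p->low, lo, memory_order_release);
		atomic_store_explicit(&p->high, hi, memory_order_release);
	}

	/*
	 * Reader mirrors gup_get_pte(): load low, load high, re-check low.
	 * If the low word moved between the loads, the (low, high) pair
	 * may be torn, so retry.
	 */
	static uint64_t pte64_read(struct pte64 *p)
	{
		uint32_t lo, hi;

		do {
			lo = atomic_load_explicit(&p->low, memory_order_acquire);
			hi = atomic_load_explicit(&p->high, memory_order_acquire);
		} while (atomic_load_explicit(&p->low, memory_order_acquire) != lo);

		return ((uint64_t)hi << 32) | lo;
	}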
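For context, a typical in-kernel caller of the interface added above looks
like the sketch below: it pins the pages backing a user buffer, uses them,
and drops the references with put_page(). pin_user_buffer() and its error
handling are hypothetical, but the four-argument get_user_pages_fast()
signature matches the API documented in this patch.

	#include <linux/mm.h>

	/* Hypothetical caller: pin the pages backing [uaddr, uaddr + len). */
	static int pin_user_buffer(unsigned long uaddr, size_t len,
				   struct page **pages)
	{
		/* Page count must cover the offset within the first page. */
		int nr_pages = ((uaddr & ~PAGE_MASK) + len + PAGE_SIZE - 1)
				>> PAGE_SHIFT;
		int pinned, i;

		pinned = get_user_pages_fast(uaddr, nr_pages, 1 /* write */, pages);
		if (pinned < 0)
			return pinned;		/* nothing pinned: -errno */

		/* ... kmap() or DMA-map and use the pages here ... */

		for (i = 0; i < pinned; i++)
			put_page(pages[i]);

		return pinned == nr_pages ? 0 : -EFAULT;	/* partial pin */
	}

The "careful with return values" logic at the end of get_user_pages_fast()
exists precisely for the partial-pin case this caller handles: once at
least one page has been pinned, the function reports the pinned count
rather than an error.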
From 43064c0c8ee2ada8edd421520c633584d648e100 Mon Sep 17 00:00:00 2001
From: David Daney
Date: Tue, 22 Nov 2011 14:38:03 +0000
Subject: MIPS: Handle initmem in systems with kernel not in add_memory_region() mem

This patch addresses a couple of related problems:

1) The kernel may reside in physical memory outside of the ranges set by
   plat_mem_setup(). If this is the case, init mem cannot be reused as it
   resides outside of the range of pages that the kernel memory allocators
   control.

2) initrd images might be loaded in physical memory outside of the ranges
   set by plat_mem_setup(). The memory likewise cannot be reused. The
   patch doesn't handle this specific case, but the infrastructure is
   useful for future patches that do.

The crux of the problem is that there are memory regions that need to be
passed to memory_present(), but that cannot be passed to free_bootmem()
at the time of arch_mem_init().

We create a new type of memory (BOOT_MEM_INIT_RAM) for use with
add_memory_region(). Then arch_mem_init() adds the init mem with this
type if the init mem is not already covered by existing ranges.

When memory is being freed into the bootmem allocator, we skip the
BOOT_MEM_INIT_RAM ranges so they are not clobbered, but we do signal them
as memory_present(). This way, when they are later freed, the necessary
memory manager structures have been initialized and the Sparse allocator
is prevented from crashing.

The Octeon-specific code that handled this case is removed, because the
new general-purpose code handles it.

Signed-off-by: David Daney
To: linux-mips@linux-mips.org
Patchwork: https://patchwork.linux-mips.org/patch/1988/
Signed-off-by: Ralf Baechle
---
 arch/mips/mm/init.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'arch/mips/mm')

diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c
index b7ebc4fa89b..3b3ffd439cd 100644
--- a/arch/mips/mm/init.c
+++ b/arch/mips/mm/init.c
@@ -304,9 +304,14 @@ int page_is_ram(unsigned long pagenr)
 	for (i = 0; i < boot_mem_map.nr_map; i++) {
 		unsigned long addr, end;
 
-		if (boot_mem_map.map[i].type != BOOT_MEM_RAM)
+		switch (boot_mem_map.map[i].type) {
+		case BOOT_MEM_RAM:
+		case BOOT_MEM_INIT_RAM:
+			break;
+		default:
 			/* not usable memory */
 			continue;
+		}
 
 		addr = PFN_UP(boot_mem_map.map[i].addr);
 		end = PFN_DOWN(boot_mem_map.map[i].addr +
@@ -379,7 +384,7 @@ void __init mem_init(void)
 	reservedpages = ram = 0;
 
 	for (tmp = 0; tmp < max_low_pfn; tmp++)
-		if (page_is_ram(tmp)) {
+		if (page_is_ram(tmp) && pfn_valid(tmp)) {
 			ram++;
 			if (PageReserved(pfn_to_page(tmp)))
 				reservedpages++;
--
cgit v1.2.3
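The arch_mem_init() side of this change lives outside the 'arch/mips/mm'
filter above, so it does not appear in this diff. A rough sketch of what
registering the init section with the new type could look like, assuming
the usual __init_begin/__init_end linker symbols and eliding the check for
ranges already covered by BOOT_MEM_RAM:

	extern char __init_begin[], __init_end[];

	static void __init register_init_ram(void)
	{
		/* Keep only whole pages inside the init section. */
		unsigned long start = PAGE_ALIGN(__pa_symbol(&__init_begin));
		unsigned long end = __pa_symbol(&__init_end) & PAGE_MASK;

		/* Elided: bail out if an existing RAM range already covers it. */
		if (end > start)
			add_memory_region(start, end - start, BOOT_MEM_INIT_RAM);
	}

The pfn_valid() check added to mem_init() pairs with this: page_is_ram()
may now return true for pfns that have no struct page behind them, and
those must be skipped.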
From f467e4bfb50ca6af042f1b19b3556bd4aca854c3 Mon Sep 17 00:00:00 2001
From: Hillf Danton
Date: Wed, 11 Jan 2012 15:37:13 +0100
Subject: MIPS: Flush huge TLB

When flushing the TLB, if @vma is backed by huge pages, flush the huge
TLB entries, since huge pages are mapped quite differently from normal
pages.

Signed-off-by: Hillf Danton
Acked-by: David Daney
Cc: linux-mips@linux-mips.org
Cc: "Jayachandran C."
Patchwork: https://patchwork.linux-mips.org/patch/2825/
Signed-off-by: David Daney
Acked-by: Hillf Danton
Patchwork: https://patchwork.linux-mips.org/patch/3114/
Signed-off-by: Ralf Baechle
---
 arch/mips/mm/tlb-r4k.c | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

(limited to 'arch/mips/mm')

diff --git a/arch/mips/mm/tlb-r4k.c b/arch/mips/mm/tlb-r4k.c
index 0d394e0e883..88dc49cfa16 100644
--- a/arch/mips/mm/tlb-r4k.c
+++ b/arch/mips/mm/tlb-r4k.c
@@ -120,22 +120,30 @@ void local_flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
 
 	if (cpu_context(cpu, mm) != 0) {
 		unsigned long size, flags;
+		int huge = is_vm_hugetlb_page(vma);
 
 		ENTER_CRITICAL(flags);
-		size = (end - start + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
-		size = (size + 1) >> 1;
+		if (huge) {
+			start = round_down(start, HPAGE_SIZE);
+			end = round_up(end, HPAGE_SIZE);
+			size = (end - start) >> HPAGE_SHIFT;
+		} else {
+			start = round_down(start, PAGE_SIZE << 1);
+			end = round_up(end, PAGE_SIZE << 1);
+			size = (end - start) >> (PAGE_SHIFT + 1);
+		}
 		if (size <= current_cpu_data.tlbsize/2) {
 			int oldpid = read_c0_entryhi();
 			int newpid = cpu_asid(cpu, mm);
 
-			start &= (PAGE_MASK << 1);
-			end += ((PAGE_SIZE << 1) - 1);
-			end &= (PAGE_MASK << 1);
 			while (start < end) {
 				int idx;
 
 				write_c0_entryhi(start | newpid);
-				start += (PAGE_SIZE << 1);
+				if (huge)
+					start += HPAGE_SIZE;
+				else
+					start += (PAGE_SIZE << 1);
 				mtc0_tlbw_hazard();
 				tlb_probe();
 				tlb_probe_hazard();
--
cgit v1.2.3

From d7a887a73dec6c387b02a966a71aac767bbd9ce6 Mon Sep 17 00:00:00 2001
From: Ralf Baechle
Date: Wed, 11 Jan 2012 15:37:16 +0100
Subject: MIPS: Delete unused function add_temporary_entry.

It was only available for R4000-style TLBs anyway, and proper ordering
of the initialization code has made this crude interface unnecessary.

Signed-off-by: Ralf Baechle
---
 arch/mips/mm/tlb-r4k.c | 47 -----------------------------------------------
 1 file changed, 47 deletions(-)

(limited to 'arch/mips/mm')

diff --git a/arch/mips/mm/tlb-r4k.c b/arch/mips/mm/tlb-r4k.c
index 88dc49cfa16..74d67c88574 100644
--- a/arch/mips/mm/tlb-r4k.c
+++ b/arch/mips/mm/tlb-r4k.c
@@ -376,51 +376,6 @@ void add_wired_entry(unsigned long entrylo0, unsigned long entrylo1,
 	EXIT_CRITICAL(flags);
 }
 
-/*
- * Used for loading TLB entries before trap_init() has started, when we
- * don't actually want to add a wired entry which remains throughout the
- * lifetime of the system
- */
-
-static int temp_tlb_entry __cpuinitdata;
-
-__init int add_temporary_entry(unsigned long entrylo0, unsigned long entrylo1,
-			       unsigned long entryhi, unsigned long pagemask)
-{
-	int ret = 0;
-	unsigned long flags;
-	unsigned long wired;
-	unsigned long old_pagemask;
-	unsigned long old_ctx;
-
-	ENTER_CRITICAL(flags);
-	/* Save old context and create impossible VPN2 value */
-	old_ctx = read_c0_entryhi();
-	old_pagemask = read_c0_pagemask();
-	wired = read_c0_wired();
-	if (--temp_tlb_entry < wired) {
-		printk(KERN_WARNING
-		       "No TLB space left for add_temporary_entry\n");
-		ret = -ENOSPC;
-		goto out;
-	}
-
-	write_c0_index(temp_tlb_entry);
-	write_c0_pagemask(pagemask);
-	write_c0_entryhi(entryhi);
-	write_c0_entrylo0(entrylo0);
-	write_c0_entrylo1(entrylo1);
-	mtc0_tlbw_hazard();
-	tlb_write_indexed();
-	tlbw_use_hazard();
-
-	write_c0_entryhi(old_ctx);
-	write_c0_pagemask(old_pagemask);
-out:
-	EXIT_CRITICAL(flags);
-	return ret;
-}
-
 static int __cpuinitdata ntlb;
 static int __init set_ntlb(char *str)
 {
@@ -458,8 +413,6 @@ void __cpuinit tlb_init(void)
 		write_c0_pagegrain(pg);
 	}
 
-	temp_tlb_entry = current_cpu_data.tlbsize - 1;
-
 	/* From this point on the ARC firmware is dead. */
 	local_flush_tlb_all();
--
cgit v1.2.3
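Back in the "MIPS: Flush huge TLB" patch above, the payoff of the
round_down()/round_up() arithmetic shows up with concrete numbers. The
stand-alone program below uses invented sizes (4 KiB base pages, 2 MiB
huge pages) and a toy 2 MiB range: flushing in HPAGE_SIZE steps takes 2
entryhi writes where even/odd page-pair steps would take 257, which is
what pushes the non-huge count past the tlbsize/2 threshold and into a
full flush.

	#include <stdio.h>

	/* Invented sizes for illustration: 4 KiB base pages, 2 MiB huge pages. */
	#define PAGE_SHIFT	12
	#define PAGE_SIZE	(1UL << PAGE_SHIFT)
	#define HPAGE_SHIFT	21
	#define HPAGE_SIZE	(1UL << HPAGE_SHIFT)

	/* Same semantics as the kernel helpers for power-of-two alignments. */
	#define round_down(x, a)	((x) & ~((unsigned long)(a) - 1))
	#define round_up(x, a)		round_down((x) + (a) - 1, a)

	int main(void)
	{
		unsigned long start = 0x12345678UL, end = start + 0x200000UL;
		unsigned long s, e;

		/* Huge mapping: widen to whole huge pages, one step per huge page. */
		s = round_down(start, HPAGE_SIZE);
		e = round_up(end, HPAGE_SIZE);
		printf("huge:   %lu steps\n", (e - s) >> HPAGE_SHIFT);	/* 2 */

		/* Base pages: widen to even/odd page pairs, one step per pair. */
		s = round_down(start, PAGE_SIZE << 1);
		e = round_up(end, PAGE_SIZE << 1);
		printf("normal: %lu steps\n", (e - s) >> (PAGE_SHIFT + 1));	/* 257 */

		return 0;
	}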