diff options
Diffstat (limited to 'mm')
40 files changed, 3074 insertions, 603 deletions
diff --git a/mm/Makefile b/mm/Makefile index 135bbb65511a..72227b24a616 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -81,6 +81,7 @@ obj-$(CONFIG_PAGE_POISONING) += page_poison.o obj-$(CONFIG_SLAB) += slab.o obj-$(CONFIG_SLUB) += slub.o obj-$(CONFIG_KASAN) += kasan/ +obj-$(CONFIG_KFENCE) += kfence/ obj-$(CONFIG_FAILSLAB) += failslab.o obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o obj-$(CONFIG_MEMTEST) += memtest.o diff --git a/mm/backing-dev.c b/mm/backing-dev.c index eca555f658d9..576220acd686 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -8,6 +8,7 @@ #include <linux/fs.h> #include <linux/pagemap.h> #include <linux/mm.h> +#include <linux/sched/mm.h> #include <linux/sched.h> #include <linux/module.h> #include <linux/writeback.h> @@ -578,7 +579,7 @@ struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi, { struct bdi_writeback *wb; - might_sleep_if(gfpflags_allow_blocking(gfp)); + might_alloc(gfp); if (!memcg_css->parent) return &bdi->wb; @@ -94,34 +94,29 @@ static void cma_clear_bitmap(struct cma *cma, unsigned long pfn, static void __init cma_activate_area(struct cma *cma) { - unsigned long base_pfn = cma->base_pfn, pfn = base_pfn; - unsigned i = cma->count >> pageblock_order; + unsigned long base_pfn = cma->base_pfn, pfn; struct zone *zone; cma->bitmap = bitmap_zalloc(cma_bitmap_maxno(cma), GFP_KERNEL); if (!cma->bitmap) goto out_error; - WARN_ON_ONCE(!pfn_valid(pfn)); - zone = page_zone(pfn_to_page(pfn)); - - do { - unsigned j; - - base_pfn = pfn; - for (j = pageblock_nr_pages; j; --j, pfn++) { - WARN_ON_ONCE(!pfn_valid(pfn)); - /* - * alloc_contig_range requires the pfn range - * specified to be in the same zone. Make this - * simple by forcing the entire CMA resv range - * to be in the same zone. - */ - if (page_zone(pfn_to_page(pfn)) != zone) - goto not_in_zone; - } - init_cma_reserved_pageblock(pfn_to_page(base_pfn)); - } while (--i); + /* + * alloc_contig_range() requires the pfn range specified to be in the + * same zone. Simplify by forcing the entire CMA resv range to be in the + * same zone. + */ + WARN_ON_ONCE(!pfn_valid(base_pfn)); + zone = page_zone(pfn_to_page(base_pfn)); + for (pfn = base_pfn + 1; pfn < base_pfn + cma->count; pfn++) { + WARN_ON_ONCE(!pfn_valid(pfn)); + if (page_zone(pfn_to_page(pfn)) != zone) + goto not_in_zone; + } + + for (pfn = base_pfn; pfn < base_pfn + cma->count; + pfn += pageblock_nr_pages) + init_cma_reserved_pageblock(pfn_to_page(pfn)); mutex_init(&cma->lock); @@ -135,6 +130,10 @@ static void __init cma_activate_area(struct cma *cma) not_in_zone: bitmap_free(cma->bitmap); out_error: + /* Expose all pages to the buddy, they are useless for CMA. */ + for (pfn = base_pfn; pfn < base_pfn + cma->count; pfn++) + free_reserved_page(pfn_to_page(pfn)); + totalcma_pages -= cma->count; cma->count = 0; pr_err("CMA area %s could not be activated\n", cma->name); return; @@ -336,6 +335,23 @@ int __init cma_declare_contiguous_nid(phys_addr_t base, limit = highmem_start; } + /* + * If there is enough memory, try a bottom-up allocation first. + * It will place the new cma area close to the start of the node + * and guarantee that the compaction is moving pages out of the + * cma area and not into it. + * Avoid using first 4GB to not interfere with constrained zones + * like DMA/DMA32. + */ +#ifdef CONFIG_PHYS_ADDR_T_64BIT + if (!memblock_bottom_up() && memblock_end >= SZ_4G + size) { + memblock_set_bottom_up(true); + addr = memblock_alloc_range_nid(size, alignment, SZ_4G, + limit, nid, true); + memblock_set_bottom_up(false); + } +#endif + if (!addr) { addr = memblock_alloc_range_nid(size, alignment, base, limit, nid, true); @@ -484,8 +500,8 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align, } if (ret && !no_warn) { - pr_err("%s: alloc failed, req-size: %zu pages, ret: %d\n", - __func__, count, ret); + pr_err("%s: %s: alloc failed, req-size: %zu pages, ret: %d\n", + __func__, cma->name, count, ret); cma_debug_show_areas(cma); } diff --git a/mm/dmapool.c b/mm/dmapool.c index a97c97232337..f3791532fef2 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -28,6 +28,7 @@ #include <linux/mutex.h> #include <linux/poison.h> #include <linux/sched.h> +#include <linux/sched/mm.h> #include <linux/slab.h> #include <linux/stat.h> #include <linux/spinlock.h> @@ -319,7 +320,7 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags, size_t offset; void *retval; - might_sleep_if(gfpflags_allow_blocking(mem_flags)); + might_alloc(mem_flags); spin_lock_irqsave(&pool->lock, flags); list_for_each_entry(page, &pool->page_list, page_list) { diff --git a/mm/early_ioremap.c b/mm/early_ioremap.c index a0018ad1a1f6..164607c7cdf1 100644 --- a/mm/early_ioremap.c +++ b/mm/early_ioremap.c @@ -181,17 +181,17 @@ void __init early_iounmap(void __iomem *addr, unsigned long size) } } - if (WARN(slot < 0, "early_iounmap(%p, %08lx) not found slot\n", - addr, size)) + if (WARN(slot < 0, "%s(%p, %08lx) not found slot\n", + __func__, addr, size)) return; if (WARN(prev_size[slot] != size, - "early_iounmap(%p, %08lx) [%d] size not consistent %08lx\n", - addr, size, slot, prev_size[slot])) + "%s(%p, %08lx) [%d] size not consistent %08lx\n", + __func__, addr, size, slot, prev_size[slot])) return; - WARN(early_ioremap_debug, "early_iounmap(%p, %08lx) [%d]\n", - addr, size, slot); + WARN(early_ioremap_debug, "%s(%p, %08lx) [%d]\n", + __func__, addr, size, slot); virt_addr = (unsigned long)addr; if (WARN_ON(virt_addr < fix_to_virt(FIX_BTMAP_BEGIN))) diff --git a/mm/filemap.c b/mm/filemap.c index 46a8b9e82434..43700480d897 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1658,8 +1658,8 @@ pgoff_t page_cache_prev_miss(struct address_space *mapping, } EXPORT_SYMBOL(page_cache_prev_miss); -/** - * find_get_entry - find and get a page cache entry +/* + * mapping_get_entry - Get a page cache entry. * @mapping: the address_space to search * @index: The page cache index. * @@ -1671,7 +1671,8 @@ EXPORT_SYMBOL(page_cache_prev_miss); * * Return: The head page or shadow entry, %NULL if nothing is found. */ -struct page *find_get_entry(struct address_space *mapping, pgoff_t index) +static struct page *mapping_get_entry(struct address_space *mapping, + pgoff_t index) { XA_STATE(xas, &mapping->i_pages, index); struct page *page; @@ -1708,39 +1709,6 @@ out: } /** - * find_lock_entry - Locate and lock a page cache entry. - * @mapping: The address_space to search. - * @index: The page cache index. - * - * Looks up the page at @mapping & @index. If there is a page in the - * cache, the head page is returned locked and with an increased refcount. - * - * If the slot holds a shadow entry of a previously evicted page, or a - * swap entry from shmem/tmpfs, it is returned. - * - * Context: May sleep. - * Return: The head page or shadow entry, %NULL if nothing is found. - */ -struct page *find_lock_entry(struct address_space *mapping, pgoff_t index) -{ - struct page *page; - -repeat: - page = find_get_entry(mapping, index); - if (page && !xa_is_value(page)) { - lock_page(page); - /* Has the page been truncated? */ - if (unlikely(page->mapping != mapping)) { - unlock_page(page); - put_page(page); - goto repeat; - } - VM_BUG_ON_PAGE(!thp_contains(page, index), page); - } - return page; -} - -/** * pagecache_get_page - Find and get a reference to a page. * @mapping: The address_space to search. * @index: The page index. @@ -1755,6 +1723,8 @@ repeat: * * %FGP_LOCK - The page is returned locked. * * %FGP_HEAD - If the page is present and a THP, return the head page * rather than the exact page specified by the index. + * * %FGP_ENTRY - If there is a shadow / swap / DAX entry, return it + * instead of allocating a new page to replace it. * * %FGP_CREAT - If no page is present then a new page is allocated using * @gfp_mask and added to the page cache and the VM's LRU list. * The page is returned locked and with an increased refcount. @@ -1778,9 +1748,12 @@ struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index, struct page *page; repeat: - page = find_get_entry(mapping, index); - if (xa_is_value(page)) + page = mapping_get_entry(mapping, index); + if (xa_is_value(page)) { + if (fgp_flags & FGP_ENTRY) + return page; page = NULL; + } if (!page) goto no_page; @@ -1852,18 +1825,53 @@ no_page: } EXPORT_SYMBOL(pagecache_get_page); +static inline struct page *find_get_entry(struct xa_state *xas, pgoff_t max, + xa_mark_t mark) +{ + struct page *page; + +retry: + if (mark == XA_PRESENT) + page = xas_find(xas, max); + else + page = xas_find_marked(xas, max, mark); + + if (xas_retry(xas, page)) + goto retry; + /* + * A shadow entry of a recently evicted page, a swap + * entry from shmem/tmpfs or a DAX entry. Return it + * without attempting to raise page count. + */ + if (!page || xa_is_value(page)) + return page; + + if (!page_cache_get_speculative(page)) + goto reset; + + /* Has the page moved or been split? */ + if (unlikely(page != xas_reload(xas))) { + put_page(page); + goto reset; + } + + return page; +reset: + xas_reset(xas); + goto retry; +} + /** * find_get_entries - gang pagecache lookup * @mapping: The address_space to search * @start: The starting page cache index - * @nr_entries: The maximum number of entries - * @entries: Where the resulting entries are placed + * @end: The final page index (inclusive). + * @pvec: Where the resulting entries are placed. * @indices: The cache indices corresponding to the entries in @entries * - * find_get_entries() will search for and return a group of up to - * @nr_entries entries in the mapping. The entries are placed at - * @entries. find_get_entries() takes a reference against any actual - * pages it returns. + * find_get_entries() will search for and return a batch of entries in + * the mapping. The entries are placed in @pvec. find_get_entries() + * takes a reference on any actual pages it returns. * * The search returns a group of mapping-contiguous page cache entries * with ascending indexes. There may be holes in the indices due to @@ -1879,60 +1887,97 @@ EXPORT_SYMBOL(pagecache_get_page); * * Return: the number of pages and shadow entries which were found. */ -unsigned find_get_entries(struct address_space *mapping, - pgoff_t start, unsigned int nr_entries, - struct page **entries, pgoff_t *indices) +unsigned find_get_entries(struct address_space *mapping, pgoff_t start, + pgoff_t end, struct pagevec *pvec, pgoff_t *indices) { XA_STATE(xas, &mapping->i_pages, start); struct page *page; unsigned int ret = 0; - - if (!nr_entries) - return 0; + unsigned nr_entries = PAGEVEC_SIZE; rcu_read_lock(); - xas_for_each(&xas, page, ULONG_MAX) { - if (xas_retry(&xas, page)) - continue; - /* - * A shadow entry of a recently evicted page, a swap - * entry from shmem/tmpfs or a DAX entry. Return it - * without attempting to raise page count. - */ - if (xa_is_value(page)) - goto export; - - if (!page_cache_get_speculative(page)) - goto retry; - - /* Has the page moved or been split? */ - if (unlikely(page != xas_reload(&xas))) - goto put_page; - + while ((page = find_get_entry(&xas, end, XA_PRESENT))) { /* * Terminate early on finding a THP, to allow the caller to * handle it all at once; but continue if this is hugetlbfs. */ - if (PageTransHuge(page) && !PageHuge(page)) { + if (!xa_is_value(page) && PageTransHuge(page) && + !PageHuge(page)) { page = find_subpage(page, xas.xa_index); nr_entries = ret + 1; } -export: + indices[ret] = xas.xa_index; - entries[ret] = page; + pvec->pages[ret] = page; if (++ret == nr_entries) break; - continue; -put_page: - put_page(page); -retry: - xas_reset(&xas); } rcu_read_unlock(); + + pvec->nr = ret; return ret; } /** + * find_lock_entries - Find a batch of pagecache entries. + * @mapping: The address_space to search. + * @start: The starting page cache index. + * @end: The final page index (inclusive). + * @pvec: Where the resulting entries are placed. + * @indices: The cache indices of the entries in @pvec. + * + * find_lock_entries() will return a batch of entries from @mapping. + * Swap, shadow and DAX entries are included. Pages are returned + * locked and with an incremented refcount. Pages which are locked by + * somebody else or under writeback are skipped. Only the head page of + * a THP is returned. Pages which are partially outside the range are + * not returned. + * + * The entries have ascending indexes. The indices may not be consecutive + * due to not-present entries, THP pages, pages which could not be locked + * or pages under writeback. + * + * Return: The number of entries which were found. + */ +unsigned find_lock_entries(struct address_space *mapping, pgoff_t start, + pgoff_t end, struct pagevec *pvec, pgoff_t *indices) +{ + XA_STATE(xas, &mapping->i_pages, start); + struct page *page; + + rcu_read_lock(); + while ((page = find_get_entry(&xas, end, XA_PRESENT))) { + if (!xa_is_value(page)) { + if (page->index < start) + goto put; + VM_BUG_ON_PAGE(page->index != xas.xa_index, page); + if (page->index + thp_nr_pages(page) - 1 > end) + goto put; + if (!trylock_page(page)) + goto put; + if (page->mapping != mapping || PageWriteback(page)) + goto unlock; + VM_BUG_ON_PAGE(!thp_contains(page, xas.xa_index), + page); + } + indices[pvec->nr] = xas.xa_index; + if (!pagevec_add(pvec, page)) + break; + goto next; +unlock: + unlock_page(page); +put: + put_page(page); +next: + if (!xa_is_value(page) && PageTransHuge(page)) + xas_set(&xas, page->index + thp_nr_pages(page)); + } + rcu_read_unlock(); + + return pagevec_count(pvec); +} + +/** * find_get_pages_range - gang pagecache lookup * @mapping: The address_space to search * @start: The starting page index @@ -1965,30 +2010,16 @@ unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start, return 0; rcu_read_lock(); - xas_for_each(&xas, page, end) { - if (xas_retry(&xas, page)) - continue; + while ((page = find_get_entry(&xas, end, XA_PRESENT))) { /* Skip over shadow, swap and DAX entries */ if (xa_is_value(page)) continue; - if (!page_cache_get_speculative(page)) - goto retry; - - /* Has the page moved or been split? */ - if (unlikely(page != xas_reload(&xas))) - goto put_page; - pages[ret] = find_subpage(page, xas.xa_index); if (++ret == nr_pages) { *start = xas.xa_index + 1; goto out; } - continue; -put_page: - put_page(page); -retry: - xas_reset(&xas); } /* @@ -2062,7 +2093,7 @@ retry: EXPORT_SYMBOL(find_get_pages_contig); /** - * find_get_pages_range_tag - find and return pages in given range matching @tag + * find_get_pages_range_tag - Find and return head pages matching @tag. * @mapping: the address_space to search * @index: the starting page index * @end: The final page index (inclusive) @@ -2070,8 +2101,9 @@ EXPORT_SYMBOL(find_get_pages_contig); * @nr_pages: the maximum number of pages * @pages: where the resulting pages are placed * - * Like find_get_pages, except we only return pages which are tagged with - * @tag. We update @index to index the next page for the traversal. + * Like find_get_pages(), except we only return head pages which are tagged + * with @tag. @index is updated to the index immediately after the last + * page we return, ready for the next iteration. * * Return: the number of pages which were found. */ @@ -2087,9 +2119,7 @@ unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index, return 0; rcu_read_lock(); - xas_for_each_marked(&xas, page, end, tag) { - if (xas_retry(&xas, page)) - continue; + while ((page = find_get_entry(&xas, end, tag))) { /* * Shadow entries should never be tagged, but this iteration * is lockless so there is a window for page reclaim to evict @@ -2098,23 +2128,11 @@ unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index, if (xa_is_value(page)) continue; - if (!page_cache_get_speculative(page)) - goto retry; - - /* Has the page moved or been split? */ - if (unlikely(page != xas_reload(&xas))) - goto put_page; - - pages[ret] = find_subpage(page, xas.xa_index); + pages[ret] = page; if (++ret == nr_pages) { - *index = xas.xa_index + 1; + *index = page->index + thp_nr_pages(page); goto out; } - continue; -put_page: - put_page(page); -retry: - xas_reset(&xas); } /* @@ -2592,6 +2610,109 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) } EXPORT_SYMBOL(generic_file_read_iter); +static inline loff_t page_seek_hole_data(struct xa_state *xas, + struct address_space *mapping, struct page *page, + loff_t start, loff_t end, bool seek_data) +{ + const struct address_space_operations *ops = mapping->a_ops; + size_t offset, bsz = i_blocksize(mapping->host); + + if (xa_is_value(page) || PageUptodate(page)) + return seek_data ? start : end; + if (!ops->is_partially_uptodate) + return seek_data ? end : start; + + xas_pause(xas); + rcu_read_unlock(); + lock_page(page); + if (unlikely(page->mapping != mapping)) + goto unlock; + + offset = offset_in_thp(page, start) & ~(bsz - 1); + + do { + if (ops->is_partially_uptodate(page, offset, bsz) == seek_data) + break; + start = (start + bsz) & ~(bsz - 1); + offset += bsz; + } while (offset < thp_size(page)); +unlock: + unlock_page(page); + rcu_read_lock(); + return start; +} + +static inline +unsigned int seek_page_size(struct xa_state *xas, struct page *page) +{ + if (xa_is_value(page)) + return PAGE_SIZE << xa_get_order(xas->xa, xas->xa_index); + return thp_size(page); +} + +/** + * mapping_seek_hole_data - Seek for SEEK_DATA / SEEK_HOLE in the page cache. + * @mapping: Address space to search. + * @start: First byte to consider. + * @end: Limit of search (exclusive). + * @whence: Either SEEK_HOLE or SEEK_DATA. + * + * If the page cache knows which blocks contain holes and which blocks + * contain data, your filesystem can use this function to implement + * SEEK_HOLE and SEEK_DATA. This is useful for filesystems which are + * entirely memory-based such as tmpfs, and filesystems which support + * unwritten extents. + * + * Return: The requested offset on successs, or -ENXIO if @whence specifies + * SEEK_DATA and there is no data after @start. There is an implicit hole + * after @end - 1, so SEEK_HOLE returns @end if all the bytes between @start + * and @end contain data. + */ +loff_t mapping_seek_hole_data(struct address_space *mapping, loff_t start, + loff_t end, int whence) +{ + XA_STATE(xas, &mapping->i_pages, start >> PAGE_SHIFT); + pgoff_t max = (end - 1) / PAGE_SIZE; + bool seek_data = (whence == SEEK_DATA); + struct page *page; + + if (end <= start) + return -ENXIO; + + rcu_read_lock(); + while ((page = find_get_entry(&xas, max, XA_PRESENT))) { + loff_t pos = xas.xa_index * PAGE_SIZE; + + if (start < pos) { + if (!seek_data) + goto unlock; + start = pos; + } + + pos += seek_page_size(&xas, page); + start = page_seek_hole_data(&xas, mapping, page, start, pos, + seek_data); + if (start < pos) + goto unlock; + if (!xa_is_value(page)) + put_page(page); + } + rcu_read_unlock(); + + if (seek_data) + return -ENXIO; + goto out; + +unlock: + rcu_read_unlock(); + if (!xa_is_value(page)) + put_page(page); +out: + if (start > end) + return end; + return start; +} + #ifdef CONFIG_MMU #define MMAP_LOTSAMISS (100) /* diff --git a/mm/huge_memory.c b/mm/huge_memory.c index d77605c30f2e..395c75111d33 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -668,9 +668,9 @@ release: * available * never: never stall for any thp allocation */ -static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma) +gfp_t vma_thp_gfp_mask(struct vm_area_struct *vma) { - const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE); + const bool vma_madvised = vma && (vma->vm_flags & VM_HUGEPAGE); /* Always do synchronous compaction */ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags)) @@ -762,7 +762,7 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) } return ret; } - gfp = alloc_hugepage_direct_gfpmask(vma); + gfp = vma_thp_gfp_mask(vma); page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER); if (unlikely(!page)) { count_vm_event(THP_FAULT_FALLBACK); diff --git a/mm/internal.h b/mm/internal.h index 25d2b2439f19..9902648f2206 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -60,8 +60,8 @@ static inline void force_page_cache_readahead(struct address_space *mapping, force_page_cache_ra(&ractl, &file->f_ra, nr_to_read); } -struct page *find_get_entry(struct address_space *mapping, pgoff_t index); -struct page *find_lock_entry(struct address_space *mapping, pgoff_t index); +unsigned find_lock_entries(struct address_space *mapping, pgoff_t start, + pgoff_t end, struct pagevec *pvec, pgoff_t *indices); /** * page_evictable - test whether a page is evictable diff --git a/mm/kasan/common.c b/mm/kasan/common.c index b18189ef3a92..b5e08d4cefec 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -210,6 +210,11 @@ void __kasan_cache_create(struct kmem_cache *cache, unsigned int *size, *size = optimal_size; } +void __kasan_cache_create_kmalloc(struct kmem_cache *cache) +{ + cache->kasan_info.is_kmalloc = true; +} + size_t __kasan_metadata_size(struct kmem_cache *cache) { if (!kasan_stack_collection_enabled()) @@ -256,7 +261,8 @@ void __kasan_unpoison_object_data(struct kmem_cache *cache, void *object) void __kasan_poison_object_data(struct kmem_cache *cache, void *object) { - kasan_poison(object, cache->object_size, KASAN_KMALLOC_REDZONE); + kasan_poison(object, round_up(cache->object_size, KASAN_GRANULE_SIZE), + KASAN_KMALLOC_REDZONE); } /* @@ -273,22 +279,13 @@ void __kasan_poison_object_data(struct kmem_cache *cache, void *object) * based on objects indexes, so that objects that are next to each other * get different tags. */ -static u8 assign_tag(struct kmem_cache *cache, const void *object, - bool init, bool keep_tag) +static inline u8 assign_tag(struct kmem_cache *cache, + const void *object, bool init) { if (IS_ENABLED(CONFIG_KASAN_GENERIC)) return 0xff; /* - * 1. When an object is kmalloc()'ed, two hooks are called: - * kasan_slab_alloc() and kasan_kmalloc(). We assign the - * tag only in the first one. - * 2. We reuse the same tag for krealloc'ed objects. - */ - if (keep_tag) - return get_tag(object); - - /* * If the cache neither has a constructor nor has SLAB_TYPESAFE_BY_RCU * set, assign a tag when the object is being allocated (init == false). */ @@ -320,13 +317,13 @@ void * __must_check __kasan_init_slab_obj(struct kmem_cache *cache, } /* Tag is ignored in set_tag() without CONFIG_KASAN_SW/HW_TAGS */ - object = set_tag(object, assign_tag(cache, object, true, false)); + object = set_tag(object, assign_tag(cache, object, true)); return (void *)object; } -static bool ____kasan_slab_free(struct kmem_cache *cache, void *object, - unsigned long ip, bool quarantine) +static inline bool ____kasan_slab_free(struct kmem_cache *cache, + void *object, unsigned long ip, bool quarantine) { u8 tag; void *tagged_object; @@ -335,6 +332,9 @@ static bool ____kasan_slab_free(struct kmem_cache *cache, void *object, tagged_object = object; object = kasan_reset_tag(object); + if (is_kfence_address(object)) + return false; + if (unlikely(nearest_obj(cache, virt_to_head_page(object), object) != object)) { kasan_report_invalid_free(tagged_object, ip); @@ -350,15 +350,14 @@ static bool ____kasan_slab_free(struct kmem_cache *cache, void *object, return true; } - kasan_poison(object, cache->object_size, KASAN_KMALLOC_FREE); - - if (!kasan_stack_collection_enabled()) - return false; + kasan_poison(object, round_up(cache->object_size, KASAN_GRANULE_SIZE), + KASAN_KMALLOC_FREE); if ((IS_ENABLED(CONFIG_KASAN_GENERIC) && !quarantine)) return false; - kasan_set_free_info(cache, object, tag); + if (kasan_stack_collection_enabled()) + kasan_set_free_info(cache, object, tag); return kasan_quarantine_put(cache, object); } @@ -368,6 +367,31 @@ bool __kasan_slab_free(struct kmem_cache *cache, void *object, unsigned long ip) return ____kasan_slab_free(cache, object, ip, true); } +static inline bool ____kasan_kfree_large(void *ptr, unsigned long ip) +{ + if (ptr != page_address(virt_to_head_page(ptr))) { + kasan_report_invalid_free(ptr, ip); + return true; + } + + if (!kasan_byte_accessible(ptr)) { + kasan_report_invalid_free(ptr, ip); + return true; + } + + /* + * The object will be poisoned by kasan_free_pages() or + * kasan_slab_free_mempool(). + */ + + return false; +} + +void __kasan_kfree_large(void *ptr, unsigned long ip) +{ + ____kasan_kfree_large(ptr, ip); +} + void __kasan_slab_free_mempool(void *ptr, unsigned long ip) { struct page *page; @@ -381,31 +405,68 @@ void __kasan_slab_free_mempool(void *ptr, unsigned long ip) * KMALLOC_MAX_SIZE, and kmalloc falls back onto page_alloc. */ if (unlikely(!PageSlab(page))) { - if (ptr != page_address(page)) { - kasan_report_invalid_free(ptr, ip); + if (____kasan_kfree_large(ptr, ip)) return; - } kasan_poison(ptr, page_size(page), KASAN_FREE_PAGE); } else { ____kasan_slab_free(page->slab_cache, ptr, ip, false); } } -static void set_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags) +static void set_alloc_info(struct kmem_cache *cache, void *object, + gfp_t flags, bool is_kmalloc) { struct kasan_alloc_meta *alloc_meta; + /* Don't save alloc info for kmalloc caches in kasan_slab_alloc(). */ + if (cache->kasan_info.is_kmalloc && !is_kmalloc) + return; + alloc_meta = kasan_get_alloc_meta(cache, object); if (alloc_meta) kasan_set_track(&alloc_meta->alloc_track, flags); } -static void *____kasan_kmalloc(struct kmem_cache *cache, const void *object, - size_t size, gfp_t flags, bool keep_tag) +void * __must_check __kasan_slab_alloc(struct kmem_cache *cache, + void *object, gfp_t flags) +{ + u8 tag; + void *tagged_object; + + if (gfpflags_allow_blocking(flags)) + kasan_quarantine_reduce(); + + if (unlikely(object == NULL)) + return NULL; + + if (is_kfence_address(object)) + return (void *)object; + + /* + * Generate and assign random tag for tag-based modes. + * Tag is ignored in set_tag() for the generic mode. + */ + tag = assign_tag(cache, object, false); + tagged_object = set_tag(object, tag); + + /* + * Unpoison the whole object. + * For kmalloc() allocations, kasan_kmalloc() will do precise poisoning. + */ + kasan_unpoison(tagged_object, cache->object_size); + + /* Save alloc info (if possible) for non-kmalloc() allocations. */ + if (kasan_stack_collection_enabled()) + set_alloc_info(cache, (void *)object, flags, false); + + return tagged_object; +} + +static inline void *____kasan_kmalloc(struct kmem_cache *cache, + const void *object, size_t size, gfp_t flags) { unsigned long redzone_start; unsigned long redzone_end; - u8 tag; if (gfpflags_allow_blocking(flags)) kasan_quarantine_reduce(); @@ -413,40 +474,51 @@ static void *____kasan_kmalloc(struct kmem_cache *cache, const void *object, if (unlikely(object == NULL)) return NULL; + if (is_kfence_address(kasan_reset_tag(object))) + return (void *)object; + + /* + * The object has already been unpoisoned by kasan_slab_alloc() for + * kmalloc() or by kasan_krealloc() for krealloc(). + */ + + /* + * The redzone has byte-level precision for the generic mode. + * Partially poison the last object granule to cover the unaligned + * part of the redzone. + */ + if (IS_ENABLED(CONFIG_KASAN_GENERIC)) + kasan_poison_last_granule((void *)object, size); + + /* Poison the aligned part of the redzone. */ redzone_start = round_up((unsigned long)(object + size), KASAN_GRANULE_SIZE); - redzone_end = round_up((unsigned long)object + cache->object_size, + redzone_end = round_up((unsigned long)(object + cache->object_size), KASAN_GRANULE_SIZE); - tag = assign_tag(cache, object, false, keep_tag); - - /* Tag is ignored in set_tag without CONFIG_KASAN_SW/HW_TAGS */ - kasan_unpoison(set_tag(object, tag), size); kasan_poison((void *)redzone_start, redzone_end - redzone_start, KASAN_KMALLOC_REDZONE); + /* + * Save alloc info (if possible) for kmalloc() allocations. + * This also rewrites the alloc info when called from kasan_krealloc(). + */ if (kasan_stack_collection_enabled()) - set_alloc_info(cache, (void *)object, flags); + set_alloc_info(cache, (void *)object, flags, true); - return set_tag(object, tag); -} - -void * __must_check __kasan_slab_alloc(struct kmem_cache *cache, - void *object, gfp_t flags) -{ - return ____kasan_kmalloc(cache, object, cache->object_size, flags, false); + /* Keep the tag that was set by kasan_slab_alloc(). */ + return (void *)object; } void * __must_check __kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size, gfp_t flags) { - return ____kasan_kmalloc(cache, object, size, flags, true); + return ____kasan_kmalloc(cache, object, size, flags); } EXPORT_SYMBOL(__kasan_kmalloc); void * __must_check __kasan_kmalloc_large(const void *ptr, size_t size, gfp_t flags) { - struct page *page; unsigned long redzone_start; unsigned long redzone_end; @@ -456,12 +528,23 @@ void * __must_check __kasan_kmalloc_large(const void *ptr, size_t size, if (unlikely(ptr == NULL)) return NULL; - page = virt_to_page(ptr); + /* + * The object has already been unpoisoned by kasan_alloc_pages() for + * alloc_pages() or by kasan_krealloc() for krealloc(). + */ + + /* + * The redzone has byte-level precision for the generic mode. + * Partially poison the last object granule to cover the unaligned + * part of the redzone. + */ + if (IS_ENABLED(CONFIG_KASAN_GENERIC)) + kasan_poison_last_granule(ptr, size); + + /* Poison the aligned part of the redzone. */ redzone_start = round_up((unsigned long)(ptr + size), KASAN_GRANULE_SIZE); - redzone_end = (unsigned long)ptr + page_size(page); - - kasan_unpoison(ptr, size); + redzone_end = (unsigned long)ptr + page_size(virt_to_page(ptr)); kasan_poison((void *)redzone_start, redzone_end - redzone_start, KASAN_PAGE_REDZONE); @@ -475,20 +558,20 @@ void * __must_check __kasan_krealloc(const void *object, size_t size, gfp_t flag if (unlikely(object == ZERO_SIZE_PTR)) return (void *)object; + /* + * Unpoison the object's data. + * Part of it might already have been unpoisoned, but it's unknown + * how big that part is. + */ + kasan_unpoison(object, size); + page = virt_to_head_page(object); + /* Piggy-back on kmalloc() instrumentation to poison the redzone. */ if (unlikely(!PageSlab(page))) return __kasan_kmalloc_large(object, size, flags); else - return ____kasan_kmalloc(page->slab_cache, object, size, - flags, true); -} - -void __kasan_kfree_large(void *ptr, unsigned long ip) -{ - if (ptr != page_address(virt_to_head_page(ptr))) - kasan_report_invalid_free(ptr, ip); - /* The object will be poisoned by kasan_free_pages(). */ + return ____kasan_kmalloc(page->slab_cache, object, size, flags); } bool __kasan_check_byte(const void *address, unsigned long ip) diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index 3f17a1218055..2e55e0f82f39 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -14,6 +14,7 @@ #include <linux/init.h> #include <linux/kasan.h> #include <linux/kernel.h> +#include <linux/kfence.h> #include <linux/kmemleak.h> #include <linux/linkage.h> #include <linux/memblock.h> @@ -331,7 +332,7 @@ void kasan_record_aux_stack(void *addr) struct kasan_alloc_meta *alloc_meta; void *object; - if (!(page && PageSlab(page))) + if (is_kfence_address(addr) || !(page && PageSlab(page))) return; cache = page->slab_cache; diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c index b31aeef505dd..2aad21fda156 100644 --- a/mm/kasan/hw_tags.c +++ b/mm/kasan/hw_tags.c @@ -48,7 +48,7 @@ EXPORT_SYMBOL(kasan_flag_enabled); /* Whether to collect alloc/free stack traces. */ DEFINE_STATIC_KEY_FALSE(kasan_flag_stacktrace); -/* Whether panic or disable tag checking on fault. */ +/* Whether to panic or print a report and disable tag checking on fault. */ bool kasan_flag_panic __ro_after_init; /* kasan=off/on */ diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index cc14b6e6c14c..8c55634d6edd 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -3,6 +3,7 @@ #define __MM_KASAN_KASAN_H #include <linux/kasan.h> +#include <linux/kfence.h> #include <linux/stackdepot.h> #ifdef CONFIG_KASAN_HW_TAGS @@ -329,16 +330,37 @@ static inline u8 kasan_random_tag(void) { return 0; } #ifdef CONFIG_KASAN_HW_TAGS -static inline void kasan_poison(const void *address, size_t size, u8 value) +static inline void kasan_poison(const void *addr, size_t size, u8 value) { - hw_set_mem_tag_range(kasan_reset_tag(address), - round_up(size, KASAN_GRANULE_SIZE), value); + addr = kasan_reset_tag(addr); + + /* Skip KFENCE memory if called explicitly outside of sl*b. */ + if (is_kfence_address(addr)) + return; + + if (WARN_ON((unsigned long)addr & KASAN_GRANULE_MASK)) + return; + if (WARN_ON(size & KASAN_GRANULE_MASK)) + return; + + hw_set_mem_tag_range((void *)addr, size, value); } -static inline void kasan_unpoison(const void *address, size_t size) +static inline void kasan_unpoison(const void *addr, size_t size) { - hw_set_mem_tag_range(kasan_reset_tag(address), - round_up(size, KASAN_GRANULE_SIZE), get_tag(address)); + u8 tag = get_tag(addr); + + addr = kasan_reset_tag(addr); + + /* Skip KFENCE memory if called explicitly outside of sl*b. */ + if (is_kfence_address(addr)) + return; + + if (WARN_ON((unsigned long)addr & KASAN_GRANULE_MASK)) + return; + size = round_up(size, KASAN_GRANULE_SIZE); + + hw_set_mem_tag_range((void *)addr, size, tag); } static inline bool kasan_byte_accessible(const void *addr) @@ -352,12 +374,51 @@ static inline bool kasan_byte_accessible(const void *addr) #else /* CONFIG_KASAN_HW_TAGS */ -void kasan_poison(const void *address, size_t size, u8 value); -void kasan_unpoison(const void *address, size_t size); +/** + * kasan_poison - mark the memory range as unaccessible + * @addr - range start address, must be aligned to KASAN_GRANULE_SIZE + * @size - range size, must be aligned to KASAN_GRANULE_SIZE + * @value - value that's written to metadata for the range + * + * The size gets aligned to KASAN_GRANULE_SIZE before marking the range. + */ +void kasan_poison(const void *addr, size_t size, u8 value); + +/** + * kasan_unpoison - mark the memory range as accessible + * @addr - range start address, must be aligned to KASAN_GRANULE_SIZE + * @size - range size, can be unaligned + * + * For the tag-based modes, the @size gets aligned to KASAN_GRANULE_SIZE before + * marking the range. + * For the generic mode, the last granule of the memory range gets partially + * unpoisoned based on the @size. + */ +void kasan_unpoison(const void *addr, size_t size); + bool kasan_byte_accessible(const void *addr); #endif /* CONFIG_KASAN_HW_TAGS */ +#ifdef CONFIG_KASAN_GENERIC + +/** + * kasan_poison_last_granule - mark the last granule of the memory range as + * unaccessible + * @addr - range start address, must be aligned to KASAN_GRANULE_SIZE + * @size - range size + * + * This function is only available for the generic mode, as it's the only mode + * that has partially poisoned memory granules. + */ +void kasan_poison_last_granule(const void *address, size_t size); + +#else /* CONFIG_KASAN_GENERIC */ + +static inline void kasan_poison_last_granule(const void *address, size_t size) { } + +#endif /* CONFIG_KASAN_GENERIC */ + /* * Exported functions for interfaces called from assembly or from generated * code. Declarations here to avoid warning about missing declarations. diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 234f35a84f19..87b271206163 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -25,6 +25,7 @@ #include <linux/module.h> #include <linux/sched/task_stack.h> #include <linux/uaccess.h> +#include <trace/events/error_report.h> #include <asm/sections.h> @@ -84,8 +85,9 @@ static void start_report(unsigned long *flags) pr_err("==================================================================\n"); } -static void end_report(unsigned long *flags) +static void end_report(unsigned long *flags, unsigned long addr) { + trace_error_report_end(ERROR_DETECTOR_KASAN, addr); pr_err("==================================================================\n"); add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); spin_unlock_irqrestore(&report_lock, *flags); @@ -355,7 +357,7 @@ void kasan_report_invalid_free(void *object, unsigned long ip) print_address_description(object, tag); pr_err("\n"); print_memory_metadata(object); - end_report(&flags); + end_report(&flags, (unsigned long)object); } static void __kasan_report(unsigned long addr, size_t size, bool is_write, @@ -401,7 +403,7 @@ static void __kasan_report(unsigned long addr, size_t size, bool is_write, dump_stack(); } - end_report(&flags); + end_report(&flags, addr); } bool kasan_report(unsigned long addr, size_t size, bool is_write, diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c index 80adc85d0393..63f43443f5d7 100644 --- a/mm/kasan/shadow.c +++ b/mm/kasan/shadow.c @@ -13,6 +13,7 @@ #include <linux/init.h> #include <linux/kasan.h> #include <linux/kernel.h> +#include <linux/kfence.h> #include <linux/kmemleak.h> #include <linux/memory.h> #include <linux/mm.h> @@ -68,11 +69,7 @@ void *memcpy(void *dest, const void *src, size_t len) return __memcpy(dest, src, len); } -/* - * Poisons the shadow memory for 'size' bytes starting from 'addr'. - * Memory addresses should be aligned to KASAN_GRANULE_SIZE. - */ -void kasan_poison(const void *address, size_t size, u8 value) +void kasan_poison(const void *addr, size_t size, u8 value) { void *shadow_start, *shadow_end; @@ -81,37 +78,62 @@ void kasan_poison(const void *address, size_t size, u8 value) * some of the callers (e.g. kasan_poison_object_data) pass tagged * addresses to this function. */ - address = kasan_reset_tag(address); - size = round_up(size, KASAN_GRANULE_SIZE); + addr = kasan_reset_tag(addr); - shadow_start = kasan_mem_to_shadow(address); - shadow_end = kasan_mem_to_shadow(address + size); + /* Skip KFENCE memory if called explicitly outside of sl*b. */ + if (is_kfence_address(addr)) + return; + + if (WARN_ON((unsigned long)addr & KASAN_GRANULE_MASK)) + return; + if (WARN_ON(size & KASAN_GRANULE_MASK)) + return; + + shadow_start = kasan_mem_to_shadow(addr); + shadow_end = kasan_mem_to_shadow(addr + size); __memset(shadow_start, value, shadow_end - shadow_start); } EXPORT_SYMBOL(kasan_poison); -void kasan_unpoison(const void *address, size_t size) +#ifdef CONFIG_KASAN_GENERIC +void kasan_poison_last_granule(const void *addr, size_t size) { - u8 tag = get_tag(address); + if (size & KASAN_GRANULE_MASK) { + u8 *shadow = (u8 *)kasan_mem_to_shadow(addr + size); + *shadow = size & KASAN_GRANULE_MASK; + } +} +#endif + +void kasan_unpoison(const void *addr, size_t size) +{ + u8 tag = get_tag(addr); /* * Perform shadow offset calculation based on untagged address, as * some of the callers (e.g. kasan_unpoison_object_data) pass tagged * addresses to this function. */ - address = kasan_reset_tag(address); + addr = kasan_reset_tag(addr); + + /* + * Skip KFENCE memory if called explicitly outside of sl*b. Also note + * that calls to ksize(), where size is not a multiple of machine-word + * size, would otherwise poison the invalid portion of the word. + */ + if (is_kfence_address(addr)) + return; - kasan_poison(address, size, tag); + if (WARN_ON((unsigned long)addr & KASAN_GRANULE_MASK)) + return; - if (size & KASAN_GRANULE_MASK) { - u8 *shadow = (u8 *)kasan_mem_to_shadow(address + size); + /* Unpoison all granules that cover the object. */ + kasan_poison(addr, round_up(size, KASAN_GRANULE_SIZE), tag); - if (IS_ENABLED(CONFIG_KASAN_SW_TAGS)) - *shadow = tag; - else /* CONFIG_KASAN_GENERIC */ - *shadow = size & KASAN_GRANULE_MASK; - } + /* Partially poison the last granule for the generic mode. */ + if (IS_ENABLED(CONFIG_KASAN_GENERIC)) + kasan_poison_last_granule(addr, size); } #ifdef CONFIG_MEMORY_HOTPLUG diff --git a/mm/kfence/Makefile b/mm/kfence/Makefile new file mode 100644 index 000000000000..6872cd5e5390 --- /dev/null +++ b/mm/kfence/Makefile @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: GPL-2.0 + +obj-$(CONFIG_KFENCE) := core.o report.o + +CFLAGS_kfence_test.o := -g -fno-omit-frame-pointer -fno-optimize-sibling-calls +obj-$(CONFIG_KFENCE_KUNIT_TEST) += kfence_test.o diff --git a/mm/kfence/core.c b/mm/kfence/core.c new file mode 100644 index 000000000000..3b8ec938470a --- /dev/null +++ b/mm/kfence/core.c @@ -0,0 +1,841 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KFENCE guarded object allocator and fault handling. + * + * Copyright (C) 2020, Google LLC. + */ + +#define pr_fmt(fmt) "kfence: " fmt + +#include <linux/atomic.h> +#include <linux/bug.h> +#include <linux/debugfs.h> +#include <linux/kcsan-checks.h> +#include <linux/kfence.h> +#include <linux/list.h> +#include <linux/lockdep.h> +#include <linux/memblock.h> +#include <linux/moduleparam.h> +#include <linux/random.h> +#include <linux/rcupdate.h> +#include <linux/seq_file.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/string.h> + +#include <asm/kfence.h> + +#include "kfence.h" + +/* Disables KFENCE on the first warning assuming an irrecoverable error. */ +#define KFENCE_WARN_ON(cond) \ + ({ \ + const bool __cond = WARN_ON(cond); \ + if (unlikely(__cond)) \ + WRITE_ONCE(kfence_enabled, false); \ + __cond; \ + }) + +/* === Data ================================================================= */ + +static bool kfence_enabled __read_mostly; + +static unsigned long kfence_sample_interval __read_mostly = CONFIG_KFENCE_SAMPLE_INTERVAL; + +#ifdef MODULE_PARAM_PREFIX +#undef MODULE_PARAM_PREFIX +#endif +#define MODULE_PARAM_PREFIX "kfence." + +static int param_set_sample_interval(const char *val, const struct kernel_param *kp) +{ + unsigned long num; + int ret = kstrtoul(val, 0, &num); + + if (ret < 0) + return ret; + + if (!num) /* Using 0 to indicate KFENCE is disabled. */ + WRITE_ONCE(kfence_enabled, false); + else if (!READ_ONCE(kfence_enabled) && system_state != SYSTEM_BOOTING) + return -EINVAL; /* Cannot (re-)enable KFENCE on-the-fly. */ + + *((unsigned long *)kp->arg) = num; + return 0; +} + +static int param_get_sample_interval(char *buffer, const struct kernel_param *kp) +{ + if (!READ_ONCE(kfence_enabled)) + return sprintf(buffer, "0\n"); + + return param_get_ulong(buffer, kp); +} + +static const struct kernel_param_ops sample_interval_param_ops = { + .set = param_set_sample_interval, + .get = param_get_sample_interval, +}; +module_param_cb(sample_interval, &sample_interval_param_ops, &kfence_sample_interval, 0600); + +/* The pool of pages used for guard pages and objects. */ +char *__kfence_pool __ro_after_init; +EXPORT_SYMBOL(__kfence_pool); /* Export for test modules. */ + +/* + * Per-object metadata, with one-to-one mapping of object metadata to + * backing pages (in __kfence_pool). + */ +static_assert(CONFIG_KFENCE_NUM_OBJECTS > 0); +struct kfence_metadata kfence_metadata[CONFIG_KFENCE_NUM_OBJECTS]; + +/* Freelist with available objects. */ +static struct list_head kfence_freelist = LIST_HEAD_INIT(kfence_freelist); +static DEFINE_RAW_SPINLOCK(kfence_freelist_lock); /* Lock protecting freelist. */ + +#ifdef CONFIG_KFENCE_STATIC_KEYS +/* The static key to set up a KFENCE allocation. */ +DEFINE_STATIC_KEY_FALSE(kfence_allocation_key); +#endif + +/* Gates the allocation, ensuring only one succeeds in a given period. */ +atomic_t kfence_allocation_gate = ATOMIC_INIT(1); + +/* Statistics counters for debugfs. */ +enum kfence_counter_id { + KFENCE_COUNTER_ALLOCATED, + KFENCE_COUNTER_ALLOCS, + KFENCE_COUNTER_FREES, + KFENCE_COUNTER_ZOMBIES, + KFENCE_COUNTER_BUGS, + KFENCE_COUNTER_COUNT, +}; +static atomic_long_t counters[KFENCE_COUNTER_COUNT]; +static const char *const counter_names[] = { + [KFENCE_COUNTER_ALLOCATED] = "currently allocated", + [KFENCE_COUNTER_ALLOCS] = "total allocations", + [KFENCE_COUNTER_FREES] = "total frees", + [KFENCE_COUNTER_ZOMBIES] = "zombie allocations", + [KFENCE_COUNTER_BUGS] = "total bugs", +}; +static_assert(ARRAY_SIZE(counter_names) == KFENCE_COUNTER_COUNT); + +/* === Internals ============================================================ */ + +static bool kfence_protect(unsigned long addr) +{ + return !KFENCE_WARN_ON(!kfence_protect_page(ALIGN_DOWN(addr, PAGE_SIZE), true)); +} + +static bool kfence_unprotect(unsigned long addr) +{ + return !KFENCE_WARN_ON(!kfence_protect_page(ALIGN_DOWN(addr, PAGE_SIZE), false)); +} + +static inline struct kfence_metadata *addr_to_metadata(unsigned long addr) +{ + long index; + + /* The checks do not affect performance; only called from slow-paths. */ + + if (!is_kfence_address((void *)addr)) + return NULL; + + /* + * May be an invalid index if called with an address at the edge of + * __kfence_pool, in which case we would report an "invalid access" + * error. + */ + index = (addr - (unsigned long)__kfence_pool) / (PAGE_SIZE * 2) - 1; + if (index < 0 || index >= CONFIG_KFENCE_NUM_OBJECTS) + return NULL; + + return &kfence_metadata[index]; +} + +static inline unsigned long metadata_to_pageaddr(const struct kfence_metadata *meta) +{ + unsigned long offset = (meta - kfence_metadata + 1) * PAGE_SIZE * 2; + unsigned long pageaddr = (unsigned long)&__kfence_pool[offset]; + + /* The checks do not affect performance; only called from slow-paths. */ + + /* Only call with a pointer into kfence_metadata. */ + if (KFENCE_WARN_ON(meta < kfence_metadata || + meta >= kfence_metadata + CONFIG_KFENCE_NUM_OBJECTS)) + return 0; + + /* + * This metadata object only ever maps to 1 page; verify that the stored + * address is in the expected range. + */ + if (KFENCE_WARN_ON(ALIGN_DOWN(meta->addr, PAGE_SIZE) != pageaddr)) + return 0; + + return pageaddr; +} + +/* + * Update the object's metadata state, including updating the alloc/free stacks + * depending on the state transition. + */ +static noinline void metadata_update_state(struct kfence_metadata *meta, + enum kfence_object_state next) +{ + struct kfence_track *track = + next == KFENCE_OBJECT_FREED ? &meta->free_track : &meta->alloc_track; + + lockdep_assert_held(&meta->lock); + + /* + * Skip over 1 (this) functions; noinline ensures we do not accidentally + * skip over the caller by never inlining. + */ + track->num_stack_entries = stack_trace_save(track->stack_entries, KFENCE_STACK_DEPTH, 1); + track->pid = task_pid_nr(current); + + /* + * Pairs with READ_ONCE() in + * kfence_shutdown_cache(), + * kfence_handle_page_fault(). + */ + WRITE_ONCE(meta->state, next); +} + +/* Write canary byte to @addr. */ +static inline bool set_canary_byte(u8 *addr) +{ + *addr = KFENCE_CANARY_PATTERN(addr); + return true; +} + +/* Check canary byte at @addr. */ +static inline bool check_canary_byte(u8 *addr) +{ + if (likely(*addr == KFENCE_CANARY_PATTERN(addr))) + return true; + + atomic_long_inc(&counters[KFENCE_COUNTER_BUGS]); + kfence_report_error((unsigned long)addr, false, NULL, addr_to_metadata((unsigned long)addr), + KFENCE_ERROR_CORRUPTION); + return false; +} + +/* __always_inline this to ensure we won't do an indirect call to fn. */ +static __always_inline void for_each_canary(const struct kfence_metadata *meta, bool (*fn)(u8 *)) +{ + const unsigned long pageaddr = ALIGN_DOWN(meta->addr, PAGE_SIZE); + unsigned long addr; + + lockdep_assert_held(&meta->lock); + + /* + * We'll iterate over each canary byte per-side until fn() returns + * false. However, we'll still iterate over the canary bytes to the + * right of the object even if there was an error in the canary bytes to + * the left of the object. Specifically, if check_canary_byte() + * generates an error, showing both sides might give more clues as to + * what the error is about when displaying which bytes were corrupted. + */ + + /* Apply to left of object. */ + for (addr = pageaddr; addr < meta->addr; addr++) { + if (!fn((u8 *)addr)) + break; + } + + /* Apply to right of object. */ + for (addr = meta->addr + meta->size; addr < pageaddr + PAGE_SIZE; addr++) { + if (!fn((u8 *)addr)) + break; + } +} + +static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t gfp) +{ + struct kfence_metadata *meta = NULL; + unsigned long flags; + struct page *page; + void *addr; + + /* Try to obtain a free object. */ + raw_spin_lock_irqsave(&kfence_freelist_lock, flags); + if (!list_empty(&kfence_freelist)) { + meta = list_entry(kfence_freelist.next, struct kfence_metadata, list); + list_del_init(&meta->list); + } + raw_spin_unlock_irqrestore(&kfence_freelist_lock, flags); + if (!meta) + return NULL; + + if (unlikely(!raw_spin_trylock_irqsave(&meta->lock, flags))) { + /* + * This is extremely unlikely -- we are reporting on a + * use-after-free, which locked meta->lock, and the reporting + * code via printk calls kmalloc() which ends up in + * kfence_alloc() and tries to grab the same object that we're + * reporting on. While it has never been observed, lockdep does + * report that there is a possibility of deadlock. Fix it by + * using trylock and bailing out gracefully. + */ + raw_spin_lock_irqsave(&kfence_freelist_lock, flags); + /* Put the object back on the freelist. */ + list_add_tail(&meta->list, &kfence_freelist); + raw_spin_unlock_irqrestore(&kfence_freelist_lock, flags); + + return NULL; + } + + meta->addr = metadata_to_pageaddr(meta); + /* Unprotect if we're reusing this page. */ + if (meta->state == KFENCE_OBJECT_FREED) + kfence_unprotect(meta->addr); + + /* + * Note: for allocations made before RNG initialization, will always + * return zero. We still benefit from enabling KFENCE as early as + * possible, even when the RNG is not yet available, as this will allow + * KFENCE to detect bugs due to earlier allocations. The only downside + * is that the out-of-bounds accesses detected are deterministic for + * such allocations. + */ + if (prandom_u32_max(2)) { + /* Allocate on the "right" side, re-calculate address. */ + meta->addr += PAGE_SIZE - size; + meta->addr = ALIGN_DOWN(meta->addr, cache->align); + } + + addr = (void *)meta->addr; + + /* Update remaining metadata. */ + metadata_update_state(meta, KFENCE_OBJECT_ALLOCATED); + /* Pairs with READ_ONCE() in kfence_shutdown_cache(). */ + WRITE_ONCE(meta->cache, cache); + meta->size = size; + for_each_canary(meta, set_canary_byte); + + /* Set required struct page fields. */ + page = virt_to_page(meta->addr); + page->slab_cache = cache; + if (IS_ENABLED(CONFIG_SLUB)) + page->objects = 1; + if (IS_ENABLED(CONFIG_SLAB)) + page->s_mem = addr; + + raw_spin_unlock_irqrestore(&meta->lock, flags); + + /* Memory initialization. */ + + /* + * We check slab_want_init_on_alloc() ourselves, rather than letting + * SL*B do the initialization, as otherwise we might overwrite KFENCE's + * redzone. + */ + if (unlikely(slab_want_init_on_alloc(gfp, cache))) + memzero_explicit(addr, size); + if (cache->ctor) + cache->ctor(addr); + + if (CONFIG_KFENCE_STRESS_TEST_FAULTS && !prandom_u32_max(CONFIG_KFENCE_STRESS_TEST_FAULTS)) + kfence_protect(meta->addr); /* Random "faults" by protecting the object. */ + + atomic_long_inc(&counters[KFENCE_COUNTER_ALLOCATED]); + atomic_long_inc(&counters[KFENCE_COUNTER_ALLOCS]); + + return addr; +} + +static void kfence_guarded_free(void *addr, struct kfence_metadata *meta, bool zombie) +{ + struct kcsan_scoped_access assert_page_exclusive; + unsigned long flags; + + raw_spin_lock_irqsave(&meta->lock, flags); + + if (meta->state != KFENCE_OBJECT_ALLOCATED || meta->addr != (unsigned long)addr) { + /* Invalid or double-free, bail out. */ + atomic_long_inc(&counters[KFENCE_COUNTER_BUGS]); + kfence_report_error((unsigned long)addr, false, NULL, meta, + KFENCE_ERROR_INVALID_FREE); + raw_spin_unlock_irqrestore(&meta->lock, flags); + return; + } + + /* Detect racy use-after-free, or incorrect reallocation of this page by KFENCE. */ + kcsan_begin_scoped_access((void *)ALIGN_DOWN((unsigned long)addr, PAGE_SIZE), PAGE_SIZE, + KCSAN_ACCESS_SCOPED | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ASSERT, + &assert_page_exclusive); + + if (CONFIG_KFENCE_STRESS_TEST_FAULTS) + kfence_unprotect((unsigned long)addr); /* To check canary bytes. */ + + /* Restore page protection if there was an OOB access. */ + if (meta->unprotected_page) { + kfence_protect(meta->unprotected_page); + meta->unprotected_page = 0; + } + + /* Check canary bytes for memory corruption. */ + for_each_canary(meta, check_canary_byte); + + /* + * Clear memory if init-on-free is set. While we protect the page, the + * data is still there, and after a use-after-free is detected, we + * unprotect the page, so the data is still accessible. + */ + if (!zombie && unlikely(slab_want_init_on_free(meta->cache))) + memzero_explicit(addr, meta->size); + + /* Mark the object as freed. */ + metadata_update_state(meta, KFENCE_OBJECT_FREED); + + raw_spin_unlock_irqrestore(&meta->lock, flags); + + /* Protect to detect use-after-frees. */ + kfence_protect((unsigned long)addr); + + kcsan_end_scoped_access(&assert_page_exclusive); + if (!zombie) { + /* Add it to the tail of the freelist for reuse. */ + raw_spin_lock_irqsave(&kfence_freelist_lock, flags); + KFENCE_WARN_ON(!list_empty(&meta->list)); + list_add_tail(&meta->list, &kfence_freelist); + raw_spin_unlock_irqrestore(&kfence_freelist_lock, flags); + + atomic_long_dec(&counters[KFENCE_COUNTER_ALLOCATED]); + atomic_long_inc(&counters[KFENCE_COUNTER_FREES]); + } else { + /* See kfence_shutdown_cache(). */ + atomic_long_inc(&counters[KFENCE_COUNTER_ZOMBIES]); + } +} + +static void rcu_guarded_free(struct rcu_head *h) +{ + struct kfence_metadata *meta = container_of(h, struct kfence_metadata, rcu_head); + + kfence_guarded_free((void *)meta->addr, meta, false); +} + +static bool __init kfence_init_pool(void) +{ + unsigned long addr = (unsigned long)__kfence_pool; + struct page *pages; + int i; + + if (!__kfence_pool) + return false; + + if (!arch_kfence_init_pool()) + goto err; + + pages = virt_to_page(addr); + + /* + * Set up object pages: they must have PG_slab set, to avoid freeing + * these as real pages. + * + * We also want to avoid inserting kfence_free() in the kfree() + * fast-path in SLUB, and therefore need to ensure kfree() correctly + * enters __slab_free() slow-path. + */ + for (i = 0; i < KFENCE_POOL_SIZE / PAGE_SIZE; i++) { + if (!i || (i % 2)) + continue; + + /* Verify we do not have a compound head page. */ + if (WARN_ON(compound_head(&pages[i]) != &pages[i])) + goto err; + + __SetPageSlab(&pages[i]); + } + + /* + * Protect the first 2 pages. The first page is mostly unnecessary, and + * merely serves as an extended guard page. However, adding one + * additional page in the beginning gives us an even number of pages, + * which simplifies the mapping of address to metadata index. + */ + for (i = 0; i < 2; i++) { + if (unlikely(!kfence_protect(addr))) + goto err; + + addr += PAGE_SIZE; + } + + for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) { + struct kfence_metadata *meta = &kfence_metadata[i]; + + /* Initialize metadata. */ + INIT_LIST_HEAD(&meta->list); + raw_spin_lock_init(&meta->lock); + meta->state = KFENCE_OBJECT_UNUSED; + meta->addr = addr; /* Initialize for validation in metadata_to_pageaddr(). */ + list_add_tail(&meta->list, &kfence_freelist); + + /* Protect the right redzone. */ + if (unlikely(!kfence_protect(addr + PAGE_SIZE))) + goto err; + + addr += 2 * PAGE_SIZE; + } + + return true; + +err: + /* + * Only release unprotected pages, and do not try to go back and change + * page attributes due to risk of failing to do so as well. If changing + * page attributes for some pages fails, it is very likely that it also + * fails for the first page, and therefore expect addr==__kfence_pool in + * most failure cases. + */ + memblock_free_late(__pa(addr), KFENCE_POOL_SIZE - (addr - (unsigned long)__kfence_pool)); + __kfence_pool = NULL; + return false; +} + +/* === DebugFS Interface ==================================================== */ + +static int stats_show(struct seq_file *seq, void *v) +{ + int i; + + seq_printf(seq, "enabled: %i\n", READ_ONCE(kfence_enabled)); + for (i = 0; i < KFENCE_COUNTER_COUNT; i++) + seq_printf(seq, "%s: %ld\n", counter_names[i], atomic_long_read(&counters[i])); + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(stats); + +/* + * debugfs seq_file operations for /sys/kernel/debug/kfence/objects. + * start_object() and next_object() return the object index + 1, because NULL is used + * to stop iteration. + */ +static void *start_object(struct seq_file *seq, loff_t *pos) +{ + if (*pos < CONFIG_KFENCE_NUM_OBJECTS) + return (void *)((long)*pos + 1); + return NULL; +} + +static void stop_object(struct seq_file *seq, void *v) +{ +} + +static void *next_object(struct seq_file *seq, void *v, loff_t *pos) +{ + ++*pos; + if (*pos < CONFIG_KFENCE_NUM_OBJECTS) + return (void *)((long)*pos + 1); + return NULL; +} + +static int show_object(struct seq_file *seq, void *v) +{ + struct kfence_metadata *meta = &kfence_metadata[(long)v - 1]; + unsigned long flags; + + raw_spin_lock_irqsave(&meta->lock, flags); + kfence_print_object(seq, meta); + raw_spin_unlock_irqrestore(&meta->lock, flags); + seq_puts(seq, "---------------------------------\n"); + + return 0; +} + +static const struct seq_operations object_seqops = { + .start = start_object, + .next = next_object, + .stop = stop_object, + .show = show_object, +}; + +static int open_objects(struct inode *inode, struct file *file) +{ + return seq_open(file, &object_seqops); +} + +static const struct file_operations objects_fops = { + .open = open_objects, + .read = seq_read, + .llseek = seq_lseek, +}; + +static int __init kfence_debugfs_init(void) +{ + struct dentry *kfence_dir = debugfs_create_dir("kfence", NULL); + + debugfs_create_file("stats", 0444, kfence_dir, NULL, &stats_fops); + debugfs_create_file("objects", 0400, kfence_dir, NULL, &objects_fops); + return 0; +} + +late_initcall(kfence_debugfs_init); + +/* === Allocation Gate Timer ================================================ */ + +/* + * Set up delayed work, which will enable and disable the static key. We need to + * use a work queue (rather than a simple timer), since enabling and disabling a + * static key cannot be done from an interrupt. + * + * Note: Toggling a static branch currently causes IPIs, and here we'll end up + * with a total of 2 IPIs to all CPUs. If this ends up a problem in future (with + * more aggressive sampling intervals), we could get away with a variant that + * avoids IPIs, at the cost of not immediately capturing allocations if the + * instructions remain cached. + */ +static struct delayed_work kfence_timer; +static void toggle_allocation_gate(struct work_struct *work) +{ + if (!READ_ONCE(kfence_enabled)) + return; + + /* Enable static key, and await allocation to happen. */ + atomic_set(&kfence_allocation_gate, 0); +#ifdef CONFIG_KFENCE_STATIC_KEYS + static_branch_enable(&kfence_allocation_key); + /* + * Await an allocation. Timeout after 1 second, in case the kernel stops + * doing allocations, to avoid stalling this worker task for too long. + */ + { + unsigned long end_wait = jiffies + HZ; + + do { + set_current_state(TASK_UNINTERRUPTIBLE); + if (atomic_read(&kfence_allocation_gate) != 0) + break; + schedule_timeout(1); + } while (time_before(jiffies, end_wait)); + __set_current_state(TASK_RUNNING); + } + /* Disable static key and reset timer. */ + static_branch_disable(&kfence_allocation_key); +#endif + schedule_delayed_work(&kfence_timer, msecs_to_jiffies(kfence_sample_interval)); +} +static DECLARE_DELAYED_WORK(kfence_timer, toggle_allocation_gate); + +/* === Public interface ===================================================== */ + +void __init kfence_alloc_pool(void) +{ + if (!kfence_sample_interval) + return; + + __kfence_pool = memblock_alloc(KFENCE_POOL_SIZE, PAGE_SIZE); + + if (!__kfence_pool) + pr_err("failed to allocate pool\n"); +} + +void __init kfence_init(void) +{ + /* Setting kfence_sample_interval to 0 on boot disables KFENCE. */ + if (!kfence_sample_interval) + return; + + if (!kfence_init_pool()) { + pr_err("%s failed\n", __func__); + return; + } + + WRITE_ONCE(kfence_enabled, true); + schedule_delayed_work(&kfence_timer, 0); + pr_info("initialized - using %lu bytes for %d objects at 0x%p-0x%p\n", KFENCE_POOL_SIZE, + CONFIG_KFENCE_NUM_OBJECTS, (void *)__kfence_pool, + (void *)(__kfence_pool + KFENCE_POOL_SIZE)); +} + +void kfence_shutdown_cache(struct kmem_cache *s) +{ + unsigned long flags; + struct kfence_metadata *meta; + int i; + + for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) { + bool in_use; + + meta = &kfence_metadata[i]; + + /* + * If we observe some inconsistent cache and state pair where we + * should have returned false here, cache destruction is racing + * with either kmem_cache_alloc() or kmem_cache_free(). Taking + * the lock will not help, as different critical section + * serialization will have the same outcome. + */ + if (READ_ONCE(meta->cache) != s || + READ_ONCE(meta->state) != KFENCE_OBJECT_ALLOCATED) + continue; + + raw_spin_lock_irqsave(&meta->lock, flags); + in_use = meta->cache == s && meta->state == KFENCE_OBJECT_ALLOCATED; + raw_spin_unlock_irqrestore(&meta->lock, flags); + + if (in_use) { + /* + * This cache still has allocations, and we should not + * release them back into the freelist so they can still + * safely be used and retain the kernel's default + * behaviour of keeping the allocations alive (leak the + * cache); however, they effectively become "zombie + * allocations" as the KFENCE objects are the only ones + * still in use and the owning cache is being destroyed. + * + * We mark them freed, so that any subsequent use shows + * more useful error messages that will include stack + * traces of the user of the object, the original + * allocation, and caller to shutdown_cache(). + */ + kfence_guarded_free((void *)meta->addr, meta, /*zombie=*/true); + } + } + + for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) { + meta = &kfence_metadata[i]; + + /* See above. */ + if (READ_ONCE(meta->cache) != s || READ_ONCE(meta->state) != KFENCE_OBJECT_FREED) + continue; + + raw_spin_lock_irqsave(&meta->lock, flags); + if (meta->cache == s && meta->state == KFENCE_OBJECT_FREED) + meta->cache = NULL; + raw_spin_unlock_irqrestore(&meta->lock, flags); + } +} + +void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags) +{ + /* + * allocation_gate only needs to become non-zero, so it doesn't make + * sense to continue writing to it and pay the associated contention + * cost, in case we have a large number of concurrent allocations. + */ + if (atomic_read(&kfence_allocation_gate) || atomic_inc_return(&kfence_allocation_gate) > 1) + return NULL; + + if (!READ_ONCE(kfence_enabled)) + return NULL; + + if (size > PAGE_SIZE) + return NULL; + + return kfence_guarded_alloc(s, size, flags); +} + +size_t kfence_ksize(const void *addr) +{ + const struct kfence_metadata *meta = addr_to_metadata((unsigned long)addr); + + /* + * Read locklessly -- if there is a race with __kfence_alloc(), this is + * either a use-after-free or invalid access. + */ + return meta ? meta->size : 0; +} + +void *kfence_object_start(const void *addr) +{ + const struct kfence_metadata *meta = addr_to_metadata((unsigned long)addr); + + /* + * Read locklessly -- if there is a race with __kfence_alloc(), this is + * either a use-after-free or invalid access. + */ + return meta ? (void *)meta->addr : NULL; +} + +void __kfence_free(void *addr) +{ + struct kfence_metadata *meta = addr_to_metadata((unsigned long)addr); + + /* + * If the objects of the cache are SLAB_TYPESAFE_BY_RCU, defer freeing + * the object, as the object page may be recycled for other-typed + * objects once it has been freed. meta->cache may be NULL if the cache + * was destroyed. + */ + if (unlikely(meta->cache && (meta->cache->flags & SLAB_TYPESAFE_BY_RCU))) + call_rcu(&meta->rcu_head, rcu_guarded_free); + else + kfence_guarded_free(addr, meta, false); +} + +bool kfence_handle_page_fault(unsigned long addr, bool is_write, struct pt_regs *regs) +{ + const int page_index = (addr - (unsigned long)__kfence_pool) / PAGE_SIZE; + struct kfence_metadata *to_report = NULL; + enum kfence_error_type error_type; + unsigned long flags; + + if (!is_kfence_address((void *)addr)) + return false; + + if (!READ_ONCE(kfence_enabled)) /* If disabled at runtime ... */ + return kfence_unprotect(addr); /* ... unprotect and proceed. */ + + atomic_long_inc(&counters[KFENCE_COUNTER_BUGS]); + + if (page_index % 2) { + /* This is a redzone, report a buffer overflow. */ + struct kfence_metadata *meta; + int distance = 0; + + meta = addr_to_metadata(addr - PAGE_SIZE); + if (meta && READ_ONCE(meta->state) == KFENCE_OBJECT_ALLOCATED) { + to_report = meta; + /* Data race ok; distance calculation approximate. */ + distance = addr - data_race(meta->addr + meta->size); + } + + meta = addr_to_metadata(addr + PAGE_SIZE); + if (meta && READ_ONCE(meta->state) == KFENCE_OBJECT_ALLOCATED) { + /* Data race ok; distance calculation approximate. */ + if (!to_report || distance > data_race(meta->addr) - addr) + to_report = meta; + } + + if (!to_report) + goto out; + + raw_spin_lock_irqsave(&to_report->lock, flags); + to_report->unprotected_page = addr; + error_type = KFENCE_ERROR_OOB; + + /* + * If the object was freed before we took the look we can still + * report this as an OOB -- the report will simply show the + * stacktrace of the free as well. + */ + } else { + to_report = addr_to_metadata(addr); + if (!to_report) + goto out; + + raw_spin_lock_irqsave(&to_report->lock, flags); + error_type = KFENCE_ERROR_UAF; + /* + * We may race with __kfence_alloc(), and it is possible that a + * freed object may be reallocated. We simply report this as a + * use-after-free, with the stack trace showing the place where + * the object was re-allocated. + */ + } + +out: + if (to_report) { + kfence_report_error(addr, is_write, regs, to_report, error_type); + raw_spin_unlock_irqrestore(&to_report->lock, flags); + } else { + /* This may be a UAF or OOB access, but we can't be sure. */ + kfence_report_error(addr, is_write, regs, NULL, KFENCE_ERROR_INVALID); + } + + return kfence_unprotect(addr); /* Unprotect and let access proceed. */ +} diff --git a/mm/kfence/kfence.h b/mm/kfence/kfence.h new file mode 100644 index 000000000000..24065321ff8a --- /dev/null +++ b/mm/kfence/kfence.h @@ -0,0 +1,106 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Kernel Electric-Fence (KFENCE). For more info please see + * Documentation/dev-tools/kfence.rst. + * + * Copyright (C) 2020, Google LLC. + */ + +#ifndef MM_KFENCE_KFENCE_H +#define MM_KFENCE_KFENCE_H + +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/types.h> + +#include "../slab.h" /* for struct kmem_cache */ + +/* + * Get the canary byte pattern for @addr. Use a pattern that varies based on the + * lower 3 bits of the address, to detect memory corruptions with higher + * probability, where similar constants are used. + */ +#define KFENCE_CANARY_PATTERN(addr) ((u8)0xaa ^ (u8)((unsigned long)(addr) & 0x7)) + +/* Maximum stack depth for reports. */ +#define KFENCE_STACK_DEPTH 64 + +/* KFENCE object states. */ +enum kfence_object_state { + KFENCE_OBJECT_UNUSED, /* Object is unused. */ + KFENCE_OBJECT_ALLOCATED, /* Object is currently allocated. */ + KFENCE_OBJECT_FREED, /* Object was allocated, and then freed. */ +}; + +/* Alloc/free tracking information. */ +struct kfence_track { + pid_t pid; + int num_stack_entries; + unsigned long stack_entries[KFENCE_STACK_DEPTH]; +}; + +/* KFENCE metadata per guarded allocation. */ +struct kfence_metadata { + struct list_head list; /* Freelist node; access under kfence_freelist_lock. */ + struct rcu_head rcu_head; /* For delayed freeing. */ + + /* + * Lock protecting below data; to ensure consistency of the below data, + * since the following may execute concurrently: __kfence_alloc(), + * __kfence_free(), kfence_handle_page_fault(). However, note that we + * cannot grab the same metadata off the freelist twice, and multiple + * __kfence_alloc() cannot run concurrently on the same metadata. + */ + raw_spinlock_t lock; + + /* The current state of the object; see above. */ + enum kfence_object_state state; + + /* + * Allocated object address; cannot be calculated from size, because of + * alignment requirements. + * + * Invariant: ALIGN_DOWN(addr, PAGE_SIZE) is constant. + */ + unsigned long addr; + + /* + * The size of the original allocation. + */ + size_t size; + + /* + * The kmem_cache cache of the last allocation; NULL if never allocated + * or the cache has already been destroyed. + */ + struct kmem_cache *cache; + + /* + * In case of an invalid access, the page that was unprotected; we + * optimistically only store one address. + */ + unsigned long unprotected_page; + + /* Allocation and free stack information. */ + struct kfence_track alloc_track; + struct kfence_track free_track; +}; + +extern struct kfence_metadata kfence_metadata[CONFIG_KFENCE_NUM_OBJECTS]; + +/* KFENCE error types for report generation. */ +enum kfence_error_type { + KFENCE_ERROR_OOB, /* Detected a out-of-bounds access. */ + KFENCE_ERROR_UAF, /* Detected a use-after-free access. */ + KFENCE_ERROR_CORRUPTION, /* Detected a memory corruption on free. */ + KFENCE_ERROR_INVALID, /* Invalid access of unknown type. */ + KFENCE_ERROR_INVALID_FREE, /* Invalid free. */ +}; + +void kfence_report_error(unsigned long address, bool is_write, struct pt_regs *regs, + const struct kfence_metadata *meta, enum kfence_error_type type); + +void kfence_print_object(struct seq_file *seq, const struct kfence_metadata *meta); + +#endif /* MM_KFENCE_KFENCE_H */ diff --git a/mm/kfence/kfence_test.c b/mm/kfence/kfence_test.c new file mode 100644 index 000000000000..4acf4251ee04 --- /dev/null +++ b/mm/kfence/kfence_test.c @@ -0,0 +1,858 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Test cases for KFENCE memory safety error detector. Since the interface with + * which KFENCE's reports are obtained is via the console, this is the output we + * should verify. For each test case checks the presence (or absence) of + * generated reports. Relies on 'console' tracepoint to capture reports as they + * appear in the kernel log. + * + * Copyright (C) 2020, Google LLC. + * Author: Alexander Potapenko <glider@google.com> + * Marco Elver <elver@google.com> + */ + +#include <kunit/test.h> +#include <linux/jiffies.h> +#include <linux/kernel.h> +#include <linux/kfence.h> +#include <linux/mm.h> +#include <linux/random.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/string.h> +#include <linux/tracepoint.h> +#include <trace/events/printk.h> + +#include "kfence.h" + +/* Report as observed from console. */ +static struct { + spinlock_t lock; + int nlines; + char lines[2][256]; +} observed = { + .lock = __SPIN_LOCK_UNLOCKED(observed.lock), +}; + +/* Probe for console output: obtains observed lines of interest. */ +static void probe_console(void *ignore, const char *buf, size_t len) +{ + unsigned long flags; + int nlines; + + spin_lock_irqsave(&observed.lock, flags); + nlines = observed.nlines; + + if (strnstr(buf, "BUG: KFENCE: ", len) && strnstr(buf, "test_", len)) { + /* + * KFENCE report and related to the test. + * + * The provided @buf is not NUL-terminated; copy no more than + * @len bytes and let strscpy() add the missing NUL-terminator. + */ + strscpy(observed.lines[0], buf, min(len + 1, sizeof(observed.lines[0]))); + nlines = 1; + } else if (nlines == 1 && (strnstr(buf, "at 0x", len) || strnstr(buf, "of 0x", len))) { + strscpy(observed.lines[nlines++], buf, min(len + 1, sizeof(observed.lines[0]))); + } + + WRITE_ONCE(observed.nlines, nlines); /* Publish new nlines. */ + spin_unlock_irqrestore(&observed.lock, flags); +} + +/* Check if a report related to the test exists. */ +static bool report_available(void) +{ + return READ_ONCE(observed.nlines) == ARRAY_SIZE(observed.lines); +} + +/* Information we expect in a report. */ +struct expect_report { + enum kfence_error_type type; /* The type or error. */ + void *fn; /* Function pointer to expected function where access occurred. */ + char *addr; /* Address at which the bad access occurred. */ + bool is_write; /* Is access a write. */ +}; + +static const char *get_access_type(const struct expect_report *r) +{ + return r->is_write ? "write" : "read"; +} + +/* Check observed report matches information in @r. */ +static bool report_matches(const struct expect_report *r) +{ + bool ret = false; + unsigned long flags; + typeof(observed.lines) expect; + const char *end; + char *cur; + + /* Doubled-checked locking. */ + if (!report_available()) + return false; + + /* Generate expected report contents. */ + + /* Title */ + cur = expect[0]; + end = &expect[0][sizeof(expect[0]) - 1]; + switch (r->type) { + case KFENCE_ERROR_OOB: + cur += scnprintf(cur, end - cur, "BUG: KFENCE: out-of-bounds %s", + get_access_type(r)); + break; + case KFENCE_ERROR_UAF: + cur += scnprintf(cur, end - cur, "BUG: KFENCE: use-after-free %s", + get_access_type(r)); + break; + case KFENCE_ERROR_CORRUPTION: + cur += scnprintf(cur, end - cur, "BUG: KFENCE: memory corruption"); + break; + case KFENCE_ERROR_INVALID: + cur += scnprintf(cur, end - cur, "BUG: KFENCE: invalid %s", + get_access_type(r)); + break; + case KFENCE_ERROR_INVALID_FREE: + cur += scnprintf(cur, end - cur, "BUG: KFENCE: invalid free"); + break; + } + + scnprintf(cur, end - cur, " in %pS", r->fn); + /* The exact offset won't match, remove it; also strip module name. */ + cur = strchr(expect[0], '+'); + if (cur) + *cur = '\0'; + + /* Access information */ + cur = expect[1]; + end = &expect[1][sizeof(expect[1]) - 1]; + + switch (r->type) { + case KFENCE_ERROR_OOB: + cur += scnprintf(cur, end - cur, "Out-of-bounds %s at", get_access_type(r)); + break; + case KFENCE_ERROR_UAF: + cur += scnprintf(cur, end - cur, "Use-after-free %s at", get_access_type(r)); + break; + case KFENCE_ERROR_CORRUPTION: + cur += scnprintf(cur, end - cur, "Corrupted memory at"); + break; + case KFENCE_ERROR_INVALID: + cur += scnprintf(cur, end - cur, "Invalid %s at", get_access_type(r)); + break; + case KFENCE_ERROR_INVALID_FREE: + cur += scnprintf(cur, end - cur, "Invalid free of"); + break; + } + + cur += scnprintf(cur, end - cur, " 0x%p", (void *)r->addr); + + spin_lock_irqsave(&observed.lock, flags); + if (!report_available()) + goto out; /* A new report is being captured. */ + + /* Finally match expected output to what we actually observed. */ + ret = strstr(observed.lines[0], expect[0]) && strstr(observed.lines[1], expect[1]); +out: + spin_unlock_irqrestore(&observed.lock, flags); + return ret; +} + +/* ===== Test cases ===== */ + +#define TEST_PRIV_WANT_MEMCACHE ((void *)1) + +/* Cache used by tests; if NULL, allocate from kmalloc instead. */ +static struct kmem_cache *test_cache; + +static size_t setup_test_cache(struct kunit *test, size_t size, slab_flags_t flags, + void (*ctor)(void *)) +{ + if (test->priv != TEST_PRIV_WANT_MEMCACHE) + return size; + + kunit_info(test, "%s: size=%zu, ctor=%ps\n", __func__, size, ctor); + + /* + * Use SLAB_NOLEAKTRACE to prevent merging with existing caches. Any + * other flag in SLAB_NEVER_MERGE also works. Use SLAB_ACCOUNT to + * allocate via memcg, if enabled. + */ + flags |= SLAB_NOLEAKTRACE | SLAB_ACCOUNT; + test_cache = kmem_cache_create("test", size, 1, flags, ctor); + KUNIT_ASSERT_TRUE_MSG(test, test_cache, "could not create cache"); + + return size; +} + +static void test_cache_destroy(void) +{ + if (!test_cache) + return; + + kmem_cache_destroy(test_cache); + test_cache = NULL; +} + +static inline size_t kmalloc_cache_alignment(size_t size) +{ + return kmalloc_caches[kmalloc_type(GFP_KERNEL)][kmalloc_index(size)]->align; +} + +/* Must always inline to match stack trace against caller. */ +static __always_inline void test_free(void *ptr) +{ + if (test_cache) + kmem_cache_free(test_cache, ptr); + else + kfree(ptr); +} + +/* + * If this should be a KFENCE allocation, and on which side the allocation and + * the closest guard page should be. + */ +enum allocation_policy { + ALLOCATE_ANY, /* KFENCE, any side. */ + ALLOCATE_LEFT, /* KFENCE, left side of page. */ + ALLOCATE_RIGHT, /* KFENCE, right side of page. */ + ALLOCATE_NONE, /* No KFENCE allocation. */ +}; + +/* + * Try to get a guarded allocation from KFENCE. Uses either kmalloc() or the + * current test_cache if set up. + */ +static void *test_alloc(struct kunit *test, size_t size, gfp_t gfp, enum allocation_policy policy) +{ + void *alloc; + unsigned long timeout, resched_after; + const char *policy_name; + + switch (policy) { + case ALLOCATE_ANY: + policy_name = "any"; + break; + case ALLOCATE_LEFT: + policy_name = "left"; + break; + case ALLOCATE_RIGHT: + policy_name = "right"; + break; + case ALLOCATE_NONE: + policy_name = "none"; + break; + } + + kunit_info(test, "%s: size=%zu, gfp=%x, policy=%s, cache=%i\n", __func__, size, gfp, + policy_name, !!test_cache); + + /* + * 100x the sample interval should be more than enough to ensure we get + * a KFENCE allocation eventually. + */ + timeout = jiffies + msecs_to_jiffies(100 * CONFIG_KFENCE_SAMPLE_INTERVAL); + /* + * Especially for non-preemption kernels, ensure the allocation-gate + * timer can catch up: after @resched_after, every failed allocation + * attempt yields, to ensure the allocation-gate timer is scheduled. + */ + resched_after = jiffies + msecs_to_jiffies(CONFIG_KFENCE_SAMPLE_INTERVAL); + do { + if (test_cache) + alloc = kmem_cache_alloc(test_cache, gfp); + else + alloc = kmalloc(size, gfp); + + if (is_kfence_address(alloc)) { + struct page *page = virt_to_head_page(alloc); + struct kmem_cache *s = test_cache ?: kmalloc_caches[kmalloc_type(GFP_KERNEL)][kmalloc_index(size)]; + + /* + * Verify that various helpers return the right values + * even for KFENCE objects; these are required so that + * memcg accounting works correctly. + */ + KUNIT_EXPECT_EQ(test, obj_to_index(s, page, alloc), 0U); + KUNIT_EXPECT_EQ(test, objs_per_slab_page(s, page), 1); + + if (policy == ALLOCATE_ANY) + return alloc; + if (policy == ALLOCATE_LEFT && IS_ALIGNED((unsigned long)alloc, PAGE_SIZE)) + return alloc; + if (policy == ALLOCATE_RIGHT && + !IS_ALIGNED((unsigned long)alloc, PAGE_SIZE)) + return alloc; + } else if (policy == ALLOCATE_NONE) + return alloc; + + test_free(alloc); + + if (time_after(jiffies, resched_after)) + cond_resched(); + } while (time_before(jiffies, timeout)); + + KUNIT_ASSERT_TRUE_MSG(test, false, "failed to allocate from KFENCE"); + return NULL; /* Unreachable. */ +} + +static void test_out_of_bounds_read(struct kunit *test) +{ + size_t size = 32; + struct expect_report expect = { + .type = KFENCE_ERROR_OOB, + .fn = test_out_of_bounds_read, + .is_write = false, + }; + char *buf; + + setup_test_cache(test, size, 0, NULL); + + /* + * If we don't have our own cache, adjust based on alignment, so that we + * actually access guard pages on either side. + */ + if (!test_cache) + size = kmalloc_cache_alignment(size); + + /* Test both sides. */ + + buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_LEFT); + expect.addr = buf - 1; + READ_ONCE(*expect.addr); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); + test_free(buf); + + buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_RIGHT); + expect.addr = buf + size; + READ_ONCE(*expect.addr); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); + test_free(buf); +} + +static void test_out_of_bounds_write(struct kunit *test) +{ + size_t size = 32; + struct expect_report expect = { + .type = KFENCE_ERROR_OOB, + .fn = test_out_of_bounds_write, + .is_write = true, + }; + char *buf; + + setup_test_cache(test, size, 0, NULL); + buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_LEFT); + expect.addr = buf - 1; + WRITE_ONCE(*expect.addr, 42); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); + test_free(buf); +} + +static void test_use_after_free_read(struct kunit *test) +{ + const size_t size = 32; + struct expect_report expect = { + .type = KFENCE_ERROR_UAF, + .fn = test_use_after_free_read, + .is_write = false, + }; + + setup_test_cache(test, size, 0, NULL); + expect.addr = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY); + test_free(expect.addr); + READ_ONCE(*expect.addr); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +static void test_double_free(struct kunit *test) +{ + const size_t size = 32; + struct expect_report expect = { + .type = KFENCE_ERROR_INVALID_FREE, + .fn = test_double_free, + }; + + setup_test_cache(test, size, 0, NULL); + expect.addr = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY); + test_free(expect.addr); + test_free(expect.addr); /* Double-free. */ + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +static void test_invalid_addr_free(struct kunit *test) +{ + const size_t size = 32; + struct expect_report expect = { + .type = KFENCE_ERROR_INVALID_FREE, + .fn = test_invalid_addr_free, + }; + char *buf; + + setup_test_cache(test, size, 0, NULL); + buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY); + expect.addr = buf + 1; /* Free on invalid address. */ + test_free(expect.addr); /* Invalid address free. */ + test_free(buf); /* No error. */ + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +static void test_corruption(struct kunit *test) +{ + size_t size = 32; + struct expect_report expect = { + .type = KFENCE_ERROR_CORRUPTION, + .fn = test_corruption, + }; + char *buf; + + setup_test_cache(test, size, 0, NULL); + + /* Test both sides. */ + + buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_LEFT); + expect.addr = buf + size; + WRITE_ONCE(*expect.addr, 42); + test_free(buf); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); + + buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_RIGHT); + expect.addr = buf - 1; + WRITE_ONCE(*expect.addr, 42); + test_free(buf); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +/* + * KFENCE is unable to detect an OOB if the allocation's alignment requirements + * leave a gap between the object and the guard page. Specifically, an + * allocation of e.g. 73 bytes is aligned on 8 and 128 bytes for SLUB or SLAB + * respectively. Therefore it is impossible for the allocated object to + * contiguously line up with the right guard page. + * + * However, we test that an access to memory beyond the gap results in KFENCE + * detecting an OOB access. + */ +static void test_kmalloc_aligned_oob_read(struct kunit *test) +{ + const size_t size = 73; + const size_t align = kmalloc_cache_alignment(size); + struct expect_report expect = { + .type = KFENCE_ERROR_OOB, + .fn = test_kmalloc_aligned_oob_read, + .is_write = false, + }; + char *buf; + + buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_RIGHT); + + /* + * The object is offset to the right, so there won't be an OOB to the + * left of it. + */ + READ_ONCE(*(buf - 1)); + KUNIT_EXPECT_FALSE(test, report_available()); + + /* + * @buf must be aligned on @align, therefore buf + size belongs to the + * same page -> no OOB. + */ + READ_ONCE(*(buf + size)); + KUNIT_EXPECT_FALSE(test, report_available()); + + /* Overflowing by @align bytes will result in an OOB. */ + expect.addr = buf + size + align; + READ_ONCE(*expect.addr); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); + + test_free(buf); +} + +static void test_kmalloc_aligned_oob_write(struct kunit *test) +{ + const size_t size = 73; + struct expect_report expect = { + .type = KFENCE_ERROR_CORRUPTION, + .fn = test_kmalloc_aligned_oob_write, + }; + char *buf; + + buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_RIGHT); + /* + * The object is offset to the right, so we won't get a page + * fault immediately after it. + */ + expect.addr = buf + size; + WRITE_ONCE(*expect.addr, READ_ONCE(*expect.addr) + 1); + KUNIT_EXPECT_FALSE(test, report_available()); + test_free(buf); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +/* Test cache shrinking and destroying with KFENCE. */ +static void test_shrink_memcache(struct kunit *test) +{ + const size_t size = 32; + void *buf; + + setup_test_cache(test, size, 0, NULL); + KUNIT_EXPECT_TRUE(test, test_cache); + buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY); + kmem_cache_shrink(test_cache); + test_free(buf); + + KUNIT_EXPECT_FALSE(test, report_available()); +} + +static void ctor_set_x(void *obj) +{ + /* Every object has at least 8 bytes. */ + memset(obj, 'x', 8); +} + +/* Ensure that SL*B does not modify KFENCE objects on bulk free. */ +static void test_free_bulk(struct kunit *test) +{ + int iter; + + for (iter = 0; iter < 5; iter++) { + const size_t size = setup_test_cache(test, 8 + prandom_u32_max(300), 0, + (iter & 1) ? ctor_set_x : NULL); + void *objects[] = { + test_alloc(test, size, GFP_KERNEL, ALLOCATE_RIGHT), + test_alloc(test, size, GFP_KERNEL, ALLOCATE_NONE), + test_alloc(test, size, GFP_KERNEL, ALLOCATE_LEFT), + test_alloc(test, size, GFP_KERNEL, ALLOCATE_NONE), + test_alloc(test, size, GFP_KERNEL, ALLOCATE_NONE), + }; + + kmem_cache_free_bulk(test_cache, ARRAY_SIZE(objects), objects); + KUNIT_ASSERT_FALSE(test, report_available()); + test_cache_destroy(); + } +} + +/* Test init-on-free works. */ +static void test_init_on_free(struct kunit *test) +{ + const size_t size = 32; + struct expect_report expect = { + .type = KFENCE_ERROR_UAF, + .fn = test_init_on_free, + .is_write = false, + }; + int i; + + if (!IS_ENABLED(CONFIG_INIT_ON_FREE_DEFAULT_ON)) + return; + /* Assume it hasn't been disabled on command line. */ + + setup_test_cache(test, size, 0, NULL); + expect.addr = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY); + for (i = 0; i < size; i++) + expect.addr[i] = i + 1; + test_free(expect.addr); + + for (i = 0; i < size; i++) { + /* + * This may fail if the page was recycled by KFENCE and then + * written to again -- this however, is near impossible with a + * default config. + */ + KUNIT_EXPECT_EQ(test, expect.addr[i], (char)0); + + if (!i) /* Only check first access to not fail test if page is ever re-protected. */ + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); + } +} + +/* Ensure that constructors work properly. */ +static void test_memcache_ctor(struct kunit *test) +{ + const size_t size = 32; + char *buf; + int i; + + setup_test_cache(test, size, 0, ctor_set_x); + buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY); + + for (i = 0; i < 8; i++) + KUNIT_EXPECT_EQ(test, buf[i], (char)'x'); + + test_free(buf); + + KUNIT_EXPECT_FALSE(test, report_available()); +} + +/* Test that memory is zeroed if requested. */ +static void test_gfpzero(struct kunit *test) +{ + const size_t size = PAGE_SIZE; /* PAGE_SIZE so we can use ALLOCATE_ANY. */ + char *buf1, *buf2; + int i; + + if (CONFIG_KFENCE_SAMPLE_INTERVAL > 100) { + kunit_warn(test, "skipping ... would take too long\n"); + return; + } + + setup_test_cache(test, size, 0, NULL); + buf1 = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY); + for (i = 0; i < size; i++) + buf1[i] = i + 1; + test_free(buf1); + + /* Try to get same address again -- this can take a while. */ + for (i = 0;; i++) { + buf2 = test_alloc(test, size, GFP_KERNEL | __GFP_ZERO, ALLOCATE_ANY); + if (buf1 == buf2) + break; + test_free(buf2); + + if (i == CONFIG_KFENCE_NUM_OBJECTS) { + kunit_warn(test, "giving up ... cannot get same object back\n"); + return; + } + } + + for (i = 0; i < size; i++) + KUNIT_EXPECT_EQ(test, buf2[i], (char)0); + + test_free(buf2); + + KUNIT_EXPECT_FALSE(test, report_available()); +} + +static void test_invalid_access(struct kunit *test) +{ + const struct expect_report expect = { + .type = KFENCE_ERROR_INVALID, + .fn = test_invalid_access, + .addr = &__kfence_pool[10], + .is_write = false, + }; + + READ_ONCE(__kfence_pool[10]); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +/* Test SLAB_TYPESAFE_BY_RCU works. */ +static void test_memcache_typesafe_by_rcu(struct kunit *test) +{ + const size_t size = 32; + struct expect_report expect = { + .type = KFENCE_ERROR_UAF, + .fn = test_memcache_typesafe_by_rcu, + .is_write = false, + }; + + setup_test_cache(test, size, SLAB_TYPESAFE_BY_RCU, NULL); + KUNIT_EXPECT_TRUE(test, test_cache); /* Want memcache. */ + + expect.addr = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY); + *expect.addr = 42; + + rcu_read_lock(); + test_free(expect.addr); + KUNIT_EXPECT_EQ(test, *expect.addr, (char)42); + /* + * Up to this point, memory should not have been freed yet, and + * therefore there should be no KFENCE report from the above access. + */ + rcu_read_unlock(); + + /* Above access to @expect.addr should not have generated a report! */ + KUNIT_EXPECT_FALSE(test, report_available()); + + /* Only after rcu_barrier() is the memory guaranteed to be freed. */ + rcu_barrier(); + + /* Expect use-after-free. */ + KUNIT_EXPECT_EQ(test, *expect.addr, (char)42); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +/* Test krealloc(). */ +static void test_krealloc(struct kunit *test) +{ + const size_t size = 32; + const struct expect_report expect = { + .type = KFENCE_ERROR_UAF, + .fn = test_krealloc, + .addr = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY), + .is_write = false, + }; + char *buf = expect.addr; + int i; + + KUNIT_EXPECT_FALSE(test, test_cache); + KUNIT_EXPECT_EQ(test, ksize(buf), size); /* Precise size match after KFENCE alloc. */ + for (i = 0; i < size; i++) + buf[i] = i + 1; + + /* Check that we successfully change the size. */ + buf = krealloc(buf, size * 3, GFP_KERNEL); /* Grow. */ + /* Note: Might no longer be a KFENCE alloc. */ + KUNIT_EXPECT_GE(test, ksize(buf), size * 3); + for (i = 0; i < size; i++) + KUNIT_EXPECT_EQ(test, buf[i], (char)(i + 1)); + for (; i < size * 3; i++) /* Fill to extra bytes. */ + buf[i] = i + 1; + + buf = krealloc(buf, size * 2, GFP_KERNEL); /* Shrink. */ + KUNIT_EXPECT_GE(test, ksize(buf), size * 2); + for (i = 0; i < size * 2; i++) + KUNIT_EXPECT_EQ(test, buf[i], (char)(i + 1)); + + buf = krealloc(buf, 0, GFP_KERNEL); /* Free. */ + KUNIT_EXPECT_EQ(test, (unsigned long)buf, (unsigned long)ZERO_SIZE_PTR); + KUNIT_ASSERT_FALSE(test, report_available()); /* No reports yet! */ + + READ_ONCE(*expect.addr); /* Ensure krealloc() actually freed earlier KFENCE object. */ + KUNIT_ASSERT_TRUE(test, report_matches(&expect)); +} + +/* Test that some objects from a bulk allocation belong to KFENCE pool. */ +static void test_memcache_alloc_bulk(struct kunit *test) +{ + const size_t size = 32; + bool pass = false; + unsigned long timeout; + + setup_test_cache(test, size, 0, NULL); + KUNIT_EXPECT_TRUE(test, test_cache); /* Want memcache. */ + /* + * 100x the sample interval should be more than enough to ensure we get + * a KFENCE allocation eventually. + */ + timeout = jiffies + msecs_to_jiffies(100 * CONFIG_KFENCE_SAMPLE_INTERVAL); + do { + void *objects[100]; + int i, num = kmem_cache_alloc_bulk(test_cache, GFP_ATOMIC, ARRAY_SIZE(objects), + objects); + if (!num) + continue; + for (i = 0; i < ARRAY_SIZE(objects); i++) { + if (is_kfence_address(objects[i])) { + pass = true; + break; + } + } + kmem_cache_free_bulk(test_cache, num, objects); + /* + * kmem_cache_alloc_bulk() disables interrupts, and calling it + * in a tight loop may not give KFENCE a chance to switch the + * static branch. Call cond_resched() to let KFENCE chime in. + */ + cond_resched(); + } while (!pass && time_before(jiffies, timeout)); + + KUNIT_EXPECT_TRUE(test, pass); + KUNIT_EXPECT_FALSE(test, report_available()); +} + +/* + * KUnit does not provide a way to provide arguments to tests, and we encode + * additional info in the name. Set up 2 tests per test case, one using the + * default allocator, and another using a custom memcache (suffix '-memcache'). + */ +#define KFENCE_KUNIT_CASE(test_name) \ + { .run_case = test_name, .name = #test_name }, \ + { .run_case = test_name, .name = #test_name "-memcache" } + +static struct kunit_case kfence_test_cases[] = { + KFENCE_KUNIT_CASE(test_out_of_bounds_read), + KFENCE_KUNIT_CASE(test_out_of_bounds_write), + KFENCE_KUNIT_CASE(test_use_after_free_read), + KFENCE_KUNIT_CASE(test_double_free), + KFENCE_KUNIT_CASE(test_invalid_addr_free), + KFENCE_KUNIT_CASE(test_corruption), + KFENCE_KUNIT_CASE(test_free_bulk), + KFENCE_KUNIT_CASE(test_init_on_free), + KUNIT_CASE(test_kmalloc_aligned_oob_read), + KUNIT_CASE(test_kmalloc_aligned_oob_write), + KUNIT_CASE(test_shrink_memcache), + KUNIT_CASE(test_memcache_ctor), + KUNIT_CASE(test_invalid_access), + KUNIT_CASE(test_gfpzero), + KUNIT_CASE(test_memcache_typesafe_by_rcu), + KUNIT_CASE(test_krealloc), + KUNIT_CASE(test_memcache_alloc_bulk), + {}, +}; + +/* ===== End test cases ===== */ + +static int test_init(struct kunit *test) +{ + unsigned long flags; + int i; + + spin_lock_irqsave(&observed.lock, flags); + for (i = 0; i < ARRAY_SIZE(observed.lines); i++) + observed.lines[i][0] = '\0'; + observed.nlines = 0; + spin_unlock_irqrestore(&observed.lock, flags); + + /* Any test with 'memcache' in its name will want a memcache. */ + if (strstr(test->name, "memcache")) + test->priv = TEST_PRIV_WANT_MEMCACHE; + else + test->priv = NULL; + + return 0; +} + +static void test_exit(struct kunit *test) +{ + test_cache_destroy(); +} + +static struct kunit_suite kfence_test_suite = { + .name = "kfence", + .test_cases = kfence_test_cases, + .init = test_init, + .exit = test_exit, +}; +static struct kunit_suite *kfence_test_suites[] = { &kfence_test_suite, NULL }; + +static void register_tracepoints(struct tracepoint *tp, void *ignore) +{ + check_trace_callback_type_console(probe_console); + if (!strcmp(tp->name, "console")) + WARN_ON(tracepoint_probe_register(tp, probe_console, NULL)); +} + +static void unregister_tracepoints(struct tracepoint *tp, void *ignore) +{ + if (!strcmp(tp->name, "console")) + tracepoint_probe_unregister(tp, probe_console, NULL); +} + +/* + * We only want to do tracepoints setup and teardown once, therefore we have to + * customize the init and exit functions and cannot rely on kunit_test_suite(). + */ +static int __init kfence_test_init(void) +{ + /* + * Because we want to be able to build the test as a module, we need to + * iterate through all known tracepoints, since the static registration + * won't work here. + */ + for_each_kernel_tracepoint(register_tracepoints, NULL); + return __kunit_test_suites_init(kfence_test_suites); +} + +static void kfence_test_exit(void) +{ + __kunit_test_suites_exit(kfence_test_suites); + for_each_kernel_tracepoint(unregister_tracepoints, NULL); + tracepoint_synchronize_unregister(); +} + +late_initcall(kfence_test_init); +module_exit(kfence_test_exit); + +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Alexander Potapenko <glider@google.com>, Marco Elver <elver@google.com>"); diff --git a/mm/kfence/report.c b/mm/kfence/report.c new file mode 100644 index 000000000000..ab83d5a59bb1 --- /dev/null +++ b/mm/kfence/report.c @@ -0,0 +1,262 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KFENCE reporting. + * + * Copyright (C) 2020, Google LLC. + */ + +#include <stdarg.h> + +#include <linux/kernel.h> +#include <linux/lockdep.h> +#include <linux/printk.h> +#include <linux/sched/debug.h> +#include <linux/seq_file.h> +#include <linux/stacktrace.h> +#include <linux/string.h> +#include <trace/events/error_report.h> + +#include <asm/kfence.h> + +#include "kfence.h" + +extern bool no_hash_pointers; + +/* Helper function to either print to a seq_file or to console. */ +__printf(2, 3) +static void seq_con_printf(struct seq_file *seq, const char *fmt, ...) +{ + va_list args; + + va_start(args, fmt); + if (seq) + seq_vprintf(seq, fmt, args); + else + vprintk(fmt, args); + va_end(args); +} + +/* + * Get the number of stack entries to skip to get out of MM internals. @type is + * optional, and if set to NULL, assumes an allocation or free stack. + */ +static int get_stack_skipnr(const unsigned long stack_entries[], int num_entries, + const enum kfence_error_type *type) +{ + char buf[64]; + int skipnr, fallback = 0; + + if (type) { + /* Depending on error type, find different stack entries. */ + switch (*type) { + case KFENCE_ERROR_UAF: + case KFENCE_ERROR_OOB: + case KFENCE_ERROR_INVALID: + /* + * kfence_handle_page_fault() may be called with pt_regs + * set to NULL; in that case we'll simply show the full + * stack trace. + */ + return 0; + case KFENCE_ERROR_CORRUPTION: + case KFENCE_ERROR_INVALID_FREE: + break; + } + } + + for (skipnr = 0; skipnr < num_entries; skipnr++) { + int len = scnprintf(buf, sizeof(buf), "%ps", (void *)stack_entries[skipnr]); + + if (str_has_prefix(buf, "kfence_") || str_has_prefix(buf, "__kfence_") || + !strncmp(buf, "__slab_free", len)) { + /* + * In case of tail calls from any of the below + * to any of the above. + */ + fallback = skipnr + 1; + } + + /* Also the *_bulk() variants by only checking prefixes. */ + if (str_has_prefix(buf, "kfree") || + str_has_prefix(buf, "kmem_cache_free") || + str_has_prefix(buf, "__kmalloc") || + str_has_prefix(buf, "kmem_cache_alloc")) + goto found; + } + if (fallback < num_entries) + return fallback; +found: + skipnr++; + return skipnr < num_entries ? skipnr : 0; +} + +static void kfence_print_stack(struct seq_file *seq, const struct kfence_metadata *meta, + bool show_alloc) +{ + const struct kfence_track *track = show_alloc ? &meta->alloc_track : &meta->free_track; + + if (track->num_stack_entries) { + /* Skip allocation/free internals stack. */ + int i = get_stack_skipnr(track->stack_entries, track->num_stack_entries, NULL); + + /* stack_trace_seq_print() does not exist; open code our own. */ + for (; i < track->num_stack_entries; i++) + seq_con_printf(seq, " %pS\n", (void *)track->stack_entries[i]); + } else { + seq_con_printf(seq, " no %s stack\n", show_alloc ? "allocation" : "deallocation"); + } +} + +void kfence_print_object(struct seq_file *seq, const struct kfence_metadata *meta) +{ + const int size = abs(meta->size); + const unsigned long start = meta->addr; + const struct kmem_cache *const cache = meta->cache; + + lockdep_assert_held(&meta->lock); + + if (meta->state == KFENCE_OBJECT_UNUSED) { + seq_con_printf(seq, "kfence-#%zd unused\n", meta - kfence_metadata); + return; + } + + seq_con_printf(seq, + "kfence-#%zd [0x%p-0x%p" + ", size=%d, cache=%s] allocated by task %d:\n", + meta - kfence_metadata, (void *)start, (void *)(start + size - 1), size, + (cache && cache->name) ? cache->name : "<destroyed>", meta->alloc_track.pid); + kfence_print_stack(seq, meta, true); + + if (meta->state == KFENCE_OBJECT_FREED) { + seq_con_printf(seq, "\nfreed by task %d:\n", meta->free_track.pid); + kfence_print_stack(seq, meta, false); + } +} + +/* + * Show bytes at @addr that are different from the expected canary values, up to + * @max_bytes. + */ +static void print_diff_canary(unsigned long address, size_t bytes_to_show, + const struct kfence_metadata *meta) +{ + const unsigned long show_until_addr = address + bytes_to_show; + const u8 *cur, *end; + + /* Do not show contents of object nor read into following guard page. */ + end = (const u8 *)(address < meta->addr ? min(show_until_addr, meta->addr) + : min(show_until_addr, PAGE_ALIGN(address))); + + pr_cont("["); + for (cur = (const u8 *)address; cur < end; cur++) { + if (*cur == KFENCE_CANARY_PATTERN(cur)) + pr_cont(" ."); + else if (no_hash_pointers) + pr_cont(" 0x%02x", *cur); + else /* Do not leak kernel memory in non-debug builds. */ + pr_cont(" !"); + } + pr_cont(" ]"); +} + +static const char *get_access_type(bool is_write) +{ + return is_write ? "write" : "read"; +} + +void kfence_report_error(unsigned long address, bool is_write, struct pt_regs *regs, + const struct kfence_metadata *meta, enum kfence_error_type type) +{ + unsigned long stack_entries[KFENCE_STACK_DEPTH] = { 0 }; + const ptrdiff_t object_index = meta ? meta - kfence_metadata : -1; + int num_stack_entries; + int skipnr = 0; + + if (regs) { + num_stack_entries = stack_trace_save_regs(regs, stack_entries, KFENCE_STACK_DEPTH, 0); + } else { + num_stack_entries = stack_trace_save(stack_entries, KFENCE_STACK_DEPTH, 1); + skipnr = get_stack_skipnr(stack_entries, num_stack_entries, &type); + } + + /* Require non-NULL meta, except if KFENCE_ERROR_INVALID. */ + if (WARN_ON(type != KFENCE_ERROR_INVALID && !meta)) + return; + + if (meta) + lockdep_assert_held(&meta->lock); + /* + * Because we may generate reports in printk-unfriendly parts of the + * kernel, such as scheduler code, the use of printk() could deadlock. + * Until such time that all printing code here is safe in all parts of + * the kernel, accept the risk, and just get our message out (given the + * system might already behave unpredictably due to the memory error). + * As such, also disable lockdep to hide warnings, and avoid disabling + * lockdep for the rest of the kernel. + */ + lockdep_off(); + + pr_err("==================================================================\n"); + /* Print report header. */ + switch (type) { + case KFENCE_ERROR_OOB: { + const bool left_of_object = address < meta->addr; + + pr_err("BUG: KFENCE: out-of-bounds %s in %pS\n\n", get_access_type(is_write), + (void *)stack_entries[skipnr]); + pr_err("Out-of-bounds %s at 0x%p (%luB %s of kfence-#%zd):\n", + get_access_type(is_write), (void *)address, + left_of_object ? meta->addr - address : address - meta->addr, + left_of_object ? "left" : "right", object_index); + break; + } + case KFENCE_ERROR_UAF: + pr_err("BUG: KFENCE: use-after-free %s in %pS\n\n", get_access_type(is_write), + (void *)stack_entries[skipnr]); + pr_err("Use-after-free %s at 0x%p (in kfence-#%zd):\n", + get_access_type(is_write), (void *)address, object_index); + break; + case KFENCE_ERROR_CORRUPTION: + pr_err("BUG: KFENCE: memory corruption in %pS\n\n", (void *)stack_entries[skipnr]); + pr_err("Corrupted memory at 0x%p ", (void *)address); + print_diff_canary(address, 16, meta); + pr_cont(" (in kfence-#%zd):\n", object_index); + break; + case KFENCE_ERROR_INVALID: + pr_err("BUG: KFENCE: invalid %s in %pS\n\n", get_access_type(is_write), + (void *)stack_entries[skipnr]); + pr_err("Invalid %s at 0x%p:\n", get_access_type(is_write), + (void *)address); + break; + case KFENCE_ERROR_INVALID_FREE: + pr_err("BUG: KFENCE: invalid free in %pS\n\n", (void *)stack_entries[skipnr]); + pr_err("Invalid free of 0x%p (in kfence-#%zd):\n", (void *)address, + object_index); + break; + } + + /* Print stack trace and object info. */ + stack_trace_print(stack_entries + skipnr, num_stack_entries - skipnr, 0); + + if (meta) { + pr_err("\n"); + kfence_print_object(NULL, meta); + } + + /* Print report footer. */ + pr_err("\n"); + if (no_hash_pointers && regs) + show_regs(regs); + else + dump_stack_print_info(KERN_ERR); + trace_error_report_end(ERROR_DETECTOR_KFENCE, address); + pr_err("==================================================================\n"); + + lockdep_on(); + + if (panic_on_warn) + panic("panic_on_warn set ...\n"); + + /* We encountered a memory unsafety error, taint the kernel! */ + add_taint(TAINT_BAD_PAGE, LOCKDEP_STILL_OK); +} diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 75e246f680f4..a7d6cb912b05 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -442,18 +442,28 @@ static inline int khugepaged_test_exit(struct mm_struct *mm) static bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags) { - if ((!(vm_flags & VM_HUGEPAGE) && !khugepaged_always()) || - (vm_flags & VM_NOHUGEPAGE) || + /* Explicitly disabled through madvise. */ + if ((vm_flags & VM_NOHUGEPAGE) || test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) return false; - if (shmem_file(vma->vm_file) || - (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && - vma->vm_file && - (vm_flags & VM_DENYWRITE))) { + /* Enabled via shmem mount options or sysfs settings. */ + if (shmem_file(vma->vm_file) && shmem_huge_enabled(vma)) { return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff, HPAGE_PMD_NR); } + + /* THP settings require madvise. */ + if (!(vm_flags & VM_HUGEPAGE) && !khugepaged_always()) + return false; + + /* Read-only file mappings need to be aligned for THP to work. */ + if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && vma->vm_file && + (vm_flags & VM_DENYWRITE)) { + return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff, + HPAGE_PMD_NR); + } + if (!vma->anon_vma || vma->vm_ops) return false; if (vma_is_temporary_stack(vma)) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 55c671904aac..24210c9bd843 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1312,6 +1312,12 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags, */ put_page(page); + /* device metadata space is not recoverable */ + if (!pgmap_pfn_valid(pgmap, pfn)) { + rc = -ENXIO; + goto out; + } + /* * Prevent the inode from being freed while we are interrogating * the address_space, typically this would be handled by diff --git a/mm/memory.c b/mm/memory.c index 784249f3307b..c8e357627318 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2902,7 +2902,6 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) } flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte)); entry = mk_pte(new_page, vma->vm_page_prot); - entry = pte_sw_mkyoung(entry); entry = maybe_mkwrite(pte_mkdirty(entry), vma); /* @@ -3560,7 +3559,6 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) __SetPageUptodate(page); entry = mk_pte(page, vma->vm_page_prot); - entry = pte_sw_mkyoung(entry); if (vma->vm_flags & VM_WRITE) entry = pte_mkwrite(pte_mkdirty(entry)); @@ -3745,8 +3743,6 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr) if (prefault && arch_wants_old_prefaulted_pte()) entry = pte_mkold(entry); - else - entry = pte_sw_mkyoung(entry); if (write) entry = maybe_mkwrite(pte_mkdirty(entry), vma); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index abe43c1ae920..5ba51a8bdaeb 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -67,17 +67,17 @@ void put_online_mems(void) bool movable_node_enabled = false; #ifndef CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE -int memhp_default_online_type = MMOP_OFFLINE; +int mhp_default_online_type = MMOP_OFFLINE; #else -int memhp_default_online_type = MMOP_ONLINE; +int mhp_default_online_type = MMOP_ONLINE; #endif static int __init setup_memhp_default_state(char *str) { - const int online_type = memhp_online_type_from_str(str); + const int online_type = mhp_online_type_from_str(str); if (online_type >= 0) - memhp_default_online_type = online_type; + mhp_default_online_type = online_type; return 1; } @@ -107,6 +107,9 @@ static struct resource *register_memory_resource(u64 start, u64 size, if (strcmp(resource_name, "System RAM")) flags |= IORESOURCE_SYSRAM_DRIVER_MANAGED; + if (!mhp_range_allowed(start, size, true)) + return ERR_PTR(-E2BIG); + /* * Make sure value parsed from 'mem=' only restricts memory adding * while booting, so that memory hotplug won't be impacted. Please @@ -284,21 +287,53 @@ static int check_pfn_span(unsigned long pfn, unsigned long nr_pages, return 0; } -static int check_hotplug_memory_addressable(unsigned long pfn, - unsigned long nr_pages) +/* + * Return page for the valid pfn only if the page is online. All pfn + * walkers which rely on the fully initialized page->flags and others + * should use this rather than pfn_valid && pfn_to_page + */ +struct page *pfn_to_online_page(unsigned long pfn) { - const u64 max_addr = PFN_PHYS(pfn + nr_pages) - 1; + unsigned long nr = pfn_to_section_nr(pfn); + struct dev_pagemap *pgmap; + struct mem_section *ms; - if (max_addr >> MAX_PHYSMEM_BITS) { - const u64 max_allowed = (1ull << (MAX_PHYSMEM_BITS + 1)) - 1; - WARN(1, - "Hotplugged memory exceeds maximum addressable address, range=%#llx-%#llx, maximum=%#llx\n", - (u64)PFN_PHYS(pfn), max_addr, max_allowed); - return -E2BIG; - } + if (nr >= NR_MEM_SECTIONS) + return NULL; - return 0; + ms = __nr_to_section(nr); + if (!online_section(ms)) + return NULL; + + /* + * Save some code text when online_section() + + * pfn_section_valid() are sufficient. + */ + if (IS_ENABLED(CONFIG_HAVE_ARCH_PFN_VALID) && !pfn_valid(pfn)) + return NULL; + + if (!pfn_section_valid(ms, pfn)) + return NULL; + + if (!online_device_section(ms)) + return pfn_to_page(pfn); + + /* + * Slowpath: when ZONE_DEVICE collides with + * ZONE_{NORMAL,MOVABLE} within the same section some pfns in + * the section may be 'offline' but 'valid'. Only + * get_dev_pagemap() can determine sub-section online status. + */ + pgmap = get_dev_pagemap(pfn, NULL); + put_dev_pagemap(pgmap); + + /* The presence of a pgmap indicates ZONE_DEVICE offline pfn */ + if (pgmap) + return NULL; + + return pfn_to_page(pfn); } +EXPORT_SYMBOL_GPL(pfn_to_online_page); /* * Reasonably generic function for adding memory. It is @@ -317,9 +352,7 @@ int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages, if (WARN_ON_ONCE(!params->pgprot.pgprot)) return -EINVAL; - err = check_hotplug_memory_addressable(pfn, nr_pages); - if (err) - return err; + VM_BUG_ON(!mhp_range_allowed(PFN_PHYS(pfn), nr_pages * PAGE_SIZE, false)); if (altmap) { /* @@ -445,20 +478,19 @@ static void update_pgdat_span(struct pglist_data *pgdat) for (zone = pgdat->node_zones; zone < pgdat->node_zones + MAX_NR_ZONES; zone++) { - unsigned long zone_end_pfn = zone->zone_start_pfn + - zone->spanned_pages; + unsigned long end_pfn = zone_end_pfn(zone); /* No need to lock the zones, they can't change. */ if (!zone->spanned_pages) continue; if (!node_end_pfn) { node_start_pfn = zone->zone_start_pfn; - node_end_pfn = zone_end_pfn; + node_end_pfn = end_pfn; continue; } - if (zone_end_pfn > node_end_pfn) - node_end_pfn = zone_end_pfn; + if (end_pfn > node_end_pfn) + node_end_pfn = end_pfn; if (zone->zone_start_pfn < node_start_pfn) node_start_pfn = zone->zone_start_pfn; } @@ -678,6 +710,14 @@ static void __meminit resize_pgdat_range(struct pglist_data *pgdat, unsigned lon pgdat->node_spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - pgdat->node_start_pfn; } + +static void section_taint_zone_device(unsigned long pfn) +{ + struct mem_section *ms = __pfn_to_section(pfn); + + ms->section_mem_map |= SECTION_TAINT_ZONE_DEVICE; +} + /* * Associate the pfn range with the given zone, initializing the memmaps * and resizing the pgdat/zone data to span the added pages. After this @@ -708,6 +748,19 @@ void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, pgdat_resize_unlock(pgdat, &flags); /* + * Subsection population requires care in pfn_to_online_page(). + * Set the taint to enable the slow path detection of + * ZONE_DEVICE pages in an otherwise ZONE_{NORMAL,MOVABLE} + * section. + */ + if (zone_is_zone_device(zone)) { + if (!IS_ALIGNED(start_pfn, PAGES_PER_SECTION)) + section_taint_zone_device(start_pfn); + if (!IS_ALIGNED(start_pfn + nr_pages, PAGES_PER_SECTION)) + section_taint_zone_device(start_pfn + nr_pages); + } + + /* * TODO now we have a visible range of pages which are not associated * with their zone properly. Not nice but set_pfnblock_flags_mask * expects the zone spans the pfn range. All the pages in the range @@ -1007,7 +1060,7 @@ static int check_hotplug_memory_range(u64 start, u64 size) static int online_memory_block(struct memory_block *mem, void *arg) { - mem->online_type = memhp_default_online_type; + mem->online_type = mhp_default_online_type; return device_online(&mem->dev); } @@ -1084,11 +1137,11 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) * In case we're allowed to merge the resource, flag it and trigger * merging now that adding succeeded. */ - if (mhp_flags & MEMHP_MERGE_RESOURCE) + if (mhp_flags & MHP_MERGE_RESOURCE) merge_system_ram_resource(res); /* online pages if requested */ - if (memhp_default_online_type != MMOP_OFFLINE) + if (mhp_default_online_type != MMOP_OFFLINE) walk_memory_blocks(start, size, NULL, online_memory_block); return ret; @@ -1180,6 +1233,61 @@ out_unlock: } EXPORT_SYMBOL_GPL(add_memory_driver_managed); +/* + * Platforms should define arch_get_mappable_range() that provides + * maximum possible addressable physical memory range for which the + * linear mapping could be created. The platform returned address + * range must adhere to these following semantics. + * + * - range.start <= range.end + * - Range includes both end points [range.start..range.end] + * + * There is also a fallback definition provided here, allowing the + * entire possible physical address range in case any platform does + * not define arch_get_mappable_range(). + */ +struct range __weak arch_get_mappable_range(void) +{ + struct range mhp_range = { + .start = 0UL, + .end = -1ULL, + }; + return mhp_range; +} + +struct range mhp_get_pluggable_range(bool need_mapping) +{ + const u64 max_phys = (1ULL << MAX_PHYSMEM_BITS) - 1; + struct range mhp_range; + + if (need_mapping) { + mhp_range = arch_get_mappable_range(); + if (mhp_range.start > max_phys) { + mhp_range.start = 0; + mhp_range.end = 0; + } + mhp_range.end = min_t(u64, mhp_range.end, max_phys); + } else { + mhp_range.start = 0; + mhp_range.end = max_phys; + } + return mhp_range; +} +EXPORT_SYMBOL_GPL(mhp_get_pluggable_range); + +bool mhp_range_allowed(u64 start, u64 size, bool need_mapping) +{ + struct range mhp_range = mhp_get_pluggable_range(need_mapping); + u64 end = start + size; + + if (start < end && start >= mhp_range.start && (end - 1) <= mhp_range.end) + return true; + + pr_warn("Hotplug memory [%#llx-%#llx] exceeds maximum addressable range [%#llx-%#llx]\n", + start, end, mhp_range.start, mhp_range.end); + return false; +} + #ifdef CONFIG_MEMORY_HOTREMOVE /* * Confirm all pages in a range [start, end) belong to the same zone (skipping diff --git a/mm/memremap.c b/mm/memremap.c index 16b2fb482da1..7aa7d6e80ee5 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -80,6 +80,21 @@ static unsigned long pfn_first(struct dev_pagemap *pgmap, int range_id) return pfn + vmem_altmap_offset(pgmap_altmap(pgmap)); } +bool pgmap_pfn_valid(struct dev_pagemap *pgmap, unsigned long pfn) +{ + int i; + + for (i = 0; i < pgmap->nr_range; i++) { + struct range *range = &pgmap->ranges[i]; + + if (pfn >= PHYS_PFN(range->start) && + pfn <= PHYS_PFN(range->end)) + return pfn >= pfn_first(pgmap, i); + } + + return false; +} + static unsigned long pfn_end(struct dev_pagemap *pgmap, int range_id) { const struct range *range = &pgmap->ranges[range_id]; @@ -185,6 +200,7 @@ static void dev_pagemap_percpu_release(struct percpu_ref *ref) static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params, int range_id, int nid) { + const bool is_private = pgmap->type == MEMORY_DEVICE_PRIVATE; struct range *range = &pgmap->ranges[range_id]; struct dev_pagemap *conflict_pgmap; int error, is_ram; @@ -230,6 +246,11 @@ static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params, if (error) goto err_pfn_remap; + if (!mhp_range_allowed(range->start, range_len(range), !is_private)) { + error = -EINVAL; + goto err_pfn_remap; + } + mem_hotplug_begin(); /* @@ -243,7 +264,7 @@ static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params, * the CPU, we do want the linear mapping and thus use * arch_add_memory(). */ - if (pgmap->type == MEMORY_DEVICE_PRIVATE) { + if (is_private) { error = add_pages(nid, PHYS_PFN(range->start), PHYS_PFN(range_len(range)), params); } else { diff --git a/mm/mlock.c b/mm/mlock.c index 73960bb3464d..f8f8cc32d03d 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -622,7 +622,7 @@ static unsigned long count_mm_mlocked_page_nr(struct mm_struct *mm, vma = find_vma(mm, start); if (vma == NULL) - vma = mm->mmap; + return 0; for (; vma ; vma = vma->vm_next) { if (start >= vma->vm_end) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ddccc59f2f72..3e4b29ee2b1e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2168,6 +2168,7 @@ void __init init_cma_reserved_pageblock(struct page *page) } adjust_managed_page_count(page, pageblock_nr_pages); + page_zone(page)->cma_pages += pageblock_nr_pages; } #endif diff --git a/mm/rmap.c b/mm/rmap.c index e26ae119a131..b0fc27e77d6d 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -168,7 +168,7 @@ static void anon_vma_chain_link(struct vm_area_struct *vma, * * Anon-vma allocations are very subtle, because we may have * optimistically looked up an anon_vma in page_lock_anon_vma_read() - * and that may actually touch the spinlock even in the newly + * and that may actually touch the rwsem even in the newly * allocated vma (it depends on RCU to make sure that the * anon_vma isn't actually destroyed). * @@ -359,7 +359,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) goto out_error_free_anon_vma; /* - * The root anon_vma's spinlock is the lock actually used when we + * The root anon_vma's rwsem is the lock actually used when we * lock any of the anon_vmas in this anon_vma tree. */ anon_vma->root = pvma->anon_vma->root; @@ -462,8 +462,8 @@ void __init anon_vma_init(void) * Getting a lock on a stable anon_vma from a page off the LRU is tricky! * * Since there is no serialization what so ever against page_remove_rmap() - * the best this function can do is return a locked anon_vma that might - * have been relevant to this page. + * the best this function can do is return a refcount increased anon_vma + * that might have been relevant to this page. * * The page might have been remapped to a different anon_vma or the anon_vma * returned may already be freed (and even reused). @@ -1086,8 +1086,7 @@ static void __page_check_anon_rmap(struct page *page, * be set up correctly at this point. * * We have exclusion against page_add_anon_rmap because the caller - * always holds the page locked, except if called from page_dup_rmap, - * in which case the page is already known to be setup. + * always holds the page locked. * * We have exclusion against page_add_new_anon_rmap because those pages * are initially only visible via the pagetables, and the pte is locked @@ -1737,9 +1736,9 @@ static bool invalid_migration_vma(struct vm_area_struct *vma, void *arg) return vma_is_temporary_stack(vma); } -static int page_mapcount_is_zero(struct page *page) +static int page_not_mapped(struct page *page) { - return !total_mapcount(page); + return !page_mapped(page); } /** @@ -1757,7 +1756,7 @@ bool try_to_unmap(struct page *page, enum ttu_flags flags) struct rmap_walk_control rwc = { .rmap_one = try_to_unmap_one, .arg = (void *)flags, - .done = page_mapcount_is_zero, + .done = page_not_mapped, .anon_lock = page_lock_anon_vma_read, }; @@ -1781,11 +1780,6 @@ bool try_to_unmap(struct page *page, enum ttu_flags flags) return !page_mapcount(page) ? true : false; } -static int page_not_mapped(struct page *page) -{ - return !page_mapped(page); -}; - /** * try_to_munlock - try to munlock a page * @page: the page to be munlocked diff --git a/mm/shmem.c b/mm/shmem.c index ff741d229701..b2db4ed0fbc7 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -842,7 +842,6 @@ unsigned long shmem_swap_usage(struct vm_area_struct *vma) void shmem_unlock_mapping(struct address_space *mapping) { struct pagevec pvec; - pgoff_t indices[PAGEVEC_SIZE]; pgoff_t index = 0; pagevec_init(&pvec); @@ -850,16 +849,8 @@ void shmem_unlock_mapping(struct address_space *mapping) * Minor point, but we might as well stop if someone else SHM_LOCKs it. */ while (!mapping_unevictable(mapping)) { - /* - * Avoid pagevec_lookup(): find_get_pages() returns 0 as if it - * has finished, if it hits a row of PAGEVEC_SIZE swap entries. - */ - pvec.nr = find_get_entries(mapping, index, - PAGEVEC_SIZE, pvec.pages, indices); - if (!pvec.nr) + if (!pagevec_lookup(&pvec, mapping, &index)) break; - index = indices[pvec.nr - 1] + 1; - pagevec_remove_exceptionals(&pvec); check_move_unevictable_pages(&pvec); pagevec_release(&pvec); cond_resched(); @@ -916,18 +907,12 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, pagevec_init(&pvec); index = start; - while (index < end) { - pvec.nr = find_get_entries(mapping, index, - min(end - index, (pgoff_t)PAGEVEC_SIZE), - pvec.pages, indices); - if (!pvec.nr) - break; + while (index < end && find_lock_entries(mapping, index, end - 1, + &pvec, indices)) { for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; index = indices[i]; - if (index >= end) - break; if (xa_is_value(page)) { if (unfalloc) @@ -936,18 +921,10 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, index, page); continue; } + index += thp_nr_pages(page) - 1; - VM_BUG_ON_PAGE(page_to_pgoff(page) != index, page); - - if (!trylock_page(page)) - continue; - - if ((!unfalloc || !PageUptodate(page)) && - page_mapping(page) == mapping) { - VM_BUG_ON_PAGE(PageWriteback(page), page); - if (shmem_punch_compound(page, start, end)) - truncate_inode_page(mapping, page); - } + if (!unfalloc || !PageUptodate(page)) + truncate_inode_page(mapping, page); unlock_page(page); } pagevec_remove_exceptionals(&pvec); @@ -988,10 +965,8 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, while (index < end) { cond_resched(); - pvec.nr = find_get_entries(mapping, index, - min(end - index, (pgoff_t)PAGEVEC_SIZE), - pvec.pages, indices); - if (!pvec.nr) { + if (!find_get_entries(mapping, index, end - 1, &pvec, + indices)) { /* If all gone or hole-punch or unfalloc, we're done */ if (index == start || end != -1) break; @@ -1003,9 +978,6 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, struct page *page = pvec.pages[i]; index = indices[i]; - if (index >= end) - break; - if (xa_is_value(page)) { if (unfalloc) continue; @@ -1533,6 +1505,30 @@ static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp, return page; } +/* + * Make sure huge_gfp is always more limited than limit_gfp. + * Some of the flags set permissions, while others set limitations. + */ +static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp) +{ + gfp_t allowflags = __GFP_IO | __GFP_FS | __GFP_RECLAIM; + gfp_t denyflags = __GFP_NOWARN | __GFP_NORETRY; + gfp_t zoneflags = limit_gfp & GFP_ZONEMASK; + gfp_t result = huge_gfp & ~(allowflags | GFP_ZONEMASK); + + /* Allow allocations only from the originally specified zones. */ + result |= zoneflags; + + /* + * Minimize the result gfp by taking the union with the deny flags, + * and the intersection of the allow flags. + */ + result |= (limit_gfp & denyflags); + result |= (huge_gfp & limit_gfp) & allowflags; + + return result; +} + static struct page *shmem_alloc_hugepage(gfp_t gfp, struct shmem_inode_info *info, pgoff_t index) { @@ -1547,8 +1543,8 @@ static struct page *shmem_alloc_hugepage(gfp_t gfp, return NULL; shmem_pseudo_vma_init(&pvma, info, hindex); - page = alloc_pages_vma(gfp | __GFP_COMP | __GFP_NORETRY | __GFP_NOWARN, - HPAGE_PMD_ORDER, &pvma, 0, numa_node_id(), true); + page = alloc_pages_vma(gfp, HPAGE_PMD_ORDER, &pvma, 0, numa_node_id(), + true); shmem_pseudo_vma_destroy(&pvma); if (page) prep_transhuge_page(page); @@ -1804,6 +1800,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, struct page *page; enum sgp_type sgp_huge = sgp; pgoff_t hindex = index; + gfp_t huge_gfp; int error; int once = 0; int alloced = 0; @@ -1821,7 +1818,8 @@ repeat: sbinfo = SHMEM_SB(inode->i_sb); charge_mm = vma ? vma->vm_mm : current->mm; - page = find_lock_entry(mapping, index); + page = pagecache_get_page(mapping, index, + FGP_ENTRY | FGP_HEAD | FGP_LOCK, 0); if (xa_is_value(page)) { error = shmem_swapin_page(inode, index, &page, sgp, gfp, vma, fault_type); @@ -1889,7 +1887,9 @@ repeat: } alloc_huge: - page = shmem_alloc_and_acct_page(gfp, inode, index, true); + huge_gfp = vma_thp_gfp_mask(vma); + huge_gfp = limit_gfp_mask(huge_gfp, gfp); + page = shmem_alloc_and_acct_page(huge_gfp, inode, index, true); if (IS_ERR(page)) { alloc_nohuge: page = shmem_alloc_and_acct_page(gfp, inode, @@ -2676,86 +2676,20 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) return retval ? retval : error; } -/* - * llseek SEEK_DATA or SEEK_HOLE through the page cache. - */ -static pgoff_t shmem_seek_hole_data(struct address_space *mapping, - pgoff_t index, pgoff_t end, int whence) -{ - struct page *page; - struct pagevec pvec; - pgoff_t indices[PAGEVEC_SIZE]; - bool done = false; - int i; - - pagevec_init(&pvec); - pvec.nr = 1; /* start small: we may be there already */ - while (!done) { - pvec.nr = find_get_entries(mapping, index, - pvec.nr, pvec.pages, indices); - if (!pvec.nr) { - if (whence == SEEK_DATA) - index = end; - break; - } - for (i = 0; i < pvec.nr; i++, index++) { - if (index < indices[i]) { - if (whence == SEEK_HOLE) { - done = true; - break; - } - index = indices[i]; - } - page = pvec.pages[i]; - if (page && !xa_is_value(page)) { - if (!PageUptodate(page)) - page = NULL; - } - if (index >= end || - (page && whence == SEEK_DATA) || - (!page && whence == SEEK_HOLE)) { - done = true; - break; - } - } - pagevec_remove_exceptionals(&pvec); - pagevec_release(&pvec); - pvec.nr = PAGEVEC_SIZE; - cond_resched(); - } - return index; -} - static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence) { struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; - pgoff_t start, end; - loff_t new_offset; if (whence != SEEK_DATA && whence != SEEK_HOLE) return generic_file_llseek_size(file, offset, whence, MAX_LFS_FILESIZE, i_size_read(inode)); + if (offset < 0) + return -ENXIO; + inode_lock(inode); /* We're holding i_mutex so we can access i_size directly */ - - if (offset < 0 || offset >= inode->i_size) - offset = -ENXIO; - else { - start = offset >> PAGE_SHIFT; - end = (inode->i_size + PAGE_SIZE - 1) >> PAGE_SHIFT; - new_offset = shmem_seek_hole_data(mapping, start, end, whence); - new_offset <<= PAGE_SHIFT; - if (new_offset > offset) { - if (new_offset < inode->i_size) - offset = new_offset; - else if (whence == SEEK_DATA) - offset = -ENXIO; - else - offset = inode->i_size; - } - } - + offset = mapping_seek_hole_data(mapping, offset, inode->i_size, whence); if (offset >= 0) offset = vfs_setpos(file, offset, MAX_LFS_FILESIZE); inode_unlock(inode); diff --git a/mm/slab.c b/mm/slab.c index 35c68d99d460..51fd424e0d6d 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -100,6 +100,7 @@ #include <linux/seq_file.h> #include <linux/notifier.h> #include <linux/kallsyms.h> +#include <linux/kfence.h> #include <linux/cpu.h> #include <linux/sysctl.h> #include <linux/module.h> @@ -3208,7 +3209,7 @@ must_grow: } static __always_inline void * -slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, +slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, size_t orig_size, unsigned long caller) { unsigned long save_flags; @@ -3221,6 +3222,10 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, if (unlikely(!cachep)) return NULL; + ptr = kfence_alloc(cachep, orig_size, flags); + if (unlikely(ptr)) + goto out_hooks; + cache_alloc_debugcheck_before(cachep, flags); local_irq_save(save_flags); @@ -3253,6 +3258,7 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, if (unlikely(slab_want_init_on_alloc(flags, cachep)) && ptr) memset(ptr, 0, cachep->object_size); +out_hooks: slab_post_alloc_hook(cachep, objcg, flags, 1, &ptr); return ptr; } @@ -3290,7 +3296,7 @@ __do_cache_alloc(struct kmem_cache *cachep, gfp_t flags) #endif /* CONFIG_NUMA */ static __always_inline void * -slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller) +slab_alloc(struct kmem_cache *cachep, gfp_t flags, size_t orig_size, unsigned long caller) { unsigned long save_flags; void *objp; @@ -3301,6 +3307,10 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller) if (unlikely(!cachep)) return NULL; + objp = kfence_alloc(cachep, orig_size, flags); + if (unlikely(objp)) + goto out; + cache_alloc_debugcheck_before(cachep, flags); local_irq_save(save_flags); objp = __do_cache_alloc(cachep, flags); @@ -3311,6 +3321,7 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller) if (unlikely(slab_want_init_on_alloc(flags, cachep)) && objp) memset(objp, 0, cachep->object_size); +out: slab_post_alloc_hook(cachep, objcg, flags, 1, &objp); return objp; } @@ -3416,6 +3427,12 @@ free_done: static __always_inline void __cache_free(struct kmem_cache *cachep, void *objp, unsigned long caller) { + if (is_kfence_address(objp)) { + kmemleak_free_recursive(objp, cachep->flags); + __kfence_free(objp); + return; + } + if (unlikely(slab_want_init_on_free(cachep))) memset(objp, 0, cachep->object_size); @@ -3482,7 +3499,7 @@ void ___cache_free(struct kmem_cache *cachep, void *objp, */ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) { - void *ret = slab_alloc(cachep, flags, _RET_IP_); + void *ret = slab_alloc(cachep, flags, cachep->object_size, _RET_IP_); trace_kmem_cache_alloc(_RET_IP_, ret, cachep->object_size, cachep->size, flags); @@ -3515,7 +3532,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, local_irq_disable(); for (i = 0; i < size; i++) { - void *objp = __do_cache_alloc(s, flags); + void *objp = kfence_alloc(s, s->object_size, flags) ?: __do_cache_alloc(s, flags); if (unlikely(!objp)) goto error; @@ -3548,7 +3565,7 @@ kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size) { void *ret; - ret = slab_alloc(cachep, flags, _RET_IP_); + ret = slab_alloc(cachep, flags, size, _RET_IP_); ret = kasan_kmalloc(cachep, ret, size, flags); trace_kmalloc(_RET_IP_, ret, @@ -3574,7 +3591,7 @@ EXPORT_SYMBOL(kmem_cache_alloc_trace); */ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) { - void *ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_); + void *ret = slab_alloc_node(cachep, flags, nodeid, cachep->object_size, _RET_IP_); trace_kmem_cache_alloc_node(_RET_IP_, ret, cachep->object_size, cachep->size, @@ -3592,7 +3609,7 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *cachep, { void *ret; - ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_); + ret = slab_alloc_node(cachep, flags, nodeid, size, _RET_IP_); ret = kasan_kmalloc(cachep, ret, size, flags); trace_kmalloc_node(_RET_IP_, ret, @@ -3673,7 +3690,7 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, cachep = kmalloc_slab(size, flags); if (unlikely(ZERO_OR_NULL_PTR(cachep))) return cachep; - ret = slab_alloc(cachep, flags, caller); + ret = slab_alloc(cachep, flags, size, caller); ret = kasan_kmalloc(cachep, ret, size, flags); trace_kmalloc(caller, ret, @@ -4172,7 +4189,10 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page, BUG_ON(objnr >= cachep->num); /* Find offset within object. */ - offset = ptr - index_to_obj(cachep, page, objnr) - obj_offset(cachep); + if (is_kfence_address(ptr)) + offset = ptr - kfence_object_start(ptr); + else + offset = ptr - index_to_obj(cachep, page, objnr) - obj_offset(cachep); /* Allow address range falling entirely within usercopy region. */ if (offset >= cachep->useroffset && diff --git a/mm/slab_common.c b/mm/slab_common.c index 7c8298c17145..88e833986332 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -12,6 +12,7 @@ #include <linux/memory.h> #include <linux/cache.h> #include <linux/compiler.h> +#include <linux/kfence.h> #include <linux/module.h> #include <linux/cpu.h> #include <linux/uaccess.h> @@ -430,6 +431,7 @@ static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work) rcu_barrier(); list_for_each_entry_safe(s, s2, &to_destroy, list) { + kfence_shutdown_cache(s); #ifdef SLAB_SUPPORTS_SYSFS sysfs_slab_release(s); #else @@ -455,6 +457,7 @@ static int shutdown_cache(struct kmem_cache *s) list_add_tail(&s->list, &slab_caches_to_rcu_destroy); schedule_work(&slab_caches_to_rcu_destroy_work); } else { + kfence_shutdown_cache(s); #ifdef SLAB_SUPPORTS_SYSFS sysfs_slab_unlink(s); sysfs_slab_release(s); @@ -640,6 +643,7 @@ struct kmem_cache *__init create_kmalloc_cache(const char *name, panic("Out of memory when creating slab %s\n", name); create_boot_cache(s, name, size, flags, useroffset, usersize); + kasan_cache_create_kmalloc(s); list_add(&s->list, &slab_caches); s->refcount = 1; return s; @@ -1132,16 +1136,27 @@ static __always_inline void *__do_krealloc(const void *p, size_t new_size, void *ret; size_t ks; - ks = ksize(p); + /* Don't use instrumented ksize to allow precise KASAN poisoning. */ + if (likely(!ZERO_OR_NULL_PTR(p))) { + if (!kasan_check_byte(p)) + return NULL; + ks = kfence_ksize(p) ?: __ksize(p); + } else + ks = 0; + /* If the object still fits, repoison it precisely. */ if (ks >= new_size) { p = kasan_krealloc((void *)p, new_size, flags); return (void *)p; } ret = kmalloc_track_caller(new_size, flags); - if (ret && p) - memcpy(ret, p, ks); + if (ret && p) { + /* Disable KASAN checks as the object's redzone is accessed. */ + kasan_disable_current(); + memcpy(ret, kasan_reset_tag(p), ks); + kasan_enable_current(); + } return ret; } @@ -1235,7 +1250,7 @@ size_t ksize(const void *objp) if (unlikely(ZERO_OR_NULL_PTR(objp)) || !kasan_check_byte(objp)) return 0; - size = __ksize(objp); + size = kfence_ksize(objp) ?: __ksize(objp); /* * We assume that ksize callers could use whole allocated area, * so we need to unpoison this area. diff --git a/mm/slub.c b/mm/slub.c index b2833ce85c92..e26c274b4657 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -27,6 +27,7 @@ #include <linux/ctype.h> #include <linux/debugobjects.h> #include <linux/kallsyms.h> +#include <linux/kfence.h> #include <linux/memory.h> #include <linux/math64.h> #include <linux/fault-inject.h> @@ -1570,6 +1571,11 @@ static inline bool slab_free_freelist_hook(struct kmem_cache *s, void *old_tail = *tail ? *tail : *head; int rsize; + if (is_kfence_address(next)) { + slab_free_hook(s, next); + return true; + } + /* Head and tail of the reconstructed freelist */ *head = NULL; *tail = NULL; @@ -2809,7 +2815,7 @@ static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s, * Otherwise we can simply pick the next object from the lockless free list. */ static __always_inline void *slab_alloc_node(struct kmem_cache *s, - gfp_t gfpflags, int node, unsigned long addr) + gfp_t gfpflags, int node, unsigned long addr, size_t orig_size) { void *object; struct kmem_cache_cpu *c; @@ -2820,6 +2826,11 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s, s = slab_pre_alloc_hook(s, &objcg, 1, gfpflags); if (!s) return NULL; + + object = kfence_alloc(s, orig_size, gfpflags); + if (unlikely(object)) + goto out; + redo: /* * Must read kmem_cache cpu data via this cpu ptr. Preemption is @@ -2892,20 +2903,21 @@ redo: if (unlikely(slab_want_init_on_alloc(gfpflags, s)) && object) memset(kasan_reset_tag(object), 0, s->object_size); +out: slab_post_alloc_hook(s, objcg, gfpflags, 1, &object); return object; } static __always_inline void *slab_alloc(struct kmem_cache *s, - gfp_t gfpflags, unsigned long addr) + gfp_t gfpflags, unsigned long addr, size_t orig_size) { - return slab_alloc_node(s, gfpflags, NUMA_NO_NODE, addr); + return slab_alloc_node(s, gfpflags, NUMA_NO_NODE, addr, orig_size); } void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) { - void *ret = slab_alloc(s, gfpflags, _RET_IP_); + void *ret = slab_alloc(s, gfpflags, _RET_IP_, s->object_size); trace_kmem_cache_alloc(_RET_IP_, ret, s->object_size, s->size, gfpflags); @@ -2917,7 +2929,7 @@ EXPORT_SYMBOL(kmem_cache_alloc); #ifdef CONFIG_TRACING void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size) { - void *ret = slab_alloc(s, gfpflags, _RET_IP_); + void *ret = slab_alloc(s, gfpflags, _RET_IP_, size); trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags); ret = kasan_kmalloc(s, ret, size, gfpflags); return ret; @@ -2928,7 +2940,7 @@ EXPORT_SYMBOL(kmem_cache_alloc_trace); #ifdef CONFIG_NUMA void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) { - void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_); + void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_, s->object_size); trace_kmem_cache_alloc_node(_RET_IP_, ret, s->object_size, s->size, gfpflags, node); @@ -2942,7 +2954,7 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *s, gfp_t gfpflags, int node, size_t size) { - void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_); + void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_, size); trace_kmalloc_node(_RET_IP_, ret, size, s->size, gfpflags, node); @@ -2976,6 +2988,9 @@ static void __slab_free(struct kmem_cache *s, struct page *page, stat(s, FREE_SLOWPATH); + if (kfence_free(head)) + return; + if (kmem_cache_debug(s) && !free_debug_processing(s, page, head, tail, cnt, addr)) return; @@ -3220,6 +3235,13 @@ int build_detached_freelist(struct kmem_cache *s, size_t size, df->s = cache_from_obj(s, object); /* Support for memcg */ } + if (is_kfence_address(object)) { + slab_free_hook(df->s, object); + __kfence_free(object); + p[size] = NULL; /* mark object processed */ + return size; + } + /* Start new detached freelist */ df->page = page; set_freepointer(df->s, object, NULL); @@ -3295,8 +3317,14 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, c = this_cpu_ptr(s->cpu_slab); for (i = 0; i < size; i++) { - void *object = c->freelist; + void *object = kfence_alloc(s, s->object_size, flags); + if (unlikely(object)) { + p[i] = object; + continue; + } + + object = c->freelist; if (unlikely(!object)) { /* * We may have removed an object from c->freelist using @@ -3551,8 +3579,7 @@ static void early_kmem_cache_node_alloc(int node) init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); init_tracking(kmem_cache_node, n); #endif - n = kasan_kmalloc(kmem_cache_node, n, sizeof(struct kmem_cache_node), - GFP_KERNEL); + n = kasan_slab_alloc(kmem_cache_node, n, GFP_KERNEL); page->freelist = get_freepointer(kmem_cache_node, n); page->inuse = 1; page->frozen = 0; @@ -4021,7 +4048,7 @@ void *__kmalloc(size_t size, gfp_t flags) if (unlikely(ZERO_OR_NULL_PTR(s))) return s; - ret = slab_alloc(s, flags, _RET_IP_); + ret = slab_alloc(s, flags, _RET_IP_, size); trace_kmalloc(_RET_IP_, ret, size, s->size, flags); @@ -4069,7 +4096,7 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node) if (unlikely(ZERO_OR_NULL_PTR(s))) return s; - ret = slab_alloc_node(s, flags, node, _RET_IP_); + ret = slab_alloc_node(s, flags, node, _RET_IP_, size); trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node); @@ -4095,6 +4122,7 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page, struct kmem_cache *s; unsigned int offset; size_t object_size; + bool is_kfence = is_kfence_address(ptr); ptr = kasan_reset_tag(ptr); @@ -4107,10 +4135,13 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page, to_user, 0, n); /* Find offset within object. */ - offset = (ptr - page_address(page)) % s->size; + if (is_kfence) + offset = ptr - kfence_object_start(ptr); + else + offset = (ptr - page_address(page)) % s->size; /* Adjust for redzone and reject if within the redzone. */ - if (kmem_cache_debug_flags(s, SLAB_RED_ZONE)) { + if (!is_kfence && kmem_cache_debug_flags(s, SLAB_RED_ZONE)) { if (offset < s->red_left_pad) usercopy_abort("SLUB object in left red zone", s->name, to_user, offset, n); @@ -4527,7 +4558,7 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller) if (unlikely(ZERO_OR_NULL_PTR(s))) return s; - ret = slab_alloc(s, gfpflags, caller); + ret = slab_alloc(s, gfpflags, caller, size); /* Honor the call site pointer we received. */ trace_kmalloc(caller, ret, size, s->size, gfpflags); @@ -4558,7 +4589,7 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, if (unlikely(ZERO_OR_NULL_PTR(s))) return s; - ret = slab_alloc_node(s, gfpflags, node, caller); + ret = slab_alloc_node(s, gfpflags, node, caller, size); /* Honor the call site pointer we received. */ trace_kmalloc_node(caller, ret, size, s->size, gfpflags, node); diff --git a/mm/swap.c b/mm/swap.c index ab3258afcbeb..31b844d4ed94 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -1018,45 +1018,11 @@ void __pagevec_lru_add(struct pagevec *pvec) } /** - * pagevec_lookup_entries - gang pagecache lookup - * @pvec: Where the resulting entries are placed - * @mapping: The address_space to search - * @start: The starting entry index - * @nr_entries: The maximum number of pages - * @indices: The cache indices corresponding to the entries in @pvec - * - * pagevec_lookup_entries() will search for and return a group of up - * to @nr_pages pages and shadow entries in the mapping. All - * entries are placed in @pvec. pagevec_lookup_entries() takes a - * reference against actual pages in @pvec. - * - * The search returns a group of mapping-contiguous entries with - * ascending indexes. There may be holes in the indices due to - * not-present entries. - * - * Only one subpage of a Transparent Huge Page is returned in one call: - * allowing truncate_inode_pages_range() to evict the whole THP without - * cycling through a pagevec of extra references. - * - * pagevec_lookup_entries() returns the number of entries which were - * found. - */ -unsigned pagevec_lookup_entries(struct pagevec *pvec, - struct address_space *mapping, - pgoff_t start, unsigned nr_entries, - pgoff_t *indices) -{ - pvec->nr = find_get_entries(mapping, start, nr_entries, - pvec->pages, indices); - return pagevec_count(pvec); -} - -/** * pagevec_remove_exceptionals - pagevec exceptionals pruning * @pvec: The pagevec to prune * - * pagevec_lookup_entries() fills both pages and exceptional radix - * tree entries into the pagevec. This function prunes all + * find_get_entries() fills both pages and XArray value entries (aka + * exceptional entries) into the pagevec. This function prunes all * exceptionals from @pvec without leaving holes, so that it can be * passed on to page-only pagevec operations. */ diff --git a/mm/swap_state.c b/mm/swap_state.c index c1a648d9092b..3cdee7b11da9 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -87,11 +87,9 @@ void *get_shadow_from_swap_cache(swp_entry_t entry) pgoff_t idx = swp_offset(entry); struct page *page; - page = find_get_entry(address_space, idx); + page = xa_load(&address_space->i_pages, idx); if (xa_is_value(page)) return page; - if (page) - put_page(page); return NULL; } @@ -405,7 +403,8 @@ struct page *find_get_incore_page(struct address_space *mapping, pgoff_t index) { swp_entry_t swp; struct swap_info_struct *si; - struct page *page = find_get_entry(mapping, index); + struct page *page = pagecache_get_page(mapping, index, + FGP_ENTRY | FGP_HEAD, 0); if (!page) return page; diff --git a/mm/truncate.c b/mm/truncate.c index 8aa4907e06e0..455944264663 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -57,11 +57,10 @@ static void clear_shadow_entry(struct address_space *mapping, pgoff_t index, * exceptional entries similar to what pagevec_remove_exceptionals does. */ static void truncate_exceptional_pvec_entries(struct address_space *mapping, - struct pagevec *pvec, pgoff_t *indices, - pgoff_t end) + struct pagevec *pvec, pgoff_t *indices) { int i, j; - bool dax, lock; + bool dax; /* Handled by shmem itself */ if (shmem_mapping(mapping)) @@ -75,8 +74,7 @@ static void truncate_exceptional_pvec_entries(struct address_space *mapping, return; dax = dax_mapping(mapping); - lock = !dax && indices[j] < end; - if (lock) + if (!dax) xa_lock_irq(&mapping->i_pages); for (i = j; i < pagevec_count(pvec); i++) { @@ -88,9 +86,6 @@ static void truncate_exceptional_pvec_entries(struct address_space *mapping, continue; } - if (index >= end) - continue; - if (unlikely(dax)) { dax_delete_mapping_entry(mapping, index); continue; @@ -99,7 +94,7 @@ static void truncate_exceptional_pvec_entries(struct address_space *mapping, __clear_shadow_entry(mapping, index, page); } - if (lock) + if (!dax) xa_unlock_irq(&mapping->i_pages); pvec->nr = j; } @@ -326,51 +321,19 @@ void truncate_inode_pages_range(struct address_space *mapping, pagevec_init(&pvec); index = start; - while (index < end && pagevec_lookup_entries(&pvec, mapping, index, - min(end - index, (pgoff_t)PAGEVEC_SIZE), - indices)) { - /* - * Pagevec array has exceptional entries and we may also fail - * to lock some pages. So we store pages that can be deleted - * in a new pagevec. - */ - struct pagevec locked_pvec; - - pagevec_init(&locked_pvec); - for (i = 0; i < pagevec_count(&pvec); i++) { - struct page *page = pvec.pages[i]; - - /* We rely upon deletion not changing page->index */ - index = indices[i]; - if (index >= end) - break; - - if (xa_is_value(page)) - continue; - - if (!trylock_page(page)) - continue; - WARN_ON(page_to_index(page) != index); - if (PageWriteback(page)) { - unlock_page(page); - continue; - } - if (page->mapping != mapping) { - unlock_page(page); - continue; - } - pagevec_add(&locked_pvec, page); - } - for (i = 0; i < pagevec_count(&locked_pvec); i++) - truncate_cleanup_page(mapping, locked_pvec.pages[i]); - delete_from_page_cache_batch(mapping, &locked_pvec); - for (i = 0; i < pagevec_count(&locked_pvec); i++) - unlock_page(locked_pvec.pages[i]); - truncate_exceptional_pvec_entries(mapping, &pvec, indices, end); + while (index < end && find_lock_entries(mapping, index, end - 1, + &pvec, indices)) { + index = indices[pagevec_count(&pvec) - 1] + 1; + truncate_exceptional_pvec_entries(mapping, &pvec, indices); + for (i = 0; i < pagevec_count(&pvec); i++) + truncate_cleanup_page(mapping, pvec.pages[i]); + delete_from_page_cache_batch(mapping, &pvec); + for (i = 0; i < pagevec_count(&pvec); i++) + unlock_page(pvec.pages[i]); pagevec_release(&pvec); cond_resched(); - index++; } + if (partial_start) { struct page *page = find_lock_page(mapping, start - 1); if (page) { @@ -413,8 +376,8 @@ void truncate_inode_pages_range(struct address_space *mapping, index = start; for ( ; ; ) { cond_resched(); - if (!pagevec_lookup_entries(&pvec, mapping, index, - min(end - index, (pgoff_t)PAGEVEC_SIZE), indices)) { + if (!find_get_entries(mapping, index, end - 1, &pvec, + indices)) { /* If all gone from start onwards, we're done */ if (index == start) break; @@ -422,23 +385,12 @@ void truncate_inode_pages_range(struct address_space *mapping, index = start; continue; } - if (index == start && indices[0] >= end) { - /* All gone out of hole to be punched, we're done */ - pagevec_remove_exceptionals(&pvec); - pagevec_release(&pvec); - break; - } for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; /* We rely upon deletion not changing page->index */ index = indices[i]; - if (index >= end) { - /* Restart punch to make sure all gone */ - index = start - 1; - break; - } if (xa_is_value(page)) continue; @@ -449,7 +401,7 @@ void truncate_inode_pages_range(struct address_space *mapping, truncate_inode_page(mapping, page); unlock_page(page); } - truncate_exceptional_pvec_entries(mapping, &pvec, indices, end); + truncate_exceptional_pvec_entries(mapping, &pvec, indices); pagevec_release(&pvec); index++; } @@ -539,55 +491,19 @@ static unsigned long __invalidate_mapping_pages(struct address_space *mapping, int i; pagevec_init(&pvec); - while (index <= end && pagevec_lookup_entries(&pvec, mapping, index, - min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, - indices)) { + while (find_lock_entries(mapping, index, end, &pvec, indices)) { for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; /* We rely upon deletion not changing page->index */ index = indices[i]; - if (index > end) - break; if (xa_is_value(page)) { invalidate_exceptional_entry(mapping, index, page); continue; } - - if (!trylock_page(page)) - continue; - - WARN_ON(page_to_index(page) != index); - - /* Middle of THP: skip */ - if (PageTransTail(page)) { - unlock_page(page); - continue; - } else if (PageTransHuge(page)) { - index += HPAGE_PMD_NR - 1; - i += HPAGE_PMD_NR - 1; - /* - * 'end' is in the middle of THP. Don't - * invalidate the page as the part outside of - * 'end' could be still useful. - */ - if (index > end) { - unlock_page(page); - continue; - } - - /* Take a pin outside pagevec */ - get_page(page); - - /* - * Drop extra pins before trying to invalidate - * the huge page. - */ - pagevec_remove_exceptionals(&pvec); - pagevec_release(&pvec); - } + index += thp_nr_pages(page) - 1; ret = invalidate_inode_page(page); unlock_page(page); @@ -601,9 +517,6 @@ static unsigned long __invalidate_mapping_pages(struct address_space *mapping, if (nr_pagevec) (*nr_pagevec)++; } - - if (PageTransHuge(page)) - put_page(page); count += ret; } pagevec_remove_exceptionals(&pvec); @@ -725,16 +638,12 @@ int invalidate_inode_pages2_range(struct address_space *mapping, pagevec_init(&pvec); index = start; - while (index <= end && pagevec_lookup_entries(&pvec, mapping, index, - min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, - indices)) { + while (find_get_entries(mapping, index, end, &pvec, indices)) { for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; /* We rely upon deletion not changing page->index */ index = indices[i]; - if (index > end) - break; if (xa_is_value(page)) { if (!invalidate_exceptional_entry2(mapping, diff --git a/mm/vmstat.c b/mm/vmstat.c index a0e949542204..74b2c374b86c 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -342,6 +342,12 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item, long t; if (vmstat_item_in_bytes(item)) { + /* + * Only cgroups use subpage accounting right now; at + * the global level, these items still change in + * multiples of whole pages. Store them as pages + * internally to keep the per-cpu counters compact. + */ VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1)); delta >>= PAGE_SHIFT; } @@ -551,6 +557,12 @@ static inline void mod_node_state(struct pglist_data *pgdat, long o, n, t, z; if (vmstat_item_in_bytes(item)) { + /* + * Only cgroups use subpage accounting right now; at + * the global level, these items still change in + * multiples of whole pages. Store them as pages + * internally to keep the per-cpu counters compact. + */ VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1)); delta >>= PAGE_SHIFT; } @@ -1637,14 +1649,16 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, "\n high %lu" "\n spanned %lu" "\n present %lu" - "\n managed %lu", + "\n managed %lu" + "\n cma %lu", zone_page_state(zone, NR_FREE_PAGES), min_wmark_pages(zone), low_wmark_pages(zone), high_wmark_pages(zone), zone->spanned_pages, zone->present_pages, - zone_managed_pages(zone)); + zone_managed_pages(zone), + zone_cma_pages(zone)); seq_printf(m, "\n protection: (%ld", @@ -1892,16 +1906,12 @@ static void vmstat_update(struct work_struct *w) */ static bool need_update(int cpu) { + pg_data_t *last_pgdat = NULL; struct zone *zone; for_each_populated_zone(zone) { struct per_cpu_pageset *p = per_cpu_ptr(zone->pageset, cpu); - - BUILD_BUG_ON(sizeof(p->vm_stat_diff[0]) != 1); -#ifdef CONFIG_NUMA - BUILD_BUG_ON(sizeof(p->vm_numa_stat_diff[0]) != 2); -#endif - + struct per_cpu_nodestat *n; /* * The fast way of checking if there are any vmstat diffs. */ @@ -1913,6 +1923,13 @@ static bool need_update(int cpu) sizeof(p->vm_numa_stat_diff[0]))) return true; #endif + if (last_pgdat == zone->zone_pgdat) + continue; + last_pgdat = zone->zone_pgdat; + n = per_cpu_ptr(zone->zone_pgdat->per_cpu_nodestats, cpu); + if (memchr_inv(n->vm_node_stat_diff, 0, NR_VM_NODE_STAT_ITEMS * + sizeof(n->vm_node_stat_diff[0]))) + return true; } return false; } @@ -1963,6 +1980,8 @@ static void vmstat_shepherd(struct work_struct *w) if (!delayed_work_pending(dw) && need_update(cpu)) queue_delayed_work_on(cpu, mm_percpu_wq, dw, 0); + + cond_resched(); } put_online_cpus(); diff --git a/mm/z3fold.c b/mm/z3fold.c index c1ccf6bb0ffb..b5dafa7e44e4 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -1771,6 +1771,7 @@ static u64 z3fold_zpool_total_size(void *pool) static struct zpool_driver z3fold_zpool_driver = { .type = "z3fold", + .sleep_mapped = true, .owner = THIS_MODULE, .create = z3fold_zpool_create, .destroy = z3fold_zpool_destroy, diff --git a/mm/zbud.c b/mm/zbud.c index c49966ece674..7ec5f27a68b0 100644 --- a/mm/zbud.c +++ b/mm/zbud.c @@ -203,6 +203,7 @@ static u64 zbud_zpool_total_size(void *pool) static struct zpool_driver zbud_zpool_driver = { .type = "zbud", + .sleep_mapped = true, .owner = THIS_MODULE, .create = zbud_zpool_create, .destroy = zbud_zpool_destroy, diff --git a/mm/zpool.c b/mm/zpool.c index 3744a2d1a624..5ed71207ced7 100644 --- a/mm/zpool.c +++ b/mm/zpool.c @@ -23,6 +23,7 @@ struct zpool { void *pool; const struct zpool_ops *ops; bool evictable; + bool can_sleep_mapped; struct list_head list; }; @@ -183,6 +184,7 @@ struct zpool *zpool_create_pool(const char *type, const char *name, gfp_t gfp, zpool->pool = driver->create(name, gfp, ops, zpool); zpool->ops = ops; zpool->evictable = driver->shrink && ops && ops->evict; + zpool->can_sleep_mapped = driver->sleep_mapped; if (!zpool->pool) { pr_err("couldn't create %s pool\n", type); @@ -393,6 +395,17 @@ bool zpool_evictable(struct zpool *zpool) return zpool->evictable; } +/** + * zpool_can_sleep_mapped - Test if zpool can sleep when do mapped. + * @zpool: The zpool to test + * + * Returns: true if zpool can sleep; false otherwise. + */ +bool zpool_can_sleep_mapped(struct zpool *zpool) +{ + return zpool->can_sleep_mapped; +} + MODULE_LICENSE("GPL"); MODULE_AUTHOR("Dan Streetman <ddstreet@ieee.org>"); MODULE_DESCRIPTION("Common API for compressed memory storage"); diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 7289f502ffac..30c358b72025 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -357,7 +357,7 @@ static void cache_free_handle(struct zs_pool *pool, unsigned long handle) static struct zspage *cache_alloc_zspage(struct zs_pool *pool, gfp_t flags) { - return kmem_cache_alloc(pool->zspage_cachep, + return kmem_cache_zalloc(pool->zspage_cachep, flags & ~(__GFP_HIGHMEM|__GFP_MOVABLE)); } @@ -816,7 +816,7 @@ static int get_pages_per_zspage(int class_size) static struct zspage *get_zspage(struct page *page) { - struct zspage *zspage = (struct zspage *)page->private; + struct zspage *zspage = (struct zspage *)page_private(page); BUG_ON(zspage->magic != ZSPAGE_MAGIC); return zspage; @@ -1064,7 +1064,6 @@ static struct zspage *alloc_zspage(struct zs_pool *pool, if (!zspage) return NULL; - memset(zspage, 0, sizeof(struct zspage)); zspage->magic = ZSPAGE_MAGIC; migrate_lock_init(zspage); @@ -2213,11 +2212,13 @@ static unsigned long zs_can_compact(struct size_class *class) return obj_wasted * class->pages_per_zspage; } -static void __zs_compact(struct zs_pool *pool, struct size_class *class) +static unsigned long __zs_compact(struct zs_pool *pool, + struct size_class *class) { struct zs_compact_control cc; struct zspage *src_zspage; struct zspage *dst_zspage = NULL; + unsigned long pages_freed = 0; spin_lock(&class->lock); while ((src_zspage = isolate_zspage(class, true))) { @@ -2247,7 +2248,7 @@ static void __zs_compact(struct zs_pool *pool, struct size_class *class) putback_zspage(class, dst_zspage); if (putback_zspage(class, src_zspage) == ZS_EMPTY) { free_zspage(pool, class, src_zspage); - pool->stats.pages_compacted += class->pages_per_zspage; + pages_freed += class->pages_per_zspage; } spin_unlock(&class->lock); cond_resched(); @@ -2258,12 +2259,15 @@ static void __zs_compact(struct zs_pool *pool, struct size_class *class) putback_zspage(class, src_zspage); spin_unlock(&class->lock); + + return pages_freed; } unsigned long zs_compact(struct zs_pool *pool) { int i; struct size_class *class; + unsigned long pages_freed = 0; for (i = ZS_SIZE_CLASSES - 1; i >= 0; i--) { class = pool->size_class[i]; @@ -2271,10 +2275,11 @@ unsigned long zs_compact(struct zs_pool *pool) continue; if (class->index != i) continue; - __zs_compact(pool, class); + pages_freed += __zs_compact(pool, class); } + atomic_long_add(pages_freed, &pool->stats.pages_compacted); - return pool->stats.pages_compacted; + return pages_freed; } EXPORT_SYMBOL_GPL(zs_compact); @@ -2291,13 +2296,12 @@ static unsigned long zs_shrinker_scan(struct shrinker *shrinker, struct zs_pool *pool = container_of(shrinker, struct zs_pool, shrinker); - pages_freed = pool->stats.pages_compacted; /* * Compact classes and calculate compaction delta. * Can run concurrently with a manually triggered * (by user) compaction. */ - pages_freed = zs_compact(pool) - pages_freed; + pages_freed = zs_compact(pool); return pages_freed ? pages_freed : SHRINK_STOP; } diff --git a/mm/zswap.c b/mm/zswap.c index 182f6ad5aa69..578d9f256920 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -935,13 +935,19 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle) struct scatterlist input, output; struct crypto_acomp_ctx *acomp_ctx; - u8 *src; + u8 *src, *tmp = NULL; unsigned int dlen; int ret; struct writeback_control wbc = { .sync_mode = WB_SYNC_NONE, }; + if (!zpool_can_sleep_mapped(pool)) { + tmp = kmalloc(PAGE_SIZE, GFP_ATOMIC); + if (!tmp) + return -ENOMEM; + } + /* extract swpentry from data */ zhdr = zpool_map_handle(pool, handle, ZPOOL_MM_RO); swpentry = zhdr->swpentry; /* here */ @@ -955,6 +961,7 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle) /* entry was invalidated */ spin_unlock(&tree->lock); zpool_unmap_handle(pool, handle); + kfree(tmp); return 0; } spin_unlock(&tree->lock); @@ -979,6 +986,14 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle) dlen = PAGE_SIZE; src = (u8 *)zhdr + sizeof(struct zswap_header); + if (!zpool_can_sleep_mapped(pool)) { + + memcpy(tmp, src, entry->length); + src = tmp; + + zpool_unmap_handle(pool, handle); + } + mutex_lock(acomp_ctx->mutex); sg_init_one(&input, src, entry->length); sg_init_table(&output, 1); @@ -1022,10 +1037,10 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle) /* * if we get here due to ZSWAP_SWAPCACHE_EXIST - * a load may happening concurrently - * it is safe and okay to not free the entry + * a load may be happening concurrently. + * it is safe and okay to not free the entry. * if we free the entry in the following put - * it it either okay to return !0 + * it is also okay to return !0 */ fail: spin_lock(&tree->lock); @@ -1033,7 +1048,11 @@ fail: spin_unlock(&tree->lock); end: - zpool_unmap_handle(pool, handle); + if (zpool_can_sleep_mapped(pool)) + zpool_unmap_handle(pool, handle); + else + kfree(tmp); + return ret; } @@ -1235,7 +1254,7 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset, struct zswap_entry *entry; struct scatterlist input, output; struct crypto_acomp_ctx *acomp_ctx; - u8 *src, *dst; + u8 *src, *dst, *tmp; unsigned int dlen; int ret; @@ -1253,15 +1272,33 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset, dst = kmap_atomic(page); zswap_fill_page(dst, entry->value); kunmap_atomic(dst); + ret = 0; goto freeentry; } + if (!zpool_can_sleep_mapped(entry->pool->zpool)) { + + tmp = kmalloc(entry->length, GFP_ATOMIC); + if (!tmp) { + ret = -ENOMEM; + goto freeentry; + } + } + /* decompress */ dlen = PAGE_SIZE; src = zpool_map_handle(entry->pool->zpool, entry->handle, ZPOOL_MM_RO); if (zpool_evictable(entry->pool->zpool)) src += sizeof(struct zswap_header); + if (!zpool_can_sleep_mapped(entry->pool->zpool)) { + + memcpy(tmp, src, entry->length); + src = tmp; + + zpool_unmap_handle(entry->pool->zpool, entry->handle); + } + acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); mutex_lock(acomp_ctx->mutex); sg_init_one(&input, src, entry->length); @@ -1271,7 +1308,11 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset, ret = crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait); mutex_unlock(acomp_ctx->mutex); - zpool_unmap_handle(entry->pool->zpool, entry->handle); + if (zpool_can_sleep_mapped(entry->pool->zpool)) + zpool_unmap_handle(entry->pool->zpool, entry->handle); + else + kfree(tmp); + BUG_ON(ret); freeentry: @@ -1279,7 +1320,7 @@ freeentry: zswap_entry_put(tree, entry); spin_unlock(&tree->lock); - return 0; + return ret; } /* frees an entry in zswap */ |