vmalloc是一个接口函数, 内核代码使用它来分配在虚拟内存中连续但在物理内存中不一定连续的内存。

vmalloc只需要一个参数——要分配的内存大小,以字节为单位。

使用vmalloc的最著名的实例是内核对模块的实现. 因为模块可能在任何时候加载, 如果模块数据比较多, 那么无法保证有足够的连续内存可用, 特别是在系统已经运行了比较长时间的情况下.

如果能够用小块内存拼接出足够的内存, 那么使用vmalloc可以规避该问题。

因为用于vmalloc的内存页总是必须映射在内核地址空间中, 因此使用ZONE_HIGHMEM内存域的页要优于其他内存域. 这使得内核可以节省更宝贵的较低端内存域, 而又不会带来额外的坏处. 因此, vmalloc等映射函数是内核出于自身的目的(并非因为用户空间应用程序)使用高端内存页的少数情形之一.
内核在管理虚拟内存中的VMALLOC区域时, 内核必须跟踪哪些子区域被使用、哪些是空闲的. 为此定义了一个数据结构vm_struct:

31struct vm_struct {
struct vm_struct *next;
void *addr;
unsigned long size;
unsigned long flags;
struct page **pages;
unsigned int nr_pages;
phys_addr_t phys_addr;
const void *caller;
};

一个vm_struct代表一个vmalloc区域。

通过next形成一个链表。

addr是映射的首地址,size为映射地址区间的大小。

pages是一组指针,这些指针描述映射到这个区间里面的一个个真实的物理页对应的page指针。

nr_pages表示该地址区间映射了多少物理页。

phys_addr仅在用ioremap映射由物理地址描述的物理内存区域时才会用到,记录该区域的起始物理地址。

caller记录分配调用者的返回地址(通过__builtin_return_address(0)获得),便于调试时定位是谁分配了该区域。

flags的取值如下:

/* bits in flags of vmalloc's vm_struct below */
#define VM_IOREMAP		0x00000001	/* ioremap() and friends */
#define VM_ALLOC		0x00000002	/* vmalloc() */
#define VM_MAP			0x00000004	/* vmap()ed pages */
#define VM_USERMAP		0x00000008	/* suitable for remap_vmalloc_range */
#define VM_VPAGES		0x00000010	/* buffer for pages was vmalloc'ed */
#define VM_UNINITIALIZED	0x00000020	/* vm_struct is not fully initialized */
#define VM_NO_GUARD		0x00000040	/* don't add guard page */
#define VM_KASAN		0x00000080	/* has allocated kasan shadow memory */
/* bits [20..32] reserved for arch specific ioremap internals */

vmalloc调用链上的几个函数在源码中紧挨在一起,一并贴出如下:

/**
 * __vmalloc_node - allocate virtually contiguous memory
 * @size:	allocation size
 * @align:	desired alignment
 * @gfp_mask:	flags for the page level allocator
 * @prot:	protection mask for the allocated pages
 * @node:	node to use for allocation or NUMA_NO_NODE
 * @caller:	caller's return address
 *
 * Allocate enough pages to cover @size from the page level
 * allocator with @gfp_mask flags.  Map them into contiguous
 * kernel virtual space, using a pagetable protection of @prot.
 */
static void *__vmalloc_node(unsigned long size, unsigned long align,
			    gfp_t gfp_mask, pgprot_t prot,
			    int node, const void *caller)
{
	/* vm_flags == 0: a plain vmalloc area with the default guard page */
	return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
				    gfp_mask, prot, 0, node, caller);
}

static inline void *__vmalloc_node_flags(unsigned long size,
					 int node, gfp_t flags)
{
	/* align == 1: no alignment requirement beyond page granularity */
	return __vmalloc_node(size, 1, flags, PAGE_KERNEL,
			      node, __builtin_return_address(0));
}

/**
 * vmalloc - allocate virtually contiguous memory
 * @size:	allocation size
 *
 * Allocate enough pages to cover @size from the page level
 * allocator and map them into contiguous kernel virtual space.
 *
 * For tight control over page level allocator and protection flags
 * use __vmalloc() instead.
 */
void *vmalloc(unsigned long size)
{
	return __vmalloc_node_flags(size, NUMA_NO_NODE,
				    GFP_KERNEL | __GFP_HIGHMEM);
}
EXPORT_SYMBOL(vmalloc);

最终调到__vmalloc_node_range,并把VMALLOC_START和VMALLOC_END传入,该函数是vmalloc的主要实现,用来从(start, end)中申请一段大小为size的虚拟地址空间,并给这块虚拟地址空间申请物理内存(基本是不连续的),并写入页表。

VMALLOC_START和VMALLOC_END在arm中的定义如下:

/*
 * Just any arbitrary offset to the start of the vmalloc VM area: the
 * current 8MB value just means that there will be a 8MB "hole" after the
 * physical memory until the kernel virtual memory starts.  That means that
 * any out-of-bounds memory accesses will hopefully be caught.
 * The vmalloc() routines leaves a hole of 4kB between each vmalloced
 * area for the same reason. ;)
 */
#define VMALLOC_OFFSET		(8*1024*1024)
#define VMALLOC_START		(((unsigned long)high_memory + VMALLOC_OFFSET) & ~(VMALLOC_OFFSET-1))
#define VMALLOC_END		0xff800000UL

可以看出VMALLOC_END为0xff800000UL,但VMALLOC_START与high_memory有关。high_memory在sanity_check_meminfo中被确定:

static void * __initdata vmalloc_min =
(void *)(VMALLOC_END - ( << ) - VMALLOC_OFFSET); /*
1085 * vmalloc=size forces the vmalloc area to be exactly 'size'
1086 * bytes. This can be used to increase (or decrease) the vmalloc
1087 * area - the default is 240m.
1088 */static int __init early_vmalloc(char *arg)
{
unsigned long vmalloc_reserve = memparse(arg, NULL); if (vmalloc_reserve < SZ_16M) {
vmalloc_reserve = SZ_16M;
pr_warn("vmalloc area too small, limiting to %luMB\n",
vmalloc_reserve >> );
} if (vmalloc_reserve > VMALLOC_END - (PAGE_OFFSET + SZ_32M)) {
vmalloc_reserve = VMALLOC_END - (PAGE_OFFSET + SZ_32M);
pr_warn("vmalloc area is too big, limiting to %luMB\n",
vmalloc_reserve >> );
} vmalloc_min = (void *)(VMALLOC_END - vmalloc_reserve);
return ;
}
1108early_param("vmalloc", early_vmalloc); 1110phys_addr_t arm_lowmem_limit __initdata = ;
/*
 * Walk the memblock memory regions and derive arm_lowmem_limit (the top
 * of directly-mapped lowmem, which also determines high_memory and hence
 * VMALLOC_START) so that lowmem never overlaps the vmalloc area.  Regions
 * above vmalloc_min become highmem or are dropped when !CONFIG_HIGHMEM.
 */
void __init sanity_check_meminfo(void)
{
	phys_addr_t memblock_limit = 0;
	int highmem = 0;
	phys_addr_t vmalloc_limit = __pa(vmalloc_min - 1) + 1;
	struct memblock_region *reg;
	bool should_use_highmem = false;

	for_each_memblock(memory, reg) {
		phys_addr_t block_start = reg->base;
		phys_addr_t block_end = reg->base + reg->size;
		phys_addr_t size_limit = reg->size;

		if (reg->base >= vmalloc_limit)
			highmem = 1;
		else
			size_limit = vmalloc_limit - reg->base;

		if (!IS_ENABLED(CONFIG_HIGHMEM) || cache_is_vipt_aliasing()) {

			/* Memory entirely above lowmem is unusable here. */
			if (highmem) {
				pr_notice("Ignoring RAM at %pa-%pa (!CONFIG_HIGHMEM)\n",
					  &block_start, &block_end);
				memblock_remove(reg->base, reg->size);
				should_use_highmem = true;
				continue;
			}

			/* A region straddling the limit is trimmed to fit. */
			if (reg->size > size_limit) {
				phys_addr_t overlap_size = reg->size - size_limit;

				pr_notice("Truncating RAM at %pa-%pa to -%pa",
					  &block_start, &block_end, &vmalloc_limit);
				memblock_remove(vmalloc_limit, overlap_size);
				block_end = vmalloc_limit;
				should_use_highmem = true;
			}
		}

		if (!highmem) {
			if (block_end > arm_lowmem_limit) {
				if (reg->size > size_limit)
					arm_lowmem_limit = vmalloc_limit;
				else
					arm_lowmem_limit = block_end;
			}

			/*
			 * Find the first non-pmd-aligned page, and point
			 * memblock_limit at it. This relies on rounding the
			 * limit down to be pmd-aligned, which happens at the
			 * end of this function.
			 *
			 * With this algorithm, the start or end of almost any
			 * bank can be non-pmd-aligned. The only exception is
			 * that the start of the bank 0 must be section-
			 * aligned, since otherwise memory would need to be
			 * allocated when mapping the start of bank 0, which
			 * occurs before any free memory is mapped.
			 */
			if (!memblock_limit) {
				if (!IS_ALIGNED(block_start, PMD_SIZE))
					memblock_limit = block_start;
				else if (!IS_ALIGNED(block_end, PMD_SIZE))
					memblock_limit = arm_lowmem_limit;
			}
		}
	}

	if (should_use_highmem)
		pr_notice("Consider using a HIGHMEM enabled kernel.\n");

	high_memory = __va(arm_lowmem_limit - 1) + 1;

	if (!memblock_limit)
		memblock_limit = arm_lowmem_limit;

	/*
	 * Round the memblock limit down to a pmd size.  This
	 * helps to ensure that we will allocate memory from the
	 * last full pmd, which should be mapped.
	 */
	memblock_limit = round_down(memblock_limit, PMD_SIZE);

	memblock_set_current_limit(memblock_limit);
}

如果在bootargs里面没有vmalloc=的字段,vmalloc占用的虚拟地址空间为240MB,如果设置了该参数大小为P,会用VMALLOC_END-P赋给vmalloc_min。

sanity_check_meminfo对于vmalloc来讲最重要的作用就是根据memblock里面的内存块确定arm_lowmem_limit的地址,使其不会与vmalloc区间重叠。

OK, VMALLOC_START与VMALLOC_END确定。

下面来看下这个函数的实现:

/**
 * __vmalloc_node_range - allocate virtually contiguous memory
 * @size:	allocation size
 * @align:	desired alignment
 * @start:	vm area range start
 * @end:	vm area range end
 * @gfp_mask:	flags for the page level allocator
 * @prot:	protection mask for the allocated pages
 * @vm_flags:	additional vm area flags (e.g. %VM_NO_GUARD)
 * @node:	node to use for allocation or NUMA_NO_NODE
 * @caller:	caller's return address
 *
 * Allocate enough pages to cover @size from the page level
 * allocator with @gfp_mask flags.  Map them into contiguous
 * kernel virtual space, using a pagetable protection of @prot.
 */
void *__vmalloc_node_range(unsigned long size, unsigned long align,
			unsigned long start, unsigned long end, gfp_t gfp_mask,
			pgprot_t prot, unsigned long vm_flags, int node,
			const void *caller)
{
	struct vm_struct *area;
	void *addr;
	unsigned long real_size = size;

	size = PAGE_ALIGN(size);
	if (!size || (size >> PAGE_SHIFT) > totalram_pages)
		goto fail;

	/* Reserve a region of virtual address space in [start, end). */
	area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED |
				vm_flags, start, end, node, gfp_mask, caller);
	if (!area)
		goto fail;

	/* Back the region with physical pages and write the page tables. */
	addr = __vmalloc_area_node(area, gfp_mask, prot, node);
	if (!addr)
		return NULL;

	/*
	 * In this function, newly allocated vm_struct has VM_UNINITIALIZED
	 * flag. It means that vm_struct is not fully initialized.
	 * Now, it is fully initialized, so remove this flag here.
	 */
	clear_vm_uninitialized_flag(area);

	/*
	 * A ref_count = 2 is needed because vm_struct allocated in
	 * __get_vm_area_node() contains a reference to the virtual address of
	 * the vmalloc'ed block.
	 */
	kmemleak_alloc(addr, real_size, 2, gfp_mask);

	return addr;

fail:
	warn_alloc_failed(gfp_mask, 0,
			  "vmalloc: allocation failure: %lu bytes\n",
			  real_size);
	return NULL;
}

先调用__get_vm_area_node在vmap_area组成的红黑树中找到一个位置,把由该空间组成的vmap_area插入红黑树。

然后调用setup_vmalloc_vm 把该空间保存在vm_struct中。

1317static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
unsigned long flags, const void *caller)
{
spin_lock(&vmap_area_lock);
vm->flags = flags;
vm->addr = (void *)va->va_start;
vm->size = va->va_end - va->va_start;
vm->caller = caller;
va->vm = vm;
va->flags |= VM_VM_AREA;
spin_unlock(&vmap_area_lock);
}

然后回到__vmalloc_node_range中,申请完虚拟地址空间后,接着调用__vmalloc_area_node 来申请具体的物理页,并把这些页和对应的虚拟地址填入页表。

1591static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
pgprot_t prot, int node)
{
const int order = ;
struct page **pages;
unsigned int nr_pages, array_size, i;
const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
const gfp_t alloc_mask = gfp_mask | __GFP_NOWARN; nr_pages = get_vm_area_size(area) >> PAGE_SHIFT;
array_size = (nr_pages * sizeof(struct page *)); area->nr_pages = nr_pages;
/* Please note that the recursion is strictly bounded. */
if (array_size > PAGE_SIZE) {
pages = __vmalloc_node(array_size, , nested_gfp|__GFP_HIGHMEM,
PAGE_KERNEL, node, area->caller);
area->flags |= VM_VPAGES;
} else {
pages = kmalloc_node(array_size, nested_gfp, node);
}
area->pages = pages;
if (!area->pages) {
remove_vm_area(area->addr);
kfree(area);
return NULL;
} for (i = ; i < area->nr_pages; i++) {
struct page *page; if (node == NUMA_NO_NODE)
page = alloc_page(alloc_mask);
else
page = alloc_pages_node(node, alloc_mask, order); if (unlikely(!page)) {
/* Successfully allocated i pages, free them in __vunmap() */
area->nr_pages = i;
goto fail;
}
area->pages[i] = page;
if (gfpflags_allow_blocking(gfp_mask))
cond_resched();
} if (map_vm_area(area, prot, pages))
goto fail;
return area->addr; 1641fail:
warn_alloc_failed(gfp_mask, order,
"vmalloc: allocation failure, allocated %ld of %ld bytes\n",
(area->nr_pages*PAGE_SIZE), area->size);
vfree(area->addr);
return NULL;
}

首先为pages数组申请一段连续的虚拟地址空间(小于1页,使用kmalloc,大于1页,调vmalloc来保证虚拟地址空间的连续性),用来存入申请的物理页对应的page结构体,然后会申请nr_pages个物理页(注意是一次申请一页,因此申请到的页基本是不连续的)。

最终通过map_vm_area把这些物理页和虚拟地址空间对应起来,写入页表。

另外,社区还有对vmalloc的页面进行区分的patch。如下:

diff --git a/fs/proc/page.c b/fs/proc/page.c
index 792c78a49174..fc83dae1af7b
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ ... @@ u64 stable_page_flags(struct page *page)
		u |= 1 << KPF_BALLOON;
	if (PageTable(page))
		u |= 1 << KPF_PGTABLE;
+	if (PageVMalloc(page))
+		u |= 1 << KPF_VMALLOC;

	if (page_is_idle(page))
		u |= 1 << KPF_IDLE;
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 42619e16047f..c51ddd27bfb4
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -, +, @@ struct page {
spinlock_t ptl;
#endif
};
+ struct { /* VMalloc pages */
+ struct vm_struct *vm_area;
+ unsigned long vm_offset;
+ unsigned long _vm_id; /* MAPPING_VMalloc */
+ }; /** @rcu_head: You can use this to free a page by RCU. */
struct rcu_head rcu_head;
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 901943e4754b..5232433175c1
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -, +, @@ PAGE_TYPE_OPS(Kmemcg, kmemcg)
*/
PAGE_TYPE_OPS(Table, table) +/*
+ * vmalloc pages may be mapped to userspace, so we need some other way
+ * to distinguish them from other kinds of pages. Use page->mapping
+ * for this purpose. Values below 0x1000 cannot be real pointers.
+ */
+#define MAPPING_VMalloc (void *)0x440
+
+#define PAGE_MAPPING_OPS(name) \
+static __always_inline int Page##name(struct page *page) \
+{ \
+ return page->mapping == MAPPING_##name; \
+} \
+static __always_inline void __SetPage##name(struct page *page) \
+{ \
+ VM_BUG_ON_PAGE(page->mapping != NULL, page); \
+ page->mapping = MAPPING_##name; \
+} \
+static __always_inline void __ClearPage##name(struct page *page) \
+{ \
+ VM_BUG_ON_PAGE(page->mapping != MAPPING_##name, page); \
+ page->mapping = NULL; \
+}
+
+PAGE_MAPPING_OPS(VMalloc)
+
extern bool is_free_buddy_page(struct page *page); __PAGEFLAG(Isolated, isolated, PF_ANY);
diff --git a/include/uapi/linux/kernel-page-flags.h b/include/uapi/linux/kernel-page-flags.h
index 21b9113c69da..6800968b8f47
--- a/include/uapi/linux/kernel-page-flags.h
+++ b/include/uapi/linux/kernel-page-flags.h
@@ -, +, @@
#define KPF_ZERO_PAGE 24
#define KPF_IDLE 25
#define KPF_PGTABLE 26
+#define KPF_VMALLOC		27

 #endif /* _UAPILINUX_KERNEL_PAGE_FLAGS_H */
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 5fbf27e7f956..98bc690d472d
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ ... @@ static void __vunmap(const void *addr, int deallocate_pages)
		for (i = 0; i < area->nr_pages; i++) {
			struct page *page = area->pages[i];

-			BUG_ON(!page);
+			__ClearPageVMalloc(page);
			__free_pages(page, 0);
		}
@@ ... @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
area->nr_pages = i;
goto fail;
}
+ __SetPageVMalloc(page);
+ page->vm_area = area;
+ page->vm_offset = i;
area->pages[i] = page;
if (gfpflags_allow_blocking(gfp_mask))
cond_resched();
diff --git a/tools/vm/page-types.c b/tools/vm/page-types.c
index cce853dca691..25cc21855be4
--- a/tools/vm/page-types.c
+++ b/tools/vm/page-types.c
@@ -, +, @@ static const char * const page_flag_names[] = {
[KPF_THP] = "t:thp",
[KPF_BALLOON] = "o:balloon",
[KPF_PGTABLE] = "g:pgtable",
+ [KPF_VMALLOC] = "V:vmalloc",
[KPF_ZERO_PAGE] = "z:zero_page",
[KPF_IDLE] = "i:idle_page",

最新文章

  1. python cookbook 学习系列(一) python中的装饰器
  2. ImageLoader1
  3. Java的多态
  4. mysql 常用基础
  5. ubuntu 挂载windows共享目录的方法
  6. Vernam密码
  7. 【LeetCode】237 &amp; 203 - Delete Node in a Linked List &amp; Remove Linked List Elements
  8. 【C++对象模型】构造函数语意学之一 默认构造函数
  9. 一般php创建的文件默认不是utf-8格式的,在网上搜罗的解决办法如下:
  10. cglib源码学习交流
  11. iOS设备per app vpn,什么是什么系统的要求,必须?
  12. 学习Java 以及对几大基本排序算法(对算法笔记书的研究)的一些学习总结(Java对算法的实现持续更新中)
  13. SK-Learn 全家福
  14. 将缓冲区的数字字符串转化成BCD码数据_INT PubNumericToBCDStr(_UCHR *pcNStr, _INT iNLen, _UCHR *pcBCDStr)
  15. F# 之旅(下)
  16. go语言nsq源码解读七 lookup_protocol_v1.go
  17. 前端笔记之JavaScript(九)定时器&amp;JSON&amp;同步异步/回调函数&amp;函数节流&amp;call/apply
  18. C机器级移位,编码表示 无符号编码表示,有符号编码表示一般最常见的方式是补码
  19. IDEA中Maven项目使用Junit4单元测试的写法
  20. Hiero中修改BinView中binitem对象名字的方法

热门文章

  1. Spring的核心jar包
  2. Fiddler抓取手机APP程序数据包
  3. 在线预览(pptx、ppt、pps、docx、doc、xlsx、xls)
  4. Go学习笔记【一、概述】
  5. shell查询MySQL并将结果写入文件中
  6. 2.NumPy简介
  7. Java语言基础(13)
  8. 广告域名审核之后跳转技术:点击域名A页面iframe框架下的链接,域名A跳转到域名B
  9. memset初始化数组的坑
  10. HDU - 4992 Primitive Roots (原根)