page cache 与free

我们经常用free查看服务器的内存使用情况，而free中的输出却有些让人困惑，如下：

先看看各个数字的意义以及如何计算得到：

free命令输出的第二行(Mem)：这行分别显示了物理内存的总量(total)、已使用的(used)、空闲的(free)、共享的(shared)、buffers(buffer的大小)、cached(cache的大小)的内存。我们知道total、free、buffers、cached这几个字段是从/proc/meminfo中获取的，而used = total - free。shared列已经过时，忽略(见参考)。

free命令输出的第三行(-/+ buffers/cache)：

它显示的第一个值(used)：210236，这个值表示系统本身使用的内存总量，即除去buffer/cache，等于Mem行used列 - Mem行buffers列 - Mem行cached列。

它显示的第二个值(free)：814956，这个值表示系统当前可用内存，它等于Mem行total列 - (-/+ buffers/cache)行used列，也等于Mem行free列 + Mem行buffers列 + Mem行cached列。

free命令输出的第四行(Swap) 这行显示交换内存的总量、已使用量、空闲量。

我们都知道free是从/proc/meminfo中读取相关的数据的。

下面是/proc/meminfo的实现：

/*
 * Build the text served for /proc/meminfo into @page.
 *
 * This listing is an excerpt: the lines marked "......" are elisions kept
 * from the original article, so the remaining format lines/arguments, the
 * return statement, and the uses of @start, @off, @count, @eof, @data,
 * committed, allowed and vmi are not visible here.
 */
static int meminfo_read_proc(char *page, char **start, off_t off,
                 int count, int *eof, void *data)
{
    struct sysinfo i;
    int len;
    unsigned long committed;
    unsigned long allowed;
    struct vmalloc_info vmi;
    long cached;

/*
 * display in kilobytes.
 * (x is a page count; shifting by PAGE_SHIFT - 10 turns pages into kB)
 */
#define K(x) ((x) << (PAGE_SHIFT - 10))

    /* Fill in RAM counters, then swap counters, in units of pages. */
    si_meminfo(&i);
    si_swapinfo(&i);

    committed = atomic_read(&vm_committed_space);

    /* The overcommit ratio is treated as a percentage, hence "/ 100". */
    allowed = ((totalram_pages - hugetlb_total_pages())
        * sysctl_overcommit_ratio / 100) + total_swap_pages;

    /*
     * "Cached" is every file-backed page-cache page, minus the swap cache,
     * minus the pages already reported as "Buffers".
     */
    cached = global_page_state(NR_FILE_PAGES) -
            total_swapcache_pages - i.bufferram;
    /* Clamp a negative difference to zero. */
    if (cached < 0)
        cached = 0;

    get_vmalloc_info(&vmi);

    /*
     * Tagged format, for easy grepping and expansion.
     */
    len = sprintf(page,
        "MemTotal:     %8lu kB\n"
        "MemFree:      %8lu kB\n"
        "Buffers:      %8lu kB\n"
        "Cached:       %8lu kB\n"
        "SwapCached:   %8lu kB\n"
        ......
        K(i.totalram),
        K(i.freeram),
        K(i.bufferram),
        K(cached),
        K(total_swapcache_pages),
        ......
#undef K
}

/*
 * System statistics record filled in by si_meminfo()/si_swapinfo().
 * The memory fields are counts of units of mem_unit bytes.
 */
struct sysinfo {
    long uptime;                /* Seconds since boot */
    unsigned long loads[3];     /* 1, 5, and 15 minute load averages */
    unsigned long totalram;     /* Total usable main memory size */
    unsigned long freeram;      /* Available memory size */
    unsigned long sharedram;    /* Amount of shared memory */
    unsigned long bufferram;    /* Memory used by buffers */
    unsigned long totalswap;    /* Total swap space size */
    unsigned long freeswap;     /* swap space still available */
    unsigned short procs;       /* Number of current processes */
    unsigned short pad;         /* explicit padding for m68k */
    unsigned long totalhigh;    /* Total high memory size */
    unsigned long freehigh;     /* Available high memory size */
    unsigned int mem_unit;      /* Memory unit size in bytes */
    /*
     * Padding: libc5 uses this..
     * The size is 20 minus two longs and one int, which evaluates to 0
     * when long is 8 bytes and int is 4 bytes (zero-length array, GNU C).
     */
    char _f[20-2*sizeof(long)-sizeof(int)];
};

图中，Buffers对应sysinfo.bufferram，内核中以页框为单位，通过宏K转化成以KB为单位输出。

/*
 * Fill the RAM-related fields of @val.
 *
 * Every memory field is stored as a page count; mem_unit is set to
 * PAGE_SIZE so that consumers can convert to bytes. Swap fields are not
 * touched here.
 */
void si_meminfo(struct sysinfo *val)
{
    val->totalram = totalram_pages;                     /* total RAM pages */
    val->sharedram = 0;                                 /* always reported as 0 */
    val->freeram = global_page_state(NR_FREE_PAGES);    /* free pages */
    val->bufferram = nr_blockdev_pages();               /* pages cached for block devices */
    val->totalhigh = totalhigh_pages;
    val->freehigh = nr_free_highpages();
    val->mem_unit = PAGE_SIZE;
}

/*
 * Return the total number of pages held in the address-space mappings of
 * all block devices.
 *
 * Walks the all_bdevs list under bdev_lock and sums
 * bd_inode->i_mapping->nrpages for each entry. Only block-device inodes
 * are visited, so pages of regular files are not included.
 */
long nr_blockdev_pages(void)
{
    struct block_device *bdev;
    long ret = 0;

    spin_lock(&bdev_lock);
    list_for_each_entry(bdev, &all_bdevs, bd_list) {
        ret += bdev->bd_inode->i_mapping->nrpages;
    }
    spin_unlock(&bdev_lock);

    return ret;
}

nr_blockdev_pages计算块设备使用的页框数，遍历所有块设备，将使用的页框数相加。而不包含普通文件使用的页框数。

cached = global_page_state(NR_FILE_PAGES) - total_swapcache_pages - i.bufferram;

/*
 * Read the global VM statistics counter selected by @item.
 *
 * The counter is read as a signed long. On CONFIG_SMP builds a negative
 * reading is clamped to zero before being returned as unsigned long
 * (NOTE(review): presumably because per-CPU deltas may not have been folded
 * into the global counter yet; confirm against the vmstat implementation).
 */
static inline unsigned long global_page_state(enum zone_stat_item item)
{
    long x = atomic_long_read(&vm_stat[item]);

#ifdef CONFIG_SMP
    if (x < 0)
        x = 0;
#endif
    return x;
}

Cache的大小为内核总的page cache减去swap cache和块设备占用的页框数量，也就是说，cached即为普通文件占用的page cache。在函数add_to_page_cache和__add_to_swap_cache中，都会通过调用pagecache_acct实现对内核变量nr_pagecache进行累加。前者对应page cache，内核读块设备和普通文件使用；后者对应swap cache，内核读交换分区使用。

Page cache(页面缓存)

在linux系统中，为了加快文件的读写，内核中提供了page cache作为缓存，称为页面缓存(page cache)。为了加快对块设备的读写，内核中还提供了buffer cache作为缓存。在2.4内核中，这两者是分开的。这样就造成了双缓冲，因为文件读写最后还是转化为对块设备的读写。在2.6中，buffer cache合并到page cache中，对应的页面叫作buffer page。当进行文件读写时，如果文件在磁盘上的存储块是连续的，那么文件在page cache中对应的页是普通的page，如果文件在磁盘上的数据块是不连续的，或者是设备文件，那么文件在page cache中对应的页是buffer page。buffer page与普通的page相比，每个页多了几个buffer_head结构体(个数视块的大小而定)。此外，如果对单独的块（如超级块）直接进行读写，对应的page cache中的页也是buffer page。这两种页面虽然形式略有不同，但是最终他们的数据都会被封装成bio结构体，提交到通用块设备驱动层，统一进行I/O调度。

/**
 * Buffer head: descriptor for one block buffer.
 */
struct buffer_head {
    /* Buffer state bitmap, e.g. BH_Uptodate */
    unsigned long b_state;        /* buffer state bitmap (see above) */
    /* Next buffer head; both belong to the same cached page */
    struct buffer_head *b_this_page;/* circular list of page's buffers */
    /* Page holding this buffer if it is part of the page cache; NULL if it is independent of the page cache */
    struct page *b_page;        /* the page this bh is mapped to */
    /* Corresponding block number */
    sector_t b_blocknr;        /* start block number */
    /* Block length */
    size_t b_size;            /* size of mapping */
    /* Pointer to the data in memory */
    char *b_data;            /* pointer to data within the page */
    /* Backing device */
    struct block_device *b_bdev;
    /* Callback invoked by the kernel when the I/O operation completes */
    bh_end_io_t *b_end_io;        /* I/O completion */
    /* Reserved pointer for b_end_io; typically used by journaling filesystems */
     void *b_private;        /* reserved for b_end_io */
    struct list_head b_assoc_buffers; /* associated with another mapping */
    /* Address space this buffer is associated with */
    struct address_space *b_assoc_map;    /* mapping this buffer is
                           associated with */
    /* Usage counter */
    atomic_t b_count;        /* users using this buffer_head */
};

在kernel2.6之后，buffer_head没有别的作用，主要用来保持页框与块设备中数据块的映射关系。

Buffer page(缓冲页)

如果内核需要单独访问一个块，就会涉及到buffer page，并会检查对应的buffer head。

内核创建buffer page的两种常见情况：

(1)当读或者写一个文件页的数据块不相邻时。发生这种情况是因为文件系统为文件分配了非连续的块，或者文件有洞。具体请参见block_read_full_page(fs/buffer.c)函数:

/**
 * Read a whole page from its backing block device, one buffer at a time.
 *
 * @page:      locked page to fill (BUG_ON fires if it is not locked)
 * @get_block: filesystem callback that maps a file block to a disk block
 *
 * Stage one walks the page's buffers, maps the unmapped ones via
 * @get_block, zero-fills the ones that stay unmapped, and collects every
 * buffer that is mapped but not up to date. Stage two locks the collected
 * buffers; stage three submits reads for them.
 *
 * Returns 0 on every path. A @get_block failure is recorded with
 * SetPageError() rather than returned. When no I/O is needed the page is
 * unlocked here.
 */
int block_read_full_page(struct page *page, get_block_t *get_block)
{
    struct inode *inode = page->mapping->host;
    sector_t iblock, lblock;
    struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
    unsigned int blocksize;
    int nr, i;
    int fully_mapped = 1;

    BUG_ON(!PageLocked(page));
    blocksize = 1 << inode->i_blkbits;
    /* No buffers attached yet: create an empty set for this page. */
    if (!page_has_buffers(page))
        create_empty_buffers(page, blocksize, 0);
    /* First buffer head attached to the page. */
    head = page_buffers(page);

    /* iblock: file block index of the page's first block. */
    iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
    /* lblock: number of blocks needed to cover the file size, rounded up. */
    lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits;
    bh = head;
    nr = 0;
    i = 0;

    /*
     * Stage one: visit every buffer of the page. "continue" jumps to the
     * loop condition, so i, iblock and bh still advance.
     */
    do {
        /* Already up to date: nothing to read. */
        if (buffer_uptodate(bh))
            continue;

        if (!buffer_mapped(bh)) {
            int err = 0;

            fully_mapped = 0;
            /* Only ask for a mapping when the block lies within the file size. */
            if (iblock < lblock) {
                WARN_ON(bh->b_size != blocksize);
                /*
                 * Look up where this file block lives on disk.
                 * NOTE(review): the final 0 is presumably the
                 * "create" flag (lookup only) - confirm against
                 * the get_block_t contract.
                 */
                err = get_block(inode, iblock, bh, 0);
                if (err)
                    SetPageError(page);
            }
            /* Still unmapped (a hole or past the end): zero-fill it. */
            if (!buffer_mapped(bh)) {
                zero_user_page(page, i * blocksize, blocksize,
                        KM_USER0);
                if (!err)
                    set_buffer_uptodate(bh);
                continue;
            }
            /*
             * get_block() might have updated the buffer
             * synchronously
             */
            if (buffer_uptodate(bh))
                continue;
        }
        /* Mapped but stale: remember it for the read stages below. */
        arr[nr++] = bh;
    } while (i++, iblock++, (bh = bh->b_this_page) != head);

    if (fully_mapped)
        SetPageMappedToDisk(page);

    if (!nr) {
        /*
         * All buffers are uptodate - we can set the page uptodate
         * as well. But not if get_block() returned an error.
         */
        if (!PageError(page))
            SetPageUptodate(page);
        unlock_page(page);
        return 0;
    }

    /* Stage two: lock the buffers */
    for (i = 0; i < nr; i++) {
        bh = arr[i];
        lock_buffer(bh);
        mark_buffer_async_read(bh);
    }

    /*
     * Stage 3: start the IO.  Check for uptodateness
     * inside the buffer lock in case another process reading
     * the underlying blockdev brought it uptodate (the sct fix).
     */
    for (i = 0; i < nr; i++) {
        bh = arr[i];
        if (buffer_uptodate(bh))
            /*
             * Someone else filled it meanwhile: complete it directly.
             * NOTE(review): the 1 is presumably the "uptodate" flag.
             */
            end_buffer_async_read(bh, 1);
        else
            submit_bh(READ, bh);    /* submit the read request */
    }
    return 0;
}

这里使用buffer head主要是通过buffer head建立页框与数据块的映射关系。因为页面中的数据在磁盘上不是连续的，而页框描述符struct page的字段又不足以表达这种信息。

该函数会调用create_empty_buffers来创建一组全新的缓冲区，并与page关联起来

/**

 * 创建一组全新的缓冲区，以便与页关联

 */

void create_empty_buffers(struct page *page,

            unsigned long blocksize, unsigned long b_state)

{

    struct buffer_head *bh, *head, *tail;

    /* 创建所需要数目的缓冲头，并将其形成一个链表，返回第一个缓冲头 */

    head = alloc_page_buffers(page, blocksize, 1);

    /* 设置所有缓冲头的状态，并将缓冲头形成一个环形链表 */

    bh = head;

    do {

        bh->b_state |= b_state;

        tail = bh;

        bh = bh->b_this_page;

    } while (bh);

    tail->b_this_page = head;

    /* 根据页面状态设置块缓冲区的状态 */

    spin_lock(&page->mapping->private_lock);

    if (PageUptodate(page) || PageDirty(page)) {

        bh = head;

        do {/* 更新每一个缓冲头的状态 */

            if (PageDirty(page))

                set_buffer_dirty(bh);

            if (PageUptodate(page))

                set_buffer_uptodate(bh);

            bh = bh->b_this_page;

        } while (bh != head);

    }

    /* 将缓冲区关联到页面 */

    attach_page_buffers(page, head);

    spin_unlock(&page->mapping->private_lock);

}

create_empty_buffers调用alloc_page_buffers来创建一组buffer head链表，但还不是循环链表：

/*
 * Allocate the buffer heads needed to cover @page with buffers of @size
 * bytes and return them as a NULL-terminated list (not yet a ring).
 *
 * Offsets are visited from the end of the page downwards and every new
 * buffer head is pushed at the front, so the returned list is ordered by
 * ascending offset within the page.
 *
 * This listing is an excerpt: the allocation-failure path (label no_grow),
 * the jump back to try_again and the use of @retry are elided ("......").
 */
struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
        int retry)
{
    struct buffer_head *bh, *head;
    long offset;

try_again:
    head = NULL;
    offset = PAGE_SIZE;
    /* offset is signed so that the loop ends once it drops below zero. */
    while ((offset -= size) >= 0) {
        bh = alloc_buffer_head(GFP_NOFS);
        if (!bh)
            goto no_grow;

        bh->b_bdev = NULL;
        bh->b_this_page = head;    /* push at the front of the list */
        bh->b_blocknr = -1;        /* no block number assigned yet */
        head = bh;

        bh->b_state = 0;
        atomic_set(&bh->b_count, 0);
        bh->b_private = NULL;
        bh->b_size = size;

        /* Link the buffer to its page */
        set_bh_page(bh, page, offset);

        init_buffer(bh, NULL, NULL);
    }
    return head;
......
}

alloc_page_buffers调用set_bh_page来设置b_data.

/*
 * Bind buffer head @bh to @page at byte @offset within the page.
 *
 * Sets b_page and b_data. @offset must be smaller than PAGE_SIZE
 * (BUG_ON otherwise). For a highmem page b_data holds only the offset
 * instead of an address; otherwise it is the page's address plus @offset.
 */
void set_bh_page(struct buffer_head *bh,
        struct page *page, unsigned long offset)
{
    bh->b_page = page;
    BUG_ON(offset >= PAGE_SIZE);
    if (PageHighMem(page))
        /*
         * This catches illegal uses and preserves the offset:
         */
        bh->b_data = (char *)(0 + offset);
    else
        bh->b_data = page_address(page) + offset;
}

(2)访问一个单独的磁盘块(比如，读超级块或者索引节点块时)。参见ext2_fill_super(fs/ext2/super.c)，该函数在安装ext2文件系统时调用。

Buffer page和buffer head的关系：

由代码可知，每个buffer_head对应磁盘上的一个block。page cache中的一个buffer page由N（N = PAGE_SIZE/BLOCK_SIZE）个buffer_head来描述。

巴特西

page cache 与free

最新文章

热门文章