Data Structures: Key Structs in Memory Management
The data structures involved in Linux kernel memory management are numerous and intricate; understanding the meaning of the important fields in them is essential to understanding memory management.
Physical page (struct page) related data structures
page
Defined in: include/linux/mm_types.h
/*
* Each physical page in the system has a struct page associated with
* it to keep track of whatever it is we are using the page for at the
* moment. Note that we have no way to track which tasks are using
* a page, though if it is a pagecache page, rmap structures can tell us
* who is mapping it.
*
* The objects in struct page are organized in double word blocks in
* order to allow us to use atomic double word operations on portions
* of struct page. That is currently only used by slub but the arrangement
* allows the use of atomic double word operations on the flags/mapping
* and lru list pointers also.
*/
struct page {
/* First double word block */
unsigned long flags; /* Atomic flags, some possibly
* updated asynchronously */
union {
struct address_space *mapping; /* If low bit clear, points to
* inode address_space, or NULL.
* If page mapped as anonymous
* memory, low bit is set, and
* it points to anon_vma object:
* see PAGE_MAPPING_ANON below.
*/
void *s_mem; /* slab first object */
atomic_t compound_mapcount; /* first tail page */
/* page_deferred_list().next -- second tail page */
};
/* Second double word */
union {
pgoff_t index; /* Our offset within mapping. */
void *freelist; /* sl[aou]b first free object */
////【While this slab is attached to a kmem_cache_cpu, to avoid locking it holds the list of objects freed on that cpu;
////the free objects available for allocation on that cpu hang off kmem_cache_cpu->freelist instead.
////During kmem_cache_cpu initialization, and while the slab sits on a kmem_cache_node, it holds the address of the first object of the slab's free-object list.】
/* page_deferred_list().prev -- second tail page */
};
union {
#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
/* Used for cmpxchg_double in slub */
unsigned long counters; ////【guarantees atomic (cmpxchg_double) updates of the freelist above】
#else
/*
* Keep _refcount separate from slub cmpxchg_double data.
* As the rest of the double word is protected by slab_lock
* but _refcount is not.
*/
unsigned counters;
#endif
struct {
union {
/*
* Count of ptes mapped in mms, to show when
* page is mapped & limit reverse map searches.
*
* Extra information about page type may be
* stored here for pages that are never mapped,
* in which case the value MUST BE <= -2.
* See page-flags.h for more details.
*/
atomic_t _mapcount;
unsigned int active; /* SLAB */
struct { /* SLUB */
unsigned inuse:16; ////【number of objects in use】
unsigned objects:15; ////【total number of objects in this slab】
unsigned frozen:1; ////【whether this slab is frozen, i.e. held in some cpu's per-cpu structure】
};
int units; /* SLOB */
};
/*
* Usage count, *USE WRAPPER FUNCTION* when manual
* accounting. See page_ref.h
*/
atomic_t _refcount;
};
};
/*
* Third double word block
*
* WARNING: bit 0 of the first word encode PageTail(). That means
* the rest users of the storage space MUST NOT use the bit to
* avoid collision and false-positive PageTail().
*/
union {
struct list_head lru; /* Pageout list, eg. active_list
* protected by zone_lru_lock !
* Can be used as a generic list
* by the page owner.
*/ ////【kmem_cache_node strings its partial slabs together through lru】
struct dev_pagemap *pgmap; /* ZONE_DEVICE pages are never on an
* lru or handled by a slab
* allocator, this points to the
* hosting device page map.
*/
struct { /* slub per cpu partial pages */
struct page *next; /* Next partial slab */ ////【kmem_cache_cpu follows next to reach the next slab on its partial list】
#ifdef CONFIG_64BIT
int pages; /* Nr of partial slabs left */ ////【number of partial slabs left on this kmem_cache_cpu】
int pobjects; /* Approximate # of objects */ ////【approximate total objects in those partial slabs】
#else
short int pages; ////【same as the 64-bit field above】
short int pobjects; ////【same as the 64-bit field above】
#endif
};
struct rcu_head rcu_head; /* Used by SLAB
* when destroying via RCU
*/ ////【list head used when the slab is freed via RCU】
/* Tail pages of compound page */
struct {
unsigned long compound_head; /* If bit zero is set */
/* First tail page only */
#ifdef CONFIG_64BIT
/*
* On 64 bit system we have enough space in struct page
* to encode compound_dtor and compound_order with
* unsigned int. It can help compiler generate better or
* smaller code on some architectures.
*/
unsigned int compound_dtor;
unsigned int compound_order;
#else
unsigned short int compound_dtor;
unsigned short int compound_order;
#endif
};
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && USE_SPLIT_PMD_PTLOCKS
struct {
unsigned long __pad; /* do not overlay pmd_huge_pte
* with compound_head to avoid
* possible bit 0 collision.
*/
pgtable_t pmd_huge_pte; /* protected by page->ptl */
};
#endif
};
/* Remainder is not double word aligned */
union {
unsigned long private; /* Mapping-private opaque data:
* usually used for buffer_heads
* if PagePrivate set; used for
* swp_entry_t if PageSwapCache;
* indicates order in the buddy
* system if PG_buddy is set.
*/
#if USE_SPLIT_PTE_PTLOCKS
#if ALLOC_SPLIT_PTLOCKS
spinlock_t *ptl;
#else
spinlock_t ptl;
#endif
#endif
struct kmem_cache *slab_cache; /* SL[AU]B: Pointer to slab */ ////【points back to the owning slab cache】
};
#ifdef CONFIG_MEMCG
struct mem_cgroup *mem_cgroup;
#endif
/*
* On machines where all RAM is mapped into kernel address space,
* we can simply calculate the virtual address. On machines with
* highmem some memory is mapped into kernel virtual memory
* dynamically, so we need a place to store that address.
* Note that this field could be 16 bits on x86 ... ;)
*
* Architectures with slow multiplication can define
* WANT_PAGE_VIRTUAL in asm/page.h
*/
#if defined(WANT_PAGE_VIRTUAL)
void *virtual; /* Kernel virtual address (NULL if
not kmapped, ie. highmem) */
#endif /* WANT_PAGE_VIRTUAL */
#ifdef CONFIG_KMEMCHECK
/*
* kmemcheck wants to track the status of each byte in a page; this
* is a pointer to such a status block. NULL if not tracked.
*/
void *shadow;
#endif
#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
int _last_cpupid;
#endif
}
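The warning on the third double word block above is worth unpacking: in a compound page, every tail page stores the head page's address in compound_head with bit 0 set (struct page is word-aligned, so bit 0 is otherwise free), which is why no other user of that union may set bit 0. A minimal sketch of the decoding, modeled on compound_head()/PageTail() in include/linux/page-flags.h (the my_ names are ours, not the kernel's):

static inline int my_page_tail(struct page *page)
{
	/* Tail pages have bit 0 of compound_head set. */
	return page->compound_head & 1;
}

static inline struct page *my_compound_head(struct page *page)
{
	unsigned long head = page->compound_head;

	if (head & 1)
		return (struct page *)(head - 1); /* strip the tag bit */
	return page; /* head page, or not a compound page at all */
}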
Slab allocator related data structures
The structures below are described using the slub allocator as the example; for slab/slob you can follow the same pattern and look up the analogous definitions in the kernel source.
kmem_cache
Defined in: include/linux/slub_def.h
/*
* Slab cache management.
*/
struct kmem_cache {
struct kmem_cache_cpu __percpu *cpu_slab; ////【per-cpu management structure for the slab each cpu currently owns exclusively】
/* Used for retriving partial slabs etc */
unsigned long flags; ////【property flags of this slab cache, e.g. whether debug areas such as redzones or KASAN checking are enabled】
unsigned long min_partial; ////【minimum number of partial slabs this slab cache keeps on each NUMA node】
int size; /* The size of an object including meta data */
int object_size; /* The size of an object without meta data */
int offset; /* Free pointer offset. */
int cpu_partial; /* Number of per cpu partial objects to keep around */ ////【upper bound on the total objects held on each cpu's partial list】
struct kmem_cache_order_objects oo; ////【page order for one slab in the high 16 bits, number of objects per slab in the low 16 bits】
/* Allocation and freeing of slabs */
struct kmem_cache_order_objects max;
struct kmem_cache_order_objects min; ////【encoded like oo above but holding the minimum; tried as a fallback when allocation with oo fails】
gfp_t allocflags; /* gfp flags to use on each alloc */
int refcount; /* Refcount for slab cache destroy */ ////【reuse count; caches of similar size and compatible properties are merged to keep the number of slab caches down】
void (*ctor)(void *); ////【constructor run on each object when a slab is created; may be user-supplied or NULL】
int inuse; /* Offset to metadata */ ////【offset of object metadata within each object; with debugging, RCU, or a ctor it also equals the free-pointer offset s->offset】
int align; /* Alignment */ ////【object alignment, used to round up the object size】
int reserved; /* Reserved bytes at the end of slabs */ ////【bytes left unused at the end of a slab's contiguous pages, too small to fit another object】
const char *name; /* Name (only for display!) */ ////【name of this slab cache】
struct list_head list; /* List of slab caches */ ////【links this cache into the global list headed by slab_caches】
int red_left_pad; /* Left redzone padding size */ ////【size of the redzone placed below each object when redzone debugging is on; merged into mainline in March 2016】
#ifdef CONFIG_SYSFS
struct kobject kobj; /* For sysfs */
struct work_struct kobj_remove_work;
#endif
#ifdef CONFIG_MEMCG
struct memcg_cache_params memcg_params;
int max_attr_size; /* for propagation, maximum size of a stored attr */
#ifdef CONFIG_SYSFS
struct kset *memcg_kset;
#endif
#endif
#ifdef CONFIG_NUMA
/*
* Defragmentation by allocating from a remote node.
*/
int remote_node_defrag_ratio; ////【on NUMA systems, the smaller this value, the stronger the preference for allocating objects from the local node】
#endif
#ifdef CONFIG_SLAB_FREELIST_RANDOM
unsigned int *random_seq;
#endif
#ifdef CONFIG_KASAN
struct kasan_cache kasan_info; ////【bookkeeping used when KASAN is enabled】
#endif
struct kmem_cache_node *node[MAX_NUMNODES]; ////【per-node partial slab lists of this slab cache: one per node on NUMA, a single one on SMP】
};
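As the annotations on oo, max, and min note, a kmem_cache_order_objects value packs two numbers into one word. A sketch of the packing, modeled on the OO_SHIFT/oo_order()/oo_objects() helpers in mm/slub.c:

#define OO_SHIFT	16
#define OO_MASK		((1 << OO_SHIFT) - 1)

struct kmem_cache_order_objects {
	unsigned long x;	/* (page order << OO_SHIFT) | objects per slab */
};

static inline int oo_order(struct kmem_cache_order_objects x)
{
	return x.x >> OO_SHIFT;	/* order of the page allocation backing one slab */
}

static inline int oo_objects(struct kmem_cache_order_objects x)
{
	return x.x & OO_MASK;	/* number of objects that fit in one slab */
}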
kmem_cache_cpu
Defined in: include/linux/slub_def.h
struct kmem_cache_cpu {
void **freelist; /* Pointer to next available object */ ////【free-object list of this cpu's current slab; each allocation takes one object from here】
unsigned long tid; /* Globally unique transaction id */ ////【mainly used to detect concurrent access: sampled before and after an operation and compared; bumped by TID_STEP on every allocation, and the step is configurable】
struct page *page; /* The slab from which we are allocating */ ////【descriptor of the slab this cpu allocates from, reusing struct page fields, just as page describes a physical page frame】
struct page *partial; /* Partially allocated frozen slabs */ ////【this cpu's private list of partially empty slabs; present only with CONFIG_SLUB_CPU_PARTIAL, in which case the slabs are chained through page->next】
#ifdef CONFIG_SLUB_STATS
unsigned stat[NR_SLUB_STAT_ITEMS];
#endif
};
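The freelist/tid pair is what makes the allocation fast path lockless: the path snapshots both, computes the next freelist head, and commits with a per-cpu cmpxchg-double that fails if any other allocation or free on that cpu ran in between (which would have bumped tid). A condensed sketch of that pattern, loosely following slab_alloc_node() in mm/slub.c and omitting the slow-path fallback and preemption details:

static void *fastpath_alloc(struct kmem_cache *s)
{
	struct kmem_cache_cpu *c;
	unsigned long tid;
	void *object;

	do {
		c = raw_cpu_ptr(s->cpu_slab);
		tid = c->tid;		/* snapshot the transaction id */
		object = c->freelist;	/* candidate object */
		if (!object)
			return NULL;	/* the real code falls back to the slow path */
		/* Commit only if (freelist, tid) still match the snapshot,
		 * i.e. nothing else ran on this cpu in between. */
	} while (!this_cpu_cmpxchg_double(s->cpu_slab->freelist,
					  s->cpu_slab->tid,
					  object, tid,
					  get_freepointer(s, object),
					  tid + TID_STEP));
	return object;
}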
kmem_cache_node
Defined in: mm/slab.h
/*
* The slab lists for all objects.
*/
struct kmem_cache_node {
spinlock_t list_lock;
#ifdef CONFIG_SLAB
struct list_head slabs_partial; /* partial list first, better asm code */
struct list_head slabs_full;
struct list_head slabs_free;
unsigned long total_slabs; /* length of all slab lists */
unsigned long free_slabs; /* length of free slab list only */
unsigned long free_objects;
unsigned int free_limit;
unsigned int colour_next; /* Per-node cache coloring */
struct array_cache *shared; /* shared per node */
struct alien_cache **alien; /* on other nodes */
unsigned long next_reap; /* updated without locking */
int free_touched; /* updated without locking */
#endif
#ifdef CONFIG_SLUB
unsigned long nr_partial; ////【number of partial slabs on this node】
struct list_head partial; ////【circular doubly linked list chaining the node's partial slabs through page->lru】
#ifdef CONFIG_SLUB_DEBUG
atomic_long_t nr_slabs; ////【total number of slabs on this node】
atomic_long_t total_objects; ////【total number of objects on this node】
struct list_head full; ////【list of slabs on this node whose objects are all allocated】
#endif
#endif
};
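As annotated above, the node-level partial list chains slabs through the lru field of their struct page. A sketch of how a walk over it looks, in the spirit of get_partial_node() in mm/slub.c; take_slab_object() here is a hypothetical stand-in for the real acquire_slab() logic:

static void *scan_node_partial(struct kmem_cache *s, struct kmem_cache_node *n)
{
	struct page *page;
	void *object = NULL;

	spin_lock(&n->list_lock);
	/* Each list node is the lru field embedded in a slab's struct page. */
	list_for_each_entry(page, &n->partial, lru) {
		object = take_slab_object(s, n, page); /* hypothetical helper */
		if (object)
			break;
	}
	spin_unlock(&n->list_lock);
	return object;
}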
Buddy allocator related data structures
@2018-03-29 11:58