SLUB算法 之前我们讲解了伙伴系统,他的层次在于分配以page为单位的页面,但在平时我们程序使用的这么大个单位的页面机会很少,我们更倾向于分配小块例如0x2e0
的块,而此时如果再次调用伙伴系统的分配就有点杀鸡用牛刀的感觉,所以此时Linux内核引入小块的分配器,也就是
slab allocator
他存在着三种实现形式:slab
,slob
,slub
1. 数据结构们 struct slab 首先就是我们比较主要的数据结构slab
,他曾经是被嵌入在page
当中,但是目前最新内核版本6.4,他被作为一个独立的数据结构进行表示,当然实际上也是复用了page的结构体,具体如下:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 struct slab { unsigned long __page_flags;#if defined(CONFIG_SLAB) ... ...#elif defined(CONFIG_SLUB) struct kmem_cache *slab_cache ; union { struct { union { struct list_head slab_list ; #ifdef CONFIG_SLUB_CPU_PARTIAL struct { struct slab *next ; int slabs; };#endif }; void *freelist; union { unsigned long counters; struct { unsigned inuse:16 ; unsigned objects:15 ; unsigned frozen:1 ; }; }; }; struct rcu_head rcu_head ; }; unsigned int __unused;#else #error "Unexpected slab allocator configured" #endif atomic_t __page_refcount;#ifdef CONFIG_MEMCG unsigned long memcg_data;#endif };
上面我们已经知道slab就是复用的page结构体,我们的page结构体是用来表示物理内存中的一个内存页,我们可以通过一个page结构体找到物理内存页的页框号,同样的,当我们用slab来复用page结构体的时候,我们也可以通过他来得到物理内存页的页框号,下面是其中的一些字段解释:
__page_flags
字段:他用来标识页面的相关信息
slab_cache
字段:他是一个kmem_cache
类型的指针,该结构体我们下面讲解
freelist
:该字段指向第一个空闲的object,耗尽时为NULL
slab_list
:连接多个slab的双向链表
inuse
:已经被使用的object数
objects
:总共的object数
frozen
:标志是否(1/0)被冻结,也就是该slab是否被某个CPU的kmem_cache_cpu(即cpu_slab)绑定为当前使用的slab
下面的这张图就是关于目前的slab结构体的相关结构,我们慢慢来充实该图
其中freelist字段指向的块就是我们先前page页所代表的页框中的某块。
struct kmem_cache 要管理小块的分配,单单一个复用page结构的slab还不够,所以这里引入我们的管理者kmem_cache
,如下
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 struct kmem_cache {#ifndef CONFIG_SLUB_TINY struct kmem_cache_cpu __percpu *cpu_slab ;#endif slab_flags_t flags; unsigned long min_partial; unsigned int size; unsigned int object_size; struct reciprocal_value reciprocal_size ; unsigned int offset; #ifdef CONFIG_SLUB_CPU_PARTIAL unsigned int cpu_partial; unsigned int cpu_partial_slabs;#endif struct kmem_cache_order_objects oo ; struct kmem_cache_order_objects min ; gfp_t allocflags; int refcount; void (*ctor)(void *); unsigned int inuse; unsigned int align; unsigned int red_left_pad; const char *name; struct list_head list ; #ifdef CONFIG_SYSFS struct kobject kobj ; #endif #ifdef CONFIG_SLAB_FREELIST_HARDENED unsigned long random;#endif #ifdef CONFIG_NUMA unsigned int remote_node_defrag_ratio;#endif #ifdef CONFIG_SLAB_FREELIST_RANDOM unsigned int *random_seq;#endif #ifdef CONFIG_KASAN_GENERIC struct kasan_cache kasan_info ;#endif #ifdef CONFIG_HARDENED_USERCOPY unsigned int useroffset; unsigned int usersize; #endif struct kmem_cache_node *node [MAX_NUMNODES ]; };
其中最主要的两个字段如下:
struct kmem_cache_cpu
cpu_slab
: 类型为kmem_cache_cpu
指针,一个cpu的空闲对象缓存,其大致定义如下:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 struct kmem_cache_cpu { void **freelist; unsigned long tid; struct slab *slab ; #ifdef CONFIG_SLUB_CPU_PARTIAL struct slab *partial ; #endif local_lock_t lock; #ifdef CONFIG_SLUB_STATS unsigned stat[NR_SLUB_STAT_ITEMS];#endif };
其中有下面几点需要注意:
freelist:指向下一个分配对象的指针
slab:被当前使用来分配内存的slab
partial:链表上为仍有一定空闲object的slab
其中slab->freelist
在被kmem_cache_cpu
使用时为NULL,是个无效值,只有当它处于partial链表时才会有效
struct kmem_cache_node
node[MAX_NUMNODES]
:他是一个指针数组,其中每一个元素都指向一个kmem_cache_node
,他是NUMA架构中每个node节点的缓存,如下:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 struct kmem_cache_node {#ifdef CONFIG_SLAB ... ...#endif #ifdef CONFIG_SLUB spinlock_t list_lock; unsigned long nr_partial; struct list_head partial ; #ifdef CONFIG_SLUB_DEBUG atomic_long_t nr_slabs; atomic_long_t total_objects; struct list_head full ; #endif #endif };
我们在初始化分配器的时候,会定义一个全局变量kmalloc_caches
,如下:
1 2 3 struct kmem_cache *kmalloc_caches [NR_KMALLOC_TYPES ][KMALLOC_SHIFT_HIGH + 1] __ro_after_init = { }; EXPORT_SYMBOL(kmalloc_caches);
其中它的行数由以下枚举类型决定
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 enum kmalloc_cache_type { KMALLOC_NORMAL = 0 ,#ifndef CONFIG_ZONE_DMA KMALLOC_DMA = KMALLOC_NORMAL,#endif #ifndef CONFIG_MEMCG_KMEM KMALLOC_CGROUP = KMALLOC_NORMAL,#endif #ifdef CONFIG_SLUB_TINY KMALLOC_RECLAIM = KMALLOC_NORMAL,#else KMALLOC_RECLAIM,#endif #ifdef CONFIG_ZONE_DMA KMALLOC_DMA,#endif #ifdef CONFIG_MEMCG_KMEM KMALLOC_CGROUP,#endif NR_KMALLOC_TYPES };
他的值主要靠编译时期的配置来决定管理不同内存段的slab,例如64位的dma
,normal
等等。
而它的列数则主要由以下两个宏决定
1 2 #define KMALLOC_SHIFT_HIGH (PAGE_SHIFT + 1) #define PAGE_SHIFT 12
所以说我们的kmalloc_caches
共有14列,行数需通过配置来决定
下面我们来完善上面的图
2.分配过程 首先我们拿__kmalloc
函数来分析,他的实现如下,调用另一个函数__do_kmalloc_node
1 2 3 4 void *__kmalloc(size_t size, gfp_t flags) { return __do_kmalloc_node(size, flags, NUMA_NO_NODE, _RET_IP_); }
之后该函数又经过层层封装继续向下调用,完整调用链如下:
1 2 3 4 5 6 7 __kmalloc __do_kmalloc_node __kmem_cache_alloc_node slab_alloc_node __slab_alloc_node __slab_alloc ___slab_alloc
下面我们来首先分析slab_alloc_node
函数,如下:
slab_alloc_node 一切尽在注释中:happy:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 static __fastpath_inline void *slab_alloc_node (struct kmem_cache *s, struct list_lru *lru, gfp_t gfpflags, int node, unsigned long addr, size_t orig_size) { void *object; struct obj_cgroup *objcg = NULL ; bool init = false ; s = slab_pre_alloc_hook(s, lru, &objcg, 1 , gfpflags); if (!s) return NULL ; object = kfence_alloc(s, orig_size, gfpflags); if (unlikely(object)) goto out; object = __slab_alloc_node(s, gfpflags, node, addr, orig_size); maybe_wipe_obj_freeptr(s, object); init = slab_want_init_on_alloc(gfpflags, s); out: slab_post_alloc_hook(s, objcg, gfpflags, 1 , &object, init, orig_size); return object; }
上面注释可以看到,首先会调用kfence_alloc
进行检测,若kfence直接返回了对象则跳转到out,否则进入核心分配函数
,接着转到我们的核心函数__slab_alloc_node
,通过他获取object的之后,再调用maybe_wipe_obj_freeptr
,他把所获得的object的freelist指针清空来进行初始化,然后调用slab_want_init_on_alloc
函数,他通过判断标志位是否含有__GFP_ZERO
来进行赋0值的清空操作。
__slab_alloc_node 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 static __always_inline void *__slab_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node, unsigned long addr, size_t orig_size) { struct kmem_cache_cpu *c ; struct slab *slab ; unsigned long tid; void *object; redo: c = raw_cpu_ptr(s->cpu_slab); tid = READ_ONCE(c->tid); barrier(); object = c->freelist; slab = c->slab; if (!USE_LOCKLESS_FAST_PATH() || unlikely(!object || !slab || !node_match(slab, node))) { object = __slab_alloc(s, gfpflags, node, addr, c, orig_size); } else { void *next_object = get_freepointer_safe(s, object); if (unlikely(!this_cpu_cmpxchg_double( s->cpu_slab->freelist, s->cpu_slab->tid, object, tid, next_object, next_tid(tid)))) { note_cmpxchg_failure("slab_alloc" , s, tid); goto redo; } prefetch_freepointer(s, next_object); stat(s, ALLOC_FASTPATH); } return object; }
该函数若是在较为理想的情况下会直接从cpu_slab
的freelist
进行分配,否则会通过__slab_alloc
分配新的slab
__slab_alloc 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size) { void *p;#ifdef CONFIG_PREEMPT_COUNT c = slub_get_cpu_ptr(s->cpu_slab);#endif p = ___slab_alloc(s, gfpflags, node, addr, c, orig_size);#ifdef CONFIG_PREEMPT_COUNT slub_put_cpu_ptr(s->cpu_slab);#endif return p; }
上面函数也没干啥,就是重新获取cpu_slab用来防止cpu切换,这里调用___slab_alloc
___slab_alloc 接下来就是咱们的重磅函数,如下:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size) { void *freelist; struct slab *slab ; unsigned long flags; struct partial_context pc ; stat(s, ALLOC_SLOWPATH); ... }
下面我们将一步一步解析该部分源码,内核设计他时刚好自行将其分了块,我们就根据这些块来讲解:
:one: reread_slab 1 2 3 4 5 6 7 8 9 10 11 12 reread_slab: slab = READ_ONCE(c->slab); if (!slab) { if (unlikely(node != NUMA_NO_NODE && !node_isset(node, slab_nodes))) node = NUMA_NO_NODE; goto new_slab; }
:two:redo 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 redo: if (unlikely(!node_match(slab, node))) { if (!node_isset(node, slab_nodes)) { node = NUMA_NO_NODE; } else { stat(s, ALLOC_NODE_MISMATCH); goto deactivate_slab; } } if (unlikely(!pfmemalloc_match(slab, gfpflags))) goto deactivate_slab; local_lock_irqsave(&s->cpu_slab->lock, flags); if (unlikely(slab != c->slab)) { local_unlock_irqrestore(&s->cpu_slab->lock, flags); goto reread_slab; } freelist = c->freelist; if (freelist) goto load_freelist; freelist = get_freelist(s, slab); if (!freelist) { c->slab = NULL ; c->tid = next_tid(c->tid); local_unlock_irqrestore(&s->cpu_slab->lock, flags); stat(s, DEACTIVATE_BYPASS); goto new_slab; } stat(s, ALLOC_REFILL);
这里的get_freelist函数会将当前cpu的slab->freelist取出返回并将其置空;若freelist已耗尽(为NULL),则解冻(停用)该slab
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 #ifndef CONFIG_SLUB_TINY static inline void *get_freelist (struct kmem_cache *s, struct slab *slab) { struct slab new ; unsigned long counters; void *freelist; lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock)); do { freelist = slab->freelist; counters = slab->counters; new.counters = counters; VM_BUG_ON(!new.frozen); new.inuse = slab->objects; new.frozen = freelist != NULL ; } while (!__cmpxchg_double_slab(s, slab, freelist, counters, NULL , new.counters, "get_freelist" )); return freelist; }
:three:load_freelist 1 2 3 4 5 6 7 8 9 10 11 12 13 14 load_freelist: lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock)); VM_BUG_ON(!c->slab->frozen); c->freelist = get_freepointer(s, freelist); c->tid = next_tid(c->tid); local_unlock_irqrestore(&s->cpu_slab->lock, flags); return freelist;
这里的get_freepointer
函数最终会调用下面这个freelist_ptr
,且参数为本kmem_cache
和当前object指向的下一个object的地址,如下:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 static inline void *freelist_ptr (const struct kmem_cache *s, void *ptr, unsigned long ptr_addr) {#ifdef CONFIG_SLAB_FREELIST_HARDENED return (void *)((unsigned long )ptr ^ s->random ^ swab ((unsigned long )kasan_reset_tag ((void *)ptr_addr)));#else return ptr;#endif }
一般如果我们运行到这里就可以返回一个空闲object了,但是本函数代码还并未结束
:four:deactivate_slab 1 2 3 4 5 6 7 8 9 10 11 12 13 14 deactivate_slab: local_lock_irqsave(&s->cpu_slab->lock, flags); if (slab != c->slab) { local_unlock_irqrestore(&s->cpu_slab->lock, flags); goto reread_slab; } freelist = c->freelist; c->slab = NULL ; c->freelist = NULL ; c->tid = next_tid(c->tid); local_unlock_irqrestore(&s->cpu_slab->lock, flags); deactivate_slab(s, slab, freelist);
下面介绍一下这个去激活(deactivate)函数
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 static void deactivate_slab (struct kmem_cache *s, struct slab *slab, void *freelist) { enum slab_modes { M_NONE, M_PARTIAL, M_FREE, M_FULL_NOLIST }; struct kmem_cache_node *n = get_node(s, slab_nid(slab)); int free_delta = 0 ; enum slab_modes mode = M_NONE; void *nextfree, *freelist_iter, *freelist_tail; int tail = DEACTIVATE_TO_HEAD; unsigned long flags = 0 ; struct slab new ; struct slab old ; if (slab->freelist) { stat(s, DEACTIVATE_REMOTE_FREES); tail = DEACTIVATE_TO_TAIL; } freelist_tail = NULL ; freelist_iter = freelist; while (freelist_iter) { nextfree = get_freepointer(s, freelist_iter); if (freelist_corrupted(s, slab, &freelist_iter, nextfree)) break ; freelist_tail = freelist_iter; free_delta++; freelist_iter = nextfree; } redo: old.freelist = READ_ONCE(slab->freelist); old.counters = READ_ONCE(slab->counters); VM_BUG_ON(!old.frozen); new.counters = old.counters; if (freelist_tail) { new.inuse -= free_delta; set_freepointer(s, freelist_tail, old.freelist); new.freelist = freelist; } else new.freelist = old.freelist; new.frozen = 0 ; if (!new.inuse && n->nr_partial >= s->min_partial) { mode = M_FREE; } else if (new.freelist) { mode = M_PARTIAL; spin_lock_irqsave(&n->list_lock, flags); } else { mode = M_FULL_NOLIST; } if (!cmpxchg_double_slab(s, slab, old.freelist, old.counters, new.freelist, new.counters, "unfreezing slab" )) { if (mode == M_PARTIAL) spin_unlock_irqrestore(&n->list_lock, flags); goto redo; } if (mode == M_PARTIAL) { add_partial(n, slab, tail); spin_unlock_irqrestore(&n->list_lock, flags); stat(s, tail); } else if (mode == M_FREE) { stat(s, DEACTIVATE_EMPTY); discard_slab(s, slab); stat(s, FREE_SLAB); } else if (mode == 
M_FULL_NOLIST) { stat(s, DEACTIVATE_FULL); } }
:five:new_slab 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 new_slab: if (slub_percpu_partial(c)) { local_lock_irqsave(&s->cpu_slab->lock, flags); if (unlikely(c->slab)) { local_unlock_irqrestore(&s->cpu_slab->lock, flags); goto reread_slab; } if (unlikely(!slub_percpu_partial(c))) { local_unlock_irqrestore(&s->cpu_slab->lock, flags); goto new_objects; } slab = c->slab = slub_percpu_partial(c); slub_set_percpu_partial(c, slab); local_unlock_irqrestore(&s->cpu_slab->lock, flags); stat(s, CPU_PARTIAL_ALLOC); goto redo; }
:six:new_object 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 new_objects: pc.flags = gfpflags; pc.slab = &slab; pc.orig_size = orig_size; freelist = get_partial(s, node, &pc); if (freelist) goto check_new_slab; slub_put_cpu_ptr(s->cpu_slab); slab = new_slab(s, gfpflags, node); c = slub_get_cpu_ptr(s->cpu_slab); if (unlikely(!slab)) { slab_out_of_memory(s, gfpflags, node); return NULL ; } stat(s, ALLOC_SLAB); if (kmem_cache_debug(s)) { freelist = alloc_single_from_new_slab(s, slab, orig_size); if (unlikely(!freelist)) goto new_objects; if (s->flags & SLAB_STORE_USER) set_track(s, freelist, TRACK_ALLOC, addr); return freelist; } freelist = slab->freelist; slab->freelist = NULL ; slab->inuse = slab->objects; slab->frozen = 1 ; inc_slabs_node(s, slab_nid(slab), slab->objects);
这里介绍一下其中struct partial_context
1 2 3 4 5 6 struct partial_context { struct slab **slab ; gfp_t flags; unsigned int orig_size; };
下面就是allocate_slab
函数,也就是转到了咱们的伙伴系统分配page/slab
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 static struct slab *allocate_slab (struct kmem_cache *s, gfp_t flags, int node) { struct slab *slab ; struct kmem_cache_order_objects oo = s->oo; gfp_t alloc_gfp; void *start, *p, *next; int idx; bool shuffle; flags &= gfp_allowed_mask; flags |= s->allocflags; alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL; if ((alloc_gfp & __GFP_DIRECT_RECLAIM) && oo_order(oo) > oo_order(s->min)) alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~__GFP_RECLAIM; slab = alloc_slab_page(alloc_gfp, node, oo); if (unlikely(!slab)) { oo = s->min; alloc_gfp = flags; slab = alloc_slab_page(alloc_gfp, node, oo); if (unlikely(!slab)) return NULL ; stat(s, ORDER_FALLBACK); } slab->objects = oo_objects(oo); slab->inuse = 0 ; slab->frozen = 0 ; account_slab(slab, oo_order(oo), s, flags); slab->slab_cache = s; kasan_poison_slab(slab); start = slab_address(slab); setup_slab_debug(s, slab, start); shuffle = shuffle_freelist(s, slab); if (!shuffle) { start = fixup_red_left(s, start); start = setup_object(s, start); slab->freelist = start; for (idx = 0 , p = start; idx < slab->objects - 1 ; idx++) { next = p + s->size; next = setup_object(s, next); set_freepointer(s, p, next); p = next; } set_freepointer(s, p, NULL ); } return slab; }
:seven:check_new_slab 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 check_new_slab: if (kmem_cache_debug(s)) { if (s->flags & SLAB_STORE_USER) set_track(s, freelist, TRACK_ALLOC, addr); return freelist; } if (unlikely(!pfmemalloc_match(slab, gfpflags))) { deactivate_slab(s, slab, get_freepointer(s, freelist)); return freelist; }
:eight:retry_load_slab 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 retry_load_slab: local_lock_irqsave(&s->cpu_slab->lock, flags); if (unlikely(c->slab)) { void *flush_freelist = c->freelist; struct slab *flush_slab = c->slab; c->slab = NULL ; c->freelist = NULL ; c->tid = next_tid(c->tid); local_unlock_irqrestore(&s->cpu_slab->lock, flags); deactivate_slab(s, flush_slab, flush_freelist); stat(s, CPUSLAB_FLUSH); goto retry_load_slab; } c->slab = slab; goto load_freelist;
3.kmalloc相关 我们接触的比较多的那就直接是这个kmalloc了,其他的一些顶层接口包括他都会调用上面讲的slab_alloc_node
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 static __always_inline __alloc_size(1 ) void *kmalloc (size_t size, gfp_t flags) { if (__builtin_constant_p(size) && size) { unsigned int index; if (size > KMALLOC_MAX_CACHE_SIZE) return kmalloc_large(size, flags); index = kmalloc_index(size); return kmalloc_trace( kmalloc_caches[kmalloc_type(flags)][index], flags, size); } return __kmalloc(size, flags); }
解释一下其中所用到的几个函数
kmalloc_large 1 2 3 4 5 6 7 8 void *kmalloc_large (size_t size, gfp_t flags) { void *ret = __kmalloc_large_node(size, flags, NUMA_NO_NODE); trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << get_order(size), flags, NUMA_NO_NODE); return ret; }
其中调用的__kmalloc_large_node
函数是先判断size属于的order大小,然后再调用伙伴系统的接口alloc_pages_node
来获取相应的页再转为地址返回
kmalloc_index 实际上就是调用了__kmalloc_index
,大致含义就是返回相应size所对应的kmalloc_caches
二维数组的列下标(这段代码写得还挺好看)
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 static __always_inline unsigned int __kmalloc_index(size_t size, bool size_is_constant) { if (!size) return 0 ; if (size <= KMALLOC_MIN_SIZE) return KMALLOC_SHIFT_LOW; if (KMALLOC_MIN_SIZE <= 32 && size > 64 && size <= 96 ) return 1 ; if (KMALLOC_MIN_SIZE <= 64 && size > 128 && size <= 192 ) return 2 ; if (size <= 8 ) return 3 ; if (size <= 16 ) return 4 ; if (size <= 32 ) return 5 ; if (size <= 64 ) return 6 ; if (size <= 128 ) return 7 ; if (size <= 256 ) return 8 ; if (size <= 512 ) return 9 ; if (size <= 1024 ) return 10 ; if (size <= 2 * 1024 ) return 11 ; if (size <= 4 * 1024 ) return 12 ; if (size <= 8 * 1024 ) return 13 ; if (size <= 16 * 1024 ) return 14 ; if (size <= 32 * 1024 ) return 15 ; if (size <= 64 * 1024 ) return 16 ; if (size <= 128 * 1024 ) return 17 ; if (size <= 256 * 1024 ) return 18 ; if (size <= 512 * 1024 ) return 19 ; if (size <= 1024 * 1024 ) return 20 ; if (size <= 2 * 1024 * 1024 ) return 21 ; if (!IS_ENABLED(CONFIG_PROFILE_ALL_BRANCHES) && size_is_constant) BUILD_BUG_ON_MSG(1 , "unexpected size in kmalloc_index()" ); else BUG(); return -1 ; }
kmalloc_trace 1 2 3 4 5 6 7 8 9 10 void *kmalloc_trace (struct kmem_cache *s, gfp_t gfpflags, size_t size) { void *ret = __kmem_cache_alloc_node(s, gfpflags, NUMA_NO_NODE, size, _RET_IP_); trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags, NUMA_NO_NODE); ret = kasan_kmalloc(s, ret, size, gfpflags); return ret; }
上面的kasan_kmalloc具体并不参与分配过程,他主要用来对内存进行安全检测,下面是搜索到的解释
Kernel Address SANitizer(KASAN)是一种动态内存安全错误检测工具,主要功能是 检查内存越界访问和使用已释放内存的问题
因此,来整理一下kmalloc的运行过程
判断size,若超过了KMALLOC_MAX_CACHE_SIZE(按上面的宏即两个页,8KB),则调用kmalloc_large,转而使用伙伴系统的接口
否则说明size不超过KMALLOC_MAX_CACHE_SIZE,我们调用kmalloc_index来获取相应kmalloc_caches的下标,然后调用kmalloc_trace来作为slab_alloc_node的上层接口来返回object
4.释放过程 咱们从比较通用的接口讲起
do_slab_free 该函数并不复杂,也就是在多cpu的情况下释放一个或多个object,整体函数表现为快路径
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 static __always_inline void do_slab_free (struct kmem_cache *s, struct slab *slab, void *head, void *tail, int cnt, unsigned long addr) { void *tail_obj = tail ? : head; struct kmem_cache_cpu *c ; unsigned long tid; void **freelist; redo: c = raw_cpu_ptr(s->cpu_slab); tid = READ_ONCE(c->tid); barrier(); if (unlikely(slab != c->slab)) { __slab_free(s, slab, head, tail_obj, cnt, addr); return ; } if (USE_LOCKLESS_FAST_PATH()) { freelist = READ_ONCE(c->freelist); set_freepointer(s, tail_obj, freelist); if (unlikely(!this_cpu_cmpxchg_double( s->cpu_slab->freelist, s->cpu_slab->tid, freelist, tid, head, next_tid(tid)))) { note_cmpxchg_failure("slab_free" , s, tid); goto redo; } } else { local_lock(&s->cpu_slab->lock); c = this_cpu_ptr(s->cpu_slab); if (unlikely(slab != c->slab)) { local_unlock(&s->cpu_slab->lock); goto redo; } tid = c->tid; freelist = c->freelist; set_freepointer(s, tail_obj, freelist); c->freelist = head; c->tid = next_tid(tid); local_unlock(&s->cpu_slab->lock); } stat(s, FREE_FASTPATH); }
__slab_free 此处object属于的slab并不是当前cpu的slab,该处的函数也就是慢路径
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 static void __slab_free(struct kmem_cache *s, struct slab *slab, void *head, void *tail, int cnt, unsigned long addr) { void *prior; int was_frozen; struct slab new ; unsigned long counters; struct kmem_cache_node *n = NULL ; unsigned long flags; stat(s, FREE_SLOWPATH); if (kfence_free(head)) return ; if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) { free_to_partial_list(s, slab, head, tail, cnt, addr); return ; } do { if (unlikely(n)) { spin_unlock_irqrestore(&n->list_lock, flags); n = NULL ; } prior = slab->freelist; counters = slab->counters; set_freepointer(s, tail, prior); new.counters = counters; was_frozen = new.frozen; new.inuse -= cnt; if ((!new.inuse || !prior) && !was_frozen) { if (kmem_cache_has_cpu_partial(s) && !prior) { new.frozen = 1 ; } else { n = get_node(s, slab_nid(slab)); spin_lock_irqsave(&n->list_lock, flags); } } } while (!cmpxchg_double_slab(s, slab, prior, counters, head, new.counters, "__slab_free" )); if (likely(!n)) { if (likely(was_frozen)) { stat(s, FREE_FROZEN); } else if (new.frozen) { put_cpu_partial(s, slab, 1 ); stat(s, CPU_PARTIAL_FREE); } return ; } if (unlikely(!new.inuse && n->nr_partial >= s->min_partial)) goto slab_empty; if (!kmem_cache_has_cpu_partial(s) && unlikely(!prior)) { remove_full(s, n, slab); add_partial(n, slab, DEACTIVATE_TO_TAIL); stat(s, FREE_ADD_PARTIAL); } spin_unlock_irqrestore(&n->list_lock, flags); return ; slab_empty: if (prior) { remove_partial(n, slab); stat(s, FREE_REMOVE_PARTIAL); } else { remove_full(s, n, slab); } spin_unlock_irqrestore(&n->list_lock, flags); stat(s, FREE_SLAB); discard_slab(s, slab); }
5.kfree相关 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 void kfree (const void *object) { struct folio *folio ; struct slab *slab ; struct kmem_cache *s ; trace_kfree(_RET_IP_, object); if (unlikely(ZERO_OR_NULL_PTR(object))) return ; folio = virt_to_folio(object); if (unlikely(!folio_test_slab(folio))) { free_large_kmalloc(folio, (void *)object); return ; } slab = folio_slab(folio); s = slab->slab_cache; __kmem_cache_free(s, (void *)object, _RET_IP_); }
virt_to_folio 1 2 3 4 5 6 static inline struct folio *virt_to_folio (const void *x) { struct page *page = virt_to_page(x); return page_folio(page); }
通过我们的虚拟地址来获得我们相应物理地址所对应的page结构体
1 2 3 4 5 6 7 8 9 10 11 12 13 14 #define page_folio(p ) (_Generic((p ) , \ const struct page *: (const struct folio *)_compound_head(p ) , \ struct page *: (struct folio *)_compound_head(p ) ))
下面是一段folio的介绍
Linux 中的每个物理地址都有一个 struct page。 结构页是一种相当弱的数据类型; 当页面实际上是尾页并且没有映射时,很容易查看(例如)页面->映射。 Folios 是分离 struct page 的某些角色的开始。 从概念上讲,folios 获取 struct page 的内容(尾页部分除外)并将它们移动到 struct folio 中。 这并不是补丁集实际要做的事情,因为我还不够受虐狂,无法一次性完成所有这些更改。
我们可以(而且应该)走得更远。 我们需要了解内存(当前)是如何分配和使用的。 映射到用户空间的任何内存都需要有脏位和锁定位,有引用计数和映射计数。
引用的kernelnewbie,不知道他在说什么,初步判定是用来优化以往的page结构提出的新特性,这里是返回了folio结构体供我们kfree使用
free_large_kmalloc 大页面则使用伙伴系统的接口,将其直接释放至伙伴系统之中
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 void free_large_kmalloc (struct folio *folio, void *object) { unsigned int order = folio_order(folio); if (WARN_ON_ONCE(order == 0 )) pr_warn_once("object pointer: 0x%p\n" , object); kmemleak_free(object); kasan_kfree_large(object); kmsan_kfree_large(object); mod_lruvec_page_state(folio_page(folio, 0 ), NR_SLAB_UNRECLAIMABLE_B, -(PAGE_SIZE << order)); __free_pages(folio_page(folio, 0 ), order); }
__kmem_cache_free 调用了另一个函数
1 2 3 4 void __kmem_cache_free(struct kmem_cache *s, void *x, unsigned long caller) { slab_free(s, virt_to_slab(x), x, NULL , &x, 1 , caller); }
slab_free 这里也是直接调用了我们之前所讲述的较为关键的释放函数
1 2 3 4 5 6 7 8 9 10 11 12 static __fastpath_inline void slab_free (struct kmem_cache *s, struct slab *slab, void *head, void *tail, void **p, int cnt, unsigned long addr) { memcg_slab_free_hook(s, slab, p, cnt); if (slab_free_freelist_hook(s, &head, &tail, &cnt)) do_slab_free(s, slab, head, tail, cnt, addr); }
-1.参考 arttnba3师傅博客
kernel doc 中文版
kernel new bies
LWN