内存管理（八）：zone的初始化

技术2025-10-26 26

linux版本：4.14.74 硬件：ARMV8 A53

1 zone概念

物理内存可以分为内存区域zone，内核定义的对zone类型的定义如下

enum zone_type { #ifdef CONFIG_ZONE_DMA /* * ZONE_DMA is used when there are devices that are not able * to do DMA to all of addressable memory (ZONE_NORMAL). Then we * carve out the portion of memory that is needed for these devices. * The range is arch specific. * * Some examples * * Architecture Limit * --------------------------- * parisc, ia64, sparc <4G * s390 <2G * arm Various * alpha Unlimited or 0-16MB. * * i386, x86_64 and multiple other arches * <16M. */ ZONE_DMA, #endif #ifdef CONFIG_ZONE_DMA32 /* * x86_64 needs two ZONE_DMAs because it supports devices that are * only able to do DMA to the lower 16M but also 32 bit devices that * can only do DMA areas below 4G. */ ZONE_DMA32, #endif /* * Normal addressable memory is in ZONE_NORMAL. DMA operations can be * performed on pages in ZONE_NORMAL if the DMA devices support * transfers to all addressable memory. */ ZONE_NORMAL, #ifdef CONFIG_HIGHMEM /* * A memory area that is only addressable by the kernel through * mapping portions into its own address space. This is for example * used by i386 to allow the kernel to address the memory beyond * 900MB. The kernel will set up special mappings (page * table entries on i386) for each page that the kernel needs to * access. */ ZONE_HIGHMEM, #endif ZONE_MOVABLE, #ifdef CONFIG_ZONE_DEVICE ZONE_DEVICE, #endif __MAX_NR_ZONES }; ZONE DMA: DMA区域。有些设备不能直接访问所有内存，需要使用DMA区域，如旧的ISA总线只能访问16MB一下的内存DMA32区域：64位系统，如果既要支持只能访问16MB一下内存的设备，又要支持只能访问4GB以下内存的32位设备，那么必须使用DMA32区域普通区域：直接映射到内核虚拟地址空间的内存区域高端内存区域，32bit时代的产物，64bit内核虚拟地址空间非常大，完全够用了，不再需要这块区域可移动区域，它是一个伪内存区域，用来防止内存碎片。设备区域，为了支持永久内存热插拔增加的内存区域

内存区域用一个结构体zone描述

struct zone { /* Read-mostly fields */ /* zone watermarks, access with *_wmark_pages(zone) macros */ unsigned long watermark[NR_WMARK]; /*页分配器使用的水线*/ ... long lowmem_reserve[MAX_NR_ZONES]; //页分配器使用，当前区域保留多少 //页不能借给高的区域类型 struct pglist_data *zone_pgdat; //指向内存节点的pglist_data实例 struct per_cpu_pageset __percpu *pageset; //每处理器页集合 ... /* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */ unsigned long zone_start_pfn; //当前区域的起始物理页号 unsigned long managed_pages; //伙伴分配器管理的物理页的数量 unsigned long spanned_pages; //当前区域跨越的总页数，包括空洞 unsigned long present_pages; //当前区域存在的物理页的数量，不包括空洞 const char *name; //区域名称 /* free areas of different sizes */ /*不同长度的空闲区域*/ struct free_area free_area[MAX_ORDER]; ... } ____cacheline_internodealigned_in_smp;

2 zone 初始化

bootmem_init中将调用zone_sizes_init函数进行zone初始化

void __init bootmem_init(void) { unsigned long min, max; min = PFN_UP(memblock_start_of_DRAM()); max = PFN_DOWN(memblock_end_of_DRAM()); early_memtest(min << PAGE_SHIFT, max << PAGE_SHIFT); max_pfn = max_low_pfn = max; arm64_numa_init(); /* * Sparsemem tries to allocate bootmem in memory_present(), so must be * done after the fixed reservations. */ arm64_memory_present(); sparse_init(); zone_sizes_init(min, max); memblock_dump_all(); } static void __init zone_sizes_init(unsigned long min, unsigned long max) { struct memblock_region *reg; unsigned long zone_size[MAX_NR_ZONES], zhole_size[MAX_NR_ZONES]; unsigned long max_dma = min; memset(zone_size, 0, sizeof(zone_size)); /* 4GB maximum for 32-bit only capable devices */ /****1***/ #ifdef CONFIG_ZONE_DMA max_dma = PFN_DOWN(arm64_dma_phys_limit); zone_size[ZONE_DMA] = max_dma - min; #endif zone_size[ZONE_NORMAL] = max - max_dma; memcpy(zhole_size, zone_size, sizeof(zhole_size)); for_each_memblock(memory, reg) { unsigned long start = memblock_region_memory_base_pfn(reg); unsigned long end = memblock_region_memory_end_pfn(reg); if (start >= max) continue; #ifdef CONFIG_ZONE_DMA if (start < max_dma) { unsigned long dma_end = min(end, max_dma); zhole_size[ZONE_DMA] -= dma_end - start; } #endif if (end > max_dma) { unsigned long normal_end = min(end, max); unsigned long normal_start = max(start, max_dma); zhole_size[ZONE_NORMAL] -= normal_end - normal_start; } } /****2*****/ free_area_init_node(0, zone_size, min, zhole_size); } 是否定义了DMA区域，我的项目中定义了这个区域，并且这个区域最大支持4G大小，而项目中的内存最大也不超过4G，所以实际上只有这一个区域，normal区域也不复存在了对pglist字段初始化，管理zone区域 void __paginginit free_area_init_node(int nid, unsigned long *zones_size, unsigned long node_start_pfn, unsigned long *zholes_size) { pg_data_t *pgdat = NODE_DATA(nid); unsigned long start_pfn = 0; unsigned long end_pfn = 0; /* pg_data_t should be reset to zero when it's allocated */ WARN_ON(pgdat->nr_zones || pgdat->kswapd_classzone_idx); pgdat->node_id = nid; pgdat->node_start_pfn = node_start_pfn; pgdat->per_cpu_nodestats = NULL; #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid, (u64)start_pfn << PAGE_SHIFT, end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0); #else start_pfn = node_start_pfn; #endif /******1*****/ calculate_node_totalpages(pgdat, start_pfn, end_pfn, zones_size, zholes_size); alloc_node_mem_map(pgdat); #ifdef CONFIG_FLAT_NODE_MEM_MAP printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n", nid, (unsigned long)pgdat, (unsigned long)pgdat->node_mem_map); #endif reset_deferred_meminit(pgdat); /***2****/ free_area_init_core(pgdat); } 计算当前zone的page数量，包括spanned_pages(总页数)，present_pages（不包含空洞的总页数），并对node的总页数进行累加继续对pgdat初始化 /* * Set up the zone data structures: * - mark all pages reserved * - mark all memory queues empty * - clear the memory bitmaps * * NOTE: pgdat should get zeroed by caller. */ static void __paginginit free_area_init_core(struct pglist_data *pgdat) { enum zone_type j; int nid = pgdat->node_id; pgdat_resize_init(pgdat); #ifdef CONFIG_NUMA_BALANCING spin_lock_init(&pgdat->numabalancing_migrate_lock); pgdat->numabalancing_migrate_nr_pages = 0; pgdat->numabalancing_migrate_next_window = jiffies; #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE spin_lock_init(&pgdat->split_queue_lock); INIT_LIST_HEAD(&pgdat->split_queue); pgdat->split_queue_len = 0; #endif init_waitqueue_head(&pgdat->kswapd_wait); init_waitqueue_head(&pgdat->pfmemalloc_wait); #ifdef CONFIG_COMPACTION init_waitqueue_head(&pgdat->kcompactd_wait); #endif pgdat_page_ext_init(pgdat); spin_lock_init(&pgdat->lru_lock); lruvec_init(node_lruvec(pgdat)); pgdat->per_cpu_nodestats = &boot_nodestats; for (j = 0; j < MAX_NR_ZONES; j++) { struct zone *zone = pgdat->node_zones + j; unsigned long size, realsize, freesize, memmap_pages; unsigned long zone_start_pfn = zone->zone_start_pfn; size = zone->spanned_pages; realsize = freesize = zone->present_pages; /* * Adjust freesize so that it accounts for how much memory * is used by this zone for memmap. This affects the watermark * and per-cpu initialisations */ /******1*****/ memmap_pages = calc_memmap_size(size, realsize); if (!is_highmem_idx(j)) { if (freesize >= memmap_pages) { freesize -= memmap_pages; if (memmap_pages) printk(KERN_DEBUG " %s zone: %lu pages used for memmap\n", zone_names[j], memmap_pages); } else pr_warn(" %s zone: %lu pages exceeds freesize %lu\n", zone_names[j], memmap_pages, freesize); } /* Account for reserved pages */ if (j == 0 && freesize > dma_reserve) { freesize -= dma_reserve; printk(KERN_DEBUG " %s zone: %lu pages reserved\n", zone_names[0], dma_reserve); } if (!is_highmem_idx(j)) nr_kernel_pages += freesize; /* Charge for highmem memmap if there are enough kernel pages */ else if (nr_kernel_pages > memmap_pages * 2) nr_kernel_pages -= memmap_pages; nr_all_pages += freesize; /* * Set an approximate value for lowmem here, it will be adjusted * when the bootmem allocator frees pages into the buddy system. * And all highmem pages will be managed by the buddy system. */ zone->managed_pages = is_highmem_idx(j) ? realsize : freesize; #ifdef CONFIG_NUMA zone->node = nid; #endif zone->name = zone_names[j]; zone->zone_pgdat = pgdat; spin_lock_init(&zone->lock); zone_seqlock_init(zone); zone_pcp_init(zone); if (!size) continue; set_pageblock_order(); setup_usemap(pgdat, zone, zone_start_pfn, size); init_currently_empty_zone(zone, zone_start_pfn, size); memmap_init(size, nid, j, zone_start_pfn); } } 一个struct page结构体大小64字节，存储这个区域中可用的页数（present_pages）需要多少struct page，这些struct page总共会占用多少页大小

参考文件

《Linux内核深度解析》

Processed: 0.010, SQL: 9