当前位置：Gxlcms > html代码 > Linux内核源代码情景分析-内存管理之用户页面的换入_html/css_WEB-ITnose

Linux内核源代码情景分析-内存管理之用户页面的换入_html/css_WEB-ITnose

时间：2021-07-01 10:21:17 帮助过：27人阅读

在下面几种情况下，会发生页面出错异常（也叫缺页异常）：

1、相应的页面目录项或者页面表项为空，也就是该线性地址与物理地址的映射关系尚未建立，或者已经撤销。

2、相应的物理页面不在内存中。本文讨论的就是这种情况。

3、指令中规定的访问方式与页面的权限不符，例如企图写一个“只读”的页面。

假设已经建立好了映射，但是页表项的最低位（bit 0，即P标志位）为0，表示页面不在内存中；此时整个页表项被用作页面交换项，如下图所示：offset表示页面在磁盘设备上的位置，也就是磁盘设备的逻辑页面号；而type则指明该页面在哪一个磁盘设备上。

图 1 页面交换项结构

这里假定CPU的运行已经到达了页面异常服务程序的主体do_page_fault()的入口处。

代码如下： arch/i386/mm/fault.c

/*
 * Page-fault handler (excerpt; the "...." markers stand for code the
 * article leaves out).
 *
 * regs:       saved register state at the time of the fault
 * error_code: low bits describe the fault; bit 0 = protection fault vs.
 *             page not present, bit 1 = write vs. read
 *
 * The part shown reads the faulting address from CR2, finds the
 * vm_area_struct covering it, checks the access against the area's
 * vm_flags and hands the fault to handle_mm_fault().
 */
asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code)
{
	struct task_struct *tsk;
	struct mm_struct *mm;
	struct vm_area_struct * vma;
	unsigned long address;
	unsigned long page;
	unsigned long fixup;
	int write;
	siginfo_t info;

	/* get the address */
	__asm__("movl %%cr2,%0":"=r" (address));// save the address whose translation failed in 'address'

	tsk = current;// task_struct of the current process

	/*
	 * We fault-in kernel-space virtual memory on-demand. The
	 * 'reference' page table is init_mm.pgd.
	 *
	 * NOTE! We MUST NOT take any locks for this case. We may
	 * be in an interrupt or a critical region, and should
	 * only copy the information from the master page table,
	 * nothing more.
	 */
	if (address >= TASK_SIZE)
		goto vmalloc_fault;

	mm = tsk->mm;// mm_struct of the current process
	info.si_code = SEGV_MAPERR;

	/*
	 * If we're in an interrupt or have no user
	 * context, we must not take the fault..
	 */
	if (in_interrupt() || !mm)
		goto no_context;

	down(&mm->mmap_sem);

	vma = find_vma(mm, address);// per the article: first area whose end address is above 'address'
	if (!vma)// nothing found: no area ends above the address, so it lies above every mapped area
		goto bad_area;
	if (vma->vm_start <= address)// the area also starts at or below the address, so it covers it: check the access in good_area
		goto good_area;
	if (!(vma->vm_flags & VM_GROWSDOWN))
		goto bad_area;
	....
/* * Ok, we have a good vm_area for this memory access, so * we can handle it.. */
good_area:
	info.si_code = SEGV_ACCERR;
	write = 0;
	switch (error_code & 3) {// in the article's scenario: binary 110 & 011 = 2
		default:	/* 3: write, present */
#ifdef TEST_VERIFY_AREA
			if (regs->cs == KERNEL_CS)
				printk("WP fault at %08lx\n", regs->eip);
#endif
			/* fall through */
		case 2:		/* write, not present */
			if (!(vma->vm_flags & VM_WRITE))
				goto bad_area;
			write++;// the article's scenario (write to a non-present page) gets here
			break;
		case 1:		/* read, present */
			goto bad_area;
		case 0:		/* read, not present */
			if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
				goto bad_area;
	}

	/*
	 * If for any reason at all we couldn't handle the fault,
	 * make sure we exit gracefully rather than endlessly redo
	 * the fault.
	 */
	switch (handle_mm_fault(mm, vma, address, write)) {
	case 1:
		tsk->min_flt++;
		break;
	case 2:
		tsk->maj_flt++;
		break;
	case 0:
		goto do_sigbus;
	default:
		goto out_of_memory;
	}

	/*
	 * Did it hit the DOS screen memory VA from vm86 mode?
	 */
	if (regs->eflags & VM_MASK) {
		unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
		if (bit < 32)
			tsk->thread.screen_bitmap |= 1 << bit;
	}
	up(&mm->mmap_sem);
	return;
        .......
}

内核的中断/异常响应机制还传过来两个参数。一个是pt_regs结构指针regs，它指向例外发生前夕CPU中各寄存器内容的一份副本。而error_code则进一步指明映射失败的具体原因。

error_code:

bit 0 == 0 means no page found, 1 means protection fault

bit 1 == 0 means read, 1 means write

bit 2 == 0 means kernel, 1 means user-mode

此时，error_code为二进制110：bit 2为1（用户态），bit 1为1（写操作），bit 0为0（页面不在内存中）。

handle_mm_fault函数，代码如下：

int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma,	unsigned long address, int write_access){	int ret = -1;	pgd_t *pgd;	pmd_t *pmd;	pgd = pgd_offset(mm, address);//返回页面表项指针	pmd = pmd_alloc(pgd, address);//中转了一下，还是页目录表项指针	if (pmd) {		pte_t * pte = pte_alloc(pmd, address);//返回指向页表项的指针		if (pte)			ret = handle_pte_fault(mm, vma, address, write_access, pte);	}	return ret;}

handle_pte_fault函数，如下：

/*
 * Resolve a fault on a single page-table entry.
 *
 * Present entry: a write to a non-writable entry is passed to
 * do_wp_page() (called with page_table_lock still held); otherwise the
 * entry is marked young (and dirty on a write), re-established, and 1
 * is returned.
 *
 * Non-present entry: the lock is dropped first, then an empty entry
 * (pte_none) goes to do_no_page() and a non-empty one, which holds a
 * swap entry, goes to do_swap_page().
 */
static inline int handle_pte_fault(struct mm_struct *mm,
	struct vm_area_struct * vma, unsigned long address,
	int write_access, pte_t * pte)
{
	pte_t cur;

	/*
	 * The page table lock synchronizes us with kswapd and with
	 * the SMP-safe atomic PTE updates.
	 */
	spin_lock(&mm->page_table_lock);
	cur = *pte;	/* snapshot of the entry's contents */

	if (pte_present(cur)) {
		if (write_access) {
			if (!pte_write(cur))
				return do_wp_page(mm, vma, address, pte, cur);
			cur = pte_mkdirty(cur);
		}
		cur = pte_mkyoung(cur);
		establish_pte(vma, address, pte, cur);
		spin_unlock(&mm->page_table_lock);
		return 1;
	}

	/*
	 * Truly not present: kswapd and the PTE updates will not
	 * touch it later, so the lock can be dropped here.
	 */
	spin_unlock(&mm->page_table_lock);

	/* a non-empty, non-present entry holds a swap entry */
	if (!pte_none(cur))
		return do_swap_page(mm, vma, address, pte, pte_to_swp_entry(cur), write_access);

	/* the entry is empty */
	return do_no_page(mm, vma, address, write_access, pte);
}

do_swap_page函数，如下：

/*
 * Map the page identified by swap entry 'entry' at the faulting address.
 *
 * page_table points at the PTE to fill in.  The page is taken from the
 * swap cache when lookup_swap_cache() finds it; otherwise readahead is
 * started and the page is obtained through read_swap_cache().
 *
 * Returns 1 after the PTE has been installed, or -1 when
 * read_swap_cache() returns no page.
 */
static int do_swap_page(struct mm_struct * mm,
	struct vm_area_struct * vma, unsigned long address,
	pte_t * page_table, swp_entry_t entry, int write_access)
{
	struct page *page = lookup_swap_cache(entry);// look the page up in the swap cache (hash table) first
	pte_t pte;

	if (!page) {
		lock_kernel();
		swapin_readahead(entry);// read ahead pages near this entry
		page = read_swap_cache(entry);// actually obtain the page: either found in the hash table (the readahead above may have put it there) or freshly allocated and read in from disk
		unlock_kernel();
		if (!page)
			return -1;

		flush_page_to_ram(page);
		flush_icache_page(vma, page);
	}

	mm->rss++;

	pte = mk_pte(page, vma->vm_page_prot);// build the PTE value from the page and the area's protection bits

	/*
	 * Freeze the "shared"ness of the page, ie page_count + swap_count.
	 * Must lock page before transferring our swap count to already
	 * obtained page count.
	 */
	lock_page(page);
	swap_free(entry);
	if (write_access && !is_page_shared(page))
		pte = pte_mkwrite(pte_mkdirty(pte));// write fault on an unshared page: make the PTE writable and mark it dirty
	UnlockPage(page);

	set_pte(page_table, pte);// store the prepared PTE into the page table
	/* No need to invalidate - it was non-present before */
	update_mmu_cache(vma, address, pte);
	return 1;	/* Minor fault */
}

一、下面分别解释各个函数。首先解释swapin_readahead函数，如下：

/*
 * Start asynchronous reads for a run of swap pages related to 'entry'
 * (excerpt; "......" stands for code the article leaves out).
 *
 * valid_swaphandles() supplies the number of pages and the starting
 * offset; each page is requested with read_swap_cache_async(..., 0).
 */
void swapin_readahead(swp_entry_t entry)
{
	int i, num;
	struct page *new_page;
	unsigned long offset;

	/*
	 * Get the number of handles we should do readahead io to. Also,
	 * grab temporary references on them, releasing them as io completes.
	 */
	num = valid_swaphandles(entry, &offset);
	for (i = 0; i < num; offset++, i++) {
		......
		new_page = read_swap_cache_async(SWP_ENTRY(SWP_TYPE(entry), offset), 0);
		if (new_page != NULL)
			page_cache_release(new_page);// drop the reference returned to us (use count minus 1)
		swap_free(SWP_ENTRY(SWP_TYPE(entry), offset));
	}
	return;
}

提前预读相邻的磁盘页面。根据下面的代码：__get_free_page分配页面后，page使用计数为1；add_to_swap_cache使page使用计数再加1（变为2）；随后page_cache_release使计数减1，page使用计数又变成了1。直到有进程认领该页面，计数才变成2。

read_swap_cache_async函数，如下：

/*
 * Return the page for swap entry 'entry', either an existing one from
 * the swap cache or a newly allocated page on which a read is started.
 *
 * 'wait' is passed through to rw_swap_page().  On the failure paths the
 * function returns found_page, which is 0 unless a lookup succeeded.
 *
 * This copy is annotated for the readahead path, where the page is
 * assumed not to be in the swap cache yet.
 */
struct page * read_swap_cache_async(swp_entry_t entry, int wait)
{
	struct page *found_page = 0, *new_page;
	unsigned long new_page_addr;
	
	/*
	 * Make sure the swap entry is still in use.
	 */
	if (!swap_duplicate(entry))	/* Account for the swap cache */
		goto out;
	/*
	 * Look for the page in the swap cache.
	 */
	found_page = lookup_swap_cache(entry);// scenario: assume it is not found
	if (found_page)
		goto out_free_swap;

	new_page_addr = __get_free_page(GFP_USER);// author's note: the freshly allocated page has a use count of 1
	if (!new_page_addr)
		goto out_free_swap;	/* Out of memory */
	new_page = virt_to_page(new_page_addr);// convert the address to the corresponding struct page pointer

	/*
	 * Check the swap cache again, in case we stalled above.
	 */
	found_page = lookup_swap_cache(entry);// scenario: assume it is still not found
	if (found_page)
		goto out_free_page;
	/* 
	 * Add it to the swap cache and read its contents.
	 */
	lock_page(new_page);
	add_to_swap_cache(new_page, entry);// link the page into the swap-cache lists
	rw_swap_page(READ, new_page, wait);// read the data from disk into the new page (details belong to the block-device layer)
	return new_page;

out_free_page:
	page_cache_release(new_page);
out_free_swap:
	swap_free(entry);
out:
	return found_page;
}

add_to_swap_cache函数是重点，代码如下：

/*
 * Attach a locked page to the swap cache under swap entry 'entry'.
 *
 * BUG()s if the page is not locked, is already flagged as a swap-cache
 * page, or already has a mapping.  Clears PG_error and PG_arch_1, sets
 * PG_uptodate, then adds the page to swapper_space indexed by entry.val.
 */
void add_to_swap_cache(struct page *page, swp_entry_t entry)
{
#ifdef SWAP_CACHE_INFO
	swap_cache_add_total++;
#endif
	if (!PageLocked(page))
		BUG();
	if (PageTestandSetSwapCache(page))
		BUG();
	if (page->mapping)
		BUG();

	/* drop the error/arch bits and mark the page up to date in one update */
	page->flags = (page->flags & ~((1 << PG_error) | (1 << PG_arch_1)))
			| (1 << PG_uptodate);

	add_to_page_cache_locked(page, &swapper_space, entry.val);
}

add_to_page_cache_locked函数，代码如下：

/*
 * Insert an already locked page into the page cache of 'mapping' at
 * 'index'.  BUG()s if the page is not locked.  The insertion into the
 * mapping's list, the hash table and the LRU is done under
 * pagecache_lock.
 */
void add_to_page_cache_locked(struct page * page, struct address_space *mapping, unsigned long index)
{
	if (!PageLocked(page))
		BUG();

	page_cache_get(page);// take an extra reference; author's note: the use count is now 2
	spin_lock(&pagecache_lock);
	page->index = index;// for swap pages the caller passes the swap entry value here
	add_page_to_inode_queue(mapping, page);// links page->list into mapping->clean_pages
	add_page_to_hash_queue(page, page_hash(mapping, index));// links page->next_hash / page->pprev_hash into the global hash table
	lru_cache_add(page);// links page->lru into the global active_list
	spin_unlock(&pagecache_lock);
}

add_page_to_inode_queue函数，代码如下：

/*
 * Put 'page' on the clean_pages list of 'mapping', bump the mapping's
 * page count and record the mapping in the page.
 */
static inline void add_page_to_inode_queue(struct address_space *mapping, struct page * page)
{
	/* remember which address space owns the page (swapper_space for swap pages) */
	page->mapping = mapping;

	/* link page->list into mapping->clean_pages */
	list_add(&page->list, &mapping->clean_pages);
	mapping->nrpages++;
}

/*
 * The address space used for swap-cache pages (add_to_swap_cache()
 * passes it to add_to_page_cache_locked()).  Initialized positionally:
 * three empty page lists, a zero page count, and a pointer to swap_aops.
 */
struct address_space swapper_space = {
	LIST_HEAD_INIT(swapper_space.clean_pages),	/* empty clean_pages list */
	LIST_HEAD_INIT(swapper_space.dirty_pages),	/* empty dirty_pages list */
	LIST_HEAD_INIT(swapper_space.locked_pages),	/* empty locked_pages list */
	0,				/* nrpages	*/
	&swap_aops,			/* presumably the address-space operations table - field name not visible here */
};

add_page_to_hash_queue函数，如下：

static void add_page_to_hash_queue(struct page * page, struct page **p){	struct page *next = *p;//page->next_hash和page->pprev_hash链入全局的Hash表	*p = page;	page->next_hash = next;	page->pprev_hash = p;	if (next)		next->pprev_hash = &page->next_hash;	if (page->buffers)		PAGE_BUG(page);	atomic_inc(&page_cache_size);}

lru_cache_add函数，如下：

/*
 * Add a locked page to the LRU lists under pagemap_lru_lock.
 * BUG()s if the page is not locked.  The page goes onto the active
 * list first and is deactivated again when its age is zero.
 */
void lru_cache_add(struct page * page)
{
	spin_lock(&pagemap_lru_lock);
	if (!PageLocked(page))
		BUG();
	DEBUG_ADD_PAGE
	add_page_to_active_list(page);
	/* This should be relatively rare */
	if (!page->age)
		deactivate_page_nolock(page);
	spin_unlock(&pagemap_lru_lock);
}

add_page_to_active_list函数，如下：

/*
 * Mark 'page' active, link it into the global active_list and count it
 * in nr_active_pages.  Expands to a brace-enclosed block.
 *
 * Comments inside the macro body must sit in front of the
 * line-continuation backslash: the backslash has to be the last
 * character on each line.
 */
#define add_page_to_active_list(page) { \
	DEBUG_ADD_PAGE \
	ZERO_PAGE_BUG \
	SetPageActive(page); \
	/* link page->lru into the global active_list */ \
	list_add(&(page)->lru, &active_list); \
	/* one more page on the active list */ \
	nr_active_pages++; \
}

二、下面解释read_swap_cache函数，如下：

/*
 * Synchronous variant: read_swap_cache_async() with wait = 1.
 * No trailing semicolon in the expansion, so the macro can be used
 * inside a larger expression as well as in a plain statement.
 */
#define read_swap_cache(entry) read_swap_cache_async(entry, 1)

read_swap_cache还是调用read_swap_cache_async函数，只是这一次执行时，很可能在lookup_swap_cache函数中就找到了page（因为前面已经预读过）。

/*
 * Return the page for swap entry 'entry', either an existing one from
 * the swap cache or a newly allocated page on which a read is started.
 *
 * 'wait' is passed through to rw_swap_page().  On the failure paths the
 * function returns found_page, which is 0 unless a lookup succeeded.
 *
 * This copy is annotated for the call made through read_swap_cache(),
 * after readahead, where the page is likely to be in the swap cache.
 */
struct page * read_swap_cache_async(swp_entry_t entry, int wait)
{
	struct page *found_page = 0, *new_page;
	unsigned long new_page_addr;
	
	/*
	 * Make sure the swap entry is still in use.
	 */
	if (!swap_duplicate(entry))	/* Account for the swap cache */
		goto out;
	/*
	 * Look for the page in the swap cache.
	 */
	found_page = lookup_swap_cache(entry);// scenario: the page is found in the hash table; author's note: now claimed by a process, its use count is 2
	if (found_page)
		goto out_free_swap;

	new_page_addr = __get_free_page(GFP_USER);
	if (!new_page_addr)
		goto out_free_swap;	/* Out of memory */
	new_page = virt_to_page(new_page_addr);

	/*
	 * Check the swap cache again, in case we stalled above.
	 */
	found_page = lookup_swap_cache(entry);// __get_free_page may have run short of pages and let other processes run; look in the hash table once more after coming back
	if (found_page)
		goto out_free_page;
	/* 
	 * Add it to the swap cache and read its contents.
	 */
	lock_page(new_page);
	add_to_swap_cache(new_page, entry);
	rw_swap_page(READ, new_page, wait);
	return new_page;

out_free_page:
	page_cache_release(new_page);
out_free_swap:
	swap_free(entry);
out:
	return found_page;
}

三、lookup_swap_cache函数，如下：

/*
 * Look up the swap-cache page for swap entry 'entry'.
 *
 * Returns the page (unlocked, with the reference taken by
 * find_lock_page() still held) or 0 when no page is found.  A page
 * that lost its swap-cache flag between lookup and lock is released
 * and the lookup is retried.  A page whose mapping is not
 * swapper_space is reported, released, and 0 is returned.
 */
struct page * lookup_swap_cache(swp_entry_t entry)
{
	struct page *found;

#ifdef SWAP_CACHE_INFO
	swap_cache_find_total++;
#endif
	while (1) {
		/*
		 * Right now the pagecache is 32-bit only.  But it's a 32 bit index. =)
		 */
repeat:
		/* entry.val is the swap entry value used as the cache index */
		found = find_lock_page(&swapper_space, entry.val);
		if (!found)
			return 0;
		/*
		 * Though the "found" page was in the swap cache an instant
		 * earlier, it might have been removed by refill_inactive etc.
		 * Re search ... Since find_lock_page grabs a reference on
		 * the page, it can not be reused for anything else, namely
		 * it can not be associated with another swaphandle, so it
		 * is enough to check whether the page is still in the scache.
		 */
		if (!PageSwapCache(found)) {
			UnlockPage(found);
			page_cache_release(found);
			goto repeat;
		}
		if (found->mapping != &swapper_space)
			goto out_bad;
#ifdef SWAP_CACHE_INFO
		swap_cache_find_success++;
#endif
		UnlockPage(found);
		return found;
	}

out_bad:
	/* page is flagged as swap cache but belongs to another mapping */
	printk (KERN_ERR "VM: Found a non-swapper swap page!\n");
	UnlockPage(found);
	page_cache_release(found);
	return 0;
}

find_lock_page函数，如下：

/*
 * Convenience wrapper: calls __find_lock_page() with the hash slot
 * computed by page_hash(mapping, index).  Both arguments are expanded
 * twice.
 */
#define find_lock_page(mapping, index) \
		__find_lock_page(mapping, index, page_hash(mapping, index))

__find_lock_page函数，如下：

/*
 * Find the page for (mapping, offset) on the hash chain starting at
 * *hash, take a reference on it and return it locked.
 *
 * Returns NULL when no such page is on the chain.  If the page has no
 * mapping any more by the time it is locked, it is unlocked and
 * released and the whole lookup is repeated.
 */
struct page * __find_lock_page (struct address_space *mapping,
				unsigned long offset, struct page **hash)
{
	struct page *page;

	/*
	 * We scan the hash list read-only. Addition to and removal from
	 * the hash-list needs a held write-lock.
	 */
repeat:
	spin_lock(&pagecache_lock);
	page = __find_page_nolock(mapping, offset, *hash);// *hash is the head of one hash chain; the chain is searched for the matching page
	if (page) {
		page_cache_get(page);// take a reference (use count plus 1)
		spin_unlock(&pagecache_lock);

		lock_page(page);

		/* Is the page still hashed? Ok, good.. */
		if (page->mapping)
			return page;

		/* Nope: we raced. Release and try again.. */
		UnlockPage(page);
		page_cache_release(page);
		goto repeat;
	}
	spin_unlock(&pagecache_lock);
	return NULL;
}

__find_page_nolock函数，如下：

static inline struct page * __find_page_nolock(struct address_space *mapping, unsigned long offset, struct page *page){	goto inside;	for (;;) {		page = page->next_hash;//从hash表中寻找inside:		if (!page)			goto not_found;		if (page->mapping != mapping)			continue;		if (page->index == offset)			break;	}	/*	 * Touching the page may move it to the active list.	 * If we end up with too few inactive pages, we wake	 * up kswapd.	 */	age_page_up(page);	if (inactive_shortage() > inactive_target / 2 && free_shortage())			wakeup_kswapd(0);not_found:	return page;}

根据页面交换项，在hash表中寻找page结构。

      swapin_readahead(entry);//预读页面      page = read_swap_cache(entry);//真正得到一个页面，这个页面可能从hash表中寻找到，因为上面预读了。或者自己申请页面，并且从盘上将其内容读进来。

read_swap_cache无论是从hash表中找到页面，还是自己申请页面并加入到对应的链表，返回时页面的使用计数都是2。

swapin_readahead预读了很多页面，如果没有被进程认领，那么使用计数为1。

Linux内核源代码情景分析-内存管理之用户页面的换入_html/css_WEB-ITnose

人气教程排行