/*
 *	mm/mremap.c
 *
 *	(C) Copyright 1996 Linus Torvalds
 *
 *	Address space accounting code	<alan@lxorguk.ukuu.org.uk>
 *	(C) Copyright 2002 Red Hat Inc, All Rights Reserved
 */

#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/shm.h>
#include <linux/ksm.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/capability.h>
#include <linux/fs.h>
#include <linux/swapops.h>
#include <linux/highmem.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/mmu_notifier.h>
#include <linux/uaccess.h>
#include <linux/mm-arch-hooks.h>

#include <asm/cacheflush.h>
#include <asm/tlbflush.h>

#include "internal.h"

static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;

	pgd = pgd_offset(mm, addr);
	if (pgd_none_or_clear_bad(pgd))
		return NULL;

	pud = pud_offset(pgd, addr);
	if (pud_none_or_clear_bad(pud))
		return NULL;

	pmd = pmd_offset(pud, addr);
	if (pmd_none(*pmd))
		return NULL;

	return pmd;
}

static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
			    unsigned long addr)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;

	pgd = pgd_offset(mm, addr);
	pud = pud_alloc(mm, pgd, addr);
	if (!pud)
		return NULL;

	pmd = pmd_alloc(mm, pud, addr);
	if (!pmd)
		return NULL;

	VM_BUG_ON(pmd_trans_huge(*pmd));

	return pmd;
}

static void take_rmap_locks(struct vm_area_struct *vma)
{
	if (vma->vm_file)
		i_mmap_lock_write(vma->vm_file->f_mapping);
	if (vma->anon_vma)
		anon_vma_lock_write(vma->anon_vma);
}

static void drop_rmap_locks(struct vm_area_struct *vma)
{
	if (vma->anon_vma)
		anon_vma_unlock_write(vma->anon_vma);
	if (vma->vm_file)
		i_mmap_unlock_write(vma->vm_file->f_mapping);
}

static pte_t move_soft_dirty_pte(pte_t pte)
{
	/*
	 * Set soft dirty bit so we can notice
	 * in userspace the ptes were moved.
	 */
#ifdef CONFIG_MEM_SOFT_DIRTY
	if (pte_present(pte))
		pte = pte_mksoft_dirty(pte);
	else if (is_swap_pte(pte))
		pte = pte_swp_mksoft_dirty(pte);
#endif
	return pte;
}

static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
		unsigned long old_addr, unsigned long old_end,
		struct vm_area_struct *new_vma, pmd_t *new_pmd,
		unsigned long new_addr, bool need_rmap_locks)
{
	struct mm_struct *mm = vma->vm_mm;
	pte_t *old_pte, *new_pte, pte;
	spinlock_t *old_ptl, *new_ptl;
	bool force_flush = false;
	unsigned long len = old_end - old_addr;

	/*
	 * When need_rmap_locks is true, we take the i_mmap_rwsem and anon_vma
	 * locks to ensure that rmap will always observe either the old or the
	 * new ptes. This is the easiest way to avoid races with
	 * truncate_pagecache(), page migration, etc...
	 *
	 * When need_rmap_locks is false, we use other ways to avoid
	 * such races:
	 *
	 * - During exec() shift_arg_pages(), we use a specially tagged vma
	 *   which rmap call sites look for using is_vma_temporary_stack().
	 *
	 * - During mremap(), new_vma is often known to be placed after vma
	 *   in rmap traversal order. This ensures rmap will always observe
	 *   either the old pte, or the new pte, or both (the page table locks
	 *   serialize access to individual ptes, but only rmap traversal
	 *   order guarantees that we won't miss both the old and new ptes).
	 */
	if (need_rmap_locks)
		take_rmap_locks(vma);

	/*
	 * We don't have to worry about the ordering of src and dst
	 * pte locks because exclusive mmap_sem prevents deadlock.
	 */
	old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl);
	new_pte = pte_offset_map(new_pmd, new_addr);
	new_ptl = pte_lockptr(mm, new_pmd);
	if (new_ptl != old_ptl)
		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
	flush_tlb_batched_pending(vma->vm_mm);
	arch_enter_lazy_mmu_mode();

	for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE,
				   new_pte++, new_addr += PAGE_SIZE) {
		if (pte_none(*old_pte))
			continue;

		pte = ptep_get_and_clear(mm, old_addr, old_pte);
		/*
		 * If we are remapping a valid PTE, make sure
		 * to flush TLB before we drop the PTL for the
		 * PTE.
		 *
		 * NOTE! Both old and new PTL matter: the old one
		 * for racing with page_mkclean(), the new one to
		 * make sure the physical page stays valid until
		 * the TLB entry for the old mapping has been
		 * flushed.
		 */
		if (pte_present(pte))
			force_flush = true;
		pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr);
		pte = move_soft_dirty_pte(pte);
		set_pte_at(mm, new_addr, new_pte, pte);
	}

	arch_leave_lazy_mmu_mode();
	if (force_flush)
		flush_tlb_range(vma, old_end - len, old_end);
	if (new_ptl != old_ptl)
		spin_unlock(new_ptl);
	pte_unmap(new_pte - 1);
	pte_unmap_unlock(old_pte - 1, old_ptl);
	if (need_rmap_locks)
		drop_rmap_locks(vma);
}

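/*
 * Cap on how much move_page_tables() hands to a single move_ptes() call:
 * each batch of ptes is moved with the page table lock(s) held, so a small
 * batch keeps lock hold times down and lets the cond_resched() in the
 * caller's loop run at reasonable intervals.
 */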
#define LATENCY_LIMIT	(64 * PAGE_SIZE)

unsigned long move_page_tables(struct vm_area_struct *vma,
		unsigned long old_addr, struct vm_area_struct *new_vma,
		unsigned long new_addr, unsigned long len,
		bool need_rmap_locks)
{
	unsigned long extent, next, old_end;
	pmd_t *old_pmd, *new_pmd;
	unsigned long mmun_start;	/* For mmu_notifiers */
	unsigned long mmun_end;		/* For mmu_notifiers */

	old_end = old_addr + len;
	flush_cache_range(vma, old_addr, old_end);

	mmun_start = old_addr;
	mmun_end = old_end;
	mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end);

	for (; old_addr < old_end; old_addr += extent, new_addr += extent) {
		cond_resched();
		next = (old_addr + PMD_SIZE) & PMD_MASK;
		/* even if next overflowed, extent below will be ok */
		extent = next - old_addr;
		if (extent > old_end - old_addr)
			extent = old_end - old_addr;
		old_pmd = get_old_pmd(vma->vm_mm, old_addr);
		if (!old_pmd)
			continue;
		new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr);
		if (!new_pmd)
			break;
		if (pmd_trans_huge(*old_pmd)) {
			if (extent == HPAGE_PMD_SIZE) {
				bool moved;
				/* See comment in move_ptes() */
				if (need_rmap_locks)
					take_rmap_locks(vma);
				moved = move_huge_pmd(vma, old_addr, new_addr,
						      old_end, old_pmd, new_pmd);
				if (need_rmap_locks)
					drop_rmap_locks(vma);
				if (moved)
					continue;
			}
			split_huge_pmd(vma, old_pmd, old_addr);
			if (pmd_trans_unstable(old_pmd))
				continue;
		}
		if (pte_alloc(new_vma->vm_mm, new_pmd, new_addr))
			break;
		next = (new_addr + PMD_SIZE) & PMD_MASK;
		if (extent > next - new_addr)
			extent = next - new_addr;
		if (extent > LATENCY_LIMIT)
			extent = LATENCY_LIMIT;
		move_ptes(vma, old_pmd, old_addr, old_addr + extent, new_vma,
			  new_pmd, new_addr, need_rmap_locks);
	}

	mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);

	return len + old_addr - old_end;	/* how much done */
}

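/*
 * Move the mapping described by vma from old_addr to new_addr: set up a
 * new vma, move the page tables across, then unmap the old range.  Returns
 * the new address on success; on failure the page tables are moved back
 * and an error value is returned instead.
 */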
static unsigned long move_vma(struct vm_area_struct *vma,
		unsigned long old_addr, unsigned long old_len,
		unsigned long new_len, unsigned long new_addr, bool *locked)
{
	struct mm_struct *mm = vma->vm_mm;
	struct vm_area_struct *new_vma;
	unsigned long vm_flags = vma->vm_flags;
	unsigned long new_pgoff;
	unsigned long moved_len;
	unsigned long excess = 0;
	unsigned long hiwater_vm;
	int split = 0;
	int err = 0;
	bool need_rmap_locks;

	/*
	 * We'd prefer to avoid failure later on in do_munmap:
	 * which may split one vma into three before unmapping.
	 */
	if (mm->map_count >= sysctl_max_map_count - 3)
		return -ENOMEM;

	/*
	 * Advise KSM to break any KSM pages in the area to be moved:
	 * it would be confusing if they were to turn up at the new
	 * location, where they happen to coincide with different KSM
	 * pages recently unmapped. But leave vma->vm_flags as it was,
	 * so KSM can come around to merge on vma and new_vma afterwards.
	 */
	err = ksm_madvise(vma, old_addr, old_addr + old_len,
			  MADV_UNMERGEABLE, &vm_flags);
	if (err)
		return err;

	new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT);
	new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff,
			   &need_rmap_locks);
	if (!new_vma)
		return -ENOMEM;

	moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len,
				     need_rmap_locks);
	if (moved_len < old_len) {
		err = -ENOMEM;
	} else if (vma->vm_ops && vma->vm_ops->mremap) {
		err = vma->vm_ops->mremap(new_vma);
	}

	if (unlikely(err)) {
		/*
		 * On error, move entries back from new area to old,
		 * which will succeed since page tables still there,
		 * and then proceed to unmap new area instead of old.
		 */
		move_page_tables(new_vma, new_addr, vma, old_addr, moved_len,
				 true);
		vma = new_vma;
		old_len = new_len;
		old_addr = new_addr;
		new_addr = err;
	} else {
		arch_remap(mm, old_addr, old_addr + old_len,
			   new_addr, new_addr + new_len);
	}

	/* Conceal VM_ACCOUNT so old reservation is not undone */
	if (vm_flags & VM_ACCOUNT) {
		vma->vm_flags &= ~VM_ACCOUNT;
		excess = vma->vm_end - vma->vm_start - old_len;
		if (old_addr > vma->vm_start &&
		    old_addr + old_len < vma->vm_end)
			split = 1;
	}

	/*
	 * If we failed to move page tables we still do total_vm increment
	 * since do_munmap() will decrement it by old_len == new_len.
	 *
	 * Since total_vm is about to be raised artificially high for a
	 * moment, we need to restore high watermark afterwards: if stats
	 * are taken meanwhile, total_vm and hiwater_vm appear too high.
	 * If this were a serious issue, we'd add a flag to do_munmap().
	 */
	hiwater_vm = mm->hiwater_vm;
	vm_stat_account(mm, vma->vm_flags, new_len >> PAGE_SHIFT);

	/* Tell pfnmap has moved from this vma */
	if (unlikely(vma->vm_flags & VM_PFNMAP))
		untrack_pfn_moved(vma);

	if (do_munmap(mm, old_addr, old_len) < 0) {
		/* OOM: unable to split vma, just get accounts right */
		vm_unacct_memory(excess >> PAGE_SHIFT);
		excess = 0;
	}
	mm->hiwater_vm = hiwater_vm;

	/* Restore VM_ACCOUNT if one or two pieces of vma left */
	if (excess) {
		vma->vm_flags |= VM_ACCOUNT;
		if (split)
			vma->vm_next->vm_flags |= VM_ACCOUNT;
	}

	if (vm_flags & VM_LOCKED) {
		mm->locked_vm += new_len >> PAGE_SHIFT;
		*locked = true;
	}

	return new_addr;
}

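/*
 * Look up the vma covering addr and check that it can legitimately be
 * resized from old_len to new_len (not hugetlb, within the vma, memlock
 * and commit limits respected).  Pages charged against the commit limit
 * for the growth, if any, are returned in *p.
 */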
static struct vm_area_struct *vma_to_resize(unsigned long addr,
	unsigned long old_len, unsigned long new_len, unsigned long *p)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma = find_vma(mm, addr);
	unsigned long pgoff;

	if (!vma || vma->vm_start > addr)
		return ERR_PTR(-EFAULT);

	if (is_vm_hugetlb_page(vma))
		return ERR_PTR(-EINVAL);

	/* We can't remap across vm area boundaries */
	if (old_len > vma->vm_end - addr)
		return ERR_PTR(-EFAULT);

	if (new_len == old_len)
		return vma;

	/* Need to be careful about a growing mapping */
	pgoff = (addr - vma->vm_start) >> PAGE_SHIFT;
	pgoff += vma->vm_pgoff;
	if (pgoff + (new_len >> PAGE_SHIFT) < pgoff)
		return ERR_PTR(-EINVAL);

	if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP))
		return ERR_PTR(-EFAULT);

	if (vma->vm_flags & VM_LOCKED) {
		unsigned long locked, lock_limit;
		locked = mm->locked_vm << PAGE_SHIFT;
		lock_limit = rlimit(RLIMIT_MEMLOCK);
		locked += new_len - old_len;
		if (locked > lock_limit && !capable(CAP_IPC_LOCK))
			return ERR_PTR(-EAGAIN);
	}

	if (!may_expand_vm(mm, vma->vm_flags,
				(new_len - old_len) >> PAGE_SHIFT))
		return ERR_PTR(-ENOMEM);

	if (vma->vm_flags & VM_ACCOUNT) {
		unsigned long charged = (new_len - old_len) >> PAGE_SHIFT;
		if (security_vm_enough_memory_mm(mm, charged))
			return ERR_PTR(-ENOMEM);
		*p = charged;
	}

	return vma;
}

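/*
 * Handle the MREMAP_FIXED case: unmap whatever currently occupies the
 * destination range, trim the source if it is shrinking, then move it to
 * the caller-supplied new_addr via move_vma().
 */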
static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
		unsigned long new_addr, unsigned long new_len, bool *locked)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	unsigned long ret = -EINVAL;
	unsigned long charged = 0;
	unsigned long map_flags;

	if (offset_in_page(new_addr))
		goto out;

	if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len)
		goto out;

	/* Ensure the old/new locations do not overlap */
	if (addr + old_len > new_addr && new_addr + new_len > addr)
		goto out;

	ret = do_munmap(mm, new_addr, new_len);
	if (ret)
		goto out;

	if (old_len >= new_len) {
		ret = do_munmap(mm, addr+new_len, old_len - new_len);
		if (ret && old_len != new_len)
			goto out;
		old_len = new_len;
	}

	vma = vma_to_resize(addr, old_len, new_len, &charged);
	if (IS_ERR(vma)) {
		ret = PTR_ERR(vma);
		goto out;
	}

	map_flags = MAP_FIXED;
	if (vma->vm_flags & VM_MAYSHARE)
		map_flags |= MAP_SHARED;

	ret = get_unmapped_area(vma->vm_file, new_addr, new_len, vma->vm_pgoff +
				((addr - vma->vm_start) >> PAGE_SHIFT),
				map_flags);
	if (offset_in_page(ret))
		goto out1;

	ret = move_vma(vma, addr, old_len, new_len, new_addr, locked);
	if (!(offset_in_page(ret)))
		goto out;
out1:
	vm_unacct_memory(charged);

out:
	return ret;
}

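/*
 * Can the vma grow in place by delta bytes?  Requires that the enlarged
 * range does not overflow, does not run into the next vma, and is still
 * acceptable to get_unmapped_area().
 */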
static int vma_expandable(struct vm_area_struct *vma, unsigned long delta)
{
	unsigned long end = vma->vm_end + delta;
	if (end < vma->vm_end) /* overflow */
		return 0;
	if (vma->vm_next && vma->vm_next->vm_start < end) /* intersection */
		return 0;
	if (get_unmapped_area(NULL, vma->vm_start, end - vma->vm_start,
			      0, MAP_FIXED) & ~PAGE_MASK)
		return 0;
	return 1;
}

/*
 * Expand (or shrink) an existing mapping, potentially moving it at the
 * same time (controlled by the MREMAP_MAYMOVE flag and available VM space)
 *
 * MREMAP_FIXED option added 5-Dec-1999 by Benjamin LaHaise
 * This option implies MREMAP_MAYMOVE.
 */
SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
		unsigned long, new_len, unsigned long, flags,
		unsigned long, new_addr)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	unsigned long ret = -EINVAL;
	unsigned long charged = 0;
	bool locked = false;

	if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE))
		return ret;

	if (flags & MREMAP_FIXED && !(flags & MREMAP_MAYMOVE))
		return ret;

	if (offset_in_page(addr))
		return ret;

	old_len = PAGE_ALIGN(old_len);
	new_len = PAGE_ALIGN(new_len);

	/*
	 * We allow a zero old-len as a special case
	 * for DOS-emu "duplicate shm area" thing. But
	 * a zero new-len is nonsensical.
	 */
	if (!new_len)
		return ret;

	if (down_write_killable(&current->mm->mmap_sem))
		return -EINTR;

	if (flags & MREMAP_FIXED) {
		ret = mremap_to(addr, old_len, new_addr, new_len,
				&locked);
		goto out;
	}

	/*
	 * Always allow a shrinking remap: that just unmaps
	 * the unnecessary pages..
	 * do_munmap does all the needed commit accounting
	 */
	if (old_len >= new_len) {
		ret = do_munmap(mm, addr+new_len, old_len - new_len);
		if (ret && old_len != new_len)
			goto out;
		ret = addr;
		goto out;
	}

	/*
	 * Ok, we need to grow..
	 */
	vma = vma_to_resize(addr, old_len, new_len, &charged);
	if (IS_ERR(vma)) {
		ret = PTR_ERR(vma);
		goto out;
	}

	/* old_len exactly to the end of the area..
	 */
	if (old_len == vma->vm_end - addr) {
		/* can we just expand the current mapping? */
		if (vma_expandable(vma, new_len - old_len)) {
			int pages = (new_len - old_len) >> PAGE_SHIFT;

			if (vma_adjust(vma, vma->vm_start, addr + new_len,
				       vma->vm_pgoff, NULL)) {
				ret = -ENOMEM;
				goto out;
			}

			vm_stat_account(mm, vma->vm_flags, pages);
			if (vma->vm_flags & VM_LOCKED) {
				mm->locked_vm += pages;
				locked = true;
				new_addr = addr;
			}
			ret = addr;
			goto out;
		}
	}

	/*
	 * We weren't able to just expand or shrink the area,
	 * we need to create a new one and move it..
	 */
	ret = -ENOMEM;
	if (flags & MREMAP_MAYMOVE) {
		unsigned long map_flags = 0;
		if (vma->vm_flags & VM_MAYSHARE)
			map_flags |= MAP_SHARED;

		new_addr = get_unmapped_area(vma->vm_file, 0, new_len,
					vma->vm_pgoff +
					((addr - vma->vm_start) >> PAGE_SHIFT),
					map_flags);
		if (offset_in_page(new_addr)) {
			ret = new_addr;
			goto out;
		}

		ret = move_vma(vma, addr, old_len, new_len, new_addr, &locked);
	}
out:
	if (offset_in_page(ret)) {
		vm_unacct_memory(charged);
		locked = 0;
	}
	up_write(&current->mm->mmap_sem);
	if (locked && new_len > old_len)
		mm_populate(new_addr + old_len, new_len - old_len);
	return ret;
}
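The syscall defined above is what userspace reaches through mremap(2). As a rough illustration of the grow-and-possibly-move path, here is a minimal userspace sketch (ordinary C, not part of mm/mremap.c; the sizes and the fill byte are arbitrary choices for the example):

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	size_t old_len = 4096, new_len = 8192;

	/* Anonymous mapping we will ask the kernel to grow. */
	void *p = mmap(NULL, old_len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	memset(p, 0x5a, old_len);

	/*
	 * MREMAP_MAYMOVE lets the kernel relocate the vma (the move_vma()
	 * path above) if it cannot simply expand it in place.
	 */
	void *q = mremap(p, old_len, new_len, MREMAP_MAYMOVE);
	if (q == MAP_FAILED) {
		perror("mremap");
		return 1;
	}
	printf("old=%p new=%p first byte still 0x%02x\n",
	       p, q, ((unsigned char *)q)[0]);

	munmap(q, new_len);
	return 0;
}

Passing MREMAP_FIXED | MREMAP_MAYMOVE together with a fifth address argument would instead take the mremap_to() path above.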