kernel/common.git - Unnamed repository; edit this file 'description' to name the repository.

1 /*
2  * Simple NUMA memory policy for the Linux kernel.
3  *
4  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
5  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
6  * Subject to the GNU Public License, version 2.
7  *
8  * NUMA policy allows the user to give hints in which node(s) memory should
9  * be allocated.
10  *
11  * Support four policies per VMA and per process:
12  *
13  * The VMA policy has priority over the process policy for a page fault.
14  *
15  * interleave     Allocate memory interleaved over a set of nodes,
16  *                with normal fallback if it fails.
17  *                For VMA based allocations this interleaves based on the
18  *                offset into the backing object or offset into the mapping
 *                for anonymous memory. For process policy a process counter
20  *                is used.
21  *
22  * bind           Only allocate memory on a specific set of nodes,
23  *                no fallback.
24  *                FIXME: memory is allocated starting with the first node
25  *                to the last. It would be better if bind would truly restrict
26  *                the allocation to memory nodes instead
27  *
28  * preferred       Try a specific node first before normal fallback.
29  *                As a special case NUMA_NO_NODE here means do the allocation
30  *                on the local CPU. This is normally identical to default,
31  *                but useful to set in a VMA when you have a non default
32  *                process policy.
33  *
34  * default        Allocate on the local node first, or when on a VMA
35  *                use the process policy. This is what Linux always did
36  *		  in a NUMA aware kernel and still does by, ahem, default.
37  *
38  * The process policy is applied for most non interrupt memory allocations
39  * in that process' context. Interrupts ignore the policies and always
40  * try to allocate on the local CPU. The VMA policy is only applied for memory
41  * allocations for a VMA in the VM.
42  *
43  * Currently there are a few corner cases in swapping where the policy
44  * is not applied, but the majority should be handled. When process policy
45  * is used it is not remembered over swap outs/swap ins.
46  *
47  * Only the highest zone in the zone hierarchy gets policied. Allocations
48  * requesting a lower zone just use default policy. This implies that
 * on systems with highmem, kernel lowmem allocations don't get policied.
50  * Same with GFP_DMA allocations.
51  *
52  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
53  * all users and remembered even when nobody has memory mapped.
54  */
55
56 /* Notebook:
57    fix mmap readahead to honour policy and enable policy for any page cache
58    object
59    statistics for bigpages
60    global policy for page cache? currently it uses process policy. Requires
61    first item above.
62    handle mremap for shared memory (currently ignored for the policy)
63    grows down?
64    make bind policy root only? It can trigger oom much faster and the
65    kernel is not always grateful with that.
66 */
67
68 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
69
70 #include <linux/mempolicy.h>
71 #include <linux/mm.h>
72 #include <linux/highmem.h>
73 #include <linux/hugetlb.h>
74 #include <linux/kernel.h>
75 #include <linux/sched.h>
76 #include <linux/nodemask.h>
77 #include <linux/cpuset.h>
78 #include <linux/slab.h>
79 #include <linux/string.h>
80 #include <linux/export.h>
81 #include <linux/nsproxy.h>
82 #include <linux/interrupt.h>
83 #include <linux/init.h>
84 #include <linux/compat.h>
85 #include <linux/swap.h>
86 #include <linux/seq_file.h>
87 #include <linux/proc_fs.h>
88 #include <linux/migrate.h>
89 #include <linux/ksm.h>
90 #include <linux/rmap.h>
91 #include <linux/security.h>
92 #include <linux/syscalls.h>
93 #include <linux/ctype.h>
94 #include <linux/mm_inline.h>
95 #include <linux/mmu_notifier.h>
96 #include <linux/printk.h>
97
98 #include <asm/tlbflush.h>
99 #include <asm/uaccess.h>
100
101 #include "internal.h"
102
103 /* Internal flags */
104 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
105 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */
106
107 static struct kmem_cache *policy_cache;
108 static struct kmem_cache *sn_cache;
109
110 /* Highest zone. An specific allocation for a zone below that is not
111    policied. */
112 enum zone_type policy_zone = 0;
113
114 /*
115  * run-time system-wide default policy => local allocation
116  */
117 static struct mempolicy default_policy = {
118 	.refcnt = ATOMIC_INIT(1), /* never free it */
119 	.mode = MPOL_PREFERRED,
120 	.flags = MPOL_F_LOCAL,
121 };
122
123 static struct mempolicy preferred_node_policy[MAX_NUMNODES];
124
125 struct mempolicy *get_task_policy(struct task_struct *p)
126 {
127 	struct mempolicy *pol = p->mempolicy;
128 	int node;
129
130 	if (pol)
131 		return pol;
132
133 	node = numa_node_id();
134 	if (node != NUMA_NO_NODE) {
135 		pol = &preferred_node_policy[node];
136 		/* preferred_node_policy is not initialised early in boot */
137 		if (pol->mode)
138 			return pol;
139 	}
140
141 	return &default_policy;
142 }
143
144 static const struct mempolicy_operations {
145 	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
146 	/*
147 	 * If read-side task has no lock to protect task->mempolicy, write-side
148 	 * task will rebind the task->mempolicy by two step. The first step is
149 	 * setting all the newly nodes, and the second step is cleaning all the
150 	 * disallowed nodes. In this way, we can avoid finding no node to alloc
151 	 * page.
152 	 * If we have a lock to protect task->mempolicy in read-side, we do
153 	 * rebind directly.
154 	 *
155 	 * step:
156 	 * 	MPOL_REBIND_ONCE - do rebind work at once
157 	 * 	MPOL_REBIND_STEP1 - set all the newly nodes
158 	 * 	MPOL_REBIND_STEP2 - clean all the disallowed nodes
159 	 */
160 	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes,
161 			enum mpol_rebind_step step);
162 } mpol_ops[MPOL_MAX];
163
164 static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
165 {
166 	return pol->flags & MPOL_MODE_FLAGS;
167 }
168
169 static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
170 				   const nodemask_t *rel)
171 {
172 	nodemask_t tmp;
173 	nodes_fold(tmp, *orig, nodes_weight(*rel));
174 	nodes_onto(*ret, tmp, *rel);
175 }
176
177 static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
178 {
179 	if (nodes_empty(*nodes))
180 		return -EINVAL;
181 	pol->v.nodes = *nodes;
182 	return 0;
183 }
184
185 static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
186 {
187 	if (!nodes)
188 		pol->flags |= MPOL_F_LOCAL;	/* local allocation */
189 	else if (nodes_empty(*nodes))
190 		return -EINVAL;			/*  no allowed nodes */
191 	else
192 		pol->v.preferred_node = first_node(*nodes);
193 	return 0;
194 }
195
196 static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
197 {
198 	if (nodes_empty(*nodes))
199 		return -EINVAL;
200 	pol->v.nodes = *nodes;
201 	return 0;
202 }
203
204 /*
205  * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
206  * any, for the new policy.  mpol_new() has already validated the nodes
207  * parameter with respect to the policy mode and flags.  But, we need to
208  * handle an empty nodemask with MPOL_PREFERRED here.
209  *
210  * Must be called holding task's alloc_lock to protect task's mems_allowed
211  * and mempolicy.  May also be called holding the mmap_semaphore for write.
212  */
213 static int mpol_set_nodemask(struct mempolicy *pol,
214 		     const nodemask_t *nodes, struct nodemask_scratch *nsc)
215 {
216 	int ret;
217
218 	/* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
219 	if (pol == NULL)
220 		return 0;
221 	/* Check N_MEMORY */
222 	nodes_and(nsc->mask1,
223 		  cpuset_current_mems_allowed, node_states[N_MEMORY]);
224
225 	VM_BUG_ON(!nodes);
226 	if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
227 		nodes = NULL;	/* explicit local allocation */
228 	else {
229 		if (pol->flags & MPOL_F_RELATIVE_NODES)
230 			mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
231 		else
232 			nodes_and(nsc->mask2, *nodes, nsc->mask1);
233
234 		if (mpol_store_user_nodemask(pol))
235 			pol->w.user_nodemask = *nodes;
236 		else
237 			pol->w.cpuset_mems_allowed =
238 						cpuset_current_mems_allowed;
239 	}
240
241 	if (nodes)
242 		ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
243 	else
244 		ret = mpol_ops[pol->mode].create(pol, NULL);
245 	return ret;
246 }
247
248 /*
249  * This function just creates a new policy, does some check and simple
250  * initialization. You must invoke mpol_set_nodemask() to set nodes.
251  */
252 static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
253 				  nodemask_t *nodes)
254 {
255 	struct mempolicy *policy;
256
257 	pr_debug("setting mode %d flags %d nodes[0] %lx\n",
258 		 mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);
259
260 	if (mode == MPOL_DEFAULT) {
261 		if (nodes && !nodes_empty(*nodes))
262 			return ERR_PTR(-EINVAL);
263 		return NULL;
264 	}
265 	VM_BUG_ON(!nodes);
266
267 	/*
268 	 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
269 	 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
270 	 * All other modes require a valid pointer to a non-empty nodemask.
271 	 */
272 	if (mode == MPOL_PREFERRED) {
273 		if (nodes_empty(*nodes)) {
274 			if (((flags & MPOL_F_STATIC_NODES) ||
275 			     (flags & MPOL_F_RELATIVE_NODES)))
276 				return ERR_PTR(-EINVAL);
277 		}
278 	} else if (mode == MPOL_LOCAL) {
279 		if (!nodes_empty(*nodes))
280 			return ERR_PTR(-EINVAL);
281 		mode = MPOL_PREFERRED;
282 	} else if (nodes_empty(*nodes))
283 		return ERR_PTR(-EINVAL);
284 	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
285 	if (!policy)
286 		return ERR_PTR(-ENOMEM);
287 	atomic_set(&policy->refcnt, 1);
288 	policy->mode = mode;
289 	policy->flags = flags;
290
291 	return policy;
292 }
293
294 /* Slow path of a mpol destructor. */
295 void __mpol_put(struct mempolicy *p)
296 {
297 	if (!atomic_dec_and_test(&p->refcnt))
298 		return;
299 	kmem_cache_free(policy_cache, p);
300 }
301
302 static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes,
303 				enum mpol_rebind_step step)
304 {
305 }
306
307 /*
308  * step:
309  * 	MPOL_REBIND_ONCE  - do rebind work at once
310  * 	MPOL_REBIND_STEP1 - set all the newly nodes
311  * 	MPOL_REBIND_STEP2 - clean all the disallowed nodes
312  */
313 static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes,
314 				 enum mpol_rebind_step step)
315 {
316 	nodemask_t tmp;
317
318 	if (pol->flags & MPOL_F_STATIC_NODES)
319 		nodes_and(tmp, pol->w.user_nodemask, *nodes);
320 	else if (pol->flags & MPOL_F_RELATIVE_NODES)
321 		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
322 	else {
323 		/*
324 		 * if step == 1, we use ->w.cpuset_mems_allowed to cache the
325 		 * result
326 		 */
327 		if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP1) {
328 			nodes_remap(tmp, pol->v.nodes,
329 					pol->w.cpuset_mems_allowed, *nodes);
330 			pol->w.cpuset_mems_allowed = step ? tmp : *nodes;
331 		} else if (step == MPOL_REBIND_STEP2) {
332 			tmp = pol->w.cpuset_mems_allowed;
333 			pol->w.cpuset_mems_allowed = *nodes;
334 		} else
335 			BUG();
336 	}
337
338 	if (nodes_empty(tmp))
339 		tmp = *nodes;
340
341 	if (step == MPOL_REBIND_STEP1)
342 		nodes_or(pol->v.nodes, pol->v.nodes, tmp);
343 	else if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP2)
344 		pol->v.nodes = tmp;
345 	else
346 		BUG();
347
348 	if (!node_isset(current->il_next, tmp)) {
349 		current->il_next = next_node_in(current->il_next, tmp);
350 		if (current->il_next >= MAX_NUMNODES)
351 			current->il_next = numa_node_id();
352 	}
353 }
354
355 static void mpol_rebind_preferred(struct mempolicy *pol,
356 				  const nodemask_t *nodes,
357 				  enum mpol_rebind_step step)
358 {
359 	nodemask_t tmp;
360
361 	if (pol->flags & MPOL_F_STATIC_NODES) {
362 		int node = first_node(pol->w.user_nodemask);
363
364 		if (node_isset(node, *nodes)) {
365 			pol->v.preferred_node = node;
366 			pol->flags &= ~MPOL_F_LOCAL;
367 		} else
368 			pol->flags |= MPOL_F_LOCAL;
369 	} else if (pol->flags & MPOL_F_RELATIVE_NODES) {
370 		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
371 		pol->v.preferred_node = first_node(tmp);
372 	} else if (!(pol->flags & MPOL_F_LOCAL)) {
373 		pol->v.preferred_node = node_remap(pol->v.preferred_node,
374 						   pol->w.cpuset_mems_allowed,
375 						   *nodes);
376 		pol->w.cpuset_mems_allowed = *nodes;
377 	}
378 }
379
380 /*
381  * mpol_rebind_policy - Migrate a policy to a different set of nodes
382  *
383  * If read-side task has no lock to protect task->mempolicy, write-side
384  * task will rebind the task->mempolicy by two step. The first step is
385  * setting all the newly nodes, and the second step is cleaning all the
386  * disallowed nodes. In this way, we can avoid finding no node to alloc
387  * page.
388  * If we have a lock to protect task->mempolicy in read-side, we do
389  * rebind directly.
390  *
391  * step:
392  * 	MPOL_REBIND_ONCE  - do rebind work at once
393  * 	MPOL_REBIND_STEP1 - set all the newly nodes
394  * 	MPOL_REBIND_STEP2 - clean all the disallowed nodes
395  */
396 static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask,
397 				enum mpol_rebind_step step)
398 {
399 	if (!pol)
400 		return;
401 	if (!mpol_store_user_nodemask(pol) && step == MPOL_REBIND_ONCE &&
402 	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
403 		return;
404
405 	if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING))
406 		return;
407
408 	if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING))
409 		BUG();
410
411 	if (step == MPOL_REBIND_STEP1)
412 		pol->flags |= MPOL_F_REBINDING;
413 	else if (step == MPOL_REBIND_STEP2)
414 		pol->flags &= ~MPOL_F_REBINDING;
415 	else if (step >= MPOL_REBIND_NSTEP)
416 		BUG();
417
418 	mpol_ops[pol->mode].rebind(pol, newmask, step);
419 }
420
421 /*
422  * Wrapper for mpol_rebind_policy() that just requires task
423  * pointer, and updates task mempolicy.
424  *
425  * Called with task's alloc_lock held.
426  */
427
428 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new,
429 			enum mpol_rebind_step step)
430 {
431 	mpol_rebind_policy(tsk->mempolicy, new, step);
432 }
433
434 /*
435  * Rebind each vma in mm to new nodemask.
436  *
437  * Call holding a reference to mm.  Takes mm->mmap_sem during call.
438  */
439
440 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
441 {
442 	struct vm_area_struct *vma;
443
444 	down_write(&mm->mmap_sem);
445 	for (vma = mm->mmap; vma; vma = vma->vm_next)
446 		mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE);
447 	up_write(&mm->mmap_sem);
448 }
449
450 static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
451 	[MPOL_DEFAULT] = {
452 		.rebind = mpol_rebind_default,
453 	},
454 	[MPOL_INTERLEAVE] = {
455 		.create = mpol_new_interleave,
456 		.rebind = mpol_rebind_nodemask,
457 	},
458 	[MPOL_PREFERRED] = {
459 		.create = mpol_new_preferred,
460 		.rebind = mpol_rebind_preferred,
461 	},
462 	[MPOL_BIND] = {
463 		.create = mpol_new_bind,
464 		.rebind = mpol_rebind_nodemask,
465 	},
466 };
467
468 static void migrate_page_add(struct page *page, struct list_head *pagelist,
469 				unsigned long flags);
470
471 struct queue_pages {
472 	struct list_head *pagelist;
473 	unsigned long flags;
474 	nodemask_t *nmask;
475 	struct vm_area_struct *prev;
476 };
477
478 /*
479  * Scan through pages checking if pages follow certain conditions,
480  * and move them to the pagelist if they do.
481  */
482 static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
483 			unsigned long end, struct mm_walk *walk)
484 {
485 	struct vm_area_struct *vma = walk->vma;
486 	struct page *page;
487 	struct queue_pages *qp = walk->private;
488 	unsigned long flags = qp->flags;
489 	int nid, ret;
490 	pte_t *pte;
491 	spinlock_t *ptl;
492
493 	if (pmd_trans_huge(*pmd)) {
494 		ptl = pmd_lock(walk->mm, pmd);
495 		if (pmd_trans_huge(*pmd)) {
496 			page = pmd_page(*pmd);
497 			if (is_huge_zero_page(page)) {
498 				spin_unlock(ptl);
499 				split_huge_pmd(vma, pmd, addr);
500 			} else {
501 				get_page(page);
502 				spin_unlock(ptl);
503 				lock_page(page);
504 				ret = split_huge_page(page);
505 				unlock_page(page);
506 				put_page(page);
507 				if (ret)
508 					return 0;
509 			}
510 		} else {
511 			spin_unlock(ptl);
512 		}
513 	}
514
515 	if (pmd_trans_unstable(pmd))
516 		return 0;
517 retry:
518 	pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
519 	for (; addr != end; pte++, addr += PAGE_SIZE) {
520 		if (!pte_present(*pte))
521 			continue;
522 		page = vm_normal_page(vma, addr, *pte);
523 		if (!page)
524 			continue;
525 		/*
526 		 * vm_normal_page() filters out zero pages, but there might
527 		 * still be PageReserved pages to skip, perhaps in a VDSO.
528 		 */
529 		if (PageReserved(page))
530 			continue;
531 		nid = page_to_nid(page);
532 		if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT))
533 			continue;
534 		if (PageTransCompound(page)) {
535 			get_page(page);
536 			pte_unmap_unlock(pte, ptl);
537 			lock_page(page);
538 			ret = split_huge_page(page);
539 			unlock_page(page);
540 			put_page(page);
541 			/* Failed to split -- skip. */
542 			if (ret) {
543 				pte = pte_offset_map_lock(walk->mm, pmd,
544 						addr, &ptl);
545 				continue;
546 			}
547 			goto retry;
548 		}
549
550 		if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
551 			if (!vma_migratable(vma))
552 				break;
553 			migrate_page_add(page, qp->pagelist, flags);
554 		} else
555 			break;
556 	}
557 	pte_unmap_unlock(pte - 1, ptl);
558 	cond_resched();
559 	return addr != end ? -EIO : 0;
560 }
561
562 static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
563 			       unsigned long addr, unsigned long end,
564 			       struct mm_walk *walk)
565 {
566 #ifdef CONFIG_HUGETLB_PAGE
567 	struct queue_pages *qp = walk->private;
568 	unsigned long flags = qp->flags;
569 	int nid;
570 	struct page *page;
571 	spinlock_t *ptl;
572 	pte_t entry;
573
574 	ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
575 	entry = huge_ptep_get(pte);
576 	if (!pte_present(entry))
577 		goto unlock;
578 	page = pte_page(entry);
579 	nid = page_to_nid(page);
580 	if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT))
581 		goto unlock;
582 	/* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
583 	if (flags & (MPOL_MF_MOVE_ALL) ||
584 	    (flags & MPOL_MF_MOVE && page_mapcount(page) == 1))
585 		isolate_huge_page(page, qp->pagelist);
586 unlock:
587 	spin_unlock(ptl);
588 #else
589 	BUG();
590 #endif
591 	return 0;
592 }
593
#ifdef CONFIG_NUMA_BALANCING
/*
 * change_prot_numa - make [@addr, @end) of @vma inaccessible
 *
 * The range is switched to PAGE_NONE so that later accesses raise NUMA
 * hinting faults, which clear the marking again.  Depending on these
 * faults, pages may be migrated for better NUMA placement.
 *
 * This is assuming that NUMA faults are handled using PROT_NONE. If
 * an architecture makes a different choice, it will need further
 * changes to the core.
 *
 * Returns the number of updates reported by change_protection(); a
 * non-zero count is also added to the NUMA_PTE_UPDATES event counter.
 */
unsigned long change_prot_numa(struct vm_area_struct *vma,
			unsigned long addr, unsigned long end)
{
	int nr_updated = change_protection(vma, addr, end, PAGE_NONE, 0, 1);

	if (nr_updated)
		count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);

	return nr_updated;
}
#else
/* Without NUMA balancing there is nothing to mark. */
static unsigned long change_prot_numa(struct vm_area_struct *vma,
			unsigned long addr, unsigned long end)
{
	return 0;
}
#endif /* CONFIG_NUMA_BALANCING */
622
623 static int queue_pages_test_walk(unsigned long start, unsigned long end,
624 				struct mm_walk *walk)
625 {
626 	struct vm_area_struct *vma = walk->vma;
627 	struct queue_pages *qp = walk->private;
628 	unsigned long endvma = vma->vm_end;
629 	unsigned long flags = qp->flags;
630
631 	/*
632 	 * Need check MPOL_MF_STRICT to return -EIO if possible
633 	 * regardless of vma_migratable
634 	 */
635 	if (!vma_migratable(vma) &&
636 	    !(flags & MPOL_MF_STRICT))
637 		return 1;
638
639 	if (endvma > end)
640 		endvma = end;
641 	if (vma->vm_start > start)
642 		start = vma->vm_start;
643
644 	if (!(flags & MPOL_MF_DISCONTIG_OK)) {
645 		if (!vma->vm_next && vma->vm_end < end)
646 			return -EFAULT;
647 		if (qp->prev && qp->prev->vm_end < vma->vm_start)
648 			return -EFAULT;
649 	}
650
651 	qp->prev = vma;
652
653 	if (flags & MPOL_MF_LAZY) {
654 		/* Similar to task_numa_work, skip inaccessible VMAs */
655 		if (!is_vm_hugetlb_page(vma) &&
656 			(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) &&
657 			!(vma->vm_flags & VM_MIXEDMAP))
658 			change_prot_numa(vma, start, endvma);
659 		return 1;
660 	}
661
662 	/* queue pages from current vma */
663 	if (flags & MPOL_MF_VALID)
664 		return 0;
665 	return 1;
666 }
667
668 /*
669  * Walk through page tables and collect pages to be migrated.
670  *
671  * If pages found in a given range are on a set of nodes (determined by
672  * @nodes and @flags,) it's isolated and queued to the pagelist which is
673  * passed via @private.)
674  */
675 static int
676 queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
677 		nodemask_t *nodes, unsigned long flags,
678 		struct list_head *pagelist)
679 {
680 	struct queue_pages qp = {
681 		.pagelist = pagelist,
682 		.flags = flags,
683 		.nmask = nodes,
684 		.prev = NULL,
685 	};
686 	struct mm_walk queue_pages_walk = {
687 		.hugetlb_entry = queue_pages_hugetlb,
688 		.pmd_entry = queue_pages_pte_range,
689 		.test_walk = queue_pages_test_walk,
690 		.mm = mm,
691 		.private = &qp,
692 	};
693
694 	return walk_page_range(start, end, &queue_pages_walk);
695 }
696
697 /*
698  * Apply policy to a single VMA
699  * This must be called with the mmap_sem held for writing.
700  */
701 static int vma_replace_policy(struct vm_area_struct *vma,
702 						struct mempolicy *pol)
703 {
704 	int err;
705 	struct mempolicy *old;
706 	struct mempolicy *new;
707
708 	pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
709 		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
710 		 vma->vm_ops, vma->vm_file,
711 		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
712
713 	new = mpol_dup(pol);
714 	if (IS_ERR(new))
715 		return PTR_ERR(new);
716
717 	if (vma->vm_ops && vma->vm_ops->set_policy) {
718 		err = vma->vm_ops->set_policy(vma, new);
719 		if (err)
720 			goto err_out;
721 	}
722
723 	old = vma->vm_policy;
724 	vma->vm_policy = new; /* protected by mmap_sem */
725 	mpol_put(old);
726
727 	return 0;
728  err_out:
729 	mpol_put(new);
730 	return err;
731 }
732
733 /* Step 2: apply policy to a range and do splits. */
734 static int mbind_range(struct mm_struct *mm, unsigned long start,
735 		       unsigned long end, struct mempolicy *new_pol)
736 {
737 	struct vm_area_struct *next;
738 	struct vm_area_struct *prev;
739 	struct vm_area_struct *vma;
740 	int err = 0;
741 	pgoff_t pgoff;
742 	unsigned long vmstart;
743 	unsigned long vmend;
744
745 	vma = find_vma(mm, start);
746 	if (!vma || vma->vm_start > start)
747 		return -EFAULT;
748
749 	prev = vma->vm_prev;
750 	if (start > vma->vm_start)
751 		prev = vma;
752
753 	for (; vma && vma->vm_start < end; prev = vma, vma = next) {
754 		next = vma->vm_next;
755 		vmstart = max(start, vma->vm_start);
756 		vmend   = min(end, vma->vm_end);
757
758 		if (mpol_equal(vma_policy(vma), new_pol))
759 			continue;
760
761 		pgoff = vma->vm_pgoff +
762 			((vmstart - vma->vm_start) >> PAGE_SHIFT);
763 		prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
764 				 vma->anon_vma, vma->vm_file, pgoff,
765 				 new_pol, vma->vm_userfaultfd_ctx,
766 				 vma_get_anon_name(vma));
767 		if (prev) {
768 			vma = prev;
769 			next = vma->vm_next;
770 			if (mpol_equal(vma_policy(vma), new_pol))
771 				continue;
772 			/* vma_merge() joined vma && vma->next, case 8 */
773 			goto replace;
774 		}
775 		if (vma->vm_start != vmstart) {
776 			err = split_vma(vma->vm_mm, vma, vmstart, 1);
777 			if (err)
778 				goto out;
779 		}
780 		if (vma->vm_end != vmend) {
781 			err = split_vma(vma->vm_mm, vma, vmend, 0);
782 			if (err)
783 				goto out;
784 		}
785  replace:
786 		err = vma_replace_policy(vma, new_pol);
787 		if (err)
788 			goto out;
789 	}
790
791  out:
792 	return err;
793 }
794
795 /* Set the process memory policy */
796 static long do_set_mempolicy(unsigned short mode, unsigned short flags,
797 			     nodemask_t *nodes)
798 {
799 	struct mempolicy *new, *old;
800 	NODEMASK_SCRATCH(scratch);
801 	int ret;
802
803 	if (!scratch)
804 		return -ENOMEM;
805
806 	new = mpol_new(mode, flags, nodes);
807 	if (IS_ERR(new)) {
808 		ret = PTR_ERR(new);
809 		goto out;
810 	}
811
812 	task_lock(current);
813 	ret = mpol_set_nodemask(new, nodes, scratch);
814 	if (ret) {
815 		task_unlock(current);
816 		mpol_put(new);
817 		goto out;
818 	}
819 	old = current->mempolicy;
820 	current->mempolicy = new;
821 	if (new && new->mode == MPOL_INTERLEAVE &&
822 	    nodes_weight(new->v.nodes))
823 		current->il_next = first_node(new->v.nodes);
824 	task_unlock(current);
825 	mpol_put(old);
826 	ret = 0;
827 out:
828 	NODEMASK_SCRATCH_FREE(scratch);
829 	return ret;
830 }
831
832 /*
833  * Return nodemask for policy for get_mempolicy() query
834  *
835  * Called with task's alloc_lock held
836  */
837 static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
838 {
839 	nodes_clear(*nodes);
840 	if (p == &default_policy)
841 		return;
842
843 	switch (p->mode) {
844 	case MPOL_BIND:
845 		/* Fall through */
846 	case MPOL_INTERLEAVE:
847 		*nodes = p->v.nodes;
848 		break;
849 	case MPOL_PREFERRED:
850 		if (!(p->flags & MPOL_F_LOCAL))
851 			node_set(p->v.preferred_node, *nodes);
852 		/* else return empty node mask for local allocation */
853 		break;
854 	default:
855 		BUG();
856 	}
857 }
858
859 static int lookup_node(unsigned long addr)
860 {
861 	struct page *p;
862 	int err;
863
864 	err = get_user_pages(addr & PAGE_MASK, 1, 0, &p, NULL);
865 	if (err >= 0) {
866 		err = page_to_nid(p);
867 		put_page(p);
868 	}
869 	return err;
870 }
871
872 /* Retrieve NUMA policy */
873 static long do_get_mempolicy(int *policy, nodemask_t *nmask,
874 			     unsigned long addr, unsigned long flags)
875 {
876 	int err;
877 	struct mm_struct *mm = current->mm;
878 	struct vm_area_struct *vma = NULL;
879 	struct mempolicy *pol = current->mempolicy;
880
881 	if (flags &
882 		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
883 		return -EINVAL;
884
885 	if (flags & MPOL_F_MEMS_ALLOWED) {
886 		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
887 			return -EINVAL;
888 		*policy = 0;	/* just so it's initialized */
889 		task_lock(current);
890 		*nmask  = cpuset_current_mems_allowed;
891 		task_unlock(current);
892 		return 0;
893 	}
894
895 	if (flags & MPOL_F_ADDR) {
896 		/*
897 		 * Do NOT fall back to task policy if the
898 		 * vma/shared policy at addr is NULL.  We
899 		 * want to return MPOL_DEFAULT in this case.
900 		 */
901 		down_read(&mm->mmap_sem);
902 		vma = find_vma_intersection(mm, addr, addr+1);
903 		if (!vma) {
904 			up_read(&mm->mmap_sem);
905 			return -EFAULT;
906 		}
907 		if (vma->vm_ops && vma->vm_ops->get_policy)
908 			pol = vma->vm_ops->get_policy(vma, addr);
909 		else
910 			pol = vma->vm_policy;
911 	} else if (addr)
912 		return -EINVAL;
913
914 	if (!pol)
915 		pol = &default_policy;	/* indicates default behavior */
916
917 	if (flags & MPOL_F_NODE) {
918 		if (flags & MPOL_F_ADDR) {
919 			err = lookup_node(addr);
920 			if (err < 0)
921 				goto out;
922 			*policy = err;
923 		} else if (pol == current->mempolicy &&
924 				pol->mode == MPOL_INTERLEAVE) {
925 			*policy = current->il_next;
926 		} else {
927 			err = -EINVAL;
928 			goto out;
929 		}
930 	} else {
931 		*policy = pol == &default_policy ? MPOL_DEFAULT :
932 						pol->mode;
933 		/*
934 		 * Internal mempolicy flags must be masked off before exposing
935 		 * the policy to userspace.
936 		 */
937 		*policy |= (pol->flags & MPOL_MODE_FLAGS);
938 	}
939
940 	err = 0;
941 	if (nmask) {
942 		if (mpol_store_user_nodemask(pol)) {
943 			*nmask = pol->w.user_nodemask;
944 		} else {
945 			task_lock(current);
946 			get_policy_nodemask(pol, nmask);
947 			task_unlock(current);
948 		}
949 	}
950
951  out:
952 	mpol_cond_put(pol);
953 	if (vma)
954 		up_read(&current->mm->mmap_sem);
955 	return err;
956 }
957
958 #ifdef CONFIG_MIGRATION
959 /*
960  * page migration
961  */
962 static void migrate_page_add(struct page *page, struct list_head *pagelist,
963 				unsigned long flags)
964 {
965 	/*
966 	 * Avoid migrating a page that is shared with others.
967 	 */
968 	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
969 		if (!isolate_lru_page(page)) {
970 			list_add_tail(&page->lru, pagelist);
971 			inc_node_page_state(page, NR_ISOLATED_ANON +
972 					    page_is_file_cache(page));
973 		}
974 	}
975 }
976
977 static struct page *new_node_page(struct page *page, unsigned long node, int **x)
978 {
979 	if (PageHuge(page))
980 		return alloc_huge_page_node(page_hstate(compound_head(page)),
981 					node);
982 	else
983 		return __alloc_pages_node(node, GFP_HIGHUSER_MOVABLE |
984 						    __GFP_THISNODE, 0);
985 }
986
987 /*
988  * Migrate pages from one node to a target node.
989  * Returns error or the number of pages not migrated.
990  */
991 static int migrate_to_node(struct mm_struct *mm, int source, int dest,
992 			   int flags)
993 {
994 	nodemask_t nmask;
995 	LIST_HEAD(pagelist);
996 	int err = 0;
997
998 	nodes_clear(nmask);
999 	node_set(source, nmask);
1000
1001 	/*
1002 	 * This does not "check" the range but isolates all pages that
1003 	 * need migration.  Between passing in the full user address
1004 	 * space range and MPOL_MF_DISCONTIG_OK, this call can not fail.
1005 	 */
1006 	VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
1007 	queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
1008 			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
1009
1010 	if (!list_empty(&pagelist)) {
1011 		err = migrate_pages(&pagelist, new_node_page, NULL, dest,
1012 					MIGRATE_SYNC, MR_SYSCALL);
1013 		if (err)
1014 			putback_movable_pages(&pagelist);
1015 	}
1016
1017 	return err;
1018 }
1019
1020 /*
1021  * Move pages between the two nodesets so as to preserve the physical
1022  * layout as much as possible.
1023  *
1024  * Returns the number of page that could not be moved.
1025  */
1026 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1027 		     const nodemask_t *to, int flags)
1028 {
1029 	int busy = 0;
1030 	int err;
1031 	nodemask_t tmp;
1032
1033 	err = migrate_prep();
1034 	if (err)
1035 		return err;
1036
1037 	down_read(&mm->mmap_sem);
1038
1039 	/*
1040 	 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
1041 	 * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
1042 	 * bit in 'tmp', and return that <source, dest> pair for migration.
1043 	 * The pair of nodemasks 'to' and 'from' define the map.
1044 	 *
1045 	 * If no pair of bits is found that way, fallback to picking some
1046 	 * pair of 'source' and 'dest' bits that are not the same.  If the
1047 	 * 'source' and 'dest' bits are the same, this represents a node
1048 	 * that will be migrating to itself, so no pages need move.
1049 	 *
1050 	 * If no bits are left in 'tmp', or if all remaining bits left
1051 	 * in 'tmp' correspond to the same bit in 'to', return false
1052 	 * (nothing left to migrate).
1053 	 *
1054 	 * This lets us pick a pair of nodes to migrate between, such that
1055 	 * if possible the dest node is not already occupied by some other
1056 	 * source node, minimizing the risk of overloading the memory on a
1057 	 * node that would happen if we migrated incoming memory to a node
1058 	 * before migrating outgoing memory source that same node.
1059 	 *
1060 	 * A single scan of tmp is sufficient.  As we go, we remember the
1061 	 * most recent <s, d> pair that moved (s != d).  If we find a pair
1062 	 * that not only moved, but what's better, moved to an empty slot
1063 	 * (d is not set in tmp), then we break out then, with that pair.
1064 	 * Otherwise when we finish scanning from_tmp, we at least have the
1065 	 * most recent <s, d> pair that moved.  If we get all the way through
1066 	 * the scan of tmp without finding any node that moved, much less
1067 	 * moved to an empty node, then there is nothing left worth migrating.
1068 	 */
1069
1070 	tmp = *from;
1071 	while (!nodes_empty(tmp)) {
1072 		int s,d;
1073 		int source = NUMA_NO_NODE;
1074 		int dest = 0;
1075
1076 		for_each_node_mask(s, tmp) {
1077
1078 			/*
1079 			 * do_migrate_pages() tries to maintain the relative
1080 			 * node relationship of the pages established between
1081 			 * threads and memory areas.
1082                          *
1083 			 * However if the number of source nodes is not equal to
1084 			 * the number of destination nodes we can not preserve
1085 			 * this node relative relationship.  In that case, skip
1086 			 * copying memory from a node that is in the destination
1087 			 * mask.
1088 			 *
1089 			 * Example: [2,3,4] -> [3,4,5] moves everything.
1090 			 *          [0-7] - > [3,4,5] moves only 0,1,2,6,7.
1091 			 */
1092
1093 			if ((nodes_weight(*from) != nodes_weight(*to)) &&
1094 						(node_isset(s, *to)))
1095 				continue;
1096
1097 			d = node_remap(s, *from, *to);
1098 			if (s == d)
1099 				continue;
1100
1101 			source = s;	/* Node moved. Memorize */
1102 			dest = d;
1103
1104 			/* dest not in remaining from nodes? */
1105 			if (!node_isset(dest, tmp))
1106 				break;
1107 		}
1108 		if (source == NUMA_NO_NODE)
1109 			break;
1110
1111 		node_clear(source, tmp);
1112 		err = migrate_to_node(mm, source, dest, flags);
1113 		if (err > 0)
1114 			busy += err;
1115 		if (err < 0)
1116 			break;
1117 	}
1118 	up_read(&mm->mmap_sem);
1119 	if (err < 0)
1120 		return err;
1121 	return busy;
1122
1123 }
1124
1125 /*
1126  * Allocate a new page for page migration based on vma policy.
1127  * Start by assuming the page is mapped by the same vma as contains @start.
1128  * Search forward from there, if not.  N.B., this assumes that the
1129  * list of pages handed to migrate_pages()--which is how we get here--
1130  * is in virtual address order.
1131  */
1132 static struct page *new_page(struct page *page, unsigned long start, int **x)
1133 {
1134 	struct vm_area_struct *vma;
1135 	unsigned long uninitialized_var(address);
1136
1137 	vma = find_vma(current->mm, start);
1138 	while (vma) {
1139 		address = page_address_in_vma(page, vma);
1140 		if (address != -EFAULT)
1141 			break;
1142 		vma = vma->vm_next;
1143 	}
1144
1145 	if (PageHuge(page)) {
1146 		BUG_ON(!vma);
1147 		return alloc_huge_page_noerr(vma, address, 1);
1148 	}
1149 	/*
1150 	 * if !vma, alloc_page_vma() will use task or system default policy
1151 	 */
1152 	return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
1153 }
1154 #else
1155
/*
 * Fallback used in the #else branch, where the real migration code above
 * is compiled out: accepts the same arguments and does nothing.
 */
static void migrate_page_add(struct page *page, struct list_head *pagelist,
				unsigned long flags)
{
}
1160
/*
 * Fallback used in the #else branch, where the real migration code above
 * is compiled out: always fails with -ENOSYS.
 */
int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
		     const nodemask_t *to, int flags)
{
	return -ENOSYS;
}
1166
/*
 * Fallback used in the #else branch, where the real migration code above
 * is compiled out: never allocates, always returns NULL.
 */
static struct page *new_page(struct page *page, unsigned long start, int **x)
{
	return NULL;
}
1171 #endif
1172
1173 static long do_mbind(unsigned long start, unsigned long len,
1174 		     unsigned short mode, unsigned short mode_flags,
1175 		     nodemask_t *nmask, unsigned long flags)
1176 {
1177 	struct mm_struct *mm = current->mm;
1178 	struct mempolicy *new;
1179 	unsigned long end;
1180 	int err;
1181 	LIST_HEAD(pagelist);
1182
1183 	if (flags & ~(unsigned long)MPOL_MF_VALID)
1184 		return -EINVAL;
1185 	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1186 		return -EPERM;
1187
1188 	if (start & ~PAGE_MASK)
1189 		return -EINVAL;
1190
1191 	if (mode == MPOL_DEFAULT)
1192 		flags &= ~MPOL_MF_STRICT;
1193
1194 	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
1195 	end = start + len;
1196
1197 	if (end < start)
1198 		return -EINVAL;
1199 	if (end == start)
1200 		return 0;
1201
1202 	new = mpol_new(mode, mode_flags, nmask);
1203 	if (IS_ERR(new))
1204 		return PTR_ERR(new);
1205
1206 	if (flags & MPOL_MF_LAZY)
1207 		new->flags |= MPOL_F_MOF;
1208
1209 	/*
1210 	 * If we are using the default policy then operation
1211 	 * on discontinuous address spaces is okay after all
1212 	 */
1213 	if (!new)
1214 		flags |= MPOL_MF_DISCONTIG_OK;
1215
1216 	pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1217 		 start, start + len, mode, mode_flags,
1218 		 nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
1219
1220 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1221
1222 		err = migrate_prep();
1223 		if (err)
1224 			goto mpol_out;
1225 	}
1226 	{
1227 		NODEMASK_SCRATCH(scratch);
1228 		if (scratch) {
1229 			down_write(&mm->mmap_sem);
1230 			task_lock(current);
1231 			err = mpol_set_nodemask(new, nmask, scratch);
1232 			task_unlock(current);
1233 			if (err)
1234 				up_write(&mm->mmap_sem);
1235 		} else
1236 			err = -ENOMEM;
1237 		NODEMASK_SCRATCH_FREE(scratch);
1238 	}
1239 	if (err)
1240 		goto mpol_out;
1241
1242 	err = queue_pages_range(mm, start, end, nmask,
1243 			  flags | MPOL_MF_INVERT, &pagelist);
1244 	if (!err)
1245 		err = mbind_range(mm, start, end, new);
1246
1247 	if (!err) {
1248 		int nr_failed = 0;
1249
1250 		if (!list_empty(&pagelist)) {
1251 			WARN_ON_ONCE(flags & MPOL_MF_LAZY);
1252 			nr_failed = migrate_pages(&pagelist, new_page, NULL,
1253 				start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
1254 			if (nr_failed)
1255 				putback_movable_pages(&pagelist);
1256 		}
1257
1258 		if (nr_failed && (flags & MPOL_MF_STRICT))
1259 			err = -EIO;
1260 	} else
1261 		putback_movable_pages(&pagelist);
1262
1263 	up_write(&mm->mmap_sem);
1264  mpol_out:
1265 	mpol_put(new);
1266 	return err;
1267 }
1268
1269 /*
1270  * User space interface with variable sized bitmaps for nodelists.
1271  */
1272
1273 /* Copy a node mask from user space. */
1274 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1275 		     unsigned long maxnode)
1276 {
1277 	unsigned long k;
1278 	unsigned long t;
1279 	unsigned long nlongs;
1280 	unsigned long endmask;
1281
1282 	--maxnode;
1283 	nodes_clear(*nodes);
1284 	if (maxnode == 0 || !nmask)
1285 		return 0;
1286 	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1287 		return -EINVAL;
1288
1289 	nlongs = BITS_TO_LONGS(maxnode);
1290 	if ((maxnode % BITS_PER_LONG) == 0)
1291 		endmask = ~0UL;
1292 	else
1293 		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1294
1295 	/*
1296 	 * When the user specified more nodes than supported just check
1297 	 * if the non supported part is all zero.
1298 	 *
1299 	 * If maxnode have more longs than MAX_NUMNODES, check
1300 	 * the bits in that area first. And then go through to
1301 	 * check the rest bits which equal or bigger than MAX_NUMNODES.
1302 	 * Otherwise, just check bits [MAX_NUMNODES, maxnode).
1303 	 */
1304 	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1305 		if (nlongs > PAGE_SIZE/sizeof(long))
1306 			return -EINVAL;
1307 		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1308 			if (get_user(t, nmask + k))
1309 				return -EFAULT;
1310 			if (k == nlongs - 1) {
1311 				if (t & endmask)
1312 					return -EINVAL;
1313 			} else if (t)
1314 				return -EINVAL;
1315 		}
1316 		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1317 		endmask = ~0UL;
1318 	}
1319
1320 	if (maxnode > MAX_NUMNODES && MAX_NUMNODES % BITS_PER_LONG != 0) {
1321 		unsigned long valid_mask = endmask;
1322
1323 		valid_mask &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
1324 		if (get_user(t, nmask + nlongs - 1))
1325 			return -EFAULT;
1326 		if (t & valid_mask)
1327 			return -EINVAL;
1328 	}
1329
1330 	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1331 		return -EFAULT;
1332 	nodes_addr(*nodes)[nlongs-1] &= endmask;
1333 	return 0;
1334 }
1335
1336 /* Copy a kernel node mask to user space */
1337 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1338 			      nodemask_t *nodes)
1339 {
1340 	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1341 	unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);
1342
1343 	if (copy > nbytes) {
1344 		if (copy > PAGE_SIZE)
1345 			return -EINVAL;
1346 		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1347 			return -EFAULT;
1348 		copy = nbytes;
1349 	}
1350 	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1351 }
1352
1353 SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1354 		unsigned long, mode, const unsigned long __user *, nmask,
1355 		unsigned long, maxnode, unsigned, flags)
1356 {
1357 	nodemask_t nodes;
1358 	int err;
1359 	unsigned short mode_flags;
1360
1361 	mode_flags = mode & MPOL_MODE_FLAGS;
1362 	mode &= ~MPOL_MODE_FLAGS;
1363 	if (mode >= MPOL_MAX)
1364 		return -EINVAL;
1365 	if ((mode_flags & MPOL_F_STATIC_NODES) &&
1366 	    (mode_flags & MPOL_F_RELATIVE_NODES))
1367 		return -EINVAL;
1368 	err = get_nodes(&nodes, nmask, maxnode);
1369 	if (err)
1370 		return err;
1371 	return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1372 }
1373
1374 /* Set the process memory policy */
1375 SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
1376 		unsigned long, maxnode)
1377 {
1378 	int err;
1379 	nodemask_t nodes;
1380 	unsigned short flags;
1381
1382 	flags = mode & MPOL_MODE_FLAGS;
1383 	mode &= ~MPOL_MODE_FLAGS;
1384 	if ((unsigned int)mode >= MPOL_MAX)
1385 		return -EINVAL;
1386 	if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1387 		return -EINVAL;
1388 	err = get_nodes(&nodes, nmask, maxnode);
1389 	if (err)
1390 		return err;
1391 	return do_set_mempolicy(mode, flags, &nodes);
1392 }
1393
1394 SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1395 		const unsigned long __user *, old_nodes,
1396 		const unsigned long __user *, new_nodes)
1397 {
1398 	const struct cred *cred = current_cred(), *tcred;
1399 	struct mm_struct *mm = NULL;
1400 	struct task_struct *task;
1401 	nodemask_t task_nodes;
1402 	int err;
1403 	nodemask_t *old;
1404 	nodemask_t *new;
1405 	NODEMASK_SCRATCH(scratch);
1406
1407 	if (!scratch)
1408 		return -ENOMEM;
1409
1410 	old = &scratch->mask1;
1411 	new = &scratch->mask2;
1412
1413 	err = get_nodes(old, old_nodes, maxnode);
1414 	if (err)
1415 		goto out;
1416
1417 	err = get_nodes(new, new_nodes, maxnode);
1418 	if (err)
1419 		goto out;
1420
1421 	/* Find the mm_struct */
1422 	rcu_read_lock();
1423 	task = pid ? find_task_by_vpid(pid) : current;
1424 	if (!task) {
1425 		rcu_read_unlock();
1426 		err = -ESRCH;
1427 		goto out;
1428 	}
1429 	get_task_struct(task);
1430
1431 	err = -EINVAL;
1432
1433 	/*
1434 	 * Check if this process has the right to modify the specified
1435 	 * process. The right exists if the process has administrative
1436 	 * capabilities, superuser privileges or the same
1437 	 * userid as the target process.
1438 	 */
1439 	tcred = __task_cred(task);
1440 	if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) &&
1441 	    !uid_eq(cred->uid,  tcred->suid) && !uid_eq(cred->uid,  tcred->uid) &&
1442 	    !capable(CAP_SYS_NICE)) {
1443 		rcu_read_unlock();
1444 		err = -EPERM;
1445 		goto out_put;
1446 	}
1447 	rcu_read_unlock();
1448
1449 	task_nodes = cpuset_mems_allowed(task);
1450 	/* Is the user allowed to access the target nodes? */
1451 	if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1452 		err = -EPERM;
1453 		goto out_put;
1454 	}
1455
1456 	task_nodes = cpuset_mems_allowed(current);
1457 	nodes_and(*new, *new, task_nodes);
1458 	if (nodes_empty(*new))
1459 		goto out_put;
1460
1461 	nodes_and(*new, *new, node_states[N_MEMORY]);
1462 	if (nodes_empty(*new))
1463 		goto out_put;
1464
1465 	err = security_task_movememory(task);
1466 	if (err)
1467 		goto out_put;
1468
1469 	mm = get_task_mm(task);
1470 	put_task_struct(task);
1471
1472 	if (!mm) {
1473 		err = -EINVAL;
1474 		goto out;
1475 	}
1476
1477 	err = do_migrate_pages(mm, old, new,
1478 		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1479
1480 	mmput(mm);
1481 out:
1482 	NODEMASK_SCRATCH_FREE(scratch);
1483
1484 	return err;
1485
1486 out_put:
1487 	put_task_struct(task);
1488 	goto out;
1489
1490 }
1491
1492
1493 /* Retrieve NUMA policy */
1494 SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1495 		unsigned long __user *, nmask, unsigned long, maxnode,
1496 		unsigned long, addr, unsigned long, flags)
1497 {
1498 	int err;
1499 	int uninitialized_var(pval);
1500 	nodemask_t nodes;
1501
1502 	if (nmask != NULL && maxnode < nr_node_ids)
1503 		return -EINVAL;
1504
1505 	err = do_get_mempolicy(&pval, &nodes, addr, flags);
1506
1507 	if (err)
1508 		return err;
1509
1510 	if (policy && put_user(pval, policy))
1511 		return -EFAULT;
1512
1513 	if (nmask)
1514 		err = copy_nodes_to_user(nmask, maxnode, &nodes);
1515
1516 	return err;
1517 }
1518
1519 #ifdef CONFIG_COMPAT
1520
1521 COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1522 		       compat_ulong_t __user *, nmask,
1523 		       compat_ulong_t, maxnode,
1524 		       compat_ulong_t, addr, compat_ulong_t, flags)
1525 {
1526 	long err;
1527 	unsigned long __user *nm = NULL;
1528 	unsigned long nr_bits, alloc_size;
1529 	DECLARE_BITMAP(bm, MAX_NUMNODES);
1530
1531 	nr_bits = min_t(unsigned long, maxnode-1, nr_node_ids);
1532 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1533
1534 	if (nmask)
1535 		nm = compat_alloc_user_space(alloc_size);
1536
1537 	err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1538
1539 	if (!err && nmask) {
1540 		unsigned long copy_size;
1541 		copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
1542 		err = copy_from_user(bm, nm, copy_size);
1543 		/* ensure entire bitmap is zeroed */
1544 		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1545 		err |= compat_put_bitmap(nmask, bm, nr_bits);
1546 	}
1547
1548 	return err;
1549 }
1550
1551 COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask,
1552 		       compat_ulong_t, maxnode)
1553 {
1554 	unsigned long __user *nm = NULL;
1555 	unsigned long nr_bits, alloc_size;
1556 	DECLARE_BITMAP(bm, MAX_NUMNODES);
1557
1558 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1559 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1560
1561 	if (nmask) {
1562 		if (compat_get_bitmap(bm, nmask, nr_bits))
1563 			return -EFAULT;
1564 		nm = compat_alloc_user_space(alloc_size);
1565 		if (copy_to_user(nm, bm, alloc_size))
1566 			return -EFAULT;
1567 	}
1568
1569 	return sys_set_mempolicy(mode, nm, nr_bits+1);
1570 }
1571
1572 COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
1573 		       compat_ulong_t, mode, compat_ulong_t __user *, nmask,
1574 		       compat_ulong_t, maxnode, compat_ulong_t, flags)
1575 {
1576 	unsigned long __user *nm = NULL;
1577 	unsigned long nr_bits, alloc_size;
1578 	nodemask_t bm;
1579
1580 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1581 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1582
1583 	if (nmask) {
1584 		if (compat_get_bitmap(nodes_addr(bm), nmask, nr_bits))
1585 			return -EFAULT;
1586 		nm = compat_alloc_user_space(alloc_size);
1587 		if (copy_to_user(nm, nodes_addr(bm), alloc_size))
1588 			return -EFAULT;
1589 	}
1590
1591 	return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1592 }
1593
1594 #endif
1595
1596 struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
1597 						unsigned long addr)
1598 {
1599 	struct mempolicy *pol = NULL;
1600
1601 	if (vma) {
1602 		if (vma->vm_ops && vma->vm_ops->get_policy) {
1603 			pol = vma->vm_ops->get_policy(vma, addr);
1604 		} else if (vma->vm_policy) {
1605 			pol = vma->vm_policy;
1606
1607 			/*
1608 			 * shmem_alloc_page() passes MPOL_F_SHARED policy with
1609 			 * a pseudo vma whose vma->vm_ops=NULL. Take a reference
1610 			 * count on these policies which will be dropped by
1611 			 * mpol_cond_put() later
1612 			 */
1613 			if (mpol_needs_cond_ref(pol))
1614 				mpol_get(pol);
1615 		}
1616 	}
1617
1618 	return pol;
1619 }
1620
1621 /*
1622  * get_vma_policy(@vma, @addr)
1623  * @vma: virtual memory area whose policy is sought
1624  * @addr: address in @vma for shared policy lookup
1625  *
1626  * Returns effective policy for a VMA at specified address.
1627  * Falls back to current->mempolicy or system default policy, as necessary.
1628  * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1629  * count--added by the get_policy() vm_op, as appropriate--to protect against
1630  * freeing by another task.  It is the caller's responsibility to free the
1631  * extra reference for shared policies.
1632  */
1633 static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
1634 						unsigned long addr)
1635 {
1636 	struct mempolicy *pol = __get_vma_policy(vma, addr);
1637
1638 	if (!pol)
1639 		pol = get_task_policy(current);
1640
1641 	return pol;
1642 }
1643
1644 bool vma_policy_mof(struct vm_area_struct *vma)
1645 {
1646 	struct mempolicy *pol;
1647
1648 	if (vma->vm_ops && vma->vm_ops->get_policy) {
1649 		bool ret = false;
1650
1651 		pol = vma->vm_ops->get_policy(vma, vma->vm_start);
1652 		if (pol && (pol->flags & MPOL_F_MOF))
1653 			ret = true;
1654 		mpol_cond_put(pol);
1655
1656 		return ret;
1657 	}
1658
1659 	pol = vma->vm_policy;
1660 	if (!pol)
1661 		pol = get_task_policy(current);
1662
1663 	return pol->flags & MPOL_F_MOF;
1664 }
1665
1666 static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
1667 {
1668 	enum zone_type dynamic_policy_zone = policy_zone;
1669
1670 	BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
1671
1672 	/*
1673 	 * if policy->v.nodes has movable memory only,
1674 	 * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
1675 	 *
1676 	 * policy->v.nodes is intersect with node_states[N_MEMORY].
1677 	 * so if the following test faile, it implies
1678 	 * policy->v.nodes has movable memory only.
1679 	 */
1680 	if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
1681 		dynamic_policy_zone = ZONE_MOVABLE;
1682
1683 	return zone >= dynamic_policy_zone;
1684 }
1685
1686 /*
1687  * Return a nodemask representing a mempolicy for filtering nodes for
1688  * page allocation
1689  */
1690 static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1691 {
1692 	/* Lower zones don't get a nodemask applied for MPOL_BIND */
1693 	if (unlikely(policy->mode == MPOL_BIND) &&
1694 			apply_policy_zone(policy, gfp_zone(gfp)) &&
1695 			cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1696 		return &policy->v.nodes;
1697
1698 	return NULL;
1699 }
1700
1701 /* Return a zonelist indicated by gfp for node representing a mempolicy */
1702 static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy,
1703 	int nd)
1704 {
1705 	switch (policy->mode) {
1706 	case MPOL_PREFERRED:
1707 		if (!(policy->flags & MPOL_F_LOCAL))
1708 			nd = policy->v.preferred_node;
1709 		break;
1710 	case MPOL_BIND:
1711 		/*
1712 		 * Normally, MPOL_BIND allocations are node-local within the
1713 		 * allowed nodemask.  However, if __GFP_THISNODE is set and the
1714 		 * current node isn't part of the mask, we use the zonelist for
1715 		 * the first node in the mask instead.
1716 		 */
1717 		if (unlikely(gfp & __GFP_THISNODE) &&
1718 				unlikely(!node_isset(nd, policy->v.nodes)))
1719 			nd = first_node(policy->v.nodes);
1720 		break;
1721 	default:
1722 		BUG();
1723 	}
1724 	return node_zonelist(nd, gfp);
1725 }
1726
1727 /* Do dynamic interleaving for a process */
1728 static unsigned interleave_nodes(struct mempolicy *policy)
1729 {
1730 	unsigned nid, next;
1731 	struct task_struct *me = current;
1732
1733 	nid = me->il_next;
1734 	next = next_node_in(nid, policy->v.nodes);
1735 	if (next < MAX_NUMNODES)
1736 		me->il_next = next;
1737 	return nid;
1738 }
1739
1740 /*
1741  * Depending on the memory policy provide a node from which to allocate the
1742  * next slab entry.
1743  */
1744 unsigned int mempolicy_slab_node(void)
1745 {
1746 	struct mempolicy *policy;
1747 	int node = numa_mem_id();
1748
1749 	if (in_interrupt())
1750 		return node;
1751
1752 	policy = current->mempolicy;
1753 	if (!policy || policy->flags & MPOL_F_LOCAL)
1754 		return node;
1755
1756 	switch (policy->mode) {
1757 	case MPOL_PREFERRED:
1758 		/*
1759 		 * handled MPOL_F_LOCAL above
1760 		 */
1761 		return policy->v.preferred_node;
1762
1763 	case MPOL_INTERLEAVE:
1764 		return interleave_nodes(policy);
1765
1766 	case MPOL_BIND: {
1767 		struct zoneref *z;
1768
1769 		/*
1770 		 * Follow bind policy behavior and start allocation at the
1771 		 * first node.
1772 		 */
1773 		struct zonelist *zonelist;
1774 		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1775 		zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
1776 		z = first_zones_zonelist(zonelist, highest_zoneidx,
1777 							&policy->v.nodes);
1778 		return z->zone ? z->zone->node : node;
1779 	}
1780
1781 	default:
1782 		BUG();
1783 	}
1784 }
1785
1786 /*
1787  * Do static interleaving for a VMA with known offset @n.  Returns the n'th
1788  * node in pol->v.nodes (starting from n=0), wrapping around if n exceeds the
1789  * number of present nodes.
1790  */
1791 static unsigned offset_il_node(struct mempolicy *pol,
1792 			       struct vm_area_struct *vma, unsigned long n)
1793 {
1794 	unsigned nnodes = nodes_weight(pol->v.nodes);
1795 	unsigned target;
1796 	int i;
1797 	int nid;
1798
1799 	if (!nnodes)
1800 		return numa_node_id();
1801 	target = (unsigned int)n % nnodes;
1802 	nid = first_node(pol->v.nodes);
1803 	for (i = 0; i < target; i++)
1804 		nid = next_node(nid, pol->v.nodes);
1805 	return nid;
1806 }
1807
1808 /* Determine a node number for interleave */
1809 static inline unsigned interleave_nid(struct mempolicy *pol,
1810 		 struct vm_area_struct *vma, unsigned long addr, int shift)
1811 {
1812 	if (vma) {
1813 		unsigned long off;
1814
1815 		/*
1816 		 * for small pages, there is no difference between
1817 		 * shift and PAGE_SHIFT, so the bit-shift is safe.
1818 		 * for huge pages, since vm_pgoff is in units of small
1819 		 * pages, we need to shift off the always 0 bits to get
1820 		 * a useful offset.
1821 		 */
1822 		BUG_ON(shift < PAGE_SHIFT);
1823 		off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1824 		off += (addr - vma->vm_start) >> shift;
1825 		return offset_il_node(pol, vma, off);
1826 	} else
1827 		return interleave_nodes(pol);
1828 }
1829
1830 #ifdef CONFIG_HUGETLBFS
1831 /*
1832  * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
1833  * @vma: virtual memory area whose policy is sought
1834  * @addr: address in @vma for shared policy lookup and interleave policy
1835  * @gfp_flags: for requested zone
1836  * @mpol: pointer to mempolicy pointer for reference counted mempolicy
1837  * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask
1838  *
1839  * Returns a zonelist suitable for a huge page allocation and a pointer
1840  * to the struct mempolicy for conditional unref after allocation.
1841  * If the effective policy is 'BIND, returns a pointer to the mempolicy's
1842  * @nodemask for filtering the zonelist.
1843  *
1844  * Must be protected by read_mems_allowed_begin()
1845  */
1846 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1847 				gfp_t gfp_flags, struct mempolicy **mpol,
1848 				nodemask_t **nodemask)
1849 {
1850 	struct zonelist *zl;
1851
1852 	*mpol = get_vma_policy(vma, addr);
1853 	*nodemask = NULL;	/* assume !MPOL_BIND */
1854
1855 	if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1856 		zl = node_zonelist(interleave_nid(*mpol, vma, addr,
1857 				huge_page_shift(hstate_vma(vma))), gfp_flags);
1858 	} else {
1859 		zl = policy_zonelist(gfp_flags, *mpol, numa_node_id());
1860 		if ((*mpol)->mode == MPOL_BIND)
1861 			*nodemask = &(*mpol)->v.nodes;
1862 	}
1863 	return zl;
1864 }
1865
1866 /*
1867  * init_nodemask_of_mempolicy
1868  *
1869  * If the current task's mempolicy is "default" [NULL], return 'false'
1870  * to indicate default policy.  Otherwise, extract the policy nodemask
1871  * for 'bind' or 'interleave' policy into the argument nodemask, or
1872  * initialize the argument nodemask to contain the single node for
1873  * 'preferred' or 'local' policy and return 'true' to indicate presence
1874  * of non-default mempolicy.
1875  *
1876  * We don't bother with reference counting the mempolicy [mpol_get/put]
1877  * because the current task is examining it's own mempolicy and a task's
1878  * mempolicy is only ever changed by the task itself.
1879  *
1880  * N.B., it is the caller's responsibility to free a returned nodemask.
1881  */
1882 bool init_nodemask_of_mempolicy(nodemask_t *mask)
1883 {
1884 	struct mempolicy *mempolicy;
1885 	int nid;
1886
1887 	if (!(mask && current->mempolicy))
1888 		return false;
1889
1890 	task_lock(current);
1891 	mempolicy = current->mempolicy;
1892 	switch (mempolicy->mode) {
1893 	case MPOL_PREFERRED:
1894 		if (mempolicy->flags & MPOL_F_LOCAL)
1895 			nid = numa_node_id();
1896 		else
1897 			nid = mempolicy->v.preferred_node;
1898 		init_nodemask_of_node(mask, nid);
1899 		break;
1900
1901 	case MPOL_BIND:
1902 		/* Fall through */
1903 	case MPOL_INTERLEAVE:
1904 		*mask =  mempolicy->v.nodes;
1905 		break;
1906
1907 	default:
1908 		BUG();
1909 	}
1910 	task_unlock(current);
1911
1912 	return true;
1913 }
1914 #endif
1915
1916 /*
1917  * mempolicy_nodemask_intersects
1918  *
1919  * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
1920  * policy.  Otherwise, check for intersection between mask and the policy
1921  * nodemask for 'bind' or 'interleave' policy.  For 'perferred' or 'local'
1922  * policy, always return true since it may allocate elsewhere on fallback.
1923  *
1924  * Takes task_lock(tsk) to prevent freeing of its mempolicy.
1925  */
1926 bool mempolicy_nodemask_intersects(struct task_struct *tsk,
1927 					const nodemask_t *mask)
1928 {
1929 	struct mempolicy *mempolicy;
1930 	bool ret = true;
1931
1932 	if (!mask)
1933 		return ret;
1934 	task_lock(tsk);
1935 	mempolicy = tsk->mempolicy;
1936 	if (!mempolicy)
1937 		goto out;
1938
1939 	switch (mempolicy->mode) {
1940 	case MPOL_PREFERRED:
1941 		/*
1942 		 * MPOL_PREFERRED and MPOL_F_LOCAL are only preferred nodes to
1943 		 * allocate from, they may fallback to other nodes when oom.
1944 		 * Thus, it's possible for tsk to have allocated memory from
1945 		 * nodes in mask.
1946 		 */
1947 		break;
1948 	case MPOL_BIND:
1949 	case MPOL_INTERLEAVE:
1950 		ret = nodes_intersects(mempolicy->v.nodes, *mask);
1951 		break;
1952 	default:
1953 		BUG();
1954 	}
1955 out:
1956 	task_unlock(tsk);
1957 	return ret;
1958 }
1959
1960 /* Allocate a page in interleaved policy.
1961    Own path because it needs to do special accounting. */
1962 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1963 					unsigned nid)
1964 {
1965 	struct zonelist *zl;
1966 	struct page *page;
1967
1968 	zl = node_zonelist(nid, gfp);
1969 	page = __alloc_pages(gfp, order, zl);
1970 	if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
1971 		inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1972 	return page;
1973 }
1974
1975 /**
1976  * 	alloc_pages_vma	- Allocate a page for a VMA.
1977  *
1978  * 	@gfp:
1979  *      %GFP_USER    user allocation.
1980  *      %GFP_KERNEL  kernel allocations,
1981  *      %GFP_HIGHMEM highmem/user allocations,
1982  *      %GFP_FS      allocation should not call back into a file system.
1983  *      %GFP_ATOMIC  don't sleep.
1984  *
1985  *	@order:Order of the GFP allocation.
1986  * 	@vma:  Pointer to VMA or NULL if not available.
1987  *	@addr: Virtual Address of the allocation. Must be inside the VMA.
1988  *	@node: Which node to prefer for allocation (modulo policy).
1989  *	@hugepage: for hugepages try only the preferred node if possible
1990  *
1991  * 	This function allocates a page from the kernel page pool and applies
1992  *	a NUMA policy associated with the VMA or the current process.
1993  *	When VMA is not NULL caller must hold down_read on the mmap_sem of the
1994  *	mm_struct of the VMA to prevent it from going away. Should be used for
1995  *	all allocations for pages that will be mapped into user space. Returns
1996  *	NULL when no page can be allocated.
1997  */
1998 struct page *
1999 alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
2000 		unsigned long addr, int node, bool hugepage)
2001 {
2002 	struct mempolicy *pol;
2003 	struct page *page;
2004 	unsigned int cpuset_mems_cookie;
2005 	struct zonelist *zl;
2006 	nodemask_t *nmask;
2007
2008 retry_cpuset:
2009 	pol = get_vma_policy(vma, addr);
2010 	cpuset_mems_cookie = read_mems_allowed_begin();
2011
2012 	if (pol->mode == MPOL_INTERLEAVE) {
2013 		unsigned nid;
2014
2015 		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
2016 		mpol_cond_put(pol);
2017 		page = alloc_page_interleave(gfp, order, nid);
2018 		goto out;
2019 	}
2020
2021 	if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
2022 		int hpage_node = node;
2023
2024 		/*
2025 		 * For hugepage allocation and non-interleave policy which
2026 		 * allows the current node (or other explicitly preferred
2027 		 * node) we only try to allocate from the current/preferred
2028 		 * node and don't fall back to other nodes, as the cost of
2029 		 * remote accesses would likely offset THP benefits.
2030 		 *
2031 		 * If the policy is interleave, or does not allow the current
2032 		 * node in its nodemask, we allocate the standard way.
2033 		 */
2034 		if (pol->mode == MPOL_PREFERRED &&
2035 						!(pol->flags & MPOL_F_LOCAL))
2036 			hpage_node = pol->v.preferred_node;
2037
2038 		nmask = policy_nodemask(gfp, pol);
2039 		if (!nmask || node_isset(hpage_node, *nmask)) {
2040 			mpol_cond_put(pol);
2041 			/*
2042 			 * We cannot invoke reclaim if __GFP_THISNODE
2043 			 * is set. Invoking reclaim with
2044 			 * __GFP_THISNODE set, would cause THP
2045 			 * allocations to trigger heavy swapping
2046 			 * despite there may be tons of free memory
2047 			 * (including potentially plenty of THP
2048 			 * already available in the buddy) on all the
2049 			 * other NUMA nodes.
2050 			 *
2051 			 * At most we could invoke compaction when
2052 			 * __GFP_THISNODE is set (but we would need to
2053 			 * refrain from invoking reclaim even if
2054 			 * compaction returned COMPACT_SKIPPED because
2055 			 * there wasn't not enough memory to succeed
2056 			 * compaction). For now just avoid
2057 			 * __GFP_THISNODE instead of limiting the
2058 			 * allocation path to a strict and single
2059 			 * compaction invocation.
2060 			 *
2061 			 * Supposedly if direct reclaim was enabled by
2062 			 * the caller, the app prefers THP regardless
2063 			 * of the node it comes from so this would be
2064 			 * more desiderable behavior than only
2065 			 * providing THP originated from the local
2066 			 * node in such case.
2067 			 */
2068 			if (!(gfp & __GFP_DIRECT_RECLAIM))
2069 				gfp |= __GFP_THISNODE;
2070 			page = __alloc_pages_node(hpage_node, gfp, order);
2071 			goto out;
2072 		}
2073 	}
2074
2075 	nmask = policy_nodemask(gfp, pol);
2076 	zl = policy_zonelist(gfp, pol, node);
2077 	page = __alloc_pages_nodemask(gfp, order, zl, nmask);
2078 	mpol_cond_put(pol);
2079 out:
2080 	if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
2081 		goto retry_cpuset;
2082 	return page;
2083 }
2084
2085 /**
2086  * 	alloc_pages_current - Allocate pages.
2087  *
2088  *	@gfp:
2089  *		%GFP_USER   user allocation,
2090  *      	%GFP_KERNEL kernel allocation,
2091  *      	%GFP_HIGHMEM highmem allocation,
2092  *      	%GFP_FS     don't call back into a file system.
2093  *      	%GFP_ATOMIC don't sleep.
2094  *	@order: Power of two of allocation size in pages. 0 is a single page.
2095  *
2096  *	Allocate a page from the kernel page pool.  When not in
2097  *	interrupt context and apply the current process NUMA policy.
2098  *	Returns NULL when no page can be allocated.
2099  *
2100  *	Don't call cpuset_update_task_memory_state() unless
2101  *	1) it's ok to take cpuset_sem (can WAIT), and
2102  *	2) allocating for current task (not interrupt).
2103  */
2104 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
2105 {
2106 	struct mempolicy *pol = &default_policy;
2107 	struct page *page;
2108 	unsigned int cpuset_mems_cookie;
2109
2110 	if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2111 		pol = get_task_policy(current);
2112
2113 retry_cpuset:
2114 	cpuset_mems_cookie = read_mems_allowed_begin();
2115
2116 	/*
2117 	 * No reference counting needed for current->mempolicy
2118 	 * nor system default_policy
2119 	 */
2120 	if (pol->mode == MPOL_INTERLEAVE)
2121 		page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
2122 	else
2123 		page = __alloc_pages_nodemask(gfp, order,
2124 				policy_zonelist(gfp, pol, numa_node_id()),
2125 				policy_nodemask(gfp, pol));
2126
2127 	if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
2128 		goto retry_cpuset;
2129
2130 	return page;
2131 }
2132 EXPORT_SYMBOL(alloc_pages_current);
2133
2134 int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2135 {
2136 	struct mempolicy *pol = mpol_dup(vma_policy(src));
2137
2138 	if (IS_ERR(pol))
2139 		return PTR_ERR(pol);
2140 	dst->vm_policy = pol;
2141 	return 0;
2142 }
2143
2144 /*
2145  * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
2146  * rebinds the mempolicy its copying by calling mpol_rebind_policy()
2147  * with the mems_allowed returned by cpuset_mems_allowed().  This
2148  * keeps mempolicies cpuset relative after its cpuset moves.  See
2149  * further kernel/cpuset.c update_nodemask().
2150  *
2151  * current's mempolicy may be rebinded by the other task(the task that changes
2152  * cpuset's mems), so we needn't do rebind work for current task.
2153  */
2154
2155 /* Slow path of a mempolicy duplicate */
2156 struct mempolicy *__mpol_dup(struct mempolicy *old)
2157 {
2158 	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2159
2160 	if (!new)
2161 		return ERR_PTR(-ENOMEM);
2162
2163 	/* task's mempolicy is protected by alloc_lock */
2164 	if (old == current->mempolicy) {
2165 		task_lock(current);
2166 		*new = *old;
2167 		task_unlock(current);
2168 	} else
2169 		*new = *old;
2170
2171 	if (current_cpuset_is_being_rebound()) {
2172 		nodemask_t mems = cpuset_mems_allowed(current);
2173 		if (new->flags & MPOL_F_REBINDING)
2174 			mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2);
2175 		else
2176 			mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE);
2177 	}
2178 	atomic_set(&new->refcnt, 1);
2179 	return new;
2180 }
2181
2182 /* Slow path of a mempolicy comparison */
2183 bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2184 {
2185 	if (!a || !b)
2186 		return false;
2187 	if (a->mode != b->mode)
2188 		return false;
2189 	if (a->flags != b->flags)
2190 		return false;
2191 	if (mpol_store_user_nodemask(a))
2192 		if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2193 			return false;
2194
2195 	switch (a->mode) {
2196 	case MPOL_BIND:
2197 		/* Fall through */
2198 	case MPOL_INTERLEAVE:
2199 		return !!nodes_equal(a->v.nodes, b->v.nodes);
2200 	case MPOL_PREFERRED:
2201 		/* a's ->flags is the same as b's */
2202 		if (a->flags & MPOL_F_LOCAL)
2203 			return true;
2204 		return a->v.preferred_node == b->v.preferred_node;
2205 	default:
2206 		BUG();
2207 		return false;
2208 	}
2209 }
2210
2211 /*
2212  * Shared memory backing store policy support.
2213  *
2214  * Remember policies even when nobody has shared memory mapped.
2215  * The policies are kept in Red-Black tree linked from the inode.
2216  * They are protected by the sp->lock rwlock, which should be held
2217  * for any accesses to the tree.
2218  */
2219
2220 /*
2221  * lookup first element intersecting start-end.  Caller holds sp->lock for
2222  * reading or for writing
2223  */
2224 static struct sp_node *
2225 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2226 {
2227 	struct rb_node *n = sp->root.rb_node;
2228
2229 	while (n) {
2230 		struct sp_node *p = rb_entry(n, struct sp_node, nd);
2231
2232 		if (start >= p->end)
2233 			n = n->rb_right;
2234 		else if (end <= p->start)
2235 			n = n->rb_left;
2236 		else
2237 			break;
2238 	}
2239 	if (!n)
2240 		return NULL;
2241 	for (;;) {
2242 		struct sp_node *w = NULL;
2243 		struct rb_node *prev = rb_prev(n);
2244 		if (!prev)
2245 			break;
2246 		w = rb_entry(prev, struct sp_node, nd);
2247 		if (w->end <= start)
2248 			break;
2249 		n = prev;
2250 	}
2251 	return rb_entry(n, struct sp_node, nd);
2252 }
2253
2254 /*
2255  * Insert a new shared policy into the list.  Caller holds sp->lock for
2256  * writing.
2257  */
2258 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2259 {
2260 	struct rb_node **p = &sp->root.rb_node;
2261 	struct rb_node *parent = NULL;
2262 	struct sp_node *nd;
2263
2264 	while (*p) {
2265 		parent = *p;
2266 		nd = rb_entry(parent, struct sp_node, nd);
2267 		if (new->start < nd->start)
2268 			p = &(*p)->rb_left;
2269 		else if (new->end > nd->end)
2270 			p = &(*p)->rb_right;
2271 		else
2272 			BUG();
2273 	}
2274 	rb_link_node(&new->nd, parent, p);
2275 	rb_insert_color(&new->nd, &sp->root);
2276 	pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
2277 		 new->policy ? new->policy->mode : 0);
2278 }
2279
2280 /* Find shared policy intersecting idx */
2281 struct mempolicy *
2282 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2283 {
2284 	struct mempolicy *pol = NULL;
2285 	struct sp_node *sn;
2286
2287 	if (!sp->root.rb_node)
2288 		return NULL;
2289 	read_lock(&sp->lock);
2290 	sn = sp_lookup(sp, idx, idx+1);
2291 	if (sn) {
2292 		mpol_get(sn->policy);
2293 		pol = sn->policy;
2294 	}
2295 	read_unlock(&sp->lock);
2296 	return pol;
2297 }
2298
2299 static void sp_free(struct sp_node *n)
2300 {
2301 	mpol_put(n->policy);
2302 	kmem_cache_free(sn_cache, n);
2303 }
2304
2305 /**
2306  * mpol_misplaced - check whether current page node is valid in policy
2307  *
2308  * @page: page to be checked
2309  * @vma: vm area where page mapped
2310  * @addr: virtual address where page mapped
2311  *
2312  * Lookup current policy node id for vma,addr and "compare to" page's
2313  * node id.
2314  *
2315  * Returns:
2316  *	-1	- not misplaced, page is in the right node
2317  *	node	- node id where the page should be
2318  *
2319  * Policy determination "mimics" alloc_page_vma().
2320  * Called from fault path where we know the vma and faulting address.
2321  */
2322 int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
2323 {
2324 	struct mempolicy *pol;
2325 	struct zoneref *z;
2326 	int curnid = page_to_nid(page);
2327 	unsigned long pgoff;
2328 	int thiscpu = raw_smp_processor_id();
2329 	int thisnid = cpu_to_node(thiscpu);
2330 	int polnid = -1;
2331 	int ret = -1;
2332
2333 	BUG_ON(!vma);
2334
2335 	pol = get_vma_policy(vma, addr);
2336 	if (!(pol->flags & MPOL_F_MOF))
2337 		goto out;
2338
2339 	switch (pol->mode) {
2340 	case MPOL_INTERLEAVE:
2341 		BUG_ON(addr >= vma->vm_end);
2342 		BUG_ON(addr < vma->vm_start);
2343
2344 		pgoff = vma->vm_pgoff;
2345 		pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
2346 		polnid = offset_il_node(pol, vma, pgoff);
2347 		break;
2348
2349 	case MPOL_PREFERRED:
2350 		if (pol->flags & MPOL_F_LOCAL)
2351 			polnid = numa_node_id();
2352 		else
2353 			polnid = pol->v.preferred_node;
2354 		break;
2355
2356 	case MPOL_BIND:
2357
2358 		/*
2359 		 * allows binding to multiple nodes.
2360 		 * use current page if in policy nodemask,
2361 		 * else select nearest allowed node, if any.
2362 		 * If no allowed nodes, use current [!misplaced].
2363 		 */
2364 		if (node_isset(curnid, pol->v.nodes))
2365 			goto out;
2366 		z = first_zones_zonelist(
2367 				node_zonelist(numa_node_id(), GFP_HIGHUSER),
2368 				gfp_zone(GFP_HIGHUSER),
2369 				&pol->v.nodes);
2370 		polnid = z->zone->node;
2371 		break;
2372
2373 	default:
2374 		BUG();
2375 	}
2376
2377 	/* Migrate the page towards the node whose CPU is referencing it */
2378 	if (pol->flags & MPOL_F_MORON) {
2379 		polnid = thisnid;
2380
2381 		if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
2382 			goto out;
2383 	}
2384
2385 	if (curnid != polnid)
2386 		ret = polnid;
2387 out:
2388 	mpol_cond_put(pol);
2389
2390 	return ret;
2391 }
2392
2393 /*
2394  * Drop the (possibly final) reference to task->mempolicy.  It needs to be
2395  * dropped after task->mempolicy is set to NULL so that any allocation done as
2396  * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed
2397  * policy.
2398  */
2399 void mpol_put_task_policy(struct task_struct *task)
2400 {
2401 	struct mempolicy *pol;
2402
2403 	task_lock(task);
2404 	pol = task->mempolicy;
2405 	task->mempolicy = NULL;
2406 	task_unlock(task);
2407 	mpol_put(pol);
2408 }
2409
2410 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2411 {
2412 	pr_debug("deleting %lx-l%lx\n", n->start, n->end);
2413 	rb_erase(&n->nd, &sp->root);
2414 	sp_free(n);
2415 }
2416
2417 static void sp_node_init(struct sp_node *node, unsigned long start,
2418 			unsigned long end, struct mempolicy *pol)
2419 {
2420 	node->start = start;
2421 	node->end = end;
2422 	node->policy = pol;
2423 }
2424
2425 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2426 				struct mempolicy *pol)
2427 {
2428 	struct sp_node *n;
2429 	struct mempolicy *newpol;
2430
2431 	n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2432 	if (!n)
2433 		return NULL;
2434
2435 	newpol = mpol_dup(pol);
2436 	if (IS_ERR(newpol)) {
2437 		kmem_cache_free(sn_cache, n);
2438 		return NULL;
2439 	}
2440 	newpol->flags |= MPOL_F_SHARED;
2441 	sp_node_init(n, start, end, newpol);
2442
2443 	return n;
2444 }
2445
2446 /* Replace a policy range. */
2447 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2448 				 unsigned long end, struct sp_node *new)
2449 {
2450 	struct sp_node *n;
2451 	struct sp_node *n_new = NULL;
2452 	struct mempolicy *mpol_new = NULL;
2453 	int ret = 0;
2454
2455 restart:
2456 	write_lock(&sp->lock);
2457 	n = sp_lookup(sp, start, end);
2458 	/* Take care of old policies in the same range. */
2459 	while (n && n->start < end) {
2460 		struct rb_node *next = rb_next(&n->nd);
2461 		if (n->start >= start) {
2462 			if (n->end <= end)
2463 				sp_delete(sp, n);
2464 			else
2465 				n->start = end;
2466 		} else {
2467 			/* Old policy spanning whole new range. */
2468 			if (n->end > end) {
2469 				if (!n_new)
2470 					goto alloc_new;
2471
2472 				*mpol_new = *n->policy;
2473 				atomic_set(&mpol_new->refcnt, 1);
2474 				sp_node_init(n_new, end, n->end, mpol_new);
2475 				n->end = start;
2476 				sp_insert(sp, n_new);
2477 				n_new = NULL;
2478 				mpol_new = NULL;
2479 				break;
2480 			} else
2481 				n->end = start;
2482 		}
2483 		if (!next)
2484 			break;
2485 		n = rb_entry(next, struct sp_node, nd);
2486 	}
2487 	if (new)
2488 		sp_insert(sp, new);
2489 	write_unlock(&sp->lock);
2490 	ret = 0;
2491
2492 err_out:
2493 	if (mpol_new)
2494 		mpol_put(mpol_new);
2495 	if (n_new)
2496 		kmem_cache_free(sn_cache, n_new);
2497
2498 	return ret;
2499
2500 alloc_new:
2501 	write_unlock(&sp->lock);
2502 	ret = -ENOMEM;
2503 	n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2504 	if (!n_new)
2505 		goto err_out;
2506 	mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2507 	if (!mpol_new)
2508 		goto err_out;
2509 	goto restart;
2510 }
2511
2512 /**
2513  * mpol_shared_policy_init - initialize shared policy for inode
2514  * @sp: pointer to inode shared policy
2515  * @mpol:  struct mempolicy to install
2516  *
2517  * Install non-NULL @mpol in inode's shared policy rb-tree.
2518  * On entry, the current task has a reference on a non-NULL @mpol.
2519  * This must be released on exit.
2520  * This is called at get_inode() calls and we can use GFP_KERNEL.
2521  */
2522 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2523 {
2524 	int ret;
2525
2526 	sp->root = RB_ROOT;		/* empty tree == default mempolicy */
2527 	rwlock_init(&sp->lock);
2528
2529 	if (mpol) {
2530 		struct vm_area_struct pvma;
2531 		struct mempolicy *new;
2532 		NODEMASK_SCRATCH(scratch);
2533
2534 		if (!scratch)
2535 			goto put_mpol;
2536 		/* contextualize the tmpfs mount point mempolicy */
2537 		new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
2538 		if (IS_ERR(new))
2539 			goto free_scratch; /* no valid nodemask intersection */
2540
2541 		task_lock(current);
2542 		ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
2543 		task_unlock(current);
2544 		if (ret)
2545 			goto put_new;
2546
2547 		/* Create pseudo-vma that contains just the policy */
2548 		memset(&pvma, 0, sizeof(struct vm_area_struct));
2549 		pvma.vm_end = TASK_SIZE;	/* policy covers entire file */
2550 		mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
2551
2552 put_new:
2553 		mpol_put(new);			/* drop initial ref */
2554 free_scratch:
2555 		NODEMASK_SCRATCH_FREE(scratch);
2556 put_mpol:
2557 		mpol_put(mpol);	/* drop our incoming ref on sb mpol */
2558 	}
2559 }
2560
2561 int mpol_set_shared_policy(struct shared_policy *info,
2562 			struct vm_area_struct *vma, struct mempolicy *npol)
2563 {
2564 	int err;
2565 	struct sp_node *new = NULL;
2566 	unsigned long sz = vma_pages(vma);
2567
2568 	pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
2569 		 vma->vm_pgoff,
2570 		 sz, npol ? npol->mode : -1,
2571 		 npol ? npol->flags : -1,
2572 		 npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);
2573
2574 	if (npol) {
2575 		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2576 		if (!new)
2577 			return -ENOMEM;
2578 	}
2579 	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2580 	if (err && new)
2581 		sp_free(new);
2582 	return err;
2583 }
2584
2585 /* Free a backing policy store on inode delete. */
2586 void mpol_free_shared_policy(struct shared_policy *p)
2587 {
2588 	struct sp_node *n;
2589 	struct rb_node *next;
2590
2591 	if (!p->root.rb_node)
2592 		return;
2593 	write_lock(&p->lock);
2594 	next = rb_first(&p->root);
2595 	while (next) {
2596 		n = rb_entry(next, struct sp_node, nd);
2597 		next = rb_next(&n->nd);
2598 		sp_delete(p, n);
2599 	}
2600 	write_unlock(&p->lock);
2601 }
2602
2603 #ifdef CONFIG_NUMA_BALANCING
2604 static int __initdata numabalancing_override;
2605
2606 static void __init check_numabalancing_enable(void)
2607 {
2608 	bool numabalancing_default = false;
2609
2610 	if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
2611 		numabalancing_default = true;
2612
2613 	/* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
2614 	if (numabalancing_override)
2615 		set_numabalancing_state(numabalancing_override == 1);
2616
2617 	if (num_online_nodes() > 1 && !numabalancing_override) {
2618 		pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
2619 			numabalancing_default ? "Enabling" : "Disabling");
2620 		set_numabalancing_state(numabalancing_default);
2621 	}
2622 }
2623
2624 static int __init setup_numabalancing(char *str)
2625 {
2626 	int ret = 0;
2627 	if (!str)
2628 		goto out;
2629
2630 	if (!strcmp(str, "enable")) {
2631 		numabalancing_override = 1;
2632 		ret = 1;
2633 	} else if (!strcmp(str, "disable")) {
2634 		numabalancing_override = -1;
2635 		ret = 1;
2636 	}
2637 out:
2638 	if (!ret)
2639 		pr_warn("Unable to parse numa_balancing=\n");
2640
2641 	return ret;
2642 }
2643 __setup("numa_balancing=", setup_numabalancing);
2644 #else
2645 static inline void __init check_numabalancing_enable(void)
2646 {
2647 }
2648 #endif /* CONFIG_NUMA_BALANCING */
2649
2650 /* assumes fs == KERNEL_DS */
2651 void __init numa_policy_init(void)
2652 {
2653 	nodemask_t interleave_nodes;
2654 	unsigned long largest = 0;
2655 	int nid, prefer = 0;
2656
2657 	policy_cache = kmem_cache_create("numa_policy",
2658 					 sizeof(struct mempolicy),
2659 					 0, SLAB_PANIC, NULL);
2660
2661 	sn_cache = kmem_cache_create("shared_policy_node",
2662 				     sizeof(struct sp_node),
2663 				     0, SLAB_PANIC, NULL);
2664
2665 	for_each_node(nid) {
2666 		preferred_node_policy[nid] = (struct mempolicy) {
2667 			.refcnt = ATOMIC_INIT(1),
2668 			.mode = MPOL_PREFERRED,
2669 			.flags = MPOL_F_MOF | MPOL_F_MORON,
2670 			.v = { .preferred_node = nid, },
2671 		};
2672 	}
2673
2674 	/*
2675 	 * Set interleaving policy for system init. Interleaving is only
2676 	 * enabled across suitably sized nodes (default is >= 16MB), or
2677 	 * fall back to the largest node if they're all smaller.
2678 	 */
2679 	nodes_clear(interleave_nodes);
2680 	for_each_node_state(nid, N_MEMORY) {
2681 		unsigned long total_pages = node_present_pages(nid);
2682
2683 		/* Preserve the largest node */
2684 		if (largest < total_pages) {
2685 			largest = total_pages;
2686 			prefer = nid;
2687 		}
2688
2689 		/* Interleave this node? */
2690 		if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2691 			node_set(nid, interleave_nodes);
2692 	}
2693
2694 	/* All too small, use the largest */
2695 	if (unlikely(nodes_empty(interleave_nodes)))
2696 		node_set(prefer, interleave_nodes);
2697
2698 	if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2699 		pr_err("%s: interleaving failed\n", __func__);
2700
2701 	check_numabalancing_enable();
2702 }
2703
2704 /* Reset policy of current process to default */
2705 void numa_default_policy(void)
2706 {
2707 	do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
2708 }
2709
2710 /*
2711  * Parse and format mempolicy from/to strings
2712  */
2713
2714 /*
2715  * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag.
2716  */
2717 static const char * const policy_modes[] =
2718 {
2719 	[MPOL_DEFAULT]    = "default",
2720 	[MPOL_PREFERRED]  = "prefer",
2721 	[MPOL_BIND]       = "bind",
2722 	[MPOL_INTERLEAVE] = "interleave",
2723 	[MPOL_LOCAL]      = "local",
2724 };
2725
2726
2727 #ifdef CONFIG_TMPFS
2728 /**
2729  * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
2730  * @str:  string containing mempolicy to parse
2731  * @mpol:  pointer to struct mempolicy pointer, returned on success.
2732  *
2733  * Format of input:
2734  *	<mode>[=<flags>][:<nodelist>]
2735  *
2736  * On success, returns 0, else 1
2737  */
2738 int mpol_parse_str(char *str, struct mempolicy **mpol)
2739 {
2740 	struct mempolicy *new = NULL;
2741 	unsigned short mode;
2742 	unsigned short mode_flags;
2743 	nodemask_t nodes;
2744 	char *nodelist = strchr(str, ':');
2745 	char *flags = strchr(str, '=');
2746 	int err = 1;
2747
2748 	if (nodelist) {
2749 		/* NUL-terminate mode or flags string */
2750 		*nodelist++ = '\0';
2751 		if (nodelist_parse(nodelist, nodes))
2752 			goto out;
2753 		if (!nodes_subset(nodes, node_states[N_MEMORY]))
2754 			goto out;
2755 	} else
2756 		nodes_clear(nodes);
2757
2758 	if (flags)
2759 		*flags++ = '\0';	/* terminate mode string */
2760
2761 	for (mode = 0; mode < MPOL_MAX; mode++) {
2762 		if (!strcmp(str, policy_modes[mode])) {
2763 			break;
2764 		}
2765 	}
2766 	if (mode >= MPOL_MAX)
2767 		goto out;
2768
2769 	switch (mode) {
2770 	case MPOL_PREFERRED:
2771 		/*
2772 		 * Insist on a nodelist of one node only
2773 		 */
2774 		if (nodelist) {
2775 			char *rest = nodelist;
2776 			while (isdigit(*rest))
2777 				rest++;
2778 			if (*rest)
2779 				goto out;
2780 		}
2781 		break;
2782 	case MPOL_INTERLEAVE:
2783 		/*
2784 		 * Default to online nodes with memory if no nodelist
2785 		 */
2786 		if (!nodelist)
2787 			nodes = node_states[N_MEMORY];
2788 		break;
2789 	case MPOL_LOCAL:
2790 		/*
2791 		 * Don't allow a nodelist;  mpol_new() checks flags
2792 		 */
2793 		if (nodelist)
2794 			goto out;
2795 		mode = MPOL_PREFERRED;
2796 		break;
2797 	case MPOL_DEFAULT:
2798 		/*
2799 		 * Insist on a empty nodelist
2800 		 */
2801 		if (!nodelist)
2802 			err = 0;
2803 		goto out;
2804 	case MPOL_BIND:
2805 		/*
2806 		 * Insist on a nodelist
2807 		 */
2808 		if (!nodelist)
2809 			goto out;
2810 	}
2811
2812 	mode_flags = 0;
2813 	if (flags) {
2814 		/*
2815 		 * Currently, we only support two mutually exclusive
2816 		 * mode flags.
2817 		 */
2818 		if (!strcmp(flags, "static"))
2819 			mode_flags |= MPOL_F_STATIC_NODES;
2820 		else if (!strcmp(flags, "relative"))
2821 			mode_flags |= MPOL_F_RELATIVE_NODES;
2822 		else
2823 			goto out;
2824 	}
2825
2826 	new = mpol_new(mode, mode_flags, &nodes);
2827 	if (IS_ERR(new))
2828 		goto out;
2829
2830 	/*
2831 	 * Save nodes for mpol_to_str() to show the tmpfs mount options
2832 	 * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
2833 	 */
2834 	if (mode != MPOL_PREFERRED)
2835 		new->v.nodes = nodes;
2836 	else if (nodelist)
2837 		new->v.preferred_node = first_node(nodes);
2838 	else
2839 		new->flags |= MPOL_F_LOCAL;
2840
2841 	/*
2842 	 * Save nodes for contextualization: this will be used to "clone"
2843 	 * the mempolicy in a specific context [cpuset] at a later time.
2844 	 */
2845 	new->w.user_nodemask = nodes;
2846
2847 	err = 0;
2848
2849 out:
2850 	/* Restore string for error message */
2851 	if (nodelist)
2852 		*--nodelist = ':';
2853 	if (flags)
2854 		*--flags = '=';
2855 	if (!err)
2856 		*mpol = new;
2857 	return err;
2858 }
2859 #endif /* CONFIG_TMPFS */
2860
2861 /**
2862  * mpol_to_str - format a mempolicy structure for printing
2863  * @buffer:  to contain formatted mempolicy string
2864  * @maxlen:  length of @buffer
2865  * @pol:  pointer to mempolicy to be formatted
2866  *
2867  * Convert @pol into a string.  If @buffer is too short, truncate the string.
2868  * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
2869  * longest flag, "relative", and to display at least a few node ids.
2870  */
2871 void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
2872 {
2873 	char *p = buffer;
2874 	nodemask_t nodes = NODE_MASK_NONE;
2875 	unsigned short mode = MPOL_DEFAULT;
2876 	unsigned short flags = 0;
2877
2878 	if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
2879 		mode = pol->mode;
2880 		flags = pol->flags;
2881 	}
2882
2883 	switch (mode) {
2884 	case MPOL_DEFAULT:
2885 		break;
2886 	case MPOL_PREFERRED:
2887 		if (flags & MPOL_F_LOCAL)
2888 			mode = MPOL_LOCAL;
2889 		else
2890 			node_set(pol->v.preferred_node, nodes);
2891 		break;
2892 	case MPOL_BIND:
2893 	case MPOL_INTERLEAVE:
2894 		nodes = pol->v.nodes;
2895 		break;
2896 	default:
2897 		WARN_ON_ONCE(1);
2898 		snprintf(p, maxlen, "unknown");
2899 		return;
2900 	}
2901
2902 	p += snprintf(p, maxlen, "%s", policy_modes[mode]);
2903
2904 	if (flags & MPOL_MODE_FLAGS) {
2905 		p += snprintf(p, buffer + maxlen - p, "=");
2906
2907 		/*
2908 		 * Currently, the only defined flags are mutually exclusive
2909 		 */
2910 		if (flags & MPOL_F_STATIC_NODES)
2911 			p += snprintf(p, buffer + maxlen - p, "static");
2912 		else if (flags & MPOL_F_RELATIVE_NODES)
2913 			p += snprintf(p, buffer + maxlen - p, "relative");
2914 	}
2915
2916 	if (!nodes_empty(nodes))
2917 		p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
2918 			       nodemask_pr_args(&nodes));
2919 }
2920