/*
 *  mm/mprotect.c
 *
 *  (C) Copyright 1994 Linus Torvalds
 *  (C) Copyright 2002 Christoph Hellwig
 *
 *  Address space accounting code	<alan@lxorguk.ukuu.org.uk>
 *  (C) Copyright 2002 Red Hat Inc, All Rights Reserved
 */

#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/shm.h>
#include <linux/mman.h>
#include <linux/fs.h>
#include <linux/highmem.h>
#include <linux/security.h>
#include <linux/mempolicy.h>
#include <linux/personality.h>
#include <linux/syscalls.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/mmu_notifier.h>
#include <linux/migrate.h>
#include <linux/perf_event.h>
#include <linux/pkeys.h>
#include <linux/ksm.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/cacheflush.h>
#include <asm/mmu_context.h>
#include <asm/tlbflush.h>

#include "internal.h"

/*
 * For a prot_numa update we only hold mmap_sem for read so there is a
 * potential race with faulting where a pmd was temporarily none. This
 * function checks for a transhuge pmd under the appropriate lock. It
 * returns a pte if it was successfully locked or NULL if it raced with
 * a transhuge insertion.
 */
static pte_t *lock_pte_protection(struct vm_area_struct *vma, pmd_t *pmd,
			unsigned long addr, int prot_numa, spinlock_t **ptl)
{
	pte_t *pte;
	spinlock_t *pmdl;

	/* !prot_numa is protected by mmap_sem held for write */
	if (!prot_numa)
		return pte_offset_map_lock(vma->vm_mm, pmd, addr, ptl);

	pmdl = pmd_lock(vma->vm_mm, pmd);
	if (unlikely(pmd_trans_huge(*pmd) || pmd_none(*pmd))) {
		spin_unlock(pmdl);
		return NULL;
	}

	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, ptl);
	spin_unlock(pmdl);
	return pte;
}

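/*
 * Update the protection bits of the ptes mapped by @pmd in the range
 * [@addr, @end) to @newprot.  For prot_numa updates, zero/KSM pages and
 * already-prot_none ptes are skipped; write migration entries are
 * downgraded to read-only.  Returns the number of pages updated.
 */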
static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
		unsigned long addr, unsigned long end, pgprot_t newprot,
		int dirty_accountable, int prot_numa)
{
	struct mm_struct *mm = vma->vm_mm;
	pte_t *pte, oldpte;
	spinlock_t *ptl;
	unsigned long pages = 0;

	pte = lock_pte_protection(vma, pmd, addr, prot_numa, &ptl);
	if (!pte)
		return 0;

	flush_tlb_batched_pending(vma->vm_mm);
	arch_enter_lazy_mmu_mode();
	do {
		oldpte = *pte;
		if (pte_present(oldpte)) {
			pte_t ptent;
			bool preserve_write = prot_numa && pte_write(oldpte);

			/*
			 * Avoid trapping faults against the zero or KSM
			 * pages. See similar comment in change_huge_pmd.
			 */
			if (prot_numa) {
				struct page *page;

				page = vm_normal_page(vma, addr, oldpte);
				if (!page || PageKsm(page))
					continue;

				/* Avoid TLB flush if possible */
				if (pte_protnone(oldpte))
					continue;
			}

			ptent = ptep_modify_prot_start(mm, addr, pte);
			ptent = pte_modify(ptent, newprot);
			if (preserve_write)
				ptent = pte_mkwrite(ptent);

			/* Avoid taking write faults for known dirty pages */
			if (dirty_accountable && pte_dirty(ptent) &&
					(pte_soft_dirty(ptent) ||
					 !(vma->vm_flags & VM_SOFTDIRTY))) {
				ptent = pte_mkwrite(ptent);
			}
			ptep_modify_prot_commit(mm, addr, pte, ptent);
			pages++;
		} else if (IS_ENABLED(CONFIG_MIGRATION)) {
			swp_entry_t entry = pte_to_swp_entry(oldpte);

			if (is_write_migration_entry(entry)) {
				pte_t newpte;
				/*
				 * A protection check is difficult so
				 * just be safe and disable write
				 */
				make_migration_entry_read(&entry);
				newpte = swp_entry_to_pte(entry);
				if (pte_swp_soft_dirty(oldpte))
					newpte = pte_swp_mksoft_dirty(newpte);
				set_pte_at(mm, addr, pte, newpte);

				pages++;
			}
		}
	} while (pte++, addr += PAGE_SIZE, addr != end);
	arch_leave_lazy_mmu_mode();
	pte_unmap_unlock(pte - 1, ptl);

	return pages;
}

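/*
 * Apply @newprot to the pmds covering [@addr, @end).  Transparent huge
 * pmds that span the whole range are handled via change_huge_pmd();
 * partially covered ones are split first.  The mmu notifier range is
 * only invalidated if at least one populated pmd was encountered.
 */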
static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
		pud_t *pud, unsigned long addr, unsigned long end,
		pgprot_t newprot, int dirty_accountable, int prot_numa)
{
	pmd_t *pmd;
	struct mm_struct *mm = vma->vm_mm;
	unsigned long next;
	unsigned long pages = 0;
	unsigned long nr_huge_updates = 0;
	unsigned long mni_start = 0;

	pmd = pmd_offset(pud, addr);
	do {
		unsigned long this_pages;

		next = pmd_addr_end(addr, end);
		if (!pmd_trans_huge(*pmd) && !pmd_devmap(*pmd)
				&& pmd_none_or_clear_bad(pmd))
			continue;

		/* invoke the mmu notifier if the pmd is populated */
		if (!mni_start) {
			mni_start = addr;
			mmu_notifier_invalidate_range_start(mm, mni_start, end);
		}

		if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
			if (next - addr != HPAGE_PMD_SIZE) {
				split_huge_pmd(vma, pmd, addr);
				if (pmd_trans_unstable(pmd))
					continue;
			} else {
				int nr_ptes = change_huge_pmd(vma, pmd, addr,
						newprot, prot_numa);

				if (nr_ptes) {
					if (nr_ptes == HPAGE_PMD_NR) {
						pages += HPAGE_PMD_NR;
						nr_huge_updates++;
					}

					/* huge pmd was handled */
					continue;
				}
			}
			/* fall through, the trans huge pmd just split */
		}
		this_pages = change_pte_range(vma, pmd, addr, next, newprot,
				 dirty_accountable, prot_numa);
		pages += this_pages;
	} while (pmd++, addr = next, addr != end);

	if (mni_start)
		mmu_notifier_invalidate_range_end(mm, mni_start, end);

	if (nr_huge_updates)
		count_vm_numa_events(NUMA_HUGE_PTE_UPDATES, nr_huge_updates);
	return pages;
}

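/*
 * Walk the puds under @pgd for [@addr, @end), descending into each
 * populated pud.  Returns the total number of pages updated.
 */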
static inline unsigned long change_pud_range(struct vm_area_struct *vma,
		pgd_t *pgd, unsigned long addr, unsigned long end,
		pgprot_t newprot, int dirty_accountable, int prot_numa)
{
	pud_t *pud;
	unsigned long next;
	unsigned long pages = 0;

	pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
			continue;
		pages += change_pmd_range(vma, pud, addr, next, newprot,
				 dirty_accountable, prot_numa);
	} while (pud++, addr = next, addr != end);

	return pages;
}

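/*
 * Top level of the page-table walk: flush the cache for the range, mark
 * a TLB flush as pending, walk the pgds and, if any entries were
 * modified, flush the TLB for [@addr, @end).
 */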
static unsigned long change_protection_range(struct vm_area_struct *vma,
		unsigned long addr, unsigned long end, pgprot_t newprot,
		int dirty_accountable, int prot_numa)
{
	struct mm_struct *mm = vma->vm_mm;
	pgd_t *pgd;
	unsigned long next;
	unsigned long start = addr;
	unsigned long pages = 0;

	BUG_ON(addr >= end);
	pgd = pgd_offset(mm, addr);
	flush_cache_range(vma, addr, end);
	set_tlb_flush_pending(mm);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		pages += change_pud_range(vma, pgd, addr, next, newprot,
				 dirty_accountable, prot_numa);
	} while (pgd++, addr = next, addr != end);

	/* Only flush the TLB if we actually modified any entries: */
	if (pages)
		flush_tlb_range(vma, start, end);
	clear_tlb_flush_pending(mm);

	return pages;
}

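/*
 * Apply @newprot to [start, end): hugetlb VMAs take the hugetlb-specific
 * path, everything else goes through the generic page-table walk above.
 * Returns the number of pages updated.
 */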
unsigned long change_protection(struct vm_area_struct *vma, unsigned long start,
		       unsigned long end, pgprot_t newprot,
		       int dirty_accountable, int prot_numa)
{
	unsigned long pages;

	if (is_vm_hugetlb_page(vma))
		pages = hugetlb_change_protection(vma, start, end, newprot);
	else
		pages = change_protection_range(vma, start, end, newprot, dirty_accountable, prot_numa);

	return pages;
}

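/*
 * Callbacks for the PROT_NONE pfn-map walk below: each pte (normal or
 * hugetlb) is checked with pfn_modify_allowed() against the prospective
 * page protection passed in walk->private.
 */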
static int prot_none_pte_entry(pte_t *pte, unsigned long addr,
			       unsigned long next, struct mm_walk *walk)
{
	return pfn_modify_allowed(pte_pfn(*pte), *(pgprot_t *)(walk->private)) ?
		0 : -EACCES;
}

static int prot_none_hugetlb_entry(pte_t *pte, unsigned long hmask,
				   unsigned long addr, unsigned long next,
				   struct mm_walk *walk)
{
	return pfn_modify_allowed(pte_pfn(*pte), *(pgprot_t *)(walk->private)) ?
		0 : -EACCES;
}

static int prot_none_test(unsigned long addr, unsigned long next,
			  struct mm_walk *walk)
{
	return 0;
}

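/*
 * Walk the page tables of @vma over [start, end) and return -EACCES if
 * any mapped pfn may not be converted to the page protection implied by
 * @newflags.
 */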
static int prot_none_walk(struct vm_area_struct *vma, unsigned long start,
			  unsigned long end, unsigned long newflags)
{
	pgprot_t new_pgprot = vm_get_page_prot(newflags);
	struct mm_walk prot_none_walk = {
		.pte_entry = prot_none_pte_entry,
		.hugetlb_entry = prot_none_hugetlb_entry,
		.test_walk = prot_none_test,
		.mm = current->mm,
		.private = &new_pgprot,
	};

	return walk_page_range(start, end, &prot_none_walk);
}

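/*
 * Change the protection of [start, end) within @vma to @newflags:
 * charge for newly writable private memory, try to merge with the
 * neighbouring vmas, split the vma if the region does not cover it
 * exactly, then rewrite the page tables and update the statistics.
 * Must be called with mmap_sem held for write.
 */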
int
mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
	unsigned long start, unsigned long end, unsigned long newflags)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long oldflags = vma->vm_flags;
	long nrpages = (end - start) >> PAGE_SHIFT;
	unsigned long charged = 0;
	pgoff_t pgoff;
	int error;
	int dirty_accountable = 0;

	if (newflags == oldflags) {
		*pprev = vma;
		return 0;
	}

	/*
	 * Do PROT_NONE PFN permission checks here when we can still
	 * bail out without undoing a lot of state. This is a rather
	 * uncommon case, so doesn't need to be very optimized.
	 */
	if (arch_has_pfn_modify_check() &&
	    (vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
	    (newflags & (VM_READ|VM_WRITE|VM_EXEC)) == 0) {
		error = prot_none_walk(vma, start, end, newflags);
		if (error)
			return error;
	}

	/*
	 * If we make a private mapping writable we increase our commit;
	 * but (without finer accounting) cannot reduce our commit if we
	 * make it unwritable again. hugetlb mappings were accounted for
	 * even if read-only, so there is no need to account for them here.
	 */
	if (newflags & VM_WRITE) {
		/* Check space limits when area turns into data. */
		if (!may_expand_vm(mm, newflags, nrpages) &&
				may_expand_vm(mm, oldflags, nrpages))
			return -ENOMEM;
		if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_HUGETLB|
						VM_SHARED|VM_NORESERVE))) {
			charged = nrpages;
			if (security_vm_enough_memory_mm(mm, charged))
				return -ENOMEM;
			newflags |= VM_ACCOUNT;
		}
	}

	/*
	 * First try to merge with previous and/or next vma.
	 */
	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
	*pprev = vma_merge(mm, *pprev, start, end, newflags,
			   vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
			   vma->vm_userfaultfd_ctx, vma_get_anon_name(vma));
	if (*pprev) {
		vma = *pprev;
		VM_WARN_ON((vma->vm_flags ^ newflags) & ~VM_SOFTDIRTY);
		goto success;
	}

	*pprev = vma;

	if (start != vma->vm_start) {
		error = split_vma(mm, vma, start, 1);
		if (error)
			goto fail;
	}

	if (end != vma->vm_end) {
		error = split_vma(mm, vma, end, 0);
		if (error)
			goto fail;
	}

success:
	/*
	 * vm_flags and vm_page_prot are protected by the mmap_sem
	 * held in write mode.
	 */
	vma->vm_flags = newflags;
	dirty_accountable = vma_wants_writenotify(vma, vma->vm_page_prot);
	vma_set_page_prot(vma);

	change_protection(vma, start, end, vma->vm_page_prot,
			  dirty_accountable, 0);

	/*
	 * Private VM_LOCKED VMA becoming writable: trigger COW to avoid major
	 * fault on access.
	 */
	if ((oldflags & (VM_WRITE | VM_SHARED | VM_LOCKED)) == VM_LOCKED &&
			(newflags & VM_WRITE)) {
		populate_vma_page_range(vma, start, end, NULL);
	}

	vm_stat_account(mm, oldflags, -nrpages);
	vm_stat_account(mm, newflags, nrpages);
	perf_event_mmap(vma);
	return 0;

fail:
	vm_unacct_memory(charged);
	return error;
}

/*
 * pkey==-1 when doing a legacy mprotect()
 */
static int do_mprotect_pkey(unsigned long start, size_t len,
		unsigned long prot, int pkey)
{
	unsigned long nstart, end, tmp, reqprot;
	struct vm_area_struct *vma, *prev;
	int error = -EINVAL;
	const int grows = prot & (PROT_GROWSDOWN|PROT_GROWSUP);
	const bool rier = (current->personality & READ_IMPLIES_EXEC) &&
				(prot & PROT_READ);

	prot &= ~(PROT_GROWSDOWN|PROT_GROWSUP);
	if (grows == (PROT_GROWSDOWN|PROT_GROWSUP)) /* can't be both */
		return -EINVAL;

	if (start & ~PAGE_MASK)
		return -EINVAL;
	if (!len)
		return 0;
	len = PAGE_ALIGN(len);
	end = start + len;
	if (end <= start)
		return -ENOMEM;
	if (!arch_validate_prot(prot))
		return -EINVAL;

	reqprot = prot;

	if (down_write_killable(&current->mm->mmap_sem))
		return -EINTR;

	/*
	 * If userspace did not allocate the pkey, do not let
	 * them use it here.
	 */
	error = -EINVAL;
	if ((pkey != -1) && !mm_pkey_is_allocated(current->mm, pkey))
		goto out;

	vma = find_vma(current->mm, start);
	error = -ENOMEM;
	if (!vma)
		goto out;
	prev = vma->vm_prev;
	if (unlikely(grows & PROT_GROWSDOWN)) {
		if (vma->vm_start >= end)
			goto out;
		start = vma->vm_start;
		error = -EINVAL;
		if (!(vma->vm_flags & VM_GROWSDOWN))
			goto out;
	} else {
		if (vma->vm_start > start)
			goto out;
		if (unlikely(grows & PROT_GROWSUP)) {
			end = vma->vm_end;
			error = -EINVAL;
			if (!(vma->vm_flags & VM_GROWSUP))
				goto out;
		}
	}
	if (start > vma->vm_start)
		prev = vma;

	for (nstart = start ; ; ) {
		unsigned long mask_off_old_flags;
		unsigned long newflags;
		int new_vma_pkey;

		/* Here we know that vma->vm_start <= nstart < vma->vm_end. */

		/* Does the application expect PROT_READ to imply PROT_EXEC */
		if (rier && (vma->vm_flags & VM_MAYEXEC))
			prot |= PROT_EXEC;

		/*
		 * Each mprotect() call explicitly passes r/w/x permissions.
		 * If a permission is not passed to mprotect(), it must be
		 * cleared from the VMA.
		 */
		mask_off_old_flags = VM_READ | VM_WRITE | VM_EXEC |
					ARCH_VM_PKEY_FLAGS;

		new_vma_pkey = arch_override_mprotect_pkey(vma, prot, pkey);
		newflags = calc_vm_prot_bits(prot, new_vma_pkey);
		newflags |= (vma->vm_flags & ~mask_off_old_flags);

		/* newflags >> 4 shifts VM_MAY% into place of VM_% */
		if ((newflags & ~(newflags >> 4)) & (VM_READ | VM_WRITE | VM_EXEC)) {
			error = -EACCES;
			goto out;
		}

		error = security_file_mprotect(vma, reqprot, prot);
		if (error)
			goto out;

		tmp = vma->vm_end;
		if (tmp > end)
			tmp = end;
		error = mprotect_fixup(vma, &prev, nstart, tmp, newflags);
		if (error)
			goto out;
		nstart = tmp;

		if (nstart < prev->vm_end)
			nstart = prev->vm_end;
		if (nstart >= end)
			goto out;

		vma = prev->vm_next;
		if (!vma || vma->vm_start != nstart) {
			error = -ENOMEM;
			goto out;
		}
		prot = reqprot;
	}
out:
	up_write(&current->mm->mmap_sem);
	return error;
}

SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
		unsigned long, prot)
{
	return do_mprotect_pkey(start, len, prot, -1);
}

SYSCALL_DEFINE4(pkey_mprotect, unsigned long, start, size_t, len,
		unsigned long, prot, int, pkey)
{
	return do_mprotect_pkey(start, len, prot, pkey);
}

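/*
 * Allocate a protection key for the current mm and initialize its access
 * rights to @init_val.  Returns the key number, or -ENOSPC when no key
 * is available.
 */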
SYSCALL_DEFINE2(pkey_alloc, unsigned long, flags, unsigned long, init_val)
{
	int pkey;
	int ret;

	/* No flags supported yet. */
	if (flags)
		return -EINVAL;
	/* check for unsupported init values */
	if (init_val & ~PKEY_ACCESS_MASK)
		return -EINVAL;

	down_write(&current->mm->mmap_sem);
	pkey = mm_pkey_alloc(current->mm);

	ret = -ENOSPC;
	if (pkey == -1)
		goto out;

	ret = arch_set_user_pkey_access(current, pkey, init_val);
	if (ret) {
		mm_pkey_free(current->mm, pkey);
		goto out;
	}
	ret = pkey;
out:
	up_write(&current->mm->mmap_sem);
	return ret;
}

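/*
 * Free a previously allocated protection key.  Returns the result of
 * mm_pkey_free(): 0 on success or a negative errno.
 */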
SYSCALL_DEFINE1(pkey_free, int, pkey)
{
	int ret;

	down_write(&current->mm->mmap_sem);
	ret = mm_pkey_free(current->mm, pkey);
	up_write(&current->mm->mmap_sem);

	/*
	 * We could provide warnings or errors if any VMA still
	 * has the pkey set here.
	 */
	return ret;
}
587 |