/*
 * linux/kernel/seccomp.c
 *
 * Copyright 2004-2005 Andrea Arcangeli <andrea@cpushare.com>
 *
 * Copyright (C) 2012 Google, Inc.
 * Will Drewry <wad@chromium.org>
 *
 * This defines a simple but solid secure-computing facility.
 *
 * Mode 1 uses a fixed list of allowed system calls.
 * Mode 2 allows user-defined system call filters in the form
 * of Berkeley Packet Filters/Linux Socket Filters.
 */

#include <linux/atomic.h>
#include <linux/audit.h>
#include <linux/compat.h>
#include <linux/nospec.h>
#include <linux/prctl.h>
#include <linux/sched.h>
#include <linux/seccomp.h>
#include <linux/slab.h>
#include <linux/syscalls.h>

#ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER
#include <asm/syscall.h>
#endif

#ifdef CONFIG_SECCOMP_FILTER
#include <linux/filter.h>
#include <linux/pid.h>
#include <linux/ptrace.h>
#include <linux/security.h>
#include <linux/tracehook.h>
#include <linux/uaccess.h>

/**
 * struct seccomp_filter - container for seccomp BPF programs
 *
 * @usage: reference count to manage the object lifetime.
 *	   get/put helpers should be used when accessing an instance
 *	   outside of a lifetime-guarded section. In general, this
 *	   is only needed for handling filters shared across tasks.
 * @prev: points to a previously installed, or inherited, filter
 * @prog: the BPF program to evaluate against each system call
 *
 * seccomp_filter objects are organized in a tree linked via the @prev
 * pointer. For any task, it appears to be a singly-linked list starting
 * with current->seccomp.filter, the most recently attached or inherited filter.
 * However, multiple filters may share a @prev node, by way of fork(), which
 * results in a unidirectional tree existing in memory. This is similar to
 * how namespaces work.
 *
 * seccomp_filter objects should never be modified after being attached
 * to a task_struct (other than @usage).
 */
struct seccomp_filter {
	atomic_t usage;
	struct seccomp_filter *prev;
	struct bpf_prog *prog;
};
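
/*
 * Illustrative example (not part of the code itself): if a task attaches
 * filter A and then forks, and parent and child each attach their own
 * filter (B and C respectively), memory holds a tree rooted at A:
 *
 *	parent: B -> A
 *	child:  C -> A
 *
 * Each task only ever walks its own singly-linked path, while A's
 * @usage count covers both paths.
 */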

/* Limit any path through the tree to 256KB worth of instructions. */
#define MAX_INSNS_PER_PATH ((1 << 18) / sizeof(struct sock_filter))
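/*
 * With an 8-byte struct sock_filter this works out to 32768 classic
 * BPF instructions across all stacked filters on any one path.
 */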

/*
 * Endianness is explicitly ignored and left for BPF program authors to manage
 * as per the specific architecture.
 */
static void populate_seccomp_data(struct seccomp_data *sd)
{
	struct task_struct *task = current;
	struct pt_regs *regs = task_pt_regs(task);
	unsigned long args[6];

	sd->nr = syscall_get_nr(task, regs);
	sd->arch = syscall_get_arch();
	syscall_get_arguments(task, regs, 0, 6, args);
	sd->args[0] = args[0];
	sd->args[1] = args[1];
	sd->args[2] = args[2];
	sd->args[3] = args[3];
	sd->args[4] = args[4];
	sd->args[5] = args[5];
	sd->instruction_pointer = KSTK_EIP(task);
}

/**
 * seccomp_check_filter - verify seccomp filter code
 * @filter: filter to verify
 * @flen: length of filter
 *
 * Takes a previously checked filter (by bpf_check_classic) and
 * redirects all filter code that loads struct sk_buff data
 * and related data through seccomp_bpf_load. It also
 * enforces length and alignment checking of those loads.
 *
 * Returns 0 if the rule set is legal or -EINVAL if not.
 */
static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
{
	int pc;

	for (pc = 0; pc < flen; pc++) {
		struct sock_filter *ftest = &filter[pc];
		u16 code = ftest->code;
		u32 k = ftest->k;

		switch (code) {
		case BPF_LD | BPF_W | BPF_ABS:
			ftest->code = BPF_LDX | BPF_W | BPF_ABS;
			/* 32-bit aligned and not out of bounds. */
			if (k >= sizeof(struct seccomp_data) || k & 3)
				return -EINVAL;
			continue;
		case BPF_LD | BPF_W | BPF_LEN:
			ftest->code = BPF_LD | BPF_IMM;
			ftest->k = sizeof(struct seccomp_data);
			continue;
		case BPF_LDX | BPF_W | BPF_LEN:
			ftest->code = BPF_LDX | BPF_IMM;
			ftest->k = sizeof(struct seccomp_data);
			continue;
		/* Explicitly include allowed calls. */
		case BPF_RET | BPF_K:
		case BPF_RET | BPF_A:
		case BPF_ALU | BPF_ADD | BPF_K:
		case BPF_ALU | BPF_ADD | BPF_X:
		case BPF_ALU | BPF_SUB | BPF_K:
		case BPF_ALU | BPF_SUB | BPF_X:
		case BPF_ALU | BPF_MUL | BPF_K:
		case BPF_ALU | BPF_MUL | BPF_X:
		case BPF_ALU | BPF_DIV | BPF_K:
		case BPF_ALU | BPF_DIV | BPF_X:
		case BPF_ALU | BPF_AND | BPF_K:
		case BPF_ALU | BPF_AND | BPF_X:
		case BPF_ALU | BPF_OR | BPF_K:
		case BPF_ALU | BPF_OR | BPF_X:
		case BPF_ALU | BPF_XOR | BPF_K:
		case BPF_ALU | BPF_XOR | BPF_X:
		case BPF_ALU | BPF_LSH | BPF_K:
		case BPF_ALU | BPF_LSH | BPF_X:
		case BPF_ALU | BPF_RSH | BPF_K:
		case BPF_ALU | BPF_RSH | BPF_X:
		case BPF_ALU | BPF_NEG:
		case BPF_LD | BPF_IMM:
		case BPF_LDX | BPF_IMM:
		case BPF_MISC | BPF_TAX:
		case BPF_MISC | BPF_TXA:
		case BPF_LD | BPF_MEM:
		case BPF_LDX | BPF_MEM:
		case BPF_ST:
		case BPF_STX:
		case BPF_JMP | BPF_JA:
		case BPF_JMP | BPF_JEQ | BPF_K:
		case BPF_JMP | BPF_JEQ | BPF_X:
		case BPF_JMP | BPF_JGE | BPF_K:
		case BPF_JMP | BPF_JGE | BPF_X:
		case BPF_JMP | BPF_JGT | BPF_K:
		case BPF_JMP | BPF_JGT | BPF_X:
		case BPF_JMP | BPF_JSET | BPF_K:
		case BPF_JMP | BPF_JSET | BPF_X:
			continue;
		default:
			return -EINVAL;
		}
	}
	return 0;
}
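
/*
 * Illustrative only (not part of the kernel build): a minimal userspace
 * filter that seccomp_check_filter() accepts.  The load of the syscall
 * number sits at offset 0 of struct seccomp_data, which is 4-byte
 * aligned and in bounds:
 *
 *	struct sock_filter insns[] = {
 *		BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
 *			 offsetof(struct seccomp_data, nr)),
 *		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
 *	};
 *
 * A load at offset 2 would fail the "k & 3" alignment test above and
 * the whole program would be rejected with -EINVAL.
 */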

/**
 * seccomp_run_filters - evaluates all seccomp filters against @sd
 * @sd: optional seccomp data to be passed to filters
 *
 * Returns valid seccomp BPF response codes.
 */
static u32 seccomp_run_filters(const struct seccomp_data *sd)
{
	struct seccomp_data sd_local;
	u32 ret = SECCOMP_RET_ALLOW;
	/* Make sure cross-thread synced filter points somewhere sane. */
	struct seccomp_filter *f =
			lockless_dereference(current->seccomp.filter);

	/* Ensure unexpected behavior doesn't result in failing open. */
	if (unlikely(WARN_ON(f == NULL)))
		return SECCOMP_RET_KILL;

	if (!sd) {
		populate_seccomp_data(&sd_local);
		sd = &sd_local;
	}

	/*
	 * All filters in the list are evaluated and the lowest BPF return
	 * value always takes priority (ignoring the DATA).
	 */
	for (; f; f = f->prev) {
		u32 cur_ret = BPF_PROG_RUN(f->prog, (void *)sd);

		if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION))
			ret = cur_ret;
	}
	return ret;
}
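
/*
 * Illustrative precedence example: if one attached filter returns
 * SECCOMP_RET_TRACE (0x7ff00000U) and an older one returns
 * SECCOMP_RET_ERRNO (0x00050000U), ERRNO wins because its action value
 * is lower, and it is ERRNO's DATA bits that get reported.
 * SECCOMP_RET_KILL (0x00000000U) beats everything; SECCOMP_RET_ALLOW
 * (0x7fff0000U) only survives if every filter returns it.
 */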
#endif /* CONFIG_SECCOMP_FILTER */

static inline bool seccomp_may_assign_mode(unsigned long seccomp_mode)
{
	assert_spin_locked(&current->sighand->siglock);

	if (current->seccomp.mode && current->seccomp.mode != seccomp_mode)
		return false;

	return true;
}

void __weak arch_seccomp_spec_mitigate(struct task_struct *task) { }

static inline void seccomp_assign_mode(struct task_struct *task,
				       unsigned long seccomp_mode,
				       unsigned long flags)
{
	assert_spin_locked(&task->sighand->siglock);

	task->seccomp.mode = seccomp_mode;
	/*
	 * Make sure TIF_SECCOMP cannot be set before the mode (and
	 * filter) is set.
	 */
	smp_mb__before_atomic();
	/* Assume default seccomp processes want spec flaw mitigation. */
	if ((flags & SECCOMP_FILTER_FLAG_SPEC_ALLOW) == 0)
		arch_seccomp_spec_mitigate(task);
	set_tsk_thread_flag(task, TIF_SECCOMP);
}

#ifdef CONFIG_SECCOMP_FILTER
/* Returns 1 if the parent is an ancestor of the child. */
static int is_ancestor(struct seccomp_filter *parent,
		       struct seccomp_filter *child)
{
	/* NULL is the root ancestor. */
	if (parent == NULL)
		return 1;
	for (; child; child = child->prev)
		if (child == parent)
			return 1;
	return 0;
}

/**
 * seccomp_can_sync_threads: checks if all threads can be synchronized
 *
 * Expects sighand and cred_guard_mutex locks to be held.
 *
 * Returns 0 on success, -ve on error, or the pid of a thread which was
 * either not in the correct seccomp mode or did not have an ancestral
 * seccomp filter.
 */
static inline pid_t seccomp_can_sync_threads(void)
{
	struct task_struct *thread, *caller;

	BUG_ON(!mutex_is_locked(&current->signal->cred_guard_mutex));
	assert_spin_locked(&current->sighand->siglock);

	/* Validate all threads being eligible for synchronization. */
	caller = current;
	for_each_thread(caller, thread) {
		pid_t failed;

		/* Skip current, since it is initiating the sync. */
		if (thread == caller)
			continue;

		if (thread->seccomp.mode == SECCOMP_MODE_DISABLED ||
		    (thread->seccomp.mode == SECCOMP_MODE_FILTER &&
		     is_ancestor(thread->seccomp.filter,
				 caller->seccomp.filter)))
			continue;

		/* Return the first thread that cannot be synchronized. */
		failed = task_pid_vnr(thread);
		/* If the pid cannot be resolved, then return -ESRCH. */
		if (unlikely(WARN_ON(failed == 0)))
			failed = -ESRCH;
		return failed;
	}

	return 0;
}

/**
 * seccomp_sync_threads: sets all threads to use current's filter
 *
 * Expects sighand and cred_guard_mutex locks to be held, and for
 * seccomp_can_sync_threads() to have returned success already
 * without dropping the locks.
 */
static inline void seccomp_sync_threads(unsigned long flags)
{
	struct task_struct *thread, *caller;

	BUG_ON(!mutex_is_locked(&current->signal->cred_guard_mutex));
	assert_spin_locked(&current->sighand->siglock);

	/* Synchronize all threads. */
	caller = current;
	for_each_thread(caller, thread) {
		/* Skip current, since it needs no changes. */
		if (thread == caller)
			continue;

		/* Get a task reference for the new leaf node. */
		get_seccomp_filter(caller);
		/*
		 * Drop the task reference to the shared ancestor since
		 * current's path will hold a reference. (This also
		 * allows a put before the assignment.)
		 */
		put_seccomp_filter(thread);
		smp_store_release(&thread->seccomp.filter,
				  caller->seccomp.filter);

		/*
		 * Don't let an unprivileged task work around
		 * the no_new_privs restriction by creating
		 * a thread that sets it up, enters seccomp,
		 * then dies.
		 */
		if (task_no_new_privs(caller))
			task_set_no_new_privs(thread);

		/*
		 * Opt the other thread into seccomp if needed.
		 * As threads are considered to be trust-realm
		 * equivalent (see ptrace_may_access), it is safe to
		 * allow one thread to transition the other.
		 */
		if (thread->seccomp.mode == SECCOMP_MODE_DISABLED)
			seccomp_assign_mode(thread, SECCOMP_MODE_FILTER,
					    flags);
	}
}

/**
 * seccomp_prepare_filter: Prepares a seccomp filter for use.
 * @fprog: BPF program to install
 *
 * Returns filter on success or an ERR_PTR on failure.
 */
static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
{
	struct seccomp_filter *sfilter;
	int ret;
	const bool save_orig = IS_ENABLED(CONFIG_CHECKPOINT_RESTORE);

	if (fprog->len == 0 || fprog->len > BPF_MAXINSNS)
		return ERR_PTR(-EINVAL);

	BUG_ON(INT_MAX / fprog->len < sizeof(struct sock_filter));

	/*
	 * Installing a seccomp filter requires that the task has
	 * CAP_SYS_ADMIN in its namespace or be running with no_new_privs.
	 * This avoids scenarios where unprivileged tasks can affect the
	 * behavior of privileged children.
	 */
	if (!task_no_new_privs(current) &&
	    security_capable_noaudit(current_cred(), current_user_ns(),
				     CAP_SYS_ADMIN) != 0)
		return ERR_PTR(-EACCES);

	/* Allocate a new seccomp_filter. */
	sfilter = kzalloc(sizeof(*sfilter), GFP_KERNEL | __GFP_NOWARN);
	if (!sfilter)
		return ERR_PTR(-ENOMEM);

	ret = bpf_prog_create_from_user(&sfilter->prog, fprog,
					seccomp_check_filter, save_orig);
	if (ret < 0) {
		kfree(sfilter);
		return ERR_PTR(ret);
	}

	atomic_set(&sfilter->usage, 1);

	return sfilter;
}

/**
 * seccomp_prepare_user_filter - prepares a user-supplied sock_fprog
 * @user_filter: pointer to the user data containing a sock_fprog.
 *
 * Returns the prepared filter on success, or an ERR_PTR on failure.
 */
static struct seccomp_filter *
seccomp_prepare_user_filter(const char __user *user_filter)
{
	struct sock_fprog fprog;
	struct seccomp_filter *filter = ERR_PTR(-EFAULT);

#ifdef CONFIG_COMPAT
	if (in_compat_syscall()) {
		struct compat_sock_fprog fprog32;

		if (copy_from_user(&fprog32, user_filter, sizeof(fprog32)))
			goto out;
		fprog.len = fprog32.len;
		fprog.filter = compat_ptr(fprog32.filter);
	} else /* falls through to the if below. */
#endif
	if (copy_from_user(&fprog, user_filter, sizeof(fprog)))
		goto out;
	filter = seccomp_prepare_filter(&fprog);
out:
	return filter;
}

/**
 * seccomp_attach_filter: validate and attach filter
 * @flags:  flags to change filter behavior
 * @filter: seccomp filter to add to the current process
 *
 * Caller must be holding current->sighand->siglock.
 *
 * Returns 0 on success, -ve on error.
 */
static long seccomp_attach_filter(unsigned int flags,
				  struct seccomp_filter *filter)
{
	unsigned long total_insns;
	struct seccomp_filter *walker;

	assert_spin_locked(&current->sighand->siglock);

	/* Validate resulting filter length. */
	total_insns = filter->prog->len;
	for (walker = current->seccomp.filter; walker; walker = walker->prev)
		total_insns += walker->prog->len + 4; /* 4 instr penalty */
	if (total_insns > MAX_INSNS_PER_PATH)
		return -ENOMEM;

	/* If thread sync has been requested, check that it is possible. */
	if (flags & SECCOMP_FILTER_FLAG_TSYNC) {
		int ret;

		ret = seccomp_can_sync_threads();
		if (ret)
			return ret;
	}

	/*
	 * If there is an existing filter, make it the prev and don't drop its
	 * task reference.
	 */
	filter->prev = current->seccomp.filter;
	current->seccomp.filter = filter;

	/* Now that the new filter is in place, synchronize to all threads. */
	if (flags & SECCOMP_FILTER_FLAG_TSYNC)
		seccomp_sync_threads(flags);

	return 0;
}
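
/*
 * Worked example of the accounting above: attaching a 10-instruction
 * filter on top of two existing 100-instruction filters costs
 * 10 + (100 + 4) + (100 + 4) = 218 instructions against
 * MAX_INSNS_PER_PATH.
 */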

void __get_seccomp_filter(struct seccomp_filter *filter)
{
	/* Reference count is bounded by the number of total processes. */
	atomic_inc(&filter->usage);
}

/* get_seccomp_filter - increments the reference count of the filter on @tsk */
void get_seccomp_filter(struct task_struct *tsk)
{
	struct seccomp_filter *orig = tsk->seccomp.filter;

	if (!orig)
		return;
	__get_seccomp_filter(orig);
}

static inline void seccomp_filter_free(struct seccomp_filter *filter)
{
	if (filter) {
		bpf_prog_destroy(filter->prog);
		kfree(filter);
	}
}

static void __put_seccomp_filter(struct seccomp_filter *orig)
{
	/* Clean up single-reference branches iteratively. */
	while (orig && atomic_dec_and_test(&orig->usage)) {
		struct seccomp_filter *freeme = orig;

		orig = orig->prev;
		seccomp_filter_free(freeme);
	}
}

/* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */
void put_seccomp_filter(struct task_struct *tsk)
{
	__put_seccomp_filter(tsk->seccomp.filter);
}

/**
 * seccomp_send_sigsys - signals the task to allow in-process syscall emulation
 * @syscall: syscall number to send to userland
 * @reason: filter-supplied reason code to send to userland (via si_errno)
 *
 * Forces a SIGSYS with a code of SYS_SECCOMP and related sigsys info.
 */
static void seccomp_send_sigsys(int syscall, int reason)
{
	struct siginfo info;

	memset(&info, 0, sizeof(info));
	info.si_signo = SIGSYS;
	info.si_code = SYS_SECCOMP;
	info.si_call_addr = (void __user *)KSTK_EIP(current);
	info.si_errno = reason;
	info.si_arch = syscall_get_arch();
	info.si_syscall = syscall;
	force_sig_info(SIGSYS, &info, current);
}
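
/*
 * Illustrative only: a userspace handler for the signal sent above can
 * be installed with SA_SIGINFO and read the fields populated here, e.g.
 *
 *	static void handler(int sig, siginfo_t *info, void *ucontext)
 *	{
 *		if (info->si_code == SYS_SECCOMP)
 *			emulate_syscall(info->si_syscall, ucontext);
 *	}
 *
 * where emulate_syscall() stands in for a hypothetical application
 * routine.
 */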
#endif /* CONFIG_SECCOMP_FILTER */

/*
 * Secure computing mode 1 allows only read/write/exit/sigreturn.
 * To be fully secure this must be combined with rlimit
 * to limit the stack allocations too.
 */
static const int mode1_syscalls[] = {
	__NR_seccomp_read, __NR_seccomp_write, __NR_seccomp_exit, __NR_seccomp_sigreturn,
	0, /* null terminated */
};

static void __secure_computing_strict(int this_syscall)
{
	const int *syscall_whitelist = mode1_syscalls;

#ifdef CONFIG_COMPAT
	if (in_compat_syscall())
		syscall_whitelist = get_compat_mode1_syscalls();
#endif
	do {
		if (*syscall_whitelist == this_syscall)
			return;
	} while (*++syscall_whitelist);

#ifdef SECCOMP_DEBUG
	dump_stack();
#endif
	audit_seccomp(this_syscall, SIGKILL, SECCOMP_RET_KILL);
	do_exit(SIGKILL);
}
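
/*
 * Illustrative only: strict mode is typically entered from userspace
 * with either of
 *
 *	prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT);
 *	syscall(__NR_seccomp, SECCOMP_SET_MODE_STRICT, 0, NULL);
 *
 * after which any syscall outside the list above is fatal.
 */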

#ifndef CONFIG_HAVE_ARCH_SECCOMP_FILTER
void secure_computing_strict(int this_syscall)
{
	int mode = current->seccomp.mode;

	if (IS_ENABLED(CONFIG_CHECKPOINT_RESTORE) &&
	    unlikely(current->ptrace & PT_SUSPEND_SECCOMP))
		return;

	if (mode == SECCOMP_MODE_DISABLED)
		return;
	else if (mode == SECCOMP_MODE_STRICT)
		__secure_computing_strict(this_syscall);
	else
		BUG();
}
#else

#ifdef CONFIG_SECCOMP_FILTER
static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
			    const bool recheck_after_trace)
{
	u32 filter_ret, action;
	int data;

	/*
	 * Make sure that any changes to mode from another thread have
	 * been seen after TIF_SECCOMP was seen.
	 */
	rmb();

	filter_ret = seccomp_run_filters(sd);
	data = filter_ret & SECCOMP_RET_DATA;
	action = filter_ret & SECCOMP_RET_ACTION;

	switch (action) {
	case SECCOMP_RET_ERRNO:
		/* Set low-order bits as an errno, capped at MAX_ERRNO. */
		if (data > MAX_ERRNO)
			data = MAX_ERRNO;
		syscall_set_return_value(current, task_pt_regs(current),
					 -data, 0);
		goto skip;

	case SECCOMP_RET_TRAP:
		/* Show the handler the original registers. */
		syscall_rollback(current, task_pt_regs(current));
		/* Let the filter pass back 16 bits of data. */
		seccomp_send_sigsys(this_syscall, data);
		goto skip;

	case SECCOMP_RET_TRACE:
		/* We've been put in this state by the ptracer already. */
		if (recheck_after_trace)
			return 0;

		/* ENOSYS these calls if there is no tracer attached. */
		if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) {
			syscall_set_return_value(current,
						 task_pt_regs(current),
						 -ENOSYS, 0);
			goto skip;
		}

		/* Allow the BPF to provide the event message. */
		ptrace_event(PTRACE_EVENT_SECCOMP, data);
		/*
		 * The delivery of a fatal signal during event
		 * notification may silently skip tracer notification,
		 * which could leave us with a potentially unmodified
		 * syscall that the tracer would have liked to have
		 * changed. Since the process is about to die, we just
		 * force the syscall to be skipped and let the signal
		 * kill the process and correctly handle any tracer exit
		 * notifications.
		 */
		if (fatal_signal_pending(current))
			goto skip;
		/* Check if the tracer forced the syscall to be skipped. */
		this_syscall = syscall_get_nr(current, task_pt_regs(current));
		if (this_syscall < 0)
			goto skip;

		/*
		 * Recheck the syscall, since it may have changed. This
		 * intentionally uses a NULL struct seccomp_data to force
		 * a reload of all registers. This does not goto skip since
		 * a skip would have already been reported.
		 */
		if (__seccomp_filter(this_syscall, NULL, true))
			return -1;

		return 0;

	case SECCOMP_RET_ALLOW:
		return 0;

	case SECCOMP_RET_KILL:
	default:
		audit_seccomp(this_syscall, SIGSYS, action);
		do_exit(SIGSYS);
	}

	unreachable();

skip:
	audit_seccomp(this_syscall, 0, action);
	return -1;
}
#else
static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
			    const bool recheck_after_trace)
{
	BUG();
}
#endif

int __secure_computing(const struct seccomp_data *sd)
{
	int mode = current->seccomp.mode;
	int this_syscall;

	if (IS_ENABLED(CONFIG_CHECKPOINT_RESTORE) &&
	    unlikely(current->ptrace & PT_SUSPEND_SECCOMP))
		return 0;

	this_syscall = sd ? sd->nr :
		syscall_get_nr(current, task_pt_regs(current));

	switch (mode) {
	case SECCOMP_MODE_STRICT:
		__secure_computing_strict(this_syscall); /* may call do_exit */
		return 0;
	case SECCOMP_MODE_FILTER:
		return __seccomp_filter(this_syscall, sd, false);
	default:
		BUG();
	}
}
#endif /* CONFIG_HAVE_ARCH_SECCOMP_FILTER */

long prctl_get_seccomp(void)
{
	return current->seccomp.mode;
}

/**
 * seccomp_set_mode_strict: internal function for setting strict seccomp
 *
 * Once current->seccomp.mode is non-zero, it may not be changed.
 *
 * Returns 0 on success or -EINVAL on failure.
 */
static long seccomp_set_mode_strict(void)
{
	const unsigned long seccomp_mode = SECCOMP_MODE_STRICT;
	long ret = -EINVAL;

	spin_lock_irq(&current->sighand->siglock);

	if (!seccomp_may_assign_mode(seccomp_mode))
		goto out;

#ifdef TIF_NOTSC
	disable_TSC();
#endif
	seccomp_assign_mode(current, seccomp_mode, 0);
	ret = 0;

out:
	spin_unlock_irq(&current->sighand->siglock);

	return ret;
}

#ifdef CONFIG_SECCOMP_FILTER
/**
 * seccomp_set_mode_filter: internal function for setting seccomp filter
 * @flags:  flags to change filter behavior
 * @filter: struct sock_fprog containing filter
 *
 * This function may be called repeatedly to install additional filters.
 * Every filter successfully installed will be evaluated (in reverse order)
 * for each system call the task makes.
 *
 * Once current->seccomp.mode is non-zero, it may not be changed.
 *
 * Returns 0 on success or -EINVAL on failure.
 */
static long seccomp_set_mode_filter(unsigned int flags,
				    const char __user *filter)
{
	const unsigned long seccomp_mode = SECCOMP_MODE_FILTER;
	struct seccomp_filter *prepared = NULL;
	long ret = -EINVAL;

	/* Validate flags. */
	if (flags & ~SECCOMP_FILTER_FLAG_MASK)
		return -EINVAL;

	/* Prepare the new filter before holding any locks. */
	prepared = seccomp_prepare_user_filter(filter);
	if (IS_ERR(prepared))
		return PTR_ERR(prepared);

	/*
	 * Make sure we cannot change seccomp or nnp state via TSYNC
	 * while another thread is in the middle of calling exec.
	 */
	if (flags & SECCOMP_FILTER_FLAG_TSYNC &&
	    mutex_lock_killable(&current->signal->cred_guard_mutex))
		goto out_free;

	spin_lock_irq(&current->sighand->siglock);

	if (!seccomp_may_assign_mode(seccomp_mode))
		goto out;

	ret = seccomp_attach_filter(flags, prepared);
	if (ret)
		goto out;
	/* Do not free the successfully attached filter. */
	prepared = NULL;

	seccomp_assign_mode(current, seccomp_mode, flags);
out:
	spin_unlock_irq(&current->sighand->siglock);
	if (flags & SECCOMP_FILTER_FLAG_TSYNC)
		mutex_unlock(&current->signal->cred_guard_mutex);
out_free:
	seccomp_filter_free(prepared);
	return ret;
}
#else
static inline long seccomp_set_mode_filter(unsigned int flags,
					   const char __user *filter)
{
	return -EINVAL;
}
#endif

/* Common entry point for both prctl and syscall. */
static long do_seccomp(unsigned int op, unsigned int flags,
		       const char __user *uargs)
{
	switch (op) {
	case SECCOMP_SET_MODE_STRICT:
		if (flags != 0 || uargs != NULL)
			return -EINVAL;
		return seccomp_set_mode_strict();
	case SECCOMP_SET_MODE_FILTER:
		return seccomp_set_mode_filter(flags, uargs);
	default:
		return -EINVAL;
	}
}

SYSCALL_DEFINE3(seccomp, unsigned int, op, unsigned int, flags,
		const char __user *, uargs)
{
	return do_seccomp(op, flags, uargs);
}
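
/*
 * Illustrative only: typical userspace use of this entry point, reusing
 * the insns[] sketch near seccomp_check_filter() above:
 *
 *	struct sock_fprog prog = {
 *		.len = ARRAY_SIZE(insns),
 *		.filter = insns,
 *	};
 *
 *	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0))
 *		err(1, "no_new_privs");
 *	if (syscall(__NR_seccomp, SECCOMP_SET_MODE_FILTER,
 *		    SECCOMP_FILTER_FLAG_TSYNC, &prog))
 *		err(1, "seccomp");
 *
 * Without no_new_privs, seccomp_prepare_filter() above demands
 * CAP_SYS_ADMIN in the current user namespace.
 */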

/**
 * prctl_set_seccomp: configures current->seccomp.mode
 * @seccomp_mode: requested mode to use
 * @filter: optional struct sock_fprog for use with SECCOMP_MODE_FILTER
 *
 * Returns 0 on success or -EINVAL on failure.
 */
long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter)
{
	unsigned int op;
	char __user *uargs;

	switch (seccomp_mode) {
	case SECCOMP_MODE_STRICT:
		op = SECCOMP_SET_MODE_STRICT;
		/*
		 * Setting strict mode through prctl always ignores @filter,
		 * so make sure it is always NULL here to pass the internal
		 * check in do_seccomp().
		 */
		uargs = NULL;
		break;
	case SECCOMP_MODE_FILTER:
		op = SECCOMP_SET_MODE_FILTER;
		uargs = filter;
		break;
	default:
		return -EINVAL;
	}

	/* The prctl interface doesn't have flags, so they are always zero. */
	return do_seccomp(op, 0, uargs);
}

#if defined(CONFIG_SECCOMP_FILTER) && defined(CONFIG_CHECKPOINT_RESTORE)
long seccomp_get_filter(struct task_struct *task, unsigned long filter_off,
			void __user *data)
{
	struct seccomp_filter *filter;
	struct sock_fprog_kern *fprog;
	long ret;
	unsigned long count = 0;

	if (!capable(CAP_SYS_ADMIN) ||
	    current->seccomp.mode != SECCOMP_MODE_DISABLED) {
		return -EACCES;
	}

	spin_lock_irq(&task->sighand->siglock);
	if (task->seccomp.mode != SECCOMP_MODE_FILTER) {
		ret = -EINVAL;
		goto out;
	}

	filter = task->seccomp.filter;
	while (filter) {
		filter = filter->prev;
		count++;
	}

	if (filter_off >= count) {
		ret = -ENOENT;
		goto out;
	}
	count -= filter_off;

	filter = task->seccomp.filter;
	while (filter && count > 1) {
		filter = filter->prev;
		count--;
	}

	if (WARN_ON(count != 1 || !filter)) {
		/* The filter tree shouldn't shrink while we're using it. */
		ret = -ENOENT;
		goto out;
	}

	fprog = filter->prog->orig_prog;
	if (!fprog) {
		/*
		 * This must be a new non-cBPF filter, since we save
		 * every cBPF filter's orig_prog above when
		 * CONFIG_CHECKPOINT_RESTORE is enabled.
		 */
		ret = -EMEDIUMTYPE;
		goto out;
	}

	ret = fprog->len;
	if (!data)
		goto out;

	__get_seccomp_filter(filter);
	spin_unlock_irq(&task->sighand->siglock);

	if (copy_to_user(data, fprog->filter, bpf_classic_proglen(fprog)))
		ret = -EFAULT;

	__put_seccomp_filter(filter);
	return ret;

out:
	spin_unlock_irq(&task->sighand->siglock);
	return ret;
}
#endif
927 |