blob: a301b365b8955bce3f518e051f098fcaed6ab4b6
1 | /* vi: set sw=4 ts=4: */ |
2 | /* Copyright 2005 Rob Landley <rob@landley.net> |
3 | * |
4 | * Switch from rootfs to another filesystem as the root of the mount tree. |
5 | * |
6 | * Licensed under GPLv2, see file LICENSE in this source tree. |
7 | */ |
8 | |
9 | //usage:#define switch_root_trivial_usage |
10 | //usage: "[-c /dev/console] NEW_ROOT NEW_INIT [ARGS]" |
11 | //usage:#define switch_root_full_usage "\n\n" |
12 | //usage: "Free initramfs and switch to another root fs:\n" |
13 | //usage: "chroot to NEW_ROOT, delete all in /, move NEW_ROOT to /,\n" |
14 | //usage: "execute NEW_INIT. PID must be 1. NEW_ROOT must be a mountpoint.\n" |
15 | //usage: "\n -c DEV Reopen stdio to DEV after switch" |
16 | |
17 | #include <sys/vfs.h> |
18 | #include <sys/mount.h> |
19 | #include "libbb.h" |
// Fallback definitions for constants that the system headers may not
// provide. Each one is defined only when no earlier header has done so.
#if !defined(RAMFS_MAGIC)
// statfs f_type value accepted as "ramfs" by the root filesystem check
# define RAMFS_MAGIC ((unsigned)0x858458f6)
#endif
#if !defined(TMPFS_MAGIC)
// statfs f_type value accepted as "tmpfs" by the root filesystem check
# define TMPFS_MAGIC ((unsigned)0x01021994)
#endif
#if !defined(MS_MOVE)
// mount() flag used below to move the new root on top of "/"
# define MS_MOVE 8192
#endif
30 | |
// Remove 'directory' and everything below it, staying on device 'rootdev'.
//
// directory: path to remove; may name a directory or any other file type.
// rootdev:   st_dev of the filesystem being emptied. Any path whose st_dev
//            differs (i.e. something mounted on top) is left untouched.
//
// The path is examined with lstat(), so a symlink is unlinked itself and
// never followed. Results of unlink()/rmdir() are not checked: removal is
// best-effort (rmdir() on a directory that still holds a foreign mount
// point simply fails).
static void delete_contents(const char *directory, dev_t rootdev)
{
	struct stat info;
	struct dirent *entry;
	DIR *handle;

	// Unreadable path: nothing we can do
	if (lstat(directory, &info) != 0)
		return;
	// Lives on another filesystem: do not touch it
	if (info.st_dev != rootdev)
		return;

	// Anything that is not a directory is removed directly
	if (!S_ISDIR(info.st_mode)) {
		unlink(directory);
		return;
	}

	handle = opendir(directory);
	if (handle == NULL)
		return;

	for (entry = readdir(handle); entry != NULL; entry = readdir(handle)) {
		char *child;

		// "." and ".." are not real children
		if (DOT_OR_DOTDOT(entry->d_name))
			continue;

		// Build "directory/name", wipe it, release the path string
		child = concat_path_file(directory, entry->d_name);
		delete_contents(child, rootdev);
		free(child);
	}
	closedir(handle);

	// Children are gone (as far as possible), remove the directory itself
	rmdir(directory);
}
68 | |
69 | int switch_root_main(int argc, char **argv) MAIN_EXTERNALLY_VISIBLE; |
70 | int switch_root_main(int argc UNUSED_PARAM, char **argv) |
71 | { |
72 | char *newroot, *console = NULL; |
73 | struct stat st; |
74 | struct statfs stfs; |
75 | dev_t rootdev; |
76 | |
77 | // Parse args (-c console) |
78 | opt_complementary = "-2"; // minimum 2 params |
79 | getopt32(argv, "+c:", &console); // '+': stop at first non-option |
80 | argv += optind; |
81 | newroot = *argv++; |
82 | |
83 | // Change to new root directory and verify it's a different fs |
84 | xchdir(newroot); |
85 | xstat("/", &st); |
86 | rootdev = st.st_dev; |
87 | xstat(".", &st); |
88 | if (st.st_dev == rootdev || getpid() != 1) { |
89 | // Show usage, it says new root must be a mountpoint |
90 | // and we must be PID 1 |
91 | bb_show_usage(); |
92 | } |
93 | |
94 | // Additional sanity checks: we're about to rm -rf /, so be REALLY SURE |
95 | // we mean it. I could make this a CONFIG option, but I would get email |
96 | // from all the people who WILL destroy their filesystems. |
97 | if (stat("/init", &st) != 0 || !S_ISREG(st.st_mode)) { |
98 | bb_error_msg_and_die("/init is not a regular file"); |
99 | } |
100 | statfs("/", &stfs); // this never fails |
101 | if ((unsigned)stfs.f_type != RAMFS_MAGIC |
102 | && (unsigned)stfs.f_type != TMPFS_MAGIC |
103 | ) { |
104 | bb_error_msg_and_die("root filesystem is not ramfs/tmpfs"); |
105 | } |
106 | |
107 | // Zap everything out of rootdev |
108 | delete_contents("/", rootdev); |
109 | |
110 | // Overmount / with newdir and chroot into it |
111 | if (mount(".", "/", NULL, MS_MOVE, NULL)) { |
112 | // For example, fails when newroot is not a mountpoint |
113 | bb_perror_msg_and_die("error moving root"); |
114 | } |
115 | xchroot("."); |
116 | // The chdir is needed to recalculate "." and ".." links |
117 | /*xchdir("/"); - done in xchroot */ |
118 | |
119 | // If a new console specified, redirect stdin/stdout/stderr to it |
120 | if (console) { |
121 | close(0); |
122 | xopen(console, O_RDWR); |
123 | xdup2(0, 1); |
124 | xdup2(0, 2); |
125 | } |
126 | |
127 | // Exec real init |
128 | execv(argv[0], argv); |
129 | bb_perror_msg_and_die("can't execute '%s'", argv[0]); |
130 | } |
131 | |
132 | /* |
133 | From: Rob Landley <rob@landley.net> |
134 | Date: Tue, Jun 16, 2009 at 7:47 PM |
135 | Subject: Re: switch_root... |
136 | |
137 | ... |
138 | ... |
139 | ... |
140 | |
141 | If you're _not_ running out of init_ramfs (if for example you're using initrd |
142 | instead), you probably shouldn't use switch_root because it's the wrong tool. |
143 | |
144 | Basically what the sucker does is something like the following shell script: |
145 | |
146 | find / -xdev | xargs rm -rf |
147 | cd "$1" |
148 | shift |
149 | mount --move . / |
150 | exec chroot . "$@" |
151 | |
152 | There are a couple reasons that won't work as a shell script: |
153 | |
154 | 1) If you delete the commands out of your $PATH, your shell scripts can't run |
155 | more commands, but you can't start using dynamically linked _new_ commands |
156 | until after you do the chroot because the path to the dynamic linker is wrong. |
157 | So there's a step that needs to be sort of atomic but can't be as a shell |
158 | script. (You can work around this with static linking or very carefully laid |
159 | out paths and sequencing, but it's brittle, ugly, and non-obvious.) |
160 | |
161 | 2) The "find | rm" bit will actually delete everything because the mount points |
162 | still show up (even if their contents don't), and rm -rf will then happily zap |
163 | that. So the first line is an oversimplification of what you need to do _not_ |
164 | to descend into other filesystems and delete their contents. |
165 | |
166 | The reason we do this is to free up memory, by the way. Since initramfs is a |
167 | ramfs, deleting its contents frees up the memory it uses. (We leave it with |
168 | one remaining dentry for the new mount point, but that's ok.) |
169 | |
170 | Note that you cannot ever umount rootfs, for approximately the same reason you |
171 | can't kill PID 1. The kernel tracks mount points as a doubly linked list, and |
172 | the pointer to the start/end of that list always points to an entry that's |
173 | known to be there (rootfs), so it never has to worry about moving that pointer |
174 | and it never has to worry about the list being empty. (Back around 2.6.13 |
175 | there _was_ a bug that let you umount rootfs, and the system locked hard the |
176 | instant you did so endlessly looping to find the end of the mount list and |
177 | never stopping. They fixed it.) |
178 | |
179 | Oh, and the reason we mount --move _and_ do the chroot is due to the way "/" |
180 | works. Each process has two special symlinks, ".", and "/". Each of them |
181 | points to the dentry of a directory, and give you a location paths can start |
182 | from. (Historically ".." was also special, because you could enter a |
183 | directory via a symlink so backing out to the directory you came from doesn't |
184 | necessarily mean the one physically above where "." points to. These days I |
185 | think it's just handed off to the filesystem.) |
186 | |
187 | Anyway, path resolution starts with "." or "/" (although the "./" at the start |
188 | of the path may be implicit), meaning it's relative to one of those two |
189 | directories. Your current directory, and your current root directory. The |
190 | chdir() syscall changes where "." points to, and the chroot() syscall changes |
191 | where "/" points to. (Again, both are per-process which is why chroot only |
192 | affects your current process and its child processes.) |
193 | |
194 | Note that chroot() does _not_ change where "." points to, and back before they |
195 | put crazy security checks into the kernel your current directory could be |
196 | somewhere you could no longer access after the chroot. (The command line |
197 | chroot does a cd as well, the chroot _syscall_ is what I'm talking about.) |
198 | |
199 | The reason mounting something new over / has no obvious effect is the same |
200 | reason mounting something over your current directory has no obvious effect: |
201 | the . and / links aren't recalculated after a mount, so they still point to |
202 | the same dentry they did before, even if that dentry is no longer accessible |
203 | by other means. Note that "cd ." is a NOP, and "chroot /" is a nop; both look |
204 | up the cached dentry and set it right back. They don't re-parse any paths, |
205 | because they're what all paths your process uses would be relative to. |
206 | |
207 | That's why the careful sequencing above: we cd into the new mount point before |
208 | we do the mount --move. Moving the mount point would otherwise make it |
209 | totally inaccessible to us because cd-ing to the old path wouldn't give it to |
210 | us anymore, and cd "/" just gives us the cached dentry from when the process |
211 | was created (in this case the old initramfs one). But the "." symlink gives |
212 | us the dentry of the filesystem we just moved, so we can then "chroot ." to |
213 | copy that dentry to "/" and get the new filesystem. If we _didn't_ save that |
214 | dentry in "." we couldn't get it back after the mount --move. |
215 | |
216 | (Yes, this is all screwy and I had to email questions to Linus Torvalds to get |
217 | it straight myself. I keep meaning to write up a "how mount actually works" |
218 | document someday...) |
219 | */ |
220 |