/* vi: set sw=4 ts=4: */
/*
 * wget - retrieve a file using HTTP or FTP
 *
 * Chip Rosenthal Covad Communications <chip@laserlink.net>
 * Licensed under GPLv2, see file LICENSE in this source tree.
 *
 * Copyright (C) 2010 Bradley M. Kuhn <bkuhn@ebb.org>
 * Kuhn's copyrights are licensed GPLv2-or-later.  File as a whole remains GPLv2.
 *
 * (Text recovered from a web repository view; path: networking/wget.c,
 * blob e15f68ddd01bb2554dd0dc147417d5d232cf0730.)
 */
11
//usage:#define wget_trivial_usage
//usage: IF_FEATURE_WGET_LONG_OPTIONS(
//usage: "[-c|--continue] [-s|--spider] [-q|--quiet] [-O|--output-document FILE]\n"
//usage: " [--header 'header: value'] [-Y|--proxy on/off] [-P DIR]\n"
/* Since we ignore these opts, we don't show them in --help */
/* //usage: " [--no-check-certificate] [--no-cache]" */
//usage: " [-U|--user-agent AGENT]" IF_FEATURE_WGET_TIMEOUT(" [-T SEC]") " URL..."
//usage: )
//usage: IF_NOT_FEATURE_WGET_LONG_OPTIONS(
//usage: "[-csq] [-O FILE] [-Y on/off] [-P DIR] [-U AGENT]"
//usage: IF_FEATURE_WGET_TIMEOUT(" [-T SEC]") " URL..."
//usage: )
//usage:#define wget_full_usage "\n\n"
//usage: "Retrieve files via HTTP or FTP\n"
//usage: "\n -s Spider mode - only check file existence"
//usage: "\n -c Continue retrieval of aborted transfer"
//usage: "\n -q Quiet"
//usage: "\n -P DIR Save to DIR (default .)"
//usage: IF_FEATURE_WGET_TIMEOUT(
//usage: "\n -T SEC Network read timeout is SEC seconds"
//usage: )
//usage: "\n -O FILE Save to FILE ('-' for stdout)"
//usage: "\n -U STR Use STR for User-Agent header"
//usage: "\n -Y Use proxy ('on' or 'off')"
36
37#include "libbb.h"
38
/*
 * Protocol tracing hook.  log_io() is called with every line sent to or
 * received from the server.  It is compiled out by default; change the
 * condition below to 0 to route the calls to bb_error_msg().
 */
#if 1
# define log_io(...) ((void)0)
#else
# define log_io(...) bb_error_msg(__VA_ARGS__)
#endif
44
45
46struct host_info {
47 char *allocated;
48 const char *path;
49 const char *user;
50 char *host;
51 int port;
52 smallint is_ftp;
53};
54
55
56/* Globals */
57struct globals {
58 off_t content_len; /* Content-length of the file */
59 off_t beg_range; /* Range at which continue begins */
60#if ENABLE_FEATURE_WGET_STATUSBAR
61 off_t transferred; /* Number of bytes transferred so far */
62 const char *curfile; /* Name of current file being transferred */
63 bb_progress_t pmt;
64#endif
65 char *dir_prefix;
66#if ENABLE_FEATURE_WGET_LONG_OPTIONS
67 char *post_data;
68 char *extra_headers;
69#endif
70 char *fname_out; /* where to direct output (-O) */
71 const char *proxy_flag; /* Use proxies if env vars are set */
72 const char *user_agent; /* "User-Agent" header field */
73#if ENABLE_FEATURE_WGET_TIMEOUT
74 unsigned timeout_seconds;
75 bool connecting;
76#endif
77 int output_fd;
78 int o_flags;
79 smallint chunked; /* chunked transfer encoding */
80 smallint got_clen; /* got content-length: from server */
81 /* Local downloads do benefit from big buffer.
82 * With 512 byte buffer, it was measured to be
83 * an order of magnitude slower than with big one.
84 */
85 uint64_t just_to_align_next_member;
86 char wget_buf[CONFIG_FEATURE_COPYBUF_KB*1024];
87} FIX_ALIASING;
88#define G (*ptr_to_globals)
89#define INIT_G() do { \
90 SET_PTR_TO_GLOBALS(xzalloc(sizeof(G))); \
91} while (0)
92#define FINI_G() do { \
93 FREE_PTR_TO_GLOBALS(); \
94} while (0)
95
96
97/* Must match option string! */
98enum {
99 WGET_OPT_CONTINUE = (1 << 0),
100 WGET_OPT_SPIDER = (1 << 1),
101 WGET_OPT_QUIET = (1 << 2),
102 WGET_OPT_OUTNAME = (1 << 3),
103 WGET_OPT_PREFIX = (1 << 4),
104 WGET_OPT_PROXY = (1 << 5),
105 WGET_OPT_USER_AGENT = (1 << 6),
106 WGET_OPT_NETWORK_READ_TIMEOUT = (1 << 7),
107 WGET_OPT_RETRIES = (1 << 8),
108 WGET_OPT_PASSIVE = (1 << 9),
109 WGET_OPT_HEADER = (1 << 10) * ENABLE_FEATURE_WGET_LONG_OPTIONS,
110 WGET_OPT_POST_DATA = (1 << 11) * ENABLE_FEATURE_WGET_LONG_OPTIONS,
111};
112
/* Values accepted by progress_meter() */
enum {
	PROGRESS_START = -1,  /* set up the bar, then draw it */
	PROGRESS_END,         /* == 0: draw the bar a last time and tear it down */
	PROGRESS_BUMP,        /* == 1: just redraw */
};
118#if ENABLE_FEATURE_WGET_STATUSBAR
119static void progress_meter(int flag)
120{
121 if (option_mask32 & WGET_OPT_QUIET)
122 return;
123
124 if (flag == PROGRESS_START)
125 bb_progress_init(&G.pmt, G.curfile);
126
127 bb_progress_update(&G.pmt,
128 G.beg_range,
129 G.transferred,
130 (G.chunked || !G.got_clen) ? 0 : G.beg_range + G.transferred + G.content_len
131 );
132
133 if (flag == PROGRESS_END) {
134 bb_progress_free(&G.pmt);
135 bb_putchar_stderr('\n');
136 G.transferred = 0;
137 }
138}
139#else
140static ALWAYS_INLINE void progress_meter(int flag UNUSED_PARAM) { }
141#endif
142
143
/* IPv6 knows scoped address types i.e. link and site local addresses. Link
 * local addresses can have a scope identifier to specify the
 * interface/link an address is valid on (e.g. fe80::1%eth0). This scope
 * identifier is only valid on a single node.
 *
 * RFC 4007 says that the scope identifier MUST NOT be sent across the wire,
 * unless all nodes agree on the semantic. Apache e.g. regards zone identifiers
 * in the Host header as invalid requests, see
 * https://issues.apache.org/bugzilla/show_bug.cgi?id=35122
 *
 * This function removes "%zone" from a bracketed address in place:
 * "[fe80::1%eth0]:80" becomes "[fe80::1]:80". Anything that is not of the
 * form "[...%...]" or "[...%...]:..." is left untouched.
 */
static void strip_ipv6_scope_id(char *host)
{
	char *scope, *cp;

	/* bbox wget actually handles IPv6 addresses without [], like
	 * wget "http://::1/xxx", but this is not standard.
	 * To save code, _here_ we do not support it. */

	if (host[0] != '[')
		return; /* not IPv6 */

	scope = strchr(host, '%');
	if (!scope)
		return;

	/* Remove the IPv6 zone identifier from the host address */
	cp = strchr(host, ']');
	if (!cp || (cp[1] != ':' && cp[1] != '\0')) {
		/* malformed address (not "[xx]:nn" or "[xx]") */
		return;
	}

	/* The '%' must sit inside the brackets. For input such as
	 * "[::1]:80%" it comes after ']': there is no zone to strip, and
	 * copying the tail "forward" onto itself would never hit the
	 * terminating NUL and would run off the end of the buffer.
	 */
	if (scope > cp)
		return;

	/* cp points to "]...", scope points to "%eth0]...".
	 * Slide the tail (including its NUL) down over the zone id;
	 * memmove is safe for the overlapping regions.
	 */
	memmove(scope, cp, strlen(cp) + 1);
}
179
#if ENABLE_FEATURE_WGET_AUTHENTICATION
/*
 * Base64-encode character string.
 *
 * The result is written to, and returned as, G.wget_buf, so it is only
 * valid until the buffer is used again. Over-long input is silently
 * truncated so that the encoded form fits into the buffer.
 */
static char *base64enc(const char *str)
{
	/* paranoia: largest input whose encoding still fits, with some slack */
	const size_t limit = sizeof(G.wget_buf)/4*3 - 10;
	unsigned len;

	len = strlen(str);
	if (len > limit)
		len = limit;

	bb_uuencode(G.wget_buf, str, len, bb_uuenc_tbl_base64);
	return G.wget_buf;
}
#endif
191
/*
 * Cut s off (in place) at its first byte below ' ' - that is, at the first
 * control character or at the existing terminator - and return s.
 * Bytes are examined as unsigned, so values >= 0x80 are kept.
 * Used before echoing server-supplied text in messages.
 */
static char* sanitize_string(char *s)
{
	unsigned char *end;

	for (end = (unsigned char *) s; *end >= ' '; end++)
		continue;
	*end = '\0';

	return s;
}
200
#if ENABLE_FEATURE_WGET_TIMEOUT
/*
 * SIGALRM handler. The alarm is armed by open_socket(); it only matters
 * while the connect is still in progress (G.connecting set), otherwise
 * the signal is ignored.
 */
static void alarm_handler(int sig UNUSED_PARAM)
{
	if (!G.connecting)
		return;

	/* This is theoretically unsafe (uses stdio and malloc in signal handler) */
	bb_error_msg_and_die("download timed out");
}
#endif
209
210static FILE *open_socket(len_and_sockaddr *lsa)
211{
212 int fd;
213 FILE *fp;
214
215 IF_FEATURE_WGET_TIMEOUT(alarm(G.timeout_seconds); G.connecting = 1;)
216 fd = xconnect_stream(lsa);
217 IF_FEATURE_WGET_TIMEOUT(G.connecting = 0;)
218
219 /* glibc 2.4 seems to try seeking on it - ??! */
220 /* hopefully it understands what ESPIPE means... */
221 fp = fdopen(fd, "r+");
222 if (fp == NULL)
223 bb_perror_msg_and_die("%s", bb_msg_memory_exhausted);
224
225 return fp;
226}
227
228/* Returns '\n' if it was seen, else '\0'. Trims at first '\r' or '\n' */
229/* FIXME: does not respect FEATURE_WGET_TIMEOUT and -T N: */
230static char fgets_and_trim(FILE *fp)
231{
232 char c;
233 char *buf_ptr;
234
235 if (fgets(G.wget_buf, sizeof(G.wget_buf) - 1, fp) == NULL)
236 bb_perror_msg_and_die("error getting response");
237
238 buf_ptr = strchrnul(G.wget_buf, '\n');
239 c = *buf_ptr;
240 *buf_ptr = '\0';
241 buf_ptr = strchrnul(G.wget_buf, '\r');
242 *buf_ptr = '\0';
243
244 log_io("< %s", G.wget_buf);
245
246 return c;
247}
248
249static int ftpcmd(const char *s1, const char *s2, FILE *fp)
250{
251 int result;
252 if (s1) {
253 if (!s2)
254 s2 = "";
255 fprintf(fp, "%s%s\r\n", s1, s2);
256 fflush(fp);
257 log_io("> %s%s", s1, s2);
258 }
259
260 do {
261 fgets_and_trim(fp);
262 } while (!isdigit(G.wget_buf[0]) || G.wget_buf[3] != ' ');
263
264 G.wget_buf[3] = '\0';
265 result = xatoi_positive(G.wget_buf);
266 G.wget_buf[3] = ' ';
267 return result;
268}
269
270static void parse_url(const char *src_url, struct host_info *h)
271{
272 char *url, *p, *sp;
273
274 free(h->allocated);
275 h->allocated = url = xstrdup(src_url);
276
277 if (strncmp(url, "ftp://", 6) == 0) {
278 h->port = bb_lookup_port("ftp", "tcp", 21);
279 h->host = url + 6;
280 h->is_ftp = 1;
281 } else
282 if (strncmp(url, "http://", 7) == 0) {
283 h->host = url + 7;
284 http:
285 h->port = bb_lookup_port("http", "tcp", 80);
286 h->is_ftp = 0;
287 } else
288 if (!strstr(url, "//")) {
289 // GNU wget is user-friendly and falls back to http://
290 h->host = url;
291 goto http;
292 } else
293 bb_error_msg_and_die("not an http or ftp url: %s", sanitize_string(url));
294
295 // FYI:
296 // "Real" wget 'http://busybox.net?var=a/b' sends this request:
297 // 'GET /?var=a/b HTTP 1.0'
298 // and saves 'index.html?var=a%2Fb' (we save 'b')
299 // wget 'http://busybox.net?login=john@doe':
300 // request: 'GET /?login=john@doe HTTP/1.0'
301 // saves: 'index.html?login=john@doe' (we save '?login=john@doe')
302 // wget 'http://busybox.net#test/test':
303 // request: 'GET / HTTP/1.0'
304 // saves: 'index.html' (we save 'test')
305 //
306 // We also don't add unique .N suffix if file exists...
307 sp = strchr(h->host, '/');
308 p = strchr(h->host, '?'); if (!sp || (p && sp > p)) sp = p;
309 p = strchr(h->host, '#'); if (!sp || (p && sp > p)) sp = p;
310 if (!sp) {
311 h->path = "";
312 } else if (*sp == '/') {
313 *sp = '\0';
314 h->path = sp + 1;
315 } else { // '#' or '?'
316 // http://busybox.net?login=john@doe is a valid URL
317 // memmove converts to:
318 // http:/busybox.nett?login=john@doe...
319 memmove(h->host - 1, h->host, sp - h->host);
320 h->host--;
321 sp[-1] = '\0';
322 h->path = sp;
323 }
324
325 // We used to set h->user to NULL here, but this interferes
326 // with handling of code 302 ("object was moved")
327
328 sp = strrchr(h->host, '@');
329 if (sp != NULL) {
330 // URL-decode "user:password" string before base64-encoding:
331 // wget http://test:my%20pass@example.com should send
332 // Authorization: Basic dGVzdDpteSBwYXNz
333 // which decodes to "test:my pass".
334 // Standard wget and curl do this too.
335 *sp = '\0';
336 h->user = percent_decode_in_place(h->host, /*strict:*/ 0);
337 h->host = sp + 1;
338 }
339
340 sp = h->host;
341}
342
343static char *gethdr(FILE *fp)
344{
345 char *s, *hdrval;
346 int c;
347
348 /* retrieve header line */
349 c = fgets_and_trim(fp);
350
351 /* end of the headers? */
352 if (G.wget_buf[0] == '\0')
353 return NULL;
354
355 /* convert the header name to lower case */
356 for (s = G.wget_buf; isalnum(*s) || *s == '-' || *s == '.' || *s == '_'; ++s) {
357 /*
358 * No-op for 20-3f and 60-7f. "0-9a-z-." are in these ranges.
359 * 40-5f range ("@A-Z[\]^_") maps to 60-7f.
360 * "A-Z" maps to "a-z".
361 * "@[\]" can't occur in header names.
362 * "^_" maps to "~,DEL" (which is wrong).
363 * "^" was never seen yet, "_" was seen from web.archive.org
364 * (x-archive-orig-x_commoncrawl_Signature: HEXSTRING).
365 */
366 *s |= 0x20;
367 }
368
369 /* verify we are at the end of the header name */
370 if (*s != ':')
371 bb_error_msg_and_die("bad header line: %s", sanitize_string(G.wget_buf));
372
373 /* locate the start of the header value */
374 *s++ = '\0';
375 hdrval = skip_whitespace(s);
376
377 if (c != '\n') {
378 /* Rats! The buffer isn't big enough to hold the entire header value */
379 while (c = getc(fp), c != EOF && c != '\n')
380 continue;
381 }
382
383 return hdrval;
384}
385
386static void reset_beg_range_to_zero(void)
387{
388 bb_error_msg("restart failed");
389 G.beg_range = 0;
390 xlseek(G.output_fd, 0, SEEK_SET);
391 /* Done at the end instead: */
392 /* ftruncate(G.output_fd, 0); */
393}
394
395static FILE* prepare_ftp_session(FILE **dfpp, struct host_info *target, len_and_sockaddr *lsa)
396{
397 FILE *sfp;
398 char *str;
399 int port;
400
401 if (!target->user)
402 target->user = xstrdup("anonymous:busybox@");
403
404 sfp = open_socket(lsa);
405 if (ftpcmd(NULL, NULL, sfp) != 220)
406 bb_error_msg_and_die("%s", sanitize_string(G.wget_buf + 4));
407
408 /*
409 * Splitting username:password pair,
410 * trying to log in
411 */
412 str = strchr(target->user, ':');
413 if (str)
414 *str++ = '\0';
415 switch (ftpcmd("USER ", target->user, sfp)) {
416 case 230:
417 break;
418 case 331:
419 if (ftpcmd("PASS ", str, sfp) == 230)
420 break;
421 /* fall through (failed login) */
422 default:
423 bb_error_msg_and_die("ftp login: %s", sanitize_string(G.wget_buf + 4));
424 }
425
426 ftpcmd("TYPE I", NULL, sfp);
427
428 /*
429 * Querying file size
430 */
431 if (ftpcmd("SIZE ", target->path, sfp) == 213) {
432 G.content_len = BB_STRTOOFF(G.wget_buf + 4, NULL, 10);
433 if (G.content_len < 0 || errno) {
434 bb_error_msg_and_die("SIZE value is garbage");
435 }
436 G.got_clen = 1;
437 }
438
439 /*
440 * Entering passive mode
441 */
442 if (ftpcmd("PASV", NULL, sfp) != 227) {
443 pasv_error:
444 bb_error_msg_and_die("bad response to %s: %s", "PASV", sanitize_string(G.wget_buf));
445 }
446 // Response is "227 garbageN1,N2,N3,N4,P1,P2[)garbage]
447 // Server's IP is N1.N2.N3.N4 (we ignore it)
448 // Server's port for data connection is P1*256+P2
449 str = strrchr(G.wget_buf, ')');
450 if (str) str[0] = '\0';
451 str = strrchr(G.wget_buf, ',');
452 if (!str) goto pasv_error;
453 port = xatou_range(str+1, 0, 255);
454 *str = '\0';
455 str = strrchr(G.wget_buf, ',');
456 if (!str) goto pasv_error;
457 port += xatou_range(str+1, 0, 255) * 256;
458 set_nport(&lsa->u.sa, htons(port));
459
460 *dfpp = open_socket(lsa);
461
462 if (G.beg_range != 0) {
463 sprintf(G.wget_buf, "REST %"OFF_FMT"u", G.beg_range);
464 if (ftpcmd(G.wget_buf, NULL, sfp) == 350)
465 G.content_len -= G.beg_range;
466 else
467 reset_beg_range_to_zero();
468 }
469
470 if (ftpcmd("RETR ", target->path, sfp) > 150)
471 bb_error_msg_and_die("bad response to %s: %s", "RETR", sanitize_string(G.wget_buf));
472
473 return sfp;
474}
475
476static void NOINLINE retrieve_file_data(FILE *dfp)
477{
478#if ENABLE_FEATURE_WGET_STATUSBAR || ENABLE_FEATURE_WGET_TIMEOUT
479# if ENABLE_FEATURE_WGET_TIMEOUT
480 unsigned second_cnt = G.timeout_seconds;
481# endif
482 struct pollfd polldata;
483
484 polldata.fd = fileno(dfp);
485 polldata.events = POLLIN | POLLPRI;
486#endif
487 progress_meter(PROGRESS_START);
488
489 if (G.chunked)
490 goto get_clen;
491
492 /* Loops only if chunked */
493 while (1) {
494
495#if ENABLE_FEATURE_WGET_STATUSBAR || ENABLE_FEATURE_WGET_TIMEOUT
496 /* Must use nonblocking I/O, otherwise fread will loop
497 * and *block* until it reads full buffer,
498 * which messes up progress bar and/or timeout logic.
499 * Because of nonblocking I/O, we need to dance
500 * very carefully around EAGAIN. See explanation at
501 * clearerr() calls.
502 */
503 ndelay_on(polldata.fd);
504#endif
505 while (1) {
506 int n;
507 unsigned rdsz;
508
509#if ENABLE_FEATURE_WGET_STATUSBAR || ENABLE_FEATURE_WGET_TIMEOUT
510 /* fread internally uses read loop, which in our case
511 * is usually exited when we get EAGAIN.
512 * In this case, libc sets error marker on the stream.
513 * Need to clear it before next fread to avoid possible
514 * rare false positive ferror below. Rare because usually
515 * fread gets more than zero bytes, and we don't fall
516 * into if (n <= 0) ...
517 */
518 clearerr(dfp);
519#endif
520 errno = 0;
521 rdsz = sizeof(G.wget_buf);
522 if (G.got_clen) {
523 if (G.content_len < (off_t)sizeof(G.wget_buf)) {
524 if ((int)G.content_len <= 0)
525 break;
526 rdsz = (unsigned)G.content_len;
527 }
528 }
529 n = fread(G.wget_buf, 1, rdsz, dfp);
530
531 if (n > 0) {
532 xwrite(G.output_fd, G.wget_buf, n);
533#if ENABLE_FEATURE_WGET_STATUSBAR
534 G.transferred += n;
535#endif
536 if (G.got_clen) {
537 G.content_len -= n;
538 if (G.content_len == 0)
539 break;
540 }
541#if ENABLE_FEATURE_WGET_TIMEOUT
542 second_cnt = G.timeout_seconds;
543#endif
544 continue;
545 }
546
547 /* n <= 0.
548 * man fread:
549 * If error occurs, or EOF is reached, the return value
550 * is a short item count (or zero).
551 * fread does not distinguish between EOF and error.
552 */
553 if (errno != EAGAIN) {
554 if (ferror(dfp)) {
555 progress_meter(PROGRESS_END);
556 bb_perror_msg_and_die(bb_msg_read_error);
557 }
558 break; /* EOF, not error */
559 }
560
561#if ENABLE_FEATURE_WGET_STATUSBAR || ENABLE_FEATURE_WGET_TIMEOUT
562 /* It was EAGAIN. There is no data. Wait up to one second
563 * then abort if timed out, or update the bar and try reading again.
564 */
565 if (safe_poll(&polldata, 1, 1000) == 0) {
566# if ENABLE_FEATURE_WGET_TIMEOUT
567 if (second_cnt != 0 && --second_cnt == 0) {
568 progress_meter(PROGRESS_END);
569 bb_error_msg_and_die("download timed out");
570 }
571# endif
572 /* We used to loop back to poll here,
573 * but there is no great harm in letting fread
574 * to try reading anyway.
575 */
576 }
577 /* Need to do it _every_ second for "stalled" indicator
578 * to be shown properly.
579 */
580 progress_meter(PROGRESS_BUMP);
581#endif
582 } /* while (reading data) */
583
584#if ENABLE_FEATURE_WGET_STATUSBAR || ENABLE_FEATURE_WGET_TIMEOUT
585 clearerr(dfp);
586 ndelay_off(polldata.fd); /* else fgets can get very unhappy */
587#endif
588 if (!G.chunked)
589 break;
590
591 fgets_and_trim(dfp); /* Eat empty line */
592 get_clen:
593 fgets_and_trim(dfp);
594 G.content_len = STRTOOFF(G.wget_buf, NULL, 16);
595 /* FIXME: error check? */
596 if (G.content_len == 0)
597 break; /* all done! */
598 G.got_clen = 1;
599 /*
600 * Note that fgets may result in some data being buffered in dfp.
601 * We loop back to fread, which will retrieve this data.
602 * Also note that code has to be arranged so that fread
603 * is done _before_ one-second poll wait - poll doesn't know
604 * about stdio buffering and can result in spurious one second waits!
605 */
606 }
607
608 /* If -c failed, we restart from the beginning,
609 * but we do not truncate file then, we do it only now, at the end.
610 * This lets user to ^C if his 99% complete 10 GB file download
611 * failed to restart *without* losing the almost complete file.
612 */
613 {
614 off_t pos = lseek(G.output_fd, 0, SEEK_CUR);
615 if (pos != (off_t)-1)
616 ftruncate(G.output_fd, pos);
617 }
618
619 /* Draw full bar and free its resources */
620 G.chunked = 0; /* makes it show 100% even for chunked download */
621 G.got_clen = 1; /* makes it show 100% even for download of (formerly) unknown size */
622 progress_meter(PROGRESS_END);
623}
624
625static void download_one_url(const char *url)
626{
627 bool use_proxy; /* Use proxies if env vars are set */
628 int redir_limit;
629 len_and_sockaddr *lsa;
630 FILE *sfp; /* socket to web/ftp server */
631 FILE *dfp; /* socket to ftp server (data) */
632 char *proxy = NULL;
633 char *fname_out_alloc;
634 char *redirected_path = NULL;
635 struct host_info server;
636 struct host_info target;
637
638 server.allocated = NULL;
639 target.allocated = NULL;
640 server.user = NULL;
641 target.user = NULL;
642
643 parse_url(url, &target);
644
645 /* Use the proxy if necessary */
646 use_proxy = (strcmp(G.proxy_flag, "off") != 0);
647 if (use_proxy) {
648 proxy = getenv(target.is_ftp ? "ftp_proxy" : "http_proxy");
649 use_proxy = (proxy && proxy[0]);
650 if (use_proxy)
651 parse_url(proxy, &server);
652 }
653 if (!use_proxy) {
654 server.port = target.port;
655 if (ENABLE_FEATURE_IPV6) {
656 //free(server.allocated); - can't be non-NULL
657 server.host = server.allocated = xstrdup(target.host);
658 } else {
659 server.host = target.host;
660 }
661 }
662
663 if (ENABLE_FEATURE_IPV6)
664 strip_ipv6_scope_id(target.host);
665
666 /* If there was no -O FILE, guess output filename */
667 fname_out_alloc = NULL;
668 if (!(option_mask32 & WGET_OPT_OUTNAME)) {
669 G.fname_out = bb_get_last_path_component_nostrip(target.path);
670 /* handle "wget http://kernel.org//" */
671 if (G.fname_out[0] == '/' || !G.fname_out[0])
672 G.fname_out = (char*)"index.html";
673 /* -P DIR is considered only if there was no -O FILE */
674 if (G.dir_prefix)
675 G.fname_out = fname_out_alloc = concat_path_file(G.dir_prefix, G.fname_out);
676 else {
677 /* redirects may free target.path later, need to make a copy */
678 G.fname_out = fname_out_alloc = xstrdup(G.fname_out);
679 }
680 }
681#if ENABLE_FEATURE_WGET_STATUSBAR
682 G.curfile = bb_get_last_path_component_nostrip(G.fname_out);
683#endif
684
685 /* Determine where to start transfer */
686 G.beg_range = 0;
687 if (option_mask32 & WGET_OPT_CONTINUE) {
688 G.output_fd = open(G.fname_out, O_WRONLY);
689 if (G.output_fd >= 0) {
690 G.beg_range = xlseek(G.output_fd, 0, SEEK_END);
691 }
692 /* File doesn't exist. We do not create file here yet.
693 * We are not sure it exists on remote side */
694 }
695
696 redir_limit = 5;
697 resolve_lsa:
698 lsa = xhost2sockaddr(server.host, server.port);
699 if (!(option_mask32 & WGET_OPT_QUIET)) {
700 char *s = xmalloc_sockaddr2dotted(&lsa->u.sa);
701 fprintf(stderr, "Connecting to %s (%s)\n", server.host, s);
702 free(s);
703 }
704 establish_session:
705 /*G.content_len = 0; - redundant, got_clen = 0 is enough */
706 G.got_clen = 0;
707 G.chunked = 0;
708 if (use_proxy || !target.is_ftp) {
709 /*
710 * HTTP session
711 */
712 char *str;
713 int status;
714
715
716 /* Open socket to http server */
717 sfp = open_socket(lsa);
718
719 /* Send HTTP request */
720 if (use_proxy) {
721 fprintf(sfp, "GET %stp://%s/%s HTTP/1.1\r\n",
722 target.is_ftp ? "f" : "ht", target.host,
723 target.path);
724 } else {
725 if (option_mask32 & WGET_OPT_POST_DATA)
726 fprintf(sfp, "POST /%s HTTP/1.1\r\n", target.path);
727 else
728 fprintf(sfp, "GET /%s HTTP/1.1\r\n", target.path);
729 }
730
731 fprintf(sfp, "Host: %s\r\nUser-Agent: %s\r\n",
732 target.host, G.user_agent);
733
734 /* Ask server to close the connection as soon as we are done
735 * (IOW: we do not intend to send more requests)
736 */
737 fprintf(sfp, "Connection: close\r\n");
738
739#if ENABLE_FEATURE_WGET_AUTHENTICATION
740 if (target.user) {
741 fprintf(sfp, "Proxy-Authorization: Basic %s\r\n"+6,
742 base64enc(target.user));
743 }
744 if (use_proxy && server.user) {
745 fprintf(sfp, "Proxy-Authorization: Basic %s\r\n",
746 base64enc(server.user));
747 }
748#endif
749
750 if (G.beg_range != 0)
751 fprintf(sfp, "Range: bytes=%"OFF_FMT"u-\r\n", G.beg_range);
752
753#if ENABLE_FEATURE_WGET_LONG_OPTIONS
754 if (G.extra_headers)
755 fputs(G.extra_headers, sfp);
756
757 if (option_mask32 & WGET_OPT_POST_DATA) {
758 fprintf(sfp,
759 "Content-Type: application/x-www-form-urlencoded\r\n"
760 "Content-Length: %u\r\n"
761 "\r\n"
762 "%s",
763 (int) strlen(G.post_data), G.post_data
764 );
765 } else
766#endif
767 {
768 fprintf(sfp, "\r\n");
769 }
770
771 fflush(sfp);
772
773 /*
774 * Retrieve HTTP response line and check for "200" status code.
775 */
776 read_response:
777 fgets_and_trim(sfp);
778
779 str = G.wget_buf;
780 str = skip_non_whitespace(str);
781 str = skip_whitespace(str);
782 // FIXME: no error check
783 // xatou wouldn't work: "200 OK"
784 status = atoi(str);
785 switch (status) {
786 case 0:
787 case 100:
788 while (gethdr(sfp) != NULL)
789 /* eat all remaining headers */;
790 goto read_response;
791 case 200:
792/*
793Response 204 doesn't say "null file", it says "metadata
794has changed but data didn't":
795
796"10.2.5 204 No Content
797The server has fulfilled the request but does not need to return
798an entity-body, and might want to return updated metainformation.
799The response MAY include new or updated metainformation in the form
800of entity-headers, which if present SHOULD be associated with
801the requested variant.
802
803If the client is a user agent, it SHOULD NOT change its document
804view from that which caused the request to be sent. This response
805is primarily intended to allow input for actions to take place
806without causing a change to the user agent's active document view,
807although any new or updated metainformation SHOULD be applied
808to the document currently in the user agent's active view.
809
810The 204 response MUST NOT include a message-body, and thus
811is always terminated by the first empty line after the header fields."
812
813However, in real world it was observed that some web servers
814(e.g. Boa/0.94.14rc21) simply use code 204 when file size is zero.
815*/
816 case 204:
817 if (G.beg_range != 0) {
818 /* "Range:..." was not honored by the server.
819 * Restart download from the beginning.
820 */
821 reset_beg_range_to_zero();
822 }
823 break;
824 case 300: /* redirection */
825 case 301:
826 case 302:
827 case 303:
828 break;
829 case 206: /* Partial Content */
830 if (G.beg_range != 0)
831 /* "Range:..." worked. Good. */
832 break;
833 /* Partial Content even though we did not ask for it??? */
834 /* fall through */
835 default:
836 bb_error_msg_and_die("server returned error: %s", sanitize_string(G.wget_buf));
837 }
838
839 /*
840 * Retrieve HTTP headers.
841 */
842 while ((str = gethdr(sfp)) != NULL) {
843 static const char keywords[] ALIGN1 =
844 "content-length\0""transfer-encoding\0""location\0";
845 enum {
846 KEY_content_length = 1, KEY_transfer_encoding, KEY_location
847 };
848 smalluint key;
849
850 /* gethdr converted "FOO:" string to lowercase */
851
852 /* strip trailing whitespace */
853 char *s = strchrnul(str, '\0') - 1;
854 while (s >= str && (*s == ' ' || *s == '\t')) {
855 *s = '\0';
856 s--;
857 }
858 key = index_in_strings(keywords, G.wget_buf) + 1;
859 if (key == KEY_content_length) {
860 G.content_len = BB_STRTOOFF(str, NULL, 10);
861 if (G.content_len < 0 || errno) {
862 bb_error_msg_and_die("content-length %s is garbage", sanitize_string(str));
863 }
864 G.got_clen = 1;
865 continue;
866 }
867 if (key == KEY_transfer_encoding) {
868 if (strcmp(str_tolower(str), "chunked") != 0)
869 bb_error_msg_and_die("transfer encoding '%s' is not supported", sanitize_string(str));
870 G.chunked = 1;
871 }
872 if (key == KEY_location && status >= 300) {
873 if (--redir_limit == 0)
874 bb_error_msg_and_die("too many redirections");
875 fclose(sfp);
876 if (str[0] == '/') {
877 free(redirected_path);
878 target.path = redirected_path = xstrdup(str+1);
879 /* lsa stays the same: it's on the same server */
880 } else {
881 parse_url(str, &target);
882 if (!use_proxy) {
883 free(server.allocated);
884 server.allocated = NULL;
885 server.host = target.host;
886 /* strip_ipv6_scope_id(target.host); - no! */
887 /* we assume remote never gives us IPv6 addr with scope id */
888 server.port = target.port;
889 free(lsa);
890 goto resolve_lsa;
891 } /* else: lsa stays the same: we use proxy */
892 }
893 goto establish_session;
894 }
895 }
896// if (status >= 300)
897// bb_error_msg_and_die("bad redirection (no Location: header from server)");
898
899 /* For HTTP, data is pumped over the same connection */
900 dfp = sfp;
901
902 } else {
903 /*
904 * FTP session
905 */
906 sfp = prepare_ftp_session(&dfp, &target, lsa);
907 }
908
909 free(lsa);
910
911 if (!(option_mask32 & WGET_OPT_SPIDER)) {
912 if (G.output_fd < 0)
913 G.output_fd = xopen(G.fname_out, G.o_flags);
914 retrieve_file_data(dfp);
915 if (!(option_mask32 & WGET_OPT_OUTNAME)) {
916 xclose(G.output_fd);
917 G.output_fd = -1;
918 }
919 }
920
921 if (dfp != sfp) {
922 /* It's ftp. Close data connection properly */
923 fclose(dfp);
924 if (ftpcmd(NULL, NULL, sfp) != 226)
925 bb_error_msg_and_die("ftp error: %s", sanitize_string(G.wget_buf + 4));
926 /* ftpcmd("QUIT", NULL, sfp); - why bother? */
927 }
928 fclose(sfp);
929
930 free(server.allocated);
931 free(target.allocated);
932 free(fname_out_alloc);
933 free(redirected_path);
934}
935
936int wget_main(int argc, char **argv) MAIN_EXTERNALLY_VISIBLE;
937int wget_main(int argc UNUSED_PARAM, char **argv)
938{
939#if ENABLE_FEATURE_WGET_LONG_OPTIONS
940 static const char wget_longopts[] ALIGN1 =
941 /* name, has_arg, val */
942 "continue\0" No_argument "c"
943//FIXME: -s isn't --spider, it's --save-headers!
944 "spider\0" No_argument "s"
945 "quiet\0" No_argument "q"
946 "output-document\0" Required_argument "O"
947 "directory-prefix\0" Required_argument "P"
948 "proxy\0" Required_argument "Y"
949 "user-agent\0" Required_argument "U"
950#if ENABLE_FEATURE_WGET_TIMEOUT
951 "timeout\0" Required_argument "T"
952#endif
953 /* Ignored: */
954 // "tries\0" Required_argument "t"
955 /* Ignored (we always use PASV): */
956 "passive-ftp\0" No_argument "\xff"
957 "header\0" Required_argument "\xfe"
958 "post-data\0" Required_argument "\xfd"
959 /* Ignored (we don't do ssl) */
960 "no-check-certificate\0" No_argument "\xfc"
961 /* Ignored (we don't support caching) */
962 "no-cache\0" No_argument "\xfb"
963 ;
964#endif
965
966#if ENABLE_FEATURE_WGET_LONG_OPTIONS
967 llist_t *headers_llist = NULL;
968#endif
969
970 INIT_G();
971
972#if ENABLE_FEATURE_WGET_TIMEOUT
973 G.timeout_seconds = 900;
974 signal(SIGALRM, alarm_handler);
975#endif
976 G.proxy_flag = "on"; /* use proxies if env vars are set */
977 G.user_agent = "Wget"; /* "User-Agent" header field */
978
979#if ENABLE_FEATURE_WGET_LONG_OPTIONS
980 applet_long_options = wget_longopts;
981#endif
982 opt_complementary = "-1" IF_FEATURE_WGET_TIMEOUT(":T+") IF_FEATURE_WGET_LONG_OPTIONS(":\xfe::");
983 getopt32(argv, "csqO:P:Y:U:T:" /*ignored:*/ "t:",
984 &G.fname_out, &G.dir_prefix,
985 &G.proxy_flag, &G.user_agent,
986 IF_FEATURE_WGET_TIMEOUT(&G.timeout_seconds) IF_NOT_FEATURE_WGET_TIMEOUT(NULL),
987 NULL /* -t RETRIES */
988 IF_FEATURE_WGET_LONG_OPTIONS(, &headers_llist)
989 IF_FEATURE_WGET_LONG_OPTIONS(, &G.post_data)
990 );
991 argv += optind;
992
993#if ENABLE_FEATURE_WGET_LONG_OPTIONS
994 if (headers_llist) {
995 int size = 1;
996 char *cp;
997 llist_t *ll = headers_llist;
998 while (ll) {
999 size += strlen(ll->data) + 2;
1000 ll = ll->link;
1001 }
1002 G.extra_headers = cp = xmalloc(size);
1003 while (headers_llist) {
1004 cp += sprintf(cp, "%s\r\n", (char*)llist_pop(&headers_llist));
1005 }
1006 }
1007#endif
1008
1009 G.output_fd = -1;
1010 G.o_flags = O_WRONLY | O_CREAT | O_TRUNC | O_EXCL;
1011 if (G.fname_out) { /* -O FILE ? */
1012 if (LONE_DASH(G.fname_out)) { /* -O - ? */
1013 G.output_fd = 1;
1014 option_mask32 &= ~WGET_OPT_CONTINUE;
1015 }
1016 /* compat with wget: -O FILE can overwrite */
1017 G.o_flags = O_WRONLY | O_CREAT | O_TRUNC;
1018 }
1019
1020 while (*argv)
1021 download_one_url(*argv++);
1022
1023 if (G.output_fd >= 0)
1024 xclose(G.output_fd);
1025
1026#if ENABLE_FEATURE_CLEAN_UP && ENABLE_FEATURE_WGET_LONG_OPTIONS
1027 free(G.extra_headers);
1028#endif
1029 FINI_G();
1030
1031 return EXIT_SUCCESS;
1032}
1033