blob: 64b85e1d8da4fff4027527d09107ffc4a6e625cc
1 | /* vi: set sw=4 ts=4: */ |
2 | /* |
3 | * SuS3 compliant sort implementation for busybox |
4 | * |
5 | * Copyright (C) 2004 by Rob Landley <rob@landley.net> |
6 | * |
7 | * MAINTAINER: Rob Landley <rob@landley.net> |
8 | * |
9 | * Licensed under GPLv2 or later, see file LICENSE in this source tree. |
10 | * |
11 | * See SuS3 sort standard at: |
12 | * http://www.opengroup.org/onlinepubs/007904975/utilities/sort.html |
13 | */ |
14 | //config:config SORT |
15 | //config: bool "sort" |
16 | //config: default y |
17 | //config: help |
18 | //config: sort is used to sort lines of text in specified files. |
19 | //config: |
20 | //config:config FEATURE_SORT_BIG |
21 | //config: bool "Full SuSv3 compliant sort (support -ktcsbdfiozgM)" |
22 | //config: default y |
23 | //config: depends on SORT |
24 | //config: help |
25 | //config: Without this, sort only supports -r, -u, and an integer version |
26 | //config: of -n. Selecting this adds sort keys, floating point support, and |
27 | //config: more. This adds a little over 3k to a nonstatic build on x86. |
28 | //config: |
29 | //config: The SuSv3 sort standard is available at: |
30 | //config: http://www.opengroup.org/onlinepubs/007904975/utilities/sort.html |
31 | |
32 | //applet:IF_SORT(APPLET_NOEXEC(sort, sort, BB_DIR_USR_BIN, BB_SUID_DROP, sort)) |
33 | |
34 | //kbuild:lib-$(CONFIG_SORT) += sort.o |
35 | |
36 | //usage:#define sort_trivial_usage |
37 | //usage: "[-nru" |
38 | //usage: IF_FEATURE_SORT_BIG("gMcszbdfiokt] [-o FILE] [-k start[.offset][opts][,end[.offset][opts]] [-t CHAR") |
39 | //usage: "] [FILE]..." |
40 | //usage:#define sort_full_usage "\n\n" |
41 | //usage: "Sort lines of text\n" |
42 | //usage: IF_FEATURE_SORT_BIG( |
43 | //usage: "\n -o FILE Output to FILE" |
44 | //usage: "\n -c Check whether input is sorted" |
45 | //usage: "\n -b Ignore leading blanks" |
46 | //usage: "\n -f Ignore case" |
47 | //usage: "\n -i Ignore unprintable characters" |
48 | //usage: "\n -d Dictionary order (blank or alphanumeric only)" |
49 | //usage: "\n -g General numerical sort" |
50 | //usage: "\n -M Sort month" |
51 | //usage: ) |
52 | //-h, --human-numeric-sort: compare human readable numbers (e.g., 2K 1G) |
53 | //usage: "\n -n Sort numbers" |
54 | //usage: IF_FEATURE_SORT_BIG( |
55 | //usage: "\n -t CHAR Field separator" |
56 | //usage: "\n -k N[,M] Sort by Nth field" |
57 | //usage: ) |
58 | //usage: "\n -r Reverse sort order" |
59 | //usage: IF_FEATURE_SORT_BIG( |
60 | //usage: "\n -s Stable (don't sort ties alphabetically)" |
61 | //usage: ) |
62 | //usage: "\n -u Suppress duplicate lines" |
63 | //usage: IF_FEATURE_SORT_BIG( |
64 | //usage: "\n -z Lines are terminated by NUL, not newline" |
65 | ////usage: "\n -m Ignored for GNU compatibility" |
66 | ////usage: "\n -S BUFSZ Ignored for GNU compatibility" |
67 | ////usage: "\n -T TMPDIR Ignored for GNU compatibility" |
68 | //usage: ) |
69 | //usage: |
70 | //usage:#define sort_example_usage |
71 | //usage: "$ echo -e \"e\\nf\\nb\\nd\\nc\\na\" | sort\n" |
72 | //usage: "a\n" |
73 | //usage: "b\n" |
74 | //usage: "c\n" |
75 | //usage: "d\n" |
76 | //usage: "e\n" |
77 | //usage: "f\n" |
78 | //usage: IF_FEATURE_SORT_BIG( |
79 | //usage: "$ echo -e \"c 3\\nb 2\\nd 2\" | $SORT -k 2,2n -k 1,1r\n" |
80 | //usage: "d 2\n" |
81 | //usage: "b 2\n" |
82 | //usage: "c 3\n" |
83 | //usage: ) |
84 | //usage: "" |
85 | |
86 | #include "libbb.h" |
87 | |
88 | /* This is a NOEXEC applet. Be very careful! */ |
89 | |
90 | |
91 | /* |
92 | sort [-m][-o output][-bdfinru][-t char][-k keydef]... [file...] |
93 | sort -c [-bdfinru][-t char][-k keydef][file] |
94 | */ |
95 | |
96 | /* These are sort types */ |
97 | static const char OPT_STR[] ALIGN1 = "ngMucszbrdfimS:T:o:k:*t:"; |
98 | enum { |
99 | FLAG_n = 1, /* Numeric sort */ |
100 | FLAG_g = 2, /* Sort using strtod() */ |
101 | FLAG_M = 4, /* Sort date */ |
102 | /* ucsz apply to root level only, not keys. b at root level implies bb */ |
103 | FLAG_u = 8, /* Unique */ |
104 | FLAG_c = 0x10, /* Check: no output, exit(!ordered) */ |
105 | FLAG_s = 0x20, /* Stable sort, no ascii fallback at end */ |
106 | FLAG_z = 0x40, /* Input and output is NUL terminated, not \n */ |
107 | /* These can be applied to search keys, the previous four can't */ |
108 | FLAG_b = 0x80, /* Ignore leading blanks */ |
109 | FLAG_r = 0x100, /* Reverse */ |
110 | FLAG_d = 0x200, /* Ignore !(isalnum()|isspace()) */ |
111 | FLAG_f = 0x400, /* Force uppercase */ |
112 | FLAG_i = 0x800, /* Ignore !isprint() */ |
113 | FLAG_m = 0x1000, /* ignored: merge already sorted files; do not sort */ |
114 | FLAG_S = 0x2000, /* ignored: -S, --buffer-size=SIZE */ |
115 | FLAG_T = 0x4000, /* ignored: -T, --temporary-directory=DIR */ |
116 | FLAG_o = 0x8000, |
117 | FLAG_k = 0x10000, |
118 | FLAG_t = 0x20000, |
119 | FLAG_bb = 0x80000000, /* Ignore trailing blanks */ |
120 | }; |
121 | |
122 | #if ENABLE_FEATURE_SORT_BIG |
123 | static char key_separator; |
124 | |
125 | static struct sort_key { |
126 | struct sort_key *next_key; /* linked list */ |
127 | unsigned range[4]; /* start word, start char, end word, end char */ |
128 | unsigned flags; |
129 | } *key_list; |
130 | |
131 | static char *get_key(char *str, struct sort_key *key, int flags) |
132 | { |
133 | int start = start; /* for compiler */ |
134 | int end; |
135 | int len, j; |
136 | unsigned i; |
137 | |
138 | /* Special case whole string, so we don't have to make a copy */ |
139 | if (key->range[0] == 1 && !key->range[1] && !key->range[2] && !key->range[3] |
140 | && !(flags & (FLAG_b | FLAG_d | FLAG_f | FLAG_i | FLAG_bb)) |
141 | ) { |
142 | return str; |
143 | } |
144 | |
145 | /* Find start of key on first pass, end on second pass */ |
146 | len = strlen(str); |
147 | for (j = 0; j < 2; j++) { |
148 | if (!key->range[2*j]) |
149 | end = len; |
150 | /* Loop through fields */ |
151 | else { |
152 | unsigned char ch = 0; |
153 | |
154 | end = 0; |
155 | for (i = 1; i < key->range[2*j] + j; i++) { |
156 | if (key_separator) { |
157 | /* Skip body of key and separator */ |
158 | while ((ch = str[end]) != '\0') { |
159 | end++; |
160 | if (ch == key_separator) |
161 | break; |
162 | } |
163 | } else { |
164 | /* Skip leading blanks */ |
165 | while (isspace(str[end])) |
166 | end++; |
167 | /* Skip body of key */ |
168 | while (str[end] != '\0') { |
169 | if (isspace(str[end])) |
170 | break; |
171 | end++; |
172 | } |
173 | } |
174 | } |
175 | /* Remove last delim: "abc:def:" => "abc:def" */ |
176 | if (j && ch) { |
177 | //if (str[end-1] != key_separator) |
178 | // bb_error_msg(_and_die("BUG! " |
179 | // "str[start:%d,end:%d]:'%.*s'", |
180 | // start, end, (int)(end-start), &str[start]); |
181 | end--; |
182 | } |
183 | } |
184 | if (!j) start = end; |
185 | } |
186 | /* Strip leading whitespace if necessary */ |
187 | if (flags & FLAG_b) |
188 | /* not using skip_whitespace() for speed */ |
189 | while (isspace(str[start])) start++; |
190 | /* Strip trailing whitespace if necessary */ |
191 | if (flags & FLAG_bb) |
192 | while (end > start && isspace(str[end-1])) end--; |
193 | /* -kSTART,N.ENDCHAR: honor ENDCHAR (1-based) */ |
194 | if (key->range[3]) { |
195 | end = key->range[3]; |
196 | if (end > len) end = len; |
197 | } |
198 | /* -kN.STARTCHAR[,...]: honor STARTCHAR (1-based) */ |
199 | if (key->range[1]) { |
200 | start += key->range[1] - 1; |
201 | if (start > len) start = len; |
202 | } |
203 | /* Make the copy */ |
204 | if (end < start) |
205 | end = start; |
206 | str = xstrndup(str+start, end-start); |
207 | /* Handle -d */ |
208 | if (flags & FLAG_d) { |
209 | for (start = end = 0; str[end]; end++) |
210 | if (isspace(str[end]) || isalnum(str[end])) |
211 | str[start++] = str[end]; |
212 | str[start] = '\0'; |
213 | } |
214 | /* Handle -i */ |
215 | if (flags & FLAG_i) { |
216 | for (start = end = 0; str[end]; end++) |
217 | if (isprint_asciionly(str[end])) |
218 | str[start++] = str[end]; |
219 | str[start] = '\0'; |
220 | } |
221 | /* Handle -f */ |
222 | if (flags & FLAG_f) |
223 | for (i = 0; str[i]; i++) |
224 | str[i] = toupper(str[i]); |
225 | |
226 | return str; |
227 | } |
228 | |
229 | static struct sort_key *add_key(void) |
230 | { |
231 | struct sort_key **pkey = &key_list; |
232 | while (*pkey) |
233 | pkey = &((*pkey)->next_key); |
234 | return *pkey = xzalloc(sizeof(struct sort_key)); |
235 | } |
236 | |
237 | #define GET_LINE(fp) \ |
238 | ((option_mask32 & FLAG_z) \ |
239 | ? bb_get_chunk_from_file(fp, NULL) \ |
240 | : xmalloc_fgetline(fp)) |
241 | #else |
242 | #define GET_LINE(fp) xmalloc_fgetline(fp) |
243 | #endif |
244 | |
245 | /* Iterate through keys list and perform comparisons */ |
246 | static int compare_keys(const void *xarg, const void *yarg) |
247 | { |
248 | int flags = option_mask32, retval = 0; |
249 | char *x, *y; |
250 | |
251 | #if ENABLE_FEATURE_SORT_BIG |
252 | struct sort_key *key; |
253 | |
254 | for (key = key_list; !retval && key; key = key->next_key) { |
255 | flags = key->flags ? key->flags : option_mask32; |
256 | /* Chop out and modify key chunks, handling -dfib */ |
257 | x = get_key(*(char **)xarg, key, flags); |
258 | y = get_key(*(char **)yarg, key, flags); |
259 | #else |
260 | /* This curly bracket serves no purpose but to match the nesting |
261 | * level of the for () loop we're not using */ |
262 | { |
263 | x = *(char **)xarg; |
264 | y = *(char **)yarg; |
265 | #endif |
266 | /* Perform actual comparison */ |
267 | switch (flags & (FLAG_n | FLAG_M | FLAG_g)) { |
268 | default: |
269 | bb_error_msg_and_die("unknown sort type"); |
270 | break; |
271 | /* Ascii sort */ |
272 | case 0: |
273 | #if ENABLE_LOCALE_SUPPORT |
274 | retval = strcoll(x, y); |
275 | #else |
276 | retval = strcmp(x, y); |
277 | #endif |
278 | break; |
279 | #if ENABLE_FEATURE_SORT_BIG |
280 | case FLAG_g: { |
281 | char *xx, *yy; |
282 | double dx = strtod(x, &xx); |
283 | double dy = strtod(y, &yy); |
284 | /* not numbers < NaN < -infinity < numbers < +infinity) */ |
285 | if (x == xx) |
286 | retval = (y == yy ? 0 : -1); |
287 | else if (y == yy) |
288 | retval = 1; |
289 | /* Check for isnan */ |
290 | else if (dx != dx) |
291 | retval = (dy != dy) ? 0 : -1; |
292 | else if (dy != dy) |
293 | retval = 1; |
294 | /* Check for infinity. Could underflow, but it avoids libm. */ |
295 | else if (1.0 / dx == 0.0) { |
296 | if (dx < 0) |
297 | retval = (1.0 / dy == 0.0 && dy < 0) ? 0 : -1; |
298 | else |
299 | retval = (1.0 / dy == 0.0 && dy > 0) ? 0 : 1; |
300 | } else if (1.0 / dy == 0.0) |
301 | retval = (dy < 0) ? 1 : -1; |
302 | else |
303 | retval = (dx > dy) ? 1 : ((dx < dy) ? -1 : 0); |
304 | break; |
305 | } |
306 | case FLAG_M: { |
307 | struct tm thyme; |
308 | int dx; |
309 | char *xx, *yy; |
310 | |
311 | xx = strptime(x, "%b", &thyme); |
312 | dx = thyme.tm_mon; |
313 | yy = strptime(y, "%b", &thyme); |
314 | if (!xx) |
315 | retval = (!yy) ? 0 : -1; |
316 | else if (!yy) |
317 | retval = 1; |
318 | else |
319 | retval = dx - thyme.tm_mon; |
320 | break; |
321 | } |
322 | /* Full floating point version of -n */ |
323 | case FLAG_n: { |
324 | double dx = atof(x); |
325 | double dy = atof(y); |
326 | retval = (dx > dy) ? 1 : ((dx < dy) ? -1 : 0); |
327 | break; |
328 | } |
329 | } /* switch */ |
330 | /* Free key copies. */ |
331 | if (x != *(char **)xarg) free(x); |
332 | if (y != *(char **)yarg) free(y); |
333 | /* if (retval) break; - done by for () anyway */ |
334 | #else |
335 | /* Integer version of -n for tiny systems */ |
336 | case FLAG_n: |
337 | retval = atoi(x) - atoi(y); |
338 | break; |
339 | } /* switch */ |
340 | #endif |
341 | } /* for */ |
342 | |
343 | /* Perform fallback sort if necessary */ |
344 | if (!retval && !(option_mask32 & FLAG_s)) { |
345 | flags = option_mask32; |
346 | retval = strcmp(*(char **)xarg, *(char **)yarg); |
347 | } |
348 | |
349 | if (flags & FLAG_r) |
350 | return -retval; |
351 | |
352 | return retval; |
353 | } |
354 | |
355 | #if ENABLE_FEATURE_SORT_BIG |
356 | static unsigned str2u(char **str) |
357 | { |
358 | unsigned long lu; |
359 | if (!isdigit((*str)[0])) |
360 | bb_error_msg_and_die("bad field specification"); |
361 | lu = strtoul(*str, str, 10); |
362 | if ((sizeof(long) > sizeof(int) && lu > INT_MAX) || !lu) |
363 | bb_error_msg_and_die("bad field specification"); |
364 | return lu; |
365 | } |
366 | #endif |
367 | |
368 | int sort_main(int argc, char **argv) MAIN_EXTERNALLY_VISIBLE; |
369 | int sort_main(int argc UNUSED_PARAM, char **argv) |
370 | { |
371 | char *line, **lines; |
372 | char *str_ignored, *str_o, *str_t; |
373 | llist_t *lst_k = NULL; |
374 | int i; |
375 | int linecount; |
376 | unsigned opts; |
377 | |
378 | xfunc_error_retval = 2; |
379 | |
380 | /* Parse command line options */ |
381 | /* -o and -t can be given at most once */ |
382 | opt_complementary = "o--o:t--t"; /* -t, -o: at most one of each */ |
383 | opts = getopt32(argv, OPT_STR, &str_ignored, &str_ignored, &str_o, &lst_k, &str_t); |
384 | /* global b strips leading and trailing spaces */ |
385 | if (opts & FLAG_b) |
386 | option_mask32 |= FLAG_bb; |
387 | #if ENABLE_FEATURE_SORT_BIG |
388 | if (opts & FLAG_t) { |
389 | if (!str_t[0] || str_t[1]) |
390 | bb_error_msg_and_die("bad -t parameter"); |
391 | key_separator = str_t[0]; |
392 | } |
393 | /* note: below this point we use option_mask32, not opts, |
394 | * since that reduces register pressure and makes code smaller */ |
395 | |
396 | /* Parse sort key */ |
397 | while (lst_k) { |
398 | enum { |
399 | FLAG_allowed_for_k = |
400 | FLAG_n | /* Numeric sort */ |
401 | FLAG_g | /* Sort using strtod() */ |
402 | FLAG_M | /* Sort date */ |
403 | FLAG_b | /* Ignore leading blanks */ |
404 | FLAG_r | /* Reverse */ |
405 | FLAG_d | /* Ignore !(isalnum()|isspace()) */ |
406 | FLAG_f | /* Force uppercase */ |
407 | FLAG_i | /* Ignore !isprint() */ |
408 | 0 |
409 | }; |
410 | struct sort_key *key = add_key(); |
411 | char *str_k = llist_pop(&lst_k); |
412 | |
413 | i = 0; /* i==0 before comma, 1 after (-k3,6) */ |
414 | while (*str_k) { |
415 | /* Start of range */ |
416 | /* Cannot use bb_strtou - suffix can be a letter */ |
417 | key->range[2*i] = str2u(&str_k); |
418 | if (*str_k == '.') { |
419 | str_k++; |
420 | key->range[2*i+1] = str2u(&str_k); |
421 | } |
422 | while (*str_k) { |
423 | int flag; |
424 | const char *idx; |
425 | |
426 | if (*str_k == ',' && !i++) { |
427 | str_k++; |
428 | break; |
429 | } /* no else needed: fall through to syntax error |
430 | because comma isn't in OPT_STR */ |
431 | idx = strchr(OPT_STR, *str_k); |
432 | if (!idx) |
433 | bb_error_msg_and_die("unknown key option"); |
434 | flag = 1 << (idx - OPT_STR); |
435 | if (flag & ~FLAG_allowed_for_k) |
436 | bb_error_msg_and_die("unknown sort type"); |
437 | /* b after ',' means strip _trailing_ space */ |
438 | if (i && flag == FLAG_b) |
439 | flag = FLAG_bb; |
440 | key->flags |= flag; |
441 | str_k++; |
442 | } |
443 | } |
444 | } |
445 | #endif |
446 | |
447 | /* Open input files and read data */ |
448 | argv += optind; |
449 | if (!*argv) |
450 | *--argv = (char*)"-"; |
451 | linecount = 0; |
452 | lines = NULL; |
453 | do { |
454 | /* coreutils 6.9 compat: abort on first open error, |
455 | * do not continue to next file: */ |
456 | FILE *fp = xfopen_stdin(*argv); |
457 | for (;;) { |
458 | line = GET_LINE(fp); |
459 | if (!line) |
460 | break; |
461 | lines = xrealloc_vector(lines, 6, linecount); |
462 | lines[linecount++] = line; |
463 | } |
464 | fclose_if_not_stdin(fp); |
465 | } while (*++argv); |
466 | |
467 | #if ENABLE_FEATURE_SORT_BIG |
468 | /* If no key, perform alphabetic sort */ |
469 | if (!key_list) |
470 | add_key()->range[0] = 1; |
471 | /* Handle -c */ |
472 | if (option_mask32 & FLAG_c) { |
473 | int j = (option_mask32 & FLAG_u) ? -1 : 0; |
474 | for (i = 1; i < linecount; i++) { |
475 | if (compare_keys(&lines[i-1], &lines[i]) > j) { |
476 | fprintf(stderr, "Check line %u\n", i); |
477 | return EXIT_FAILURE; |
478 | } |
479 | } |
480 | return EXIT_SUCCESS; |
481 | } |
482 | #endif |
483 | /* Perform the actual sort */ |
484 | qsort(lines, linecount, sizeof(lines[0]), compare_keys); |
485 | |
486 | /* Handle -u */ |
487 | if (option_mask32 & FLAG_u) { |
488 | int j = 0; |
489 | /* coreutils 6.3 drop lines for which only key is the same */ |
490 | /* -- disabling last-resort compare... */ |
491 | option_mask32 |= FLAG_s; |
492 | for (i = 1; i < linecount; i++) { |
493 | if (compare_keys(&lines[j], &lines[i]) == 0) |
494 | free(lines[i]); |
495 | else |
496 | lines[++j] = lines[i]; |
497 | } |
498 | if (linecount) |
499 | linecount = j+1; |
500 | } |
501 | |
502 | /* Print it */ |
503 | #if ENABLE_FEATURE_SORT_BIG |
504 | /* Open output file _after_ we read all input ones */ |
505 | if (option_mask32 & FLAG_o) |
506 | xmove_fd(xopen3(str_o, O_WRONLY|O_CREAT|O_TRUNC, 0666), STDOUT_FILENO); |
507 | #endif |
508 | { |
509 | int ch = (option_mask32 & FLAG_z) ? '\0' : '\n'; |
510 | for (i = 0; i < linecount; i++) |
511 | printf("%s%c", lines[i], ch); |
512 | } |
513 | |
514 | fflush_stdout_and_exit(EXIT_SUCCESS); |
515 | } |
516 |