blob: 73837141ee26d85ae87650ff43b2b1e07ae982ae
/* vi: set sw=4 ts=4: */
/*
 * wc implementation for busybox
 *
 * Copyright (C) 2003 Manuel Novoa III <mjn3@codepoet.org>
 *
 * Licensed under GPLv2 or later, see file LICENSE in this source tree.
 */
/* Mar 16, 2003 Manuel Novoa III (mjn3@codepoet.org)
 *
 * Rewritten to fix a number of problems and do some size optimizations.
 * Problems in the previous busybox implementation (besides bloat) included:
 *  1) broken 'wc -c' optimization (read note below)
 *  2) broken handling of '-' args
 *  3) no checking of ferror on EOF returns
 *  4) isprint() wasn't considered when word counting.
 *
 * NOTES:
 *
 * The previous busybox wc attempted an optimization using stat for the
 * case of counting chars only. I omitted that because it was broken.
 * It didn't take into account the possibility of input coming from a
 * pipe, or input from a file with file pointer not at the beginning.
 *
 * To implement such a speed optimization correctly, not only do you
 * need the size, but also the file position. Note also that the
 * file position may be past the end of file. Consider the example
 * (adapted from example in gnu wc.c)
 *
 *   echo hello > /tmp/testfile &&
 *   (dd ibs=1k skip=1 count=0 &> /dev/null; wc -c) < /tmp/testfile
 *
 * for which 'wc -c' should output '0'.
 */
35 | //config:config WC |
36 | //config: bool "wc" |
37 | //config: default y |
38 | //config: help |
39 | //config: wc is used to print the number of bytes, words, and lines, |
40 | //config: in specified files. |
41 | //config: |
42 | //config:config FEATURE_WC_LARGE |
43 | //config: bool "Support very large files in wc" |
44 | //config: default y |
45 | //config: depends on WC |
46 | //config: help |
47 | //config: Use "unsigned long long" in wc for counter variables. |
48 | |
49 | //applet:IF_WC(APPLET(wc, BB_DIR_USR_BIN, BB_SUID_DROP)) |
50 | |
51 | //kbuild:lib-$(CONFIG_WC) += wc.o |
52 | |
/* BB_AUDIT SUSv3 compliant. */
/* http://www.opengroup.org/onlinepubs/007904975/utilities/wc.html */
55 | |
56 | #include "libbb.h" |
57 | #include "unicode.h" |
58 | |
/*
 * Without locale support, override any existing isprint()/isspace()
 * definitions with ASCII-only macros:
 *   isprint(c) - true for 0x20..0x7e (single unsigned range check, so
 *                negative values such as EOF are rejected as well);
 *   isspace(c) - true only for the space character.
 * Each macro evaluates its argument exactly once.
 */
#if !ENABLE_LOCALE_SUPPORT
# undef isprint
# undef isspace
# define isprint(c) ((unsigned)((c) - 0x20) <= (0x7e - 0x20))
# define isspace(c) ((c) == ' ')
#endif
65 | |
/*
 * Counter type and its matching printf conversion.
 * With FEATURE_WC_LARGE the counters are "unsigned long long";
 * otherwise plain "unsigned" is used.  COUNT_FMT deliberately omits the
 * leading '%' so that users can prepend a field width (e.g. "%9"COUNT_FMT).
 */
#if ENABLE_FEATURE_WC_LARGE
# define COUNT_T unsigned long long
# define COUNT_FMT "llu"
#else
# define COUNT_T unsigned
# define COUNT_FMT "u"
#endif
73 | |
/* We support -m even when UNICODE_SUPPORT is off,
 * we just don't advertise it in help text,
 * since it is the same as -c in this case.
 */
78 | |
79 | //usage:#define wc_trivial_usage |
80 | //usage: "[-c"IF_UNICODE_SUPPORT("m")"lwL] [FILE]..." |
81 | //usage: |
82 | //usage:#define wc_full_usage "\n\n" |
83 | //usage: "Count lines, words, and bytes for each FILE (or stdin)\n" |
84 | //usage: "\n -c Count bytes" |
85 | //usage: IF_UNICODE_SUPPORT( |
86 | //usage: "\n -m Count characters" |
87 | //usage: ) |
88 | //usage: "\n -l Count newlines" |
89 | //usage: "\n -w Count words" |
90 | //usage: "\n -L Print longest line length" |
91 | //usage: |
92 | //usage:#define wc_example_usage |
93 | //usage: "$ wc /etc/passwd\n" |
94 | //usage: " 31 46 1365 /etc/passwd\n" |
95 | |
/*
 * Indices into the per-file and total counter arrays.
 *
 * Order is important if we want to be compatible with
 * column order in "wc -cmlwL" output.  Each value is also used as a bit
 * position in the option mask (print_type is tested with "1 << index"),
 * so the numbering must not change.  NUM_WCS is the array size.
 */
enum {
	WC_LINES    = 0, /* -l */
	WC_WORDS    = 1, /* -w */
	WC_UNICHARS = 2, /* -m */
	WC_BYTES    = 3, /* -c */
	WC_LENGTH   = 4, /* -L */
	NUM_WCS     = 5,
};
107 | |
108 | int wc_main(int argc, char **argv) MAIN_EXTERNALLY_VISIBLE; |
109 | int wc_main(int argc UNUSED_PARAM, char **argv) |
110 | { |
111 | const char *arg; |
112 | const char *start_fmt = " %9"COUNT_FMT + 1; |
113 | const char *fname_fmt = " %s\n"; |
114 | COUNT_T *pcounts; |
115 | COUNT_T counts[NUM_WCS]; |
116 | COUNT_T totals[NUM_WCS]; |
117 | int num_files; |
118 | smallint status = EXIT_SUCCESS; |
119 | unsigned print_type; |
120 | |
121 | init_unicode(); |
122 | |
123 | print_type = getopt32(argv, "lwmcL"); |
124 | |
125 | if (print_type == 0) { |
126 | print_type = (1 << WC_LINES) | (1 << WC_WORDS) | (1 << WC_BYTES); |
127 | } |
128 | |
129 | argv += optind; |
130 | if (!argv[0]) { |
131 | *--argv = (char *) bb_msg_standard_input; |
132 | fname_fmt = "\n"; |
133 | } |
134 | if (!argv[1]) { /* zero or one filename? */ |
135 | if (!((print_type-1) & print_type)) /* exactly one option? */ |
136 | start_fmt = "%"COUNT_FMT; |
137 | } |
138 | |
139 | memset(totals, 0, sizeof(totals)); |
140 | |
141 | pcounts = counts; |
142 | |
143 | num_files = 0; |
144 | while ((arg = *argv++) != NULL) { |
145 | FILE *fp; |
146 | const char *s; |
147 | unsigned u; |
148 | unsigned linepos; |
149 | smallint in_word; |
150 | |
151 | ++num_files; |
152 | fp = fopen_or_warn_stdin(arg); |
153 | if (!fp) { |
154 | status = EXIT_FAILURE; |
155 | continue; |
156 | } |
157 | |
158 | memset(counts, 0, sizeof(counts)); |
159 | linepos = 0; |
160 | in_word = 0; |
161 | |
162 | while (1) { |
163 | int c; |
164 | /* Our -w doesn't match GNU wc exactly... oh well */ |
165 | |
166 | c = getc(fp); |
167 | if (c == EOF) { |
168 | if (ferror(fp)) { |
169 | bb_simple_perror_msg(arg); |
170 | status = EXIT_FAILURE; |
171 | } |
172 | goto DO_EOF; /* Treat an EOF as '\r'. */ |
173 | } |
174 | |
175 | /* Cater for -c and -m */ |
176 | ++counts[WC_BYTES]; |
177 | if (unicode_status != UNICODE_ON /* every byte is a new char */ |
178 | || (c & 0xc0) != 0x80 /* it isn't a 2nd+ byte of a Unicode char */ |
179 | ) { |
180 | ++counts[WC_UNICHARS]; |
181 | } |
182 | |
183 | if (isprint_asciionly(c)) { /* FIXME: not unicode-aware */ |
184 | ++linepos; |
185 | if (!isspace(c)) { |
186 | in_word = 1; |
187 | continue; |
188 | } |
189 | } else if ((unsigned)(c - 9) <= 4) { |
190 | /* \t 9 |
191 | * \n 10 |
192 | * \v 11 |
193 | * \f 12 |
194 | * \r 13 |
195 | */ |
196 | if (c == '\t') { |
197 | linepos = (linepos | 7) + 1; |
198 | } else { /* '\n', '\r', '\f', or '\v' */ |
199 | DO_EOF: |
200 | if (linepos > counts[WC_LENGTH]) { |
201 | counts[WC_LENGTH] = linepos; |
202 | } |
203 | if (c == '\n') { |
204 | ++counts[WC_LINES]; |
205 | } |
206 | if (c != '\v') { |
207 | linepos = 0; |
208 | } |
209 | } |
210 | } else { |
211 | continue; |
212 | } |
213 | |
214 | counts[WC_WORDS] += in_word; |
215 | in_word = 0; |
216 | if (c == EOF) { |
217 | break; |
218 | } |
219 | } |
220 | |
221 | fclose_if_not_stdin(fp); |
222 | |
223 | if (totals[WC_LENGTH] < counts[WC_LENGTH]) { |
224 | totals[WC_LENGTH] = counts[WC_LENGTH]; |
225 | } |
226 | totals[WC_LENGTH] -= counts[WC_LENGTH]; |
227 | |
228 | OUTPUT: |
229 | /* coreutils wc tries hard to print pretty columns |
230 | * (saves results for all files, finds max col len etc...) |
231 | * we won't try that hard, it will bloat us too much */ |
232 | s = start_fmt; |
233 | u = 0; |
234 | do { |
235 | if (print_type & (1 << u)) { |
236 | printf(s, pcounts[u]); |
237 | s = " %9"COUNT_FMT; /* Ok... restore the leading space. */ |
238 | } |
239 | totals[u] += pcounts[u]; |
240 | } while (++u < NUM_WCS); |
241 | printf(fname_fmt, arg); |
242 | } |
243 | |
244 | /* If more than one file was processed, we want the totals. To save some |
245 | * space, we set the pcounts ptr to the totals array. This has the side |
246 | * effect of trashing the totals array after outputting it, but that's |
247 | * irrelavent since we no longer need it. */ |
248 | if (num_files > 1) { |
249 | num_files = 0; /* Make sure we don't get here again. */ |
250 | arg = "total"; |
251 | pcounts = totals; |
252 | --argv; |
253 | goto OUTPUT; |
254 | } |
255 | |
256 | fflush_stdout_and_exit(status); |
257 | } |
258 |