blob: a410e407a0cb155d00f57509cfb5abf8315aad3a
1 | /* vi: set sw=4 ts=4: */ |
2 | /* |
3 | * wc implementation for busybox |
4 | * |
5 | * Copyright (C) 2003 Manuel Novoa III <mjn3@codepoet.org> |
6 | * |
7 | * Licensed under GPLv2 or later, see file LICENSE in this source tree. |
8 | */ |
9 | |
10 | /* BB_AUDIT SUSv3 compliant. */ |
11 | /* http://www.opengroup.org/onlinepubs/007904975/utilities/wc.html */ |
12 | |
13 | /* Mar 16, 2003 Manuel Novoa III (mjn3@codepoet.org) |
14 | * |
15 | * Rewritten to fix a number of problems and do some size optimizations. |
16 | * Problems in the previous busybox implementation (besides bloat) included: |
17 | * 1) broken 'wc -c' optimization (read note below) |
18 | * 2) broken handling of '-' args |
19 | * 3) no checking of ferror on EOF returns |
20 | * 4) isprint() wasn't considered when word counting. |
21 | * |
22 | * NOTES: |
23 | * |
24 | * The previous busybox wc attempted an optimization using stat for the |
25 | * case of counting chars only. I omitted that because it was broken. |
26 | * It didn't take into account the possibility of input coming from a |
27 | * pipe, or input from a file with file pointer not at the beginning. |
28 | * |
29 | * To implement such a speed optimization correctly, not only do you |
30 | * need the size, but also the file position. Note also that the |
31 | * file position may be past the end of file. Consider the example |
32 | * (adapted from example in gnu wc.c) |
33 | * |
34 | * echo hello > /tmp/testfile && |
35 | * (dd ibs=1k skip=1 count=0 &> /dev/null; wc -c) < /tmp/testfile |
36 | * |
37 | * for which 'wc -c' should output '0'. |
38 | */ |
39 | #include "libbb.h" |
40 | #include "unicode.h" |
41 | |
#if !ENABLE_LOCALE_SUPPORT
/* Without locale support, replace the <ctype.h> classifiers with
 * fixed ASCII-only tests:
 *   isprint(c) - true for 0x20 (space) through 0x7e ('~') inclusive;
 *                the unsigned cast folds "below 0x20" into "too large".
 *   isspace(c) - true for the space character only.
 */
# undef isprint
# define isprint(c) ((unsigned)((c) - 0x20) < (0x7f - 0x20))
# undef isspace
# define isspace(c) (' ' == (c))
#endif
48 | |
/* Counter type and the matching printf conversion (without the '%').
 * The default is a plain unsigned; FEATURE_WC_LARGE widens every
 * counter to unsigned long long.
 */
#if !ENABLE_FEATURE_WC_LARGE
# define COUNT_T unsigned
# define COUNT_FMT "u"
#else
# define COUNT_T unsigned long long
# define COUNT_FMT "llu"
#endif
56 | |
57 | /* We support -m even when UNICODE_SUPPORT is off, |
58 | * we just don't advertise it in help text, |
59 | * since it is the same as -c in this case. |
60 | */ |
61 | |
62 | //usage:#define wc_trivial_usage |
63 | //usage: "[-c"IF_UNICODE_SUPPORT("m")"lwL] [FILE]..." |
64 | //usage: |
65 | //usage:#define wc_full_usage "\n\n" |
66 | //usage: "Count lines, words, and bytes for each FILE (or stdin)\n" |
67 | //usage: "\n -c Count bytes" |
68 | //usage: IF_UNICODE_SUPPORT( |
69 | //usage: "\n -m Count characters" |
70 | //usage: ) |
71 | //usage: "\n -l Count newlines" |
72 | //usage: "\n -w Count words" |
73 | //usage: "\n -L Print longest line length" |
74 | //usage: |
75 | //usage:#define wc_example_usage |
76 | //usage: "$ wc /etc/passwd\n" |
77 | //usage: " 31 46 1365 /etc/passwd\n" |
78 | |
/* Counter indices.
 * The numbering is significant: each value is both the index into the
 * counter arrays and the bit position of the corresponding option in the
 * option mask, and counters are printed in increasing index order, which
 * gives the same column order as "wc -cmlwL" output.
 */
enum {
	WC_LINES = 0,   /* -l, bit 0 */
	WC_WORDS,       /* -w, bit 1 */
	WC_UNICHARS,    /* -m, bit 2 */
	WC_BYTES,       /* -c, bit 3 */
	WC_LENGTH,      /* -L, bit 4 */
	NUM_WCS         /* number of counters (5) */
};
90 | |
91 | int wc_main(int argc, char **argv) MAIN_EXTERNALLY_VISIBLE; |
92 | int wc_main(int argc UNUSED_PARAM, char **argv) |
93 | { |
94 | const char *arg; |
95 | const char *start_fmt = " %9"COUNT_FMT + 1; |
96 | const char *fname_fmt = " %s\n"; |
97 | COUNT_T *pcounts; |
98 | COUNT_T counts[NUM_WCS]; |
99 | COUNT_T totals[NUM_WCS]; |
100 | int num_files; |
101 | smallint status = EXIT_SUCCESS; |
102 | unsigned print_type; |
103 | |
104 | init_unicode(); |
105 | |
106 | print_type = getopt32(argv, "lwmcL"); |
107 | |
108 | if (print_type == 0) { |
109 | print_type = (1 << WC_LINES) | (1 << WC_WORDS) | (1 << WC_BYTES); |
110 | } |
111 | |
112 | argv += optind; |
113 | if (!argv[0]) { |
114 | *--argv = (char *) bb_msg_standard_input; |
115 | fname_fmt = "\n"; |
116 | } |
117 | if (!argv[1]) { /* zero or one filename? */ |
118 | if (!((print_type-1) & print_type)) /* exactly one option? */ |
119 | start_fmt = "%"COUNT_FMT; |
120 | } |
121 | |
122 | memset(totals, 0, sizeof(totals)); |
123 | |
124 | pcounts = counts; |
125 | |
126 | num_files = 0; |
127 | while ((arg = *argv++) != NULL) { |
128 | FILE *fp; |
129 | const char *s; |
130 | unsigned u; |
131 | unsigned linepos; |
132 | smallint in_word; |
133 | |
134 | ++num_files; |
135 | fp = fopen_or_warn_stdin(arg); |
136 | if (!fp) { |
137 | status = EXIT_FAILURE; |
138 | continue; |
139 | } |
140 | |
141 | memset(counts, 0, sizeof(counts)); |
142 | linepos = 0; |
143 | in_word = 0; |
144 | |
145 | while (1) { |
146 | int c; |
147 | /* Our -w doesn't match GNU wc exactly... oh well */ |
148 | |
149 | c = getc(fp); |
150 | if (c == EOF) { |
151 | if (ferror(fp)) { |
152 | bb_simple_perror_msg(arg); |
153 | status = EXIT_FAILURE; |
154 | } |
155 | goto DO_EOF; /* Treat an EOF as '\r'. */ |
156 | } |
157 | |
158 | /* Cater for -c and -m */ |
159 | ++counts[WC_BYTES]; |
160 | if (unicode_status != UNICODE_ON /* every byte is a new char */ |
161 | || (c & 0xc0) != 0x80 /* it isn't a 2nd+ byte of a Unicode char */ |
162 | ) { |
163 | ++counts[WC_UNICHARS]; |
164 | } |
165 | |
166 | if (isprint_asciionly(c)) { /* FIXME: not unicode-aware */ |
167 | ++linepos; |
168 | if (!isspace(c)) { |
169 | in_word = 1; |
170 | continue; |
171 | } |
172 | } else if ((unsigned)(c - 9) <= 4) { |
173 | /* \t 9 |
174 | * \n 10 |
175 | * \v 11 |
176 | * \f 12 |
177 | * \r 13 |
178 | */ |
179 | if (c == '\t') { |
180 | linepos = (linepos | 7) + 1; |
181 | } else { /* '\n', '\r', '\f', or '\v' */ |
182 | DO_EOF: |
183 | if (linepos > counts[WC_LENGTH]) { |
184 | counts[WC_LENGTH] = linepos; |
185 | } |
186 | if (c == '\n') { |
187 | ++counts[WC_LINES]; |
188 | } |
189 | if (c != '\v') { |
190 | linepos = 0; |
191 | } |
192 | } |
193 | } else { |
194 | continue; |
195 | } |
196 | |
197 | counts[WC_WORDS] += in_word; |
198 | in_word = 0; |
199 | if (c == EOF) { |
200 | break; |
201 | } |
202 | } |
203 | |
204 | fclose_if_not_stdin(fp); |
205 | |
206 | if (totals[WC_LENGTH] < counts[WC_LENGTH]) { |
207 | totals[WC_LENGTH] = counts[WC_LENGTH]; |
208 | } |
209 | totals[WC_LENGTH] -= counts[WC_LENGTH]; |
210 | |
211 | OUTPUT: |
212 | /* coreutils wc tries hard to print pretty columns |
213 | * (saves results for all files, finds max col len etc...) |
214 | * we won't try that hard, it will bloat us too much */ |
215 | s = start_fmt; |
216 | u = 0; |
217 | do { |
218 | if (print_type & (1 << u)) { |
219 | printf(s, pcounts[u]); |
220 | s = " %9"COUNT_FMT; /* Ok... restore the leading space. */ |
221 | } |
222 | totals[u] += pcounts[u]; |
223 | } while (++u < NUM_WCS); |
224 | printf(fname_fmt, arg); |
225 | } |
226 | |
227 | /* If more than one file was processed, we want the totals. To save some |
228 | * space, we set the pcounts ptr to the totals array. This has the side |
229 | * effect of trashing the totals array after outputting it, but that's |
230 | * irrelavent since we no longer need it. */ |
231 | if (num_files > 1) { |
232 | num_files = 0; /* Make sure we don't get here again. */ |
233 | arg = "total"; |
234 | pcounts = totals; |
235 | --argv; |
236 | goto OUTPUT; |
237 | } |
238 | |
239 | fflush_stdout_and_exit(status); |
240 | } |
241 |