blob: e65df9920a4814e81c512049af1ce68b6dbb7b9a
1 | /** |
2 | * unistr.c - Unicode string handling. Originated from the Linux-NTFS project. |
3 | * |
4 | * Copyright (c) 2000-2004 Anton Altaparmakov |
5 | * Copyright (c) 2002-2009 Szabolcs Szakacsits |
6 | * Copyright (c) 2008-2009 Jean-Pierre Andre |
7 | * Copyright (c) 2008 Bernhard Kaindl |
8 | * |
9 | * This program/include file is free software; you can redistribute it and/or |
10 | * modify it under the terms of the GNU General Public License as published |
11 | * by the Free Software Foundation; either version 2 of the License, or |
12 | * (at your option) any later version. |
13 | * |
14 | * This program/include file is distributed in the hope that it will be |
15 | * useful, but WITHOUT ANY WARRANTY; without even the implied warranty |
16 | * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
17 | * GNU General Public License for more details. |
18 | * |
19 | * You should have received a copy of the GNU General Public License |
20 | * along with this program (in the main directory of the NTFS-3G |
21 | * distribution in the file COPYING); if not, write to the Free Software |
22 | * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
23 | */ |
24 | |
25 | #ifdef HAVE_CONFIG_H |
26 | #include "config.h" |
27 | #endif |
28 | |
29 | #ifdef HAVE_STDIO_H |
30 | #include <stdio.h> |
31 | #endif |
32 | #ifdef HAVE_STDLIB_H |
33 | #include <stdlib.h> |
34 | #endif |
35 | #ifdef HAVE_WCHAR_H |
36 | #include <wchar.h> |
37 | #endif |
38 | #ifdef HAVE_STRING_H |
39 | #include <string.h> |
40 | #endif |
41 | #ifdef HAVE_ERRNO_H |
42 | #include <errno.h> |
43 | #endif |
44 | #ifdef HAVE_LOCALE_H |
45 | #include <locale.h> |
46 | #endif |
47 | |
48 | #if defined(__APPLE__) || defined(__DARWIN__) |
49 | #ifdef ENABLE_NFCONV |
50 | #include <CoreFoundation/CoreFoundation.h> |
51 | #endif /* ENABLE_NFCONV */ |
52 | #endif /* defined(__APPLE__) || defined(__DARWIN__) */ |
53 | |
54 | #include "compat.h" |
55 | #include "attrib.h" |
56 | #include "types.h" |
57 | #include "unistr.h" |
58 | #include "debug.h" |
59 | #include "logging.h" |
60 | #include "misc.h" |
61 | |
/*
 * NOREVBOM - when non-zero, the UTF-16 sizing pass and the UTF-8 decoder in
 * this file reject U+FFFE and U+FFFF (JPA: open to debate).  When zero, both
 * code points are accepted like any other three-byte character.
 */
#define NOREVBOM 0
63 | |
64 | /* |
65 | * IMPORTANT |
66 | * ========= |
67 | * |
68 | * All these routines assume that the Unicode characters are in little endian |
69 | * encoding inside the strings!!! |
70 | */ |
71 | |
/*
 * use UTF-8 encoding for file names: when non-zero, ntfs_ucstombs() and
 * ntfs_mbstoucs() hand the conversion to the built-in UTF-16LE <-> UTF-8
 * converters instead of the multibyte routines of the current locale.
 */
static int use_utf8 = 1;
73 | |
74 | #if defined(__APPLE__) || defined(__DARWIN__) |
75 | #ifdef ENABLE_NFCONV |
76 | /** |
77 | * This variable controls whether or not automatic normalization form conversion |
78 | * should be performed when translating NTFS unicode file names to UTF-8. |
79 | * Defaults to on, but can be controlled from the outside using the function |
80 | * int ntfs_macosx_normalize_filenames(int normalize); |
81 | */ |
82 | static int nfconvert_utf8 = 1; |
83 | #endif /* ENABLE_NFCONV */ |
84 | #endif /* defined(__APPLE__) || defined(__DARWIN__) */ |
85 | |
/*
 * This is used by the name collation functions to quickly determine what
 * characters are (in)valid.
 *
 * The table holds one flag byte for each of the first 0x40 character codes.
 * It is currently compiled out by the surrounding "#if 0".
 */
#if 0
static const u8 legal_ansi_char_array[0x40] = {
	0x00, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
	0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,

	0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
	0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,

	0x17, 0x07, 0x18, 0x17, 0x17, 0x17, 0x17, 0x17,
	0x17, 0x17, 0x18, 0x16, 0x16, 0x17, 0x07, 0x00,

	0x17, 0x17, 0x17, 0x17, 0x17, 0x17, 0x17, 0x17,
	0x17, 0x17, 0x04, 0x16, 0x18, 0x16, 0x18, 0x18,
};
#endif
105 | |
106 | /** |
107 | * ntfs_names_are_equal - compare two Unicode names for equality |
108 | * @s1: name to compare to @s2 |
109 | * @s1_len: length in Unicode characters of @s1 |
110 | * @s2: name to compare to @s1 |
111 | * @s2_len: length in Unicode characters of @s2 |
112 | * @ic: ignore case bool |
113 | * @upcase: upcase table (only if @ic == IGNORE_CASE) |
114 | * @upcase_size: length in Unicode characters of @upcase (if present) |
115 | * |
116 | * Compare the names @s1 and @s2 and return TRUE (1) if the names are |
117 | * identical, or FALSE (0) if they are not identical. If @ic is IGNORE_CASE, |
118 | * the @upcase table is used to perform a case insensitive comparison. |
119 | */ |
120 | BOOL ntfs_names_are_equal(const ntfschar *s1, size_t s1_len, |
121 | const ntfschar *s2, size_t s2_len, |
122 | const IGNORE_CASE_BOOL ic, |
123 | const ntfschar *upcase, const u32 upcase_size) |
124 | { |
125 | if (s1_len != s2_len) |
126 | return FALSE; |
127 | if (!s1_len) |
128 | return TRUE; |
129 | if (ic == CASE_SENSITIVE) |
130 | return ntfs_ucsncmp(s1, s2, s1_len) ? FALSE: TRUE; |
131 | return ntfs_ucsncasecmp(s1, s2, s1_len, upcase, upcase_size) ? FALSE: |
132 | TRUE; |
133 | } |
134 | |
135 | /* |
136 | * ntfs_names_full_collate() fully collate two Unicode names |
137 | * |
138 | * @name1: first Unicode name to compare |
139 | * @name1_len: length of first Unicode name to compare |
140 | * @name2: second Unicode name to compare |
141 | * @name2_len: length of second Unicode name to compare |
142 | * @ic: either CASE_SENSITIVE or IGNORE_CASE |
143 | * @upcase: upcase table (ignored if @ic is CASE_SENSITIVE) |
144 | * @upcase_len: upcase table size (ignored if @ic is CASE_SENSITIVE) |
145 | * |
146 | * -1 if the first name collates before the second one, |
147 | * 0 if the names match, |
148 | * 1 if the second name collates before the first one, or |
149 | * |
150 | */ |
151 | int ntfs_names_full_collate(const ntfschar *name1, const u32 name1_len, |
152 | const ntfschar *name2, const u32 name2_len, |
153 | const IGNORE_CASE_BOOL ic, const ntfschar *upcase, |
154 | const u32 upcase_len) |
155 | { |
156 | u32 cnt; |
157 | u16 c1, c2; |
158 | u16 u1, u2; |
159 | |
160 | #ifdef DEBUG |
161 | if (!name1 || !name2 || (ic && (!upcase || !upcase_len))) { |
162 | ntfs_log_debug("ntfs_names_collate received NULL pointer!\n"); |
163 | exit(1); |
164 | } |
165 | #endif |
166 | cnt = min(name1_len, name2_len); |
167 | if (cnt > 0) { |
168 | if (ic == CASE_SENSITIVE) { |
169 | do { |
170 | c1 = le16_to_cpu(*name1); |
171 | name1++; |
172 | c2 = le16_to_cpu(*name2); |
173 | name2++; |
174 | } while (--cnt && (c1 == c2)); |
175 | u1 = c1; |
176 | u2 = c2; |
177 | if (u1 < upcase_len) |
178 | u1 = le16_to_cpu(upcase[u1]); |
179 | if (u2 < upcase_len) |
180 | u2 = le16_to_cpu(upcase[u2]); |
181 | if ((u1 == u2) && cnt) |
182 | do { |
183 | u1 = le16_to_cpu(*name1); |
184 | name1++; |
185 | u2 = le16_to_cpu(*name2); |
186 | name2++; |
187 | if (u1 < upcase_len) |
188 | u1 = le16_to_cpu(upcase[u1]); |
189 | if (u2 < upcase_len) |
190 | u2 = le16_to_cpu(upcase[u2]); |
191 | } while ((u1 == u2) && --cnt); |
192 | if (u1 < u2) |
193 | return -1; |
194 | if (u1 > u2) |
195 | return 1; |
196 | if (name1_len < name2_len) |
197 | return -1; |
198 | if (name1_len > name2_len) |
199 | return 1; |
200 | if (c1 < c2) |
201 | return -1; |
202 | if (c1 > c2) |
203 | return 1; |
204 | } else { |
205 | do { |
206 | u1 = c1 = le16_to_cpu(*name1); |
207 | name1++; |
208 | u2 = c2 = le16_to_cpu(*name2); |
209 | name2++; |
210 | if (u1 < upcase_len) |
211 | u1 = le16_to_cpu(upcase[u1]); |
212 | if (u2 < upcase_len) |
213 | u2 = le16_to_cpu(upcase[u2]); |
214 | } while ((u1 == u2) && --cnt); |
215 | if (u1 < u2) |
216 | return -1; |
217 | if (u1 > u2) |
218 | return 1; |
219 | if (name1_len < name2_len) |
220 | return -1; |
221 | if (name1_len > name2_len) |
222 | return 1; |
223 | } |
224 | } else { |
225 | if (name1_len < name2_len) |
226 | return -1; |
227 | if (name1_len > name2_len) |
228 | return 1; |
229 | } |
230 | return 0; |
231 | } |
232 | |
233 | /** |
234 | * ntfs_ucsncmp - compare two little endian Unicode strings |
235 | * @s1: first string |
236 | * @s2: second string |
237 | * @n: maximum unicode characters to compare |
238 | * |
239 | * Compare the first @n characters of the Unicode strings @s1 and @s2, |
240 | * The strings in little endian format and appropriate le16_to_cpu() |
241 | * conversion is performed on non-little endian machines. |
242 | * |
243 | * The function returns an integer less than, equal to, or greater than zero |
244 | * if @s1 (or the first @n Unicode characters thereof) is found, respectively, |
245 | * to be less than, to match, or be greater than @s2. |
246 | */ |
247 | int ntfs_ucsncmp(const ntfschar *s1, const ntfschar *s2, size_t n) |
248 | { |
249 | ntfschar c1, c2; |
250 | size_t i; |
251 | |
252 | #ifdef DEBUG |
253 | if (!s1 || !s2) { |
254 | ntfs_log_debug("ntfs_wcsncmp() received NULL pointer!\n"); |
255 | exit(1); |
256 | } |
257 | #endif |
258 | for (i = 0; i < n; ++i) { |
259 | c1 = le16_to_cpu(s1[i]); |
260 | c2 = le16_to_cpu(s2[i]); |
261 | if (c1 < c2) |
262 | return -1; |
263 | if (c1 > c2) |
264 | return 1; |
265 | if (!c1) |
266 | break; |
267 | } |
268 | return 0; |
269 | } |
270 | |
271 | /** |
272 | * ntfs_ucsncasecmp - compare two little endian Unicode strings, ignoring case |
273 | * @s1: first string |
274 | * @s2: second string |
275 | * @n: maximum unicode characters to compare |
276 | * @upcase: upcase table |
277 | * @upcase_size: upcase table size in Unicode characters |
278 | * |
279 | * Compare the first @n characters of the Unicode strings @s1 and @s2, |
280 | * ignoring case. The strings in little endian format and appropriate |
281 | * le16_to_cpu() conversion is performed on non-little endian machines. |
282 | * |
283 | * Each character is uppercased using the @upcase table before the comparison. |
284 | * |
285 | * The function returns an integer less than, equal to, or greater than zero |
286 | * if @s1 (or the first @n Unicode characters thereof) is found, respectively, |
287 | * to be less than, to match, or be greater than @s2. |
288 | */ |
289 | int ntfs_ucsncasecmp(const ntfschar *s1, const ntfschar *s2, size_t n, |
290 | const ntfschar *upcase, const u32 upcase_size) |
291 | { |
292 | u16 c1, c2; |
293 | size_t i; |
294 | |
295 | #ifdef DEBUG |
296 | if (!s1 || !s2 || !upcase) { |
297 | ntfs_log_debug("ntfs_wcsncasecmp() received NULL pointer!\n"); |
298 | exit(1); |
299 | } |
300 | #endif |
301 | for (i = 0; i < n; ++i) { |
302 | if ((c1 = le16_to_cpu(s1[i])) < upcase_size) |
303 | c1 = le16_to_cpu(upcase[c1]); |
304 | if ((c2 = le16_to_cpu(s2[i])) < upcase_size) |
305 | c2 = le16_to_cpu(upcase[c2]); |
306 | if (c1 < c2) |
307 | return -1; |
308 | if (c1 > c2) |
309 | return 1; |
310 | if (!c1) |
311 | break; |
312 | } |
313 | return 0; |
314 | } |
315 | |
316 | /** |
317 | * ntfs_ucsnlen - determine the length of a little endian Unicode string |
318 | * @s: pointer to Unicode string |
319 | * @maxlen: maximum length of string @s |
320 | * |
321 | * Return the number of Unicode characters in the little endian Unicode |
322 | * string @s up to a maximum of maxlen Unicode characters, not including |
323 | * the terminating (ntfschar)'\0'. If there is no (ntfschar)'\0' between @s |
324 | * and @s + @maxlen, @maxlen is returned. |
325 | * |
326 | * This function never looks beyond @s + @maxlen. |
327 | */ |
328 | u32 ntfs_ucsnlen(const ntfschar *s, u32 maxlen) |
329 | { |
330 | u32 i; |
331 | |
332 | for (i = 0; i < maxlen; i++) { |
333 | if (!le16_to_cpu(s[i])) |
334 | break; |
335 | } |
336 | return i; |
337 | } |
338 | |
339 | /** |
340 | * ntfs_ucsndup - duplicate little endian Unicode string |
341 | * @s: pointer to Unicode string |
342 | * @maxlen: maximum length of string @s |
343 | * |
344 | * Return a pointer to a new little endian Unicode string which is a duplicate |
345 | * of the string s. Memory for the new string is obtained with ntfs_malloc(3), |
346 | * and can be freed with free(3). |
347 | * |
348 | * A maximum of @maxlen Unicode characters are copied and a terminating |
349 | * (ntfschar)'\0' little endian Unicode character is added. |
350 | * |
351 | * This function never looks beyond @s + @maxlen. |
352 | * |
353 | * Return a pointer to the new little endian Unicode string on success and NULL |
354 | * on failure with errno set to the error code. |
355 | */ |
356 | ntfschar *ntfs_ucsndup(const ntfschar *s, u32 maxlen) |
357 | { |
358 | ntfschar *dst; |
359 | u32 len; |
360 | |
361 | len = ntfs_ucsnlen(s, maxlen); |
362 | dst = ntfs_malloc((len + 1) * sizeof(ntfschar)); |
363 | if (dst) { |
364 | memcpy(dst, s, len * sizeof(ntfschar)); |
365 | dst[len] = cpu_to_le16(L'\0'); |
366 | } |
367 | return dst; |
368 | } |
369 | |
370 | /** |
371 | * ntfs_name_upcase - Map an Unicode name to its uppercase equivalent |
372 | * @name: |
373 | * @name_len: |
374 | * @upcase: |
375 | * @upcase_len: |
376 | * |
377 | * Description... |
378 | * |
379 | * Returns: |
380 | */ |
381 | void ntfs_name_upcase(ntfschar *name, u32 name_len, const ntfschar *upcase, |
382 | const u32 upcase_len) |
383 | { |
384 | u32 i; |
385 | u16 u; |
386 | |
387 | for (i = 0; i < name_len; i++) |
388 | if ((u = le16_to_cpu(name[i])) < upcase_len) |
389 | name[i] = upcase[u]; |
390 | } |
391 | |
392 | /** |
393 | * ntfs_file_value_upcase - Convert a filename to upper case |
394 | * @file_name_attr: |
395 | * @upcase: |
396 | * @upcase_len: |
397 | * |
398 | * Description... |
399 | * |
400 | * Returns: |
401 | */ |
402 | void ntfs_file_value_upcase(FILE_NAME_ATTR *file_name_attr, |
403 | const ntfschar *upcase, const u32 upcase_len) |
404 | { |
405 | ntfs_name_upcase((ntfschar*)&file_name_attr->file_name, |
406 | file_name_attr->file_name_length, upcase, upcase_len); |
407 | } |
408 | |
/*
   NTFS uses Unicode (UTF-16LE [NTFS-3G uses UCS-2LE, which is enough
   for now]) for path names, but the Unicode code points need to be
   converted before a path can be accessed under NTFS. For 7-bit ASCII/ANSI,
   glibc does this even without a locale, in a hard-coded fashion, as that
   appears to be easy because the low 7-bit ASCII range appears to be
   available in all charsets. However, it does not convert anything if
   there was some error with the locale setup, or if no locale was set up,
   as happens when mount is called during early boot, where (by policy)
   locales are not used (and may not be available if /usr is not yet
   mounted). This patch fixes the resulting issues for systems which use
   UTF-8; for others, specifying the locale in fstab brings them
   the encoding which they want.

   If no locale is defined or there was a problem with setting one
   up, and whenever nl_langinfo(CODESET) returns a string starting with
   "ANSI", use an internal UCS-2LE <-> UTF-8 codeset converter to fix
   the bug where NTFS-3G, as a result, does not show any path names which
   include international characters!!! (and also fails on creating them).

   Author: Bernhard Kaindl <bk@suse.de>
   Jean-Pierre Andre made it compliant with RFC3629/RFC2781.
*/
432 | |
433 | /* |
434 | * Return the amount of 8-bit elements in UTF-8 needed (without the terminating |
435 | * null) to store a given UTF-16LE string. |
436 | * |
437 | * Return -1 with errno set if string has invalid byte sequence or too long. |
438 | */ |
439 | static int utf16_to_utf8_size(const ntfschar *ins, const int ins_len, int outs_len) |
440 | { |
441 | int i, ret = -1; |
442 | int count = 0; |
443 | BOOL surrog; |
444 | |
445 | surrog = FALSE; |
446 | for (i = 0; i < ins_len && ins[i]; i++) { |
447 | unsigned short c = le16_to_cpu(ins[i]); |
448 | if (surrog) { |
449 | if ((c >= 0xdc00) && (c < 0xe000)) { |
450 | surrog = FALSE; |
451 | count += 4; |
452 | } else |
453 | goto fail; |
454 | } else |
455 | if (c < 0x80) |
456 | count++; |
457 | else if (c < 0x800) |
458 | count += 2; |
459 | else if (c < 0xd800) |
460 | count += 3; |
461 | else if (c < 0xdc00) |
462 | surrog = TRUE; |
463 | #if NOREVBOM |
464 | else if ((c >= 0xe000) && (c < 0xfffe)) |
465 | #else |
466 | else if (c >= 0xe000) |
467 | #endif |
468 | count += 3; |
469 | else |
470 | goto fail; |
471 | if (count > outs_len) { |
472 | errno = ENAMETOOLONG; |
473 | goto out; |
474 | } |
475 | } |
476 | if (surrog) |
477 | goto fail; |
478 | |
479 | ret = count; |
480 | out: |
481 | return ret; |
482 | fail: |
483 | errno = EILSEQ; |
484 | goto out; |
485 | } |
486 | |
487 | /* |
488 | * ntfs_utf16_to_utf8 - convert a little endian UTF16LE string to an UTF-8 string |
489 | * @ins: input utf16 string buffer |
490 | * @ins_len: length of input string in utf16 characters |
491 | * @outs: on return contains the (allocated) output multibyte string |
492 | * @outs_len: length of output buffer in bytes |
493 | * |
494 | * Return -1 with errno set if string has invalid byte sequence or too long. |
495 | */ |
496 | static int ntfs_utf16_to_utf8(const ntfschar *ins, const int ins_len, |
497 | char **outs, int outs_len) |
498 | { |
499 | #if defined(__APPLE__) || defined(__DARWIN__) |
500 | #ifdef ENABLE_NFCONV |
501 | char *original_outs_value = *outs; |
502 | int original_outs_len = outs_len; |
503 | #endif /* ENABLE_NFCONV */ |
504 | #endif /* defined(__APPLE__) || defined(__DARWIN__) */ |
505 | |
506 | char *t; |
507 | int i, size, ret = -1; |
508 | int halfpair; |
509 | |
510 | halfpair = 0; |
511 | if (!*outs) |
512 | outs_len = PATH_MAX; |
513 | |
514 | size = utf16_to_utf8_size(ins, ins_len, outs_len); |
515 | |
516 | if (size < 0) |
517 | goto out; |
518 | |
519 | if (!*outs) { |
520 | outs_len = size + 1; |
521 | *outs = ntfs_malloc(outs_len); |
522 | if (!*outs) |
523 | goto out; |
524 | } |
525 | |
526 | t = *outs; |
527 | |
528 | for (i = 0; i < ins_len && ins[i]; i++) { |
529 | unsigned short c = le16_to_cpu(ins[i]); |
530 | /* size not double-checked */ |
531 | if (halfpair) { |
532 | if ((c >= 0xdc00) && (c < 0xe000)) { |
533 | *t++ = 0xf0 + (((halfpair + 64) >> 8) & 7); |
534 | *t++ = 0x80 + (((halfpair + 64) >> 2) & 63); |
535 | *t++ = 0x80 + ((c >> 6) & 15) + ((halfpair & 3) << 4); |
536 | *t++ = 0x80 + (c & 63); |
537 | halfpair = 0; |
538 | } else |
539 | goto fail; |
540 | } else if (c < 0x80) { |
541 | *t++ = c; |
542 | } else { |
543 | if (c < 0x800) { |
544 | *t++ = (0xc0 | ((c >> 6) & 0x3f)); |
545 | *t++ = 0x80 | (c & 0x3f); |
546 | } else if (c < 0xd800) { |
547 | *t++ = 0xe0 | (c >> 12); |
548 | *t++ = 0x80 | ((c >> 6) & 0x3f); |
549 | *t++ = 0x80 | (c & 0x3f); |
550 | } else if (c < 0xdc00) |
551 | halfpair = c; |
552 | else if (c >= 0xe000) { |
553 | *t++ = 0xe0 | (c >> 12); |
554 | *t++ = 0x80 | ((c >> 6) & 0x3f); |
555 | *t++ = 0x80 | (c & 0x3f); |
556 | } else |
557 | goto fail; |
558 | } |
559 | } |
560 | *t = '\0'; |
561 | |
562 | #if defined(__APPLE__) || defined(__DARWIN__) |
563 | #ifdef ENABLE_NFCONV |
564 | if(nfconvert_utf8 && (t - *outs) > 0) { |
565 | char *new_outs = NULL; |
566 | int new_outs_len = ntfs_macosx_normalize_utf8(*outs, &new_outs, 0); // Normalize to decomposed form |
567 | if(new_outs_len >= 0 && new_outs != NULL) { |
568 | if(original_outs_value != *outs) { |
569 | // We have allocated outs ourselves. |
570 | free(*outs); |
571 | *outs = new_outs; |
572 | t = *outs + new_outs_len; |
573 | } |
574 | else { |
575 | // We need to copy new_outs into the fixed outs buffer. |
576 | memset(*outs, 0, original_outs_len); |
577 | strncpy(*outs, new_outs, original_outs_len-1); |
578 | t = *outs + original_outs_len; |
579 | free(new_outs); |
580 | } |
581 | } |
582 | else { |
583 | ntfs_log_error("Failed to normalize NTFS string to UTF-8 NFD: %s\n", *outs); |
584 | ntfs_log_error(" new_outs=0x%p\n", new_outs); |
585 | ntfs_log_error(" new_outs_len=%d\n", new_outs_len); |
586 | } |
587 | } |
588 | #endif /* ENABLE_NFCONV */ |
589 | #endif /* defined(__APPLE__) || defined(__DARWIN__) */ |
590 | |
591 | ret = t - *outs; |
592 | out: |
593 | return ret; |
594 | fail: |
595 | errno = EILSEQ; |
596 | goto out; |
597 | } |
598 | |
/*
 * Return the amount of 16-bit elements in UTF-16LE needed
 * (without the terminating null) to store given UTF-8 string.
 *
 * Every sequence counts as one element, except sequences whose lead byte is
 * 0xF0 or above, which count as two (a surrogate pair).  Continuation bytes
 * are skipped without being examined, and the scan simply stops if the string
 * ends inside a sequence.
 *
 * Return -1 with errno set to ENAMETOOLONG if the count reaches PATH_MAX, or
 * to EILSEQ if a lead byte of 0xF5 or above is met.
 *
 * Note: This does not check whether the input sequence is a valid utf8 string,
 *	 and should be used only in context where such check is made!
 */
static int utf8_to_utf16_size(const char *s)
{
	const unsigned char *p = (const unsigned char *)s;
	size_t units = 0;
	unsigned int lead;

	for (;;) {
		lead = *p++;
		if (!lead)
			break;
		if (++units >= PATH_MAX)
			goto too_long;
		if (lead < 0xc0)
			continue;
		if (lead >= 0xF5) {
			errno = EILSEQ;
			return -1;
		}
		/* Step over the continuation bytes, stopping at the end. */
		if (!*p)
			break;
		p++;
		if (!*p)
			break;
		if (lead >= 0xE0)
			p++;
		if (!*p)
			break;
		if (lead >= 0xF0) {
			p++;
			/* Four-byte sequences need a second element. */
			if (++units >= PATH_MAX)
				goto too_long;
		}
	}
	return (int)units;

too_long:
	errno = ENAMETOOLONG;
	return -1;
}
646 | /* |
647 | * This converts one UTF-8 sequence to cpu-endian Unicode value |
648 | * within range U+0 .. U+10ffff and excluding U+D800 .. U+DFFF |
649 | * |
650 | * Return the number of used utf8 bytes or -1 with errno set |
651 | * if sequence is invalid. |
652 | */ |
653 | static int utf8_to_unicode(u32 *wc, const char *s) |
654 | { |
655 | unsigned int byte = *((const unsigned char *)s); |
656 | |
657 | /* single byte */ |
658 | if (byte == 0) { |
659 | *wc = (u32) 0; |
660 | return 0; |
661 | } else if (byte < 0x80) { |
662 | *wc = (u32) byte; |
663 | return 1; |
664 | /* double byte */ |
665 | } else if (byte < 0xc2) { |
666 | goto fail; |
667 | } else if (byte < 0xE0) { |
668 | if ((s[1] & 0xC0) == 0x80) { |
669 | *wc = ((u32)(byte & 0x1F) << 6) |
670 | | ((u32)(s[1] & 0x3F)); |
671 | return 2; |
672 | } else |
673 | goto fail; |
674 | /* three-byte */ |
675 | } else if (byte < 0xF0) { |
676 | if (((s[1] & 0xC0) == 0x80) && ((s[2] & 0xC0) == 0x80)) { |
677 | *wc = ((u32)(byte & 0x0F) << 12) |
678 | | ((u32)(s[1] & 0x3F) << 6) |
679 | | ((u32)(s[2] & 0x3F)); |
680 | /* Check valid ranges */ |
681 | #if NOREVBOM |
682 | if (((*wc >= 0x800) && (*wc <= 0xD7FF)) |
683 | || ((*wc >= 0xe000) && (*wc <= 0xFFFD))) |
684 | return 3; |
685 | #else |
686 | if (((*wc >= 0x800) && (*wc <= 0xD7FF)) |
687 | || ((*wc >= 0xe000) && (*wc <= 0xFFFF))) |
688 | return 3; |
689 | #endif |
690 | } |
691 | goto fail; |
692 | /* four-byte */ |
693 | } else if (byte < 0xF5) { |
694 | if (((s[1] & 0xC0) == 0x80) && ((s[2] & 0xC0) == 0x80) |
695 | && ((s[3] & 0xC0) == 0x80)) { |
696 | *wc = ((u32)(byte & 0x07) << 18) |
697 | | ((u32)(s[1] & 0x3F) << 12) |
698 | | ((u32)(s[2] & 0x3F) << 6) |
699 | | ((u32)(s[3] & 0x3F)); |
700 | /* Check valid ranges */ |
701 | if ((*wc <= 0x10ffff) && (*wc >= 0x10000)) |
702 | return 4; |
703 | } |
704 | goto fail; |
705 | } |
706 | fail: |
707 | errno = EILSEQ; |
708 | return -1; |
709 | } |
710 | |
711 | /** |
712 | * ntfs_utf8_to_utf16 - convert a UTF-8 string to a UTF-16LE string |
713 | * @ins: input multibyte string buffer |
714 | * @outs: on return contains the (allocated) output utf16 string |
715 | * @outs_len: length of output buffer in utf16 characters |
716 | * |
717 | * Return -1 with errno set. |
718 | */ |
719 | static int ntfs_utf8_to_utf16(const char *ins, ntfschar **outs) |
720 | { |
721 | #if defined(__APPLE__) || defined(__DARWIN__) |
722 | #ifdef ENABLE_NFCONV |
723 | char *new_ins = NULL; |
724 | if(nfconvert_utf8) { |
725 | int new_ins_len; |
726 | new_ins_len = ntfs_macosx_normalize_utf8(ins, &new_ins, 1); // Normalize to composed form |
727 | if(new_ins_len >= 0) |
728 | ins = new_ins; |
729 | else |
730 | ntfs_log_error("Failed to normalize NTFS string to UTF-8 NFC: %s\n", ins); |
731 | } |
732 | #endif /* ENABLE_NFCONV */ |
733 | #endif /* defined(__APPLE__) || defined(__DARWIN__) */ |
734 | const char *t = ins; |
735 | u32 wc; |
736 | BOOL allocated; |
737 | ntfschar *outpos; |
738 | int shorts, ret = -1; |
739 | |
740 | shorts = utf8_to_utf16_size(ins); |
741 | if (shorts < 0) |
742 | goto fail; |
743 | |
744 | allocated = FALSE; |
745 | if (!*outs) { |
746 | *outs = ntfs_malloc((shorts + 1) * sizeof(ntfschar)); |
747 | if (!*outs) |
748 | goto fail; |
749 | allocated = TRUE; |
750 | } |
751 | |
752 | outpos = *outs; |
753 | |
754 | while(1) { |
755 | int m = utf8_to_unicode(&wc, t); |
756 | if (m <= 0) { |
757 | if (m < 0) { |
758 | /* do not leave space allocated if failed */ |
759 | if (allocated) { |
760 | free(*outs); |
761 | *outs = (ntfschar*)NULL; |
762 | } |
763 | goto fail; |
764 | } |
765 | *outpos++ = const_cpu_to_le16(0); |
766 | break; |
767 | } |
768 | if (wc < 0x10000) |
769 | *outpos++ = cpu_to_le16(wc); |
770 | else { |
771 | wc -= 0x10000; |
772 | *outpos++ = cpu_to_le16((wc >> 10) + 0xd800); |
773 | *outpos++ = cpu_to_le16((wc & 0x3ff) + 0xdc00); |
774 | } |
775 | t += m; |
776 | } |
777 | |
778 | ret = --outpos - *outs; |
779 | fail: |
780 | #if defined(__APPLE__) || defined(__DARWIN__) |
781 | #ifdef ENABLE_NFCONV |
782 | if(new_ins != NULL) |
783 | free(new_ins); |
784 | #endif /* ENABLE_NFCONV */ |
785 | #endif /* defined(__APPLE__) || defined(__DARWIN__) */ |
786 | return ret; |
787 | } |
788 | |
789 | /** |
790 | * ntfs_ucstombs - convert a little endian Unicode string to a multibyte string |
791 | * @ins: input Unicode string buffer |
792 | * @ins_len: length of input string in Unicode characters |
793 | * @outs: on return contains the (allocated) output multibyte string |
794 | * @outs_len: length of output buffer in bytes |
795 | * |
796 | * Convert the input little endian, 2-byte Unicode string @ins, of length |
797 | * @ins_len into the multibyte string format dictated by the current locale. |
798 | * |
799 | * If *@outs is NULL, the function allocates the string and the caller is |
800 | * responsible for calling free(*@outs); when finished with it. |
801 | * |
802 | * On success the function returns the number of bytes written to the output |
803 | * string *@outs (>= 0), not counting the terminating NULL byte. If the output |
804 | * string buffer was allocated, *@outs is set to it. |
805 | * |
806 | * On error, -1 is returned, and errno is set to the error code. The following |
807 | * error codes can be expected: |
808 | * EINVAL Invalid arguments (e.g. @ins or @outs is NULL). |
809 | * EILSEQ The input string cannot be represented as a multibyte |
810 | * sequence according to the current locale. |
811 | * ENAMETOOLONG Destination buffer is too small for input string. |
812 | * ENOMEM Not enough memory to allocate destination buffer. |
813 | */ |
814 | int ntfs_ucstombs(const ntfschar *ins, const int ins_len, char **outs, |
815 | int outs_len) |
816 | { |
817 | char *mbs; |
818 | wchar_t wc; |
819 | int i, o, mbs_len; |
820 | int cnt = 0; |
821 | #ifdef HAVE_MBSINIT |
822 | mbstate_t mbstate; |
823 | #endif |
824 | |
825 | if (!ins || !outs) { |
826 | errno = EINVAL; |
827 | return -1; |
828 | } |
829 | mbs = *outs; |
830 | mbs_len = outs_len; |
831 | if (mbs && !mbs_len) { |
832 | errno = ENAMETOOLONG; |
833 | return -1; |
834 | } |
835 | if (use_utf8) |
836 | return ntfs_utf16_to_utf8(ins, ins_len, outs, outs_len); |
837 | if (!mbs) { |
838 | mbs_len = (ins_len + 1) * MB_CUR_MAX; |
839 | mbs = ntfs_malloc(mbs_len); |
840 | if (!mbs) |
841 | return -1; |
842 | } |
843 | #ifdef HAVE_MBSINIT |
844 | memset(&mbstate, 0, sizeof(mbstate)); |
845 | #else |
846 | wctomb(NULL, 0); |
847 | #endif |
848 | for (i = o = 0; i < ins_len; i++) { |
849 | /* Reallocate memory if necessary or abort. */ |
850 | if ((int)(o + MB_CUR_MAX) > mbs_len) { |
851 | char *tc; |
852 | if (mbs == *outs) { |
853 | errno = ENAMETOOLONG; |
854 | return -1; |
855 | } |
856 | tc = ntfs_malloc((mbs_len + 64) & ~63); |
857 | if (!tc) |
858 | goto err_out; |
859 | memcpy(tc, mbs, mbs_len); |
860 | mbs_len = (mbs_len + 64) & ~63; |
861 | free(mbs); |
862 | mbs = tc; |
863 | } |
864 | /* Convert the LE Unicode character to a CPU wide character. */ |
865 | wc = (wchar_t)le16_to_cpu(ins[i]); |
866 | if (!wc) |
867 | break; |
868 | /* Convert the CPU endian wide character to multibyte. */ |
869 | #ifdef HAVE_MBSINIT |
870 | cnt = wcrtomb(mbs + o, wc, &mbstate); |
871 | #else |
872 | cnt = wctomb(mbs + o, wc); |
873 | #endif |
874 | if (cnt == -1) |
875 | goto err_out; |
876 | if (cnt <= 0) { |
877 | ntfs_log_debug("Eeek. cnt <= 0, cnt = %i\n", cnt); |
878 | errno = EINVAL; |
879 | goto err_out; |
880 | } |
881 | o += cnt; |
882 | } |
883 | #ifdef HAVE_MBSINIT |
884 | /* Make sure we are back in the initial state. */ |
885 | if (!mbsinit(&mbstate)) { |
886 | ntfs_log_debug("Eeek. mbstate not in initial state!\n"); |
887 | errno = EILSEQ; |
888 | goto err_out; |
889 | } |
890 | #endif |
891 | /* Now write the NULL character. */ |
892 | mbs[o] = '\0'; |
893 | if (*outs != mbs) |
894 | *outs = mbs; |
895 | return o; |
896 | err_out: |
897 | if (mbs != *outs) { |
898 | int eo = errno; |
899 | free(mbs); |
900 | errno = eo; |
901 | } |
902 | return -1; |
903 | } |
904 | |
905 | /** |
906 | * ntfs_mbstoucs - convert a multibyte string to a little endian Unicode string |
907 | * @ins: input multibyte string buffer |
908 | * @outs: on return contains the (allocated) output Unicode string |
909 | * |
910 | * Convert the input multibyte string @ins, from the current locale into the |
911 | * corresponding little endian, 2-byte Unicode string. |
912 | * |
913 | * The function allocates the string and the caller is responsible for calling |
914 | * free(*@outs); when finished with it. |
915 | * |
916 | * On success the function returns the number of Unicode characters written to |
917 | * the output string *@outs (>= 0), not counting the terminating Unicode NULL |
918 | * character. |
919 | * |
920 | * On error, -1 is returned, and errno is set to the error code. The following |
921 | * error codes can be expected: |
922 | * EINVAL Invalid arguments (e.g. @ins or @outs is NULL). |
923 | * EILSEQ The input string cannot be represented as a Unicode |
924 | * string according to the current locale. |
925 | * ENAMETOOLONG Destination buffer is too small for input string. |
926 | * ENOMEM Not enough memory to allocate destination buffer. |
927 | */ |
928 | int ntfs_mbstoucs(const char *ins, ntfschar **outs) |
929 | { |
930 | ntfschar *ucs; |
931 | const char *s; |
932 | wchar_t wc; |
933 | int i, o, cnt, ins_len, ucs_len, ins_size; |
934 | #ifdef HAVE_MBSINIT |
935 | mbstate_t mbstate; |
936 | #endif |
937 | |
938 | if (!ins || !outs) { |
939 | errno = EINVAL; |
940 | return -1; |
941 | } |
942 | |
943 | if (use_utf8) |
944 | return ntfs_utf8_to_utf16(ins, outs); |
945 | |
946 | /* Determine the size of the multi-byte string in bytes. */ |
947 | ins_size = strlen(ins); |
948 | /* Determine the length of the multi-byte string. */ |
949 | s = ins; |
950 | #if defined(HAVE_MBSINIT) |
951 | memset(&mbstate, 0, sizeof(mbstate)); |
952 | ins_len = mbsrtowcs(NULL, (const char **)&s, 0, &mbstate); |
953 | #ifdef __CYGWIN32__ |
954 | if (!ins_len && *ins) { |
955 | /* Older Cygwin had broken mbsrtowcs() implementation. */ |
956 | ins_len = strlen(ins); |
957 | } |
958 | #endif |
959 | #elif !defined(DJGPP) |
960 | ins_len = mbstowcs(NULL, s, 0); |
961 | #else |
962 | /* Eeek!!! DJGPP has broken mbstowcs() implementation!!! */ |
963 | ins_len = strlen(ins); |
964 | #endif |
965 | if (ins_len == -1) |
966 | return ins_len; |
967 | #ifdef HAVE_MBSINIT |
968 | if ((s != ins) || !mbsinit(&mbstate)) { |
969 | #else |
970 | if (s != ins) { |
971 | #endif |
972 | errno = EILSEQ; |
973 | return -1; |
974 | } |
975 | /* Add the NULL terminator. */ |
976 | ins_len++; |
977 | ucs_len = ins_len; |
978 | ucs = ntfs_malloc(ucs_len * sizeof(ntfschar)); |
979 | if (!ucs) |
980 | return -1; |
981 | #ifdef HAVE_MBSINIT |
982 | memset(&mbstate, 0, sizeof(mbstate)); |
983 | #else |
984 | mbtowc(NULL, NULL, 0); |
985 | #endif |
986 | for (i = o = cnt = 0; i < ins_size; i += cnt, o++) { |
987 | /* Reallocate memory if necessary. */ |
988 | if (o >= ucs_len) { |
989 | ntfschar *tc; |
990 | ucs_len = (ucs_len * sizeof(ntfschar) + 64) & ~63; |
991 | tc = realloc(ucs, ucs_len); |
992 | if (!tc) |
993 | goto err_out; |
994 | ucs = tc; |
995 | ucs_len /= sizeof(ntfschar); |
996 | } |
997 | /* Convert the multibyte character to a wide character. */ |
998 | #ifdef HAVE_MBSINIT |
999 | cnt = mbrtowc(&wc, ins + i, ins_size - i, &mbstate); |
1000 | #else |
1001 | cnt = mbtowc(&wc, ins + i, ins_size - i); |
1002 | #endif |
1003 | if (!cnt) |
1004 | break; |
1005 | if (cnt == -1) |
1006 | goto err_out; |
1007 | if (cnt < -1) { |
1008 | ntfs_log_trace("Eeek. cnt = %i\n", cnt); |
1009 | errno = EINVAL; |
1010 | goto err_out; |
1011 | } |
1012 | /* Make sure we are not overflowing the NTFS Unicode set. */ |
1013 | if ((unsigned long)wc >= (unsigned long)(1 << |
1014 | (8 * sizeof(ntfschar)))) { |
1015 | errno = EILSEQ; |
1016 | goto err_out; |
1017 | } |
1018 | /* Convert the CPU wide character to a LE Unicode character. */ |
1019 | ucs[o] = cpu_to_le16(wc); |
1020 | } |
1021 | #ifdef HAVE_MBSINIT |
1022 | /* Make sure we are back in the initial state. */ |
1023 | if (!mbsinit(&mbstate)) { |
1024 | ntfs_log_trace("Eeek. mbstate not in initial state!\n"); |
1025 | errno = EILSEQ; |
1026 | goto err_out; |
1027 | } |
1028 | #endif |
1029 | /* Now write the NULL character. */ |
1030 | ucs[o] = cpu_to_le16(L'\0'); |
1031 | *outs = ucs; |
1032 | return o; |
1033 | err_out: |
1034 | free(ucs); |
1035 | return -1; |
1036 | } |
1037 | |
1038 | /** |
1039 | * ntfs_upcase_table_build - build the default upcase table for NTFS |
1040 | * @uc: destination buffer where to store the built table |
1041 | * @uc_len: size of destination buffer in bytes |
1042 | * |
1043 | * ntfs_upcase_table_build() builds the default upcase table for NTFS and |
1044 | * stores it in the caller supplied buffer @uc of size @uc_len. |
1045 | * |
1046 | * Note, @uc_len must be at least 128kiB in size or bad things will happen! |
1047 | */ |
1048 | void ntfs_upcase_table_build(ntfschar *uc, u32 uc_len) |
1049 | { |
1050 | static int uc_run_table[][3] = { /* Start, End, Add */ |
1051 | {0x0061, 0x007B, -32}, {0x0451, 0x045D, -80}, {0x1F70, 0x1F72, 74}, |
1052 | {0x00E0, 0x00F7, -32}, {0x045E, 0x0460, -80}, {0x1F72, 0x1F76, 86}, |
1053 | {0x00F8, 0x00FF, -32}, {0x0561, 0x0587, -48}, {0x1F76, 0x1F78, 100}, |
1054 | {0x0256, 0x0258, -205}, {0x1F00, 0x1F08, 8}, {0x1F78, 0x1F7A, 128}, |
1055 | {0x028A, 0x028C, -217}, {0x1F10, 0x1F16, 8}, {0x1F7A, 0x1F7C, 112}, |
1056 | {0x03AC, 0x03AD, -38}, {0x1F20, 0x1F28, 8}, {0x1F7C, 0x1F7E, 126}, |
1057 | {0x03AD, 0x03B0, -37}, {0x1F30, 0x1F38, 8}, {0x1FB0, 0x1FB2, 8}, |
1058 | {0x03B1, 0x03C2, -32}, {0x1F40, 0x1F46, 8}, {0x1FD0, 0x1FD2, 8}, |
1059 | {0x03C2, 0x03C3, -31}, {0x1F51, 0x1F52, 8}, {0x1FE0, 0x1FE2, 8}, |
1060 | {0x03C3, 0x03CC, -32}, {0x1F53, 0x1F54, 8}, {0x1FE5, 0x1FE6, 7}, |
1061 | {0x03CC, 0x03CD, -64}, {0x1F55, 0x1F56, 8}, {0x2170, 0x2180, -16}, |
1062 | {0x03CD, 0x03CF, -63}, {0x1F57, 0x1F58, 8}, {0x24D0, 0x24EA, -26}, |
1063 | {0x0430, 0x0450, -32}, {0x1F60, 0x1F68, 8}, {0xFF41, 0xFF5B, -32}, |
1064 | {0} |
1065 | }; |
1066 | static int uc_dup_table[][2] = { /* Start, End */ |
1067 | {0x0100, 0x012F}, {0x01A0, 0x01A6}, {0x03E2, 0x03EF}, {0x04CB, 0x04CC}, |
1068 | {0x0132, 0x0137}, {0x01B3, 0x01B7}, {0x0460, 0x0481}, {0x04D0, 0x04EB}, |
1069 | {0x0139, 0x0149}, {0x01CD, 0x01DD}, {0x0490, 0x04BF}, {0x04EE, 0x04F5}, |
1070 | {0x014A, 0x0178}, {0x01DE, 0x01EF}, {0x04BF, 0x04BF}, {0x04F8, 0x04F9}, |
1071 | {0x0179, 0x017E}, {0x01F4, 0x01F5}, {0x04C1, 0x04C4}, {0x1E00, 0x1E95}, |
1072 | {0x018B, 0x018B}, {0x01FA, 0x0218}, {0x04C7, 0x04C8}, {0x1EA0, 0x1EF9}, |
1073 | {0} |
1074 | }; |
1075 | static int uc_byte_table[][2] = { /* Offset, Value */ |
1076 | {0x00FF, 0x0178}, {0x01AD, 0x01AC}, {0x01F3, 0x01F1}, {0x0269, 0x0196}, |
1077 | {0x0183, 0x0182}, {0x01B0, 0x01AF}, {0x0253, 0x0181}, {0x026F, 0x019C}, |
1078 | {0x0185, 0x0184}, {0x01B9, 0x01B8}, {0x0254, 0x0186}, {0x0272, 0x019D}, |
1079 | {0x0188, 0x0187}, {0x01BD, 0x01BC}, {0x0259, 0x018F}, {0x0275, 0x019F}, |
1080 | {0x018C, 0x018B}, {0x01C6, 0x01C4}, {0x025B, 0x0190}, {0x0283, 0x01A9}, |
1081 | {0x0192, 0x0191}, {0x01C9, 0x01C7}, {0x0260, 0x0193}, {0x0288, 0x01AE}, |
1082 | {0x0199, 0x0198}, {0x01CC, 0x01CA}, {0x0263, 0x0194}, {0x0292, 0x01B7}, |
1083 | {0x01A8, 0x01A7}, {0x01DD, 0x018E}, {0x0268, 0x0197}, |
1084 | {0} |
1085 | }; |
1086 | int i, r; |
1087 | int k, off; |
1088 | |
1089 | memset((char*)uc, 0, uc_len); |
1090 | uc_len >>= 1; |
1091 | if (uc_len > 65536) |
1092 | uc_len = 65536; |
1093 | for (i = 0; (u32)i < uc_len; i++) |
1094 | uc[i] = cpu_to_le16(i); |
1095 | for (r = 0; uc_run_table[r][0]; r++) { |
1096 | off = uc_run_table[r][2]; |
1097 | for (i = uc_run_table[r][0]; i < uc_run_table[r][1]; i++) |
1098 | uc[i] = cpu_to_le16(i + off); |
1099 | } |
1100 | for (r = 0; uc_dup_table[r][0]; r++) |
1101 | for (i = uc_dup_table[r][0]; i < uc_dup_table[r][1]; i += 2) |
1102 | uc[i + 1] = cpu_to_le16(i); |
1103 | for (r = 0; uc_byte_table[r][0]; r++) { |
1104 | k = uc_byte_table[r][1]; |
1105 | uc[uc_byte_table[r][0]] = cpu_to_le16(k); |
1106 | } |
1107 | } |
1108 | |
1109 | /** |
1110 | * ntfs_str2ucs - convert a string to a valid NTFS file name |
1111 | * @s: input string |
1112 | * @len: length of output buffer in Unicode characters |
1113 | * |
1114 | * Convert the input @s string into the corresponding little endian, |
1115 | * 2-byte Unicode string. The length of the converted string is less |
1116 | * or equal to the maximum length allowed by the NTFS format (255). |
1117 | * |
1118 | * If @s is NULL then return AT_UNNAMED. |
1119 | * |
1120 | * On success the function returns the Unicode string in an allocated |
1121 | * buffer and the caller is responsible to free it when it's not needed |
1122 | * anymore. |
1123 | * |
1124 | * On error NULL is returned and errno is set to the error code. |
1125 | */ |
1126 | ntfschar *ntfs_str2ucs(const char *s, int *len) |
1127 | { |
1128 | ntfschar *ucs = NULL; |
1129 | |
1130 | if (s && ((*len = ntfs_mbstoucs(s, &ucs)) == -1)) { |
1131 | ntfs_log_perror("Couldn't convert '%s' to Unicode", s); |
1132 | return NULL; |
1133 | } |
1134 | if (*len > NTFS_MAX_NAME_LEN) { |
1135 | free(ucs); |
1136 | errno = ENAMETOOLONG; |
1137 | return NULL; |
1138 | } |
1139 | if (!ucs || !*len) { |
1140 | ucs = AT_UNNAMED; |
1141 | *len = 0; |
1142 | } |
1143 | return ucs; |
1144 | } |
1145 | |
1146 | /** |
1147 | * ntfs_ucsfree - free memory allocated by ntfs_str2ucs() |
1148 | * @ucs input string to be freed |
1149 | * |
1150 | * Free memory at @ucs and which was allocated by ntfs_str2ucs. |
1151 | * |
1152 | * Return value: none. |
1153 | */ |
1154 | void ntfs_ucsfree(ntfschar *ucs) |
1155 | { |
1156 | if (ucs && (ucs != AT_UNNAMED)) |
1157 | free(ucs); |
1158 | } |
1159 | |
1160 | /* |
1161 | * Check whether a name contains no chars forbidden |
1162 | * for DOS or Win32 use |
1163 | * |
1164 | * If there is a bad char, errno is set to EINVAL |
1165 | */ |
1166 | |
1167 | BOOL ntfs_forbidden_chars(const ntfschar *name, int len) |
1168 | { |
1169 | BOOL forbidden; |
1170 | int ch; |
1171 | int i; |
1172 | u32 mainset = (1L << ('\"' - 0x20)) |
1173 | | (1L << ('*' - 0x20)) |
1174 | | (1L << ('/' - 0x20)) |
1175 | | (1L << (':' - 0x20)) |
1176 | | (1L << ('<' - 0x20)) |
1177 | | (1L << ('>' - 0x20)) |
1178 | | (1L << ('?' - 0x20)); |
1179 | |
1180 | forbidden = (len == 0) || (le16_to_cpu(name[len-1]) == ' '); |
1181 | for (i=0; i<len; i++) { |
1182 | ch = le16_to_cpu(name[i]); |
1183 | if ((ch < 0x20) |
1184 | || ((ch < 0x40) |
1185 | && ((1L << (ch - 0x20)) & mainset)) |
1186 | || (ch == '\\') |
1187 | || (ch == '|')) |
1188 | forbidden = TRUE; |
1189 | } |
1190 | if (forbidden) |
1191 | errno = EINVAL; |
1192 | return (forbidden); |
1193 | } |
1194 | |
1195 | /* |
1196 | * Check whether the same name can be used as a DOS and |
1197 | * a Win32 name |
1198 | * |
1199 | * The names must be the same, or the short name the uppercase |
1200 | * variant of the long name |
1201 | */ |
1202 | |
1203 | BOOL ntfs_collapsible_chars(ntfs_volume *vol, |
1204 | const ntfschar *shortname, int shortlen, |
1205 | const ntfschar *longname, int longlen) |
1206 | { |
1207 | BOOL collapsible; |
1208 | unsigned int ch; |
1209 | int i; |
1210 | |
1211 | collapsible = shortlen == longlen; |
1212 | if (collapsible) |
1213 | for (i=0; i<shortlen; i++) { |
1214 | ch = le16_to_cpu(longname[i]); |
1215 | if ((ch >= vol->upcase_len) |
1216 | || ((shortname[i] != longname[i]) |
1217 | && (shortname[i] != vol->upcase[ch]))) |
1218 | collapsible = FALSE; |
1219 | } |
1220 | return (collapsible); |
1221 | } |
1222 | |
1223 | /* |
1224 | * Define the character encoding to be used. |
1225 | * Use UTF-8 unless specified otherwise. |
1226 | */ |
1227 | |
1228 | int ntfs_set_char_encoding(const char *locale) |
1229 | { |
1230 | use_utf8 = 0; |
1231 | if (!locale || strstr(locale,"utf8") || strstr(locale,"UTF8") |
1232 | || strstr(locale,"utf-8") || strstr(locale,"UTF-8")) |
1233 | use_utf8 = 1; |
1234 | else |
1235 | if (setlocale(LC_ALL, locale)) |
1236 | use_utf8 = 0; |
1237 | else { |
1238 | ntfs_log_error("Invalid locale, encoding to UTF-8\n"); |
1239 | use_utf8 = 1; |
1240 | } |
1241 | return 0; /* always successful */ |
1242 | } |
1243 | |
1244 | #if defined(__APPLE__) || defined(__DARWIN__) |
1245 | |
/*
 *		Enable or disable the normalization of file names
 *
 *	With ENABLE_NFCONV, @normalize must be 0 or 1 and is recorded
 *	in nfconvert_utf8 : returns 0 if accepted, -1 for any other value.
 *	Without ENABLE_NFCONV, always returns -1.
 */
int ntfs_macosx_normalize_filenames(int normalize) {
#ifdef ENABLE_NFCONV
	if ((normalize != 0) && (normalize != 1))
		return -1;
	nfconvert_utf8 = normalize;
	return 0;
#else
	return -1;
#endif /* ENABLE_NFCONV */
}
1258 | |
/*
 *		Normalize a UTF-8 string through CoreFoundation
 *
 *	@utf8_string is converted to normalization form C if @composed
 *	is non-zero, to form D otherwise. On success the result is stored
 *	as a '\0'-terminated UTF-8 string in a newly allocated buffer
 *	returned in *@target, which the caller has to free.
 *
 *	Returns the size of the result in bytes, without the terminator,
 *		-2 if the CFString could not be created from the input,
 *		-3 if the mutable copy could not be created,
 *		-1 on any other failure, or when built without ENABLE_NFCONV.
 *	*@target is only written on success.
 */
int ntfs_macosx_normalize_utf8(const char *utf8_string, char **target,
		int composed) {
#ifdef ENABLE_NFCONV
	/* For this code to compile, the CoreFoundation framework must be fed to the linker. */
	CFStringRef cfSourceString;
	CFMutableStringRef cfMutableString;
	CFRange rangeToProcess;
	CFIndex requiredBufferLength;
	char *result = NULL;
	int resultLength = -1;

	/* Wrap the UTF-8 input into a CFString. */
	cfSourceString = CFStringCreateWithCString(kCFAllocatorDefault,
			utf8_string, kCFStringEncodingUTF8);
	if (cfSourceString == NULL) {
		ntfs_log_error("CFStringCreateWithCString failed!\n");
		return -2;
	}

	/* Get a private mutable copy; the source is not needed afterwards. */
	cfMutableString = CFStringCreateMutableCopy(kCFAllocatorDefault, 0,
			cfSourceString);
	CFRelease(cfSourceString);
	if (cfMutableString == NULL) {
		ntfs_log_error("CFStringCreateMutableCopy failed!\n");
		return -3;
	}

	/* Normalize in place to the requested form. */
	if (composed != 0)
		CFStringNormalize(cfMutableString, kCFStringNormalizationFormC);
	else
		CFStringNormalize(cfMutableString, kCFStringNormalizationFormD);

	/* First pass, without a buffer : find out how many bytes are needed. */
	rangeToProcess = CFRangeMake(0, CFStringGetLength(cfMutableString));
	if (CFStringGetBytes(cfMutableString, rangeToProcess,
			kCFStringEncodingUTF8, 0, false, NULL, 0,
			&requiredBufferLength) <= 0) {
		ntfs_log_error("Could not perform check for required length of UTF-8 conversion of normalized CFMutableString.\n");
		goto release;
	}

	/* One extra zeroed byte acts as the terminator. */
	resultLength = sizeof(char)*(requiredBufferLength + 1);
	result = ntfs_calloc(resultLength);
	if (result == NULL) {
		ntfs_log_error("Could not perform a ntfs_calloc of %d bytes for char *result.\n", resultLength);
		goto release;
	}

	/* Second pass : actually produce the UTF-8 bytes. */
	if (CFStringGetBytes(cfMutableString, rangeToProcess,
			kCFStringEncodingUTF8, 0, false, (UInt8*)result,
			resultLength-1, &requiredBufferLength) <= 0) {
		ntfs_log_error("Could not perform UTF-8 conversion of normalized CFMutableString.\n");
		free(result);
		result = NULL;
	}

release:
	CFRelease(cfMutableString);

	if (result == NULL)
		return -1;
	*target = result;
	return resultLength - 1;
#else
	return -1;
#endif /* ENABLE_NFCONV */
}
1321 | #endif /* defined(__APPLE__) || defined(__DARWIN__) */ |
1322 |