blob: 9a2380145e99452b8437ddc37e4a8fcd2d1a763e
1 | /** |
2 | * unistr.c - Unicode string handling. Originated from the Linux-NTFS project. |
3 | * |
4 | * Copyright (c) 2000-2004 Anton Altaparmakov |
5 | * Copyright (c) 2002-2009 Szabolcs Szakacsits |
6 | * Copyright (c) 2008-2011 Jean-Pierre Andre |
7 | * Copyright (c) 2008 Bernhard Kaindl |
8 | * |
9 | * This program/include file is free software; you can redistribute it and/or |
10 | * modify it under the terms of the GNU General Public License as published |
11 | * by the Free Software Foundation; either version 2 of the License, or |
12 | * (at your option) any later version. |
13 | * |
14 | * This program/include file is distributed in the hope that it will be |
15 | * useful, but WITHOUT ANY WARRANTY; without even the implied warranty |
16 | * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
17 | * GNU General Public License for more details. |
18 | * |
19 | * You should have received a copy of the GNU General Public License |
20 | * along with this program (in the main directory of the NTFS-3G |
21 | * distribution in the file COPYING); if not, write to the Free Software |
22 | * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
23 | */ |
24 | |
25 | #ifdef HAVE_CONFIG_H |
26 | #include "config.h" |
27 | #endif |
28 | |
29 | #ifdef HAVE_STDIO_H |
30 | #include <stdio.h> |
31 | #endif |
32 | #ifdef HAVE_STDLIB_H |
33 | #include <stdlib.h> |
34 | #endif |
35 | #ifdef HAVE_WCHAR_H |
36 | #include <wchar.h> |
37 | #endif |
38 | #ifdef HAVE_STRING_H |
39 | #include <string.h> |
40 | #endif |
41 | #ifdef HAVE_ERRNO_H |
42 | #include <errno.h> |
43 | #endif |
44 | #ifdef HAVE_LOCALE_H |
45 | #include <locale.h> |
46 | #endif |
47 | |
48 | #if defined(__APPLE__) || defined(__DARWIN__) |
49 | #ifdef ENABLE_NFCONV |
50 | #include <CoreFoundation/CoreFoundation.h> |
51 | #endif /* ENABLE_NFCONV */ |
52 | #endif /* defined(__APPLE__) || defined(__DARWIN__) */ |
53 | |
54 | #include "compat.h" |
55 | #include "attrib.h" |
56 | #include "types.h" |
57 | #include "unistr.h" |
58 | #include "debug.h" |
59 | #include "logging.h" |
60 | #include "misc.h" |
61 | |
62 | #define NOREVBOM 0 /* JPA rejecting U+FFFE and U+FFFF, open to debate */ |
63 | |
64 | /* |
65 | * IMPORTANT |
66 | * ========= |
67 | * |
68 | * All these routines assume that the Unicode characters are in little endian |
69 | * encoding inside the strings!!! |
70 | */ |
71 | |
72 | static int use_utf8 = 1; /* use UTF-8 encoding for file names */ |
73 | |
74 | #if defined(__APPLE__) || defined(__DARWIN__) |
75 | #ifdef ENABLE_NFCONV |
76 | /** |
77 | * This variable controls whether or not automatic normalization form conversion |
78 | * should be performed when translating NTFS unicode file names to UTF-8. |
79 | * Defaults to on, but can be controlled from the outside using the function |
80 | * int ntfs_macosx_normalize_filenames(int normalize); |
81 | */ |
82 | static int nfconvert_utf8 = 1; |
83 | #endif /* ENABLE_NFCONV */ |
84 | #endif /* defined(__APPLE__) || defined(__DARWIN__) */ |
85 | |
86 | /* |
87 | * This is used by the name collation functions to quickly determine what |
88 | * characters are (in)valid. |
89 | */ |
90 | #if 0 |
91 | static const u8 legal_ansi_char_array[0x40] = { |
92 | 0x00, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, |
93 | 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, |
94 | |
95 | 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, |
96 | 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, |
97 | |
98 | 0x17, 0x07, 0x18, 0x17, 0x17, 0x17, 0x17, 0x17, |
99 | 0x17, 0x17, 0x18, 0x16, 0x16, 0x17, 0x07, 0x00, |
100 | |
101 | 0x17, 0x17, 0x17, 0x17, 0x17, 0x17, 0x17, 0x17, |
102 | 0x17, 0x17, 0x04, 0x16, 0x18, 0x16, 0x18, 0x18, |
103 | }; |
104 | #endif |
105 | |
106 | /** |
107 | * ntfs_names_are_equal - compare two Unicode names for equality |
108 | * @s1: name to compare to @s2 |
109 | * @s1_len: length in Unicode characters of @s1 |
110 | * @s2: name to compare to @s1 |
111 | * @s2_len: length in Unicode characters of @s2 |
112 | * @ic: ignore case bool |
113 | * @upcase: upcase table (only if @ic == IGNORE_CASE) |
114 | * @upcase_size: length in Unicode characters of @upcase (if present) |
115 | * |
116 | * Compare the names @s1 and @s2 and return TRUE (1) if the names are |
117 | * identical, or FALSE (0) if they are not identical. If @ic is IGNORE_CASE, |
118 | * the @upcase table is used to perform a case insensitive comparison. |
119 | */ |
120 | BOOL ntfs_names_are_equal(const ntfschar *s1, size_t s1_len, |
121 | const ntfschar *s2, size_t s2_len, |
122 | const IGNORE_CASE_BOOL ic, |
123 | const ntfschar *upcase, const u32 upcase_size) |
124 | { |
125 | if (s1_len != s2_len) |
126 | return FALSE; |
127 | if (!s1_len) |
128 | return TRUE; |
129 | if (ic == CASE_SENSITIVE) |
130 | return ntfs_ucsncmp(s1, s2, s1_len) ? FALSE: TRUE; |
131 | return ntfs_ucsncasecmp(s1, s2, s1_len, upcase, upcase_size) ? FALSE: |
132 | TRUE; |
133 | } |
134 | |
135 | /* |
136 | * ntfs_names_full_collate() fully collate two Unicode names |
137 | * |
138 | * @name1: first Unicode name to compare |
139 | * @name1_len: length of first Unicode name to compare |
140 | * @name2: second Unicode name to compare |
141 | * @name2_len: length of second Unicode name to compare |
142 | * @ic: either CASE_SENSITIVE or IGNORE_CASE |
143 | * @upcase: upcase table (ignored if @ic is CASE_SENSITIVE) |
144 | * @upcase_len: upcase table size (ignored if @ic is CASE_SENSITIVE) |
145 | * |
146 | * -1 if the first name collates before the second one, |
147 | * 0 if the names match, |
148 | * 1 if the second name collates before the first one, or |
149 | * |
150 | */ |
151 | int ntfs_names_full_collate(const ntfschar *name1, const u32 name1_len, |
152 | const ntfschar *name2, const u32 name2_len, |
153 | const IGNORE_CASE_BOOL ic, const ntfschar *upcase, |
154 | const u32 upcase_len) |
155 | { |
156 | u32 cnt; |
157 | u16 c1, c2; |
158 | u16 u1, u2; |
159 | |
160 | #ifdef DEBUG |
161 | if (!name1 || !name2 || (ic && (!upcase || !upcase_len))) { |
162 | ntfs_log_debug("ntfs_names_collate received NULL pointer!\n"); |
163 | exit(1); |
164 | } |
165 | #endif |
166 | cnt = min(name1_len, name2_len); |
167 | if (cnt > 0) { |
168 | if (ic == CASE_SENSITIVE) { |
169 | while (--cnt && (*name1 == *name2)) { |
170 | name1++; |
171 | name2++; |
172 | } |
173 | u1 = c1 = le16_to_cpu(*name1); |
174 | u2 = c2 = le16_to_cpu(*name2); |
175 | if (u1 < upcase_len) |
176 | u1 = le16_to_cpu(upcase[u1]); |
177 | if (u2 < upcase_len) |
178 | u2 = le16_to_cpu(upcase[u2]); |
179 | if ((u1 == u2) && cnt) |
180 | do { |
181 | name1++; |
182 | u1 = le16_to_cpu(*name1); |
183 | name2++; |
184 | u2 = le16_to_cpu(*name2); |
185 | if (u1 < upcase_len) |
186 | u1 = le16_to_cpu(upcase[u1]); |
187 | if (u2 < upcase_len) |
188 | u2 = le16_to_cpu(upcase[u2]); |
189 | } while ((u1 == u2) && --cnt); |
190 | if (u1 < u2) |
191 | return -1; |
192 | if (u1 > u2) |
193 | return 1; |
194 | if (name1_len < name2_len) |
195 | return -1; |
196 | if (name1_len > name2_len) |
197 | return 1; |
198 | if (c1 < c2) |
199 | return -1; |
200 | if (c1 > c2) |
201 | return 1; |
202 | } else { |
203 | do { |
204 | u1 = c1 = le16_to_cpu(*name1); |
205 | name1++; |
206 | u2 = c2 = le16_to_cpu(*name2); |
207 | name2++; |
208 | if (u1 < upcase_len) |
209 | u1 = le16_to_cpu(upcase[u1]); |
210 | if (u2 < upcase_len) |
211 | u2 = le16_to_cpu(upcase[u2]); |
212 | } while ((u1 == u2) && --cnt); |
213 | if (u1 < u2) |
214 | return -1; |
215 | if (u1 > u2) |
216 | return 1; |
217 | if (name1_len < name2_len) |
218 | return -1; |
219 | if (name1_len > name2_len) |
220 | return 1; |
221 | } |
222 | } else { |
223 | if (name1_len < name2_len) |
224 | return -1; |
225 | if (name1_len > name2_len) |
226 | return 1; |
227 | } |
228 | return 0; |
229 | } |
230 | |
231 | /** |
232 | * ntfs_ucsncmp - compare two little endian Unicode strings |
233 | * @s1: first string |
234 | * @s2: second string |
235 | * @n: maximum unicode characters to compare |
236 | * |
237 | * Compare the first @n characters of the Unicode strings @s1 and @s2, |
238 | * The strings in little endian format and appropriate le16_to_cpu() |
239 | * conversion is performed on non-little endian machines. |
240 | * |
241 | * The function returns an integer less than, equal to, or greater than zero |
242 | * if @s1 (or the first @n Unicode characters thereof) is found, respectively, |
243 | * to be less than, to match, or be greater than @s2. |
244 | */ |
245 | int ntfs_ucsncmp(const ntfschar *s1, const ntfschar *s2, size_t n) |
246 | { |
247 | ntfschar c1, c2; |
248 | size_t i; |
249 | |
250 | #ifdef DEBUG |
251 | if (!s1 || !s2) { |
252 | ntfs_log_debug("ntfs_wcsncmp() received NULL pointer!\n"); |
253 | exit(1); |
254 | } |
255 | #endif |
256 | for (i = 0; i < n; ++i) { |
257 | c1 = le16_to_cpu(s1[i]); |
258 | c2 = le16_to_cpu(s2[i]); |
259 | if (c1 < c2) |
260 | return -1; |
261 | if (c1 > c2) |
262 | return 1; |
263 | if (!c1) |
264 | break; |
265 | } |
266 | return 0; |
267 | } |
268 | |
269 | /** |
270 | * ntfs_ucsncasecmp - compare two little endian Unicode strings, ignoring case |
271 | * @s1: first string |
272 | * @s2: second string |
273 | * @n: maximum unicode characters to compare |
274 | * @upcase: upcase table |
275 | * @upcase_size: upcase table size in Unicode characters |
276 | * |
277 | * Compare the first @n characters of the Unicode strings @s1 and @s2, |
278 | * ignoring case. The strings in little endian format and appropriate |
279 | * le16_to_cpu() conversion is performed on non-little endian machines. |
280 | * |
281 | * Each character is uppercased using the @upcase table before the comparison. |
282 | * |
283 | * The function returns an integer less than, equal to, or greater than zero |
284 | * if @s1 (or the first @n Unicode characters thereof) is found, respectively, |
285 | * to be less than, to match, or be greater than @s2. |
286 | */ |
287 | int ntfs_ucsncasecmp(const ntfschar *s1, const ntfschar *s2, size_t n, |
288 | const ntfschar *upcase, const u32 upcase_size) |
289 | { |
290 | u16 c1, c2; |
291 | size_t i; |
292 | |
293 | #ifdef DEBUG |
294 | if (!s1 || !s2 || !upcase) { |
295 | ntfs_log_debug("ntfs_wcsncasecmp() received NULL pointer!\n"); |
296 | exit(1); |
297 | } |
298 | #endif |
299 | for (i = 0; i < n; ++i) { |
300 | if ((c1 = le16_to_cpu(s1[i])) < upcase_size) |
301 | c1 = le16_to_cpu(upcase[c1]); |
302 | if ((c2 = le16_to_cpu(s2[i])) < upcase_size) |
303 | c2 = le16_to_cpu(upcase[c2]); |
304 | if (c1 < c2) |
305 | return -1; |
306 | if (c1 > c2) |
307 | return 1; |
308 | if (!c1) |
309 | break; |
310 | } |
311 | return 0; |
312 | } |
313 | |
314 | /** |
315 | * ntfs_ucsnlen - determine the length of a little endian Unicode string |
316 | * @s: pointer to Unicode string |
317 | * @maxlen: maximum length of string @s |
318 | * |
319 | * Return the number of Unicode characters in the little endian Unicode |
320 | * string @s up to a maximum of maxlen Unicode characters, not including |
321 | * the terminating (ntfschar)'\0'. If there is no (ntfschar)'\0' between @s |
322 | * and @s + @maxlen, @maxlen is returned. |
323 | * |
324 | * This function never looks beyond @s + @maxlen. |
325 | */ |
326 | u32 ntfs_ucsnlen(const ntfschar *s, u32 maxlen) |
327 | { |
328 | u32 i; |
329 | |
330 | for (i = 0; i < maxlen; i++) { |
331 | if (!le16_to_cpu(s[i])) |
332 | break; |
333 | } |
334 | return i; |
335 | } |
336 | |
337 | /** |
338 | * ntfs_ucsndup - duplicate little endian Unicode string |
339 | * @s: pointer to Unicode string |
340 | * @maxlen: maximum length of string @s |
341 | * |
342 | * Return a pointer to a new little endian Unicode string which is a duplicate |
343 | * of the string s. Memory for the new string is obtained with ntfs_malloc(3), |
344 | * and can be freed with free(3). |
345 | * |
346 | * A maximum of @maxlen Unicode characters are copied and a terminating |
347 | * (ntfschar)'\0' little endian Unicode character is added. |
348 | * |
349 | * This function never looks beyond @s + @maxlen. |
350 | * |
351 | * Return a pointer to the new little endian Unicode string on success and NULL |
352 | * on failure with errno set to the error code. |
353 | */ |
354 | ntfschar *ntfs_ucsndup(const ntfschar *s, u32 maxlen) |
355 | { |
356 | ntfschar *dst; |
357 | u32 len; |
358 | |
359 | len = ntfs_ucsnlen(s, maxlen); |
360 | dst = ntfs_malloc((len + 1) * sizeof(ntfschar)); |
361 | if (dst) { |
362 | memcpy(dst, s, len * sizeof(ntfschar)); |
363 | dst[len] = cpu_to_le16(L'\0'); |
364 | } |
365 | return dst; |
366 | } |
367 | |
368 | /** |
369 | * ntfs_name_upcase - Map an Unicode name to its uppercase equivalent |
370 | * @name: |
371 | * @name_len: |
372 | * @upcase: |
373 | * @upcase_len: |
374 | * |
375 | * Description... |
376 | * |
377 | * Returns: |
378 | */ |
379 | void ntfs_name_upcase(ntfschar *name, u32 name_len, const ntfschar *upcase, |
380 | const u32 upcase_len) |
381 | { |
382 | u32 i; |
383 | u16 u; |
384 | |
385 | for (i = 0; i < name_len; i++) |
386 | if ((u = le16_to_cpu(name[i])) < upcase_len) |
387 | name[i] = upcase[u]; |
388 | } |
389 | |
390 | /** |
391 | * ntfs_name_locase - Map a Unicode name to its lowercase equivalent |
392 | */ |
393 | void ntfs_name_locase(ntfschar *name, u32 name_len, const ntfschar *locase, |
394 | const u32 locase_len) |
395 | { |
396 | u32 i; |
397 | u16 u; |
398 | |
399 | if (locase) |
400 | for (i = 0; i < name_len; i++) |
401 | if ((u = le16_to_cpu(name[i])) < locase_len) |
402 | name[i] = locase[u]; |
403 | } |
404 | |
405 | /** |
406 | * ntfs_file_value_upcase - Convert a filename to upper case |
407 | * @file_name_attr: |
408 | * @upcase: |
409 | * @upcase_len: |
410 | * |
411 | * Description... |
412 | * |
413 | * Returns: |
414 | */ |
415 | void ntfs_file_value_upcase(FILE_NAME_ATTR *file_name_attr, |
416 | const ntfschar *upcase, const u32 upcase_len) |
417 | { |
418 | ntfs_name_upcase((ntfschar*)&file_name_attr->file_name, |
419 | file_name_attr->file_name_length, upcase, upcase_len); |
420 | } |
421 | |
/*
   NTFS uses Unicode (UTF-16LE [NTFS-3G uses UCS-2LE, which is enough
   for now]) for path names, but the Unicode code points need to be
   converted before a path can be accessed under NTFS. For 7 bit ASCII/ANSI,
   glibc does this even without a locale in a hard-coded fashion, as that
   appears to be easy because the low 7-bit ASCII range appears to be
   available in all charsets, but it does not convert anything if
   there was some error with the locale setup or if none was set up, like
   when mount is called during early boot where it (by policy) does
   not use locales (and they may not be available if /usr is not yet
   mounted), so this patch fixes the resulting issues for systems which use
   UTF-8 and for others, specifying the locale in fstab brings them
   the encoding which they want.

   If no locale is defined or there was a problem with setting one
   up and whenever nl_langinfo(CODESET) returns a string starting with
   "ANSI", use an internal UCS-2LE <-> UTF-8 codeset converter to fix
   the bug where NTFS-3G does not show any path names which include
   international characters!!! (and also fails on creating them) as a result.

   Author: Bernhard Kaindl <bk@suse.de>
   Jean-Pierre Andre made it compliant with RFC3629/RFC2781.
*/
445 | |
446 | /* |
447 | * Return the amount of 8-bit elements in UTF-8 needed (without the terminating |
448 | * null) to store a given UTF-16LE string. |
449 | * |
450 | * Return -1 with errno set if string has invalid byte sequence or too long. |
451 | */ |
452 | static int utf16_to_utf8_size(const ntfschar *ins, const int ins_len, int outs_len) |
453 | { |
454 | int i, ret = -1; |
455 | int count = 0; |
456 | BOOL surrog; |
457 | |
458 | surrog = FALSE; |
459 | for (i = 0; i < ins_len && ins[i]; i++) { |
460 | unsigned short c = le16_to_cpu(ins[i]); |
461 | if (surrog) { |
462 | if ((c >= 0xdc00) && (c < 0xe000)) { |
463 | surrog = FALSE; |
464 | count += 4; |
465 | } else |
466 | goto fail; |
467 | } else |
468 | if (c < 0x80) |
469 | count++; |
470 | else if (c < 0x800) |
471 | count += 2; |
472 | else if (c < 0xd800) |
473 | count += 3; |
474 | else if (c < 0xdc00) |
475 | surrog = TRUE; |
476 | #if NOREVBOM |
477 | else if ((c >= 0xe000) && (c < 0xfffe)) |
478 | #else |
479 | else if (c >= 0xe000) |
480 | #endif |
481 | count += 3; |
482 | else |
483 | goto fail; |
484 | if (count > outs_len) { |
485 | errno = ENAMETOOLONG; |
486 | goto out; |
487 | } |
488 | } |
489 | if (surrog) |
490 | goto fail; |
491 | |
492 | ret = count; |
493 | out: |
494 | return ret; |
495 | fail: |
496 | errno = EILSEQ; |
497 | goto out; |
498 | } |
499 | |
500 | /* |
501 | * ntfs_utf16_to_utf8 - convert a little endian UTF16LE string to an UTF-8 string |
502 | * @ins: input utf16 string buffer |
503 | * @ins_len: length of input string in utf16 characters |
504 | * @outs: on return contains the (allocated) output multibyte string |
505 | * @outs_len: length of output buffer in bytes |
506 | * |
507 | * Return -1 with errno set if string has invalid byte sequence or too long. |
508 | */ |
509 | static int ntfs_utf16_to_utf8(const ntfschar *ins, const int ins_len, |
510 | char **outs, int outs_len) |
511 | { |
512 | #if defined(__APPLE__) || defined(__DARWIN__) |
513 | #ifdef ENABLE_NFCONV |
514 | char *original_outs_value = *outs; |
515 | int original_outs_len = outs_len; |
516 | #endif /* ENABLE_NFCONV */ |
517 | #endif /* defined(__APPLE__) || defined(__DARWIN__) */ |
518 | |
519 | char *t; |
520 | int i, size, ret = -1; |
521 | int halfpair; |
522 | |
523 | halfpair = 0; |
524 | if (!*outs) |
525 | outs_len = PATH_MAX; |
526 | |
527 | size = utf16_to_utf8_size(ins, ins_len, outs_len); |
528 | |
529 | if (size < 0) |
530 | goto out; |
531 | |
532 | if (!*outs) { |
533 | outs_len = size + 1; |
534 | *outs = ntfs_malloc(outs_len); |
535 | if (!*outs) |
536 | goto out; |
537 | } |
538 | |
539 | t = *outs; |
540 | |
541 | for (i = 0; i < ins_len && ins[i]; i++) { |
542 | unsigned short c = le16_to_cpu(ins[i]); |
543 | /* size not double-checked */ |
544 | if (halfpair) { |
545 | if ((c >= 0xdc00) && (c < 0xe000)) { |
546 | *t++ = 0xf0 + (((halfpair + 64) >> 8) & 7); |
547 | *t++ = 0x80 + (((halfpair + 64) >> 2) & 63); |
548 | *t++ = 0x80 + ((c >> 6) & 15) + ((halfpair & 3) << 4); |
549 | *t++ = 0x80 + (c & 63); |
550 | halfpair = 0; |
551 | } else |
552 | goto fail; |
553 | } else if (c < 0x80) { |
554 | *t++ = c; |
555 | } else { |
556 | if (c < 0x800) { |
557 | *t++ = (0xc0 | ((c >> 6) & 0x3f)); |
558 | *t++ = 0x80 | (c & 0x3f); |
559 | } else if (c < 0xd800) { |
560 | *t++ = 0xe0 | (c >> 12); |
561 | *t++ = 0x80 | ((c >> 6) & 0x3f); |
562 | *t++ = 0x80 | (c & 0x3f); |
563 | } else if (c < 0xdc00) |
564 | halfpair = c; |
565 | else if (c >= 0xe000) { |
566 | *t++ = 0xe0 | (c >> 12); |
567 | *t++ = 0x80 | ((c >> 6) & 0x3f); |
568 | *t++ = 0x80 | (c & 0x3f); |
569 | } else |
570 | goto fail; |
571 | } |
572 | } |
573 | *t = '\0'; |
574 | |
575 | #if defined(__APPLE__) || defined(__DARWIN__) |
576 | #ifdef ENABLE_NFCONV |
577 | if(nfconvert_utf8 && (t - *outs) > 0) { |
578 | char *new_outs = NULL; |
579 | int new_outs_len = ntfs_macosx_normalize_utf8(*outs, &new_outs, 0); // Normalize to decomposed form |
580 | if(new_outs_len >= 0 && new_outs != NULL) { |
581 | if(original_outs_value != *outs) { |
582 | // We have allocated outs ourselves. |
583 | free(*outs); |
584 | *outs = new_outs; |
585 | t = *outs + new_outs_len; |
586 | } |
587 | else { |
588 | // We need to copy new_outs into the fixed outs buffer. |
589 | memset(*outs, 0, original_outs_len); |
590 | strncpy(*outs, new_outs, original_outs_len-1); |
591 | t = *outs + original_outs_len; |
592 | free(new_outs); |
593 | } |
594 | } |
595 | else { |
596 | ntfs_log_error("Failed to normalize NTFS string to UTF-8 NFD: %s\n", *outs); |
597 | ntfs_log_error(" new_outs=0x%p\n", new_outs); |
598 | ntfs_log_error(" new_outs_len=%d\n", new_outs_len); |
599 | } |
600 | } |
601 | #endif /* ENABLE_NFCONV */ |
602 | #endif /* defined(__APPLE__) || defined(__DARWIN__) */ |
603 | |
604 | ret = t - *outs; |
605 | out: |
606 | return ret; |
607 | fail: |
608 | errno = EILSEQ; |
609 | goto out; |
610 | } |
611 | |
/*
 * Return the amount of 16-bit elements in UTF-16LE needed
 * (without the terminating null) to store given UTF-8 string.
 *
 * Every lead byte counts for one element, except lead bytes 0xF0..0xF4
 * which count for two (a surrogate pair). The bytes following a lead byte
 * are skipped without being examined, and a sequence cut short by the end
 * of the string is counted like a complete one.
 *
 * Return -1 with errno set to ENAMETOOLONG if PATH_MAX or more elements
 * would be needed, or to EILSEQ if a byte 0xF5 or above is met where a lead
 * byte is expected.
 *
 * Note: This does not check whether the input sequence is a valid utf8 string,
 *	 and should be used only in context where such check is made!
 */
static int utf8_to_utf16_size(const char *s)
{
	const unsigned char *p = (const unsigned char *)s;
	unsigned int lead;
	size_t units = 0;

	while ((lead = *p++) != 0) {
		if (++units >= PATH_MAX) {
			errno = ENAMETOOLONG;
			return -1;
		}
		/* Single byte (or stray trail byte): nothing to skip. */
		if (lead < 0xc0)
			continue;
		if (lead >= 0xF5) {
			errno = EILSEQ;
			return -1;
		}
		/* First trail byte, present in every multi-byte sequence. */
		if (!*p)
			break;
		p++;
		/* Second trail byte, for three and four byte sequences. */
		if (!*p)
			break;
		if (lead >= 0xE0)
			p++;
		/* Third trail byte; such sequences need a surrogate pair. */
		if (!*p)
			break;
		if (lead >= 0xF0) {
			p++;
			if (++units >= PATH_MAX) {
				errno = ENAMETOOLONG;
				return -1;
			}
		}
	}
	return units;
}
659 | /* |
660 | * This converts one UTF-8 sequence to cpu-endian Unicode value |
661 | * within range U+0 .. U+10ffff and excluding U+D800 .. U+DFFF |
662 | * |
663 | * Return the number of used utf8 bytes or -1 with errno set |
664 | * if sequence is invalid. |
665 | */ |
666 | static int utf8_to_unicode(u32 *wc, const char *s) |
667 | { |
668 | unsigned int byte = *((const unsigned char *)s); |
669 | |
670 | /* single byte */ |
671 | if (byte == 0) { |
672 | *wc = (u32) 0; |
673 | return 0; |
674 | } else if (byte < 0x80) { |
675 | *wc = (u32) byte; |
676 | return 1; |
677 | /* double byte */ |
678 | } else if (byte < 0xc2) { |
679 | goto fail; |
680 | } else if (byte < 0xE0) { |
681 | if ((s[1] & 0xC0) == 0x80) { |
682 | *wc = ((u32)(byte & 0x1F) << 6) |
683 | | ((u32)(s[1] & 0x3F)); |
684 | return 2; |
685 | } else |
686 | goto fail; |
687 | /* three-byte */ |
688 | } else if (byte < 0xF0) { |
689 | if (((s[1] & 0xC0) == 0x80) && ((s[2] & 0xC0) == 0x80)) { |
690 | *wc = ((u32)(byte & 0x0F) << 12) |
691 | | ((u32)(s[1] & 0x3F) << 6) |
692 | | ((u32)(s[2] & 0x3F)); |
693 | /* Check valid ranges */ |
694 | #if NOREVBOM |
695 | if (((*wc >= 0x800) && (*wc <= 0xD7FF)) |
696 | || ((*wc >= 0xe000) && (*wc <= 0xFFFD))) |
697 | return 3; |
698 | #else |
699 | if (((*wc >= 0x800) && (*wc <= 0xD7FF)) |
700 | || ((*wc >= 0xe000) && (*wc <= 0xFFFF))) |
701 | return 3; |
702 | #endif |
703 | } |
704 | goto fail; |
705 | /* four-byte */ |
706 | } else if (byte < 0xF5) { |
707 | if (((s[1] & 0xC0) == 0x80) && ((s[2] & 0xC0) == 0x80) |
708 | && ((s[3] & 0xC0) == 0x80)) { |
709 | *wc = ((u32)(byte & 0x07) << 18) |
710 | | ((u32)(s[1] & 0x3F) << 12) |
711 | | ((u32)(s[2] & 0x3F) << 6) |
712 | | ((u32)(s[3] & 0x3F)); |
713 | /* Check valid ranges */ |
714 | if ((*wc <= 0x10ffff) && (*wc >= 0x10000)) |
715 | return 4; |
716 | } |
717 | goto fail; |
718 | } |
719 | fail: |
720 | errno = EILSEQ; |
721 | return -1; |
722 | } |
723 | |
724 | /** |
725 | * ntfs_utf8_to_utf16 - convert a UTF-8 string to a UTF-16LE string |
726 | * @ins: input multibyte string buffer |
727 | * @outs: on return contains the (allocated) output utf16 string |
728 | * @outs_len: length of output buffer in utf16 characters |
729 | * |
730 | * Return -1 with errno set. |
731 | */ |
732 | static int ntfs_utf8_to_utf16(const char *ins, ntfschar **outs) |
733 | { |
734 | #if defined(__APPLE__) || defined(__DARWIN__) |
735 | #ifdef ENABLE_NFCONV |
736 | char *new_ins = NULL; |
737 | if(nfconvert_utf8) { |
738 | int new_ins_len; |
739 | new_ins_len = ntfs_macosx_normalize_utf8(ins, &new_ins, 1); // Normalize to composed form |
740 | if(new_ins_len >= 0) |
741 | ins = new_ins; |
742 | else |
743 | ntfs_log_error("Failed to normalize NTFS string to UTF-8 NFC: %s\n", ins); |
744 | } |
745 | #endif /* ENABLE_NFCONV */ |
746 | #endif /* defined(__APPLE__) || defined(__DARWIN__) */ |
747 | const char *t = ins; |
748 | u32 wc; |
749 | BOOL allocated; |
750 | ntfschar *outpos; |
751 | int shorts, ret = -1; |
752 | |
753 | shorts = utf8_to_utf16_size(ins); |
754 | if (shorts < 0) |
755 | goto fail; |
756 | |
757 | allocated = FALSE; |
758 | if (!*outs) { |
759 | *outs = ntfs_malloc((shorts + 1) * sizeof(ntfschar)); |
760 | if (!*outs) |
761 | goto fail; |
762 | allocated = TRUE; |
763 | } |
764 | |
765 | outpos = *outs; |
766 | |
767 | while(1) { |
768 | int m = utf8_to_unicode(&wc, t); |
769 | if (m <= 0) { |
770 | if (m < 0) { |
771 | /* do not leave space allocated if failed */ |
772 | if (allocated) { |
773 | free(*outs); |
774 | *outs = (ntfschar*)NULL; |
775 | } |
776 | goto fail; |
777 | } |
778 | *outpos++ = const_cpu_to_le16(0); |
779 | break; |
780 | } |
781 | if (wc < 0x10000) |
782 | *outpos++ = cpu_to_le16(wc); |
783 | else { |
784 | wc -= 0x10000; |
785 | *outpos++ = cpu_to_le16((wc >> 10) + 0xd800); |
786 | *outpos++ = cpu_to_le16((wc & 0x3ff) + 0xdc00); |
787 | } |
788 | t += m; |
789 | } |
790 | |
791 | ret = --outpos - *outs; |
792 | fail: |
793 | #if defined(__APPLE__) || defined(__DARWIN__) |
794 | #ifdef ENABLE_NFCONV |
795 | if(new_ins != NULL) |
796 | free(new_ins); |
797 | #endif /* ENABLE_NFCONV */ |
798 | #endif /* defined(__APPLE__) || defined(__DARWIN__) */ |
799 | return ret; |
800 | } |
801 | |
802 | /** |
803 | * ntfs_ucstombs - convert a little endian Unicode string to a multibyte string |
804 | * @ins: input Unicode string buffer |
805 | * @ins_len: length of input string in Unicode characters |
806 | * @outs: on return contains the (allocated) output multibyte string |
807 | * @outs_len: length of output buffer in bytes |
808 | * |
809 | * Convert the input little endian, 2-byte Unicode string @ins, of length |
810 | * @ins_len into the multibyte string format dictated by the current locale. |
811 | * |
812 | * If *@outs is NULL, the function allocates the string and the caller is |
813 | * responsible for calling free(*@outs); when finished with it. |
814 | * |
815 | * On success the function returns the number of bytes written to the output |
816 | * string *@outs (>= 0), not counting the terminating NULL byte. If the output |
817 | * string buffer was allocated, *@outs is set to it. |
818 | * |
819 | * On error, -1 is returned, and errno is set to the error code. The following |
820 | * error codes can be expected: |
821 | * EINVAL Invalid arguments (e.g. @ins or @outs is NULL). |
822 | * EILSEQ The input string cannot be represented as a multibyte |
823 | * sequence according to the current locale. |
824 | * ENAMETOOLONG Destination buffer is too small for input string. |
825 | * ENOMEM Not enough memory to allocate destination buffer. |
826 | */ |
827 | int ntfs_ucstombs(const ntfschar *ins, const int ins_len, char **outs, |
828 | int outs_len) |
829 | { |
830 | char *mbs; |
831 | int mbs_len; |
832 | #ifdef MB_CUR_MAX |
833 | wchar_t wc; |
834 | int i, o; |
835 | int cnt = 0; |
836 | #ifdef HAVE_MBSINIT |
837 | mbstate_t mbstate; |
838 | #endif |
839 | #endif /* MB_CUR_MAX */ |
840 | |
841 | if (!ins || !outs) { |
842 | errno = EINVAL; |
843 | return -1; |
844 | } |
845 | mbs = *outs; |
846 | mbs_len = outs_len; |
847 | if (mbs && !mbs_len) { |
848 | errno = ENAMETOOLONG; |
849 | return -1; |
850 | } |
851 | if (use_utf8) |
852 | return ntfs_utf16_to_utf8(ins, ins_len, outs, outs_len); |
853 | #ifdef MB_CUR_MAX |
854 | if (!mbs) { |
855 | mbs_len = (ins_len + 1) * MB_CUR_MAX; |
856 | mbs = ntfs_malloc(mbs_len); |
857 | if (!mbs) |
858 | return -1; |
859 | } |
860 | #ifdef HAVE_MBSINIT |
861 | memset(&mbstate, 0, sizeof(mbstate)); |
862 | #else |
863 | wctomb(NULL, 0); |
864 | #endif |
865 | for (i = o = 0; i < ins_len; i++) { |
866 | /* Reallocate memory if necessary or abort. */ |
867 | if ((int)(o + MB_CUR_MAX) > mbs_len) { |
868 | char *tc; |
869 | if (mbs == *outs) { |
870 | errno = ENAMETOOLONG; |
871 | return -1; |
872 | } |
873 | tc = ntfs_malloc((mbs_len + 64) & ~63); |
874 | if (!tc) |
875 | goto err_out; |
876 | memcpy(tc, mbs, mbs_len); |
877 | mbs_len = (mbs_len + 64) & ~63; |
878 | free(mbs); |
879 | mbs = tc; |
880 | } |
881 | /* Convert the LE Unicode character to a CPU wide character. */ |
882 | wc = (wchar_t)le16_to_cpu(ins[i]); |
883 | if (!wc) |
884 | break; |
885 | /* Convert the CPU endian wide character to multibyte. */ |
886 | #ifdef HAVE_MBSINIT |
887 | cnt = wcrtomb(mbs + o, wc, &mbstate); |
888 | #else |
889 | cnt = wctomb(mbs + o, wc); |
890 | #endif |
891 | if (cnt == -1) |
892 | goto err_out; |
893 | if (cnt <= 0) { |
894 | ntfs_log_debug("Eeek. cnt <= 0, cnt = %i\n", cnt); |
895 | errno = EINVAL; |
896 | goto err_out; |
897 | } |
898 | o += cnt; |
899 | } |
900 | #ifdef HAVE_MBSINIT |
901 | /* Make sure we are back in the initial state. */ |
902 | if (!mbsinit(&mbstate)) { |
903 | ntfs_log_debug("Eeek. mbstate not in initial state!\n"); |
904 | errno = EILSEQ; |
905 | goto err_out; |
906 | } |
907 | #endif |
908 | /* Now write the NULL character. */ |
909 | mbs[o] = '\0'; |
910 | if (*outs != mbs) |
911 | *outs = mbs; |
912 | return o; |
913 | err_out: |
914 | if (mbs != *outs) { |
915 | int eo = errno; |
916 | free(mbs); |
917 | errno = eo; |
918 | } |
919 | #else /* MB_CUR_MAX */ |
920 | errno = EILSEQ; |
921 | #endif /* MB_CUR_MAX */ |
922 | return -1; |
923 | } |
924 | |
925 | /** |
926 | * ntfs_mbstoucs - convert a multibyte string to a little endian Unicode string |
927 | * @ins: input multibyte string buffer |
928 | * @outs: on return contains the (allocated) output Unicode string |
929 | * |
930 | * Convert the input multibyte string @ins, from the current locale into the |
931 | * corresponding little endian, 2-byte Unicode string. |
932 | * |
933 | * The function allocates the string and the caller is responsible for calling |
934 | * free(*@outs); when finished with it. |
935 | * |
936 | * On success the function returns the number of Unicode characters written to |
937 | * the output string *@outs (>= 0), not counting the terminating Unicode NULL |
938 | * character. |
939 | * |
940 | * On error, -1 is returned, and errno is set to the error code. The following |
941 | * error codes can be expected: |
942 | * EINVAL Invalid arguments (e.g. @ins or @outs is NULL). |
943 | * EILSEQ The input string cannot be represented as a Unicode |
944 | * string according to the current locale. |
945 | * ENAMETOOLONG Destination buffer is too small for input string. |
946 | * ENOMEM Not enough memory to allocate destination buffer. |
947 | */ |
948 | int ntfs_mbstoucs(const char *ins, ntfschar **outs) |
949 | { |
950 | #ifdef MB_CUR_MAX |
951 | ntfschar *ucs; |
952 | const char *s; |
953 | wchar_t wc; |
954 | int i, o, cnt, ins_len, ucs_len, ins_size; |
955 | #ifdef HAVE_MBSINIT |
956 | mbstate_t mbstate; |
957 | #endif |
958 | #endif /* MB_CUR_MAX */ |
959 | |
960 | if (!ins || !outs) { |
961 | errno = EINVAL; |
962 | return -1; |
963 | } |
964 | |
965 | if (use_utf8) |
966 | return ntfs_utf8_to_utf16(ins, outs); |
967 | |
968 | #ifdef MB_CUR_MAX |
969 | /* Determine the size of the multi-byte string in bytes. */ |
970 | ins_size = strlen(ins); |
971 | /* Determine the length of the multi-byte string. */ |
972 | s = ins; |
973 | #if defined(HAVE_MBSINIT) |
974 | memset(&mbstate, 0, sizeof(mbstate)); |
975 | ins_len = mbsrtowcs(NULL, (const char **)&s, 0, &mbstate); |
976 | #ifdef __CYGWIN32__ |
977 | if (!ins_len && *ins) { |
978 | /* Older Cygwin had broken mbsrtowcs() implementation. */ |
979 | ins_len = strlen(ins); |
980 | } |
981 | #endif |
982 | #elif !defined(DJGPP) |
983 | ins_len = mbstowcs(NULL, s, 0); |
984 | #else |
985 | /* Eeek!!! DJGPP has broken mbstowcs() implementation!!! */ |
986 | ins_len = strlen(ins); |
987 | #endif |
988 | if (ins_len == -1) |
989 | return ins_len; |
990 | #ifdef HAVE_MBSINIT |
991 | if ((s != ins) || !mbsinit(&mbstate)) { |
992 | #else |
993 | if (s != ins) { |
994 | #endif |
995 | errno = EILSEQ; |
996 | return -1; |
997 | } |
998 | /* Add the NULL terminator. */ |
999 | ins_len++; |
1000 | ucs_len = ins_len; |
1001 | ucs = ntfs_malloc(ucs_len * sizeof(ntfschar)); |
1002 | if (!ucs) |
1003 | return -1; |
1004 | #ifdef HAVE_MBSINIT |
1005 | memset(&mbstate, 0, sizeof(mbstate)); |
1006 | #else |
1007 | mbtowc(NULL, NULL, 0); |
1008 | #endif |
1009 | for (i = o = cnt = 0; i < ins_size; i += cnt, o++) { |
1010 | /* Reallocate memory if necessary. */ |
1011 | if (o >= ucs_len) { |
1012 | ntfschar *tc; |
1013 | ucs_len = (ucs_len * sizeof(ntfschar) + 64) & ~63; |
1014 | tc = realloc(ucs, ucs_len); |
1015 | if (!tc) |
1016 | goto err_out; |
1017 | ucs = tc; |
1018 | ucs_len /= sizeof(ntfschar); |
1019 | } |
1020 | /* Convert the multibyte character to a wide character. */ |
1021 | #ifdef HAVE_MBSINIT |
1022 | cnt = mbrtowc(&wc, ins + i, ins_size - i, &mbstate); |
1023 | #else |
1024 | cnt = mbtowc(&wc, ins + i, ins_size - i); |
1025 | #endif |
1026 | if (!cnt) |
1027 | break; |
1028 | if (cnt == -1) |
1029 | goto err_out; |
1030 | if (cnt < -1) { |
1031 | ntfs_log_trace("Eeek. cnt = %i\n", cnt); |
1032 | errno = EINVAL; |
1033 | goto err_out; |
1034 | } |
1035 | /* Make sure we are not overflowing the NTFS Unicode set. */ |
1036 | if ((unsigned long)wc >= (unsigned long)(1 << |
1037 | (8 * sizeof(ntfschar)))) { |
1038 | errno = EILSEQ; |
1039 | goto err_out; |
1040 | } |
1041 | /* Convert the CPU wide character to a LE Unicode character. */ |
1042 | ucs[o] = cpu_to_le16(wc); |
1043 | } |
1044 | #ifdef HAVE_MBSINIT |
1045 | /* Make sure we are back in the initial state. */ |
1046 | if (!mbsinit(&mbstate)) { |
1047 | ntfs_log_trace("Eeek. mbstate not in initial state!\n"); |
1048 | errno = EILSEQ; |
1049 | goto err_out; |
1050 | } |
1051 | #endif |
1052 | /* Now write the NULL character. */ |
1053 | ucs[o] = cpu_to_le16(L'\0'); |
1054 | *outs = ucs; |
1055 | return o; |
1056 | err_out: |
1057 | free(ucs); |
1058 | #else /* MB_CUR_MAX */ |
1059 | errno = EILSEQ; |
1060 | #endif /* MB_CUR_MAX */ |
1061 | return -1; |
1062 | } |
1063 | |
1064 | /* |
1065 | * Turn a UTF8 name uppercase |
1066 | * |
1067 | * Returns an allocated uppercase name which has to be freed by caller |
1068 | * or NULL if there is an error (described by errno) |
1069 | */ |
1070 | |
1071 | char *ntfs_uppercase_mbs(const char *low, |
1072 | const ntfschar *upcase, u32 upcase_size) |
1073 | { |
1074 | int size; |
1075 | char *upp; |
1076 | u32 wc; |
1077 | int n; |
1078 | const char *s; |
1079 | char *t; |
1080 | |
1081 | size = strlen(low); |
1082 | upp = (char*)ntfs_malloc(3*size + 1); |
1083 | if (upp) { |
1084 | s = low; |
1085 | t = upp; |
1086 | do { |
1087 | n = utf8_to_unicode(&wc, s); |
1088 | if (n > 0) { |
1089 | if (wc < upcase_size) |
1090 | wc = le16_to_cpu(upcase[wc]); |
1091 | if (wc < 0x80) |
1092 | *t++ = wc; |
1093 | else if (wc < 0x800) { |
1094 | *t++ = (0xc0 | ((wc >> 6) & 0x3f)); |
1095 | *t++ = 0x80 | (wc & 0x3f); |
1096 | } else if (wc < 0x10000) { |
1097 | *t++ = 0xe0 | (wc >> 12); |
1098 | *t++ = 0x80 | ((wc >> 6) & 0x3f); |
1099 | *t++ = 0x80 | (wc & 0x3f); |
1100 | } else { |
1101 | *t++ = 0xf0 | ((wc >> 18) & 7); |
1102 | *t++ = 0x80 | ((wc >> 12) & 63); |
1103 | *t++ = 0x80 | ((wc >> 6) & 0x3f); |
1104 | *t++ = 0x80 | (wc & 0x3f); |
1105 | } |
1106 | s += n; |
1107 | } |
1108 | } while (n > 0); |
1109 | if (n < 0) { |
1110 | free(upp); |
1111 | upp = (char*)NULL; |
1112 | errno = EILSEQ; |
1113 | } |
1114 | *t = 0; |
1115 | } |
1116 | return (upp); |
1117 | } |
1118 | |
1119 | /** |
1120 | * ntfs_upcase_table_build - build the default upcase table for NTFS |
1121 | * @uc: destination buffer where to store the built table |
1122 | * @uc_len: size of destination buffer in bytes |
1123 | * |
1124 | * ntfs_upcase_table_build() builds the default upcase table for NTFS and |
1125 | * stores it in the caller supplied buffer @uc of size @uc_len. |
1126 | * |
1127 | * Note, @uc_len must be at least 128kiB in size or bad things will happen! |
1128 | */ |
1129 | void ntfs_upcase_table_build(ntfschar *uc, u32 uc_len) |
1130 | { |
1131 | #if 1 /* Vista */ |
1132 | /* |
1133 | * This is the table as defined by Vista |
1134 | */ |
1135 | /* |
1136 | * "Start" is inclusive and "End" is exclusive, every value has the |
1137 | * value of "Add" added to it. |
1138 | */ |
1139 | static int uc_run_table[][3] = { /* Start, End, Add */ |
1140 | {0x0061, 0x007b, -32}, {0x00e0, 0x00f7, -32}, {0x00f8, 0x00ff, -32}, |
1141 | {0x0256, 0x0258, -205}, {0x028a, 0x028c, -217}, {0x037b, 0x037e, 130}, |
1142 | {0x03ac, 0x03ad, -38}, {0x03ad, 0x03b0, -37}, {0x03b1, 0x03c2, -32}, |
1143 | {0x03c2, 0x03c3, -31}, {0x03c3, 0x03cc, -32}, {0x03cc, 0x03cd, -64}, |
1144 | {0x03cd, 0x03cf, -63}, {0x0430, 0x0450, -32}, {0x0450, 0x0460, -80}, |
1145 | {0x0561, 0x0587, -48}, {0x1f00, 0x1f08, 8}, {0x1f10, 0x1f16, 8}, |
1146 | {0x1f20, 0x1f28, 8}, {0x1f30, 0x1f38, 8}, {0x1f40, 0x1f46, 8}, |
1147 | {0x1f51, 0x1f52, 8}, {0x1f53, 0x1f54, 8}, {0x1f55, 0x1f56, 8}, |
1148 | {0x1f57, 0x1f58, 8}, {0x1f60, 0x1f68, 8}, {0x1f70, 0x1f72, 74}, |
1149 | {0x1f72, 0x1f76, 86}, {0x1f76, 0x1f78, 100}, {0x1f78, 0x1f7a, 128}, |
1150 | {0x1f7a, 0x1f7c, 112}, {0x1f7c, 0x1f7e, 126}, {0x1f80, 0x1f88, 8}, |
1151 | {0x1f90, 0x1f98, 8}, {0x1fa0, 0x1fa8, 8}, {0x1fb0, 0x1fb2, 8}, |
1152 | {0x1fb3, 0x1fb4, 9}, {0x1fcc, 0x1fcd, -9}, {0x1fd0, 0x1fd2, 8}, |
1153 | {0x1fe0, 0x1fe2, 8}, {0x1fe5, 0x1fe6, 7}, {0x1ffc, 0x1ffd, -9}, |
1154 | {0x2170, 0x2180, -16}, {0x24d0, 0x24ea, -26}, {0x2c30, 0x2c5f, -48}, |
1155 | {0x2d00, 0x2d26, -7264}, {0xff41, 0xff5b, -32}, {0} |
1156 | }; |
1157 | /* |
1158 | * "Start" is exclusive and "End" is inclusive, every second value is |
1159 | * decremented by one. |
1160 | */ |
1161 | static int uc_dup_table[][2] = { /* Start, End */ |
1162 | {0x0100, 0x012f}, {0x0132, 0x0137}, {0x0139, 0x0149}, {0x014a, 0x0178}, |
1163 | {0x0179, 0x017e}, {0x01a0, 0x01a6}, {0x01b3, 0x01b7}, {0x01cd, 0x01dd}, |
1164 | {0x01de, 0x01ef}, {0x01f4, 0x01f5}, {0x01f8, 0x01f9}, {0x01fa, 0x0220}, |
1165 | {0x0222, 0x0234}, {0x023b, 0x023c}, {0x0241, 0x0242}, {0x0246, 0x024f}, |
1166 | {0x03d8, 0x03ef}, {0x03f7, 0x03f8}, {0x03fa, 0x03fb}, {0x0460, 0x0481}, |
1167 | {0x048a, 0x04bf}, {0x04c1, 0x04c4}, {0x04c5, 0x04c8}, {0x04c9, 0x04ce}, |
1168 | {0x04ec, 0x04ed}, {0x04d0, 0x04eb}, {0x04ee, 0x04f5}, {0x04f6, 0x0513}, |
1169 | {0x1e00, 0x1e95}, {0x1ea0, 0x1ef9}, {0x2183, 0x2184}, {0x2c60, 0x2c61}, |
1170 | {0x2c67, 0x2c6c}, {0x2c75, 0x2c76}, {0x2c80, 0x2ce3}, {0} |
1171 | }; |
1172 | /* |
1173 | * Set the Unicode character at offset "Offset" to "Value". Note, |
1174 | * "Value" is host endian. |
1175 | */ |
1176 | static int uc_byte_table[][2] = { /* Offset, Value */ |
1177 | {0x00ff, 0x0178}, {0x0180, 0x0243}, {0x0183, 0x0182}, {0x0185, 0x0184}, |
1178 | {0x0188, 0x0187}, {0x018c, 0x018b}, {0x0192, 0x0191}, {0x0195, 0x01f6}, |
1179 | {0x0199, 0x0198}, {0x019a, 0x023d}, {0x019e, 0x0220}, {0x01a8, 0x01a7}, |
1180 | {0x01ad, 0x01ac}, {0x01b0, 0x01af}, {0x01b9, 0x01b8}, {0x01bd, 0x01bc}, |
1181 | {0x01bf, 0x01f7}, {0x01c6, 0x01c4}, {0x01c9, 0x01c7}, {0x01cc, 0x01ca}, |
1182 | {0x01dd, 0x018e}, {0x01f3, 0x01f1}, {0x023a, 0x2c65}, {0x023e, 0x2c66}, |
1183 | {0x0253, 0x0181}, {0x0254, 0x0186}, {0x0259, 0x018f}, {0x025b, 0x0190}, |
1184 | {0x0260, 0x0193}, {0x0263, 0x0194}, {0x0268, 0x0197}, {0x0269, 0x0196}, |
1185 | {0x026b, 0x2c62}, {0x026f, 0x019c}, {0x0272, 0x019d}, {0x0275, 0x019f}, |
1186 | {0x027d, 0x2c64}, {0x0280, 0x01a6}, {0x0283, 0x01a9}, {0x0288, 0x01ae}, |
1187 | {0x0289, 0x0244}, {0x028c, 0x0245}, {0x0292, 0x01b7}, {0x03f2, 0x03f9}, |
1188 | {0x04cf, 0x04c0}, {0x1d7d, 0x2c63}, {0x214e, 0x2132}, {0} |
1189 | }; |
1190 | #else /* Vista */ |
1191 | /* |
1192 | * This is the table as defined by Windows XP |
1193 | */ |
1194 | static int uc_run_table[][3] = { /* Start, End, Add */ |
1195 | {0x0061, 0x007B, -32}, {0x0451, 0x045D, -80}, {0x1F70, 0x1F72, 74}, |
1196 | {0x00E0, 0x00F7, -32}, {0x045E, 0x0460, -80}, {0x1F72, 0x1F76, 86}, |
1197 | {0x00F8, 0x00FF, -32}, {0x0561, 0x0587, -48}, {0x1F76, 0x1F78, 100}, |
1198 | {0x0256, 0x0258, -205}, {0x1F00, 0x1F08, 8}, {0x1F78, 0x1F7A, 128}, |
1199 | {0x028A, 0x028C, -217}, {0x1F10, 0x1F16, 8}, {0x1F7A, 0x1F7C, 112}, |
1200 | {0x03AC, 0x03AD, -38}, {0x1F20, 0x1F28, 8}, {0x1F7C, 0x1F7E, 126}, |
1201 | {0x03AD, 0x03B0, -37}, {0x1F30, 0x1F38, 8}, {0x1FB0, 0x1FB2, 8}, |
1202 | {0x03B1, 0x03C2, -32}, {0x1F40, 0x1F46, 8}, {0x1FD0, 0x1FD2, 8}, |
1203 | {0x03C2, 0x03C3, -31}, {0x1F51, 0x1F52, 8}, {0x1FE0, 0x1FE2, 8}, |
1204 | {0x03C3, 0x03CC, -32}, {0x1F53, 0x1F54, 8}, {0x1FE5, 0x1FE6, 7}, |
1205 | {0x03CC, 0x03CD, -64}, {0x1F55, 0x1F56, 8}, {0x2170, 0x2180, -16}, |
1206 | {0x03CD, 0x03CF, -63}, {0x1F57, 0x1F58, 8}, {0x24D0, 0x24EA, -26}, |
1207 | {0x0430, 0x0450, -32}, {0x1F60, 0x1F68, 8}, {0xFF41, 0xFF5B, -32}, |
1208 | {0} |
1209 | }; |
1210 | static int uc_dup_table[][2] = { /* Start, End */ |
1211 | {0x0100, 0x012F}, {0x01A0, 0x01A6}, {0x03E2, 0x03EF}, {0x04CB, 0x04CC}, |
1212 | {0x0132, 0x0137}, {0x01B3, 0x01B7}, {0x0460, 0x0481}, {0x04D0, 0x04EB}, |
1213 | {0x0139, 0x0149}, {0x01CD, 0x01DD}, {0x0490, 0x04BF}, {0x04EE, 0x04F5}, |
1214 | {0x014A, 0x0178}, {0x01DE, 0x01EF}, {0x04BF, 0x04BF}, {0x04F8, 0x04F9}, |
1215 | {0x0179, 0x017E}, {0x01F4, 0x01F5}, {0x04C1, 0x04C4}, {0x1E00, 0x1E95}, |
1216 | {0x018B, 0x018B}, {0x01FA, 0x0218}, {0x04C7, 0x04C8}, {0x1EA0, 0x1EF9}, |
1217 | {0} |
1218 | }; |
1219 | static int uc_byte_table[][2] = { /* Offset, Value */ |
1220 | {0x00FF, 0x0178}, {0x01AD, 0x01AC}, {0x01F3, 0x01F1}, {0x0269, 0x0196}, |
1221 | {0x0183, 0x0182}, {0x01B0, 0x01AF}, {0x0253, 0x0181}, {0x026F, 0x019C}, |
1222 | {0x0185, 0x0184}, {0x01B9, 0x01B8}, {0x0254, 0x0186}, {0x0272, 0x019D}, |
1223 | {0x0188, 0x0187}, {0x01BD, 0x01BC}, {0x0259, 0x018F}, {0x0275, 0x019F}, |
1224 | {0x018C, 0x018B}, {0x01C6, 0x01C4}, {0x025B, 0x0190}, {0x0283, 0x01A9}, |
1225 | {0x0192, 0x0191}, {0x01C9, 0x01C7}, {0x0260, 0x0193}, {0x0288, 0x01AE}, |
1226 | {0x0199, 0x0198}, {0x01CC, 0x01CA}, {0x0263, 0x0194}, {0x0292, 0x01B7}, |
1227 | {0x01A8, 0x01A7}, {0x01DD, 0x018E}, {0x0268, 0x0197}, |
1228 | {0} |
1229 | }; |
1230 | #endif /* Vista */ |
1231 | int i, r; |
1232 | int k, off; |
1233 | |
1234 | memset((char*)uc, 0, uc_len); |
1235 | uc_len >>= 1; |
1236 | if (uc_len > 65536) |
1237 | uc_len = 65536; |
1238 | for (i = 0; (u32)i < uc_len; i++) |
1239 | uc[i] = cpu_to_le16(i); |
1240 | for (r = 0; uc_run_table[r][0]; r++) { |
1241 | off = uc_run_table[r][2]; |
1242 | for (i = uc_run_table[r][0]; i < uc_run_table[r][1]; i++) |
1243 | uc[i] = cpu_to_le16(i + off); |
1244 | } |
1245 | for (r = 0; uc_dup_table[r][0]; r++) |
1246 | for (i = uc_dup_table[r][0]; i < uc_dup_table[r][1]; i += 2) |
1247 | uc[i + 1] = cpu_to_le16(i); |
1248 | for (r = 0; uc_byte_table[r][0]; r++) { |
1249 | k = uc_byte_table[r][1]; |
1250 | uc[uc_byte_table[r][0]] = cpu_to_le16(k); |
1251 | } |
1252 | } |
1253 | |
1254 | /* |
1255 | * Allocate and build the default upcase table |
1256 | * |
1257 | * Returns the number of entries |
1258 | * 0 if failed |
1259 | */ |
1260 | |
1261 | #define UPCASE_LEN 65536 /* default number of entries in upcase */ |
1262 | |
1263 | u32 ntfs_upcase_build_default(ntfschar **upcase) |
1264 | { |
1265 | u32 upcase_len = 0; |
1266 | |
1267 | *upcase = (ntfschar*)ntfs_malloc(UPCASE_LEN*2); |
1268 | if (*upcase) { |
1269 | ntfs_upcase_table_build(*upcase, UPCASE_LEN*2); |
1270 | upcase_len = UPCASE_LEN; |
1271 | } |
1272 | return (upcase_len); |
1273 | } |
1274 | |
1275 | /* |
1276 | * Build a table for converting to lower case |
1277 | * |
1278 | * This is only meaningful when there is a single lower case |
1279 | * character leading to an upper case one, and currently the |
1280 | * only exception is the greek letter sigma which has a single |
1281 | * upper case glyph (code U+03A3), but two lower case glyphs |
1282 | * (code U+03C3 and U+03C2, the latter to be used at the end |
1283 | * of a word). In the following implementation the upper case |
1284 | * sigma will be lowercased as U+03C3. |
1285 | */ |
1286 | |
1287 | ntfschar *ntfs_locase_table_build(const ntfschar *uc, u32 uc_cnt) |
1288 | { |
1289 | ntfschar *lc; |
1290 | u32 upp; |
1291 | u32 i; |
1292 | |
1293 | lc = (ntfschar*)ntfs_malloc(uc_cnt*sizeof(ntfschar)); |
1294 | if (lc) { |
1295 | for (i=0; i<uc_cnt; i++) |
1296 | lc[i] = cpu_to_le16(i); |
1297 | for (i=0; i<uc_cnt; i++) { |
1298 | upp = le16_to_cpu(uc[i]); |
1299 | if ((upp != i) && (upp < uc_cnt)) |
1300 | lc[upp] = cpu_to_le16(i); |
1301 | } |
1302 | } else |
1303 | ntfs_log_error("Could not build the locase table\n"); |
1304 | return (lc); |
1305 | } |
1306 | |
1307 | /** |
1308 | * ntfs_str2ucs - convert a string to a valid NTFS file name |
1309 | * @s: input string |
1310 | * @len: length of output buffer in Unicode characters |
1311 | * |
1312 | * Convert the input @s string into the corresponding little endian, |
1313 | * 2-byte Unicode string. The length of the converted string is less |
1314 | * or equal to the maximum length allowed by the NTFS format (255). |
1315 | * |
1316 | * If @s is NULL then return AT_UNNAMED. |
1317 | * |
1318 | * On success the function returns the Unicode string in an allocated |
1319 | * buffer and the caller is responsible to free it when it's not needed |
1320 | * anymore. |
1321 | * |
1322 | * On error NULL is returned and errno is set to the error code. |
1323 | */ |
1324 | ntfschar *ntfs_str2ucs(const char *s, int *len) |
1325 | { |
1326 | ntfschar *ucs = NULL; |
1327 | |
1328 | if (s && ((*len = ntfs_mbstoucs(s, &ucs)) == -1)) { |
1329 | ntfs_log_perror("Couldn't convert '%s' to Unicode", s); |
1330 | return NULL; |
1331 | } |
1332 | if (*len > NTFS_MAX_NAME_LEN) { |
1333 | free(ucs); |
1334 | errno = ENAMETOOLONG; |
1335 | return NULL; |
1336 | } |
1337 | if (!ucs || !*len) { |
1338 | ucs = AT_UNNAMED; |
1339 | *len = 0; |
1340 | } |
1341 | return ucs; |
1342 | } |
1343 | |
1344 | /** |
1345 | * ntfs_ucsfree - free memory allocated by ntfs_str2ucs() |
1346 | * @ucs input string to be freed |
1347 | * |
1348 | * Free memory at @ucs and which was allocated by ntfs_str2ucs. |
1349 | * |
1350 | * Return value: none. |
1351 | */ |
1352 | void ntfs_ucsfree(ntfschar *ucs) |
1353 | { |
1354 | if (ucs && (ucs != AT_UNNAMED)) |
1355 | free(ucs); |
1356 | } |
1357 | |
1358 | /* |
1359 | * Check whether a name contains no chars forbidden |
1360 | * for DOS or Win32 use |
1361 | * |
1362 | * If there is a bad char, errno is set to EINVAL |
1363 | */ |
1364 | |
1365 | BOOL ntfs_forbidden_chars(const ntfschar *name, int len) |
1366 | { |
1367 | BOOL forbidden; |
1368 | int ch; |
1369 | int i; |
1370 | u32 mainset = (1L << ('\"' - 0x20)) |
1371 | | (1L << ('*' - 0x20)) |
1372 | | (1L << ('/' - 0x20)) |
1373 | | (1L << (':' - 0x20)) |
1374 | | (1L << ('<' - 0x20)) |
1375 | | (1L << ('>' - 0x20)) |
1376 | | (1L << ('?' - 0x20)); |
1377 | |
1378 | forbidden = (len == 0) |
1379 | || (le16_to_cpu(name[len-1]) == ' ') |
1380 | || (le16_to_cpu(name[len-1]) == '.'); |
1381 | for (i=0; i<len; i++) { |
1382 | ch = le16_to_cpu(name[i]); |
1383 | if ((ch < 0x20) |
1384 | || ((ch < 0x40) |
1385 | && ((1L << (ch - 0x20)) & mainset)) |
1386 | || (ch == '\\') |
1387 | || (ch == '|')) |
1388 | forbidden = TRUE; |
1389 | } |
1390 | if (forbidden) |
1391 | errno = EINVAL; |
1392 | return (forbidden); |
1393 | } |
1394 | |
1395 | /* |
1396 | * Check whether the same name can be used as a DOS and |
1397 | * a Win32 name |
1398 | * |
1399 | * The names must be the same, or the short name the uppercase |
1400 | * variant of the long name |
1401 | */ |
1402 | |
1403 | BOOL ntfs_collapsible_chars(ntfs_volume *vol, |
1404 | const ntfschar *shortname, int shortlen, |
1405 | const ntfschar *longname, int longlen) |
1406 | { |
1407 | BOOL collapsible; |
1408 | unsigned int ch; |
1409 | unsigned int cs; |
1410 | int i; |
1411 | |
1412 | collapsible = shortlen == longlen; |
1413 | for (i=0; collapsible && (i<shortlen); i++) { |
1414 | ch = le16_to_cpu(longname[i]); |
1415 | cs = le16_to_cpu(shortname[i]); |
1416 | if ((cs != ch) |
1417 | && ((ch >= vol->upcase_len) |
1418 | || (cs >= vol->upcase_len) |
1419 | || (vol->upcase[cs] != vol->upcase[ch]))) |
1420 | collapsible = FALSE; |
1421 | } |
1422 | return (collapsible); |
1423 | } |
1424 | |
1425 | /* |
1426 | * Define the character encoding to be used. |
1427 | * Use UTF-8 unless specified otherwise. |
1428 | */ |
1429 | |
1430 | int ntfs_set_char_encoding(const char *locale) |
1431 | { |
1432 | use_utf8 = 0; |
1433 | if (!locale || strstr(locale,"utf8") || strstr(locale,"UTF8") |
1434 | || strstr(locale,"utf-8") || strstr(locale,"UTF-8")) |
1435 | use_utf8 = 1; |
1436 | else |
1437 | if (setlocale(LC_ALL, locale)) |
1438 | use_utf8 = 0; |
1439 | else { |
1440 | ntfs_log_error("Invalid locale, encoding to UTF-8\n"); |
1441 | use_utf8 = 1; |
1442 | } |
1443 | return 0; /* always successful */ |
1444 | } |
1445 | |
1446 | #if defined(__APPLE__) || defined(__DARWIN__) |
1447 | |
/*
 *		Enable or disable the normalization of file names
 *
 *	The argument must be 0 or 1, it is recorded in nfconvert_utf8.
 *
 *	Returns 0 if the setting was recorded,
 *		-1 if the argument is invalid or if normalization
 *		support (ENABLE_NFCONV) was not compiled in
 */
int ntfs_macosx_normalize_filenames(int normalize) {
#ifdef ENABLE_NFCONV
	if ((normalize != 0) && (normalize != 1))
		return -1;
	nfconvert_utf8 = normalize;
	return 0;
#else
	return -1;
#endif /* ENABLE_NFCONV */
}
1460 | |
/*
 *		Normalize a UTF-8 string through CoreFoundation
 *
 *	The string is normalized to form C when composed is non-zero,
 *	and to form D otherwise. On success the result is returned in
 *	*target as a newly allocated '\0'-terminated buffer which the
 *	caller has to free.
 *
 *	Returns the size of the result in bytes (without the terminator),
 *		-2 if the input could not be made a CFString,
 *		-3 if the mutable copy could not be created,
 *		-1 for any other failure, or if normalization support
 *		(ENABLE_NFCONV) was not compiled in
 */
int ntfs_macosx_normalize_utf8(const char *utf8_string, char **target,
		int composed) {
#ifdef ENABLE_NFCONV
	/* For this code to compile, the CoreFoundation framework must be fed to the linker. */
	CFStringRef source;
	CFMutableStringRef work;
	CFRange whole;
	CFIndex needed;
	char *out = NULL;
	int out_size = -1;

	/* Wrap the UTF-8 input into a CFString. */
	source = CFStringCreateWithCString(kCFAllocatorDefault, utf8_string, kCFStringEncodingUTF8);
	if (source == NULL) {
		ntfs_log_error("CFStringCreateWithCString failed!\n");
		return -2;
	}

	/* Work on a mutable copy, the source is not needed any more. */
	work = CFStringCreateMutableCopy(kCFAllocatorDefault, 0, source);
	CFRelease(source);
	if (work == NULL) {
		ntfs_log_error("CFStringCreateMutableCopy failed!\n");
		return -3;
	}

	/* Normalize in place to the requested form. */
	CFStringNormalize(work, (composed != 0 ? kCFStringNormalizationFormC : kCFStringNormalizationFormD));

	/* First pass without a buffer : only get the required size. */
	whole = CFRangeMake(0, CFStringGetLength(work));
	if (CFStringGetBytes(work, whole, kCFStringEncodingUTF8, 0, false, NULL, 0, &needed) <= 0) {
		ntfs_log_error("Could not perform check for required length of UTF-8 conversion of normalized CFMutableString.\n");
		goto release;
	}

	/* One more byte for the terminator, the buffer is zero-filled. */
	out_size = sizeof(char)*(needed + 1);
	out = ntfs_calloc(out_size);
	if (out == NULL) {
		ntfs_log_error("Could not perform a ntfs_calloc of %d bytes for char *result.\n", out_size);
		goto release;
	}

	/* Second pass : do the actual conversion into the buffer. */
	if (CFStringGetBytes(work, whole, kCFStringEncodingUTF8,
			0, false, (UInt8*)out, out_size-1, &needed) <= 0) {
		ntfs_log_error("Could not perform UTF-8 conversion of normalized CFMutableString.\n");
		free(out);
		out = NULL;
	}

release:
	CFRelease(work);

	if (out == NULL)
		return -1;
	*target = out;
	return out_size - 1;
#else
	return -1;
#endif /* ENABLE_NFCONV */
}
1523 | #endif /* defined(__APPLE__) || defined(__DARWIN__) */ |
1524 |