summaryrefslogtreecommitdiff
path: root/libntfs-3g/unistr.c (plain)
blob: 9a2380145e99452b8437ddc37e4a8fcd2d1a763e
1/**
2 * unistr.c - Unicode string handling. Originated from the Linux-NTFS project.
3 *
4 * Copyright (c) 2000-2004 Anton Altaparmakov
5 * Copyright (c) 2002-2009 Szabolcs Szakacsits
6 * Copyright (c) 2008-2011 Jean-Pierre Andre
7 * Copyright (c) 2008 Bernhard Kaindl
8 *
9 * This program/include file is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License as published
11 * by the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program/include file is distributed in the hope that it will be
15 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
16 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program (in the main directory of the NTFS-3G
21 * distribution in the file COPYING); if not, write to the Free Software
22 * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23 */
24
25#ifdef HAVE_CONFIG_H
26#include "config.h"
27#endif
28
29#ifdef HAVE_STDIO_H
30#include <stdio.h>
31#endif
32#ifdef HAVE_STDLIB_H
33#include <stdlib.h>
34#endif
35#ifdef HAVE_WCHAR_H
36#include <wchar.h>
37#endif
38#ifdef HAVE_STRING_H
39#include <string.h>
40#endif
41#ifdef HAVE_ERRNO_H
42#include <errno.h>
43#endif
44#ifdef HAVE_LOCALE_H
45#include <locale.h>
46#endif
47
48#if defined(__APPLE__) || defined(__DARWIN__)
49#ifdef ENABLE_NFCONV
50#include <CoreFoundation/CoreFoundation.h>
51#endif /* ENABLE_NFCONV */
52#endif /* defined(__APPLE__) || defined(__DARWIN__) */
53
54#include "compat.h"
55#include "attrib.h"
56#include "types.h"
57#include "unistr.h"
58#include "debug.h"
59#include "logging.h"
60#include "misc.h"
61
62#define NOREVBOM 0 /* JPA rejecting U+FFFE and U+FFFF, open to debate */
63
64/*
65 * IMPORTANT
66 * =========
67 *
68 * All these routines assume that the Unicode characters are in little endian
69 * encoding inside the strings!!!
70 */
71
72static int use_utf8 = 1; /* use UTF-8 encoding for file names */
73
74#if defined(__APPLE__) || defined(__DARWIN__)
75#ifdef ENABLE_NFCONV
76/**
77 * This variable controls whether or not automatic normalization form conversion
78 * should be performed when translating NTFS unicode file names to UTF-8.
79 * Defaults to on, but can be controlled from the outside using the function
80 * int ntfs_macosx_normalize_filenames(int normalize);
81 */
82static int nfconvert_utf8 = 1;
83#endif /* ENABLE_NFCONV */
84#endif /* defined(__APPLE__) || defined(__DARWIN__) */
85
86/*
87 * This is used by the name collation functions to quickly determine what
88 * characters are (in)valid.
89 */
90#if 0
91static const u8 legal_ansi_char_array[0x40] = {
92 0x00, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
93 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
94
95 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
96 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
97
98 0x17, 0x07, 0x18, 0x17, 0x17, 0x17, 0x17, 0x17,
99 0x17, 0x17, 0x18, 0x16, 0x16, 0x17, 0x07, 0x00,
100
101 0x17, 0x17, 0x17, 0x17, 0x17, 0x17, 0x17, 0x17,
102 0x17, 0x17, 0x04, 0x16, 0x18, 0x16, 0x18, 0x18,
103};
104#endif
105
106/**
107 * ntfs_names_are_equal - compare two Unicode names for equality
108 * @s1: name to compare to @s2
109 * @s1_len: length in Unicode characters of @s1
110 * @s2: name to compare to @s1
111 * @s2_len: length in Unicode characters of @s2
112 * @ic: ignore case bool
113 * @upcase: upcase table (only if @ic == IGNORE_CASE)
114 * @upcase_size: length in Unicode characters of @upcase (if present)
115 *
116 * Compare the names @s1 and @s2 and return TRUE (1) if the names are
117 * identical, or FALSE (0) if they are not identical. If @ic is IGNORE_CASE,
118 * the @upcase table is used to perform a case insensitive comparison.
119 */
120BOOL ntfs_names_are_equal(const ntfschar *s1, size_t s1_len,
121 const ntfschar *s2, size_t s2_len,
122 const IGNORE_CASE_BOOL ic,
123 const ntfschar *upcase, const u32 upcase_size)
124{
125 if (s1_len != s2_len)
126 return FALSE;
127 if (!s1_len)
128 return TRUE;
129 if (ic == CASE_SENSITIVE)
130 return ntfs_ucsncmp(s1, s2, s1_len) ? FALSE: TRUE;
131 return ntfs_ucsncasecmp(s1, s2, s1_len, upcase, upcase_size) ? FALSE:
132 TRUE;
133}
134
135/*
136 * ntfs_names_full_collate() fully collate two Unicode names
137 *
138 * @name1: first Unicode name to compare
139 * @name1_len: length of first Unicode name to compare
140 * @name2: second Unicode name to compare
141 * @name2_len: length of second Unicode name to compare
142 * @ic: either CASE_SENSITIVE or IGNORE_CASE
143 * @upcase: upcase table (ignored if @ic is CASE_SENSITIVE)
144 * @upcase_len: upcase table size (ignored if @ic is CASE_SENSITIVE)
145 *
146 * -1 if the first name collates before the second one,
147 * 0 if the names match,
148 * 1 if the second name collates before the first one, or
149 *
150 */
151int ntfs_names_full_collate(const ntfschar *name1, const u32 name1_len,
152 const ntfschar *name2, const u32 name2_len,
153 const IGNORE_CASE_BOOL ic, const ntfschar *upcase,
154 const u32 upcase_len)
155{
156 u32 cnt;
157 u16 c1, c2;
158 u16 u1, u2;
159
160#ifdef DEBUG
161 if (!name1 || !name2 || (ic && (!upcase || !upcase_len))) {
162 ntfs_log_debug("ntfs_names_collate received NULL pointer!\n");
163 exit(1);
164 }
165#endif
166 cnt = min(name1_len, name2_len);
167 if (cnt > 0) {
168 if (ic == CASE_SENSITIVE) {
169 while (--cnt && (*name1 == *name2)) {
170 name1++;
171 name2++;
172 }
173 u1 = c1 = le16_to_cpu(*name1);
174 u2 = c2 = le16_to_cpu(*name2);
175 if (u1 < upcase_len)
176 u1 = le16_to_cpu(upcase[u1]);
177 if (u2 < upcase_len)
178 u2 = le16_to_cpu(upcase[u2]);
179 if ((u1 == u2) && cnt)
180 do {
181 name1++;
182 u1 = le16_to_cpu(*name1);
183 name2++;
184 u2 = le16_to_cpu(*name2);
185 if (u1 < upcase_len)
186 u1 = le16_to_cpu(upcase[u1]);
187 if (u2 < upcase_len)
188 u2 = le16_to_cpu(upcase[u2]);
189 } while ((u1 == u2) && --cnt);
190 if (u1 < u2)
191 return -1;
192 if (u1 > u2)
193 return 1;
194 if (name1_len < name2_len)
195 return -1;
196 if (name1_len > name2_len)
197 return 1;
198 if (c1 < c2)
199 return -1;
200 if (c1 > c2)
201 return 1;
202 } else {
203 do {
204 u1 = c1 = le16_to_cpu(*name1);
205 name1++;
206 u2 = c2 = le16_to_cpu(*name2);
207 name2++;
208 if (u1 < upcase_len)
209 u1 = le16_to_cpu(upcase[u1]);
210 if (u2 < upcase_len)
211 u2 = le16_to_cpu(upcase[u2]);
212 } while ((u1 == u2) && --cnt);
213 if (u1 < u2)
214 return -1;
215 if (u1 > u2)
216 return 1;
217 if (name1_len < name2_len)
218 return -1;
219 if (name1_len > name2_len)
220 return 1;
221 }
222 } else {
223 if (name1_len < name2_len)
224 return -1;
225 if (name1_len > name2_len)
226 return 1;
227 }
228 return 0;
229}
230
231/**
232 * ntfs_ucsncmp - compare two little endian Unicode strings
233 * @s1: first string
234 * @s2: second string
235 * @n: maximum unicode characters to compare
236 *
237 * Compare the first @n characters of the Unicode strings @s1 and @s2,
238 * The strings in little endian format and appropriate le16_to_cpu()
239 * conversion is performed on non-little endian machines.
240 *
241 * The function returns an integer less than, equal to, or greater than zero
242 * if @s1 (or the first @n Unicode characters thereof) is found, respectively,
243 * to be less than, to match, or be greater than @s2.
244 */
245int ntfs_ucsncmp(const ntfschar *s1, const ntfschar *s2, size_t n)
246{
247 ntfschar c1, c2;
248 size_t i;
249
250#ifdef DEBUG
251 if (!s1 || !s2) {
252 ntfs_log_debug("ntfs_wcsncmp() received NULL pointer!\n");
253 exit(1);
254 }
255#endif
256 for (i = 0; i < n; ++i) {
257 c1 = le16_to_cpu(s1[i]);
258 c2 = le16_to_cpu(s2[i]);
259 if (c1 < c2)
260 return -1;
261 if (c1 > c2)
262 return 1;
263 if (!c1)
264 break;
265 }
266 return 0;
267}
268
269/**
270 * ntfs_ucsncasecmp - compare two little endian Unicode strings, ignoring case
271 * @s1: first string
272 * @s2: second string
273 * @n: maximum unicode characters to compare
274 * @upcase: upcase table
275 * @upcase_size: upcase table size in Unicode characters
276 *
277 * Compare the first @n characters of the Unicode strings @s1 and @s2,
278 * ignoring case. The strings in little endian format and appropriate
279 * le16_to_cpu() conversion is performed on non-little endian machines.
280 *
281 * Each character is uppercased using the @upcase table before the comparison.
282 *
283 * The function returns an integer less than, equal to, or greater than zero
284 * if @s1 (or the first @n Unicode characters thereof) is found, respectively,
285 * to be less than, to match, or be greater than @s2.
286 */
287int ntfs_ucsncasecmp(const ntfschar *s1, const ntfschar *s2, size_t n,
288 const ntfschar *upcase, const u32 upcase_size)
289{
290 u16 c1, c2;
291 size_t i;
292
293#ifdef DEBUG
294 if (!s1 || !s2 || !upcase) {
295 ntfs_log_debug("ntfs_wcsncasecmp() received NULL pointer!\n");
296 exit(1);
297 }
298#endif
299 for (i = 0; i < n; ++i) {
300 if ((c1 = le16_to_cpu(s1[i])) < upcase_size)
301 c1 = le16_to_cpu(upcase[c1]);
302 if ((c2 = le16_to_cpu(s2[i])) < upcase_size)
303 c2 = le16_to_cpu(upcase[c2]);
304 if (c1 < c2)
305 return -1;
306 if (c1 > c2)
307 return 1;
308 if (!c1)
309 break;
310 }
311 return 0;
312}
313
314/**
315 * ntfs_ucsnlen - determine the length of a little endian Unicode string
316 * @s: pointer to Unicode string
317 * @maxlen: maximum length of string @s
318 *
319 * Return the number of Unicode characters in the little endian Unicode
320 * string @s up to a maximum of maxlen Unicode characters, not including
321 * the terminating (ntfschar)'\0'. If there is no (ntfschar)'\0' between @s
322 * and @s + @maxlen, @maxlen is returned.
323 *
324 * This function never looks beyond @s + @maxlen.
325 */
326u32 ntfs_ucsnlen(const ntfschar *s, u32 maxlen)
327{
328 u32 i;
329
330 for (i = 0; i < maxlen; i++) {
331 if (!le16_to_cpu(s[i]))
332 break;
333 }
334 return i;
335}
336
337/**
338 * ntfs_ucsndup - duplicate little endian Unicode string
339 * @s: pointer to Unicode string
340 * @maxlen: maximum length of string @s
341 *
342 * Return a pointer to a new little endian Unicode string which is a duplicate
343 * of the string s. Memory for the new string is obtained with ntfs_malloc(3),
344 * and can be freed with free(3).
345 *
346 * A maximum of @maxlen Unicode characters are copied and a terminating
347 * (ntfschar)'\0' little endian Unicode character is added.
348 *
349 * This function never looks beyond @s + @maxlen.
350 *
351 * Return a pointer to the new little endian Unicode string on success and NULL
352 * on failure with errno set to the error code.
353 */
354ntfschar *ntfs_ucsndup(const ntfschar *s, u32 maxlen)
355{
356 ntfschar *dst;
357 u32 len;
358
359 len = ntfs_ucsnlen(s, maxlen);
360 dst = ntfs_malloc((len + 1) * sizeof(ntfschar));
361 if (dst) {
362 memcpy(dst, s, len * sizeof(ntfschar));
363 dst[len] = cpu_to_le16(L'\0');
364 }
365 return dst;
366}
367
368/**
369 * ntfs_name_upcase - Map an Unicode name to its uppercase equivalent
370 * @name:
371 * @name_len:
372 * @upcase:
373 * @upcase_len:
374 *
375 * Description...
376 *
377 * Returns:
378 */
379void ntfs_name_upcase(ntfschar *name, u32 name_len, const ntfschar *upcase,
380 const u32 upcase_len)
381{
382 u32 i;
383 u16 u;
384
385 for (i = 0; i < name_len; i++)
386 if ((u = le16_to_cpu(name[i])) < upcase_len)
387 name[i] = upcase[u];
388}
389
390/**
391 * ntfs_name_locase - Map a Unicode name to its lowercase equivalent
392 */
393void ntfs_name_locase(ntfschar *name, u32 name_len, const ntfschar *locase,
394 const u32 locase_len)
395{
396 u32 i;
397 u16 u;
398
399 if (locase)
400 for (i = 0; i < name_len; i++)
401 if ((u = le16_to_cpu(name[i])) < locase_len)
402 name[i] = locase[u];
403}
404
405/**
406 * ntfs_file_value_upcase - Convert a filename to upper case
407 * @file_name_attr:
408 * @upcase:
409 * @upcase_len:
410 *
411 * Description...
412 *
413 * Returns:
414 */
415void ntfs_file_value_upcase(FILE_NAME_ATTR *file_name_attr,
416 const ntfschar *upcase, const u32 upcase_len)
417{
418 ntfs_name_upcase((ntfschar*)&file_name_attr->file_name,
419 file_name_attr->file_name_length, upcase, upcase_len);
420}
421
/*
   NTFS uses Unicode (UTF-16LE [NTFS-3G uses UCS-2LE, which is enough
   for now]) for path names, but the Unicode code points need to be
   converted before a path can be accessed under NTFS. For 7 bit ASCII/ANSI,
   glibc does this even without a locale in a hard-coded fashion, as that
   appears to be easy because the low 7-bit ASCII range appears to be
   available in all charsets. However, it does not convert anything if
   there was some error with the locale setup, or if none was set up, as
   when mount is called during early boot, where (by policy) locales are
   not used (and may not be available if /usr is not yet mounted).
   This patch fixes the resulting issues for systems which use UTF-8;
   for others, specifying the locale in fstab brings them the encoding
   which they want.

   If no locale is defined or there was a problem with setting one
   up, and whenever nl_langinfo(CODESET) returns a string starting with
   "ANSI", use an internal UCS-2LE <-> UTF-8 codeset converter to fix
   the bug where NTFS-3G does not show any path names which include
   international characters (and also fails on creating them).

   Author: Bernhard Kaindl <bk@suse.de>
   Jean-Pierre Andre made it compliant with RFC3629/RFC2781.
*/
445
446/*
447 * Return the amount of 8-bit elements in UTF-8 needed (without the terminating
448 * null) to store a given UTF-16LE string.
449 *
450 * Return -1 with errno set if string has invalid byte sequence or too long.
451 */
452static int utf16_to_utf8_size(const ntfschar *ins, const int ins_len, int outs_len)
453{
454 int i, ret = -1;
455 int count = 0;
456 BOOL surrog;
457
458 surrog = FALSE;
459 for (i = 0; i < ins_len && ins[i]; i++) {
460 unsigned short c = le16_to_cpu(ins[i]);
461 if (surrog) {
462 if ((c >= 0xdc00) && (c < 0xe000)) {
463 surrog = FALSE;
464 count += 4;
465 } else
466 goto fail;
467 } else
468 if (c < 0x80)
469 count++;
470 else if (c < 0x800)
471 count += 2;
472 else if (c < 0xd800)
473 count += 3;
474 else if (c < 0xdc00)
475 surrog = TRUE;
476#if NOREVBOM
477 else if ((c >= 0xe000) && (c < 0xfffe))
478#else
479 else if (c >= 0xe000)
480#endif
481 count += 3;
482 else
483 goto fail;
484 if (count > outs_len) {
485 errno = ENAMETOOLONG;
486 goto out;
487 }
488 }
489 if (surrog)
490 goto fail;
491
492 ret = count;
493out:
494 return ret;
495fail:
496 errno = EILSEQ;
497 goto out;
498}
499
500/*
501 * ntfs_utf16_to_utf8 - convert a little endian UTF16LE string to an UTF-8 string
502 * @ins: input utf16 string buffer
503 * @ins_len: length of input string in utf16 characters
504 * @outs: on return contains the (allocated) output multibyte string
505 * @outs_len: length of output buffer in bytes
506 *
507 * Return -1 with errno set if string has invalid byte sequence or too long.
508 */
509static int ntfs_utf16_to_utf8(const ntfschar *ins, const int ins_len,
510 char **outs, int outs_len)
511{
512#if defined(__APPLE__) || defined(__DARWIN__)
513#ifdef ENABLE_NFCONV
514 char *original_outs_value = *outs;
515 int original_outs_len = outs_len;
516#endif /* ENABLE_NFCONV */
517#endif /* defined(__APPLE__) || defined(__DARWIN__) */
518
519 char *t;
520 int i, size, ret = -1;
521 int halfpair;
522
523 halfpair = 0;
524 if (!*outs)
525 outs_len = PATH_MAX;
526
527 size = utf16_to_utf8_size(ins, ins_len, outs_len);
528
529 if (size < 0)
530 goto out;
531
532 if (!*outs) {
533 outs_len = size + 1;
534 *outs = ntfs_malloc(outs_len);
535 if (!*outs)
536 goto out;
537 }
538
539 t = *outs;
540
541 for (i = 0; i < ins_len && ins[i]; i++) {
542 unsigned short c = le16_to_cpu(ins[i]);
543 /* size not double-checked */
544 if (halfpair) {
545 if ((c >= 0xdc00) && (c < 0xe000)) {
546 *t++ = 0xf0 + (((halfpair + 64) >> 8) & 7);
547 *t++ = 0x80 + (((halfpair + 64) >> 2) & 63);
548 *t++ = 0x80 + ((c >> 6) & 15) + ((halfpair & 3) << 4);
549 *t++ = 0x80 + (c & 63);
550 halfpair = 0;
551 } else
552 goto fail;
553 } else if (c < 0x80) {
554 *t++ = c;
555 } else {
556 if (c < 0x800) {
557 *t++ = (0xc0 | ((c >> 6) & 0x3f));
558 *t++ = 0x80 | (c & 0x3f);
559 } else if (c < 0xd800) {
560 *t++ = 0xe0 | (c >> 12);
561 *t++ = 0x80 | ((c >> 6) & 0x3f);
562 *t++ = 0x80 | (c & 0x3f);
563 } else if (c < 0xdc00)
564 halfpair = c;
565 else if (c >= 0xe000) {
566 *t++ = 0xe0 | (c >> 12);
567 *t++ = 0x80 | ((c >> 6) & 0x3f);
568 *t++ = 0x80 | (c & 0x3f);
569 } else
570 goto fail;
571 }
572 }
573 *t = '\0';
574
575#if defined(__APPLE__) || defined(__DARWIN__)
576#ifdef ENABLE_NFCONV
577 if(nfconvert_utf8 && (t - *outs) > 0) {
578 char *new_outs = NULL;
579 int new_outs_len = ntfs_macosx_normalize_utf8(*outs, &new_outs, 0); // Normalize to decomposed form
580 if(new_outs_len >= 0 && new_outs != NULL) {
581 if(original_outs_value != *outs) {
582 // We have allocated outs ourselves.
583 free(*outs);
584 *outs = new_outs;
585 t = *outs + new_outs_len;
586 }
587 else {
588 // We need to copy new_outs into the fixed outs buffer.
589 memset(*outs, 0, original_outs_len);
590 strncpy(*outs, new_outs, original_outs_len-1);
591 t = *outs + original_outs_len;
592 free(new_outs);
593 }
594 }
595 else {
596 ntfs_log_error("Failed to normalize NTFS string to UTF-8 NFD: %s\n", *outs);
597 ntfs_log_error(" new_outs=0x%p\n", new_outs);
598 ntfs_log_error(" new_outs_len=%d\n", new_outs_len);
599 }
600 }
601#endif /* ENABLE_NFCONV */
602#endif /* defined(__APPLE__) || defined(__DARWIN__) */
603
604 ret = t - *outs;
605out:
606 return ret;
607fail:
608 errno = EILSEQ;
609 goto out;
610}
611
612/*
613 * Return the amount of 16-bit elements in UTF-16LE needed
614 * (without the terminating null) to store given UTF-8 string.
615 *
616 * Return -1 with errno set if it's longer than PATH_MAX or string is invalid.
617 *
618 * Note: This does not check whether the input sequence is a valid utf8 string,
619 * and should be used only in context where such check is made!
620 */
static int utf8_to_utf16_size(const char *s)
{
	const unsigned char *p = (const unsigned char *)s;
	size_t units = 0;
	unsigned int lead;

	while ((lead = *p++) != 0) {
		/* Every sequence yields at least one 16-bit element. */
		if (++units >= PATH_MAX)
			goto too_long;
		/* Single bytes (and stray continuation bytes) are done. */
		if (lead < 0xc0)
			continue;
		/* No UTF-8 sequence may start with 0xF5 or above. */
		if (lead >= 0xF5) {
			errno = EILSEQ;
			return -1;
		}
		/*
		 * Skip the bytes which follow the lead byte (one, two or
		 * three, according to the lead), never stepping over the
		 * terminating null.
		 */
		if (!*p)
			break;
		p++;
		if (!*p)
			break;
		if (lead >= 0xE0)
			p++;
		if (!*p)
			break;
		if (lead >= 0xF0) {
			p++;
			/* Four-byte sequences need a surrogate pair. */
			if (++units >= PATH_MAX)
				goto too_long;
		}
	}
	return (int)units;
too_long:
	errno = ENAMETOOLONG;
	return -1;
}
659/*
660 * This converts one UTF-8 sequence to cpu-endian Unicode value
661 * within range U+0 .. U+10ffff and excluding U+D800 .. U+DFFF
662 *
663 * Return the number of used utf8 bytes or -1 with errno set
664 * if sequence is invalid.
665 */
666static int utf8_to_unicode(u32 *wc, const char *s)
667{
668 unsigned int byte = *((const unsigned char *)s);
669
670 /* single byte */
671 if (byte == 0) {
672 *wc = (u32) 0;
673 return 0;
674 } else if (byte < 0x80) {
675 *wc = (u32) byte;
676 return 1;
677 /* double byte */
678 } else if (byte < 0xc2) {
679 goto fail;
680 } else if (byte < 0xE0) {
681 if ((s[1] & 0xC0) == 0x80) {
682 *wc = ((u32)(byte & 0x1F) << 6)
683 | ((u32)(s[1] & 0x3F));
684 return 2;
685 } else
686 goto fail;
687 /* three-byte */
688 } else if (byte < 0xF0) {
689 if (((s[1] & 0xC0) == 0x80) && ((s[2] & 0xC0) == 0x80)) {
690 *wc = ((u32)(byte & 0x0F) << 12)
691 | ((u32)(s[1] & 0x3F) << 6)
692 | ((u32)(s[2] & 0x3F));
693 /* Check valid ranges */
694#if NOREVBOM
695 if (((*wc >= 0x800) && (*wc <= 0xD7FF))
696 || ((*wc >= 0xe000) && (*wc <= 0xFFFD)))
697 return 3;
698#else
699 if (((*wc >= 0x800) && (*wc <= 0xD7FF))
700 || ((*wc >= 0xe000) && (*wc <= 0xFFFF)))
701 return 3;
702#endif
703 }
704 goto fail;
705 /* four-byte */
706 } else if (byte < 0xF5) {
707 if (((s[1] & 0xC0) == 0x80) && ((s[2] & 0xC0) == 0x80)
708 && ((s[3] & 0xC0) == 0x80)) {
709 *wc = ((u32)(byte & 0x07) << 18)
710 | ((u32)(s[1] & 0x3F) << 12)
711 | ((u32)(s[2] & 0x3F) << 6)
712 | ((u32)(s[3] & 0x3F));
713 /* Check valid ranges */
714 if ((*wc <= 0x10ffff) && (*wc >= 0x10000))
715 return 4;
716 }
717 goto fail;
718 }
719fail:
720 errno = EILSEQ;
721 return -1;
722}
723
724/**
725 * ntfs_utf8_to_utf16 - convert a UTF-8 string to a UTF-16LE string
726 * @ins: input multibyte string buffer
727 * @outs: on return contains the (allocated) output utf16 string
728 * @outs_len: length of output buffer in utf16 characters
729 *
730 * Return -1 with errno set.
731 */
732static int ntfs_utf8_to_utf16(const char *ins, ntfschar **outs)
733{
734#if defined(__APPLE__) || defined(__DARWIN__)
735#ifdef ENABLE_NFCONV
736 char *new_ins = NULL;
737 if(nfconvert_utf8) {
738 int new_ins_len;
739 new_ins_len = ntfs_macosx_normalize_utf8(ins, &new_ins, 1); // Normalize to composed form
740 if(new_ins_len >= 0)
741 ins = new_ins;
742 else
743 ntfs_log_error("Failed to normalize NTFS string to UTF-8 NFC: %s\n", ins);
744 }
745#endif /* ENABLE_NFCONV */
746#endif /* defined(__APPLE__) || defined(__DARWIN__) */
747 const char *t = ins;
748 u32 wc;
749 BOOL allocated;
750 ntfschar *outpos;
751 int shorts, ret = -1;
752
753 shorts = utf8_to_utf16_size(ins);
754 if (shorts < 0)
755 goto fail;
756
757 allocated = FALSE;
758 if (!*outs) {
759 *outs = ntfs_malloc((shorts + 1) * sizeof(ntfschar));
760 if (!*outs)
761 goto fail;
762 allocated = TRUE;
763 }
764
765 outpos = *outs;
766
767 while(1) {
768 int m = utf8_to_unicode(&wc, t);
769 if (m <= 0) {
770 if (m < 0) {
771 /* do not leave space allocated if failed */
772 if (allocated) {
773 free(*outs);
774 *outs = (ntfschar*)NULL;
775 }
776 goto fail;
777 }
778 *outpos++ = const_cpu_to_le16(0);
779 break;
780 }
781 if (wc < 0x10000)
782 *outpos++ = cpu_to_le16(wc);
783 else {
784 wc -= 0x10000;
785 *outpos++ = cpu_to_le16((wc >> 10) + 0xd800);
786 *outpos++ = cpu_to_le16((wc & 0x3ff) + 0xdc00);
787 }
788 t += m;
789 }
790
791 ret = --outpos - *outs;
792fail:
793#if defined(__APPLE__) || defined(__DARWIN__)
794#ifdef ENABLE_NFCONV
795 if(new_ins != NULL)
796 free(new_ins);
797#endif /* ENABLE_NFCONV */
798#endif /* defined(__APPLE__) || defined(__DARWIN__) */
799 return ret;
800}
801
802/**
803 * ntfs_ucstombs - convert a little endian Unicode string to a multibyte string
804 * @ins: input Unicode string buffer
805 * @ins_len: length of input string in Unicode characters
806 * @outs: on return contains the (allocated) output multibyte string
807 * @outs_len: length of output buffer in bytes
808 *
809 * Convert the input little endian, 2-byte Unicode string @ins, of length
810 * @ins_len into the multibyte string format dictated by the current locale.
811 *
812 * If *@outs is NULL, the function allocates the string and the caller is
813 * responsible for calling free(*@outs); when finished with it.
814 *
815 * On success the function returns the number of bytes written to the output
816 * string *@outs (>= 0), not counting the terminating NULL byte. If the output
817 * string buffer was allocated, *@outs is set to it.
818 *
819 * On error, -1 is returned, and errno is set to the error code. The following
820 * error codes can be expected:
821 * EINVAL Invalid arguments (e.g. @ins or @outs is NULL).
822 * EILSEQ The input string cannot be represented as a multibyte
823 * sequence according to the current locale.
824 * ENAMETOOLONG Destination buffer is too small for input string.
825 * ENOMEM Not enough memory to allocate destination buffer.
826 */
827int ntfs_ucstombs(const ntfschar *ins, const int ins_len, char **outs,
828 int outs_len)
829{
830 char *mbs;
831 int mbs_len;
832#ifdef MB_CUR_MAX
833 wchar_t wc;
834 int i, o;
835 int cnt = 0;
836#ifdef HAVE_MBSINIT
837 mbstate_t mbstate;
838#endif
839#endif /* MB_CUR_MAX */
840
841 if (!ins || !outs) {
842 errno = EINVAL;
843 return -1;
844 }
845 mbs = *outs;
846 mbs_len = outs_len;
847 if (mbs && !mbs_len) {
848 errno = ENAMETOOLONG;
849 return -1;
850 }
851 if (use_utf8)
852 return ntfs_utf16_to_utf8(ins, ins_len, outs, outs_len);
853#ifdef MB_CUR_MAX
854 if (!mbs) {
855 mbs_len = (ins_len + 1) * MB_CUR_MAX;
856 mbs = ntfs_malloc(mbs_len);
857 if (!mbs)
858 return -1;
859 }
860#ifdef HAVE_MBSINIT
861 memset(&mbstate, 0, sizeof(mbstate));
862#else
863 wctomb(NULL, 0);
864#endif
865 for (i = o = 0; i < ins_len; i++) {
866 /* Reallocate memory if necessary or abort. */
867 if ((int)(o + MB_CUR_MAX) > mbs_len) {
868 char *tc;
869 if (mbs == *outs) {
870 errno = ENAMETOOLONG;
871 return -1;
872 }
873 tc = ntfs_malloc((mbs_len + 64) & ~63);
874 if (!tc)
875 goto err_out;
876 memcpy(tc, mbs, mbs_len);
877 mbs_len = (mbs_len + 64) & ~63;
878 free(mbs);
879 mbs = tc;
880 }
881 /* Convert the LE Unicode character to a CPU wide character. */
882 wc = (wchar_t)le16_to_cpu(ins[i]);
883 if (!wc)
884 break;
885 /* Convert the CPU endian wide character to multibyte. */
886#ifdef HAVE_MBSINIT
887 cnt = wcrtomb(mbs + o, wc, &mbstate);
888#else
889 cnt = wctomb(mbs + o, wc);
890#endif
891 if (cnt == -1)
892 goto err_out;
893 if (cnt <= 0) {
894 ntfs_log_debug("Eeek. cnt <= 0, cnt = %i\n", cnt);
895 errno = EINVAL;
896 goto err_out;
897 }
898 o += cnt;
899 }
900#ifdef HAVE_MBSINIT
901 /* Make sure we are back in the initial state. */
902 if (!mbsinit(&mbstate)) {
903 ntfs_log_debug("Eeek. mbstate not in initial state!\n");
904 errno = EILSEQ;
905 goto err_out;
906 }
907#endif
908 /* Now write the NULL character. */
909 mbs[o] = '\0';
910 if (*outs != mbs)
911 *outs = mbs;
912 return o;
913err_out:
914 if (mbs != *outs) {
915 int eo = errno;
916 free(mbs);
917 errno = eo;
918 }
919#else /* MB_CUR_MAX */
920 errno = EILSEQ;
921#endif /* MB_CUR_MAX */
922 return -1;
923}
924
925/**
926 * ntfs_mbstoucs - convert a multibyte string to a little endian Unicode string
927 * @ins: input multibyte string buffer
928 * @outs: on return contains the (allocated) output Unicode string
929 *
930 * Convert the input multibyte string @ins, from the current locale into the
931 * corresponding little endian, 2-byte Unicode string.
932 *
933 * The function allocates the string and the caller is responsible for calling
934 * free(*@outs); when finished with it.
935 *
936 * On success the function returns the number of Unicode characters written to
937 * the output string *@outs (>= 0), not counting the terminating Unicode NULL
938 * character.
939 *
940 * On error, -1 is returned, and errno is set to the error code. The following
941 * error codes can be expected:
942 * EINVAL Invalid arguments (e.g. @ins or @outs is NULL).
943 * EILSEQ The input string cannot be represented as a Unicode
944 * string according to the current locale.
945 * ENAMETOOLONG Destination buffer is too small for input string.
946 * ENOMEM Not enough memory to allocate destination buffer.
947 */
948int ntfs_mbstoucs(const char *ins, ntfschar **outs)
949{
950#ifdef MB_CUR_MAX
951 ntfschar *ucs;
952 const char *s;
953 wchar_t wc;
954 int i, o, cnt, ins_len, ucs_len, ins_size;
955#ifdef HAVE_MBSINIT
956 mbstate_t mbstate;
957#endif
958#endif /* MB_CUR_MAX */
959
960 if (!ins || !outs) {
961 errno = EINVAL;
962 return -1;
963 }
964
965 if (use_utf8)
966 return ntfs_utf8_to_utf16(ins, outs);
967
968#ifdef MB_CUR_MAX
969 /* Determine the size of the multi-byte string in bytes. */
970 ins_size = strlen(ins);
971 /* Determine the length of the multi-byte string. */
972 s = ins;
973#if defined(HAVE_MBSINIT)
974 memset(&mbstate, 0, sizeof(mbstate));
975 ins_len = mbsrtowcs(NULL, (const char **)&s, 0, &mbstate);
976#ifdef __CYGWIN32__
977 if (!ins_len && *ins) {
978 /* Older Cygwin had broken mbsrtowcs() implementation. */
979 ins_len = strlen(ins);
980 }
981#endif
982#elif !defined(DJGPP)
983 ins_len = mbstowcs(NULL, s, 0);
984#else
985 /* Eeek!!! DJGPP has broken mbstowcs() implementation!!! */
986 ins_len = strlen(ins);
987#endif
988 if (ins_len == -1)
989 return ins_len;
990#ifdef HAVE_MBSINIT
991 if ((s != ins) || !mbsinit(&mbstate)) {
992#else
993 if (s != ins) {
994#endif
995 errno = EILSEQ;
996 return -1;
997 }
998 /* Add the NULL terminator. */
999 ins_len++;
1000 ucs_len = ins_len;
1001 ucs = ntfs_malloc(ucs_len * sizeof(ntfschar));
1002 if (!ucs)
1003 return -1;
1004#ifdef HAVE_MBSINIT
1005 memset(&mbstate, 0, sizeof(mbstate));
1006#else
1007 mbtowc(NULL, NULL, 0);
1008#endif
1009 for (i = o = cnt = 0; i < ins_size; i += cnt, o++) {
1010 /* Reallocate memory if necessary. */
1011 if (o >= ucs_len) {
1012 ntfschar *tc;
1013 ucs_len = (ucs_len * sizeof(ntfschar) + 64) & ~63;
1014 tc = realloc(ucs, ucs_len);
1015 if (!tc)
1016 goto err_out;
1017 ucs = tc;
1018 ucs_len /= sizeof(ntfschar);
1019 }
1020 /* Convert the multibyte character to a wide character. */
1021#ifdef HAVE_MBSINIT
1022 cnt = mbrtowc(&wc, ins + i, ins_size - i, &mbstate);
1023#else
1024 cnt = mbtowc(&wc, ins + i, ins_size - i);
1025#endif
1026 if (!cnt)
1027 break;
1028 if (cnt == -1)
1029 goto err_out;
1030 if (cnt < -1) {
1031 ntfs_log_trace("Eeek. cnt = %i\n", cnt);
1032 errno = EINVAL;
1033 goto err_out;
1034 }
1035 /* Make sure we are not overflowing the NTFS Unicode set. */
1036 if ((unsigned long)wc >= (unsigned long)(1 <<
1037 (8 * sizeof(ntfschar)))) {
1038 errno = EILSEQ;
1039 goto err_out;
1040 }
1041 /* Convert the CPU wide character to a LE Unicode character. */
1042 ucs[o] = cpu_to_le16(wc);
1043 }
1044#ifdef HAVE_MBSINIT
1045 /* Make sure we are back in the initial state. */
1046 if (!mbsinit(&mbstate)) {
1047 ntfs_log_trace("Eeek. mbstate not in initial state!\n");
1048 errno = EILSEQ;
1049 goto err_out;
1050 }
1051#endif
1052 /* Now write the NULL character. */
1053 ucs[o] = cpu_to_le16(L'\0');
1054 *outs = ucs;
1055 return o;
1056err_out:
1057 free(ucs);
1058#else /* MB_CUR_MAX */
1059 errno = EILSEQ;
1060#endif /* MB_CUR_MAX */
1061 return -1;
1062}
1063
1064/*
1065 * Turn a UTF8 name uppercase
1066 *
1067 * Returns an allocated uppercase name which has to be freed by caller
1068 * or NULL if there is an error (described by errno)
1069 */
1070
1071char *ntfs_uppercase_mbs(const char *low,
1072 const ntfschar *upcase, u32 upcase_size)
1073{
1074 int size;
1075 char *upp;
1076 u32 wc;
1077 int n;
1078 const char *s;
1079 char *t;
1080
1081 size = strlen(low);
1082 upp = (char*)ntfs_malloc(3*size + 1);
1083 if (upp) {
1084 s = low;
1085 t = upp;
1086 do {
1087 n = utf8_to_unicode(&wc, s);
1088 if (n > 0) {
1089 if (wc < upcase_size)
1090 wc = le16_to_cpu(upcase[wc]);
1091 if (wc < 0x80)
1092 *t++ = wc;
1093 else if (wc < 0x800) {
1094 *t++ = (0xc0 | ((wc >> 6) & 0x3f));
1095 *t++ = 0x80 | (wc & 0x3f);
1096 } else if (wc < 0x10000) {
1097 *t++ = 0xe0 | (wc >> 12);
1098 *t++ = 0x80 | ((wc >> 6) & 0x3f);
1099 *t++ = 0x80 | (wc & 0x3f);
1100 } else {
1101 *t++ = 0xf0 | ((wc >> 18) & 7);
1102 *t++ = 0x80 | ((wc >> 12) & 63);
1103 *t++ = 0x80 | ((wc >> 6) & 0x3f);
1104 *t++ = 0x80 | (wc & 0x3f);
1105 }
1106 s += n;
1107 }
1108 } while (n > 0);
1109 if (n < 0) {
1110 free(upp);
1111 upp = (char*)NULL;
1112 errno = EILSEQ;
1113 }
1114 *t = 0;
1115 }
1116 return (upp);
1117}
1118
1119/**
1120 * ntfs_upcase_table_build - build the default upcase table for NTFS
1121 * @uc: destination buffer where to store the built table
1122 * @uc_len: size of destination buffer in bytes
1123 *
1124 * ntfs_upcase_table_build() builds the default upcase table for NTFS and
1125 * stores it in the caller supplied buffer @uc of size @uc_len.
1126 *
1127 * Note, @uc_len must be at least 128kiB in size or bad things will happen!
1128 */
1129void ntfs_upcase_table_build(ntfschar *uc, u32 uc_len)
1130{
1131#if 1 /* Vista */
1132 /*
1133 * This is the table as defined by Vista
1134 */
1135 /*
1136 * "Start" is inclusive and "End" is exclusive, every value has the
1137 * value of "Add" added to it.
1138 */
1139 static int uc_run_table[][3] = { /* Start, End, Add */
1140 {0x0061, 0x007b, -32}, {0x00e0, 0x00f7, -32}, {0x00f8, 0x00ff, -32},
1141 {0x0256, 0x0258, -205}, {0x028a, 0x028c, -217}, {0x037b, 0x037e, 130},
1142 {0x03ac, 0x03ad, -38}, {0x03ad, 0x03b0, -37}, {0x03b1, 0x03c2, -32},
1143 {0x03c2, 0x03c3, -31}, {0x03c3, 0x03cc, -32}, {0x03cc, 0x03cd, -64},
1144 {0x03cd, 0x03cf, -63}, {0x0430, 0x0450, -32}, {0x0450, 0x0460, -80},
1145 {0x0561, 0x0587, -48}, {0x1f00, 0x1f08, 8}, {0x1f10, 0x1f16, 8},
1146 {0x1f20, 0x1f28, 8}, {0x1f30, 0x1f38, 8}, {0x1f40, 0x1f46, 8},
1147 {0x1f51, 0x1f52, 8}, {0x1f53, 0x1f54, 8}, {0x1f55, 0x1f56, 8},
1148 {0x1f57, 0x1f58, 8}, {0x1f60, 0x1f68, 8}, {0x1f70, 0x1f72, 74},
1149 {0x1f72, 0x1f76, 86}, {0x1f76, 0x1f78, 100}, {0x1f78, 0x1f7a, 128},
1150 {0x1f7a, 0x1f7c, 112}, {0x1f7c, 0x1f7e, 126}, {0x1f80, 0x1f88, 8},
1151 {0x1f90, 0x1f98, 8}, {0x1fa0, 0x1fa8, 8}, {0x1fb0, 0x1fb2, 8},
1152 {0x1fb3, 0x1fb4, 9}, {0x1fcc, 0x1fcd, -9}, {0x1fd0, 0x1fd2, 8},
1153 {0x1fe0, 0x1fe2, 8}, {0x1fe5, 0x1fe6, 7}, {0x1ffc, 0x1ffd, -9},
1154 {0x2170, 0x2180, -16}, {0x24d0, 0x24ea, -26}, {0x2c30, 0x2c5f, -48},
1155 {0x2d00, 0x2d26, -7264}, {0xff41, 0xff5b, -32}, {0}
1156 };
1157 /*
1158 * "Start" is exclusive and "End" is inclusive, every second value is
1159 * decremented by one.
1160 */
1161 static int uc_dup_table[][2] = { /* Start, End */
1162 {0x0100, 0x012f}, {0x0132, 0x0137}, {0x0139, 0x0149}, {0x014a, 0x0178},
1163 {0x0179, 0x017e}, {0x01a0, 0x01a6}, {0x01b3, 0x01b7}, {0x01cd, 0x01dd},
1164 {0x01de, 0x01ef}, {0x01f4, 0x01f5}, {0x01f8, 0x01f9}, {0x01fa, 0x0220},
1165 {0x0222, 0x0234}, {0x023b, 0x023c}, {0x0241, 0x0242}, {0x0246, 0x024f},
1166 {0x03d8, 0x03ef}, {0x03f7, 0x03f8}, {0x03fa, 0x03fb}, {0x0460, 0x0481},
1167 {0x048a, 0x04bf}, {0x04c1, 0x04c4}, {0x04c5, 0x04c8}, {0x04c9, 0x04ce},
1168 {0x04ec, 0x04ed}, {0x04d0, 0x04eb}, {0x04ee, 0x04f5}, {0x04f6, 0x0513},
1169 {0x1e00, 0x1e95}, {0x1ea0, 0x1ef9}, {0x2183, 0x2184}, {0x2c60, 0x2c61},
1170 {0x2c67, 0x2c6c}, {0x2c75, 0x2c76}, {0x2c80, 0x2ce3}, {0}
1171 };
1172 /*
1173 * Set the Unicode character at offset "Offset" to "Value". Note,
1174 * "Value" is host endian.
1175 */
1176 static int uc_byte_table[][2] = { /* Offset, Value */
1177 {0x00ff, 0x0178}, {0x0180, 0x0243}, {0x0183, 0x0182}, {0x0185, 0x0184},
1178 {0x0188, 0x0187}, {0x018c, 0x018b}, {0x0192, 0x0191}, {0x0195, 0x01f6},
1179 {0x0199, 0x0198}, {0x019a, 0x023d}, {0x019e, 0x0220}, {0x01a8, 0x01a7},
1180 {0x01ad, 0x01ac}, {0x01b0, 0x01af}, {0x01b9, 0x01b8}, {0x01bd, 0x01bc},
1181 {0x01bf, 0x01f7}, {0x01c6, 0x01c4}, {0x01c9, 0x01c7}, {0x01cc, 0x01ca},
1182 {0x01dd, 0x018e}, {0x01f3, 0x01f1}, {0x023a, 0x2c65}, {0x023e, 0x2c66},
1183 {0x0253, 0x0181}, {0x0254, 0x0186}, {0x0259, 0x018f}, {0x025b, 0x0190},
1184 {0x0260, 0x0193}, {0x0263, 0x0194}, {0x0268, 0x0197}, {0x0269, 0x0196},
1185 {0x026b, 0x2c62}, {0x026f, 0x019c}, {0x0272, 0x019d}, {0x0275, 0x019f},
1186 {0x027d, 0x2c64}, {0x0280, 0x01a6}, {0x0283, 0x01a9}, {0x0288, 0x01ae},
1187 {0x0289, 0x0244}, {0x028c, 0x0245}, {0x0292, 0x01b7}, {0x03f2, 0x03f9},
1188 {0x04cf, 0x04c0}, {0x1d7d, 0x2c63}, {0x214e, 0x2132}, {0}
1189 };
1190#else /* Vista */
1191 /*
1192 * This is the table as defined by Windows XP
1193 */
1194 static int uc_run_table[][3] = { /* Start, End, Add */
1195 {0x0061, 0x007B, -32}, {0x0451, 0x045D, -80}, {0x1F70, 0x1F72, 74},
1196 {0x00E0, 0x00F7, -32}, {0x045E, 0x0460, -80}, {0x1F72, 0x1F76, 86},
1197 {0x00F8, 0x00FF, -32}, {0x0561, 0x0587, -48}, {0x1F76, 0x1F78, 100},
1198 {0x0256, 0x0258, -205}, {0x1F00, 0x1F08, 8}, {0x1F78, 0x1F7A, 128},
1199 {0x028A, 0x028C, -217}, {0x1F10, 0x1F16, 8}, {0x1F7A, 0x1F7C, 112},
1200 {0x03AC, 0x03AD, -38}, {0x1F20, 0x1F28, 8}, {0x1F7C, 0x1F7E, 126},
1201 {0x03AD, 0x03B0, -37}, {0x1F30, 0x1F38, 8}, {0x1FB0, 0x1FB2, 8},
1202 {0x03B1, 0x03C2, -32}, {0x1F40, 0x1F46, 8}, {0x1FD0, 0x1FD2, 8},
1203 {0x03C2, 0x03C3, -31}, {0x1F51, 0x1F52, 8}, {0x1FE0, 0x1FE2, 8},
1204 {0x03C3, 0x03CC, -32}, {0x1F53, 0x1F54, 8}, {0x1FE5, 0x1FE6, 7},
1205 {0x03CC, 0x03CD, -64}, {0x1F55, 0x1F56, 8}, {0x2170, 0x2180, -16},
1206 {0x03CD, 0x03CF, -63}, {0x1F57, 0x1F58, 8}, {0x24D0, 0x24EA, -26},
1207 {0x0430, 0x0450, -32}, {0x1F60, 0x1F68, 8}, {0xFF41, 0xFF5B, -32},
1208 {0}
1209 };
1210 static int uc_dup_table[][2] = { /* Start, End */
1211 {0x0100, 0x012F}, {0x01A0, 0x01A6}, {0x03E2, 0x03EF}, {0x04CB, 0x04CC},
1212 {0x0132, 0x0137}, {0x01B3, 0x01B7}, {0x0460, 0x0481}, {0x04D0, 0x04EB},
1213 {0x0139, 0x0149}, {0x01CD, 0x01DD}, {0x0490, 0x04BF}, {0x04EE, 0x04F5},
1214 {0x014A, 0x0178}, {0x01DE, 0x01EF}, {0x04BF, 0x04BF}, {0x04F8, 0x04F9},
1215 {0x0179, 0x017E}, {0x01F4, 0x01F5}, {0x04C1, 0x04C4}, {0x1E00, 0x1E95},
1216 {0x018B, 0x018B}, {0x01FA, 0x0218}, {0x04C7, 0x04C8}, {0x1EA0, 0x1EF9},
1217 {0}
1218 };
1219 static int uc_byte_table[][2] = { /* Offset, Value */
1220 {0x00FF, 0x0178}, {0x01AD, 0x01AC}, {0x01F3, 0x01F1}, {0x0269, 0x0196},
1221 {0x0183, 0x0182}, {0x01B0, 0x01AF}, {0x0253, 0x0181}, {0x026F, 0x019C},
1222 {0x0185, 0x0184}, {0x01B9, 0x01B8}, {0x0254, 0x0186}, {0x0272, 0x019D},
1223 {0x0188, 0x0187}, {0x01BD, 0x01BC}, {0x0259, 0x018F}, {0x0275, 0x019F},
1224 {0x018C, 0x018B}, {0x01C6, 0x01C4}, {0x025B, 0x0190}, {0x0283, 0x01A9},
1225 {0x0192, 0x0191}, {0x01C9, 0x01C7}, {0x0260, 0x0193}, {0x0288, 0x01AE},
1226 {0x0199, 0x0198}, {0x01CC, 0x01CA}, {0x0263, 0x0194}, {0x0292, 0x01B7},
1227 {0x01A8, 0x01A7}, {0x01DD, 0x018E}, {0x0268, 0x0197},
1228 {0}
1229 };
1230#endif /* Vista */
1231 int i, r;
1232 int k, off;
1233
1234 memset((char*)uc, 0, uc_len);
1235 uc_len >>= 1;
1236 if (uc_len > 65536)
1237 uc_len = 65536;
1238 for (i = 0; (u32)i < uc_len; i++)
1239 uc[i] = cpu_to_le16(i);
1240 for (r = 0; uc_run_table[r][0]; r++) {
1241 off = uc_run_table[r][2];
1242 for (i = uc_run_table[r][0]; i < uc_run_table[r][1]; i++)
1243 uc[i] = cpu_to_le16(i + off);
1244 }
1245 for (r = 0; uc_dup_table[r][0]; r++)
1246 for (i = uc_dup_table[r][0]; i < uc_dup_table[r][1]; i += 2)
1247 uc[i + 1] = cpu_to_le16(i);
1248 for (r = 0; uc_byte_table[r][0]; r++) {
1249 k = uc_byte_table[r][1];
1250 uc[uc_byte_table[r][0]] = cpu_to_le16(k);
1251 }
1252}
1253
1254/*
1255 * Allocate and build the default upcase table
1256 *
1257 * Returns the number of entries
1258 * 0 if failed
1259 */
1260
1261#define UPCASE_LEN 65536 /* default number of entries in upcase */
1262
1263u32 ntfs_upcase_build_default(ntfschar **upcase)
1264{
1265 u32 upcase_len = 0;
1266
1267 *upcase = (ntfschar*)ntfs_malloc(UPCASE_LEN*2);
1268 if (*upcase) {
1269 ntfs_upcase_table_build(*upcase, UPCASE_LEN*2);
1270 upcase_len = UPCASE_LEN;
1271 }
1272 return (upcase_len);
1273}
1274
1275/*
1276 * Build a table for converting to lower case
1277 *
1278 * This is only meaningful when there is a single lower case
1279 * character leading to an upper case one, and currently the
1280 * only exception is the greek letter sigma which has a single
1281 * upper case glyph (code U+03A3), but two lower case glyphs
1282 * (code U+03C3 and U+03C2, the latter to be used at the end
1283 * of a word). In the following implementation the upper case
1284 * sigma will be lowercased as U+03C3.
1285 */
1286
1287ntfschar *ntfs_locase_table_build(const ntfschar *uc, u32 uc_cnt)
1288{
1289 ntfschar *lc;
1290 u32 upp;
1291 u32 i;
1292
1293 lc = (ntfschar*)ntfs_malloc(uc_cnt*sizeof(ntfschar));
1294 if (lc) {
1295 for (i=0; i<uc_cnt; i++)
1296 lc[i] = cpu_to_le16(i);
1297 for (i=0; i<uc_cnt; i++) {
1298 upp = le16_to_cpu(uc[i]);
1299 if ((upp != i) && (upp < uc_cnt))
1300 lc[upp] = cpu_to_le16(i);
1301 }
1302 } else
1303 ntfs_log_error("Could not build the locase table\n");
1304 return (lc);
1305}
1306
1307/**
1308 * ntfs_str2ucs - convert a string to a valid NTFS file name
1309 * @s: input string
1310 * @len: length of output buffer in Unicode characters
1311 *
1312 * Convert the input @s string into the corresponding little endian,
1313 * 2-byte Unicode string. The length of the converted string is less
1314 * or equal to the maximum length allowed by the NTFS format (255).
1315 *
1316 * If @s is NULL then return AT_UNNAMED.
1317 *
1318 * On success the function returns the Unicode string in an allocated
1319 * buffer and the caller is responsible to free it when it's not needed
1320 * anymore.
1321 *
1322 * On error NULL is returned and errno is set to the error code.
1323 */
1324ntfschar *ntfs_str2ucs(const char *s, int *len)
1325{
1326 ntfschar *ucs = NULL;
1327
1328 if (s && ((*len = ntfs_mbstoucs(s, &ucs)) == -1)) {
1329 ntfs_log_perror("Couldn't convert '%s' to Unicode", s);
1330 return NULL;
1331 }
1332 if (*len > NTFS_MAX_NAME_LEN) {
1333 free(ucs);
1334 errno = ENAMETOOLONG;
1335 return NULL;
1336 }
1337 if (!ucs || !*len) {
1338 ucs = AT_UNNAMED;
1339 *len = 0;
1340 }
1341 return ucs;
1342}
1343
1344/**
1345 * ntfs_ucsfree - free memory allocated by ntfs_str2ucs()
1346 * @ucs input string to be freed
1347 *
1348 * Free memory at @ucs and which was allocated by ntfs_str2ucs.
1349 *
1350 * Return value: none.
1351 */
1352void ntfs_ucsfree(ntfschar *ucs)
1353{
1354 if (ucs && (ucs != AT_UNNAMED))
1355 free(ucs);
1356}
1357
1358/*
1359 * Check whether a name contains no chars forbidden
1360 * for DOS or Win32 use
1361 *
1362 * If there is a bad char, errno is set to EINVAL
1363 */
1364
1365BOOL ntfs_forbidden_chars(const ntfschar *name, int len)
1366{
1367 BOOL forbidden;
1368 int ch;
1369 int i;
1370 u32 mainset = (1L << ('\"' - 0x20))
1371 | (1L << ('*' - 0x20))
1372 | (1L << ('/' - 0x20))
1373 | (1L << (':' - 0x20))
1374 | (1L << ('<' - 0x20))
1375 | (1L << ('>' - 0x20))
1376 | (1L << ('?' - 0x20));
1377
1378 forbidden = (len == 0)
1379 || (le16_to_cpu(name[len-1]) == ' ')
1380 || (le16_to_cpu(name[len-1]) == '.');
1381 for (i=0; i<len; i++) {
1382 ch = le16_to_cpu(name[i]);
1383 if ((ch < 0x20)
1384 || ((ch < 0x40)
1385 && ((1L << (ch - 0x20)) & mainset))
1386 || (ch == '\\')
1387 || (ch == '|'))
1388 forbidden = TRUE;
1389 }
1390 if (forbidden)
1391 errno = EINVAL;
1392 return (forbidden);
1393}
1394
1395/*
1396 * Check whether the same name can be used as a DOS and
1397 * a Win32 name
1398 *
1399 * The names must be the same, or the short name the uppercase
1400 * variant of the long name
1401 */
1402
1403BOOL ntfs_collapsible_chars(ntfs_volume *vol,
1404 const ntfschar *shortname, int shortlen,
1405 const ntfschar *longname, int longlen)
1406{
1407 BOOL collapsible;
1408 unsigned int ch;
1409 unsigned int cs;
1410 int i;
1411
1412 collapsible = shortlen == longlen;
1413 for (i=0; collapsible && (i<shortlen); i++) {
1414 ch = le16_to_cpu(longname[i]);
1415 cs = le16_to_cpu(shortname[i]);
1416 if ((cs != ch)
1417 && ((ch >= vol->upcase_len)
1418 || (cs >= vol->upcase_len)
1419 || (vol->upcase[cs] != vol->upcase[ch])))
1420 collapsible = FALSE;
1421 }
1422 return (collapsible);
1423}
1424
1425/*
1426 * Define the character encoding to be used.
1427 * Use UTF-8 unless specified otherwise.
1428 */
1429
1430int ntfs_set_char_encoding(const char *locale)
1431{
1432 use_utf8 = 0;
1433 if (!locale || strstr(locale,"utf8") || strstr(locale,"UTF8")
1434 || strstr(locale,"utf-8") || strstr(locale,"UTF-8"))
1435 use_utf8 = 1;
1436 else
1437 if (setlocale(LC_ALL, locale))
1438 use_utf8 = 0;
1439 else {
1440 ntfs_log_error("Invalid locale, encoding to UTF-8\n");
1441 use_utf8 = 1;
1442 }
1443 return 0; /* always successful */
1444}
1445
1446#if defined(__APPLE__) || defined(__DARWIN__)
1447
/*
 *		Enable or disable the normalization of file names
 *
 *	@normalize must be 0 or 1, it is recorded in nfconvert_utf8.
 *
 *	Returns 0 if the setting was recorded,
 *		-1 if @normalize is neither 0 nor 1, or if the library
 *		was built without ENABLE_NFCONV
 */
int ntfs_macosx_normalize_filenames(int normalize) {
#ifdef ENABLE_NFCONV
	if((normalize != 0) && (normalize != 1))
		return -1;
	nfconvert_utf8 = normalize;
	return 0;
#else
	return -1;
#endif /* ENABLE_NFCONV */
}
1460
/*
 *		Normalize a UTF-8 string through CoreFoundation
 *
 *	@utf8_string is converted to normalization form C when @composed
 *	is non-zero, to form D otherwise. On success *target is set to a
 *	newly allocated '\0'-terminated buffer, which the caller has to
 *	free.
 *
 *	Returns the length in bytes of the normalized string (without
 *		the terminator) if successful
 *		-2 if the CFString could not be created
 *		-3 if the mutable copy could not be created
 *		-1 for any other failure (including a normalized string
 *		for which CFStringGetBytes() converts no character), or
 *		if the library was built without ENABLE_NFCONV
 */
int ntfs_macosx_normalize_utf8(const char *utf8_string, char **target,
		int composed) {
#ifdef ENABLE_NFCONV
	/* For this code to compile, the CoreFoundation framework must be fed to the linker. */
	CFStringRef source;
	CFMutableStringRef normalized;
	CFRange whole;
	CFIndex needed;
	char *buffer = NULL;
	int bufferSize = -1;

	/* Convert the UTF-8 string to a CFString. */
	source = CFStringCreateWithCString(kCFAllocatorDefault, utf8_string, kCFStringEncodingUTF8);
	if(source == NULL) {
		ntfs_log_error("CFStringCreateWithCString failed!\n");
		return -2;
	}

	/* Work on a mutable copy, the source is not needed any more. */
	normalized = CFStringCreateMutableCopy(kCFAllocatorDefault, 0, source);
	CFRelease(source);
	if(normalized == NULL) {
		ntfs_log_error("CFStringCreateMutableCopy failed!\n");
		return -3;
	}

	/* Normalize the copy to the desired normalization form. */
	CFStringNormalize(normalized, (composed != 0 ? kCFStringNormalizationFormC : kCFStringNormalizationFormD));

	/* First pass with no buffer : only get the required length. */
	whole = CFRangeMake(0, CFStringGetLength(normalized));
	if(CFStringGetBytes(normalized, whole, kCFStringEncodingUTF8, 0, false, NULL, 0, &needed) <= 0) {
		ntfs_log_error("Could not perform check for required length of UTF-8 conversion of normalized CFMutableString.\n");
		goto release;
	}

	/* One more byte for the terminator, left zeroed by ntfs_calloc(). */
	bufferSize = sizeof(char)*(needed + 1);
	buffer = ntfs_calloc(bufferSize);
	if(buffer == NULL) {
		ntfs_log_error("Could not perform a ntfs_calloc of %d bytes for char *result.\n", bufferSize);
		goto release;
	}

	/* Second pass : store the UTF-8 bytes into the buffer. */
	if(CFStringGetBytes(normalized, whole, kCFStringEncodingUTF8,
			0, false, (UInt8*)buffer, bufferSize-1, &needed) <= 0) {
		ntfs_log_error("Could not perform UTF-8 conversion of normalized CFMutableString.\n");
		free(buffer);
		buffer = NULL;
	}

release:
	CFRelease(normalized);

	if(buffer == NULL)
		return -1;
	*target = buffer;
	return bufferSize - 1;
#else
	return -1;
#endif /* ENABLE_NFCONV */
}
1523#endif /* defined(__APPLE__) || defined(__DARWIN__) */
1524