platform/external/ntfs-3g.git - Unnamed repository; edit this file 'description' to name the repository.

1 /**
2  * unistr.c - Unicode string handling. Originated from the Linux-NTFS project.
3  *
4  * Copyright (c) 2000-2004 Anton Altaparmakov
5  * Copyright (c) 2002-2009 Szabolcs Szakacsits
6  * Copyright (c) 2008-2011 Jean-Pierre Andre
7  * Copyright (c) 2008      Bernhard Kaindl
8  *
9  * This program/include file is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU General Public License as published
11  * by the Free Software Foundation; either version 2 of the License, or
12  * (at your option) any later version.
13  *
14  * This program/include file is distributed in the hope that it will be
15  * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
16  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17  * GNU General Public License for more details.
18  *
19  * You should have received a copy of the GNU General Public License
20  * along with this program (in the main directory of the NTFS-3G
21  * distribution in the file COPYING); if not, write to the Free Software
22  * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
23  */
24
25 #ifdef HAVE_CONFIG_H
26 #include "config.h"
27 #endif
28
29 #ifdef HAVE_STDIO_H
30 #include <stdio.h>
31 #endif
32 #ifdef HAVE_STDLIB_H
33 #include <stdlib.h>
34 #endif
35 #ifdef HAVE_WCHAR_H
36 #include <wchar.h>
37 #endif
38 #ifdef HAVE_STRING_H
39 #include <string.h>
40 #endif
41 #ifdef HAVE_ERRNO_H
42 #include <errno.h>
43 #endif
44 #ifdef HAVE_LOCALE_H
45 #include <locale.h>
46 #endif
47
48 #if defined(__APPLE__) || defined(__DARWIN__)
49 #ifdef ENABLE_NFCONV
50 #include <CoreFoundation/CoreFoundation.h>
51 #endif /* ENABLE_NFCONV */
52 #endif /* defined(__APPLE__) || defined(__DARWIN__) */
53
54 #include "compat.h"
55 #include "attrib.h"
56 #include "types.h"
57 #include "unistr.h"
58 #include "debug.h"
59 #include "logging.h"
60 #include "misc.h"
61
/*
 * NOREVBOM - policy switch for the code points U+FFFE and U+FFFF.
 *
 * When non-zero, utf16_to_utf8_size() rejects 0xFFFE/0xFFFF and
 * utf8_to_unicode() rejects three-byte sequences decoding to them.
 * With the current value 0 both code points are accepted.
 * (JPA: rejecting U+FFFE and U+FFFF is open to debate.)
 */
#define NOREVBOM 0  /* JPA rejecting U+FFFE and U+FFFF, open to debate */

/*
 * IMPORTANT
 * =========
 *
 * All these routines assume that the Unicode characters are in little endian
 * encoding inside the strings!!!
 */

/*
 * When non-zero, ntfs_ucstombs() and ntfs_mbstoucs() use the built-in
 * UTF-16LE <-> UTF-8 converters of this file instead of the multibyte
 * functions of the current locale.
 */
static int use_utf8 = 1; /* use UTF-8 encoding for file names */

#if defined(__APPLE__) || defined(__DARWIN__)
#ifdef ENABLE_NFCONV
/**
 * This variable controls whether or not automatic normalization form conversion
 * should be performed when translating NTFS unicode file names to UTF-8.
 * Defaults to on, but can be controlled from the outside using the function
 *   int ntfs_macosx_normalize_filenames(int normalize);
 *
 * It is consulted by ntfs_utf16_to_utf8() and ntfs_utf8_to_utf16().
 */
static int nfconvert_utf8 = 1;
#endif /* ENABLE_NFCONV */
#endif /* defined(__APPLE__) || defined(__DARWIN__) */

/*
 * This is used by the name collation functions to quickly determine what
 * characters are (in)valid.
 *
 * NOTE: the table is compiled out by the "#if 0" below; it is kept for
 * reference only.
 */
#if 0
static const u8 legal_ansi_char_array[0x40] = {
	0x00, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
	0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,

	0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
	0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,

	0x17, 0x07, 0x18, 0x17, 0x17, 0x17, 0x17, 0x17,
	0x17, 0x17, 0x18, 0x16, 0x16, 0x17, 0x07, 0x00,

	0x17, 0x17, 0x17, 0x17, 0x17, 0x17, 0x17, 0x17,
	0x17, 0x17, 0x04, 0x16, 0x18, 0x16, 0x18, 0x18,
};
#endif
105
106 /**
107  * ntfs_names_are_equal - compare two Unicode names for equality
108  * @s1:			name to compare to @s2
109  * @s1_len:		length in Unicode characters of @s1
110  * @s2:			name to compare to @s1
111  * @s2_len:		length in Unicode characters of @s2
112  * @ic:			ignore case bool
113  * @upcase:		upcase table (only if @ic == IGNORE_CASE)
114  * @upcase_size:	length in Unicode characters of @upcase (if present)
115  *
116  * Compare the names @s1 and @s2 and return TRUE (1) if the names are
117  * identical, or FALSE (0) if they are not identical. If @ic is IGNORE_CASE,
118  * the @upcase table is used to perform a case insensitive comparison.
119  */
120 BOOL ntfs_names_are_equal(const ntfschar *s1, size_t s1_len,
121 		const ntfschar *s2, size_t s2_len,
122 		const IGNORE_CASE_BOOL ic,
123 		const ntfschar *upcase, const u32 upcase_size)
124 {
125 	if (s1_len != s2_len)
126 		return FALSE;
127 	if (!s1_len)
128 		return TRUE;
129 	if (ic == CASE_SENSITIVE)
130 		return ntfs_ucsncmp(s1, s2, s1_len) ? FALSE: TRUE;
131 	return ntfs_ucsncasecmp(s1, s2, s1_len, upcase, upcase_size) ? FALSE:
132 								       TRUE;
133 }
134
/*
 * ntfs_names_full_collate() fully collate two Unicode names
 *
 * @name1:	first Unicode name to compare
 * @name1_len:	length of first Unicode name to compare
 * @name2:	second Unicode name to compare
 * @name2_len:	length of second Unicode name to compare
 * @ic:		either CASE_SENSITIVE or IGNORE_CASE
 * @upcase:	upcase table
 * @upcase_len:	upcase table size in Unicode characters
 *
 * Note that the @upcase table is dereferenced in BOTH modes: even with
 * CASE_SENSITIVE the primary ordering is made on the uppercased characters
 * and the raw (case sensitive) values only break a tie.
 *
 * The ordering criteria are, in decreasing priority:
 *   1. the uppercased characters, over the common length of both names,
 *   2. the name lengths (the shorter name collates first),
 *   3. CASE_SENSITIVE only: the raw characters at the first position at
 *      which the two names differ.
 *
 * Returns:
 *  -1 if the first name collates before the second one,
 *   0 if the names match,
 *   1 if the second name collates before the first one.
 */
int ntfs_names_full_collate(const ntfschar *name1, const u32 name1_len,
		const ntfschar *name2, const u32 name2_len,
		const IGNORE_CASE_BOOL ic, const ntfschar *upcase,
		const u32 upcase_len)
{
	u32 cnt;	/* characters of the common prefix still to examine */
	u16 c1, c2;	/* raw (cpu endian) characters */
	u16 u1, u2;	/* the same characters after uppercasing */

#ifdef DEBUG
	/*
	 * NOTE(review): the table is only validated when ic is non-zero,
	 * although it is used in both branches below.
	 */
	if (!name1 || !name2 || (ic && (!upcase || !upcase_len))) {
		ntfs_log_debug("ntfs_names_collate received NULL pointer!\n");
		exit(1);
	}
#endif
	/* Only the common length of both names can be compared. */
	cnt = min(name1_len, name2_len);
	if (cnt > 0) {
		if (ic == CASE_SENSITIVE) {
			/*
			 * Skip the identical leading part. The loop stops
			 * either on the first raw difference or on the last
			 * character of the common length (cnt == 0).
			 */
			while (--cnt && (*name1 == *name2)) {
				name1++;
				name2++;
			}
			/* Remember the raw values for the final tie break. */
			u1 = c1 = le16_to_cpu(*name1);
			u2 = c2 = le16_to_cpu(*name2);
			if (u1 < upcase_len)
				u1 = le16_to_cpu(upcase[u1]);
			if (u2 < upcase_len)
				u2 = le16_to_cpu(upcase[u2]);
			/*
			 * The names differ here by case only: keep comparing
			 * the uppercased characters of the remaining part.
			 */
			if ((u1 == u2) && cnt)
				do {
					name1++;
					u1 = le16_to_cpu(*name1);
					name2++;
					u2 = le16_to_cpu(*name2);
					if (u1 < upcase_len)
						u1 = le16_to_cpu(upcase[u1]);
					if (u2 < upcase_len)
						u2 = le16_to_cpu(upcase[u2]);
				} while ((u1 == u2) && --cnt);
			/* 1st criterion: uppercased characters */
			if (u1 < u2)
				return -1;
			if (u1 > u2)
				return 1;
			/* 2nd criterion: lengths */
			if (name1_len < name2_len)
				return -1;
			if (name1_len > name2_len)
				return 1;
			/* 3rd criterion: raw characters at first difference */
			if (c1 < c2)
				return -1;
			if (c1 > c2)
				return 1;
		} else {
			/* Compare uppercased characters over the common length. */
			do {
				u1 = c1 = le16_to_cpu(*name1);
				name1++;
				u2 = c2 = le16_to_cpu(*name2);
				name2++;
				if (u1 < upcase_len)
					u1 = le16_to_cpu(upcase[u1]);
				if (u2 < upcase_len)
					u2 = le16_to_cpu(upcase[u2]);
			} while ((u1 == u2) && --cnt);
			if (u1 < u2)
				return -1;
			if (u1 > u2)
				return 1;
			/* Same uppercased prefix: the shorter name comes first. */
			if (name1_len < name2_len)
				return -1;
			if (name1_len > name2_len)
				return 1;
		}
	} else {
		/* At least one name is empty: order by length only. */
		if (name1_len < name2_len)
			return -1;
		if (name1_len > name2_len)
			return 1;
	}
	return 0;
}
230
231 /**
232  * ntfs_ucsncmp - compare two little endian Unicode strings
233  * @s1:		first string
234  * @s2:		second string
235  * @n:		maximum unicode characters to compare
236  *
237  * Compare the first @n characters of the Unicode strings @s1 and @s2,
238  * The strings in little endian format and appropriate le16_to_cpu()
239  * conversion is performed on non-little endian machines.
240  *
241  * The function returns an integer less than, equal to, or greater than zero
242  * if @s1 (or the first @n Unicode characters thereof) is found, respectively,
243  * to be less than, to match, or be greater than @s2.
244  */
245 int ntfs_ucsncmp(const ntfschar *s1, const ntfschar *s2, size_t n)
246 {
247 	ntfschar c1, c2;
248 	size_t i;
249
250 #ifdef DEBUG
251 	if (!s1 || !s2) {
252 		ntfs_log_debug("ntfs_wcsncmp() received NULL pointer!\n");
253 		exit(1);
254 	}
255 #endif
256 	for (i = 0; i < n; ++i) {
257 		c1 = le16_to_cpu(s1[i]);
258 		c2 = le16_to_cpu(s2[i]);
259 		if (c1 < c2)
260 			return -1;
261 		if (c1 > c2)
262 			return 1;
263 		if (!c1)
264 			break;
265 	}
266 	return 0;
267 }
268
269 /**
270  * ntfs_ucsncasecmp - compare two little endian Unicode strings, ignoring case
271  * @s1:			first string
272  * @s2:			second string
273  * @n:			maximum unicode characters to compare
274  * @upcase:		upcase table
275  * @upcase_size:	upcase table size in Unicode characters
276  *
277  * Compare the first @n characters of the Unicode strings @s1 and @s2,
278  * ignoring case. The strings in little endian format and appropriate
279  * le16_to_cpu() conversion is performed on non-little endian machines.
280  *
281  * Each character is uppercased using the @upcase table before the comparison.
282  *
283  * The function returns an integer less than, equal to, or greater than zero
284  * if @s1 (or the first @n Unicode characters thereof) is found, respectively,
285  * to be less than, to match, or be greater than @s2.
286  */
287 int ntfs_ucsncasecmp(const ntfschar *s1, const ntfschar *s2, size_t n,
288 		const ntfschar *upcase, const u32 upcase_size)
289 {
290 	u16 c1, c2;
291 	size_t i;
292
293 #ifdef DEBUG
294 	if (!s1 || !s2 || !upcase) {
295 		ntfs_log_debug("ntfs_wcsncasecmp() received NULL pointer!\n");
296 		exit(1);
297 	}
298 #endif
299 	for (i = 0; i < n; ++i) {
300 		if ((c1 = le16_to_cpu(s1[i])) < upcase_size)
301 			c1 = le16_to_cpu(upcase[c1]);
302 		if ((c2 = le16_to_cpu(s2[i])) < upcase_size)
303 			c2 = le16_to_cpu(upcase[c2]);
304 		if (c1 < c2)
305 			return -1;
306 		if (c1 > c2)
307 			return 1;
308 		if (!c1)
309 			break;
310 	}
311 	return 0;
312 }
313
314 /**
315  * ntfs_ucsnlen - determine the length of a little endian Unicode string
316  * @s:		pointer to Unicode string
317  * @maxlen:	maximum length of string @s
318  *
319  * Return the number of Unicode characters in the little endian Unicode
320  * string @s up to a maximum of maxlen Unicode characters, not including
321  * the terminating (ntfschar)'\0'. If there is no (ntfschar)'\0' between @s
322  * and @s + @maxlen, @maxlen is returned.
323  *
324  * This function never looks beyond @s + @maxlen.
325  */
326 u32 ntfs_ucsnlen(const ntfschar *s, u32 maxlen)
327 {
328 	u32 i;
329
330 	for (i = 0; i < maxlen; i++) {
331 		if (!le16_to_cpu(s[i]))
332 			break;
333 	}
334 	return i;
335 }
336
337 /**
338  * ntfs_ucsndup - duplicate little endian Unicode string
339  * @s:		pointer to Unicode string
340  * @maxlen:	maximum length of string @s
341  *
342  * Return a pointer to a new little endian Unicode string which is a duplicate
343  * of the string s.  Memory for the new string is obtained with ntfs_malloc(3),
344  * and can be freed with free(3).
345  *
346  * A maximum of @maxlen Unicode characters are copied and a terminating
347  * (ntfschar)'\0' little endian Unicode character is added.
348  *
349  * This function never looks beyond @s + @maxlen.
350  *
351  * Return a pointer to the new little endian Unicode string on success and NULL
352  * on failure with errno set to the error code.
353  */
354 ntfschar *ntfs_ucsndup(const ntfschar *s, u32 maxlen)
355 {
356 	ntfschar *dst;
357 	u32 len;
358
359 	len = ntfs_ucsnlen(s, maxlen);
360 	dst = ntfs_malloc((len + 1) * sizeof(ntfschar));
361 	if (dst) {
362 		memcpy(dst, s, len * sizeof(ntfschar));
363 		dst[len] = cpu_to_le16(L'\0');
364 	}
365 	return dst;
366 }
367
368 /**
369  * ntfs_name_upcase - Map an Unicode name to its uppercase equivalent
370  * @name:
371  * @name_len:
372  * @upcase:
373  * @upcase_len:
374  *
375  * Description...
376  *
377  * Returns:
378  */
379 void ntfs_name_upcase(ntfschar *name, u32 name_len, const ntfschar *upcase,
380 		const u32 upcase_len)
381 {
382 	u32 i;
383 	u16 u;
384
385 	for (i = 0; i < name_len; i++)
386 		if ((u = le16_to_cpu(name[i])) < upcase_len)
387 			name[i] = upcase[u];
388 }
389
390 /**
391  * ntfs_name_locase - Map a Unicode name to its lowercase equivalent
392  */
393 void ntfs_name_locase(ntfschar *name, u32 name_len, const ntfschar *locase,
394 		const u32 locase_len)
395 {
396 	u32 i;
397 	u16 u;
398
399 	if (locase)
400 		for (i = 0; i < name_len; i++)
401 			if ((u = le16_to_cpu(name[i])) < locase_len)
402 				name[i] = locase[u];
403 }
404
405 /**
406  * ntfs_file_value_upcase - Convert a filename to upper case
407  * @file_name_attr:
408  * @upcase:
409  * @upcase_len:
410  *
411  * Description...
412  *
413  * Returns:
414  */
415 void ntfs_file_value_upcase(FILE_NAME_ATTR *file_name_attr,
416 		const ntfschar *upcase, const u32 upcase_len)
417 {
418 	ntfs_name_upcase((ntfschar*)&file_name_attr->file_name,
419 			file_name_attr->file_name_length, upcase, upcase_len);
420 }
421
/*
   NTFS uses Unicode (UTF-16LE [NTFS-3G uses UCS-2LE, which is enough
   for now]) for path names, but the Unicode code points need to be
   converted before a path can be accessed under NTFS. For 7 bit ASCII/ANSI,
   glibc does this even without a locale in a hard-coded fashion, as that
   appears to be easy because the low 7-bit ASCII range appears to be
   available in all charsets, but it does not convert anything if
   there was some error with the locale setup or none was set up, like
   when mount is called during early boot where one (by policy) does
   not use locales (and they may not be available if /usr is not yet
   mounted), so this patch fixes the resulting issues for systems which
   use UTF-8 and, for others, specifying the locale in fstab brings them
   the encoding which they want.

   If no locale is defined or there was a problem with setting one
   up and whenever nl_langinfo(CODESET) returns a string starting with
   "ANSI", use an internal UCS-2LE <-> UTF-8 codeset converter to fix
   the bug where NTFS-3G does not show any path names which include
   international characters (and also fails on creating them) as a result.

   Author: Bernhard Kaindl <bk@suse.de>
   Jean-Pierre Andre made it compliant with RFC3629/RFC2781.
*/
445
446 /*
447  * Return the amount of 8-bit elements in UTF-8 needed (without the terminating
448  * null) to store a given UTF-16LE string.
449  *
450  * Return -1 with errno set if string has invalid byte sequence or too long.
451  */
452 static int utf16_to_utf8_size(const ntfschar *ins, const int ins_len, int outs_len)
453 {
454 	int i, ret = -1;
455 	int count = 0;
456 	BOOL surrog;
457
458 	surrog = FALSE;
459 	for (i = 0; i < ins_len && ins[i]; i++) {
460 		unsigned short c = le16_to_cpu(ins[i]);
461 		if (surrog) {
462 			if ((c >= 0xdc00) && (c < 0xe000)) {
463 				surrog = FALSE;
464 				count += 4;
465 			} else
466 				goto fail;
467 		} else
468 			if (c < 0x80)
469 				count++;
470 			else if (c < 0x800)
471 				count += 2;
472 			else if (c < 0xd800)
473 				count += 3;
474 			else if (c < 0xdc00)
475 				surrog = TRUE;
476 #if NOREVBOM
477 			else if ((c >= 0xe000) && (c < 0xfffe))
478 #else
479 			else if (c >= 0xe000)
480 #endif
481 				count += 3;
482 			else
483 				goto fail;
484 		if (count > outs_len) {
485 			errno = ENAMETOOLONG;
486 			goto out;
487 		}
488 	}
489 	if (surrog)
490 		goto fail;
491
492 	ret = count;
493 out:
494 	return ret;
495 fail:
496 	errno = EILSEQ;
497 	goto out;
498 }
499
500 /*
501  * ntfs_utf16_to_utf8 - convert a little endian UTF16LE string to an UTF-8 string
502  * @ins:	input utf16 string buffer
503  * @ins_len:	length of input string in utf16 characters
504  * @outs:	on return contains the (allocated) output multibyte string
505  * @outs_len:	length of output buffer in bytes
506  *
507  * Return -1 with errno set if string has invalid byte sequence or too long.
508  */
509 static int ntfs_utf16_to_utf8(const ntfschar *ins, const int ins_len,
510 			      char **outs, int outs_len)
511 {
512 #if defined(__APPLE__) || defined(__DARWIN__)
513 #ifdef ENABLE_NFCONV
514 	char *original_outs_value = *outs;
515 	int original_outs_len = outs_len;
516 #endif /* ENABLE_NFCONV */
517 #endif /* defined(__APPLE__) || defined(__DARWIN__) */
518
519 	char *t;
520 	int i, size, ret = -1;
521 	int halfpair;
522
523 	halfpair = 0;
524 	if (!*outs)
525 		outs_len = PATH_MAX;
526
527 	size = utf16_to_utf8_size(ins, ins_len, outs_len);
528
529 	if (size < 0)
530 		goto out;
531
532 	if (!*outs) {
533 		outs_len = size + 1;
534 		*outs = ntfs_malloc(outs_len);
535 		if (!*outs)
536 			goto out;
537 	}
538
539 	t = *outs;
540
541 	for (i = 0; i < ins_len && ins[i]; i++) {
542 	    unsigned short c = le16_to_cpu(ins[i]);
543 			/* size not double-checked */
544 		if (halfpair) {
545 			if ((c >= 0xdc00) && (c < 0xe000)) {
546 				*t++ = 0xf0 + (((halfpair + 64) >> 8) & 7);
547 				*t++ = 0x80 + (((halfpair + 64) >> 2) & 63);
548 				*t++ = 0x80 + ((c >> 6) & 15) + ((halfpair & 3) << 4);
549 				*t++ = 0x80 + (c & 63);
550 				halfpair = 0;
551 			} else
552 				goto fail;
553 		} else if (c < 0x80) {
554 			*t++ = c;
555 	    	} else {
556 			if (c < 0x800) {
557 			   	*t++ = (0xc0 | ((c >> 6) & 0x3f));
558 			        *t++ = 0x80 | (c & 0x3f);
559 			} else if (c < 0xd800) {
560 			   	*t++ = 0xe0 | (c >> 12);
561 			   	*t++ = 0x80 | ((c >> 6) & 0x3f);
562 		        	*t++ = 0x80 | (c & 0x3f);
563 			} else if (c < 0xdc00)
564 				halfpair = c;
565 			else if (c >= 0xe000) {
566 				*t++ = 0xe0 | (c >> 12);
567 				*t++ = 0x80 | ((c >> 6) & 0x3f);
568 			        *t++ = 0x80 | (c & 0x3f);
569 			} else
570 				goto fail;
571 	        }
572 	}
573 	*t = '\0';
574
575 #if defined(__APPLE__) || defined(__DARWIN__)
576 #ifdef ENABLE_NFCONV
577 	if(nfconvert_utf8 && (t - *outs) > 0) {
578 		char *new_outs = NULL;
579 		int new_outs_len = ntfs_macosx_normalize_utf8(*outs, &new_outs, 0); // Normalize to decomposed form
580 		if(new_outs_len >= 0 && new_outs != NULL) {
581 			if(original_outs_value != *outs) {
582 				// We have allocated outs ourselves.
583 				free(*outs);
584 				*outs = new_outs;
585 				t = *outs + new_outs_len;
586 			}
587 			else {
588 				// We need to copy new_outs into the fixed outs buffer.
589 				memset(*outs, 0, original_outs_len);
590 				strncpy(*outs, new_outs, original_outs_len-1);
591 				t = *outs + original_outs_len;
592 				free(new_outs);
593 			}
594 		}
595 		else {
596 			ntfs_log_error("Failed to normalize NTFS string to UTF-8 NFD: %s\n", *outs);
597 			ntfs_log_error("  new_outs=0x%p\n", new_outs);
598 			ntfs_log_error("  new_outs_len=%d\n", new_outs_len);
599 		}
600 	}
601 #endif /* ENABLE_NFCONV */
602 #endif /* defined(__APPLE__) || defined(__DARWIN__) */
603
604 	ret = t - *outs;
605 out:
606 	return ret;
607 fail:
608 	errno = EILSEQ;
609 	goto out;
610 }
611
/*
 * Compute the number of 16-bit units (not counting the terminating null)
 * needed to store a given UTF-8 string as UTF-16LE.
 *
 * Each lead byte accounts for one unit, or for two units (a surrogate
 * pair) when it introduces a four-byte sequence. A sequence truncated by
 * the end of the string stops the count.
 *
 * Return the unit count, or -1 with errno set to ENAMETOOLONG if the count
 * reaches PATH_MAX, or to EILSEQ if a lead byte is 0xF5 or above.
 *
 * Note: This does not check whether the input sequence is a valid utf8 string,
 *	 and should be used only in context where such check is made!
 */
static int utf8_to_utf16_size(const char *s)
{
	const unsigned char *p = (const unsigned char *)s;
	size_t units = 0;
	unsigned int lead;

	for (;;) {
		lead = *p++;
		if (!lead)
			break;
		if (++units >= PATH_MAX) {
			errno = ENAMETOOLONG;
			return -1;
		}
		/* ASCII and stray continuation bytes take a single unit. */
		if (lead < 0xc0)
			continue;
		if (lead >= 0xF5) {
			errno = EILSEQ;
			return -1;
		}
		/* Skip the continuation bytes, stopping at a truncation. */
		if (!*p)
			break;
		p++;
		if (!*p)
			break;
		if (lead >= 0xE0)
			p++;
		if (!*p)
			break;
		if (lead >= 0xF0) {
			p++;
			/* Four-byte sequences need a surrogate pair. */
			if (++units >= PATH_MAX) {
				errno = ENAMETOOLONG;
				return -1;
			}
		}
	}
	return units;
}
659 /*
660  * This converts one UTF-8 sequence to cpu-endian Unicode value
661  * within range U+0 .. U+10ffff and excluding U+D800 .. U+DFFF
662  *
663  * Return the number of used utf8 bytes or -1 with errno set
664  * if sequence is invalid.
665  */
666 static int utf8_to_unicode(u32 *wc, const char *s)
667 {
668     	unsigned int byte = *((const unsigned char *)s);
669
670 					/* single byte */
671 	if (byte == 0) {
672 		*wc = (u32) 0;
673 		return 0;
674 	} else if (byte < 0x80) {
675 		*wc = (u32) byte;
676 		return 1;
677 					/* double byte */
678 	} else if (byte < 0xc2) {
679 		goto fail;
680 	} else if (byte < 0xE0) {
681 		if ((s[1] & 0xC0) == 0x80) {
682 			*wc = ((u32)(byte & 0x1F) << 6)
683 			    | ((u32)(s[1] & 0x3F));
684 			return 2;
685 		} else
686 			goto fail;
687 					/* three-byte */
688 	} else if (byte < 0xF0) {
689 		if (((s[1] & 0xC0) == 0x80) && ((s[2] & 0xC0) == 0x80)) {
690 			*wc = ((u32)(byte & 0x0F) << 12)
691 			    | ((u32)(s[1] & 0x3F) << 6)
692 			    | ((u32)(s[2] & 0x3F));
693 			/* Check valid ranges */
694 #if NOREVBOM
695 			if (((*wc >= 0x800) && (*wc <= 0xD7FF))
696 			  || ((*wc >= 0xe000) && (*wc <= 0xFFFD)))
697 				return 3;
698 #else
699 			if (((*wc >= 0x800) && (*wc <= 0xD7FF))
700 			  || ((*wc >= 0xe000) && (*wc <= 0xFFFF)))
701 				return 3;
702 #endif
703 		}
704 		goto fail;
705 					/* four-byte */
706 	} else if (byte < 0xF5) {
707 		if (((s[1] & 0xC0) == 0x80) && ((s[2] & 0xC0) == 0x80)
708 		  && ((s[3] & 0xC0) == 0x80)) {
709 			*wc = ((u32)(byte & 0x07) << 18)
710 			    | ((u32)(s[1] & 0x3F) << 12)
711 			    | ((u32)(s[2] & 0x3F) << 6)
712 			    | ((u32)(s[3] & 0x3F));
713 			/* Check valid ranges */
714 			if ((*wc <= 0x10ffff) && (*wc >= 0x10000))
715 				return 4;
716 		}
717 		goto fail;
718 	}
719 fail:
720 	errno = EILSEQ;
721 	return -1;
722 }
723
/**
 * ntfs_utf8_to_utf16 - convert a UTF-8 string to a UTF-16LE string
 * @ins:	input multibyte string buffer
 * @outs:	on return contains the (allocated) output utf16 string
 *
 * If *@outs is NULL, a buffer is allocated with ntfs_malloc() for the
 * number of units computed by utf8_to_utf16_size() plus the terminating
 * null. If *@outs is not NULL the result is stored there; this function
 * receives no buffer length and so performs no bound check on a caller
 * supplied buffer.
 *
 * Code points of U+10000 and above are stored as a surrogate pair.
 *
 * Return the number of utf16 units stored, not counting the terminating
 * null, or -1 with errno set on failure (as set by utf8_to_utf16_size(),
 * utf8_to_unicode() or the allocation). A buffer allocated here is freed
 * and *@outs reset to NULL if the input proves invalid.
 */
static int ntfs_utf8_to_utf16(const char *ins, ntfschar **outs)
{
#if defined(__APPLE__) || defined(__DARWIN__)
#ifdef ENABLE_NFCONV
	char *new_ins = NULL;
	if(nfconvert_utf8) {
		int new_ins_len;
		new_ins_len = ntfs_macosx_normalize_utf8(ins, &new_ins, 1); // Normalize to composed form
		if(new_ins_len >= 0)
			ins = new_ins;	// convert the normalized copy instead
		else
			ntfs_log_error("Failed to normalize NTFS string to UTF-8 NFC: %s\n", ins);
	}
#endif /* ENABLE_NFCONV */
#endif /* defined(__APPLE__) || defined(__DARWIN__) */
	const char *t = ins;	/* read position in the input */
	u32 wc;
	BOOL allocated;		/* whether *outs was allocated here */
	ntfschar *outpos;	/* write position in the output */
	int shorts, ret = -1;

	/* Number of utf16 units needed, terminating null excluded. */
	shorts = utf8_to_utf16_size(ins);
	if (shorts < 0)
		goto fail;

	allocated = FALSE;
	if (!*outs) {
		*outs = ntfs_malloc((shorts + 1) * sizeof(ntfschar));
		if (!*outs)
			goto fail;
		allocated = TRUE;
	}

	outpos = *outs;

	while(1) {
		/* m is the number of input bytes used, 0 at end, -1 if invalid */
		int m  = utf8_to_unicode(&wc, t);
		if (m <= 0) {
			if (m < 0) {
				/* do not leave space allocated if failed */
				if (allocated) {
					free(*outs);
					*outs = (ntfschar*)NULL;
				}
				goto fail;
			}
			/* end of input: store the terminating null */
			*outpos++ = const_cpu_to_le16(0);
			break;
		}
		if (wc < 0x10000)
			*outpos++ = cpu_to_le16(wc);
		else {
			/* split into a high and a low surrogate */
			wc -= 0x10000;
			*outpos++ = cpu_to_le16((wc >> 10) + 0xd800);
			*outpos++ = cpu_to_le16((wc & 0x3ff) + 0xdc00);
		}
		t += m;
	}

	/* The terminating null is not part of the returned count. */
	ret = --outpos - *outs;
fail:
	/* common exit, reached on success as well as on failure */
#if defined(__APPLE__) || defined(__DARWIN__)
#ifdef ENABLE_NFCONV
	if(new_ins != NULL)
		free(new_ins);
#endif /* ENABLE_NFCONV */
#endif /* defined(__APPLE__) || defined(__DARWIN__) */
	return ret;
}
801
802 /**
803  * ntfs_ucstombs - convert a little endian Unicode string to a multibyte string
804  * @ins:	input Unicode string buffer
805  * @ins_len:	length of input string in Unicode characters
806  * @outs:	on return contains the (allocated) output multibyte string
807  * @outs_len:	length of output buffer in bytes
808  *
809  * Convert the input little endian, 2-byte Unicode string @ins, of length
810  * @ins_len into the multibyte string format dictated by the current locale.
811  *
812  * If *@outs is NULL, the function allocates the string and the caller is
813  * responsible for calling free(*@outs); when finished with it.
814  *
815  * On success the function returns the number of bytes written to the output
816  * string *@outs (>= 0), not counting the terminating NULL byte. If the output
817  * string buffer was allocated, *@outs is set to it.
818  *
819  * On error, -1 is returned, and errno is set to the error code. The following
820  * error codes can be expected:
821  *	EINVAL		Invalid arguments (e.g. @ins or @outs is NULL).
822  *	EILSEQ		The input string cannot be represented as a multibyte
823  *			sequence according to the current locale.
824  *	ENAMETOOLONG	Destination buffer is too small for input string.
825  *	ENOMEM		Not enough memory to allocate destination buffer.
826  */
827 int ntfs_ucstombs(const ntfschar *ins, const int ins_len, char **outs,
828 		int outs_len)
829 {
830 	char *mbs;
831 	int mbs_len;
832 #ifdef MB_CUR_MAX
833 	wchar_t wc;
834 	int i, o;
835 	int cnt = 0;
836 #ifdef HAVE_MBSINIT
837 	mbstate_t mbstate;
838 #endif
839 #endif /* MB_CUR_MAX */
840
841 	if (!ins || !outs) {
842 		errno = EINVAL;
843 		return -1;
844 	}
845 	mbs = *outs;
846 	mbs_len = outs_len;
847 	if (mbs && !mbs_len) {
848 		errno = ENAMETOOLONG;
849 		return -1;
850 	}
851 	if (use_utf8)
852 		return ntfs_utf16_to_utf8(ins, ins_len, outs, outs_len);
853 #ifdef MB_CUR_MAX
854 	if (!mbs) {
855 		mbs_len = (ins_len + 1) * MB_CUR_MAX;
856 		mbs = ntfs_malloc(mbs_len);
857 		if (!mbs)
858 			return -1;
859 	}
860 #ifdef HAVE_MBSINIT
861 	memset(&mbstate, 0, sizeof(mbstate));
862 #else
863 	wctomb(NULL, 0);
864 #endif
865 	for (i = o = 0; i < ins_len; i++) {
866 		/* Reallocate memory if necessary or abort. */
867 		if ((int)(o + MB_CUR_MAX) > mbs_len) {
868 			char *tc;
869 			if (mbs == *outs) {
870 				errno = ENAMETOOLONG;
871 				return -1;
872 			}
873 			tc = ntfs_malloc((mbs_len + 64) & ~63);
874 			if (!tc)
875 				goto err_out;
876 			memcpy(tc, mbs, mbs_len);
877 			mbs_len = (mbs_len + 64) & ~63;
878 			free(mbs);
879 			mbs = tc;
880 		}
881 		/* Convert the LE Unicode character to a CPU wide character. */
882 		wc = (wchar_t)le16_to_cpu(ins[i]);
883 		if (!wc)
884 			break;
885 		/* Convert the CPU endian wide character to multibyte. */
886 #ifdef HAVE_MBSINIT
887 		cnt = wcrtomb(mbs + o, wc, &mbstate);
888 #else
889 		cnt = wctomb(mbs + o, wc);
890 #endif
891 		if (cnt == -1)
892 			goto err_out;
893 		if (cnt <= 0) {
894 			ntfs_log_debug("Eeek. cnt <= 0, cnt = %i\n", cnt);
895 			errno = EINVAL;
896 			goto err_out;
897 		}
898 		o += cnt;
899 	}
900 #ifdef HAVE_MBSINIT
901 	/* Make sure we are back in the initial state. */
902 	if (!mbsinit(&mbstate)) {
903 		ntfs_log_debug("Eeek. mbstate not in initial state!\n");
904 		errno = EILSEQ;
905 		goto err_out;
906 	}
907 #endif
908 	/* Now write the NULL character. */
909 	mbs[o] = '\0';
910 	if (*outs != mbs)
911 		*outs = mbs;
912 	return o;
913 err_out:
914 	if (mbs != *outs) {
915 		int eo = errno;
916 		free(mbs);
917 		errno = eo;
918 	}
919 #else /* MB_CUR_MAX */
920 	errno = EILSEQ;
921 #endif /* MB_CUR_MAX */
922 	return -1;
923 }
924
/**
 * ntfs_mbstoucs - convert a multibyte string to a little endian Unicode string
 * @ins:	input multibyte string buffer
 * @outs:	on return contains the (allocated) output Unicode string
 *
 * Convert the input multibyte string @ins into the corresponding little
 * endian, 2-byte Unicode string. When the UTF-8 mode is enabled (see
 * use_utf8) the conversion is done by ntfs_utf8_to_utf16(), which stores
 * the result into *@outs if it is not NULL and only allocates otherwise.
 * When the UTF-8 mode is disabled, @ins is interpreted according to the
 * current locale, the string is always allocated, and *@outs is only set
 * on success.
 *
 * The caller is responsible for calling free(*@outs); when finished with
 * an allocated string.
 *
 * On success the function returns the number of Unicode characters written to
 * the output string *@outs (>= 0), not counting the terminating Unicode NULL
 * character.
 *
 * On error, -1 is returned, and errno is set to the error code. The following
 * error codes can be expected:
 *	EINVAL		Invalid arguments (e.g. @ins or @outs is NULL).
 *	EILSEQ		The input string cannot be represented as a Unicode
 *			string according to the current locale.
 *	ENAMETOOLONG	Destination buffer is too small for input string.
 *	ENOMEM		Not enough memory to allocate destination buffer.
 */
int ntfs_mbstoucs(const char *ins, ntfschar **outs)
{
#ifdef MB_CUR_MAX
	ntfschar *ucs;
	const char *s;
	wchar_t wc;
	int i, o, cnt, ins_len, ucs_len, ins_size;
#ifdef HAVE_MBSINIT
	mbstate_t mbstate;
#endif
#endif /* MB_CUR_MAX */

	if (!ins || !outs) {
		errno = EINVAL;
		return -1;
	}

	if (use_utf8)
		return ntfs_utf8_to_utf16(ins, outs);

#ifdef MB_CUR_MAX
	/* Determine the size of the multi-byte string in bytes. */
	ins_size = strlen(ins);
	/* Determine the length of the multi-byte string. */
	s = ins;
#if defined(HAVE_MBSINIT)
	memset(&mbstate, 0, sizeof(mbstate));
	/* With a NULL destination only the character count is computed. */
	ins_len = mbsrtowcs(NULL, (const char **)&s, 0, &mbstate);
#ifdef __CYGWIN32__
	if (!ins_len && *ins) {
		/* Older Cygwin had broken mbsrtowcs() implementation. */
		ins_len = strlen(ins);
	}
#endif
#elif !defined(DJGPP)
	ins_len = mbstowcs(NULL, s, 0);
#else
	/* Eeek!!! DJGPP has broken mbstowcs() implementation!!! */
	ins_len = strlen(ins);
#endif
	/* The input is not valid in the current locale. */
	if (ins_len == -1)
		return ins_len;
#ifdef HAVE_MBSINIT
	if ((s != ins) || !mbsinit(&mbstate)) {
#else
	if (s != ins) {
#endif
		errno = EILSEQ;
		return -1;
	}
	/* Add the NULL terminator. */
	ins_len++;
	ucs_len = ins_len;
	ucs = ntfs_malloc(ucs_len * sizeof(ntfschar));
	if (!ucs)
		return -1;
#ifdef HAVE_MBSINIT
	memset(&mbstate, 0, sizeof(mbstate));
#else
	mbtowc(NULL, NULL, 0);
#endif
	/* i indexes the input bytes, o the output characters. */
	for (i = o = cnt = 0; i < ins_size; i += cnt, o++) {
		/* Reallocate memory if necessary. */
		if (o >= ucs_len) {
			ntfschar *tc;
			/* New size in bytes, rounded up to a multiple of 64. */
			ucs_len = (ucs_len * sizeof(ntfschar) + 64) & ~63;
			tc = realloc(ucs, ucs_len);
			if (!tc)
				goto err_out;
			ucs = tc;
			/* Back to a count of characters. */
			ucs_len /= sizeof(ntfschar);
		}
		/* Convert the multibyte character to a wide character. */
#ifdef HAVE_MBSINIT
		cnt = mbrtowc(&wc, ins + i, ins_size - i, &mbstate);
#else
		cnt = mbtowc(&wc, ins + i, ins_size - i);
#endif
		/* A null character was converted: end of input. */
		if (!cnt)
			break;
		if (cnt == -1)
			goto err_out;
		/* Any other negative value is not a usable byte count. */
		if (cnt < -1) {
			ntfs_log_trace("Eeek. cnt = %i\n", cnt);
			errno = EINVAL;
			goto err_out;
		}
		/* Make sure we are not overflowing the NTFS Unicode set. */
		if ((unsigned long)wc >= (unsigned long)(1 <<
				(8 * sizeof(ntfschar)))) {
			errno = EILSEQ;
			goto err_out;
		}
		/* Convert the CPU wide character to a LE Unicode character. */
		ucs[o] = cpu_to_le16(wc);
	}
#ifdef HAVE_MBSINIT
	/* Make sure we are back in the initial state. */
	if (!mbsinit(&mbstate)) {
		ntfs_log_trace("Eeek. mbstate not in initial state!\n");
		errno = EILSEQ;
		goto err_out;
	}
#endif
	/* Now write the NULL character. */
	ucs[o] = cpu_to_le16(L'\0');
	*outs = ucs;
	return o;
err_out:
	free(ucs);
#else /* MB_CUR_MAX */
	errno = EILSEQ;
#endif /* MB_CUR_MAX */
	return -1;
}
1063
1064 /*
1065  *		Turn a UTF8 name uppercase
1066  *
1067  *	Returns an allocated uppercase name which has to be freed by caller
1068  *	or NULL if there is an error (described by errno)
1069  */
1070
1071 char *ntfs_uppercase_mbs(const char *low,
1072 			const ntfschar *upcase, u32 upcase_size)
1073 {
1074 	int size;
1075 	char *upp;
1076 	u32 wc;
1077 	int n;
1078 	const char *s;
1079 	char *t;
1080
1081 	size = strlen(low);
1082 	upp = (char*)ntfs_malloc(3*size + 1);
1083 	if (upp) {
1084 		s = low;
1085 		t = upp;
1086 		do {
1087 			n = utf8_to_unicode(&wc, s);
1088 			if (n > 0) {
1089 				if (wc < upcase_size)
1090 					wc = le16_to_cpu(upcase[wc]);
1091 				if (wc < 0x80)
1092 					*t++ = wc;
1093 				else if (wc < 0x800) {
1094 					*t++ = (0xc0 | ((wc >> 6) & 0x3f));
1095 					*t++ = 0x80 | (wc & 0x3f);
1096 				} else if (wc < 0x10000) {
1097 					*t++ = 0xe0 | (wc >> 12);
1098 					*t++ = 0x80 | ((wc >> 6) & 0x3f);
1099 					*t++ = 0x80 | (wc & 0x3f);
1100 				} else {
1101 					*t++ = 0xf0 | ((wc >> 18) & 7);
1102 					*t++ = 0x80 | ((wc >> 12) & 63);
1103 					*t++ = 0x80 | ((wc >> 6) & 0x3f);
1104 					*t++ = 0x80 | (wc & 0x3f);
1105 				}
1106 			s += n;
1107 			}
1108 		} while (n > 0);
1109 		if (n < 0) {
1110 			free(upp);
1111 			upp = (char*)NULL;
1112 			errno = EILSEQ;
1113 		}
1114 		*t = 0;
1115 	}
1116 	return (upp);
1117 }
1118
1119 /**
1120  * ntfs_upcase_table_build - build the default upcase table for NTFS
1121  * @uc:		destination buffer where to store the built table
1122  * @uc_len:	size of destination buffer in bytes
1123  *
1124  * ntfs_upcase_table_build() builds the default upcase table for NTFS and
1125  * stores it in the caller supplied buffer @uc of size @uc_len.
1126  *
1127  * Note, @uc_len must be at least 128kiB in size or bad things will happen!
1128  */
1129 void ntfs_upcase_table_build(ntfschar *uc, u32 uc_len)
1130 {
1131 #if 1 /* Vista */
1132 	/*
1133 	 *	This is the table as defined by Vista
1134 	 */
1135 	/*
1136 	 * "Start" is inclusive and "End" is exclusive, every value has the
1137 	 * value of "Add" added to it.
1138 	 */
1139 	static int uc_run_table[][3] = { /* Start, End, Add */
1140 	{0x0061, 0x007b,   -32}, {0x00e0, 0x00f7,  -32}, {0x00f8, 0x00ff, -32},
1141 	{0x0256, 0x0258,  -205}, {0x028a, 0x028c, -217}, {0x037b, 0x037e, 130},
1142 	{0x03ac, 0x03ad,   -38}, {0x03ad, 0x03b0,  -37}, {0x03b1, 0x03c2, -32},
1143 	{0x03c2, 0x03c3,   -31}, {0x03c3, 0x03cc,  -32}, {0x03cc, 0x03cd, -64},
1144 	{0x03cd, 0x03cf,   -63}, {0x0430, 0x0450,  -32}, {0x0450, 0x0460, -80},
1145 	{0x0561, 0x0587,   -48}, {0x1f00, 0x1f08,    8}, {0x1f10, 0x1f16,   8},
1146 	{0x1f20, 0x1f28,     8}, {0x1f30, 0x1f38,    8}, {0x1f40, 0x1f46,   8},
1147 	{0x1f51, 0x1f52,     8}, {0x1f53, 0x1f54,    8}, {0x1f55, 0x1f56,   8},
1148 	{0x1f57, 0x1f58,     8}, {0x1f60, 0x1f68,    8}, {0x1f70, 0x1f72,  74},
1149 	{0x1f72, 0x1f76,    86}, {0x1f76, 0x1f78,  100}, {0x1f78, 0x1f7a, 128},
1150 	{0x1f7a, 0x1f7c,   112}, {0x1f7c, 0x1f7e,  126}, {0x1f80, 0x1f88,   8},
1151 	{0x1f90, 0x1f98,     8}, {0x1fa0, 0x1fa8,    8}, {0x1fb0, 0x1fb2,   8},
1152 	{0x1fb3, 0x1fb4,     9}, {0x1fcc, 0x1fcd,   -9}, {0x1fd0, 0x1fd2,   8},
1153 	{0x1fe0, 0x1fe2,     8}, {0x1fe5, 0x1fe6,    7}, {0x1ffc, 0x1ffd,  -9},
1154 	{0x2170, 0x2180,   -16}, {0x24d0, 0x24ea,  -26}, {0x2c30, 0x2c5f, -48},
1155 	{0x2d00, 0x2d26, -7264}, {0xff41, 0xff5b,  -32}, {0}
1156 	};
1157 	/*
1158 	 * "Start" is exclusive and "End" is inclusive, every second value is
1159 	 * decremented by one.
1160 	 */
1161 	static int uc_dup_table[][2] = { /* Start, End */
1162 	{0x0100, 0x012f}, {0x0132, 0x0137}, {0x0139, 0x0149}, {0x014a, 0x0178},
1163 	{0x0179, 0x017e}, {0x01a0, 0x01a6}, {0x01b3, 0x01b7}, {0x01cd, 0x01dd},
1164 	{0x01de, 0x01ef}, {0x01f4, 0x01f5}, {0x01f8, 0x01f9}, {0x01fa, 0x0220},
1165 	{0x0222, 0x0234}, {0x023b, 0x023c}, {0x0241, 0x0242}, {0x0246, 0x024f},
1166 	{0x03d8, 0x03ef}, {0x03f7, 0x03f8}, {0x03fa, 0x03fb}, {0x0460, 0x0481},
1167 	{0x048a, 0x04bf}, {0x04c1, 0x04c4}, {0x04c5, 0x04c8}, {0x04c9, 0x04ce},
1168 	{0x04ec, 0x04ed}, {0x04d0, 0x04eb}, {0x04ee, 0x04f5}, {0x04f6, 0x0513},
1169 	{0x1e00, 0x1e95}, {0x1ea0, 0x1ef9}, {0x2183, 0x2184}, {0x2c60, 0x2c61},
1170 	{0x2c67, 0x2c6c}, {0x2c75, 0x2c76}, {0x2c80, 0x2ce3}, {0}
1171 	};
1172 	/*
1173 	 * Set the Unicode character at offset "Offset" to "Value".  Note,
1174 	 * "Value" is host endian.
1175 	 */
1176 	static int uc_byte_table[][2] = { /* Offset, Value */
1177 	{0x00ff, 0x0178}, {0x0180, 0x0243}, {0x0183, 0x0182}, {0x0185, 0x0184},
1178 	{0x0188, 0x0187}, {0x018c, 0x018b}, {0x0192, 0x0191}, {0x0195, 0x01f6},
1179 	{0x0199, 0x0198}, {0x019a, 0x023d}, {0x019e, 0x0220}, {0x01a8, 0x01a7},
1180 	{0x01ad, 0x01ac}, {0x01b0, 0x01af}, {0x01b9, 0x01b8}, {0x01bd, 0x01bc},
1181 	{0x01bf, 0x01f7}, {0x01c6, 0x01c4}, {0x01c9, 0x01c7}, {0x01cc, 0x01ca},
1182 	{0x01dd, 0x018e}, {0x01f3, 0x01f1}, {0x023a, 0x2c65}, {0x023e, 0x2c66},
1183 	{0x0253, 0x0181}, {0x0254, 0x0186}, {0x0259, 0x018f}, {0x025b, 0x0190},
1184 	{0x0260, 0x0193}, {0x0263, 0x0194}, {0x0268, 0x0197}, {0x0269, 0x0196},
1185 	{0x026b, 0x2c62}, {0x026f, 0x019c}, {0x0272, 0x019d}, {0x0275, 0x019f},
1186 	{0x027d, 0x2c64}, {0x0280, 0x01a6}, {0x0283, 0x01a9}, {0x0288, 0x01ae},
1187 	{0x0289, 0x0244}, {0x028c, 0x0245}, {0x0292, 0x01b7}, {0x03f2, 0x03f9},
1188 	{0x04cf, 0x04c0}, {0x1d7d, 0x2c63}, {0x214e, 0x2132}, {0}
1189 	};
1190 #else /* Vista */
1191 	/*
1192 	 *	This is the table as defined by Windows XP
1193 	 */
1194 	static int uc_run_table[][3] = { /* Start, End, Add */
1195 	{0x0061, 0x007B,  -32}, {0x0451, 0x045D, -80}, {0x1F70, 0x1F72,  74},
1196 	{0x00E0, 0x00F7,  -32}, {0x045E, 0x0460, -80}, {0x1F72, 0x1F76,  86},
1197 	{0x00F8, 0x00FF,  -32}, {0x0561, 0x0587, -48}, {0x1F76, 0x1F78, 100},
1198 	{0x0256, 0x0258, -205}, {0x1F00, 0x1F08,   8}, {0x1F78, 0x1F7A, 128},
1199 	{0x028A, 0x028C, -217}, {0x1F10, 0x1F16,   8}, {0x1F7A, 0x1F7C, 112},
1200 	{0x03AC, 0x03AD,  -38}, {0x1F20, 0x1F28,   8}, {0x1F7C, 0x1F7E, 126},
1201 	{0x03AD, 0x03B0,  -37}, {0x1F30, 0x1F38,   8}, {0x1FB0, 0x1FB2,   8},
1202 	{0x03B1, 0x03C2,  -32}, {0x1F40, 0x1F46,   8}, {0x1FD0, 0x1FD2,   8},
1203 	{0x03C2, 0x03C3,  -31}, {0x1F51, 0x1F52,   8}, {0x1FE0, 0x1FE2,   8},
1204 	{0x03C3, 0x03CC,  -32}, {0x1F53, 0x1F54,   8}, {0x1FE5, 0x1FE6,   7},
1205 	{0x03CC, 0x03CD,  -64}, {0x1F55, 0x1F56,   8}, {0x2170, 0x2180, -16},
1206 	{0x03CD, 0x03CF,  -63}, {0x1F57, 0x1F58,   8}, {0x24D0, 0x24EA, -26},
1207 	{0x0430, 0x0450,  -32}, {0x1F60, 0x1F68,   8}, {0xFF41, 0xFF5B, -32},
1208 	{0}
1209 	};
1210 	static int uc_dup_table[][2] = { /* Start, End */
1211 	{0x0100, 0x012F}, {0x01A0, 0x01A6}, {0x03E2, 0x03EF}, {0x04CB, 0x04CC},
1212 	{0x0132, 0x0137}, {0x01B3, 0x01B7}, {0x0460, 0x0481}, {0x04D0, 0x04EB},
1213 	{0x0139, 0x0149}, {0x01CD, 0x01DD}, {0x0490, 0x04BF}, {0x04EE, 0x04F5},
1214 	{0x014A, 0x0178}, {0x01DE, 0x01EF}, {0x04BF, 0x04BF}, {0x04F8, 0x04F9},
1215 	{0x0179, 0x017E}, {0x01F4, 0x01F5}, {0x04C1, 0x04C4}, {0x1E00, 0x1E95},
1216 	{0x018B, 0x018B}, {0x01FA, 0x0218}, {0x04C7, 0x04C8}, {0x1EA0, 0x1EF9},
1217 	{0}
1218 	};
1219 	static int uc_byte_table[][2] = { /* Offset, Value */
1220 	{0x00FF, 0x0178}, {0x01AD, 0x01AC}, {0x01F3, 0x01F1}, {0x0269, 0x0196},
1221 	{0x0183, 0x0182}, {0x01B0, 0x01AF}, {0x0253, 0x0181}, {0x026F, 0x019C},
1222 	{0x0185, 0x0184}, {0x01B9, 0x01B8}, {0x0254, 0x0186}, {0x0272, 0x019D},
1223 	{0x0188, 0x0187}, {0x01BD, 0x01BC}, {0x0259, 0x018F}, {0x0275, 0x019F},
1224 	{0x018C, 0x018B}, {0x01C6, 0x01C4}, {0x025B, 0x0190}, {0x0283, 0x01A9},
1225 	{0x0192, 0x0191}, {0x01C9, 0x01C7}, {0x0260, 0x0193}, {0x0288, 0x01AE},
1226 	{0x0199, 0x0198}, {0x01CC, 0x01CA}, {0x0263, 0x0194}, {0x0292, 0x01B7},
1227 	{0x01A8, 0x01A7}, {0x01DD, 0x018E}, {0x0268, 0x0197},
1228 	{0}
1229 	};
1230 #endif /* Vista */
1231 	int i, r;
1232 	int k, off;
1233
1234 	memset((char*)uc, 0, uc_len);
1235 	uc_len >>= 1;
1236 	if (uc_len > 65536)
1237 		uc_len = 65536;
1238 	for (i = 0; (u32)i < uc_len; i++)
1239 		uc[i] = cpu_to_le16(i);
1240 	for (r = 0; uc_run_table[r][0]; r++) {
1241 		off = uc_run_table[r][2];
1242 		for (i = uc_run_table[r][0]; i < uc_run_table[r][1]; i++)
1243 			uc[i] = cpu_to_le16(i + off);
1244 	}
1245 	for (r = 0; uc_dup_table[r][0]; r++)
1246 		for (i = uc_dup_table[r][0]; i < uc_dup_table[r][1]; i += 2)
1247 			uc[i + 1] = cpu_to_le16(i);
1248 	for (r = 0; uc_byte_table[r][0]; r++) {
1249 		k = uc_byte_table[r][1];
1250 		uc[uc_byte_table[r][0]] = cpu_to_le16(k);
1251 	}
1252 }
1253
1254 /*
1255  *		Allocate and build the default upcase table
1256  *
1257  *	Returns the number of entries
1258  *		0 if failed
1259  */
1260
1261 #define UPCASE_LEN 65536 /* default number of entries in upcase */
1262
1263 u32 ntfs_upcase_build_default(ntfschar **upcase)
1264 {
1265 	u32 upcase_len = 0;
1266
1267 	*upcase = (ntfschar*)ntfs_malloc(UPCASE_LEN*2);
1268 	if (*upcase) {
1269 		ntfs_upcase_table_build(*upcase, UPCASE_LEN*2);
1270 		upcase_len = UPCASE_LEN;
1271 	}
1272 	return (upcase_len);
1273 }
1274
1275 /*
1276  *		Build a table for converting to lower case
1277  *
1278  *	This is only meaningful when there is a single lower case
1279  *	character leading to an upper case one, and currently the
1280  *	only exception is the greek letter sigma which has a single
1281  *	upper case glyph (code U+03A3), but two lower case glyphs
1282  *	(code U+03C3 and U+03C2, the latter to be used at the end
1283  *	of a word). In the following implementation the upper case
1284  *	sigma will be lowercased as U+03C3.
1285  */
1286
1287 ntfschar *ntfs_locase_table_build(const ntfschar *uc, u32 uc_cnt)
1288 {
1289 	ntfschar *lc;
1290 	u32 upp;
1291 	u32 i;
1292
1293 	lc = (ntfschar*)ntfs_malloc(uc_cnt*sizeof(ntfschar));
1294 	if (lc) {
1295 		for (i=0; i<uc_cnt; i++)
1296 			lc[i] = cpu_to_le16(i);
1297 		for (i=0; i<uc_cnt; i++) {
1298 			upp = le16_to_cpu(uc[i]);
1299 			if ((upp != i) && (upp < uc_cnt))
1300 				lc[upp] = cpu_to_le16(i);
1301 		}
1302 	} else
1303 		ntfs_log_error("Could not build the locase table\n");
1304 	return (lc);
1305 }
1306
1307 /**
1308  * ntfs_str2ucs - convert a string to a valid NTFS file name
1309  * @s:		input string
1310  * @len:	length of output buffer in Unicode characters
1311  *
1312  * Convert the input @s string into the corresponding little endian,
1313  * 2-byte Unicode string. The length of the converted string is less
1314  * or equal to the maximum length allowed by the NTFS format (255).
1315  *
1316  * If @s is NULL then return AT_UNNAMED.
1317  *
1318  * On success the function returns the Unicode string in an allocated
1319  * buffer and the caller is responsible to free it when it's not needed
1320  * anymore.
1321  *
1322  * On error NULL is returned and errno is set to the error code.
1323  */
1324 ntfschar *ntfs_str2ucs(const char *s, int *len)
1325 {
1326 	ntfschar *ucs = NULL;
1327
1328 	if (s && ((*len = ntfs_mbstoucs(s, &ucs)) == -1)) {
1329 		ntfs_log_perror("Couldn't convert '%s' to Unicode", s);
1330 		return NULL;
1331 	}
1332 	if (*len > NTFS_MAX_NAME_LEN) {
1333 		free(ucs);
1334 		errno = ENAMETOOLONG;
1335 		return NULL;
1336 	}
1337 	if (!ucs || !*len) {
1338 		ucs  = AT_UNNAMED;
1339 		*len = 0;
1340 	}
1341 	return ucs;
1342 }
1343
1344 /**
1345  * ntfs_ucsfree - free memory allocated by ntfs_str2ucs()
1346  * @ucs		input string to be freed
1347  *
1348  * Free memory at @ucs and which was allocated by ntfs_str2ucs.
1349  *
1350  * Return value: none.
1351  */
1352 void ntfs_ucsfree(ntfschar *ucs)
1353 {
1354 	if (ucs && (ucs != AT_UNNAMED))
1355 		free(ucs);
1356 }
1357
1358 /*
1359  *		Check whether a name contains no chars forbidden
1360  *	for DOS or Win32 use
1361  *
1362  *	If there is a bad char, errno is set to EINVAL
1363  */
1364
1365 BOOL ntfs_forbidden_chars(const ntfschar *name, int len)
1366 {
1367 	BOOL forbidden;
1368 	int ch;
1369 	int i;
1370 	u32 mainset =     (1L << ('\"' - 0x20))
1371 			| (1L << ('*' - 0x20))
1372 			| (1L << ('/' - 0x20))
1373 			| (1L << (':' - 0x20))
1374 			| (1L << ('<' - 0x20))
1375 			| (1L << ('>' - 0x20))
1376 			| (1L << ('?' - 0x20));
1377
1378 	forbidden = (len == 0)
1379 			|| (le16_to_cpu(name[len-1]) == ' ')
1380 			|| (le16_to_cpu(name[len-1]) == '.');
1381 	for (i=0; i<len; i++) {
1382 		ch = le16_to_cpu(name[i]);
1383 		if ((ch < 0x20)
1384 		    || ((ch < 0x40)
1385 			&& ((1L << (ch - 0x20)) & mainset))
1386 		    || (ch == '\\')
1387 		    || (ch == '|'))
1388 			forbidden = TRUE;
1389 	}
1390 	if (forbidden)
1391 		errno = EINVAL;
1392 	return (forbidden);
1393 }
1394
1395 /*
1396  *		Check whether the same name can be used as a DOS and
1397  *	a Win32 name
1398  *
1399  *	The names must be the same, or the short name the uppercase
1400  *	variant of the long name
1401  */
1402
1403 BOOL ntfs_collapsible_chars(ntfs_volume *vol,
1404 			const ntfschar *shortname, int shortlen,
1405 			const ntfschar *longname, int longlen)
1406 {
1407 	BOOL collapsible;
1408 	unsigned int ch;
1409 	unsigned int cs;
1410 	int i;
1411
1412 	collapsible = shortlen == longlen;
1413 	for (i=0; collapsible && (i<shortlen); i++) {
1414 		ch = le16_to_cpu(longname[i]);
1415 		cs = le16_to_cpu(shortname[i]);
1416 		if ((cs != ch)
1417 		    && ((ch >= vol->upcase_len)
1418 			|| (cs >= vol->upcase_len)
1419 			|| (vol->upcase[cs] != vol->upcase[ch])))
1420 				collapsible = FALSE;
1421 	}
1422 	return (collapsible);
1423 }
1424
1425 /*
1426  * Define the character encoding to be used.
1427  * Use UTF-8 unless specified otherwise.
1428  */
1429
1430 int ntfs_set_char_encoding(const char *locale)
1431 {
1432 	use_utf8 = 0;
1433 	if (!locale || strstr(locale,"utf8") || strstr(locale,"UTF8")
1434 	    || strstr(locale,"utf-8") || strstr(locale,"UTF-8"))
1435 		use_utf8 = 1;
1436 	else
1437 		if (setlocale(LC_ALL, locale))
1438 			use_utf8 = 0;
1439 		else {
1440 			ntfs_log_error("Invalid locale, encoding to UTF-8\n");
1441 			use_utf8 = 1;
1442 	 	}
1443 	return 0; /* always successful */
1444 }
1445
1446 #if defined(__APPLE__) || defined(__DARWIN__)
1447
/*
 * Enable (1) or disable (0) the normalization of file names.
 *
 * Returns 0 if the setting was recorded, -1 if @normalize is neither
 * 0 nor 1, or if the code was built without ENABLE_NFCONV.
 */
int ntfs_macosx_normalize_filenames(int normalize) {
#ifdef ENABLE_NFCONV
	if((normalize != 0) && (normalize != 1))
		return -1;
	nfconvert_utf8 = normalize;
	return 0;
#else
	return -1;
#endif /* ENABLE_NFCONV */
}
1460
/*
 * Normalize an UTF-8 string, to composed form (NFC) if @composed is
 * non-zero, to decomposed form (NFD) otherwise.
 *
 * On success the normalized string is returned in *target as an
 * allocated '\0'-terminated buffer, and the function returns its
 * length in bytes (terminator excluded).
 *
 * On failure *target is left untouched and a negative value is returned :
 *	-2 if the input could not be converted to a CFString
 *	-3 if the mutable copy could not be created
 *	-1 for any other failure, or when built without ENABLE_NFCONV
 */
int ntfs_macosx_normalize_utf8(const char *utf8_string, char **target,
 int composed) {
#ifdef ENABLE_NFCONV
	/* For this code to compile, the CoreFoundation framework must be fed to the linker. */
	CFStringRef source;
	CFMutableStringRef work;
	CFRange whole;
	CFIndex needed;
	char *buf = NULL;
	int buflen = -1;

	/* Get a CFString out of the UTF-8 input. */
	source = CFStringCreateWithCString(kCFAllocatorDefault, utf8_string, kCFStringEncodingUTF8);
	if(source == NULL) {
		ntfs_log_error("CFStringCreateWithCString failed!\n");
		return -2;
	}

	/* Normalization is done in place, so work on a mutable copy. */
	work = CFStringCreateMutableCopy(kCFAllocatorDefault, 0, source);
	CFRelease(source); /* Not needed any more, whether the copy succeeded or not. */
	if(work == NULL) {
		ntfs_log_error("CFStringCreateMutableCopy failed!\n");
		return -3;
	}

	CFStringNormalize(work, (composed != 0 ? kCFStringNormalizationFormC : kCFStringNormalizationFormD));

	/* First call with no buffer to get the required size, second call to convert. */
	whole = CFRangeMake(0, CFStringGetLength(work));
	if(CFStringGetBytes(work, whole, kCFStringEncodingUTF8, 0, false, NULL, 0, &needed) <= 0)
		ntfs_log_error("Could not perform check for required length of UTF-8 conversion of normalized CFMutableString.\n");
	else {
		/* One more byte for the terminator. */
		buflen = sizeof(char)*(needed + 1);
		buf = ntfs_calloc(buflen);

		if(buf == NULL)
			ntfs_log_error("Could not perform a ntfs_calloc of %d bytes for char *result.\n", buflen);
		else if(CFStringGetBytes(work, whole, kCFStringEncodingUTF8,
					 0, false, (UInt8*)buf, buflen-1, &needed) <= 0) {
			ntfs_log_error("Could not perform UTF-8 conversion of normalized CFMutableString.\n");
			free(buf);
			buf = NULL;
		}
	}

	CFRelease(work);

	if(buf == NULL)
		return -1;
	*target = buf;
	return buflen - 1;
#else
	return -1;
#endif /* ENABLE_NFCONV */
}
1523 #endif /* defined(__APPLE__) || defined(__DARWIN__) */
1524