diff options
author | Kenny Root | 2010-11-09 16:37:23 -0600 |
---|---|---|
committer | Alex Ray | 2013-07-30 15:56:55 -0500 |
commit | ba0165bef09729a33ab8e0ca329342be05e0d859 (patch) | |
tree | 2f961146c3c8c625a01902207485fed0d7a2cdcd /libs/utils/String8.cpp | |
parent | 3a91fca00c6b3db62b4dc0da95ba30671caf3283 (diff) | |
download | platform-system-core-ba0165bef09729a33ab8e0ca329342be05e0d859.tar.gz platform-system-core-ba0165bef09729a33ab8e0ca329342be05e0d859.tar.xz platform-system-core-ba0165bef09729a33ab8e0ca329342be05e0d859.zip |
Split UTF functions from String8/16
Split out all the UTF-8/16/32 handling code from String8/16 to its own
file to allow better reuse of code.
Change-Id: If9ce63920edc75472c38da4adce0d13cda9ad2f7
Diffstat (limited to 'libs/utils/String8.cpp')
-rw-r--r-- | libs/utils/String8.cpp | 391 |
1 file changed, 28 insertions, 363 deletions
diff --git a/libs/utils/String8.cpp b/libs/utils/String8.cpp index 6358fc424..c8dc0838d 100644 --- a/libs/utils/String8.cpp +++ b/libs/utils/String8.cpp | |||
@@ -17,6 +17,8 @@ | |||
17 | #include <utils/String8.h> | 17 | #include <utils/String8.h> |
18 | 18 | ||
19 | #include <utils/Log.h> | 19 | #include <utils/Log.h> |
20 | #include <utils/Unicode.h> | ||
21 | #include <utils/SharedBuffer.h> | ||
20 | #include <utils/String16.h> | 22 | #include <utils/String16.h> |
21 | #include <utils/TextOutput.h> | 23 | #include <utils/TextOutput.h> |
22 | #include <utils/threads.h> | 24 | #include <utils/threads.h> |
@@ -34,94 +36,10 @@ | |||
34 | 36 | ||
35 | namespace android { | 37 | namespace android { |
36 | 38 | ||
37 | static const char32_t kByteMask = 0x000000BF; | ||
38 | static const char32_t kByteMark = 0x00000080; | ||
39 | |||
40 | // Surrogates aren't valid for UTF-32 characters, so define some | ||
41 | // constants that will let us screen them out. | ||
42 | static const char32_t kUnicodeSurrogateHighStart = 0x0000D800; | ||
43 | static const char32_t kUnicodeSurrogateHighEnd = 0x0000DBFF; | ||
44 | static const char32_t kUnicodeSurrogateLowStart = 0x0000DC00; | ||
45 | static const char32_t kUnicodeSurrogateLowEnd = 0x0000DFFF; | ||
46 | static const char32_t kUnicodeSurrogateStart = kUnicodeSurrogateHighStart; | ||
47 | static const char32_t kUnicodeSurrogateEnd = kUnicodeSurrogateLowEnd; | ||
48 | static const char32_t kUnicodeMaxCodepoint = 0x0010FFFF; | ||
49 | |||
50 | // Mask used to set appropriate bits in first byte of UTF-8 sequence, | ||
51 | // indexed by number of bytes in the sequence. | ||
52 | // 0xxxxxxx | ||
53 | // -> (00-7f) 7bit. Bit mask for the first byte is 0x00000000 | ||
54 | // 110yyyyx 10xxxxxx | ||
55 | // -> (c0-df)(80-bf) 11bit. Bit mask is 0x000000C0 | ||
56 | // 1110yyyy 10yxxxxx 10xxxxxx | ||
57 | // -> (e0-ef)(80-bf)(80-bf) 16bit. Bit mask is 0x000000E0 | ||
58 | // 11110yyy 10yyxxxx 10xxxxxx 10xxxxxx | ||
59 | // -> (f0-f7)(80-bf)(80-bf)(80-bf) 21bit. Bit mask is 0x000000F0 | ||
60 | static const char32_t kFirstByteMark[] = { | ||
61 | 0x00000000, 0x00000000, 0x000000C0, 0x000000E0, 0x000000F0 | ||
62 | }; | ||
63 | |||
64 | // Separator used by resource paths. This is not platform dependent contrary | 39 | // Separator used by resource paths. This is not platform dependent contrary |
65 | // to OS_PATH_SEPARATOR. | 40 | // to OS_PATH_SEPARATOR. |
66 | #define RES_PATH_SEPARATOR '/' | 41 | #define RES_PATH_SEPARATOR '/' |
67 | 42 | ||
68 | // Return number of utf8 bytes required for the character. | ||
69 | static size_t utf32_to_utf8_bytes(char32_t srcChar) | ||
70 | { | ||
71 | size_t bytesToWrite; | ||
72 | |||
73 | // Figure out how many bytes the result will require. | ||
74 | if (srcChar < 0x00000080) | ||
75 | { | ||
76 | bytesToWrite = 1; | ||
77 | } | ||
78 | else if (srcChar < 0x00000800) | ||
79 | { | ||
80 | bytesToWrite = 2; | ||
81 | } | ||
82 | else if (srcChar < 0x00010000) | ||
83 | { | ||
84 | if ((srcChar < kUnicodeSurrogateStart) | ||
85 | || (srcChar > kUnicodeSurrogateEnd)) | ||
86 | { | ||
87 | bytesToWrite = 3; | ||
88 | } | ||
89 | else | ||
90 | { | ||
91 | // Surrogates are invalid UTF-32 characters. | ||
92 | return 0; | ||
93 | } | ||
94 | } | ||
95 | // Max code point for Unicode is 0x0010FFFF. | ||
96 | else if (srcChar <= kUnicodeMaxCodepoint) | ||
97 | { | ||
98 | bytesToWrite = 4; | ||
99 | } | ||
100 | else | ||
101 | { | ||
102 | // Invalid UTF-32 character. | ||
103 | return 0; | ||
104 | } | ||
105 | |||
106 | return bytesToWrite; | ||
107 | } | ||
108 | |||
109 | // Write out the source character to <dstP>. | ||
110 | |||
111 | static void utf32_to_utf8(uint8_t* dstP, char32_t srcChar, size_t bytes) | ||
112 | { | ||
113 | dstP += bytes; | ||
114 | switch (bytes) | ||
115 | { /* note: everything falls through. */ | ||
116 | case 4: *--dstP = (uint8_t)((srcChar | kByteMark) & kByteMask); srcChar >>= 6; | ||
117 | case 3: *--dstP = (uint8_t)((srcChar | kByteMark) & kByteMask); srcChar >>= 6; | ||
118 | case 2: *--dstP = (uint8_t)((srcChar | kByteMark) & kByteMask); srcChar >>= 6; | ||
119 | case 1: *--dstP = (uint8_t)(srcChar | kFirstByteMark[bytes]); | ||
120 | } | ||
121 | } | ||
122 | |||
123 | // --------------------------------------------------------------------------- | ||
124 | |||
125 | static SharedBuffer* gEmptyStringBuf = NULL; | 43 | static SharedBuffer* gEmptyStringBuf = NULL; |
126 | static char* gEmptyString = NULL; | 44 | static char* gEmptyString = NULL; |
127 | 45 | ||
@@ -175,62 +93,47 @@ static char* allocFromUTF8(const char* in, size_t len) | |||
175 | return getEmptyString(); | 93 | return getEmptyString(); |
176 | } | 94 | } |
177 | 95 | ||
178 | template<typename T, typename L> | 96 | static char* allocFromUTF16(const char16_t* in, size_t len) |
179 | static char* allocFromUTF16OrUTF32(const T* in, L len) | ||
180 | { | 97 | { |
181 | if (len == 0) return getEmptyString(); | 98 | if (len == 0) return getEmptyString(); |
182 | 99 | ||
183 | size_t bytes = 0; | 100 | const ssize_t bytes = utf16_to_utf8_length(in, len); |
184 | const T* end = in+len; | 101 | if (bytes < 0) { |
185 | const T* p = in; | 102 | return getEmptyString(); |
186 | |||
187 | while (p < end) { | ||
188 | bytes += utf32_to_utf8_bytes(*p); | ||
189 | p++; | ||
190 | } | 103 | } |
191 | 104 | ||
192 | SharedBuffer* buf = SharedBuffer::alloc(bytes+1); | 105 | SharedBuffer* buf = SharedBuffer::alloc(bytes+1); |
193 | LOG_ASSERT(buf, "Unable to allocate shared buffer"); | 106 | LOG_ASSERT(buf, "Unable to allocate shared buffer"); |
194 | if (buf) { | 107 | if (!buf) { |
195 | p = in; | 108 | return getEmptyString(); |
196 | char* str = (char*)buf->data(); | ||
197 | char* d = str; | ||
198 | while (p < end) { | ||
199 | const T c = *p++; | ||
200 | size_t len = utf32_to_utf8_bytes(c); | ||
201 | utf32_to_utf8((uint8_t*)d, c, len); | ||
202 | d += len; | ||
203 | } | ||
204 | *d = 0; | ||
205 | |||
206 | return str; | ||
207 | } | 109 | } |
208 | 110 | ||
209 | return getEmptyString(); | 111 | char* str = (char*)buf->data(); |
112 | utf16_to_utf8(in, len, str); | ||
113 | return str; | ||
210 | } | 114 | } |
211 | 115 | ||
212 | static char* allocFromUTF16(const char16_t* in, size_t len) | 116 | static char* allocFromUTF32(const char32_t* in, size_t len) |
213 | { | 117 | { |
214 | if (len == 0) return getEmptyString(); | 118 | if (len == 0) { |
119 | return getEmptyString(); | ||
120 | } | ||
215 | 121 | ||
216 | const size_t bytes = utf8_length_from_utf16(in, len); | 122 | const ssize_t bytes = utf32_to_utf8_length(in, len); |
123 | if (bytes < 0) { | ||
124 | return getEmptyString(); | ||
125 | } | ||
217 | 126 | ||
218 | SharedBuffer* buf = SharedBuffer::alloc(bytes+1); | 127 | SharedBuffer* buf = SharedBuffer::alloc(bytes+1); |
219 | LOG_ASSERT(buf, "Unable to allocate shared buffer"); | 128 | LOG_ASSERT(buf, "Unable to allocate shared buffer"); |
220 | if (buf) { | 129 | if (!buf) { |
221 | char* str = (char*)buf->data(); | 130 | return getEmptyString(); |
222 | |||
223 | utf16_to_utf8(in, len, str, bytes+1); | ||
224 | |||
225 | return str; | ||
226 | } | 131 | } |
227 | 132 | ||
228 | return getEmptyString(); | 133 | char* str = (char*) buf->data(); |
229 | } | 134 | utf32_to_utf8(in, len, str); |
230 | 135 | ||
231 | static char* allocFromUTF32(const char32_t* in, size_t len) | 136 | return str; |
232 | { | ||
233 | return allocFromUTF16OrUTF32<char32_t, size_t>(in, len); | ||
234 | } | 137 | } |
235 | 138 | ||
236 | // --------------------------------------------------------------------------- | 139 | // --------------------------------------------------------------------------- |
@@ -510,17 +413,17 @@ void String8::toUpper(size_t start, size_t length) | |||
510 | 413 | ||
511 | size_t String8::getUtf32Length() const | 414 | size_t String8::getUtf32Length() const |
512 | { | 415 | { |
513 | return utf32_length(mString, length()); | 416 | return utf8_to_utf32_length(mString, length()); |
514 | } | 417 | } |
515 | 418 | ||
516 | int32_t String8::getUtf32At(size_t index, size_t *next_index) const | 419 | int32_t String8::getUtf32At(size_t index, size_t *next_index) const |
517 | { | 420 | { |
518 | return utf32_at(mString, length(), index, next_index); | 421 | return utf32_from_utf8_at(mString, length(), index, next_index); |
519 | } | 422 | } |
520 | 423 | ||
521 | size_t String8::getUtf32(char32_t* dst, size_t dst_len) const | 424 | void String8::getUtf32(char32_t* dst) const |
522 | { | 425 | { |
523 | return utf8_to_utf32(mString, length(), dst, dst_len); | 426 | utf8_to_utf32(mString, length(), dst); |
524 | } | 427 | } |
525 | 428 | ||
526 | TextOutput& operator<<(TextOutput& to, const String8& val) | 429 | TextOutput& operator<<(TextOutput& to, const String8& val) |
@@ -705,241 +608,3 @@ String8& String8::convertToResPath() | |||
705 | } | 608 | } |
706 | 609 | ||
707 | }; // namespace android | 610 | }; // namespace android |
708 | |||
709 | // --------------------------------------------------------------------------- | ||
710 | |||
711 | size_t strlen32(const char32_t *s) | ||
712 | { | ||
713 | const char32_t *ss = s; | ||
714 | while ( *ss ) | ||
715 | ss++; | ||
716 | return ss-s; | ||
717 | } | ||
718 | |||
719 | size_t strnlen32(const char32_t *s, size_t maxlen) | ||
720 | { | ||
721 | const char32_t *ss = s; | ||
722 | while ((maxlen > 0) && *ss) { | ||
723 | ss++; | ||
724 | maxlen--; | ||
725 | } | ||
726 | return ss-s; | ||
727 | } | ||
728 | |||
729 | size_t utf8_length(const char *src) | ||
730 | { | ||
731 | const char *cur = src; | ||
732 | size_t ret = 0; | ||
733 | while (*cur != '\0') { | ||
734 | const char first_char = *cur++; | ||
735 | if ((first_char & 0x80) == 0) { // ASCII | ||
736 | ret += 1; | ||
737 | continue; | ||
738 | } | ||
739 | // (UTF-8's character must not be like 10xxxxxx, | ||
740 | // but 110xxxxx, 1110xxxx, ... or 1111110x) | ||
741 | if ((first_char & 0x40) == 0) { | ||
742 | return 0; | ||
743 | } | ||
744 | |||
745 | int32_t mask, to_ignore_mask; | ||
746 | size_t num_to_read = 0; | ||
747 | char32_t utf32 = 0; | ||
748 | for (num_to_read = 1, mask = 0x40, to_ignore_mask = 0x80; | ||
749 | num_to_read < 5 && (first_char & mask); | ||
750 | num_to_read++, to_ignore_mask |= mask, mask >>= 1) { | ||
751 | if ((*cur & 0xC0) != 0x80) { // must be 10xxxxxx | ||
752 | return 0; | ||
753 | } | ||
754 | // 0x3F == 00111111 | ||
755 | utf32 = (utf32 << 6) + (*cur++ & 0x3F); | ||
756 | } | ||
757 | // "first_char" must be (110xxxxx - 11110xxx) | ||
758 | if (num_to_read == 5) { | ||
759 | return 0; | ||
760 | } | ||
761 | to_ignore_mask |= mask; | ||
762 | utf32 |= ((~to_ignore_mask) & first_char) << (6 * (num_to_read - 1)); | ||
763 | if (utf32 > android::kUnicodeMaxCodepoint) { | ||
764 | return 0; | ||
765 | } | ||
766 | |||
767 | ret += num_to_read; | ||
768 | } | ||
769 | return ret; | ||
770 | } | ||
771 | |||
772 | size_t utf32_length(const char *src, size_t src_len) | ||
773 | { | ||
774 | if (src == NULL || src_len == 0) { | ||
775 | return 0; | ||
776 | } | ||
777 | size_t ret = 0; | ||
778 | const char* cur; | ||
779 | const char* end; | ||
780 | size_t num_to_skip; | ||
781 | for (cur = src, end = src + src_len, num_to_skip = 1; | ||
782 | cur < end; | ||
783 | cur += num_to_skip, ret++) { | ||
784 | const char first_char = *cur; | ||
785 | num_to_skip = 1; | ||
786 | if ((first_char & 0x80) == 0) { // ASCII | ||
787 | continue; | ||
788 | } | ||
789 | int32_t mask; | ||
790 | |||
791 | for (mask = 0x40; (first_char & mask); num_to_skip++, mask >>= 1) { | ||
792 | } | ||
793 | } | ||
794 | return ret; | ||
795 | } | ||
796 | |||
797 | size_t utf8_length_from_utf32(const char32_t *src, size_t src_len) | ||
798 | { | ||
799 | if (src == NULL || src_len == 0) { | ||
800 | return 0; | ||
801 | } | ||
802 | size_t ret = 0; | ||
803 | const char32_t *end = src + src_len; | ||
804 | while (src < end) { | ||
805 | ret += android::utf32_to_utf8_bytes(*src++); | ||
806 | } | ||
807 | return ret; | ||
808 | } | ||
809 | |||
810 | size_t utf8_length_from_utf16(const char16_t *src, size_t src_len) | ||
811 | { | ||
812 | if (src == NULL || src_len == 0) { | ||
813 | return 0; | ||
814 | } | ||
815 | size_t ret = 0; | ||
816 | const char16_t* const end = src + src_len; | ||
817 | while (src < end) { | ||
818 | if ((*src & 0xFC00) == 0xD800 && (src + 1) < end | ||
819 | && (*++src & 0xFC00) == 0xDC00) { | ||
820 | // surrogate pairs are always 4 bytes. | ||
821 | ret += 4; | ||
822 | src++; | ||
823 | } else { | ||
824 | ret += android::utf32_to_utf8_bytes((char32_t) *src++); | ||
825 | } | ||
826 | } | ||
827 | return ret; | ||
828 | } | ||
829 | |||
830 | static int32_t utf32_at_internal(const char* cur, size_t *num_read) | ||
831 | { | ||
832 | const char first_char = *cur; | ||
833 | if ((first_char & 0x80) == 0) { // ASCII | ||
834 | *num_read = 1; | ||
835 | return *cur; | ||
836 | } | ||
837 | cur++; | ||
838 | char32_t mask, to_ignore_mask; | ||
839 | size_t num_to_read = 0; | ||
840 | char32_t utf32 = first_char; | ||
841 | for (num_to_read = 1, mask = 0x40, to_ignore_mask = 0xFFFFFF80; | ||
842 | (first_char & mask); | ||
843 | num_to_read++, to_ignore_mask |= mask, mask >>= 1) { | ||
844 | // 0x3F == 00111111 | ||
845 | utf32 = (utf32 << 6) + (*cur++ & 0x3F); | ||
846 | } | ||
847 | to_ignore_mask |= mask; | ||
848 | utf32 &= ~(to_ignore_mask << (6 * (num_to_read - 1))); | ||
849 | |||
850 | *num_read = num_to_read; | ||
851 | return static_cast<int32_t>(utf32); | ||
852 | } | ||
853 | |||
854 | int32_t utf32_at(const char *src, size_t src_len, | ||
855 | size_t index, size_t *next_index) | ||
856 | { | ||
857 | if (index >= src_len) { | ||
858 | return -1; | ||
859 | } | ||
860 | size_t dummy_index; | ||
861 | if (next_index == NULL) { | ||
862 | next_index = &dummy_index; | ||
863 | } | ||
864 | size_t num_read; | ||
865 | int32_t ret = utf32_at_internal(src + index, &num_read); | ||
866 | if (ret >= 0) { | ||
867 | *next_index = index + num_read; | ||
868 | } | ||
869 | |||
870 | return ret; | ||
871 | } | ||
872 | |||
873 | size_t utf8_to_utf32(const char* src, size_t src_len, | ||
874 | char32_t* dst, size_t dst_len) | ||
875 | { | ||
876 | if (src == NULL || src_len == 0 || dst == NULL || dst_len == 0) { | ||
877 | return 0; | ||
878 | } | ||
879 | |||
880 | const char* cur = src; | ||
881 | const char* end = src + src_len; | ||
882 | char32_t* cur_utf32 = dst; | ||
883 | const char32_t* end_utf32 = dst + dst_len; | ||
884 | while (cur_utf32 < end_utf32 && cur < end) { | ||
885 | size_t num_read; | ||
886 | *cur_utf32++ = | ||
887 | static_cast<char32_t>(utf32_at_internal(cur, &num_read)); | ||
888 | cur += num_read; | ||
889 | } | ||
890 | if (cur_utf32 < end_utf32) { | ||
891 | *cur_utf32 = 0; | ||
892 | } | ||
893 | return static_cast<size_t>(cur_utf32 - dst); | ||
894 | } | ||
895 | |||
896 | size_t utf32_to_utf8(const char32_t* src, size_t src_len, | ||
897 | char* dst, size_t dst_len) | ||
898 | { | ||
899 | if (src == NULL || src_len == 0 || dst == NULL || dst_len == 0) { | ||
900 | return 0; | ||
901 | } | ||
902 | const char32_t *cur_utf32 = src; | ||
903 | const char32_t *end_utf32 = src + src_len; | ||
904 | char *cur = dst; | ||
905 | const char *end = dst + dst_len; | ||
906 | while (cur_utf32 < end_utf32 && cur < end) { | ||
907 | size_t len = android::utf32_to_utf8_bytes(*cur_utf32); | ||
908 | android::utf32_to_utf8((uint8_t *)cur, *cur_utf32++, len); | ||
909 | cur += len; | ||
910 | } | ||
911 | if (cur < end) { | ||
912 | *cur = '\0'; | ||
913 | } | ||
914 | return cur - dst; | ||
915 | } | ||
916 | |||
917 | size_t utf16_to_utf8(const char16_t* src, size_t src_len, | ||
918 | char* dst, size_t dst_len) | ||
919 | { | ||
920 | if (src == NULL || src_len == 0 || dst == NULL || dst_len == 0) { | ||
921 | return 0; | ||
922 | } | ||
923 | const char16_t* cur_utf16 = src; | ||
924 | const char16_t* const end_utf16 = src + src_len; | ||
925 | char *cur = dst; | ||
926 | const char* const end = dst + dst_len; | ||
927 | while (cur_utf16 < end_utf16 && cur < end) { | ||
928 | char32_t utf32; | ||
929 | // surrogate pairs | ||
930 | if ((*cur_utf16 & 0xFC00) == 0xD800 && (cur_utf16 + 1) < end_utf16) { | ||
931 | utf32 = (*cur_utf16++ - 0xD800) << 10; | ||
932 | utf32 |= *cur_utf16++ - 0xDC00; | ||
933 | utf32 += 0x10000; | ||
934 | } else { | ||
935 | utf32 = (char32_t) *cur_utf16++; | ||
936 | } | ||
937 | size_t len = android::utf32_to_utf8_bytes(utf32); | ||
938 | android::utf32_to_utf8((uint8_t*)cur, utf32, len); | ||
939 | cur += len; | ||
940 | } | ||
941 | if (cur < end) { | ||
942 | *cur = '\0'; | ||
943 | } | ||
944 | return cur - dst; | ||
945 | } | ||