summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Kenny Root, 2010-11-09 16:37:23 -0600
committer: Alex Ray, 2013-07-30 15:56:55 -0500
commit: ba0165bef09729a33ab8e0ca329342be05e0d859 (patch)
tree: 2f961146c3c8c625a01902207485fed0d7a2cdcd /libs/utils/String8.cpp
parent: 3a91fca00c6b3db62b4dc0da95ba30671caf3283 (diff)
download:
platform-system-core-ba0165bef09729a33ab8e0ca329342be05e0d859.tar.gz
platform-system-core-ba0165bef09729a33ab8e0ca329342be05e0d859.tar.xz
platform-system-core-ba0165bef09729a33ab8e0ca329342be05e0d859.zip
Split UTF functions from String8/16
Split out all the UTF-8/16/32 handling code from String8/16 to its own file to allow better reuse of code.

Change-Id: If9ce63920edc75472c38da4adce0d13cda9ad2f7
Diffstat (limited to 'libs/utils/String8.cpp')
-rw-r--r-- libs/utils/String8.cpp 391
1 files changed, 28 insertions, 363 deletions
diff --git a/libs/utils/String8.cpp b/libs/utils/String8.cpp
index 6358fc424..c8dc0838d 100644
--- a/libs/utils/String8.cpp
+++ b/libs/utils/String8.cpp
@@ -17,6 +17,8 @@
17#include <utils/String8.h> 17#include <utils/String8.h>
18 18
19#include <utils/Log.h> 19#include <utils/Log.h>
20#include <utils/Unicode.h>
21#include <utils/SharedBuffer.h>
20#include <utils/String16.h> 22#include <utils/String16.h>
21#include <utils/TextOutput.h> 23#include <utils/TextOutput.h>
22#include <utils/threads.h> 24#include <utils/threads.h>
@@ -34,94 +36,10 @@
34 36
35namespace android { 37namespace android {
36 38
37static const char32_t kByteMask = 0x000000BF;
38static const char32_t kByteMark = 0x00000080;
39
40// Surrogates aren't valid for UTF-32 characters, so define some
41// constants that will let us screen them out.
42static const char32_t kUnicodeSurrogateHighStart = 0x0000D800;
43static const char32_t kUnicodeSurrogateHighEnd = 0x0000DBFF;
44static const char32_t kUnicodeSurrogateLowStart = 0x0000DC00;
45static const char32_t kUnicodeSurrogateLowEnd = 0x0000DFFF;
46static const char32_t kUnicodeSurrogateStart = kUnicodeSurrogateHighStart;
47static const char32_t kUnicodeSurrogateEnd = kUnicodeSurrogateLowEnd;
48static const char32_t kUnicodeMaxCodepoint = 0x0010FFFF;
49
50// Mask used to set appropriate bits in first byte of UTF-8 sequence,
51// indexed by number of bytes in the sequence.
52// 0xxxxxxx
53// -> (00-7f) 7bit. Bit mask for the first byte is 0x00000000
54// 110yyyyx 10xxxxxx
55// -> (c0-df)(80-bf) 11bit. Bit mask is 0x000000C0
56// 1110yyyy 10yxxxxx 10xxxxxx
57// -> (e0-ef)(80-bf)(80-bf) 16bit. Bit mask is 0x000000E0
58// 11110yyy 10yyxxxx 10xxxxxx 10xxxxxx
59// -> (f0-f7)(80-bf)(80-bf)(80-bf) 21bit. Bit mask is 0x000000F0
60static const char32_t kFirstByteMark[] = {
61 0x00000000, 0x00000000, 0x000000C0, 0x000000E0, 0x000000F0
62};
63
64// Separator used by resource paths. This is not platform dependent contrary 39// Separator used by resource paths. This is not platform dependent contrary
65// to OS_PATH_SEPARATOR. 40// to OS_PATH_SEPARATOR.
66#define RES_PATH_SEPARATOR '/' 41#define RES_PATH_SEPARATOR '/'
67 42
68// Return number of utf8 bytes required for the character.
69static size_t utf32_to_utf8_bytes(char32_t srcChar)
70{
71 size_t bytesToWrite;
72
73 // Figure out how many bytes the result will require.
74 if (srcChar < 0x00000080)
75 {
76 bytesToWrite = 1;
77 }
78 else if (srcChar < 0x00000800)
79 {
80 bytesToWrite = 2;
81 }
82 else if (srcChar < 0x00010000)
83 {
84 if ((srcChar < kUnicodeSurrogateStart)
85 || (srcChar > kUnicodeSurrogateEnd))
86 {
87 bytesToWrite = 3;
88 }
89 else
90 {
91 // Surrogates are invalid UTF-32 characters.
92 return 0;
93 }
94 }
95 // Max code point for Unicode is 0x0010FFFF.
96 else if (srcChar <= kUnicodeMaxCodepoint)
97 {
98 bytesToWrite = 4;
99 }
100 else
101 {
102 // Invalid UTF-32 character.
103 return 0;
104 }
105
106 return bytesToWrite;
107}
108
109// Write out the source character to <dstP>.
110
111static void utf32_to_utf8(uint8_t* dstP, char32_t srcChar, size_t bytes)
112{
113 dstP += bytes;
114 switch (bytes)
115 { /* note: everything falls through. */
116 case 4: *--dstP = (uint8_t)((srcChar | kByteMark) & kByteMask); srcChar >>= 6;
117 case 3: *--dstP = (uint8_t)((srcChar | kByteMark) & kByteMask); srcChar >>= 6;
118 case 2: *--dstP = (uint8_t)((srcChar | kByteMark) & kByteMask); srcChar >>= 6;
119 case 1: *--dstP = (uint8_t)(srcChar | kFirstByteMark[bytes]);
120 }
121}
122
123// ---------------------------------------------------------------------------
124
125static SharedBuffer* gEmptyStringBuf = NULL; 43static SharedBuffer* gEmptyStringBuf = NULL;
126static char* gEmptyString = NULL; 44static char* gEmptyString = NULL;
127 45
@@ -175,62 +93,47 @@ static char* allocFromUTF8(const char* in, size_t len)
175 return getEmptyString(); 93 return getEmptyString();
176} 94}
177 95
178template<typename T, typename L> 96static char* allocFromUTF16(const char16_t* in, size_t len)
179static char* allocFromUTF16OrUTF32(const T* in, L len)
180{ 97{
181 if (len == 0) return getEmptyString(); 98 if (len == 0) return getEmptyString();
182 99
183 size_t bytes = 0; 100 const ssize_t bytes = utf16_to_utf8_length(in, len);
184 const T* end = in+len; 101 if (bytes < 0) {
185 const T* p = in; 102 return getEmptyString();
186
187 while (p < end) {
188 bytes += utf32_to_utf8_bytes(*p);
189 p++;
190 } 103 }
191 104
192 SharedBuffer* buf = SharedBuffer::alloc(bytes+1); 105 SharedBuffer* buf = SharedBuffer::alloc(bytes+1);
193 LOG_ASSERT(buf, "Unable to allocate shared buffer"); 106 LOG_ASSERT(buf, "Unable to allocate shared buffer");
194 if (buf) { 107 if (!buf) {
195 p = in; 108 return getEmptyString();
196 char* str = (char*)buf->data();
197 char* d = str;
198 while (p < end) {
199 const T c = *p++;
200 size_t len = utf32_to_utf8_bytes(c);
201 utf32_to_utf8((uint8_t*)d, c, len);
202 d += len;
203 }
204 *d = 0;
205
206 return str;
207 } 109 }
208 110
209 return getEmptyString(); 111 char* str = (char*)buf->data();
112 utf16_to_utf8(in, len, str);
113 return str;
210} 114}
211 115
212static char* allocFromUTF16(const char16_t* in, size_t len) 116static char* allocFromUTF32(const char32_t* in, size_t len)
213{ 117{
214 if (len == 0) return getEmptyString(); 118 if (len == 0) {
119 return getEmptyString();
120 }
215 121
216 const size_t bytes = utf8_length_from_utf16(in, len); 122 const ssize_t bytes = utf32_to_utf8_length(in, len);
123 if (bytes < 0) {
124 return getEmptyString();
125 }
217 126
218 SharedBuffer* buf = SharedBuffer::alloc(bytes+1); 127 SharedBuffer* buf = SharedBuffer::alloc(bytes+1);
219 LOG_ASSERT(buf, "Unable to allocate shared buffer"); 128 LOG_ASSERT(buf, "Unable to allocate shared buffer");
220 if (buf) { 129 if (!buf) {
221 char* str = (char*)buf->data(); 130 return getEmptyString();
222
223 utf16_to_utf8(in, len, str, bytes+1);
224
225 return str;
226 } 131 }
227 132
228 return getEmptyString(); 133 char* str = (char*) buf->data();
229} 134 utf32_to_utf8(in, len, str);
230 135
231static char* allocFromUTF32(const char32_t* in, size_t len) 136 return str;
232{
233 return allocFromUTF16OrUTF32<char32_t, size_t>(in, len);
234} 137}
235 138
236// --------------------------------------------------------------------------- 139// ---------------------------------------------------------------------------
@@ -510,17 +413,17 @@ void String8::toUpper(size_t start, size_t length)
510 413
511size_t String8::getUtf32Length() const 414size_t String8::getUtf32Length() const
512{ 415{
513 return utf32_length(mString, length()); 416 return utf8_to_utf32_length(mString, length());
514} 417}
515 418
516int32_t String8::getUtf32At(size_t index, size_t *next_index) const 419int32_t String8::getUtf32At(size_t index, size_t *next_index) const
517{ 420{
518 return utf32_at(mString, length(), index, next_index); 421 return utf32_from_utf8_at(mString, length(), index, next_index);
519} 422}
520 423
521size_t String8::getUtf32(char32_t* dst, size_t dst_len) const 424void String8::getUtf32(char32_t* dst) const
522{ 425{
523 return utf8_to_utf32(mString, length(), dst, dst_len); 426 utf8_to_utf32(mString, length(), dst);
524} 427}
525 428
526TextOutput& operator<<(TextOutput& to, const String8& val) 429TextOutput& operator<<(TextOutput& to, const String8& val)
@@ -705,241 +608,3 @@ String8& String8::convertToResPath()
705} 608}
706 609
707}; // namespace android 610}; // namespace android
708
709// ---------------------------------------------------------------------------
710
711size_t strlen32(const char32_t *s)
712{
713 const char32_t *ss = s;
714 while ( *ss )
715 ss++;
716 return ss-s;
717}
718
719size_t strnlen32(const char32_t *s, size_t maxlen)
720{
721 const char32_t *ss = s;
722 while ((maxlen > 0) && *ss) {
723 ss++;
724 maxlen--;
725 }
726 return ss-s;
727}
728
729size_t utf8_length(const char *src)
730{
731 const char *cur = src;
732 size_t ret = 0;
733 while (*cur != '\0') {
734 const char first_char = *cur++;
735 if ((first_char & 0x80) == 0) { // ASCII
736 ret += 1;
737 continue;
738 }
739 // (UTF-8's character must not be like 10xxxxxx,
740 // but 110xxxxx, 1110xxxx, ... or 1111110x)
741 if ((first_char & 0x40) == 0) {
742 return 0;
743 }
744
745 int32_t mask, to_ignore_mask;
746 size_t num_to_read = 0;
747 char32_t utf32 = 0;
748 for (num_to_read = 1, mask = 0x40, to_ignore_mask = 0x80;
749 num_to_read < 5 && (first_char & mask);
750 num_to_read++, to_ignore_mask |= mask, mask >>= 1) {
751 if ((*cur & 0xC0) != 0x80) { // must be 10xxxxxx
752 return 0;
753 }
754 // 0x3F == 00111111
755 utf32 = (utf32 << 6) + (*cur++ & 0x3F);
756 }
757 // "first_char" must be (110xxxxx - 11110xxx)
758 if (num_to_read == 5) {
759 return 0;
760 }
761 to_ignore_mask |= mask;
762 utf32 |= ((~to_ignore_mask) & first_char) << (6 * (num_to_read - 1));
763 if (utf32 > android::kUnicodeMaxCodepoint) {
764 return 0;
765 }
766
767 ret += num_to_read;
768 }
769 return ret;
770}
771
772size_t utf32_length(const char *src, size_t src_len)
773{
774 if (src == NULL || src_len == 0) {
775 return 0;
776 }
777 size_t ret = 0;
778 const char* cur;
779 const char* end;
780 size_t num_to_skip;
781 for (cur = src, end = src + src_len, num_to_skip = 1;
782 cur < end;
783 cur += num_to_skip, ret++) {
784 const char first_char = *cur;
785 num_to_skip = 1;
786 if ((first_char & 0x80) == 0) { // ASCII
787 continue;
788 }
789 int32_t mask;
790
791 for (mask = 0x40; (first_char & mask); num_to_skip++, mask >>= 1) {
792 }
793 }
794 return ret;
795}
796
797size_t utf8_length_from_utf32(const char32_t *src, size_t src_len)
798{
799 if (src == NULL || src_len == 0) {
800 return 0;
801 }
802 size_t ret = 0;
803 const char32_t *end = src + src_len;
804 while (src < end) {
805 ret += android::utf32_to_utf8_bytes(*src++);
806 }
807 return ret;
808}
809
810size_t utf8_length_from_utf16(const char16_t *src, size_t src_len)
811{
812 if (src == NULL || src_len == 0) {
813 return 0;
814 }
815 size_t ret = 0;
816 const char16_t* const end = src + src_len;
817 while (src < end) {
818 if ((*src & 0xFC00) == 0xD800 && (src + 1) < end
819 && (*++src & 0xFC00) == 0xDC00) {
820 // surrogate pairs are always 4 bytes.
821 ret += 4;
822 src++;
823 } else {
824 ret += android::utf32_to_utf8_bytes((char32_t) *src++);
825 }
826 }
827 return ret;
828}
829
830static int32_t utf32_at_internal(const char* cur, size_t *num_read)
831{
832 const char first_char = *cur;
833 if ((first_char & 0x80) == 0) { // ASCII
834 *num_read = 1;
835 return *cur;
836 }
837 cur++;
838 char32_t mask, to_ignore_mask;
839 size_t num_to_read = 0;
840 char32_t utf32 = first_char;
841 for (num_to_read = 1, mask = 0x40, to_ignore_mask = 0xFFFFFF80;
842 (first_char & mask);
843 num_to_read++, to_ignore_mask |= mask, mask >>= 1) {
844 // 0x3F == 00111111
845 utf32 = (utf32 << 6) + (*cur++ & 0x3F);
846 }
847 to_ignore_mask |= mask;
848 utf32 &= ~(to_ignore_mask << (6 * (num_to_read - 1)));
849
850 *num_read = num_to_read;
851 return static_cast<int32_t>(utf32);
852}
853
854int32_t utf32_at(const char *src, size_t src_len,
855 size_t index, size_t *next_index)
856{
857 if (index >= src_len) {
858 return -1;
859 }
860 size_t dummy_index;
861 if (next_index == NULL) {
862 next_index = &dummy_index;
863 }
864 size_t num_read;
865 int32_t ret = utf32_at_internal(src + index, &num_read);
866 if (ret >= 0) {
867 *next_index = index + num_read;
868 }
869
870 return ret;
871}
872
873size_t utf8_to_utf32(const char* src, size_t src_len,
874 char32_t* dst, size_t dst_len)
875{
876 if (src == NULL || src_len == 0 || dst == NULL || dst_len == 0) {
877 return 0;
878 }
879
880 const char* cur = src;
881 const char* end = src + src_len;
882 char32_t* cur_utf32 = dst;
883 const char32_t* end_utf32 = dst + dst_len;
884 while (cur_utf32 < end_utf32 && cur < end) {
885 size_t num_read;
886 *cur_utf32++ =
887 static_cast<char32_t>(utf32_at_internal(cur, &num_read));
888 cur += num_read;
889 }
890 if (cur_utf32 < end_utf32) {
891 *cur_utf32 = 0;
892 }
893 return static_cast<size_t>(cur_utf32 - dst);
894}
895
896size_t utf32_to_utf8(const char32_t* src, size_t src_len,
897 char* dst, size_t dst_len)
898{
899 if (src == NULL || src_len == 0 || dst == NULL || dst_len == 0) {
900 return 0;
901 }
902 const char32_t *cur_utf32 = src;
903 const char32_t *end_utf32 = src + src_len;
904 char *cur = dst;
905 const char *end = dst + dst_len;
906 while (cur_utf32 < end_utf32 && cur < end) {
907 size_t len = android::utf32_to_utf8_bytes(*cur_utf32);
908 android::utf32_to_utf8((uint8_t *)cur, *cur_utf32++, len);
909 cur += len;
910 }
911 if (cur < end) {
912 *cur = '\0';
913 }
914 return cur - dst;
915}
916
917size_t utf16_to_utf8(const char16_t* src, size_t src_len,
918 char* dst, size_t dst_len)
919{
920 if (src == NULL || src_len == 0 || dst == NULL || dst_len == 0) {
921 return 0;
922 }
923 const char16_t* cur_utf16 = src;
924 const char16_t* const end_utf16 = src + src_len;
925 char *cur = dst;
926 const char* const end = dst + dst_len;
927 while (cur_utf16 < end_utf16 && cur < end) {
928 char32_t utf32;
929 // surrogate pairs
930 if ((*cur_utf16 & 0xFC00) == 0xD800 && (cur_utf16 + 1) < end_utf16) {
931 utf32 = (*cur_utf16++ - 0xD800) << 10;
932 utf32 |= *cur_utf16++ - 0xDC00;
933 utf32 += 0x10000;
934 } else {
935 utf32 = (char32_t) *cur_utf16++;
936 }
937 size_t len = android::utf32_to_utf8_bytes(utf32);
938 android::utf32_to_utf8((uint8_t*)cur, utf32, len);
939 cur += len;
940 }
941 if (cur < end) {
942 *cur = '\0';
943 }
944 return cur - dst;
945}