diff options
-rw-r--r-- | libcutils/Android.mk | 7 | ||||
-rw-r--r-- | libcutils/arch-arm/memset32.S (renamed from libcutils/memset32.S) | 0 | ||||
-rw-r--r-- | libcutils/arch-x86/android_memset16.S | 32 | ||||
-rw-r--r-- | libcutils/arch-x86/android_memset32.S | 33 | ||||
-rw-r--r-- | libcutils/arch-x86/cache_wrapper.S | 24 | ||||
-rw-r--r-- | libcutils/arch-x86/sse2-memset16-atom.S | 722 | ||||
-rw-r--r-- | libcutils/arch-x86/sse2-memset32-atom.S | 513 | ||||
-rw-r--r-- | libcutils/memory.c | 4 |
8 files changed, 1334 insertions, 1 deletions
diff --git a/libcutils/Android.mk b/libcutils/Android.mk index e8c777541..3dc3d694a 100644 --- a/libcutils/Android.mk +++ b/libcutils/Android.mk | |||
@@ -112,12 +112,17 @@ LOCAL_MODULE := libcutils | |||
112 | LOCAL_SRC_FILES := $(commonSources) ashmem-dev.c mq.c | 112 | LOCAL_SRC_FILES := $(commonSources) ashmem-dev.c mq.c |
113 | 113 | ||
114 | ifeq ($(TARGET_ARCH),arm) | 114 | ifeq ($(TARGET_ARCH),arm) |
115 | LOCAL_SRC_FILES += memset32.S | 115 | LOCAL_SRC_FILES += arch-arm/memset32.S |
116 | else # !arm | 116 | else # !arm |
117 | ifeq ($(TARGET_ARCH),sh) | 117 | ifeq ($(TARGET_ARCH),sh) |
118 | LOCAL_SRC_FILES += memory.c atomic-android-sh.c | 118 | LOCAL_SRC_FILES += memory.c atomic-android-sh.c |
119 | else # !sh | 119 | else # !sh |
120 | ifeq ($(TARGET_ARCH_VARIANT),x86-atom) | ||
121 | LOCAL_CFLAGS += -DHAVE_MEMSET16 -DHAVE_MEMSET32 | ||
122 | LOCAL_SRC_FILES += arch-x86/android_memset16.S arch-x86/android_memset32.S memory.c | ||
123 | else # !x86-atom | ||
120 | LOCAL_SRC_FILES += memory.c | 124 | LOCAL_SRC_FILES += memory.c |
125 | endif # !x86-atom | ||
121 | endif # !sh | 126 | endif # !sh |
122 | endif # !arm | 127 | endif # !arm |
123 | 128 | ||
diff --git a/libcutils/memset32.S b/libcutils/arch-arm/memset32.S index 469726563..469726563 100644 --- a/libcutils/memset32.S +++ b/libcutils/arch-arm/memset32.S | |||
diff --git a/libcutils/arch-x86/android_memset16.S b/libcutils/arch-x86/android_memset16.S new file mode 100644 index 000000000..b1f09cba3 --- /dev/null +++ b/libcutils/arch-x86/android_memset16.S | |||
@@ -0,0 +1,32 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2010 The Android Open Source Project | ||
3 | * | ||
4 | * Licensed under the Apache License, Version 2.0 (the "License"); | ||
5 | * you may not use this file except in compliance with the License. | ||
6 | * You may obtain a copy of the License at | ||
7 | * | ||
8 | * http://www.apache.org/licenses/LICENSE-2.0 | ||
9 | * | ||
10 | * Unless required by applicable law or agreed to in writing, software | ||
11 | * distributed under the License is distributed on an "AS IS" BASIS, | ||
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
13 | * See the License for the specific language governing permissions and | ||
14 | * limitations under the License. | ||
15 | */ | ||
16 | /* | ||
17 | * Contributed by: Intel Corporation | ||
18 | */ | ||
19 | |||
20 | #if defined(USE_SSE2) /* SSE2 build: emit the Atom-tuned routine under the name android_memset16 */ | ||
21 | |||
22 | # include "cache_wrapper.S" /* supplies SHARED_CACHE_SIZE and DATA_CACHE_SIZE */ | ||
23 | # undef __i686 /* presumably keeps the __i686.get_pc_thunk.bx symbol from being macro-expanded -- confirm */ | ||
24 | # define USE_AS_ANDROID /* makes the included routine halve its length argument on entry */ | ||
25 | # define sse2_memset16_atom android_memset16 /* rename the symbol passed to ENTRY/END */ | ||
26 | # include "sse2-memset16-atom.S" | ||
27 | |||
28 | #else | ||
29 | |||
30 | # include "memset16.S" /* NOTE(review): no memset16.S is added alongside this file -- confirm the fallback exists */ | ||
31 | |||
32 | #endif | ||
diff --git a/libcutils/arch-x86/android_memset32.S b/libcutils/arch-x86/android_memset32.S new file mode 100644 index 000000000..1fb2ffefe --- /dev/null +++ b/libcutils/arch-x86/android_memset32.S | |||
@@ -0,0 +1,33 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2010 The Android Open Source Project | ||
3 | * | ||
4 | * Licensed under the Apache License, Version 2.0 (the "License"); | ||
5 | * you may not use this file except in compliance with the License. | ||
6 | * You may obtain a copy of the License at | ||
7 | * | ||
8 | * http://www.apache.org/licenses/LICENSE-2.0 | ||
9 | * | ||
10 | * Unless required by applicable law or agreed to in writing, software | ||
11 | * distributed under the License is distributed on an "AS IS" BASIS, | ||
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
13 | * See the License for the specific language governing permissions and | ||
14 | * limitations under the License. | ||
15 | */ | ||
16 | /* | ||
17 | * Contributed by: Intel Corporation | ||
18 | */ | ||
19 | |||
20 | #if defined(USE_SSE2) /* SSE2 build: emit the Atom-tuned routine under the name android_memset32 */ | ||
21 | |||
22 | # include "cache_wrapper.S" /* supplies SHARED_CACHE_SIZE and DATA_CACHE_SIZE */ | ||
23 | # undef __i686 /* presumably keeps the __i686.get_pc_thunk.bx symbol from being macro-expanded -- confirm */ | ||
24 | # define USE_AS_ANDROID /* flag presumably tested by the included implementation -- confirm */ | ||
25 | # define sse2_memset32_atom android_memset32 /* rename the symbol defined by the included file */ | ||
26 | # include "sse2-memset32-atom.S" | ||
27 | |||
28 | #else | ||
29 | |||
30 | # include "memset32.S" /* NOTE(review): no memset32.S is added alongside this file -- confirm the fallback exists */ | ||
31 | |||
32 | #endif | ||
33 | |||
diff --git a/libcutils/arch-x86/cache_wrapper.S b/libcutils/arch-x86/cache_wrapper.S new file mode 100644 index 000000000..508fdd3e2 --- /dev/null +++ b/libcutils/arch-x86/cache_wrapper.S | |||
@@ -0,0 +1,24 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2010 The Android Open Source Project | ||
3 | * | ||
4 | * Licensed under the Apache License, Version 2.0 (the "License"); | ||
5 | * you may not use this file except in compliance with the License. | ||
6 | * You may obtain a copy of the License at | ||
7 | * | ||
8 | * http://www.apache.org/licenses/LICENSE-2.0 | ||
9 | * | ||
10 | * Unless required by applicable law or agreed to in writing, software | ||
11 | * distributed under the License is distributed on an "AS IS" BASIS, | ||
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
13 | * See the License for the specific language governing permissions and | ||
14 | * limitations under the License. | ||
15 | */ | ||
16 | /* | ||
17 | * Contributed by: Intel Corporation | ||
18 | */ | ||
19 | |||
20 | /* Values are optimized for Atom; all sizes are in bytes. */ | ||
21 | #define SHARED_CACHE_SIZE (512*1024) /* Atom L2 Cache; sse2-memset16-atom.S takes its non-temporal path at or above this length */ | ||
22 | #define DATA_CACHE_SIZE (24*1024) /* Atom L1 Data Cache; sse2-memset16-atom.S takes its prefetching loop at or above this length */ | ||
23 | #define SHARED_CACHE_SIZE_HALF (SHARED_CACHE_SIZE / 2) /* 256 KiB; not referenced in sse2-memset16-atom.S */ | ||
24 | #define DATA_CACHE_SIZE_HALF (DATA_CACHE_SIZE / 2) /* 12 KiB; not referenced in sse2-memset16-atom.S */ | ||
diff --git a/libcutils/arch-x86/sse2-memset16-atom.S b/libcutils/arch-x86/sse2-memset16-atom.S new file mode 100644 index 000000000..cafec8287 --- /dev/null +++ b/libcutils/arch-x86/sse2-memset16-atom.S | |||
@@ -0,0 +1,722 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2010 The Android Open Source Project | ||
3 | * | ||
4 | * Licensed under the Apache License, Version 2.0 (the "License"); | ||
5 | * you may not use this file except in compliance with the License. | ||
6 | * You may obtain a copy of the License at | ||
7 | * | ||
8 | * http://www.apache.org/licenses/LICENSE-2.0 | ||
9 | * | ||
10 | * Unless required by applicable law or agreed to in writing, software | ||
11 | * distributed under the License is distributed on an "AS IS" BASIS, | ||
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
13 | * See the License for the specific language governing permissions and | ||
14 | * limitations under the License. | ||
15 | */ | ||
16 | /* | ||
17 | * Contributed by: Intel Corporation | ||
18 | */ | ||
19 | |||
20 | #ifndef L | ||
21 | # define L(label) .L##label | ||
22 | #endif | ||
23 | |||
24 | #ifndef ALIGN | ||
25 | # define ALIGN(n) .p2align n | ||
26 | #endif | ||
27 | |||
28 | #ifndef cfi_startproc | ||
29 | # define cfi_startproc .cfi_startproc | ||
30 | #endif | ||
31 | |||
32 | #ifndef cfi_endproc | ||
33 | # define cfi_endproc .cfi_endproc | ||
34 | #endif | ||
35 | |||
36 | #ifndef cfi_rel_offset | ||
37 | # define cfi_rel_offset(reg, off) .cfi_rel_offset reg, off | ||
38 | #endif | ||
39 | |||
40 | #ifndef cfi_restore | ||
41 | # define cfi_restore(reg) .cfi_restore reg | ||
42 | #endif | ||
43 | |||
44 | #ifndef cfi_adjust_cfa_offset | ||
45 | # define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off | ||
46 | #endif | ||
47 | |||
48 | #ifndef ENTRY | ||
49 | # define ENTRY(name) \ | ||
50 | .type name, @function; \ | ||
51 | .globl name; \ | ||
52 | .p2align 4; \ | ||
53 | name: \ | ||
54 | cfi_startproc | ||
55 | #endif | ||
56 | |||
57 | #ifndef END | ||
58 | # define END(name) \ | ||
59 | cfi_endproc; \ | ||
60 | .size name, .-name | ||
61 | #endif | ||
62 | |||
63 | #define CFI_PUSH(REG) \ | ||
64 | cfi_adjust_cfa_offset (4); \ | ||
65 | cfi_rel_offset (REG, 0) | ||
66 | |||
67 | #define CFI_POP(REG) \ | ||
68 | cfi_adjust_cfa_offset (-4); \ | ||
69 | cfi_restore (REG) | ||
70 | |||
71 | #define PUSH(REG) pushl REG; CFI_PUSH (REG) | ||
72 | #define POP(REG) popl REG; CFI_POP (REG) | ||
73 | |||
74 | #ifdef USE_AS_BZERO16 /* bzero16 variant: arguments are (dest, len), no fill value */ | ||
75 | # define DEST PARMS /* ESP-relative offset of the destination pointer */ | ||
76 | # define LEN DEST+4 /* length follows the destination */ | ||
77 | #else /* memset16 variant: arguments are (dest, value, len) */ | ||
78 | # define DEST PARMS /* ESP-relative offset of the destination pointer */ | ||
79 | # define CHR DEST+4 /* fill value follows the destination */ | ||
80 | # define LEN CHR+4 /* length follows the fill value */ | ||
81 | #endif | ||
82 | |||
83 | #if 1 /* hard-wired: the return-value load below is compiled out */ | ||
84 | # define SETRTNVAL /* expands to nothing, so EAX is not loaded with a return value */ | ||
85 | #else | ||
86 | # define SETRTNVAL movl DEST(%esp), %eax /* disabled alternative: return the destination pointer in EAX */ | ||
87 | #endif | ||
88 | |||
89 | #ifdef SHARED | ||
90 | # define ENTRANCE PUSH (%ebx); | ||
91 | # define RETURN_END POP (%ebx); ret | ||
92 | # define RETURN RETURN_END; CFI_PUSH (%ebx) | ||
93 | # define PARMS 8 /* Preserve EBX. */ | ||
94 | # define JMPTBL(I, B) I - B | ||
95 | |||
96 | /* Load an entry in a jump table into EBX and branch to it. TABLE is a | ||
97 | jump table with relative offsets. */ | ||
98 | # define BRANCH_TO_JMPTBL_ENTRY(TABLE) \ | ||
99 | /* We first load PC into EBX. */ \ | ||
100 | call __i686.get_pc_thunk.bx; \ | ||
101 | /* Get the address of the jump table. */ \ | ||
102 | add $(TABLE - .), %ebx; \ | ||
103 | /* Get the entry and convert the relative offset to the \ | ||
104 | absolute address. */ \ | ||
105 | add (%ebx,%ecx,4), %ebx; \ | ||
106 | /* We loaded the jump table and adjusted EDX. Go. */ \ | ||
107 | jmp *%ebx | ||
108 | |||
109 | .section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits | ||
110 | .globl __i686.get_pc_thunk.bx | ||
111 | .hidden __i686.get_pc_thunk.bx | ||
112 | ALIGN (4) | ||
113 | .type __i686.get_pc_thunk.bx,@function | ||
114 | __i686.get_pc_thunk.bx: | ||
115 | movl (%esp), %ebx | ||
116 | ret | ||
117 | #else | ||
118 | # define ENTRANCE | ||
119 | # define RETURN_END ret | ||
120 | # define RETURN RETURN_END | ||
121 | # define PARMS 4 | ||
122 | # define JMPTBL(I, B) I | ||
123 | |||
124 | /* Branch to an entry in a jump table. TABLE is a jump table with | ||
125 | absolute offsets. */ | ||
126 | # define BRANCH_TO_JMPTBL_ENTRY(TABLE) \ | ||
127 | jmp *TABLE(,%ecx,4) | ||
128 | #endif | ||
129 | |||
130 | .section .text.sse2,"ax",@progbits | ||
131 | ALIGN (4) | ||
132 | ENTRY (sse2_memset16_atom) /* fill with a 16-bit value; stack args are DEST, CHR (absent with USE_AS_BZERO16), LEN */ | ||
133 | ENTRANCE /* pushes EBX in SHARED builds, empty otherwise */ | ||
134 | |||
135 | movl LEN(%esp), %ecx /* ECX = length argument */ | ||
136 | #ifdef USE_AS_ANDROID | ||
137 | shr $1, %ecx /* halve it: the code below uses ECX as a count of 16-bit words (presumably Android callers pass bytes -- confirm) */ | ||
138 | #endif | ||
139 | #ifdef USE_AS_BZERO16 | ||
140 | xor %eax, %eax /* fill value is zero */ | ||
141 | #else | ||
142 | movzwl CHR(%esp), %eax /* EAX = 16-bit fill value, zero-extended */ | ||
143 | mov %eax, %edx | ||
144 | shl $16, %eax | ||
145 | or %edx, %eax /* EAX = fill value replicated in both 16-bit halves */ | ||
146 | #endif | ||
147 | movl DEST(%esp), %edx /* EDX = destination pointer */ | ||
148 | cmp $32, %ecx | ||
149 | jae L(32wordsormore) /* 32 or more words: take the SSE2 path */ | ||
150 | |||
151 | L(write_less32words): | ||
152 | lea (%edx, %ecx, 2), %edx /* EDX = end of the region; the write_Nwords entries store at negative offsets from it */ | ||
153 | BRANCH_TO_JMPTBL_ENTRY (L(table_less32words)) /* dispatch on ECX (0..31 words) */ | ||
154 | |||
155 | |||
156 | .pushsection .rodata.sse2,"a",@progbits | ||
157 | ALIGN (2) | ||
158 | L(table_less32words): | ||
159 | .int JMPTBL (L(write_0words), L(table_less32words)) | ||
160 | .int JMPTBL (L(write_1words), L(table_less32words)) | ||
161 | .int JMPTBL (L(write_2words), L(table_less32words)) | ||
162 | .int JMPTBL (L(write_3words), L(table_less32words)) | ||
163 | .int JMPTBL (L(write_4words), L(table_less32words)) | ||
164 | .int JMPTBL (L(write_5words), L(table_less32words)) | ||
165 | .int JMPTBL (L(write_6words), L(table_less32words)) | ||
166 | .int JMPTBL (L(write_7words), L(table_less32words)) | ||
167 | .int JMPTBL (L(write_8words), L(table_less32words)) | ||
168 | .int JMPTBL (L(write_9words), L(table_less32words)) | ||
169 | .int JMPTBL (L(write_10words), L(table_less32words)) | ||
170 | .int JMPTBL (L(write_11words), L(table_less32words)) | ||
171 | .int JMPTBL (L(write_12words), L(table_less32words)) | ||
172 | .int JMPTBL (L(write_13words), L(table_less32words)) | ||
173 | .int JMPTBL (L(write_14words), L(table_less32words)) | ||
174 | .int JMPTBL (L(write_15words), L(table_less32words)) | ||
175 | .int JMPTBL (L(write_16words), L(table_less32words)) | ||
176 | .int JMPTBL (L(write_17words), L(table_less32words)) | ||
177 | .int JMPTBL (L(write_18words), L(table_less32words)) | ||
178 | .int JMPTBL (L(write_19words), L(table_less32words)) | ||
179 | .int JMPTBL (L(write_20words), L(table_less32words)) | ||
180 | .int JMPTBL (L(write_21words), L(table_less32words)) | ||
181 | .int JMPTBL (L(write_22words), L(table_less32words)) | ||
182 | .int JMPTBL (L(write_23words), L(table_less32words)) | ||
183 | .int JMPTBL (L(write_24words), L(table_less32words)) | ||
184 | .int JMPTBL (L(write_25words), L(table_less32words)) | ||
185 | .int JMPTBL (L(write_26words), L(table_less32words)) | ||
186 | .int JMPTBL (L(write_27words), L(table_less32words)) | ||
187 | .int JMPTBL (L(write_28words), L(table_less32words)) | ||
188 | .int JMPTBL (L(write_29words), L(table_less32words)) | ||
189 | .int JMPTBL (L(write_30words), L(table_less32words)) | ||
190 | .int JMPTBL (L(write_31words), L(table_less32words)) | ||
191 | .popsection | ||
192 | |||
193 | ALIGN (4) | ||
194 | L(write_28words): | ||
195 | movl %eax, -56(%edx) | ||
196 | movl %eax, -52(%edx) | ||
197 | L(write_24words): | ||
198 | movl %eax, -48(%edx) | ||
199 | movl %eax, -44(%edx) | ||
200 | L(write_20words): | ||
201 | movl %eax, -40(%edx) | ||
202 | movl %eax, -36(%edx) | ||
203 | L(write_16words): | ||
204 | movl %eax, -32(%edx) | ||
205 | movl %eax, -28(%edx) | ||
206 | L(write_12words): | ||
207 | movl %eax, -24(%edx) | ||
208 | movl %eax, -20(%edx) | ||
209 | L(write_8words): | ||
210 | movl %eax, -16(%edx) | ||
211 | movl %eax, -12(%edx) | ||
212 | L(write_4words): | ||
213 | movl %eax, -8(%edx) | ||
214 | movl %eax, -4(%edx) | ||
215 | L(write_0words): | ||
216 | SETRTNVAL | ||
217 | RETURN | ||
218 | |||
219 | ALIGN (4) | ||
220 | L(write_29words): | ||
221 | movl %eax, -58(%edx) | ||
222 | movl %eax, -54(%edx) | ||
223 | L(write_25words): | ||
224 | movl %eax, -50(%edx) | ||
225 | movl %eax, -46(%edx) | ||
226 | L(write_21words): | ||
227 | movl %eax, -42(%edx) | ||
228 | movl %eax, -38(%edx) | ||
229 | L(write_17words): | ||
230 | movl %eax, -34(%edx) | ||
231 | movl %eax, -30(%edx) | ||
232 | L(write_13words): | ||
233 | movl %eax, -26(%edx) | ||
234 | movl %eax, -22(%edx) | ||
235 | L(write_9words): | ||
236 | movl %eax, -18(%edx) | ||
237 | movl %eax, -14(%edx) | ||
238 | L(write_5words): | ||
239 | movl %eax, -10(%edx) | ||
240 | movl %eax, -6(%edx) | ||
241 | L(write_1words): | ||
242 | mov %ax, -2(%edx) | ||
243 | SETRTNVAL | ||
244 | RETURN | ||
245 | |||
246 | ALIGN (4) | ||
247 | L(write_30words): | ||
248 | movl %eax, -60(%edx) | ||
249 | movl %eax, -56(%edx) | ||
250 | L(write_26words): | ||
251 | movl %eax, -52(%edx) | ||
252 | movl %eax, -48(%edx) | ||
253 | L(write_22words): | ||
254 | movl %eax, -44(%edx) | ||
255 | movl %eax, -40(%edx) | ||
256 | L(write_18words): | ||
257 | movl %eax, -36(%edx) | ||
258 | movl %eax, -32(%edx) | ||
259 | L(write_14words): | ||
260 | movl %eax, -28(%edx) | ||
261 | movl %eax, -24(%edx) | ||
262 | L(write_10words): | ||
263 | movl %eax, -20(%edx) | ||
264 | movl %eax, -16(%edx) | ||
265 | L(write_6words): | ||
266 | movl %eax, -12(%edx) | ||
267 | movl %eax, -8(%edx) | ||
268 | L(write_2words): | ||
269 | movl %eax, -4(%edx) | ||
270 | SETRTNVAL | ||
271 | RETURN | ||
272 | |||
273 | ALIGN (4) | ||
274 | L(write_31words): | ||
275 | movl %eax, -62(%edx) | ||
276 | movl %eax, -58(%edx) | ||
277 | L(write_27words): | ||
278 | movl %eax, -54(%edx) | ||
279 | movl %eax, -50(%edx) | ||
280 | L(write_23words): | ||
281 | movl %eax, -46(%edx) | ||
282 | movl %eax, -42(%edx) | ||
283 | L(write_19words): | ||
284 | movl %eax, -38(%edx) | ||
285 | movl %eax, -34(%edx) | ||
286 | L(write_15words): | ||
287 | movl %eax, -30(%edx) | ||
288 | movl %eax, -26(%edx) | ||
289 | L(write_11words): | ||
290 | movl %eax, -22(%edx) | ||
291 | movl %eax, -18(%edx) | ||
292 | L(write_7words): | ||
293 | movl %eax, -14(%edx) | ||
294 | movl %eax, -10(%edx) | ||
295 | L(write_3words): | ||
296 | movl %eax, -6(%edx) | ||
297 | movw %ax, -2(%edx) | ||
298 | SETRTNVAL | ||
299 | RETURN | ||
300 | |||
301 | ALIGN (4) | ||
302 | |||
303 | L(32wordsormore): | ||
304 | shl $1, %ecx /* ECX: word count -> byte count */ | ||
305 | test $0x01, %edx | ||
306 | jz L(aligned2bytes) /* destination is already 2-byte aligned */ | ||
307 | mov %eax, (%edx) /* odd destination: store the first 4 bytes ... */ | ||
308 | mov %eax, -4(%edx, %ecx) /* ... and the last 4 bytes with the unrotated pattern */ | ||
309 | sub $2, %ecx /* first and last byte are now written: shrink the region by one byte at each end */ | ||
310 | add $1, %edx /* EDX is now 2-byte aligned */ | ||
311 | rol $8, %eax /* rotate the pattern one byte so stores from the new EDX continue the same byte sequence */ | ||
312 | L(aligned2bytes): | ||
313 | #ifdef USE_AS_BZERO16 | ||
314 | pxor %xmm0, %xmm0 /* XMM0 = all zero */ | ||
315 | #else | ||
316 | movd %eax, %xmm0 | ||
317 | pshufd $0, %xmm0, %xmm0 /* broadcast the 32-bit pattern to all four dwords of XMM0 */ | ||
318 | #endif | ||
319 | testl $0xf, %edx | ||
320 | jz L(aligned_16) | ||
321 | /* ECX > 32 and EDX is not 16 byte aligned. */ | ||
322 | L(not_aligned_16): | ||
323 | movdqu %xmm0, (%edx) /* unaligned 16-byte store covers everything up to the next 16-byte boundary */ | ||
324 | movl %edx, %eax /* EAX = old EDX */ | ||
325 | and $-16, %edx | ||
326 | add $16, %edx /* EDX = next 16-byte boundary */ | ||
327 | sub %edx, %eax /* EAX = old EDX - new EDX, i.e. minus the number of bytes skipped */ | ||
328 | add %eax, %ecx /* shrink the remaining byte count by the bytes skipped */ | ||
329 | movd %xmm0, %eax /* reload the 32-bit fill pattern into EAX for the tail stores */ | ||
330 | |||
331 | ALIGN (4) | ||
332 | L(aligned_16): | ||
333 | cmp $128, %ecx | ||
334 | jae L(128bytesormore) | ||
335 | |||
336 | L(aligned_16_less128bytes): | ||
337 | add %ecx, %edx | ||
338 | shr $1, %ecx | ||
339 | BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes)) | ||
340 | |||
341 | ALIGN (4) | ||
342 | L(128bytesormore): | ||
343 | #ifdef SHARED_CACHE_SIZE | ||
344 | PUSH (%ebx) | ||
345 | mov $SHARED_CACHE_SIZE, %ebx | ||
346 | #else | ||
347 | # ifdef SHARED | ||
348 | call __i686.get_pc_thunk.bx | ||
349 | add $_GLOBAL_OFFSET_TABLE_, %ebx | ||
350 | mov __x86_shared_cache_size@GOTOFF(%ebx), %ebx | ||
351 | # else | ||
352 | PUSH (%ebx) | ||
353 | mov __x86_shared_cache_size, %ebx | ||
354 | # endif | ||
355 | #endif | ||
356 | cmp %ebx, %ecx | ||
357 | jae L(128bytesormore_nt_start) | ||
358 | |||
359 | |||
360 | #ifdef DATA_CACHE_SIZE | ||
361 | POP (%ebx) | ||
362 | # define RESTORE_EBX_STATE CFI_PUSH (%ebx) | ||
363 | cmp $DATA_CACHE_SIZE, %ecx | ||
364 | #else | ||
365 | # ifdef SHARED | ||
366 | # define RESTORE_EBX_STATE | ||
367 | call __i686.get_pc_thunk.bx | ||
368 | add $_GLOBAL_OFFSET_TABLE_, %ebx | ||
369 | cmp __x86_data_cache_size@GOTOFF(%ebx), %ecx | ||
370 | # else | ||
371 | POP (%ebx) | ||
372 | # define RESTORE_EBX_STATE CFI_PUSH (%ebx) | ||
373 | cmp __x86_data_cache_size, %ecx | ||
374 | # endif | ||
375 | #endif | ||
376 | |||
377 | jae L(128bytes_L2_normal) | ||
378 | subl $128, %ecx | ||
379 | L(128bytesormore_normal): | ||
380 | sub $128, %ecx | ||
381 | movdqa %xmm0, (%edx) | ||
382 | movdqa %xmm0, 0x10(%edx) | ||
383 | movdqa %xmm0, 0x20(%edx) | ||
384 | movdqa %xmm0, 0x30(%edx) | ||
385 | movdqa %xmm0, 0x40(%edx) | ||
386 | movdqa %xmm0, 0x50(%edx) | ||
387 | movdqa %xmm0, 0x60(%edx) | ||
388 | movdqa %xmm0, 0x70(%edx) | ||
389 | lea 128(%edx), %edx | ||
390 | jb L(128bytesless_normal) | ||
391 | |||
392 | |||
393 | sub $128, %ecx | ||
394 | movdqa %xmm0, (%edx) | ||
395 | movdqa %xmm0, 0x10(%edx) | ||
396 | movdqa %xmm0, 0x20(%edx) | ||
397 | movdqa %xmm0, 0x30(%edx) | ||
398 | movdqa %xmm0, 0x40(%edx) | ||
399 | movdqa %xmm0, 0x50(%edx) | ||
400 | movdqa %xmm0, 0x60(%edx) | ||
401 | movdqa %xmm0, 0x70(%edx) | ||
402 | lea 128(%edx), %edx | ||
403 | jae L(128bytesormore_normal) | ||
404 | |||
405 | L(128bytesless_normal): | ||
406 | lea 128(%ecx), %ecx | ||
407 | add %ecx, %edx | ||
408 | shr $1, %ecx | ||
409 | BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes)) | ||
410 | |||
411 | ALIGN (4) | ||
412 | L(128bytes_L2_normal): | ||
413 | prefetcht0 0x380(%edx) | ||
414 | prefetcht0 0x3c0(%edx) | ||
415 | sub $128, %ecx | ||
416 | movdqa %xmm0, (%edx) | ||
417 | movaps %xmm0, 0x10(%edx) | ||
418 | movaps %xmm0, 0x20(%edx) | ||
419 | movaps %xmm0, 0x30(%edx) | ||
420 | movaps %xmm0, 0x40(%edx) | ||
421 | movaps %xmm0, 0x50(%edx) | ||
422 | movaps %xmm0, 0x60(%edx) | ||
423 | movaps %xmm0, 0x70(%edx) | ||
424 | add $128, %edx | ||
425 | cmp $128, %ecx | ||
426 | jae L(128bytes_L2_normal) | ||
427 | |||
428 | L(128bytesless_L2_normal): | ||
429 | add %ecx, %edx | ||
430 | shr $1, %ecx | ||
431 | BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes)) | ||
432 | |||
433 | RESTORE_EBX_STATE | ||
434 | L(128bytesormore_nt_start): | ||
435 | sub %ebx, %ecx | ||
436 | mov %ebx, %eax | ||
437 | and $0x7f, %eax | ||
438 | add %eax, %ecx | ||
439 | movd %xmm0, %eax | ||
440 | ALIGN (4) | ||
441 | L(128bytesormore_shared_cache_loop): | ||
442 | prefetcht0 0x3c0(%edx) | ||
443 | prefetcht0 0x380(%edx) | ||
444 | sub $0x80, %ebx | ||
445 | movdqa %xmm0, (%edx) | ||
446 | movdqa %xmm0, 0x10(%edx) | ||
447 | movdqa %xmm0, 0x20(%edx) | ||
448 | movdqa %xmm0, 0x30(%edx) | ||
449 | movdqa %xmm0, 0x40(%edx) | ||
450 | movdqa %xmm0, 0x50(%edx) | ||
451 | movdqa %xmm0, 0x60(%edx) | ||
452 | movdqa %xmm0, 0x70(%edx) | ||
453 | add $0x80, %edx | ||
454 | cmp $0x80, %ebx | ||
455 | jae L(128bytesormore_shared_cache_loop) | ||
456 | cmp $0x80, %ecx | ||
457 | jb L(shared_cache_loop_end) | ||
458 | ALIGN (4) | ||
459 | L(128bytesormore_nt): | ||
460 | sub $0x80, %ecx | ||
461 | movntdq %xmm0, (%edx) | ||
462 | movntdq %xmm0, 0x10(%edx) | ||
463 | movntdq %xmm0, 0x20(%edx) | ||
464 | movntdq %xmm0, 0x30(%edx) | ||
465 | movntdq %xmm0, 0x40(%edx) | ||
466 | movntdq %xmm0, 0x50(%edx) | ||
467 | movntdq %xmm0, 0x60(%edx) | ||
468 | movntdq %xmm0, 0x70(%edx) | ||
469 | add $0x80, %edx | ||
470 | cmp $0x80, %ecx | ||
471 | jae L(128bytesormore_nt) | ||
472 | sfence | ||
473 | L(shared_cache_loop_end): | ||
474 | #if defined DATA_CACHE_SIZE || !defined SHARED | ||
475 | POP (%ebx) | ||
476 | #endif | ||
477 | add %ecx, %edx | ||
478 | shr $1, %ecx | ||
479 | BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes)) | ||
480 | |||
481 | |||
482 | .pushsection .rodata.sse2,"a",@progbits | ||
483 | ALIGN (2) | ||
484 | L(table_16_128bytes): | ||
485 | .int JMPTBL (L(aligned_16_0bytes), L(table_16_128bytes)) | ||
486 | .int JMPTBL (L(aligned_16_2bytes), L(table_16_128bytes)) | ||
487 | .int JMPTBL (L(aligned_16_4bytes), L(table_16_128bytes)) | ||
488 | .int JMPTBL (L(aligned_16_6bytes), L(table_16_128bytes)) | ||
489 | .int JMPTBL (L(aligned_16_8bytes), L(table_16_128bytes)) | ||
490 | .int JMPTBL (L(aligned_16_10bytes), L(table_16_128bytes)) | ||
491 | .int JMPTBL (L(aligned_16_12bytes), L(table_16_128bytes)) | ||
492 | .int JMPTBL (L(aligned_16_14bytes), L(table_16_128bytes)) | ||
493 | .int JMPTBL (L(aligned_16_16bytes), L(table_16_128bytes)) | ||
494 | .int JMPTBL (L(aligned_16_18bytes), L(table_16_128bytes)) | ||
495 | .int JMPTBL (L(aligned_16_20bytes), L(table_16_128bytes)) | ||
496 | .int JMPTBL (L(aligned_16_22bytes), L(table_16_128bytes)) | ||
497 | .int JMPTBL (L(aligned_16_24bytes), L(table_16_128bytes)) | ||
498 | .int JMPTBL (L(aligned_16_26bytes), L(table_16_128bytes)) | ||
499 | .int JMPTBL (L(aligned_16_28bytes), L(table_16_128bytes)) | ||
500 | .int JMPTBL (L(aligned_16_30bytes), L(table_16_128bytes)) | ||
501 | .int JMPTBL (L(aligned_16_32bytes), L(table_16_128bytes)) | ||
502 | .int JMPTBL (L(aligned_16_34bytes), L(table_16_128bytes)) | ||
503 | .int JMPTBL (L(aligned_16_36bytes), L(table_16_128bytes)) | ||
504 | .int JMPTBL (L(aligned_16_38bytes), L(table_16_128bytes)) | ||
505 | .int JMPTBL (L(aligned_16_40bytes), L(table_16_128bytes)) | ||
506 | .int JMPTBL (L(aligned_16_42bytes), L(table_16_128bytes)) | ||
507 | .int JMPTBL (L(aligned_16_44bytes), L(table_16_128bytes)) | ||
508 | .int JMPTBL (L(aligned_16_46bytes), L(table_16_128bytes)) | ||
509 | .int JMPTBL (L(aligned_16_48bytes), L(table_16_128bytes)) | ||
510 | .int JMPTBL (L(aligned_16_50bytes), L(table_16_128bytes)) | ||
511 | .int JMPTBL (L(aligned_16_52bytes), L(table_16_128bytes)) | ||
512 | .int JMPTBL (L(aligned_16_54bytes), L(table_16_128bytes)) | ||
513 | .int JMPTBL (L(aligned_16_56bytes), L(table_16_128bytes)) | ||
514 | .int JMPTBL (L(aligned_16_58bytes), L(table_16_128bytes)) | ||
515 | .int JMPTBL (L(aligned_16_60bytes), L(table_16_128bytes)) | ||
516 | .int JMPTBL (L(aligned_16_62bytes), L(table_16_128bytes)) | ||
517 | .int JMPTBL (L(aligned_16_64bytes), L(table_16_128bytes)) | ||
518 | .int JMPTBL (L(aligned_16_66bytes), L(table_16_128bytes)) | ||
519 | .int JMPTBL (L(aligned_16_68bytes), L(table_16_128bytes)) | ||
520 | .int JMPTBL (L(aligned_16_70bytes), L(table_16_128bytes)) | ||
521 | .int JMPTBL (L(aligned_16_72bytes), L(table_16_128bytes)) | ||
522 | .int JMPTBL (L(aligned_16_74bytes), L(table_16_128bytes)) | ||
523 | .int JMPTBL (L(aligned_16_76bytes), L(table_16_128bytes)) | ||
524 | .int JMPTBL (L(aligned_16_78bytes), L(table_16_128bytes)) | ||
525 | .int JMPTBL (L(aligned_16_80bytes), L(table_16_128bytes)) | ||
526 | .int JMPTBL (L(aligned_16_82bytes), L(table_16_128bytes)) | ||
527 | .int JMPTBL (L(aligned_16_84bytes), L(table_16_128bytes)) | ||
528 | .int JMPTBL (L(aligned_16_86bytes), L(table_16_128bytes)) | ||
529 | .int JMPTBL (L(aligned_16_88bytes), L(table_16_128bytes)) | ||
530 | .int JMPTBL (L(aligned_16_90bytes), L(table_16_128bytes)) | ||
531 | .int JMPTBL (L(aligned_16_92bytes), L(table_16_128bytes)) | ||
532 | .int JMPTBL (L(aligned_16_94bytes), L(table_16_128bytes)) | ||
533 | .int JMPTBL (L(aligned_16_96bytes), L(table_16_128bytes)) | ||
534 | .int JMPTBL (L(aligned_16_98bytes), L(table_16_128bytes)) | ||
535 | .int JMPTBL (L(aligned_16_100bytes), L(table_16_128bytes)) | ||
536 | .int JMPTBL (L(aligned_16_102bytes), L(table_16_128bytes)) | ||
537 | .int JMPTBL (L(aligned_16_104bytes), L(table_16_128bytes)) | ||
538 | .int JMPTBL (L(aligned_16_106bytes), L(table_16_128bytes)) | ||
539 | .int JMPTBL (L(aligned_16_108bytes), L(table_16_128bytes)) | ||
540 | .int JMPTBL (L(aligned_16_110bytes), L(table_16_128bytes)) | ||
541 | .int JMPTBL (L(aligned_16_112bytes), L(table_16_128bytes)) | ||
542 | .int JMPTBL (L(aligned_16_114bytes), L(table_16_128bytes)) | ||
543 | .int JMPTBL (L(aligned_16_116bytes), L(table_16_128bytes)) | ||
544 | .int JMPTBL (L(aligned_16_118bytes), L(table_16_128bytes)) | ||
545 | .int JMPTBL (L(aligned_16_120bytes), L(table_16_128bytes)) | ||
546 | .int JMPTBL (L(aligned_16_122bytes), L(table_16_128bytes)) | ||
547 | .int JMPTBL (L(aligned_16_124bytes), L(table_16_128bytes)) | ||
548 | .int JMPTBL (L(aligned_16_126bytes), L(table_16_128bytes)) | ||
549 | .popsection | ||
550 | |||
551 | |||
552 | ALIGN (4) | ||
553 | L(aligned_16_112bytes): | ||
554 | movdqa %xmm0, -112(%edx) | ||
555 | L(aligned_16_96bytes): | ||
556 | movdqa %xmm0, -96(%edx) | ||
557 | L(aligned_16_80bytes): | ||
558 | movdqa %xmm0, -80(%edx) | ||
559 | L(aligned_16_64bytes): | ||
560 | movdqa %xmm0, -64(%edx) | ||
561 | L(aligned_16_48bytes): | ||
562 | movdqa %xmm0, -48(%edx) | ||
563 | L(aligned_16_32bytes): | ||
564 | movdqa %xmm0, -32(%edx) | ||
565 | L(aligned_16_16bytes): | ||
566 | movdqa %xmm0, -16(%edx) | ||
567 | L(aligned_16_0bytes): | ||
568 | SETRTNVAL | ||
569 | RETURN | ||
570 | |||
571 | |||
572 | ALIGN (4) | ||
573 | L(aligned_16_114bytes): | ||
574 | movdqa %xmm0, -114(%edx) | ||
575 | L(aligned_16_98bytes): | ||
576 | movdqa %xmm0, -98(%edx) | ||
577 | L(aligned_16_82bytes): | ||
578 | movdqa %xmm0, -82(%edx) | ||
579 | L(aligned_16_66bytes): | ||
580 | movdqa %xmm0, -66(%edx) | ||
581 | L(aligned_16_50bytes): | ||
582 | movdqa %xmm0, -50(%edx) | ||
583 | L(aligned_16_34bytes): | ||
584 | movdqa %xmm0, -34(%edx) | ||
585 | L(aligned_16_18bytes): | ||
586 | movdqa %xmm0, -18(%edx) | ||
587 | L(aligned_16_2bytes): | ||
588 | movw %ax, -2(%edx) | ||
589 | SETRTNVAL | ||
590 | RETURN | ||
591 | |||
592 | ALIGN (4) | ||
593 | L(aligned_16_116bytes): | ||
594 | movdqa %xmm0, -116(%edx) | ||
595 | L(aligned_16_100bytes): | ||
596 | movdqa %xmm0, -100(%edx) | ||
597 | L(aligned_16_84bytes): | ||
598 | movdqa %xmm0, -84(%edx) | ||
599 | L(aligned_16_68bytes): | ||
600 | movdqa %xmm0, -68(%edx) | ||
601 | L(aligned_16_52bytes): | ||
602 | movdqa %xmm0, -52(%edx) | ||
603 | L(aligned_16_36bytes): | ||
604 | movdqa %xmm0, -36(%edx) | ||
605 | L(aligned_16_20bytes): | ||
606 | movdqa %xmm0, -20(%edx) | ||
607 | L(aligned_16_4bytes): | ||
608 | movl %eax, -4(%edx) | ||
609 | SETRTNVAL | ||
610 | RETURN | ||
611 | |||
612 | |||
613 | ALIGN (4) | ||
614 | L(aligned_16_118bytes): | ||
615 | movdqa %xmm0, -118(%edx) | ||
616 | L(aligned_16_102bytes): | ||
617 | movdqa %xmm0, -102(%edx) | ||
618 | L(aligned_16_86bytes): | ||
619 | movdqa %xmm0, -86(%edx) | ||
620 | L(aligned_16_70bytes): | ||
621 | movdqa %xmm0, -70(%edx) | ||
622 | L(aligned_16_54bytes): | ||
623 | movdqa %xmm0, -54(%edx) | ||
624 | L(aligned_16_38bytes): | ||
625 | movdqa %xmm0, -38(%edx) | ||
626 | L(aligned_16_22bytes): | ||
627 | movdqa %xmm0, -22(%edx) | ||
628 | L(aligned_16_6bytes): | ||
629 | movl %eax, -6(%edx) | ||
630 | movw %ax, -2(%edx) | ||
631 | SETRTNVAL | ||
632 | RETURN | ||
633 | |||
634 | |||
635 | ALIGN (4) | ||
636 | L(aligned_16_120bytes): | ||
637 | movdqa %xmm0, -120(%edx) | ||
638 | L(aligned_16_104bytes): | ||
639 | movdqa %xmm0, -104(%edx) | ||
640 | L(aligned_16_88bytes): | ||
641 | movdqa %xmm0, -88(%edx) | ||
642 | L(aligned_16_72bytes): | ||
643 | movdqa %xmm0, -72(%edx) | ||
644 | L(aligned_16_56bytes): | ||
645 | movdqa %xmm0, -56(%edx) | ||
646 | L(aligned_16_40bytes): | ||
647 | movdqa %xmm0, -40(%edx) | ||
648 | L(aligned_16_24bytes): | ||
649 | movdqa %xmm0, -24(%edx) | ||
650 | L(aligned_16_8bytes): | ||
651 | movq %xmm0, -8(%edx) | ||
652 | SETRTNVAL | ||
653 | RETURN | ||
654 | |||
655 | |||
656 | ALIGN (4) | ||
657 | L(aligned_16_122bytes): | ||
658 | movdqa %xmm0, -122(%edx) | ||
659 | L(aligned_16_106bytes): | ||
660 | movdqa %xmm0, -106(%edx) | ||
661 | L(aligned_16_90bytes): | ||
662 | movdqa %xmm0, -90(%edx) | ||
663 | L(aligned_16_74bytes): | ||
664 | movdqa %xmm0, -74(%edx) | ||
665 | L(aligned_16_58bytes): | ||
666 | movdqa %xmm0, -58(%edx) | ||
667 | L(aligned_16_42bytes): | ||
668 | movdqa %xmm0, -42(%edx) | ||
669 | L(aligned_16_26bytes): | ||
670 | movdqa %xmm0, -26(%edx) | ||
671 | L(aligned_16_10bytes): | ||
672 | movq %xmm0, -10(%edx) | ||
673 | movw %ax, -2(%edx) | ||
674 | SETRTNVAL | ||
675 | RETURN | ||
676 | |||
677 | |||
678 | ALIGN (4) | ||
679 | L(aligned_16_124bytes): | ||
680 | movdqa %xmm0, -124(%edx) | ||
681 | L(aligned_16_108bytes): | ||
682 | movdqa %xmm0, -108(%edx) | ||
683 | L(aligned_16_92bytes): | ||
684 | movdqa %xmm0, -92(%edx) | ||
685 | L(aligned_16_76bytes): | ||
686 | movdqa %xmm0, -76(%edx) | ||
687 | L(aligned_16_60bytes): | ||
688 | movdqa %xmm0, -60(%edx) | ||
689 | L(aligned_16_44bytes): | ||
690 | movdqa %xmm0, -44(%edx) | ||
691 | L(aligned_16_28bytes): | ||
692 | movdqa %xmm0, -28(%edx) | ||
693 | L(aligned_16_12bytes): | ||
694 | movq %xmm0, -12(%edx) | ||
695 | movl %eax, -4(%edx) | ||
696 | SETRTNVAL | ||
697 | RETURN | ||
698 | |||
699 | |||
700 | ALIGN (4) | ||
701 | L(aligned_16_126bytes): | ||
702 | movdqa %xmm0, -126(%edx) | ||
703 | L(aligned_16_110bytes): | ||
704 | movdqa %xmm0, -110(%edx) | ||
705 | L(aligned_16_94bytes): | ||
706 | movdqa %xmm0, -94(%edx) | ||
707 | L(aligned_16_78bytes): | ||
708 | movdqa %xmm0, -78(%edx) | ||
709 | L(aligned_16_62bytes): | ||
710 | movdqa %xmm0, -62(%edx) | ||
711 | L(aligned_16_46bytes): | ||
712 | movdqa %xmm0, -46(%edx) | ||
713 | L(aligned_16_30bytes): | ||
714 | movdqa %xmm0, -30(%edx) | ||
715 | L(aligned_16_14bytes): | ||
716 | movq %xmm0, -14(%edx) | ||
717 | movl %eax, -6(%edx) | ||
718 | movw %ax, -2(%edx) | ||
719 | SETRTNVAL | ||
720 | RETURN | ||
721 | |||
722 | END (sse2_memset16_atom) | ||
diff --git a/libcutils/arch-x86/sse2-memset32-atom.S b/libcutils/arch-x86/sse2-memset32-atom.S new file mode 100644 index 000000000..4a5248450 --- /dev/null +++ b/libcutils/arch-x86/sse2-memset32-atom.S | |||
@@ -0,0 +1,513 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2010 The Android Open Source Project | ||
3 | * | ||
4 | * Licensed under the Apache License, Version 2.0 (the "License"); | ||
5 | * you may not use this file except in compliance with the License. | ||
6 | * You may obtain a copy of the License at | ||
7 | * | ||
8 | * http://www.apache.org/licenses/LICENSE-2.0 | ||
9 | * | ||
10 | * Unless required by applicable law or agreed to in writing, software | ||
11 | * distributed under the License is distributed on an "AS IS" BASIS, | ||
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
13 | * See the License for the specific language governing permissions and | ||
14 | * limitations under the License. | ||
15 | */ | ||
16 | /* | ||
17 | * Contributed by: Intel Corporation | ||
18 | */ | ||
19 | |||
20 | #ifndef L | ||
21 | # define L(label) .L##label | ||
22 | #endif | ||
23 | |||
24 | #ifndef ALIGN | ||
25 | # define ALIGN(n) .p2align n | ||
26 | #endif | ||
27 | |||
28 | #ifndef cfi_startproc | ||
29 | # define cfi_startproc .cfi_startproc | ||
30 | #endif | ||
31 | |||
32 | #ifndef cfi_endproc | ||
33 | # define cfi_endproc .cfi_endproc | ||
34 | #endif | ||
35 | |||
36 | #ifndef cfi_rel_offset | ||
37 | # define cfi_rel_offset(reg, off) .cfi_rel_offset reg, off | ||
38 | #endif | ||
39 | |||
40 | #ifndef cfi_restore | ||
41 | # define cfi_restore(reg) .cfi_restore reg | ||
42 | #endif | ||
43 | |||
44 | #ifndef cfi_adjust_cfa_offset | ||
45 | # define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off | ||
46 | #endif | ||
47 | |||
48 | #ifndef ENTRY | ||
49 | # define ENTRY(name) \ | ||
50 | .type name, @function; \ | ||
51 | .globl name; \ | ||
52 | .p2align 4; \ | ||
53 | name: \ | ||
54 | cfi_startproc | ||
55 | #endif | ||
56 | |||
57 | #ifndef END | ||
58 | # define END(name) \ | ||
59 | cfi_endproc; \ | ||
60 | .size name, .-name | ||
61 | #endif | ||
62 | |||
63 | #define CFI_PUSH(REG) \ | ||
64 | cfi_adjust_cfa_offset (4); \ | ||
65 | cfi_rel_offset (REG, 0) | ||
66 | |||
67 | #define CFI_POP(REG) \ | ||
68 | cfi_adjust_cfa_offset (-4); \ | ||
69 | cfi_restore (REG) | ||
70 | |||
71 | #define PUSH(REG) pushl REG; CFI_PUSH (REG) | ||
72 | #define POP(REG) popl REG; CFI_POP (REG) | ||
73 | |||
74 | #ifdef USE_AS_BZERO32 | ||
75 | # define DEST PARMS | ||
76 | # define LEN DEST+4 | ||
77 | #else | ||
78 | # define DEST PARMS | ||
79 | # define DWDS DEST+4 | ||
80 | # define LEN DWDS+4 | ||
81 | #endif | ||
82 | |||
83 | #ifdef USE_AS_WMEMSET32 | ||
84 | # define SETRTNVAL movl DEST(%esp), %eax | ||
85 | #else | ||
86 | # define SETRTNVAL | ||
87 | #endif | ||
88 | |||
89 | #ifdef SHARED | ||
90 | # define ENTRANCE PUSH (%ebx); | ||
91 | # define RETURN_END POP (%ebx); ret | ||
92 | # define RETURN RETURN_END; CFI_PUSH (%ebx) | ||
93 | # define PARMS 8 /* Preserve EBX. */ | ||
94 | # define JMPTBL(I, B) I - B | ||
95 | |||
96 | /* Load an entry in a jump table into EBX and branch to it. TABLE is a | ||
97 | jump table with relative offsets. */ | ||
98 | # define BRANCH_TO_JMPTBL_ENTRY(TABLE) \ | ||
99 | /* We first load PC into EBX. */ \ | ||
100 | call __i686.get_pc_thunk.bx; \ | ||
101 | /* Get the address of the jump table. */ \ | ||
102 | add $(TABLE - .), %ebx; \ | ||
103 | /* Get the entry and convert the relative offset to the \ | ||
104 | absolute address. */ \ | ||
105 | add (%ebx,%ecx,4), %ebx; \ | ||
106 | /* We loaded the jump table and adjusted EDX. Go. */ \ | ||
107 | jmp *%ebx | ||
108 | |||
109 | .section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits | ||
110 | .globl __i686.get_pc_thunk.bx | ||
111 | .hidden __i686.get_pc_thunk.bx | ||
112 | ALIGN (4) | ||
113 | .type __i686.get_pc_thunk.bx,@function | ||
114 | __i686.get_pc_thunk.bx: | ||
115 | movl (%esp), %ebx | ||
116 | ret | ||
117 | #else | ||
118 | # define ENTRANCE | ||
119 | # define RETURN_END ret | ||
120 | # define RETURN RETURN_END | ||
121 | # define PARMS 4 | ||
122 | # define JMPTBL(I, B) I | ||
123 | |||
124 | /* Branch to an entry in a jump table. TABLE is a jump table with | ||
125 | absolute offsets. */ | ||
126 | # define BRANCH_TO_JMPTBL_ENTRY(TABLE) \ | ||
127 | jmp *TABLE(,%ecx,4) | ||
128 | #endif | ||
129 | |||
130 | .section .text.sse2,"ax",@progbits | ||
131 | ALIGN (4) | ||
132 | ENTRY (sse2_memset32_atom) | ||
133 | ENTRANCE | ||
134 | |||
135 | movl LEN(%esp), %ecx /* ECX = length argument. */ | ||
136 | #ifdef USE_AS_ANDROID | ||
137 | shr $2, %ecx /* ECX >>= 2 (divide the length by four). */ | ||
138 | #endif | ||
139 | #ifdef USE_AS_BZERO32 | ||
140 | xor %eax, %eax | ||
141 | #else | ||
142 | mov DWDS(%esp), %eax /* EAX = 32-bit fill value. */ | ||
143 | mov %eax, %edx /* NOTE(review): dead store, EDX is reloaded from DEST just below. */ | ||
144 | #endif | ||
145 | movl DEST(%esp), %edx /* EDX = destination pointer. */ | ||
146 | cmp $16, %ecx /* Fewer than 16 dwords fall through to the jump table. */ | ||
147 | jae L(16dbwordsormore) | ||
148 | |||
149 | L(write_less16dbwords): | ||
150 | lea (%edx, %ecx, 4), %edx | ||
151 | BRANCH_TO_JMPTBL_ENTRY (L(table_less16dbwords)) | ||
152 | |||
153 | .pushsection .rodata.sse2,"a",@progbits | ||
154 | ALIGN (2) | ||
155 | L(table_less16dbwords): | ||
156 | .int JMPTBL (L(write_0dbwords), L(table_less16dbwords)) | ||
157 | .int JMPTBL (L(write_1dbwords), L(table_less16dbwords)) | ||
158 | .int JMPTBL (L(write_2dbwords), L(table_less16dbwords)) | ||
159 | .int JMPTBL (L(write_3dbwords), L(table_less16dbwords)) | ||
160 | .int JMPTBL (L(write_4dbwords), L(table_less16dbwords)) | ||
161 | .int JMPTBL (L(write_5dbwords), L(table_less16dbwords)) | ||
162 | .int JMPTBL (L(write_6dbwords), L(table_less16dbwords)) | ||
163 | .int JMPTBL (L(write_7dbwords), L(table_less16dbwords)) | ||
164 | .int JMPTBL (L(write_8dbwords), L(table_less16dbwords)) | ||
165 | .int JMPTBL (L(write_9dbwords), L(table_less16dbwords)) | ||
166 | .int JMPTBL (L(write_10dbwords), L(table_less16dbwords)) | ||
167 | .int JMPTBL (L(write_11dbwords), L(table_less16dbwords)) | ||
168 | .int JMPTBL (L(write_12dbwords), L(table_less16dbwords)) | ||
169 | .int JMPTBL (L(write_13dbwords), L(table_less16dbwords)) | ||
170 | .int JMPTBL (L(write_14dbwords), L(table_less16dbwords)) | ||
171 | .int JMPTBL (L(write_15dbwords), L(table_less16dbwords)) | ||
172 | .popsection | ||
173 | |||
174 | ALIGN (4) | ||
175 | L(write_15dbwords): | ||
176 | movl %eax, -60(%edx) | ||
177 | L(write_14dbwords): | ||
178 | movl %eax, -56(%edx) | ||
179 | L(write_13dbwords): | ||
180 | movl %eax, -52(%edx) | ||
181 | L(write_12dbwords): | ||
182 | movl %eax, -48(%edx) | ||
183 | L(write_11dbwords): | ||
184 | movl %eax, -44(%edx) | ||
185 | L(write_10dbwords): | ||
186 | movl %eax, -40(%edx) | ||
187 | L(write_9dbwords): | ||
188 | movl %eax, -36(%edx) | ||
189 | L(write_8dbwords): | ||
190 | movl %eax, -32(%edx) | ||
191 | L(write_7dbwords): | ||
192 | movl %eax, -28(%edx) | ||
193 | L(write_6dbwords): | ||
194 | movl %eax, -24(%edx) | ||
195 | L(write_5dbwords): | ||
196 | movl %eax, -20(%edx) | ||
197 | L(write_4dbwords): | ||
198 | movl %eax, -16(%edx) | ||
199 | L(write_3dbwords): | ||
200 | movl %eax, -12(%edx) | ||
201 | L(write_2dbwords): | ||
202 | movl %eax, -8(%edx) | ||
203 | L(write_1dbwords): | ||
204 | movl %eax, -4(%edx) | ||
205 | L(write_0dbwords): | ||
206 | SETRTNVAL | ||
207 | RETURN | ||
208 | |||
209 | ALIGN (4) | ||
210 | L(16dbwordsormore): | ||
211 | test $3, %edx /* Is the destination already 4-byte aligned? */ | ||
212 | jz L(aligned4bytes) | ||
213 | mov %eax, (%edx) /* Unaligned store of the first dword. */ | ||
214 | mov %eax, -4(%edx, %ecx, 4) /* Unaligned store of the last dword. */ | ||
215 | sub $1, %ecx /* One dword fewer is left for the stores below. */ | ||
216 | rol $24, %eax /* rol 24 is ror 8: rotate the fill pattern by one byte for each byte EDX advances. */ | ||
217 | add $1, %edx | ||
218 | test $3, %edx | ||
219 | jz L(aligned4bytes) | ||
220 | ror $8, %eax | ||
221 | add $1, %edx | ||
222 | test $3, %edx | ||
223 | jz L(aligned4bytes) | ||
224 | ror $8, %eax | ||
225 | add $1, %edx | ||
226 | L(aligned4bytes): | ||
227 | shl $2, %ecx /* ECX: dword count -> byte count. */ | ||
228 | |||
229 | #ifdef USE_AS_BZERO32 | ||
230 | pxor %xmm0, %xmm0 | ||
231 | #else | ||
232 | movd %eax, %xmm0 | ||
233 | pshufd $0, %xmm0, %xmm0 | ||
234 | #endif | ||
235 | testl $0xf, %edx | ||
236 | jz L(aligned_16) | ||
237 | /* ECX (now a byte count) > 32 and EDX is not 16-byte aligned. */ | ||
238 | L(not_aligned_16): | ||
239 | movdqu %xmm0, (%edx) /* Unaligned 16-byte store covers the head. */ | ||
240 | movl %edx, %eax /* EAX = original EDX. */ | ||
241 | and $-16, %edx | ||
242 | add $16, %edx /* EDX = next 16-byte boundary above the original EDX. */ | ||
243 | sub %edx, %eax /* EAX = -(bytes skipped to reach the boundary). */ | ||
244 | add %eax, %ecx /* Remove the skipped bytes from the remaining count. */ | ||
245 | movd %xmm0, %eax /* Reload EAX with the low dword of XMM0 (the fill pattern). */ | ||
246 | ALIGN (4) | ||
247 | L(aligned_16): | ||
248 | cmp $128, %ecx | ||
249 | jae L(128bytesormore) | ||
250 | |||
251 | L(aligned_16_less128bytes): | ||
252 | add %ecx, %edx | ||
253 | shr $2, %ecx | ||
254 | BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes)) | ||
255 | |||
256 | ALIGN (4) | ||
257 | L(128bytesormore): | ||
258 | #ifdef SHARED_CACHE_SIZE | ||
259 | PUSH (%ebx) | ||
260 | mov $SHARED_CACHE_SIZE, %ebx | ||
261 | #else | ||
262 | # ifdef SHARED | ||
263 | call __i686.get_pc_thunk.bx | ||
264 | add $_GLOBAL_OFFSET_TABLE_, %ebx | ||
265 | mov __x86_shared_cache_size@GOTOFF(%ebx), %ebx | ||
266 | # else | ||
267 | PUSH (%ebx) | ||
268 | mov __x86_shared_cache_size, %ebx | ||
269 | # endif | ||
270 | #endif | ||
271 | cmp %ebx, %ecx | ||
272 | jae L(128bytesormore_nt_start) | ||
273 | |||
274 | #ifdef DATA_CACHE_SIZE | ||
275 | POP (%ebx) | ||
276 | # define RESTORE_EBX_STATE CFI_PUSH (%ebx) | ||
277 | cmp $DATA_CACHE_SIZE, %ecx | ||
278 | #else | ||
279 | # ifdef SHARED | ||
280 | # define RESTORE_EBX_STATE | ||
281 | call __i686.get_pc_thunk.bx | ||
282 | add $_GLOBAL_OFFSET_TABLE_, %ebx | ||
283 | cmp __x86_data_cache_size@GOTOFF(%ebx), %ecx | ||
284 | # else | ||
285 | POP (%ebx) | ||
286 | # define RESTORE_EBX_STATE CFI_PUSH (%ebx) | ||
287 | cmp __x86_data_cache_size, %ecx | ||
288 | # endif | ||
289 | #endif | ||
290 | |||
291 | jae L(128bytes_L2_normal) | ||
292 | subl $128, %ecx | ||
293 | L(128bytesormore_normal): | ||
294 | sub $128, %ecx | ||
295 | movdqa %xmm0, (%edx) | ||
296 | movdqa %xmm0, 0x10(%edx) | ||
297 | movdqa %xmm0, 0x20(%edx) | ||
298 | movdqa %xmm0, 0x30(%edx) | ||
299 | movdqa %xmm0, 0x40(%edx) | ||
300 | movdqa %xmm0, 0x50(%edx) | ||
301 | movdqa %xmm0, 0x60(%edx) | ||
302 | movdqa %xmm0, 0x70(%edx) | ||
303 | lea 128(%edx), %edx | ||
304 | jb L(128bytesless_normal) | ||
305 | |||
306 | |||
307 | sub $128, %ecx | ||
308 | movdqa %xmm0, (%edx) | ||
309 | movdqa %xmm0, 0x10(%edx) | ||
310 | movdqa %xmm0, 0x20(%edx) | ||
311 | movdqa %xmm0, 0x30(%edx) | ||
312 | movdqa %xmm0, 0x40(%edx) | ||
313 | movdqa %xmm0, 0x50(%edx) | ||
314 | movdqa %xmm0, 0x60(%edx) | ||
315 | movdqa %xmm0, 0x70(%edx) | ||
316 | lea 128(%edx), %edx | ||
317 | jae L(128bytesormore_normal) | ||
318 | |||
319 | L(128bytesless_normal): | ||
320 | lea 128(%ecx), %ecx | ||
321 | add %ecx, %edx | ||
322 | shr $2, %ecx | ||
323 | BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes)) | ||
324 | |||
325 | ALIGN (4) | ||
326 | L(128bytes_L2_normal): | ||
327 | prefetcht0 0x380(%edx) | ||
328 | prefetcht0 0x3c0(%edx) | ||
329 | sub $128, %ecx | ||
330 | movdqa %xmm0, (%edx) | ||
331 | movaps %xmm0, 0x10(%edx) | ||
332 | movaps %xmm0, 0x20(%edx) | ||
333 | movaps %xmm0, 0x30(%edx) | ||
334 | movaps %xmm0, 0x40(%edx) | ||
335 | movaps %xmm0, 0x50(%edx) | ||
336 | movaps %xmm0, 0x60(%edx) | ||
337 | movaps %xmm0, 0x70(%edx) | ||
338 | add $128, %edx | ||
339 | cmp $128, %ecx | ||
340 | jae L(128bytes_L2_normal) | ||
341 | |||
342 | L(128bytesless_L2_normal): | ||
343 | add %ecx, %edx | ||
344 | shr $2, %ecx | ||
345 | BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes)) | ||
346 | |||
347 | RESTORE_EBX_STATE | ||
348 | L(128bytesormore_nt_start): | ||
349 | sub %ebx, %ecx | ||
350 | mov %ebx, %eax | ||
351 | and $0x7f, %eax | ||
352 | add %eax, %ecx | ||
353 | movd %xmm0, %eax | ||
354 | ALIGN (4) | ||
355 | L(128bytesormore_shared_cache_loop): | ||
356 | prefetcht0 0x3c0(%edx) | ||
357 | prefetcht0 0x380(%edx) | ||
358 | sub $0x80, %ebx | ||
359 | movdqa %xmm0, (%edx) | ||
360 | movdqa %xmm0, 0x10(%edx) | ||
361 | movdqa %xmm0, 0x20(%edx) | ||
362 | movdqa %xmm0, 0x30(%edx) | ||
363 | movdqa %xmm0, 0x40(%edx) | ||
364 | movdqa %xmm0, 0x50(%edx) | ||
365 | movdqa %xmm0, 0x60(%edx) | ||
366 | movdqa %xmm0, 0x70(%edx) | ||
367 | add $0x80, %edx | ||
368 | cmp $0x80, %ebx | ||
369 | jae L(128bytesormore_shared_cache_loop) | ||
370 | cmp $0x80, %ecx | ||
371 | jb L(shared_cache_loop_end) | ||
372 | |||
373 | ALIGN (4) | ||
374 | L(128bytesormore_nt): | ||
375 | sub $0x80, %ecx | ||
376 | movntdq %xmm0, (%edx) | ||
377 | movntdq %xmm0, 0x10(%edx) | ||
378 | movntdq %xmm0, 0x20(%edx) | ||
379 | movntdq %xmm0, 0x30(%edx) | ||
380 | movntdq %xmm0, 0x40(%edx) | ||
381 | movntdq %xmm0, 0x50(%edx) | ||
382 | movntdq %xmm0, 0x60(%edx) | ||
383 | movntdq %xmm0, 0x70(%edx) | ||
384 | add $0x80, %edx | ||
385 | cmp $0x80, %ecx | ||
386 | jae L(128bytesormore_nt) | ||
387 | sfence | ||
388 | L(shared_cache_loop_end): | ||
389 | #if defined DATA_CACHE_SIZE || !defined SHARED | ||
390 | POP (%ebx) | ||
391 | #endif | ||
392 | add %ecx, %edx | ||
393 | shr $2, %ecx | ||
394 | BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes)) | ||
395 | |||
396 | .pushsection .rodata.sse2,"a",@progbits | ||
397 | ALIGN (2) | ||
398 | L(table_16_128bytes): | ||
399 | .int JMPTBL (L(aligned_16_0bytes), L(table_16_128bytes)) | ||
400 | .int JMPTBL (L(aligned_16_4bytes), L(table_16_128bytes)) | ||
401 | .int JMPTBL (L(aligned_16_8bytes), L(table_16_128bytes)) | ||
402 | .int JMPTBL (L(aligned_16_12bytes), L(table_16_128bytes)) | ||
403 | .int JMPTBL (L(aligned_16_16bytes), L(table_16_128bytes)) | ||
404 | .int JMPTBL (L(aligned_16_20bytes), L(table_16_128bytes)) | ||
405 | .int JMPTBL (L(aligned_16_24bytes), L(table_16_128bytes)) | ||
406 | .int JMPTBL (L(aligned_16_28bytes), L(table_16_128bytes)) | ||
407 | .int JMPTBL (L(aligned_16_32bytes), L(table_16_128bytes)) | ||
408 | .int JMPTBL (L(aligned_16_36bytes), L(table_16_128bytes)) | ||
409 | .int JMPTBL (L(aligned_16_40bytes), L(table_16_128bytes)) | ||
410 | .int JMPTBL (L(aligned_16_44bytes), L(table_16_128bytes)) | ||
411 | .int JMPTBL (L(aligned_16_48bytes), L(table_16_128bytes)) | ||
412 | .int JMPTBL (L(aligned_16_52bytes), L(table_16_128bytes)) | ||
413 | .int JMPTBL (L(aligned_16_56bytes), L(table_16_128bytes)) | ||
414 | .int JMPTBL (L(aligned_16_60bytes), L(table_16_128bytes)) | ||
415 | .int JMPTBL (L(aligned_16_64bytes), L(table_16_128bytes)) | ||
416 | .int JMPTBL (L(aligned_16_68bytes), L(table_16_128bytes)) | ||
417 | .int JMPTBL (L(aligned_16_72bytes), L(table_16_128bytes)) | ||
418 | .int JMPTBL (L(aligned_16_76bytes), L(table_16_128bytes)) | ||
419 | .int JMPTBL (L(aligned_16_80bytes), L(table_16_128bytes)) | ||
420 | .int JMPTBL (L(aligned_16_84bytes), L(table_16_128bytes)) | ||
421 | .int JMPTBL (L(aligned_16_88bytes), L(table_16_128bytes)) | ||
422 | .int JMPTBL (L(aligned_16_92bytes), L(table_16_128bytes)) | ||
423 | .int JMPTBL (L(aligned_16_96bytes), L(table_16_128bytes)) | ||
424 | .int JMPTBL (L(aligned_16_100bytes), L(table_16_128bytes)) | ||
425 | .int JMPTBL (L(aligned_16_104bytes), L(table_16_128bytes)) | ||
426 | .int JMPTBL (L(aligned_16_108bytes), L(table_16_128bytes)) | ||
427 | .int JMPTBL (L(aligned_16_112bytes), L(table_16_128bytes)) | ||
428 | .int JMPTBL (L(aligned_16_116bytes), L(table_16_128bytes)) | ||
429 | .int JMPTBL (L(aligned_16_120bytes), L(table_16_128bytes)) | ||
430 | .int JMPTBL (L(aligned_16_124bytes), L(table_16_128bytes)) | ||
431 | .popsection | ||
432 | |||
433 | ALIGN (4) | ||
434 | L(aligned_16_112bytes): | ||
435 | movdqa %xmm0, -112(%edx) | ||
436 | L(aligned_16_96bytes): | ||
437 | movdqa %xmm0, -96(%edx) | ||
438 | L(aligned_16_80bytes): | ||
439 | movdqa %xmm0, -80(%edx) | ||
440 | L(aligned_16_64bytes): | ||
441 | movdqa %xmm0, -64(%edx) | ||
442 | L(aligned_16_48bytes): | ||
443 | movdqa %xmm0, -48(%edx) | ||
444 | L(aligned_16_32bytes): | ||
445 | movdqa %xmm0, -32(%edx) | ||
446 | L(aligned_16_16bytes): | ||
447 | movdqa %xmm0, -16(%edx) | ||
448 | L(aligned_16_0bytes): | ||
449 | SETRTNVAL | ||
450 | RETURN | ||
451 | |||
452 | ALIGN (4) | ||
453 | L(aligned_16_116bytes): | ||
454 | movdqa %xmm0, -116(%edx) | ||
455 | L(aligned_16_100bytes): | ||
456 | movdqa %xmm0, -100(%edx) | ||
457 | L(aligned_16_84bytes): | ||
458 | movdqa %xmm0, -84(%edx) | ||
459 | L(aligned_16_68bytes): | ||
460 | movdqa %xmm0, -68(%edx) | ||
461 | L(aligned_16_52bytes): | ||
462 | movdqa %xmm0, -52(%edx) | ||
463 | L(aligned_16_36bytes): | ||
464 | movdqa %xmm0, -36(%edx) | ||
465 | L(aligned_16_20bytes): | ||
466 | movdqa %xmm0, -20(%edx) | ||
467 | L(aligned_16_4bytes): | ||
468 | movl %eax, -4(%edx) | ||
469 | SETRTNVAL | ||
470 | RETURN | ||
471 | |||
472 | ALIGN (4) | ||
473 | L(aligned_16_120bytes): | ||
474 | movdqa %xmm0, -120(%edx) | ||
475 | L(aligned_16_104bytes): | ||
476 | movdqa %xmm0, -104(%edx) | ||
477 | L(aligned_16_88bytes): | ||
478 | movdqa %xmm0, -88(%edx) | ||
479 | L(aligned_16_72bytes): | ||
480 | movdqa %xmm0, -72(%edx) | ||
481 | L(aligned_16_56bytes): | ||
482 | movdqa %xmm0, -56(%edx) | ||
483 | L(aligned_16_40bytes): | ||
484 | movdqa %xmm0, -40(%edx) | ||
485 | L(aligned_16_24bytes): | ||
486 | movdqa %xmm0, -24(%edx) | ||
487 | L(aligned_16_8bytes): | ||
488 | movq %xmm0, -8(%edx) | ||
489 | SETRTNVAL | ||
490 | RETURN | ||
491 | |||
492 | ALIGN (4) | ||
493 | L(aligned_16_124bytes): | ||
494 | movdqa %xmm0, -124(%edx) | ||
495 | L(aligned_16_108bytes): | ||
496 | movdqa %xmm0, -108(%edx) | ||
497 | L(aligned_16_92bytes): | ||
498 | movdqa %xmm0, -92(%edx) | ||
499 | L(aligned_16_76bytes): | ||
500 | movdqa %xmm0, -76(%edx) | ||
501 | L(aligned_16_60bytes): | ||
502 | movdqa %xmm0, -60(%edx) | ||
503 | L(aligned_16_44bytes): | ||
504 | movdqa %xmm0, -44(%edx) | ||
505 | L(aligned_16_28bytes): | ||
506 | movdqa %xmm0, -28(%edx) | ||
507 | L(aligned_16_12bytes): | ||
508 | movq %xmm0, -12(%edx) | ||
509 | movl %eax, -4(%edx) | ||
510 | SETRTNVAL | ||
511 | RETURN | ||
512 | |||
513 | END (sse2_memset32_atom) | ||
diff --git a/libcutils/memory.c b/libcutils/memory.c index ef6c7e663..6486b4504 100644 --- a/libcutils/memory.c +++ b/libcutils/memory.c | |||
@@ -16,6 +16,7 @@ | |||
16 | 16 | ||
17 | #include <cutils/memory.h> | 17 | #include <cutils/memory.h> |
18 | 18 | ||
19 | #if !HAVE_MEMSET16 | ||
19 | void android_memset16(uint16_t* dst, uint16_t value, size_t size) | 20 | void android_memset16(uint16_t* dst, uint16_t value, size_t size) |
20 | { | 21 | { |
21 | size >>= 1; | 22 | size >>= 1; |
@@ -23,7 +24,9 @@ void android_memset16(uint16_t* dst, uint16_t value, size_t size) | |||
23 | *dst++ = value; | 24 | *dst++ = value; |
24 | } | 25 | } |
25 | } | 26 | } |
27 | #endif | ||
26 | 28 | ||
29 | #if !HAVE_MEMSET32 | ||
27 | void android_memset32(uint32_t* dst, uint32_t value, size_t size) | 30 | void android_memset32(uint32_t* dst, uint32_t value, size_t size) |
28 | { | 31 | { |
29 | size >>= 2; | 32 | size >>= 2; |
@@ -31,6 +34,7 @@ void android_memset32(uint32_t* dst, uint32_t value, size_t size) | |||
31 | *dst++ = value; | 34 | *dst++ = value; |
32 | } | 35 | } |
33 | } | 36 | } |
37 | #endif | ||
34 | 38 | ||
35 | #if !HAVE_STRLCPY | 39 | #if !HAVE_STRLCPY |
36 | /* | 40 | /* |