author     Elliott Hughes  2014-12-14 19:52:32 -0600
committer  Elliott Hughes  2014-12-14 19:52:32 -0600
commit     f92cc305711c7a64582f0f71f691f18262435e94 (patch)
tree       a5d11783288209464659e5294189b1224e1fe271
parent     0e32e39df0e487ec86d86627f1d4b43d1c8c524d (diff)
Remove arm assembler not referenced from any makefile.
I also suspect that libc/arch-arm/bionic/memcmp.S is supposed to live in the generic directory these days, but this change just removes dead code.

Change-Id: I9072488df6e9b7261d79b6014914a0e937cb387b
-rw-r--r--  libc/arch-arm/bionic/memcpy.S     686
-rw-r--r--  libc/arch-arm/bionic/memcpy.a9.S  614
-rw-r--r--  libc/arch-arm/bionic/strcmp.S     317
3 files changed, 0 insertions, 1617 deletions
diff --git a/libc/arch-arm/bionic/memcpy.S b/libc/arch-arm/bionic/memcpy.S
deleted file mode 100644
index 2c9b10c1..00000000
--- a/libc/arch-arm/bionic/memcpy.S
+++ /dev/null
@@ -1,686 +0,0 @@
1/*
2 * Copyright (C) 2008 The Android Open Source Project
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * * Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in
12 * the documentation and/or other materials provided with the
13 * distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
18 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
19 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
20 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
21 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
22 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
23 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
25 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 */
28
29#include <machine/cpu-features.h>
30#include <private/bionic_asm.h>
31
32#if defined(__ARM_NEON__) && !defined(ARCH_ARM_USE_NON_NEON_MEMCPY)
33
34 .text
35 .fpu neon
36
37#ifdef HAVE_32_BYTE_CACHE_LINE
38/* a prefetch distance of 2 cache-lines */
39#define CACHE_LINE_SIZE 32
40#else
41/* a prefetch distance of 4 cache-lines works best experimentally */
42#define CACHE_LINE_SIZE 64
43#endif
44
45ENTRY(memcpy)
46 .save {r0, lr}
47 /* start preloading as early as possible */
48 pld [r1, #(CACHE_LINE_SIZE * 0)]
49 stmfd sp!, {r0, lr}
50 pld [r1, #(CACHE_LINE_SIZE * 1)]
51
52/* If Neon supports unaligned access then remove the align code,
53 * unless a size limit has been specified.
54 */
55#ifndef NEON_UNALIGNED_ACCESS
56 /* do we have at least 16-bytes to copy (needed for alignment below) */
57 cmp r2, #16
58 blo 5f
59
60 /* check if buffers are aligned. If so, run arm-only version */
61 eor r3, r0, r1
62 ands r3, r3, #0x3
63 beq 11f
64
65 /* align destination to cache-line for the write-buffer */
66 rsb r3, r0, #0
67 ands r3, r3, #0xF
68 beq 2f
69
70 /* copy up to 15-bytes (count in r3) */
71 sub r2, r2, r3
72 movs ip, r3, lsl #31
73 ldrmib lr, [r1], #1
74 strmib lr, [r0], #1
75 ldrcsb ip, [r1], #1
76 ldrcsb lr, [r1], #1
77 strcsb ip, [r0], #1
78 strcsb lr, [r0], #1
79 movs ip, r3, lsl #29
80 bge 1f
81 // copies 4 bytes, destination 32-bits aligned
82 vld4.8 {d0[0], d1[0], d2[0], d3[0]}, [r1]!
83 vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0, :32]!
841: bcc 2f
85 // copies 8 bytes, destination 64-bits aligned
86 vld1.8 {d0}, [r1]!
87 vst1.8 {d0}, [r0, :64]!
882:
89 /* preload immediately the next cache line, which we may need */
90 pld [r1, #(CACHE_LINE_SIZE * 0)]
91 pld [r1, #(CACHE_LINE_SIZE * 1)]
92
93#ifdef HAVE_32_BYTE_CACHE_LINE
94 /* make sure we have at least 32 bytes to copy */
95 subs r2, r2, #32
96 blo 4f
97
98 /* preload all the cache lines we need.
99 * NOTE: the number of pld below depends on PREFETCH_DISTANCE,
100 * ideally we would increase the distance in the main loop to
101 * avoid the goofy code below. In practice this doesn't seem to make
102 * a big difference.
103 */
104 pld [r1, #(PREFETCH_DISTANCE)]
105
1061: /* The main loop copies 32 bytes at a time */
107 vld1.8 {d0 - d3}, [r1]!
108 pld [r1, #(PREFETCH_DISTANCE)]
109 subs r2, r2, #32
110 vst1.8 {d0 - d3}, [r0, :128]!
111 bhs 1b
112#else
113 /* make sure we have at least 64 bytes to copy */
114 subs r2, r2, #64
115 blo 2f
116
117 /* preload all the cache lines we need. */
118 pld [r1, #(CACHE_LINE_SIZE * 2)]
119 pld [r1, #(CACHE_LINE_SIZE * 3)]
120
1211: /* The main loop copies 64 bytes at a time */
122 vld1.8 {d0 - d3}, [r1]!
123 vld1.8 {d4 - d7}, [r1]!
124#ifdef HAVE_32_BYTE_CACHE_LINE
125 pld [r1, #(CACHE_LINE_SIZE * 2)]
126 pld [r1, #(CACHE_LINE_SIZE * 3)]
127#else
128 pld [r1, #(CACHE_LINE_SIZE * 3)]
129#endif
130 subs r2, r2, #64
131 vst1.8 {d0 - d3}, [r0, :128]!
132 vst1.8 {d4 - d7}, [r0, :128]!
133 bhs 1b
134
1352: /* fix-up the remaining count and make sure we have >= 32 bytes left */
136 add r2, r2, #64
137 subs r2, r2, #32
138 blo 4f
139
1403: /* 32 bytes at a time. These cache lines were already preloaded */
141 vld1.8 {d0 - d3}, [r1]!
142 subs r2, r2, #32
143 vst1.8 {d0 - d3}, [r0, :128]!
144 bhs 3b
145#endif
1464: /* less than 32 left */
147 add r2, r2, #32
148 tst r2, #0x10
149 beq 5f
150 // copies 16 bytes, 128-bits aligned
151 vld1.8 {d0, d1}, [r1]!
152 vst1.8 {d0, d1}, [r0, :128]!
1535: /* copy up to 15-bytes (count in r2) */
154 movs ip, r2, lsl #29
155 bcc 1f
156 vld1.8 {d0}, [r1]!
157 vst1.8 {d0}, [r0]!
1581: bge 2f
159 vld4.8 {d0[0], d1[0], d2[0], d3[0]}, [r1]!
160 vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]!
1612: movs ip, r2, lsl #31
162 ldrmib r3, [r1], #1
163 ldrcsb ip, [r1], #1
164 ldrcsb lr, [r1], #1
165 strmib r3, [r0], #1
166 strcsb ip, [r0], #1
167 strcsb lr, [r0], #1
168
169 ldmfd sp!, {r0, lr}
170 bx lr
171
172#else /* NEON_UNALIGNED_ACCESS */
173
174 // Check that the count is at least 16 bytes, needed for the alignment code.
175 cmp r2, #16
176 blo 5f
177
178#ifdef NEON_MEMCPY_ALIGNMENT_DIVIDER
179 /* Check the upper size limit for Neon unaligned memory access in memcpy */
180#if NEON_MEMCPY_ALIGNMENT_DIVIDER >= 16
181 cmp r2, #NEON_MEMCPY_ALIGNMENT_DIVIDER
182 blo 3f
183#endif
184 /* check if buffers are aligned. If so, run arm-only version */
185 eor r3, r0, r1
186 ands r3, r3, #0x3
187 beq 11f
188
189 /* align destination to 16 bytes for the write-buffer */
190 rsb r3, r0, #0
191 ands r3, r3, #0xF
192 beq 3f
193
194 /* copy up to 15-bytes (count in r3) */
195 sub r2, r2, r3
196 movs ip, r3, lsl #31
197 ldrmib lr, [r1], #1
198 strmib lr, [r0], #1
199 ldrcsb ip, [r1], #1
200 ldrcsb lr, [r1], #1
201 strcsb ip, [r0], #1
202 strcsb lr, [r0], #1
203 movs ip, r3, lsl #29
204 bge 1f
205 // copies 4 bytes, destination 32-bits aligned
206 vld1.32 {d0[0]}, [r1]!
207 vst1.32 {d0[0]}, [r0, :32]!
2081: bcc 2f
209 // copies 8 bytes, destination 64-bits aligned
210 vld1.8 {d0}, [r1]!
211 vst1.8 {d0}, [r0, :64]!
2122:
213 /* preload immediately the next cache line, which we may need */
214 pld [r1, #(CACHE_LINE_SIZE * 0)]
215 pld [r1, #(CACHE_LINE_SIZE * 1)]
2163:
217#endif
218 /* make sure we have at least 64 bytes to copy */
219 subs r2, r2, #64
220 blo 2f
221
222 /* preload all the cache lines we need */
223 pld [r1, #(CACHE_LINE_SIZE * 2)]
224 pld [r1, #(CACHE_LINE_SIZE * 3)]
225
2261: /* The main loop copies 64 bytes at a time */
227 vld1.8 {d0 - d3}, [r1]!
228 vld1.8 {d4 - d7}, [r1]!
229#ifdef HAVE_32_BYTE_CACHE_LINE
230 pld [r1, #(CACHE_LINE_SIZE * 2)]
231 pld [r1, #(CACHE_LINE_SIZE * 3)]
232#else
233 pld [r1, #(CACHE_LINE_SIZE * 3)]
234#endif
235 subs r2, r2, #64
236 vst1.8 {d0 - d3}, [r0]!
237 vst1.8 {d4 - d7}, [r0]!
238 bhs 1b
239
2402: /* fix-up the remaining count and make sure we have >= 32 bytes left */
241 add r2, r2, #64
242 subs r2, r2, #32
243 blo 4f
244
2453: /* 32 bytes at a time. These cache lines were already preloaded */
246 vld1.8 {d0 - d3}, [r1]!
247 subs r2, r2, #32
248 vst1.8 {d0 - d3}, [r0]!
249 bhs 3b
250
2514: /* less than 32 left */
252 add r2, r2, #32
253 tst r2, #0x10
254 beq 5f
255 // copies 16 bytes, 128-bits aligned
256 vld1.8 {d0, d1}, [r1]!
257 vst1.8 {d0, d1}, [r0]!
2585: /* copy up to 15-bytes (count in r2) */
259 movs ip, r2, lsl #29
260 bcc 1f
261 vld1.8 {d0}, [r1]!
262 vst1.8 {d0}, [r0]!
2631: bge 2f
264 vld1.32 {d0[0]}, [r1]!
265 vst1.32 {d0[0]}, [r0]!
2662: movs ip, r2, lsl #31
267 ldrmib r3, [r1], #1
268 ldrcsb ip, [r1], #1
269 ldrcsb lr, [r1], #1
270 strmib r3, [r0], #1
271 strcsb ip, [r0], #1
272 strcsb lr, [r0], #1
273
274 ldmfd sp!, {r0, lr}
275 bx lr
276#endif /* NEON_UNALIGNED_ACCESS */
27711:
278 /* Simple arm-only copy loop to handle aligned copy operations */
279 stmfd sp!, {r4, r5, r6, r7, r8}
280 pld [r1, #(CACHE_LINE_SIZE * 2)]
281
282 /* Check alignment */
283 rsb r3, r1, #0
284 ands r3, #3
285 beq 2f
286
287 /* align source to 32 bits. We need to insert 2 instructions between
288 * a ldr[b|h] and str[b|h] because byte and half-word instructions
289 * stall 2 cycles.
290 */
291 movs r12, r3, lsl #31
292 sub r2, r2, r3 /* we know that r3 <= r2 because r2 >= 4 */
293 ldrmib r3, [r1], #1
294 ldrcsb r4, [r1], #1
295 ldrcsb r5, [r1], #1
296 strmib r3, [r0], #1
297 strcsb r4, [r0], #1
298 strcsb r5, [r0], #1
2992:
300 subs r2, #32
301 blt 5f
302 pld [r1, #(CACHE_LINE_SIZE * 3)]
3033: /* Main copy loop, copying 32 bytes at a time */
304 pld [r1, #(CACHE_LINE_SIZE * 4)]
305 ldmia r1!, {r3, r4, r5, r6, r7, r8, r12, lr}
306 subs r2, r2, #32
307 stmia r0!, {r3, r4, r5, r6, r7, r8, r12, lr}
308 bge 3b
3095: /* Handle any remaining bytes */
310 adds r2, #32
311 beq 6f
312
313 movs r12, r2, lsl #28
314 ldmcsia r1!, {r3, r4, r5, r6} /* 16 bytes */
315 ldmmiia r1!, {r7, r8} /* 8 bytes */
316 stmcsia r0!, {r3, r4, r5, r6}
317 stmmiia r0!, {r7, r8}
318 movs r12, r2, lsl #30
319 ldrcs r3, [r1], #4 /* 4 bytes */
320 ldrmih r4, [r1], #2 /* 2 bytes */
321 strcs r3, [r0], #4
322 strmih r4, [r0], #2
323 tst r2, #0x1
324 ldrneb r3, [r1] /* last byte */
325 strneb r3, [r0]
3266:
327 ldmfd sp!, {r4, r5, r6, r7, r8}
328 ldmfd sp!, {r0, pc}
329END(memcpy)
330
331
332#else /* __ARM_ARCH__ < 7 */
333
334
335 /*
336 * Optimized memcpy() for ARM.
337 *
338 * note that memcpy() always returns the destination pointer,
339 * so we have to preserve R0.
340 */
341
342ENTRY(memcpy)
343 /* The stack must always be 64-bits aligned to be compliant with the
344 * ARM ABI. Since we have to save R0, we might as well save R4
345 * which we can use for better pipelining of the reads below
346 */
347 .save {r0, r4, lr}
348 stmfd sp!, {r0, r4, lr}
349 /* Making room for r5-r11 which will be spilled later */
350 .pad #28
351 sub sp, sp, #28
352
353 // preload the destination because we'll align it to a cache line
354 // with small writes. Also start the source "pump".
355 pld [r0, #0]
356 pld [r1, #0]
357 pld [r1, #32]
358
359 /* it simplifies things to take care of len<4 early */
360 cmp r2, #4
361 blo copy_last_3_and_return
362
363 /* compute the offset to align the source
364 * offset = (4-(src&3))&3 = -src & 3
365 */
366 rsb r3, r1, #0
367 ands r3, r3, #3
368 beq src_aligned
369
370 /* align source to 32 bits. We need to insert 2 instructions between
371 * a ldr[b|h] and str[b|h] because byte and half-word instructions
372 * stall 2 cycles.
373 */
374 movs r12, r3, lsl #31
375 sub r2, r2, r3 /* we know that r3 <= r2 because r2 >= 4 */
376 ldrmib r3, [r1], #1
377 ldrcsb r4, [r1], #1
378 ldrcsb r12,[r1], #1
379 strmib r3, [r0], #1
380 strcsb r4, [r0], #1
381 strcsb r12,[r0], #1
382
383src_aligned:
384
385 /* see if src and dst are aligned together (congruent) */
386 eor r12, r0, r1
387 tst r12, #3
388 bne non_congruent
389
390 /* Use post-increment mode for stm to spill r5-r11 to reserved stack
391 * frame. Don't update sp.
392 */
393 stmea sp, {r5-r11}
394
395 /* align the destination to a cache-line */
396 rsb r3, r0, #0
397 ands r3, r3, #0x1C
398 beq congruent_aligned32
399 cmp r3, r2
400 andhi r3, r2, #0x1C
401
402 /* conditionally copies 0 to 7 words (length in r3) */
403 movs r12, r3, lsl #28
404 ldmcsia r1!, {r4, r5, r6, r7} /* 16 bytes */
405 ldmmiia r1!, {r8, r9} /* 8 bytes */
406 stmcsia r0!, {r4, r5, r6, r7}
407 stmmiia r0!, {r8, r9}
408 tst r3, #0x4
409 ldrne r10,[r1], #4 /* 4 bytes */
410 strne r10,[r0], #4
411 sub r2, r2, r3
412
413congruent_aligned32:
414 /*
415 * here source is aligned to 32 bytes.
416 */
417
418cached_aligned32:
419 subs r2, r2, #32
420 blo less_than_32_left
421
422 /*
423 * We preload a cache-line up to 64 bytes ahead. On the 926, this will
424 * stall only until the requested word is fetched, but the linefill
425 * continues in the background.
426 * While the linefill is going, we write our previous cache-line
427 * into the write-buffer (which should have some free space).
428 * When the linefill is done, the writebuffer will
429 * start dumping its content into memory
430 *
431 * While all this is going, we then load a full cache line into
432 * 8 registers, this cache line should be in the cache by now
433 * (or partly in the cache).
434 *
435 * This code should work well regardless of the source/dest alignment.
436 *
437 */
438
439 // Align the preload register to a cache-line because the cpu does
440 // "critical word first" (the first word requested is loaded first).
441 bic r12, r1, #0x1F
442 add r12, r12, #64
443
4441: ldmia r1!, { r4-r11 }
445 pld [r12, #64]
446 subs r2, r2, #32
447
448 // NOTE: if r12 is more than 64 ahead of r1, the following ldrhi
449 // for ARM9 preload will not be safely guarded by the preceding subs.
450 // When it is safely guarded the only possibility to have SIGSEGV here
451 // is because the caller overstates the length.
452 ldrhi r3, [r12], #32 /* cheap ARM9 preload */
453 stmia r0!, { r4-r11 }
454 bhs 1b
455
456 add r2, r2, #32
457
458
459
460
461less_than_32_left:
462 /*
463 * less than 32 bytes left at this point (length in r2)
464 */
465
466 /* skip all this if there is nothing to do, which should
467 * be a common case (if not executed the code below takes
468 * about 16 cycles)
469 */
470 tst r2, #0x1F
471 beq 1f
472
473 /* conditionally copies 0 to 31 bytes */
474 movs r12, r2, lsl #28
475 ldmcsia r1!, {r4, r5, r6, r7} /* 16 bytes */
476 ldmmiia r1!, {r8, r9} /* 8 bytes */
477 stmcsia r0!, {r4, r5, r6, r7}
478 stmmiia r0!, {r8, r9}
479 movs r12, r2, lsl #30
480 ldrcs r3, [r1], #4 /* 4 bytes */
481 ldrmih r4, [r1], #2 /* 2 bytes */
482 strcs r3, [r0], #4
483 strmih r4, [r0], #2
484 tst r2, #0x1
485 ldrneb r3, [r1] /* last byte */
486 strneb r3, [r0]
487
488 /* we're done! restore everything and return */
4891: ldmfd sp!, {r5-r11}
490 ldmfd sp!, {r0, r4, lr}
491 bx lr
492
493 /********************************************************************/
494
495non_congruent:
496 /*
497 * here source is aligned to 4 bytes
498 * but destination is not.
499 *
500 * in the code below r2 is the number of bytes read
501 * (the number of bytes written is always smaller, because we have
502 * partial words in the shift queue)
503 */
504 cmp r2, #4
505 blo copy_last_3_and_return
506
507 /* Use post-increment mode for stm to spill r5-r11 to reserved stack
508 * frame. Don't update sp.
509 */
510 stmea sp, {r5-r11}
511
512 /* compute shifts needed to align src to dest */
513 rsb r5, r0, #0
514 and r5, r5, #3 /* r5 = # bytes in partial words */
515 mov r12, r5, lsl #3 /* r12 = right */
516 rsb lr, r12, #32 /* lr = left */
517
518 /* read the first word */
519 ldr r3, [r1], #4
520 sub r2, r2, #4
521
522 /* write a partial word (0 to 3 bytes), such that destination
523 * becomes aligned to 32 bits (r5 = nb of words to copy for alignment)
524 */
525 movs r5, r5, lsl #31
526 strmib r3, [r0], #1
527 movmi r3, r3, lsr #8
528 strcsb r3, [r0], #1
529 movcs r3, r3, lsr #8
530 strcsb r3, [r0], #1
531 movcs r3, r3, lsr #8
532
533 cmp r2, #4
534 blo partial_word_tail
535
536 /* Align destination to 32 bytes (cache line boundary) */
5371: tst r0, #0x1c
538 beq 2f
539 ldr r5, [r1], #4
540 sub r2, r2, #4
541 orr r4, r3, r5, lsl lr
542 mov r3, r5, lsr r12
543 str r4, [r0], #4
544 cmp r2, #4
545 bhs 1b
546 blo partial_word_tail
547
548 /* copy 32 bytes at a time */
5492: subs r2, r2, #32
550 blo less_than_thirtytwo
551
552 /* Use immediate mode for the shifts, because there is an extra cycle
553 * for register shifts, which could account for up to 50% of
554 * performance hit.
555 */
556
557 cmp r12, #24
558 beq loop24
559 cmp r12, #8
560 beq loop8
561
562loop16:
563 ldr r12, [r1], #4
5641: mov r4, r12
565 ldmia r1!, { r5,r6,r7, r8,r9,r10,r11}
566 pld [r1, #64]
567 subs r2, r2, #32
568 ldrhs r12, [r1], #4
569 orr r3, r3, r4, lsl #16
570 mov r4, r4, lsr #16
571 orr r4, r4, r5, lsl #16
572 mov r5, r5, lsr #16
573 orr r5, r5, r6, lsl #16
574 mov r6, r6, lsr #16
575 orr r6, r6, r7, lsl #16
576 mov r7, r7, lsr #16
577 orr r7, r7, r8, lsl #16
578 mov r8, r8, lsr #16
579 orr r8, r8, r9, lsl #16
580 mov r9, r9, lsr #16
581 orr r9, r9, r10, lsl #16
582 mov r10, r10, lsr #16
583 orr r10, r10, r11, lsl #16
584 stmia r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
585 mov r3, r11, lsr #16
586 bhs 1b
587 b less_than_thirtytwo
588
589loop8:
590 ldr r12, [r1], #4
5911: mov r4, r12
592 ldmia r1!, { r5,r6,r7, r8,r9,r10,r11}
593 pld [r1, #64]
594 subs r2, r2, #32
595 ldrhs r12, [r1], #4
596 orr r3, r3, r4, lsl #24
597 mov r4, r4, lsr #8
598 orr r4, r4, r5, lsl #24
599 mov r5, r5, lsr #8
600 orr r5, r5, r6, lsl #24
601 mov r6, r6, lsr #8
602 orr r6, r6, r7, lsl #24
603 mov r7, r7, lsr #8
604 orr r7, r7, r8, lsl #24
605 mov r8, r8, lsr #8
606 orr r8, r8, r9, lsl #24
607 mov r9, r9, lsr #8
608 orr r9, r9, r10, lsl #24
609 mov r10, r10, lsr #8
610 orr r10, r10, r11, lsl #24
611 stmia r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
612 mov r3, r11, lsr #8
613 bhs 1b
614 b less_than_thirtytwo
615
616loop24:
617 ldr r12, [r1], #4
6181: mov r4, r12
619 ldmia r1!, { r5,r6,r7, r8,r9,r10,r11}
620 pld [r1, #64]
621 subs r2, r2, #32
622 ldrhs r12, [r1], #4
623 orr r3, r3, r4, lsl #8
624 mov r4, r4, lsr #24
625 orr r4, r4, r5, lsl #8
626 mov r5, r5, lsr #24
627 orr r5, r5, r6, lsl #8
628 mov r6, r6, lsr #24
629 orr r6, r6, r7, lsl #8
630 mov r7, r7, lsr #24
631 orr r7, r7, r8, lsl #8
632 mov r8, r8, lsr #24
633 orr r8, r8, r9, lsl #8
634 mov r9, r9, lsr #24
635 orr r9, r9, r10, lsl #8
636 mov r10, r10, lsr #24
637 orr r10, r10, r11, lsl #8
638 stmia r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
639 mov r3, r11, lsr #24
640 bhs 1b
641
642
643less_than_thirtytwo:
644 /* copy the last 0 to 31 bytes of the source */
645 rsb r12, lr, #32 /* we corrupted r12, recompute it */
646 add r2, r2, #32
647 cmp r2, #4
648 blo partial_word_tail
649
6501: ldr r5, [r1], #4
651 sub r2, r2, #4
652 orr r4, r3, r5, lsl lr
653 mov r3, r5, lsr r12
654 str r4, [r0], #4
655 cmp r2, #4
656 bhs 1b
657
658partial_word_tail:
659 /* we have a partial word in the input buffer */
660 movs r5, lr, lsl #(31-3)
661 strmib r3, [r0], #1
662 movmi r3, r3, lsr #8
663 strcsb r3, [r0], #1
664 movcs r3, r3, lsr #8
665 strcsb r3, [r0], #1
666
667 /* Refill spilled registers from the stack. Don't update sp. */
668 ldmfd sp, {r5-r11}
669
670copy_last_3_and_return:
671 movs r2, r2, lsl #31 /* copy remaining 0, 1, 2 or 3 bytes */
672 ldrmib r2, [r1], #1
673 ldrcsb r3, [r1], #1
674 ldrcsb r12,[r1]
675 strmib r2, [r0], #1
676 strcsb r3, [r0], #1
677 strcsb r12,[r0]
678
679 /* we're done! restore sp and spilled registers and return */
680 add sp, sp, #28
681 ldmfd sp!, {r0, r4, lr}
682 bx lr
683END(memcpy)
684
685
686#endif /* __ARM_ARCH__ < 7 */
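
The small-copy tails in the file above (the 5: and 2: labels in both NEON paths) use the movs ... lsl #29 and lsl #31 flag tricks to turn bits 3/2 and 1/0 of the remaining count into predicated 8-, 4-, 2- and 1-byte copies. A minimal C sketch of that idea, for orientation only (copy_tail is an invented name and is not part of the removed file):

    #include <stddef.h>

    /* Copy the final n < 16 bytes: each bit of n enables one block, just as
     * the assembly exposes bit 3/2 via lsl #29 and bit 1/0 via lsl #31 and
     * predicates the loads/stores on the resulting C and N flags. */
    static void copy_tail(unsigned char *dst, const unsigned char *src, size_t n)
    {
        size_t i;
        if (n & 8) { for (i = 0; i < 8; i++) *dst++ = *src++; }  /* vld1.8 {d0}         */
        if (n & 4) { for (i = 0; i < 4; i++) *dst++ = *src++; }  /* vld1.32 {d0[0]}     */
        if (n & 2) { *dst++ = *src++; *dst++ = *src++; }         /* ldrcsb/strcsb pair  */
        if (n & 1) { *dst = *src; }                              /* ldrmib/strmib       */
    }
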
diff --git a/libc/arch-arm/bionic/memcpy.a9.S b/libc/arch-arm/bionic/memcpy.a9.S
deleted file mode 100644
index 259701d6..00000000
--- a/libc/arch-arm/bionic/memcpy.a9.S
+++ /dev/null
@@ -1,614 +0,0 @@
1/* Copyright (c) 2013, Linaro Limited
2 All rights reserved.
3
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions
6 are met:
7
8 * Redistributions of source code must retain the above copyright
9 notice, this list of conditions and the following disclaimer.
10
11 * Redistributions in binary form must reproduce the above copyright
12 notice, this list of conditions and the following disclaimer in the
13 documentation and/or other materials provided with the distribution.
14
15 * Neither the name of Linaro Limited nor the names of its
16 contributors may be used to endorse or promote products derived
17 from this software without specific prior written permission.
18
19 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
31 */
32
33/*
34 This memcpy routine is optimised for Cortex-A15 cores and takes advantage
35 of VFP or NEON when built with the appropriate flags.
36
37 Assumptions:
38
39 ARMv6 (ARMv7-a if using Neon)
40 ARM state
41 Unaligned accesses
42 LDRD/STRD support unaligned word accesses
43
44 */
45
46#include <machine/cpu-features.h>
47#include <private/bionic_asm.h>
48
49 .syntax unified
50 /* This implementation requires ARM state. */
51 .arm
52
53#ifdef __ARM_NEON__
54
55 .fpu neon
56 .arch armv7-a
57# define FRAME_SIZE 4
58# define USE_VFP
59# define USE_NEON
60
61#elif !defined (__SOFTFP__)
62
63 .arch armv6
64 .fpu vfpv2
65# define FRAME_SIZE 32
66# define USE_VFP
67
68#else
69 .arch armv6
70# define FRAME_SIZE 32
71
72#endif
73
74/* Old versions of GAS incorrectly implement the NEON align semantics. */
75#ifdef BROKEN_ASM_NEON_ALIGN
76#define ALIGN(addr, align) addr,:align
77#else
78#define ALIGN(addr, align) addr:align
79#endif
80
81#define PC_OFFSET 8 /* PC pipeline compensation. */
82#define INSN_SIZE 4
83
84/* Call parameters. */
85#define dstin r0
86#define src r1
87#define count r2
88
89/* Locals. */
90#define tmp1 r3
91#define dst ip
92#define tmp2 r10
93
94#ifndef USE_NEON
95/* For bulk copies using GP registers. */
96#define A_l r2 /* Call-clobbered. */
97#define A_h r3 /* Call-clobbered. */
98#define B_l r4
99#define B_h r5
100#define C_l r6
101#define C_h r7
102#define D_l r8
103#define D_h r9
104#endif
105
106/* Number of lines ahead to pre-fetch data. If you change this the code
107 below will need adjustment to compensate. */
108
109#define prefetch_lines 5
110
111#ifdef USE_VFP
112 .macro cpy_line_vfp vreg, base
113 vstr \vreg, [dst, #\base]
114 vldr \vreg, [src, #\base]
115 vstr d0, [dst, #\base + 8]
116 vldr d0, [src, #\base + 8]
117 vstr d1, [dst, #\base + 16]
118 vldr d1, [src, #\base + 16]
119 vstr d2, [dst, #\base + 24]
120 vldr d2, [src, #\base + 24]
121 vstr \vreg, [dst, #\base + 32]
122 vldr \vreg, [src, #\base + prefetch_lines * 64 - 32]
123 vstr d0, [dst, #\base + 40]
124 vldr d0, [src, #\base + 40]
125 vstr d1, [dst, #\base + 48]
126 vldr d1, [src, #\base + 48]
127 vstr d2, [dst, #\base + 56]
128 vldr d2, [src, #\base + 56]
129 .endm
130
131 .macro cpy_tail_vfp vreg, base
132 vstr \vreg, [dst, #\base]
133 vldr \vreg, [src, #\base]
134 vstr d0, [dst, #\base + 8]
135 vldr d0, [src, #\base + 8]
136 vstr d1, [dst, #\base + 16]
137 vldr d1, [src, #\base + 16]
138 vstr d2, [dst, #\base + 24]
139 vldr d2, [src, #\base + 24]
140 vstr \vreg, [dst, #\base + 32]
141 vstr d0, [dst, #\base + 40]
142 vldr d0, [src, #\base + 40]
143 vstr d1, [dst, #\base + 48]
144 vldr d1, [src, #\base + 48]
145 vstr d2, [dst, #\base + 56]
146 vldr d2, [src, #\base + 56]
147 .endm
148#endif
149
150 .p2align 6
151ENTRY(memcpy)
152
153 mov dst, dstin /* Preserve dstin, we need to return it. */
154 cmp count, #64
155 bge .Lcpy_not_short
156 /* Deal with small copies quickly by dropping straight into the
157 exit block. */
158
159.Ltail63unaligned:
160#ifdef USE_NEON
161 and tmp1, count, #0x38
162 rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
163 add pc, pc, tmp1
164 vld1.8 {d0}, [src]! /* 14 words to go. */
165 vst1.8 {d0}, [dst]!
166 vld1.8 {d0}, [src]! /* 12 words to go. */
167 vst1.8 {d0}, [dst]!
168 vld1.8 {d0}, [src]! /* 10 words to go. */
169 vst1.8 {d0}, [dst]!
170 vld1.8 {d0}, [src]! /* 8 words to go. */
171 vst1.8 {d0}, [dst]!
172 vld1.8 {d0}, [src]! /* 6 words to go. */
173 vst1.8 {d0}, [dst]!
174 vld1.8 {d0}, [src]! /* 4 words to go. */
175 vst1.8 {d0}, [dst]!
176 vld1.8 {d0}, [src]! /* 2 words to go. */
177 vst1.8 {d0}, [dst]!
178
179 tst count, #4
180 ldrne tmp1, [src], #4
181 strne tmp1, [dst], #4
182#else
183 /* Copy up to 15 full words of data. May not be aligned. */
184 /* Cannot use VFP for unaligned data. */
185 and tmp1, count, #0x3c
186 add dst, dst, tmp1
187 add src, src, tmp1
188 rsb tmp1, tmp1, #(60 - PC_OFFSET/2 + INSN_SIZE/2)
189 /* Jump directly into the sequence below at the correct offset. */
190 add pc, pc, tmp1, lsl #1
191
192 ldr tmp1, [src, #-60] /* 15 words to go. */
193 str tmp1, [dst, #-60]
194
195 ldr tmp1, [src, #-56] /* 14 words to go. */
196 str tmp1, [dst, #-56]
197 ldr tmp1, [src, #-52]
198 str tmp1, [dst, #-52]
199
200 ldr tmp1, [src, #-48] /* 12 words to go. */
201 str tmp1, [dst, #-48]
202 ldr tmp1, [src, #-44]
203 str tmp1, [dst, #-44]
204
205 ldr tmp1, [src, #-40] /* 10 words to go. */
206 str tmp1, [dst, #-40]
207 ldr tmp1, [src, #-36]
208 str tmp1, [dst, #-36]
209
210 ldr tmp1, [src, #-32] /* 8 words to go. */
211 str tmp1, [dst, #-32]
212 ldr tmp1, [src, #-28]
213 str tmp1, [dst, #-28]
214
215 ldr tmp1, [src, #-24] /* 6 words to go. */
216 str tmp1, [dst, #-24]
217 ldr tmp1, [src, #-20]
218 str tmp1, [dst, #-20]
219
220 ldr tmp1, [src, #-16] /* 4 words to go. */
221 str tmp1, [dst, #-16]
222 ldr tmp1, [src, #-12]
223 str tmp1, [dst, #-12]
224
225 ldr tmp1, [src, #-8] /* 2 words to go. */
226 str tmp1, [dst, #-8]
227 ldr tmp1, [src, #-4]
228 str tmp1, [dst, #-4]
229#endif
230
231 lsls count, count, #31
232 ldrhcs tmp1, [src], #2
233 ldrbne src, [src] /* Src is dead, use as a scratch. */
234 strhcs tmp1, [dst], #2
235 strbne src, [dst]
236 bx lr
237
238.Lcpy_not_short:
239 /* At least 64 bytes to copy, but don't know the alignment yet. */
240 str tmp2, [sp, #-FRAME_SIZE]!
241 and tmp2, src, #7
242 and tmp1, dst, #7
243 cmp tmp1, tmp2
244 bne .Lcpy_notaligned
245
246#ifdef USE_VFP
247 /* Magic dust alert! Force VFP on Cortex-A9. Experiments show
248 that the FP pipeline is much better at streaming loads and
249 stores. This is outside the critical loop. */
250 vmov.f32 s0, s0
251#endif
252
253 /* SRC and DST have the same mutual 32-bit alignment, but we may
254 still need to pre-copy some bytes to get to natural alignment.
255 We bring DST into full 64-bit alignment. */
256 lsls tmp2, dst, #29
257 beq 1f
258 rsbs tmp2, tmp2, #0
259 sub count, count, tmp2, lsr #29
260 ldrmi tmp1, [src], #4
261 strmi tmp1, [dst], #4
262 lsls tmp2, tmp2, #2
263 ldrhcs tmp1, [src], #2
264 ldrbne tmp2, [src], #1
265 strhcs tmp1, [dst], #2
266 strbne tmp2, [dst], #1
267
2681:
269 subs tmp2, count, #64 /* Use tmp2 for count. */
270 blt .Ltail63aligned
271
272 cmp tmp2, #512
273 bge .Lcpy_body_long
274
275.Lcpy_body_medium: /* Count in tmp2. */
276#ifdef USE_VFP
2771:
278 vldr d0, [src, #0]
279 subs tmp2, tmp2, #64
280 vldr d1, [src, #8]
281 vstr d0, [dst, #0]
282 vldr d0, [src, #16]
283 vstr d1, [dst, #8]
284 vldr d1, [src, #24]
285 vstr d0, [dst, #16]
286 vldr d0, [src, #32]
287 vstr d1, [dst, #24]
288 vldr d1, [src, #40]
289 vstr d0, [dst, #32]
290 vldr d0, [src, #48]
291 vstr d1, [dst, #40]
292 vldr d1, [src, #56]
293 vstr d0, [dst, #48]
294 add src, src, #64
295 vstr d1, [dst, #56]
296 add dst, dst, #64
297 bge 1b
298 tst tmp2, #0x3f
299 beq .Ldone
300
301.Ltail63aligned: /* Count in tmp2. */
302 and tmp1, tmp2, #0x38
303 add dst, dst, tmp1
304 add src, src, tmp1
305 rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
306 add pc, pc, tmp1
307
308 vldr d0, [src, #-56] /* 14 words to go. */
309 vstr d0, [dst, #-56]
310 vldr d0, [src, #-48] /* 12 words to go. */
311 vstr d0, [dst, #-48]
312 vldr d0, [src, #-40] /* 10 words to go. */
313 vstr d0, [dst, #-40]
314 vldr d0, [src, #-32] /* 8 words to go. */
315 vstr d0, [dst, #-32]
316 vldr d0, [src, #-24] /* 6 words to go. */
317 vstr d0, [dst, #-24]
318 vldr d0, [src, #-16] /* 4 words to go. */
319 vstr d0, [dst, #-16]
320 vldr d0, [src, #-8] /* 2 words to go. */
321 vstr d0, [dst, #-8]
322#else
323 sub src, src, #8
324 sub dst, dst, #8
3251:
326 ldrd A_l, A_h, [src, #8]
327 strd A_l, A_h, [dst, #8]
328 ldrd A_l, A_h, [src, #16]
329 strd A_l, A_h, [dst, #16]
330 ldrd A_l, A_h, [src, #24]
331 strd A_l, A_h, [dst, #24]
332 ldrd A_l, A_h, [src, #32]
333 strd A_l, A_h, [dst, #32]
334 ldrd A_l, A_h, [src, #40]
335 strd A_l, A_h, [dst, #40]
336 ldrd A_l, A_h, [src, #48]
337 strd A_l, A_h, [dst, #48]
338 ldrd A_l, A_h, [src, #56]
339 strd A_l, A_h, [dst, #56]
340 ldrd A_l, A_h, [src, #64]!
341 strd A_l, A_h, [dst, #64]!
342 subs tmp2, tmp2, #64
343 bge 1b
344 tst tmp2, #0x3f
345 bne 1f
346 ldr tmp2,[sp], #FRAME_SIZE
347 bx lr
3481:
349 add src, src, #8
350 add dst, dst, #8
351
352.Ltail63aligned: /* Count in tmp2. */
353 /* Copy up to 7 d-words of data. Similar to Ltail63unaligned, but
354 we know that the src and dest are 32-bit aligned so we can use
355 LDRD/STRD to improve efficiency. */
356 /* TMP2 is now negative, but we don't care about that. The bottom
357 six bits still tell us how many bytes are left to copy. */
358
359 and tmp1, tmp2, #0x38
360 add dst, dst, tmp1
361 add src, src, tmp1
362 rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
363 add pc, pc, tmp1
364 ldrd A_l, A_h, [src, #-56] /* 14 words to go. */
365 strd A_l, A_h, [dst, #-56]
366 ldrd A_l, A_h, [src, #-48] /* 12 words to go. */
367 strd A_l, A_h, [dst, #-48]
368 ldrd A_l, A_h, [src, #-40] /* 10 words to go. */
369 strd A_l, A_h, [dst, #-40]
370 ldrd A_l, A_h, [src, #-32] /* 8 words to go. */
371 strd A_l, A_h, [dst, #-32]
372 ldrd A_l, A_h, [src, #-24] /* 6 words to go. */
373 strd A_l, A_h, [dst, #-24]
374 ldrd A_l, A_h, [src, #-16] /* 4 words to go. */
375 strd A_l, A_h, [dst, #-16]
376 ldrd A_l, A_h, [src, #-8] /* 2 words to go. */
377 strd A_l, A_h, [dst, #-8]
378
379#endif
380 tst tmp2, #4
381 ldrne tmp1, [src], #4
382 strne tmp1, [dst], #4
383 lsls tmp2, tmp2, #31 /* Count (tmp2) now dead. */
384 ldrhcs tmp1, [src], #2
385 ldrbne tmp2, [src]
386 strhcs tmp1, [dst], #2
387 strbne tmp2, [dst]
388
389.Ldone:
390 ldr tmp2, [sp], #FRAME_SIZE
391 bx lr
392
393.Lcpy_body_long: /* Count in tmp2. */
394
395 /* Long copy. We know that there's at least (prefetch_lines * 64)
396 bytes to go. */
397#ifdef USE_VFP
398 /* Don't use PLD. Instead, read some data in advance of the current
399 copy position into a register. This should act like a PLD
400 operation but we won't have to repeat the transfer. */
401
402 vldr d3, [src, #0]
403 vldr d4, [src, #64]
404 vldr d5, [src, #128]
405 vldr d6, [src, #192]
406 vldr d7, [src, #256]
407
408 vldr d0, [src, #8]
409 vldr d1, [src, #16]
410 vldr d2, [src, #24]
411 add src, src, #32
412
413 subs tmp2, tmp2, #prefetch_lines * 64 * 2
414 blt 2f
4151:
416 cpy_line_vfp d3, 0
417 cpy_line_vfp d4, 64
418 cpy_line_vfp d5, 128
419 add dst, dst, #3 * 64
420 add src, src, #3 * 64
421 cpy_line_vfp d6, 0
422 cpy_line_vfp d7, 64
423 add dst, dst, #2 * 64
424 add src, src, #2 * 64
425 subs tmp2, tmp2, #prefetch_lines * 64
426 bge 1b
427
4282:
429 cpy_tail_vfp d3, 0
430 cpy_tail_vfp d4, 64
431 cpy_tail_vfp d5, 128
432 add src, src, #3 * 64
433 add dst, dst, #3 * 64
434 cpy_tail_vfp d6, 0
435 vstr d7, [dst, #64]
436 vldr d7, [src, #64]
437 vstr d0, [dst, #64 + 8]
438 vldr d0, [src, #64 + 8]
439 vstr d1, [dst, #64 + 16]
440 vldr d1, [src, #64 + 16]
441 vstr d2, [dst, #64 + 24]
442 vldr d2, [src, #64 + 24]
443 vstr d7, [dst, #64 + 32]
444 add src, src, #96
445 vstr d0, [dst, #64 + 40]
446 vstr d1, [dst, #64 + 48]
447 vstr d2, [dst, #64 + 56]
448 add dst, dst, #128
449 add tmp2, tmp2, #prefetch_lines * 64
450 b .Lcpy_body_medium
451#else
452 /* Long copy. Use an SMS style loop to maximize the I/O
453 bandwidth of the core. We don't have enough spare registers
454 to synthesise prefetching, so use PLD operations. */
455 /* Pre-bias src and dst. */
456 sub src, src, #8
457 sub dst, dst, #8
458 pld [src, #8]
459 pld [src, #72]
460 subs tmp2, tmp2, #64
461 pld [src, #136]
462 ldrd A_l, A_h, [src, #8]
463 strd B_l, B_h, [sp, #8]
464 ldrd B_l, B_h, [src, #16]
465 strd C_l, C_h, [sp, #16]
466 ldrd C_l, C_h, [src, #24]
467 strd D_l, D_h, [sp, #24]
468 pld [src, #200]
469 ldrd D_l, D_h, [src, #32]!
470 b 1f
471 .p2align 6
4722:
473 pld [src, #232]
474 strd A_l, A_h, [dst, #40]
475 ldrd A_l, A_h, [src, #40]
476 strd B_l, B_h, [dst, #48]
477 ldrd B_l, B_h, [src, #48]
478 strd C_l, C_h, [dst, #56]
479 ldrd C_l, C_h, [src, #56]
480 strd D_l, D_h, [dst, #64]!
481 ldrd D_l, D_h, [src, #64]!
482 subs tmp2, tmp2, #64
4831:
484 strd A_l, A_h, [dst, #8]
485 ldrd A_l, A_h, [src, #8]
486 strd B_l, B_h, [dst, #16]
487 ldrd B_l, B_h, [src, #16]
488 strd C_l, C_h, [dst, #24]
489 ldrd C_l, C_h, [src, #24]
490 strd D_l, D_h, [dst, #32]
491 ldrd D_l, D_h, [src, #32]
492 bcs 2b
493 /* Save the remaining bytes and restore the callee-saved regs. */
494 strd A_l, A_h, [dst, #40]
495 add src, src, #40
496 strd B_l, B_h, [dst, #48]
497 ldrd B_l, B_h, [sp, #8]
498 strd C_l, C_h, [dst, #56]
499 ldrd C_l, C_h, [sp, #16]
500 strd D_l, D_h, [dst, #64]
501 ldrd D_l, D_h, [sp, #24]
502 add dst, dst, #72
503 tst tmp2, #0x3f
504 bne .Ltail63aligned
505 ldr tmp2, [sp], #FRAME_SIZE
506 bx lr
507#endif
508
509.Lcpy_notaligned:
510 pld [src]
511 pld [src, #64]
512 /* There's at least 64 bytes to copy, but there is no mutual
513 alignment. */
514 /* Bring DST to 64-bit alignment. */
515 lsls tmp2, dst, #29
516 pld [src, #(2 * 64)]
517 beq 1f
518 rsbs tmp2, tmp2, #0
519 sub count, count, tmp2, lsr #29
520 ldrmi tmp1, [src], #4
521 strmi tmp1, [dst], #4
522 lsls tmp2, tmp2, #2
523 ldrbne tmp1, [src], #1
524 ldrhcs tmp2, [src], #2
525 strbne tmp1, [dst], #1
526 strhcs tmp2, [dst], #2
5271:
528 pld [src, #(3 * 64)]
529 subs count, count, #64
530 ldrmi tmp2, [sp], #FRAME_SIZE
531 bmi .Ltail63unaligned
532 pld [src, #(4 * 64)]
533
534#ifdef USE_NEON
535 vld1.8 {d0-d3}, [src]!
536 vld1.8 {d4-d7}, [src]!
537 subs count, count, #64
538 bmi 2f
5391:
540 pld [src, #(4 * 64)]
541 vst1.8 {d0-d3}, [ALIGN (dst, 64)]!
542 vld1.8 {d0-d3}, [src]!
543 vst1.8 {d4-d7}, [ALIGN (dst, 64)]!
544 vld1.8 {d4-d7}, [src]!
545 subs count, count, #64
546 bpl 1b
5472:
548 vst1.8 {d0-d3}, [ALIGN (dst, 64)]!
549 vst1.8 {d4-d7}, [ALIGN (dst, 64)]!
550 ands count, count, #0x3f
551#else
552 /* Use an SMS style loop to maximize the I/O bandwidth. */
553 sub src, src, #4
554 sub dst, dst, #8
555 subs tmp2, count, #64 /* Use tmp2 for count. */
556 ldr A_l, [src, #4]
557 ldr A_h, [src, #8]
558 strd B_l, B_h, [sp, #8]
559 ldr B_l, [src, #12]
560 ldr B_h, [src, #16]
561 strd C_l, C_h, [sp, #16]
562 ldr C_l, [src, #20]
563 ldr C_h, [src, #24]
564 strd D_l, D_h, [sp, #24]
565 ldr D_l, [src, #28]
566 ldr D_h, [src, #32]!
567 b 1f
568 .p2align 6
5692:
570 pld [src, #(5 * 64) - (32 - 4)]
571 strd A_l, A_h, [dst, #40]
572 ldr A_l, [src, #36]
573 ldr A_h, [src, #40]
574 strd B_l, B_h, [dst, #48]
575 ldr B_l, [src, #44]
576 ldr B_h, [src, #48]
577 strd C_l, C_h, [dst, #56]
578 ldr C_l, [src, #52]
579 ldr C_h, [src, #56]
580 strd D_l, D_h, [dst, #64]!
581 ldr D_l, [src, #60]
582 ldr D_h, [src, #64]!
583 subs tmp2, tmp2, #64
5841:
585 strd A_l, A_h, [dst, #8]
586 ldr A_l, [src, #4]
587 ldr A_h, [src, #8]
588 strd B_l, B_h, [dst, #16]
589 ldr B_l, [src, #12]
590 ldr B_h, [src, #16]
591 strd C_l, C_h, [dst, #24]
592 ldr C_l, [src, #20]
593 ldr C_h, [src, #24]
594 strd D_l, D_h, [dst, #32]
595 ldr D_l, [src, #28]
596 ldr D_h, [src, #32]
597 bcs 2b
598
599 /* Save the remaining bytes and restore the callee-saved regs. */
600 strd A_l, A_h, [dst, #40]
601 add src, src, #36
602 strd B_l, B_h, [dst, #48]
603 ldrd B_l, B_h, [sp, #8]
604 strd C_l, C_h, [dst, #56]
605 ldrd C_l, C_h, [sp, #16]
606 strd D_l, D_h, [dst, #64]
607 ldrd D_l, D_h, [sp, #24]
608 add dst, dst, #72
609 ands count, tmp2, #0x3f
610#endif
611 ldr tmp2, [sp], #FRAME_SIZE
612 bne .Ltail63unaligned
613 bx lr
614END(memcpy)
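
memcpy.a9.S above follows a head/bulk/tail shape: copy a few leading bytes so the destination reaches 64-bit alignment, stream 64 bytes per iteration (LDRD/STRD or NEON, with data read ahead of the copy position instead of PLD in the VFP path), then finish the sub-64-byte tail. A rough C outline of that structure, offered as a sketch rather than a transcription (memcpy_outline is an invented name):

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    void *memcpy_outline(void *dstin, const void *srcin, size_t count)
    {
        unsigned char *dst = dstin;
        const unsigned char *src = srcin;

        if (count >= 64) {
            /* Head: advance dst to an 8-byte boundary (0..7 bytes). */
            size_t head = (size_t)(-(uintptr_t)dst & 7);
            for (size_t i = 0; i < head; i++) *dst++ = *src++;
            count -= head;

            /* Bulk: 64 bytes per iteration; memcpy stands in for the
             * unrolled LDRD/STRD or NEON block in the assembly. */
            while (count >= 64) {
                memcpy(dst, src, 64);
                dst += 64; src += 64; count -= 64;
            }
        }
        /* Tail: whatever is left (the .Ltail63* paths in the assembly). */
        while (count--) *dst++ = *src++;
        return dstin;
    }
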
diff --git a/libc/arch-arm/bionic/strcmp.S b/libc/arch-arm/bionic/strcmp.S
deleted file mode 100644
index 6dba942d..00000000
--- a/libc/arch-arm/bionic/strcmp.S
+++ /dev/null
@@ -1,317 +0,0 @@
1/*
2 * Copyright (c) 2011 The Android Open Source Project
3 * Copyright (c) 2008 ARM Ltd
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 * 3. The name of the company may not be used to endorse or promote
15 * products derived from this software without specific prior written
16 * permission.
17 *
18 * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
19 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
20 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
21 * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
23 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
24 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
25 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
26 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
27 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 */
29
30#include <machine/cpu-features.h>
31#include <private/bionic_asm.h>
32
33 .text
34
35#ifdef __ARMEB__
36#define SHFT2LSB lsl
37#define SHFT2LSBEQ lsleq
38#define SHFT2MSB lsr
39#define SHFT2MSBEQ lsreq
40#define MSB 0x000000ff
41#define LSB 0xff000000
42#else
43#define SHFT2LSB lsr
44#define SHFT2LSBEQ lsreq
45#define SHFT2MSB lsl
46#define SHFT2MSBEQ lsleq
47#define MSB 0xff000000
48#define LSB 0x000000ff
49#endif
50
51#define magic1(REG) REG
52#define magic2(REG) REG, lsl #7
53
54ENTRY(strcmp)
55 pld [r0, #0]
56 pld [r1, #0]
57 eor r2, r0, r1
58 tst r2, #3
59
60 /* Strings not at same byte offset from a word boundary. */
61 bne .Lstrcmp_unaligned
62 ands r2, r0, #3
63 bic r0, r0, #3
64 bic r1, r1, #3
65 ldr ip, [r0], #4
66 it eq
67 ldreq r3, [r1], #4
68 beq 1f
69
70 /* Although s1 and s2 have identical initial alignment, they are
71 * not currently word aligned. Rather than comparing bytes,
72 * make sure that any bytes fetched from before the addressed
73 * bytes are forced to 0xff. Then they will always compare
74 * equal.
75 */
76 eor r2, r2, #3
77 lsl r2, r2, #3
78 mvn r3, #MSB
79 SHFT2LSB r2, r3, r2
80 ldr r3, [r1], #4
81 orr ip, ip, r2
82 orr r3, r3, r2
831:
84 /* Load the 'magic' constant 0x01010101. */
85 str r4, [sp, #-4]!
86 mov r4, #1
87 orr r4, r4, r4, lsl #8
88 orr r4, r4, r4, lsl #16
89 .p2align 2
904:
91 pld [r0, #8]
92 pld [r1, #8]
93 sub r2, ip, magic1(r4)
94 cmp ip, r3
95 itttt eq
96
97 /* check for any zero bytes in first word */
98 biceq r2, r2, ip
99 tsteq r2, magic2(r4)
100 ldreq ip, [r0], #4
101 ldreq r3, [r1], #4
102 beq 4b
1032:
104 /* There's a zero or a different byte in the word */
105 SHFT2MSB r0, ip, #24
106 SHFT2LSB ip, ip, #8
107 cmp r0, #1
108 it cs
109 cmpcs r0, r3, SHFT2MSB #24
110 it eq
111 SHFT2LSBEQ r3, r3, #8
112 beq 2b
113 /* On a big-endian machine, r0 contains the desired byte in bits
114 * 0-7; on a little-endian machine they are in bits 24-31. In
115 * both cases the other bits in r0 are all zero. For r3 the
116 * interesting byte is at the other end of the word, but the
117 * other bits are not necessarily zero. We need a signed result
118 * representing the difference in the unsigned bytes, so for the
119 * little-endian case we can't just shift the interesting bits up.
120 */
121#ifdef __ARMEB__
122 sub r0, r0, r3, lsr #24
123#else
124 and r3, r3, #255
125 /* No RSB instruction in Thumb2 */
126#ifdef __thumb2__
127 lsr r0, r0, #24
128 sub r0, r0, r3
129#else
130 rsb r0, r3, r0, lsr #24
131#endif
132#endif
133 ldr r4, [sp], #4
134 bx lr
135
136.Lstrcmp_unaligned:
137 wp1 .req r0
138 wp2 .req r1
139 b1 .req r2
140 w1 .req r4
141 w2 .req r5
142 t1 .req ip
143 @ r3 is scratch
144
145 /* First of all, compare bytes until wp1(sp1) is word-aligned. */
1461:
147 tst wp1, #3
148 beq 2f
149 ldrb r2, [wp1], #1
150 ldrb r3, [wp2], #1
151 cmp r2, #1
152 it cs
153 cmpcs r2, r3
154 beq 1b
155 sub r0, r2, r3
156 bx lr
157
1582:
159 str r5, [sp, #-4]!
160 str r4, [sp, #-4]!
161 mov b1, #1
162 orr b1, b1, b1, lsl #8
163 orr b1, b1, b1, lsl #16
164
165 and t1, wp2, #3
166 bic wp2, wp2, #3
167 ldr w1, [wp1], #4
168 ldr w2, [wp2], #4
169 cmp t1, #2
170 beq 2f
171 bhi 3f
172
173 /* Critical inner Loop: Block with 3 bytes initial overlap */
174 .p2align 2
1751:
176 bic t1, w1, #MSB
177 cmp t1, w2, SHFT2LSB #8
178 sub r3, w1, b1
179 bic r3, r3, w1
180 bne 4f
181 ands r3, r3, b1, lsl #7
182 it eq
183 ldreq w2, [wp2], #4
184 bne 5f
185 eor t1, t1, w1
186 cmp t1, w2, SHFT2MSB #24
187 bne 6f
188 ldr w1, [wp1], #4
189 b 1b
1904:
191 SHFT2LSB w2, w2, #8
192 b 8f
193
1945:
195#ifdef __ARMEB__
196 /* The syndrome value may contain false ones if the string ends
197 * with the bytes 0x01 0x00
198 */
199 tst w1, #0xff000000
200 itt ne
201 tstne w1, #0x00ff0000
202 tstne w1, #0x0000ff00
203 beq 7f
204#else
205 bics r3, r3, #0xff000000
206 bne 7f
207#endif
208 ldrb w2, [wp2]
209 SHFT2LSB t1, w1, #24
210#ifdef __ARMEB__
211 lsl w2, w2, #24
212#endif
213 b 8f
214
2156:
216 SHFT2LSB t1, w1, #24
217 and w2, w2, #LSB
218 b 8f
219
220 /* Critical inner Loop: Block with 2 bytes initial overlap */
221 .p2align 2
2222:
223 SHFT2MSB t1, w1, #16
224 sub r3, w1, b1
225 SHFT2LSB t1, t1, #16
226 bic r3, r3, w1
227 cmp t1, w2, SHFT2LSB #16
228 bne 4f
229 ands r3, r3, b1, lsl #7
230 it eq
231 ldreq w2, [wp2], #4
232 bne 5f
233 eor t1, t1, w1
234 cmp t1, w2, SHFT2MSB #16
235 bne 6f
236 ldr w1, [wp1], #4
237 b 2b
238
2395:
240#ifdef __ARMEB__
241 /* The syndrome value may contain false ones if the string ends
242 * with the bytes 0x01 0x00
243 */
244 tst w1, #0xff000000
245 it ne
246 tstne w1, #0x00ff0000
247 beq 7f
248#else
249 lsls r3, r3, #16
250 bne 7f
251#endif
252 ldrh w2, [wp2]
253 SHFT2LSB t1, w1, #16
254#ifdef __ARMEB__
255 lsl w2, w2, #16
256#endif
257 b 8f
258
2596:
260 SHFT2MSB w2, w2, #16
261 SHFT2LSB t1, w1, #16
2624:
263 SHFT2LSB w2, w2, #16
264 b 8f
265
266 /* Critical inner Loop: Block with 1 byte initial overlap */
267 .p2align 2
2683:
269 and t1, w1, #LSB
270 cmp t1, w2, SHFT2LSB #24
271 sub r3, w1, b1
272 bic r3, r3, w1
273 bne 4f
274 ands r3, r3, b1, lsl #7
275 it eq
276 ldreq w2, [wp2], #4
277 bne 5f
278 eor t1, t1, w1
279 cmp t1, w2, SHFT2MSB #8
280 bne 6f
281 ldr w1, [wp1], #4
282 b 3b
2834:
284 SHFT2LSB w2, w2, #24
285 b 8f
2865:
287 /* The syndrome value may contain false ones if the string ends
288 * with the bytes 0x01 0x00
289 */
290 tst w1, #LSB
291 beq 7f
292 ldr w2, [wp2], #4
2936:
294 SHFT2LSB t1, w1, #8
295 bic w2, w2, #MSB
296 b 8f
2977:
298 mov r0, #0
299 ldr r4, [sp], #4
300 ldr r5, [sp], #4
301 bx lr
302
3038:
304 and r2, t1, #LSB
305 and r0, w2, #LSB
306 cmp r0, #1
307 it cs
308 cmpcs r0, r2
309 itt eq
310 SHFT2LSBEQ t1, t1, #8
311 SHFT2LSBEQ w2, w2, #8
312 beq 8b
313 sub r0, r2, r0
314 ldr r4, [sp], #4
315 ldr r5, [sp], #4
316 bx lr
317END(strcmp)
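
The aligned loop in the strcmp above (label 4:) detects a NUL inside a word with the sub/bic/tst sequence on the "magic" constant 0x01010101 (magic1) and its lsl #7 form 0x80808080 (magic2). A hedged C illustration of that test and of how an aligned fast path might use it (the names are invented, and the byte-by-byte fallback stands in for the assembly's syndrome handling):

    #include <stdint.h>

    /* Non-zero iff w contains a zero byte:
     * (w - 0x01010101) & ~w & 0x80808080 — the sub/bic/tst in the .S. */
    static int word_has_zero_byte(uint32_t w)
    {
        const uint32_t ones  = 0x01010101u;   /* magic1               */
        const uint32_t highs = ones << 7;     /* magic2: 0x80808080   */
        return ((w - ones) & ~w & highs) != 0;
    }

    /* Aligned fast path: compare a word at a time while the words match and
     * contain no NUL (reading the whole aligned word is safe even past the
     * terminator, as in the assembly), then settle the result byte by byte. */
    static int strcmp_aligned_sketch(const uint32_t *p1, const uint32_t *p2)
    {
        while (*p1 == *p2 && !word_has_zero_byte(*p1)) { p1++; p2++; }
        const unsigned char *a = (const unsigned char *)p1;
        const unsigned char *b = (const unsigned char *)p2;
        while (*a && *a == *b) { a++; b++; }
        return *a - *b;
    }
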