1 /*
2 * Copyright (C) 2008 The Android Open Source Project
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * * Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in
12 * the documentation and/or other materials provided with the
13 * distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
18 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
19 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
20 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
21 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
22 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
23 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
25 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 */
29 /*
30 * This code assumes it is running on a processor that supports all arm v7
31 * instructions, that supports neon instructions, and that has a 32 byte
32 * cache line.
33 */
/*
 * MEMCPY_BASE: NEON memcpy body.
 * In:  r0 = dst, r1 = src, r2 = byte count.
 * The caller has already pushed {r0, lr}; we pop them at the end so the
 * routine returns with r0 = original dst, as memcpy requires.
 * Assumes a 32-byte cache line (see file header).
 */
ENTRY_PRIVATE(MEMCPY_BASE)
        .save   {r0, lr}
        .cfi_def_cfa_offset 8
        .cfi_rel_offset r0, 0
        .cfi_rel_offset lr, 4

        // Need at least 16 bytes so the destination-alignment code below
        // is guaranteed enough input; smaller copies go straight to the
        // 0..15-byte tail handler.
        cmp     r2, #16
        blo     5f

        /* If src and dst share 32-bit alignment, run the arm-only version. */
        eor     r3, r0, r1
        ands    r3, r3, #0x3
        beq     MEMCPY_BASE_ALIGNED     // was __memcpy_base_aligned; use the
                                        // same token as ENTRY_PRIVATE() above
                                        // so the label matches with or without
                                        // the includer's #define

        /* Check the upper size limit for Neon unaligned memory access in memcpy */
        cmp     r2, #224
        blo     3f

        /* align destination to 16 bytes for the write-buffer */
        rsb     r3, r0, #0
        ands    r3, r3, #0xF
        beq     3f

        /* copy up to 15-bytes (count in r3) */
        sub     r2, r2, r3
        movs    ip, r3, lsl #31         // C = bit 1 of r3, N = bit 0 of r3
        itt     mi
        ldrbmi  lr, [r1], #1            // 1 byte if bit 0 set
        strbmi  lr, [r0], #1
        itttt   cs
        ldrbcs  ip, [r1], #1            // 2 bytes if bit 1 set
        ldrbcs  lr, [r1], #1
        strbcs  ip, [r0], #1
        strbcs  lr, [r0], #1
        movs    ip, r3, lsl #29         // C = bit 3 of r3, N = bit 2 of r3
        bge     1f
        // copies 4 bytes, destination 32-bits aligned
        vld1.32 {d0[0]}, [r1]!
        vst1.32 {d0[0]}, [r0, :32]!
1:      bcc     2f
        // copies 8 bytes, destination 64-bits aligned
        vld1.8  {d0}, [r1]!
        vst1.8  {d0}, [r0, :64]!
2:
        /* preload immediately the next cache line, which we may need */
        pld     [r1, #0]
        pld     [r1, #(32 * 2)]
3:
        /* make sure we have at least 64 bytes to copy */
        subs    r2, r2, #64
        blo     2f

        /* preload all the cache lines we need */
        pld     [r1, #(32 * 4)]
        pld     [r1, #(32 * 6)]

1:      /* The main loop copies 64 bytes at a time */
        vld1.8  {d0 - d3}, [r1]!
        vld1.8  {d4 - d7}, [r1]!
        pld     [r1, #(32 * 6)]
        subs    r2, r2, #64
        vst1.8  {d0 - d3}, [r0]!
        vst1.8  {d4 - d7}, [r0]!
        bhs     1b

2:      /* fix-up the remaining count and make sure we have >= 32 bytes left */
        add     r2, r2, #64
        subs    r2, r2, #32
        blo     4f

3:      /* 32 bytes at a time. These cache lines were already preloaded */
        vld1.8  {d0 - d3}, [r1]!
        subs    r2, r2, #32
        vst1.8  {d0 - d3}, [r0]!
        bhs     3b

4:      /* less than 32 left */
        add     r2, r2, #32
        tst     r2, #0x10
        beq     5f
        // copies 16 bytes, 128-bits aligned
        vld1.8  {d0, d1}, [r1]!
        vst1.8  {d0, d1}, [r0]!

5:      /* copy up to 15-bytes (count in r2) */
        movs    ip, r2, lsl #29         // C = bit 3 of r2, N = bit 2 of r2
        bcc     1f
        vld1.8  {d0}, [r1]!             // 8 bytes if bit 3 set
        vst1.8  {d0}, [r0]!
1:      bge     2f
        vld1.32 {d0[0]}, [r1]!          // 4 bytes if bit 2 set
        vst1.32 {d0[0]}, [r0]!
2:      movs    ip, r2, lsl #31         // C = bit 1 of r2, N = bit 0 of r2
        itt     mi
        ldrbmi  r3, [r1], #1            // 1 byte if bit 0 set
        strbmi  r3, [r0], #1
        itttt   cs
        ldrbcs  ip, [r1], #1            // 2 bytes if bit 1 set
        ldrbcs  lr, [r1], #1
        strbcs  ip, [r0], #1
        strbcs  lr, [r0], #1

        // Restore the original dst into r0 (memcpy return value) and return.
        ldmfd   sp!, {r0, lr}
        bx      lr
END(MEMCPY_BASE)
/*
 * MEMCPY_BASE_ALIGNED: ARM-only (no NEON) memcpy body for the case where
 * src and dst share 32-bit alignment.
 * In:  r0 = dst, r1 = src, r2 = byte count.
 * The caller has already pushed {r0, lr}; the final ldmfd pops the saved
 * dst into r0 (memcpy return value) and returns via the saved lr into pc.
 */
ENTRY_PRIVATE(MEMCPY_BASE_ALIGNED)
        .save   {r0, lr}
        .cfi_def_cfa_offset 8
        .cfi_rel_offset r0, 0
        .cfi_rel_offset lr, 4

        /* Simple arm-only copy loop to handle aligned copy operations */
        stmfd   sp!, {r4-r8}            // scratch for the 8-register ldm/stm bursts
        .save   {r4-r8}
        .cfi_adjust_cfa_offset 20
        .cfi_rel_offset r4, 0
        .cfi_rel_offset r5, 4
        .cfi_rel_offset r6, 8
        .cfi_rel_offset r7, 12
        .cfi_rel_offset r8, 16
        pld     [r1, #(32 * 4)]

        /* Check alignment */
        // r3 = bytes needed to bring src (and therefore dst, since they are
        // mutually 32-bit aligned) up to a 4-byte boundary.
        rsb     r3, r1, #0
        ands    r3, #3
        beq     2f

        /* align source to 32 bits. We need to insert 2 instructions between
         * a ldr[b|h] and str[b|h] because byte and half-word instructions
         * stall 2 cycles.
         */
        movs    r12, r3, lsl #31        // C = bit 1 of r3, N = bit 0 of r3
        sub     r2, r2, r3              /* we know that r3 <= r2 because r2 >= 4 */
        itt     mi
        ldrbmi  r3, [r1], #1            // 1 byte if bit 0 set
        strbmi  r3, [r0], #1
        itttt   cs
        ldrbcs  r4, [r1], #1            // 2 bytes if bit 1 set
        ldrbcs  r5, [r1], #1
        strbcs  r4, [r0], #1
        strbcs  r5, [r0], #1

2:
        subs    r2, r2, #64
        blt     4f

3:      /* Main copy loop, copying 64 bytes at a time */
        pld     [r1, #(32 * 8)]
        ldmia   r1!, {r3, r4, r5, r6, r7, r8, r12, lr}   // 32 bytes
        stmia   r0!, {r3, r4, r5, r6, r7, r8, r12, lr}
        ldmia   r1!, {r3, r4, r5, r6, r7, r8, r12, lr}   // 32 more bytes
        stmia   r0!, {r3, r4, r5, r6, r7, r8, r12, lr}
        subs    r2, r2, #64
        bge     3b

4:      /* Check if there are > 32 bytes left */
        adds    r2, r2, #64             // undo the loop bias: r2 = true remainder
        subs    r2, r2, #32
        blt     5f

        /* Copy 32 bytes */
        ldmia   r1!, {r3, r4, r5, r6, r7, r8, r12, lr}
        stmia   r0!, {r3, r4, r5, r6, r7, r8, r12, lr}
        subs    r2, #32

5:      /* Handle any remaining bytes */
        adds    r2, #32                 // r2 = remaining count, 0..31
        beq     6f

        movs    r12, r2, lsl #28        // C = bit 4 of r2, N = bit 3 of r2
        itt     cs
        ldmiacs r1!, {r3, r4, r5, r6}   /* 16 bytes */
        stmiacs r0!, {r3, r4, r5, r6}
        itt     mi
        ldmiami r1!, {r7, r8}           /* 8 bytes */
        stmiami r0!, {r7, r8}
        movs    r12, r2, lsl #30        // C = bit 2 of r2, N = bit 1 of r2
        itt     cs
        ldrcs   r3, [r1], #4            /* 4 bytes */
        strcs   r3, [r0], #4
        itt     mi
        ldrhmi  r4, [r1], #2            /* 2 bytes */
        strhmi  r4, [r0], #2
        tst     r2, #0x1
        itt     ne
        ldrbne  r3, [r1]                /* last byte */
        strbne  r3, [r0]
6:
        // Restore scratch regs, then pop the caller-saved {dst, lr} directly
        // into {r0, pc}: returns with r0 = original dst.
        ldmfd   sp!, {r4-r8}
        ldmfd   sp!, {r0, pc}
END(MEMCPY_BASE_ALIGNED)