/* Copyright (c) 2012, Linaro Limited
   All rights reserved.
   Copyright (c) 2014, NVIDIA Corporation. All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:
       * Redistributions of source code must retain the above copyright
         notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above copyright
         notice, this list of conditions and the following disclaimer in the
         documentation and/or other materials provided with the distribution.
       * Neither the name of the Linaro nor the
         names of its contributors may be used to endorse or promote products
         derived from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* Assumptions:
 *
 * denver, ARMv8-a, AArch64
 * Unaligned accesses
 *
 */
#include <private/bionic_asm.h>

#define dstin   x0
#define src     x1
#define count   x2
#define tmp1    x3
#define tmp1w   w3
#define tmp2    x4
#define tmp2w   w4
#define tmp3    x5
#define tmp3w   w5
#define dst     x6

#define A_l     x7
#define A_h     x8
#define B_l     x9
#define B_h     x10
#define C_l     x11
#define C_h     x12
#define D_l     x13
#define D_h     x14

#define QA_l    q0
#define QA_h    q1
#define QB_l    q2
#define QB_h    q3
ENTRY(memcpy)

        mov     dst, dstin
        cmp     count, #64
        b.ge    .Lcpy_not_short
        cmp     count, #15
        b.le    .Ltail15tiny
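        /* 16..63 bytes: fall through to the tail code below. */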
        /* Deal with small copies quickly by dropping straight into the
         * exit block. */
.Ltail63:
        /* Copy up to 48 bytes of data. At this point we only need the
         * bottom 6 bits of count to be accurate. */
        ands    tmp1, count, #0x30
        b.eq    .Ltail15
        add     dst, dst, tmp1
        add     src, src, tmp1
        cmp     tmp1w, #0x20
        b.eq    1f
        b.lt    2f
        ldp     A_l, A_h, [src, #-48]
        stp     A_l, A_h, [dst, #-48]
1:
        ldp     A_l, A_h, [src, #-32]
        stp     A_l, A_h, [dst, #-32]
2:
        ldp     A_l, A_h, [src, #-16]
        stp     A_l, A_h, [dst, #-16]
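        /* Copy the final 1-15 bytes with one overlapping 16-byte load/store
         * pair. This is safe because this path is only reached when the
         * original count was at least 16 bytes. */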
.Ltail15:
        ands    count, count, #15
        beq     1f
        add     src, src, count
        ldp     A_l, A_h, [src, #-16]
        add     dst, dst, count
        stp     A_l, A_h, [dst, #-16]
1:
        ret
.Ltail15tiny:
        /* Copy up to 15 bytes of data. Does not assume that any additional
         * data has already been copied. */
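        /* Each set bit of count selects one copy: 8 bytes, then 4, then 2,
         * then 1, using post-indexed loads and stores to advance the
         * pointers. */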
        tbz     count, #3, 1f
        ldr     tmp1, [src], #8
        str     tmp1, [dst], #8
1:
        tbz     count, #2, 1f
        ldr     tmp1w, [src], #4
        str     tmp1w, [dst], #4
1:
        tbz     count, #1, 1f
        ldrh    tmp1w, [src], #2
        strh    tmp1w, [dst], #2
1:
        tbz     count, #0, 1f
        ldrb    tmp1w, [src]
        strb    tmp1w, [dst]
1:
        ret
.Lcpy_not_short:
        /* We don't much care about the alignment of DST, but we want SRC
         * to be 128-bit (16 byte) aligned so that we don't cross cache-line
         * boundaries on both the loads and the stores. */
        neg     tmp2, src
        ands    tmp2, tmp2, #15         /* Bytes to reach alignment. */
        b.eq    2f
        sub     count, count, tmp2
        /* Copy more data than needed; it's faster than jumping
         * around copying sub-Quadword quantities. We know that
         * it can't overrun. */
        ldp     A_l, A_h, [src]
        add     src, src, tmp2
        stp     A_l, A_h, [dst]
        add     dst, dst, tmp2
        /* There may be 63 or fewer bytes left to copy now. */
        cmp     count, #63
        b.le    .Ltail63
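        /* At 2: below, src is 16-byte aligned (either it already was, or
         * the fixup above aligned it). */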
2:
        subs    count, count, #128
        b.ge    .Lcpy_body_large
        /* Less than 128 bytes to copy, so handle 64 bytes here and then
         * jump to the tail. */
        ldp     QA_l, QA_h, [src]
        ldp     QB_l, QB_h, [src, #32]
        stp     QA_l, QA_h, [dst]
        stp     QB_l, QB_h, [dst, #32]
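        /* count went negative at the subs above, but its low 6 bits still
         * hold the number of bytes remaining after the 64 copied here. */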
        tst     count, #0x3f
        add     src, src, #64
        add     dst, dst, #64
        b.ne    .Ltail63
        ret
        /* Critical loop. Start at a new cache line boundary. Assuming
         * 64 bytes per line this ensures the entire loop is in one line. */
        .p2align 6
.Lcpy_body_large:
        cmp     count, 65536
        bhi     .Lcpy_body_huge
        /* There are at least 128 bytes to copy. */
        ldp     QA_l, QA_h, [src, #0]
        sub     dst, dst, #32           /* Pre-bias. */
        ldp     QB_l, QB_h, [src, #32]! /* src += 64 - Pre-bias. */
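        /* The loop is software pipelined: the loads for the next 64 bytes
         * are issued while the previous 64 bytes are being stored. */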
1:
        stp     QA_l, QA_h, [dst, #32]
        ldp     QA_l, QA_h, [src, #32]
        stp     QB_l, QB_h, [dst, #64]!
        ldp     QB_l, QB_h, [src, #64]!

        subs    count, count, #64
        b.ge    1b
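        /* Store the last 64 bytes loaded by the final iteration, then step
         * src and dst past the copied data, undoing the pre-bias. */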
        stp     QA_l, QA_h, [dst, #32]
        stp     QB_l, QB_h, [dst, #64]
        add     src, src, #32
        add     dst, dst, #64 + 32
        tst     count, #0x3f
        b.ne    .Ltail63
        ret
.Lcpy_body_huge:
        /* There are at least 128 bytes to copy. */
        ldp     QA_l, QA_h, [src, #0]
        sub     dst, dst, #32           /* Pre-bias. */
        ldp     QB_l, QB_h, [src, #32]!
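        /* For copies larger than 64KiB, use non-temporal store pairs (stnp)
         * to hint that the destination will not be re-read soon, reducing
         * cache pollution. stnp has no writeback addressing mode, so dst is
         * advanced with a separate add. */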
1:
        stnp    QA_l, QA_h, [dst, #32]
        stnp    QB_l, QB_h, [dst, #64]
        ldp     QA_l, QA_h, [src, #32]
        ldp     QB_l, QB_h, [src, #64]!
        add     dst, dst, #64

        subs    count, count, #64
        b.ge    1b

        stnp    QA_l, QA_h, [dst, #32]
        stnp    QB_l, QB_h, [dst, #64]
        add     src, src, #32
        add     dst, dst, #64 + 32
        tst     count, #0x3f
        b.ne    .Ltail63
        ret
END(memcpy)