diff options
Diffstat (limited to 'libcutils/arch-mips/android_memset.S')
-rw-r--r-- | libcutils/arch-mips/android_memset.S | 323 |
1 files changed, 323 insertions, 0 deletions
diff --git a/libcutils/arch-mips/android_memset.S b/libcutils/arch-mips/android_memset.S new file mode 100644 index 000000000..6811de01e --- /dev/null +++ b/libcutils/arch-mips/android_memset.S | |||
@@ -0,0 +1,323 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2009 | ||
3 | * MIPS Technologies, Inc., California. | ||
4 | * | ||
5 | * Redistribution and use in source and binary forms, with or without | ||
6 | * modification, are permitted provided that the following conditions | ||
7 | * are met: | ||
8 | * 1. Redistributions of source code must retain the above copyright | ||
9 | * notice, this list of conditions and the following disclaimer. | ||
10 | * 2. Redistributions in binary form must reproduce the above copyright | ||
11 | * notice, this list of conditions and the following disclaimer in the | ||
12 | * documentation and/or other materials provided with the distribution. | ||
13 | * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its | ||
14 | * contributors may be used to endorse or promote products derived from | ||
15 | * this software without specific prior written permission. | ||
16 | * | ||
17 | * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND | ||
18 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
19 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | ||
20 | * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE | ||
21 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | ||
22 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | ||
23 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | ||
24 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | ||
25 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | ||
26 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | ||
27 | * SUCH DAMAGE. | ||
28 | */ | ||
29 | |||
30 | /************************************************************************ | ||
31 | * | ||
32 | * memset.S, version "64h" with 1 cache line horizon for "pref 30" and 14 nops | ||
33 | * Version: "043009" | ||
34 | * | ||
35 | ************************************************************************/ | ||
36 | |||
37 | |||
38 | /************************************************************************ | ||
39 | * Include files | ||
40 | ************************************************************************/ | ||
41 | |||
42 | #include <machine/asm.h> | ||
/*
 * END(f): emit .cfi_endproc, set the size of symbol f to the number of
 * bytes assembled since its label (.-f), and close it with .end f.
 * NOTE(review): <machine/asm.h> may already provide END and there is no
 * #undef before this definition - confirm the redefinition is intended.
 */
43 | #define END(f) .cfi_endproc; .size f, .-f; .end f | ||
44 | |||
45 | /* | ||
46 | * This routine could be optimized for MIPS64. The current code only | ||
47 | * uses MIPS32 instructions. | ||
48 | */ | ||
49 | |||
/*
 * SWHI and SWLO name the partial-word store (swl or swr) to use for the
 * current byte order. memset uses SWHI for the bytes in front of the first
 * word boundary and SWLO for the bytes behind the last one. If neither
 * __MIPSEB__ nor __MIPSEL__ is defined, both names stay undefined.
 */
50 | #if defined(__MIPSEB__) | ||
51 | # define SWHI swl /* high part is left in big-endian */ | ||
52 | # define SWLO swr /* low part is right in big-endian */ | ||
53 | #endif | ||
54 | |||
55 | #if defined(__MIPSEL__) | ||
56 | # define SWHI swr /* high part is right in little-endian */ | ||
57 | # define SWLO swl /* low part is left in little-endian */ | ||
58 | #endif | ||
59 | |||
/*
 * Unless XGPROF or XPROF is defined, SETUP_GP is forced to expand to
 * nothing. None of the routines below reference SETUP_GP.
 * NOTE(review): XGPROF/XPROF look like profiling-build switches - confirm.
 */
60 | #if !(defined(XGPROF) || defined(XPROF)) | ||
61 | #undef SETUP_GP | ||
62 | #define SETUP_GP | ||
63 | #endif | ||
64 | |||
/*
 * DBG prefixes the parameter-check instructions. With NDEBUG defined it
 * expands to "#", which turns the rest of the line into an assembler
 * comment, so the checks are not assembled. Without NDEBUG it expands to
 * nothing and the checks are assembled as written.
 */
65 | #ifdef NDEBUG | ||
66 | #define DBG # | ||
67 | #else | ||
68 | #define DBG | ||
69 | #endif | ||
70 | |||
71 | /* | ||
72 | * void android_memset16(uint16_t* dst, uint16_t value, size_t size); | ||
 *
 * Fills size bytes starting at dst with the 16-bit pattern value.
 * Register use: a0 = dst, a1 = value, a2 = size in bytes.
 *
 * The DBG checks trap (tne) when dst is not halfword aligned or size is
 * odd. The pattern is duplicated into both halves of a1. If dst is not
 * word aligned, one halfword is stored first. When 4 or more bytes remain,
 * control branches to .Laligned inside memset with t0 = dst + size;
 * shorter remainders (0 or 2 bytes) are finished here.
 *
 * The body is assembled under .set noreorder, so the instruction that
 * follows each branch or jump is its delay slot and always executes.
73 | */ | ||
74 | |||
75 | LEAF(android_memset16,0) | ||
76 | .set noreorder | ||
77 | DBG /* Check parameters */ | ||
78 | DBG andi t0,a0,1 # a0 must be halfword aligned | ||
79 | DBG tne t0,zero | ||
80 | DBG andi t2,a2,1 # a2 must be even | ||
81 | DBG tne t2,zero | ||
82 | |||
/* Optional (FIXARGS): clear bit 0 of the byte count instead of relying on the caller. */
83 | #ifdef FIXARGS | ||
84 | # ensure count is even | ||
85 | #if (__mips==32) && (__mips_isa_rev>=2) | ||
86 | ins a2,zero,0,1 | ||
87 | #else | ||
88 | ori a2,1 | ||
89 | xori a2,1 | ||
90 | #endif | ||
91 | #endif | ||
92 | |||
/* Duplicate the 16-bit pattern into both halves of a1 so that word stores write two elements at once. */
93 | #if (__mips==32) && (__mips_isa_rev>=2) | ||
94 | ins a1,a1,16,16 | ||
95 | #else | ||
96 | andi a1,0xffff | ||
97 | sll t3,a1,16 | ||
98 | or a1,t3 | ||
99 | #endif | ||
100 | |||
/* Zero byte count: return. The andi in the delay slot tests bit 1 of dst, i.e. whether dst is already word aligned. */
101 | beqz a2,.Ldone | ||
102 | andi t1,a0,2 | ||
/* t0 = dst + size is computed in the delay slot, before a0 and a2 are adjusted by the aligning store below. */
103 | beqz t1,.Lalignok | ||
104 | addu t0,a0,a2 # t0 is the "past the end" address | ||
105 | sh a1,0(a0) # store one halfword to get aligned | ||
106 | addu a0,2 | ||
107 | subu a2,2 | ||
108 | .Lalignok: | ||
/* With 4 or more bytes left, continue in memset at .Laligned. Otherwise t1 = (a2 != 2) decides whether one last halfword is stored; t1 is only read by the bnez on the fall-through path. */
109 | slti t1,a2,4 # .Laligned for 4 or more bytes | ||
110 | beqz t1,.Laligned | ||
111 | sne t1,a2,2 # one more halfword? | ||
112 | bnez t1,.Ldone | ||
113 | nop | ||
114 | sh a1,0(a0) | ||
115 | .Ldone: | ||
116 | j ra | ||
117 | nop | ||
118 | .set reorder | ||
119 | END(android_memset16) | ||
120 | |||
121 | /* | ||
122 | * void android_memset32(uint32_t* dst, uint32_t value, size_t size); | ||
 *
 * Fills size bytes starting at dst with the 32-bit pattern value.
 * Register use: a0 = dst, a1 = value, a2 = size in bytes.
 *
 * The DBG checks trap (tne) when dst is not word aligned or size is not a
 * multiple of 4. For a non-zero size, t0 = dst + size is computed in the
 * branch delay slot and control continues at .Laligned inside memset,
 * which performs all stores. A zero size returns immediately.
 *
 * The body is assembled under .set noreorder, so the instruction that
 * follows each branch or jump is its delay slot and always executes.
123 | */ | ||
124 | |||
125 | LEAF(android_memset32,0) | ||
126 | .set noreorder | ||
127 | DBG /* Check parameters */ | ||
128 | DBG andi t0,a0,3 # a0 must be word aligned | ||
129 | DBG tne t0,zero | ||
130 | DBG andi t2,a2,3 # a2 must be a multiple of 4 bytes | ||
131 | DBG tne t2,zero | ||
132 | |||
/* Optional (FIXARGS): clear the low two bits of the byte count. Registers use the same bare names (a2, zero) as the rest of the file, matching the equivalent step in android_memset16. */
133 | #ifdef FIXARGS | ||
134 | # ensure count is a multiple of 4 | ||
135 | #if (__mips==32) && (__mips_isa_rev>=2) | ||
136 | ins a2,zero,0,2 | ||
137 | #else | ||
138 | ori a2,3 | ||
139 | xori a2,3 | ||
140 | #endif | ||
141 | #endif | ||
142 | |||
/* The addu is the delay slot of bnez, so t0 is set on both paths; it is only consumed when the branch to .Laligned is taken. */
143 | bnez a2,.Laligned # any work to do? | ||
144 | addu t0,a0,a2 # t0 is the "past the end" address | ||
145 | |||
146 | j ra | ||
147 | nop | ||
148 | .set reorder | ||
149 | END(android_memset32) | ||
150 | |||
/*
 * memset: a0 = dst, a1 = fill value, a2 = byte count.
 * v0 is set to the original dst before any store is made.
 *
 * Outline:
 *  - t0 = dst + count is kept as the "past the end" address throughout.
 *  - Counts below 4 (signed slti compare) are handled by the byte loop at
 *    .Llast4.
 *  - A non-zero a1 has its low byte replicated into all four bytes of a1;
 *    a1 == 0 branches past that step.
 *  - v1 = (-dst) & 3 is the number of bytes in front of the next word
 *    boundary. When it is non-zero, one SWHI store covers those bytes and
 *    a0 / a2 are adjusted by v1.
 *  - .Laligned fills 64-byte chunks (the "pref 30" loop is entered only
 *    when the remaining count is at least 160), then at most one 32-byte
 *    chunk, then single words. android_memset16 and android_memset32
 *    branch to this label with a0, a1, a2 and t0 already prepared.
 *  - .Llast4aligned returns; the SWLO store in its delay slot handles the
 *    last 0-3 bytes in front of t0.
 *
 * The body is assembled under .set noreorder, so the instruction that
 * follows each branch or jump is its delay slot and always executes.
 * AT is used explicitly as a scratch register, hence .set noat.
 */
151 | LEAF(memset,0) | ||
152 | |||
153 | .set noreorder | ||
154 | .set noat | ||
155 | |||
156 | addu t0,a0,a2 # t0 is the "past the end" address | ||
157 | slti AT,a2,4 # is a2 less than 4? | ||
158 | bne AT,zero,.Llast4 # if yes, go to last4 | ||
159 | move v0,a0 # memset returns the dst pointer | ||
160 | |||
/* v1 = -dst is computed in the delay slot so it is available at .Lset0 on both paths. */
161 | beq a1,zero,.Lset0 | ||
162 | subu v1,zero,a0 | ||
163 | |||
164 | # smear byte into 32 bit word | ||
165 | #if (__mips==32) && (__mips_isa_rev>=2) | ||
166 | ins a1, a1, 8, 8 # Replicate fill byte into half-word. | ||
167 | ins a1, a1, 16, 16 # Replicate fill byte into word. | ||
168 | #else | ||
169 | and a1,0xff | ||
170 | sll AT,a1,8 | ||
171 | or a1,AT | ||
172 | sll AT,a1,16 | ||
173 | or a1,AT | ||
174 | #endif | ||
175 | |||
176 | .Lset0: | ||
/* The subu in the delay slot removes the leading v1 bytes from the count; when v1 is 0 it leaves a2 unchanged. */
177 | andi v1,v1,0x3 # word-unaligned address? | ||
178 | beq v1,zero,.Laligned # v1 is the unalignment count | ||
179 | subu a2,a2,v1 | ||
180 | SWHI a1,0(a0) | ||
181 | addu a0,a0,v1 | ||
182 | |||
183 | # Here we have the "word-aligned" a0 (until the "last4") | ||
184 | .Laligned: | ||
185 | andi t8,a2,0x3f # any 64-byte chunks? | ||
186 | # t8 is the byte count past 64-byte chunks | ||
187 | beq a2,t8,.Lchk8w # when a2==t8, no 64-byte chunks | ||
188 | # There will be at most 1 32-byte chunk then | ||
189 | subu a3,a2,t8 # subtract from a2 the reminder | ||
190 | # Here a3 counts bytes in 16w chunks | ||
191 | addu a3,a0,a3 # Now a3 is the final dst after 64-byte chunks | ||
192 | |||
193 | # Find out, if there are any 64-byte chunks after which will be still at least | ||
194 | # 96 bytes left. The value "96" is calculated as needed buffer for | ||
195 | # "pref 30,64(a0)" prefetch, which can be used as "pref 30,0(a0)" after | ||
196 | # incrementing "a0" by 64. | ||
197 | # For "a2" below 160 there will be no such "pref 30 safe" 64-byte chunk. | ||
198 | # | ||
/* sltiu leaves 1 in v1 when a2 < 160 and 0 otherwise, so the bgtz below skips the prefetching loop for counts under 160. */
199 | sltiu v1,a2,160 | ||
200 | bgtz v1,.Lloop16w_nopref30 # skip "pref 30,0(a0)" | ||
201 | subu t7,a2,96 # subtract "pref 30 unsafe" region | ||
202 | # below we have at least 1 64-byte chunk which is "pref 30 safe" | ||
203 | andi t6,t7,0x3f # t6 is past "64-byte safe chunks" reminder | ||
204 | subu t5,t7,t6 # subtract from t7 the reminder | ||
205 | # Here t5 counts bytes in 16w "safe" chunks | ||
206 | addu t4,a0,t5 # Now t4 is the dst after 64-byte "safe" chunks | ||
207 | |||
208 | # Don't use "pref 30,0(a0)" for a0 in a "middle" of a cache line | ||
209 | # pref 30,0(a0) | ||
210 | # Here we are in the region, where it is safe to use "pref 30,64(a0)" | ||
/* Each iteration advances a0 by 64 first, so the sixteen word stores use offsets -64 .. -4. The loop ends when a0 reaches t4. */
211 | .Lloop16w: | ||
212 | addiu a0,a0,64 | ||
213 | pref 30,-32(a0) # continue setting up the dest, addr 64-32 | ||
214 | sw a1,-64(a0) | ||
215 | sw a1,-60(a0) | ||
216 | sw a1,-56(a0) | ||
217 | sw a1,-52(a0) | ||
218 | sw a1,-48(a0) | ||
219 | sw a1,-44(a0) | ||
220 | sw a1,-40(a0) | ||
221 | sw a1,-36(a0) | ||
222 | nop | ||
223 | nop # the extra nop instructions help to balance | ||
224 | nop # cycles needed for "store" + "fill" + "evict" | ||
225 | nop # For 64byte store there are needed 8 fill | ||
226 | nop # and 8 evict cycles, i.e. at least 32 instr. | ||
227 | nop | ||
228 | nop | ||
229 | pref 30,0(a0) # continue setting up the dest, addr 64-0 | ||
230 | sw a1,-32(a0) | ||
231 | sw a1,-28(a0) | ||
232 | sw a1,-24(a0) | ||
233 | sw a1,-20(a0) | ||
234 | sw a1,-16(a0) | ||
235 | sw a1,-12(a0) | ||
236 | sw a1,-8(a0) | ||
237 | sw a1,-4(a0) | ||
238 | nop | ||
239 | nop | ||
240 | nop | ||
241 | nop # NOTE: adding 14 nop-s instead of 12 nop-s | ||
242 | nop # gives better results for "fast" memory | ||
243 | nop | ||
244 | bne a0,t4,.Lloop16w | ||
245 | nop | ||
246 | |||
247 | beq a0,a3,.Lchk8w # maybe no more 64-byte chunks? | ||
248 | nop # this "delayed slot" is useless ... | ||
249 | |||
/* Same 64-byte fill without prefetch hints; the last store sits in the delay slot of the loop branch. The loop ends when a0 reaches a3. */
250 | .Lloop16w_nopref30: # there could be up to 3 "64-byte nopref30" chunks | ||
251 | addiu a0,a0,64 | ||
252 | sw a1,-64(a0) | ||
253 | sw a1,-60(a0) | ||
254 | sw a1,-56(a0) | ||
255 | sw a1,-52(a0) | ||
256 | sw a1,-48(a0) | ||
257 | sw a1,-44(a0) | ||
258 | sw a1,-40(a0) | ||
259 | sw a1,-36(a0) | ||
260 | sw a1,-32(a0) | ||
261 | sw a1,-28(a0) | ||
262 | sw a1,-24(a0) | ||
263 | sw a1,-20(a0) | ||
264 | sw a1,-16(a0) | ||
265 | sw a1,-12(a0) | ||
266 | sw a1,-8(a0) | ||
267 | bne a0,a3,.Lloop16w_nopref30 | ||
268 | sw a1,-4(a0) | ||
269 | |||
270 | .Lchk8w: # t8 here is the byte count past 64-byte chunks | ||
271 | |||
/* The move in the delay slot sets a2 = t7 on both paths, so .Lchk1w always sees the count left after the optional 32-byte chunk. */
272 | andi t7,t8,0x1f # is there a 32-byte chunk? | ||
273 | # the t7 is the reminder count past 32-bytes | ||
274 | beq t8,t7,.Lchk1w # when t8==t7, no 32-byte chunk | ||
275 | move a2,t7 | ||
276 | |||
277 | sw a1,0(a0) | ||
278 | sw a1,4(a0) | ||
279 | sw a1,8(a0) | ||
280 | sw a1,12(a0) | ||
281 | sw a1,16(a0) | ||
282 | sw a1,20(a0) | ||
283 | sw a1,24(a0) | ||
284 | sw a1,28(a0) | ||
285 | addiu a0,a0,32 | ||
286 | |||
287 | .Lchk1w: | ||
288 | andi t8,a2,0x3 # now t8 is the reminder past 1w chunks | ||
289 | beq a2,t8,.Llast4aligned | ||
290 | subu a3,a2,t8 # a3 is the count of bytes in 1w chunks | ||
291 | addu a3,a0,a3 # now a3 is the dst address past the 1w chunks | ||
292 | |||
293 | # copying in words (4-byte chunks) | ||
294 | .LwordCopy_loop: | ||
295 | addiu a0,a0,4 | ||
296 | bne a0,a3,.LwordCopy_loop | ||
297 | sw a1,-4(a0) | ||
298 | |||
299 | # store last 0-3 bytes | ||
300 | # this will repeat the last store if the memset finishes on a word boundary | ||
301 | .Llast4aligned: | ||
302 | j ra | ||
303 | SWLO a1,-1(t0) | ||
304 | |||
/* Byte loop for counts below 4. The addiu at .Llast4l is also the delay slot of the beq, so on entry it runs once before the first bne; when a0 == t0 the branch to .Llast4e is taken and no byte is stored. */
305 | .Llast4: | ||
306 | beq a0,t0,.Llast4e | ||
307 | .Llast4l: | ||
308 | addiu a0,a0,1 | ||
309 | bne a0,t0,.Llast4l | ||
310 | sb a1,-1(a0) | ||
311 | .Llast4e: | ||
312 | j ra | ||
313 | nop | ||
314 | |||
315 | .set at | ||
316 | .set reorder | ||
317 | |||
318 | END(memset) | ||
319 | |||
320 | |||
321 | /************************************************************************ | ||
322 | * Implementation : Static functions | ||
323 | ************************************************************************/ | ||