/* Copyright (c) 2012, Linaro Limited
   All rights reserved.
   Copyright (c) 2014, NVIDIA Corporation. All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:
       * Redistributions of source code must retain the above copyright
         notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above copyright
         notice, this list of conditions and the following disclaimer in the
         documentation and/or other materials provided with the distribution.
       * Neither the name of the Linaro nor the
         names of its contributors may be used to endorse or promote products
         derived from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* Assumptions:
 *
 * denver, ARMv8-a, AArch64
 * Unaligned accesses
 *
 */
#include <private/bionic_asm.h>

#define dstin   x0
#define src     x1
#define count   x2
#define tmp1    x3
#define tmp1w   w3
#define tmp2    x4
#define tmp2w   w4
#define tmp3    x5
#define tmp3w   w5
#define dst     x6

#define A_l     x7
#define A_h     x8
#define B_l     x9
#define B_h     x10
#define C_l     x11
#define C_h     x12
#define D_l     x13
#define D_h     x14

#define QA_l    q0
#define QA_h    q1
#define QB_l    q2
#define QB_h    q3
ENTRY(memcpy)

        mov     dst, dstin
        cmp     count, #64
        b.ge    .Lcpy_not_short
        cmp     count, #15
        b.le    .Ltail15tiny
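        /* 16..63 bytes: fall through to the tail code below. */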
        /* Deal with small copies quickly by dropping straight into the
         * exit block. */
.Ltail63:
        /* Copy up to 48 bytes of data. At this point we only need the
         * bottom 6 bits of count to be accurate. */
        ands    tmp1, count, #0x30
        b.eq    .Ltail15
        add     dst, dst, tmp1
        add     src, src, tmp1
        cmp     tmp1w, #0x20
        b.eq    1f
        b.lt    2f
        ldp     A_l, A_h, [src, #-48]
        stp     A_l, A_h, [dst, #-48]
1:
        ldp     A_l, A_h, [src, #-32]
        stp     A_l, A_h, [dst, #-32]
2:
        ldp     A_l, A_h, [src, #-16]
        stp     A_l, A_h, [dst, #-16]
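        /* Copy the final 1-15 bytes with one overlapping 16-byte load/store
         * pair. This is safe because this path is only reached when the
         * original count was at least 16 bytes. */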
.Ltail15:
        ands    count, count, #15
        beq     1f
        add     src, src, count
        ldp     A_l, A_h, [src, #-16]
        add     dst, dst, count
        stp     A_l, A_h, [dst, #-16]
1:
        ret
.Ltail15tiny:
        /* Copy up to 15 bytes of data. Does not assume that any additional
         * data has already been copied. */
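        /* Each set bit of count selects one copy: 8 bytes, then 4, then 2,
         * then 1, using post-indexed loads and stores to advance the
         * pointers. */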
        tbz     count, #3, 1f
        ldr     tmp1, [src], #8
        str     tmp1, [dst], #8
1:
        tbz     count, #2, 1f
        ldr     tmp1w, [src], #4
        str     tmp1w, [dst], #4
1:
        tbz     count, #1, 1f
        ldrh    tmp1w, [src], #2
        strh    tmp1w, [dst], #2
1:
        tbz     count, #0, 1f
        ldrb    tmp1w, [src]
        strb    tmp1w, [dst]
1:
        ret
.Lcpy_not_short:
        /* We don't much care about the alignment of DST, but we want SRC
         * to be 128-bit (16 byte) aligned so that we don't cross cache-line
         * boundaries on both the loads and the stores. */
        neg     tmp2, src
        ands    tmp2, tmp2, #15         /* Bytes to reach alignment. */
        b.eq    2f
        sub     count, count, tmp2
        /* Copy more data than needed; it's faster than jumping
         * around copying sub-Quadword quantities. We know that
         * it can't overrun. */
        ldp     A_l, A_h, [src]
        add     src, src, tmp2
        stp     A_l, A_h, [dst]
        add     dst, dst, tmp2
        /* There may be 63 or fewer bytes left to copy now. */
        cmp     count, #63
        b.le    .Ltail63
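        /* At 2: below, src is 16-byte aligned (either it already was, or
         * the fixup above aligned it). */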
2:
        subs    count, count, #128
        b.ge    .Lcpy_body_large
        /* Less than 128 bytes to copy, so handle 64 bytes here and then
         * jump to the tail. */
        ldp     QA_l, QA_h, [src]
        ldp     QB_l, QB_h, [src, #32]
        stp     QA_l, QA_h, [dst]
        stp     QB_l, QB_h, [dst, #32]
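        /* count went negative at the subs above, but its low 6 bits still
         * hold the number of bytes remaining after the 64 copied here. */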
        tst     count, #0x3f
        add     src, src, #64
        add     dst, dst, #64
        b.ne    .Ltail63
        ret
        /* Critical loop. Start at a new cache line boundary. Assuming
         * 64 bytes per line this ensures the entire loop is in one line. */
        .p2align 6
.Lcpy_body_large:
        cmp     count, 65536
        bhi     .Lcpy_body_huge
        /* There are at least 128 bytes to copy. */
        ldp     QA_l, QA_h, [src, #0]
        sub     dst, dst, #32           /* Pre-bias. */
        ldp     QB_l, QB_h, [src, #32]! /* src += 64 - Pre-bias. */
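        /* The loop is software pipelined: the loads for the next 64 bytes
         * are issued while the previous 64 bytes are being stored. */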
1:
        stp     QA_l, QA_h, [dst, #32]
        ldp     QA_l, QA_h, [src, #32]
        stp     QB_l, QB_h, [dst, #64]!
        ldp     QB_l, QB_h, [src, #64]!

        subs    count, count, #64
        b.ge    1b
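        /* Store the last 64 bytes loaded by the final iteration, then step
         * src and dst past the copied data, undoing the pre-bias. */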
        stp     QA_l, QA_h, [dst, #32]
        stp     QB_l, QB_h, [dst, #64]
        add     src, src, #32
        add     dst, dst, #64 + 32
        tst     count, #0x3f
        b.ne    .Ltail63
        ret
.Lcpy_body_huge:
        /* There are at least 128 bytes to copy. */
        ldp     QA_l, QA_h, [src, #0]
        sub     dst, dst, #32           /* Pre-bias. */
        ldp     QB_l, QB_h, [src, #32]!
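        /* For copies larger than 64KiB, use non-temporal store pairs (stnp)
         * to hint that the destination will not be re-read soon, reducing
         * cache pollution. stnp has no writeback addressing mode, so dst is
         * advanced with a separate add. */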
1:
        stnp    QA_l, QA_h, [dst, #32]
        stnp    QB_l, QB_h, [dst, #64]
        ldp     QA_l, QA_h, [src, #32]
        ldp     QB_l, QB_h, [src, #64]!
        add     dst, dst, #64

        subs    count, count, #64
        b.ge    1b

        stnp    QA_l, QA_h, [dst, #32]
        stnp    QB_l, QB_h, [dst, #64]
        add     src, src, #32
        add     dst, dst, #64 + 32
        tst     count, #0x3f
        b.ne    .Ltail63
        ret
END(memcpy)