/*
 * Copyright (C) 2011 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef BIONIC_ATOMIC_ARM_H
#define BIONIC_ATOMIC_ARM_H

#include <machine/cpu-features.h>  /* for __ARM_ARCH__, __ARM_HAVE_DMB, __ARM_HAVE_LDREX_STREX */

/* Some of the hardware instructions used below are not available in Thumb-1
 * mode (they are if you build in ARM or Thumb-2 mode though). To solve this
 * problem, we're going to use the same technique as libatomic_ops,
 * which is to temporarily switch to ARM, do the operation, then switch
 * back to Thumb-1.
 *
 * This results in two 'bx' jumps, just like a normal function call, but
 * everything is kept inlined, avoids loading or computing the function's
 * address, and avoids a little I-cache thrashing too.
 *
 * However, it is highly recommended to avoid compiling any C library source
 * file that uses these functions in Thumb-1 mode.
 *
 * Define three helper macros to implement this:
 */
#if defined(__thumb__) && !defined(__thumb2__)
#  define __ATOMIC_SWITCH_TO_ARM \
            "adr r3, 5f\n" \
            "bx  r3\n" \
            ".align\n" \
            ".arm\n" \
            "5:\n"
/* note: the leading \n below is intentional */
#  define __ATOMIC_SWITCH_TO_THUMB \
            "\n" \
            "adr r3, 6f\n" \
            "bx  r3\n" \
            ".thumb\n" \
            "6:\n"

#  define __ATOMIC_CLOBBERS   "r3"  /* list of clobbered registers */

/* Warn the user that ARM mode should really be preferred! */
#  warning Rebuilding this source file in ARM mode is highly recommended for performance!!
#else
#  define __ATOMIC_SWITCH_TO_ARM   /* nothing */
#  define __ATOMIC_SWITCH_TO_THUMB /* nothing */
#  define __ATOMIC_CLOBBERS        /* nothing */
#endif

/* Define a full memory barrier; this is only needed if we build the
 * platform for a multi-core device. For the record, using a 'dmb'
 * instruction on a Nexus One device can take up to 180 ns even if
 * it is completely unnecessary on this device.
 *
 * NOTE: This is where the platform and NDK atomic headers are
 *       going to diverge. With the NDK, we don't know if the generated
 *       code is going to run on a single-core or multi-core device, so we
 *       need to be cautious.
 *
 *       Fortunately, we can use the kernel helper function that is
 *       mapped at address 0xffff0fa0 in all user processes, and that
 *       provides a device-specific barrier operation.
 *
 *       I.e. on single-core devices the helper returns immediately,
 *       while on multi-core devices it uses "dmb" or any other means to
 *       perform a full memory barrier. A sketch of such a call follows.
 */
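/* Illustrative sketch, not part of the original header: how code that cannot
 * know at build time whether it will run on an SMP device (the NDK situation
 * described above) could call the kernel barrier helper at 0xffff0fa0,
 * mirroring the function-pointer pattern used for the cmpxchg helper further
 * below. The typedef and function names are hypothetical, the helper is
 * assumed to take no arguments, and the block is compiled out.
 */
#if 0
typedef void (__example_kernel_dmb_t)(void);

__ATOMIC_INLINE__ void __example_memory_barrier(void) {
    /* No-op on single-core devices; a full barrier ('dmb' or equivalent)
     * on multi-core devices. */
    (*(__example_kernel_dmb_t *)0xffff0fa0)();
}
#endif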
/* There are three cases to consider for the platform:
 *
 *    - multi-core ARMv7-A       => use the 'dmb' hardware instruction
 *    - multi-core ARMv6         => use the coprocessor
 *    - single core ARMv5TE/6/7  => do not use any hardware barrier
 */
#if defined(ANDROID_SMP) && ANDROID_SMP == 1

/* Sanity check: multi-core is only supported starting from ARMv6 */
#  if __ARM_ARCH__ < 6
#    error ANDROID_SMP should not be set to 1 for an ARM architecture less than 6
#  endif

#  ifdef __ARM_HAVE_DMB
/* For ARMv7-A, we can use the 'dmb' instruction directly */
__ATOMIC_INLINE__ void __bionic_memory_barrier(void) {
    /* Note: we always build in ARM or Thumb-2 on ARMv7-A, so don't
     * bother with __ATOMIC_SWITCH_TO_ARM */
    __asm__ __volatile__ ( "dmb" : : : "memory" );
}
#  else /* !__ARM_HAVE_DMB */
/* Otherwise, i.e. for multi-core ARMv6, we need to use the coprocessor,
 * which requires the use of a general-purpose register, which is slightly
 * less efficient.
 */
__ATOMIC_INLINE__ void __bionic_memory_barrier(void) {
    __asm__ __volatile__ (
        __ATOMIC_SWITCH_TO_ARM
        "mcr p15, 0, %0, c7, c10, 5"
        __ATOMIC_SWITCH_TO_THUMB
        : : "r" (0) : __ATOMIC_CLOBBERS "memory");
}
#  endif /* !__ARM_HAVE_DMB */
#else /* !ANDROID_SMP */
__ATOMIC_INLINE__ void __bionic_memory_barrier(void) {
    /* A simple compiler barrier */
    __asm__ __volatile__ ( "" : : : "memory" );
}
#endif /* !ANDROID_SMP */

/* Compare-and-swap, without any explicit barriers. Note that this function
 * returns 0 on success and 1 on failure. The opposite convention is typically
 * used on other platforms. A usage sketch follows the two implementations
 * below.
 *
 * There are two cases to consider:
 *
 *     - ARMv6+  => use LDREX/STREX instructions
 *     - < ARMv6 => use the kernel helper function mapped at 0xffff0fc0
 *
 * LDREX/STREX are only available starting from ARMv6.
 */
#ifdef __ARM_HAVE_LDREX_STREX
__ATOMIC_INLINE__ int __bionic_cmpxchg(int32_t old_value, int32_t new_value, volatile int32_t* ptr) {
    int32_t prev, status;
    do {
        __asm__ __volatile__ (
            __ATOMIC_SWITCH_TO_ARM
            "ldrex %0, [%3]\n"
            "mov %1, #0\n"
            "teq %0, %4\n"
#ifdef __thumb2__
            "it eq\n"
#endif
            "strexeq %1, %5, [%3]"
            __ATOMIC_SWITCH_TO_THUMB
            : "=&r" (prev), "=&r" (status), "+m" (*ptr)
            : "r" (ptr), "Ir" (old_value), "r" (new_value)
            : __ATOMIC_CLOBBERS "cc");
    } while (__builtin_expect(status != 0, 0));
    return prev != old_value;
}
#else /* !__ARM_HAVE_LDREX_STREX */
/* Use the handy kernel helper function mapped at 0xffff0fc0 */
typedef int (kernel_cmpxchg)(int32_t, int32_t, volatile int32_t *);

__ATOMIC_INLINE__ int __kernel_cmpxchg(int32_t old_value, int32_t new_value, volatile int32_t* ptr) {
    /* Note: the kernel function returns 0 on success too */
    return (*(kernel_cmpxchg *)0xffff0fc0)(old_value, new_value, ptr);
}

__ATOMIC_INLINE__ int __bionic_cmpxchg(int32_t old_value, int32_t new_value, volatile int32_t* ptr) {
    return __kernel_cmpxchg(old_value, new_value, ptr);
}
#endif /* !__ARM_HAVE_LDREX_STREX */
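/* Illustrative sketch, not part of the original header: how a caller builds a
 * read-modify-write operation on top of __bionic_cmpxchg, following the same
 * retry-loop pattern as the atomic increment/decrement further below. Note the
 * return convention documented above: 0 means the store succeeded, so the loop
 * retries while the result is non-zero. The function name is hypothetical and
 * the block is compiled out.
 */
#if 0
__ATOMIC_INLINE__ int32_t __example_atomic_or(int32_t value, volatile int32_t* ptr) {
    int32_t prev, status;
    do {
        prev = *ptr;
        status = __bionic_cmpxchg(prev, prev | value, ptr);
    } while (__builtin_expect(status != 0, 0));
    return prev;  /* old value */
}
#endif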
/* Swap operation, without any explicit barriers.
 * There are again two similar cases to consider:
 *
 *   - ARMv6+  => use LDREX/STREX
 *   - < ARMv6 => use SWP instead.
 *
 * (A sketch of a typical use, together with explicit barriers, appears at
 * the end of this file.)
 */
#ifdef __ARM_HAVE_LDREX_STREX
__ATOMIC_INLINE__ int32_t __bionic_swap(int32_t new_value, volatile int32_t* ptr) {
    int32_t prev, status;
    do {
        __asm__ __volatile__ (
            __ATOMIC_SWITCH_TO_ARM
            "ldrex %0, [%3]\n"
            "strex %1, %4, [%3]"
            __ATOMIC_SWITCH_TO_THUMB
            : "=&r" (prev), "=&r" (status), "+m" (*ptr)
            : "r" (ptr), "r" (new_value)
            : __ATOMIC_CLOBBERS "cc");
    } while (__builtin_expect(status != 0, 0));
    return prev;
}
#else /* !__ARM_HAVE_LDREX_STREX */
__ATOMIC_INLINE__ int32_t __bionic_swap(int32_t new_value, volatile int32_t* ptr) {
    int32_t prev;
    /* NOTE: SWP is available in Thumb-1 too */
    __asm__ __volatile__ ("swp %0, %2, [%3]"
                          : "=&r" (prev), "+m" (*ptr)
                          : "r" (new_value), "r" (ptr)
                          : "cc");
    return prev;
}
#endif /* !__ARM_HAVE_LDREX_STREX */

/* Atomic increment, without any barriers.
 * This returns the old value.
 */
#ifdef __ARM_HAVE_LDREX_STREX
__ATOMIC_INLINE__ int32_t __bionic_atomic_inc(volatile int32_t* ptr) {
    int32_t prev, tmp, status;
    do {
        __asm__ __volatile__ (
            __ATOMIC_SWITCH_TO_ARM
            "ldrex %0, [%4]\n"
            "add %1, %0, #1\n"
            "strex %2, %1, [%4]"
            __ATOMIC_SWITCH_TO_THUMB
            : "=&r" (prev), "=&r" (tmp), "=&r" (status), "+m" (*ptr)
            : "r" (ptr)
            : __ATOMIC_CLOBBERS "cc");
    } while (__builtin_expect(status != 0, 0));
    return prev;
}
#else
__ATOMIC_INLINE__ int32_t __bionic_atomic_inc(volatile int32_t* ptr) {
    int32_t prev, status;
    do {
        prev = *ptr;
        status = __kernel_cmpxchg(prev, prev+1, ptr);
    } while (__builtin_expect(status != 0, 0));
    return prev;
}
#endif

/* Atomic decrement, without any barriers.
 * This returns the old value.
 */
#ifdef __ARM_HAVE_LDREX_STREX
__ATOMIC_INLINE__ int32_t __bionic_atomic_dec(volatile int32_t* ptr) {
    int32_t prev, tmp, status;
    do {
        __asm__ __volatile__ (
            __ATOMIC_SWITCH_TO_ARM
            "ldrex %0, [%4]\n"
            "sub %1, %0, #1\n"
            "strex %2, %1, [%4]"
            __ATOMIC_SWITCH_TO_THUMB
            : "=&r" (prev), "=&r" (tmp), "=&r" (status), "+m" (*ptr)
            : "r" (ptr)
            : __ATOMIC_CLOBBERS "cc");
    } while (__builtin_expect(status != 0, 0));
    return prev;
}
#else
__ATOMIC_INLINE__ int32_t __bionic_atomic_dec(volatile int32_t* ptr) {
    int32_t prev, status;
    do {
        prev = *ptr;
        status = __kernel_cmpxchg(prev, prev-1, ptr);
    } while (__builtin_expect(status != 0, 0));
    return prev;
}
#endif

#endif /* BIONIC_ATOMIC_ARM_H */
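/* Illustrative sketch, not part of the original header: a minimal spinlock
 * built from the primitives above, showing where explicit barriers are needed
 * since none of the operations in this file imply one. All names are
 * hypothetical; the example is shown as a comment only.
 *
 *   __ATOMIC_INLINE__ void __example_spin_lock(volatile int32_t* lock) {
 *       // Keep swapping 1 in until the previous value was 0 (lock was free).
 *       while (__bionic_swap(1, lock) != 0) {
 *           // spin
 *       }
 *       __bionic_memory_barrier();  // acquire barrier
 *   }
 *
 *   __ATOMIC_INLINE__ void __example_spin_unlock(volatile int32_t* lock) {
 *       __bionic_memory_barrier();  // release barrier
 *       *lock = 0;
 *   }
 */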