/*
 *  Copyright(c) 2019-2021 Qualcomm Innovation Center, Inc. All Rights Reserved.
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, see <http://www.gnu.org/licenses/>.
 */

#include <stdio.h>

/*
 * DEBUG_PRINTF forwards its arguments to printf only when DEBUG is
 * non-zero; otherwise the arguments are not evaluated at all.
 */
#define DEBUG          0
#define DEBUG_PRINTF(...) \
    do { \
        if (!DEBUG) { \
            break; \
        } \
        printf(__VA_ARGS__); \
    } while (0)


/* Every test buffer occupies NBYTES bytes; the others are element counts */
#define NBYTES         256
#define NHALFS         (NBYTES / sizeof(short))
#define NWORDS         (NBYTES / sizeof(int))
#define NDOBLS         (NBYTES / sizeof(long long))

/* One zero-filled buffer per element width, each aligned to 4096 bytes */
long long     dbuf[NDOBLS] __attribute__((aligned(4096))) = { 0 };
int           wbuf[NWORDS] __attribute__((aligned(4096))) = { 0 };
short         hbuf[NHALFS] __attribute__((aligned(4096))) = { 0 };
unsigned char bbuf[NBYTES] __attribute__((aligned(4096))) = { 0 };

/*
 * We use the C preprocessor to deal with the combinations of types
 */

/*
 * INIT(BUF, N) defines init_<BUF>(), which rewrites the first N elements
 * of BUF so that each element holds its own index.
 */
#define INIT(BUF, N) \
    void init_##BUF(void) \
    { \
        int idx = 0; \
        while (idx < N) { \
            BUF[idx] = idx; \
            idx++; \
        } \
    }

INIT(bbuf, NBYTES)
INIT(hbuf, NHALFS)
INIT(wbuf, NWORDS)
INIT(dbuf, NDOBLS)

/*
 * Macros for performing circular load
 *     RES         result
 *     ADDR        address
 *     START       start address of buffer
 *     LEN         length of buffer (in bytes)
 *     INC         address increment (in bytes for IMM, elements for REG)
 *
 * CIRC_LOAD_IMM copies LEN into M0 (by way of the scratch register r4)
 * and START into CS0, then issues one post-incrementing circular load of
 * the width selected by SIZE.  INC is stringified into the instruction
 * text, so it has to be spelled as a constant the assembler accepts.
 * ADDR is a read-write operand and is updated by the load.
 *
 * The asm reads the buffer through ADDR without naming the buffer as an
 * operand, so "memory" is listed as a clobber; without it the compiler is
 * free to move the load across the C stores that fill the buffer.
 *
 * RES is not an early-clobber output: both inputs are consumed before the
 * final instruction writes it, and that order must be kept.
 */
#define CIRC_LOAD_IMM(SIZE, RES, ADDR, START, LEN, INC) \
    __asm__( \
        "r4 = %3\n\t" \
        "m0 = r4\n\t" \
        "cs0 = %2\n\t" \
        "%0 = mem" #SIZE "(%1++#" #INC ":circ(M0))\n\t" \
        : "=r"(RES), "+r"(ADDR) \
        : "r"(START), "r"(LEN) \
        : "r4", "m0", "cs0", "memory")
#define CIRC_LOAD_IMM_b(RES, ADDR, START, LEN, INC) \
    CIRC_LOAD_IMM(b, RES, ADDR, START, LEN, INC)
#define CIRC_LOAD_IMM_ub(RES, ADDR, START, LEN, INC) \
    CIRC_LOAD_IMM(ub, RES, ADDR, START, LEN, INC)
#define CIRC_LOAD_IMM_h(RES, ADDR, START, LEN, INC) \
    CIRC_LOAD_IMM(h, RES, ADDR, START, LEN, INC)
#define CIRC_LOAD_IMM_uh(RES, ADDR, START, LEN, INC) \
    CIRC_LOAD_IMM(uh, RES, ADDR, START, LEN, INC)
#define CIRC_LOAD_IMM_w(RES, ADDR, START, LEN, INC) \
    CIRC_LOAD_IMM(w, RES, ADDR, START, LEN, INC)
#define CIRC_LOAD_IMM_d(RES, ADDR, START, LEN, INC) \
    CIRC_LOAD_IMM(d, RES, ADDR, START, LEN, INC)

/*
 * The mreg has the following pieces
 *     mreg[31:28]              increment[10:7]
 *     mreg[27:24]              K value (used Hexagon v3 and earlier)
 *     mreg[23:17]              increment[6:0]
 *     mreg[16:0]               circular buffer length
 *
 * build_mreg packs the three fields into one register image.
 *     inc         increment; only bits [10:0] are kept, so a negative
 *                 value contributes its low 11 two's-complement bits
 *     K           K value; only bits [3:0] are kept
 *     len         buffer length; only bits [16:0] are kept
 * Returns the packed value.
 *
 * The fields are combined as unsigned values: increment[10:7] lands in
 * bits 31:28, and left-shifting a set bit into the sign position of an
 * int is undefined behaviour (it would happen for every negative inc).
 */
static int build_mreg(int inc, int K, int len)
{
    unsigned int inc_bits = (unsigned int)inc;
    unsigned int mreg = ((inc_bits & 0x780u) << 21) |
                        (((unsigned int)K & 0xfu) << 24) |
                        ((inc_bits & 0x7fu) << 17) |
                        ((unsigned int)len & 0x1ffffu);

    return (int)mreg;
}

/*
 * Register-increment circular load.  M1 is loaded (by way of the scratch
 * register r4) with build_mreg(INC, 0, LEN), CS1 with START, and one
 * "++I:circ(M1)" load of the width selected by SIZE is issued.  RES
 * receives the loaded value and ADDR, a read-write operand, is updated.
 *
 * The asm reads the buffer through ADDR without naming the buffer as an
 * operand, so "memory" is listed as a clobber; without it the compiler is
 * free to move the load across the C stores that fill the buffer.
 */
#define CIRC_LOAD_REG(SIZE, RES, ADDR, START, LEN, INC) \
    __asm__( \
        "r4 = %2\n\t" \
        "m1 = r4\n\t" \
        "cs1 = %3\n\t" \
        "%0 = mem" #SIZE "(%1++I:circ(M1))\n\t" \
        : "=r"(RES), "+r"(ADDR) \
        : "r"(build_mreg((INC), 0, (LEN))), \
          "r"(START) \
        : "r4", "m1", "cs1", "memory")
#define CIRC_LOAD_REG_b(RES, ADDR, START, LEN, INC) \
    CIRC_LOAD_REG(b, RES, ADDR, START, LEN, INC)
#define CIRC_LOAD_REG_ub(RES, ADDR, START, LEN, INC) \
    CIRC_LOAD_REG(ub, RES, ADDR, START, LEN, INC)
#define CIRC_LOAD_REG_h(RES, ADDR, START, LEN, INC) \
    CIRC_LOAD_REG(h, RES, ADDR, START, LEN, INC)
#define CIRC_LOAD_REG_uh(RES, ADDR, START, LEN, INC) \
    CIRC_LOAD_REG(uh, RES, ADDR, START, LEN, INC)
#define CIRC_LOAD_REG_w(RES, ADDR, START, LEN, INC) \
    CIRC_LOAD_REG(w, RES, ADDR, START, LEN, INC)
#define CIRC_LOAD_REG_d(RES, ADDR, START, LEN, INC) \
    CIRC_LOAD_REG(d, RES, ADDR, START, LEN, INC)

/*
 * Macros for performing circular store
 *     VAL         value to store
 *     ADDR        address
 *     START       start address of buffer
 *     LEN         length of buffer (in bytes)
 *     INC         address increment (in bytes for IMM, elements for REG)
 *
 * CIRC_STORE_IMM copies LEN into M0 (by way of the scratch register r4)
 * and START into CS0, then issues one post-incrementing circular store of
 * VAL at ADDR.
 *     SIZE        access-size suffix pasted after "mem" (b, h, w or d)
 *     PART        string appended to the source operand; the _f wrapper
 *                 passes ".H", every other wrapper passes ""
 * INC is stringified into the instruction text.  ADDR is a read-write
 * operand, and the asm declares a "memory" clobber because it writes the
 * buffer behind the compiler's back.
 */
#define CIRC_STORE_IMM(SIZE, PART, VAL, ADDR, START, LEN, INC) \
    __asm__( \
        "r4 = %3\n\t" \
        "m0 = r4\n\t" \
        "cs0 = %1\n\t" \
        "mem" #SIZE "(%0++#" #INC ":circ(M0)) = %2" PART "\n\t" \
        : "+r"(ADDR) \
        : "r"(START), "r"(VAL), "r"(LEN) \
        : "r4", "m0", "cs0", "memory")
#define CIRC_STORE_IMM_b(VAL, ADDR, START, LEN, INC) \
    CIRC_STORE_IMM(b, "", VAL, ADDR, START, LEN, INC)
#define CIRC_STORE_IMM_h(VAL, ADDR, START, LEN, INC) \
    CIRC_STORE_IMM(h, "", VAL, ADDR, START, LEN, INC)
#define CIRC_STORE_IMM_f(VAL, ADDR, START, LEN, INC) \
    CIRC_STORE_IMM(h, ".H", VAL, ADDR, START, LEN, INC)
#define CIRC_STORE_IMM_w(VAL, ADDR, START, LEN, INC) \
    CIRC_STORE_IMM(w, "", VAL, ADDR, START, LEN, INC)
#define CIRC_STORE_IMM_d(VAL, ADDR, START, LEN, INC) \
    CIRC_STORE_IMM(d, "", VAL, ADDR, START, LEN, INC)

/*
 * Immediate-increment circular store whose data comes from "r5.new".
 * M0 and CS0 are set up from LEN and START exactly as in CIRC_STORE_IMM;
 * the store is then placed inside one { } packet together with
 * "r5 = VAL" and takes r5.new as its source.  r5 is therefore listed as
 * a clobber in addition to r4, m0, cs0 and memory.  Wrappers exist for
 * the b, h and w access sizes only.
 */
#define CIRC_STORE_NEW_IMM(SIZE, VAL, ADDR, START, LEN, INC) \
    __asm__( \
        "r4 = %3\n\t" \
        "m0 = r4\n\t" \
        "cs0 = %1\n\t" \
        "{\n\t" \
        "    r5 = %2\n\t" \
        "    mem" #SIZE "(%0++#" #INC ":circ(M0)) = r5.new\n\t" \
        "}\n\t" \
        : "+r"(ADDR) \
        : "r"(START), "r"(VAL), "r"(LEN) \
        : "r4", "r5", "m0", "cs0", "memory")
#define CIRC_STORE_IMM_bnew(VAL, ADDR, START, LEN, INC) \
    CIRC_STORE_NEW_IMM(b, VAL, ADDR, START, LEN, INC)
#define CIRC_STORE_IMM_hnew(VAL, ADDR, START, LEN, INC) \
    CIRC_STORE_NEW_IMM(h, VAL, ADDR, START, LEN, INC)
#define CIRC_STORE_IMM_wnew(VAL, ADDR, START, LEN, INC) \
    CIRC_STORE_NEW_IMM(w, VAL, ADDR, START, LEN, INC)

/*
 * Register-increment circular store.  M1 is loaded (by way of the scratch
 * register r4) with build_mreg(INC, 0, LEN) and CS1 with START, then one
 * "++I:circ(M1)" store of VAL is issued at ADDR.
 *     SIZE        access-size suffix pasted after "mem" (b, h, w or d)
 *     PART        string appended to the source operand; the _f wrapper
 *                 passes ".H", every other wrapper passes ""
 * ADDR is a read-write operand; r4, m1, cs1 and memory are clobbered.
 */
#define CIRC_STORE_REG(SIZE, PART, VAL, ADDR, START, LEN, INC) \
    __asm__( \
        "r4 = %1\n\t" \
        "m1 = r4\n\t" \
        "cs1 = %2\n\t" \
        "mem" #SIZE "(%0++I:circ(M1)) = %3" PART "\n\t" \
        : "+r"(ADDR) \
        : "r"(build_mreg((INC), 0, (LEN))), \
          "r"(START), \
          "r"(VAL) \
        : "r4", "m1", "cs1", "memory")
#define CIRC_STORE_REG_b(VAL, ADDR, START, LEN, INC) \
    CIRC_STORE_REG(b, "", VAL, ADDR, START, LEN, INC)
#define CIRC_STORE_REG_h(VAL, ADDR, START, LEN, INC) \
    CIRC_STORE_REG(h, "", VAL, ADDR, START, LEN, INC)
#define CIRC_STORE_REG_f(VAL, ADDR, START, LEN, INC) \
    CIRC_STORE_REG(h, ".H", VAL, ADDR, START, LEN, INC)
#define CIRC_STORE_REG_w(VAL, ADDR, START, LEN, INC) \
    CIRC_STORE_REG(w, "", VAL, ADDR, START, LEN, INC)
#define CIRC_STORE_REG_d(VAL, ADDR, START, LEN, INC) \
    CIRC_STORE_REG(d, "", VAL, ADDR, START, LEN, INC)

/*
 * Register-increment circular store whose data comes from "r5.new".
 * M1 is loaded with build_mreg(INC, 0, LEN) and CS1 with START; the
 * "++I:circ(M1)" store is then placed inside one { } packet together
 * with "r5 = VAL" and takes r5.new as its source.  r5 is listed as a
 * clobber in addition to r4, m1, cs1 and memory.  Wrappers exist for the
 * b, h and w access sizes only.
 */
#define CIRC_STORE_NEW_REG(SIZE, VAL, ADDR, START, LEN, INC) \
    __asm__( \
        "r4 = %1\n\t" \
        "m1 = r4\n\t" \
        "cs1 = %2\n\t" \
        "{\n\t" \
        "    r5 = %3\n\t" \
        "    mem" #SIZE "(%0++I:circ(M1)) = r5.new\n\t" \
        "}\n\t" \
        : "+r"(ADDR) \
        : "r"(build_mreg((INC), 0, (LEN))), \
          "r"(START), \
          "r"(VAL) \
        : "r4", "r5", "m1", "cs1", "memory")
#define CIRC_STORE_REG_bnew(VAL, ADDR, START, LEN, INC) \
    CIRC_STORE_NEW_REG(b, VAL, ADDR, START, LEN, INC)
#define CIRC_STORE_REG_hnew(VAL, ADDR, START, LEN, INC) \
    CIRC_STORE_NEW_REG(h, VAL, ADDR, START, LEN, INC)
#define CIRC_STORE_REG_wnew(VAL, ADDR, START, LEN, INC) \
    CIRC_STORE_NEW_REG(w, VAL, ADDR, START, LEN, INC)


/* Running count of mismatches detected by the checkers */
int err;

/*
 * Compare a loaded value with the element the circular walk should have
 * reached.
 *     i           number of loads performed before this one
 *     result      value that was loaded
 *     inc         step per load, in elements (the tests use +1 and -1)
 *     size        number of elements in the circular window
 * The expected value is i * inc brought into [0, size) for a positive
 * size.  On a mismatch a message is printed and err is incremented.
 */
void check_load(int i, long long result, int inc, int size)
{
    int wrapped;

    /* Wrap by repeated subtraction, then by repeated addition */
    for (wrapped = i * inc; wrapped >= size; wrapped -= size) {
    }
    for (; wrapped < 0; wrapped += size) {
    }

    if (result == wrapped) {
        return;
    }
    printf("ERROR(%d): %lld != %d\n", i, result, wrapped);
    err++;
}

/*
 * TEST_LOAD_IMM defines circ_test_load_imm_<SZ>().  It performs BUFSIZE
 * circular loads through a 10-element window at the start of BUF, first
 * stepping by INC bytes and then, restarting from BUF, by -INC bytes.
 * Every loaded value is handed to check_load.  The (INC) and -(INC)
 * spellings end up stringified inside the load instruction.
 */
#define TEST_LOAD_IMM(SZ, TYPE, BUF, BUFSIZE, INC, FMT) \
void circ_test_load_imm_##SZ(void) \
{ \
    const int size = 10; \
    TYPE element; \
    TYPE *p; \
    int i; \
    /* Positive increment */ \
    for (i = 0, p = (TYPE *)BUF; i < BUFSIZE; i++) { \
        CIRC_LOAD_IMM_##SZ(element, p, BUF, size * sizeof(TYPE), (INC)); \
        DEBUG_PRINTF("i = %2d, p = 0x%p, element = %2" #FMT "\n", \
                     i, p, element); \
        check_load(i, element, ((INC) / (int)sizeof(TYPE)), size); \
    } \
    /* Negative increment, starting over at the head of the buffer */ \
    for (i = 0, p = (TYPE *)BUF; i < BUFSIZE; i++) { \
        CIRC_LOAD_IMM_##SZ(element, p, BUF, size * sizeof(TYPE), -(INC)); \
        DEBUG_PRINTF("i = %2d, p = 0x%p, element = %2" #FMT "\n", \
                     i, p, element); \
        check_load(i, element, (-(INC) / (int)sizeof(TYPE)), size); \
    } \
}

TEST_LOAD_IMM(b,  char,           bbuf, NBYTES, 1, d)
TEST_LOAD_IMM(ub, unsigned char,  bbuf, NBYTES, 1, d)
TEST_LOAD_IMM(h,  short,          hbuf, NHALFS, 2, d)
TEST_LOAD_IMM(uh, unsigned short, hbuf, NHALFS, 2, d)
TEST_LOAD_IMM(w,  int,            wbuf, NWORDS, 4, d)
TEST_LOAD_IMM(d,  long long,      dbuf, NDOBLS, 8, lld)

/*
 * TEST_LOAD_REG defines circ_test_load_reg_<SZ>().  It performs BUFSIZE
 * register-increment circular loads through a 13-element window at the
 * start of BUF, first with an increment of +1 element and then,
 * restarting from BUF, with -1.  Every loaded value is handed to
 * check_load.
 */
#define TEST_LOAD_REG(SZ, TYPE, BUF, BUFSIZE, FMT) \
void circ_test_load_reg_##SZ(void) \
{ \
    const int size = 13; \
    TYPE element; \
    TYPE *p; \
    int i; \
    /* Increment of +1 element */ \
    for (i = 0, p = (TYPE *)BUF; i < BUFSIZE; i++) { \
        CIRC_LOAD_REG_##SZ(element, p, BUF, size * sizeof(TYPE), 1); \
        DEBUG_PRINTF("i = %2d, p = 0x%p, element = %2" #FMT "\n", \
                     i, p, element); \
        check_load(i, element, 1, size); \
    } \
    /* Increment of -1 element, starting over at the head of the buffer */ \
    for (i = 0, p = (TYPE *)BUF; i < BUFSIZE; i++) { \
        CIRC_LOAD_REG_##SZ(element, p, BUF, size * sizeof(TYPE), -1); \
        DEBUG_PRINTF("i = %2d, p = 0x%p, element = %2" #FMT "\n", \
                     i, p, element); \
        check_load(i, element, -1, size); \
    } \
}

TEST_LOAD_REG(b,  char,           bbuf, NBYTES, d)
TEST_LOAD_REG(ub, unsigned char,  bbuf, NBYTES, d)
TEST_LOAD_REG(h,  short,          hbuf, NHALFS, d)
TEST_LOAD_REG(uh, unsigned short, hbuf, NHALFS, d)
TEST_LOAD_REG(w,  int,            wbuf, NWORDS, d)
TEST_LOAD_REG(d,  long long,      dbuf, NDOBLS, lld)

/*
 * The circular stores will wrap around somewhere inside the buffer.
 *
 * CIRC_VAL defines circ_val_<SZ>(i, inc, size): the value check_store_<SZ>
 * expects in element i of a size-element window after the store tests
 * have written BUFSIZE consecutive values stepping by inc (+1 or -1).
 * BUFSIZE % size is the length of the last, partial lap.
 */
#define CIRC_VAL(SZ, TYPE, BUFSIZE) \
TYPE circ_val_##SZ(int i, int inc, int size) \
{ \
    const int tail = BUFSIZE % size; \
    const int pos = i * inc; \
    TYPE expected; \
    if (pos >= 0) { \
        if (pos < tail) { \
            expected = (pos + BUFSIZE - tail); \
        } else { \
            expected = (pos + BUFSIZE - size - tail); \
        } \
    } else if (-pos <= size - tail) { \
        expected = (pos + BUFSIZE - tail); \
    } else { \
        expected = (pos + BUFSIZE + size - tail); \
    } \
    return expected; \
}

CIRC_VAL(b, unsigned char, NBYTES)
CIRC_VAL(h, short,         NHALFS)
CIRC_VAL(w, int,           NWORDS)
CIRC_VAL(d, long long,     NDOBLS)

/*
 * Circular stores should only write to the first "size" elements of the
 * buffer; the remainder of the elements should have BUF[i] == i.
 *
 * CHECK_STORE defines check_store_<SZ>(inc, size), which compares the
 * first size elements of BUF with circ_val_<SZ> and every later element
 * with its own index.  Each mismatch is printed and counted in err.
 */
#define CHECK_STORE(SZ, BUF, BUFSIZE, FMT) \
void check_store_##SZ(int inc, int size) \
{ \
    int i; \
    /* Elements inside the circular window */ \
    for (i = 0; i < size; i++) { \
        DEBUG_PRINTF(#BUF "[%3d] = 0x%02" #FMT ", guess = 0x%02" #FMT "\n", \
                     i, BUF[i], circ_val_##SZ(i, inc, size)); \
        if (BUF[i] == circ_val_##SZ(i, inc, size)) { \
            continue; \
        } \
        printf("ERROR(%3d): 0x%02" #FMT " != 0x%02" #FMT "\n", \
               i, BUF[i], circ_val_##SZ(i, inc, size)); \
        err++; \
    } \
    /* Elements past the window must be untouched */ \
    for (i = size; i < BUFSIZE; i++) { \
        if (BUF[i] == i) { \
            continue; \
        } \
        printf("ERROR(%3d): 0x%02" #FMT " != 0x%02x\n", i, BUF[i], i); \
        err++; \
    } \
}

CHECK_STORE(b, bbuf, NBYTES, x)
CHECK_STORE(h, hbuf, NHALFS, x)
CHECK_STORE(w, wbuf, NWORDS, x)
CHECK_STORE(d, dbuf, NDOBLS, llx)

/*
 * CIRC_TEST_STORE_IMM defines circ_test_store_imm_<SZ>().  After
 * re-initialising BUF it stores the values 0, 1, 2, ... (shifted left by
 * SHIFT) BUFSIZE times through a 27-element circular window, stepping by
 * INC bytes, and verifies the buffer with check_store_<CHK>.  The same
 * sequence is then repeated with an increment of -INC.  The INC and
 * -(INC) spellings end up stringified inside the store instruction.
 */
#define CIRC_TEST_STORE_IMM(SZ, CHK, TYPE, BUF, BUFSIZE, SHIFT, INC) \
void circ_test_store_imm_##SZ(void) \
{ \
    const unsigned int size = 27; \
    TYPE *p; \
    TYPE val; \
    int i; \
    /* Positive increment */ \
    init_##BUF(); \
    for (i = 0, p = BUF, val = 0; i < BUFSIZE; i++, val++) { \
        CIRC_STORE_IMM_##SZ(val << SHIFT, p, BUF, size * sizeof(TYPE), INC); \
    } \
    check_store_##CHK(((INC) / (int)sizeof(TYPE)), size); \
    /* Negative increment on a freshly initialised buffer */ \
    init_##BUF(); \
    for (i = 0, p = BUF, val = 0; i < BUFSIZE; i++, val++) { \
        CIRC_STORE_IMM_##SZ(val << SHIFT, p, BUF, size * sizeof(TYPE), \
                            -(INC)); \
    } \
    check_store_##CHK((-(INC) / (int)sizeof(TYPE)), size); \
}

CIRC_TEST_STORE_IMM(b,    b, unsigned char, bbuf, NBYTES, 0,  1)
CIRC_TEST_STORE_IMM(h,    h, short,         hbuf, NHALFS, 0,  2)
CIRC_TEST_STORE_IMM(f,    h, short,         hbuf, NHALFS, 16, 2)
CIRC_TEST_STORE_IMM(w,    w, int,           wbuf, NWORDS, 0,  4)
CIRC_TEST_STORE_IMM(d,    d, long long,     dbuf, NDOBLS, 0,  8)
CIRC_TEST_STORE_IMM(bnew, b, unsigned char, bbuf, NBYTES, 0,  1)
CIRC_TEST_STORE_IMM(hnew, h, short,         hbuf, NHALFS, 0,  2)
CIRC_TEST_STORE_IMM(wnew, w, int,           wbuf, NWORDS, 0,  4)

/*
 * CIRC_TEST_STORE_REG defines circ_test_store_reg_<SZ>().  After
 * re-initialising BUF it stores the values 0, 1, 2, ... (shifted left by
 * SHIFT) BUFSIZE times through a 19-element circular window with a
 * register increment of +1 element, and verifies the buffer with
 * check_store_<CHK>.  The same sequence is then repeated with -1.
 */
#define CIRC_TEST_STORE_REG(SZ, CHK, TYPE, BUF, BUFSIZE, SHIFT) \
void circ_test_store_reg_##SZ(void) \
{ \
    const unsigned int size = 19; \
    TYPE *p; \
    TYPE val; \
    int i; \
    /* Increment of +1 element */ \
    init_##BUF(); \
    for (i = 0, p = BUF, val = 0; i < BUFSIZE; i++, val++) { \
        CIRC_STORE_REG_##SZ(val << SHIFT, p, BUF, size * sizeof(TYPE), 1); \
    } \
    check_store_##CHK(1, size); \
    /* Increment of -1 element on a freshly initialised buffer */ \
    init_##BUF(); \
    for (i = 0, p = BUF, val = 0; i < BUFSIZE; i++, val++) { \
        CIRC_STORE_REG_##SZ(val << SHIFT, p, BUF, size * sizeof(TYPE), -1); \
    } \
    check_store_##CHK(-1, size); \
}

CIRC_TEST_STORE_REG(b,    b, unsigned char, bbuf, NBYTES, 0)
CIRC_TEST_STORE_REG(h,    h, short,         hbuf, NHALFS, 0)
CIRC_TEST_STORE_REG(f,    h, short,         hbuf, NHALFS, 16)
CIRC_TEST_STORE_REG(w,    w, int,           wbuf, NWORDS, 0)
CIRC_TEST_STORE_REG(d,    d, long long,     dbuf, NDOBLS, 0)
CIRC_TEST_STORE_REG(bnew, b, unsigned char, bbuf, NBYTES, 0)
CIRC_TEST_STORE_REG(hnew, h, short,         hbuf, NHALFS, 0)
CIRC_TEST_STORE_REG(wnew, w, int,           wbuf, NWORDS, 0)

/*
 * Test the old scheme used in Hexagon V3
 *
 * Performs NWORDS register-increment circular word loads over wbuf with a
 * non-zero K field in the modifier register (K = 4) and a 15-word window,
 * checking every loaded value with check_load.
 *
 * The asm reads wbuf through p without naming the buffer as an operand,
 * so "memory" is listed as a clobber; without it the compiler is free to
 * move the loads ahead of the stores done by init_wbuf().
 *
 * NOTE(review): unlike CIRC_LOAD_REG, this asm never writes cs1.  On a
 * core that takes the buffer start from CS1 the test presumably relies on
 * the value left there by an earlier test - confirm before reordering
 * the calls in main().
 */
static void circ_test_v3(void)
{
    int *p = wbuf;
    int size = 15;
    int K = 4;      /* 64 bytes */
    int element;
    int i;

    init_wbuf();

    for (i = 0; i < NWORDS; i++) {
        __asm__(
            "r4 = %2\n\t"
            "m1 = r4\n\t"
            "%0 = memw(%1++I:circ(M1))\n\t"
            : "=r"(element), "+r"(p)
            : "r"(build_mreg(1, K, size * sizeof(int)))
            : "r4", "m1", "memory");
        DEBUG_PRINTF("i = %2d, p = 0x%p, element = %2d\n", i, p, element);
        check_load(i, element, 1, size);
    }
}

/*
 * Fill the four buffers, run every load and store test in a fixed order,
 * and report the outcome: "PASS" and exit status 0 when err is still
 * zero, otherwise "FAIL" and exit status 1.
 */
int main()
{
    init_bbuf();
    init_hbuf();
    init_wbuf();
    init_dbuf();

    DEBUG_PRINTF("NBYTES = %d\n", NBYTES);
    DEBUG_PRINTF("Address of dbuf = 0x%p\n", dbuf);
    DEBUG_PRINTF("Address of wbuf = 0x%p\n", wbuf);
    DEBUG_PRINTF("Address of hbuf = 0x%p\n", hbuf);
    DEBUG_PRINTF("Address of bbuf = 0x%p\n", bbuf);

    /* Loads, immediate increment */
    circ_test_load_imm_b();
    circ_test_load_imm_ub();
    circ_test_load_imm_h();
    circ_test_load_imm_uh();
    circ_test_load_imm_w();
    circ_test_load_imm_d();

    /* Loads, register increment */
    circ_test_load_reg_b();
    circ_test_load_reg_ub();
    circ_test_load_reg_h();
    circ_test_load_reg_uh();
    circ_test_load_reg_w();
    circ_test_load_reg_d();

    /* Stores, immediate increment */
    circ_test_store_imm_b();
    circ_test_store_imm_h();
    circ_test_store_imm_f();
    circ_test_store_imm_w();
    circ_test_store_imm_d();
    circ_test_store_imm_bnew();
    circ_test_store_imm_hnew();
    circ_test_store_imm_wnew();

    /* Stores, register increment */
    circ_test_store_reg_b();
    circ_test_store_reg_h();
    circ_test_store_reg_f();
    circ_test_store_reg_w();
    circ_test_store_reg_d();
    circ_test_store_reg_bnew();
    circ_test_store_reg_hnew();
    circ_test_store_reg_wnew();

    /* Kept last, directly after the register-increment word stores */
    circ_test_v3();

    if (err) {
        puts("FAIL");
        return 1;
    }
    puts("PASS");
    return 0;
}