1*c217d954SCole FaustR"( 2*c217d954SCole Faust 3*c217d954SCole Faust 4*c217d954SCole Faust 5*c217d954SCole Faust 6*c217d954SCole Faust#ifndef ARM_COMPUTE_HELPER_H 7*c217d954SCole Faust#define ARM_COMPUTE_HELPER_H 8*c217d954SCole Faust 9*c217d954SCole Faust 10*c217d954SCole Faust 11*c217d954SCole Faust 12*c217d954SCole Faust#define STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 13*c217d954SCole Faust VSTORE(N0) \ 14*c217d954SCole Faust (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0)); 15*c217d954SCole Faust 16*c217d954SCole Faust#define STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 17*c217d954SCole Faust STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 18*c217d954SCole Faust VSTORE(N0) \ 19*c217d954SCole Faust (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1)); 20*c217d954SCole Faust 21*c217d954SCole Faust#define STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 22*c217d954SCole Faust STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 23*c217d954SCole Faust VSTORE(N0) \ 24*c217d954SCole Faust (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2)); 25*c217d954SCole Faust 26*c217d954SCole Faust#define STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 27*c217d954SCole Faust STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 28*c217d954SCole Faust VSTORE(N0) \ 29*c217d954SCole Faust (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3)); 30*c217d954SCole Faust 31*c217d954SCole Faust#define STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 32*c217d954SCole Faust STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 33*c217d954SCole Faust VSTORE(N0) \ 34*c217d954SCole Faust (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4)); 35*c217d954SCole Faust 36*c217d954SCole Faust#define STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 37*c217d954SCole Faust STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 38*c217d954SCole 
Faust VSTORE(N0) \ 39*c217d954SCole Faust (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5)); 40*c217d954SCole Faust 41*c217d954SCole Faust#define STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 42*c217d954SCole Faust STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 43*c217d954SCole Faust VSTORE(N0) \ 44*c217d954SCole Faust (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6)); 45*c217d954SCole Faust 46*c217d954SCole Faust#define STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 47*c217d954SCole Faust STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 48*c217d954SCole Faust VSTORE(N0) \ 49*c217d954SCole Faust (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7)); 50*c217d954SCole Faust 51*c217d954SCole Faust#define STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 52*c217d954SCole Faust STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 53*c217d954SCole Faust VSTORE(N0) \ 54*c217d954SCole Faust (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8)); 55*c217d954SCole Faust 56*c217d954SCole Faust#define STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 57*c217d954SCole Faust STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 58*c217d954SCole Faust VSTORE(N0) \ 59*c217d954SCole Faust (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9)); 60*c217d954SCole Faust 61*c217d954SCole Faust#define STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 62*c217d954SCole Faust STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 63*c217d954SCole Faust VSTORE(N0) \ 64*c217d954SCole Faust (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A)); 65*c217d954SCole Faust 66*c217d954SCole Faust#define STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 67*c217d954SCole Faust STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 68*c217d954SCole Faust VSTORE(N0) \ 69*c217d954SCole Faust (BASENAME##B, 0, (__global DATA_TYPE *)(PTR 
+ 11 * STRIDE_Y + Z##B)); 70*c217d954SCole Faust 71*c217d954SCole Faust#define STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 72*c217d954SCole Faust STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 73*c217d954SCole Faust VSTORE(N0) \ 74*c217d954SCole Faust (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C)); 75*c217d954SCole Faust 76*c217d954SCole Faust#define STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 77*c217d954SCole Faust STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 78*c217d954SCole Faust VSTORE(N0) \ 79*c217d954SCole Faust (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D)); 80*c217d954SCole Faust 81*c217d954SCole Faust#define STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 82*c217d954SCole Faust STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 83*c217d954SCole Faust VSTORE(N0) \ 84*c217d954SCole Faust (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E)); 85*c217d954SCole Faust 86*c217d954SCole Faust#define STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 87*c217d954SCole Faust STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 88*c217d954SCole Faust VSTORE(N0) \ 89*c217d954SCole Faust (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F)); 90*c217d954SCole Faust 91*c217d954SCole Faust 92*c217d954SCole Faust 93*c217d954SCole Faust#define CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 94*c217d954SCole Faust VSTORE(N0) \ 95*c217d954SCole Faust (CONVERT_SAT((BASENAME##0), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0)); 96*c217d954SCole Faust 97*c217d954SCole Faust#define CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 98*c217d954SCole Faust CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 99*c217d954SCole Faust VSTORE(N0) \ 100*c217d954SCole Faust (CONVERT_SAT((BASENAME##1), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE 
*)(PTR + 1 * STRIDE_Y + Z##1)); 101*c217d954SCole Faust 102*c217d954SCole Faust#define CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 103*c217d954SCole Faust CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 104*c217d954SCole Faust VSTORE(N0) \ 105*c217d954SCole Faust (CONVERT_SAT((BASENAME##2), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2)); 106*c217d954SCole Faust 107*c217d954SCole Faust#define CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 108*c217d954SCole Faust CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 109*c217d954SCole Faust VSTORE(N0) \ 110*c217d954SCole Faust (CONVERT_SAT((BASENAME##3), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3)); 111*c217d954SCole Faust 112*c217d954SCole Faust#define CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 113*c217d954SCole Faust CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 114*c217d954SCole Faust VSTORE(N0) \ 115*c217d954SCole Faust (CONVERT_SAT((BASENAME##4), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4)); 116*c217d954SCole Faust 117*c217d954SCole Faust#define CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 118*c217d954SCole Faust CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 119*c217d954SCole Faust VSTORE(N0) \ 120*c217d954SCole Faust (CONVERT_SAT((BASENAME##5), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5)); 121*c217d954SCole Faust 122*c217d954SCole Faust#define CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 123*c217d954SCole Faust CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 124*c217d954SCole Faust VSTORE(N0) \ 125*c217d954SCole Faust (CONVERT_SAT((BASENAME##6), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6)); 126*c217d954SCole Faust 127*c217d954SCole 
/*
 * CONVERT_STORE_ROW_8..16: continuation of the CONVERT_STORE_ROW_n family.
 * Row i (0-based) stores CONVERT_SAT((BASENAME##i), VEC_DATA_TYPE(DATA_TYPE, N0))
 * through VSTORE(N0) to PTR + i * STRIDE_Y + Z##i, cast to __global DATA_TYPE *.
 * Rows 10..15 use the hexadecimal suffixes A..F.
 */
#define CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##7), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)((PTR) + 7 * (STRIDE_Y) + Z##7));

#define CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##8), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)((PTR) + 8 * (STRIDE_Y) + Z##8));

/*
 * The second parameter must be named DATA_TYPE (it was previously DATA), so
 * that the type supplied by the caller is the one used in the body instead of
 * whatever DATA_TYPE happens to mean at the expansion site.
 */
#define CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)((PTR) + 9 * (STRIDE_Y) + Z##9));

#define CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##A), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)((PTR) + 10 * (STRIDE_Y) + Z##A));

#define CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##B), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)((PTR) + 11 * (STRIDE_Y) + Z##B));

#define CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##C), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)((PTR) + 12 * (STRIDE_Y) + Z##C));

#define CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##D), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)((PTR) + 13 * (STRIDE_Y) + Z##D));

#define CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##E), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)((PTR) + 14 * (STRIDE_Y) + Z##E));

#define CONVERT_STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##F), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)((PTR) + 15 * (STRIDE_Y) + Z##F));

/*
 * STORE_BLOCK(M0, N0, ...) dispatches to STORE_ROW_<M0>. The extra _STR level
 * lets M0 be macro-expanded before it is token-pasted.
 */
#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)

/*
 * CONVERT_STORE_BLOCK(M0, N0, ...) dispatches to CONVERT_STORE_ROW_<M0>, using
 * the same two-level expansion as STORE_BLOCK.
 */
#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)

/*
 * STORE_ROW_PARTIAL_n(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
 *
 * Expands to n consecutive VSTORE_PARTIAL(N0, STORE_N0) statements. Statement i
 * (0-based) passes BASENAME##i, offset 0 and the address
 * PTR + i * STRIDE_Y + Z##i cast to __global DATA_TYPE *. Each macro chains to
 * its predecessor and every statement carries its own trailing semicolon.
 */
#define STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##0, 0, (__global DATA_TYPE *)((PTR) + 0 * (STRIDE_Y) + Z##0));

#define STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##1, 0, (__global DATA_TYPE *)((PTR) + 1 * (STRIDE_Y) + Z##1));

#define STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##2, 0, (__global DATA_TYPE *)((PTR) + 2 * (STRIDE_Y) + Z##2));

#define STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##3, 0, (__global DATA_TYPE *)((PTR) + 3 * (STRIDE_Y) + Z##3));

#define STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##4, 0, (__global DATA_TYPE *)((PTR) + 4 * (STRIDE_Y) + Z##4));

#define STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##5, 0, (__global DATA_TYPE *)((PTR) + 5 * (STRIDE_Y) + Z##5));

#define STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##6, 0, (__global DATA_TYPE *)((PTR) + 6 * (STRIDE_Y) + Z##6));

#define STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##7, 0, (__global DATA_TYPE *)((PTR) + 7 * (STRIDE_Y) + Z##7));

#define STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##8, 0, (__global DATA_TYPE *)((PTR) + 8 * (STRIDE_Y) + Z##8));

#define STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##9, 0, (__global DATA_TYPE *)((PTR) + 9 * (STRIDE_Y) + Z##9));

/*
 * STORE_ROW_PARTIAL_11..16: continuation of the STORE_ROW_PARTIAL_n family.
 * Statement i (0-based) invokes VSTORE_PARTIAL(N0, STORE_N0) with BASENAME##i,
 * offset 0 and the address PTR + i * STRIDE_Y + Z##i cast to
 * __global DATA_TYPE *. Rows 10..15 use the hexadecimal suffixes A..F.
 */
#define STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##A, 0, (__global DATA_TYPE *)((PTR) + 10 * (STRIDE_Y) + Z##A));

#define STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##B, 0, (__global DATA_TYPE *)((PTR) + 11 * (STRIDE_Y) + Z##B));

#define STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##C, 0, (__global DATA_TYPE *)((PTR) + 12 * (STRIDE_Y) + Z##C));

#define STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##D, 0, (__global DATA_TYPE *)((PTR) + 13 * (STRIDE_Y) + Z##D));

#define STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##E, 0, (__global DATA_TYPE *)((PTR) + 14 * (STRIDE_Y) + Z##E));

#define STORE_ROW_PARTIAL_16(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##F, 0, (__global DATA_TYPE *)((PTR) + 15 * (STRIDE_Y) + Z##F));

/*
 * STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, ...) dispatches to
 * STORE_ROW_PARTIAL_<STORE_M0>. The extra _STR level lets STORE_M0 be
 * macro-expanded before it is token-pasted.
 */
#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)

/*
 * Run-time selection between a full M0 x N0 store and a reduced store:
 *   neither condition set -> M0 rows, N0 elements
 *   only PARTIAL_COND_Y   -> PARTIAL_STORE_M0 rows, N0 elements
 *   only PARTIAL_COND_X   -> M0 rows, PARTIAL_STORE_N0 elements
 *   both conditions set   -> PARTIAL_STORE_M0 rows, PARTIAL_STORE_N0 elements
 * The expansion is a bare if / else-if chain, not a single expression.
 */
#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y)) \
    { \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    } \
    else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X)) \
    { \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    } \
    else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X)) \
    { \
        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    } \
    else \
    { \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    }

/* Same selection restricted to the element count: full N0 unless PARTIAL_COND_X is set. */
#define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \
    if(!(PARTIAL_COND_X)) \
    { \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    } \
    else \
    { \
        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    }

/* Same selection restricted to the row count: full M0 unless PARTIAL_COND_Y is set. */
#define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \
    if(!(PARTIAL_COND_Y)) \
    { \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    } \
    else \
    { \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    }

/*
 * STORE_BLOCK_BOUNDARY_AWARE is only defined when both PARTIAL_STORE_M0 and
 * PARTIAL_STORE_N0 are defined. Their compile-time values choose which of the
 * store macros above it forwards to, so unused run-time checks are not emitted.
 */
#if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)

#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0

#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)

#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0

#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y)

#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0

#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X)

#else

#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)

#endif

#endif

/*
 * COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0)
 *
 * Without PARTIAL_STORE_M0 defined this is simply y * M0. With it defined the
 * result is shifted back by (M0 - PARTIAL_STORE_M0) % M0 and clamped at 0.
 * Every argument is parenthesized so compound expressions such as a + b are
 * evaluated as a unit.
 */
#if defined(PARTIAL_STORE_M0)

#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
    ((uint)(max(0, (int)((y) * (M0)) - (int)(((M0) - (PARTIAL_STORE_M0)) % (M0)))))
#else
#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
    ((uint)((y) * (M0)))
#endif

/*
 * Single-row store of basename##0: the full vec_size when cond is false,
 * leftover elements otherwise. Stride and Z are passed as the literal 0, so
 * the pasted Z##0 becomes the literal 00.
 */
#define STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond) \
    STORE_BLOCK_PARTIAL_IN_X(1, vec_size, data_type, basename, ptr, 0, 0, leftover, cond)

/* Optional OpenCL extensions, enabled only when requested by the build and reported by the compiler. */
#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif

#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
#endif

#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
#endif

#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
#pragma OPENCL EXTENSION cl_arm_printf : enable
#endif

/* GPU architecture identifiers. */
#define GPU_ARCH_MIDGARD 0x100
#define GPU_ARCH_BIFROST 0x200
#define GPU_ARCH_VALHALL 0x300

/* Pastes a and b as written; arguments are not macro-expanded first. */
#define CONCAT(a, b) a##b

/* Identity macro, used to force one more round of macro expansion. */
#define EXPAND(x) x

/* Limits x to the range [min_val, max_val] using min and max. */
#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)

/* REVs(x): vector x of s components with the component order reversed. */
#define REV1(x) ((x))
#define REV2(x) ((x).s10)
#define REV3(x) ((x).s210)
#define REV4(x) ((x).s3210)
#define REV8(x) ((x).s76543210)
#define REV16(x) ((x).sFEDCBA9876543210)

/* REVERSE(x, s) dispatches to REV<s>; the _STR level expands s before pasting. */
#define REVERSE_STR(x, s) REV##s((x))
#define REVERSE(x, s) REVERSE_STR(x, s)

/*
 * ROTs_n(x): vector x of s components rotated by n positions, so that
 * component 0 of x ends up at index n of the result. n = 0 and n = s return x
 * unchanged.
 */
#define ROT1_0(x) ((x))
#define ROT1_1(x) ((x))

#define ROT2_0(x) ((x))
#define ROT2_1(x) ((x).s10)
#define ROT2_2(x) ((x))

#define ROT3_0(x) ((x))
#define ROT3_1(x) ((x).s201)
#define ROT3_2(x) ((x).s120)
#define ROT3_3(x) ((x))

#define ROT4_0(x) ((x))
#define ROT4_1(x) ((x).s3012)
#define ROT4_2(x) ((x).s2301)
#define ROT4_3(x) ((x).s1230)
#define ROT4_4(x) ((x))

#define ROT8_0(x) ((x))
#define ROT8_1(x) ((x).s70123456)
#define ROT8_2(x) ((x).s67012345)
#define ROT8_3(x) ((x).s56701234)
#define ROT8_4(x) ((x).s45670123)
#define ROT8_5(x) ((x).s34567012)
#define ROT8_6(x) ((x).s23456701)
#define ROT8_7(x) ((x).s12345670)
#define ROT8_8(x) ((x))

#define ROT16_0(x) ((x))
#define ROT16_1(x) ((x).sF0123456789ABCDE)
#define ROT16_2(x) ((x).sEF0123456789ABCD)
#define ROT16_3(x) ((x).sDEF0123456789ABC)
#define ROT16_4(x) ((x).sCDEF0123456789AB)
#define ROT16_5(x) ((x).sBCDEF0123456789A)
#define ROT16_6(x) ((x).sABCDEF0123456789)
#define ROT16_7(x) ((x).s9ABCDEF012345678)
#define ROT16_8(x) ((x).s89ABCDEF01234567)
#define ROT16_9(x) ((x).s789ABCDEF0123456)
#define ROT16_10(x) ((x).s6789ABCDEF012345)
#define ROT16_11(x) ((x).s56789ABCDEF01234)
#define ROT16_12(x) ((x).s456789ABCDEF0123)
/* Remaining 16-component rotations: component 0 of x ends up at index 13, 14, 15; 16 is the identity. */
#define ROT16_13(x) ((x).s3456789ABCDEF012)
#define ROT16_14(x) ((x).s23456789ABCDEF01)
#define ROT16_15(x) ((x).s123456789ABCDEF0)
#define ROT16_16(x) ((x))

/* ROTATE(x, s, n) dispatches to ROT<s>_<n>; the _STR level expands s and n before pasting. */
#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
#define ROTATE(x, s, n) ROTATE_STR(x, s, n)

/* V_OFFSs(dt): vector literal of type dt##s holding the values 0, 1, ..., s - 1. */
#define V_OFFS1(dt) (dt##1)(0)
#define V_OFFS2(dt) (dt##2)(0, 1)
#define V_OFFS3(dt) (dt##3)(0, 1, 2)
#define V_OFFS4(dt) (dt##4)(0, 1, 2, 3)
#define V_OFFS8(dt) (dt##8)(0, 1, 2, 3, 4, 5, 6, 7)
#define V_OFFS16(dt) (dt##16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)

/* VEC_OFFS(dt, s) dispatches to V_OFFS<s>; the _STR level expands s before pasting. */
#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)

/* VLOAD(size) yields the identifier vload<size>; the _STR level expands size before pasting. */
#define VLOAD_STR(size) vload##size
#define VLOAD(size) VLOAD_STR(size)

/*
 * VLOAD_PARTIAL(size, load_size) yields the identifier
 * vload_partial_<size>_<load_size>, which the table below maps to either a
 * concrete loader or NO_LOAD.
 */
#define VLOAD_PARTIAL_STR(size, load_size) vload_partial_##size##_##load_size
#define VLOAD_PARTIAL(size, load_size) VLOAD_PARTIAL_STR(size, load_size)

/* Placeholder for unsupported size / load_size pairs: expands to an empty block. */
#define NO_LOAD(data, offs, ptr) \
    { \
    }

/*
 * Dispatch table, size 1.
 * NOTE(review): vload_partial_1_1 maps to vload1 whereas the wider sizes map
 * load_size 1 to vload_partial_1; vload1 is defined outside this section, so
 * confirm that it accepts the (data, offs, ptr) argument list.
 */
#define vload_partial_1_0 NO_LOAD
#define vload_partial_1_1 vload1
#define vload_partial_1_2 NO_LOAD
#define vload_partial_1_3 NO_LOAD
/*
 * Dispatch table for vload_partial_<size>_<load_size>.
 * A load_size between 1 and size maps to vload_partial_<load_size>; a
 * load_size of 0 or one larger than size maps to NO_LOAD.
 */
#define vload_partial_1_4 NO_LOAD
#define vload_partial_1_5 NO_LOAD
#define vload_partial_1_6 NO_LOAD
#define vload_partial_1_7 NO_LOAD
#define vload_partial_1_8 NO_LOAD
#define vload_partial_1_9 NO_LOAD
#define vload_partial_1_10 NO_LOAD
#define vload_partial_1_11 NO_LOAD
#define vload_partial_1_12 NO_LOAD
#define vload_partial_1_13 NO_LOAD
#define vload_partial_1_14 NO_LOAD
#define vload_partial_1_15 NO_LOAD
#define vload_partial_1_16 NO_LOAD

/* size 2 */
#define vload_partial_2_0 NO_LOAD
#define vload_partial_2_1 vload_partial_1
#define vload_partial_2_2 vload_partial_2
#define vload_partial_2_3 NO_LOAD
#define vload_partial_2_4 NO_LOAD
#define vload_partial_2_5 NO_LOAD
#define vload_partial_2_6 NO_LOAD
#define vload_partial_2_7 NO_LOAD
#define vload_partial_2_8 NO_LOAD
#define vload_partial_2_9 NO_LOAD
#define vload_partial_2_10 NO_LOAD
#define vload_partial_2_11 NO_LOAD
#define vload_partial_2_12 NO_LOAD
#define vload_partial_2_13 NO_LOAD
#define vload_partial_2_14 NO_LOAD
#define vload_partial_2_15 NO_LOAD
#define vload_partial_2_16 NO_LOAD

/* size 3 */
#define vload_partial_3_0 NO_LOAD
#define vload_partial_3_1 vload_partial_1
#define vload_partial_3_2 vload_partial_2
#define vload_partial_3_3 vload_partial_3
#define vload_partial_3_4 NO_LOAD
#define vload_partial_3_5 NO_LOAD
#define vload_partial_3_6 NO_LOAD
#define vload_partial_3_7 NO_LOAD
#define vload_partial_3_8 NO_LOAD
#define vload_partial_3_9 NO_LOAD
#define vload_partial_3_10 NO_LOAD
#define vload_partial_3_11 NO_LOAD
#define vload_partial_3_12 NO_LOAD
#define vload_partial_3_13 NO_LOAD
#define vload_partial_3_14 NO_LOAD
#define vload_partial_3_15 NO_LOAD
#define vload_partial_3_16 NO_LOAD

/* size 4 */
#define vload_partial_4_0 NO_LOAD
#define vload_partial_4_1 vload_partial_1
#define vload_partial_4_2 vload_partial_2
#define vload_partial_4_3 vload_partial_3
#define vload_partial_4_4 vload_partial_4
#define vload_partial_4_5 NO_LOAD
#define vload_partial_4_6 NO_LOAD
#define vload_partial_4_7 NO_LOAD
#define vload_partial_4_8 NO_LOAD
#define vload_partial_4_9 NO_LOAD
#define vload_partial_4_10 NO_LOAD
#define vload_partial_4_11 NO_LOAD
#define vload_partial_4_12 NO_LOAD
#define vload_partial_4_13 NO_LOAD
#define vload_partial_4_14 NO_LOAD
#define vload_partial_4_15 NO_LOAD
#define vload_partial_4_16 NO_LOAD

/* size 8 */
#define vload_partial_8_0 NO_LOAD
#define vload_partial_8_1 vload_partial_1
#define vload_partial_8_2 vload_partial_2
#define vload_partial_8_3 vload_partial_3
#define vload_partial_8_4 vload_partial_4
#define vload_partial_8_5 vload_partial_5
#define vload_partial_8_6 vload_partial_6
#define vload_partial_8_7 vload_partial_7
#define vload_partial_8_8 vload_partial_8
#define vload_partial_8_9 NO_LOAD
#define vload_partial_8_10 NO_LOAD
#define vload_partial_8_11 NO_LOAD
#define vload_partial_8_12 NO_LOAD
#define vload_partial_8_13 NO_LOAD
#define vload_partial_8_14 NO_LOAD
#define vload_partial_8_15 NO_LOAD
#define vload_partial_8_16 NO_LOAD

/* size 16 */
#define vload_partial_16_0 NO_LOAD
#define vload_partial_16_1 vload_partial_1
#define vload_partial_16_2 vload_partial_2
#define vload_partial_16_3 vload_partial_3
#define vload_partial_16_4 vload_partial_4
#define vload_partial_16_5 vload_partial_5
#define vload_partial_16_6 vload_partial_6
#define vload_partial_16_7 vload_partial_7
#define vload_partial_16_8 vload_partial_8
#define vload_partial_16_9 vload_partial_9
#define vload_partial_16_10 vload_partial_10
#define vload_partial_16_11 vload_partial_11
#define vload_partial_16_12 vload_partial_12
#define vload_partial_16_13 vload_partial_13
#define vload_partial_16_14 vload_partial_14
#define vload_partial_16_15 vload_partial_15
/* Last entry of the size-16 row of the vload_partial_<size>_<load_size> table. */
#define vload_partial_16_16 vload_partial_16

/*
 * vload_partial_<n>(DATA, OFFSET, PTR): assign the first n components of DATA
 * from memory at PTR.
 *
 * Sizes 1, 2, 3, 4 and 8 map to a single vload<n> call. The other sizes are
 * composed from a 4- or 8-wide load of the leading components followed by a
 * load of the remaining components from PTR + 4 or PTR + 8.
 *
 * Every expansion ends with a semicolon and the composed variants expand to
 * two statements, so an invocation must not be used as the unbraced body of
 * an if/else or loop.
 *
 * NOTE(review): the OpenCL vload<n>(OFFSET, PTR) builtins address
 * PTR + OFFSET * n, while the tail loads here reuse OFFSET unchanged after
 * advancing PTR, so the composed variants only look consistent when OFFSET
 * is 0 - confirm that callers always pass 0.
 */
#define vload_partial_1(DATA, OFFSET, PTR) \
    DATA.s0 = vload1(OFFSET, PTR);

#define vload_partial_2(DATA, OFFSET, PTR) \
    DATA.s01 = vload2(OFFSET, PTR);

#define vload_partial_3(DATA, OFFSET, PTR) \
    DATA.s012 = vload3(OFFSET, PTR);

#define vload_partial_4(DATA, OFFSET, PTR) \
    DATA.s0123 = vload4(OFFSET, PTR);

#define vload_partial_5(DATA, OFFSET, PTR)    \
    vload_partial_4(DATA.s0123, OFFSET, PTR); \
    DATA.s4 = vload1(OFFSET, PTR + 4);

#define vload_partial_6(DATA, OFFSET, PTR)    \
    vload_partial_4(DATA.s0123, OFFSET, PTR); \
    vload_partial_2(DATA.s45, OFFSET, PTR + 4);

#define vload_partial_7(DATA, OFFSET, PTR)    \
    vload_partial_4(DATA.s0123, OFFSET, PTR); \
    vload_partial_3(DATA.s456, OFFSET, PTR + 4);

#define vload_partial_8(DATA, OFFSET, PTR) \
    DATA.s01234567 = vload8(OFFSET, PTR);

#define vload_partial_9(DATA, OFFSET, PTR)        \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    DATA.s8 = vload1(OFFSET, PTR + 8);

#define vload_partial_10(DATA, OFFSET, PTR)       \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_2(DATA.s89, OFFSET, PTR + 8);

/*
 * vload_partial_<n> for n = 11..16. Sizes 11..15 load the first eight
 * components with vload_partial_8 and the remaining ones from PTR + 8;
 * size 16 is a single vload16. As with the smaller sizes, each expansion
 * ends with a semicolon and sizes 11..15 expand to more than one statement.
 */
#define vload_partial_11(DATA, OFFSET, PTR)       \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_3(DATA.s89A, OFFSET, PTR + 8);

#define vload_partial_12(DATA, OFFSET, PTR)       \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_4(DATA.s89AB, OFFSET, PTR + 8);

#define vload_partial_13(DATA, OFFSET, PTR)       \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_5(DATA.s89ABCDEF, OFFSET, PTR + 8);

#define vload_partial_14(DATA, OFFSET, PTR)       \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_6(DATA.s89ABCDEF, OFFSET, PTR + 8);

#define vload_partial_15(DATA, OFFSET, PTR)       \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_7(DATA.s89ABCDEF, OFFSET, PTR + 8);

#define vload_partial_16(DATA, OFFSET, PTR) \
    DATA = vload16(OFFSET, PTR);

/*
 * Number of image reads used for a vector of 4, 8 or 16 elements. The
 * read/write image macros below move four components per read/write call,
 * which is where the 1 / 2 / 4 values come from.
 */
#define PIXEL_UNIT4 1
#define PIXEL_UNIT8 2
#define PIXEL_UNIT16 4

/* Map a vector size (4, 8, 16) to its PIXEL_UNIT<size> value; the two-level
 * form lets vec_size itself be a macro that is expanded before pasting. */
#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) PIXEL_UNIT##vec_size
#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size) CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size)

/*
 * read_image2d_<type>x<k>(img, x_coord, y_coord): build a vector of 4 * k
 * components from k read_image calls at x_coord, x_coord + 1, ... on row
 * y_coord. Note that every expansion already ends with a semicolon.
 */
#define read_image2d_floatx1(img, x_coord, y_coord) (float4)(read_imagef(img, (int2)(x_coord, y_coord)));
#define read_image2d_floatx2(img, x_coord, y_coord)           \
    (float8)(read_imagef(img, (int2)(x_coord, y_coord)),      \
             read_imagef(img, (int2)(x_coord + 1, y_coord)));
#define read_image2d_floatx4(img, x_coord, y_coord)           \
    (float16)(read_imagef(img, (int2)(x_coord, y_coord)),     \
              read_imagef(img, (int2)(x_coord + 1, y_coord)), \
              read_imagef(img, (int2)(x_coord + 2, y_coord)), \
              read_imagef(img, (int2)(x_coord + 3, y_coord)));

#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#define read_image2d_halfx1(img, x_coord, y_coord) (half4)(read_imageh(img, (int2)(x_coord, y_coord)));
#define read_image2d_halfx2(img, x_coord, y_coord)           \
    (half8)(read_imageh(img, (int2)(x_coord, y_coord)),      \
            read_imageh(img, (int2)(x_coord + 1, y_coord)));
#define read_image2d_halfx4(img, x_coord, y_coord)           \
    (half16)(read_imageh(img, (int2)(x_coord, y_coord)),     \
             read_imageh(img, (int2)(x_coord + 1, y_coord)), \
             read_imageh(img, (int2)(x_coord + 2, y_coord)), \
             read_imageh(img, (int2)(x_coord + 3, y_coord)));
#endif

/*
 * write_image2d_<type>x<k>(img, x_coord, y_coord, values): write the
 * components of values four at a time with k write_image calls at x_coord,
 * x_coord + 1, ... on row y_coord. Every expansion ends with a semicolon.
 */
#define write_image2d_floatx1(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values));
#define write_image2d_floatx2(img, x_coord, y_coord, values)          \
    (write_imagef(img, (int2)(x_coord, y_coord), values.s0123),       \
     write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567));
#define write_image2d_floatx4(img, x_coord, y_coord, values)          \
    (write_imagef(img, (int2)(x_coord, y_coord), values.s0123),       \
     write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567),   \
     write_imagef(img, (int2)(x_coord + 2, y_coord), values.s89AB),   \
     write_imagef(img, (int2)(x_coord + 3, y_coord), values.sCDEF));

#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#define write_image2d_halfx1(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values));
#define write_image2d_halfx2(img, x_coord, y_coord, values)           \
    (write_imageh(img, (int2)(x_coord, y_coord), values.s0123),       \
     write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567));
#define write_image2d_halfx4(img, x_coord, y_coord, values)           \
    (write_imageh(img, (int2)(x_coord, y_coord), values.s0123),       \
     write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567),   \
     write_imageh(img, (int2)(x_coord + 2, y_coord), values.s89AB),   \
     write_imageh(img, (int2)(x_coord + 3, y_coord), values.sCDEF));
#endif

/* Select read_image2d_<data_type>x<n0>; the two-level form expands macro
 * arguments before they are pasted into the name. */
#define READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) read_image2d_##data_type##x##n0(img, x_coord, y_coord)
#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord)

/* Select write_image2d_<data_type>x<n0>, with the same two-level expansion. */
#define WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) write_image2d_##data_type##x##n0(img, x_coord, y_coord, values)
#define WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values) WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values)

/* VSTORE(size) names vstore<size>; size may itself be a macro. */
#define VSTORE_STR(size) vstore##size
#define VSTORE(size) VSTORE_STR(size)

/* Size-1 spellings of the scalar types, so that <type><size> token pasting
 * also resolves when size is 1. */
#define float1 float
#define half1 half
#define char1 char
#define uchar1 uchar
#define short1 short
#define ushort1 ushort
#define int1 int
#define uint1 uint
#define long1 long
#define ulong1 ulong
#define double1 double

/*
 * Scalar counterparts of vload<n> / vstore<n>, so that the size-1 names
 * produced by VSTORE(1) and the vload/vstore_partial macros resolve.
 * vload1 yields the element at PTR + OFFSET; vstore1 assigns DATA to it.
 * Arguments and the whole expansion are parenthesized so that compound
 * argument expressions and surrounding operators cannot regroup the
 * dereference or the assignment.
 */
#define vload1(OFFSET, PTR) (*((OFFSET) + (PTR)))
#define vstore1(DATA, OFFSET, PTR) (*((OFFSET) + (PTR)) = (DATA))

/* VSTORE_PARTIAL(size, store_size) names the table entry
 * vstore_partial_<size>_<store_size>; both arguments may be macros. */
#define VSTORE_PARTIAL_STR(size, store_size) vstore_partial_##size##_##store_size
#define VSTORE_PARTIAL(size, store_size) VSTORE_PARTIAL_STR(size, store_size)

/* Table target for combinations that store nothing: expands to an empty
 * block and ignores its arguments. */
#define NO_STORE(data, offs, ptr) \
    {                             \
    }

/*
 * vstore_partial_<size>_<store_size> dispatch table (continues after this
 * point). An entry maps to a store macro while 1 <= store_size <= size and
 * to NO_STORE otherwise.
 */

/* size 1 */
#define vstore_partial_1_0 NO_STORE
#define vstore_partial_1_1 vstore1
#define vstore_partial_1_2 NO_STORE
#define vstore_partial_1_3 NO_STORE
#define vstore_partial_1_4 NO_STORE
#define vstore_partial_1_5 NO_STORE
#define vstore_partial_1_6 NO_STORE
#define vstore_partial_1_7 NO_STORE
#define vstore_partial_1_8 NO_STORE
#define vstore_partial_1_9 NO_STORE
#define vstore_partial_1_10 NO_STORE
#define vstore_partial_1_11 NO_STORE
#define vstore_partial_1_12 NO_STORE
#define vstore_partial_1_13 NO_STORE
#define vstore_partial_1_14 NO_STORE
#define vstore_partial_1_15 NO_STORE
#define vstore_partial_1_16 NO_STORE

/* size 2 (entries 2_5 .. 2_16 follow this point) */
#define vstore_partial_2_0 NO_STORE
#define vstore_partial_2_1 vstore_partial_1
#define vstore_partial_2_2 vstore_partial_2
#define vstore_partial_2_3 NO_STORE
#define vstore_partial_2_4 NO_STORE
/*
 * vstore_partial_<size>_<store_size> dispatch table, continued. In the rows
 * below an entry maps to vstore_partial_<store_size> while
 * 1 <= store_size <= size and to NO_STORE otherwise.
 */

/* size 2 (entries 2_0 .. 2_4 precede this point) */
#define vstore_partial_2_5 NO_STORE
#define vstore_partial_2_6 NO_STORE
#define vstore_partial_2_7 NO_STORE
#define vstore_partial_2_8 NO_STORE
#define vstore_partial_2_9 NO_STORE
#define vstore_partial_2_10 NO_STORE
#define vstore_partial_2_11 NO_STORE
#define vstore_partial_2_12 NO_STORE
#define vstore_partial_2_13 NO_STORE
#define vstore_partial_2_14 NO_STORE
#define vstore_partial_2_15 NO_STORE
#define vstore_partial_2_16 NO_STORE

/* size 3 */
#define vstore_partial_3_0 NO_STORE
#define vstore_partial_3_1 vstore_partial_1
#define vstore_partial_3_2 vstore_partial_2
#define vstore_partial_3_3 vstore_partial_3
#define vstore_partial_3_4 NO_STORE
#define vstore_partial_3_5 NO_STORE
#define vstore_partial_3_6 NO_STORE
#define vstore_partial_3_7 NO_STORE
#define vstore_partial_3_8 NO_STORE
#define vstore_partial_3_9 NO_STORE
#define vstore_partial_3_10 NO_STORE
#define vstore_partial_3_11 NO_STORE
#define vstore_partial_3_12 NO_STORE
#define vstore_partial_3_13 NO_STORE
#define vstore_partial_3_14 NO_STORE
#define vstore_partial_3_15 NO_STORE
#define vstore_partial_3_16 NO_STORE

/* size 4 (entries 4_3 .. 4_16 follow this point) */
#define vstore_partial_4_0 NO_STORE
#define vstore_partial_4_1 vstore_partial_1
#define vstore_partial_4_2 vstore_partial_2
/*
 * vstore_partial_<size>_<store_size> dispatch table, final part. In the rows
 * below an entry maps to vstore_partial_<store_size> while
 * 1 <= store_size <= size and to NO_STORE otherwise.
 */

/* size 4 (entries 4_0 .. 4_2 precede this point) */
#define vstore_partial_4_3 vstore_partial_3
#define vstore_partial_4_4 vstore_partial_4
#define vstore_partial_4_5 NO_STORE
#define vstore_partial_4_6 NO_STORE
#define vstore_partial_4_7 NO_STORE
#define vstore_partial_4_8 NO_STORE
#define vstore_partial_4_9 NO_STORE
#define vstore_partial_4_10 NO_STORE
#define vstore_partial_4_11 NO_STORE
#define vstore_partial_4_12 NO_STORE
#define vstore_partial_4_13 NO_STORE
#define vstore_partial_4_14 NO_STORE
#define vstore_partial_4_15 NO_STORE
#define vstore_partial_4_16 NO_STORE

/* size 8 */
#define vstore_partial_8_0 NO_STORE
#define vstore_partial_8_1 vstore_partial_1
#define vstore_partial_8_2 vstore_partial_2
#define vstore_partial_8_3 vstore_partial_3
#define vstore_partial_8_4 vstore_partial_4
#define vstore_partial_8_5 vstore_partial_5
#define vstore_partial_8_6 vstore_partial_6
#define vstore_partial_8_7 vstore_partial_7
#define vstore_partial_8_8 vstore_partial_8
#define vstore_partial_8_9 NO_STORE
#define vstore_partial_8_10 NO_STORE
#define vstore_partial_8_11 NO_STORE
#define vstore_partial_8_12 NO_STORE
#define vstore_partial_8_13 NO_STORE
#define vstore_partial_8_14 NO_STORE
#define vstore_partial_8_15 NO_STORE
#define vstore_partial_8_16 NO_STORE

/* size 16 */
#define vstore_partial_16_0 NO_STORE
#define vstore_partial_16_1 vstore_partial_1
#define vstore_partial_16_2 vstore_partial_2
#define vstore_partial_16_3 vstore_partial_3
#define vstore_partial_16_4 vstore_partial_4
#define vstore_partial_16_5 vstore_partial_5
#define vstore_partial_16_6 vstore_partial_6
#define vstore_partial_16_7 vstore_partial_7
#define vstore_partial_16_8 vstore_partial_8
#define vstore_partial_16_9 vstore_partial_9
#define vstore_partial_16_10 vstore_partial_10
#define vstore_partial_16_11 vstore_partial_11
#define vstore_partial_16_12 vstore_partial_12
#define vstore_partial_16_13 vstore_partial_13
#define vstore_partial_16_14 vstore_partial_14
#define vstore_partial_16_15 vstore_partial_15
#define vstore_partial_16_16 vstore_partial_16

/*
 * vstore_partial_<n>(DATA, OFFSET, PTR): store the first n components of
 * DATA to memory at PTR.
 *
 * Sizes 1..4 map to a single vstore<n> call; size 5 stores four components
 * and then component s4 at PTR + 4. Every expansion ends with a semicolon
 * and size 5 expands to two statements, so an invocation must not be used
 * as the unbraced body of an if/else or loop.
 */
#define vstore_partial_1(DATA, OFFSET, PTR) \
    vstore1(DATA.s0, OFFSET, PTR);

#define vstore_partial_2(DATA, OFFSET, PTR) \
    vstore2(DATA.s01, OFFSET, PTR);

#define vstore_partial_3(DATA, OFFSET, PTR) \
    vstore3(DATA.s012, OFFSET, PTR);

#define vstore_partial_4(DATA, OFFSET, PTR) \
    vstore4(DATA.s0123, OFFSET, PTR);

#define vstore_partial_5(DATA, OFFSET, PTR)    \
    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
    vstore1(DATA.s4, OFFSET, PTR + 4);

/*
 * vstore_partial_<n> for n = 6..16. Sizes 6 and 7 store four components and
 * then the remainder at PTR + 4; sizes 9..15 store eight components and then
 * the remainder at PTR + 8; sizes 8 and 16 are a single vstore8 / vstore16.
 * Each expansion ends with a semicolon and the composed sizes expand to more
 * than one statement. Component selectors use upper-case hex digits, the
 * same spelling as the vload_partial_<n> family.
 */
#define vstore_partial_6(DATA, OFFSET, PTR)    \
    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
    vstore_partial_2(DATA.s45, OFFSET, PTR + 4);

#define vstore_partial_7(DATA, OFFSET, PTR)    \
    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
    vstore_partial_3(DATA.s456, OFFSET, PTR + 4);

#define vstore_partial_8(DATA, OFFSET, PTR) \
    vstore8(DATA.s01234567, OFFSET, PTR);

#define vstore_partial_9(DATA, OFFSET, PTR)        \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore1(DATA.s8, OFFSET, PTR + 8);

#define vstore_partial_10(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_2(DATA.s89, OFFSET, PTR + 8);

#define vstore_partial_11(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_3(DATA.s89A, OFFSET, PTR + 8);

#define vstore_partial_12(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_4(DATA.s89AB, OFFSET, PTR + 8);

#define vstore_partial_13(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_5(DATA.s89ABCDEF, OFFSET, PTR + 8);

#define vstore_partial_14(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_6(DATA.s89ABCDEF, OFFSET, PTR + 8);

#define vstore_partial_15(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_7(DATA.s89ABCDEF, OFFSET, PTR + 8);

#define vstore_partial_16(DATA, OFFSET, PTR) \
    vstore16(DATA, OFFSET, PTR);

/*
 * convert_<float|half>[n]_sat aliases: the _sat spelling is mapped onto the
 * plain conversion of the same width, so CONVERT_SAT-style token pasting
 * resolves for floating-point destination types.
 *
 * NOTE(review): convert_half_sat maps to convert_float, unlike
 * convert_half1_sat which maps to convert_half. Kept as found; confirm
 * whether this is intentional.
 */
#define convert_float_sat convert_float
#define convert_float1_sat convert_float
#define convert_float2_sat convert_float2
#define convert_float3_sat convert_float3
#define convert_float4_sat convert_float4
#define convert_float8_sat convert_float8
#define convert_float16_sat convert_float16
#define convert_half_sat convert_float
#define convert_half1_sat convert_half
#define convert_half2_sat convert_half2
#define convert_half3_sat convert_half3
#define convert_half4_sat convert_half4
#define convert_half8_sat convert_half8
#define convert_half16_sat convert_half16

/* convert_<type>1 aliases: the size-1 spelling resolves to the scalar
 * conversion (the list continues after this point). */
#define convert_float1 convert_float
#define convert_half1 convert_half
#define convert_char1 convert_char
#define convert_uchar1 convert_uchar
#define convert_short1 convert_short
#define convert_ushort1 convert_ushort
#define convert_int1 convert_int
#define convert_uint1 convert_uint
#define convert_long1 convert_long
/* convert_<type>1 aliases, continued: the size-1 spelling resolves to the
 * scalar conversion. */
#define convert_ulong1 convert_ulong
#define convert_double1 convert_double

/*
 * convert_<type>1_sat aliases: the size-1 saturating spelling resolves to
 * the scalar saturating conversion. The convert_uchar<n>_sat entries for
 * n = 2..16 expand to themselves and therefore change nothing.
 *
 * NOTE(review): convert_double1_sat maps to convert_double_sat, whereas the
 * float and half _sat names elsewhere in this file are mapped to
 * non-saturating conversions - confirm that convert_double_sat is available
 * wherever this alias is used.
 */
#define convert_char1_sat convert_char_sat
#define convert_uchar1_sat convert_uchar_sat
#define convert_uchar2_sat convert_uchar2_sat
#define convert_uchar3_sat convert_uchar3_sat
#define convert_uchar4_sat convert_uchar4_sat
#define convert_uchar8_sat convert_uchar8_sat
#define convert_uchar16_sat convert_uchar16_sat
#define convert_short1_sat convert_short_sat
#define convert_ushort1_sat convert_ushort_sat
#define convert_int1_sat convert_int_sat
#define convert_uint1_sat convert_uint_sat
#define convert_long1_sat convert_long_sat
#define convert_ulong1_sat convert_ulong_sat
#define convert_double1_sat convert_double_sat

/* VEC_DATA_TYPE(type, size) pastes <type><size>; the two-level form expands
 * macro arguments before pasting. */
#define VEC_DATA_TYPE_STR(type, size) type##size
#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)

/* CONVERT(x, type) expands to (convert_<type>((x))). */
#define CONVERT_STR(x, type) (convert_##type((x)))
#define CONVERT(x, type) CONVERT_STR(x, type)

/* CONVERT_SAT(x, type) expands to (convert_<type>_sat((x))). */
#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)

/* CONVERT_SAT_ROUND(x, type, round) expands to
 * (convert_<type>_sat_<round>((x))). */
#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)

/*
 * select_vec_dt_<type>(size): integer vector type name associated with
 * <type>. Integer types map to themselves; half maps to short<size> and
 * float maps to int<size>.
 */
#define select_vec_dt_uchar(size) uchar##size
#define select_vec_dt_char(size) char##size
#define select_vec_dt_ushort(size) ushort##size
#define select_vec_dt_short(size) short##size
#define select_vec_dt_half(size) short##size
#define select_vec_dt_uint(size) uint##size
#define select_vec_dt_int(size) int##size
#define select_vec_dt_float(size) int##size
#define select_vec_dt_ulong(size) ulong##size
#define select_vec_dt_long(size) long##size

/* Front ends for the table above; SELECT_DATA_TYPE is the size-1 case. */
#define SELECT_VEC_DATA_TYPE_STR(type, size) select_vec_dt_##type(size)
#define SELECT_VEC_DATA_TYPE(type, size) SELECT_VEC_DATA_TYPE_STR(type, size)
#define SELECT_DATA_TYPE(type) SELECT_VEC_DATA_TYPE_STR(type, 1)

/*
 * signed_int_vec_dt_<type>(size): signed integer vector type name associated
 * with <type>: char<size> for uchar/char, short<size> for ushort/short/half,
 * int<size> for uint/int/float and long<size> for ulong/long.
 */
#define signed_int_vec_dt_uchar(size) char##size
#define signed_int_vec_dt_char(size) char##size
#define signed_int_vec_dt_ushort(size) short##size
#define signed_int_vec_dt_short(size) short##size
#define signed_int_vec_dt_half(size) short##size
#define signed_int_vec_dt_uint(size) int##size
#define signed_int_vec_dt_int(size) int##size
#define signed_int_vec_dt_float(size) int##size
#define signed_int_vec_dt_ulong(size) long##size
#define signed_int_vec_dt_long(size) long##size

/* Front ends for the table above; SIGNED_INT_DATA_TYPE is the size-1 case. */
#define SIGNED_INT_VEC_DATA_TYPE_STR(type, size) signed_int_vec_dt_##type(size)
#define SIGNED_INT_VEC_DATA_TYPE(type, size) SIGNED_INT_VEC_DATA_TYPE_STR(type, size)
#define SIGNED_INT_DATA_TYPE(type) SIGNED_INT_VEC_DATA_TYPE_STR(type, 1)

/*
 * sum_reduce_<n>(x): sum of the n components of x, built by splitting x into
 * halves. Each expansion is wrapped in parentheses so the result stays one
 * operand when the macro is used inside a larger expression, for example
 * 2 * SUM_REDUCE(x, 2). The argument x is expanded more than once.
 */
#define sum_reduce_1(x) (x)
#define sum_reduce_2(x) (((x).s0) + ((x).s1))
#define sum_reduce_3(x) (sum_reduce_2((x).s01) + ((x).s2))
#define sum_reduce_4(x) (sum_reduce_2((x).s01) + sum_reduce_2((x).s23))
#define sum_reduce_8(x) (sum_reduce_4((x).s0123) + sum_reduce_4((x).s4567))
#define sum_reduce_16(x) (sum_reduce_8((x).s01234567) + sum_reduce_8((x).s89ABCDEF))

/* SUM_REDUCE(x, size) selects sum_reduce_<size>; size may be a macro. */
#define SUM_REDUCE_STR(x, size) sum_reduce_##size(x)
#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size)

/*
 * prod_reduce_<n>(x): product of the n components of x, parenthesized for
 * the same reason as sum_reduce_<n>. The argument x is expanded more than
 * once.
 */
#define prod_reduce_1(x) (x)
#define prod_reduce_2(x) (((x).s0) * ((x).s1))
#define prod_reduce_3(x) (prod_reduce_2((x).s01) * ((x).s2))
#define prod_reduce_4(x) (prod_reduce_2((x).s01) * prod_reduce_2((x).s23))
#define prod_reduce_8(x) (prod_reduce_4((x).s0123) * prod_reduce_4((x).s4567))
#define prod_reduce_16(x) (prod_reduce_8((x).s01234567) * prod_reduce_8((x).s89ABCDEF))

/* PROD_REDUCE(x, size) selects prod_reduce_<size>; size may be a macro. */
#define PROD_REDUCE_STR(x, size) prod_reduce_##size(x)
#define PROD_REDUCE(x, size) PROD_REDUCE_STR(x, size)

/*
 * max_reduce_<n>(x): maximum of the n components of x, via nested max()
 * calls over the halves of x. The argument x is expanded more than once.
 */
#define max_reduce_1(x) (x)
#define max_reduce_2(x) max(((x).s0), ((x).s1))
#define max_reduce_3(x) max(max_reduce_2((x).s01), ((x).s2))
#define max_reduce_4(x) max(max_reduce_2((x).s01), max_reduce_2((x).s23))
#define max_reduce_8(x) max(max_reduce_4((x).s0123), max_reduce_4((x).s4567))
#define max_reduce_16(x) max(max_reduce_8((x).s01234567), max_reduce_8((x).s89ABCDEF))

/* Name-pasting helper that selects max_reduce_<size>. */
#define MAX_REDUCE_STR(x, size) max_reduce_##size(x)
989*c217d954SCole Faust#define MAX_REDUCE(x, size) MAX_REDUCE_STR(x, size) 990*c217d954SCole Faust 991*c217d954SCole Faust#define VECTOR_DECLARATION(name) \ 992*c217d954SCole Faust __global uchar *name##_ptr, \ 993*c217d954SCole Faust uint name##_stride_x, \ 994*c217d954SCole Faust uint name##_step_x, \ 995*c217d954SCole Faust uint name##_offset_first_element_in_bytes 996*c217d954SCole Faust 997*c217d954SCole Faust#define IMAGE_DECLARATION(name) \ 998*c217d954SCole Faust __global uchar *name##_ptr, \ 999*c217d954SCole Faust uint name##_stride_x, \ 1000*c217d954SCole Faust uint name##_step_x, \ 1001*c217d954SCole Faust uint name##_stride_y, \ 1002*c217d954SCole Faust uint name##_step_y, \ 1003*c217d954SCole Faust uint name##_offset_first_element_in_bytes 1004*c217d954SCole Faust 1005*c217d954SCole Faust#define TENSOR3D_DECLARATION(name) \ 1006*c217d954SCole Faust __global uchar *name##_ptr, \ 1007*c217d954SCole Faust uint name##_stride_x, \ 1008*c217d954SCole Faust uint name##_step_x, \ 1009*c217d954SCole Faust uint name##_stride_y, \ 1010*c217d954SCole Faust uint name##_step_y, \ 1011*c217d954SCole Faust uint name##_stride_z, \ 1012*c217d954SCole Faust uint name##_step_z, \ 1013*c217d954SCole Faust uint name##_offset_first_element_in_bytes 1014*c217d954SCole Faust 1015*c217d954SCole Faust#define TENSOR4D_DECLARATION(name) \ 1016*c217d954SCole Faust __global uchar *name##_ptr, \ 1017*c217d954SCole Faust uint name##_stride_x, \ 1018*c217d954SCole Faust uint name##_step_x, \ 1019*c217d954SCole Faust uint name##_stride_y, \ 1020*c217d954SCole Faust uint name##_step_y, \ 1021*c217d954SCole Faust uint name##_stride_z, \ 1022*c217d954SCole Faust uint name##_step_z, \ 1023*c217d954SCole Faust uint name##_stride_w, \ 1024*c217d954SCole Faust uint name##_step_w, \ 1025*c217d954SCole Faust uint name##_offset_first_element_in_bytes 1026*c217d954SCole Faust 1027*c217d954SCole Faust#define TENSOR5D_DECLARATION(name) \ 1028*c217d954SCole Faust __global uchar *name##_ptr, \ 
1029*c217d954SCole Faust uint name##_stride_x, \ 1030*c217d954SCole Faust uint name##_step_x, \ 1031*c217d954SCole Faust uint name##_stride_y, \ 1032*c217d954SCole Faust uint name##_step_y, \ 1033*c217d954SCole Faust uint name##_stride_z, \ 1034*c217d954SCole Faust uint name##_step_z, \ 1035*c217d954SCole Faust uint name##_stride_w, \ 1036*c217d954SCole Faust uint name##_step_w, \ 1037*c217d954SCole Faust uint name##_stride_v, \ 1038*c217d954SCole Faust uint name##_step_v, \ 1039*c217d954SCole Faust uint name##_offset_first_element_in_bytes 1040*c217d954SCole Faust 1041*c217d954SCole Faust#define CONVERT_TO_VECTOR_STRUCT(name) \ 1042*c217d954SCole Faust update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x) 1043*c217d954SCole Faust 1044*c217d954SCole Faust#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \ 1045*c217d954SCole Faust update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0) 1046*c217d954SCole Faust 1047*c217d954SCole Faust#define CONVERT_TO_IMAGE_STRUCT(name) \ 1048*c217d954SCole Faust update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y) 1049*c217d954SCole Faust 1050*c217d954SCole Faust#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \ 1051*c217d954SCole Faust update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0) 1052*c217d954SCole Faust 1053*c217d954SCole Faust#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \ 1054*c217d954SCole Faust update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z) 1055*c217d954SCole Faust 1056*c217d954SCole Faust#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \ 1057*c217d954SCole Faust update_image_from_tensor3D_workitem_ptr(name##_ptr, 
name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z) 1058*c217d954SCole Faust 1059*c217d954SCole Faust#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \ 1060*c217d954SCole Faust update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z) 1061*c217d954SCole Faust 1062*c217d954SCole Faust#define CONVERT_TO_TENSOR3D_STRUCT(name) \ 1063*c217d954SCole Faust update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \ 1064*c217d954SCole Faust name##_stride_z, name##_step_z) 1065*c217d954SCole Faust 1066*c217d954SCole Faust#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \ 1067*c217d954SCole Faust update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0) 1068*c217d954SCole Faust 1069*c217d954SCole Faust#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size) \ 1070*c217d954SCole Faust update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \ 1071*c217d954SCole Faust name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size) 1072*c217d954SCole Faust 1073*c217d954SCole Faust#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \ 1074*c217d954SCole Faust update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size) 1075*c217d954SCole Faust 1076*c217d954SCole Faust#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name) \ 1077*c217d954SCole Faust tensor3D_ptr_no_update(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \ 1078*c217d954SCole Faust name##_stride_z, 
name##_step_z) 1079*c217d954SCole Faust 1080*c217d954SCole Faust 1081*c217d954SCole Fausttypedef struct Vector 1082*c217d954SCole Faust{ 1083*c217d954SCole Faust __global uchar *ptr; 1084*c217d954SCole Faust int offset_first_element_in_bytes; 1085*c217d954SCole Faust int stride_x; 1086*c217d954SCole Faust} Vector; 1087*c217d954SCole Faust 1088*c217d954SCole Faust 1089*c217d954SCole Fausttypedef struct Image 1090*c217d954SCole Faust{ 1091*c217d954SCole Faust __global uchar *ptr; 1092*c217d954SCole Faust int offset_first_element_in_bytes; 1093*c217d954SCole Faust int stride_x; 1094*c217d954SCole Faust int stride_y; 1095*c217d954SCole Faust} Image; 1096*c217d954SCole Faust 1097*c217d954SCole Faust 1098*c217d954SCole Fausttypedef struct Tensor3D 1099*c217d954SCole Faust{ 1100*c217d954SCole Faust __global uchar *ptr; 1101*c217d954SCole Faust int offset_first_element_in_bytes; 1102*c217d954SCole Faust int stride_x; 1103*c217d954SCole Faust int stride_y; 1104*c217d954SCole Faust int stride_z; 1105*c217d954SCole Faust} Tensor3D; 1106*c217d954SCole Faust 1107*c217d954SCole Faust 1108*c217d954SCole Fausttypedef struct Tensor4D 1109*c217d954SCole Faust{ 1110*c217d954SCole Faust __global uchar *ptr; 1111*c217d954SCole Faust int offset_first_element_in_bytes; 1112*c217d954SCole Faust int stride_x; 1113*c217d954SCole Faust int stride_y; 1114*c217d954SCole Faust int stride_z; 1115*c217d954SCole Faust int stride_w; 1116*c217d954SCole Faust} Tensor4D; 1117*c217d954SCole Faust 1118*c217d954SCole Faust 1119*c217d954SCole Faustinline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x) 1120*c217d954SCole Faust{ 1121*c217d954SCole Faust Vector vector = 1122*c217d954SCole Faust { 1123*c217d954SCole Faust .ptr = ptr, 1124*c217d954SCole Faust .offset_first_element_in_bytes = offset_first_element_in_bytes, 1125*c217d954SCole Faust .stride_x = stride_x, 1126*c217d954SCole Faust }; 1127*c217d954SCole Faust vector.ptr += 
vector.offset_first_element_in_bytes + get_global_id(0) * step_x; 1128*c217d954SCole Faust return vector; 1129*c217d954SCole Faust} 1130*c217d954SCole Faust 1131*c217d954SCole Faust 1132*c217d954SCole Faustinline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y) 1133*c217d954SCole Faust{ 1134*c217d954SCole Faust Image img = 1135*c217d954SCole Faust { 1136*c217d954SCole Faust .ptr = ptr, 1137*c217d954SCole Faust .offset_first_element_in_bytes = offset_first_element_in_bytes, 1138*c217d954SCole Faust .stride_x = stride_x, 1139*c217d954SCole Faust .stride_y = stride_y 1140*c217d954SCole Faust }; 1141*c217d954SCole Faust img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y; 1142*c217d954SCole Faust return img; 1143*c217d954SCole Faust} 1144*c217d954SCole Faust 1145*c217d954SCole Faust 1146*c217d954SCole Faustinline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z) 1147*c217d954SCole Faust{ 1148*c217d954SCole Faust Image img = 1149*c217d954SCole Faust { 1150*c217d954SCole Faust .ptr = ptr, 1151*c217d954SCole Faust .offset_first_element_in_bytes = offset_first_element_in_bytes, 1152*c217d954SCole Faust .stride_x = stride_x, 1153*c217d954SCole Faust .stride_y = stride_y 1154*c217d954SCole Faust }; 1155*c217d954SCole Faust img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z; 1156*c217d954SCole Faust return img; 1157*c217d954SCole Faust} 1158*c217d954SCole Faust 1159*c217d954SCole Faust 1160*c217d954SCole Faustinline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z) 1161*c217d954SCole Faust{ 
1162*c217d954SCole Faust Tensor3D tensor = 1163*c217d954SCole Faust { 1164*c217d954SCole Faust .ptr = ptr, 1165*c217d954SCole Faust .offset_first_element_in_bytes = offset_first_element_in_bytes, 1166*c217d954SCole Faust .stride_x = stride_x, 1167*c217d954SCole Faust .stride_y = stride_y, 1168*c217d954SCole Faust .stride_z = stride_z 1169*c217d954SCole Faust }; 1170*c217d954SCole Faust tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z; 1171*c217d954SCole Faust return tensor; 1172*c217d954SCole Faust} 1173*c217d954SCole Faust 1174*c217d954SCole Faust 1175*c217d954SCole Faustinline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z) 1176*c217d954SCole Faust{ 1177*c217d954SCole Faust Tensor3D tensor = 1178*c217d954SCole Faust { 1179*c217d954SCole Faust .ptr = ptr, 1180*c217d954SCole Faust .offset_first_element_in_bytes = offset_first_element_in_bytes, 1181*c217d954SCole Faust .stride_x = stride_x, 1182*c217d954SCole Faust .stride_y = stride_y, 1183*c217d954SCole Faust .stride_z = stride_z 1184*c217d954SCole Faust }; 1185*c217d954SCole Faust return tensor; 1186*c217d954SCole Faust} 1187*c217d954SCole Faust 1188*c217d954SCole Faustinline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w, 1189*c217d954SCole Faust uint step_w, 1190*c217d954SCole Faust uint mod_size) 1191*c217d954SCole Faust{ 1192*c217d954SCole Faust Tensor4D tensor = 1193*c217d954SCole Faust { 1194*c217d954SCole Faust .ptr = ptr, 1195*c217d954SCole Faust .offset_first_element_in_bytes = offset_first_element_in_bytes, 1196*c217d954SCole Faust .stride_x = stride_x, 1197*c217d954SCole Faust .stride_y = stride_y, 1198*c217d954SCole Faust .stride_z = stride_z, 
1199*c217d954SCole Faust .stride_w = stride_w 1200*c217d954SCole Faust }; 1201*c217d954SCole Faust 1202*c217d954SCole Faust tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w; 1203*c217d954SCole Faust return tensor; 1204*c217d954SCole Faust} 1205*c217d954SCole Faust 1206*c217d954SCole Faust 1207*c217d954SCole Faustinline __global const uchar *vector_offset(const Vector *vec, int x) 1208*c217d954SCole Faust{ 1209*c217d954SCole Faust return vec->ptr + x * vec->stride_x; 1210*c217d954SCole Faust} 1211*c217d954SCole Faust 1212*c217d954SCole Faust 1213*c217d954SCole Faustinline __global uchar *offset(const Image *img, int x, int y) 1214*c217d954SCole Faust{ 1215*c217d954SCole Faust return img->ptr + x * img->stride_x + y * img->stride_y; 1216*c217d954SCole Faust} 1217*c217d954SCole Faust 1218*c217d954SCole Faust 1219*c217d954SCole Faustinline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z) 1220*c217d954SCole Faust{ 1221*c217d954SCole Faust return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z; 1222*c217d954SCole Faust} 1223*c217d954SCole Faust 1224*c217d954SCole Faust 1225*c217d954SCole Faustinline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w) 1226*c217d954SCole Faust{ 1227*c217d954SCole Faust return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w; 1228*c217d954SCole Faust} 1229*c217d954SCole Faust 1230*c217d954SCole Faust 1231*c217d954SCole Faustinline __global const uchar *tensor3D_index2ptr(const Tensor3D *tensor, uint width, uint height, uint depth, uint index) 1232*c217d954SCole Faust{ 1233*c217d954SCole Faust uint num_elements = width * height; 1234*c217d954SCole Faust 1235*c217d954SCole Faust const uint z = index / num_elements; 1236*c217d954SCole Faust 
1237*c217d954SCole Faust index %= num_elements; 1238*c217d954SCole Faust 1239*c217d954SCole Faust const uint y = index / width; 1240*c217d954SCole Faust 1241*c217d954SCole Faust index %= width; 1242*c217d954SCole Faust 1243*c217d954SCole Faust const uint x = index; 1244*c217d954SCole Faust 1245*c217d954SCole Faust return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + tensor->offset_first_element_in_bytes; 1246*c217d954SCole Faust} 1247*c217d954SCole Faust 1248*c217d954SCole Faust#endif 1249*c217d954SCole Faust 1250*c217d954SCole Faust#if GPU_ARCH == GPU_ARCH_BIFROST 1251*c217d954SCole Faust#define MLA(a, b, c) (fma(c, b, a)) 1252*c217d954SCole Faust#else 1253*c217d954SCole Faust#define MLA(a, b, c) ((b) * (c) + (a)) 1254*c217d954SCole Faust#endif 1255*c217d954SCole Faust 1256*c217d954SCole Faust 1257*c217d954SCole Faust#define hard_swish_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (x * ((min(max((x + (DATA_TYPE)3.0), (DATA_TYPE)0.0), (DATA_TYPE)6.0)) * (DATA_TYPE)0.166666667)) 1258*c217d954SCole Faust 1259*c217d954SCole Faust 1260*c217d954SCole Faust#define logistic_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((DATA_TYPE)1.0 / ((DATA_TYPE)1.0 + exp(-x))) 1261*c217d954SCole Faust 1262*c217d954SCole Faust 1263*c217d954SCole Faust#define tanh_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((DATA_TYPE)A_VAL * tanh((DATA_TYPE)B_VAL * x)) 1264*c217d954SCole Faust 1265*c217d954SCole Faust 1266*c217d954SCole Faust#define relu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (max((DATA_TYPE)0.0, x)) 1267*c217d954SCole Faust 1268*c217d954SCole Faust 1269*c217d954SCole Faust#define brelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (min((DATA_TYPE)A_VAL, max((DATA_TYPE)0.0, x))) 1270*c217d954SCole Faust 1271*c217d954SCole Faust 1272*c217d954SCole Faust#define lu_brelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (min(max(x, (DATA_TYPE)B_VAL), (DATA_TYPE)A_VAL)) 1273*c217d954SCole Faust 1274*c217d954SCole Faust 1275*c217d954SCole Faust#define lrelu_op(DATA_TYPE, 
VEC_SIZE, x, A_VAL, B_VAL) ((min(x, (DATA_TYPE)0.0) * (DATA_TYPE)A_VAL) + max(x, (DATA_TYPE)0.0)) 1276*c217d954SCole Faust 1277*c217d954SCole Faust 1278*c217d954SCole Faust#define srelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (log((DATA_TYPE)1.0 + exp(x))) 1279*c217d954SCole Faust 1280*c217d954SCole Faust 1281*c217d954SCole Faust#define elu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (select(((DATA_TYPE)A_VAL * (exp(x) - (DATA_TYPE)1.0)), x, (SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))isgreaterequal(x, (DATA_TYPE)0.0))) 1282*c217d954SCole Faust 1283*c217d954SCole Faust 1284*c217d954SCole Faust#define abs_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (fabs(x)) 1285*c217d954SCole Faust 1286*c217d954SCole Faust 1287*c217d954SCole Faust#define square_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (x * x) 1288*c217d954SCole Faust 1289*c217d954SCole Faust 1290*c217d954SCole Faust#define sqrt_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (sqrt(x)) 1291*c217d954SCole Faust 1292*c217d954SCole Faust 1293*c217d954SCole Faust#define linear_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (MLA((DATA_TYPE)B_VAL, (DATA_TYPE)A_VAL, x)) 1294*c217d954SCole Faust 1295*c217d954SCole Faust 1296*c217d954SCole Faust#define gelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (x * (DATA_TYPE)0.5 * ((DATA_TYPE)1.0 + erf(x / (DATA_TYPE)1.41421356237))) 1297*c217d954SCole Faust 1298*c217d954SCole Faust 1299*c217d954SCole Faust#define identity_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (x) 1300*c217d954SCole Faust 1301*c217d954SCole Faust#define ACT_OP(op, DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) op##_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) 1302*c217d954SCole Faust 1303*c217d954SCole Faust#define ACTIVATION(op, DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ACT_OP(op, DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) 1304*c217d954SCole Faust 1305*c217d954SCole Faust#ifndef ARM_COMPUTE_HELPER_H 1306*c217d954SCole Faust#define ARM_COMPUTE_HELPER_H 1307*c217d954SCole Faust 1308*c217d954SCole Faust 1309*c217d954SCole Faust 1310*c217d954SCole Faust 
1311*c217d954SCole Faust#define STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1312*c217d954SCole Faust VSTORE(N0) \ 1313*c217d954SCole Faust (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0)); 1314*c217d954SCole Faust 1315*c217d954SCole Faust#define STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1316*c217d954SCole Faust STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1317*c217d954SCole Faust VSTORE(N0) \ 1318*c217d954SCole Faust (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1)); 1319*c217d954SCole Faust 1320*c217d954SCole Faust#define STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1321*c217d954SCole Faust STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1322*c217d954SCole Faust VSTORE(N0) \ 1323*c217d954SCole Faust (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2)); 1324*c217d954SCole Faust 1325*c217d954SCole Faust#define STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1326*c217d954SCole Faust STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1327*c217d954SCole Faust VSTORE(N0) \ 1328*c217d954SCole Faust (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3)); 1329*c217d954SCole Faust 1330*c217d954SCole Faust#define STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1331*c217d954SCole Faust STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1332*c217d954SCole Faust VSTORE(N0) \ 1333*c217d954SCole Faust (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4)); 1334*c217d954SCole Faust 1335*c217d954SCole Faust#define STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1336*c217d954SCole Faust STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1337*c217d954SCole Faust VSTORE(N0) \ 1338*c217d954SCole Faust (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5)); 1339*c217d954SCole Faust 1340*c217d954SCole Faust#define STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1341*c217d954SCole Faust 
STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1342*c217d954SCole Faust VSTORE(N0) \ 1343*c217d954SCole Faust (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6)); 1344*c217d954SCole Faust 1345*c217d954SCole Faust#define STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1346*c217d954SCole Faust STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1347*c217d954SCole Faust VSTORE(N0) \ 1348*c217d954SCole Faust (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7)); 1349*c217d954SCole Faust 1350*c217d954SCole Faust#define STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1351*c217d954SCole Faust STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1352*c217d954SCole Faust VSTORE(N0) \ 1353*c217d954SCole Faust (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8)); 1354*c217d954SCole Faust 1355*c217d954SCole Faust#define STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1356*c217d954SCole Faust STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1357*c217d954SCole Faust VSTORE(N0) \ 1358*c217d954SCole Faust (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9)); 1359*c217d954SCole Faust 1360*c217d954SCole Faust#define STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1361*c217d954SCole Faust STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1362*c217d954SCole Faust VSTORE(N0) \ 1363*c217d954SCole Faust (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A)); 1364*c217d954SCole Faust 1365*c217d954SCole Faust#define STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1366*c217d954SCole Faust STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1367*c217d954SCole Faust VSTORE(N0) \ 1368*c217d954SCole Faust (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B)); 1369*c217d954SCole Faust 1370*c217d954SCole Faust#define STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1371*c217d954SCole Faust STORE_ROW_12(N0, 
DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1372*c217d954SCole Faust VSTORE(N0) \ 1373*c217d954SCole Faust (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C)); 1374*c217d954SCole Faust 1375*c217d954SCole Faust#define STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1376*c217d954SCole Faust STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1377*c217d954SCole Faust VSTORE(N0) \ 1378*c217d954SCole Faust (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D)); 1379*c217d954SCole Faust 1380*c217d954SCole Faust#define STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1381*c217d954SCole Faust STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1382*c217d954SCole Faust VSTORE(N0) \ 1383*c217d954SCole Faust (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E)); 1384*c217d954SCole Faust 1385*c217d954SCole Faust#define STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1386*c217d954SCole Faust STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1387*c217d954SCole Faust VSTORE(N0) \ 1388*c217d954SCole Faust (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F)); 1389*c217d954SCole Faust 1390*c217d954SCole Faust 1391*c217d954SCole Faust 1392*c217d954SCole Faust#define CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1393*c217d954SCole Faust VSTORE(N0) \ 1394*c217d954SCole Faust (CONVERT_SAT((BASENAME##0), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0)); 1395*c217d954SCole Faust 1396*c217d954SCole Faust#define CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1397*c217d954SCole Faust CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1398*c217d954SCole Faust VSTORE(N0) \ 1399*c217d954SCole Faust (CONVERT_SAT((BASENAME##1), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1)); 1400*c217d954SCole Faust 1401*c217d954SCole Faust#define CONVERT_STORE_ROW_3(N0, DATA_TYPE, 
BASENAME, PTR, STRIDE_Y, Z) \ 1402*c217d954SCole Faust CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1403*c217d954SCole Faust VSTORE(N0) \ 1404*c217d954SCole Faust (CONVERT_SAT((BASENAME##2), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2)); 1405*c217d954SCole Faust 1406*c217d954SCole Faust#define CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1407*c217d954SCole Faust CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1408*c217d954SCole Faust VSTORE(N0) \ 1409*c217d954SCole Faust (CONVERT_SAT((BASENAME##3), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3)); 1410*c217d954SCole Faust 1411*c217d954SCole Faust#define CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1412*c217d954SCole Faust CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1413*c217d954SCole Faust VSTORE(N0) \ 1414*c217d954SCole Faust (CONVERT_SAT((BASENAME##4), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4)); 1415*c217d954SCole Faust 1416*c217d954SCole Faust#define CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1417*c217d954SCole Faust CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1418*c217d954SCole Faust VSTORE(N0) \ 1419*c217d954SCole Faust (CONVERT_SAT((BASENAME##5), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5)); 1420*c217d954SCole Faust 1421*c217d954SCole Faust#define CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1422*c217d954SCole Faust CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1423*c217d954SCole Faust VSTORE(N0) \ 1424*c217d954SCole Faust (CONVERT_SAT((BASENAME##6), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6)); 1425*c217d954SCole Faust 1426*c217d954SCole Faust#define CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1427*c217d954SCole Faust 
CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1428*c217d954SCole Faust VSTORE(N0) \ 1429*c217d954SCole Faust (CONVERT_SAT((BASENAME##7), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7)); 1430*c217d954SCole Faust 1431*c217d954SCole Faust#define CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1432*c217d954SCole Faust CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1433*c217d954SCole Faust VSTORE(N0) \ 1434*c217d954SCole Faust (CONVERT_SAT((BASENAME##8), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8)); 1435*c217d954SCole Faust 1436*c217d954SCole Faust#define CONVERT_STORE_ROW_10(N0, DATA, BASENAME, PTR, STRIDE_Y, Z) \ 1437*c217d954SCole Faust CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1438*c217d954SCole Faust VSTORE(N0) \ 1439*c217d954SCole Faust (CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9)); 1440*c217d954SCole Faust 1441*c217d954SCole Faust#define CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1442*c217d954SCole Faust CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1443*c217d954SCole Faust VSTORE(N0) \ 1444*c217d954SCole Faust (CONVERT_SAT((BASENAME##A), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A)); 1445*c217d954SCole Faust 1446*c217d954SCole Faust#define CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1447*c217d954SCole Faust CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1448*c217d954SCole Faust VSTORE(N0) \ 1449*c217d954SCole Faust (CONVERT_SAT((BASENAME##B), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B)); 1450*c217d954SCole Faust 1451*c217d954SCole Faust#define CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1452*c217d954SCole Faust CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, 
STRIDE_Y, Z) \ 1453*c217d954SCole Faust VSTORE(N0) \ 1454*c217d954SCole Faust (CONVERT_SAT((BASENAME##C), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C)); 1455*c217d954SCole Faust 1456*c217d954SCole Faust#define CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1457*c217d954SCole Faust CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1458*c217d954SCole Faust VSTORE(N0) \ 1459*c217d954SCole Faust (CONVERT_SAT((BASENAME##D), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D)); 1460*c217d954SCole Faust 1461*c217d954SCole Faust#define CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1462*c217d954SCole Faust CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1463*c217d954SCole Faust VSTORE(N0) \ 1464*c217d954SCole Faust (CONVERT_SAT((BASENAME##E), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E)); 1465*c217d954SCole Faust 1466*c217d954SCole Faust#define CONVERT_STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1467*c217d954SCole Faust CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1468*c217d954SCole Faust VSTORE(N0) \ 1469*c217d954SCole Faust (CONVERT_SAT((BASENAME##F), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F)); 1470*c217d954SCole Faust 1471*c217d954SCole Faust 1472*c217d954SCole Faust 1473*c217d954SCole Faust 1474*c217d954SCole Faust#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 1475*c217d954SCole Faust#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 1476*c217d954SCole Faust 1477*c217d954SCole Faust 1478*c217d954SCole Faust 1479*c217d954SCole Faust#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, 
BASENAME, PTR, STRIDE_Y, Z) 1480*c217d954SCole Faust#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 1481*c217d954SCole Faust 1482*c217d954SCole Faust 1483*c217d954SCole Faust 1484*c217d954SCole Faust#define STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1485*c217d954SCole Faust VSTORE_PARTIAL(N0, STORE_N0) \ 1486*c217d954SCole Faust (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0)); 1487*c217d954SCole Faust 1488*c217d954SCole Faust#define STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1489*c217d954SCole Faust STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1490*c217d954SCole Faust VSTORE_PARTIAL(N0, STORE_N0) \ 1491*c217d954SCole Faust (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1)); 1492*c217d954SCole Faust 1493*c217d954SCole Faust#define STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1494*c217d954SCole Faust STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1495*c217d954SCole Faust VSTORE_PARTIAL(N0, STORE_N0) \ 1496*c217d954SCole Faust (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2)); 1497*c217d954SCole Faust 1498*c217d954SCole Faust#define STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1499*c217d954SCole Faust STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1500*c217d954SCole Faust VSTORE_PARTIAL(N0, STORE_N0) \ 1501*c217d954SCole Faust (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3)); 1502*c217d954SCole Faust 1503*c217d954SCole Faust#define STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1504*c217d954SCole Faust STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1505*c217d954SCole Faust VSTORE_PARTIAL(N0, STORE_N0) \ 1506*c217d954SCole Faust 
(BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4)); 1507*c217d954SCole Faust 1508*c217d954SCole Faust#define STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1509*c217d954SCole Faust STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1510*c217d954SCole Faust VSTORE_PARTIAL(N0, STORE_N0) \ 1511*c217d954SCole Faust (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5)); 1512*c217d954SCole Faust 1513*c217d954SCole Faust#define STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1514*c217d954SCole Faust STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1515*c217d954SCole Faust VSTORE_PARTIAL(N0, STORE_N0) \ 1516*c217d954SCole Faust (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6)); 1517*c217d954SCole Faust 1518*c217d954SCole Faust#define STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1519*c217d954SCole Faust STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1520*c217d954SCole Faust VSTORE_PARTIAL(N0, STORE_N0) \ 1521*c217d954SCole Faust (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7)); 1522*c217d954SCole Faust 1523*c217d954SCole Faust#define STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1524*c217d954SCole Faust STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1525*c217d954SCole Faust VSTORE_PARTIAL(N0, STORE_N0) \ 1526*c217d954SCole Faust (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8)); 1527*c217d954SCole Faust 1528*c217d954SCole Faust#define STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1529*c217d954SCole Faust STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1530*c217d954SCole Faust VSTORE_PARTIAL(N0, STORE_N0) \ 1531*c217d954SCole Faust (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9)); 1532*c217d954SCole 
Faust 1533*c217d954SCole Faust#define STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1534*c217d954SCole Faust STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1535*c217d954SCole Faust VSTORE_PARTIAL(N0, STORE_N0) \ 1536*c217d954SCole Faust (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A)); 1537*c217d954SCole Faust 1538*c217d954SCole Faust#define STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1539*c217d954SCole Faust STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1540*c217d954SCole Faust VSTORE_PARTIAL(N0, STORE_N0) \ 1541*c217d954SCole Faust (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B)); 1542*c217d954SCole Faust 1543*c217d954SCole Faust#define STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1544*c217d954SCole Faust STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1545*c217d954SCole Faust VSTORE_PARTIAL(N0, STORE_N0) \ 1546*c217d954SCole Faust (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C)); 1547*c217d954SCole Faust 1548*c217d954SCole Faust#define STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1549*c217d954SCole Faust STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1550*c217d954SCole Faust VSTORE_PARTIAL(N0, STORE_N0) \ 1551*c217d954SCole Faust (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D)); 1552*c217d954SCole Faust 1553*c217d954SCole Faust#define STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1554*c217d954SCole Faust STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1555*c217d954SCole Faust VSTORE_PARTIAL(N0, STORE_N0) \ 1556*c217d954SCole Faust (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E)); 1557*c217d954SCole Faust 1558*c217d954SCole Faust#define STORE_ROW_PARTIAL_16(N0, STORE_N0, 
DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1559*c217d954SCole Faust STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1560*c217d954SCole Faust VSTORE_PARTIAL(N0, STORE_N0) \ 1561*c217d954SCole Faust (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F)); 1562*c217d954SCole Faust 1563*c217d954SCole Faust 1564*c217d954SCole Faust 1565*c217d954SCole Faust#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 1566*c217d954SCole Faust#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 1567*c217d954SCole Faust 1568*c217d954SCole Faust#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 1569*c217d954SCole Faust if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y)) \ 1570*c217d954SCole Faust { \ 1571*c217d954SCole Faust STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 1572*c217d954SCole Faust } \ 1573*c217d954SCole Faust else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X)) \ 1574*c217d954SCole Faust { \ 1575*c217d954SCole Faust STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 1576*c217d954SCole Faust } \ 1577*c217d954SCole Faust else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X)) \ 1578*c217d954SCole Faust { \ 1579*c217d954SCole Faust STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 1580*c217d954SCole Faust } \ 1581*c217d954SCole Faust else \ 1582*c217d954SCole Faust { \ 1583*c217d954SCole Faust STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 1584*c217d954SCole Faust } 1585*c217d954SCole Faust 1586*c217d954SCole Faust#define STORE_BLOCK_PARTIAL_IN_X(M0, N0, 
DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \ 1587*c217d954SCole Faust if(!(PARTIAL_COND_X)) \ 1588*c217d954SCole Faust { \ 1589*c217d954SCole Faust STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 1590*c217d954SCole Faust } \ 1591*c217d954SCole Faust else \ 1592*c217d954SCole Faust { \ 1593*c217d954SCole Faust STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 1594*c217d954SCole Faust } 1595*c217d954SCole Faust 1596*c217d954SCole Faust#define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \ 1597*c217d954SCole Faust if(!(PARTIAL_COND_Y)) \ 1598*c217d954SCole Faust { \ 1599*c217d954SCole Faust STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 1600*c217d954SCole Faust } \ 1601*c217d954SCole Faust else \ 1602*c217d954SCole Faust { \ 1603*c217d954SCole Faust STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 1604*c217d954SCole Faust } 1605*c217d954SCole Faust 1606*c217d954SCole Faust 1607*c217d954SCole Faust#if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0) 1608*c217d954SCole Faust 1609*c217d954SCole Faust 1610*c217d954SCole Faust#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0 1611*c217d954SCole Faust 1612*c217d954SCole Faust#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 1613*c217d954SCole Faust STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 1614*c217d954SCole Faust 1615*c217d954SCole Faust#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0 1616*c217d954SCole Faust 1617*c217d954SCole Faust#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 1618*c217d954SCole Faust STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, 
PARTIAL_STORE_M0, PARTIAL_COND_Y) 1619*c217d954SCole Faust 1620*c217d954SCole Faust#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0 1621*c217d954SCole Faust 1622*c217d954SCole Faust#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 1623*c217d954SCole Faust STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) 1624*c217d954SCole Faust 1625*c217d954SCole Faust#else 1626*c217d954SCole Faust 1627*c217d954SCole Faust#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 1628*c217d954SCole Faust STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) 1629*c217d954SCole Faust 1630*c217d954SCole Faust#endif 1631*c217d954SCole Faust 1632*c217d954SCole Faust#endif 1633*c217d954SCole Faust 1634*c217d954SCole Faust 1635*c217d954SCole Faust#if defined(PARTIAL_STORE_M0) 1636*c217d954SCole Faust 1637*c217d954SCole Faust#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \ 1638*c217d954SCole Faust ((uint)(max(0, (int)(y * M0) - (int)((M0 - PARTIAL_STORE_M0) % M0)))) 1639*c217d954SCole Faust#else 1640*c217d954SCole Faust#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \ 1641*c217d954SCole Faust ((uint)(y * M0)) 1642*c217d954SCole Faust#endif 1643*c217d954SCole Faust 1644*c217d954SCole Faust 1645*c217d954SCole Faust 1646*c217d954SCole Faust#define STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond) \ 1647*c217d954SCole Faust STORE_BLOCK_PARTIAL_IN_X(1, vec_size, data_type, basename, ptr, 0, 0, leftover, cond) 1648*c217d954SCole Faust 1649*c217d954SCole Faust 1650*c217d954SCole Faust#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) 1651*c217d954SCole Faust#pragma OPENCL EXTENSION cl_khr_fp16 : enable 
1652*c217d954SCole Faust#endif 1653*c217d954SCole Faust 1654*c217d954SCole Faust#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) 1655*c217d954SCole Faust#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable 1656*c217d954SCole Faust#endif 1657*c217d954SCole Faust 1658*c217d954SCole Faust#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8) 1659*c217d954SCole Faust#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable 1660*c217d954SCole Faust#endif 1661*c217d954SCole Faust 1662*c217d954SCole Faust#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf) 1663*c217d954SCole Faust#pragma OPENCL EXTENSION cl_arm_printf : enable 1664*c217d954SCole Faust#endif 1665*c217d954SCole Faust 1666*c217d954SCole Faust#define GPU_ARCH_MIDGARD 0x100 1667*c217d954SCole Faust#define GPU_ARCH_BIFROST 0x200 1668*c217d954SCole Faust#define GPU_ARCH_VALHALL 0x300 1669*c217d954SCole Faust 1670*c217d954SCole Faust 1671*c217d954SCole Faust#define CONCAT(a, b) a##b 1672*c217d954SCole Faust 1673*c217d954SCole Faust 1674*c217d954SCole Faust#define EXPAND(x) x 1675*c217d954SCole Faust 1676*c217d954SCole Faust 1677*c217d954SCole Faust#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val) 1678*c217d954SCole Faust 1679*c217d954SCole Faust 1680*c217d954SCole Faust#define REV1(x) ((x)) 1681*c217d954SCole Faust#define REV2(x) ((x).s10) 1682*c217d954SCole Faust#define REV3(x) ((x).s210) 1683*c217d954SCole Faust#define REV4(x) ((x).s3210) 1684*c217d954SCole Faust#define REV8(x) ((x).s76543210) 1685*c217d954SCole Faust#define REV16(x) ((x).sFEDCBA9876543210) 1686*c217d954SCole Faust 1687*c217d954SCole Faust 1688*c217d954SCole Faust 1689*c217d954SCole Faust#define REVERSE_STR(x, s) REV##s((x)) 1690*c217d954SCole Faust#define REVERSE(x, s) REVERSE_STR(x, s) 1691*c217d954SCole Faust 1692*c217d954SCole Faust 1693*c217d954SCole Faust 1694*c217d954SCole Faust#define 
ROT1_0(x) ((x)) 1695*c217d954SCole Faust#define ROT1_1(x) ((x)) 1696*c217d954SCole Faust 1697*c217d954SCole Faust#define ROT2_0(x) ((x)) 1698*c217d954SCole Faust#define ROT2_1(x) ((x).s10) 1699*c217d954SCole Faust#define ROT2_2(x) ((x)) 1700*c217d954SCole Faust 1701*c217d954SCole Faust#define ROT3_0(x) ((x)) 1702*c217d954SCole Faust#define ROT3_1(x) ((x).s201) 1703*c217d954SCole Faust#define ROT3_2(x) ((x).s120) 1704*c217d954SCole Faust#define ROT3_3(x) ((x)) 1705*c217d954SCole Faust 1706*c217d954SCole Faust#define ROT4_0(x) ((x)) 1707*c217d954SCole Faust#define ROT4_1(x) ((x).s3012) 1708*c217d954SCole Faust#define ROT4_2(x) ((x).s2301) 1709*c217d954SCole Faust#define ROT4_3(x) ((x).s1230) 1710*c217d954SCole Faust#define ROT4_4(x) ((x)) 1711*c217d954SCole Faust 1712*c217d954SCole Faust#define ROT8_0(x) ((x)) 1713*c217d954SCole Faust#define ROT8_1(x) ((x).s70123456) 1714*c217d954SCole Faust#define ROT8_2(x) ((x).s67012345) 1715*c217d954SCole Faust#define ROT8_3(x) ((x).s56701234) 1716*c217d954SCole Faust#define ROT8_4(x) ((x).s45670123) 1717*c217d954SCole Faust#define ROT8_5(x) ((x).s34567012) 1718*c217d954SCole Faust#define ROT8_6(x) ((x).s23456701) 1719*c217d954SCole Faust#define ROT8_7(x) ((x).s12345670) 1720*c217d954SCole Faust#define ROT8_8(x) ((x)) 1721*c217d954SCole Faust 1722*c217d954SCole Faust#define ROT16_0(x) ((x)) 1723*c217d954SCole Faust#define ROT16_1(x) ((x).sF0123456789ABCDE) 1724*c217d954SCole Faust#define ROT16_2(x) ((x).sEF0123456789ABCD) 1725*c217d954SCole Faust#define ROT16_3(x) ((x).sDEF0123456789ABC) 1726*c217d954SCole Faust#define ROT16_4(x) ((x).sCDEF0123456789AB) 1727*c217d954SCole Faust#define ROT16_5(x) ((x).sBCDEF0123456789A) 1728*c217d954SCole Faust#define ROT16_6(x) ((x).sABCDEF0123456789) 1729*c217d954SCole Faust#define ROT16_7(x) ((x).s9ABCDEF012345678) 1730*c217d954SCole Faust#define ROT16_8(x) ((x).s89ABCDEF01234567) 1731*c217d954SCole Faust#define ROT16_9(x) ((x).s789ABCDEF0123456) 1732*c217d954SCole Faust#define ROT16_10(x) 
((x).s6789ABCDEF012345) 1733*c217d954SCole Faust#define ROT16_11(x) ((x).s56789ABCDEF01234) 1734*c217d954SCole Faust#define ROT16_12(x) ((x).s456789ABCDEF0123) 1735*c217d954SCole Faust#define ROT16_13(x) ((x).s3456789ABCDEF012) 1736*c217d954SCole Faust#define ROT16_14(x) ((x).s23456789ABCDEF01) 1737*c217d954SCole Faust#define ROT16_15(x) ((x).s123456789ABCDEF0) 1738*c217d954SCole Faust#define ROT16_16(x) ((x)) 1739*c217d954SCole Faust 1740*c217d954SCole Faust 1741*c217d954SCole Faust 1742*c217d954SCole Faust#define ROTATE_STR(x, s, n) ROT##s##_##n(x) 1743*c217d954SCole Faust#define ROTATE(x, s, n) ROTATE_STR(x, s, n) 1744*c217d954SCole Faust 1745*c217d954SCole Faust 1746*c217d954SCole Faust 1747*c217d954SCole Faust#define V_OFFS1(dt) (dt##1)(0) 1748*c217d954SCole Faust#define V_OFFS2(dt) (dt##2)(0, 1) 1749*c217d954SCole Faust#define V_OFFS3(dt) (dt##3)(0, 1, 2) 1750*c217d954SCole Faust#define V_OFFS4(dt) (dt##4)(0, 1, 2, 3) 1751*c217d954SCole Faust#define V_OFFS8(dt) (dt##8)(0, 1, 2, 3, 4, 5, 6, 7) 1752*c217d954SCole Faust#define V_OFFS16(dt) (dt##16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15) 1753*c217d954SCole Faust 1754*c217d954SCole Faust 1755*c217d954SCole Faust 1756*c217d954SCole Faust#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt) 1757*c217d954SCole Faust#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s) 1758*c217d954SCole Faust 1759*c217d954SCole Faust 1760*c217d954SCole Faust#define VLOAD_STR(size) vload##size 1761*c217d954SCole Faust#define VLOAD(size) VLOAD_STR(size) 1762*c217d954SCole Faust 1763*c217d954SCole Faust 1764*c217d954SCole Faust#define VLOAD_PARTIAL_STR(size, load_size) vload_partial_##size##_##load_size 1765*c217d954SCole Faust#define VLOAD_PARTIAL(size, load_size) VLOAD_PARTIAL_STR(size, load_size) 1766*c217d954SCole Faust 1767*c217d954SCole Faust#define NO_LOAD(data, offs, ptr) \ 1768*c217d954SCole Faust { \ 1769*c217d954SCole Faust } 1770*c217d954SCole Faust 1771*c217d954SCole Faust 1772*c217d954SCole Faust#define vload_partial_1_0 NO_LOAD 
/* Dispatch table vload_partial_<size>_<load_size> for size 1 (entries 1..16)
 * and size 2. An entry is a real loader only when 1 <= load_size <= size;
 * every other combination maps to NO_LOAD. */
#define vload_partial_1_1  vload1
#define vload_partial_1_2  NO_LOAD
#define vload_partial_1_3  NO_LOAD
#define vload_partial_1_4  NO_LOAD
#define vload_partial_1_5  NO_LOAD
#define vload_partial_1_6  NO_LOAD
#define vload_partial_1_7  NO_LOAD
#define vload_partial_1_8  NO_LOAD
#define vload_partial_1_9  NO_LOAD
#define vload_partial_1_10 NO_LOAD
#define vload_partial_1_11 NO_LOAD
#define vload_partial_1_12 NO_LOAD
#define vload_partial_1_13 NO_LOAD
#define vload_partial_1_14 NO_LOAD
#define vload_partial_1_15 NO_LOAD
#define vload_partial_1_16 NO_LOAD

#define vload_partial_2_0  NO_LOAD
#define vload_partial_2_1  vload_partial_1
#define vload_partial_2_2  vload_partial_2
#define vload_partial_2_3  NO_LOAD
#define vload_partial_2_4  NO_LOAD
#define vload_partial_2_5  NO_LOAD
#define vload_partial_2_6  NO_LOAD
#define vload_partial_2_7  NO_LOAD
#define vload_partial_2_8  NO_LOAD
#define vload_partial_2_9  NO_LOAD
#define vload_partial_2_10 NO_LOAD
#define vload_partial_2_11 NO_LOAD
#define vload_partial_2_12 NO_LOAD
#define vload_partial_2_13 NO_LOAD
#define vload_partial_2_14 NO_LOAD
#define vload_partial_2_15 NO_LOAD
#define vload_partial_2_16 NO_LOAD
/* Dispatch table vload_partial_<size>_<load_size> for sizes 3, 4, 8 and 16.
 * Entries with 1 <= load_size <= size name the loader for load_size elements;
 * all remaining combinations map to NO_LOAD. */
#define vload_partial_3_0  NO_LOAD
#define vload_partial_3_1  vload_partial_1
#define vload_partial_3_2  vload_partial_2
#define vload_partial_3_3  vload_partial_3
#define vload_partial_3_4  NO_LOAD
#define vload_partial_3_5  NO_LOAD
#define vload_partial_3_6  NO_LOAD
#define vload_partial_3_7  NO_LOAD
#define vload_partial_3_8  NO_LOAD
#define vload_partial_3_9  NO_LOAD
#define vload_partial_3_10 NO_LOAD
#define vload_partial_3_11 NO_LOAD
#define vload_partial_3_12 NO_LOAD
#define vload_partial_3_13 NO_LOAD
#define vload_partial_3_14 NO_LOAD
#define vload_partial_3_15 NO_LOAD
#define vload_partial_3_16 NO_LOAD

#define vload_partial_4_0  NO_LOAD
#define vload_partial_4_1  vload_partial_1
#define vload_partial_4_2  vload_partial_2
#define vload_partial_4_3  vload_partial_3
#define vload_partial_4_4  vload_partial_4
#define vload_partial_4_5  NO_LOAD
#define vload_partial_4_6  NO_LOAD
#define vload_partial_4_7  NO_LOAD
#define vload_partial_4_8  NO_LOAD
#define vload_partial_4_9  NO_LOAD
#define vload_partial_4_10 NO_LOAD
#define vload_partial_4_11 NO_LOAD
#define vload_partial_4_12 NO_LOAD
#define vload_partial_4_13 NO_LOAD
#define vload_partial_4_14 NO_LOAD
#define vload_partial_4_15 NO_LOAD
#define vload_partial_4_16 NO_LOAD

#define vload_partial_8_0  NO_LOAD
#define vload_partial_8_1  vload_partial_1
#define vload_partial_8_2  vload_partial_2
#define vload_partial_8_3  vload_partial_3
#define vload_partial_8_4  vload_partial_4
#define vload_partial_8_5  vload_partial_5
#define vload_partial_8_6  vload_partial_6
#define vload_partial_8_7  vload_partial_7
#define vload_partial_8_8  vload_partial_8
#define vload_partial_8_9  NO_LOAD
#define vload_partial_8_10 NO_LOAD
#define vload_partial_8_11 NO_LOAD
#define vload_partial_8_12 NO_LOAD
#define vload_partial_8_13 NO_LOAD
#define vload_partial_8_14 NO_LOAD
#define vload_partial_8_15 NO_LOAD
#define vload_partial_8_16 NO_LOAD

#define vload_partial_16_0  NO_LOAD
#define vload_partial_16_1  vload_partial_1
#define vload_partial_16_2  vload_partial_2
#define vload_partial_16_3  vload_partial_3
#define vload_partial_16_4  vload_partial_4
#define vload_partial_16_5  vload_partial_5
#define vload_partial_16_6  vload_partial_6
#define vload_partial_16_7  vload_partial_7
#define vload_partial_16_8  vload_partial_8
#define vload_partial_16_9  vload_partial_9
#define vload_partial_16_10 vload_partial_10
#define vload_partial_16_11 vload_partial_11
#define vload_partial_16_12 vload_partial_12
#define vload_partial_16_13 vload_partial_13
#define vload_partial_16_14 vload_partial_14
#define vload_partial_16_15 vload_partial_15
#define vload_partial_16_16 vload_partial_16

/* vload_partial_<n>(DATA, OFFSET, PTR): fill the first n components of DATA.
 * Sizes 1, 2, 3, 4, 8 and 16 are a single vload; the other sizes are composed
 * from a 4- or 8-wide load followed by a smaller load at PTR + 4 or PTR + 8.
 * Every expansion already ends in ';'.
 * NOTE(review): the composed sizes advance PTR by a fixed element count while
 * forwarding OFFSET unchanged, which looks right only for OFFSET == 0 - confirm
 * against the callers. */
#define vload_partial_1(DATA, OFFSET, PTR) \
    DATA.s0 = vload1(OFFSET, PTR);

#define vload_partial_2(DATA, OFFSET, PTR) \
    DATA.s01 = vload2(OFFSET, PTR);

#define vload_partial_3(DATA, OFFSET, PTR) \
    DATA.s012 = vload3(OFFSET, PTR);

#define vload_partial_4(DATA, OFFSET, PTR) \
    DATA.s0123 = vload4(OFFSET, PTR);

/* 5 = 4 + 1 */
#define vload_partial_5(DATA, OFFSET, PTR) \
    vload_partial_4(DATA.s0123, OFFSET, PTR); \
    DATA.s4 = vload1(OFFSET, PTR + 4);

/* 6 = 4 + 2 */
#define vload_partial_6(DATA, OFFSET, PTR) \
    vload_partial_4(DATA.s0123, OFFSET, PTR); \
    vload_partial_2(DATA.s45, OFFSET, PTR + 4);

/* 7 = 4 + 3 */
#define vload_partial_7(DATA, OFFSET, PTR) \
    vload_partial_4(DATA.s0123, OFFSET, PTR); \
    vload_partial_3(DATA.s456, OFFSET, PTR + 4);

#define vload_partial_8(DATA, OFFSET, PTR) \
    DATA.s01234567 = vload8(OFFSET, PTR);

/* 9 = 8 + 1 */
#define vload_partial_9(DATA, OFFSET, PTR) \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    DATA.s8 = vload1(OFFSET, PTR + 8);

/* 10 = 8 + 2 */
#define vload_partial_10(DATA, OFFSET, PTR) \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_2(DATA.s89, OFFSET, PTR + 8);

/* 11 = 8 + 3 */
#define vload_partial_11(DATA, OFFSET, PTR) \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_3(DATA.s89A, OFFSET, PTR + 8);

/* 12 = 8 + 4 */
#define vload_partial_12(DATA, OFFSET, PTR) \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_4(DATA.s89AB, OFFSET, PTR + 8);

/* 13, 14, 15: the upper half is handed over as a full 8-wide swizzle because
 * the 5/6/7 loaders index it again with .s0123 and .s4 / .s45 / .s456. */
#define vload_partial_13(DATA, OFFSET, PTR) \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_5(DATA.s89ABCDEF, OFFSET, PTR + 8);

#define vload_partial_14(DATA, OFFSET, PTR) \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_6(DATA.s89ABCDEF, OFFSET, PTR + 8);

#define vload_partial_15(DATA, OFFSET, PTR) \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_7(DATA.s89ABCDEF, OFFSET, PTR + 8);

#define vload_partial_16(DATA, OFFSET, PTR) \
    DATA = vload16(OFFSET, PTR);

/* PIXEL_UNIT<vec_size>: value associated with a vector size of 4, 8 or 16
 * elements (1, 2 and 4 respectively, i.e. vec_size / 4). */
#define PIXEL_UNIT4  1
#define PIXEL_UNIT8  2
#define PIXEL_UNIT16 4

/* Pastes vec_size onto PIXEL_UNIT without expanding it first. */
#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) PIXEL_UNIT##vec_size
/* Expands vec_size first, then maps it through the PIXEL_UNIT table. */
#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size) CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size)

/* read_image2d_<type>x<n>(img, x_coord, y_coord): read n horizontally adjacent
 * pixels starting at (x_coord, y_coord) and pack them into one vector of 4 * n
 * elements. The expansions end in ';' on purpose: that semicolon is part of the
 * macro, so it is preserved here. */
#define read_image2d_floatx1(img, x_coord, y_coord) (float4)(read_imagef(img, (int2)(x_coord, y_coord)));
#define read_image2d_floatx2(img, x_coord, y_coord) (float8)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)));
#define read_image2d_floatx4(img, x_coord, y_coord) (float16)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)), read_imagef(img, (int2)(x_coord + 2, y_coord)), read_imagef(img, (int2)(x_coord + 3, y_coord)));

/* Half-precision readers exist only when fp16 is requested and available. */
#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#define read_image2d_halfx1(img, x_coord, y_coord) (half4)(read_imageh(img, (int2)(x_coord, y_coord)));
#define read_image2d_halfx2(img, x_coord, y_coord) (half8)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)));
#define read_image2d_halfx4(img, x_coord, y_coord) (half16)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)), read_imageh(img, (int2)(x_coord + 2, y_coord)), read_imageh(img, (int2)(x_coord + 3, y_coord)));
#endif

/* write_image2d_<type>x<n>(img, x_coord, y_coord, values): write `values` as n
 * horizontally adjacent pixels of four components each, starting at
 * (x_coord, y_coord). Multi-pixel variants chain the writes with the comma
 * operator; the trailing ';' is again part of the expansion. */
#define write_image2d_floatx1(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values));
#define write_image2d_floatx2(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567));
#define write_image2d_floatx4(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imagef(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imagef(img, (int2)(x_coord + 3, y_coord), values.sCDEF));

/* Half-precision writers, guarded like the readers. */
#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#define write_image2d_halfx1(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values));
#define write_image2d_halfx2(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567));
#define write_image2d_halfx4(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imageh(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imageh(img, (int2)(x_coord + 3, y_coord), values.sCDEF));
#endif

/* READ_IMAGE2D(data_type, n0, ...): select read_image2d_<data_type>x<n0>
 * after expanding data_type and n0. */
#define READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) read_image2d_##data_type##x##n0(img, x_coord, y_coord)
#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord)

/* WRITE_IMAGE2D(data_type, n0, ...): select write_image2d_<data_type>x<n0>
 * after expanding data_type and n0. */
#define WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) write_image2d_##data_type##x##n0(img, x_coord, y_coord, values)
#define WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values) WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values)

/* VSTORE(size): name of the vstore<size> function, size expanded first. */
#define VSTORE_STR(size) vstore##size
#define VSTORE(size) VSTORE_STR(size)

/* Width-1 "vector" aliases so that type##1 names the scalar type. */
#define float1 float
#define half1 half
/* Width-1 "vector" aliases so that type##1 names the scalar type. */
#define char1 char
#define uchar1 uchar
#define short1 short
#define ushort1 ushort
#define int1 int
#define uint1 uint
#define long1 long
#define ulong1 ulong
#define double1 double

/* Scalar counterparts of vloadN / vstoreN: access the element at PTR + OFFSET.
 * Arguments and the whole expansion are parenthesized so that expression
 * arguments (for example a conditional offset) and operators applied to the
 * result bind as a function call would. */
#define vload1(OFFSET, PTR) (*((OFFSET) + (PTR)))
#define vstore1(DATA, OFFSET, PTR) (*((OFFSET) + (PTR)) = (DATA))

/* VSTORE_PARTIAL(size, store_size): name of the table entry
 * vstore_partial_<size>_<store_size>, both arguments expanded first. */
#define VSTORE_PARTIAL_STR(size, store_size) vstore_partial_##size##_##store_size
#define VSTORE_PARTIAL(size, store_size) VSTORE_PARTIAL_STR(size, store_size)

/* Table entry for combinations that store nothing: expands to an empty block. */
#define NO_STORE(data, offs, ptr) \
    { \
    }

/* Dispatch table for size 1: only store_size == 1 stores anything. */
#define vstore_partial_1_0 NO_STORE
#define vstore_partial_1_1 vstore1
#define vstore_partial_1_2 NO_STORE
#define vstore_partial_1_3 NO_STORE
#define vstore_partial_1_4 NO_STORE
#define vstore_partial_1_5 NO_STORE
#define vstore_partial_1_6 NO_STORE
#define vstore_partial_1_7 NO_STORE
#define vstore_partial_1_8 NO_STORE
#define vstore_partial_1_9 NO_STORE
#define vstore_partial_1_10 NO_STORE
#define vstore_partial_1_11 NO_STORE
#define vstore_partial_1_12 NO_STORE
#define vstore_partial_1_13 NO_STORE
#define vstore_partial_1_14 NO_STORE
#define vstore_partial_1_15 NO_STORE
/* Dispatch table vstore_partial_<size>_<store_size>: last entry of size 1,
 * all of sizes 2, 3 and 4, and entries 0..9 of size 8. An entry names a real
 * store only when 1 <= store_size <= size; everything else is NO_STORE. */
#define vstore_partial_1_16 NO_STORE

#define vstore_partial_2_0  NO_STORE
#define vstore_partial_2_1  vstore_partial_1
#define vstore_partial_2_2  vstore_partial_2
#define vstore_partial_2_3  NO_STORE
#define vstore_partial_2_4  NO_STORE
#define vstore_partial_2_5  NO_STORE
#define vstore_partial_2_6  NO_STORE
#define vstore_partial_2_7  NO_STORE
#define vstore_partial_2_8  NO_STORE
#define vstore_partial_2_9  NO_STORE
#define vstore_partial_2_10 NO_STORE
#define vstore_partial_2_11 NO_STORE
#define vstore_partial_2_12 NO_STORE
#define vstore_partial_2_13 NO_STORE
#define vstore_partial_2_14 NO_STORE
#define vstore_partial_2_15 NO_STORE
#define vstore_partial_2_16 NO_STORE

#define vstore_partial_3_0  NO_STORE
#define vstore_partial_3_1  vstore_partial_1
#define vstore_partial_3_2  vstore_partial_2
#define vstore_partial_3_3  vstore_partial_3
#define vstore_partial_3_4  NO_STORE
#define vstore_partial_3_5  NO_STORE
#define vstore_partial_3_6  NO_STORE
#define vstore_partial_3_7  NO_STORE
#define vstore_partial_3_8  NO_STORE
#define vstore_partial_3_9  NO_STORE
#define vstore_partial_3_10 NO_STORE
#define vstore_partial_3_11 NO_STORE
#define vstore_partial_3_12 NO_STORE
#define vstore_partial_3_13 NO_STORE
#define vstore_partial_3_14 NO_STORE
#define vstore_partial_3_15 NO_STORE
#define vstore_partial_3_16 NO_STORE

#define vstore_partial_4_0  NO_STORE
#define vstore_partial_4_1  vstore_partial_1
#define vstore_partial_4_2  vstore_partial_2
#define vstore_partial_4_3  vstore_partial_3
#define vstore_partial_4_4  vstore_partial_4
#define vstore_partial_4_5  NO_STORE
#define vstore_partial_4_6  NO_STORE
#define vstore_partial_4_7  NO_STORE
#define vstore_partial_4_8  NO_STORE
#define vstore_partial_4_9  NO_STORE
#define vstore_partial_4_10 NO_STORE
#define vstore_partial_4_11 NO_STORE
#define vstore_partial_4_12 NO_STORE
#define vstore_partial_4_13 NO_STORE
#define vstore_partial_4_14 NO_STORE
#define vstore_partial_4_15 NO_STORE
#define vstore_partial_4_16 NO_STORE

#define vstore_partial_8_0 NO_STORE
#define vstore_partial_8_1 vstore_partial_1
#define vstore_partial_8_2 vstore_partial_2
#define vstore_partial_8_3 vstore_partial_3
#define vstore_partial_8_4 vstore_partial_4
#define vstore_partial_8_5 vstore_partial_5
#define vstore_partial_8_6 vstore_partial_6
#define vstore_partial_8_7 vstore_partial_7
#define vstore_partial_8_8 vstore_partial_8
#define vstore_partial_8_9 NO_STORE
/* Dispatch table vstore_partial_<size>_<store_size>: remaining entries of
 * size 8 (store_size above 8 stores nothing) and all of size 16. */
#define vstore_partial_8_10 NO_STORE
#define vstore_partial_8_11 NO_STORE
#define vstore_partial_8_12 NO_STORE
#define vstore_partial_8_13 NO_STORE
#define vstore_partial_8_14 NO_STORE
#define vstore_partial_8_15 NO_STORE
#define vstore_partial_8_16 NO_STORE

#define vstore_partial_16_0  NO_STORE
#define vstore_partial_16_1  vstore_partial_1
#define vstore_partial_16_2  vstore_partial_2
#define vstore_partial_16_3  vstore_partial_3
#define vstore_partial_16_4  vstore_partial_4
#define vstore_partial_16_5  vstore_partial_5
#define vstore_partial_16_6  vstore_partial_6
#define vstore_partial_16_7  vstore_partial_7
#define vstore_partial_16_8  vstore_partial_8
#define vstore_partial_16_9  vstore_partial_9
#define vstore_partial_16_10 vstore_partial_10
#define vstore_partial_16_11 vstore_partial_11
#define vstore_partial_16_12 vstore_partial_12
#define vstore_partial_16_13 vstore_partial_13
#define vstore_partial_16_14 vstore_partial_14
#define vstore_partial_16_15 vstore_partial_15
#define vstore_partial_16_16 vstore_partial_16

/* vstore_partial_<n>(DATA, OFFSET, PTR): store the first n components of DATA.
 * The expansion already ends in ';'. */
#define vstore_partial_1(DATA, OFFSET, PTR) \
    vstore1(DATA.s0, OFFSET, PTR);

#define vstore_partial_2(DATA, OFFSET, PTR) \
    vstore2(DATA.s01, OFFSET, PTR);
/* vstore_partial_<n>(DATA, OFFSET, PTR): store the first n components of DATA.
 * Sizes 3, 4, 8 and 16 are a single vstore; the other sizes are composed from
 * a 4- or 8-wide store followed by a smaller store at PTR + 4 or PTR + 8.
 * Every expansion already ends in ';'.
 * NOTE(review): the composed sizes advance PTR by a fixed element count while
 * forwarding OFFSET unchanged, which looks right only for OFFSET == 0 - confirm
 * against the callers. */
#define vstore_partial_3(DATA, OFFSET, PTR) \
    vstore3(DATA.s012, OFFSET, PTR);

#define vstore_partial_4(DATA, OFFSET, PTR) \
    vstore4(DATA.s0123, OFFSET, PTR);

/* 5 = 4 + 1 */
#define vstore_partial_5(DATA, OFFSET, PTR) \
    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
    vstore1(DATA.s4, OFFSET, PTR + 4);

/* 6 = 4 + 2 */
#define vstore_partial_6(DATA, OFFSET, PTR) \
    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
    vstore_partial_2(DATA.s45, OFFSET, PTR + 4);

/* 7 = 4 + 3 */
#define vstore_partial_7(DATA, OFFSET, PTR) \
    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
    vstore_partial_3(DATA.s456, OFFSET, PTR + 4);

#define vstore_partial_8(DATA, OFFSET, PTR) \
    vstore8(DATA.s01234567, OFFSET, PTR);

/* 9 = 8 + 1 */
#define vstore_partial_9(DATA, OFFSET, PTR) \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore1(DATA.s8, OFFSET, PTR + 8);

/* 10 = 8 + 2 */
#define vstore_partial_10(DATA, OFFSET, PTR) \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_2(DATA.s89, OFFSET, PTR + 8);

/* 11 = 8 + 3 */
#define vstore_partial_11(DATA, OFFSET, PTR) \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_3(DATA.s89a, OFFSET, PTR + 8);

/* 12 = 8 + 4 */
#define vstore_partial_12(DATA, OFFSET, PTR) \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_4(DATA.s89ab, OFFSET, PTR + 8);

/* 13, 14, 15: the upper half is handed over as a full 8-wide swizzle because
 * the 5/6/7 stores index it again with .s0123 and .s4 / .s45 / .s456. */
#define vstore_partial_13(DATA, OFFSET, PTR) \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_5(DATA.s89abcdef, OFFSET, PTR + 8);

#define vstore_partial_14(DATA, OFFSET, PTR) \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_6(DATA.s89abcdef, OFFSET, PTR + 8);

#define vstore_partial_15(DATA, OFFSET, PTR) \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_7(DATA.s89abcdef, OFFSET, PTR + 8);

#define vstore_partial_16(DATA, OFFSET, PTR) \
    vstore16(DATA, OFFSET, PTR);

/* Saturating-conversion aliases for floating-point destinations.
 * OpenCL defines the _sat modifier for integer destinations only, so generic
 * code that always requests _sat is routed to the plain conversion here. */
#define convert_float_sat convert_float
#define convert_float1_sat convert_float
#define convert_float2_sat convert_float2
#define convert_float3_sat convert_float3
#define convert_float4_sat convert_float4
#define convert_float8_sat convert_float8
#define convert_float16_sat convert_float16
/* Fixed: this alias used to name convert_float, unlike every other half entry. */
#define convert_half_sat convert_half
#define convert_half1_sat convert_half
#define convert_half2_sat convert_half2
#define convert_half3_sat convert_half3
#define convert_half4_sat convert_half4
#define convert_half8_sat convert_half8
#define convert_half16_sat convert_half16

/* Width-1 conversion aliases: convert_<type>1 is the scalar conversion. */
#define convert_float1 convert_float
#define convert_half1 convert_half
#define convert_char1 convert_char
#define convert_uchar1 convert_uchar
#define convert_short1 convert_short
#define convert_ushort1 convert_ushort
#define convert_int1 convert_int
#define convert_uint1 convert_uint
#define convert_long1 convert_long
#define convert_ulong1 convert_ulong
#define convert_double1 convert_double

/* Width-1 saturating aliases. The convert_uchar<N>_sat lines map a name onto
 * itself and therefore change nothing; they are kept so the names stay defined
 * as macros. */
#define convert_char1_sat convert_char_sat
#define convert_uchar1_sat convert_uchar_sat
#define convert_uchar2_sat convert_uchar2_sat
#define convert_uchar3_sat convert_uchar3_sat
#define convert_uchar4_sat convert_uchar4_sat
#define convert_uchar8_sat convert_uchar8_sat
#define convert_uchar16_sat convert_uchar16_sat
#define convert_short1_sat convert_short_sat
#define convert_ushort1_sat convert_ushort_sat
#define convert_int1_sat convert_int_sat
#define convert_uint1_sat convert_uint_sat
#define convert_long1_sat convert_long_sat
#define convert_ulong1_sat convert_ulong_sat
/* Fixed: double is a floating-point destination, so there is no
 * convert_double_sat built-in; route to the plain conversion like float. */
#define convert_double1_sat convert_double

/* VEC_DATA_TYPE(type, size): the vector type name type##size, with both
 * arguments expanded before the paste. */
#define VEC_DATA_TYPE_STR(type, size) type##size
#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)

/* Calls convert_<type> on x; type is pasted without prior expansion. */
#define CONVERT_STR(x, type) (convert_##type((x)))
Faust#define CONVERT(x, type) CONVERT_STR(x, type) 2223*c217d954SCole Faust 2224*c217d954SCole Faust#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x))) 2225*c217d954SCole Faust#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type) 2226*c217d954SCole Faust 2227*c217d954SCole Faust#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x))) 2228*c217d954SCole Faust#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round) 2229*c217d954SCole Faust 2230*c217d954SCole Faust#define select_vec_dt_uchar(size) uchar##size 2231*c217d954SCole Faust#define select_vec_dt_char(size) char##size 2232*c217d954SCole Faust#define select_vec_dt_ushort(size) ushort##size 2233*c217d954SCole Faust#define select_vec_dt_short(size) short##size 2234*c217d954SCole Faust#define select_vec_dt_half(size) short##size 2235*c217d954SCole Faust#define select_vec_dt_uint(size) uint##size 2236*c217d954SCole Faust#define select_vec_dt_int(size) int##size 2237*c217d954SCole Faust#define select_vec_dt_float(size) int##size 2238*c217d954SCole Faust#define select_vec_dt_ulong(size) ulong##size 2239*c217d954SCole Faust#define select_vec_dt_long(size) long##size 2240*c217d954SCole Faust 2241*c217d954SCole Faust#define SELECT_VEC_DATA_TYPE_STR(type, size) select_vec_dt_##type(size) 2242*c217d954SCole Faust#define SELECT_VEC_DATA_TYPE(type, size) SELECT_VEC_DATA_TYPE_STR(type, size) 2243*c217d954SCole Faust#define SELECT_DATA_TYPE(type) SELECT_VEC_DATA_TYPE_STR(type, 1) 2244*c217d954SCole Faust 2245*c217d954SCole Faust#define signed_int_vec_dt_uchar(size) char##size 2246*c217d954SCole Faust#define signed_int_vec_dt_char(size) char##size 2247*c217d954SCole Faust#define signed_int_vec_dt_ushort(size) short##size 2248*c217d954SCole Faust#define signed_int_vec_dt_short(size) short##size 2249*c217d954SCole Faust#define signed_int_vec_dt_half(size) short##size 2250*c217d954SCole Faust#define signed_int_vec_dt_uint(size) int##size 2251*c217d954SCole Faust#define 
signed_int_vec_dt_int(size) int##size 2252*c217d954SCole Faust#define signed_int_vec_dt_float(size) int##size 2253*c217d954SCole Faust#define signed_int_vec_dt_ulong(size) long##size 2254*c217d954SCole Faust#define signed_int_vec_dt_long(size) long##size 2255*c217d954SCole Faust 2256*c217d954SCole Faust#define SIGNED_INT_VEC_DATA_TYPE_STR(type, size) signed_int_vec_dt_##type(size) 2257*c217d954SCole Faust#define SIGNED_INT_VEC_DATA_TYPE(type, size) SIGNED_INT_VEC_DATA_TYPE_STR(type, size) 2258*c217d954SCole Faust#define SIGNED_INT_DATA_TYPE(type) SIGNED_INT_VEC_DATA_TYPE_STR(type, 1) 2259*c217d954SCole Faust 2260*c217d954SCole Faust#define sum_reduce_1(x) (x) 2261*c217d954SCole Faust#define sum_reduce_2(x) ((x).s0) + ((x).s1) 2262*c217d954SCole Faust#define sum_reduce_3(x) sum_reduce_2((x).s01) + ((x).s2) 2263*c217d954SCole Faust#define sum_reduce_4(x) sum_reduce_2((x).s01) + sum_reduce_2((x).s23) 2264*c217d954SCole Faust#define sum_reduce_8(x) sum_reduce_4((x).s0123) + sum_reduce_4((x).s4567) 2265*c217d954SCole Faust#define sum_reduce_16(x) sum_reduce_8((x).s01234567) + sum_reduce_8((x).s89ABCDEF) 2266*c217d954SCole Faust 2267*c217d954SCole Faust#define SUM_REDUCE_STR(x, size) sum_reduce_##size(x) 2268*c217d954SCole Faust#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size) 2269*c217d954SCole Faust 2270*c217d954SCole Faust#define prod_reduce_1(x) (x) 2271*c217d954SCole Faust#define prod_reduce_2(x) ((x).s0) * ((x).s1) 2272*c217d954SCole Faust#define prod_reduce_3(x) prod_reduce_2((x).s01) * ((x).s2) 2273*c217d954SCole Faust#define prod_reduce_4(x) prod_reduce_2((x).s01) * prod_reduce_2((x).s23) 2274*c217d954SCole Faust#define prod_reduce_8(x) prod_reduce_4((x).s0123) * prod_reduce_4((x).s4567) 2275*c217d954SCole Faust#define prod_reduce_16(x) prod_reduce_8((x).s01234567) * prod_reduce_8((x).s89ABCDEF) 2276*c217d954SCole Faust 2277*c217d954SCole Faust#define PROD_REDUCE_STR(x, size) prod_reduce_##size(x) 2278*c217d954SCole Faust#define PROD_REDUCE(x, size) 
PROD_REDUCE_STR(x, size) 2279*c217d954SCole Faust 2280*c217d954SCole Faust#define max_reduce_1(x) (x) 2281*c217d954SCole Faust#define max_reduce_2(x) max(((x).s0), ((x).s1)) 2282*c217d954SCole Faust#define max_reduce_3(x) max(max_reduce_2((x).s01), ((x).s2)) 2283*c217d954SCole Faust#define max_reduce_4(x) max(max_reduce_2((x).s01), max_reduce_2((x).s23)) 2284*c217d954SCole Faust#define max_reduce_8(x) max(max_reduce_4((x).s0123), max_reduce_4((x).s4567)) 2285*c217d954SCole Faust#define max_reduce_16(x) max(max_reduce_8((x).s01234567), max_reduce_8((x).s89ABCDEF)) 2286*c217d954SCole Faust 2287*c217d954SCole Faust#define MAX_REDUCE_STR(x, size) max_reduce_##size(x) 2288*c217d954SCole Faust#define MAX_REDUCE(x, size) MAX_REDUCE_STR(x, size) 2289*c217d954SCole Faust 2290*c217d954SCole Faust#define VECTOR_DECLARATION(name) \ 2291*c217d954SCole Faust __global uchar *name##_ptr, \ 2292*c217d954SCole Faust uint name##_stride_x, \ 2293*c217d954SCole Faust uint name##_step_x, \ 2294*c217d954SCole Faust uint name##_offset_first_element_in_bytes 2295*c217d954SCole Faust 2296*c217d954SCole Faust#define IMAGE_DECLARATION(name) \ 2297*c217d954SCole Faust __global uchar *name##_ptr, \ 2298*c217d954SCole Faust uint name##_stride_x, \ 2299*c217d954SCole Faust uint name##_step_x, \ 2300*c217d954SCole Faust uint name##_stride_y, \ 2301*c217d954SCole Faust uint name##_step_y, \ 2302*c217d954SCole Faust uint name##_offset_first_element_in_bytes 2303*c217d954SCole Faust 2304*c217d954SCole Faust#define TENSOR3D_DECLARATION(name) \ 2305*c217d954SCole Faust __global uchar *name##_ptr, \ 2306*c217d954SCole Faust uint name##_stride_x, \ 2307*c217d954SCole Faust uint name##_step_x, \ 2308*c217d954SCole Faust uint name##_stride_y, \ 2309*c217d954SCole Faust uint name##_step_y, \ 2310*c217d954SCole Faust uint name##_stride_z, \ 2311*c217d954SCole Faust uint name##_step_z, \ 2312*c217d954SCole Faust uint name##_offset_first_element_in_bytes 2313*c217d954SCole Faust 2314*c217d954SCole Faust#define 
TENSOR4D_DECLARATION(name) \ 2315*c217d954SCole Faust __global uchar *name##_ptr, \ 2316*c217d954SCole Faust uint name##_stride_x, \ 2317*c217d954SCole Faust uint name##_step_x, \ 2318*c217d954SCole Faust uint name##_stride_y, \ 2319*c217d954SCole Faust uint name##_step_y, \ 2320*c217d954SCole Faust uint name##_stride_z, \ 2321*c217d954SCole Faust uint name##_step_z, \ 2322*c217d954SCole Faust uint name##_stride_w, \ 2323*c217d954SCole Faust uint name##_step_w, \ 2324*c217d954SCole Faust uint name##_offset_first_element_in_bytes 2325*c217d954SCole Faust 2326*c217d954SCole Faust#define TENSOR5D_DECLARATION(name) \ 2327*c217d954SCole Faust __global uchar *name##_ptr, \ 2328*c217d954SCole Faust uint name##_stride_x, \ 2329*c217d954SCole Faust uint name##_step_x, \ 2330*c217d954SCole Faust uint name##_stride_y, \ 2331*c217d954SCole Faust uint name##_step_y, \ 2332*c217d954SCole Faust uint name##_stride_z, \ 2333*c217d954SCole Faust uint name##_step_z, \ 2334*c217d954SCole Faust uint name##_stride_w, \ 2335*c217d954SCole Faust uint name##_step_w, \ 2336*c217d954SCole Faust uint name##_stride_v, \ 2337*c217d954SCole Faust uint name##_step_v, \ 2338*c217d954SCole Faust uint name##_offset_first_element_in_bytes 2339*c217d954SCole Faust 2340*c217d954SCole Faust#define CONVERT_TO_VECTOR_STRUCT(name) \ 2341*c217d954SCole Faust update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x) 2342*c217d954SCole Faust 2343*c217d954SCole Faust#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \ 2344*c217d954SCole Faust update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0) 2345*c217d954SCole Faust 2346*c217d954SCole Faust#define CONVERT_TO_IMAGE_STRUCT(name) \ 2347*c217d954SCole Faust update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y) 2348*c217d954SCole Faust 2349*c217d954SCole Faust#define 
CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \ 2350*c217d954SCole Faust update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0) 2351*c217d954SCole Faust 2352*c217d954SCole Faust#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \ 2353*c217d954SCole Faust update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z) 2354*c217d954SCole Faust 2355*c217d954SCole Faust#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \ 2356*c217d954SCole Faust update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z) 2357*c217d954SCole Faust 2358*c217d954SCole Faust#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \ 2359*c217d954SCole Faust update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z) 2360*c217d954SCole Faust 2361*c217d954SCole Faust#define CONVERT_TO_TENSOR3D_STRUCT(name) \ 2362*c217d954SCole Faust update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \ 2363*c217d954SCole Faust name##_stride_z, name##_step_z) 2364*c217d954SCole Faust 2365*c217d954SCole Faust#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \ 2366*c217d954SCole Faust update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0) 2367*c217d954SCole Faust 2368*c217d954SCole Faust#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size) \ 2369*c217d954SCole Faust update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \ 2370*c217d954SCole Faust name##_stride_z, 
name##_step_z, name##_stride_w, name##_step_w, mod_size) 2371*c217d954SCole Faust 2372*c217d954SCole Faust#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \ 2373*c217d954SCole Faust update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size) 2374*c217d954SCole Faust 2375*c217d954SCole Faust#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name) \ 2376*c217d954SCole Faust tensor3D_ptr_no_update(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \ 2377*c217d954SCole Faust name##_stride_z, name##_step_z) 2378*c217d954SCole Faust 2379*c217d954SCole Faust 2380*c217d954SCole Fausttypedef struct Vector 2381*c217d954SCole Faust{ 2382*c217d954SCole Faust __global uchar *ptr; 2383*c217d954SCole Faust int offset_first_element_in_bytes; 2384*c217d954SCole Faust int stride_x; 2385*c217d954SCole Faust} Vector; 2386*c217d954SCole Faust 2387*c217d954SCole Faust 2388*c217d954SCole Fausttypedef struct Image 2389*c217d954SCole Faust{ 2390*c217d954SCole Faust __global uchar *ptr; 2391*c217d954SCole Faust int offset_first_element_in_bytes; 2392*c217d954SCole Faust int stride_x; 2393*c217d954SCole Faust int stride_y; 2394*c217d954SCole Faust} Image; 2395*c217d954SCole Faust 2396*c217d954SCole Faust 2397*c217d954SCole Fausttypedef struct Tensor3D 2398*c217d954SCole Faust{ 2399*c217d954SCole Faust __global uchar *ptr; 2400*c217d954SCole Faust int offset_first_element_in_bytes; 2401*c217d954SCole Faust int stride_x; 2402*c217d954SCole Faust int stride_y; 2403*c217d954SCole Faust int stride_z; 2404*c217d954SCole Faust} Tensor3D; 2405*c217d954SCole Faust 2406*c217d954SCole Faust 2407*c217d954SCole Fausttypedef struct Tensor4D 2408*c217d954SCole Faust{ 2409*c217d954SCole Faust __global uchar *ptr; 2410*c217d954SCole Faust int offset_first_element_in_bytes; 2411*c217d954SCole Faust int stride_x; 2412*c217d954SCole 
Faust int stride_y; 2413*c217d954SCole Faust int stride_z; 2414*c217d954SCole Faust int stride_w; 2415*c217d954SCole Faust} Tensor4D; 2416*c217d954SCole Faust 2417*c217d954SCole Faust 2418*c217d954SCole Faustinline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x) 2419*c217d954SCole Faust{ 2420*c217d954SCole Faust Vector vector = 2421*c217d954SCole Faust { 2422*c217d954SCole Faust .ptr = ptr, 2423*c217d954SCole Faust .offset_first_element_in_bytes = offset_first_element_in_bytes, 2424*c217d954SCole Faust .stride_x = stride_x, 2425*c217d954SCole Faust }; 2426*c217d954SCole Faust vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x; 2427*c217d954SCole Faust return vector; 2428*c217d954SCole Faust} 2429*c217d954SCole Faust 2430*c217d954SCole Faust 2431*c217d954SCole Faustinline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y) 2432*c217d954SCole Faust{ 2433*c217d954SCole Faust Image img = 2434*c217d954SCole Faust { 2435*c217d954SCole Faust .ptr = ptr, 2436*c217d954SCole Faust .offset_first_element_in_bytes = offset_first_element_in_bytes, 2437*c217d954SCole Faust .stride_x = stride_x, 2438*c217d954SCole Faust .stride_y = stride_y 2439*c217d954SCole Faust }; 2440*c217d954SCole Faust img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y; 2441*c217d954SCole Faust return img; 2442*c217d954SCole Faust} 2443*c217d954SCole Faust 2444*c217d954SCole Faust 2445*c217d954SCole Faustinline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z) 2446*c217d954SCole Faust{ 2447*c217d954SCole Faust Image img = 2448*c217d954SCole Faust { 2449*c217d954SCole Faust .ptr = ptr, 2450*c217d954SCole Faust 
.offset_first_element_in_bytes = offset_first_element_in_bytes, 2451*c217d954SCole Faust .stride_x = stride_x, 2452*c217d954SCole Faust .stride_y = stride_y 2453*c217d954SCole Faust }; 2454*c217d954SCole Faust img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z; 2455*c217d954SCole Faust return img; 2456*c217d954SCole Faust} 2457*c217d954SCole Faust 2458*c217d954SCole Faust 2459*c217d954SCole Faustinline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z) 2460*c217d954SCole Faust{ 2461*c217d954SCole Faust Tensor3D tensor = 2462*c217d954SCole Faust { 2463*c217d954SCole Faust .ptr = ptr, 2464*c217d954SCole Faust .offset_first_element_in_bytes = offset_first_element_in_bytes, 2465*c217d954SCole Faust .stride_x = stride_x, 2466*c217d954SCole Faust .stride_y = stride_y, 2467*c217d954SCole Faust .stride_z = stride_z 2468*c217d954SCole Faust }; 2469*c217d954SCole Faust tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z; 2470*c217d954SCole Faust return tensor; 2471*c217d954SCole Faust} 2472*c217d954SCole Faust 2473*c217d954SCole Faust 2474*c217d954SCole Faustinline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z) 2475*c217d954SCole Faust{ 2476*c217d954SCole Faust Tensor3D tensor = 2477*c217d954SCole Faust { 2478*c217d954SCole Faust .ptr = ptr, 2479*c217d954SCole Faust .offset_first_element_in_bytes = offset_first_element_in_bytes, 2480*c217d954SCole Faust .stride_x = stride_x, 2481*c217d954SCole Faust .stride_y = stride_y, 2482*c217d954SCole Faust .stride_z = stride_z 2483*c217d954SCole Faust }; 2484*c217d954SCole Faust return tensor; 2485*c217d954SCole Faust} 
2486*c217d954SCole Faust 2487*c217d954SCole Faustinline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w, 2488*c217d954SCole Faust uint step_w, 2489*c217d954SCole Faust uint mod_size) 2490*c217d954SCole Faust{ 2491*c217d954SCole Faust Tensor4D tensor = 2492*c217d954SCole Faust { 2493*c217d954SCole Faust .ptr = ptr, 2494*c217d954SCole Faust .offset_first_element_in_bytes = offset_first_element_in_bytes, 2495*c217d954SCole Faust .stride_x = stride_x, 2496*c217d954SCole Faust .stride_y = stride_y, 2497*c217d954SCole Faust .stride_z = stride_z, 2498*c217d954SCole Faust .stride_w = stride_w 2499*c217d954SCole Faust }; 2500*c217d954SCole Faust 2501*c217d954SCole Faust tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w; 2502*c217d954SCole Faust return tensor; 2503*c217d954SCole Faust} 2504*c217d954SCole Faust 2505*c217d954SCole Faust 2506*c217d954SCole Faustinline __global const uchar *vector_offset(const Vector *vec, int x) 2507*c217d954SCole Faust{ 2508*c217d954SCole Faust return vec->ptr + x * vec->stride_x; 2509*c217d954SCole Faust} 2510*c217d954SCole Faust 2511*c217d954SCole Faust 2512*c217d954SCole Faustinline __global uchar *offset(const Image *img, int x, int y) 2513*c217d954SCole Faust{ 2514*c217d954SCole Faust return img->ptr + x * img->stride_x + y * img->stride_y; 2515*c217d954SCole Faust} 2516*c217d954SCole Faust 2517*c217d954SCole Faust 2518*c217d954SCole Faustinline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z) 2519*c217d954SCole Faust{ 2520*c217d954SCole Faust return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z; 2521*c217d954SCole Faust} 2522*c217d954SCole Faust 2523*c217d954SCole Faust 2524*c217d954SCole 
Faustinline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w) 2525*c217d954SCole Faust{ 2526*c217d954SCole Faust return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w; 2527*c217d954SCole Faust} 2528*c217d954SCole Faust 2529*c217d954SCole Faust 2530*c217d954SCole Faustinline __global const uchar *tensor3D_index2ptr(const Tensor3D *tensor, uint width, uint height, uint depth, uint index) 2531*c217d954SCole Faust{ 2532*c217d954SCole Faust uint num_elements = width * height; 2533*c217d954SCole Faust 2534*c217d954SCole Faust const uint z = index / num_elements; 2535*c217d954SCole Faust 2536*c217d954SCole Faust index %= num_elements; 2537*c217d954SCole Faust 2538*c217d954SCole Faust const uint y = index / width; 2539*c217d954SCole Faust 2540*c217d954SCole Faust index %= width; 2541*c217d954SCole Faust 2542*c217d954SCole Faust const uint x = index; 2543*c217d954SCole Faust 2544*c217d954SCole Faust return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + tensor->offset_first_element_in_bytes; 2545*c217d954SCole Faust} 2546*c217d954SCole Faust 2547*c217d954SCole Faust#endif 2548*c217d954SCole Faust 2549*c217d954SCole Faust 2550*c217d954SCole Faust#define SCALAR_ACCESS_STR(offset, n0, x) scalar_access_##offset##_##n0(x) 2551*c217d954SCole Faust#define SCALAR_ACCESS(offset, n0, x) SCALAR_ACCESS_STR(offset, n0, x) 2552*c217d954SCole Faust 2553*c217d954SCole Faust 2554*c217d954SCole Faust#define scalar_access_0_1(x) ((x).s0) 2555*c217d954SCole Faust#define scalar_access_0_2(x) ((x).s01) 2556*c217d954SCole Faust#define scalar_access_0_3(x) ((x).s012) 2557*c217d954SCole Faust#define scalar_access_0_4(x) ((x).s0123) 2558*c217d954SCole Faust#define scalar_access_0_8(x) ((x).s01234567) 2559*c217d954SCole Faust#define scalar_access_0_16(x) ((x).s0123456789ABCDEF) 2560*c217d954SCole Faust 2561*c217d954SCole Faust 2562*c217d954SCole Faust#define 
scalar_access_1_1(x) ((x).s1) 2563*c217d954SCole Faust#define scalar_access_1_2(x) ((x).s12) 2564*c217d954SCole Faust#define scalar_access_1_3(x) ((x).s123) 2565*c217d954SCole Faust#define scalar_access_1_4(x) ((x).s1234) 2566*c217d954SCole Faust#define scalar_access_1_8(x) ((x).s12345678) 2567*c217d954SCole Faust 2568*c217d954SCole Faust 2569*c217d954SCole Faust#define scalar_access_2_1(x) ((x).s2) 2570*c217d954SCole Faust#define scalar_access_2_2(x) ((x).s23) 2571*c217d954SCole Faust#define scalar_access_2_3(x) ((x).s234) 2572*c217d954SCole Faust#define scalar_access_2_4(x) ((x).s2345) 2573*c217d954SCole Faust#define scalar_access_2_8(x) ((x).s23456789) 2574*c217d954SCole Faust 2575*c217d954SCole Faust 2576*c217d954SCole Faust#define scalar_access_3_1(x) ((x).s3) 2577*c217d954SCole Faust#define scalar_access_3_2(x) ((x).s34) 2578*c217d954SCole Faust#define scalar_access_3_3(x) ((x).s345) 2579*c217d954SCole Faust#define scalar_access_3_4(x) ((x).s3456) 2580*c217d954SCole Faust#define scalar_access_3_8(x) ((x).s3456789A) 2581*c217d954SCole Faust 2582*c217d954SCole Faust 2583*c217d954SCole Faust#define scalar_access_4_1(x) ((x).s4) 2584*c217d954SCole Faust#define scalar_access_4_2(x) ((x).s45) 2585*c217d954SCole Faust#define scalar_access_4_3(x) ((x).s456) 2586*c217d954SCole Faust#define scalar_access_4_4(x) ((x).s4567) 2587*c217d954SCole Faust#define scalar_access_4_8(x) ((x).s456789AB) 2588*c217d954SCole Faust 2589*c217d954SCole Faust 2590*c217d954SCole Faust#define scalar_access_8_1(x) ((x).s8) 2591*c217d954SCole Faust#define scalar_access_8_2(x) ((x).s89) 2592*c217d954SCole Faust#define scalar_access_8_3(x) ((x).s89A) 2593*c217d954SCole Faust#define scalar_access_8_4(x) ((x).s89AB) 2594*c217d954SCole Faust#define scalar_access_8_8(x) ((x).s89ABCDEF) 2595*c217d954SCole Faust 2596*c217d954SCole Faust 2597*c217d954SCole Faust#define scalar_access_12_1(x) ((x).sC) 2598*c217d954SCole Faust#define scalar_access_12_2(x) ((x).sCD) 2599*c217d954SCole Faust#define 
scalar_access_12_3(x) ((x).sCDE) 2600*c217d954SCole Faust#define scalar_access_12_4(x) ((x).sCDEF) 2601*c217d954SCole Faust 2602*c217d954SCole Faust 2603*c217d954SCole Faust#define scalar_access_16_1(x) ((x).sF) 2604*c217d954SCole Faust 2605*c217d954SCole Faust 2606*c217d954SCole Faust#define LOAD_TENSOR_ROW_0(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \ 2607*c217d954SCole Faust ({}) 2608*c217d954SCole Faust 2609*c217d954SCole Faust#define LOAD_TENSOR_ROW_1(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \ 2610*c217d954SCole Faust SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##0) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0)); 2611*c217d954SCole Faust 2612*c217d954SCole Faust#define LOAD_TENSOR_ROW_2(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \ 2613*c217d954SCole Faust LOAD_TENSOR_ROW_1(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \ 2614*c217d954SCole Faust SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##1) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1)); 2615*c217d954SCole Faust 2616*c217d954SCole Faust#define LOAD_TENSOR_ROW_3(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \ 2617*c217d954SCole Faust LOAD_TENSOR_ROW_2(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \ 2618*c217d954SCole Faust SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##2) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2)); 2619*c217d954SCole Faust 2620*c217d954SCole Faust#define LOAD_TENSOR_ROW_4(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \ 2621*c217d954SCole Faust LOAD_TENSOR_ROW_3(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \ 2622*c217d954SCole Faust SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##3) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3)); 2623*c217d954SCole Faust 2624*c217d954SCole Faust#define LOAD_TENSOR_ROW_5(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \ 2625*c217d954SCole Faust LOAD_TENSOR_ROW_4(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \ 
2626*c217d954SCole Faust SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##4) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4)); 2627*c217d954SCole Faust 2628*c217d954SCole Faust#define LOAD_TENSOR_ROW_6(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \ 2629*c217d954SCole Faust LOAD_TENSOR_ROW_5(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \ 2630*c217d954SCole Faust SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##5) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5)); 2631*c217d954SCole Faust 2632*c217d954SCole Faust#define LOAD_TENSOR_ROW_7(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \ 2633*c217d954SCole Faust LOAD_TENSOR_ROW_6(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \ 2634*c217d954SCole Faust SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##6) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6)); 2635*c217d954SCole Faust 2636*c217d954SCole Faust#define LOAD_TENSOR_ROW_8(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \ 2637*c217d954SCole Faust LOAD_TENSOR_ROW_7(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \ 2638*c217d954SCole Faust SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##7) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7)); 2639*c217d954SCole Faust 2640*c217d954SCole Faust#define LOAD_TENSOR_ROW_9(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \ 2641*c217d954SCole Faust LOAD_TENSOR_ROW_8(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \ 2642*c217d954SCole Faust SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##8) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8)); 2643*c217d954SCole Faust 2644*c217d954SCole Faust#define LOAD_TENSOR_ROW_10(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \ 2645*c217d954SCole Faust LOAD_TENSOR_ROW_9(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \ 2646*c217d954SCole Faust SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##9) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9)); 2647*c217d954SCole Faust 
/*
 * LOAD_TENSOR_ROW_11 .. LOAD_TENSOR_ROW_16: continuation of the row-loading
 * chain. Each macro expands the previous one and then loads one more row:
 * row index 10..15 uses the hexadecimal suffix A..F on BASENAME and Z, reads
 * N0 elements with VLOAD(N0) from PTR + <row> * STRIDE_Y + Z##<suffix> and
 * stores them into the components selected by SCALAR_ACCESS(COL_OFFSET, N0, ...).
 */
#define LOAD_TENSOR_ROW_11(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_10(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##A) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define LOAD_TENSOR_ROW_12(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_11(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##B) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define LOAD_TENSOR_ROW_13(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_12(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##C) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define LOAD_TENSOR_ROW_14(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_13(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##D) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define LOAD_TENSOR_ROW_15(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_14(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##E) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define LOAD_TENSOR_ROW_16(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_15(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##F) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));

/*
 * LOAD_TENSOR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)
 * Dispatch to LOAD_TENSOR_ROW_<M0>. The _STR level lets M0 be a macro that is
 * expanded before it is pasted into the name. Takes exactly eight arguments.
 */
#define LOAD_TENSOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) LOAD_TENSOR_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)
#define LOAD_TENSOR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) LOAD_TENSOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)

/*
 * LOAD_TENSOR_M0X<n>(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin)
 *
 * Load M0 rows of n elements each into a##<row>.
 *  - n = 1, 2, 3, 4, 8, 16: one LOAD_TENSOR call of width N0 at column 0.
 *  - other n: split into chunks of 8, 4 and a remainder of 1..3 elements.
 *    Each chunk after the first reads from input_ptr advanced by
 *    <column> * sizeof(DATA_TYPE) and writes at that same column offset.
 *  - n = 0 expands to an empty statement expression.
 */
#define LOAD_TENSOR_M0X0(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    ({})

#define LOAD_TENSOR_M0X1(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);

#define LOAD_TENSOR_M0X2(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);

#define LOAD_TENSOR_M0X3(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);

#define LOAD_TENSOR_M0X4(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);

#define LOAD_TENSOR_M0X5(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);       \
    LOAD_TENSOR(M0, 1, DATA_TYPE, a, input_ptr + 4 * sizeof(DATA_TYPE), 4, src_stride_y, zin);

#define LOAD_TENSOR_M0X6(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);       \
    LOAD_TENSOR(M0, 2, DATA_TYPE, a, input_ptr + 4 * sizeof(DATA_TYPE), 4, src_stride_y, zin);

#define LOAD_TENSOR_M0X7(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);       \
    LOAD_TENSOR(M0, 3, DATA_TYPE, a, input_ptr + 4 * sizeof(DATA_TYPE), 4, src_stride_y, zin);

#define LOAD_TENSOR_M0X8(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);

/* Fixed: the comma between input_ptr and the column offset 0 was missing, so
 * LOAD_TENSOR received seven arguments and the expansion did not compile. */
#define LOAD_TENSOR_M0X9(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);       \
    LOAD_TENSOR(M0, 1, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin);

#define LOAD_TENSOR_M0X10(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);        \
    LOAD_TENSOR(M0, 2, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin);

#define LOAD_TENSOR_M0X11(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);        \
    LOAD_TENSOR(M0, 3, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin);

#define LOAD_TENSOR_M0X12(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);        \
    LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin);

#define LOAD_TENSOR_M0X13(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin)                  \
    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);                         \
    LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin); \
    LOAD_TENSOR(M0, 1, DATA_TYPE, a, input_ptr + 12 * sizeof(DATA_TYPE), 12, src_stride_y, zin);

/* Fixed: same missing comma as in LOAD_TENSOR_M0X9. */
#define LOAD_TENSOR_M0X14(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin)                  \
    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);                         \
    LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin); \
    LOAD_TENSOR(M0, 2, DATA_TYPE, a, input_ptr + 12 * sizeof(DATA_TYPE), 12, src_stride_y, zin);

#define LOAD_TENSOR_M0X15(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin)                  \
    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);                         \
    LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin); \
    LOAD_TENSOR(M0, 3, DATA_TYPE, a, input_ptr + 12 * sizeof(DATA_TYPE), 12, src_stride_y, zin);

#define LOAD_TENSOR_M0X16(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);

/**
 * Column-count dispatch: LOAD_TENSOR_M0XN0(M0, N0, ...) expands to
 * LOAD_TENSOR_M0X<N0>(M0, N0, ...). The _STR level performs the token paste;
 * the outer macro lets N0 be macro-expanded first.
 */
#define LOAD_TENSOR_M0XN0_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) LOAD_TENSOR_M0X##N0(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define LOAD_TENSOR_M0XN0(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) LOAD_TENSOR_M0XN0_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)

/**
 * LOAD_ROW_<n>: declare and load rows 0..n-1.
 *
 * Each LOAD_ROW_<n> first expands LOAD_ROW_<n-1> and then declares one more
 * variable of type VEC_DATA_TYPE(DATA_TYPE, N0). Row i is named BASENAME
 * followed by the suffix for i (0-9, then A, B, C, ... for 10 and above) and
 * is initialised with
 *   VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + i * STRIDE_Y + Z<suffix>))
 *
 *   N0         width passed to VEC_DATA_TYPE and VLOAD
 *   DATA_TYPE  element type
 *   BASENAME   prefix of the declared row variables
 *   PTR        base address
 *   OFFSET     offset added to PTR for every row
 *   STRIDE_Y   distance between consecutive rows
 *   Z          prefix of the per-row offset identifiers (Z0, Z1, ...)
 */
#define LOAD_ROW_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##0 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 0 * STRIDE_Y + Z##0));

#define LOAD_ROW_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##1 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 1 * STRIDE_Y + Z##1));

#define LOAD_ROW_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##2 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 2 * STRIDE_Y + Z##2));

#define LOAD_ROW_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##3 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 3 * STRIDE_Y + Z##3));

#define LOAD_ROW_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##4 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 4 * STRIDE_Y + Z##4));

#define LOAD_ROW_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##5 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 5 * STRIDE_Y + Z##5));

#define LOAD_ROW_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##6 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 6 * STRIDE_Y + Z##6));

#define LOAD_ROW_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##7 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 7 * STRIDE_Y + Z##7));

#define LOAD_ROW_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##8 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 8 * STRIDE_Y + Z##8));

#define LOAD_ROW_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##9 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 9 * STRIDE_Y + Z##9));

#define LOAD_ROW_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##A = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 10 * STRIDE_Y + Z##A));

#define LOAD_ROW_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##B = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 11 * STRIDE_Y + Z##B));

#define LOAD_ROW_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##C = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 12 * STRIDE_Y + Z##C));

#define LOAD_ROW_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##D = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 13 * STRIDE_Y + Z##D));

#define LOAD_ROW_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##E = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 14 * STRIDE_Y + Z##E));
/**
 * LOAD_ROW_16: expands LOAD_ROW_15 and then declares row 15 (suffix F) as
 * VEC_DATA_TYPE(DATA_TYPE, N0), initialised through VLOAD(N0) from
 * PTR + OFFSET + 15 * STRIDE_Y + ZF.
 */
#define LOAD_ROW_16(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##F = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 15 * STRIDE_Y + Z##F));

/**
 * LOAD_BLOCK(M0, N0, ...): expands to LOAD_ROW_<M0>(N0, ...).
 * LOAD_BLOCK_STR performs the token paste; the LOAD_BLOCK wrapper lets M0 be
 * macro-expanded before it is pasted.
 */
#define LOAD_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
#define LOAD_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)

/**
 * LOAD_ROW_PARTIAL_<n>: load rows 0..n-1 into variables that already exist.
 *
 * Unlike LOAD_ROW_<n>, nothing is declared here: row i is handed to
 *   VLOAD_PARTIAL(N0, LOAD_N0)(BASENAME<suffix>, 0, (__global DATA_TYPE *)(PTR + OFFSET + i * STRIDE_Y + Z<suffix>))
 * so the BASENAME<suffix> variables must be in scope at the expansion site.
 *
 *   N0, LOAD_N0  the two arguments forwarded to VLOAD_PARTIAL
 *   DATA_TYPE    element type
 *   BASENAME     prefix of the destination row variables
 *   PTR, OFFSET  base address and offset added for every row
 *   STRIDE_Y     distance between consecutive rows
 *   Z            prefix of the per-row offset identifiers
 */
#define LOAD_ROW_PARTIAL_1(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VLOAD_PARTIAL(N0, LOAD_N0) \
    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + OFFSET + 0 * STRIDE_Y + Z##0));

#define LOAD_ROW_PARTIAL_2(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_PARTIAL_1(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VLOAD_PARTIAL(N0, LOAD_N0) \
    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + OFFSET + 1 * STRIDE_Y + Z##1));

#define LOAD_ROW_PARTIAL_3(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_PARTIAL_2(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VLOAD_PARTIAL(N0, LOAD_N0) \
    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + OFFSET + 2 * STRIDE_Y + Z##2));

/**
 * LOAD_ROW_PARTIAL_4 .. LOAD_ROW_PARTIAL_13.
 *
 * Each LOAD_ROW_PARTIAL_<n> expands LOAD_ROW_PARTIAL_<n-1> and then loads row
 * n-1 into the already-declared variable BASENAME<suffix> (suffix 3..9, then
 * A, B, C for rows 10, 11, 12) with
 *   VLOAD_PARTIAL(N0, LOAD_N0)(BASENAME<suffix>, 0, (__global DATA_TYPE *)(PTR + OFFSET + (n-1) * STRIDE_Y + Z<suffix>))
 */
#define LOAD_ROW_PARTIAL_4(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_PARTIAL_3(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VLOAD_PARTIAL(N0, LOAD_N0) \
    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + OFFSET + 3 * STRIDE_Y + Z##3));

#define LOAD_ROW_PARTIAL_5(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_PARTIAL_4(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VLOAD_PARTIAL(N0, LOAD_N0) \
    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + OFFSET + 4 * STRIDE_Y + Z##4));

#define LOAD_ROW_PARTIAL_6(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_PARTIAL_5(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VLOAD_PARTIAL(N0, LOAD_N0) \
    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + OFFSET + 5 * STRIDE_Y + Z##5));

#define LOAD_ROW_PARTIAL_7(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_PARTIAL_6(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VLOAD_PARTIAL(N0, LOAD_N0) \
    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + OFFSET + 6 * STRIDE_Y + Z##6));

#define LOAD_ROW_PARTIAL_8(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_PARTIAL_7(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VLOAD_PARTIAL(N0, LOAD_N0) \
    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + OFFSET + 7 * STRIDE_Y + Z##7));

#define LOAD_ROW_PARTIAL_9(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_PARTIAL_8(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VLOAD_PARTIAL(N0, LOAD_N0) \
    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + OFFSET + 8 * STRIDE_Y + Z##8));

#define LOAD_ROW_PARTIAL_10(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_PARTIAL_9(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VLOAD_PARTIAL(N0, LOAD_N0) \
    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + OFFSET + 9 * STRIDE_Y + Z##9));

#define LOAD_ROW_PARTIAL_11(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_PARTIAL_10(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VLOAD_PARTIAL(N0, LOAD_N0) \
    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + OFFSET + 10 * STRIDE_Y + Z##A));

#define LOAD_ROW_PARTIAL_12(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_PARTIAL_11(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VLOAD_PARTIAL(N0, LOAD_N0) \
    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + OFFSET + 11 * STRIDE_Y + Z##B));

#define LOAD_ROW_PARTIAL_13(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_PARTIAL_12(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VLOAD_PARTIAL(N0, LOAD_N0) \
    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + OFFSET + 12 * STRIDE_Y + Z##C));
/**
 * LOAD_ROW_PARTIAL_14 .. LOAD_ROW_PARTIAL_16.
 *
 * Each expands the previous LOAD_ROW_PARTIAL_<n-1> and then loads row n-1
 * (suffix D, E, F) into the already-declared variable BASENAME<suffix> via
 *   VLOAD_PARTIAL(N0, LOAD_N0)(BASENAME<suffix>, 0, (__global DATA_TYPE *)(PTR + OFFSET + (n-1) * STRIDE_Y + Z<suffix>))
 */
#define LOAD_ROW_PARTIAL_14(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_PARTIAL_13(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VLOAD_PARTIAL(N0, LOAD_N0) \
    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + OFFSET + 13 * STRIDE_Y + Z##D));

#define LOAD_ROW_PARTIAL_15(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_PARTIAL_14(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VLOAD_PARTIAL(N0, LOAD_N0) \
    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + OFFSET + 14 * STRIDE_Y + Z##E));

#define LOAD_ROW_PARTIAL_16(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_PARTIAL_15(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VLOAD_PARTIAL(N0, LOAD_N0) \
    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + OFFSET + 15 * STRIDE_Y + Z##F));

/**
 * LOAD_BLOCK_PARTIAL(LOAD_M0, LOAD_N0, N0, ...): expands to
 * LOAD_ROW_PARTIAL_<LOAD_M0>(N0, LOAD_N0, ...). Note the argument order
 * changes: LOAD_N0 comes before N0 here and after it in the row macros.
 * The _STR level performs the paste so that LOAD_M0 is macro-expanded first.
 */
#define LOAD_BLOCK_PARTIAL_STR(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_ROW_PARTIAL_##LOAD_M0(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
#define LOAD_BLOCK_PARTIAL(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_BLOCK_PARTIAL_STR(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)

/**
 * Run-time choice between a full and a partial block load in both dimensions.
 *
 *   PARTIAL_COND_X false, PARTIAL_COND_Y false -> M0 rows,               N0 columns
 *   PARTIAL_COND_Y true,  PARTIAL_COND_X false -> PARTIAL_STORE_M0 rows, N0 columns
 *   PARTIAL_COND_Y false, PARTIAL_COND_X true  -> M0 rows,               PARTIAL_STORE_N0 columns
 *   both true                                  -> PARTIAL_STORE_M0 rows, PARTIAL_STORE_N0 columns
 *
 * Expands to a bare if / else-if chain (no enclosing do-while), and each
 * condition argument may be evaluated more than once.
 */
#define LOAD_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y)) \
    { \
        LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \
    } \
    else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X)) \
    { \
        LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \
    } \
    else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X)) \
    { \
        LOAD_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \
    } \
    else \
    { \
        LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \
    }

/**
 * Run-time choice in X only: loads M0 rows of N0 columns when PARTIAL_COND_X
 * is false, otherwise M0 rows of PARTIAL_STORE_N0 columns.
 */
#define LOAD_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \
    if(!(PARTIAL_COND_X)) \
    { \
        LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \
    } \
    else \
    { \
        LOAD_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \
    }

/**
 * Run-time choice in Y only: loads M0 rows of N0 columns when PARTIAL_COND_Y
 * is false, otherwise PARTIAL_STORE_M0 rows of N0 columns.
 */
#define LOAD_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \
    if(!(PARTIAL_COND_Y)) \
    { \
        LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \
    } \
    else \
    { \
        LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \
    }

/**
 * LOAD_BLOCK_BOUNDARY_AWARE: one of four definitions is chosen at
 * preprocessing time from the PARTIAL_STORE_M0 / PARTIAL_STORE_N0 macros
 * (an identifier that is not defined as a macro evaluates as 0 in #if).
 *
 *   both 0              -> plain LOAD_BLOCK, which also declares the rows
 *   only M0 > 0         -> LOAD_BLOCK_PARTIAL_IN_Y
 *   only N0 > 0         -> LOAD_BLOCK_PARTIAL_IN_X
 *   any other case      -> LOAD_BLOCK_PARTIAL_IN_X_AND_Y
 *
 * In the three partial variants the rows are first created with
 * REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0)
 * because the partial loaders only assign to existing variables.
 * All four definitions accept the same twelve parameters.
 */
#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0

#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    LOAD_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)

#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0

#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0); \
    LOAD_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y)

#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0

#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0); \
    LOAD_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X)

#else

#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0); \
    LOAD_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)

#endif

/**
 * LOAD_TEXTURE2D_ROW_<n>: assign rows 0..n-1 from an image.
 *
 * Row i (suffix 0-9, then A-F) is assigned
 *   READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + i * X_STEP_ROW), (Y_COORD + i * Y_STEP_ROW))
 * The BASENAME<suffix> variables are assigned, not declared.
 *
 * NOTE(review): no statement terminator is written between the rows here;
 * presumably READ_IMAGE2D supplies it - confirm against its definition.
 */
#define LOAD_TEXTURE2D_ROW_1(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    BASENAME##0 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 0 * X_STEP_ROW), (Y_COORD + 0 * Y_STEP_ROW))

#define LOAD_TEXTURE2D_ROW_2(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    LOAD_TEXTURE2D_ROW_1(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    BASENAME##1 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 1 * X_STEP_ROW), (Y_COORD + 1 * Y_STEP_ROW))

#define LOAD_TEXTURE2D_ROW_3(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    LOAD_TEXTURE2D_ROW_2(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    BASENAME##2 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 2 * X_STEP_ROW), (Y_COORD + 2 * Y_STEP_ROW))

#define LOAD_TEXTURE2D_ROW_4(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    LOAD_TEXTURE2D_ROW_3(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    BASENAME##3 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 3 * X_STEP_ROW), (Y_COORD + 3 * Y_STEP_ROW))

#define LOAD_TEXTURE2D_ROW_5(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    LOAD_TEXTURE2D_ROW_4(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    BASENAME##4 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 4 * X_STEP_ROW), (Y_COORD + 4 * Y_STEP_ROW))

#define LOAD_TEXTURE2D_ROW_6(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    LOAD_TEXTURE2D_ROW_5(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    BASENAME##5 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 5 * X_STEP_ROW), (Y_COORD + 5 * Y_STEP_ROW))

#define LOAD_TEXTURE2D_ROW_7(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    LOAD_TEXTURE2D_ROW_6(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    BASENAME##6 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 6 * X_STEP_ROW), (Y_COORD + 6 * Y_STEP_ROW))

#define LOAD_TEXTURE2D_ROW_8(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    LOAD_TEXTURE2D_ROW_7(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    BASENAME##7 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 7 * X_STEP_ROW), (Y_COORD + 7 * Y_STEP_ROW))

#define LOAD_TEXTURE2D_ROW_9(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    LOAD_TEXTURE2D_ROW_8(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    BASENAME##8 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 8 * X_STEP_ROW), (Y_COORD + 8 * Y_STEP_ROW))

#define LOAD_TEXTURE2D_ROW_10(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    LOAD_TEXTURE2D_ROW_9(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    BASENAME##9 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 9 * X_STEP_ROW), (Y_COORD + 9 * Y_STEP_ROW))

#define LOAD_TEXTURE2D_ROW_11(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    LOAD_TEXTURE2D_ROW_10(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    BASENAME##A = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 10 * X_STEP_ROW), (Y_COORD + 10 * Y_STEP_ROW))

#define LOAD_TEXTURE2D_ROW_12(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    LOAD_TEXTURE2D_ROW_11(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    BASENAME##B = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 11 * X_STEP_ROW), (Y_COORD + 11 * Y_STEP_ROW))

#define LOAD_TEXTURE2D_ROW_13(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    LOAD_TEXTURE2D_ROW_12(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    BASENAME##C = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 12 * X_STEP_ROW), (Y_COORD + 12 * Y_STEP_ROW))

#define LOAD_TEXTURE2D_ROW_14(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    LOAD_TEXTURE2D_ROW_13(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    BASENAME##D = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 13 * X_STEP_ROW), (Y_COORD + 13 * Y_STEP_ROW))

#define LOAD_TEXTURE2D_ROW_15(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    LOAD_TEXTURE2D_ROW_14(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    BASENAME##E = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 14 * X_STEP_ROW), (Y_COORD + 14 * Y_STEP_ROW))

#define LOAD_TEXTURE2D_ROW_16(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    LOAD_TEXTURE2D_ROW_15(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    BASENAME##F = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 15 * X_STEP_ROW), (Y_COORD + 15 * Y_STEP_ROW))

/**
 * LOAD_TEXTURE2D(M0, N0, ...): expands to LOAD_TEXTURE2D_ROW_<M0>(N0, ...).
 * The _STR level performs the paste so that M0 is macro-expanded first.
 */
#define LOAD_TEXTURE2D_STR(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) LOAD_TEXTURE2D_ROW_##M0(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)
#define LOAD_TEXTURE2D(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) LOAD_TEXTURE2D_STR(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)

/**
 * LOAD_ROW_INDIRECT_<n>: declare rows 0..n-1 and load each one from a
 * caller-supplied row index, guarded by a mask.
 *
 * For every row suffix s the macro declares BASENAME<s> as
 * VEC_DATA_TYPE(DATA_TYPE, N0); when Y_MASK<s> is non-zero the row is loaded
 * through VLOAD(N0) from PTR + OFFSET + Y<s> * STRIDE_Y, otherwise it is set
 * to 0.
 *
 *   Y       prefix of the per-row index identifiers (Y0, Y1, ...)
 *   Y_MASK  prefix of the per-row mask identifiers
 */
#define LOAD_ROW_INDIRECT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##0; \
    if(Y_MASK##0 != 0) \
        BASENAME##0 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##0 * STRIDE_Y)); \
    else \
        BASENAME##0 = 0;

#define LOAD_ROW_INDIRECT_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
    LOAD_ROW_INDIRECT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##1; \
    if(Y_MASK##1 != 0) \
        BASENAME##1 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##1 * STRIDE_Y)); \
    else \
        BASENAME##1 = 0;

#define LOAD_ROW_INDIRECT_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
    LOAD_ROW_INDIRECT_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##2; \
    if(Y_MASK##2 != 0) \
        BASENAME##2 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##2 * STRIDE_Y)); \
    else \
        BASENAME##2 = 0;

#define LOAD_ROW_INDIRECT_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
    LOAD_ROW_INDIRECT_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##3; \
    if(Y_MASK##3 != 0) \
        BASENAME##3 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##3 * STRIDE_Y)); \
    else \
        BASENAME##3 = 0;

#define LOAD_ROW_INDIRECT_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
    LOAD_ROW_INDIRECT_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##4; \
    if(Y_MASK##4 != 0) \
        BASENAME##4 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##4 * STRIDE_Y)); \
    else \
        BASENAME##4 = 0;

#define LOAD_ROW_INDIRECT_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
    LOAD_ROW_INDIRECT_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##5; \
    if(Y_MASK##5 != 0) \
        BASENAME##5 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##5 * STRIDE_Y)); \
    else \
        BASENAME##5 = 0;

#define LOAD_ROW_INDIRECT_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
    LOAD_ROW_INDIRECT_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##6; \
    if(Y_MASK##6 != 0) \
        BASENAME##6 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##6 * STRIDE_Y)); \
    else \
        BASENAME##6 = 0;

#define LOAD_ROW_INDIRECT_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
    LOAD_ROW_INDIRECT_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##7; \
    if(Y_MASK##7 != 0) \
        BASENAME##7 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##7 * STRIDE_Y)); \
    else \
        BASENAME##7 = 0;

#define LOAD_ROW_INDIRECT_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
    LOAD_ROW_INDIRECT_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##8; \
    if(Y_MASK##8 != 0) \
        BASENAME##8 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##8 * STRIDE_Y)); \
    else \
        BASENAME##8 = 0;

#define LOAD_ROW_INDIRECT_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
    LOAD_ROW_INDIRECT_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##9; \
    if(Y_MASK##9 != 0) \
        BASENAME##9 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##9 * STRIDE_Y)); \
    else \
        BASENAME##9 = 0;

#define LOAD_ROW_INDIRECT_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
    LOAD_ROW_INDIRECT_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##A; \
    if(Y_MASK##A != 0) \
        BASENAME##A = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##A * STRIDE_Y)); \
    else \
        BASENAME##A = 0;

#define LOAD_ROW_INDIRECT_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
    LOAD_ROW_INDIRECT_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##B; \
    if(Y_MASK##B != 0) \
        BASENAME##B = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##B * STRIDE_Y)); \
    else \
        BASENAME##B = 0;

#define LOAD_ROW_INDIRECT_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
    LOAD_ROW_INDIRECT_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##C; \
    if(Y_MASK##C != 0) \
        BASENAME##C = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##C * STRIDE_Y)); \
    else \
        BASENAME##C = 0;

#define LOAD_ROW_INDIRECT_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
    LOAD_ROW_INDIRECT_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##D; \
    if(Y_MASK##D != 0) \
        BASENAME##D = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##D * STRIDE_Y)); \
    else \
        BASENAME##D = 0;

#define LOAD_ROW_INDIRECT_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
    LOAD_ROW_INDIRECT_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##E; \
    if(Y_MASK##E != 0) \
        BASENAME##E = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##E * STRIDE_Y)); \
    else \
        BASENAME##E = 0;

#define LOAD_ROW_INDIRECT_16(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
    LOAD_ROW_INDIRECT_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##F; \
    if(Y_MASK##F != 0) \
        BASENAME##F = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##F * STRIDE_Y)); \
    else \
        BASENAME##F = 0;

/**
 * LOAD_BLOCK_INDIRECT(M0, N0, ...): expands to LOAD_ROW_INDIRECT_<M0>(N0, ...).
 * The _STR level performs the paste so that M0 is macro-expanded first.
 */
#define LOAD_BLOCK_INDIRECT_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) LOAD_ROW_INDIRECT_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)
#define LOAD_BLOCK_INDIRECT(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) LOAD_BLOCK_INDIRECT_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)

/**
 * LOAD_ELEMENT_<n>: declare rows 0..n-1 as VEC_DATA_TYPE(DATA_TYPE, N0) and
 * initialise row i from the single DATA_TYPE element read at
 * PTR + OFFSET + i * STRIDE_Y. There is no per-row Z offset in this family.
 */
#define LOAD_ELEMENT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##0 = *((__global DATA_TYPE *)(PTR + OFFSET + 0 * STRIDE_Y));

#define LOAD_ELEMENT_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##1 = *((__global DATA_TYPE *)(PTR + OFFSET + 1 * STRIDE_Y));

#define LOAD_ELEMENT_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##2 = *((__global DATA_TYPE *)(PTR + OFFSET + 2 * STRIDE_Y));

#define LOAD_ELEMENT_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##3 =
*((__global DATA_TYPE *)(PTR + OFFSET + 3 * STRIDE_Y)); 3221*c217d954SCole Faust 3222*c217d954SCole Faust#define LOAD_ELEMENT_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3223*c217d954SCole Faust LOAD_ELEMENT_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3224*c217d954SCole Faust VEC_DATA_TYPE(DATA_TYPE, N0) \ 3225*c217d954SCole Faust BASENAME##4 = *((__global DATA_TYPE *)(PTR + OFFSET + 4 * STRIDE_Y)); 3226*c217d954SCole Faust 3227*c217d954SCole Faust#define LOAD_ELEMENT_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3228*c217d954SCole Faust LOAD_ELEMENT_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3229*c217d954SCole Faust VEC_DATA_TYPE(DATA_TYPE, N0) \ 3230*c217d954SCole Faust BASENAME##5 = *((__global DATA_TYPE *)(PTR + OFFSET + 5 * STRIDE_Y)); 3231*c217d954SCole Faust 3232*c217d954SCole Faust#define LOAD_ELEMENT_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3233*c217d954SCole Faust LOAD_ELEMENT_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3234*c217d954SCole Faust VEC_DATA_TYPE(DATA_TYPE, N0) \ 3235*c217d954SCole Faust BASENAME##6 = *((__global DATA_TYPE *)(PTR + OFFSET + 6 * STRIDE_Y)); 3236*c217d954SCole Faust 3237*c217d954SCole Faust#define LOAD_ELEMENT_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3238*c217d954SCole Faust LOAD_ELEMENT_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3239*c217d954SCole Faust VEC_DATA_TYPE(DATA_TYPE, N0) \ 3240*c217d954SCole Faust BASENAME##7 = *((__global DATA_TYPE *)(PTR + OFFSET + 7 * STRIDE_Y)); 3241*c217d954SCole Faust 3242*c217d954SCole Faust#define LOAD_ELEMENT_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3243*c217d954SCole Faust LOAD_ELEMENT_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3244*c217d954SCole Faust VEC_DATA_TYPE(DATA_TYPE, N0) \ 3245*c217d954SCole Faust BASENAME##8 = *((__global DATA_TYPE *)(PTR + OFFSET + 8 * STRIDE_Y)); 3246*c217d954SCole Faust 3247*c217d954SCole Faust#define LOAD_ELEMENT_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 
3248*c217d954SCole Faust LOAD_ELEMENT_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3249*c217d954SCole Faust VEC_DATA_TYPE(DATA_TYPE, N0) \ 3250*c217d954SCole Faust BASENAME##9 = *((__global DATA_TYPE *)(PTR + OFFSET + 9 * STRIDE_Y)); 3251*c217d954SCole Faust 3252*c217d954SCole Faust#define LOAD_ELEMENT_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3253*c217d954SCole Faust LOAD_ELEMENT_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3254*c217d954SCole Faust VEC_DATA_TYPE(DATA_TYPE, N0) \ 3255*c217d954SCole Faust BASENAME##A = *((__global DATA_TYPE *)(PTR + OFFSET + 10 * STRIDE_Y)); 3256*c217d954SCole Faust 3257*c217d954SCole Faust#define LOAD_ELEMENT_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3258*c217d954SCole Faust LOAD_ELEMENT_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3259*c217d954SCole Faust VEC_DATA_TYPE(DATA_TYPE, N0) \ 3260*c217d954SCole Faust BASENAME##B = *((__global DATA_TYPE *)(PTR + OFFSET + 11 * STRIDE_Y)); 3261*c217d954SCole Faust 3262*c217d954SCole Faust#define LOAD_ELEMENT_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3263*c217d954SCole Faust LOAD_ELEMENT_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3264*c217d954SCole Faust VEC_DATA_TYPE(DATA_TYPE, N0) \ 3265*c217d954SCole Faust BASENAME##C = *((__global DATA_TYPE *)(PTR + OFFSET + 12 * STRIDE_Y)); 3266*c217d954SCole Faust 3267*c217d954SCole Faust#define LOAD_ELEMENT_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3268*c217d954SCole Faust LOAD_ELEMENT_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3269*c217d954SCole Faust VEC_DATA_TYPE(DATA_TYPE, N0) \ 3270*c217d954SCole Faust BASENAME##D = *((__global DATA_TYPE *)(PTR + OFFSET + 13 * STRIDE_Y)); 3271*c217d954SCole Faust 3272*c217d954SCole Faust#define LOAD_ELEMENT_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3273*c217d954SCole Faust LOAD_ELEMENT_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3274*c217d954SCole Faust VEC_DATA_TYPE(DATA_TYPE, N0) \ 3275*c217d954SCole Faust 
BASENAME##E = *((__global DATA_TYPE *)(PTR + OFFSET + 14 * STRIDE_Y)); 3276*c217d954SCole Faust 3277*c217d954SCole Faust#define LOAD_ELEMENT_16(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3278*c217d954SCole Faust LOAD_ELEMENT_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3279*c217d954SCole Faust VEC_DATA_TYPE(DATA_TYPE, N0) \ 3280*c217d954SCole Faust BASENAME##F = *((__global DATA_TYPE *)(PTR + OFFSET + 15 * STRIDE_Y)); 3281*c217d954SCole Faust 3282*c217d954SCole Faust 3283*c217d954SCole Faust 3284*c217d954SCole Faust 3285*c217d954SCole Faust#define LOAD_SCALAR_AS_VECTOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) LOAD_ELEMENT_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) 3286*c217d954SCole Faust#define LOAD_SCALAR_AS_VECTOR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) LOAD_SCALAR_AS_VECTOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) 3287*c217d954SCole Faust 3288*c217d954SCole Faust 3289*c217d954SCole Faust 3290*c217d954SCole Faust#define CALCULATE_Z_OFFSET_1(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ 3291*c217d954SCole Faust Z##0 = (0 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D; \ 3292*c217d954SCole Faust Z##0 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##0); \ 3293*c217d954SCole Faust Z##0 *= (CROSS_PLANE_PAD * STRIDE_Y); 3294*c217d954SCole Faust 3295*c217d954SCole Faust#define CALCULATE_Z_OFFSET_2(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ 3296*c217d954SCole Faust CALCULATE_Z_OFFSET_1(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ 3297*c217d954SCole Faust Z##1 = (1 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D; \ 3298*c217d954SCole Faust Z##1 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##1); \ 3299*c217d954SCole Faust Z##1 *= (CROSS_PLANE_PAD * STRIDE_Y); 3300*c217d954SCole Faust 3301*c217d954SCole Faust#define CALCULATE_Z_OFFSET_3(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ 
3302*c217d954SCole Faust CALCULATE_Z_OFFSET_2(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ 3303*c217d954SCole Faust Z##2 = (2 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D; \ 3304*c217d954SCole Faust Z##2 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##2); \ 3305*c217d954SCole Faust Z##2 *= (CROSS_PLANE_PAD * STRIDE_Y); 3306*c217d954SCole Faust 3307*c217d954SCole Faust#define CALCULATE_Z_OFFSET_4(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ 3308*c217d954SCole Faust CALCULATE_Z_OFFSET_3(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ 3309*c217d954SCole Faust Z##3 = (3 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D; \ 3310*c217d954SCole Faust Z##3 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##3); \ 3311*c217d954SCole Faust Z##3 *= (CROSS_PLANE_PAD * STRIDE_Y); 3312*c217d954SCole Faust 3313*c217d954SCole Faust#define CALCULATE_Z_OFFSET_5(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ 3314*c217d954SCole Faust CALCULATE_Z_OFFSET_4(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ 3315*c217d954SCole Faust Z##4 = (4 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D; \ 3316*c217d954SCole Faust Z##4 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##4); \ 3317*c217d954SCole Faust Z##4 *= (CROSS_PLANE_PAD * STRIDE_Y); 3318*c217d954SCole Faust 3319*c217d954SCole Faust#define CALCULATE_Z_OFFSET_6(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ 3320*c217d954SCole Faust CALCULATE_Z_OFFSET_5(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ 3321*c217d954SCole Faust Z##5 = (5 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D; \ 3322*c217d954SCole Faust Z##5 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##5); \ 3323*c217d954SCole Faust Z##5 *= (CROSS_PLANE_PAD * STRIDE_Y); 3324*c217d954SCole Faust 3325*c217d954SCole Faust#define CALCULATE_Z_OFFSET_7(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, 
CROSS_PLANE_PAD, STRIDE_Y) \ 3326*c217d954SCole Faust CALCULATE_Z_OFFSET_6(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ 3327*c217d954SCole Faust Z##6 = (6 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D; \ 3328*c217d954SCole Faust Z##6 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##6); \ 3329*c217d954SCole Faust Z##6 *= (CROSS_PLANE_PAD * STRIDE_Y); 3330*c217d954SCole Faust 3331*c217d954SCole Faust#define CALCULATE_Z_OFFSET_8(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ 3332*c217d954SCole Faust CALCULATE_Z_OFFSET_7(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ 3333*c217d954SCole Faust Z##7 = (7 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D; \ 3334*c217d954SCole Faust Z##7 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##7); \ 3335*c217d954SCole Faust Z##7 *= (CROSS_PLANE_PAD * STRIDE_Y); 3336*c217d954SCole Faust 3337*c217d954SCole Faust 3338*c217d954SCole Faust 3339*c217d954SCole Faust 3340*c217d954SCole Faust#define CALCULATE_Z_OFFSET_STR(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) CALCULATE_Z_OFFSET_##M0(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) 3341*c217d954SCole Faust#define CALCULATE_Z_OFFSET(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) CALCULATE_Z_OFFSET_STR(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) 3342*c217d954SCole Faust 3343*c217d954SCole Faust 3344*c217d954SCole Faust 3345*c217d954SCole Faust#define SCALE_ROW_1(DATA_TYPE, BASENAME, SCALE) \ 3346*c217d954SCole Faust BASENAME##0 *= (DATA_TYPE)SCALE; 3347*c217d954SCole Faust 3348*c217d954SCole Faust#define SCALE_ROW_2(DATA_TYPE, BASENAME, SCALE) \ 3349*c217d954SCole Faust SCALE_ROW_1(DATA_TYPE, BASENAME, SCALE) \ 3350*c217d954SCole Faust BASENAME##1 *= (DATA_TYPE)SCALE; 3351*c217d954SCole Faust 3352*c217d954SCole Faust#define SCALE_ROW_3(DATA_TYPE, BASENAME, SCALE) \ 3353*c217d954SCole Faust 
SCALE_ROW_2(DATA_TYPE, BASENAME, SCALE) \ 3354*c217d954SCole Faust BASENAME##2 *= (DATA_TYPE)SCALE; 3355*c217d954SCole Faust 3356*c217d954SCole Faust#define SCALE_ROW_4(DATA_TYPE, BASENAME, SCALE) \ 3357*c217d954SCole Faust SCALE_ROW_3(DATA_TYPE, BASENAME, SCALE) \ 3358*c217d954SCole Faust BASENAME##3 *= (DATA_TYPE)SCALE; 3359*c217d954SCole Faust 3360*c217d954SCole Faust#define SCALE_ROW_5(DATA_TYPE, BASENAME, SCALE) \ 3361*c217d954SCole Faust SCALE_ROW_4(DATA_TYPE, BASENAME, SCALE) \ 3362*c217d954SCole Faust BASENAME##4 *= (DATA_TYPE)SCALE; 3363*c217d954SCole Faust 3364*c217d954SCole Faust#define SCALE_ROW_6(DATA_TYPE, BASENAME, SCALE) \ 3365*c217d954SCole Faust SCALE_ROW_5(DATA_TYPE, BASENAME, SCALE) \ 3366*c217d954SCole Faust BASENAME##5 *= (DATA_TYPE)SCALE; 3367*c217d954SCole Faust 3368*c217d954SCole Faust#define SCALE_ROW_7(DATA_TYPE, BASENAME, SCALE) \ 3369*c217d954SCole Faust SCALE_ROW_6(DATA_TYPE, BASENAME, SCALE) \ 3370*c217d954SCole Faust BASENAME##6 *= (DATA_TYPE)SCALE; 3371*c217d954SCole Faust 3372*c217d954SCole Faust#define SCALE_ROW_8(DATA_TYPE, BASENAME, SCALE) \ 3373*c217d954SCole Faust SCALE_ROW_7(DATA_TYPE, BASENAME, SCALE) \ 3374*c217d954SCole Faust BASENAME##7 *= (DATA_TYPE)SCALE; 3375*c217d954SCole Faust 3376*c217d954SCole Faust#define SCALE_ROW_9(DATA_TYPE, BASENAME, SCALE) \ 3377*c217d954SCole Faust SCALE_ROW_8(DATA_TYPE, BASENAME, SCALE) \ 3378*c217d954SCole Faust BASENAME##8 *= (DATA_TYPE)SCALE; 3379*c217d954SCole Faust 3380*c217d954SCole Faust#define SCALE_ROW_10(DATA_TYPE, BASENAME, SCALE) \ 3381*c217d954SCole Faust SCALE_ROW_9(DATA_TYPE, BASENAME, SCALE) \ 3382*c217d954SCole Faust BASENAME##9 *= (DATA_TYPE)SCALE; 3383*c217d954SCole Faust 3384*c217d954SCole Faust#define SCALE_ROW_11(DATA_TYPE, BASENAME, SCALE) \ 3385*c217d954SCole Faust SCALE_ROW_10(DATA_TYPE, BASENAME, SCALE) \ 3386*c217d954SCole Faust BASENAME##A *= (DATA_TYPE)SCALE; 3387*c217d954SCole Faust 3388*c217d954SCole Faust#define SCALE_ROW_12(DATA_TYPE, BASENAME, SCALE) \ 
3389*c217d954SCole Faust SCALE_ROW_11(DATA_TYPE, BASENAME, SCALE) \ 3390*c217d954SCole Faust BASENAME##B *= (DATA_TYPE)SCALE; 3391*c217d954SCole Faust 3392*c217d954SCole Faust#define SCALE_ROW_13(DATA_TYPE, BASENAME, SCALE) \ 3393*c217d954SCole Faust SCALE_ROW_12(DATA_TYPE, BASENAME, SCALE) \ 3394*c217d954SCole Faust BASENAME##C *= (DATA_TYPE)SCALE; 3395*c217d954SCole Faust 3396*c217d954SCole Faust#define SCALE_ROW_14(DATA_TYPE, BASENAME, SCALE) \ 3397*c217d954SCole Faust SCALE_ROW_13(DATA_TYPE, BASENAME, SCALE) \ 3398*c217d954SCole Faust BASENAME##D *= (DATA_TYPE)SCALE; 3399*c217d954SCole Faust 3400*c217d954SCole Faust#define SCALE_ROW_15(DATA_TYPE, BASENAME, SCALE) \ 3401*c217d954SCole Faust SCALE_ROW_14(DATA_TYPE, BASENAME, SCALE) \ 3402*c217d954SCole Faust BASENAME##E *= (DATA_TYPE)SCALE; 3403*c217d954SCole Faust 3404*c217d954SCole Faust#define SCALE_ROW_16(DATA_TYPE, BASENAME, SCALE) \ 3405*c217d954SCole Faust SCALE_ROW_15(DATA_TYPE, BASENAME, SCALE) \ 3406*c217d954SCole Faust BASENAME##F *= (DATA_TYPE)SCALE; 3407*c217d954SCole Faust 3408*c217d954SCole Faust 3409*c217d954SCole Faust 3410*c217d954SCole Faust#define SCALE_BLOCK_STR(N, DATA_TYPE, BASENAME, SCALE) SCALE_ROW_##N(DATA_TYPE, BASENAME, SCALE) 3411*c217d954SCole Faust#define SCALE_BLOCK(N, DATA_TYPE, BASENAME, SCALE) SCALE_BLOCK_STR(N, DATA_TYPE, BASENAME, SCALE) 3412*c217d954SCole Faust 3413*c217d954SCole Faust 3414*c217d954SCole Faust 3415*c217d954SCole Faust#define COLUMN_VECTOR1(IDX_COL, BASENAME, X, TYPE) \ 3416*c217d954SCole Faust TYPE BASENAME##IDX_COL = (TYPE)((X##0).s##IDX_COL); 3417*c217d954SCole Faust#define COLUMN_VECTOR2(IDX_COL, BASENAME, X, TYPE) \ 3418*c217d954SCole Faust VEC_DATA_TYPE(TYPE, 2) \ 3419*c217d954SCole Faust BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 2))((X##0).s##IDX_COL, (X##1).s##IDX_COL); 3420*c217d954SCole Faust#define COLUMN_VECTOR3(IDX_COL, BASENAME, X, TYPE) \ 3421*c217d954SCole Faust VEC_DATA_TYPE(TYPE, 3) \ 3422*c217d954SCole Faust BASENAME##IDX_COL = 
(VEC_DATA_TYPE(TYPE, 3))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL); 3423*c217d954SCole Faust#define COLUMN_VECTOR4(IDX_COL, BASENAME, X, TYPE) \ 3424*c217d954SCole Faust VEC_DATA_TYPE(TYPE, 4) \ 3425*c217d954SCole Faust BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 4))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL); 3426*c217d954SCole Faust#define COLUMN_VECTOR8(IDX_COL, BASENAME, X, TYPE) \ 3427*c217d954SCole Faust VEC_DATA_TYPE(TYPE, 8) \ 3428*c217d954SCole Faust BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 8))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL, (X##4).s##IDX_COL, (X##5).s##IDX_COL, (X##6).s##IDX_COL, (X##7).s##IDX_COL); 3429*c217d954SCole Faust#define COLUMN_VECTOR16(IDX_COL, BASENAME, X, TYPE) \ 3430*c217d954SCole Faust VEC_DATA_TYPE(TYPE, 16) \ 3431*c217d954SCole Faust BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 16))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL, (X##4).s##IDX_COL, (X##5).s##IDX_COL, (X##6).s##IDX_COL, (X##7).s##IDX_COL, (X##8).s##IDX_COL, (X##9).s##IDX_COL, (X##A).s##IDX_COL, (X##B).s##IDX_COL, (X##C).s##IDX_COL, (X##D).s##IDX_COL, (X##E).s##IDX_COL, (X##F).s##IDX_COL); 3432*c217d954SCole Faust 3433*c217d954SCole Faust 3434*c217d954SCole Faust 3435*c217d954SCole Faust#define COLUMN_VECTOR_SCALAR1(IDX_COL, BASENAME, X, TYPE) \ 3436*c217d954SCole Faust TYPE BASENAME##IDX_COL = (TYPE)((X##0)); 3437*c217d954SCole Faust#define COLUMN_VECTOR_SCALAR2(IDX_COL, BASENAME, X, TYPE) \ 3438*c217d954SCole Faust VEC_DATA_TYPE(TYPE, 2) \ 3439*c217d954SCole Faust BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 2))((X##0), (X##1)); 3440*c217d954SCole Faust#define COLUMN_VECTOR_SCALAR3(IDX_COL, BASENAME, X, TYPE) \ 3441*c217d954SCole Faust VEC_DATA_TYPE(TYPE, 3) \ 3442*c217d954SCole Faust BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 3))((X##0), (X##1), (X##2)); 3443*c217d954SCole Faust#define COLUMN_VECTOR_SCALAR4(IDX_COL, BASENAME, X, TYPE) \ 3444*c217d954SCole Faust 
VEC_DATA_TYPE(TYPE, 4) \ 3445*c217d954SCole Faust BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 4))((X##0), (X##1), (X##2), (X##3)); 3446*c217d954SCole Faust#define COLUMN_VECTOR_SCALAR8(IDX_COL, BASENAME, X, TYPE) \ 3447*c217d954SCole Faust VEC_DATA_TYPE(TYPE, 8) \ 3448*c217d954SCole Faust BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 8))((X##0), (X##1), (X##2), (X##3), (X##4), (X##5), (X##6), (X##7)); 3449*c217d954SCole Faust#define COLUMN_VECTOR_SCALAR16(IDX_COL, BASENAME, X, TYPE) \ 3450*c217d954SCole Faust VEC_DATA_TYPE(TYPE, 16) \ 3451*c217d954SCole Faust BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 16))((X##0), (X##1), (X##2), (X##3), (X##4), (X##5), (X##6), (X##7), (X##8), (X##9), (X##A), (X##B), (X##C), (X##D), (X##E), (X##F)); 3452*c217d954SCole Faust 3453*c217d954SCole Faust 3454*c217d954SCole Faust 3455*c217d954SCole Faust#define TRANSPOSE_K0X1(K0, BASENAME, BS, TYPE) \ 3456*c217d954SCole Faust COLUMN_VECTOR_SCALAR(K0, 0, BASENAME, BS, TYPE); 3457*c217d954SCole Faust#define TRANSPOSE_K0X2(K0, BASENAME, BS, TYPE) \ 3458*c217d954SCole Faust COLUMN_VECTOR(K0, 0, BASENAME, BS, TYPE); \ 3459*c217d954SCole Faust COLUMN_VECTOR(K0, 1, BASENAME, BS, TYPE); 3460*c217d954SCole Faust#define TRANSPOSE_K0X3(K0, BASENAME, BS, TYPE) \ 3461*c217d954SCole Faust TRANSPOSE_K0X2(K0, BASENAME, BS, TYPE); \ 3462*c217d954SCole Faust COLUMN_VECTOR(K0, 2, BASENAME, BS, TYPE); 3463*c217d954SCole Faust#define TRANSPOSE_K0X4(K0, BASENAME, BS, TYPE) \ 3464*c217d954SCole Faust TRANSPOSE_K0X3(K0, BASENAME, BS, TYPE); \ 3465*c217d954SCole Faust COLUMN_VECTOR(K0, 3, BASENAME, BS, TYPE); 3466*c217d954SCole Faust#define TRANSPOSE_K0X8(K0, BASENAME, BS, TYPE) \ 3467*c217d954SCole Faust TRANSPOSE_K0X4(K0, BASENAME, BS, TYPE); \ 3468*c217d954SCole Faust COLUMN_VECTOR(K0, 4, BASENAME, BS, TYPE); \ 3469*c217d954SCole Faust COLUMN_VECTOR(K0, 5, BASENAME, BS, TYPE); \ 3470*c217d954SCole Faust COLUMN_VECTOR(K0, 6, BASENAME, BS, TYPE); \ 3471*c217d954SCole Faust COLUMN_VECTOR(K0, 7, BASENAME, BS, TYPE); 
/* Transpose helper for sixteen columns.
 *
 * Declares the column vectors BASENAME##0 .. BASENAME##F: columns 0-7 are
 * produced by TRANSPOSE_K0X8 and columns 8-F are appended here. Column
 * indices above 9 use the hexadecimal suffixes A-F.
 *
 * K0       - number of source variables gathered into each column vector
 * BASENAME - prefix of the column vectors that get declared
 * BS       - prefix of the source variables (BS##0, BS##1, ...)
 * TYPE     - element type of the declared column vectors
 *
 * NOTE: the expansion is a sequence of declarations, each already terminated
 * by ';', so a call site that appends its own ';' only adds an empty statement.
 */
#define TRANSPOSE_K0X16(K0, BASENAME, BS, TYPE) \
    TRANSPOSE_K0X8(K0, BASENAME, BS, TYPE);     \
    COLUMN_VECTOR(K0, 8, BASENAME, BS, TYPE);   \
    COLUMN_VECTOR(K0, 9, BASENAME, BS, TYPE);   \
    COLUMN_VECTOR(K0, A, BASENAME, BS, TYPE);   \
    COLUMN_VECTOR(K0, B, BASENAME, BS, TYPE);   \
    COLUMN_VECTOR(K0, C, BASENAME, BS, TYPE);   \
    COLUMN_VECTOR(K0, D, BASENAME, BS, TYPE);   \
    COLUMN_VECTOR(K0, E, BASENAME, BS, TYPE);   \
    COLUMN_VECTOR(K0, F, BASENAME, BS, TYPE);

/* Size dispatcher for the COLUMN_VECTOR<K0> family.
 *
 * Pastes K0 onto COLUMN_VECTOR and invokes the resulting macro with
 * (IDX_COL, BASENAME, BS, TYPE); the selected macro declares BASENAME##IDX_COL
 * from component .s<IDX_COL> of the K0 source variables BS##0, BS##1, ...
 * Variants exist in this file for K0 = 1, 2, 3, 4, 8 and 16.
 *
 * NOTE(review): CONCAT is defined outside this section; it is assumed to
 * expand its arguments before token-pasting them, so K0 may itself be a macro.
 */
#define COLUMN_VECTOR(K0, IDX_COL, BASENAME, BS, TYPE) \
    CONCAT(COLUMN_VECTOR, K0)                          \
    (IDX_COL, BASENAME, BS, TYPE);

/* Size dispatcher for the COLUMN_VECTOR_SCALAR<K0> family.
 *
 * Same dispatch as COLUMN_VECTOR, but the selected macro builds
 * BASENAME##IDX_COL from the source variables BS##0, BS##1, ... themselves
 * (no component selection). Variants exist for K0 = 1, 2, 3, 4, 8 and 16.
 */
#define COLUMN_VECTOR_SCALAR(K0, IDX_COL, BASENAME, BS, TYPE) \
    CONCAT(COLUMN_VECTOR_SCALAR, K0)                          \
    (IDX_COL, BASENAME, BS, TYPE);

/* Entry point of the transpose helpers.
 *
 * Selects TRANSPOSE_K0X<N0> and invokes it with (K0, BASENAME, BS, TYPE).
 * Variants exist in this file for N0 = 1, 2, 3, 4, 8 and 16.
 */
#define TRANSPOSE_K0XN0(K0, N0, BASENAME, BS, TYPE) \
    CONCAT(TRANSPOSE_K0X, N0)                       \
    (K0, BASENAME, BS, TYPE);

/* ADD_ROW_n: row-wise accumulate, BASENAME##i += BIAS##i for i = 0 .. n-1.
 *
 * Each ADD_ROW_n expands ADD_ROW_(n-1) and then appends the statement for row
 * n-1. The expansion is a run of complete statements that is not wrapped in a
 * block, so it must not be used as the sole body of an unbraced if/else/for.
 */
#define ADD_ROW_1(BASENAME, BIAS) \
    BASENAME##0 += BIAS##0;

#define ADD_ROW_2(BASENAME, BIAS) \
    ADD_ROW_1(BASENAME, BIAS)     \
    BASENAME##1 += BIAS##1;

#define ADD_ROW_3(BASENAME, BIAS) \
    ADD_ROW_2(BASENAME, BIAS)     \
    BASENAME##2 += BIAS##2;
/* ADD_ROW_4 .. ADD_ROW_16: continuation of the ADD_ROW_n chain.
 *
 * ADD_ROW_n performs BASENAME##i += BIAS##i for i = 0 .. n-1 by expanding
 * ADD_ROW_(n-1) and appending the statement for row n-1. Rows 10-15 use the
 * hexadecimal suffixes A-F. The expansion is a run of complete statements
 * that is not wrapped in a block, so it must not be used as the sole body of
 * an unbraced if/else/for.
 */
#define ADD_ROW_4(BASENAME, BIAS) \
    ADD_ROW_3(BASENAME, BIAS)     \
    BASENAME##3 += BIAS##3;

#define ADD_ROW_5(BASENAME, BIAS) \
    ADD_ROW_4(BASENAME, BIAS)     \
    BASENAME##4 += BIAS##4;

#define ADD_ROW_6(BASENAME, BIAS) \
    ADD_ROW_5(BASENAME, BIAS)     \
    BASENAME##5 += BIAS##5;

#define ADD_ROW_7(BASENAME, BIAS) \
    ADD_ROW_6(BASENAME, BIAS)     \
    BASENAME##6 += BIAS##6;

#define ADD_ROW_8(BASENAME, BIAS) \
    ADD_ROW_7(BASENAME, BIAS)     \
    BASENAME##7 += BIAS##7;

#define ADD_ROW_9(BASENAME, BIAS) \
    ADD_ROW_8(BASENAME, BIAS)     \
    BASENAME##8 += BIAS##8;

#define ADD_ROW_10(BASENAME, BIAS) \
    ADD_ROW_9(BASENAME, BIAS)      \
    BASENAME##9 += BIAS##9;

#define ADD_ROW_11(BASENAME, BIAS) \
    ADD_ROW_10(BASENAME, BIAS)     \
    BASENAME##A += BIAS##A;

#define ADD_ROW_12(BASENAME, BIAS) \
    ADD_ROW_11(BASENAME, BIAS)     \
    BASENAME##B += BIAS##B;

#define ADD_ROW_13(BASENAME, BIAS) \
    ADD_ROW_12(BASENAME, BIAS)     \
    BASENAME##C += BIAS##C;

#define ADD_ROW_14(BASENAME, BIAS) \
    ADD_ROW_13(BASENAME, BIAS)     \
    BASENAME##D += BIAS##D;

#define ADD_ROW_15(BASENAME, BIAS) \
    ADD_ROW_14(BASENAME, BIAS)     \
    BASENAME##E += BIAS##E;

#define ADD_ROW_16(BASENAME, BIAS) \
    ADD_ROW_15(BASENAME, BIAS)     \
    BASENAME##F += BIAS##F;

/* ADD_BLOCK(N, BASENAME, BIAS): block-level entry point, selects ADD_ROW_<N>.
 *
 * The extra ADD_BLOCK_STR level exists so that N is macro-expanded before it
 * is pasted onto ADD_ROW_; call ADD_BLOCK, not ADD_BLOCK_STR, when N is
 * itself a macro.
 */
#define ADD_BLOCK_STR(N, BASENAME, BIAS) ADD_ROW_##N(BASENAME, BIAS)
#define ADD_BLOCK(N, BASENAME, BIAS) ADD_BLOCK_STR(N, BASENAME, BIAS)

/* ADD_ROW_BROADCAST_n: add the same BIAS expression to every row,
 * BASENAME##i += BIAS for i = 0 .. n-1.
 *
 * BIAS is substituted textually into each row's statement, so it is evaluated
 * once per row; pass a side-effect-free expression. The chain continues past
 * n = 6 further down the file.
 */
#define ADD_ROW_BROADCAST_1(BASENAME, BIAS) \
    BASENAME##0 += BIAS;

#define ADD_ROW_BROADCAST_2(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_1(BASENAME, BIAS)     \
    BASENAME##1 += BIAS;

#define ADD_ROW_BROADCAST_3(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_2(BASENAME, BIAS)     \
    BASENAME##2 += BIAS;

#define ADD_ROW_BROADCAST_4(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_3(BASENAME, BIAS)     \
    BASENAME##3 += BIAS;

#define ADD_ROW_BROADCAST_5(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_4(BASENAME, BIAS)     \
    BASENAME##4 += BIAS;

#define ADD_ROW_BROADCAST_6(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_5(BASENAME, BIAS)     \
    BASENAME##5 += BIAS;
Faust#define ADD_ROW_BROADCAST_7(BASENAME, BIAS) \ 3596*c217d954SCole Faust ADD_ROW_BROADCAST_6(BASENAME, BIAS) \ 3597*c217d954SCole Faust BASENAME##6 += BIAS; 3598*c217d954SCole Faust 3599*c217d954SCole Faust#define ADD_ROW_BROADCAST_8(BASENAME, BIAS) \ 3600*c217d954SCole Faust ADD_ROW_BROADCAST_7(BASENAME, BIAS) \ 3601*c217d954SCole Faust BASENAME##7 += BIAS; 3602*c217d954SCole Faust 3603*c217d954SCole Faust#define ADD_ROW_BROADCAST_9(BASENAME, BIAS) \ 3604*c217d954SCole Faust ADD_ROW_BROADCAST_8(BASENAME, BIAS) \ 3605*c217d954SCole Faust BASENAME##8 += BIAS; 3606*c217d954SCole Faust 3607*c217d954SCole Faust#define ADD_ROW_BROADCAST_10(BASENAME, BIAS) \ 3608*c217d954SCole Faust ADD_ROW_BROADCAST_9(BASENAME, BIAS) \ 3609*c217d954SCole Faust BASENAME##9 += BIAS; 3610*c217d954SCole Faust 3611*c217d954SCole Faust#define ADD_ROW_BROADCAST_11(BASENAME, BIAS) \ 3612*c217d954SCole Faust ADD_ROW_BROADCAST_10(BASENAME, BIAS) \ 3613*c217d954SCole Faust BASENAME##A += BIAS; 3614*c217d954SCole Faust 3615*c217d954SCole Faust#define ADD_ROW_BROADCAST_12(BASENAME, BIAS) \ 3616*c217d954SCole Faust ADD_ROW_BROADCAST_11(BASENAME, BIAS) \ 3617*c217d954SCole Faust BASENAME##B += BIAS; 3618*c217d954SCole Faust 3619*c217d954SCole Faust#define ADD_ROW_BROADCAST_13(BASENAME, BIAS) \ 3620*c217d954SCole Faust ADD_ROW_BROADCAST_12(BASENAME, BIAS) \ 3621*c217d954SCole Faust BASENAME##C += BIAS; 3622*c217d954SCole Faust 3623*c217d954SCole Faust#define ADD_ROW_BROADCAST_14(BASENAME, BIAS) \ 3624*c217d954SCole Faust ADD_ROW_BROADCAST_13(BASENAME, BIAS) \ 3625*c217d954SCole Faust BASENAME##D += BIAS; 3626*c217d954SCole Faust 3627*c217d954SCole Faust#define ADD_ROW_BROADCAST_15(BASENAME, BIAS) \ 3628*c217d954SCole Faust ADD_ROW_BROADCAST_14(BASENAME, BIAS) \ 3629*c217d954SCole Faust BASENAME##E += BIAS; 3630*c217d954SCole Faust 3631*c217d954SCole Faust#define ADD_ROW_BROADCAST_16(BASENAME, BIAS) \ 3632*c217d954SCole Faust ADD_ROW_BROADCAST_15(BASENAME, BIAS) \ 3633*c217d954SCole Faust BASENAME##F 
+= BIAS;

/** Add a bias to every row of a block by dispatching on the row count.
 *
 * ADD_BLOCK_BROADCAST is the entry point. It forwards to ADD_BLOCK_BROADCAST_STR so
 * that N is macro-expanded before it is pasted onto ADD_ROW_BROADCAST_ (the
 * ADD_ROW_BROADCAST_n family is defined before this point in the file).
 *
 * @param[in] N        Number of rows; selects ADD_ROW_BROADCAST_<N>
 * @param[in] BASENAME Common prefix of the row variables, forwarded unchanged
 * @param[in] BIAS     Bias operand, forwarded unchanged
 */
#define ADD_BLOCK_BROADCAST_STR(N, BASENAME, BIAS) ADD_ROW_BROADCAST_##N(BASENAME, BIAS)
#define ADD_BLOCK_BROADCAST(N, BASENAME, BIAS) ADD_BLOCK_BROADCAST_STR(N, BASENAME, BIAS)

/** Apply ACTIVATION in place to rows 0 .. n-1 of a block.
 *
 * ACTIVATION_ROW_n expands ACTIVATION_ROW_(n-1) and then handles row n-1, so every row
 * variable BASENAME##<row> is overwritten with
 * ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##<row>, A_VAL, B_VAL).
 * Row suffixes are the single hexadecimal digits 0-9 and A-F. ACTIVATION itself is not
 * defined in this section.
 *
 * @param[in]      ACTIVATION_TYPE Forwarded to ACTIVATION
 * @param[in]      DATA_TYPE       Forwarded to ACTIVATION
 * @param[in]      VEC_SIZE        Forwarded to ACTIVATION
 * @param[in, out] BASENAME        Common prefix of the row variables that are read and overwritten
 * @param[in]      A_VAL           Forwarded to ACTIVATION
 * @param[in]      B_VAL           Forwarded to ACTIVATION
 */
#define ACTIVATION_ROW_1(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    BASENAME##0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##0, A_VAL, B_VAL);

#define ACTIVATION_ROW_2(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_1(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    BASENAME##1 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##1, A_VAL, B_VAL);

#define ACTIVATION_ROW_3(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_2(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    BASENAME##2 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##2, A_VAL, B_VAL);

#define ACTIVATION_ROW_4(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_3(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    BASENAME##3 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##3, A_VAL, B_VAL);

#define ACTIVATION_ROW_5(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_4(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    BASENAME##4 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##4, A_VAL, B_VAL);

#define ACTIVATION_ROW_6(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_5(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    BASENAME##5 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##5, A_VAL, B_VAL);

#define ACTIVATION_ROW_7(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_6(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    BASENAME##6 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##6, A_VAL, B_VAL);

#define ACTIVATION_ROW_8(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_7(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    BASENAME##7 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##7, A_VAL, B_VAL);

#define ACTIVATION_ROW_9(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_8(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    BASENAME##8 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##8, A_VAL, B_VAL);

#define ACTIVATION_ROW_10(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_9(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    BASENAME##9 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##9, A_VAL, B_VAL);

#define ACTIVATION_ROW_11(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_10(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    BASENAME##A = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##A, A_VAL, B_VAL);

#define ACTIVATION_ROW_12(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_11(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    BASENAME##B = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##B, A_VAL, B_VAL);

#define ACTIVATION_ROW_13(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_12(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    BASENAME##C = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##C, A_VAL, B_VAL);

#define ACTIVATION_ROW_14(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_13(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    BASENAME##D = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##D, A_VAL, B_VAL);

#define ACTIVATION_ROW_15(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_14(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    BASENAME##E = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##E, A_VAL, B_VAL);

#define ACTIVATION_ROW_16(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_15(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    BASENAME##F = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##F, A_VAL, B_VAL);

/** Apply ACTIVATION to a block of N rows.
 *
 * ACTIVATION_BLOCK is the entry point; the _STR level lets N be macro-expanded before it
 * is pasted onto ACTIVATION_ROW_. N must therefore expand to one of 1 .. 16.
 * The remaining parameters are forwarded unchanged to ACTIVATION_ROW_<N>.
 */
#define ACTIVATION_BLOCK_STR(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) ACTIVATION_ROW_##N(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)
#define ACTIVATION_BLOCK(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) ACTIVATION_BLOCK_STR(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)

/** Declare converted copies of rows 0 .. m-1 of a block.
 *
 * CONVERT_ROW_m expands CONVERT_ROW_(m-1) and then, for row m-1, declares a new variable
 * BASENAME_DST##<row> of type VEC_DATA_TYPE(DATA_TYPE, N) initialised with
 * CONVERT(BASENAME_SRC##<row>, VEC_DATA_TYPE(DATA_TYPE, N)). Because each expansion is a
 * declaration, the destination names must not already exist in the enclosing scope.
 * CONVERT and VEC_DATA_TYPE are not defined in this section.
 *
 * @param[in] N            Vector size passed to VEC_DATA_TYPE
 * @param[in] DATA_TYPE    Element type passed to VEC_DATA_TYPE
 * @param[in] BASENAME_SRC Common prefix of the source row variables
 * @param[in] BASENAME_DST Common prefix of the row variables that get declared
 */
#define CONVERT_ROW_1(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    VEC_DATA_TYPE(DATA_TYPE, N) \
    BASENAME_DST##0 = CONVERT(BASENAME_SRC##0, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_2(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_1(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    VEC_DATA_TYPE(DATA_TYPE, N) \
    BASENAME_DST##1 = CONVERT(BASENAME_SRC##1, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_3(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_2(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    VEC_DATA_TYPE(DATA_TYPE, N) \
    BASENAME_DST##2 = CONVERT(BASENAME_SRC##2, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_4(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_3(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    VEC_DATA_TYPE(DATA_TYPE, N) \
    BASENAME_DST##3 = CONVERT(BASENAME_SRC##3, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_5(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_4(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    VEC_DATA_TYPE(DATA_TYPE, N) \
    BASENAME_DST##4 = CONVERT(BASENAME_SRC##4, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_6(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_5(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    VEC_DATA_TYPE(DATA_TYPE, N) \
    BASENAME_DST##5 = CONVERT(BASENAME_SRC##5, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_7(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_6(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    VEC_DATA_TYPE(DATA_TYPE, N) \
    BASENAME_DST##6 = CONVERT(BASENAME_SRC##6, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_8(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_7(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    VEC_DATA_TYPE(DATA_TYPE, N) \
    BASENAME_DST##7 = CONVERT(BASENAME_SRC##7, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_9(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_8(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    VEC_DATA_TYPE(DATA_TYPE, N) \
    BASENAME_DST##8 = CONVERT(BASENAME_SRC##8, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_10(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_9(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    VEC_DATA_TYPE(DATA_TYPE, N) \
    BASENAME_DST##9 = CONVERT(BASENAME_SRC##9, VEC_DATA_TYPE(DATA_TYPE, N));

3760*c217d954SCole Faust#define CONVERT_ROW_11(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 3761*c217d954SCole Faust CONVERT_ROW_10(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 3762*c217d954SCole Faust VEC_DATA_TYPE(DATA_TYPE, N) \ 3763*c217d954SCole Faust BASENAME_DST##A = CONVERT(BASENAME_SRC##A, VEC_DATA_TYPE(DATA_TYPE, N)); 3764*c217d954SCole Faust 3765*c217d954SCole Faust#define CONVERT_ROW_12(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 3766*c217d954SCole Faust CONVERT_ROW_11(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 3767*c217d954SCole Faust VEC_DATA_TYPE(DATA_TYPE, N) \ 3768*c217d954SCole Faust BASENAME_DST##B = CONVERT(BASENAME_SRC##B, VEC_DATA_TYPE(DATA_TYPE, N)); 3769*c217d954SCole Faust 3770*c217d954SCole Faust#define CONVERT_ROW_13(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 3771*c217d954SCole Faust CONVERT_ROW_12(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 3772*c217d954SCole Faust VEC_DATA_TYPE(DATA_TYPE, N) \ 3773*c217d954SCole Faust BASENAME_DST##C = CONVERT(BASENAME_SRC##C, VEC_DATA_TYPE(DATA_TYPE, N)); 3774*c217d954SCole Faust 3775*c217d954SCole Faust#define CONVERT_ROW_14(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 3776*c217d954SCole Faust CONVERT_ROW_13(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 3777*c217d954SCole Faust VEC_DATA_TYPE(DATA_TYPE, N) \ 3778*c217d954SCole Faust BASENAME_DST##D = CONVERT(BASENAME_SRC##D, VEC_DATA_TYPE(DATA_TYPE, N)); 3779*c217d954SCole Faust 3780*c217d954SCole Faust#define CONVERT_ROW_15(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 3781*c217d954SCole Faust CONVERT_ROW_14(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 3782*c217d954SCole Faust VEC_DATA_TYPE(DATA_TYPE, N) \ 3783*c217d954SCole Faust BASENAME_DST##E = CONVERT(BASENAME_SRC##E, VEC_DATA_TYPE(DATA_TYPE, N)); 3784*c217d954SCole Faust 3785*c217d954SCole Faust#define CONVERT_ROW_16(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 3786*c217d954SCole Faust CONVERT_ROW_15(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 3787*c217d954SCole Faust VEC_DATA_TYPE(DATA_TYPE, N) \ 
3788*c217d954SCole Faust BASENAME_DST##F = CONVERT(BASENAME_SRC##F, VEC_DATA_TYPE(DATA_TYPE, N)); 3789*c217d954SCole Faust 3790*c217d954SCole Faust 3791*c217d954SCole Faust 3792*c217d954SCole Faust#define CONVERT_BLOCK_STR(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) CONVERT_ROW_##M(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) 3793*c217d954SCole Faust#define CONVERT_BLOCK(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) CONVERT_BLOCK_STR(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) 3794*c217d954SCole Faust 3795*c217d954SCole Faust 3796*c217d954SCole Faust#ifndef ARM_COMPUTE_REPEAT_H 3797*c217d954SCole Faust#define ARM_COMPUTE_REPEAT_H 3798*c217d954SCole Faust 3799*c217d954SCole Faust 3800*c217d954SCole Faust#ifndef ARM_COMPUTE_HELPER_H 3801*c217d954SCole Faust#define ARM_COMPUTE_HELPER_H 3802*c217d954SCole Faust 3803*c217d954SCole Faust 3804*c217d954SCole Faust 3805*c217d954SCole Faust 3806*c217d954SCole Faust#define STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3807*c217d954SCole Faust VSTORE(N0) \ 3808*c217d954SCole Faust (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0)); 3809*c217d954SCole Faust 3810*c217d954SCole Faust#define STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3811*c217d954SCole Faust STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3812*c217d954SCole Faust VSTORE(N0) \ 3813*c217d954SCole Faust (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1)); 3814*c217d954SCole Faust 3815*c217d954SCole Faust#define STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3816*c217d954SCole Faust STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3817*c217d954SCole Faust VSTORE(N0) \ 3818*c217d954SCole Faust (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2)); 3819*c217d954SCole Faust 3820*c217d954SCole Faust#define STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3821*c217d954SCole Faust STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3822*c217d954SCole Faust 
VSTORE(N0) \ 3823*c217d954SCole Faust (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3)); 3824*c217d954SCole Faust 3825*c217d954SCole Faust#define STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3826*c217d954SCole Faust STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3827*c217d954SCole Faust VSTORE(N0) \ 3828*c217d954SCole Faust (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4)); 3829*c217d954SCole Faust 3830*c217d954SCole Faust#define STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3831*c217d954SCole Faust STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3832*c217d954SCole Faust VSTORE(N0) \ 3833*c217d954SCole Faust (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5)); 3834*c217d954SCole Faust 3835*c217d954SCole Faust#define STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3836*c217d954SCole Faust STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3837*c217d954SCole Faust VSTORE(N0) \ 3838*c217d954SCole Faust (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6)); 3839*c217d954SCole Faust 3840*c217d954SCole Faust#define STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3841*c217d954SCole Faust STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3842*c217d954SCole Faust VSTORE(N0) \ 3843*c217d954SCole Faust (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7)); 3844*c217d954SCole Faust 3845*c217d954SCole Faust#define STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3846*c217d954SCole Faust STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3847*c217d954SCole Faust VSTORE(N0) \ 3848*c217d954SCole Faust (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8)); 3849*c217d954SCole Faust 3850*c217d954SCole Faust#define STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3851*c217d954SCole Faust STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3852*c217d954SCole Faust VSTORE(N0) \ 
3853*c217d954SCole Faust (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9)); 3854*c217d954SCole Faust 3855*c217d954SCole Faust#define STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3856*c217d954SCole Faust STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3857*c217d954SCole Faust VSTORE(N0) \ 3858*c217d954SCole Faust (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A)); 3859*c217d954SCole Faust 3860*c217d954SCole Faust#define STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3861*c217d954SCole Faust STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3862*c217d954SCole Faust VSTORE(N0) \ 3863*c217d954SCole Faust (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B)); 3864*c217d954SCole Faust 3865*c217d954SCole Faust#define STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3866*c217d954SCole Faust STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3867*c217d954SCole Faust VSTORE(N0) \ 3868*c217d954SCole Faust (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C)); 3869*c217d954SCole Faust 3870*c217d954SCole Faust#define STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3871*c217d954SCole Faust STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3872*c217d954SCole Faust VSTORE(N0) \ 3873*c217d954SCole Faust (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D)); 3874*c217d954SCole Faust 3875*c217d954SCole Faust#define STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3876*c217d954SCole Faust STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3877*c217d954SCole Faust VSTORE(N0) \ 3878*c217d954SCole Faust (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E)); 3879*c217d954SCole Faust 3880*c217d954SCole Faust#define STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3881*c217d954SCole Faust STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3882*c217d954SCole Faust VSTORE(N0) \ 
3883*c217d954SCole Faust (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F)); 3884*c217d954SCole Faust 3885*c217d954SCole Faust 3886*c217d954SCole Faust 3887*c217d954SCole Faust#define CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3888*c217d954SCole Faust VSTORE(N0) \ 3889*c217d954SCole Faust (CONVERT_SAT((BASENAME##0), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0)); 3890*c217d954SCole Faust 3891*c217d954SCole Faust#define CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3892*c217d954SCole Faust CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3893*c217d954SCole Faust VSTORE(N0) \ 3894*c217d954SCole Faust (CONVERT_SAT((BASENAME##1), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1)); 3895*c217d954SCole Faust 3896*c217d954SCole Faust#define CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3897*c217d954SCole Faust CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3898*c217d954SCole Faust VSTORE(N0) \ 3899*c217d954SCole Faust (CONVERT_SAT((BASENAME##2), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2)); 3900*c217d954SCole Faust 3901*c217d954SCole Faust#define CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3902*c217d954SCole Faust CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3903*c217d954SCole Faust VSTORE(N0) \ 3904*c217d954SCole Faust (CONVERT_SAT((BASENAME##3), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3)); 3905*c217d954SCole Faust 3906*c217d954SCole Faust#define CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3907*c217d954SCole Faust CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3908*c217d954SCole Faust VSTORE(N0) \ 3909*c217d954SCole Faust (CONVERT_SAT((BASENAME##4), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4)); 
/** Convert and store rows 0 .. m-1 of a block (m = 6 .. 16).
 *
 * CONVERT_STORE_ROW_m expands CONVERT_STORE_ROW_(m-1) (rows 1 .. 5 are defined before
 * this point) and then stores row m-1 with
 * VSTORE(N0)(CONVERT_SAT((BASENAME##<row>), VEC_DATA_TYPE(DATA_TYPE, N0)), 0,
 *            (__global DATA_TYPE *)(PTR + row * STRIDE_Y + Z##<row>)).
 * Row suffixes are the hexadecimal digits 0-9 and A-F.
 *
 * @param[in] N0        Vector size handed to VSTORE and VEC_DATA_TYPE
 * @param[in] DATA_TYPE Element type to convert to and store as
 * @param[in] BASENAME  Common prefix of the row variables to store
 * @param[in] PTR       Base address
 * @param[in] STRIDE_Y  Offset added per row
 * @param[in] Z         Common prefix of the per-row extra offsets (Z##<row>)
 */
#define CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##5), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##6), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##7), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##8), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

/* The second parameter must be spelled DATA_TYPE: the body refers to DATA_TYPE, and with
 * any other parameter name the caller's type would be dropped for rows 0 .. 9 in favour
 * of whatever DATA_TYPE happens to mean at the point of use. */
#define CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));

#define CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##A), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##B), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##C), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##D), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##E), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define CONVERT_STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##F), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));

/** Store a block of M0 rows.
 *
 * STORE_BLOCK is the entry point; the _STR level lets M0 be macro-expanded before it is
 * pasted onto STORE_ROW_. M0 must therefore expand to one of 1 .. 16.
 * The remaining parameters are forwarded unchanged to STORE_ROW_<M0>.
 */
#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)

/** Convert and store a block of M0 rows.
 *
 * Same dispatch scheme as STORE_BLOCK, targeting CONVERT_STORE_ROW_<M0>.
 */
#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)

/** Partially store rows 0 .. m-1 of a block.
 *
 * Same row chaining and addressing as STORE_ROW_m, but every row is written with
 * VSTORE_PARTIAL(N0, STORE_N0) instead of VSTORE(N0). VSTORE_PARTIAL is not defined in
 * this section.
 *
 * @param[in] N0        First argument of VSTORE_PARTIAL (size of the row vectors)
 * @param[in] STORE_N0  Second argument of VSTORE_PARTIAL (how much of each row to store)
 * @param[in] DATA_TYPE Element type the destination address is cast to
 * @param[in] BASENAME  Common prefix of the row variables to store
 * @param[in] PTR       Base address
 * @param[in] STRIDE_Y  Offset added per row
 * @param[in] Z         Common prefix of the per-row extra offsets (Z##<row>)
 */
#define STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

#define STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));

#define STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define STORE_ROW_PARTIAL_16(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));

/** Partially store a block: STORE_M0 rows, STORE_N0 elements of each N0-wide row.
 *
 * STORE_BLOCK_PARTIAL is the entry point; the _STR level lets STORE_M0 be macro-expanded
 * before it is pasted onto STORE_ROW_PARTIAL_. Note the argument order changes on the
 * way down: (STORE_M0, STORE_N0, N0, ...) here becomes (N0, STORE_N0, ...) for the row
 * macros.
 */
#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)

/** Store a block that may be partial in both directions, chosen at run time.
 *
 * Expands to an if / else-if chain (not wrapped in do-while, so it must be used as a
 * statement of its own):
 *  - neither condition set: M0 rows, N0 elements per row
 *  - only PARTIAL_COND_Y:   PARTIAL_STORE_M0 rows, N0 elements per row
 *  - only PARTIAL_COND_X:   M0 rows, PARTIAL_STORE_N0 elements per row
 *  - both set:              PARTIAL_STORE_M0 rows, PARTIAL_STORE_N0 elements per row
 *
 * @param[in] M0, N0           Full block height and row width
 * @param[in] DATA_TYPE        Element type the destination address is cast to
 * @param[in] BASENAME         Common prefix of the row variables to store
 * @param[in] PTR              Base address
 * @param[in] STRIDE_Y         Offset added per row
 * @param[in] Z                Common prefix of the per-row extra offsets
 * @param[in] PARTIAL_STORE_M0 Row count used when PARTIAL_COND_Y holds
 * @param[in] PARTIAL_STORE_N0 Element count used when PARTIAL_COND_X holds
 * @param[in] PARTIAL_COND_Y   Run-time condition selecting the partial row count
 * @param[in] PARTIAL_COND_X   Run-time condition selecting the partial element count
 */
#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y)) \
    { \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    } \
    else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X)) \
    { \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    } \
    else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X)) \
    { \
        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    } \
    else \
    { \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    }

/** Store a block that may be partial in x only.
 *
 * Stores M0 rows; each row is written with N0 elements unless PARTIAL_COND_X holds, in
 * which case PARTIAL_STORE_N0 elements are written. Expands to a bare if / else.
 */
#define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \
    if(!(PARTIAL_COND_X)) \
    { \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    } \
    else \
    { \
        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    }

/** Store a block that may be partial in y only.
 *
 * Stores full N0-wide rows; M0 rows are written unless PARTIAL_COND_Y holds, in which
 * case PARTIAL_STORE_M0 rows are written. Expands to a bare if / else.
 */
#define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \
    if(!(PARTIAL_COND_Y)) \
    { \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    } \
    else \
    { \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    }
/** Store a block, picking the cheapest boundary handling at compile time.
 *
 * Only defined when both PARTIAL_STORE_M0 and PARTIAL_STORE_N0 are defined as
 * preprocessor symbols. Their values select which store macro the call forwards to:
 *  - both 0:                     STORE_BLOCK (no run-time boundary check)
 *  - PARTIAL_STORE_M0 > 0 only:  STORE_BLOCK_PARTIAL_IN_Y
 *  - PARTIAL_STORE_N0 > 0 only:  STORE_BLOCK_PARTIAL_IN_X
 *  - anything else:              STORE_BLOCK_PARTIAL_IN_X_AND_Y
 *
 * The macro parameters named PARTIAL_STORE_M0 / PARTIAL_STORE_N0 shadow the symbols of
 * the same name inside the expansion; the branch taken is still decided by the values the
 * symbols had when this header was preprocessed. Arguments a given branch does not need
 * are silently ignored.
 *
 * @param[in] M0, N0           Full block height and row width
 * @param[in] DATA_TYPE        Element type the destination address is cast to
 * @param[in] BASENAME         Common prefix of the row variables to store
 * @param[in] PTR              Base address
 * @param[in] STRIDE_Y         Offset added per row
 * @param[in] Z                Common prefix of the per-row extra offsets
 * @param[in] PARTIAL_STORE_M0 Row count for a partial block in y
 * @param[in] PARTIAL_STORE_N0 Element count for a partial block in x
 * @param[in] PARTIAL_COND_Y   Run-time condition: block is partial in y
 * @param[in] PARTIAL_COND_X   Run-time condition: block is partial in x
 */
#if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)

#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0

#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)

#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0

#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y)

#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0

#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X)

#else

#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)

#endif

#endif

/** First row handled by the work-item with index y when blocks are M0 rows tall.
 *
 * With the PARTIAL_STORE_M0 symbol defined, the start row is y * M0 moved back by
 * (M0 - PARTIAL_STORE_M0) % M0 and clamped at 0; otherwise it is simply y * M0.
 * The result is cast to uint. The arguments are used unparenthesised, so pass simple
 * expressions.
 *
 * @param[in] y                Block index
 * @param[in] M0               Rows per block
 * @param[in] PARTIAL_STORE_M0 Row count of the partial block (unused in the fallback)
 */
#if defined(PARTIAL_STORE_M0)

#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
    ((uint)(max(0, (int)(y * M0) - (int)((M0 - PARTIAL_STORE_M0) % M0))))
#else
#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
    ((uint)(y * M0))
#endif

/** Store a single vector, fully or partially, chosen at run time.
 *
 * Forwards to STORE_BLOCK_PARTIAL_IN_X with one row, zero stride and Z given as 0, so
 * the stored variable is basename##0 and the pasted offset Z##0 is the literal 00.
 *
 * @param[in] basename  Prefix of the vector variable (the variable itself is basename0)
 * @param[in] data_type Element type the destination address is cast to
 * @param[in] ptr       Destination address
 * @param[in] vec_size  Full vector size
 * @param[in] leftover  Element count written when cond holds
 * @param[in] cond      Run-time condition selecting the partial store
 */
#define STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond) \
    STORE_BLOCK_PARTIAL_IN_X(1, vec_size, data_type, basename, ptr, 0, 0, leftover, cond)

/* Enable optional OpenCL extensions, each only when the matching ARM_COMPUTE_* build
 * symbol is set and the compiler advertises the extension. */
#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif

#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
#endif

#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
#endif

#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
#pragma OPENCL EXTENSION cl_arm_printf : enable
#endif

/* GPU architecture identifiers. */
#define GPU_ARCH_MIDGARD 0x100
#define GPU_ARCH_BIFROST 0x200
#define GPU_ARCH_VALHALL 0x300

/** Paste two tokens together. The arguments are NOT macro-expanded first. */
#define CONCAT(a, b) a##b

/** Expand to the (macro-expanded) argument. */
#define EXPAND(x) x

/** Clamp x to the range [min_val, max_val] using the min / max built-ins. */
#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)

/** Reverse the lane order of an n-lane vector (n = 1, 2, 3, 4, 8, 16) via a swizzle. */
#define REV1(x) ((x))
#define REV2(x) ((x).s10)
#define REV3(x) ((x).s210)
#define REV4(x) ((x).s3210)
#define REV8(x) ((x).s76543210)
#define REV16(x) ((x).sFEDCBA9876543210)

/** Reverse vector x of size s.
 *
 * REVERSE is the entry point; the _STR level lets s be macro-expanded before it is
 * pasted onto REV.
 */
#define REVERSE_STR(x, s) REV##s((x))
#define REVERSE(x, s) REVERSE_STR(x, s)

/** ROT<s>_<n>(x): rotate the s lanes of x right by n positions via a swizzle.
 *
 * Lane i of the result is lane (i - n) mod s of x; n = 0 and n = s leave x unchanged.
 * The remaining ROT8 and the ROT16 variants follow below.
 */
#define ROT1_0(x) ((x))
#define ROT1_1(x) ((x))

#define ROT2_0(x) ((x))
#define ROT2_1(x) ((x).s10)
#define ROT2_2(x) ((x))

#define ROT3_0(x) ((x))
#define ROT3_1(x) ((x).s201)
#define ROT3_2(x) ((x).s120)
#define ROT3_3(x) ((x))

#define ROT4_0(x) ((x))
#define ROT4_1(x) ((x).s3012)
#define ROT4_2(x) ((x).s2301)
#define ROT4_3(x) ((x).s1230)
#define ROT4_4(x) ((x))

#define ROT8_0(x) ((x))
#define ROT8_1(x) ((x).s70123456)
#define ROT8_2(x) ((x).s67012345)
#define ROT8_3(x) ((x).s56701234)
#define ROT8_4(x) ((x).s45670123)
/* ROT8_n(x), n = 5..8: 8-component rotations (result[i] = x[(i - n) mod 8]). */
#define ROT8_5(x) ((x).s34567012)
#define ROT8_6(x) ((x).s23456701)
#define ROT8_7(x) ((x).s12345670)
#define ROT8_8(x) ((x))

/* ROT16_n(x): 16-component rotations (result[i] = x[(i - n) mod 16]). */
#define ROT16_0(x)  ((x))
#define ROT16_1(x)  ((x).sF0123456789ABCDE)
#define ROT16_2(x)  ((x).sEF0123456789ABCD)
#define ROT16_3(x)  ((x).sDEF0123456789ABC)
#define ROT16_4(x)  ((x).sCDEF0123456789AB)
#define ROT16_5(x)  ((x).sBCDEF0123456789A)
#define ROT16_6(x)  ((x).sABCDEF0123456789)
#define ROT16_7(x)  ((x).s9ABCDEF012345678)
#define ROT16_8(x)  ((x).s89ABCDEF01234567)
#define ROT16_9(x)  ((x).s789ABCDEF0123456)
#define ROT16_10(x) ((x).s6789ABCDEF012345)
#define ROT16_11(x) ((x).s56789ABCDEF01234)
#define ROT16_12(x) ((x).s456789ABCDEF0123)
#define ROT16_13(x) ((x).s3456789ABCDEF012)
#define ROT16_14(x) ((x).s23456789ABCDEF01)
#define ROT16_15(x) ((x).s123456789ABCDEF0)
#define ROT16_16(x) ((x))

/*
 * ROTATE(x, s, n): rotate the s-component vector x by n positions.
 * The two-level definition lets s and n be macros that are expanded before
 * the ROT<s>_<n> name is pasted together.
 */
#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
#define ROTATE(x, s, n)     ROTATE_STR(x, s, n)

/* V_OFFSn(dt): n-component vector literal of type dt<n> holding 0, 1, ..., n-1. */
#define V_OFFS1(dt)  (dt##1)(0)
#define V_OFFS2(dt)  (dt##2)(0, 1)
#define V_OFFS3(dt)  (dt##3)(0, 1, 2)
#define V_OFFS4(dt)  (dt##4)(0, 1, 2, 3)
#define V_OFFS8(dt)  (dt##8)(0, 1, 2, 3, 4, 5, 6, 7)
#define V_OFFS16(dt) (dt##16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)

/* VEC_OFFS(dt, s): V_OFFS<s>(dt), with s macro-expanded first. */
#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
#define VEC_OFFS(dt, s)     VEC_OFFS_STR(dt, s)

/* VLOAD(size): name of the vload<size> function, with size macro-expanded first. */
#define VLOAD_STR(size) vload##size
#define VLOAD(size)     VLOAD_STR(size)

/*
 * VLOAD_PARTIAL(size, load_size): name of the macro that loads load_size
 * elements into a size-component vector. It resolves through the
 * vload_partial_<size>_<load_size> alias table; combinations with
 * load_size == 0 or load_size > size resolve to NO_LOAD.
 */
#define VLOAD_PARTIAL_STR(size, load_size) vload_partial_##size##_##load_size
#define VLOAD_PARTIAL(size, load_size)     VLOAD_PARTIAL_STR(size, load_size)

/* Placeholder for invalid size/load_size combinations: expands to an empty block. */
#define NO_LOAD(data, offs, ptr) \
    {                            \
    }

/*
 * Alias table, vector size 1.
 * NOTE(review): vload_partial_1_1 aliases vload1, which in this file is a
 * two-argument macro (OFFSET, PTR), whereas every other entry is used with
 * three arguments (DATA, OFFSET, PTR) -- confirm the intended call form.
 */
#define vload_partial_1_0  NO_LOAD
#define vload_partial_1_1  vload1
#define vload_partial_1_2  NO_LOAD
#define vload_partial_1_3  NO_LOAD
#define vload_partial_1_4  NO_LOAD
#define vload_partial_1_5  NO_LOAD
#define vload_partial_1_6  NO_LOAD
#define vload_partial_1_7  NO_LOAD
#define vload_partial_1_8  NO_LOAD
#define vload_partial_1_9  NO_LOAD
#define vload_partial_1_10 NO_LOAD
#define vload_partial_1_11 NO_LOAD
#define vload_partial_1_12 NO_LOAD
#define vload_partial_1_13 NO_LOAD
#define vload_partial_1_14 NO_LOAD
#define vload_partial_1_15 NO_LOAD
#define vload_partial_1_16 NO_LOAD

/* Alias table, vector size 2. */
#define vload_partial_2_0  NO_LOAD
#define vload_partial_2_1  vload_partial_1
#define vload_partial_2_2  vload_partial_2
#define vload_partial_2_3  NO_LOAD
#define vload_partial_2_4  NO_LOAD
#define vload_partial_2_5  NO_LOAD
#define vload_partial_2_6  NO_LOAD
#define vload_partial_2_7  NO_LOAD
#define vload_partial_2_8  NO_LOAD
#define vload_partial_2_9  NO_LOAD
#define vload_partial_2_10 NO_LOAD
#define vload_partial_2_11 NO_LOAD
#define vload_partial_2_12 NO_LOAD
#define vload_partial_2_13 NO_LOAD
#define vload_partial_2_14 NO_LOAD
#define vload_partial_2_15 NO_LOAD
#define vload_partial_2_16 NO_LOAD

/* Alias table, vector size 3. */
#define vload_partial_3_0  NO_LOAD
#define vload_partial_3_1  vload_partial_1
#define vload_partial_3_2  vload_partial_2
#define vload_partial_3_3  vload_partial_3
#define vload_partial_3_4  NO_LOAD
#define vload_partial_3_5  NO_LOAD
#define vload_partial_3_6  NO_LOAD
#define vload_partial_3_7  NO_LOAD
#define vload_partial_3_8  NO_LOAD
#define vload_partial_3_9  NO_LOAD
#define vload_partial_3_10 NO_LOAD
#define vload_partial_3_11 NO_LOAD
#define vload_partial_3_12 NO_LOAD
#define vload_partial_3_13 NO_LOAD
#define vload_partial_3_14 NO_LOAD
#define vload_partial_3_15 NO_LOAD
#define vload_partial_3_16 NO_LOAD

/*
 * vload_partial_<size>_<load_size> alias table, vector size 4.
 * Entries with 1 <= load_size <= 4 name the matching vload_partial_<load_size>
 * macro; every other load_size names NO_LOAD.
 */
#define vload_partial_4_0  NO_LOAD
#define vload_partial_4_1  vload_partial_1
#define vload_partial_4_2  vload_partial_2
#define vload_partial_4_3  vload_partial_3
#define vload_partial_4_4  vload_partial_4
#define vload_partial_4_5  NO_LOAD
#define vload_partial_4_6  NO_LOAD
#define vload_partial_4_7  NO_LOAD
#define vload_partial_4_8  NO_LOAD
#define vload_partial_4_9  NO_LOAD
#define vload_partial_4_10 NO_LOAD
#define vload_partial_4_11 NO_LOAD
#define vload_partial_4_12 NO_LOAD
#define vload_partial_4_13 NO_LOAD
#define vload_partial_4_14 NO_LOAD
#define vload_partial_4_15 NO_LOAD
#define vload_partial_4_16 NO_LOAD

/* Alias table, vector size 8: load sizes 0 through 14. */
#define vload_partial_8_0  NO_LOAD
#define vload_partial_8_1  vload_partial_1
#define vload_partial_8_2  vload_partial_2
#define vload_partial_8_3  vload_partial_3
#define vload_partial_8_4  vload_partial_4
#define vload_partial_8_5  vload_partial_5
#define vload_partial_8_6  vload_partial_6
#define vload_partial_8_7  vload_partial_7
#define vload_partial_8_8  vload_partial_8
#define vload_partial_8_9  NO_LOAD
#define vload_partial_8_10 NO_LOAD
#define vload_partial_8_11 NO_LOAD
#define vload_partial_8_12 NO_LOAD
#define vload_partial_8_13 NO_LOAD
#define vload_partial_8_14 NO_LOAD
/* Alias table, vector size 8: load sizes 15 and 16 are invalid. */
#define vload_partial_8_15 NO_LOAD
#define vload_partial_8_16 NO_LOAD

/* Alias table, vector size 16: every load size from 1 to 16 is valid. */
#define vload_partial_16_0  NO_LOAD
#define vload_partial_16_1  vload_partial_1
#define vload_partial_16_2  vload_partial_2
#define vload_partial_16_3  vload_partial_3
#define vload_partial_16_4  vload_partial_4
#define vload_partial_16_5  vload_partial_5
#define vload_partial_16_6  vload_partial_6
#define vload_partial_16_7  vload_partial_7
#define vload_partial_16_8  vload_partial_8
#define vload_partial_16_9  vload_partial_9
#define vload_partial_16_10 vload_partial_10
#define vload_partial_16_11 vload_partial_11
#define vload_partial_16_12 vload_partial_12
#define vload_partial_16_13 vload_partial_13
#define vload_partial_16_14 vload_partial_14
#define vload_partial_16_15 vload_partial_15
#define vload_partial_16_16 vload_partial_16

/**
 * vload_partial_N(DATA, OFFSET, PTR): load N elements from PTR into the
 * first N components of the vector DATA.
 *
 * Sizes 1, 2, 3, 4, 8 and 16 map to one vload each. The other sizes are
 * composed from a 4- or 8-element load followed by a smaller load at
 * PTR + 4 or PTR + 8.
 *
 * The composed sizes are wrapped in a braced block so that the whole load is
 * a single statement (an unbraced "if (c) vload_partial_5(...)" must guard
 * both parts), and PTR is parenthesized before the offset is added so
 * expression arguments keep their grouping. Each macro still terminates
 * itself, so call sites may keep or omit the trailing semicolon.
 *
 * NOTE(review): OFFSET is forwarded unchanged to each inner load, so the
 * parts only line up for OFFSET == 0 -- confirm callers always pass 0.
 */
#define vload_partial_1(DATA, OFFSET, PTR) \
    DATA.s0 = vload1(OFFSET, PTR);

#define vload_partial_2(DATA, OFFSET, PTR) \
    DATA.s01 = vload2(OFFSET, PTR);

#define vload_partial_3(DATA, OFFSET, PTR) \
    DATA.s012 = vload3(OFFSET, PTR);

#define vload_partial_4(DATA, OFFSET, PTR) \
    DATA.s0123 = vload4(OFFSET, PTR);

#define vload_partial_5(DATA, OFFSET, PTR)        \
    {                                             \
        vload_partial_4(DATA.s0123, OFFSET, PTR); \
        DATA.s4 = vload1(OFFSET, (PTR) + 4);      \
    }

#define vload_partial_6(DATA, OFFSET, PTR)              \
    {                                                   \
        vload_partial_4(DATA.s0123, OFFSET, PTR);       \
        vload_partial_2(DATA.s45, OFFSET, (PTR) + 4);   \
    }

#define vload_partial_7(DATA, OFFSET, PTR)              \
    {                                                   \
        vload_partial_4(DATA.s0123, OFFSET, PTR);       \
        vload_partial_3(DATA.s456, OFFSET, (PTR) + 4);  \
    }

#define vload_partial_8(DATA, OFFSET, PTR) \
    DATA.s01234567 = vload8(OFFSET, PTR);

#define vload_partial_9(DATA, OFFSET, PTR)            \
    {                                                 \
        vload_partial_8(DATA.s01234567, OFFSET, PTR); \
        DATA.s8 = vload1(OFFSET, (PTR) + 8);          \
    }

#define vload_partial_10(DATA, OFFSET, PTR)             \
    {                                                   \
        vload_partial_8(DATA.s01234567, OFFSET, PTR);   \
        vload_partial_2(DATA.s89, OFFSET, (PTR) + 8);   \
    }

#define vload_partial_11(DATA, OFFSET, PTR)             \
    {                                                   \
        vload_partial_8(DATA.s01234567, OFFSET, PTR);   \
        vload_partial_3(DATA.s89A, OFFSET, (PTR) + 8);  \
    }

#define vload_partial_12(DATA, OFFSET, PTR)             \
    {                                                   \
        vload_partial_8(DATA.s01234567, OFFSET, PTR);   \
        vload_partial_4(DATA.s89AB, OFFSET, (PTR) + 8); \
    }

#define vload_partial_13(DATA, OFFSET, PTR)                 \
    {                                                       \
        vload_partial_8(DATA.s01234567, OFFSET, PTR);       \
        vload_partial_5(DATA.s89ABCDEF, OFFSET, (PTR) + 8); \
    }

#define vload_partial_14(DATA, OFFSET, PTR)                 \
    {                                                       \
        vload_partial_8(DATA.s01234567, OFFSET, PTR);       \
        vload_partial_6(DATA.s89ABCDEF, OFFSET, (PTR) + 8); \
    }

#define vload_partial_15(DATA, OFFSET, PTR)                 \
    {                                                       \
        vload_partial_8(DATA.s01234567, OFFSET, PTR);       \
        vload_partial_7(DATA.s89ABCDEF, OFFSET, (PTR) + 8); \
    }

#define vload_partial_16(DATA, OFFSET, PTR) \
    DATA = vload16(OFFSET, PTR);

/* Number of 4-component pixels needed to hold a vector of 4, 8 or 16 elements. */
#define PIXEL_UNIT4  1
#define PIXEL_UNIT8  2
#define PIXEL_UNIT16 4

/* CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size): PIXEL_UNIT<vec_size>, vec_size expanded first. */
#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) PIXEL_UNIT##vec_size
#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size)     CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size)

/*
 * read_image2d_<type>x<n>(img, x_coord, y_coord): read n horizontally
 * consecutive pixels starting at (x_coord, y_coord) and pack them into one
 * vector of 4 * n elements.
 *
 * The expansions end in a semicolon on purpose: this is the original
 * contract, and call sites may rely on it, so it is left untouched.
 */
#define read_image2d_floatx1(img, x_coord, y_coord) \
    (float4)(read_imagef(img, (int2)(x_coord, y_coord)));
#define read_image2d_floatx2(img, x_coord, y_coord) \
    (float8)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)));
#define read_image2d_floatx4(img, x_coord, y_coord) \
    (float16)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)), read_imagef(img, (int2)(x_coord + 2, y_coord)), read_imagef(img, (int2)(x_coord + 3, y_coord)));

#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#define read_image2d_halfx1(img, x_coord, y_coord) \
    (half4)(read_imageh(img, (int2)(x_coord, y_coord)));
#define read_image2d_halfx2(img, x_coord, y_coord) \
    (half8)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)));
#define read_image2d_halfx4(img, x_coord, y_coord) \
    (half16)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)), read_imageh(img, (int2)(x_coord + 2, y_coord)), read_imageh(img, (int2)(x_coord + 3, y_coord)));
#endif /* defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) */

/*
 * write_image2d_<type>x<n>(img, x_coord, y_coord, values): write the 4 * n
 * elements of values as n horizontally consecutive pixels starting at
 * (x_coord, y_coord). As with the readers, the trailing semicolon is part of
 * the expansion.
 */
#define write_image2d_floatx1(img, x_coord, y_coord, values) \
    (write_imagef(img, (int2)(x_coord, y_coord), values));
#define write_image2d_floatx2(img, x_coord, y_coord, values) \
    (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567));
#define write_image2d_floatx4(img, x_coord, y_coord, values) \
    (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imagef(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imagef(img, (int2)(x_coord + 3, y_coord), values.sCDEF));

#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#define write_image2d_halfx1(img, x_coord, y_coord, values) \
    (write_imageh(img, (int2)(x_coord, y_coord), values));
#define write_image2d_halfx2(img, x_coord, y_coord, values) \
    (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567));
#define write_image2d_halfx4(img, x_coord, y_coord, values) \
    (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imageh(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imageh(img, (int2)(x_coord + 3, y_coord), values.sCDEF));
#endif /* defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) */
/*
 * READ_IMAGE2D(data_type, n0, img, x_coord, y_coord): dispatch to
 * read_image2d_<data_type>x<n0>; data_type and n0 are expanded before pasting.
 */
#define READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) read_image2d_##data_type##x##n0(img, x_coord, y_coord)
#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord)     READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord)

/*
 * WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values): dispatch to
 * write_image2d_<data_type>x<n0>; data_type and n0 are expanded before pasting.
 */
#define WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) write_image2d_##data_type##x##n0(img, x_coord, y_coord, values)
#define WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values)     WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values)

/* VSTORE(size): name of the vstore<size> function, with size macro-expanded first. */
#define VSTORE_STR(size) vstore##size
#define VSTORE(size)     VSTORE_STR(size)

/* Size-1 type names, so that <type><size> pasting also works for size == 1. */
#define float1  float
#define half1   half
#define char1   char
#define uchar1  uchar
#define short1  short
#define ushort1 ushort
#define int1    int
#define uint1   uint
#define long1   long
#define ulong1  ulong
#define double1 double

/*
 * Scalar counterparts of vloadN / vstoreN: access the element at PTR + OFFSET.
 * Arguments and the whole expansion are parenthesized so that expression
 * arguments (for example a conditional pointer) keep their grouping.
 */
#define vload1(OFFSET, PTR)        (*((OFFSET) + (PTR)))
#define vstore1(DATA, OFFSET, PTR) (*((OFFSET) + (PTR)) = (DATA))

/*
 * VSTORE_PARTIAL(size, store_size): name of the macro that stores the first
 * store_size elements of a size-component vector. It resolves through the
 * vstore_partial_<size>_<store_size> alias table; combinations with
 * store_size == 0 or store_size > size resolve to NO_STORE.
 */
#define VSTORE_PARTIAL_STR(size, store_size) vstore_partial_##size##_##store_size
#define VSTORE_PARTIAL(size, store_size)     VSTORE_PARTIAL_STR(size, store_size)

/* Placeholder for invalid size/store_size combinations: expands to an empty block. */
#define NO_STORE(data, offs, ptr) \
    {                             \
    }

/* Alias table, vector size 1. */
#define vstore_partial_1_0  NO_STORE
#define vstore_partial_1_1  vstore1
#define vstore_partial_1_2  NO_STORE
#define vstore_partial_1_3  NO_STORE
#define vstore_partial_1_4  NO_STORE
#define vstore_partial_1_5  NO_STORE
#define vstore_partial_1_6  NO_STORE
#define vstore_partial_1_7  NO_STORE
#define vstore_partial_1_8  NO_STORE
#define vstore_partial_1_9  NO_STORE
#define vstore_partial_1_10 NO_STORE
#define vstore_partial_1_11 NO_STORE
#define vstore_partial_1_12 NO_STORE
#define vstore_partial_1_13 NO_STORE
#define vstore_partial_1_14 NO_STORE
#define vstore_partial_1_15 NO_STORE
#define vstore_partial_1_16 NO_STORE

/* Alias table, vector size 2. */
#define vstore_partial_2_0  NO_STORE
#define vstore_partial_2_1  vstore_partial_1
#define vstore_partial_2_2  vstore_partial_2
#define vstore_partial_2_3  NO_STORE
#define vstore_partial_2_4  NO_STORE
#define vstore_partial_2_5  NO_STORE
#define vstore_partial_2_6  NO_STORE
#define vstore_partial_2_7  NO_STORE
#define vstore_partial_2_8  NO_STORE
#define vstore_partial_2_9  NO_STORE
#define vstore_partial_2_10 NO_STORE
#define vstore_partial_2_11 NO_STORE
#define vstore_partial_2_12 NO_STORE
#define vstore_partial_2_13 NO_STORE
#define vstore_partial_2_14 NO_STORE
#define vstore_partial_2_15 NO_STORE
#define vstore_partial_2_16 NO_STORE

/* Alias table, vector size 3. */
#define vstore_partial_3_0  NO_STORE
#define vstore_partial_3_1  vstore_partial_1
#define vstore_partial_3_2  vstore_partial_2
#define vstore_partial_3_3  vstore_partial_3
#define vstore_partial_3_4  NO_STORE
#define vstore_partial_3_5  NO_STORE
#define vstore_partial_3_6  NO_STORE
#define vstore_partial_3_7  NO_STORE
#define vstore_partial_3_8  NO_STORE
#define vstore_partial_3_9  NO_STORE
#define vstore_partial_3_10 NO_STORE
#define vstore_partial_3_11 NO_STORE
#define vstore_partial_3_12 NO_STORE
#define vstore_partial_3_13 NO_STORE
#define vstore_partial_3_14 NO_STORE
#define vstore_partial_3_15 NO_STORE
#define vstore_partial_3_16 NO_STORE

/* Alias table, vector size 4. */
#define vstore_partial_4_0  NO_STORE
#define vstore_partial_4_1  vstore_partial_1
#define vstore_partial_4_2  vstore_partial_2
#define vstore_partial_4_3  vstore_partial_3
#define vstore_partial_4_4  vstore_partial_4
#define vstore_partial_4_5  NO_STORE
#define vstore_partial_4_6  NO_STORE
#define vstore_partial_4_7  NO_STORE
#define vstore_partial_4_8  NO_STORE
#define vstore_partial_4_9  NO_STORE
#define vstore_partial_4_10 NO_STORE
#define vstore_partial_4_11 NO_STORE
#define vstore_partial_4_12 NO_STORE
#define vstore_partial_4_13 NO_STORE
#define vstore_partial_4_14 NO_STORE
#define vstore_partial_4_15 NO_STORE
#define vstore_partial_4_16 NO_STORE

/* Alias table, vector size 8. */
#define vstore_partial_8_0  NO_STORE
#define vstore_partial_8_1  vstore_partial_1
#define vstore_partial_8_2  vstore_partial_2
#define vstore_partial_8_3  vstore_partial_3
#define vstore_partial_8_4  vstore_partial_4
#define vstore_partial_8_5  vstore_partial_5
#define vstore_partial_8_6  vstore_partial_6
#define vstore_partial_8_7  vstore_partial_7
#define vstore_partial_8_8  vstore_partial_8
#define vstore_partial_8_9  NO_STORE
#define vstore_partial_8_10 NO_STORE
#define vstore_partial_8_11 NO_STORE
#define vstore_partial_8_12 NO_STORE
#define vstore_partial_8_13 NO_STORE
#define vstore_partial_8_14 NO_STORE
#define vstore_partial_8_15 NO_STORE
#define vstore_partial_8_16 NO_STORE

/* Alias table, vector size 16. */
#define vstore_partial_16_0  NO_STORE
#define vstore_partial_16_1  vstore_partial_1
#define vstore_partial_16_2  vstore_partial_2
#define vstore_partial_16_3  vstore_partial_3
#define vstore_partial_16_4  vstore_partial_4
#define vstore_partial_16_5  vstore_partial_5
#define vstore_partial_16_6  vstore_partial_6
#define vstore_partial_16_7  vstore_partial_7
#define vstore_partial_16_8  vstore_partial_8
#define vstore_partial_16_9  vstore_partial_9
#define vstore_partial_16_10 vstore_partial_10
#define vstore_partial_16_11 vstore_partial_11
#define vstore_partial_16_12 vstore_partial_12
#define vstore_partial_16_13 vstore_partial_13
#define vstore_partial_16_14 vstore_partial_14
#define vstore_partial_16_15 vstore_partial_15
#define vstore_partial_16_16 vstore_partial_16

/**
 * vstore_partial_N(DATA, OFFSET, PTR): store the first N components of the
 * vector DATA to PTR.
 *
 * Sizes 1, 2, 3, 4, 8 and 16 map to one store each. The other sizes are
 * composed from a 4- or 8-element store followed by a smaller store at
 * PTR + 4 or PTR + 8.
 *
 * The composed sizes are wrapped in a braced block so the whole store is a
 * single statement (an unbraced "if (c) vstore_partial_5(...)" must guard
 * both parts), and PTR is parenthesized before the offset is added. Each
 * macro still terminates itself, so call sites may keep or omit the trailing
 * semicolon.
 *
 * NOTE(review): OFFSET is forwarded unchanged to each inner store, so the
 * parts only line up for OFFSET == 0 -- confirm callers always pass 0.
 */
#define vstore_partial_1(DATA, OFFSET, PTR) \
    vstore1(DATA.s0, OFFSET, PTR);

#define vstore_partial_2(DATA, OFFSET, PTR) \
    vstore2(DATA.s01, OFFSET, PTR);

#define vstore_partial_3(DATA, OFFSET, PTR) \
    vstore3(DATA.s012, OFFSET, PTR);

#define vstore_partial_4(DATA, OFFSET, PTR) \
    vstore4(DATA.s0123, OFFSET, PTR);

#define vstore_partial_5(DATA, OFFSET, PTR)        \
    {                                              \
        vstore_partial_4(DATA.s0123, OFFSET, PTR); \
        vstore1(DATA.s4, OFFSET, (PTR) + 4);       \
    }

#define vstore_partial_6(DATA, OFFSET, PTR)              \
    {                                                    \
        vstore_partial_4(DATA.s0123, OFFSET, PTR);       \
        vstore_partial_2(DATA.s45, OFFSET, (PTR) + 4);   \
    }

#define vstore_partial_7(DATA, OFFSET, PTR)              \
    {                                                    \
        vstore_partial_4(DATA.s0123, OFFSET, PTR);       \
        vstore_partial_3(DATA.s456, OFFSET, (PTR) + 4);  \
    }

#define vstore_partial_8(DATA, OFFSET, PTR) \
    vstore8(DATA.s01234567, OFFSET, PTR);

#define vstore_partial_9(DATA, OFFSET, PTR)            \
    {                                                  \
        vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
        vstore1(DATA.s8, OFFSET, (PTR) + 8);           \
    }

#define vstore_partial_10(DATA, OFFSET, PTR)             \
    {                                                    \
        vstore_partial_8(DATA.s01234567, OFFSET, PTR);   \
        vstore_partial_2(DATA.s89, OFFSET, (PTR) + 8);   \
    }

#define vstore_partial_11(DATA, OFFSET, PTR)             \
    {                                                    \
        vstore_partial_8(DATA.s01234567, OFFSET, PTR);   \
        vstore_partial_3(DATA.s89a, OFFSET, (PTR) + 8);  \
    }

#define vstore_partial_12(DATA, OFFSET, PTR)             \
    {                                                    \
        vstore_partial_8(DATA.s01234567, OFFSET, PTR);   \
        vstore_partial_4(DATA.s89ab, OFFSET, (PTR) + 8); \
    }

#define vstore_partial_13(DATA, OFFSET, PTR)                 \
    {                                                        \
        vstore_partial_8(DATA.s01234567, OFFSET, PTR);       \
        vstore_partial_5(DATA.s89abcdef, OFFSET, (PTR) + 8); \
    }

#define vstore_partial_14(DATA, OFFSET, PTR)                 \
    {                                                        \
        vstore_partial_8(DATA.s01234567, OFFSET, PTR);       \
        vstore_partial_6(DATA.s89abcdef, OFFSET, (PTR) + 8); \
    }

#define vstore_partial_15(DATA, OFFSET, PTR)                 \
    {                                                        \
        vstore_partial_8(DATA.s01234567, OFFSET, PTR);       \
        vstore_partial_7(DATA.s89abcdef, OFFSET, (PTR) + 8); \
    }

#define vstore_partial_16(DATA, OFFSET, PTR) \
    vstore16(DATA, OFFSET, PTR);

/*
 * Saturating-conversion aliases for floating-point destination types.
 * The CONVERT_SAT* helpers paste convert_<type>_sat; for float and half
 * destinations that name is mapped onto the plain convert_<type> function.
 *
 * convert_half_sat maps to convert_half, matching convert_half1_sat and the
 * vector half aliases. It previously mapped to convert_float, so a
 * saturating conversion to half produced a float-typed result.
 */
#define convert_float_sat   convert_float
#define convert_float1_sat  convert_float
#define convert_float2_sat  convert_float2
#define convert_float3_sat  convert_float3
#define convert_float4_sat  convert_float4
#define convert_float8_sat  convert_float8
#define convert_float16_sat convert_float16
#define convert_half_sat    convert_half
#define convert_half1_sat   convert_half
#define convert_half2_sat   convert_half2
#define convert_half3_sat   convert_half3
#define convert_half4_sat   convert_half4
#define convert_half8_sat   convert_half8
#define convert_half16_sat  convert_half16

/* Size-1 conversion names, so that convert_<type><size> pasting works for size == 1. */
#define convert_float1  convert_float
#define convert_half1   convert_half
#define convert_char1   convert_char
#define convert_uchar1  convert_uchar
#define convert_short1  convert_short
#define convert_ushort1 convert_ushort
#define convert_int1    convert_int
#define convert_uint1   convert_uint
#define convert_long1   convert_long
#define convert_ulong1  convert_ulong
#define convert_double1 convert_double

/*
 * Size-1 saturating conversion names. The convert_ucharN_sat entries expand
 * to themselves; they are kept exactly as in the original.
 */
#define convert_char1_sat   convert_char_sat
#define convert_uchar1_sat  convert_uchar_sat
#define convert_uchar2_sat  convert_uchar2_sat
#define convert_uchar3_sat  convert_uchar3_sat
#define convert_uchar4_sat  convert_uchar4_sat
#define convert_uchar8_sat  convert_uchar8_sat
#define convert_uchar16_sat convert_uchar16_sat
#define convert_short1_sat  convert_short_sat
#define convert_ushort1_sat convert_ushort_sat
#define convert_int1_sat    convert_int_sat
#define convert_uint1_sat   convert_uint_sat
#define convert_long1_sat   convert_long_sat
#define convert_ulong1_sat  convert_ulong_sat
#define convert_double1_sat convert_double_sat

/* VEC_DATA_TYPE(type, size): the type name <type><size>, arguments expanded first. */
#define VEC_DATA_TYPE_STR(type, size) type##size
#define VEC_DATA_TYPE(type, size)     VEC_DATA_TYPE_STR(type, size)

/* CONVERT(x, type): convert_<type>(x). */
#define CONVERT_STR(x, type) (convert_##type((x)))
#define CONVERT(x, type)     CONVERT_STR(x, type)

/* CONVERT_SAT(x, type): convert_<type>_sat(x). */
#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
#define CONVERT_SAT(x, type)     CONVERT_SAT_STR(x, type)

/* CONVERT_SAT_ROUND(x, type, round): convert_<type>_sat_<round>(x). */
#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
#define CONVERT_SAT_ROUND(x, type, round)     CONVERT_SAT_ROUND_STR(x, type, round)

/*
 * select_vec_dt_<type>(size): integer vector type with the same element
 * width as <type>. Integer types map to themselves, half maps to short and
 * float maps to int.
 */
#define select_vec_dt_uchar(size)  uchar##size
#define select_vec_dt_char(size)   char##size
#define select_vec_dt_ushort(size) ushort##size
#define select_vec_dt_short(size)  short##size
#define select_vec_dt_half(size)   short##size
#define select_vec_dt_uint(size)   uint##size
#define select_vec_dt_int(size)    int##size
#define select_vec_dt_float(size)  int##size
4733*c217d954SCole Faust#define select_vec_dt_ulong(size) ulong##size 4734*c217d954SCole Faust#define select_vec_dt_long(size) long##size 4735*c217d954SCole Faust 4736*c217d954SCole Faust#define SELECT_VEC_DATA_TYPE_STR(type, size) select_vec_dt_##type(size) 4737*c217d954SCole Faust#define SELECT_VEC_DATA_TYPE(type, size) SELECT_VEC_DATA_TYPE_STR(type, size) 4738*c217d954SCole Faust#define SELECT_DATA_TYPE(type) SELECT_VEC_DATA_TYPE_STR(type, 1) 4739*c217d954SCole Faust 4740*c217d954SCole Faust#define signed_int_vec_dt_uchar(size) char##size 4741*c217d954SCole Faust#define signed_int_vec_dt_char(size) char##size 4742*c217d954SCole Faust#define signed_int_vec_dt_ushort(size) short##size 4743*c217d954SCole Faust#define signed_int_vec_dt_short(size) short##size 4744*c217d954SCole Faust#define signed_int_vec_dt_half(size) short##size 4745*c217d954SCole Faust#define signed_int_vec_dt_uint(size) int##size 4746*c217d954SCole Faust#define signed_int_vec_dt_int(size) int##size 4747*c217d954SCole Faust#define signed_int_vec_dt_float(size) int##size 4748*c217d954SCole Faust#define signed_int_vec_dt_ulong(size) long##size 4749*c217d954SCole Faust#define signed_int_vec_dt_long(size) long##size 4750*c217d954SCole Faust 4751*c217d954SCole Faust#define SIGNED_INT_VEC_DATA_TYPE_STR(type, size) signed_int_vec_dt_##type(size) 4752*c217d954SCole Faust#define SIGNED_INT_VEC_DATA_TYPE(type, size) SIGNED_INT_VEC_DATA_TYPE_STR(type, size) 4753*c217d954SCole Faust#define SIGNED_INT_DATA_TYPE(type) SIGNED_INT_VEC_DATA_TYPE_STR(type, 1) 4754*c217d954SCole Faust 4755*c217d954SCole Faust#define sum_reduce_1(x) (x) 4756*c217d954SCole Faust#define sum_reduce_2(x) ((x).s0) + ((x).s1) 4757*c217d954SCole Faust#define sum_reduce_3(x) sum_reduce_2((x).s01) + ((x).s2) 4758*c217d954SCole Faust#define sum_reduce_4(x) sum_reduce_2((x).s01) + sum_reduce_2((x).s23) 4759*c217d954SCole Faust#define sum_reduce_8(x) sum_reduce_4((x).s0123) + sum_reduce_4((x).s4567) 4760*c217d954SCole Faust#define 
sum_reduce_16(x) sum_reduce_8((x).s01234567) + sum_reduce_8((x).s89ABCDEF) 4761*c217d954SCole Faust 4762*c217d954SCole Faust#define SUM_REDUCE_STR(x, size) sum_reduce_##size(x) 4763*c217d954SCole Faust#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size) 4764*c217d954SCole Faust 4765*c217d954SCole Faust#define prod_reduce_1(x) (x) 4766*c217d954SCole Faust#define prod_reduce_2(x) ((x).s0) * ((x).s1) 4767*c217d954SCole Faust#define prod_reduce_3(x) prod_reduce_2((x).s01) * ((x).s2) 4768*c217d954SCole Faust#define prod_reduce_4(x) prod_reduce_2((x).s01) * prod_reduce_2((x).s23) 4769*c217d954SCole Faust#define prod_reduce_8(x) prod_reduce_4((x).s0123) * prod_reduce_4((x).s4567) 4770*c217d954SCole Faust#define prod_reduce_16(x) prod_reduce_8((x).s01234567) * prod_reduce_8((x).s89ABCDEF) 4771*c217d954SCole Faust 4772*c217d954SCole Faust#define PROD_REDUCE_STR(x, size) prod_reduce_##size(x) 4773*c217d954SCole Faust#define PROD_REDUCE(x, size) PROD_REDUCE_STR(x, size) 4774*c217d954SCole Faust 4775*c217d954SCole Faust#define max_reduce_1(x) (x) 4776*c217d954SCole Faust#define max_reduce_2(x) max(((x).s0), ((x).s1)) 4777*c217d954SCole Faust#define max_reduce_3(x) max(max_reduce_2((x).s01), ((x).s2)) 4778*c217d954SCole Faust#define max_reduce_4(x) max(max_reduce_2((x).s01), max_reduce_2((x).s23)) 4779*c217d954SCole Faust#define max_reduce_8(x) max(max_reduce_4((x).s0123), max_reduce_4((x).s4567)) 4780*c217d954SCole Faust#define max_reduce_16(x) max(max_reduce_8((x).s01234567), max_reduce_8((x).s89ABCDEF)) 4781*c217d954SCole Faust 4782*c217d954SCole Faust#define MAX_REDUCE_STR(x, size) max_reduce_##size(x) 4783*c217d954SCole Faust#define MAX_REDUCE(x, size) MAX_REDUCE_STR(x, size) 4784*c217d954SCole Faust 4785*c217d954SCole Faust#define VECTOR_DECLARATION(name) \ 4786*c217d954SCole Faust __global uchar *name##_ptr, \ 4787*c217d954SCole Faust uint name##_stride_x, \ 4788*c217d954SCole Faust uint name##_step_x, \ 4789*c217d954SCole Faust uint name##_offset_first_element_in_bytes 
4790*c217d954SCole Faust 4791*c217d954SCole Faust#define IMAGE_DECLARATION(name) \ 4792*c217d954SCole Faust __global uchar *name##_ptr, \ 4793*c217d954SCole Faust uint name##_stride_x, \ 4794*c217d954SCole Faust uint name##_step_x, \ 4795*c217d954SCole Faust uint name##_stride_y, \ 4796*c217d954SCole Faust uint name##_step_y, \ 4797*c217d954SCole Faust uint name##_offset_first_element_in_bytes 4798*c217d954SCole Faust 4799*c217d954SCole Faust#define TENSOR3D_DECLARATION(name) \ 4800*c217d954SCole Faust __global uchar *name##_ptr, \ 4801*c217d954SCole Faust uint name##_stride_x, \ 4802*c217d954SCole Faust uint name##_step_x, \ 4803*c217d954SCole Faust uint name##_stride_y, \ 4804*c217d954SCole Faust uint name##_step_y, \ 4805*c217d954SCole Faust uint name##_stride_z, \ 4806*c217d954SCole Faust uint name##_step_z, \ 4807*c217d954SCole Faust uint name##_offset_first_element_in_bytes 4808*c217d954SCole Faust 4809*c217d954SCole Faust#define TENSOR4D_DECLARATION(name) \ 4810*c217d954SCole Faust __global uchar *name##_ptr, \ 4811*c217d954SCole Faust uint name##_stride_x, \ 4812*c217d954SCole Faust uint name##_step_x, \ 4813*c217d954SCole Faust uint name##_stride_y, \ 4814*c217d954SCole Faust uint name##_step_y, \ 4815*c217d954SCole Faust uint name##_stride_z, \ 4816*c217d954SCole Faust uint name##_step_z, \ 4817*c217d954SCole Faust uint name##_stride_w, \ 4818*c217d954SCole Faust uint name##_step_w, \ 4819*c217d954SCole Faust uint name##_offset_first_element_in_bytes 4820*c217d954SCole Faust 4821*c217d954SCole Faust#define TENSOR5D_DECLARATION(name) \ 4822*c217d954SCole Faust __global uchar *name##_ptr, \ 4823*c217d954SCole Faust uint name##_stride_x, \ 4824*c217d954SCole Faust uint name##_step_x, \ 4825*c217d954SCole Faust uint name##_stride_y, \ 4826*c217d954SCole Faust uint name##_step_y, \ 4827*c217d954SCole Faust uint name##_stride_z, \ 4828*c217d954SCole Faust uint name##_step_z, \ 4829*c217d954SCole Faust uint name##_stride_w, \ 4830*c217d954SCole Faust uint 
name##_step_w, \ 4831*c217d954SCole Faust uint name##_stride_v, \ 4832*c217d954SCole Faust uint name##_step_v, \ 4833*c217d954SCole Faust uint name##_offset_first_element_in_bytes 4834*c217d954SCole Faust 4835*c217d954SCole Faust#define CONVERT_TO_VECTOR_STRUCT(name) \ 4836*c217d954SCole Faust update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x) 4837*c217d954SCole Faust 4838*c217d954SCole Faust#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \ 4839*c217d954SCole Faust update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0) 4840*c217d954SCole Faust 4841*c217d954SCole Faust#define CONVERT_TO_IMAGE_STRUCT(name) \ 4842*c217d954SCole Faust update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y) 4843*c217d954SCole Faust 4844*c217d954SCole Faust#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \ 4845*c217d954SCole Faust update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0) 4846*c217d954SCole Faust 4847*c217d954SCole Faust#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \ 4848*c217d954SCole Faust update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z) 4849*c217d954SCole Faust 4850*c217d954SCole Faust#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \ 4851*c217d954SCole Faust update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z) 4852*c217d954SCole Faust 4853*c217d954SCole Faust#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \ 4854*c217d954SCole Faust update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, 
name##_stride_z, name##_step_z) 4855*c217d954SCole Faust 4856*c217d954SCole Faust#define CONVERT_TO_TENSOR3D_STRUCT(name) \ 4857*c217d954SCole Faust update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \ 4858*c217d954SCole Faust name##_stride_z, name##_step_z) 4859*c217d954SCole Faust 4860*c217d954SCole Faust#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \ 4861*c217d954SCole Faust update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0) 4862*c217d954SCole Faust 4863*c217d954SCole Faust#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size) \ 4864*c217d954SCole Faust update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \ 4865*c217d954SCole Faust name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size) 4866*c217d954SCole Faust 4867*c217d954SCole Faust#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \ 4868*c217d954SCole Faust update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size) 4869*c217d954SCole Faust 4870*c217d954SCole Faust#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name) \ 4871*c217d954SCole Faust tensor3D_ptr_no_update(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \ 4872*c217d954SCole Faust name##_stride_z, name##_step_z) 4873*c217d954SCole Faust 4874*c217d954SCole Faust 4875*c217d954SCole Fausttypedef struct Vector 4876*c217d954SCole Faust{ 4877*c217d954SCole Faust __global uchar *ptr; 4878*c217d954SCole Faust int offset_first_element_in_bytes; 4879*c217d954SCole Faust int stride_x; 4880*c217d954SCole Faust} Vector; 4881*c217d954SCole Faust 4882*c217d954SCole Faust 4883*c217d954SCole Fausttypedef 
struct Image 4884*c217d954SCole Faust{ 4885*c217d954SCole Faust __global uchar *ptr; 4886*c217d954SCole Faust int offset_first_element_in_bytes; 4887*c217d954SCole Faust int stride_x; 4888*c217d954SCole Faust int stride_y; 4889*c217d954SCole Faust} Image; 4890*c217d954SCole Faust 4891*c217d954SCole Faust 4892*c217d954SCole Fausttypedef struct Tensor3D 4893*c217d954SCole Faust{ 4894*c217d954SCole Faust __global uchar *ptr; 4895*c217d954SCole Faust int offset_first_element_in_bytes; 4896*c217d954SCole Faust int stride_x; 4897*c217d954SCole Faust int stride_y; 4898*c217d954SCole Faust int stride_z; 4899*c217d954SCole Faust} Tensor3D; 4900*c217d954SCole Faust 4901*c217d954SCole Faust 4902*c217d954SCole Fausttypedef struct Tensor4D 4903*c217d954SCole Faust{ 4904*c217d954SCole Faust __global uchar *ptr; 4905*c217d954SCole Faust int offset_first_element_in_bytes; 4906*c217d954SCole Faust int stride_x; 4907*c217d954SCole Faust int stride_y; 4908*c217d954SCole Faust int stride_z; 4909*c217d954SCole Faust int stride_w; 4910*c217d954SCole Faust} Tensor4D; 4911*c217d954SCole Faust 4912*c217d954SCole Faust 4913*c217d954SCole Faustinline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x) 4914*c217d954SCole Faust{ 4915*c217d954SCole Faust Vector vector = 4916*c217d954SCole Faust { 4917*c217d954SCole Faust .ptr = ptr, 4918*c217d954SCole Faust .offset_first_element_in_bytes = offset_first_element_in_bytes, 4919*c217d954SCole Faust .stride_x = stride_x, 4920*c217d954SCole Faust }; 4921*c217d954SCole Faust vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x; 4922*c217d954SCole Faust return vector; 4923*c217d954SCole Faust} 4924*c217d954SCole Faust 4925*c217d954SCole Faust 4926*c217d954SCole Faustinline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y) 4927*c217d954SCole Faust{ 4928*c217d954SCole 
Faust Image img = 4929*c217d954SCole Faust { 4930*c217d954SCole Faust .ptr = ptr, 4931*c217d954SCole Faust .offset_first_element_in_bytes = offset_first_element_in_bytes, 4932*c217d954SCole Faust .stride_x = stride_x, 4933*c217d954SCole Faust .stride_y = stride_y 4934*c217d954SCole Faust }; 4935*c217d954SCole Faust img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y; 4936*c217d954SCole Faust return img; 4937*c217d954SCole Faust} 4938*c217d954SCole Faust 4939*c217d954SCole Faust 4940*c217d954SCole Faustinline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z) 4941*c217d954SCole Faust{ 4942*c217d954SCole Faust Image img = 4943*c217d954SCole Faust { 4944*c217d954SCole Faust .ptr = ptr, 4945*c217d954SCole Faust .offset_first_element_in_bytes = offset_first_element_in_bytes, 4946*c217d954SCole Faust .stride_x = stride_x, 4947*c217d954SCole Faust .stride_y = stride_y 4948*c217d954SCole Faust }; 4949*c217d954SCole Faust img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z; 4950*c217d954SCole Faust return img; 4951*c217d954SCole Faust} 4952*c217d954SCole Faust 4953*c217d954SCole Faust 4954*c217d954SCole Faustinline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z) 4955*c217d954SCole Faust{ 4956*c217d954SCole Faust Tensor3D tensor = 4957*c217d954SCole Faust { 4958*c217d954SCole Faust .ptr = ptr, 4959*c217d954SCole Faust .offset_first_element_in_bytes = offset_first_element_in_bytes, 4960*c217d954SCole Faust .stride_x = stride_x, 4961*c217d954SCole Faust .stride_y = stride_y, 4962*c217d954SCole Faust .stride_z = stride_z 4963*c217d954SCole Faust }; 4964*c217d954SCole Faust tensor.ptr += 
tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z; 4965*c217d954SCole Faust return tensor; 4966*c217d954SCole Faust} 4967*c217d954SCole Faust 4968*c217d954SCole Faust 4969*c217d954SCole Faustinline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z) 4970*c217d954SCole Faust{ 4971*c217d954SCole Faust Tensor3D tensor = 4972*c217d954SCole Faust { 4973*c217d954SCole Faust .ptr = ptr, 4974*c217d954SCole Faust .offset_first_element_in_bytes = offset_first_element_in_bytes, 4975*c217d954SCole Faust .stride_x = stride_x, 4976*c217d954SCole Faust .stride_y = stride_y, 4977*c217d954SCole Faust .stride_z = stride_z 4978*c217d954SCole Faust }; 4979*c217d954SCole Faust return tensor; 4980*c217d954SCole Faust} 4981*c217d954SCole Faust 4982*c217d954SCole Faustinline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w, 4983*c217d954SCole Faust uint step_w, 4984*c217d954SCole Faust uint mod_size) 4985*c217d954SCole Faust{ 4986*c217d954SCole Faust Tensor4D tensor = 4987*c217d954SCole Faust { 4988*c217d954SCole Faust .ptr = ptr, 4989*c217d954SCole Faust .offset_first_element_in_bytes = offset_first_element_in_bytes, 4990*c217d954SCole Faust .stride_x = stride_x, 4991*c217d954SCole Faust .stride_y = stride_y, 4992*c217d954SCole Faust .stride_z = stride_z, 4993*c217d954SCole Faust .stride_w = stride_w 4994*c217d954SCole Faust }; 4995*c217d954SCole Faust 4996*c217d954SCole Faust tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w; 4997*c217d954SCole Faust return tensor; 4998*c217d954SCole Faust} 4999*c217d954SCole Faust 
5000*c217d954SCole Faust 5001*c217d954SCole Faustinline __global const uchar *vector_offset(const Vector *vec, int x) 5002*c217d954SCole Faust{ 5003*c217d954SCole Faust return vec->ptr + x * vec->stride_x; 5004*c217d954SCole Faust} 5005*c217d954SCole Faust 5006*c217d954SCole Faust 5007*c217d954SCole Faustinline __global uchar *offset(const Image *img, int x, int y) 5008*c217d954SCole Faust{ 5009*c217d954SCole Faust return img->ptr + x * img->stride_x + y * img->stride_y; 5010*c217d954SCole Faust} 5011*c217d954SCole Faust 5012*c217d954SCole Faust 5013*c217d954SCole Faustinline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z) 5014*c217d954SCole Faust{ 5015*c217d954SCole Faust return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z; 5016*c217d954SCole Faust} 5017*c217d954SCole Faust 5018*c217d954SCole Faust 5019*c217d954SCole Faustinline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w) 5020*c217d954SCole Faust{ 5021*c217d954SCole Faust return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w; 5022*c217d954SCole Faust} 5023*c217d954SCole Faust 5024*c217d954SCole Faust 5025*c217d954SCole Faustinline __global const uchar *tensor3D_index2ptr(const Tensor3D *tensor, uint width, uint height, uint depth, uint index) 5026*c217d954SCole Faust{ 5027*c217d954SCole Faust uint num_elements = width * height; 5028*c217d954SCole Faust 5029*c217d954SCole Faust const uint z = index / num_elements; 5030*c217d954SCole Faust 5031*c217d954SCole Faust index %= num_elements; 5032*c217d954SCole Faust 5033*c217d954SCole Faust const uint y = index / width; 5034*c217d954SCole Faust 5035*c217d954SCole Faust index %= width; 5036*c217d954SCole Faust 5037*c217d954SCole Faust const uint x = index; 5038*c217d954SCole Faust 5039*c217d954SCole Faust return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + 
tensor->offset_first_element_in_bytes; 5040*c217d954SCole Faust} 5041*c217d954SCole Faust 5042*c217d954SCole Faust#endif 5043*c217d954SCole Faust 5044*c217d954SCole Faust 5045*c217d954SCole Faust 5046*c217d954SCole Faust#define REPEAT_3_1(P_X, P_A, P_B, P_C) P_X##_DEF(0, P_A, P_B, P_C) 5047*c217d954SCole Faust#define REPEAT_3_2(P_X, P_A, P_B, P_C) \ 5048*c217d954SCole Faust P_X##_DEF(1, P_A, P_B, P_C); \ 5049*c217d954SCole Faust REPEAT_3_1(P_X, P_A, P_B, P_C) 5050*c217d954SCole Faust#define REPEAT_3_3(P_X, P_A, P_B, P_C) \ 5051*c217d954SCole Faust P_X##_DEF(2, P_A, P_B, P_C); \ 5052*c217d954SCole Faust REPEAT_3_2(P_X, P_A, P_B, P_C) 5053*c217d954SCole Faust#define REPEAT_3_4(P_X, P_A, P_B, P_C) \ 5054*c217d954SCole Faust P_X##_DEF(3, P_A, P_B, P_C); \ 5055*c217d954SCole Faust REPEAT_3_3(P_X, P_A, P_B, P_C) 5056*c217d954SCole Faust#define REPEAT_3_5(P_X, P_A, P_B, P_C) \ 5057*c217d954SCole Faust P_X##_DEF(4, P_A, P_B, P_C); \ 5058*c217d954SCole Faust REPEAT_3_4(P_X, P_A, P_B, P_C) 5059*c217d954SCole Faust#define REPEAT_3_6(P_X, P_A, P_B, P_C) \ 5060*c217d954SCole Faust P_X##_DEF(5, P_A, P_B, P_C); \ 5061*c217d954SCole Faust REPEAT_3_5(P_X, P_A, P_B, P_C) 5062*c217d954SCole Faust#define REPEAT_3_7(P_X, P_A, P_B, P_C) \ 5063*c217d954SCole Faust P_X##_DEF(6, P_A, P_B, P_C); \ 5064*c217d954SCole Faust REPEAT_3_6(P_X, P_A, P_B, P_C) 5065*c217d954SCole Faust#define REPEAT_3_8(P_X, P_A, P_B, P_C) \ 5066*c217d954SCole Faust P_X##_DEF(7, P_A, P_B, P_C); \ 5067*c217d954SCole Faust REPEAT_3_7(P_X, P_A, P_B, P_C) 5068*c217d954SCole Faust#define REPEAT_3_9(P_X, P_A, P_B, P_C) \ 5069*c217d954SCole Faust P_X##_DEF(8, P_A, P_B, P_C); \ 5070*c217d954SCole Faust REPEAT_3_8(P_X, P_A, P_B, P_C) 5071*c217d954SCole Faust#define REPEAT_3_10(P_X, P_A, P_B, P_C) \ 5072*c217d954SCole Faust P_X##_DEF(9, P_A, P_B, P_C); \ 5073*c217d954SCole Faust REPEAT_3_9(P_X, P_A, P_B, P_C) 5074*c217d954SCole Faust#define REPEAT_3_11(P_X, P_A, P_B, P_C) \ 5075*c217d954SCole Faust P_X##_DEF(A, P_A, P_B, 
P_C); \ 5076*c217d954SCole Faust REPEAT_3_10(P_X, P_A, P_B, P_C) 5077*c217d954SCole Faust#define REPEAT_3_12(P_X, P_A, P_B, P_C) \ 5078*c217d954SCole Faust P_X##_DEF(B, P_A, P_B, P_C); \ 5079*c217d954SCole Faust REPEAT_3_11(P_X, P_A, P_B, P_C) 5080*c217d954SCole Faust#define REPEAT_3_13(P_X, P_A, P_B, P_C) \ 5081*c217d954SCole Faust P_X##_DEF(C, P_A, P_B, P_C); \ 5082*c217d954SCole Faust REPEAT_3_12(P_X, P_A, P_B, P_C) 5083*c217d954SCole Faust#define REPEAT_3_14(P_X, P_A, P_B, P_C) \ 5084*c217d954SCole Faust P_X##_DEF(D, P_A, P_B, P_C); \ 5085*c217d954SCole Faust REPEAT_3_13(P_X, P_A, P_B, P_C) 5086*c217d954SCole Faust#define REPEAT_3_15(P_X, P_A, P_B, P_C) \ 5087*c217d954SCole Faust P_X##_DEF(E, P_A, P_B, P_C); \ 5088*c217d954SCole Faust REPEAT_3_14(P_X, P_A, P_B, P_C) 5089*c217d954SCole Faust#define REPEAT_3_16(P_X, P_A, P_B, P_C) \ 5090*c217d954SCole Faust P_X##_DEF(F, P_A, P_B, P_C); \ 5091*c217d954SCole Faust REPEAT_3_15(P_X, P_A, P_B, P_C) 5092*c217d954SCole Faust 5093*c217d954SCole Faust#define REPEAT_DEF_3_N(P_NUM, P_OP, P_A, P_B, P_C) REPEAT_3_##P_NUM(P_OP, P_A, P_B, P_C) 5094*c217d954SCole Faust#define REPEAT_3_N(P_NUM, P_OP, P_A, P_B, P_C) REPEAT_DEF_3_N(P_NUM, P_OP, P_A, P_B, P_C) 5095*c217d954SCole Faust 5096*c217d954SCole Faust 5097*c217d954SCole Faust#define REPEAT_4_1(P_X, P_A, P_B, P_C, P_D) P_X##_DEF(0, P_A, P_B, P_C, P_D) 5098*c217d954SCole Faust#define REPEAT_4_2(P_X, P_A, P_B, P_C, P_D) \ 5099*c217d954SCole Faust P_X##_DEF(1, P_A, P_B, P_C, P_D); \ 5100*c217d954SCole Faust REPEAT_4_1(P_X, P_A, P_B, P_C, P_D) 5101*c217d954SCole Faust#define REPEAT_4_3(P_X, P_A, P_B, P_C, P_D) \ 5102*c217d954SCole Faust P_X##_DEF(2, P_A, P_B, P_C, P_D); \ 5103*c217d954SCole Faust REPEAT_4_2(P_X, P_A, P_B, P_C, P_D) 5104*c217d954SCole Faust#define REPEAT_4_4(P_X, P_A, P_B, P_C, P_D) \ 5105*c217d954SCole Faust P_X##_DEF(3, P_A, P_B, P_C, P_D); \ 5106*c217d954SCole Faust REPEAT_4_3(P_X, P_A, P_B, P_C, P_D) 5107*c217d954SCole Faust#define REPEAT_4_5(P_X, P_A, P_B, 
P_C, P_D) \ 5108*c217d954SCole Faust P_X##_DEF(4, P_A, P_B, P_C, P_D); \ 5109*c217d954SCole Faust REPEAT_4_4(P_X, P_A, P_B, P_C, P_D) 5110*c217d954SCole Faust#define REPEAT_4_6(P_X, P_A, P_B, P_C, P_D) \ 5111*c217d954SCole Faust P_X##_DEF(5, P_A, P_B, P_C, P_D); \ 5112*c217d954SCole Faust REPEAT_4_5(P_X, P_A, P_B, P_C, P_D) 5113*c217d954SCole Faust#define REPEAT_4_7(P_X, P_A, P_B, P_C, P_D) \ 5114*c217d954SCole Faust P_X##_DEF(6, P_A, P_B, P_C, P_D); \ 5115*c217d954SCole Faust REPEAT_4_6(P_X, P_A, P_B, P_C, P_D) 5116*c217d954SCole Faust#define REPEAT_4_8(P_X, P_A, P_B, P_C, P_D) \ 5117*c217d954SCole Faust P_X##_DEF(7, P_A, P_B, P_C, P_D); \ 5118*c217d954SCole Faust REPEAT_4_7(P_X, P_A, P_B, P_C, P_D) 5119*c217d954SCole Faust#define REPEAT_4_9(P_X, P_A, P_B, P_C, P_D) \ 5120*c217d954SCole Faust P_X##_DEF(8, P_A, P_B, P_C, P_D); \ 5121*c217d954SCole Faust REPEAT_4_8(P_X, P_A, P_B, P_C, P_D) 5122*c217d954SCole Faust#define REPEAT_4_10(P_X, P_A, P_B, P_C, P_D) \ 5123*c217d954SCole Faust P_X##_DEF(9, P_A, P_B, P_C, P_D); \ 5124*c217d954SCole Faust REPEAT_4_9(P_X, P_A, P_B, P_C, P_D) 5125*c217d954SCole Faust#define REPEAT_4_11(P_X, P_A, P_B, P_C, P_D) \ 5126*c217d954SCole Faust P_X##_DEF(A, P_A, P_B, P_C, P_D); \ 5127*c217d954SCole Faust REPEAT_4_10(P_X, P_A, P_B, P_C, P_D) 5128*c217d954SCole Faust#define REPEAT_4_12(P_X, P_A, P_B, P_C, P_D) \ 5129*c217d954SCole Faust P_X##_DEF(B, P_A, P_B, P_C, P_D); \ 5130*c217d954SCole Faust REPEAT_4_11(P_X, P_A, P_B, P_C, P_D) 5131*c217d954SCole Faust#define REPEAT_4_13(P_X, P_A, P_B, P_C, P_D) \ 5132*c217d954SCole Faust P_X##_DEF(C, P_A, P_B, P_C, P_D); \ 5133*c217d954SCole Faust REPEAT_4_12(P_X, P_A, P_B, P_C, P_D) 5134*c217d954SCole Faust#define REPEAT_4_14(P_X, P_A, P_B, P_C, P_D) \ 5135*c217d954SCole Faust P_X##_DEF(D, P_A, P_B, P_C, P_D); \ 5136*c217d954SCole Faust REPEAT_4_13(P_X, P_A, P_B, P_C, P_D) 5137*c217d954SCole Faust#define REPEAT_4_15(P_X, P_A, P_B, P_C, P_D) \ 5138*c217d954SCole Faust P_X##_DEF(E, P_A, P_B, P_C, 
P_D); \ 5139*c217d954SCole Faust REPEAT_4_14(P_X, P_A, P_B, P_C, P_D) 5140*c217d954SCole Faust#define REPEAT_4_16(P_X, P_A, P_B, P_C, P_D) \ 5141*c217d954SCole Faust P_X##_DEF(F, P_A, P_B, P_C, P_D); \ 5142*c217d954SCole Faust REPEAT_4_15(P_X, P_A, P_B, P_C, P_D) 5143*c217d954SCole Faust 5144*c217d954SCole Faust#define REPEAT_DEF_4_N(P_NUM, P_OP, P_A, P_B, P_C, P_D) REPEAT_4_##P_NUM(P_OP, P_A, P_B, P_C, P_D) 5145*c217d954SCole Faust#define REPEAT_4_N(P_NUM, P_OP, P_A, P_B, P_C, P_D) REPEAT_DEF_4_N(P_NUM, P_OP, P_A, P_B, P_C, P_D) 5146*c217d954SCole Faust 5147*c217d954SCole Faust 5148*c217d954SCole Faust#define VAR_INIT_TO_CONST_DEF(ID, TYPE, VAR, VAL) TYPE VAR##ID = VAL 5149*c217d954SCole Faust#define REPEAT_VAR_INIT_TO_CONST(N, TYPE, VAR, VAL) REPEAT_3_N(N, VAR_INIT_TO_CONST, TYPE, VAR, VAL) 5150*c217d954SCole Faust 5151*c217d954SCole Faust 5152*c217d954SCole Faust#define VAR_INIT_CONVERT_DEF(ID, TYPE_OUT, VAR_IN, VAR_OUT) TYPE_OUT VAR_OUT##ID = CONVERT(VAR_IN##ID, TYPE_OUT) 5153*c217d954SCole Faust#define REPEAT_VAR_INIT_CONVERT(N, TYPE_OUT, VAR_IN, VAR_OUT) REPEAT_3_N(N, VAR_INIT_CONVERT, TYPE_OUT, VAR_IN, VAR_OUT) 5154*c217d954SCole Faust 5155*c217d954SCole Faust 5156*c217d954SCole Faust#define VAR_INIT_CONVERT_SAT_DEF(ID, TYPE_OUT, VAR_IN, VAR_OUT) TYPE_OUT VAR_OUT##ID = CONVERT_SAT(VAR_IN##ID, TYPE_OUT) 5157*c217d954SCole Faust#define REPEAT_VAR_INIT_CONVERT_SAT(N, TYPE_OUT, VAR_IN, VAR_OUT) REPEAT_3_N(N, VAR_INIT_CONVERT_SAT, TYPE_OUT, VAR_IN, VAR_OUT) 5158*c217d954SCole Faust 5159*c217d954SCole Faust 5160*c217d954SCole Faust#define ADD_CONST_TO_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID += (TYPE)VAL 5161*c217d954SCole Faust#define REPEAT_ADD_CONST_TO_VAR(N, TYPE, VAR, VAL) REPEAT_3_N(N, ADD_CONST_TO_VAR, TYPE, VAR, VAL) 5162*c217d954SCole Faust 5163*c217d954SCole Faust 5164*c217d954SCole Faust#define MLA_VAR_WITH_CONST_VEC_DEF(ID, VAR_A, VAR_B, VAL) VAR_A##ID += VAR_B##ID * VAL 5165*c217d954SCole Faust#define REPEAT_MLA_VAR_WITH_CONST_VEC(N, VAR_A, VAR_B, VAL) 
REPEAT_3_N(N, MLA_VAR_WITH_CONST_VEC, VAR_A, VAR_B, VAL) 5166*c217d954SCole Faust 5167*c217d954SCole Faust 5168*c217d954SCole Faust#define ADD_VECTOR_TO_VAR_DEF(ID, TYPE, VAR, VEC) VAR##ID += VEC 5169*c217d954SCole Faust#define REPEAT_ADD_VECTOR_TO_VAR(N, VAR, VEC) REPEAT_3_N(N, ADD_VECTOR_TO_VAR, "", VAR, VEC) 5170*c217d954SCole Faust 5171*c217d954SCole Faust 5172*c217d954SCole Faust#define ADD_TWO_VARS_DEF(ID, TYPE, VAR_A, VAR_B) VAR_A##ID += VAR_B##ID 5173*c217d954SCole Faust#define REPEAT_ADD_TWO_VARS(N, VAR_A, VAR_B) REPEAT_3_N(N, ADD_TWO_VARS, "", VAR_A, VAR_B) 5174*c217d954SCole Faust 5175*c217d954SCole Faust 5176*c217d954SCole Faust#define MAX_CONST_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID = max(VAR##ID, (TYPE)VAL) 5177*c217d954SCole Faust#define REPEAT_MAX_CONST_VAR(N, TYPE, VAR, VAL) REPEAT_3_N(N, MAX_CONST_VAR, TYPE, VAR, VAL) 5178*c217d954SCole Faust 5179*c217d954SCole Faust 5180*c217d954SCole Faust#define MIN_CONST_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID = min(VAR##ID, (TYPE)VAL) 5181*c217d954SCole Faust#define REPEAT_MIN_CONST_VAR(N, TYPE, VAR, VAL) REPEAT_3_N(N, MIN_CONST_VAR, TYPE, VAR, VAL) 5182*c217d954SCole Faust 5183*c217d954SCole Faust 5184*c217d954SCole Faust#define ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE_DEF(ID, SIZE, VAR, RES_MUL, RES_SHIFT) VAR##ID = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, SIZE) 5185*c217d954SCole Faust#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(N, SIZE, VAR, RES_MUL, RES_SHIFT) REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE, SIZE, VAR, RES_MUL, RES_SHIFT) 5186*c217d954SCole Faust 5187*c217d954SCole Faust 5188*c217d954SCole Faust#define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE_DEF(ID, SIZE, VAR, RES_MUL, RES_SHIFT) VAR##ID = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, SIZE) 5189*c217d954SCole Faust#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(N, SIZE, VAR, RES_MUL, RES_SHIFT) REPEAT_4_N(N, 
ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE, SIZE, VAR, RES_MUL, RES_SHIFT) 5190*c217d954SCole Faust 5191*c217d954SCole Faust 5192*c217d954SCole Faust#define ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL_DEF(ID, SIZE, VAR, RES_MUL, RES_SHIFT) \ 5193*c217d954SCole Faust ({ \ 5194*c217d954SCole Faust VEC_DATA_TYPE(int, N0) \ 5195*c217d954SCole Faust VAR##ID_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, N0); \ 5196*c217d954SCole Faust VEC_DATA_TYPE(int, N0) \ 5197*c217d954SCole Faust VAR##ID_shift_gt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, N0); \ 5198*c217d954SCole Faust VAR##ID = select(VAR##ID_shift_lt0, VAR##ID_shift_gt0, RES_SHIFT >= 0); \ 5199*c217d954SCole Faust }) 5200*c217d954SCole Faust#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL(N, SIZE, VAR, RES_MUL, RES_SHIFT) REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL, SIZE, VAR, RES_MUL, RES_SHIFT) 5201*c217d954SCole Faust 5202*c217d954SCole Faust#endif 5203*c217d954SCole Faust 5204*c217d954SCole Faust#if defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE) 5205*c217d954SCole Faust 5206*c217d954SCole Faust#define CONCAT(a, b) a##b 5207*c217d954SCole Faust 5208*c217d954SCole Faust#define ARM_DOT1(a, b, c) \ 5209*c217d954SCole Faust ({ \ 5210*c217d954SCole Faust c = fma(a, b, c); \ 5211*c217d954SCole Faust }) 5212*c217d954SCole Faust#define ARM_DOT2(a, b, c) \ 5213*c217d954SCole Faust ({ \ 5214*c217d954SCole Faust c = fma(a.s0, b.s0, c); \ 5215*c217d954SCole Faust c = fma(a.s1, b.s1, c); \ 5216*c217d954SCole Faust }) 5217*c217d954SCole Faust#define ARM_DOT3(a, b, c) \ 5218*c217d954SCole Faust ({ \ 5219*c217d954SCole Faust ARM_DOT2(a, b, c); \ 5220*c217d954SCole Faust c = fma((a.s2), (b.s2), c); \ 5221*c217d954SCole Faust }) 5222*c217d954SCole Faust#define ARM_DOT4(a, b, c) \ 5223*c217d954SCole Faust ({ \ 5224*c217d954SCole Faust ARM_DOT3(a, b, c); \ 5225*c217d954SCole Faust c = fma((a.s3), 
(b.s3), c); \ 5226*c217d954SCole Faust }) 5227*c217d954SCole Faust#define ARM_DOT8(a, b, c) \ 5228*c217d954SCole Faust ({ \ 5229*c217d954SCole Faust ARM_DOT4((a.lo), (b.lo), c); \ 5230*c217d954SCole Faust ARM_DOT4((a.hi), (b.hi), c); \ 5231*c217d954SCole Faust }) 5232*c217d954SCole Faust#define ARM_DOT16(a, b, c) \ 5233*c217d954SCole Faust ({ \ 5234*c217d954SCole Faust ARM_DOT8((a.lo), (b.lo), c); \ 5235*c217d954SCole Faust ARM_DOT8((a.hi), (b.hi), c); \ 5236*c217d954SCole Faust }) 5237*c217d954SCole Faust 5238*c217d954SCole Faust#if N0 == 2 5239*c217d954SCole Faust#define ARM_DOT_K0XN0(k0, a, b, c) \ 5240*c217d954SCole Faust ({ \ 5241*c217d954SCole Faust CONCAT(ARM_DOT, k0) \ 5242*c217d954SCole Faust ((a), (b##0), (c.s0)); \ 5243*c217d954SCole Faust CONCAT(ARM_DOT, k0) \ 5244*c217d954SCole Faust ((a), (b##1), (c.s1)); \ 5245*c217d954SCole Faust }) 5246*c217d954SCole Faust#elif N0 == 3 5247*c217d954SCole Faust#define ARM_DOT_K0XN0(k0, a, b, c) \ 5248*c217d954SCole Faust ({ \ 5249*c217d954SCole Faust CONCAT(ARM_DOT, k0) \ 5250*c217d954SCole Faust ((a), (b##0), (c.s0)); \ 5251*c217d954SCole Faust CONCAT(ARM_DOT, k0) \ 5252*c217d954SCole Faust ((a), (b##1), (c.s1)); \ 5253*c217d954SCole Faust CONCAT(ARM_DOT, k0) \ 5254*c217d954SCole Faust ((a), (b##2), (c.s2)); \ 5255*c217d954SCole Faust }) 5256*c217d954SCole Faust#elif N0 == 4 5257*c217d954SCole Faust#define ARM_DOT_K0XN0(k0, a, b, c) \ 5258*c217d954SCole Faust ({ \ 5259*c217d954SCole Faust CONCAT(ARM_DOT, k0) \ 5260*c217d954SCole Faust ((a), (b##0), (c.s0)); \ 5261*c217d954SCole Faust CONCAT(ARM_DOT, k0) \ 5262*c217d954SCole Faust ((a), (b##1), (c.s1)); \ 5263*c217d954SCole Faust CONCAT(ARM_DOT, k0) \ 5264*c217d954SCole Faust ((a), (b##2), (c.s2)); \ 5265*c217d954SCole Faust CONCAT(ARM_DOT, k0) \ 5266*c217d954SCole Faust ((a), (b##3), (c.s3)); \ 5267*c217d954SCole Faust }) 5268*c217d954SCole Faust#elif N0 == 8 5269*c217d954SCole Faust#define ARM_DOT_K0XN0(k0, a, b, c) \ 5270*c217d954SCole Faust ({ \ 
CONCAT(ARM_DOT, k0)((a), (b##0), (c.s0)); \
        CONCAT(ARM_DOT, k0)((a), (b##1), (c.s1)); \
        CONCAT(ARM_DOT, k0)((a), (b##2), (c.s2)); \
        CONCAT(ARM_DOT, k0)((a), (b##3), (c.s3)); \
        CONCAT(ARM_DOT, k0)((a), (b##4), (c.s4)); \
        CONCAT(ARM_DOT, k0)((a), (b##5), (c.s5)); \
        CONCAT(ARM_DOT, k0)((a), (b##6), (c.s6)); \
        CONCAT(ARM_DOT, k0)((a), (b##7), (c.s7)); \
    })
#elif N0 == 16
/* N0 == 16 variant: invokes the ARM_DOT<k0> helper sixteen times, once per
 * RHS vector b0..bF, each time handing it the matching component (s0..sF)
 * of the accumulator c.
 * NOTE(review): ARM_DOT<k0> is defined outside this section; presumably it
 * accumulates the k0-element dot product of a and b<n> into its third
 * argument - confirm against its definition.
 */
#define ARM_DOT_K0XN0(k0, a, b, c)                \
    ({                                            \
        CONCAT(ARM_DOT, k0)((a), (b##0), (c.s0)); \
        CONCAT(ARM_DOT, k0)((a), (b##1), (c.s1)); \
        CONCAT(ARM_DOT, k0)((a), (b##2), (c.s2)); \
        CONCAT(ARM_DOT, k0)((a), (b##3), (c.s3)); \
        CONCAT(ARM_DOT, k0)((a), (b##4), (c.s4)); \
        CONCAT(ARM_DOT, k0)((a), (b##5), (c.s5)); \
        CONCAT(ARM_DOT, k0)((a), (b##6), (c.s6)); \
        CONCAT(ARM_DOT, k0)((a), (b##7), (c.s7)); \
        CONCAT(ARM_DOT, k0)((a), (b##8), (c.s8)); \
        CONCAT(ARM_DOT, k0)((a), (b##9), (c.s9)); \
        CONCAT(ARM_DOT, k0)((a), (b##A), (c.sA)); \
        CONCAT(ARM_DOT, k0)((a), (b##B), (c.sB)); \
        CONCAT(ARM_DOT, k0)((a), (b##C), (c.sC)); \
        CONCAT(ARM_DOT, k0)((a), (b##D), (c.sD)); \
        CONCAT(ARM_DOT, k0)((a), (b##E), (c.sE)); \
        CONCAT(ARM_DOT, k0)((a), (b##F), (c.sF)); \
    })
#else
#error "N0 value not supported"
#endif

#if defined(GEMM_MM_RESHAPED_ONLY_RHS_T)

/** GEMM kernel reading the RHS operand from a buffer.
 *
 * Each work-item keeps M0 accumulators c0..c<M0-1>, each an N0-wide vector of
 * DATA_TYPE, and walks the K dimension in steps of K0 followed by a
 * one-element-at-a-time tail loop. The accumulators are then optionally
 * passed through SCALE_BLOCK with ALPHA, combined with a bias block (BETA),
 * passed through ACTIVATION_BLOCK, and written with STORE_BLOCK_BOUNDARY_AWARE.
 * NOTE(review): the kernel name suggests the RHS is pre-reshaped and
 * transposed while the LHS is not - confirm against the host-side configuration.
 *
 * Compile-time symbols read here: M0, N0, K0, H0, DATA_TYPE, PARTIAL_STORE_M0,
 * PARTIAL_STORE_N0 and, optionally, RHS_INTERLEAVE, MATRIX_B_DEPTH,
 * DUMMY_WORK_ITEMS, REINTERPRET_INPUT_AS_3D, REINTERPRET_OUTPUT_AS_3D
 * (with HEIGHT_GEMM3D / DEPTH_GEMM3D), ALPHA, BETA, UNIT_BETA, BROADCAST_BIAS,
 * ACTIVATION_TYPE (with A_VAL / B_VAL).
 *
 * @param lhs                 LHS tensor arguments (expanded by IMAGE_DECLARATION)
 * @param rhs                 RHS tensor arguments (expanded by IMAGE_DECLARATION)
 * @param bias                Bias tensor arguments; only present when BETA is defined
 * @param dst                 Destination tensor arguments (expanded by IMAGE_DECLARATION)
 * @param lhs_stride_z        Multiplied by the z work-item id to offset into the LHS
 * @param rhs_stride_z        Multiplied by the z work-item id to offset into the RHS
 * @param bias_stride_z       Multiplied by the z work-item id to offset into the bias; only when BETA is defined
 * @param dst_stride_z        Multiplied by the z work-item id to offset into the destination
 * @param lhs_cross_plane_pad Forwarded to CALCULATE_Z_OFFSET; only when REINTERPRET_INPUT_AS_3D is defined
 * @param dst_cross_plane_pad Forwarded to CALCULATE_Z_OFFSET; only when REINTERPRET_OUTPUT_AS_3D is defined
 * @param M                   Upper bound compared against y * M0
 * @param N                   Upper bound compared against x * N0
 * @param K                   Length of the accumulation dimension
 */
__kernel void gemm_mm_reshaped_only_rhs_t(IMAGE_DECLARATION(lhs),
                                          IMAGE_DECLARATION(rhs),
#if defined(BETA)
                                          IMAGE_DECLARATION(bias),
#endif
                                          IMAGE_DECLARATION(dst),
                                          uint lhs_stride_z,
                                          uint rhs_stride_z,
#if defined(BETA)
                                          uint bias_stride_z,
#endif
                                          uint dst_stride_z,
#if defined(REINTERPRET_INPUT_AS_3D)
                                          uint lhs_cross_plane_pad,
#endif
#if defined(REINTERPRET_OUTPUT_AS_3D)
                                          uint dst_cross_plane_pad,
#endif
                                          const int M,
                                          const int N,
                                          const int K)
{
    // Kernel-local helper macros (all #undef-ed before the closing brace).
#define RHS_BLOCK_SIZE ((K0) * (N0))

#if defined(RHS_INTERLEAVE)
#define RHS_OFFSET_X (K0)
#define RHS_STEP_X ((K0) * (H0))
#define RHS_STEP_LOOP (1)
#else
#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
#define RHS_STEP_X (K0)
#define RHS_STEP_LOOP (H0)
#endif

    uint x = get_global_id(0);
    uint y = get_global_id(1);
    uint z = get_global_id(2);

    // Flags forwarded to the *_BOUNDARY_AWARE load/store macros.
    const bool cond_y = (y == 0);
    const bool cond_x = ((x + 1) * N0 >= N);

#if defined(DUMMY_WORK_ITEMS)
    // Work-items that start at or beyond N (columns) or M (rows) do nothing.
    if((x * N0 >= N) || (y * M0 >= M))
    {
        return;
    }
#endif

    // Byte offset of this work-item's first LHS row.
    uint lhs_offset = lhs_offset_first_element_in_bytes
                      + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;

    // Byte offset of this work-item's RHS block: (x % H0) selects the position
    // inside a row, (x / H0) selects the row.
    uint rhs_offset = rhs_offset_first_element_in_bytes
                      + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE)
                      + (x / (uint)H0) * rhs_stride_y;

#if defined(MATRIX_B_DEPTH)
    // The z index wraps around MATRIX_B_DEPTH for the RHS.
    rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;
#else
    rhs_offset += z * rhs_stride_z;
#endif

    // Per-row extra offsets, all starting at zero.
    REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0);
    REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);

#if defined(REINTERPRET_INPUT_AS_3D)
    CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0),
                       HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);

    lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
#else
    lhs_offset += z * lhs_stride_z;
#endif

    // Accumulators c0..c<M0-1>, zero-initialised.
    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0);

    // Main loop: K0 elements of K per iteration. i is declared outside
    // because the tail loop below continues from where this one stops.
    int i = 0;
    for(; i <= (K - K0); i += K0)
    {
        LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
        LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_ptr, rhs_offset, RHS_STEP_X * sizeof(DATA_TYPE), zero);

        ARM_DOT_K0XN0(K0, a0, b, c0);
#if M0 > 1
        ARM_DOT_K0XN0(K0, a1, b, c1);
#endif
#if M0 > 2
        ARM_DOT_K0XN0(K0, a2, b, c2);
#endif
#if M0 > 3
        ARM_DOT_K0XN0(K0, a3, b, c3);
#endif
#if M0 > 4
        ARM_DOT_K0XN0(K0, a4, b, c4);
#endif
#if M0 > 5
        ARM_DOT_K0XN0(K0, a5, b, c5);
#endif
#if M0 > 6
        ARM_DOT_K0XN0(K0, a6, b, c6);
#endif
#if M0 > 7
        ARM_DOT_K0XN0(K0, a7, b, c7);
#endif

        lhs_offset += K0 * sizeof(DATA_TYPE);
        rhs_offset += (N0 * RHS_STEP_X * RHS_STEP_LOOP) * sizeof(DATA_TYPE);
    }

    // Tail loop: the remaining (fewer than K0) elements, one at a time.
    for(; i < K; ++i)
    {
        LOAD_BLOCK(M0, 1, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
        LOAD_BLOCK(N0, 1, DATA_TYPE, b, rhs_ptr, rhs_offset, RHS_STEP_X * sizeof(DATA_TYPE), zero);

        ARM_DOT_K0XN0(1, a0, b, c0);
#if M0 > 1
        ARM_DOT_K0XN0(1, a1, b, c1);
#endif
#if M0 > 2
        ARM_DOT_K0XN0(1, a2, b, c2);
#endif
#if M0 > 3
        ARM_DOT_K0XN0(1, a3, b, c3);
#endif
#if M0 > 4
        ARM_DOT_K0XN0(1, a4, b, c4);
#endif
#if M0 > 5
        ARM_DOT_K0XN0(1, a5, b, c5);
#endif
#if M0 > 6
        ARM_DOT_K0XN0(1, a6, b, c6);
#endif
#if M0 > 7
        ARM_DOT_K0XN0(1, a7, b, c7);
#endif

        lhs_offset += sizeof(DATA_TYPE);
        rhs_offset += sizeof(DATA_TYPE);
    }

    // Address of this work-item's first destination element.
    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes
                               + (x * (uint)N0 * sizeof(DATA_TYPE))
                               + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);

    REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0);

#if defined(REINTERPRET_OUTPUT_AS_3D)
    CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0),
                       HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);

    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
#else
    dst_addr += z * dst_stride_z;
#endif

#if defined(ALPHA)
    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
#endif

#if defined(BETA) && defined(BROADCAST_BIAS)
    // Broadcast bias: a single row (bias0) is loaded and added to every accumulator row.
    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes
                                + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));

    LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x);

#if !defined(UNIT_BETA)
    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
#endif

    ADD_BLOCK_BROADCAST(M0, c, bias0);
#elif defined(BETA)
    // Full bias: an M0 x N0 block addressed like the destination block.
    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes
                                + (x * (uint)N0 * sizeof(DATA_TYPE))
                                + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y)
                                + z * bias_stride_z;

    LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);

#if !defined(UNIT_BETA)
    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
#endif

    ADD_BLOCK(M0, c, bias);
#endif

#if defined(ACTIVATION_TYPE)
    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, N0, c, A_VAL, B_VAL);
#endif

    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);

#undef RHS_BLOCK_SIZE
#undef RHS_OFFSET_X
#undef RHS_STEP_X
#undef RHS_STEP_LOOP
}
#endif

#if defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_ONLY_RHS_T_TEXTURE)

/** Variant of gemm_mm_reshaped_only_rhs_t that reads the RHS operand from a
 *  read-only image2d_t through LOAD_TEXTURE2D instead of from a buffer.
 *
 * The structure matches the buffer kernel, with two visible differences:
 * RHS coordinates (x_rhs, y_rhs) are expressed in PIXEL_UNIT steps rather
 * than bytes, and the K % K0 leftover is handled by one extra K0-wide step
 * whose LHS vectors are zero-initialised and only partially filled.
 *
 * @param lhs                 LHS tensor arguments (expanded by IMAGE_DECLARATION)
 * @param rhs_img             RHS operand as a read-only 2D image
 * @param bias                Bias tensor arguments; only present when BETA is defined
 * @param dst                 Destination tensor arguments (expanded by IMAGE_DECLARATION)
 * @param lhs_stride_z        Multiplied by the z work-item id to offset into the LHS
 * @param rhs_stride_z        Not referenced in this kernel body
 * @param bias_stride_z       Multiplied by the z work-item id to offset into the bias; only when BETA is defined
 * @param dst_stride_z        Multiplied by the z work-item id to offset into the destination
 * @param lhs_cross_plane_pad Forwarded to CALCULATE_Z_OFFSET; only when REINTERPRET_INPUT_AS_3D is defined
 * @param dst_cross_plane_pad Forwarded to CALCULATE_Z_OFFSET; only when REINTERPRET_OUTPUT_AS_3D is defined
 * @param M                   Upper bound compared against y * M0
 * @param N                   Upper bound compared against x * N0
 * @param K                   Length of the accumulation dimension
 */
__kernel void gemm_mm_reshaped_only_rhs_t_texture(IMAGE_DECLARATION(lhs),
                                                  __read_only image2d_t rhs_img,
#if defined(BETA)
                                                  IMAGE_DECLARATION(bias),
#endif
                                                  IMAGE_DECLARATION(dst),
                                                  uint lhs_stride_z,
                                                  uint rhs_stride_z,
#if defined(BETA)
                                                  uint bias_stride_z,
#endif
                                                  uint dst_stride_z,
#if defined(REINTERPRET_INPUT_AS_3D)
                                                  uint lhs_cross_plane_pad,
#endif
#if defined(REINTERPRET_OUTPUT_AS_3D)
                                                  uint dst_cross_plane_pad,
#endif
                                                  const int M,
                                                  const int N,
                                                  const int K)
{
    // Kernel-local helper macros (all #undef-ed before the closing brace).
#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(K0)
#define RHS_BLOCK_SIZE (PIXEL_UNIT * (N0))

#if defined(RHS_INTERLEAVE)
#define RHS_OFFSET_X (PIXEL_UNIT)
#define RHS_STEP_X (PIXEL_UNIT * (H0))
#define RHS_STEP_LOOP (1)
#else
#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
#define RHS_STEP_X PIXEL_UNIT
#define RHS_STEP_LOOP (H0)
#endif

    // Number of K elements left over after the K0-wide main loop.
    const uint LEFTOVER_K = K % K0;

    uint x = get_global_id(0);
    uint y = get_global_id(1);
    uint z = get_global_id(2);

    // Flags forwarded to the *_BOUNDARY_AWARE load/store macros.
    const bool cond_y = (y == 0);
    const bool cond_x = ((x + 1) * N0 >= N);

#if defined(DUMMY_WORK_ITEMS)
    // Work-items that start at or beyond N (columns) or M (rows) do nothing.
    if((x * N0 >= N) || (y * M0 >= M))
    {
        return;
    }
#endif

    // Byte offset of this work-item's first LHS row.
    uint lhs_offset = lhs_offset_first_element_in_bytes
                      + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;

#if defined(MATRIX_B_DEPTH)
    // The z index wraps around MATRIX_B_DEPTH for the RHS.
    const uint z_rhs = (get_global_id(2) % MATRIX_B_DEPTH);
#else
    const uint z_rhs = get_global_id(2);
#endif

    // Image coordinates of this work-item's RHS block; batches are stacked
    // along y in steps of RHS_HEIGHT.
    uint       x_rhs = (get_global_id(0) % H0) * (uint)RHS_OFFSET_X;
    const uint y_rhs = (get_global_id(0) / (uint)H0) + z_rhs * RHS_HEIGHT;

    // Per-row extra offsets, all starting at zero.
    REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0);
    REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);

#if defined(REINTERPRET_INPUT_AS_3D)
    CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0),
                       HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);

    lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
#else
    lhs_offset += z * lhs_stride_z;
#endif

    // Accumulators c0..c<M0-1>, zero-initialised.
    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0);

    // Main loop: K0 elements of K per iteration.
    int i = 0;
    for(; i <= (K - K0); i += K0)
    {
        LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);

        REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), b, 0);
        LOAD_TEXTURE2D(N0, PIXEL_UNIT, DATA_TYPE, b, rhs_img, x_rhs, y_rhs, RHS_STEP_X, 0);

        ARM_DOT_K0XN0(K0, a0, b, c0);
#if M0 > 1
        ARM_DOT_K0XN0(K0, a1, b, c1);
#endif
#if M0 > 2
        ARM_DOT_K0XN0(K0, a2, b, c2);
#endif
#if M0 > 3
        ARM_DOT_K0XN0(K0, a3, b, c3);
#endif
#if M0 > 4
        ARM_DOT_K0XN0(K0, a4, b, c4);
#endif
#if M0 > 5
        ARM_DOT_K0XN0(K0, a5, b, c5);
#endif
#if M0 > 6
        ARM_DOT_K0XN0(K0, a6, b, c6);
#endif
#if M0 > 7
        ARM_DOT_K0XN0(K0, a7, b, c7);
#endif

        lhs_offset += K0 * sizeof(DATA_TYPE);
        x_rhs += N0 * RHS_STEP_X * RHS_STEP_LOOP;
    }

    if(LEFTOVER_K != 0)
    {
        // Leftover step: the LHS vectors start as zero and only their first
        // LEFTOVER_K components are filled, so the unfilled components stay
        // zero in the full K0-wide ARM_DOT_K0XN0 below. The union gives
        // per-component (array) access to the vector.
        union UNION_VEC_TYPE
        {
            DATA_TYPE s[K0];
            VEC_DATA_TYPE(DATA_TYPE, K0) v;
        };

        union UNION_VEC_TYPE a0 = {.v = 0 };
#if M0 > 1
        union UNION_VEC_TYPE a1 = {.v = 0 };
#endif
#if M0 > 2
        union UNION_VEC_TYPE a2 = {.v = 0 };
#endif
#if M0 > 3
        union UNION_VEC_TYPE a3 = {.v = 0 };
#endif
#if M0 > 4
        union UNION_VEC_TYPE a4 = {.v = 0 };
#endif
#if M0 > 5
        union UNION_VEC_TYPE a5 = {.v = 0 };
#endif
#if M0 > 6
        union UNION_VEC_TYPE a6 = {.v = 0 };
#endif
#if M0 > 7
        union UNION_VEC_TYPE a7 = {.v = 0 };
#endif

        REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), b, 0);

        LOAD_TEXTURE2D(N0, PIXEL_UNIT, DATA_TYPE, b, rhs_img, x_rhs, y_rhs, RHS_STEP_X, 0);

        // Scalar loads of the remaining LHS elements, one column per iteration.
        for(int k = 0; k < LEFTOVER_K; ++k)
        {
            a0.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zlhs0);
#if M0 > 1
            a1.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zlhs1);
#endif
#if M0 > 2
            a2.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zlhs2);
#endif
#if M0 > 3
            a3.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zlhs3);
#endif
#if M0 > 4
            a4.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zlhs4);
#endif
#if M0 > 5
            a5.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zlhs5);
#endif
#if M0 > 6
            a6.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zlhs6);
#endif
#if M0 > 7
            a7.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zlhs7);
#endif

            lhs_offset += sizeof(DATA_TYPE);
        }

        ARM_DOT_K0XN0(K0, a0.v, b, c0);
#if M0 > 1
        ARM_DOT_K0XN0(K0, a1.v, b, c1);
#endif
#if M0 > 2
        ARM_DOT_K0XN0(K0, a2.v, b, c2);
#endif
#if M0 > 3
        ARM_DOT_K0XN0(K0, a3.v, b, c3);
#endif
#if M0 > 4
        ARM_DOT_K0XN0(K0, a4.v, b, c4);
#endif
#if M0 > 5
        ARM_DOT_K0XN0(K0, a5.v, b, c5);
#endif
#if M0 > 6
        ARM_DOT_K0XN0(K0, a6.v, b, c6);
#endif
#if M0 > 7
        ARM_DOT_K0XN0(K0, a7.v, b, c7);
#endif
    }

    // Address of this work-item's first destination element.
    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes
                               + (x * (uint)N0 * sizeof(DATA_TYPE))
                               + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);

    REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);

#if defined(REINTERPRET_OUTPUT_AS_3D)
    CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0),
                       HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);

    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
#else
    dst_addr += z * dst_stride_z;
#endif

#if defined(ALPHA)
    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
#endif

#if defined(BETA) && defined(BROADCAST_BIAS)
    // Broadcast bias: a single row (bias0) is loaded and added to every accumulator row.
    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes
                                + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));

    LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x);

#if !defined(UNIT_BETA)
    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
#endif

    ADD_BLOCK_BROADCAST(M0, c, bias0);
#elif defined(BETA)
    // Full bias: an M0 x N0 block addressed like the destination block.
    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes
                                + (x * (uint)N0 * sizeof(DATA_TYPE))
                                + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y)
                                + z * bias_stride_z;

    LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);

#if !defined(UNIT_BETA)
    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
#endif

    ADD_BLOCK(M0, c, bias);
#endif

#if defined(ACTIVATION_TYPE)
    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, N0, c, A_VAL, B_VAL);
#endif

    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);

#undef RHS_BLOCK_SIZE
#undef RHS_OFFSET_X
#undef RHS_STEP_X
#undef RHS_STEP_LOOP
#undef PIXEL_UNIT
}
#endif

/* Fused multiply-add into c: c = fma(a, b, c).
 * c is assigned to, so the third argument must be an lvalue.
 */
#define VFMA(a, b, c)     \
    ({                    \
        c = fma(a, b, c); \
    })

/* VFMA_M0xN0(i, a, b, c): for every row r in 0..M0-1, converts component s<i>
 * of a<r> to VEC_DATA_TYPE(DATA_TYPE, N0) and applies VFMA with b into c<r>.
 * One expansion is provided for each supported M0 (1..8); any other value is
 * rejected at preprocessing time.
 */
#if M0 == 1
#define VFMA_M0xN0(i, a, b, c)                                        \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
    })
#elif M0 == 2
#define VFMA_M0xN0(i, a, b, c)                                        \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
    })
#elif M0 == 3
#define VFMA_M0xN0(i, a, b, c)                                        \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
    })
#elif M0 == 4
#define VFMA_M0xN0(i, a, b, c)                                        \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
    })
#elif M0 == 5
#define VFMA_M0xN0(i, a, b, c)                                        \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
    })
#elif M0 == 6
#define VFMA_M0xN0(i, a, b, c)                                        \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
    })
#elif M0 == 7
#define VFMA_M0xN0(i, a, b, c)                                        \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
    })
#elif M0 == 8
#define VFMA_M0xN0(i, a, b, c)                                        \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##7).s##i), b, (c##7)); \
    })
#else
#error "M0 not supported"
#endif

#if defined(GEMM_MM_RESHAPED_ONLY_RHS_NT)

5944*c217d954SCole Faust__kernel void gemm_mm_reshaped_only_rhs_nt(IMAGE_DECLARATION(lhs), 5945*c217d954SCole Faust IMAGE_DECLARATION(rhs), 5946*c217d954SCole Faust#if defined(BETA) 5947*c217d954SCole Faust IMAGE_DECLARATION(bias), 5948*c217d954SCole Faust#endif 5949*c217d954SCole Faust IMAGE_DECLARATION(dst), 5950*c217d954SCole Faust uint lhs_stride_z, 5951*c217d954SCole Faust uint rhs_stride_z, 5952*c217d954SCole Faust#if defined(BETA) 5953*c217d954SCole Faust uint bias_stride_z, 5954*c217d954SCole Faust#endif 5955*c217d954SCole Faust uint dst_stride_z 5956*c217d954SCole Faust#if defined(REINTERPRET_INPUT_AS_3D) 5957*c217d954SCole Faust , 5958*c217d954SCole Faust uint lhs_cross_plane_pad 5959*c217d954SCole Faust#endif 5960*c217d954SCole Faust#if defined(REINTERPRET_OUTPUT_AS_3D) 5961*c217d954SCole Faust , 5962*c217d954SCole Faust uint dst_cross_plane_pad 5963*c217d954SCole Faust#endif 5964*c217d954SCole Faust , 5965*c217d954SCole Faust const int M, 5966*c217d954SCole Faust const int N, 5967*c217d954SCole Faust const int K) 5968*c217d954SCole Faust{ 5969*c217d954SCole Faust 5970*c217d954SCole Faust#define RHS_BLOCK_SIZE ((K0) * (N0)) 5971*c217d954SCole Faust 5972*c217d954SCole Faust 5973*c217d954SCole Faust#if defined(RHS_INTERLEAVE) 5974*c217d954SCole Faust#define RHS_OFFSET_X (N0) 5975*c217d954SCole Faust#define RHS_STEP_X ((N0) * (H0)) 5976*c217d954SCole Faust#define RHS_STEP_LOOP (1) 5977*c217d954SCole Faust#else 5978*c217d954SCole Faust#define RHS_OFFSET_X (RHS_BLOCK_SIZE) 5979*c217d954SCole Faust#define RHS_STEP_X (N0) 5980*c217d954SCole Faust#define RHS_STEP_LOOP (H0) 5981*c217d954SCole Faust#endif 5982*c217d954SCole Faust 5983*c217d954SCole Faust uint x = get_global_id(0); 5984*c217d954SCole Faust uint y = get_global_id(1); 5985*c217d954SCole Faust uint z = get_global_id(2); 5986*c217d954SCole Faust 5987*c217d954SCole Faust const bool cond_y = y == 0; 5988*c217d954SCole Faust const bool cond_x = ((x + 1) * N0 >= N); 5989*c217d954SCole Faust 
5990*c217d954SCole Faust#if defined(DUMMY_WORK_ITEMS) 5991*c217d954SCole Faust if((x * N0 >= N) || (y * M0 >= M)) 5992*c217d954SCole Faust { 5993*c217d954SCole Faust return; 5994*c217d954SCole Faust } 5995*c217d954SCole Faust#endif 5996*c217d954SCole Faust 5997*c217d954SCole Faust 5998*c217d954SCole Faust uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y; 5999*c217d954SCole Faust 6000*c217d954SCole Faust 6001*c217d954SCole Faust uint rhs_offset = rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x / (uint)H0) * rhs_stride_y; 6002*c217d954SCole Faust 6003*c217d954SCole Faust#if defined(MATRIX_B_DEPTH) 6004*c217d954SCole Faust 6005*c217d954SCole Faust rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z; 6006*c217d954SCole Faust#else 6007*c217d954SCole Faust rhs_offset += z * rhs_stride_z; 6008*c217d954SCole Faust#endif 6009*c217d954SCole Faust 6010*c217d954SCole Faust REPEAT_VAR_INIT_TO_CONST(8, uint, zin, 0); 6011*c217d954SCole Faust REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0); 6012*c217d954SCole Faust 6013*c217d954SCole Faust#if defined(REINTERPRET_INPUT_AS_3D) 6014*c217d954SCole Faust 6015*c217d954SCole Faust 6016*c217d954SCole Faust CALCULATE_Z_OFFSET(M0, uint, zin, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y); 6017*c217d954SCole Faust 6018*c217d954SCole Faust 6019*c217d954SCole Faust 6020*c217d954SCole Faust lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D; 6021*c217d954SCole Faust 6022*c217d954SCole Faust#else 6023*c217d954SCole Faust 6024*c217d954SCole Faust 6025*c217d954SCole Faust lhs_offset += z * lhs_stride_z; 6026*c217d954SCole Faust 6027*c217d954SCole Faust#endif 6028*c217d954SCole Faust 6029*c217d954SCole Faust 6030*c217d954SCole Faust REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); 6031*c217d954SCole Faust 6032*c217d954SCole Faust int i = 0; 6033*c217d954SCole Faust 
for(; i <= (K - K0); i += K0) 6034*c217d954SCole Faust { 6035*c217d954SCole Faust 6036*c217d954SCole Faust 6037*c217d954SCole Faust 6038*c217d954SCole Faust 6039*c217d954SCole Faust 6040*c217d954SCole Faust 6041*c217d954SCole Faust 6042*c217d954SCole Faust 6043*c217d954SCole Faust 6044*c217d954SCole Faust 6045*c217d954SCole Faust LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zin); 6046*c217d954SCole Faust 6047*c217d954SCole Faust VEC_DATA_TYPE(DATA_TYPE, N0) 6048*c217d954SCole Faust b0; 6049*c217d954SCole Faust 6050*c217d954SCole Faust b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * RHS_STEP_X * sizeof(DATA_TYPE))); 6051*c217d954SCole Faust VFMA_M0xN0(0, a, b0, c); 6052*c217d954SCole Faust b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 1 * RHS_STEP_X * sizeof(DATA_TYPE))); 6053*c217d954SCole Faust VFMA_M0xN0(1, a, b0, c); 6054*c217d954SCole Faust#if K0 > 2 6055*c217d954SCole Faust b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 2 * RHS_STEP_X * sizeof(DATA_TYPE))); 6056*c217d954SCole Faust VFMA_M0xN0(2, a, b0, c); 6057*c217d954SCole Faust#endif 6058*c217d954SCole Faust#if K0 > 3 6059*c217d954SCole Faust b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 3 * RHS_STEP_X * sizeof(DATA_TYPE))); 6060*c217d954SCole Faust VFMA_M0xN0(3, a, b0, c); 6061*c217d954SCole Faust#endif 6062*c217d954SCole Faust#if K0 > 4 6063*c217d954SCole Faust b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 4 * RHS_STEP_X * sizeof(DATA_TYPE))); 6064*c217d954SCole Faust VFMA_M0xN0(4, a, b0, c); 6065*c217d954SCole Faust b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 5 * RHS_STEP_X * sizeof(DATA_TYPE))); 6066*c217d954SCole Faust VFMA_M0xN0(5, a, b0, c); 6067*c217d954SCole Faust b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 6 * RHS_STEP_X * sizeof(DATA_TYPE))); 6068*c217d954SCole Faust VFMA_M0xN0(6, a, b0, c); 6069*c217d954SCole Faust b0 = VLOAD(N0)(0, (__global 
DATA_TYPE *)(rhs_ptr + rhs_offset + 7 * RHS_STEP_X * sizeof(DATA_TYPE))); 6070*c217d954SCole Faust VFMA_M0xN0(7, a, b0, c); 6071*c217d954SCole Faust#endif 6072*c217d954SCole Faust#if K0 > 8 6073*c217d954SCole Faust b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 8 * RHS_STEP_X * sizeof(DATA_TYPE))); 6074*c217d954SCole Faust VFMA_M0xN0(8, a, b0, c); 6075*c217d954SCole Faust b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 9 * RHS_STEP_X * sizeof(DATA_TYPE))); 6076*c217d954SCole Faust VFMA_M0xN0(9, a, b0, c); 6077*c217d954SCole Faust b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 10 * RHS_STEP_X * sizeof(DATA_TYPE))); 6078*c217d954SCole Faust VFMA_M0xN0(A, a, b0, c); 6079*c217d954SCole Faust b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 11 * RHS_STEP_X * sizeof(DATA_TYPE))); 6080*c217d954SCole Faust VFMA_M0xN0(B, a, b0, c); 6081*c217d954SCole Faust b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 12 * RHS_STEP_X * sizeof(DATA_TYPE))); 6082*c217d954SCole Faust VFMA_M0xN0(C, a, b0, c); 6083*c217d954SCole Faust b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 13 * RHS_STEP_X * sizeof(DATA_TYPE))); 6084*c217d954SCole Faust VFMA_M0xN0(D, a, b0, c); 6085*c217d954SCole Faust b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 14 * RHS_STEP_X * sizeof(DATA_TYPE))); 6086*c217d954SCole Faust VFMA_M0xN0(E, a, b0, c); 6087*c217d954SCole Faust b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 15 * RHS_STEP_X * sizeof(DATA_TYPE))); 6088*c217d954SCole Faust VFMA_M0xN0(F, a, b0, c); 6089*c217d954SCole Faust#endif 6090*c217d954SCole Faust 6091*c217d954SCole Faust lhs_offset += K0 * sizeof(DATA_TYPE); 6092*c217d954SCole Faust rhs_offset += K0 * RHS_STEP_X * RHS_STEP_LOOP * sizeof(DATA_TYPE); 6093*c217d954SCole Faust } 6094*c217d954SCole Faust 6095*c217d954SCole Faust 6096*c217d954SCole Faust for(; i < K; ++i) 6097*c217d954SCole Faust { 6098*c217d954SCole Faust 
6099*c217d954SCole Faust VEC_DATA_TYPE(DATA_TYPE, 2) 6100*c217d954SCole Faust a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zin0)); 6101*c217d954SCole Faust#if M0 > 1 6102*c217d954SCole Faust VEC_DATA_TYPE(DATA_TYPE, 2) 6103*c217d954SCole Faust a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zin1)); 6104*c217d954SCole Faust#endif 6105*c217d954SCole Faust#if M0 > 2 6106*c217d954SCole Faust VEC_DATA_TYPE(DATA_TYPE, 2) 6107*c217d954SCole Faust a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zin2)); 6108*c217d954SCole Faust#endif 6109*c217d954SCole Faust#if M0 > 3 6110*c217d954SCole Faust VEC_DATA_TYPE(DATA_TYPE, 2) 6111*c217d954SCole Faust a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zin3)); 6112*c217d954SCole Faust#endif 6113*c217d954SCole Faust#if M0 > 4 6114*c217d954SCole Faust VEC_DATA_TYPE(DATA_TYPE, 2) 6115*c217d954SCole Faust a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zin4)); 6116*c217d954SCole Faust#endif 6117*c217d954SCole Faust#if M0 > 5 6118*c217d954SCole Faust VEC_DATA_TYPE(DATA_TYPE, 2) 6119*c217d954SCole Faust a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zin5)); 6120*c217d954SCole Faust#endif 6121*c217d954SCole Faust#if M0 > 6 6122*c217d954SCole Faust VEC_DATA_TYPE(DATA_TYPE, 2) 6123*c217d954SCole Faust a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zin6)); 6124*c217d954SCole Faust#endif 6125*c217d954SCole Faust#if M0 > 7 6126*c217d954SCole Faust VEC_DATA_TYPE(DATA_TYPE, 2) 6127*c217d954SCole Faust a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zin7)); 6128*c217d954SCole Faust#endif 6129*c217d954SCole Faust 6130*c217d954SCole Faust VEC_DATA_TYPE(DATA_TYPE, N0) 6131*c217d954SCole Faust b0; 6132*c217d954SCole Faust 6133*c217d954SCole Faust b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * RHS_STEP_X * sizeof(DATA_TYPE))); 
6134*c217d954SCole Faust VFMA_M0xN0(0, a, b0, c); 6135*c217d954SCole Faust 6136*c217d954SCole Faust lhs_offset += sizeof(DATA_TYPE); 6137*c217d954SCole Faust rhs_offset += RHS_STEP_X * sizeof(DATA_TYPE); 6138*c217d954SCole Faust } 6139*c217d954SCole Faust 6140*c217d954SCole Faust __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y); 6141*c217d954SCole Faust 6142*c217d954SCole Faust REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); 6143*c217d954SCole Faust 6144*c217d954SCole Faust#if defined(REINTERPRET_OUTPUT_AS_3D) 6145*c217d954SCole Faust 6146*c217d954SCole Faust CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y); 6147*c217d954SCole Faust 6148*c217d954SCole Faust 6149*c217d954SCole Faust 6150*c217d954SCole Faust dst_addr += z * dst_stride_z * DEPTH_GEMM3D; 6151*c217d954SCole Faust 6152*c217d954SCole Faust#else 6153*c217d954SCole Faust 6154*c217d954SCole Faust 6155*c217d954SCole Faust dst_addr += z * dst_stride_z; 6156*c217d954SCole Faust 6157*c217d954SCole Faust#endif 6158*c217d954SCole Faust 6159*c217d954SCole Faust 6160*c217d954SCole Faust#if defined(ALPHA) 6161*c217d954SCole Faust SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA); 6162*c217d954SCole Faust#endif 6163*c217d954SCole Faust 6164*c217d954SCole Faust 6165*c217d954SCole Faust#if defined(BETA) 6166*c217d954SCole Faust#if defined(BROADCAST_BIAS) 6167*c217d954SCole Faust __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)); 6168*c217d954SCole Faust 6169*c217d954SCole Faust LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x); 6170*c217d954SCole Faust 6171*c217d954SCole Faust#ifndef UNIT_BETA 6172*c217d954SCole Faust SCALE_BLOCK(1, DATA_TYPE, bias, BETA); 6173*c217d954SCole 
Faust#endif 6174*c217d954SCole Faust 6175*c217d954SCole Faust 6176*c217d954SCole Faust ADD_BLOCK_BROADCAST(M0, c, bias0); 6177*c217d954SCole Faust 6178*c217d954SCole Faust#else 6179*c217d954SCole Faust __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z; 6180*c217d954SCole Faust 6181*c217d954SCole Faust LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); 6182*c217d954SCole Faust 6183*c217d954SCole Faust#ifndef UNIT_BETA 6184*c217d954SCole Faust SCALE_BLOCK(M0, DATA_TYPE, bias, BETA); 6185*c217d954SCole Faust#endif 6186*c217d954SCole Faust 6187*c217d954SCole Faust 6188*c217d954SCole Faust ADD_BLOCK(M0, c, bias); 6189*c217d954SCole Faust 6190*c217d954SCole Faust#endif 6191*c217d954SCole Faust#endif 6192*c217d954SCole Faust 6193*c217d954SCole Faust#if defined(ACTIVATION_TYPE) 6194*c217d954SCole Faust ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, N0, c, A_VAL, B_VAL); 6195*c217d954SCole Faust#endif 6196*c217d954SCole Faust 6197*c217d954SCole Faust 6198*c217d954SCole Faust STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); 6199*c217d954SCole Faust 6200*c217d954SCole Faust#undef RHS_BLOCK_SIZE 6201*c217d954SCole Faust#undef RHS_OFFSET_X 6202*c217d954SCole Faust#undef RHS_STEP_X 6203*c217d954SCole Faust#undef RHS_STEP_LOOP 6204*c217d954SCole Faust} 6205*c217d954SCole Faust#endif 6206*c217d954SCole Faust 6207*c217d954SCole Faust#if defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_ONLY_RHS_NT_TEXTURE) 6208*c217d954SCole Faust 6209*c217d954SCole Faust__kernel void gemm_mm_reshaped_only_rhs_nt_texture(IMAGE_DECLARATION(lhs), 6210*c217d954SCole Faust __read_only image2d_t rhs_img, 6211*c217d954SCole Faust#if defined(BETA) 6212*c217d954SCole Faust 
IMAGE_DECLARATION(bias), 6213*c217d954SCole Faust#endif 6214*c217d954SCole Faust IMAGE_DECLARATION(dst), 6215*c217d954SCole Faust uint lhs_stride_z, 6216*c217d954SCole Faust uint rhs_stride_z, 6217*c217d954SCole Faust#if defined(BETA) 6218*c217d954SCole Faust uint bias_stride_z, 6219*c217d954SCole Faust#endif 6220*c217d954SCole Faust uint dst_stride_z 6221*c217d954SCole Faust#if defined(REINTERPRET_INPUT_AS_3D) 6222*c217d954SCole Faust , 6223*c217d954SCole Faust uint lhs_cross_plane_pad 6224*c217d954SCole Faust#endif 6225*c217d954SCole Faust#if defined(REINTERPRET_OUTPUT_AS_3D) 6226*c217d954SCole Faust , 6227*c217d954SCole Faust uint dst_cross_plane_pad 6228*c217d954SCole Faust#endif 6229*c217d954SCole Faust , 6230*c217d954SCole Faust const int M, 6231*c217d954SCole Faust const int N, 6232*c217d954SCole Faust const int K) 6233*c217d954SCole Faust{ 6234*c217d954SCole Faust 6235*c217d954SCole Faust#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(N0) 6236*c217d954SCole Faust 6237*c217d954SCole Faust 6238*c217d954SCole Faust#define RHS_BLOCK_SIZE ((K0) * (PIXEL_UNIT)) 6239*c217d954SCole Faust 6240*c217d954SCole Faust 6241*c217d954SCole Faust#if defined(RHS_INTERLEAVE) 6242*c217d954SCole Faust#define RHS_OFFSET_X (PIXEL_UNIT) 6243*c217d954SCole Faust#define RHS_STEP_X ((PIXEL_UNIT) * (H0)) 6244*c217d954SCole Faust#define RHS_STEP_LOOP 1 6245*c217d954SCole Faust#else 6246*c217d954SCole Faust#define RHS_OFFSET_X (RHS_BLOCK_SIZE) 6247*c217d954SCole Faust#define RHS_STEP_X (PIXEL_UNIT) 6248*c217d954SCole Faust#define RHS_STEP_LOOP (H0) 6249*c217d954SCole Faust#endif 6250*c217d954SCole Faust 6251*c217d954SCole Faust uint x = get_global_id(0); 6252*c217d954SCole Faust uint y = get_global_id(1); 6253*c217d954SCole Faust uint z = get_global_id(2); 6254*c217d954SCole Faust 6255*c217d954SCole Faust const bool cond_y = y == 0; 6256*c217d954SCole Faust const bool cond_x = ((x + 1) * N0 >= N); 6257*c217d954SCole Faust 6258*c217d954SCole Faust#if defined(DUMMY_WORK_ITEMS) 
6259*c217d954SCole Faust if((x * N0 >= N) || (y * M0 >= M)) 6260*c217d954SCole Faust { 6261*c217d954SCole Faust return; 6262*c217d954SCole Faust } 6263*c217d954SCole Faust#endif 6264*c217d954SCole Faust 6265*c217d954SCole Faust 6266*c217d954SCole Faust uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y; 6267*c217d954SCole Faust 6268*c217d954SCole Faust#if defined(MATRIX_B_DEPTH) 6269*c217d954SCole Faust 6270*c217d954SCole Faust const uint z_rhs = (z % MATRIX_B_DEPTH); 6271*c217d954SCole Faust#else 6272*c217d954SCole Faust const uint z_rhs = z; 6273*c217d954SCole Faust#endif 6274*c217d954SCole Faust 6275*c217d954SCole Faust 6276*c217d954SCole Faust uint x_rhs = (x % H0) * (uint)RHS_OFFSET_X; 6277*c217d954SCole Faust const uint y_rhs = (x / (uint)H0) + z_rhs * RHS_HEIGHT; 6278*c217d954SCole Faust 6279*c217d954SCole Faust REPEAT_VAR_INIT_TO_CONST(8, uint, zin, 0); 6280*c217d954SCole Faust REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0); 6281*c217d954SCole Faust 6282*c217d954SCole Faust#if defined(REINTERPRET_INPUT_AS_3D) 6283*c217d954SCole Faust 6284*c217d954SCole Faust 6285*c217d954SCole Faust CALCULATE_Z_OFFSET(M0, uint, zin, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y); 6286*c217d954SCole Faust 6287*c217d954SCole Faust 6288*c217d954SCole Faust 6289*c217d954SCole Faust lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D; 6290*c217d954SCole Faust 6291*c217d954SCole Faust#else 6292*c217d954SCole Faust 6293*c217d954SCole Faust 6294*c217d954SCole Faust lhs_offset += z * lhs_stride_z; 6295*c217d954SCole Faust 6296*c217d954SCole Faust#endif 6297*c217d954SCole Faust 6298*c217d954SCole Faust 6299*c217d954SCole Faust REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); 6300*c217d954SCole Faust 6301*c217d954SCole Faust int i = 0; 6302*c217d954SCole Faust for(; i <= (K - K0); i += K0) 6303*c217d954SCole Faust { 6304*c217d954SCole Faust 
6305*c217d954SCole Faust LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zin); 6306*c217d954SCole Faust 6307*c217d954SCole Faust VEC_DATA_TYPE(DATA_TYPE, N0) 6308*c217d954SCole Faust b0; 6309*c217d954SCole Faust 6310*c217d954SCole Faust b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 0 * RHS_STEP_X), (y_rhs)); 6311*c217d954SCole Faust VFMA_M0xN0(0, a, b0, c); 6312*c217d954SCole Faust b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 1 * RHS_STEP_X), (y_rhs)); 6313*c217d954SCole Faust VFMA_M0xN0(1, a, b0, c); 6314*c217d954SCole Faust#if K0 > 2 6315*c217d954SCole Faust b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 2 * RHS_STEP_X), (y_rhs)); 6316*c217d954SCole Faust VFMA_M0xN0(2, a, b0, c); 6317*c217d954SCole Faust#endif 6318*c217d954SCole Faust#if K0 > 3 6319*c217d954SCole Faust b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 3 * RHS_STEP_X), (y_rhs)); 6320*c217d954SCole Faust VFMA_M0xN0(3, a, b0, c); 6321*c217d954SCole Faust#endif 6322*c217d954SCole Faust#if K0 > 4 6323*c217d954SCole Faust b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 4 * RHS_STEP_X), (y_rhs)); 6324*c217d954SCole Faust VFMA_M0xN0(4, a, b0, c); 6325*c217d954SCole Faust b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 5 * RHS_STEP_X), (y_rhs)); 6326*c217d954SCole Faust VFMA_M0xN0(5, a, b0, c); 6327*c217d954SCole Faust b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 6 * RHS_STEP_X), (y_rhs)); 6328*c217d954SCole Faust VFMA_M0xN0(6, a, b0, c); 6329*c217d954SCole Faust b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 7 * RHS_STEP_X), (y_rhs)); 6330*c217d954SCole Faust VFMA_M0xN0(7, a, b0, c); 6331*c217d954SCole Faust#endif 6332*c217d954SCole Faust#if K0 > 8 6333*c217d954SCole Faust b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 8 * RHS_STEP_X), (y_rhs)); 6334*c217d954SCole Faust VFMA_M0xN0(8, a, b0, c); 6335*c217d954SCole Faust b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 9 
* RHS_STEP_X), (y_rhs)); 6336*c217d954SCole Faust VFMA_M0xN0(9, a, b0, c); 6337*c217d954SCole Faust b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 10 * RHS_STEP_X), (y_rhs)); 6338*c217d954SCole Faust VFMA_M0xN0(A, a, b0, c); 6339*c217d954SCole Faust b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 11 * RHS_STEP_X), (y_rhs)); 6340*c217d954SCole Faust VFMA_M0xN0(B, a, b0, c); 6341*c217d954SCole Faust b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 12 * RHS_STEP_X), (y_rhs)); 6342*c217d954SCole Faust VFMA_M0xN0(C, a, b0, c); 6343*c217d954SCole Faust b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 13 * RHS_STEP_X), (y_rhs)); 6344*c217d954SCole Faust VFMA_M0xN0(D, a, b0, c); 6345*c217d954SCole Faust b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 14 * RHS_STEP_X), (y_rhs)); 6346*c217d954SCole Faust VFMA_M0xN0(E, a, b0, c); 6347*c217d954SCole Faust b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 15 * RHS_STEP_X), (y_rhs)); 6348*c217d954SCole Faust VFMA_M0xN0(F, a, b0, c); 6349*c217d954SCole Faust#endif 6350*c217d954SCole Faust 6351*c217d954SCole Faust lhs_offset += K0 * sizeof(DATA_TYPE); 6352*c217d954SCole Faust x_rhs += K0 * RHS_STEP_X * RHS_STEP_LOOP; 6353*c217d954SCole Faust } 6354*c217d954SCole Faust 6355*c217d954SCole Faust 6356*c217d954SCole Faust for(; i < K; ++i) 6357*c217d954SCole Faust { 6358*c217d954SCole Faust 6359*c217d954SCole Faust VEC_DATA_TYPE(DATA_TYPE, 2) 6360*c217d954SCole Faust a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zin0)); 6361*c217d954SCole Faust#if M0 > 1 6362*c217d954SCole Faust VEC_DATA_TYPE(DATA_TYPE, 2) 6363*c217d954SCole Faust a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zin1)); 6364*c217d954SCole Faust#endif 6365*c217d954SCole Faust#if M0 > 2 6366*c217d954SCole Faust VEC_DATA_TYPE(DATA_TYPE, 2) 6367*c217d954SCole Faust a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zin2)); 6368*c217d954SCole 
Faust#endif 6369*c217d954SCole Faust#if M0 > 3 6370*c217d954SCole Faust VEC_DATA_TYPE(DATA_TYPE, 2) 6371*c217d954SCole Faust a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zin3)); 6372*c217d954SCole Faust#endif 6373*c217d954SCole Faust#if M0 > 4 6374*c217d954SCole Faust VEC_DATA_TYPE(DATA_TYPE, 2) 6375*c217d954SCole Faust a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zin4)); 6376*c217d954SCole Faust#endif 6377*c217d954SCole Faust#if M0 > 5 6378*c217d954SCole Faust VEC_DATA_TYPE(DATA_TYPE, 2) 6379*c217d954SCole Faust a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zin5)); 6380*c217d954SCole Faust#endif 6381*c217d954SCole Faust#if M0 > 6 6382*c217d954SCole Faust VEC_DATA_TYPE(DATA_TYPE, 2) 6383*c217d954SCole Faust a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zin6)); 6384*c217d954SCole Faust#endif 6385*c217d954SCole Faust#if M0 > 7 6386*c217d954SCole Faust VEC_DATA_TYPE(DATA_TYPE, 2) 6387*c217d954SCole Faust a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zin7)); 6388*c217d954SCole Faust#endif 6389*c217d954SCole Faust 6390*c217d954SCole Faust VEC_DATA_TYPE(DATA_TYPE, N0) 6391*c217d954SCole Faust b0; 6392*c217d954SCole Faust b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 0 * RHS_STEP_X), (y_rhs)); 6393*c217d954SCole Faust 6394*c217d954SCole Faust VFMA_M0xN0(0, a, b0, c); 6395*c217d954SCole Faust 6396*c217d954SCole Faust lhs_offset += sizeof(DATA_TYPE); 6397*c217d954SCole Faust x_rhs += RHS_STEP_X; 6398*c217d954SCole Faust } 6399*c217d954SCole Faust 6400*c217d954SCole Faust __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y); 6401*c217d954SCole Faust 6402*c217d954SCole Faust REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); 6403*c217d954SCole Faust 6404*c217d954SCole Faust#if defined(REINTERPRET_OUTPUT_AS_3D) 
6405*c217d954SCole Faust 6406*c217d954SCole Faust CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y); 6407*c217d954SCole Faust 6408*c217d954SCole Faust 6409*c217d954SCole Faust 6410*c217d954SCole Faust dst_addr += z * dst_stride_z * DEPTH_GEMM3D; 6411*c217d954SCole Faust 6412*c217d954SCole Faust#else 6413*c217d954SCole Faust 6414*c217d954SCole Faust 6415*c217d954SCole Faust dst_addr += z * dst_stride_z; 6416*c217d954SCole Faust 6417*c217d954SCole Faust#endif 6418*c217d954SCole Faust 6419*c217d954SCole Faust 6420*c217d954SCole Faust#if defined(ALPHA) 6421*c217d954SCole Faust SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA); 6422*c217d954SCole Faust#endif 6423*c217d954SCole Faust 6424*c217d954SCole Faust 6425*c217d954SCole Faust#if defined(BETA) 6426*c217d954SCole Faust#if defined(BROADCAST_BIAS) 6427*c217d954SCole Faust __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)); 6428*c217d954SCole Faust 6429*c217d954SCole Faust LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x); 6430*c217d954SCole Faust 6431*c217d954SCole Faust#ifndef UNIT_BETA 6432*c217d954SCole Faust SCALE_BLOCK(1, DATA_TYPE, bias, BETA); 6433*c217d954SCole Faust#endif 6434*c217d954SCole Faust 6435*c217d954SCole Faust 6436*c217d954SCole Faust ADD_BLOCK_BROADCAST(M0, c, bias0); 6437*c217d954SCole Faust 6438*c217d954SCole Faust#else 6439*c217d954SCole Faust __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z; 6440*c217d954SCole Faust 6441*c217d954SCole Faust LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); 6442*c217d954SCole Faust 6443*c217d954SCole 
Faust#ifndef UNIT_BETA 6444*c217d954SCole Faust SCALE_BLOCK(M0, DATA_TYPE, bias, BETA); 6445*c217d954SCole Faust#endif 6446*c217d954SCole Faust 6447*c217d954SCole Faust 6448*c217d954SCole Faust ADD_BLOCK(M0, c, bias); 6449*c217d954SCole Faust 6450*c217d954SCole Faust#endif 6451*c217d954SCole Faust#endif 6452*c217d954SCole Faust 6453*c217d954SCole Faust#if defined(ACTIVATION_TYPE) 6454*c217d954SCole Faust ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, N0, c, A_VAL, B_VAL); 6455*c217d954SCole Faust#endif 6456*c217d954SCole Faust 6457*c217d954SCole Faust 6458*c217d954SCole Faust STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); 6459*c217d954SCole Faust 6460*c217d954SCole Faust#undef RHS_BLOCK_SIZE 6461*c217d954SCole Faust#undef RHS_OFFSET_X 6462*c217d954SCole Faust#undef RHS_STEP_X 6463*c217d954SCole Faust#undef RHS_STEP_LOOP 6464*c217d954SCole Faust} 6465*c217d954SCole Faust#endif 6466*c217d954SCole Faust#endif 6467*c217d954SCole Faust 6468*c217d954SCole Faust#if defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(DATA_TYPE) && defined(DATA_TYPE_ACCUMULATOR) 6469*c217d954SCole Faust 6470*c217d954SCole Faust#if defined(MIXED_PRECISION) 6471*c217d954SCole Faust#if K0 == 2 6472*c217d954SCole Faust#define ARM_DOT_K0(a, b, c) \ 6473*c217d954SCole Faust ({ \ 6474*c217d954SCole Faust c += a.s0 * b.s0; \ 6475*c217d954SCole Faust c += a.s1 * b.s1; \ 6476*c217d954SCole Faust }) 6477*c217d954SCole Faust#elif K0 == 3 6478*c217d954SCole Faust#define ARM_DOT_K0(a, b, c) \ 6479*c217d954SCole Faust ({ \ 6480*c217d954SCole Faust c += a.s0 * b.s0; \ 6481*c217d954SCole Faust c += a.s1 * b.s1; \ 6482*c217d954SCole Faust c += a.s2 * b.s2; \ 6483*c217d954SCole Faust }) 6484*c217d954SCole Faust#elif K0 == 4 6485*c217d954SCole Faust#define ARM_DOT_K0(a, b, c) \ 6486*c217d954SCole Faust ({ \ 6487*c217d954SCole Faust c += a.s0 * b.s0; \ 6488*c217d954SCole Faust c += a.s1 * b.s1; 
\ 6489*c217d954SCole Faust c += a.s2 * b.s2; \ 6490*c217d954SCole Faust c += a.s3 * b.s3; \ 6491*c217d954SCole Faust }) 6492*c217d954SCole Faust#elif K0 == 8 6493*c217d954SCole Faust#define ARM_DOT_K0(a, b, c) \ 6494*c217d954SCole Faust ({ \ 6495*c217d954SCole Faust c += a.s0 * b.s0; \ 6496*c217d954SCole Faust c += a.s1 * b.s1; \ 6497*c217d954SCole Faust c += a.s2 * b.s2; \ 6498*c217d954SCole Faust c += a.s3 * b.s3; \ 6499*c217d954SCole Faust c += a.s4 * b.s4; \ 6500*c217d954SCole Faust c += a.s5 * b.s5; \ 6501*c217d954SCole Faust c += a.s6 * b.s6; \ 6502*c217d954SCole Faust c += a.s7 * b.s7; \ 6503*c217d954SCole Faust }) 6504*c217d954SCole Faust#elif K0 == 16 6505*c217d954SCole Faust#define ARM_DOT_K0(a, b, c) \ 6506*c217d954SCole Faust ({ \ 6507*c217d954SCole Faust c += a.s0 * b.s0; \ 6508*c217d954SCole Faust c += a.s1 * b.s1; \ 6509*c217d954SCole Faust c += a.s2 * b.s2; \ 6510*c217d954SCole Faust c += a.s3 * b.s3; \ 6511*c217d954SCole Faust c += a.s4 * b.s4; \ 6512*c217d954SCole Faust c += a.s5 * b.s5; \ 6513*c217d954SCole Faust c += a.s6 * b.s6; \ 6514*c217d954SCole Faust c += a.s7 * b.s7; \ 6515*c217d954SCole Faust c += a.s8 * b.s8; \ 6516*c217d954SCole Faust c += a.s9 * b.s9; \ 6517*c217d954SCole Faust c += a.sA * b.sA; \ 6518*c217d954SCole Faust c += a.sB * b.sB; \ 6519*c217d954SCole Faust c += a.sC * b.sC; \ 6520*c217d954SCole Faust c += a.sD * b.sD; \ 6521*c217d954SCole Faust c += a.sE * b.sE; \ 6522*c217d954SCole Faust c += a.sF * b.sF; \ 6523*c217d954SCole Faust }) 6524*c217d954SCole Faust#else 6525*c217d954SCole Faust#error "K0 value not supported" 6526*c217d954SCole Faust#endif 6527*c217d954SCole Faust#else 6528*c217d954SCole Faust#if K0 == 2 6529*c217d954SCole Faust#define ARM_DOT_K0(a, b, c) \ 6530*c217d954SCole Faust ({ \ 6531*c217d954SCole Faust c = fma(a.s0, b.s0, c); \ 6532*c217d954SCole Faust c = fma(a.s1, b.s1, c); \ 6533*c217d954SCole Faust }) 6534*c217d954SCole Faust#elif K0 == 3 6535*c217d954SCole Faust#define ARM_DOT_K0(a, b, c) \ 
6536*c217d954SCole Faust ({ \ 6537*c217d954SCole Faust c = fma(a.s0, b.s0, c); \ 6538*c217d954SCole Faust c = fma(a.s1, b.s1, c); \ 6539*c217d954SCole Faust c = fma(a.s2, b.s2, c); \ 6540*c217d954SCole Faust }) 6541*c217d954SCole Faust#elif K0 == 4 6542*c217d954SCole Faust#define ARM_DOT_K0(a, b, c) \ 6543*c217d954SCole Faust ({ \ 6544*c217d954SCole Faust c = fma(a.s0, b.s0, c); \ 6545*c217d954SCole Faust c = fma(a.s1, b.s1, c); \ 6546*c217d954SCole Faust c = fma(a.s2, b.s2, c); \ 6547*c217d954SCole Faust c = fma(a.s3, b.s3, c); \ 6548*c217d954SCole Faust }) 6549*c217d954SCole Faust#elif K0 == 8 6550*c217d954SCole Faust#define ARM_DOT_K0(a, b, c) \ 6551*c217d954SCole Faust ({ \ 6552*c217d954SCole Faust c = fma(a.s0, b.s0, c); \ 6553*c217d954SCole Faust c = fma(a.s1, b.s1, c); \ 6554*c217d954SCole Faust c = fma(a.s2, b.s2, c); \ 6555*c217d954SCole Faust c = fma(a.s3, b.s3, c); \ 6556*c217d954SCole Faust c = fma(a.s4, b.s4, c); \ 6557*c217d954SCole Faust c = fma(a.s5, b.s5, c); \ 6558*c217d954SCole Faust c = fma(a.s6, b.s6, c); \ 6559*c217d954SCole Faust c = fma(a.s7, b.s7, c); \ 6560*c217d954SCole Faust }) 6561*c217d954SCole Faust#elif K0 == 16 6562*c217d954SCole Faust#define ARM_DOT_K0(a, b, c) \ 6563*c217d954SCole Faust ({ \ 6564*c217d954SCole Faust c = fma(a.s0, b.s0, c); \ 6565*c217d954SCole Faust c = fma(a.s1, b.s1, c); \ 6566*c217d954SCole Faust c = fma(a.s2, b.s2, c); \ 6567*c217d954SCole Faust c = fma(a.s3, b.s3, c); \ 6568*c217d954SCole Faust c = fma(a.s4, b.s4, c); \ 6569*c217d954SCole Faust c = fma(a.s5, b.s5, c); \ 6570*c217d954SCole Faust c = fma(a.s6, b.s6, c); \ 6571*c217d954SCole Faust c = fma(a.s7, b.s7, c); \ 6572*c217d954SCole Faust c = fma(a.s8, b.s8, c); \ 6573*c217d954SCole Faust c = fma(a.s9, b.s9, c); \ 6574*c217d954SCole Faust c = fma(a.sA, b.sA, c); \ 6575*c217d954SCole Faust c = fma(a.sB, b.sB, c); \ 6576*c217d954SCole Faust c = fma(a.sC, b.sC, c); \ 6577*c217d954SCole Faust c = fma(a.sD, b.sD, c); \ 6578*c217d954SCole Faust c = fma(a.sE, 
b.sE, c); \ 6579*c217d954SCole Faust c = fma(a.sF, b.sF, c); \ 6580*c217d954SCole Faust }) 6581*c217d954SCole Faust#else 6582*c217d954SCole Faust#error "K0 value not supported" 6583*c217d954SCole Faust#endif 6584*c217d954SCole Faust#endif 6585*c217d954SCole Faust 6586*c217d954SCole Faust#if defined(ARM_DOT_K0XN0) 6587*c217d954SCole Faust#undef ARM_DOT_K0XN0 6588*c217d954SCole Faust#endif 6589*c217d954SCole Faust 6590*c217d954SCole Faust#if N0 == 2 6591*c217d954SCole Faust#define ARM_DOT_K0XN0(a, b, c) \ 6592*c217d954SCole Faust ({ \ 6593*c217d954SCole Faust ARM_DOT_K0((a), (b##0), (c.s0)); \ 6594*c217d954SCole Faust ARM_DOT_K0((a), (b##1), (c.s1)); \ 6595*c217d954SCole Faust }) 6596*c217d954SCole Faust#elif N0 == 3 6597*c217d954SCole Faust#define ARM_DOT_K0XN0(a, b, c) \ 6598*c217d954SCole Faust ({ \ 6599*c217d954SCole Faust ARM_DOT_K0((a), (b##0), (c.s0)); \ 6600*c217d954SCole Faust ARM_DOT_K0((a), (b##1), (c.s1)); \ 6601*c217d954SCole Faust ARM_DOT_K0((a), (b##2), (c.s2)); \ 6602*c217d954SCole Faust }) 6603*c217d954SCole Faust#elif N0 == 4 6604*c217d954SCole Faust#define ARM_DOT_K0XN0(a, b, c) \ 6605*c217d954SCole Faust ({ \ 6606*c217d954SCole Faust ARM_DOT_K0((a), (b##0), (c.s0)); \ 6607*c217d954SCole Faust ARM_DOT_K0((a), (b##1), (c.s1)); \ 6608*c217d954SCole Faust ARM_DOT_K0((a), (b##2), (c.s2)); \ 6609*c217d954SCole Faust ARM_DOT_K0((a), (b##3), (c.s3)); \ 6610*c217d954SCole Faust }) 6611*c217d954SCole Faust#elif N0 == 8 6612*c217d954SCole Faust#define ARM_DOT_K0XN0(a, b, c) \ 6613*c217d954SCole Faust ({ \ 6614*c217d954SCole Faust ARM_DOT_K0((a), (b##0), (c.s0)); \ 6615*c217d954SCole Faust ARM_DOT_K0((a), (b##1), (c.s1)); \ 6616*c217d954SCole Faust ARM_DOT_K0((a), (b##2), (c.s2)); \ 6617*c217d954SCole Faust ARM_DOT_K0((a), (b##3), (c.s3)); \ 6618*c217d954SCole Faust ARM_DOT_K0((a), (b##4), (c.s4)); \ 6619*c217d954SCole Faust ARM_DOT_K0((a), (b##5), (c.s5)); \ 6620*c217d954SCole Faust ARM_DOT_K0((a), (b##6), (c.s6)); \ 6621*c217d954SCole Faust ARM_DOT_K0((a), 
(b##7), (c.s7)); \
    })
#elif N0 == 16
// N0 == 16 variant: one ARM_DOT_K0 per RHS vector b0..bF, each accumulating into
// the matching component (.s0 .. .sF) of the accumulator vector c.
#define ARM_DOT_K0XN0(a, b, c)           \
    ({                                   \
        ARM_DOT_K0((a), (b##0), (c.s0)); \
        ARM_DOT_K0((a), (b##1), (c.s1)); \
        ARM_DOT_K0((a), (b##2), (c.s2)); \
        ARM_DOT_K0((a), (b##3), (c.s3)); \
        ARM_DOT_K0((a), (b##4), (c.s4)); \
        ARM_DOT_K0((a), (b##5), (c.s5)); \
        ARM_DOT_K0((a), (b##6), (c.s6)); \
        ARM_DOT_K0((a), (b##7), (c.s7)); \
        ARM_DOT_K0((a), (b##8), (c.s8)); \
        ARM_DOT_K0((a), (b##9), (c.s9)); \
        ARM_DOT_K0((a), (b##A), (c.sA)); \
        ARM_DOT_K0((a), (b##B), (c.sB)); \
        ARM_DOT_K0((a), (b##C), (c.sC)); \
        ARM_DOT_K0((a), (b##D), (c.sD)); \
        ARM_DOT_K0((a), (b##E), (c.sE)); \
        ARM_DOT_K0((a), (b##F), (c.sF)); \
    })
#else
#error "N0 value not supported"
#endif

#if defined(GEMM_MM_RESHAPED_LHS_NT_RHS_T)

/** GEMM kernel reading both the LHS and the RHS from buffers.
 *
 * Each work-item owns one M0 x N0 output block. It walks K in steps of K0; on every
 * step it loads M0 LHS vectors of K0 elements and N0 RHS vectors of K0 elements and
 * accumulates through ARM_DOT_K0XN0. After the loop the block is optionally scaled
 * by ALPHA, combined with a bias (BETA), passed through an activation
 * (ACTIVATION_TYPE) and stored with boundary-aware stores.
 *
 * Build-time macros read by the body: M0, N0, K0, V0, H0, DATA_TYPE,
 * DATA_TYPE_ACCUMULATOR, PARTIAL_STORE_M0, PARTIAL_STORE_N0 and, optionally,
 * LHS_INTERLEAVE, RHS_INTERLEAVE, DUMMY_WORK_ITEMS, MATRIX_B_DEPTH,
 * REINTERPRET_OUTPUT_AS_3D (with HEIGHT_GEMM3D, DEPTH_GEMM3D), ALPHA, BETA,
 * UNIT_BETA, BROADCAST_BIAS, MIXED_PRECISION, ACTIVATION_TYPE (with A_VAL, B_VAL).
 *
 * @param lhs                 LHS buffer arguments (the body uses lhs_ptr, lhs_stride_y,
 *                            lhs_offset_first_element_in_bytes)
 * @param rhs                 RHS buffer arguments (the body uses rhs_ptr, rhs_stride_y,
 *                            rhs_offset_first_element_in_bytes)
 * @param bias                Bias buffer arguments; present only when BETA is defined
 * @param dst                 Destination buffer arguments (the body uses dst_ptr,
 *                            dst_stride_y, dst_offset_first_element_in_bytes)
 * @param lhs_stride_z        Byte offset applied per get_global_id(2) on the LHS
 * @param rhs_stride_z        Byte offset applied per get_global_id(2) on the RHS
 *                            (index taken modulo MATRIX_B_DEPTH when that is defined)
 * @param bias_stride_z       Byte offset applied per get_global_id(2) on the bias;
 *                            present only when BETA is defined
 * @param dst_stride_z        Byte offset applied per get_global_id(2) on the destination
 * @param dst_cross_plane_pad Forwarded to CALCULATE_Z_OFFSET; present only when
 *                            REINTERPRET_OUTPUT_AS_3D is defined
 * @param M                   Extent compared against get_global_id(1) * M0
 * @param N                   Extent compared against get_global_id(0) * N0
 * @param K                   Accumulation loop bound
 */
__kernel void gemm_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs),
                                            IMAGE_DECLARATION(rhs),
#if defined(BETA)
                                            IMAGE_DECLARATION(bias),
#endif
                                            IMAGE_DECLARATION(dst),
                                            uint lhs_stride_z,
                                            uint rhs_stride_z,
#if defined(BETA)
                                            uint bias_stride_z,
#endif
                                            uint dst_stride_z
#if defined(REINTERPRET_OUTPUT_AS_3D)
                                            ,
                                            uint dst_cross_plane_pad
#endif
                                            ,
                                            const int M,
                                            const int N,
                                            const int K)
{
    // Number of elements in one LHS block
#define LHS_BLOCK_SIZE ((K0) * (M0))

    // Element offsets/steps used to walk the LHS; they depend on whether blocks are interleaved
#if defined(LHS_INTERLEAVE)
#define LHS_OFFSET_X (K0)
#define LHS_STEP_X ((K0) * (V0))
#define LHS_STEP_LOOP (1)
#else
#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
#define LHS_STEP_X (K0)
#define LHS_STEP_LOOP (V0)
#endif

    // Number of elements in one RHS block
#define RHS_BLOCK_SIZE ((K0) * (N0))

    // Element offsets/steps used to walk the RHS; they depend on whether blocks are interleaved
#if defined(RHS_INTERLEAVE)
#define RHS_OFFSET_X (K0)
#define RHS_STEP_X ((K0) * (H0))
#define RHS_STEP_LOOP (1)
#else
#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
#define RHS_STEP_X (K0)
#define RHS_STEP_LOOP (H0)
#endif

#if defined(DUMMY_WORK_ITEMS)
    // Work-items whose block starts at or beyond N or M have nothing to compute
    if((get_global_id(0) * N0 >= N) || (get_global_id(1) * M0 >= M))
    {
        return;
    }
#endif

    // Byte address of the first LHS vector for this work-item
    __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (get_global_id(1) % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (get_global_id(1) / V0) * (uint)lhs_stride_y +
                               (get_global_id(2) * lhs_stride_z);

    // Byte address of the first RHS vector for this work-item
    __global uchar *rhs_addr = rhs_ptr + rhs_offset_first_element_in_bytes + (get_global_id(0) % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (get_global_id(0) / (uint)H0) * rhs_stride_y;

#if defined(MATRIX_B_DEPTH)
    // The RHS z index wraps around MATRIX_B_DEPTH
    rhs_addr += (get_global_id(2) % MATRIX_B_DEPTH) * rhs_stride_z;
#else
    rhs_addr += get_global_id(2) * rhs_stride_z;
#endif

    // Accumulators c0 .. c(M0-1), all initialised to zero
    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);

    // Zero z-offsets handed to the block load macros
    REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0);
    REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);

    for(int i = 0; i < K; i += K0)
    {
        // Load M0 LHS vectors (a0 ..) and N0 RHS vectors (b0 ..) of K0 elements each
        LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_addr, 0, LHS_STEP_X * sizeof(DATA_TYPE), zlhs);
        LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_addr, 0, RHS_STEP_X * sizeof(DATA_TYPE), zero);

        // One accumulation per output row
        ARM_DOT_K0XN0(a0, b, c0);
#if M0 > 1
        ARM_DOT_K0XN0(a1, b, c1);
#endif
#if M0 > 2
        ARM_DOT_K0XN0(a2, b, c2);
#endif
#if M0 > 3
        ARM_DOT_K0XN0(a3, b, c3);
#endif
#if M0 > 4
        ARM_DOT_K0XN0(a4, b, c4);
#endif
#if M0 > 5
        ARM_DOT_K0XN0(a5, b, c5);
#endif
#if M0 > 6
        ARM_DOT_K0XN0(a6, b, c6);
#endif
#if M0 > 7
        ARM_DOT_K0XN0(a7, b, c7);
#endif

        // Advance both operands to the next K0 slice
        lhs_addr += (M0 * LHS_STEP_X * LHS_STEP_LOOP) * sizeof(DATA_TYPE);
        rhs_addr += (N0 * RHS_STEP_X * RHS_STEP_LOOP) * sizeof(DATA_TYPE);
    }

    // Byte address of the output block
    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * dst_stride_y);

    REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);

    // True when this work-item holds the last block along y / x; forwarded to the boundary-aware macros
    const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);
    const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);

#if defined(REINTERPRET_OUTPUT_AS_3D)
    // Per-row z offsets for the 3D output view
    CALCULATE_Z_OFFSET(M0, uint, zout, get_global_id(1) * (uint)M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);

    // Each z index of the dispatch spans DEPTH_GEMM3D destination planes
    dst_addr += get_global_id(2) * dst_stride_z * DEPTH_GEMM3D;
#else
    dst_addr += get_global_id(2) * dst_stride_z;
#endif

#if defined(ALPHA)
    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
#endif

#if defined(BETA)
#if defined(BROADCAST_BIAS)
    // A single bias row is loaded and added to every row of the block
    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));

    LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x);

#ifndef UNIT_BETA
    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
#endif

#if defined(MIXED_PRECISION)
    CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
    ADD_BLOCK_BROADCAST(M0, c, bias_hp0);
#else
    ADD_BLOCK_BROADCAST(M0, c, bias0);
#endif

#else
    // A full M0 x N0 bias block is loaded and added element-wise
    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id(2) * bias_stride_z;

    LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);

#ifndef UNIT_BETA
    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
#endif

#if defined(MIXED_PRECISION)
    CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
    ADD_BLOCK(M0, c, bias_hp);
#else
    ADD_BLOCK(M0, c, bias);
#endif

#endif
#endif

#if defined(ACTIVATION_TYPE)
#if defined(MIXED_PRECISION)
    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, N0, c, A_VAL, B_VAL);
#else
    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, N0, c, A_VAL, B_VAL);
#endif
#endif

    // Store the block; with MIXED_PRECISION the accumulators are first converted to DATA_TYPE
#if defined(MIXED_PRECISION)
    CONVERT_BLOCK(M0, N0, DATA_TYPE, c, c_lp);
    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c_lp, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
#else
    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
#endif

#undef LHS_BLOCK_SIZE
#undef LHS_OFFSET_X
#undef LHS_STEP_X
#undef RHS_BLOCK_SIZE
#undef RHS_OFFSET_X
#undef RHS_STEP_X
#undef LHS_STEP_LOOP
#undef RHS_STEP_LOOP
}
#endif

#if defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_LHS_NT_RHS_T_TEXTURE)

/** Variant of gemm_mm_reshaped_lhs_nt_rhs_t that reads the RHS from a 2D image.
 *
 * The structure matches the buffer kernel. The differences visible in the body:
 * the RHS is addressed by image coordinates (x_rhs, y_rhs) in units of PIXEL_UNIT,
 * it is fetched with LOAD_TEXTURE2D, and the row coordinate folds the z index in
 * through RHS_HEIGHT. rhs_stride_z is part of the signature but is not read.
 *
 * Build-time macros read by the body: those of the buffer kernel plus RHS_HEIGHT.
 *
 * @param lhs                 LHS buffer arguments (the body uses lhs_ptr, lhs_stride_y,
 *                            lhs_offset_first_element_in_bytes)
 * @param rhs_img             Read-only 2D image holding the RHS
 * @param bias                Bias buffer arguments; present only when BETA is defined
 * @param dst                 Destination buffer arguments (the body uses dst_ptr,
 *                            dst_stride_y, dst_offset_first_element_in_bytes)
 * @param lhs_stride_z        Byte offset applied per get_global_id(2) on the LHS
 * @param rhs_stride_z        Not read by this kernel
 * @param bias_stride_z       Byte offset applied per get_global_id(2) on the bias;
 *                            present only when BETA is defined
 * @param dst_stride_z        Byte offset applied per get_global_id(2) on the destination
 * @param dst_cross_plane_pad Forwarded to CALCULATE_Z_OFFSET; present only when
 *                            REINTERPRET_OUTPUT_AS_3D is defined
 * @param M                   Extent compared against get_global_id(1) * M0
 * @param N                   Extent compared against get_global_id(0) * N0
 * @param K                   Accumulation loop bound
 */
__kernel void gemm_mm_reshaped_lhs_nt_rhs_t_texture(IMAGE_DECLARATION(lhs),
                                                    __read_only image2d_t rhs_img,
#if defined(BETA)
                                                    IMAGE_DECLARATION(bias),
#endif
                                                    IMAGE_DECLARATION(dst),
                                                    uint lhs_stride_z,
                                                    uint rhs_stride_z,
#if defined(BETA)
                                                    uint bias_stride_z,
#endif
                                                    uint dst_stride_z
#if defined(REINTERPRET_OUTPUT_AS_3D)
                                                    ,
                                                    uint dst_cross_plane_pad
#endif
                                                    ,
                                                    const int M,
                                                    const int N,
                                                    const int K)
{
    // Number of image pixels that hold one K0-wide vector
#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(K0)

    // Number of elements in one LHS block
#define LHS_BLOCK_SIZE ((K0) * (M0))

    // Element offsets/steps used to walk the LHS; they depend on whether blocks are interleaved
#if defined(LHS_INTERLEAVE)
#define LHS_OFFSET_X (K0)
#define LHS_STEP_X ((K0) * (V0))
#define LHS_STEP_LOOP (1)
#else
#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
#define LHS_STEP_X (K0)
#define LHS_STEP_LOOP (V0)
#endif

    // Size of one RHS block, expressed in pixels
#define RHS_BLOCK_SIZE (PIXEL_UNIT * (N0))

    // Pixel offsets/steps used to walk the RHS image
#if defined(RHS_INTERLEAVE)
#define RHS_OFFSET_X (PIXEL_UNIT)
#define RHS_STEP_X (PIXEL_UNIT * (H0))
#define RHS_STEP_LOOP (1)
#else
#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
#define RHS_STEP_X PIXEL_UNIT
#define RHS_STEP_LOOP (H0)
#endif

#if defined(DUMMY_WORK_ITEMS)
    // Work-items whose block starts at or beyond N or M have nothing to compute
    if((get_global_id(0) * N0 >= N) || (get_global_id(1) * M0 >= M))
    {
        return;
    }
#endif

    // Byte address of the first LHS vector for this work-item
    __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (get_global_id(1) % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (get_global_id(1) / V0) * (uint)lhs_stride_y +
                               (get_global_id(2) * lhs_stride_z);

#if defined(MATRIX_B_DEPTH)
    // The RHS z index wraps around MATRIX_B_DEPTH
    const uint z_rhs = (get_global_id(2) % MATRIX_B_DEPTH);
#else
    const uint z_rhs = get_global_id(2);
#endif

    // Image coordinates of the first RHS vector; the z index is folded into the row coordinate
    uint       x_rhs = (get_global_id(0) % H0) * (uint)RHS_OFFSET_X;
    const uint y_rhs = (get_global_id(0) / (uint)H0) + z_rhs * RHS_HEIGHT;

    // Accumulators c0 .. c(M0-1), all initialised to zero
    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);

    // Zero z-offsets handed to the block load macros
    REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0);
    REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);

    for(int i = 0; i < K; i += K0)
    {
        // Load M0 LHS vectors (a0 ..) of K0 elements each
        LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_addr, 0, LHS_STEP_X * sizeof(DATA_TYPE), zlhs);

        // Declare b0 .. b(N0-1) zero-initialised, then fill them from the image
        REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), b, 0);
        LOAD_TEXTURE2D(N0, PIXEL_UNIT, DATA_TYPE, b, rhs_img, x_rhs, y_rhs, RHS_STEP_X, 0);

        // One accumulation per output row
        ARM_DOT_K0XN0(a0, b, c0);
#if M0 > 1
        ARM_DOT_K0XN0(a1, b, c1);
#endif
#if M0 > 2
        ARM_DOT_K0XN0(a2, b, c2);
#endif
#if M0 > 3
        ARM_DOT_K0XN0(a3, b, c3);
#endif
#if M0 > 4
        ARM_DOT_K0XN0(a4, b, c4);
#endif
#if M0 > 5
        ARM_DOT_K0XN0(a5, b, c5);
#endif
#if M0 > 6
        ARM_DOT_K0XN0(a6, b, c6);
#endif
#if M0 > 7
        ARM_DOT_K0XN0(a7, b, c7);
#endif

        // Advance the LHS pointer (bytes) and the RHS x coordinate (pixels) to the next K0 slice
        lhs_addr += (M0 * LHS_STEP_X * LHS_STEP_LOOP) * sizeof(DATA_TYPE);

        x_rhs += N0 * RHS_STEP_X * RHS_STEP_LOOP;
    }

    // Byte address of the output block
    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * dst_stride_y);

    REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);

    // True when this work-item holds the last block along y / x; forwarded to the boundary-aware macros
    const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);
    const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);

#if defined(REINTERPRET_OUTPUT_AS_3D)
    // Per-row z offsets for the 3D output view
    CALCULATE_Z_OFFSET(M0, uint, zout, get_global_id(1) * (uint)M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);

    // Each z index of the dispatch spans DEPTH_GEMM3D destination planes
    dst_addr += get_global_id(2) * dst_stride_z * DEPTH_GEMM3D;
#else
    dst_addr += get_global_id(2) * dst_stride_z;
#endif

#if defined(ALPHA)
    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
#endif

#if defined(BETA)
#if defined(BROADCAST_BIAS)
    // A single bias row is loaded and added to every row of the block
    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));

    LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x);

#ifndef UNIT_BETA
    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
#endif

#if defined(MIXED_PRECISION)
    CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
    ADD_BLOCK_BROADCAST(M0, c, bias_hp0);
#else
    ADD_BLOCK_BROADCAST(M0, c, bias0);
#endif

#else
    // A full M0 x N0 bias block is loaded and added element-wise
    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id(2) * bias_stride_z;

    LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);

#ifndef UNIT_BETA
    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
#endif

#if defined(MIXED_PRECISION)
    CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
    ADD_BLOCK(M0, c, bias_hp);
#else
    ADD_BLOCK(M0, c, bias);
#endif

#endif
#endif

#if defined(ACTIVATION_TYPE)
#if defined(MIXED_PRECISION)
    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, N0, c, A_VAL, B_VAL);
#else
    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, N0, c, A_VAL, B_VAL);
#endif
#endif

    // Store the block; with MIXED_PRECISION the accumulators are first converted to DATA_TYPE
#if defined(MIXED_PRECISION)
    CONVERT_BLOCK(M0, N0, DATA_TYPE, c, c_lp);
    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c_lp, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
#else
    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
#endif

#undef LHS_BLOCK_SIZE
#undef LHS_OFFSET_X
#undef LHS_STEP_X
#undef RHS_BLOCK_SIZE
#undef RHS_OFFSET_X
#undef RHS_STEP_X
#undef PIXEL_UNIT
#undef LHS_STEP_LOOP
#undef RHS_STEP_LOOP
}
#endif

#if defined(LHS_TRANSPOSE)

// Shorthand for a vector type of SIZE elements of TYPE
#define VTYPE(TYPE, SIZE) VEC_DATA_TYPE(TYPE, SIZE)

// ARM_VFMA(N0, a, b, c): c += a * b on N0-wide vectors.
// With MIXED_PRECISION both operands are first converted to DATA_TYPE_ACCUMULATOR.
// When GPU_ARCH is GPU_ARCH_MIDGARD a separate multiply and add is emitted, otherwise fma.
// Note: each expansion already ends with a semicolon.
#if defined(MIXED_PRECISION)

#if(GPU_ARCH == GPU_ARCH_MIDGARD)
#define ARM_VFMA(N0, a, b, c) c += (CONVERT(a, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))) * (CONVERT(b, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0)));
#else
#define ARM_VFMA(N0, a, b, c) c = fma((CONVERT(a, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))), (CONVERT(b, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))), (c));
#endif

#else

#if(GPU_ARCH == GPU_ARCH_MIDGARD)
#define ARM_VFMA(N0, a, b, c) c += (a) * (b);
#else
#define ARM_VFMA(N0, a, b, c) c = fma((a), (b), (c));
#endif

#endif

// ARM_VVM_T_NT_<M0>xN0x1(N0, TYPE, a, b, C): for every component i of the M0-wide
// vector a, broadcast a.s<i> to an N0-wide vector and accumulate its product with b
// into C<i>. C is a base name (it is token-pasted with the row index), so it must be
// passed as a bare identifier. Variants exist for M0 = 1, 2, 3, 4 and 8 only.
#define ARM_VVM_T_NT_1xN0x1(N0, TYPE, a, b, C)         \
    ({                                                 \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a), b, (C##0)); \
    })
#define ARM_VVM_T_NT_2xN0x1(N0, TYPE, a, b, C)            \
    ({                                                    \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s0), b, (C##0)); \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s1), b, (C##1)); \
    })
#define ARM_VVM_T_NT_3xN0x1(N0, TYPE, a, b, C)            \
    ({                                                    \
        ARM_VVM_T_NT_2xN0x1(N0, TYPE, a, b, C);           \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s2), b, (C##2)); \
    })
#define ARM_VVM_T_NT_4xN0x1(N0, TYPE, a, b, C)            \
    ({                                                    \
        ARM_VVM_T_NT_3xN0x1(N0, TYPE, a, b, C);           \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s3), b, (C##3)); \
    })
#define ARM_VVM_T_NT_8xN0x1(N0, TYPE, a, b, C)            \
    ({                                                    \
        ARM_VVM_T_NT_4xN0x1(N0, TYPE, a, b, C);           \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s4), b, (C##4)); \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s5), b, (C##5)); \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s6), b, (C##6)); \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s7), b, (C##7)); \
    })

// Dispatches to the ARM_VVM_T_NT_<M0>xN0x1 variant selected by M0
#define ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, a, b, C) ARM_VVM_T_NT_##M0##xN0x1(N0, TYPE, a, b, C)

// ARM_MM_T_NT_M0xN0x<K0>(M0, N0, TYPE, A, B, C): applies ARM_VVM_T_NT_M0xN0x1 to the
// pairs (A0, B0) .. (A<K0-1>, B<K0-1>), accumulating into C0 .. C<M0-1>.
// A, B and C are base names that get token-pasted with an index, so they must be
// passed as bare identifiers (never parenthesised).
#define ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, A, B, C)             \
    ({                                                         \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##0), (B##0), C); \
    })
#define ARM_MM_T_NT_M0xN0x2(M0, N0, TYPE, A, B, C)             \
    ({                                                         \
        ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, A, B, C);            \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##1), (B##1), C); \
    })
#define ARM_MM_T_NT_M0xN0x3(M0, N0, TYPE, A, B, C)             \
    ({                                                         \
        ARM_MM_T_NT_M0xN0x2(M0, N0, TYPE, A, B, C);            \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##2), (B##2), C); \
    })
#define ARM_MM_T_NT_M0xN0x4(M0, N0, TYPE, A, B, C)             \
    ({                                                         \
        ARM_MM_T_NT_M0xN0x3(M0, N0, TYPE, A, B, C);            \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##3), (B##3), C); \
    })
#define ARM_MM_T_NT_M0xN0x8(M0, N0, TYPE, A, B, C)             \
    ({                                                         \
        ARM_MM_T_NT_M0xN0x4(M0, N0, TYPE, A, B, C);            \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##4), (B##4), C); \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##5), (B##5), C); \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##6), (B##6), C); \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##7), (B##7), C); \
    })
// Rows 8..F go straight to ARM_VVM_T_NT_M0xN0x1, like the smaller variants do.
// Routing them through ARM_MM_T_NT_M0xN0x1 would hand it a parenthesised operand,
// which that macro then tries to paste with 0 and which cannot form a valid token.
#define ARM_MM_T_NT_M0xN0x16(M0, N0, TYPE, A, B, C)            \
    ({                                                         \
        ARM_MM_T_NT_M0xN0x8(M0, N0, TYPE, A, B, C);            \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##8), (B##8), C); \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##9), (B##9), C); \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##A), (B##A), C); \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##B), (B##B), C); \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##C), (B##C), C); \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##D), (B##D), C); \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##E), (B##E), C); \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##F), (B##F), C); \
    })

// Entry point: selects ARM_MM_T_NT_M0xN0x<K0> from K0 and forwards the remaining arguments
#define ARM_MM_T_NT(M0, N0, K0, TYPE, A, B, C) \
    CONCAT(ARM_MM_T_NT_M0xN0x, K0)             \
    (M0, N0, TYPE, A, B, C)
Faust 7187*c217d954SCole Faust#if defined(GEMM_MM_RESHAPED_LHS_T_RHS_NT) 7188*c217d954SCole Faust 7189*c217d954SCole Faust__kernel void gemm_mm_reshaped_lhs_t_rhs_nt(IMAGE_DECLARATION(lhs), 7190*c217d954SCole Faust IMAGE_DECLARATION(rhs), 7191*c217d954SCole Faust#if defined(BETA) 7192*c217d954SCole Faust IMAGE_DECLARATION(bias), 7193*c217d954SCole Faust#endif 7194*c217d954SCole Faust IMAGE_DECLARATION(dst), 7195*c217d954SCole Faust uint lhs_stride_z, 7196*c217d954SCole Faust uint rhs_stride_z, 7197*c217d954SCole Faust#if defined(BETA) 7198*c217d954SCole Faust uint bias_stride_z, 7199*c217d954SCole Faust#endif 7200*c217d954SCole Faust uint dst_stride_z 7201*c217d954SCole Faust#if defined(REINTERPRET_OUTPUT_AS_3D) 7202*c217d954SCole Faust , 7203*c217d954SCole Faust uint dst_cross_plane_pad 7204*c217d954SCole Faust#endif 7205*c217d954SCole Faust , 7206*c217d954SCole Faust const int M, 7207*c217d954SCole Faust const int N, 7208*c217d954SCole Faust const int K) 7209*c217d954SCole Faust{ 7210*c217d954SCole Faust 7211*c217d954SCole Faust#define LHS_BLOCK_SIZE ((K0) * (M0)) 7212*c217d954SCole Faust 7213*c217d954SCole Faust#if defined(LHS_INTERLEAVE) 7214*c217d954SCole Faust#define LHS_OFFSET_X (M0) 7215*c217d954SCole Faust#define LHS_STEP_X ((M0) * (V0)) 7216*c217d954SCole Faust#define LHS_STEP_LOOP (1) 7217*c217d954SCole Faust#else 7218*c217d954SCole Faust#define LHS_OFFSET_X (LHS_BLOCK_SIZE) 7219*c217d954SCole Faust#define LHS_STEP_X (M0) 7220*c217d954SCole Faust#define LHS_STEP_LOOP (V0) 7221*c217d954SCole Faust#endif 7222*c217d954SCole Faust 7223*c217d954SCole Faust 7224*c217d954SCole Faust#define RHS_BLOCK_SIZE ((K0) * (N0)) 7225*c217d954SCole Faust 7226*c217d954SCole Faust 7227*c217d954SCole Faust#if defined(RHS_INTERLEAVE) 7228*c217d954SCole Faust#define RHS_OFFSET_X (N0) 7229*c217d954SCole Faust#define RHS_STEP_X ((N0) * (H0)) 7230*c217d954SCole Faust#else 7231*c217d954SCole Faust#define RHS_OFFSET_X (RHS_BLOCK_SIZE) 7232*c217d954SCole Faust#define RHS_STEP_X (N0) 
7233*c217d954SCole Faust#endif 7234*c217d954SCole Faust 7235*c217d954SCole Faust const uint x = get_global_id(0); 7236*c217d954SCole Faust const uint y = get_global_id(1); 7237*c217d954SCole Faust const uint z = get_global_id(2); 7238*c217d954SCole Faust 7239*c217d954SCole Faust const bool cond_y = ((get_global_id(1) + 1) * M0 >= M); 7240*c217d954SCole Faust const bool cond_x = ((get_global_id(0) + 1) * N0 >= N); 7241*c217d954SCole Faust 7242*c217d954SCole Faust#if defined(DUMMY_WORK_ITEMS) 7243*c217d954SCole Faust if((x * N0 >= N) || (y * M0 >= M)) 7244*c217d954SCole Faust { 7245*c217d954SCole Faust return; 7246*c217d954SCole Faust } 7247*c217d954SCole Faust#endif 7248*c217d954SCole Faust 7249*c217d954SCole Faust 7250*c217d954SCole Faust __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (y % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (y / V0) * (uint)lhs_stride_y + (z * lhs_stride_z); 7251*c217d954SCole Faust 7252*c217d954SCole Faust 7253*c217d954SCole Faust __global uchar *rhs_addr = rhs_ptr + rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x / (uint)H0) * rhs_stride_y; 7254*c217d954SCole Faust 7255*c217d954SCole Faust#if defined(MATRIX_B_DEPTH) 7256*c217d954SCole Faust 7257*c217d954SCole Faust rhs_addr += (z % MATRIX_B_DEPTH) * rhs_stride_z; 7258*c217d954SCole Faust#else 7259*c217d954SCole Faust rhs_addr += z * rhs_stride_z; 7260*c217d954SCole Faust#endif 7261*c217d954SCole Faust 7262*c217d954SCole Faust 7263*c217d954SCole Faust REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0); 7264*c217d954SCole Faust 7265*c217d954SCole Faust REPEAT_VAR_INIT_TO_CONST(M0, uint, zero, 0); 7266*c217d954SCole Faust 7267*c217d954SCole Faust __global DATA_TYPE *lhs = (__global DATA_TYPE *)(lhs_addr); 7268*c217d954SCole Faust __global DATA_TYPE *rhs = (__global DATA_TYPE *)(rhs_addr); 7269*c217d954SCole Faust 7270*c217d954SCole Faust for(int i = 0; i < K; i += K0) 7271*c217d954SCole Faust { 
7272*c217d954SCole Faust VEC_DATA_TYPE(DATA_TYPE, M0) 7273*c217d954SCole Faust a0; 7274*c217d954SCole Faust VEC_DATA_TYPE(DATA_TYPE, N0) 7275*c217d954SCole Faust b0; 7276*c217d954SCole Faust 7277*c217d954SCole Faust a0 = VLOAD(M0)(0, lhs); 7278*c217d954SCole Faust b0 = VLOAD(N0)(0, rhs); 7279*c217d954SCole Faust 7280*c217d954SCole Faust ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); 7281*c217d954SCole Faust 7282*c217d954SCole Faust lhs += LHS_STEP_X; 7283*c217d954SCole Faust rhs += RHS_STEP_X; 7284*c217d954SCole Faust 7285*c217d954SCole Faust#if K0 > 1 7286*c217d954SCole Faust a0 = VLOAD(M0)(0, lhs); 7287*c217d954SCole Faust b0 = VLOAD(N0)(0, rhs); 7288*c217d954SCole Faust 7289*c217d954SCole Faust ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); 7290*c217d954SCole Faust 7291*c217d954SCole Faust lhs += LHS_STEP_X; 7292*c217d954SCole Faust rhs += RHS_STEP_X; 7293*c217d954SCole Faust#endif 7294*c217d954SCole Faust 7295*c217d954SCole Faust#if K0 > 2 7296*c217d954SCole Faust a0 = VLOAD(M0)(0, lhs); 7297*c217d954SCole Faust b0 = VLOAD(N0)(0, rhs); 7298*c217d954SCole Faust 7299*c217d954SCole Faust ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); 7300*c217d954SCole Faust 7301*c217d954SCole Faust lhs += LHS_STEP_X; 7302*c217d954SCole Faust rhs += RHS_STEP_X; 7303*c217d954SCole Faust#endif 7304*c217d954SCole Faust 7305*c217d954SCole Faust#if K0 > 3 7306*c217d954SCole Faust a0 = VLOAD(M0)(0, lhs); 7307*c217d954SCole Faust b0 = VLOAD(N0)(0, rhs); 7308*c217d954SCole Faust 7309*c217d954SCole Faust ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); 7310*c217d954SCole Faust 7311*c217d954SCole Faust lhs += LHS_STEP_X; 7312*c217d954SCole Faust rhs += RHS_STEP_X; 7313*c217d954SCole Faust#endif 7314*c217d954SCole Faust 7315*c217d954SCole Faust#if K0 > 4 7316*c217d954SCole Faust a0 = VLOAD(M0)(0, lhs); 7317*c217d954SCole Faust b0 = VLOAD(N0)(0, rhs); 7318*c217d954SCole Faust 7319*c217d954SCole Faust ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); 7320*c217d954SCole Faust 7321*c217d954SCole Faust lhs += 
LHS_STEP_X; 7322*c217d954SCole Faust rhs += RHS_STEP_X; 7323*c217d954SCole Faust 7324*c217d954SCole Faust a0 = VLOAD(M0)(0, lhs); 7325*c217d954SCole Faust b0 = VLOAD(N0)(0, rhs); 7326*c217d954SCole Faust 7327*c217d954SCole Faust ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); 7328*c217d954SCole Faust 7329*c217d954SCole Faust lhs += LHS_STEP_X; 7330*c217d954SCole Faust rhs += RHS_STEP_X; 7331*c217d954SCole Faust 7332*c217d954SCole Faust a0 = VLOAD(M0)(0, lhs); 7333*c217d954SCole Faust b0 = VLOAD(N0)(0, rhs); 7334*c217d954SCole Faust 7335*c217d954SCole Faust ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); 7336*c217d954SCole Faust 7337*c217d954SCole Faust lhs += LHS_STEP_X; 7338*c217d954SCole Faust rhs += RHS_STEP_X; 7339*c217d954SCole Faust 7340*c217d954SCole Faust a0 = VLOAD(M0)(0, lhs); 7341*c217d954SCole Faust b0 = VLOAD(N0)(0, rhs); 7342*c217d954SCole Faust 7343*c217d954SCole Faust ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); 7344*c217d954SCole Faust 7345*c217d954SCole Faust lhs += LHS_STEP_X; 7346*c217d954SCole Faust rhs += RHS_STEP_X; 7347*c217d954SCole Faust#endif 7348*c217d954SCole Faust 7349*c217d954SCole Faust#if K0 > 8 7350*c217d954SCole Faust a0 = VLOAD(M0)(0, lhs); 7351*c217d954SCole Faust b0 = VLOAD(N0)(0, rhs); 7352*c217d954SCole Faust 7353*c217d954SCole Faust ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); 7354*c217d954SCole Faust 7355*c217d954SCole Faust lhs += LHS_STEP_X; 7356*c217d954SCole Faust rhs += RHS_STEP_X; 7357*c217d954SCole Faust 7358*c217d954SCole Faust a0 = VLOAD(M0)(0, lhs); 7359*c217d954SCole Faust b0 = VLOAD(N0)(0, rhs); 7360*c217d954SCole Faust 7361*c217d954SCole Faust ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); 7362*c217d954SCole Faust 7363*c217d954SCole Faust lhs += LHS_STEP_X; 7364*c217d954SCole Faust rhs += RHS_STEP_X; 7365*c217d954SCole Faust 7366*c217d954SCole Faust a0 = VLOAD(M0)(0, lhs); 7367*c217d954SCole Faust b0 = VLOAD(N0)(0, rhs); 7368*c217d954SCole Faust 7369*c217d954SCole Faust ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); 
7370*c217d954SCole Faust 7371*c217d954SCole Faust lhs += LHS_STEP_X; 7372*c217d954SCole Faust rhs += RHS_STEP_X; 7373*c217d954SCole Faust 7374*c217d954SCole Faust a0 = VLOAD(M0)(0, lhs); 7375*c217d954SCole Faust b0 = VLOAD(N0)(0, rhs); 7376*c217d954SCole Faust 7377*c217d954SCole Faust ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); 7378*c217d954SCole Faust 7379*c217d954SCole Faust lhs += LHS_STEP_X; 7380*c217d954SCole Faust rhs += RHS_STEP_X; 7381*c217d954SCole Faust 7382*c217d954SCole Faust a0 = VLOAD(M0)(0, lhs); 7383*c217d954SCole Faust b0 = VLOAD(N0)(0, rhs); 7384*c217d954SCole Faust 7385*c217d954SCole Faust ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); 7386*c217d954SCole Faust 7387*c217d954SCole Faust lhs += LHS_STEP_X; 7388*c217d954SCole Faust rhs += RHS_STEP_X; 7389*c217d954SCole Faust 7390*c217d954SCole Faust a0 = VLOAD(M0)(0, lhs); 7391*c217d954SCole Faust b0 = VLOAD(N0)(0, rhs); 7392*c217d954SCole Faust 7393*c217d954SCole Faust ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); 7394*c217d954SCole Faust 7395*c217d954SCole Faust lhs += LHS_STEP_X; 7396*c217d954SCole Faust rhs += RHS_STEP_X; 7397*c217d954SCole Faust 7398*c217d954SCole Faust a0 = VLOAD(M0)(0, lhs); 7399*c217d954SCole Faust b0 = VLOAD(N0)(0, rhs); 7400*c217d954SCole Faust 7401*c217d954SCole Faust ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); 7402*c217d954SCole Faust 7403*c217d954SCole Faust lhs += LHS_STEP_X; 7404*c217d954SCole Faust rhs += RHS_STEP_X; 7405*c217d954SCole Faust 7406*c217d954SCole Faust a0 = VLOAD(M0)(0, lhs); 7407*c217d954SCole Faust b0 = VLOAD(N0)(0, rhs); 7408*c217d954SCole Faust 7409*c217d954SCole Faust ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); 7410*c217d954SCole Faust 7411*c217d954SCole Faust lhs += LHS_STEP_X; 7412*c217d954SCole Faust rhs += RHS_STEP_X; 7413*c217d954SCole Faust#endif 7414*c217d954SCole Faust 7415*c217d954SCole Faust#ifndef LHS_INTERLEAVE 7416*c217d954SCole Faust lhs += (M0 * K0 * (V0 - 1)); 7417*c217d954SCole Faust#endif 7418*c217d954SCole Faust 7419*c217d954SCole 
Faust#ifndef RHS_INTERLEAVE 7420*c217d954SCole Faust rhs += (N0 * K0 * (H0 - 1)); 7421*c217d954SCole Faust#endif 7422*c217d954SCole Faust } 7423*c217d954SCole Faust 7424*c217d954SCole Faust __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y); 7425*c217d954SCole Faust 7426*c217d954SCole Faust REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0); 7427*c217d954SCole Faust 7428*c217d954SCole Faust#if defined(REINTERPRET_OUTPUT_AS_3D) 7429*c217d954SCole Faust 7430*c217d954SCole Faust 7431*c217d954SCole Faust CALCULATE_Z_OFFSET(M0, uint, zout, y * (uint)M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y); 7432*c217d954SCole Faust 7433*c217d954SCole Faust 7434*c217d954SCole Faust dst_addr += z * dst_stride_z * DEPTH_GEMM3D; 7435*c217d954SCole Faust 7436*c217d954SCole Faust#else 7437*c217d954SCole Faust 7438*c217d954SCole Faust 7439*c217d954SCole Faust dst_addr += z * dst_stride_z; 7440*c217d954SCole Faust 7441*c217d954SCole Faust#endif 7442*c217d954SCole Faust 7443*c217d954SCole Faust 7444*c217d954SCole Faust#if defined(ALPHA) 7445*c217d954SCole Faust SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA); 7446*c217d954SCole Faust#endif 7447*c217d954SCole Faust 7448*c217d954SCole Faust 7449*c217d954SCole Faust#if defined(BETA) 7450*c217d954SCole Faust#if defined(BROADCAST_BIAS) 7451*c217d954SCole Faust __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)); 7452*c217d954SCole Faust 7453*c217d954SCole Faust LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x); 7454*c217d954SCole Faust 7455*c217d954SCole Faust#ifndef UNIT_BETA 7456*c217d954SCole Faust SCALE_BLOCK(1, DATA_TYPE, bias, BETA); 7457*c217d954SCole Faust#endif 7458*c217d954SCole Faust 7459*c217d954SCole Faust 7460*c217d954SCole Faust#if defined(MIXED_PRECISION) 7461*c217d954SCole Faust CONVERT_BLOCK(1, N0, 
DATA_TYPE_ACCUMULATOR, bias, bias_hp); 7462*c217d954SCole Faust ADD_BLOCK_BROADCAST(M0, c, bias_hp0); 7463*c217d954SCole Faust#else 7464*c217d954SCole Faust ADD_BLOCK_BROADCAST(M0, c, bias0); 7465*c217d954SCole Faust#endif 7466*c217d954SCole Faust 7467*c217d954SCole Faust#else 7468*c217d954SCole Faust __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id( 7469*c217d954SCole Faust 2) * bias_stride_z; 7470*c217d954SCole Faust 7471*c217d954SCole Faust LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); 7472*c217d954SCole Faust 7473*c217d954SCole Faust#ifndef UNIT_BETA 7474*c217d954SCole Faust SCALE_BLOCK(M0, DATA_TYPE, bias, BETA); 7475*c217d954SCole Faust#endif 7476*c217d954SCole Faust 7477*c217d954SCole Faust#if defined(MIXED_PRECISION) 7478*c217d954SCole Faust CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp); 7479*c217d954SCole Faust ADD_BLOCK(M0, c, bias_hp); 7480*c217d954SCole Faust#else 7481*c217d954SCole Faust ADD_BLOCK(M0, c, bias); 7482*c217d954SCole Faust#endif 7483*c217d954SCole Faust 7484*c217d954SCole Faust#endif 7485*c217d954SCole Faust#endif 7486*c217d954SCole Faust 7487*c217d954SCole Faust#if defined(ACTIVATION_TYPE) 7488*c217d954SCole Faust#if defined(MIXED_PRECISION) 7489*c217d954SCole Faust ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, N0, c, A_VAL, B_VAL); 7490*c217d954SCole Faust#else 7491*c217d954SCole Faust ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, N0, c, A_VAL, B_VAL); 7492*c217d954SCole Faust#endif 7493*c217d954SCole Faust#endif 7494*c217d954SCole Faust 7495*c217d954SCole Faust 7496*c217d954SCole Faust#if defined(MIXED_PRECISION) 7497*c217d954SCole Faust CONVERT_BLOCK(M0, N0, DATA_TYPE, c, c_lp); 7498*c217d954SCole Faust STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c_lp, dst_addr, 
dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
#else
    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
#endif

#undef LHS_BLOCK_SIZE
#undef LHS_OFFSET_X
#undef LHS_STEP_X
#undef RHS_BLOCK_SIZE
#undef RHS_OFFSET_X
#undef RHS_STEP_X
}
#endif

#if defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_LHS_T_RHS_NT_TEXTURE)

/*
 * GEMM kernel: LHS is read from a global buffer, RHS is read through an image2d_t.
 *
 * Each work-item accumulates into the variables c0..c(M0-1), which are N0-lane vectors of
 * DATA_TYPE_ACCUMULATOR, then applies the optional alpha scale, bias, activation and
 * precision conversion before storing the block to dst.
 *
 * Build-time symbols read by this kernel:
 *   required : M0, N0, K0, V0, H0, DATA_TYPE, DATA_TYPE_ACCUMULATOR, RHS_HEIGHT,
 *              PARTIAL_STORE_M0, PARTIAL_STORE_N0
 *   optional : LHS_INTERLEAVE, RHS_INTERLEAVE, DUMMY_WORK_ITEMS, MATRIX_B_DEPTH,
 *              REINTERPRET_OUTPUT_AS_3D (with HEIGHT_GEMM3D, DEPTH_GEMM3D),
 *              ALPHA, BETA, UNIT_BETA, BROADCAST_BIAS, MIXED_PRECISION,
 *              ACTIVATION_TYPE (with A_VAL, B_VAL)
 *
 * Arguments:
 *   lhs, bias, dst      - declared through IMAGE_DECLARATION (defined elsewhere); the body uses
 *                         the <name>_ptr, <name>_stride_y and
 *                         <name>_offset_first_element_in_bytes identifiers it is expected to provide.
 *   rhs_img             - read-only image holding the RHS.
 *   *_stride_z          - byte stride applied per global id in dimension 2.
 *   dst_cross_plane_pad - only present with REINTERPRET_OUTPUT_AS_3D; forwarded to CALCULATE_Z_OFFSET.
 *   M, N, K             - problem sizes; M and N bound the output block, K bounds the accumulation loop.
 *
 * NOTE(review): the unrolled steps are guarded by K0 > 1, > 2, > 3, > 4 (four steps) and
 * > 8 (eight steps), so K0 is presumably restricted to 1, 2, 3, 4, 8 or 16 - confirm with the host side.
 */
__kernel void gemm_mm_reshaped_lhs_t_rhs_nt_texture(IMAGE_DECLARATION(lhs),
                                                    __read_only image2d_t rhs_img,
#if defined(BETA)
                                                    IMAGE_DECLARATION(bias),
#endif
                                                    IMAGE_DECLARATION(dst),
                                                    uint lhs_stride_z,
                                                    uint rhs_stride_z,
#if defined(BETA)
                                                    uint bias_stride_z,
#endif
                                                    uint dst_stride_z
#if defined(REINTERPRET_OUTPUT_AS_3D)
                                                    ,
                                                    uint dst_cross_plane_pad
#endif
                                                    ,
                                                    const int M,
                                                    const int N,
                                                    const int K)
{
    // Width of one RHS vector expressed in image pixels.
#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(N0)

    // Element count of one LHS block.
#define LHS_BLOCK_SIZE ((K0) * (M0))

    // LHS addressing constants, depending on whether the LHS blocks are interleaved.
#if defined(LHS_INTERLEAVE)
#define LHS_OFFSET_X (M0)
#define LHS_STEP_X ((M0) * (V0))
#define LHS_STEP_LOOP (1)
#else
#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
#define LHS_STEP_X (M0)
#define LHS_STEP_LOOP (V0)
#endif

    // Pixel count of one RHS block.
#define RHS_BLOCK_SIZE ((K0) * (PIXEL_UNIT))

    // RHS addressing constants, depending on whether the RHS blocks are interleaved.
#if defined(RHS_INTERLEAVE)
#define RHS_OFFSET_X (PIXEL_UNIT)
#define RHS_STEP_X ((PIXEL_UNIT) * (H0))
#else
#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
#define RHS_STEP_X (PIXEL_UNIT)
#endif

    // One accumulation step along K: load a0 from lhs, read b0 from rhs_img at the k_step-th
    // RHS step, hand both to ARM_MM_T_NT (defined elsewhere) and advance lhs.
    // Expands to plain statements; the invocation supplies the final semicolon.
#define GEMM_TEXTURE_K_STEP(k_step)                                                                 \
    a0 = VLOAD(M0)(0, lhs);                                                                         \
    b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + k_step * RHS_STEP_X), (y_rhs));      \
                                                                                                    \
    ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);                                                     \
                                                                                                    \
    lhs += LHS_STEP_X

    const uint x = get_global_id(0);
    const uint y = get_global_id(1);
    const uint z = get_global_id(2);

#if defined(DUMMY_WORK_ITEMS)
    // Work-items whose block starts at or beyond N columns / M rows have nothing to do.
    if((x * N0 >= N) || (y * M0 >= M))
    {
        return;
    }
#endif

    // LHS start address, built up term by term: first element, block within the row,
    // row of blocks, then the dimension-2 slice.
    __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes;
    lhs_addr += (y % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE);
    lhs_addr += (y / V0) * (uint)lhs_stride_y;
    lhs_addr += (z * lhs_stride_z);

#if defined(MATRIX_B_DEPTH)
    // The RHS slice index wraps around MATRIX_B_DEPTH.
    const uint z_rhs = (z % MATRIX_B_DEPTH);
#else
    const uint z_rhs = z;
#endif

    // Image coordinates of the first RHS read; x_rhs advances inside the K loop, y_rhs is fixed.
    uint       x_rhs = (x % H0) * (uint)RHS_OFFSET_X;
    const uint y_rhs = (x / (uint)H0) + z_rhs * RHS_HEIGHT;

    // Accumulators c0..c(M0-1), zero-initialised.
    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);

    // zero0..zero(M0-1): all-zero offsets passed to the bias loads further down.
    REPEAT_VAR_INIT_TO_CONST(M0, uint, zero, 0);

    __global DATA_TYPE *lhs = (__global DATA_TYPE *)(lhs_addr);

    for(int i = 0; i < K; i += K0)
    {
        VEC_DATA_TYPE(DATA_TYPE, M0)
        a0;
        VEC_DATA_TYPE(DATA_TYPE, N0)
        b0;

        GEMM_TEXTURE_K_STEP(0);

#if K0 > 1
        GEMM_TEXTURE_K_STEP(1);
#endif

#if K0 > 2
        GEMM_TEXTURE_K_STEP(2);
#endif

#if K0 > 3
        GEMM_TEXTURE_K_STEP(3);
#endif

#if K0 > 4
        GEMM_TEXTURE_K_STEP(4);

        GEMM_TEXTURE_K_STEP(5);

        GEMM_TEXTURE_K_STEP(6);

        GEMM_TEXTURE_K_STEP(7);
#endif

#if K0 > 8
        GEMM_TEXTURE_K_STEP(8);

        GEMM_TEXTURE_K_STEP(9);

        GEMM_TEXTURE_K_STEP(10);

        GEMM_TEXTURE_K_STEP(11);

        GEMM_TEXTURE_K_STEP(12);

        GEMM_TEXTURE_K_STEP(13);

        GEMM_TEXTURE_K_STEP(14);

        GEMM_TEXTURE_K_STEP(15);
#endif

#ifndef LHS_INTERLEAVE
        // Non-interleaved LHS: skip over the other V0 - 1 blocks.
        lhs += (M0 * K0 * (V0 - 1));
#endif

        x_rhs += K0 * RHS_STEP_X;
#ifndef RHS_INTERLEAVE
        // Non-interleaved RHS: skip over the other H0 - 1 blocks.
        x_rhs += (PIXEL_UNIT * K0 * (H0 - 1));
#endif
    }

    // Destination address of this work-item's block (dimension-2 offset is added below).
    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y);

    REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);

    // True when this work-item's block reaches or crosses the last row / column.
    const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);
    const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);

#if defined(REINTERPRET_OUTPUT_AS_3D)

    // Per-row offsets zout* are filled in by CALCULATE_Z_OFFSET (defined elsewhere).
    CALCULATE_Z_OFFSET(M0, uint, zout, y * (uint)M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);

    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

#else

    dst_addr += z * dst_stride_z;

#endif

#if defined(ALPHA)
    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
#endif

#if defined(BETA)
#if defined(BROADCAST_BIAS)
    // A single bias row is loaded and added to every accumulator row.
    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE));

    LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x);

#ifndef UNIT_BETA
    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
#endif

#if defined(MIXED_PRECISION)
    CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
    ADD_BLOCK_BROADCAST(M0, c, bias_hp0);
#else
    ADD_BLOCK_BROADCAST(M0, c, bias0);
#endif

#else
    // A full M0-row bias block is loaded from the position matching this work-item's output block.
    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * bias_stride_y) + z * bias_stride_z;

    LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);

#ifndef UNIT_BETA
    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
#endif

#if defined(MIXED_PRECISION)
    CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
    ADD_BLOCK(M0, c, bias_hp);
#else
    ADD_BLOCK(M0, c, bias);
#endif

#endif
#endif

#if defined(ACTIVATION_TYPE)
#if defined(MIXED_PRECISION)
    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, N0, c, A_VAL, B_VAL);
#else
    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, N0, c, A_VAL, B_VAL);
#endif
#endif

#if defined(MIXED_PRECISION)
    // Convert the accumulators to DATA_TYPE (c_lp*) before storing.
    CONVERT_BLOCK(M0, N0, DATA_TYPE, c, c_lp);
    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c_lp, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
#else
    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
#endif

#undef LHS_BLOCK_SIZE
#undef LHS_OFFSET_X
#undef LHS_STEP_X
#undef RHS_BLOCK_SIZE
#undef RHS_OFFSET_X
#undef RHS_STEP_X
#undef PIXEL_UNIT
#undef LHS_STEP_LOOP
#undef RHS_STEP_LOOP
#undef GEMM_TEXTURE_K_STEP
}
#endif

#endif

#endif

#if defined(M0) && \
defined(N0) && defined(K0) && defined(DATA_TYPE)

/* c = fma(a, b, c), written as a statement expression. */
#define VFMA(a, b, c)     \
    ({                    \
        c = fma(a, b, c); \
    })

/*
 * RHS_VFMA_M0xN0(i, a, b, c)
 *
 * For every row r in 0..M0-1: component s<i> of a<r> is converted to an N0-lane vector of
 * DATA_TYPE, multiplied by b and accumulated into c<r> through VFMA.
 * One definition per supported M0 (1 to 8); any other M0 is a build error.
 */
#if M0 == 1
#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
    })
#elif M0 == 2
#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
    })
#elif M0 == 3
#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
    })
#elif M0 == 4
#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
    })
#elif M0 == 5
#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
    })
#elif M0 == 6
#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
    })
#elif M0 == 7
#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
    })
#elif M0 == 8
#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##7).s##i), b, (c##7)); \
    })
#else
#error "M0 not supported"
#endif

#if defined(GEMM_MM_NATIVE)

/*
 * GEMM kernel reading both LHS and RHS directly from global buffers.
 *
 * Each work-item accumulates into c0..c(M0-1) (N0-lane vectors of DATA_TYPE): first in
 * K0-wide steps while a full step fits in K, then one K element at a time for the rest.
 * Afterwards the optional alpha scale, bias and activation are applied and the block is stored.
 *
 * Build-time symbols read by this kernel:
 *   required : M0 (1 to 8), N0, K0, DATA_TYPE, PARTIAL_STORE_M0, PARTIAL_STORE_N0
 *   optional : DUMMY_WORK_ITEMS, MATRIX_B_DEPTH,
 *              REINTERPRET_INPUT_AS_3D / REINTERPRET_OUTPUT_AS_3D (with HEIGHT_GEMM3D, DEPTH_GEMM3D),
 *              ALPHA, BETA, UNIT_BETA, BROADCAST_BIAS, ACTIVATION_TYPE (with A_VAL, B_VAL)
 *
 * Arguments:
 *   lhs, rhs, bias, dst - declared through IMAGE_DECLARATION (defined elsewhere); the body uses
 *                         the <name>_ptr, <name>_stride_y and
 *                         <name>_offset_first_element_in_bytes identifiers it is expected to provide.
 *   *_stride_z          - byte stride applied per global id in dimension 2.
 *   M, N, K             - problem sizes; K bounds both accumulation loops.
 *   lhs_cross_plane_pad / dst_cross_plane_pad
 *                       - only present with the matching REINTERPRET_*_AS_3D switch; forwarded to
 *                         CALCULATE_Z_OFFSET.
 *
 * NOTE(review): RHS_BLOCK_SIZE and RHS_OFFSET_X are defined below but neither referenced nor
 * undefined in this kernel; confirm nothing later in the file relies on them before removing.
 */
__kernel void gemm_mm_native(IMAGE_DECLARATION(lhs),
                             IMAGE_DECLARATION(rhs),
#if defined(BETA)
                             IMAGE_DECLARATION(bias),
#endif
                             IMAGE_DECLARATION(dst),
                             uint lhs_stride_z,
                             uint rhs_stride_z,
#if defined(BETA)
                             uint bias_stride_z,
#endif
                             uint dst_stride_z,
                             const int M,
                             const int N,
                             const int K
#if defined(REINTERPRET_INPUT_AS_3D)
                             ,
                             uint lhs_cross_plane_pad
#endif
#if defined(REINTERPRET_OUTPUT_AS_3D)
                             ,
                             uint dst_cross_plane_pad
#endif
                             )
{

#define RHS_BLOCK_SIZE ((K0) * (N0))

#define RHS_OFFSET_X (RHS_BLOCK_SIZE)

    uint x = get_global_id(0);
    uint y = get_global_id(1);
    uint z = get_global_id(2);

#if defined(DUMMY_WORK_ITEMS)
    // Work-items whose block starts at or beyond N columns / M rows have nothing to do.
    if((x * N0 >= N) || (y * M0 >= M))
    {
        return;
    }
#endif

    // Byte offset of the first LHS row handled here; the start row comes from
    // COMPUTE_M0_START_ROW (defined elsewhere).
    uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;

    // Byte offset of the first RHS column handled here.
    uint rhs_offset = rhs_offset_first_element_in_bytes + x * N0 * sizeof(DATA_TYPE);

#if defined(MATRIX_B_DEPTH)
    // The RHS slice index wraps around MATRIX_B_DEPTH.
    rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;
#else
    rhs_offset += z * rhs_stride_z;
#endif

    // zlhs*: per-row LHS offsets (stay 0 unless REINTERPRET_INPUT_AS_3D); zero*: 16 all-zero offsets.
    REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0);
    REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);

#if defined(REINTERPRET_INPUT_AS_3D)

    CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);

    lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;

#else

    lhs_offset += z * lhs_stride_z;

#endif

    // Accumulators c0..c(M0-1), zero-initialised.
    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0);

    int i = 0;
#if K0 > 1
    // Main loop: consume K in K0-wide steps for as long as a full step fits.
    for(; i <= (K - K0); i += K0)
    {
        // a0..a(M0-1): K0 values from each LHS row.
        LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);

        // b0..b(K0-1): N0 values from each of K0 RHS rows.
        LOAD_BLOCK(K0, N0, DATA_TYPE, b, rhs_ptr, rhs_offset, rhs_stride_y, zero);

        RHS_VFMA_M0xN0(0, a, b0, c);
        RHS_VFMA_M0xN0(1, a, b1, c);
#if K0 > 2
        RHS_VFMA_M0xN0(2, a, b2, c);
#endif
#if K0 > 3
        RHS_VFMA_M0xN0(3, a, b3, c);
#endif
#if K0 > 4
        RHS_VFMA_M0xN0(4, a, b4, c);
        RHS_VFMA_M0xN0(5, a, b5, c);
        RHS_VFMA_M0xN0(6, a, b6, c);
        RHS_VFMA_M0xN0(7, a, b7, c);
#endif
#if K0 > 8
        RHS_VFMA_M0xN0(8, a, b8, c);
        RHS_VFMA_M0xN0(9, a, b9, c);
        RHS_VFMA_M0xN0(A, a, bA, c);
        RHS_VFMA_M0xN0(B, a, bB, c);
        RHS_VFMA_M0xN0(C, a, bC, c);
        RHS_VFMA_M0xN0(D, a, bD, c);
        RHS_VFMA_M0xN0(E, a, bE, c);
        RHS_VFMA_M0xN0(F, a, bF, c);
#endif

        lhs_offset += K0 * sizeof(DATA_TYPE);
        rhs_offset += K0 * rhs_stride_y;
    }
#endif

    // Leftover loop: one K element per iteration.
    // Each a<r> is a 2-lane vector initialised from a single scalar load, presumably so that the
    // .s0 access performed by RHS_VFMA_M0xN0(0, ...) is valid.
    while(i < K)
    {
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zlhs0));
#if M0 > 1
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zlhs1));
#endif
#if M0 > 2
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zlhs2));
#endif
#if M0 > 3
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zlhs3));
#endif
#if M0 > 4
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zlhs4));
#endif
#if M0 > 5
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zlhs5));
#endif
#if M0 > 6
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zlhs6));
#endif
#if M0 > 7
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zlhs7));
#endif

        VEC_DATA_TYPE(DATA_TYPE, N0)
        b = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * rhs_stride_y));
        RHS_VFMA_M0xN0(0, a, b, c);

        lhs_offset += sizeof(DATA_TYPE);
        rhs_offset += rhs_stride_y;

        ++i;
    }

    // Destination address of this work-item's block (dimension-2 offset is added below).
    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);

    REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);

#if defined(REINTERPRET_OUTPUT_AS_3D)

    CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);

    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

#else

    dst_addr += z * dst_stride_z;

#endif

#if defined(ALPHA)
    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
#endif

#if defined(BETA)
#if defined(BROADCAST_BIAS)
    // A single bias row is loaded and added to every accumulator row.
    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));

    LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

#ifndef UNIT_BETA
    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
#endif

    ADD_BLOCK_BROADCAST(M0, c, bias0);

#else
    // A full M0-row bias block is loaded from the position matching this work-item's output block.
    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;

    LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

#ifndef UNIT_BETA
    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
#endif

    ADD_BLOCK(M0, c, bias);

#endif
#endif

#if defined(ACTIVATION_TYPE)
    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, N0, c, A_VAL, B_VAL);
#endif

    // Flags handed to the boundary-aware store: cond_y is set only for the work-items with
    // y == 0, cond_x for the block that reaches or crosses column N.
    const bool cond_y = y == 0;
    const bool cond_x = ((x + 1) * N0 >= N);

    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
}
#endif
#endif

#if defined(BETA)

__kernel void gemm_ma_f32(TENSOR3D_DECLARATION(src),
TENSOR3D_DECLARATION(dst))
{
    // In-place accumulate on four floats: dst <- dst + BETA * src.
    // Both tensors are addressed through the structs built by CONVERT_TO_TENSOR3D_STRUCT.
    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);

    __global float *acc_ptr = (__global float *)dst.ptr;
    __global float *add_ptr = (__global float *)src.ptr;

    // Current contents of dst are read first, then the src values that get scaled by BETA.
    const float4 acc    = vload4(0, acc_ptr);
    const float4 addend = vload4(0, add_ptr);

    // Write the sum back over the dst values that were just read.
    vstore4(acc + (float4)BETA * addend, 0, acc_ptr);
}

#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)

__kernel void gemm_ma_f16(TENSOR3D_DECLARATION(src),
                          TENSOR3D_DECLARATION(dst))
{
    // Half-precision variant: same in-place dst <- dst + BETA * src,
    // but on eight half values per load/store.
    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);

    __global half *acc_ptr = (__global half *)dst.ptr;
    __global half *add_ptr = (__global half *)src.ptr;

    // Current contents of dst are read first, then the src values that get scaled by BETA.
    const half8 acc    = vload8(0, acc_ptr);
    const half8 addend = vload8(0, add_ptr);

    // Write the sum back over the dst values that were just read.
    vstore8(acc + (half8)BETA * addend, 0, acc_ptr);
}
#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
#endif // defined(BETA)

)"