1*32afb93cSXin Li /*
2*32afb93cSXin Li  * Copyright (C) 2013 The Android Open Source Project
3*32afb93cSXin Li  *
4*32afb93cSXin Li  * Licensed under the Apache License, Version 2.0 (the "License");
5*32afb93cSXin Li  * you may not use this file except in compliance with the License.
6*32afb93cSXin Li  * You may obtain a copy of the License at
7*32afb93cSXin Li  *
8*32afb93cSXin Li  *      http://www.apache.org/licenses/LICENSE-2.0
9*32afb93cSXin Li  *
10*32afb93cSXin Li  * Unless required by applicable law or agreed to in writing, software
11*32afb93cSXin Li  * distributed under the License is distributed on an "AS IS" BASIS,
12*32afb93cSXin Li  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13*32afb93cSXin Li  * See the License for the specific language governing permissions and
14*32afb93cSXin Li  * limitations under the License.
15*32afb93cSXin Li  */
16*32afb93cSXin Li 
17*32afb93cSXin Li #include <cstdint>
18*32afb93cSXin Li 
19*32afb93cSXin Li #include "RenderScriptToolkit.h"
20*32afb93cSXin Li #include "TaskProcessor.h"
21*32afb93cSXin Li #include "Utils.h"
22*32afb93cSXin Li 
23*32afb93cSXin Li #define LOG_TAG "renderscript.toolkit.YuvToRgb"
24*32afb93cSXin Li 
25*32afb93cSXin Li namespace renderscript {
26*32afb93cSXin Li 
/**
 * Rounds val up to the nearest multiple of 16.
 *
 * The mask is built at size_t width. A plain `~15u` is an unsigned int, which is
 * zero-extended when combined with a wider size_t and would therefore clear the
 * upper bits of the result.
 *
 * @param val Value to round. val + 15 wraps around if it exceeds the size_t range.
 * @return The smallest multiple of 16 that is greater than or equal to val.
 */
inline size_t roundUpTo16(size_t val) {
    constexpr size_t kAlignMask = 15;
    return (val + kAlignMask) & ~kAlignMask;
}
30*32afb93cSXin Li 
31*32afb93cSXin Li class YuvToRgbTask : public Task {
32*32afb93cSXin Li     uchar4* mOut;
33*32afb93cSXin Li     size_t mCstep;
34*32afb93cSXin Li     size_t mStrideY;
35*32afb93cSXin Li     size_t mStrideU;
36*32afb93cSXin Li     size_t mStrideV;
37*32afb93cSXin Li     const uchar* mInY;
38*32afb93cSXin Li     const uchar* mInU;
39*32afb93cSXin Li     const uchar* mInV;
40*32afb93cSXin Li 
41*32afb93cSXin Li     void kernel(uchar4* out, uint32_t xstart, uint32_t xend, uint32_t currentY);
42*32afb93cSXin Li     // Process a 2D tile of the overall work. threadIndex identifies which thread does the work.
43*32afb93cSXin Li     void processData(int threadIndex, size_t startX, size_t startY, size_t endX,
44*32afb93cSXin Li                      size_t endY) override;
45*32afb93cSXin Li 
46*32afb93cSXin Li    public:
YuvToRgbTask(const uint8_t * input,uint8_t * output,size_t sizeX,size_t sizeY,RenderScriptToolkit::YuvFormat format)47*32afb93cSXin Li     YuvToRgbTask(const uint8_t* input, uint8_t* output, size_t sizeX, size_t sizeY,
48*32afb93cSXin Li                  RenderScriptToolkit::YuvFormat format)
49*32afb93cSXin Li         : Task{sizeX, sizeY, 4, false, nullptr}, mOut{reinterpret_cast<uchar4*>(output)} {
50*32afb93cSXin Li         switch (format) {
51*32afb93cSXin Li             case RenderScriptToolkit::YuvFormat::NV21:
52*32afb93cSXin Li                 mCstep = 2;
53*32afb93cSXin Li                 mStrideY = sizeX;
54*32afb93cSXin Li                 mStrideU = mStrideY;
55*32afb93cSXin Li                 mStrideV = mStrideY;
56*32afb93cSXin Li                 mInY = reinterpret_cast<const uchar*>(input);
57*32afb93cSXin Li                 mInV = reinterpret_cast<const uchar*>(input + mStrideY * sizeY);
58*32afb93cSXin Li                 mInU = mInV + 1;
59*32afb93cSXin Li                 break;
60*32afb93cSXin Li             case RenderScriptToolkit::YuvFormat::YV12:
61*32afb93cSXin Li                 mCstep = 1;
62*32afb93cSXin Li                 mStrideY = roundUpTo16(sizeX);
63*32afb93cSXin Li                 mStrideU = roundUpTo16(mStrideY >> 1u);
64*32afb93cSXin Li                 mStrideV = mStrideU;
65*32afb93cSXin Li                 mInY = reinterpret_cast<const uchar*>(input);
66*32afb93cSXin Li                 mInU = reinterpret_cast<const uchar*>(input + mStrideY * sizeY);
67*32afb93cSXin Li                 mInV = mInU + mStrideV * sizeY / 2;
68*32afb93cSXin Li                 break;
69*32afb93cSXin Li         }
70*32afb93cSXin Li     }
71*32afb93cSXin Li };
72*32afb93cSXin Li 
processData(int,size_t startX,size_t startY,size_t endX,size_t endY)73*32afb93cSXin Li void YuvToRgbTask::processData(int /* threadIndex */, size_t startX, size_t startY, size_t endX,
74*32afb93cSXin Li                                size_t endY) {
75*32afb93cSXin Li     for (size_t y = startY; y < endY; y++) {
76*32afb93cSXin Li         size_t offset = mSizeX * y + startX;
77*32afb93cSXin Li         uchar4* out = mOut + offset;
78*32afb93cSXin Li         kernel(out, startX, endX, y);
79*32afb93cSXin Li     }
80*32afb93cSXin Li }
81*32afb93cSXin Li 
rsYuvToRGBA_uchar4(uchar y,uchar u,uchar v)82*32afb93cSXin Li static uchar4 rsYuvToRGBA_uchar4(uchar y, uchar u, uchar v) {
83*32afb93cSXin Li     int16_t Y = ((int16_t)y) - 16;
84*32afb93cSXin Li     int16_t U = ((int16_t)u) - 128;
85*32afb93cSXin Li     int16_t V = ((int16_t)v) - 128;
86*32afb93cSXin Li 
87*32afb93cSXin Li     short4 p;
88*32afb93cSXin Li     p.x = (Y * 298 + V * 409 + 128) >> 8;
89*32afb93cSXin Li     p.y = (Y * 298 - U * 100 - V * 208 + 128) >> 8;
90*32afb93cSXin Li     p.z = (Y * 298 + U * 516 + 128) >> 8;
91*32afb93cSXin Li     p.w = 255;
92*32afb93cSXin Li     if(p.x < 0) {
93*32afb93cSXin Li         p.x = 0;
94*32afb93cSXin Li     }
95*32afb93cSXin Li     if(p.x > 255) {
96*32afb93cSXin Li         p.x = 255;
97*32afb93cSXin Li     }
98*32afb93cSXin Li     if(p.y < 0) {
99*32afb93cSXin Li         p.y = 0;
100*32afb93cSXin Li     }
101*32afb93cSXin Li     if(p.y > 255) {
102*32afb93cSXin Li         p.y = 255;
103*32afb93cSXin Li     }
104*32afb93cSXin Li     if(p.z < 0) {
105*32afb93cSXin Li         p.z = 0;
106*32afb93cSXin Li     }
107*32afb93cSXin Li     if(p.z > 255) {
108*32afb93cSXin Li         p.z = 255;
109*32afb93cSXin Li     }
110*32afb93cSXin Li 
111*32afb93cSXin Li     return (uchar4){static_cast<uchar>(p.x), static_cast<uchar>(p.y),
112*32afb93cSXin Li                     static_cast<uchar>(p.z), static_cast<uchar>(p.w)};
113*32afb93cSXin Li }
114*32afb93cSXin Li 
115*32afb93cSXin Li extern "C" void rsdIntrinsicYuv_K(void *dst, const uchar *Y, const uchar *uv, uint32_t xstart,
116*32afb93cSXin Li                                   size_t xend);
117*32afb93cSXin Li extern "C" void rsdIntrinsicYuvR_K(void *dst, const uchar *Y, const uchar *uv, uint32_t xstart,
118*32afb93cSXin Li                                    size_t xend);
119*32afb93cSXin Li extern "C" void rsdIntrinsicYuv2_K(void *dst, const uchar *Y, const uchar *u, const uchar *v,
120*32afb93cSXin Li                                    size_t xstart, size_t xend);
121*32afb93cSXin Li 
kernel(uchar4 * out,uint32_t xstart,uint32_t xend,uint32_t currentY)122*32afb93cSXin Li void YuvToRgbTask::kernel(uchar4 *out, uint32_t xstart, uint32_t xend, uint32_t currentY) {
123*32afb93cSXin Li     //ALOGI("kernel out %p, xstart=%u, xend=%u, currentY=%u", out, xstart, xend, currentY);
124*32afb93cSXin Li 
125*32afb93cSXin Li     const uchar *y = mInY + (currentY * mStrideY);
126*32afb93cSXin Li     const uchar *v = mInV + ((currentY >> 1) * mStrideV);
127*32afb93cSXin Li     const uchar *u = mInU + ((currentY >> 1) * mStrideU);
128*32afb93cSXin Li 
129*32afb93cSXin Li     //ALOGI("pinY %p, pinV %p, pinU %p", pinY, pinV, pinU);
130*32afb93cSXin Li 
131*32afb93cSXin Li     uint32_t x1 = xstart;
132*32afb93cSXin Li     uint32_t x2 = xend;
133*32afb93cSXin Li 
134*32afb93cSXin Li     /*
135*32afb93cSXin Li     ALOGE("pinY, %p, Y, %p, currentY, %d, strideY, %zu", pinY, y, currentY, mStrideY);
136*32afb93cSXin Li     ALOGE("pinU, %p, U, %p, currentY, %d, strideU, %zu", pinU, u, currentY, mStrideU);
137*32afb93cSXin Li     ALOGE("pinV, %p, V, %p, currentY, %d, strideV, %zu", pinV, v, currentY, mStrideV);
138*32afb93cSXin Li     ALOGE("dimX, %d, dimY, %d", cp->alloc->mHal.drvState.lod[0].dimX,
139*32afb93cSXin Li           cp->alloc->mHal.drvState.lod[0].dimY);
140*32afb93cSXin Li     ALOGE("info->dim.x, %d, info->dim.y, %d", info->dim.x, info->dim.y);
141*32afb93cSXin Li     uchar* pinY = (uchar*)mInY;
142*32afb93cSXin Li     uchar* pinU = (uchar*)mInU;
143*32afb93cSXin Li     uchar* pinV = (uchar*)mInV;
144*32afb93cSXin Li     ALOGE("Y %p %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx "
145*32afb93cSXin Li           "%02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx",
146*32afb93cSXin Li           pinY, pinY[0], pinY[1], pinY[2], pinY[3], pinY[4], pinY[5], pinY[6], pinY[7], pinY[8],
147*32afb93cSXin Li           pinY[9], pinY[10], pinY[11], pinY[12], pinY[13], pinY[14], pinY[15]);
148*32afb93cSXin Li     ALOGE("Y %p %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx "
149*32afb93cSXin Li           "%02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx",
150*32afb93cSXin Li           pinY, pinY[16], pinY[17], pinY[18], pinY[19], pinY[20], pinY[21], pinY[22], pinY[23],
151*32afb93cSXin Li           pinY[24], pinY[25], pinY[26], pinY[27], pinY[28], pinY[29], pinY[30], pinY[31]);
152*32afb93cSXin Li     ALOGE("Y %p %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx "
153*32afb93cSXin Li           "%02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx",
154*32afb93cSXin Li           pinY, pinY[32], pinY[33], pinY[34], pinY[35], pinY[36], pinY[37], pinY[38], pinY[39],
155*32afb93cSXin Li           pinY[40], pinY[41], pinY[42], pinY[43], pinY[44], pinY[45], pinY[46], pinY[47]);
156*32afb93cSXin Li 
157*32afb93cSXin Li     ALOGE("U %p %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx "
158*32afb93cSXin Li           "%02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx",
159*32afb93cSXin Li           pinU, pinU[0], pinU[1], pinU[2], pinU[3], pinU[4], pinU[5], pinU[6], pinU[7], pinU[8],
160*32afb93cSXin Li           pinU[9], pinU[10], pinU[11], pinU[12], pinU[13], pinU[14], pinU[15]);
161*32afb93cSXin Li     ALOGE("U %p %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx "
162*32afb93cSXin Li           "%02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx",
163*32afb93cSXin Li           pinU, pinU[16], pinU[17], pinU[18], pinU[19], pinU[20], pinU[21], pinU[22], pinU[23],
164*32afb93cSXin Li           pinU[24], pinU[25], pinU[26], pinU[27], pinU[28], pinU[29], pinU[30], pinU[31]);
165*32afb93cSXin Li     ALOGE("U %p %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx "
166*32afb93cSXin Li           "%02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx",
167*32afb93cSXin Li           pinU, pinU[32], pinU[33], pinU[34], pinU[35], pinU[36], pinU[37], pinU[38], pinU[39],
168*32afb93cSXin Li           pinU[40], pinU[41], pinU[42], pinU[43], pinU[44], pinU[45], pinU[46], pinU[47]);
169*32afb93cSXin Li 
170*32afb93cSXin Li     ALOGE("V %p %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx "
171*32afb93cSXin Li           "%02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx",
172*32afb93cSXin Li           pinV, pinV[0], pinV[1], pinV[2], pinV[3], pinV[4], pinV[5], pinV[6], pinV[7], pinV[8],
173*32afb93cSXin Li           pinV[9], pinV[10], pinV[11], pinV[12], pinV[13], pinV[14], pinV[15]);
174*32afb93cSXin Li     ALOGE("V %p %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx "
175*32afb93cSXin Li           "%02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx",
176*32afb93cSXin Li           pinV, pinV[16], pinV[17], pinV[18], pinV[19], pinV[20], pinV[21], pinV[22], pinV[23],
177*32afb93cSXin Li           pinV[24], pinV[25], pinV[26], pinV[27], pinV[28], pinV[29], pinV[30], pinV[31]);
178*32afb93cSXin Li     ALOGE("V %p %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx "
179*32afb93cSXin Li           "%02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx",
180*32afb93cSXin Li           pinV, pinV[32], pinV[33], pinV[34], pinV[35], pinV[36], pinV[37], pinV[38], pinV[39],
181*32afb93cSXin Li           pinV[40], pinV[41], pinV[42], pinV[43], pinV[44], pinV[45], pinV[46], pinV[47]);
182*32afb93cSXin Li     */
183*32afb93cSXin Li 
184*32afb93cSXin Li     /* If we start on an odd pixel then deal with it here and bump things along
185*32afb93cSXin Li      * so that subsequent code can carry on with even-odd pairing assumptions.
186*32afb93cSXin Li      */
187*32afb93cSXin Li     if((x1 & 1) && (x2 > x1)) {
188*32afb93cSXin Li         int cx = (x1 >> 1) * mCstep;
189*32afb93cSXin Li         *out = rsYuvToRGBA_uchar4(y[x1], u[cx], v[cx]);
190*32afb93cSXin Li         out++;
191*32afb93cSXin Li         x1++;
192*32afb93cSXin Li     }
193*32afb93cSXin Li 
194*32afb93cSXin Li #if defined(ARCH_ARM_USE_INTRINSICS)
195*32afb93cSXin Li     if((x2 > x1) && mUsesSimd) {
196*32afb93cSXin Li         int32_t len = x2 - x1;
197*32afb93cSXin Li         if (mCstep == 1) {
198*32afb93cSXin Li             rsdIntrinsicYuv2_K(out, y, u, v, x1, x2);
199*32afb93cSXin Li             x1 += len;
200*32afb93cSXin Li             out += len;
201*32afb93cSXin Li         } else if (mCstep == 2) {
202*32afb93cSXin Li             // Check for proper interleave
203*32afb93cSXin Li             intptr_t ipu = (intptr_t)u;
204*32afb93cSXin Li             intptr_t ipv = (intptr_t)v;
205*32afb93cSXin Li 
206*32afb93cSXin Li             if (ipu == (ipv + 1)) {
207*32afb93cSXin Li                 rsdIntrinsicYuv_K(out, y, v, x1, x2);
208*32afb93cSXin Li                 x1 += len;
209*32afb93cSXin Li                 out += len;
210*32afb93cSXin Li             } else if (ipu == (ipv - 1)) {
211*32afb93cSXin Li                 rsdIntrinsicYuvR_K(out, y, u, x1, x2);
212*32afb93cSXin Li                 x1 += len;
213*32afb93cSXin Li                 out += len;
214*32afb93cSXin Li             }
215*32afb93cSXin Li         }
216*32afb93cSXin Li     }
217*32afb93cSXin Li #endif
218*32afb93cSXin Li 
219*32afb93cSXin Li     if(x2 > x1) {
220*32afb93cSXin Li        // ALOGE("y %i  %i  %i", currentY, x1, x2);
221*32afb93cSXin Li         while(x1 < x2) {
222*32afb93cSXin Li             int cx = (x1 >> 1) * mCstep;
223*32afb93cSXin Li             *out = rsYuvToRGBA_uchar4(y[x1], u[cx], v[cx]);
224*32afb93cSXin Li             out++;
225*32afb93cSXin Li             x1++;
226*32afb93cSXin Li             *out = rsYuvToRGBA_uchar4(y[x1], u[cx], v[cx]);
227*32afb93cSXin Li             out++;
228*32afb93cSXin Li             x1++;
229*32afb93cSXin Li         }
230*32afb93cSXin Li     }
231*32afb93cSXin Li }
232*32afb93cSXin Li 
yuvToRgb(const uint8_t * input,uint8_t * output,size_t sizeX,size_t sizeY,YuvFormat format)233*32afb93cSXin Li void RenderScriptToolkit::yuvToRgb(const uint8_t* input, uint8_t* output, size_t sizeX,
234*32afb93cSXin Li                                    size_t sizeY, YuvFormat format) {
235*32afb93cSXin Li     YuvToRgbTask task(input, output, sizeX, sizeY, format);
236*32afb93cSXin Li     processor->doTask(&task);
237*32afb93cSXin Li }
238*32afb93cSXin Li 
239*32afb93cSXin Li }  // namespace renderscript
240