1*32afb93cSXin Li /*
2*32afb93cSXin Li  * Copyright (C) 2012 The Android Open Source Project
3*32afb93cSXin Li  *
4*32afb93cSXin Li  * Licensed under the Apache License, Version 2.0 (the "License");
5*32afb93cSXin Li  * you may not use this file except in compliance with the License.
6*32afb93cSXin Li  * You may obtain a copy of the License at
7*32afb93cSXin Li  *
8*32afb93cSXin Li  *      http://www.apache.org/licenses/LICENSE-2.0
9*32afb93cSXin Li  *
10*32afb93cSXin Li  * Unless required by applicable law or agreed to in writing, software
11*32afb93cSXin Li  * distributed under the License is distributed on an "AS IS" BASIS,
12*32afb93cSXin Li  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13*32afb93cSXin Li  * See the License for the specific language governing permissions and
14*32afb93cSXin Li  * limitations under the License.
15*32afb93cSXin Li  */
16*32afb93cSXin Li 
17*32afb93cSXin Li #include <cmath>
18*32afb93cSXin Li #include <cstdint>
19*32afb93cSXin Li 
20*32afb93cSXin Li #include "RenderScriptToolkit.h"
21*32afb93cSXin Li #include "TaskProcessor.h"
22*32afb93cSXin Li #include "Utils.h"
23*32afb93cSXin Li 
24*32afb93cSXin Li namespace renderscript {
25*32afb93cSXin Li 
26*32afb93cSXin Li #define LOG_TAG "renderscript.toolkit.Blur"
27*32afb93cSXin Li 
28*32afb93cSXin Li /**
29*32afb93cSXin Li  * Blurs an image or a section of an image.
30*32afb93cSXin Li  *
31*32afb93cSXin Li  * Our algorithm does two passes: a vertical blur followed by an horizontal blur.
32*32afb93cSXin Li  */
33*32afb93cSXin Li class BlurTask : public Task {
34*32afb93cSXin Li     // The image we're blurring.
35*32afb93cSXin Li     const uchar* mIn;
36*32afb93cSXin Li     // Where we store the blurred image.
37*32afb93cSXin Li     uchar* outArray;
38*32afb93cSXin Li     // The size of the kernel radius is limited to 25 in ScriptIntrinsicBlur.java.
39*32afb93cSXin Li     // So, the max kernel size is 51 (= 2 * 25 + 1).
40*32afb93cSXin Li     // Considering SSSE3 case, which requires the size is multiple of 4,
41*32afb93cSXin Li     // at least 52 words are necessary. Values outside of the kernel should be 0.
42*32afb93cSXin Li     float mFp[104];
43*32afb93cSXin Li     uint16_t mIp[104];
44*32afb93cSXin Li 
45*32afb93cSXin Li     // Working area to store the result of the vertical blur, to be used by the horizontal pass.
46*32afb93cSXin Li     // There's one area per thread. Since the needed working area may be too large to put on the
47*32afb93cSXin Li     // stack, we are allocating it from the heap. To avoid paying the allocation cost for each
48*32afb93cSXin Li     // tile, we cache the scratch area here.
49*32afb93cSXin Li     std::vector<void*> mScratch;       // Pointers to the scratch areas, one per thread.
50*32afb93cSXin Li     std::vector<size_t> mScratchSize;  // The size in bytes of the scratch areas, one per thread.
51*32afb93cSXin Li 
52*32afb93cSXin Li     // The radius of the blur, in floating point and integer format.
53*32afb93cSXin Li     float mRadius;
54*32afb93cSXin Li     int mIradius;
55*32afb93cSXin Li 
56*32afb93cSXin Li     void kernelU4(void* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY,
57*32afb93cSXin Li                   uint32_t threadIndex);
58*32afb93cSXin Li     void kernelU1(void* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY);
59*32afb93cSXin Li     void ComputeGaussianWeights();
60*32afb93cSXin Li 
61*32afb93cSXin Li     // Process a 2D tile of the overall work. threadIndex identifies which thread does the work.
62*32afb93cSXin Li     void processData(int threadIndex, size_t startX, size_t startY, size_t endX,
63*32afb93cSXin Li                      size_t endY) override;
64*32afb93cSXin Li 
65*32afb93cSXin Li    public:
BlurTask(const uint8_t * in,uint8_t * out,size_t sizeX,size_t sizeY,size_t vectorSize,uint32_t threadCount,float radius,const Restriction * restriction)66*32afb93cSXin Li     BlurTask(const uint8_t* in, uint8_t* out, size_t sizeX, size_t sizeY, size_t vectorSize,
67*32afb93cSXin Li              uint32_t threadCount, float radius, const Restriction* restriction)
68*32afb93cSXin Li         : Task{sizeX, sizeY, vectorSize, false, restriction},
69*32afb93cSXin Li           mIn{in},
70*32afb93cSXin Li           outArray{out},
71*32afb93cSXin Li           mScratch{threadCount},
72*32afb93cSXin Li           mScratchSize{threadCount},
73*32afb93cSXin Li           mRadius{std::min(25.0f, radius)} {
74*32afb93cSXin Li         ComputeGaussianWeights();
75*32afb93cSXin Li     }
76*32afb93cSXin Li 
~BlurTask()77*32afb93cSXin Li     ~BlurTask() {
78*32afb93cSXin Li         for (size_t i = 0; i < mScratch.size(); i++) {
79*32afb93cSXin Li             if (mScratch[i]) {
80*32afb93cSXin Li                 free(mScratch[i]);
81*32afb93cSXin Li             }
82*32afb93cSXin Li         }
83*32afb93cSXin Li     }
84*32afb93cSXin Li };
85*32afb93cSXin Li 
ComputeGaussianWeights()86*32afb93cSXin Li void BlurTask::ComputeGaussianWeights() {
87*32afb93cSXin Li     memset(mFp, 0, sizeof(mFp));
88*32afb93cSXin Li     memset(mIp, 0, sizeof(mIp));
89*32afb93cSXin Li 
90*32afb93cSXin Li     // Compute gaussian weights for the blur
91*32afb93cSXin Li     // e is the euler's number
92*32afb93cSXin Li     float e = 2.718281828459045f;
93*32afb93cSXin Li     float pi = 3.1415926535897932f;
94*32afb93cSXin Li     // g(x) = (1 / (sqrt(2 * pi) * sigma)) * e ^ (-x^2 / (2 * sigma^2))
95*32afb93cSXin Li     // x is of the form [-radius .. 0 .. radius]
96*32afb93cSXin Li     // and sigma varies with the radius.
97*32afb93cSXin Li     // Based on some experimental radius values and sigmas,
98*32afb93cSXin Li     // we approximately fit sigma = f(radius) as
99*32afb93cSXin Li     // sigma = radius * 0.4  + 0.6
100*32afb93cSXin Li     // The larger the radius gets, the more our gaussian blur
101*32afb93cSXin Li     // will resemble a box blur since with large sigma
102*32afb93cSXin Li     // the gaussian curve begins to lose its shape
103*32afb93cSXin Li     float sigma = 0.4f * mRadius + 0.6f;
104*32afb93cSXin Li 
105*32afb93cSXin Li     // Now compute the coefficients. We will store some redundant values to save
106*32afb93cSXin Li     // some math during the blur calculations precompute some values
107*32afb93cSXin Li     float coeff1 = 1.0f / (sqrtf(2.0f * pi) * sigma);
108*32afb93cSXin Li     float coeff2 = - 1.0f / (2.0f * sigma * sigma);
109*32afb93cSXin Li 
110*32afb93cSXin Li     float normalizeFactor = 0.0f;
111*32afb93cSXin Li     float floatR = 0.0f;
112*32afb93cSXin Li     int r;
113*32afb93cSXin Li     mIradius = (float)ceil(mRadius) + 0.5f;
114*32afb93cSXin Li     for (r = -mIradius; r <= mIradius; r ++) {
115*32afb93cSXin Li         floatR = (float)r;
116*32afb93cSXin Li         mFp[r + mIradius] = coeff1 * powf(e, floatR * floatR * coeff2);
117*32afb93cSXin Li         normalizeFactor += mFp[r + mIradius];
118*32afb93cSXin Li     }
119*32afb93cSXin Li 
120*32afb93cSXin Li     // Now we need to normalize the weights because all our coefficients need to add up to one
121*32afb93cSXin Li     normalizeFactor = 1.0f / normalizeFactor;
122*32afb93cSXin Li     for (r = -mIradius; r <= mIradius; r ++) {
123*32afb93cSXin Li         mFp[r + mIradius] *= normalizeFactor;
124*32afb93cSXin Li         mIp[r + mIradius] = (uint16_t)(mFp[r + mIradius] * 65536.0f + 0.5f);
125*32afb93cSXin Li     }
126*32afb93cSXin Li }
127*32afb93cSXin Li 
128*32afb93cSXin Li /**
129*32afb93cSXin Li  * Vertical blur of a uchar4 line.
130*32afb93cSXin Li  *
131*32afb93cSXin Li  * @param sizeY Number of cells of the input array in the vertical direction.
132*32afb93cSXin Li  * @param out Where to place the computed value.
133*32afb93cSXin Li  * @param x Coordinate of the point we're blurring.
134*32afb93cSXin Li  * @param y Coordinate of the point we're blurring.
135*32afb93cSXin Li  * @param ptrIn Start of the input array.
136*32afb93cSXin Li  * @param iStride The size in byte of a row of the input array.
137*32afb93cSXin Li  * @param gPtr The gaussian coefficients.
138*32afb93cSXin Li  * @param iradius The radius of the blur.
139*32afb93cSXin Li  */
OneVU4(uint32_t sizeY,float4 * out,int32_t x,int32_t y,const uchar * ptrIn,int iStride,const float * gPtr,int iradius)140*32afb93cSXin Li static void OneVU4(uint32_t sizeY, float4* out, int32_t x, int32_t y, const uchar* ptrIn,
141*32afb93cSXin Li                    int iStride, const float* gPtr, int iradius) {
142*32afb93cSXin Li     const uchar *pi = ptrIn + x*4;
143*32afb93cSXin Li 
144*32afb93cSXin Li     float4 blurredPixel = 0;
145*32afb93cSXin Li     for (int r = -iradius; r <= iradius; r ++) {
146*32afb93cSXin Li         int validY = std::max((y + r), 0);
147*32afb93cSXin Li         validY = std::min(validY, (int)(sizeY - 1));
148*32afb93cSXin Li         const uchar4 *pvy = (const uchar4 *)&pi[validY * iStride];
149*32afb93cSXin Li         float4 pf = convert<float4>(pvy[0]);
150*32afb93cSXin Li         blurredPixel += pf * gPtr[0];
151*32afb93cSXin Li         gPtr++;
152*32afb93cSXin Li     }
153*32afb93cSXin Li 
154*32afb93cSXin Li     out[0] = blurredPixel;
155*32afb93cSXin Li }
156*32afb93cSXin Li 
157*32afb93cSXin Li /**
158*32afb93cSXin Li  * Vertical blur of a uchar1 line.
159*32afb93cSXin Li  *
160*32afb93cSXin Li  * @param sizeY Number of cells of the input array in the vertical direction.
161*32afb93cSXin Li  * @param out Where to place the computed value.
162*32afb93cSXin Li  * @param x Coordinate of the point we're blurring.
163*32afb93cSXin Li  * @param y Coordinate of the point we're blurring.
164*32afb93cSXin Li  * @param ptrIn Start of the input array.
165*32afb93cSXin Li  * @param iStride The size in byte of a row of the input array.
166*32afb93cSXin Li  * @param gPtr The gaussian coefficients.
167*32afb93cSXin Li  * @param iradius The radius of the blur.
168*32afb93cSXin Li  */
OneVU1(uint32_t sizeY,float * out,int32_t x,int32_t y,const uchar * ptrIn,int iStride,const float * gPtr,int iradius)169*32afb93cSXin Li static void OneVU1(uint32_t sizeY, float *out, int32_t x, int32_t y,
170*32afb93cSXin Li                    const uchar *ptrIn, int iStride, const float* gPtr, int iradius) {
171*32afb93cSXin Li 
172*32afb93cSXin Li     const uchar *pi = ptrIn + x;
173*32afb93cSXin Li 
174*32afb93cSXin Li     float blurredPixel = 0;
175*32afb93cSXin Li     for (int r = -iradius; r <= iradius; r ++) {
176*32afb93cSXin Li         int validY = std::max((y + r), 0);
177*32afb93cSXin Li         validY = std::min(validY, (int)(sizeY - 1));
178*32afb93cSXin Li         float pf = (float)pi[validY * iStride];
179*32afb93cSXin Li         blurredPixel += pf * gPtr[0];
180*32afb93cSXin Li         gPtr++;
181*32afb93cSXin Li     }
182*32afb93cSXin Li 
183*32afb93cSXin Li     out[0] = blurredPixel;
184*32afb93cSXin Li }
185*32afb93cSXin Li 
186*32afb93cSXin Li 
187*32afb93cSXin Li extern "C" void rsdIntrinsicBlurU1_K(uchar *out, uchar const *in, size_t w, size_t h,
188*32afb93cSXin Li                  size_t p, size_t x, size_t y, size_t count, size_t r, uint16_t const *tab);
189*32afb93cSXin Li extern "C" void rsdIntrinsicBlurU4_K(uchar4 *out, uchar4 const *in, size_t w, size_t h,
190*32afb93cSXin Li                  size_t p, size_t x, size_t y, size_t count, size_t r, uint16_t const *tab);
191*32afb93cSXin Li 
192*32afb93cSXin Li #if defined(ARCH_X86_HAVE_SSSE3)
193*32afb93cSXin Li extern void rsdIntrinsicBlurVFU4_K(void *dst, const void *pin, int stride, const void *gptr,
194*32afb93cSXin Li                                    int rct, int x1, int ct);
195*32afb93cSXin Li extern void rsdIntrinsicBlurHFU4_K(void *dst, const void *pin, const void *gptr, int rct, int x1,
196*32afb93cSXin Li                                    int ct);
197*32afb93cSXin Li extern void rsdIntrinsicBlurHFU1_K(void *dst, const void *pin, const void *gptr, int rct, int x1,
198*32afb93cSXin Li                                    int ct);
199*32afb93cSXin Li #endif
200*32afb93cSXin Li 
201*32afb93cSXin Li /**
202*32afb93cSXin Li  * Vertical blur of a line of RGBA, knowing that there's enough rows above and below us to avoid
203*32afb93cSXin Li  * dealing with boundary conditions.
204*32afb93cSXin Li  *
205*32afb93cSXin Li  * @param out Where to store the results. This is the input to the horizontal blur.
206*32afb93cSXin Li  * @param ptrIn The input data for this line.
207*32afb93cSXin Li  * @param iStride The width of the input.
208*32afb93cSXin Li  * @param gPtr The gaussian coefficients.
209*32afb93cSXin Li  * @param ct The diameter of the blur.
210*32afb93cSXin Li  * @param len How many cells to blur.
211*32afb93cSXin Li  * @param usesSimd Whether this processor supports SIMD.
212*32afb93cSXin Li  */
OneVFU4(float4 * out,const uchar * ptrIn,int iStride,const float * gPtr,int ct,int x2,bool usesSimd)213*32afb93cSXin Li static void OneVFU4(float4 *out, const uchar *ptrIn, int iStride, const float* gPtr, int ct,
214*32afb93cSXin Li                     int x2, bool usesSimd) {
215*32afb93cSXin Li     int x1 = 0;
216*32afb93cSXin Li #if defined(ARCH_X86_HAVE_SSSE3)
217*32afb93cSXin Li     if (usesSimd) {
218*32afb93cSXin Li         int t = (x2 - x1);
219*32afb93cSXin Li         t &= ~1;
220*32afb93cSXin Li         if (t) {
221*32afb93cSXin Li             rsdIntrinsicBlurVFU4_K(out, ptrIn, iStride, gPtr, ct, x1, x1 + t);
222*32afb93cSXin Li         }
223*32afb93cSXin Li         x1 += t;
224*32afb93cSXin Li         out += t;
225*32afb93cSXin Li         ptrIn += t << 2;
226*32afb93cSXin Li     }
227*32afb93cSXin Li #else
228*32afb93cSXin Li     (void) usesSimd; // Avoid unused parameter warning.
229*32afb93cSXin Li #endif
230*32afb93cSXin Li     while(x2 > x1) {
231*32afb93cSXin Li         const uchar *pi = ptrIn;
232*32afb93cSXin Li         float4 blurredPixel = 0;
233*32afb93cSXin Li         const float* gp = gPtr;
234*32afb93cSXin Li 
235*32afb93cSXin Li         for (int r = 0; r < ct; r++) {
236*32afb93cSXin Li             float4 pf = convert<float4>(((const uchar4 *)pi)[0]);
237*32afb93cSXin Li             blurredPixel += pf * gp[0];
238*32afb93cSXin Li             pi += iStride;
239*32afb93cSXin Li             gp++;
240*32afb93cSXin Li         }
241*32afb93cSXin Li         out->xyzw = blurredPixel;
242*32afb93cSXin Li         x1++;
243*32afb93cSXin Li         out++;
244*32afb93cSXin Li         ptrIn+=4;
245*32afb93cSXin Li     }
246*32afb93cSXin Li }
247*32afb93cSXin Li 
248*32afb93cSXin Li /**
249*32afb93cSXin Li  * Vertical blur of a line of U_8, knowing that there's enough rows above and below us to avoid
250*32afb93cSXin Li  * dealing with boundary conditions.
251*32afb93cSXin Li  *
252*32afb93cSXin Li  * @param out Where to store the results. This is the input to the horizontal blur.
253*32afb93cSXin Li  * @param ptrIn The input data for this line.
254*32afb93cSXin Li  * @param iStride The width of the input.
255*32afb93cSXin Li  * @param gPtr The gaussian coefficients.
256*32afb93cSXin Li  * @param ct The diameter of the blur.
257*32afb93cSXin Li  * @param len How many cells to blur.
258*32afb93cSXin Li  * @param usesSimd Whether this processor supports SIMD.
259*32afb93cSXin Li  */
OneVFU1(float * out,const uchar * ptrIn,int iStride,const float * gPtr,int ct,int len,bool usesSimd)260*32afb93cSXin Li static void OneVFU1(float* out, const uchar* ptrIn, int iStride, const float* gPtr, int ct, int len,
261*32afb93cSXin Li                     bool usesSimd) {
262*32afb93cSXin Li     int x1 = 0;
263*32afb93cSXin Li 
264*32afb93cSXin Li     while((len > x1) && (((uintptr_t)ptrIn) & 0x3)) {
265*32afb93cSXin Li         const uchar *pi = ptrIn;
266*32afb93cSXin Li         float blurredPixel = 0;
267*32afb93cSXin Li         const float* gp = gPtr;
268*32afb93cSXin Li 
269*32afb93cSXin Li         for (int r = 0; r < ct; r++) {
270*32afb93cSXin Li             float pf = (float)pi[0];
271*32afb93cSXin Li             blurredPixel += pf * gp[0];
272*32afb93cSXin Li             pi += iStride;
273*32afb93cSXin Li             gp++;
274*32afb93cSXin Li         }
275*32afb93cSXin Li         out[0] = blurredPixel;
276*32afb93cSXin Li         x1++;
277*32afb93cSXin Li         out++;
278*32afb93cSXin Li         ptrIn++;
279*32afb93cSXin Li         len--;
280*32afb93cSXin Li     }
281*32afb93cSXin Li #if defined(ARCH_X86_HAVE_SSSE3)
282*32afb93cSXin Li     if (usesSimd && (len > x1)) {
283*32afb93cSXin Li         int t = (len - x1) >> 2;
284*32afb93cSXin Li         t &= ~1;
285*32afb93cSXin Li         if (t) {
286*32afb93cSXin Li             rsdIntrinsicBlurVFU4_K(out, ptrIn, iStride, gPtr, ct, 0, t );
287*32afb93cSXin Li             len -= t << 2;
288*32afb93cSXin Li             ptrIn += t << 2;
289*32afb93cSXin Li             out += t << 2;
290*32afb93cSXin Li         }
291*32afb93cSXin Li     }
292*32afb93cSXin Li #else
293*32afb93cSXin Li     (void) usesSimd; // Avoid unused parameter warning.
294*32afb93cSXin Li #endif
295*32afb93cSXin Li     while(len > 0) {
296*32afb93cSXin Li         const uchar *pi = ptrIn;
297*32afb93cSXin Li         float blurredPixel = 0;
298*32afb93cSXin Li         const float* gp = gPtr;
299*32afb93cSXin Li 
300*32afb93cSXin Li         for (int r = 0; r < ct; r++) {
301*32afb93cSXin Li             float pf = (float)pi[0];
302*32afb93cSXin Li             blurredPixel += pf * gp[0];
303*32afb93cSXin Li             pi += iStride;
304*32afb93cSXin Li             gp++;
305*32afb93cSXin Li         }
306*32afb93cSXin Li         out[0] = blurredPixel;
307*32afb93cSXin Li         len--;
308*32afb93cSXin Li         out++;
309*32afb93cSXin Li         ptrIn++;
310*32afb93cSXin Li     }
311*32afb93cSXin Li }
312*32afb93cSXin Li 
313*32afb93cSXin Li /**
314*32afb93cSXin Li  * Horizontal blur of a uchar4 line.
315*32afb93cSXin Li  *
316*32afb93cSXin Li  * @param sizeX Number of cells of the input array in the horizontal direction.
317*32afb93cSXin Li  * @param out Where to place the computed value.
318*32afb93cSXin Li  * @param x Coordinate of the point we're blurring.
319*32afb93cSXin Li  * @param ptrIn The start of the input row from which we're indexing x.
320*32afb93cSXin Li  * @param gPtr The gaussian coefficients.
321*32afb93cSXin Li  * @param iradius The radius of the blur.
322*32afb93cSXin Li  */
OneHU4(uint32_t sizeX,uchar4 * out,int32_t x,const float4 * ptrIn,const float * gPtr,int iradius)323*32afb93cSXin Li static void OneHU4(uint32_t sizeX, uchar4* out, int32_t x, const float4* ptrIn, const float* gPtr,
324*32afb93cSXin Li                    int iradius) {
325*32afb93cSXin Li     float4 blurredPixel = 0;
326*32afb93cSXin Li     for (int r = -iradius; r <= iradius; r ++) {
327*32afb93cSXin Li         int validX = std::max((x + r), 0);
328*32afb93cSXin Li         validX = std::min(validX, (int)(sizeX - 1));
329*32afb93cSXin Li         float4 pf = ptrIn[validX];
330*32afb93cSXin Li         blurredPixel += pf * gPtr[0];
331*32afb93cSXin Li         gPtr++;
332*32afb93cSXin Li     }
333*32afb93cSXin Li 
334*32afb93cSXin Li     out->xyzw = convert<uchar4>(blurredPixel);
335*32afb93cSXin Li }
336*32afb93cSXin Li 
337*32afb93cSXin Li /**
338*32afb93cSXin Li  * Horizontal blur of a uchar line.
339*32afb93cSXin Li  *
340*32afb93cSXin Li  * @param sizeX Number of cells of the input array in the horizontal direction.
341*32afb93cSXin Li  * @param out Where to place the computed value.
342*32afb93cSXin Li  * @param x Coordinate of the point we're blurring.
343*32afb93cSXin Li  * @param ptrIn The start of the input row from which we're indexing x.
344*32afb93cSXin Li  * @param gPtr The gaussian coefficients.
345*32afb93cSXin Li  * @param iradius The radius of the blur.
346*32afb93cSXin Li  */
OneHU1(uint32_t sizeX,uchar * out,int32_t x,const float * ptrIn,const float * gPtr,int iradius)347*32afb93cSXin Li static void OneHU1(uint32_t sizeX, uchar* out, int32_t x, const float* ptrIn, const float* gPtr,
348*32afb93cSXin Li                    int iradius) {
349*32afb93cSXin Li     float blurredPixel = 0;
350*32afb93cSXin Li     for (int r = -iradius; r <= iradius; r ++) {
351*32afb93cSXin Li         int validX = std::max((x + r), 0);
352*32afb93cSXin Li         validX = std::min(validX, (int)(sizeX - 1));
353*32afb93cSXin Li         float pf = ptrIn[validX];
354*32afb93cSXin Li         blurredPixel += pf * gPtr[0];
355*32afb93cSXin Li         gPtr++;
356*32afb93cSXin Li     }
357*32afb93cSXin Li 
358*32afb93cSXin Li     out[0] = (uchar)blurredPixel;
359*32afb93cSXin Li }
360*32afb93cSXin Li 
361*32afb93cSXin Li /**
362*32afb93cSXin Li  * Full blur of a line of RGBA data.
363*32afb93cSXin Li  *
364*32afb93cSXin Li  * @param outPtr Where to store the results
365*32afb93cSXin Li  * @param xstart The index of the section we're starting to blur.
366*32afb93cSXin Li  * @param xend  The end index of the section.
367*32afb93cSXin Li  * @param currentY The index of the line we're blurring.
368*32afb93cSXin Li  * @param usesSimd Whether this processor supports SIMD.
369*32afb93cSXin Li  */
kernelU4(void * outPtr,uint32_t xstart,uint32_t xend,uint32_t currentY,uint32_t threadIndex)370*32afb93cSXin Li void BlurTask::kernelU4(void *outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY,
371*32afb93cSXin Li                         uint32_t threadIndex) {
372*32afb93cSXin Li     float4 stackbuf[2048];
373*32afb93cSXin Li     float4 *buf = &stackbuf[0];
374*32afb93cSXin Li     const uint32_t stride = mSizeX * mVectorSize;
375*32afb93cSXin Li 
376*32afb93cSXin Li     uchar4 *out = (uchar4 *)outPtr;
377*32afb93cSXin Li     uint32_t x1 = xstart;
378*32afb93cSXin Li     uint32_t x2 = xend;
379*32afb93cSXin Li 
380*32afb93cSXin Li #if defined(ARCH_ARM_USE_INTRINSICS)
381*32afb93cSXin Li     if (mUsesSimd && mSizeX >= 4) {
382*32afb93cSXin Li       rsdIntrinsicBlurU4_K(out, (uchar4 const *)(mIn + stride * currentY),
383*32afb93cSXin Li                  mSizeX, mSizeY,
384*32afb93cSXin Li                  stride, x1, currentY, x2 - x1, mIradius, mIp + mIradius);
385*32afb93cSXin Li         return;
386*32afb93cSXin Li     }
387*32afb93cSXin Li #endif
388*32afb93cSXin Li 
389*32afb93cSXin Li     if (mSizeX > 2048) {
390*32afb93cSXin Li         if ((mSizeX > mScratchSize[threadIndex]) || !mScratch[threadIndex]) {
391*32afb93cSXin Li             // Pad the side of the allocation by one unit to allow alignment later
392*32afb93cSXin Li             mScratch[threadIndex] = realloc(mScratch[threadIndex], (mSizeX + 1) * 16);
393*32afb93cSXin Li             mScratchSize[threadIndex] = mSizeX;
394*32afb93cSXin Li         }
395*32afb93cSXin Li         // realloc only aligns to 8 bytes so we manually align to 16.
396*32afb93cSXin Li         buf = (float4 *) ((((intptr_t)mScratch[threadIndex]) + 15) & ~0xf);
397*32afb93cSXin Li     }
398*32afb93cSXin Li     float4 *fout = (float4 *)buf;
399*32afb93cSXin Li     int y = currentY;
400*32afb93cSXin Li     if ((y > mIradius) && (y < ((int)mSizeY - mIradius))) {
401*32afb93cSXin Li         const uchar *pi = mIn + (y - mIradius) * stride;
402*32afb93cSXin Li         OneVFU4(fout, pi, stride, mFp, mIradius * 2 + 1, mSizeX, mUsesSimd);
403*32afb93cSXin Li     } else {
404*32afb93cSXin Li         x1 = 0;
405*32afb93cSXin Li         while(mSizeX > x1) {
406*32afb93cSXin Li             OneVU4(mSizeY, fout, x1, y, mIn, stride, mFp, mIradius);
407*32afb93cSXin Li             fout++;
408*32afb93cSXin Li             x1++;
409*32afb93cSXin Li         }
410*32afb93cSXin Li     }
411*32afb93cSXin Li 
412*32afb93cSXin Li     x1 = xstart;
413*32afb93cSXin Li     while ((x1 < (uint32_t)mIradius) && (x1 < x2)) {
414*32afb93cSXin Li         OneHU4(mSizeX, out, x1, buf, mFp, mIradius);
415*32afb93cSXin Li         out++;
416*32afb93cSXin Li         x1++;
417*32afb93cSXin Li     }
418*32afb93cSXin Li #if defined(ARCH_X86_HAVE_SSSE3)
419*32afb93cSXin Li     if (mUsesSimd) {
420*32afb93cSXin Li         if ((x1 + mIradius) < x2) {
421*32afb93cSXin Li             rsdIntrinsicBlurHFU4_K(out, buf - mIradius, mFp,
422*32afb93cSXin Li                                    mIradius * 2 + 1, x1, x2 - mIradius);
423*32afb93cSXin Li             out += (x2 - mIradius) - x1;
424*32afb93cSXin Li             x1 = x2 - mIradius;
425*32afb93cSXin Li         }
426*32afb93cSXin Li     }
427*32afb93cSXin Li #endif
428*32afb93cSXin Li     while(x2 > x1) {
429*32afb93cSXin Li         OneHU4(mSizeX, out, x1, buf, mFp, mIradius);
430*32afb93cSXin Li         out++;
431*32afb93cSXin Li         x1++;
432*32afb93cSXin Li     }
433*32afb93cSXin Li }
434*32afb93cSXin Li 
435*32afb93cSXin Li /**
436*32afb93cSXin Li  * Full blur of a line of U_8 data.
437*32afb93cSXin Li  *
438*32afb93cSXin Li  * @param outPtr Where to store the results
439*32afb93cSXin Li  * @param xstart The index of the section we're starting to blur.
440*32afb93cSXin Li  * @param xend  The end index of the section.
441*32afb93cSXin Li  * @param currentY The index of the line we're blurring.
442*32afb93cSXin Li  */
kernelU1(void * outPtr,uint32_t xstart,uint32_t xend,uint32_t currentY)443*32afb93cSXin Li void BlurTask::kernelU1(void *outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY) {
444*32afb93cSXin Li     float buf[4 * 2048];
445*32afb93cSXin Li     const uint32_t stride = mSizeX * mVectorSize;
446*32afb93cSXin Li 
447*32afb93cSXin Li     uchar *out = (uchar *)outPtr;
448*32afb93cSXin Li     uint32_t x1 = xstart;
449*32afb93cSXin Li     uint32_t x2 = xend;
450*32afb93cSXin Li 
451*32afb93cSXin Li #if defined(ARCH_ARM_USE_INTRINSICS)
452*32afb93cSXin Li     if (mUsesSimd && mSizeX >= 16) {
453*32afb93cSXin Li         // The specialisation for r<=8 has an awkward prefill case, which is
454*32afb93cSXin Li         // fiddly to resolve, where starting close to the right edge can cause
455*32afb93cSXin Li         // a read beyond the end of input.  So avoid that case here.
456*32afb93cSXin Li         if (mIradius > 8 || (mSizeX - std::max(0, (int32_t)x1 - 8)) >= 16) {
457*32afb93cSXin Li             rsdIntrinsicBlurU1_K(out, mIn + stride * currentY, mSizeX, mSizeY,
458*32afb93cSXin Li                      stride, x1, currentY, x2 - x1, mIradius, mIp + mIradius);
459*32afb93cSXin Li             return;
460*32afb93cSXin Li         }
461*32afb93cSXin Li     }
462*32afb93cSXin Li #endif
463*32afb93cSXin Li 
464*32afb93cSXin Li     float *fout = (float *)buf;
465*32afb93cSXin Li     int y = currentY;
466*32afb93cSXin Li     if ((y > mIradius) && (y < ((int)mSizeY - mIradius -1))) {
467*32afb93cSXin Li         const uchar *pi = mIn + (y - mIradius) * stride;
468*32afb93cSXin Li         OneVFU1(fout, pi, stride, mFp, mIradius * 2 + 1, mSizeX, mUsesSimd);
469*32afb93cSXin Li     } else {
470*32afb93cSXin Li         x1 = 0;
471*32afb93cSXin Li         while(mSizeX > x1) {
472*32afb93cSXin Li             OneVU1(mSizeY, fout, x1, y, mIn, stride, mFp, mIradius);
473*32afb93cSXin Li             fout++;
474*32afb93cSXin Li             x1++;
475*32afb93cSXin Li         }
476*32afb93cSXin Li     }
477*32afb93cSXin Li 
478*32afb93cSXin Li     x1 = xstart;
479*32afb93cSXin Li     while ((x1 < x2) &&
480*32afb93cSXin Li            ((x1 < (uint32_t)mIradius) || (((uintptr_t)out) & 0x3))) {
481*32afb93cSXin Li         OneHU1(mSizeX, out, x1, buf, mFp, mIradius);
482*32afb93cSXin Li         out++;
483*32afb93cSXin Li         x1++;
484*32afb93cSXin Li     }
485*32afb93cSXin Li #if defined(ARCH_X86_HAVE_SSSE3)
486*32afb93cSXin Li     if (mUsesSimd) {
487*32afb93cSXin Li         if ((x1 + mIradius) < x2) {
488*32afb93cSXin Li             uint32_t len = x2 - (x1 + mIradius);
489*32afb93cSXin Li             len &= ~3;
490*32afb93cSXin Li 
491*32afb93cSXin Li             // rsdIntrinsicBlurHFU1_K() processes each four float values in |buf| at once, so it
492*32afb93cSXin Li             // nees to ensure four more values can be accessed in order to avoid accessing
493*32afb93cSXin Li             // uninitialized buffer.
494*32afb93cSXin Li             if (len > 4) {
495*32afb93cSXin Li                 len -= 4;
496*32afb93cSXin Li                 rsdIntrinsicBlurHFU1_K(out, ((float *)buf) - mIradius, mFp,
497*32afb93cSXin Li                                        mIradius * 2 + 1, x1, x1 + len);
498*32afb93cSXin Li                 out += len;
499*32afb93cSXin Li                 x1 += len;
500*32afb93cSXin Li             }
501*32afb93cSXin Li         }
502*32afb93cSXin Li     }
503*32afb93cSXin Li #endif
504*32afb93cSXin Li     while(x2 > x1) {
505*32afb93cSXin Li         OneHU1(mSizeX, out, x1, buf, mFp, mIradius);
506*32afb93cSXin Li         out++;
507*32afb93cSXin Li         x1++;
508*32afb93cSXin Li     }
509*32afb93cSXin Li }
510*32afb93cSXin Li 
processData(int threadIndex,size_t startX,size_t startY,size_t endX,size_t endY)511*32afb93cSXin Li void BlurTask::processData(int threadIndex, size_t startX, size_t startY, size_t endX,
512*32afb93cSXin Li                            size_t endY) {
513*32afb93cSXin Li     for (size_t y = startY; y < endY; y++) {
514*32afb93cSXin Li         void* outPtr = outArray + (mSizeX * y + startX) * mVectorSize;
515*32afb93cSXin Li         if (mVectorSize == 4) {
516*32afb93cSXin Li             kernelU4(outPtr, startX, endX, y, threadIndex);
517*32afb93cSXin Li         } else {
518*32afb93cSXin Li             kernelU1(outPtr, startX, endX, y);
519*32afb93cSXin Li         }
520*32afb93cSXin Li     }
521*32afb93cSXin Li }
522*32afb93cSXin Li 
blur(const uint8_t * in,uint8_t * out,size_t sizeX,size_t sizeY,size_t vectorSize,int radius,const Restriction * restriction)523*32afb93cSXin Li void RenderScriptToolkit::blur(const uint8_t* in, uint8_t* out, size_t sizeX, size_t sizeY,
524*32afb93cSXin Li                                size_t vectorSize, int radius, const Restriction* restriction) {
525*32afb93cSXin Li #ifdef ANDROID_RENDERSCRIPT_TOOLKIT_VALIDATE
526*32afb93cSXin Li     if (!validRestriction(LOG_TAG, sizeX, sizeY, restriction)) {
527*32afb93cSXin Li         return;
528*32afb93cSXin Li     }
529*32afb93cSXin Li     if (radius <= 0 || radius > 25) {
530*32afb93cSXin Li         ALOGE("The radius should be between 1 and 25. %d provided.", radius);
531*32afb93cSXin Li     }
532*32afb93cSXin Li     if (vectorSize != 1 && vectorSize != 4) {
533*32afb93cSXin Li         ALOGE("The vectorSize should be 1 or 4. %zu provided.", vectorSize);
534*32afb93cSXin Li     }
535*32afb93cSXin Li #endif
536*32afb93cSXin Li 
537*32afb93cSXin Li     BlurTask task(in, out, sizeX, sizeY, vectorSize, processor->getNumberOfThreads(), radius,
538*32afb93cSXin Li                   restriction);
539*32afb93cSXin Li     processor->doTask(&task);
540*32afb93cSXin Li }
541*32afb93cSXin Li 
542*32afb93cSXin Li }  // namespace renderscript
543