/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_FLOAT_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_FLOAT_H_

#include <algorithm>
#include <cstring>  // for memcpy, used by DepthwiseConvInitAccBuffer

#include "ruy/profiler/instrumentation.h"  // from @ruy
#include "tensorflow/lite/kernels/internal/optimized/cpu_check.h"
#include "tensorflow/lite/kernels/internal/types.h"

namespace tflite {
namespace optimized_ops {

// Implementation of float DepthwiseConv

template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier>
struct FloatDepthwiseConvKernel {};

#ifdef USE_NEON

template <>
struct FloatDepthwiseConvKernel<false, 8, 1> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const float* input_ptr, int input_ptr_increment,
                  const float* filter_ptr, float* acc_buffer_ptr) {
    // Load the filters
    float32x4_t filter[2];
    for (int i = 0; i < 2; i++) {
      filter[i] = vld1q_f32(filter_ptr + 4 * i);
    }
    int outp = 0;
    // Handle 2 output pixels at a time.
    for (; outp <= num_output_pixels - 2; outp += 2) {
      // Load the inputs
      float32x4_t input[4];
      for (int i = 0; i < 4; i++) {
        input[i] = vld1q_f32(input_ptr + 4 * i);
      }
      input_ptr += 16;
      // Load the accumulators from acc_buffer
      float32x4_t acc[4];
      for (int i = 0; i < 4; i++) {
        acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
      }
      // Multiply-accumulate
      acc[0] = vmlaq_f32(acc[0], input[0], filter[0]);
      acc[1] = vmlaq_f32(acc[1], input[1], filter[1]);
      acc[2] = vmlaq_f32(acc[2], input[2], filter[0]);
      acc[3] = vmlaq_f32(acc[3], input[3], filter[1]);
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 4; i++) {
        vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 16;
    }
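    // With input depth 8, each pixel spans two float32x4 registers, so the
    // loop above reuses the two filter registers across four accumulators to
    // cover two pixels per iteration; the loop below handles any trailing
    // odd pixel with the same pattern at half the width.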
    // Handle one output pixel at a time.
    for (; outp < num_output_pixels; outp++) {
      // Load the inputs
      float32x4_t input[2];
      for (int i = 0; i < 2; i++) {
        input[i] = vld1q_f32(input_ptr + 4 * i);
      }
      input_ptr += 8;
      // Load the accumulators from acc_buffer
      float32x4_t acc[2];
      for (int i = 0; i < 2; i++) {
        acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
      }
      // Multiply-accumulate
      for (int i = 0; i < 2; i++) {
        acc[i] = vmlaq_f32(acc[i], input[i], filter[i]);
      }
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 2; i++) {
        vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 8;
    }
  }
};

template <>
struct FloatDepthwiseConvKernel<false, 2, 1> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const float* input_ptr, int input_ptr_increment,
                  const float* filter_ptr, float* acc_buffer_ptr) {
    const float32x2_t filters = vld1_f32(filter_ptr);
    const float32x4_t filters_dup2 = vcombine_f32(filters, filters);
    int outp = 0;
    // Handle 8 output pixels at a time.
    for (; outp <= num_output_pixels - 8; outp += 8) {
      // Load the inputs
      float32x4_t input[4];
      for (int i = 0; i < 4; i++) {
        input[i] = vld1q_f32(input_ptr + 4 * i);
      }
      input_ptr += 16;
      // Load the accumulators from acc_buffer
      float32x4_t acc[4];
      for (int i = 0; i < 4; i++) {
        acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
      }
      // Multiply-accumulate
      for (int i = 0; i < 4; i++) {
        acc[i] = vmlaq_f32(acc[i], input[i], filters_dup2);
      }
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 4; i++) {
        vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 16;
    }
    // Handle 4 output pixels at a time.
    for (; outp <= num_output_pixels - 4; outp += 4) {
      // Load the inputs
      float32x4_t input[2];
      for (int i = 0; i < 2; i++) {
        input[i] = vld1q_f32(input_ptr + 4 * i);
      }
      input_ptr += 8;
      // Load the accumulators from acc_buffer
      float32x4_t acc[2];
      for (int i = 0; i < 2; i++) {
        acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
      }
      // Multiply-accumulate
      for (int i = 0; i < 2; i++) {
        acc[i] = vmlaq_f32(acc[i], input[i], filters_dup2);
      }
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 2; i++) {
        vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 8;
    }
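    // The unrolling factor steps down through the loops in this kernel
    // (8, 4, 2, then 1 output pixels per iteration), so the widest NEON path
    // covers most of the row before the 2-lane scalar-width tail below.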
    // Handle 2 output pixels at a time.
    for (; outp <= num_output_pixels - 2; outp += 2) {
      // Load the inputs
      const float32x4_t input = vld1q_f32(input_ptr);
      input_ptr += 4;
      // Load the accumulators from acc_buffer
      float32x4_t acc = vld1q_f32(acc_buffer_ptr);
      // Multiply-accumulate
      acc = vmlaq_f32(acc, input, filters_dup2);
      // Store the accumulators back to acc_buffer
      vst1q_f32(acc_buffer_ptr, acc);
      acc_buffer_ptr += 4;
    }
    // Handle 1 output pixel at a time
    for (; outp < num_output_pixels; outp++) {
      // Load the inputs
      const float32x2_t input = vld1_f32(input_ptr);
      input_ptr += 2;
      // Load the accumulators from acc_buffer
      float32x2_t acc = vld1_f32(acc_buffer_ptr);
      // Multiply-accumulate
      acc = vmla_f32(acc, input, filters);
      // Store the accumulators back to acc_buffer
      vst1_f32(acc_buffer_ptr, acc);
      acc_buffer_ptr += 2;
    }
  }
};

template <>
struct FloatDepthwiseConvKernel<true, 0, 1> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const float* input_ptr, int input_ptr_increment,
                  const float* filter_ptr, float* acc_buffer_ptr) {
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      const float* local_filter_ptr = filter_ptr;
      const float* local_input_ptr = input_ptr;
      int ic = 0;
      // Handle 16 input channels at a time.
      for (; ic <= input_depth - 16; ic += 16) {
        // Load the filters
        float32x4_t filter_0 = vld1q_f32(local_filter_ptr + 4 * 0);
        float32x4_t filter_1 = vld1q_f32(local_filter_ptr + 4 * 1);
        float32x4_t filter_2 = vld1q_f32(local_filter_ptr + 4 * 2);
        float32x4_t filter_3 = vld1q_f32(local_filter_ptr + 4 * 3);
        local_filter_ptr += 16;
        // Load the inputs
        float32x4_t input_0 = vld1q_f32(local_input_ptr + 4 * 0);
        float32x4_t input_1 = vld1q_f32(local_input_ptr + 4 * 1);
        float32x4_t input_2 = vld1q_f32(local_input_ptr + 4 * 2);
        float32x4_t input_3 = vld1q_f32(local_input_ptr + 4 * 3);
        local_input_ptr += 16;
        // Load the accumulators from acc_buffer
        float32x4_t acc_0 = vld1q_f32(acc_buffer_ptr + 4 * 0);
        float32x4_t acc_1 = vld1q_f32(acc_buffer_ptr + 4 * 1);
        float32x4_t acc_2 = vld1q_f32(acc_buffer_ptr + 4 * 2);
        float32x4_t acc_3 = vld1q_f32(acc_buffer_ptr + 4 * 3);
        // Multiply-accumulate
        acc_0 = vmlaq_f32(acc_0, input_0, filter_0);
        acc_1 = vmlaq_f32(acc_1, input_1, filter_1);
        acc_2 = vmlaq_f32(acc_2, input_2, filter_2);
        acc_3 = vmlaq_f32(acc_3, input_3, filter_3);
        // Store the accumulators back to acc_buffer
        vst1q_f32(acc_buffer_ptr + 4 * 0, acc_0);
        vst1q_f32(acc_buffer_ptr + 4 * 1, acc_1);
        vst1q_f32(acc_buffer_ptr + 4 * 2, acc_2);
        vst1q_f32(acc_buffer_ptr + 4 * 3, acc_3);
        acc_buffer_ptr += 16;
      }
      // Handle 4 input channels at a time.
      for (; ic <= input_depth - 4; ic += 4) {
        // Load the filters
        float32x4_t filter;
        filter = vld1q_f32(local_filter_ptr);
        local_filter_ptr += 4;
        // Load the inputs
        float32x4_t input;
        input = vld1q_f32(local_input_ptr);
        local_input_ptr += 4;
        // Load the accumulators from acc_buffer
        float32x4_t acc;
        acc = vld1q_f32(acc_buffer_ptr);
        // Multiply-accumulate
        acc = vmlaq_f32(acc, input, filter);
        // Store the accumulators back to acc_buffer
        vst1q_f32(acc_buffer_ptr, acc);
        acc_buffer_ptr += 4;
      }
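      // At most 3 input channels remain after the 16- and 4-wide loops
      // above; they fall through to the scalar loop below.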
      // Handle one input channel at a time.
      for (; ic < input_depth; ic++) {
        const float input_val = *local_input_ptr++;
        const float filter_val = *local_filter_ptr++;
        *acc_buffer_ptr++ += filter_val * input_val;
      }
      input_ptr += input_ptr_increment;
    }
  }
};

template <>
struct FloatDepthwiseConvKernel<true, 0, 8> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const float* input_ptr, int input_ptr_increment,
                  const float* filter_ptr, float* acc_buffer_ptr) {
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      const float* local_filter_ptr = filter_ptr;
      const float* local_input_ptr = input_ptr;
      int ic = 0;
      // Handle 2 input channels at a time.
      for (; ic <= input_depth - 2; ic += 2) {
        // Load the filters
        float32x4_t filter[4];
        for (int i = 0; i < 4; i++) {
          filter[i] = vld1q_f32(local_filter_ptr + 4 * i);
        }
        local_filter_ptr += 16;
        // Load the inputs
        const float32x2_t input = vld1_f32(local_input_ptr);
        local_input_ptr += 2;
        // Load the accumulators from acc_buffer
        float32x4_t acc[4];
        for (int i = 0; i < 4; i++) {
          acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
        }
        // Multiply-accumulate
        acc[0] = vmlaq_lane_f32(acc[0], filter[0], input, 0);
        acc[1] = vmlaq_lane_f32(acc[1], filter[1], input, 0);
        acc[2] = vmlaq_lane_f32(acc[2], filter[2], input, 1);
        acc[3] = vmlaq_lane_f32(acc[3], filter[3], input, 1);
        // Store the accumulators back to acc_buffer
        for (int i = 0; i < 4; i++) {
          vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
        }
        acc_buffer_ptr += 16;
      }
      // Handle one input channel at a time.
      for (; ic < input_depth; ic++) {
        // Load the filters
        float32x4_t filter[2];
        for (int i = 0; i < 2; i++) {
          filter[i] = vld1q_f32(local_filter_ptr + 4 * i);
        }
        local_filter_ptr += 8;
        // Load the inputs
        const float input_val = *local_input_ptr++;
        // Load the accumulators from acc_buffer
        float32x4_t acc[2];
        for (int i = 0; i < 2; i++) {
          acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
        }
        // Multiply-accumulate
        for (int i = 0; i < 2; i++) {
          acc[i] = vmlaq_n_f32(acc[i], filter[i], input_val);
        }
        // Store the accumulators back to acc_buffer
        for (int i = 0; i < 2; i++) {
          vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
        }
        acc_buffer_ptr += 8;
      }
      input_ptr += input_ptr_increment;
    }
  }
};

// Note: this implementation is very slow for input_depths < 8 (e.g.
// comparable to the reference implementation); see the specializations for
// input_depth == 3 below.
template <>
struct FloatDepthwiseConvKernel<true, 0, 2> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const float* input_ptr, int input_ptr_increment,
                  const float* filter_ptr, float* acc_buffer_ptr) {
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      const float* local_filter_ptr = filter_ptr;
      const float* local_input_ptr = input_ptr;
      int ic = 0;
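      // For depth_multiplier == 2, each input channel produces two adjacent
      // outputs. In the loop below, vzipq_f32(input, input) yields the lane
      // pairs [x0, x0, x1, x1] and [x2, x2, x3, x3], which match that
      // interleaved output layout, so a plain vmlaq_f32 against the filters
      // accumulates both outputs per channel at once.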
      // Handle 8 input channels at a time.
      for (; ic <= input_depth - 8; ic += 8) {
        // Load the filters
        float32x4_t filter[4];
        for (int i = 0; i < 4; i++) {
          filter[i] = vld1q_f32(local_filter_ptr + 4 * i);
        }
        local_filter_ptr += 16;
        // Load the inputs
        float32x4x2_t input_dup2[2];
        for (int i = 0; i < 2; i++) {
          const float32x4_t input = vld1q_f32(local_input_ptr + 4 * i);
          input_dup2[i] = vzipq_f32(input, input);
        }
        local_input_ptr += 8;
        // Load the accumulators from acc_buffer
        float32x4_t acc[4];
        for (int i = 0; i < 4; i++) {
          acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
        }
        // Multiply-accumulate
        acc[0] = vmlaq_f32(acc[0], filter[0], input_dup2[0].val[0]);
        acc[1] = vmlaq_f32(acc[1], filter[1], input_dup2[0].val[1]);
        acc[2] = vmlaq_f32(acc[2], filter[2], input_dup2[1].val[0]);
        acc[3] = vmlaq_f32(acc[3], filter[3], input_dup2[1].val[1]);
        // Store the accumulators back to acc_buffer
        for (int i = 0; i < 4; i++) {
          vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
        }
        acc_buffer_ptr += 16;
      }
      // Handle 4 input channels at a time.
      for (; ic <= input_depth - 4; ic += 4) {
        // Load the filters
        float32x2_t filter[4];
        for (int i = 0; i < 4; i++) {
          filter[i] = vld1_f32(local_filter_ptr + 2 * i);
        }
        local_filter_ptr += 8;
        // Load the inputs
        const float32x4_t input = vld1q_f32(local_input_ptr);
        local_input_ptr += 4;
        // Load the accumulators from acc_buffer
        float32x2_t acc[4];
        for (int i = 0; i < 4; i++) {
          acc[i] = vld1_f32(acc_buffer_ptr + 2 * i);
        }
        // Multiply-accumulate
        acc[0] = vmla_lane_f32(acc[0], filter[0], vget_low_f32(input), 0);
        acc[1] = vmla_lane_f32(acc[1], filter[1], vget_low_f32(input), 1);
        acc[2] = vmla_lane_f32(acc[2], filter[2], vget_high_f32(input), 0);
        acc[3] = vmla_lane_f32(acc[3], filter[3], vget_high_f32(input), 1);
        // Store the accumulators back to acc_buffer
        for (int i = 0; i < 4; i++) {
          vst1_f32(acc_buffer_ptr + 2 * i, acc[i]);
        }
        acc_buffer_ptr += 8;
      }
      // Handle 2 input channels at a time.
      for (; ic <= input_depth - 2; ic += 2) {
        // Load the filters
        const float32x4_t filter = vld1q_f32(local_filter_ptr);
        local_filter_ptr += 4;
        // Load the inputs
        const float32x2_t input = vld1_f32(local_input_ptr);
        local_input_ptr += 2;
        // Load the accumulators from acc_buffer
        float32x2_t acc[2];
        for (int i = 0; i < 2; i++) {
          acc[i] = vld1_f32(acc_buffer_ptr + 2 * i);
        }
        // Multiply-accumulate
        acc[0] = vmla_lane_f32(acc[0], vget_low_f32(filter), input, 0);
        acc[1] = vmla_lane_f32(acc[1], vget_high_f32(filter), input, 1);
        // Store the accumulators back to acc_buffer
        for (int i = 0; i < 2; i++) {
          vst1_f32(acc_buffer_ptr + 2 * i, acc[i]);
        }
        acc_buffer_ptr += 4;
      }
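      // A single leftover input channel still produces two outputs
      // (depth_multiplier == 2); the scalar loop below handles it.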
      // Handle one input channel at a time.
      for (; ic < input_depth; ic++) {
        // Load the inputs
        const float input_val = *local_input_ptr++;
        // Multiply-accumulate
        for (int i = 0; i < 2; i++) {
          acc_buffer_ptr[i] += local_filter_ptr[i] * input_val;
        }
        local_filter_ptr += 2;
        acc_buffer_ptr += 2;
      }
      input_ptr += input_ptr_increment;
    }
  }
};

template <>
struct FloatDepthwiseConvKernel<true, 3, 2> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const float* input_ptr, int input_ptr_increment,
                  const float* filter_ptr, float* acc_buffer_ptr) {
    // Load the filters
    float32x2_t filter[3];
    for (int i = 0; i < 3; i++) {
      filter[i] = vld1_f32(filter_ptr + 2 * i);
    }
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      const float32x2_t input01 = vld1_f32(input_ptr);
      const float32x2_t input2 = vld1_dup_f32(input_ptr + 2);
      // Load the accumulators from acc_buffer
      float32x2_t acc[3];
      for (int i = 0; i < 3; i++) {
        acc[i] = vld1_f32(acc_buffer_ptr + 2 * i);
      }
      // Multiply-accumulate: for each input channel there are 2 outputs.
      acc[0] = vmla_lane_f32(acc[0], filter[0], input01, 0);
      acc[1] = vmla_lane_f32(acc[1], filter[1], input01, 1);
      acc[2] = vmla_lane_f32(acc[2], filter[2], input2, 0);
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 3; i++) {
        vst1_f32(acc_buffer_ptr + 2 * i, acc[i]);
      }
      acc_buffer_ptr += 6;
      input_ptr += input_ptr_increment;
    }
  }
};

template <>
struct FloatDepthwiseConvKernel<true, 3, 4> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const float* input_ptr, int input_ptr_increment,
                  const float* filter_ptr, float* acc_buffer_ptr) {
    // Load the filters
    float32x4_t filter[3];
    for (int i = 0; i < 3; i++) {
      filter[i] = vld1q_f32(filter_ptr + 4 * i);
    }
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      // NOTE: we only want 3 values, so we read them as two ops, where the
      // second op just duplicates the lane.
      const float32x2_t input01 = vld1_f32(input_ptr);
      const float32x2_t input2 = vld1_dup_f32(input_ptr + 2);
      // Load the accumulators from acc_buffer
      float32x4_t acc[3];
      for (int i = 0; i < 3; i++) {
        acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
      }
      // Multiply-accumulate all outputs.
      acc[0] = vmlaq_lane_f32(acc[0], filter[0], input01, 0);
      acc[1] = vmlaq_lane_f32(acc[1], filter[1], input01, 1);
      acc[2] = vmlaq_lane_f32(acc[2], filter[2], input2, 0);
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 3; i++) {
        vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 12;
      input_ptr += input_ptr_increment;
    }
  }
};
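
// The <true, 1, N> specializations below handle input_depth == 1 with a
// large depth multiplier: each output pixel reads a single input scalar and
// broadcasts it across all N filter values using vmlaq_n_f32.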

template <>
struct FloatDepthwiseConvKernel<true, 1, 8> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const float* input_ptr, int input_ptr_increment,
                  const float* filter_ptr, float* acc_buffer_ptr) {
    // Load the filters
    float32x4_t filter[2];
    for (int i = 0; i < 2; i++) {
      filter[i] = vld1q_f32(filter_ptr + 4 * i);
    }
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      // Load the inputs
      const float input_val = *input_ptr;
      input_ptr += input_ptr_increment;
      // Load the accumulators from acc_buffer
      float32x4_t acc[2];
      for (int i = 0; i < 2; i++) {
        acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
      }
      // Multiply-accumulate
      for (int i = 0; i < 2; i++) {
        acc[i] = vmlaq_n_f32(acc[i], filter[i], input_val);
      }
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 2; i++) {
        vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 8;
    }
  }
};

template <>
struct FloatDepthwiseConvKernel<true, 1, 32> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const float* input_ptr, int input_ptr_increment,
                  const float* filter_ptr, float* acc_buffer_ptr) {
    // Load the filters
    float32x4_t filter_0 = vld1q_f32(filter_ptr + 4 * 0);
    float32x4_t filter_1 = vld1q_f32(filter_ptr + 4 * 1);
    float32x4_t filter_2 = vld1q_f32(filter_ptr + 4 * 2);
    float32x4_t filter_3 = vld1q_f32(filter_ptr + 4 * 3);
    float32x4_t filter_4 = vld1q_f32(filter_ptr + 4 * 4);
    float32x4_t filter_5 = vld1q_f32(filter_ptr + 4 * 5);
    float32x4_t filter_6 = vld1q_f32(filter_ptr + 4 * 6);
    float32x4_t filter_7 = vld1q_f32(filter_ptr + 4 * 7);

    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      // Load the inputs
      const float input_val = *input_ptr;
      input_ptr += input_ptr_increment;
      // Load the accumulators from acc_buffer
      float32x4_t acc_0 = vld1q_f32(acc_buffer_ptr + 4 * 0);
      float32x4_t acc_1 = vld1q_f32(acc_buffer_ptr + 4 * 1);
      float32x4_t acc_2 = vld1q_f32(acc_buffer_ptr + 4 * 2);
      float32x4_t acc_3 = vld1q_f32(acc_buffer_ptr + 4 * 3);
      float32x4_t acc_4 = vld1q_f32(acc_buffer_ptr + 4 * 4);
      float32x4_t acc_5 = vld1q_f32(acc_buffer_ptr + 4 * 5);
      float32x4_t acc_6 = vld1q_f32(acc_buffer_ptr + 4 * 6);
      float32x4_t acc_7 = vld1q_f32(acc_buffer_ptr + 4 * 7);
      // Multiply-accumulate
      acc_0 = vmlaq_n_f32(acc_0, filter_0, input_val);
      acc_1 = vmlaq_n_f32(acc_1, filter_1, input_val);
      acc_2 = vmlaq_n_f32(acc_2, filter_2, input_val);
      acc_3 = vmlaq_n_f32(acc_3, filter_3, input_val);
      acc_4 = vmlaq_n_f32(acc_4, filter_4, input_val);
      acc_5 = vmlaq_n_f32(acc_5, filter_5, input_val);
      acc_6 = vmlaq_n_f32(acc_6, filter_6, input_val);
      acc_7 = vmlaq_n_f32(acc_7, filter_7, input_val);
      // Store the accumulators back to acc_buffer
      vst1q_f32(acc_buffer_ptr + 4 * 0, acc_0);
      vst1q_f32(acc_buffer_ptr + 4 * 1, acc_1);
      vst1q_f32(acc_buffer_ptr + 4 * 2, acc_2);
      vst1q_f32(acc_buffer_ptr + 4 * 3, acc_3);
      vst1q_f32(acc_buffer_ptr + 4 * 4, acc_4);
      vst1q_f32(acc_buffer_ptr + 4 * 5, acc_5);
      vst1q_f32(acc_buffer_ptr + 4 * 6, acc_6);
      vst1q_f32(acc_buffer_ptr + 4 * 7, acc_7);
      acc_buffer_ptr += 32;
    }
  }
};

template <>
struct FloatDepthwiseConvKernel<true, 1, 20> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const float* input_ptr, int input_ptr_increment,
                  const float* filter_ptr, float* acc_buffer_ptr) {
    // Load the filters
    float32x4_t filter_0 = vld1q_f32(filter_ptr + 4 * 0);
    float32x4_t filter_1 = vld1q_f32(filter_ptr + 4 * 1);
    float32x4_t filter_2 = vld1q_f32(filter_ptr + 4 * 2);
    float32x4_t filter_3 = vld1q_f32(filter_ptr + 4 * 3);
    float32x4_t filter_4 = vld1q_f32(filter_ptr + 4 * 4);

    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      // Load the inputs
      const float input_val = *input_ptr;
      input_ptr += input_ptr_increment;
      // Load the accumulators from acc_buffer
      float32x4_t acc_0 = vld1q_f32(acc_buffer_ptr + 4 * 0);
      float32x4_t acc_1 = vld1q_f32(acc_buffer_ptr + 4 * 1);
      float32x4_t acc_2 = vld1q_f32(acc_buffer_ptr + 4 * 2);
      float32x4_t acc_3 = vld1q_f32(acc_buffer_ptr + 4 * 3);
      float32x4_t acc_4 = vld1q_f32(acc_buffer_ptr + 4 * 4);
      // Multiply-accumulate
      acc_0 = vmlaq_n_f32(acc_0, filter_0, input_val);
      acc_1 = vmlaq_n_f32(acc_1, filter_1, input_val);
      acc_2 = vmlaq_n_f32(acc_2, filter_2, input_val);
      acc_3 = vmlaq_n_f32(acc_3, filter_3, input_val);
      acc_4 = vmlaq_n_f32(acc_4, filter_4, input_val);
      // Store the accumulators back to acc_buffer
      vst1q_f32(acc_buffer_ptr + 4 * 0, acc_0);
      vst1q_f32(acc_buffer_ptr + 4 * 1, acc_1);
      vst1q_f32(acc_buffer_ptr + 4 * 2, acc_2);
      vst1q_f32(acc_buffer_ptr + 4 * 3, acc_3);
      vst1q_f32(acc_buffer_ptr + 4 * 4, acc_4);
      acc_buffer_ptr += 20;
    }
  }
};

template <>
struct FloatDepthwiseConvKernel<true, 0, 16> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const float* input_ptr, int input_ptr_increment,
                  const float* filter_ptr, float* acc_buffer_ptr) {
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      const float* local_filter_ptr = filter_ptr;
      const float* local_input_ptr = input_ptr;
      for (int ic = 0; ic < input_depth; ic++) {
        // Load the filters
        float32x4_t filter[4];
        for (int i = 0; i < 4; i++) {
          filter[i] = vld1q_f32(local_filter_ptr + 4 * i);
        }
        local_filter_ptr += 16;
        // Load the inputs
        const float input_val = *local_input_ptr++;
        // Load the accumulators from acc_buffer
        float32x4_t acc[4];
        for (int i = 0; i < 4; i++) {
          acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
        }
        // Multiply-accumulate
        for (int i = 0; i < 4; i++) {
          acc[i] = vmlaq_n_f32(acc[i], filter[i], input_val);
        }
        // Store the accumulators back to acc_buffer
        for (int i = 0; i < 4; i++) {
          vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
        }
        acc_buffer_ptr += 16;
      }
      input_ptr += input_ptr_increment;
    }
  }
};
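
// The remaining specializations handle depth_multiplier == 1 at a fixed
// input depth. Being strided kernels, they advance input_ptr by
// input_ptr_increment (stride * input_depth floats) per output pixel,
// unlike the kAllowStrided=false kernels at the top of the file, which walk
// the input contiguously.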

template <>
struct FloatDepthwiseConvKernel<true, 8, 1> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const float* input_ptr, int input_ptr_increment,
                  const float* filter_ptr, float* acc_buffer_ptr) {
    // Load the filters
    float32x4_t filter[2];
    for (int i = 0; i < 2; i++) {
      filter[i] = vld1q_f32(filter_ptr + 4 * i);
    }
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      // Load the inputs
      float32x4_t input[2];
      for (int i = 0; i < 2; i++) {
        input[i] = vld1q_f32(input_ptr + 4 * i);
      }
      // Load the accumulators from acc_buffer
      float32x4_t acc[2];
      for (int i = 0; i < 2; i++) {
        acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
      }
      // Multiply-accumulate
      for (int i = 0; i < 2; i++) {
        acc[i] = vmlaq_f32(acc[i], input[i], filter[i]);
      }
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 2; i++) {
        vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 8;
      input_ptr += input_ptr_increment;
    }
  }
};

template <>
struct FloatDepthwiseConvKernel<true, 2, 1> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const float* input_ptr, int input_ptr_increment,
                  const float* filter_ptr, float* acc_buffer_ptr) {
    float32x2_t filter = vld1_f32(filter_ptr);
    float32x4_t filter_x4 = vcombine_f32(filter, filter);
    int outp = 0;

    // Handle two output pixels at a time.
    for (; outp <= num_output_pixels - 2; outp += 2) {
      // Load the inputs
      float32x2_t input_1 = vld1_f32(input_ptr);
      input_ptr += input_ptr_increment;
      float32x2_t input_2 = vld1_f32(input_ptr);
      input_ptr += input_ptr_increment;
      float32x4_t input = vcombine_f32(input_1, input_2);

      // Load the accumulators from acc_buffer
      float32x4_t acc = vld1q_f32(acc_buffer_ptr);

      // Multiply-accumulate
      acc = vmlaq_f32(acc, input, filter_x4);

      // Store the accumulators back to acc_buffer
      vst1q_f32(acc_buffer_ptr, acc);
      acc_buffer_ptr += 4;
    }
    // Handle one output pixel at a time.
    for (; outp < num_output_pixels; outp++) {
      // Load the inputs
      float32x2_t input = vld1_f32(input_ptr);
      input_ptr += input_ptr_increment;

      // Load the accumulators from acc_buffer
      float32x2_t acc = vld1_f32(acc_buffer_ptr);

      // Multiply-accumulate
      acc = vmla_f32(acc, input, filter);

      // Store the accumulators back to acc_buffer
      vst1_f32(acc_buffer_ptr, acc);
      acc_buffer_ptr += 2;
    }
  }
};

template <>
struct FloatDepthwiseConvKernel<true, 4, 1> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const float* input_ptr, int input_ptr_increment,
                  const float* filter_ptr, float* acc_buffer_ptr) {
    float32x4_t filter = vld1q_f32(filter_ptr);

    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      // Load the inputs
      float32x4_t input = vld1q_f32(input_ptr);
      // Load the accumulators from acc_buffer
      float32x4_t acc = vld1q_f32(acc_buffer_ptr);
      // Multiply-accumulate
      acc = vmlaq_f32(acc, input, filter);
      // Store the accumulators back to acc_buffer
      vst1q_f32(acc_buffer_ptr, acc);
      acc_buffer_ptr += 4;
      input_ptr += input_ptr_increment;
    }
  }
};
#endif  // USE_NEON
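
// How one of the kernels above gets selected: DepthwiseConvImpl (below)
// walks a preference-ordered list of (kAllowStrided, kFixedInputDepth,
// kFixedDepthMultiplier) triples and picks the first specialization whose
// compile-time constants match the runtime stride, input depth, and depth
// multiplier; shapes with no match fall back to
// FloatDepthwiseConvAccumRowGeneric.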

// Accumulates the effect of one row of the filter on a segment of one row
// of the output, reading the corresponding row of the input.
template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier>
void FloatDepthwiseConvAccumRow(int stride, int dilation_factor,
                                int input_depth, int input_width,
                                const float* input_data, int pad_width,
                                int depth_multiplier, int filter_width,
                                const float* filter_data,
                                int out_x_buffer_start, int out_x_buffer_end,
                                int output_depth, float* acc_buffer) {
  ruy::profiler::ScopeLabel label(TFLITE_PRETTY_FUNCTION);
  // Consistency check parameters. This is important in particular to ensure
  // that we keep the number of template instantiations minimal, so we don't
  // increase binary size unnecessarily.
  static_assert(kFixedDepthMultiplier || !kFixedInputDepth, "");
  static_assert(kFixedInputDepth || kAllowStrided, "");
  TFLITE_DCHECK(stride == 1 || kAllowStrided);
  if (kFixedInputDepth) {
    TFLITE_DCHECK_EQ(input_depth, kFixedInputDepth);
  }
  if (kFixedDepthMultiplier) {
    TFLITE_DCHECK_EQ(depth_multiplier, kFixedDepthMultiplier);
  }
  TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier);
  const int input_ptr_increment = stride * input_depth;
  const float* filter_base_ptr = filter_data;
  for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
    // For the current (filter_x, filter_y) point in the filter,
    // compute the boundaries of the corresponding output row segment.
    int out_x_loop_start_unclamped = 0;
    int out_x_loop_end_unclamped = 0;
    if (kAllowStrided) {
      if (stride == 2) {
        out_x_loop_start_unclamped =
            (pad_width - dilation_factor * filter_x + 1) / 2;
        out_x_loop_end_unclamped =
            (pad_width + input_width - dilation_factor * filter_x + 1) / 2;
      } else if (stride == 4) {
        out_x_loop_start_unclamped =
            (pad_width - dilation_factor * filter_x + 3) / 4;
        out_x_loop_end_unclamped =
            (pad_width + input_width - dilation_factor * filter_x + 3) / 4;
      } else {
        out_x_loop_start_unclamped =
            (pad_width - dilation_factor * filter_x + stride - 1) / stride;
        out_x_loop_end_unclamped = (pad_width + input_width -
                                    dilation_factor * filter_x + stride - 1) /
                                   stride;
      }
    } else {
      out_x_loop_start_unclamped = pad_width - dilation_factor * filter_x;
      out_x_loop_end_unclamped =
          pad_width + input_width - dilation_factor * filter_x;
    }
    // The kernel will have to iterate on the segment of the output row that
    // starts at out_x_loop_start and ends at out_x_loop_end.
    const int out_x_loop_start =
        std::max(out_x_buffer_start, out_x_loop_start_unclamped);
    const int out_x_loop_end =
        std::min(out_x_buffer_end, out_x_loop_end_unclamped);

    float* acc_buffer_ptr =
        acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth;
    const int in_x_origin =
        (out_x_loop_start * stride) - pad_width + dilation_factor * filter_x;
    const float* input_ptr = input_data + in_x_origin * input_depth;
    const int num_output_pixels = out_x_loop_end - out_x_loop_start;
    FloatDepthwiseConvKernel<kAllowStrided, kFixedInputDepth,
                             kFixedDepthMultiplier>::Run(num_output_pixels,
                                                         input_depth,
                                                         depth_multiplier,
                                                         input_ptr,
                                                         input_ptr_increment,
                                                         filter_base_ptr,
                                                         acc_buffer_ptr);
    filter_base_ptr += output_depth;
  }
}
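
// A worked example of the boundary math above, using example values (not
// taken from this file): with pad_width = 1, input_width = 5, stride = 2,
// dilation_factor = 1, and filter_x = 0, the ceil-divisions give
//   out_x_loop_start_unclamped = (1 - 0 + 1) / 2 = 1
//   out_x_loop_end_unclamped   = (1 + 5 - 0 + 1) / 2 = 3
// so only output x in [1, 3) reads valid input through this filter tap:
// out_x = 0 would read in_x = -1 (left padding), and out_x = 3 would read
// in_x = 5, one past the last input column.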

// Generic, portable, non-templatized fallback for FloatDepthwiseConvAccumRow.
inline void FloatDepthwiseConvAccumRowGeneric(
    int stride, int dilation_factor, int input_depth, int input_width,
    const float* input_data, int pad_width, int depth_multiplier,
    int filter_width, const float* filter_data, int out_x_buffer_start,
    int out_x_buffer_end, int output_depth, float* acc_buffer) {
  ruy::profiler::ScopeLabel label("DepthwiseConvAccumRowGeneric (slow)");
  const float* filter_base_ptr = filter_data;
  for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
    const int out_x_loop_start = std::max(
        out_x_buffer_start,
        (pad_width - dilation_factor * filter_x + stride - 1) / stride);
    const int out_x_loop_end = std::min(
        out_x_buffer_end,
        (pad_width + input_width - dilation_factor * filter_x + stride - 1) /
            stride);

    float* acc_buffer_ptr =
        acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth;
    const int in_x_origin =
        (out_x_loop_start * stride) - pad_width + dilation_factor * filter_x;
    const float* input_ptr = input_data + in_x_origin * input_depth;
    const int input_ptr_increment = (stride - 1) * input_depth;
    for (int out_x = out_x_loop_start; out_x < out_x_loop_end; out_x++) {
      const float* filter_ptr = filter_base_ptr;
      for (int ic = 0; ic < input_depth; ++ic) {
        const float input_val = *input_ptr++;
        for (int m = 0; m < depth_multiplier; m++) {
          const float filter_val = *filter_ptr++;
          *acc_buffer_ptr++ += filter_val * input_val;
        }
      }
      input_ptr += input_ptr_increment;
    }
    filter_base_ptr += output_depth;
  }
}

// Initializes the accumulator buffer with bias values.
inline void DepthwiseConvInitAccBuffer(int num_output_pixels, int output_depth,
                                       const float* bias_data,
                                       float* acc_buffer) {
  // TODO(benoitjacob): This might need optimized specializations
  // for small output_depth values, if that ever becomes an important
  // case (like it was for some quantized DepthwiseConv cases).
  for (int i = 0; i < num_output_pixels; i++) {
    memcpy(acc_buffer + i * output_depth, bias_data,
           sizeof(acc_buffer[0]) * output_depth);
  }
}

// DepthwiseConv can run multithreaded, split along the dimension specified
// by thread_dim. Each thread processes output elements in the range
// [thread_start, thread_end) along that dimension.
// For example, with thread_start = 2, thread_end = 6, and thread_dim = 1, a
// thread computes DepthwiseConv for output rows 2 through 5 inclusive, i.e.
// output_data[:, 2:6, :, :] in NumPy slicing notation.
//
// The cpu_flags parameter is currently unused. It is included so that the
// signature matches the one required by a templated function. Other
// versions, such as the quantized one, do need this parameter.
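//
// As an illustration (a hypothetical caller, not part of this header), a
// two-way split along the row axis could look like:
//
//   // Thread 0 computes output rows [0, output_height / 2).
//   DepthwiseConvImpl(params, input_shape, input_data, filter_shape,
//                     filter_data, bias_shape, bias_data, output_shape,
//                     output_data, cpu_flags, /*thread_start=*/0,
//                     /*thread_end=*/output_height / 2, /*thread_dim=*/1);
//   // Thread 1 computes output rows [output_height / 2, output_height).
//   DepthwiseConvImpl(params, input_shape, input_data, filter_shape,
//                     filter_data, bias_shape, bias_data, output_shape,
//                     output_data, cpu_flags,
//                     /*thread_start=*/output_height / 2,
//                     /*thread_end=*/output_height, /*thread_dim=*/1);
//
// The two calls write disjoint slices of output_data, so they can safely run
// concurrently.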
inline void DepthwiseConvImpl(
    const DepthwiseParams& params, const RuntimeShape& input_shape,
    const float* input_data, const RuntimeShape& filter_shape,
    const float* filter_data, const RuntimeShape& bias_shape,
    const float* bias_data, const RuntimeShape& output_shape,
    float* output_data, const CpuFlags& /* cpu_flags */, int thread_start,
    int thread_end, int thread_dim) {
  ruy::profiler::ScopeLabel label("DepthwiseConv/float/DepthwiseConvImpl");

  const int stride_width = params.stride_width;
  const int stride_height = params.stride_height;
  const int pad_width = params.padding_values.width;
  const int pad_height = params.padding_values.height;
  const int depth_multiplier = params.depth_multiplier;
  const float output_activation_min = params.float_activation_min;
  const float output_activation_max = params.float_activation_max;
  const int dilation_width_factor = params.dilation_width_factor;
  const int dilation_height_factor = params.dilation_height_factor;
  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
  TFLITE_DCHECK(thread_dim == 0 || thread_dim == 1);

  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
  const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
  const int input_height = input_shape.Dims(1);
  const int input_width = input_shape.Dims(2);
  const int input_depth = input_shape.Dims(3);
  const int filter_height = filter_shape.Dims(1);
  const int filter_width = filter_shape.Dims(2);
  const int output_height = output_shape.Dims(1);
  const int output_width = output_shape.Dims(2);
  TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier);
  TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);

  static const int kAccBufferMaxSize = 4832;
  float acc_buffer[kAccBufferMaxSize];
  TFLITE_DCHECK_GE(kAccBufferMaxSize, output_depth);
  const int kOutputPixelsInAccBuffer = kAccBufferMaxSize / output_depth;
  const int kAccBufferActualSize = kOutputPixelsInAccBuffer * output_depth;
  TFLITE_DCHECK_LE(kOutputPixelsInAccBuffer * output_depth,
                   kAccBufferActualSize);
  TFLITE_DCHECK_LE(kAccBufferActualSize, kAccBufferMaxSize);
  TFLITE_DCHECK_GE(kOutputPixelsInAccBuffer, 1);

  // row_accum_func will point to the core accumulation function to be used
  // for this DepthwiseConv op.
  using row_accum_func_t = decltype(&FloatDepthwiseConvAccumRowGeneric);
  row_accum_func_t row_accum_func = nullptr;

#define TFMINI_USE_DEPTHWISECONV_KERNEL(ALLOW_STRIDED, FIXED_INPUT_DEPTH, \
                                        FIXED_DEPTH_MULTIPLIER)           \
  if (!row_accum_func && (stride_width == 1 || ALLOW_STRIDED) &&          \
      (input_depth == FIXED_INPUT_DEPTH || FIXED_INPUT_DEPTH == 0) &&     \
      depth_multiplier == FIXED_DEPTH_MULTIPLIER) {                       \
    row_accum_func =                                                      \
        FloatDepthwiseConvAccumRow<ALLOW_STRIDED, FIXED_INPUT_DEPTH,      \
                                   FIXED_DEPTH_MULTIPLIER>;               \
  }
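
// For reference, the first use of this macro below,
// TFMINI_USE_DEPTHWISECONV_KERNEL(false, 8, 1), expands to:
//
//   if (!row_accum_func && (stride_width == 1 || false) &&
//       (input_depth == 8 || 8 == 0) && depth_multiplier == 1) {
//     row_accum_func = FloatDepthwiseConvAccumRow<false, 8, 1>;
//   }
//
// so the first matching entry in the preference list wins and all later
// entries are skipped.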

#ifdef USE_NEON
  // We go over our list of kernels in decreasing order of preference
  // for the cases where multiple kernels could apply.

  // Start with the fastest kernels: AllowStrided=false, fixed input depth.

  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 8, 1)
  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 1)

  // Next come the strided kernels: AllowStrided=true, fixed input depth.
  // They are a bit less efficient, but allow stride != 1.

  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 8, 1)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 8)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 20)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 32)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 2, 1)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 3, 2)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 3, 4)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 4, 1)

  // Finally, the kernels allowing a variable input depth;
  // these are the least efficient but most general kernels.

  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 1)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 2)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 8)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 16)

#endif  // USE_NEON

#undef TFMINI_USE_DEPTHWISECONV_KERNEL

  // No matching fast kernel found; use the slow fallback.
  if (!row_accum_func) {
    row_accum_func = FloatDepthwiseConvAccumRowGeneric;
  }

  const int input_height_stride = input_shape.Dims(3) * input_shape.Dims(2);
  const int input_batch_stride = input_height_stride * input_shape.Dims(1);
  const int filter_height_stride = filter_shape.Dims(3) * filter_shape.Dims(2);

  // Now that we have determined row_accum_func, we can start work.
  int batch_start = 0;
  int batch_end = batches;
  int row_start = 0;
  int row_end = output_height;
  int output_ptr_offset = 0;

  switch (thread_dim) {
    case 0:
      // Multithread along the batch axis.
      TFLITE_DCHECK_GE(thread_start, 0);
      TFLITE_DCHECK_LE(thread_end, batches);
      batch_start = thread_start;
      batch_end = thread_end;
      output_ptr_offset = batch_start * FlatSizeSkipDim(output_shape, 0);
      break;
    case 1:
      // Multithread along the row axis.
      TFLITE_DCHECK_GE(thread_start, 0);
      TFLITE_DCHECK_LE(thread_end, output_height);
      row_start = thread_start;
      row_end = thread_end;
      output_ptr_offset = row_start * output_width * output_depth;
      break;
  }

  float* output_ptr = output_data + output_ptr_offset;
  int batch_step =
      (output_height + row_start - row_end) * output_width * output_depth;

  for (int b = batch_start; b < batch_end; ++b) {
    for (int out_y = row_start; out_y < row_end; ++out_y) {
      const int in_y_origin = (out_y * stride_height) - pad_height;
      const int filter_y_start =
          std::max(0, (-in_y_origin + dilation_height_factor - 1) /
                          dilation_height_factor);
      const int filter_y_end =
          std::min(filter_height,
                   (input_height - in_y_origin + dilation_height_factor - 1) /
                       dilation_height_factor);
      for (int out_x_buffer_start = 0; out_x_buffer_start < output_width;
           out_x_buffer_start += kOutputPixelsInAccBuffer) {
        const int out_x_buffer_end = std::min(
            output_width, out_x_buffer_start + kOutputPixelsInAccBuffer);
        // We call a 'pixel' a group of activations that share all but the
        // 'depth'/'channel' coordinate. num_output_pixels is the number of
        // output pixels that we will accumulate in this loop iteration.
        const int num_output_pixels = out_x_buffer_end - out_x_buffer_start;
        // Initialize our local accumulator with the bias values, so we don't
        // have to add them later.
        DepthwiseConvInitAccBuffer(num_output_pixels, output_depth, bias_data,
                                   acc_buffer);
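        // Note the x tiling: each iteration of this out_x_buffer loop covers
        // at most kOutputPixelsInAccBuffer pixels, so the accumulators for a
        // tile always fit within the fixed kAccBufferMaxSize-float
        // acc_buffer stack array.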
        // Accumulation loop. Most of the time should be spent in here.
        for (int filter_y = filter_y_start; filter_y < filter_y_end;
             ++filter_y) {
          const int in_y = in_y_origin + dilation_height_factor * filter_y;
          row_accum_func(
              stride_width, dilation_width_factor, input_depth, input_width,
              input_data + in_y * input_height_stride + b * input_batch_stride,
              pad_width, depth_multiplier, filter_width,
              filter_data + filter_y * filter_height_stride,
              out_x_buffer_start, out_x_buffer_end, output_depth, acc_buffer);
        }
        // Finished accumulating. Now store to destination.
        const int num_output_values = output_depth * num_output_pixels;
        int i = 0;
        // TODO(benoitjacob): optimized code goes here.
#ifdef USE_NEON
        // Handle 16 values at a time
        for (; i <= num_output_values - 16; i += 16) {
          float32x4_t acc[4];
          for (int k = 0; k < 4; k++) {
            acc[k] = vld1q_f32(acc_buffer + i + 4 * k);
          }
          for (int k = 0; k < 4; k++) {
            acc[k] = vmaxq_f32(
                vdupq_n_f32(output_activation_min),
                vminq_f32(vdupq_n_f32(output_activation_max), acc[k]));
          }
          for (int k = 0; k < 4; k++) {
            vst1q_f32(output_ptr + 4 * k, acc[k]);
          }
          output_ptr += 16;
        }
        // Handle 4 values at a time
        for (; i <= num_output_values - 4; i += 4) {
          float32x4_t acc = vld1q_f32(acc_buffer + i);

          acc = vmaxq_f32(vdupq_n_f32(output_activation_min),
                          vminq_f32(vdupq_n_f32(output_activation_max), acc));

          vst1q_f32(output_ptr, acc);
          output_ptr += 4;
        }
#endif
        // Handle leftover values, one by one. This is very slow.
        for (; i < num_output_values; i++) {
          float acc = acc_buffer[i];
          acc = std::max(output_activation_min,
                         std::min(output_activation_max, acc));
          *output_ptr++ = acc;
        }
      }
    }
    output_ptr += batch_step;
  }
}

}  // namespace optimized_ops
}  // namespace tflite

#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_FLOAT_H_