1  /*
2   * Copyright (C) 2017 The Android Open Source Project
3   *
4   * Licensed under the Apache License, Version 2.0 (the "License");
5   * you may not use this file except in compliance with the License.
6   * You may obtain a copy of the License at
7   *
8   *      http://www.apache.org/licenses/LICENSE-2.0
9   *
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   * See the License for the specific language governing permissions and
14   * limitations under the License.
15   */
16  
17  #define LOG_TAG "ExecutionPlan"
18  
19  #include "ExecutionPlan.h"
20  
21  #include <ControlFlow.h>
22  #include <CpuExecutor.h>
23  #include <GraphDump.h>
24  #include <LegacyUtils.h>
25  #include <MetaModel.h>
26  #include <OperationsUtils.h>
27  #include <TokenHasher.h>
28  #include <Tracing.h>
29  #include <android-base/logging.h>
30  #include <fcntl.h>
31  #include <nnapi/IBurst.h>
32  #include <sys/stat.h>
33  #include <sys/types.h>
34  
35  #include <algorithm>
36  #include <functional>
37  #include <map>
38  #include <memory>
39  #include <mutex>
40  #include <queue>
41  #include <set>
42  #include <string>
43  #include <type_traits>
44  #include <unordered_set>
45  #include <utility>
46  #include <vector>
47  
48  #include "BurstBuilder.h"
49  #include "CompilationBuilder.h"
50  #include "ExecutionBuilder.h"
51  #include "ExecutionCallback.h"
52  #include "Manager.h"
53  #include "ModelBuilder.h"
54  #include "TypeManager.h"
55  
56  namespace android {
57  namespace nn {
58  
59  namespace {
60  
// The index of the main model in SourceModels.
constexpr uint32_t kMainModelInSourceModels = 0;

// A padding of 1 byte means "no padding": rounding a length up to a multiple
// of 1 leaves it unchanged. Passed to addTemporary when the length is already
// padded.
constexpr uint32_t kNoPadding = 1;
65  
updateTokenFromMetaData(TokenHasher * token,const std::vector<TokenValuePair> & metaData)66  static bool updateTokenFromMetaData(TokenHasher* token,
67                                      const std::vector<TokenValuePair>& metaData) {
68      // Combines the TokenValuePair and corresponding extension name.
69      std::vector<std::tuple<const char*, uint16_t, const uint8_t*, size_t>> metaDataWithExtension;
70      for (auto p : metaData) {
71          uint16_t prefix = static_cast<uint32_t>(p.token) >> kExtensionTypeBits;
72          uint16_t extensionEnum = static_cast<uint32_t>(p.token) & kTypeWithinExtensionMask;
73          const Extension* extension;
74          if (!TypeManager::get()->getExtensionInfo(prefix, &extension)) {
75              LOG(ERROR) << "Prefix " << prefix << " could not be found";
76              return false;
77          }
78          metaDataWithExtension.push_back(std::make_tuple(extension->name.c_str(), extensionEnum,
79                                                          p.value.data(), p.value.size()));
80      }
81      // Sort with extension name and extension enum.
82      std::sort(metaDataWithExtension.begin(), metaDataWithExtension.end(),
83                [](const auto& a, const auto& b) {
84                    if (int r = strcmp(std::get<0>(a), std::get<0>(b))) {
85                        return r < 0;
86                    } else {
87                        return std::get<1>(a) < std::get<1>(b);
88                    }
89                });
90      // Update the cache token with the sorted array.
91      for (auto [extensionName, extensionEnum, value, valueSize] : metaDataWithExtension) {
92          if (!token->updateFromString(extensionName) ||
93              !token->update(&extensionEnum, sizeof(uint16_t)) || !token->update(value, valueSize)) {
94              return false;
95          }
96      }
97      return true;
98  }
99  
// Compiles the model on device.
// If compilation caching is available, depending on ExecutionPlan::mState, the token may only have
// been initialized by the user provided token (SIMPLE body), or is already re-hashed by the
// operation indices to be executed (COMPOUND body). The token will be re-hashed further by the
// device name, device version string, and the execution preference in this function.
//
// On success *preparedModel holds the prepared model; on failure it is left
// null. Returns the ANEURALNETWORKS_* result code from Device::prepareModel.
int compile(const Device& device, const ModelBuilder& model, int executionPreference,
            int compilationPriority, const OptionalTimePoint& deadline, const CacheInfo& cacheInfo,
            TokenHasher* token, const std::vector<TokenValuePair>& metaData,
            std::shared_ptr<RuntimePreparedModel>* preparedModel) {
    CHECK(token != nullptr);
    CHECK(preparedModel != nullptr);
    *preparedModel = nullptr;

    // Only hand a cache token to the driver if caching is supported AND every
    // piece of identifying state was successfully folded into the hash. The &&
    // chain is order-dependent: each update feeds the same running hash, and
    // finish() must come last.
    std::optional<CacheToken> cacheToken;
    if (device.isCachingSupported() && token->ok() &&
        token->updateFromString(device.getName().c_str()) &&
        token->updateFromString(device.getVersionString().c_str()) &&
        token->update(&executionPreference, sizeof(executionPreference)) &&
        token->update(&compilationPriority, sizeof(compilationPriority)) &&
        updateTokenFromMetaData(token, metaData) && token->finish()) {
        cacheToken = CacheToken{};
        const uint8_t* tokenPtr = token->getCacheToken();
        std::copy(tokenPtr, tokenPtr + cacheToken->size(), cacheToken->begin());
    }

    const ModelFactory makeModel = [&model] { return model.makeModel(); };
    const ExecutionPreference preference = static_cast<ExecutionPreference>(executionPreference);
    const Priority priority = convertToCanonicalPriority(compilationPriority);
    std::vector<ExtensionNameAndPrefix> extensionNameAndPrefix =
            TypeManager::get()->getExtensionNameAndPrefix(metaData);
    const auto [n, returnedPreparedModel] =
            device.prepareModel(makeModel, preference, priority, deadline, cacheInfo, cacheToken,
                                metaData, extensionNameAndPrefix);
    *preparedModel = returnedPreparedModel;
    return n;
}
136  
137  typedef std::function<void(uint32_t)> OperationReadyCallback;
138  
copyOperandExtraParams(ModelBuilder & model,uint32_t toOperandIndex,const Operand & fromOperand)139  int copyOperandExtraParams(ModelBuilder& model, uint32_t toOperandIndex,
140                             const Operand& fromOperand) {
141      if (fromOperand.type == OperandType::TENSOR_QUANT8_SYMM_PER_CHANNEL &&
142          std::holds_alternative<Operand::SymmPerChannelQuantParams>(fromOperand.extraParams)) {
143          auto& fromChannelQuant =
144                  std::get<Operand::SymmPerChannelQuantParams>(fromOperand.extraParams);
145          ANeuralNetworksSymmPerChannelQuantParams toChannelQuant = {
146                  .channelDim = fromChannelQuant.channelDim,
147                  .scaleCount = static_cast<uint32_t>(fromChannelQuant.scales.size()),
148                  .scales = fromChannelQuant.scales.data(),
149          };
150          return model.setOperandSymmPerChannelQuantParams(toOperandIndex, toChannelQuant);
151      } else if (isExtension(fromOperand.type) &&
152                 std::holds_alternative<Operand::ExtensionParams>(fromOperand.extraParams)) {
153          auto extensionData = std::get<Operand::ExtensionParams>(fromOperand.extraParams);
154          return model.setOperandExtensionData(toOperandIndex, extensionData.data(),
155                                               extensionData.size());
156      } else if (!std::holds_alternative<Operand::NoParams>(fromOperand.extraParams) ||
157                 fromOperand.type == OperandType::TENSOR_QUANT8_SYMM_PER_CHANNEL) {
158          LOG(ERROR) << "Type " << fromOperand.type
159                     << " has an unexpected extraParams variant: " << fromOperand.extraParams.index();
160          return ANEURALNETWORKS_BAD_DATA;
161      } else {
162          return ANEURALNETWORKS_NO_ERROR;
163      }
164  }
165  
// This class tracks whether we know the value of an operand as operations
// are processed.
class OperandTracker {
   public:
    // Creates the tracker for this model. Figure out which operations can be
    // executed right away and cb for each one of them.
    OperandTracker(const ModelBuilder* model, OperationReadyCallback cb);
    // Mark the specified operation as having been processed. The output
    // of the operation now being known, this may make new operations to be
    // able to run.  Call cb for each one of them.
    void markProcessed(uint32_t operationIndex, OperationReadyCallback cb);

   private:
    const ModelBuilder* mModel;
    // Maps an operand index to the indexes of the operations that consume it
    // while its value is still unknown (multimap: one operand may feed
    // several operations).
    std::multimap<uint32_t, uint32_t> mOperandToOperations;
    // For each operation, the number of its inputs whose values are not yet known.
    std::vector<uint32_t> mUnknownInputCount;  // For each operation
};
183  
OperandTracker(const ModelBuilder * model,OperationReadyCallback cb)184  OperandTracker::OperandTracker(const ModelBuilder* model, OperationReadyCallback cb)
185      : mModel(model) {
186      const auto& operations = mModel->getOperations();
187      mUnknownInputCount.resize(operations.size());
188      for (uint32_t operationIndex = 0; operationIndex < operations.size(); operationIndex++) {
189          const Operation& operation = operations[operationIndex];
190          uint32_t count = 0;
191          for (uint32_t operandIndex : operation.inputs) {
192              auto lifetime = mModel->getOperand(operandIndex).lifetime;
193              if (lifetime == Operand::LifeTime::TEMPORARY_VARIABLE ||
194                  lifetime == Operand::LifeTime::SUBGRAPH_OUTPUT) {
195                  count++;
196                  mOperandToOperations.emplace(operandIndex, operationIndex);
197              }
198          }
199          if (count == 0) {
200              cb(operationIndex);
201          }
202          mUnknownInputCount[operationIndex] = count;
203      }
204  }
205  
markProcessed(uint32_t operationIndex,OperationReadyCallback cb)206  void OperandTracker::markProcessed(uint32_t operationIndex, OperationReadyCallback cb) {
207      // Mark all its outputs as known.
208      const Operation& operation = mModel->getOperations()[operationIndex];
209      for (uint32_t operandIndex : operation.outputs) {
210          auto range = mOperandToOperations.equal_range(operandIndex);
211          for (auto i = range.first; i != range.second; i++) {
212              uint32_t& count = mUnknownInputCount[i->second];
213              if (--count == 0) {
214                  cb(i->second);
215              }
216          }
217      }
218  }
219  
addTemporary(uint32_t * totalSizeOfTemporaries,uint32_t size,uint32_t alignment,uint32_t padding)220  StaticTemporaryLocation addTemporary(uint32_t* totalSizeOfTemporaries, uint32_t size,
221                                       uint32_t alignment, uint32_t padding) {
222      // TODO: what about overflow?
223      *totalSizeOfTemporaries = roundUp(*totalSizeOfTemporaries, alignment);
224      const uint32_t offset = *totalSizeOfTemporaries;
225      size = roundUp(size, padding);
226      *totalSizeOfTemporaries += size;
227      return {.offset = offset, .paddedLength = size};
228  };
229  
toString(SourceOperandIndex sourceOperandIndex)230  std::string toString(SourceOperandIndex sourceOperandIndex) {
231      return "(" + std::to_string(sourceOperandIndex.first) + ", " +
232             std::to_string(sourceOperandIndex.second) + ")";
233  };
234  
235  // A helper class to analyze the step roles of all partition boundary operands.
236  //
237  // To use, call StepRoleAnalyzer::analyze and pass in a setup function that configures the analyzer
238  // with the following two methods:
239  //   - addRole: Add a step role to a boundary operand
240  //   - setUsedBy: Specify that the memory of the "source" operand may be directly used by the "dest"
241  //     operand. All of the step roles of the "dest" operand are also possible step roles of the
242  //     "source" operand. This is useful for interpreted control flow, e.g., the outer input operand
243  //     of an interpreted IF operation may be directly used as all step roles of the corresponding
244  //     input operand of the then and else models. Note that this relationship is directional --
245  //     (A->B && B->C) implies A->C, but (A->C && B->C) does not imply A->B or B->A (A->B is a
246  //     shorthand for setUsedBy(A, B)). The setup function must guarantee that the final graph
247  //     produced by the used-by relationship is acyclic. This is true for the partitioner algorithm
248  //     because there must be a root operand of each step role for the memory to be allocated on
249  //     behalf of.
250  //
251  class StepRoleAnalyzer {
252     public:
analyze(const std::function<void (StepRoleAnalyzer &)> & setup)253      static std::map<SourceOperandIndex, std::set<StepRole>> analyze(
254              const std::function<void(StepRoleAnalyzer&)>& setup) {
255          StepRoleAnalyzer analyzer;
256          setup(analyzer);
257          return analyzer.finish();
258      }
259  
addRole(const ExecutionStep & step,uint32_t operandIndex,IOType type,uint32_t stepIOIndex)260      void addRole(const ExecutionStep& step, uint32_t operandIndex, IOType type,
261                   uint32_t stepIOIndex) {
262          SourceOperandIndex source = {step.getSourceModelIndex(), operandIndex};
263          mRoles[source].emplace(step.getIndex(), type, stepIOIndex);
264      }
265  
setUsedBy(const SourceOperandIndex & source,const SourceOperandIndex & dest)266      void setUsedBy(const SourceOperandIndex& source, const SourceOperandIndex& dest) {
267          mUsedBy[source].emplace(dest);
268      }
269  
270     private:
271      StepRoleAnalyzer() = default;
272  
273      // Merges the step roles of the destination operands to the source operands
274      // and returns the final map.
finish()275      std::map<SourceOperandIndex, std::set<StepRole>> finish() {
276          for (const auto& [source, _] : mUsedBy) {
277              finishHelper(source);
278          }
279          return std::move(mRoles);
280      }
281  
finishHelper(SourceOperandIndex current)282      void finishHelper(SourceOperandIndex current) {
283          if (mProcessedOperands.count(current) > 0) return;
284          mProcessedOperands.insert(current);
285          const auto it = mUsedBy.find(current);
286          if (it != mUsedBy.end()) {
287              auto& roles = mRoles[current];
288              // Merge the step roles of the destination operands.
289              for (const auto& dest : it->second) {
290                  finishHelper(dest);
291                  const auto& destRoles = mRoles[dest];
292                  roles.insert(destRoles.begin(), destRoles.end());
293              }
294          }
295      }
296  
297      // A map from the source operand to its step roles.
298      std::map<SourceOperandIndex, std::set<StepRole>> mRoles;
299      // A map from the source operand to a set of destination operands that may directly
300      // use the memory of the source operand.
301      std::map<SourceOperandIndex, std::set<SourceOperandIndex>> mUsedBy;
302      // Used in finish to track which operand has been processed.
303      std::set<SourceOperandIndex> mProcessedOperands;
304  };
305  
306  }  // namespace
307  
vlogDump(const char * context) const308  void DynamicTemporaries::vlogDump(const char* context) const {
309      if (empty()) {
310          return;
311      }
312      if (context) {
313          VLOG(EXECUTION) << "DynamicTemporaries: \"" << context << "\"";
314      }
315      for (const auto& temp : mSourceOperandToTemporary) {
316          VLOG(EXECUTION) << "DynamicTemporaries: sourceOperandIndex = " << toString(temp.first)
317                          << ", stepIndex = " << temp.second.stepIndex
318                          << ", offset = " << temp.second.offset
319                          << ", dimensions = " << toString(temp.second.dimensions)
320                          << ", paddedLength = " << temp.second.paddedLength
321                          << ", alignment = " << temp.second.alignment
322                          << ", padding = " << temp.second.padding;
323      }
324  }
325  
// Registers a dynamic temporary produced by step "stepIndex". The initial
// length is rounded up to "padding"; "alignment" is remembered for the layout
// pass in allocate(). Each source operand may be declared only once, and all
// declarations must happen before the declaration phase is closed (while
// mDeclared is still false).
void DynamicTemporaries::declare(SourceOperandIndex sourceOperandIndex, uint32_t stepIndex,
                                 const Dimensions& initialDimensions, uint32_t initialLength,
                                 uint32_t alignment, uint32_t padding) {
    VLOG(EXECUTION) << "DynamicTemporaries::declare(sourceOperandIndex = "
                    << toString(sourceOperandIndex) << ", stepIndex = " << stepIndex
                    << ", initialDimensions = " << toString(initialDimensions)
                    << ", initialLength = " << initialLength << ", alignment = " << alignment
                    << ", padding = " << padding << ")";
    CHECK(!mDeclared);
    CHECK_GT(initialLength, 0u);
    const uint32_t paddedLength = roundUp(initialLength, padding);
    // Offset starts at 0; the real offset is assigned by allocate().
    auto [_, isNew] = mSourceOperandToTemporary.emplace(
            sourceOperandIndex, InternalLocationAndShape{stepIndex, 0, initialDimensions,
                                                         paddedLength, alignment, padding});
    CHECK(isNew);
    mStepIndexToSourceOperandIndexes[stepIndex].emplace_back(sourceOperandIndex);
}
343  
// Updates the shape and length of a previously declared dynamic temporary once
// better shape information becomes available. Returns true iff the recorded
// shape actually changed. Growing beyond the current padded length invalidates
// any existing allocation for the owning step; shrinking keeps the allocation
// (suboptimal but still valid), so the client is not forced to reallocate just
// for reporting a more accurate, smaller shape.
bool DynamicTemporaries::redeclare(SourceOperandIndex sourceOperandIndex,
                                   const Dimensions& newDimensions, uint32_t newLength) {
    // Helper so every exit path logs its result uniformly.
    auto createAndLogResult = [sourceOperandIndex, &newDimensions, newLength](bool changedShape) {
        VLOG(EXECUTION) << "DynamicTemporaries::redeclare(sourceOperandIndex = "
                        << toString(sourceOperandIndex)
                        << ", newDimensions = " << toString(newDimensions)
                        << ", newLength = " << newLength << ") -> " << toString(changedShape);
        return changedShape;
    };

    CHECK(mDeclared);
    CHECK_GT(newLength, 0u);

    InternalLocationAndShape& temp = mSourceOperandToTemporary.at(sourceOperandIndex);
    const uint32_t paddedLength = roundUp(newLength, temp.padding);
    if (temp.paddedLength == paddedLength && temp.dimensions == newDimensions) {
        return createAndLogResult(false);
    }
    if (temp.paddedLength < paddedLength) {
        // Otherwise allocation remains valid, even if it may be suboptimal
        // (because it uses more space than needed).  Use case: Don't force
        // client to allocate again just because the client reported more
        // accurate shape information.
        mAllocatedStepIndexes.erase(temp.stepIndex);
    }
    temp.paddedLength = paddedLength;
    temp.dimensions = newDimensions;
    return createAndLogResult(true);
}
373  
// Lays out and (re-)allocates the dynamic temporaries produced by step
// "stepIndex". Existing memory is reused when it is large enough but not
// wastefully oversized; otherwise a fresh ashmem region is created. Returns
// ANEURALNETWORKS_NO_ERROR on success, including when the step has no dynamic
// temporaries at all.
int DynamicTemporaries::allocate(uint32_t stepIndex) {
    VLOG(EXECUTION) << "DynamicTemporaries::allocate(stepIndex = " << stepIndex << ")";

    CHECK(mDeclared);

    const auto sourceOperandIndexesI = mStepIndexToSourceOperandIndexes.find(stepIndex);
    if (sourceOperandIndexesI == mStepIndexToSourceOperandIndexes.end()) {
        return ANEURALNETWORKS_NO_ERROR;
    }

    // perform layout
    uint32_t newSize = 0;
    for (const auto& sourceOperandIndex : sourceOperandIndexesI->second) {
        InternalLocationAndShape& temp = mSourceOperandToTemporary.at(sourceOperandIndex);
        // temp.paddedLength is already padded in declare and redeclare.
        CHECK(temp.paddedLength % temp.padding == 0);
        // kNoPadding here because paddedLength already accounts for padding.
        temp.offset = addTemporary(&newSize, temp.paddedLength, temp.alignment, kNoPadding).offset;
    }

    // perform (re-)allocation
    // TODO: Today we may shrink the allocation in order to avoid wasting memory.  Is this important
    //       to conserve memory, or do we waste time reallocating?
    const double kWaste = 0.2 /* arbitrary */;  // Willing to waste space to avoid
                                                // deallocation/reallocation overhead
    auto& memory = mStepIndexToMemory[stepIndex];
    const uint32_t oldSize = (memory ? memory->getSize() : 0);
    if ((oldSize >= newSize) && (oldSize <= newSize * (1 + kWaste))) {
        // Suitable allocation already exists; nothing to do
    } else {
        int n;
        std::tie(n, memory) = MemoryAshmem::create(newSize);
        if (n != ANEURALNETWORKS_NO_ERROR) {
            LOG(ERROR) << "Failed to allocate dynamic temporaries of size " << newSize
                       << " for step " << stepIndex;
            mAllocatedStepIndexes.erase(stepIndex);
            return n;
        }
    }

    mAllocatedStepIndexes.insert(stepIndex);
    return ANEURALNETWORKS_NO_ERROR;
}
416  
allocated(uint32_t stepIndex) const417  bool DynamicTemporaries::allocated(uint32_t stepIndex) const {
418      return (mStepIndexToSourceOperandIndexes.find(stepIndex) ==
419              mStepIndexToSourceOperandIndexes.end()) ||
420             mAllocatedStepIndexes.count(stepIndex);
421  }
422  
// Looks up the location and shape of the dynamic temporary identified by
// "sourceOperandIndex". Returns std::nullopt if no such temporary was
// declared. If the owning step has not yet been allocated, the returned
// location has a null memory pointer and an offset of ~0 (dimensions and
// padded length are still meaningful); when mustBeAllocated is true, an
// unallocated temporary is a fatal error instead.
std::optional<DynamicTemporaries::LocationAndShape> DynamicTemporaries::lookup(
        SourceOperandIndex sourceOperandIndex, bool mustBeAllocated) const {
    CHECK(mDeclared);
    if (auto it = mSourceOperandToTemporary.find(sourceOperandIndex);
        it != mSourceOperandToTemporary.end()) {
        const InternalLocationAndShape& temp = it->second;
        const bool isAllocated = allocated(temp.stepIndex);
        if (mustBeAllocated) {
            CHECK(isAllocated) << "Source operand " << toString(sourceOperandIndex)
                               << " must be allocated";
        }
        if (isAllocated) {
            return LocationAndShape{mStepIndexToMemory.at(temp.stepIndex).get(), temp.offset,
                                    &temp.dimensions, temp.paddedLength};
        } else {
            // Not yet allocated: report shape only, with sentinel memory/offset.
            return LocationAndShape{nullptr, ~uint32_t(0), &temp.dimensions, temp.paddedLength};
        }
    }
    return std::nullopt;
}
443  
// Creates an execution step (one partition of source model "sourceModelIndex")
// to be run on "device". The step's cache token starts as a copy of the plan's
// token and is then specialized per step as operations are added.
ExecutionStep::ExecutionStep(ExecutionPlan* plan, uint32_t stepIndex, uint32_t sourceModelIndex,
                             std::shared_ptr<Device> device)
    : mPlan(plan),
      mIndex(stepIndex),
      mSourceModelIndex(sourceModelIndex),
      mStepModel(),
      mDevice(device),
      mToken(plan->getCacheToken()) {}
452  
// Adds an operand if it has not been added already.
// Sets the index in the step model for the corresponding operand.
int ExecutionStep::addOperand(uint32_t sourceOperandIndex, uint32_t* stepOperandIndex,
                              OperandKind kind) {
    // Have we added this operand already?
    auto i = mOperandMap.find(sourceOperandIndex);
    if (i != mOperandMap.end()) {
        // Operations are added in topological order, so an operand can only be
        // seen a second time as an input (its defining output came first).
        CHECK(kind == INPUT);
        *stepOperandIndex = i->second;
        return ANEURALNETWORKS_NO_ERROR;
    }

    // First time we add this operand.
    *stepOperandIndex = mStepModel.operandCount();
    mOperandMap.emplace(sourceOperandIndex, *stepOperandIndex);

    // Add the operand to the step model.
    const ModelBuilder& sourceModel = *getSourceModel();
    const Operand& operand = sourceModel.getOperand(sourceOperandIndex);
    ANeuralNetworksOperandType type = {
            .type = static_cast<int32_t>(operand.type),
            .dimensionCount = static_cast<uint32_t>(operand.dimensions.size()),
            .dimensions = operand.dimensions.size() > 0 ? operand.dimensions.data() : nullptr,
            .scale = operand.scale,
            .zeroPoint = operand.zeroPoint,
    };

    int n = mStepModel.addOperand(type);
    if (n != ANEURALNETWORKS_NO_ERROR) {
        LOG(ERROR) << "Previous error occurred when partitioning the graph";
        return n;
    }

    // Per-channel quantization params and extension data are not part of
    // ANeuralNetworksOperandType and must be copied separately.
    n = copyOperandExtraParams(mStepModel, *stepOperandIndex, operand);
    if (n != ANEURALNETWORKS_NO_ERROR) {
        LOG(ERROR) << "Error when copying extra parameters to the operand";
        return n;
    }

    // Sets its value.
    switch (operand.lifetime) {
        case Operand::LifeTime::CONSTANT_COPY: {
            const uint8_t* data = sourceModel.getPointerToOperandValue(operand.location.offset);
            n = mStepModel.setOperandValue(*stepOperandIndex, data, operand.location.length);
        } break;
        case Operand::LifeTime::CONSTANT_REFERENCE: {
            const RuntimeMemory* memory = sourceModel.getMemories()[operand.location.poolIndex];
            n = mStepModel.setOperandValueFromMemory(
                    *stepOperandIndex, memory, operand.location.offset, operand.location.length);
        } break;
        case Operand::LifeTime::NO_VALUE: {
            n = mStepModel.setOperandValue(*stepOperandIndex, nullptr, 0);
        } break;
        case Operand::LifeTime::TEMPORARY_VARIABLE: {  // handled similarly to SUBGRAPH_OUTPUT
            if (kind == INPUT) {
                // The first time we've seen this operand is as an
                // input.  That means it must be defined by a
                // different partition, and is an input to this one.
                mTempsAsStepModelInputs.emplace_back(sourceOperandIndex, *stepOperandIndex);
            } else {
                // The first time we've seen this operand is as an
                // output.  It may be an input to a different
                // partition, so keep track of it.
                mPlan->recordTemporaryDef(SourceOperandIndex(mSourceModelIndex, sourceOperandIndex),
                                          mIndex);
            }
        } break;
        case Operand::LifeTime::SUBGRAPH_INPUT: {
            mModelInputs.emplace_back(sourceOperandIndex, *stepOperandIndex);
        } break;
        case Operand::LifeTime::SUBGRAPH_OUTPUT: {  // handled similarly to TEMPORARY_VARIABLE
            if (kind == INPUT) {
                // The first time we've seen this operand is as an
                // input.  That means it must be defined by a
                // different partition, and is an input to this one.
                mOutputsAsStepModelInputs.emplace_back(sourceOperandIndex, *stepOperandIndex);
            } else {
                // The first time we've seen this operand is as an
                // output.
                mModelOutputs.emplace_back(sourceOperandIndex, *stepOperandIndex);
                // It may be an input to a different partition, so keep track of
                // it.
                mPlan->recordOutputDef(SourceOperandIndex(mSourceModelIndex, sourceOperandIndex),
                                       mIndex);
            }
        } break;
        case Operand::LifeTime::SUBGRAPH: {
            const ModelBuilder* model = sourceModel.getReferencedModel(operand);
            n = mStepModel.setOperandValueFromModel(*stepOperandIndex, model);
        } break;
        case Operand::LifeTime::POINTER: {
            const void* data = std::get<const void*>(operand.location.pointer);
            n = mStepModel.setOperandValue(*stepOperandIndex, data, operand.location.length);
        } break;
    }

    if (n != ANEURALNETWORKS_NO_ERROR) {
        LOG(ERROR) << "Previous error occurred when partitioning the graph";
    }
    return n;
}
554  
// Adds the source operation at "operationIndex" to the step model, first
// adding any of its operands that are not yet present. Also folds the (source
// model index, operation index) pair into the compilation cache token so the
// token identifies exactly which operations this step executes.
int ExecutionStep::addOperation(int operationIndex) {
    const Operation& operation = getSourceModel()->getOperation(operationIndex);
    if (mToken.ok()) {
        mToken.update(&mSourceModelIndex, sizeof(mSourceModelIndex));
        mToken.update(&operationIndex, sizeof(operationIndex));
    }

    // Convert the input and output operand indexes.
    //
    // We expect operations to be added in topological order.  Therefore:
    //
    // - We may not have seen an input if it is a model input, a
    //   constant, or an operand written by a different partition.
    //
    // - We should not have seen any outputs.
    auto addOperands = [this](const std::vector<uint32_t>& sourceModelOperands,
                              std::vector<uint32_t>* stepModelOperands, OperandKind kind) -> int {
        const uint32_t operandCount = static_cast<uint32_t>(sourceModelOperands.size());
        for (uint32_t i = 0; i < operandCount; i++) {
            NN_RETURN_IF_ERROR(addOperand(sourceModelOperands[i], &stepModelOperands->at(i), kind));
        }
        return ANEURALNETWORKS_NO_ERROR;
    };

    const uint32_t inputCount = static_cast<uint32_t>(operation.inputs.size());
    const uint32_t outputCount = static_cast<uint32_t>(operation.outputs.size());
    std::vector<uint32_t> inputs(inputCount);
    std::vector<uint32_t> outputs(outputCount);
    NN_RETURN_IF_ERROR(addOperands(operation.inputs, &inputs, INPUT));
    NN_RETURN_IF_ERROR(addOperands(operation.outputs, &outputs, OUTPUT));
    return mStepModel.addOperation(static_cast<uint32_t>(operation.type), inputCount, inputs.data(),
                                   outputCount, outputs.data());
}
588  
// Binds each of this step's model inputs and outputs to a concrete data
// location on |executor|.  For every partition-boundary operand, the possible
// locations are tried in a fixed order:
//   1) a static temporary within |temporaryMemory| (offset/length recorded in
//      |sourceOperandToLocationOfTemporary|);
//   2) a dynamic temporary (looked up in |dynamicTemporaries|);
//   3) a main model input (|sourceOperandToInputIndex|) -- step inputs only;
//   4) a main model output (|sourceOperandToOutputIndex|);
//   5) a constant at a control flow partition boundary
//      (|sourceOperandToConstantReference|) -- step inputs only.
// It is a fatal error (CHECK failure) if an operand matches none of these.
// |mainModelOutputShapes|, when non-null, supplies the current dimensions of
// main model outputs that are consumed as step inputs.
void ExecutionStep::mapInputsAndOutputs(
        std::shared_ptr<StepExecutor> executor,
        const std::vector<OutputShape>* mainModelOutputShapes, const RuntimeMemory* temporaryMemory,
        const std::map<SourceOperandIndex, StaticTemporaryLocation>&
                sourceOperandToLocationOfTemporary,
        const DynamicTemporaries& dynamicTemporaries,
        const std::map<SourceOperandIndex, uint32_t>& sourceOperandToInputIndex,
        const std::map<SourceOperandIndex, uint32_t>& sourceOperandToOutputIndex,
        const std::map<SourceOperandIndex, ConstantReferenceLocation>&
                sourceOperandToConstantReference) const {
    auto mapInput = [&](uint32_t stepModelOperandIndex, uint32_t stepInputIndex) {
        SourceOperandIndex sourceOperandIndex(mSourceModelIndex, stepModelOperandIndex);
        // Case 1: static temporary.
        if (auto it = sourceOperandToLocationOfTemporary.find(sourceOperandIndex);
            it != sourceOperandToLocationOfTemporary.end()) {
            const auto& loc = it->second;
            executor->setInputFromMemory(stepInputIndex, temporaryMemory, loc.offset,
                                         loc.paddedLength);
        } else if (auto loc = dynamicTemporaries.lookup(sourceOperandIndex); loc != std::nullopt) {
            // Case 2: dynamic temporary (dimensions supplied at execution time).
            executor->setInputFromMemory(stepInputIndex, loc->memory, loc->offset,
                                         loc->paddedLength, *loc->dimensions);
        } else if (auto it = sourceOperandToInputIndex.find(sourceOperandIndex);
                   it != sourceOperandToInputIndex.end()) {
            // Case 3: main model input.
            executor->mapInput(it->second, stepInputIndex);
        } else if (auto it = sourceOperandToOutputIndex.find(sourceOperandIndex);
                   it != sourceOperandToOutputIndex.end()) {
            // Case 4: main model output that is also a downstream step input.
            executor->mapOutputToInput(it->second, stepInputIndex,
                                       mainModelOutputShapes
                                               ? &mainModelOutputShapes->at(it->second).dimensions
                                               : nullptr);
        } else if (auto it = sourceOperandToConstantReference.find(sourceOperandIndex);
                   it != sourceOperandToConstantReference.end()) {
            // Constant partition boundary operand. This could be an IF branch
            // model input or a WHILE variable initializer.
            const auto& loc = it->second;
            executor->setInputFromMemory(stepInputIndex, loc.memory, loc.offset, loc.length);
        } else {
            CHECK(false) << "Cannot map step input " << stepInputIndex << " from operand "
                         << toString(sourceOperandIndex);
        }
    };
    auto mapOutput = [&](uint32_t stepModelOperandIndex, uint32_t stepOutputIndex) {
        SourceOperandIndex sourceOperandIndex(mSourceModelIndex, stepModelOperandIndex);
        // Same cascade as mapInput, except main model inputs and boundary
        // constants can never be step outputs.
        if (auto it = sourceOperandToLocationOfTemporary.find(sourceOperandIndex);
            it != sourceOperandToLocationOfTemporary.end()) {
            const auto& loc = it->second;
            executor->setOutputFromMemory(stepOutputIndex, temporaryMemory, loc.offset,
                                          loc.paddedLength);
        } else if (auto loc = dynamicTemporaries.lookup(sourceOperandIndex); loc != std::nullopt) {
            executor->setOutputFromMemory(stepOutputIndex, loc->memory, loc->offset,
                                          loc->paddedLength, *loc->dimensions);
        } else if (auto it = sourceOperandToOutputIndex.find(sourceOperandIndex);
                   it != sourceOperandToOutputIndex.end()) {
            executor->mapOutput(it->second, stepOutputIndex);
        } else {
            CHECK(false) << "Cannot map step output " << stepOutputIndex << " from operand "
                         << toString(sourceOperandIndex);
        }
    };
    // Step input/output index i corresponds to entry i of mStepModelInputs /
    // mStepModelOutputs (pairs of <source operand index, step operand index>).
    for (uint32_t i = 0, n = mStepModelInputs.size(); i < n; ++i) {
        mapInput(mStepModelInputs[i].first, i);
    }
    for (uint32_t i = 0, n = mStepModelOutputs.size(); i < n; ++i) {
        mapOutput(mStepModelOutputs[i].first, i);
    }
}
654  
findModelOutputsThatAreDownstreamInputs()655  void ExecutionPlan::CompoundBody::findModelOutputsThatAreDownstreamInputs() {
656      auto declareModelOutputIsDownstreamInput =
657              [this](const SourceOperandIndex& sourceOperandIndex) {
658                  const auto it = mOutputToDefiningExecutionStep.find(sourceOperandIndex);
659                  CHECK(it != mOutputToDefiningExecutionStep.end());
660                  uint32_t stepIndex = it->second;
661                  CHECK_LT(stepIndex, mSteps.size());
662                  VLOG(COMPILATION)
663                          << "ExecutionStep(" << stepIndex
664                          << ")->declareModelOutputIsDownstreamInput(mSourceOperandToOutputIndex.at"
665                          << toString(sourceOperandIndex) << ")";
666                  CHECK(mSourceOperandToOutputIndex.find(sourceOperandIndex) !=
667                        mSourceOperandToOutputIndex.end());
668                  mSteps[stepIndex]->executionStep()->declareModelOutputIsDownstreamInput(
669                          mSourceOperandToOutputIndex.at(sourceOperandIndex));
670              };
671      for (const auto& logicalStep : mSteps) {
672          if (const ExecutionStep* step = logicalStep->tryExecutionStep()) {
673              for (const auto& output : step->getOutputsAsStepModelInputs()) {
674                  SourceOperandIndex sourceOperandIndex(step->getSourceModelIndex(), output.first);
675                  declareModelOutputIsDownstreamInput(sourceOperandIndex);
676              }
677          }
678      }
679  }
680  
findTempsAsStepModelOutputs()681  void ExecutionPlan::CompoundBody::findTempsAsStepModelOutputs() {
682      auto recordAsOutputIfTemporary = [this](const SourceOperandIndex& sourceOperandIndex) {
683          const auto it = mTemporaryToDefiningExecutionStep.find(sourceOperandIndex);
684          if (it == mTemporaryToDefiningExecutionStep.end()) {
685              // The operand is not a temporary or is not defined by an
686              // ExecutionStep (i.e. it's an output of an IF or a WHILE).
687              // The latter case is handled by ExecutionPlan::makeController().
688              return;
689          }
690          uint32_t stepIndex = it->second;
691          CHECK_LT(stepIndex, mSteps.size());
692          mSteps[stepIndex]->executionStep()->recordTempAsStepModelOutput(sourceOperandIndex.second);
693      };
694      for (const auto& logicalStep : mSteps) {
695          if (const ExecutionStep* step = logicalStep->tryExecutionStep()) {
696              for (const auto& input : step->getTempsAsStepModelInputs()) {
697                  SourceOperandIndex sourceOperandIndex(step->getSourceModelIndex(), input.first);
698                  recordAsOutputIfTemporary(sourceOperandIndex);
699              }
700          } else if (const IfStep* step = logicalStep->tryIfStep()) {
701              recordAsOutputIfTemporary(step->conditionOperandIndex);
702              for (const SourceOperandIndex& sourceOperandIndex : step->outerInputOperands) {
703                  recordAsOutputIfTemporary(sourceOperandIndex);
704              }
705          } else if (const WhileStep* step = logicalStep->tryWhileStep()) {
706              for (const SourceOperandIndex& sourceOperandIndex : step->outerInputOperands) {
707                  recordAsOutputIfTemporary(sourceOperandIndex);
708              }
709          } else {
710              CHECK(logicalStep->isGoto());
711          }
712      }
713  }
714  
declareModelOutputIsDownstreamInput(uint32_t mainModelOutputIndex)715  void ExecutionStep::declareModelOutputIsDownstreamInput(uint32_t mainModelOutputIndex) {
716      VLOG(COMPILATION) << "ExecutionStep(" << mIndex << ")::declareModelOutputIsDownstreamInput("
717                        << mainModelOutputIndex << ")";
718      const auto it = std::find(mOutputIndexStepModelToMainModel.begin(),
719                                mOutputIndexStepModelToMainModel.end(), mainModelOutputIndex);
720      CHECK(it != mOutputIndexStepModelToMainModel.end());
721      const uint32_t stepModelOutputIndex = it - mOutputIndexStepModelToMainModel.begin();
722      CHECK(stepModelOutputIndex < mModelOutputs.size());
723      mModelOutputsThatAreDownstreamInputs.insert(stepModelOutputIndex);
724  }
725  
recordTempAsStepModelOutput(uint32_t stepOperandIndex)726  void ExecutionStep::recordTempAsStepModelOutput(uint32_t stepOperandIndex) {
727      const auto it = mOperandMap.find(stepOperandIndex);
728      CHECK(it != mOperandMap.end());
729      mTempsAsStepModelOutputs.emplace(stepOperandIndex, it->second);
730  }
731  
getSourceModel() const732  const ModelBuilder* ExecutionStep::getSourceModel() const {
733      return mPlan->getSourceModels().getModel(mSourceModelIndex);
734  }
735  
logStepModel() const736  void ExecutionStep::logStepModel() const {
737      VLOG(COMPILATION) << "ExecutionStep::finishStepModel, step " << mIndex;
738  
739      auto logRemapEntry = [](std::string& toLog, const std::pair<uint32_t, uint32_t>& e) {
740          if (!toLog.empty()) {
741              toLog += ", ";
742          }
743          toLog += toString(e.first);
744          toLog += "->";
745          toLog += toString(e.second);
746      };
747  
748      auto logRemapVector = [&logRemapEntry](const char* name, const RemapVectorType& map) {
749          std::string toLog;
750          for (const auto& e : map) {
751              logRemapEntry(toLog, e);
752          }
753          VLOG(COMPILATION) << name << ": " << toLog;
754      };
755      auto logRemapSet = [&logRemapEntry](const char* name, const StepModelOutputSetType& set) {
756          std::string toLog;
757          for (const auto& e : set) {
758              logRemapEntry(toLog, e);
759          }
760          VLOG(COMPILATION) << name << ": " << toLog;
761      };
762  
763      logRemapVector("step model inputs", mStepModelInputs);
764      logRemapVector("step model outputs", mStepModelOutputs);
765      logRemapVector("model inputs", mModelInputs);
766      logRemapVector("model outputs", mModelOutputs);
767      logRemapVector("temps as step model inputs", mTempsAsStepModelInputs);
768      logRemapSet("temps as step model outputs", mTempsAsStepModelOutputs);
769      logRemapVector("outputs as step model inputs", mOutputsAsStepModelInputs);
770  }
771  
hasUnknownSize(const Operand & operand)772  static bool hasUnknownSize(const Operand& operand) {
773      if (operand.dimensions.empty()) {
774          return TypeManager::get()->isTensorType(operand.type);
775      }
776      for (const Dimension& dimension : operand.dimensions) {
777          if (dimension == 0) {
778              return true;
779          }
780      }
781      return false;
782  }
783  
// Finishes this step's model and compiles it on the step's assigned device.
//
// |mainModel| is the main source model; it supplies the FP16 relaxation flag
// and, for steps partitioned from the main model, the data needed to build
// step<->main input/output index mappings.
// |*hasOutputOfUnknownSize| is set to true if any temporary surfaced as a
// step model output has unknown size (a "dynamic temporary"); it is never
// reset to false here.
// |executionPreference| and |priority| are forwarded to compilation.
// Returns ANEURALNETWORKS_NO_ERROR on success; ANEURALNETWORKS_OP_FAILED for
// a step model with no inputs or no outputs (so full CPU fallback may be
// attempted); otherwise a model-finish or compilation error code.
int ExecutionStep::finishStepModel(const ModelBuilder* mainModel, bool* hasOutputOfUnknownSize,
                                   int32_t executionPreference, int32_t priority) {
    CHECK(mDevice != nullptr);

    // Detect dynamic temporaries among the temporaries this step must output.
    for (const auto& stepModelOutput : mTempsAsStepModelOutputs) {
        const Operand& operand = mStepModel.getOperand(stepModelOutput.second);
        if (hasUnknownSize(operand)) {
            *hasOutputOfUnknownSize = true;
            VLOG(COMPILATION) << "StepModelOutput (operand#" << stepModelOutput.first
                              << " of source graph) has unknown size: " << operand;
        }
    }

    mStepModel.relaxComputationFloat32toFloat16(mainModel->isComputationFloat32RelaxedToFloat16());

    // Assemble the full ordered input/output lists.  Order matters: model
    // inputs/outputs come first, and the index mappings built below rely on
    // that.
    mStepModelInputs.insert(mStepModelInputs.end(), mModelInputs.begin(), mModelInputs.end());
    mStepModelInputs.insert(mStepModelInputs.end(), mTempsAsStepModelInputs.begin(),
                            mTempsAsStepModelInputs.end());
    mStepModelInputs.insert(mStepModelInputs.end(), mOutputsAsStepModelInputs.begin(),
                            mOutputsAsStepModelInputs.end());

    mStepModelOutputs.insert(mStepModelOutputs.end(), mModelOutputs.begin(), mModelOutputs.end());
    mStepModelOutputs.insert(mStepModelOutputs.end(), mTempsAsStepModelOutputs.begin(),
                             mTempsAsStepModelOutputs.end());

    // A step model with no inputs or no outputs is an invalid model. Note that we would like to
    // attempt full CPU fallback if allowed, so we return OP_FAILED here rather than BAD_DATA from
    // model validation.
    if (hasNoInputsOrNoOutputs()) {
        VLOG(COMPILATION) << "ExecutionStep::finishStepModel: finishing step model with no inputs "
                             "or no outputs";
        return ANEURALNETWORKS_OP_FAILED;
    }

    // Only steps partitioned from the main model need step<->main index maps.
    if (mSourceModelIndex == kMainModelInSourceModels) {
        std::map<uint32_t, uint32_t> mainModelOperandToInputIndex;
        for (uint32_t i = 0, n = mainModel->inputCount(); i < n; ++i) {
            mainModelOperandToInputIndex[mainModel->getInputOperandIndex(i)] = i;
        }
        std::map<uint32_t, uint32_t> mainModelOperandToOutputIndex;
        for (uint32_t i = 0, n = mainModel->outputCount(); i < n; ++i) {
            mainModelOperandToOutputIndex[mainModel->getOutputOperandIndex(i)] = i;
        }

        // mInputIndexStepModelToMainModel is ordered by step model input index and relies on
        // mModelInputs being the first inputs, as specified by mStepModelInputs.
        mInputIndexStepModelToMainModel.resize(mModelInputs.size());
        std::transform(mModelInputs.begin(), mModelInputs.end(),
                       mInputIndexStepModelToMainModel.begin(),
                       [&mainModelOperandToInputIndex](auto& e) {
                           uint32_t sourceOperandIndex = e.first;
                           return mainModelOperandToInputIndex[sourceOperandIndex];
                       });

        // mOutputIndexStepModelToMainModel is ordered by step model output index and relies on
        // mModelOutputs being the first outputs, as specified by mStepModelOutputs.
        mOutputIndexStepModelToMainModel.resize(mModelOutputs.size());
        std::transform(mModelOutputs.begin(), mModelOutputs.end(),
                       mOutputIndexStepModelToMainModel.begin(),
                       [&mainModelOperandToOutputIndex](auto& e) {
                           uint32_t sourceOperandIndex = e.first;
                           return mainModelOperandToOutputIndex[sourceOperandIndex];
                       });

        // mOutputsAsStepModelInputsIndexToMainModel is ordered by step model input index and relies
        // on mOutputsAsStepModelInputs being the first outputs.
        mOutputsAsStepModelInputsIndexToMainModel.resize(mOutputsAsStepModelInputs.size());
        std::transform(mOutputsAsStepModelInputs.begin(), mOutputsAsStepModelInputs.end(),
                       mOutputsAsStepModelInputsIndexToMainModel.begin(),
                       [&mainModelOperandToOutputIndex](auto& e) {
                           uint32_t sourceOperandIndex = e.first;
                           return mainModelOperandToOutputIndex[sourceOperandIndex];
                       });
    }

    if (VLOG_IS_ON(COMPILATION)) {
        logStepModel();
    }

    // Register the step model's inputs/outputs (by step model operand index)
    // and finish the model.
    std::vector<uint32_t> inputs(mStepModelInputs.size());
    std::vector<uint32_t> outputs(mStepModelOutputs.size());
    std::transform(mStepModelInputs.begin(), mStepModelInputs.end(), inputs.begin(),
                   [](auto& e) { return e.second; });
    std::transform(mStepModelOutputs.begin(), mStepModelOutputs.end(), outputs.begin(),
                   [](auto& e) { return e.second; });
    NN_RETURN_IF_ERROR(mStepModel.identifyInputsAndOutputs(inputs.size(), inputs.data(),
                                                           outputs.size(), outputs.data()));
    NN_RETURN_IF_ERROR(mStepModel.finish());

    // TODO: Move compilation elsewhere?
    VLOG(COMPILATION) << "ExecutionStep::finishStepModel, compilation on " << mDevice->getName();
    return compile(*mDevice, mStepModel, executionPreference, priority, {}, *mPlan->getCacheInfo(),
                   &mToken, {}, &mPreparedStepModel);
}
878  
dump() const879  void ExecutionStep::dump() const {
880      if (VLOG_IS_ON(COMPILATION)) {
881          VLOG(COMPILATION) << "Step#" << mIndex << ": execute on " << mDevice->getName();
882          logModelToInfo(mStepModel.makeModel());
883      }
884  }
885  
operator <<(std::ostream & os,const IfStep & step)886  std::ostream& operator<<(std::ostream& os, const IfStep& step) {
887      return os << "Step#" << step.index << ": if " << toString(step.conditionOperandIndex)
888                << " then=" << step.thenStepIndex << " else=" << step.elseStepIndex;
889  }
890  
operator <<(std::ostream & os,const WhileStep & step)891  std::ostream& operator<<(std::ostream& os, const WhileStep& step) {
892      return os << "Step#" << step.index << ": while cond=" << step.condStepIndex
893                << " body=" << step.bodyStepIndex << " exit=" << step.exitStepIndex;
894  }
895  
operator <<(std::ostream & os,const GotoStep & step)896  std::ostream& operator<<(std::ostream& os, const GotoStep& step) {
897      return os << "Step#" << step.index << ": goto " << step.gotoStepIndex;
898  }
899  
dump() const900  void LogicalStep::dump() const {
901      if (VLOG_IS_ON(COMPILATION)) {
902          if (const IfStep* step = tryIfStep()) {
903              VLOG(COMPILATION) << *step;
904          } else if (const WhileStep* step = tryWhileStep()) {
905              VLOG(COMPILATION) << *step;
906          } else if (const GotoStep* step = tryGotoStep()) {
907              VLOG(COMPILATION) << *step;
908          } else {
909              executionStep()->dump();
910          }
911      }
912  }
913  
finish(const SourceModels * sourceModels,int32_t executionPreference,int32_t priority,const OptionalTimePoint & deadline,const std::vector<TokenValuePair> & metadata,int simulateFailureResultCode)914  int ExecutionPlan::CompoundBody::finish(const SourceModels* sourceModels,
915                                          int32_t executionPreference, int32_t priority,
916                                          const OptionalTimePoint& deadline,
917                                          const std::vector<TokenValuePair>& metadata,
918                                          int simulateFailureResultCode) {
919      CHECK(!mSuccessfulFinish);
920      CHECK(!deadline.has_value());
921      CHECK(metadata.empty());
922  
923      const ModelBuilder* mainModel = sourceModels->getModel(kMainModelInSourceModels);
924  
925      auto containsUnknownSize = [sourceModels](const std::vector<SourceOperandIndex>& operands) {
926          for (const auto& sourceOperandIndex : operands) {
927              const ModelBuilder* sourceModel = sourceModels->getModel(sourceOperandIndex.first);
928              const Operand& operand = sourceModel->getOperand(sourceOperandIndex.second);
929              if (hasUnknownSize(operand)) {
930                  return true;
931              }
932          }
933          return false;
934      };
935  
936      findTempsAsStepModelOutputs();
937      for (const auto& logicalStep : mSteps) {
938          if (ExecutionStep* step = logicalStep->tryExecutionStep()) {
939              bool stepHasDynamicTemporaries = false;
940              int n = step->finishStepModel(mainModel, &stepHasDynamicTemporaries,
941                                            executionPreference, priority);
942              if (stepHasDynamicTemporaries) {
943                  mHasDynamicTemporaries = true;
944                  if (!isCompliantVersion(kHalVersionV1_2ToApi.canonical,
945                                          step->getDevice()->getFeatureLevel())) {
946                      // Until HAL 1.2, an Operand with lifetime SUBGRAPH_OUTPUT
947                      // must have fully specified dimensions either in the
948                      // Operand or in the RequestArgument.  In the case of a
949                      // dynamic temporary, we won't be able to supply fully
950                      // specified dimensions in either.
951                      VLOG(COMPILATION)
952                              << "ExecutionPlan::CompoundBody::finish -- step#" << step->getIndex()
953                              << " defines dynamic temporaries but is scheduled on pre-1.2 device "
954                              << step->getDevice()->getName();
955                      if (n == ANEURALNETWORKS_NO_ERROR) {
956                          n = ANEURALNETWORKS_OP_FAILED;
957                      }
958                  }
959              }
960              if (n != ANEURALNETWORKS_NO_ERROR) {
961                  VLOG(COMPILATION)
962                          << "ExecutionPlan::CompoundBody::finish -- finishStepModel failed";
963                  return n;
964              }
965          } else if (IfStep* step = logicalStep->tryIfStep()) {
966              // The partitioner does not support dynamic temporaries (b/132458982).
967              CHECK(!containsUnknownSize(step->outerInputOperands));
968              CHECK(!containsUnknownSize(step->outerOutputOperands));
969              // step->conditionOperandIndex has a static shape. See b/158557728.
970              CHECK(!containsUnknownSize(step->thenBranchInputOperands));
971              CHECK(!containsUnknownSize(step->thenBranchOutputOperands));
972              CHECK(!containsUnknownSize(step->elseBranchInputOperands));
973              CHECK(!containsUnknownSize(step->elseBranchOutputOperands));
974          } else if (WhileStep* step = logicalStep->tryWhileStep()) {
975              // The partitioner does not support dynamic temporaries (b/132458982).
976              CHECK(!containsUnknownSize(step->outerInputOperands));
977              CHECK(!containsUnknownSize(step->outerOutputOperands));
978              CHECK(!containsUnknownSize(step->condInputOperands));
979              // step->condOutputOperand has a static shape. See b/158557728.
980              CHECK(!containsUnknownSize(step->bodyInputOperands));
981              CHECK(!containsUnknownSize(step->bodyOutputOperands));
982          } else {
983              CHECK(logicalStep->isGoto());
984          }
985      }
986  
987      if (simulateFailureResultCode != ANEURALNETWORKS_NO_ERROR) {
988          VLOG(COMPILATION) << "ExecutionPlan::CompoundeBody::finish: simulating failure, ResultCode "
989                            << simulateFailureResultCode;
990          return simulateFailureResultCode;
991      }
992  
993      for (uint32_t i = 0, n = mainModel->inputCount(); i < n; ++i) {
994          SourceOperandIndex index(kMainModelInSourceModels, mainModel->getInputOperandIndex(i));
995          mSourceOperandToInputIndex[index] = i;
996      }
997      for (uint32_t i = 0, n = mainModel->outputCount(); i < n; ++i) {
998          SourceOperandIndex index(kMainModelInSourceModels, mainModel->getOutputOperandIndex(i));
999          mSourceOperandToOutputIndex[index] = i;
1000      }
1001  
1002      findControlFlowBoundaryConstants(sourceModels);
1003      findModelOutputsThatAreDownstreamInputs();
1004      findMemoryStepRoles();
1005  
1006      mSuccessfulFinish = true;
1007      LOG(INFO) << "ExecutionPlan::CompoundBody::finish: compilation finished successfully";
1008      return ANEURALNETWORKS_NO_ERROR;
1009  }
1010  
// Collects constant operands that appear at control flow (IF/WHILE)
// partition boundaries -- the IF condition and the outer inputs of IF and
// WHILE steps.  CONSTANT_COPY and POINTER operands are recorded as
// buffer/length pairs in mSourceOperandToBoundaryConstantCopy;
// CONSTANT_REFERENCE operands are recorded as memory/offset/length in
// mSourceOperandToBoundaryConstantReference.  Operands with other lifetimes
// are ignored here.
void ExecutionPlan::CompoundBody::findControlFlowBoundaryConstants(
        const SourceModels* sourceModels) {
    auto handleBoundaryConstants = [this,
                                    sourceModels](const SourceOperandIndex& sourceOperandIndex) {
        const ModelBuilder* sourceModel = sourceModels->getModel(sourceOperandIndex.first);
        const Operand& operand = sourceModel->getOperand(sourceOperandIndex.second);
        const DataLocation& location = operand.location;
        if (operand.lifetime == Operand::LifeTime::CONSTANT_COPY) {
            // Constant stored inline in the model: record a pointer into the
            // model's value storage.
            mSourceOperandToBoundaryConstantCopy[sourceOperandIndex] = {
                    .buffer = sourceModel->getPointerToOperandValue(location.offset),
                    .length = location.length,
            };
        } else if (operand.lifetime == Operand::LifeTime::POINTER) {
            // Constant supplied via a client pointer.
            mSourceOperandToBoundaryConstantCopy[sourceOperandIndex] = {
                    .buffer = static_cast<const uint8_t*>(std::get<const void*>(location.pointer)),
                    .length = location.length,
            };
        } else if (operand.lifetime == Operand::LifeTime::CONSTANT_REFERENCE) {
            // Constant stored in one of the model's memory pools.
            mSourceOperandToBoundaryConstantReference[sourceOperandIndex] = {
                    .memory = sourceModel->getMemories()[location.poolIndex],
                    .offset = location.offset,
                    .length = location.length,
            };
        }
    };
    for (const auto& logicalStep : mSteps) {
        if (const IfStep* step = logicalStep->tryIfStep()) {
            handleBoundaryConstants(step->conditionOperandIndex);
            for (const auto& sourceOperandIndex : step->outerInputOperands) {
                handleBoundaryConstants(sourceOperandIndex);
            }
        } else if (const WhileStep* step = logicalStep->tryWhileStep()) {
            for (const auto& sourceOperandIndex : step->outerInputOperands) {
                handleBoundaryConstants(sourceOperandIndex);
            }
        }
    }
}
1049  
// Computes mSourceOperandToStepRoles: for each source operand used at a step
// boundary, the set of (step, IOType, io index) roles in which a memory
// backing that operand may be used.  For interpreted IF/WHILE, outer memories
// may be passed directly into the branch/condition/body subgraphs, so
// used-by relationships are added to propagate roles across those
// boundaries.
void ExecutionPlan::CompoundBody::findMemoryStepRoles() {
    mSourceOperandToStepRoles = StepRoleAnalyzer::analyze([this](StepRoleAnalyzer& analyzer) {
        for (const auto& logicalStep : mSteps) {
            if (const ExecutionStep* step = logicalStep->tryExecutionStep()) {
                // Each step model input/output is a direct role.
                const auto& stepModelInputs = step->getStepModelInputs();
                for (uint32_t i = 0; i < stepModelInputs.size(); i++) {
                    const auto& [sourceIndex, stepIndex] = stepModelInputs[i];
                    analyzer.addRole(*step, sourceIndex, IOType::INPUT, i);
                }
                const auto& stepModelOutputs = step->getStepModelOutputs();
                for (uint32_t i = 0; i < stepModelOutputs.size(); i++) {
                    const auto& [sourceIndex, stepIndex] = stepModelOutputs[i];
                    analyzer.addRole(*step, sourceIndex, IOType::OUTPUT, i);
                }
            } else if (const IfStep* step = logicalStep->tryIfStep()) {
                // See ExecutionPlan::nextCompound(const IfStep*, ...).
                //
                // For interpreted IF operation, the outer input memories may be directly used by
                // the SUBGRAPH_INPUTs of the then and else model.
                CHECK_EQ(step->thenBranchInputOperands.size(), step->outerInputOperands.size());
                CHECK_EQ(step->elseBranchInputOperands.size(), step->outerInputOperands.size());
                for (uint32_t i = 0; i < step->outerInputOperands.size(); i++) {
                    analyzer.setUsedBy(step->outerInputOperands[i],
                                       step->thenBranchInputOperands[i]);
                    analyzer.setUsedBy(step->outerInputOperands[i],
                                       step->elseBranchInputOperands[i]);
                }
                // For interpreted IF operation, the outer output memories may be directly used by
                // the SUBGRAPH_OUTPUTs of the then and else model.
                CHECK_EQ(step->thenBranchOutputOperands.size(), step->outerOutputOperands.size());
                CHECK_EQ(step->elseBranchOutputOperands.size(), step->outerOutputOperands.size());
                for (uint32_t i = 0; i < step->outerOutputOperands.size(); i++) {
                    analyzer.setUsedBy(step->outerOutputOperands[i],
                                       step->thenBranchOutputOperands[i]);
                    analyzer.setUsedBy(step->outerOutputOperands[i],
                                       step->elseBranchOutputOperands[i]);
                }
            } else if (const WhileStep* step = logicalStep->tryWhileStep()) {
                // See ExecutionPlan::nextCompound(const WhileStep*, ...).
                //
                // For interpreted WHILE operation, the following memories are involved:
                // a. the outer input memories to the WHILE operation
                // b. the outer output memories to the WHILE operation
                // c. the output memory of the condition model
                // d. one set of output memories of the body model
                // e. another set of output memories of the body model
                //
                // The memories are used in the following ways:
                //
                // - Condition model:
                //   * In the first iteration: inputs use (a); output uses (c)
                //   * In the following iterations: inputs use (d) or (e) for input-output and
                //     state-only operands, and (a) for input-only operands; output uses (c)
                //
                // - Body model:
                //   * In all iterations: inputs are the same as the condition model; outputs use
                //                        (d) or (e)
                //
                // Therefore, we configure the analyzer with the following used-by relationships:
                // - The outer input memories (a) may be directly used by the SUBGRAPH_INPUTs of
                //   the condition model for all inputs in the first iteration, as well as the
                //   input-only operands in the following iterations.
                CHECK_EQ(step->condInputOperands.size(), step->outerInputOperands.size());
                for (uint32_t i = 0; i < step->outerInputOperands.size(); i++) {
                    analyzer.setUsedBy(step->outerInputOperands[i], step->condInputOperands[i]);
                }
                // - The output memories of the body model (d) and (e) may be directly used by the
                //   SUBGRAPH_INPUTs of the condition model for input-output and state-only operands
                //   after the first iteration.
                CHECK_GE(step->condInputOperands.size(), step->bodyOutputOperands.size());
                for (uint32_t i = 0; i < step->bodyOutputOperands.size(); i++) {
                    analyzer.setUsedBy(step->bodyOutputOperands[i], step->condInputOperands[i]);
                }
                // - The SUBGRAPH_INPUTs of the condition model are directly used by the
                //   SUBGRAPH_INPUTs of the body model for all inputs in all iterations.
                CHECK_EQ(step->bodyInputOperands.size(), step->condInputOperands.size());
                for (uint32_t i = 0; i < step->bodyInputOperands.size(); i++) {
                    analyzer.setUsedBy(step->condInputOperands[i], step->bodyInputOperands[i]);
                }
            } else if (logicalStep->isGoto()) {
                // Nothing to do.
            } else {
                CHECK(false) << "Unexpected LogicalStep kind";
            }
        }
    });
}
1137  
finish(const SourceModels *,int32_t executionPreference,int32_t priority,const OptionalTimePoint & deadline,const std::vector<TokenValuePair> & metadata,int simulateFailureResultCode)1138  int ExecutionPlan::SimpleBody::finish(const SourceModels*, int32_t executionPreference,
1139                                        int32_t priority, const OptionalTimePoint& deadline,
1140                                        const std::vector<TokenValuePair>& metadata,
1141                                        int simulateFailureResultCode) {
1142      CHECK(!mSuccessfulFinish);
1143      CHECK(mDevice != nullptr);
1144      VLOG(COMPILATION) << "ExecutionPlan::SimpleBody::finish, compilation";
1145      int n = compile(*mDevice, *mModel, executionPreference, priority, deadline, *mCacheInfo,
1146                      &mToken, metadata, &mPreparedModel);
1147      if (n == ANEURALNETWORKS_NO_ERROR && simulateFailureResultCode != ANEURALNETWORKS_NO_ERROR) {
1148          VLOG(COMPILATION) << "ExecutionPlan::SimpleBody::finish: simulating failure, ResultCode "
1149                            << simulateFailureResultCode;
1150          n = simulateFailureResultCode;
1151      }
1152      mSuccessfulFinish = (n == ANEURALNETWORKS_NO_ERROR);
1153      if (mSuccessfulFinish) {
1154          LOG(INFO) << "ExecutionPlan::SimpleBody::finish: compilation finished successfully on "
1155                    << mDevice->getName();
1156      }
1157      return n;
1158  }
1159  
finish(int32_t executionPreference,int32_t priority,const OptionalTimePoint & deadline,const std::vector<TokenValuePair> & metadata,int simulateFailureResultCode)1160  int ExecutionPlan::finish(int32_t executionPreference, int32_t priority,
1161                            const OptionalTimePoint& deadline,
1162                            const std::vector<TokenValuePair>& metadata,
1163                            int simulateFailureResultCode) {
1164      CHECK(mBody != nullptr);
1165      return mBody->finish(&getSourceModels(), executionPreference, priority, deadline, metadata,
1166                           simulateFailureResultCode);
1167  }
1168  
// Constructs the per-execution Controller. Takes ownership (by move) of the
// operand-location maps computed by makeController(), allocates a single
// ashmem region of totalSizeOfTemporaries bytes for the static temporaries,
// and copies boundary CONSTANT_COPY operand values into their assigned
// offsets in that region. On allocation failure, mNextStepIndex is set to
// kBadStepIndex so that next() will fail.
ExecutionPlan::Controller::Controller(
        const ExecutionPlan* plan, ExecutionBuilder* executionBuilder,
        const BurstBuilder* burstBuilder, uint32_t totalSizeOfTemporaries,
        std::map<SourceOperandIndex, StaticTemporaryLocation> sourceOperandToLocationOfTemporary,
        std::map<SourceOperandIndex, StaticTemporaryLocation> sourceOperandToLocationOfTemporary2,
        std::map<SourceOperandIndex, uint32_t> sourceOperandToInputIndex,
        std::map<SourceOperandIndex, uint32_t> sourceOperandToOutputIndex,
        const std::map<SourceOperandIndex, ConstantCopyLocation>& sourceOperandToConstantCopy,
        std::map<SourceOperandIndex, ConstantReferenceLocation> sourceOperandToConstantReference,
        DynamicTemporaries dynamicTemporaries)
    : mPlan(plan),
      mExecutionBuilder(executionBuilder),
      mBurstBuilder(burstBuilder),
      mSourceOperandToLocationOfTemporary(std::move(sourceOperandToLocationOfTemporary)),
      mSourceOperandToLocationOfTemporary2(std::move(sourceOperandToLocationOfTemporary2)),
      mSourceOperandToInputIndex(std::move(sourceOperandToInputIndex)),
      mSourceOperandToOutputIndex(std::move(sourceOperandToOutputIndex)),
      mSourceOperandToConstantReference(std::move(sourceOperandToConstantReference)),
      mDynamicTemporaries(std::move(dynamicTemporaries)),
      mNextStepIndex(0),
      mFallbackNextStepIndex(kBadStepIndex),
      mLastStepSyncFd(-1) {
    if (totalSizeOfTemporaries == 0) {
        // No static temporaries or boundary constants -- nothing to allocate.
        return;
    }
    int n;
    std::tie(n, mTemporaries) = MemoryAshmem::create(totalSizeOfTemporaries);
    if (n != ANEURALNETWORKS_NO_ERROR) {
        LOG(ERROR) << "ExecutionPlan::Controller failed to allocate temporaries";
        mNextStepIndex = kBadStepIndex;
        // BUG FIX: mTemporaries is null on failure; return before the copy loop
        // below, which would otherwise dereference it.
        return;
    }
    // Copy each boundary CONSTANT_COPY operand's value into its assigned
    // location within the shared temporary memory.
    for (const auto& [sourceOperandIndex, location] : sourceOperandToConstantCopy) {
        memcpy(mTemporaries->getPointer() +
                       mSourceOperandToLocationOfTemporary[sourceOperandIndex].offset,
               location.buffer, location.length);
    }
}
1206  
1207  // Attempt to create a burst object for each PreparedModel/Partition. If the
1208  // burst controller object cannot be made, return a nullptr in its place to
1209  // indicate the regular execution path should be used. This can occur either
1210  // because PreparedModel was nullptr (cpu was best choice), or because the
1211  // IPreparedModel was of insufficient version or failed to configure the burst.
makeBursts() const1212  std::vector<SharedBurst> ExecutionPlan::makeBursts() const {
1213      switch (mState) {
1214          // burst object for each partition in the compound case
1215          case COMPOUND: {
1216              std::vector<SharedBurst> bursts;
1217              bursts.reserve(compound()->mSteps.size());
1218              for (const auto& logicalStep : compound()->mSteps) {
1219                  if (!logicalStep->isExecution()) {
1220                      bursts.push_back(nullptr);
1221                      continue;
1222                  }
1223                  if (const auto preparedModel =
1224                              logicalStep->executionStep()->getPreparedStepModel()) {
1225                      const auto maybeBurst = preparedModel->configureExecutionBurst();
1226                      if (!maybeBurst.has_value()) {
1227                          LOG(ERROR) << "preparedModel->configureExecutionBurst() failed with "
1228                                     << maybeBurst.error().code << ": " << maybeBurst.error().message;
1229                      }
1230                      bursts.push_back(maybeBurst.value_or(nullptr));
1231                  } else {
1232                      bursts.push_back(nullptr);
1233                  }
1234              }
1235              return bursts;
1236          }
1237          // single burst object for the simple case
1238          case SIMPLE: {
1239              std::vector<SharedBurst> burst;
1240              auto simpleBody = simple();
1241              if (const auto preparedModel = simpleBody->mPreparedModel) {
1242                  const auto maybeBurst = preparedModel->configureExecutionBurst();
1243                  if (!maybeBurst.has_value()) {
1244                      LOG(ERROR) << "preparedModel->configureExecutionBurst() failed with "
1245                                 << maybeBurst.error().code << ": " << maybeBurst.error().message;
1246                  }
1247                  burst.push_back(maybeBurst.value_or(nullptr));
1248              } else {
1249                  burst.push_back(nullptr);
1250              }
1251              return burst;
1252          }
1253          // no burst objects made
1254          default:
1255              return {};
1256      }
1257  }
1258  
// Builds the Controller for one execution of a COMPOUND plan: computes a
// single memory layout covering all static partition-boundary temporaries,
// control-flow buffers, and boundary CONSTANT_COPY operands; collects dynamic
// temporaries (operands of unknown size); then constructs the Controller,
// whose constructor allocates the ashmem region and performs the constant
// copies. Must not be called on a SIMPLE plan.
std::shared_ptr<ExecutionPlan::Controller> ExecutionPlan::makeController(
        ExecutionBuilder* executionBuilder, const BurstBuilder* burstBuilder) const {
    CHECK(isValid());
    CHECK(mState != SIMPLE);
    const auto* body = compound();
    // Create the layout for a RuntimeMemory object big enough to hold
    // - every partition boundary TEMPORARY operand that is not a dynamic temporary, and
    // - buffers required by the control flow implementation.
    //
    // TODO: Rethink this approach for managing temporaries.  Some
    // alternatives:
    //
    // 1) Adopt a memory layout scheme analogous to stack allocation,
    // where objects of non-overlapping lifetime can occupy the same
    // storage.  We would still have a single Memory object in this
    // case.
    //
    // 2) Do something like what CpuExecutor does, and do allocations
    // and deallocations on the fly (during execution) before first
    // reference and after last reference, respectively.  This would
    // mean having one Memory object per TEMPORARY; or, in a more
    // complicated implementation, one Memory object per set of
    // temporaries that have the same lifetime.  Note that the Android
    // system limits the number of shared memory objects, which are
    // what our Memory objects represent.
    //
    uint32_t totalSizeOfTemporaries = 0;
    // This function has two modes of operation:
    // 1. When lifetime is TEMPORARY_VARIABLE, we allocate memory for
    //    TEMPORARY_VARIABLE source operands that are not dynamic temporaries,
    //    skip TEMPORARY_VARIABLE source operands that are dynamic temporaries,
    //    skip SUBGRAPH_OUTPUT source operands, and panic if we see a source
    //    operand of another lifetime.
    // 2. When lifetime is SUBGRAPH_OUTPUT, we allocate memory for
    //    SUBGRAPH_OUTPUT source operands and panic if we see a source operand
    //    of another lifetime.
    auto mapTemporary = [body, executionBuilder, &totalSizeOfTemporaries](
                                const SourceOperandIndex& sourceOperandIndex,
                                std::map<SourceOperandIndex, StaticTemporaryLocation>*
                                        sourceOperandToLocationOfTemporary,
                                Operand::LifeTime lifetime =
                                        Operand::LifeTime::TEMPORARY_VARIABLE) {
        CHECK(lifetime == Operand::LifeTime::TEMPORARY_VARIABLE ||
              lifetime == Operand::LifeTime::SUBGRAPH_OUTPUT);
        const Operand& sourceOperand = executionBuilder->getSourceOperand(sourceOperandIndex);
        if (lifetime == Operand::LifeTime::TEMPORARY_VARIABLE &&
            sourceOperand.lifetime == Operand::LifeTime::SUBGRAPH_OUTPUT) {
            // See the caller for explanation.
            return;
        }
        CHECK_EQ(sourceOperand.lifetime, lifetime);
        const uint32_t size = TypeManager::get()->getSizeOfData(sourceOperand);
        if (size != 0u) {
            // Known size: reserve a slot in the shared region, honoring the
            // device's alignment/padding preference for this operand.
            const auto memoryPreference =
                    body->getMemoryPreferenceOfSourceOperand(sourceOperandIndex);
            const auto loc = addTemporary(&totalSizeOfTemporaries, size, memoryPreference.alignment,
                                          memoryPreference.padding);
            auto [_, isNew] = sourceOperandToLocationOfTemporary->emplace(sourceOperandIndex, loc);
            CHECK(isNew);
            VLOG(EXECUTION) << "temp: operand " << toString(sourceOperandIndex)
                            << " offset = " << loc.offset << " paddedLength = " << loc.paddedLength;
        } else {
            // Unknown size, hence dynamic temporary.  The mapping will
            // be established elsewhere (DynamicTemporaries::allocate()).
            CHECK_EQ(lifetime, Operand::LifeTime::TEMPORARY_VARIABLE);
            CHECK_EQ(sourceOperand.lifetime, Operand::LifeTime::TEMPORARY_VARIABLE);
        }
    };
    std::map<SourceOperandIndex, StaticTemporaryLocation> sourceOperandToLocationOfTemporary;
    std::map<SourceOperandIndex, StaticTemporaryLocation> sourceOperandToLocationOfTemporary2;
    for (const auto& logicalStep : body->mSteps) {
        if (const ExecutionStep* step = logicalStep->tryExecutionStep()) {
            // Allocate memory for ExecutionStep temporary outputs that are
            // inputs to other steps, as determined by
            // ExecutionPlan::CompoundBody::findTempsAsStepModelOutputs().
            //
            // We don't allocate memory for step model output operands with
            // source operand lifetime SUBGRAPH_OUTPUT because they will be
            // - managed by the client (main model outputs),
            // - assigned a location of another operand (when this step model
            //   output is a branch model output of an IF; see
            //   ExecutionPlan::nextCompound(const IfStep*, ...)), or
            // - allocated by a WHILE (when this step model output
            //   is a condition or body model output of a WHILE; see the
            //   step->bodyOutputOperands and step->condOutputOperand handling
            //   below).
            for (const auto& output : step->getTempsAsStepModelOutputs()) {
                mapTemporary(SourceOperandIndex(step->getSourceModelIndex(), output.first),
                             &sourceOperandToLocationOfTemporary);
            }
        } else if (const IfStep* step = logicalStep->tryIfStep()) {
            // Allocate memory for all temporary outputs of an IfStep because
            // they are going to be written to by a branch model. We don't
            // perform unused output operand optimisation for referenced models.
            //
            // We don't allocate memory for branch output operands because they
            // use the same location as the corresponding outer output operands,
            // as established in ExecutionPlan::nextCompound(const IfStep*, ...)
            //
            // We don't allocate memory for outer output operands with source
            // operand lifetime SUBGRAPH_OUTPUT because they will be
            // - managed by the client (main model outputs),
            // - assigned a location of another operand (when this IF outer
            //   output is a branch model output of another IF; see
            //   ExecutionPlan::nextCompound(const IfStep*, ...)), or
            // - allocated by a WHILE (when this IF outer output
            //   is a condition or body model output of a WHILE; see the
            //   step->bodyOutputOperands and step->condOutputOperand handling
            //   below).
            for (const auto& sourceOperandIndex : step->outerOutputOperands) {
                mapTemporary(sourceOperandIndex, &sourceOperandToLocationOfTemporary);
            }
        } else if (const WhileStep* step = logicalStep->tryWhileStep()) {
            // Allocate memory for all temporary outputs of an WhileStep because
            // they are going to be written to by the WHILE loop.
            //
            // We don't allocate memory for outer output operands with source
            // operand lifetime SUBGRAPH_OUTPUT because they will be
            // - managed by the client (main model outputs),
            // - assigned a location of another operand (when this WHILE outer
            //   output is a branch model output of an IF; see
            //   ExecutionPlan::nextCompound(const IfStep*, ...)), or
            // - allocated by another WHILE (when this WHILE outer output
            //   is a condition or body model output of another WHILE; see the
            //   step->bodyOutputOperands and step->condOutputOperand handling
            //   below).
            for (const auto& sourceOperandIndex : step->outerOutputOperands) {
                mapTemporary(sourceOperandIndex, &sourceOperandToLocationOfTemporary);
            }
            // Allocate memory for body model outputs. Note that we could use
            // the outer output operand memory instead but we currently don't do
            // so (b/148206073).
            for (const auto& sourceOperandIndex : step->bodyOutputOperands) {
                mapTemporary(sourceOperandIndex, &sourceOperandToLocationOfTemporary,
                             Operand::LifeTime::SUBGRAPH_OUTPUT);
                // Allocate another set of temporaries for double buffering.
                mapTemporary(sourceOperandIndex, &sourceOperandToLocationOfTemporary2,
                             Operand::LifeTime::SUBGRAPH_OUTPUT);
            }
            // Allocate memory for condition model output.
            // TODO: Share one condition output memory region between all loops.
            mapTemporary(step->condOutputOperand, &sourceOperandToLocationOfTemporary,
                         Operand::LifeTime::SUBGRAPH_OUTPUT);
        } else {
            CHECK(logicalStep->isGoto());
        }
    }
    // Allocate temporary memory for boundary CONSTANT_COPY operands.
    // (The Controller constructor performs the actual byte copies once the
    // backing memory exists.)
    for (const auto& [sourceOperandIndex, location] : body->mSourceOperandToBoundaryConstantCopy) {
        const auto memoryPreference = body->getMemoryPreferenceOfSourceOperand(sourceOperandIndex);
        const auto loc = addTemporary(&totalSizeOfTemporaries, location.length,
                                      memoryPreference.alignment, memoryPreference.padding);
        sourceOperandToLocationOfTemporary.emplace(sourceOperandIndex, loc);
        VLOG(EXECUTION) << "temp (boundary constant): operand " << toString(sourceOperandIndex)
                        << " offset = " << loc.offset << " paddedLength = " << loc.paddedLength;
    }
    // Collect dynamic temporaries.
    // TODO(b/157236079): Move some or all of this work to compilation time?
    DynamicTemporaries dynamicTemporaries;
    const TypeManager* typeManager = TypeManager::get();
    forEachDynamicTemporary([body, typeManager, &dynamicTemporaries](
                                    SourceOperandIndex sourceOperandIndex,
                                    const Operand& sourceOperand, uint32_t definingStepIndex) {
        CHECK(typeManager->isTensorType(sourceOperand.type));
        const auto memoryPreference = body->getMemoryPreferenceOfSourceOperand(sourceOperandIndex);
        // TODO: For now we guess an initial size equal to element
        // size, which is overly conservative.
        const uint32_t size = typeManager->getSizeOfData(sourceOperand.type, {1});
        dynamicTemporaries.declare(sourceOperandIndex, definingStepIndex, sourceOperand.dimensions,
                                   size, memoryPreference.alignment, memoryPreference.padding);
    });
    dynamicTemporaries.endDeclarations();
    dynamicTemporaries.vlogDump("finished declarations");

    // Hand everything to the Controller, whose constructor allocates the
    // shared temporary memory and copies the boundary constants into it.
    return std::shared_ptr<Controller>(new Controller(
            this, executionBuilder, burstBuilder, totalSizeOfTemporaries,
            std::move(sourceOperandToLocationOfTemporary),
            std::move(sourceOperandToLocationOfTemporary2), body->mSourceOperandToInputIndex,
            body->mSourceOperandToOutputIndex, body->mSourceOperandToBoundaryConstantCopy,
            body->mSourceOperandToBoundaryConstantReference, std::move(dynamicTemporaries)));
}
1440  
1441  // TODO: Find a better way to provide this functionality.
fallback(std::shared_ptr<Controller> controller,std::shared_ptr<StepExecutor> * executor,SharedBurst * burstController,const std::vector<OutputShape> * mainModelOutputShapes) const1442  int ExecutionPlan::fallback(std::shared_ptr<Controller> controller,
1443                              std::shared_ptr<StepExecutor>* executor, SharedBurst* burstController,
1444                              const std::vector<OutputShape>* mainModelOutputShapes) const {
1445      *executor = nullptr;
1446      if (burstController != nullptr) {
1447          *burstController = nullptr;
1448      }
1449  
1450      VLOG(EXECUTION) << "ExecutionPlan::fallback(" << SHOW_IF_DEBUG(controller << ", " << executor)
1451                      << "): mFallbackNextStepIndex = " << controller->mFallbackNextStepIndex;
1452  
1453      if (controller->mFallbackNextStepIndex == Controller::kBadStepIndex) {
1454          // We haven't called next().
1455          return ANEURALNETWORKS_OP_FAILED;
1456      }
1457  
1458      if (controller->mNextStepIndex == Controller::kBadStepIndex) {
1459          // The last call to next() did not produce an executor.
1460          return ANEURALNETWORKS_OP_FAILED;
1461      }
1462  
1463      controller->mNextStepIndex = controller->mFallbackNextStepIndex;
1464      return next(controller, executor, burstController, mainModelOutputShapes);
1465  }
1466  
// Wraps a raw client-provided (pointer, size) region; accesses start at offset 0.
ExecutionPlan::Buffer::Buffer(void* pointer, uint32_t size)
    : mInfo(RunTimePoolInfo::createFromExistingBuffer(static_cast<uint8_t*>(pointer), size)),
      mOffset(0) {}
1470  
// Wraps an existing memory pool; accesses start at the given byte offset.
ExecutionPlan::Buffer::Buffer(RunTimePoolInfo info, uint32_t offset)
    : mInfo(std::move(info)), mOffset(offset) {}
1473  
getPointer() const1474  void* ExecutionPlan::Buffer::getPointer() const {
1475      return mInfo.getBuffer() + mOffset;
1476  }
1477  
getSize() const1478  uint32_t ExecutionPlan::Buffer::getSize() const {
1479      return mInfo.getSize() - mOffset;
1480  }
1481  
// Flushes pending writes to the underlying memory pool.
void ExecutionPlan::Buffer::flush() const {
    mInfo.flush();
}
1485  
getBufferFromModelArgumentInfo(const ModelArgumentInfo & info,const ExecutionBuilder * executionBuilder) const1486  std::optional<ExecutionPlan::Buffer> ExecutionPlan::getBufferFromModelArgumentInfo(
1487          const ModelArgumentInfo& info, const ExecutionBuilder* executionBuilder) const {
1488      switch (info.state()) {
1489          case ModelArgumentInfo::POINTER: {
1490              return Buffer(info.buffer(), info.length());
1491          } break;
1492          case ModelArgumentInfo::MEMORY: {
1493              if (std::optional<RunTimePoolInfo> poolInfo =
1494                          executionBuilder->getRunTimePoolInfo(info.locationAndLength().poolIndex)) {
1495                  return Buffer(*poolInfo, info.locationAndLength().offset);
1496              } else {
1497                  LOG(ERROR) << "Unable to map operand memory pool";
1498                  return std::nullopt;
1499              }
1500          } break;
1501          case ModelArgumentInfo::HAS_NO_VALUE: {
1502              LOG(ERROR) << "Attempting to read an operand that has no value";
1503              return std::nullopt;
1504          } break;
1505          default: {
1506              LOG(ERROR) << "Unexpected operand memory state: " << static_cast<int>(info.state());
1507              return std::nullopt;
1508          } break;
1509      }
1510  }
1511  
getBuffer(std::shared_ptr<Controller> controller,SourceOperandIndex operandIndex) const1512  std::optional<ExecutionPlan::Buffer> ExecutionPlan::getBuffer(
1513          std::shared_ptr<Controller> controller, SourceOperandIndex operandIndex) const {
1514      const auto& sourceOperandToLocationOfTemporary =
1515              controller->mSourceOperandToLocationOfTemporary;
1516      const auto& sourceOperandToInputIndex = controller->mSourceOperandToInputIndex;
1517      const auto& sourceOperandToOutputIndex = controller->mSourceOperandToOutputIndex;
1518      const auto& sourceOperandToConstantReference = controller->mSourceOperandToConstantReference;
1519      if (auto it = sourceOperandToLocationOfTemporary.find(operandIndex);
1520          it != sourceOperandToLocationOfTemporary.end()) {
1521          const uint32_t offset = it->second.offset;
1522          const std::unique_ptr<MemoryAshmem>& memory = controller->mTemporaries;
1523          return Buffer(memory->getPointer() + offset, memory->getSize() - offset);
1524      } else if (auto it = sourceOperandToInputIndex.find(operandIndex);
1525                 it != sourceOperandToInputIndex.end()) {
1526          const ModelArgumentInfo& info = controller->mExecutionBuilder->getInputInfo(it->second);
1527          return getBufferFromModelArgumentInfo(info, controller->mExecutionBuilder);
1528      } else if (auto it = sourceOperandToOutputIndex.find(operandIndex);
1529                 it != sourceOperandToOutputIndex.end()) {
1530          const ModelArgumentInfo& info = controller->mExecutionBuilder->getOutputInfo(it->second);
1531          return getBufferFromModelArgumentInfo(info, controller->mExecutionBuilder);
1532      } else if (auto it = sourceOperandToConstantReference.find(operandIndex);
1533                 it != sourceOperandToConstantReference.end()) {
1534          const ConstantReferenceLocation& location = it->second;
1535          const std::optional<RunTimePoolInfo> info = location.memory->getRunTimePoolInfo();
1536          if (info == std::nullopt) {
1537              return std::nullopt;
1538          }
1539          return Buffer(info->getBuffer() + location.offset, location.length);
1540      }
1541      return std::nullopt;
1542  }
1543  
readConditionValue(std::shared_ptr<Controller> controller,SourceOperandIndex operandIndex,bool * value) const1544  int ExecutionPlan::readConditionValue(std::shared_ptr<Controller> controller,
1545                                        SourceOperandIndex operandIndex, bool* value) const {
1546      std::optional<ExecutionPlan::Buffer> buffer = getBuffer(controller, operandIndex);
1547      if (buffer == std::nullopt) {
1548          LOG(ERROR) << "Unable to read operand " << toString(operandIndex);
1549          return ANEURALNETWORKS_OP_FAILED;
1550      }
1551      CHECK_GE(buffer->getSize(), sizeof(bool8));
1552      bool8 value8 = *static_cast<bool8*>(buffer->getPointer());
1553      *value = static_cast<bool>(value8);
1554      VLOG(EXECUTION) << "readConditionValue: " << *value;
1555      return ANEURALNETWORKS_NO_ERROR;
1556  }
1557  
next(std::shared_ptr<Controller> controller,std::shared_ptr<StepExecutor> * executor,SharedBurst * burstController,const std::vector<OutputShape> * mainModelOutputShapes,int syncFdOfLastStep) const1558  int ExecutionPlan::next(std::shared_ptr<Controller> controller,
1559                          std::shared_ptr<StepExecutor>* executor, SharedBurst* burstController,
1560                          const std::vector<OutputShape>* mainModelOutputShapes,
1561                          int syncFdOfLastStep) const {
1562      CHECK(mState == COMPOUND);
1563  
1564      controller->mLastStepSyncFd = syncFdOfLastStep;
1565      *executor = nullptr;
1566      if (burstController != nullptr) {
1567          *burstController = nullptr;
1568      }
1569  
1570      VLOG(EXECUTION) << "ExecutionPlan::next(" << SHOW_IF_DEBUG(controller << ", " << executor)
1571                      << "): mNextStepIndex = " << controller->mNextStepIndex;
1572  
1573      if (controller->mNextStepIndex == Controller::kBadStepIndex) {
1574          return ANEURALNETWORKS_OP_FAILED;
1575      }
1576  
1577      return nextCompound(controller, executor, burstController, mainModelOutputShapes);
1578  }
1579  
nextCompound(std::shared_ptr<Controller> controller,std::shared_ptr<StepExecutor> * executor,SharedBurst * burstController,const std::vector<OutputShape> * mainModelOutputShapes) const1580  int ExecutionPlan::nextCompound(std::shared_ptr<Controller> controller,
1581                                  std::shared_ptr<StepExecutor>* executor,
1582                                  SharedBurst* burstController,
1583                                  const std::vector<OutputShape>* mainModelOutputShapes) const {
1584      if (controller->mNextStepIndex == Controller::kBadStepIndex) {
1585          return ANEURALNETWORKS_OP_FAILED;
1586      }
1587  
1588      auto compoundBody = compound();
1589      if (controller->mNextStepIndex == compoundBody->mSteps.size()) {
1590          controller->mNextStepIndex = Controller::kBadStepIndex;  // end
1591          return ANEURALNETWORKS_NO_ERROR;
1592      }
1593  
1594      const auto& logicalStep = compoundBody->mSteps[controller->mNextStepIndex];
1595      if (const IfStep* step = logicalStep->tryIfStep()) {
1596          return nextCompound(step, controller, executor, burstController, mainModelOutputShapes);
1597      } else if (const WhileStep* step = logicalStep->tryWhileStep()) {
1598          return nextCompound(step, controller, executor, burstController, mainModelOutputShapes);
1599      } else if (const GotoStep* step = logicalStep->tryGotoStep()) {
1600          return nextCompound(step, controller, executor, burstController, mainModelOutputShapes);
1601      } else if (const ExecutionStep* step = logicalStep->tryExecutionStep()) {
1602          return nextCompound(step, controller, executor, burstController, mainModelOutputShapes);
1603      } else {
1604          CHECK(false) << "Unknown step variant";
1605          return ANEURALNETWORKS_BAD_STATE;
1606      }
1607  }
1608  
// Prepares execution of a single ExecutionStep: allocates the dynamic
// temporaries this step defines, builds a StepExecutor bound to the step's
// device and prepared model, binds its inputs/outputs, and advances
// mNextStepIndex (recording the old value in mFallbackNextStepIndex so that
// fallback() can retry this same step).
int ExecutionPlan::nextCompound(const ExecutionStep* step, std::shared_ptr<Controller> controller,
                                std::shared_ptr<StepExecutor>* executor,
                                SharedBurst* burstController,
                                const std::vector<OutputShape>* mainModelOutputShapes) const {
    VLOG(EXECUTION) << "next: Step#" << controller->mNextStepIndex << ": execute on "
                    << step->getDevice()->getName();

    // Dynamic temporaries must be allocated before they can be mapped below.
    NN_RETURN_IF_ERROR(controller->mDynamicTemporaries.allocate(step->getIndex()));
    controller->mDynamicTemporaries.vlogDump("finished allocating for a step");

    *executor = std::make_shared<StepExecutor>(controller->mExecutionBuilder, step->getStepModel(),
                                               step->getDevice(), step->getPreparedStepModel(),
                                               /*reusable=*/false, step,
                                               &controller->mDynamicTemporaries);

    // Bind step-model inputs/outputs to their backing locations (static
    // temporaries, dynamic temporaries, main-model inputs/outputs, or
    // constant references).
    step->mapInputsAndOutputs(
            *executor, mainModelOutputShapes, controller->mTemporaries.get(),
            controller->mSourceOperandToLocationOfTemporary, controller->mDynamicTemporaries,
            controller->mSourceOperandToInputIndex, controller->mSourceOperandToOutputIndex,
            controller->mSourceOperandToConstantReference);
    if (burstController != nullptr && controller->mBurstBuilder != nullptr) {
        // Hand out the burst configured for this partition, if any.
        *burstController = controller->mBurstBuilder->getControllerAt(controller->mNextStepIndex);
    }

    // Record the fallback point before advancing, so fallback() can rewind here.
    controller->mFallbackNextStepIndex = controller->mNextStepIndex;
    controller->mNextStepIndex++;
    return ANEURALNETWORKS_NO_ERROR;
}
1637  
// The first argument is the "source" operand, the second operand is the "destination".
// Redirects the inner model's input operand to resolve to whatever location
// the outer operand currently resolves to. Exactly one of the four location
// maps is expected to hold the outer operand; its entry is copied for the
// inner operand after any stale entries for the inner operand are removed.
void ExecutionPlan::Controller::setInput(const SourceOperandIndex& outerOperand,
                                         const SourceOperandIndex& innerOperand) {
    VLOG(EXECUTION) << "mapping input " << toString(innerOperand) << " from "
                    << toString(outerOperand);
#ifdef NN_DEBUGGABLE
    // An operand must be tracked by at most one of the location maps.
    CHECK_LE(mSourceOperandToLocationOfTemporary.count(innerOperand) +
                     mSourceOperandToInputIndex.count(innerOperand) +
                     mSourceOperandToOutputIndex.count(innerOperand) +
                     mSourceOperandToConstantReference.count(innerOperand),
             1u);
#endif
    // Drop any previous mapping of the inner operand before installing the
    // new one (the mapping may change between loop iterations).
    mSourceOperandToLocationOfTemporary.erase(innerOperand);
    mSourceOperandToInputIndex.erase(innerOperand);
    mSourceOperandToOutputIndex.erase(innerOperand);
    mSourceOperandToConstantReference.erase(innerOperand);
    // Copy the outer operand's entry from whichever map holds it.
    if (auto it = mSourceOperandToLocationOfTemporary.find(outerOperand);
        it != mSourceOperandToLocationOfTemporary.end()) {
        mSourceOperandToLocationOfTemporary.emplace(innerOperand, it->second);
    } else if (auto it = mSourceOperandToInputIndex.find(outerOperand);
               it != mSourceOperandToInputIndex.end()) {
        mSourceOperandToInputIndex.emplace(innerOperand, it->second);
    } else if (auto it = mSourceOperandToOutputIndex.find(outerOperand);
               it != mSourceOperandToOutputIndex.end()) {
        mSourceOperandToOutputIndex.emplace(innerOperand, it->second);
    } else if (auto it = mSourceOperandToConstantReference.find(outerOperand);
               it != mSourceOperandToConstantReference.end()) {
        mSourceOperandToConstantReference.emplace(innerOperand, it->second);
    } else {
        CHECK(false) << "Cannot set step model input operand " << toString(innerOperand)
                     << " from operand " << toString(outerOperand);
    }
}
1671  
// The first argument is the "source" operand, the second operand is the "destination".
// Output counterpart of setInput(): redirects the inner model's output
// operand to the outer operand's current location. Only the temporary and
// main-model-output maps can legitimately hold an output operand.
void ExecutionPlan::Controller::setOutput(const SourceOperandIndex& outerOperand,
                                          const SourceOperandIndex& innerOperand) {
    VLOG(EXECUTION) << "mapping output " << toString(innerOperand) << " from "
                    << toString(outerOperand);
#ifdef NN_DEBUGGABLE
    // An output operand must be tracked by at most one of the two maps.
    CHECK_LE(mSourceOperandToLocationOfTemporary.count(innerOperand) +
                     mSourceOperandToOutputIndex.count(innerOperand),
             1u);
#endif
    // Drop any previous mapping of the inner operand before installing the
    // new one.
    mSourceOperandToLocationOfTemporary.erase(innerOperand);
    mSourceOperandToOutputIndex.erase(innerOperand);
    if (auto it = mSourceOperandToLocationOfTemporary.find(outerOperand);
        it != mSourceOperandToLocationOfTemporary.end()) {
        mSourceOperandToLocationOfTemporary.emplace(innerOperand, it->second);
    } else if (auto it = mSourceOperandToOutputIndex.find(outerOperand);
               it != mSourceOperandToOutputIndex.end()) {
        mSourceOperandToOutputIndex.emplace(innerOperand, it->second);
    } else {
        CHECK(false) << "Cannot set step model output operand " << toString(innerOperand)
                     << " from operand " << toString(outerOperand);
    }
}
1695  
waitForLastStepSyncFence() const1696  int ExecutionPlan::Controller::waitForLastStepSyncFence() const {
1697      if (mLastStepSyncFd == -1) {
1698          return ANEURALNETWORKS_NO_ERROR;
1699      }
1700      VLOG(EXECUTION) << "wait for mLastStepSyncFd " << mLastStepSyncFd;
1701      auto r = syncWait(mLastStepSyncFd, -1);
1702      int n = ANEURALNETWORKS_NO_ERROR;
1703      if (r != FenceState::SIGNALED) {
1704          LOG(ERROR) << "syncWait failed, fd: " << mLastStepSyncFd;
1705          n = ANEURALNETWORKS_OP_FAILED;
1706      }
1707      return n;
1708  }
1709  
// Invocations of Controller::setInput/setOutput in this function must match with invocations of
// StepRoleAnalyzer::setUsedBy in the IfStep branch in
// ExecutionPlan::CompoundBody::findMemoryStepRoles.
//
// Evaluates an IF construct: reads the boolean condition operand, selects the
// then- or else-branch step, remaps the chosen branch model's inputs/outputs
// to the IF's outer operands, and dispatches the selected step.
int ExecutionPlan::nextCompound(const IfStep* step, std::shared_ptr<Controller> controller,
                                std::shared_ptr<StepExecutor>* executor,
                                SharedBurst* burstController,
                                const std::vector<OutputShape>* mainModelOutputShapes) const {
    VLOG(EXECUTION) << "next: " << *step;
    // If the last step has a sync fence, wait for it to signal before reading the condition value.
    // This is safe because the steps are serialized when doing fenced compute.
    NN_RETURN_IF_ERROR(controller->waitForLastStepSyncFence());
    bool condValue;
    NN_RETURN_IF_ERROR(readConditionValue(controller, step->conditionOperandIndex, &condValue));
    // Jump to the branch selected by the condition, and pick that branch's
    // operand lists for the remapping below.
    controller->mNextStepIndex = condValue ? step->thenStepIndex : step->elseStepIndex;
    const std::vector<SourceOperandIndex>& branchInputOperands =
            condValue ? step->thenBranchInputOperands : step->elseBranchInputOperands;
    const std::vector<SourceOperandIndex>& branchOutputOperands =
            condValue ? step->thenBranchOutputOperands : step->elseBranchOutputOperands;
    CHECK_EQ(branchInputOperands.size(), step->outerInputOperands.size());
    CHECK_EQ(branchOutputOperands.size(), step->outerOutputOperands.size());
    for (uint32_t i = 0, n = step->outerInputOperands.size(); i < n; ++i) {
        // We have to do this assignment just before executing this step to
        // accommodate cases when the IF resides within a WHILE condition or
        // body model and for some j the i-th input of the IF branch model is
        // - an input of the WHILE condition model (whileStep->condInputOperands[j]),
        // - an input of the WHILE body model (whileStep->bodyInputOperands[j]), or
        // - an output of the WHILE body model (whileStep->bodyOutputOperands[j]).
        // In such cases, the WhileStep modifies the location of
        // step->outerInputOperands[i] to implement double buffering.
        controller->setInput(step->outerInputOperands[i], branchInputOperands[i]);
    }
    for (uint32_t i = 0, n = step->outerOutputOperands.size(); i < n; ++i) {
        // We have to do this assignment just before executing this step to
        // accommodate the case when the IF resides within a WHILE body
        // model and the i-th output of the IF branch model is an
        // output of the WHILE body model (whileStep->bodyOutputOperands[j] for
        // some j). In that case, the WhileStep modifies the location of
        // step->outerOutputOperands[i] to implement double buffering.
        controller->setOutput(step->outerOutputOperands[i], branchOutputOperands[i]);
    }
    // Dispatch the selected branch step.
    return nextCompound(controller, executor, burstController, mainModelOutputShapes);
}
1752  
// Invocations of Controller::setInput in this function must match with invocations of
// StepRoleAnalyzer::setUsedBy in the WhileStep branch in
// ExecutionPlan::CompoundBody::findMemoryStepRoles.
//
// Drives one transition of the WHILE loop state machine kept in
// Controller::mWhileState. Each call either:
//  - EVALUATE_CONDITION stage: wires the condition model's inputs (outer
//    inputs on the first iteration, body outputs thereafter) and dispatches
//    the condition step; or
//  - EVALUATE_BODY stage: enforces the loop timeout, reads the condition
//    result, then either dispatches the body step (swapping temporary
//    locations to double-buffer body outputs) or exits the loop (copying the
//    final values to the outer output operands).
int ExecutionPlan::nextCompound(const WhileStep* step, std::shared_ptr<Controller> controller,
                                std::shared_ptr<StepExecutor>* executor,
                                SharedBurst* burstController,
                                const std::vector<OutputShape>* mainModelOutputShapes) const {
    WhileState& state = controller->mWhileState[controller->mNextStepIndex];
    if (state.stage == WhileState::EVALUATE_CONDITION) {
        state.iteration = state.iteration == WhileState::kOutsideLoop ? 0 : state.iteration + 1;
        VLOG(EXECUTION) << "next: " << *step << ": iteration " << state.iteration
                        << ": evaluating condition";
        controller->mNextStepIndex = step->condStepIndex;

        // The timeout clock starts on the first iteration of the loop.
        if (state.iteration == 0) {
            state.startTime = Clock::now();
        }

        // iteration = 0   cond inputs = outer inputs
        // iteration = 1   cond inputs = body outputs
        // iteration = 2   cond inputs = body outputs
        // iteration = 3   cond inputs = ...
        uint32_t loopBodyOutputCount = step->bodyOutputOperands.size();
        CHECK_EQ(step->condInputOperands.size(), step->outerInputOperands.size());
        CHECK_GE(step->condInputOperands.size(), loopBodyOutputCount);
        for (uint32_t i = 0, n = step->condInputOperands.size(); i < n; ++i) {
            // Inputs beyond the body-output count are never produced by the
            // body, so they always come from the outer inputs.
            bool operandIsInputOnly = i >= loopBodyOutputCount;
            controller->setInput((state.iteration == 0 || operandIsInputOnly)
                                         ? step->outerInputOperands[i]
                                         : step->bodyOutputOperands[i],
                                 step->condInputOperands[i]);
        }

        state.stage = WhileState::EVALUATE_BODY;
        return nextCompound(controller, executor, burstController, mainModelOutputShapes);
    }

    CHECK(state.stage == WhileState::EVALUATE_BODY);
    // Abort if the loop has been running longer than the configured timeout.
    std::chrono::nanoseconds timeoutDuration(
            controller->mExecutionBuilder->getLoopTimeoutDuration());
    auto duration = Clock::now() - state.startTime;
    if (duration > timeoutDuration) {
        LOG(ERROR) << "WHILE loop timed out after "
                   << std::chrono::duration_cast<std::chrono::milliseconds>(duration).count()
                   << " ms";
        return ANEURALNETWORKS_MISSED_DEADLINE_TRANSIENT;
    }

    // If the last step has a sync fence, wait for it to signal before reading the condition value.
    // This is safe because the steps are serialized when doing fenced compute.
    NN_RETURN_IF_ERROR(controller->waitForLastStepSyncFence());
    bool condValue;
    NN_RETURN_IF_ERROR(readConditionValue(controller, step->condOutputOperand, &condValue));
    if (condValue) {
        VLOG(EXECUTION) << "next: " << *step << ": iteration " << state.iteration
                        << ": evaluating body";
        controller->mNextStepIndex = step->bodyStepIndex;

        // iteration = 0   body inputs = cond inputs = outer inputs   body outputs = tmp1
        // iteration = 1   body inputs = cond inputs = tmp1           body outputs = tmp2
        // iteration = 2   body inputs = cond inputs = tmp2           body outputs = tmp1
        // iteration = 3   body inputs = cond inputs = ...            body outputs = ...
#ifdef NN_DEBUGGABLE
        CHECK_GE(step->bodyInputOperands.size(), step->bodyOutputOperands.size());
        CHECK_EQ(step->bodyInputOperands.size(), step->outerInputOperands.size());
        CHECK_EQ(step->bodyInputOperands.size(), step->condInputOperands.size());
        CHECK_GE(step->bodyOutputOperands.size(), step->outerOutputOperands.size());
#endif
        // The body reads from wherever the condition inputs currently live.
        for (uint32_t i = 0, n = step->bodyInputOperands.size(); i < n; ++i) {
            controller->setInput(step->condInputOperands[i], step->bodyInputOperands[i]);
        }
        // After the first iteration, swap the two temporary buffers backing
        // each body output so the body never overwrites the values it is
        // about to read (double buffering).
        if (state.iteration != 0) {
            for (const SourceOperandIndex& outputOperand : step->bodyOutputOperands) {
#ifdef NN_DEBUGGABLE
                CHECK_EQ(controller->mSourceOperandToInputIndex.count(outputOperand), 0u);
                CHECK_EQ(controller->mSourceOperandToOutputIndex.count(outputOperand), 0u);
                CHECK_EQ(controller->mSourceOperandToLocationOfTemporary.count(outputOperand), 1u);
                CHECK_EQ(controller->mSourceOperandToLocationOfTemporary2.count(outputOperand), 1u);
#endif
                std::swap(controller->mSourceOperandToLocationOfTemporary[outputOperand],
                          controller->mSourceOperandToLocationOfTemporary2[outputOperand]);
            }
        }
    } else {
        VLOG(EXECUTION) << "next: " << *step << ": iteration " << state.iteration
                        << ": exiting loop";
        controller->mNextStepIndex = step->exitStepIndex;

        // Copy body outputs to outer outputs.
        // TODO: Use outer outputs instead of tmp2 to avoid copying?
        CHECK_LE(step->outerOutputOperands.size(), step->bodyOutputOperands.size());
        for (uint32_t i = 0, n = step->outerOutputOperands.size(); i < n; ++i) {
            // condInputOperands[i] points to a body output operand from the
            // last iteration if we've executed at least one iteration and to a
            // WHILE operation input operand otherwise.
            const SourceOperandIndex& innerOperand = step->condInputOperands[i];
            const SourceOperandIndex& outerOperand = step->outerOutputOperands[i];
            std::optional<Buffer> outerBuffer = getBuffer(controller, outerOperand);
            if (outerBuffer == std::nullopt) {
                // This should never happen.
                LOG(ERROR) << "Unable to get outerBuffer for operand " << toString(outerOperand);
                return ANEURALNETWORKS_OP_FAILED;
            }
            const Operand& sourceOperand =
                    controller->mExecutionBuilder->getSourceOperand(outerOperand);
            const uint32_t size = TypeManager::get()->getSizeOfData(sourceOperand);
            CHECK_NE(size, 0u);
            std::optional<Buffer> innerBuffer = getBuffer(controller, innerOperand);
            if (innerBuffer == std::nullopt) {
                // This should never happen.
                LOG(ERROR) << "Unable to get innerBuffer for operand " << toString(innerOperand);
                return ANEURALNETWORKS_OP_FAILED;
            }
            CHECK_LE(size, innerBuffer->getSize());
            CHECK_LE(size, outerBuffer->getSize());
            memcpy(outerBuffer->getPointer(), innerBuffer->getPointer(), size);
            outerBuffer->flush();
        }
        // Mark the loop as no longer running so a re-entry restarts it.
        state.iteration = WhileState::kOutsideLoop;
    }

    state.stage = WhileState::EVALUATE_CONDITION;
    return nextCompound(controller, executor, burstController, mainModelOutputShapes);
}
1877  
// Handles an unconditional jump in the step graph: redirects mNextStepIndex
// to the goto target and dispatches from there. No operand remapping is
// needed for a GotoStep.
int ExecutionPlan::nextCompound(const GotoStep* step, std::shared_ptr<Controller> controller,
                                std::shared_ptr<StepExecutor>* executor,
                                SharedBurst* burstController,
                                const std::vector<OutputShape>* mainModelOutputShapes) const {
    VLOG(EXECUTION) << "next: " << *step;
    controller->mNextStepIndex = step->gotoStepIndex;
    return nextCompound(controller, executor, burstController, mainModelOutputShapes);
}
1886  
makeStepExecutor(bool reusable,ExecutionBuilder * executionBuilder) const1887  std::shared_ptr<StepExecutor> ExecutionPlan::makeStepExecutor(
1888          bool reusable, ExecutionBuilder* executionBuilder) const {
1889      auto simpleBody = simple();
1890      auto executor = std::make_shared<StepExecutor>(executionBuilder, simpleBody->mModel,
1891                                                     simpleBody->mDevice, simpleBody->mPreparedModel,
1892                                                     reusable);
1893      executor->mapInputsAndOutputsTrivially();
1894      return executor;
1895  }
1896  
becomeCompoundIfEmpty()1897  void ExecutionPlan::becomeCompoundIfEmpty() {
1898      CHECK(mState != SIMPLE);
1899      if (mState == EMPTY) {
1900          mBody = new CompoundBody(this);
1901          mState = COMPOUND;
1902      }
1903  }
1904  
createNewExecutionStep(uint32_t sourceModelIndex,const std::shared_ptr<Device> device)1905  ExecutionStep* ExecutionPlan::createNewExecutionStep(uint32_t sourceModelIndex,
1906                                                       const std::shared_ptr<Device> device) {
1907      becomeCompoundIfEmpty();
1908      auto step = std::make_shared<LogicalStep>(std::in_place_type<ExecutionStep>, this,
1909                                                compound()->mSteps.size(), sourceModelIndex, device);
1910      compound()->mSteps.push_back(step);
1911      return step->executionStep();
1912  }
1913  
createNewIfStep()1914  IfStep* ExecutionPlan::createNewIfStep() {
1915      becomeCompoundIfEmpty();
1916      auto step = std::make_shared<LogicalStep>(std::in_place_type<IfStep>);
1917      step->ifStep()->index = compound()->mSteps.size();
1918      compound()->mSteps.push_back(step);
1919      return step->ifStep();
1920  }
1921  
createNewWhileStep()1922  WhileStep* ExecutionPlan::createNewWhileStep() {
1923      becomeCompoundIfEmpty();
1924      auto step = std::make_shared<LogicalStep>(std::in_place_type<WhileStep>);
1925      step->whileStep()->index = compound()->mSteps.size();
1926      compound()->mSteps.push_back(step);
1927      return step->whileStep();
1928  }
1929  
createNewGotoStep()1930  GotoStep* ExecutionPlan::createNewGotoStep() {
1931      becomeCompoundIfEmpty();
1932      auto step = std::make_shared<LogicalStep>(std::in_place_type<GotoStep>);
1933      step->gotoStep()->index = compound()->mSteps.size();
1934      compound()->mSteps.push_back(step);
1935      return step->gotoStep();
1936  }
1937  
becomeSingleStep(const std::shared_ptr<Device> device,const ModelBuilder * model)1938  void ExecutionPlan::becomeSingleStep(const std::shared_ptr<Device> device,
1939                                       const ModelBuilder* model) {
1940      CHECK(mState == EMPTY);
1941      mBody = new SimpleBody(device, model, mCacheInfo, mToken);
1942      mState = SIMPLE;
1943  }
1944  
recordOutputDef(SourceOperandIndex sourceOperandIndex,uint32_t stepIndex)1945  void ExecutionPlan::recordOutputDef(SourceOperandIndex sourceOperandIndex, uint32_t stepIndex) {
1946      auto [it, isNew] =
1947              compound()->mOutputToDefiningExecutionStep.emplace(sourceOperandIndex, stepIndex);
1948      CHECK(isNew) << "Step " << stepIndex << " redefines output operand "
1949                   << toString(sourceOperandIndex) << " already defined by step " << it->second;
1950  }
1951  
recordTemporaryDef(SourceOperandIndex sourceOperandIndex,uint32_t stepIndex)1952  void ExecutionPlan::recordTemporaryDef(SourceOperandIndex sourceOperandIndex, uint32_t stepIndex) {
1953      auto [it, isNew] =
1954              compound()->mTemporaryToDefiningExecutionStep.emplace(sourceOperandIndex, stepIndex);
1955      CHECK(isNew) << "Step " << stepIndex << " redefines temporary operand "
1956                   << toString(sourceOperandIndex) << " already defined by step " << it->second;
1957  }
1958  
dump() const1959  void ExecutionPlan::dump() const {
1960      if (mBody) {
1961          mBody->dump();
1962      } else {
1963          VLOG(COMPILATION) << "EMPTY";
1964      }
1965  }
1966  
reset()1967  void ExecutionPlan::reset() {
1968      if (mBody) {
1969          delete mBody;
1970          mBody = nullptr;
1971      }
1972      mState = EMPTY;
1973  }
1974  
isSimpleCpu() const1975  bool ExecutionPlan::isSimpleCpu() const {
1976      return isSimple() && simple()->mDevice == DeviceManager::getCpuDevice();
1977  }
1978  
forTest_getKind() const1979  ExecutionPlan::Kind ExecutionPlan::forTest_getKind() const {
1980      switch (mState) {
1981          case EMPTY:
1982              return Kind::EMPTY;
1983          case SIMPLE:
1984              CHECK(mBody);
1985              return mBody->mSuccessfulFinish ? Kind::SIMPLE : Kind::ERROR;
1986          case COMPOUND:
1987              CHECK(mBody);
1988              return mBody->mSuccessfulFinish ? Kind::COMPOUND : Kind::ERROR;
1989          default:
1990              LOG(FATAL) << "unexpected state";
1991              return Kind::ERROR;
1992      }
1993  }
1994  
forTest_simpleGetDevice() const1995  std::shared_ptr<const Device> ExecutionPlan::forTest_simpleGetDevice() const {
1996      return simple()->mDevice;
1997  }
1998  
forTest_compoundGetSteps() const1999  const std::vector<std::shared_ptr<LogicalStep>>& ExecutionPlan::forTest_compoundGetSteps() const {
2000      return compound()->mSteps;
2001  }
2002  
forTest_flatGetDynamicTemporaries() const2003  std::set<uint32_t> ExecutionPlan::forTest_flatGetDynamicTemporaries() const {
2004      CHECK_EQ(getSourceModels().size(), size_t(1));
2005      std::set<uint32_t> ret;
2006      forEachDynamicTemporary([&ret](SourceOperandIndex dynTemp, const Operand&, uint32_t) {
2007          ret.insert(dynTemp.second);
2008      });
2009      return ret;
2010  }
2011  
hasDynamicTemporaries() const2012  bool ExecutionPlan::hasDynamicTemporaries() const {
2013      return mBody == nullptr ? false : mBody->hasDynamicTemporaries();
2014  }
2015  
forTest_hasStepModelWithNoInputsOrNoOutputs() const2016  bool ExecutionPlan::forTest_hasStepModelWithNoInputsOrNoOutputs() const {
2017      return mBody == nullptr ? false : mBody->hasStepModelWithNoInputsOrNoOutputs();
2018  }
2019  
hasStepModelWithNoInputsOrNoOutputs() const2020  bool ExecutionPlan::CompoundBody::hasStepModelWithNoInputsOrNoOutputs() const {
2021      return std::any_of(mSteps.begin(), mSteps.end(), [](const auto& logicalStep) {
2022          const ExecutionStep* step = logicalStep->tryExecutionStep();
2023          return step != nullptr && step->hasNoInputsOrNoOutputs();
2024      });
2025  }
2026  
forTest_simpleGetCacheToken() const2027  const uint8_t* ExecutionPlan::forTest_simpleGetCacheToken() const {
2028      return simple()->mToken.getCacheToken();
2029  }
2030  
// Logs a one-line description of a single-step plan: the target device name.
void ExecutionPlan::SimpleBody::dump() const {
    VLOG(COMPILATION) << "SIMPLE for " << mDevice->getName();
}
2034  
dump() const2035  void ExecutionPlan::CompoundBody::dump() const {
2036      for (const auto& step : mSteps) {
2037          step->dump();
2038      }
2039  }
2040  
getInputSourceOperand(uint32_t index) const2041  SourceOperandIndex ExecutionPlan::getInputSourceOperand(uint32_t index) const {
2042      const auto* mainModel = getSourceModels().getModel(kMainModelInSourceModels);
2043      CHECK_LT(index, mainModel->inputCount());
2044      const auto operandIndex = mainModel->getInputOperandIndex(index);
2045      return {kMainModelInSourceModels, operandIndex};
2046  }
2047  
getOutputSourceOperand(uint32_t index) const2048  SourceOperandIndex ExecutionPlan::getOutputSourceOperand(uint32_t index) const {
2049      const auto* mainModel = getSourceModels().getModel(kMainModelInSourceModels);
2050      CHECK_LT(index, mainModel->outputCount());
2051      const auto operandIndex = mainModel->getOutputOperandIndex(index);
2052      return {kMainModelInSourceModels, operandIndex};
2053  }
2054  
// In a single-step plan, a main-model input maps directly to the same input
// index of the one prepared model.
void ExecutionPlan::SimpleBody::forEachStepRoleOfInput(uint32_t index,
                                                       const StepRoleCallback& callback) const {
    callback(mPreparedModel.get(), IOType::INPUT, index);
}
2059  
// In a single-step plan, a main-model output maps directly to the same output
// index of the one prepared model.
void ExecutionPlan::SimpleBody::forEachStepRoleOfOutput(uint32_t index,
                                                        const StepRoleCallback& callback) const {
    callback(mPreparedModel.get(), IOType::OUTPUT, index);
}
2064  
2065  // Map an input role of the main model to the input/output roles in the step models.
forEachStepRoleOfInput(uint32_t index,const StepRoleCallback & callback) const2066  void ExecutionPlan::CompoundBody::forEachStepRoleOfInput(uint32_t index,
2067                                                           const StepRoleCallback& callback) const {
2068      const auto sourceOperandIndex = mPlan->getInputSourceOperand(index);
2069      forEachStepRoleOfSourceOperand(sourceOperandIndex, callback);
2070  }
2071  
2072  // Map an output role of the main model to the input/output roles in the step models.
forEachStepRoleOfOutput(uint32_t index,const StepRoleCallback & callback) const2073  void ExecutionPlan::CompoundBody::forEachStepRoleOfOutput(uint32_t index,
2074                                                            const StepRoleCallback& callback) const {
2075      const auto sourceOperandIndex = mPlan->getOutputSourceOperand(index);
2076      forEachStepRoleOfSourceOperand(sourceOperandIndex, callback);
2077  }
2078  
forEachStepRoleOfSourceOperand(const SourceOperandIndex & index,const StepRoleCallback & callback) const2079  void ExecutionPlan::CompoundBody::forEachStepRoleOfSourceOperand(
2080          const SourceOperandIndex& index, const StepRoleCallback& callback) const {
2081      const auto it = mSourceOperandToStepRoles.find(index);
2082      if (it == mSourceOperandToStepRoles.end()) return;
2083      for (const auto& [stepIndex, type, ioIndex] : it->second) {
2084          CHECK_LT(stepIndex, mSteps.size());
2085          const auto* step = mSteps[stepIndex]->executionStep();
2086          callback(step->getPreparedStepModel().get(), type, ioIndex);
2087      }
2088  }
2089  
getMemoryPreference(IOType type,uint32_t index) const2090  MemoryPreference ExecutionPlan::getMemoryPreference(IOType type, uint32_t index) const {
2091      CHECK(mState == SIMPLE || mState == COMPOUND);
2092      if (mState == SIMPLE) {
2093          return simple()->mPreparedModel->getMemoryPreference();
2094      } else {
2095          const auto sourceOperandIndex = type == IOType::INPUT ? getInputSourceOperand(index)
2096                                                                : getOutputSourceOperand(index);
2097          return compound()->getMemoryPreferenceOfSourceOperand(sourceOperandIndex);
2098      }
2099  }
2100  
getMemoryPreferenceOfSourceOperand(const SourceOperandIndex & index) const2101  MemoryPreference ExecutionPlan::CompoundBody::getMemoryPreferenceOfSourceOperand(
2102          const SourceOperandIndex& index) const {
2103      uint32_t alignment = kMinMemoryAlignment, padding = kMinMemoryPadding;
2104      forEachStepRoleOfSourceOperand(
2105              index, [&alignment, &padding](const auto* preparedModel, IOType, uint32_t) {
2106                  const auto preference = preparedModel->getMemoryPreference();
2107                  alignment = std::max(alignment, preference.alignment);
2108                  padding = std::max(padding, preference.padding);
2109              });
2110      return {alignment, padding};
2111  }
2112  
forEachDynamicTemporary(const std::function<void (SourceOperandIndex,const Operand &,uint32_t definingStepIndex)> & fn) const2113  void ExecutionPlan::forEachDynamicTemporary(
2114          const std::function<void(SourceOperandIndex, const Operand&, uint32_t definingStepIndex)>&
2115                  fn) const {
2116      if (mState != COMPOUND) {
2117          return;
2118      }
2119  
2120      for (const auto& logicalStep : compound()->mSteps) {
2121          if (const ExecutionStep* step = logicalStep->tryExecutionStep()) {
2122              const uint32_t stepIndex = step->getIndex();
2123              const uint32_t sourceModelIndex = step->getSourceModelIndex();
2124              for (const auto& entry : step->getTempsAsStepModelOutputs()) {
2125                  const auto sourceOperandIndex = SourceOperandIndex(sourceModelIndex, entry.first);
2126                  const auto& sourceOperand = getSourceOperand(sourceOperandIndex);
2127                  if (hasUnknownSize(sourceOperand)) {
2128                      fn(sourceOperandIndex, sourceOperand, stepIndex);
2129                  }
2130              }
2131          }
2132      }
2133  }
2134  
// Entry point of compilation-time partitioning: registers this model as a
// source model of the plan, partitions it (and, transitively, any referenced
// models) across the given devices, then finalizes the plan via
// ExecutionPlan::finish(). Returns an ANEURALNETWORKS_* result code; an error
// from the internal partitioning step returns early without finishing.
int ModelBuilder::partitionTheWork(const std::vector<std::shared_ptr<Device>>& devices,
                                   uint32_t preference, uint32_t priority,
                                   const OptionalTimePoint& deadline, ExecutionPlan* plan,
                                   const std::vector<TokenValuePair>& metaData,
                                   int simulateFailureResultCode) const {
    uint32_t sourceModelIndex = plan->getSourceModels().addModel(this);
    NN_RETURN_IF_ERROR(partitionTheWorkInternal(sourceModelIndex, devices, preference, priority,
                                                deadline, plan));
    int n = plan->finish(preference, priority, deadline, metaData, simulateFailureResultCode);
    // Dump the source model and the resulting plan even if finish() failed,
    // to aid debugging.
    if (VLOG_IS_ON(COMPILATION)) {
        VLOG(COMPILATION) << "ModelBuilder::partitionTheWork: source model: ";
        logModelToInfo(makeModel());
        plan->dump();
    }
    return n;
}
2151  
// Heuristically partitions the source model identified by |sourceModelIndex|
// across |devices|, appending execution steps to |plan|. For IF and WHILE
// operations scheduled for interpreted execution, this recurses into the
// referenced condition/branch/body models; the plan representation is flat,
// so all steps from all source models end up in the same step list.
// Returns ANEURALNETWORKS_NO_ERROR on success, or the failing step's error.
int ModelBuilder::partitionTheWorkInternal(uint32_t sourceModelIndex,
                                           const std::vector<std::shared_ptr<Device>>& devices,
                                           uint32_t preference, uint32_t priority,
                                           const OptionalTimePoint& deadline,
                                           ExecutionPlan* plan) const {
    // This function uses a heuristic approach to partitioning the graph.
    // It should be good enough for the first release.

    SourceModels* sourceModels = &plan->getSourceModels();
    const size_t deviceCount = devices.size();
    const size_t operationCount = mOperations.size();

    VLOG(COMPILATION) << "ModelBuilder::partitionTheWork: "
                      << "sourceModelIndex = " << sourceModelIndex << ", "
                      << "deviceCount = " << deviceCount << ", "
                      << "operationCount = " << operationCount;

    // Figure out where each operation will best execute.
    // The value of the vector is the index in the devices vector.
    std::vector<int> bestDeviceForOperation(operationCount);
    NN_RETURN_IF_ERROR(
            findBestDeviceForEachOperation(preference, devices, &bestDeviceForOperation));

    // A special value produced by findBestDeviceForEachOperation meaning that
    // this is a control flow operation scheduled for interpreted execution
    // (see LogicalStep).
    const int kControlFlowInterpreter = deviceCount;

    // If one device will run all the operations, we don't need to split the
    // work. This shortcut does not apply when recursively partitioning
    // referenced models because our plan representation is flat.
    if (sourceModelIndex == kMainModelInSourceModels &&
        std::adjacent_find(bestDeviceForOperation.begin(), bestDeviceForOperation.end(),
                           std::not_equal_to<int>()) == bestDeviceForOperation.end()) {
        const int bestDeviceIndex = bestDeviceForOperation[0];
        // Bypass the partitioning process unless the only operation is a
        // control flow operation scheduled for interpreted execution.
        if (bestDeviceIndex != kControlFlowInterpreter) {
            VLOG(COMPILATION) << "ModelBuilder::partitionTheWork: only one best device: "
                              << bestDeviceIndex << " = " << devices[bestDeviceIndex]->getName();
            plan->becomeSingleStep(devices[bestDeviceIndex], this);
            return ANEURALNETWORKS_NO_ERROR;
        }
    }

    // No easy solution, we need to split the work.

    // We keep track of the operations that are ready to run for each device.
    // perDeviceQueue[deviceCount] is for interpreted execution of control flow
    // (see LogicalStep).
    std::vector<std::queue<uint32_t>> perDeviceQueue(deviceCount + 1);

    // This helper function produces a device name.
    auto deviceName = [&devices, kControlFlowInterpreter,
                       deviceCount](int deviceIndex) -> std::string {
        if (deviceIndex == kControlFlowInterpreter) {
            return "NNAPI";
        } else if (deviceIndex < 0 || size_t(deviceIndex) >= deviceCount) {
            return "{unknown}";
        } else {
            return devices.at(deviceIndex)->getName();
        }
    };

    // This helper function enqueues the operation on the appropriate queue.
    auto enqueueOnAppropriateDevice = [&](uint32_t operationIndex) {
        int deviceIndex = bestDeviceForOperation[operationIndex];
        perDeviceQueue[deviceIndex].push(operationIndex);
        VLOG(COMPILATION) << "enqueueOnAppropriateDevice " << operationIndex << " onto "
                          << deviceIndex << " (" << deviceName(deviceIndex) << ")";
    };

    // This helper function finds a device that has operations ready to process.
    // We start by looking at the control flow queue, and then look at the
    // devices in reverse order (i.e., starting at the end of the devices
    // vector). Earlier devices have a chance to prepare more of the inputs
    // required by other devices. This function returns -1 if all queues are
    // empty.
    auto findNextDeviceToProcess = [&]() -> int {
        for (int i = perDeviceQueue.size() - 1; i >= 0; i--) {
            if (!perDeviceQueue[i].empty()) {
                return i;
            }
        }
        return -1;
    };

    // The tracker enqueues an operation as soon as all of its inputs are
    // available; markProcessed() below may make further operations ready.
    OperandTracker tracker(this, enqueueOnAppropriateDevice);
    // For each iteration of this loop, we'll create either an execution step or
    // an interpreted control flow construct (including nested execution steps
    // and interpreted control flow constructs).
    while (true) {
        // Find the device we'll do this step for.
        int deviceIndex = findNextDeviceToProcess();
        VLOG(COMPILATION) << "findNextDeviceToProcess: " << deviceIndex << " ("
                          << deviceName(deviceIndex) << ")";
        if (deviceIndex < 0) {
            break;
        }

        // Assign as much as possible to this device.
        auto& queue = perDeviceQueue[deviceIndex];
        if (deviceIndex != kControlFlowInterpreter) {
            ExecutionStep* step =
                    plan->createNewExecutionStep(sourceModelIndex, devices[deviceIndex]);
            while (!queue.empty()) {
                uint32_t operationIndex = queue.front();
                queue.pop();
                int n = step->addOperation(operationIndex);
                if (n != ANEURALNETWORKS_NO_ERROR) {
                    LOG(ERROR) << "failed to add operation " << operationIndex << " to step";
                    return n;
                }
                tracker.markProcessed(operationIndex, enqueueOnAppropriateDevice);
            }
        } else {
            // Interpreted control flow: each queued operation is an IF or a
            // WHILE whose referenced models are recursively partitioned.
            while (!queue.empty()) {
                uint32_t operationIndex = queue.front();
                queue.pop();
                const Operation& operation = getOperation(operationIndex);
                if (operation.type == OperationType::IF) {
                    namespace op = operation_if;
                    const Operand& thenOperand =
                            getOperand(operation.inputs[op::kThenModelOperand]);
                    const Operand& elseOperand =
                            getOperand(operation.inputs[op::kElseModelOperand]);
                    const ModelBuilder* thenModel = getReferencedModel(thenOperand);
                    const ModelBuilder* elseModel = getReferencedModel(elseOperand);
                    uint32_t thenModelIndex = sourceModels->addModel(thenModel);
                    uint32_t elseModelIndex = sourceModels->addModel(elseModel);

                    // Emits the following:
                    // Index  Step
                    //   i    if then=(i + 1) else=(j + 1)
                    //  ...   (then model steps)
                    //   j    goto k
                    //  ...   (else model steps)
                    //   k    (steps after the IF)
                    //
                    // Note: the step-creation calls below are order-sensitive;
                    // getNextStepIndex() is sampled immediately before each
                    // recursive partition to record branch entry points.
                    IfStep* ifStep = plan->createNewIfStep();
                    ifStep->conditionOperandIndex = SourceOperandIndex(
                            sourceModelIndex, operation.inputs[op::kCondBoolOperand]);
                    ifStep->thenStepIndex = plan->getNextStepIndex();
                    NN_RETURN_IF_ERROR(thenModel->partitionTheWorkInternal(
                            thenModelIndex, devices, preference, priority, deadline, plan));
                    GotoStep* afterThenBranch = plan->createNewGotoStep();
                    ifStep->elseStepIndex = plan->getNextStepIndex();
                    NN_RETURN_IF_ERROR(elseModel->partitionTheWorkInternal(
                            elseModelIndex, devices, preference, priority, deadline, plan));
                    afterThenBranch->gotoStepIndex = plan->getNextStepIndex();

                    // Outer model operands.
                    for (uint32_t i = op::kFirstInput, n = operation.inputs.size(); i < n; ++i) {
                        ifStep->outerInputOperands.emplace_back(sourceModelIndex,
                                                                operation.inputs[i]);
                    }
                    for (uint32_t i = 0, n = operation.outputs.size(); i < n; ++i) {
                        ifStep->outerOutputOperands.emplace_back(sourceModelIndex,
                                                                 operation.outputs[i]);
                    }
                    // Then model operands.
                    for (uint32_t i = 0, n = thenModel->inputCount(); i < n; ++i) {
                        ifStep->thenBranchInputOperands.emplace_back(
                                thenModelIndex, thenModel->getInputOperandIndex(i));
                    }
                    for (uint32_t i = 0, n = thenModel->outputCount(); i < n; ++i) {
                        ifStep->thenBranchOutputOperands.emplace_back(
                                thenModelIndex, thenModel->getOutputOperandIndex(i));
                    }
                    // Else model operands.
                    for (uint32_t i = 0, n = elseModel->inputCount(); i < n; ++i) {
                        ifStep->elseBranchInputOperands.emplace_back(
                                elseModelIndex, elseModel->getInputOperandIndex(i));
                    }
                    for (uint32_t i = 0, n = elseModel->outputCount(); i < n; ++i) {
                        ifStep->elseBranchOutputOperands.emplace_back(
                                elseModelIndex, elseModel->getOutputOperandIndex(i));
                    }
                } else if (operation.type == OperationType::WHILE) {
                    namespace op = operation_while;
                    const Operand& condOperand =
                            getOperand(operation.inputs[op::kCondModelOperand]);
                    const Operand& bodyOperand =
                            getOperand(operation.inputs[op::kBodyModelOperand]);
                    const ModelBuilder* condModel = getReferencedModel(condOperand);
                    const ModelBuilder* bodyModel = getReferencedModel(bodyOperand);
                    uint32_t condModelIndex = sourceModels->addModel(condModel);
                    uint32_t bodyModelIndex = sourceModels->addModel(bodyModel);

                    // Emits the following:
                    // Index  Step
                    //   i    while cond=(i + 1) body=(j + 1) exit=(k + 1)
                    //  ...   (cond model steps)
                    //   j    goto i
                    //  ...   (body model steps)
                    //   k    goto i
                    //  ...   (steps after the WHILE)
                    //
                    //  Note that WhileStep has WhileState associated with it.
                    WhileStep* whileStep = plan->createNewWhileStep();
                    whileStep->condStepIndex = plan->getNextStepIndex();
                    NN_RETURN_IF_ERROR(condModel->partitionTheWorkInternal(
                            condModelIndex, devices, preference, priority, deadline, plan));
                    GotoStep* afterCond = plan->createNewGotoStep();
                    afterCond->gotoStepIndex = whileStep->index;
                    whileStep->bodyStepIndex = plan->getNextStepIndex();
                    NN_RETURN_IF_ERROR(bodyModel->partitionTheWorkInternal(
                            bodyModelIndex, devices, preference, priority, deadline, plan));
                    GotoStep* afterBody = plan->createNewGotoStep();
                    afterBody->gotoStepIndex = whileStep->index;
                    whileStep->exitStepIndex = plan->getNextStepIndex();

                    // Outer model operands.
                    for (uint32_t i = op::kFirstInput, n = operation.inputs.size(); i < n; ++i) {
                        whileStep->outerInputOperands.emplace_back(sourceModelIndex,
                                                                   operation.inputs[i]);
                    }
                    for (uint32_t i = 0, n = operation.outputs.size(); i < n; ++i) {
                        whileStep->outerOutputOperands.emplace_back(sourceModelIndex,
                                                                    operation.outputs[i]);
                    }
                    // Cond model operands.
                    for (uint32_t i = 0, n = condModel->inputCount(); i < n; ++i) {
                        whileStep->condInputOperands.emplace_back(
                                condModelIndex, condModel->getInputOperandIndex(i));
                    }
                    whileStep->condOutputOperand =
                            SourceOperandIndex(condModelIndex, condModel->getOutputOperandIndex(0));
                    // Body model operands.
                    for (uint32_t i = 0, n = bodyModel->inputCount(); i < n; ++i) {
                        whileStep->bodyInputOperands.emplace_back(
                                bodyModelIndex, bodyModel->getInputOperandIndex(i));
                    }
                    for (uint32_t i = 0, n = bodyModel->outputCount(); i < n; ++i) {
                        whileStep->bodyOutputOperands.emplace_back(
                                bodyModelIndex, bodyModel->getOutputOperandIndex(i));
                    }
                } else {
                    // findBestDeviceForEachOperation only routes IF/WHILE here.
                    CHECK(false) << operation.type << " is not a control flow operation";
                }
                tracker.markProcessed(operationIndex, enqueueOnAppropriateDevice);
            }
        }
    }
    return ANEURALNETWORKS_NO_ERROR;
}
2397  
getPerformance(uint32_t preference,const std::shared_ptr<Device> device) const2398  float ModelBuilder::getPerformance(uint32_t preference,
2399                                     const std::shared_ptr<Device> device) const {
2400      // Note that we will call this method multiple times per compilation with
2401      // the same arguments if there are nested control flow operations and we
2402      // decide to execute the outer operation on the ExecutionPlan::next()
2403      // interpreter.
2404      //
2405      // This is a potential compilation performance problem. To work around it,
2406      // the performance value could be cached for the duration of a compilation.
2407      float perf = 0;
2408      const size_t operationCount = mOperations.size();
2409      for (size_t operationIndex = 0; operationIndex < operationCount; operationIndex++) {
2410          perf += getPerformance(preference, device, operationIndex);
2411      }
2412      return perf;
2413  }
2414  
getPerformance(uint32_t preference,const std::shared_ptr<Device> device,uint32_t operationIndex) const2415  float ModelBuilder::getPerformance(uint32_t preference, const std::shared_ptr<Device> device,
2416                                     uint32_t operationIndex) const {
2417      auto applyPreference = [preference](const Capabilities::PerformanceInfo& perf) {
2418          return preference == ANEURALNETWORKS_PREFER_LOW_POWER ? perf.powerUsage : perf.execTime;
2419      };
2420  
2421      const Operation& operation = getOperation(operationIndex);
2422  
2423      if (operation.type == OperationType::IF) {
2424          namespace op = operation_if;
2425          const Operand& thenOperand = getOperand(operation.inputs[op::kThenModelOperand]);
2426          const Operand& elseOperand = getOperand(operation.inputs[op::kElseModelOperand]);
2427          const ModelBuilder* thenModel = getReferencedModel(thenOperand);
2428          const ModelBuilder* elseModel = getReferencedModel(elseOperand);
2429          return applyPreference(device->getIfPerformance()) +
2430                 0.5 * (thenModel->getPerformance(preference, device) +
2431                        elseModel->getPerformance(preference, device));
2432      }
2433  
2434      if (operation.type == OperationType::WHILE) {
2435          namespace op = operation_while;
2436          const Operand& condOperand = getOperand(operation.inputs[op::kCondModelOperand]);
2437          const Operand& bodyOperand = getOperand(operation.inputs[op::kBodyModelOperand]);
2438          const ModelBuilder* condModel = getReferencedModel(condOperand);
2439          const ModelBuilder* bodyModel = getReferencedModel(bodyOperand);
2440          return applyPreference(device->getWhilePerformance()) +
2441                 condModel->getPerformance(preference, device) +
2442                 bodyModel->getPerformance(preference, device);
2443      }
2444  
2445      // TODO This assumes that the type is dictated by the first operand. This is
2446      // currently the case but is not a safe assumption to make in the long term.
2447      const uint32_t operandIndex = operation.inputs[0];
2448      const OperandType operandType = mOperands[operandIndex].type;
2449      switch (operandType) {
2450          case OperandType::FLOAT32:
2451              if (mRelaxComputationFloat32toFloat16) {
2452                  return applyPreference(device->getRelaxedFloat32toFloat16PerformanceScalar());
2453              }
2454              break;
2455          case OperandType::TENSOR_FLOAT32:
2456              if (mRelaxComputationFloat32toFloat16) {
2457                  return applyPreference(device->getRelaxedFloat32toFloat16PerformanceTensor());
2458              }
2459              break;
2460          default:
2461              break;
2462      }
2463  
2464      return applyPreference(device->getPerformance(operandType));
2465  }
2466  
isControlFlowOperationWithOperandOfUnknownSize(uint32_t operationIndex) const2467  bool ModelBuilder::isControlFlowOperationWithOperandOfUnknownSize(uint32_t operationIndex) const {
2468      auto containsUnknownSize = [](const ModelBuilder* model,
2469                                    const std::vector<uint32_t>& operandIndexes) {
2470          for (uint32_t operandIndex : operandIndexes) {
2471              if (hasUnknownSize(model->getOperand(operandIndex))) {
2472                  return true;
2473              }
2474          }
2475          return false;
2476      };
2477  
2478      const Operation& operation = getOperation(operationIndex);
2479  
2480      if (operation.type == OperationType::IF) {
2481          namespace op = operation_if;
2482          const Operand& thenOperand = getOperand(operation.inputs[op::kThenModelOperand]);
2483          const Operand& elseOperand = getOperand(operation.inputs[op::kElseModelOperand]);
2484          const ModelBuilder* thenModel = getReferencedModel(thenOperand);
2485          const ModelBuilder* elseModel = getReferencedModel(elseOperand);
2486          return containsUnknownSize(this, operation.inputs) ||
2487                 containsUnknownSize(this, operation.outputs) ||
2488                 containsUnknownSize(thenModel, thenModel->getInputOperandIndexes()) ||
2489                 containsUnknownSize(thenModel, thenModel->getOutputOperandIndexes()) ||
2490                 containsUnknownSize(elseModel, elseModel->getInputOperandIndexes()) ||
2491                 containsUnknownSize(elseModel, elseModel->getOutputOperandIndexes());
2492      }
2493  
2494      if (operation.type == OperationType::WHILE) {
2495          namespace op = operation_while;
2496          const Operand& condOperand = getOperand(operation.inputs[op::kCondModelOperand]);
2497          const Operand& bodyOperand = getOperand(operation.inputs[op::kBodyModelOperand]);
2498          const ModelBuilder* condModel = getReferencedModel(condOperand);
2499          const ModelBuilder* bodyModel = getReferencedModel(bodyOperand);
2500          return containsUnknownSize(this, operation.inputs) ||
2501                 containsUnknownSize(this, operation.outputs) ||
2502                 containsUnknownSize(condModel, condModel->getInputOperandIndexes()) ||
2503                 containsUnknownSize(condModel, condModel->getOutputOperandIndexes()) ||
2504                 containsUnknownSize(bodyModel, bodyModel->getInputOperandIndexes()) ||
2505                 containsUnknownSize(bodyModel, bodyModel->getOutputOperandIndexes());
2506      }
2507  
2508      // Not a control flow operation.
2509      return false;
2510  }
2511  
supportedByControlFlowInterpreter(uint32_t operationIndex) const2512  bool ModelBuilder::supportedByControlFlowInterpreter(uint32_t operationIndex) const {
2513      const Operation& operation = getOperation(operationIndex);
2514      return (operation.type == OperationType::IF || operation.type == OperationType::WHILE) &&
2515             // The partitioner does not support dynamic temporaries (b/132458982).
2516             !isControlFlowOperationWithOperandOfUnknownSize(operationIndex);
2517  }
2518  
2519  namespace {
2520  
2521  // This class determines whether a given device can execute a given operation
2522  class CanDo {
2523     public:
CanDo()2524      CanDo() {}
2525  
initialize(const MetaModel & metaModel,std::shared_ptr<Device> device)2526      void initialize(const MetaModel& metaModel, std::shared_ptr<Device> device) {
2527          mSupportsOperationByIndex = device->getSupportedOperations(metaModel);
2528      }
2529  
check(size_t operationIndex) const2530      bool check(size_t operationIndex) const { return mSupportsOperationByIndex[operationIndex]; }
2531  
2532     private:
2533      std::vector<bool> mSupportsOperationByIndex;
2534  };
2535  
2536  }  // anonymous namespace
2537  
// For each operation of this model, picks the device (an index into
// |devices|) giving the best performance under |preference|, writing the
// result into |bestDeviceForOperation|. Two special cases:
//  - control flow operations with operands of unknown size may only go to
//    the CPU device (1.3 HAL limitation);
//  - a control flow operation that would land on the CPU but is supported by
//    the interpreter is assigned the sentinel index deviceCount instead (the
//    kControlFlowInterpreter value consumed by partitionTheWorkInternal).
// Returns ANEURALNETWORKS_BAD_DATA if some operation is supported by no
// device, ANEURALNETWORKS_NO_ERROR otherwise.
int ModelBuilder::findBestDeviceForEachOperation(
        uint32_t preference, const std::vector<std::shared_ptr<Device>>& devices,
        std::vector<int>* bestDeviceForOperation) const {
    const MetaModel metaModel(makeModel(), DeviceManager::get()->strictSlicing());

    // Ask every device up front which operations it supports.
    const size_t deviceCount = devices.size();
    std::vector<CanDo> canDo(deviceCount);
    for (size_t deviceIndex = 0; deviceIndex < deviceCount; deviceIndex++) {
        canDo[deviceIndex].initialize(metaModel, devices[deviceIndex]);
    }

    // Figure out the best driver for each operation.
    const size_t operationCount = mOperations.size();
    for (size_t operationIndex = 0; operationIndex < operationCount; operationIndex++) {
        const Operation& operation = getOperation(operationIndex);
        // Find which device, including CPU fallback, gives the best performance for this operation.
        int bestChoice = -1;

        if (isControlFlowOperationWithOperandOfUnknownSize(operationIndex)) {
            // Do not schedule control flow operations with unknown size to
            // non-CPU devices because this is not supported by the 1.3 HAL.
            // See http://b/159076604#comment5.
            auto cpuDeviceIterator =
                    std::find(devices.begin(), devices.end(), DeviceManager::getCpuDevice());
            if (cpuDeviceIterator != devices.end()) {
                int cpuDeviceIndex = cpuDeviceIterator - devices.begin();
                if (canDo[cpuDeviceIndex].check(operationIndex)) {
                    bestChoice = cpuDeviceIndex;
                }
            }
        } else {
            float bestPerfVal = 0.0;  // Do not check bestPerfVal if bestChoice < 0.
            for (size_t deviceIndex = 0; deviceIndex < deviceCount; deviceIndex++) {
                const auto& device = devices[deviceIndex];
                if (canDo[deviceIndex].check(operationIndex)) {
                    const float perfVal = getPerformance(preference, device, operationIndex);
                    // Lower is better; on an exact tie, the CPU device wins.
                    const bool deviceIsPreferred = (device == DeviceManager::getCpuDevice());
                    if (bestChoice < 0 || perfVal < bestPerfVal ||
                        (perfVal == bestPerfVal && deviceIsPreferred)) {
                        bestChoice = deviceIndex;
                        bestPerfVal = perfVal;
                    }
                } else {
                    // Somewhat noisy logging, but only place where the user of NNAPI can get
                    // feedback on why an operation was not run on a specific device.
                    //
                    // Logs O(operationCount * deviceCount) times, but typically deviceCount is
                    // very small.
                    VLOG(COMPILATION) << "Device " << device->getName() << " can't do operation "
                                      << operation.type << ":" << operationIndex;
                }
            }
        }

        if (bestChoice < 0) {
            LOG(ERROR) << "No driver can do operation " << operation.type;
            return ANEURALNETWORKS_BAD_DATA;
        } else if (devices[bestChoice] == DeviceManager::getCpuDevice() &&
                   supportedByControlFlowInterpreter(operationIndex)) {
            // Run control flow on the ExecutionPlan::next() interpreter and try
            // to delegate referenced models.
            const int kControlFlowInterpreter = deviceCount;
            (*bestDeviceForOperation)[operationIndex] = kControlFlowInterpreter;
            VLOG(COMPILATION) << "ModelBuilder::findBestDeviceForEachOperation(" << operation.type
                              << ":" << operationIndex << ") = -1 (NNAPI)";
        } else {
            (*bestDeviceForOperation)[operationIndex] = bestChoice;
            VLOG(COMPILATION) << "ModelBuilder::findBestDeviceForEachOperation(" << operation.type
                              << ":" << operationIndex << ") = " << bestChoice << " ("
                              << devices[bestChoice]->getName() << ")";
        }
    }
    return ANEURALNETWORKS_NO_ERROR;
}
2612  
2613  }  // namespace nn
2614  }  // namespace android
2615