1 /* 2 * Copyright (C) 2017 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #define LOG_TAG "ExecutionPlan" 18 19 #include "ExecutionPlan.h" 20 21 #include <ControlFlow.h> 22 #include <CpuExecutor.h> 23 #include <GraphDump.h> 24 #include <LegacyUtils.h> 25 #include <MetaModel.h> 26 #include <OperationsUtils.h> 27 #include <TokenHasher.h> 28 #include <Tracing.h> 29 #include <android-base/logging.h> 30 #include <fcntl.h> 31 #include <nnapi/IBurst.h> 32 #include <sys/stat.h> 33 #include <sys/types.h> 34 35 #include <algorithm> 36 #include <functional> 37 #include <map> 38 #include <memory> 39 #include <mutex> 40 #include <queue> 41 #include <set> 42 #include <string> 43 #include <type_traits> 44 #include <unordered_set> 45 #include <utility> 46 #include <vector> 47 48 #include "BurstBuilder.h" 49 #include "CompilationBuilder.h" 50 #include "ExecutionBuilder.h" 51 #include "ExecutionCallback.h" 52 #include "Manager.h" 53 #include "ModelBuilder.h" 54 #include "TypeManager.h" 55 56 namespace android { 57 namespace nn { 58 59 namespace { 60 61 // The index of the main model in SourceModels. 
62 constexpr uint32_t kMainModelInSourceModels = 0; 63 64 constexpr uint32_t kNoPadding = 1; 65 updateTokenFromMetaData(TokenHasher * token,const std::vector<TokenValuePair> & metaData)66 static bool updateTokenFromMetaData(TokenHasher* token, 67 const std::vector<TokenValuePair>& metaData) { 68 // Combines the TokenValuePair and corresponding extension name. 69 std::vector<std::tuple<const char*, uint16_t, const uint8_t*, size_t>> metaDataWithExtension; 70 for (auto p : metaData) { 71 uint16_t prefix = static_cast<uint32_t>(p.token) >> kExtensionTypeBits; 72 uint16_t extensionEnum = static_cast<uint32_t>(p.token) & kTypeWithinExtensionMask; 73 const Extension* extension; 74 if (!TypeManager::get()->getExtensionInfo(prefix, &extension)) { 75 LOG(ERROR) << "Prefix " << prefix << " could not be found"; 76 return false; 77 } 78 metaDataWithExtension.push_back(std::make_tuple(extension->name.c_str(), extensionEnum, 79 p.value.data(), p.value.size())); 80 } 81 // Sort with extension name and extension enum. 82 std::sort(metaDataWithExtension.begin(), metaDataWithExtension.end(), 83 [](const auto& a, const auto& b) { 84 if (int r = strcmp(std::get<0>(a), std::get<0>(b))) { 85 return r < 0; 86 } else { 87 return std::get<1>(a) < std::get<1>(b); 88 } 89 }); 90 // Update the cache token with the sorted array. 91 for (auto [extensionName, extensionEnum, value, valueSize] : metaDataWithExtension) { 92 if (!token->updateFromString(extensionName) || 93 !token->update(&extensionEnum, sizeof(uint16_t)) || !token->update(value, valueSize)) { 94 return false; 95 } 96 } 97 return true; 98 } 99 100 // Compiles the model on device. 101 // If compilation caching is available, depending on ExecutionPlan::mState, the token may only have 102 // been initialized by the user provided token (SIMPLE body), or is already re-hashed by the 103 // operation indices to be executed (COMPOUND body). 
The token will be re-hashed further by the 104 // device name, device version string, and the execution preference in this function. compile(const Device & device,const ModelBuilder & model,int executionPreference,int compilationPriority,const OptionalTimePoint & deadline,const CacheInfo & cacheInfo,TokenHasher * token,const std::vector<TokenValuePair> & metaData,std::shared_ptr<RuntimePreparedModel> * preparedModel)105 int compile(const Device& device, const ModelBuilder& model, int executionPreference, 106 int compilationPriority, const OptionalTimePoint& deadline, const CacheInfo& cacheInfo, 107 TokenHasher* token, const std::vector<TokenValuePair>& metaData, 108 std::shared_ptr<RuntimePreparedModel>* preparedModel) { 109 CHECK(token != nullptr); 110 CHECK(preparedModel != nullptr); 111 *preparedModel = nullptr; 112 113 std::optional<CacheToken> cacheToken; 114 if (device.isCachingSupported() && token->ok() && 115 token->updateFromString(device.getName().c_str()) && 116 token->updateFromString(device.getVersionString().c_str()) && 117 token->update(&executionPreference, sizeof(executionPreference)) && 118 token->update(&compilationPriority, sizeof(compilationPriority)) && 119 updateTokenFromMetaData(token, metaData) && token->finish()) { 120 cacheToken = CacheToken{}; 121 const uint8_t* tokenPtr = token->getCacheToken(); 122 std::copy(tokenPtr, tokenPtr + cacheToken->size(), cacheToken->begin()); 123 } 124 125 const ModelFactory makeModel = [&model] { return model.makeModel(); }; 126 const ExecutionPreference preference = static_cast<ExecutionPreference>(executionPreference); 127 const Priority priority = convertToCanonicalPriority(compilationPriority); 128 std::vector<ExtensionNameAndPrefix> extensionNameAndPrefix = 129 TypeManager::get()->getExtensionNameAndPrefix(metaData); 130 const auto [n, returnedPreparedModel] = 131 device.prepareModel(makeModel, preference, priority, deadline, cacheInfo, cacheToken, 132 metaData, extensionNameAndPrefix); 133 
*preparedModel = returnedPreparedModel; 134 return n; 135 } 136 137 typedef std::function<void(uint32_t)> OperationReadyCallback; 138 copyOperandExtraParams(ModelBuilder & model,uint32_t toOperandIndex,const Operand & fromOperand)139 int copyOperandExtraParams(ModelBuilder& model, uint32_t toOperandIndex, 140 const Operand& fromOperand) { 141 if (fromOperand.type == OperandType::TENSOR_QUANT8_SYMM_PER_CHANNEL && 142 std::holds_alternative<Operand::SymmPerChannelQuantParams>(fromOperand.extraParams)) { 143 auto& fromChannelQuant = 144 std::get<Operand::SymmPerChannelQuantParams>(fromOperand.extraParams); 145 ANeuralNetworksSymmPerChannelQuantParams toChannelQuant = { 146 .channelDim = fromChannelQuant.channelDim, 147 .scaleCount = static_cast<uint32_t>(fromChannelQuant.scales.size()), 148 .scales = fromChannelQuant.scales.data(), 149 }; 150 return model.setOperandSymmPerChannelQuantParams(toOperandIndex, toChannelQuant); 151 } else if (isExtension(fromOperand.type) && 152 std::holds_alternative<Operand::ExtensionParams>(fromOperand.extraParams)) { 153 auto extensionData = std::get<Operand::ExtensionParams>(fromOperand.extraParams); 154 return model.setOperandExtensionData(toOperandIndex, extensionData.data(), 155 extensionData.size()); 156 } else if (!std::holds_alternative<Operand::NoParams>(fromOperand.extraParams) || 157 fromOperand.type == OperandType::TENSOR_QUANT8_SYMM_PER_CHANNEL) { 158 LOG(ERROR) << "Type " << fromOperand.type 159 << " has an unexpected extraParams variant: " << fromOperand.extraParams.index(); 160 return ANEURALNETWORKS_BAD_DATA; 161 } else { 162 return ANEURALNETWORKS_NO_ERROR; 163 } 164 } 165 166 // This class tracks whether we know the value of an operand as operations 167 // are processed. 168 class OperandTracker { 169 public: 170 // Creates the tracker for this model. Figure out which operations can be 171 // executed right away and cb for each one of them. 
    OperandTracker(const ModelBuilder* model, OperationReadyCallback cb);
    // Mark the specified operation as having been processed. The output
    // of the operation now being known, this may make new operations to be
    // able to run. Call cb for each one of them.
    void markProcessed(uint32_t operationIndex, OperationReadyCallback cb);

   private:
    const ModelBuilder* mModel;
    // Maps an operand index to the indexes of the operations that consume it.
    std::multimap<uint32_t, uint32_t> mOperandToOperations;
    std::vector<uint32_t> mUnknownInputCount;  // For each operation
};

OperandTracker::OperandTracker(const ModelBuilder* model, OperationReadyCallback cb)
    : mModel(model) {
    const auto& operations = mModel->getOperations();
    mUnknownInputCount.resize(operations.size());
    for (uint32_t operationIndex = 0; operationIndex < operations.size(); operationIndex++) {
        const Operation& operation = operations[operationIndex];
        uint32_t count = 0;
        for (uint32_t operandIndex : operation.inputs) {
            auto lifetime = mModel->getOperand(operandIndex).lifetime;
            // Only operands produced at runtime by another operation can be
            // "unknown"; constants, model inputs, etc. are known up front.
            if (lifetime == Operand::LifeTime::TEMPORARY_VARIABLE ||
                lifetime == Operand::LifeTime::SUBGRAPH_OUTPUT) {
                count++;
                mOperandToOperations.emplace(operandIndex, operationIndex);
            }
        }
        // No unknown inputs: the operation is ready to execute immediately.
        if (count == 0) {
            cb(operationIndex);
        }
        mUnknownInputCount[operationIndex] = count;
    }
}

void OperandTracker::markProcessed(uint32_t operationIndex, OperationReadyCallback cb) {
    // Mark all its outputs as known.
    const Operation& operation = mModel->getOperations()[operationIndex];
    for (uint32_t operandIndex : operation.outputs) {
        auto range = mOperandToOperations.equal_range(operandIndex);
        for (auto i = range.first; i != range.second; i++) {
            uint32_t& count = mUnknownInputCount[i->second];
            // When a consumer's last unknown input becomes known, it is ready.
            if (--count == 0) {
                cb(i->second);
            }
        }
    }
}

// Reserves space for one temporary inside a growing arena: aligns the current
// arena size up to `alignment`, records that offset, then grows the arena by
// `size` rounded up to `padding`. Returns the reserved offset and the padded
// length.
StaticTemporaryLocation addTemporary(uint32_t* totalSizeOfTemporaries, uint32_t size,
                                     uint32_t alignment, uint32_t padding) {
    // TODO: what about overflow?
    *totalSizeOfTemporaries = roundUp(*totalSizeOfTemporaries, alignment);
    const uint32_t offset = *totalSizeOfTemporaries;
    size = roundUp(size, padding);
    *totalSizeOfTemporaries += size;
    return {.offset = offset, .paddedLength = size};
};

// Formats a (source model index, operand index) pair as "(m, o)" for logging.
std::string toString(SourceOperandIndex sourceOperandIndex) {
    return "(" + std::to_string(sourceOperandIndex.first) + ", " +
           std::to_string(sourceOperandIndex.second) + ")";
};

// A helper class to analyze the step roles of all partition boundary operands.
//
// To use, call StepRoleAnalyzer::analyze and pass in a setup function that configures the analyzer
// with the following two methods:
//   - addRole: Add a step role to a boundary operand
//   - setUsedBy: Specify that the memory of the "source" operand may be directly used by the "dest"
//     operand. All of the step roles of the "dest" operand are also possible step roles of the
//     "source" operand. This is useful for interpreted control flow, e.g., the outer input operand
//     of an interpreted IF operation may be directly used as all step roles of the corresponding
//     input operand of the then and else models. Note that this relationship is directional --
//     (A->B && B->C) implies A->C, but (A->C && B->C) does not imply A->B or B->A (A->B is a
//     shorthand for setUsedBy(A, B)). The setup function must guarantee that the final graph
//     produced by the used-by relationship is acyclic. This is true for the partitioner algorithm
//     because there must be a root operand of each step role for the memory to be allocated on
//     behalf of.
//
class StepRoleAnalyzer {
   public:
    static std::map<SourceOperandIndex, std::set<StepRole>> analyze(
            const std::function<void(StepRoleAnalyzer&)>& setup) {
        StepRoleAnalyzer analyzer;
        setup(analyzer);
        return analyzer.finish();
    }

    void addRole(const ExecutionStep& step, uint32_t operandIndex, IOType type,
                 uint32_t stepIOIndex) {
        SourceOperandIndex source = {step.getSourceModelIndex(), operandIndex};
        mRoles[source].emplace(step.getIndex(), type, stepIOIndex);
    }

    void setUsedBy(const SourceOperandIndex& source, const SourceOperandIndex& dest) {
        mUsedBy[source].emplace(dest);
    }

   private:
    StepRoleAnalyzer() = default;

    // Merges the step roles of the destination operands to the source operands
    // and returns the final map.
    std::map<SourceOperandIndex, std::set<StepRole>> finish() {
        for (const auto& [source, _] : mUsedBy) {
            finishHelper(source);
        }
        return std::move(mRoles);
    }

    // Depth-first propagation of roles along the used-by graph. Relies on the
    // graph being acyclic (see the class comment) for termination.
    void finishHelper(SourceOperandIndex current) {
        if (mProcessedOperands.count(current) > 0) return;
        mProcessedOperands.insert(current);
        const auto it = mUsedBy.find(current);
        if (it != mUsedBy.end()) {
            auto& roles = mRoles[current];
            // Merge the step roles of the destination operands.
            for (const auto& dest : it->second) {
                finishHelper(dest);
                const auto& destRoles = mRoles[dest];
                roles.insert(destRoles.begin(), destRoles.end());
            }
        }
    }

    // A map from the source operand to its step roles.
    std::map<SourceOperandIndex, std::set<StepRole>> mRoles;
    // A map from the source operand to a set of destination operands that may directly
    // use the memory of the source operand.
    std::map<SourceOperandIndex, std::set<SourceOperandIndex>> mUsedBy;
    // Used in finish to track which operand has been processed.
    std::set<SourceOperandIndex> mProcessedOperands;
};

}  // namespace

// Logs every tracked dynamic temporary at VLOG(EXECUTION) level; no-op when
// there are none.
void DynamicTemporaries::vlogDump(const char* context) const {
    if (empty()) {
        return;
    }
    if (context) {
        VLOG(EXECUTION) << "DynamicTemporaries: \"" << context << "\"";
    }
    for (const auto& temp : mSourceOperandToTemporary) {
        VLOG(EXECUTION) << "DynamicTemporaries: sourceOperandIndex = " << toString(temp.first)
                        << ", stepIndex = " << temp.second.stepIndex
                        << ", offset = " << temp.second.offset
                        << ", dimensions = " << toString(temp.second.dimensions)
                        << ", paddedLength = " << temp.second.paddedLength
                        << ", alignment = " << temp.second.alignment
                        << ", padding = " << temp.second.padding;
    }
}

// Registers a dynamic temporary produced by step `stepIndex` with its initial
// shape and layout requirements. Each operand may only be declared once, and
// declarations are CHECKed to happen before mDeclared is set.
void DynamicTemporaries::declare(SourceOperandIndex sourceOperandIndex, uint32_t stepIndex,
                                 const Dimensions& initialDimensions, uint32_t initialLength,
                                 uint32_t alignment, uint32_t padding) {
    VLOG(EXECUTION) << "DynamicTemporaries::declare(sourceOperandIndex = "
                    << toString(sourceOperandIndex) << ", stepIndex = " << stepIndex
                    << ", initialDimensions = " << toString(initialDimensions)
                    << ", initialLength = " << initialLength << ", alignment = " << alignment
                    << ", padding = " << padding << ")";
    CHECK(!mDeclared);
    CHECK_GT(initialLength, 0u);
    const uint32_t paddedLength = roundUp(initialLength, padding);
    auto [_, isNew] = mSourceOperandToTemporary.emplace(
            sourceOperandIndex, InternalLocationAndShape{stepIndex, 0, initialDimensions,
                                                         paddedLength, alignment, padding});
    CHECK(isNew);
    mStepIndexToSourceOperandIndexes[stepIndex].emplace_back(sourceOperandIndex);
}

// Updates the shape of an already-declared temporary. Returns true iff the
// dimensions or padded length actually changed. Growing the padded length
// invalidates the owning step's allocation; shrinking keeps it.
bool DynamicTemporaries::redeclare(SourceOperandIndex sourceOperandIndex,
                                   const Dimensions& newDimensions, uint32_t newLength) {
    auto createAndLogResult = [sourceOperandIndex, &newDimensions, newLength](bool changedShape) {
        VLOG(EXECUTION) << "DynamicTemporaries::redeclare(sourceOperandIndex = "
                        << toString(sourceOperandIndex)
                        << ", newDimensions = " << toString(newDimensions)
                        << ", newLength = " << newLength << ") -> " << toString(changedShape);
        return changedShape;
    };

    CHECK(mDeclared);
    CHECK_GT(newLength, 0u);

    InternalLocationAndShape& temp = mSourceOperandToTemporary.at(sourceOperandIndex);
    const uint32_t paddedLength = roundUp(newLength, temp.padding);
    if (temp.paddedLength == paddedLength && temp.dimensions == newDimensions) {
        return createAndLogResult(false);
    }
    if (temp.paddedLength < paddedLength) {
        // Otherwise allocation remains valid, even if it may be suboptimal
        // (because it uses more space than needed). Use case: Don't force
        // client to allocate again just because the client reported more
        // accurate shape information.
        mAllocatedStepIndexes.erase(temp.stepIndex);
    }
    temp.paddedLength = paddedLength;
    temp.dimensions = newDimensions;
    return createAndLogResult(true);
}

// Lays out all dynamic temporaries of `stepIndex` into one contiguous region
// and (re)allocates backing ashmem if the existing allocation is too small or
// wastes more than kWaste of its size.
int DynamicTemporaries::allocate(uint32_t stepIndex) {
    VLOG(EXECUTION) << "DynamicTemporaries::allocate(stepIndex = " << stepIndex << ")";

    CHECK(mDeclared);

    const auto sourceOperandIndexesI = mStepIndexToSourceOperandIndexes.find(stepIndex);
    if (sourceOperandIndexesI == mStepIndexToSourceOperandIndexes.end()) {
        // This step has no dynamic temporaries; trivially successful.
        return ANEURALNETWORKS_NO_ERROR;
    }

    // perform layout
    uint32_t newSize = 0;
    for (const auto& sourceOperandIndex : sourceOperandIndexesI->second) {
        InternalLocationAndShape& temp = mSourceOperandToTemporary.at(sourceOperandIndex);
        // temp.paddedLength is already padded in declare and redeclare.
        CHECK(temp.paddedLength % temp.padding == 0);
        temp.offset = addTemporary(&newSize, temp.paddedLength, temp.alignment, kNoPadding).offset;
    }

    // perform (re-)allocation
    // TODO: Today we may shrink the allocation in order to avoid wasting memory. Is this important
    // to conserve memory, or do we waste time reallocating?
    const double kWaste = 0.2 /* arbitrary */;  // Willing to waste space to avoid
                                                // deallocation/reallocation overhead
    auto& memory = mStepIndexToMemory[stepIndex];
    const uint32_t oldSize = (memory ? memory->getSize() : 0);
    if ((oldSize >= newSize) && (oldSize <= newSize * (1 + kWaste))) {
        // Suitable allocation already exists; nothing to do
    } else {
        int n;
        std::tie(n, memory) = MemoryAshmem::create(newSize);
        if (n != ANEURALNETWORKS_NO_ERROR) {
            LOG(ERROR) << "Failed to allocate dynamic temporaries of size " << newSize
                       << " for step " << stepIndex;
            mAllocatedStepIndexes.erase(stepIndex);
            return n;
        }
    }

    mAllocatedStepIndexes.insert(stepIndex);
    return ANEURALNETWORKS_NO_ERROR;
}

// A step counts as "allocated" when it has no dynamic temporaries at all, or
// when allocate() succeeded for it.
bool DynamicTemporaries::allocated(uint32_t stepIndex) const {
    return (mStepIndexToSourceOperandIndexes.find(stepIndex) ==
            mStepIndexToSourceOperandIndexes.end()) ||
           mAllocatedStepIndexes.count(stepIndex);
}

// Looks up the location and shape of a dynamic temporary; std::nullopt when
// the operand is not a dynamic temporary. If the owning step is not yet
// allocated (and mustBeAllocated is false), memory is nullptr and offset is
// ~0 -- only the shape information is meaningful.
std::optional<DynamicTemporaries::LocationAndShape> DynamicTemporaries::lookup(
        SourceOperandIndex sourceOperandIndex, bool mustBeAllocated) const {
    CHECK(mDeclared);
    if (auto it = mSourceOperandToTemporary.find(sourceOperandIndex);
        it != mSourceOperandToTemporary.end()) {
        const InternalLocationAndShape& temp = it->second;
        const bool isAllocated = allocated(temp.stepIndex);
        if (mustBeAllocated) {
            CHECK(isAllocated) << "Source operand " << toString(sourceOperandIndex)
                               << " must be allocated";
        }
        if (isAllocated) {
            return LocationAndShape{mStepIndexToMemory.at(temp.stepIndex).get(), temp.offset,
                                    &temp.dimensions, temp.paddedLength};
        } else {
            return LocationAndShape{nullptr, ~uint32_t(0), &temp.dimensions, temp.paddedLength};
        }
    }
    return std::nullopt;
}

ExecutionStep::ExecutionStep(ExecutionPlan* plan, uint32_t stepIndex, uint32_t sourceModelIndex,
                             std::shared_ptr<Device> device)
    :
      mPlan(plan),
      mIndex(stepIndex),
      mSourceModelIndex(sourceModelIndex),
      mStepModel(),
      mDevice(device),
      mToken(plan->getCacheToken()) {}

// Adds an operand if it has not been added already.
// Sets the index in the step model for the corresponding operand.
int ExecutionStep::addOperand(uint32_t sourceOperandIndex, uint32_t* stepOperandIndex,
                              OperandKind kind) {
    // Have we added this operand already?
    auto i = mOperandMap.find(sourceOperandIndex);
    if (i != mOperandMap.end()) {
        // An operand is defined (kind == OUTPUT) at most once; any repeat
        // sighting must be as an input.
        CHECK(kind == INPUT);
        *stepOperandIndex = i->second;
        return ANEURALNETWORKS_NO_ERROR;
    }

    // First time we add this operand.
    *stepOperandIndex = mStepModel.operandCount();
    mOperandMap.emplace(sourceOperandIndex, *stepOperandIndex);

    // Add the operand to the step model.
    const ModelBuilder& sourceModel = *getSourceModel();
    const Operand& operand = sourceModel.getOperand(sourceOperandIndex);
    ANeuralNetworksOperandType type = {
            .type = static_cast<int32_t>(operand.type),
            .dimensionCount = static_cast<uint32_t>(operand.dimensions.size()),
            .dimensions = operand.dimensions.size() > 0 ? operand.dimensions.data() : nullptr,
            .scale = operand.scale,
            .zeroPoint = operand.zeroPoint,
    };

    int n = mStepModel.addOperand(type);
    if (n != ANEURALNETWORKS_NO_ERROR) {
        LOG(ERROR) << "Previous error occurred when partitioning the graph";
        return n;
    }

    n = copyOperandExtraParams(mStepModel, *stepOperandIndex, operand);
    if (n != ANEURALNETWORKS_NO_ERROR) {
        LOG(ERROR) << "Error when copying extra parameters to the operand";
        return n;
    }

    // Sets its value.
    switch (operand.lifetime) {
        case Operand::LifeTime::CONSTANT_COPY: {
            const uint8_t* data = sourceModel.getPointerToOperandValue(operand.location.offset);
            n = mStepModel.setOperandValue(*stepOperandIndex, data, operand.location.length);
        } break;
        case Operand::LifeTime::CONSTANT_REFERENCE: {
            const RuntimeMemory* memory = sourceModel.getMemories()[operand.location.poolIndex];
            n = mStepModel.setOperandValueFromMemory(
                    *stepOperandIndex, memory, operand.location.offset, operand.location.length);
        } break;
        case Operand::LifeTime::NO_VALUE: {
            n = mStepModel.setOperandValue(*stepOperandIndex, nullptr, 0);
        } break;
        case Operand::LifeTime::TEMPORARY_VARIABLE: {  // handled similarly to SUBGRAPH_OUTPUT
            if (kind == INPUT) {
                // The first time we've seen this operand is as an
                // input. That means it must be defined by a
                // different partition, and is an input to this one.
                mTempsAsStepModelInputs.emplace_back(sourceOperandIndex, *stepOperandIndex);
            } else {
                // The first time we've seen this operand is as an
                // output. It may be an input to a different
                // partition, so keep track of it.
                mPlan->recordTemporaryDef(SourceOperandIndex(mSourceModelIndex, sourceOperandIndex),
                                          mIndex);
            }
        } break;
        case Operand::LifeTime::SUBGRAPH_INPUT: {
            mModelInputs.emplace_back(sourceOperandIndex, *stepOperandIndex);
        } break;
        case Operand::LifeTime::SUBGRAPH_OUTPUT: {  // handled similarly to TEMPORARY_VARIABLE
            if (kind == INPUT) {
                // The first time we've seen this operand is as an
                // input. That means it must be defined by a
                // different partition, and is an input to this one.
                mOutputsAsStepModelInputs.emplace_back(sourceOperandIndex, *stepOperandIndex);
            } else {
                // The first time we've seen this operand is as an
                // output.
                mModelOutputs.emplace_back(sourceOperandIndex, *stepOperandIndex);
                // It may be an input to a different partition, so keep track of
                // it.
                mPlan->recordOutputDef(SourceOperandIndex(mSourceModelIndex, sourceOperandIndex),
                                       mIndex);
            }
        } break;
        case Operand::LifeTime::SUBGRAPH: {
            const ModelBuilder* model = sourceModel.getReferencedModel(operand);
            n = mStepModel.setOperandValueFromModel(*stepOperandIndex, model);
        } break;
        case Operand::LifeTime::POINTER: {
            const void* data = std::get<const void*>(operand.location.pointer);
            n = mStepModel.setOperandValue(*stepOperandIndex, data, operand.location.length);
        } break;
    }

    if (n != ANEURALNETWORKS_NO_ERROR) {
        LOG(ERROR) << "Previous error occurred when partitioning the graph";
    }
    return n;
}

// Adds one source-model operation (and any operands of it not yet added) to
// the step model, folding the (source model index, operation index) pair into
// the compilation cache token so the token reflects the exact partition.
int ExecutionStep::addOperation(int operationIndex) {
    const Operation& operation = getSourceModel()->getOperation(operationIndex);
    if (mToken.ok()) {
        mToken.update(&mSourceModelIndex, sizeof(mSourceModelIndex));
        mToken.update(&operationIndex, sizeof(operationIndex));
    }

    // Convert the input and output operand indexes.
    //
    // We expect operations to be added in topological order. Therefore:
    //
    // - We may not have seen an input if it is a model input, a
    //   constant, or an operand written by a different partition.
    //
    // - We should not have seen any outputs.
    auto addOperands = [this](const std::vector<uint32_t>& sourceModelOperands,
                              std::vector<uint32_t>* stepModelOperands, OperandKind kind) -> int {
        const uint32_t operandCount = static_cast<uint32_t>(sourceModelOperands.size());
        for (uint32_t i = 0; i < operandCount; i++) {
            NN_RETURN_IF_ERROR(addOperand(sourceModelOperands[i], &stepModelOperands->at(i), kind));
        }
        return ANEURALNETWORKS_NO_ERROR;
    };

    const uint32_t inputCount = static_cast<uint32_t>(operation.inputs.size());
    const uint32_t outputCount = static_cast<uint32_t>(operation.outputs.size());
    std::vector<uint32_t> inputs(inputCount);
    std::vector<uint32_t> outputs(outputCount);
    NN_RETURN_IF_ERROR(addOperands(operation.inputs, &inputs, INPUT));
    NN_RETURN_IF_ERROR(addOperands(operation.outputs, &outputs, OUTPUT));
    return mStepModel.addOperation(static_cast<uint32_t>(operation.type), inputCount, inputs.data(),
                                   outputCount, outputs.data());
}

// Binds each step-model input/output of `executor` to its backing storage.
// Inputs are resolved in this order: static temporary, dynamic temporary,
// main-model input, main-model output consumed downstream, constant boundary
// operand. Outputs: static temporary, dynamic temporary, main-model output.
// An unmapped operand is a CHECK failure (partitioning bug).
void ExecutionStep::mapInputsAndOutputs(
        std::shared_ptr<StepExecutor> executor,
        const std::vector<OutputShape>* mainModelOutputShapes, const RuntimeMemory* temporaryMemory,
        const std::map<SourceOperandIndex, StaticTemporaryLocation>&
                sourceOperandToLocationOfTemporary,
        const DynamicTemporaries& dynamicTemporaries,
        const std::map<SourceOperandIndex, uint32_t>& sourceOperandToInputIndex,
        const std::map<SourceOperandIndex, uint32_t>& sourceOperandToOutputIndex,
        const std::map<SourceOperandIndex, ConstantReferenceLocation>&
                sourceOperandToConstantReference) const {
    auto mapInput = [&](uint32_t stepModelOperandIndex, uint32_t stepInputIndex) {
        SourceOperandIndex sourceOperandIndex(mSourceModelIndex, stepModelOperandIndex);
        if (auto it = sourceOperandToLocationOfTemporary.find(sourceOperandIndex);
            it != sourceOperandToLocationOfTemporary.end()) {
            const auto& loc = it->second;
            executor->setInputFromMemory(stepInputIndex, temporaryMemory, loc.offset,
                                         loc.paddedLength);
        } else if (auto loc = dynamicTemporaries.lookup(sourceOperandIndex); loc != std::nullopt) {
            executor->setInputFromMemory(stepInputIndex, loc->memory, loc->offset,
                                         loc->paddedLength, *loc->dimensions);
        } else if (auto it = sourceOperandToInputIndex.find(sourceOperandIndex);
                   it != sourceOperandToInputIndex.end()) {
            executor->mapInput(it->second, stepInputIndex);
        } else if (auto it = sourceOperandToOutputIndex.find(sourceOperandIndex);
                   it != sourceOperandToOutputIndex.end()) {
            // A main-model output produced by an upstream step and consumed
            // here; shapes are propagated when known.
            executor->mapOutputToInput(it->second, stepInputIndex,
                                       mainModelOutputShapes
                                               ? &mainModelOutputShapes->at(it->second).dimensions
                                               : nullptr);
        } else if (auto it = sourceOperandToConstantReference.find(sourceOperandIndex);
                   it != sourceOperandToConstantReference.end()) {
            // Constant partition boundary operand. This could be an IF branch
            // model input or a WHILE variable initializer.
            const auto& loc = it->second;
            executor->setInputFromMemory(stepInputIndex, loc.memory, loc.offset, loc.length);
        } else {
            CHECK(false) << "Cannot map step input " << stepInputIndex << " from operand "
                         << toString(sourceOperandIndex);
        }
    };
    auto mapOutput = [&](uint32_t stepModelOperandIndex, uint32_t stepOutputIndex) {
        SourceOperandIndex sourceOperandIndex(mSourceModelIndex, stepModelOperandIndex);
        if (auto it = sourceOperandToLocationOfTemporary.find(sourceOperandIndex);
            it != sourceOperandToLocationOfTemporary.end()) {
            const auto& loc = it->second;
            executor->setOutputFromMemory(stepOutputIndex, temporaryMemory, loc.offset,
                                          loc.paddedLength);
        } else if (auto loc = dynamicTemporaries.lookup(sourceOperandIndex); loc != std::nullopt) {
            executor->setOutputFromMemory(stepOutputIndex, loc->memory, loc->offset,
                                          loc->paddedLength, *loc->dimensions);
        } else if (auto it = sourceOperandToOutputIndex.find(sourceOperandIndex);
                   it != sourceOperandToOutputIndex.end()) {
            executor->mapOutput(it->second, stepOutputIndex);
        } else {
            CHECK(false) << "Cannot map step output " << stepOutputIndex << " from operand "
                         << toString(sourceOperandIndex);
        }
    };
    for (uint32_t i = 0, n = mStepModelInputs.size(); i < n; ++i) {
        mapInput(mStepModelInputs[i].first, i);
    }
    for (uint32_t i = 0, n = mStepModelOutputs.size(); i < n; ++i) {
        mapOutput(mStepModelOutputs[i].first, i);
    }
}

// For every main-model output that some step consumes as an input, tells the
// defining step that this output is also a downstream input.
void ExecutionPlan::CompoundBody::findModelOutputsThatAreDownstreamInputs() {
    auto declareModelOutputIsDownstreamInput =
            [this](const SourceOperandIndex& sourceOperandIndex) {
                const auto it = mOutputToDefiningExecutionStep.find(sourceOperandIndex);
                CHECK(it != mOutputToDefiningExecutionStep.end());
                uint32_t stepIndex = it->second;
                CHECK_LT(stepIndex, mSteps.size());
                VLOG(COMPILATION)
                        << "ExecutionStep(" << stepIndex
                        << ")->declareModelOutputIsDownstreamInput(mSourceOperandToOutputIndex.at"
                        << toString(sourceOperandIndex) << ")";
                CHECK(mSourceOperandToOutputIndex.find(sourceOperandIndex) !=
                      mSourceOperandToOutputIndex.end());
                mSteps[stepIndex]->executionStep()->declareModelOutputIsDownstreamInput(
                        mSourceOperandToOutputIndex.at(sourceOperandIndex));
            };
    for (const auto& logicalStep : mSteps) {
        if (const ExecutionStep* step = logicalStep->tryExecutionStep()) {
            for (const auto& output : step->getOutputsAsStepModelInputs()) {
                SourceOperandIndex sourceOperandIndex(step->getSourceModelIndex(), output.first);
                declareModelOutputIsDownstreamInput(sourceOperandIndex);
            }
        }
    }
}

// For every temporary consumed across a step/control-flow boundary, tells the
// defining step to expose that temporary as a step-model output.
void ExecutionPlan::CompoundBody::findTempsAsStepModelOutputs() {
    auto recordAsOutputIfTemporary = [this](const SourceOperandIndex& sourceOperandIndex) {
        const auto it = mTemporaryToDefiningExecutionStep.find(sourceOperandIndex);
        if (it == mTemporaryToDefiningExecutionStep.end()) {
            // The operand is not a temporary or is not defined by an
            // ExecutionStep (i.e. it's an output of an IF or a WHILE).
            // The latter case is handled by ExecutionPlan::makeController().
688 return; 689 } 690 uint32_t stepIndex = it->second; 691 CHECK_LT(stepIndex, mSteps.size()); 692 mSteps[stepIndex]->executionStep()->recordTempAsStepModelOutput(sourceOperandIndex.second); 693 }; 694 for (const auto& logicalStep : mSteps) { 695 if (const ExecutionStep* step = logicalStep->tryExecutionStep()) { 696 for (const auto& input : step->getTempsAsStepModelInputs()) { 697 SourceOperandIndex sourceOperandIndex(step->getSourceModelIndex(), input.first); 698 recordAsOutputIfTemporary(sourceOperandIndex); 699 } 700 } else if (const IfStep* step = logicalStep->tryIfStep()) { 701 recordAsOutputIfTemporary(step->conditionOperandIndex); 702 for (const SourceOperandIndex& sourceOperandIndex : step->outerInputOperands) { 703 recordAsOutputIfTemporary(sourceOperandIndex); 704 } 705 } else if (const WhileStep* step = logicalStep->tryWhileStep()) { 706 for (const SourceOperandIndex& sourceOperandIndex : step->outerInputOperands) { 707 recordAsOutputIfTemporary(sourceOperandIndex); 708 } 709 } else { 710 CHECK(logicalStep->isGoto()); 711 } 712 } 713 } 714 declareModelOutputIsDownstreamInput(uint32_t mainModelOutputIndex)715 void ExecutionStep::declareModelOutputIsDownstreamInput(uint32_t mainModelOutputIndex) { 716 VLOG(COMPILATION) << "ExecutionStep(" << mIndex << ")::declareModelOutputIsDownstreamInput(" 717 << mainModelOutputIndex << ")"; 718 const auto it = std::find(mOutputIndexStepModelToMainModel.begin(), 719 mOutputIndexStepModelToMainModel.end(), mainModelOutputIndex); 720 CHECK(it != mOutputIndexStepModelToMainModel.end()); 721 const uint32_t stepModelOutputIndex = it - mOutputIndexStepModelToMainModel.begin(); 722 CHECK(stepModelOutputIndex < mModelOutputs.size()); 723 mModelOutputsThatAreDownstreamInputs.insert(stepModelOutputIndex); 724 } 725 recordTempAsStepModelOutput(uint32_t stepOperandIndex)726 void ExecutionStep::recordTempAsStepModelOutput(uint32_t stepOperandIndex) { 727 const auto it = mOperandMap.find(stepOperandIndex); 728 CHECK(it != 
mOperandMap.end()); 729 mTempsAsStepModelOutputs.emplace(stepOperandIndex, it->second); 730 } 731 getSourceModel() const732 const ModelBuilder* ExecutionStep::getSourceModel() const { 733 return mPlan->getSourceModels().getModel(mSourceModelIndex); 734 } 735 logStepModel() const736 void ExecutionStep::logStepModel() const { 737 VLOG(COMPILATION) << "ExecutionStep::finishStepModel, step " << mIndex; 738 739 auto logRemapEntry = [](std::string& toLog, const std::pair<uint32_t, uint32_t>& e) { 740 if (!toLog.empty()) { 741 toLog += ", "; 742 } 743 toLog += toString(e.first); 744 toLog += "->"; 745 toLog += toString(e.second); 746 }; 747 748 auto logRemapVector = [&logRemapEntry](const char* name, const RemapVectorType& map) { 749 std::string toLog; 750 for (const auto& e : map) { 751 logRemapEntry(toLog, e); 752 } 753 VLOG(COMPILATION) << name << ": " << toLog; 754 }; 755 auto logRemapSet = [&logRemapEntry](const char* name, const StepModelOutputSetType& set) { 756 std::string toLog; 757 for (const auto& e : set) { 758 logRemapEntry(toLog, e); 759 } 760 VLOG(COMPILATION) << name << ": " << toLog; 761 }; 762 763 logRemapVector("step model inputs", mStepModelInputs); 764 logRemapVector("step model outputs", mStepModelOutputs); 765 logRemapVector("model inputs", mModelInputs); 766 logRemapVector("model outputs", mModelOutputs); 767 logRemapVector("temps as step model inputs", mTempsAsStepModelInputs); 768 logRemapSet("temps as step model outputs", mTempsAsStepModelOutputs); 769 logRemapVector("outputs as step model inputs", mOutputsAsStepModelInputs); 770 } 771 hasUnknownSize(const Operand & operand)772 static bool hasUnknownSize(const Operand& operand) { 773 if (operand.dimensions.empty()) { 774 return TypeManager::get()->isTensorType(operand.type); 775 } 776 for (const Dimension& dimension : operand.dimensions) { 777 if (dimension == 0) { 778 return true; 779 } 780 } 781 return false; 782 } 783 finishStepModel(const ModelBuilder * mainModel,bool * 
hasOutputOfUnknownSize,int32_t executionPreference,int32_t priority)784 int ExecutionStep::finishStepModel(const ModelBuilder* mainModel, bool* hasOutputOfUnknownSize, 785 int32_t executionPreference, int32_t priority) { 786 CHECK(mDevice != nullptr); 787 788 for (const auto& stepModelOutput : mTempsAsStepModelOutputs) { 789 const Operand& operand = mStepModel.getOperand(stepModelOutput.second); 790 if (hasUnknownSize(operand)) { 791 *hasOutputOfUnknownSize = true; 792 VLOG(COMPILATION) << "StepModelOutput (operand#" << stepModelOutput.first 793 << " of source graph) has unknown size: " << operand; 794 } 795 } 796 797 mStepModel.relaxComputationFloat32toFloat16(mainModel->isComputationFloat32RelaxedToFloat16()); 798 799 mStepModelInputs.insert(mStepModelInputs.end(), mModelInputs.begin(), mModelInputs.end()); 800 mStepModelInputs.insert(mStepModelInputs.end(), mTempsAsStepModelInputs.begin(), 801 mTempsAsStepModelInputs.end()); 802 mStepModelInputs.insert(mStepModelInputs.end(), mOutputsAsStepModelInputs.begin(), 803 mOutputsAsStepModelInputs.end()); 804 805 mStepModelOutputs.insert(mStepModelOutputs.end(), mModelOutputs.begin(), mModelOutputs.end()); 806 mStepModelOutputs.insert(mStepModelOutputs.end(), mTempsAsStepModelOutputs.begin(), 807 mTempsAsStepModelOutputs.end()); 808 809 // A step model with no inputs or no outputs is an invalid model. Note that we would like to 810 // attempt full CPU fallback if allowed, so we return OP_FAILED here rather than BAD_DATA from 811 // model validation. 
812 if (hasNoInputsOrNoOutputs()) { 813 VLOG(COMPILATION) << "ExecutionStep::finishStepModel: finishing step model with no inputs " 814 "or no outputs"; 815 return ANEURALNETWORKS_OP_FAILED; 816 } 817 818 if (mSourceModelIndex == kMainModelInSourceModels) { 819 std::map<uint32_t, uint32_t> mainModelOperandToInputIndex; 820 for (uint32_t i = 0, n = mainModel->inputCount(); i < n; ++i) { 821 mainModelOperandToInputIndex[mainModel->getInputOperandIndex(i)] = i; 822 } 823 std::map<uint32_t, uint32_t> mainModelOperandToOutputIndex; 824 for (uint32_t i = 0, n = mainModel->outputCount(); i < n; ++i) { 825 mainModelOperandToOutputIndex[mainModel->getOutputOperandIndex(i)] = i; 826 } 827 828 // mInputIndexStepModelToMainModel is ordered by step model input index and relies on 829 // mModelInputs being the first inputs, as specified by mStepModelInputs. 830 mInputIndexStepModelToMainModel.resize(mModelInputs.size()); 831 std::transform(mModelInputs.begin(), mModelInputs.end(), 832 mInputIndexStepModelToMainModel.begin(), 833 [&mainModelOperandToInputIndex](auto& e) { 834 uint32_t sourceOperandIndex = e.first; 835 return mainModelOperandToInputIndex[sourceOperandIndex]; 836 }); 837 838 // mOutputIndexStepModelToMainModel is ordered by step model output index and relies on 839 // mModelOutputs being the first outputs, as specified by mStepModelOutputs. 840 mOutputIndexStepModelToMainModel.resize(mModelOutputs.size()); 841 std::transform(mModelOutputs.begin(), mModelOutputs.end(), 842 mOutputIndexStepModelToMainModel.begin(), 843 [&mainModelOperandToOutputIndex](auto& e) { 844 uint32_t sourceOperandIndex = e.first; 845 return mainModelOperandToOutputIndex[sourceOperandIndex]; 846 }); 847 848 // mOutputsAsStepModelInputsIndexToMainModel is ordered by step model input index and relies 849 // on mOutputsAsStepModelInputs being the first outputs. 
850 mOutputsAsStepModelInputsIndexToMainModel.resize(mOutputsAsStepModelInputs.size()); 851 std::transform(mOutputsAsStepModelInputs.begin(), mOutputsAsStepModelInputs.end(), 852 mOutputsAsStepModelInputsIndexToMainModel.begin(), 853 [&mainModelOperandToOutputIndex](auto& e) { 854 uint32_t sourceOperandIndex = e.first; 855 return mainModelOperandToOutputIndex[sourceOperandIndex]; 856 }); 857 } 858 859 if (VLOG_IS_ON(COMPILATION)) { 860 logStepModel(); 861 } 862 863 std::vector<uint32_t> inputs(mStepModelInputs.size()); 864 std::vector<uint32_t> outputs(mStepModelOutputs.size()); 865 std::transform(mStepModelInputs.begin(), mStepModelInputs.end(), inputs.begin(), 866 [](auto& e) { return e.second; }); 867 std::transform(mStepModelOutputs.begin(), mStepModelOutputs.end(), outputs.begin(), 868 [](auto& e) { return e.second; }); 869 NN_RETURN_IF_ERROR(mStepModel.identifyInputsAndOutputs(inputs.size(), inputs.data(), 870 outputs.size(), outputs.data())); 871 NN_RETURN_IF_ERROR(mStepModel.finish()); 872 873 // TODO: Move compilation elsewhere? 
874 VLOG(COMPILATION) << "ExecutionStep::finishStepModel, compilation on " << mDevice->getName(); 875 return compile(*mDevice, mStepModel, executionPreference, priority, {}, *mPlan->getCacheInfo(), 876 &mToken, {}, &mPreparedStepModel); 877 } 878 dump() const879 void ExecutionStep::dump() const { 880 if (VLOG_IS_ON(COMPILATION)) { 881 VLOG(COMPILATION) << "Step#" << mIndex << ": execute on " << mDevice->getName(); 882 logModelToInfo(mStepModel.makeModel()); 883 } 884 } 885 operator <<(std::ostream & os,const IfStep & step)886 std::ostream& operator<<(std::ostream& os, const IfStep& step) { 887 return os << "Step#" << step.index << ": if " << toString(step.conditionOperandIndex) 888 << " then=" << step.thenStepIndex << " else=" << step.elseStepIndex; 889 } 890 operator <<(std::ostream & os,const WhileStep & step)891 std::ostream& operator<<(std::ostream& os, const WhileStep& step) { 892 return os << "Step#" << step.index << ": while cond=" << step.condStepIndex 893 << " body=" << step.bodyStepIndex << " exit=" << step.exitStepIndex; 894 } 895 operator <<(std::ostream & os,const GotoStep & step)896 std::ostream& operator<<(std::ostream& os, const GotoStep& step) { 897 return os << "Step#" << step.index << ": goto " << step.gotoStepIndex; 898 } 899 dump() const900 void LogicalStep::dump() const { 901 if (VLOG_IS_ON(COMPILATION)) { 902 if (const IfStep* step = tryIfStep()) { 903 VLOG(COMPILATION) << *step; 904 } else if (const WhileStep* step = tryWhileStep()) { 905 VLOG(COMPILATION) << *step; 906 } else if (const GotoStep* step = tryGotoStep()) { 907 VLOG(COMPILATION) << *step; 908 } else { 909 executionStep()->dump(); 910 } 911 } 912 } 913 finish(const SourceModels * sourceModels,int32_t executionPreference,int32_t priority,const OptionalTimePoint & deadline,const std::vector<TokenValuePair> & metadata,int simulateFailureResultCode)914 int ExecutionPlan::CompoundBody::finish(const SourceModels* sourceModels, 915 int32_t executionPreference, int32_t priority, 916 
const OptionalTimePoint& deadline, 917 const std::vector<TokenValuePair>& metadata, 918 int simulateFailureResultCode) { 919 CHECK(!mSuccessfulFinish); 920 CHECK(!deadline.has_value()); 921 CHECK(metadata.empty()); 922 923 const ModelBuilder* mainModel = sourceModels->getModel(kMainModelInSourceModels); 924 925 auto containsUnknownSize = [sourceModels](const std::vector<SourceOperandIndex>& operands) { 926 for (const auto& sourceOperandIndex : operands) { 927 const ModelBuilder* sourceModel = sourceModels->getModel(sourceOperandIndex.first); 928 const Operand& operand = sourceModel->getOperand(sourceOperandIndex.second); 929 if (hasUnknownSize(operand)) { 930 return true; 931 } 932 } 933 return false; 934 }; 935 936 findTempsAsStepModelOutputs(); 937 for (const auto& logicalStep : mSteps) { 938 if (ExecutionStep* step = logicalStep->tryExecutionStep()) { 939 bool stepHasDynamicTemporaries = false; 940 int n = step->finishStepModel(mainModel, &stepHasDynamicTemporaries, 941 executionPreference, priority); 942 if (stepHasDynamicTemporaries) { 943 mHasDynamicTemporaries = true; 944 if (!isCompliantVersion(kHalVersionV1_2ToApi.canonical, 945 step->getDevice()->getFeatureLevel())) { 946 // Until HAL 1.2, an Operand with lifetime SUBGRAPH_OUTPUT 947 // must have fully specified dimensions either in the 948 // Operand or in the RequestArgument. In the case of a 949 // dynamic temporary, we won't be able to supply fully 950 // specified dimensions in either. 
951 VLOG(COMPILATION) 952 << "ExecutionPlan::CompoundBody::finish -- step#" << step->getIndex() 953 << " defines dynamic temporaries but is scheduled on pre-1.2 device " 954 << step->getDevice()->getName(); 955 if (n == ANEURALNETWORKS_NO_ERROR) { 956 n = ANEURALNETWORKS_OP_FAILED; 957 } 958 } 959 } 960 if (n != ANEURALNETWORKS_NO_ERROR) { 961 VLOG(COMPILATION) 962 << "ExecutionPlan::CompoundBody::finish -- finishStepModel failed"; 963 return n; 964 } 965 } else if (IfStep* step = logicalStep->tryIfStep()) { 966 // The partitioner does not support dynamic temporaries (b/132458982). 967 CHECK(!containsUnknownSize(step->outerInputOperands)); 968 CHECK(!containsUnknownSize(step->outerOutputOperands)); 969 // step->conditionOperandIndex has a static shape. See b/158557728. 970 CHECK(!containsUnknownSize(step->thenBranchInputOperands)); 971 CHECK(!containsUnknownSize(step->thenBranchOutputOperands)); 972 CHECK(!containsUnknownSize(step->elseBranchInputOperands)); 973 CHECK(!containsUnknownSize(step->elseBranchOutputOperands)); 974 } else if (WhileStep* step = logicalStep->tryWhileStep()) { 975 // The partitioner does not support dynamic temporaries (b/132458982). 976 CHECK(!containsUnknownSize(step->outerInputOperands)); 977 CHECK(!containsUnknownSize(step->outerOutputOperands)); 978 CHECK(!containsUnknownSize(step->condInputOperands)); 979 // step->condOutputOperand has a static shape. See b/158557728. 
980 CHECK(!containsUnknownSize(step->bodyInputOperands)); 981 CHECK(!containsUnknownSize(step->bodyOutputOperands)); 982 } else { 983 CHECK(logicalStep->isGoto()); 984 } 985 } 986 987 if (simulateFailureResultCode != ANEURALNETWORKS_NO_ERROR) { 988 VLOG(COMPILATION) << "ExecutionPlan::CompoundeBody::finish: simulating failure, ResultCode " 989 << simulateFailureResultCode; 990 return simulateFailureResultCode; 991 } 992 993 for (uint32_t i = 0, n = mainModel->inputCount(); i < n; ++i) { 994 SourceOperandIndex index(kMainModelInSourceModels, mainModel->getInputOperandIndex(i)); 995 mSourceOperandToInputIndex[index] = i; 996 } 997 for (uint32_t i = 0, n = mainModel->outputCount(); i < n; ++i) { 998 SourceOperandIndex index(kMainModelInSourceModels, mainModel->getOutputOperandIndex(i)); 999 mSourceOperandToOutputIndex[index] = i; 1000 } 1001 1002 findControlFlowBoundaryConstants(sourceModels); 1003 findModelOutputsThatAreDownstreamInputs(); 1004 findMemoryStepRoles(); 1005 1006 mSuccessfulFinish = true; 1007 LOG(INFO) << "ExecutionPlan::CompoundBody::finish: compilation finished successfully"; 1008 return ANEURALNETWORKS_NO_ERROR; 1009 } 1010 findControlFlowBoundaryConstants(const SourceModels * sourceModels)1011 void ExecutionPlan::CompoundBody::findControlFlowBoundaryConstants( 1012 const SourceModels* sourceModels) { 1013 auto handleBoundaryConstants = [this, 1014 sourceModels](const SourceOperandIndex& sourceOperandIndex) { 1015 const ModelBuilder* sourceModel = sourceModels->getModel(sourceOperandIndex.first); 1016 const Operand& operand = sourceModel->getOperand(sourceOperandIndex.second); 1017 const DataLocation& location = operand.location; 1018 if (operand.lifetime == Operand::LifeTime::CONSTANT_COPY) { 1019 mSourceOperandToBoundaryConstantCopy[sourceOperandIndex] = { 1020 .buffer = sourceModel->getPointerToOperandValue(location.offset), 1021 .length = location.length, 1022 }; 1023 } else if (operand.lifetime == Operand::LifeTime::POINTER) { 1024 
mSourceOperandToBoundaryConstantCopy[sourceOperandIndex] = { 1025 .buffer = static_cast<const uint8_t*>(std::get<const void*>(location.pointer)), 1026 .length = location.length, 1027 }; 1028 } else if (operand.lifetime == Operand::LifeTime::CONSTANT_REFERENCE) { 1029 mSourceOperandToBoundaryConstantReference[sourceOperandIndex] = { 1030 .memory = sourceModel->getMemories()[location.poolIndex], 1031 .offset = location.offset, 1032 .length = location.length, 1033 }; 1034 } 1035 }; 1036 for (const auto& logicalStep : mSteps) { 1037 if (const IfStep* step = logicalStep->tryIfStep()) { 1038 handleBoundaryConstants(step->conditionOperandIndex); 1039 for (const auto& sourceOperandIndex : step->outerInputOperands) { 1040 handleBoundaryConstants(sourceOperandIndex); 1041 } 1042 } else if (const WhileStep* step = logicalStep->tryWhileStep()) { 1043 for (const auto& sourceOperandIndex : step->outerInputOperands) { 1044 handleBoundaryConstants(sourceOperandIndex); 1045 } 1046 } 1047 } 1048 } 1049 findMemoryStepRoles()1050 void ExecutionPlan::CompoundBody::findMemoryStepRoles() { 1051 mSourceOperandToStepRoles = StepRoleAnalyzer::analyze([this](StepRoleAnalyzer& analyzer) { 1052 for (const auto& logicalStep : mSteps) { 1053 if (const ExecutionStep* step = logicalStep->tryExecutionStep()) { 1054 const auto& stepModelInputs = step->getStepModelInputs(); 1055 for (uint32_t i = 0; i < stepModelInputs.size(); i++) { 1056 const auto& [sourceIndex, stepIndex] = stepModelInputs[i]; 1057 analyzer.addRole(*step, sourceIndex, IOType::INPUT, i); 1058 } 1059 const auto& stepModelOutputs = step->getStepModelOutputs(); 1060 for (uint32_t i = 0; i < stepModelOutputs.size(); i++) { 1061 const auto& [sourceIndex, stepIndex] = stepModelOutputs[i]; 1062 analyzer.addRole(*step, sourceIndex, IOType::OUTPUT, i); 1063 } 1064 } else if (const IfStep* step = logicalStep->tryIfStep()) { 1065 // See ExecutionPlan::nextCompound(const IfStep*, ...). 
1066 // 1067 // For interpreted IF operation, the outer input memories may be directly used by 1068 // the SUBGRAPH_INPUTs of the then and else model. 1069 CHECK_EQ(step->thenBranchInputOperands.size(), step->outerInputOperands.size()); 1070 CHECK_EQ(step->elseBranchInputOperands.size(), step->outerInputOperands.size()); 1071 for (uint32_t i = 0; i < step->outerInputOperands.size(); i++) { 1072 analyzer.setUsedBy(step->outerInputOperands[i], 1073 step->thenBranchInputOperands[i]); 1074 analyzer.setUsedBy(step->outerInputOperands[i], 1075 step->elseBranchInputOperands[i]); 1076 } 1077 // For interpreted IF operation, the outer output memories may be directly used by 1078 // the SUBGRAPH_OUTPUTs of the then and else model. 1079 CHECK_EQ(step->thenBranchOutputOperands.size(), step->outerOutputOperands.size()); 1080 CHECK_EQ(step->elseBranchOutputOperands.size(), step->outerOutputOperands.size()); 1081 for (uint32_t i = 0; i < step->outerOutputOperands.size(); i++) { 1082 analyzer.setUsedBy(step->outerOutputOperands[i], 1083 step->thenBranchOutputOperands[i]); 1084 analyzer.setUsedBy(step->outerOutputOperands[i], 1085 step->elseBranchOutputOperands[i]); 1086 } 1087 } else if (const WhileStep* step = logicalStep->tryWhileStep()) { 1088 // See ExecutionPlan::nextCompound(const WhileStep*, ...). 1089 // 1090 // For interpreted WHILE operation, the following memories are involved: 1091 // a. the outer input memories to the WHILE operation 1092 // b. the outer output memories to the WHILE operation 1093 // c. the output memory of the condition model 1094 // d. one set of output memories of the body model 1095 // e. 
another set of output memories of the body model 1096 // 1097 // The memories are used in the following ways: 1098 // 1099 // - Condition model: 1100 // * In the first iteration: inputs use (a); output uses (c) 1101 // * In the following iterations: inputs use (d) or (e) for input-output and 1102 // state-only operands, and (a) for input-only operands; output uses (c) 1103 // 1104 // - Body model: 1105 // * In all iterations: inputs are the same as the condition model; outputs use 1106 // (d) or (e) 1107 // 1108 // Therefore, we configure the analyzer with the following used-by relationships: 1109 // - The outer input memories (a) may be directly used by the SUBGRAPH_INPUTs of 1110 // the condition model for all inputs in the first iteration, as well as the 1111 // input-only operands in the following iterations. 1112 CHECK_EQ(step->condInputOperands.size(), step->outerInputOperands.size()); 1113 for (uint32_t i = 0; i < step->outerInputOperands.size(); i++) { 1114 analyzer.setUsedBy(step->outerInputOperands[i], step->condInputOperands[i]); 1115 } 1116 // - The output memories of the body model (d) and (e) may be directly used by the 1117 // SUBGRAPH_INPUTs of the condition model for input-output and state-only operands 1118 // after the first iteration. 1119 CHECK_GE(step->condInputOperands.size(), step->bodyOutputOperands.size()); 1120 for (uint32_t i = 0; i < step->bodyOutputOperands.size(); i++) { 1121 analyzer.setUsedBy(step->bodyOutputOperands[i], step->condInputOperands[i]); 1122 } 1123 // - The SUBGRAPH_INPUTs of the condition model are directly used by the 1124 // SUBGRAPH_INPUTs of the body model for all inputs in all iterations. 1125 CHECK_EQ(step->bodyInputOperands.size(), step->condInputOperands.size()); 1126 for (uint32_t i = 0; i < step->bodyInputOperands.size(); i++) { 1127 analyzer.setUsedBy(step->condInputOperands[i], step->bodyInputOperands[i]); 1128 } 1129 } else if (logicalStep->isGoto()) { 1130 // Nothing to do. 
1131 } else { 1132 CHECK(false) << "Unexpected LogicalStep kind"; 1133 } 1134 } 1135 }); 1136 } 1137 finish(const SourceModels *,int32_t executionPreference,int32_t priority,const OptionalTimePoint & deadline,const std::vector<TokenValuePair> & metadata,int simulateFailureResultCode)1138 int ExecutionPlan::SimpleBody::finish(const SourceModels*, int32_t executionPreference, 1139 int32_t priority, const OptionalTimePoint& deadline, 1140 const std::vector<TokenValuePair>& metadata, 1141 int simulateFailureResultCode) { 1142 CHECK(!mSuccessfulFinish); 1143 CHECK(mDevice != nullptr); 1144 VLOG(COMPILATION) << "ExecutionPlan::SimpleBody::finish, compilation"; 1145 int n = compile(*mDevice, *mModel, executionPreference, priority, deadline, *mCacheInfo, 1146 &mToken, metadata, &mPreparedModel); 1147 if (n == ANEURALNETWORKS_NO_ERROR && simulateFailureResultCode != ANEURALNETWORKS_NO_ERROR) { 1148 VLOG(COMPILATION) << "ExecutionPlan::SimpleBody::finish: simulating failure, ResultCode " 1149 << simulateFailureResultCode; 1150 n = simulateFailureResultCode; 1151 } 1152 mSuccessfulFinish = (n == ANEURALNETWORKS_NO_ERROR); 1153 if (mSuccessfulFinish) { 1154 LOG(INFO) << "ExecutionPlan::SimpleBody::finish: compilation finished successfully on " 1155 << mDevice->getName(); 1156 } 1157 return n; 1158 } 1159 finish(int32_t executionPreference,int32_t priority,const OptionalTimePoint & deadline,const std::vector<TokenValuePair> & metadata,int simulateFailureResultCode)1160 int ExecutionPlan::finish(int32_t executionPreference, int32_t priority, 1161 const OptionalTimePoint& deadline, 1162 const std::vector<TokenValuePair>& metadata, 1163 int simulateFailureResultCode) { 1164 CHECK(mBody != nullptr); 1165 return mBody->finish(&getSourceModels(), executionPreference, priority, deadline, metadata, 1166 simulateFailureResultCode); 1167 } 1168 Controller(const ExecutionPlan * plan,ExecutionBuilder * executionBuilder,const BurstBuilder * burstBuilder,uint32_t 
totalSizeOfTemporaries,std::map<SourceOperandIndex,StaticTemporaryLocation> sourceOperandToLocationOfTemporary,std::map<SourceOperandIndex,StaticTemporaryLocation> sourceOperandToLocationOfTemporary2,std::map<SourceOperandIndex,uint32_t> sourceOperandToInputIndex,std::map<SourceOperandIndex,uint32_t> sourceOperandToOutputIndex,const std::map<SourceOperandIndex,ConstantCopyLocation> & sourceOperandToConstantCopy,std::map<SourceOperandIndex,ConstantReferenceLocation> sourceOperandToConstantReference,DynamicTemporaries dynamicTemporaries)1169 ExecutionPlan::Controller::Controller( 1170 const ExecutionPlan* plan, ExecutionBuilder* executionBuilder, 1171 const BurstBuilder* burstBuilder, uint32_t totalSizeOfTemporaries, 1172 std::map<SourceOperandIndex, StaticTemporaryLocation> sourceOperandToLocationOfTemporary, 1173 std::map<SourceOperandIndex, StaticTemporaryLocation> sourceOperandToLocationOfTemporary2, 1174 std::map<SourceOperandIndex, uint32_t> sourceOperandToInputIndex, 1175 std::map<SourceOperandIndex, uint32_t> sourceOperandToOutputIndex, 1176 const std::map<SourceOperandIndex, ConstantCopyLocation>& sourceOperandToConstantCopy, 1177 std::map<SourceOperandIndex, ConstantReferenceLocation> sourceOperandToConstantReference, 1178 DynamicTemporaries dynamicTemporaries) 1179 : mPlan(plan), 1180 mExecutionBuilder(executionBuilder), 1181 mBurstBuilder(burstBuilder), 1182 mSourceOperandToLocationOfTemporary(std::move(sourceOperandToLocationOfTemporary)), 1183 mSourceOperandToLocationOfTemporary2(std::move(sourceOperandToLocationOfTemporary2)), 1184 mSourceOperandToInputIndex(std::move(sourceOperandToInputIndex)), 1185 mSourceOperandToOutputIndex(std::move(sourceOperandToOutputIndex)), 1186 mSourceOperandToConstantReference(std::move(sourceOperandToConstantReference)), 1187 mDynamicTemporaries(std::move(dynamicTemporaries)), 1188 mNextStepIndex(0), 1189 mFallbackNextStepIndex(kBadStepIndex), 1190 mLastStepSyncFd(-1) { 1191 if (totalSizeOfTemporaries == 0) { 1192 return; 
1193 } 1194 int n; 1195 std::tie(n, mTemporaries) = MemoryAshmem::create(totalSizeOfTemporaries); 1196 if (n != ANEURALNETWORKS_NO_ERROR) { 1197 LOG(ERROR) << "ExecutionPlan::Controller failed to allocate temporaries"; 1198 mNextStepIndex = kBadStepIndex; 1199 } 1200 for (const auto& [sourceOperandIndex, location] : sourceOperandToConstantCopy) { 1201 memcpy(mTemporaries->getPointer() + 1202 mSourceOperandToLocationOfTemporary[sourceOperandIndex].offset, 1203 location.buffer, location.length); 1204 } 1205 } 1206 1207 // Attempt to create a burst object for each PreparedModel/Partition. If the 1208 // burst controller object cannot be made, return a nullptr in its place to 1209 // indicate the regular execution path should be used. This can occur either 1210 // because PreparedModel was nullptr (cpu was best choice), or because the 1211 // IPreparedModel was of insufficient version or failed to configure the burst. makeBursts() const1212 std::vector<SharedBurst> ExecutionPlan::makeBursts() const { 1213 switch (mState) { 1214 // burst object for each partition in the compound case 1215 case COMPOUND: { 1216 std::vector<SharedBurst> bursts; 1217 bursts.reserve(compound()->mSteps.size()); 1218 for (const auto& logicalStep : compound()->mSteps) { 1219 if (!logicalStep->isExecution()) { 1220 bursts.push_back(nullptr); 1221 continue; 1222 } 1223 if (const auto preparedModel = 1224 logicalStep->executionStep()->getPreparedStepModel()) { 1225 const auto maybeBurst = preparedModel->configureExecutionBurst(); 1226 if (!maybeBurst.has_value()) { 1227 LOG(ERROR) << "preparedModel->configureExecutionBurst() failed with " 1228 << maybeBurst.error().code << ": " << maybeBurst.error().message; 1229 } 1230 bursts.push_back(maybeBurst.value_or(nullptr)); 1231 } else { 1232 bursts.push_back(nullptr); 1233 } 1234 } 1235 return bursts; 1236 } 1237 // single burst object for the simple case 1238 case SIMPLE: { 1239 std::vector<SharedBurst> burst; 1240 auto simpleBody = simple(); 1241 if 
(const auto preparedModel = simpleBody->mPreparedModel) { 1242 const auto maybeBurst = preparedModel->configureExecutionBurst(); 1243 if (!maybeBurst.has_value()) { 1244 LOG(ERROR) << "preparedModel->configureExecutionBurst() failed with " 1245 << maybeBurst.error().code << ": " << maybeBurst.error().message; 1246 } 1247 burst.push_back(maybeBurst.value_or(nullptr)); 1248 } else { 1249 burst.push_back(nullptr); 1250 } 1251 return burst; 1252 } 1253 // no burst objects made 1254 default: 1255 return {}; 1256 } 1257 } 1258 makeController(ExecutionBuilder * executionBuilder,const BurstBuilder * burstBuilder) const1259 std::shared_ptr<ExecutionPlan::Controller> ExecutionPlan::makeController( 1260 ExecutionBuilder* executionBuilder, const BurstBuilder* burstBuilder) const { 1261 CHECK(isValid()); 1262 CHECK(mState != SIMPLE); 1263 const auto* body = compound(); 1264 // Create the layout for a RuntimeMemory object big enough to hold 1265 // - every partition boundary TEMPORARY operand that is not a dynamic temporary, and 1266 // - buffers required by the control flow implementation. 1267 // 1268 // TODO: Rethink this approach for managing temporaries. Some 1269 // alternatives: 1270 // 1271 // 1) Adopt a memory layout scheme analogous to stack allocation, 1272 // where objects of non-overlapping lifetime can occupy the same 1273 // storage. We would still have a single Memory object in this 1274 // case. 1275 // 1276 // 2) Do something like what CpuExecutor does, and do allocations 1277 // and deallocations on the fly (during execution) before first 1278 // reference and after last reference, respectively. This would 1279 // mean having one Memory object per TEMPORARY; or, in a more 1280 // complicated implementation, one Memory object per set of 1281 // temporaries that have the same lifetime. Note that the Android 1282 // system limits the number of shared memory objects, which are 1283 // what our Memory objects represent. 
1284 // 1285 uint32_t totalSizeOfTemporaries = 0; 1286 // This function has two modes of operation: 1287 // 1. When lifetime is TEMPORARY_VARIABLE, we allocate memory for 1288 // TEMPORARY_VARIABLE source operands that are not dynamic temporaries, 1289 // skip TEMPORARY_VARIABLE source operands that are dynamic temporaries, 1290 // skip SUBGRAPH_OUTPUT source operands, and panic if we see a source 1291 // operand of another lifetime. 1292 // 2. When lifetime is SUBGRAPH_OUTPUT, we allocate memory for 1293 // SUBGRAPH_OUTPUT source operands and panic if we see a source operand 1294 // of another lifetime. 1295 auto mapTemporary = [body, executionBuilder, &totalSizeOfTemporaries]( 1296 const SourceOperandIndex& sourceOperandIndex, 1297 std::map<SourceOperandIndex, StaticTemporaryLocation>* 1298 sourceOperandToLocationOfTemporary, 1299 Operand::LifeTime lifetime = 1300 Operand::LifeTime::TEMPORARY_VARIABLE) { 1301 CHECK(lifetime == Operand::LifeTime::TEMPORARY_VARIABLE || 1302 lifetime == Operand::LifeTime::SUBGRAPH_OUTPUT); 1303 const Operand& sourceOperand = executionBuilder->getSourceOperand(sourceOperandIndex); 1304 if (lifetime == Operand::LifeTime::TEMPORARY_VARIABLE && 1305 sourceOperand.lifetime == Operand::LifeTime::SUBGRAPH_OUTPUT) { 1306 // See the caller for explanation. 
            return;
        }
        CHECK_EQ(sourceOperand.lifetime, lifetime);
        const uint32_t size = TypeManager::get()->getSizeOfData(sourceOperand);
        if (size != 0u) {
            // Known size: carve a suitably aligned/padded region out of the
            // shared temporary pool and record its location.
            const auto memoryPreference =
                    body->getMemoryPreferenceOfSourceOperand(sourceOperandIndex);
            const auto loc = addTemporary(&totalSizeOfTemporaries, size, memoryPreference.alignment,
                                          memoryPreference.padding);
            auto [_, isNew] = sourceOperandToLocationOfTemporary->emplace(sourceOperandIndex, loc);
            CHECK(isNew);
            VLOG(EXECUTION) << "temp: operand " << toString(sourceOperandIndex)
                            << " offset = " << loc.offset << " paddedLength = " << loc.paddedLength;
        } else {
            // Unknown size, hence dynamic temporary. The mapping will
            // be established elsewhere (DynamicTemporaries::allocate()).
            CHECK_EQ(lifetime, Operand::LifeTime::TEMPORARY_VARIABLE);
            CHECK_EQ(sourceOperand.lifetime, Operand::LifeTime::TEMPORARY_VARIABLE);
        }
    };
    // Static (known-size) temporary locations. The second map holds an
    // alternate set of locations used for double buffering of WHILE body
    // outputs (see the bodyOutputOperands handling below).
    std::map<SourceOperandIndex, StaticTemporaryLocation> sourceOperandToLocationOfTemporary;
    std::map<SourceOperandIndex, StaticTemporaryLocation> sourceOperandToLocationOfTemporary2;
    for (const auto& logicalStep : body->mSteps) {
        if (const ExecutionStep* step = logicalStep->tryExecutionStep()) {
            // Allocate memory for ExecutionStep temporary outputs that are
            // inputs to other steps, as determined by
            // ExecutionPlan::CompoundBody::findTempsAsStepModelOutputs().
            //
            // We don't allocate memory for step model output operands with
            // source operand lifetime SUBGRAPH_OUTPUT because they will be
            // - managed by the client (main model outputs),
            // - assigned a location of another operand (when this step model
            //   output is a branch model output of an IF; see
            //   ExecutionPlan::nextCompound(const IfStep*, ...)), or
            // - allocated by a WHILE (when this step model output
            //   is a condition or body model output of a WHILE; see the
            //   step->bodyOutputOperands and step->condOutputOperand handling
            //   below).
            for (const auto& output : step->getTempsAsStepModelOutputs()) {
                mapTemporary(SourceOperandIndex(step->getSourceModelIndex(), output.first),
                             &sourceOperandToLocationOfTemporary);
            }
        } else if (const IfStep* step = logicalStep->tryIfStep()) {
            // Allocate memory for all temporary outputs of an IfStep because
            // they are going to be written to by a branch model. We don't
            // perform unused output operand optimisation for referenced models.
            //
            // We don't allocate memory for branch output operands because they
            // use the same location as the corresponding outer output operands,
            // as established in ExecutionPlan::nextCompound(const IfStep*, ...)
            //
            // We don't allocate memory for outer output operands with source
            // operand lifetime SUBGRAPH_OUTPUT because they will be
            // - managed by the client (main model outputs),
            // - assigned a location of another operand (when this IF outer
            //   output is a branch model output of another IF; see
            //   ExecutionPlan::nextCompound(const IfStep*, ...)), or
            // - allocated by a WHILE (when this IF outer output
            //   is a condition or body model output of a WHILE; see the
            //   step->bodyOutputOperands and step->condOutputOperand handling
            //   below).
            for (const auto& sourceOperandIndex : step->outerOutputOperands) {
                mapTemporary(sourceOperandIndex, &sourceOperandToLocationOfTemporary);
            }
        } else if (const WhileStep* step = logicalStep->tryWhileStep()) {
            // Allocate memory for all temporary outputs of an WhileStep because
            // they are going to be written to by the WHILE loop.
            //
            // We don't allocate memory for outer output operands with source
            // operand lifetime SUBGRAPH_OUTPUT because they will be
            // - managed by the client (main model outputs),
            // - assigned a location of another operand (when this WHILE outer
            //   output is a branch model output of an IF; see
            //   ExecutionPlan::nextCompound(const IfStep*, ...)), or
            // - allocated by another WHILE (when this WHILE outer output
            //   is a condition or body model output of another WHILE; see the
            //   step->bodyOutputOperands and step->condOutputOperand handling
            //   below).
            for (const auto& sourceOperandIndex : step->outerOutputOperands) {
                mapTemporary(sourceOperandIndex, &sourceOperandToLocationOfTemporary);
            }
            // Allocate memory for body model outputs. Note that we could use
            // the outer output operand memory instead but we currently don't do
            // so (b/148206073).
            for (const auto& sourceOperandIndex : step->bodyOutputOperands) {
                mapTemporary(sourceOperandIndex, &sourceOperandToLocationOfTemporary,
                             Operand::LifeTime::SUBGRAPH_OUTPUT);
                // Allocate another set of temporaries for double buffering.
                mapTemporary(sourceOperandIndex, &sourceOperandToLocationOfTemporary2,
                             Operand::LifeTime::SUBGRAPH_OUTPUT);
            }
            // Allocate memory for condition model output.
            // TODO: Share one condition output memory region between all loops.
            mapTemporary(step->condOutputOperand, &sourceOperandToLocationOfTemporary,
                         Operand::LifeTime::SUBGRAPH_OUTPUT);
        } else {
            // Goto steps have no operands of their own; nothing to allocate.
            CHECK(logicalStep->isGoto());
        }
    }
    // Allocate temporary memory for boundary CONSTANT_COPY operands.
    for (const auto& [sourceOperandIndex, location] : body->mSourceOperandToBoundaryConstantCopy) {
        const auto memoryPreference = body->getMemoryPreferenceOfSourceOperand(sourceOperandIndex);
        const auto loc = addTemporary(&totalSizeOfTemporaries, location.length,
                                      memoryPreference.alignment, memoryPreference.padding);
        sourceOperandToLocationOfTemporary.emplace(sourceOperandIndex, loc);
        VLOG(EXECUTION) << "temp (boundary constant): operand " << toString(sourceOperandIndex)
                        << " offset = " << loc.offset << " paddedLength = " << loc.paddedLength;
    }
    // Collect dynamic temporaries.
    // TODO(b/157236079): Move some or all of this work to compilation time?
    DynamicTemporaries dynamicTemporaries;
    const TypeManager* typeManager = TypeManager::get();
    forEachDynamicTemporary([body, typeManager, &dynamicTemporaries](
                                    SourceOperandIndex sourceOperandIndex,
                                    const Operand& sourceOperand, uint32_t definingStepIndex) {
        CHECK(typeManager->isTensorType(sourceOperand.type));
        const auto memoryPreference = body->getMemoryPreferenceOfSourceOperand(sourceOperandIndex);
        // TODO: For now we guess an initial size equal to element
        // size, which is overly conservative.
        const uint32_t size = typeManager->getSizeOfData(sourceOperand.type, {1});
        dynamicTemporaries.declare(sourceOperandIndex, definingStepIndex, sourceOperand.dimensions,
                                   size, memoryPreference.alignment, memoryPreference.padding);
    });
    dynamicTemporaries.endDeclarations();
    dynamicTemporaries.vlogDump("finished declarations");

    // Hand everything the execution phase needs over to a new Controller.
    return std::shared_ptr<Controller>(new Controller(
            this, executionBuilder, burstBuilder, totalSizeOfTemporaries,
            std::move(sourceOperandToLocationOfTemporary),
            std::move(sourceOperandToLocationOfTemporary2), body->mSourceOperandToInputIndex,
            body->mSourceOperandToOutputIndex, body->mSourceOperandToBoundaryConstantCopy,
            body->mSourceOperandToBoundaryConstantReference, std::move(dynamicTemporaries)));
}

// Re-attempts the most recent step by rewinding mNextStepIndex to
// mFallbackNextStepIndex and delegating to next().
// TODO: Find a better way to provide this functionality.
int ExecutionPlan::fallback(std::shared_ptr<Controller> controller,
                            std::shared_ptr<StepExecutor>* executor, SharedBurst* burstController,
                            const std::vector<OutputShape>* mainModelOutputShapes) const {
    *executor = nullptr;
    if (burstController != nullptr) {
        *burstController = nullptr;
    }

    VLOG(EXECUTION) << "ExecutionPlan::fallback(" << SHOW_IF_DEBUG(controller << ", " << executor)
                    << "): mFallbackNextStepIndex = " << controller->mFallbackNextStepIndex;

    if (controller->mFallbackNextStepIndex == Controller::kBadStepIndex) {
        // We haven't called next().
        return ANEURALNETWORKS_OP_FAILED;
    }

    if (controller->mNextStepIndex == Controller::kBadStepIndex) {
        // The last call to next() did not produce an executor.
        return ANEURALNETWORKS_OP_FAILED;
    }

    controller->mNextStepIndex = controller->mFallbackNextStepIndex;
    return next(controller, executor, burstController, mainModelOutputShapes);
}

// Wraps a raw client-provided pointer in a RunTimePoolInfo view; offset 0.
ExecutionPlan::Buffer::Buffer(void* pointer, uint32_t size)
    : mInfo(RunTimePoolInfo::createFromExistingBuffer(static_cast<uint8_t*>(pointer), size)),
      mOffset(0) {}

// Views a sub-range of an existing pool starting at the given offset.
ExecutionPlan::Buffer::Buffer(RunTimePoolInfo info, uint32_t offset)
    : mInfo(std::move(info)), mOffset(offset) {}

void* ExecutionPlan::Buffer::getPointer() const {
    return mInfo.getBuffer() + mOffset;
}

uint32_t ExecutionPlan::Buffer::getSize() const {
    // Size of the view, i.e. the pool size minus the view's starting offset.
    return mInfo.getSize() - mOffset;
}

void ExecutionPlan::Buffer::flush() const {
    mInfo.flush();
}

// Resolves a model argument (input or output) to a readable/writable Buffer.
// Returns std::nullopt (after logging) if the argument has no value or its
// memory pool cannot be mapped.
std::optional<ExecutionPlan::Buffer> ExecutionPlan::getBufferFromModelArgumentInfo(
        const ModelArgumentInfo& info, const ExecutionBuilder* executionBuilder) const {
    switch (info.state()) {
        case ModelArgumentInfo::POINTER: {
            return Buffer(info.buffer(), info.length());
        } break;
        case ModelArgumentInfo::MEMORY: {
            if (std::optional<RunTimePoolInfo> poolInfo =
                        executionBuilder->getRunTimePoolInfo(info.locationAndLength().poolIndex)) {
                return Buffer(*poolInfo, info.locationAndLength().offset);
            } else {
                LOG(ERROR) << "Unable to map operand memory pool";
                return std::nullopt;
            }
        } break;
        case ModelArgumentInfo::HAS_NO_VALUE: {
            LOG(ERROR) << "Attempting to read an operand that has no value";
            return std::nullopt;
        } break;
        default: {
            LOG(ERROR) << "Unexpected operand memory state: " << static_cast<int>(info.state());
            return std::nullopt;
        } break;
    }
}

// Resolves a source operand to a Buffer by consulting, in order: the static
// temporaries pool, the main model inputs, the main model outputs, and
// boundary constant references. Returns std::nullopt if the operand is not
// found in any of these, or if a constant's memory cannot be mapped.
std::optional<ExecutionPlan::Buffer> ExecutionPlan::getBuffer(
        std::shared_ptr<Controller> controller, SourceOperandIndex operandIndex) const {
    const auto& sourceOperandToLocationOfTemporary =
            controller->mSourceOperandToLocationOfTemporary;
    const auto& sourceOperandToInputIndex = controller->mSourceOperandToInputIndex;
    const auto& sourceOperandToOutputIndex = controller->mSourceOperandToOutputIndex;
    const auto& sourceOperandToConstantReference = controller->mSourceOperandToConstantReference;
    if (auto it = sourceOperandToLocationOfTemporary.find(operandIndex);
        it != sourceOperandToLocationOfTemporary.end()) {
        const uint32_t offset = it->second.offset;
        const std::unique_ptr<MemoryAshmem>& memory = controller->mTemporaries;
        return Buffer(memory->getPointer() + offset, memory->getSize() - offset);
    } else if (auto it = sourceOperandToInputIndex.find(operandIndex);
               it != sourceOperandToInputIndex.end()) {
        const ModelArgumentInfo& info = controller->mExecutionBuilder->getInputInfo(it->second);
        return getBufferFromModelArgumentInfo(info, controller->mExecutionBuilder);
    } else if (auto it = sourceOperandToOutputIndex.find(operandIndex);
               it != sourceOperandToOutputIndex.end()) {
        const ModelArgumentInfo& info = controller->mExecutionBuilder->getOutputInfo(it->second);
        return getBufferFromModelArgumentInfo(info, controller->mExecutionBuilder);
    } else if (auto it = sourceOperandToConstantReference.find(operandIndex);
               it != sourceOperandToConstantReference.end()) {
        const ConstantReferenceLocation& location = it->second;
        const std::optional<RunTimePoolInfo> info = location.memory->getRunTimePoolInfo();
        if (info == std::nullopt) {
            return std::nullopt;
        }
        return Buffer(info->getBuffer() + location.offset, location.length);
    }
    return std::nullopt;
}

// Reads a bool8 condition operand (e.g. an IF condition or a WHILE condition
// model output) into *value.
int ExecutionPlan::readConditionValue(std::shared_ptr<Controller> controller,
                                      SourceOperandIndex operandIndex, bool* value) const {
    std::optional<ExecutionPlan::Buffer> buffer = getBuffer(controller, operandIndex);
    if (buffer == std::nullopt) {
        LOG(ERROR) << "Unable to read operand " << toString(operandIndex);
        return ANEURALNETWORKS_OP_FAILED;
    }
    CHECK_GE(buffer->getSize(), sizeof(bool8));
    bool8 value8 = *static_cast<bool8*>(buffer->getPointer());
    *value = static_cast<bool>(value8);
    VLOG(EXECUTION) << "readConditionValue: " << *value;
    return ANEURALNETWORKS_NO_ERROR;
}

// Produces the executor (and optionally burst controller) for the next step of
// a COMPOUND plan, or sets *executor to nullptr when the plan is finished.
int ExecutionPlan::next(std::shared_ptr<Controller> controller,
                        std::shared_ptr<StepExecutor>* executor, SharedBurst* burstController,
                        const std::vector<OutputShape>* mainModelOutputShapes,
                        int syncFdOfLastStep) const {
    CHECK(mState == COMPOUND);

    controller->mLastStepSyncFd = syncFdOfLastStep;
    *executor = nullptr;
    if (burstController != nullptr) {
        *burstController = nullptr;
    }

    VLOG(EXECUTION) << "ExecutionPlan::next(" << SHOW_IF_DEBUG(controller << ", " << executor)
                    << "): mNextStepIndex = " << controller->mNextStepIndex;

    if (controller->mNextStepIndex == Controller::kBadStepIndex) {
        return ANEURALNETWORKS_OP_FAILED;
    }

    return nextCompound(controller, executor, burstController, mainModelOutputShapes);
}

// Dispatches to the appropriate nextCompound() overload based on the dynamic
// kind of the current logical step.
int ExecutionPlan::nextCompound(std::shared_ptr<Controller> controller,
                                std::shared_ptr<StepExecutor>* executor,
                                SharedBurst* burstController,
                                const std::vector<OutputShape>* mainModelOutputShapes) const {
    if (controller->mNextStepIndex == Controller::kBadStepIndex) {
        return ANEURALNETWORKS_OP_FAILED;
    }

    auto compoundBody = compound();
    if (controller->mNextStepIndex == compoundBody->mSteps.size()) {
        controller->mNextStepIndex = Controller::kBadStepIndex;  // end
        return ANEURALNETWORKS_NO_ERROR;
    }

    const auto& logicalStep = compoundBody->mSteps[controller->mNextStepIndex];
    if (const IfStep* step = logicalStep->tryIfStep()) {
        return nextCompound(step, controller, executor, burstController, mainModelOutputShapes);
    } else if (const WhileStep* step = logicalStep->tryWhileStep()) {
        return nextCompound(step, controller, executor, burstController, mainModelOutputShapes);
    } else if (const GotoStep* step = logicalStep->tryGotoStep()) {
        return nextCompound(step, controller, executor, burstController, mainModelOutputShapes);
    } else if (const ExecutionStep* step = logicalStep->tryExecutionStep()) {
        return nextCompound(step, controller, executor, burstController, mainModelOutputShapes);
    } else {
        CHECK(false) << "Unknown step variant";
        return ANEURALNETWORKS_BAD_STATE;
    }
}

// Builds the StepExecutor for an ExecutionStep: allocates this step's dynamic
// temporaries, maps inputs/outputs, hooks up the burst controller if any, and
// advances mNextStepIndex (remembering it in mFallbackNextStepIndex for
// fallback()).
int ExecutionPlan::nextCompound(const ExecutionStep* step, std::shared_ptr<Controller> controller,
                                std::shared_ptr<StepExecutor>* executor,
                                SharedBurst* burstController,
                                const std::vector<OutputShape>* mainModelOutputShapes) const {
    VLOG(EXECUTION) << "next: Step#" << controller->mNextStepIndex << ": execute on "
                    << step->getDevice()->getName();

    NN_RETURN_IF_ERROR(controller->mDynamicTemporaries.allocate(step->getIndex()));
    controller->mDynamicTemporaries.vlogDump("finished allocating for a step");

    *executor = std::make_shared<StepExecutor>(controller->mExecutionBuilder, step->getStepModel(),
                                               step->getDevice(), step->getPreparedStepModel(),
                                               /*reusable=*/false, step,
                                               &controller->mDynamicTemporaries);

    step->mapInputsAndOutputs(
            *executor, mainModelOutputShapes, controller->mTemporaries.get(),
            controller->mSourceOperandToLocationOfTemporary, controller->mDynamicTemporaries,
            controller->mSourceOperandToInputIndex, controller->mSourceOperandToOutputIndex,
            controller->mSourceOperandToConstantReference);
    if (burstController != nullptr && controller->mBurstBuilder != nullptr) {
        *burstController = controller->mBurstBuilder->getControllerAt(controller->mNextStepIndex);
    }

    controller->mFallbackNextStepIndex = controller->mNextStepIndex;
    controller->mNextStepIndex++;
    return ANEURALNETWORKS_NO_ERROR;
}

// The first argument is the "source" operand, the second operand is the "destination".
setInput(const SourceOperandIndex & outerOperand,const SourceOperandIndex & innerOperand)1639 void ExecutionPlan::Controller::setInput(const SourceOperandIndex& outerOperand, 1640 const SourceOperandIndex& innerOperand) { 1641 VLOG(EXECUTION) << "mapping input " << toString(innerOperand) << " from " 1642 << toString(outerOperand); 1643 #ifdef NN_DEBUGGABLE 1644 CHECK_LE(mSourceOperandToLocationOfTemporary.count(innerOperand) + 1645 mSourceOperandToInputIndex.count(innerOperand) + 1646 mSourceOperandToOutputIndex.count(innerOperand) + 1647 mSourceOperandToConstantReference.count(innerOperand), 1648 1u); 1649 #endif 1650 mSourceOperandToLocationOfTemporary.erase(innerOperand); 1651 mSourceOperandToInputIndex.erase(innerOperand); 1652 mSourceOperandToOutputIndex.erase(innerOperand); 1653 mSourceOperandToConstantReference.erase(innerOperand); 1654 if (auto it = mSourceOperandToLocationOfTemporary.find(outerOperand); 1655 it != mSourceOperandToLocationOfTemporary.end()) { 1656 mSourceOperandToLocationOfTemporary.emplace(innerOperand, it->second); 1657 } else if (auto it = mSourceOperandToInputIndex.find(outerOperand); 1658 it != mSourceOperandToInputIndex.end()) { 1659 mSourceOperandToInputIndex.emplace(innerOperand, it->second); 1660 } else if (auto it = mSourceOperandToOutputIndex.find(outerOperand); 1661 it != mSourceOperandToOutputIndex.end()) { 1662 mSourceOperandToOutputIndex.emplace(innerOperand, it->second); 1663 } else if (auto it = mSourceOperandToConstantReference.find(outerOperand); 1664 it != mSourceOperandToConstantReference.end()) { 1665 mSourceOperandToConstantReference.emplace(innerOperand, it->second); 1666 } else { 1667 CHECK(false) << "Cannot set step model input operand " << toString(innerOperand) 1668 << " from operand " << toString(outerOperand); 1669 } 1670 } 1671 1672 // The first argument is the "source" operand, the second operand is the "destination". 
setOutput(const SourceOperandIndex & outerOperand,const SourceOperandIndex & innerOperand)1673 void ExecutionPlan::Controller::setOutput(const SourceOperandIndex& outerOperand, 1674 const SourceOperandIndex& innerOperand) { 1675 VLOG(EXECUTION) << "mapping output " << toString(innerOperand) << " from " 1676 << toString(outerOperand); 1677 #ifdef NN_DEBUGGABLE 1678 CHECK_LE(mSourceOperandToLocationOfTemporary.count(innerOperand) + 1679 mSourceOperandToOutputIndex.count(innerOperand), 1680 1u); 1681 #endif 1682 mSourceOperandToLocationOfTemporary.erase(innerOperand); 1683 mSourceOperandToOutputIndex.erase(innerOperand); 1684 if (auto it = mSourceOperandToLocationOfTemporary.find(outerOperand); 1685 it != mSourceOperandToLocationOfTemporary.end()) { 1686 mSourceOperandToLocationOfTemporary.emplace(innerOperand, it->second); 1687 } else if (auto it = mSourceOperandToOutputIndex.find(outerOperand); 1688 it != mSourceOperandToOutputIndex.end()) { 1689 mSourceOperandToOutputIndex.emplace(innerOperand, it->second); 1690 } else { 1691 CHECK(false) << "Cannot set step model output operand " << toString(innerOperand) 1692 << " from operand " << toString(outerOperand); 1693 } 1694 } 1695 waitForLastStepSyncFence() const1696 int ExecutionPlan::Controller::waitForLastStepSyncFence() const { 1697 if (mLastStepSyncFd == -1) { 1698 return ANEURALNETWORKS_NO_ERROR; 1699 } 1700 VLOG(EXECUTION) << "wait for mLastStepSyncFd " << mLastStepSyncFd; 1701 auto r = syncWait(mLastStepSyncFd, -1); 1702 int n = ANEURALNETWORKS_NO_ERROR; 1703 if (r != FenceState::SIGNALED) { 1704 LOG(ERROR) << "syncWait failed, fd: " << mLastStepSyncFd; 1705 n = ANEURALNETWORKS_OP_FAILED; 1706 } 1707 return n; 1708 } 1709 1710 // Invocations of Controller::setInput/setOutput in this function must match with invocations of 1711 // StepRoleAnalyzer::setUsedBy in the IfStep branch in 1712 // ExecutionPlan::CompoundBody::findMemoryStepRoles. 
// Evaluates an IF: reads the condition, selects the then/else branch step, and
// aliases the branch model's inputs/outputs onto the outer operands' locations.
int ExecutionPlan::nextCompound(const IfStep* step, std::shared_ptr<Controller> controller,
                                std::shared_ptr<StepExecutor>* executor,
                                SharedBurst* burstController,
                                const std::vector<OutputShape>* mainModelOutputShapes) const {
    VLOG(EXECUTION) << "next: " << *step;
    // If the last step has a sync fence, wait for it to signal before reading the condition value.
    // This is safe because the steps are serialized when doing fenced compute.
    NN_RETURN_IF_ERROR(controller->waitForLastStepSyncFence());
    bool condValue;
    NN_RETURN_IF_ERROR(readConditionValue(controller, step->conditionOperandIndex, &condValue));
    controller->mNextStepIndex = condValue ? step->thenStepIndex : step->elseStepIndex;
    const std::vector<SourceOperandIndex>& branchInputOperands =
            condValue ? step->thenBranchInputOperands : step->elseBranchInputOperands;
    const std::vector<SourceOperandIndex>& branchOutputOperands =
            condValue ? step->thenBranchOutputOperands : step->elseBranchOutputOperands;
    CHECK_EQ(branchInputOperands.size(), step->outerInputOperands.size());
    CHECK_EQ(branchOutputOperands.size(), step->outerOutputOperands.size());
    for (uint32_t i = 0, n = step->outerInputOperands.size(); i < n; ++i) {
        // We have to do this assignment just before executing this step to
        // accommodate cases when the IF resides within a WHILE condition or
        // body model and for some j the i-th input of the IF branch model is
        // - an input of the WHILE condition model (whileStep->condInputOperands[j]),
        // - an input of the WHILE body model (whileStep->bodyInputOperands[j]), or
        // - an output of the WHILE body model (whileStep->bodyOutputOperands[j]).
        // In such cases, the WhileStep modifies the location of
        // step->outerInputOperands[i] to implement double buffering.
        controller->setInput(step->outerInputOperands[i], branchInputOperands[i]);
    }
    for (uint32_t i = 0, n = step->outerOutputOperands.size(); i < n; ++i) {
        // We have to do this assignment just before executing this step to
        // accommodate the case when the IF resides within a WHILE body
        // model and the i-th output of the IF branch model is an
        // output of the WHILE body model (whileStep->bodyOutputOperands[j] for
        // some j). In that case, the WhileStep modifies the location of
        // step->outerOutputOperands[i] to implement double buffering.
        controller->setOutput(step->outerOutputOperands[i], branchOutputOperands[i]);
    }
    return nextCompound(controller, executor, burstController, mainModelOutputShapes);
}

// Invocations of Controller::setInput in this function must match with invocations of
// StepRoleAnalyzer::setUsedBy in the WhileStep branch in
// ExecutionPlan::CompoundBody::findMemoryStepRoles.
//
// Drives the WHILE state machine: alternates between evaluating the condition
// model and the body model, double-buffering body outputs between iterations,
// and copying the final body outputs to the outer outputs on loop exit.
int ExecutionPlan::nextCompound(const WhileStep* step, std::shared_ptr<Controller> controller,
                                std::shared_ptr<StepExecutor>* executor,
                                SharedBurst* burstController,
                                const std::vector<OutputShape>* mainModelOutputShapes) const {
    WhileState& state = controller->mWhileState[controller->mNextStepIndex];
    if (state.stage == WhileState::EVALUATE_CONDITION) {
        state.iteration = state.iteration == WhileState::kOutsideLoop ? 0 : state.iteration + 1;
        VLOG(EXECUTION) << "next: " << *step << ": iteration " << state.iteration
                        << ": evaluating condition";
        controller->mNextStepIndex = step->condStepIndex;

        if (state.iteration == 0) {
            // Start the loop timeout clock on the first iteration.
            state.startTime = Clock::now();
        }

        // iteration = 0   cond inputs = outer inputs
        // iteration = 1   cond inputs = body outputs
        // iteration = 2   cond inputs = body outputs
        // iteration = 3   cond inputs = ...
        uint32_t loopBodyOutputCount = step->bodyOutputOperands.size();
        CHECK_EQ(step->condInputOperands.size(), step->outerInputOperands.size());
        CHECK_GE(step->condInputOperands.size(), loopBodyOutputCount);
        for (uint32_t i = 0, n = step->condInputOperands.size(); i < n; ++i) {
            // Inputs beyond the body output count are loop-invariant
            // ("input-only") and always come from the outer inputs.
            bool operandIsInputOnly = i >= loopBodyOutputCount;
            controller->setInput((state.iteration == 0 || operandIsInputOnly)
                                         ? step->outerInputOperands[i]
                                         : step->bodyOutputOperands[i],
                                 step->condInputOperands[i]);
        }

        state.stage = WhileState::EVALUATE_BODY;
        return nextCompound(controller, executor, burstController, mainModelOutputShapes);
    }

    CHECK(state.stage == WhileState::EVALUATE_BODY);
    std::chrono::nanoseconds timeoutDuration(
            controller->mExecutionBuilder->getLoopTimeoutDuration());
    auto duration = Clock::now() - state.startTime;
    if (duration > timeoutDuration) {
        LOG(ERROR) << "WHILE loop timed out after "
                   << std::chrono::duration_cast<std::chrono::milliseconds>(duration).count()
                   << " ms";
        return ANEURALNETWORKS_MISSED_DEADLINE_TRANSIENT;
    }

    // If the last step has a sync fence, wait for it to signal before reading the condition value.
    // This is safe because the steps are serialized when doing fenced compute.
    NN_RETURN_IF_ERROR(controller->waitForLastStepSyncFence());
    bool condValue;
    NN_RETURN_IF_ERROR(readConditionValue(controller, step->condOutputOperand, &condValue));
    if (condValue) {
        VLOG(EXECUTION) << "next: " << *step << ": iteration " << state.iteration
                        << ": evaluating body";
        controller->mNextStepIndex = step->bodyStepIndex;

        // iteration = 0   body inputs = cond inputs = outer inputs   body outputs = tmp1
        // iteration = 1   body inputs = cond inputs = tmp1           body outputs = tmp2
        // iteration = 2   body inputs = cond inputs = tmp2           body outputs = tmp1
        // iteration = 3   body inputs = cond inputs = ...            body outputs = ...
#ifdef NN_DEBUGGABLE
        CHECK_GE(step->bodyInputOperands.size(), step->bodyOutputOperands.size());
        CHECK_EQ(step->bodyInputOperands.size(), step->outerInputOperands.size());
        CHECK_EQ(step->bodyInputOperands.size(), step->condInputOperands.size());
        CHECK_GE(step->bodyOutputOperands.size(), step->outerOutputOperands.size());
#endif
        for (uint32_t i = 0, n = step->bodyInputOperands.size(); i < n; ++i) {
            controller->setInput(step->condInputOperands[i], step->bodyInputOperands[i]);
        }
        if (state.iteration != 0) {
            // Swap the two temporary buffers so this iteration's body writes
            // don't clobber the previous iteration's outputs (double
            // buffering).
            for (const SourceOperandIndex& outputOperand : step->bodyOutputOperands) {
#ifdef NN_DEBUGGABLE
                CHECK_EQ(controller->mSourceOperandToInputIndex.count(outputOperand), 0u);
                CHECK_EQ(controller->mSourceOperandToOutputIndex.count(outputOperand), 0u);
                CHECK_EQ(controller->mSourceOperandToLocationOfTemporary.count(outputOperand), 1u);
                CHECK_EQ(controller->mSourceOperandToLocationOfTemporary2.count(outputOperand), 1u);
#endif
                std::swap(controller->mSourceOperandToLocationOfTemporary[outputOperand],
                          controller->mSourceOperandToLocationOfTemporary2[outputOperand]);
            }
        }
    } else {
        VLOG(EXECUTION) << "next: " << *step << ": iteration " << state.iteration
                        << ": exiting loop";
        controller->mNextStepIndex = step->exitStepIndex;

        // Copy body outputs to outer outputs.
        // TODO: Use outer outputs instead of tmp2 to avoid copying?
        CHECK_LE(step->outerOutputOperands.size(), step->bodyOutputOperands.size());
        for (uint32_t i = 0, n = step->outerOutputOperands.size(); i < n; ++i) {
            // condInputOperands[i] points to a body output operand from the
            // last iteration if we've executed at least one iteration and to a
            // WHILE operation input operand otherwise.
            const SourceOperandIndex& innerOperand = step->condInputOperands[i];
            const SourceOperandIndex& outerOperand = step->outerOutputOperands[i];
            std::optional<Buffer> outerBuffer = getBuffer(controller, outerOperand);
            if (outerBuffer == std::nullopt) {
                // This should never happen.
                LOG(ERROR) << "Unable to get outerBuffer for operand " << toString(outerOperand);
                return ANEURALNETWORKS_OP_FAILED;
            }
            const Operand& sourceOperand =
                    controller->mExecutionBuilder->getSourceOperand(outerOperand);
            const uint32_t size = TypeManager::get()->getSizeOfData(sourceOperand);
            CHECK_NE(size, 0u);
            std::optional<Buffer> innerBuffer = getBuffer(controller, innerOperand);
            if (innerBuffer == std::nullopt) {
                // This should never happen.
                LOG(ERROR) << "Unable to get innerBuffer for operand " << toString(innerOperand);
                return ANEURALNETWORKS_OP_FAILED;
            }
            CHECK_LE(size, innerBuffer->getSize());
            CHECK_LE(size, outerBuffer->getSize());
            memcpy(outerBuffer->getPointer(), innerBuffer->getPointer(), size);
            outerBuffer->flush();
        }
        state.iteration = WhileState::kOutsideLoop;
    }

    state.stage = WhileState::EVALUATE_CONDITION;
    return nextCompound(controller, executor, burstController, mainModelOutputShapes);
}

// Unconditional jump: simply redirects mNextStepIndex.
int ExecutionPlan::nextCompound(const GotoStep* step, std::shared_ptr<Controller> controller,
                                std::shared_ptr<StepExecutor>* executor,
                                SharedBurst* burstController,
                                const std::vector<OutputShape>* mainModelOutputShapes) const {
    VLOG(EXECUTION) << "next: " << *step;
    controller->mNextStepIndex = step->gotoStepIndex;
    return nextCompound(controller, executor, burstController, mainModelOutputShapes);
}

// Creates a StepExecutor for a SIMPLE plan with trivially mapped I/O.
std::shared_ptr<StepExecutor> ExecutionPlan::makeStepExecutor(
        bool reusable, ExecutionBuilder* executionBuilder) const {
    auto simpleBody = simple();
    auto executor = std::make_shared<StepExecutor>(executionBuilder, simpleBody->mModel,
                                                   simpleBody->mDevice, simpleBody->mPreparedModel,
                                                   reusable);
    executor->mapInputsAndOutputsTrivially();
    return executor;
}

// Lazily transitions an EMPTY plan to COMPOUND; must not be called on a
// SIMPLE plan.
void ExecutionPlan::becomeCompoundIfEmpty() {
    CHECK(mState != SIMPLE);
    if (mState == EMPTY) {
        mBody = new CompoundBody(this);
        mState = COMPOUND;
    }
}

ExecutionStep*
ExecutionPlan::createNewExecutionStep(uint32_t sourceModelIndex,
                                      const std::shared_ptr<Device> device) {
    becomeCompoundIfEmpty();
    auto step = std::make_shared<LogicalStep>(std::in_place_type<ExecutionStep>, this,
                                              compound()->mSteps.size(), sourceModelIndex, device);
    compound()->mSteps.push_back(step);
    return step->executionStep();
}

// Appends a new IF step to the compound plan and returns it.
IfStep* ExecutionPlan::createNewIfStep() {
    becomeCompoundIfEmpty();
    auto step = std::make_shared<LogicalStep>(std::in_place_type<IfStep>);
    step->ifStep()->index = compound()->mSteps.size();
    compound()->mSteps.push_back(step);
    return step->ifStep();
}

// Appends a new WHILE step to the compound plan and returns it.
WhileStep* ExecutionPlan::createNewWhileStep() {
    becomeCompoundIfEmpty();
    auto step = std::make_shared<LogicalStep>(std::in_place_type<WhileStep>);
    step->whileStep()->index = compound()->mSteps.size();
    compound()->mSteps.push_back(step);
    return step->whileStep();
}

// Appends a new GOTO step to the compound plan and returns it.
GotoStep* ExecutionPlan::createNewGotoStep() {
    becomeCompoundIfEmpty();
    auto step = std::make_shared<LogicalStep>(std::in_place_type<GotoStep>);
    step->gotoStep()->index = compound()->mSteps.size();
    compound()->mSteps.push_back(step);
    return step->gotoStep();
}

// Turns an EMPTY plan into a SIMPLE (single-device, single-model) plan.
void ExecutionPlan::becomeSingleStep(const std::shared_ptr<Device> device,
                                     const ModelBuilder* model) {
    CHECK(mState == EMPTY);
    mBody = new SimpleBody(device, model, mCacheInfo, mToken);
    mState = SIMPLE;
}

// Records which step defines a model output operand; each output may have
// exactly one defining step.
void ExecutionPlan::recordOutputDef(SourceOperandIndex sourceOperandIndex, uint32_t stepIndex) {
    auto [it, isNew] =
            compound()->mOutputToDefiningExecutionStep.emplace(sourceOperandIndex, stepIndex);
    CHECK(isNew) << "Step " << stepIndex << " redefines output operand "
                 << toString(sourceOperandIndex) << " already defined by step " << it->second;
}

// Records which step defines a temporary operand; each temporary may have
// exactly one defining step.
void ExecutionPlan::recordTemporaryDef(SourceOperandIndex sourceOperandIndex, uint32_t stepIndex) {
    auto [it, isNew] =
            compound()->mTemporaryToDefiningExecutionStep.emplace(sourceOperandIndex, stepIndex);
    CHECK(isNew) << "Step " << stepIndex << " redefines temporary operand "
                 << toString(sourceOperandIndex) << " already defined by step " << it->second;
}

void ExecutionPlan::dump() const {
    if (mBody) {
        mBody->dump();
    } else {
        VLOG(COMPILATION) << "EMPTY";
    }
}

// Discards the plan body and returns to the EMPTY state.
void ExecutionPlan::reset() {
    if (mBody) {
        delete mBody;
        mBody = nullptr;
    }
    mState = EMPTY;
}

bool ExecutionPlan::isSimpleCpu() const {
    return isSimple() && simple()->mDevice == DeviceManager::getCpuDevice();
}

ExecutionPlan::Kind ExecutionPlan::forTest_getKind() const {
    switch (mState) {
        case EMPTY:
            return Kind::EMPTY;
        case SIMPLE:
            CHECK(mBody);
            return mBody->mSuccessfulFinish ? Kind::SIMPLE : Kind::ERROR;
        case COMPOUND:
            CHECK(mBody);
            return mBody->mSuccessfulFinish ? Kind::COMPOUND : Kind::ERROR;
        default:
            LOG(FATAL) << "unexpected state";
            return Kind::ERROR;
    }
}

std::shared_ptr<const Device> ExecutionPlan::forTest_simpleGetDevice() const {
    return simple()->mDevice;
}

const std::vector<std::shared_ptr<LogicalStep>>& ExecutionPlan::forTest_compoundGetSteps() const {
    return compound()->mSteps;
}

// Returns the operand indices of all dynamic temporaries. Only valid when
// there is a single source model.
std::set<uint32_t> ExecutionPlan::forTest_flatGetDynamicTemporaries() const {
    CHECK_EQ(getSourceModels().size(), size_t(1));
    std::set<uint32_t> ret;
    forEachDynamicTemporary([&ret](SourceOperandIndex dynTemp, const Operand&, uint32_t) {
        ret.insert(dynTemp.second);
    });
    return ret;
}

bool ExecutionPlan::hasDynamicTemporaries() const {
    return mBody == nullptr ? false : mBody->hasDynamicTemporaries();
}

bool ExecutionPlan::forTest_hasStepModelWithNoInputsOrNoOutputs() const {
    return mBody == nullptr ? false : mBody->hasStepModelWithNoInputsOrNoOutputs();
}

bool ExecutionPlan::CompoundBody::hasStepModelWithNoInputsOrNoOutputs() const {
    return std::any_of(mSteps.begin(), mSteps.end(), [](const auto& logicalStep) {
        const ExecutionStep* step = logicalStep->tryExecutionStep();
        return step != nullptr && step->hasNoInputsOrNoOutputs();
    });
}

const uint8_t* ExecutionPlan::forTest_simpleGetCacheToken() const {
    return simple()->mToken.getCacheToken();
}

void ExecutionPlan::SimpleBody::dump() const {
    VLOG(COMPILATION) << "SIMPLE for " << mDevice->getName();
}

void ExecutionPlan::CompoundBody::dump() const {
    for (const auto& step : mSteps) {
        step->dump();
    }
}

// Maps a main model input index to its (model, operand) source index.
SourceOperandIndex ExecutionPlan::getInputSourceOperand(uint32_t index) const {
    const auto* mainModel = getSourceModels().getModel(kMainModelInSourceModels);
    CHECK_LT(index, mainModel->inputCount());
    const auto operandIndex = mainModel->getInputOperandIndex(index);
    return {kMainModelInSourceModels, operandIndex};
}

// Maps a main model output index to its (model, operand) source index.
SourceOperandIndex ExecutionPlan::getOutputSourceOperand(uint32_t index) const {
    const auto* mainModel = getSourceModels().getModel(kMainModelInSourceModels);
    CHECK_LT(index, mainModel->outputCount());
    const auto operandIndex = mainModel->getOutputOperandIndex(index);
    return {kMainModelInSourceModels, operandIndex};
}

// SIMPLE plans have exactly one step, so an input has exactly one step role.
void ExecutionPlan::SimpleBody::forEachStepRoleOfInput(uint32_t index,
                                                       const StepRoleCallback& callback) const {
    callback(mPreparedModel.get(), IOType::INPUT, index);
}
StepRoleCallback & callback) const2060 void ExecutionPlan::SimpleBody::forEachStepRoleOfOutput(uint32_t index, 2061 const StepRoleCallback& callback) const { 2062 callback(mPreparedModel.get(), IOType::OUTPUT, index); 2063 } 2064 2065 // Map an input role of the main model to the input/output roles in the step models. forEachStepRoleOfInput(uint32_t index,const StepRoleCallback & callback) const2066 void ExecutionPlan::CompoundBody::forEachStepRoleOfInput(uint32_t index, 2067 const StepRoleCallback& callback) const { 2068 const auto sourceOperandIndex = mPlan->getInputSourceOperand(index); 2069 forEachStepRoleOfSourceOperand(sourceOperandIndex, callback); 2070 } 2071 2072 // Map an output role of the main model to the input/output roles in the step models. forEachStepRoleOfOutput(uint32_t index,const StepRoleCallback & callback) const2073 void ExecutionPlan::CompoundBody::forEachStepRoleOfOutput(uint32_t index, 2074 const StepRoleCallback& callback) const { 2075 const auto sourceOperandIndex = mPlan->getOutputSourceOperand(index); 2076 forEachStepRoleOfSourceOperand(sourceOperandIndex, callback); 2077 } 2078 forEachStepRoleOfSourceOperand(const SourceOperandIndex & index,const StepRoleCallback & callback) const2079 void ExecutionPlan::CompoundBody::forEachStepRoleOfSourceOperand( 2080 const SourceOperandIndex& index, const StepRoleCallback& callback) const { 2081 const auto it = mSourceOperandToStepRoles.find(index); 2082 if (it == mSourceOperandToStepRoles.end()) return; 2083 for (const auto& [stepIndex, type, ioIndex] : it->second) { 2084 CHECK_LT(stepIndex, mSteps.size()); 2085 const auto* step = mSteps[stepIndex]->executionStep(); 2086 callback(step->getPreparedStepModel().get(), type, ioIndex); 2087 } 2088 } 2089 getMemoryPreference(IOType type,uint32_t index) const2090 MemoryPreference ExecutionPlan::getMemoryPreference(IOType type, uint32_t index) const { 2091 CHECK(mState == SIMPLE || mState == COMPOUND); 2092 if (mState == SIMPLE) { 2093 return 
simple()->mPreparedModel->getMemoryPreference(); 2094 } else { 2095 const auto sourceOperandIndex = type == IOType::INPUT ? getInputSourceOperand(index) 2096 : getOutputSourceOperand(index); 2097 return compound()->getMemoryPreferenceOfSourceOperand(sourceOperandIndex); 2098 } 2099 } 2100 getMemoryPreferenceOfSourceOperand(const SourceOperandIndex & index) const2101 MemoryPreference ExecutionPlan::CompoundBody::getMemoryPreferenceOfSourceOperand( 2102 const SourceOperandIndex& index) const { 2103 uint32_t alignment = kMinMemoryAlignment, padding = kMinMemoryPadding; 2104 forEachStepRoleOfSourceOperand( 2105 index, [&alignment, &padding](const auto* preparedModel, IOType, uint32_t) { 2106 const auto preference = preparedModel->getMemoryPreference(); 2107 alignment = std::max(alignment, preference.alignment); 2108 padding = std::max(padding, preference.padding); 2109 }); 2110 return {alignment, padding}; 2111 } 2112 forEachDynamicTemporary(const std::function<void (SourceOperandIndex,const Operand &,uint32_t definingStepIndex)> & fn) const2113 void ExecutionPlan::forEachDynamicTemporary( 2114 const std::function<void(SourceOperandIndex, const Operand&, uint32_t definingStepIndex)>& 2115 fn) const { 2116 if (mState != COMPOUND) { 2117 return; 2118 } 2119 2120 for (const auto& logicalStep : compound()->mSteps) { 2121 if (const ExecutionStep* step = logicalStep->tryExecutionStep()) { 2122 const uint32_t stepIndex = step->getIndex(); 2123 const uint32_t sourceModelIndex = step->getSourceModelIndex(); 2124 for (const auto& entry : step->getTempsAsStepModelOutputs()) { 2125 const auto sourceOperandIndex = SourceOperandIndex(sourceModelIndex, entry.first); 2126 const auto& sourceOperand = getSourceOperand(sourceOperandIndex); 2127 if (hasUnknownSize(sourceOperand)) { 2128 fn(sourceOperandIndex, sourceOperand, stepIndex); 2129 } 2130 } 2131 } 2132 } 2133 } 2134 partitionTheWork(const std::vector<std::shared_ptr<Device>> & devices,uint32_t preference,uint32_t priority,const 
OptionalTimePoint & deadline,ExecutionPlan * plan,const std::vector<TokenValuePair> & metaData,int simulateFailureResultCode) const2135 int ModelBuilder::partitionTheWork(const std::vector<std::shared_ptr<Device>>& devices, 2136 uint32_t preference, uint32_t priority, 2137 const OptionalTimePoint& deadline, ExecutionPlan* plan, 2138 const std::vector<TokenValuePair>& metaData, 2139 int simulateFailureResultCode) const { 2140 uint32_t sourceModelIndex = plan->getSourceModels().addModel(this); 2141 NN_RETURN_IF_ERROR(partitionTheWorkInternal(sourceModelIndex, devices, preference, priority, 2142 deadline, plan)); 2143 int n = plan->finish(preference, priority, deadline, metaData, simulateFailureResultCode); 2144 if (VLOG_IS_ON(COMPILATION)) { 2145 VLOG(COMPILATION) << "ModelBuilder::partitionTheWork: source model: "; 2146 logModelToInfo(makeModel()); 2147 plan->dump(); 2148 } 2149 return n; 2150 } 2151 partitionTheWorkInternal(uint32_t sourceModelIndex,const std::vector<std::shared_ptr<Device>> & devices,uint32_t preference,uint32_t priority,const OptionalTimePoint & deadline,ExecutionPlan * plan) const2152 int ModelBuilder::partitionTheWorkInternal(uint32_t sourceModelIndex, 2153 const std::vector<std::shared_ptr<Device>>& devices, 2154 uint32_t preference, uint32_t priority, 2155 const OptionalTimePoint& deadline, 2156 ExecutionPlan* plan) const { 2157 // This function uses a heuristic approach to partitioning the graph. 2158 // It should be good enough for the first release. 2159 2160 SourceModels* sourceModels = &plan->getSourceModels(); 2161 const size_t deviceCount = devices.size(); 2162 const size_t operationCount = mOperations.size(); 2163 2164 VLOG(COMPILATION) << "ModelBuilder::partitionTheWork: " 2165 << "sourceModelIndex = " << sourceModelIndex << ", " 2166 << "deviceCount = " << deviceCount << ", " 2167 << "operationCount = " << operationCount; 2168 2169 // Figure out where each operation will best execute. 
2170 // The value of the vector is the index in the devices vector. 2171 std::vector<int> bestDeviceForOperation(operationCount); 2172 NN_RETURN_IF_ERROR( 2173 findBestDeviceForEachOperation(preference, devices, &bestDeviceForOperation)); 2174 2175 // A special value produced by findBestDeviceForEachOperation meaning that 2176 // this is a control flow operation scheduled for interpreted execution 2177 // (see LogicalStep). 2178 const int kControlFlowInterpreter = deviceCount; 2179 2180 // If one device will run all the operations, we don't need to split the 2181 // work. This shortcut does not apply when recursively partitioning 2182 // referenced models because our plan representation is flat. 2183 if (sourceModelIndex == kMainModelInSourceModels && 2184 std::adjacent_find(bestDeviceForOperation.begin(), bestDeviceForOperation.end(), 2185 std::not_equal_to<int>()) == bestDeviceForOperation.end()) { 2186 const int bestDeviceIndex = bestDeviceForOperation[0]; 2187 // Bypass the partitioning process unless the only operation is a 2188 // control flow operation scheduled for interpreted execution. 2189 if (bestDeviceIndex != kControlFlowInterpreter) { 2190 VLOG(COMPILATION) << "ModelBuilder::partitionTheWork: only one best device: " 2191 << bestDeviceIndex << " = " << devices[bestDeviceIndex]->getName(); 2192 plan->becomeSingleStep(devices[bestDeviceIndex], this); 2193 return ANEURALNETWORKS_NO_ERROR; 2194 } 2195 } 2196 2197 // No easy solution, we need to split the work. 2198 2199 // We keep track of the operations that are ready to run for each device. 2200 // perDeviceQueue[deviceCount] is for interpreted execution of control flow 2201 // (see LogicalStep). 2202 std::vector<std::queue<uint32_t>> perDeviceQueue(deviceCount + 1); 2203 2204 // This helper function produces a device name. 
2205 auto deviceName = [&devices, kControlFlowInterpreter, 2206 deviceCount](int deviceIndex) -> std::string { 2207 if (deviceIndex == kControlFlowInterpreter) { 2208 return "NNAPI"; 2209 } else if (deviceIndex < 0 || size_t(deviceIndex) >= deviceCount) { 2210 return "{unknown}"; 2211 } else { 2212 return devices.at(deviceIndex)->getName(); 2213 } 2214 }; 2215 2216 // This helper function enqueues the operation on the appropriate queue. 2217 auto enqueueOnAppropriateDevice = [&](uint32_t operationIndex) { 2218 int deviceIndex = bestDeviceForOperation[operationIndex]; 2219 perDeviceQueue[deviceIndex].push(operationIndex); 2220 VLOG(COMPILATION) << "enqueueOnAppropriateDevice " << operationIndex << " onto " 2221 << deviceIndex << " (" << deviceName(deviceIndex) << ")"; 2222 }; 2223 2224 // This helper function finds a device that has operations ready to process. 2225 // We start by looking at the control flow queue, and then look at the 2226 // devices in reverse order (i.e., starting at the end of the devices 2227 // vector). Earlier devices have a chance to prepare more of the inputs 2228 // required by other devices. This function returns -1 if all queues are 2229 // empty. 2230 auto findNextDeviceToProcess = [&]() -> int { 2231 for (int i = perDeviceQueue.size() - 1; i >= 0; i--) { 2232 if (!perDeviceQueue[i].empty()) { 2233 return i; 2234 } 2235 } 2236 return -1; 2237 }; 2238 2239 OperandTracker tracker(this, enqueueOnAppropriateDevice); 2240 // For each iteration of this loop, we'll create either an execution step or 2241 // an interpreted control flow construct (including nested execution steps 2242 // and interpreted control flow constructs). 2243 while (true) { 2244 // Find the device we'll do this step for. 
2245 int deviceIndex = findNextDeviceToProcess(); 2246 VLOG(COMPILATION) << "findNextDeviceToProcess: " << deviceIndex << " (" 2247 << deviceName(deviceIndex) << ")"; 2248 if (deviceIndex < 0) { 2249 break; 2250 } 2251 2252 // Assign as much as possible to this device. 2253 auto& queue = perDeviceQueue[deviceIndex]; 2254 if (deviceIndex != kControlFlowInterpreter) { 2255 ExecutionStep* step = 2256 plan->createNewExecutionStep(sourceModelIndex, devices[deviceIndex]); 2257 while (!queue.empty()) { 2258 uint32_t operationIndex = queue.front(); 2259 queue.pop(); 2260 int n = step->addOperation(operationIndex); 2261 if (n != ANEURALNETWORKS_NO_ERROR) { 2262 LOG(ERROR) << "failed to add operation " << operationIndex << " to step"; 2263 return n; 2264 } 2265 tracker.markProcessed(operationIndex, enqueueOnAppropriateDevice); 2266 } 2267 } else { 2268 while (!queue.empty()) { 2269 uint32_t operationIndex = queue.front(); 2270 queue.pop(); 2271 const Operation& operation = getOperation(operationIndex); 2272 if (operation.type == OperationType::IF) { 2273 namespace op = operation_if; 2274 const Operand& thenOperand = 2275 getOperand(operation.inputs[op::kThenModelOperand]); 2276 const Operand& elseOperand = 2277 getOperand(operation.inputs[op::kElseModelOperand]); 2278 const ModelBuilder* thenModel = getReferencedModel(thenOperand); 2279 const ModelBuilder* elseModel = getReferencedModel(elseOperand); 2280 uint32_t thenModelIndex = sourceModels->addModel(thenModel); 2281 uint32_t elseModelIndex = sourceModels->addModel(elseModel); 2282 2283 // Emits the following: 2284 // Index Step 2285 // i if then=(i + 1) else=(j + 1) 2286 // ... (then model steps) 2287 // j goto k 2288 // ... 
(else model steps) 2289 // k (steps after the IF) 2290 IfStep* ifStep = plan->createNewIfStep(); 2291 ifStep->conditionOperandIndex = SourceOperandIndex( 2292 sourceModelIndex, operation.inputs[op::kCondBoolOperand]); 2293 ifStep->thenStepIndex = plan->getNextStepIndex(); 2294 NN_RETURN_IF_ERROR(thenModel->partitionTheWorkInternal( 2295 thenModelIndex, devices, preference, priority, deadline, plan)); 2296 GotoStep* afterThenBranch = plan->createNewGotoStep(); 2297 ifStep->elseStepIndex = plan->getNextStepIndex(); 2298 NN_RETURN_IF_ERROR(elseModel->partitionTheWorkInternal( 2299 elseModelIndex, devices, preference, priority, deadline, plan)); 2300 afterThenBranch->gotoStepIndex = plan->getNextStepIndex(); 2301 2302 // Outer model operands. 2303 for (uint32_t i = op::kFirstInput, n = operation.inputs.size(); i < n; ++i) { 2304 ifStep->outerInputOperands.emplace_back(sourceModelIndex, 2305 operation.inputs[i]); 2306 } 2307 for (uint32_t i = 0, n = operation.outputs.size(); i < n; ++i) { 2308 ifStep->outerOutputOperands.emplace_back(sourceModelIndex, 2309 operation.outputs[i]); 2310 } 2311 // Then model operands. 2312 for (uint32_t i = 0, n = thenModel->inputCount(); i < n; ++i) { 2313 ifStep->thenBranchInputOperands.emplace_back( 2314 thenModelIndex, thenModel->getInputOperandIndex(i)); 2315 } 2316 for (uint32_t i = 0, n = thenModel->outputCount(); i < n; ++i) { 2317 ifStep->thenBranchOutputOperands.emplace_back( 2318 thenModelIndex, thenModel->getOutputOperandIndex(i)); 2319 } 2320 // Else model operands. 
2321 for (uint32_t i = 0, n = elseModel->inputCount(); i < n; ++i) { 2322 ifStep->elseBranchInputOperands.emplace_back( 2323 elseModelIndex, elseModel->getInputOperandIndex(i)); 2324 } 2325 for (uint32_t i = 0, n = elseModel->outputCount(); i < n; ++i) { 2326 ifStep->elseBranchOutputOperands.emplace_back( 2327 elseModelIndex, elseModel->getOutputOperandIndex(i)); 2328 } 2329 } else if (operation.type == OperationType::WHILE) { 2330 namespace op = operation_while; 2331 const Operand& condOperand = 2332 getOperand(operation.inputs[op::kCondModelOperand]); 2333 const Operand& bodyOperand = 2334 getOperand(operation.inputs[op::kBodyModelOperand]); 2335 const ModelBuilder* condModel = getReferencedModel(condOperand); 2336 const ModelBuilder* bodyModel = getReferencedModel(bodyOperand); 2337 uint32_t condModelIndex = sourceModels->addModel(condModel); 2338 uint32_t bodyModelIndex = sourceModels->addModel(bodyModel); 2339 2340 // Emits the following: 2341 // Index Step 2342 // i while cond=(i + 1) body=(j + 1) exit=(k + 1) 2343 // ... (cond model steps) 2344 // j goto i 2345 // ... (body model steps) 2346 // k goto i 2347 // ... (steps after the WHILE) 2348 // 2349 // Note that WhileStep has WhileState associated with it. 2350 WhileStep* whileStep = plan->createNewWhileStep(); 2351 whileStep->condStepIndex = plan->getNextStepIndex(); 2352 NN_RETURN_IF_ERROR(condModel->partitionTheWorkInternal( 2353 condModelIndex, devices, preference, priority, deadline, plan)); 2354 GotoStep* afterCond = plan->createNewGotoStep(); 2355 afterCond->gotoStepIndex = whileStep->index; 2356 whileStep->bodyStepIndex = plan->getNextStepIndex(); 2357 NN_RETURN_IF_ERROR(bodyModel->partitionTheWorkInternal( 2358 bodyModelIndex, devices, preference, priority, deadline, plan)); 2359 GotoStep* afterBody = plan->createNewGotoStep(); 2360 afterBody->gotoStepIndex = whileStep->index; 2361 whileStep->exitStepIndex = plan->getNextStepIndex(); 2362 2363 // Outer model operands. 
2364 for (uint32_t i = op::kFirstInput, n = operation.inputs.size(); i < n; ++i) { 2365 whileStep->outerInputOperands.emplace_back(sourceModelIndex, 2366 operation.inputs[i]); 2367 } 2368 for (uint32_t i = 0, n = operation.outputs.size(); i < n; ++i) { 2369 whileStep->outerOutputOperands.emplace_back(sourceModelIndex, 2370 operation.outputs[i]); 2371 } 2372 // Cond model operands. 2373 for (uint32_t i = 0, n = condModel->inputCount(); i < n; ++i) { 2374 whileStep->condInputOperands.emplace_back( 2375 condModelIndex, condModel->getInputOperandIndex(i)); 2376 } 2377 whileStep->condOutputOperand = 2378 SourceOperandIndex(condModelIndex, condModel->getOutputOperandIndex(0)); 2379 // Body model operands. 2380 for (uint32_t i = 0, n = bodyModel->inputCount(); i < n; ++i) { 2381 whileStep->bodyInputOperands.emplace_back( 2382 bodyModelIndex, bodyModel->getInputOperandIndex(i)); 2383 } 2384 for (uint32_t i = 0, n = bodyModel->outputCount(); i < n; ++i) { 2385 whileStep->bodyOutputOperands.emplace_back( 2386 bodyModelIndex, bodyModel->getOutputOperandIndex(i)); 2387 } 2388 } else { 2389 CHECK(false) << operation.type << " is not a control flow operation"; 2390 } 2391 tracker.markProcessed(operationIndex, enqueueOnAppropriateDevice); 2392 } 2393 } 2394 } 2395 return ANEURALNETWORKS_NO_ERROR; 2396 } 2397 getPerformance(uint32_t preference,const std::shared_ptr<Device> device) const2398 float ModelBuilder::getPerformance(uint32_t preference, 2399 const std::shared_ptr<Device> device) const { 2400 // Note that we will call this method multiple times per compilation with 2401 // the same arguments if there are nested control flow operations and we 2402 // decide to execute the outer operation on the ExecutionPlan::next() 2403 // interpreter. 2404 // 2405 // This is a potential compilation performance problem. To work around it, 2406 // the performance value could be cached for the duration of a compilation. 
2407 float perf = 0; 2408 const size_t operationCount = mOperations.size(); 2409 for (size_t operationIndex = 0; operationIndex < operationCount; operationIndex++) { 2410 perf += getPerformance(preference, device, operationIndex); 2411 } 2412 return perf; 2413 } 2414 getPerformance(uint32_t preference,const std::shared_ptr<Device> device,uint32_t operationIndex) const2415 float ModelBuilder::getPerformance(uint32_t preference, const std::shared_ptr<Device> device, 2416 uint32_t operationIndex) const { 2417 auto applyPreference = [preference](const Capabilities::PerformanceInfo& perf) { 2418 return preference == ANEURALNETWORKS_PREFER_LOW_POWER ? perf.powerUsage : perf.execTime; 2419 }; 2420 2421 const Operation& operation = getOperation(operationIndex); 2422 2423 if (operation.type == OperationType::IF) { 2424 namespace op = operation_if; 2425 const Operand& thenOperand = getOperand(operation.inputs[op::kThenModelOperand]); 2426 const Operand& elseOperand = getOperand(operation.inputs[op::kElseModelOperand]); 2427 const ModelBuilder* thenModel = getReferencedModel(thenOperand); 2428 const ModelBuilder* elseModel = getReferencedModel(elseOperand); 2429 return applyPreference(device->getIfPerformance()) + 2430 0.5 * (thenModel->getPerformance(preference, device) + 2431 elseModel->getPerformance(preference, device)); 2432 } 2433 2434 if (operation.type == OperationType::WHILE) { 2435 namespace op = operation_while; 2436 const Operand& condOperand = getOperand(operation.inputs[op::kCondModelOperand]); 2437 const Operand& bodyOperand = getOperand(operation.inputs[op::kBodyModelOperand]); 2438 const ModelBuilder* condModel = getReferencedModel(condOperand); 2439 const ModelBuilder* bodyModel = getReferencedModel(bodyOperand); 2440 return applyPreference(device->getWhilePerformance()) + 2441 condModel->getPerformance(preference, device) + 2442 bodyModel->getPerformance(preference, device); 2443 } 2444 2445 // TODO This assumes that the type is dictated by the first 
operand. This is 2446 // currently the case but is not a safe assumption to make in the long term. 2447 const uint32_t operandIndex = operation.inputs[0]; 2448 const OperandType operandType = mOperands[operandIndex].type; 2449 switch (operandType) { 2450 case OperandType::FLOAT32: 2451 if (mRelaxComputationFloat32toFloat16) { 2452 return applyPreference(device->getRelaxedFloat32toFloat16PerformanceScalar()); 2453 } 2454 break; 2455 case OperandType::TENSOR_FLOAT32: 2456 if (mRelaxComputationFloat32toFloat16) { 2457 return applyPreference(device->getRelaxedFloat32toFloat16PerformanceTensor()); 2458 } 2459 break; 2460 default: 2461 break; 2462 } 2463 2464 return applyPreference(device->getPerformance(operandType)); 2465 } 2466 isControlFlowOperationWithOperandOfUnknownSize(uint32_t operationIndex) const2467 bool ModelBuilder::isControlFlowOperationWithOperandOfUnknownSize(uint32_t operationIndex) const { 2468 auto containsUnknownSize = [](const ModelBuilder* model, 2469 const std::vector<uint32_t>& operandIndexes) { 2470 for (uint32_t operandIndex : operandIndexes) { 2471 if (hasUnknownSize(model->getOperand(operandIndex))) { 2472 return true; 2473 } 2474 } 2475 return false; 2476 }; 2477 2478 const Operation& operation = getOperation(operationIndex); 2479 2480 if (operation.type == OperationType::IF) { 2481 namespace op = operation_if; 2482 const Operand& thenOperand = getOperand(operation.inputs[op::kThenModelOperand]); 2483 const Operand& elseOperand = getOperand(operation.inputs[op::kElseModelOperand]); 2484 const ModelBuilder* thenModel = getReferencedModel(thenOperand); 2485 const ModelBuilder* elseModel = getReferencedModel(elseOperand); 2486 return containsUnknownSize(this, operation.inputs) || 2487 containsUnknownSize(this, operation.outputs) || 2488 containsUnknownSize(thenModel, thenModel->getInputOperandIndexes()) || 2489 containsUnknownSize(thenModel, thenModel->getOutputOperandIndexes()) || 2490 containsUnknownSize(elseModel, 
elseModel->getInputOperandIndexes()) || 2491 containsUnknownSize(elseModel, elseModel->getOutputOperandIndexes()); 2492 } 2493 2494 if (operation.type == OperationType::WHILE) { 2495 namespace op = operation_while; 2496 const Operand& condOperand = getOperand(operation.inputs[op::kCondModelOperand]); 2497 const Operand& bodyOperand = getOperand(operation.inputs[op::kBodyModelOperand]); 2498 const ModelBuilder* condModel = getReferencedModel(condOperand); 2499 const ModelBuilder* bodyModel = getReferencedModel(bodyOperand); 2500 return containsUnknownSize(this, operation.inputs) || 2501 containsUnknownSize(this, operation.outputs) || 2502 containsUnknownSize(condModel, condModel->getInputOperandIndexes()) || 2503 containsUnknownSize(condModel, condModel->getOutputOperandIndexes()) || 2504 containsUnknownSize(bodyModel, bodyModel->getInputOperandIndexes()) || 2505 containsUnknownSize(bodyModel, bodyModel->getOutputOperandIndexes()); 2506 } 2507 2508 // Not a control flow operation. 2509 return false; 2510 } 2511 supportedByControlFlowInterpreter(uint32_t operationIndex) const2512 bool ModelBuilder::supportedByControlFlowInterpreter(uint32_t operationIndex) const { 2513 const Operation& operation = getOperation(operationIndex); 2514 return (operation.type == OperationType::IF || operation.type == OperationType::WHILE) && 2515 // The partitioner does not support dynamic temporaries (b/132458982). 
2516 !isControlFlowOperationWithOperandOfUnknownSize(operationIndex); 2517 } 2518 2519 namespace { 2520 2521 // This class determines whether a given device can execute a given operation 2522 class CanDo { 2523 public: CanDo()2524 CanDo() {} 2525 initialize(const MetaModel & metaModel,std::shared_ptr<Device> device)2526 void initialize(const MetaModel& metaModel, std::shared_ptr<Device> device) { 2527 mSupportsOperationByIndex = device->getSupportedOperations(metaModel); 2528 } 2529 check(size_t operationIndex) const2530 bool check(size_t operationIndex) const { return mSupportsOperationByIndex[operationIndex]; } 2531 2532 private: 2533 std::vector<bool> mSupportsOperationByIndex; 2534 }; 2535 2536 } // anonymous namespace 2537 findBestDeviceForEachOperation(uint32_t preference,const std::vector<std::shared_ptr<Device>> & devices,std::vector<int> * bestDeviceForOperation) const2538 int ModelBuilder::findBestDeviceForEachOperation( 2539 uint32_t preference, const std::vector<std::shared_ptr<Device>>& devices, 2540 std::vector<int>* bestDeviceForOperation) const { 2541 const MetaModel metaModel(makeModel(), DeviceManager::get()->strictSlicing()); 2542 2543 const size_t deviceCount = devices.size(); 2544 std::vector<CanDo> canDo(deviceCount); 2545 for (size_t deviceIndex = 0; deviceIndex < deviceCount; deviceIndex++) { 2546 canDo[deviceIndex].initialize(metaModel, devices[deviceIndex]); 2547 } 2548 2549 // Figure out the best driver for each operation. 2550 const size_t operationCount = mOperations.size(); 2551 for (size_t operationIndex = 0; operationIndex < operationCount; operationIndex++) { 2552 const Operation& operation = getOperation(operationIndex); 2553 // Find which device, including CPU fallback, gives the best performance for this operation. 
2554 int bestChoice = -1; 2555 2556 if (isControlFlowOperationWithOperandOfUnknownSize(operationIndex)) { 2557 // Do not schedule control flow operations with unknown size to 2558 // non-CPU devices because this is not supported by the 1.3 HAL. 2559 // See http://b/159076604#comment5. 2560 auto cpuDeviceIterator = 2561 std::find(devices.begin(), devices.end(), DeviceManager::getCpuDevice()); 2562 if (cpuDeviceIterator != devices.end()) { 2563 int cpuDeviceIndex = cpuDeviceIterator - devices.begin(); 2564 if (canDo[cpuDeviceIndex].check(operationIndex)) { 2565 bestChoice = cpuDeviceIndex; 2566 } 2567 } 2568 } else { 2569 float bestPerfVal = 0.0; // Do not check bestPerfVal if bestChoice < 0. 2570 for (size_t deviceIndex = 0; deviceIndex < deviceCount; deviceIndex++) { 2571 const auto& device = devices[deviceIndex]; 2572 if (canDo[deviceIndex].check(operationIndex)) { 2573 const float perfVal = getPerformance(preference, device, operationIndex); 2574 const bool deviceIsPreferred = (device == DeviceManager::getCpuDevice()); 2575 if (bestChoice < 0 || perfVal < bestPerfVal || 2576 (perfVal == bestPerfVal && deviceIsPreferred)) { 2577 bestChoice = deviceIndex; 2578 bestPerfVal = perfVal; 2579 } 2580 } else { 2581 // Somewhat noisy logging, but only place where the user of NNAPI can get 2582 // feedback on why an operation was not run on a specific device. 2583 // 2584 // Logs O(operationCount * deviceCount) times, but typically deviceCount is 2585 // very small. 
2586 VLOG(COMPILATION) << "Device " << device->getName() << " can't do operation " 2587 << operation.type << ":" << operationIndex; 2588 } 2589 } 2590 } 2591 2592 if (bestChoice < 0) { 2593 LOG(ERROR) << "No driver can do operation " << operation.type; 2594 return ANEURALNETWORKS_BAD_DATA; 2595 } else if (devices[bestChoice] == DeviceManager::getCpuDevice() && 2596 supportedByControlFlowInterpreter(operationIndex)) { 2597 // Run control flow on the ExecutionPlan::next() interpreter and try 2598 // to delegate referenced models. 2599 const int kControlFlowInterpreter = deviceCount; 2600 (*bestDeviceForOperation)[operationIndex] = kControlFlowInterpreter; 2601 VLOG(COMPILATION) << "ModelBuilder::findBestDeviceForEachOperation(" << operation.type 2602 << ":" << operationIndex << ") = -1 (NNAPI)"; 2603 } else { 2604 (*bestDeviceForOperation)[operationIndex] = bestChoice; 2605 VLOG(COMPILATION) << "ModelBuilder::findBestDeviceForEachOperation(" << operation.type 2606 << ":" << operationIndex << ") = " << bestChoice << " (" 2607 << devices[bestChoice]->getName() << ")"; 2608 } 2609 } 2610 return ANEURALNETWORKS_NO_ERROR; 2611 } 2612 2613 } // namespace nn 2614 } // namespace android 2615