/*
 * Copyright (C) 2017 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#define LOG_TAG "Manager"

#include "Manager.h"

#include <android/hidl/manager/1.2/IServiceManager.h>
#include <android/sync.h>
#include <build/version.h>
#include <cutils/native_handle.h>
#include <hidl/HidlTransportSupport.h>
#include <hidl/ServiceManagement.h>

#include <algorithm>
#include <functional>
#include <memory>
#include <string>
#include <thread>
#include <tuple>
#include <utility>
#include <vector>

#include "Callbacks.h"
|
|
#include "CpuExecutor.h"
|
|
#include "ExecutionBurstController.h"
|
|
#include "HalInterfaces.h"
|
|
#include "Memory.h"
|
|
#include "MetaModel.h"
|
|
#include "ModelArgumentInfo.h"
|
|
#include "Tracing.h"
|
|
#include "TypeManager.h"
|
|
#include "Utils.h"
|
|
#include "VersionedInterfaces.h"
|
|
|
|
namespace android {
namespace nn {

using namespace hal;

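// UINT64_MAX is the HAL's sentinel for "no timing measurement available"; kNoTiming is
// returned whenever an execution does not (or cannot) measure timing.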
const Timing kNoTiming = {.timeOnDevice = UINT64_MAX, .timeInDriver = UINT64_MAX};

// A Device with actual underlying driver
class DriverDevice : public Device {
   public:
    // Create a DriverDevice from a name and a DeviceFactory function.
    // Returns nullptr on failure.
    static std::shared_ptr<DriverDevice> create(const std::string& name,
                                                const DeviceFactory& makeDevice);

    // Prefer using DriverDevice::create
    DriverDevice(std::shared_ptr<VersionedIDevice> device);

    const std::string& getName() const override { return kInterface->getName(); }
    const std::string& getVersionString() const override { return kInterface->getVersionString(); }
    int64_t getFeatureLevel() const override { return kInterface->getFeatureLevel(); }
    int32_t getType() const override { return kInterface->getType(); }
    const std::vector<Extension>& getSupportedExtensions() const override {
        return kInterface->getSupportedExtensions();
    }
    std::vector<bool> getSupportedOperations(const MetaModel& metaModel) const override;
    PerformanceInfo getPerformance(OperandType type) const override {
        const auto& capabilities = kInterface->getCapabilities();
        return lookup(capabilities.operandPerformance, type);
    }
    PerformanceInfo getRelaxedFloat32toFloat16PerformanceScalar() const override {
        const auto& capabilities = kInterface->getCapabilities();
        return capabilities.relaxedFloat32toFloat16PerformanceScalar;
    }
    PerformanceInfo getRelaxedFloat32toFloat16PerformanceTensor() const override {
        const auto& capabilities = kInterface->getCapabilities();
        return capabilities.relaxedFloat32toFloat16PerformanceTensor;
    }
    PerformanceInfo getIfPerformance() const override {
        const auto& capabilities = kInterface->getCapabilities();
        return capabilities.ifPerformance;
    }
    PerformanceInfo getWhilePerformance() const override {
        const auto& capabilities = kInterface->getCapabilities();
        return capabilities.whilePerformance;
    }
    bool isCachingSupported() const override {
        // Caching is supported if either numModelCache or numDataCache is greater than 0.
        const auto [numModelCacheFiles, numDataCacheFiles] =
                kInterface->getNumberOfCacheFilesNeeded();
        return numModelCacheFiles > 0 || numDataCacheFiles > 0;
    }
    int wait() const override { return kInterface->wait(); }

    std::pair<int, std::shared_ptr<PreparedModel>> prepareModel(
            const ModelFactory& makeModel, ExecutionPreference preference, Priority priority,
            const std::optional<Deadline>& deadline, const std::string& cacheDir,
            const std::optional<CacheToken>& maybeToken) const override;

    std::pair<int, std::unique_ptr<Memory>> allocate(const MemoryDescriptor& desc,
                                                     hal::OperandType) const override;

   private:
    const std::shared_ptr<VersionedIDevice> kInterface;

#ifdef NN_DEBUGGABLE
    // For debugging: behavior of IDevice::getSupportedOperations for SampleDriver.
    // 0 - all operations reported by IDevice::getSupportedOperations() supported
    // 1 - some operations reported by IDevice::getSupportedOperations() supported
    uint32_t mSupported = 0;
#endif  // NN_DEBUGGABLE
};

// A PreparedModel with an underlying IPreparedModel instance returned by the actual driver.
class DriverPreparedModel : public PreparedModel {
   public:
    DriverPreparedModel(const Device* device,
                        const std::shared_ptr<VersionedIPreparedModel>& preparedModel)
        : mDevice(device), mPreparedModel(preparedModel) {
        CHECK(mDevice != nullptr);
        CHECK(mPreparedModel != nullptr);
    }

    const Device* getDevice() const override { return mDevice; }
    std::shared_ptr<VersionedIPreparedModel> getInterface() const override {
        return mPreparedModel;
    }
    std::tuple<int, std::vector<OutputShape>, Timing> execute(
            const std::vector<ModelArgumentInfo>& inputs,
            const std::vector<ModelArgumentInfo>& outputs,
            const std::vector<const Memory*>& memories,
            const std::shared_ptr<ExecutionBurstController>& burstController,
            MeasureTiming measure, const std::optional<Deadline>& deadline,
            const OptionalTimeoutDuration& loopTimeoutDuration) const override;

    std::tuple<int, int, sp<hal::IFencedExecutionCallback>, hal::Timing> executeFenced(
            const std::vector<ModelArgumentInfo>& inputs,
            const std::vector<ModelArgumentInfo>& outputs,
            const std::vector<const Memory*>& memories, const std::vector<int>& waitFor,
            MeasureTiming measure, const std::optional<Deadline>& deadline,
            const OptionalTimeoutDuration& loopTimeoutDuration,
            const hal::OptionalTimeoutDuration& timeoutDurationAfterFence) const override;

    std::shared_ptr<ExecutionBurstController> configureExecutionBurst(
            bool preferPowerOverLatency) const override {
        return mPreparedModel->configureExecutionBurst(preferPowerOverLatency);
    }

   private:
    const Device* mDevice;
    const std::shared_ptr<VersionedIPreparedModel> mPreparedModel;
};

DriverDevice::DriverDevice(std::shared_ptr<VersionedIDevice> device)
    : kInterface(std::move(device)) {
    CHECK(kInterface != nullptr);
#ifdef NN_DEBUGGABLE
    static const char samplePrefix[] = "sample";
    if (getName().substr(0, sizeof(samplePrefix) - 1) == samplePrefix) {
        mSupported = getProp("debug.nn.sample.supported");
    }
#endif  // NN_DEBUGGABLE
}

std::shared_ptr<DriverDevice> DriverDevice::create(const std::string& name,
                                                   const DeviceFactory& makeDevice) {
    CHECK(makeDevice != nullptr);
    std::shared_ptr<VersionedIDevice> device = VersionedIDevice::create(name, makeDevice);
    if (device == nullptr) {
        LOG(ERROR) << "DriverDevice::create failed to create VersionedIDevice object for service "
                   << name;
        return nullptr;
    }

    return std::make_shared<DriverDevice>(std::move(device));
}

std::vector<bool> DriverDevice::getSupportedOperations(const MetaModel& metaModel) const {
    // Query the driver for what it can do.
    ErrorStatus status = ErrorStatus::GENERAL_FAILURE;
    std::vector<bool> supportedOperations;
    std::tie(status, supportedOperations) = kInterface->getSupportedOperations(metaModel);

    const Model& hidlModel = metaModel.getModel();
    const uint32_t operationCount = hidlModel.main.operations.size();
    if (status != ErrorStatus::NONE) {
        LOG(ERROR) << "IDevice::getSupportedOperations returned the error " << toString(status);
        // Set the supported operation vector to all false, so we won't use this driver.
        return std::vector<bool>(operationCount, false);
    }
    if (supportedOperations.size() != operationCount) {
        LOG(ERROR) << "IDevice::getSupportedOperations returned a vector of length "
                   << supportedOperations.size() << " when expecting " << operationCount;
        // Set the supported operation vector to all false, so we won't use this driver.
        return std::vector<bool>(operationCount, false);
    }

#ifdef NN_DEBUGGABLE
    if (mSupported != 1) {
        return supportedOperations;
    }
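
    // Debug-only pruning used when debug.nn.sample.supported is 1: hash each operation the
    // driver claims to support, mixing in its type and its operands' types, dimensions, and
    // constness, then pseudorandomly mark about half of them unsupported. Keying the hash
    // on the device name and model structure keeps the choice deterministic across runs
    // while forcing the partitioner to split work across devices.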
    const uint32_t baseAccumulator = std::hash<std::string>{}(getName());
    for (size_t operationIndex = 0; operationIndex < supportedOperations.size();
         operationIndex++) {
        if (!supportedOperations[operationIndex]) {
            continue;
        }

        uint32_t accumulator = baseAccumulator;
        const Operation& operation = hidlModel.main.operations[operationIndex];
        accumulator ^= static_cast<uint32_t>(operation.type);
        auto accumulateOperands = [&hidlModel, &accumulator](const hidl_vec<uint32_t>& operands) {
            for (uint32_t operandIndex : operands) {
                const Operand& operand = hidlModel.main.operands[operandIndex];
                accumulator ^= static_cast<uint32_t>(operand.type);
                accumulator ^= operand.dimensions.size();
                for (uint32_t dimension : operand.dimensions) {
                    accumulator ^= dimension;
                    if (operand.lifetime == OperandLifeTime::CONSTANT_COPY ||
                        operand.lifetime == OperandLifeTime::CONSTANT_REFERENCE) {
                        accumulator ^= 1;
                    }
                }
            }
        };
        accumulateOperands(operation.inputs);
        accumulateOperands(operation.outputs);
        if (accumulator & 1) {
            supportedOperations[operationIndex] = false;
        }
    }
#endif  // NN_DEBUGGABLE

    return supportedOperations;
}

std::pair<int, std::shared_ptr<PreparedModel>> DriverDevice::prepareModel(
        const ModelFactory& makeModel, ExecutionPreference preference, Priority priority,
        const std::optional<Deadline>& deadline, const std::string& cacheDir,
        const std::optional<CacheToken>& maybeToken) const {
    const auto [n, preparedModel] = kInterface->prepareModel(makeModel, preference, priority,
                                                             deadline, cacheDir, maybeToken);
    if (n != ANEURALNETWORKS_NO_ERROR) {
        return {n, nullptr};
    }
    CHECK(preparedModel != nullptr) << "prepareModel returned nullptr without error code";
    return {ANEURALNETWORKS_NO_ERROR, std::make_shared<DriverPreparedModel>(this, preparedModel)};
}

std::pair<int, std::unique_ptr<Memory>> DriverDevice::allocate(const MemoryDescriptor& desc,
                                                               hal::OperandType) const {
    const BufferDesc hidlDesc = {.dimensions = desc.dimensions};
    std::vector<std::shared_ptr<VersionedIPreparedModel>> preparedModels(
            desc.preparedModels.size());
    std::transform(desc.preparedModels.begin(), desc.preparedModels.end(), preparedModels.begin(),
                   [](const auto* preparedModel) {
                       const auto versionedPreparedModel = preparedModel->getInterface();
                       CHECK(versionedPreparedModel != nullptr);
                       return versionedPreparedModel;
                   });
    auto [status, buffer, token] =
            kInterface->allocate(hidlDesc, preparedModels, desc.inputRoles, desc.outputRoles);
    if (status != ErrorStatus::NONE) {
        LOG(ERROR) << "DriverDevice::allocate -- memory allocation on device " << getName()
                   << " failed!";
        return {convertErrorStatusToResultCode(status), nullptr};
    }
    return MemoryFromDevice::create(std::move(buffer), token);
}

// Figures out how to place each of the inputs or outputs in a buffer. This just
// does the layout and memory allocation, it does not copy data. Aligns each
// input a bit.
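// (For example, given two POINTER inputs of lengths 6 and 10, the second is placed at the
// next aligned offset computed by alignBytesNeeded rather than immediately at byte 6.)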
static std::tuple<int, std::unique_ptr<MemoryAshmem>, std::vector<DataLocation>>
allocatePointerArgumentsToPool(const std::vector<ModelArgumentInfo>& args,
                               std::vector<const Memory*>* memories) {
    CHECK(memories != nullptr);
    std::vector<DataLocation> ptrArgsLocations;
    const uint32_t nextPoolIndex = memories->size();
    int64_t total = 0;
    for (const auto& info : args) {
        if (info.state() == ModelArgumentInfo::POINTER) {
            // TODO Good enough alignment?
            total += alignBytesNeeded(static_cast<uint32_t>(total), info.length());
            ptrArgsLocations.push_back({.poolIndex = nextPoolIndex,
                                        .offset = static_cast<uint32_t>(total),
                                        .length = info.length()});
            total += info.length();
        }
    }
    if (total > 0xFFFFFFFF) {
        LOG(ERROR) << "allocatePointerArgumentsToPool: ANeuralNetworksExecution: Size of all "
                      "inputs or outputs exceeds 2^32.";
        return {ANEURALNETWORKS_BAD_DATA, nullptr, std::vector<DataLocation>{}};
    }
    if (total <= 0) {
        return {ANEURALNETWORKS_NO_ERROR, nullptr, std::vector<DataLocation>{}};
    }
    auto [n, memory] = MemoryAshmem::create(total);
    if (n != ANEURALNETWORKS_NO_ERROR) {
        return {n, nullptr, std::vector<DataLocation>{}};
    }
    memories->push_back(memory.get());
    return {ANEURALNETWORKS_NO_ERROR, std::move(memory), std::move(ptrArgsLocations)};
}

// Perform computation on an actual HIDL driver.
//
// Because HIDL cannot take raw pointers, two separate memory pools will be allocated for inputs
// and outputs specified by pointers. The input pointer data will be copied to the input pool
// prior to execution, and the output pointer data will be copied out from the output pool after
// the execution.
//
// The HIDL invocation will choose between sync/async execution according to
// DeviceManager::mSyncExecHal.
std::tuple<int, std::vector<OutputShape>, Timing> DriverPreparedModel::execute(
        const std::vector<ModelArgumentInfo>& inputs,
        const std::vector<ModelArgumentInfo>& outputs,
        const std::vector<const Memory*>& memories,
        const std::shared_ptr<ExecutionBurstController>& burstController, MeasureTiming measure,
        const std::optional<Deadline>& deadline,
        const OptionalTimeoutDuration& loopTimeoutDuration) const {
    NNTRACE_RT(NNTRACE_PHASE_INPUTS_AND_OUTPUTS, "DriverPreparedModel::execute");

    // Make a copy of the memory tracker as we will append memory pools for pointer arguments.
    std::vector<const Memory*> localMemories = memories;

    // We separate the input & output pools so accelerators only need to copy
    // the contents of the input pools. We could also use it to set protection
    // on read-only memory, but that's not currently done.

    // Layout the input and output data
    const auto [n1, inputPtrArgsMemory, inputPtrArgsLocations] =
            allocatePointerArgumentsToPool(inputs, &localMemories);
    if (n1 != ANEURALNETWORKS_NO_ERROR) {
        return {n1, {}, kNoTiming};
    }
    const auto [n2, outputPtrArgsMemory, outputPtrArgsLocations] =
            allocatePointerArgumentsToPool(outputs, &localMemories);
    if (n2 != ANEURALNETWORKS_NO_ERROR) {
        return {n2, {}, kNoTiming};
    }

    // Copy the input data that was specified via a pointer.
    if (inputPtrArgsMemory != nullptr) {
        uint32_t ptrInputIndex = 0;
        for (const auto& info : inputs) {
            if (info.state() == ModelArgumentInfo::POINTER) {
                const DataLocation& loc = inputPtrArgsLocations[ptrInputIndex++];
                uint8_t* const data = inputPtrArgsMemory->getPointer();
                memcpy(data + loc.offset, info.buffer(), loc.length);
            }
        }
    }

    Request request;
    request.inputs = createRequestArguments(inputs, inputPtrArgsLocations);
    request.outputs = createRequestArguments(outputs, outputPtrArgsLocations);
    uint32_t count = localMemories.size();
    request.pools.resize(count);
    for (uint32_t i = 0; i < count; i++) {
        request.pools[i] = localMemories[i]->getMemoryPool();
    }

    NNTRACE_FULL_SWITCH(NNTRACE_LAYER_IPC, NNTRACE_PHASE_EXECUTION,
                        "DriverPreparedModel::execute::execute");

    int n = ANEURALNETWORKS_OP_FAILED;
    std::vector<OutputShape> outputShapes;
    Timing timing = kNoTiming;

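    // An ExecutionBurstController represents a burst: a reusable fast-path channel to the
    // driver that communicates over fast message queues (FMQs), largely avoiding
    // per-execution HIDL overhead. Bursts only understand the V1_2 request format, hence
    // the compliance check below.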
    // compute using burst if present
    const bool burstCompute = (burstController != nullptr);
    bool burstFallback = true;
    if (burstCompute) {
        const bool compliant = compliantWithV1_2(request);
        if (compliant) {
            V1_0::Request request12 = convertToV1_2(request);
            std::vector<intptr_t> memoryIds;
            memoryIds.reserve(localMemories.size());
            for (const Memory* memory : localMemories) {
                memory->usedBy(burstController);
                memoryIds.push_back(memory->getKey());
            }

            VLOG(EXECUTION) << "Before ExecutionBurstController->compute() "
                            << SHOW_IF_DEBUG(toString(request12));
            std::tie(n, outputShapes, timing, burstFallback) =
                    burstController->compute(request12, measure, memoryIds);
        }
    }

    // compute from IPreparedModel if either:
    // (1) burst was not supplied, or
    // (2) the burst execution failed and requested a fallback execution
    if (!burstCompute || burstFallback) {
        const bool preferSynchronous = DeviceManager::get()->syncExecHal();
        std::tie(n, outputShapes, timing) = mPreparedModel->execute(
                request, measure, deadline, loopTimeoutDuration, preferSynchronous);
    }

    if (n != ANEURALNETWORKS_NO_ERROR) {
        VLOG(EXECUTION) << "**Execution failed**";
        return {n, std::move(outputShapes), timing};
    }

    // Copy the output data from shared memory to the output buffers.
    NNTRACE_RT_SWITCH(NNTRACE_PHASE_RESULTS, "DriverPreparedModel::execute");
    if (outputPtrArgsMemory != nullptr) {
        uint32_t ptrOutputIndex = 0;
        for (const auto& info : outputs) {
            if (info.state() == ModelArgumentInfo::POINTER) {
                const DataLocation& loc = outputPtrArgsLocations[ptrOutputIndex++];
                const uint8_t* const data = outputPtrArgsMemory->getPointer();
                memcpy(info.buffer(), data + loc.offset, loc.length);
            }
        }
    }

    VLOG(EXECUTION) << "DriverPreparedModel::execute completed";
    return {ANEURALNETWORKS_NO_ERROR, std::move(outputShapes), timing};
}

std::tuple<int, int, sp<hal::IFencedExecutionCallback>, hal::Timing>
DriverPreparedModel::executeFenced(
        const std::vector<ModelArgumentInfo>& inputs,
        const std::vector<ModelArgumentInfo>& outputs,
        const std::vector<const Memory*>& memories, const std::vector<int>& waitFor,
        hal::MeasureTiming measure, const std::optional<Deadline>& deadline,
        const OptionalTimeoutDuration& loopTimeoutDuration,
        const hal::OptionalTimeoutDuration& timeoutDurationAfterFence) const {
    NNTRACE_RT(NNTRACE_PHASE_INPUTS_AND_OUTPUTS, "DriverPreparedModel::executeFenced");
    CHECK(std::all_of(waitFor.begin(), waitFor.end(), [](int fd) { return fd > 0; }));
    // Make a copy of the memory tracker as we will append memory pools for pointer arguments.
    std::vector<const Memory*> localMemories = memories;
    sp<hal::IFencedExecutionCallback> executeFencedCallback;
    hal::Timing timing = kNoTiming;

    // We separate the input & output pools so accelerators only need to copy
    // the contents of the input pools. We could also use it to set protection
    // on read-only memory, but that's not currently done.

    // Layout the input and output data
    const auto [n1, inputPtrArgsMemory, inputPtrArgsLocations] =
            allocatePointerArgumentsToPool(inputs, &localMemories);
    if (n1 != ANEURALNETWORKS_NO_ERROR) {
        return {n1, -1, nullptr, timing};
    }
    const auto [n2, outputPtrArgsMemory, outputPtrArgsLocations] =
            allocatePointerArgumentsToPool(outputs, &localMemories);
    if (n2 != ANEURALNETWORKS_NO_ERROR) {
        return {n2, -1, nullptr, timing};
    }

    // Copy the input data that was specified via a pointer.
    if (inputPtrArgsMemory != nullptr) {
        uint32_t ptrInputIndex = 0;
        for (const auto& info : inputs) {
            if (info.state() == ModelArgumentInfo::POINTER) {
                const DataLocation& loc = inputPtrArgsLocations[ptrInputIndex++];
                uint8_t* const data = inputPtrArgsMemory->getPointer();
                memcpy(data + loc.offset, info.buffer(), loc.length);
            }
        }
    }

    Request request;
    request.inputs = createRequestArguments(inputs, inputPtrArgsLocations);
    request.outputs = createRequestArguments(outputs, outputPtrArgsLocations);
    uint32_t count = localMemories.size();
    request.pools.resize(count);
    for (uint32_t i = 0; i < count; i++) {
        request.pools[i] = localMemories[i]->getMemoryPool();
    }

    NNTRACE_FULL_SWITCH(NNTRACE_LAYER_IPC, NNTRACE_PHASE_EXECUTION,
                        "DriverPreparedModel::executeFenced");

    int n = ANEURALNETWORKS_OP_FAILED;
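    // Wrap each wait-for sync fence in a HIDL handle. The fd is duplicated so the caller
    // keeps ownership of the originals, and shouldOwn=true makes the hidl_handle close the
    // dup (and free the native_handle) when it is destroyed after the call.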
    hidl_vec<hidl_handle> waitForHandles;
    waitForHandles.resize(waitFor.size());
    for (uint32_t i = 0; i < waitFor.size(); i++) {
        native_handle_t* nativeHandle = native_handle_create(1, 0);
        if (nativeHandle == nullptr) {
            LOG(ERROR) << "Failed to create native_handle";
            return {n, -1, nullptr, timing};
        }
        int dupFd = dup(waitFor[i]);
        if (dupFd <= 0) {
            LOG(ERROR) << "Unable to dup the file descriptor";
            return {n, -1, nullptr, timing};
        }
        nativeHandle->data[0] = dupFd;
        hidl_handle hidlHandle;
        hidlHandle.setTo(nativeHandle, /*shouldOwn=*/true);
        waitForHandles[i] = std::move(hidlHandle);
    }

    hidl_handle syncFence;
    std::tie(n, syncFence, executeFencedCallback, timing) =
            mPreparedModel->executeFenced(request, waitForHandles, measure, deadline,
                                          loopTimeoutDuration, timeoutDurationAfterFence);

    if (n != ANEURALNETWORKS_NO_ERROR) {
        VLOG(EXECUTION) << "**executeFenced failed**";
        return {n, -1, nullptr, timing};
    }

    int syncFenceFd = -1;
    if (syncFence.getNativeHandle()) {
        syncFenceFd = dup(syncFence.getNativeHandle()->data[0]);
        if (syncFenceFd < 0) {
            LOG(ERROR) << "Failed to dup the file descriptor";
            return {ANEURALNETWORKS_OP_FAILED, -1, nullptr, timing};
        }
    }
    // If an output buffer is provided as a malloc pointer, wait for the execution to finish,
    // then copy the output data from shared memory to the output buffers.
    if (outputPtrArgsMemory != nullptr) {
        NNTRACE_RT_SWITCH(NNTRACE_PHASE_RESULTS, "DriverPreparedModel::executeFenced");
        if (syncFenceFd > 0) {
            auto r = syncWait(syncFenceFd, -1);
            if (r != FenceState::SIGNALED) {
                LOG(ERROR) << "syncWait failed, fd: " << syncFenceFd;
                return {ANEURALNETWORKS_OP_FAILED, syncFenceFd, nullptr, timing};
            }
        }
        uint32_t ptrOutputIndex = 0;
        for (const auto& info : outputs) {
            if (info.state() == ModelArgumentInfo::POINTER) {
                const DataLocation& loc = outputPtrArgsLocations[ptrOutputIndex++];
                const uint8_t* const data = outputPtrArgsMemory->getPointer();
                memcpy(info.buffer(), data + loc.offset, loc.length);
            }
        }
    }

    VLOG(EXECUTION) << "DriverPreparedModel::executeFenced completed";
    return {ANEURALNETWORKS_NO_ERROR, syncFenceFd, executeFencedCallback, timing};
}

// A special abstracted device for the CPU. Only one instance of this class will exist.
// Use get() to retrieve it.
class CpuDevice : public Device {
   public:
    // Returns the singleton CPU fallback device.
    static std::shared_ptr<CpuDevice> get() {
        static std::shared_ptr<CpuDevice> instance(new CpuDevice);
        return instance;
    }

    const std::string& getName() const override { return kName; }
    const std::string& getVersionString() const override { return kVersionString; }
    int64_t getFeatureLevel() const override { return kFeatureLevel; }
    int32_t getType() const override { return ANEURALNETWORKS_DEVICE_CPU; }
    const std::vector<Extension>& getSupportedExtensions() const override {
        return kSupportedExtensions;
    }
    std::vector<bool> getSupportedOperations(const MetaModel& metaModel) const override;
    PerformanceInfo getPerformance(OperandType) const override { return kPerformance; }
    PerformanceInfo getRelaxedFloat32toFloat16PerformanceScalar() const override {
        return kPerformance;
    }
    PerformanceInfo getRelaxedFloat32toFloat16PerformanceTensor() const override {
        return kPerformance;
    }
    PerformanceInfo getIfPerformance() const override { return kPerformance; }
    PerformanceInfo getWhilePerformance() const override { return kPerformance; }
    bool isCachingSupported() const override { return false; }
    int wait() const override { return ANEURALNETWORKS_NO_ERROR; }

    std::pair<int, std::shared_ptr<PreparedModel>> prepareModel(
            const ModelFactory& makeModel, ExecutionPreference preference, Priority priority,
            const std::optional<Deadline>& deadline, const std::string& cacheDir,
            const std::optional<CacheToken>& maybeToken) const override;

    std::pair<int, std::unique_ptr<Memory>> allocate(const MemoryDescriptor& desc,
                                                     OperandType type) const override;

   private:
    CpuDevice() = default;
    const int64_t kFeatureLevel = __ANDROID_API__;
    const std::string kName = "nnapi-reference";
    const std::string kVersionString = build::GetBuildNumber();
    // Since the performance is a ratio compared to the CPU performance,
    // by definition the performance of the CPU is 1.0.
    const PerformanceInfo kPerformance = {.execTime = 1.0f, .powerUsage = 1.0f};
    const std::vector<Extension> kSupportedExtensions{/* No extensions. */};
};

// A special abstracted PreparedModel for the CPU, constructed by CpuDevice.
class CpuPreparedModel : public PreparedModel {
   public:
    // Factory method for CpuPreparedModel. Returns ANEURALNETWORKS_NO_ERROR and
    // a prepared model object if successfully created. Returns an error code
    // and nullptr otherwise.
    static std::pair<int, std::shared_ptr<PreparedModel>> create(Model hidlModel);

    const Device* getDevice() const override { return CpuDevice::get().get(); }
    std::shared_ptr<VersionedIPreparedModel> getInterface() const override { return nullptr; }

    std::tuple<int, std::vector<OutputShape>, Timing> execute(
            const std::vector<ModelArgumentInfo>& inputs,
            const std::vector<ModelArgumentInfo>& outputs,
            const std::vector<const Memory*>& memories,
            const std::shared_ptr<ExecutionBurstController>& burstController,
            MeasureTiming measure, const std::optional<Deadline>& deadline,
            const OptionalTimeoutDuration& loopTimeoutDuration) const override;

    std::shared_ptr<ExecutionBurstController> configureExecutionBurst(
            bool /*preferPowerOverLatency*/) const override {
        return nullptr;
    }

    std::tuple<int, int, sp<hal::IFencedExecutionCallback>, hal::Timing> executeFenced(
            const std::vector<ModelArgumentInfo>& inputs,
            const std::vector<ModelArgumentInfo>& outputs,
            const std::vector<const Memory*>& memories, const std::vector<int>& waitFor,
            MeasureTiming measure, const std::optional<Deadline>& deadline,
            const OptionalTimeoutDuration& loopTimeoutDuration,
            const hal::OptionalTimeoutDuration& timeoutDurationAfterFence) const override;

    // Prefer to use CpuPreparedModel::create.
    CpuPreparedModel(Model model, std::vector<RunTimePoolInfo> poolInfos)
        : mModel(std::move(model)), mModelPoolInfos(std::move(poolInfos)) {}

   private:
    const Model mModel;
    const std::vector<RunTimePoolInfo> mModelPoolInfos;
};

std::vector<bool> CpuDevice::getSupportedOperations(const MetaModel& metaModel) const {
    const Model& hidlModel = metaModel.getModel();
    const size_t count = hidlModel.main.operations.size();
    std::vector<bool> result(count, false);
    for (size_t i = 0; i < count; i++) {
        // TODO(b/119870033): Decide whether and how post-P operations would be supported on CPU.
        // We may want to use the slicer for CpuDevice just as we do for
        // DriverDevice.
        OperationType operationType = hidlModel.main.operations[i].type;
        result[i] = !isExtensionOperationType(operationType) &&
                    operationType != OperationType::OEM_OPERATION;
    }
    return result;
}

std::pair<int, std::shared_ptr<PreparedModel>> CpuDevice::prepareModel(
        const ModelFactory& makeModel, ExecutionPreference preference, Priority priority,
        const std::optional<Deadline>& deadline, const std::string& /*cacheDir*/,
        const std::optional<CacheToken>& maybeToken) const {
    CHECK(!maybeToken.has_value())
            << "Should never call prepareModel with cache information on CpuDevice";

    const Model model = makeModel();
    if (!validateModel(model, ValidationMode::RUNTIME) ||
        !validateExecutionPreference(preference) || !validatePriority(priority)) {
        return {ANEURALNETWORKS_OP_FAILED, nullptr};
    }
    if (hasDeadlinePassed(deadline)) {
        return {ANEURALNETWORKS_MISSED_DEADLINE_PERSISTENT, nullptr};
    }

    return CpuPreparedModel::create(model);
}

std::pair<int, std::unique_ptr<Memory>> CpuDevice::allocate(const MemoryDescriptor& desc,
                                                            OperandType type) const {
    uint32_t size = TypeManager::get()->getSizeOfData(type, desc.dimensions);
    if (size == 0) {
        LOG(ERROR) << "CpuDevice::allocate -- does not support unknown dimensions.";
        return {ANEURALNETWORKS_OP_FAILED, nullptr};
    }
    return MemoryAshmem::create(size);
}

std::pair<int, std::shared_ptr<PreparedModel>> CpuPreparedModel::create(Model hidlModel) {
    std::vector<RunTimePoolInfo> poolInfos;
    if (!setRunTimePoolInfosFromHidlMemories(&poolInfos, hidlModel.pools)) {
        return {ANEURALNETWORKS_UNMAPPABLE, nullptr};
    }

    std::shared_ptr<PreparedModel> preparedModel =
            std::make_shared<CpuPreparedModel>(std::move(hidlModel), std::move(poolInfos));
    return {ANEURALNETWORKS_NO_ERROR, std::move(preparedModel)};
}

static std::tuple<int, std::vector<OutputShape>, Timing> computeOnCpu(
        const Model& model, const Request& request,
        const std::vector<RunTimePoolInfo>& modelPoolInfos,
        const std::vector<RunTimePoolInfo>& requestPoolInfos,
        const std::optional<Deadline>& deadline,
        const OptionalTimeoutDuration& loopTimeoutDuration) {
    NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "computeOnCpu");
    CpuExecutor executor;
    if (loopTimeoutDuration.getDiscriminator() !=
        OptionalTimeoutDuration::hidl_discriminator::none) {
        executor.setLoopTimeout(loopTimeoutDuration.nanoseconds());
    }
    if (deadline.has_value()) {
        executor.setDeadline(*deadline);
    }
    int err = executor.run(model, request, modelPoolInfos, requestPoolInfos);
    const auto& outputShapes = executor.getOutputShapes();
    return {err, outputShapes, kNoTiming};
}

std::tuple<int, int, sp<hal::IFencedExecutionCallback>, hal::Timing>
CpuPreparedModel::executeFenced(const std::vector<ModelArgumentInfo>& inputs,
                                const std::vector<ModelArgumentInfo>& outputs,
                                const std::vector<const Memory*>& memories,
                                const std::vector<int>& waitFor, hal::MeasureTiming measure,
                                const std::optional<Deadline>& deadline,
                                const OptionalTimeoutDuration& loopTimeoutDuration,
                                const hal::OptionalTimeoutDuration& duration) const {
    VLOG(EXECUTION)
            << "CpuPreparedModel::executeFenced wait for sync fences to signal before execution";
    for (int syncFd : waitFor) {
        if (syncFd > 0) {
            auto r = syncWait(syncFd, -1);
            if (r != FenceState::SIGNALED) {
                LOG(ERROR) << "sync wait failed, fd: " << syncFd;
                return {ANEURALNETWORKS_OP_FAILED, -1, nullptr, kNoTiming};
            }
        }
    }

    // Update deadline if the timeout duration is closer than the deadline.
    auto closestDeadline = deadline;
    if (duration.getDiscriminator() != OptionalTimeoutDuration::hidl_discriminator::none) {
        const auto timeoutDurationDeadline = makeDeadline(duration.nanoseconds());
        if (!closestDeadline.has_value() || *closestDeadline > timeoutDurationDeadline) {
            closestDeadline = timeoutDurationDeadline;
        }
    }

    const auto [result, outputShapes, timing] = execute(inputs, outputs, memories, nullptr, measure,
                                                        closestDeadline, loopTimeoutDuration);
    return {result, -1, nullptr, timing};
}

// Perform computation on the NNAPI CPU reference implementation.
//
// Unlike DriverPreparedModel::execute, the NNAPI CPU reference executor lives in the
// same process as the NNAPI runtime and can take raw pointers. We will create as many pools as
// there are inputs/outputs in this method to avoid data copying.
//
// Will choose between sync/async execution according to DeviceManager::mSyncExecCpu.
std::tuple<int, std::vector<OutputShape>, Timing> CpuPreparedModel::execute(
        const std::vector<ModelArgumentInfo>& inputs,
        const std::vector<ModelArgumentInfo>& outputs,
        const std::vector<const Memory*>& memories,
        const std::shared_ptr<ExecutionBurstController>& /*burstController*/,
        MeasureTiming /*measure*/, const std::optional<Deadline>& deadline,
        const OptionalTimeoutDuration& loopTimeoutDuration) const {
    if (hasDeadlinePassed(deadline)) {
        return {ANEURALNETWORKS_MISSED_DEADLINE_PERSISTENT, {}, kNoTiming};
    }

    std::vector<RunTimePoolInfo> requestPoolInfos;
    requestPoolInfos.reserve(memories.size());
    for (const Memory* mem : memories) {
        if (std::optional<RunTimePoolInfo> poolInfo = mem->getRunTimePoolInfo()) {
            requestPoolInfos.emplace_back(*poolInfo);
        } else {
            return {ANEURALNETWORKS_UNMAPPABLE, {}, kNoTiming};
        }
    }
    // Create as many pools as there are inputs / outputs.
    auto fixPointerArguments =
            [&requestPoolInfos](const std::vector<ModelArgumentInfo>& argumentInfos) {
                std::vector<DataLocation> ptrArgsLocations;
                for (const ModelArgumentInfo& argumentInfo : argumentInfos) {
                    if (argumentInfo.state() == ModelArgumentInfo::POINTER) {
                        ptrArgsLocations.push_back(
                                {.poolIndex = static_cast<uint32_t>(requestPoolInfos.size()),
                                 .offset = 0,
                                 .length = argumentInfo.length()});
                        requestPoolInfos.emplace_back(RunTimePoolInfo::createFromExistingBuffer(
                                static_cast<uint8_t*>(argumentInfo.buffer())));
                    }
                }
                return ptrArgsLocations;
            };
    const std::vector<DataLocation> inputPtrArgsLocations = fixPointerArguments(inputs);
    const std::vector<DataLocation> outputPtrArgsLocations = fixPointerArguments(outputs);

    Request request;
    request.inputs = createRequestArguments(inputs, inputPtrArgsLocations);
    request.outputs = createRequestArguments(outputs, outputPtrArgsLocations);
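
    // Note on the asynchronous path below: a worker thread is spawned and joined
    // immediately, so this call still blocks, but the computation itself runs on a
    // separate thread, mirroring the threading behavior of asynchronous execution.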
    if (!DeviceManager::get()->syncExecCpu()) {
        // TODO: use a thread pool
        // TODO(mikie): this could have NNTRACE so we could measure the overhead
        //              of spinning up a new thread.
        std::tuple<int, std::vector<OutputShape>, Timing> result = {};
        std::thread([this, &request, &requestPoolInfos, &deadline, &loopTimeoutDuration, &result] {
            result = computeOnCpu(mModel, request, mModelPoolInfos, requestPoolInfos, deadline,
                                  loopTimeoutDuration);
        }).join();
        return result;
    }

    return computeOnCpu(mModel, request, mModelPoolInfos, requestPoolInfos, deadline,
                        loopTimeoutDuration);
}

DeviceManager* DeviceManager::get() {
    static DeviceManager manager;
    return &manager;
}

std::shared_ptr<Device> DeviceManager::getCpuDevice() {
    return CpuDevice::get();
}

std::shared_ptr<Device> DeviceManager::forTest_makeDriverDevice(const std::string& name,
                                                                const sp<V1_0::IDevice>& device) {
    const DeviceFactory makeDevice = [device](bool /*blocking*/) { return device; };
    const auto driverDevice = DriverDevice::create(name, makeDevice);
    CHECK(driverDevice != nullptr);
    return driverDevice;
}

void DeviceManager::findAvailableDevices() {
    VLOG(MANAGER) << "findAvailableDevices";

    // register driver devices
    const auto names = hardware::getAllHalInstanceNames(V1_0::IDevice::descriptor);
    for (const auto& name : names) {
        VLOG(MANAGER) << "Found interface " << name;
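        // A DeviceFactory may be invoked again later to (re)connect to the service. The
        // blocking variant (getService) waits for the service to start if it is not yet
        // registered, while tryGetService returns null immediately in that case.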
        const DeviceFactory makeDevice = [name](bool blocking) {
            return blocking ? V1_0::IDevice::getService(name) : V1_0::IDevice::tryGetService(name);
        };
        registerDevice(name, makeDevice);
    }

    // register CPU fallback device
    mDevices.push_back(CpuDevice::get());
    mDevicesCpuOnly.push_back(CpuDevice::get());
}

void DeviceManager::registerDevice(const std::string& name, const DeviceFactory& makeDevice) {
    if (auto device = DriverDevice::create(name, makeDevice)) {
        mDevices.push_back(std::move(device));
    }
}

DeviceManager::DeviceManager() {
    VLOG(MANAGER) << "DeviceManager::DeviceManager";
    findAvailableDevices();
#ifdef NN_DEBUGGABLE
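    // These debug knobs come from Android system properties, e.g.
    // "adb shell setprop debug.nn.partition 2"; they are read once, when the process
    // first constructs its DeviceManager singleton.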
    mStrictSlicing = (getProp("debug.nn.strict-slicing") != 0);
    mPartitioning = getProp("debug.nn.partition", kPartitioningDefault);
    mDebugNNCpuOnly = (getProp("debug.nn.cpuonly") != 0);
    mSyncExecCpu = (getProp("debug.nn.syncexec-cpu", 1) != 0);
    if (!mSyncExecHalSetter) {
        mSyncExecHal = (getProp("debug.nn.syncexec-hal", 1) != 0);
    }
    mSyncExecRuntime = (getProp("debug.nn.syncexec-runtime") != 0);
#endif  // NN_DEBUGGABLE
}

}  // namespace nn
}  // namespace android