Back to Ncnn

Vulkan Notes

docs/how-to-use-and-FAQ/vulkan-notes.md

latest · 3.6 KB
Original Source

supported platform

  • Y = known work
  • ? = shall work, not confirmed
  • / = not applied
|        | windows | linux | android | mac | ios |
|--------|---------|-------|---------|-----|-----|
| intel  | Y       | Y     | Y       | Y   | /   |
| amd    | Y       | Y     | /       | Y   | /   |
| nvidia | Y       | Y     | ?       | /   | /   |
| qcom   | /       | /     | Y       | /   | /   |
| apple  | /       | /     | /       | Y   | Y   |
| arm    | /       | Y     | Y       | /   | /   |

enable vulkan compute support

$ cmake -DNCNN_VULKAN=ON ..

enable vulkan compute inference

cpp
ncnn::Net net;
net.opt.use_vulkan_compute = 1;

proper allocator usage

cpp
ncnn::VkAllocator* blob_vkallocator = vkdev.acquire_blob_allocator();
ncnn::VkAllocator* staging_vkallocator = vkdev.acquire_staging_allocator();

net.opt.blob_vkallocator = blob_vkallocator;
net.opt.workspace_vkallocator = blob_vkallocator;
net.opt.staging_vkallocator = staging_vkallocator;

// ....

// after inference
vkdev.reclaim_blob_allocator(blob_vkallocator);
vkdev.reclaim_staging_allocator(staging_vkallocator);

select gpu device

cpp
// get gpu count
int gpu_count = ncnn::get_gpu_count();

// set specified vulkan device before loading param and model
net.set_vulkan_device(0); // use device-0
net.set_vulkan_device(1); // use device-1

// or set opt.vulkan_device_index field before loading param and model
net.opt.vulkan_device_index = 0; // use device-0
net.opt.vulkan_device_index = 1; // use device-1

zero-copy on unified memory device

cpp
ncnn::VkMat blob_gpu;
ncnn::Mat mapped = blob_gpu.mapped();

// use mapped.data directly

hybrid cpu/gpu inference

cpp
ncnn::Net net_cpu;
ncnn::Net net_gpu;
net_cpu.opt.use_vulkan_compute = false;
net_gpu.opt.use_vulkan_compute = true;
net_cpu.load_param();
net_cpu.load_model();
net_gpu.load_param();
net_gpu.load_model();

ncnn::Extractor ex_cpu = net_cpu.create_extractor();
ncnn::Extractor ex_gpu = net_gpu.create_extractor();

#pragma omp parallel sections
{
    #pragma omp section
    {
        ex_cpu.input();
        ex_cpu.extract();
    }
    #pragma omp section
    {
        ex_gpu.input();
        ex_gpu.extract();
    }
}

zero-copy gpu inference chaining

cpp
ncnn::Extractor ex1 = net1.create_extractor();
ncnn::Extractor ex2 = net2.create_extractor();

ncnn::VkCompute cmd(&vkdev);

ncnn::VkMat conv1;
ncnn::VkMat conv2;
ncnn::VkMat conv3;

ex1.input("conv1", conv1);
ex1.extract("conv2", conv2, cmd);

ex2.input("conv2", conv2);
ex2.extract("conv3", conv3, cmd);

cmd.submit_and_wait();

batch inference

cpp
int max_batch_size = vkdev->info.compute_queue_count();

ncnn::Mat inputs[1000];
ncnn::Mat outputs[1000];

#pragma omp parallel for num_threads(max_batch_size)
for (int i=0; i<1000; i++)
{
    ncnn::Extractor ex = net1.create_extractor();
    ex.input("data", inputs[i]);
    ex.extract("prob", outputs[i]);
}

control storage and arithmetic precision

disable all lower-precision optimizations, get full fp32 precision

cpp
ncnn::Net net;
net.opt.use_fp16_packed = false;
net.opt.use_fp16_storage = false;
net.opt.use_fp16_arithmetic = false;
net.opt.use_int8_storage = false;
net.opt.use_int8_arithmetic = false;

debugging tips

cpp
#define ENABLE_VALIDATION_LAYER 1 // modify to 1 in gpu.cpp

add vulkan compute support to layer

  1. add vulkan shader in src/layer/shader/

  2. upload model weight data in Layer::upload_model()

  3. setup pipeline in Layer::create_pipeline()

  4. destroy pipeline in Layer::destroy_pipeline()

  5. record command in Layer::forward()

add optimized shader path

  1. add vulkan shader in src/layer/shader/ named XXX_abc.comp

  2. create pipeline with "XXX_abc"

  3. record command using XXX_abc pipeline

low-level op api

  1. create layer

  2. load param and load model

  3. upload model

  4. create pipeline

  5. new command

  6. record

  7. submit and wait