在SYCL device调用DNNL library纯C++文件实现

2020-03-25 GPU DNNL, SYCL 0 评论

之前遇到的DNNl library加速库的UT无论是CPU还是GPU都是以benchdnn的方式进行结果正确性测试，在深度学习框架调用dnnl相应API。但是有些结果的数值正确性却无法使用benchdnn完全脱离框架复现，因此这里以batch norm op的forward和backward为例，以纯c++的方式完成UT的书写。

需要环境：

SYCL backend (gen9 machine)
computecpp (version 1.0.4)
dnnl (dev-v2 branch)

code

#include "mkldnn.hpp"
#include <CL/sycl.hpp>

// namespace sycl = cl::sycl;
using namespace mkldnn;

void BN_TEST() {
    float epsilon = 0.001;
    int32_t N = 2;
    int32_t C = 2;
    int32_t H = 2;
    int32_t W = 2;
    int32_t all_element =  N * C * H * W;

    auto data_t = memory::data_type::bf16;
    auto dnnl_format = memory::format_tag::nchw;

    memory::dims input_dims = {N,C,H,W};
    memory::dims weight_dims = {C};
    memory::dims bias_dims = {C};
    memory::dims output_dims = {N,C,H,W};

    auto input_md = memory::desc({input_dims}, data_t, dnnl_format);

    auto propagation = prop_kind::forward_training;
    normalization_flags flags = normalization_flags::use_scale_shift;
    auto bnorm_fwd_d = batch_normalization_forward::desc(propagation, input_md, epsilon, flags);
    auto engine = mkldnn::engine(mkldnn::engine::kind::gpu, 0);
    auto bnorm_fwd_pd = batch_normalization_forward::primitive_desc(bnorm_fwd_d, engine);

    auto input_usr_memory = memory({{{input_dims}, data_t, dnnl_format}, engine});
    auto output_usr_memory = memory({{{output_dims}, data_t, dnnl_format}, engine});
    auto weight_bias_memory = memory(bnorm_fwd_pd.weights_desc(), engine);
    auto mean_memory = memory(bnorm_fwd_pd.mean_desc(), engine);
    auto var_memory = memory(bnorm_fwd_pd.variance_desc(), engine);

    // sycl_set_mkldnn_buffer
    cl::sycl::buffer<unsigned short> buff_input(cl::sycl::range<1>(16));
    {
        auto ba = buff_input.get_access<cl::sycl::access::mode::write>();
        // Convert float to unsigned short (bf16)
        for (size_t i = 0; i < 16; i++) {
          float src = 1;
          uint32_t res = 0;
          std::memcpy(&res, &src, sizeof(res));
          ba[i] = res >> 16;
        }
    }
    input_usr_memory.template set_sycl_buffer<unsigned short, 1>(buff_input);

    cl::sycl::buffer<unsigned short> buff_output(cl::sycl::range<1>(16));
    output_usr_memory.template set_sycl_buffer<unsigned short, 1>(buff_output);

    cl::sycl::buffer<float> buff_weight(cl::sycl::range<1>(2 * C));
    {
        auto ba = buff_weight.get_access<cl::sycl::access::mode::write>();
        ba[0] = 1;
        ba[1] = 1;
        ba[2] = 0;
        ba[3] = 0;
        // for (size_t i = 0; i < 4; i++) {
        //   ba[i] = i;
        // }
    }
    weight_bias_memory.template set_sycl_buffer<float, 1>(buff_weight);

    cl::sycl::buffer<float> buff_mean(cl::sycl::range<1>(2));
    mean_memory.template set_sycl_buffer<float, 1>(buff_mean);

    cl::sycl::buffer<float> buff_var(cl::sycl::range<1>(2));
    var_memory.template set_sycl_buffer<float, 1>(buff_var);

    std::shared_ptr<mkldnn::primitive> bn_fwd;
    auto strm = mkldnn::stream(engine);
    bn_fwd.reset(new batch_normalization_forward(bnorm_fwd_pd));

    std::unordered_map<int, mkldnn::memory> args = {
        {MKLDNN_ARG_SRC, input_usr_memory},
        {MKLDNN_ARG_DST, output_usr_memory},
        {MKLDNN_ARG_SCALE_SHIFT, weight_bias_memory},
        {MKLDNN_ARG_MEAN, mean_memory},
        {MKLDNN_ARG_VARIANCE, var_memory},
    };

    bn_fwd->execute(strm, args);

    // TEST Forward
    auto input_acc = buff_input.get_access<cl::sycl::access::mode::read>();
    printf("in ( ");
    for (int i = 0; i < 16; i++) {
      float res = 0;
      uint32_t tmp = input_acc[i];
      tmp <<= 16;
      std::memcpy(&res, &tmp, sizeof(tmp));
      printf("%f ", res);
      // printf("%f ", (float)input_acc[i]);
    }
    printf(")\n");

    auto weight_acc = buff_weight.get_access<cl::sycl::access::mode::read>();
    printf("weight ( ");
    for (int i = 0; i < 2 * 2; i++) {
      printf("%f ", weight_acc[i]);
    }
    printf(")\n");

    auto mean_acc = buff_mean.get_access<cl::sycl::access::mode::read>();
    printf("mean ( ");
    for (int i = 0; i < 2; i++) {
      printf("%f ", mean_acc[i]);
    }
    printf(")\n");

    auto var_acc = buff_var.get_access<cl::sycl::access::mode::read>();
    printf("var ( ");
    for (int i = 0; i < 2; i++) {
      printf("%f ", var_acc[i]);
    }
    printf(")\n");

    auto out_acc = buff_output.get_access<cl::sycl::access::mode::read>();
    printf("out ( ");
    for (int i = 0; i < 16; i++) {
      // Convert unsigned short (bf16) to float
      float res = 0;
      uint32_t tmp = out_acc[i];
      tmp <<= 16;
      std::memcpy(&res, &tmp, sizeof(tmp));
      printf("%f ", res);
      // printf("%d ", out_acc[i]);
    }
    printf(")\n");


  auto pk_bwd = prop_kind::backward;
  auto grad_output_md = memory::desc({input_dims}, data_t, dnnl_format);
  auto bwd_desc = batch_normalization_backward::desc(pk_bwd, grad_output_md,
                                                     input_md, epsilon, flags);
  auto bn_bwd_pd = batch_normalization_backward::primitive_desc(
      bwd_desc, engine, bnorm_fwd_pd);

  auto grad_input_memory =
      memory({{{input_dims}, data_t, dnnl_format}, engine});
  auto grad_output_memory = memory({{{output_dims}, data_t, dnnl_format}, engine});
  auto grad_weight_bias_memory = memory(bn_bwd_pd.diff_weights_desc(), engine);

  // sycl_set_mkldnn_buffer grad_input_memory
  cl::sycl::buffer<unsigned short> buff_grad_input(cl::sycl::range<1>(16));
  grad_input_memory.template set_sycl_buffer<unsigned short, 1>(buff_grad_input);

  // sycl_set_mkldnn_buffer grad_output_memory
  cl::sycl::buffer<unsigned short> buff_grad_output(cl::sycl::range<1>(16));
  {
    auto ba = buff_grad_output.get_access<cl::sycl::access::mode::write>();
    for (size_t i = 0; i < 16; i++) {
      float src = 1;
      uint32_t res = 0;
      std::memcpy(&res, &src, sizeof(res));
      ba[i] = res >> 16;
      // ba[i] = 1;
    }
  }
  grad_output_memory.template set_sycl_buffer<unsigned short, 1>(buff_grad_output);

  // sycl_set_mkldnn_buffer grad_weight_bias_memory
  cl::sycl::buffer<float> buff_grad_weight_bias(cl::sycl::range<1>(2 * 2));
  grad_weight_bias_memory.template set_sycl_buffer<float, 1>(buff_grad_weight_bias);

  std::shared_ptr<mkldnn::primitive> bn_bwd;
  bn_bwd.reset(new batch_normalization_backward(bn_bwd_pd));

  std::unordered_map<int, memory> bwd_args = {
      {MKLDNN_ARG_SRC, input_usr_memory},
      {MKLDNN_ARG_SCALE_SHIFT, weight_bias_memory},
      {MKLDNN_ARG_MEAN, mean_memory},
      {MKLDNN_ARG_VARIANCE, var_memory},
      {MKLDNN_ARG_DIFF_DST, grad_output_memory},
      {MKLDNN_ARG_DIFF_SRC, grad_input_memory},
      {MKLDNN_ARG_DIFF_SCALE_SHIFT, grad_weight_bias_memory},
  };
  bn_bwd->execute(strm, bwd_args);

  auto grad_input_acc =
      buff_grad_input.get_access<cl::sycl::access::mode::read>();
  printf("grad input ( ");
  for (int i = 0; i < 16; i++) {
    float res = 0;
    uint32_t tmp = grad_input_acc[i];
    tmp <<= 16;
    std::memcpy(&res, &tmp, sizeof(tmp));
    printf("%f ", res);
    // printf("%f ", grad_input_acc[i]);
  }
  printf(")\n");

  auto grad_weight_bias_acc =
      buff_grad_weight_bias.get_access<cl::sycl::access::mode::read>();
  printf("grad weight bais ( ");
  for (int i = 0; i < 2 * 2; i++) {
    // float res = 0;
    // uint32_t tmp = grad_weight_bias_acc[i];
    // tmp <<= 16;
    // std::memcpy(&res, &tmp, sizeof(tmp));
    // printf("%f ", res);
    printf("%f ", grad_weight_bias_acc[i]);
  }
  printf(")\n");
}

int main(){
  BN_TEST();
  printf("BN_TEST DONE!\n");
  return 0;
}

Build

g++ -g -o bn -std=c++11 -I/home/huang/mkl-dnn/build/include -I/home/huang/mkl-dnn/include -I/usr/local/computecpp/include test_bn.cc -L/usr/local/computecpp/lib/ -L/home/huang/mkl-dnn/build/src/ -ldnnl -ldl -lComputeCpp 2>&1 | tee log

Run

$ export MKLDNN_VERBOSE=1
(base) huang@mlt:~/lnorm_test/bn$ ./bn
dnnl_verbose,info,DNNL v1.91.0 (commit 9d45cad761dc4dda9ebb34c601fe546b1a9edc5e)
dnnl_verbose,info,cpu,runtime:SYCL
dnnl_verbose,info,cpu,isa:Intel AVX2
dnnl_verbose,info,gpu,runtime:SYCL
dnnl_verbose,info,cpu,engine,0,name:Intel(R) Core(TM) i7-8700K CPU @ 3.70GHz,driver_version:1.2.0
dnnl_verbose,info,gpu,engine,0,name:Intel(R) Gen9 HD Graphics NEO,driver_version:19.20.13008
dnnl_verbose,exec,gpu,batch_normalization,ocl:ref:any,forward_inference,data_bf16::blocked:abcd:f0 diff_undef::undef::f0,,flags:S,mb2ic2ih2iw2,1.59204
dnnl_verbose,exec,gpu,batch_normalization,ocl:ref:any,backward,data_bf16::blocked:abcd:f0 diff_bf16::blocked:abcd:f0,,flags:S,mb2ic2ih2iw2,1.1311

link:
https://intel.github.io/mkl-dnn/dev_guide_batch_normalization.html
https://intel.github.io/mkl-dnn/dev_guide_data_types.html
https://github.com/intel/mkl-dnn/tree/master/tests/benchdnn

本文链接： http://huangzhiyuan.github.io/2020/03/25/batchnorm-sycl-mkldnn-ut/

版权声明： 本博客所有文章除特别声明外，均采用 CC BY 4.0 CN协议许可协议。转载请注明出处！

黄志远

研发攻城狮

在SYCL device调用DNNL library纯C++文件实现

code

Build

Run

黄志远研发攻城狮