在SYCL device调用DNNL library纯C++文件实现

之前遇到的DNNl library加速库的UT无论是CPU还是GPU都是以benchdnn的方式进行结果正确性测试,在深度学习框架调用dnnl相应API。但是有些结果的数值正确性却无法使用benchdnn完全脱离框架复现,因此这里以batch norm op的forward和backward为例,以纯c++的方式完成UT的书写。

需要环境:

  • SYCL backend (gen9 machine)
  • computecpp (version 1.0.4)
  • dnnl (dev-v2 branch)

code

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
#include "mkldnn.hpp"
#include <CL/sycl.hpp>

// namespace sycl = cl::sycl;
using namespace mkldnn;

void BN_TEST() {
float epsilon = 0.001;
int32_t N = 2;
int32_t C = 2;
int32_t H = 2;
int32_t W = 2;
int32_t all_element = N * C * H * W;

auto data_t = memory::data_type::bf16;
auto dnnl_format = memory::format_tag::nchw;

memory::dims input_dims = {N,C,H,W};
memory::dims weight_dims = {C};
memory::dims bias_dims = {C};
memory::dims output_dims = {N,C,H,W};

auto input_md = memory::desc({input_dims}, data_t, dnnl_format);

auto propagation = prop_kind::forward_training;
normalization_flags flags = normalization_flags::use_scale_shift;
auto bnorm_fwd_d = batch_normalization_forward::desc(propagation, input_md, epsilon, flags);
auto engine = mkldnn::engine(mkldnn::engine::kind::gpu, 0);
auto bnorm_fwd_pd = batch_normalization_forward::primitive_desc(bnorm_fwd_d, engine);

auto input_usr_memory = memory({{{input_dims}, data_t, dnnl_format}, engine});
auto output_usr_memory = memory({{{output_dims}, data_t, dnnl_format}, engine});
auto weight_bias_memory = memory(bnorm_fwd_pd.weights_desc(), engine);
auto mean_memory = memory(bnorm_fwd_pd.mean_desc(), engine);
auto var_memory = memory(bnorm_fwd_pd.variance_desc(), engine);

// sycl_set_mkldnn_buffer
cl::sycl::buffer<unsigned short> buff_input(cl::sycl::range<1>(16));
{
auto ba = buff_input.get_access<cl::sycl::access::mode::write>();
// Convert float to unsigned short (bf16)
for (size_t i = 0; i < 16; i++) {
float src = 1;
uint32_t res = 0;
std::memcpy(&res, &src, sizeof(res));
ba[i] = res >> 16;
}
}
input_usr_memory.template set_sycl_buffer<unsigned short, 1>(buff_input);

cl::sycl::buffer<unsigned short> buff_output(cl::sycl::range<1>(16));
output_usr_memory.template set_sycl_buffer<unsigned short, 1>(buff_output);

cl::sycl::buffer<float> buff_weight(cl::sycl::range<1>(2 * C));
{
auto ba = buff_weight.get_access<cl::sycl::access::mode::write>();
ba[0] = 1;
ba[1] = 1;
ba[2] = 0;
ba[3] = 0;
// for (size_t i = 0; i < 4; i++) {
// ba[i] = i;
// }
}
weight_bias_memory.template set_sycl_buffer<float, 1>(buff_weight);

cl::sycl::buffer<float> buff_mean(cl::sycl::range<1>(2));
mean_memory.template set_sycl_buffer<float, 1>(buff_mean);

cl::sycl::buffer<float> buff_var(cl::sycl::range<1>(2));
var_memory.template set_sycl_buffer<float, 1>(buff_var);

std::shared_ptr<mkldnn::primitive> bn_fwd;
auto strm = mkldnn::stream(engine);
bn_fwd.reset(new batch_normalization_forward(bnorm_fwd_pd));

std::unordered_map<int, mkldnn::memory> args = {
{MKLDNN_ARG_SRC, input_usr_memory},
{MKLDNN_ARG_DST, output_usr_memory},
{MKLDNN_ARG_SCALE_SHIFT, weight_bias_memory},
{MKLDNN_ARG_MEAN, mean_memory},
{MKLDNN_ARG_VARIANCE, var_memory},
};

bn_fwd->execute(strm, args);

// TEST Forward
auto input_acc = buff_input.get_access<cl::sycl::access::mode::read>();
printf("in ( ");
for (int i = 0; i < 16; i++) {
float res = 0;
uint32_t tmp = input_acc[i];
tmp <<= 16;
std::memcpy(&res, &tmp, sizeof(tmp));
printf("%f ", res);
// printf("%f ", (float)input_acc[i]);
}
printf(")\n");

auto weight_acc = buff_weight.get_access<cl::sycl::access::mode::read>();
printf("weight ( ");
for (int i = 0; i < 2 * 2; i++) {
printf("%f ", weight_acc[i]);
}
printf(")\n");

auto mean_acc = buff_mean.get_access<cl::sycl::access::mode::read>();
printf("mean ( ");
for (int i = 0; i < 2; i++) {
printf("%f ", mean_acc[i]);
}
printf(")\n");

auto var_acc = buff_var.get_access<cl::sycl::access::mode::read>();
printf("var ( ");
for (int i = 0; i < 2; i++) {
printf("%f ", var_acc[i]);
}
printf(")\n");

auto out_acc = buff_output.get_access<cl::sycl::access::mode::read>();
printf("out ( ");
for (int i = 0; i < 16; i++) {
// Convert unsigned short (bf16) to float
float res = 0;
uint32_t tmp = out_acc[i];
tmp <<= 16;
std::memcpy(&res, &tmp, sizeof(tmp));
printf("%f ", res);
// printf("%d ", out_acc[i]);
}
printf(")\n");


auto pk_bwd = prop_kind::backward;
auto grad_output_md = memory::desc({input_dims}, data_t, dnnl_format);
auto bwd_desc = batch_normalization_backward::desc(pk_bwd, grad_output_md,
input_md, epsilon, flags);
auto bn_bwd_pd = batch_normalization_backward::primitive_desc(
bwd_desc, engine, bnorm_fwd_pd);

auto grad_input_memory =
memory({{{input_dims}, data_t, dnnl_format}, engine});
auto grad_output_memory = memory({{{output_dims}, data_t, dnnl_format}, engine});
auto grad_weight_bias_memory = memory(bn_bwd_pd.diff_weights_desc(), engine);

// sycl_set_mkldnn_buffer grad_input_memory
cl::sycl::buffer<unsigned short> buff_grad_input(cl::sycl::range<1>(16));
grad_input_memory.template set_sycl_buffer<unsigned short, 1>(buff_grad_input);

// sycl_set_mkldnn_buffer grad_output_memory
cl::sycl::buffer<unsigned short> buff_grad_output(cl::sycl::range<1>(16));
{
auto ba = buff_grad_output.get_access<cl::sycl::access::mode::write>();
for (size_t i = 0; i < 16; i++) {
float src = 1;
uint32_t res = 0;
std::memcpy(&res, &src, sizeof(res));
ba[i] = res >> 16;
// ba[i] = 1;
}
}
grad_output_memory.template set_sycl_buffer<unsigned short, 1>(buff_grad_output);

// sycl_set_mkldnn_buffer grad_weight_bias_memory
cl::sycl::buffer<float> buff_grad_weight_bias(cl::sycl::range<1>(2 * 2));
grad_weight_bias_memory.template set_sycl_buffer<float, 1>(buff_grad_weight_bias);

std::shared_ptr<mkldnn::primitive> bn_bwd;
bn_bwd.reset(new batch_normalization_backward(bn_bwd_pd));

std::unordered_map<int, memory> bwd_args = {
{MKLDNN_ARG_SRC, input_usr_memory},
{MKLDNN_ARG_SCALE_SHIFT, weight_bias_memory},
{MKLDNN_ARG_MEAN, mean_memory},
{MKLDNN_ARG_VARIANCE, var_memory},
{MKLDNN_ARG_DIFF_DST, grad_output_memory},
{MKLDNN_ARG_DIFF_SRC, grad_input_memory},
{MKLDNN_ARG_DIFF_SCALE_SHIFT, grad_weight_bias_memory},
};
bn_bwd->execute(strm, bwd_args);

auto grad_input_acc =
buff_grad_input.get_access<cl::sycl::access::mode::read>();
printf("grad input ( ");
for (int i = 0; i < 16; i++) {
float res = 0;
uint32_t tmp = grad_input_acc[i];
tmp <<= 16;
std::memcpy(&res, &tmp, sizeof(tmp));
printf("%f ", res);
// printf("%f ", grad_input_acc[i]);
}
printf(")\n");

auto grad_weight_bias_acc =
buff_grad_weight_bias.get_access<cl::sycl::access::mode::read>();
printf("grad weight bais ( ");
for (int i = 0; i < 2 * 2; i++) {
// float res = 0;
// uint32_t tmp = grad_weight_bias_acc[i];
// tmp <<= 16;
// std::memcpy(&res, &tmp, sizeof(tmp));
// printf("%f ", res);
printf("%f ", grad_weight_bias_acc[i]);
}
printf(")\n");
}

int main(){
BN_TEST();
printf("BN_TEST DONE!\n");
return 0;
}

Build

1
g++ -g -o bn -std=c++11 -I/home/huang/mkl-dnn/build/include -I/home/huang/mkl-dnn/include -I/usr/local/computecpp/include test_bn.cc -L/usr/local/computecpp/lib/ -L/home/huang/mkl-dnn/build/src/ -ldnnl -ldl -lComputeCpp 2>&1 | tee log

Run

1
2
3
4
5
6
7
8
9
10
$ export MKLDNN_VERBOSE=1
(base) huang@mlt:~/lnorm_test/bn$ ./bn
dnnl_verbose,info,DNNL v1.91.0 (commit 9d45cad761dc4dda9ebb34c601fe546b1a9edc5e)
dnnl_verbose,info,cpu,runtime:SYCL
dnnl_verbose,info,cpu,isa:Intel AVX2
dnnl_verbose,info,gpu,runtime:SYCL
dnnl_verbose,info,cpu,engine,0,name:Intel(R) Core(TM) i7-8700K CPU @ 3.70GHz,driver_version:1.2.0
dnnl_verbose,info,gpu,engine,0,name:Intel(R) Gen9 HD Graphics NEO,driver_version:19.20.13008
dnnl_verbose,exec,gpu,batch_normalization,ocl:ref:any,forward_inference,data_bf16::blocked:abcd:f0 diff_undef::undef::f0,,flags:S,mb2ic2ih2iw2,1.59204
dnnl_verbose,exec,gpu,batch_normalization,ocl:ref:any,backward,data_bf16::blocked:abcd:f0 diff_bf16::blocked:abcd:f0,,flags:S,mb2ic2ih2iw2,1.1311

link:
https://intel.github.io/mkl-dnn/dev_guide_batch_normalization.html
https://intel.github.io/mkl-dnn/dev_guide_data_types.html
https://github.com/intel/mkl-dnn/tree/master/tests/benchdnn