Copy performance issue on ATS GPU

Copy performance with different data types.
We hit a performance issue on ATS when copying a contiguous buffer of u16 elements: it is almost 2x slower than copying a buffer of the same byte length as u32, and copying it as u8 is slower still. Below is a test case that copies the same buffer using the u32, u16, and u8 data types.

// Build with the DPC++ compiler:
//   clang++ -std=c++14 -fsycl -O3 copy.cpp -o copy
//
// Run:
//   ./copy 65536 512
//
#include <iostream>
#include <cmath>
#include <chrono>
#include "CL/sycl.hpp"

// Timer helpers
#define _(x) x
#define __tstart(n) _(std::chrono::high_resolution_clock::time_point __s##n = \
    std::chrono::high_resolution_clock::now());
#define __tend(n) \
    _(std::chrono::high_resolution_clock::time_point __e##n = \
        std::chrono::high_resolution_clock::now()); \
    _(printf("time: %s, %.2f ms\n", #n, \
        std::chrono::duration<float, std::milli>(__e##n - __s##n).count()));

namespace sycl = cl::sycl;

int main(int argc, char * argv[])
{
  if (argc < 3) {
    std::cerr << "Usage: copy <M length> <N length>" << std::endl;
    return argc;
  }

  size_t M = std::atoi(argv[1]);
  size_t N = std::atoi(argv[2]);
  if (N % 4 != 0) {
    std::cerr << "Usage: N must be a multiple of 4" << std::endl;
    return 1;
  }

  sycl::queue q(sycl::default_selector{});
  auto ctx = q.get_context();
  auto dev = q.get_device();

  // One pair of M*N-byte buffers, reinterpreted as u32/u16 views of the same bytes.
  uint8_t *Y = static_cast<uint8_t*>(sycl::malloc_shared(M * N, dev, ctx));
  uint8_t *Z = static_cast<uint8_t*>(sycl::malloc_shared(M * N, dev, ctx));
  uint32_t *Z32 = (uint32_t*)Z;
  uint32_t *Y32 = (uint32_t*)Y;
  uint16_t *Z16 = (uint16_t*)Z;
  uint16_t *Y16 = (uint16_t*)Y;
  size_t N32 = N / 4;
  size_t N16 = N / 2;

  for (size_t i = 0; i < M * N; i++) {
    Y[i] = i % 255;
  }

  // warmup
  {
    for (size_t j = 0; j < 5; j++) {
      q.submit([&](sycl::handler& h) {
        h.parallel_for<class u32_warmup>(sycl::range<2>{M, N32}, [=](sycl::id<2> it) {
          const int m = it[0];
          const int n = it[1];
          Z32[m * N32 + n] = Y32[m * N32 + n];
        });
      });
      q.wait();
    }
  }

  __tstart(copy_as_u32);
  {
    for (size_t j = 0; j < 100; j++) {
      q.submit([&](sycl::handler& h) {
        h.parallel_for<class u32>(sycl::range<2>{M, N32}, [=](sycl::id<2> it) {
          const int m = it[0];
          const int n = it[1];
          Z32[m * N32 + n] = Y32[m * N32 + n];
        });
      });
      q.wait();
    }
  }
  __tend(copy_as_u32);

  // warmup
  {
    for (size_t j = 0; j < 5; j++) {
      q.submit([&](sycl::handler& h) {
        h.parallel_for<class u16_warmup>(sycl::range<2>{M, N16}, [=](sycl::id<2> it) {
          const int m = it[0];
          const int n = it[1];
          Z16[m * N16 + n] = Y16[m * N16 + n];
        });
      });
      q.wait();
    }
  }

  __tstart(copy_as_u16);
  {
    for (size_t j = 0; j < 100; j++) {
      q.submit([&](sycl::handler& h) {
        h.parallel_for<class u16>(sycl::range<2>{M, N16}, [=](sycl::id<2> it) {
          const int m = it[0];
          const int n = it[1];
          Z16[m * N16 + n] = Y16[m * N16 + n];
        });
      });
      q.wait();
    }
  }
  __tend(copy_as_u16);

  // warmup
  {
    for (size_t j = 0; j < 5; j++) {
      q.submit([&](sycl::handler& h) {
        h.parallel_for<class u8_warmup>(sycl::range<2>{M, N}, [=](sycl::id<2> it) {
          const int m = it[0];
          const int n = it[1];
          Z[m * N + n] = Y[m * N + n];
        });
      });
      q.wait();
    }
  }

  __tstart(copy_as_u8);
  {
    for (size_t j = 0; j < 100; j++) {
      q.submit([&](sycl::handler& h) {
        h.parallel_for<class u8>(sycl::range<2>{M, N}, [=](sycl::id<2> it) {
          const int m = it[0];
          const int n = it[1];
          Z[m * N + n] = Y[m * N + n];
        });
      });
      q.wait();
    }
  }
  __tend(copy_as_u8);

  sycl::free(Y, ctx);
  sycl::free(Z, ctx);
  return 0;
}

Build with DPC++:

$ clang++ -std=c++14 -fsycl -O3 copy.cpp -o copy

Run the benchmark on ATS (480 EU):

$ ./copy 65536 512
time: copy_as_u32, 37.74 ms
time: copy_as_u16, 64.26 ms
time: copy_as_u8, 114.77 ms

With 100 iterations per data type, copy_as_u16 is about 1.7x slower than copy_as_u32, and copy_as_u8 is about 3x slower than copy_as_u32.
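To put those times in perspective, here is a back-of-the-envelope effective-bandwidth calculation of my own (an assumption on top of the report: M*N bytes read plus M*N bytes written per iteration, 100 iterations, using the times printed above):

// Hypothetical helper, not part of the bench: converts the measured times above
// into approximate effective bandwidth for the 65536 x 512 case.
#include <cstdio>

int main() {
    const double bytes = 65536.0 * 512.0 * 2.0 * 100.0;   // read + write, 100 iterations
    const double ms[]  = {37.74, 64.26, 114.77};           // copy_as_u32 / u16 / u8 on ATS
    const char *name[] = {"copy_as_u32", "copy_as_u16", "copy_as_u8"};
    for (int i = 0; i < 3; i++)
        printf("%s: ~%.0f GB/s\n", name[i], bytes / (ms[i] * 1e-3) / 1e9);
    return 0;
}

This works out to roughly 178 GB/s for copy_as_u32 versus roughly 104 GB/s and 58 GB/s for the u16 and u8 variants.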

Checking the generated code, copy_as_u32 is in a fairly compact form:

Address     Source Line Assembly
0x9a0 0 send.dc1 (16|M0) r45 r41 null 0x0 0x4205E01 {$5} [, msg-length:2, resp-length:2, header:no, func-control:5e01]
0x9b0 0 sync.nop null {Compacted, I@1}
0x9b8 0 send.dc1 (16|M16) r47 r43 null 0x0 0x4205E01 {$6} [, msg-length:2, resp-length:2, header:no, func-control:5e01]
...
0xde8 0 send.dc1 (16|M0) null r81 r45 0x80 0x4025E02 {$5} [, msg-length:2, resp-length:0, header:no, func-control:25e02]
0xdf8 0 sync.nop null {Compacted, I@1}
0xe00 0 send.dc1 (16|M16) null r83 r47 0x80 0x4025E02 {$6} [, msg-length:2, resp-length:0, header:no, func-control:25e02]

For copy_as_u16, however, the generated code uses only the low 16 bits of each register and leaves the high 16 bits empty. It also issues twice as many sends, plus additional movs:

Address     Source Line Assembly
0x9a0 0 send.dc0 (16|M0) r45 r30 null 0x0 0x4210501 {$5} [, msg-length:2, resp-length:2, header:no, func-control:10501]
0x9b0 0 sync.nop null {Compacted, I@1}
0x9b8 0 send.dc0 (16|M16) r47 r32 null 0x0 0x4210501 {$6} [, msg-length:2, resp-length:2, header:no, func-control:10501]

0xdd0 0 mov (16|M0) r85.0<1>:ud r45.0<2;1,0>:uw {$5.dst}
0xde0 0 mov (16|M16) r87.0<1>:ud r47.0<2;1,0>:uw {$6.dst}

0xe08 0 send.dc0 (16|M0) null r81 r85 0x80 0x4030502 {$7} [, msg-length:2, resp-length:0, header:no, func-control:30502]
0xe18 0 sync.nop null {Compacted, I@1}
0xe20 0 send.dc0 (16|M16) null r83 r87 0x80 0x4030502 {$8} [, msg-length:2, resp-length:0, header:no, func-control:30502]

This looks like a fairly common performance penalty: in FP16 inference/training, most math kernels use the FP16 data type passed in from the framework front end, and many ops (embedding, concat, index, and so on) have copy semantics. INT8 inference will hit a similar issue. For contiguous copies, we expect u8/u16 to be as fast as u32.
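Until the code generation improves, one possible mitigation at the kernel level (not a compiler fix, and the function and kernel names below are invented for this sketch) is to issue a contiguous u16 copy through u32 when the row length is even, since malloc_shared allocations are at least 4-byte aligned:

#include "CL/sycl.hpp"
#include <cstdint>

namespace sycl = cl::sycl;

// Sketch of a vectorized u16 copy: reinterpret the buffers as uint32_t so the
// backend can emit the same full-width sends as the u32 kernel. Assumes dst/src
// are contiguous, 4-byte aligned, and that N16 (elements per row) is even;
// otherwise fall back to the plain u16 kernel.
void copy_u16_via_u32(sycl::queue &q, uint16_t *dst, const uint16_t *src,
                      size_t M, size_t N16) {
    const size_t N32 = N16 / 2;
    uint32_t *dst32 = reinterpret_cast<uint32_t*>(dst);
    const uint32_t *src32 = reinterpret_cast<const uint32_t*>(src);
    q.submit([&](sycl::handler &h) {
        h.parallel_for<class u16_copy_via_u32>(
            sycl::range<2>{M, N32}, [=](sycl::id<2> it) {
                const size_t m = it[0];
                const size_t n = it[1];
                dst32[m * N32 + n] = src32[m * N32 + n];
            });
    });
    q.wait();
}

This only helps copy-like ops, though; kernels that actually compute on FP16/INT8 elements would still hit the narrow-register code path.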

The same issue can also be reproduced on Gen9:

$ ./copy 65536 512
time: copy_as_u32, 379.55 ms
time: copy_as_u16, 579.60 ms
time: copy_as_u8, 764.89 ms