-
Notifications
You must be signed in to change notification settings - Fork 45
/
Copy pathv6.cc
164 lines (133 loc) · 4.68 KB
/
v6.cc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
// Micro-kernel
#include <bits/stdc++.h>
#include <x86intrin.h>
const int B = 8; // number of elements in a vector
typedef float vector __attribute__ (( vector_size(4 * B) ));
float* alloc(int n) {
float* ptr = (float*) std::aligned_alloc(64, 4 * n);
memset(ptr, 0, 4 * n);
return ptr;
}
// c: 6 x 16
// a: 6 x k
// b: k x 16
// c[x:x+6][y:y+16] += a[x:x+6][l:r] * b[l:r][y:y+16]
inline int idx_a(int i, int j, int n) {
//return i * n + j;
return i / 6 * 6 * n + j * 6 + i % 6;
}
inline int idx_b(int i, int j, int n) {
//return i * n + j;
return j / 16 * 16 * n + i * 16 + j % 16;
}
void kernel(const float *a, const vector *b, vector __restrict__ *c, int x, int y, int l, int r, int n) {
vector t[6][2] = {0};
for (int k = l; k < r; k++) {
__builtin_prefetch(&b[n / 8 * (k + 8) + y / 8]);
for (int i = 0; i < 6; i++) {
vector alpha = _mm256_set1_ps(a[x * n + i * n + k]);//a[x * ny + k * 6 + i]);
for (int j = 0; j < 2; j++)
t[i][j] += alpha * b[n / 8 * k + y / 8 + j];//b[(y / 16 * 2 * nx + k * 2 + j)];
// b[idx_b(k, y + 8 * j, nx) / 8];
}
}
// props to C++17 committee for deprecating "register" keyword
/*
#define C(i,j) c[x * ny / 8 + i * ny / 8 + y / 8 + j] += t[i][j];
C(0, 0)
C(0, 1)
C(1, 0)
C(1, 1)
C(2, 0)
C(2, 1)
C(3, 0)
C(3, 1)
C(4, 0)
C(4, 1)
C(5, 0)
C(5, 1)
*/
for (int i = 0; i < 6; i++)
for (int j = 0; j < 2; j++)
c[x * n / 8 + i * n / 8 + y / 8 + j] += t[i][j];
}
const int L1 = (1<<15) / 4; // L1 cache is 32K
const int L2 = (1<<19) / 4; // L2 cache is 512K
const int L3 = (1<<23) / 4; // L3 cache is 8M
void matmul(const float *_a, const float *_b, float *_c, int n) {
int nx = (n + 5) / 6 * 6;
int ny = (n + 15) / 16 * 16;
/*
float *a = alloc(nx * ny);
float *b = alloc(nx * ny);
float *c = alloc(nx * ny);
*/
const int MAXN = 1920 * 1920; // ~15MB each
alignas(64) static float a[MAXN], b[MAXN], c[MAXN];
memset(c, 0, 4 * nx * ny);
//auto a = new(std::align_val_t{64}) float[nx * ny]();
//auto b = new(std::align_val_t{64}) float[nx * ny]();
//auto c = new(std::align_val_t{64}) float[nx * ny]();
//float a[nx * ny] = {0};
//float b[nx * ny] = {0};
//float c[nx * ny] = {0};
for (int i = 0; i < n; i++) {
memcpy(&a[i * ny], &_a[i * n], 4 * n);
memcpy(&b[i * ny], &_b[i * n], 4 * n);
}
/*
for (int i = 0; i < n; i += 6)
for (int j = 0; j < n; j++)
for (int k = 0; k < std::min(6, n - i); k++)
a[i * ny + j * 6 + k] = _a[(i+k) * n + j];
for (int i = 0; i < n; i++)
for (int j = 0; j < n; j += 16)
for (int k = 0; k < std::min(16, n - j); k++)
b[j * nx + i * 16 + k] = _b[i * n + j + k];
*/
// c[x:x+6][y:y+16] += a[x:x+6][l:r] * b[l:r][y:y+16]
// load b[i*L1 : (i+1)*L1][y:y+16] into L1 cache and iterate over a
// when out of L2 cache to hold a, load new strip of b and continue
// when out of L3 cache to hold b, switch to new segment of a
// divide b into segments that fit L3, fix a segment
// divide a into segments that fit L2, fix a segment
// divide b into segments that fit L1, fix a segment
// iterate over a
/*
// how many columns of b fit in L3
const int s3 = std::min(L3 / nx / 16 * 16, ny);
// how many rows of a fit in L2
const int s2 = std::min(L2 / ny / 6 * 6, nx);
// how tall a (k x s3) block in b can be to fit in L1
const int s1 = std::min(L1 / s3, nx);
*/
// s3 * nx < L3 (doesn't really matter)
// s2 * ny < L2
// s1 * s3 < L1
// s1 -> max
//const int s1 = std::min(L1 / 16, nx);
//const int s2 = L2 / ny / 6 * 6;
//const int s3 = 16;
const int u = 96;
const int s3 = u;
const int s2 = 2 * u;
const int s1 = 4 * u;
for (int i3 = 0; i3 < ny; i3 += s3)
// now we are working with b[:][i3:i3+s3]
for (int i2 = 0; i2 < nx; i2 += s2)
// now we are working with a[i2:i2+s2][:]
for (int i1 = 0; i1 < ny; i1 += s1)
// now we are working with b[i1:i1+s1][i3:i3+s3]
// this equates to updating c[i2:i2+s2][i3:i3+s3]
// with [l:r] = [i1:i1+s1]
for (int x = i2; x < std::min(i2 + s2, nx); x += 6)
for (int y = i3; y < std::min(i3 + s3, ny); y += 16)
kernel(a, (vector*) b, (vector*) c, x, y, i1, std::min(i1 + s1, n), ny);
for (int i = 0; i < n; i++)
memcpy(&_c[i * n], &c[i * ny], 4 * n);
/*
delete[] a;
delete[] b;
delete[] c;
*/
}