0%

CSAPP Cache Lab Writeup

前置知识

Cache知识点概述

一般来说,cache就是这样的结构:

来自CSAPP 3e Figure 6.25

首先看(a)图,这么一个cache里面有着$S=2^s$个set,每个set里面又有$E$个line,而每一个line就是cache里面的基本组成单位。

一个line由三部分组成:

  • $B=2^b$个byte用来存缓存下来的内容
  • $t$个bit用来记录当前line的tag
  • $1$个bit用来表示当前line是valid的还是invalid的

可见,cache能存储内容的量并不等同于实际所占的大小,它的size为$S \times E \times B$ bytes。

cache这样的结构就支持了用一个$m$位的地址去定位访问cache,如图(b)。当已知cache的地址,我们可以尝试从cache里面访问内容,流程如下:

  1. 通过set index确定所在的set。
  2. 找出tag与之相等的line。
  3. 断言valid位不为0。
  4. block offset代表starting byte,retrieve从这个byte开始的内容。

当一次访问值的操作中恰好可以从cache里面找到而不用下放到访问内存,那么称为一次hit。当在cache里面找不到,称为一次miss。

在cache里面找不到值的miss发生时,这个值会被添加进cache当中,如果必须把某个cache line覆盖掉,那么这个过程称为eviction。

在评估缓存友好度的时候,一般我们看miss rate,miss发生得越多,miss penalty所占总时间就越多,运行速度就较慢。miss penalty的要比hit大两个数量级左右。

要做到缓存友好,我们需要贯彻temporal locality(时间局部性)和spatial locality(空间局部性),比如步长为1的访问数组有很强的spatial locality,经常访问一个变量且尽量减少被evict的可能性有较好的temporal locality。

做前必看

做之前先看看实验指导,大体思路都在里面。

然后再了解一下blocking的知识,这个方法会在Part B中用到。

Part A: Building Cache Simulator

Part A比较简单,需要你去模拟一个简化版的cache。

简化版代表不需要考虑b位的储存内容,eviction的策略选择Least Recently Used(LRU)原则,把最不常访问的元素顶替掉。

只需要修改csim.c,最终输出hit、miss跟eviction的次数。

具体思路recitation都有,直接贴代码:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
#include <stdio.h>
#include <stdlib.h>
#include <getopt.h>
#include <unistd.h>
#include <string.h>
#include <math.h>

#include "cachelab.h"

int verbose; // 0 means disable, 1 means enable
int s; // number of set index bits
int S; // number of sets
int E; // associativity
int b; // number of block bits
char traceFile[105]; // name of the `valgrind` trace to replay
unsigned int hit, miss, eviction; // final answers

void printUsageInfo() {
printf("Usage: ./csim-ref [-hv] -s <s> -E <E> -b <b> -t <tracefile>\n");
printf(" -h: Optional help flag that prints usage info\n");
printf(" -v: Optional verbose flag that displays trace info\n");
printf(" -s <s>: Number of set index bits (S = 2s is the number of sets)\n");
printf(" -E <E>: Associativity (number of lines per set)\n");
printf(" -b <b>: Number of block bits (B = 2b is the block size)\n");
printf(" -t <tracefile>: Name of the valgrind trace to replay\n");
}

void parseArguments(int argc, char **argv) {
int opt;
while ((opt = getopt(argc, argv, "hvs:E:b:t:")) != -1) {
switch (opt) {
case 'h':
printUsageInfo();
break;
case 'v':
verbose = 1;
break;
case 's':
s = atoi(optarg);
S = (1 << s);
break;
case 'E':
E = atoi(optarg);
break;
case 'b':
b = atoi(optarg);
break;
case 't':
strncpy(traceFile, optarg, 105);
break;
default:
printf("Unknown format. Please make sure it is valid.\n");
printUsageInfo();
break;
}
}
}

typedef struct {
int valid;
unsigned int tag;
int timestamp;
} CacheLine;

CacheLine **cacheTable;

void buildCache() {
// CacheLine **cacheTable;
cacheTable = malloc(S * sizeof(CacheLine*));
for (int i = 0; i < S; ++i) {
cacheTable[i] = malloc(E * sizeof(CacheLine));
for (int j = 0; j < E; ++j) {
cacheTable[i][j].valid = 0;
cacheTable[i][j].tag = 0xffffffff; // initially an impossible value
cacheTable[i][j].timestamp = 0;
}
}
}

void freeCache() {
for (int i = 0; i < S; ++i) {
free(cacheTable[i]);
}
free(cacheTable);
}

void timeAdvance() {
for (int i = 0; i < S; ++i) {
for (int j = 0; j < E; ++j) {
if (cacheTable[i][j].valid) {
cacheTable[i][j].timestamp++;
}
}
}
}

void getCache(unsigned int addr, int size) {
unsigned int s_bits = ((addr >> b) & ((1 << s) - 1));
unsigned int t_bits = (addr >> (s + b));
// printf("[debug] t_bits = %u\n", t_bits);
// if the corresponding cache can be found
for (int i = 0; i < E; ++i) {
CacheLine *cacheLine = &cacheTable[s_bits][i];
// printf("[debug] tag = %u\n", cacheLine->tag);
if (cacheLine->tag == t_bits) {
hit++;
cacheLine->timestamp = 0;
if (verbose) printf("hit");
return;
}
}
// if we can fill the cache into a null cache line
for (int i = 0; i < E; ++i) {
CacheLine *cacheLine = &cacheTable[s_bits][i];
// printf("[debug] %u\n", cacheLine.tag);
if (!cacheLine->valid) {
// printf("[debug] fill in invalid\n");
cacheLine->valid = 1;
cacheLine->timestamp = 0;
cacheLine->tag = t_bits;
miss++;
if (verbose) printf("miss");
return;
}
}
// otherwise, find the target to be substituted according to LRU
// LRU: Least Recently Used, due to the inverse of locality
int idx = 0, max_time_interval = 0;
for (int i = 0; i < E; ++i) {
int time_diff = cacheTable[s_bits][i].timestamp;
if (time_diff > max_time_interval) {
max_time_interval = time_diff;
idx = i;
}
}
eviction++;
miss++;
cacheTable[s_bits][idx].tag = t_bits;
cacheTable[s_bits][idx].timestamp = 0;
if (verbose) printf("miss eviction");
return;
}

void solve(char op, unsigned int addr, int size) {
if (verbose) printf("%c %x,%d ", op, addr, size);
switch (op) {
case 'L': // load
getCache(addr, size);
break;
case 'S': // store
getCache(addr, size);
break;
case 'M': // load + store
getCache(addr, size);
if (verbose) printf(" ");
getCache(addr, size);
break;
case 'I': // instruction load, ignored
default:
break;
}
if (verbose) printf("\n");
}

void readTraceFile() {
FILE *file = fopen(traceFile, "r");
if (file == NULL) {
printf("Exception: cannot find tracefile.\n");
exit(1);
}
char op;
unsigned int addr;
int size;
while (fscanf(file, " %c %x,%d", &op, &addr, &size) != EOF) {
// printf("%c %x %u\n", op, addr, size);
solve(op, addr, size);
timeAdvance();
}
fclose(file);
}

int main(int argc, char **argv) {
parseArguments(argc, argv);
buildCache();
readTraceFile();
freeCache();
printSummary(hit, miss, eviction);
return 0;
}

Part B: Efficient Matrix Transpose

这个部分是重头戏。这个部分要求你完成一个矩阵转置的函数,需要满足在不同size下miss次数不高于一定次数,即尽可能降低miss rate。

在这一个part中,$s=5, E=1, b=5$,即我们考虑cache是一个包含32个set,每个set存32个byte的direct-mapped cache。

这篇博客写得非常非常详细,可以直接看 https://yangtau.me/computer-system/csapp-cache.html

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
char transpose_submit_desc[] = "Transpose submission";
void transpose_submit(int M, int N, int A[N][M], int B[M][N]) {
if (M == 32) {
const int block_size = 8;
// 1. normal blocking: 344 misses
// int i = 0, j = 0, ii, jj, tmp;
// for (i = 0; i < N; i += block_size) {
// for (j = 0; j < M; j += block_size) {
// for (ii = i; ii < i + block_size && ii < N; ++ii) {
// for (jj = j; jj < j + block_size && jj < M; ++jj) {
// tmp = A[ii][jj];
// B[jj][ii] = tmp;
// }
// }
// }
// }

// 2. use local variables: 288 misses
// int i, j, k, a0, a1, a2, a3, a4, a5, a6, a7;
// for (i = 0; i < M; i += block_size) {
// for (j = 0; j < N; j += block_size) {
// for (k = 0; k < block_size; ++k) {
// a0 = A[i + k][j + 0];
// a1 = A[i + k][j + 1];
// a2 = A[i + k][j + 2];
// a3 = A[i + k][j + 3];
// a4 = A[i + k][j + 4];
// a5 = A[i + k][j + 5];
// a6 = A[i + k][j + 6];
// a7 = A[i + k][j + 7];

// B[j + 0][i + k] = a0;
// B[j + 1][i + k] = a1;
// B[j + 2][i + k] = a2;
// B[j + 3][i + k] = a3;
// B[j + 4][i + k] = a4;
// B[j + 5][i + k] = a5;
// B[j + 6][i + k] = a6;
// B[j + 7][i + k] = a7;
// }
// }
// }

// 3. copy then transpose: 260 misses
int i, j, ii, jj, a0, a1, a2, a3, a4, a5, a6, a7;
for (i = 0; i < N; i += block_size) {
for (j = 0; j < M; j += block_size) {
for (ii = i, jj = j; ii < i + block_size; ++ii, ++jj) {
a0 = A[ii][j + 0];
a1 = A[ii][j + 1];
a2 = A[ii][j + 2];
a3 = A[ii][j + 3];
a4 = A[ii][j + 4];
a5 = A[ii][j + 5];
a6 = A[ii][j + 6];
a7 = A[ii][j + 7];

B[jj][i + 0] = a0;
B[jj][i + 1] = a1;
B[jj][i + 2] = a2;
B[jj][i + 3] = a3;
B[jj][i + 4] = a4;
B[jj][i + 5] = a5;
B[jj][i + 6] = a6;
B[jj][i + 7] = a7;
}

for (ii = 0; ii < block_size; ++ii) {
for (jj = ii + 1; jj < block_size; ++jj) {
a0 = B[j + ii][i + jj];
B[j + ii][i + jj] = B[j + jj][i + ii];
B[j + jj][i + ii] = a0;
}
}
}
}
} else if (M == 64) {

int i, j, k, a0, a1, a2, a3, a4, a5, a6, a7, tmp;
const int block_size = 8;
for (i = 0; i < M; i += block_size) {
for (j = 0; j < N; j += block_size) {
for (k = 0; k < block_size / 2; ++k) {
a0 = A[i + k][j + 0];
a1 = A[i + k][j + 1];
a2 = A[i + k][j + 2];
a3 = A[i + k][j + 3];
a4 = A[i + k][j + 4];
a5 = A[i + k][j + 5];
a6 = A[i + k][j + 6];
a7 = A[i + k][j + 7];

B[j + 0][i + k] = a0;
B[j + 1][i + k] = a1;
B[j + 2][i + k] = a2;
B[j + 3][i + k] = a3;

B[j + 0][i + k + 4] = a4;
B[j + 1][i + k + 4] = a5;
B[j + 2][i + k + 4] = a6;
B[j + 3][i + k + 4] = a7;
}
for (k = 0; k < block_size / 2; ++k) {
a0 = A[i + 4][j + k];
a1 = A[i + 5][j + k];
a2 = A[i + 6][j + k];
a3 = A[i + 7][j + k];
a4 = A[i + 4][j + k + 4];
a5 = A[i + 5][j + k + 4];
a6 = A[i + 6][j + k + 4];
a7 = A[i + 7][j + k + 4];

tmp = B[j + k][i + 4]; B[j + k][i + 4] = a0; a0 = tmp;
tmp = B[j + k][i + 5]; B[j + k][i + 5] = a1; a1 = tmp;
tmp = B[j + k][i + 6]; B[j + k][i + 6] = a2; a2 = tmp;
tmp = B[j + k][i + 7]; B[j + k][i + 7] = a3; a3 = tmp;

B[j + k + 4][i + 0] = a0;
B[j + k + 4][i + 1] = a1;
B[j + k + 4][i + 2] = a2;
B[j + k + 4][i + 3] = a3;
B[j + k + 4][i + 4] = a4;
B[j + k + 4][i + 5] = a5;
B[j + k + 4][i + 6] = a6;
B[j + k + 4][i + 7] = a7;
}
}
}
} else {
// 61 * 67, 1793 misses
int i, j, k, l, a0, a1, a2, a3, a4, a5, a6, a7;
for (i = 0; i < N; i += 8) {
for (j = 0; j < M; j += 8) {
if (i + 7 < N && j + 7 < M) {
// bad
// for (k = i; k < i + 8; ++k) {
// a0 = A[k][j + 0];
// a1 = A[k][j + 1];
// a2 = A[k][j + 2];
// a3 = A[k][j + 3];
// a4 = A[k][j + 4];
// a5 = A[k][j + 5];
// a6 = A[k][j + 6];
// a7 = A[k][j + 7];

// B[j + 0][k] = a0;
// B[j + 1][k] = a1;
// B[j + 2][k] = a2;
// B[j + 3][k] = a3;
// B[j + 4][k] = a4;
// B[j + 5][k] = a5;
// B[j + 6][k] = a6;
// B[j + 7][k] = a7;
// }

// good

for (k = j; k < j + 8; ++k) {
a0 = A[i + 0][k];
a1 = A[i + 1][k];
a2 = A[i + 2][k];
a3 = A[i + 3][k];
a4 = A[i + 4][k];
a5 = A[i + 5][k];
a6 = A[i + 6][k];
a7 = A[i + 7][k];

B[k][i + 0] = a0;
B[k][i + 1] = a1;
B[k][i + 2] = a2;
B[k][i + 3] = a3;
B[k][i + 4] = a4;
B[k][i + 5] = a5;
B[k][i + 6] = a6;
B[k][i + 7] = a7;
}
} else {
for (k = i; k < N && k < i + 8; ++k) {
for (l = j; l < M && l < j + 8; ++l) {
B[l][k] = A[k][l];
}
}
}
}
}
}
}