在sophgo sg2038 (64core) 上面测试vm-scalability的时候发现一个有趣的现象:64个cpu上的%sys长期保持在100%,cpu时间都耗在了内核的原子操作上面。
测试用例
测试使用的命令行如下,usemem分配了1GB空间并进行随机写入操作。
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
Performance counter stats for './usemem --runtime 90 -t 64 --prealloc --random 1055820736':
11134377.19 msec task-clock # 63.226 CPUs utilized
22776 context-switches # 2.046 /sec
1621 cpu-migrations # 0.146 /sec
1626092 page-faults # 146.042 /sec
22268290761022 cycles # 2.000 GHz
104991871953 instructions # 0.00 insn per cycle
25806654302 branches # 2.318 M/sec
412629425 branch-misses # 1.60% of all branches
176.104518759 seconds time elapsed
1.114411000 seconds user
11063.446212000 seconds sys
性能分析
理论上,1GB空间按4KB页计算只需要约256K次page fault就够了,而上面统计到1626092次page faults,明显偏多。分析后发现多出来的部分是由numa balancing导致的,所以测试的时候可以先做如下配置,这样page faults的次数就会接近256K。
- 关闭numa balancing,/proc/sys/kernel/numa_balancing
- 关闭thp, /sys/kernel/mm/transparent_hugepage/enabled
使用perf抓到的调用栈基本都在拿锁的地方
1
2
3
4
5
6
7
- 97.51% 97.51% usemem [kernel.vmlinux] [k] down_read_trylock ▒
do_access ▒
ret_from_exception ▒
do_page_fault ▒
handle_page_fault ▒
lock_vma_under_rcu ▒
down_read_trylock ▒
具体执行的指令就是在lr/sc里面
1
2
3
4
       │ raw_atomic64_cmpxchg_acquire():
  0.22 │40:   lr.d   a2,(s1)
 98.30 │    → bne    a2,a5,7ff959af
  0.22 │      sc.d   a1,a3,(s1)
  1.14 │    ↑ bnez   a1,40
       │      fence  r,rw
构建用例
根据以上分析,在冲突严重的时候,lr/sc造成了性能的极端情况,我们设计一个小的测试用例方便分析。
这里使用了https://github.com/michaeljclark/riscv-atomics.git里面的接口,其实并不是完全必要,可以使用gcc等的builtin函数。
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
#define _GNU_SOURCE
#include <stdio.h>
#include <pthread.h>
#include <unistd.h>
#include <sched.h>
#include <stdlib.h>
#define ATOMIC_ASM 1
#include "stdatomic.h"
#define MAX_CPU 64
/*
 * Parameters handed to one worker thread.
 * main() fills one of these per thread before calling pthread_create().
 */
struct thread_data {
    int   cpu;      /* CPU index the worker pins itself to */
    int   loop;     /* number of increments the worker performs */
    long *shared;   /* counter that the worker atomically increments */
};
/* Barrier each worker waits on in thread() before entering its increment loop. */
pthread_barrier_t start_barrier;
/*
 * Selects the increment path used by thread():
 *   1 = compare-and-swap retry loop, 0 = atomic_fetch_add.
 * Defaults to 1; main() overwrites it from argv[1] when given.
 */
int use_lr_sc = 1;
/*
 * Restrict the calling thread to a single CPU.
 *
 * cpu: index of the CPU to run on.
 *
 * Returns whatever pthread_setaffinity_np() returns
 * (0 on success, an error number otherwise).
 */
int pin_myself(int cpu)
{
    cpu_set_t mask;

    /* Build an affinity mask containing only the requested CPU. */
    CPU_ZERO(&mask);
    CPU_SET(cpu, &mask);

    return pthread_setaffinity_np(pthread_self(), sizeof(mask), &mask);
}
/*
 * Worker thread entry point.
 *
 * ptr: pointer to this worker's struct thread_data.
 *
 * Pins itself to td->cpu (the result of pinning is not checked), waits on
 * start_barrier, then increments *td->shared td->loop times using the path
 * selected by use_lr_sc:
 *   1 - load the counter, then retry a compare-and-swap until it succeeds;
 *   0 - a single atomic_fetch_add per iteration;
 *   any other value - no increments are performed.
 *
 * Always returns NULL.
 */
void *thread(void *ptr)
{
    struct thread_data *td = ptr;
    int iterations = td->loop;
    long n;

    pin_myself(td->cpu);
    pthread_barrier_wait(&start_barrier);

    switch (use_lr_sc) {
    case 1:
        for (n = 0; n < iterations; ++n) {
            long snapshot = atomic_load_explicit(td->shared, memory_order_relaxed);

            for (;;) {
                /*
                 * The return value is treated as the value found in
                 * *td->shared; equality with our snapshot means the swap
                 * took effect, otherwise retry from the value we saw.
                 */
                long witnessed = __atomic_cmpxchg_acq_rel(td->shared, &snapshot, snapshot + 1);

                if (witnessed == snapshot)
                    break;
                snapshot = witnessed;
            }
        }
        break;
    case 0:
        for (n = 0; n < iterations; ++n)
            atomic_fetch_add(td->shared, 1);
        break;
    default:
        /* Unknown mode: do nothing, as before. */
        break;
    }

    return NULL;
}
/*
 * Benchmark driver.
 *
 * Usage: prog [use_lr_sc [num_thr [loop]]]
 *   use_lr_sc - 1 selects the compare-and-swap loop, 0 selects
 *               atomic_fetch_add (default 1)
 *   num_thr   - number of worker threads, 1..MAX_CPU (default MAX_CPU)
 *   loop      - increments performed by each thread (default 10000)
 *
 * Starts num_thr workers that all increment one shared counter, waits for
 * them to finish and prints the final counter value.
 *
 * Returns 0 on success, 1 on invalid arguments or when a pthread call fails.
 */
int main(int argc, char *argv[])
{
    pthread_t pth[MAX_CPU];
    struct thread_data td[MAX_CPU];
    long value = 0;
    int loop = 10000;
    int num_thr = MAX_CPU;

    if (argc > 1) {
        use_lr_sc = atoi(argv[1]);
    }
    if (argc > 2) {
        num_thr = atoi(argv[2]);
        if (num_thr > MAX_CPU) {
            printf("too many cpus\n");
            return 1;
        }
        /*
         * atoi() yields 0 for non-numeric input and may yield a negative
         * number; neither is a usable barrier count, so reject it instead
         * of reporting a meaningless "final value: 0".
         */
        if (num_thr < 1) {
            printf("num_thr must be at least 1\n");
            return 1;
        }
    }
    if (argc > 3) {
        loop = atoi(argv[3]);
    }

    printf("num_thr: %d, use_lr_sc: %d\n", num_thr, use_lr_sc);

    if (pthread_barrier_init(&start_barrier, NULL, num_thr) != 0) {
        printf("pthread_barrier_init failed\n");
        return 1;
    }

    for (int i = 0; i < num_thr; ++i) {
        td[i].cpu = i % MAX_CPU;
        td[i].shared = &value;
        td[i].loop = loop;
        int ret = pthread_create(&pth[i], NULL, thread, (void *)&td[i]);
        if (ret != 0) {
            /*
             * Workers already created are blocked on the barrier; returning
             * from main() terminates the whole process, so they are not
             * joined here.
             */
            printf("pthread_create failed\n");
            return 1;
        }
    }

    for (int i = 0; i < num_thr; ++i) {
        pthread_join(pth[i], NULL);
    }
    pthread_barrier_destroy(&start_barrier);

    printf("final value: %ld, loop: %d\n", value, loop);
    return 0;
}
新用例分析
编译选项对性能会有影响,这里选择-O0不优化
$ gcc test-atomic.c -g -O0 -o atomic.O0
LR/SC vs AMO
分别测试LR/SC和AMO的效果,可以看到在强竞争环境下AMO的效果会好非常多
- 性能差距在1000x级别
- LR/SC的性能结果方差较大,有时会出现好几倍的波动。可能跟系统的状态(比如cache)有关系,后续可以看看是否有相应的PMU事件来分析
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
$ sudo perf stat ./atomic.O0 1
num_thr: 64, use_lr_sc: 1
final value: 640000, loop: 10000
Performance counter stats for './atomic.O0 1':
2141505.16 msec task-clock # 55.357 CPUs utilized
1389 context-switches # 0.649 /sec
64 cpu-migrations # 0.030 /sec
177 page-faults # 0.083 /sec
4282984915631 cycles # 2.000 GHz
6609772363 instructions # 0.00 insn per cycle
2455506052 branches # 1.147 M/sec
35676085 branch-misses # 1.45% of all branches
38.685101551 seconds time elapsed
2141.456224000 seconds user
0.049997000 seconds sys
$ sudo perf stat ./atomic.O0 0
num_thr: 64, use_lr_sc: 0
final value: 640000, loop: 10000
Performance counter stats for './atomic.O0 0':
1364.05 msec task-clock # 24.749 CPUs utilized
190 context-switches # 139.291 /sec
64 cpu-migrations # 46.919 /sec
177 page-faults # 129.760 /sec
2722508418 cycles # 1.996 GHz
68832229 instructions # 0.03 insn per cycle
5228336 branches # 3.833 M/sec
438094 branch-misses # 8.38% of all branches
0.055116118 seconds time elapsed
1.346023000 seconds user
0.031547000 seconds sys
高并发的LR/SC
测试不同并发条件下LR/SC的表现
1
$ for i in 8 16 32 64; do sudo perf stat ./atomic.O0 1 $i; done
并发 | seconds user (perf stat) |
---|---|
8 | 0.979668000 |
16 | 40.213113000 |
32 | 342.519529000 |
64 | 2007.073680000 |
低并发的LR/SC
这里改大了循环次数
1
$ for i in 1 2 4 8; do sudo perf stat ./atomic.O0 1 $i 2000000; done
并发 | seconds user (lr/sc) | amo |
---|---|---|
1 | 0.142973503 | 0.117086000 |
2 | 0.510705000 | 0.263231000 |
4 | 2.302510000 | 0.718847000 |
8 | 200.607623000 | 4.448020000 |
后续
还有很多事情可以做,比如:
- 在x86上测试,和sophgo进行比较
- 查看是否有对应PMU可以观察, 比如lr/sc性能不稳定的来源
- 软件上是不是可以通过插入其他指令来避免lr/sc带来的竞争
- 硬件上是不是有办法动态调节lr/sc的竞争及时退让, 预计这方面在x86/arm等系统上有经验