论坛徽章:: 11

电梯直达

1楼 [收藏(0)] [报告]

发表于 2014-05-01 15:05 |只看该作者 |倒序浏览

测试了高并发，低并发下几种锁的性能差异：

#define _GNU_SOURCE
#include <assert.h>
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <time.h>
#include <unistd.h>
#include "lkf.h"
/* Reference timestamp written by timer_init(); now() reports time relative to it. */
struct timespec base;
/*
 * Store the current CLOCK_MONOTONIC reading in the global `base`.
 *
 * Returns 0 on success, -1 when clock_gettime() reports failure.
 */
int timer_init(){
    return (clock_gettime(CLOCK_MONOTONIC, &base) == 0) ? 0 : -1;
}
/*
 * Milliseconds elapsed on CLOCK_MONOTONIC since the reading stored in the
 * global `base`, truncated to unsigned int.
 *
 * The process is aborted if the clock cannot be read; this check stays
 * active in NDEBUG builds, so `tp` is never used uninitialised.
 */
unsigned int now(){
    struct timespec tp;
    if (clock_gettime(CLOCK_MONOTONIC, &tp) != 0) {
        perror("clock_gettime");
        abort();
    }
    /* Pure integer arithmetic: combine both fields into nanoseconds first so a
     * negative tv_nsec difference borrows correctly from the seconds part and
     * no floating-point rounding can shift the result by one millisecond. */
    int64_t elapsed_ns = (int64_t)(tp.tv_sec - base.tv_sec) * 1000000000
                       + (int64_t)(tp.tv_nsec - base.tv_nsec);
    return (unsigned int)(elapsed_ns / 1000000);
}
/* List shared by main() and every worker thread; declared through the LKF_LIST
 * macro from lkf.h (its expansion is not visible in this file). */
static LKF_LIST(head);
/* Element type that main() allocates and queues on `head` through its `entry` member. */
struct ctx {
struct lkf_node entry; /* list linkage; main() clears entry.next before queuing */
int n; /* not read anywhere in this listing */
};
/* Held around lkf_node_get_one() in the single-node (#else) variant of thread_func(). */
pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
/*
 * Yield lock: atomically flip *lkp from 0 to 1; every time the swap fails,
 * call sched_yield() before trying again.
 */
#define lock1(lkp) do{ \
    for (;;) { \
        if (__sync_bool_compare_and_swap(lkp, 0, 1)) { \
            break; \
        } \
        sched_yield(); \
    } \
}while(0)
/*
 * Spin lock: retry the atomic 0 -> 1 swap on *lkp in a tight loop, without
 * yielding, until the previous value was 0 (i.e. the swap succeeded).
 */
#define lock2(lkp) do{ \
    while (__sync_val_compare_and_swap(lkp, 0, 1) != 0) { \
        /* busy-wait */ \
    } \
}while(0)
/*
 * Release a lock word taken with lock1()/lock2() by writing 0 to *lkp.
 *
 * __sync_lock_release() performs the store with release-barrier semantics, so
 * memory accesses made while holding the lock cannot be moved past the
 * unlock; a plain `*(lkp) = 0` gives no such ordering guarantee and is a data
 * race against the compare-and-swap in the lock macros.
 */
#define unlock(lkp) do{ \
    __sync_lock_release(lkp); \
}while(0)
/* Lock word for the lock1()/lock2()/unlock() macros (0 = free, 1 = held); in this
 * listing it is only referenced from the commented-out calls in thread_func(). */
int lck = 0;
/*
 * Benchmark worker: loops forever taking nodes from the shared list `head`
 * and putting them back, printing one progress line each time this thread's
 * count of returned nodes reaches a multiple of 10000.
 *
 * Progress line format: "<n2>, <ms> " where n2 is the number of empty fetches
 * counted since the last successful round and ms is the time since this
 * thread's previous progress line, as measured by now().
 *
 * The preprocessor switch selects the variant that is compiled:
 *   #if 1  - batch mode: obtain a node with lkf_node_get(), then repeatedly
 *            call lkf_node_next() on it and re-queue every node returned,
 *            until lkf_node_next() hands back the starting node itself.
 *   #else  - single mode: obtain one node with lkf_node_get_one() while
 *            holding a lock, then re-queue it.
 *
 * any: unused.
 * Returns: the loop has no exit, so the trailing return is never reached.
 */
void* thread_func(void* any)
{
    (void) any;
    /* n2/n3 are private to this thread; the __sync builtins are kept as in the
     * original benchmark so the measured code path is unchanged. */
    int n2 = 0; /* empty fetches since the last successful round */
    int n3 = 0; /* nodes this thread has put back so far */
    unsigned t1, t2 = now();
    while (1) {
#if 1
        struct lkf_node* node = lkf_node_get(&head);
        if (node == NULL) {
            __sync_add_and_fetch(&n2, 1);
            continue;
        }
        struct lkf_node* nd = NULL;
        do {
            nd = lkf_node_next(node);
            if (nd == NULL) {
                /* jumps to the loop condition; NULL != node, so this retries */
                continue;
            }
            __sync_add_and_fetch(&n3, 1);
            lkf_node_put(&head, nd);
            if (n3 % 10000 == 0) {
                t1 = now();
                /* t1 - t2 is unsigned, so it must be printed with %u */
                printf("%d, %u \n", n2, t1 - t2);
                t2 = t1;
            }
        } while (nd != node);
#else
        /* Lock under test: swap the mutex pair below for
         * lock1(&lck)/unlock(&lck) or lock2(&lck)/unlock(&lck)
         * to measure the yield lock or the spin lock instead. */
        pthread_mutex_lock(&mutex);
        struct lkf_node* node = lkf_node_get_one(&head);
        pthread_mutex_unlock(&mutex);
        if (node == NULL) {
            __sync_add_and_fetch(&n2, 1);
            continue;
        }
        struct lkf_node* nd = node;
        __sync_add_and_fetch(&n3, 1);
        lkf_node_put(&head, nd);
        if (n3 % 10000 == 0) {
            t1 = now();
            /* t1 - t2 is unsigned, so it must be printed with %u */
            printf("%d, %u \n", n2, t1 - t2);
            t2 = t1;
        }
#endif
        n2 = 0;
    }
    return NULL;
}
/*
 * Benchmark entry point: initialise the timer, queue NODE_COUNT freshly
 * allocated nodes on `head`, start THREAD_COUNT workers running
 * thread_func(), then block in pause().
 *
 * Returns EXIT_FAILURE if the timer, an allocation, or a thread creation
 * fails. The nodes are never freed and the threads are never joined: the
 * workers loop forever, so the process is expected to be killed externally.
 */
int main(void)
{
    enum { NODE_COUNT = 2000, THREAD_COUNT = 8 };

    if (timer_init() != 0) {
        perror("clock_gettime");
        return EXIT_FAILURE;
    }
    for (int i = 0; i < NODE_COUNT; ++i) {
        struct ctx *ctx = malloc(sizeof *ctx);
        if (ctx == NULL) {
            perror("malloc");
            return EXIT_FAILURE;
        }
        ctx->n = 0; /* avoid leaving the field indeterminate */
        ctx->entry.next = NULL;
        lkf_node_put(&head, &ctx->entry);
    }
    for (int i = 0; i < THREAD_COUNT; ++i) {
        pthread_t tid;
        /* pthread_create() returns the error number directly instead of setting errno */
        int rc = pthread_create(&tid, NULL, thread_func, NULL);
        if (rc != 0) {
            fprintf(stderr, "pthread_create failed: error %d\n", rc);
            return EXIT_FAILURE;
        }
    }
    pause();
    return 0;
}

复制代码

得到数据如下：
800 threads:
lock less batch  2,30 seconds
lock less single 80.x --- 100.x seconds
spin lock       infinite
yield lock    1.x seconds
mutex          5.x seconds

4 threads:
lockless batch 3ms
lock less single 12ms
yield lock    11ms
spin lock       16ms
mutex          3,40 ms

8 threads:
lockless batch 20% 15ms,  80% 5ms
lock less single 20% 140s,  80% 60ms
spin lock       3%  120ms, 97%  50ms
yield lock    20ms
mutex          60ms

机器是 azure 上的一台 a3 4 核的虚拟机
无锁使用的是之前我发的那份
基本结论如下：

线程数小于或接近 CPU 个数的情况下，我自己的那个无锁链表批取模式性能最高；单取性能还算可以，比 sched_yield 锁稍差一点，比 spin lock 锁高一点，但到了 8 线程，就已经被 spin lock 锁超越了
高并发下，还是同步锁性能最高， spin lock 是找死，用无锁也是脑残的行为

在任何负载下，通过 CAS 加轻量的 sched_yield 实现的 yield 锁性能都高于 mutex；futex 没测试，估计性能位于 yield 锁和 mutex 之间
mutex 基本上似乎都表现中庸，似乎在任何情况下，都是第二选择

文库|博客

zylthinking

大富大贵

论坛徽章:: 11

2楼 [报告]

发表于 2014-05-01 15:16 |只看该作者

加上了 futex, 表明确实如猜测;
在这个测试中所有线程都没有阻塞情况，因此 futex 比较吃亏；若存在大量阻塞，估计 futex 表现会超过 yield 锁

/*
 * Futex-based lock on the int pointed to by lkp.
 * Lock word values as used by this macro pair: 0 = free, 1 = held,
 * 2 = held and marked as possibly having sleepers.
 *
 * Fast path: one compare-and-swap 0 -> 1. On failure the value observed by
 * that same atomic operation decides whether to sleep first (the original
 * re-read *lkp with a plain, racy load). The thread then keeps exchanging 2
 * into the word, sleeping in FUTEX_WAIT while the word still equals 2, until
 * the exchange returns 0, at which point it owns the lock.
 *
 * The argument is parenthesised wherever it is dereferenced so expressions
 * such as `base + i` expand correctly. Needs syscall(), __NR_futex and
 * FUTEX_WAIT to be in scope at the point of use.
 */
#define lock(lkp) \
do { \
    int lock_seen_ = __sync_val_compare_and_swap((lkp), 0, 1); \
    if (lock_seen_ != 0) { \
        if (lock_seen_ == 2) { \
            syscall(__NR_futex, (lkp), FUTEX_WAIT, 2, NULL, NULL, 0); \
        } \
        \
        while (0 != __sync_lock_test_and_set((lkp), 2)) { \
            syscall(__NR_futex, (lkp), FUTEX_WAIT, 2, NULL, NULL, 0); \
        } \
    } \
} while (0)
/*
 * Release the futex-based lock: exchange 0 into *lkp and, if the previous
 * value was 2, wake one thread sleeping on the word.
 *
 * __sync_synchronize() replaces the original wmb(), which is a kernel macro
 * with no user-space definition and only orders stores. The full barrier is
 * required because __sync_lock_test_and_set() is documented as an acquire
 * barrier only, so on its own it would not keep critical-section accesses
 * from moving past the unlocking store.
 *
 * NOTE(review): the wake call is retried for as long as syscall() returns -1,
 * as in the original; confirm that spinning on a persistent error is intended.
 * Needs syscall(), __NR_futex and FUTEX_WAKE to be in scope at the point of use.
 */
#define unlock(lkp) \
do { \
    __sync_synchronize(); \
    if (2 == __sync_lock_test_and_set((lkp), 0)) { \
        while (-1 == syscall(__NR_futex, (lkp), FUTEX_WAKE, 1, NULL, NULL, 0)); \
    } \
} while (0)

复制代码

800 threads:
lock less batch  2,30 seconds
lock less single 80.x --- 100.x seconds
spin lock       infinite
yield lock    1.x seconds
futex          4.x
mutex          5.x seconds

4 threads:
lockless batch 3ms
lock less single 12ms
yield lock    11ms
spin lock       16ms
futex          20ms
mutex          3,40 ms

8 threads:
lockless batch 20% 15ms,  80% 5ms
lock less batch  20% 140s,  80% 60ms
spin lock       3%  120ms, 97%  50ms
yield lock    20ms
futex          40ms
mutex          60ms