平台论坛博客文库

论坛徽章:: 12

21楼 [报告]

发表于 2015-01-12 16:51 |只看该作者

回复 18# windoze

维基百科上的文字和我记忆里是相符的：

The cores of Intel MIC are based on a modified version of P54C design, used in the original Pentium.[48] The basis of the Intel MIC architecture is to leverage x86 legacy by creating a x86-compatible multiprocessor architecture that can utilize existing parallelization software tools.[20] Programming tools include OpenMP, OpenCL,[49] Cilk/Cilk Plus and specialised versions of Intel's Fortran, C++[50] and math libraries.[51]

复制代码

从奔腾的核心修改的，可以直接跑之前的程序。

实战分享：从技术角度谈机器学习入门| 【大话IT】RadonDB低门槛向MySQL集群下战书 | ChinaUnix打赏功能已上线！ | 新一代分布式关系型数据库RadonDB知多少？

yulihua49

巨富豪门

论坛徽章:: 15

22楼 [报告]

发表于 2015-01-12 16:54 |只看该作者

zhaohongjian000 发表于 2015-01-12 16:27
回复 16# windoze

我们的技术，几十核没啥问题。上千的核？还真是挑战，得继续研究。

linux_c_py_php

巨富豪门

论坛徽章:: 4

23楼 [报告]

发表于 2015-01-12 20:10 |只看该作者

对于并行软件开发模式，锁最终会是瓶颈，随着cpu个数增多，性能无法线性。

不过这不是说没法提高了，只是不能再依靠mutex+shared_ptr搞并发了，必须靠副本，消息机制，就像erlang所倡导的那样。

wang290

小富即安

论坛徽章:: 6

24楼 [报告]

发表于 2015-01-13 13:41 |只看该作者

linux_c_py_php 发表于 2015-01-12 20:10
对于并行软件开发模式，锁最终会是瓶颈，随着cpu个数增多，性能无法线性。

不过这不是说没法提高了，只是 ...

大神是典型的工程师作风，往往不屑那些学院派，比如洪内核和微内核之争，大神还是很鄙视那些教授们的

windoze

版主

论坛徽章:: 44

25楼 [报告]

发表于 2015-01-13 14:24 |只看该作者

回复 22# yulihua49

上千核的程序就不是这么写了……

yulihua49

巨富豪门

论坛徽章:: 15

26楼 [报告]

发表于 2015-01-13 19:35 |只看该作者

windoze 发表于 2015-01-13 14:24
回复 22# yulihua49

上千核的程序就不是这么写了……

提示一下，怎么玩？

windoze

版主

论坛徽章:: 44

27楼 [报告]

发表于 2015-01-13 20:31 |只看该作者

回复 26# yulihua49

妥妥的MPI+并行算法，目前没有什么更好的办法。

lost_templar

丰衣足食

论坛徽章:: 1

28楼 [报告]

发表于 2015-01-13 22:33 |只看该作者

回复 26# yulihua49

正在吐血调试的一段，并行在四块显卡上，有 2880 + 2496 + 2496 + 2496 个核

__global__ void
make_individual_pattern_intensity_diff( double* cuda_ug, unsigned long* cuda_ar, double* cuda_diag, double thickness, unsigned long* cuda_dim, double* cuda_I_exp, double* cuda_I_diff, unsigned long column_index, double2* cuda_cache, unsigned long max_dim, unsigned long tilt_size )
{
unsigned long const tilt_index = blockDim.x * blockIdx.x + threadIdx.x;
if ( tilt_index >= tilt_size ) return;
unsigned long const dim = *(cuda_dim + tilt_index);
double* ug = cuda_ug;
unsigned long* ar = cuda_ar + tilt_index * max_dim * max_dim;
double* diag = cuda_diag + tilt_index * max_dim;
double* I_exp = cuda_I_exp + tilt_index * max_dim;
double* I_diff = cuda_I_diff + tilt_index * max_dim;
double2* cache = cuda_cache + 6 * tilt_index * max_dim * max_dim;
unsigned long dimdim = dim*dim;
//cache should be of size 6*N^2
double2* a_ = cache;
double2* aa_ = a_ + dimdim;
double2* aaa_ = aa_ + dimdim;
double2* p1 = aaa_ + dimdim;
double2* p2 = p1 + dimdim;
double2* p3 = p2 + dimdim;
//reuse memory in latter steps, when a_, aa_ and aaa_ are idle
double2* p2p3 = a_;
double2* s = aa_;
double2* s_ = aaa_;
//1)
kernel_assert( (compose_a<<<1, dim>>>( ug, ar, diag, thickness, a_, dim )) );
cuda_assert( cudaDeviceSynchronize() );
//2)
//TODO
double* the_norm = (double*)aa_;
kernel_assert( (Dznrm2<<<1,128>>>( dimdim, a_, the_norm )) );
//kernel_assert( (Dasum<<<1,128>>>( dimdim, a_, the_norm )) );
cuda_assert( cudaDeviceSynchronize() );
//double const ratio = (*the_norm) * 53.71920351148152;
double const ratio = (*the_norm) / 5.371920351148152;
unsigned long const scaler = ratio < 1.0 ? 0 : ceil(log2(ratio));
unsigned long const scaling_factor = 1 << scaler;
double const scale = scaling_factor;
kernel_assert( (Zscal<<<1, 128>>>( dimdim, 1.0/scale, a_ )) ); //a_ /= scale
cuda_assert( cudaDeviceSynchronize() );
//3)
dim3 const mm_grids( (dim+15)/16, (dim+15)/16 );
dim3 const mm_threads( 16, 16 );
kernel_assert( (Zgemm<<<mm_grids, mm_threads>>>( aa_, a_, a_, dim, 1.0 )) );
cuda_assert( cudaDeviceSynchronize() );
kernel_assert( (Zgemm<<<mm_grids, mm_threads>>>( aaa_, aa_, a_, dim, 1.0 )) );
cuda_assert( cudaDeviceSynchronize() );
//4)
/*solve(_Z^9+9*_Z^8+72*_Z^7+504*_Z^6+3024*_Z^5+15120*_Z^4+60480*_Z^3+181440*_Z^2+362880*_Z+362880 = 0)
* Returns:
* 2.697333461536989227389605+5.184162062649414177834087*I, //c1
* -.3810698456631129990312942+4.384644533145397950369203*I, //c2
* -2.110839800302654737498705+3.089910928725500922777702*I, //c3
* -3.038648072936697089212469+1.586801195758838328803868*I, //c4
* -3.333551485269048803294274, //c5
* -3.038648072936697089212469-1.586801195758838328803868*I, //c6
* -2.110839800302654737498705-3.089910928725500922777702*I, //c7
* -.3810698456631129990312942-4.384644533145397950369203*I, //c8
* 2.697333461536989227389605-5.184162062649414177834087*I //c9
*
* expand((x-c1)*(x-c2)*(x-c3)) >> p1 ( p1_c )
* x^3-.205423815571221490859606*x^2-(12.65871752452031305098099*I)*x^2-58.21460179641193947200471*x-(3.189848964212376356715960*I)*x-19.71085376106750328141397+94.20645646169128946503649*I
*
* expand((x-c4)*(x-c5)*(x-c6)) >> p2 ( p2_c )
* x^3+9.410847631142442981719212*x^2+39.17363072664900708597702-6.123261017392618755198919*10^(-24)*I+32.01029973951970099352671*x+(4.*10^(-24)*I)*x
*
* expand((x-c7)*(x-c8)*(x-c9)) >> p3 ( p3_c )
* x^3-.205423815571221490859601*x^2+(12.65871752452031305098099*I)*x^2-58.21460179641193947200470*x+(3.18984896421237635671600*I)*x-19.71085376106750328141404-94.20645646169128946503646*I
*
* expand((x-c1)*(x-c2)*(x-c3)*(x-c4)*(x-c5)*(x-c6)*(x-c7)*(x-c8)*(x-c9))
* 3.628800000000000000000003*10^5-1.365022562699469279472268*10^(-19)*I+3.628800000000000000000003*10^5*x+x^9+9.00000000000000000000000*x^8+72.00000000000000000000006*x^7+503.9999999999999999999995*x^6+3024.000000000000000000002*x^5+15120.00000000000000000000*x^4+60479.99999999999999999995*x^3+1.814400000000000000000001*10^5*x^2-(5.*10^(-22)*I)*x^6-(1.*10^(-20)*I)*x^4-(1.0*10^(-19)*I)*x^3+(2.*10^(-24)*I)*x^8-(3.0*10^(-19)*I)*x^2-(7.*10^(-21)*I)*x^5-(4.*10^(-19)*I)*x+(2.*10^(-23)*I)*x^7
*/
//4 - p1)
kernel_assert( (Zcopy<<<1,128>>>( dimdim, aaa_, p1 )) );
cuda_assert( cudaDeviceSynchronize() );
kernel_assert( (Zaxpy<<<1,128>>>( dimdim, -0.205423815571221490859606, -12.65871752452031305098099, p1, aa_ )) );
cuda_assert( cudaDeviceSynchronize() );
kernel_assert( (Zaxpy<<<1,128>>>( dimdim, -58.21460179641193947200471, -3.189848964212376356715960, p1, a_ )) );
cuda_assert( cudaDeviceSynchronize() );
kernel_assert( (sum_diag<<<1,dim>>>( p1, dim, -19.71085376106750328141397, 94.20645646169128946503649 )) );
cuda_assert( cudaDeviceSynchronize() );
//4 - p2)
kernel_assert( (Zcopy<<<1,128>>>( dimdim, aaa_, p2 )) );
cuda_assert( cudaDeviceSynchronize() );
kernel_assert( (Zaxpy<<<1,128>>>( dimdim, 9.410847631142442981719212, 0.0, p2, aa_ )) );
cuda_assert( cudaDeviceSynchronize() );
kernel_assert( (Zaxpy<<<1,128>>>( dimdim, 32.01029973951970099352671, 0.0, p2, a_ )) );
cuda_assert( cudaDeviceSynchronize() );
kernel_assert( (sum_diag<<<1,dim>>>( p2, dim, 39.17363072664900708597702, 0.0 )) );
cuda_assert( cudaDeviceSynchronize() );
//4 - p3)
kernel_assert( (Zcopy<<<1,128>>>( dimdim, aaa_, p3 )) );
cuda_assert( cudaDeviceSynchronize() );
kernel_assert( (Zaxpy<<<1,128>>>( dimdim, -0.205423815571221490859601, 12.65871752452031305098099, p3, aa_ )) );
cuda_assert( cudaDeviceSynchronize() );
kernel_assert( (Zaxpy<<<1,128>>>( dimdim, -58.21460179641193947200470, 3.18984896421237635671600, p3, a_ )) );
cuda_assert( cudaDeviceSynchronize() );
kernel_assert( (sum_diag<<<1,dim>>>( p3, dim, -19.71085376106750328141404, -94.20645646169128946503646 )) );
cuda_assert( cudaDeviceSynchronize() );
//4 - s)
// s = 1/602.39521910453439454428( p1 * ( 1/602.39521910453439454428 * p2 * p3 ) ) = (p1 p2 p3)/362880
kernel_assert( (Zgemm<<<mm_grids, mm_threads>>>( p2p3, p2, p3, dim, 0.0016600397351866578333 )) );
cuda_assert( cudaDeviceSynchronize() );
kernel_assert( (Zgemm<<<mm_grids, mm_threads>>>( s, p1, p2p3, dim, 0.0016600397351866578333 )) );
cuda_assert( cudaDeviceSynchronize() );
//5)
for ( unsigned long index = 0; index != scaler; ++index )
{
kernel_assert( (Zgemm<<<mm_grids, mm_threads>>>( s_, s, s, dim, 1.0 )) );
cuda_assert( cudaDeviceSynchronize() );
double2* tmp = s_;
s_ = s;
s = tmp;
}
//6)
//kernel_assert( (extract_intensity_diff<<<1,dim>>>( s, I_exp, I_diff, dim, column_index )) );
double const ac_offset = cuda_ug[0];
double const dc_offset = cuda_ug[1];
kernel_assert( (extract_intensity_diff_with_offset<<<1,dim>>>( s, I_exp, I_diff, dim, column_index, ac_offset, dc_offset )) );
cuda_assert( cudaDeviceSynchronize() );