- 论坛徽章:
- 0
|
本帖最后由 Posix_man 于 2010-03-04 12:53 编辑
呵呵,很少人参与啊!
我来玩玩。希望能抛砖引玉。
可惜我没有 NVIDIA 的显卡,也懒得装驱动和开发环境,所以麻烦哪位有此环境的兄弟或姐妹,帮我把代码编译运行一次,看看有什么错误。
cuda的资料只看了一点,所以先做题目一。
#include <cstddef>
#include <iostream>
#include <string>
#include <time.h>
// Dense matrix descriptor passed by value to the kernel and by reference on
// the host. Elements are stored row-major: entry (row, col) lives at
// elements[row * width + col].
struct myMatrix {
    int    width;     // number of columns
    int    height;    // number of rows
    float* elements;  // width * height values; not owned by this struct
};
// --- Single-device state -------------------------------------------------
// Index of the device main() selects for the computation; -1 means none yet.
int activeDevice = -1;
// Largest maxThreadsPerBlock seen while enumerating devices.
int maxBlockSize = 0;

// --- Multi-device state --------------------------------------------------
// Number of devices reported by cudaGetDeviceCount(); -1 until queried.
int deviceCount = -1;
// Running sum of multiProcessorCount over the enumerated devices.
int gpuCount = 0;
// Smallest maxThreadsPerBlock over the enumerated devices; 0 until first set.
int blockSize = 0;
// Forward declarations so main() can reference both routines before their
// definitions further down the file.
// Device kernel: each thread computes one element of C = A * B.
__global__ void MatMulKernel(myMatrix A, myMatrix B, myMatrix C);
// Host wrapper: copies A and B to the device, launches the kernel and copies
// the product back into C.
void MatrixMul4OneDev(const myMatrix& A, const myMatrix& B, myMatrix& C);
// Matrix multiplication - host driver.
// Matrices of any positive size are accepted; the only requirement is that
// the width of A equals the height of B.

// Reads one matrix from std::cin: first its width and height, then every
// element in row-major order (row index first, column index second).
// `label` is the single-letter matrix name shown in the prompts.
// On success m.elements points to a new[]-allocated buffer owned by the
// caller and the function returns true. On invalid or unreadable input it
// prints a message, leaves m.elements null and returns false.
static bool readMatrix(char label, myMatrix& m)
{
    m.elements = 0;

    std::cout << "\n\tPlease input the width of matrix " << label << " !";
    std::cin >> m.width;
    std::cout << "\n\tPlease input the height of matrix " << label << " !";
    std::cin >> m.height;
    if (!std::cin || m.width <= 0 || m.height <= 0) {
        std::cout << "Invalid size for matrix " << label << ". See you!\n";
        return false;
    }

    const std::size_t count =
        static_cast<std::size_t>(m.width) * static_cast<std::size_t>(m.height);
    m.elements = new float[count];

    for (int row = 0; row < m.height; ++row) {
        for (int col = 0; col < m.width; ++col) {
            std::cout << "\nPlease input Matrix " << label
                      << " [" << row << "][" << col << "] :";
            // Row-major flat index, matching what the kernel expects.
            std::cin >> m.elements[static_cast<std::size_t>(row) * m.width + col];
            if (!std::cin) {
                std::cout << "Invalid value for matrix " << label << ". See you!\n";
                delete [] m.elements;
                m.elements = 0;
                return false;
            }
        }
    }
    std::cout << "Matrix " << label << " is done!\n";
    return true;
}

// Program entry point.
// 1. Enumerates CUDA devices, updating the file-scope statistics and picking
//    the usable device with the largest maxThreadsPerBlock.
// 2. Reads matrices A and B from standard input.
// 3. Computes C = A * B through MatrixMul4OneDev and prints C together with
//    the elapsed clock() ticks.
// Returns 0 on success and -1 when no device is usable or the input is bad.
int main()
{
    // Device discovery. A failing count query is treated as "no devices".
    cudaDeviceProp deviceProp;
    if (cudaGetDeviceCount(&deviceCount) != cudaSuccess)
        deviceCount = 0;

    for (int device = 0; device < deviceCount; ++device) {
        if (cudaGetDeviceProperties(&deviceProp, device) != cudaSuccess)
            continue;
        // Old runtimes report a placeholder device 0 with version 9999.9999
        // when no real CUDA hardware is present.
        if (device == 0 && deviceProp.major == 9999 && deviceProp.minor == 9999) {
            std::cout << "There is no device supporting CUDA. See you!\n";
            return -1;
        }
        // A device in prohibited compute mode cannot be selected at all.
        if (deviceProp.computeMode == cudaComputeModeProhibited)
            continue;

        gpuCount += deviceProp.multiProcessorCount;
        if (blockSize == 0 || deviceProp.maxThreadsPerBlock < blockSize)
            blockSize = deviceProp.maxThreadsPerBlock;
        // Update both values together so maxBlockSize always describes the
        // device stored in activeDevice.
        if (deviceProp.maxThreadsPerBlock > maxBlockSize) {
            maxBlockSize = deviceProp.maxThreadsPerBlock;
            activeDevice = device;
        }
    }

    // Select the chosen device for this host thread.
    if (activeDevice == -1 || cudaSetDevice(activeDevice) != cudaSuccess) {
        std::cout << "There is no available CUDA device. See you!\n";
        return -1;
    }

    // Get inputs of matrix A and B.
    myMatrix mA, mB, mC;
    mA.elements = 0;
    mB.elements = 0;
    mC.elements = 0;

    if (!readMatrix('A', mA))
        return -1;
    if (!readMatrix('B', mB)) {
        delete [] mA.elements;
        return -1;
    }
    if (mA.width != mB.height) {
        std::cout << "The width of matrix A must equal the height of matrix B. See you!\n";
        delete [] mA.elements;
        delete [] mB.elements;
        return -1;
    }

    mC.width  = mB.width;
    mC.height = mA.height;
    const std::size_t resultCount =
        static_cast<std::size_t>(mC.width) * static_cast<std::size_t>(mC.height);
    // Value-initialised so a failed GPU run prints zeros, not garbage.
    mC.elements = new float[resultCount]();

    const clock_t t1 = clock();
    MatrixMul4OneDev(mA, mB, mC);
    std::cout << "This matrix is done in " << clock() - t1 << " ticks!\n";

    // Print C one row per line.
    for (int row = 0; row < mC.height; ++row) {
        std::cout << "\n";
        for (int col = 0; col < mC.width; ++col)
            std::cout << mC.elements[static_cast<std::size_t>(row) * mC.width + col] << " ";
    }

    delete [] mA.elements;
    delete [] mB.elements;
    delete [] mC.elements;
    // Exit status 0 signals success (the element count was returned before).
    return 0;
}
// Computes C = A * B on the currently selected CUDA device.
//
// A, B : host matrices in row-major order; A.width must equal B.height.
// C    : host matrix whose width/height are already set to B.width/A.height
//        and whose elements buffer is already allocated; it receives the
//        product.
//
// The shapes may be any positive size. On a shape mismatch or any CUDA
// failure a message is written to std::cerr and C.elements is left
// unmodified. Device buffers are always released before returning.
void MatrixMul4OneDev(const myMatrix& A, const myMatrix& B, myMatrix& C)
{
    if (A.width <= 0 || A.height <= 0 || B.width <= 0 || B.height <= 0 ||
        A.width != B.height || C.height != A.height || C.width != B.width) {
        std::cerr << "MatrixMul4OneDev: incompatible matrix dimensions\n";
        return;
    }

    const size_t bytesA = static_cast<size_t>(A.width) * A.height * sizeof(float);
    const size_t bytesB = static_cast<size_t>(B.width) * B.height * sizeof(float);
    const size_t bytesC = static_cast<size_t>(C.width) * C.height * sizeof(float);

    // Device-side descriptors. The pointers start null so the unconditional
    // cudaFree calls at the end are safe after a partial failure.
    myMatrix d_A = { A.width, A.height, 0 };
    myMatrix d_B = { B.width, B.height, 0 };
    myMatrix d_C = { C.width, C.height, 0 };

    // Each step runs only if every previous step succeeded.
    cudaError_t status = cudaMalloc((void**)&d_A.elements, bytesA);
    if (status == cudaSuccess)
        status = cudaMalloc((void**)&d_B.elements, bytesB);
    if (status == cudaSuccess)
        status = cudaMalloc((void**)&d_C.elements, bytesC);
    if (status == cudaSuccess)
        status = cudaMemcpy(d_A.elements, A.elements, bytesA, cudaMemcpyHostToDevice);
    if (status == cudaSuccess)
        status = cudaMemcpy(d_B.elements, B.elements, bytesB, cudaMemcpyHostToDevice);

    if (status == cudaSuccess) {
        // Square tile of at most 16x16 threads, halved until it fits within
        // the per-block thread limit recorded in maxBlockSize (if known).
        int tile = 16;
        while (tile > 1 && maxBlockSize > 0 && tile * tile > maxBlockSize)
            tile /= 2;

        const dim3 dimBlock(tile, tile);
        // Round up so partial tiles on the right and bottom edges are covered.
        const dim3 dimGrid((C.width  + tile - 1) / tile,
                           (C.height + tile - 1) / tile);
        MatMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);
        // A launch reports configuration errors only through this query.
        status = cudaGetLastError();
    }

    // Read C from device memory. The copy's return value is checked as well,
    // so a failure here is reported too.
    if (status == cudaSuccess)
        status = cudaMemcpy(C.elements, d_C.elements, bytesC, cudaMemcpyDeviceToHost);

    if (status != cudaSuccess)
        std::cerr << "MatrixMul4OneDev: CUDA error: "
                  << cudaGetErrorString(status) << "\n";

    // Free device memory.
    cudaFree(d_A.elements);
    cudaFree(d_B.elements);
    cudaFree(d_C.elements);
}

// Matrix multiplication kernel: one thread computes one element of C.
//
// Launch layout: 2-D grid of 2-D blocks; the x dimension walks the columns
// of C and the y dimension walks its rows. The grid may overshoot the matrix
// edges; threads that fall outside C return without touching memory.
// Uses no shared memory. A, B and C must hold device pointers to row-major
// data with A.width == B.height.
__global__ void MatMulKernel(myMatrix A, myMatrix B, myMatrix C)
{
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    // Guard the grid tail when the dimensions are not multiples of the tile.
    if (row >= C.height || col >= C.width)
        return;

    float Cvalue = 0.0f;
    for (int e = 0; e < A.width; ++e)
        Cvalue += A.elements[row * A.width + e] * B.elements[e * B.width + col];
    C.elements[row * C.width + col] = Cvalue;
}
复制代码 |
|