Actual source code: ex5cu.cu

  1: static char help[] = "Test of CUDA matrix assemble with simple matrix.\n\n";

  3: // This a minimal example of the use of the CUDA MatAIJ metadata for assembly.
  4: //
  5: // The matrix must be a type 'aijcusparse' and must first be assembled on the CPU to provide the nonzero pattern.
  6: // Next, get a pointer to a simple CSR mirror (PetscSplitCSRDataStructure) of the matrix data on
  7: //    the GPU with MatCUSPARSEGetDeviceMatWrite().
  8: // Then use this object to populate the matrix on the GPU with MatSetValuesDevice().
  9: // Finally call MatAssemblyBegin/End() and the matrix is ready to use on the GPU without matrix data movement between the
 10: //    host and GPU.

 12: #include <petscconf.h>
 13: #include <petscmat.h>
 14: #include <petscdevice.h>
 15: #include <assert.h>

 17: #include <petscaijdevice.h>
 18: __global__
 19: void assemble_on_gpu(PetscSplitCSRDataStructure d_mat, PetscInt start, PetscInt end, PetscInt N, PetscMPIInt rank)
 20: {
 21:   const PetscInt  inc = blockDim.x, my0 = threadIdx.x;
 22:   PetscInt        i;
 23:   PetscErrorCode  ierr;

 25:   for (i=start+my0; i<end+1; i+=inc) {
 26:     PetscInt    js[] = {i-1, i}, nn = (i==N) ? 1 : 2; // negative indices are igored but >= N are not, so clip end
 27:     PetscScalar values[] = {1,1,1,1};
 28:     MatSetValuesDevice(d_mat,nn,js,nn,js,values,ADD_VALUES);if (ierr) assert(0);
 29:   }
 30: }

 32: PetscErrorCode assemble_on_cpu(Mat A, PetscInt start, PetscInt end, PetscInt N, PetscMPIInt rank)
 33: {
 35:   for (PetscInt i=start; i<end+1; i++) {
 36:     PetscInt    js[] = {i-1, i}, nn = (i==N) ? 1 : 2;
 37:     PetscScalar values[] = {1,1,1,1};
 38:     MatSetValues(A,nn,js,nn,js,values,ADD_VALUES);
 39:   }
 40:   return 0;
 41: }

 43: int main(int argc,char **args)
 44: {
 45:   Mat                        A;
 46:   PetscInt                   N=11, nz=3, Istart, Iend, num_threads = 128;
 47:   PetscSplitCSRDataStructure d_mat;
 48:   PetscLogEvent              event;
 49:   PetscMPIInt                rank,size;
 50:   PetscBool                  testmpiseq = PETSC_FALSE;
 51:   Vec                        x,y;

 53:   PetscInitialize(&argc,&args,(char*)0,help);
 54:   PetscOptionsGetInt(NULL,NULL, "-n", &N, NULL);
 55:   PetscOptionsGetInt(NULL,NULL, "-num_threads", &num_threads, NULL);
 56:   PetscOptionsGetInt(NULL,NULL, "-nz_row", &nz, NULL);
 57:   PetscOptionsGetBool(NULL,NULL, "-testmpiseq", &testmpiseq, NULL);
 58:   if (nz<3)   nz=3;
 59:   if (nz>N+1) nz=N+1;
 60:   MPI_Comm_rank(PETSC_COMM_WORLD,&rank);
 61:   MPI_Comm_size(PETSC_COMM_WORLD,&size);

 63:   PetscLogEventRegister("GPU operator", MAT_CLASSID, &event);
 64:   MatCreateAIJCUSPARSE(PETSC_COMM_WORLD,PETSC_DECIDE,PETSC_DECIDE,N,N,nz,NULL,nz-1,NULL,&A);
 65:   MatSetFromOptions(A);
 66:   MatSetOption(A,MAT_IGNORE_OFF_PROC_ENTRIES,PETSC_TRUE);
 67:   MatCreateVecs(A,&x,&y);
 68:   MatGetOwnershipRange(A,&Istart,&Iend);
 69:   /* current GPU assembly code does not support offprocessor values insertion */
 70:   assemble_on_cpu(A, Istart, Iend, N, rank);
 71:   MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);
 72:   MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);

 74:   // test
 75:   VecSet(x,1.0);
 76:   MatMult(A,x,y);
 77:   VecViewFromOptions(y,NULL,"-ex5_vec_view");

 79:   if (testmpiseq && size == 1) {
 80:     MatConvert(A,MATSEQAIJ,MAT_INPLACE_MATRIX,&A);
 81:     MatConvert(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);
 82:   }
 83:   PetscLogEventBegin(event,0,0,0,0);
 84:   MatCUSPARSEGetDeviceMatWrite(A,&d_mat);
 85:   assemble_on_gpu<<<1,num_threads>>>(d_mat, Istart, Iend, N, rank);
 86:   cudaDeviceSynchronize();
 87:   MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);
 88:   MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);
 89:   PetscLogEventEnd(event,0,0,0,0);

 91:   // test
 92:   VecSet(x,1.0);
 93:   MatMult(A,x,y);
 94:   VecViewFromOptions(y,NULL,"-ex5_vec_view");

 96:   MatDestroy(&A);
 97:   VecDestroy(&x);
 98:   VecDestroy(&y);
 99:   PetscFinalize();
100:   return 0;
101: }

103: /*TEST

105:    build:
106:       requires: cuda

108:    test:
109:       suffix: 0
110:       diff_args: -j
111:       args: -n 11 -ex5_vec_view
112:       nsize: 1

114:    test:
115:       suffix: 1
116:       diff_args: -j
117:       args: -n 11 -ex5_vec_view
118:       nsize: 2

120:    test:
121:       suffix: 2
122:       diff_args: -j
123:       args: -n 11 -testmpiseq -ex5_vec_view
124:       nsize: 1

126: TEST*/