Actual source code: ex5cu.cu
static char help[] = "Test of CUDA matrix assemble with simple matrix.\n\n";

// This is a minimal example of the use of the CUDA MatAIJ metadata for assembly.
//
// The matrix must be of type 'aijcusparse' and must first be assembled on the CPU to provide the nonzero pattern.
// Next, get a pointer to a simple CSR mirror (PetscSplitCSRDataStructure) of the matrix data on
// the GPU with MatCUSPARSEGetDeviceMatWrite().
// Then use this object to populate the matrix on the GPU with MatSetValuesDevice().
// Finally call MatAssemblyBegin/End() and the matrix is ready to use on the GPU without matrix data movement between the
// host and GPU.
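//
// Condensed outline of the call sequence used in main() below (error checking omitted):
//
//   MatCUSPARSEGetDeviceMatWrite(A, &d_mat);                           // A is an assembled AIJCUSPARSE matrix
//   assemble_on_gpu<<<1,num_threads>>>(d_mat, Istart, Iend, N, rank);  // kernel calls MatSetValuesDevice()
//   cudaDeviceSynchronize();
//   MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY);
//   MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY);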

#include <petscconf.h>
#include <petscmat.h>
#include <petscdevice.h>
#include <assert.h>

#include <petscaijdevice.h>
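
// Device kernel: each thread strides over rows [start, end] of this rank's block and, for each i,
// adds a 2x2 block of ones at rows/columns {i-1, i} through the device-side CSR mirror with
// MatSetValuesDevice(); for the last row (i == N) only the single index i-1 is used.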
__global__
void assemble_on_gpu(PetscSplitCSRDataStructure d_mat, PetscInt start, PetscInt end, PetscInt N, PetscMPIInt rank)
{
  const PetscInt inc = blockDim.x, my0 = threadIdx.x;
  PetscInt       i;
  PetscErrorCode ierr;

  for (i=start+my0; i<end+1; i+=inc) {
    PetscInt    js[] = {i-1, i}, nn = (i==N) ? 1 : 2; // negative indices are ignored but >= N are not, so clip the end
    PetscScalar values[] = {1,1,1,1};
    ierr = MatSetValuesDevice(d_mat,nn,js,nn,js,values,ADD_VALUES);if (ierr) assert(0);
  }
}
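
// Host-side counterpart of the kernel above: it inserts the same entries with MatSetValues() so
// that the first (CPU) assembly establishes the nonzero pattern the GPU assembly then reuses.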
PetscErrorCode assemble_on_cpu(Mat A, PetscInt start, PetscInt end, PetscInt N, PetscMPIInt rank)
{
  for (PetscInt i=start; i<end+1; i++) {
    PetscInt    js[] = {i-1, i}, nn = (i==N) ? 1 : 2;
    PetscScalar values[] = {1,1,1,1};
    MatSetValues(A,nn,js,nn,js,values,ADD_VALUES);
  }
  return 0;
}

int main(int argc,char **args)
{
  Mat                        A;
  PetscInt                   N=11, nz=3, Istart, Iend, num_threads = 128;
  PetscSplitCSRDataStructure d_mat;
  PetscLogEvent              event;
  PetscMPIInt                rank,size;
  PetscBool                  testmpiseq = PETSC_FALSE;
  Vec                        x,y;

  PetscInitialize(&argc,&args,(char*)0,help);
  PetscOptionsGetInt(NULL,NULL, "-n", &N, NULL);
  PetscOptionsGetInt(NULL,NULL, "-num_threads", &num_threads, NULL);
  PetscOptionsGetInt(NULL,NULL, "-nz_row", &nz, NULL);
  PetscOptionsGetBool(NULL,NULL, "-testmpiseq", &testmpiseq, NULL);
  if (nz<3)   nz=3;
  if (nz>N+1) nz=N+1;
  MPI_Comm_rank(PETSC_COMM_WORLD,&rank);
  MPI_Comm_size(PETSC_COMM_WORLD,&size);

  PetscLogEventRegister("GPU operator", MAT_CLASSID, &event);
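  // Create a distributed AIJCUSPARSE matrix; the nz/nz-1 preallocation (nz is clamped to at least 3
  // above) leaves room for the tridiagonal pattern produced by the 2x2 insertions below.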
  MatCreateAIJCUSPARSE(PETSC_COMM_WORLD,PETSC_DECIDE,PETSC_DECIDE,N,N,nz,NULL,nz-1,NULL,&A);
  MatSetFromOptions(A);
  MatSetOption(A,MAT_IGNORE_OFF_PROC_ENTRIES,PETSC_TRUE);
  MatCreateVecs(A,&x,&y);
  MatGetOwnershipRange(A,&Istart,&Iend);
  /* the current GPU assembly code does not support off-process value insertion */
  assemble_on_cpu(A, Istart, Iend, N, rank);
  MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);
  MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);

  // check the CPU-assembled matrix with a matrix-vector product
  VecSet(x,1.0);
  MatMult(A,x,y);
  VecViewFromOptions(y,NULL,"-ex5_vec_view");

  if (testmpiseq && size == 1) {
    MatConvert(A,MATSEQAIJ,MAT_INPLACE_MATRIX,&A);
    MatConvert(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);
  }
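
  // GPU assembly: the nonzero pattern is already fixed by the CPU assembly above, so get write
  // access to the device-side CSR mirror, fill it from the kernel, and finalize with
  // MatAssemblyBegin/End(); no matrix data needs to move between host and device.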
  PetscLogEventBegin(event,0,0,0,0);
  MatCUSPARSEGetDeviceMatWrite(A,&d_mat);
  assemble_on_gpu<<<1,num_threads>>>(d_mat, Istart, Iend, N, rank);
  cudaDeviceSynchronize();
  MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);
  MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);
  PetscLogEventEnd(event,0,0,0,0);

  // check the matrix again after the GPU additions
  VecSet(x,1.0);
  MatMult(A,x,y);
  VecViewFromOptions(y,NULL,"-ex5_vec_view");

  MatDestroy(&A);
  VecDestroy(&x);
  VecDestroy(&y);
  PetscFinalize();
  return 0;
}

/*TEST

   build:
     requires: cuda

   test:
     suffix: 0
     diff_args: -j
     args: -n 11 -ex5_vec_view
     nsize: 1

   test:
     suffix: 1
     diff_args: -j
     args: -n 11 -ex5_vec_view
     nsize: 2

   test:
     suffix: 2
     diff_args: -j
     args: -n 11 -testmpiseq -ex5_vec_view
     nsize: 1

TEST*/