Actual source code: aijcusp.cu

petsc-3.3-p7 2013-05-11

  3: /*
  4:     Defines the basic matrix operations for the AIJ (compressed row)
  5:   matrix storage format.
  6: */

  8: #include "petscconf.h"
  9: PETSC_CUDA_EXTERN_C_BEGIN
 10:  #include ../src/mat/impls/aij/seq/aij.h
 11:  #include petscbt.h
 12:  #include ../src/vec/vec/impls/dvecimpl.h
 13: #include "petsc-private/vecimpl.h"
 14: PETSC_CUDA_EXTERN_C_END
 15: #undef VecType
 16:  #include ../src/mat/impls/aij/seq/seqcusp/cuspmatimpl.h


 19: #ifdef PETSC_HAVE_TXPETSCGPU
 20: // this is such a hack ... but I don't know of another way to pass this variable
 21: // from one GPU_Matrix_Ifc class to another. This is necessary for the parallel
 22: //  SpMV. Essentially, I need to use the same stream variable in two different
 23: //  data structures. I do this by creating a single instance of that stream
 24: //  and reuse it.
 25: cudaStream_t theBodyStream=0;

 27: struct Mat_SeqAIJGPU {
 28:   GPU_Matrix_Ifc*   mat; /* pointer to the matrix on the GPU */
 29:   CUSPARRAY*        tempvec; /*pointer to a workvector to which we can copy the relevant indices of a vector we want to multiply */
 30:   PetscInt          nonzerorow; /* number of nonzero rows ... used in the flop calculations */
 31: };
 32: #endif // PETSC_HAVE_TXPETSCGPU

 34: #include <algorithm>
 35: #include <vector>
 36: #include <string>
 37: #include <thrust/sort.h>
 38: #include <thrust/fill.h>
 39: #include <cusparse.h>

 41: // Hanlde to the CUSPARSE library. Only need one
 42: cusparseHandle_t Mat_CUSPARSE_Handle;

 44: struct Mat_SeqAIJTriFactors {
 45:   void *loTriFactorPtr; /* pointer for lower triangular (factored matrix) on GPU */
 46:   void *upTriFactorPtr; /* pointer for upper triangular (factored matrix) on GPU */
 47: };

 49: struct Mat_SeqAIJCUSPARSETriFactor {
 50:   CUSPMATRIX*                 mat;     /* pointer to the matrix on the GPU */
 51:   cusparseMatDescr_t          Mat_CUSPARSE_description;
 52:   cusparseSolveAnalysisInfo_t Mat_CUSPARSE_solveAnalysisInfo;
 53:   CUSPARRAY*                  tempvecGPU; /*pointer to a workvector to which we can copy the relevant indices of a vector we want to multiply */
 54:   CUSPINTARRAYGPU*            ordIndicesGPU; /* For Lower triangular, this is the row permutation. For Upper triangular, this the column permutation */
 55: };


 58: EXTERN_C_BEGIN
 59: PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSP(Mat,Mat,IS,IS,const MatFactorInfo*);
 60: PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSP(Mat,Mat,IS,IS,const MatFactorInfo*);
 61: PetscErrorCode MatLUFactorNumeric_SeqAIJCUSP(Mat,Mat,const MatFactorInfo *);
 62: PetscErrorCode MatSolve_SeqAIJCUSP(Mat,Vec,Vec);
 63: PetscErrorCode MatSolve_SeqAIJCUSP_NaturalOrdering(Mat,Vec,Vec);
 64: EXTERN_C_END


 67: EXTERN_C_BEGIN
 68: extern PetscErrorCode MatGetFactor_seqaij_petsc(Mat,MatFactorType,Mat*);
 71: PetscErrorCode MatGetFactor_seqaij_petsccusp(Mat A,MatFactorType ftype,Mat *B)
 72: {
 73:   PetscErrorCode     ierr;

 76:   MatGetFactor_seqaij_petsc(A,ftype,B);

 78:   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT){
 79:     MatSetType(*B,MATSEQAIJCUSP);
 80:     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSP;
 81:     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSP;
 82:   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSP Matrix Types");
 83:   (*B)->factortype = ftype;
 84:   return(0);
 85: }
 86: EXTERN_C_END


 91: PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSP(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
 92: {
 93:   PetscErrorCode     ierr;

 96:   MatILUFactorSymbolic_SeqAIJ(fact,A,isrow,iscol,info);
 97:   (fact)->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSP;
 98:   return(0);
 99: }

103: PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSP(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
104: {
105:   PetscErrorCode     ierr;

108:   MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);
109:   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSP;
110:   return(0);
111: }


116: PetscErrorCode MatCUSPARSEInitialize(Mat A)
117: {
118:   Mat_SeqAIJTriFactors *cusparseTriFactors  = (Mat_SeqAIJTriFactors*)A->spptr;
119:   Mat_SeqAIJCUSPARSETriFactor* cusparsestructLo  = (Mat_SeqAIJCUSPARSETriFactor*)cusparseTriFactors->loTriFactorPtr;
120:   Mat_SeqAIJCUSPARSETriFactor* cusparsestructUp  = (Mat_SeqAIJCUSPARSETriFactor*)cusparseTriFactors->upTriFactorPtr;
121:   cusparseStatus_t stat;

124:   try {
125:     // cusparse handle creation
126:     stat = cusparseCreate(&Mat_CUSPARSE_Handle);CHKERRCUSP(stat);

128:     //  LOWER TRIANGULAR MATRIX
129:     stat = cusparseCreateMatDescr(&(cusparsestructLo->Mat_CUSPARSE_description));CHKERRCUSP(stat);
130:     stat = cusparseSetMatType(cusparsestructLo->Mat_CUSPARSE_description, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat);
131:     stat = cusparseSetMatFillMode(cusparsestructLo->Mat_CUSPARSE_description, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSP(stat);
132:     // cusparse set matrix diag type ... this doesn't seem to work right now??
133:     //stat = cusparseSetMatDiagType(cusparsestructLo->Mat_CUSPARSE_description, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSP(stat);
134:     stat = cusparseSetMatIndexBase(cusparsestructLo->Mat_CUSPARSE_description, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
135:     stat = cusparseCreateSolveAnalysisInfo(&(cusparsestructLo->Mat_CUSPARSE_solveAnalysisInfo));CHKERRCUSP(stat);
136: 
137:     //  UPPER TRIANGULAR MATRIX
138:     stat = cusparseCreateMatDescr(&(cusparsestructUp->Mat_CUSPARSE_description));CHKERRCUSP(stat);
139:     stat = cusparseSetMatType(cusparsestructUp->Mat_CUSPARSE_description, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat);
140:     stat = cusparseSetMatFillMode(cusparsestructUp->Mat_CUSPARSE_description, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
141:     stat = cusparseSetMatDiagType(cusparsestructUp->Mat_CUSPARSE_description, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);
142:     stat = cusparseSetMatIndexBase(cusparsestructUp->Mat_CUSPARSE_description, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
143:     stat = cusparseCreateSolveAnalysisInfo(&(cusparsestructUp->Mat_CUSPARSE_solveAnalysisInfo));CHKERRCUSP(stat);
144:   } catch(char* ex) {
145:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
146:   }
147:   return(0);
148: }

152: PetscErrorCode MatCUSPARSEBuildLowerTriMatrix(Mat A)
153: {
154:   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
155:   PetscInt          n = A->rmap->n;
156:   Mat_SeqAIJTriFactors *cusparseTriFactors  = (Mat_SeqAIJTriFactors*)A->spptr;
157:   Mat_SeqAIJCUSPARSETriFactor* cusparsestructLo  = (Mat_SeqAIJCUSPARSETriFactor*)cusparseTriFactors->loTriFactorPtr;
158:   const PetscInt    *ai = a->i,*aj = a->j,*vi;
159:   const MatScalar   *aa = a->a,*v;
160:   cusparseStatus_t stat;
161:   PetscErrorCode     ierr;
162:   PetscInt *AiLo, *AjLo;
163:   PetscScalar *AALo;
164:   PetscInt i,nz, nzLower, offset, rowOffset;

167:   if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU){
168:     try {
169:       /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
170:       nzLower=n+ai[n]-ai[1];
171: 
172:       /* Allocate Space for the lower triangular matrix */
173:       cudaMallocHost((void **) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
174:       cudaMallocHost((void **) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUSP(ierr);
175:       cudaMallocHost((void **) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUSP(ierr);
176: 
177:       /* Fill the lower triangular matrix */
178:       AiLo[0]=(PetscInt) 0;
179:       AiLo[n]=nzLower;
180:       AjLo[0]=(PetscInt) 0;
181:       AALo[0]=(MatScalar) 1.0;
182:       v    = aa;
183:       vi   = aj;
184:       offset=1;
185:       rowOffset=1;
186:       for (i=1; i<n; i++) {
187:         nz  = ai[i+1] - ai[i];
188:         // additional 1 for the term on the diagonal
189:         AiLo[i]=rowOffset;
190:         rowOffset+=nz+1;

192:         PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));
193:         PetscMemcpy(&(AALo[offset]), v, nz*sizeof(MatScalar));
194: 
195:         offset+=nz;
196:         AjLo[offset]=(PetscInt) i;
197:         AALo[offset]=(MatScalar) 1.0;
198:         offset+=1;
199: 
200:         v  += nz;
201:         vi += nz;
202:       }
203:       /* The Lower triangular matrix */
204:       cusparsestructLo->mat = new CUSPMATRIX;
205:       cusparsestructLo->mat->resize(n,n,nzLower);
206:       cusparsestructLo->mat->row_offsets.assign(AiLo,AiLo+n+1);
207:       cusparsestructLo->mat->column_indices.assign(AjLo,AjLo+nzLower);
208:       cusparsestructLo->mat->values.assign(AALo,AALo+nzLower);
209: 
210:       // work vector allocation
211:       cusparsestructLo->tempvecGPU = new CUSPARRAY;
212:       (cusparsestructLo->tempvecGPU)->resize(n);
213: 
214:       // cusparse analysis
215: #if defined(PETSC_USE_REAL_SINGLE)  
216:       stat = cusparseScsrsv_analysis(Mat_CUSPARSE_Handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
217:                                      n, cusparsestructLo->Mat_CUSPARSE_description,
218:                                      thrust::raw_pointer_cast(&(cusparsestructLo->mat)->values[0]),
219:                                      thrust::raw_pointer_cast(&(cusparsestructLo->mat)->row_offsets[0]),
220:                                      thrust::raw_pointer_cast(&(cusparsestructLo->mat)->column_indices[0]),
221:                                      cusparsestructLo->Mat_CUSPARSE_solveAnalysisInfo);CHKERRCUSP(stat);
222: #elif defined(PETSC_USE_REAL_DOUBLE)
223:       stat = cusparseDcsrsv_analysis(Mat_CUSPARSE_Handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
224:                                      n, cusparsestructLo->Mat_CUSPARSE_description,
225:                                      thrust::raw_pointer_cast(&(cusparsestructLo->mat)->values[0]),
226:                                      thrust::raw_pointer_cast(&(cusparsestructLo->mat)->row_offsets[0]),
227:                                      thrust::raw_pointer_cast(&(cusparsestructLo->mat)->column_indices[0]),
228:                                      cusparsestructLo->Mat_CUSPARSE_solveAnalysisInfo);CHKERRCUSP(stat);
229: #endif      
230:     } catch(char* ex) {
231:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
232:     }
233:   }
234:   return(0);
235: }

239: PetscErrorCode MatCUSPARSEBuildUpperTriMatrix(Mat A)
240: {
241:   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
242:   PetscInt          n = A->rmap->n;
243:   Mat_SeqAIJTriFactors *cusparseTriFactors  = (Mat_SeqAIJTriFactors*)A->spptr;
244:   Mat_SeqAIJCUSPARSETriFactor* cusparsestructUp  = (Mat_SeqAIJCUSPARSETriFactor*)cusparseTriFactors->upTriFactorPtr;
245:   const PetscInt    *aj = a->j,*adiag = a->diag,*vi;
246:   const MatScalar   *aa = a->a,*v;
247:   cusparseStatus_t stat;
248:   PetscInt *AiUp, *AjUp;
249:   PetscScalar *AAUp;
250:   PetscInt i,nz, nzUpper, offset;
251:   PetscErrorCode     ierr;
252: 

255:   if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU){
256:     try {
257:       /* next, figure out the number of nonzeros in the upper triangular matrix. */
258:       nzUpper = adiag[0]-adiag[n];
259: 
260:       /* Allocate Space for the upper triangular matrix */
261:       cudaMallocHost((void **) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
262:       cudaMallocHost((void **) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr);
263:       cudaMallocHost((void **) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);
264: 
265:       /* Fill the upper triangular matrix */
266:       AiUp[0]=(PetscInt) 0;
267:       AiUp[n]=nzUpper;
268:       offset = nzUpper;
269:       for (i=n-1; i>=0; i--){
270:         v   = aa + adiag[i+1] + 1;
271:         vi  = aj + adiag[i+1] + 1;
272: 
273:         // number of elements NOT on the diagonal
274:         nz = adiag[i] - adiag[i+1]-1;
275: 
276:         // decrement the offset
277:         offset -= (nz+1);
278: 
279:         // first, set the diagonal elements
280:         AjUp[offset] = (PetscInt) i;
281:         AAUp[offset] = 1./v[nz];
282:         AiUp[i] = AiUp[i+1] - (nz+1);
283: 
284:         PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));
285:         PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(MatScalar));
286:       }
287:       /* The Upper triangular matrix */
288:       cusparsestructUp->mat = new CUSPMATRIX;
289:       cusparsestructUp->mat->resize(n,n,nzUpper);
290:       cusparsestructUp->mat->row_offsets.assign(AiUp,AiUp+n+1);
291:       cusparsestructUp->mat->column_indices.assign(AjUp,AjUp+nzUpper);
292:       cusparsestructUp->mat->values.assign(AAUp,AAUp+nzUpper);
293: 
294:       // work vector allocation
295:       cusparsestructUp->tempvecGPU = new CUSPARRAY;
296:       (cusparsestructUp->tempvecGPU)->resize(n);
297: 
298:       // cusparse analysis
299: #if defined(PETSC_USE_REAL_SINGLE)  
300:       stat = cusparseScsrsv_analysis(Mat_CUSPARSE_Handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
301:                                      n, cusparsestructUp->Mat_CUSPARSE_description,
302:                                      thrust::raw_pointer_cast(&(cusparsestructUp->mat)->values[0]),
303:                                      thrust::raw_pointer_cast(&(cusparsestructUp->mat)->row_offsets[0]),
304:                                      thrust::raw_pointer_cast(&(cusparsestructUp->mat)->column_indices[0]),
305:                                      cusparsestructUp->Mat_CUSPARSE_solveAnalysisInfo);CHKERRCUSP(stat);
306: #elif defined(PETSC_USE_REAL_DOUBLE)
307:       stat = cusparseDcsrsv_analysis(Mat_CUSPARSE_Handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
308:                                      n, cusparsestructUp->Mat_CUSPARSE_description,
309:                                      thrust::raw_pointer_cast(&(cusparsestructUp->mat)->values[0]),
310:                                      thrust::raw_pointer_cast(&(cusparsestructUp->mat)->row_offsets[0]),
311:                                      thrust::raw_pointer_cast(&(cusparsestructUp->mat)->column_indices[0]),
312:                                      cusparsestructUp->Mat_CUSPARSE_solveAnalysisInfo);CHKERRCUSP(stat);
313: #endif
314:     } catch(char* ex) {
315:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
316:     }
317:   }
318:   return(0);
319: }

323: PetscErrorCode MatCUSPARSEAnalysisAndCopyToGPU(Mat A)
324: {
325:   PetscErrorCode     ierr;
326:   Mat_SeqAIJ *a=(Mat_SeqAIJ *)A->data;
327:   Mat_SeqAIJTriFactors *cusparseTriFactors  = (Mat_SeqAIJTriFactors*)A->spptr;
328:   Mat_SeqAIJCUSPARSETriFactor* cusparsestructLo  = (Mat_SeqAIJCUSPARSETriFactor*)cusparseTriFactors->loTriFactorPtr;
329:   Mat_SeqAIJCUSPARSETriFactor* cusparsestructUp  = (Mat_SeqAIJCUSPARSETriFactor*)cusparseTriFactors->upTriFactorPtr;
330:   IS               isrow = a->row,iscol = a->icol;
331:   PetscBool        row_identity,col_identity;
332:   const PetscInt   *r,*c;
334: 
335:   MatCUSPARSEInitialize(A);
336:   MatCUSPARSEBuildLowerTriMatrix(A);
337:   MatCUSPARSEBuildUpperTriMatrix(A);
338:   A->valid_GPU_matrix = PETSC_CUSP_BOTH;

340:   //lower triangular indices
341:   ISGetIndices(isrow,&r);
342:   ISIdentity(isrow,&row_identity);
343:   cusparsestructLo->ordIndicesGPU = new CUSPINTARRAYGPU;
344:   (cusparsestructLo->ordIndicesGPU)->assign(&r[0], &r[0]+A->rmap->n);

346:   //upper triangular indices
347:   ISGetIndices(iscol,&c);
348:   ISIdentity(iscol,&col_identity);
349:   cusparsestructUp->ordIndicesGPU = new CUSPINTARRAYGPU;
350:   (cusparsestructUp->ordIndicesGPU)->assign(&c[0], &c[0]+A->rmap->n);

352:   return(0);
353: }

357: PetscErrorCode MatLUFactorNumeric_SeqAIJCUSP(Mat B,Mat A,const MatFactorInfo *info)
358: {
359:   PetscErrorCode   ierr;
360:   Mat_SeqAIJ       *b=(Mat_SeqAIJ *)B->data;
361:   IS               isrow = b->row,iscol = b->col;
362:   PetscBool        row_identity,col_identity;

365: 
366:   MatLUFactorNumeric_SeqAIJ(B,A,info);
367: 
368:   // determine which version of MatSolve needs to be used.
369:   ISIdentity(isrow,&row_identity);
370:   ISIdentity(iscol,&col_identity);
371:   if (row_identity && col_identity) B->ops->solve = MatSolve_SeqAIJCUSP_NaturalOrdering;
372:   else                              B->ops->solve = MatSolve_SeqAIJCUSP;

374:   // get the triangular factors
375:   MatCUSPARSEAnalysisAndCopyToGPU(B);
376:   return(0);
377: }



383: PetscErrorCode MatSolve_SeqAIJCUSP(Mat A,Vec bb,Vec xx)
384: {
385:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
387:   CUSPARRAY      *xGPU, *bGPU;
388:   PetscInt          n = A->rmap->n;
389:   cusparseStatus_t stat;
390:   Mat_SeqAIJTriFactors *cusparseTriFactors  = (Mat_SeqAIJTriFactors*)A->spptr;
391:   Mat_SeqAIJCUSPARSETriFactor *cusparsestructLo  = (Mat_SeqAIJCUSPARSETriFactor*)cusparseTriFactors->loTriFactorPtr;
392:   Mat_SeqAIJCUSPARSETriFactor *cusparsestructUp  = (Mat_SeqAIJCUSPARSETriFactor*)cusparseTriFactors->upTriFactorPtr;


396:   // Get the GPU pointers
397:   VecCUSPGetArrayWrite(xx,&xGPU);
398:   VecCUSPGetArrayRead(bb,&bGPU);
399: 
400:   // Copy the right hand side vector, bGPU, into xGPU with the row permutation
401:   thrust::copy(thrust::make_permutation_iterator(bGPU->begin(), (cusparsestructLo->ordIndicesGPU)->begin()),
402:                thrust::make_permutation_iterator(bGPU->end(),   (cusparsestructLo->ordIndicesGPU)->end()),
403:                xGPU->begin());


406: #if defined(PETSC_USE_REAL_SINGLE)  
407:   stat = cusparseScsrsv_solve(Mat_CUSPARSE_Handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
408:                               n, 1.0f, cusparsestructLo->Mat_CUSPARSE_description,
409:                               thrust::raw_pointer_cast(&(cusparsestructLo->mat)->values[0]),
410:                               thrust::raw_pointer_cast(&(cusparsestructLo->mat)->row_offsets[0]),
411:                               thrust::raw_pointer_cast(&(cusparsestructLo->mat)->column_indices[0]),
412:                               cusparsestructLo->Mat_CUSPARSE_solveAnalysisInfo,
413:                               (const PetscScalar *) thrust::raw_pointer_cast(xGPU->data()),
414:                               (PetscScalar *) thrust::raw_pointer_cast((cusparsestructLo->tempvecGPU)->data()));CHKERRCUSP(stat);
415: 
416:   stat = cusparseScsrsv_solve(Mat_CUSPARSE_Handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
417:                               n, 1.0f, cusparsestructUp->Mat_CUSPARSE_description,
418:                               thrust::raw_pointer_cast(&(cusparsestructUp->mat)->values[0]),
419:                               thrust::raw_pointer_cast(&(cusparsestructUp->mat)->row_offsets[0]),
420:                               thrust::raw_pointer_cast(&(cusparsestructUp->mat)->column_indices[0]),
421:                               cusparsestructUp->Mat_CUSPARSE_solveAnalysisInfo,
422:                               (const PetscScalar *) thrust::raw_pointer_cast((cusparsestructLo->tempvecGPU)->data()),
423:                               (PetscScalar *) thrust::raw_pointer_cast(xGPU->data()));CHKERRCUSP(stat);

425: #elif defined(PETSC_USE_REAL_DOUBLE)
426:   stat = cusparseDcsrsv_solve(Mat_CUSPARSE_Handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
427:                               n, 1.0, cusparsestructLo->Mat_CUSPARSE_description,
428:                               thrust::raw_pointer_cast(&(cusparsestructLo->mat)->values[0]),
429:                               thrust::raw_pointer_cast(&(cusparsestructLo->mat)->row_offsets[0]),
430:                               thrust::raw_pointer_cast(&(cusparsestructLo->mat)->column_indices[0]),
431:                               cusparsestructLo->Mat_CUSPARSE_solveAnalysisInfo,
432:                               (const PetscScalar *) thrust::raw_pointer_cast(xGPU->data()),
433:                               (PetscScalar *) thrust::raw_pointer_cast((cusparsestructLo->tempvecGPU)->data()));CHKERRCUSP(stat);
434: 
435:   stat = cusparseDcsrsv_solve(Mat_CUSPARSE_Handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
436:                               n, 1.0, cusparsestructUp->Mat_CUSPARSE_description,
437:                               thrust::raw_pointer_cast(&(cusparsestructUp->mat)->values[0]),
438:                               thrust::raw_pointer_cast(&(cusparsestructUp->mat)->row_offsets[0]),
439:                               thrust::raw_pointer_cast(&(cusparsestructUp->mat)->column_indices[0]),
440:                               cusparsestructUp->Mat_CUSPARSE_solveAnalysisInfo,
441:                               (const PetscScalar *) thrust::raw_pointer_cast((cusparsestructLo->tempvecGPU)->data()),
442:                               (PetscScalar *) thrust::raw_pointer_cast(xGPU->data()));CHKERRCUSP(stat);
443:  #endif
444: 
445:   // Copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place.
446:   thrust::copy(thrust::make_permutation_iterator(xGPU->begin(),   (cusparsestructUp->ordIndicesGPU)->begin()),
447:                thrust::make_permutation_iterator(xGPU->end(), (cusparsestructUp->ordIndicesGPU)->end()),
448:                (cusparsestructLo->tempvecGPU)->begin());
449: 
450:   // Copy the temporary to the full solution.
451:   thrust::copy((cusparsestructLo->tempvecGPU)->begin(), (cusparsestructLo->tempvecGPU)->end(), xGPU->begin());
452: 
453:   VecCUSPRestoreArrayRead(bb,&bGPU);
454:   VecCUSPRestoreArrayWrite(xx,&xGPU);
455:   WaitForGPU();CHKERRCUSP(ierr);
456:   PetscLogFlops(2.0*a->nz - A->cmap->n);
457:   return(0);
458: }




465: PetscErrorCode MatSolve_SeqAIJCUSP_NaturalOrdering(Mat A,Vec bb,Vec xx)
466: {
467:   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
468:   PetscErrorCode    ierr;
469:   CUSPARRAY         *xGPU, *bGPU;
470:   PetscInt          n = A->rmap->n;
471:   cusparseStatus_t stat;
472:   Mat_SeqAIJTriFactors *cusparseTriFactors  = (Mat_SeqAIJTriFactors*)A->spptr;
473:   Mat_SeqAIJCUSPARSETriFactor *cusparsestructLo  = (Mat_SeqAIJCUSPARSETriFactor*)cusparseTriFactors->loTriFactorPtr;
474:   Mat_SeqAIJCUSPARSETriFactor *cusparsestructUp  = (Mat_SeqAIJCUSPARSETriFactor*)cusparseTriFactors->upTriFactorPtr;
475: 
477:   VecCUSPGetArrayWrite(xx,&xGPU);
478:   VecCUSPGetArrayRead(bb,&bGPU);

480: #if defined(PETSC_USE_REAL_SINGLE)  
481:   stat = cusparseScsrsv_solve(Mat_CUSPARSE_Handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
482:                               n, 1.0f, cusparsestructLo->Mat_CUSPARSE_description,
483:                               thrust::raw_pointer_cast(&(cusparsestructLo->mat)->values[0]),
484:                               thrust::raw_pointer_cast(&(cusparsestructLo->mat)->row_offsets[0]),
485:                               thrust::raw_pointer_cast(&(cusparsestructLo->mat)->column_indices[0]),
486:                               cusparsestructLo->Mat_CUSPARSE_solveAnalysisInfo,
487:                               (const PetscScalar *) thrust::raw_pointer_cast(bGPU->data()),
488:                               (PetscScalar *) thrust::raw_pointer_cast((cusparsestructLo->tempvecGPU)->data()));CHKERRCUSP(stat);
489: 
490:   stat = cusparseScsrsv_solve(Mat_CUSPARSE_Handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
491:                               n, 1.0f, cusparsestructUp->Mat_CUSPARSE_description,
492:                               thrust::raw_pointer_cast(&(cusparsestructUp->mat)->values[0]),
493:                               thrust::raw_pointer_cast(&(cusparsestructUp->mat)->row_offsets[0]),
494:                               thrust::raw_pointer_cast(&(cusparsestructUp->mat)->column_indices[0]),
495:                               cusparsestructUp->Mat_CUSPARSE_solveAnalysisInfo,
496:                               (const PetscScalar *) thrust::raw_pointer_cast((cusparsestructLo->tempvecGPU)->data()),
497:                               (PetscScalar *) thrust::raw_pointer_cast(xGPU->data()));CHKERRCUSP(stat);
498: 
499: #elif defined(PETSC_USE_REAL_DOUBLE)        
500:   stat = cusparseDcsrsv_solve(Mat_CUSPARSE_Handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
501:                               n, 1.0, cusparsestructLo->Mat_CUSPARSE_description,
502:                               thrust::raw_pointer_cast(&(cusparsestructLo->mat)->values[0]),
503:                               thrust::raw_pointer_cast(&(cusparsestructLo->mat)->row_offsets[0]),
504:                               thrust::raw_pointer_cast(&(cusparsestructLo->mat)->column_indices[0]),
505:                               cusparsestructLo->Mat_CUSPARSE_solveAnalysisInfo,
506:                               (const PetscScalar *) thrust::raw_pointer_cast(bGPU->data()),
507:                               (PetscScalar *) thrust::raw_pointer_cast((cusparsestructLo->tempvecGPU)->data()));CHKERRCUSP(stat);
508: 
509:   stat = cusparseDcsrsv_solve(Mat_CUSPARSE_Handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
510:                               n, 1.0, cusparsestructUp->Mat_CUSPARSE_description,
511:                               thrust::raw_pointer_cast(&(cusparsestructUp->mat)->values[0]),
512:                               thrust::raw_pointer_cast(&(cusparsestructUp->mat)->row_offsets[0]),
513:                               thrust::raw_pointer_cast(&(cusparsestructUp->mat)->column_indices[0]),
514:                               cusparsestructUp->Mat_CUSPARSE_solveAnalysisInfo,
515:                               (const PetscScalar *) thrust::raw_pointer_cast((cusparsestructLo->tempvecGPU)->data()),
516:                               (PetscScalar *) thrust::raw_pointer_cast(xGPU->data()));CHKERRCUSP(stat);
517: #endif
518:   VecCUSPRestoreArrayRead(bb,&bGPU);
519:   VecCUSPRestoreArrayWrite(xx,&xGPU);
520:   WaitForGPU();CHKERRCUSP(ierr);
521:   PetscLogFlops(2.0*a->nz - A->cmap->n);
522:   return(0);
523: }

527: PetscErrorCode MatCUSPCopyToGPU(Mat A)
528: {

530: #ifdef PETSC_HAVE_TXPETSCGPU
531:   Mat_SeqAIJGPU *cuspstruct  = (Mat_SeqAIJGPU*)A->spptr;
532: #else
533:   Mat_SeqAIJCUSP *cuspstruct  = (Mat_SeqAIJCUSP*)A->spptr;
534: #endif
535:   Mat_SeqAIJ      *a          = (Mat_SeqAIJ*)A->data;
536:   PetscInt        m           = A->rmap->n,*ii,*ridx;
537:   PetscErrorCode  ierr;


541:   if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU){
542:     PetscLogEventBegin(MAT_CUSPCopyToGPU,A,0,0,0);
543:     /*
544:       It may be possible to reuse nonzero structure with new matrix values but 
545:       for simplicity and insured correctness we delete and build a new matrix on
546:       the GPU. Likely a very small performance hit.
547:     */
548:     if (cuspstruct->mat){
549:       try {
550:         delete cuspstruct->mat;
551:         if (cuspstruct->tempvec)
552:           delete cuspstruct->tempvec;
553: 
554:       } catch(char* ex) {
555:         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
556:       }
557:     }
558:     try {
559:       cuspstruct->nonzerorow=0;
560:       for (int j = 0; j<m; j++)
561:         cuspstruct->nonzerorow += ((a->i[j+1]-a->i[j])>0);

563: #ifdef PETSC_HAVE_TXPETSCGPU
564:       if (a->compressedrow.use) {
565:         m    = a->compressedrow.nrows;
566:         ii   = a->compressedrow.i;
567:         ridx = a->compressedrow.rindex;
568:       } else {
569:         // Forcing compressed row on the GPU ... only relevant for CSR storage
570:         int k=0;
571:         PetscMalloc((cuspstruct->nonzerorow+1)*sizeof(PetscInt), &ii);
572:         PetscMalloc((cuspstruct->nonzerorow)*sizeof(PetscInt), &ridx);
573:         ii[0]=0;
574:         for (int j = 0; j<m; j++) {
575:           if ((a->i[j+1]-a->i[j])>0) {
576:             ii[k] = a->i[j];
577:             ridx[k]= j;
578:             k++;
579:           }
580:         }
581:         ii[cuspstruct->nonzerorow] = a->nz;
582:         m = cuspstruct->nonzerorow;
583:       }

585:       // Build our matrix ... first determine the GPU storage type
586:       PetscBool found;
587:       char      input[4] = "csr";
588:       std::string storageFormat;
589:       PetscOptionsGetString(PETSC_NULL, "-cusp_storage_format", input, 4, &found);
590:       storageFormat.assign(input);
591:       if(storageFormat!="csr" && storageFormat!="dia"&& storageFormat!="ell"&& storageFormat!="coo")
592:         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Bad argument to -cusp_storage_format. Must be either 'csr', 'dia' (diagonal), 'ell' (ellpack), or 'coo' (coordinate)\n");
593:       else {
594:         cuspstruct->mat = GPU_Matrix_Factory::getNew(storageFormat);

596:         // Create the streams and events (if desired).
597:         PetscMPIInt    size;
598:         MPI_Comm_size(PETSC_COMM_WORLD,&size);
599:         cuspstruct->mat->buildStreamsAndEvents(size, &theBodyStream);CHKERRCUSP(ierr);

601:         // Determine which library to use : cusp or cusparse
602:         PetscBool flg = PETSC_FALSE;
603:         PetscOptionsGetBool(PETSC_NULL,"-use_cusparse",&flg,PETSC_NULL);
604:         if (flg)
605:           cuspstruct->mat->initializeCusparse();
606: 
607:         // lastly, build the matrix
608:         cuspstruct->mat->setMatrix(m, A->cmap->n, a->nz, ii, a->j, a->a);
609:         cuspstruct->mat->setCPRowIndices(ridx, m);
610: 
611:         ///
612:         // INODES : Determine the inode data structure for the GPU.
613:         //   This only really matters for the CSR format.
614:         //
615:         if (a->inode.use) {
616:           PetscInt * temp;
617:           PetscMalloc((a->inode.node_count+1)*sizeof(PetscInt), &temp);
618:           temp[0]=0;
619:           PetscInt nodeMax=0, nnzPerRowMax=0;
620:           for (int i = 0; i<a->inode.node_count; i++) {
621:             temp[i+1]= a->inode.size[i]+temp[i];
622:             if (a->inode.size[i] > nodeMax)
623:               nodeMax = a->inode.size[i];
624:           }
625:           // Determine the maximum number of nonzeros in a row.
626:           cuspstruct->nonzerorow = 0;
627:           for (int j = 0; j<A->rmap->n; j++) {
628:             cuspstruct->nonzerorow += ((a->i[j+1]-a->i[j])>0);
629:             if (a->i[j+1]-a->i[j] > nnzPerRowMax) {
630:               nnzPerRowMax = a->i[j+1]-a->i[j];
631:             }
632:           }
633:           // Set the Inode data ... only relevant for CSR really
634:           cuspstruct->mat->setInodeData(temp, a->inode.node_count+1, nnzPerRowMax, nodeMax, a->inode.node_count);
635:           PetscFree(temp);
636:         }

638:       }
639:       if (!a->compressedrow.use) {
640:         // free data
641:         PetscFree(ii);
642:         PetscFree(ridx);
643:       }
644: 
645: #else

647:       cuspstruct->mat = new CUSPMATRIX;
648:       if (a->compressedrow.use) {
649:         m    = a->compressedrow.nrows;
650:         ii   = a->compressedrow.i;
651:         ridx = a->compressedrow.rindex;
652:         cuspstruct->mat->resize(m,A->cmap->n,a->nz);
653:         cuspstruct->mat->row_offsets.assign(ii,ii+m+1);
654:         cuspstruct->mat->column_indices.assign(a->j,a->j+a->nz);
655:         cuspstruct->mat->values.assign(a->a,a->a+a->nz);
656:         cuspstruct->indices = new CUSPINTARRAYGPU;
657:         cuspstruct->indices->assign(ridx,ridx+m);
658:       } else {
659:         cuspstruct->mat->resize(m,A->cmap->n,a->nz);
660:         cuspstruct->mat->row_offsets.assign(a->i,a->i+m+1);
661:         cuspstruct->mat->column_indices.assign(a->j,a->j+a->nz);
662:         cuspstruct->mat->values.assign(a->a,a->a+a->nz);
663:       }
664: #endif
665:       cuspstruct->tempvec = new CUSPARRAY;
666:       cuspstruct->tempvec->resize(m);
667:     } catch(char* ex) {
668:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
669:     }
670:     WaitForGPU();CHKERRCUSP(ierr);
671:     A->valid_GPU_matrix = PETSC_CUSP_BOTH;
672:     PetscLogEventEnd(MAT_CUSPCopyToGPU,A,0,0,0);
673:   }
674:   return(0);
675: }

679: PetscErrorCode MatCUSPCopyFromGPU(Mat A, CUSPMATRIX *Agpu)
680: {
681:   Mat_SeqAIJCUSP *cuspstruct = (Mat_SeqAIJCUSP *) A->spptr;
682:   Mat_SeqAIJ     *a          = (Mat_SeqAIJ *) A->data;
683:   PetscInt        m          = A->rmap->n;
684:   PetscErrorCode  ierr;

687:   if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED) {
688:     if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED) {
689:       try {
690:         cuspstruct->mat = Agpu;
691:         if (a->compressedrow.use) {
692:           //PetscInt *ii, *ridx;
693:           SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "Cannot handle row compression for GPU matrices");
694:         } else {
695:           PetscInt i;

697:           if (m+1 != (PetscInt) cuspstruct->mat->row_offsets.size()) SETERRQ2(PETSC_COMM_WORLD, PETSC_ERR_ARG_SIZ, "GPU matrix has %d rows, should be %d", cuspstruct->mat->row_offsets.size()-1, m);
698:           a->nz    = cuspstruct->mat->values.size();
699:           a->maxnz = a->nz; /* Since we allocate exactly the right amount */
700:           A->preallocated = PETSC_TRUE;
701:           // Copy ai, aj, aa
702:           if (a->singlemalloc) {
703:             if (a->a) {PetscFree3(a->a,a->j,a->i);}
704:           } else {
705:             if (a->i) {PetscFree(a->i);}
706:             if (a->j) {PetscFree(a->j);}
707:             if (a->a) {PetscFree(a->a);}
708:           }
709:           PetscMalloc3(a->nz,PetscScalar,&a->a,a->nz,PetscInt,&a->j,m+1,PetscInt,&a->i);
710:           PetscLogObjectMemory(A, a->nz*(sizeof(PetscScalar)+sizeof(PetscInt))+(m+1)*sizeof(PetscInt));
711:           a->singlemalloc = PETSC_TRUE;
712:           thrust::copy(cuspstruct->mat->row_offsets.begin(), cuspstruct->mat->row_offsets.end(), a->i);
713:           thrust::copy(cuspstruct->mat->column_indices.begin(), cuspstruct->mat->column_indices.end(), a->j);
714:           thrust::copy(cuspstruct->mat->values.begin(), cuspstruct->mat->values.end(), a->a);
715:           // Setup row lengths
716:           if (a->imax) {PetscFree2(a->imax,a->ilen);}
717:           PetscMalloc2(m,PetscInt,&a->imax,m,PetscInt,&a->ilen);
718:           PetscLogObjectMemory(A, 2*m*sizeof(PetscInt));
719:           for(i = 0; i < m; ++i) {
720:             a->imax[i] = a->ilen[i] = a->i[i+1] - a->i[i];
721:           }
722:           // a->diag?
723:         }
724:         cuspstruct->tempvec = new CUSPARRAY;
725:         cuspstruct->tempvec->resize(m);
726:       } catch(char *ex) {
727:         SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSP error: %s", ex);
728:       }
729:     }
730:     // This assembly prevents resetting the flag to PETSC_CUSP_CPU and recopying
731:     MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY);
732:     MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY);
733:     A->valid_GPU_matrix = PETSC_CUSP_BOTH;
734:   } else {
735:     SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "Only valid for unallocated GPU matrices");
736:   }
737:   return(0);
738: }

742: PetscErrorCode MatGetVecs_SeqAIJCUSP(Mat mat, Vec *right, Vec *left)
743: {


748:   if (right) {
749:     VecCreate(((PetscObject)mat)->comm,right);
750:     VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);
751:     VecSetBlockSize(*right,mat->rmap->bs);
752:     VecSetType(*right,VECSEQCUSP);
753:     PetscLayoutReference(mat->cmap,&(*right)->map);
754:   }
755:   if (left) {
756:     VecCreate(((PetscObject)mat)->comm,left);
757:     VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);
758:     VecSetBlockSize(*left,mat->rmap->bs);
759:     VecSetType(*left,VECSEQCUSP);
760:     PetscLayoutReference(mat->rmap,&(*left)->map);
761:   }
762:   return(0);
763: }

767: PetscErrorCode MatMult_SeqAIJCUSP(Mat A,Vec xx,Vec yy)
768: {
769:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
771: #ifdef PETSC_HAVE_TXPETSCGPU
772:   Mat_SeqAIJGPU *cuspstruct = (Mat_SeqAIJGPU *)A->spptr;
773: #else
774:   PetscBool      usecprow    = a->compressedrow.use;
775:   Mat_SeqAIJCUSP *cuspstruct = (Mat_SeqAIJCUSP *)A->spptr;
776: #endif
777:   CUSPARRAY      *xarray,*yarray;

780:   // The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSP
781:   // MatCUSPCopyToGPU(A);
782:   VecCUSPGetArrayRead(xx,&xarray);
783:   VecCUSPGetArrayWrite(yy,&yarray);
784:   try {
785: #ifdef PETSC_HAVE_TXPETSCGPU
786:     cuspstruct->mat->multiply(xarray, yarray);CHKERRCUSP(ierr);
787:     //cuspstruct->mat->multiply(thrust::raw_pointer_cast(xarray->data()),
788:     //                                     thrust::raw_pointer_cast(yarray->data()));CHKERRCUSP(ierr);
789: #else
790:     if (usecprow){ /* use compressed row format */
791:       cusp::multiply(*cuspstruct->mat,*xarray,*cuspstruct->tempvec);
792:       VecSet_SeqCUSP(yy,0.0);
793:       thrust::copy(cuspstruct->tempvec->begin(),cuspstruct->tempvec->end(),thrust::make_permutation_iterator(yarray->begin(),cuspstruct->indices->begin()));
794:     } else { /* do not use compressed row format */
795:       cusp::multiply(*cuspstruct->mat,*xarray,*yarray);
796:     }
797: #endif

799:   } catch (char* ex) {
800:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
801:   }
802:   VecCUSPRestoreArrayRead(xx,&xarray);
803:   VecCUSPRestoreArrayWrite(yy,&yarray);
804: #ifdef PETSC_HAVE_TXPETSCGPU
805:   if (!cuspstruct->mat->hasNonZeroStream())
806:     WaitForGPU();CHKERRCUSP(ierr);
807: #else
808:   WaitForGPU();CHKERRCUSP(ierr);
809: #endif
810:   PetscLogFlops(2.0*a->nz - cuspstruct->nonzerorow);
811:   return(0);
812: }


815: struct VecCUSPPlusEquals
816: {
817:   template <typename Tuple>
818:   __host__ __device__
819:   void operator()(Tuple t)
820:   {
821:     thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
822:   }
823: };

827: PetscErrorCode MatMultAdd_SeqAIJCUSP(Mat A,Vec xx,Vec yy,Vec zz)
828: {
829:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
831: #ifdef PETSC_HAVE_TXPETSCGPU
832:   Mat_SeqAIJGPU *cuspstruct = (Mat_SeqAIJGPU *)A->spptr;
833: #else
834:   Mat_SeqAIJCUSP *cuspstruct = (Mat_SeqAIJCUSP *)A->spptr;
835: #endif
836:   CUSPARRAY      *xarray,*yarray,*zarray;
838:   // The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSP
839:   // MatCUSPCopyToGPU(A);
840:   try {
841:     VecCopy_SeqCUSP(yy,zz);
842:     VecCUSPGetArrayRead(xx,&xarray);
843:     VecCUSPGetArrayRead(yy,&yarray);
844:     VecCUSPGetArrayWrite(zz,&zarray);
845: #ifdef PETSC_HAVE_TXPETSCGPU
846:     cuspstruct->mat->multiplyAdd(xarray, zarray);CHKERRCUSP(ierr);
847: #else
848:     if (a->compressedrow.use) {
849:       cusp::multiply(*cuspstruct->mat,*xarray, *cuspstruct->tempvec);
850:       thrust::for_each(
851:            thrust::make_zip_iterator(
852:                  thrust::make_tuple(cuspstruct->tempvec->begin(),
853:                                     thrust::make_permutation_iterator(zarray->begin(), cuspstruct->indices->begin()))),
854:            thrust::make_zip_iterator(
855:                  thrust::make_tuple(cuspstruct->tempvec->begin(),
856:                                     thrust::make_permutation_iterator(zarray->begin(),cuspstruct->indices->begin()))) + cuspstruct->tempvec->size(),
857:            VecCUSPPlusEquals());
858:     } else {
859:       cusp::multiply(*cuspstruct->mat,*xarray,*cuspstruct->tempvec);
860:       thrust::for_each(
861:          thrust::make_zip_iterator(thrust::make_tuple(
862:                                     cuspstruct->tempvec->begin(),
863:                                     zarray->begin())),
864:          thrust::make_zip_iterator(thrust::make_tuple(
865:                                     cuspstruct->tempvec->end(),
866:                                     zarray->end())),
867:          VecCUSPPlusEquals());
868:     }
869: #endif
870:     VecCUSPRestoreArrayRead(xx,&xarray);
871:     VecCUSPRestoreArrayRead(yy,&yarray);
872:     VecCUSPRestoreArrayWrite(zz,&zarray);
873: 
874:   } catch(char* ex) {
875:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
876:   }
877:   WaitForGPU();CHKERRCUSP(ierr);
878:   PetscLogFlops(2.0*a->nz);
879:   return(0);
880: }

884: PetscErrorCode MatAssemblyEnd_SeqAIJCUSP(Mat A,MatAssemblyType mode)
885: {
886:   PetscErrorCode  ierr;
888:   MatAssemblyEnd_SeqAIJ(A,mode);
889:   MatCUSPCopyToGPU(A);
890:   if (mode == MAT_FLUSH_ASSEMBLY) return(0);
891:   // this is not necessary since MatCUSPCopyToGPU has been called.
892:   //if (A->valid_GPU_matrix != PETSC_CUSP_UNALLOCATED){
893:   //  A->valid_GPU_matrix = PETSC_CUSP_CPU;
894:   //}
895:   A->ops->mult    = MatMult_SeqAIJCUSP;
896:   A->ops->multadd    = MatMultAdd_SeqAIJCUSP;
897:   return(0);
898: }

900: /* --------------------------------------------------------------------------------*/
903: /*@C
904:    MatCreateSeqAIJCUSP - Creates a sparse matrix in AIJ (compressed row) format
905:    (the default parallel PETSc format).  For good matrix assembly performance
906:    the user should preallocate the matrix storage by setting the parameter nz
907:    (or the array nnz).  By setting these parameters accurately, performance
908:    during matrix assembly can be increased by more than a factor of 50.

910:    Collective on MPI_Comm

912:    Input Parameters:
913: +  comm - MPI communicator, set to PETSC_COMM_SELF
914: .  m - number of rows
915: .  n - number of columns
916: .  nz - number of nonzeros per row (same for all rows)
917: -  nnz - array containing the number of nonzeros in the various rows 
918:          (possibly different for each row) or PETSC_NULL

920:    Output Parameter:
921: .  A - the matrix 

923:    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
924:    MatXXXXSetPreallocation() paradgm instead of this routine directly.
925:    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

927:    Notes:
928:    If nnz is given then nz is ignored

930:    The AIJ format (also called the Yale sparse matrix format or
931:    compressed row storage), is fully compatible with standard Fortran 77
932:    storage.  That is, the stored row and column indices can begin at
933:    either one (as in Fortran) or zero.  See the users' manual for details.

935:    Specify the preallocated storage with either nz or nnz (not both).
936:    Set nz=PETSC_DEFAULT and nnz=PETSC_NULL for PETSc to control dynamic memory 
937:    allocation.  For large problems you MUST preallocate memory or you 
938:    will get TERRIBLE performance, see the users' manual chapter on matrices.

940:    By default, this format uses inodes (identical nodes) when possible, to 
941:    improve numerical efficiency of matrix-vector products and solves. We 
942:    search for consecutive rows with the same nonzero structure, thereby
943:    reusing matrix information to achieve increased efficiency.

945:    Level: intermediate

947: .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ()

949: @*/
950: PetscErrorCode  MatCreateSeqAIJCUSP(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
951: {

955:   MatCreate(comm,A);
956:   MatSetSizes(*A,m,n,m,n);
957:   MatSetType(*A,MATSEQAIJCUSP);
958:   MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);
959:   return(0);
960: }

962: #ifdef PETSC_HAVE_TXPETSCGPU

966: PetscErrorCode MatDestroy_SeqAIJCUSP(Mat A)
967: {
968:   PetscErrorCode        ierr;
969:   Mat_SeqAIJGPU      *cuspstruct = (Mat_SeqAIJGPU*)A->spptr;

972:   if (A->factortype==MAT_FACTOR_NONE) {
973:     // The regular matrices
974:     try {
975:       if (A->valid_GPU_matrix != PETSC_CUSP_UNALLOCATED){
976:         delete (GPU_Matrix_Ifc *)(cuspstruct->mat);
977:       }
978:       if (cuspstruct->tempvec!=0)
979:         delete cuspstruct->tempvec;
980:       delete cuspstruct;
981:       A->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;
982:     } catch(char* ex) {
983:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
984:     }
985:   } else {
986:     // The triangular factors
987:     try {
988:       Mat_SeqAIJTriFactors *cusparseTriFactors  = (Mat_SeqAIJTriFactors*)A->spptr;
989:       Mat_SeqAIJCUSPARSETriFactor *cusparsestructLo  = (Mat_SeqAIJCUSPARSETriFactor*)cusparseTriFactors->loTriFactorPtr;
990:       Mat_SeqAIJCUSPARSETriFactor *cusparsestructUp  = (Mat_SeqAIJCUSPARSETriFactor*)cusparseTriFactors->upTriFactorPtr;
991:       cusparseStatus_t stat;
992: 
993:       stat = cusparseDestroy(Mat_CUSPARSE_Handle);CHKERRCUSP(stat);
994:       stat = cusparseDestroyMatDescr(cusparsestructLo->Mat_CUSPARSE_description);CHKERRCUSP(stat);
995:       stat = cusparseDestroySolveAnalysisInfo(cusparsestructLo->Mat_CUSPARSE_solveAnalysisInfo);CHKERRCUSP(stat);

997:       // destroy the matrix and the container
998:       if (cusparsestructLo->mat!=0)
999:         delete cusparsestructLo->mat;
1000:       if (cusparsestructLo->tempvecGPU!=0)
1001:         delete cusparsestructLo->tempvecGPU;
1002:       delete cusparsestructLo;
1003: 
1004:       stat = cusparseDestroyMatDescr(cusparsestructUp->Mat_CUSPARSE_description);CHKERRCUSP(stat);
1005:       stat = cusparseDestroySolveAnalysisInfo(cusparsestructUp->Mat_CUSPARSE_solveAnalysisInfo);CHKERRCUSP(stat);

1007:       // destroy the matrix and the container
1008:       if (cusparsestructUp->mat!=0)
1009:         delete cusparsestructUp->mat;
1010:       if (cusparsestructUp->tempvecGPU!=0)
1011:         delete cusparsestructUp->tempvecGPU;
1012:       delete cusparsestructUp;
1013:     } catch(char* ex) {
1014:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1015:     }
1016:   }

1018:   /*this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */
1019:   A->spptr = 0;

1021:   MatDestroy_SeqAIJ(A);
1022:   return(0);
1023: }

1025: #else // if PETSC_HAVE_TXPETSCGPU is 0

1029: PetscErrorCode MatDestroy_SeqAIJCUSP(Mat A)
1030: {
1032:   Mat_SeqAIJCUSP *cuspcontainer = (Mat_SeqAIJCUSP*)A->spptr;

1035:   try {
1036:     if (A->valid_GPU_matrix != PETSC_CUSP_UNALLOCATED){
1037:       delete (CUSPMATRIX *)(cuspcontainer->mat);
1038:     }
1039:     delete cuspcontainer;
1040:     A->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;
1041:   } catch(char* ex) {
1042:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1043:   }
1044:   /*this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */
1045:   A->spptr = 0;
1046:   MatDestroy_SeqAIJ(A);
1047:   return(0);
1048: }

1050: #endif // PETSC_HAVE_TXPETSCGPU
1051: /*
1054: PetscErrorCode MatCreateSeqAIJCUSPFromTriple(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt* i, PetscInt* j, PetscScalar*a, Mat *mat, PetscInt nz, PetscBool idx)
1055: {
1056:   CUSPMATRIX *gpucsr;

1060:   if (idx){
1061:   }
1062:   return(0);
1063: }*/

1065: extern PetscErrorCode MatSetValuesBatch_SeqAIJCUSP(Mat, PetscInt, PetscInt, PetscInt *,const PetscScalar*);

1067: #ifdef PETSC_HAVE_TXPETSCGPU

1069: EXTERN_C_BEGIN
1072: PetscErrorCode  MatCreate_SeqAIJCUSP(Mat B)
1073: {

1077:   MatCreate_SeqAIJ(B);
1078:   B->ops->mult    = MatMult_SeqAIJCUSP;
1079:   B->ops->multadd = MatMultAdd_SeqAIJCUSP;

1081:   if (B->factortype==MAT_FACTOR_NONE) {
1082:     /* you cannot check the inode.use flag here since the matrix was just created.*/
1083:     B->spptr        = new Mat_SeqAIJGPU;
1084:     ((Mat_SeqAIJGPU *)B->spptr)->mat = 0;
1085:     ((Mat_SeqAIJGPU *)B->spptr)->tempvec = 0;
1086:   } else {
1087:     Mat_SeqAIJTriFactors *triFactors;
1088:     /* NEXT, set the pointers to the triangular factors */
1089:     B->spptr = new Mat_SeqAIJTriFactors;
1090:     triFactors  = (Mat_SeqAIJTriFactors*)B->spptr;
1091:     triFactors->loTriFactorPtr = 0;
1092:     triFactors->upTriFactorPtr = 0;
1093: 
1094:     triFactors->loTriFactorPtr        = new Mat_SeqAIJCUSPARSETriFactor;
1095:     ((Mat_SeqAIJCUSPARSETriFactor *)triFactors->loTriFactorPtr)->mat = 0;
1096:     ((Mat_SeqAIJCUSPARSETriFactor *)triFactors->loTriFactorPtr)->Mat_CUSPARSE_description = 0;
1097:     ((Mat_SeqAIJCUSPARSETriFactor *)triFactors->loTriFactorPtr)->Mat_CUSPARSE_solveAnalysisInfo = 0;
1098:     ((Mat_SeqAIJCUSPARSETriFactor *)triFactors->loTriFactorPtr)->tempvecGPU = 0;
1099: 
1100:     triFactors->upTriFactorPtr        = new Mat_SeqAIJCUSPARSETriFactor;
1101:     ((Mat_SeqAIJCUSPARSETriFactor *)triFactors->upTriFactorPtr)->mat = 0;
1102:     ((Mat_SeqAIJCUSPARSETriFactor *)triFactors->upTriFactorPtr)->Mat_CUSPARSE_description = 0;
1103:     ((Mat_SeqAIJCUSPARSETriFactor *)triFactors->upTriFactorPtr)->Mat_CUSPARSE_solveAnalysisInfo = 0;
1104:     ((Mat_SeqAIJCUSPARSETriFactor *)triFactors->upTriFactorPtr)->tempvecGPU = 0;
1105:   }

1107:   B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSP;
1108:   B->ops->destroy        = MatDestroy_SeqAIJCUSP;
1109:   B->ops->getvecs        = MatGetVecs_SeqAIJCUSP;
1110:   B->ops->setvaluesbatch = MatSetValuesBatch_SeqAIJCUSP;
1111:   PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSP);
1112:   B->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;

1114:   PetscObjectComposeFunctionDynamic((PetscObject)B,"MatGetFactor_petsc_C","MatGetFactor_seqaij_petsccusp",MatGetFactor_seqaij_petsccusp);
1115:   return(0);
1116: }
1117: EXTERN_C_END

1119: #else // if PETSC_HAVE_TXPETSCGPU is 0

1121: EXTERN_C_BEGIN
1124: PetscErrorCode  MatCreate_SeqAIJCUSP(Mat B)
1125: {
1127:   Mat_SeqAIJ     *aij;

1130:   MatCreate_SeqAIJ(B);
1131:   aij             = (Mat_SeqAIJ*)B->data;
1132:   aij->inode.use  = PETSC_FALSE;
1133:   B->ops->mult    = MatMult_SeqAIJCUSP;
1134:   B->ops->multadd = MatMultAdd_SeqAIJCUSP;
1135:   B->spptr        = new Mat_SeqAIJCUSP;
1136:   ((Mat_SeqAIJCUSP *)B->spptr)->mat = 0;
1137:   ((Mat_SeqAIJCUSP *)B->spptr)->tempvec = 0;
1138:   ((Mat_SeqAIJCUSP *)B->spptr)->indices = 0;
1139: 
1140:   B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSP;
1141:   B->ops->destroy        = MatDestroy_SeqAIJCUSP;
1142:   B->ops->getvecs        = MatGetVecs_SeqAIJCUSP;
1143:   B->ops->setvaluesbatch = MatSetValuesBatch_SeqAIJCUSP;
1144:   PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSP);
1145:   B->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;
1146:   return(0);
1147: }
1148: EXTERN_C_END

1150: #endif // PETSC_HAVE_TXPETSCGPU