/*
  Defines basic operations for the MATSEQAIJMKL matrix class.
  This class is derived from the MATSEQAIJ class and retains the
  compressed row storage (aka Yale sparse matrix format) but uses
  sparse BLAS operations from the Intel Math Kernel Library (MKL)
  wherever possible.
*/
#include <../src/mat/impls/aij/seq/aij.h>
#include <../src/mat/impls/aij/seq/aijmkl/aijmkl.h>

/* MKL include files. */
#include <mkl_spblas.h>  /* Sparse BLAS */
typedef struct {
  PetscBool           no_SpMV2;         /* If PETSC_TRUE, then don't use the MKL SpMV2 inspector-executor routines. */
  PetscBool           sparse_optimized; /* If PETSC_TRUE, then mkl_sparse_optimize() has been called. */
#ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
  sparse_matrix_t     csrA;             /* "Handle" used by SpMV2 inspector-executor routines. */
  struct matrix_descr descr;
#endif
} Mat_SeqAIJMKL;
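/* Example (illustrative): the per-matrix MKL state above hangs off the generic
 * 'spptr' slot of the Mat object, so the routines below recover it with a cast:
 *
 *   Mat_SeqAIJMKL *aijmkl = (Mat_SeqAIJMKL*)A->spptr;
 *   if (!aijmkl->sparse_optimized) MatSeqAIJMKL_create_mkl_handle(A);
 */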
extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJMKL_SeqAIJ(Mat A,MatType type,MatReuse reuse,Mat *newmat)
{
  /* This routine is only called to convert a MATAIJMKL to its base PETSc type,
   * so we will ignore 'MatType type'. */
  Mat B = *newmat;
#ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
  Mat_SeqAIJMKL *aijmkl = (Mat_SeqAIJMKL*)A->spptr;
#endif

  if (reuse == MAT_INITIAL_MATRIX) {
    MatDuplicate(A,MAT_COPY_VALUES,&B);
  }

  /* Reset the original function pointers. */
  B->ops->duplicate        = MatDuplicate_SeqAIJ;
  B->ops->assemblyend      = MatAssemblyEnd_SeqAIJ;
  B->ops->destroy          = MatDestroy_SeqAIJ;
  B->ops->mult             = MatMult_SeqAIJ;
  B->ops->multtranspose    = MatMultTranspose_SeqAIJ;
  B->ops->multadd          = MatMultAdd_SeqAIJ;
  B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ;
  B->ops->scale            = MatScale_SeqAIJ;
  B->ops->diagonalscale    = MatDiagonalScale_SeqAIJ;
  B->ops->diagonalset      = MatDiagonalSet_SeqAIJ;
  B->ops->axpy             = MatAXPY_SeqAIJ;

  PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijmkl_seqaij_C",NULL);
  PetscObjectComposeFunction((PetscObject)B,"MatMatMult_seqdense_seqaijmkl_C",NULL);
  PetscObjectComposeFunction((PetscObject)B,"MatMatMultSymbolic_seqdense_seqaijmkl_C",NULL);
  PetscObjectComposeFunction((PetscObject)B,"MatMatMultNumeric_seqdense_seqaijmkl_C",NULL);

  /* Free everything in the Mat_SeqAIJMKL data structure. Currently, this
   * simply involves destroying the MKL sparse matrix handle and then freeing
   * the spptr pointer. */
#ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
  if (reuse == MAT_INITIAL_MATRIX) aijmkl = (Mat_SeqAIJMKL*)B->spptr;

  if (aijmkl->sparse_optimized) {
    sparse_status_t stat;
    stat = mkl_sparse_destroy(aijmkl->csrA);
    if (stat != SPARSE_STATUS_SUCCESS) {
      PetscFunctionReturn(PETSC_ERR_LIB);
    }
  }
#endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
  PetscFree(B->spptr);

  /* Change the type of B to MATSEQAIJ. */
  PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJ);

  *newmat = B;
  return(0);
}
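/* Example (illustrative): application code normally reaches the routine above
 * through the generic MatConvert() interface rather than calling it directly:
 *
 *   Mat B;                                          // a MATSEQAIJMKL matrix
 *   MatConvert(B,MATSEQAIJ,MAT_INPLACE_MATRIX,&B);  // B is now plain MATSEQAIJ
 */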
PetscErrorCode MatDestroy_SeqAIJMKL(Mat A)
{
  Mat_SeqAIJMKL *aijmkl = (Mat_SeqAIJMKL*)A->spptr;

  /* If MatHeaderMerge() was used, then this SeqAIJMKL matrix will not have an
   * spptr pointer. */
  if (aijmkl) {
    /* Clean up everything in the Mat_SeqAIJMKL data structure, then free A->spptr. */
#ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
    if (aijmkl->sparse_optimized) {
      sparse_status_t stat = SPARSE_STATUS_SUCCESS;
      stat = mkl_sparse_destroy(aijmkl->csrA);
      if (stat != SPARSE_STATUS_SUCCESS) {
        PetscFunctionReturn(PETSC_ERR_LIB);
      }
    }
#endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
    PetscFree(A->spptr);
  }

  /* Change the type of A back to SEQAIJ and use MatDestroy_SeqAIJ()
   * to destroy everything that remains. */
  PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJ);
  /* Note that we do not call MatSetType() here. As far as we know, that routine
   * is only meant to be called when *building* a matrix; this matches how the
   * SuperLU matrix class handles destruction. */
  MatDestroy_SeqAIJ(A);
  return(0);
}
PETSC_INTERN PetscErrorCode MatSeqAIJMKL_create_mkl_handle(Mat A)
{
#ifndef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
  /* If the MKL library does not have mkl_sparse_optimize(), then this routine
   * does nothing. We make it callable anyway in this case because it cuts
   * down on littering the code with #ifdefs. */
  return(0);
#else
  Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJMKL   *aijmkl = (Mat_SeqAIJMKL*)A->spptr;
  PetscInt        m,n;
  MatScalar       *aa;
  PetscInt        *aj,*ai;
  sparse_status_t stat;

  if (aijmkl->no_SpMV2) return(0);

  if (aijmkl->sparse_optimized) {
    /* Matrix has been previously assembled and optimized. Must destroy old
     * matrix handle before running the optimization step again. */
    stat = mkl_sparse_destroy(aijmkl->csrA);
    if (stat != SPARSE_STATUS_SUCCESS) {
      PetscFunctionReturn(PETSC_ERR_LIB);
    }
  }
  aijmkl->sparse_optimized = PETSC_FALSE;

  /* Now perform the SpMV2 setup and matrix optimization. */
  aijmkl->descr.type = SPARSE_MATRIX_TYPE_GENERAL;
  aijmkl->descr.mode = SPARSE_FILL_MODE_LOWER;
  aijmkl->descr.diag = SPARSE_DIAG_NON_UNIT;
  m  = A->rmap->n;
  n  = A->cmap->n;
  aj = a->j;  /* aj[k] gives column index for element aa[k]. */
  aa = a->a;  /* Nonzero elements stored row-by-row. */
  ai = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
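  /* Worked example (illustrative) of the three-array CSR layout described above,
   * for the 3x3 matrix [1 0 2; 0 3 0; 4 5 6] with 0-based indexing:
   *   aa = {1,2,3,4,5,6}   nonzero values, stored row by row
   *   aj = {0,2,1,0,1,2}   column index of each entry of aa
   *   ai = {0,2,3,6}       row k occupies positions ai[k] through ai[k+1]-1
   */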
  if ((a->nz != 0) && !A->structure_only) {
    /* Create a new, optimized sparse matrix handle only if the matrix has nonzero entries.
     * The MKL sparse inspector-executor routines don't like being passed an empty matrix.
     * Check the status after each step so that a failed handle is never passed on. */
    stat = mkl_sparse_x_create_csr(&aijmkl->csrA,SPARSE_INDEX_BASE_ZERO,m,n,ai,ai+1,aj,aa);
    if (stat == SPARSE_STATUS_SUCCESS) stat = mkl_sparse_set_mv_hint(aijmkl->csrA,SPARSE_OPERATION_NON_TRANSPOSE,aijmkl->descr,1000);
    if (stat == SPARSE_STATUS_SUCCESS) stat = mkl_sparse_set_memory_hint(aijmkl->csrA,SPARSE_MEMORY_AGGRESSIVE);
    if (stat == SPARSE_STATUS_SUCCESS) stat = mkl_sparse_optimize(aijmkl->csrA);
    if (stat != SPARSE_STATUS_SUCCESS) {
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to create matrix handle/complete mkl_sparse_optimize");
    }
    aijmkl->sparse_optimized = PETSC_TRUE;
  }

  return(0);
#endif
}
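/* Standalone sketch (illustrative, assuming double precision and an MKL with
 * the inspector-executor API) of the life cycle managed above.
 * mkl_sparse_x_create_csr and mkl_sparse_x_mv are PETSc-side names assumed to
 * resolve to the precision-appropriate MKL routines (mkl_sparse_d_create_csr
 * and mkl_sparse_d_mv for double):
 *
 *   #include <mkl_spblas.h>
 *
 *   MKL_INT ai[] = {0,2,3};                // row pointers for a 2x3 matrix
 *   MKL_INT aj[] = {0,2,1};                // column indices
 *   double  aa[] = {1.0,2.0,3.0};          // values of [1 0 2; 0 3 0]
 *   double  x[3] = {1.0,1.0,1.0}, y[2];
 *   sparse_matrix_t     csrA;
 *   struct matrix_descr descr = {SPARSE_MATRIX_TYPE_GENERAL};
 *
 *   mkl_sparse_d_create_csr(&csrA,SPARSE_INDEX_BASE_ZERO,2,3,ai,ai+1,aj,aa);
 *   mkl_sparse_set_mv_hint(csrA,SPARSE_OPERATION_NON_TRANSPOSE,descr,1000);
 *   mkl_sparse_set_memory_hint(csrA,SPARSE_MEMORY_AGGRESSIVE);
 *   mkl_sparse_optimize(csrA);             // inspector step
 *   mkl_sparse_d_mv(SPARSE_OPERATION_NON_TRANSPOSE,1.0,csrA,descr,x,0.0,y);
 *   mkl_sparse_destroy(csrA);              // y is now {3.0,3.0}
 */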
PetscErrorCode MatDuplicate_SeqAIJMKL(Mat A, MatDuplicateOption op, Mat *M)
{
  Mat_SeqAIJMKL *aijmkl;
  Mat_SeqAIJMKL *aijmkl_dest;

  MatDuplicate_SeqAIJ(A,op,M);
  aijmkl      = (Mat_SeqAIJMKL*)A->spptr;
  aijmkl_dest = (Mat_SeqAIJMKL*)(*M)->spptr;
  PetscMemcpy(aijmkl_dest,aijmkl,sizeof(Mat_SeqAIJMKL));
  aijmkl_dest->sparse_optimized = PETSC_FALSE;
  /* Build the MKL handle for the duplicate: the memcpy above copied A's handle
   * pointer, which the duplicate must not share. */
  MatSeqAIJMKL_create_mkl_handle(*M);
  return(0);
}
PetscErrorCode MatAssemblyEnd_SeqAIJMKL(Mat A, MatAssemblyType mode)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;

  if (mode == MAT_FLUSH_ASSEMBLY) return(0);

  /* Since a MATSEQAIJMKL matrix is really just a MATSEQAIJ with some
   * extra information and some different methods, call the AssemblyEnd
   * routine for a MATSEQAIJ. This may not be the best way to do this,
   * but it avoids a lot of code duplication. */
  a->inode.use = PETSC_FALSE;  /* Must disable: otherwise the MKL routines won't get used. */
  MatAssemblyEnd_SeqAIJ(A, mode);

  /* Now create the MKL sparse handle (if needed; the function checks). */
  MatSeqAIJMKL_create_mkl_handle(A);

  return(0);
}
PetscErrorCode MatMult_SeqAIJMKL(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
  const PetscScalar *x;
  PetscScalar       *y;
  const MatScalar   *aa;
  PetscInt          m = A->rmap->n;
  PetscInt          n = A->cmap->n;
  PetscScalar       alpha = 1.0;
  PetscScalar       beta  = 0.0;
  const PetscInt    *aj,*ai;
  char              matdescra[6];

  /* Variables not in MatMult_SeqAIJ. */
  char transa = 'n';  /* Indicates to MKL that we are not computing the transpose product. */

  matdescra[0] = 'g';  /* Indicates to MKL that we are using a general CSR matrix. */
  matdescra[3] = 'c';  /* Indicates to MKL that we use C-style (0-based) indexing. */
  VecGetArrayRead(xx,&x);
  VecGetArray(yy,&y);
  aj = a->j;  /* aj[k] gives column index for element aa[k]. */
  aa = a->a;  /* Nonzero elements stored row-by-row. */
  ai = a->i;  /* ai[k] is the position in aa and aj where row k starts. */

  /* Call MKL sparse BLAS routine to do the MatMult. */
  mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,y);

  PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);
  VecRestoreArrayRead(xx,&x);
  VecRestoreArray(yy,&y);
  return(0);
}
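/* Standalone sketch (illustrative, assuming double precision) of the classic
 * interface used above; mkl_xcsrmv is a PETSc-side name assumed to resolve to
 * the precision-appropriate routine (mkl_dcsrmv for double), computing
 * y = alpha*A*x + beta*y:
 *
 *   #include <mkl_spblas.h>
 *
 *   MKL_INT m = 2, n = 3;
 *   MKL_INT ai[] = {0,2,3}, aj[] = {0,2,1};       // CSR of [1 0 2; 0 3 0]
 *   double  aa[] = {1.0,2.0,3.0};
 *   double  x[3] = {1.0,1.0,1.0}, y[2] = {0.0,0.0};
 *   double  alpha = 1.0, beta = 0.0;
 *   char    transa = 'n', matdescra[6] = {'g',' ',' ','c'};
 *
 *   mkl_dcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,y);
 *   // y is now {3.0,3.0}
 */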
#ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
PetscErrorCode MatMult_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJMKL     *aijmkl = (Mat_SeqAIJMKL*)A->spptr;
  const PetscScalar *x;
  PetscScalar       *y;
  sparse_status_t   stat = SPARSE_STATUS_SUCCESS;

  /* If there are no nonzero entries, zero yy and return immediately. */
  if (!a->nz) {
    PetscInt i;
    PetscInt m = A->rmap->n;
    VecGetArray(yy,&y);
    for (i=0; i<m; i++) {
      y[i] = 0.0;
    }
    VecRestoreArray(yy,&y);
    return(0);
  }

  VecGetArrayRead(xx,&x);
  VecGetArray(yy,&y);

  /* In some cases, we get to this point without mkl_sparse_optimize() having been called, so we check and then call
   * it if needed. Eventually, when everything in PETSc is properly updating the matrix state, we should probably
   * take a "lazy" approach to creation/updating of the MKL matrix handle and plan to always do it here (when needed). */
  if (!aijmkl->sparse_optimized) {
    MatSeqAIJMKL_create_mkl_handle(A);
  }

  /* Call MKL SpMV2 executor routine to do the MatMult. */
  stat = mkl_sparse_x_mv(SPARSE_OPERATION_NON_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,y);

  PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);
  VecRestoreArrayRead(xx,&x);
  VecRestoreArray(yy,&y);
  if (stat != SPARSE_STATUS_SUCCESS) {
    PetscFunctionReturn(PETSC_ERR_LIB);
  }
  return(0);
}
#endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
PetscErrorCode MatMultTranspose_SeqAIJMKL(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
  const PetscScalar *x;
  PetscScalar       *y;
  const MatScalar   *aa;
  PetscInt          m = A->rmap->n;
  PetscInt          n = A->cmap->n;
  PetscScalar       alpha = 1.0;
  PetscScalar       beta  = 0.0;
  const PetscInt    *aj,*ai;
  char              matdescra[6];

  /* Variables not in MatMultTranspose_SeqAIJ. */
  char transa = 't';  /* Indicates to MKL that we are computing the transpose product. */

  matdescra[0] = 'g';  /* Indicates to MKL that we are using a general CSR matrix. */
  matdescra[3] = 'c';  /* Indicates to MKL that we use C-style (0-based) indexing. */
  VecGetArrayRead(xx,&x);
  VecGetArray(yy,&y);
  aj = a->j;  /* aj[k] gives column index for element aa[k]. */
  aa = a->a;  /* Nonzero elements stored row-by-row. */
  ai = a->i;  /* ai[k] is the position in aa and aj where row k starts. */

  /* Call MKL sparse BLAS routine to do the MatMultTranspose. */
  mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,y);

  PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);
  VecRestoreArrayRead(xx,&x);
  VecRestoreArray(yy,&y);
  return(0);
}
#ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
PetscErrorCode MatMultTranspose_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJMKL     *aijmkl = (Mat_SeqAIJMKL*)A->spptr;
  const PetscScalar *x;
  PetscScalar       *y;
  sparse_status_t   stat;

  /* If there are no nonzero entries, zero yy and return immediately. */
  if (!a->nz) {
    PetscInt i;
    PetscInt n = A->cmap->n;
    VecGetArray(yy,&y);
    for (i=0; i<n; i++) {
      y[i] = 0.0;
    }
    VecRestoreArray(yy,&y);
    return(0);
  }

  VecGetArrayRead(xx,&x);
  VecGetArray(yy,&y);

  /* In some cases, we get to this point without mkl_sparse_optimize() having been called, so we check and then call
   * it if needed. Eventually, when everything in PETSc is properly updating the matrix state, we should probably
   * take a "lazy" approach to creation/updating of the MKL matrix handle and plan to always do it here (when needed). */
  if (!aijmkl->sparse_optimized) {
    MatSeqAIJMKL_create_mkl_handle(A);
  }

  /* Call MKL SpMV2 executor routine to do the MatMultTranspose. */
  stat = mkl_sparse_x_mv(SPARSE_OPERATION_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,y);

  PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);
  VecRestoreArrayRead(xx,&x);
  VecRestoreArray(yy,&y);
  if (stat != SPARSE_STATUS_SUCCESS) {
    PetscFunctionReturn(PETSC_ERR_LIB);
  }
  return(0);
}
#endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
PetscErrorCode MatMultAdd_SeqAIJMKL(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
  const PetscScalar *x;
  PetscScalar       *y,*z;
  const MatScalar   *aa;
  PetscInt          m = A->rmap->n;
  PetscInt          n = A->cmap->n;
  const PetscInt    *aj,*ai;
  PetscInt          i;

  /* Variables not in MatMultAdd_SeqAIJ. */
  char        transa = 'n';  /* Indicates to MKL that we are not computing the transpose product. */
  PetscScalar alpha = 1.0;
  PetscScalar beta;
  char        matdescra[6];

  matdescra[0] = 'g';  /* Indicates to MKL that we are using a general CSR matrix. */
  matdescra[3] = 'c';  /* Indicates to MKL that we use C-style (0-based) indexing. */

  VecGetArrayRead(xx,&x);
  VecGetArrayPair(yy,zz,&y,&z);
  aj = a->j;  /* aj[k] gives column index for element aa[k]. */
  aa = a->a;  /* Nonzero elements stored row-by-row. */
  ai = a->i;  /* ai[k] is the position in aa and aj where row k starts. */

  /* Call MKL sparse BLAS routine to do the MatMultAdd. */
  if (zz == yy) {
    /* If zz and yy are the same vector, we can use MKL's mkl_xcsrmv(), which computes y = alpha*A*x + beta*y, with beta = 1. */
    beta = 1.0;
    mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,z);
  } else {
    /* zz and yy are different vectors, so call MKL's mkl_xcsrmv() with beta = 0, then add yy to the result;
     * MKL sparse BLAS does not have a MatMultAdd equivalent. */
    beta = 0.0;
    mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,z);
    for (i=0; i<m; i++) {
      z[i] += y[i];
    }
  }

  PetscLogFlops(2.0*a->nz);
  VecRestoreArrayRead(xx,&x);
  VecRestoreArrayPair(yy,zz,&y,&z);
  return(0);
}
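/* Example (illustrative): both aliasing cases above arise from ordinary use of
 * the public MatMultAdd() interface, which computes v3 = v2 + A*v1:
 *
 *   MatMultAdd(A,x,y,y);   // zz == yy: one mkl_xcsrmv() call with beta = 1
 *   MatMultAdd(A,x,y,z);   // zz != yy: beta = 0 product, then z[i] += y[i]
 */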
#ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
PetscErrorCode MatMultAdd_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJMKL     *aijmkl = (Mat_SeqAIJMKL*)A->spptr;
  const PetscScalar *x;
  PetscScalar       *y,*z;
  PetscInt          m = A->rmap->n;
  PetscInt          i;

  /* Variables not in MatMultAdd_SeqAIJ. */
  sparse_status_t stat = SPARSE_STATUS_SUCCESS;

  /* If there are no nonzero entries, set zz = yy and return immediately. */
  if (!a->nz) {
    VecGetArrayPair(yy,zz,&y,&z);
    for (i=0; i<m; i++) {
      z[i] = y[i];
    }
    VecRestoreArrayPair(yy,zz,&y,&z);
    return(0);
  }

  VecGetArrayRead(xx,&x);
  VecGetArrayPair(yy,zz,&y,&z);

  /* In some cases, we get to this point without mkl_sparse_optimize() having been called, so we check and then call
   * it if needed. Eventually, when everything in PETSc is properly updating the matrix state, we should probably
   * take a "lazy" approach to creation/updating of the MKL matrix handle and plan to always do it here (when needed). */
  if (!aijmkl->sparse_optimized) {
    MatSeqAIJMKL_create_mkl_handle(A);
  }

  /* Call MKL SpMV2 executor routine to do the MatMultAdd. */
  if (zz == yy) {
    /* If zz and yy are the same vector, we can use mkl_sparse_x_mv(), which computes y = alpha*A*x + beta*y,
     * with alpha and beta both set to 1.0. */
    stat = mkl_sparse_x_mv(SPARSE_OPERATION_NON_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,1.0,z);
  } else {
    /* zz and yy are different vectors, so we call mkl_sparse_x_mv() with alpha = 1.0 and beta = 0.0, and then
     * we add the contents of vector yy to the result; MKL sparse BLAS does not have a MatMultAdd equivalent. */
    stat = mkl_sparse_x_mv(SPARSE_OPERATION_NON_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,z);
    for (i=0; i<m; i++) {
      z[i] += y[i];
    }
  }

  PetscLogFlops(2.0*a->nz);
  VecRestoreArrayRead(xx,&x);
  VecRestoreArrayPair(yy,zz,&y,&z);
  if (stat != SPARSE_STATUS_SUCCESS) {
    PetscFunctionReturn(PETSC_ERR_LIB);
  }
  return(0);
}
#endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
PetscErrorCode MatMultTransposeAdd_SeqAIJMKL(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
  const PetscScalar *x;
  PetscScalar       *y,*z;
  const MatScalar   *aa;
  PetscInt          m = A->rmap->n;
  PetscInt          n = A->cmap->n;
  const PetscInt    *aj,*ai;
  PetscInt          i;

  /* Variables not in MatMultTransposeAdd_SeqAIJ. */
  char        transa = 't';  /* Indicates to MKL that we are computing the transpose product. */
  PetscScalar alpha = 1.0;
  PetscScalar beta;
  char        matdescra[6];

  matdescra[0] = 'g';  /* Indicates to MKL that we are using a general CSR matrix. */
  matdescra[3] = 'c';  /* Indicates to MKL that we use C-style (0-based) indexing. */

  VecGetArrayRead(xx,&x);
  VecGetArrayPair(yy,zz,&y,&z);
  aj = a->j;  /* aj[k] gives column index for element aa[k]. */
  aa = a->a;  /* Nonzero elements stored row-by-row. */
  ai = a->i;  /* ai[k] is the position in aa and aj where row k starts. */

  /* Call MKL sparse BLAS routine to do the MatMultTransposeAdd. */
  if (zz == yy) {
    /* If zz and yy are the same vector, we can use MKL's mkl_xcsrmv(), which computes y = alpha*A'*x + beta*y, with beta = 1. */
    beta = 1.0;
    mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,z);
  } else {
    /* zz and yy are different vectors, so call MKL's mkl_xcsrmv() with beta = 0, then add yy to the result;
     * MKL sparse BLAS does not have a MatMultAdd equivalent. */
    beta = 0.0;
    mkl_xcsrmv(&transa,&m,&n,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,z);
    for (i=0; i<n; i++) {
      z[i] += y[i];
    }
  }

  PetscLogFlops(2.0*a->nz);
  VecRestoreArrayRead(xx,&x);
  VecRestoreArrayPair(yy,zz,&y,&z);
  return(0);
}
#ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
PetscErrorCode MatMultTransposeAdd_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJMKL     *aijmkl = (Mat_SeqAIJMKL*)A->spptr;
  const PetscScalar *x;
  PetscScalar       *y,*z;
  PetscInt          n = A->cmap->n;
  PetscInt          i;

  /* Variables not in MatMultTransposeAdd_SeqAIJ. */
  sparse_status_t stat = SPARSE_STATUS_SUCCESS;

  /* If there are no nonzero entries, set zz = yy and return immediately. */
  if (!a->nz) {
    VecGetArrayPair(yy,zz,&y,&z);
    for (i=0; i<n; i++) {
      z[i] = y[i];
    }
    VecRestoreArrayPair(yy,zz,&y,&z);
    return(0);
  }

  VecGetArrayRead(xx,&x);
  VecGetArrayPair(yy,zz,&y,&z);

  /* In some cases, we get to this point without mkl_sparse_optimize() having been called, so we check and then call
   * it if needed. Eventually, when everything in PETSc is properly updating the matrix state, we should probably
   * take a "lazy" approach to creation/updating of the MKL matrix handle and plan to always do it here (when needed). */
  if (!aijmkl->sparse_optimized) {
    MatSeqAIJMKL_create_mkl_handle(A);
  }

  /* Call MKL SpMV2 executor routine to do the MatMultTransposeAdd. */
  if (zz == yy) {
    /* If zz and yy are the same vector, we can use mkl_sparse_x_mv(), which computes y = alpha*A'*x + beta*y,
     * with alpha and beta both set to 1.0. */
    stat = mkl_sparse_x_mv(SPARSE_OPERATION_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,1.0,z);
  } else {
    /* zz and yy are different vectors, so we call mkl_sparse_x_mv() with alpha = 1.0 and beta = 0.0, and then
     * we add the contents of vector yy to the result; MKL sparse BLAS does not have a MatMultAdd equivalent. */
    stat = mkl_sparse_x_mv(SPARSE_OPERATION_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,z);
    for (i=0; i<n; i++) {
      z[i] += y[i];
    }
  }

  PetscLogFlops(2.0*a->nz);
  VecRestoreArrayRead(xx,&x);
  VecRestoreArrayPair(yy,zz,&y,&z);
  if (stat != SPARSE_STATUS_SUCCESS) {
    PetscFunctionReturn(PETSC_ERR_LIB);
  }
  return(0);
}
#endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */
PetscErrorCode MatScale_SeqAIJMKL(Mat inA,PetscScalar alpha)
{
  MatScale_SeqAIJ(inA,alpha);
  MatSeqAIJMKL_create_mkl_handle(inA);
  return(0);
}

PetscErrorCode MatDiagonalScale_SeqAIJMKL(Mat A,Vec ll,Vec rr)
{
  MatDiagonalScale_SeqAIJ(A,ll,rr);
  MatSeqAIJMKL_create_mkl_handle(A);
  return(0);
}

PetscErrorCode MatDiagonalSet_SeqAIJMKL(Mat Y,Vec D,InsertMode is)
{
  MatDiagonalSet_SeqAIJ(Y,D,is);
  MatSeqAIJMKL_create_mkl_handle(Y);
  return(0);
}

PetscErrorCode MatAXPY_SeqAIJMKL(Mat Y,PetscScalar a,Mat X,MatStructure str)
{
  MatAXPY_SeqAIJ(Y,a,X,str);
  if (str == SAME_NONZERO_PATTERN) {
    /* MatAssemblyEnd() is not called if SAME_NONZERO_PATTERN, so we need to force an update of the MKL matrix handle. */
    MatSeqAIJMKL_create_mkl_handle(Y);
  }
  return(0);
}
/* MatConvert_SeqAIJ_SeqAIJMKL() converts a SeqAIJ matrix into a
 * SeqAIJMKL matrix. This routine is called by the MatCreate_SeqAIJMKL()
 * routine, but can also be used to convert an assembled SeqAIJ matrix
 * into a SeqAIJMKL one. */
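/* Example (illustrative): converting an existing assembled SEQAIJ matrix from
 * application code goes through the generic MatConvert() interface:
 *
 *   Mat A;                                             // assembled MATSEQAIJ
 *   MatConvert(A,MATSEQAIJMKL,MAT_INPLACE_MATRIX,&A);  // A is now MATSEQAIJMKL
 */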
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJMKL(Mat A,MatType type,MatReuse reuse,Mat *newmat)
{
  Mat           B = *newmat;
  Mat_SeqAIJMKL *aijmkl;
  PetscBool     set;
  PetscBool     sametype;

  if (reuse == MAT_INITIAL_MATRIX) {
    MatDuplicate(A,MAT_COPY_VALUES,&B);
  }

  PetscObjectTypeCompare((PetscObject)A,type,&sametype);
  if (sametype) return(0);

  PetscNewLog(B,&aijmkl);
  B->spptr = (void*)aijmkl;

  /* Set function pointers for methods that we inherit from AIJ but override.
   * We also parse some command line options below, since those determine some of the methods we point to. */
  B->ops->duplicate   = MatDuplicate_SeqAIJMKL;
  B->ops->assemblyend = MatAssemblyEnd_SeqAIJMKL;
  B->ops->destroy     = MatDestroy_SeqAIJMKL;

  aijmkl->sparse_optimized = PETSC_FALSE;
#ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
  aijmkl->no_SpMV2 = PETSC_FALSE;  /* Default to using the SpMV2 routines if our MKL supports them. */
#else
  aijmkl->no_SpMV2 = PETSC_TRUE;
#endif

  /* Parse command line options. */
  PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"AIJMKL Options","Mat");
  PetscOptionsBool("-mat_aijmkl_no_spmv2","Disable use of the SpMV2 inspector-executor routines","None",(PetscBool)aijmkl->no_SpMV2,(PetscBool*)&aijmkl->no_SpMV2,&set);
  PetscOptionsEnd();
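  /* Example (illustrative; program name hypothetical): the SpMV2 code paths
   * can thus be disabled at run time with
   *   ./app -mat_type seqaijmkl -mat_aijmkl_no_spmv2 */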
#ifndef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
  if (!aijmkl->no_SpMV2) {
    PetscInfo(B,"User requested use of MKL SpMV2 routines, but MKL version does not support mkl_sparse_optimize(); defaulting to non-SpMV2 routines.\n");
    aijmkl->no_SpMV2 = PETSC_TRUE;
  }
#endif

  if (!aijmkl->no_SpMV2) {
#ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE
    B->ops->mult             = MatMult_SeqAIJMKL_SpMV2;
    B->ops->multtranspose    = MatMultTranspose_SeqAIJMKL_SpMV2;
    B->ops->multadd          = MatMultAdd_SeqAIJMKL_SpMV2;
    B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJMKL_SpMV2;
#endif
  } else {
    B->ops->mult             = MatMult_SeqAIJMKL;
    B->ops->multtranspose    = MatMultTranspose_SeqAIJMKL;
    B->ops->multadd          = MatMultAdd_SeqAIJMKL;
    B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJMKL;
  }

  B->ops->scale         = MatScale_SeqAIJMKL;
  B->ops->diagonalscale = MatDiagonalScale_SeqAIJMKL;
  B->ops->diagonalset   = MatDiagonalSet_SeqAIJMKL;
  B->ops->axpy          = MatAXPY_SeqAIJMKL;

  PetscObjectComposeFunction((PetscObject)B,"MatScale_SeqAIJMKL_C",MatScale_SeqAIJMKL);
  PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijmkl_seqaij_C",MatConvert_SeqAIJMKL_SeqAIJ);
  PetscObjectComposeFunction((PetscObject)B,"MatMatMult_seqdense_seqaijmkl_C",MatMatMult_SeqDense_SeqAIJ);
  PetscObjectComposeFunction((PetscObject)B,"MatMatMultSymbolic_seqdense_seqaijmkl_C",MatMatMultSymbolic_SeqDense_SeqAIJ);
  PetscObjectComposeFunction((PetscObject)B,"MatMatMultNumeric_seqdense_seqaijmkl_C",MatMatMultNumeric_SeqDense_SeqAIJ);

  PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJMKL);
  *newmat = B;
  return(0);
}
/*@C
   MatCreateSeqAIJMKL - Creates a sparse matrix of type SEQAIJMKL.
   This type inherits from AIJ and is largely identical, but uses sparse BLAS
   routines from Intel MKL whenever possible.
   MatMult, MatMultAdd, MatMultTranspose, and MatMultTransposeAdd
   operations are currently supported.
   If the installed version of MKL supports the "SpMV2" sparse
   inspector-executor routines, then those are used by default.

   Collective on MPI_Comm

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   Options Database Keys:
.  -mat_aijmkl_no_spmv2 - disables use of the SpMV2 inspector-executor routines

   Notes:
   If nnz is given then nz is ignored

   Level: intermediate

.keywords: matrix, MKL, sparse, parallel

.seealso: MatCreate(), MatCreateMPIAIJMKL(), MatSetValues()
@*/
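/* Example usage (illustrative):
 *
 *   Mat A;
 *   MatCreateSeqAIJMKL(PETSC_COMM_SELF,m,n,5,NULL,&A);
 *   // ...set entries with MatSetValues(), then MatAssemblyBegin()/MatAssemblyEnd()...
 */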
PetscErrorCode MatCreateSeqAIJMKL(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  MatCreate(comm,A);
  MatSetSizes(*A,m,n,m,n);
  MatSetType(*A,MATSEQAIJMKL);
  MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,nnz);
  return(0);
}
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJMKL(Mat A)
{
  MatSetType(A,MATSEQAIJ);
  MatConvert_SeqAIJ_SeqAIJMKL(A,MATSEQAIJMKL,MAT_INPLACE_MATRIX,&A);
  return(0);
}
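/* Example (illustrative): besides MatCreateSeqAIJMKL(), a matrix of this type
 * can be requested through the generic interface,
 *
 *   MatCreate(PETSC_COMM_SELF,&A);
 *   MatSetSizes(A,m,n,m,n);
 *   MatSetType(A,MATSEQAIJMKL);
 *
 * or selected from the options database with -mat_type seqaijmkl. */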