Actual source code: matmatmult.c


  2: /*
  3:   Defines matrix-matrix product routines for pairs of SeqAIJ matrices
  4:           C = A * B
  5: */

  7: #include <../src/mat/impls/aij/seq/aij.h>
  8: #include <../src/mat/utils/freespace.h>
  9: #include <petscbt.h>
 10: #include <petsc/private/isimpl.h>
 11: #include <../src/mat/impls/dense/seq/dense.h>

 13: PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
 14: {
 15:   if (C->ops->matmultnumeric) {
 17:     (*C->ops->matmultnumeric)(A,B,C);
 18:   } else {
 19:     MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(A,B,C);
 20:   }
 21:   return 0;
 22: }

 24: /* Modified from MatCreateSeqAIJWithArrays() */
 25: PETSC_INTERN PetscErrorCode MatSetSeqAIJWithArrays_private(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt i[],PetscInt j[],PetscScalar a[],MatType mtype,Mat mat)
 26: {
 27:   PetscInt       ii;
 28:   Mat_SeqAIJ     *aij;
 29:   PetscBool      isseqaij, osingle, ofree_a, ofree_ij;

 32:   MatSetSizes(mat,m,n,m,n);

 34:   if (!mtype) {
 35:     PetscObjectBaseTypeCompare((PetscObject)mat,MATSEQAIJ,&isseqaij);
 36:     if (!isseqaij) MatSetType(mat,MATSEQAIJ);
 37:   } else {
 38:     MatSetType(mat,mtype);
 39:   }

 41:   aij  = (Mat_SeqAIJ*)(mat)->data;
 42:   osingle = aij->singlemalloc;
 43:   ofree_a = aij->free_a;
 44:   ofree_ij = aij->free_ij;
 45:   /* changes the free flags */
 46:   MatSeqAIJSetPreallocation_SeqAIJ(mat,MAT_SKIP_ALLOCATION,NULL);

 48:   PetscFree(aij->ilen);
 49:   PetscFree(aij->imax);
 50:   PetscMalloc1(m,&aij->imax);
 51:   PetscMalloc1(m,&aij->ilen);
 52:   for (ii=0,aij->nonzerorowcnt=0,aij->rmax = 0; ii<m; ii++) {
 53:     const PetscInt rnz = i[ii+1] - i[ii];
 54:     aij->nonzerorowcnt += !!rnz;
 55:     aij->rmax = PetscMax(aij->rmax,rnz);
 56:     aij->ilen[ii] = aij->imax[ii] = i[ii+1] - i[ii];
 57:   }
 58:   aij->maxnz = i[m];
 59:   aij->nz = i[m];

 61:   if (osingle) {
 62:     PetscFree3(aij->a,aij->j,aij->i);
 63:   } else {
 64:     if (ofree_a)  PetscFree(aij->a);
 65:     if (ofree_ij) PetscFree(aij->j);
 66:     if (ofree_ij) PetscFree(aij->i);
 67:   }
 68:   aij->i            = i;
 69:   aij->j            = j;
 70:   aij->a            = a;
 71:   aij->nonew        = -1; /* this indicates that inserting a new value in the matrix that generates a new nonzero is an error */
 72:   /* default to not retain ownership */
 73:   aij->singlemalloc = PETSC_FALSE;
 74:   aij->free_a       = PETSC_FALSE;
 75:   aij->free_ij      = PETSC_FALSE;
 76:   MatCheckCompressedRow(mat,aij->nonzerorowcnt,&aij->compressedrow,aij->i,m,0.6);
 77:   return 0;
 78: }

 80: PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat C)
 81: {
 82:   Mat_Product         *product = C->product;
 83:   MatProductAlgorithm alg;
 84:   PetscBool           flg;

 86:   if (product) {
 87:     alg = product->alg;
 88:   } else {
 89:     alg = "sorted";
 90:   }
 91:   /* sorted */
 92:   PetscStrcmp(alg,"sorted",&flg);
 93:   if (flg) {
 94:     MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(A,B,fill,C);
 95:     return 0;
 96:   }

 98:   /* scalable */
 99:   PetscStrcmp(alg,"scalable",&flg);
100:   if (flg) {
101:     MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(A,B,fill,C);
102:     return 0;
103:   }

105:   /* scalable_fast */
106:   PetscStrcmp(alg,"scalable_fast",&flg);
107:   if (flg) {
108:     MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(A,B,fill,C);
109:     return 0;
110:   }

112:   /* heap */
113:   PetscStrcmp(alg,"heap",&flg);
114:   if (flg) {
115:     MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(A,B,fill,C);
116:     return 0;
117:   }

119:   /* btheap */
120:   PetscStrcmp(alg,"btheap",&flg);
121:   if (flg) {
122:     MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(A,B,fill,C);
123:     return 0;
124:   }

126:   /* llcondensed */
127:   PetscStrcmp(alg,"llcondensed",&flg);
128:   if (flg) {
129:     MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(A,B,fill,C);
130:     return 0;
131:   }

133:   /* rowmerge */
134:   PetscStrcmp(alg,"rowmerge",&flg);
135:   if (flg) {
136:     MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(A,B,fill,C);
137:     return 0;
138:   }

140: #if defined(PETSC_HAVE_HYPRE)
141:   PetscStrcmp(alg,"hypre",&flg);
142:   if (flg) {
143:     MatMatMultSymbolic_AIJ_AIJ_wHYPRE(A,B,fill,C);
144:     return 0;
145:   }
146: #endif

148:   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Mat Product Algorithm is not supported");
149: }

151: PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(Mat A,Mat B,PetscReal fill,Mat C)
152: {
153:   Mat_SeqAIJ         *a =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
154:   PetscInt           *ai=a->i,*bi=b->i,*ci,*cj;
155:   PetscInt           am =A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
156:   PetscReal          afill;
157:   PetscInt           i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax;
158:   PetscTable         ta;
159:   PetscBT            lnkbt;
160:   PetscFreeSpaceList free_space=NULL,current_space=NULL;

162:   /* Get ci and cj */
163:   /*---------------*/
164:   /* Allocate ci array, arrays for fill computation and */
165:   /* free space for accumulating nonzero column info */
166:   PetscMalloc1(am+2,&ci);
167:   ci[0] = 0;

169:   /* create and initialize a linked list */
170:   PetscTableCreate(bn,bn,&ta);
171:   MatRowMergeMax_SeqAIJ(b,bm,ta);
172:   PetscTableGetCount(ta,&Crmax);
173:   PetscTableDestroy(&ta);

175:   PetscLLCondensedCreate(Crmax,bn,&lnk,&lnkbt);

177:   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
178:   PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);

180:   current_space = free_space;

182:   /* Determine ci and cj */
183:   for (i=0; i<am; i++) {
184:     anzi = ai[i+1] - ai[i];
185:     aj   = a->j + ai[i];
186:     for (j=0; j<anzi; j++) {
187:       brow = aj[j];
188:       bnzj = bi[brow+1] - bi[brow];
189:       bj   = b->j + bi[brow];
190:       /* add non-zero cols of B into the sorted linked list lnk */
191:       PetscLLCondensedAddSorted(bnzj,bj,lnk,lnkbt);
192:     }
193:     /* add possible missing diagonal entry */
194:     if (C->force_diagonals) {
195:       PetscLLCondensedAddSorted(1,&i,lnk,lnkbt);
196:     }
197:     cnzi = lnk[0];

199:     /* If free space is not available, make more free space */
200:     /* Double the amount of total space in the list */
201:     if (current_space->local_remaining<cnzi) {
202:       PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),&current_space);
203:       ndouble++;
204:     }

206:     /* Copy data into free space, then initialize lnk */
207:     PetscLLCondensedClean(bn,cnzi,current_space->array,lnk,lnkbt);

209:     current_space->array           += cnzi;
210:     current_space->local_used      += cnzi;
211:     current_space->local_remaining -= cnzi;

213:     ci[i+1] = ci[i] + cnzi;
214:   }

216:   /* Column indices are in the list of free space */
217:   /* Allocate space for cj, initialize cj, and */
218:   /* destroy list of free space and other temporary array(s) */
219:   PetscMalloc1(ci[am]+1,&cj);
220:   PetscFreeSpaceContiguous(&free_space,cj);
221:   PetscLLCondensedDestroy(lnk,lnkbt);

223:   /* put together the new symbolic matrix */
224:   MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C);
225:   MatSetBlockSizesFromMats(C,A,B);

227:   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
228:   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
229:   c          = (Mat_SeqAIJ*)(C->data);
230:   c->free_a  = PETSC_FALSE;
231:   c->free_ij = PETSC_TRUE;
232:   c->nonew   = 0;

234:   /* fast, needs non-scalable O(bn) array 'abdense' */
235:   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;

237:   /* set MatInfo */
238:   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
239:   if (afill < 1.0) afill = 1.0;
240:   C->info.mallocs           = ndouble;
241:   C->info.fill_ratio_given  = fill;
242:   C->info.fill_ratio_needed = afill;

244: #if defined(PETSC_USE_INFO)
245:   if (ci[am]) {
246:     PetscInfo(C,"Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);
247:     PetscInfo(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);
248:   } else {
249:     PetscInfo(C,"Empty matrix product\n");
250:   }
251: #endif
252:   return 0;
253: }

255: PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(Mat A,Mat B,Mat C)
256: {
257:   PetscLogDouble    flops=0.0;
258:   Mat_SeqAIJ        *a   = (Mat_SeqAIJ*)A->data;
259:   Mat_SeqAIJ        *b   = (Mat_SeqAIJ*)B->data;
260:   Mat_SeqAIJ        *c   = (Mat_SeqAIJ*)C->data;
261:   PetscInt          *ai  =a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bjj,*ci=c->i,*cj=c->j;
262:   PetscInt          am   =A->rmap->n,cm=C->rmap->n;
263:   PetscInt          i,j,k,anzi,bnzi,cnzi,brow;
264:   PetscScalar       *ca,valtmp;
265:   PetscScalar       *ab_dense;
266:   PetscContainer    cab_dense;
267:   const PetscScalar *aa,*ba,*baj;

269:   MatSeqAIJGetArrayRead(A,&aa);
270:   MatSeqAIJGetArrayRead(B,&ba);
271:   if (!c->a) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */
272:     PetscMalloc1(ci[cm]+1,&ca);
273:     c->a      = ca;
274:     c->free_a = PETSC_TRUE;
275:   } else ca = c->a;

277:   /* TODO this should be done in the symbolic phase */
278:   /* However, this function is so heavily used (sometimes in an hidden way through multnumeric function pointers
279:      that is hard to eradicate) */
280:   PetscObjectQuery((PetscObject)C,"__PETSc__ab_dense",(PetscObject*)&cab_dense);
281:   if (!cab_dense) {
282:     PetscMalloc1(B->cmap->N,&ab_dense);
283:     PetscContainerCreate(PETSC_COMM_SELF,&cab_dense);
284:     PetscContainerSetPointer(cab_dense,ab_dense);
285:     PetscContainerSetUserDestroy(cab_dense,PetscContainerUserDestroyDefault);
286:     PetscObjectCompose((PetscObject)C,"__PETSc__ab_dense",(PetscObject)cab_dense);
287:     PetscObjectDereference((PetscObject)cab_dense);
288:   }
289:   PetscContainerGetPointer(cab_dense,(void**)&ab_dense);
290:   PetscArrayzero(ab_dense,B->cmap->N);

292:   /* clean old values in C */
293:   PetscArrayzero(ca,ci[cm]);
294:   /* Traverse A row-wise. */
295:   /* Build the ith row in C by summing over nonzero columns in A, */
296:   /* the rows of B corresponding to nonzeros of A. */
297:   for (i=0; i<am; i++) {
298:     anzi = ai[i+1] - ai[i];
299:     for (j=0; j<anzi; j++) {
300:       brow = aj[j];
301:       bnzi = bi[brow+1] - bi[brow];
302:       bjj  = bj + bi[brow];
303:       baj  = ba + bi[brow];
304:       /* perform dense axpy */
305:       valtmp = aa[j];
306:       for (k=0; k<bnzi; k++) {
307:         ab_dense[bjj[k]] += valtmp*baj[k];
308:       }
309:       flops += 2*bnzi;
310:     }
311:     aj += anzi; aa += anzi;

313:     cnzi = ci[i+1] - ci[i];
314:     for (k=0; k<cnzi; k++) {
315:       ca[k]          += ab_dense[cj[k]];
316:       ab_dense[cj[k]] = 0.0; /* zero ab_dense */
317:     }
318:     flops += cnzi;
319:     cj    += cnzi; ca += cnzi;
320:   }
321: #if defined(PETSC_HAVE_DEVICE)
322:   if (C->offloadmask != PETSC_OFFLOAD_UNALLOCATED) C->offloadmask = PETSC_OFFLOAD_CPU;
323: #endif
324:   MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
325:   MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
326:   PetscLogFlops(flops);
327:   MatSeqAIJRestoreArrayRead(A,&aa);
328:   MatSeqAIJRestoreArrayRead(B,&ba);
329:   return 0;
330: }

332: PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(Mat A,Mat B,Mat C)
333: {
334:   PetscLogDouble    flops=0.0;
335:   Mat_SeqAIJ        *a   = (Mat_SeqAIJ*)A->data;
336:   Mat_SeqAIJ        *b   = (Mat_SeqAIJ*)B->data;
337:   Mat_SeqAIJ        *c   = (Mat_SeqAIJ*)C->data;
338:   PetscInt          *ai  = a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bjj,*ci=c->i,*cj=c->j;
339:   PetscInt          am   = A->rmap->N,cm=C->rmap->N;
340:   PetscInt          i,j,k,anzi,bnzi,cnzi,brow;
341:   PetscScalar       *ca=c->a,valtmp;
342:   const PetscScalar *aa,*ba,*baj;
343:   PetscInt          nextb;

345:   MatSeqAIJGetArrayRead(A,&aa);
346:   MatSeqAIJGetArrayRead(B,&ba);
347:   if (!ca) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */
348:     PetscMalloc1(ci[cm]+1,&ca);
349:     c->a      = ca;
350:     c->free_a = PETSC_TRUE;
351:   }

353:   /* clean old values in C */
354:   PetscArrayzero(ca,ci[cm]);
355:   /* Traverse A row-wise. */
356:   /* Build the ith row in C by summing over nonzero columns in A, */
357:   /* the rows of B corresponding to nonzeros of A. */
358:   for (i=0; i<am; i++) {
359:     anzi = ai[i+1] - ai[i];
360:     cnzi = ci[i+1] - ci[i];
361:     for (j=0; j<anzi; j++) {
362:       brow = aj[j];
363:       bnzi = bi[brow+1] - bi[brow];
364:       bjj  = bj + bi[brow];
365:       baj  = ba + bi[brow];
366:       /* perform sparse axpy */
367:       valtmp = aa[j];
368:       nextb  = 0;
369:       for (k=0; nextb<bnzi; k++) {
370:         if (cj[k] == bjj[nextb]) { /* ccol == bcol */
371:           ca[k] += valtmp*baj[nextb++];
372:         }
373:       }
374:       flops += 2*bnzi;
375:     }
376:     aj += anzi; aa += anzi;
377:     cj += cnzi; ca += cnzi;
378:   }
379: #if defined(PETSC_HAVE_DEVICE)
380:   if (C->offloadmask != PETSC_OFFLOAD_UNALLOCATED) C->offloadmask = PETSC_OFFLOAD_CPU;
381: #endif
382:   MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
383:   MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
384:   PetscLogFlops(flops);
385:   MatSeqAIJRestoreArrayRead(A,&aa);
386:   MatSeqAIJRestoreArrayRead(B,&ba);
387:   return 0;
388: }

390: PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(Mat A,Mat B,PetscReal fill,Mat C)
391: {
392:   Mat_SeqAIJ         *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
393:   PetscInt           *ai = a->i,*bi=b->i,*ci,*cj;
394:   PetscInt           am  = A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
395:   MatScalar          *ca;
396:   PetscReal          afill;
397:   PetscInt           i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax;
398:   PetscTable         ta;
399:   PetscFreeSpaceList free_space=NULL,current_space=NULL;

401:   /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_fast() */
402:   /*-----------------------------------------------------------------------------------------*/
403:   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
404:   PetscMalloc1(am+2,&ci);
405:   ci[0] = 0;

407:   /* create and initialize a linked list */
408:   PetscTableCreate(bn,bn,&ta);
409:   MatRowMergeMax_SeqAIJ(b,bm,ta);
410:   PetscTableGetCount(ta,&Crmax);
411:   PetscTableDestroy(&ta);

413:   PetscLLCondensedCreate_fast(Crmax,&lnk);

415:   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
416:   PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);
417:   current_space = free_space;

419:   /* Determine ci and cj */
420:   for (i=0; i<am; i++) {
421:     anzi = ai[i+1] - ai[i];
422:     aj   = a->j + ai[i];
423:     for (j=0; j<anzi; j++) {
424:       brow = aj[j];
425:       bnzj = bi[brow+1] - bi[brow];
426:       bj   = b->j + bi[brow];
427:       /* add non-zero cols of B into the sorted linked list lnk */
428:       PetscLLCondensedAddSorted_fast(bnzj,bj,lnk);
429:     }
430:     /* add possible missing diagonal entry */
431:     if (C->force_diagonals) {
432:       PetscLLCondensedAddSorted_fast(1,&i,lnk);
433:     }
434:     cnzi = lnk[1];

436:     /* If free space is not available, make more free space */
437:     /* Double the amount of total space in the list */
438:     if (current_space->local_remaining<cnzi) {
439:       PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),&current_space);
440:       ndouble++;
441:     }

443:     /* Copy data into free space, then initialize lnk */
444:     PetscLLCondensedClean_fast(cnzi,current_space->array,lnk);

446:     current_space->array           += cnzi;
447:     current_space->local_used      += cnzi;
448:     current_space->local_remaining -= cnzi;

450:     ci[i+1] = ci[i] + cnzi;
451:   }

453:   /* Column indices are in the list of free space */
454:   /* Allocate space for cj, initialize cj, and */
455:   /* destroy list of free space and other temporary array(s) */
456:   PetscMalloc1(ci[am]+1,&cj);
457:   PetscFreeSpaceContiguous(&free_space,cj);
458:   PetscLLCondensedDestroy_fast(lnk);

460:   /* Allocate space for ca */
461:   PetscCalloc1(ci[am]+1,&ca);

463:   /* put together the new symbolic matrix */
464:   MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,ca,((PetscObject)A)->type_name,C);
465:   MatSetBlockSizesFromMats(C,A,B);

467:   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
468:   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
469:   c          = (Mat_SeqAIJ*)(C->data);
470:   c->free_a  = PETSC_TRUE;
471:   c->free_ij = PETSC_TRUE;
472:   c->nonew   = 0;

474:   /* slower, less memory */
475:   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable;

477:   /* set MatInfo */
478:   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
479:   if (afill < 1.0) afill = 1.0;
480:   C->info.mallocs           = ndouble;
481:   C->info.fill_ratio_given  = fill;
482:   C->info.fill_ratio_needed = afill;

484: #if defined(PETSC_USE_INFO)
485:   if (ci[am]) {
486:     PetscInfo(C,"Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);
487:     PetscInfo(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);
488:   } else {
489:     PetscInfo(C,"Empty matrix product\n");
490:   }
491: #endif
492:   return 0;
493: }

495: PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(Mat A,Mat B,PetscReal fill,Mat C)
496: {
497:   Mat_SeqAIJ         *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
498:   PetscInt           *ai = a->i,*bi=b->i,*ci,*cj;
499:   PetscInt           am  = A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
500:   MatScalar          *ca;
501:   PetscReal          afill;
502:   PetscInt           i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax;
503:   PetscTable         ta;
504:   PetscFreeSpaceList free_space=NULL,current_space=NULL;

506:   /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_Scalalbe() */
507:   /*---------------------------------------------------------------------------------------------*/
508:   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
509:   PetscMalloc1(am+2,&ci);
510:   ci[0] = 0;

512:   /* create and initialize a linked list */
513:   PetscTableCreate(bn,bn,&ta);
514:   MatRowMergeMax_SeqAIJ(b,bm,ta);
515:   PetscTableGetCount(ta,&Crmax);
516:   PetscTableDestroy(&ta);
517:   PetscLLCondensedCreate_Scalable(Crmax,&lnk);

519:   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
520:   PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);
521:   current_space = free_space;

523:   /* Determine ci and cj */
524:   for (i=0; i<am; i++) {
525:     anzi = ai[i+1] - ai[i];
526:     aj   = a->j + ai[i];
527:     for (j=0; j<anzi; j++) {
528:       brow = aj[j];
529:       bnzj = bi[brow+1] - bi[brow];
530:       bj   = b->j + bi[brow];
531:       /* add non-zero cols of B into the sorted linked list lnk */
532:       PetscLLCondensedAddSorted_Scalable(bnzj,bj,lnk);
533:     }
534:     /* add possible missing diagonal entry */
535:     if (C->force_diagonals) {
536:       PetscLLCondensedAddSorted_Scalable(1,&i,lnk);
537:     }

539:     cnzi = lnk[0];

541:     /* If free space is not available, make more free space */
542:     /* Double the amount of total space in the list */
543:     if (current_space->local_remaining<cnzi) {
544:       PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),&current_space);
545:       ndouble++;
546:     }

548:     /* Copy data into free space, then initialize lnk */
549:     PetscLLCondensedClean_Scalable(cnzi,current_space->array,lnk);

551:     current_space->array           += cnzi;
552:     current_space->local_used      += cnzi;
553:     current_space->local_remaining -= cnzi;

555:     ci[i+1] = ci[i] + cnzi;
556:   }

558:   /* Column indices are in the list of free space */
559:   /* Allocate space for cj, initialize cj, and */
560:   /* destroy list of free space and other temporary array(s) */
561:   PetscMalloc1(ci[am]+1,&cj);
562:   PetscFreeSpaceContiguous(&free_space,cj);
563:   PetscLLCondensedDestroy_Scalable(lnk);

565:   /* Allocate space for ca */
566:   /*-----------------------*/
567:   PetscCalloc1(ci[am]+1,&ca);

569:   /* put together the new symbolic matrix */
570:   MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,ca,((PetscObject)A)->type_name,C);
571:   MatSetBlockSizesFromMats(C,A,B);

573:   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
574:   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
575:   c          = (Mat_SeqAIJ*)(C->data);
576:   c->free_a  = PETSC_TRUE;
577:   c->free_ij = PETSC_TRUE;
578:   c->nonew   = 0;

580:   /* slower, less memory */
581:   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable;

583:   /* set MatInfo */
584:   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
585:   if (afill < 1.0) afill = 1.0;
586:   C->info.mallocs           = ndouble;
587:   C->info.fill_ratio_given  = fill;
588:   C->info.fill_ratio_needed = afill;

590: #if defined(PETSC_USE_INFO)
591:   if (ci[am]) {
592:     PetscInfo(C,"Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);
593:     PetscInfo(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);
594:   } else {
595:     PetscInfo(C,"Empty matrix product\n");
596:   }
597: #endif
598:   return 0;
599: }

601: PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(Mat A,Mat B,PetscReal fill,Mat C)
602: {
603:   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
604:   const PetscInt     *ai=a->i,*bi=b->i,*aj=a->j,*bj=b->j;
605:   PetscInt           *ci,*cj,*bb;
606:   PetscInt           am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
607:   PetscReal          afill;
608:   PetscInt           i,j,col,ndouble = 0;
609:   PetscFreeSpaceList free_space=NULL,current_space=NULL;
610:   PetscHeap          h;

612:   /* Get ci and cj - by merging sorted rows using a heap */
613:   /*---------------------------------------------------------------------------------------------*/
614:   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
615:   PetscMalloc1(am+2,&ci);
616:   ci[0] = 0;

618:   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
619:   PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);
620:   current_space = free_space;

622:   PetscHeapCreate(a->rmax,&h);
623:   PetscMalloc1(a->rmax,&bb);

625:   /* Determine ci and cj */
626:   for (i=0; i<am; i++) {
627:     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
628:     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
629:     ci[i+1] = ci[i];
630:     /* Populate the min heap */
631:     for (j=0; j<anzi; j++) {
632:       bb[j] = bi[acol[j]];         /* bb points at the start of the row */
633:       if (bb[j] < bi[acol[j]+1]) { /* Add if row is nonempty */
634:         PetscHeapAdd(h,j,bj[bb[j]++]);
635:       }
636:     }
637:     /* Pick off the min element, adding it to free space */
638:     PetscHeapPop(h,&j,&col);
639:     while (j >= 0) {
640:       if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */
641:         PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2,current_space->total_array_size),16 << 20),&current_space);
642:         ndouble++;
643:       }
644:       *(current_space->array++) = col;
645:       current_space->local_used++;
646:       current_space->local_remaining--;
647:       ci[i+1]++;

649:       /* stash if anything else remains in this row of B */
650:       if (bb[j] < bi[acol[j]+1]) PetscHeapStash(h,j,bj[bb[j]++]);
651:       while (1) {               /* pop and stash any other rows of B that also had an entry in this column */
652:         PetscInt j2,col2;
653:         PetscHeapPeek(h,&j2,&col2);
654:         if (col2 != col) break;
655:         PetscHeapPop(h,&j2,&col2);
656:         if (bb[j2] < bi[acol[j2]+1]) PetscHeapStash(h,j2,bj[bb[j2]++]);
657:       }
658:       /* Put any stashed elements back into the min heap */
659:       PetscHeapUnstash(h);
660:       PetscHeapPop(h,&j,&col);
661:     }
662:   }
663:   PetscFree(bb);
664:   PetscHeapDestroy(&h);

666:   /* Column indices are in the list of free space */
667:   /* Allocate space for cj, initialize cj, and */
668:   /* destroy list of free space and other temporary array(s) */
669:   PetscMalloc1(ci[am],&cj);
670:   PetscFreeSpaceContiguous(&free_space,cj);

672:   /* put together the new symbolic matrix */
673:   MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C);
674:   MatSetBlockSizesFromMats(C,A,B);

676:   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
677:   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
678:   c          = (Mat_SeqAIJ*)(C->data);
679:   c->free_a  = PETSC_TRUE;
680:   c->free_ij = PETSC_TRUE;
681:   c->nonew   = 0;

683:   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;

685:   /* set MatInfo */
686:   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
687:   if (afill < 1.0) afill = 1.0;
688:   C->info.mallocs           = ndouble;
689:   C->info.fill_ratio_given  = fill;
690:   C->info.fill_ratio_needed = afill;

692: #if defined(PETSC_USE_INFO)
693:   if (ci[am]) {
694:     PetscInfo(C,"Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);
695:     PetscInfo(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);
696:   } else {
697:     PetscInfo(C,"Empty matrix product\n");
698:   }
699: #endif
700:   return 0;
701: }

703: PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(Mat A,Mat B,PetscReal fill,Mat C)
704: {
705:   Mat_SeqAIJ         *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
706:   const PetscInt     *ai = a->i,*bi=b->i,*aj=a->j,*bj=b->j;
707:   PetscInt           *ci,*cj,*bb;
708:   PetscInt           am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
709:   PetscReal          afill;
710:   PetscInt           i,j,col,ndouble = 0;
711:   PetscFreeSpaceList free_space=NULL,current_space=NULL;
712:   PetscHeap          h;
713:   PetscBT            bt;

715:   /* Get ci and cj - using a heap for the sorted rows, but use BT so that each index is only added once */
716:   /*---------------------------------------------------------------------------------------------*/
717:   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
718:   PetscMalloc1(am+2,&ci);
719:   ci[0] = 0;

721:   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
722:   PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);

724:   current_space = free_space;

726:   PetscHeapCreate(a->rmax,&h);
727:   PetscMalloc1(a->rmax,&bb);
728:   PetscBTCreate(bn,&bt);

730:   /* Determine ci and cj */
731:   for (i=0; i<am; i++) {
732:     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
733:     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
734:     const PetscInt *fptr = current_space->array; /* Save beginning of the row so we can clear the BT later */
735:     ci[i+1] = ci[i];
736:     /* Populate the min heap */
737:     for (j=0; j<anzi; j++) {
738:       PetscInt brow = acol[j];
739:       for (bb[j] = bi[brow]; bb[j] < bi[brow+1]; bb[j]++) {
740:         PetscInt bcol = bj[bb[j]];
741:         if (!PetscBTLookupSet(bt,bcol)) { /* new entry */
742:           PetscHeapAdd(h,j,bcol);
743:           bb[j]++;
744:           break;
745:         }
746:       }
747:     }
748:     /* Pick off the min element, adding it to free space */
749:     PetscHeapPop(h,&j,&col);
750:     while (j >= 0) {
751:       if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */
752:         fptr = NULL;                      /* need PetscBTMemzero */
753:         PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2,current_space->total_array_size),16 << 20),&current_space);
754:         ndouble++;
755:       }
756:       *(current_space->array++) = col;
757:       current_space->local_used++;
758:       current_space->local_remaining--;
759:       ci[i+1]++;

761:       /* stash if anything else remains in this row of B */
762:       for (; bb[j] < bi[acol[j]+1]; bb[j]++) {
763:         PetscInt bcol = bj[bb[j]];
764:         if (!PetscBTLookupSet(bt,bcol)) { /* new entry */
765:           PetscHeapAdd(h,j,bcol);
766:           bb[j]++;
767:           break;
768:         }
769:       }
770:       PetscHeapPop(h,&j,&col);
771:     }
772:     if (fptr) {                 /* Clear the bits for this row */
773:       for (; fptr<current_space->array; fptr++) PetscBTClear(bt,*fptr);
774:     } else {                    /* We reallocated so we don't remember (easily) how to clear only the bits we changed */
775:       PetscBTMemzero(bn,bt);
776:     }
777:   }
778:   PetscFree(bb);
779:   PetscHeapDestroy(&h);
780:   PetscBTDestroy(&bt);

782:   /* Column indices are in the list of free space */
783:   /* Allocate space for cj, initialize cj, and */
784:   /* destroy list of free space and other temporary array(s) */
785:   PetscMalloc1(ci[am],&cj);
786:   PetscFreeSpaceContiguous(&free_space,cj);

788:   /* put together the new symbolic matrix */
789:   MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C);
790:   MatSetBlockSizesFromMats(C,A,B);

792:   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
793:   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
794:   c          = (Mat_SeqAIJ*)(C->data);
795:   c->free_a  = PETSC_TRUE;
796:   c->free_ij = PETSC_TRUE;
797:   c->nonew   = 0;

799:   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;

801:   /* set MatInfo */
802:   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
803:   if (afill < 1.0) afill = 1.0;
804:   C->info.mallocs           = ndouble;
805:   C->info.fill_ratio_given  = fill;
806:   C->info.fill_ratio_needed = afill;

808: #if defined(PETSC_USE_INFO)
809:   if (ci[am]) {
810:     PetscInfo(C,"Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);
811:     PetscInfo(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);
812:   } else {
813:     PetscInfo(C,"Empty matrix product\n");
814:   }
815: #endif
816:   return 0;
817: }

819: PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(Mat A,Mat B,PetscReal fill,Mat C)
820: {
821:   Mat_SeqAIJ         *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
822:   const PetscInt     *ai=a->i,*bi=b->i,*aj=a->j,*bj=b->j,*inputi,*inputj,*inputcol,*inputcol_L1;
823:   PetscInt           *ci,*cj,*outputj,worki_L1[9],worki_L2[9];
824:   PetscInt           c_maxmem,a_maxrownnz=0,a_rownnz;
825:   const PetscInt     workcol[8]={0,1,2,3,4,5,6,7};
826:   const PetscInt     am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
827:   const PetscInt     *brow_ptr[8],*brow_end[8];
828:   PetscInt           window[8];
829:   PetscInt           window_min,old_window_min,ci_nnz,outputi_nnz=0,L1_nrows,L2_nrows;
830:   PetscInt           i,k,ndouble=0,L1_rowsleft,rowsleft;
831:   PetscReal          afill;
832:   PetscInt           *workj_L1,*workj_L2,*workj_L3;
833:   PetscInt           L1_nnz,L2_nnz;

835:   /* Step 1: Get upper bound on memory required for allocation.
836:              Because of the way virtual memory works,
837:              only the memory pages that are actually needed will be physically allocated. */
838:   PetscMalloc1(am+1,&ci);
839:   for (i=0; i<am; i++) {
840:     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
841:     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
842:     a_rownnz = 0;
843:     for (k=0; k<anzi; ++k) {
844:       a_rownnz += bi[acol[k]+1] - bi[acol[k]];
845:       if (a_rownnz > bn) {
846:         a_rownnz = bn;
847:         break;
848:       }
849:     }
850:     a_maxrownnz = PetscMax(a_maxrownnz, a_rownnz);
851:   }
852:   /* temporary work areas for merging rows */
853:   PetscMalloc1(a_maxrownnz*8,&workj_L1);
854:   PetscMalloc1(a_maxrownnz*8,&workj_L2);
855:   PetscMalloc1(a_maxrownnz,&workj_L3);

857:   /* This should be enough for almost all matrices. If not, memory is reallocated later. */
858:   c_maxmem = 8*(ai[am]+bi[bm]);
859:   /* Step 2: Populate pattern for C */
860:   PetscMalloc1(c_maxmem,&cj);

862:   ci_nnz       = 0;
863:   ci[0]        = 0;
864:   worki_L1[0]  = 0;
865:   worki_L2[0]  = 0;
866:   for (i=0; i<am; i++) {
867:     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
868:     const PetscInt *acol = aj + ai[i];      /* column indices of nonzero entries in this row */
869:     rowsleft             = anzi;
870:     inputcol_L1          = acol;
871:     L2_nnz               = 0;
872:     L2_nrows             = 1;  /* Number of rows to be merged on Level 3. output of L3 already exists -> initial value 1   */
873:     worki_L2[1]          = 0;
874:     outputi_nnz          = 0;

876:     /* If the number of indices in C so far + the max number of columns in the next row > c_maxmem  -> allocate more memory */
877:     while (ci_nnz+a_maxrownnz > c_maxmem) {
878:       c_maxmem *= 2;
879:       ndouble++;
880:       PetscRealloc(sizeof(PetscInt)*c_maxmem,&cj);
881:     }

883:     while (rowsleft) {
884:       L1_rowsleft = PetscMin(64, rowsleft); /* In the inner loop max 64 rows of B can be merged */
885:       L1_nrows    = 0;
886:       L1_nnz      = 0;
887:       inputcol    = inputcol_L1;
888:       inputi      = bi;
889:       inputj      = bj;

891:       /* The following macro is used to specialize for small rows in A.
892:          This helps with compiler unrolling, improving performance substantially.
893:           Input:  inputj   inputi  inputcol  bn
894:           Output: outputj  outputi_nnz                       */
895:        #define MatMatMultSymbolic_RowMergeMacro(ANNZ)                        \
896:          window_min  = bn;                                                   \
897:          outputi_nnz = 0;                                                    \
898:          for (k=0; k<ANNZ; ++k) {                                            \
899:            brow_ptr[k] = inputj + inputi[inputcol[k]];                       \
900:            brow_end[k] = inputj + inputi[inputcol[k]+1];                     \
901:            window[k]   = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn;   \
902:            window_min  = PetscMin(window[k], window_min);                    \
903:          }                                                                   \
904:          while (window_min < bn) {                                           \
905:            outputj[outputi_nnz++] = window_min;                              \
906:            /* advance front and compute new minimum */                       \
907:            old_window_min = window_min;                                      \
908:            window_min = bn;                                                  \
909:            for (k=0; k<ANNZ; ++k) {                                          \
910:              if (window[k] == old_window_min) {                              \
911:                brow_ptr[k]++;                                                \
912:                window[k] = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn; \
913:              }                                                               \
914:              window_min = PetscMin(window[k], window_min);                   \
915:            }                                                                 \
916:          }

918:       /************** L E V E L  1 ***************/
919:       /* Merge up to 8 rows of B to L1 work array*/
920:       while (L1_rowsleft) {
921:         outputi_nnz = 0;
922:         if (anzi > 8)  outputj = workj_L1 + L1_nnz;     /* Level 1 rowmerge*/
923:         else           outputj = cj + ci_nnz;           /* Merge directly to C */

925:         switch (L1_rowsleft) {
926:         case 1:  brow_ptr[0] = inputj + inputi[inputcol[0]];
927:                  brow_end[0] = inputj + inputi[inputcol[0]+1];
928:                  for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
929:                  inputcol    += L1_rowsleft;
930:                  rowsleft    -= L1_rowsleft;
931:                  L1_rowsleft  = 0;
932:                  break;
933:         case 2:  MatMatMultSymbolic_RowMergeMacro(2);
934:                  inputcol    += L1_rowsleft;
935:                  rowsleft    -= L1_rowsleft;
936:                  L1_rowsleft  = 0;
937:                  break;
938:         case 3: MatMatMultSymbolic_RowMergeMacro(3);
939:                  inputcol    += L1_rowsleft;
940:                  rowsleft    -= L1_rowsleft;
941:                  L1_rowsleft  = 0;
942:                  break;
943:         case 4:  MatMatMultSymbolic_RowMergeMacro(4);
944:                  inputcol    += L1_rowsleft;
945:                  rowsleft    -= L1_rowsleft;
946:                  L1_rowsleft  = 0;
947:                  break;
948:         case 5:  MatMatMultSymbolic_RowMergeMacro(5);
949:                  inputcol    += L1_rowsleft;
950:                  rowsleft    -= L1_rowsleft;
951:                  L1_rowsleft  = 0;
952:                  break;
953:         case 6:  MatMatMultSymbolic_RowMergeMacro(6);
954:                  inputcol    += L1_rowsleft;
955:                  rowsleft    -= L1_rowsleft;
956:                  L1_rowsleft  = 0;
957:                  break;
958:         case 7:  MatMatMultSymbolic_RowMergeMacro(7);
959:                  inputcol    += L1_rowsleft;
960:                  rowsleft    -= L1_rowsleft;
961:                  L1_rowsleft  = 0;
962:                  break;
963:         default: MatMatMultSymbolic_RowMergeMacro(8);
964:                  inputcol    += 8;
965:                  rowsleft    -= 8;
966:                  L1_rowsleft -= 8;
967:                  break;
968:         }
969:         inputcol_L1           = inputcol;
970:         L1_nnz               += outputi_nnz;
971:         worki_L1[++L1_nrows]  = L1_nnz;
972:       }

974:       /********************** L E V E L  2 ************************/
975:       /* Merge from L1 work array to either C or to L2 work array */
976:       if (anzi > 8) {
977:         inputi      = worki_L1;
978:         inputj      = workj_L1;
979:         inputcol    = workcol;
980:         outputi_nnz = 0;

982:         if (anzi <= 64) outputj = cj + ci_nnz;        /* Merge from L1 work array to C */
983:         else            outputj = workj_L2 + L2_nnz;  /* Merge from L1 work array to L2 work array */

985:         switch (L1_nrows) {
986:         case 1:  brow_ptr[0] = inputj + inputi[inputcol[0]];
987:                  brow_end[0] = inputj + inputi[inputcol[0]+1];
988:                  for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
989:                  break;
990:         case 2:  MatMatMultSymbolic_RowMergeMacro(2); break;
991:         case 3:  MatMatMultSymbolic_RowMergeMacro(3); break;
992:         case 4:  MatMatMultSymbolic_RowMergeMacro(4); break;
993:         case 5:  MatMatMultSymbolic_RowMergeMacro(5); break;
994:         case 6:  MatMatMultSymbolic_RowMergeMacro(6); break;
995:         case 7:  MatMatMultSymbolic_RowMergeMacro(7); break;
996:         case 8:  MatMatMultSymbolic_RowMergeMacro(8); break;
997:         default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatMatMult logic error: Not merging 1-8 rows from L1 work array!");
998:         }
999:         L2_nnz               += outputi_nnz;
1000:         worki_L2[++L2_nrows]  = L2_nnz;

1002:         /************************ L E V E L  3 **********************/
1003:         /* Merge from L2 work array to either C or to L2 work array */
1004:         if (anzi > 64 && (L2_nrows == 8 || rowsleft == 0)) {
1005:           inputi      = worki_L2;
1006:           inputj      = workj_L2;
1007:           inputcol    = workcol;
1008:           outputi_nnz = 0;
1009:           if (rowsleft) outputj = workj_L3;
1010:           else          outputj = cj + ci_nnz;
1011:           switch (L2_nrows) {
1012:           case 1:  brow_ptr[0] = inputj + inputi[inputcol[0]];
1013:                    brow_end[0] = inputj + inputi[inputcol[0]+1];
1014:                    for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
1015:                    break;
1016:           case 2:  MatMatMultSymbolic_RowMergeMacro(2); break;
1017:           case 3:  MatMatMultSymbolic_RowMergeMacro(3); break;
1018:           case 4:  MatMatMultSymbolic_RowMergeMacro(4); break;
1019:           case 5:  MatMatMultSymbolic_RowMergeMacro(5); break;
1020:           case 6:  MatMatMultSymbolic_RowMergeMacro(6); break;
1021:           case 7:  MatMatMultSymbolic_RowMergeMacro(7); break;
1022:           case 8:  MatMatMultSymbolic_RowMergeMacro(8); break;
1023:           default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatMatMult logic error: Not merging 1-8 rows from L2 work array!");
1024:           }
1025:           L2_nrows    = 1;
1026:           L2_nnz      = outputi_nnz;
1027:           worki_L2[1] = outputi_nnz;
1028:           /* Copy to workj_L2 */
1029:           if (rowsleft) {
1030:             for (k=0; k<outputi_nnz; ++k)  workj_L2[k] = outputj[k];
1031:           }
1032:         }
1033:       }
1034:     }  /* while (rowsleft) */
1035: #undef MatMatMultSymbolic_RowMergeMacro

1037:     /* terminate current row */
1038:     ci_nnz += outputi_nnz;
1039:     ci[i+1] = ci_nnz;
1040:   }

1042:   /* Step 3: Create the new symbolic matrix */
1043:   MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C);
1044:   MatSetBlockSizesFromMats(C,A,B);

1046:   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
1047:   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
1048:   c          = (Mat_SeqAIJ*)(C->data);
1049:   c->free_a  = PETSC_TRUE;
1050:   c->free_ij = PETSC_TRUE;
1051:   c->nonew   = 0;

1053:   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;

1055:   /* set MatInfo */
1056:   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
1057:   if (afill < 1.0) afill = 1.0;
1058:   C->info.mallocs           = ndouble;
1059:   C->info.fill_ratio_given  = fill;
1060:   C->info.fill_ratio_needed = afill;

1062: #if defined(PETSC_USE_INFO)
1063:   if (ci[am]) {
1064:     PetscInfo(C,"Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);
1065:     PetscInfo(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);
1066:   } else {
1067:     PetscInfo(C,"Empty matrix product\n");
1068:   }
1069: #endif

1071:   /* Step 4: Free temporary work areas */
1072:   PetscFree(workj_L1);
1073:   PetscFree(workj_L2);
1074:   PetscFree(workj_L3);
1075:   return 0;
1076: }

1078: /* concatenate unique entries and then sort */
1079: PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(Mat A,Mat B,PetscReal fill,Mat C)
1080: {
1081:   Mat_SeqAIJ     *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
1082:   const PetscInt *ai = a->i,*bi=b->i,*aj=a->j,*bj=b->j;
1083:   PetscInt       *ci,*cj,bcol;
1084:   PetscInt       am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
1085:   PetscReal      afill;
1086:   PetscInt       i,j,ndouble = 0;
1087:   PetscSegBuffer seg,segrow;
1088:   char           *seen;

1090:   PetscMalloc1(am+1,&ci);
1091:   ci[0] = 0;

1093:   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
1094:   PetscSegBufferCreate(sizeof(PetscInt),(PetscInt)(fill*(ai[am]+bi[bm])),&seg);
1095:   PetscSegBufferCreate(sizeof(PetscInt),100,&segrow);
1096:   PetscCalloc1(bn,&seen);

1098:   /* Determine ci and cj */
1099:   for (i=0; i<am; i++) {
1100:     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
1101:     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
1102:     PetscInt packlen = 0,*PETSC_RESTRICT crow;

1104:     /* Pack segrow */
1105:     for (j=0; j<anzi; j++) {
1106:       PetscInt brow = acol[j],bjstart = bi[brow],bjend = bi[brow+1],k;
1107:       for (k=bjstart; k<bjend; k++) {
1108:         bcol = bj[k];
1109:         if (!seen[bcol]) { /* new entry */
1110:           PetscInt *PETSC_RESTRICT slot;
1111:           PetscSegBufferGetInts(segrow,1,&slot);
1112:           *slot = bcol;
1113:           seen[bcol] = 1;
1114:           packlen++;
1115:         }
1116:       }
1117:     }

1119:     /* Check i-th diagonal entry */
1120:     if (C->force_diagonals && !seen[i]) {
1121:       PetscInt *PETSC_RESTRICT slot;
1122:       PetscSegBufferGetInts(segrow,1,&slot);
1123:       *slot   = i;
1124:       seen[i] = 1;
1125:       packlen++;
1126:     }

1128:     PetscSegBufferGetInts(seg,packlen,&crow);
1129:     PetscSegBufferExtractTo(segrow,crow);
1130:     PetscSortInt(packlen,crow);
1131:     ci[i+1] = ci[i] + packlen;
1132:     for (j=0; j<packlen; j++) seen[crow[j]] = 0;
1133:   }
1134:   PetscSegBufferDestroy(&segrow);
1135:   PetscFree(seen);

1137:   /* Column indices are in the segmented buffer */
1138:   PetscSegBufferExtractAlloc(seg,&cj);
1139:   PetscSegBufferDestroy(&seg);

1141:   /* put together the new symbolic matrix */
1142:   MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C);
1143:   MatSetBlockSizesFromMats(C,A,B);

1145:   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
1146:   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
1147:   c          = (Mat_SeqAIJ*)(C->data);
1148:   c->free_a  = PETSC_TRUE;
1149:   c->free_ij = PETSC_TRUE;
1150:   c->nonew   = 0;

1152:   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;

1154:   /* set MatInfo */
1155:   afill = (PetscReal)ci[am]/PetscMax(ai[am]+bi[bm],1) + 1.e-5;
1156:   if (afill < 1.0) afill = 1.0;
1157:   C->info.mallocs           = ndouble;
1158:   C->info.fill_ratio_given  = fill;
1159:   C->info.fill_ratio_needed = afill;

1161: #if defined(PETSC_USE_INFO)
1162:   if (ci[am]) {
1163:     PetscInfo(C,"Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);
1164:     PetscInfo(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);
1165:   } else {
1166:     PetscInfo(C,"Empty matrix product\n");
1167:   }
1168: #endif
1169:   return 0;
1170: }

1172: PetscErrorCode MatDestroy_SeqAIJ_MatMatMultTrans(void *data)
1173: {
1174:   Mat_MatMatTransMult *abt=(Mat_MatMatTransMult *)data;

1176:   MatTransposeColoringDestroy(&abt->matcoloring);
1177:   MatDestroy(&abt->Bt_den);
1178:   MatDestroy(&abt->ABt_den);
1179:   PetscFree(abt);
1180:   return 0;
1181: }

1183: PetscErrorCode MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat C)
1184: {
1185:   Mat                 Bt;
1186:   PetscInt            *bti,*btj;
1187:   Mat_MatMatTransMult *abt;
1188:   Mat_Product         *product = C->product;
1189:   char                *alg;


1194:   /* create symbolic Bt */
1195:   MatGetSymbolicTranspose_SeqAIJ(B,&bti,&btj);
1196:   MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,B->cmap->n,B->rmap->n,bti,btj,NULL,&Bt);
1197:   MatSetBlockSizes(Bt,PetscAbs(A->cmap->bs),PetscAbs(B->cmap->bs));
1198:   MatSetType(Bt,((PetscObject)A)->type_name);

1200:   /* get symbolic C=A*Bt */
1201:   PetscStrallocpy(product->alg,&alg);
1202:   MatProductSetAlgorithm(C,"sorted"); /* set algorithm for C = A*Bt */
1203:   MatMatMultSymbolic_SeqAIJ_SeqAIJ(A,Bt,fill,C);
1204:   MatProductSetAlgorithm(C,alg); /* resume original algorithm for ABt product */
1205:   PetscFree(alg);

1207:   /* create a supporting struct for reuse intermediate dense matrices with matcoloring */
1208:   PetscNew(&abt);

1210:   product->data    = abt;
1211:   product->destroy = MatDestroy_SeqAIJ_MatMatMultTrans;

1213:   C->ops->mattransposemultnumeric = MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ;

1215:   abt->usecoloring = PETSC_FALSE;
1216:   PetscStrcmp(product->alg,"color",&abt->usecoloring);
1217:   if (abt->usecoloring) {
1218:     /* Create MatTransposeColoring from symbolic C=A*B^T */
1219:     MatTransposeColoring matcoloring;
1220:     MatColoring          coloring;
1221:     ISColoring           iscoloring;
1222:     Mat                  Bt_dense,C_dense;

1224:     /* inode causes memory problem */
1225:     MatSetOption(C,MAT_USE_INODES,PETSC_FALSE);

1227:     MatColoringCreate(C,&coloring);
1228:     MatColoringSetDistance(coloring,2);
1229:     MatColoringSetType(coloring,MATCOLORINGSL);
1230:     MatColoringSetFromOptions(coloring);
1231:     MatColoringApply(coloring,&iscoloring);
1232:     MatColoringDestroy(&coloring);
1233:     MatTransposeColoringCreate(C,iscoloring,&matcoloring);

1235:     abt->matcoloring = matcoloring;

1237:     ISColoringDestroy(&iscoloring);

1239:     /* Create Bt_dense and C_dense = A*Bt_dense */
1240:     MatCreate(PETSC_COMM_SELF,&Bt_dense);
1241:     MatSetSizes(Bt_dense,A->cmap->n,matcoloring->ncolors,A->cmap->n,matcoloring->ncolors);
1242:     MatSetType(Bt_dense,MATSEQDENSE);
1243:     MatSeqDenseSetPreallocation(Bt_dense,NULL);

1245:     Bt_dense->assembled = PETSC_TRUE;
1246:     abt->Bt_den         = Bt_dense;

1248:     MatCreate(PETSC_COMM_SELF,&C_dense);
1249:     MatSetSizes(C_dense,A->rmap->n,matcoloring->ncolors,A->rmap->n,matcoloring->ncolors);
1250:     MatSetType(C_dense,MATSEQDENSE);
1251:     MatSeqDenseSetPreallocation(C_dense,NULL);

1253:     Bt_dense->assembled = PETSC_TRUE;
1254:     abt->ABt_den  = C_dense;

1256: #if defined(PETSC_USE_INFO)
1257:     {
1258:       Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data;
1259:       PetscInfo(C,"Use coloring of C=A*B^T; B^T: %" PetscInt_FMT " %" PetscInt_FMT ", Bt_dense: %" PetscInt_FMT ",%" PetscInt_FMT "; Cnz %" PetscInt_FMT " / (cm*ncolors %" PetscInt_FMT ") = %g\n",B->cmap->n,B->rmap->n,Bt_dense->rmap->n,Bt_dense->cmap->n,c->nz,A->rmap->n*matcoloring->ncolors,(double)(((PetscReal)(c->nz))/((PetscReal)(A->rmap->n*matcoloring->ncolors))));
1260:     }
1261: #endif
1262:   }
1263:   /* clean up */
1264:   MatDestroy(&Bt);
1265:   MatRestoreSymbolicTranspose_SeqAIJ(B,&bti,&btj);
1266:   return 0;
1267: }

1269: PetscErrorCode MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
1270: {
1271:   Mat_SeqAIJ          *a   =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c=(Mat_SeqAIJ*)C->data;
1272:   PetscInt            *ai  =a->i,*aj=a->j,*bi=b->i,*bj=b->j,anzi,bnzj,nexta,nextb,*acol,*bcol,brow;
1273:   PetscInt            cm   =C->rmap->n,*ci=c->i,*cj=c->j,i,j,cnzi,*ccol;
1274:   PetscLogDouble      flops=0.0;
1275:   MatScalar           *aa  =a->a,*aval,*ba=b->a,*bval,*ca,*cval;
1276:   Mat_MatMatTransMult *abt;
1277:   Mat_Product         *product = C->product;

1280:   abt = (Mat_MatMatTransMult *)product->data;
1282:   /* clear old values in C */
1283:   if (!c->a) {
1284:     PetscCalloc1(ci[cm]+1,&ca);
1285:     c->a      = ca;
1286:     c->free_a = PETSC_TRUE;
1287:   } else {
1288:     ca =  c->a;
1289:     PetscArrayzero(ca,ci[cm]+1);
1290:   }

1292:   if (abt->usecoloring) {
1293:     MatTransposeColoring matcoloring = abt->matcoloring;
1294:     Mat                  Bt_dense,C_dense = abt->ABt_den;

1296:     /* Get Bt_dense by Apply MatTransposeColoring to B */
1297:     Bt_dense = abt->Bt_den;
1298:     MatTransColoringApplySpToDen(matcoloring,B,Bt_dense);

1300:     /* C_dense = A*Bt_dense */
1301:     MatMatMultNumeric_SeqAIJ_SeqDense(A,Bt_dense,C_dense);

1303:     /* Recover C from C_dense */
1304:     MatTransColoringApplyDenToSp(matcoloring,C_dense,C);
1305:     return 0;
1306:   }

1308:   for (i=0; i<cm; i++) {
1309:     anzi = ai[i+1] - ai[i];
1310:     acol = aj + ai[i];
1311:     aval = aa + ai[i];
1312:     cnzi = ci[i+1] - ci[i];
1313:     ccol = cj + ci[i];
1314:     cval = ca + ci[i];
1315:     for (j=0; j<cnzi; j++) {
1316:       brow = ccol[j];
1317:       bnzj = bi[brow+1] - bi[brow];
1318:       bcol = bj + bi[brow];
1319:       bval = ba + bi[brow];

1321:       /* perform sparse inner-product c(i,j)=A[i,:]*B[j,:]^T */
1322:       nexta = 0; nextb = 0;
1323:       while (nexta<anzi && nextb<bnzj) {
1324:         while (nexta < anzi && acol[nexta] < bcol[nextb]) nexta++;
1325:         if (nexta == anzi) break;
1326:         while (nextb < bnzj && acol[nexta] > bcol[nextb]) nextb++;
1327:         if (nextb == bnzj) break;
1328:         if (acol[nexta] == bcol[nextb]) {
1329:           cval[j] += aval[nexta]*bval[nextb];
1330:           nexta++; nextb++;
1331:           flops += 2;
1332:         }
1333:       }
1334:     }
1335:   }
1336:   MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
1337:   MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
1338:   PetscLogFlops(flops);
1339:   return 0;
1340: }

1342: PetscErrorCode MatDestroy_SeqAIJ_MatTransMatMult(void *data)
1343: {
1344:   Mat_MatTransMatMult *atb = (Mat_MatTransMatMult*)data;

1346:   MatDestroy(&atb->At);
1347:   if (atb->destroy) {
1348:     (*atb->destroy)(atb->data);
1349:   }
1350:   PetscFree(atb);
1351:   return 0;
1352: }

1354: PetscErrorCode MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat C)
1355: {
1356:   Mat            At = NULL;
1357:   PetscInt       *ati,*atj;
1358:   Mat_Product    *product = C->product;
1359:   PetscBool      flg,def,square;

1361:   MatCheckProduct(C,4);
1362:   square = (PetscBool)(A == B && A->symmetric && A->symmetric_set);
1363:   /* outerproduct */
1364:   PetscStrcmp(product->alg,"outerproduct",&flg);
1365:   if (flg) {
1366:     /* create symbolic At */
1367:     if (!square) {
1368:       MatGetSymbolicTranspose_SeqAIJ(A,&ati,&atj);
1369:       MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,A->cmap->n,A->rmap->n,ati,atj,NULL,&At);
1370:       MatSetBlockSizes(At,PetscAbs(A->cmap->bs),PetscAbs(B->cmap->bs));
1371:       MatSetType(At,((PetscObject)A)->type_name);
1372:     }
1373:     /* get symbolic C=At*B */
1374:     MatProductSetAlgorithm(C,"sorted");
1375:     MatMatMultSymbolic_SeqAIJ_SeqAIJ(square ? A : At,B,fill,C);

1377:     /* clean up */
1378:     if (!square) {
1379:       MatDestroy(&At);
1380:       MatRestoreSymbolicTranspose_SeqAIJ(A,&ati,&atj);
1381:     }

1383:     C->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ; /* outerproduct */
1384:     MatProductSetAlgorithm(C,"outerproduct");
1385:     return 0;
1386:   }

1388:   /* matmatmult */
1389:   PetscStrcmp(product->alg,"default",&def);
1390:   PetscStrcmp(product->alg,"at*b",&flg);
1391:   if (flg || def) {
1392:     Mat_MatTransMatMult *atb;

1395:     PetscNew(&atb);
1396:     if (!square) {
1397:       MatTranspose_SeqAIJ(A,MAT_INITIAL_MATRIX,&At);
1398:     }
1399:     MatProductSetAlgorithm(C,"sorted");
1400:     MatMatMultSymbolic_SeqAIJ_SeqAIJ(square ? A : At,B,fill,C);
1401:     MatProductSetAlgorithm(C,"at*b");
1402:     product->data    = atb;
1403:     product->destroy = MatDestroy_SeqAIJ_MatTransMatMult;
1404:     atb->At          = At;
1405:     atb->updateAt    = PETSC_FALSE; /* because At is computed here */

1407:     C->ops->mattransposemultnumeric = NULL; /* see MatProductNumeric_AtB_SeqAIJ_SeqAIJ */
1408:     return 0;
1409:   }

1411:   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Mat Product Algorithm is not supported");
1412: }

1414: PetscErrorCode MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
1415: {
1416:   Mat_SeqAIJ     *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c=(Mat_SeqAIJ*)C->data;
1417:   PetscInt       am=A->rmap->n,anzi,*ai=a->i,*aj=a->j,*bi=b->i,*bj,bnzi,nextb;
1418:   PetscInt       cm=C->rmap->n,*ci=c->i,*cj=c->j,crow,*cjj,i,j,k;
1419:   PetscLogDouble flops=0.0;
1420:   MatScalar      *aa=a->a,*ba,*ca,*caj;

1422:   if (!c->a) {
1423:     PetscCalloc1(ci[cm]+1,&ca);

1425:     c->a      = ca;
1426:     c->free_a = PETSC_TRUE;
1427:   } else {
1428:     ca   = c->a;
1429:     PetscArrayzero(ca,ci[cm]);
1430:   }

1432:   /* compute A^T*B using outer product (A^T)[:,i]*B[i,:] */
1433:   for (i=0; i<am; i++) {
1434:     bj   = b->j + bi[i];
1435:     ba   = b->a + bi[i];
1436:     bnzi = bi[i+1] - bi[i];
1437:     anzi = ai[i+1] - ai[i];
1438:     for (j=0; j<anzi; j++) {
1439:       nextb = 0;
1440:       crow  = *aj++;
1441:       cjj   = cj + ci[crow];
1442:       caj   = ca + ci[crow];
1443:       /* perform sparse axpy operation.  Note cjj includes bj. */
1444:       for (k=0; nextb<bnzi; k++) {
1445:         if (cjj[k] == *(bj+nextb)) { /* ccol == bcol */
1446:           caj[k] += (*aa)*(*(ba+nextb));
1447:           nextb++;
1448:         }
1449:       }
1450:       flops += 2*bnzi;
1451:       aa++;
1452:     }
1453:   }

1455:   /* Assemble the final matrix and clean up */
1456:   MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
1457:   MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
1458:   PetscLogFlops(flops);
1459:   return 0;
1460: }

1462: PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat C)
1463: {
1464:   MatMatMultSymbolic_SeqDense_SeqDense(A,B,0.0,C);
1465:   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqDense;
1466:   return 0;
1467: }

1469: PETSC_INTERN PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat A,Mat B,Mat C,const PetscBool add)
1470: {
1471:   Mat_SeqAIJ        *a=(Mat_SeqAIJ*)A->data;
1472:   PetscScalar       *c,r1,r2,r3,r4,*c1,*c2,*c3,*c4;
1473:   const PetscScalar *aa,*b,*b1,*b2,*b3,*b4,*av;
1474:   const PetscInt    *aj;
1475:   PetscInt          cm=C->rmap->n,cn=B->cmap->n,bm,am=A->rmap->n;
1476:   PetscInt          clda;
1477:   PetscInt          am4,bm4,col,i,j,n;

1479:   if (!cm || !cn) return 0;
1480:   MatSeqAIJGetArrayRead(A,&av);
1481:   if (add) {
1482:     MatDenseGetArray(C,&c);
1483:   } else {
1484:     MatDenseGetArrayWrite(C,&c);
1485:   }
1486:   MatDenseGetArrayRead(B,&b);
1487:   MatDenseGetLDA(B,&bm);
1488:   MatDenseGetLDA(C,&clda);
1489:   am4 = 4*clda;
1490:   bm4 = 4*bm;
1491:   b1 = b; b2 = b1 + bm; b3 = b2 + bm; b4 = b3 + bm;
1492:   c1 = c; c2 = c1 + clda; c3 = c2 + clda; c4 = c3 + clda;
1493:   for (col=0; col<(cn/4)*4; col += 4) {  /* over columns of C */
1494:     for (i=0; i<am; i++) {        /* over rows of A in those columns */
1495:       r1 = r2 = r3 = r4 = 0.0;
1496:       n  = a->i[i+1] - a->i[i];
1497:       aj = a->j + a->i[i];
1498:       aa = av + a->i[i];
1499:       for (j=0; j<n; j++) {
1500:         const PetscScalar aatmp = aa[j];
1501:         const PetscInt    ajtmp = aj[j];
1502:         r1 += aatmp*b1[ajtmp];
1503:         r2 += aatmp*b2[ajtmp];
1504:         r3 += aatmp*b3[ajtmp];
1505:         r4 += aatmp*b4[ajtmp];
1506:       }
1507:       if (add) {
1508:         c1[i] += r1;
1509:         c2[i] += r2;
1510:         c3[i] += r3;
1511:         c4[i] += r4;
1512:       } else {
1513:         c1[i] = r1;
1514:         c2[i] = r2;
1515:         c3[i] = r3;
1516:         c4[i] = r4;
1517:       }
1518:     }
1519:     b1 += bm4; b2 += bm4; b3 += bm4; b4 += bm4;
1520:     c1 += am4; c2 += am4; c3 += am4; c4 += am4;
1521:   }
1522:   /* process remaining columns */
1523:   if (col != cn) {
1524:     PetscInt rc = cn-col;

1526:     if (rc == 1) {
1527:       for (i=0; i<am; i++) {
1528:         r1 = 0.0;
1529:         n  = a->i[i+1] - a->i[i];
1530:         aj = a->j + a->i[i];
1531:         aa = av + a->i[i];
1532:         for (j=0; j<n; j++) r1 += aa[j]*b1[aj[j]];
1533:         if (add) c1[i] += r1;
1534:         else c1[i] = r1;
1535:       }
1536:     } else if (rc == 2) {
1537:       for (i=0; i<am; i++) {
1538:         r1 = r2 = 0.0;
1539:         n  = a->i[i+1] - a->i[i];
1540:         aj = a->j + a->i[i];
1541:         aa = av + a->i[i];
1542:         for (j=0; j<n; j++) {
1543:           const PetscScalar aatmp = aa[j];
1544:           const PetscInt    ajtmp = aj[j];
1545:           r1 += aatmp*b1[ajtmp];
1546:           r2 += aatmp*b2[ajtmp];
1547:         }
1548:         if (add) {
1549:           c1[i] += r1;
1550:           c2[i] += r2;
1551:         } else {
1552:           c1[i] = r1;
1553:           c2[i] = r2;
1554:         }
1555:       }
1556:     } else {
1557:       for (i=0; i<am; i++) {
1558:         r1 = r2 = r3 = 0.0;
1559:         n  = a->i[i+1] - a->i[i];
1560:         aj = a->j + a->i[i];
1561:         aa = av + a->i[i];
1562:         for (j=0; j<n; j++) {
1563:           const PetscScalar aatmp = aa[j];
1564:           const PetscInt    ajtmp = aj[j];
1565:           r1 += aatmp*b1[ajtmp];
1566:           r2 += aatmp*b2[ajtmp];
1567:           r3 += aatmp*b3[ajtmp];
1568:         }
1569:         if (add) {
1570:           c1[i] += r1;
1571:           c2[i] += r2;
1572:           c3[i] += r3;
1573:         } else {
1574:           c1[i] = r1;
1575:           c2[i] = r2;
1576:           c3[i] = r3;
1577:         }
1578:       }
1579:     }
1580:   }
1581:   PetscLogFlops(cn*(2.0*a->nz));
1582:   if (add) {
1583:     MatDenseRestoreArray(C,&c);
1584:   } else {
1585:     MatDenseRestoreArrayWrite(C,&c);
1586:   }
1587:   MatDenseRestoreArrayRead(B,&b);
1588:   MatSeqAIJRestoreArrayRead(A,&av);
1589:   return 0;
1590: }

1592: PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqDense(Mat A,Mat B,Mat C)
1593: {

1598:   MatMatMultNumericAdd_SeqAIJ_SeqDense(A,B,C,PETSC_FALSE);
1599:   return 0;
1600: }

1602: /* ------------------------------------------------------- */
1603: static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AB(Mat C)
1604: {
1605:   C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqDense;
1606:   C->ops->productsymbolic = MatProductSymbolic_AB;
1607:   return 0;
1608: }

1610: PETSC_INTERN PetscErrorCode MatTMatTMultSymbolic_SeqAIJ_SeqDense(Mat,Mat,PetscReal,Mat);

1612: static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(Mat C)
1613: {
1614:   C->ops->transposematmultsymbolic = MatTMatTMultSymbolic_SeqAIJ_SeqDense;
1615:   C->ops->productsymbolic          = MatProductSymbolic_AtB;
1616:   return 0;
1617: }

1619: static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_ABt(Mat C)
1620: {
1621:   C->ops->mattransposemultsymbolic = MatTMatTMultSymbolic_SeqAIJ_SeqDense;
1622:   C->ops->productsymbolic          = MatProductSymbolic_ABt;
1623:   return 0;
1624: }

1626: PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat C)
1627: {
1628:   Mat_Product    *product = C->product;

1630:   switch (product->type) {
1631:   case MATPRODUCT_AB:
1632:     MatProductSetFromOptions_SeqAIJ_SeqDense_AB(C);
1633:     break;
1634:   case MATPRODUCT_AtB:
1635:     MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(C);
1636:     break;
1637:   case MATPRODUCT_ABt:
1638:     MatProductSetFromOptions_SeqAIJ_SeqDense_ABt(C);
1639:     break;
1640:   default:
1641:     break;
1642:   }
1643:   return 0;
1644: }
1645: /* ------------------------------------------------------- */
1646: static PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(Mat C)
1647: {
1648:   Mat_Product    *product = C->product;
1649:   Mat            A = product->A;
1650:   PetscBool      baij;

1652:   PetscObjectTypeCompare((PetscObject)A,MATSEQBAIJ,&baij);
1653:   if (!baij) { /* A is seqsbaij */
1654:     PetscBool sbaij;
1655:     PetscObjectTypeCompare((PetscObject)A,MATSEQSBAIJ,&sbaij);

1658:     C->ops->matmultsymbolic = MatMatMultSymbolic_SeqSBAIJ_SeqDense;
1659:   } else { /* A is seqbaij */
1660:     C->ops->matmultsymbolic = MatMatMultSymbolic_SeqBAIJ_SeqDense;
1661:   }

1663:   C->ops->productsymbolic = MatProductSymbolic_AB;
1664:   return 0;
1665: }

1667: PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense(Mat C)
1668: {
1669:   Mat_Product    *product = C->product;

1671:   MatCheckProduct(C,1);
1673:   if (product->type == MATPRODUCT_AB || (product->type == MATPRODUCT_AtB && product->A->symmetric)) {
1674:     MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(C);
1675:   }
1676:   return 0;
1677: }

1679: /* ------------------------------------------------------- */
1680: static PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ_AB(Mat C)
1681: {
1682:   C->ops->matmultsymbolic = MatMatMultSymbolic_SeqDense_SeqAIJ;
1683:   C->ops->productsymbolic = MatProductSymbolic_AB;
1684:   return 0;
1685: }

1687: PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ(Mat C)
1688: {
1689:   Mat_Product    *product = C->product;

1691:   if (product->type == MATPRODUCT_AB) {
1692:     MatProductSetFromOptions_SeqDense_SeqAIJ_AB(C);
1693:   }
1694:   return 0;
1695: }
1696: /* ------------------------------------------------------- */

1698: PetscErrorCode  MatTransColoringApplySpToDen_SeqAIJ(MatTransposeColoring coloring,Mat B,Mat Btdense)
1699: {
1700:   Mat_SeqAIJ     *b       = (Mat_SeqAIJ*)B->data;
1701:   Mat_SeqDense   *btdense = (Mat_SeqDense*)Btdense->data;
1702:   PetscInt       *bi      = b->i,*bj=b->j;
1703:   PetscInt       m        = Btdense->rmap->n,n=Btdense->cmap->n,j,k,l,col,anz,*btcol,brow,ncolumns;
1704:   MatScalar      *btval,*btval_den,*ba=b->a;
1705:   PetscInt       *columns=coloring->columns,*colorforcol=coloring->colorforcol,ncolors=coloring->ncolors;

1707:   btval_den=btdense->v;
1708:   PetscArrayzero(btval_den,m*n);
1709:   for (k=0; k<ncolors; k++) {
1710:     ncolumns = coloring->ncolumns[k];
1711:     for (l=0; l<ncolumns; l++) { /* insert a row of B to a column of Btdense */
1712:       col   = *(columns + colorforcol[k] + l);
1713:       btcol = bj + bi[col];
1714:       btval = ba + bi[col];
1715:       anz   = bi[col+1] - bi[col];
1716:       for (j=0; j<anz; j++) {
1717:         brow            = btcol[j];
1718:         btval_den[brow] = btval[j];
1719:       }
1720:     }
1721:     btval_den += m;
1722:   }
1723:   return 0;
1724: }

1726: PetscErrorCode MatTransColoringApplyDenToSp_SeqAIJ(MatTransposeColoring matcoloring,Mat Cden,Mat Csp)
1727: {
1728:   Mat_SeqAIJ        *csp = (Mat_SeqAIJ*)Csp->data;
1729:   const PetscScalar *ca_den,*ca_den_ptr;
1730:   PetscScalar       *ca=csp->a;
1731:   PetscInt          k,l,m=Cden->rmap->n,ncolors=matcoloring->ncolors;
1732:   PetscInt          brows=matcoloring->brows,*den2sp=matcoloring->den2sp;
1733:   PetscInt          nrows,*row,*idx;
1734:   PetscInt          *rows=matcoloring->rows,*colorforrow=matcoloring->colorforrow;

1736:   MatDenseGetArrayRead(Cden,&ca_den);

1738:   if (brows > 0) {
1739:     PetscInt *lstart,row_end,row_start;
1740:     lstart = matcoloring->lstart;
1741:     PetscArrayzero(lstart,ncolors);

1743:     row_end = brows;
1744:     if (row_end > m) row_end = m;
1745:     for (row_start=0; row_start<m; row_start+=brows) { /* loop over row blocks of Csp */
1746:       ca_den_ptr = ca_den;
1747:       for (k=0; k<ncolors; k++) { /* loop over colors (columns of Cden) */
1748:         nrows = matcoloring->nrows[k];
1749:         row   = rows  + colorforrow[k];
1750:         idx   = den2sp + colorforrow[k];
1751:         for (l=lstart[k]; l<nrows; l++) {
1752:           if (row[l] >= row_end) {
1753:             lstart[k] = l;
1754:             break;
1755:           } else {
1756:             ca[idx[l]] = ca_den_ptr[row[l]];
1757:           }
1758:         }
1759:         ca_den_ptr += m;
1760:       }
1761:       row_end += brows;
1762:       if (row_end > m) row_end = m;
1763:     }
1764:   } else { /* non-blocked impl: loop over columns of Csp - slow if Csp is large */
1765:     ca_den_ptr = ca_den;
1766:     for (k=0; k<ncolors; k++) {
1767:       nrows = matcoloring->nrows[k];
1768:       row   = rows  + colorforrow[k];
1769:       idx   = den2sp + colorforrow[k];
1770:       for (l=0; l<nrows; l++) {
1771:         ca[idx[l]] = ca_den_ptr[row[l]];
1772:       }
1773:       ca_den_ptr += m;
1774:     }
1775:   }

1777:   MatDenseRestoreArrayRead(Cden,&ca_den);
1778: #if defined(PETSC_USE_INFO)
1779:   if (matcoloring->brows > 0) {
1780:     PetscInfo(Csp,"Loop over %" PetscInt_FMT " row blocks for den2sp\n",brows);
1781:   } else {
1782:     PetscInfo(Csp,"Loop over colors/columns of Cden, inefficient for large sparse matrix product \n");
1783:   }
1784: #endif
1785:   return 0;
1786: }

1788: PetscErrorCode MatTransposeColoringCreate_SeqAIJ(Mat mat,ISColoring iscoloring,MatTransposeColoring c)
1789: {
1790:   PetscInt       i,n,nrows,Nbs,j,k,m,ncols,col,cm;
1791:   const PetscInt *is,*ci,*cj,*row_idx;
1792:   PetscInt       nis = iscoloring->n,*rowhit,bs = 1;
1793:   IS             *isa;
1794:   Mat_SeqAIJ     *csp = (Mat_SeqAIJ*)mat->data;
1795:   PetscInt       *colorforrow,*rows,*rows_i,*idxhit,*spidx,*den2sp,*den2sp_i;
1796:   PetscInt       *colorforcol,*columns,*columns_i,brows;
1797:   PetscBool      flg;

1799:   ISColoringGetIS(iscoloring,PETSC_USE_POINTER,PETSC_IGNORE,&isa);

1801:   /* bs >1 is not being tested yet! */
1802:   Nbs       = mat->cmap->N/bs;
1803:   c->M      = mat->rmap->N/bs;  /* set total rows, columns and local rows */
1804:   c->N      = Nbs;
1805:   c->m      = c->M;
1806:   c->rstart = 0;
1807:   c->brows  = 100;

1809:   c->ncolors = nis;
1810:   PetscMalloc3(nis,&c->ncolumns,nis,&c->nrows,nis+1,&colorforrow);
1811:   PetscMalloc1(csp->nz+1,&rows);
1812:   PetscMalloc1(csp->nz+1,&den2sp);

1814:   brows = c->brows;
1815:   PetscOptionsGetInt(NULL,NULL,"-matden2sp_brows",&brows,&flg);
1816:   if (flg) c->brows = brows;
1817:   if (brows > 0) {
1818:     PetscMalloc1(nis+1,&c->lstart);
1819:   }

1821:   colorforrow[0] = 0;
1822:   rows_i         = rows;
1823:   den2sp_i       = den2sp;

1825:   PetscMalloc1(nis+1,&colorforcol);
1826:   PetscMalloc1(Nbs+1,&columns);

1828:   colorforcol[0] = 0;
1829:   columns_i      = columns;

1831:   /* get column-wise storage of mat */
1832:   MatGetColumnIJ_SeqAIJ_Color(mat,0,PETSC_FALSE,PETSC_FALSE,&ncols,&ci,&cj,&spidx,NULL);

1834:   cm   = c->m;
1835:   PetscMalloc1(cm+1,&rowhit);
1836:   PetscMalloc1(cm+1,&idxhit);
1837:   for (i=0; i<nis; i++) { /* loop over color */
1838:     ISGetLocalSize(isa[i],&n);
1839:     ISGetIndices(isa[i],&is);

1841:     c->ncolumns[i] = n;
1842:     if (n) {
1843:       PetscArraycpy(columns_i,is,n);
1844:     }
1845:     colorforcol[i+1] = colorforcol[i] + n;
1846:     columns_i       += n;

1848:     /* fast, crude version requires O(N*N) work */
1849:     PetscArrayzero(rowhit,cm);

1851:     for (j=0; j<n; j++) { /* loop over columns*/
1852:       col     = is[j];
1853:       row_idx = cj + ci[col];
1854:       m       = ci[col+1] - ci[col];
1855:       for (k=0; k<m; k++) { /* loop over columns marking them in rowhit */
1856:         idxhit[*row_idx]   = spidx[ci[col] + k];
1857:         rowhit[*row_idx++] = col + 1;
1858:       }
1859:     }
1860:     /* count the number of hits */
1861:     nrows = 0;
1862:     for (j=0; j<cm; j++) {
1863:       if (rowhit[j]) nrows++;
1864:     }
1865:     c->nrows[i]      = nrows;
1866:     colorforrow[i+1] = colorforrow[i] + nrows;

1868:     nrows = 0;
1869:     for (j=0; j<cm; j++) { /* loop over rows */
1870:       if (rowhit[j]) {
1871:         rows_i[nrows]   = j;
1872:         den2sp_i[nrows] = idxhit[j];
1873:         nrows++;
1874:       }
1875:     }
1876:     den2sp_i += nrows;

1878:     ISRestoreIndices(isa[i],&is);
1879:     rows_i += nrows;
1880:   }
1881:   MatRestoreColumnIJ_SeqAIJ_Color(mat,0,PETSC_FALSE,PETSC_FALSE,&ncols,&ci,&cj,&spidx,NULL);
1882:   PetscFree(rowhit);
1883:   ISColoringRestoreIS(iscoloring,PETSC_USE_POINTER,&isa);

1886:   c->colorforrow = colorforrow;
1887:   c->rows        = rows;
1888:   c->den2sp      = den2sp;
1889:   c->colorforcol = colorforcol;
1890:   c->columns     = columns;

1892:   PetscFree(idxhit);
1893:   return 0;
1894: }

1896: /* --------------------------------------------------------------- */
1897: static PetscErrorCode MatProductNumeric_AtB_SeqAIJ_SeqAIJ(Mat C)
1898: {
1899:   Mat_Product    *product = C->product;
1900:   Mat            A=product->A,B=product->B;

1902:   if (C->ops->mattransposemultnumeric) {
1903:     /* Alg: "outerproduct" */
1904:     (*C->ops->mattransposemultnumeric)(A,B,C);
1905:   } else {
1906:     /* Alg: "matmatmult" -- C = At*B */
1907:     Mat_MatTransMatMult *atb = (Mat_MatTransMatMult *)product->data;
1908:     Mat                 At;

1911:     At = atb->At;
1912:     if (atb->updateAt && At) { /* At is computed in MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ() */
1913:       MatTranspose_SeqAIJ(A,MAT_REUSE_MATRIX,&At);
1914:     }
1915:     MatMatMultNumeric_SeqAIJ_SeqAIJ(At ? At : A,B,C);
1916:     atb->updateAt = PETSC_TRUE;
1917:   }
1918:   return 0;
1919: }

1921: static PetscErrorCode MatProductSymbolic_AtB_SeqAIJ_SeqAIJ(Mat C)
1922: {
1923:   Mat_Product    *product = C->product;
1924:   Mat            A=product->A,B=product->B;
1925:   PetscReal      fill=product->fill;

1927:   MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(A,B,fill,C);

1929:   C->ops->productnumeric = MatProductNumeric_AtB_SeqAIJ_SeqAIJ;
1930:   return 0;
1931: }

1933: /* --------------------------------------------------------------- */
1934: static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AB(Mat C)
1935: {
1937:   Mat_Product    *product = C->product;
1938:   PetscInt       alg = 0; /* default algorithm */
1939:   PetscBool      flg = PETSC_FALSE;
1940: #if !defined(PETSC_HAVE_HYPRE)
1941:   const char     *algTypes[7] = {"sorted","scalable","scalable_fast","heap","btheap","llcondensed","rowmerge"};
1942:   PetscInt       nalg = 7;
1943: #else
1944:   const char     *algTypes[8] = {"sorted","scalable","scalable_fast","heap","btheap","llcondensed","rowmerge","hypre"};
1945:   PetscInt       nalg = 8;
1946: #endif

1948:   /* Set default algorithm */
1949:   PetscStrcmp(C->product->alg,"default",&flg);
1950:   if (flg) {
1951:     MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);
1952:   }

1954:   /* Get runtime option */
1955:   if (product->api_user) {
1956:     PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatMatMult","Mat");
1957:     PetscOptionsEList("-matmatmult_via","Algorithmic approach","MatMatMult",algTypes,nalg,algTypes[0],&alg,&flg);
1958:     PetscOptionsEnd();
1959:   } else {
1960:     PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_AB","Mat");
1961:     PetscOptionsEList("-mat_product_algorithm","Algorithmic approach","MatProduct_AB",algTypes,nalg,algTypes[0],&alg,&flg);
1962:     PetscOptionsEnd();
1963:   }
1964:   if (flg) {
1965:     MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);
1966:   }

1968:   C->ops->productsymbolic = MatProductSymbolic_AB;
1969:   C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqAIJ;
1970:   return 0;
1971: }

1973: static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AtB(Mat C)
1974: {
1976:   Mat_Product    *product = C->product;
1977:   PetscInt       alg = 0; /* default algorithm */
1978:   PetscBool      flg = PETSC_FALSE;
1979:   const char     *algTypes[3] = {"default","at*b","outerproduct"};
1980:   PetscInt       nalg = 3;

1982:   /* Get runtime option */
1983:   if (product->api_user) {
1984:     PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatTransposeMatMult","Mat");
1985:     PetscOptionsEList("-mattransposematmult_via","Algorithmic approach","MatTransposeMatMult",algTypes,nalg,algTypes[alg],&alg,&flg);
1986:     PetscOptionsEnd();
1987:   } else {
1988:     PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_AtB","Mat");
1989:     PetscOptionsEList("-mat_product_algorithm","Algorithmic approach","MatProduct_AtB",algTypes,nalg,algTypes[alg],&alg,&flg);
1990:     PetscOptionsEnd();
1991:   }
1992:   if (flg) {
1993:     MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);
1994:   }

1996:   C->ops->productsymbolic = MatProductSymbolic_AtB_SeqAIJ_SeqAIJ;
1997:   return 0;
1998: }

2000: static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABt(Mat C)
2001: {
2003:   Mat_Product    *product = C->product;
2004:   PetscInt       alg = 0; /* default algorithm */
2005:   PetscBool      flg = PETSC_FALSE;
2006:   const char     *algTypes[2] = {"default","color"};
2007:   PetscInt       nalg = 2;

2009:   /* Set default algorithm */
2010:   PetscStrcmp(C->product->alg,"default",&flg);
2011:   if (!flg) {
2012:     alg = 1;
2013:     MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);
2014:   }

2016:   /* Get runtime option */
2017:   if (product->api_user) {
2018:     PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatMatTransposeMult","Mat");
2019:     PetscOptionsEList("-matmattransmult_via","Algorithmic approach","MatMatTransposeMult",algTypes,nalg,algTypes[alg],&alg,&flg);
2020:     PetscOptionsEnd();
2021:   } else {
2022:     PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_ABt","Mat");
2023:     PetscOptionsEList("-mat_product_algorithm","Algorithmic approach","MatProduct_ABt",algTypes,nalg,algTypes[alg],&alg,&flg);
2024:     PetscOptionsEnd();
2025:   }
2026:   if (flg) {
2027:     MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);
2028:   }

2030:   C->ops->mattransposemultsymbolic = MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ;
2031:   C->ops->productsymbolic          = MatProductSymbolic_ABt;
2032:   return 0;
2033: }

2035: static PetscErrorCode MatProductSetFromOptions_SeqAIJ_PtAP(Mat C)
2036: {
2038:   Mat_Product    *product = C->product;
2039:   PetscBool      flg = PETSC_FALSE;
2040:   PetscInt       alg = 0; /* default algorithm -- alg=1 should be default!!! */
2041: #if !defined(PETSC_HAVE_HYPRE)
2042:   const char     *algTypes[2] = {"scalable","rap"};
2043:   PetscInt       nalg = 2;
2044: #else
2045:   const char     *algTypes[3] = {"scalable","rap","hypre"};
2046:   PetscInt       nalg = 3;
2047: #endif

2049:   /* Set default algorithm */
2050:   PetscStrcmp(product->alg,"default",&flg);
2051:   if (flg) {
2052:     MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);
2053:   }

2055:   /* Get runtime option */
2056:   if (product->api_user) {
2057:     PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatPtAP","Mat");
2058:     PetscOptionsEList("-matptap_via","Algorithmic approach","MatPtAP",algTypes,nalg,algTypes[0],&alg,&flg);
2059:     PetscOptionsEnd();
2060:   } else {
2061:     PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_PtAP","Mat");
2062:     PetscOptionsEList("-mat_product_algorithm","Algorithmic approach","MatProduct_PtAP",algTypes,nalg,algTypes[0],&alg,&flg);
2063:     PetscOptionsEnd();
2064:   }
2065:   if (flg) {
2066:     MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);
2067:   }

2069:   C->ops->productsymbolic = MatProductSymbolic_PtAP_SeqAIJ_SeqAIJ;
2070:   return 0;
2071: }

2073: static PetscErrorCode MatProductSetFromOptions_SeqAIJ_RARt(Mat C)
2074: {
2076:   Mat_Product    *product = C->product;
2077:   PetscBool      flg = PETSC_FALSE;
2078:   PetscInt       alg = 0; /* default algorithm */
2079:   const char     *algTypes[3] = {"r*a*rt","r*art","coloring_rart"};
2080:   PetscInt        nalg = 3;

2082:   /* Set default algorithm */
2083:   PetscStrcmp(product->alg,"default",&flg);
2084:   if (flg) {
2085:     MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);
2086:   }

2088:   /* Get runtime option */
2089:   if (product->api_user) {
2090:     PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatRARt","Mat");
2091:     PetscOptionsEList("-matrart_via","Algorithmic approach","MatRARt",algTypes,nalg,algTypes[0],&alg,&flg);
2092:     PetscOptionsEnd();
2093:   } else {
2094:     PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_RARt","Mat");
2095:     PetscOptionsEList("-mat_product_algorithm","Algorithmic approach","MatProduct_RARt",algTypes,nalg,algTypes[0],&alg,&flg);
2096:     PetscOptionsEnd();
2097:   }
2098:   if (flg) {
2099:     MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);
2100:   }

2102:   C->ops->productsymbolic = MatProductSymbolic_RARt_SeqAIJ_SeqAIJ;
2103:   return 0;
2104: }

2106: /* ABC = A*B*C = A*(B*C); ABC's algorithm must be chosen from AB's algorithm */
2107: static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABC(Mat C)
2108: {
2110:   Mat_Product    *product = C->product;
2111:   PetscInt       alg = 0; /* default algorithm */
2112:   PetscBool      flg = PETSC_FALSE;
2113:   const char     *algTypes[7] = {"sorted","scalable","scalable_fast","heap","btheap","llcondensed","rowmerge"};
2114:   PetscInt       nalg = 7;

2116:   /* Set default algorithm */
2117:   PetscStrcmp(product->alg,"default",&flg);
2118:   if (flg) {
2119:     MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);
2120:   }

2122:   /* Get runtime option */
2123:   if (product->api_user) {
2124:     PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatMatMatMult","Mat");
2125:     PetscOptionsEList("-matmatmatmult_via","Algorithmic approach","MatMatMatMult",algTypes,nalg,algTypes[alg],&alg,&flg);
2126:     PetscOptionsEnd();
2127:   } else {
2128:     PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_ABC","Mat");
2129:     PetscOptionsEList("-mat_product_algorithm","Algorithmic approach","MatProduct_ABC",algTypes,nalg,algTypes[alg],&alg,&flg);
2130:     PetscOptionsEnd();
2131:   }
2132:   if (flg) {
2133:     MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]);
2134:   }

2136:   C->ops->matmatmultsymbolic = MatMatMatMultSymbolic_SeqAIJ_SeqAIJ_SeqAIJ;
2137:   C->ops->productsymbolic    = MatProductSymbolic_ABC;
2138:   return 0;
2139: }

2141: PetscErrorCode MatProductSetFromOptions_SeqAIJ(Mat C)
2142: {
2143:   Mat_Product    *product = C->product;

2145:   switch (product->type) {
2146:   case MATPRODUCT_AB:
2147:     MatProductSetFromOptions_SeqAIJ_AB(C);
2148:     break;
2149:   case MATPRODUCT_AtB:
2150:     MatProductSetFromOptions_SeqAIJ_AtB(C);
2151:     break;
2152:   case MATPRODUCT_ABt:
2153:     MatProductSetFromOptions_SeqAIJ_ABt(C);
2154:     break;
2155:   case MATPRODUCT_PtAP:
2156:     MatProductSetFromOptions_SeqAIJ_PtAP(C);
2157:     break;
2158:   case MATPRODUCT_RARt:
2159:     MatProductSetFromOptions_SeqAIJ_RARt(C);
2160:     break;
2161:   case MATPRODUCT_ABC:
2162:     MatProductSetFromOptions_SeqAIJ_ABC(C);
2163:     break;
2164:   default:
2165:     break;
2166:   }
2167:   return 0;
2168: }