Actual source code: matmatmult.c

petsc-3.3-p7 2013-05-11
  2: /*
  3:   Defines matrix-matrix product routines for pairs of SeqAIJ matrices
  4:           C = A * B
  5: */

  7: #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
  8: #include <../src/mat/utils/freespace.h>
  9: #include <petscbt.h>
 10: #include <../src/mat/impls/dense/seq/dense.h> /*I "petscmat.h" I*/
 11: /*
 12: #define DEBUG_MATMATMULT
 13:  */
 14: EXTERN_C_BEGIN
 17: PetscErrorCode MatMatMult_SeqAIJ_SeqAIJ(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
 18: {
 20:   PetscBool      scalable=PETSC_FALSE,scalable_fast=PETSC_FALSE;

 23:   if (scall == MAT_INITIAL_MATRIX){
 24:     PetscObjectOptionsBegin((PetscObject)A);
 25:     PetscOptionsBool("-matmatmult_scalable","Use a scalable but slower C=A*B","",scalable,&scalable,PETSC_NULL);
 26:     PetscOptionsBool("-matmatmult_scalable_fast","Use a scalable but slower C=A*B","",scalable_fast,&scalable_fast,PETSC_NULL);
 27:     PetscOptionsEnd();
 28:     PetscLogEventBegin(MAT_MatMultSymbolic,A,B,0,0);
 29:     if (scalable_fast){
 30:       MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(A,B,fill,C);
 31:     } else if (scalable){
 32:       MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(A,B,fill,C);
 33:     } else {
 34:       MatMatMultSymbolic_SeqAIJ_SeqAIJ(A,B,fill,C);
 35:     }
 36:     PetscLogEventEnd(MAT_MatMultSymbolic,A,B,0,0);
 37:   }
 38:   PetscLogEventBegin(MAT_MatMultNumeric,A,B,0,0);
 39:   (*(*C)->ops->matmultnumeric)(A,B,*C);
 40:   PetscLogEventEnd(MAT_MatMultNumeric,A,B,0,0);
 41:   return(0);
 42: }
 43: EXTERN_C_END

 47: PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat *C)
 48: {
 49:   PetscErrorCode     ierr;
 50:   Mat_SeqAIJ         *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
 51:   PetscInt           *ai=a->i,*bi=b->i,*ci,*cj;
 52:   PetscInt           am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
 53:   PetscReal          afill;
 54:   PetscInt           i,j,anzi,brow,bnzj,cnzi,*bj,*aj,nlnk_max,*lnk,ndouble=0;
 55:   PetscBT            lnkbt;
 56:   PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;

 59:   /* Get ci and cj */
 60:   /*---------------*/
 61:   /* Allocate ci array, arrays for fill computation and */
 62:   /* free space for accumulating nonzero column info */
 63:   PetscMalloc(((am+1)+1)*sizeof(PetscInt),&ci);
 64:   ci[0] = 0;
 65: 
 66:   /* create and initialize a linked list */
 67:   nlnk_max = a->rmax*b->rmax;
 68:   if (!nlnk_max || nlnk_max > bn) nlnk_max = bn;
 69:   PetscLLCondensedCreate(nlnk_max,bn,&lnk,&lnkbt);

 71:   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
 72:   PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+bi[bm])),&free_space);
 73:   current_space = free_space;

 75:   /* Determine ci and cj */
 76:   for (i=0; i<am; i++) {
 77:     anzi = ai[i+1] - ai[i];
 78:     aj   = a->j + ai[i];
 79:     for (j=0; j<anzi; j++){
 80:       brow = aj[j];
 81:       bnzj = bi[brow+1] - bi[brow];
 82:       bj   = b->j + bi[brow];
 83:       /* add non-zero cols of B into the sorted linked list lnk */
 84:       PetscLLCondensedAddSorted(bnzj,bj,lnk,lnkbt);
 85:     }
 86:     cnzi = lnk[0];

 88:     /* If free space is not available, make more free space */
 89:     /* Double the amount of total space in the list */
 90:     if (current_space->local_remaining<cnzi) {
 91:       PetscFreeSpaceGet(cnzi+current_space->total_array_size,&current_space);
 92:       ndouble++;
 93:     }

 95:     /* Copy data into free space, then initialize lnk */
 96:     PetscLLCondensedClean(bn,cnzi,current_space->array,lnk,lnkbt);
 97:     current_space->array           += cnzi;
 98:     current_space->local_used      += cnzi;
 99:     current_space->local_remaining -= cnzi;
100:     ci[i+1] = ci[i] + cnzi;
101:   }

103:   /* Column indices are in the list of free space */
104:   /* Allocate space for cj, initialize cj, and */
105:   /* destroy list of free space and other temporary array(s) */
106:   PetscMalloc((ci[am]+1)*sizeof(PetscInt),&cj);
107:   PetscFreeSpaceContiguous(&free_space,cj);
108:   PetscLLCondensedDestroy(lnk,lnkbt);
109: 
110:   /* put together the new symbolic matrix */
111:   MatCreateSeqAIJWithArrays(((PetscObject)A)->comm,am,bn,ci,cj,PETSC_NULL,C);
112:   (*C)->rmap->bs = A->rmap->bs;
113:   (*C)->cmap->bs = B->cmap->bs;

115:   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
116:   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
117:   c = (Mat_SeqAIJ *)((*C)->data);
118:   c->free_a  = PETSC_FALSE;
119:   c->free_ij = PETSC_TRUE;
120:   c->nonew   = 0;
121:   (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ; /* fast, needs non-scalable O(bn) array 'abdense' */
122: 
123:   /* set MatInfo */
124:   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
125:   if (afill < 1.0) afill = 1.0;
126:   c->maxnz                     = ci[am];
127:   c->nz                        = ci[am];
128:   (*C)->info.mallocs           = ndouble;
129:   (*C)->info.fill_ratio_given  = fill;
130:   (*C)->info.fill_ratio_needed = afill;

132: #if defined(PETSC_USE_INFO)
133:   if (ci[am]) {
134:     PetscInfo3((*C),"Reallocs %D; Fill ratio: given %G needed %G.\n",ndouble,fill,afill);
135:     PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%G,&C) for best performance.;\n",afill);
136:   } else {
137:     PetscInfo((*C),"Empty matrix product\n");
138:   }
139: #endif
140:   return(0);
141: }

145: PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
146: {
148:   PetscLogDouble flops=0.0;
149:   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
150:   Mat_SeqAIJ     *b = (Mat_SeqAIJ *)B->data;
151:   Mat_SeqAIJ     *c = (Mat_SeqAIJ *)C->data;
152:   PetscInt       *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bjj,*ci=c->i,*cj=c->j;
153:   PetscInt       am=A->rmap->n,cm=C->rmap->n;
154:   PetscInt       i,j,k,anzi,bnzi,cnzi,brow;
155:   PetscScalar    *aa=a->a,*ba=b->a,*baj,*ca,valtmp;
156:   PetscScalar    *ab_dense;
157: 
159:   /* printf("MatMatMultNumeric_SeqAIJ_SeqAIJ...ca %p\n",c->a); */
160:   if (!c->a){ /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */
161:     PetscMalloc((ci[cm]+1)*sizeof(MatScalar),&ca);
162:     c->a      = ca;
163:     c->free_a = PETSC_TRUE;

165:     PetscMalloc(B->cmap->N*sizeof(PetscScalar),&ab_dense);
166:     PetscMemzero(ab_dense,B->cmap->N*sizeof(PetscScalar));
167:     c->matmult_abdense = ab_dense;
168:   } else {
169:     ca       = c->a;
170:     ab_dense = c->matmult_abdense;
171:   }

173:   /* clean old values in C */
174:   PetscMemzero(ca,ci[cm]*sizeof(MatScalar));
175:   /* Traverse A row-wise. */
176:   /* Build the ith row in C by summing over nonzero columns in A, */
177:   /* the rows of B corresponding to nonzeros of A. */
178:   for (i=0; i<am; i++) {
179:     anzi = ai[i+1] - ai[i];
180:     for (j=0; j<anzi; j++) {
181:       brow = aj[j];
182:       bnzi = bi[brow+1] - bi[brow];
183:       bjj  = bj + bi[brow];
184:       baj  = ba + bi[brow];
185:       /* perform dense axpy */
186:       valtmp = aa[j];
187:       for (k=0; k<bnzi; k++) {
188:         ab_dense[bjj[k]] += valtmp*baj[k];
189:       }
190:       flops += 2*bnzi;
191:     }
192:     aj += anzi; aa += anzi;

194:     cnzi = ci[i+1] - ci[i];
195:     for (k=0; k<cnzi; k++) {
196:       ca[k]          += ab_dense[cj[k]];
197:       ab_dense[cj[k]] = 0.0; /* zero ab_dense */
198:     }
199:     flops += cnzi;
200:     cj += cnzi; ca += cnzi;
201:   }
202:   MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
203:   MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
204:   PetscLogFlops(flops);
205:   return(0);
206: }

210: PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(Mat A,Mat B,Mat C)
211: {
213:   PetscLogDouble flops=0.0;
214:   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
215:   Mat_SeqAIJ     *b = (Mat_SeqAIJ *)B->data;
216:   Mat_SeqAIJ     *c = (Mat_SeqAIJ *)C->data;
217:   PetscInt       *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bjj,*ci=c->i,*cj=c->j;
218:   PetscInt       am=A->rmap->N,cm=C->rmap->N;
219:   PetscInt       i,j,k,anzi,bnzi,cnzi,brow;
220:   PetscScalar    *aa=a->a,*ba=b->a,*baj,*ca=c->a,valtmp;
221:   PetscInt       nextb;
222: 
224: #if defined(DEBUG_MATMATMULT)
225:   PetscPrintf(PETSC_COMM_SELF,"MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable...\n");
226: #endif
227:   /* clean old values in C */
228:   PetscMemzero(ca,ci[cm]*sizeof(MatScalar));
229:   /* Traverse A row-wise. */
230:   /* Build the ith row in C by summing over nonzero columns in A, */
231:   /* the rows of B corresponding to nonzeros of A. */
232:   for (i=0;i<am;i++) {
233:     anzi = ai[i+1] - ai[i];
234:     cnzi = ci[i+1] - ci[i];
235:     for (j=0;j<anzi;j++) {
236:       brow = aj[j];
237:       bnzi = bi[brow+1] - bi[brow];
238:       bjj  = bj + bi[brow];
239:       baj  = ba + bi[brow];
240:       /* perform sparse axpy */
241:       valtmp = aa[j];
242:       nextb  = 0;
243:       for (k=0; nextb<bnzi; k++) {
244:         if (cj[k] == bjj[nextb]){ /* ccol == bcol */
245:           ca[k] += valtmp*baj[nextb++];
246:         }
247:       }
248:       flops += 2*bnzi;
249:     }
250:     aj += anzi; aa += anzi;
251:     cj += cnzi; ca += cnzi;
252:   }

254:   MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
255:   MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
256:   PetscLogFlops(flops);
257:   return(0);
258: }

262: PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(Mat A,Mat B,PetscReal fill,Mat *C)
263: {
264:   PetscErrorCode     ierr;
265:   Mat_SeqAIJ         *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
266:   PetscInt           *ai=a->i,*bi=b->i,*ci,*cj;
267:   PetscInt           am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
268:   MatScalar          *ca;
269:   PetscReal          afill;
270:   PetscInt           i,j,anzi,brow,bnzj,cnzi,*bj,*aj,nlnk_max,*lnk,ndouble=0;
271:   PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;

274:   /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_fast() */
275:   /*-----------------------------------------------------------------------------------------*/
276:   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
277:   PetscMalloc(((am+1)+1)*sizeof(PetscInt),&ci);
278:   ci[0] = 0;
279: 
280:   /* create and initialize a linked list */
281:   nlnk_max = a->rmax*b->rmax;
282:   if (!nlnk_max || nlnk_max > bn) nlnk_max = bn; /* in case rmax is not defined for A or B */
283:   PetscLLCondensedCreate_fast(nlnk_max,&lnk);

285:   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
286:   PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+bi[bm])),&free_space);
287:   current_space = free_space;

289:   /* Determine ci and cj */
290:   for (i=0; i<am; i++) {
291:     anzi = ai[i+1] - ai[i];
292:     aj   = a->j + ai[i];
293:     for (j=0; j<anzi; j++){
294:       brow = aj[j];
295:       bnzj = bi[brow+1] - bi[brow];
296:       bj   = b->j + bi[brow];
297:       /* add non-zero cols of B into the sorted linked list lnk */
298:       PetscLLCondensedAddSorted_fast(bnzj,bj,lnk);
299:     }
300:     cnzi = lnk[1];

302:     /* If free space is not available, make more free space */
303:     /* Double the amount of total space in the list */
304:     if (current_space->local_remaining<cnzi) {
305:       PetscFreeSpaceGet(cnzi+current_space->total_array_size,&current_space);
306:       ndouble++;
307:     }

309:     /* Copy data into free space, then initialize lnk */
310:     PetscLLCondensedClean_fast(cnzi,current_space->array,lnk);
311:     current_space->array           += cnzi;
312:     current_space->local_used      += cnzi;
313:     current_space->local_remaining -= cnzi;
314:     ci[i+1] = ci[i] + cnzi;
315:   }

317:   /* Column indices are in the list of free space */
318:   /* Allocate space for cj, initialize cj, and */
319:   /* destroy list of free space and other temporary array(s) */
320:   PetscMalloc((ci[am]+1)*sizeof(PetscInt),&cj);
321:   PetscFreeSpaceContiguous(&free_space,cj);
322:   PetscLLCondensedDestroy_fast(lnk);
323: 
324:   /* Allocate space for ca */
325:   PetscMalloc((ci[am]+1)*sizeof(MatScalar),&ca);
326:   PetscMemzero(ca,(ci[am]+1)*sizeof(MatScalar));
327: 
328:   /* put together the new symbolic matrix */
329:   MatCreateSeqAIJWithArrays(((PetscObject)A)->comm,am,bn,ci,cj,ca,C);
330:   (*C)->rmap->bs = A->rmap->bs;
331:   (*C)->cmap->bs = B->cmap->bs;

333:   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
334:   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
335:   c = (Mat_SeqAIJ *)((*C)->data);
336:   c->free_a   = PETSC_TRUE;
337:   c->free_ij  = PETSC_TRUE;
338:   c->nonew    = 0;
339:   (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable; /* slower, less memory */
340: 
341:   /* set MatInfo */
342:   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
343:   if (afill < 1.0) afill = 1.0;
344:   c->maxnz                     = ci[am];
345:   c->nz                        = ci[am];
346:   (*C)->info.mallocs           = ndouble;
347:   (*C)->info.fill_ratio_given  = fill;
348:   (*C)->info.fill_ratio_needed = afill;

350: #if defined(PETSC_USE_INFO)
351:   if (ci[am]) {
352:     PetscInfo3((*C),"Reallocs %D; Fill ratio: given %G needed %G.\n",ndouble,fill,afill);
353:     PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%G,&C) for best performance.;\n",afill);
354:   } else {
355:     PetscInfo((*C),"Empty matrix product\n");
356:   }
357: #endif
358:   return(0);
359: }


364: PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(Mat A,Mat B,PetscReal fill,Mat *C)
365: {
366:   PetscErrorCode     ierr;
367:   Mat_SeqAIJ         *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
368:   PetscInt           *ai=a->i,*bi=b->i,*ci,*cj;
369:   PetscInt           am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
370:   MatScalar          *ca;
371:   PetscReal          afill;
372:   PetscInt           i,j,anzi,brow,bnzj,cnzi,*bj,*aj,nlnk_max,*lnk,ndouble=0;
373:   PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;

376:   /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_Scalalbe() */
377:   /*---------------------------------------------------------------------------------------------*/
378:   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
379:   PetscMalloc(((am+1)+1)*sizeof(PetscInt),&ci);
380:   ci[0] = 0;
381: 
382:   /* create and initialize a linked list */
383:   nlnk_max = a->rmax*b->rmax;
384:   if (!nlnk_max || nlnk_max > bn) nlnk_max = bn; /* in case rmax is not defined for A or B */
385:   PetscLLCondensedCreate_Scalable(nlnk_max,&lnk);

387:   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
388:   PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+bi[bm])),&free_space);
389:   current_space = free_space;

391:   /* Determine ci and cj */
392:   for (i=0; i<am; i++) {
393:     anzi = ai[i+1] - ai[i];
394:     aj   = a->j + ai[i];
395:     for (j=0; j<anzi; j++){
396:       brow = aj[j];
397:       bnzj = bi[brow+1] - bi[brow];
398:       bj   = b->j + bi[brow];
399:       /* add non-zero cols of B into the sorted linked list lnk */
400:       PetscLLCondensedAddSorted_Scalable(bnzj,bj,lnk);
401:     }
402:     cnzi = lnk[0];

404:     /* If free space is not available, make more free space */
405:     /* Double the amount of total space in the list */
406:     if (current_space->local_remaining<cnzi) {
407:       PetscFreeSpaceGet(cnzi+current_space->total_array_size,&current_space);
408:       ndouble++;
409:     }

411:     /* Copy data into free space, then initialize lnk */
412:     PetscLLCondensedClean_Scalable(cnzi,current_space->array,lnk);
413:     current_space->array           += cnzi;
414:     current_space->local_used      += cnzi;
415:     current_space->local_remaining -= cnzi;
416:     ci[i+1] = ci[i] + cnzi;
417:   }

419:   /* Column indices are in the list of free space */
420:   /* Allocate space for cj, initialize cj, and */
421:   /* destroy list of free space and other temporary array(s) */
422:   PetscMalloc((ci[am]+1)*sizeof(PetscInt),&cj);
423:   PetscFreeSpaceContiguous(&free_space,cj);
424:   PetscLLCondensedDestroy_Scalable(lnk);
425: 
426:   /* Allocate space for ca */
427:   /*-----------------------*/
428:   PetscMalloc((ci[am]+1)*sizeof(MatScalar),&ca);
429:   PetscMemzero(ca,(ci[am]+1)*sizeof(MatScalar));
430: 
431:   /* put together the new symbolic matrix */
432:   MatCreateSeqAIJWithArrays(((PetscObject)A)->comm,am,bn,ci,cj,ca,C);
433:   (*C)->rmap->bs = A->rmap->bs;
434:   (*C)->cmap->bs = B->cmap->bs;

436:   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
437:   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
438:   c = (Mat_SeqAIJ *)((*C)->data);
439:   c->free_a   = PETSC_TRUE;
440:   c->free_ij  = PETSC_TRUE;
441:   c->nonew    = 0;
442:   (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable; /* slower, less memory */
443: 
444:   /* set MatInfo */
445:   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
446:   if (afill < 1.0) afill = 1.0;
447:   c->maxnz                     = ci[am];
448:   c->nz                        = ci[am];
449:   (*C)->info.mallocs           = ndouble;
450:   (*C)->info.fill_ratio_given  = fill;
451:   (*C)->info.fill_ratio_needed = afill;

453: #if defined(PETSC_USE_INFO)
454:   if (ci[am]) {
455:     PetscInfo3((*C),"Reallocs %D; Fill ratio: given %G needed %G.\n",ndouble,fill,afill);
456:     PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%G,&C) for best performance.;\n",afill);
457:   } else {
458:     PetscInfo((*C),"Empty matrix product\n");
459:   }
460: #endif
461:   return(0);
462: }


465: /* This routine is not used. Should be removed! */
468: PetscErrorCode MatMatTransposeMult_SeqAIJ_SeqAIJ(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
469: {
471: 
473:   if (scall == MAT_INITIAL_MATRIX){
474:     MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ(A,B,fill,C);
475:   }
476:   MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ(A,B,*C);
477:   return(0);
478: }

482: PetscErrorCode PetscContainerDestroy_Mat_MatMatTransMult(void *ptr)
483: {
484:   PetscErrorCode      ierr;
485:   Mat_MatMatTransMult *multtrans=(Mat_MatMatTransMult*)ptr;

488:   MatTransposeColoringDestroy(&multtrans->matcoloring);
489:   MatDestroy(&multtrans->Bt_den);
490:   MatDestroy(&multtrans->ABt_den);
491:   PetscFree(multtrans);
492:   return(0);
493: }

497: PetscErrorCode MatDestroy_SeqAIJ_MatMatMultTrans(Mat A)
498: {
499:   PetscErrorCode      ierr;
500:   PetscContainer      container;
501:   Mat_MatMatTransMult *multtrans=PETSC_NULL;

504:   PetscObjectQuery((PetscObject)A,"Mat_MatMatTransMult",(PetscObject *)&container);
505:   if (!container) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Container does not exit");
506:   PetscContainerGetPointer(container,(void **)&multtrans);
507:   A->ops->destroy   = multtrans->destroy;
508:   if (A->ops->destroy) {
509:     (*A->ops->destroy)(A);
510:   }
511:   PetscObjectCompose((PetscObject)A,"Mat_MatMatTransMult",0);
512:   return(0);
513: }

517: PetscErrorCode MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat *C)
518: {
519:   PetscErrorCode      ierr;
520:   Mat                 Bt;
521:   PetscInt            *bti,*btj;
522:   Mat_MatMatTransMult *multtrans;
523:   PetscContainer      container;
524:   PetscLogDouble      t0,tf,etime2=0.0;
525: 
527:   PetscGetTime(&t0);
528:    /* create symbolic Bt */
529:   MatGetSymbolicTranspose_SeqAIJ(B,&bti,&btj);
530:   MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,B->cmap->n,B->rmap->n,bti,btj,PETSC_NULL,&Bt);
531:   Bt->rmap->bs = A->cmap->bs;
532:   Bt->cmap->bs = B->cmap->bs;

534:   /* get symbolic C=A*Bt */
535:   MatMatMultSymbolic_SeqAIJ_SeqAIJ(A,Bt,fill,C);

537:   /* create a supporting struct for reuse intermidiate dense matrices with matcoloring */
538:   PetscNew(Mat_MatMatTransMult,&multtrans);

540:   /* attach the supporting struct to C */
541:   PetscContainerCreate(PETSC_COMM_SELF,&container);
542:   PetscContainerSetPointer(container,multtrans);
543:   PetscContainerSetUserDestroy(container,PetscContainerDestroy_Mat_MatMatTransMult);
544:   PetscObjectCompose((PetscObject)(*C),"Mat_MatMatTransMult",(PetscObject)container);
545:   PetscContainerDestroy(&container);

547:   multtrans->usecoloring = PETSC_FALSE;
548:   multtrans->destroy = (*C)->ops->destroy;
549:   (*C)->ops->destroy = MatDestroy_SeqAIJ_MatMatMultTrans;

551:   PetscGetTime(&tf);
552:   etime2 += tf - t0;

554:   PetscOptionsGetBool(PETSC_NULL,"-matmattransmult_color",&multtrans->usecoloring,PETSC_NULL);
555:   if (multtrans->usecoloring){
556:     /* Create MatTransposeColoring from symbolic C=A*B^T */
557:     MatTransposeColoring matcoloring;
558:     ISColoring           iscoloring;
559:     Mat                  Bt_dense,C_dense;
560:     PetscLogDouble       etime0=0.0,etime01=0.0,etime1=0.0;
561: 
562:     PetscGetTime(&t0);
563:     MatGetColoring(*C,MATCOLORINGLF,&iscoloring);
564:     PetscGetTime(&tf);
565:     etime0 += tf - t0;

567:     PetscGetTime(&t0);
568:     MatTransposeColoringCreate(*C,iscoloring,&matcoloring);
569:     multtrans->matcoloring = matcoloring;
570:     ISColoringDestroy(&iscoloring);
571:     PetscGetTime(&tf);
572:     etime01 += tf - t0;

574:     PetscGetTime(&t0);
575:     /* Create Bt_dense and C_dense = A*Bt_dense */
576:     MatCreate(PETSC_COMM_SELF,&Bt_dense);
577:     MatSetSizes(Bt_dense,A->cmap->n,matcoloring->ncolors,A->cmap->n,matcoloring->ncolors);
578:     MatSetType(Bt_dense,MATSEQDENSE);
579:     MatSeqDenseSetPreallocation(Bt_dense,PETSC_NULL);
580:     Bt_dense->assembled = PETSC_TRUE;
581:     multtrans->Bt_den = Bt_dense;
582: 
583:     MatCreate(PETSC_COMM_SELF,&C_dense);
584:     MatSetSizes(C_dense,A->rmap->n,matcoloring->ncolors,A->rmap->n,matcoloring->ncolors);
585:     MatSetType(C_dense,MATSEQDENSE);
586:     MatSeqDenseSetPreallocation(C_dense,PETSC_NULL);
587:     Bt_dense->assembled = PETSC_TRUE;
588:     multtrans->ABt_den = C_dense;
589:     PetscGetTime(&tf);
590:     etime1 += tf - t0;

592: #if defined(PETSC_USE_INFO)
593:     {
594:     Mat_SeqAIJ *c=(Mat_SeqAIJ*)(*C)->data;
595:     PetscInfo5(*C,"Bt_dense: %D,%D; Cnz %D / (cm*ncolors %D) = %g\n",A->cmap->n,matcoloring->ncolors,c->nz,A->rmap->n*matcoloring->ncolors,(PetscReal)(c->nz)/(A->rmap->n*matcoloring->ncolors));
596:     PetscInfo5(*C,"Sym = GetColor %g + ColorCreate %g + MatDenseCreate %g + non-colorSym %g = %g\n",etime0,etime01,etime1,etime2,etime0+etime01+etime1+etime2);
597:     }
598: #endif
599:   }
600:   /* clean up */
601:   MatDestroy(&Bt);
602:   MatRestoreSymbolicTranspose_SeqAIJ(B,&bti,&btj);

604: 
605: 
606: #if defined(INEFFICIENT_ALGORITHM)
607:   /* The algorithm below computes am*bm sparse inner-product - inefficient! It will be deleted later. */
608:   PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
609:   Mat_SeqAIJ         *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
610:   PetscInt           *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*ci,*cj,*acol,*bcol;
611:   PetscInt           am=A->rmap->N,bm=B->rmap->N;
612:   PetscInt           i,j,anzi,bnzj,cnzi,nlnk,*lnk,nspacedouble=0,ka,kb,index[1];
613:   MatScalar          *ca;
614:   PetscBT            lnkbt;
615:   PetscReal          afill;

617:   /* Allocate row pointer array ci  */
618:   PetscMalloc(((am+1)+1)*sizeof(PetscInt),&ci);
619:   ci[0] = 0;
620: 
621:   /* Create and initialize a linked list for C columns */
622:   nlnk = bm+1;
623:   PetscLLCreate(bm,bm,nlnk,lnk,lnkbt);

625:   /* Initial FreeSpace with size fill*(nnz(A)+nnz(B)) */
626:   PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+bi[bm])),&free_space);
627:   current_space = free_space;

629:   /* Determine symbolic info for each row of the product A*B^T: */
630:   for (i=0; i<am; i++) {
631:     anzi = ai[i+1] - ai[i];
632:     cnzi = 0;
633:     acol = aj + ai[i];
634:     for (j=0; j<bm; j++){
635:       bnzj = bi[j+1] - bi[j];
636:       bcol= bj + bi[j];
637:       /* sparse inner-product c(i,j)=A[i,:]*B[j,:]^T */
638:       ka = 0; kb = 0;
639:       while (ka < anzi && kb < bnzj){
640:         while (acol[ka] < bcol[kb] && ka < anzi) ka++;
641:         if (ka == anzi) break;
642:         while (acol[ka] > bcol[kb] && kb < bnzj) kb++;
643:         if (kb == bnzj) break;
644:         if (acol[ka] == bcol[kb]){ /* add nonzero c(i,j) to lnk */
645:           index[0] = j;
646:           PetscLLAdd(1,index,bm,nlnk,lnk,lnkbt);
647:           cnzi++;
648:           break;
649:         }
650:       }
651:     }
652: 
653:     /* If free space is not available, make more free space */
654:     /* Double the amount of total space in the list */
655:     if (current_space->local_remaining<cnzi) {
656:       PetscFreeSpaceGet(cnzi+current_space->total_array_size,&current_space);
657:       nspacedouble++;
658:     }

660:     /* Copy data into free space, then initialize lnk */
661:     PetscLLClean(bm,bm,cnzi,lnk,current_space->array,lnkbt);
662:     current_space->array           += cnzi;
663:     current_space->local_used      += cnzi;
664:     current_space->local_remaining -= cnzi;
665: 
666:     ci[i+1] = ci[i] + cnzi;
667:   }
668: 

670:   /* Column indices are in the list of free space. 
671:      Allocate array cj, copy column indices to cj, and destroy list of free space */
672:   PetscMalloc((ci[am]+1)*sizeof(PetscInt),&cj);
673:   PetscFreeSpaceContiguous(&free_space,cj);
674:   PetscLLDestroy(lnk,lnkbt);
675: 
676:   /* Allocate space for ca */
677:   PetscMalloc((ci[am]+1)*sizeof(MatScalar),&ca);
678:   PetscMemzero(ca,(ci[am]+1)*sizeof(MatScalar));
679: 
680:   /* put together the new symbolic matrix */
681:   MatCreateSeqAIJWithArrays(((PetscObject)A)->comm,am,bm,ci,cj,ca,C);
682:   (*C)->rmap->bs = A->cmap->bs;
683:   (*C)->cmap->bs = B->cmap->bs;

685:   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
686:   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
687:   c = (Mat_SeqAIJ *)((*C)->data);
688:   c->free_a   = PETSC_TRUE;
689:   c->free_ij  = PETSC_TRUE;
690:   c->nonew    = 0;

692:   /* set MatInfo */
693:   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
694:   if (afill < 1.0) afill = 1.0;
695:   c->maxnz                     = ci[am];
696:   c->nz                        = ci[am];
697:   (*C)->info.mallocs           = nspacedouble;
698:   (*C)->info.fill_ratio_given  = fill;
699:   (*C)->info.fill_ratio_needed = afill;

701: #if defined(PETSC_USE_INFO)
702:   if (ci[am]) {
703:     PetscInfo3((*C),"Reallocs %D; Fill ratio: given %G needed %G.\n",nspacedouble,fill,afill);
704:     PetscInfo1((*C),"Use MatMatTransposeMult(A,B,MatReuse,%G,&C) for best performance.;\n",afill);
705:   } else {
706:     PetscInfo((*C),"Empty matrix product\n");
707:   }
708: #endif
709: #endif
710:   return(0);
711: }

713: /* #define USE_ARRAY - for sparse dot product. Slower than !USE_ARRAY */
716: PetscErrorCode MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
717: {
719:   Mat_SeqAIJ     *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c=(Mat_SeqAIJ*)C->data;
720:   PetscInt       *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,anzi,bnzj,nexta,nextb,*acol,*bcol,brow;
721:   PetscInt       cm=C->rmap->n,*ci=c->i,*cj=c->j,i,j,cnzi,*ccol;
722:   PetscLogDouble flops=0.0;
723:   MatScalar      *aa=a->a,*aval,*ba=b->a,*bval,*ca,*cval;
724:   Mat_MatMatTransMult *multtrans;
725:   PetscContainer      container;
726: #if defined(USE_ARRAY)
727:   MatScalar      *spdot;
728: #endif

731:   PetscObjectQuery((PetscObject)C,"Mat_MatMatTransMult",(PetscObject *)&container);
732:   if (!container) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Container does not exit");
733:   PetscContainerGetPointer(container,(void **)&multtrans);
734:   if (multtrans->usecoloring){
735:     MatTransposeColoring  matcoloring = multtrans->matcoloring;
736:     Mat                   Bt_dense;
737:     PetscInt              m,n;
738:     PetscLogDouble t0,tf,etime0=0.0,etime1=0.0,etime2=0.0;
739:     Mat C_dense = multtrans->ABt_den;

741:     Bt_dense = multtrans->Bt_den;
742:     MatGetLocalSize(Bt_dense,&m,&n);

744:     /* Get Bt_dense by Apply MatTransposeColoring to B */
745:     PetscGetTime(&t0);
746:     MatTransColoringApplySpToDen(matcoloring,B,Bt_dense);
747:     PetscGetTime(&tf);
748:     etime0 += tf - t0;

750:     /* C_dense = A*Bt_dense */
751:     PetscGetTime(&t0);
752:     MatMatMultNumeric_SeqAIJ_SeqDense(A,Bt_dense,C_dense);
753:     PetscGetTime(&tf);
754:     etime2 += tf - t0;
755: 
756:     /* Recover C from C_dense */
757:     PetscGetTime(&t0);
758:     MatTransColoringApplyDenToSp(matcoloring,C_dense,C);
759:     PetscGetTime(&tf);
760:     etime1 += tf - t0;
761: #if defined(PETSC_USE_INFO)
762:     PetscInfo4(C,"Num = ColoringApply: %g %g + Mult_sp_dense %g = %g\n",etime0,etime1,etime2,etime0+etime1+etime2);
763: #endif
764:     return(0);
765:   }

767: #if defined(USE_ARRAY)
768:   /* allocate an array for implementing sparse inner-product */
769:   PetscMalloc((A->cmap->n+1)*sizeof(MatScalar),&spdot);
770:   PetscMemzero(spdot,(A->cmap->n+1)*sizeof(MatScalar));
771: #endif

773:   /* clear old values in C */
774:   if (!c->a){
775:     PetscMalloc((ci[cm]+1)*sizeof(MatScalar),&ca);
776:     c->a      = ca;
777:     c->free_a = PETSC_TRUE;
778:   } else {
779:     ca =  c->a;
780:   }
781:   PetscMemzero(ca,ci[cm]*sizeof(MatScalar));

783:   for (i=0; i<cm; i++) {
784:     anzi = ai[i+1] - ai[i];
785:     acol = aj + ai[i];
786:     aval = aa + ai[i];
787:     cnzi = ci[i+1] - ci[i];
788:     ccol = cj + ci[i];
789:     cval = ca + ci[i];
790:     for (j=0; j<cnzi; j++){
791:       brow = ccol[j];
792:       bnzj = bi[brow+1] - bi[brow];
793:       bcol = bj + bi[brow];
794:       bval = ba + bi[brow];

796:       /* perform sparse inner-product c(i,j)=A[i,:]*B[j,:]^T */
797: #if defined(USE_ARRAY)
798:       /* put ba to spdot array */
799:       for (nextb=0; nextb<bnzj; nextb++) spdot[bcol[nextb]] = bval[nextb];
800:       /* c(i,j)=A[i,:]*B[j,:]^T */
801:       for (nexta=0; nexta<anzi; nexta++){
802:         cval[j] += spdot[acol[nexta]]*aval[nexta];
803:       }
804:       /* zero spdot array */
805:       for (nextb=0; nextb<bnzj; nextb++) spdot[bcol[nextb]] = 0.0;
806: #else
807:       nexta = 0; nextb = 0;
808:       while (nexta<anzi && nextb<bnzj){
809:         while (acol[nexta] < bcol[nextb] && nexta < anzi) nexta++;
810:         if (nexta == anzi) break;
811:         while (acol[nexta] > bcol[nextb] && nextb < bnzj) nextb++;
812:         if (nextb == bnzj) break;
813:         if (acol[nexta] == bcol[nextb]){
814:           cval[j] += aval[nexta]*bval[nextb];
815:           nexta++; nextb++;
816:           flops += 2;
817:         }
818:       }
819: #endif
820:     }
821:   }
822:   MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
823:   MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
824:   PetscLogFlops(flops);
825: #if defined(USE_ARRAY)
826:   PetscFree(spdot);
827: #endif
828:   return(0);
829: }

833: PetscErrorCode MatTransposeMatMult_SeqAIJ_SeqAIJ(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C) {

837:   if (scall == MAT_INITIAL_MATRIX){
838:     MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(A,B,fill,C);
839:   }
840:   MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ(A,B,*C);
841:   return(0);
842: }

846: PetscErrorCode MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat *C)
847: {
849:   Mat            At;
850:   PetscInt       *ati,*atj;

853:   /* create symbolic At */
854:   MatGetSymbolicTranspose_SeqAIJ(A,&ati,&atj);
855:   MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,A->cmap->n,A->rmap->n,ati,atj,PETSC_NULL,&At);
856:   At->rmap->bs = A->cmap->bs;
857:   At->cmap->bs = B->cmap->bs;

859:   /* get symbolic C=At*B */
860:   MatMatMultSymbolic_SeqAIJ_SeqAIJ(At,B,fill,C);

862:   /* clean up */
863:   MatDestroy(&At);
864:   MatRestoreSymbolicTranspose_SeqAIJ(A,&ati,&atj);
865:   return(0);
866: }

870: PetscErrorCode MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
871: {
873:   Mat_SeqAIJ     *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c=(Mat_SeqAIJ*)C->data;
874:   PetscInt       am=A->rmap->n,anzi,*ai=a->i,*aj=a->j,*bi=b->i,*bj,bnzi,nextb;
875:   PetscInt       cm=C->rmap->n,*ci=c->i,*cj=c->j,crow,*cjj,i,j,k;
876:   PetscLogDouble flops=0.0;
877:   MatScalar      *aa=a->a,*ba,*ca,*caj;
878: 
880:   if (!c->a){
881:     PetscMalloc((ci[cm]+1)*sizeof(MatScalar),&ca);
882:     c->a      = ca;
883:     c->free_a = PETSC_TRUE;
884:   } else {
885:     ca = c->a;
886:   }
887:   /* clear old values in C */
888:   PetscMemzero(ca,ci[cm]*sizeof(MatScalar));

890:   /* compute A^T*B using outer product (A^T)[:,i]*B[i,:] */
891:   for (i=0;i<am;i++) {
892:     bj   = b->j + bi[i];
893:     ba   = b->a + bi[i];
894:     bnzi = bi[i+1] - bi[i];
895:     anzi = ai[i+1] - ai[i];
896:     for (j=0; j<anzi; j++) {
897:       nextb = 0;
898:       crow  = *aj++;
899:       cjj   = cj + ci[crow];
900:       caj   = ca + ci[crow];
901:       /* perform sparse axpy operation.  Note cjj includes bj. */
902:       for (k=0; nextb<bnzi; k++) {
903:         if (cjj[k] == *(bj+nextb)) { /* ccol == bcol */
904:           caj[k] += (*aa)*(*(ba+nextb));
905:           nextb++;
906:         }
907:       }
908:       flops += 2*bnzi;
909:       aa++;
910:     }
911:   }

913:   /* Assemble the final matrix and clean up */
914:   MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
915:   MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
916:   PetscLogFlops(flops);
917:   return(0);
918: }

920: EXTERN_C_BEGIN
923: PetscErrorCode MatMatMult_SeqAIJ_SeqDense(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
924: {

928:   if (scall == MAT_INITIAL_MATRIX){
929:     MatMatMultSymbolic_SeqAIJ_SeqDense(A,B,fill,C);
930:   }
931:   MatMatMultNumeric_SeqAIJ_SeqDense(A,B,*C);
932:   return(0);
933: }
934: EXTERN_C_END

938: PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat *C)
939: {

943:   MatMatMultSymbolic_SeqDense_SeqDense(A,B,0.0,C);
944:   (*C)->ops->matmult = MatMatMult_SeqAIJ_SeqDense;
945:   return(0);
946: }

950: PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqDense(Mat A,Mat B,Mat C)
951: {
952:   Mat_SeqAIJ     *a=(Mat_SeqAIJ*)A->data;
954:   PetscScalar    *b,*c,r1,r2,r3,r4,*b1,*b2,*b3,*b4;
955:   MatScalar      *aa;
956:   PetscInt       cm=C->rmap->n, cn=B->cmap->n, bm=B->rmap->n, col, i,j,n,*aj, am = A->rmap->n;
957:   PetscInt       am2 = 2*am, am3 = 3*am,  bm4 = 4*bm,colam;

960:   if (!cm || !cn) return(0);
961:   if (bm != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in A %D not equal rows in B %D\n",A->cmap->n,bm);
962:   if (A->rmap->n != C->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number rows in C %D not equal rows in A %D\n",C->rmap->n,A->rmap->n);
963:   if (B->cmap->n != C->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in B %D not equal columns in C %D\n",B->cmap->n,C->cmap->n);
964:   MatGetArray(B,&b);
965:   MatGetArray(C,&c);
966:   b1 = b; b2 = b1 + bm; b3 = b2 + bm; b4 = b3 + bm;
967:   for (col=0; col<cn-4; col += 4){  /* over columns of C */
968:     colam = col*am;
969:     for (i=0; i<am; i++) {        /* over rows of C in those columns */
970:       r1 = r2 = r3 = r4 = 0.0;
971:       n   = a->i[i+1] - a->i[i];
972:       aj  = a->j + a->i[i];
973:       aa  = a->a + a->i[i];
974:       for (j=0; j<n; j++) {
975:         r1 += (*aa)*b1[*aj];
976:         r2 += (*aa)*b2[*aj];
977:         r3 += (*aa)*b3[*aj];
978:         r4 += (*aa++)*b4[*aj++];
979:       }
980:       c[colam + i]       = r1;
981:       c[colam + am + i]  = r2;
982:       c[colam + am2 + i] = r3;
983:       c[colam + am3 + i] = r4;
984:     }
985:     b1 += bm4;
986:     b2 += bm4;
987:     b3 += bm4;
988:     b4 += bm4;
989:   }
990:   for (;col<cn; col++){     /* over extra columns of C */
991:     for (i=0; i<am; i++) {  /* over rows of C in those columns */
992:       r1 = 0.0;
993:       n   = a->i[i+1] - a->i[i];
994:       aj  = a->j + a->i[i];
995:       aa  = a->a + a->i[i];

997:       for (j=0; j<n; j++) {
998:         r1 += (*aa++)*b1[*aj++];
999:       }
1000:       c[col*am + i]     = r1;
1001:     }
1002:     b1 += bm;
1003:   }
1004:   PetscLogFlops(cn*(2.0*a->nz));
1005:   MatRestoreArray(B,&b);
1006:   MatRestoreArray(C,&c);
1007:   MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
1008:   MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
1009:   return(0);
1010: }

1012: /*
1013:    Note very similar to MatMult_SeqAIJ(), should generate both codes from same base
1014: */
1017: PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat A,Mat B,Mat C)
1018: {
1019:   Mat_SeqAIJ     *a=(Mat_SeqAIJ*)A->data;
1021:   PetscScalar    *b,*c,r1,r2,r3,r4,*b1,*b2,*b3,*b4;
1022:   MatScalar      *aa;
1023:   PetscInt       cm=C->rmap->n, cn=B->cmap->n, bm=B->rmap->n, col, i,j,n,*aj, am = A->rmap->n,*ii,arm;
1024:   PetscInt       am2 = 2*am, am3 = 3*am,  bm4 = 4*bm,colam,*ridx;

1027:   if (!cm || !cn) return(0);
1028:   MatGetArray(B,&b);
1029:   MatGetArray(C,&c);
1030:   b1 = b; b2 = b1 + bm; b3 = b2 + bm; b4 = b3 + bm;

1032:   if (a->compressedrow.use){ /* use compressed row format */
1033:     for (col=0; col<cn-4; col += 4){  /* over columns of C */
1034:       colam = col*am;
1035:       arm   = a->compressedrow.nrows;
1036:       ii    = a->compressedrow.i;
1037:       ridx  = a->compressedrow.rindex;
1038:       for (i=0; i<arm; i++) {        /* over rows of C in those columns */
1039:         r1 = r2 = r3 = r4 = 0.0;
1040:         n   = ii[i+1] - ii[i];
1041:         aj  = a->j + ii[i];
1042:         aa  = a->a + ii[i];
1043:         for (j=0; j<n; j++) {
1044:           r1 += (*aa)*b1[*aj];
1045:           r2 += (*aa)*b2[*aj];
1046:           r3 += (*aa)*b3[*aj];
1047:           r4 += (*aa++)*b4[*aj++];
1048:         }
1049:         c[colam       + ridx[i]] += r1;
1050:         c[colam + am  + ridx[i]] += r2;
1051:         c[colam + am2 + ridx[i]] += r3;
1052:         c[colam + am3 + ridx[i]] += r4;
1053:       }
1054:       b1 += bm4;
1055:       b2 += bm4;
1056:       b3 += bm4;
1057:       b4 += bm4;
1058:     }
1059:     for (;col<cn; col++){     /* over extra columns of C */
1060:       colam = col*am;
1061:       arm   = a->compressedrow.nrows;
1062:       ii    = a->compressedrow.i;
1063:       ridx  = a->compressedrow.rindex;
1064:       for (i=0; i<arm; i++) {  /* over rows of C in those columns */
1065:         r1 = 0.0;
1066:         n   = ii[i+1] - ii[i];
1067:         aj  = a->j + ii[i];
1068:         aa  = a->a + ii[i];

1070:         for (j=0; j<n; j++) {
1071:           r1 += (*aa++)*b1[*aj++];
1072:         }
1073:         c[col*am + ridx[i]] += r1;
1074:       }
1075:       b1 += bm;
1076:     }
1077:   } else {
1078:     for (col=0; col<cn-4; col += 4){  /* over columns of C */
1079:       colam = col*am;
1080:       for (i=0; i<am; i++) {        /* over rows of C in those columns */
1081:         r1 = r2 = r3 = r4 = 0.0;
1082:         n   = a->i[i+1] - a->i[i];
1083:         aj  = a->j + a->i[i];
1084:         aa  = a->a + a->i[i];
1085:         for (j=0; j<n; j++) {
1086:           r1 += (*aa)*b1[*aj];
1087:           r2 += (*aa)*b2[*aj];
1088:           r3 += (*aa)*b3[*aj];
1089:           r4 += (*aa++)*b4[*aj++];
1090:         }
1091:         c[colam + i]       += r1;
1092:         c[colam + am + i]  += r2;
1093:         c[colam + am2 + i] += r3;
1094:         c[colam + am3 + i] += r4;
1095:       }
1096:       b1 += bm4;
1097:       b2 += bm4;
1098:       b3 += bm4;
1099:       b4 += bm4;
1100:     }
1101:     for (;col<cn; col++){     /* over extra columns of C */
1102:       for (i=0; i<am; i++) {  /* over rows of C in those columns */
1103:         r1 = 0.0;
1104:         n   = a->i[i+1] - a->i[i];
1105:         aj  = a->j + a->i[i];
1106:         aa  = a->a + a->i[i];

1108:         for (j=0; j<n; j++) {
1109:           r1 += (*aa++)*b1[*aj++];
1110:         }
1111:         c[col*am + i]     += r1;
1112:       }
1113:       b1 += bm;
1114:     }
1115:   }
1116:   PetscLogFlops(cn*2.0*a->nz);
1117:   MatRestoreArray(B,&b);
1118:   MatRestoreArray(C,&c);
1119:   return(0);
1120: }

1124: PetscErrorCode  MatTransColoringApplySpToDen_SeqAIJ(MatTransposeColoring coloring,Mat B,Mat Btdense)
1125: {
1127:   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
1128:   Mat_SeqDense   *btdense = (Mat_SeqDense*)Btdense->data;
1129:   PetscInt       *bi=b->i,*bj=b->j;
1130:   PetscInt       m=Btdense->rmap->n,n=Btdense->cmap->n,j,k,l,col,anz,*btcol,brow,ncolumns;
1131:   MatScalar      *btval,*btval_den,*ba=b->a;
1132:   PetscInt       *columns=coloring->columns,*colorforcol=coloring->colorforcol,ncolors=coloring->ncolors;

1135:   btval_den=btdense->v;
1136:   PetscMemzero(btval_den,(m*n)*sizeof(MatScalar));
1137:   for (k=0; k<ncolors; k++) {
1138:     ncolumns = coloring->ncolumns[k];
1139:     for (l=0; l<ncolumns; l++) { /* insert a row of B to a column of Btdense */
1140:       col   = *(columns + colorforcol[k] + l);
1141:       btcol = bj + bi[col];
1142:       btval = ba + bi[col];
1143:       anz   = bi[col+1] - bi[col];
1144:       for (j=0; j<anz; j++){
1145:         brow            = btcol[j];
1146:         btval_den[brow] = btval[j];
1147:       }
1148:     }
1149:     btval_den += m;
1150:   }
1151:   return(0);
1152: }

1156: PetscErrorCode MatTransColoringApplyDenToSp_SeqAIJ(MatTransposeColoring matcoloring,Mat Cden,Mat Csp)
1157: {
1159:   Mat_SeqAIJ     *csp = (Mat_SeqAIJ*)Csp->data;
1160:   PetscInt       k,l,*row,*idx,m,ncolors=matcoloring->ncolors,nrows;
1161:   PetscScalar    *ca_den,*cp_den,*ca=csp->a;
1162:   PetscInt       *rows=matcoloring->rows,*spidx=matcoloring->columnsforspidx,*colorforrow=matcoloring->colorforrow;
1163: 
1165:   MatGetLocalSize(Csp,&m,PETSC_NULL);
1166:   MatGetArray(Cden,&ca_den);
1167:   cp_den = ca_den;
1168:   for (k=0; k<ncolors; k++) {
1169:     nrows = matcoloring->nrows[k];
1170:     row   = rows  + colorforrow[k];
1171:     idx   = spidx + colorforrow[k];
1172:     for (l=0; l<nrows; l++){
1173:       ca[idx[l]] = cp_den[row[l]];
1174:     }
1175:     cp_den += m;
1176:   }
1177:   MatRestoreArray(Cden,&ca_den);
1178:   return(0);
1179: }

1181: /*
1182:  MatGetColumnIJ_SeqAIJ_Color() and MatRestoreColumnIJ_SeqAIJ_Color() are customized from
1183:  MatGetColumnIJ_SeqAIJ() and MatRestoreColumnIJ_SeqAIJ() by adding an output
1184:  spidx[], index of a->j, to be used for setting 'columnsforspidx' in MatTransposeColoringCreate_SeqAIJ().
1185:  */
1188: PetscErrorCode MatGetColumnIJ_SeqAIJ_Color(Mat A,PetscInt oshift,PetscBool  symmetric,PetscBool  inodecompressed,PetscInt *nn,PetscInt *ia[],PetscInt *ja[],PetscInt *spidx[],PetscBool  *done)
1189: {
1190:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
1192:   PetscInt       i,*collengths,*cia,*cja,n = A->cmap->n,m = A->rmap->n;
1193:   PetscInt       nz = a->i[m],row,*jj,mr,col;
1194:   PetscInt       *cspidx;

1197:   *nn = n;
1198:   if (!ia) return(0);
1199:   if (symmetric) {
1200:     SETERRQ(((PetscObject)A)->comm,PETSC_ERR_SUP,"MatGetColumnIJ_SeqAIJ_Color() not supported for the case symmetric");
1201:     MatToSymmetricIJ_SeqAIJ(A->rmap->n,a->i,a->j,0,oshift,ia,ja);
1202:   } else {
1203:     PetscMalloc((n+1)*sizeof(PetscInt),&collengths);
1204:     PetscMemzero(collengths,n*sizeof(PetscInt));
1205:     PetscMalloc((n+1)*sizeof(PetscInt),&cia);
1206:     PetscMalloc((nz+1)*sizeof(PetscInt),&cja);
1207:     PetscMalloc((nz+1)*sizeof(PetscInt),&cspidx);
1208:     jj = a->j;
1209:     for (i=0; i<nz; i++) {
1210:       collengths[jj[i]]++;
1211:     }
1212:     cia[0] = oshift;
1213:     for (i=0; i<n; i++) {
1214:       cia[i+1] = cia[i] + collengths[i];
1215:     }
1216:     PetscMemzero(collengths,n*sizeof(PetscInt));
1217:     jj   = a->j;
1218:     for (row=0; row<m; row++) {
1219:       mr = a->i[row+1] - a->i[row];
1220:       for (i=0; i<mr; i++) {
1221:         col = *jj++;
1222:         cspidx[cia[col] + collengths[col] - oshift] = a->i[row] + i; /* index of a->j */
1223:         cja[cia[col] + collengths[col]++ - oshift] = row + oshift;
1224:       }
1225:     }
1226:     PetscFree(collengths);
1227:     *ia = cia; *ja = cja;
1228:     *spidx = cspidx;
1229:   }
1230:   return(0);
1231: }

1235: PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Color(Mat A,PetscInt oshift,PetscBool  symmetric,PetscBool  inodecompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscInt *spidx[],PetscBool  *done)
1236: {

1240:   if (!ia) return(0);

1242:   PetscFree(*ia);
1243:   PetscFree(*ja);
1244:   PetscFree(*spidx);
1245:   return(0);
1246: }

1250: PetscErrorCode MatTransposeColoringCreate_SeqAIJ(Mat mat,ISColoring iscoloring,MatTransposeColoring c)
1251: {
1253:   PetscInt       i,n,nrows,N,j,k,m,*row_idx,*ci,*cj,ncols,col,cm;
1254:   const PetscInt *is;
1255:   PetscInt       nis = iscoloring->n,*rowhit,bs = 1;
1256:   IS             *isa;
1257:   PetscBool      flg1,flg2;
1258:   Mat_SeqAIJ     *csp = (Mat_SeqAIJ*)mat->data;
1259:   PetscInt       *colorforrow,*rows,*rows_i,*columnsforspidx,*columnsforspidx_i,*idxhit,*spidx;
1260:   PetscInt       *colorforcol,*columns,*columns_i;

1263:   ISColoringGetIS(iscoloring,PETSC_IGNORE,&isa);
1264: 
1265:   /* this is ugly way to get blocksize but cannot call MatGetBlockSize() because AIJ can have bs > 1 */
1266:   PetscObjectTypeCompare((PetscObject)mat,MATSEQBAIJ,&flg1);
1267:   PetscObjectTypeCompare((PetscObject)mat,MATMPIBAIJ,&flg2);
1268:   if (flg1 || flg2) {
1269:     MatGetBlockSize(mat,&bs);
1270:   }

1272:   N         = mat->cmap->N/bs;
1273:   c->M      = mat->rmap->N/bs;  /* set total rows, columns and local rows */
1274:   c->N      = mat->cmap->N/bs;
1275:   c->m      = mat->rmap->N/bs;
1276:   c->rstart = 0;

1278:   c->ncolors = nis;
1279:   PetscMalloc(nis*sizeof(PetscInt),&c->ncolumns);
1280:   PetscMalloc(nis*sizeof(PetscInt),&c->nrows);
1281:   PetscMalloc2(csp->nz+1,PetscInt,&rows,csp->nz+1,PetscInt,&columnsforspidx);
1282:   PetscMalloc((nis+1)*sizeof(PetscInt),&colorforrow);
1283:   colorforrow[0]    = 0;
1284:   rows_i            = rows;
1285:   columnsforspidx_i = columnsforspidx;

1287:   PetscMalloc((nis+1)*sizeof(PetscInt),&colorforcol);
1288:   PetscMalloc((N+1)*sizeof(PetscInt),&columns);
1289:   colorforcol[0] = 0;
1290:   columns_i      = columns;
1291: 
1292:   MatGetColumnIJ_SeqAIJ_Color(mat,0,PETSC_FALSE,PETSC_FALSE,&ncols,&ci,&cj,&spidx,PETSC_NULL); /* column-wise storage of mat */

1294:   cm = c->m;
1295:   PetscMalloc((cm+1)*sizeof(PetscInt),&rowhit);
1296:   PetscMalloc((cm+1)*sizeof(PetscInt),&idxhit);
1297:   for (i=0; i<nis; i++) {
1298:     ISGetLocalSize(isa[i],&n);
1299:     ISGetIndices(isa[i],&is);
1300:     c->ncolumns[i] = n;
1301:     if (n) {
1302:       PetscMemcpy(columns_i,is,n*sizeof(PetscInt));
1303:     }
1304:     colorforcol[i+1] = colorforcol[i] + n;
1305:     columns_i       += n;

1307:     /* fast, crude version requires O(N*N) work */
1308:     PetscMemzero(rowhit,cm*sizeof(PetscInt));
1309: 
1310:     /* loop over columns*/
1311:     for (j=0; j<n; j++) {
1312:       col     = is[j];
1313:       row_idx = cj + ci[col];
1314:       m       = ci[col+1] - ci[col];
1315:       /* loop over columns marking them in rowhit */
1316:       for (k=0; k<m; k++) {
1317:         idxhit[*row_idx]   = spidx[ci[col] + k];
1318:         rowhit[*row_idx++] = col + 1;
1319:       }
1320:     }
1321:     /* count the number of hits */
1322:     nrows = 0;
1323:     for (j=0; j<cm; j++) {
1324:       if (rowhit[j]) nrows++;
1325:     }
1326:     c->nrows[i]      = nrows;
1327:     colorforrow[i+1] = colorforrow[i] + nrows;
1328: 
1329:     nrows       = 0;
1330:     for (j=0; j<cm; j++) {
1331:       if (rowhit[j]) {
1332:         rows_i[nrows]            = j;
1333:         columnsforspidx_i[nrows] = idxhit[j];
1334:         nrows++;
1335:       }
1336:     }
1337:     ISRestoreIndices(isa[i],&is);
1338:     rows_i += nrows; columnsforspidx_i += nrows;
1339:   }
1340:   MatRestoreColumnIJ_SeqAIJ_Color(mat,0,PETSC_FALSE,PETSC_FALSE,&ncols,&ci,&cj,&spidx,PETSC_NULL);
1341:   PetscFree(rowhit);
1342:   ISColoringRestoreIS(iscoloring,&isa);
1343: #if defined(PETSC_USE_DEBUG)
1344:   if (csp->nz != colorforrow[nis]) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"csp->nz %d != colorforrow[nis] %d",csp->nz,colorforrow[nis]);
1345: #endif
1346: 
1347:   c->colorforrow     = colorforrow;
1348:   c->rows            = rows;
1349:   c->columnsforspidx = columnsforspidx;
1350:   c->colorforcol     = colorforcol;
1351:   c->columns         = columns;
1352:   PetscFree(idxhit);
1353:   return(0);
1354: }