Actual source code: matmatmult.c
petsc-3.10.5 2019-03-28
2: /*
3: Defines matrix-matrix product routines for pairs of SeqAIJ matrices
4: C = A * B
5: */
7: #include <../src/mat/impls/aij/seq/aij.h>
8: #include <../src/mat/utils/freespace.h>
9: #include <petscbt.h>
10: #include <petsc/private/isimpl.h>
11: #include <../src/mat/impls/dense/seq/dense.h>
13: static PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(Mat,Mat,PetscReal,Mat*);
15: #if defined(PETSC_HAVE_HYPRE)
16: PETSC_INTERN PetscErrorCode MatMatMultSymbolic_AIJ_AIJ_wHYPRE(Mat,Mat,PetscReal,Mat*);
17: #endif
19: PETSC_INTERN PetscErrorCode MatMatMult_SeqAIJ_SeqAIJ(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
20: {
22: #if !defined(PETSC_HAVE_HYPRE)
23: const char *algTypes[7] = {"sorted","scalable","scalable_fast","heap","btheap","llcondensed","combined"};
24: PetscInt nalg = 7;
25: #else
26: const char *algTypes[8] = {"sorted","scalable","scalable_fast","heap","btheap","llcondensed","combined","hypre"};
27: PetscInt nalg = 8;
28: #endif
29: PetscInt alg = 0; /* set default algorithm */
30: PetscBool combined = PETSC_FALSE; /* Indicates whether the symbolic stage already computed the numerical values. */
33: if (scall == MAT_INITIAL_MATRIX) {
34: PetscObjectOptionsBegin((PetscObject)A);
35: PetscOptionsObject->alreadyprinted = PETSC_FALSE; /* a hack to ensure the option shows in '-help' */
36: PetscOptionsEList("-matmatmult_via","Algorithmic approach","MatMatMult",algTypes,nalg,algTypes[0],&alg,NULL);
37: PetscOptionsEnd();
38: PetscLogEventBegin(MAT_MatMultSymbolic,A,B,0,0);
39: switch (alg) {
40: case 1:
41: MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(A,B,fill,C);
42: break;
43: case 2:
44: MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(A,B,fill,C);
45: break;
46: case 3:
47: MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(A,B,fill,C);
48: break;
49: case 4:
50: MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(A,B,fill,C);
51: break;
52: case 5:
53: MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(A,B,fill,C);
54: break;
55: case 6:
56: MatMatMult_SeqAIJ_SeqAIJ_Combined(A,B,fill,C);
57: combined = PETSC_TRUE;
58: break;
59: #if defined(PETSC_HAVE_HYPRE)
60: case 7:
61: MatMatMultSymbolic_AIJ_AIJ_wHYPRE(A,B,fill,C);
62: break;
63: #endif
64: default:
65: MatMatMultSymbolic_SeqAIJ_SeqAIJ(A,B,fill,C);
66: break;
67: }
68: PetscLogEventEnd(MAT_MatMultSymbolic,A,B,0,0);
69: }
71: PetscLogEventBegin(MAT_MatMultNumeric,A,B,0,0);
72: if (!combined) {
73: (*(*C)->ops->matmultnumeric)(A,B,*C);
74: }
75: PetscLogEventEnd(MAT_MatMultNumeric,A,B,0,0);
76: return(0);
77: }
79: static PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(Mat A,Mat B,PetscReal fill,Mat *C)
80: {
81: PetscErrorCode ierr;
82: Mat_SeqAIJ *a =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
83: PetscInt *ai=a->i,*bi=b->i,*ci,*cj;
84: PetscInt am =A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
85: PetscReal afill;
86: PetscInt i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax;
87: PetscTable ta;
88: PetscBT lnkbt;
89: PetscFreeSpaceList free_space=NULL,current_space=NULL;
92: /* Get ci and cj */
93: /*---------------*/
94: /* Allocate ci array, arrays for fill computation and */
95: /* free space for accumulating nonzero column info */
96: PetscMalloc1(am+2,&ci);
97: ci[0] = 0;
99: /* create and initialize a linked list */
100: PetscTableCreate(bn,bn,&ta);
101: MatRowMergeMax_SeqAIJ(b,bm,ta);
102: PetscTableGetCount(ta,&Crmax);
103: PetscTableDestroy(&ta);
105: PetscLLCondensedCreate(Crmax,bn,&lnk,&lnkbt);
107: /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
108: PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);
110: current_space = free_space;
112: /* Determine ci and cj */
113: for (i=0; i<am; i++) {
114: anzi = ai[i+1] - ai[i];
115: aj = a->j + ai[i];
116: for (j=0; j<anzi; j++) {
117: brow = aj[j];
118: bnzj = bi[brow+1] - bi[brow];
119: bj = b->j + bi[brow];
120: /* add non-zero cols of B into the sorted linked list lnk */
121: PetscLLCondensedAddSorted(bnzj,bj,lnk,lnkbt);
122: }
123: cnzi = lnk[0];
125: /* If free space is not available, make more free space */
126: /* Double the amount of total space in the list */
127: if (current_space->local_remaining<cnzi) {
128: PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),¤t_space);
129: ndouble++;
130: }
132: /* Copy data into free space, then initialize lnk */
133: PetscLLCondensedClean(bn,cnzi,current_space->array,lnk,lnkbt);
135: current_space->array += cnzi;
136: current_space->local_used += cnzi;
137: current_space->local_remaining -= cnzi;
139: ci[i+1] = ci[i] + cnzi;
140: }
142: /* Column indices are in the list of free space */
143: /* Allocate space for cj, initialize cj, and */
144: /* destroy list of free space and other temporary array(s) */
145: PetscMalloc1(ci[am]+1,&cj);
146: PetscFreeSpaceContiguous(&free_space,cj);
147: PetscLLCondensedDestroy(lnk,lnkbt);
149: /* put together the new symbolic matrix */
150: MatCreateSeqAIJWithArrays(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,C);
151: MatSetBlockSizesFromMats(*C,A,B);
153: /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
154: /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
155: c = (Mat_SeqAIJ*)((*C)->data);
156: c->free_a = PETSC_FALSE;
157: c->free_ij = PETSC_TRUE;
158: c->nonew = 0;
159: (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ; /* fast, needs non-scalable O(bn) array 'abdense' */
161: /* set MatInfo */
162: afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
163: if (afill < 1.0) afill = 1.0;
164: c->maxnz = ci[am];
165: c->nz = ci[am];
166: (*C)->info.mallocs = ndouble;
167: (*C)->info.fill_ratio_given = fill;
168: (*C)->info.fill_ratio_needed = afill;
170: #if defined(PETSC_USE_INFO)
171: if (ci[am]) {
172: PetscInfo3((*C),"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);
173: PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);
174: } else {
175: PetscInfo((*C),"Empty matrix product\n");
176: }
177: #endif
178: return(0);
179: }
181: PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
182: {
184: PetscLogDouble flops=0.0;
185: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
186: Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data;
187: Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data;
188: PetscInt *ai =a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bjj,*ci=c->i,*cj=c->j;
189: PetscInt am =A->rmap->n,cm=C->rmap->n;
190: PetscInt i,j,k,anzi,bnzi,cnzi,brow;
191: PetscScalar *aa=a->a,*ba=b->a,*baj,*ca,valtmp;
192: PetscScalar *ab_dense;
195: if (!c->a) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */
196: PetscMalloc1(ci[cm]+1,&ca);
197: c->a = ca;
198: c->free_a = PETSC_TRUE;
199: } else {
200: ca = c->a;
201: }
202: if (!c->matmult_abdense) {
203: PetscCalloc1(B->cmap->N,&ab_dense);
204: c->matmult_abdense = ab_dense;
205: } else {
206: ab_dense = c->matmult_abdense;
207: }
209: /* clean old values in C */
210: PetscMemzero(ca,ci[cm]*sizeof(MatScalar));
211: /* Traverse A row-wise. */
212: /* Build the ith row in C by summing over nonzero columns in A, */
213: /* the rows of B corresponding to nonzeros of A. */
214: for (i=0; i<am; i++) {
215: anzi = ai[i+1] - ai[i];
216: for (j=0; j<anzi; j++) {
217: brow = aj[j];
218: bnzi = bi[brow+1] - bi[brow];
219: bjj = bj + bi[brow];
220: baj = ba + bi[brow];
221: /* perform dense axpy */
222: valtmp = aa[j];
223: for (k=0; k<bnzi; k++) {
224: ab_dense[bjj[k]] += valtmp*baj[k];
225: }
226: flops += 2*bnzi;
227: }
228: aj += anzi; aa += anzi;
230: cnzi = ci[i+1] - ci[i];
231: for (k=0; k<cnzi; k++) {
232: ca[k] += ab_dense[cj[k]];
233: ab_dense[cj[k]] = 0.0; /* zero ab_dense */
234: }
235: flops += cnzi;
236: cj += cnzi; ca += cnzi;
237: }
238: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
239: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
240: PetscLogFlops(flops);
241: return(0);
242: }
244: PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(Mat A,Mat B,Mat C)
245: {
247: PetscLogDouble flops=0.0;
248: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
249: Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data;
250: Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data;
251: PetscInt *ai = a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bjj,*ci=c->i,*cj=c->j;
252: PetscInt am = A->rmap->N,cm=C->rmap->N;
253: PetscInt i,j,k,anzi,bnzi,cnzi,brow;
254: PetscScalar *aa=a->a,*ba=b->a,*baj,*ca=c->a,valtmp;
255: PetscInt nextb;
258: if (!ca) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */
259: PetscMalloc1(ci[cm]+1,&ca);
260: c->a = ca;
261: c->free_a = PETSC_TRUE;
262: }
264: /* clean old values in C */
265: PetscMemzero(ca,ci[cm]*sizeof(MatScalar));
266: /* Traverse A row-wise. */
267: /* Build the ith row in C by summing over nonzero columns in A, */
268: /* the rows of B corresponding to nonzeros of A. */
269: for (i=0; i<am; i++) {
270: anzi = ai[i+1] - ai[i];
271: cnzi = ci[i+1] - ci[i];
272: for (j=0; j<anzi; j++) {
273: brow = aj[j];
274: bnzi = bi[brow+1] - bi[brow];
275: bjj = bj + bi[brow];
276: baj = ba + bi[brow];
277: /* perform sparse axpy */
278: valtmp = aa[j];
279: nextb = 0;
280: for (k=0; nextb<bnzi; k++) {
281: if (cj[k] == bjj[nextb]) { /* ccol == bcol */
282: ca[k] += valtmp*baj[nextb++];
283: }
284: }
285: flops += 2*bnzi;
286: }
287: aj += anzi; aa += anzi;
288: cj += cnzi; ca += cnzi;
289: }
291: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
292: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
293: PetscLogFlops(flops);
294: return(0);
295: }
297: PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(Mat A,Mat B,PetscReal fill,Mat *C)
298: {
299: PetscErrorCode ierr;
300: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
301: PetscInt *ai = a->i,*bi=b->i,*ci,*cj;
302: PetscInt am = A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
303: MatScalar *ca;
304: PetscReal afill;
305: PetscInt i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax;
306: PetscTable ta;
307: PetscFreeSpaceList free_space=NULL,current_space=NULL;
310: /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_fast() */
311: /*-----------------------------------------------------------------------------------------*/
312: /* Allocate arrays for fill computation and free space for accumulating nonzero column */
313: PetscMalloc1(am+2,&ci);
314: ci[0] = 0;
316: /* create and initialize a linked list */
317: PetscTableCreate(bn,bn,&ta);
318: MatRowMergeMax_SeqAIJ(b,bm,ta);
319: PetscTableGetCount(ta,&Crmax);
320: PetscTableDestroy(&ta);
322: PetscLLCondensedCreate_fast(Crmax,&lnk);
324: /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
325: PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);
326: current_space = free_space;
328: /* Determine ci and cj */
329: for (i=0; i<am; i++) {
330: anzi = ai[i+1] - ai[i];
331: aj = a->j + ai[i];
332: for (j=0; j<anzi; j++) {
333: brow = aj[j];
334: bnzj = bi[brow+1] - bi[brow];
335: bj = b->j + bi[brow];
336: /* add non-zero cols of B into the sorted linked list lnk */
337: PetscLLCondensedAddSorted_fast(bnzj,bj,lnk);
338: }
339: cnzi = lnk[1];
341: /* If free space is not available, make more free space */
342: /* Double the amount of total space in the list */
343: if (current_space->local_remaining<cnzi) {
344: PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),¤t_space);
345: ndouble++;
346: }
348: /* Copy data into free space, then initialize lnk */
349: PetscLLCondensedClean_fast(cnzi,current_space->array,lnk);
351: current_space->array += cnzi;
352: current_space->local_used += cnzi;
353: current_space->local_remaining -= cnzi;
355: ci[i+1] = ci[i] + cnzi;
356: }
358: /* Column indices are in the list of free space */
359: /* Allocate space for cj, initialize cj, and */
360: /* destroy list of free space and other temporary array(s) */
361: PetscMalloc1(ci[am]+1,&cj);
362: PetscFreeSpaceContiguous(&free_space,cj);
363: PetscLLCondensedDestroy_fast(lnk);
365: /* Allocate space for ca */
366: PetscMalloc1(ci[am]+1,&ca);
367: PetscMemzero(ca,(ci[am]+1)*sizeof(MatScalar));
369: /* put together the new symbolic matrix */
370: MatCreateSeqAIJWithArrays(PetscObjectComm((PetscObject)A),am,bn,ci,cj,ca,C);
371: MatSetBlockSizesFromMats(*C,A,B);
373: /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
374: /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
375: c = (Mat_SeqAIJ*)((*C)->data);
376: c->free_a = PETSC_TRUE;
377: c->free_ij = PETSC_TRUE;
378: c->nonew = 0;
380: (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable; /* slower, less memory */
382: /* set MatInfo */
383: afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
384: if (afill < 1.0) afill = 1.0;
385: c->maxnz = ci[am];
386: c->nz = ci[am];
387: (*C)->info.mallocs = ndouble;
388: (*C)->info.fill_ratio_given = fill;
389: (*C)->info.fill_ratio_needed = afill;
391: #if defined(PETSC_USE_INFO)
392: if (ci[am]) {
393: PetscInfo3((*C),"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);
394: PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);
395: } else {
396: PetscInfo((*C),"Empty matrix product\n");
397: }
398: #endif
399: return(0);
400: }
403: PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(Mat A,Mat B,PetscReal fill,Mat *C)
404: {
405: PetscErrorCode ierr;
406: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
407: PetscInt *ai = a->i,*bi=b->i,*ci,*cj;
408: PetscInt am = A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
409: MatScalar *ca;
410: PetscReal afill;
411: PetscInt i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax;
412: PetscTable ta;
413: PetscFreeSpaceList free_space=NULL,current_space=NULL;
416: /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_Scalalbe() */
417: /*---------------------------------------------------------------------------------------------*/
418: /* Allocate arrays for fill computation and free space for accumulating nonzero column */
419: PetscMalloc1(am+2,&ci);
420: ci[0] = 0;
422: /* create and initialize a linked list */
423: PetscTableCreate(bn,bn,&ta);
424: MatRowMergeMax_SeqAIJ(b,bm,ta);
425: PetscTableGetCount(ta,&Crmax);
426: PetscTableDestroy(&ta);
427: PetscLLCondensedCreate_Scalable(Crmax,&lnk);
429: /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
430: PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);
431: current_space = free_space;
433: /* Determine ci and cj */
434: for (i=0; i<am; i++) {
435: anzi = ai[i+1] - ai[i];
436: aj = a->j + ai[i];
437: for (j=0; j<anzi; j++) {
438: brow = aj[j];
439: bnzj = bi[brow+1] - bi[brow];
440: bj = b->j + bi[brow];
441: /* add non-zero cols of B into the sorted linked list lnk */
442: PetscLLCondensedAddSorted_Scalable(bnzj,bj,lnk);
443: }
444: cnzi = lnk[0];
446: /* If free space is not available, make more free space */
447: /* Double the amount of total space in the list */
448: if (current_space->local_remaining<cnzi) {
449: PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),¤t_space);
450: ndouble++;
451: }
453: /* Copy data into free space, then initialize lnk */
454: PetscLLCondensedClean_Scalable(cnzi,current_space->array,lnk);
456: current_space->array += cnzi;
457: current_space->local_used += cnzi;
458: current_space->local_remaining -= cnzi;
460: ci[i+1] = ci[i] + cnzi;
461: }
463: /* Column indices are in the list of free space */
464: /* Allocate space for cj, initialize cj, and */
465: /* destroy list of free space and other temporary array(s) */
466: PetscMalloc1(ci[am]+1,&cj);
467: PetscFreeSpaceContiguous(&free_space,cj);
468: PetscLLCondensedDestroy_Scalable(lnk);
470: /* Allocate space for ca */
471: /*-----------------------*/
472: PetscMalloc1(ci[am]+1,&ca);
473: PetscMemzero(ca,(ci[am]+1)*sizeof(MatScalar));
475: /* put together the new symbolic matrix */
476: MatCreateSeqAIJWithArrays(PetscObjectComm((PetscObject)A),am,bn,ci,cj,ca,C);
477: MatSetBlockSizesFromMats(*C,A,B);
479: /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
480: /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
481: c = (Mat_SeqAIJ*)((*C)->data);
482: c->free_a = PETSC_TRUE;
483: c->free_ij = PETSC_TRUE;
484: c->nonew = 0;
486: (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable; /* slower, less memory */
488: /* set MatInfo */
489: afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
490: if (afill < 1.0) afill = 1.0;
491: c->maxnz = ci[am];
492: c->nz = ci[am];
493: (*C)->info.mallocs = ndouble;
494: (*C)->info.fill_ratio_given = fill;
495: (*C)->info.fill_ratio_needed = afill;
497: #if defined(PETSC_USE_INFO)
498: if (ci[am]) {
499: PetscInfo3((*C),"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);
500: PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);
501: } else {
502: PetscInfo((*C),"Empty matrix product\n");
503: }
504: #endif
505: return(0);
506: }
508: PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(Mat A,Mat B,PetscReal fill,Mat *C)
509: {
510: PetscErrorCode ierr;
511: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
512: const PetscInt *ai=a->i,*bi=b->i,*aj=a->j,*bj=b->j;
513: PetscInt *ci,*cj,*bb;
514: PetscInt am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
515: PetscReal afill;
516: PetscInt i,j,col,ndouble = 0;
517: PetscFreeSpaceList free_space=NULL,current_space=NULL;
518: PetscHeap h;
521: /* Get ci and cj - by merging sorted rows using a heap */
522: /*---------------------------------------------------------------------------------------------*/
523: /* Allocate arrays for fill computation and free space for accumulating nonzero column */
524: PetscMalloc1(am+2,&ci);
525: ci[0] = 0;
527: /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
528: PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);
529: current_space = free_space;
531: PetscHeapCreate(a->rmax,&h);
532: PetscMalloc1(a->rmax,&bb);
534: /* Determine ci and cj */
535: for (i=0; i<am; i++) {
536: const PetscInt anzi = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
537: const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
538: ci[i+1] = ci[i];
539: /* Populate the min heap */
540: for (j=0; j<anzi; j++) {
541: bb[j] = bi[acol[j]]; /* bb points at the start of the row */
542: if (bb[j] < bi[acol[j]+1]) { /* Add if row is nonempty */
543: PetscHeapAdd(h,j,bj[bb[j]++]);
544: }
545: }
546: /* Pick off the min element, adding it to free space */
547: PetscHeapPop(h,&j,&col);
548: while (j >= 0) {
549: if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */
550: PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2,current_space->total_array_size),16 << 20),¤t_space);
551: ndouble++;
552: }
553: *(current_space->array++) = col;
554: current_space->local_used++;
555: current_space->local_remaining--;
556: ci[i+1]++;
558: /* stash if anything else remains in this row of B */
559: if (bb[j] < bi[acol[j]+1]) {PetscHeapStash(h,j,bj[bb[j]++]);}
560: while (1) { /* pop and stash any other rows of B that also had an entry in this column */
561: PetscInt j2,col2;
562: PetscHeapPeek(h,&j2,&col2);
563: if (col2 != col) break;
564: PetscHeapPop(h,&j2,&col2);
565: if (bb[j2] < bi[acol[j2]+1]) {PetscHeapStash(h,j2,bj[bb[j2]++]);}
566: }
567: /* Put any stashed elements back into the min heap */
568: PetscHeapUnstash(h);
569: PetscHeapPop(h,&j,&col);
570: }
571: }
572: PetscFree(bb);
573: PetscHeapDestroy(&h);
575: /* Column indices are in the list of free space */
576: /* Allocate space for cj, initialize cj, and */
577: /* destroy list of free space and other temporary array(s) */
578: PetscMalloc1(ci[am],&cj);
579: PetscFreeSpaceContiguous(&free_space,cj);
581: /* put together the new symbolic matrix */
582: MatCreateSeqAIJWithArrays(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,C);
583: MatSetBlockSizesFromMats(*C,A,B);
585: /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
586: /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
587: c = (Mat_SeqAIJ*)((*C)->data);
588: c->free_a = PETSC_TRUE;
589: c->free_ij = PETSC_TRUE;
590: c->nonew = 0;
592: (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ;
594: /* set MatInfo */
595: afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
596: if (afill < 1.0) afill = 1.0;
597: c->maxnz = ci[am];
598: c->nz = ci[am];
599: (*C)->info.mallocs = ndouble;
600: (*C)->info.fill_ratio_given = fill;
601: (*C)->info.fill_ratio_needed = afill;
603: #if defined(PETSC_USE_INFO)
604: if (ci[am]) {
605: PetscInfo3((*C),"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);
606: PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);
607: } else {
608: PetscInfo((*C),"Empty matrix product\n");
609: }
610: #endif
611: return(0);
612: }
614: PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(Mat A,Mat B,PetscReal fill,Mat *C)
615: {
616: PetscErrorCode ierr;
617: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
618: const PetscInt *ai = a->i,*bi=b->i,*aj=a->j,*bj=b->j;
619: PetscInt *ci,*cj,*bb;
620: PetscInt am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
621: PetscReal afill;
622: PetscInt i,j,col,ndouble = 0;
623: PetscFreeSpaceList free_space=NULL,current_space=NULL;
624: PetscHeap h;
625: PetscBT bt;
628: /* Get ci and cj - using a heap for the sorted rows, but use BT so that each index is only added once */
629: /*---------------------------------------------------------------------------------------------*/
630: /* Allocate arrays for fill computation and free space for accumulating nonzero column */
631: PetscMalloc1(am+2,&ci);
632: ci[0] = 0;
634: /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
635: PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);
637: current_space = free_space;
639: PetscHeapCreate(a->rmax,&h);
640: PetscMalloc1(a->rmax,&bb);
641: PetscBTCreate(bn,&bt);
643: /* Determine ci and cj */
644: for (i=0; i<am; i++) {
645: const PetscInt anzi = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
646: const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
647: const PetscInt *fptr = current_space->array; /* Save beginning of the row so we can clear the BT later */
648: ci[i+1] = ci[i];
649: /* Populate the min heap */
650: for (j=0; j<anzi; j++) {
651: PetscInt brow = acol[j];
652: for (bb[j] = bi[brow]; bb[j] < bi[brow+1]; bb[j]++) {
653: PetscInt bcol = bj[bb[j]];
654: if (!PetscBTLookupSet(bt,bcol)) { /* new entry */
655: PetscHeapAdd(h,j,bcol);
656: bb[j]++;
657: break;
658: }
659: }
660: }
661: /* Pick off the min element, adding it to free space */
662: PetscHeapPop(h,&j,&col);
663: while (j >= 0) {
664: if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */
665: fptr = NULL; /* need PetscBTMemzero */
666: PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2,current_space->total_array_size),16 << 20),¤t_space);
667: ndouble++;
668: }
669: *(current_space->array++) = col;
670: current_space->local_used++;
671: current_space->local_remaining--;
672: ci[i+1]++;
674: /* stash if anything else remains in this row of B */
675: for (; bb[j] < bi[acol[j]+1]; bb[j]++) {
676: PetscInt bcol = bj[bb[j]];
677: if (!PetscBTLookupSet(bt,bcol)) { /* new entry */
678: PetscHeapAdd(h,j,bcol);
679: bb[j]++;
680: break;
681: }
682: }
683: PetscHeapPop(h,&j,&col);
684: }
685: if (fptr) { /* Clear the bits for this row */
686: for (; fptr<current_space->array; fptr++) {PetscBTClear(bt,*fptr);}
687: } else { /* We reallocated so we don't remember (easily) how to clear only the bits we changed */
688: PetscBTMemzero(bn,bt);
689: }
690: }
691: PetscFree(bb);
692: PetscHeapDestroy(&h);
693: PetscBTDestroy(&bt);
695: /* Column indices are in the list of free space */
696: /* Allocate space for cj, initialize cj, and */
697: /* destroy list of free space and other temporary array(s) */
698: PetscMalloc1(ci[am],&cj);
699: PetscFreeSpaceContiguous(&free_space,cj);
701: /* put together the new symbolic matrix */
702: MatCreateSeqAIJWithArrays(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,C);
703: MatSetBlockSizesFromMats(*C,A,B);
705: /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
706: /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
707: c = (Mat_SeqAIJ*)((*C)->data);
708: c->free_a = PETSC_TRUE;
709: c->free_ij = PETSC_TRUE;
710: c->nonew = 0;
712: (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ;
714: /* set MatInfo */
715: afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
716: if (afill < 1.0) afill = 1.0;
717: c->maxnz = ci[am];
718: c->nz = ci[am];
719: (*C)->info.mallocs = ndouble;
720: (*C)->info.fill_ratio_given = fill;
721: (*C)->info.fill_ratio_needed = afill;
723: #if defined(PETSC_USE_INFO)
724: if (ci[am]) {
725: PetscInfo3((*C),"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);
726: PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);
727: } else {
728: PetscInfo((*C),"Empty matrix product\n");
729: }
730: #endif
731: return(0);
732: }
734: /* concatenate unique entries and then sort */
735: PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat *C)
736: {
737: PetscErrorCode ierr;
738: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
739: const PetscInt *ai = a->i,*bi=b->i,*aj=a->j,*bj=b->j;
740: PetscInt *ci,*cj;
741: PetscInt am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
742: PetscReal afill;
743: PetscInt i,j,ndouble = 0;
744: PetscSegBuffer seg,segrow;
745: char *seen;
748: PetscMalloc1(am+1,&ci);
749: ci[0] = 0;
751: /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
752: PetscSegBufferCreate(sizeof(PetscInt),(PetscInt)(fill*(ai[am]+bi[bm])),&seg);
753: PetscSegBufferCreate(sizeof(PetscInt),100,&segrow);
754: PetscMalloc1(bn,&seen);
755: PetscMemzero(seen,bn*sizeof(char));
757: /* Determine ci and cj */
758: for (i=0; i<am; i++) {
759: const PetscInt anzi = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
760: const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
761: PetscInt packlen = 0,*PETSC_RESTRICT crow;
762: /* Pack segrow */
763: for (j=0; j<anzi; j++) {
764: PetscInt brow = acol[j],bjstart = bi[brow],bjend = bi[brow+1],k;
765: for (k=bjstart; k<bjend; k++) {
766: PetscInt bcol = bj[k];
767: if (!seen[bcol]) { /* new entry */
768: PetscInt *PETSC_RESTRICT slot;
769: PetscSegBufferGetInts(segrow,1,&slot);
770: *slot = bcol;
771: seen[bcol] = 1;
772: packlen++;
773: }
774: }
775: }
776: PetscSegBufferGetInts(seg,packlen,&crow);
777: PetscSegBufferExtractTo(segrow,crow);
778: PetscSortInt(packlen,crow);
779: ci[i+1] = ci[i] + packlen;
780: for (j=0; j<packlen; j++) seen[crow[j]] = 0;
781: }
782: PetscSegBufferDestroy(&segrow);
783: PetscFree(seen);
785: /* Column indices are in the segmented buffer */
786: PetscSegBufferExtractAlloc(seg,&cj);
787: PetscSegBufferDestroy(&seg);
789: /* put together the new symbolic matrix */
790: MatCreateSeqAIJWithArrays(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,C);
791: MatSetBlockSizesFromMats(*C,A,B);
793: /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
794: /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
795: c = (Mat_SeqAIJ*)((*C)->data);
796: c->free_a = PETSC_TRUE;
797: c->free_ij = PETSC_TRUE;
798: c->nonew = 0;
800: (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ;
802: /* set MatInfo */
803: afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
804: if (afill < 1.0) afill = 1.0;
805: c->maxnz = ci[am];
806: c->nz = ci[am];
807: (*C)->info.mallocs = ndouble;
808: (*C)->info.fill_ratio_given = fill;
809: (*C)->info.fill_ratio_needed = afill;
811: #if defined(PETSC_USE_INFO)
812: if (ci[am]) {
813: PetscInfo3((*C),"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);
814: PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);
815: } else {
816: PetscInfo((*C),"Empty matrix product\n");
817: }
818: #endif
819: return(0);
820: }
822: /* This routine is not used. Should be removed! */
823: PetscErrorCode MatMatTransposeMult_SeqAIJ_SeqAIJ(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
824: {
828: if (scall == MAT_INITIAL_MATRIX) {
829: PetscLogEventBegin(MAT_MatTransposeMultSymbolic,A,B,0,0);
830: MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ(A,B,fill,C);
831: PetscLogEventEnd(MAT_MatTransposeMultSymbolic,A,B,0,0);
832: }
833: PetscLogEventBegin(MAT_MatTransposeMultNumeric,A,B,0,0);
834: MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ(A,B,*C);
835: PetscLogEventEnd(MAT_MatTransposeMultNumeric,A,B,0,0);
836: return(0);
837: }
839: PetscErrorCode MatDestroy_SeqAIJ_MatMatMultTrans(Mat A)
840: {
841: PetscErrorCode ierr;
842: Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data;
843: Mat_MatMatTransMult *abt=a->abt;
846: (abt->destroy)(A);
847: MatTransposeColoringDestroy(&abt->matcoloring);
848: MatDestroy(&abt->Bt_den);
849: MatDestroy(&abt->ABt_den);
850: PetscFree(abt);
851: return(0);
852: }
854: PetscErrorCode MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat *C)
855: {
856: PetscErrorCode ierr;
857: Mat Bt;
858: PetscInt *bti,*btj;
859: Mat_MatMatTransMult *abt;
860: Mat_SeqAIJ *c;
863: /* create symbolic Bt */
864: MatGetSymbolicTranspose_SeqAIJ(B,&bti,&btj);
865: MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,B->cmap->n,B->rmap->n,bti,btj,NULL,&Bt);
866: MatSetBlockSizes(Bt,PetscAbs(A->cmap->bs),PetscAbs(B->cmap->bs));
868: /* get symbolic C=A*Bt */
869: MatMatMultSymbolic_SeqAIJ_SeqAIJ(A,Bt,fill,C);
871: /* create a supporting struct for reuse intermidiate dense matrices with matcoloring */
872: PetscNew(&abt);
873: c = (Mat_SeqAIJ*)(*C)->data;
874: c->abt = abt;
876: abt->usecoloring = PETSC_FALSE;
877: abt->destroy = (*C)->ops->destroy;
878: (*C)->ops->destroy = MatDestroy_SeqAIJ_MatMatMultTrans;
880: PetscOptionsGetBool(((PetscObject)A)->options,NULL,"-matmattransmult_color",&abt->usecoloring,NULL);
881: if (abt->usecoloring) {
882: /* Create MatTransposeColoring from symbolic C=A*B^T */
883: MatTransposeColoring matcoloring;
884: MatColoring coloring;
885: ISColoring iscoloring;
886: Mat Bt_dense,C_dense;
887: Mat_SeqAIJ *c=(Mat_SeqAIJ*)(*C)->data;
888: /* inode causes memory problem, don't know why */
889: if (c->inode.use) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_USE_INODES is not supported. Use '-mat_no_inode'");
891: MatColoringCreate(*C,&coloring);
892: MatColoringSetDistance(coloring,2);
893: MatColoringSetType(coloring,MATCOLORINGSL);
894: MatColoringSetFromOptions(coloring);
895: MatColoringApply(coloring,&iscoloring);
896: MatColoringDestroy(&coloring);
897: MatTransposeColoringCreate(*C,iscoloring,&matcoloring);
899: abt->matcoloring = matcoloring;
901: ISColoringDestroy(&iscoloring);
903: /* Create Bt_dense and C_dense = A*Bt_dense */
904: MatCreate(PETSC_COMM_SELF,&Bt_dense);
905: MatSetSizes(Bt_dense,A->cmap->n,matcoloring->ncolors,A->cmap->n,matcoloring->ncolors);
906: MatSetType(Bt_dense,MATSEQDENSE);
907: MatSeqDenseSetPreallocation(Bt_dense,NULL);
909: Bt_dense->assembled = PETSC_TRUE;
910: abt->Bt_den = Bt_dense;
912: MatCreate(PETSC_COMM_SELF,&C_dense);
913: MatSetSizes(C_dense,A->rmap->n,matcoloring->ncolors,A->rmap->n,matcoloring->ncolors);
914: MatSetType(C_dense,MATSEQDENSE);
915: MatSeqDenseSetPreallocation(C_dense,NULL);
917: Bt_dense->assembled = PETSC_TRUE;
918: abt->ABt_den = C_dense;
920: #if defined(PETSC_USE_INFO)
921: {
922: Mat_SeqAIJ *c = (Mat_SeqAIJ*)(*C)->data;
923: PetscInfo7(*C,"Use coloring of C=A*B^T; B^T: %D %D, Bt_dense: %D,%D; Cnz %D / (cm*ncolors %D) = %g\n",B->cmap->n,B->rmap->n,Bt_dense->rmap->n,Bt_dense->cmap->n,c->nz,A->rmap->n*matcoloring->ncolors,(PetscReal)(c->nz)/(A->rmap->n*matcoloring->ncolors));
924: }
925: #endif
926: }
927: /* clean up */
928: MatDestroy(&Bt);
929: MatRestoreSymbolicTranspose_SeqAIJ(B,&bti,&btj);
930: return(0);
931: }
933: PetscErrorCode MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
934: {
935: PetscErrorCode ierr;
936: Mat_SeqAIJ *a =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c=(Mat_SeqAIJ*)C->data;
937: PetscInt *ai =a->i,*aj=a->j,*bi=b->i,*bj=b->j,anzi,bnzj,nexta,nextb,*acol,*bcol,brow;
938: PetscInt cm =C->rmap->n,*ci=c->i,*cj=c->j,i,j,cnzi,*ccol;
939: PetscLogDouble flops=0.0;
940: MatScalar *aa =a->a,*aval,*ba=b->a,*bval,*ca,*cval;
941: Mat_MatMatTransMult *abt = c->abt;
944: /* clear old values in C */
945: if (!c->a) {
946: PetscMalloc1(ci[cm]+1,&ca);
947: c->a = ca;
948: c->free_a = PETSC_TRUE;
949: } else {
950: ca = c->a;
951: }
952: PetscMemzero(ca,ci[cm]*sizeof(MatScalar));
954: if (abt->usecoloring) {
955: MatTransposeColoring matcoloring = abt->matcoloring;
956: Mat Bt_dense,C_dense = abt->ABt_den;
958: /* Get Bt_dense by Apply MatTransposeColoring to B */
959: Bt_dense = abt->Bt_den;
960: MatTransColoringApplySpToDen(matcoloring,B,Bt_dense);
962: /* C_dense = A*Bt_dense */
963: MatMatMultNumeric_SeqAIJ_SeqDense(A,Bt_dense,C_dense);
965: /* Recover C from C_dense */
966: MatTransColoringApplyDenToSp(matcoloring,C_dense,C);
967: return(0);
968: }
970: for (i=0; i<cm; i++) {
971: anzi = ai[i+1] - ai[i];
972: acol = aj + ai[i];
973: aval = aa + ai[i];
974: cnzi = ci[i+1] - ci[i];
975: ccol = cj + ci[i];
976: cval = ca + ci[i];
977: for (j=0; j<cnzi; j++) {
978: brow = ccol[j];
979: bnzj = bi[brow+1] - bi[brow];
980: bcol = bj + bi[brow];
981: bval = ba + bi[brow];
983: /* perform sparse inner-product c(i,j)=A[i,:]*B[j,:]^T */
984: nexta = 0; nextb = 0;
985: while (nexta<anzi && nextb<bnzj) {
986: while (nexta < anzi && acol[nexta] < bcol[nextb]) nexta++;
987: if (nexta == anzi) break;
988: while (nextb < bnzj && acol[nexta] > bcol[nextb]) nextb++;
989: if (nextb == bnzj) break;
990: if (acol[nexta] == bcol[nextb]) {
991: cval[j] += aval[nexta]*bval[nextb];
992: nexta++; nextb++;
993: flops += 2;
994: }
995: }
996: }
997: }
998: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
999: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
1000: PetscLogFlops(flops);
1001: return(0);
1002: }
1004: PetscErrorCode MatDestroy_SeqAIJ_MatTransMatMult(Mat A)
1005: {
1006: PetscErrorCode ierr;
1007: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
1008: Mat_MatTransMatMult *atb = a->atb;
1011: MatDestroy(&atb->At);
1012: (atb->destroy)(A);
1013: PetscFree(atb);
1014: return(0);
1015: }
1017: PetscErrorCode MatTransposeMatMult_SeqAIJ_SeqAIJ(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
1018: {
1019: PetscErrorCode ierr;
1020: const char *algTypes[2] = {"matmatmult","outerproduct"};
1021: PetscInt alg=0; /* set default algorithm */
1022: Mat At;
1023: Mat_MatTransMatMult *atb;
1024: Mat_SeqAIJ *c;
1027: if (scall == MAT_INITIAL_MATRIX) {
1028: PetscObjectOptionsBegin((PetscObject)A);
1029: PetscOptionsObject->alreadyprinted = PETSC_FALSE; /* a hack to ensure the option shows in '-help' */
1030: PetscOptionsEList("-mattransposematmult_via","Algorithmic approach","MatTransposeMatMult",algTypes,2,algTypes[0],&alg,NULL);
1031: PetscOptionsEnd();
1033: switch (alg) {
1034: case 1:
1035: MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(A,B,fill,C);
1036: break;
1037: default:
1038: PetscNew(&atb);
1039: MatTranspose_SeqAIJ(A,MAT_INITIAL_MATRIX,&At);
1040: MatMatMult_SeqAIJ_SeqAIJ(At,B,MAT_INITIAL_MATRIX,fill,C);
1042: c = (Mat_SeqAIJ*)(*C)->data;
1043: c->atb = atb;
1044: atb->At = At;
1045: atb->destroy = (*C)->ops->destroy;
1046: (*C)->ops->destroy = MatDestroy_SeqAIJ_MatTransMatMult;
1048: break;
1049: }
1050: }
1051: if (alg) {
1052: (*(*C)->ops->mattransposemultnumeric)(A,B,*C);
1053: } else if (!alg && scall == MAT_REUSE_MATRIX) {
1054: c = (Mat_SeqAIJ*)(*C)->data;
1055: atb = c->atb;
1056: At = atb->At;
1057: MatTranspose_SeqAIJ(A,MAT_REUSE_MATRIX,&At);
1058: MatMatMult_SeqAIJ_SeqAIJ(At,B,MAT_REUSE_MATRIX,fill,C);
1059: }
1060: return(0);
1061: }
1063: PetscErrorCode MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat *C)
1064: {
1066: Mat At;
1067: PetscInt *ati,*atj;
1070: /* create symbolic At */
1071: MatGetSymbolicTranspose_SeqAIJ(A,&ati,&atj);
1072: MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,A->cmap->n,A->rmap->n,ati,atj,NULL,&At);
1073: MatSetBlockSizes(At,PetscAbs(A->cmap->bs),PetscAbs(B->cmap->bs));
1075: /* get symbolic C=At*B */
1076: MatMatMultSymbolic_SeqAIJ_SeqAIJ(At,B,fill,C);
1078: /* clean up */
1079: MatDestroy(&At);
1080: MatRestoreSymbolicTranspose_SeqAIJ(A,&ati,&atj);
1082: (*C)->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ;
1083: return(0);
1084: }
1086: PetscErrorCode MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
1087: {
1089: Mat_SeqAIJ *a =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c=(Mat_SeqAIJ*)C->data;
1090: PetscInt am =A->rmap->n,anzi,*ai=a->i,*aj=a->j,*bi=b->i,*bj,bnzi,nextb;
1091: PetscInt cm =C->rmap->n,*ci=c->i,*cj=c->j,crow,*cjj,i,j,k;
1092: PetscLogDouble flops=0.0;
1093: MatScalar *aa =a->a,*ba,*ca,*caj;
1096: if (!c->a) {
1097: PetscMalloc1(ci[cm]+1,&ca);
1099: c->a = ca;
1100: c->free_a = PETSC_TRUE;
1101: } else {
1102: ca = c->a;
1103: }
1104: /* clear old values in C */
1105: PetscMemzero(ca,ci[cm]*sizeof(MatScalar));
1107: /* compute A^T*B using outer product (A^T)[:,i]*B[i,:] */
1108: for (i=0; i<am; i++) {
1109: bj = b->j + bi[i];
1110: ba = b->a + bi[i];
1111: bnzi = bi[i+1] - bi[i];
1112: anzi = ai[i+1] - ai[i];
1113: for (j=0; j<anzi; j++) {
1114: nextb = 0;
1115: crow = *aj++;
1116: cjj = cj + ci[crow];
1117: caj = ca + ci[crow];
1118: /* perform sparse axpy operation. Note cjj includes bj. */
1119: for (k=0; nextb<bnzi; k++) {
1120: if (cjj[k] == *(bj+nextb)) { /* ccol == bcol */
1121: caj[k] += (*aa)*(*(ba+nextb));
1122: nextb++;
1123: }
1124: }
1125: flops += 2*bnzi;
1126: aa++;
1127: }
1128: }
1130: /* Assemble the final matrix and clean up */
1131: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
1132: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
1133: PetscLogFlops(flops);
1134: return(0);
1135: }
1137: PETSC_INTERN PetscErrorCode MatMatMult_SeqAIJ_SeqDense(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
1138: {
1142: if (scall == MAT_INITIAL_MATRIX) {
1143: PetscLogEventBegin(MAT_MatMultSymbolic,A,B,0,0);
1144: MatMatMultSymbolic_SeqAIJ_SeqDense(A,B,fill,C);
1145: PetscLogEventEnd(MAT_MatMultSymbolic,A,B,0,0);
1146: }
1147: PetscLogEventBegin(MAT_MatMultNumeric,A,B,0,0);
1148: MatMatMultNumeric_SeqAIJ_SeqDense(A,B,*C);
1149: PetscLogEventEnd(MAT_MatMultNumeric,A,B,0,0);
1150: return(0);
1151: }
1153: PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat *C)
1154: {
1158: MatMatMultSymbolic_SeqDense_SeqDense(A,B,0.0,C);
1160: (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqDense;
1161: return(0);
1162: }
1164: PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqDense(Mat A,Mat B,Mat C)
1165: {
1166: Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data;
1167: Mat_SeqDense *bd = (Mat_SeqDense*)B->data;
1168: PetscErrorCode ierr;
1169: PetscScalar *c,*b,r1,r2,r3,r4,*c1,*c2,*c3,*c4,aatmp;
1170: const PetscScalar *aa,*b1,*b2,*b3,*b4;
1171: const PetscInt *aj;
1172: PetscInt cm=C->rmap->n,cn=B->cmap->n,bm=bd->lda,am=A->rmap->n;
1173: PetscInt am4=4*am,bm4=4*bm,col,i,j,n,ajtmp;
1176: if (!cm || !cn) return(0);
1177: if (B->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in A %D not equal rows in B %D\n",A->cmap->n,B->rmap->n);
1178: if (A->rmap->n != C->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number rows in C %D not equal rows in A %D\n",C->rmap->n,A->rmap->n);
1179: if (B->cmap->n != C->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in B %D not equal columns in C %D\n",B->cmap->n,C->cmap->n);
1180: b = bd->v;
1181: MatDenseGetArray(C,&c);
1182: b1 = b; b2 = b1 + bm; b3 = b2 + bm; b4 = b3 + bm;
1183: c1 = c; c2 = c1 + am; c3 = c2 + am; c4 = c3 + am;
1184: for (col=0; col<cn-4; col += 4) { /* over columns of C */
1185: for (i=0; i<am; i++) { /* over rows of C in those columns */
1186: r1 = r2 = r3 = r4 = 0.0;
1187: n = a->i[i+1] - a->i[i];
1188: aj = a->j + a->i[i];
1189: aa = a->a + a->i[i];
1190: for (j=0; j<n; j++) {
1191: aatmp = aa[j]; ajtmp = aj[j];
1192: r1 += aatmp*b1[ajtmp];
1193: r2 += aatmp*b2[ajtmp];
1194: r3 += aatmp*b3[ajtmp];
1195: r4 += aatmp*b4[ajtmp];
1196: }
1197: c1[i] = r1;
1198: c2[i] = r2;
1199: c3[i] = r3;
1200: c4[i] = r4;
1201: }
1202: b1 += bm4; b2 += bm4; b3 += bm4; b4 += bm4;
1203: c1 += am4; c2 += am4; c3 += am4; c4 += am4;
1204: }
1205: for (; col<cn; col++) { /* over extra columns of C */
1206: for (i=0; i<am; i++) { /* over rows of C in those columns */
1207: r1 = 0.0;
1208: n = a->i[i+1] - a->i[i];
1209: aj = a->j + a->i[i];
1210: aa = a->a + a->i[i];
1211: for (j=0; j<n; j++) {
1212: r1 += aa[j]*b1[aj[j]];
1213: }
1214: c1[i] = r1;
1215: }
1216: b1 += bm;
1217: c1 += am;
1218: }
1219: PetscLogFlops(cn*(2.0*a->nz));
1220: MatDenseRestoreArray(C,&c);
1221: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
1222: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
1223: return(0);
1224: }
1226: /*
1227: Note very similar to MatMult_SeqAIJ(), should generate both codes from same base
1228: */
1229: PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat A,Mat B,Mat C)
1230: {
1231: Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data;
1232: Mat_SeqDense *bd = (Mat_SeqDense*)B->data;
1234: PetscScalar *b,*c,r1,r2,r3,r4,*b1,*b2,*b3,*b4;
1235: MatScalar *aa;
1236: PetscInt cm = C->rmap->n, cn=B->cmap->n, bm=bd->lda, col, i,j,n,*aj, am = A->rmap->n,*ii,arm;
1237: PetscInt am2 = 2*am, am3 = 3*am, bm4 = 4*bm,colam,*ridx;
1240: if (!cm || !cn) return(0);
1241: b = bd->v;
1242: MatDenseGetArray(C,&c);
1243: b1 = b; b2 = b1 + bm; b3 = b2 + bm; b4 = b3 + bm;
1245: if (a->compressedrow.use) { /* use compressed row format */
1246: for (col=0; col<cn-4; col += 4) { /* over columns of C */
1247: colam = col*am;
1248: arm = a->compressedrow.nrows;
1249: ii = a->compressedrow.i;
1250: ridx = a->compressedrow.rindex;
1251: for (i=0; i<arm; i++) { /* over rows of C in those columns */
1252: r1 = r2 = r3 = r4 = 0.0;
1253: n = ii[i+1] - ii[i];
1254: aj = a->j + ii[i];
1255: aa = a->a + ii[i];
1256: for (j=0; j<n; j++) {
1257: r1 += (*aa)*b1[*aj];
1258: r2 += (*aa)*b2[*aj];
1259: r3 += (*aa)*b3[*aj];
1260: r4 += (*aa++)*b4[*aj++];
1261: }
1262: c[colam + ridx[i]] += r1;
1263: c[colam + am + ridx[i]] += r2;
1264: c[colam + am2 + ridx[i]] += r3;
1265: c[colam + am3 + ridx[i]] += r4;
1266: }
1267: b1 += bm4;
1268: b2 += bm4;
1269: b3 += bm4;
1270: b4 += bm4;
1271: }
1272: for (; col<cn; col++) { /* over extra columns of C */
1273: colam = col*am;
1274: arm = a->compressedrow.nrows;
1275: ii = a->compressedrow.i;
1276: ridx = a->compressedrow.rindex;
1277: for (i=0; i<arm; i++) { /* over rows of C in those columns */
1278: r1 = 0.0;
1279: n = ii[i+1] - ii[i];
1280: aj = a->j + ii[i];
1281: aa = a->a + ii[i];
1283: for (j=0; j<n; j++) {
1284: r1 += (*aa++)*b1[*aj++];
1285: }
1286: c[colam + ridx[i]] += r1;
1287: }
1288: b1 += bm;
1289: }
1290: } else {
1291: for (col=0; col<cn-4; col += 4) { /* over columns of C */
1292: colam = col*am;
1293: for (i=0; i<am; i++) { /* over rows of C in those columns */
1294: r1 = r2 = r3 = r4 = 0.0;
1295: n = a->i[i+1] - a->i[i];
1296: aj = a->j + a->i[i];
1297: aa = a->a + a->i[i];
1298: for (j=0; j<n; j++) {
1299: r1 += (*aa)*b1[*aj];
1300: r2 += (*aa)*b2[*aj];
1301: r3 += (*aa)*b3[*aj];
1302: r4 += (*aa++)*b4[*aj++];
1303: }
1304: c[colam + i] += r1;
1305: c[colam + am + i] += r2;
1306: c[colam + am2 + i] += r3;
1307: c[colam + am3 + i] += r4;
1308: }
1309: b1 += bm4;
1310: b2 += bm4;
1311: b3 += bm4;
1312: b4 += bm4;
1313: }
1314: for (; col<cn; col++) { /* over extra columns of C */
1315: colam = col*am;
1316: for (i=0; i<am; i++) { /* over rows of C in those columns */
1317: r1 = 0.0;
1318: n = a->i[i+1] - a->i[i];
1319: aj = a->j + a->i[i];
1320: aa = a->a + a->i[i];
1322: for (j=0; j<n; j++) {
1323: r1 += (*aa++)*b1[*aj++];
1324: }
1325: c[colam + i] += r1;
1326: }
1327: b1 += bm;
1328: }
1329: }
1330: PetscLogFlops(cn*2.0*a->nz);
1331: MatDenseRestoreArray(C,&c);
1332: return(0);
1333: }
1335: PetscErrorCode MatTransColoringApplySpToDen_SeqAIJ(MatTransposeColoring coloring,Mat B,Mat Btdense)
1336: {
1338: Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data;
1339: Mat_SeqDense *btdense = (Mat_SeqDense*)Btdense->data;
1340: PetscInt *bi = b->i,*bj=b->j;
1341: PetscInt m = Btdense->rmap->n,n=Btdense->cmap->n,j,k,l,col,anz,*btcol,brow,ncolumns;
1342: MatScalar *btval,*btval_den,*ba=b->a;
1343: PetscInt *columns=coloring->columns,*colorforcol=coloring->colorforcol,ncolors=coloring->ncolors;
1346: btval_den=btdense->v;
1347: PetscMemzero(btval_den,(m*n)*sizeof(MatScalar));
1348: for (k=0; k<ncolors; k++) {
1349: ncolumns = coloring->ncolumns[k];
1350: for (l=0; l<ncolumns; l++) { /* insert a row of B to a column of Btdense */
1351: col = *(columns + colorforcol[k] + l);
1352: btcol = bj + bi[col];
1353: btval = ba + bi[col];
1354: anz = bi[col+1] - bi[col];
1355: for (j=0; j<anz; j++) {
1356: brow = btcol[j];
1357: btval_den[brow] = btval[j];
1358: }
1359: }
1360: btval_den += m;
1361: }
1362: return(0);
1363: }
1365: PetscErrorCode MatTransColoringApplyDenToSp_SeqAIJ(MatTransposeColoring matcoloring,Mat Cden,Mat Csp)
1366: {
1368: Mat_SeqAIJ *csp = (Mat_SeqAIJ*)Csp->data;
1369: PetscScalar *ca_den,*ca_den_ptr,*ca=csp->a;
1370: PetscInt k,l,m=Cden->rmap->n,ncolors=matcoloring->ncolors;
1371: PetscInt brows=matcoloring->brows,*den2sp=matcoloring->den2sp;
1372: PetscInt nrows,*row,*idx;
1373: PetscInt *rows=matcoloring->rows,*colorforrow=matcoloring->colorforrow;
1376: MatDenseGetArray(Cden,&ca_den);
1378: if (brows > 0) {
1379: PetscInt *lstart,row_end,row_start;
1380: lstart = matcoloring->lstart;
1381: PetscMemzero(lstart,ncolors*sizeof(PetscInt));
1383: row_end = brows;
1384: if (row_end > m) row_end = m;
1385: for (row_start=0; row_start<m; row_start+=brows) { /* loop over row blocks of Csp */
1386: ca_den_ptr = ca_den;
1387: for (k=0; k<ncolors; k++) { /* loop over colors (columns of Cden) */
1388: nrows = matcoloring->nrows[k];
1389: row = rows + colorforrow[k];
1390: idx = den2sp + colorforrow[k];
1391: for (l=lstart[k]; l<nrows; l++) {
1392: if (row[l] >= row_end) {
1393: lstart[k] = l;
1394: break;
1395: } else {
1396: ca[idx[l]] = ca_den_ptr[row[l]];
1397: }
1398: }
1399: ca_den_ptr += m;
1400: }
1401: row_end += brows;
1402: if (row_end > m) row_end = m;
1403: }
1404: } else { /* non-blocked impl: loop over columns of Csp - slow if Csp is large */
1405: ca_den_ptr = ca_den;
1406: for (k=0; k<ncolors; k++) {
1407: nrows = matcoloring->nrows[k];
1408: row = rows + colorforrow[k];
1409: idx = den2sp + colorforrow[k];
1410: for (l=0; l<nrows; l++) {
1411: ca[idx[l]] = ca_den_ptr[row[l]];
1412: }
1413: ca_den_ptr += m;
1414: }
1415: }
1417: MatDenseRestoreArray(Cden,&ca_den);
1418: #if defined(PETSC_USE_INFO)
1419: if (matcoloring->brows > 0) {
1420: PetscInfo1(Csp,"Loop over %D row blocks for den2sp\n",brows);
1421: } else {
1422: PetscInfo(Csp,"Loop over colors/columns of Cden, inefficient for large sparse matrix product \n");
1423: }
1424: #endif
1425: return(0);
1426: }
1428: PetscErrorCode MatTransposeColoringCreate_SeqAIJ(Mat mat,ISColoring iscoloring,MatTransposeColoring c)
1429: {
1431: PetscInt i,n,nrows,Nbs,j,k,m,ncols,col,cm;
1432: const PetscInt *is,*ci,*cj,*row_idx;
1433: PetscInt nis = iscoloring->n,*rowhit,bs = 1;
1434: IS *isa;
1435: Mat_SeqAIJ *csp = (Mat_SeqAIJ*)mat->data;
1436: PetscInt *colorforrow,*rows,*rows_i,*idxhit,*spidx,*den2sp,*den2sp_i;
1437: PetscInt *colorforcol,*columns,*columns_i,brows;
1438: PetscBool flg;
1441: ISColoringGetIS(iscoloring,PETSC_IGNORE,&isa);
1443: /* bs >1 is not being tested yet! */
1444: Nbs = mat->cmap->N/bs;
1445: c->M = mat->rmap->N/bs; /* set total rows, columns and local rows */
1446: c->N = Nbs;
1447: c->m = c->M;
1448: c->rstart = 0;
1449: c->brows = 100;
1451: c->ncolors = nis;
1452: PetscMalloc3(nis,&c->ncolumns,nis,&c->nrows,nis+1,&colorforrow);
1453: PetscMalloc1(csp->nz+1,&rows);
1454: PetscMalloc1(csp->nz+1,&den2sp);
1456: brows = c->brows;
1457: PetscOptionsGetInt(NULL,NULL,"-matden2sp_brows",&brows,&flg);
1458: if (flg) c->brows = brows;
1459: if (brows > 0) {
1460: PetscMalloc1(nis+1,&c->lstart);
1461: }
1463: colorforrow[0] = 0;
1464: rows_i = rows;
1465: den2sp_i = den2sp;
1467: PetscMalloc1(nis+1,&colorforcol);
1468: PetscMalloc1(Nbs+1,&columns);
1470: colorforcol[0] = 0;
1471: columns_i = columns;
1473: /* get column-wise storage of mat */
1474: MatGetColumnIJ_SeqAIJ_Color(mat,0,PETSC_FALSE,PETSC_FALSE,&ncols,&ci,&cj,&spidx,NULL);
1476: cm = c->m;
1477: PetscMalloc1(cm+1,&rowhit);
1478: PetscMalloc1(cm+1,&idxhit);
1479: for (i=0; i<nis; i++) { /* loop over color */
1480: ISGetLocalSize(isa[i],&n);
1481: ISGetIndices(isa[i],&is);
1483: c->ncolumns[i] = n;
1484: if (n) {
1485: PetscMemcpy(columns_i,is,n*sizeof(PetscInt));
1486: }
1487: colorforcol[i+1] = colorforcol[i] + n;
1488: columns_i += n;
1490: /* fast, crude version requires O(N*N) work */
1491: PetscMemzero(rowhit,cm*sizeof(PetscInt));
1493: for (j=0; j<n; j++) { /* loop over columns*/
1494: col = is[j];
1495: row_idx = cj + ci[col];
1496: m = ci[col+1] - ci[col];
1497: for (k=0; k<m; k++) { /* loop over columns marking them in rowhit */
1498: idxhit[*row_idx] = spidx[ci[col] + k];
1499: rowhit[*row_idx++] = col + 1;
1500: }
1501: }
1502: /* count the number of hits */
1503: nrows = 0;
1504: for (j=0; j<cm; j++) {
1505: if (rowhit[j]) nrows++;
1506: }
1507: c->nrows[i] = nrows;
1508: colorforrow[i+1] = colorforrow[i] + nrows;
1510: nrows = 0;
1511: for (j=0; j<cm; j++) { /* loop over rows */
1512: if (rowhit[j]) {
1513: rows_i[nrows] = j;
1514: den2sp_i[nrows] = idxhit[j];
1515: nrows++;
1516: }
1517: }
1518: den2sp_i += nrows;
1520: ISRestoreIndices(isa[i],&is);
1521: rows_i += nrows;
1522: }
1523: MatRestoreColumnIJ_SeqAIJ_Color(mat,0,PETSC_FALSE,PETSC_FALSE,&ncols,&ci,&cj,&spidx,NULL);
1524: PetscFree(rowhit);
1525: ISColoringRestoreIS(iscoloring,&isa);
1526: if (csp->nz != colorforrow[nis]) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"csp->nz %d != colorforrow[nis] %d",csp->nz,colorforrow[nis]);
1528: c->colorforrow = colorforrow;
1529: c->rows = rows;
1530: c->den2sp = den2sp;
1531: c->colorforcol = colorforcol;
1532: c->columns = columns;
1534: PetscFree(idxhit);
1535: return(0);
1536: }
1538: /* Needed for MatMatMult_SeqAIJ_SeqAIJ_Combined() */
1539: /* Append value to an array if the value is not present yet. A bitarray */
1540: /* was used to determine if there is already an entry at this position. */
1541: void appendToArray(PetscInt val, PetscInt *array, PetscInt *cnzi)
1542: {
1543: array[(*cnzi)++] = val;
1544: }
1546: /* This algorithm combines the symbolic and numeric phase of matrix-matrix multiplication. */
1547: PetscErrorCode MatMatMult_SeqAIJ_SeqAIJ_Combined(Mat A,Mat B,PetscReal fill,Mat *C)
1548: {
1549: PetscErrorCode ierr;
1550: PetscLogDouble flops=0.0;
1551: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
1552: const PetscInt *ai = a->i,*bi = b->i, *aj = a->j;
1553: PetscInt *ci,*cj,*cj_i;
1554: PetscScalar *ca, *ca_i;
1555: PetscInt c_maxmem = 0, a_maxrownnz = 0, a_rownnz, a_col;
1556: PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N;
1557: PetscInt i, k, ndouble = 0;
1558: PetscReal afill;
1559: PetscScalar *c_row_val_dense;
1560: PetscBool *c_row_idx_flags;
1561: PetscInt *aj_i = a->j;
1562: PetscScalar *aa_i = a->a;
1565: /* Step 1: Determine upper bounds on memory for C */
1566: for (i=0; i<am; i++) { /* iterate over all rows of A */
1567: const PetscInt anzi = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
1568: const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
1569: a_rownnz = 0;
1570: for (k=0;k<anzi;++k) a_rownnz += bi[acol[k]+1] - bi[acol[k]];
1571: a_maxrownnz = PetscMax(a_maxrownnz, a_rownnz);
1572: c_maxmem += a_rownnz;
1573: }
1574: PetscMalloc1(am+1, &ci);
1575: PetscMalloc1(bn, &c_row_val_dense);
1576: PetscMalloc1(bn, &c_row_idx_flags);
1577: PetscMalloc1(c_maxmem,&cj);
1578: PetscMalloc1(c_maxmem,&ca);
1579: ca_i = ca;
1580: cj_i = cj;
1581: ci[0] = 0;
1582: PetscMemzero(c_row_val_dense, bn * sizeof(PetscScalar));
1583: PetscMemzero(c_row_idx_flags, bn * sizeof(PetscBool));
1584: for (i=0; i<am; i++) {
1585: /* Step 2: Initialize the dense row vector for C */
1586: const PetscInt anzi = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
1587: PetscInt cnzi = 0;
1588: PetscInt *bj_i;
1589: PetscScalar *ba_i;
1591: /* Step 3: Do the numerical calculations */
1592: for (a_col=0; a_col<anzi; a_col++) { /* iterate over all non zero values in a row of A */
1593: PetscInt a_col_index = aj_i[a_col];
1594: const PetscInt bnzi = bi[a_col_index+1] - bi[a_col_index];
1595: flops += 2*bnzi;
1596: bj_i = b->j + bi[a_col_index]; /* points to the current row in bj */
1597: ba_i = b->a + bi[a_col_index]; /* points to the current row in ba */
1598: for (k=0; k<bnzi; ++k) { /* iterate over all non zeros of this row in B */
1599: if (c_row_idx_flags[ bj_i[k] ] == PETSC_FALSE) {
1600: appendToArray(bj_i[k], cj_i, &cnzi);
1601: c_row_idx_flags[ bj_i[k] ] = PETSC_TRUE;
1602: }
1603: c_row_val_dense[ bj_i[k] ] += aa_i[a_col] * ba_i[k];
1604: }
1605: }
1607: /* Sort array */
1608: PetscSortInt(cnzi, cj_i);
1609: /* Step 4 */
1610: for (k=0; k < cnzi; k++) {
1611: ca_i[k] = c_row_val_dense[cj_i[k]];
1612: c_row_val_dense[cj_i[k]] = 0.;
1613: c_row_idx_flags[cj_i[k]] = PETSC_FALSE;
1614: }
1615: /* terminate current row */
1616: aa_i += anzi;
1617: aj_i += anzi;
1618: ca_i += cnzi;
1619: cj_i += cnzi;
1620: ci[i+1] = ci[i] + cnzi;
1621: flops += cnzi;
1622: }
1624: /* Step 5 */
1625: /* Create the new matrix */
1626: MatCreateSeqAIJWithArrays(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,C);
1627: MatSetBlockSizesFromMats(*C,A,B);
1629: /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
1630: /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
1631: c = (Mat_SeqAIJ*)((*C)->data);
1632: c->a = ca;
1633: c->free_a = PETSC_TRUE;
1634: c->free_ij = PETSC_TRUE;
1635: c->nonew = 0;
1637: /* set MatInfo */
1638: afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
1639: if (afill < 1.0) afill = 1.0;
1640: c->maxnz = ci[am];
1641: c->nz = ci[am];
1642: (*C)->info.mallocs = ndouble;
1643: (*C)->info.fill_ratio_given = fill;
1644: (*C)->info.fill_ratio_needed = afill;
1646: PetscFree(c_row_val_dense);
1647: PetscFree(c_row_idx_flags);
1649: MatAssemblyBegin(*C,MAT_FINAL_ASSEMBLY);
1650: MatAssemblyEnd(*C,MAT_FINAL_ASSEMBLY);
1651: PetscLogFlops(flops);
1652: return(0);
1653: }