Actual source code: matmatmult.c
petsc-3.12.5 2020-03-29
2: /*
3: Defines matrix-matrix product routines for pairs of SeqAIJ matrices
4: C = A * B
5: */
7: #include <../src/mat/impls/aij/seq/aij.h>
8: #include <../src/mat/utils/freespace.h>
9: #include <petscbt.h>
10: #include <petsc/private/isimpl.h>
11: #include <../src/mat/impls/dense/seq/dense.h>
14: PETSC_INTERN PetscErrorCode MatMatMult_SeqAIJ_SeqAIJ(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
15: {
19: if (scall == MAT_INITIAL_MATRIX) {
20: PetscLogEventBegin(MAT_MatMultSymbolic,A,B,0,0);
21: MatMatMultSymbolic_SeqAIJ_SeqAIJ(A,B,fill,C);
22: PetscLogEventEnd(MAT_MatMultSymbolic,A,B,0,0);
23: }
24: PetscLogEventBegin(MAT_MatMultNumeric,A,B,0,0);
25: MatMatMultNumeric_SeqAIJ_SeqAIJ(A,B,*C);
26: PetscLogEventEnd(MAT_MatMultNumeric,A,B,0,0);
27: return(0);
28: }
30: PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
31: {
35: if (C->ops->matmultnumeric) {
36: (*C->ops->matmultnumeric)(A,B,C);
37: } else {
38: MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(A,B,C);
39: }
40: return(0);
41: }
43: PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat *C)
44: {
46: #if !defined(PETSC_HAVE_HYPRE)
47: const char *algTypes[8] = {"sorted","scalable","scalable_fast","heap","btheap","llcondensed","combined","rowmerge"};
48: PetscInt nalg = 8;
49: #else
50: const char *algTypes[9] = {"sorted","scalable","scalable_fast","heap","btheap","llcondensed","combined","rowmerge","hypre"};
51: PetscInt nalg = 9;
52: #endif
53: PetscInt alg = 0; /* set default algorithm */
56: PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"MatMatMult","Mat");
57: PetscOptionsEList("-matmatmult_via","Algorithmic approach","MatMatMult",algTypes,nalg,algTypes[0],&alg,NULL);
58: PetscOptionsEnd();
59: switch (alg) {
60: case 1:
61: MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(A,B,fill,C);
62: break;
63: case 2:
64: MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(A,B,fill,C);
65: break;
66: case 3:
67: MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(A,B,fill,C);
68: break;
69: case 4:
70: MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(A,B,fill,C);
71: break;
72: case 5:
73: MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(A,B,fill,C);
74: break;
75: case 6:
76: MatMatMult_SeqAIJ_SeqAIJ_Combined(A,B,fill,C);
77: break;
78: case 7:
79: MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(A,B,fill,C);
80: break;
81: #if defined(PETSC_HAVE_HYPRE)
82: case 8:
83: MatMatMultSymbolic_AIJ_AIJ_wHYPRE(A,B,fill,C);
84: break;
85: #endif
86: default:
87: MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(A,B,fill,C);
88: break;
89: }
90: return(0);
91: }
93: PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(Mat A,Mat B,PetscReal fill,Mat *C)
94: {
95: PetscErrorCode ierr;
96: Mat_SeqAIJ *a =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
97: PetscInt *ai=a->i,*bi=b->i,*ci,*cj;
98: PetscInt am =A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
99: PetscReal afill;
100: PetscInt i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax;
101: PetscTable ta;
102: PetscBT lnkbt;
103: PetscFreeSpaceList free_space=NULL,current_space=NULL;
106: /* Get ci and cj */
107: /*---------------*/
108: /* Allocate ci array, arrays for fill computation and */
109: /* free space for accumulating nonzero column info */
110: PetscMalloc1(am+2,&ci);
111: ci[0] = 0;
113: /* create and initialize a linked list */
114: PetscTableCreate(bn,bn,&ta);
115: MatRowMergeMax_SeqAIJ(b,bm,ta);
116: PetscTableGetCount(ta,&Crmax);
117: PetscTableDestroy(&ta);
119: PetscLLCondensedCreate(Crmax,bn,&lnk,&lnkbt);
121: /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
122: PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);
124: current_space = free_space;
126: /* Determine ci and cj */
127: for (i=0; i<am; i++) {
128: anzi = ai[i+1] - ai[i];
129: aj = a->j + ai[i];
130: for (j=0; j<anzi; j++) {
131: brow = aj[j];
132: bnzj = bi[brow+1] - bi[brow];
133: bj = b->j + bi[brow];
134: /* add non-zero cols of B into the sorted linked list lnk */
135: PetscLLCondensedAddSorted(bnzj,bj,lnk,lnkbt);
136: }
137: cnzi = lnk[0];
139: /* If free space is not available, make more free space */
140: /* Double the amount of total space in the list */
141: if (current_space->local_remaining<cnzi) {
142: PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),¤t_space);
143: ndouble++;
144: }
146: /* Copy data into free space, then initialize lnk */
147: PetscLLCondensedClean(bn,cnzi,current_space->array,lnk,lnkbt);
149: current_space->array += cnzi;
150: current_space->local_used += cnzi;
151: current_space->local_remaining -= cnzi;
153: ci[i+1] = ci[i] + cnzi;
154: }
156: /* Column indices are in the list of free space */
157: /* Allocate space for cj, initialize cj, and */
158: /* destroy list of free space and other temporary array(s) */
159: PetscMalloc1(ci[am]+1,&cj);
160: PetscFreeSpaceContiguous(&free_space,cj);
161: PetscLLCondensedDestroy(lnk,lnkbt);
163: /* put together the new symbolic matrix */
164: MatCreateSeqAIJWithArrays(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,C);
165: MatSetBlockSizesFromMats(*C,A,B);
166: MatSetType(*C,((PetscObject)A)->type_name);
168: /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
169: /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
170: c = (Mat_SeqAIJ*)((*C)->data);
171: c->free_a = PETSC_FALSE;
172: c->free_ij = PETSC_TRUE;
173: c->nonew = 0;
174: (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; /* fast, needs non-scalable O(bn) array 'abdense' */
176: /* set MatInfo */
177: afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
178: if (afill < 1.0) afill = 1.0;
179: c->maxnz = ci[am];
180: c->nz = ci[am];
181: (*C)->info.mallocs = ndouble;
182: (*C)->info.fill_ratio_given = fill;
183: (*C)->info.fill_ratio_needed = afill;
185: #if defined(PETSC_USE_INFO)
186: if (ci[am]) {
187: PetscInfo3((*C),"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);
188: PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);
189: } else {
190: PetscInfo((*C),"Empty matrix product\n");
191: }
192: #endif
193: return(0);
194: }
196: PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(Mat A,Mat B,Mat C)
197: {
199: PetscLogDouble flops=0.0;
200: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
201: Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data;
202: Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data;
203: PetscInt *ai =a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bjj,*ci=c->i,*cj=c->j;
204: PetscInt am =A->rmap->n,cm=C->rmap->n;
205: PetscInt i,j,k,anzi,bnzi,cnzi,brow;
206: PetscScalar *aa=a->a,*ba=b->a,*baj,*ca,valtmp;
207: PetscScalar *ab_dense;
210: if (!c->a) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */
211: PetscMalloc1(ci[cm]+1,&ca);
212: c->a = ca;
213: c->free_a = PETSC_TRUE;
214: } else {
215: ca = c->a;
216: }
217: if (!c->matmult_abdense) {
218: PetscCalloc1(B->cmap->N,&ab_dense);
219: c->matmult_abdense = ab_dense;
220: } else {
221: ab_dense = c->matmult_abdense;
222: }
224: /* clean old values in C */
225: PetscArrayzero(ca,ci[cm]);
226: /* Traverse A row-wise. */
227: /* Build the ith row in C by summing over nonzero columns in A, */
228: /* the rows of B corresponding to nonzeros of A. */
229: for (i=0; i<am; i++) {
230: anzi = ai[i+1] - ai[i];
231: for (j=0; j<anzi; j++) {
232: brow = aj[j];
233: bnzi = bi[brow+1] - bi[brow];
234: bjj = bj + bi[brow];
235: baj = ba + bi[brow];
236: /* perform dense axpy */
237: valtmp = aa[j];
238: for (k=0; k<bnzi; k++) {
239: ab_dense[bjj[k]] += valtmp*baj[k];
240: }
241: flops += 2*bnzi;
242: }
243: aj += anzi; aa += anzi;
245: cnzi = ci[i+1] - ci[i];
246: for (k=0; k<cnzi; k++) {
247: ca[k] += ab_dense[cj[k]];
248: ab_dense[cj[k]] = 0.0; /* zero ab_dense */
249: }
250: flops += cnzi;
251: cj += cnzi; ca += cnzi;
252: }
253: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
254: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
255: PetscLogFlops(flops);
256: return(0);
257: }
259: PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(Mat A,Mat B,Mat C)
260: {
262: PetscLogDouble flops=0.0;
263: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
264: Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data;
265: Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data;
266: PetscInt *ai = a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bjj,*ci=c->i,*cj=c->j;
267: PetscInt am = A->rmap->N,cm=C->rmap->N;
268: PetscInt i,j,k,anzi,bnzi,cnzi,brow;
269: PetscScalar *aa=a->a,*ba=b->a,*baj,*ca=c->a,valtmp;
270: PetscInt nextb;
273: if (!ca) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */
274: PetscMalloc1(ci[cm]+1,&ca);
275: c->a = ca;
276: c->free_a = PETSC_TRUE;
277: }
279: /* clean old values in C */
280: PetscArrayzero(ca,ci[cm]);
281: /* Traverse A row-wise. */
282: /* Build the ith row in C by summing over nonzero columns in A, */
283: /* the rows of B corresponding to nonzeros of A. */
284: for (i=0; i<am; i++) {
285: anzi = ai[i+1] - ai[i];
286: cnzi = ci[i+1] - ci[i];
287: for (j=0; j<anzi; j++) {
288: brow = aj[j];
289: bnzi = bi[brow+1] - bi[brow];
290: bjj = bj + bi[brow];
291: baj = ba + bi[brow];
292: /* perform sparse axpy */
293: valtmp = aa[j];
294: nextb = 0;
295: for (k=0; nextb<bnzi; k++) {
296: if (cj[k] == bjj[nextb]) { /* ccol == bcol */
297: ca[k] += valtmp*baj[nextb++];
298: }
299: }
300: flops += 2*bnzi;
301: }
302: aj += anzi; aa += anzi;
303: cj += cnzi; ca += cnzi;
304: }
306: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
307: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
308: PetscLogFlops(flops);
309: return(0);
310: }
312: PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(Mat A,Mat B,PetscReal fill,Mat *C)
313: {
314: PetscErrorCode ierr;
315: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
316: PetscInt *ai = a->i,*bi=b->i,*ci,*cj;
317: PetscInt am = A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
318: MatScalar *ca;
319: PetscReal afill;
320: PetscInt i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax;
321: PetscTable ta;
322: PetscFreeSpaceList free_space=NULL,current_space=NULL;
325: /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_fast() */
326: /*-----------------------------------------------------------------------------------------*/
327: /* Allocate arrays for fill computation and free space for accumulating nonzero column */
328: PetscMalloc1(am+2,&ci);
329: ci[0] = 0;
331: /* create and initialize a linked list */
332: PetscTableCreate(bn,bn,&ta);
333: MatRowMergeMax_SeqAIJ(b,bm,ta);
334: PetscTableGetCount(ta,&Crmax);
335: PetscTableDestroy(&ta);
337: PetscLLCondensedCreate_fast(Crmax,&lnk);
339: /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
340: PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);
341: current_space = free_space;
343: /* Determine ci and cj */
344: for (i=0; i<am; i++) {
345: anzi = ai[i+1] - ai[i];
346: aj = a->j + ai[i];
347: for (j=0; j<anzi; j++) {
348: brow = aj[j];
349: bnzj = bi[brow+1] - bi[brow];
350: bj = b->j + bi[brow];
351: /* add non-zero cols of B into the sorted linked list lnk */
352: PetscLLCondensedAddSorted_fast(bnzj,bj,lnk);
353: }
354: cnzi = lnk[1];
356: /* If free space is not available, make more free space */
357: /* Double the amount of total space in the list */
358: if (current_space->local_remaining<cnzi) {
359: PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),¤t_space);
360: ndouble++;
361: }
363: /* Copy data into free space, then initialize lnk */
364: PetscLLCondensedClean_fast(cnzi,current_space->array,lnk);
366: current_space->array += cnzi;
367: current_space->local_used += cnzi;
368: current_space->local_remaining -= cnzi;
370: ci[i+1] = ci[i] + cnzi;
371: }
373: /* Column indices are in the list of free space */
374: /* Allocate space for cj, initialize cj, and */
375: /* destroy list of free space and other temporary array(s) */
376: PetscMalloc1(ci[am]+1,&cj);
377: PetscFreeSpaceContiguous(&free_space,cj);
378: PetscLLCondensedDestroy_fast(lnk);
380: /* Allocate space for ca */
381: PetscCalloc1(ci[am]+1,&ca);
383: /* put together the new symbolic matrix */
384: MatCreateSeqAIJWithArrays(PetscObjectComm((PetscObject)A),am,bn,ci,cj,ca,C);
385: MatSetBlockSizesFromMats(*C,A,B);
386: MatSetType(*C,((PetscObject)A)->type_name);
388: /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
389: /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
390: c = (Mat_SeqAIJ*)((*C)->data);
391: c->free_a = PETSC_TRUE;
392: c->free_ij = PETSC_TRUE;
393: c->nonew = 0;
395: (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable; /* slower, less memory */
397: /* set MatInfo */
398: afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
399: if (afill < 1.0) afill = 1.0;
400: c->maxnz = ci[am];
401: c->nz = ci[am];
402: (*C)->info.mallocs = ndouble;
403: (*C)->info.fill_ratio_given = fill;
404: (*C)->info.fill_ratio_needed = afill;
406: #if defined(PETSC_USE_INFO)
407: if (ci[am]) {
408: PetscInfo3((*C),"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);
409: PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);
410: } else {
411: PetscInfo((*C),"Empty matrix product\n");
412: }
413: #endif
414: return(0);
415: }
418: PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(Mat A,Mat B,PetscReal fill,Mat *C)
419: {
420: PetscErrorCode ierr;
421: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
422: PetscInt *ai = a->i,*bi=b->i,*ci,*cj;
423: PetscInt am = A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
424: MatScalar *ca;
425: PetscReal afill;
426: PetscInt i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax;
427: PetscTable ta;
428: PetscFreeSpaceList free_space=NULL,current_space=NULL;
431: /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_Scalalbe() */
432: /*---------------------------------------------------------------------------------------------*/
433: /* Allocate arrays for fill computation and free space for accumulating nonzero column */
434: PetscMalloc1(am+2,&ci);
435: ci[0] = 0;
437: /* create and initialize a linked list */
438: PetscTableCreate(bn,bn,&ta);
439: MatRowMergeMax_SeqAIJ(b,bm,ta);
440: PetscTableGetCount(ta,&Crmax);
441: PetscTableDestroy(&ta);
442: PetscLLCondensedCreate_Scalable(Crmax,&lnk);
444: /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
445: PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);
446: current_space = free_space;
448: /* Determine ci and cj */
449: for (i=0; i<am; i++) {
450: anzi = ai[i+1] - ai[i];
451: aj = a->j + ai[i];
452: for (j=0; j<anzi; j++) {
453: brow = aj[j];
454: bnzj = bi[brow+1] - bi[brow];
455: bj = b->j + bi[brow];
456: /* add non-zero cols of B into the sorted linked list lnk */
457: PetscLLCondensedAddSorted_Scalable(bnzj,bj,lnk);
458: }
459: cnzi = lnk[0];
461: /* If free space is not available, make more free space */
462: /* Double the amount of total space in the list */
463: if (current_space->local_remaining<cnzi) {
464: PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),¤t_space);
465: ndouble++;
466: }
468: /* Copy data into free space, then initialize lnk */
469: PetscLLCondensedClean_Scalable(cnzi,current_space->array,lnk);
471: current_space->array += cnzi;
472: current_space->local_used += cnzi;
473: current_space->local_remaining -= cnzi;
475: ci[i+1] = ci[i] + cnzi;
476: }
478: /* Column indices are in the list of free space */
479: /* Allocate space for cj, initialize cj, and */
480: /* destroy list of free space and other temporary array(s) */
481: PetscMalloc1(ci[am]+1,&cj);
482: PetscFreeSpaceContiguous(&free_space,cj);
483: PetscLLCondensedDestroy_Scalable(lnk);
485: /* Allocate space for ca */
486: /*-----------------------*/
487: PetscCalloc1(ci[am]+1,&ca);
489: /* put together the new symbolic matrix */
490: MatCreateSeqAIJWithArrays(PetscObjectComm((PetscObject)A),am,bn,ci,cj,ca,C);
491: MatSetBlockSizesFromMats(*C,A,B);
492: MatSetType(*C,((PetscObject)A)->type_name);
494: /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
495: /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
496: c = (Mat_SeqAIJ*)((*C)->data);
497: c->free_a = PETSC_TRUE;
498: c->free_ij = PETSC_TRUE;
499: c->nonew = 0;
501: (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable; /* slower, less memory */
503: /* set MatInfo */
504: afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
505: if (afill < 1.0) afill = 1.0;
506: c->maxnz = ci[am];
507: c->nz = ci[am];
508: (*C)->info.mallocs = ndouble;
509: (*C)->info.fill_ratio_given = fill;
510: (*C)->info.fill_ratio_needed = afill;
512: #if defined(PETSC_USE_INFO)
513: if (ci[am]) {
514: PetscInfo3((*C),"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);
515: PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);
516: } else {
517: PetscInfo((*C),"Empty matrix product\n");
518: }
519: #endif
520: return(0);
521: }
523: PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(Mat A,Mat B,PetscReal fill,Mat *C)
524: {
525: PetscErrorCode ierr;
526: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
527: const PetscInt *ai=a->i,*bi=b->i,*aj=a->j,*bj=b->j;
528: PetscInt *ci,*cj,*bb;
529: PetscInt am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
530: PetscReal afill;
531: PetscInt i,j,col,ndouble = 0;
532: PetscFreeSpaceList free_space=NULL,current_space=NULL;
533: PetscHeap h;
536: /* Get ci and cj - by merging sorted rows using a heap */
537: /*---------------------------------------------------------------------------------------------*/
538: /* Allocate arrays for fill computation and free space for accumulating nonzero column */
539: PetscMalloc1(am+2,&ci);
540: ci[0] = 0;
542: /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
543: PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);
544: current_space = free_space;
546: PetscHeapCreate(a->rmax,&h);
547: PetscMalloc1(a->rmax,&bb);
549: /* Determine ci and cj */
550: for (i=0; i<am; i++) {
551: const PetscInt anzi = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
552: const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
553: ci[i+1] = ci[i];
554: /* Populate the min heap */
555: for (j=0; j<anzi; j++) {
556: bb[j] = bi[acol[j]]; /* bb points at the start of the row */
557: if (bb[j] < bi[acol[j]+1]) { /* Add if row is nonempty */
558: PetscHeapAdd(h,j,bj[bb[j]++]);
559: }
560: }
561: /* Pick off the min element, adding it to free space */
562: PetscHeapPop(h,&j,&col);
563: while (j >= 0) {
564: if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */
565: PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2,current_space->total_array_size),16 << 20),¤t_space);
566: ndouble++;
567: }
568: *(current_space->array++) = col;
569: current_space->local_used++;
570: current_space->local_remaining--;
571: ci[i+1]++;
573: /* stash if anything else remains in this row of B */
574: if (bb[j] < bi[acol[j]+1]) {PetscHeapStash(h,j,bj[bb[j]++]);}
575: while (1) { /* pop and stash any other rows of B that also had an entry in this column */
576: PetscInt j2,col2;
577: PetscHeapPeek(h,&j2,&col2);
578: if (col2 != col) break;
579: PetscHeapPop(h,&j2,&col2);
580: if (bb[j2] < bi[acol[j2]+1]) {PetscHeapStash(h,j2,bj[bb[j2]++]);}
581: }
582: /* Put any stashed elements back into the min heap */
583: PetscHeapUnstash(h);
584: PetscHeapPop(h,&j,&col);
585: }
586: }
587: PetscFree(bb);
588: PetscHeapDestroy(&h);
590: /* Column indices are in the list of free space */
591: /* Allocate space for cj, initialize cj, and */
592: /* destroy list of free space and other temporary array(s) */
593: PetscMalloc1(ci[am],&cj);
594: PetscFreeSpaceContiguous(&free_space,cj);
596: /* put together the new symbolic matrix */
597: MatCreateSeqAIJWithArrays(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,C);
598: MatSetBlockSizesFromMats(*C,A,B);
599: MatSetType(*C,((PetscObject)A)->type_name);
601: /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
602: /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
603: c = (Mat_SeqAIJ*)((*C)->data);
604: c->free_a = PETSC_TRUE;
605: c->free_ij = PETSC_TRUE;
606: c->nonew = 0;
608: (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
610: /* set MatInfo */
611: afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
612: if (afill < 1.0) afill = 1.0;
613: c->maxnz = ci[am];
614: c->nz = ci[am];
615: (*C)->info.mallocs = ndouble;
616: (*C)->info.fill_ratio_given = fill;
617: (*C)->info.fill_ratio_needed = afill;
619: #if defined(PETSC_USE_INFO)
620: if (ci[am]) {
621: PetscInfo3((*C),"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);
622: PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);
623: } else {
624: PetscInfo((*C),"Empty matrix product\n");
625: }
626: #endif
627: return(0);
628: }
630: PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(Mat A,Mat B,PetscReal fill,Mat *C)
631: {
632: PetscErrorCode ierr;
633: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
634: const PetscInt *ai = a->i,*bi=b->i,*aj=a->j,*bj=b->j;
635: PetscInt *ci,*cj,*bb;
636: PetscInt am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
637: PetscReal afill;
638: PetscInt i,j,col,ndouble = 0;
639: PetscFreeSpaceList free_space=NULL,current_space=NULL;
640: PetscHeap h;
641: PetscBT bt;
644: /* Get ci and cj - using a heap for the sorted rows, but use BT so that each index is only added once */
645: /*---------------------------------------------------------------------------------------------*/
646: /* Allocate arrays for fill computation and free space for accumulating nonzero column */
647: PetscMalloc1(am+2,&ci);
648: ci[0] = 0;
650: /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
651: PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space);
653: current_space = free_space;
655: PetscHeapCreate(a->rmax,&h);
656: PetscMalloc1(a->rmax,&bb);
657: PetscBTCreate(bn,&bt);
659: /* Determine ci and cj */
660: for (i=0; i<am; i++) {
661: const PetscInt anzi = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
662: const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
663: const PetscInt *fptr = current_space->array; /* Save beginning of the row so we can clear the BT later */
664: ci[i+1] = ci[i];
665: /* Populate the min heap */
666: for (j=0; j<anzi; j++) {
667: PetscInt brow = acol[j];
668: for (bb[j] = bi[brow]; bb[j] < bi[brow+1]; bb[j]++) {
669: PetscInt bcol = bj[bb[j]];
670: if (!PetscBTLookupSet(bt,bcol)) { /* new entry */
671: PetscHeapAdd(h,j,bcol);
672: bb[j]++;
673: break;
674: }
675: }
676: }
677: /* Pick off the min element, adding it to free space */
678: PetscHeapPop(h,&j,&col);
679: while (j >= 0) {
680: if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */
681: fptr = NULL; /* need PetscBTMemzero */
682: PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2,current_space->total_array_size),16 << 20),¤t_space);
683: ndouble++;
684: }
685: *(current_space->array++) = col;
686: current_space->local_used++;
687: current_space->local_remaining--;
688: ci[i+1]++;
690: /* stash if anything else remains in this row of B */
691: for (; bb[j] < bi[acol[j]+1]; bb[j]++) {
692: PetscInt bcol = bj[bb[j]];
693: if (!PetscBTLookupSet(bt,bcol)) { /* new entry */
694: PetscHeapAdd(h,j,bcol);
695: bb[j]++;
696: break;
697: }
698: }
699: PetscHeapPop(h,&j,&col);
700: }
701: if (fptr) { /* Clear the bits for this row */
702: for (; fptr<current_space->array; fptr++) {PetscBTClear(bt,*fptr);}
703: } else { /* We reallocated so we don't remember (easily) how to clear only the bits we changed */
704: PetscBTMemzero(bn,bt);
705: }
706: }
707: PetscFree(bb);
708: PetscHeapDestroy(&h);
709: PetscBTDestroy(&bt);
711: /* Column indices are in the list of free space */
712: /* Allocate space for cj, initialize cj, and */
713: /* destroy list of free space and other temporary array(s) */
714: PetscMalloc1(ci[am],&cj);
715: PetscFreeSpaceContiguous(&free_space,cj);
717: /* put together the new symbolic matrix */
718: MatCreateSeqAIJWithArrays(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,C);
719: MatSetBlockSizesFromMats(*C,A,B);
720: MatSetType(*C,((PetscObject)A)->type_name);
722: /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
723: /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
724: c = (Mat_SeqAIJ*)((*C)->data);
725: c->free_a = PETSC_TRUE;
726: c->free_ij = PETSC_TRUE;
727: c->nonew = 0;
729: (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
731: /* set MatInfo */
732: afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
733: if (afill < 1.0) afill = 1.0;
734: c->maxnz = ci[am];
735: c->nz = ci[am];
736: (*C)->info.mallocs = ndouble;
737: (*C)->info.fill_ratio_given = fill;
738: (*C)->info.fill_ratio_needed = afill;
740: #if defined(PETSC_USE_INFO)
741: if (ci[am]) {
742: PetscInfo3((*C),"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);
743: PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);
744: } else {
745: PetscInfo((*C),"Empty matrix product\n");
746: }
747: #endif
748: return(0);
749: }
752: PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(Mat A,Mat B,PetscReal fill,Mat *C)
753: {
754: PetscErrorCode ierr;
755: Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
756: const PetscInt *ai=a->i,*bi=b->i,*aj=a->j,*bj=b->j,*inputi,*inputj,*inputcol,*inputcol_L1;
757: PetscInt *ci,*cj,*outputj,worki_L1[9],worki_L2[9];
758: PetscInt c_maxmem,a_maxrownnz=0,a_rownnz;
759: const PetscInt workcol[8]={0,1,2,3,4,5,6,7};
760: const PetscInt am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
761: const PetscInt *brow_ptr[8],*brow_end[8];
762: PetscInt window[8];
763: PetscInt window_min,old_window_min,ci_nnz,outputi_nnz=0,L1_nrows,L2_nrows;
764: PetscInt i,k,ndouble=0,L1_rowsleft,rowsleft;
765: PetscReal afill;
766: PetscInt *workj_L1,*workj_L2,*workj_L3;
767: PetscInt L1_nnz,L2_nnz;
769: /* Step 1: Get upper bound on memory required for allocation.
770: Because of the way virtual memory works,
771: only the memory pages that are actually needed will be physically allocated. */
773: PetscMalloc1(am+1,&ci);
774: for (i=0; i<am; i++) {
775: const PetscInt anzi = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
776: const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
777: a_rownnz = 0;
778: for (k=0; k<anzi; ++k) {
779: a_rownnz += bi[acol[k]+1] - bi[acol[k]];
780: if (a_rownnz > bn) {
781: a_rownnz = bn;
782: break;
783: }
784: }
785: a_maxrownnz = PetscMax(a_maxrownnz, a_rownnz);
786: }
787: /* temporary work areas for merging rows */
788: PetscMalloc1(a_maxrownnz*8,&workj_L1);
789: PetscMalloc1(a_maxrownnz*8,&workj_L2);
790: PetscMalloc1(a_maxrownnz,&workj_L3);
792: /* This should be enough for almost all matrices. If not, memory is reallocated later. */
793: c_maxmem = 8*(ai[am]+bi[bm]);
794: /* Step 2: Populate pattern for C */
795: PetscMalloc1(c_maxmem,&cj);
797: ci_nnz = 0;
798: ci[0] = 0;
799: worki_L1[0] = 0;
800: worki_L2[0] = 0;
801: for (i=0; i<am; i++) {
802: const PetscInt anzi = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
803: const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
804: rowsleft = anzi;
805: inputcol_L1 = acol;
806: L2_nnz = 0;
807: L2_nrows = 1; /* Number of rows to be merged on Level 3. output of L3 already exists -> initial value 1 */
808: worki_L2[1] = 0;
809: outputi_nnz = 0;
811: /* If the number of indices in C so far + the max number of columns in the next row > c_maxmem -> allocate more memory */
812: while (ci_nnz+a_maxrownnz > c_maxmem) {
813: c_maxmem *= 2;
814: ndouble++;
815: PetscRealloc(sizeof(PetscInt)*c_maxmem,&cj);
816: }
818: while (rowsleft) {
819: L1_rowsleft = PetscMin(64, rowsleft); /* In the inner loop max 64 rows of B can be merged */
820: L1_nrows = 0;
821: L1_nnz = 0;
822: inputcol = inputcol_L1;
823: inputi = bi;
824: inputj = bj;
826: /* The following macro is used to specialize for small rows in A.
827: This helps with compiler unrolling, improving performance substantially.
828: Input: inputj inputi inputcol bn
829: Output: outputj outputi_nnz */
830: #define MatMatMultSymbolic_RowMergeMacro(ANNZ) \
831: window_min = bn; \
832: outputi_nnz = 0; \
833: for (k=0; k<ANNZ; ++k) { \
834: brow_ptr[k] = inputj + inputi[inputcol[k]]; \
835: brow_end[k] = inputj + inputi[inputcol[k]+1]; \
836: window[k] = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn; \
837: window_min = PetscMin(window[k], window_min); \
838: } \
839: while (window_min < bn) { \
840: outputj[outputi_nnz++] = window_min; \
841: /* advance front and compute new minimum */ \
842: old_window_min = window_min; \
843: window_min = bn; \
844: for (k=0; k<ANNZ; ++k) { \
845: if (window[k] == old_window_min) { \
846: brow_ptr[k]++; \
847: window[k] = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn; \
848: } \
849: window_min = PetscMin(window[k], window_min); \
850: } \
851: }
853: /************** L E V E L 1 ***************/
854: /* Merge up to 8 rows of B to L1 work array*/
855: while (L1_rowsleft) {
856: outputi_nnz = 0;
857: if (anzi > 8) outputj = workj_L1 + L1_nnz; /* Level 1 rowmerge*/
858: else outputj = cj + ci_nnz; /* Merge directly to C */
860: switch (L1_rowsleft) {
861: case 1: brow_ptr[0] = inputj + inputi[inputcol[0]];
862: brow_end[0] = inputj + inputi[inputcol[0]+1];
863: for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
864: inputcol += L1_rowsleft;
865: rowsleft -= L1_rowsleft;
866: L1_rowsleft = 0;
867: break;
868: case 2: MatMatMultSymbolic_RowMergeMacro(2);
869: inputcol += L1_rowsleft;
870: rowsleft -= L1_rowsleft;
871: L1_rowsleft = 0;
872: break;
873: case 3: MatMatMultSymbolic_RowMergeMacro(3);
874: inputcol += L1_rowsleft;
875: rowsleft -= L1_rowsleft;
876: L1_rowsleft = 0;
877: break;
878: case 4: MatMatMultSymbolic_RowMergeMacro(4);
879: inputcol += L1_rowsleft;
880: rowsleft -= L1_rowsleft;
881: L1_rowsleft = 0;
882: break;
883: case 5: MatMatMultSymbolic_RowMergeMacro(5);
884: inputcol += L1_rowsleft;
885: rowsleft -= L1_rowsleft;
886: L1_rowsleft = 0;
887: break;
888: case 6: MatMatMultSymbolic_RowMergeMacro(6);
889: inputcol += L1_rowsleft;
890: rowsleft -= L1_rowsleft;
891: L1_rowsleft = 0;
892: break;
893: case 7: MatMatMultSymbolic_RowMergeMacro(7);
894: inputcol += L1_rowsleft;
895: rowsleft -= L1_rowsleft;
896: L1_rowsleft = 0;
897: break;
898: default: MatMatMultSymbolic_RowMergeMacro(8);
899: inputcol += 8;
900: rowsleft -= 8;
901: L1_rowsleft -= 8;
902: break;
903: }
904: inputcol_L1 = inputcol;
905: L1_nnz += outputi_nnz;
906: worki_L1[++L1_nrows] = L1_nnz;
907: }
909: /********************** L E V E L 2 ************************/
910: /* Merge from L1 work array to either C or to L2 work array */
911: if (anzi > 8) {
912: inputi = worki_L1;
913: inputj = workj_L1;
914: inputcol = workcol;
915: outputi_nnz = 0;
917: if (anzi <= 64) outputj = cj + ci_nnz; /* Merge from L1 work array to C */
918: else outputj = workj_L2 + L2_nnz; /* Merge from L1 work array to L2 work array */
920: switch (L1_nrows) {
921: case 1: brow_ptr[0] = inputj + inputi[inputcol[0]];
922: brow_end[0] = inputj + inputi[inputcol[0]+1];
923: for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
924: break;
925: case 2: MatMatMultSymbolic_RowMergeMacro(2); break;
926: case 3: MatMatMultSymbolic_RowMergeMacro(3); break;
927: case 4: MatMatMultSymbolic_RowMergeMacro(4); break;
928: case 5: MatMatMultSymbolic_RowMergeMacro(5); break;
929: case 6: MatMatMultSymbolic_RowMergeMacro(6); break;
930: case 7: MatMatMultSymbolic_RowMergeMacro(7); break;
931: case 8: MatMatMultSymbolic_RowMergeMacro(8); break;
932: default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatMatMult logic error: Not merging 1-8 rows from L1 work array!");
933: }
934: L2_nnz += outputi_nnz;
935: worki_L2[++L2_nrows] = L2_nnz;
937: /************************ L E V E L 3 **********************/
938: /* Merge from L2 work array to either C or to L2 work array */
939: if (anzi > 64 && (L2_nrows == 8 || rowsleft == 0)) {
940: inputi = worki_L2;
941: inputj = workj_L2;
942: inputcol = workcol;
943: outputi_nnz = 0;
944: if (rowsleft) outputj = workj_L3;
945: else outputj = cj + ci_nnz;
946: switch (L2_nrows) {
947: case 1: brow_ptr[0] = inputj + inputi[inputcol[0]];
948: brow_end[0] = inputj + inputi[inputcol[0]+1];
949: for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
950: break;
951: case 2: MatMatMultSymbolic_RowMergeMacro(2); break;
952: case 3: MatMatMultSymbolic_RowMergeMacro(3); break;
953: case 4: MatMatMultSymbolic_RowMergeMacro(4); break;
954: case 5: MatMatMultSymbolic_RowMergeMacro(5); break;
955: case 6: MatMatMultSymbolic_RowMergeMacro(6); break;
956: case 7: MatMatMultSymbolic_RowMergeMacro(7); break;
957: case 8: MatMatMultSymbolic_RowMergeMacro(8); break;
958: default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatMatMult logic error: Not merging 1-8 rows from L2 work array!");
959: }
960: L2_nrows = 1;
961: L2_nnz = outputi_nnz;
962: worki_L2[1] = outputi_nnz;
963: /* Copy to workj_L2 */
964: if (rowsleft) {
965: for (k=0; k<outputi_nnz; ++k) workj_L2[k] = outputj[k];
966: }
967: }
968: }
969: } /* while (rowsleft) */
970: #undef MatMatMultSymbolic_RowMergeMacro
972: /* terminate current row */
973: ci_nnz += outputi_nnz;
974: ci[i+1] = ci_nnz;
975: }
977: /* Step 3: Create the new symbolic matrix */
978: MatCreateSeqAIJWithArrays(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,C);
979: MatSetBlockSizesFromMats(*C,A,B);
980: MatSetType(*C,((PetscObject)A)->type_name);
982: /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
983: /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
984: c = (Mat_SeqAIJ*)((*C)->data);
985: c->free_a = PETSC_TRUE;
986: c->free_ij = PETSC_TRUE;
987: c->nonew = 0;
989: (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
991: /* set MatInfo */
992: afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
993: if (afill < 1.0) afill = 1.0;
994: c->maxnz = ci[am];
995: c->nz = ci[am];
996: (*C)->info.mallocs = ndouble;
997: (*C)->info.fill_ratio_given = fill;
998: (*C)->info.fill_ratio_needed = afill;
1000: #if defined(PETSC_USE_INFO)
1001: if (ci[am]) {
1002: PetscInfo3((*C),"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);
1003: PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);
1004: } else {
1005: PetscInfo((*C),"Empty matrix product\n");
1006: }
1007: #endif
1009: /* Step 4: Free temporary work areas */
1010: PetscFree(workj_L1);
1011: PetscFree(workj_L2);
1012: PetscFree(workj_L3);
1013: return(0);
1014: }
1016: /* concatenate unique entries and then sort */
1017: PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(Mat A,Mat B,PetscReal fill,Mat *C)
1018: {
1019: PetscErrorCode ierr;
1020: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
1021: const PetscInt *ai = a->i,*bi=b->i,*aj=a->j,*bj=b->j;
1022: PetscInt *ci,*cj;
1023: PetscInt am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
1024: PetscReal afill;
1025: PetscInt i,j,ndouble = 0;
1026: PetscSegBuffer seg,segrow;
1027: char *seen;
1030: PetscMalloc1(am+1,&ci);
1031: ci[0] = 0;
1033: /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
1034: PetscSegBufferCreate(sizeof(PetscInt),(PetscInt)(fill*(ai[am]+bi[bm])),&seg);
1035: PetscSegBufferCreate(sizeof(PetscInt),100,&segrow);
1036: PetscCalloc1(bn,&seen);
1038: /* Determine ci and cj */
1039: for (i=0; i<am; i++) {
1040: const PetscInt anzi = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
1041: const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
1042: PetscInt packlen = 0,*PETSC_RESTRICT crow;
1043: /* Pack segrow */
1044: for (j=0; j<anzi; j++) {
1045: PetscInt brow = acol[j],bjstart = bi[brow],bjend = bi[brow+1],k;
1046: for (k=bjstart; k<bjend; k++) {
1047: PetscInt bcol = bj[k];
1048: if (!seen[bcol]) { /* new entry */
1049: PetscInt *PETSC_RESTRICT slot;
1050: PetscSegBufferGetInts(segrow,1,&slot);
1051: *slot = bcol;
1052: seen[bcol] = 1;
1053: packlen++;
1054: }
1055: }
1056: }
1057: PetscSegBufferGetInts(seg,packlen,&crow);
1058: PetscSegBufferExtractTo(segrow,crow);
1059: PetscSortInt(packlen,crow);
1060: ci[i+1] = ci[i] + packlen;
1061: for (j=0; j<packlen; j++) seen[crow[j]] = 0;
1062: }
1063: PetscSegBufferDestroy(&segrow);
1064: PetscFree(seen);
1066: /* Column indices are in the segmented buffer */
1067: PetscSegBufferExtractAlloc(seg,&cj);
1068: PetscSegBufferDestroy(&seg);
1070: /* put together the new symbolic matrix */
1071: MatCreateSeqAIJWithArrays(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,C);
1072: MatSetBlockSizesFromMats(*C,A,B);
1073: MatSetType(*C,((PetscObject)A)->type_name);
1075: /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
1076: /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
1077: c = (Mat_SeqAIJ*)((*C)->data);
1078: c->free_a = PETSC_TRUE;
1079: c->free_ij = PETSC_TRUE;
1080: c->nonew = 0;
1082: (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
1084: /* set MatInfo */
1085: afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
1086: if (afill < 1.0) afill = 1.0;
1087: c->maxnz = ci[am];
1088: c->nz = ci[am];
1089: (*C)->info.mallocs = ndouble;
1090: (*C)->info.fill_ratio_given = fill;
1091: (*C)->info.fill_ratio_needed = afill;
1093: #if defined(PETSC_USE_INFO)
1094: if (ci[am]) {
1095: PetscInfo3((*C),"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);
1096: PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);
1097: } else {
1098: PetscInfo((*C),"Empty matrix product\n");
1099: }
1100: #endif
1101: return(0);
1102: }
1104: /* This routine is not used. Should be removed! */
1105: PetscErrorCode MatMatTransposeMult_SeqAIJ_SeqAIJ(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
1106: {
1110: if (scall == MAT_INITIAL_MATRIX) {
1111: PetscLogEventBegin(MAT_MatTransposeMultSymbolic,A,B,0,0);
1112: MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ(A,B,fill,C);
1113: PetscLogEventEnd(MAT_MatTransposeMultSymbolic,A,B,0,0);
1114: }
1115: PetscLogEventBegin(MAT_MatTransposeMultNumeric,A,B,0,0);
1116: MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ(A,B,*C);
1117: PetscLogEventEnd(MAT_MatTransposeMultNumeric,A,B,0,0);
1118: return(0);
1119: }
1121: PetscErrorCode MatDestroy_SeqAIJ_MatMatMultTrans(Mat A)
1122: {
1123: PetscErrorCode ierr;
1124: Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data;
1125: Mat_MatMatTransMult *abt=a->abt;
1128: (abt->destroy)(A);
1129: MatTransposeColoringDestroy(&abt->matcoloring);
1130: MatDestroy(&abt->Bt_den);
1131: MatDestroy(&abt->ABt_den);
1132: PetscFree(abt);
1133: return(0);
1134: }
1136: PetscErrorCode MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat *C)
1137: {
1138: PetscErrorCode ierr;
1139: Mat Bt;
1140: PetscInt *bti,*btj;
1141: Mat_MatMatTransMult *abt;
1142: Mat_SeqAIJ *c;
1145: /* create symbolic Bt */
1146: MatGetSymbolicTranspose_SeqAIJ(B,&bti,&btj);
1147: MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,B->cmap->n,B->rmap->n,bti,btj,NULL,&Bt);
1148: MatSetBlockSizes(Bt,PetscAbs(A->cmap->bs),PetscAbs(B->cmap->bs));
1149: MatSetType(Bt,((PetscObject)A)->type_name);
1151: /* get symbolic C=A*Bt */
1152: MatMatMultSymbolic_SeqAIJ_SeqAIJ(A,Bt,fill,C);
1154: /* create a supporting struct for reuse intermidiate dense matrices with matcoloring */
1155: PetscNew(&abt);
1156: c = (Mat_SeqAIJ*)(*C)->data;
1157: c->abt = abt;
1159: abt->usecoloring = PETSC_FALSE;
1160: abt->destroy = (*C)->ops->destroy;
1161: (*C)->ops->destroy = MatDestroy_SeqAIJ_MatMatMultTrans;
1163: PetscOptionsGetBool(((PetscObject)A)->options,NULL,"-matmattransmult_color",&abt->usecoloring,NULL);
1164: if (abt->usecoloring) {
1165: /* Create MatTransposeColoring from symbolic C=A*B^T */
1166: MatTransposeColoring matcoloring;
1167: MatColoring coloring;
1168: ISColoring iscoloring;
1169: Mat Bt_dense,C_dense;
1170: Mat_SeqAIJ *c=(Mat_SeqAIJ*)(*C)->data;
1171: /* inode causes memory problem, don't know why */
1172: if (c->inode.use) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_USE_INODES is not supported. Use '-mat_no_inode'");
1174: MatColoringCreate(*C,&coloring);
1175: MatColoringSetDistance(coloring,2);
1176: MatColoringSetType(coloring,MATCOLORINGSL);
1177: MatColoringSetFromOptions(coloring);
1178: MatColoringApply(coloring,&iscoloring);
1179: MatColoringDestroy(&coloring);
1180: MatTransposeColoringCreate(*C,iscoloring,&matcoloring);
1182: abt->matcoloring = matcoloring;
1184: ISColoringDestroy(&iscoloring);
1186: /* Create Bt_dense and C_dense = A*Bt_dense */
1187: MatCreate(PETSC_COMM_SELF,&Bt_dense);
1188: MatSetSizes(Bt_dense,A->cmap->n,matcoloring->ncolors,A->cmap->n,matcoloring->ncolors);
1189: MatSetType(Bt_dense,MATSEQDENSE);
1190: MatSeqDenseSetPreallocation(Bt_dense,NULL);
1192: Bt_dense->assembled = PETSC_TRUE;
1193: abt->Bt_den = Bt_dense;
1195: MatCreate(PETSC_COMM_SELF,&C_dense);
1196: MatSetSizes(C_dense,A->rmap->n,matcoloring->ncolors,A->rmap->n,matcoloring->ncolors);
1197: MatSetType(C_dense,MATSEQDENSE);
1198: MatSeqDenseSetPreallocation(C_dense,NULL);
1200: Bt_dense->assembled = PETSC_TRUE;
1201: abt->ABt_den = C_dense;
1203: #if defined(PETSC_USE_INFO)
1204: {
1205: Mat_SeqAIJ *c = (Mat_SeqAIJ*)(*C)->data;
1206: PetscInfo7(*C,"Use coloring of C=A*B^T; B^T: %D %D, Bt_dense: %D,%D; Cnz %D / (cm*ncolors %D) = %g\n",B->cmap->n,B->rmap->n,Bt_dense->rmap->n,Bt_dense->cmap->n,c->nz,A->rmap->n*matcoloring->ncolors,(PetscReal)(c->nz)/(A->rmap->n*matcoloring->ncolors));
1207: }
1208: #endif
1209: }
1210: /* clean up */
1211: MatDestroy(&Bt);
1212: MatRestoreSymbolicTranspose_SeqAIJ(B,&bti,&btj);
1213: return(0);
1214: }
1216: PetscErrorCode MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
1217: {
1218: PetscErrorCode ierr;
1219: Mat_SeqAIJ *a =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c=(Mat_SeqAIJ*)C->data;
1220: PetscInt *ai =a->i,*aj=a->j,*bi=b->i,*bj=b->j,anzi,bnzj,nexta,nextb,*acol,*bcol,brow;
1221: PetscInt cm =C->rmap->n,*ci=c->i,*cj=c->j,i,j,cnzi,*ccol;
1222: PetscLogDouble flops=0.0;
1223: MatScalar *aa =a->a,*aval,*ba=b->a,*bval,*ca,*cval;
1224: Mat_MatMatTransMult *abt = c->abt;
1227: /* clear old values in C */
1228: if (!c->a) {
1229: PetscCalloc1(ci[cm]+1,&ca);
1230: c->a = ca;
1231: c->free_a = PETSC_TRUE;
1232: } else {
1233: ca = c->a;
1234: PetscArrayzero(ca,ci[cm]+1);
1235: }
1237: if (abt->usecoloring) {
1238: MatTransposeColoring matcoloring = abt->matcoloring;
1239: Mat Bt_dense,C_dense = abt->ABt_den;
1241: /* Get Bt_dense by Apply MatTransposeColoring to B */
1242: Bt_dense = abt->Bt_den;
1243: MatTransColoringApplySpToDen(matcoloring,B,Bt_dense);
1245: /* C_dense = A*Bt_dense */
1246: MatMatMultNumeric_SeqAIJ_SeqDense(A,Bt_dense,C_dense);
1248: /* Recover C from C_dense */
1249: MatTransColoringApplyDenToSp(matcoloring,C_dense,C);
1250: return(0);
1251: }
1253: for (i=0; i<cm; i++) {
1254: anzi = ai[i+1] - ai[i];
1255: acol = aj + ai[i];
1256: aval = aa + ai[i];
1257: cnzi = ci[i+1] - ci[i];
1258: ccol = cj + ci[i];
1259: cval = ca + ci[i];
1260: for (j=0; j<cnzi; j++) {
1261: brow = ccol[j];
1262: bnzj = bi[brow+1] - bi[brow];
1263: bcol = bj + bi[brow];
1264: bval = ba + bi[brow];
1266: /* perform sparse inner-product c(i,j)=A[i,:]*B[j,:]^T */
1267: nexta = 0; nextb = 0;
1268: while (nexta<anzi && nextb<bnzj) {
1269: while (nexta < anzi && acol[nexta] < bcol[nextb]) nexta++;
1270: if (nexta == anzi) break;
1271: while (nextb < bnzj && acol[nexta] > bcol[nextb]) nextb++;
1272: if (nextb == bnzj) break;
1273: if (acol[nexta] == bcol[nextb]) {
1274: cval[j] += aval[nexta]*bval[nextb];
1275: nexta++; nextb++;
1276: flops += 2;
1277: }
1278: }
1279: }
1280: }
1281: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
1282: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
1283: PetscLogFlops(flops);
1284: return(0);
1285: }
1287: PetscErrorCode MatDestroy_SeqAIJ_MatTransMatMult(Mat A)
1288: {
1289: PetscErrorCode ierr;
1290: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
1291: Mat_MatTransMatMult *atb = a->atb;
1294: if (atb) {
1295: MatDestroy(&atb->At);
1296: (*atb->destroy)(A);
1297: }
1298: PetscFree(atb);
1299: return(0);
1300: }
1302: PetscErrorCode MatTransposeMatMult_SeqAIJ_SeqAIJ(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
1303: {
1304: PetscErrorCode ierr;
1305: const char *algTypes[2] = {"matmatmult","outerproduct"};
1306: PetscInt alg=0; /* set default algorithm */
1307: Mat At;
1308: Mat_MatTransMatMult *atb;
1309: Mat_SeqAIJ *c;
1312: if (scall == MAT_INITIAL_MATRIX) {
1313: PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"MatTransposeMatMult","Mat");
1314: PetscOptionsEList("-mattransposematmult_via","Algorithmic approach","MatTransposeMatMult",algTypes,2,algTypes[0],&alg,NULL);
1315: PetscOptionsEnd();
1317: switch (alg) {
1318: case 1:
1319: MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(A,B,fill,C);
1320: break;
1321: default:
1322: PetscNew(&atb);
1323: MatTranspose_SeqAIJ(A,MAT_INITIAL_MATRIX,&At);
1324: MatMatMult_SeqAIJ_SeqAIJ(At,B,MAT_INITIAL_MATRIX,fill,C);
1326: c = (Mat_SeqAIJ*)(*C)->data;
1327: c->atb = atb;
1328: atb->At = At;
1329: atb->destroy = (*C)->ops->destroy;
1330: (*C)->ops->destroy = MatDestroy_SeqAIJ_MatTransMatMult;
1332: break;
1333: }
1334: }
1335: if (alg) {
1336: (*(*C)->ops->mattransposemultnumeric)(A,B,*C);
1337: } else if (!alg && scall == MAT_REUSE_MATRIX) {
1338: c = (Mat_SeqAIJ*)(*C)->data;
1339: atb = c->atb;
1340: At = atb->At;
1341: MatTranspose_SeqAIJ(A,MAT_REUSE_MATRIX,&At);
1342: MatMatMult_SeqAIJ_SeqAIJ(At,B,MAT_REUSE_MATRIX,fill,C);
1343: }
1344: return(0);
1345: }
1347: PetscErrorCode MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat *C)
1348: {
1350: Mat At;
1351: PetscInt *ati,*atj;
1354: /* create symbolic At */
1355: MatGetSymbolicTranspose_SeqAIJ(A,&ati,&atj);
1356: MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,A->cmap->n,A->rmap->n,ati,atj,NULL,&At);
1357: MatSetBlockSizes(At,PetscAbs(A->cmap->bs),PetscAbs(B->cmap->bs));
1358: MatSetType(At,((PetscObject)A)->type_name);
1360: /* get symbolic C=At*B */
1361: MatMatMultSymbolic_SeqAIJ_SeqAIJ(At,B,fill,C);
1363: /* clean up */
1364: MatDestroy(&At);
1365: MatRestoreSymbolicTranspose_SeqAIJ(A,&ati,&atj);
1367: (*C)->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ;
1368: return(0);
1369: }
1371: PetscErrorCode MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
1372: {
1374: Mat_SeqAIJ *a =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c=(Mat_SeqAIJ*)C->data;
1375: PetscInt am =A->rmap->n,anzi,*ai=a->i,*aj=a->j,*bi=b->i,*bj,bnzi,nextb;
1376: PetscInt cm =C->rmap->n,*ci=c->i,*cj=c->j,crow,*cjj,i,j,k;
1377: PetscLogDouble flops=0.0;
1378: MatScalar *aa =a->a,*ba,*ca,*caj;
1381: if (!c->a) {
1382: PetscCalloc1(ci[cm]+1,&ca);
1384: c->a = ca;
1385: c->free_a = PETSC_TRUE;
1386: } else {
1387: ca = c->a;
1388: PetscArrayzero(ca,ci[cm]);
1389: }
1391: /* compute A^T*B using outer product (A^T)[:,i]*B[i,:] */
1392: for (i=0; i<am; i++) {
1393: bj = b->j + bi[i];
1394: ba = b->a + bi[i];
1395: bnzi = bi[i+1] - bi[i];
1396: anzi = ai[i+1] - ai[i];
1397: for (j=0; j<anzi; j++) {
1398: nextb = 0;
1399: crow = *aj++;
1400: cjj = cj + ci[crow];
1401: caj = ca + ci[crow];
1402: /* perform sparse axpy operation. Note cjj includes bj. */
1403: for (k=0; nextb<bnzi; k++) {
1404: if (cjj[k] == *(bj+nextb)) { /* ccol == bcol */
1405: caj[k] += (*aa)*(*(ba+nextb));
1406: nextb++;
1407: }
1408: }
1409: flops += 2*bnzi;
1410: aa++;
1411: }
1412: }
1414: /* Assemble the final matrix and clean up */
1415: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
1416: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
1417: PetscLogFlops(flops);
1418: return(0);
1419: }
1421: PETSC_INTERN PetscErrorCode MatMatMult_SeqAIJ_SeqDense(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
1422: {
1426: if (scall == MAT_INITIAL_MATRIX) {
1427: PetscLogEventBegin(MAT_MatMultSymbolic,A,B,0,0);
1428: MatMatMultSymbolic_SeqAIJ_SeqDense(A,B,fill,C);
1429: PetscLogEventEnd(MAT_MatMultSymbolic,A,B,0,0);
1430: }
1431: PetscLogEventBegin(MAT_MatMultNumeric,A,B,0,0);
1432: MatMatMultNumeric_SeqAIJ_SeqDense(A,B,*C);
1433: PetscLogEventEnd(MAT_MatMultNumeric,A,B,0,0);
1434: return(0);
1435: }
1437: PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat *C)
1438: {
1442: MatMatMultSymbolic_SeqDense_SeqDense(A,B,0.0,C);
1444: (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqDense;
1445: return(0);
1446: }
1448: PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat A,Mat B,Mat C)
1449: {
1450: Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data;
1451: Mat_SeqDense *bd = (Mat_SeqDense*)B->data;
1452: PetscErrorCode ierr;
1453: PetscScalar *c,r1,r2,r3,r4,*c1,*c2,*c3,*c4,aatmp;
1454: const PetscScalar *aa,*b,*b1,*b2,*b3,*b4,*av;
1455: const PetscInt *aj;
1456: PetscInt cm=C->rmap->n,cn=B->cmap->n,bm=bd->lda,am=A->rmap->n;
1457: PetscInt am4=4*am,bm4=4*bm,col,i,j,n,ajtmp;
1460: if (!cm || !cn) return(0);
1461: MatSeqAIJGetArrayRead(A,&av);
1462: MatDenseGetArray(C,&c);
1463: MatDenseGetArrayRead(B,&b);
1464: b1 = b; b2 = b1 + bm; b3 = b2 + bm; b4 = b3 + bm;
1465: c1 = c; c2 = c1 + am; c3 = c2 + am; c4 = c3 + am;
1466: for (col=0; col<cn-4; col += 4) { /* over columns of C */
1467: for (i=0; i<am; i++) { /* over rows of C in those columns */
1468: r1 = r2 = r3 = r4 = 0.0;
1469: n = a->i[i+1] - a->i[i];
1470: aj = a->j + a->i[i];
1471: aa = av + a->i[i];
1472: for (j=0; j<n; j++) {
1473: aatmp = aa[j]; ajtmp = aj[j];
1474: r1 += aatmp*b1[ajtmp];
1475: r2 += aatmp*b2[ajtmp];
1476: r3 += aatmp*b3[ajtmp];
1477: r4 += aatmp*b4[ajtmp];
1478: }
1479: c1[i] += r1;
1480: c2[i] += r2;
1481: c3[i] += r3;
1482: c4[i] += r4;
1483: }
1484: b1 += bm4; b2 += bm4; b3 += bm4; b4 += bm4;
1485: c1 += am4; c2 += am4; c3 += am4; c4 += am4;
1486: }
1487: for (; col<cn; col++) { /* over extra columns of C */
1488: for (i=0; i<am; i++) { /* over rows of C in those columns */
1489: r1 = 0.0;
1490: n = a->i[i+1] - a->i[i];
1491: aj = a->j + a->i[i];
1492: aa = av + a->i[i];
1493: for (j=0; j<n; j++) {
1494: r1 += aa[j]*b1[aj[j]];
1495: }
1496: c1[i] += r1;
1497: }
1498: b1 += bm;
1499: c1 += am;
1500: }
1501: PetscLogFlops(cn*(2.0*a->nz));
1502: MatDenseRestoreArray(C,&c);
1503: MatDenseRestoreArrayRead(B,&b);
1504: MatSeqAIJRestoreArrayRead(A,&av);
1505: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
1506: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
1507: return(0);
1508: }
1510: PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqDense(Mat A,Mat B,Mat C)
1511: {
1515: if (B->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in A %D not equal rows in B %D\n",A->cmap->n,B->rmap->n);
1516: if (A->rmap->n != C->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number rows in C %D not equal rows in A %D\n",C->rmap->n,A->rmap->n);
1517: if (B->cmap->n != C->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in B %D not equal columns in C %D\n",B->cmap->n,C->cmap->n);
1519: MatZeroEntries(C);
1520: MatMatMultNumericAdd_SeqAIJ_SeqDense(A,B,C);
1521: return(0);
1522: }
1524: PetscErrorCode MatTransColoringApplySpToDen_SeqAIJ(MatTransposeColoring coloring,Mat B,Mat Btdense)
1525: {
1527: Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data;
1528: Mat_SeqDense *btdense = (Mat_SeqDense*)Btdense->data;
1529: PetscInt *bi = b->i,*bj=b->j;
1530: PetscInt m = Btdense->rmap->n,n=Btdense->cmap->n,j,k,l,col,anz,*btcol,brow,ncolumns;
1531: MatScalar *btval,*btval_den,*ba=b->a;
1532: PetscInt *columns=coloring->columns,*colorforcol=coloring->colorforcol,ncolors=coloring->ncolors;
1535: btval_den=btdense->v;
1536: PetscArrayzero(btval_den,m*n);
1537: for (k=0; k<ncolors; k++) {
1538: ncolumns = coloring->ncolumns[k];
1539: for (l=0; l<ncolumns; l++) { /* insert a row of B to a column of Btdense */
1540: col = *(columns + colorforcol[k] + l);
1541: btcol = bj + bi[col];
1542: btval = ba + bi[col];
1543: anz = bi[col+1] - bi[col];
1544: for (j=0; j<anz; j++) {
1545: brow = btcol[j];
1546: btval_den[brow] = btval[j];
1547: }
1548: }
1549: btval_den += m;
1550: }
1551: return(0);
1552: }
1554: PetscErrorCode MatTransColoringApplyDenToSp_SeqAIJ(MatTransposeColoring matcoloring,Mat Cden,Mat Csp)
1555: {
1556: PetscErrorCode ierr;
1557: Mat_SeqAIJ *csp = (Mat_SeqAIJ*)Csp->data;
1558: const PetscScalar *ca_den,*ca_den_ptr;
1559: PetscScalar *ca=csp->a;
1560: PetscInt k,l,m=Cden->rmap->n,ncolors=matcoloring->ncolors;
1561: PetscInt brows=matcoloring->brows,*den2sp=matcoloring->den2sp;
1562: PetscInt nrows,*row,*idx;
1563: PetscInt *rows=matcoloring->rows,*colorforrow=matcoloring->colorforrow;
1566: MatDenseGetArrayRead(Cden,&ca_den);
1568: if (brows > 0) {
1569: PetscInt *lstart,row_end,row_start;
1570: lstart = matcoloring->lstart;
1571: PetscArrayzero(lstart,ncolors);
1573: row_end = brows;
1574: if (row_end > m) row_end = m;
1575: for (row_start=0; row_start<m; row_start+=brows) { /* loop over row blocks of Csp */
1576: ca_den_ptr = ca_den;
1577: for (k=0; k<ncolors; k++) { /* loop over colors (columns of Cden) */
1578: nrows = matcoloring->nrows[k];
1579: row = rows + colorforrow[k];
1580: idx = den2sp + colorforrow[k];
1581: for (l=lstart[k]; l<nrows; l++) {
1582: if (row[l] >= row_end) {
1583: lstart[k] = l;
1584: break;
1585: } else {
1586: ca[idx[l]] = ca_den_ptr[row[l]];
1587: }
1588: }
1589: ca_den_ptr += m;
1590: }
1591: row_end += brows;
1592: if (row_end > m) row_end = m;
1593: }
1594: } else { /* non-blocked impl: loop over columns of Csp - slow if Csp is large */
1595: ca_den_ptr = ca_den;
1596: for (k=0; k<ncolors; k++) {
1597: nrows = matcoloring->nrows[k];
1598: row = rows + colorforrow[k];
1599: idx = den2sp + colorforrow[k];
1600: for (l=0; l<nrows; l++) {
1601: ca[idx[l]] = ca_den_ptr[row[l]];
1602: }
1603: ca_den_ptr += m;
1604: }
1605: }
1607: MatDenseRestoreArrayRead(Cden,&ca_den);
1608: #if defined(PETSC_USE_INFO)
1609: if (matcoloring->brows > 0) {
1610: PetscInfo1(Csp,"Loop over %D row blocks for den2sp\n",brows);
1611: } else {
1612: PetscInfo(Csp,"Loop over colors/columns of Cden, inefficient for large sparse matrix product \n");
1613: }
1614: #endif
1615: return(0);
1616: }
1618: PetscErrorCode MatTransposeColoringCreate_SeqAIJ(Mat mat,ISColoring iscoloring,MatTransposeColoring c)
1619: {
1621: PetscInt i,n,nrows,Nbs,j,k,m,ncols,col,cm;
1622: const PetscInt *is,*ci,*cj,*row_idx;
1623: PetscInt nis = iscoloring->n,*rowhit,bs = 1;
1624: IS *isa;
1625: Mat_SeqAIJ *csp = (Mat_SeqAIJ*)mat->data;
1626: PetscInt *colorforrow,*rows,*rows_i,*idxhit,*spidx,*den2sp,*den2sp_i;
1627: PetscInt *colorforcol,*columns,*columns_i,brows;
1628: PetscBool flg;
1631: ISColoringGetIS(iscoloring,PETSC_USE_POINTER,PETSC_IGNORE,&isa);
1633: /* bs >1 is not being tested yet! */
1634: Nbs = mat->cmap->N/bs;
1635: c->M = mat->rmap->N/bs; /* set total rows, columns and local rows */
1636: c->N = Nbs;
1637: c->m = c->M;
1638: c->rstart = 0;
1639: c->brows = 100;
1641: c->ncolors = nis;
1642: PetscMalloc3(nis,&c->ncolumns,nis,&c->nrows,nis+1,&colorforrow);
1643: PetscMalloc1(csp->nz+1,&rows);
1644: PetscMalloc1(csp->nz+1,&den2sp);
1646: brows = c->brows;
1647: PetscOptionsGetInt(NULL,NULL,"-matden2sp_brows",&brows,&flg);
1648: if (flg) c->brows = brows;
1649: if (brows > 0) {
1650: PetscMalloc1(nis+1,&c->lstart);
1651: }
1653: colorforrow[0] = 0;
1654: rows_i = rows;
1655: den2sp_i = den2sp;
1657: PetscMalloc1(nis+1,&colorforcol);
1658: PetscMalloc1(Nbs+1,&columns);
1660: colorforcol[0] = 0;
1661: columns_i = columns;
1663: /* get column-wise storage of mat */
1664: MatGetColumnIJ_SeqAIJ_Color(mat,0,PETSC_FALSE,PETSC_FALSE,&ncols,&ci,&cj,&spidx,NULL);
1666: cm = c->m;
1667: PetscMalloc1(cm+1,&rowhit);
1668: PetscMalloc1(cm+1,&idxhit);
1669: for (i=0; i<nis; i++) { /* loop over color */
1670: ISGetLocalSize(isa[i],&n);
1671: ISGetIndices(isa[i],&is);
1673: c->ncolumns[i] = n;
1674: if (n) {
1675: PetscArraycpy(columns_i,is,n);
1676: }
1677: colorforcol[i+1] = colorforcol[i] + n;
1678: columns_i += n;
1680: /* fast, crude version requires O(N*N) work */
1681: PetscArrayzero(rowhit,cm);
1683: for (j=0; j<n; j++) { /* loop over columns*/
1684: col = is[j];
1685: row_idx = cj + ci[col];
1686: m = ci[col+1] - ci[col];
1687: for (k=0; k<m; k++) { /* loop over columns marking them in rowhit */
1688: idxhit[*row_idx] = spidx[ci[col] + k];
1689: rowhit[*row_idx++] = col + 1;
1690: }
1691: }
1692: /* count the number of hits */
1693: nrows = 0;
1694: for (j=0; j<cm; j++) {
1695: if (rowhit[j]) nrows++;
1696: }
1697: c->nrows[i] = nrows;
1698: colorforrow[i+1] = colorforrow[i] + nrows;
1700: nrows = 0;
1701: for (j=0; j<cm; j++) { /* loop over rows */
1702: if (rowhit[j]) {
1703: rows_i[nrows] = j;
1704: den2sp_i[nrows] = idxhit[j];
1705: nrows++;
1706: }
1707: }
1708: den2sp_i += nrows;
1710: ISRestoreIndices(isa[i],&is);
1711: rows_i += nrows;
1712: }
1713: MatRestoreColumnIJ_SeqAIJ_Color(mat,0,PETSC_FALSE,PETSC_FALSE,&ncols,&ci,&cj,&spidx,NULL);
1714: PetscFree(rowhit);
1715: ISColoringRestoreIS(iscoloring,PETSC_USE_POINTER,&isa);
1716: if (csp->nz != colorforrow[nis]) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"csp->nz %d != colorforrow[nis] %d",csp->nz,colorforrow[nis]);
1718: c->colorforrow = colorforrow;
1719: c->rows = rows;
1720: c->den2sp = den2sp;
1721: c->colorforcol = colorforcol;
1722: c->columns = columns;
1724: PetscFree(idxhit);
1725: return(0);
1726: }
1728: /* The combine method has done the symbolic and numeric in the first phase, and so we just return here */
1729: PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Combined(Mat A,Mat B,Mat C)
1730: {
1732: /* Empty function */
1733: return(0);
1734: }
1736: /* This algorithm combines the symbolic and numeric phase of matrix-matrix multiplication. */
1737: PetscErrorCode MatMatMult_SeqAIJ_SeqAIJ_Combined(Mat A,Mat B,PetscReal fill,Mat *C)
1738: {
1739: PetscErrorCode ierr;
1740: PetscLogDouble flops=0.0;
1741: Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
1742: const PetscInt *ai=a->i,*bi=b->i;
1743: PetscInt *ci,*cj,*cj_i;
1744: PetscScalar *ca,*ca_i;
1745: PetscInt b_maxmemrow,c_maxmem,a_col;
1746: PetscInt am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
1747: PetscInt i,k,ndouble=0;
1748: PetscReal afill;
1749: PetscScalar *c_row_val_dense;
1750: PetscBool *c_row_idx_flags;
1751: PetscInt *aj_i=a->j;
1752: PetscScalar *aa_i=a->a;
1756: /* Step 1: Determine upper bounds on memory for C and allocate memory */
1757: /* This should be enough for almost all matrices. If still more memory is needed, it is reallocated later. */
1758: c_maxmem = 8*(ai[am]+bi[bm]);
1759: b_maxmemrow = PetscMin(bi[bm],bn);
1760: PetscMalloc1(am+1,&ci);
1761: PetscCalloc1(bn,&c_row_val_dense);
1762: PetscCalloc1(bn,&c_row_idx_flags);
1763: PetscMalloc1(c_maxmem,&cj);
1764: PetscMalloc1(c_maxmem,&ca);
1765: ca_i = ca;
1766: cj_i = cj;
1767: ci[0] = 0;
1768: for (i=0; i<am; i++) {
1769: /* Step 2: Initialize the dense row vector for C */
1770: const PetscInt anzi = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
1771: PetscInt cnzi = 0;
1772: PetscInt *bj_i;
1773: PetscScalar *ba_i;
1774: /* If the number of indices in C so far + the max number of columns in the next row > c_maxmem -> allocate more memory
1775: Usually, there is enough memory in the first place, so this is not executed. */
1776: while (ci[i] + b_maxmemrow > c_maxmem) {
1777: c_maxmem *= 2;
1778: ndouble++;
1779: PetscRealloc(sizeof(PetscInt)*c_maxmem,&cj);
1780: PetscRealloc(sizeof(PetscScalar)*c_maxmem,&ca);
1781: }
1783: /* Step 3: Do the numerical calculations */
1784: for (a_col=0; a_col<anzi; a_col++) { /* iterate over all non zero values in a row of A */
1785: PetscInt a_col_index = aj_i[a_col];
1786: const PetscInt bnzi = bi[a_col_index+1] - bi[a_col_index];
1787: flops += 2*bnzi;
1788: bj_i = b->j + bi[a_col_index]; /* points to the current row in bj */
1789: ba_i = b->a + bi[a_col_index]; /* points to the current row in ba */
1790: for (k=0; k<bnzi; ++k) { /* iterate over all non zeros of this row in B */
1791: if (c_row_idx_flags[bj_i[k]] == PETSC_FALSE) {
1792: cj_i[cnzi++] = bj_i[k];
1793: c_row_idx_flags[bj_i[k]] = PETSC_TRUE;
1794: }
1795: c_row_val_dense[bj_i[k]] += aa_i[a_col] * ba_i[k];
1796: }
1797: }
1799: /* Sort array */
1800: PetscSortInt(cnzi,cj_i);
1801: /* Step 4 */
1802: for (k=0; k<cnzi; k++) {
1803: ca_i[k] = c_row_val_dense[cj_i[k]];
1804: c_row_val_dense[cj_i[k]] = 0.;
1805: c_row_idx_flags[cj_i[k]] = PETSC_FALSE;
1806: }
1807: /* terminate current row */
1808: aa_i += anzi;
1809: aj_i += anzi;
1810: ca_i += cnzi;
1811: cj_i += cnzi;
1812: ci[i+1] = ci[i] + cnzi;
1813: flops += cnzi;
1814: }
1816: /* Step 5 */
1817: /* Create the new matrix */
1818: MatCreateSeqAIJWithArrays(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,C);
1819: MatSetBlockSizesFromMats(*C,A,B);
1820: MatSetType(*C,((PetscObject)A)->type_name);
1822: /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
1823: /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
1824: c = (Mat_SeqAIJ*)((*C)->data);
1825: c->a = ca;
1826: c->free_a = PETSC_TRUE;
1827: c->free_ij = PETSC_TRUE;
1828: c->nonew = 0;
1830: /* set MatInfo */
1831: afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
1832: if (afill < 1.0) afill = 1.0;
1833: c->maxnz = ci[am];
1834: c->nz = ci[am];
1835: (*C)->info.mallocs = ndouble;
1836: (*C)->info.fill_ratio_given = fill;
1837: (*C)->info.fill_ratio_needed = afill;
1838: (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Combined;
1840: PetscFree(c_row_val_dense);
1841: PetscFree(c_row_idx_flags);
1843: MatAssemblyBegin(*C,MAT_FINAL_ASSEMBLY);
1844: MatAssemblyEnd(*C,MAT_FINAL_ASSEMBLY);
1845: PetscLogFlops(flops);
1846: return(0);
1847: }