Actual source code: matmatmult.c
petsc-3.6.4 2016-04-12
2: /*
3: Defines matrix-matrix product routines for pairs of SeqAIJ matrices
4: C = A * B
5: */
7: #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
8: #include <../src/mat/utils/freespace.h>
9: #include <../src/mat/utils/petscheap.h>
10: #include <petscbt.h>
11: #include <petsc/private/isimpl.h>
12: #include <../src/mat/impls/dense/seq/dense.h>
14: static PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(Mat,Mat,PetscReal,Mat*);
18: PetscErrorCode MatMatMult_SeqAIJ_SeqAIJ(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
19: {
21: const char *algTypes[6] = {"sorted","scalable","scalable_fast","heap","btheap","llcondensed"};
22: PetscInt alg=0; /* set default algorithm */
25: if (scall == MAT_INITIAL_MATRIX) {
26: PetscObjectOptionsBegin((PetscObject)A);
27: PetscOptionsEList("-matmatmult_via","Algorithmic approach","MatMatMult",algTypes,6,algTypes[0],&alg,NULL);
28: PetscOptionsEnd();
29: PetscLogEventBegin(MAT_MatMultSymbolic,A,B,0,0);
30: switch (alg) {
31: case 1:
32: MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(A,B,fill,C);
33: break;
34: case 2:
35: MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(A,B,fill,C);
36: break;
37: case 3:
38: MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(A,B,fill,C);
39: break;
40: case 4:
41: MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(A,B,fill,C);
42: break;
43: case 5:
44: MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(A,B,fill,C);
45: break;
46: default:
47: MatMatMultSymbolic_SeqAIJ_SeqAIJ(A,B,fill,C);
48: break;
49: }
50: PetscLogEventEnd(MAT_MatMultSymbolic,A,B,0,0);
51: }
53: PetscLogEventBegin(MAT_MatMultNumeric,A,B,0,0);
54: (*(*C)->ops->matmultnumeric)(A,B,*C);
55: PetscLogEventEnd(MAT_MatMultNumeric,A,B,0,0);
56: return(0);
57: }
61: static PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(Mat A,Mat B,PetscReal fill,Mat *C)
62: {
63: PetscErrorCode ierr;
64: Mat_SeqAIJ *a =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
65: PetscInt *ai=a->i,*bi=b->i,*ci,*cj;
66: PetscInt am =A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
67: PetscReal afill;
68: PetscInt i,j,anzi,brow,bnzj,cnzi,*bj,*aj,nlnk_max,*lnk,ndouble=0;
69: PetscBT lnkbt;
70: PetscFreeSpaceList free_space=NULL,current_space=NULL;
73: /* Get ci and cj */
74: /*---------------*/
75: /* Allocate ci array, arrays for fill computation and */
76: /* free space for accumulating nonzero column info */
77: PetscMalloc1(am+2,&ci);
78: ci[0] = 0;
80: /* create and initialize a linked list */
81: nlnk_max = a->rmax*b->rmax;
82: if (!nlnk_max || nlnk_max > bn) nlnk_max = bn;
83: PetscLLCondensedCreate(nlnk_max,bn,&lnk,&lnkbt);
85: /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
86: PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+bi[bm])),&free_space);
88: current_space = free_space;
90: /* Determine ci and cj */
91: for (i=0; i<am; i++) {
92: anzi = ai[i+1] - ai[i];
93: aj = a->j + ai[i];
94: for (j=0; j<anzi; j++) {
95: brow = aj[j];
96: bnzj = bi[brow+1] - bi[brow];
97: bj = b->j + bi[brow];
98: /* add non-zero cols of B into the sorted linked list lnk */
99: PetscLLCondensedAddSorted(bnzj,bj,lnk,lnkbt);
100: }
101: cnzi = lnk[0];
103: /* If free space is not available, make more free space */
104: /* Double the amount of total space in the list */
105: if (current_space->local_remaining<cnzi) {
106: PetscFreeSpaceGet(cnzi+current_space->total_array_size,¤t_space);
107: ndouble++;
108: }
110: /* Copy data into free space, then initialize lnk */
111: PetscLLCondensedClean(bn,cnzi,current_space->array,lnk,lnkbt);
113: current_space->array += cnzi;
114: current_space->local_used += cnzi;
115: current_space->local_remaining -= cnzi;
117: ci[i+1] = ci[i] + cnzi;
118: }
120: /* Column indices are in the list of free space */
121: /* Allocate space for cj, initialize cj, and */
122: /* destroy list of free space and other temporary array(s) */
123: PetscMalloc1(ci[am]+1,&cj);
124: PetscFreeSpaceContiguous(&free_space,cj);
125: PetscLLCondensedDestroy(lnk,lnkbt);
127: /* put together the new symbolic matrix */
128: MatCreateSeqAIJWithArrays(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,C);
129: MatSetBlockSizesFromMats(*C,A,B);
131: /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
132: /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
133: c = (Mat_SeqAIJ*)((*C)->data);
134: c->free_a = PETSC_FALSE;
135: c->free_ij = PETSC_TRUE;
136: c->nonew = 0;
137: (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ; /* fast, needs non-scalable O(bn) array 'abdense' */
139: /* set MatInfo */
140: afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
141: if (afill < 1.0) afill = 1.0;
142: c->maxnz = ci[am];
143: c->nz = ci[am];
144: (*C)->info.mallocs = ndouble;
145: (*C)->info.fill_ratio_given = fill;
146: (*C)->info.fill_ratio_needed = afill;
148: #if defined(PETSC_USE_INFO)
149: if (ci[am]) {
150: PetscInfo3((*C),"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);
151: PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);
152: } else {
153: PetscInfo((*C),"Empty matrix product\n");
154: }
155: #endif
156: return(0);
157: }
161: PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
162: {
164: PetscLogDouble flops=0.0;
165: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
166: Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data;
167: Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data;
168: PetscInt *ai =a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bjj,*ci=c->i,*cj=c->j;
169: PetscInt am =A->rmap->n,cm=C->rmap->n;
170: PetscInt i,j,k,anzi,bnzi,cnzi,brow;
171: PetscScalar *aa=a->a,*ba=b->a,*baj,*ca,valtmp;
172: PetscScalar *ab_dense;
175: if (!c->a) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */
176: PetscMalloc1(ci[cm]+1,&ca);
177: c->a = ca;
178: c->free_a = PETSC_TRUE;
179: } else {
180: ca = c->a;
181: }
182: if (!c->matmult_abdense) {
183: PetscCalloc1(B->cmap->N,&ab_dense);
184: c->matmult_abdense = ab_dense;
185: } else {
186: ab_dense = c->matmult_abdense;
187: }
189: /* clean old values in C */
190: PetscMemzero(ca,ci[cm]*sizeof(MatScalar));
191: /* Traverse A row-wise. */
192: /* Build the ith row in C by summing over nonzero columns in A, */
193: /* the rows of B corresponding to nonzeros of A. */
194: for (i=0; i<am; i++) {
195: anzi = ai[i+1] - ai[i];
196: for (j=0; j<anzi; j++) {
197: brow = aj[j];
198: bnzi = bi[brow+1] - bi[brow];
199: bjj = bj + bi[brow];
200: baj = ba + bi[brow];
201: /* perform dense axpy */
202: valtmp = aa[j];
203: for (k=0; k<bnzi; k++) {
204: ab_dense[bjj[k]] += valtmp*baj[k];
205: }
206: flops += 2*bnzi;
207: }
208: aj += anzi; aa += anzi;
210: cnzi = ci[i+1] - ci[i];
211: for (k=0; k<cnzi; k++) {
212: ca[k] += ab_dense[cj[k]];
213: ab_dense[cj[k]] = 0.0; /* zero ab_dense */
214: }
215: flops += cnzi;
216: cj += cnzi; ca += cnzi;
217: }
218: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
219: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
220: PetscLogFlops(flops);
221: return(0);
222: }
226: PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(Mat A,Mat B,Mat C)
227: {
229: PetscLogDouble flops=0.0;
230: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
231: Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data;
232: Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data;
233: PetscInt *ai = a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bjj,*ci=c->i,*cj=c->j;
234: PetscInt am = A->rmap->N,cm=C->rmap->N;
235: PetscInt i,j,k,anzi,bnzi,cnzi,brow;
236: PetscScalar *aa=a->a,*ba=b->a,*baj,*ca=c->a,valtmp;
237: PetscInt nextb;
240: /* clean old values in C */
241: PetscMemzero(ca,ci[cm]*sizeof(MatScalar));
242: /* Traverse A row-wise. */
243: /* Build the ith row in C by summing over nonzero columns in A, */
244: /* the rows of B corresponding to nonzeros of A. */
245: for (i=0; i<am; i++) {
246: anzi = ai[i+1] - ai[i];
247: cnzi = ci[i+1] - ci[i];
248: for (j=0; j<anzi; j++) {
249: brow = aj[j];
250: bnzi = bi[brow+1] - bi[brow];
251: bjj = bj + bi[brow];
252: baj = ba + bi[brow];
253: /* perform sparse axpy */
254: valtmp = aa[j];
255: nextb = 0;
256: for (k=0; nextb<bnzi; k++) {
257: if (cj[k] == bjj[nextb]) { /* ccol == bcol */
258: ca[k] += valtmp*baj[nextb++];
259: }
260: }
261: flops += 2*bnzi;
262: }
263: aj += anzi; aa += anzi;
264: cj += cnzi; ca += cnzi;
265: }
267: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
268: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
269: PetscLogFlops(flops);
270: return(0);
271: }
275: PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(Mat A,Mat B,PetscReal fill,Mat *C)
276: {
277: PetscErrorCode ierr;
278: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
279: PetscInt *ai = a->i,*bi=b->i,*ci,*cj;
280: PetscInt am = A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
281: MatScalar *ca;
282: PetscReal afill;
283: PetscInt i,j,anzi,brow,bnzj,cnzi,*bj,*aj,nlnk_max,*lnk,ndouble=0;
284: PetscFreeSpaceList free_space=NULL,current_space=NULL;
287: /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_fast() */
288: /*-----------------------------------------------------------------------------------------*/
289: /* Allocate arrays for fill computation and free space for accumulating nonzero column */
290: PetscMalloc1(am+2,&ci);
291: ci[0] = 0;
293: /* create and initialize a linked list */
294: nlnk_max = a->rmax*b->rmax;
295: if (!nlnk_max || nlnk_max > bn) nlnk_max = bn; /* in case rmax is not defined for A or B */
296: PetscLLCondensedCreate_fast(nlnk_max,&lnk);
298: /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
299: PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+bi[bm])),&free_space);
300: current_space = free_space;
302: /* Determine ci and cj */
303: for (i=0; i<am; i++) {
304: anzi = ai[i+1] - ai[i];
305: aj = a->j + ai[i];
306: for (j=0; j<anzi; j++) {
307: brow = aj[j];
308: bnzj = bi[brow+1] - bi[brow];
309: bj = b->j + bi[brow];
310: /* add non-zero cols of B into the sorted linked list lnk */
311: PetscLLCondensedAddSorted_fast(bnzj,bj,lnk);
312: }
313: cnzi = lnk[1];
315: /* If free space is not available, make more free space */
316: /* Double the amount of total space in the list */
317: if (current_space->local_remaining<cnzi) {
318: PetscFreeSpaceGet(cnzi+current_space->total_array_size,¤t_space);
319: ndouble++;
320: }
322: /* Copy data into free space, then initialize lnk */
323: PetscLLCondensedClean_fast(cnzi,current_space->array,lnk);
325: current_space->array += cnzi;
326: current_space->local_used += cnzi;
327: current_space->local_remaining -= cnzi;
329: ci[i+1] = ci[i] + cnzi;
330: }
332: /* Column indices are in the list of free space */
333: /* Allocate space for cj, initialize cj, and */
334: /* destroy list of free space and other temporary array(s) */
335: PetscMalloc1(ci[am]+1,&cj);
336: PetscFreeSpaceContiguous(&free_space,cj);
337: PetscLLCondensedDestroy_fast(lnk);
339: /* Allocate space for ca */
340: PetscMalloc1(ci[am]+1,&ca);
341: PetscMemzero(ca,(ci[am]+1)*sizeof(MatScalar));
343: /* put together the new symbolic matrix */
344: MatCreateSeqAIJWithArrays(PetscObjectComm((PetscObject)A),am,bn,ci,cj,ca,C);
345: MatSetBlockSizesFromMats(*C,A,B);
347: /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
348: /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
349: c = (Mat_SeqAIJ*)((*C)->data);
350: c->free_a = PETSC_TRUE;
351: c->free_ij = PETSC_TRUE;
352: c->nonew = 0;
354: (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable; /* slower, less memory */
356: /* set MatInfo */
357: afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
358: if (afill < 1.0) afill = 1.0;
359: c->maxnz = ci[am];
360: c->nz = ci[am];
361: (*C)->info.mallocs = ndouble;
362: (*C)->info.fill_ratio_given = fill;
363: (*C)->info.fill_ratio_needed = afill;
365: #if defined(PETSC_USE_INFO)
366: if (ci[am]) {
367: PetscInfo3((*C),"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);
368: PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);
369: } else {
370: PetscInfo((*C),"Empty matrix product\n");
371: }
372: #endif
373: return(0);
374: }
379: PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(Mat A,Mat B,PetscReal fill,Mat *C)
380: {
381: PetscErrorCode ierr;
382: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
383: PetscInt *ai = a->i,*bi=b->i,*ci,*cj;
384: PetscInt am = A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
385: MatScalar *ca;
386: PetscReal afill;
387: PetscInt i,j,anzi,brow,bnzj,cnzi,*bj,*aj,nlnk_max,*lnk,ndouble=0;
388: PetscFreeSpaceList free_space=NULL,current_space=NULL;
391: /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_Scalalbe() */
392: /*---------------------------------------------------------------------------------------------*/
393: /* Allocate arrays for fill computation and free space for accumulating nonzero column */
394: PetscMalloc1(am+2,&ci);
395: ci[0] = 0;
397: /* create and initialize a linked list */
398: nlnk_max = a->rmax*b->rmax;
399: if (!nlnk_max || nlnk_max > bn) nlnk_max = bn; /* in case rmax is not defined for A or B */
400: PetscLLCondensedCreate_Scalable(nlnk_max,&lnk);
402: /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
403: PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+bi[bm])),&free_space);
404: current_space = free_space;
406: /* Determine ci and cj */
407: for (i=0; i<am; i++) {
408: anzi = ai[i+1] - ai[i];
409: aj = a->j + ai[i];
410: for (j=0; j<anzi; j++) {
411: brow = aj[j];
412: bnzj = bi[brow+1] - bi[brow];
413: bj = b->j + bi[brow];
414: /* add non-zero cols of B into the sorted linked list lnk */
415: PetscLLCondensedAddSorted_Scalable(bnzj,bj,lnk);
416: }
417: cnzi = lnk[0];
419: /* If free space is not available, make more free space */
420: /* Double the amount of total space in the list */
421: if (current_space->local_remaining<cnzi) {
422: PetscFreeSpaceGet(cnzi+current_space->total_array_size,¤t_space);
423: ndouble++;
424: }
426: /* Copy data into free space, then initialize lnk */
427: PetscLLCondensedClean_Scalable(cnzi,current_space->array,lnk);
429: current_space->array += cnzi;
430: current_space->local_used += cnzi;
431: current_space->local_remaining -= cnzi;
433: ci[i+1] = ci[i] + cnzi;
434: }
436: /* Column indices are in the list of free space */
437: /* Allocate space for cj, initialize cj, and */
438: /* destroy list of free space and other temporary array(s) */
439: PetscMalloc1(ci[am]+1,&cj);
440: PetscFreeSpaceContiguous(&free_space,cj);
441: PetscLLCondensedDestroy_Scalable(lnk);
443: /* Allocate space for ca */
444: /*-----------------------*/
445: PetscMalloc1(ci[am]+1,&ca);
446: PetscMemzero(ca,(ci[am]+1)*sizeof(MatScalar));
448: /* put together the new symbolic matrix */
449: MatCreateSeqAIJWithArrays(PetscObjectComm((PetscObject)A),am,bn,ci,cj,ca,C);
450: MatSetBlockSizesFromMats(*C,A,B);
452: /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
453: /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
454: c = (Mat_SeqAIJ*)((*C)->data);
455: c->free_a = PETSC_TRUE;
456: c->free_ij = PETSC_TRUE;
457: c->nonew = 0;
459: (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable; /* slower, less memory */
461: /* set MatInfo */
462: afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
463: if (afill < 1.0) afill = 1.0;
464: c->maxnz = ci[am];
465: c->nz = ci[am];
466: (*C)->info.mallocs = ndouble;
467: (*C)->info.fill_ratio_given = fill;
468: (*C)->info.fill_ratio_needed = afill;
470: #if defined(PETSC_USE_INFO)
471: if (ci[am]) {
472: PetscInfo3((*C),"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);
473: PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);
474: } else {
475: PetscInfo((*C),"Empty matrix product\n");
476: }
477: #endif
478: return(0);
479: }
483: PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(Mat A,Mat B,PetscReal fill,Mat *C)
484: {
485: PetscErrorCode ierr;
486: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
487: const PetscInt *ai=a->i,*bi=b->i,*aj=a->j,*bj=b->j;
488: PetscInt *ci,*cj,*bb;
489: PetscInt am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
490: PetscReal afill;
491: PetscInt i,j,col,ndouble = 0;
492: PetscFreeSpaceList free_space=NULL,current_space=NULL;
493: PetscHeap h;
496: /* Get ci and cj - by merging sorted rows using a heap */
497: /*---------------------------------------------------------------------------------------------*/
498: /* Allocate arrays for fill computation and free space for accumulating nonzero column */
499: PetscMalloc1(am+2,&ci);
500: ci[0] = 0;
502: /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
503: PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+bi[bm])),&free_space);
504: current_space = free_space;
506: PetscHeapCreate(a->rmax,&h);
507: PetscMalloc1(a->rmax,&bb);
509: /* Determine ci and cj */
510: for (i=0; i<am; i++) {
511: const PetscInt anzi = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
512: const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
513: ci[i+1] = ci[i];
514: /* Populate the min heap */
515: for (j=0; j<anzi; j++) {
516: bb[j] = bi[acol[j]]; /* bb points at the start of the row */
517: if (bb[j] < bi[acol[j]+1]) { /* Add if row is nonempty */
518: PetscHeapAdd(h,j,bj[bb[j]++]);
519: }
520: }
521: /* Pick off the min element, adding it to free space */
522: PetscHeapPop(h,&j,&col);
523: while (j >= 0) {
524: if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */
525: PetscFreeSpaceGet(PetscMin(2*current_space->total_array_size,16 << 20),¤t_space);
526: ndouble++;
527: }
528: *(current_space->array++) = col;
529: current_space->local_used++;
530: current_space->local_remaining--;
531: ci[i+1]++;
533: /* stash if anything else remains in this row of B */
534: if (bb[j] < bi[acol[j]+1]) {PetscHeapStash(h,j,bj[bb[j]++]);}
535: while (1) { /* pop and stash any other rows of B that also had an entry in this column */
536: PetscInt j2,col2;
537: PetscHeapPeek(h,&j2,&col2);
538: if (col2 != col) break;
539: PetscHeapPop(h,&j2,&col2);
540: if (bb[j2] < bi[acol[j2]+1]) {PetscHeapStash(h,j2,bj[bb[j2]++]);}
541: }
542: /* Put any stashed elements back into the min heap */
543: PetscHeapUnstash(h);
544: PetscHeapPop(h,&j,&col);
545: }
546: }
547: PetscFree(bb);
548: PetscHeapDestroy(&h);
550: /* Column indices are in the list of free space */
551: /* Allocate space for cj, initialize cj, and */
552: /* destroy list of free space and other temporary array(s) */
553: PetscMalloc1(ci[am],&cj);
554: PetscFreeSpaceContiguous(&free_space,cj);
556: /* put together the new symbolic matrix */
557: MatCreateSeqAIJWithArrays(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,C);
558: MatSetBlockSizesFromMats(*C,A,B);
560: /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
561: /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
562: c = (Mat_SeqAIJ*)((*C)->data);
563: c->free_a = PETSC_TRUE;
564: c->free_ij = PETSC_TRUE;
565: c->nonew = 0;
567: (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ;
569: /* set MatInfo */
570: afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
571: if (afill < 1.0) afill = 1.0;
572: c->maxnz = ci[am];
573: c->nz = ci[am];
574: (*C)->info.mallocs = ndouble;
575: (*C)->info.fill_ratio_given = fill;
576: (*C)->info.fill_ratio_needed = afill;
578: #if defined(PETSC_USE_INFO)
579: if (ci[am]) {
580: PetscInfo3((*C),"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);
581: PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);
582: } else {
583: PetscInfo((*C),"Empty matrix product\n");
584: }
585: #endif
586: return(0);
587: }
591: PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(Mat A,Mat B,PetscReal fill,Mat *C)
592: {
593: PetscErrorCode ierr;
594: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
595: const PetscInt *ai = a->i,*bi=b->i,*aj=a->j,*bj=b->j;
596: PetscInt *ci,*cj,*bb;
597: PetscInt am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
598: PetscReal afill;
599: PetscInt i,j,col,ndouble = 0;
600: PetscFreeSpaceList free_space=NULL,current_space=NULL;
601: PetscHeap h;
602: PetscBT bt;
605: /* Get ci and cj - using a heap for the sorted rows, but use BT so that each index is only added once */
606: /*---------------------------------------------------------------------------------------------*/
607: /* Allocate arrays for fill computation and free space for accumulating nonzero column */
608: PetscMalloc1(am+2,&ci);
609: ci[0] = 0;
611: /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
612: PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+bi[bm])),&free_space);
614: current_space = free_space;
616: PetscHeapCreate(a->rmax,&h);
617: PetscMalloc1(a->rmax,&bb);
618: PetscBTCreate(bn,&bt);
620: /* Determine ci and cj */
621: for (i=0; i<am; i++) {
622: const PetscInt anzi = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
623: const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
624: const PetscInt *fptr = current_space->array; /* Save beginning of the row so we can clear the BT later */
625: ci[i+1] = ci[i];
626: /* Populate the min heap */
627: for (j=0; j<anzi; j++) {
628: PetscInt brow = acol[j];
629: for (bb[j] = bi[brow]; bb[j] < bi[brow+1]; bb[j]++) {
630: PetscInt bcol = bj[bb[j]];
631: if (!PetscBTLookupSet(bt,bcol)) { /* new entry */
632: PetscHeapAdd(h,j,bcol);
633: bb[j]++;
634: break;
635: }
636: }
637: }
638: /* Pick off the min element, adding it to free space */
639: PetscHeapPop(h,&j,&col);
640: while (j >= 0) {
641: if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */
642: fptr = NULL; /* need PetscBTMemzero */
643: PetscFreeSpaceGet(PetscMin(2*current_space->total_array_size,16 << 20),¤t_space);
644: ndouble++;
645: }
646: *(current_space->array++) = col;
647: current_space->local_used++;
648: current_space->local_remaining--;
649: ci[i+1]++;
651: /* stash if anything else remains in this row of B */
652: for (; bb[j] < bi[acol[j]+1]; bb[j]++) {
653: PetscInt bcol = bj[bb[j]];
654: if (!PetscBTLookupSet(bt,bcol)) { /* new entry */
655: PetscHeapAdd(h,j,bcol);
656: bb[j]++;
657: break;
658: }
659: }
660: PetscHeapPop(h,&j,&col);
661: }
662: if (fptr) { /* Clear the bits for this row */
663: for (; fptr<current_space->array; fptr++) {PetscBTClear(bt,*fptr);}
664: } else { /* We reallocated so we don't remember (easily) how to clear only the bits we changed */
665: PetscBTMemzero(bn,bt);
666: }
667: }
668: PetscFree(bb);
669: PetscHeapDestroy(&h);
670: PetscBTDestroy(&bt);
672: /* Column indices are in the list of free space */
673: /* Allocate space for cj, initialize cj, and */
674: /* destroy list of free space and other temporary array(s) */
675: PetscMalloc1(ci[am],&cj);
676: PetscFreeSpaceContiguous(&free_space,cj);
678: /* put together the new symbolic matrix */
679: MatCreateSeqAIJWithArrays(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,C);
680: MatSetBlockSizesFromMats(*C,A,B);
682: /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
683: /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
684: c = (Mat_SeqAIJ*)((*C)->data);
685: c->free_a = PETSC_TRUE;
686: c->free_ij = PETSC_TRUE;
687: c->nonew = 0;
689: (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ;
691: /* set MatInfo */
692: afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
693: if (afill < 1.0) afill = 1.0;
694: c->maxnz = ci[am];
695: c->nz = ci[am];
696: (*C)->info.mallocs = ndouble;
697: (*C)->info.fill_ratio_given = fill;
698: (*C)->info.fill_ratio_needed = afill;
700: #if defined(PETSC_USE_INFO)
701: if (ci[am]) {
702: PetscInfo3((*C),"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);
703: PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);
704: } else {
705: PetscInfo((*C),"Empty matrix product\n");
706: }
707: #endif
708: return(0);
709: }
713: /* concatenate unique entries and then sort */
714: PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat *C)
715: {
716: PetscErrorCode ierr;
717: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
718: const PetscInt *ai = a->i,*bi=b->i,*aj=a->j,*bj=b->j;
719: PetscInt *ci,*cj;
720: PetscInt am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
721: PetscReal afill;
722: PetscInt i,j,ndouble = 0;
723: PetscSegBuffer seg,segrow;
724: char *seen;
727: PetscMalloc1(am+1,&ci);
728: ci[0] = 0;
730: /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
731: PetscSegBufferCreate(sizeof(PetscInt),(PetscInt)(fill*(ai[am]+bi[bm])),&seg);
732: PetscSegBufferCreate(sizeof(PetscInt),100,&segrow);
733: PetscMalloc1(bn,&seen);
734: PetscMemzero(seen,bn*sizeof(char));
736: /* Determine ci and cj */
737: for (i=0; i<am; i++) {
738: const PetscInt anzi = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
739: const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
740: PetscInt packlen = 0,*PETSC_RESTRICT crow;
741: /* Pack segrow */
742: for (j=0; j<anzi; j++) {
743: PetscInt brow = acol[j],bjstart = bi[brow],bjend = bi[brow+1],k;
744: for (k=bjstart; k<bjend; k++) {
745: PetscInt bcol = bj[k];
746: if (!seen[bcol]) { /* new entry */
747: PetscInt *PETSC_RESTRICT slot;
748: PetscSegBufferGetInts(segrow,1,&slot);
749: *slot = bcol;
750: seen[bcol] = 1;
751: packlen++;
752: }
753: }
754: }
755: PetscSegBufferGetInts(seg,packlen,&crow);
756: PetscSegBufferExtractTo(segrow,crow);
757: PetscSortInt(packlen,crow);
758: ci[i+1] = ci[i] + packlen;
759: for (j=0; j<packlen; j++) seen[crow[j]] = 0;
760: }
761: PetscSegBufferDestroy(&segrow);
762: PetscFree(seen);
764: /* Column indices are in the segmented buffer */
765: PetscSegBufferExtractAlloc(seg,&cj);
766: PetscSegBufferDestroy(&seg);
768: /* put together the new symbolic matrix */
769: MatCreateSeqAIJWithArrays(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,C);
770: MatSetBlockSizesFromMats(*C,A,B);
772: /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
773: /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
774: c = (Mat_SeqAIJ*)((*C)->data);
775: c->free_a = PETSC_TRUE;
776: c->free_ij = PETSC_TRUE;
777: c->nonew = 0;
779: (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ;
781: /* set MatInfo */
782: afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
783: if (afill < 1.0) afill = 1.0;
784: c->maxnz = ci[am];
785: c->nz = ci[am];
786: (*C)->info.mallocs = ndouble;
787: (*C)->info.fill_ratio_given = fill;
788: (*C)->info.fill_ratio_needed = afill;
790: #if defined(PETSC_USE_INFO)
791: if (ci[am]) {
792: PetscInfo3((*C),"Reallocs %D; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill);
793: PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);
794: } else {
795: PetscInfo((*C),"Empty matrix product\n");
796: }
797: #endif
798: return(0);
799: }
801: /* This routine is not used. Should be removed! */
804: PetscErrorCode MatMatTransposeMult_SeqAIJ_SeqAIJ(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
805: {
809: if (scall == MAT_INITIAL_MATRIX) {
810: PetscLogEventBegin(MAT_MatTransposeMultSymbolic,A,B,0,0);
811: MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ(A,B,fill,C);
812: PetscLogEventEnd(MAT_MatTransposeMultSymbolic,A,B,0,0);
813: }
814: PetscLogEventBegin(MAT_MatTransposeMultNumeric,A,B,0,0);
815: MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ(A,B,*C);
816: PetscLogEventEnd(MAT_MatTransposeMultNumeric,A,B,0,0);
817: return(0);
818: }
822: PetscErrorCode MatDestroy_SeqAIJ_MatMatMultTrans(Mat A)
823: {
824: PetscErrorCode ierr;
825: Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data;
826: Mat_MatMatTransMult *abt=a->abt;
829: (abt->destroy)(A);
830: MatTransposeColoringDestroy(&abt->matcoloring);
831: MatDestroy(&abt->Bt_den);
832: MatDestroy(&abt->ABt_den);
833: PetscFree(abt);
834: return(0);
835: }
839: PetscErrorCode MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat *C)
840: {
841: PetscErrorCode ierr;
842: Mat Bt;
843: PetscInt *bti,*btj;
844: Mat_MatMatTransMult *abt;
845: Mat_SeqAIJ *c;
848: /* create symbolic Bt */
849: MatGetSymbolicTranspose_SeqAIJ(B,&bti,&btj);
850: MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,B->cmap->n,B->rmap->n,bti,btj,NULL,&Bt);
851: MatSetBlockSizes(Bt,PetscAbs(A->cmap->bs),PetscAbs(B->cmap->bs));
853: /* get symbolic C=A*Bt */
854: MatMatMultSymbolic_SeqAIJ_SeqAIJ(A,Bt,fill,C);
856: /* create a supporting struct for reuse intermidiate dense matrices with matcoloring */
857: PetscNew(&abt);
858: c = (Mat_SeqAIJ*)(*C)->data;
859: c->abt = abt;
861: abt->usecoloring = PETSC_FALSE;
862: abt->destroy = (*C)->ops->destroy;
863: (*C)->ops->destroy = MatDestroy_SeqAIJ_MatMatMultTrans;
865: PetscOptionsGetBool(NULL,"-matmattransmult_color",&abt->usecoloring,NULL);
866: if (abt->usecoloring) {
867: /* Create MatTransposeColoring from symbolic C=A*B^T */
868: MatTransposeColoring matcoloring;
869: MatColoring coloring;
870: ISColoring iscoloring;
871: Mat Bt_dense,C_dense;
872: Mat_SeqAIJ *c=(Mat_SeqAIJ*)(*C)->data;
873: /* inode causes memory problem, don't know why */
874: if (c->inode.use) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_USE_INODES is not supported. Use '-mat_no_inode'");
876: MatColoringCreate(*C,&coloring);
877: MatColoringSetDistance(coloring,2);
878: MatColoringSetType(coloring,MATCOLORINGSL);
879: MatColoringSetFromOptions(coloring);
880: MatColoringApply(coloring,&iscoloring);
881: MatColoringDestroy(&coloring);
882: MatTransposeColoringCreate(*C,iscoloring,&matcoloring);
884: abt->matcoloring = matcoloring;
886: ISColoringDestroy(&iscoloring);
888: /* Create Bt_dense and C_dense = A*Bt_dense */
889: MatCreate(PETSC_COMM_SELF,&Bt_dense);
890: MatSetSizes(Bt_dense,A->cmap->n,matcoloring->ncolors,A->cmap->n,matcoloring->ncolors);
891: MatSetType(Bt_dense,MATSEQDENSE);
892: MatSeqDenseSetPreallocation(Bt_dense,NULL);
894: Bt_dense->assembled = PETSC_TRUE;
895: abt->Bt_den = Bt_dense;
897: MatCreate(PETSC_COMM_SELF,&C_dense);
898: MatSetSizes(C_dense,A->rmap->n,matcoloring->ncolors,A->rmap->n,matcoloring->ncolors);
899: MatSetType(C_dense,MATSEQDENSE);
900: MatSeqDenseSetPreallocation(C_dense,NULL);
902: Bt_dense->assembled = PETSC_TRUE;
903: abt->ABt_den = C_dense;
905: #if defined(PETSC_USE_INFO)
906: {
907: Mat_SeqAIJ *c = (Mat_SeqAIJ*)(*C)->data;
908: PetscInfo7(*C,"Use coloring of C=A*B^T; B^T: %D %D, Bt_dense: %D,%D; Cnz %D / (cm*ncolors %D) = %g\n",B->cmap->n,B->rmap->n,Bt_dense->rmap->n,Bt_dense->cmap->n,c->nz,A->rmap->n*matcoloring->ncolors,(PetscReal)(c->nz)/(A->rmap->n*matcoloring->ncolors));
909: }
910: #endif
911: }
912: /* clean up */
913: MatDestroy(&Bt);
914: MatRestoreSymbolicTranspose_SeqAIJ(B,&bti,&btj);
915: return(0);
916: }
920: PetscErrorCode MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
921: {
922: PetscErrorCode ierr;
923: Mat_SeqAIJ *a =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c=(Mat_SeqAIJ*)C->data;
924: PetscInt *ai =a->i,*aj=a->j,*bi=b->i,*bj=b->j,anzi,bnzj,nexta,nextb,*acol,*bcol,brow;
925: PetscInt cm =C->rmap->n,*ci=c->i,*cj=c->j,i,j,cnzi,*ccol;
926: PetscLogDouble flops=0.0;
927: MatScalar *aa =a->a,*aval,*ba=b->a,*bval,*ca,*cval;
928: Mat_MatMatTransMult *abt = c->abt;
931: /* clear old values in C */
932: if (!c->a) {
933: PetscMalloc1(ci[cm]+1,&ca);
934: c->a = ca;
935: c->free_a = PETSC_TRUE;
936: } else {
937: ca = c->a;
938: }
939: PetscMemzero(ca,ci[cm]*sizeof(MatScalar));
941: if (abt->usecoloring) {
942: MatTransposeColoring matcoloring = abt->matcoloring;
943: Mat Bt_dense,C_dense = abt->ABt_den;
945: /* Get Bt_dense by Apply MatTransposeColoring to B */
946: Bt_dense = abt->Bt_den;
947: MatTransColoringApplySpToDen(matcoloring,B,Bt_dense);
949: /* C_dense = A*Bt_dense */
950: MatMatMultNumeric_SeqAIJ_SeqDense(A,Bt_dense,C_dense);
952: /* Recover C from C_dense */
953: MatTransColoringApplyDenToSp(matcoloring,C_dense,C);
954: return(0);
955: }
957: for (i=0; i<cm; i++) {
958: anzi = ai[i+1] - ai[i];
959: acol = aj + ai[i];
960: aval = aa + ai[i];
961: cnzi = ci[i+1] - ci[i];
962: ccol = cj + ci[i];
963: cval = ca + ci[i];
964: for (j=0; j<cnzi; j++) {
965: brow = ccol[j];
966: bnzj = bi[brow+1] - bi[brow];
967: bcol = bj + bi[brow];
968: bval = ba + bi[brow];
970: /* perform sparse inner-product c(i,j)=A[i,:]*B[j,:]^T */
971: nexta = 0; nextb = 0;
972: while (nexta<anzi && nextb<bnzj) {
973: while (nexta < anzi && acol[nexta] < bcol[nextb]) nexta++;
974: if (nexta == anzi) break;
975: while (nextb < bnzj && acol[nexta] > bcol[nextb]) nextb++;
976: if (nextb == bnzj) break;
977: if (acol[nexta] == bcol[nextb]) {
978: cval[j] += aval[nexta]*bval[nextb];
979: nexta++; nextb++;
980: flops += 2;
981: }
982: }
983: }
984: }
985: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
986: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
987: PetscLogFlops(flops);
988: return(0);
989: }
993: PetscErrorCode MatTransposeMatMult_SeqAIJ_SeqAIJ(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
994: {
998: if (scall == MAT_INITIAL_MATRIX) {
999: PetscLogEventBegin(MAT_TransposeMatMultSymbolic,A,B,0,0);
1000: MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(A,B,fill,C);
1001: PetscLogEventEnd(MAT_TransposeMatMultSymbolic,A,B,0,0);
1002: }
1003: PetscLogEventBegin(MAT_TransposeMatMultNumeric,A,B,0,0);
1004: MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ(A,B,*C);
1005: PetscLogEventEnd(MAT_TransposeMatMultNumeric,A,B,0,0);
1006: return(0);
1007: }
1011: PetscErrorCode MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat *C)
1012: {
1014: Mat At;
1015: PetscInt *ati,*atj;
1018: /* create symbolic At */
1019: MatGetSymbolicTranspose_SeqAIJ(A,&ati,&atj);
1020: MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,A->cmap->n,A->rmap->n,ati,atj,NULL,&At);
1021: MatSetBlockSizes(At,PetscAbs(A->cmap->bs),PetscAbs(B->cmap->bs));
1023: /* get symbolic C=At*B */
1024: MatMatMultSymbolic_SeqAIJ_SeqAIJ(At,B,fill,C);
1026: /* clean up */
1027: MatDestroy(&At);
1028: MatRestoreSymbolicTranspose_SeqAIJ(A,&ati,&atj);
1029: return(0);
1030: }
1034: PetscErrorCode MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
1035: {
1037: Mat_SeqAIJ *a =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c=(Mat_SeqAIJ*)C->data;
1038: PetscInt am =A->rmap->n,anzi,*ai=a->i,*aj=a->j,*bi=b->i,*bj,bnzi,nextb;
1039: PetscInt cm =C->rmap->n,*ci=c->i,*cj=c->j,crow,*cjj,i,j,k;
1040: PetscLogDouble flops=0.0;
1041: MatScalar *aa =a->a,*ba,*ca,*caj;
1044: if (!c->a) {
1045: PetscMalloc1(ci[cm]+1,&ca);
1047: c->a = ca;
1048: c->free_a = PETSC_TRUE;
1049: } else {
1050: ca = c->a;
1051: }
1052: /* clear old values in C */
1053: PetscMemzero(ca,ci[cm]*sizeof(MatScalar));
1055: /* compute A^T*B using outer product (A^T)[:,i]*B[i,:] */
1056: for (i=0; i<am; i++) {
1057: bj = b->j + bi[i];
1058: ba = b->a + bi[i];
1059: bnzi = bi[i+1] - bi[i];
1060: anzi = ai[i+1] - ai[i];
1061: for (j=0; j<anzi; j++) {
1062: nextb = 0;
1063: crow = *aj++;
1064: cjj = cj + ci[crow];
1065: caj = ca + ci[crow];
1066: /* perform sparse axpy operation. Note cjj includes bj. */
1067: for (k=0; nextb<bnzi; k++) {
1068: if (cjj[k] == *(bj+nextb)) { /* ccol == bcol */
1069: caj[k] += (*aa)*(*(ba+nextb));
1070: nextb++;
1071: }
1072: }
1073: flops += 2*bnzi;
1074: aa++;
1075: }
1076: }
1078: /* Assemble the final matrix and clean up */
1079: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
1080: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
1081: PetscLogFlops(flops);
1082: return(0);
1083: }
1087: PetscErrorCode MatMatMult_SeqAIJ_SeqDense(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
1088: {
1092: if (scall == MAT_INITIAL_MATRIX) {
1093: PetscLogEventBegin(MAT_MatMultSymbolic,A,B,0,0);
1094: MatMatMultSymbolic_SeqAIJ_SeqDense(A,B,fill,C);
1095: PetscLogEventEnd(MAT_MatMultSymbolic,A,B,0,0);
1096: }
1097: PetscLogEventBegin(MAT_MatMultNumeric,A,B,0,0);
1098: MatMatMultNumeric_SeqAIJ_SeqDense(A,B,*C);
1099: PetscLogEventEnd(MAT_MatMultNumeric,A,B,0,0);
1100: return(0);
1101: }
1105: PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat *C)
1106: {
1110: MatMatMultSymbolic_SeqDense_SeqDense(A,B,0.0,C);
1112: (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqDense;
1113: return(0);
1114: }
1118: PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqDense(Mat A,Mat B,Mat C)
1119: {
1120: Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data;
1121: PetscErrorCode ierr;
1122: PetscScalar *c,*b,r1,r2,r3,r4,*c1,*c2,*c3,*c4,aatmp;
1123: const PetscScalar *aa,*b1,*b2,*b3,*b4;
1124: const PetscInt *aj;
1125: PetscInt cm=C->rmap->n,cn=B->cmap->n,bm=B->rmap->n,am=A->rmap->n;
1126: PetscInt am4=4*am,bm4=4*bm,col,i,j,n,ajtmp;
1129: if (!cm || !cn) return(0);
1130: if (bm != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in A %D not equal rows in B %D\n",A->cmap->n,bm);
1131: if (A->rmap->n != C->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number rows in C %D not equal rows in A %D\n",C->rmap->n,A->rmap->n);
1132: if (B->cmap->n != C->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in B %D not equal columns in C %D\n",B->cmap->n,C->cmap->n);
1133: MatDenseGetArray(B,&b);
1134: MatDenseGetArray(C,&c);
1135: b1 = b; b2 = b1 + bm; b3 = b2 + bm; b4 = b3 + bm;
1136: c1 = c; c2 = c1 + am; c3 = c2 + am; c4 = c3 + am;
1137: for (col=0; col<cn-4; col += 4) { /* over columns of C */
1138: for (i=0; i<am; i++) { /* over rows of C in those columns */
1139: r1 = r2 = r3 = r4 = 0.0;
1140: n = a->i[i+1] - a->i[i];
1141: aj = a->j + a->i[i];
1142: aa = a->a + a->i[i];
1143: for (j=0; j<n; j++) {
1144: aatmp = aa[j]; ajtmp = aj[j];
1145: r1 += aatmp*b1[ajtmp];
1146: r2 += aatmp*b2[ajtmp];
1147: r3 += aatmp*b3[ajtmp];
1148: r4 += aatmp*b4[ajtmp];
1149: }
1150: c1[i] = r1;
1151: c2[i] = r2;
1152: c3[i] = r3;
1153: c4[i] = r4;
1154: }
1155: b1 += bm4; b2 += bm4; b3 += bm4; b4 += bm4;
1156: c1 += am4; c2 += am4; c3 += am4; c4 += am4;
1157: }
1158: for (; col<cn; col++) { /* over extra columns of C */
1159: for (i=0; i<am; i++) { /* over rows of C in those columns */
1160: r1 = 0.0;
1161: n = a->i[i+1] - a->i[i];
1162: aj = a->j + a->i[i];
1163: aa = a->a + a->i[i];
1164: for (j=0; j<n; j++) {
1165: r1 += aa[j]*b1[aj[j]];
1166: }
1167: c1[i] = r1;
1168: }
1169: b1 += bm;
1170: c1 += am;
1171: }
1172: PetscLogFlops(cn*(2.0*a->nz));
1173: MatDenseRestoreArray(B,&b);
1174: MatDenseRestoreArray(C,&c);
1175: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
1176: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
1177: return(0);
1178: }
1180: /*
1181: Note very similar to MatMult_SeqAIJ(), should generate both codes from same base
1182: */
1185: PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat A,Mat B,Mat C)
1186: {
1187: Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data;
1189: PetscScalar *b,*c,r1,r2,r3,r4,*b1,*b2,*b3,*b4;
1190: MatScalar *aa;
1191: PetscInt cm = C->rmap->n, cn=B->cmap->n, bm=B->rmap->n, col, i,j,n,*aj, am = A->rmap->n,*ii,arm;
1192: PetscInt am2 = 2*am, am3 = 3*am, bm4 = 4*bm,colam,*ridx;
1195: if (!cm || !cn) return(0);
1196: MatDenseGetArray(B,&b);
1197: MatDenseGetArray(C,&c);
1198: b1 = b; b2 = b1 + bm; b3 = b2 + bm; b4 = b3 + bm;
1200: if (a->compressedrow.use) { /* use compressed row format */
1201: for (col=0; col<cn-4; col += 4) { /* over columns of C */
1202: colam = col*am;
1203: arm = a->compressedrow.nrows;
1204: ii = a->compressedrow.i;
1205: ridx = a->compressedrow.rindex;
1206: for (i=0; i<arm; i++) { /* over rows of C in those columns */
1207: r1 = r2 = r3 = r4 = 0.0;
1208: n = ii[i+1] - ii[i];
1209: aj = a->j + ii[i];
1210: aa = a->a + ii[i];
1211: for (j=0; j<n; j++) {
1212: r1 += (*aa)*b1[*aj];
1213: r2 += (*aa)*b2[*aj];
1214: r3 += (*aa)*b3[*aj];
1215: r4 += (*aa++)*b4[*aj++];
1216: }
1217: c[colam + ridx[i]] += r1;
1218: c[colam + am + ridx[i]] += r2;
1219: c[colam + am2 + ridx[i]] += r3;
1220: c[colam + am3 + ridx[i]] += r4;
1221: }
1222: b1 += bm4;
1223: b2 += bm4;
1224: b3 += bm4;
1225: b4 += bm4;
1226: }
1227: for (; col<cn; col++) { /* over extra columns of C */
1228: colam = col*am;
1229: arm = a->compressedrow.nrows;
1230: ii = a->compressedrow.i;
1231: ridx = a->compressedrow.rindex;
1232: for (i=0; i<arm; i++) { /* over rows of C in those columns */
1233: r1 = 0.0;
1234: n = ii[i+1] - ii[i];
1235: aj = a->j + ii[i];
1236: aa = a->a + ii[i];
1238: for (j=0; j<n; j++) {
1239: r1 += (*aa++)*b1[*aj++];
1240: }
1241: c[colam + ridx[i]] += r1;
1242: }
1243: b1 += bm;
1244: }
1245: } else {
1246: for (col=0; col<cn-4; col += 4) { /* over columns of C */
1247: colam = col*am;
1248: for (i=0; i<am; i++) { /* over rows of C in those columns */
1249: r1 = r2 = r3 = r4 = 0.0;
1250: n = a->i[i+1] - a->i[i];
1251: aj = a->j + a->i[i];
1252: aa = a->a + a->i[i];
1253: for (j=0; j<n; j++) {
1254: r1 += (*aa)*b1[*aj];
1255: r2 += (*aa)*b2[*aj];
1256: r3 += (*aa)*b3[*aj];
1257: r4 += (*aa++)*b4[*aj++];
1258: }
1259: c[colam + i] += r1;
1260: c[colam + am + i] += r2;
1261: c[colam + am2 + i] += r3;
1262: c[colam + am3 + i] += r4;
1263: }
1264: b1 += bm4;
1265: b2 += bm4;
1266: b3 += bm4;
1267: b4 += bm4;
1268: }
1269: for (; col<cn; col++) { /* over extra columns of C */
1270: colam = col*am;
1271: for (i=0; i<am; i++) { /* over rows of C in those columns */
1272: r1 = 0.0;
1273: n = a->i[i+1] - a->i[i];
1274: aj = a->j + a->i[i];
1275: aa = a->a + a->i[i];
1277: for (j=0; j<n; j++) {
1278: r1 += (*aa++)*b1[*aj++];
1279: }
1280: c[colam + i] += r1;
1281: }
1282: b1 += bm;
1283: }
1284: }
1285: PetscLogFlops(cn*2.0*a->nz);
1286: MatDenseRestoreArray(B,&b);
1287: MatDenseRestoreArray(C,&c);
1288: return(0);
1289: }
1293: PetscErrorCode MatTransColoringApplySpToDen_SeqAIJ(MatTransposeColoring coloring,Mat B,Mat Btdense)
1294: {
1296: Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data;
1297: Mat_SeqDense *btdense = (Mat_SeqDense*)Btdense->data;
1298: PetscInt *bi = b->i,*bj=b->j;
1299: PetscInt m = Btdense->rmap->n,n=Btdense->cmap->n,j,k,l,col,anz,*btcol,brow,ncolumns;
1300: MatScalar *btval,*btval_den,*ba=b->a;
1301: PetscInt *columns=coloring->columns,*colorforcol=coloring->colorforcol,ncolors=coloring->ncolors;
1304: btval_den=btdense->v;
1305: PetscMemzero(btval_den,(m*n)*sizeof(MatScalar));
1306: for (k=0; k<ncolors; k++) {
1307: ncolumns = coloring->ncolumns[k];
1308: for (l=0; l<ncolumns; l++) { /* insert a row of B to a column of Btdense */
1309: col = *(columns + colorforcol[k] + l);
1310: btcol = bj + bi[col];
1311: btval = ba + bi[col];
1312: anz = bi[col+1] - bi[col];
1313: for (j=0; j<anz; j++) {
1314: brow = btcol[j];
1315: btval_den[brow] = btval[j];
1316: }
1317: }
1318: btval_den += m;
1319: }
1320: return(0);
1321: }
1325: PetscErrorCode MatTransColoringApplyDenToSp_SeqAIJ(MatTransposeColoring matcoloring,Mat Cden,Mat Csp)
1326: {
1328: Mat_SeqAIJ *csp = (Mat_SeqAIJ*)Csp->data;
1329: PetscScalar *ca_den,*ca_den_ptr,*ca=csp->a;
1330: PetscInt k,l,m=Cden->rmap->n,ncolors=matcoloring->ncolors;
1331: PetscInt brows=matcoloring->brows,*den2sp=matcoloring->den2sp;
1332: PetscInt nrows,*row,*idx;
1333: PetscInt *rows=matcoloring->rows,*colorforrow=matcoloring->colorforrow;
1336: MatDenseGetArray(Cden,&ca_den);
1338: if (brows > 0) {
1339: PetscInt *lstart,row_end,row_start;
1340: lstart = matcoloring->lstart;
1341: PetscMemzero(lstart,ncolors*sizeof(PetscInt));
1343: row_end = brows;
1344: if (row_end > m) row_end = m;
1345: for (row_start=0; row_start<m; row_start+=brows) { /* loop over row blocks of Csp */
1346: ca_den_ptr = ca_den;
1347: for (k=0; k<ncolors; k++) { /* loop over colors (columns of Cden) */
1348: nrows = matcoloring->nrows[k];
1349: row = rows + colorforrow[k];
1350: idx = den2sp + colorforrow[k];
1351: for (l=lstart[k]; l<nrows; l++) {
1352: if (row[l] >= row_end) {
1353: lstart[k] = l;
1354: break;
1355: } else {
1356: ca[idx[l]] = ca_den_ptr[row[l]];
1357: }
1358: }
1359: ca_den_ptr += m;
1360: }
1361: row_end += brows;
1362: if (row_end > m) row_end = m;
1363: }
1364: } else { /* non-blocked impl: loop over columns of Csp - slow if Csp is large */
1365: ca_den_ptr = ca_den;
1366: for (k=0; k<ncolors; k++) {
1367: nrows = matcoloring->nrows[k];
1368: row = rows + colorforrow[k];
1369: idx = den2sp + colorforrow[k];
1370: for (l=0; l<nrows; l++) {
1371: ca[idx[l]] = ca_den_ptr[row[l]];
1372: }
1373: ca_den_ptr += m;
1374: }
1375: }
1377: MatDenseRestoreArray(Cden,&ca_den);
1378: #if defined(PETSC_USE_INFO)
1379: if (matcoloring->brows > 0) {
1380: PetscInfo1(Csp,"Loop over %D row blocks for den2sp\n",brows);
1381: } else {
1382: PetscInfo(Csp,"Loop over colors/columns of Cden, inefficient for large sparse matrix product \n");
1383: }
1384: #endif
1385: return(0);
1386: }
1390: PetscErrorCode MatTransposeColoringCreate_SeqAIJ(Mat mat,ISColoring iscoloring,MatTransposeColoring c)
1391: {
1393: PetscInt i,n,nrows,Nbs,j,k,m,ncols,col,cm;
1394: const PetscInt *is,*ci,*cj,*row_idx;
1395: PetscInt nis = iscoloring->n,*rowhit,bs = 1;
1396: IS *isa;
1397: Mat_SeqAIJ *csp = (Mat_SeqAIJ*)mat->data;
1398: PetscInt *colorforrow,*rows,*rows_i,*idxhit,*spidx,*den2sp,*den2sp_i;
1399: PetscInt *colorforcol,*columns,*columns_i,brows;
1400: PetscBool flg;
1403: ISColoringGetIS(iscoloring,PETSC_IGNORE,&isa);
1405: /* bs >1 is not being tested yet! */
1406: Nbs = mat->cmap->N/bs;
1407: c->M = mat->rmap->N/bs; /* set total rows, columns and local rows */
1408: c->N = Nbs;
1409: c->m = c->M;
1410: c->rstart = 0;
1411: c->brows = 100;
1413: c->ncolors = nis;
1414: PetscMalloc3(nis,&c->ncolumns,nis,&c->nrows,nis+1,&colorforrow);
1415: PetscMalloc1(csp->nz+1,&rows);
1416: PetscMalloc1(csp->nz+1,&den2sp);
1418: brows = c->brows;
1419: PetscOptionsGetInt(NULL,"-matden2sp_brows",&brows,&flg);
1420: if (flg) c->brows = brows;
1421: if (brows > 0) {
1422: PetscMalloc1(nis+1,&c->lstart);
1423: }
1425: colorforrow[0] = 0;
1426: rows_i = rows;
1427: den2sp_i = den2sp;
1429: PetscMalloc1(nis+1,&colorforcol);
1430: PetscMalloc1(Nbs+1,&columns);
1432: colorforcol[0] = 0;
1433: columns_i = columns;
1435: /* get column-wise storage of mat */
1436: MatGetColumnIJ_SeqAIJ_Color(mat,0,PETSC_FALSE,PETSC_FALSE,&ncols,&ci,&cj,&spidx,NULL);
1438: cm = c->m;
1439: PetscMalloc1(cm+1,&rowhit);
1440: PetscMalloc1(cm+1,&idxhit);
1441: for (i=0; i<nis; i++) { /* loop over color */
1442: ISGetLocalSize(isa[i],&n);
1443: ISGetIndices(isa[i],&is);
1445: c->ncolumns[i] = n;
1446: if (n) {
1447: PetscMemcpy(columns_i,is,n*sizeof(PetscInt));
1448: }
1449: colorforcol[i+1] = colorforcol[i] + n;
1450: columns_i += n;
1452: /* fast, crude version requires O(N*N) work */
1453: PetscMemzero(rowhit,cm*sizeof(PetscInt));
1455: for (j=0; j<n; j++) { /* loop over columns*/
1456: col = is[j];
1457: row_idx = cj + ci[col];
1458: m = ci[col+1] - ci[col];
1459: for (k=0; k<m; k++) { /* loop over columns marking them in rowhit */
1460: idxhit[*row_idx] = spidx[ci[col] + k];
1461: rowhit[*row_idx++] = col + 1;
1462: }
1463: }
1464: /* count the number of hits */
1465: nrows = 0;
1466: for (j=0; j<cm; j++) {
1467: if (rowhit[j]) nrows++;
1468: }
1469: c->nrows[i] = nrows;
1470: colorforrow[i+1] = colorforrow[i] + nrows;
1472: nrows = 0;
1473: for (j=0; j<cm; j++) { /* loop over rows */
1474: if (rowhit[j]) {
1475: rows_i[nrows] = j;
1476: den2sp_i[nrows] = idxhit[j];
1477: nrows++;
1478: }
1479: }
1480: den2sp_i += nrows;
1482: ISRestoreIndices(isa[i],&is);
1483: rows_i += nrows;
1484: }
1485: MatRestoreColumnIJ_SeqAIJ_Color(mat,0,PETSC_FALSE,PETSC_FALSE,&ncols,&ci,&cj,&spidx,NULL);
1486: PetscFree(rowhit);
1487: ISColoringRestoreIS(iscoloring,&isa);
1488: if (csp->nz != colorforrow[nis]) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"csp->nz %d != colorforrow[nis] %d",csp->nz,colorforrow[nis]);
1490: c->colorforrow = colorforrow;
1491: c->rows = rows;
1492: c->den2sp = den2sp;
1493: c->colorforcol = colorforcol;
1494: c->columns = columns;
1496: PetscFree(idxhit);
1497: return(0);
1498: }