Actual source code: matmatmult.c
petsc-3.3-p7 2013-05-11
2: /*
3: Defines matrix-matrix product routines for pairs of SeqAIJ matrices
4: C = A * B
5: */
7: #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
8: #include <../src/mat/utils/freespace.h>
9: #include <petscbt.h>
10: #include <../src/mat/impls/dense/seq/dense.h> /*I "petscmat.h" I*/
11: /*
12: #define DEBUG_MATMATMULT
13: */
14: EXTERN_C_BEGIN
17: PetscErrorCode MatMatMult_SeqAIJ_SeqAIJ(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
18: {
20: PetscBool scalable=PETSC_FALSE,scalable_fast=PETSC_FALSE;
23: if (scall == MAT_INITIAL_MATRIX){
24: PetscObjectOptionsBegin((PetscObject)A);
25: PetscOptionsBool("-matmatmult_scalable","Use a scalable but slower C=A*B","",scalable,&scalable,PETSC_NULL);
26: PetscOptionsBool("-matmatmult_scalable_fast","Use a scalable but slower C=A*B","",scalable_fast,&scalable_fast,PETSC_NULL);
27: PetscOptionsEnd();
28: PetscLogEventBegin(MAT_MatMultSymbolic,A,B,0,0);
29: if (scalable_fast){
30: MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(A,B,fill,C);
31: } else if (scalable){
32: MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(A,B,fill,C);
33: } else {
34: MatMatMultSymbolic_SeqAIJ_SeqAIJ(A,B,fill,C);
35: }
36: PetscLogEventEnd(MAT_MatMultSymbolic,A,B,0,0);
37: }
38: PetscLogEventBegin(MAT_MatMultNumeric,A,B,0,0);
39: (*(*C)->ops->matmultnumeric)(A,B,*C);
40: PetscLogEventEnd(MAT_MatMultNumeric,A,B,0,0);
41: return(0);
42: }
43: EXTERN_C_END
47: PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat *C)
48: {
49: PetscErrorCode ierr;
50: Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
51: PetscInt *ai=a->i,*bi=b->i,*ci,*cj;
52: PetscInt am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
53: PetscReal afill;
54: PetscInt i,j,anzi,brow,bnzj,cnzi,*bj,*aj,nlnk_max,*lnk,ndouble=0;
55: PetscBT lnkbt;
56: PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
59: /* Get ci and cj */
60: /*---------------*/
61: /* Allocate ci array, arrays for fill computation and */
62: /* free space for accumulating nonzero column info */
63: PetscMalloc(((am+1)+1)*sizeof(PetscInt),&ci);
64: ci[0] = 0;
65:
66: /* create and initialize a linked list */
67: nlnk_max = a->rmax*b->rmax;
68: if (!nlnk_max || nlnk_max > bn) nlnk_max = bn;
69: PetscLLCondensedCreate(nlnk_max,bn,&lnk,&lnkbt);
71: /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
72: PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+bi[bm])),&free_space);
73: current_space = free_space;
75: /* Determine ci and cj */
76: for (i=0; i<am; i++) {
77: anzi = ai[i+1] - ai[i];
78: aj = a->j + ai[i];
79: for (j=0; j<anzi; j++){
80: brow = aj[j];
81: bnzj = bi[brow+1] - bi[brow];
82: bj = b->j + bi[brow];
83: /* add non-zero cols of B into the sorted linked list lnk */
84: PetscLLCondensedAddSorted(bnzj,bj,lnk,lnkbt);
85: }
86: cnzi = lnk[0];
88: /* If free space is not available, make more free space */
89: /* Double the amount of total space in the list */
90: if (current_space->local_remaining<cnzi) {
91: PetscFreeSpaceGet(cnzi+current_space->total_array_size,¤t_space);
92: ndouble++;
93: }
95: /* Copy data into free space, then initialize lnk */
96: PetscLLCondensedClean(bn,cnzi,current_space->array,lnk,lnkbt);
97: current_space->array += cnzi;
98: current_space->local_used += cnzi;
99: current_space->local_remaining -= cnzi;
100: ci[i+1] = ci[i] + cnzi;
101: }
103: /* Column indices are in the list of free space */
104: /* Allocate space for cj, initialize cj, and */
105: /* destroy list of free space and other temporary array(s) */
106: PetscMalloc((ci[am]+1)*sizeof(PetscInt),&cj);
107: PetscFreeSpaceContiguous(&free_space,cj);
108: PetscLLCondensedDestroy(lnk,lnkbt);
109:
110: /* put together the new symbolic matrix */
111: MatCreateSeqAIJWithArrays(((PetscObject)A)->comm,am,bn,ci,cj,PETSC_NULL,C);
112: (*C)->rmap->bs = A->rmap->bs;
113: (*C)->cmap->bs = B->cmap->bs;
115: /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
116: /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
117: c = (Mat_SeqAIJ *)((*C)->data);
118: c->free_a = PETSC_FALSE;
119: c->free_ij = PETSC_TRUE;
120: c->nonew = 0;
121: (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ; /* fast, needs non-scalable O(bn) array 'abdense' */
122:
123: /* set MatInfo */
124: afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
125: if (afill < 1.0) afill = 1.0;
126: c->maxnz = ci[am];
127: c->nz = ci[am];
128: (*C)->info.mallocs = ndouble;
129: (*C)->info.fill_ratio_given = fill;
130: (*C)->info.fill_ratio_needed = afill;
132: #if defined(PETSC_USE_INFO)
133: if (ci[am]) {
134: PetscInfo3((*C),"Reallocs %D; Fill ratio: given %G needed %G.\n",ndouble,fill,afill);
135: PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%G,&C) for best performance.;\n",afill);
136: } else {
137: PetscInfo((*C),"Empty matrix product\n");
138: }
139: #endif
140: return(0);
141: }
145: PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
146: {
148: PetscLogDouble flops=0.0;
149: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
150: Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data;
151: Mat_SeqAIJ *c = (Mat_SeqAIJ *)C->data;
152: PetscInt *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bjj,*ci=c->i,*cj=c->j;
153: PetscInt am=A->rmap->n,cm=C->rmap->n;
154: PetscInt i,j,k,anzi,bnzi,cnzi,brow;
155: PetscScalar *aa=a->a,*ba=b->a,*baj,*ca,valtmp;
156: PetscScalar *ab_dense;
157:
159: /* printf("MatMatMultNumeric_SeqAIJ_SeqAIJ...ca %p\n",c->a); */
160: if (!c->a){ /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */
161: PetscMalloc((ci[cm]+1)*sizeof(MatScalar),&ca);
162: c->a = ca;
163: c->free_a = PETSC_TRUE;
165: PetscMalloc(B->cmap->N*sizeof(PetscScalar),&ab_dense);
166: PetscMemzero(ab_dense,B->cmap->N*sizeof(PetscScalar));
167: c->matmult_abdense = ab_dense;
168: } else {
169: ca = c->a;
170: ab_dense = c->matmult_abdense;
171: }
173: /* clean old values in C */
174: PetscMemzero(ca,ci[cm]*sizeof(MatScalar));
175: /* Traverse A row-wise. */
176: /* Build the ith row in C by summing over nonzero columns in A, */
177: /* the rows of B corresponding to nonzeros of A. */
178: for (i=0; i<am; i++) {
179: anzi = ai[i+1] - ai[i];
180: for (j=0; j<anzi; j++) {
181: brow = aj[j];
182: bnzi = bi[brow+1] - bi[brow];
183: bjj = bj + bi[brow];
184: baj = ba + bi[brow];
185: /* perform dense axpy */
186: valtmp = aa[j];
187: for (k=0; k<bnzi; k++) {
188: ab_dense[bjj[k]] += valtmp*baj[k];
189: }
190: flops += 2*bnzi;
191: }
192: aj += anzi; aa += anzi;
194: cnzi = ci[i+1] - ci[i];
195: for (k=0; k<cnzi; k++) {
196: ca[k] += ab_dense[cj[k]];
197: ab_dense[cj[k]] = 0.0; /* zero ab_dense */
198: }
199: flops += cnzi;
200: cj += cnzi; ca += cnzi;
201: }
202: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
203: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
204: PetscLogFlops(flops);
205: return(0);
206: }
210: PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(Mat A,Mat B,Mat C)
211: {
213: PetscLogDouble flops=0.0;
214: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
215: Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data;
216: Mat_SeqAIJ *c = (Mat_SeqAIJ *)C->data;
217: PetscInt *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bjj,*ci=c->i,*cj=c->j;
218: PetscInt am=A->rmap->N,cm=C->rmap->N;
219: PetscInt i,j,k,anzi,bnzi,cnzi,brow;
220: PetscScalar *aa=a->a,*ba=b->a,*baj,*ca=c->a,valtmp;
221: PetscInt nextb;
222:
224: #if defined(DEBUG_MATMATMULT)
225: PetscPrintf(PETSC_COMM_SELF,"MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable...\n");
226: #endif
227: /* clean old values in C */
228: PetscMemzero(ca,ci[cm]*sizeof(MatScalar));
229: /* Traverse A row-wise. */
230: /* Build the ith row in C by summing over nonzero columns in A, */
231: /* the rows of B corresponding to nonzeros of A. */
232: for (i=0;i<am;i++) {
233: anzi = ai[i+1] - ai[i];
234: cnzi = ci[i+1] - ci[i];
235: for (j=0;j<anzi;j++) {
236: brow = aj[j];
237: bnzi = bi[brow+1] - bi[brow];
238: bjj = bj + bi[brow];
239: baj = ba + bi[brow];
240: /* perform sparse axpy */
241: valtmp = aa[j];
242: nextb = 0;
243: for (k=0; nextb<bnzi; k++) {
244: if (cj[k] == bjj[nextb]){ /* ccol == bcol */
245: ca[k] += valtmp*baj[nextb++];
246: }
247: }
248: flops += 2*bnzi;
249: }
250: aj += anzi; aa += anzi;
251: cj += cnzi; ca += cnzi;
252: }
254: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
255: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
256: PetscLogFlops(flops);
257: return(0);
258: }
262: PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(Mat A,Mat B,PetscReal fill,Mat *C)
263: {
264: PetscErrorCode ierr;
265: Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
266: PetscInt *ai=a->i,*bi=b->i,*ci,*cj;
267: PetscInt am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
268: MatScalar *ca;
269: PetscReal afill;
270: PetscInt i,j,anzi,brow,bnzj,cnzi,*bj,*aj,nlnk_max,*lnk,ndouble=0;
271: PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
274: /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_fast() */
275: /*-----------------------------------------------------------------------------------------*/
276: /* Allocate arrays for fill computation and free space for accumulating nonzero column */
277: PetscMalloc(((am+1)+1)*sizeof(PetscInt),&ci);
278: ci[0] = 0;
279:
280: /* create and initialize a linked list */
281: nlnk_max = a->rmax*b->rmax;
282: if (!nlnk_max || nlnk_max > bn) nlnk_max = bn; /* in case rmax is not defined for A or B */
283: PetscLLCondensedCreate_fast(nlnk_max,&lnk);
285: /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
286: PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+bi[bm])),&free_space);
287: current_space = free_space;
289: /* Determine ci and cj */
290: for (i=0; i<am; i++) {
291: anzi = ai[i+1] - ai[i];
292: aj = a->j + ai[i];
293: for (j=0; j<anzi; j++){
294: brow = aj[j];
295: bnzj = bi[brow+1] - bi[brow];
296: bj = b->j + bi[brow];
297: /* add non-zero cols of B into the sorted linked list lnk */
298: PetscLLCondensedAddSorted_fast(bnzj,bj,lnk);
299: }
300: cnzi = lnk[1];
302: /* If free space is not available, make more free space */
303: /* Double the amount of total space in the list */
304: if (current_space->local_remaining<cnzi) {
305: PetscFreeSpaceGet(cnzi+current_space->total_array_size,¤t_space);
306: ndouble++;
307: }
309: /* Copy data into free space, then initialize lnk */
310: PetscLLCondensedClean_fast(cnzi,current_space->array,lnk);
311: current_space->array += cnzi;
312: current_space->local_used += cnzi;
313: current_space->local_remaining -= cnzi;
314: ci[i+1] = ci[i] + cnzi;
315: }
317: /* Column indices are in the list of free space */
318: /* Allocate space for cj, initialize cj, and */
319: /* destroy list of free space and other temporary array(s) */
320: PetscMalloc((ci[am]+1)*sizeof(PetscInt),&cj);
321: PetscFreeSpaceContiguous(&free_space,cj);
322: PetscLLCondensedDestroy_fast(lnk);
323:
324: /* Allocate space for ca */
325: PetscMalloc((ci[am]+1)*sizeof(MatScalar),&ca);
326: PetscMemzero(ca,(ci[am]+1)*sizeof(MatScalar));
327:
328: /* put together the new symbolic matrix */
329: MatCreateSeqAIJWithArrays(((PetscObject)A)->comm,am,bn,ci,cj,ca,C);
330: (*C)->rmap->bs = A->rmap->bs;
331: (*C)->cmap->bs = B->cmap->bs;
333: /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
334: /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
335: c = (Mat_SeqAIJ *)((*C)->data);
336: c->free_a = PETSC_TRUE;
337: c->free_ij = PETSC_TRUE;
338: c->nonew = 0;
339: (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable; /* slower, less memory */
340:
341: /* set MatInfo */
342: afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
343: if (afill < 1.0) afill = 1.0;
344: c->maxnz = ci[am];
345: c->nz = ci[am];
346: (*C)->info.mallocs = ndouble;
347: (*C)->info.fill_ratio_given = fill;
348: (*C)->info.fill_ratio_needed = afill;
350: #if defined(PETSC_USE_INFO)
351: if (ci[am]) {
352: PetscInfo3((*C),"Reallocs %D; Fill ratio: given %G needed %G.\n",ndouble,fill,afill);
353: PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%G,&C) for best performance.;\n",afill);
354: } else {
355: PetscInfo((*C),"Empty matrix product\n");
356: }
357: #endif
358: return(0);
359: }
364: PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(Mat A,Mat B,PetscReal fill,Mat *C)
365: {
366: PetscErrorCode ierr;
367: Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
368: PetscInt *ai=a->i,*bi=b->i,*ci,*cj;
369: PetscInt am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
370: MatScalar *ca;
371: PetscReal afill;
372: PetscInt i,j,anzi,brow,bnzj,cnzi,*bj,*aj,nlnk_max,*lnk,ndouble=0;
373: PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
376: /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_Scalalbe() */
377: /*---------------------------------------------------------------------------------------------*/
378: /* Allocate arrays for fill computation and free space for accumulating nonzero column */
379: PetscMalloc(((am+1)+1)*sizeof(PetscInt),&ci);
380: ci[0] = 0;
381:
382: /* create and initialize a linked list */
383: nlnk_max = a->rmax*b->rmax;
384: if (!nlnk_max || nlnk_max > bn) nlnk_max = bn; /* in case rmax is not defined for A or B */
385: PetscLLCondensedCreate_Scalable(nlnk_max,&lnk);
387: /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
388: PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+bi[bm])),&free_space);
389: current_space = free_space;
391: /* Determine ci and cj */
392: for (i=0; i<am; i++) {
393: anzi = ai[i+1] - ai[i];
394: aj = a->j + ai[i];
395: for (j=0; j<anzi; j++){
396: brow = aj[j];
397: bnzj = bi[brow+1] - bi[brow];
398: bj = b->j + bi[brow];
399: /* add non-zero cols of B into the sorted linked list lnk */
400: PetscLLCondensedAddSorted_Scalable(bnzj,bj,lnk);
401: }
402: cnzi = lnk[0];
404: /* If free space is not available, make more free space */
405: /* Double the amount of total space in the list */
406: if (current_space->local_remaining<cnzi) {
407: PetscFreeSpaceGet(cnzi+current_space->total_array_size,¤t_space);
408: ndouble++;
409: }
411: /* Copy data into free space, then initialize lnk */
412: PetscLLCondensedClean_Scalable(cnzi,current_space->array,lnk);
413: current_space->array += cnzi;
414: current_space->local_used += cnzi;
415: current_space->local_remaining -= cnzi;
416: ci[i+1] = ci[i] + cnzi;
417: }
419: /* Column indices are in the list of free space */
420: /* Allocate space for cj, initialize cj, and */
421: /* destroy list of free space and other temporary array(s) */
422: PetscMalloc((ci[am]+1)*sizeof(PetscInt),&cj);
423: PetscFreeSpaceContiguous(&free_space,cj);
424: PetscLLCondensedDestroy_Scalable(lnk);
425:
426: /* Allocate space for ca */
427: /*-----------------------*/
428: PetscMalloc((ci[am]+1)*sizeof(MatScalar),&ca);
429: PetscMemzero(ca,(ci[am]+1)*sizeof(MatScalar));
430:
431: /* put together the new symbolic matrix */
432: MatCreateSeqAIJWithArrays(((PetscObject)A)->comm,am,bn,ci,cj,ca,C);
433: (*C)->rmap->bs = A->rmap->bs;
434: (*C)->cmap->bs = B->cmap->bs;
436: /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
437: /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
438: c = (Mat_SeqAIJ *)((*C)->data);
439: c->free_a = PETSC_TRUE;
440: c->free_ij = PETSC_TRUE;
441: c->nonew = 0;
442: (*C)->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable; /* slower, less memory */
443:
444: /* set MatInfo */
445: afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
446: if (afill < 1.0) afill = 1.0;
447: c->maxnz = ci[am];
448: c->nz = ci[am];
449: (*C)->info.mallocs = ndouble;
450: (*C)->info.fill_ratio_given = fill;
451: (*C)->info.fill_ratio_needed = afill;
453: #if defined(PETSC_USE_INFO)
454: if (ci[am]) {
455: PetscInfo3((*C),"Reallocs %D; Fill ratio: given %G needed %G.\n",ndouble,fill,afill);
456: PetscInfo1((*C),"Use MatMatMult(A,B,MatReuse,%G,&C) for best performance.;\n",afill);
457: } else {
458: PetscInfo((*C),"Empty matrix product\n");
459: }
460: #endif
461: return(0);
462: }
465: /* This routine is not used. Should be removed! */
468: PetscErrorCode MatMatTransposeMult_SeqAIJ_SeqAIJ(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
469: {
471:
473: if (scall == MAT_INITIAL_MATRIX){
474: MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ(A,B,fill,C);
475: }
476: MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ(A,B,*C);
477: return(0);
478: }
482: PetscErrorCode PetscContainerDestroy_Mat_MatMatTransMult(void *ptr)
483: {
484: PetscErrorCode ierr;
485: Mat_MatMatTransMult *multtrans=(Mat_MatMatTransMult*)ptr;
488: MatTransposeColoringDestroy(&multtrans->matcoloring);
489: MatDestroy(&multtrans->Bt_den);
490: MatDestroy(&multtrans->ABt_den);
491: PetscFree(multtrans);
492: return(0);
493: }
497: PetscErrorCode MatDestroy_SeqAIJ_MatMatMultTrans(Mat A)
498: {
499: PetscErrorCode ierr;
500: PetscContainer container;
501: Mat_MatMatTransMult *multtrans=PETSC_NULL;
504: PetscObjectQuery((PetscObject)A,"Mat_MatMatTransMult",(PetscObject *)&container);
505: if (!container) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Container does not exit");
506: PetscContainerGetPointer(container,(void **)&multtrans);
507: A->ops->destroy = multtrans->destroy;
508: if (A->ops->destroy) {
509: (*A->ops->destroy)(A);
510: }
511: PetscObjectCompose((PetscObject)A,"Mat_MatMatTransMult",0);
512: return(0);
513: }
517: PetscErrorCode MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat *C)
518: {
519: PetscErrorCode ierr;
520: Mat Bt;
521: PetscInt *bti,*btj;
522: Mat_MatMatTransMult *multtrans;
523: PetscContainer container;
524: PetscLogDouble t0,tf,etime2=0.0;
525:
527: PetscGetTime(&t0);
528: /* create symbolic Bt */
529: MatGetSymbolicTranspose_SeqAIJ(B,&bti,&btj);
530: MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,B->cmap->n,B->rmap->n,bti,btj,PETSC_NULL,&Bt);
531: Bt->rmap->bs = A->cmap->bs;
532: Bt->cmap->bs = B->cmap->bs;
534: /* get symbolic C=A*Bt */
535: MatMatMultSymbolic_SeqAIJ_SeqAIJ(A,Bt,fill,C);
537: /* create a supporting struct for reuse intermidiate dense matrices with matcoloring */
538: PetscNew(Mat_MatMatTransMult,&multtrans);
540: /* attach the supporting struct to C */
541: PetscContainerCreate(PETSC_COMM_SELF,&container);
542: PetscContainerSetPointer(container,multtrans);
543: PetscContainerSetUserDestroy(container,PetscContainerDestroy_Mat_MatMatTransMult);
544: PetscObjectCompose((PetscObject)(*C),"Mat_MatMatTransMult",(PetscObject)container);
545: PetscContainerDestroy(&container);
547: multtrans->usecoloring = PETSC_FALSE;
548: multtrans->destroy = (*C)->ops->destroy;
549: (*C)->ops->destroy = MatDestroy_SeqAIJ_MatMatMultTrans;
551: PetscGetTime(&tf);
552: etime2 += tf - t0;
554: PetscOptionsGetBool(PETSC_NULL,"-matmattransmult_color",&multtrans->usecoloring,PETSC_NULL);
555: if (multtrans->usecoloring){
556: /* Create MatTransposeColoring from symbolic C=A*B^T */
557: MatTransposeColoring matcoloring;
558: ISColoring iscoloring;
559: Mat Bt_dense,C_dense;
560: PetscLogDouble etime0=0.0,etime01=0.0,etime1=0.0;
561:
562: PetscGetTime(&t0);
563: MatGetColoring(*C,MATCOLORINGLF,&iscoloring);
564: PetscGetTime(&tf);
565: etime0 += tf - t0;
567: PetscGetTime(&t0);
568: MatTransposeColoringCreate(*C,iscoloring,&matcoloring);
569: multtrans->matcoloring = matcoloring;
570: ISColoringDestroy(&iscoloring);
571: PetscGetTime(&tf);
572: etime01 += tf - t0;
574: PetscGetTime(&t0);
575: /* Create Bt_dense and C_dense = A*Bt_dense */
576: MatCreate(PETSC_COMM_SELF,&Bt_dense);
577: MatSetSizes(Bt_dense,A->cmap->n,matcoloring->ncolors,A->cmap->n,matcoloring->ncolors);
578: MatSetType(Bt_dense,MATSEQDENSE);
579: MatSeqDenseSetPreallocation(Bt_dense,PETSC_NULL);
580: Bt_dense->assembled = PETSC_TRUE;
581: multtrans->Bt_den = Bt_dense;
582:
583: MatCreate(PETSC_COMM_SELF,&C_dense);
584: MatSetSizes(C_dense,A->rmap->n,matcoloring->ncolors,A->rmap->n,matcoloring->ncolors);
585: MatSetType(C_dense,MATSEQDENSE);
586: MatSeqDenseSetPreallocation(C_dense,PETSC_NULL);
587: Bt_dense->assembled = PETSC_TRUE;
588: multtrans->ABt_den = C_dense;
589: PetscGetTime(&tf);
590: etime1 += tf - t0;
592: #if defined(PETSC_USE_INFO)
593: {
594: Mat_SeqAIJ *c=(Mat_SeqAIJ*)(*C)->data;
595: PetscInfo5(*C,"Bt_dense: %D,%D; Cnz %D / (cm*ncolors %D) = %g\n",A->cmap->n,matcoloring->ncolors,c->nz,A->rmap->n*matcoloring->ncolors,(PetscReal)(c->nz)/(A->rmap->n*matcoloring->ncolors));
596: PetscInfo5(*C,"Sym = GetColor %g + ColorCreate %g + MatDenseCreate %g + non-colorSym %g = %g\n",etime0,etime01,etime1,etime2,etime0+etime01+etime1+etime2);
597: }
598: #endif
599: }
600: /* clean up */
601: MatDestroy(&Bt);
602: MatRestoreSymbolicTranspose_SeqAIJ(B,&bti,&btj);
604:
605:
606: #if defined(INEFFICIENT_ALGORITHM)
607: /* The algorithm below computes am*bm sparse inner-product - inefficient! It will be deleted later. */
608: PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
609: Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
610: PetscInt *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*ci,*cj,*acol,*bcol;
611: PetscInt am=A->rmap->N,bm=B->rmap->N;
612: PetscInt i,j,anzi,bnzj,cnzi,nlnk,*lnk,nspacedouble=0,ka,kb,index[1];
613: MatScalar *ca;
614: PetscBT lnkbt;
615: PetscReal afill;
617: /* Allocate row pointer array ci */
618: PetscMalloc(((am+1)+1)*sizeof(PetscInt),&ci);
619: ci[0] = 0;
620:
621: /* Create and initialize a linked list for C columns */
622: nlnk = bm+1;
623: PetscLLCreate(bm,bm,nlnk,lnk,lnkbt);
625: /* Initial FreeSpace with size fill*(nnz(A)+nnz(B)) */
626: PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+bi[bm])),&free_space);
627: current_space = free_space;
629: /* Determine symbolic info for each row of the product A*B^T: */
630: for (i=0; i<am; i++) {
631: anzi = ai[i+1] - ai[i];
632: cnzi = 0;
633: acol = aj + ai[i];
634: for (j=0; j<bm; j++){
635: bnzj = bi[j+1] - bi[j];
636: bcol= bj + bi[j];
637: /* sparse inner-product c(i,j)=A[i,:]*B[j,:]^T */
638: ka = 0; kb = 0;
639: while (ka < anzi && kb < bnzj){
640: while (acol[ka] < bcol[kb] && ka < anzi) ka++;
641: if (ka == anzi) break;
642: while (acol[ka] > bcol[kb] && kb < bnzj) kb++;
643: if (kb == bnzj) break;
644: if (acol[ka] == bcol[kb]){ /* add nonzero c(i,j) to lnk */
645: index[0] = j;
646: PetscLLAdd(1,index,bm,nlnk,lnk,lnkbt);
647: cnzi++;
648: break;
649: }
650: }
651: }
652:
653: /* If free space is not available, make more free space */
654: /* Double the amount of total space in the list */
655: if (current_space->local_remaining<cnzi) {
656: PetscFreeSpaceGet(cnzi+current_space->total_array_size,¤t_space);
657: nspacedouble++;
658: }
660: /* Copy data into free space, then initialize lnk */
661: PetscLLClean(bm,bm,cnzi,lnk,current_space->array,lnkbt);
662: current_space->array += cnzi;
663: current_space->local_used += cnzi;
664: current_space->local_remaining -= cnzi;
665:
666: ci[i+1] = ci[i] + cnzi;
667: }
668:
670: /* Column indices are in the list of free space.
671: Allocate array cj, copy column indices to cj, and destroy list of free space */
672: PetscMalloc((ci[am]+1)*sizeof(PetscInt),&cj);
673: PetscFreeSpaceContiguous(&free_space,cj);
674: PetscLLDestroy(lnk,lnkbt);
675:
676: /* Allocate space for ca */
677: PetscMalloc((ci[am]+1)*sizeof(MatScalar),&ca);
678: PetscMemzero(ca,(ci[am]+1)*sizeof(MatScalar));
679:
680: /* put together the new symbolic matrix */
681: MatCreateSeqAIJWithArrays(((PetscObject)A)->comm,am,bm,ci,cj,ca,C);
682: (*C)->rmap->bs = A->cmap->bs;
683: (*C)->cmap->bs = B->cmap->bs;
685: /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
686: /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
687: c = (Mat_SeqAIJ *)((*C)->data);
688: c->free_a = PETSC_TRUE;
689: c->free_ij = PETSC_TRUE;
690: c->nonew = 0;
692: /* set MatInfo */
693: afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
694: if (afill < 1.0) afill = 1.0;
695: c->maxnz = ci[am];
696: c->nz = ci[am];
697: (*C)->info.mallocs = nspacedouble;
698: (*C)->info.fill_ratio_given = fill;
699: (*C)->info.fill_ratio_needed = afill;
701: #if defined(PETSC_USE_INFO)
702: if (ci[am]) {
703: PetscInfo3((*C),"Reallocs %D; Fill ratio: given %G needed %G.\n",nspacedouble,fill,afill);
704: PetscInfo1((*C),"Use MatMatTransposeMult(A,B,MatReuse,%G,&C) for best performance.;\n",afill);
705: } else {
706: PetscInfo((*C),"Empty matrix product\n");
707: }
708: #endif
709: #endif
710: return(0);
711: }
713: /* #define USE_ARRAY - for sparse dot product. Slower than !USE_ARRAY */
716: PetscErrorCode MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
717: {
719: Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c=(Mat_SeqAIJ*)C->data;
720: PetscInt *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,anzi,bnzj,nexta,nextb,*acol,*bcol,brow;
721: PetscInt cm=C->rmap->n,*ci=c->i,*cj=c->j,i,j,cnzi,*ccol;
722: PetscLogDouble flops=0.0;
723: MatScalar *aa=a->a,*aval,*ba=b->a,*bval,*ca,*cval;
724: Mat_MatMatTransMult *multtrans;
725: PetscContainer container;
726: #if defined(USE_ARRAY)
727: MatScalar *spdot;
728: #endif
731: PetscObjectQuery((PetscObject)C,"Mat_MatMatTransMult",(PetscObject *)&container);
732: if (!container) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Container does not exit");
733: PetscContainerGetPointer(container,(void **)&multtrans);
734: if (multtrans->usecoloring){
735: MatTransposeColoring matcoloring = multtrans->matcoloring;
736: Mat Bt_dense;
737: PetscInt m,n;
738: PetscLogDouble t0,tf,etime0=0.0,etime1=0.0,etime2=0.0;
739: Mat C_dense = multtrans->ABt_den;
741: Bt_dense = multtrans->Bt_den;
742: MatGetLocalSize(Bt_dense,&m,&n);
744: /* Get Bt_dense by Apply MatTransposeColoring to B */
745: PetscGetTime(&t0);
746: MatTransColoringApplySpToDen(matcoloring,B,Bt_dense);
747: PetscGetTime(&tf);
748: etime0 += tf - t0;
750: /* C_dense = A*Bt_dense */
751: PetscGetTime(&t0);
752: MatMatMultNumeric_SeqAIJ_SeqDense(A,Bt_dense,C_dense);
753: PetscGetTime(&tf);
754: etime2 += tf - t0;
755:
756: /* Recover C from C_dense */
757: PetscGetTime(&t0);
758: MatTransColoringApplyDenToSp(matcoloring,C_dense,C);
759: PetscGetTime(&tf);
760: etime1 += tf - t0;
761: #if defined(PETSC_USE_INFO)
762: PetscInfo4(C,"Num = ColoringApply: %g %g + Mult_sp_dense %g = %g\n",etime0,etime1,etime2,etime0+etime1+etime2);
763: #endif
764: return(0);
765: }
767: #if defined(USE_ARRAY)
768: /* allocate an array for implementing sparse inner-product */
769: PetscMalloc((A->cmap->n+1)*sizeof(MatScalar),&spdot);
770: PetscMemzero(spdot,(A->cmap->n+1)*sizeof(MatScalar));
771: #endif
773: /* clear old values in C */
774: if (!c->a){
775: PetscMalloc((ci[cm]+1)*sizeof(MatScalar),&ca);
776: c->a = ca;
777: c->free_a = PETSC_TRUE;
778: } else {
779: ca = c->a;
780: }
781: PetscMemzero(ca,ci[cm]*sizeof(MatScalar));
783: for (i=0; i<cm; i++) {
784: anzi = ai[i+1] - ai[i];
785: acol = aj + ai[i];
786: aval = aa + ai[i];
787: cnzi = ci[i+1] - ci[i];
788: ccol = cj + ci[i];
789: cval = ca + ci[i];
790: for (j=0; j<cnzi; j++){
791: brow = ccol[j];
792: bnzj = bi[brow+1] - bi[brow];
793: bcol = bj + bi[brow];
794: bval = ba + bi[brow];
796: /* perform sparse inner-product c(i,j)=A[i,:]*B[j,:]^T */
797: #if defined(USE_ARRAY)
798: /* put ba to spdot array */
799: for (nextb=0; nextb<bnzj; nextb++) spdot[bcol[nextb]] = bval[nextb];
800: /* c(i,j)=A[i,:]*B[j,:]^T */
801: for (nexta=0; nexta<anzi; nexta++){
802: cval[j] += spdot[acol[nexta]]*aval[nexta];
803: }
804: /* zero spdot array */
805: for (nextb=0; nextb<bnzj; nextb++) spdot[bcol[nextb]] = 0.0;
806: #else
807: nexta = 0; nextb = 0;
808: while (nexta<anzi && nextb<bnzj){
809: while (acol[nexta] < bcol[nextb] && nexta < anzi) nexta++;
810: if (nexta == anzi) break;
811: while (acol[nexta] > bcol[nextb] && nextb < bnzj) nextb++;
812: if (nextb == bnzj) break;
813: if (acol[nexta] == bcol[nextb]){
814: cval[j] += aval[nexta]*bval[nextb];
815: nexta++; nextb++;
816: flops += 2;
817: }
818: }
819: #endif
820: }
821: }
822: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
823: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
824: PetscLogFlops(flops);
825: #if defined(USE_ARRAY)
826: PetscFree(spdot);
827: #endif
828: return(0);
829: }
833: PetscErrorCode MatTransposeMatMult_SeqAIJ_SeqAIJ(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C) {
837: if (scall == MAT_INITIAL_MATRIX){
838: MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(A,B,fill,C);
839: }
840: MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ(A,B,*C);
841: return(0);
842: }
846: PetscErrorCode MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat *C)
847: {
849: Mat At;
850: PetscInt *ati,*atj;
853: /* create symbolic At */
854: MatGetSymbolicTranspose_SeqAIJ(A,&ati,&atj);
855: MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,A->cmap->n,A->rmap->n,ati,atj,PETSC_NULL,&At);
856: At->rmap->bs = A->cmap->bs;
857: At->cmap->bs = B->cmap->bs;
859: /* get symbolic C=At*B */
860: MatMatMultSymbolic_SeqAIJ_SeqAIJ(At,B,fill,C);
862: /* clean up */
863: MatDestroy(&At);
864: MatRestoreSymbolicTranspose_SeqAIJ(A,&ati,&atj);
865: return(0);
866: }
870: PetscErrorCode MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
871: {
873: Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c=(Mat_SeqAIJ*)C->data;
874: PetscInt am=A->rmap->n,anzi,*ai=a->i,*aj=a->j,*bi=b->i,*bj,bnzi,nextb;
875: PetscInt cm=C->rmap->n,*ci=c->i,*cj=c->j,crow,*cjj,i,j,k;
876: PetscLogDouble flops=0.0;
877: MatScalar *aa=a->a,*ba,*ca,*caj;
878:
880: if (!c->a){
881: PetscMalloc((ci[cm]+1)*sizeof(MatScalar),&ca);
882: c->a = ca;
883: c->free_a = PETSC_TRUE;
884: } else {
885: ca = c->a;
886: }
887: /* clear old values in C */
888: PetscMemzero(ca,ci[cm]*sizeof(MatScalar));
890: /* compute A^T*B using outer product (A^T)[:,i]*B[i,:] */
891: for (i=0;i<am;i++) {
892: bj = b->j + bi[i];
893: ba = b->a + bi[i];
894: bnzi = bi[i+1] - bi[i];
895: anzi = ai[i+1] - ai[i];
896: for (j=0; j<anzi; j++) {
897: nextb = 0;
898: crow = *aj++;
899: cjj = cj + ci[crow];
900: caj = ca + ci[crow];
901: /* perform sparse axpy operation. Note cjj includes bj. */
902: for (k=0; nextb<bnzi; k++) {
903: if (cjj[k] == *(bj+nextb)) { /* ccol == bcol */
904: caj[k] += (*aa)*(*(ba+nextb));
905: nextb++;
906: }
907: }
908: flops += 2*bnzi;
909: aa++;
910: }
911: }
913: /* Assemble the final matrix and clean up */
914: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
915: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
916: PetscLogFlops(flops);
917: return(0);
918: }
920: EXTERN_C_BEGIN
923: PetscErrorCode MatMatMult_SeqAIJ_SeqDense(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
924: {
928: if (scall == MAT_INITIAL_MATRIX){
929: MatMatMultSymbolic_SeqAIJ_SeqDense(A,B,fill,C);
930: }
931: MatMatMultNumeric_SeqAIJ_SeqDense(A,B,*C);
932: return(0);
933: }
934: EXTERN_C_END
938: PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat *C)
939: {
943: MatMatMultSymbolic_SeqDense_SeqDense(A,B,0.0,C);
944: (*C)->ops->matmult = MatMatMult_SeqAIJ_SeqDense;
945: return(0);
946: }
950: PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqDense(Mat A,Mat B,Mat C)
951: {
952: Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data;
954: PetscScalar *b,*c,r1,r2,r3,r4,*b1,*b2,*b3,*b4;
955: MatScalar *aa;
956: PetscInt cm=C->rmap->n, cn=B->cmap->n, bm=B->rmap->n, col, i,j,n,*aj, am = A->rmap->n;
957: PetscInt am2 = 2*am, am3 = 3*am, bm4 = 4*bm,colam;
960: if (!cm || !cn) return(0);
961: if (bm != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in A %D not equal rows in B %D\n",A->cmap->n,bm);
962: if (A->rmap->n != C->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number rows in C %D not equal rows in A %D\n",C->rmap->n,A->rmap->n);
963: if (B->cmap->n != C->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in B %D not equal columns in C %D\n",B->cmap->n,C->cmap->n);
964: MatGetArray(B,&b);
965: MatGetArray(C,&c);
966: b1 = b; b2 = b1 + bm; b3 = b2 + bm; b4 = b3 + bm;
967: for (col=0; col<cn-4; col += 4){ /* over columns of C */
968: colam = col*am;
969: for (i=0; i<am; i++) { /* over rows of C in those columns */
970: r1 = r2 = r3 = r4 = 0.0;
971: n = a->i[i+1] - a->i[i];
972: aj = a->j + a->i[i];
973: aa = a->a + a->i[i];
974: for (j=0; j<n; j++) {
975: r1 += (*aa)*b1[*aj];
976: r2 += (*aa)*b2[*aj];
977: r3 += (*aa)*b3[*aj];
978: r4 += (*aa++)*b4[*aj++];
979: }
980: c[colam + i] = r1;
981: c[colam + am + i] = r2;
982: c[colam + am2 + i] = r3;
983: c[colam + am3 + i] = r4;
984: }
985: b1 += bm4;
986: b2 += bm4;
987: b3 += bm4;
988: b4 += bm4;
989: }
990: for (;col<cn; col++){ /* over extra columns of C */
991: for (i=0; i<am; i++) { /* over rows of C in those columns */
992: r1 = 0.0;
993: n = a->i[i+1] - a->i[i];
994: aj = a->j + a->i[i];
995: aa = a->a + a->i[i];
997: for (j=0; j<n; j++) {
998: r1 += (*aa++)*b1[*aj++];
999: }
1000: c[col*am + i] = r1;
1001: }
1002: b1 += bm;
1003: }
1004: PetscLogFlops(cn*(2.0*a->nz));
1005: MatRestoreArray(B,&b);
1006: MatRestoreArray(C,&c);
1007: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
1008: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
1009: return(0);
1010: }
1012: /*
1013: Note very similar to MatMult_SeqAIJ(), should generate both codes from same base
1014: */
1017: PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat A,Mat B,Mat C)
1018: {
1019: Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data;
1021: PetscScalar *b,*c,r1,r2,r3,r4,*b1,*b2,*b3,*b4;
1022: MatScalar *aa;
1023: PetscInt cm=C->rmap->n, cn=B->cmap->n, bm=B->rmap->n, col, i,j,n,*aj, am = A->rmap->n,*ii,arm;
1024: PetscInt am2 = 2*am, am3 = 3*am, bm4 = 4*bm,colam,*ridx;
1027: if (!cm || !cn) return(0);
1028: MatGetArray(B,&b);
1029: MatGetArray(C,&c);
1030: b1 = b; b2 = b1 + bm; b3 = b2 + bm; b4 = b3 + bm;
1032: if (a->compressedrow.use){ /* use compressed row format */
1033: for (col=0; col<cn-4; col += 4){ /* over columns of C */
1034: colam = col*am;
1035: arm = a->compressedrow.nrows;
1036: ii = a->compressedrow.i;
1037: ridx = a->compressedrow.rindex;
1038: for (i=0; i<arm; i++) { /* over rows of C in those columns */
1039: r1 = r2 = r3 = r4 = 0.0;
1040: n = ii[i+1] - ii[i];
1041: aj = a->j + ii[i];
1042: aa = a->a + ii[i];
1043: for (j=0; j<n; j++) {
1044: r1 += (*aa)*b1[*aj];
1045: r2 += (*aa)*b2[*aj];
1046: r3 += (*aa)*b3[*aj];
1047: r4 += (*aa++)*b4[*aj++];
1048: }
1049: c[colam + ridx[i]] += r1;
1050: c[colam + am + ridx[i]] += r2;
1051: c[colam + am2 + ridx[i]] += r3;
1052: c[colam + am3 + ridx[i]] += r4;
1053: }
1054: b1 += bm4;
1055: b2 += bm4;
1056: b3 += bm4;
1057: b4 += bm4;
1058: }
1059: for (;col<cn; col++){ /* over extra columns of C */
1060: colam = col*am;
1061: arm = a->compressedrow.nrows;
1062: ii = a->compressedrow.i;
1063: ridx = a->compressedrow.rindex;
1064: for (i=0; i<arm; i++) { /* over rows of C in those columns */
1065: r1 = 0.0;
1066: n = ii[i+1] - ii[i];
1067: aj = a->j + ii[i];
1068: aa = a->a + ii[i];
1070: for (j=0; j<n; j++) {
1071: r1 += (*aa++)*b1[*aj++];
1072: }
1073: c[col*am + ridx[i]] += r1;
1074: }
1075: b1 += bm;
1076: }
1077: } else {
1078: for (col=0; col<cn-4; col += 4){ /* over columns of C */
1079: colam = col*am;
1080: for (i=0; i<am; i++) { /* over rows of C in those columns */
1081: r1 = r2 = r3 = r4 = 0.0;
1082: n = a->i[i+1] - a->i[i];
1083: aj = a->j + a->i[i];
1084: aa = a->a + a->i[i];
1085: for (j=0; j<n; j++) {
1086: r1 += (*aa)*b1[*aj];
1087: r2 += (*aa)*b2[*aj];
1088: r3 += (*aa)*b3[*aj];
1089: r4 += (*aa++)*b4[*aj++];
1090: }
1091: c[colam + i] += r1;
1092: c[colam + am + i] += r2;
1093: c[colam + am2 + i] += r3;
1094: c[colam + am3 + i] += r4;
1095: }
1096: b1 += bm4;
1097: b2 += bm4;
1098: b3 += bm4;
1099: b4 += bm4;
1100: }
1101: for (;col<cn; col++){ /* over extra columns of C */
1102: for (i=0; i<am; i++) { /* over rows of C in those columns */
1103: r1 = 0.0;
1104: n = a->i[i+1] - a->i[i];
1105: aj = a->j + a->i[i];
1106: aa = a->a + a->i[i];
1108: for (j=0; j<n; j++) {
1109: r1 += (*aa++)*b1[*aj++];
1110: }
1111: c[col*am + i] += r1;
1112: }
1113: b1 += bm;
1114: }
1115: }
1116: PetscLogFlops(cn*2.0*a->nz);
1117: MatRestoreArray(B,&b);
1118: MatRestoreArray(C,&c);
1119: return(0);
1120: }
1124: PetscErrorCode MatTransColoringApplySpToDen_SeqAIJ(MatTransposeColoring coloring,Mat B,Mat Btdense)
1125: {
1127: Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data;
1128: Mat_SeqDense *btdense = (Mat_SeqDense*)Btdense->data;
1129: PetscInt *bi=b->i,*bj=b->j;
1130: PetscInt m=Btdense->rmap->n,n=Btdense->cmap->n,j,k,l,col,anz,*btcol,brow,ncolumns;
1131: MatScalar *btval,*btval_den,*ba=b->a;
1132: PetscInt *columns=coloring->columns,*colorforcol=coloring->colorforcol,ncolors=coloring->ncolors;
1135: btval_den=btdense->v;
1136: PetscMemzero(btval_den,(m*n)*sizeof(MatScalar));
1137: for (k=0; k<ncolors; k++) {
1138: ncolumns = coloring->ncolumns[k];
1139: for (l=0; l<ncolumns; l++) { /* insert a row of B to a column of Btdense */
1140: col = *(columns + colorforcol[k] + l);
1141: btcol = bj + bi[col];
1142: btval = ba + bi[col];
1143: anz = bi[col+1] - bi[col];
1144: for (j=0; j<anz; j++){
1145: brow = btcol[j];
1146: btval_den[brow] = btval[j];
1147: }
1148: }
1149: btval_den += m;
1150: }
1151: return(0);
1152: }
1156: PetscErrorCode MatTransColoringApplyDenToSp_SeqAIJ(MatTransposeColoring matcoloring,Mat Cden,Mat Csp)
1157: {
1159: Mat_SeqAIJ *csp = (Mat_SeqAIJ*)Csp->data;
1160: PetscInt k,l,*row,*idx,m,ncolors=matcoloring->ncolors,nrows;
1161: PetscScalar *ca_den,*cp_den,*ca=csp->a;
1162: PetscInt *rows=matcoloring->rows,*spidx=matcoloring->columnsforspidx,*colorforrow=matcoloring->colorforrow;
1163:
1165: MatGetLocalSize(Csp,&m,PETSC_NULL);
1166: MatGetArray(Cden,&ca_den);
1167: cp_den = ca_den;
1168: for (k=0; k<ncolors; k++) {
1169: nrows = matcoloring->nrows[k];
1170: row = rows + colorforrow[k];
1171: idx = spidx + colorforrow[k];
1172: for (l=0; l<nrows; l++){
1173: ca[idx[l]] = cp_den[row[l]];
1174: }
1175: cp_den += m;
1176: }
1177: MatRestoreArray(Cden,&ca_den);
1178: return(0);
1179: }
1181: /*
1182: MatGetColumnIJ_SeqAIJ_Color() and MatRestoreColumnIJ_SeqAIJ_Color() are customized from
1183: MatGetColumnIJ_SeqAIJ() and MatRestoreColumnIJ_SeqAIJ() by adding an output
1184: spidx[], index of a->j, to be used for setting 'columnsforspidx' in MatTransposeColoringCreate_SeqAIJ().
1185: */
1188: PetscErrorCode MatGetColumnIJ_SeqAIJ_Color(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool inodecompressed,PetscInt *nn,PetscInt *ia[],PetscInt *ja[],PetscInt *spidx[],PetscBool *done)
1189: {
1190: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
1192: PetscInt i,*collengths,*cia,*cja,n = A->cmap->n,m = A->rmap->n;
1193: PetscInt nz = a->i[m],row,*jj,mr,col;
1194: PetscInt *cspidx;
1197: *nn = n;
1198: if (!ia) return(0);
1199: if (symmetric) {
1200: SETERRQ(((PetscObject)A)->comm,PETSC_ERR_SUP,"MatGetColumnIJ_SeqAIJ_Color() not supported for the case symmetric");
1201: MatToSymmetricIJ_SeqAIJ(A->rmap->n,a->i,a->j,0,oshift,ia,ja);
1202: } else {
1203: PetscMalloc((n+1)*sizeof(PetscInt),&collengths);
1204: PetscMemzero(collengths,n*sizeof(PetscInt));
1205: PetscMalloc((n+1)*sizeof(PetscInt),&cia);
1206: PetscMalloc((nz+1)*sizeof(PetscInt),&cja);
1207: PetscMalloc((nz+1)*sizeof(PetscInt),&cspidx);
1208: jj = a->j;
1209: for (i=0; i<nz; i++) {
1210: collengths[jj[i]]++;
1211: }
1212: cia[0] = oshift;
1213: for (i=0; i<n; i++) {
1214: cia[i+1] = cia[i] + collengths[i];
1215: }
1216: PetscMemzero(collengths,n*sizeof(PetscInt));
1217: jj = a->j;
1218: for (row=0; row<m; row++) {
1219: mr = a->i[row+1] - a->i[row];
1220: for (i=0; i<mr; i++) {
1221: col = *jj++;
1222: cspidx[cia[col] + collengths[col] - oshift] = a->i[row] + i; /* index of a->j */
1223: cja[cia[col] + collengths[col]++ - oshift] = row + oshift;
1224: }
1225: }
1226: PetscFree(collengths);
1227: *ia = cia; *ja = cja;
1228: *spidx = cspidx;
1229: }
1230: return(0);
1231: }
1235: PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Color(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool inodecompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscInt *spidx[],PetscBool *done)
1236: {
1240: if (!ia) return(0);
1242: PetscFree(*ia);
1243: PetscFree(*ja);
1244: PetscFree(*spidx);
1245: return(0);
1246: }
1250: PetscErrorCode MatTransposeColoringCreate_SeqAIJ(Mat mat,ISColoring iscoloring,MatTransposeColoring c)
1251: {
1253: PetscInt i,n,nrows,N,j,k,m,*row_idx,*ci,*cj,ncols,col,cm;
1254: const PetscInt *is;
1255: PetscInt nis = iscoloring->n,*rowhit,bs = 1;
1256: IS *isa;
1257: PetscBool flg1,flg2;
1258: Mat_SeqAIJ *csp = (Mat_SeqAIJ*)mat->data;
1259: PetscInt *colorforrow,*rows,*rows_i,*columnsforspidx,*columnsforspidx_i,*idxhit,*spidx;
1260: PetscInt *colorforcol,*columns,*columns_i;
1263: ISColoringGetIS(iscoloring,PETSC_IGNORE,&isa);
1264:
1265: /* this is ugly way to get blocksize but cannot call MatGetBlockSize() because AIJ can have bs > 1 */
1266: PetscObjectTypeCompare((PetscObject)mat,MATSEQBAIJ,&flg1);
1267: PetscObjectTypeCompare((PetscObject)mat,MATMPIBAIJ,&flg2);
1268: if (flg1 || flg2) {
1269: MatGetBlockSize(mat,&bs);
1270: }
1272: N = mat->cmap->N/bs;
1273: c->M = mat->rmap->N/bs; /* set total rows, columns and local rows */
1274: c->N = mat->cmap->N/bs;
1275: c->m = mat->rmap->N/bs;
1276: c->rstart = 0;
1278: c->ncolors = nis;
1279: PetscMalloc(nis*sizeof(PetscInt),&c->ncolumns);
1280: PetscMalloc(nis*sizeof(PetscInt),&c->nrows);
1281: PetscMalloc2(csp->nz+1,PetscInt,&rows,csp->nz+1,PetscInt,&columnsforspidx);
1282: PetscMalloc((nis+1)*sizeof(PetscInt),&colorforrow);
1283: colorforrow[0] = 0;
1284: rows_i = rows;
1285: columnsforspidx_i = columnsforspidx;
1287: PetscMalloc((nis+1)*sizeof(PetscInt),&colorforcol);
1288: PetscMalloc((N+1)*sizeof(PetscInt),&columns);
1289: colorforcol[0] = 0;
1290: columns_i = columns;
1291:
1292: MatGetColumnIJ_SeqAIJ_Color(mat,0,PETSC_FALSE,PETSC_FALSE,&ncols,&ci,&cj,&spidx,PETSC_NULL); /* column-wise storage of mat */
1294: cm = c->m;
1295: PetscMalloc((cm+1)*sizeof(PetscInt),&rowhit);
1296: PetscMalloc((cm+1)*sizeof(PetscInt),&idxhit);
1297: for (i=0; i<nis; i++) {
1298: ISGetLocalSize(isa[i],&n);
1299: ISGetIndices(isa[i],&is);
1300: c->ncolumns[i] = n;
1301: if (n) {
1302: PetscMemcpy(columns_i,is,n*sizeof(PetscInt));
1303: }
1304: colorforcol[i+1] = colorforcol[i] + n;
1305: columns_i += n;
1307: /* fast, crude version requires O(N*N) work */
1308: PetscMemzero(rowhit,cm*sizeof(PetscInt));
1309:
1310: /* loop over columns*/
1311: for (j=0; j<n; j++) {
1312: col = is[j];
1313: row_idx = cj + ci[col];
1314: m = ci[col+1] - ci[col];
1315: /* loop over columns marking them in rowhit */
1316: for (k=0; k<m; k++) {
1317: idxhit[*row_idx] = spidx[ci[col] + k];
1318: rowhit[*row_idx++] = col + 1;
1319: }
1320: }
1321: /* count the number of hits */
1322: nrows = 0;
1323: for (j=0; j<cm; j++) {
1324: if (rowhit[j]) nrows++;
1325: }
1326: c->nrows[i] = nrows;
1327: colorforrow[i+1] = colorforrow[i] + nrows;
1328:
1329: nrows = 0;
1330: for (j=0; j<cm; j++) {
1331: if (rowhit[j]) {
1332: rows_i[nrows] = j;
1333: columnsforspidx_i[nrows] = idxhit[j];
1334: nrows++;
1335: }
1336: }
1337: ISRestoreIndices(isa[i],&is);
1338: rows_i += nrows; columnsforspidx_i += nrows;
1339: }
1340: MatRestoreColumnIJ_SeqAIJ_Color(mat,0,PETSC_FALSE,PETSC_FALSE,&ncols,&ci,&cj,&spidx,PETSC_NULL);
1341: PetscFree(rowhit);
1342: ISColoringRestoreIS(iscoloring,&isa);
1343: #if defined(PETSC_USE_DEBUG)
1344: if (csp->nz != colorforrow[nis]) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"csp->nz %d != colorforrow[nis] %d",csp->nz,colorforrow[nis]);
1345: #endif
1346:
1347: c->colorforrow = colorforrow;
1348: c->rows = rows;
1349: c->columnsforspidx = columnsforspidx;
1350: c->colorforcol = colorforcol;
1351: c->columns = columns;
1352: PetscFree(idxhit);
1353: return(0);
1354: }