Actual source code: mpimatmatmult.c
petsc-3.7.0 2016-04-25
2: /*
3: Defines matrix-matrix product routines for pairs of MPIAIJ matrices
4: C = A * B
5: */
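/*
   Example (a minimal usage sketch, not part of this file): both operands are
   assembled MPIAIJ matrices on the same communicator; the algorithm can be
   chosen at run time with -matmatmult_via {scalable,nonscalable}.

     Mat A,B,C;
     MatMatMult(A,B,MAT_INITIAL_MATRIX,PETSC_DEFAULT,&C);  // symbolic + numeric product
     MatMatMult(A,B,MAT_REUSE_MATRIX,PETSC_DEFAULT,&C);    // reuses the symbolic data attached to C
     MatDestroy(&C);
*/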
6: #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
7: #include <../src/mat/utils/freespace.h>
8: #include <../src/mat/impls/aij/mpi/mpiaij.h>
9: #include <petscbt.h>
10: #include <../src/mat/impls/dense/mpi/mpidense.h>
11: #include <petsc/private/vecimpl.h>
15: PETSC_INTERN PetscErrorCode MatMatMult_MPIAIJ_MPIAIJ(Mat A,Mat B,MatReuse scall,PetscReal fill, Mat *C)
16: {
18: const char *algTypes[2] = {"scalable","nonscalable"};
19: PetscInt alg=1; /* set default algorithm */
20: MPI_Comm comm;
23: if (scall == MAT_INITIAL_MATRIX) {
24: PetscObjectGetComm((PetscObject)A,&comm);
25: if (A->cmap->rstart != B->rmap->rstart || A->cmap->rend != B->rmap->rend) SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, (%D, %D) != (%D,%D)",A->cmap->rstart,A->cmap->rend,B->rmap->rstart,B->rmap->rend);
27: PetscObjectOptionsBegin((PetscObject)A);
28: PetscOptionsEList("-matmatmult_via","Algorithmic approach","MatMatMult",algTypes,2,algTypes[1],&alg,NULL);
29: PetscOptionsEnd();
31: PetscLogEventBegin(MAT_MatMultSymbolic,A,B,0,0);
32: switch (alg) {
33: case 1:
34: MatMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(A,B,fill,C);
35: break;
36: default:
37: MatMatMultSymbolic_MPIAIJ_MPIAIJ(A,B,fill,C);
38: break;
39: }
40: PetscLogEventEnd(MAT_MatMultSymbolic,A,B,0,0);
41: }
42: PetscLogEventBegin(MAT_MatMultNumeric,A,B,0,0);
43: (*(*C)->ops->matmultnumeric)(A,B,*C);
44: PetscLogEventEnd(MAT_MatMultNumeric,A,B,0,0);
45: return(0);
46: }
50: PetscErrorCode MatDestroy_MPIAIJ_MatMatMult(Mat A)
51: {
53: Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
54: Mat_PtAPMPI *ptap = a->ptap;
57: PetscFree2(ptap->startsj_s,ptap->startsj_r);
58: PetscFree(ptap->bufa);
59: MatDestroy(&ptap->P_loc);
60: MatDestroy(&ptap->P_oth);
61: MatDestroy(&ptap->Pt);
62: PetscFree(ptap->api);
63: PetscFree(ptap->apj);
64: PetscFree(ptap->apa);
65: ptap->destroy(A);
66: PetscFree(ptap);
67: return(0);
68: }
72: PetscErrorCode MatDuplicate_MPIAIJ_MatMatMult(Mat A, MatDuplicateOption op, Mat *M)
73: {
75: Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
76: Mat_PtAPMPI *ptap = a->ptap;
79: (*ptap->duplicate)(A,op,M);
81: (*M)->ops->destroy = ptap->destroy; /* = MatDestroy_MPIAIJ, *M doesn't duplicate A's special structure! */
82: (*M)->ops->duplicate = ptap->duplicate; /* = MatDuplicate_MPIAIJ */
83: return(0);
84: }
88: PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable(Mat A,Mat P,Mat C)
89: {
91: Mat_MPIAIJ *a =(Mat_MPIAIJ*)A->data,*c=(Mat_MPIAIJ*)C->data;
92: Mat_SeqAIJ *ad =(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
93: Mat_SeqAIJ *cd =(Mat_SeqAIJ*)(c->A)->data,*co=(Mat_SeqAIJ*)(c->B)->data;
94: PetscScalar *cda=cd->a,*coa=co->a;
95: Mat_SeqAIJ *p_loc,*p_oth;
96: PetscScalar *apa,*ca;
97: PetscInt cm =C->rmap->n;
98: Mat_PtAPMPI *ptap=c->ptap;
99: PetscInt *api,*apj,*apJ,i,k;
100: PetscInt cstart=C->cmap->rstart;
101: PetscInt cdnz,conz,k0,k1;
102: MPI_Comm comm;
103: PetscMPIInt size;
106: PetscObjectGetComm((PetscObject)A,&comm);
107: MPI_Comm_size(comm,&size);
109: /* 1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
110: /*-----------------------------------------------------*/
111: /* update numerical values of P_oth and P_loc */
112: MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);
113: MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);
115: /* 2) compute numeric C_loc = A_loc*P = Ad*P_loc + Ao*P_oth */
116: /*----------------------------------------------------------*/
117: /* get data from symbolic products */
118: p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
119: p_oth = NULL;
120: if (size >1) {
121: p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
122: }
124: /* get apa for storing dense row A[i,:]*P */
125: apa = ptap->apa;
127: api = ptap->api;
128: apj = ptap->apj;
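/* Note: for each local row i, apa is a dense work array of global length P->cmap->N
   holding A[i,:]*P; below it is scattered into the three contiguous pieces of row i of C
   (off-diagonal columns < cstart, diagonal block, remaining off-diagonal columns) and
   zeroed again so it can be reused for the next row. */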
129: for (i=0; i<cm; i++) {
130: /* compute apa = A[i,:]*P */
131: AProw_nonscalable(i,ad,ao,p_loc,p_oth,apa);
133: /* set values in C */
134: apJ = apj + api[i];
135: cdnz = cd->i[i+1] - cd->i[i];
136: conz = co->i[i+1] - co->i[i];
138: /* 1st off-diagonal part of C */
139: ca = coa + co->i[i];
140: k = 0;
141: for (k0=0; k0<conz; k0++) {
142: if (apJ[k] >= cstart) break;
143: ca[k0] = apa[apJ[k]];
144: apa[apJ[k]] = 0.0;
145: k++;
146: }
148: /* diagonal part of C */
149: ca = cda + cd->i[i];
150: for (k1=0; k1<cdnz; k1++) {
151: ca[k1] = apa[apJ[k]];
152: apa[apJ[k]] = 0.0;
153: k++;
154: }
156: /* 2nd off-diagonal part of C */
157: ca = coa + co->i[i];
158: for (; k0<conz; k0++) {
159: ca[k0] = apa[apJ[k]];
160: apa[apJ[k]] = 0.0;
161: k++;
162: }
163: }
164: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
165: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
166: return(0);
167: }
171: PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(Mat A,Mat P,PetscReal fill,Mat *C)
172: {
173: PetscErrorCode ierr;
174: MPI_Comm comm;
175: PetscMPIInt size;
176: Mat Cmpi;
177: Mat_PtAPMPI *ptap;
178: PetscFreeSpaceList free_space=NULL,current_space=NULL;
179: Mat_MPIAIJ *a =(Mat_MPIAIJ*)A->data,*c;
180: Mat_SeqAIJ *ad =(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_loc,*p_oth;
181: PetscInt *pi_loc,*pj_loc,*pi_oth,*pj_oth,*dnz,*onz;
182: PetscInt *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,rstart=A->rmap->rstart;
183: PetscInt *lnk,i,pnz,row,*api,*apj,*Jptr,apnz,nspacedouble=0,j,nzi;
184: PetscInt am=A->rmap->n,pN=P->cmap->N,pn=P->cmap->n,pm=P->rmap->n,Crmax;
185: PetscBT lnkbt;
186: PetscScalar *apa;
187: PetscReal afill;
188: PetscTable ta;
191: PetscObjectGetComm((PetscObject)A,&comm);
192: MPI_Comm_size(comm,&size);
194: /* create struct Mat_PtAPMPI and attach it to C later */
195: PetscNew(&ptap);
197: /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
198: MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);
200: /* get P_loc by taking all local rows of P */
201: MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);
203: p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
204: pi_loc = p_loc->i; pj_loc = p_loc->j;
205: if (size > 1) {
206: p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
207: pi_oth = p_oth->i; pj_oth = p_oth->j;
208: } else {
209: p_oth = NULL;
210: pi_oth = NULL; pj_oth = NULL;
211: }
213: /* first, compute symbolic AP = A_loc*P = A_diag*P_loc + A_off*P_oth */
214: /*-------------------------------------------------------------------*/
215: PetscMalloc1(am+2,&api);
216: ptap->api = api;
217: api[0] = 0;
219: /* create and initialize a linked list */
220: Crmax = 6*(p_loc->rmax + (PetscInt)(1.e-2*pN));
221: if (Crmax > pN) Crmax = pN;
222: PetscTableCreate(Crmax,pN,&ta);
223: MatRowMergeMax_SeqAIJ(p_loc,ptap->P_loc->rmap->N,ta);
224: MatRowMergeMax_SeqAIJ(p_oth,ptap->P_oth->rmap->N,ta);
225: PetscTableGetCount(ta,&Crmax);
226: PetscTableDestroy(&ta);
228: PetscLLCondensedCreate(Crmax,pN,&lnk,&lnkbt);
230: /* Initial FreeSpace size is fill*(nnz(A)+nnz(P)) */
231: PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(adi[am],PetscIntSumTruncate(aoi[am],pi_loc[pm]))),&free_space);
232: current_space = free_space;
234: MatPreallocateInitialize(comm,am,pn,dnz,onz);
235: for (i=0; i<am; i++) {
236: /* diagonal portion of A */
237: nzi = adi[i+1] - adi[i];
238: for (j=0; j<nzi; j++) {
239: row = *adj++;
240: pnz = pi_loc[row+1] - pi_loc[row];
241: Jptr = pj_loc + pi_loc[row];
242: /* add non-zero cols of P into the sorted linked list lnk */
243: PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);
244: }
245: /* off-diagonal portion of A */
246: nzi = aoi[i+1] - aoi[i];
247: for (j=0; j<nzi; j++) {
248: row = *aoj++;
249: pnz = pi_oth[row+1] - pi_oth[row];
250: Jptr = pj_oth + pi_oth[row];
251: PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);
252: }
254: apnz = lnk[0];
255: api[i+1] = api[i] + apnz;
257: /* if free space is not available, double the total space in the list */
258: if (current_space->local_remaining<apnz) {
259: PetscFreeSpaceGet(PetscIntSumTruncate(apnz,current_space->total_array_size),&current_space);
260: nspacedouble++;
261: }
263: /* Copy data into free space, then initialize lnk */
264: PetscLLCondensedClean(pN,apnz,current_space->array,lnk,lnkbt);
265: MatPreallocateSet(i+rstart,apnz,current_space->array,dnz,onz);
267: current_space->array += apnz;
268: current_space->local_used += apnz;
269: current_space->local_remaining -= apnz;
270: }
272: /* Allocate space for apj, initialize apj, and */
273: /* destroy list of free space and other temporary array(s) */
274: PetscMalloc1(api[am]+1,&ptap->apj);
275: apj = ptap->apj;
276: PetscFreeSpaceContiguous(&free_space,ptap->apj);
277: PetscLLDestroy(lnk,lnkbt);
279: /* malloc apa to store dense row A[i,:]*P */
280: PetscCalloc1(pN,&apa);
282: ptap->apa = apa;
284: /* create and assemble symbolic parallel matrix Cmpi */
285: /*----------------------------------------------------*/
286: MatCreate(comm,&Cmpi);
287: MatSetSizes(Cmpi,am,pn,PETSC_DETERMINE,PETSC_DETERMINE);
288: MatSetBlockSizesFromMats(Cmpi,A,P);
290: MatSetType(Cmpi,MATMPIAIJ);
291: MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);
292: MatPreallocateFinalize(dnz,onz);
293: for (i=0; i<am; i++) {
294: row = i + rstart;
295: apnz = api[i+1] - api[i];
296: MatSetValues(Cmpi,1,&row,apnz,apj,apa,INSERT_VALUES);
297: apj += apnz;
298: }
299: MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);
300: MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);
302: ptap->destroy = Cmpi->ops->destroy;
303: ptap->duplicate = Cmpi->ops->duplicate;
304: Cmpi->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable;
305: Cmpi->ops->destroy = MatDestroy_MPIAIJ_MatMatMult;
306: Cmpi->ops->duplicate = MatDuplicate_MPIAIJ_MatMatMult;
308: /* attach the supporting struct to Cmpi for reuse */
309: c = (Mat_MPIAIJ*)Cmpi->data;
310: c->ptap = ptap;
312: *C = Cmpi;
314: /* set MatInfo */
315: afill = (PetscReal)api[am]/(adi[am]+aoi[am]+pi_loc[pm]+1) + 1.e-5;
316: if (afill < 1.0) afill = 1.0;
317: Cmpi->info.mallocs = nspacedouble;
318: Cmpi->info.fill_ratio_given = fill;
319: Cmpi->info.fill_ratio_needed = afill;
321: #if defined(PETSC_USE_INFO)
322: if (api[am]) {
323: PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)afill);
324: PetscInfo1(Cmpi,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.\n",(double)afill);
325: } else {
326: PetscInfo(Cmpi,"Empty matrix product\n");
327: }
328: #endif
329: return(0);
330: }
334: PETSC_INTERN PetscErrorCode MatMatMult_MPIAIJ_MPIDense(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
335: {
339: if (scall == MAT_INITIAL_MATRIX) {
340: PetscLogEventBegin(MAT_MatMultSymbolic,A,B,0,0);
341: MatMatMultSymbolic_MPIAIJ_MPIDense(A,B,fill,C);
342: PetscLogEventEnd(MAT_MatMultSymbolic,A,B,0,0);
343: }
344: PetscLogEventBegin(MAT_MatMultNumeric,A,B,0,0);
345: MatMatMultNumeric_MPIAIJ_MPIDense(A,B,*C);
346: PetscLogEventEnd(MAT_MatMultNumeric,A,B,0,0);
347: return(0);
348: }
350: typedef struct {
351: Mat workB;
352: PetscScalar *rvalues,*svalues;
353: MPI_Request *rwaits,*swaits;
354: } MPIAIJ_MPIDense;
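/* Work data attached to the product matrix via the "workB" container:
   workB            - sequential dense matrix holding the off-process rows of B needed locally
   rvalues, svalues - receive/send buffers used by MatMPIDenseScatter()
   rwaits, swaits   - matching MPI request arrays */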
358: PetscErrorCode MatMPIAIJ_MPIDenseDestroy(void *ctx)
359: {
360: MPIAIJ_MPIDense *contents = (MPIAIJ_MPIDense*) ctx;
361: PetscErrorCode ierr;
364: MatDestroy(&contents->workB);
365: PetscFree4(contents->rvalues,contents->svalues,contents->rwaits,contents->swaits);
366: PetscFree(contents);
367: return(0);
368: }
372: /*
373: This is a "dummy function" that handles the case where matrix C was created as a dense matrix
374: directly by the user and passed to MatMatMult() with the MAT_REUSE_MATRIX option
376: It is the same as MatMatMultSymbolic_MPIAIJ_MPIDense() except it does not create C.
377: */
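/*
   Rough sketch of that scenario (names and sizes are illustrative only):

     Mat C;
     MatCreateDense(comm,m,n,M,N,NULL,&C);              // user-created MPIDENSE result
     MatMatMult(A,B,MAT_REUSE_MATRIX,PETSC_DEFAULT,&C); // expected to reach this routine once;
                                                        // it builds the "workB" container and then
                                                        // calls MatMatMultNumeric_MPIAIJ_MPIDense()
*/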
378: PetscErrorCode MatMatMultNumeric_MPIDense(Mat A,Mat B,Mat C)
379: {
380: PetscErrorCode ierr;
381: PetscBool flg;
382: Mat_MPIAIJ *aij = (Mat_MPIAIJ*) A->data;
383: PetscInt nz = aij->B->cmap->n;
384: PetscContainer container;
385: MPIAIJ_MPIDense *contents;
386: VecScatter ctx = aij->Mvctx;
387: VecScatter_MPI_General *from = (VecScatter_MPI_General*) ctx->fromdata;
388: VecScatter_MPI_General *to = (VecScatter_MPI_General*) ctx->todata;
391: PetscObjectTypeCompare((PetscObject)B,MATMPIDENSE,&flg);
392: if (!flg) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Second matrix must be mpidense");
394: /* Handle the case where the user provided the final C matrix rather than calling MatMatMult() with MAT_INITIAL_MATRIX */
395: PetscObjectTypeCompare((PetscObject)A,MATMPIAIJ,&flg);
396: if (!flg) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"First matrix must be MPIAIJ");
398: C->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIDense;
400: PetscNew(&contents);
401: /* Create work matrix used to store off processor rows of B needed for local product */
402: MatCreateSeqDense(PETSC_COMM_SELF,nz,B->cmap->N,NULL,&contents->workB);
403: /* Create work arrays needed */
404: PetscMalloc4(B->cmap->N*from->starts[from->n],&contents->rvalues,
405: B->cmap->N*to->starts[to->n],&contents->svalues,
406: from->n,&contents->rwaits,
407: to->n,&contents->swaits);
409: PetscContainerCreate(PetscObjectComm((PetscObject)A),&container);
410: PetscContainerSetPointer(container,contents);
411: PetscContainerSetUserDestroy(container,MatMPIAIJ_MPIDenseDestroy);
412: PetscObjectCompose((PetscObject)C,"workB",(PetscObject)container);
413: PetscContainerDestroy(&container);
415: (*C->ops->matmultnumeric)(A,B,C);
416: return(0);
417: }
421: PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIDense(Mat A,Mat B,PetscReal fill,Mat *C)
422: {
423: PetscErrorCode ierr;
424: Mat_MPIAIJ *aij = (Mat_MPIAIJ*) A->data;
425: PetscInt nz = aij->B->cmap->n;
426: PetscContainer container;
427: MPIAIJ_MPIDense *contents;
428: VecScatter ctx = aij->Mvctx;
429: VecScatter_MPI_General *from = (VecScatter_MPI_General*) ctx->fromdata;
430: VecScatter_MPI_General *to = (VecScatter_MPI_General*) ctx->todata;
431: PetscInt m = A->rmap->n,n=B->cmap->n;
434: MatCreate(PetscObjectComm((PetscObject)B),C);
435: MatSetSizes(*C,m,n,A->rmap->N,B->cmap->N);
436: MatSetBlockSizesFromMats(*C,A,B);
437: MatSetType(*C,MATMPIDENSE);
438: MatMPIDenseSetPreallocation(*C,NULL);
439: MatAssemblyBegin(*C,MAT_FINAL_ASSEMBLY);
440: MatAssemblyEnd(*C,MAT_FINAL_ASSEMBLY);
442: (*C)->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIDense;
444: PetscNew(&contents);
445: /* Create work matrix used to store off processor rows of B needed for local product */
446: MatCreateSeqDense(PETSC_COMM_SELF,nz,B->cmap->N,NULL,&contents->workB);
447: /* Create work arrays needed */
448: PetscMalloc4(B->cmap->N*from->starts[from->n],&contents->rvalues,
449: B->cmap->N*to->starts[to->n],&contents->svalues,
450: from->n,&contents->rwaits,
451: to->n,&contents->swaits);
453: PetscContainerCreate(PetscObjectComm((PetscObject)A),&container);
454: PetscContainerSetPointer(container,contents);
455: PetscContainerSetUserDestroy(container,MatMPIAIJ_MPIDenseDestroy);
456: PetscObjectCompose((PetscObject)(*C),"workB",(PetscObject)container);
457: PetscContainerDestroy(&container);
458: return(0);
459: }
463: /*
464: Performs an efficient scatter on the rows of B needed by this process; this is
465: a modification of the VecScatterBegin_() routines.
466: */
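/* Note: PETSc dense matrices store their local arrays in column-major order, so the pack
   loop below gathers row sindices[] of B across all ncols columns with stride nrowsB
   (= B->rmap->n), and the unpack loop writes into workB with stride nrows (= aij->B->cmap->n). */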
467: PetscErrorCode MatMPIDenseScatter(Mat A,Mat B,Mat C,Mat *outworkB)
468: {
469: Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
470: PetscErrorCode ierr;
471: PetscScalar *b,*w,*svalues,*rvalues;
472: VecScatter ctx = aij->Mvctx;
473: VecScatter_MPI_General *from = (VecScatter_MPI_General*) ctx->fromdata;
474: VecScatter_MPI_General *to = (VecScatter_MPI_General*) ctx->todata;
475: PetscInt i,j,k;
476: PetscInt *sindices,*sstarts,*rindices,*rstarts;
477: PetscMPIInt *sprocs,*rprocs,nrecvs;
478: MPI_Request *swaits,*rwaits;
479: MPI_Comm comm;
480: PetscMPIInt tag = ((PetscObject)ctx)->tag,ncols = B->cmap->N, nrows = aij->B->cmap->n,imdex,nrowsB = B->rmap->n;
481: MPI_Status status;
482: MPIAIJ_MPIDense *contents;
483: PetscContainer container;
484: Mat workB;
487: PetscObjectGetComm((PetscObject)A,&comm);
488: PetscObjectQuery((PetscObject)C,"workB",(PetscObject*)&container);
489: if (!container) SETERRQ(comm,PETSC_ERR_PLIB,"Container does not exist");
490: PetscContainerGetPointer(container,(void**)&contents);
492: workB = *outworkB = contents->workB;
493: if (nrows != workB->rmap->n) SETERRQ2(comm,PETSC_ERR_PLIB,"Number of rows of workB %D not equal to columns of aij->B %D",workB->rmap->n,nrows);
494: sindices = to->indices;
495: sstarts = to->starts;
496: sprocs = to->procs;
497: swaits = contents->swaits;
498: svalues = contents->svalues;
500: rindices = from->indices;
501: rstarts = from->starts;
502: rprocs = from->procs;
503: rwaits = contents->rwaits;
504: rvalues = contents->rvalues;
506: MatDenseGetArray(B,&b);
507: MatDenseGetArray(workB,&w);
509: for (i=0; i<from->n; i++) {
510: MPI_Irecv(rvalues+ncols*rstarts[i],ncols*(rstarts[i+1]-rstarts[i]),MPIU_SCALAR,rprocs[i],tag,comm,rwaits+i);
511: }
513: for (i=0; i<to->n; i++) {
514: /* pack a message at a time */
515: for (j=0; j<sstarts[i+1]-sstarts[i]; j++) {
516: for (k=0; k<ncols; k++) {
517: svalues[ncols*(sstarts[i] + j) + k] = b[sindices[sstarts[i]+j] + nrowsB*k];
518: }
519: }
520: MPI_Isend(svalues+ncols*sstarts[i],ncols*(sstarts[i+1]-sstarts[i]),MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);
521: }
523: nrecvs = from->n;
524: while (nrecvs) {
525: MPI_Waitany(from->n,rwaits,&imdex,&status);
526: nrecvs--;
527: /* unpack a message at a time */
528: for (j=0; j<rstarts[imdex+1]-rstarts[imdex]; j++) {
529: for (k=0; k<ncols; k++) {
530: w[rindices[rstarts[imdex]+j] + nrows*k] = rvalues[ncols*(rstarts[imdex] + j) + k];
531: }
532: }
533: }
534: if (to->n) {MPI_Waitall(to->n,swaits,to->sstatus);}
536: MatDenseRestoreArray(B,&b);
537: MatDenseRestoreArray(workB,&w);
538: MatAssemblyBegin(workB,MAT_FINAL_ASSEMBLY);
539: MatAssemblyEnd(workB,MAT_FINAL_ASSEMBLY);
540: return(0);
541: }
542: extern PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat,Mat,Mat);
546: PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIDense(Mat A,Mat B,Mat C)
547: {
549: Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
550: Mat_MPIDense *bdense = (Mat_MPIDense*)B->data;
551: Mat_MPIDense *cdense = (Mat_MPIDense*)C->data;
552: Mat workB;
555: /* diagonal block of A times all local rows of B */
556: MatMatMultNumeric_SeqAIJ_SeqDense(aij->A,bdense->A,cdense->A);
558: /* get off processor parts of B needed to complete the product */
559: MatMPIDenseScatter(A,B,C,&workB);
561: /* off-diagonal block of A times nonlocal rows of B */
562: MatMatMultNumericAdd_SeqAIJ_SeqDense(aij->B,workB,cdense->A);
563: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
564: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
565: return(0);
566: }
570: PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIAIJ(Mat A,Mat P,Mat C)
571: {
573: Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data,*c=(Mat_MPIAIJ*)C->data;
574: Mat_SeqAIJ *ad = (Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
575: Mat_SeqAIJ *cd = (Mat_SeqAIJ*)(c->A)->data,*co=(Mat_SeqAIJ*)(c->B)->data;
576: PetscInt *adi = ad->i,*adj,*aoi=ao->i,*aoj;
577: PetscScalar *ada,*aoa,*cda=cd->a,*coa=co->a;
578: Mat_SeqAIJ *p_loc,*p_oth;
579: PetscInt *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pj;
580: PetscScalar *pa_loc,*pa_oth,*pa,valtmp,*ca;
581: PetscInt cm = C->rmap->n,anz,pnz;
582: Mat_PtAPMPI *ptap = c->ptap;
583: PetscScalar *apa_sparse = ptap->apa;
584: PetscInt *api,*apj,*apJ,i,j,k,row;
585: PetscInt cstart = C->cmap->rstart;
586: PetscInt cdnz,conz,k0,k1,nextp;
587: MPI_Comm comm;
588: PetscMPIInt size;
591: PetscObjectGetComm((PetscObject)A,&comm);
592: MPI_Comm_size(comm,&size);
594: /* 1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
595: /*-----------------------------------------------------*/
596: /* update numerical values of P_oth and P_loc */
597: MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);
598: MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);
600: /* 2) compute numeric C_loc = A_loc*P = Ad*P_loc + Ao*P_oth */
601: /*----------------------------------------------------------*/
602: /* get data from symbolic products */
603: p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
604: pi_loc = p_loc->i; pj_loc = p_loc->j; pa_loc = p_loc->a;
605: if (size >1) {
606: p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
607: pi_oth = p_oth->i; pj_oth = p_oth->j; pa_oth = p_oth->a;
608: } else {
609: p_oth = NULL; pi_oth = NULL; pj_oth = NULL; pa_oth = NULL;
610: }
612: api = ptap->api;
613: apj = ptap->apj;
614: for (i=0; i<cm; i++) {
615: apJ = apj + api[i];
617: /* diagonal portion of A */
618: anz = adi[i+1] - adi[i];
619: adj = ad->j + adi[i];
620: ada = ad->a + adi[i];
621: for (j=0; j<anz; j++) {
622: row = adj[j];
623: pnz = pi_loc[row+1] - pi_loc[row];
624: pj = pj_loc + pi_loc[row];
625: pa = pa_loc + pi_loc[row];
626: /* perform sparse axpy */
627: valtmp = ada[j];
628: nextp = 0;
629: for (k=0; nextp<pnz; k++) {
630: if (apJ[k] == pj[nextp]) { /* column of AP == column of P */
631: apa_sparse[k] += valtmp*pa[nextp++];
632: }
633: }
634: PetscLogFlops(2.0*pnz);
635: }
637: /* off-diagonal portion of A */
638: anz = aoi[i+1] - aoi[i];
639: aoj = ao->j + aoi[i];
640: aoa = ao->a + aoi[i];
641: for (j=0; j<anz; j++) {
642: row = aoj[j];
643: pnz = pi_oth[row+1] - pi_oth[row];
644: pj = pj_oth + pi_oth[row];
645: pa = pa_oth + pi_oth[row];
646: /* perform sparse axpy */
647: valtmp = aoa[j];
648: nextp = 0;
649: for (k=0; nextp<pnz; k++) {
650: if (apJ[k] == pj[nextp]) { /* column of AP == column of P */
651: apa_sparse[k] += valtmp*pa[nextp++];
652: }
653: }
654: PetscLogFlops(2.0*pnz);
655: }
657: /* set values in C */
658: cdnz = cd->i[i+1] - cd->i[i];
659: conz = co->i[i+1] - co->i[i];
661: /* 1st off-diagonal part of C */
662: ca = coa + co->i[i];
663: k = 0;
664: for (k0=0; k0<conz; k0++) {
665: if (apJ[k] >= cstart) break;
666: ca[k0] = apa_sparse[k];
667: apa_sparse[k] = 0.0;
668: k++;
669: }
671: /* diagonal part of C */
672: ca = cda + cd->i[i];
673: for (k1=0; k1<cdnz; k1++) {
674: ca[k1] = apa_sparse[k];
675: apa_sparse[k] = 0.0;
676: k++;
677: }
679: /* 2nd off-diagonal part of C */
680: ca = coa + co->i[i];
681: for (; k0<conz; k0++) {
682: ca[k0] = apa_sparse[k];
683: apa_sparse[k] = 0.0;
684: k++;
685: }
686: }
687: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
688: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
689: return(0);
690: }
692: /* Same as MatMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(), except it uses the scalable condensed linked list (PetscLLCondensedCreate_Scalable) to avoid the O(B->cmap->N) memory requirement */
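/* Sketch of the difference: the nonscalable pair above keeps a dense work row of global
   length P->cmap->N (PetscCalloc1(pN,&apa)), whereas this version sizes the work array and
   linked list by apnz_max, the largest number of nonzeros in any row of A*P, so memory use
   is proportional to the local product size rather than the global number of columns. */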
695: PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIAIJ(Mat A,Mat P,PetscReal fill,Mat *C)
696: {
697: PetscErrorCode ierr;
698: MPI_Comm comm;
699: PetscMPIInt size;
700: Mat Cmpi;
701: Mat_PtAPMPI *ptap;
702: PetscFreeSpaceList free_space = NULL,current_space=NULL;
703: Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data,*c;
704: Mat_SeqAIJ *ad = (Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_loc,*p_oth;
705: PetscInt *pi_loc,*pj_loc,*pi_oth,*pj_oth,*dnz,*onz;
706: PetscInt *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,rstart=A->rmap->rstart;
707: PetscInt i,pnz,row,*api,*apj,*Jptr,apnz,nspacedouble=0,j,nzi,*lnk,apnz_max;
708: PetscInt am=A->rmap->n,pN=P->cmap->N,pn=P->cmap->n,pm=P->rmap->n;
709: PetscReal afill;
710: PetscScalar *apa;
711: PetscTable ta;
714: PetscObjectGetComm((PetscObject)A,&comm);
715: MPI_Comm_size(comm,&size);
717: /* create struct Mat_PtAPMPI and attach it to C later */
718: PetscNew(&ptap);
720: /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
721: MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);
722:
723: /* get P_loc by taking all local rows of P */
724: MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);
726: p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
727: pi_loc = p_loc->i; pj_loc = p_loc->j;
728: if (size > 1) {
729: p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
730: pi_oth = p_oth->i; pj_oth = p_oth->j;
731: } else {
732: p_oth = NULL;
733: pi_oth = NULL; pj_oth = NULL;
734: }
736: /* first, compute symbolic AP = A_loc*P = A_diag*P_loc + A_off*P_oth */
737: /*-------------------------------------------------------------------*/
738: PetscMalloc1(am+2,&api);
739: ptap->api = api;
740: api[0] = 0;
742: /* create and initialize a linked list */
743: apnz_max = 6*(p_loc->rmax + (PetscInt)(1.e-2*pN)); /* expected apnz_max */
744: if (apnz_max > pN) apnz_max = pN;
745: PetscTableCreate(apnz_max,pN,&ta);
747: /* Calculate apnz_max */
748: apnz_max = 0;
749: for (i=0; i<am; i++) {
750: PetscTableRemoveAll(ta);
751: /* diagonal portion of A */
752: nzi = adi[i+1] - adi[i];
753: Jptr = adj+adi[i]; /* cols of A_diag */
754: MatMergeRows_SeqAIJ(p_loc,nzi,Jptr,ta);
755: PetscTableGetCount(ta,&apnz);
756: if (apnz_max < apnz) apnz_max = apnz;
758: /* off-diagonal portion of A */
759: nzi = aoi[i+1] - aoi[i];
760: Jptr = aoj+aoi[i]; /* cols of A_off */
761: MatMergeRows_SeqAIJ(p_oth,nzi,Jptr,ta);
762: PetscTableGetCount(ta,&apnz);
763: if (apnz_max < apnz) apnz_max = apnz;
764: }
765: PetscTableDestroy(&ta);
766:
767: PetscLLCondensedCreate_Scalable(apnz_max,&lnk);
769: /* Initial FreeSpace size is fill*(nnz(A)+nnz(P)) */
770: PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(adi[am],PetscIntSumTruncate(aoi[am],pi_loc[pm]))),&free_space);
771: current_space = free_space;
772: MatPreallocateInitialize(comm,am,pn,dnz,onz);
773: for (i=0; i<am; i++) {
774: /* diagonal portion of A */
775: nzi = adi[i+1] - adi[i];
776: for (j=0; j<nzi; j++) {
777: row = *adj++;
778: pnz = pi_loc[row+1] - pi_loc[row];
779: Jptr = pj_loc + pi_loc[row];
780: /* add non-zero cols of P into the sorted linked list lnk */
781: PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);
782: }
783: /* off-diagonal portion of A */
784: nzi = aoi[i+1] - aoi[i];
785: for (j=0; j<nzi; j++) {
786: row = *aoj++;
787: pnz = pi_oth[row+1] - pi_oth[row];
788: Jptr = pj_oth + pi_oth[row];
789: PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);
790: }
792: apnz = *lnk;
793: api[i+1] = api[i] + apnz;
795: /* if free space is not available, double the total space in the list */
796: if (current_space->local_remaining<apnz) {
797: PetscFreeSpaceGet(PetscIntSumTruncate(apnz,current_space->total_array_size),&current_space);
798: nspacedouble++;
799: }
801: /* Copy data into free space, then initialize lnk */
802: PetscLLCondensedClean_Scalable(apnz,current_space->array,lnk);
803: MatPreallocateSet(i+rstart,apnz,current_space->array,dnz,onz);
805: current_space->array += apnz;
806: current_space->local_used += apnz;
807: current_space->local_remaining -= apnz;
808: }
810: /* Allocate space for apj, initialize apj, and */
811: /* destroy list of free space and other temporary array(s) */
812: PetscMalloc1(api[am]+1,&ptap->apj);
813: apj = ptap->apj;
814: PetscFreeSpaceContiguous(&free_space,ptap->apj);
815: PetscLLCondensedDestroy_Scalable(lnk);
817: /* create and assemble symbolic parallel matrix Cmpi */
818: /*----------------------------------------------------*/
819: MatCreate(comm,&Cmpi);
820: MatSetSizes(Cmpi,am,pn,PETSC_DETERMINE,PETSC_DETERMINE);
821: MatSetBlockSizesFromMats(Cmpi,A,P);
822: MatSetType(Cmpi,MATMPIAIJ);
823: MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);
824: MatPreallocateFinalize(dnz,onz);
826: /* malloc apa for assembling Cmpi */
827: PetscCalloc1(apnz_max,&apa);
829: ptap->apa = apa;
830: for (i=0; i<am; i++) {
831: row = i + rstart;
832: apnz = api[i+1] - api[i];
833: MatSetValues(Cmpi,1,&row,apnz,apj,apa,INSERT_VALUES);
834: apj += apnz;
835: }
836: MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);
837: MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);
839: ptap->destroy = Cmpi->ops->destroy;
840: ptap->duplicate = Cmpi->ops->duplicate;
841: Cmpi->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIAIJ;
842: Cmpi->ops->destroy = MatDestroy_MPIAIJ_MatMatMult;
843: Cmpi->ops->duplicate = MatDuplicate_MPIAIJ_MatMatMult;
845: /* attach the supporting struct to Cmpi for reuse */
846: c = (Mat_MPIAIJ*)Cmpi->data;
847: c->ptap = ptap;
849: *C = Cmpi;
851: /* set MatInfo */
852: afill = (PetscReal)api[am]/(adi[am]+aoi[am]+pi_loc[pm]+1) + 1.e-5;
853: if (afill < 1.0) afill = 1.0;
854: Cmpi->info.mallocs = nspacedouble;
855: Cmpi->info.fill_ratio_given = fill;
856: Cmpi->info.fill_ratio_needed = afill;
858: #if defined(PETSC_USE_INFO)
859: if (api[am]) {
860: PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)afill);
861: PetscInfo1(Cmpi,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.\n",(double)afill);
862: } else {
863: PetscInfo(Cmpi,"Empty matrix product\n");
864: }
865: #endif
866: return(0);
867: }
869: /*-------------------------------------------------------------------------*/
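/*
   Example (a minimal usage sketch, not part of this file): C = P^T * A with both operands
   MPIAIJ; the algorithm is chosen at run time with
   -mattransposematmult_via {scalable,nonscalable,matmatmult}.

     MatTransposeMatMult(P,A,MAT_INITIAL_MATRIX,PETSC_DEFAULT,&C);
     MatTransposeMatMult(P,A,MAT_REUSE_MATRIX,PETSC_DEFAULT,&C);
*/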
872: PetscErrorCode MatTransposeMatMult_MPIAIJ_MPIAIJ(Mat P,Mat A,MatReuse scall,PetscReal fill,Mat *C)
873: {
875: const char *algTypes[3] = {"scalable","nonscalable","matmatmult"};
876: PetscInt alg=0; /* set default algorithm */
879: if (scall == MAT_INITIAL_MATRIX) {
880: PetscObjectOptionsBegin((PetscObject)A);
881: PetscOptionsEList("-mattransposematmult_via","Algorithmic approach","MatTransposeMatMult",algTypes,3,algTypes[0],&alg,NULL);
882: PetscOptionsEnd();
884: PetscLogEventBegin(MAT_TransposeMatMultSymbolic,P,A,0,0);
885: switch (alg) {
886: case 1:
887: MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(P,A,fill,C);
888: break;
889: case 2:
890: {
891: Mat Pt;
892: Mat_PtAPMPI *ptap;
893: Mat_MPIAIJ *c;
894: MatTranspose(P,MAT_INITIAL_MATRIX,&Pt);
895: MatMatMult(Pt,A,MAT_INITIAL_MATRIX,fill,C);
896: c = (Mat_MPIAIJ*)(*C)->data;
897: ptap = c->ptap;
898: ptap->Pt = Pt;
899: (*C)->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_matmatmult;
900: return(0);
901: }
902: break;
903: default:
904: MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ(P,A,fill,C);
905: break;
906: }
907: PetscLogEventEnd(MAT_TransposeMatMultSymbolic,P,A,0,0);
908: }
909: PetscLogEventBegin(MAT_TransposeMatMultNumeric,P,A,0,0);
910: (*(*C)->ops->mattransposemultnumeric)(P,A,*C);
911: PetscLogEventEnd(MAT_TransposeMatMultNumeric,P,A,0,0);
912: return(0);
913: }
915: /* This routine only works when scall=MAT_REUSE_MATRIX! */
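/* With -mattransposematmult_via matmatmult the symbolic phase above explicitly forms
   Pt = P^T, computes C = Pt*A with MatMatMult(), and stores Pt in ptap->Pt; each numeric
   call below only refreshes Pt with MatTranspose(P,MAT_REUSE_MATRIX,&Pt) and reruns
   MatMatMultNumeric(). */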
918: PetscErrorCode MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_matmatmult(Mat P,Mat A,Mat C)
919: {
921: Mat_MPIAIJ *c=(Mat_MPIAIJ*)C->data;
922: Mat_PtAPMPI *ptap= c->ptap;
923: Mat Pt=ptap->Pt;
926: MatTranspose(P,MAT_REUSE_MATRIX,&Pt);
927: MatMatMultNumeric(Pt,A,C);
928: return(0);
929: }
931: /* Non-scalable version, use dense axpy */
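/* Sketch of the kernel: for each local row i of A_loc, A[i,:] is scattered into the dense
   array aval of global length A->cmap->N (the non-scalable part); every nonzero P[i,j] then
   adds valtmp*aval[cj[k]] into the corresponding row of Co (off-diagonal part, sent to other
   ranks) or Cd (diagonal block). */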
934: PetscErrorCode MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable(Mat P,Mat A,Mat C)
935: {
936: PetscErrorCode ierr;
937: Mat_Merge_SeqsToMPI *merge;
938: Mat_MPIAIJ *p =(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
939: Mat_SeqAIJ *pd=(Mat_SeqAIJ*)(p->A)->data,*po=(Mat_SeqAIJ*)(p->B)->data;
940: Mat_PtAPMPI *ptap;
941: PetscInt *adj,*aJ;
942: PetscInt i,j,k,anz,pnz,row,*cj;
943: MatScalar *ada,*aval,*ca,valtmp;
944: PetscInt am =A->rmap->n,cm=C->rmap->n,pon=(p->B)->cmap->n;
945: MPI_Comm comm;
946: PetscMPIInt size,rank,taga,*len_s;
947: PetscInt *owners,proc,nrows,**buf_ri_k,**nextrow,**nextci;
948: PetscInt **buf_ri,**buf_rj;
949: PetscInt cnz=0,*bj_i,*bi,*bj,bnz,nextcj; /* bi,bj,ba: local array of C(mpi mat) */
950: MPI_Request *s_waits,*r_waits;
951: MPI_Status *status;
952: MatScalar **abuf_r,*ba_i,*pA,*coa,*ba;
953: PetscInt *ai,*aj,*coi,*coj;
954: PetscInt *poJ,*pdJ;
955: Mat A_loc;
956: Mat_SeqAIJ *a_loc;
959: PetscObjectGetComm((PetscObject)C,&comm);
960: MPI_Comm_size(comm,&size);
961: MPI_Comm_rank(comm,&rank);
963: ptap = c->ptap;
964: merge = ptap->merge;
966: /* 2) compute numeric C_seq = P_loc^T*A_loc*P - dominating part */
967: /*--------------------------------------------------------------*/
968: /* get data from symbolic products */
969: coi = merge->coi; coj = merge->coj;
970: PetscCalloc1(coi[pon]+1,&coa);
972: bi = merge->bi; bj = merge->bj;
973: owners = merge->rowmap->range;
974: PetscCalloc1(bi[cm]+1,&ba);
976: /* get A_loc by taking all local rows of A */
977: A_loc = ptap->A_loc;
978: MatMPIAIJGetLocalMat(A,MAT_REUSE_MATRIX,&A_loc);
979: a_loc = (Mat_SeqAIJ*)(A_loc)->data;
980: ai = a_loc->i;
981: aj = a_loc->j;
983: PetscCalloc1(A->cmap->N,&aval); /* non-scalable!!! */
985: for (i=0; i<am; i++) {
986: /* 2-a) put A[i,:] to dense array aval */
987: anz = ai[i+1] - ai[i];
988: adj = aj + ai[i];
989: ada = a_loc->a + ai[i];
990: for (j=0; j<anz; j++) {
991: aval[adj[j]] = ada[j];
992: }
994: /* 2-b) Compute Cseq = P_loc[i,:]^T*A[i,:] using outer product */
995: /*--------------------------------------------------------------*/
996: /* put the value into Co=(p->B)^T*A (off-diagonal part, send to others) */
997: pnz = po->i[i+1] - po->i[i];
998: poJ = po->j + po->i[i];
999: pA = po->a + po->i[i];
1000: for (j=0; j<pnz; j++) {
1001: row = poJ[j];
1002: cnz = coi[row+1] - coi[row];
1003: cj = coj + coi[row];
1004: ca = coa + coi[row];
1005: /* perform dense axpy */
1006: valtmp = pA[j];
1007: for (k=0; k<cnz; k++) {
1008: ca[k] += valtmp*aval[cj[k]];
1009: }
1010: PetscLogFlops(2.0*cnz);
1011: }
1013: /* put the value into Cd (diagonal part) */
1014: pnz = pd->i[i+1] - pd->i[i];
1015: pdJ = pd->j + pd->i[i];
1016: pA = pd->a + pd->i[i];
1017: for (j=0; j<pnz; j++) {
1018: row = pdJ[j];
1019: cnz = bi[row+1] - bi[row];
1020: cj = bj + bi[row];
1021: ca = ba + bi[row];
1022: /* perform dense axpy */
1023: valtmp = pA[j];
1024: for (k=0; k<cnz; k++) {
1025: ca[k] += valtmp*aval[cj[k]];
1026: }
1027: PetscLogFlops(2.0*cnz);
1028: }
1030: /* zero the current row of Pt*A */
1031: aJ = aj + ai[i];
1032: for (k=0; k<anz; k++) aval[aJ[k]] = 0.0;
1033: }
1035: /* 3) send and recv matrix values coa */
1036: /*------------------------------------*/
1037: buf_ri = merge->buf_ri;
1038: buf_rj = merge->buf_rj;
1039: len_s = merge->len_s;
1040: PetscCommGetNewTag(comm,&taga);
1041: PetscPostIrecvScalar(comm,taga,merge->nrecv,merge->id_r,merge->len_r,&abuf_r,&r_waits);
1043: PetscMalloc2(merge->nsend+1,&s_waits,size,&status);
1044: for (proc=0,k=0; proc<size; proc++) {
1045: if (!len_s[proc]) continue;
1046: i = merge->owners_co[proc];
1047: MPI_Isend(coa+coi[i],len_s[proc],MPIU_MATSCALAR,proc,taga,comm,s_waits+k);
1048: k++;
1049: }
1050: if (merge->nrecv) {MPI_Waitall(merge->nrecv,r_waits,status);}
1051: if (merge->nsend) {MPI_Waitall(merge->nsend,s_waits,status);}
1053: PetscFree2(s_waits,status);
1054: PetscFree(r_waits);
1055: PetscFree(coa);
1057: /* 4) insert local Cseq and received values into Cmpi */
1058: /*----------------------------------------------------*/
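/* Each received i-structure buf_ri[k] is laid out as: [0] number of rows, [1:nrows] local
   row indices, [nrows+1:2*nrows+1] offsets into the matching j/value buffers (mirroring the
   outgoing-message format built in the symbolic routine). */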
1059: PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);
1060: for (k=0; k<merge->nrecv; k++) {
1061: buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
1062: nrows = *(buf_ri_k[k]);
1063: nextrow[k] = buf_ri_k[k]+1; /* next row number of k-th recved i-structure */
1064: nextci[k] = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure */
1065: }
1067: for (i=0; i<cm; i++) {
1068: row = owners[rank] + i; /* global row index of C_seq */
1069: bj_i = bj + bi[i]; /* col indices of the i-th row of C */
1070: ba_i = ba + bi[i];
1071: bnz = bi[i+1] - bi[i];
1072: /* add received vals into ba */
1073: for (k=0; k<merge->nrecv; k++) { /* k-th received message */
1074: /* i-th row */
1075: if (i == *nextrow[k]) {
1076: cnz = *(nextci[k]+1) - *nextci[k];
1077: cj = buf_rj[k] + *(nextci[k]);
1078: ca = abuf_r[k] + *(nextci[k]);
1079: nextcj = 0;
1080: for (j=0; nextcj<cnz; j++) {
1081: if (bj_i[j] == cj[nextcj]) { /* bcol == ccol */
1082: ba_i[j] += ca[nextcj++];
1083: }
1084: }
1085: nextrow[k]++; nextci[k]++;
1086: PetscLogFlops(2.0*cnz);
1087: }
1088: }
1089: MatSetValues(C,1,&row,bnz,bj_i,ba_i,INSERT_VALUES);
1090: }
1091: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
1092: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
1094: PetscFree(ba);
1095: PetscFree(abuf_r[0]);
1096: PetscFree(abuf_r);
1097: PetscFree3(buf_ri_k,nextrow,nextci);
1098: PetscFree(aval);
1099: return(0);
1100: }
1102: PetscErrorCode MatDuplicate_MPIAIJ_MatPtAP(Mat, MatDuplicateOption,Mat*);
1103: /* This routine is modified from MatPtAPSymbolic_MPIAIJ_MPIAIJ() */
1106: PetscErrorCode MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(Mat P,Mat A,PetscReal fill,Mat *C)
1107: {
1108: PetscErrorCode ierr;
1109: Mat Cmpi,A_loc,POt,PDt;
1110: Mat_PtAPMPI *ptap;
1111: PetscFreeSpaceList free_space=NULL,current_space=NULL;
1112: Mat_MPIAIJ *p =(Mat_MPIAIJ*)P->data,*c;
1113: PetscInt *pdti,*pdtj,*poti,*potj,*ptJ;
1114: PetscInt nnz;
1115: PetscInt *lnk,*owners_co,*coi,*coj,i,k,pnz,row;
1116: PetscInt am=A->rmap->n,pn=P->cmap->n;
1117: PetscBT lnkbt;
1118: MPI_Comm comm;
1119: PetscMPIInt size,rank,tagi,tagj,*len_si,*len_s,*len_ri;
1120: PetscInt **buf_rj,**buf_ri,**buf_ri_k;
1121: PetscInt len,proc,*dnz,*onz,*owners;
1122: PetscInt nzi,*bi,*bj;
1123: PetscInt nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
1124: MPI_Request *swaits,*rwaits;
1125: MPI_Status *sstatus,rstatus;
1126: Mat_Merge_SeqsToMPI *merge;
1127: PetscInt *ai,*aj,*Jptr,anz,*prmap=p->garray,pon,nspacedouble=0,j;
1128: PetscReal afill =1.0,afill_tmp;
1129: PetscInt rstart = P->cmap->rstart,rmax,aN=A->cmap->N;
1130: PetscScalar *vals;
1131: Mat_SeqAIJ *a_loc, *pdt,*pot;
1134: PetscObjectGetComm((PetscObject)A,&comm);
1135: /* check if matrix local sizes are compatible */
1136: if (A->rmap->rstart != P->rmap->rstart || A->rmap->rend != P->rmap->rend) SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, A (%D, %D) != P (%D,%D)",A->rmap->rstart,A->rmap->rend,P->rmap->rstart,P->rmap->rend);
1138: MPI_Comm_size(comm,&size);
1139: MPI_Comm_rank(comm,&rank);
1141: /* create struct Mat_PtAPMPI and attach it to C later */
1142: PetscNew(&ptap);
1144: /* get A_loc by taking all local rows of A */
1145: MatMPIAIJGetLocalMat(A,MAT_INITIAL_MATRIX,&A_loc);
1147: ptap->A_loc = A_loc;
1149: a_loc = (Mat_SeqAIJ*)(A_loc)->data;
1150: ai = a_loc->i;
1151: aj = a_loc->j;
1153: /* determine symbolic Co=(p->B)^T*A - send to others */
1154: /*----------------------------------------------------*/
1155: MatTransposeSymbolic_SeqAIJ(p->A,&PDt);
1156: pdt = (Mat_SeqAIJ*)PDt->data;
1157: pdti = pdt->i; pdtj = pdt->j;
1159: MatTransposeSymbolic_SeqAIJ(p->B,&POt);
1160: pot = (Mat_SeqAIJ*)POt->data;
1161: poti = pot->i; potj = pot->j;
1163: /* then, compute symbolic Co = (p->B)^T*A */
1164: pon = (p->B)->cmap->n; /* total num of rows to be sent to other processors >= (num of nonzero rows of C_seq) - pn */
1165: PetscMalloc1(pon+1,&coi);
1166: coi[0] = 0;
1168: /* set initial free space to be fill*(nnz(p->B) + nnz(A)) */
1169: nnz = PetscRealIntMultTruncate(fill,PetscIntSumTruncate(poti[pon],ai[am]));
1170: PetscFreeSpaceGet(nnz,&free_space);
1171: current_space = free_space;
1173: /* create and initialize a linked list */
1174: PetscLLCondensedCreate(aN,aN,&lnk,&lnkbt);
1176: for (i=0; i<pon; i++) {
1177: pnz = poti[i+1] - poti[i];
1178: ptJ = potj + poti[i];
1179: for (j=0; j<pnz; j++) {
1180: row = ptJ[j]; /* row of A_loc == col of Pot */
1181: anz = ai[row+1] - ai[row];
1182: Jptr = aj + ai[row];
1183: /* add non-zero cols of AP into the sorted linked list lnk */
1184: PetscLLCondensedAddSorted(anz,Jptr,lnk,lnkbt);
1185: }
1186: nnz = lnk[0];
1188: /* If free space is not available, double the total space in the list */
1189: if (current_space->local_remaining<nnz) {
1190: PetscFreeSpaceGet(PetscIntSumTruncate(nnz,current_space->total_array_size),&current_space);
1191: nspacedouble++;
1192: }
1194: /* Copy data into free space, and zero out denserows */
1195: PetscLLCondensedClean(aN,nnz,current_space->array,lnk,lnkbt);
1197: current_space->array += nnz;
1198: current_space->local_used += nnz;
1199: current_space->local_remaining -= nnz;
1201: coi[i+1] = coi[i] + nnz;
1202: }
1204: PetscMalloc1(coi[pon]+1,&coj);
1205: PetscFreeSpaceContiguous(&free_space,coj);
1207: afill_tmp = (PetscReal)coi[pon]/(poti[pon] + ai[am]+1);
1208: if (afill_tmp > afill) afill = afill_tmp;
1210: /* send j-array (coj) of Co to other processors */
1211: /*----------------------------------------------*/
1212: /* determine row ownership */
1213: PetscNew(&merge);
1214: PetscLayoutCreate(comm,&merge->rowmap);
1216: merge->rowmap->n = pn;
1217: merge->rowmap->bs = 1;
1219: PetscLayoutSetUp(merge->rowmap);
1220: owners = merge->rowmap->range;
1222: /* determine the number of messages to send, their lengths */
1223: PetscCalloc1(size,&len_si);
1224: PetscMalloc1(size,&merge->len_s);
1226: len_s = merge->len_s;
1227: merge->nsend = 0;
1229: PetscMalloc1(size+2,&owners_co);
1230: PetscMemzero(len_s,size*sizeof(PetscMPIInt));
1232: proc = 0;
1233: for (i=0; i<pon; i++) {
1234: while (prmap[i] >= owners[proc+1]) proc++;
1235: len_si[proc]++; /* num of rows in Co to be sent to [proc] */
1236: len_s[proc] += coi[i+1] - coi[i];
1237: }
1239: len = 0; /* max length of buf_si[] */
1240: owners_co[0] = 0;
1241: for (proc=0; proc<size; proc++) {
1242: owners_co[proc+1] = owners_co[proc] + len_si[proc];
1243: if (len_si[proc]) {
1244: merge->nsend++;
1245: len_si[proc] = 2*(len_si[proc] + 1);
1246: len += len_si[proc];
1247: }
1248: }
1250: /* determine the number and length of messages to receive for coi and coj */
1251: PetscGatherNumberOfMessages(comm,NULL,len_s,&merge->nrecv);
1252: PetscGatherMessageLengths2(comm,merge->nsend,merge->nrecv,len_s,len_si,&merge->id_r,&merge->len_r,&len_ri);
1254: /* post the Irecv and Isend of coj */
1255: PetscCommGetNewTag(comm,&tagj);
1256: PetscPostIrecvInt(comm,tagj,merge->nrecv,merge->id_r,merge->len_r,&buf_rj,&rwaits);
1257: PetscMalloc1(merge->nsend+1,&swaits);
1258: for (proc=0, k=0; proc<size; proc++) {
1259: if (!len_s[proc]) continue;
1260: i = owners_co[proc];
1261: MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);
1262: k++;
1263: }
1265: /* receives and sends of coj are complete */
1266: PetscMalloc1(size,&sstatus);
1267: for (i=0; i<merge->nrecv; i++) {
1268: PetscMPIInt icompleted;
1269: MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);
1270: }
1271: PetscFree(rwaits);
1272: if (merge->nsend) {MPI_Waitall(merge->nsend,swaits,sstatus);}
1274: /* send and recv coi */
1275: /*-------------------*/
1276: PetscCommGetNewTag(comm,&tagi);
1277: PetscPostIrecvInt(comm,tagi,merge->nrecv,merge->id_r,len_ri,&buf_ri,&rwaits);
1278: PetscMalloc1(len+1,&buf_s);
1279: buf_si = buf_s; /* points to the beginning of k-th msg to be sent */
1280: for (proc=0,k=0; proc<size; proc++) {
1281: if (!len_s[proc]) continue;
1282: /* form outgoing message for i-structure:
1283: buf_si[0]: nrows to be sent
1284: [1:nrows]: row index (global)
1285: [nrows+1:2*nrows+1]: i-structure index
1286: */
1287: /*-------------------------------------------*/
1288: nrows = len_si[proc]/2 - 1;
1289: buf_si_i = buf_si + nrows+1;
1290: buf_si[0] = nrows;
1291: buf_si_i[0] = 0;
1292: nrows = 0;
1293: for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
1294: nzi = coi[i+1] - coi[i];
1295: buf_si_i[nrows+1] = buf_si_i[nrows] + nzi; /* i-structure */
1296: buf_si[nrows+1] = prmap[i] -owners[proc]; /* local row index */
1297: nrows++;
1298: }
1299: MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);
1300: k++;
1301: buf_si += len_si[proc];
1302: }
1303: i = merge->nrecv;
1304: while (i--) {
1305: PetscMPIInt icompleted;
1306: MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);
1307: }
1308: PetscFree(rwaits);
1309: if (merge->nsend) {MPI_Waitall(merge->nsend,swaits,sstatus);}
1310: PetscFree(len_si);
1311: PetscFree(len_ri);
1312: PetscFree(swaits);
1313: PetscFree(sstatus);
1314: PetscFree(buf_s);
1316: /* compute the local portion of C (mpi mat) */
1317: /*------------------------------------------*/
1318: /* allocate bi array and free space for accumulating nonzero column info */
1319: PetscMalloc1(pn+1,&bi);
1320: bi[0] = 0;
1322: /* set initial free space to be fill*(nnz(P) + nnz(A)) */
1323: nnz = PetscRealIntMultTruncate(fill,PetscIntSumTruncate(pdti[pn],PetscIntSumTruncate(poti[pon],ai[am])));
1324: PetscFreeSpaceGet(nnz,&free_space);
1325: current_space = free_space;
1327: PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);
1328: for (k=0; k<merge->nrecv; k++) {
1329: buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
1330: nrows = *buf_ri_k[k];
1331: nextrow[k] = buf_ri_k[k] + 1; /* next row number of k-th recved i-structure */
1332: nextci[k] = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure */
1333: }
1335: MatPreallocateInitialize(comm,pn,A->cmap->n,dnz,onz);
1336: rmax = 0;
1337: for (i=0; i<pn; i++) {
1338: /* add pdt[i,:]*AP into lnk */
1339: pnz = pdti[i+1] - pdti[i];
1340: ptJ = pdtj + pdti[i];
1341: for (j=0; j<pnz; j++) {
1342: row = ptJ[j]; /* row of AP == col of Pt */
1343: anz = ai[row+1] - ai[row];
1344: Jptr = aj + ai[row];
1345: /* add non-zero cols of AP into the sorted linked list lnk */
1346: PetscLLCondensedAddSorted(anz,Jptr,lnk,lnkbt);
1347: }
1349: /* add received col data into lnk */
1350: for (k=0; k<merge->nrecv; k++) { /* k-th received message */
1351: if (i == *nextrow[k]) { /* i-th row */
1352: nzi = *(nextci[k]+1) - *nextci[k];
1353: Jptr = buf_rj[k] + *nextci[k];
1354: PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);
1355: nextrow[k]++; nextci[k]++;
1356: }
1357: }
1358: nnz = lnk[0];
1360: /* if free space is not available, make more free space */
1361: if (current_space->local_remaining<nnz) {
1362: PetscFreeSpaceGet(PetscIntSumTruncate(nnz,current_space->total_array_size),&current_space);
1363: nspacedouble++;
1364: }
1365: /* copy data into free space, then initialize lnk */
1366: PetscLLCondensedClean(aN,nnz,current_space->array,lnk,lnkbt);
1367: MatPreallocateSet(i+owners[rank],nnz,current_space->array,dnz,onz);
1369: current_space->array += nnz;
1370: current_space->local_used += nnz;
1371: current_space->local_remaining -= nnz;
1373: bi[i+1] = bi[i] + nnz;
1374: if (nnz > rmax) rmax = nnz;
1375: }
1376: PetscFree3(buf_ri_k,nextrow,nextci);
1378: PetscMalloc1(bi[pn]+1,&bj);
1379: PetscFreeSpaceContiguous(&free_space,bj);
1381: afill_tmp = (PetscReal)bi[pn]/(pdti[pn] + poti[pon] + ai[am]+1);
1382: if (afill_tmp > afill) afill = afill_tmp;
1383: PetscLLCondensedDestroy(lnk,lnkbt);
1384: MatDestroy(&POt);
1385: MatDestroy(&PDt);
1387: /* create symbolic parallel matrix Cmpi - why can it not be assembled in the numeric part? */
1388: /*----------------------------------------------------------------------------------*/
1389: PetscCalloc1(rmax+1,&vals);
1391: MatCreate(comm,&Cmpi);
1392: MatSetSizes(Cmpi,pn,A->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);
1393: MatSetBlockSizes(Cmpi,PetscAbs(P->cmap->bs),PetscAbs(A->cmap->bs));
1394: MatSetType(Cmpi,MATMPIAIJ);
1395: MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);
1396: MatPreallocateFinalize(dnz,onz);
1397: MatSetBlockSize(Cmpi,1);
1398: for (i=0; i<pn; i++) {
1399: row = i + rstart;
1400: nnz = bi[i+1] - bi[i];
1401: Jptr = bj + bi[i];
1402: MatSetValues(Cmpi,1,&row,nnz,Jptr,vals,INSERT_VALUES);
1403: }
1404: MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);
1405: MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);
1406: PetscFree(vals);
1408: merge->bi = bi;
1409: merge->bj = bj;
1410: merge->coi = coi;
1411: merge->coj = coj;
1412: merge->buf_ri = buf_ri;
1413: merge->buf_rj = buf_rj;
1414: merge->owners_co = owners_co;
1415: merge->destroy = Cmpi->ops->destroy;
1416: merge->duplicate = Cmpi->ops->duplicate;
1418: Cmpi->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable;
1419: Cmpi->ops->destroy = MatDestroy_MPIAIJ_PtAP;
1420: Cmpi->ops->duplicate = MatDuplicate_MPIAIJ_MatPtAP;
1422: /* attach the supporting struct to Cmpi for reuse */
1423: c = (Mat_MPIAIJ*)Cmpi->data;
1424: c->ptap = ptap;
1425: ptap->api = NULL;
1426: ptap->apj = NULL;
1427: ptap->merge = merge;
1429: *C = Cmpi;
1430: #if defined(PETSC_USE_INFO)
1431: if (bi[pn] != 0) {
1432: PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)afill);
1433: PetscInfo1(Cmpi,"Use MatTransposeMatMult(A,B,MatReuse,%g,&C) for best performance.\n",(double)afill);
1434: } else {
1435: PetscInfo(Cmpi,"Empty matrix product\n");
1436: }
1437: #endif
1438: return(0);
1439: }
1443: PetscErrorCode MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ(Mat P,Mat A,Mat C)
1444: {
1445: PetscErrorCode ierr;
1446: Mat_Merge_SeqsToMPI *merge;
1447: Mat_MPIAIJ *p =(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
1448: Mat_SeqAIJ *pd=(Mat_SeqAIJ*)(p->A)->data,*po=(Mat_SeqAIJ*)(p->B)->data;
1449: Mat_PtAPMPI *ptap;
1450: PetscInt *adj;
1451: PetscInt i,j,k,anz,pnz,row,*cj,nexta;
1452: MatScalar *ada,*ca,valtmp;
1453: PetscInt am =A->rmap->n,cm=C->rmap->n,pon=(p->B)->cmap->n;
1454: MPI_Comm comm;
1455: PetscMPIInt size,rank,taga,*len_s;
1456: PetscInt *owners,proc,nrows,**buf_ri_k,**nextrow,**nextci;
1457: PetscInt **buf_ri,**buf_rj;
1458: PetscInt cnz=0,*bj_i,*bi,*bj,bnz,nextcj; /* bi,bj,ba: local array of C(mpi mat) */
1459: MPI_Request *s_waits,*r_waits;
1460: MPI_Status *status;
1461: MatScalar **abuf_r,*ba_i,*pA,*coa,*ba;
1462: PetscInt *ai,*aj,*coi,*coj;
1463: PetscInt *poJ,*pdJ;
1464: Mat A_loc;
1465: Mat_SeqAIJ *a_loc;
1468: PetscObjectGetComm((PetscObject)C,&comm);
1469: MPI_Comm_size(comm,&size);
1470: MPI_Comm_rank(comm,&rank);
1472: ptap = c->ptap;
1473: merge = ptap->merge;
1475: /* 2) compute numeric C_seq = P_loc^T*A_loc */
1476: /*------------------------------------------*/
1477: /* get data from symbolic products */
1478: coi = merge->coi; coj = merge->coj;
1479: PetscCalloc1(coi[pon]+1,&coa);
1480: bi = merge->bi; bj = merge->bj;
1481: owners = merge->rowmap->range;
1482: PetscCalloc1(bi[cm]+1,&ba);
1484: /* get A_loc by taking all local rows of A */
1485: A_loc = ptap->A_loc;
1486: MatMPIAIJGetLocalMat(A,MAT_REUSE_MATRIX,&A_loc);
1487: a_loc = (Mat_SeqAIJ*)(A_loc)->data;
1488: ai = a_loc->i;
1489: aj = a_loc->j;
1491: for (i=0; i<am; i++) {
1492: anz = ai[i+1] - ai[i];
1493: adj = aj + ai[i];
1494: ada = a_loc->a + ai[i];
1496: /* 2-b) Compute Cseq = P_loc[i,:]^T*A[i,:] using outer product */
1497: /*-------------------------------------------------------------*/
1498: /* put the value into Co=(p->B)^T*A (off-diagonal part, send to others) */
1499: pnz = po->i[i+1] - po->i[i];
1500: poJ = po->j + po->i[i];
1501: pA = po->a + po->i[i];
1502: for (j=0; j<pnz; j++) {
1503: row = poJ[j];
1504: cj = coj + coi[row];
1505: ca = coa + coi[row];
1506: /* perform sparse axpy */
1507: nexta = 0;
1508: valtmp = pA[j];
1509: for (k=0; nexta<anz; k++) {
1510: if (cj[k] == adj[nexta]) {
1511: ca[k] += valtmp*ada[nexta];
1512: nexta++;
1513: }
1514: }
1515: PetscLogFlops(2.0*anz);
1516: }
1518: /* put the value into Cd (diagonal part) */
1519: pnz = pd->i[i+1] - pd->i[i];
1520: pdJ = pd->j + pd->i[i];
1521: pA = pd->a + pd->i[i];
1522: for (j=0; j<pnz; j++) {
1523: row = pdJ[j];
1524: cj = bj + bi[row];
1525: ca = ba + bi[row];
1526: /* perform sparse axpy */
1527: nexta = 0;
1528: valtmp = pA[j];
1529: for (k=0; nexta<anz; k++) {
1530: if (cj[k] == adj[nexta]) {
1531: ca[k] += valtmp*ada[nexta];
1532: nexta++;
1533: }
1534: }
1535: PetscLogFlops(2.0*anz);
1536: }
1537: }
1539: /* 3) send and recv matrix values coa */
1540: /*------------------------------------*/
1541: buf_ri = merge->buf_ri;
1542: buf_rj = merge->buf_rj;
1543: len_s = merge->len_s;
1544: PetscCommGetNewTag(comm,&taga);
1545: PetscPostIrecvScalar(comm,taga,merge->nrecv,merge->id_r,merge->len_r,&abuf_r,&r_waits);
1547: PetscMalloc2(merge->nsend+1,&s_waits,size,&status);
1548: for (proc=0,k=0; proc<size; proc++) {
1549: if (!len_s[proc]) continue;
1550: i = merge->owners_co[proc];
1551: MPI_Isend(coa+coi[i],len_s[proc],MPIU_MATSCALAR,proc,taga,comm,s_waits+k);
1552: k++;
1553: }
1554: if (merge->nrecv) {MPI_Waitall(merge->nrecv,r_waits,status);}
1555: if (merge->nsend) {MPI_Waitall(merge->nsend,s_waits,status);}
1557: PetscFree2(s_waits,status);
1558: PetscFree(r_waits);
1559: PetscFree(coa);
1561: /* 4) insert local Cseq and received values into Cmpi */
1562: /*----------------------------------------------------*/
1563: PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);
1564: for (k=0; k<merge->nrecv; k++) {
1565: buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
1566: nrows = *(buf_ri_k[k]);
1567: nextrow[k] = buf_ri_k[k]+1; /* next row number of k-th recved i-structure */
1568: nextci[k] = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure */
1569: }
1571: for (i=0; i<cm; i++) {
1572: row = owners[rank] + i; /* global row index of C_seq */
1573: bj_i = bj + bi[i]; /* col indices of the i-th row of C */
1574: ba_i = ba + bi[i];
1575: bnz = bi[i+1] - bi[i];
1576: /* add received vals into ba */
1577: for (k=0; k<merge->nrecv; k++) { /* k-th received message */
1578: /* i-th row */
1579: if (i == *nextrow[k]) {
1580: cnz = *(nextci[k]+1) - *nextci[k];
1581: cj = buf_rj[k] + *(nextci[k]);
1582: ca = abuf_r[k] + *(nextci[k]);
1583: nextcj = 0;
1584: for (j=0; nextcj<cnz; j++) {
1585: if (bj_i[j] == cj[nextcj]) { /* bcol == ccol */
1586: ba_i[j] += ca[nextcj++];
1587: }
1588: }
1589: nextrow[k]++; nextci[k]++;
1590: PetscLogFlops(2.0*cnz);
1591: }
1592: }
1593: MatSetValues(C,1,&row,bnz,bj_i,ba_i,INSERT_VALUES);
1594: }
1595: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
1596: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
1598: PetscFree(ba);
1599: PetscFree(abuf_r[0]);
1600: PetscFree(abuf_r);
1601: PetscFree3(buf_ri_k,nextrow,nextci);
1602: return(0);
1603: }
1605: PetscErrorCode MatDuplicate_MPIAIJ_MatPtAP(Mat, MatDuplicateOption,Mat*);
1606: /* This routine is modified from MatPtAPSymbolic_MPIAIJ_MPIAIJ();
1607: it differs from MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable() in using PetscLLCondensedCreate_Scalable() */
1610: PetscErrorCode MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ(Mat P,Mat A,PetscReal fill,Mat *C)
1611: {
1612: PetscErrorCode ierr;
1613: Mat Cmpi,A_loc,POt,PDt;
1614: Mat_PtAPMPI *ptap;
1615: PetscFreeSpaceList free_space=NULL,current_space=NULL;
1616: Mat_MPIAIJ *p =(Mat_MPIAIJ*)P->data,*c;
1617: PetscInt *pdti,*pdtj,*poti,*potj,*ptJ;
1618: PetscInt nnz;
1619: PetscInt *lnk,*owners_co,*coi,*coj,i,k,pnz,row;
1620: PetscInt am =A->rmap->n,pn=P->cmap->n;
1621: MPI_Comm comm;
1622: PetscMPIInt size,rank,tagi,tagj,*len_si,*len_s,*len_ri;
1623: PetscInt **buf_rj,**buf_ri,**buf_ri_k;
1624: PetscInt len,proc,*dnz,*onz,*owners;
1625: PetscInt nzi,*bi,*bj;
1626: PetscInt nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
1627: MPI_Request *swaits,*rwaits;
1628: MPI_Status *sstatus,rstatus;
1629: Mat_Merge_SeqsToMPI *merge;
1630: PetscInt *ai,*aj,*Jptr,anz,*prmap=p->garray,pon,nspacedouble=0,j;
1631: PetscReal afill =1.0,afill_tmp;
1632: PetscInt rstart = P->cmap->rstart,rmax,aN=A->cmap->N,Armax;
1633: PetscScalar *vals;
1634: Mat_SeqAIJ *a_loc,*pdt,*pot;
1635: PetscTable ta;
1638: PetscObjectGetComm((PetscObject)A,&comm);
1639: /* check if matrix local sizes are compatible */
1640: if (A->rmap->rstart != P->rmap->rstart || A->rmap->rend != P->rmap->rend) SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, A (%D, %D) != P (%D,%D)",A->rmap->rstart,A->rmap->rend,P->rmap->rstart,P->rmap->rend);
1642: MPI_Comm_size(comm,&size);
1643: MPI_Comm_rank(comm,&rank);
1645: /* create struct Mat_PtAPMPI and attach it to C later */
1646: PetscNew(&ptap);
1648: /* get A_loc by taking all local rows of A */
1649: MatMPIAIJGetLocalMat(A,MAT_INITIAL_MATRIX,&A_loc);
1651: ptap->A_loc = A_loc;
1652: a_loc = (Mat_SeqAIJ*)(A_loc)->data;
1653: ai = a_loc->i;
1654: aj = a_loc->j;
1656: /* determine symbolic Co=(p->B)^T*A - send to others */
1657: /*----------------------------------------------------*/
1658: MatTransposeSymbolic_SeqAIJ(p->A,&PDt);
1659: pdt = (Mat_SeqAIJ*)PDt->data;
1660: pdti = pdt->i; pdtj = pdt->j;
1662: MatTransposeSymbolic_SeqAIJ(p->B,&POt);
1663: pot = (Mat_SeqAIJ*)POt->data;
1664: poti = pot->i; potj = pot->j;
1666: /* then, compute symbolic Co = (p->B)^T*A */
1667: pon = (p->B)->cmap->n; /* total num of rows to be sent to other processors
1668: >= (num of nonzero rows of C_seq) - pn */
1669: PetscMalloc1(pon+1,&coi);
1670: coi[0] = 0;
1672: /* set initial free space to be fill*(nnz(p->B) + nnz(A)) */
1673: nnz = PetscRealIntMultTruncate(fill,PetscIntSumTruncate(poti[pon],ai[am]));
1674: PetscFreeSpaceGet(nnz,&free_space);
1675: current_space = free_space;
1677: /* create and initialize a linked list */
1678: PetscTableCreate(2*a_loc->rmax,aN,&ta);
1679: MatRowMergeMax_SeqAIJ(a_loc,am,ta);
1680: PetscTableGetCount(ta,&Armax);
1681: PetscLLCondensedCreate_Scalable(Armax,&lnk);
1683: for (i=0; i<pon; i++) {
1684: pnz = poti[i+1] - poti[i];
1685: ptJ = potj + poti[i];
1686: for (j=0; j<pnz; j++) {
1687: row = ptJ[j]; /* row of A_loc == col of Pot */
1688: anz = ai[row+1] - ai[row];
1689: Jptr = aj + ai[row];
1690: /* add non-zero cols of A_loc[row,:] into the sorted linked list lnk */
1691: PetscLLCondensedAddSorted_Scalable(anz,Jptr,lnk);
1692: }
1693: nnz = lnk[0];
1695: /* If free space is not available, double the total space in the list */
1696: if (current_space->local_remaining<nnz) {
1697: PetscFreeSpaceGet(PetscIntSumTruncate(nnz,current_space->total_array_size),&current_space);
1698: nspacedouble++;
1699: }
1701: /* Copy data into free space, then initialize lnk */
1702: PetscLLCondensedClean_Scalable(nnz,current_space->array,lnk);
1704: current_space->array += nnz;
1705: current_space->local_used += nnz;
1706: current_space->local_remaining -= nnz;
1708: coi[i+1] = coi[i] + nnz;
1709: }
1711: PetscMalloc1(coi[pon]+1,&coj);
1712: PetscFreeSpaceContiguous(&free_space,coj);
1713: PetscLLCondensedDestroy_Scalable(lnk); /* must destroy to get a new one for C */
1715: afill_tmp = (PetscReal)coi[pon]/(poti[pon] + ai[am]+1);
1716: if (afill_tmp > afill) afill = afill_tmp;
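      /* Worked example with hypothetical numbers: if Co ends up with coi[pon] = 1500
         nonzeros while nnz(Pot) + nnz(A_loc) = 1000, then afill_tmp = 1.5, and a later
         symbolic call could pass fill >= 1.5 to avoid the reallocations counted by
         nspacedouble. */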
1718: /* send j-array (coj) of Co to other processors */
1719: /*----------------------------------------------*/
1720: /* determine row ownership */
1721: PetscNew(&merge);
1722: PetscLayoutCreate(comm,&merge->rowmap);
1724: merge->rowmap->n = pn;
1725: merge->rowmap->bs = 1;
1727: PetscLayoutSetUp(merge->rowmap);
1728: owners = merge->rowmap->range;
1730: /* determine the number of messages to send, their lengths */
1731: PetscCalloc1(size,&len_si);
1732: PetscMalloc1(size,&merge->len_s);
1734: len_s = merge->len_s;
1735: merge->nsend = 0;
1737: PetscMalloc1(size+2,&owners_co);
1738: PetscMemzero(len_s,size*sizeof(PetscMPIInt));
1740: proc = 0;
1741: for (i=0; i<pon; i++) {
1742: while (prmap[i] >= owners[proc+1]) proc++;
1743: len_si[proc]++; /* num of rows in Co to be sent to [proc] */
1744: len_s[proc] += coi[i+1] - coi[i];
1745: }
1747: len = 0; /* max length of buf_si[] */
1748: owners_co[0] = 0;
1749: for (proc=0; proc<size; proc++) {
1750: owners_co[proc+1] = owners_co[proc] + len_si[proc];
1751: if (len_si[proc]) {
1752: merge->nsend++;
1753: len_si[proc] = 2*(len_si[proc] + 1);
1754: len += len_si[proc];
1755: }
1756: }
1758: /* determine the number and length of messages to receive for coi and coj */
1759: PetscGatherNumberOfMessages(comm,NULL,len_s,&merge->nrecv);
1760: PetscGatherMessageLengths2(comm,merge->nsend,merge->nrecv,len_s,len_si,&merge->id_r,&merge->len_r,&len_ri);
1762: /* post the Irecv and Isend of coj */
1763: PetscCommGetNewTag(comm,&tagj);
1764: PetscPostIrecvInt(comm,tagj,merge->nrecv,merge->id_r,merge->len_r,&buf_rj,&rwaits);
1765: PetscMalloc1(merge->nsend+1,&swaits);
1766: for (proc=0, k=0; proc<size; proc++) {
1767: if (!len_s[proc]) continue;
1768: i = owners_co[proc];
1769: MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);
1770: k++;
1771: }
1773: /* receives and sends of coj are complete */
1774: PetscMalloc1(size,&sstatus);
1775: for (i=0; i<merge->nrecv; i++) {
1776: PetscMPIInt icompleted;
1777: MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);
1778: }
1779: PetscFree(rwaits);
1780: if (merge->nsend) {MPI_Waitall(merge->nsend,swaits,sstatus);}
1782: /* add received column indices into table to update Armax */
1783: for (k=0; k<merge->nrecv; k++) {/* k-th received message */
1784: Jptr = buf_rj[k];
1785: for (j=0; j<merge->len_r[k]; j++) {
1786: PetscTableAdd(ta,*(Jptr+j)+1,1,INSERT_VALUES);
1787: }
1788: }
1789: PetscTableGetCount(ta,&Armax);
1791: /* send and recv coi */
1792: /*-------------------*/
1793: PetscCommGetNewTag(comm,&tagi);
1794: PetscPostIrecvInt(comm,tagi,merge->nrecv,merge->id_r,len_ri,&buf_ri,&rwaits);
1795: PetscMalloc1(len+1,&buf_s);
1796: buf_si = buf_s; /* points to the beginning of k-th msg to be sent */
1797: for (proc=0,k=0; proc<size; proc++) {
1798: if (!len_s[proc]) continue;
1799: /* form outgoing message for i-structure:
1800: buf_si[0]: nrows to be sent
1801: [1:nrows]: row index of Co, shifted by the receiver's row offset (i.e. receiver-local)
1802: [nrows+1:2*nrows+1]: i-structure index
1803: */
1804: /*-------------------------------------------*/
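      /* Illustration with hypothetical counts: sending nrows = 2 rows holding 3 and 5
         nonzeros gives len_si[proc] = 2*(2+1) = 6 and
             buf_si = [2, r0, r1, 0, 3, 8]
         where r0,r1 are the receiver-local row indices and 0,3,8 are the offsets the
         receiver uses to split its segment of coj. */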
1805: nrows = len_si[proc]/2 - 1;
1806: buf_si_i = buf_si + nrows+1;
1807: buf_si[0] = nrows;
1808: buf_si_i[0] = 0;
1809: nrows = 0;
1810: for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
1811: nzi = coi[i+1] - coi[i];
1812: buf_si_i[nrows+1] = buf_si_i[nrows] + nzi; /* i-structure */
1813: buf_si[nrows+1] = prmap[i] -owners[proc]; /* local row index */
1814: nrows++;
1815: }
1816: MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);
1817: k++;
1818: buf_si += len_si[proc];
1819: }
1820: i = merge->nrecv;
1821: while (i--) {
1822: PetscMPIInt icompleted;
1823: MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);
1824: }
1825: PetscFree(rwaits);
1826: if (merge->nsend) {MPI_Waitall(merge->nsend,swaits,sstatus);}
1827: PetscFree(len_si);
1828: PetscFree(len_ri);
1829: PetscFree(swaits);
1830: PetscFree(sstatus);
1831: PetscFree(buf_s);
1833: /* compute the local portion of C (mpi mat) */
1834: /*------------------------------------------*/
1835: /* allocate bi array and free space for accumulating nonzero column info */
1836: PetscMalloc1(pn+1,&bi);
1837: bi[0] = 0;
1839: /* set initial free space to be fill*(nnz(P) + nnz(A)) */
1840: nnz = PetscRealIntMultTruncate(fill,PetscIntSumTruncate(pdti[pn],PetscIntSumTruncate(poti[pon],ai[am])));
1841: PetscFreeSpaceGet(nnz,&free_space);
1842: current_space = free_space;
1844: PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);
1845: for (k=0; k<merge->nrecv; k++) {
1846: buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
1847: nrows = *buf_ri_k[k];
1848: nextrow[k] = buf_ri_k[k] + 1; /* next row number of k-th recved i-structure */
1849: nextci[k] = buf_ri_k[k] + (nrows + 1); /* points to the i-structure offsets of the k-th received message */
1850: }
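      /* While sweeping the owned rows i = 0,...,pn-1 below, nextrow[k] and nextci[k]
         advance through the k-th received i-structure in lock step: whenever
         *nextrow[k] == i, the corresponding slice of buf_rj[k] is merged into the
         linked list for row i. */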
1852: PetscLLCondensedCreate_Scalable(Armax,&lnk);
1853: MatPreallocateInitialize(comm,pn,A->cmap->n,dnz,onz);
1854: rmax = 0;
1855: for (i=0; i<pn; i++) {
1856: /* add pdt[i,:]*A into lnk */
1857: pnz = pdti[i+1] - pdti[i];
1858: ptJ = pdtj + pdti[i];
1859: for (j=0; j<pnz; j++) {
1860: row = ptJ[j]; /* row of A_loc == col of Pdt */
1861: anz = ai[row+1] - ai[row];
1862: Jptr = aj + ai[row];
1863: /* add non-zero cols of A_loc[row,:] into the sorted linked list lnk */
1864: PetscLLCondensedAddSorted_Scalable(anz,Jptr,lnk);
1865: }
1867: /* add received col data into lnk */
1868: for (k=0; k<merge->nrecv; k++) { /* k-th received message */
1869: if (i == *nextrow[k]) { /* i-th row */
1870: nzi = *(nextci[k]+1) - *nextci[k];
1871: Jptr = buf_rj[k] + *nextci[k];
1872: PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk);
1873: nextrow[k]++; nextci[k]++;
1874: }
1875: }
1876: nnz = lnk[0];
1878: /* if free space is not available, make more free space */
1879: if (current_space->local_remaining<nnz) {
1880: PetscFreeSpaceGet(PetscIntSumTruncate(nnz,current_space->total_array_size),&current_space);
1881: nspacedouble++;
1882: }
1883: /* copy data into free space, then initialize lnk */
1884: PetscLLCondensedClean_Scalable(nnz,current_space->array,lnk);
1885: MatPreallocateSet(i+owners[rank],nnz,current_space->array,dnz,onz);
1887: current_space->array += nnz;
1888: current_space->local_used += nnz;
1889: current_space->local_remaining -= nnz;
1891: bi[i+1] = bi[i] + nnz;
1892: if (nnz > rmax) rmax = nnz;
1893: }
1894: PetscFree3(buf_ri_k,nextrow,nextci);
1896: PetscMalloc1(bi[pn]+1,&bj);
1897: PetscFreeSpaceContiguous(&free_space,bj);
1898: afill_tmp = (PetscReal)bi[pn]/(pdti[pn] + poti[pon] + ai[am]+1);
1899: if (afill_tmp > afill) afill = afill_tmp;
1900: PetscLLCondensedDestroy_Scalable(lnk);
1901: PetscTableDestroy(&ta);
1903: MatDestroy(&POt);
1904: MatDestroy(&PDt);
1906: /* create symbolic parallel matrix Cmpi - why can this not be assembled in the numeric part? */
1907: /*----------------------------------------------------------------------------------*/
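      /* The nonzero pattern of Cmpi is fixed here by inserting explicit zeros (vals is
         calloc'ed below), so the numeric routine only has to fill values into an
         already assembled parallel matrix. */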
1908: PetscCalloc1(rmax+1,&vals);
1910: MatCreate(comm,&Cmpi);
1911: MatSetSizes(Cmpi,pn,A->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);
1912: MatSetBlockSizes(Cmpi,PetscAbs(P->cmap->bs),PetscAbs(A->cmap->bs));
1913: MatSetType(Cmpi,MATMPIAIJ);
1914: MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);
1915: MatPreallocateFinalize(dnz,onz);
1916: MatSetBlockSize(Cmpi,1);
1917: for (i=0; i<pn; i++) {
1918: row = i + rstart;
1919: nnz = bi[i+1] - bi[i];
1920: Jptr = bj + bi[i];
1921: MatSetValues(Cmpi,1,&row,nnz,Jptr,vals,INSERT_VALUES);
1922: }
1923: MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);
1924: MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);
1925: PetscFree(vals);
1927: merge->bi = bi;
1928: merge->bj = bj;
1929: merge->coi = coi;
1930: merge->coj = coj;
1931: merge->buf_ri = buf_ri;
1932: merge->buf_rj = buf_rj;
1933: merge->owners_co = owners_co;
1934: merge->destroy = Cmpi->ops->destroy;
1935: merge->duplicate = Cmpi->ops->duplicate;
1937: Cmpi->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ;
1938: Cmpi->ops->destroy = MatDestroy_MPIAIJ_PtAP;
1939: Cmpi->ops->duplicate = MatDuplicate_MPIAIJ_MatPtAP;
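      /* The original destroy/duplicate methods were stashed in merge above, so that
         MatDestroy_MPIAIJ_PtAP and MatDuplicate_MPIAIJ_MatPtAP can release (or skip
         copying) the attached merge/ptap data and then fall back to the standard
         MPIAIJ behaviour. */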
1941: /* attach the supporting struct to Cmpi for reuse */
1942: c = (Mat_MPIAIJ*)Cmpi->data;
1944: c->ptap = ptap;
1945: ptap->api = NULL;
1946: ptap->apj = NULL;
1947: ptap->merge = merge;
1948: ptap->apa = NULL;
1950: *C = Cmpi;
1951: #if defined(PETSC_USE_INFO)
1952: if (bi[pn] != 0) {
1953: PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)afill);
1954: PetscInfo1(Cmpi,"Use MatTransposeMatMult(A,B,MatReuse,%g,&C) for best performance.\n",(double)afill);
1955: } else {
1956: PetscInfo(Cmpi,"Empty matrix product\n");
1957: }
1958: #endif
1959: return(0);
1960: }
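A minimal usage sketch (editorial addition, not part of mpimatmatmult.c): with two MPIAIJ matrices, MatTransposeMatMult() called with MAT_INITIAL_MATRIX runs the symbolic routine above once, and later calls with MAT_REUSE_MATRIX reuse the attached ptap/merge structures. The names FormPtA, P, A and the fill value 2.0 are placeholders.

#include <petscmat.h>

/* Sketch: C = P^T * A for MPIAIJ matrices; the first call builds the symbolic
   structure, subsequent calls with the same nonzero pattern reuse it. */
static PetscErrorCode FormPtA(Mat P,Mat A,Mat *C)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* initial call: symbolic + numeric product; 2.0 is only a guessed fill estimate */
  ierr = MatTransposeMatMult(P,A,MAT_INITIAL_MATRIX,2.0,C);CHKERRQ(ierr);
  /* reuse: only the numeric phase is executed; no fill estimate is needed here */
  ierr = MatTransposeMatMult(P,A,MAT_REUSE_MATRIX,PETSC_DEFAULT,C);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}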