Actual source code: mpimatmatmult.c
petsc-3.12.5 2020-03-29
2: /*
3: Defines matrix-matrix product routines for pairs of MPIAIJ matrices
4: C = A * B
5: */
6: #include <../src/mat/impls/aij/seq/aij.h>
7: #include <../src/mat/utils/freespace.h>
8: #include <../src/mat/impls/aij/mpi/mpiaij.h>
9: #include <petscbt.h>
10: #include <../src/mat/impls/dense/mpi/mpidense.h>
11: #include <petsc/private/vecimpl.h>
12: #include <petsc/private/vecscatterimpl.h>
14: #if defined(PETSC_HAVE_HYPRE)
15: PETSC_INTERN PetscErrorCode MatMatMultSymbolic_AIJ_AIJ_wHYPRE(Mat,Mat,PetscReal,Mat*);
16: #endif
18: PETSC_INTERN PetscErrorCode MatMatMult_MPIAIJ_MPIAIJ(Mat A,Mat B,MatReuse scall,PetscReal fill, Mat *C)
19: {
21: #if defined(PETSC_HAVE_HYPRE)
22: const char *algTypes[4] = {"scalable","nonscalable","seqmpi","hypre"};
23: PetscInt nalg = 4;
24: #else
25: const char *algTypes[3] = {"scalable","nonscalable","seqmpi"};
26: PetscInt nalg = 3;
27: #endif
28: PetscInt alg = 1; /* set nonscalable algorithm as default */
29: MPI_Comm comm;
30: PetscBool flg;
33: if (scall == MAT_INITIAL_MATRIX) {
34: PetscObjectGetComm((PetscObject)A,&comm);
35: if (A->cmap->rstart != B->rmap->rstart || A->cmap->rend != B->rmap->rend) SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, (%D, %D) != (%D,%D)",A->cmap->rstart,A->cmap->rend,B->rmap->rstart,B->rmap->rend);
37: PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"MatMatMult","Mat");
38: PetscOptionsEList("-matmatmult_via","Algorithmic approach","MatMatMult",algTypes,nalg,algTypes[1],&alg,&flg);
39: PetscOptionsEnd();
41: if (!flg && B->cmap->N > 100000) { /* may switch to scalable algorithm as default */
42: MatInfo Ainfo,Binfo;
43: PetscInt nz_local;
44: PetscBool alg_scalable_loc=PETSC_FALSE,alg_scalable;
46: MatGetInfo(A,MAT_LOCAL,&Ainfo);
47: MatGetInfo(B,MAT_LOCAL,&Binfo);
48: nz_local = (PetscInt)(Ainfo.nz_allocated + Binfo.nz_allocated);
50: if (B->cmap->N > fill*nz_local) alg_scalable_loc = PETSC_TRUE;
51: MPIU_Allreduce(&alg_scalable_loc,&alg_scalable,1,MPIU_BOOL,MPI_LOR,comm);
53: if (alg_scalable) {
54: alg = 0; /* the scalable algorithm can be ~50% slower than the nonscalable algorithm */
55: PetscInfo2(B,"Use scalable algorithm, BN %D, fill*nz_allocated %g\n",B->cmap->N,fill*nz_local);
56: }
57: }
59: PetscLogEventBegin(MAT_MatMultSymbolic,A,B,0,0);
60: switch (alg) {
61: case 1:
62: MatMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(A,B,fill,C);
63: break;
64: case 2:
65: MatMatMultSymbolic_MPIAIJ_MPIAIJ_seqMPI(A,B,fill,C);
66: break;
67: #if defined(PETSC_HAVE_HYPRE)
68: case 3:
69: MatMatMultSymbolic_AIJ_AIJ_wHYPRE(A,B,fill,C);
70: break;
71: #endif
72: default:
73: MatMatMultSymbolic_MPIAIJ_MPIAIJ(A,B,fill,C);
74: break;
75: }
76: PetscLogEventEnd(MAT_MatMultSymbolic,A,B,0,0);
78: if (alg == 0 || alg == 1) {
79: Mat_MPIAIJ *c = (Mat_MPIAIJ*)(*C)->data;
80: Mat_APMPI *ap = c->ap;
81: PetscOptionsBegin(PetscObjectComm((PetscObject)(*C)),((PetscObject)(*C))->prefix,"MatFreeIntermediateDataStructures","Mat");
82: ap->freestruct = PETSC_FALSE;
83: PetscOptionsBool("-mat_freeintermediatedatastructures","Free intermediate data structures", "MatFreeIntermediateDataStructures",ap->freestruct,&ap->freestruct, NULL);
84: PetscOptionsEnd();
85: }
86: }
88: PetscLogEventBegin(MAT_MatMultNumeric,A,B,0,0);
89: (*(*C)->ops->matmultnumeric)(A,B,*C);
90: PetscLogEventEnd(MAT_MatMultNumeric,A,B,0,0);
91: return(0);
92: }
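/*
   Illustrative usage sketch (hedged; assumes a standard PETSc build, error checking omitted):

     Mat A,B,C;                                                  both MPIAIJ; A's column layout must match B's row layout
     PetscOptionsSetValue(NULL,"-matmatmult_via","scalable");    or "nonscalable","seqmpi","hypre"
     MatMatMult(A,B,MAT_INITIAL_MATRIX,2.0,&C);                  fill estimate 2.0; symbolic + numeric
     MatMatMult(A,B,MAT_REUSE_MATRIX,PETSC_DEFAULT,&C);          reuse the symbolic data after changing values of A,B
     MatDestroy(&C);

   With MAT_INITIAL_MATRIX the dispatcher above defaults to "nonscalable" and may switch
   to "scalable" when B->cmap->N is large relative to fill*nnz.
*/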
94: PetscErrorCode MatDestroy_MPIAIJ_MatMatMult(Mat A)
95: {
97: Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
98: Mat_APMPI *ptap = a->ap;
101: PetscFree2(ptap->startsj_s,ptap->startsj_r);
102: PetscFree(ptap->bufa);
103: MatDestroy(&ptap->P_loc);
104: MatDestroy(&ptap->P_oth);
105: MatDestroy(&ptap->Pt);
106: PetscFree(ptap->api);
107: PetscFree(ptap->apj);
108: PetscFree(ptap->apa);
109: ptap->destroy(A);
110: PetscFree(ptap);
111: return(0);
112: }
114: PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable(Mat A,Mat P,Mat C)
115: {
117: Mat_MPIAIJ *a =(Mat_MPIAIJ*)A->data,*c=(Mat_MPIAIJ*)C->data;
118: Mat_SeqAIJ *ad =(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
119: Mat_SeqAIJ *cd =(Mat_SeqAIJ*)(c->A)->data,*co=(Mat_SeqAIJ*)(c->B)->data;
120: PetscScalar *cda=cd->a,*coa=co->a;
121: Mat_SeqAIJ *p_loc,*p_oth;
122: PetscScalar *apa,*ca;
123: PetscInt cm =C->rmap->n;
124: Mat_APMPI *ptap=c->ap;
125: PetscInt *api,*apj,*apJ,i,k;
126: PetscInt cstart=C->cmap->rstart;
127: PetscInt cdnz,conz,k0,k1;
128: MPI_Comm comm;
129: PetscMPIInt size;
132: PetscObjectGetComm((PetscObject)A,&comm);
133: MPI_Comm_size(comm,&size);
135: if (!ptap->P_oth && size>1) SETERRQ(comm,PETSC_ERR_ARG_WRONGSTATE,"AP cannot be reused. Do not call MatFreeIntermediateDataStructures() or use '-mat_freeintermediatedatastructures'");
137: /* 1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
138: /*-----------------------------------------------------*/
139: /* update numerical values of P_oth and P_loc */
140: MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);
141: MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);
143: /* 2) compute numeric C_loc = A_loc*P = Ad*P_loc + Ao*P_oth */
144: /*----------------------------------------------------------*/
145: /* get data from symbolic products */
146: p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
147: p_oth = NULL;
148: if (size >1) {
149: p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
150: }
152: /* get apa for storing dense row A[i,:]*P */
153: apa = ptap->apa;
155: api = ptap->api;
156: apj = ptap->apj;
157: for (i=0; i<cm; i++) {
158: /* compute apa = A[i,:]*P */
159: AProw_nonscalable(i,ad,ao,p_loc,p_oth,apa);
161: /* set values in C */
162: apJ = apj + api[i];
163: cdnz = cd->i[i+1] - cd->i[i];
164: conz = co->i[i+1] - co->i[i];
166: /* 1st off-diagonal part of C */
167: ca = coa + co->i[i];
168: k = 0;
169: for (k0=0; k0<conz; k0++) {
170: if (apJ[k] >= cstart) break;
171: ca[k0] = apa[apJ[k]];
172: apa[apJ[k++]] = 0.0;
173: }
175: /* diagonal part of C */
176: ca = cda + cd->i[i];
177: for (k1=0; k1<cdnz; k1++) {
178: ca[k1] = apa[apJ[k]];
179: apa[apJ[k++]] = 0.0;
180: }
182: /* 2nd off-diagonal part of C */
183: ca = coa + co->i[i];
184: for (; k0<conz; k0++) {
185: ca[k0] = apa[apJ[k]];
186: apa[apJ[k++]] = 0.0;
187: }
188: }
189: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
190: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
192: if (ptap->freestruct) {
193: MatFreeIntermediateDataStructures(C);
194: }
195: return(0);
196: }
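/*
   A minimal sketch of the dense-accumulator kernel used above (hedged; ai/aj/aa and
   pi/pj/pa are hypothetical CSR arrays for A and P, ca the values of C stored in the
   api/apj pattern; the real code splits A, P and C into diagonal and off-diagonal blocks):

     for (j = ai[i]; j < ai[i+1]; j++) {              accumulate row i of C = A*P
       PetscInt    col = aj[j];                       column of A = row of P
       PetscScalar v   = aa[j];
       for (k = pi[col]; k < pi[col+1]; k++) apa[pj[k]] += v*pa[k];
     }
     for (k = api[i]; k < api[i+1]; k++) {            scatter out and reset
       ca[k]       = apa[apj[k]];
       apa[apj[k]] = 0.0;
     }

   Only the entries listed in apj are reset, so the length-pN work array apa stays zero
   between rows without an O(pN) clear.
*/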
198: PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(Mat A,Mat P,PetscReal fill,Mat *C)
199: {
200: PetscErrorCode ierr;
201: MPI_Comm comm;
202: PetscMPIInt size;
203: Mat Cmpi;
204: Mat_APMPI *ptap;
205: PetscFreeSpaceList free_space=NULL,current_space=NULL;
206: Mat_MPIAIJ *a =(Mat_MPIAIJ*)A->data,*c;
207: Mat_SeqAIJ *ad =(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_loc,*p_oth;
208: PetscInt *pi_loc,*pj_loc,*pi_oth,*pj_oth,*dnz,*onz;
209: PetscInt *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,rstart=A->rmap->rstart;
210: PetscInt *lnk,i,pnz,row,*api,*apj,*Jptr,apnz,nspacedouble=0,j,nzi;
211: PetscInt am=A->rmap->n,pN=P->cmap->N,pn=P->cmap->n,pm=P->rmap->n;
212: PetscBT lnkbt;
213: PetscReal afill;
214: MatType mtype;
217: PetscObjectGetComm((PetscObject)A,&comm);
218: MPI_Comm_size(comm,&size);
220: /* create struct Mat_APMPI and attach it to C later */
221: PetscNew(&ptap);
223: /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
224: MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);
226: /* get P_loc by taking all local rows of P */
227: MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);
229: p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
230: pi_loc = p_loc->i; pj_loc = p_loc->j;
231: if (size > 1) {
232: p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
233: pi_oth = p_oth->i; pj_oth = p_oth->j;
234: } else {
235: p_oth = NULL;
236: pi_oth = NULL; pj_oth = NULL;
237: }
239: /* first, compute symbolic AP = A_loc*P = A_diag*P_loc + A_off*P_oth */
240: /*-------------------------------------------------------------------*/
241: PetscMalloc1(am+2,&api);
242: ptap->api = api;
243: api[0] = 0;
245: /* create and initialize a linked list */
246: PetscLLCondensedCreate(pN,pN,&lnk,&lnkbt);
248: /* Initial FreeSpace size is fill*(nnz(A)+nnz(P)) */
249: PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(adi[am],PetscIntSumTruncate(aoi[am],pi_loc[pm]))),&free_space);
250: current_space = free_space;
252: MatPreallocateInitialize(comm,am,pn,dnz,onz);
253: for (i=0; i<am; i++) {
254: /* diagonal portion of A */
255: nzi = adi[i+1] - adi[i];
256: for (j=0; j<nzi; j++) {
257: row = *adj++;
258: pnz = pi_loc[row+1] - pi_loc[row];
259: Jptr = pj_loc + pi_loc[row];
260: /* add non-zero cols of P into the sorted linked list lnk */
261: PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);
262: }
263: /* off-diagonal portion of A */
264: nzi = aoi[i+1] - aoi[i];
265: for (j=0; j<nzi; j++) {
266: row = *aoj++;
267: pnz = pi_oth[row+1] - pi_oth[row];
268: Jptr = pj_oth + pi_oth[row];
269: PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);
270: }
272: apnz = lnk[0];
273: api[i+1] = api[i] + apnz;
275: /* if free space is not available, double the total space in the list */
276: if (current_space->local_remaining<apnz) {
277: PetscFreeSpaceGet(PetscIntSumTruncate(apnz,current_space->total_array_size),&current_space);
278: nspacedouble++;
279: }
281: /* Copy data into free space, then initialize lnk */
282: PetscLLCondensedClean(pN,apnz,current_space->array,lnk,lnkbt);
283: MatPreallocateSet(i+rstart,apnz,current_space->array,dnz,onz);
285: current_space->array += apnz;
286: current_space->local_used += apnz;
287: current_space->local_remaining -= apnz;
288: }
290: /* Allocate space for apj, initialize apj, and */
291: /* destroy list of free space and other temporary array(s) */
292: PetscMalloc1(api[am]+1,&ptap->apj);
293: apj = ptap->apj;
294: PetscFreeSpaceContiguous(&free_space,ptap->apj);
295: PetscLLDestroy(lnk,lnkbt);
297: /* malloc apa to store dense row A[i,:]*P */
298: PetscCalloc1(pN,&ptap->apa);
300: /* create and assemble symbolic parallel matrix Cmpi */
301: /*----------------------------------------------------*/
302: MatCreate(comm,&Cmpi);
303: MatSetSizes(Cmpi,am,pn,PETSC_DETERMINE,PETSC_DETERMINE);
304: MatSetBlockSizesFromMats(Cmpi,A,P);
306: MatGetType(A,&mtype);
307: MatSetType(Cmpi,mtype);
308: MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);
310: MatSetValues_MPIAIJ_CopyFromCSRFormat_Symbolic(Cmpi, apj, api);
311: MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);
312: MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);
313: MatPreallocateFinalize(dnz,onz);
315: ptap->destroy = Cmpi->ops->destroy;
316: ptap->duplicate = Cmpi->ops->duplicate;
317: Cmpi->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable;
318: Cmpi->ops->destroy = MatDestroy_MPIAIJ_MatMatMult;
319: Cmpi->ops->freeintermediatedatastructures = MatFreeIntermediateDataStructures_MPIAIJ_AP;
321: /* attach the supporting struct to Cmpi for reuse */
322: c = (Mat_MPIAIJ*)Cmpi->data;
323: c->ap = ptap;
325: *C = Cmpi;
327: /* set MatInfo */
328: afill = (PetscReal)api[am]/(adi[am]+aoi[am]+pi_loc[pm]+1) + 1.e-5;
329: if (afill < 1.0) afill = 1.0;
330: Cmpi->info.mallocs = nspacedouble;
331: Cmpi->info.fill_ratio_given = fill;
332: Cmpi->info.fill_ratio_needed = afill;
334: #if defined(PETSC_USE_INFO)
335: if (api[am]) {
336: PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)afill);
337: PetscInfo1(Cmpi,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.\n",(double)afill);
338: } else {
339: PetscInfo(Cmpi,"Empty matrix product\n");
340: }
341: #endif
342: return(0);
343: }
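/*
   Tuning sketch (hedged): running with -info makes the PetscInfo() calls above print
   "Fill ratio: given <fill> needed <afill>".  Passing a fill at least as large as the
   reported value on the next symbolic product avoids reallocations of the free-space
   list, e.g.

     PetscReal fill = 1.6;                          hypothetical value read from -info output
     MatMatMult(A,B,MAT_INITIAL_MATRIX,fill,&C);

   With a sufficient fill, nspacedouble stays 0 and Cmpi->info.mallocs is 0.
*/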
345: PETSC_INTERN PetscErrorCode MatMatMult_MPIAIJ_MPIDense(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
346: {
350: if (scall == MAT_INITIAL_MATRIX) {
351: *C = NULL;
352: PetscLogEventBegin(MAT_MatMultSymbolic,A,B,0,0);
353: MatMatMultSymbolic_MPIAIJ_MPIDense(A,B,fill,C);
354: PetscLogEventEnd(MAT_MatMultSymbolic,A,B,0,0);
355: }
356: PetscLogEventBegin(MAT_MatMultNumeric,A,B,0,0);
357: MatMatMultNumeric_MPIAIJ_MPIDense(A,B,*C);
358: PetscLogEventEnd(MAT_MatMultNumeric,A,B,0,0);
359: return(0);
360: }
362: typedef struct {
363: Mat workB,Bb,Cb,workB1,Bb1,Cb1;
364: MPI_Request *rwaits,*swaits;
365: PetscInt numBb; /* num of Bb matrices */
366: PetscInt nsends,nrecvs;
367: MPI_Datatype *stype,*rtype;
368: } MPIAIJ_MPIDense;
370: PetscErrorCode MatMPIAIJ_MPIDenseDestroy(void *ctx)
371: {
372: MPIAIJ_MPIDense *contents = (MPIAIJ_MPIDense*)ctx;
373: PetscErrorCode ierr;
374: PetscInt i;
377: MatDestroy(&contents->workB);
379: if (contents->numBb) {
380: MatDestroy(&contents->Bb);
381: MatDestroy(&contents->Cb);
383: MatDestroy(&contents->workB1);
384: MatDestroy(&contents->Bb1);
385: MatDestroy(&contents->Cb1);
386: }
387: for (i=0; i<contents->nsends; i++) {
388: MPI_Type_free(&contents->stype[i]);
389: }
390: for (i=0; i<contents->nrecvs; i++) {
391: MPI_Type_free(&contents->rtype[i]);
392: }
393: PetscFree4(contents->stype,contents->rtype,contents->rwaits,contents->swaits);
394: PetscFree(contents);
395: return(0);
396: }
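/*
   Sketch of the PetscContainer pattern used to attach and later retrieve a context such
   as MPIAIJ_MPIDense (hedged example; MyCtx and MyCtxDestroy are placeholder names):

     PetscContainer container;
     MyCtx          *ctx;
     PetscNew(&ctx);
     PetscContainerCreate(PetscObjectComm((PetscObject)C),&container);
     PetscContainerSetPointer(container,ctx);
     PetscContainerSetUserDestroy(container,MyCtxDestroy);     called when C is destroyed
     PetscObjectCompose((PetscObject)C,"myctx",(PetscObject)container);
     PetscContainerDestroy(&container);                        C keeps its own reference

     later, e.g. in the numeric phase:
     PetscObjectQuery((PetscObject)C,"myctx",(PetscObject*)&container);
     PetscContainerGetPointer(container,(void**)&ctx);

   MatMatMultSymbolic_MPIAIJ_MPIDense() below composes its MPIAIJ_MPIDense context under
   the key "workB" in exactly this way, with MatMPIAIJ_MPIDenseDestroy() as the callback.
*/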
398: /*
399: This is a "dummy function" that handles the case where matrix C was created as a dense matrix
400: directly by the user and passed to MatMatMult() with the MAT_REUSE_MATRIX option.
402: It is the same as MatMatMultSymbolic_MPIAIJ_MPIDense() except that it does not create C.
403: */
404: PetscErrorCode MatMatMultNumeric_MPIDense(Mat A,Mat B,Mat C)
405: {
409: MatMatMultSymbolic_MPIAIJ_MPIDense(A,B,0,&C);
410: (*C->ops->matmultnumeric)(A,B,C);
411: return(0);
412: }
414: /*
415: Create Bb, Cb, Bb1 and Cb1 matrices to be used by MatMatMult_MPIAIJ_MPIDense().
416: These matrices are used as wrappers for sub-columns of B and C, thus their own matrix operations are not used.
417: Modified from MatCreateDense().
418: */
419: PETSC_STATIC_INLINE PetscErrorCode MatCreateSubMPIDense_private(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt rbs,PetscInt cbs,PetscScalar *data,Mat *A)
420: {
424: MatCreate(comm,A);
425: MatSetSizes(*A,m,n,M,N);
426: MatSetBlockSizes(*A,rbs,cbs);
427: MatSetType(*A,MATMPIDENSE);
428: MatMPIDenseSetPreallocation(*A,data);
429: (*A)->assembled = PETSC_TRUE;
430: return(0);
431: }
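/*
   Sketch of how these wrappers are used (hedged; start1 stands for the local offset of a
   column block in B's column-major array): the wrapper borrows the caller's storage
   instead of owning its own, so a block of columns of B appears as a standalone
   MPIDENSE matrix.

     const PetscScalar *barray;
     MatDenseGetArrayRead(B,&barray);
     MatDensePlaceArray(Bb,barray+start1);     Bb now views one column block of B
     ... use Bb as a regular matrix ...
     MatDenseResetArray(Bb);
     MatDenseRestoreArrayRead(B,&barray);

   MatMatMultNumeric_MPIAIJ_MPIDense_private() further below follows exactly this
   pattern for Bb/Cb and Bb1/Cb1.
*/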
433: PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIDense(Mat A,Mat B,PetscReal fill,Mat *C)
434: {
435: PetscErrorCode ierr;
436: Mat_MPIAIJ *aij=(Mat_MPIAIJ*)A->data;
437: Mat_MPIDense *b=(Mat_MPIDense*)B->data;
438: Mat_SeqDense *bseq=(Mat_SeqDense*)(b->A)->data;
439: PetscInt nz=aij->B->cmap->n,nsends,nrecvs,i,nrows_to,j,lda=bseq->lda;
440: PetscContainer container;
441: MPIAIJ_MPIDense *contents;
442: VecScatter ctx=aij->Mvctx;
443: PetscInt Am=A->rmap->n,Bm=B->rmap->n,Bn=B->cmap->n,BN=B->cmap->N,Bbn,Bbn1,bs,nrows_from;
444: MPI_Comm comm;
445: MPI_Datatype type1,*stype,*rtype;
446: const PetscInt *sindices,*sstarts,*rstarts;
447: PetscMPIInt *disp;
450: PetscObjectGetComm((PetscObject)A,&comm);
451: if (!(*C)) {
452: MatCreate(comm,C);
453: MatSetSizes(*C,Am,Bn,A->rmap->N,BN);
454: MatSetBlockSizesFromMats(*C,A,B);
455: MatSetType(*C,MATMPIDENSE);
456: MatMPIDenseSetPreallocation(*C,NULL);
457: } else {
458: /* Check matrix size */
459: if ((*C)->rmap->n != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Matrix local row dimensions are incompatible, %D != %D",(*C)->rmap->n,A->rmap->n);
460: if ((*C)->cmap->n != B->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Matrix local column dimensions are incompatible, %D != %D",(*C)->cmap->n,B->cmap->n);
461: }
463: (*C)->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIDense;
465: PetscNew(&contents);
466: contents->numBb = 0;
468: VecScatterGetRemote_Private(ctx,PETSC_TRUE/*send*/,&nsends,&sstarts,&sindices,NULL,NULL);
469: VecScatterGetRemoteOrdered_Private(ctx,PETSC_FALSE/*recv*/,&nrecvs,&rstarts,NULL,NULL,NULL);
471: /* Create column block of B and C for memory scalability when BN is too large */
472: /* Estimate Bbn, column size of Bb */
473: if (nz) {
474: Bbn1 = 2*Am*BN/nz;
475: } else Bbn1 = BN;
477: bs = PetscAbs(B->cmap->bs);
478: Bbn1 = Bbn1/bs *bs; /* Bbn1 is a multiple of bs */
479: if (Bbn1 > BN) Bbn1 = BN;
480: MPI_Allreduce(&Bbn1,&Bbn,1,MPIU_INT,MPI_MAX,comm);
482: /* Enable runtime option for Bbn */
483: PetscOptionsBegin(comm,((PetscObject)*C)->prefix,"MatMatMult","Mat");
484: PetscOptionsInt("-matmatmult_Bbn","Number of columns in Bb","MatMatMult",Bbn,&Bbn,NULL);
485: if (Bbn > BN) SETERRQ2(comm,PETSC_ERR_ARG_SIZ,"Bbn=%D cannot be larger than %D, column size of B",Bbn,BN);
486: PetscOptionsEnd();
488: if (Bbn < BN) {
489: contents->numBb = BN/Bbn;
490: Bbn1 = BN - contents->numBb*Bbn;
491: }
493: if (contents->numBb) {
494: PetscScalar data[1]; /* fake array for Bb and Cb */
495: PetscInfo3(*C,"use Bb, BN=%D, Bbn=%D; numBb=%D\n",BN,Bbn,contents->numBb);
496: MatCreateSubMPIDense_private(comm,B->rmap->n,PETSC_DECIDE,A->rmap->N,Bbn,B->rmap->bs,B->cmap->bs,data,&contents->Bb);
497: MatCreateSubMPIDense_private(comm,Am,PETSC_DECIDE,A->rmap->N,Bbn,(*C)->rmap->bs,(*C)->cmap->bs,data,&contents->Cb);
499: if (Bbn1) { /* Create Bb1 and Cb1 for the remaining columns */
500: PetscInfo2(*C,"use Bb1, BN=%D, Bbn1=%D\n",BN,Bbn1);
501: MatCreateSubMPIDense_private(comm,B->rmap->n,PETSC_DECIDE,A->rmap->N,Bbn1,B->rmap->bs,B->cmap->bs,data,&contents->Bb1);
502: MatCreateSubMPIDense_private(comm,Am,PETSC_DECIDE,A->rmap->N,Bbn1,(*C)->rmap->bs,(*C)->cmap->bs,data,&contents->Cb1);
504: /* Create work matrix used to store off processor rows of B needed for local product */
505: MatCreateSeqDense(PETSC_COMM_SELF,nz,Bbn1,NULL,&contents->workB1);
506: }
507: }
509: /* Create work matrix used to store off processor rows of B needed for local product */
510: MatCreateSeqDense(PETSC_COMM_SELF,nz,Bbn,NULL,&contents->workB);
512: /* Use MPI derived data type to reduce memory required by the send/recv buffers */
513: PetscMalloc4(nsends,&stype,nrecvs,&rtype,nrecvs,&contents->rwaits,nsends,&contents->swaits);
514: contents->stype = stype;
515: contents->nsends = nsends;
517: contents->rtype = rtype;
518: contents->nrecvs = nrecvs;
520: PetscMalloc1(Bm+1,&disp);
521: for (i=0; i<nsends; i++) {
522: nrows_to = sstarts[i+1]-sstarts[i];
523: for (j=0; j<nrows_to; j++){
524: disp[j] = sindices[sstarts[i]+j]; /* rowB to be sent */
525: }
526: MPI_Type_create_indexed_block(nrows_to,1,(const PetscMPIInt *)disp,MPIU_SCALAR,&type1);
528: MPI_Type_create_resized(type1,0,lda*sizeof(PetscScalar),&stype[i]);
529: MPI_Type_commit(&stype[i]);
530: MPI_Type_free(&type1);
531: }
533: for (i=0; i<nrecvs; i++) {
534: /* received values from a process form a (nrows_from x Bbn) row block in workB (column-wise) */
535: nrows_from = rstarts[i+1]-rstarts[i];
536: disp[0] = 0;
537: MPI_Type_create_indexed_block(1, nrows_from, (const PetscMPIInt *)disp, MPIU_SCALAR, &type1);
538: MPI_Type_create_resized(type1, 0, nz*sizeof(PetscScalar), &rtype[i]);
539: MPI_Type_commit(&rtype[i]);
540: MPI_Type_free(&type1);
541: }
543: PetscFree(disp);
544: VecScatterRestoreRemote_Private(ctx,PETSC_TRUE/*send*/,&nsends,&sstarts,&sindices,NULL,NULL);
545: VecScatterRestoreRemoteOrdered_Private(ctx,PETSC_FALSE/*recv*/,&nrecvs,&rstarts,NULL,NULL,NULL);
547: PetscContainerCreate(comm,&container);
548: PetscContainerSetPointer(container,contents);
549: PetscContainerSetUserDestroy(container,MatMPIAIJ_MPIDenseDestroy);
550: PetscObjectCompose((PetscObject)(*C),"workB",(PetscObject)container);
551: PetscContainerDestroy(&container);
552: MatAssemblyBegin(*C,MAT_FINAL_ASSEMBLY);
553: MatAssemblyEnd(*C,MAT_FINAL_ASSEMBLY);
554: return(0);
555: }
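/*
   Minimal sketch of the derived-datatype trick above (hedged, plain MPI; lda, array,
   ncols, dest, tag and comm are placeholders): send selected rows of a column-major,
   lda-strided dense array without packing, by describing one column's worth of rows and
   resizing the extent to one column.

     MPI_Datatype rows1col,rowsNcols;
     int          nrows = 3, rowidx[3] = {0,4,7};                 rows to send (placeholder values)
     MPI_Type_create_indexed_block(nrows,1,rowidx,MPI_DOUBLE,&rows1col);
     MPI_Type_create_resized(rows1col,0,(MPI_Aint)(lda*sizeof(double)),&rowsNcols);
     MPI_Type_commit(&rowsNcols);
     MPI_Send(array,ncols,rowsNcols,dest,tag,comm);               ncols repetitions, one per column
     MPI_Type_free(&rows1col);
     MPI_Type_free(&rowsNcols);

   The receive side (rtype[] above) uses one contiguous block of nrows_from entries
   resized to nz*sizeof(PetscScalar), so consecutive columns land nz apart in workB.
*/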
557: extern PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat,Mat,Mat);
558: /*
559: Performs an efficient scatter on the rows of B needed by this process; this is
560: a modification of the VecScatterBegin_() routines.
562: Input: Bbidx = 0: B = Bb
563: = 1: B = Bb1, see MatMatMultSymbolic_MPIAIJ_MPIDense()
564: */
565: PetscErrorCode MatMPIDenseScatter(Mat A,Mat B,PetscInt Bbidx,Mat C,Mat *outworkB)
566: {
567: Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
568: PetscErrorCode ierr;
569: const PetscScalar *b;
570: PetscScalar *rvalues;
571: VecScatter ctx = aij->Mvctx;
572: const PetscInt *sindices,*sstarts,*rstarts;
573: const PetscMPIInt *sprocs,*rprocs;
574: PetscInt i,nsends,nrecvs,nrecvs2;
575: MPI_Request *swaits,*rwaits;
576: MPI_Comm comm;
577: PetscMPIInt tag=((PetscObject)ctx)->tag,ncols=B->cmap->N,nrows=aij->B->cmap->n,imdex,nsends_mpi,nrecvs_mpi;
578: MPI_Status status;
579: MPIAIJ_MPIDense *contents;
580: PetscContainer container;
581: Mat workB;
582: MPI_Datatype *stype,*rtype;
585: PetscObjectGetComm((PetscObject)A,&comm);
586: PetscObjectQuery((PetscObject)C,"workB",(PetscObject*)&container);
587: if (!container) SETERRQ(comm,PETSC_ERR_PLIB,"Container does not exist");
588: PetscContainerGetPointer(container,(void**)&contents);
590: VecScatterGetRemote_Private(ctx,PETSC_TRUE/*send*/,&nsends,&sstarts,&sindices,&sprocs,NULL/*bs*/);
591: VecScatterGetRemoteOrdered_Private(ctx,PETSC_FALSE/*recv*/,&nrecvs,&rstarts,NULL,&rprocs,NULL/*bs*/);
592: PetscMPIIntCast(nsends,&nsends_mpi);
593: PetscMPIIntCast(nrecvs,&nrecvs_mpi);
594: if (Bbidx == 0) {
595: workB = *outworkB = contents->workB;
596: } else {
597: workB = *outworkB = contents->workB1;
598: }
599: if (nrows != workB->rmap->n) SETERRQ2(comm,PETSC_ERR_PLIB,"Number of rows of workB %D not equal to columns of aij->B %D",workB->rmap->n,nrows);
600: swaits = contents->swaits;
601: rwaits = contents->rwaits;
603: MatDenseGetArrayRead(B,&b);
604: MatDenseGetArray(workB,&rvalues);
606: /* Post recv, use MPI derived data type to save memory */
607: rtype = contents->rtype;
608: for (i=0; i<nrecvs; i++) {
609: MPI_Irecv(rvalues+(rstarts[i]-rstarts[0]),ncols,rtype[i],rprocs[i],tag,comm,rwaits+i);
610: }
612: stype = contents->stype;
613: for (i=0; i<nsends; i++) {
614: MPI_Isend(b,ncols,stype[i],sprocs[i],tag,comm,swaits+i);
615: }
617: nrecvs2 = nrecvs;
618: while (nrecvs2) {
619: MPI_Waitany(nrecvs_mpi,rwaits,&imdex,&status);
620: nrecvs2--;
621: }
622: if (nsends) {MPI_Waitall(nsends_mpi,swaits,MPI_STATUSES_IGNORE);}
624: VecScatterRestoreRemote_Private(ctx,PETSC_TRUE/*send*/,&nsends,&sstarts,&sindices,&sprocs,NULL);
625: VecScatterRestoreRemoteOrdered_Private(ctx,PETSC_FALSE/*recv*/,&nrecvs,&rstarts,NULL,&rprocs,NULL);
626: MatDenseRestoreArrayRead(B,&b);
627: MatDenseRestoreArray(workB,&rvalues);
628: MatAssemblyBegin(workB,MAT_FINAL_ASSEMBLY);
629: MatAssemblyEnd(workB,MAT_FINAL_ASSEMBLY);
630: return(0);
631: }
633: /*
634: Compute Cb = A*Bb
635: */
636: PETSC_STATIC_INLINE PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIDense_private(Mat A,Mat Bb,PetscInt Bbidx,PetscInt start,Mat C,const PetscScalar *barray,PetscScalar *carray,Mat Cb)
637: {
638: PetscErrorCode ierr;
639: PetscInt start1;
640: Mat workB;
641: Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
642: Mat_MPIDense *cbdense = (Mat_MPIDense*)Cb->data;
645: /* Place barray to Bb */
646: start1 = start*Bb->rmap->n;
647: MatDensePlaceArray(Bb,barray+start1);
649: /* get off processor parts of Bb needed to complete Cb=A*Bb */
650: MatMPIDenseScatter(A,Bb,Bbidx,C,&workB);
651: MatDenseResetArray(Bb);
653: /* off-diagonal block of A times nonlocal rows of Bb */
654: /* Place carray to Cb */
655: start1 = start*Cb->rmap->n;
656: MatDensePlaceArray(Cb,carray+start1);
657: MatMatMultNumericAdd_SeqAIJ_SeqDense(aij->B,workB,cbdense->A);
658: MatDenseResetArray(Cb);
659: return(0);
660: }
662: PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIDense(Mat A,Mat B,Mat C)
663: {
664: PetscErrorCode ierr;
665: Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
666: Mat_MPIDense *bdense = (Mat_MPIDense*)B->data;
667: Mat_MPIDense *cdense = (Mat_MPIDense*)C->data;
668: Mat workB;
669: MPIAIJ_MPIDense *contents;
670: PetscContainer container;
671: MPI_Comm comm;
672: PetscInt numBb;
675: /* diagonal block of A times all local rows of B*/
676: MatMatMultNumeric_SeqAIJ_SeqDense(aij->A,bdense->A,cdense->A);
678: PetscObjectGetComm((PetscObject)A,&comm);
679: PetscObjectQuery((PetscObject)C,"workB",(PetscObject*)&container);
680: if (!container) SETERRQ(comm,PETSC_ERR_PLIB,"Container does not exist");
681: PetscContainerGetPointer(container,(void**)&contents);
682: numBb = contents->numBb;
684: if (!numBb) {
685: /* get off processor parts of B needed to complete C=A*B */
686: MatMPIDenseScatter(A,B,0,C,&workB);
688: /* off-diagonal block of A times nonlocal rows of B */
689: MatMatMultNumericAdd_SeqAIJ_SeqDense(aij->B,workB,cdense->A);
691: } else {
692: const PetscScalar *barray;
693: PetscScalar *carray;
694: Mat Bb=contents->Bb,Cb=contents->Cb;
695: PetscInt BbN=Bb->cmap->N,start,i;
697: MatDenseGetArrayRead(B,&barray);
698: MatDenseGetArray(C,&carray);
699: for (i=0; i<numBb; i++) {
700: start = i*BbN;
701: MatMatMultNumeric_MPIAIJ_MPIDense_private(A,Bb,0,start,C,barray,carray,Cb);
702: }
704: if (contents->Bb1) {
705: Bb = contents->Bb1; Cb = contents->Cb1;
706: start = i*BbN;
707: MatMatMultNumeric_MPIAIJ_MPIDense_private(A,Bb,1,start,C,barray,carray,Cb);
708: }
709: MatDenseRestoreArrayRead(B,&barray);
710: MatDenseRestoreArray(C,&carray);
711: }
712: return(0);
713: }
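/*
   Usage sketch for the MPIAIJ*MPIDense product implemented above (hedged; m, M, N are
   placeholder sizes, error checking omitted, and B's row layout must match A's column
   layout):

     Mat A;                                                 MPIAIJ
     Mat B,C;                                               MPIDENSE
     MatCreateDense(comm,m,PETSC_DECIDE,M,N,NULL,&B);
     ... set entries of B ...
     MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);
     MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);
     MatMatMult(A,B,MAT_INITIAL_MATRIX,PETSC_DEFAULT,&C);
     MatMatMult(A,B,MAT_REUSE_MATRIX,PETSC_DEFAULT,&C);     reuses the "workB" context composed on C

   When B has very many columns, the runtime option -matmatmult_Bbn <n> limits how many
   columns are processed per pass (the Bb/Cb blocks above), bounding the size of workB.
*/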
715: PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIAIJ(Mat A,Mat P,Mat C)
716: {
718: Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data,*c=(Mat_MPIAIJ*)C->data;
719: Mat_SeqAIJ *ad = (Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
720: Mat_SeqAIJ *cd = (Mat_SeqAIJ*)(c->A)->data,*co=(Mat_SeqAIJ*)(c->B)->data;
721: PetscInt *adi = ad->i,*adj,*aoi=ao->i,*aoj;
722: PetscScalar *ada,*aoa,*cda=cd->a,*coa=co->a;
723: Mat_SeqAIJ *p_loc,*p_oth;
724: PetscInt *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pj;
725: PetscScalar *pa_loc,*pa_oth,*pa,valtmp,*ca;
726: PetscInt cm = C->rmap->n,anz,pnz;
727: Mat_APMPI *ptap = c->ap;
728: PetscScalar *apa_sparse;
729: PetscInt *api,*apj,*apJ,i,j,k,row;
730: PetscInt cstart = C->cmap->rstart;
731: PetscInt cdnz,conz,k0,k1,nextp;
732: MPI_Comm comm;
733: PetscMPIInt size;
736: PetscObjectGetComm((PetscObject)C,&comm);
737: MPI_Comm_size(comm,&size);
739: if (!ptap->P_oth && size>1) {
740: SETERRQ(comm,PETSC_ERR_ARG_WRONGSTATE,"AP cannot be reused. Do not call MatFreeIntermediateDataStructures() or use '-mat_freeintermediatedatastructures'");
741: }
742: apa_sparse = ptap->apa;
744: /* 1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
745: /*-----------------------------------------------------*/
746: /* update numerical values of P_oth and P_loc */
747: MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);
748: MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);
750: /* 2) compute numeric C_loc = A_loc*P = Ad*P_loc + Ao*P_oth */
751: /*----------------------------------------------------------*/
752: /* get data from symbolic products */
753: p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
754: pi_loc = p_loc->i; pj_loc = p_loc->j; pa_loc = p_loc->a;
755: if (size >1) {
756: p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
757: pi_oth = p_oth->i; pj_oth = p_oth->j; pa_oth = p_oth->a;
758: } else {
759: p_oth = NULL; pi_oth = NULL; pj_oth = NULL; pa_oth = NULL;
760: }
762: api = ptap->api;
763: apj = ptap->apj;
764: for (i=0; i<cm; i++) {
765: apJ = apj + api[i];
767: /* diagonal portion of A */
768: anz = adi[i+1] - adi[i];
769: adj = ad->j + adi[i];
770: ada = ad->a + adi[i];
771: for (j=0; j<anz; j++) {
772: row = adj[j];
773: pnz = pi_loc[row+1] - pi_loc[row];
774: pj = pj_loc + pi_loc[row];
775: pa = pa_loc + pi_loc[row];
776: /* perform sparse axpy */
777: valtmp = ada[j];
778: nextp = 0;
779: for (k=0; nextp<pnz; k++) {
780: if (apJ[k] == pj[nextp]) { /* column of AP == column of P */
781: apa_sparse[k] += valtmp*pa[nextp++];
782: }
783: }
784: PetscLogFlops(2.0*pnz);
785: }
787: /* off-diagonal portion of A */
788: anz = aoi[i+1] - aoi[i];
789: aoj = ao->j + aoi[i];
790: aoa = ao->a + aoi[i];
791: for (j=0; j<anz; j++) {
792: row = aoj[j];
793: pnz = pi_oth[row+1] - pi_oth[row];
794: pj = pj_oth + pi_oth[row];
795: pa = pa_oth + pi_oth[row];
796: /* perform sparse axpy */
797: valtmp = aoa[j];
798: nextp = 0;
799: for (k=0; nextp<pnz; k++) {
800: if (apJ[k] == pj[nextp]) { /* column of AP == column of P */
801: apa_sparse[k] += valtmp*pa[nextp++];
802: }
803: }
804: PetscLogFlops(2.0*pnz);
805: }
807: /* set values in C */
808: cdnz = cd->i[i+1] - cd->i[i];
809: conz = co->i[i+1] - co->i[i];
811: /* 1st off-diagonal part of C */
812: ca = coa + co->i[i];
813: k = 0;
814: for (k0=0; k0<conz; k0++) {
815: if (apJ[k] >= cstart) break;
816: ca[k0] = apa_sparse[k];
817: apa_sparse[k] = 0.0;
818: k++;
819: }
821: /* diagonal part of C */
822: ca = cda + cd->i[i];
823: for (k1=0; k1<cdnz; k1++) {
824: ca[k1] = apa_sparse[k];
825: apa_sparse[k] = 0.0;
826: k++;
827: }
829: /* 2nd off-diagonal part of C */
830: ca = coa + co->i[i];
831: for (; k0<conz; k0++) {
832: ca[k0] = apa_sparse[k];
833: apa_sparse[k] = 0.0;
834: k++;
835: }
836: }
837: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
838: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
840: if (ptap->freestruct) {
841: MatFreeIntermediateDataStructures(C);
842: }
843: return(0);
844: }
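/*
   Worked example of the sparse-axpy merge in the loops above (hedged): suppose the
   precomputed pattern of row i of AP is apJ = {1,3,5,8} and one row of P contributes
   columns pj = {3,8} with values pa = {2,7}, scaled by valtmp = 10.  The scan

     k:    0    1     2    3
     apJ:  1    3     5    8
     add:  -   +20    -   +70      (apa_sparse[k] += valtmp*pa[nextp++])

   advances through apJ once, so each row of A costs O(apnz) rather than the O(pN) of
   the dense accumulator in the nonscalable kernel.
*/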
846: /* same as MatMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(), except that it uses the scalable condensed linked list to avoid the O(BN) memory requirement */
847: PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIAIJ(Mat A,Mat P,PetscReal fill,Mat *C)
848: {
849: PetscErrorCode ierr;
850: MPI_Comm comm;
851: PetscMPIInt size;
852: Mat Cmpi;
853: Mat_APMPI *ptap;
854: PetscFreeSpaceList free_space = NULL,current_space=NULL;
855: Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data,*c;
856: Mat_SeqAIJ *ad = (Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_loc,*p_oth;
857: PetscInt *pi_loc,*pj_loc,*pi_oth,*pj_oth,*dnz,*onz;
858: PetscInt *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,rstart=A->rmap->rstart;
859: PetscInt i,pnz,row,*api,*apj,*Jptr,apnz,nspacedouble=0,j,nzi,*lnk,apnz_max=0;
860: PetscInt am=A->rmap->n,pn=P->cmap->n,pm=P->rmap->n,lsize=pn+20;
861: PetscReal afill;
862: MatType mtype;
865: PetscObjectGetComm((PetscObject)A,&comm);
866: MPI_Comm_size(comm,&size);
868: /* create struct Mat_APMPI and attach it to C later */
869: PetscNew(&ptap);
871: /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
872: MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);
874: /* get P_loc by taking all local rows of P */
875: MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);
877: p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
878: pi_loc = p_loc->i; pj_loc = p_loc->j;
879: if (size > 1) {
880: p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
881: pi_oth = p_oth->i; pj_oth = p_oth->j;
882: } else {
883: p_oth = NULL;
884: pi_oth = NULL; pj_oth = NULL;
885: }
887: /* first, compute symbolic AP = A_loc*P = A_diag*P_loc + A_off*P_oth */
888: /*-------------------------------------------------------------------*/
889: PetscMalloc1(am+2,&api);
890: ptap->api = api;
891: api[0] = 0;
893: PetscLLCondensedCreate_Scalable(lsize,&lnk);
895: /* Initial FreeSpace size is fill*(nnz(A)+nnz(P)) */
896: PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(adi[am],PetscIntSumTruncate(aoi[am],pi_loc[pm]))),&free_space);
897: current_space = free_space;
898: MatPreallocateInitialize(comm,am,pn,dnz,onz);
899: for (i=0; i<am; i++) {
900: /* diagonal portion of A */
901: nzi = adi[i+1] - adi[i];
902: for (j=0; j<nzi; j++) {
903: row = *adj++;
904: pnz = pi_loc[row+1] - pi_loc[row];
905: Jptr = pj_loc + pi_loc[row];
906: /* Expand list if it is not long enough */
907: if (pnz+apnz_max > lsize) {
908: lsize = pnz+apnz_max;
909: PetscLLCondensedExpand_Scalable(lsize, &lnk);
910: }
911: /* add non-zero cols of P into the sorted linked list lnk */
912: PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);
913: apnz = *lnk; /* The first element in the list is the number of items in the list */
914: api[i+1] = api[i] + apnz;
915: if (apnz > apnz_max) apnz_max = apnz;
916: }
917: /* off-diagonal portion of A */
918: nzi = aoi[i+1] - aoi[i];
919: for (j=0; j<nzi; j++) {
920: row = *aoj++;
921: pnz = pi_oth[row+1] - pi_oth[row];
922: Jptr = pj_oth + pi_oth[row];
923: /* Expand list if it is not long enough */
924: if (pnz+apnz_max > lsize) {
925: lsize = pnz + apnz_max;
926: PetscLLCondensedExpand_Scalable(lsize, &lnk);
927: }
928: /* add non-zero cols of P into the sorted linked list lnk */
929: PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);
930: apnz = *lnk; /* The first element in the list is the number of items in the list */
931: api[i+1] = api[i] + apnz;
932: if (apnz > apnz_max) apnz_max = apnz;
933: }
934: apnz = *lnk;
935: api[i+1] = api[i] + apnz;
936: if (apnz > apnz_max) apnz_max = apnz;
938: /* if free space is not available, double the total space in the list */
939: if (current_space->local_remaining<apnz) {
940: PetscFreeSpaceGet(PetscIntSumTruncate(apnz,current_space->total_array_size),&current_space);
941: nspacedouble++;
942: }
944: /* Copy data into free space, then initialize lnk */
945: PetscLLCondensedClean_Scalable(apnz,current_space->array,lnk);
946: MatPreallocateSet(i+rstart,apnz,current_space->array,dnz,onz);
948: current_space->array += apnz;
949: current_space->local_used += apnz;
950: current_space->local_remaining -= apnz;
951: }
953: /* Allocate space for apj, initialize apj, and */
954: /* destroy list of free space and other temporary array(s) */
955: PetscMalloc1(api[am]+1,&ptap->apj);
956: apj = ptap->apj;
957: PetscFreeSpaceContiguous(&free_space,ptap->apj);
958: PetscLLCondensedDestroy_Scalable(lnk);
960: /* create and assemble symbolic parallel matrix Cmpi */
961: /*----------------------------------------------------*/
962: MatCreate(comm,&Cmpi);
963: MatSetSizes(Cmpi,am,pn,PETSC_DETERMINE,PETSC_DETERMINE);
964: MatSetBlockSizesFromMats(Cmpi,A,P);
965: MatGetType(A,&mtype);
966: MatSetType(Cmpi,mtype);
967: MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);
969: /* malloc apa for assembling Cmpi */
970: PetscCalloc1(apnz_max,&ptap->apa);
972: MatSetValues_MPIAIJ_CopyFromCSRFormat_Symbolic(Cmpi, apj, api);
973: MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);
974: MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);
975: MatPreallocateFinalize(dnz,onz);
977: ptap->destroy = Cmpi->ops->destroy;
978: ptap->duplicate = Cmpi->ops->duplicate;
979: Cmpi->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIAIJ;
980: Cmpi->ops->destroy = MatDestroy_MPIAIJ_MatMatMult;
981: Cmpi->ops->freeintermediatedatastructures = MatFreeIntermediateDataStructures_MPIAIJ_AP;
983: /* attach the supporting struct to Cmpi for reuse */
984: c = (Mat_MPIAIJ*)Cmpi->data;
985: c->ap = ptap;
986: *C = Cmpi;
988: /* set MatInfo */
989: afill = (PetscReal)api[am]/(adi[am]+aoi[am]+pi_loc[pm]+1) + 1.e-5;
990: if (afill < 1.0) afill = 1.0;
991: Cmpi->info.mallocs = nspacedouble;
992: Cmpi->info.fill_ratio_given = fill;
993: Cmpi->info.fill_ratio_needed = afill;
995: #if defined(PETSC_USE_INFO)
996: if (api[am]) {
997: PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)afill);
998: PetscInfo1(Cmpi,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.\n",(double)afill);
999: } else {
1000: PetscInfo(Cmpi,"Empty matrix product\n");
1001: }
1002: #endif
1003: return(0);
1004: }
1006: /* This function is needed for the seqMPI matrix-matrix multiplication. */
1007: /* Three sorted input arrays are merged into one sorted output array. The size of the */
1008: /* output array is also returned. Duplicate entries appear only once in the output. */
1009: static void Merge3SortedArrays(PetscInt size1, PetscInt *in1,
1010: PetscInt size2, PetscInt *in2,
1011: PetscInt size3, PetscInt *in3,
1012: PetscInt *size4, PetscInt *out)
1013: {
1014: int i = 0, j = 0, k = 0, l = 0;
1016: /* Traverse all three arrays */
1017: while (i<size1 && j<size2 && k<size3) {
1018: if (in1[i] < in2[j] && in1[i] < in3[k]) {
1019: out[l++] = in1[i++];
1020: }
1021: else if(in2[j] < in1[i] && in2[j] < in3[k]) {
1022: out[l++] = in2[j++];
1023: }
1024: else if(in3[k] < in1[i] && in3[k] < in2[j]) {
1025: out[l++] = in3[k++];
1026: }
1027: else if(in1[i] == in2[j] && in1[i] < in3[k]) {
1028: out[l++] = in1[i];
1029: i++, j++;
1030: }
1031: else if(in1[i] == in3[k] && in1[i] < in2[j]) {
1032: out[l++] = in1[i];
1033: i++, k++;
1034: }
1035: else if(in3[k] == in2[j] && in2[j] < in1[i]) {
1036: out[l++] = in2[j];
1037: k++, j++;
1038: }
1039: else if(in1[i] == in2[j] && in1[i] == in3[k]) {
1040: out[l++] = in1[i];
1041: i++, j++, k++;
1042: }
1043: }
1045: /* Traverse two remaining arrays */
1046: while (i<size1 && j<size2) {
1047: if (in1[i] < in2[j]) {
1048: out[l++] = in1[i++];
1049: }
1050: else if(in1[i] > in2[j]) {
1051: out[l++] = in2[j++];
1052: }
1053: else {
1054: out[l++] = in1[i];
1055: i++, j++;
1056: }
1057: }
1059: while (i<size1 && k<size3) {
1060: if (in1[i] < in3[k]) {
1061: out[l++] = in1[i++];
1062: }
1063: else if(in1[i] > in3[k]) {
1064: out[l++] = in3[k++];
1065: }
1066: else {
1067: out[l++] = in1[i];
1068: i++, k++;
1069: }
1070: }
1072: while (k<size3 && j<size2) {
1073: if (in3[k] < in2[j]) {
1074: out[l++] = in3[k++];
1075: }
1076: else if(in3[k] > in2[j]) {
1077: out[l++] = in2[j++];
1078: }
1079: else {
1080: out[l++] = in3[k];
1081: k++, j++;
1082: }
1083: }
1085: /* Traverse one remaining array */
1086: while (i<size1) out[l++] = in1[i++];
1087: while (j<size2) out[l++] = in2[j++];
1088: while (k<size3) out[l++] = in3[k++];
1090: *size4 = l;
1091: }
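/*
   Example of Merge3SortedArrays() (hedged): the output is the sorted union of the
   inputs with duplicates collapsed, and out must have room for size1+size2+size3
   entries in the worst case.

     PetscInt in1[3] = {1,4,9}, in2[2] = {4,7}, in3[3] = {2,4,11};
     PetscInt out[8], size4;
     Merge3SortedArrays(3,in1,2,in2,3,in3,&size4,out);
     now out = {1,2,4,7,9,11} and size4 = 6
*/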
1093: /* This matrix-matrix multiplication algorithm divides the multiplication into three multiplications and */
1094: /* adds up the products. Two of these three multiplications are performed with existing (sequential) */
1095: /* matrix-matrix multiplications. */
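/*
   In block form (a hedged restatement of the comment above), with A = [A_diag A_off] on
   this process and the local rows of P split by column ownership into P_loc_diag and
   P_loc_off:

     A*P = A_diag*P_loc_diag + A_diag*P_loc_off + A_off*P_oth

   The first term is a plain sequential product (adpd below), the second is built with
   the condensed linked list (adpo), and the third is a sequential product with the
   gathered rows P_oth (aopoth); their sorted column patterns are merged row by row
   with Merge3SortedArrays().
*/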
1096: PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIAIJ_seqMPI(Mat A, Mat P, PetscReal fill, Mat *C)
1097: {
1098: PetscErrorCode ierr;
1099: MPI_Comm comm;
1100: PetscMPIInt size;
1101: Mat Cmpi;
1102: Mat_APMPI *ptap;
1103: PetscFreeSpaceList free_space_diag=NULL, current_space=NULL;
1104: Mat_MPIAIJ *a =(Mat_MPIAIJ*)A->data;
1105: Mat_SeqAIJ *ad =(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_loc;
1106: Mat_MPIAIJ *p =(Mat_MPIAIJ*)P->data;
1107: Mat_MPIAIJ *c;
1108: Mat_SeqAIJ *adpd_seq, *p_off, *aopoth_seq;
1109: PetscInt adponz, adpdnz;
1110: PetscInt *pi_loc,*dnz,*onz;
1111: PetscInt *adi=ad->i,*adj=ad->j,*aoi=ao->i,rstart=A->rmap->rstart;
1112: PetscInt *lnk,i, i1=0,pnz,row,*adpoi,*adpoj, *api, *adpoJ, *aopJ, *apJ,*Jptr, aopnz, nspacedouble=0,j,nzi,
1113: *apj,apnz, *adpdi, *adpdj, *adpdJ, *poff_i, *poff_j, *j_temp, *aopothi, *aopothj;
1114: PetscInt am=A->rmap->n,pN=P->cmap->N,pn=P->cmap->n,pm=P->rmap->n, p_colstart, p_colend;
1115: PetscBT lnkbt;
1116: PetscReal afill;
1117: PetscMPIInt rank;
1118: Mat adpd, aopoth;
1119: MatType mtype;
1120: const char *prefix;
1123: PetscObjectGetComm((PetscObject)A,&comm);
1124: MPI_Comm_size(comm,&size);
1125: MPI_Comm_rank(comm, &rank);
1126: MatGetOwnershipRangeColumn(P, &p_colstart, &p_colend);
1128: /* create struct Mat_APMPI and attach it to C later */
1129: PetscNew(&ptap);
1131: /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
1132: MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);
1134: /* get P_loc by taking all local rows of P */
1135: MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);
1138: p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
1139: pi_loc = p_loc->i;
1141: /* Allocate memory for the i arrays of the matrices A*P, A_diag*P_off and A_offd * P */
1142: PetscMalloc1(am+2,&api);
1143: PetscMalloc1(am+2,&adpoi);
1145: adpoi[0] = 0;
1146: ptap->api = api;
1147: api[0] = 0;
1149: /* create and initialize a linked list, will be used for both A_diag * P_loc_off and A_offd * P_oth */
1150: PetscLLCondensedCreate(pN,pN,&lnk,&lnkbt);
1151: MatPreallocateInitialize(comm,am,pn,dnz,onz);
1153: /* Symbolic calc of A_loc_diag * P_loc_diag */
1154: MatGetOptionsPrefix(A,&prefix);
1155: MatSetOptionsPrefix(a->A,prefix);
1156: MatAppendOptionsPrefix(a->A,"inner_diag_");
1157: MatMatMultSymbolic_SeqAIJ_SeqAIJ(a->A, p->A, fill, &adpd);
1158: adpd_seq = (Mat_SeqAIJ*)((adpd)->data);
1159: adpdi = adpd_seq->i; adpdj = adpd_seq->j;
1160: p_off = (Mat_SeqAIJ*)((p->B)->data);
1161: poff_i = p_off->i; poff_j = p_off->j;
1163: /* j_temp stores indices of a result row before they are added to the linked list */
1164: PetscMalloc1(pN+2,&j_temp);
1167: /* Symbolic calc of the A_diag * p_loc_off */
1168: /* Initial FreeSpace size is fill*(nnz(A)+nnz(P)) */
1169: PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(adi[am],PetscIntSumTruncate(aoi[am],pi_loc[pm]))),&free_space_diag);
1170: current_space = free_space_diag;
1172: for (i=0; i<am; i++) {
1173: /* A_diag * P_loc_off */
1174: nzi = adi[i+1] - adi[i];
1175: for (j=0; j<nzi; j++) {
1176: row = *adj++;
1177: pnz = poff_i[row+1] - poff_i[row];
1178: Jptr = poff_j + poff_i[row];
1179: for(i1 = 0; i1 < pnz; i1++) {
1180: j_temp[i1] = p->garray[Jptr[i1]];
1181: }
1182: /* add non-zero cols of P into the sorted linked list lnk */
1183: PetscLLCondensedAddSorted(pnz,j_temp,lnk,lnkbt);
1184: }
1186: adponz = lnk[0];
1187: adpoi[i+1] = adpoi[i] + adponz;
1189: /* if free space is not available, double the total space in the list */
1190: if (current_space->local_remaining<adponz) {
1191: PetscFreeSpaceGet(PetscIntSumTruncate(adponz,current_space->total_array_size),&current_space);
1192: nspacedouble++;
1193: }
1195: /* Copy data into free space, then initialize lnk */
1196: PetscLLCondensedClean(pN,adponz,current_space->array,lnk,lnkbt);
1198: current_space->array += adponz;
1199: current_space->local_used += adponz;
1200: current_space->local_remaining -= adponz;
1201: }
1203: /* Symbolic calc of A_off * P_oth */
1204: MatSetOptionsPrefix(a->B,prefix);
1205: MatAppendOptionsPrefix(a->B,"inner_offdiag_");
1206: MatMatMultSymbolic_SeqAIJ_SeqAIJ(a->B, ptap->P_oth, fill, &aopoth);
1207: aopoth_seq = (Mat_SeqAIJ*)((aopoth)->data);
1208: aopothi = aopoth_seq->i; aopothj = aopoth_seq->j;
1210: /* Allocate space for apj, adpj, aopj, ... */
1211: /* destroy lists of free space and other temporary array(s) */
1213: PetscMalloc1(aopothi[am] + adpoi[am] + adpdi[am]+2, &ptap->apj);
1214: PetscMalloc1(adpoi[am]+2, &adpoj);
1216: /* Copy from linked list to j-array */
1217: PetscFreeSpaceContiguous(&free_space_diag,adpoj);
1218: PetscLLDestroy(lnk,lnkbt);
1220: adpoJ = adpoj;
1221: adpdJ = adpdj;
1222: aopJ = aopothj;
1223: apj = ptap->apj;
1224: apJ = apj; /* still empty */
1226: /* Merge j-arrays of A_off * P, A_diag * P_loc_off, and */
1227: /* A_diag * P_loc_diag to get A*P */
1228: for (i = 0; i < am; i++) {
1229: aopnz = aopothi[i+1] - aopothi[i];
1230: adponz = adpoi[i+1] - adpoi[i];
1231: adpdnz = adpdi[i+1] - adpdi[i];
1233: /* Correct indices from A_diag*P_diag */
1234: for(i1 = 0; i1 < adpdnz; i1++) {
1235: adpdJ[i1] += p_colstart;
1236: }
1237: /* Merge j-arrays of A_diag * P_loc_off and A_diag * P_loc_diag and A_off * P_oth */
1238: Merge3SortedArrays(adponz, adpoJ, adpdnz, adpdJ, aopnz, aopJ, &apnz, apJ);
1239: MatPreallocateSet(i+rstart, apnz, apJ, dnz, onz);
1241: aopJ += aopnz;
1242: adpoJ += adponz;
1243: adpdJ += adpdnz;
1244: apJ += apnz;
1245: api[i+1] = api[i] + apnz;
1246: }
1248: /* malloc apa to store dense row A[i,:]*P */
1249: PetscCalloc1(pN+2,&ptap->apa);
1251: /* create and assemble symbolic parallel matrix Cmpi */
1252: MatCreate(comm,&Cmpi);
1253: MatSetSizes(Cmpi,am,pn,PETSC_DETERMINE,PETSC_DETERMINE);
1254: MatSetBlockSizesFromMats(Cmpi,A,P);
1255: MatGetType(A,&mtype);
1256: MatSetType(Cmpi,mtype);
1257: MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);
1260: MatSetValues_MPIAIJ_CopyFromCSRFormat_Symbolic(Cmpi, apj, api);
1261: MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);
1262: MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);
1263: MatPreallocateFinalize(dnz,onz);
1266: ptap->destroy = Cmpi->ops->destroy;
1267: ptap->duplicate = Cmpi->ops->duplicate;
1268: Cmpi->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable;
1269: Cmpi->ops->destroy = MatDestroy_MPIAIJ_MatMatMult;
1271: /* attach the supporting struct to Cmpi for reuse */
1272: c = (Mat_MPIAIJ*)Cmpi->data;
1273: c->ap = ptap;
1274: *C = Cmpi;
1276: /* set MatInfo */
1277: afill = (PetscReal)api[am]/(adi[am]+aoi[am]+pi_loc[pm]+1) + 1.e-5;
1278: if (afill < 1.0) afill = 1.0;
1279: Cmpi->info.mallocs = nspacedouble;
1280: Cmpi->info.fill_ratio_given = fill;
1281: Cmpi->info.fill_ratio_needed = afill;
1283: #if defined(PETSC_USE_INFO)
1284: if (api[am]) {
1285: PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)afill);
1286: PetscInfo1(Cmpi,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.\n",(double)afill);
1287: } else {
1288: PetscInfo(Cmpi,"Empty matrix product\n");
1289: }
1290: #endif
1292: MatDestroy(&aopoth);
1293: MatDestroy(&adpd);
1294: PetscFree(j_temp);
1295: PetscFree(adpoj);
1296: PetscFree(adpoi);
1297: return(0);
1298: }
1301: /*-------------------------------------------------------------------------*/
1302: PetscErrorCode MatTransposeMatMult_MPIAIJ_MPIAIJ(Mat P,Mat A,MatReuse scall,PetscReal fill,Mat *C)
1303: {
1305: const char *algTypes[3] = {"scalable","nonscalable","matmatmult"};
1306: PetscInt aN=A->cmap->N,alg=1; /* set default algorithm */
1307: PetscBool flg;
1310: if (scall == MAT_INITIAL_MATRIX) {
1311: PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"MatTransposeMatMult","Mat");
1312: PetscOptionsEList("-mattransposematmult_via","Algorithmic approach","MatTransposeMatMult",algTypes,3,algTypes[1],&alg,&flg);
1313: PetscOptionsEnd();
1315: PetscLogEventBegin(MAT_TransposeMatMultSymbolic,P,A,0,0);
1316: switch (alg) {
1317: case 1:
1318: if (!flg && aN > 100000) { /* may switch to scalable algorithm as default */
1319: MatInfo Ainfo,Pinfo;
1320: PetscInt nz_local;
1321: PetscBool alg_scalable_loc=PETSC_FALSE,alg_scalable;
1322: MPI_Comm comm;
1324: MatGetInfo(A,MAT_LOCAL,&Ainfo);
1325: MatGetInfo(P,MAT_LOCAL,&Pinfo);
1326: nz_local = (PetscInt)(Ainfo.nz_allocated + Pinfo.nz_allocated); /* estimated local nonzero entries */
1328: if (aN > fill*nz_local) alg_scalable_loc = PETSC_TRUE;
1329: PetscObjectGetComm((PetscObject)A,&comm);
1330: MPIU_Allreduce(&alg_scalable_loc,&alg_scalable,1,MPIU_BOOL,MPI_LOR,comm);
1332: if (alg_scalable) {
1333: alg = 0; /* the scalable algorithm can be slower than the nonscalable algorithm */
1334: MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ(P,A,fill,C);
1335: break;
1336: }
1337: }
1338: MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(P,A,fill,C);
1339: break;
1340: case 2:
1341: {
1342: Mat Pt;
1343: Mat_APMPI *ptap;
1344: Mat_MPIAIJ *c;
1345: MatTranspose(P,MAT_INITIAL_MATRIX,&Pt);
1346: MatMatMult(Pt,A,MAT_INITIAL_MATRIX,fill,C);
1347: c = (Mat_MPIAIJ*)(*C)->data;
1348: ptap = c->ap;
1349: if (ptap) {
1350: ptap->Pt = Pt;
1351: (*C)->ops->freeintermediatedatastructures = MatFreeIntermediateDataStructures_MPIAIJ_AP;
1352: }
1353: (*C)->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_matmatmult;
1354: return(0);
1355: }
1356: break;
1357: default: /* scalable algorithm */
1358: MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ(P,A,fill,C);
1359: break;
1360: }
1361: PetscLogEventEnd(MAT_TransposeMatMultSymbolic,P,A,0,0);
1363: {
1364: Mat_MPIAIJ *c = (Mat_MPIAIJ*)(*C)->data;
1365: Mat_APMPI *ap = c->ap;
1366: PetscOptionsBegin(PetscObjectComm((PetscObject)(*C)),((PetscObject)(*C))->prefix,"MatFreeIntermediateDataStructures","Mat");
1367: ap->freestruct = PETSC_FALSE;
1368: PetscOptionsBool("-mat_freeintermediatedatastructures","Free intermediate data structures", "MatFreeIntermediateDataStructures",ap->freestruct,&ap->freestruct, NULL);
1369: PetscOptionsEnd();
1370: }
1371: }
1373: PetscLogEventBegin(MAT_TransposeMatMultNumeric,P,A,0,0);
1374: (*(*C)->ops->mattransposemultnumeric)(P,A,*C);
1375: PetscLogEventEnd(MAT_TransposeMatMultNumeric,P,A,0,0);
1376: return(0);
1377: }
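/*
   Usage sketch (hedged; error checking omitted):

     Mat P,A,C;                                                  both MPIAIJ; C = P^T * A
     MatTransposeMatMult(P,A,MAT_INITIAL_MATRIX,PETSC_DEFAULT,&C);
     MatTransposeMatMult(P,A,MAT_REUSE_MATRIX,PETSC_DEFAULT,&C);

   The runtime option -mattransposematmult_via <scalable|nonscalable|matmatmult> selects
   among the three branches above; "matmatmult" forms Pt explicitly and reuses MatMatMult().
*/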
1379: /* This routine only works when scall=MAT_REUSE_MATRIX! */
1380: PetscErrorCode MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_matmatmult(Mat P,Mat A,Mat C)
1381: {
1383: Mat_MPIAIJ *c=(Mat_MPIAIJ*)C->data;
1384: Mat_APMPI *ptap= c->ap;
1385: Mat Pt;
1388: if (!ptap->Pt) {
1389: MPI_Comm comm;
1390: PetscObjectGetComm((PetscObject)C,&comm);
1391: SETERRQ(comm,PETSC_ERR_ARG_WRONGSTATE,"PtA cannot be reused. Do not call MatFreeIntermediateDataStructures() or use '-mat_freeintermediatedatastructures'");
1392: }
1394: Pt=ptap->Pt;
1395: MatTranspose(P,MAT_REUSE_MATRIX,&Pt);
1396: MatMatMultNumeric(Pt,A,C);
1398: /* the supporting struct ptap consumes almost the same amount of memory as C=Pt*A; release it if C will not be updated again from A and P */
1399: if (ptap->freestruct) {
1400: MatFreeIntermediateDataStructures(C);
1401: }
1402: return(0);
1403: }
1405: /* This routine is modified from MatPtAPSymbolic_MPIAIJ_MPIAIJ() */
1406: PetscErrorCode MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(Mat P,Mat A,PetscReal fill,Mat *C)
1407: {
1408: PetscErrorCode ierr;
1409: Mat_APMPI *ptap;
1410: Mat_MPIAIJ *p=(Mat_MPIAIJ*)P->data,*c;
1411: MPI_Comm comm;
1412: PetscMPIInt size,rank;
1413: Mat Cmpi;
1414: PetscFreeSpaceList free_space=NULL,current_space=NULL;
1415: PetscInt pn=P->cmap->n,aN=A->cmap->N,an=A->cmap->n;
1416: PetscInt *lnk,i,k,nsend;
1417: PetscBT lnkbt;
1418: PetscMPIInt tagi,tagj,*len_si,*len_s,*len_ri,icompleted=0,nrecv;
1419: PetscInt **buf_rj,**buf_ri,**buf_ri_k;
1420: PetscInt len,proc,*dnz,*onz,*owners,nzi;
1421: PetscInt nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
1422: MPI_Request *swaits,*rwaits;
1423: MPI_Status *sstatus,rstatus;
1424: PetscLayout rowmap;
1425: PetscInt *owners_co,*coi,*coj; /* i and j arrays of (p->B)^T*A - used in the communication */
1426: PetscMPIInt *len_r,*id_r; /* lengths and source ranks of the messages to be received */
1427: PetscInt *Jptr,*prmap=p->garray,con,j,Crmax;
1428: Mat_SeqAIJ *a_loc,*c_loc,*c_oth;
1429: PetscTable ta;
1430: MatType mtype;
1431: const char *prefix;
1434: PetscObjectGetComm((PetscObject)A,&comm);
1435: MPI_Comm_size(comm,&size);
1436: MPI_Comm_rank(comm,&rank);
1438: /* create symbolic parallel matrix Cmpi */
1439: MatCreate(comm,&Cmpi);
1440: MatGetType(A,&mtype);
1441: MatSetType(Cmpi,mtype);
1443: Cmpi->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable;
1445: /* create struct Mat_APMPI and attach it to C later */
1446: PetscNew(&ptap);
1447: ptap->reuse = MAT_INITIAL_MATRIX;
1449: /* (0) compute Rd = Pd^T, Ro = Po^T */
1450: /* --------------------------------- */
1451: MatTranspose_SeqAIJ(p->A,MAT_INITIAL_MATRIX,&ptap->Rd);
1452: MatTranspose_SeqAIJ(p->B,MAT_INITIAL_MATRIX,&ptap->Ro);
1454: /* (1) compute symbolic A_loc */
1455: /* ---------------------------*/
1456: MatMPIAIJGetLocalMat(A,MAT_INITIAL_MATRIX,&ptap->A_loc);
1458: /* (2-1) compute symbolic C_oth = Ro*A_loc */
1459: /* ------------------------------------ */
1460: MatGetOptionsPrefix(A,&prefix);
1461: MatSetOptionsPrefix(ptap->Ro,prefix);
1462: MatAppendOptionsPrefix(ptap->Ro,"inner_offdiag_");
1463: MatMatMultSymbolic_SeqAIJ_SeqAIJ(ptap->Ro,ptap->A_loc,fill,&ptap->C_oth);
1465: /* (3) send coj of C_oth to other processors */
1466: /* ------------------------------------------ */
1467: /* determine row ownership */
1468: PetscLayoutCreate(comm,&rowmap);
1469: rowmap->n = pn;
1470: rowmap->bs = 1;
1471: PetscLayoutSetUp(rowmap);
1472: owners = rowmap->range;
1474: /* determine the number of messages to send, their lengths */
1475: PetscMalloc4(size,&len_s,size,&len_si,size,&sstatus,size+2,&owners_co);
1476: PetscArrayzero(len_s,size);
1477: PetscArrayzero(len_si,size);
1479: c_oth = (Mat_SeqAIJ*)ptap->C_oth->data;
1480: coi = c_oth->i; coj = c_oth->j;
1481: con = ptap->C_oth->rmap->n;
1482: proc = 0;
1483: for (i=0; i<con; i++) {
1484: while (prmap[i] >= owners[proc+1]) proc++;
1485: len_si[proc]++; /* num of rows in Co(=Pt*A) to be sent to [proc] */
1486: len_s[proc] += coi[i+1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
1487: }
1489: len = 0; /* max length of buf_si[], see (4) */
1490: owners_co[0] = 0;
1491: nsend = 0;
1492: for (proc=0; proc<size; proc++) {
1493: owners_co[proc+1] = owners_co[proc] + len_si[proc];
1494: if (len_s[proc]) {
1495: nsend++;
1496: len_si[proc] = 2*(len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
1497: len += len_si[proc];
1498: }
1499: }
1501: /* determine the number and length of messages to receive for coi and coj */
1502: PetscGatherNumberOfMessages(comm,NULL,len_s,&nrecv);
1503: PetscGatherMessageLengths2(comm,nsend,nrecv,len_s,len_si,&id_r,&len_r,&len_ri);
1505: /* post the Irecv and Isend of coj */
1506: PetscCommGetNewTag(comm,&tagj);
1507: PetscPostIrecvInt(comm,tagj,nrecv,id_r,len_r,&buf_rj,&rwaits);
1508: PetscMalloc1(nsend+1,&swaits);
1509: for (proc=0, k=0; proc<size; proc++) {
1510: if (!len_s[proc]) continue;
1511: i = owners_co[proc];
1512: MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);
1513: k++;
1514: }
1516: /* (2-2) compute symbolic C_loc = Rd*A_loc */
1517: /* ---------------------------------------- */
1518: MatSetOptionsPrefix(ptap->Rd,prefix);
1519: MatAppendOptionsPrefix(ptap->Rd,"inner_diag_");
1520: MatMatMultSymbolic_SeqAIJ_SeqAIJ(ptap->Rd,ptap->A_loc,fill,&ptap->C_loc);
1521: c_loc = (Mat_SeqAIJ*)ptap->C_loc->data;
1523: /* receives coj are complete */
1524: for (i=0; i<nrecv; i++) {
1525: MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);
1526: }
1527: PetscFree(rwaits);
1528: if (nsend) {MPI_Waitall(nsend,swaits,sstatus);}
1530: /* add received column indices into ta to update Crmax */
1531: a_loc = (Mat_SeqAIJ*)(ptap->A_loc)->data;
1533: /* create and initialize a table used to compute Crmax */
1534: PetscTableCreate(an,aN,&ta);
1535: MatRowMergeMax_SeqAIJ(a_loc,ptap->A_loc->rmap->N,ta);
1537: for (k=0; k<nrecv; k++) {/* k-th received message */
1538: Jptr = buf_rj[k];
1539: for (j=0; j<len_r[k]; j++) {
1540: PetscTableAdd(ta,*(Jptr+j)+1,1,INSERT_VALUES);
1541: }
1542: }
1543: PetscTableGetCount(ta,&Crmax);
1544: PetscTableDestroy(&ta);
1546: /* (4) send and recv coi */
1547: /*-----------------------*/
1548: PetscCommGetNewTag(comm,&tagi);
1549: PetscPostIrecvInt(comm,tagi,nrecv,id_r,len_ri,&buf_ri,&rwaits);
1550: PetscMalloc1(len+1,&buf_s);
1551: buf_si = buf_s; /* points to the beginning of k-th msg to be sent */
1552: for (proc=0,k=0; proc<size; proc++) {
1553: if (!len_s[proc]) continue;
1554: /* form outgoing message for i-structure:
1555: buf_si[0]: nrows to be sent
1556: [1:nrows]: row index (global)
1557: [nrows+1:2*nrows+1]: i-structure index
1558: */
1559: /*-------------------------------------------*/
1560: nrows = len_si[proc]/2 - 1; /* num of rows in Co to be sent to [proc] */
1561: buf_si_i = buf_si + nrows+1;
1562: buf_si[0] = nrows;
1563: buf_si_i[0] = 0;
1564: nrows = 0;
1565: for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
1566: nzi = coi[i+1] - coi[i];
1567: buf_si_i[nrows+1] = buf_si_i[nrows] + nzi; /* i-structure */
1568: buf_si[nrows+1] = prmap[i] -owners[proc]; /* local row index */
1569: nrows++;
1570: }
1571: MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);
1572: k++;
1573: buf_si += len_si[proc];
1574: }
1575: for (i=0; i<nrecv; i++) {
1576: MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);
1577: }
1578: PetscFree(rwaits);
1579: if (nsend) {MPI_Waitall(nsend,swaits,sstatus);}
1581: PetscFree4(len_s,len_si,sstatus,owners_co);
1582: PetscFree(len_ri);
1583: PetscFree(swaits);
1584: PetscFree(buf_s);
1586: /* (5) compute the local portion of Cmpi */
1587: /* ------------------------------------------ */
1588: /* set initial free space to be Crmax, sufficient for holding nonzeros in each row of Cmpi */
1589: PetscFreeSpaceGet(Crmax,&free_space);
1590: current_space = free_space;
1592: PetscMalloc3(nrecv,&buf_ri_k,nrecv,&nextrow,nrecv,&nextci);
1593: for (k=0; k<nrecv; k++) {
1594: buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
1595: nrows = *buf_ri_k[k];
1596: nextrow[k] = buf_ri_k[k] + 1; /* next row number of k-th recved i-structure */
1597: nextci[k] = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure */
1598: }
1600: MatPreallocateInitialize(comm,pn,an,dnz,onz);
1601: PetscLLCondensedCreate(Crmax,aN,&lnk,&lnkbt);
1602: for (i=0; i<pn; i++) {
1603: /* add C_loc into Cmpi */
1604: nzi = c_loc->i[i+1] - c_loc->i[i];
1605: Jptr = c_loc->j + c_loc->i[i];
1606: PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);
1608: /* add received col data into lnk */
1609: for (k=0; k<nrecv; k++) { /* k-th received message */
1610: if (i == *nextrow[k]) { /* i-th row */
1611: nzi = *(nextci[k]+1) - *nextci[k];
1612: Jptr = buf_rj[k] + *nextci[k];
1613: PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);
1614: nextrow[k]++; nextci[k]++;
1615: }
1616: }
1617: nzi = lnk[0];
1619: /* copy data into free space, then initialize lnk */
1620: PetscLLCondensedClean(aN,nzi,current_space->array,lnk,lnkbt);
1621: MatPreallocateSet(i+owners[rank],nzi,current_space->array,dnz,onz);
1622: }
1623: PetscFree3(buf_ri_k,nextrow,nextci);
1624: PetscLLDestroy(lnk,lnkbt);
1625: PetscFreeSpaceDestroy(free_space);
1627: /* local sizes and preallocation */
1628: MatSetSizes(Cmpi,pn,an,PETSC_DETERMINE,PETSC_DETERMINE);
1629: if (P->cmap->bs > 0) {PetscLayoutSetBlockSize(Cmpi->rmap,P->cmap->bs);}
1630: if (A->cmap->bs > 0) {PetscLayoutSetBlockSize(Cmpi->cmap,A->cmap->bs);}
1631: MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);
1632: MatPreallocateFinalize(dnz,onz);
1634:   /* free communication buffers and the temporary row layout */
1635: PetscFree(id_r);
1636: PetscFree(len_r);
1637: PetscFree(buf_ri[0]);
1638: PetscFree(buf_ri);
1639: PetscFree(buf_rj[0]);
1640: PetscFree(buf_rj);
1641: PetscLayoutDestroy(&rowmap);
1643: /* attach the supporting struct to Cmpi for reuse */
1644: c = (Mat_MPIAIJ*)Cmpi->data;
1645: c->ap = ptap;
1646: ptap->destroy = Cmpi->ops->destroy;
1648:   /* Cmpi is not ready for use - assembly will be done by MatTransposeMatMultNumeric() */
1649: Cmpi->assembled = PETSC_FALSE;
1650: Cmpi->ops->destroy = MatDestroy_MPIAIJ_PtAP;
1651: Cmpi->ops->freeintermediatedatastructures = MatFreeIntermediateDataStructures_MPIAIJ_AP;
1653: *C = Cmpi;
1654: return(0);
1655: }
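/*
   Illustrative usage sketch (not part of the original source): this symbolic routine is
   reached through the public MatTransposeMatMult() interface, and the Mat_APMPI struct
   attached to Cmpi lets the numeric phase be repeated without redoing the symbolic work.
   The fill value 2.0 below is a placeholder.

     Mat C;
     MatTransposeMatMult(P,A,MAT_INITIAL_MATRIX,2.0,&C);           // symbolic + numeric, C = P^T*A
     // ... change the entries of A and/or P, keeping their nonzero patterns ...
     MatTransposeMatMult(P,A,MAT_REUSE_MATRIX,PETSC_DEFAULT,&C);   // numeric phase only
     MatDestroy(&C);
*/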
1657: PetscErrorCode MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable(Mat P,Mat A,Mat C)
1658: {
1659: PetscErrorCode ierr;
1660: Mat_MPIAIJ *p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
1661: Mat_SeqAIJ *c_seq;
1662: Mat_APMPI *ptap = c->ap;
1663: Mat A_loc,C_loc,C_oth;
1664: PetscInt i,rstart,rend,cm,ncols,row;
1665: const PetscInt *cols;
1666: const PetscScalar *vals;
1669: if (!ptap->A_loc) {
1670: MPI_Comm comm;
1671: PetscObjectGetComm((PetscObject)C,&comm);
1672: SETERRQ(comm,PETSC_ERR_ARG_WRONGSTATE,"PtA cannot be reused. Do not call MatFreeIntermediateDataStructures() or use '-mat_freeintermediatedatastructures'");
1673: }
1675: MatZeroEntries(C);
1677: if (ptap->reuse == MAT_REUSE_MATRIX) {
1678: /* These matrices are obtained in MatTransposeMatMultSymbolic() */
1679: /* 1) get R = Pd^T, Ro = Po^T */
1680: /*----------------------------*/
1681: MatTranspose_SeqAIJ(p->A,MAT_REUSE_MATRIX,&ptap->Rd);
1682: MatTranspose_SeqAIJ(p->B,MAT_REUSE_MATRIX,&ptap->Ro);
1684: /* 2) compute numeric A_loc */
1685: /*--------------------------*/
1686: MatMPIAIJGetLocalMat(A,MAT_REUSE_MATRIX,&ptap->A_loc);
1687: }
1689: /* 3) C_loc = Rd*A_loc, C_oth = Ro*A_loc */
1690: A_loc = ptap->A_loc;
1691: ((ptap->C_loc)->ops->matmultnumeric)(ptap->Rd,A_loc,ptap->C_loc);
1692: ((ptap->C_oth)->ops->matmultnumeric)(ptap->Ro,A_loc,ptap->C_oth);
1693: C_loc = ptap->C_loc;
1694: C_oth = ptap->C_oth;
1696:   /* add C_loc and C_oth to C */
1697: MatGetOwnershipRange(C,&rstart,&rend);
1699: /* C_loc -> C */
1700: cm = C_loc->rmap->N;
1701: c_seq = (Mat_SeqAIJ*)C_loc->data;
1702: cols = c_seq->j;
1703: vals = c_seq->a;
1704: for (i=0; i<cm; i++) {
1705: ncols = c_seq->i[i+1] - c_seq->i[i];
1706: row = rstart + i;
1707: MatSetValues(C,1,&row,ncols,cols,vals,ADD_VALUES);
1708: cols += ncols; vals += ncols;
1709: }
1711: /* Co -> C, off-processor part */
1712: cm = C_oth->rmap->N;
1713: c_seq = (Mat_SeqAIJ*)C_oth->data;
1714: cols = c_seq->j;
1715: vals = c_seq->a;
1716: for (i=0; i<cm; i++) {
1717: ncols = c_seq->i[i+1] - c_seq->i[i];
1718: row = p->garray[i];
1719: MatSetValues(C,1,&row,ncols,cols,vals,ADD_VALUES);
1720: cols += ncols; vals += ncols;
1721: }
1722: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
1723: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
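  /*
     Note (explanatory, not part of the original source): rows of C_loc belong to this rank
     and are added at rstart+i, while rows of C_oth correspond to off-process rows of C;
     p->garray[i] translates the local index i into the owning rank's global row number, and
     the ADD_VALUES insertions are communicated and summed during the assembly calls above.
  */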
1725: ptap->reuse = MAT_REUSE_MATRIX;
1727:   /* the supporting struct ptap consumes almost the same amount of memory as C=P^T*A; release it if C will not be updated with new values of A and P */
1728: if (ptap->freestruct) {
1729: MatFreeIntermediateDataStructures(C);
1730: }
1731: return(0);
1732: }
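/*
   Note (explanatory, not part of the original source): freeing the intermediate data
   structures reclaims roughly the memory of C itself, but afterwards the product can no
   longer be updated with MAT_REUSE_MATRIX (see the ptap->A_loc guard at the top of this
   routine).  A hedged sketch of opting in explicitly:

     MatTransposeMatMult(P,A,MAT_INITIAL_MATRIX,2.0,&C);
     MatFreeIntermediateDataStructures(C);   // or run with -mat_freeintermediatedatastructures
     // C keeps its values, but a later MAT_REUSE_MATRIX call would error out
*/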
1734: PetscErrorCode MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ(Mat P,Mat A,Mat C)
1735: {
1736: PetscErrorCode ierr;
1737: Mat_Merge_SeqsToMPI *merge;
1738: Mat_MPIAIJ *p =(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
1739: Mat_SeqAIJ *pd=(Mat_SeqAIJ*)(p->A)->data,*po=(Mat_SeqAIJ*)(p->B)->data;
1740: Mat_APMPI *ptap;
1741: PetscInt *adj;
1742: PetscInt i,j,k,anz,pnz,row,*cj,nexta;
1743: MatScalar *ada,*ca,valtmp;
1744: PetscInt am =A->rmap->n,cm=C->rmap->n,pon=(p->B)->cmap->n;
1745: MPI_Comm comm;
1746: PetscMPIInt size,rank,taga,*len_s;
1747: PetscInt *owners,proc,nrows,**buf_ri_k,**nextrow,**nextci;
1748: PetscInt **buf_ri,**buf_rj;
1749: PetscInt cnz=0,*bj_i,*bi,*bj,bnz,nextcj; /* bi,bj,ba: local array of C(mpi mat) */
1750: MPI_Request *s_waits,*r_waits;
1751: MPI_Status *status;
1752: MatScalar **abuf_r,*ba_i,*pA,*coa,*ba;
1753: PetscInt *ai,*aj,*coi,*coj,*poJ,*pdJ;
1754: Mat A_loc;
1755: Mat_SeqAIJ *a_loc;
1758: PetscObjectGetComm((PetscObject)C,&comm);
1759: MPI_Comm_size(comm,&size);
1760: MPI_Comm_rank(comm,&rank);
1762: ptap = c->ap;
1763: if (!ptap->A_loc) SETERRQ(comm,PETSC_ERR_ARG_WRONGSTATE,"PtA cannot be reused. Do not call MatFreeIntermediateDataStructures() or use '-mat_freeintermediatedatastructures'");
1764: merge = ptap->merge;
1766: /* 2) compute numeric C_seq = P_loc^T*A_loc */
1767: /*------------------------------------------*/
1768: /* get data from symbolic products */
1769: coi = merge->coi; coj = merge->coj;
1770: PetscCalloc1(coi[pon]+1,&coa);
1771: bi = merge->bi; bj = merge->bj;
1772: owners = merge->rowmap->range;
1773: PetscCalloc1(bi[cm]+1,&ba);
1775: /* get A_loc by taking all local rows of A */
1776: A_loc = ptap->A_loc;
1777: MatMPIAIJGetLocalMat(A,MAT_REUSE_MATRIX,&A_loc);
1778: a_loc = (Mat_SeqAIJ*)(A_loc)->data;
1779: ai = a_loc->i;
1780: aj = a_loc->j;
1782: for (i=0; i<am; i++) {
1783: anz = ai[i+1] - ai[i];
1784: adj = aj + ai[i];
1785: ada = a_loc->a + ai[i];
1787: /* 2-b) Compute Cseq = P_loc[i,:]^T*A[i,:] using outer product */
1788: /*-------------------------------------------------------------*/
1789: /* put the value into Co=(p->B)^T*A (off-diagonal part, send to others) */
1790: pnz = po->i[i+1] - po->i[i];
1791: poJ = po->j + po->i[i];
1792: pA = po->a + po->i[i];
1793: for (j=0; j<pnz; j++) {
1794: row = poJ[j];
1795: cj = coj + coi[row];
1796: ca = coa + coi[row];
1797: /* perform sparse axpy */
1798: nexta = 0;
1799: valtmp = pA[j];
1800: for (k=0; nexta<anz; k++) {
1801: if (cj[k] == adj[nexta]) {
1802: ca[k] += valtmp*ada[nexta];
1803: nexta++;
1804: }
1805: }
1806: PetscLogFlops(2.0*anz);
1807: }
1809: /* put the value into Cd (diagonal part) */
1810: pnz = pd->i[i+1] - pd->i[i];
1811: pdJ = pd->j + pd->i[i];
1812: pA = pd->a + pd->i[i];
1813: for (j=0; j<pnz; j++) {
1814: row = pdJ[j];
1815: cj = bj + bi[row];
1816: ca = ba + bi[row];
1817: /* perform sparse axpy */
1818: nexta = 0;
1819: valtmp = pA[j];
1820: for (k=0; nexta<anz; k++) {
1821: if (cj[k] == adj[nexta]) {
1822: ca[k] += valtmp*ada[nexta];
1823: nexta++;
1824: }
1825: }
1826: PetscLogFlops(2.0*anz);
1827: }
1828: }
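  /*
     Note (explanatory, not part of the original source): for each local row i of A, the
     loops above scatter valtmp*A[i,:] into the rows of Co and Cd selected by the nonzeros
     of P[i,:].  Both cj (the symbolic pattern of the target row) and adj (the pattern of
     A[i,:]) are sorted, and adj is a subset of cj, so the single pass over cj with the
     nexta counter implements a sparse axpy.  For example, with cj = {1,4,6,9},
     adj = {4,9}, valtmp = 2.0 and ada = {0.5,1.0}, the updates are ca[1] += 1.0 and
     ca[3] += 2.0.
  */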
1830: /* 3) send and recv matrix values coa */
1831: /*------------------------------------*/
1832: buf_ri = merge->buf_ri;
1833: buf_rj = merge->buf_rj;
1834: len_s = merge->len_s;
1835: PetscCommGetNewTag(comm,&taga);
1836: PetscPostIrecvScalar(comm,taga,merge->nrecv,merge->id_r,merge->len_r,&abuf_r,&r_waits);
1838: PetscMalloc2(merge->nsend+1,&s_waits,size,&status);
1839: for (proc=0,k=0; proc<size; proc++) {
1840: if (!len_s[proc]) continue;
1841: i = merge->owners_co[proc];
1842: MPI_Isend(coa+coi[i],len_s[proc],MPIU_MATSCALAR,proc,taga,comm,s_waits+k);
1843: k++;
1844: }
1845: if (merge->nrecv) {MPI_Waitall(merge->nrecv,r_waits,status);}
1846: if (merge->nsend) {MPI_Waitall(merge->nsend,s_waits,status);}
1848: PetscFree2(s_waits,status);
1849: PetscFree(r_waits);
1850: PetscFree(coa);
1852: /* 4) insert local Cseq and received values into Cmpi */
1853: /*----------------------------------------------------*/
1854: PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);
1855: for (k=0; k<merge->nrecv; k++) {
1856: buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
1857: nrows = *(buf_ri_k[k]);
1858: nextrow[k] = buf_ri_k[k]+1; /* next row number of k-th recved i-structure */
1859:     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure */
1860: }
1862: for (i=0; i<cm; i++) {
1863: row = owners[rank] + i; /* global row index of C_seq */
1864: bj_i = bj + bi[i]; /* col indices of the i-th row of C */
1865: ba_i = ba + bi[i];
1866: bnz = bi[i+1] - bi[i];
1867: /* add received vals into ba */
1868: for (k=0; k<merge->nrecv; k++) { /* k-th received message */
1869: /* i-th row */
1870: if (i == *nextrow[k]) {
1871: cnz = *(nextci[k]+1) - *nextci[k];
1872: cj = buf_rj[k] + *(nextci[k]);
1873: ca = abuf_r[k] + *(nextci[k]);
1874: nextcj = 0;
1875: for (j=0; nextcj<cnz; j++) {
1876: if (bj_i[j] == cj[nextcj]) { /* bcol == ccol */
1877: ba_i[j] += ca[nextcj++];
1878: }
1879: }
1880: nextrow[k]++; nextci[k]++;
1881: PetscLogFlops(2.0*cnz);
1882: }
1883: }
1884: MatSetValues(C,1,&row,bnz,bj_i,ba_i,INSERT_VALUES);
1885: }
1886: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
1887: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
1889: PetscFree(ba);
1890: PetscFree(abuf_r[0]);
1891: PetscFree(abuf_r);
1892: PetscFree3(buf_ri_k,nextrow,nextci);
1894: if (ptap->freestruct) {
1895: MatFreeIntermediateDataStructures(C);
1896: }
1897: return(0);
1898: }
1900: PetscErrorCode MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ(Mat P,Mat A,PetscReal fill,Mat *C)
1901: {
1902: PetscErrorCode ierr;
1903: Mat Cmpi,A_loc,POt,PDt;
1904: Mat_APMPI *ptap;
1905: PetscFreeSpaceList free_space=NULL,current_space=NULL;
1906: Mat_MPIAIJ *p=(Mat_MPIAIJ*)P->data,*a=(Mat_MPIAIJ*)A->data,*c;
1907: PetscInt *pdti,*pdtj,*poti,*potj,*ptJ;
1908: PetscInt nnz;
1909: PetscInt *lnk,*owners_co,*coi,*coj,i,k,pnz,row;
1910: PetscInt am =A->rmap->n,pn=P->cmap->n;
1911: MPI_Comm comm;
1912: PetscMPIInt size,rank,tagi,tagj,*len_si,*len_s,*len_ri;
1913: PetscInt **buf_rj,**buf_ri,**buf_ri_k;
1914: PetscInt len,proc,*dnz,*onz,*owners;
1915: PetscInt nzi,*bi,*bj;
1916: PetscInt nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
1917: MPI_Request *swaits,*rwaits;
1918: MPI_Status *sstatus,rstatus;
1919: Mat_Merge_SeqsToMPI *merge;
1920: PetscInt *ai,*aj,*Jptr,anz,*prmap=p->garray,pon,nspacedouble=0,j;
1921: PetscReal afill =1.0,afill_tmp;
1922: PetscInt rstart = P->cmap->rstart,rmax,aN=A->cmap->N,Armax;
1923: Mat_SeqAIJ *a_loc,*pdt,*pot;
1924: PetscTable ta;
1925: MatType mtype;
1928: PetscObjectGetComm((PetscObject)A,&comm);
1929: /* check if matrix local sizes are compatible */
1930: if (A->rmap->rstart != P->rmap->rstart || A->rmap->rend != P->rmap->rend) SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, A (%D, %D) != P (%D,%D)",A->rmap->rstart,A->rmap->rend,P->rmap->rstart,P->rmap->rend);
1932: MPI_Comm_size(comm,&size);
1933: MPI_Comm_rank(comm,&rank);
1935:   /* create struct Mat_APMPI and attach it to C later */
1936: PetscNew(&ptap);
1938: /* get A_loc by taking all local rows of A */
1939: MatMPIAIJGetLocalMat(A,MAT_INITIAL_MATRIX,&A_loc);
1941: ptap->A_loc = A_loc;
1942: a_loc = (Mat_SeqAIJ*)(A_loc)->data;
1943: ai = a_loc->i;
1944: aj = a_loc->j;
1946: /* determine symbolic Co=(p->B)^T*A - send to others */
1947: /*----------------------------------------------------*/
1948: MatTransposeSymbolic_SeqAIJ(p->A,&PDt);
1949: pdt = (Mat_SeqAIJ*)PDt->data;
1950: pdti = pdt->i; pdtj = pdt->j;
1952: MatTransposeSymbolic_SeqAIJ(p->B,&POt);
1953: pot = (Mat_SeqAIJ*)POt->data;
1954: poti = pot->i; potj = pot->j;
1956: /* then, compute symbolic Co = (p->B)^T*A */
1957: pon = (p->B)->cmap->n; /* total num of rows to be sent to other processors
1958: >= (num of nonzero rows of C_seq) - pn */
1959: PetscMalloc1(pon+1,&coi);
1960: coi[0] = 0;
1962: /* set initial free space to be fill*(nnz(p->B) + nnz(A)) */
1963: nnz = PetscRealIntMultTruncate(fill,PetscIntSumTruncate(poti[pon],ai[am]));
1964: PetscFreeSpaceGet(nnz,&free_space);
1965: current_space = free_space;
1967:   /* create a table for computing Armax, then initialize a condensed linked list */
1968: PetscTableCreate(A->cmap->n + a->B->cmap->N,aN,&ta);
1969: MatRowMergeMax_SeqAIJ(a_loc,am,ta);
1970: PetscTableGetCount(ta,&Armax);
1972: PetscLLCondensedCreate_Scalable(Armax,&lnk);
1974: for (i=0; i<pon; i++) {
1975: pnz = poti[i+1] - poti[i];
1976: ptJ = potj + poti[i];
1977: for (j=0; j<pnz; j++) {
1978: row = ptJ[j]; /* row of A_loc == col of Pot */
1979: anz = ai[row+1] - ai[row];
1980: Jptr = aj + ai[row];
1981: /* add non-zero cols of AP into the sorted linked list lnk */
1982: PetscLLCondensedAddSorted_Scalable(anz,Jptr,lnk);
1983: }
1984: nnz = lnk[0];
1986: /* If free space is not available, double the total space in the list */
1987: if (current_space->local_remaining<nnz) {
1988:       PetscFreeSpaceGet(PetscIntSumTruncate(nnz,current_space->total_array_size),&current_space);
1989: nspacedouble++;
1990: }
1992:     /* copy data into free space, then initialize lnk */
1993: PetscLLCondensedClean_Scalable(nnz,current_space->array,lnk);
1995: current_space->array += nnz;
1996: current_space->local_used += nnz;
1997: current_space->local_remaining -= nnz;
1999: coi[i+1] = coi[i] + nnz;
2000: }
2002: PetscMalloc1(coi[pon]+1,&coj);
2003: PetscFreeSpaceContiguous(&free_space,coj);
2004: PetscLLCondensedDestroy_Scalable(lnk); /* must destroy to get a new one for C */
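  /*
     Note (explanatory, not part of the original source): coi/coj now hold the CSR index
     structure of Co = (p->B)^T*A_loc, whose pon rows are owned by other ranks; the steps
     below ship this structure (and, in the numeric routine, the matching values) to the
     owning processes.
  */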
2006: afill_tmp = (PetscReal)coi[pon]/(poti[pon] + ai[am]+1);
2007: if (afill_tmp > afill) afill = afill_tmp;
2009: /* send j-array (coj) of Co to other processors */
2010: /*----------------------------------------------*/
2011: /* determine row ownership */
2012: PetscNew(&merge);
2013: PetscLayoutCreate(comm,&merge->rowmap);
2015: merge->rowmap->n = pn;
2016: merge->rowmap->bs = 1;
2018: PetscLayoutSetUp(merge->rowmap);
2019: owners = merge->rowmap->range;
2021: /* determine the number of messages to send, their lengths */
2022: PetscCalloc1(size,&len_si);
2023: PetscCalloc1(size,&merge->len_s);
2025: len_s = merge->len_s;
2026: merge->nsend = 0;
2028: PetscMalloc1(size+2,&owners_co);
2030: proc = 0;
2031: for (i=0; i<pon; i++) {
2032: while (prmap[i] >= owners[proc+1]) proc++;
2033: len_si[proc]++; /* num of rows in Co to be sent to [proc] */
2034: len_s[proc] += coi[i+1] - coi[i];
2035: }
2037:   len = 0; /* total length of buf_s[] (sum of len_si[] over the sending procs) */
2038: owners_co[0] = 0;
2039: for (proc=0; proc<size; proc++) {
2040: owners_co[proc+1] = owners_co[proc] + len_si[proc];
2041: if (len_si[proc]) {
2042: merge->nsend++;
2043: len_si[proc] = 2*(len_si[proc] + 1);
2044: len += len_si[proc];
2045: }
2046: }
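  /*
     Note (explanatory, not part of the original source): len_si[proc] becomes 2*(nrows+1)
     because the i-structure message carries one leading count, nrows local row indices and
     nrows+1 running offsets (see the packing loop further below).
  */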
2048: /* determine the number and length of messages to receive for coi and coj */
2049: PetscGatherNumberOfMessages(comm,NULL,len_s,&merge->nrecv);
2050: PetscGatherMessageLengths2(comm,merge->nsend,merge->nrecv,len_s,len_si,&merge->id_r,&merge->len_r,&len_ri);
2052: /* post the Irecv and Isend of coj */
2053: PetscCommGetNewTag(comm,&tagj);
2054: PetscPostIrecvInt(comm,tagj,merge->nrecv,merge->id_r,merge->len_r,&buf_rj,&rwaits);
2055: PetscMalloc1(merge->nsend+1,&swaits);
2056: for (proc=0, k=0; proc<size; proc++) {
2057: if (!len_s[proc]) continue;
2058: i = owners_co[proc];
2059: MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);
2060: k++;
2061: }
2063: /* receives and sends of coj are complete */
2064: PetscMalloc1(size,&sstatus);
2065: for (i=0; i<merge->nrecv; i++) {
2066: PetscMPIInt icompleted;
2067: MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);
2068: }
2069: PetscFree(rwaits);
2070: if (merge->nsend) {MPI_Waitall(merge->nsend,swaits,sstatus);}
2072: /* add received column indices into table to update Armax */
2073: /* Armax can be as large as aN if a P[row,:] is dense, see src/ksp/ksp/examples/tutorials/ex56.c! */
2074: for (k=0; k<merge->nrecv; k++) {/* k-th received message */
2075: Jptr = buf_rj[k];
2076: for (j=0; j<merge->len_r[k]; j++) {
2077: PetscTableAdd(ta,*(Jptr+j)+1,1,INSERT_VALUES);
2078: }
2079: }
2080: PetscTableGetCount(ta,&Armax);
2081: /* printf("Armax %d, an %d + Bn %d = %d, aN %d\n",Armax,A->cmap->n,a->B->cmap->N,A->cmap->n+a->B->cmap->N,aN); */
2083: /* send and recv coi */
2084: /*-------------------*/
2085: PetscCommGetNewTag(comm,&tagi);
2086: PetscPostIrecvInt(comm,tagi,merge->nrecv,merge->id_r,len_ri,&buf_ri,&rwaits);
2087: PetscMalloc1(len+1,&buf_s);
2088: buf_si = buf_s; /* points to the beginning of k-th msg to be sent */
2089: for (proc=0,k=0; proc<size; proc++) {
2090: if (!len_s[proc]) continue;
2091: /* form outgoing message for i-structure:
2092: buf_si[0]: nrows to be sent
2093: [1:nrows]: row index (global)
2094: [nrows+1:2*nrows+1]: i-structure index
2095: */
2096: /*-------------------------------------------*/
2097: nrows = len_si[proc]/2 - 1;
2098: buf_si_i = buf_si + nrows+1;
2099: buf_si[0] = nrows;
2100: buf_si_i[0] = 0;
2101: nrows = 0;
2102: for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
2103: nzi = coi[i+1] - coi[i];
2104: buf_si_i[nrows+1] = buf_si_i[nrows] + nzi; /* i-structure */
2105: buf_si[nrows+1] = prmap[i] -owners[proc]; /* local row index */
2106: nrows++;
2107: }
2108: MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);
2109: k++;
2110: buf_si += len_si[proc];
2111: }
2112: i = merge->nrecv;
2113: while (i--) {
2114: PetscMPIInt icompleted;
2115: MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);
2116: }
2117: PetscFree(rwaits);
2118: if (merge->nsend) {MPI_Waitall(merge->nsend,swaits,sstatus);}
2119: PetscFree(len_si);
2120: PetscFree(len_ri);
2121: PetscFree(swaits);
2122: PetscFree(sstatus);
2123: PetscFree(buf_s);
2125: /* compute the local portion of C (mpi mat) */
2126: /*------------------------------------------*/
2127: /* allocate bi array and free space for accumulating nonzero column info */
2128: PetscMalloc1(pn+1,&bi);
2129: bi[0] = 0;
2131:   /* set initial free space to be fill*(nnz(P) + nnz(A)) */
2132: nnz = PetscRealIntMultTruncate(fill,PetscIntSumTruncate(pdti[pn],PetscIntSumTruncate(poti[pon],ai[am])));
2133: PetscFreeSpaceGet(nnz,&free_space);
2134: current_space = free_space;
2136: PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);
2137: for (k=0; k<merge->nrecv; k++) {
2138: buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
2139: nrows = *buf_ri_k[k];
2140: nextrow[k] = buf_ri_k[k] + 1; /* next row number of k-th recved i-structure */
2141:     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th received i-structure */
2142: }
2144: PetscLLCondensedCreate_Scalable(Armax,&lnk);
2145: MatPreallocateInitialize(comm,pn,A->cmap->n,dnz,onz);
2146: rmax = 0;
2147: for (i=0; i<pn; i++) {
2148: /* add pdt[i,:]*AP into lnk */
2149: pnz = pdti[i+1] - pdti[i];
2150: ptJ = pdtj + pdti[i];
2151: for (j=0; j<pnz; j++) {
2152:       row = ptJ[j]; /* row of A_loc == col of Pdt */
2153: anz = ai[row+1] - ai[row];
2154: Jptr = aj + ai[row];
2155: /* add non-zero cols of AP into the sorted linked list lnk */
2156: PetscLLCondensedAddSorted_Scalable(anz,Jptr,lnk);
2157: }
2159: /* add received col data into lnk */
2160: for (k=0; k<merge->nrecv; k++) { /* k-th received message */
2161: if (i == *nextrow[k]) { /* i-th row */
2162: nzi = *(nextci[k]+1) - *nextci[k];
2163: Jptr = buf_rj[k] + *nextci[k];
2164: PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk);
2165: nextrow[k]++; nextci[k]++;
2166: }
2167: }
2168: nnz = lnk[0];
2170: /* if free space is not available, make more free space */
2171: if (current_space->local_remaining<nnz) {
2172:       PetscFreeSpaceGet(PetscIntSumTruncate(nnz,current_space->total_array_size),&current_space);
2173: nspacedouble++;
2174: }
2175: /* copy data into free space, then initialize lnk */
2176: PetscLLCondensedClean_Scalable(nnz,current_space->array,lnk);
2177: MatPreallocateSet(i+owners[rank],nnz,current_space->array,dnz,onz);
2179: current_space->array += nnz;
2180: current_space->local_used += nnz;
2181: current_space->local_remaining -= nnz;
2183: bi[i+1] = bi[i] + nnz;
2184: if (nnz > rmax) rmax = nnz;
2185: }
2186: PetscFree3(buf_ri_k,nextrow,nextci);
2188: PetscMalloc1(bi[pn]+1,&bj);
2189: PetscFreeSpaceContiguous(&free_space,bj);
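  /*
     Note (explanatory, not part of the original source): bi/bj now hold the complete CSR
     column pattern of the locally owned rows of C = P^T*A, rmax is the longest such row,
     and dnz/onz carry the per-row diagonal/off-diagonal counts consumed by
     MatMPIAIJSetPreallocation() below.
  */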
2190: afill_tmp = (PetscReal)bi[pn]/(pdti[pn] + poti[pon] + ai[am]+1);
2191: if (afill_tmp > afill) afill = afill_tmp;
2192: PetscLLCondensedDestroy_Scalable(lnk);
2193: PetscTableDestroy(&ta);
2195: MatDestroy(&POt);
2196: MatDestroy(&PDt);
2198:   /* create symbolic parallel matrix Cmpi - why can this not be assembled in the Numeric part? */
2199: /*----------------------------------------------------------------------------------*/
2200: MatCreate(comm,&Cmpi);
2201: MatSetSizes(Cmpi,pn,A->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);
2202: MatSetBlockSizes(Cmpi,PetscAbs(P->cmap->bs),PetscAbs(A->cmap->bs));
2203: MatGetType(A,&mtype);
2204: MatSetType(Cmpi,mtype);
2205: MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);
2206: MatPreallocateFinalize(dnz,onz);
2207: MatSetBlockSize(Cmpi,1);
2208: for (i=0; i<pn; i++) {
2209: row = i + rstart;
2210: nnz = bi[i+1] - bi[i];
2211: Jptr = bj + bi[i];
2212: MatSetValues(Cmpi,1,&row,nnz,Jptr,NULL,INSERT_VALUES);
2213: }
2214: MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);
2215: MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);
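  /*
     Note (explanatory, not part of the original source): the loop above inserts only the
     column indices (the value array passed to MatSetValues() is NULL), so the assembled
     Cmpi carries the full nonzero pattern while the numerical entries are filled in later
     by MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ().
  */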
2216: merge->bi = bi;
2217: merge->bj = bj;
2218: merge->coi = coi;
2219: merge->coj = coj;
2220: merge->buf_ri = buf_ri;
2221: merge->buf_rj = buf_rj;
2222: merge->owners_co = owners_co;
2224: /* attach the supporting struct to Cmpi for reuse */
2225: c = (Mat_MPIAIJ*)Cmpi->data;
2227: c->ap = ptap;
2228: ptap->api = NULL;
2229: ptap->apj = NULL;
2230: ptap->merge = merge;
2231: ptap->apa = NULL;
2232: ptap->destroy = Cmpi->ops->destroy;
2233: ptap->duplicate = Cmpi->ops->duplicate;
2235: Cmpi->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ;
2236: Cmpi->ops->destroy = MatDestroy_MPIAIJ_PtAP;
2237: Cmpi->ops->freeintermediatedatastructures = MatFreeIntermediateDataStructures_MPIAIJ_AP;
2239: *C = Cmpi;
2240: #if defined(PETSC_USE_INFO)
2241: if (bi[pn] != 0) {
2242: PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)afill);
2243: PetscInfo1(Cmpi,"Use MatTransposeMatMult(A,B,MatReuse,%g,&C) for best performance.\n",(double)afill);
2244: } else {
2245: PetscInfo(Cmpi,"Empty matrix product\n");
2246: }
2247: #endif
2248: return(0);
2249: }
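/*
   Note (explanatory, not part of the original source): when PETSc is run with -info, the
   PetscInfo() calls above report the fill ratio that was actually needed; passing that
   value as the fill argument of a subsequent symbolic call avoids the nspacedouble
   reallocations.  A hedged sketch, with 3.2 standing in for the reported value:

     MatTransposeMatMult(P,A,MAT_INITIAL_MATRIX,3.2,&C);
*/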