Actual source code: mpiptap.c
petsc-3.13.6 2020-09-29
2: /*
3: Defines projective product routines where A is an MPIAIJ matrix
4: C = P^T * A * P
5: */
7: #include <../src/mat/impls/aij/seq/aij.h>
8: #include <../src/mat/utils/freespace.h>
9: #include <../src/mat/impls/aij/mpi/mpiaij.h>
10: #include <petscbt.h>
11: #include <petsctime.h>
12: #include <petsc/private/hashmapiv.h>
13: #include <petsc/private/hashseti.h>
14: #include <petscsf.h>
17: PetscErrorCode MatView_MPIAIJ_PtAP(Mat A,PetscViewer viewer)
18: {
19: PetscErrorCode ierr;
20: Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data;
21: Mat_APMPI *ptap=a->ap;
22: PetscBool iascii;
23: PetscViewerFormat format;
26: if (!ptap) {
27: /* hack: MatDuplicate() copies oldmat->ops->view to newmat, which is a base Mat class with a NULL ptap! */
28: A->ops->view = MatView_MPIAIJ;
29: (A->ops->view)(A,viewer);
30: return(0);
31: }
33: PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii);
34: if (iascii) {
35: PetscViewerGetFormat(viewer,&format);
36: if (format == PETSC_VIEWER_ASCII_INFO || format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
37: if (ptap->algType == 0) {
38: PetscViewerASCIIPrintf(viewer,"using scalable MatPtAP() implementation\n");
39: } else if (ptap->algType == 1) {
40: PetscViewerASCIIPrintf(viewer,"using nonscalable MatPtAP() implementation\n");
41: } else if (ptap->algType == 2) {
42: PetscViewerASCIIPrintf(viewer,"using allatonce MatPtAP() implementation\n");
43: } else if (ptap->algType == 3) {
44: PetscViewerASCIIPrintf(viewer,"using merged allatonce MatPtAP() implementation\n");
45: }
46: }
47: }
48: (ptap->view)(A,viewer);
49: return(0);
50: }
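/*
   Editor's sketch (not part of mpiptap.c): the INFO prints above are reached by
   viewing the product matrix C with an ASCII viewer in INFO format. All calls
   below are standard PETSc API; C is assumed to come from MatPtAP() on MPIAIJ
   matrices so that MatView_MPIAIJ_PtAP() is the installed view operation, and
   the helper name is illustrative only.
*/
static PetscErrorCode ViewPtAPAlgorithm(Mat C)
{
  PetscViewerPushFormat(PETSC_VIEWER_STDOUT_WORLD,PETSC_VIEWER_ASCII_INFO);
  MatView(C,PETSC_VIEWER_STDOUT_WORLD);   /* dispatches to MatView_MPIAIJ_PtAP() above */
  PetscViewerPopFormat(PETSC_VIEWER_STDOUT_WORLD);
  return(0);
}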
52: PetscErrorCode MatFreeIntermediateDataStructures_MPIAIJ_AP(Mat A)
53: {
54: PetscErrorCode ierr;
55: Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data;
56: Mat_APMPI *ptap=a->ap;
57: Mat_Merge_SeqsToMPI *merge;
60: if (!ptap) return(0);
62: PetscFree2(ptap->startsj_s,ptap->startsj_r);
63: PetscFree(ptap->bufa);
64: MatDestroy(&ptap->P_loc);
65: MatDestroy(&ptap->P_oth);
66: MatDestroy(&ptap->A_loc); /* used by MatTransposeMatMult() */
67: MatDestroy(&ptap->Rd);
68: MatDestroy(&ptap->Ro);
69: if (ptap->AP_loc) { /* used by alg_rap */
70: Mat_SeqAIJ *ap = (Mat_SeqAIJ*)(ptap->AP_loc)->data;
71: PetscFree(ap->i);
72: PetscFree2(ap->j,ap->a);
73: MatDestroy(&ptap->AP_loc);
74: } else { /* used by alg_ptap */
75: PetscFree(ptap->api);
76: PetscFree(ptap->apj);
77: }
78: MatDestroy(&ptap->C_loc);
79: MatDestroy(&ptap->C_oth);
80: if (ptap->apa) {PetscFree(ptap->apa);}
82: MatDestroy(&ptap->Pt);
84: merge=ptap->merge;
85: if (merge) { /* used by alg_ptap */
86: PetscFree(merge->id_r);
87: PetscFree(merge->len_s);
88: PetscFree(merge->len_r);
89: PetscFree(merge->bi);
90: PetscFree(merge->bj);
91: PetscFree(merge->buf_ri[0]);
92: PetscFree(merge->buf_ri);
93: PetscFree(merge->buf_rj[0]);
94: PetscFree(merge->buf_rj);
95: PetscFree(merge->coi);
96: PetscFree(merge->coj);
97: PetscFree(merge->owners_co);
98: PetscLayoutDestroy(&merge->rowmap);
99: PetscFree(ptap->merge);
100: }
101: ISLocalToGlobalMappingDestroy(&ptap->ltog);
103: PetscSFDestroy(&ptap->sf);
104: PetscFree(ptap->c_othi);
105: PetscFree(ptap->c_rmti);
106: return(0);
107: }
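/*
   Editor's sketch (illustration only, not part of the source): once C = P^T*A*P
   will no longer be updated, the supporting data freed by the routine above can
   be released explicitly, or with the run-time option
   -mat_freeintermediatedatastructures mentioned later in this file. A and P are
   placeholder MPIAIJ matrices.
*/
  MatPtAP(A,P,MAT_INITIAL_MATRIX,2.0,&C);   /* builds C and the Mat_APMPI struct */
  MatFreeIntermediateDataStructures(C);     /* invokes MatFreeIntermediateDataStructures_MPIAIJ_AP() for MPIAIJ */
  /* after this, MatPtAP(A,P,MAT_REUSE_MATRIX,...) on C is no longer possible */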
109: PetscErrorCode MatDestroy_MPIAIJ_PtAP(Mat A)
110: {
112: Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data;
113: Mat_APMPI *ptap=a->ap;
116: (*A->ops->freeintermediatedatastructures)(A);
117: (*ptap->destroy)(A); /* MatDestroy_MPIAIJ(A) */
118: PetscFree(ptap);
119: return(0);
120: }
122: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable(Mat A,Mat P,Mat C)
123: {
124: PetscErrorCode ierr;
125: Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
126: Mat_SeqAIJ *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
127: Mat_SeqAIJ *ap,*p_loc,*p_oth=NULL,*c_seq;
128: Mat_APMPI *ptap = c->ap;
129: Mat AP_loc,C_loc,C_oth;
130: PetscInt i,rstart,rend,cm,ncols,row,*api,*apj,am = A->rmap->n,apnz,nout;
131: PetscScalar *apa;
132: const PetscInt *cols;
133: const PetscScalar *vals;
136: if (!ptap->AP_loc) {
137: MPI_Comm comm;
138: PetscObjectGetComm((PetscObject)C,&comm);
139: SETERRQ(comm,PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be reused. Do not call MatFreeIntermediateDataStructures() or use '-mat_freeintermediatedatastructures'");
140: }
142: MatZeroEntries(C);
144: /* 1) get R = Pd^T,Ro = Po^T */
145: if (ptap->reuse == MAT_REUSE_MATRIX) {
146: MatTranspose(p->A,MAT_REUSE_MATRIX,&ptap->Rd);
147: MatTranspose(p->B,MAT_REUSE_MATRIX,&ptap->Ro);
148: }
150: /* 2) get AP_loc */
151: AP_loc = ptap->AP_loc;
152: ap = (Mat_SeqAIJ*)AP_loc->data;
154: /* 2-1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
155: /*-----------------------------------------------------*/
156: if (ptap->reuse == MAT_REUSE_MATRIX) {
157: /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
158: MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);
159: MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);
160: }
162: /* 2-2) compute numeric A_loc*P - dominating part */
163: /* ---------------------------------------------- */
164: /* get data from symbolic products */
165: p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
166: if (ptap->P_oth) p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
168: api = ap->i;
169: apj = ap->j;
170: ISLocalToGlobalMappingApply(ptap->ltog,api[AP_loc->rmap->n],apj,apj);
171: for (i=0; i<am; i++) {
172: /* AP[i,:] = A[i,:]*P = Ad*P_loc Ao*P_oth */
173: apnz = api[i+1] - api[i];
174: apa = ap->a + api[i];
175: PetscArrayzero(apa,apnz);
176: AProw_scalable(i,ad,ao,p_loc,p_oth,api,apj,apa);
177: }
178: ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,api[AP_loc->rmap->n],apj,&nout,apj);
179: if (api[AP_loc->rmap->n] != nout) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %D != %D\n",api[AP_loc->rmap->n],nout);
181: /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
182: /* Always use scalable version since we are in the MPI scalable version */
183: MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Rd,AP_loc,ptap->C_loc);
184: MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Ro,AP_loc,ptap->C_oth);
186: C_loc = ptap->C_loc;
187: C_oth = ptap->C_oth;
189: /* add C_loc and C_oth to C */
190: MatGetOwnershipRange(C,&rstart,&rend);
192: /* C_loc -> C */
193: cm = C_loc->rmap->N;
194: c_seq = (Mat_SeqAIJ*)C_loc->data;
195: cols = c_seq->j;
196: vals = c_seq->a;
197: ISLocalToGlobalMappingApply(ptap->ltog,c_seq->i[C_loc->rmap->n],c_seq->j,c_seq->j);
199: /* The (fast) MatSetValues_MPIAIJ_CopyFromCSRFormat function can only be used when C->was_assembled is PETSC_FALSE and */
200: /* when there are no off-processor parts. */
201: /* If was_assembled is true, then the statement aj[rowstart_diag+dnz_row] = mat_j[col] - cstart; in MatSetValues_MPIAIJ_CopyFromCSRFormat */
202: /* is no longer true. Then the more complex function MatSetValues_MPIAIJ() has to be used, where the column index is looked up from */
203: /* a table, and other, more complex stuff has to be done. */
204: if (C->assembled) {
205: C->was_assembled = PETSC_TRUE;
206: C->assembled = PETSC_FALSE;
207: }
208: if (C->was_assembled) {
209: for (i=0; i<cm; i++) {
210: ncols = c_seq->i[i+1] - c_seq->i[i];
211: row = rstart + i;
212: MatSetValues_MPIAIJ(C,1,&row,ncols,cols,vals,ADD_VALUES);
213: cols += ncols; vals += ncols;
214: }
215: } else {
216: MatSetValues_MPIAIJ_CopyFromCSRFormat(C,c_seq->j,c_seq->i,c_seq->a);
217: }
218: ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,c_seq->i[C_loc->rmap->n],c_seq->j,&nout,c_seq->j);
219: if (c_seq->i[C_loc->rmap->n] != nout) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %D != %D\n",c_seq->i[C_loc->rmap->n],nout);
221: /* Co -> C, off-processor part */
222: cm = C_oth->rmap->N;
223: c_seq = (Mat_SeqAIJ*)C_oth->data;
224: cols = c_seq->j;
225: vals = c_seq->a;
226: ISLocalToGlobalMappingApply(ptap->ltog,c_seq->i[C_oth->rmap->n],c_seq->j,c_seq->j);
227: for (i=0; i<cm; i++) {
228: ncols = c_seq->i[i+1] - c_seq->i[i];
229: row = p->garray[i];
230: MatSetValues(C,1,&row,ncols,cols,vals,ADD_VALUES);
231: cols += ncols; vals += ncols;
232: }
233: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
234: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
236: ptap->reuse = MAT_REUSE_MATRIX;
238: ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,c_seq->i[C_oth->rmap->n],c_seq->j,&nout,c_seq->j);
239: if (c_seq->i[C_oth->rmap->n] != nout) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %D != %D\n",c_seq->i[C_oth->rmap->n],nout);
241: /* the supporting struct ptap consumes almost the same amount of memory as C=PtAP; release it if C will not be updated by A and P */
242: if (ptap->freestruct) {
243: MatFreeIntermediateDataStructures(C);
244: }
245: return(0);
246: }
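/*
   Editor's sketch (illustration only): how the scalable symbolic/numeric pair
   above is normally driven through the public MatPtAP() interface, including
   numeric-only reuse. The option value "scalable" for -matptap_via is stated as
   an assumption about this release's option handling; A and P are placeholder
   MPIAIJ matrices.
*/
  Mat C;
  MatPtAP(A,P,MAT_INITIAL_MATRIX,2.0,&C);   /* symbolic + first numeric product */
  /* ... update the values (not the nonzero structure) of A and/or P ... */
  MatPtAP(A,P,MAT_REUSE_MATRIX,2.0,&C);     /* re-runs MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable() */
  MatDestroy(&C);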
248: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(Mat A,Mat P,PetscReal fill,Mat Cmpi)
249: {
250: PetscErrorCode ierr;
251: Mat_APMPI *ptap;
252: Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c;
253: MPI_Comm comm;
254: PetscMPIInt size,rank;
255: Mat P_loc,P_oth;
256: PetscFreeSpaceList free_space=NULL,current_space=NULL;
257: PetscInt am=A->rmap->n,pm=P->rmap->n,pN=P->cmap->N,pn=P->cmap->n;
258: PetscInt *lnk,i,k,pnz,row,nsend;
259: PetscMPIInt tagi,tagj,*len_si,*len_s,*len_ri,icompleted=0,nrecv;
260: PetscInt **buf_rj,**buf_ri,**buf_ri_k;
261: PetscInt len,proc,*dnz,*onz,*owners,nzi,nspacedouble;
262: PetscInt nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
263: MPI_Request *swaits,*rwaits;
264: MPI_Status *sstatus,rstatus;
265: PetscLayout rowmap;
266: PetscInt *owners_co,*coi,*coj; /* i and j array of (p->B)^T*A*P - used in the communication */
267: PetscMPIInt *len_r,*id_r; /* array of length of comm->size, store send/recv matrix values */
268: PetscInt *api,*apj,*Jptr,apnz,*prmap=p->garray,con,j,Crmax,*aj,*ai,*pi,nout;
269: Mat_SeqAIJ *p_loc,*p_oth=NULL,*ad=(Mat_SeqAIJ*)(a->A)->data,*ao=NULL,*c_loc,*c_oth;
270: PetscScalar *apv;
271: PetscTable ta;
272: MatType mtype;
273: const char *prefix;
274: #if defined(PETSC_USE_INFO)
275: PetscReal apfill;
276: #endif
279: PetscObjectGetComm((PetscObject)A,&comm);
280: MPI_Comm_size(comm,&size);
281: MPI_Comm_rank(comm,&rank);
283: if (size > 1) ao = (Mat_SeqAIJ*)(a->B)->data;
285: /* create symbolic parallel matrix Cmpi */
286: MatGetType(A,&mtype);
287: MatSetType(Cmpi,mtype);
289: /* create struct Mat_APMPI and attach it to C later */
290: PetscNew(&ptap);
291: ptap->reuse = MAT_INITIAL_MATRIX;
292: ptap->algType = 0;
294: /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
295: MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&P_oth);
296: /* get P_loc by taking all local rows of P */
297: MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&P_loc);
299: ptap->P_loc = P_loc;
300: ptap->P_oth = P_oth;
302: /* (0) compute Rd = Pd^T, Ro = Po^T */
303: /* --------------------------------- */
304: MatTranspose(p->A,MAT_INITIAL_MATRIX,&ptap->Rd);
305: MatTranspose(p->B,MAT_INITIAL_MATRIX,&ptap->Ro);
307: /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */
308: /* ----------------------------------------------------------------- */
309: p_loc = (Mat_SeqAIJ*)P_loc->data;
310: if (P_oth) p_oth = (Mat_SeqAIJ*)P_oth->data;
312: /* create and initialize a linked list */
313: PetscTableCreate(pn,pN,&ta); /* for computing AP_loc and Cmpi */
314: MatRowMergeMax_SeqAIJ(p_loc,P_loc->rmap->N,ta);
315: MatRowMergeMax_SeqAIJ(p_oth,P_oth->rmap->N,ta);
316: PetscTableGetCount(ta,&Crmax); /* Crmax = nnz(sum of Prows) */
318: PetscLLCondensedCreate_Scalable(Crmax,&lnk);
320: /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */
321: if (ao) {
322: PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],PetscIntSumTruncate(ao->i[am],p_loc->i[pm]))),&free_space);
323: } else {
324: PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],p_loc->i[pm])),&free_space);
325: }
326: current_space = free_space;
327: nspacedouble = 0;
329: PetscMalloc1(am+1,&api);
330: api[0] = 0;
331: for (i=0; i<am; i++) {
332: /* diagonal portion: Ad[i,:]*P */
333: ai = ad->i; pi = p_loc->i;
334: nzi = ai[i+1] - ai[i];
335: aj = ad->j + ai[i];
336: for (j=0; j<nzi; j++) {
337: row = aj[j];
338: pnz = pi[row+1] - pi[row];
339: Jptr = p_loc->j + pi[row];
340: /* add non-zero cols of P into the sorted linked list lnk */
341: PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);
342: }
343: /* off-diagonal portion: Ao[i,:]*P */
344: if (ao) {
345: ai = ao->i; pi = p_oth->i;
346: nzi = ai[i+1] - ai[i];
347: aj = ao->j + ai[i];
348: for (j=0; j<nzi; j++) {
349: row = aj[j];
350: pnz = pi[row+1] - pi[row];
351: Jptr = p_oth->j + pi[row];
352: PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);
353: }
354: }
355: apnz = lnk[0];
356: api[i+1] = api[i] + apnz;
358: /* if free space is not available, double the total space in the list */
359: if (current_space->local_remaining<apnz) {
360: PetscFreeSpaceGet(PetscIntSumTruncate(apnz,current_space->total_array_size),&current_space);
361: nspacedouble++;
362: }
364: /* Copy data into free space, then initialize lnk */
365: PetscLLCondensedClean_Scalable(apnz,current_space->array,lnk);
367: current_space->array += apnz;
368: current_space->local_used += apnz;
369: current_space->local_remaining -= apnz;
370: }
371: /* Allocate space for apj and apv, initialize apj, and */
372: /* destroy list of free space and other temporary array(s) */
373: PetscCalloc2(api[am],&apj,api[am],&apv);
374: PetscFreeSpaceContiguous(&free_space,apj);
375: PetscLLCondensedDestroy_Scalable(lnk);
377: /* Create AP_loc for reuse */
378: MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,am,pN,api,apj,apv,&ptap->AP_loc);
379: MatSeqAIJCompactOutExtraColumns_SeqAIJ(ptap->AP_loc, &ptap->ltog);
381: #if defined(PETSC_USE_INFO)
382: if (ao) {
383: apfill = (PetscReal)api[am]/(ad->i[am]+ao->i[am]+p_loc->i[pm]+1);
384: } else {
385: apfill = (PetscReal)api[am]/(ad->i[am]+p_loc->i[pm]+1);
386: }
387: ptap->AP_loc->info.mallocs = nspacedouble;
388: ptap->AP_loc->info.fill_ratio_given = fill;
389: ptap->AP_loc->info.fill_ratio_needed = apfill;
391: if (api[am]) {
392: PetscInfo3(ptap->AP_loc,"Scalable algorithm, AP_loc reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)apfill);
393: PetscInfo1(ptap->AP_loc,"Use MatPtAP(A,B,MatReuse,%g,&C) for best AP_loc performance.\n",(double)apfill);
394: } else {
395: PetscInfo(ptap->AP_loc,"Scalable algorithm, AP_loc is empty \n");
396: }
397: #endif
399: /* (2-1) compute symbolic Co = Ro*AP_loc */
400: /* -------------------------------------- */
401: MatProductCreate(ptap->Ro,ptap->AP_loc,NULL,&ptap->C_oth);
402: MatGetOptionsPrefix(A,&prefix);
403: MatSetOptionsPrefix(ptap->C_oth,prefix);
404: MatAppendOptionsPrefix(ptap->C_oth,"inner_offdiag_");
406: MatProductSetType(ptap->C_oth,MATPRODUCT_AB);
407: MatProductSetAlgorithm(ptap->C_oth,"sorted");
408: MatProductSetFill(ptap->C_oth,fill);
409: MatProductSetFromOptions(ptap->C_oth);
410: MatProductSymbolic(ptap->C_oth);
412: /* (3) send coj of C_oth to other processors */
413: /* ------------------------------------------ */
414: /* determine row ownership */
415: PetscLayoutCreate(comm,&rowmap);
416: rowmap->n = pn;
417: rowmap->bs = 1;
418: PetscLayoutSetUp(rowmap);
419: owners = rowmap->range;
421: /* determine the number of messages to send, their lengths */
422: PetscMalloc4(size,&len_s,size,&len_si,size,&sstatus,size+2,&owners_co);
423: PetscArrayzero(len_s,size);
424: PetscArrayzero(len_si,size);
426: c_oth = (Mat_SeqAIJ*)ptap->C_oth->data;
427: coi = c_oth->i; coj = c_oth->j;
428: con = ptap->C_oth->rmap->n;
429: proc = 0;
430: ISLocalToGlobalMappingApply(ptap->ltog,coi[con],coj,coj);
431: for (i=0; i<con; i++) {
432: while (prmap[i] >= owners[proc+1]) proc++;
433: len_si[proc]++; /* num of rows in Co(=Pt*AP) to be sent to [proc] */
434: len_s[proc] += coi[i+1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
435: }
437: len = 0; /* max length of buf_si[], see (4) */
438: owners_co[0] = 0;
439: nsend = 0;
440: for (proc=0; proc<size; proc++) {
441: owners_co[proc+1] = owners_co[proc] + len_si[proc];
442: if (len_s[proc]) {
443: nsend++;
444: len_si[proc] = 2*(len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
445: len += len_si[proc];
446: }
447: }
449: /* determine the number and length of messages to receive for coi and coj */
450: PetscGatherNumberOfMessages(comm,NULL,len_s,&nrecv);
451: PetscGatherMessageLengths2(comm,nsend,nrecv,len_s,len_si,&id_r,&len_r,&len_ri);
453: /* post the Irecv and Isend of coj */
454: PetscCommGetNewTag(comm,&tagj);
455: PetscPostIrecvInt(comm,tagj,nrecv,id_r,len_r,&buf_rj,&rwaits);
456: PetscMalloc1(nsend+1,&swaits);
457: for (proc=0, k=0; proc<size; proc++) {
458: if (!len_s[proc]) continue;
459: i = owners_co[proc];
460: MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);
461: k++;
462: }
464: /* (2-2) compute symbolic C_loc = Rd*AP_loc */
465: /* ---------------------------------------- */
466: MatProductCreate(ptap->Rd,ptap->AP_loc,NULL,&ptap->C_loc);
467: MatProductSetType(ptap->C_loc,MATPRODUCT_AB);
468: MatProductSetAlgorithm(ptap->C_loc,"default");
469: MatProductSetFill(ptap->C_loc,fill);
471: MatSetOptionsPrefix(ptap->C_loc,prefix);
472: MatAppendOptionsPrefix(ptap->C_loc,"inner_diag_");
474: MatProductSetFromOptions(ptap->C_loc);
475: MatProductSymbolic(ptap->C_loc);
477: c_loc = (Mat_SeqAIJ*)ptap->C_loc->data;
478: ISLocalToGlobalMappingApply(ptap->ltog,c_loc->i[ptap->C_loc->rmap->n],c_loc->j,c_loc->j);
480: /* receives coj are complete */
481: for (i=0; i<nrecv; i++) {
482: MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);
483: }
484: PetscFree(rwaits);
485: if (nsend) {MPI_Waitall(nsend,swaits,sstatus);}
487: /* add received column indices into ta to update Crmax */
488: for (k=0; k<nrecv; k++) {/* k-th received message */
489: Jptr = buf_rj[k];
490: for (j=0; j<len_r[k]; j++) {
491: PetscTableAdd(ta,*(Jptr+j)+1,1,INSERT_VALUES);
492: }
493: }
494: PetscTableGetCount(ta,&Crmax);
495: PetscTableDestroy(&ta);
497: /* (4) send and recv coi */
498: /*-----------------------*/
499: PetscCommGetNewTag(comm,&tagi);
500: PetscPostIrecvInt(comm,tagi,nrecv,id_r,len_ri,&buf_ri,&rwaits);
501: PetscMalloc1(len+1,&buf_s);
502: buf_si = buf_s; /* points to the beginning of k-th msg to be sent */
503: for (proc=0,k=0; proc<size; proc++) {
504: if (!len_s[proc]) continue;
505: /* form outgoing message for i-structure:
506: buf_si[0]: nrows to be sent
507: [1:nrows]: row index (global)
508: [nrows+1:2*nrows+1]: i-structure index
509: */
510: /*-------------------------------------------*/
511: nrows = len_si[proc]/2 - 1; /* num of rows in Co to be sent to [proc] */
512: buf_si_i = buf_si + nrows+1;
513: buf_si[0] = nrows;
514: buf_si_i[0] = 0;
515: nrows = 0;
516: for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
517: nzi = coi[i+1] - coi[i];
518: buf_si_i[nrows+1] = buf_si_i[nrows] + nzi; /* i-structure */
519: buf_si[nrows+1] = prmap[i] -owners[proc]; /* local row index */
520: nrows++;
521: }
522: MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);
523: k++;
524: buf_si += len_si[proc];
525: }
526: for (i=0; i<nrecv; i++) {
527: MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);
528: }
529: PetscFree(rwaits);
530: if (nsend) {MPI_Waitall(nsend,swaits,sstatus);}
532: PetscFree4(len_s,len_si,sstatus,owners_co);
533: PetscFree(len_ri);
534: PetscFree(swaits);
535: PetscFree(buf_s);
537: /* (5) compute the local portion of Cmpi */
538: /* ------------------------------------------ */
539: /* set initial free space to be Crmax, sufficient for holding the nonzeros in each row of Cmpi */
540: PetscFreeSpaceGet(Crmax,&free_space);
541: current_space = free_space;
543: PetscMalloc3(nrecv,&buf_ri_k,nrecv,&nextrow,nrecv,&nextci);
544: for (k=0; k<nrecv; k++) {
545: buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
546: nrows = *buf_ri_k[k];
547: nextrow[k] = buf_ri_k[k] + 1; /* next row number of k-th recved i-structure */
548: nextci[k] = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th received i-structure */
549: }
551: MatPreallocateInitialize(comm,pn,pn,dnz,onz);
552: PetscLLCondensedCreate_Scalable(Crmax,&lnk);
553: for (i=0; i<pn; i++) {
554: /* add C_loc into Cmpi */
555: nzi = c_loc->i[i+1] - c_loc->i[i];
556: Jptr = c_loc->j + c_loc->i[i];
557: PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk);
559: /* add received col data into lnk */
560: for (k=0; k<nrecv; k++) { /* k-th received message */
561: if (i == *nextrow[k]) { /* i-th row */
562: nzi = *(nextci[k]+1) - *nextci[k];
563: Jptr = buf_rj[k] + *nextci[k];
564: PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk);
565: nextrow[k]++; nextci[k]++;
566: }
567: }
568: nzi = lnk[0];
570: /* copy data into free space, then initialize lnk */
571: PetscLLCondensedClean_Scalable(nzi,current_space->array,lnk);
572: MatPreallocateSet(i+owners[rank],nzi,current_space->array,dnz,onz);
573: }
574: PetscFree3(buf_ri_k,nextrow,nextci);
575: PetscLLCondensedDestroy_Scalable(lnk);
576: PetscFreeSpaceDestroy(free_space);
578: /* local sizes and preallocation */
579: MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);
580: if (P->cmap->bs > 0) {
581: PetscLayoutSetBlockSize(Cmpi->rmap,P->cmap->bs);
582: PetscLayoutSetBlockSize(Cmpi->cmap,P->cmap->bs);
583: }
584: MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);
585: MatPreallocateFinalize(dnz,onz);
587: /* members in merge */
588: PetscFree(id_r);
589: PetscFree(len_r);
590: PetscFree(buf_ri[0]);
591: PetscFree(buf_ri);
592: PetscFree(buf_rj[0]);
593: PetscFree(buf_rj);
594: PetscLayoutDestroy(&rowmap);
596: /* attach the supporting struct to Cmpi for reuse */
597: c = (Mat_MPIAIJ*)Cmpi->data;
598: c->ap = ptap;
599: ptap->duplicate = Cmpi->ops->duplicate;
600: ptap->destroy = Cmpi->ops->destroy;
601: ptap->view = Cmpi->ops->view;
603: /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
604: Cmpi->assembled = PETSC_FALSE;
605: Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable;
606: Cmpi->ops->destroy = MatDestroy_MPIAIJ_PtAP;
607: Cmpi->ops->view = MatView_MPIAIJ_PtAP;
608: Cmpi->ops->freeintermediatedatastructures = MatFreeIntermediateDataStructures_MPIAIJ_AP;
610: nout = 0;
611: ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,c_oth->i[ptap->C_oth->rmap->n],c_oth->j,&nout,c_oth->j);
612: if (c_oth->i[ptap->C_oth->rmap->n] != nout) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %D != %D\n",c_oth->i[ptap->C_oth->rmap->n],nout);
613: ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,c_loc->i[ptap->C_loc->rmap->n],c_loc->j,&nout,c_loc->j);
614: if (c_loc->i[ptap->C_loc->rmap->n] != nout) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Incorrect mapping %D != %D\n",c_loc->i[ptap->C_loc->rmap->n],nout);
616: return(0);
617: }
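/*
   Editor's sketch (hypothetical helper, not PETSc API): the symbolic loop above
   merges, for each row i of A, the sparsity of every row of P selected by the
   nonzeros of A(i,:). The same merge is shown below for plain CSR arrays using a
   dense marker array (assumed zero-initialized by the caller) in place of the
   condensed linked list.
*/
static PetscInt APRowNnz(const PetscInt *ai,const PetscInt *aj,const PetscInt *pi,const PetscInt *pj,PetscInt i,PetscInt *marker)
{
  PetscInt j,k,nnz = 0;

  for (j=ai[i]; j<ai[i+1]; j++) {        /* each nonzero A(i,row)          */
    PetscInt row = aj[j];
    for (k=pi[row]; k<pi[row+1]; k++) {  /* merge the sparsity of P(row,:) */
      if (marker[pj[k]] != i+1) { marker[pj[k]] = i+1; nnz++; }  /* count a column only once */
    }
  }
  return nnz;
}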
619: PETSC_STATIC_INLINE PetscErrorCode MatPtAPSymbolicComputeOneRowOfAP_private(Mat A,Mat P,Mat P_oth,const PetscInt *map,PetscInt dof,PetscInt i,PetscHSetI dht,PetscHSetI oht)
620: {
621: Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data;
622: Mat_SeqAIJ *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_oth=(Mat_SeqAIJ*)P_oth->data,*pd=(Mat_SeqAIJ*)p->A->data,*po=(Mat_SeqAIJ*)p->B->data;
623: PetscInt *ai,nzi,j,*aj,row,col,*pi,*pj,pnz,nzpi,*p_othcols,k;
624: PetscInt pcstart,pcend,column,offset;
625: PetscErrorCode ierr;
628: pcstart = P->cmap->rstart;
629: pcstart *= dof;
630: pcend = P->cmap->rend;
631: pcend *= dof;
632: /* diagonal portion: Ad[i,:]*P */
633: ai = ad->i;
634: nzi = ai[i+1] - ai[i];
635: aj = ad->j + ai[i];
636: for (j=0; j<nzi; j++) {
637: row = aj[j];
638: offset = row%dof;
639: row /= dof;
640: nzpi = pd->i[row+1] - pd->i[row];
641: pj = pd->j + pd->i[row];
642: for (k=0; k<nzpi; k++) {
643: PetscHSetIAdd(dht,pj[k]*dof+offset+pcstart);
644: }
645: }
646: /* off diag P */
647: for (j=0; j<nzi; j++) {
648: row = aj[j];
649: offset = row%dof;
650: row /= dof;
651: nzpi = po->i[row+1] - po->i[row];
652: pj = po->j + po->i[row];
653: for (k=0; k<nzpi; k++) {
654: PetscHSetIAdd(oht,p->garray[pj[k]]*dof+offset);
655: }
656: }
658: /* off diagonal part: Ao[i, :]*P_oth */
659: if (ao) {
660: ai = ao->i;
661: pi = p_oth->i;
662: nzi = ai[i+1] - ai[i];
663: aj = ao->j + ai[i];
664: for (j=0; j<nzi; j++) {
665: row = aj[j];
666: offset = a->garray[row]%dof;
667: row = map[row];
668: pnz = pi[row+1] - pi[row];
669: p_othcols = p_oth->j + pi[row];
670: for (col=0; col<pnz; col++) {
671: column = p_othcols[col] * dof + offset;
672: if (column>=pcstart && column<pcend) {
673: PetscHSetIAdd(dht,column);
674: } else {
675: PetscHSetIAdd(oht,column);
676: }
677: }
678: }
679: } /* end if (ao) */
680: return(0);
681: }
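/*
   Editor's sketch: the PetscHSetI pattern used by the routine above, in
   isolation. All calls are the standard PetscHSetI API; cols and ncols are
   placeholder inputs standing in for the column indices gathered from Ad, Ao,
   P and P_oth.
*/
  PetscHSetI ht;
  PetscInt   k,nunique,*unique,off = 0;

  PetscHSetICreate(&ht);
  for (k=0; k<ncols; k++) { PetscHSetIAdd(ht,cols[k]); }   /* duplicates are ignored       */
  PetscHSetIGetSize(ht,&nunique);                          /* number of distinct columns   */
  PetscMalloc1(nunique,&unique);
  PetscHSetIGetElems(ht,&off,unique);                      /* off returns the count written */
  PetscHSetIDestroy(&ht);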
683: PETSC_STATIC_INLINE PetscErrorCode MatPtAPNumericComputeOneRowOfAP_private(Mat A,Mat P,Mat P_oth,const PetscInt *map,PetscInt dof,PetscInt i,PetscHMapIV hmap)
684: {
685: Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data;
686: Mat_SeqAIJ *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_oth=(Mat_SeqAIJ*)P_oth->data,*pd=(Mat_SeqAIJ*)p->A->data,*po=(Mat_SeqAIJ*)p->B->data;
687: PetscInt *ai,nzi,j,*aj,row,col,*pi,pnz,*p_othcols,pcstart,*pj,k,nzpi,offset;
688: PetscScalar ra,*aa,*pa;
689: PetscErrorCode ierr;
692: pcstart = P->cmap->rstart;
693: pcstart *= dof;
695: /* diagonal portion: Ad[i,:]*P */
696: ai = ad->i;
697: nzi = ai[i+1] - ai[i];
698: aj = ad->j + ai[i];
699: aa = ad->a + ai[i];
700: for (j=0; j<nzi; j++) {
701: ra = aa[j];
702: row = aj[j];
703: offset = row%dof;
704: row /= dof;
705: nzpi = pd->i[row+1] - pd->i[row];
706: pj = pd->j + pd->i[row];
707: pa = pd->a + pd->i[row];
708: for (k=0; k<nzpi; k++) {
709: PetscHMapIVAddValue(hmap,pj[k]*dof+offset+pcstart,ra*pa[k]);
710: }
711: PetscLogFlops(2.0*nzpi);
712: }
713: for (j=0; j<nzi; j++) {
714: ra = aa[j];
715: row = aj[j];
716: offset = row%dof;
717: row /= dof;
718: nzpi = po->i[row+1] - po->i[row];
719: pj = po->j + po->i[row];
720: pa = po->a + po->i[row];
721: for (k=0; k<nzpi; k++) {
722: PetscHMapIVAddValue(hmap,p->garray[pj[k]]*dof+offset,ra*pa[k]);
723: }
724: PetscLogFlops(2.0*nzpi);
725: }
727: /* off diagonal part: Ao[i, :]*P_oth */
728: if (ao) {
729: ai = ao->i;
730: pi = p_oth->i;
731: nzi = ai[i+1] - ai[i];
732: aj = ao->j + ai[i];
733: aa = ao->a + ai[i];
734: for (j=0; j<nzi; j++) {
735: row = aj[j];
736: offset = a->garray[row]%dof;
737: row = map[row];
738: ra = aa[j];
739: pnz = pi[row+1] - pi[row];
740: p_othcols = p_oth->j + pi[row];
741: pa = p_oth->a + pi[row];
742: for (col=0; col<pnz; col++) {
743: PetscHMapIVAddValue(hmap,p_othcols[col]*dof+offset,ra*pa[col]);
744: }
745: PetscLogFlops(2.0*pnz);
746: }
747: } /* end if (ao) */
749: return(0);
750: }
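/*
   Editor's sketch of the dof index arithmetic used by both private routines
   above: an interleaved (dof>1) row index i is split into a mesh-point index
   and a component offset, and a local P column pcol is expanded back to a
   global AP column with the same offset. i, dof, pcol and pcstart are
   placeholders for the corresponding variables above.
*/
  PetscInt offset = i%dof;                        /* which of the dof components per point        */
  PetscInt ii     = i/dof;                        /* block (mesh-point) row, used to index pd->i/po->i */
  PetscInt column = pcol*dof + offset + pcstart;  /* global column of AP for the diagonal part    */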
752: PetscErrorCode MatGetBrowsOfAcols_MPIXAIJ(Mat,Mat,PetscInt dof,MatReuse,Mat*);
754: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce(Mat A,Mat P,PetscInt dof,Mat C)
755: {
756: PetscErrorCode ierr;
757: Mat_MPIAIJ *p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
758: Mat_SeqAIJ *cd,*co,*po=(Mat_SeqAIJ*)p->B->data,*pd=(Mat_SeqAIJ*)p->A->data;
759: Mat_APMPI *ptap = c->ap;
760: PetscHMapIV hmap;
761: PetscInt i,j,jj,kk,nzi,*c_rmtj,voff,*c_othj,pn,pon,pcstart,pcend,ccstart,ccend,row,am,*poj,*pdj,*apindices,cmaxr,*c_rmtc,*c_rmtjj,*dcc,*occ,loc;
762: PetscScalar *c_rmta,*c_otha,*poa,*pda,*apvalues,*apvaluestmp,*c_rmtaa;
763: PetscInt offset,ii,pocol;
764: const PetscInt *mappingindices;
765: IS map;
766: MPI_Comm comm;
769: PetscObjectGetComm((PetscObject)A,&comm);
770: if (!ptap->P_oth) SETERRQ(comm,PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be reused. Do not call MatFreeIntermediateDataStructures() or use '-mat_freeintermediatedatastructures'");
772: MatZeroEntries(C);
774: /* Get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
775: /*-----------------------------------------------------*/
776: if (ptap->reuse == MAT_REUSE_MATRIX) {
777: /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
778: MatGetBrowsOfAcols_MPIXAIJ(A,P,dof,MAT_REUSE_MATRIX,&ptap->P_oth);
779: }
780: PetscObjectQuery((PetscObject)ptap->P_oth,"aoffdiagtopothmapping",(PetscObject*)&map);
782: MatGetLocalSize(p->B,NULL,&pon);
783: pon *= dof;
784: PetscCalloc2(ptap->c_rmti[pon],&c_rmtj,ptap->c_rmti[pon],&c_rmta);
785: MatGetLocalSize(A,&am,NULL);
786: cmaxr = 0;
787: for (i=0; i<pon; i++) {
788: cmaxr = PetscMax(cmaxr,ptap->c_rmti[i+1]-ptap->c_rmti[i]);
789: }
790: PetscCalloc4(cmaxr,&apindices,cmaxr,&apvalues,cmaxr,&apvaluestmp,pon,&c_rmtc);
791: PetscHMapIVCreate(&hmap);
792: PetscHMapIVResize(hmap,cmaxr);
793: ISGetIndices(map,&mappingindices);
794: for (i=0; i<am && pon; i++) {
795: PetscHMapIVClear(hmap);
796: offset = i%dof;
797: ii = i/dof;
798: nzi = po->i[ii+1] - po->i[ii];
799: if (!nzi) continue;
800: MatPtAPNumericComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,hmap);
801: voff = 0;
802: PetscHMapIVGetPairs(hmap,&voff,apindices,apvalues);
803: if (!voff) continue;
805: /* Form C(ii, :) */
806: poj = po->j + po->i[ii];
807: poa = po->a + po->i[ii];
808: for (j=0; j<nzi; j++) {
809: pocol = poj[j]*dof+offset;
810: c_rmtjj = c_rmtj + ptap->c_rmti[pocol];
811: c_rmtaa = c_rmta + ptap->c_rmti[pocol];
812: for (jj=0; jj<voff; jj++) {
813: apvaluestmp[jj] = apvalues[jj]*poa[j];
814: /* If the row is empty */
815: if (!c_rmtc[pocol]) {
816: c_rmtjj[jj] = apindices[jj];
817: c_rmtaa[jj] = apvaluestmp[jj];
818: c_rmtc[pocol]++;
819: } else {
820: PetscFindInt(apindices[jj],c_rmtc[pocol],c_rmtjj,&loc);
821: if (loc>=0){ /* hit */
822: c_rmtaa[loc] += apvaluestmp[jj];
823: PetscLogFlops(1.0);
824: } else { /* new element */
825: loc = -(loc+1);
826: /* Move data backward */
827: for (kk=c_rmtc[pocol]; kk>loc; kk--) {
828: c_rmtjj[kk] = c_rmtjj[kk-1];
829: c_rmtaa[kk] = c_rmtaa[kk-1];
830: }/* End kk */
831: c_rmtjj[loc] = apindices[jj];
832: c_rmtaa[loc] = apvaluestmp[jj];
833: c_rmtc[pocol]++;
834: }
835: }
836: PetscLogFlops(voff);
837: } /* End jj */
838: } /* End j */
839: } /* End i */
841: PetscFree4(apindices,apvalues,apvaluestmp,c_rmtc);
842: PetscHMapIVDestroy(&hmap);
844: MatGetLocalSize(P,NULL,&pn);
845: pn *= dof;
846: PetscCalloc2(ptap->c_othi[pn],&c_othj,ptap->c_othi[pn],&c_otha);
848: PetscSFReduceBegin(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPIU_REPLACE);
849: PetscSFReduceBegin(ptap->sf,MPIU_SCALAR,c_rmta,c_otha,MPIU_REPLACE);
850: MatGetOwnershipRangeColumn(P,&pcstart,&pcend);
851: pcstart = pcstart*dof;
852: pcend = pcend*dof;
853: cd = (Mat_SeqAIJ*)(c->A)->data;
854: co = (Mat_SeqAIJ*)(c->B)->data;
856: cmaxr = 0;
857: for (i=0; i<pn; i++) {
858: cmaxr = PetscMax(cmaxr,(cd->i[i+1]-cd->i[i])+(co->i[i+1]-co->i[i]));
859: }
860: PetscCalloc5(cmaxr,&apindices,cmaxr,&apvalues,cmaxr,&apvaluestmp,pn,&dcc,pn,&occ);
861: PetscHMapIVCreate(&hmap);
862: PetscHMapIVResize(hmap,cmaxr);
863: for (i=0; i<am && pn; i++) {
864: PetscHMapIVClear(hmap);
865: offset = i%dof;
866: ii = i/dof;
867: nzi = pd->i[ii+1] - pd->i[ii];
868: if (!nzi) continue;
869: MatPtAPNumericComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,hmap);
870: voff = 0;
871: PetscHMapIVGetPairs(hmap,&voff,apindices,apvalues);
872: if (!voff) continue;
873: /* Form C(ii, :) */
874: pdj = pd->j + pd->i[ii];
875: pda = pd->a + pd->i[ii];
876: for (j=0; j<nzi; j++) {
877: row = pcstart + pdj[j] * dof + offset;
878: for (jj=0; jj<voff; jj++) {
879: apvaluestmp[jj] = apvalues[jj]*pda[j];
880: }
881: PetscLogFlops(voff);
882: MatSetValues(C,1,&row,voff,apindices,apvaluestmp,ADD_VALUES);
883: }
884: }
885: ISRestoreIndices(map,&mappingindices);
886: MatGetOwnershipRangeColumn(C,&ccstart,&ccend);
887: PetscFree5(apindices,apvalues,apvaluestmp,dcc,occ);
888: PetscHMapIVDestroy(&hmap);
889: PetscSFReduceEnd(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPIU_REPLACE);
890: PetscSFReduceEnd(ptap->sf,MPIU_SCALAR,c_rmta,c_otha,MPIU_REPLACE);
891: PetscFree2(c_rmtj,c_rmta);
893: /* Add contributions from remote */
894: for (i = 0; i < pn; i++) {
895: row = i + pcstart;
896: MatSetValues(C,1,&row,ptap->c_othi[i+1]-ptap->c_othi[i],c_othj+ptap->c_othi[i],c_otha+ptap->c_othi[i],ADD_VALUES);
897: }
898: PetscFree2(c_othj,c_otha);
900: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
901: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
903: ptap->reuse = MAT_REUSE_MATRIX;
905: /* the supporting struct ptap consumes almost the same amount of memory as C=PtAP; release it if C will not be updated by A and P */
906: if (ptap->freestruct) {
907: MatFreeIntermediateDataStructures(C);
908: }
909: return(0);
910: }
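/*
   Editor's sketch (standalone) of the sorted insert-or-add step used above for
   c_rmtjj/c_rmtaa: PetscFindInt() returns the location on a hit and
   -(insertion point)-1 on a miss, so a miss shifts the tail and inserts.
   The helper name and signature are illustrative only.
*/
static void SortedAddOrInsert(PetscInt col,PetscScalar v,PetscInt *jj,PetscScalar *aa,PetscInt *n)
{
  PetscInt loc,kk;

  PetscFindInt(col,*n,jj,&loc);
  if (loc >= 0) {                      /* hit: accumulate into the existing entry   */
    aa[loc] += v;
  } else {                             /* miss: open a slot at the insertion point  */
    loc = -(loc+1);
    for (kk=*n; kk>loc; kk--) { jj[kk] = jj[kk-1]; aa[kk] = aa[kk-1]; }
    jj[loc] = col; aa[loc] = v; (*n)++;
  }
}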
912: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce(Mat A,Mat P,Mat C)
913: {
914: PetscErrorCode ierr;
918: MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce(A,P,1,C);
919: return(0);
920: }
922: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce_merged(Mat A,Mat P,PetscInt dof,Mat C)
923: {
924: PetscErrorCode ierr;
925: Mat_MPIAIJ *p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
926: Mat_SeqAIJ *cd,*co,*po=(Mat_SeqAIJ*)p->B->data,*pd=(Mat_SeqAIJ*)p->A->data;
927: Mat_APMPI *ptap = c->ap;
928: PetscHMapIV hmap;
929: PetscInt i,j,jj,kk,nzi,dnzi,*c_rmtj,voff,*c_othj,pn,pon,pcstart,pcend,row,am,*poj,*pdj,*apindices,cmaxr,*c_rmtc,*c_rmtjj,loc;
930: PetscScalar *c_rmta,*c_otha,*poa,*pda,*apvalues,*apvaluestmp,*c_rmtaa;
931: PetscInt offset,ii,pocol;
932: const PetscInt *mappingindices;
933: IS map;
934: MPI_Comm comm;
937: PetscObjectGetComm((PetscObject)A,&comm);
938: if (!ptap->P_oth) SETERRQ(comm,PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be reused. Do not call MatFreeIntermediateDataStructures() or use '-mat_freeintermediatedatastructures'");
940: MatZeroEntries(C);
942: /* Get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
943: /*-----------------------------------------------------*/
944: if (ptap->reuse == MAT_REUSE_MATRIX) {
945: /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
946: MatGetBrowsOfAcols_MPIXAIJ(A,P,dof,MAT_REUSE_MATRIX,&ptap->P_oth);
947: }
948: PetscObjectQuery((PetscObject)ptap->P_oth,"aoffdiagtopothmapping",(PetscObject*)&map);
949: MatGetLocalSize(p->B,NULL,&pon);
950: pon *= dof;
951: MatGetLocalSize(P,NULL,&pn);
952: pn *= dof;
954: PetscCalloc2(ptap->c_rmti[pon],&c_rmtj,ptap->c_rmti[pon],&c_rmta);
955: MatGetLocalSize(A,&am,NULL);
956: MatGetOwnershipRangeColumn(P,&pcstart,&pcend);
957: pcstart *= dof;
958: pcend *= dof;
959: cmaxr = 0;
960: for (i=0; i<pon; i++) {
961: cmaxr = PetscMax(cmaxr,ptap->c_rmti[i+1]-ptap->c_rmti[i]);
962: }
963: cd = (Mat_SeqAIJ*)(c->A)->data;
964: co = (Mat_SeqAIJ*)(c->B)->data;
965: for (i=0; i<pn; i++) {
966: cmaxr = PetscMax(cmaxr,(cd->i[i+1]-cd->i[i])+(co->i[i+1]-co->i[i]));
967: }
968: PetscCalloc4(cmaxr,&apindices,cmaxr,&apvalues,cmaxr,&apvaluestmp,pon,&c_rmtc);
969: PetscHMapIVCreate(&hmap);
970: PetscHMapIVResize(hmap,cmaxr);
971: ISGetIndices(map,&mappingindices);
972: for (i=0; i<am && (pon || pn); i++) {
973: PetscHMapIVClear(hmap);
974: offset = i%dof;
975: ii = i/dof;
976: nzi = po->i[ii+1] - po->i[ii];
977: dnzi = pd->i[ii+1] - pd->i[ii];
978: if (!nzi && !dnzi) continue;
979: MatPtAPNumericComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,hmap);
980: voff = 0;
981: PetscHMapIVGetPairs(hmap,&voff,apindices,apvalues);
982: if (!voff) continue;
984: /* Form remote C(ii, :) */
985: poj = po->j + po->i[ii];
986: poa = po->a + po->i[ii];
987: for (j=0; j<nzi; j++) {
988: pocol = poj[j]*dof+offset;
989: c_rmtjj = c_rmtj + ptap->c_rmti[pocol];
990: c_rmtaa = c_rmta + ptap->c_rmti[pocol];
991: for (jj=0; jj<voff; jj++) {
992: apvaluestmp[jj] = apvalues[jj]*poa[j];
993: /* If the row is empty */
994: if (!c_rmtc[pocol]) {
995: c_rmtjj[jj] = apindices[jj];
996: c_rmtaa[jj] = apvaluestmp[jj];
997: c_rmtc[pocol]++;
998: } else {
999: PetscFindInt(apindices[jj],c_rmtc[pocol],c_rmtjj,&loc);
1000: if (loc>=0){ /* hit */
1001: c_rmtaa[loc] += apvaluestmp[jj];
1002: PetscLogFlops(1.0);
1003: } else { /* new element */
1004: loc = -(loc+1);
1005: /* Move data backward */
1006: for (kk=c_rmtc[pocol]; kk>loc; kk--) {
1007: c_rmtjj[kk] = c_rmtjj[kk-1];
1008: c_rmtaa[kk] = c_rmtaa[kk-1];
1009: }/* End kk */
1010: c_rmtjj[loc] = apindices[jj];
1011: c_rmtaa[loc] = apvaluestmp[jj];
1012: c_rmtc[pocol]++;
1013: }
1014: }
1015: } /* End jj */
1016: PetscLogFlops(voff);
1017: } /* End j */
1019: /* Form local C(ii, :) */
1020: pdj = pd->j + pd->i[ii];
1021: pda = pd->a + pd->i[ii];
1022: for (j=0; j<dnzi; j++) {
1023: row = pcstart + pdj[j] * dof + offset;
1024: for (jj=0; jj<voff; jj++) {
1025: apvaluestmp[jj] = apvalues[jj]*pda[j];
1026: }/* End kk */
1027: PetscLogFlops(voff);
1028: MatSetValues(C,1,&row,voff,apindices,apvaluestmp,ADD_VALUES);
1029: }/* End j */
1030: } /* End i */
1032: ISRestoreIndices(map,&mappingindices);
1033: PetscFree4(apindices,apvalues,apvaluestmp,c_rmtc);
1034: PetscHMapIVDestroy(&hmap);
1035: PetscCalloc2(ptap->c_othi[pn],&c_othj,ptap->c_othi[pn],&c_otha);
1037: PetscSFReduceBegin(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPIU_REPLACE);
1038: PetscSFReduceBegin(ptap->sf,MPIU_SCALAR,c_rmta,c_otha,MPIU_REPLACE);
1039: PetscSFReduceEnd(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPIU_REPLACE);
1040: PetscSFReduceEnd(ptap->sf,MPIU_SCALAR,c_rmta,c_otha,MPIU_REPLACE);
1041: PetscFree2(c_rmtj,c_rmta);
1043: /* Add contributions from remote */
1044: for (i = 0; i < pn; i++) {
1045: row = i + pcstart;
1046: MatSetValues(C,1,&row,ptap->c_othi[i+1]-ptap->c_othi[i],c_othj+ptap->c_othi[i],c_otha+ptap->c_othi[i],ADD_VALUES);
1047: }
1048: PetscFree2(c_othj,c_otha);
1050: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
1051: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
1053: ptap->reuse = MAT_REUSE_MATRIX;
1055: /* the supporting struct ptap consumes almost the same amount of memory as C=PtAP; release it if C will not be updated by A and P */
1056: if (ptap->freestruct) {
1057: MatFreeIntermediateDataStructures(C);
1058: }
1059: return(0);
1060: }
1062: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged(Mat A,Mat P,Mat C)
1063: {
1064: PetscErrorCode ierr;
1068: MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce_merged(A,P,1,C);
1069: return(0);
1070: }
1072: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce(Mat A,Mat P,PetscInt dof,PetscReal fill,Mat Cmpi)
1073: {
1074: Mat_APMPI *ptap;
1075: Mat_MPIAIJ *p=(Mat_MPIAIJ*)P->data,*c;
1076: MPI_Comm comm;
1077: Mat_SeqAIJ *pd=(Mat_SeqAIJ*)p->A->data,*po=(Mat_SeqAIJ*)p->B->data;
1078: MatType mtype;
1079: PetscSF sf;
1080: PetscSFNode *iremote;
1081: PetscInt rootspacesize,*rootspace,*rootspaceoffsets,nleaves;
1082: const PetscInt *rootdegrees;
1083: PetscHSetI ht,oht,*hta,*hto;
1084: PetscInt pn,pon,*c_rmtc,i,j,nzi,htsize,htosize,*c_rmtj,off,*c_othj,rcvncols,sendncols,*c_rmtoffsets;
1085: PetscInt lidx,*rdj,col,pcstart,pcend,*dnz,*onz,am,arstart,arend,*poj,*pdj;
1086: PetscInt nalg=2,alg=0,offset,ii;
1087: PetscMPIInt owner;
1088: const PetscInt *mappingindices;
1089: PetscBool flg;
1090: const char *algTypes[2] = {"overlapping","merged"};
1091: IS map;
1092: PetscErrorCode ierr;
1095: PetscObjectGetComm((PetscObject)A,&comm);
1097: /* Create symbolic parallel matrix Cmpi */
1098: MatGetLocalSize(P,NULL,&pn);
1099: pn *= dof;
1100: MatGetType(A,&mtype);
1101: MatSetType(Cmpi,mtype);
1102: MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);
1104: PetscNew(&ptap);
1105: ptap->reuse = MAT_INITIAL_MATRIX;
1106: ptap->algType = 2;
1108: /* Get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
1109: MatGetBrowsOfAcols_MPIXAIJ(A,P,dof,MAT_INITIAL_MATRIX,&ptap->P_oth);
1110: PetscObjectQuery((PetscObject)ptap->P_oth,"aoffdiagtopothmapping",(PetscObject*)&map);
1111: /* This equals the number of off-diagonal columns of P */
1112: MatGetLocalSize(p->B,NULL,&pon);
1113: pon *= dof;
1114: /* offsets */
1115: PetscMalloc1(pon+1,&ptap->c_rmti);
1116: /* The number of columns we will send to remote ranks */
1117: PetscMalloc1(pon,&c_rmtc);
1118: PetscMalloc1(pon,&hta);
1119: for (i=0; i<pon; i++) {
1120: PetscHSetICreate(&hta[i]);
1121: }
1122: MatGetLocalSize(A,&am,NULL);
1123: MatGetOwnershipRange(A,&arstart,&arend);
1124: /* Create hash table to merge all columns for C(i, :) */
1125: PetscHSetICreate(&ht);
1127: ISGetIndices(map,&mappingindices);
1128: ptap->c_rmti[0] = 0;
1129: /* 2) Pass 1: calculate the size of C_rmt (a matrix that needs to be sent to other processors) */
1130: for (i=0; i<am && pon; i++) {
1131: /* Form one row of AP */
1132: PetscHSetIClear(ht);
1133: offset = i%dof;
1134: ii = i/dof;
1135: /* If the off diag is empty, we should not do any calculation */
1136: nzi = po->i[ii+1] - po->i[ii];
1137: if (!nzi) continue;
1139: MatPtAPSymbolicComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,ht,ht);
1140: PetscHSetIGetSize(ht,&htsize);
1141: /* If AP is empty, just continue */
1142: if (!htsize) continue;
1143: /* Form C(ii, :) */
1144: poj = po->j + po->i[ii];
1145: for (j=0; j<nzi; j++) {
1146: PetscHSetIUpdate(hta[poj[j]*dof+offset],ht);
1147: }
1148: }
1150: for (i=0; i<pon; i++) {
1151: PetscHSetIGetSize(hta[i],&htsize);
1152: ptap->c_rmti[i+1] = ptap->c_rmti[i] + htsize;
1153: c_rmtc[i] = htsize;
1154: }
1156: PetscMalloc1(ptap->c_rmti[pon],&c_rmtj);
1158: for (i=0; i<pon; i++) {
1159: off = 0;
1160: PetscHSetIGetElems(hta[i],&off,c_rmtj+ptap->c_rmti[i]);
1161: PetscHSetIDestroy(&hta[i]);
1162: }
1163: PetscFree(hta);
1165: PetscMalloc1(pon,&iremote);
1166: for (i=0; i<pon; i++) {
1167: owner = 0; lidx = 0;
1168: offset = i%dof;
1169: ii = i/dof;
1170: PetscLayoutFindOwnerIndex(P->cmap,p->garray[ii],&owner,&lidx);
1171: iremote[i].index = lidx*dof + offset;
1172: iremote[i].rank = owner;
1173: }
1175: PetscSFCreate(comm,&sf);
1176: PetscSFSetGraph(sf,pn,pon,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER);
1177: /* Reorder ranks properly so that the data handled by gather and scatter have the same order */
1178: PetscSFSetRankOrder(sf,PETSC_TRUE);
1179: PetscSFSetFromOptions(sf);
1180: PetscSFSetUp(sf);
1181: /* How many neighbors have contributions to my rows? */
1182: PetscSFComputeDegreeBegin(sf,&rootdegrees);
1183: PetscSFComputeDegreeEnd(sf,&rootdegrees);
1184: rootspacesize = 0;
1185: for (i = 0; i < pn; i++) {
1186: rootspacesize += rootdegrees[i];
1187: }
1188: PetscMalloc1(rootspacesize,&rootspace);
1189: PetscMalloc1(rootspacesize+1,&rootspaceoffsets);
1190: /* Get information from the leaves:
1191:  * the number of columns other processes contribute to my rows
1192:  */
1193: PetscSFGatherBegin(sf,MPIU_INT,c_rmtc,rootspace);
1194: PetscSFGatherEnd(sf,MPIU_INT,c_rmtc,rootspace);
1195: PetscFree(c_rmtc);
1196: PetscCalloc1(pn+1,&ptap->c_othi);
1197: /* The number of columns received for each row */
1198: ptap->c_othi[0] = 0;
1199: rootspacesize = 0;
1200: rootspaceoffsets[0] = 0;
1201: for (i = 0; i < pn; i++) {
1202: rcvncols = 0;
1203: for (j = 0; j<rootdegrees[i]; j++) {
1204: rcvncols += rootspace[rootspacesize];
1205: rootspaceoffsets[rootspacesize+1] = rootspaceoffsets[rootspacesize] + rootspace[rootspacesize];
1206: rootspacesize++;
1207: }
1208: ptap->c_othi[i+1] = ptap->c_othi[i] + rcvncols;
1209: }
1210: PetscFree(rootspace);
1212: PetscMalloc1(pon,&c_rmtoffsets);
1213: PetscSFScatterBegin(sf,MPIU_INT,rootspaceoffsets,c_rmtoffsets);
1214: PetscSFScatterEnd(sf,MPIU_INT,rootspaceoffsets,c_rmtoffsets);
1215: PetscSFDestroy(&sf);
1216: PetscFree(rootspaceoffsets);
1218: PetscCalloc1(ptap->c_rmti[pon],&iremote);
1219: nleaves = 0;
1220: for (i = 0; i<pon; i++) {
1221: owner = 0;
1222: ii = i/dof;
1223: PetscLayoutFindOwnerIndex(P->cmap,p->garray[ii],&owner,NULL);
1224: sendncols = ptap->c_rmti[i+1] - ptap->c_rmti[i];
1225: for (j=0; j<sendncols; j++) {
1226: iremote[nleaves].rank = owner;
1227: iremote[nleaves++].index = c_rmtoffsets[i] + j;
1228: }
1229: }
1230: PetscFree(c_rmtoffsets);
1231: PetscCalloc1(ptap->c_othi[pn],&c_othj);
1233: PetscSFCreate(comm,&ptap->sf);
1234: PetscSFSetGraph(ptap->sf,ptap->c_othi[pn],nleaves,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER);
1235: PetscSFSetFromOptions(ptap->sf);
1236: /* One to one map */
1237: PetscSFReduceBegin(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPIU_REPLACE);
1239: PetscMalloc2(pn,&dnz,pn,&onz);
1240: PetscHSetICreate(&oht);
1241: MatGetOwnershipRangeColumn(P,&pcstart,&pcend);
1242: pcstart *= dof;
1243: pcend *= dof;
1244: PetscMalloc2(pn,&hta,pn,&hto);
1245: for (i=0; i<pn; i++) {
1246: PetscHSetICreate(&hta[i]);
1247: PetscHSetICreate(&hto[i]);
1248: }
1249: /* Work on local part */
1250: /* 4) Pass 1: Estimate memory for C_loc */
1251: for (i=0; i<am && pn; i++) {
1252: PetscHSetIClear(ht);
1253: PetscHSetIClear(oht);
1254: offset = i%dof;
1255: ii = i/dof;
1256: nzi = pd->i[ii+1] - pd->i[ii];
1257: if (!nzi) continue;
1259: MatPtAPSymbolicComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,ht,oht);
1260: PetscHSetIGetSize(ht,&htsize);
1261: PetscHSetIGetSize(oht,&htosize);
1262: if (!(htsize+htosize)) continue;
1263: /* Form C(ii, :) */
1264: pdj = pd->j + pd->i[ii];
1265: for (j=0; j<nzi; j++) {
1266: PetscHSetIUpdate(hta[pdj[j]*dof+offset],ht);
1267: PetscHSetIUpdate(hto[pdj[j]*dof+offset],oht);
1268: }
1269: }
1271: ISRestoreIndices(map,&mappingindices);
1273: PetscHSetIDestroy(&ht);
1274: PetscHSetIDestroy(&oht);
1276: /* Get remote data */
1277: PetscSFReduceEnd(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPIU_REPLACE);
1278: PetscFree(c_rmtj);
1280: for (i = 0; i < pn; i++) {
1281: nzi = ptap->c_othi[i+1] - ptap->c_othi[i];
1282: rdj = c_othj + ptap->c_othi[i];
1283: for (j = 0; j < nzi; j++) {
1284: col = rdj[j];
1285: /* diag part */
1286: if (col>=pcstart && col<pcend) {
1287: PetscHSetIAdd(hta[i],col);
1288: } else { /* off diag */
1289: PetscHSetIAdd(hto[i],col);
1290: }
1291: }
1292: PetscHSetIGetSize(hta[i],&htsize);
1293: dnz[i] = htsize;
1294: PetscHSetIDestroy(&hta[i]);
1295: PetscHSetIGetSize(hto[i],&htsize);
1296: onz[i] = htsize;
1297: PetscHSetIDestroy(&hto[i]);
1298: }
1300: PetscFree2(hta,hto);
1301: PetscFree(c_othj);
1303: /* local sizes and preallocation */
1304: MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);
1305: MatSetBlockSizes(Cmpi,dof>1? dof: P->cmap->bs,dof>1? dof: P->cmap->bs);
1306: MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);
1307: MatSetUp(Cmpi);
1308: PetscFree2(dnz,onz);
1310: /* attach the supporting struct to Cmpi for reuse */
1311: c = (Mat_MPIAIJ*)Cmpi->data;
1312: c->ap = ptap;
1313: ptap->duplicate = Cmpi->ops->duplicate;
1314: ptap->destroy = Cmpi->ops->destroy;
1315: ptap->view = Cmpi->ops->view;
1317: /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
1318: Cmpi->assembled = PETSC_FALSE;
1319: /* pick an algorithm */
1320: PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"MatPtAP","Mat");
1321: alg = 0;
1322: PetscOptionsEList("-matptap_allatonce_via","PtAP allatonce numeric approach","MatPtAP",algTypes,nalg,algTypes[alg],&alg,&flg);
1323: PetscOptionsEnd();
1324: switch (alg) {
1325: case 0:
1326: Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce;
1327: break;
1328: case 1:
1329: Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged;
1330: break;
1331: default:
1332: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG," Unsupported allatonce numerical algorithm \n");
1333: }
1334: Cmpi->ops->destroy = MatDestroy_MPIAIJ_PtAP;
1335: Cmpi->ops->view = MatView_MPIAIJ_PtAP;
1336: Cmpi->ops->freeintermediatedatastructures = MatFreeIntermediateDataStructures_MPIAIJ_AP;
1337: return(0);
1338: }
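/*
   Editor's sketch of the PetscSF one-to-one reduce used above to ship each
   remote row's column indices to its owning rank: every leaf entry maps to a
   distinct root entry, so MPIU_REPLACE simply deposits the leaf data at the
   owner. nroots, nleaves, iremote, leafdata and rootdata are placeholders for
   ptap->c_othi[pn], nleaves, iremote, c_rmtj and c_othj above.
*/
  PetscSF sf;

  PetscSFCreate(comm,&sf);
  PetscSFSetGraph(sf,nroots,nleaves,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER);
  PetscSFSetFromOptions(sf);
  PetscSFReduceBegin(sf,MPIU_INT,leafdata,rootdata,MPIU_REPLACE);
  /* ... independent local work can be overlapped here, as the code above does ... */
  PetscSFReduceEnd(sf,MPIU_INT,leafdata,rootdata,MPIU_REPLACE);
  PetscSFDestroy(&sf);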
1340: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce(Mat A,Mat P,PetscReal fill,Mat C)
1341: {
1342: PetscErrorCode ierr;
1346: MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce(A,P,1,fill,C);
1347: return(0);
1348: }
1350: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce_merged(Mat A,Mat P,PetscInt dof,PetscReal fill,Mat Cmpi)
1351: {
1352: Mat_APMPI *ptap;
1353: Mat_MPIAIJ *p=(Mat_MPIAIJ*)P->data,*c;
1354: MPI_Comm comm;
1355: Mat_SeqAIJ *pd=(Mat_SeqAIJ*)p->A->data,*po=(Mat_SeqAIJ*)p->B->data;
1356: MatType mtype;
1357: PetscSF sf;
1358: PetscSFNode *iremote;
1359: PetscInt rootspacesize,*rootspace,*rootspaceoffsets,nleaves;
1360: const PetscInt *rootdegrees;
1361: PetscHSetI ht,oht,*hta,*hto,*htd;
1362: PetscInt pn,pon,*c_rmtc,i,j,nzi,dnzi,htsize,htosize,*c_rmtj,off,*c_othj,rcvncols,sendncols,*c_rmtoffsets;
1363: PetscInt lidx,*rdj,col,pcstart,pcend,*dnz,*onz,am,arstart,arend,*poj,*pdj;
1364: PetscInt nalg=2,alg=0,offset,ii;
1365: PetscMPIInt owner;
1366: PetscBool flg;
1367: const char *algTypes[2] = {"merged","overlapping"};
1368: const PetscInt *mappingindices;
1369: IS map;
1370: PetscErrorCode ierr;
1373: PetscObjectGetComm((PetscObject)A,&comm);
1375: /* Create symbolic parallel matrix Cmpi */
1376: MatGetLocalSize(P,NULL,&pn);
1377: pn *= dof;
1378: MatGetType(A,&mtype);
1379: MatSetType(Cmpi,mtype);
1380: MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);
1382: PetscNew(&ptap);
1383: ptap->reuse = MAT_INITIAL_MATRIX;
1384: ptap->algType = 3;
1386: /* 0) Get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
1387: MatGetBrowsOfAcols_MPIXAIJ(A,P,dof,MAT_INITIAL_MATRIX,&ptap->P_oth);
1388: PetscObjectQuery((PetscObject)ptap->P_oth,"aoffdiagtopothmapping",(PetscObject*)&map);
1390: /* This equals the number of off-diagonal columns of P */
1391: MatGetLocalSize(p->B,NULL,&pon);
1392: pon *= dof;
1393: /* offsets */
1394: PetscMalloc1(pon+1,&ptap->c_rmti);
1395: /* The number of columns we will send to remote ranks */
1396: PetscMalloc1(pon,&c_rmtc);
1397: PetscMalloc1(pon,&hta);
1398: for (i=0; i<pon; i++) {
1399: PetscHSetICreate(&hta[i]);
1400: }
1401: MatGetLocalSize(A,&am,NULL);
1402: MatGetOwnershipRange(A,&arstart,&arend);
1403: /* Create hash table to merge all columns for C(i, :) */
1404: PetscHSetICreate(&ht);
1405: PetscHSetICreate(&oht);
1406: PetscMalloc2(pn,&htd,pn,&hto);
1407: for (i=0; i<pn; i++) {
1408: PetscHSetICreate(&htd[i]);
1409: PetscHSetICreate(&hto[i]);
1410: }
1412: ISGetIndices(map,&mappingindices);
1413: ptap->c_rmti[0] = 0;
1414: /* 2) Pass 1: calculate the size of C_rmt (a matrix that needs to be sent to other processors) */
1415: for (i=0; i<am && (pon || pn); i++) {
1416: /* Form one row of AP */
1417: PetscHSetIClear(ht);
1418: PetscHSetIClear(oht);
1419: offset = i%dof;
1420: ii = i/dof;
1421: /* If the off diag is empty, we should not do any calculation */
1422: nzi = po->i[ii+1] - po->i[ii];
1423: dnzi = pd->i[ii+1] - pd->i[ii];
1424: if (!nzi && !dnzi) continue;
1426: MatPtAPSymbolicComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,ht,oht);
1427: PetscHSetIGetSize(ht,&htsize);
1428: PetscHSetIGetSize(oht,&htosize);
1429: /* If AP is empty, just continue */
1430: if (!(htsize+htosize)) continue;
1432: /* Form remote C(ii, :) */
1433: poj = po->j + po->i[ii];
1434: for (j=0; j<nzi; j++) {
1435: PetscHSetIUpdate(hta[poj[j]*dof+offset],ht);
1436: PetscHSetIUpdate(hta[poj[j]*dof+offset],oht);
1437: }
1439: /* Form local C(ii, :) */
1440: pdj = pd->j + pd->i[ii];
1441: for (j=0; j<dnzi; j++) {
1442: PetscHSetIUpdate(htd[pdj[j]*dof+offset],ht);
1443: PetscHSetIUpdate(hto[pdj[j]*dof+offset],oht);
1444: }
1445: }
1447: ISRestoreIndices(map,&mappingindices);
1449: PetscHSetIDestroy(&ht);
1450: PetscHSetIDestroy(&oht);
1452: for (i=0; i<pon; i++) {
1453: PetscHSetIGetSize(hta[i],&htsize);
1454: ptap->c_rmti[i+1] = ptap->c_rmti[i] + htsize;
1455: c_rmtc[i] = htsize;
1456: }
1458: PetscMalloc1(ptap->c_rmti[pon],&c_rmtj);
1460: for (i=0; i<pon; i++) {
1461: off = 0;
1462: PetscHSetIGetElems(hta[i],&off,c_rmtj+ptap->c_rmti[i]);
1463: PetscHSetIDestroy(&hta[i]);
1464: }
1465: PetscFree(hta);
1467: PetscMalloc1(pon,&iremote);
1468: for (i=0; i<pon; i++) {
1469: owner = 0; lidx = 0;
1470: offset = i%dof;
1471: ii = i/dof;
1472: PetscLayoutFindOwnerIndex(P->cmap,p->garray[ii],&owner,&lidx);
1473: iremote[i].index = lidx*dof+offset;
1474: iremote[i].rank = owner;
1475: }
1477: PetscSFCreate(comm,&sf);
1478: PetscSFSetGraph(sf,pn,pon,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER);
1479: /* Reorder ranks properly so that the data handled by gather and scatter have the same order */
1480: PetscSFSetRankOrder(sf,PETSC_TRUE);
1481: PetscSFSetFromOptions(sf);
1482: PetscSFSetUp(sf);
1483: /* How many neighbors have contributions to my rows? */
1484: PetscSFComputeDegreeBegin(sf,&rootdegrees);
1485: PetscSFComputeDegreeEnd(sf,&rootdegrees);
1486: rootspacesize = 0;
1487: for (i = 0; i < pn; i++) {
1488: rootspacesize += rootdegrees[i];
1489: }
1490: PetscMalloc1(rootspacesize,&rootspace);
1491: PetscMalloc1(rootspacesize+1,&rootspaceoffsets);
1492: /* Get information from the leaves:
1493:  * the number of columns other processes contribute to my rows
1494:  */
1495: PetscSFGatherBegin(sf,MPIU_INT,c_rmtc,rootspace);
1496: PetscSFGatherEnd(sf,MPIU_INT,c_rmtc,rootspace);
1497: PetscFree(c_rmtc);
1498: PetscMalloc1(pn+1,&ptap->c_othi);
1499: /* The number of columns received for each row */
1500: ptap->c_othi[0] = 0;
1501: rootspacesize = 0;
1502: rootspaceoffsets[0] = 0;
1503: for (i = 0; i < pn; i++) {
1504: rcvncols = 0;
1505: for (j = 0; j<rootdegrees[i]; j++) {
1506: rcvncols += rootspace[rootspacesize];
1507: rootspaceoffsets[rootspacesize+1] = rootspaceoffsets[rootspacesize] + rootspace[rootspacesize];
1508: rootspacesize++;
1509: }
1510: ptap->c_othi[i+1] = ptap->c_othi[i] + rcvncols;
1511: }
1512: PetscFree(rootspace);
1514: PetscMalloc1(pon,&c_rmtoffsets);
1515: PetscSFScatterBegin(sf,MPIU_INT,rootspaceoffsets,c_rmtoffsets);
1516: PetscSFScatterEnd(sf,MPIU_INT,rootspaceoffsets,c_rmtoffsets);
1517: PetscSFDestroy(&sf);
1518: PetscFree(rootspaceoffsets);
1520: PetscCalloc1(ptap->c_rmti[pon],&iremote);
1521: nleaves = 0;
1522: for (i = 0; i<pon; i++) {
1523: owner = 0;
1524: ii = i/dof;
1525: PetscLayoutFindOwnerIndex(P->cmap,p->garray[ii],&owner,NULL);
1526: sendncols = ptap->c_rmti[i+1] - ptap->c_rmti[i];
1527: for (j=0; j<sendncols; j++) {
1528: iremote[nleaves].rank = owner;
1529: iremote[nleaves++].index = c_rmtoffsets[i] + j;
1530: }
1531: }
1532: PetscFree(c_rmtoffsets);
1533: PetscCalloc1(ptap->c_othi[pn],&c_othj);
1535: PetscSFCreate(comm,&ptap->sf);
1536: PetscSFSetGraph(ptap->sf,ptap->c_othi[pn],nleaves,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER);
1537: PetscSFSetFromOptions(ptap->sf);
1538: /* One to one map */
1539: PetscSFReduceBegin(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPIU_REPLACE);
1540: /* Get remote data */
1541: PetscSFReduceEnd(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPIU_REPLACE);
1542: PetscFree(c_rmtj);
1543: PetscMalloc2(pn,&dnz,pn,&onz);
1544: MatGetOwnershipRangeColumn(P,&pcstart,&pcend);
1545: pcstart *= dof;
1546: pcend *= dof;
1547: for (i = 0; i < pn; i++) {
1548: nzi = ptap->c_othi[i+1] - ptap->c_othi[i];
1549: rdj = c_othj + ptap->c_othi[i];
1550: for (j = 0; j < nzi; j++) {
1551: col = rdj[j];
1552: /* diag part */
1553: if (col>=pcstart && col<pcend) {
1554: PetscHSetIAdd(htd[i],col);
1555: } else { /* off diag */
1556: PetscHSetIAdd(hto[i],col);
1557: }
1558: }
1559: PetscHSetIGetSize(htd[i],&htsize);
1560: dnz[i] = htsize;
1561: PetscHSetIDestroy(&htd[i]);
1562: PetscHSetIGetSize(hto[i],&htsize);
1563: onz[i] = htsize;
1564: PetscHSetIDestroy(&hto[i]);
1565: }
1567: PetscFree2(htd,hto);
1568: PetscFree(c_othj);
1570: /* local sizes and preallocation */
1571: MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);
1572: MatSetBlockSizes(Cmpi, dof>1? dof: P->cmap->bs,dof>1? dof: P->cmap->bs);
1573: MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);
1574: PetscFree2(dnz,onz);
1576: /* attach the supporting struct to Cmpi for reuse */
1577: c = (Mat_MPIAIJ*)Cmpi->data;
1578: c->ap = ptap;
1579: ptap->duplicate = Cmpi->ops->duplicate;
1580: ptap->destroy = Cmpi->ops->destroy;
1581: ptap->view = Cmpi->ops->view;
1583: /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
1584: Cmpi->assembled = PETSC_FALSE;
1585: /* pick an algorithm */
1586: PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"MatPtAP","Mat");
1587: alg = 0;
1588: PetscOptionsEList("-matptap_allatonce_via","PtAP allatonce numeric approach","MatPtAP",algTypes,nalg,algTypes[alg],&alg,&flg);
1589: PetscOptionsEnd();
1590: switch (alg) {
1591: case 0:
1592: Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged;
1593: break;
1594: case 1:
1595: Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce;
1596: break;
1597: default:
1598: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG," Unsupported allatonce numerical algorithm \n");
1599: }
1600: Cmpi->ops->destroy = MatDestroy_MPIAIJ_PtAP;
1601: Cmpi->ops->view = MatView_MPIAIJ_PtAP;
1602: Cmpi->ops->freeintermediatedatastructures = MatFreeIntermediateDataStructures_MPIAIJ_AP;
1603: return(0);
1604: }
1606: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce_merged(Mat A,Mat P,PetscReal fill,Mat C)
1607: {
1608: PetscErrorCode ierr;
1612: MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce_merged(A,P,1,fill,C);
1613: return(0);
1614: }
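/*
   Editor's sketch: both all-at-once symbolic routines above register the option
   -matptap_allatonce_via (values "overlapping" and "merged", as listed there) to
   pick the numeric kernel. A minimal way to set it programmatically before
   forming the product, equivalent to passing the option on the command line;
   A, P and C are placeholders.
*/
  PetscOptionsSetValue(NULL,"-matptap_allatonce_via","merged");
  MatPtAP(A,P,MAT_INITIAL_MATRIX,2.0,&C);   /* the symbolic routine reads the option */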
1616: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ(Mat A,Mat P,PetscReal fill,Mat Cmpi)
1617: {
1618: PetscErrorCode ierr;
1619: Mat_APMPI *ptap;
1620: Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c;
1621: MPI_Comm comm;
1622: PetscMPIInt size,rank;
1623: PetscFreeSpaceList free_space=NULL,current_space=NULL;
1624: PetscInt am=A->rmap->n,pm=P->rmap->n,pN=P->cmap->N,pn=P->cmap->n;
1625: PetscInt *lnk,i,k,pnz,row,nsend;
1626: PetscBT lnkbt;
1627: PetscMPIInt tagi,tagj,*len_si,*len_s,*len_ri,icompleted=0,nrecv;
1628: PetscInt **buf_rj,**buf_ri,**buf_ri_k;
1629: PetscInt len,proc,*dnz,*onz,*owners,nzi,nspacedouble;
1630: PetscInt nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
1631: MPI_Request *swaits,*rwaits;
1632: MPI_Status *sstatus,rstatus;
1633: PetscLayout rowmap;
1634: PetscInt *owners_co,*coi,*coj; /* i and j array of (p->B)^T*A*P - used in the communication */
1635: PetscMPIInt *len_r,*id_r; /* array of length of comm->size, store send/recv matrix values */
1636: PetscInt *api,*apj,*Jptr,apnz,*prmap=p->garray,con,j,ap_rmax=0,Crmax,*aj,*ai,*pi;
1637: Mat_SeqAIJ *p_loc,*p_oth=NULL,*ad=(Mat_SeqAIJ*)(a->A)->data,*ao=NULL,*c_loc,*c_oth;
1638: PetscScalar *apv;
1639: PetscTable ta;
1640: MatType mtype;
1641: const char *prefix;
1642: #if defined(PETSC_USE_INFO)
1643: PetscReal apfill;
1644: #endif
1647: PetscObjectGetComm((PetscObject)A,&comm);
1648: MPI_Comm_size(comm,&size);
1649: MPI_Comm_rank(comm,&rank);
1651: if (size > 1) ao = (Mat_SeqAIJ*)(a->B)->data;
1653: /* create symbolic parallel matrix Cmpi */
1654: MatGetType(A,&mtype);
1655: MatSetType(Cmpi,mtype);
1657: /* Do dense axpy in MatPtAPNumeric_MPIAIJ_MPIAIJ() */
1658: Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ;
1660: /* create struct Mat_APMPI and attach it to C later */
1661: PetscNew(&ptap);
1662: ptap->reuse = MAT_INITIAL_MATRIX;
1663: ptap->algType = 1;
1665: /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
1666: MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);
1667: /* get P_loc by taking all local rows of P */
1668: MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);
1670: /* (0) compute Rd = Pd^T, Ro = Po^T */
1671: /* --------------------------------- */
1672: MatTranspose(p->A,MAT_INITIAL_MATRIX,&ptap->Rd);
1673: MatTranspose(p->B,MAT_INITIAL_MATRIX,&ptap->Ro);
1675: /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */
1676: /* ----------------------------------------------------------------- */
1677: p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
1678: if (ptap->P_oth) p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
1680: /* create and initialize a linked list */
1681: PetscTableCreate(pn,pN,&ta); /* for computing AP_loc and Cmpi */
1682: MatRowMergeMax_SeqAIJ(p_loc,ptap->P_loc->rmap->N,ta);
1683: MatRowMergeMax_SeqAIJ(p_oth,ptap->P_oth->rmap->N,ta);
1684: PetscTableGetCount(ta,&Crmax); /* Crmax = nnz(sum of Prows) */
1685: /* printf("[%d] est %d, Crmax %d; pN %d\n",rank,5*(p_loc->rmax+p_oth->rmax + (PetscInt)(1.e-2*pN)),Crmax,pN); */
1687: PetscLLCondensedCreate(Crmax,pN,&lnk,&lnkbt);
1689: /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */
1690: if (ao) {
1691: PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],PetscIntSumTruncate(ao->i[am],p_loc->i[pm]))),&free_space);
1692: } else {
1693: PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],p_loc->i[pm])),&free_space);
1694: }
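/* [editor's note] The initial free-space estimate above is fill*(nnz(Ad)+nnz(Ao)+nnz(P_loc))
   column entries (the Ao term is dropped on a single process).  Worked example with
   hypothetical local sizes: fill = 2.0, nnz(Ad) = 100, nnz(Ao) = 20, nnz(P_loc) = 80 gives an
   initial buffer of 2.0*(100+20+80) = 400 entries; each time a row of AP does not fit, the
   loop below grows the list and counts the event in nspacedouble. */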
1695: current_space = free_space;
1696: nspacedouble = 0;
1698: PetscMalloc1(am+1,&api);
1699: api[0] = 0;
1700: for (i=0; i<am; i++) {
1701: /* diagonal portion: Ad[i,:]*P */
1702: ai = ad->i; pi = p_loc->i;
1703: nzi = ai[i+1] - ai[i];
1704: aj = ad->j + ai[i];
1705: for (j=0; j<nzi; j++) {
1706: row = aj[j];
1707: pnz = pi[row+1] - pi[row];
1708: Jptr = p_loc->j + pi[row];
1709: /* add non-zero cols of P into the sorted linked list lnk */
1710: PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);
1711: }
1712: /* off-diagonal portion: Ao[i,:]*P */
1713: if (ao) {
1714: ai = ao->i; pi = p_oth->i;
1715: nzi = ai[i+1] - ai[i];
1716: aj = ao->j + ai[i];
1717: for (j=0; j<nzi; j++) {
1718: row = aj[j];
1719: pnz = pi[row+1] - pi[row];
1720: Jptr = p_oth->j + pi[row];
1721: PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);
1722: }
1723: }
1724: apnz = lnk[0];
1725: api[i+1] = api[i] + apnz;
1726: if (ap_rmax < apnz) ap_rmax = apnz;
1728: /* if free space is not available, double the total space in the list */
1729: if (current_space->local_remaining<apnz) {
1730: PetscFreeSpaceGet(PetscIntSumTruncate(apnz,current_space->total_array_size),&current_space);
1731: nspacedouble++;
1732: }
1734: /* Copy data into free space, then initialize lnk */
1735: PetscLLCondensedClean(pN,apnz,current_space->array,lnk,lnkbt);
1737: current_space->array += apnz;
1738: current_space->local_used += apnz;
1739: current_space->local_remaining -= apnz;
1740: }
1741: /* Allocate space for apj and apv, initialize apj, and */
1742: /* destroy list of free space and other temporary array(s) */
1743: PetscMalloc2(api[am],&apj,api[am],&apv);
1744: PetscFreeSpaceContiguous(&free_space,apj);
1745: PetscLLDestroy(lnk,lnkbt);
1747: /* Create AP_loc for reuse */
1748: MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,am,pN,api,apj,apv,&ptap->AP_loc);
1750: #if defined(PETSC_USE_INFO)
1751: if (ao) {
1752: apfill = (PetscReal)api[am]/(ad->i[am]+ao->i[am]+p_loc->i[pm]+1);
1753: } else {
1754: apfill = (PetscReal)api[am]/(ad->i[am]+p_loc->i[pm]+1);
1755: }
1756: ptap->AP_loc->info.mallocs = nspacedouble;
1757: ptap->AP_loc->info.fill_ratio_given = fill;
1758: ptap->AP_loc->info.fill_ratio_needed = apfill;
1760: if (api[am]) {
1761: PetscInfo3(ptap->AP_loc,"Nonscalable algorithm, AP_loc reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)apfill);
1762: PetscInfo1(ptap->AP_loc,"Use MatPtAP(A,B,MatReuse,%g,&C) for best AP_loc performance.\n",(double)apfill);
1763: } else {
1764: PetscInfo(ptap->AP_loc,"Nonscalable algorithm, AP_loc is empty\n");
1765: }
1766: #endif
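/* [editor's note] apfill above is the observed ratio nnz(AP_loc)/(nnz(A's local rows)+nnz(P_loc)+1).
   Passing that value back as the fill argument of a later, similar product avoids the
   reallocations counted in nspacedouble.  Hedged user-level sketch:

     Mat C;
     MatPtAP(A,P,MAT_INITIAL_MATRIX,apfill,&C);   (use the reported fill ratio as the hint)
*/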
1768: /* (2-1) compute symbolic Co = Ro*AP_loc */
1769: /* ------------------------------------ */
1770: MatGetOptionsPrefix(A,&prefix);
1771: MatSetOptionsPrefix(ptap->Ro,prefix);
1772: MatAppendOptionsPrefix(ptap->Ro,"inner_offdiag_");
1774: MatProductCreate(ptap->Ro,ptap->AP_loc,NULL,&ptap->C_oth);
1775: MatGetOptionsPrefix(Cmpi,&prefix);
1776: MatSetOptionsPrefix(ptap->C_oth,prefix);
1777: MatAppendOptionsPrefix(ptap->C_oth,"inner_C_oth_");
1779: MatProductSetType(ptap->C_oth,MATPRODUCT_AB);
1780: MatProductSetAlgorithm(ptap->C_oth,"default");
1781: MatProductSetFill(ptap->C_oth,fill);
1782: MatProductSetFromOptions(ptap->C_oth);
1783: MatProductSymbolic(ptap->C_oth);
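/* [editor's note] The create/settype/setalgorithm/setfill/setfromoptions/symbolic chain above
   is the generic MatProduct workflow; the same sequence can be used for a user-level product.
   Minimal sketch for D = R*B with hypothetical Mats R and B:

     Mat D;
     MatProductCreate(R,B,NULL,&D);
     MatProductSetType(D,MATPRODUCT_AB);
     MatProductSetAlgorithm(D,"default");
     MatProductSetFill(D,PETSC_DEFAULT);
     MatProductSetFromOptions(D);
     MatProductSymbolic(D);
     MatProductNumeric(D);   (fills in the values once the symbolic pattern exists)
*/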
1785: /* (3) send coj of C_oth to other processors */
1786: /* ------------------------------------------ */
1787: /* determine row ownership */
1788: PetscLayoutCreate(comm,&rowmap);
1789: rowmap->n = pn;
1790: rowmap->bs = 1;
1791: PetscLayoutSetUp(rowmap);
1792: owners = rowmap->range;
1794: /* determine the number of messages to send, their lengths */
1795: PetscMalloc4(size,&len_s,size,&len_si,size,&sstatus,size+2,&owners_co);
1796: PetscArrayzero(len_s,size);
1797: PetscArrayzero(len_si,size);
1799: c_oth = (Mat_SeqAIJ*)ptap->C_oth->data;
1800: coi = c_oth->i; coj = c_oth->j;
1801: con = ptap->C_oth->rmap->n;
1802: proc = 0;
1803: for (i=0; i<con; i++) {
1804: while (prmap[i] >= owners[proc+1]) proc++;
1805: len_si[proc]++; /* num of rows in Co(=Pt*AP) to be sent to [proc] */
1806: len_s[proc] += coi[i+1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
1807: }
1809: len = 0; /* total length of buf_si[] over all sends, see (4) */
1810: owners_co[0] = 0;
1811: nsend = 0;
1812: for (proc=0; proc<size; proc++) {
1813: owners_co[proc+1] = owners_co[proc] + len_si[proc];
1814: if (len_s[proc]) {
1815: nsend++;
1816: len_si[proc] = 2*(len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
1817: len += len_si[proc];
1818: }
1819: }
1821: /* determine the number and length of messages to receive for coi and coj */
1822: PetscGatherNumberOfMessages(comm,NULL,len_s,&nrecv);
1823: PetscGatherMessageLengths2(comm,nsend,nrecv,len_s,len_si,&id_r,&len_r,&len_ri);
1825: /* post the Irecv and Isend of coj */
1826: PetscCommGetNewTag(comm,&tagj);
1827: PetscPostIrecvInt(comm,tagj,nrecv,id_r,len_r,&buf_rj,&rwaits);
1828: PetscMalloc1(nsend+1,&swaits);
1829: for (proc=0, k=0; proc<size; proc++) {
1830: if (!len_s[proc]) continue;
1831: i = owners_co[proc];
1832: MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);
1833: k++;
1834: }
1836: /* (2-2) compute symbolic C_loc = Rd*AP_loc */
1837: /* ---------------------------------------- */
1838: MatSetOptionsPrefix(ptap->Rd,prefix);
1839: MatAppendOptionsPrefix(ptap->Rd,"inner_diag_");
1841: MatProductCreate(ptap->Rd,ptap->AP_loc,NULL,&ptap->C_loc);
1842: MatGetOptionsPrefix(Cmpi,&prefix);
1843: MatSetOptionsPrefix(ptap->C_loc,prefix);
1844: MatAppendOptionsPrefix(ptap->C_loc,"inner_C_loc_");
1846: MatProductSetType(ptap->C_loc,MATPRODUCT_AB);
1847: MatProductSetAlgorithm(ptap->C_loc,"default");
1848: MatProductSetFill(ptap->C_loc,fill);
1849: MatProductSetFromOptions(ptap->C_loc);
1850: MatProductSymbolic(ptap->C_loc);
1852: c_loc = (Mat_SeqAIJ*)ptap->C_loc->data;
1854: /* receives of coj are complete */
1855: for (i=0; i<nrecv; i++) {
1856: MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);
1857: }
1858: PetscFree(rwaits);
1859: if (nsend) {MPI_Waitall(nsend,swaits,sstatus);}
1861: /* add received column indices into ta to update Crmax */
1862: for (k=0; k<nrecv; k++) {/* k-th received message */
1863: Jptr = buf_rj[k];
1864: for (j=0; j<len_r[k]; j++) {
1865: PetscTableAdd(ta,*(Jptr+j)+1,1,INSERT_VALUES);
1866: }
1867: }
1868: PetscTableGetCount(ta,&Crmax);
1869: PetscTableDestroy(&ta);
1871: /* (4) send and recv coi */
1872: /*-----------------------*/
1873: PetscCommGetNewTag(comm,&tagi);
1874: PetscPostIrecvInt(comm,tagi,nrecv,id_r,len_ri,&buf_ri,&rwaits);
1875: PetscMalloc1(len+1,&buf_s);
1876: buf_si = buf_s; /* points to the beginning of k-th msg to be sent */
1877: for (proc=0,k=0; proc<size; proc++) {
1878: if (!len_s[proc]) continue;
1879: /* form outgoing message for i-structure:
1880: buf_si[0]: nrows to be sent
1881: [1:nrows]: row index (local to the receiving process)
1882: [nrows+1:2*nrows+1]: i-structure index
1883: */
1884: /*-------------------------------------------*/
1885: nrows = len_si[proc]/2 - 1; /* num of rows in Co to be sent to [proc] */
1886: buf_si_i = buf_si + nrows+1;
1887: buf_si[0] = nrows;
1888: buf_si_i[0] = 0;
1889: nrows = 0;
1890: for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
1891: nzi = coi[i+1] - coi[i];
1892: buf_si_i[nrows+1] = buf_si_i[nrows] + nzi; /* i-structure */
1893: buf_si[nrows+1] = prmap[i] -owners[proc]; /* local row index */
1894: nrows++;
1895: }
1896: MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);
1897: k++;
1898: buf_si += len_si[proc];
1899: }
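/* [editor's note] Worked example of one outgoing i-structure message built above, assuming
   three rows of Co go to [proc] with 2, 0 and 4 nonzeros and local row indices 5, 8, 11 on
   the receiver:

     buf_si[] = { 3,  5, 8, 11,  0, 2, 2, 6 }
                  |   local row   running i-structure offsets
               nrows  indices     (buf_si_i, length nrows+1)

   i.e. len_si[proc] = 2*(3+1) = 8 entries, matching the sizing done in step (3). */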
1900: for (i=0; i<nrecv; i++) {
1901: MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);
1902: }
1903: PetscFree(rwaits);
1904: if (nsend) {MPI_Waitall(nsend,swaits,sstatus);}
1906: PetscFree4(len_s,len_si,sstatus,owners_co);
1907: PetscFree(len_ri);
1908: PetscFree(swaits);
1909: PetscFree(buf_s);
1911: /* (5) compute the local portion of Cmpi */
1912: /* ------------------------------------------ */
1913: /* set initial free space to be Crmax, sufficient for holding the nonzeros in each row of Cmpi */
1914: PetscFreeSpaceGet(Crmax,&free_space);
1915: current_space = free_space;
1917: PetscMalloc3(nrecv,&buf_ri_k,nrecv,&nextrow,nrecv,&nextci);
1918: for (k=0; k<nrecv; k++) {
1919: buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
1920: nrows = *buf_ri_k[k];
1921: nextrow[k] = buf_ri_k[k] + 1; /* next row number of k-th recved i-structure */
1922: nextci[k] = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure */
1923: }
1925: MatPreallocateInitialize(comm,pn,pn,dnz,onz);
1926: PetscLLCondensedCreate(Crmax,pN,&lnk,&lnkbt);
1927: for (i=0; i<pn; i++) {
1928: /* add C_loc into Cmpi */
1929: nzi = c_loc->i[i+1] - c_loc->i[i];
1930: Jptr = c_loc->j + c_loc->i[i];
1931: PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);
1933: /* add received col data into lnk */
1934: for (k=0; k<nrecv; k++) { /* k-th received message */
1935: if (i == *nextrow[k]) { /* i-th row */
1936: nzi = *(nextci[k]+1) - *nextci[k];
1937: Jptr = buf_rj[k] + *nextci[k];
1938: PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);
1939: nextrow[k]++; nextci[k]++;
1940: }
1941: }
1942: nzi = lnk[0];
1944: /* copy data into free space, then initialize lnk */
1945: PetscLLCondensedClean(pN,nzi,current_space->array,lnk,lnkbt);
1946: MatPreallocateSet(i+owners[rank],nzi,current_space->array,dnz,onz);
1947: }
1948: PetscFree3(buf_ri_k,nextrow,nextci);
1949: PetscLLDestroy(lnk,lnkbt);
1950: PetscFreeSpaceDestroy(free_space);
1952: /* local sizes and preallocation */
1953: MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);
1954: if (P->cmap->bs > 0) {
1955: PetscLayoutSetBlockSize(Cmpi->rmap,P->cmap->bs);
1956: PetscLayoutSetBlockSize(Cmpi->cmap,P->cmap->bs);
1957: }
1958: MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);
1959: MatPreallocateFinalize(dnz,onz);
1961: /* members in merge */
1962: PetscFree(id_r);
1963: PetscFree(len_r);
1964: PetscFree(buf_ri[0]);
1965: PetscFree(buf_ri);
1966: PetscFree(buf_rj[0]);
1967: PetscFree(buf_rj);
1968: PetscLayoutDestroy(&rowmap);
1970: /* attach the supporting struct to Cmpi for reuse */
1971: c = (Mat_MPIAIJ*)Cmpi->data;
1972: c->ap = ptap;
1973: ptap->duplicate = Cmpi->ops->duplicate;
1974: ptap->destroy = Cmpi->ops->destroy;
1975: ptap->view = Cmpi->ops->view;
1976: PetscCalloc1(pN,&ptap->apa);
1978: /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
1979: Cmpi->assembled = PETSC_FALSE;
1980: Cmpi->ops->destroy = MatDestroy_MPIAIJ_PtAP;
1981: Cmpi->ops->view = MatView_MPIAIJ_PtAP;
1982: Cmpi->ops->freeintermediatedatastructures = MatFreeIntermediateDataStructures_MPIAIJ_AP;
1983: return(0);
1984: }
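/* [editor's note] The symbolic routine above only builds the nonzero pattern and the supporting
   Mat_APMPI struct; values are filled by MatPtAPNumeric_MPIAIJ_MPIAIJ below.  At the user
   level the split is driven through MatPtAP; hedged sketch with an arbitrary fill of 2.0:

     Mat C;
     MatPtAP(A,P,MAT_INITIAL_MATRIX,2.0,&C);   (symbolic + first numeric)
     ... change the values, but not the pattern, of A ...
     MatPtAP(A,P,MAT_REUSE_MATRIX,2.0,&C);     (numeric only, reuses ptap)
*/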
1986: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ(Mat A,Mat P,Mat C)
1987: {
1988: PetscErrorCode ierr;
1989: Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
1990: Mat_SeqAIJ *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
1991: Mat_SeqAIJ *ap,*p_loc,*p_oth=NULL,*c_seq;
1992: Mat_APMPI *ptap = c->ap;
1993: Mat AP_loc,C_loc,C_oth;
1994: PetscInt i,rstart,rend,cm,ncols,row;
1995: PetscInt *api,*apj,am = A->rmap->n,j,col,apnz;
1996: PetscScalar *apa;
1997: const PetscInt *cols;
1998: const PetscScalar *vals;
2001: if (!ptap->AP_loc) {
2002: MPI_Comm comm;
2003: PetscObjectGetComm((PetscObject)C,&comm);
2004: SETERRQ(comm,PETSC_ERR_ARG_WRONGSTATE,"PtAP cannot be reused. Do not call MatFreeIntermediateDataStructures() or use '-mat_freeintermediatedatastructures'");
2005: }
2007: MatZeroEntries(C);
2008: /* 1) get R = Pd^T,Ro = Po^T */
2009: if (ptap->reuse == MAT_REUSE_MATRIX) {
2010: MatTranspose(p->A,MAT_REUSE_MATRIX,&ptap->Rd);
2011: MatTranspose(p->B,MAT_REUSE_MATRIX,&ptap->Ro);
2012: }
2014: /* 2) get AP_loc */
2015: AP_loc = ptap->AP_loc;
2016: ap = (Mat_SeqAIJ*)AP_loc->data;
2018: /* 2-1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
2019: /*-----------------------------------------------------*/
2020: if (ptap->reuse == MAT_REUSE_MATRIX) {
2021: /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
2022: MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);
2023: MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);
2024: }
2026: /* 2-2) compute numeric A_loc*P - dominating part */
2027: /* ---------------------------------------------- */
2028: /* get data from symbolic products */
2029: p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
2030: if (ptap->P_oth) {
2031: p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
2032: }
2033: apa = ptap->apa;
2034: api = ap->i;
2035: apj = ap->j;
2036: for (i=0; i<am; i++) {
2037: /* AP[i,:] = A[i,:]*P = Ad*P_loc + Ao*P_oth */
2038: AProw_nonscalable(i,ad,ao,p_loc,p_oth,apa);
2039: apnz = api[i+1] - api[i];
2040: for (j=0; j<apnz; j++) {
2041: col = apj[j+api[i]];
2042: ap->a[j+ap->i[i]] = apa[col];
2043: apa[col] = 0.0;
2044: }
2045: }
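/* [editor's note] AProw_nonscalable() scatters row i of A_loc*P into the dense work array apa
   (length pN, allocated in the symbolic phase); the loop above then gathers only the positions
   listed in apj back into the compressed row of AP_loc and zeroes them again, so apa stays
   clean for the next row.  Sketch of the gather step in isolation, assuming apa[] already
   holds the dense row and api/apj describe its sparsity:

     for (j=0; j<apnz; j++) {
       col             = apj[j+api[i]];
       ap->a[j+api[i]] = apa[col];    (copy the dense entry into CSR storage)
       apa[col]        = 0.0;         (reset for the next row)
     }
*/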
2046: /* We have modified the contents of local matrix AP_loc and must increase its ObjectState, since we are not doing AssemblyBegin/End on it. */
2047: PetscObjectStateIncrease((PetscObject)AP_loc);
2049: /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
2050: ((ptap->C_loc)->ops->matmultnumeric)(ptap->Rd,AP_loc,ptap->C_loc);
2051: ((ptap->C_oth)->ops->matmultnumeric)(ptap->Ro,AP_loc,ptap->C_oth);
2052: C_loc = ptap->C_loc;
2053: C_oth = ptap->C_oth;
2055: /* add C_loc and C_oth to C */
2056: MatGetOwnershipRange(C,&rstart,&rend);
2058: /* C_loc -> C */
2059: cm = C_loc->rmap->N;
2060: c_seq = (Mat_SeqAIJ*)C_loc->data;
2061: cols = c_seq->j;
2062: vals = c_seq->a;
2065: /* The (fast) MatSetValues_MPIAIJ_CopyFromCSRFormat function can only be used when C->was_assembled is PETSC_FALSE and */
2066: /* when there are no off-processor parts. */
2067: /* If was_assembled is true, the statement aj[rowstart_diag+dnz_row] = mat_j[col] - cstart; in MatSetValues_MPIAIJ_CopyFromCSRFormat */
2068: /* no longer holds, and the more general MatSetValues_MPIAIJ() must be used, which looks the column index up in */
2069: /* a table and performs additional bookkeeping. */
2070: if (C->assembled) {
2071: C->was_assembled = PETSC_TRUE;
2072: C->assembled = PETSC_FALSE;
2073: }
2074: if (C->was_assembled) {
2075: for (i=0; i<cm; i++) {
2076: ncols = c_seq->i[i+1] - c_seq->i[i];
2077: row = rstart + i;
2078: MatSetValues_MPIAIJ(C,1,&row,ncols,cols,vals,ADD_VALUES);
2079: cols += ncols; vals += ncols;
2080: }
2081: } else {
2082: MatSetValues_MPIAIJ_CopyFromCSRFormat(C,c_seq->j,c_seq->i,c_seq->a);
2083: }
2085: /* Co -> C, off-processor part */
2086: cm = C_oth->rmap->N;
2087: c_seq = (Mat_SeqAIJ*)C_oth->data;
2088: cols = c_seq->j;
2089: vals = c_seq->a;
2090: for (i=0; i<cm; i++) {
2091: ncols = c_seq->i[i+1] - c_seq->i[i];
2092: row = p->garray[i];
2093: MatSetValues(C,1,&row,ncols,cols,vals,ADD_VALUES);
2094: cols += ncols; vals += ncols;
2095: }
2097: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
2098: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
2100: ptap->reuse = MAT_REUSE_MATRIX;
2102: /* the supporting struct ptap consumes almost the same amount of memory as C=PtAP; release it if C will not be updated with new A and P */
2103: if (ptap->freestruct) {
2104: MatFreeIntermediateDataStructures(C);
2105: }
2106: return(0);
2107: }
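/* [editor's note] ptap->freestruct is set from -mat_freeintermediatedatastructures (see the
   product-symbolic dispatcher below).  Freeing the struct roughly halves the memory held with
   C = PtAP, but a later MatPtAP(...,MAT_REUSE_MATRIX,...) then fails with the error raised at
   the top of this routine, so enable it only when A and P will not change.  Hedged sketch:

     PetscOptionsSetValue(NULL,"-mat_freeintermediatedatastructures","1");
     MatPtAP(A,P,MAT_INITIAL_MATRIX,2.0,&C);   (intermediate data released after the numeric phase)
*/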
2109: PETSC_INTERN PetscErrorCode MatProductSymbolic_PtAP_MPIAIJ_MPIAIJ(Mat C)
2110: {
2111: PetscErrorCode ierr;
2112: Mat_Product *product = C->product;
2113: Mat A=product->A,P=product->B;
2114: MatProductAlgorithm alg=product->alg;
2115: PetscReal fill=product->fill;
2116: PetscBool flg;
2119: /* scalable: do R=P^T locally, then C=R*A*P with sparse row-by-row accumulation */
2120: PetscStrcmp(alg,"scalable",&flg);
2121: if (flg) {
2122: MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(A,P,product->fill,C);
2123: C->ops->productnumeric = MatProductNumeric_PtAP;
2124: goto next;
2125: }
2127: /* nonscalable: do R=P^T locally, then C=R*A*P using a dense work array of length pN */
2128: PetscStrcmp(alg,"nonscalable",&flg);
2129: if (flg) {
2130: MatPtAPSymbolic_MPIAIJ_MPIAIJ(A,P,fill,C);
2131: goto next;
2132: }
2134: /* allatonce */
2135: PetscStrcmp(alg,"allatonce",&flg);
2136: if (flg) {
2137: MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce(A,P,fill,C);
2138: goto next;
2139: }
2141: /* allatonce_merged */
2142: PetscStrcmp(alg,"allatonce_merged",&flg);
2143: if (flg) {
2144: MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce_merged(A,P,fill,C);
2145: goto next;
2146: }
2148: /* hypre */
2149: #if defined(PETSC_HAVE_HYPRE)
2150: PetscStrcmp(alg,"hypre",&flg);
2151: if (flg) {
2152: MatPtAPSymbolic_AIJ_AIJ_wHYPRE(A,P,fill,C);
2153: C->ops->productnumeric = MatProductNumeric_PtAP;
2154: return(0);
2155: }
2156: #endif
2157: SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_SUP,"Mat Product Algorithm is not supported");
2159: next:
2160: C->ops->productnumeric = MatProductNumeric_PtAP;
2161: {
2162: Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
2163: Mat_APMPI *ap = c->ap;
2164: PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatFreeIntermediateDataStructures","Mat");
2165: ap->freestruct = PETSC_FALSE;
2166: PetscOptionsBool("-mat_freeintermediatedatastructures","Free intermediate data structures", "MatFreeIntermediateDataStructures",ap->freestruct,&ap->freestruct, NULL);
2167: PetscOptionsEnd();
2168: }
2169: return(0);
2170: }
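/* [editor's note] The dispatcher above selects among the algorithm names "scalable",
   "nonscalable", "allatonce", "allatonce_merged" (and "hypre" when PETSc is built with hypre).
   Hedged sketch of the user-level MatProduct path that, for MPIAIJ operands, ends up in this
   dispatcher:

     Mat C;
     MatProductCreate(A,P,NULL,&C);            (C = P^T*A*P once the type is set)
     MatProductSetType(C,MATPRODUCT_PtAP);
     MatProductSetAlgorithm(C,"allatonce");    (any of the names handled above)
     MatProductSetFill(C,PETSC_DEFAULT);
     MatProductSetFromOptions(C);
     MatProductSymbolic(C);
     MatProductNumeric(C);
*/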