Actual source code: mpiptap.c
petsc-3.10.5 2019-03-28
2: /*
3: Defines projective product routines where A is an MPIAIJ matrix
4: C = P^T * A * P
5: */
7: #include <../src/mat/impls/aij/seq/aij.h>
8: #include <../src/mat/utils/freespace.h>
9: #include <../src/mat/impls/aij/mpi/mpiaij.h>
10: #include <petscbt.h>
11: #include <petsctime.h>
13: /* #define PTAP_PROFILE */
15: PetscErrorCode MatView_MPIAIJ_PtAP(Mat A,PetscViewer viewer)
16: {
17: PetscErrorCode ierr;
18: Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data;
19: Mat_PtAPMPI *ptap=a->ptap;
20: PetscBool iascii;
21: PetscViewerFormat format;
24: PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii);
25: if (iascii) {
26: PetscViewerGetFormat(viewer,&format);
27: if (format == PETSC_VIEWER_ASCII_INFO || format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
28: if (ptap->algType == 0) {
29: PetscViewerASCIIPrintf(viewer,"using scalable MatPtAP() implementation\n");
30: } else if (ptap->algType == 1) {
31: PetscViewerASCIIPrintf(viewer,"using nonscalable MatPtAP() implementation\n");
32: }
33: }
34: }
35: (ptap->view)(A,viewer);
36: return(0);
37: }
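/*
   Editor's usage sketch (not part of the library build): how the INFO branch above is
   reached for a product matrix C created by MatPtAP().  The same output can be obtained
   from the command line with -mat_view ::ascii_info.  The function name is illustrative.
*/
#if 0
static PetscErrorCode ExampleViewPtAPInfo(Mat C)
{
  PetscViewerPushFormat(PETSC_VIEWER_STDOUT_WORLD,PETSC_VIEWER_ASCII_INFO);
  MatView(C,PETSC_VIEWER_STDOUT_WORLD);
  PetscViewerPopFormat(PETSC_VIEWER_STDOUT_WORLD);
  return(0);
}
#endif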
39: PetscErrorCode MatDestroy_MPIAIJ_PtAP(Mat A)
40: {
42: Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data;
43: Mat_PtAPMPI *ptap=a->ptap;
46: if (ptap) {
47: Mat_Merge_SeqsToMPI *merge=ptap->merge;
48: PetscFree2(ptap->startsj_s,ptap->startsj_r);
49: PetscFree(ptap->bufa);
50: MatDestroy(&ptap->P_loc);
51: MatDestroy(&ptap->P_oth);
52: MatDestroy(&ptap->A_loc); /* used by MatTransposeMatMult() */
53: MatDestroy(&ptap->Rd);
54: MatDestroy(&ptap->Ro);
55: if (ptap->AP_loc) { /* used by alg_rap */
56: Mat_SeqAIJ *ap = (Mat_SeqAIJ*)(ptap->AP_loc)->data;
57: PetscFree(ap->i);
58: PetscFree2(ap->j,ap->a);
59: MatDestroy(&ptap->AP_loc);
60: } else { /* used by alg_ptap */
61: PetscFree(ptap->api);
62: PetscFree(ptap->apj);
63: }
64: MatDestroy(&ptap->C_loc);
65: MatDestroy(&ptap->C_oth);
66: if (ptap->apa) {PetscFree(ptap->apa);}
68: if (merge) { /* used by alg_ptap */
69: PetscFree(merge->id_r);
70: PetscFree(merge->len_s);
71: PetscFree(merge->len_r);
72: PetscFree(merge->bi);
73: PetscFree(merge->bj);
74: PetscFree(merge->buf_ri[0]);
75: PetscFree(merge->buf_ri);
76: PetscFree(merge->buf_rj[0]);
77: PetscFree(merge->buf_rj);
78: PetscFree(merge->coi);
79: PetscFree(merge->coj);
80: PetscFree(merge->owners_co);
81: PetscLayoutDestroy(&merge->rowmap);
82: PetscFree(ptap->merge);
83: }
85: ptap->destroy(A);
86: PetscFree(ptap);
87: }
88: return(0);
89: }
91: PetscErrorCode MatDuplicate_MPIAIJ_MatPtAP(Mat A, MatDuplicateOption op, Mat *M)
92: {
94: Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
95: Mat_PtAPMPI *ptap = a->ptap;
98: (*ptap->duplicate)(A,op,M);
99: (*M)->ops->destroy = ptap->destroy;
100: (*M)->ops->duplicate = ptap->duplicate;
101: (*M)->ops->view = ptap->view;
102: return(0);
103: }
105: PETSC_INTERN PetscErrorCode MatPtAP_MPIAIJ_MPIAIJ(Mat A,Mat P,MatReuse scall,PetscReal fill,Mat *C)
106: {
108: PetscBool flg;
109: MPI_Comm comm;
110: #if !defined(PETSC_HAVE_HYPRE)
111: const char *algTypes[2] = {"scalable","nonscalable"};
112: PetscInt nalg=2;
113: #else
114: const char *algTypes[3] = {"scalable","nonscalable","hypre"};
115: PetscInt nalg=3;
116: #endif
117: PetscInt pN=P->cmap->N,alg=1; /* set default algorithm */
120: /* check if matrix local sizes are compatible */
121: PetscObjectGetComm((PetscObject)A,&comm);
122: if (A->rmap->rstart != P->rmap->rstart || A->rmap->rend != P->rmap->rend) SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, Arow (%D, %D) != Prow (%D,%D)",A->rmap->rstart,A->rmap->rend,P->rmap->rstart,P->rmap->rend);
123: if (A->cmap->rstart != P->rmap->rstart || A->cmap->rend != P->rmap->rend) SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, Acol (%D, %D) != Prow (%D,%D)",A->cmap->rstart,A->cmap->rend,P->rmap->rstart,P->rmap->rend);
125: if (scall == MAT_INITIAL_MATRIX) {
126: /* pick an algorithm */
127: PetscObjectOptionsBegin((PetscObject)A);
128: PetscOptionsObject->alreadyprinted = PETSC_FALSE; /* a hack to ensure the option shows in '-help' */
129: PetscOptionsEList("-matptap_via","Algorithmic approach","MatPtAP",algTypes,nalg,algTypes[alg],&alg,&flg);
130: PetscOptionsEnd();
132: if (!flg && pN > 100000) { /* may switch to scalable algorithm as default */
133: MatInfo Ainfo,Pinfo;
134: PetscInt nz_local;
135: PetscBool alg_scalable_loc=PETSC_FALSE,alg_scalable;
137: MatGetInfo(A,MAT_LOCAL,&Ainfo);
138: MatGetInfo(P,MAT_LOCAL,&Pinfo);
139: nz_local = (PetscInt)(Ainfo.nz_allocated + Pinfo.nz_allocated);
141: if (pN > fill*nz_local) alg_scalable_loc = PETSC_TRUE;
142: MPIU_Allreduce(&alg_scalable_loc,&alg_scalable,1,MPIU_BOOL,MPI_LOR,comm);
144: if (alg_scalable) {
145: alg = 0; /* the scalable algorithm can be roughly 50% slower than the nonscalable algorithm */
146: }
147: }
149: switch (alg) {
150: case 1:
151: /* do R=P^T locally, then C=R*A*P */
152: PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);
153: MatPtAPSymbolic_MPIAIJ_MPIAIJ(A,P,fill,C);
154: PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);
155: break;
156: #if defined(PETSC_HAVE_HYPRE)
157: case 2:
158: /* Use boomerAMGBuildCoarseOperator */
159: PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);
160: MatPtAPSymbolic_AIJ_AIJ_wHYPRE(A,P,fill,C);
161: PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);
162: break;
163: #endif
164: default:
165: /* do R=P^T locally, then C=R*A*P */
166: PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);
167: MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(A,P,fill,C);
168: PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);
169: break;
170: }
171: }
172: PetscLogEventBegin(MAT_PtAPNumeric,A,P,0,0);
173: (*(*C)->ops->ptapnumeric)(A,P,*C);
174: PetscLogEventEnd(MAT_PtAPNumeric,A,P,0,0);
175: return(0);
176: }
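/*
   Editor's usage sketch (not part of the library build): driving the dispatch above from
   application code.  A and P are assumed to be assembled MPIAIJ matrices with compatible
   layouts; the algorithm can also be selected at run time with
   -matptap_via {scalable,nonscalable,hypre}.  Names and the fill value are illustrative.
*/
#if 0
static PetscErrorCode ExamplePtAPUsage(Mat A,Mat P)
{
  Mat C;

  /* optionally force the memory-scalable algorithm instead of the default */
  PetscOptionsSetValue(NULL,"-matptap_via","scalable");

  /* first call: symbolic + numeric phases, C = P^T*A*P; fill=2.0 is an nnz(C) estimate */
  MatPtAP(A,P,MAT_INITIAL_MATRIX,2.0,&C);

  /* later calls with unchanged sparsity patterns reuse the symbolic data attached to C */
  MatPtAP(A,P,MAT_REUSE_MATRIX,2.0,&C);

  MatDestroy(&C);
  return(0);
}
#endif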
178: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable(Mat A,Mat P,Mat C)
179: {
180: PetscErrorCode ierr;
181: Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
182: Mat_SeqAIJ *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
183: Mat_SeqAIJ *ap,*p_loc,*p_oth,*c_seq;
184: Mat_PtAPMPI *ptap = c->ptap;
185: Mat AP_loc,C_loc,C_oth;
186: PetscInt i,rstart,rend,cm,ncols,row,*api,*apj,am = A->rmap->n,apnz;
187: PetscScalar *apa;
188: const PetscInt *cols;
189: const PetscScalar *vals;
192: MatZeroEntries(C);
194: /* 1) get R = Pd^T,Ro = Po^T */
195: if (ptap->reuse == MAT_REUSE_MATRIX) {
196: MatTranspose(p->A,MAT_REUSE_MATRIX,&ptap->Rd);
197: MatTranspose(p->B,MAT_REUSE_MATRIX,&ptap->Ro);
198: }
200: /* 2) get AP_loc */
201: AP_loc = ptap->AP_loc;
202: ap = (Mat_SeqAIJ*)AP_loc->data;
204: /* 2-1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
205: /*-----------------------------------------------------*/
206: if (ptap->reuse == MAT_REUSE_MATRIX) {
207: /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
208: MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);
209: MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);
210: }
212: /* 2-2) compute numeric A_loc*P - dominating part */
213: /* ---------------------------------------------- */
214: /* get data from symbolic products */
215: p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
216: if (ptap->P_oth) p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
217: else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"ptap->P_oth is NULL. Cannot proceed!");
219: api = ap->i;
220: apj = ap->j;
221: for (i=0; i<am; i++) {
222: /* AP[i,:] = A[i,:]*P = Ad*P_loc + Ao*P_oth */
223: apnz = api[i+1] - api[i];
224: apa = ap->a + api[i];
225: PetscMemzero(apa,sizeof(PetscScalar)*apnz);
226: AProw_scalable(i,ad,ao,p_loc,p_oth,api,apj,apa);
227: PetscLogFlops(2.0*apnz);
228: }
230: /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
231: MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Rd,AP_loc,ptap->C_loc);
232: MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Ro,AP_loc,ptap->C_oth);
234: C_loc = ptap->C_loc;
235: C_oth = ptap->C_oth;
237: /* add C_loc and C_oth to C */
238: MatGetOwnershipRange(C,&rstart,&rend);
240: /* C_loc -> C */
241: cm = C_loc->rmap->N;
242: c_seq = (Mat_SeqAIJ*)C_loc->data;
243: cols = c_seq->j;
244: vals = c_seq->a;
246: /* The (fast) MatSetValues_MPIAIJ_CopyFromCSRFormat function can only be used when C->was_assembled is PETSC_FALSE and */
247: /* when there are no off-processor parts. */
248: /* If was_assembled is true, the assignment aj[rowstart_diag+dnz_row] = mat_j[col] - cstart; in MatSetValues_MPIAIJ_CopyFromCSRFormat */
249: /* is no longer valid. In that case the more general MatSetValues_MPIAIJ() must be used, which looks up each column index in */
250: /* a table and performs the additional bookkeeping that entails. */
251: if (C->assembled) {
252: C->was_assembled = PETSC_TRUE;
253: C->assembled = PETSC_FALSE;
254: }
255: if (C->was_assembled) {
256: for (i=0; i<cm; i++) {
257: ncols = c_seq->i[i+1] - c_seq->i[i];
258: row = rstart + i;
259: MatSetValues_MPIAIJ(C,1,&row,ncols,cols,vals,ADD_VALUES);
260: cols += ncols; vals += ncols;
261: }
262: } else {
263: MatSetValues_MPIAIJ_CopyFromCSRFormat(C,c_seq->j,c_seq->i,c_seq->a);
264: }
266: /* Co -> C, off-processor part */
267: cm = C_oth->rmap->N;
268: c_seq = (Mat_SeqAIJ*)C_oth->data;
269: cols = c_seq->j;
270: vals = c_seq->a;
271: for (i=0; i<cm; i++) {
272: ncols = c_seq->i[i+1] - c_seq->i[i];
273: row = p->garray[i];
274: MatSetValues(C,1,&row,ncols,cols,vals,ADD_VALUES);
275: cols += ncols; vals += ncols;
276: }
277: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
278: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
280: ptap->reuse = MAT_REUSE_MATRIX;
281: return(0);
282: }
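/*
   Editor's sketch of the row kernel the numeric loop above relies on (AProw_scalable):
   with the pattern (api,apj) fixed by the symbolic phase, each contribution a_ik*P[k,:]
   is scattered into the compressed row by binary search over the sorted column list, so
   the per-row work array has length nnz(AP[i,:]) rather than the global column count.
   Plain CSR arrays are used here; names are illustrative, not the PETSc kernel itself.
*/
#if 0
static void APRowScalableSketch(int i,
                                const int *ai,const int *aj,const double *aa,  /* CSR of A */
                                const int *pi,const int *pj,const double *pa,  /* CSR of P */
                                const int *api,const int *apj,double *apa)     /* row i of AP */
{
  int        k,kk,nz = api[i+1]-api[i];
  const int *cols   = apj + api[i];
  double    *vals   = apa;                   /* caller points apa at ap->a + api[i] */

  for (k=ai[i]; k<ai[i+1]; k++) {            /* loop over nonzeros of A[i,:] */
    int    prow = aj[k];
    double av   = aa[k];
    for (kk=pi[prow]; kk<pi[prow+1]; kk++) { /* scatter av*P[prow,:] */
      int col = pj[kk], lo = 0, hi = nz-1, mid;
      while (lo <= hi) {                     /* binary search in the sorted row pattern */
        mid = lo + (hi-lo)/2;
        if (cols[mid] == col)     { vals[mid] += av*pa[kk]; break; }
        else if (cols[mid] < col) lo = mid+1;
        else                      hi = mid-1;
      }
    }
  }
}
#endif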
284: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(Mat A,Mat P,PetscReal fill,Mat *C)
285: {
286: PetscErrorCode ierr;
287: Mat_PtAPMPI *ptap;
288: Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c;
289: MPI_Comm comm;
290: PetscMPIInt size,rank;
291: Mat Cmpi,P_loc,P_oth;
292: PetscFreeSpaceList free_space=NULL,current_space=NULL;
293: PetscInt am=A->rmap->n,pm=P->rmap->n,pN=P->cmap->N,pn=P->cmap->n;
294: PetscInt *lnk,i,k,pnz,row,nsend;
295: PetscMPIInt tagi,tagj,*len_si,*len_s,*len_ri,icompleted=0,nrecv;
296: PetscInt **buf_rj,**buf_ri,**buf_ri_k;
297: PetscInt len,proc,*dnz,*onz,*owners,nzi,nspacedouble;
298: PetscInt nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
299: MPI_Request *swaits,*rwaits;
300: MPI_Status *sstatus,rstatus;
301: PetscLayout rowmap;
302: PetscInt *owners_co,*coi,*coj; /* i and j array of (p->B)^T*A*P - used in the communication */
303: PetscMPIInt *len_r,*id_r; /* array of length of comm->size, store send/recv matrix values */
304: PetscInt *api,*apj,*Jptr,apnz,*prmap=p->garray,con,j,Crmax,*aj,*ai,*pi;
305: Mat_SeqAIJ *p_loc,*p_oth=NULL,*ad=(Mat_SeqAIJ*)(a->A)->data,*ao=NULL,*c_loc,*c_oth;
306: PetscScalar *apv;
307: PetscTable ta;
308: #if defined(PETSC_USE_INFO)
309: PetscReal apfill;
310: #endif
313: PetscObjectGetComm((PetscObject)A,&comm);
314: MPI_Comm_size(comm,&size);
315: MPI_Comm_rank(comm,&rank);
317: if (size > 1) ao = (Mat_SeqAIJ*)(a->B)->data;
319: /* create symbolic parallel matrix Cmpi */
320: MatCreate(comm,&Cmpi);
321: MatSetType(Cmpi,MATMPIAIJ);
323: /* create struct Mat_PtAPMPI and attach it to C later */
324: PetscNew(&ptap);
325: ptap->reuse = MAT_INITIAL_MATRIX;
326: ptap->algType = 0;
328: /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
329: MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&P_oth);
330: /* get P_loc by taking all local rows of P */
331: MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&P_loc);
333: ptap->P_loc = P_loc;
334: ptap->P_oth = P_oth;
336: /* (0) compute Rd = Pd^T, Ro = Po^T */
337: /* --------------------------------- */
338: MatTranspose(p->A,MAT_INITIAL_MATRIX,&ptap->Rd);
339: MatTranspose(p->B,MAT_INITIAL_MATRIX,&ptap->Ro);
341: /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */
342: /* ----------------------------------------------------------------- */
343: p_loc = (Mat_SeqAIJ*)P_loc->data;
344: if (P_oth) p_oth = (Mat_SeqAIJ*)P_oth->data;
346: /* create and initialize a linked list */
347: PetscTableCreate(pn,pN,&ta); /* for computing AP_loc and Cmpi */
348: MatRowMergeMax_SeqAIJ(p_loc,P_loc->rmap->N,ta);
349: MatRowMergeMax_SeqAIJ(p_oth,P_oth->rmap->N,ta);
350: PetscTableGetCount(ta,&Crmax); /* Crmax = nnz(sum of Prows) */
352: PetscLLCondensedCreate_Scalable(Crmax,&lnk);
354: /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */
355: if (ao) {
356: PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],PetscIntSumTruncate(ao->i[am],p_loc->i[pm]))),&free_space);
357: } else {
358: PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],p_loc->i[pm])),&free_space);
359: }
360: current_space = free_space;
361: nspacedouble = 0;
363: PetscMalloc1(am+1,&api);
364: api[0] = 0;
365: for (i=0; i<am; i++) {
366: /* diagonal portion: Ad[i,:]*P */
367: ai = ad->i; pi = p_loc->i;
368: nzi = ai[i+1] - ai[i];
369: aj = ad->j + ai[i];
370: for (j=0; j<nzi; j++) {
371: row = aj[j];
372: pnz = pi[row+1] - pi[row];
373: Jptr = p_loc->j + pi[row];
374: /* add non-zero cols of P into the sorted linked list lnk */
375: PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);
376: }
377: /* off-diagonal portion: Ao[i,:]*P */
378: if (ao) {
379: ai = ao->i; pi = p_oth->i;
380: nzi = ai[i+1] - ai[i];
381: aj = ao->j + ai[i];
382: for (j=0; j<nzi; j++) {
383: row = aj[j];
384: pnz = pi[row+1] - pi[row];
385: Jptr = p_oth->j + pi[row];
386: PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);
387: }
388: }
389: apnz = lnk[0];
390: api[i+1] = api[i] + apnz;
392: /* if free space is not available, double the total space in the list */
393: if (current_space->local_remaining<apnz) {
394: PetscFreeSpaceGet(PetscIntSumTruncate(apnz,current_space->total_array_size),&current_space);
395: nspacedouble++;
396: }
398: /* Copy data into free space, then initialize lnk */
399: PetscLLCondensedClean_Scalable(apnz,current_space->array,lnk);
401: current_space->array += apnz;
402: current_space->local_used += apnz;
403: current_space->local_remaining -= apnz;
404: }
405: /* Allocate space for apj and apv, initialize apj, and */
406: /* destroy list of free space and other temporary array(s) */
407: PetscMalloc2(api[am],&apj,api[am],&apv);
408: PetscFreeSpaceContiguous(&free_space,apj);
409: PetscLLCondensedDestroy_Scalable(lnk);
411: /* Create AP_loc for reuse */
412: MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,am,pN,api,apj,apv,&ptap->AP_loc);
414: #if defined(PETSC_USE_INFO)
415: if (ao) {
416: apfill = (PetscReal)api[am]/(ad->i[am]+ao->i[am]+p_loc->i[pm]+1);
417: } else {
418: apfill = (PetscReal)api[am]/(ad->i[am]+p_loc->i[pm]+1);
419: }
420: ptap->AP_loc->info.mallocs = nspacedouble;
421: ptap->AP_loc->info.fill_ratio_given = fill;
422: ptap->AP_loc->info.fill_ratio_needed = apfill;
424: if (api[am]) {
425: PetscInfo3(ptap->AP_loc,"Scalable algorithm, AP_loc reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)apfill);
426: PetscInfo1(ptap->AP_loc,"Use MatPtAP(A,P,MatReuse,%g,&C) for best AP_loc performance.\n",(double)apfill);
427: } else {
428: PetscInfo(ptap->AP_loc,"Scalable algorithm, AP_loc is empty\n");
429: }
430: #endif
432: /* (2-1) compute symbolic Co = Ro*AP_loc */
433: /* ------------------------------------ */
434: MatMatMultSymbolic_SeqAIJ_SeqAIJ(ptap->Ro,ptap->AP_loc,fill,&ptap->C_oth);
436: /* (3) send coj of C_oth to other processors */
437: /* ------------------------------------------ */
438: /* determine row ownership */
439: PetscLayoutCreate(comm,&rowmap);
440: rowmap->n = pn;
441: rowmap->bs = 1;
442: PetscLayoutSetUp(rowmap);
443: owners = rowmap->range;
445: /* determine the number of messages to send, their lengths */
446: PetscMalloc4(size,&len_s,size,&len_si,size,&sstatus,size+2,&owners_co);
447: PetscMemzero(len_s,size*sizeof(PetscMPIInt));
448: PetscMemzero(len_si,size*sizeof(PetscMPIInt));
450: c_oth = (Mat_SeqAIJ*)ptap->C_oth->data;
451: coi = c_oth->i; coj = c_oth->j;
452: con = ptap->C_oth->rmap->n;
453: proc = 0;
454: for (i=0; i<con; i++) {
455: while (prmap[i] >= owners[proc+1]) proc++;
456: len_si[proc]++; /* num of rows in Co(=Pt*AP) to be sent to [proc] */
457: len_s[proc] += coi[i+1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
458: }
460: len = 0; /* max length of buf_si[], see (4) */
461: owners_co[0] = 0;
462: nsend = 0;
463: for (proc=0; proc<size; proc++) {
464: owners_co[proc+1] = owners_co[proc] + len_si[proc];
465: if (len_s[proc]) {
466: nsend++;
467: len_si[proc] = 2*(len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
468: len += len_si[proc];
469: }
470: }
472: /* determine the number and length of messages to receive for coi and coj */
473: PetscGatherNumberOfMessages(comm,NULL,len_s,&nrecv);
474: PetscGatherMessageLengths2(comm,nsend,nrecv,len_s,len_si,&id_r,&len_r,&len_ri);
476: /* post the Irecv and Isend of coj */
477: PetscCommGetNewTag(comm,&tagj);
478: PetscPostIrecvInt(comm,tagj,nrecv,id_r,len_r,&buf_rj,&rwaits);
479: PetscMalloc1(nsend+1,&swaits);
480: for (proc=0, k=0; proc<size; proc++) {
481: if (!len_s[proc]) continue;
482: i = owners_co[proc];
483: MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);
484: k++;
485: }
487: /* (2-2) compute symbolic C_loc = Rd*AP_loc */
488: /* ---------------------------------------- */
489: MatMatMultSymbolic_SeqAIJ_SeqAIJ(ptap->Rd,ptap->AP_loc,fill,&ptap->C_loc);
490: c_loc = (Mat_SeqAIJ*)ptap->C_loc->data;
492: /* receives coj are complete */
493: for (i=0; i<nrecv; i++) {
494: MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);
495: }
496: PetscFree(rwaits);
497: if (nsend) {MPI_Waitall(nsend,swaits,sstatus);}
499: /* add received column indices into ta to update Crmax */
500: for (k=0; k<nrecv; k++) {/* k-th received message */
501: Jptr = buf_rj[k];
502: for (j=0; j<len_r[k]; j++) {
503: PetscTableAdd(ta,*(Jptr+j)+1,1,INSERT_VALUES);
504: }
505: }
506: PetscTableGetCount(ta,&Crmax);
507: PetscTableDestroy(&ta);
509: /* (4) send and recv coi */
510: /*-----------------------*/
511: PetscCommGetNewTag(comm,&tagi);
512: PetscPostIrecvInt(comm,tagi,nrecv,id_r,len_ri,&buf_ri,&rwaits);
513: PetscMalloc1(len+1,&buf_s);
514: buf_si = buf_s; /* points to the beginning of k-th msg to be sent */
515: for (proc=0,k=0; proc<size; proc++) {
516: if (!len_s[proc]) continue;
517: /* form outgoing message for i-structure:
518: buf_si[0]: nrows to be sent
519: [1:nrows]: row index (global)
520: [nrows+1:2*nrows+1]: i-structure index
521: */
522: /*-------------------------------------------*/
523: nrows = len_si[proc]/2 - 1; /* num of rows in Co to be sent to [proc] */
524: buf_si_i = buf_si + nrows+1;
525: buf_si[0] = nrows;
526: buf_si_i[0] = 0;
527: nrows = 0;
528: for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
529: nzi = coi[i+1] - coi[i];
530: buf_si_i[nrows+1] = buf_si_i[nrows] + nzi; /* i-structure */
531: buf_si[nrows+1] = prmap[i] -owners[proc]; /* local row index */
532: nrows++;
533: }
534: MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);
535: k++;
536: buf_si += len_si[proc];
537: }
538: for (i=0; i<nrecv; i++) {
539: MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);
540: }
541: PetscFree(rwaits);
542: if (nsend) {MPI_Waitall(nsend,swaits,sstatus);}
544: PetscFree4(len_s,len_si,sstatus,owners_co);
545: PetscFree(len_ri);
546: PetscFree(swaits);
547: PetscFree(buf_s);
549: /* (5) compute the local portion of Cmpi */
550: /* ------------------------------------------ */
551: /* set initial free space to be Crmax, sufficient for holding nonzeros in each row of Cmpi */
552: PetscFreeSpaceGet(Crmax,&free_space);
553: current_space = free_space;
555: PetscMalloc3(nrecv,&buf_ri_k,nrecv,&nextrow,nrecv,&nextci);
556: for (k=0; k<nrecv; k++) {
557: buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
558: nrows = *buf_ri_k[k];
559: nextrow[k] = buf_ri_k[k] + 1; /* next row number of k-th received i-structure */
560: nextci[k] = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of the k-th received i-structure */
561: }
563: MatPreallocateInitialize(comm,pn,pn,dnz,onz);
564: PetscLLCondensedCreate_Scalable(Crmax,&lnk);
565: for (i=0; i<pn; i++) {
566: /* add C_loc into Cmpi */
567: nzi = c_loc->i[i+1] - c_loc->i[i];
568: Jptr = c_loc->j + c_loc->i[i];
569: PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk);
571: /* add received col data into lnk */
572: for (k=0; k<nrecv; k++) { /* k-th received message */
573: if (i == *nextrow[k]) { /* i-th row */
574: nzi = *(nextci[k]+1) - *nextci[k];
575: Jptr = buf_rj[k] + *nextci[k];
576: PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk);
577: nextrow[k]++; nextci[k]++;
578: }
579: }
580: nzi = lnk[0];
582: /* copy data into free space, then initialize lnk */
583: PetscLLCondensedClean_Scalable(nzi,current_space->array,lnk);
584: MatPreallocateSet(i+owners[rank],nzi,current_space->array,dnz,onz);
585: }
586: PetscFree3(buf_ri_k,nextrow,nextci);
587: PetscLLCondensedDestroy_Scalable(lnk);
588: PetscFreeSpaceDestroy(free_space);
590: /* local sizes and preallocation */
591: MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);
592: MatSetBlockSizes(Cmpi,PetscAbs(P->cmap->bs),PetscAbs(P->cmap->bs));
593: MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);
594: MatPreallocateFinalize(dnz,onz);
596: /* members in merge */
597: PetscFree(id_r);
598: PetscFree(len_r);
599: PetscFree(buf_ri[0]);
600: PetscFree(buf_ri);
601: PetscFree(buf_rj[0]);
602: PetscFree(buf_rj);
603: PetscLayoutDestroy(&rowmap);
605: /* attach the supporting struct to Cmpi for reuse */
606: c = (Mat_MPIAIJ*)Cmpi->data;
607: c->ptap = ptap;
608: ptap->duplicate = Cmpi->ops->duplicate;
609: ptap->destroy = Cmpi->ops->destroy;
610: ptap->view = Cmpi->ops->view;
612: /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
613: Cmpi->assembled = PETSC_FALSE;
614: Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable;
615: Cmpi->ops->destroy = MatDestroy_MPIAIJ_PtAP;
616: Cmpi->ops->duplicate = MatDuplicate_MPIAIJ_MatPtAP;
617: Cmpi->ops->view = MatView_MPIAIJ_PtAP;
618: *C = Cmpi;
619: return(0);
620: }
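/*
   Editor's sketch of the symbolic step performed per row above: the nonzero pattern of
   AP[i,:] is the union of the (sorted) column lists of the rows of P selected by the
   nonzeros of A[i,:].  PETSc keeps this union in a condensed linked list
   (PetscLLCondensed*); the stand-alone sketch below does the same with an ordinary
   sorted merge into a work buffer.  All names are illustrative.
*/
#if 0
/* Merge the sorted list (add,nadd) into the sorted, duplicate-free list (row,nrow);
   return the new length.  row and work must each have room for nrow+nadd entries. */
static int MergeSortedUnion(int nrow,int *row,int nadd,const int *add,int *work)
{
  int i = 0,j = 0,n = 0;
  while (i < nrow && j < nadd) {
    if (row[i] < add[j])      work[n++] = row[i++];
    else if (row[i] > add[j]) work[n++] = add[j++];
    else                      { work[n++] = row[i++]; j++; }
  }
  while (i < nrow) work[n++] = row[i++];
  while (j < nadd) work[n++] = add[j++];
  for (i=0; i<n; i++) row[i] = work[i];
  return n;
}

/* Pattern of AP[i,:]: union over the nonzeros k of A[i,:] of the pattern of P[k,:]. */
static int APRowSymbolicSketch(int i,const int *ai,const int *aj,
                               const int *pi,const int *pj,int *row,int *work)
{
  int k,nrow = 0;
  for (k=ai[i]; k<ai[i+1]; k++) {
    int prow = aj[k];
    nrow = MergeSortedUnion(nrow,row,pi[prow+1]-pi[prow],pj+pi[prow],work);
  }
  return nrow;   /* = apnz, i.e. api[i+1]-api[i] */
}
#endif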
622: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ(Mat A,Mat P,PetscReal fill,Mat *C)
623: {
624: PetscErrorCode ierr;
625: Mat_PtAPMPI *ptap;
626: Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c;
627: MPI_Comm comm;
628: PetscMPIInt size,rank;
629: Mat Cmpi;
630: PetscFreeSpaceList free_space=NULL,current_space=NULL;
631: PetscInt am=A->rmap->n,pm=P->rmap->n,pN=P->cmap->N,pn=P->cmap->n;
632: PetscInt *lnk,i,k,pnz,row,nsend;
633: PetscBT lnkbt;
634: PetscMPIInt tagi,tagj,*len_si,*len_s,*len_ri,icompleted=0,nrecv;
635: PetscInt **buf_rj,**buf_ri,**buf_ri_k;
636: PetscInt len,proc,*dnz,*onz,*owners,nzi,nspacedouble;
637: PetscInt nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
638: MPI_Request *swaits,*rwaits;
639: MPI_Status *sstatus,rstatus;
640: PetscLayout rowmap;
641: PetscInt *owners_co,*coi,*coj; /* i and j array of (p->B)^T*A*P - used in the communication */
642: PetscMPIInt *len_r,*id_r; /* array of length of comm->size, store send/recv matrix values */
643: PetscInt *api,*apj,*Jptr,apnz,*prmap=p->garray,con,j,ap_rmax=0,Crmax,*aj,*ai,*pi;
644: Mat_SeqAIJ *p_loc,*p_oth=NULL,*ad=(Mat_SeqAIJ*)(a->A)->data,*ao=NULL,*c_loc,*c_oth;
645: PetscScalar *apv;
646: PetscTable ta;
647: #if defined(PETSC_USE_INFO)
648: PetscReal apfill;
649: #endif
652: PetscObjectGetComm((PetscObject)A,&comm);
653: MPI_Comm_size(comm,&size);
654: MPI_Comm_rank(comm,&rank);
656: if (size > 1) ao = (Mat_SeqAIJ*)(a->B)->data;
658: /* create symbolic parallel matrix Cmpi */
659: MatCreate(comm,&Cmpi);
660: MatSetType(Cmpi,MATMPIAIJ);
662: /* Do dense axpy in MatPtAPNumeric_MPIAIJ_MPIAIJ() */
663: Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ;
665: /* create struct Mat_PtAPMPI and attach it to C later */
666: PetscNew(&ptap);
667: ptap->reuse = MAT_INITIAL_MATRIX;
668: ptap->algType = 1;
670: /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
671: MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);
672: /* get P_loc by taking all local rows of P */
673: MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);
675: /* (0) compute Rd = Pd^T, Ro = Po^T */
676: /* --------------------------------- */
677: MatTranspose(p->A,MAT_INITIAL_MATRIX,&ptap->Rd);
678: MatTranspose(p->B,MAT_INITIAL_MATRIX,&ptap->Ro);
680: /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */
681: /* ----------------------------------------------------------------- */
682: p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
683: if (ptap->P_oth) p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
685: /* create and initialize a linked list */
686: PetscTableCreate(pn,pN,&ta); /* for computing AP_loc and Cmpi */
687: MatRowMergeMax_SeqAIJ(p_loc,ptap->P_loc->rmap->N,ta);
688: MatRowMergeMax_SeqAIJ(p_oth,ptap->P_oth->rmap->N,ta);
689: PetscTableGetCount(ta,&Crmax); /* Crmax = nnz(sum of Prows) */
690: /* printf("[%d] est %d, Crmax %d; pN %d\n",rank,5*(p_loc->rmax+p_oth->rmax + (PetscInt)(1.e-2*pN)),Crmax,pN); */
692: PetscLLCondensedCreate(Crmax,pN,&lnk,&lnkbt);
694: /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */
695: if (ao) {
696: PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],PetscIntSumTruncate(ao->i[am],p_loc->i[pm]))),&free_space);
697: } else {
698: PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],p_loc->i[pm])),&free_space);
699: }
700: current_space = free_space;
701: nspacedouble = 0;
703: PetscMalloc1(am+1,&api);
704: api[0] = 0;
705: for (i=0; i<am; i++) {
706: /* diagonal portion: Ad[i,:]*P */
707: ai = ad->i; pi = p_loc->i;
708: nzi = ai[i+1] - ai[i];
709: aj = ad->j + ai[i];
710: for (j=0; j<nzi; j++) {
711: row = aj[j];
712: pnz = pi[row+1] - pi[row];
713: Jptr = p_loc->j + pi[row];
714: /* add non-zero cols of P into the sorted linked list lnk */
715: PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);
716: }
717: /* off-diagonal portion: Ao[i,:]*P */
718: if (ao) {
719: ai = ao->i; pi = p_oth->i;
720: nzi = ai[i+1] - ai[i];
721: aj = ao->j + ai[i];
722: for (j=0; j<nzi; j++) {
723: row = aj[j];
724: pnz = pi[row+1] - pi[row];
725: Jptr = p_oth->j + pi[row];
726: PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);
727: }
728: }
729: apnz = lnk[0];
730: api[i+1] = api[i] + apnz;
731: if (ap_rmax < apnz) ap_rmax = apnz;
733: /* if free space is not available, double the total space in the list */
734: if (current_space->local_remaining<apnz) {
735: PetscFreeSpaceGet(PetscIntSumTruncate(apnz,current_space->total_array_size),&current_space);
736: nspacedouble++;
737: }
739: /* Copy data into free space, then initialize lnk */
740: PetscLLCondensedClean(pN,apnz,current_space->array,lnk,lnkbt);
742: current_space->array += apnz;
743: current_space->local_used += apnz;
744: current_space->local_remaining -= apnz;
745: }
746: /* Allocate space for apj and apv, initialize apj, and */
747: /* destroy list of free space and other temporary array(s) */
748: PetscMalloc2(api[am],&apj,api[am],&apv);
749: PetscFreeSpaceContiguous(&free_space,apj);
750: PetscLLDestroy(lnk,lnkbt);
752: /* Create AP_loc for reuse */
753: MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,am,pN,api,apj,apv,&ptap->AP_loc);
755: #if defined(PETSC_USE_INFO)
756: if (ao) {
757: apfill = (PetscReal)api[am]/(ad->i[am]+ao->i[am]+p_loc->i[pm]+1);
758: } else {
759: apfill = (PetscReal)api[am]/(ad->i[am]+p_loc->i[pm]+1);
760: }
761: ptap->AP_loc->info.mallocs = nspacedouble;
762: ptap->AP_loc->info.fill_ratio_given = fill;
763: ptap->AP_loc->info.fill_ratio_needed = apfill;
765: if (api[am]) {
766: PetscInfo3(ptap->AP_loc,"Nonscalable algorithm, AP_loc reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)apfill);
767: PetscInfo1(ptap->AP_loc,"Use MatPtAP(A,P,MatReuse,%g,&C) for best AP_loc performance.\n",(double)apfill);
768: } else {
769: PetscInfo(ptap->AP_loc,"Nonscalable algorithm, AP_loc is empty\n");
770: }
771: #endif
773: /* (2-1) compute symbolic Co = Ro*AP_loc */
774: /* ------------------------------------ */
775: MatMatMultSymbolic_SeqAIJ_SeqAIJ(ptap->Ro,ptap->AP_loc,fill,&ptap->C_oth);
777: /* (3) send coj of C_oth to other processors */
778: /* ------------------------------------------ */
779: /* determine row ownership */
780: PetscLayoutCreate(comm,&rowmap);
781: rowmap->n = pn;
782: rowmap->bs = 1;
783: PetscLayoutSetUp(rowmap);
784: owners = rowmap->range;
786: /* determine the number of messages to send, their lengths */
787: PetscMalloc4(size,&len_s,size,&len_si,size,&sstatus,size+2,&owners_co);
788: PetscMemzero(len_s,size*sizeof(PetscMPIInt));
789: PetscMemzero(len_si,size*sizeof(PetscMPIInt));
791: c_oth = (Mat_SeqAIJ*)ptap->C_oth->data;
792: coi = c_oth->i; coj = c_oth->j;
793: con = ptap->C_oth->rmap->n;
794: proc = 0;
795: for (i=0; i<con; i++) {
796: while (prmap[i] >= owners[proc+1]) proc++;
797: len_si[proc]++; /* num of rows in Co(=Pt*AP) to be sent to [proc] */
798: len_s[proc] += coi[i+1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
799: }
801: len = 0; /* max length of buf_si[], see (4) */
802: owners_co[0] = 0;
803: nsend = 0;
804: for (proc=0; proc<size; proc++) {
805: owners_co[proc+1] = owners_co[proc] + len_si[proc];
806: if (len_s[proc]) {
807: nsend++;
808: len_si[proc] = 2*(len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
809: len += len_si[proc];
810: }
811: }
813: /* determine the number and length of messages to receive for coi and coj */
814: PetscGatherNumberOfMessages(comm,NULL,len_s,&nrecv);
815: PetscGatherMessageLengths2(comm,nsend,nrecv,len_s,len_si,&id_r,&len_r,&len_ri);
817: /* post the Irecv and Isend of coj */
818: PetscCommGetNewTag(comm,&tagj);
819: PetscPostIrecvInt(comm,tagj,nrecv,id_r,len_r,&buf_rj,&rwaits);
820: PetscMalloc1(nsend+1,&swaits);
821: for (proc=0, k=0; proc<size; proc++) {
822: if (!len_s[proc]) continue;
823: i = owners_co[proc];
824: MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);
825: k++;
826: }
828: /* (2-2) compute symbolic C_loc = Rd*AP_loc */
829: /* ---------------------------------------- */
830: MatMatMultSymbolic_SeqAIJ_SeqAIJ(ptap->Rd,ptap->AP_loc,fill,&ptap->C_loc);
831: c_loc = (Mat_SeqAIJ*)ptap->C_loc->data;
833: /* receives coj are complete */
834: for (i=0; i<nrecv; i++) {
835: MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);
836: }
837: PetscFree(rwaits);
838: if (nsend) {MPI_Waitall(nsend,swaits,sstatus);}
840: /* add received column indices into ta to update Crmax */
841: for (k=0; k<nrecv; k++) {/* k-th received message */
842: Jptr = buf_rj[k];
843: for (j=0; j<len_r[k]; j++) {
844: PetscTableAdd(ta,*(Jptr+j)+1,1,INSERT_VALUES);
845: }
846: }
847: PetscTableGetCount(ta,&Crmax);
848: PetscTableDestroy(&ta);
850: /* (4) send and recv coi */
851: /*-----------------------*/
852: PetscCommGetNewTag(comm,&tagi);
853: PetscPostIrecvInt(comm,tagi,nrecv,id_r,len_ri,&buf_ri,&rwaits);
854: PetscMalloc1(len+1,&buf_s);
855: buf_si = buf_s; /* points to the beginning of k-th msg to be sent */
856: for (proc=0,k=0; proc<size; proc++) {
857: if (!len_s[proc]) continue;
858: /* form outgoing message for i-structure:
859: buf_si[0]: nrows to be sent
860: [1:nrows]: row index (global)
861: [nrows+1:2*nrows+1]: i-structure index
862: */
863: /*-------------------------------------------*/
864: nrows = len_si[proc]/2 - 1; /* num of rows in Co to be sent to [proc] */
865: buf_si_i = buf_si + nrows+1;
866: buf_si[0] = nrows;
867: buf_si_i[0] = 0;
868: nrows = 0;
869: for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
870: nzi = coi[i+1] - coi[i];
871: buf_si_i[nrows+1] = buf_si_i[nrows] + nzi; /* i-structure */
872: buf_si[nrows+1] = prmap[i] -owners[proc]; /* local row index */
873: nrows++;
874: }
875: MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);
876: k++;
877: buf_si += len_si[proc];
878: }
879: for (i=0; i<nrecv; i++) {
880: MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);
881: }
882: PetscFree(rwaits);
883: if (nsend) {MPI_Waitall(nsend,swaits,sstatus);}
885: PetscFree4(len_s,len_si,sstatus,owners_co);
886: PetscFree(len_ri);
887: PetscFree(swaits);
888: PetscFree(buf_s);
890: /* (5) compute the local portion of Cmpi */
891: /* ------------------------------------------ */
892: /* set initial free space to be Crmax, sufficient for holding nonzeros in each row of Cmpi */
893: PetscFreeSpaceGet(Crmax,&free_space);
894: current_space = free_space;
896: PetscMalloc3(nrecv,&buf_ri_k,nrecv,&nextrow,nrecv,&nextci);
897: for (k=0; k<nrecv; k++) {
898: buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
899: nrows = *buf_ri_k[k];
900: nextrow[k] = buf_ri_k[k] + 1; /* next row number of k-th received i-structure */
901: nextci[k] = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of the k-th received i-structure */
902: }
904: MatPreallocateInitialize(comm,pn,pn,dnz,onz);
905: PetscLLCondensedCreate(Crmax,pN,&lnk,&lnkbt);
906: for (i=0; i<pn; i++) {
907: /* add C_loc into Cmpi */
908: nzi = c_loc->i[i+1] - c_loc->i[i];
909: Jptr = c_loc->j + c_loc->i[i];
910: PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);
912: /* add received col data into lnk */
913: for (k=0; k<nrecv; k++) { /* k-th received message */
914: if (i == *nextrow[k]) { /* i-th row */
915: nzi = *(nextci[k]+1) - *nextci[k];
916: Jptr = buf_rj[k] + *nextci[k];
917: PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);
918: nextrow[k]++; nextci[k]++;
919: }
920: }
921: nzi = lnk[0];
923: /* copy data into free space, then initialize lnk */
924: PetscLLCondensedClean(pN,nzi,current_space->array,lnk,lnkbt);
925: MatPreallocateSet(i+owners[rank],nzi,current_space->array,dnz,onz);
926: }
927: PetscFree3(buf_ri_k,nextrow,nextci);
928: PetscLLDestroy(lnk,lnkbt);
929: PetscFreeSpaceDestroy(free_space);
931: /* local sizes and preallocation */
932: MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);
933: MatSetBlockSizes(Cmpi,PetscAbs(P->cmap->bs),PetscAbs(P->cmap->bs));
934: MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);
935: MatPreallocateFinalize(dnz,onz);
937: /* members in merge */
938: PetscFree(id_r);
939: PetscFree(len_r);
940: PetscFree(buf_ri[0]);
941: PetscFree(buf_ri);
942: PetscFree(buf_rj[0]);
943: PetscFree(buf_rj);
944: PetscLayoutDestroy(&rowmap);
946: /* attach the supporting struct to Cmpi for reuse */
947: c = (Mat_MPIAIJ*)Cmpi->data;
948: c->ptap = ptap;
949: ptap->duplicate = Cmpi->ops->duplicate;
950: ptap->destroy = Cmpi->ops->destroy;
951: ptap->view = Cmpi->ops->view;
952: PetscCalloc1(pN,&ptap->apa);
954: /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
955: Cmpi->assembled = PETSC_FALSE;
956: Cmpi->ops->destroy = MatDestroy_MPIAIJ_PtAP;
957: Cmpi->ops->duplicate = MatDuplicate_MPIAIJ_MatPtAP;
958: Cmpi->ops->view = MatView_MPIAIJ_PtAP;
959: *C = Cmpi;
960: return(0);
961: }
964: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ(Mat A,Mat P,Mat C)
965: {
966: PetscErrorCode ierr;
967: Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
968: Mat_SeqAIJ *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
969: Mat_SeqAIJ *ap,*p_loc,*p_oth=NULL,*c_seq;
970: Mat_PtAPMPI *ptap = c->ptap;
971: Mat AP_loc,C_loc,C_oth;
972: PetscInt i,rstart,rend,cm,ncols,row;
973: PetscInt *api,*apj,am = A->rmap->n,j,col,apnz;
974: PetscScalar *apa;
975: const PetscInt *cols;
976: const PetscScalar *vals;
979: MatZeroEntries(C);
980: /* 1) get R = Pd^T,Ro = Po^T */
981: if (ptap->reuse == MAT_REUSE_MATRIX) {
982: MatTranspose(p->A,MAT_REUSE_MATRIX,&ptap->Rd);
983: MatTranspose(p->B,MAT_REUSE_MATRIX,&ptap->Ro);
984: }
986: /* 2) get AP_loc */
987: AP_loc = ptap->AP_loc;
988: ap = (Mat_SeqAIJ*)AP_loc->data;
990: /* 2-1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
991: /*-----------------------------------------------------*/
992: if (ptap->reuse == MAT_REUSE_MATRIX) {
993: /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
994: MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);
995: MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);
996: }
998: /* 2-2) compute numeric A_loc*P - dominating part */
999: /* ---------------------------------------------- */
1000: /* get data from symbolic products */
1001: p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
1002: if (ptap->P_oth) {
1003: p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
1004: }
1005: apa = ptap->apa;
1006: api = ap->i;
1007: apj = ap->j;
1008: for (i=0; i<am; i++) {
1009: /* AP[i,:] = A[i,:]*P = Ad*P_loc + Ao*P_oth */
1010: AProw_nonscalable(i,ad,ao,p_loc,p_oth,apa);
1011: apnz = api[i+1] - api[i];
1012: for (j=0; j<apnz; j++) {
1013: col = apj[j+api[i]];
1014: ap->a[j+ap->i[i]] = apa[col];
1015: apa[col] = 0.0;
1016: }
1017: PetscLogFlops(2.0*apnz);
1018: }
1020: /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
1021: ((ptap->C_loc)->ops->matmultnumeric)(ptap->Rd,AP_loc,ptap->C_loc);
1022: ((ptap->C_oth)->ops->matmultnumeric)(ptap->Ro,AP_loc,ptap->C_oth);
1023: C_loc = ptap->C_loc;
1024: C_oth = ptap->C_oth;
1026: /* add C_loc and C_oth to C */
1027: MatGetOwnershipRange(C,&rstart,&rend);
1029: /* C_loc -> C */
1030: cm = C_loc->rmap->N;
1031: c_seq = (Mat_SeqAIJ*)C_loc->data;
1032: cols = c_seq->j;
1033: vals = c_seq->a;
1036: /* The (fast) MatSetValues_MPIAIJ_CopyFromCSRFormat function can only be used when C->was_assembled is PETSC_FALSE and */
1037: /* when there are no off-processor parts. */
1038: /* If was_assembled is true, the assignment aj[rowstart_diag+dnz_row] = mat_j[col] - cstart; in MatSetValues_MPIAIJ_CopyFromCSRFormat */
1039: /* is no longer valid. In that case the more general MatSetValues_MPIAIJ() must be used, which looks up each column index in */
1040: /* a table and performs the additional bookkeeping that entails. */
1041: if (C->assembled) {
1042: C->was_assembled = PETSC_TRUE;
1043: C->assembled = PETSC_FALSE;
1044: }
1045: if (C->was_assembled) {
1046: for (i=0; i<cm; i++) {
1047: ncols = c_seq->i[i+1] - c_seq->i[i];
1048: row = rstart + i;
1049: MatSetValues_MPIAIJ(C,1,&row,ncols,cols,vals,ADD_VALUES);
1050: cols += ncols; vals += ncols;
1051: }
1052: } else {
1053: MatSetValues_MPIAIJ_CopyFromCSRFormat(C,c_seq->j,c_seq->i,c_seq->a);
1054: }
1056: /* Co -> C, off-processor part */
1057: cm = C_oth->rmap->N;
1058: c_seq = (Mat_SeqAIJ*)C_oth->data;
1059: cols = c_seq->j;
1060: vals = c_seq->a;
1061: for (i=0; i<cm; i++) {
1062: ncols = c_seq->i[i+1] - c_seq->i[i];
1063: row = p->garray[i];
1064: MatSetValues(C,1,&row,ncols,cols,vals,ADD_VALUES);
1065: cols += ncols; vals += ncols;
1066: }
1068: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
1069: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
1071: ptap->reuse = MAT_REUSE_MATRIX;
1072: return(0);
1073: }
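/*
   Editor's sketch of the row kernel used in the loop above (AProw_nonscalable), and of
   why this variant is not memory scalable: the work array apa has one entry per global
   column of P (length pN), so contributions are accumulated by direct indexing and then
   gathered and zeroed through the known pattern apj.  Plain CSR arrays are used; names
   are illustrative, not the PETSc kernel itself.
*/
#if 0
static void APRowNonscalableSketch(int i,
                                   const int *ai,const int *aj,const double *aa, /* CSR of A */
                                   const int *pi,const int *pj,const double *pa, /* CSR of P */
                                   const int *api,const int *apj,
                                   double *apa,   /* dense work array, length pN, assumed zeroed */
                                   double *apv)   /* output values of AP, aligned with apj       */
{
  int k,kk,j;

  for (k=ai[i]; k<ai[i+1]; k++) {            /* accumulate A[i,:]*P into the dense work array */
    int    prow = aj[k];
    double av   = aa[k];
    for (kk=pi[prow]; kk<pi[prow+1]; kk++) apa[pj[kk]] += av*pa[kk];
  }
  for (j=api[i]; j<api[i+1]; j++) {          /* gather into compressed storage and reset apa */
    apv[j]      = apa[apj[j]];
    apa[apj[j]] = 0.0;
  }
}
#endif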