Actual source code: mpiptap.c

petsc-3.10.5 2019-03-28

  2: /*
  3:   Defines projective product routines where A is an MPIAIJ matrix
  4:           C = P^T * A * P
  5: */

  7:  #include <../src/mat/impls/aij/seq/aij.h>
  8:  #include <../src/mat/utils/freespace.h>
  9:  #include <../src/mat/impls/aij/mpi/mpiaij.h>
 10:  #include <petscbt.h>
 11:  #include <petsctime.h>

 13: /* #define PTAP_PROFILE */
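For orientation, a minimal caller-side sketch (illustrative only, not part of mpiptap.c; the matrix names and the fill estimate 2.0 are assumptions) of forming the product these routines implement through the public interface:

/* assumes #include <petscmat.h> in user code */
static PetscErrorCode FormGalerkinProduct(Mat A,Mat P,Mat *C)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* fill is the caller's estimate of nnz(C)/(nnz(A)+nnz(P)); it sizes the free-space buffers used below */
  ierr = MatPtAP(A,P,MAT_INITIAL_MATRIX,2.0,C);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}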

 15: PetscErrorCode MatView_MPIAIJ_PtAP(Mat A,PetscViewer viewer)
 16: {
 17:   PetscErrorCode    ierr;
 18:   Mat_MPIAIJ        *a=(Mat_MPIAIJ*)A->data;
 19:   Mat_PtAPMPI       *ptap=a->ptap;
 20:   PetscBool         iascii;
 21:   PetscViewerFormat format;

 24:   PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii);
 25:   if (iascii) {
 26:     PetscViewerGetFormat(viewer,&format);
 27:     if (format == PETSC_VIEWER_ASCII_INFO || format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
 28:       if (ptap->algType == 0) {
 29:         PetscViewerASCIIPrintf(viewer,"using scalable MatPtAP() implementation\n");
 30:       } else if (ptap->algType == 1) {
 31:         PetscViewerASCIIPrintf(viewer,"using nonscalable MatPtAP() implementation\n");
 32:       }
 33:     }
 34:   }
 35:   (ptap->view)(A,viewer);
 36:   return(0);
 37: }
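The format check above means the algorithm line is printed only for the info formats; a minimal sketch of requesting it from user code (C is assumed to be a matrix produced by MatPtAP()):

static PetscErrorCode ReportPtAPAlgorithm(Mat C)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscViewerPushFormat(PETSC_VIEWER_STDOUT_WORLD,PETSC_VIEWER_ASCII_INFO);CHKERRQ(ierr);
  ierr = MatView(C,PETSC_VIEWER_STDOUT_WORLD);CHKERRQ(ierr);   /* dispatches to MatView_MPIAIJ_PtAP() above */
  ierr = PetscViewerPopFormat(PETSC_VIEWER_STDOUT_WORLD);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}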

 39: PetscErrorCode MatDestroy_MPIAIJ_PtAP(Mat A)
 40: {
 42:   Mat_MPIAIJ     *a=(Mat_MPIAIJ*)A->data;
 43:   Mat_PtAPMPI    *ptap=a->ptap;

 46:   if (ptap) {
 47:     Mat_Merge_SeqsToMPI *merge=ptap->merge;
 48:     PetscFree2(ptap->startsj_s,ptap->startsj_r);
 49:     PetscFree(ptap->bufa);
 50:     MatDestroy(&ptap->P_loc);
 51:     MatDestroy(&ptap->P_oth);
 52:     MatDestroy(&ptap->A_loc); /* used by MatTransposeMatMult() */
 53:     MatDestroy(&ptap->Rd);
 54:     MatDestroy(&ptap->Ro);
 55:     if (ptap->AP_loc) { /* used by alg_rap */
 56:       Mat_SeqAIJ *ap = (Mat_SeqAIJ*)(ptap->AP_loc)->data;
 57:       PetscFree(ap->i);
 58:       PetscFree2(ap->j,ap->a);
 59:       MatDestroy(&ptap->AP_loc);
 60:     } else { /* used by alg_ptap */
 61:       PetscFree(ptap->api);
 62:       PetscFree(ptap->apj);
 63:     }
 64:     MatDestroy(&ptap->C_loc);
 65:     MatDestroy(&ptap->C_oth);
 66:     if (ptap->apa) {PetscFree(ptap->apa);}

 68:     if (merge) { /* used by alg_ptap */
 69:       PetscFree(merge->id_r);
 70:       PetscFree(merge->len_s);
 71:       PetscFree(merge->len_r);
 72:       PetscFree(merge->bi);
 73:       PetscFree(merge->bj);
 74:       PetscFree(merge->buf_ri[0]);
 75:       PetscFree(merge->buf_ri);
 76:       PetscFree(merge->buf_rj[0]);
 77:       PetscFree(merge->buf_rj);
 78:       PetscFree(merge->coi);
 79:       PetscFree(merge->coj);
 80:       PetscFree(merge->owners_co);
 81:       PetscLayoutDestroy(&merge->rowmap);
 82:       PetscFree(ptap->merge);
 83:     }

 85:     ptap->destroy(A);
 86:     PetscFree(ptap);
 87:   }
 88:   return(0);
 89: }

 91: PetscErrorCode MatDuplicate_MPIAIJ_MatPtAP(Mat A, MatDuplicateOption op, Mat *M)
 92: {
 94:   Mat_MPIAIJ     *a     = (Mat_MPIAIJ*)A->data;
 95:   Mat_PtAPMPI    *ptap  = a->ptap;

 98:   (*ptap->duplicate)(A,op,M);
 99:   (*M)->ops->destroy   = ptap->destroy;
100:   (*M)->ops->duplicate = ptap->duplicate;
101:   (*M)->ops->view      = ptap->view;
102:   return(0);
103: }

105: PETSC_INTERN PetscErrorCode MatPtAP_MPIAIJ_MPIAIJ(Mat A,Mat P,MatReuse scall,PetscReal fill,Mat *C)
106: {
108:   PetscBool      flg;
109:   MPI_Comm       comm;
110: #if !defined(PETSC_HAVE_HYPRE)
111:   const char          *algTypes[2] = {"scalable","nonscalable"};
112:   PetscInt            nalg=2;
113: #else
114:   const char          *algTypes[3] = {"scalable","nonscalable","hypre"};
115:   PetscInt            nalg=3;
116: #endif
117:   PetscInt            pN=P->cmap->N,alg=1; /* set default algorithm */

120:   /* check if matrix local sizes are compatible */
121:   PetscObjectGetComm((PetscObject)A,&comm);
122:   if (A->rmap->rstart != P->rmap->rstart || A->rmap->rend != P->rmap->rend) SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, Arow (%D, %D) != Prow (%D,%D)",A->rmap->rstart,A->rmap->rend,P->rmap->rstart,P->rmap->rend);
123:   if (A->cmap->rstart != P->rmap->rstart || A->cmap->rend != P->rmap->rend) SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, Acol (%D, %D) != Prow (%D,%D)",A->cmap->rstart,A->cmap->rend,P->rmap->rstart,P->rmap->rend);

125:   if (scall == MAT_INITIAL_MATRIX) {
126:     /* pick an algorithm */
127:     PetscObjectOptionsBegin((PetscObject)A);
128:     PetscOptionsObject->alreadyprinted = PETSC_FALSE; /* a hack to ensure the option shows in '-help' */
129:     PetscOptionsEList("-matptap_via","Algorithmic approach","MatPtAP",algTypes,nalg,algTypes[alg],&alg,&flg);
130:     PetscOptionsEnd();

132:     if (!flg && pN > 100000) { /* may switch to scalable algorithm as default */
133:       MatInfo     Ainfo,Pinfo;
134:       PetscInt    nz_local;
135:       PetscBool   alg_scalable_loc=PETSC_FALSE,alg_scalable;

137:       MatGetInfo(A,MAT_LOCAL,&Ainfo);
138:       MatGetInfo(P,MAT_LOCAL,&Pinfo);
139:       nz_local = (PetscInt)(Ainfo.nz_allocated + Pinfo.nz_allocated);

141:       if (pN > fill*nz_local) alg_scalable_loc = PETSC_TRUE;
142:       MPIU_Allreduce(&alg_scalable_loc,&alg_scalable,1,MPIU_BOOL,MPI_LOR,comm);

144:       if (alg_scalable) {
 145:         alg = 0; /* the scalable algorithm can be roughly 50% slower than the nonscalable one */
146:       }
147:     }
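    /* Worked example of the switch above (numbers assumed for illustration): with
       fill = 2.0 and nz_local = 40000 on some process, any pN > 80000 makes that
       process vote alg_scalable_loc = PETSC_TRUE, and the MPI_LOR reduction then
       selects the scalable algorithm (alg = 0) on every process. */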

149:     switch (alg) {
150:     case 1:
151:       /* do R=P^T locally, then C=R*A*P */
152:       PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);
153:       MatPtAPSymbolic_MPIAIJ_MPIAIJ(A,P,fill,C);
154:       PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);
155:       break;
156: #if defined(PETSC_HAVE_HYPRE)
157:     case 2:
158:       /* Use boomerAMGBuildCoarseOperator */
159:       PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);
160:       MatPtAPSymbolic_AIJ_AIJ_wHYPRE(A,P,fill,C);
161:       PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);
162:       break;
163: #endif
164:     default:
165:       /* do R=P^T locally, then C=R*A*P */
166:       PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);
167:       MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(A,P,fill,C);
168:       PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);
169:       break;
170:     }
171:   }
172:   PetscLogEventBegin(MAT_PtAPNumeric,A,P,0,0);
173:   (*(*C)->ops->ptapnumeric)(A,P,*C);
174:   PetscLogEventEnd(MAT_PtAPNumeric,A,P,0,0);
175:   return(0);
176: }
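The -matptap_via option read above can also be set programmatically before the product; a hedged sketch (the function name is hypothetical, the option values are the algTypes entries above):

static PetscErrorCode PtAPWithScalableAlgorithm(Mat A,Mat P,Mat *C)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* equivalent to passing -matptap_via scalable on the command line */
  ierr = PetscOptionsSetValue(NULL,"-matptap_via","scalable");CHKERRQ(ierr);
  ierr = MatPtAP(A,P,MAT_INITIAL_MATRIX,2.0,C);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}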

178: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable(Mat A,Mat P,Mat C)
179: {
180:   PetscErrorCode    ierr;
181:   Mat_MPIAIJ        *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
182:   Mat_SeqAIJ        *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
183:   Mat_SeqAIJ        *ap,*p_loc,*p_oth,*c_seq;
184:   Mat_PtAPMPI       *ptap = c->ptap;
185:   Mat               AP_loc,C_loc,C_oth;
186:   PetscInt          i,rstart,rend,cm,ncols,row,*api,*apj,am = A->rmap->n,apnz;
187:   PetscScalar       *apa;
188:   const PetscInt    *cols;
189:   const PetscScalar *vals;

192:   MatZeroEntries(C);

 194:   /* 1) get R = Pd^T, Ro = Po^T */
195:   if (ptap->reuse == MAT_REUSE_MATRIX) {
196:     MatTranspose(p->A,MAT_REUSE_MATRIX,&ptap->Rd);
197:     MatTranspose(p->B,MAT_REUSE_MATRIX,&ptap->Ro);
198:   }

200:   /* 2) get AP_loc */
201:   AP_loc = ptap->AP_loc;
202:   ap = (Mat_SeqAIJ*)AP_loc->data;

204:   /* 2-1) get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
205:   /*-----------------------------------------------------*/
206:   if (ptap->reuse == MAT_REUSE_MATRIX) {
 207:     /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
208:     MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);
209:     MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);
210:   }

212:   /* 2-2) compute numeric A_loc*P - dominating part */
213:   /* ---------------------------------------------- */
214:   /* get data from symbolic products */
215:   p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
216:   if (ptap->P_oth) p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
217:   else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"ptap->P_oth is NULL. Cannot proceed!");

219:   api   = ap->i;
220:   apj   = ap->j;
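  /* The loop below recomputes the values of AP_loc row by row: the sparsity pattern
     (api,apj) was fixed in the symbolic phase, so each row only needs its apnz value
     slots zeroed and then accumulated from Ad*P_loc and Ao*P_oth. */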
221:   for (i=0; i<am; i++) {
 222:     /* AP[i,:] = A[i,:]*P = Ad*P_loc + Ao*P_oth */
223:     apnz = api[i+1] - api[i];
224:     apa = ap->a + api[i];
225:     PetscMemzero(apa,sizeof(PetscScalar)*apnz);
226:     AProw_scalable(i,ad,ao,p_loc,p_oth,api,apj,apa);
227:     PetscLogFlops(2.0*apnz);
228:   }

230:   /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
231:   MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Rd,AP_loc,ptap->C_loc);
232:   MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Ro,AP_loc,ptap->C_oth);

234:   C_loc = ptap->C_loc;
235:   C_oth = ptap->C_oth;

 237:   /* add C_loc and C_oth to C */
238:   MatGetOwnershipRange(C,&rstart,&rend);

240:   /* C_loc -> C */
241:   cm    = C_loc->rmap->N;
242:   c_seq = (Mat_SeqAIJ*)C_loc->data;
243:   cols = c_seq->j;
244:   vals = c_seq->a;

246:   /* The (fast) MatSetValues_MPIAIJ_CopyFromCSRFormat function can only be used when C->was_assembled is PETSC_FALSE and */
247:   /* when there are no off-processor parts.  */
248:   /* If was_assembled is true, then the statement aj[rowstart_diag+dnz_row] = mat_j[col] - cstart; in MatSetValues_MPIAIJ_CopyFromCSRFormat */
249:   /* is no longer true. Then the more complex function MatSetValues_MPIAIJ() has to be used, where the column index is looked up from */
250:   /* a table, and other, more complex stuff has to be done. */
251:   if (C->assembled) {
252:     C->was_assembled = PETSC_TRUE;
253:     C->assembled     = PETSC_FALSE;
254:   }
255:   if (C->was_assembled) {
256:     for (i=0; i<cm; i++) {
257:       ncols = c_seq->i[i+1] - c_seq->i[i];
258:       row = rstart + i;
259:       MatSetValues_MPIAIJ(C,1,&row,ncols,cols,vals,ADD_VALUES);
260:       cols += ncols; vals += ncols;
261:     }
262:   } else {
263:     MatSetValues_MPIAIJ_CopyFromCSRFormat(C,c_seq->j,c_seq->i,c_seq->a);
264:   }

266:   /* Co -> C, off-processor part */
267:   cm = C_oth->rmap->N;
268:   c_seq = (Mat_SeqAIJ*)C_oth->data;
269:   cols = c_seq->j;
270:   vals = c_seq->a;
271:   for (i=0; i<cm; i++) {
272:     ncols = c_seq->i[i+1] - c_seq->i[i];
273:     row = p->garray[i];
274:     MatSetValues(C,1,&row,ncols,cols,vals,ADD_VALUES);
275:     cols += ncols; vals += ncols;
276:   }
277:   MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
278:   MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);

280:   ptap->reuse = MAT_REUSE_MATRIX;
281:   return(0);
282: }
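Setting ptap->reuse above means later products with the same sparsity pattern re-enter only this numeric routine; a minimal caller-side sketch of that pattern (matrix names and nsteps are assumptions):

static PetscErrorCode RepeatedPtAP(Mat A,Mat P,PetscInt nsteps,Mat *C)
{
  PetscErrorCode ierr;
  PetscInt       k;

  PetscFunctionBegin;
  ierr = MatPtAP(A,P,MAT_INITIAL_MATRIX,2.0,C);CHKERRQ(ierr);     /* symbolic + numeric */
  for (k=0; k<nsteps; k++) {
    /* ... caller updates the entries of A without changing its nonzero pattern ... */
    ierr = MatPtAP(A,P,MAT_REUSE_MATRIX,2.0,C);CHKERRQ(ierr);     /* numeric phase only */
  }
  PetscFunctionReturn(0);
}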

284: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(Mat A,Mat P,PetscReal fill,Mat *C)
285: {
286:   PetscErrorCode      ierr;
287:   Mat_PtAPMPI         *ptap;
288:   Mat_MPIAIJ          *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c;
289:   MPI_Comm            comm;
290:   PetscMPIInt         size,rank;
291:   Mat                 Cmpi,P_loc,P_oth;
292:   PetscFreeSpaceList  free_space=NULL,current_space=NULL;
293:   PetscInt            am=A->rmap->n,pm=P->rmap->n,pN=P->cmap->N,pn=P->cmap->n;
294:   PetscInt            *lnk,i,k,pnz,row,nsend;
295:   PetscMPIInt         tagi,tagj,*len_si,*len_s,*len_ri,icompleted=0,nrecv;
296:   PetscInt            **buf_rj,**buf_ri,**buf_ri_k;
297:   PetscInt            len,proc,*dnz,*onz,*owners,nzi,nspacedouble;
298:   PetscInt            nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
299:   MPI_Request         *swaits,*rwaits;
300:   MPI_Status          *sstatus,rstatus;
301:   PetscLayout         rowmap;
302:   PetscInt            *owners_co,*coi,*coj;    /* i and j array of (p->B)^T*A*P - used in the communication */
303:   PetscMPIInt         *len_r,*id_r;    /* array of length of comm->size, store send/recv matrix values */
304:   PetscInt            *api,*apj,*Jptr,apnz,*prmap=p->garray,con,j,Crmax,*aj,*ai,*pi;
305:   Mat_SeqAIJ          *p_loc,*p_oth=NULL,*ad=(Mat_SeqAIJ*)(a->A)->data,*ao=NULL,*c_loc,*c_oth;
306:   PetscScalar         *apv;
307:   PetscTable          ta;
308: #if defined(PETSC_USE_INFO)
309:   PetscReal           apfill;
310: #endif

313:   PetscObjectGetComm((PetscObject)A,&comm);
314:   MPI_Comm_size(comm,&size);
315:   MPI_Comm_rank(comm,&rank);

317:   if (size > 1) ao = (Mat_SeqAIJ*)(a->B)->data;

319:   /* create symbolic parallel matrix Cmpi */
320:   MatCreate(comm,&Cmpi);
321:   MatSetType(Cmpi,MATMPIAIJ);

 323:   /* create struct Mat_PtAPMPI and attach it to C later */
324:   PetscNew(&ptap);
325:   ptap->reuse = MAT_INITIAL_MATRIX;
326:   ptap->algType = 0;

328:   /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
329:   MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&P_oth);
330:   /* get P_loc by taking all local rows of P */
331:   MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&P_loc);

333:   ptap->P_loc = P_loc;
334:   ptap->P_oth = P_oth;

336:   /* (0) compute Rd = Pd^T, Ro = Po^T  */
337:   /* --------------------------------- */
338:   MatTranspose(p->A,MAT_INITIAL_MATRIX,&ptap->Rd);
339:   MatTranspose(p->B,MAT_INITIAL_MATRIX,&ptap->Ro);

341:   /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */
342:   /* ----------------------------------------------------------------- */
343:   p_loc  = (Mat_SeqAIJ*)P_loc->data;
344:   if (P_oth) p_oth = (Mat_SeqAIJ*)P_oth->data;

346:   /* create and initialize a linked list */
 347:   PetscTableCreate(pn,pN,&ta); /* for computing AP_loc and Cmpi */
348:   MatRowMergeMax_SeqAIJ(p_loc,P_loc->rmap->N,ta);
349:   MatRowMergeMax_SeqAIJ(p_oth,P_oth->rmap->N,ta);
350:   PetscTableGetCount(ta,&Crmax); /* Crmax = nnz(sum of Prows) */

352:   PetscLLCondensedCreate_Scalable(Crmax,&lnk);

354:   /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */
355:   if (ao) {
356:     PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],PetscIntSumTruncate(ao->i[am],p_loc->i[pm]))),&free_space);
357:   } else {
358:     PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],p_loc->i[pm])),&free_space);
359:   }
360:   current_space = free_space;
361:   nspacedouble  = 0;

363:   PetscMalloc1(am+1,&api);
364:   api[0] = 0;
365:   for (i=0; i<am; i++) {
366:     /* diagonal portion: Ad[i,:]*P */
367:     ai = ad->i; pi = p_loc->i;
368:     nzi = ai[i+1] - ai[i];
369:     aj  = ad->j + ai[i];
370:     for (j=0; j<nzi; j++) {
371:       row  = aj[j];
372:       pnz  = pi[row+1] - pi[row];
373:       Jptr = p_loc->j + pi[row];
374:       /* add non-zero cols of P into the sorted linked list lnk */
375:       PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);
376:     }
377:     /* off-diagonal portion: Ao[i,:]*P */
378:     if (ao) {
379:       ai = ao->i; pi = p_oth->i;
380:       nzi = ai[i+1] - ai[i];
381:       aj  = ao->j + ai[i];
382:       for (j=0; j<nzi; j++) {
383:         row  = aj[j];
384:         pnz  = pi[row+1] - pi[row];
385:         Jptr = p_oth->j + pi[row];
386:         PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);
387:       }
388:     }
389:     apnz     = lnk[0];
390:     api[i+1] = api[i] + apnz;

392:     /* if free space is not available, double the total space in the list */
393:     if (current_space->local_remaining<apnz) {
394:       PetscFreeSpaceGet(PetscIntSumTruncate(apnz,current_space->total_array_size),&current_space);
395:       nspacedouble++;
396:     }

398:     /* Copy data into free space, then initialize lnk */
399:     PetscLLCondensedClean_Scalable(apnz,current_space->array,lnk);

401:     current_space->array           += apnz;
402:     current_space->local_used      += apnz;
403:     current_space->local_remaining -= apnz;
404:   }
405:   /* Allocate space for apj and apv, initialize apj, and */
406:   /* destroy list of free space and other temporary array(s) */
407:   PetscMalloc2(api[am],&apj,api[am],&apv);
408:   PetscFreeSpaceContiguous(&free_space,apj);
409:   PetscLLCondensedDestroy_Scalable(lnk);

411:   /* Create AP_loc for reuse */
412:   MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,am,pN,api,apj,apv,&ptap->AP_loc);

414: #if defined(PETSC_USE_INFO)
415:   if (ao) {
416:     apfill = (PetscReal)api[am]/(ad->i[am]+ao->i[am]+p_loc->i[pm]+1);
417:   } else {
418:     apfill = (PetscReal)api[am]/(ad->i[am]+p_loc->i[pm]+1);
419:   }
420:   ptap->AP_loc->info.mallocs           = nspacedouble;
421:   ptap->AP_loc->info.fill_ratio_given  = fill;
422:   ptap->AP_loc->info.fill_ratio_needed = apfill;

424:   if (api[am]) {
425:     PetscInfo3(ptap->AP_loc,"Scalable algorithm, AP_loc reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)apfill);
 426:     PetscInfo1(ptap->AP_loc,"Use MatPtAP(A,P,MatReuse,%g,&C) for best AP_loc performance.\n",(double)apfill);
427:   } else {
428:     PetscInfo(ptap->AP_loc,"Scalable algorithm, AP_loc is empty \n");
429:   }
430: #endif

432:   /* (2-1) compute symbolic Co = Ro*AP_loc  */
433:   /* ------------------------------------ */
434:   MatMatMultSymbolic_SeqAIJ_SeqAIJ(ptap->Ro,ptap->AP_loc,fill,&ptap->C_oth);

436:   /* (3) send coj of C_oth to other processors  */
437:   /* ------------------------------------------ */
438:   /* determine row ownership */
439:   PetscLayoutCreate(comm,&rowmap);
440:   rowmap->n  = pn;
441:   rowmap->bs = 1;
442:   PetscLayoutSetUp(rowmap);
443:   owners = rowmap->range;
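  /* owners[] = rowmap->range: rows owners[proc]..owners[proc+1]-1 of C belong to [proc];
     e.g. (hypothetical) pn = 3,4,3 on three processes gives owners = {0,3,7,10}.
     Below, prmap[i] (= p->garray[i]) is the global row index of the i-th row of C_oth,
     which determines its owner. */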

445:   /* determine the number of messages to send, their lengths */
446:   PetscMalloc4(size,&len_s,size,&len_si,size,&sstatus,size+2,&owners_co);
447:   PetscMemzero(len_s,size*sizeof(PetscMPIInt));
448:   PetscMemzero(len_si,size*sizeof(PetscMPIInt));

450:   c_oth = (Mat_SeqAIJ*)ptap->C_oth->data;
451:   coi   = c_oth->i; coj = c_oth->j;
452:   con   = ptap->C_oth->rmap->n;
453:   proc  = 0;
454:   for (i=0; i<con; i++) {
455:     while (prmap[i] >= owners[proc+1]) proc++;
456:     len_si[proc]++;               /* num of rows in Co(=Pt*AP) to be sent to [proc] */
457:     len_s[proc] += coi[i+1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
458:   }

460:   len          = 0; /* max length of buf_si[], see (4) */
461:   owners_co[0] = 0;
462:   nsend        = 0;
463:   for (proc=0; proc<size; proc++) {
464:     owners_co[proc+1] = owners_co[proc] + len_si[proc];
465:     if (len_s[proc]) {
466:       nsend++;
467:       len_si[proc] = 2*(len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
468:       len         += len_si[proc];
469:     }
470:   }

472:   /* determine the number and length of messages to receive for coi and coj  */
473:   PetscGatherNumberOfMessages(comm,NULL,len_s,&nrecv);
474:   PetscGatherMessageLengths2(comm,nsend,nrecv,len_s,len_si,&id_r,&len_r,&len_ri);

476:   /* post the Irecv and Isend of coj */
477:   PetscCommGetNewTag(comm,&tagj);
478:   PetscPostIrecvInt(comm,tagj,nrecv,id_r,len_r,&buf_rj,&rwaits);
479:   PetscMalloc1(nsend+1,&swaits);
480:   for (proc=0, k=0; proc<size; proc++) {
481:     if (!len_s[proc]) continue;
482:     i    = owners_co[proc];
483:     MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);
484:     k++;
485:   }

487:   /* (2-2) compute symbolic C_loc = Rd*AP_loc */
488:   /* ---------------------------------------- */
489:   MatMatMultSymbolic_SeqAIJ_SeqAIJ(ptap->Rd,ptap->AP_loc,fill,&ptap->C_loc);
490:   c_loc = (Mat_SeqAIJ*)ptap->C_loc->data;

492:   /* receives coj are complete */
493:   for (i=0; i<nrecv; i++) {
494:     MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);
495:   }
496:   PetscFree(rwaits);
497:   if (nsend) {MPI_Waitall(nsend,swaits,sstatus);}

499:   /* add received column indices into ta to update Crmax */
500:   for (k=0; k<nrecv; k++) {/* k-th received message */
501:     Jptr = buf_rj[k];
502:     for (j=0; j<len_r[k]; j++) {
503:       PetscTableAdd(ta,*(Jptr+j)+1,1,INSERT_VALUES);
504:     }
505:   }
506:   PetscTableGetCount(ta,&Crmax);
507:   PetscTableDestroy(&ta);

509:   /* (4) send and recv coi */
510:   /*-----------------------*/
511:   PetscCommGetNewTag(comm,&tagi);
512:   PetscPostIrecvInt(comm,tagi,nrecv,id_r,len_ri,&buf_ri,&rwaits);
513:   PetscMalloc1(len+1,&buf_s);
514:   buf_si = buf_s;  /* points to the beginning of k-th msg to be sent */
515:   for (proc=0,k=0; proc<size; proc++) {
516:     if (!len_s[proc]) continue;
517:     /* form outgoing message for i-structure:
518:          buf_si[0]:                 nrows to be sent
 519:                [1:nrows]:           row index (local to the receiving process)
520:                [nrows+1:2*nrows+1]: i-structure index
521:     */
522:     /*-------------------------------------------*/
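    /* Worked example (hypothetical message): two rows with local indices 4 and 9
       carrying 3 and 5 nonzeros each are packed as buf_si = {2, 4, 9, 0, 3, 8},
       i.e. len_si[proc] = 2*(2+1) = 6 entries. */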
523:     nrows       = len_si[proc]/2 - 1; /* num of rows in Co to be sent to [proc] */
524:     buf_si_i    = buf_si + nrows+1;
525:     buf_si[0]   = nrows;
526:     buf_si_i[0] = 0;
527:     nrows       = 0;
528:     for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
529:       nzi = coi[i+1] - coi[i];
530:       buf_si_i[nrows+1] = buf_si_i[nrows] + nzi;  /* i-structure */
531:       buf_si[nrows+1]   = prmap[i] -owners[proc]; /* local row index */
532:       nrows++;
533:     }
534:     MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);
535:     k++;
536:     buf_si += len_si[proc];
537:   }
538:   for (i=0; i<nrecv; i++) {
539:     MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);
540:   }
541:   PetscFree(rwaits);
542:   if (nsend) {MPI_Waitall(nsend,swaits,sstatus);}

544:   PetscFree4(len_s,len_si,sstatus,owners_co);
545:   PetscFree(len_ri);
546:   PetscFree(swaits);
547:   PetscFree(buf_s);

549:   /* (5) compute the local portion of Cmpi      */
550:   /* ------------------------------------------ */
 551:   /* set initial free space to be Crmax, sufficient for holding nonzeros in each row of Cmpi */
552:   PetscFreeSpaceGet(Crmax,&free_space);
553:   current_space = free_space;

555:   PetscMalloc3(nrecv,&buf_ri_k,nrecv,&nextrow,nrecv,&nextci);
556:   for (k=0; k<nrecv; k++) {
 557:     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
 558:     nrows       = *buf_ri_k[k];
 559:     nextrow[k]  = buf_ri_k[k] + 1;  /* next row number of k-th received i-structure */
 560:     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the i-structure of the k-th received message */
561:   }

563:   MatPreallocateInitialize(comm,pn,pn,dnz,onz);
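  /* dnz[i]/onz[i] accumulate, for local row i of Cmpi, the number of nonzeros that fall in
     the diagonal/off-diagonal block; they feed MatMPIAIJSetPreallocation() below. */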
564:   PetscLLCondensedCreate_Scalable(Crmax,&lnk);
565:   for (i=0; i<pn; i++) {
566:     /* add C_loc into Cmpi */
567:     nzi  = c_loc->i[i+1] - c_loc->i[i];
568:     Jptr = c_loc->j + c_loc->i[i];
569:     PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk);

571:     /* add received col data into lnk */
572:     for (k=0; k<nrecv; k++) { /* k-th received message */
573:       if (i == *nextrow[k]) { /* i-th row */
574:         nzi  = *(nextci[k]+1) - *nextci[k];
575:         Jptr = buf_rj[k] + *nextci[k];
576:         PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk);
577:         nextrow[k]++; nextci[k]++;
578:       }
579:     }
580:     nzi = lnk[0];

582:     /* copy data into free space, then initialize lnk */
583:     PetscLLCondensedClean_Scalable(nzi,current_space->array,lnk);
584:     MatPreallocateSet(i+owners[rank],nzi,current_space->array,dnz,onz);
585:   }
586:   PetscFree3(buf_ri_k,nextrow,nextci);
587:   PetscLLCondensedDestroy_Scalable(lnk);
588:   PetscFreeSpaceDestroy(free_space);

590:   /* local sizes and preallocation */
591:   MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);
592:   MatSetBlockSizes(Cmpi,PetscAbs(P->cmap->bs),PetscAbs(P->cmap->bs));
593:   MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);
594:   MatPreallocateFinalize(dnz,onz);

596:   /* members in merge */
597:   PetscFree(id_r);
598:   PetscFree(len_r);
599:   PetscFree(buf_ri[0]);
600:   PetscFree(buf_ri);
601:   PetscFree(buf_rj[0]);
602:   PetscFree(buf_rj);
603:   PetscLayoutDestroy(&rowmap);

605:   /* attach the supporting struct to Cmpi for reuse */
606:   c = (Mat_MPIAIJ*)Cmpi->data;
607:   c->ptap         = ptap;
608:   ptap->duplicate = Cmpi->ops->duplicate;
609:   ptap->destroy   = Cmpi->ops->destroy;
610:   ptap->view      = Cmpi->ops->view;

612:   /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
613:   Cmpi->assembled        = PETSC_FALSE;
614:   Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable;
615:   Cmpi->ops->destroy     = MatDestroy_MPIAIJ_PtAP;
616:   Cmpi->ops->duplicate   = MatDuplicate_MPIAIJ_MatPtAP;
617:   Cmpi->ops->view        = MatView_MPIAIJ_PtAP;
618:   *C                     = Cmpi;
619:   return(0);
620: }

622: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ(Mat A,Mat P,PetscReal fill,Mat *C)
623: {
624:   PetscErrorCode      ierr;
625:   Mat_PtAPMPI         *ptap;
626:   Mat_MPIAIJ          *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c;
627:   MPI_Comm            comm;
628:   PetscMPIInt         size,rank;
629:   Mat                 Cmpi;
630:   PetscFreeSpaceList  free_space=NULL,current_space=NULL;
631:   PetscInt            am=A->rmap->n,pm=P->rmap->n,pN=P->cmap->N,pn=P->cmap->n;
632:   PetscInt            *lnk,i,k,pnz,row,nsend;
633:   PetscBT             lnkbt;
634:   PetscMPIInt         tagi,tagj,*len_si,*len_s,*len_ri,icompleted=0,nrecv;
635:   PetscInt            **buf_rj,**buf_ri,**buf_ri_k;
636:   PetscInt            len,proc,*dnz,*onz,*owners,nzi,nspacedouble;
637:   PetscInt            nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
638:   MPI_Request         *swaits,*rwaits;
639:   MPI_Status          *sstatus,rstatus;
640:   PetscLayout         rowmap;
641:   PetscInt            *owners_co,*coi,*coj;    /* i and j array of (p->B)^T*A*P - used in the communication */
642:   PetscMPIInt         *len_r,*id_r;    /* array of length of comm->size, store send/recv matrix values */
643:   PetscInt            *api,*apj,*Jptr,apnz,*prmap=p->garray,con,j,ap_rmax=0,Crmax,*aj,*ai,*pi;
644:   Mat_SeqAIJ          *p_loc,*p_oth=NULL,*ad=(Mat_SeqAIJ*)(a->A)->data,*ao=NULL,*c_loc,*c_oth;
645:   PetscScalar         *apv;
646:   PetscTable          ta;
647: #if defined(PETSC_USE_INFO)
648:   PetscReal           apfill;
649: #endif

652:   PetscObjectGetComm((PetscObject)A,&comm);
653:   MPI_Comm_size(comm,&size);
654:   MPI_Comm_rank(comm,&rank);

656:   if (size > 1) ao = (Mat_SeqAIJ*)(a->B)->data;

658:   /* create symbolic parallel matrix Cmpi */
659:   MatCreate(comm,&Cmpi);
660:   MatSetType(Cmpi,MATMPIAIJ);

662:   /* Do dense axpy in MatPtAPNumeric_MPIAIJ_MPIAIJ() */
663:   Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ;

665: /* create struct Mat_PtAPMPI and attach it to C later */
666:   PetscNew(&ptap);
667:   ptap->reuse = MAT_INITIAL_MATRIX;
668:   ptap->algType = 1;

670:   /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
671:   MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);
672:   /* get P_loc by taking all local rows of P */
673:   MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);

675:   /* (0) compute Rd = Pd^T, Ro = Po^T  */
676:   /* --------------------------------- */
677:   MatTranspose(p->A,MAT_INITIAL_MATRIX,&ptap->Rd);
678:   MatTranspose(p->B,MAT_INITIAL_MATRIX,&ptap->Ro);

680:   /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */
681:   /* ----------------------------------------------------------------- */
682:   p_loc  = (Mat_SeqAIJ*)(ptap->P_loc)->data;
683:   if (ptap->P_oth) p_oth  = (Mat_SeqAIJ*)(ptap->P_oth)->data;

685:   /* create and initialize a linked list */
686:   PetscTableCreate(pn,pN,&ta); /* for computing AP_loc and Cmpi */
687:   MatRowMergeMax_SeqAIJ(p_loc,ptap->P_loc->rmap->N,ta);
688:   MatRowMergeMax_SeqAIJ(p_oth,ptap->P_oth->rmap->N,ta);
689:   PetscTableGetCount(ta,&Crmax); /* Crmax = nnz(sum of Prows) */
690:   /* printf("[%d] est %d, Crmax %d; pN %d\n",rank,5*(p_loc->rmax+p_oth->rmax + (PetscInt)(1.e-2*pN)),Crmax,pN); */

692:   PetscLLCondensedCreate(Crmax,pN,&lnk,&lnkbt);

694:   /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */
695:   if (ao) {
696:     PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],PetscIntSumTruncate(ao->i[am],p_loc->i[pm]))),&free_space);
697:   } else {
698:     PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],p_loc->i[pm])),&free_space);
699:   }
700:   current_space = free_space;
701:   nspacedouble  = 0;

703:   PetscMalloc1(am+1,&api);
704:   api[0] = 0;
705:   for (i=0; i<am; i++) {
706:     /* diagonal portion: Ad[i,:]*P */
707:     ai = ad->i; pi = p_loc->i;
708:     nzi = ai[i+1] - ai[i];
709:     aj  = ad->j + ai[i];
710:     for (j=0; j<nzi; j++) {
711:       row  = aj[j];
712:       pnz  = pi[row+1] - pi[row];
713:       Jptr = p_loc->j + pi[row];
714:       /* add non-zero cols of P into the sorted linked list lnk */
715:       PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);
716:     }
717:     /* off-diagonal portion: Ao[i,:]*P */
718:     if (ao) {
719:       ai = ao->i; pi = p_oth->i;
720:       nzi = ai[i+1] - ai[i];
721:       aj  = ao->j + ai[i];
722:       for (j=0; j<nzi; j++) {
723:         row  = aj[j];
724:         pnz  = pi[row+1] - pi[row];
725:         Jptr = p_oth->j + pi[row];
726:         PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);
727:       }
728:     }
729:     apnz     = lnk[0];
730:     api[i+1] = api[i] + apnz;
731:     if (ap_rmax < apnz) ap_rmax = apnz;

733:     /* if free space is not available, double the total space in the list */
734:     if (current_space->local_remaining<apnz) {
735:       PetscFreeSpaceGet(PetscIntSumTruncate(apnz,current_space->total_array_size),&current_space);
736:       nspacedouble++;
737:     }

739:     /* Copy data into free space, then initialize lnk */
740:     PetscLLCondensedClean(pN,apnz,current_space->array,lnk,lnkbt);

742:     current_space->array           += apnz;
743:     current_space->local_used      += apnz;
744:     current_space->local_remaining -= apnz;
745:   }
746:   /* Allocate space for apj and apv, initialize apj, and */
747:   /* destroy list of free space and other temporary array(s) */
748:   PetscMalloc2(api[am],&apj,api[am],&apv);
749:   PetscFreeSpaceContiguous(&free_space,apj);
750:   PetscLLDestroy(lnk,lnkbt);

752:   /* Create AP_loc for reuse */
753:   MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,am,pN,api,apj,apv,&ptap->AP_loc);

755: #if defined(PETSC_USE_INFO)
756:   if (ao) {
757:     apfill = (PetscReal)api[am]/(ad->i[am]+ao->i[am]+p_loc->i[pm]+1);
758:   } else {
759:     apfill = (PetscReal)api[am]/(ad->i[am]+p_loc->i[pm]+1);
760:   }
761:   ptap->AP_loc->info.mallocs           = nspacedouble;
762:   ptap->AP_loc->info.fill_ratio_given  = fill;
763:   ptap->AP_loc->info.fill_ratio_needed = apfill;

765:   if (api[am]) {
766:     PetscInfo3(ptap->AP_loc,"Nonscalable algorithm, AP_loc reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)apfill);
767:     PetscInfo1(ptap->AP_loc,"Use MatPtAP(A,P,MatReuse,%g,&C) for best AP_loc performance.\n",(double)apfill);
768:   } else {
769:     PetscInfo(ptap->AP_loc,"Nonscalable algorithm, AP_loc is empty \n");
770:   }
771: #endif

773:   /* (2-1) compute symbolic Co = Ro*AP_loc  */
774:   /* ------------------------------------ */
775:   MatMatMultSymbolic_SeqAIJ_SeqAIJ(ptap->Ro,ptap->AP_loc,fill,&ptap->C_oth);

777:   /* (3) send coj of C_oth to other processors  */
778:   /* ------------------------------------------ */
779:   /* determine row ownership */
780:   PetscLayoutCreate(comm,&rowmap);
781:   rowmap->n  = pn;
782:   rowmap->bs = 1;
783:   PetscLayoutSetUp(rowmap);
784:   owners = rowmap->range;

786:   /* determine the number of messages to send, their lengths */
787:   PetscMalloc4(size,&len_s,size,&len_si,size,&sstatus,size+2,&owners_co);
788:   PetscMemzero(len_s,size*sizeof(PetscMPIInt));
789:   PetscMemzero(len_si,size*sizeof(PetscMPIInt));

791:   c_oth = (Mat_SeqAIJ*)ptap->C_oth->data;
792:   coi   = c_oth->i; coj = c_oth->j;
793:   con   = ptap->C_oth->rmap->n;
794:   proc  = 0;
795:   for (i=0; i<con; i++) {
796:     while (prmap[i] >= owners[proc+1]) proc++;
797:     len_si[proc]++;               /* num of rows in Co(=Pt*AP) to be sent to [proc] */
798:     len_s[proc] += coi[i+1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
799:   }

801:   len          = 0; /* max length of buf_si[], see (4) */
802:   owners_co[0] = 0;
803:   nsend        = 0;
804:   for (proc=0; proc<size; proc++) {
805:     owners_co[proc+1] = owners_co[proc] + len_si[proc];
806:     if (len_s[proc]) {
807:       nsend++;
808:       len_si[proc] = 2*(len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
809:       len         += len_si[proc];
810:     }
811:   }

813:   /* determine the number and length of messages to receive for coi and coj  */
814:   PetscGatherNumberOfMessages(comm,NULL,len_s,&nrecv);
815:   PetscGatherMessageLengths2(comm,nsend,nrecv,len_s,len_si,&id_r,&len_r,&len_ri);

817:   /* post the Irecv and Isend of coj */
818:   PetscCommGetNewTag(comm,&tagj);
819:   PetscPostIrecvInt(comm,tagj,nrecv,id_r,len_r,&buf_rj,&rwaits);
820:   PetscMalloc1(nsend+1,&swaits);
821:   for (proc=0, k=0; proc<size; proc++) {
822:     if (!len_s[proc]) continue;
823:     i    = owners_co[proc];
824:     MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);
825:     k++;
826:   }

828:   /* (2-2) compute symbolic C_loc = Rd*AP_loc */
829:   /* ---------------------------------------- */
830:   MatMatMultSymbolic_SeqAIJ_SeqAIJ(ptap->Rd,ptap->AP_loc,fill,&ptap->C_loc);
831:   c_loc = (Mat_SeqAIJ*)ptap->C_loc->data;

833:   /* receives coj are complete */
834:   for (i=0; i<nrecv; i++) {
835:     MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);
836:   }
837:   PetscFree(rwaits);
838:   if (nsend) {MPI_Waitall(nsend,swaits,sstatus);}

840:   /* add received column indices into ta to update Crmax */
841:   for (k=0; k<nrecv; k++) {/* k-th received message */
842:     Jptr = buf_rj[k];
843:     for (j=0; j<len_r[k]; j++) {
844:       PetscTableAdd(ta,*(Jptr+j)+1,1,INSERT_VALUES);
845:     }
846:   }
847:   PetscTableGetCount(ta,&Crmax);
848:   PetscTableDestroy(&ta);

850:   /* (4) send and recv coi */
851:   /*-----------------------*/
852:   PetscCommGetNewTag(comm,&tagi);
853:   PetscPostIrecvInt(comm,tagi,nrecv,id_r,len_ri,&buf_ri,&rwaits);
854:   PetscMalloc1(len+1,&buf_s);
855:   buf_si = buf_s;  /* points to the beginning of k-th msg to be sent */
856:   for (proc=0,k=0; proc<size; proc++) {
857:     if (!len_s[proc]) continue;
858:     /* form outgoing message for i-structure:
859:          buf_si[0]:                 nrows to be sent
860:                [1:nrows]:           row index (local to the receiving process)
861:                [nrows+1:2*nrows+1]: i-structure index
862:     */
863:     /*-------------------------------------------*/
864:     nrows       = len_si[proc]/2 - 1; /* num of rows in Co to be sent to [proc] */
865:     buf_si_i    = buf_si + nrows+1;
866:     buf_si[0]   = nrows;
867:     buf_si_i[0] = 0;
868:     nrows       = 0;
869:     for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
870:       nzi = coi[i+1] - coi[i];
871:       buf_si_i[nrows+1] = buf_si_i[nrows] + nzi;  /* i-structure */
872:       buf_si[nrows+1]   = prmap[i] -owners[proc]; /* local row index */
873:       nrows++;
874:     }
875:     MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);
876:     k++;
877:     buf_si += len_si[proc];
878:   }
879:   for (i=0; i<nrecv; i++) {
880:     MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);
881:   }
882:   PetscFree(rwaits);
883:   if (nsend) {MPI_Waitall(nsend,swaits,sstatus);}

885:   PetscFree4(len_s,len_si,sstatus,owners_co);
886:   PetscFree(len_ri);
887:   PetscFree(swaits);
888:   PetscFree(buf_s);

890:   /* (5) compute the local portion of Cmpi      */
891:   /* ------------------------------------------ */
892:   /* set initial free space to be Crmax, sufficient for holding nonzeros in each row of Cmpi */
893:   PetscFreeSpaceGet(Crmax,&free_space);
894:   current_space = free_space;

896:   PetscMalloc3(nrecv,&buf_ri_k,nrecv,&nextrow,nrecv,&nextci);
897:   for (k=0; k<nrecv; k++) {
898:     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
899:     nrows       = *buf_ri_k[k];
900:     nextrow[k]  = buf_ri_k[k] + 1;  /* next row number of k-th recved i-structure */
901:     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* poins to the next i-structure of k-th recved i-structure  */
902:   }

904:   MatPreallocateInitialize(comm,pn,pn,dnz,onz);
905:   PetscLLCondensedCreate(Crmax,pN,&lnk,&lnkbt);
906:   for (i=0; i<pn; i++) {
907:     /* add C_loc into Cmpi */
908:     nzi  = c_loc->i[i+1] - c_loc->i[i];
909:     Jptr = c_loc->j + c_loc->i[i];
910:     PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);

912:     /* add received col data into lnk */
913:     for (k=0; k<nrecv; k++) { /* k-th received message */
914:       if (i == *nextrow[k]) { /* i-th row */
915:         nzi  = *(nextci[k]+1) - *nextci[k];
916:         Jptr = buf_rj[k] + *nextci[k];
917:         PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);
918:         nextrow[k]++; nextci[k]++;
919:       }
920:     }
921:     nzi = lnk[0];

923:     /* copy data into free space, then initialize lnk */
924:     PetscLLCondensedClean(pN,nzi,current_space->array,lnk,lnkbt);
925:     MatPreallocateSet(i+owners[rank],nzi,current_space->array,dnz,onz);
926:   }
927:   PetscFree3(buf_ri_k,nextrow,nextci);
928:   PetscLLDestroy(lnk,lnkbt);
929:   PetscFreeSpaceDestroy(free_space);

931:   /* local sizes and preallocation */
932:   MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);
933:   MatSetBlockSizes(Cmpi,PetscAbs(P->cmap->bs),PetscAbs(P->cmap->bs));
934:   MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);
935:   MatPreallocateFinalize(dnz,onz);

937:   /* members in merge */
938:   PetscFree(id_r);
939:   PetscFree(len_r);
940:   PetscFree(buf_ri[0]);
941:   PetscFree(buf_ri);
942:   PetscFree(buf_rj[0]);
943:   PetscFree(buf_rj);
944:   PetscLayoutDestroy(&rowmap);

946:   /* attach the supporting struct to Cmpi for reuse */
947:   c = (Mat_MPIAIJ*)Cmpi->data;
948:   c->ptap         = ptap;
949:   ptap->duplicate = Cmpi->ops->duplicate;
950:   ptap->destroy   = Cmpi->ops->destroy;
951:   ptap->view      = Cmpi->ops->view;
952:   PetscCalloc1(pN,&ptap->apa);
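  /* apa is a dense work array of length pN (the global column count of P) used by the
     numeric phase to accumulate one row of A*P at a time; this O(pN) per-process storage
     is what makes this the "nonscalable" variant. */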

954:   /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
955:   Cmpi->assembled        = PETSC_FALSE;
956:   Cmpi->ops->destroy     = MatDestroy_MPIAIJ_PtAP;
957:   Cmpi->ops->duplicate   = MatDuplicate_MPIAIJ_MatPtAP;
958:   Cmpi->ops->view        = MatView_MPIAIJ_PtAP;
959:   *C                     = Cmpi;
960:   return(0);
961: }


964: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ(Mat A,Mat P,Mat C)
965: {
966:   PetscErrorCode    ierr;
967:   Mat_MPIAIJ        *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
968:   Mat_SeqAIJ        *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
969:   Mat_SeqAIJ        *ap,*p_loc,*p_oth=NULL,*c_seq;
970:   Mat_PtAPMPI       *ptap = c->ptap;
971:   Mat               AP_loc,C_loc,C_oth;
972:   PetscInt          i,rstart,rend,cm,ncols,row;
973:   PetscInt          *api,*apj,am = A->rmap->n,j,col,apnz;
974:   PetscScalar       *apa;
975:   const PetscInt    *cols;
976:   const PetscScalar *vals;

979:   MatZeroEntries(C);
980:   /* 1) get R = Pd^T, Ro = Po^T */
981:   if (ptap->reuse == MAT_REUSE_MATRIX) {
982:     MatTranspose(p->A,MAT_REUSE_MATRIX,&ptap->Rd);
983:     MatTranspose(p->B,MAT_REUSE_MATRIX,&ptap->Ro);
984:   }

986:   /* 2) get AP_loc */
987:   AP_loc = ptap->AP_loc;
988:   ap = (Mat_SeqAIJ*)AP_loc->data;

990:   /* 2-1) get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
991:   /*-----------------------------------------------------*/
992:   if (ptap->reuse == MAT_REUSE_MATRIX) {
993:     /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
994:     MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);
995:     MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);
996:   }

998:   /* 2-2) compute numeric A_loc*P - dominating part */
999:   /* ---------------------------------------------- */
1000:   /* get data from symbolic products */
1001:   p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
1002:   if (ptap->P_oth) {
1003:     p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
1004:   }
1005:   apa   = ptap->apa;
1006:   api   = ap->i;
1007:   apj   = ap->j;
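  /* The loop below accumulates row i of A*P into the dense array apa (indexed by global
     column), then gathers the nonzeros back into the CSR order given by (api,apj) and
     zeroes the touched entries of apa for the next row. */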
1008:   for (i=0; i<am; i++) {
1009:     /* AP[i,:] = A[i,:]*P = Ad*P_loc + Ao*P_oth */
1010:     AProw_nonscalable(i,ad,ao,p_loc,p_oth,apa);
1011:     apnz = api[i+1] - api[i];
1012:     for (j=0; j<apnz; j++) {
1013:       col = apj[j+api[i]];
1014:       ap->a[j+ap->i[i]] = apa[col];
1015:       apa[col] = 0.0;
1016:     }
1017:     PetscLogFlops(2.0*apnz);
1018:   }

1020:   /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
1021:   ((ptap->C_loc)->ops->matmultnumeric)(ptap->Rd,AP_loc,ptap->C_loc);
1022:   ((ptap->C_oth)->ops->matmultnumeric)(ptap->Ro,AP_loc,ptap->C_oth);
1023:   C_loc = ptap->C_loc;
1024:   C_oth = ptap->C_oth;

1026:   /* add C_loc and C_oth to C */
1027:   MatGetOwnershipRange(C,&rstart,&rend);

1029:   /* C_loc -> C */
1030:   cm    = C_loc->rmap->N;
1031:   c_seq = (Mat_SeqAIJ*)C_loc->data;
1032:   cols = c_seq->j;
1033:   vals = c_seq->a;


1036:   /* The (fast) MatSetValues_MPIAIJ_CopyFromCSRFormat function can only be used when C->was_assembled is PETSC_FALSE and */
1037:   /* when there are no off-processor parts.  */
1038:   /* If was_assembled is true, then the statement aj[rowstart_diag+dnz_row] = mat_j[col] - cstart; in MatSetValues_MPIAIJ_CopyFromCSRFormat */
1039:   /* is no longer true. Then the more complex function MatSetValues_MPIAIJ() has to be used, where the column index is looked up from */
1040:   /* a table, and other, more complex stuff has to be done. */
1041:   if (C->assembled) {
1042:     C->was_assembled = PETSC_TRUE;
1043:     C->assembled     = PETSC_FALSE;
1044:   }
1045:   if (C->was_assembled) {
1046:     for (i=0; i<cm; i++) {
1047:       ncols = c_seq->i[i+1] - c_seq->i[i];
1048:       row = rstart + i;
1049:       MatSetValues_MPIAIJ(C,1,&row,ncols,cols,vals,ADD_VALUES);
1050:       cols += ncols; vals += ncols;
1051:     }
1052:   } else {
1053:     MatSetValues_MPIAIJ_CopyFromCSRFormat(C,c_seq->j,c_seq->i,c_seq->a);
1054:   }

1056:   /* Co -> C, off-processor part */
1057:   cm = C_oth->rmap->N;
1058:   c_seq = (Mat_SeqAIJ*)C_oth->data;
1059:   cols = c_seq->j;
1060:   vals = c_seq->a;
1061:   for (i=0; i<cm; i++) {
1062:     ncols = c_seq->i[i+1] - c_seq->i[i];
1063:     row = p->garray[i];
1064:     MatSetValues(C,1,&row,ncols,cols,vals,ADD_VALUES);
1065:     cols += ncols; vals += ncols;
1066:   }

1068:   MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
1069:   MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);

1071:   ptap->reuse = MAT_REUSE_MATRIX;
1072:   return(0);
1073: }