Actual source code: matstash.c

petsc-3.14.6 2021-03-30

  2: #include <petsc/private/matimpl.h>

  4: #define DEFAULT_STASH_SIZE   10000

  6: static PetscErrorCode MatStashScatterBegin_Ref(Mat,MatStash*,PetscInt*);
  7: PETSC_INTERN PetscErrorCode MatStashScatterGetMesg_Ref(MatStash*,PetscMPIInt*,PetscInt**,PetscInt**,PetscScalar**,PetscInt*);
  8: PETSC_INTERN PetscErrorCode MatStashScatterEnd_Ref(MatStash*);
  9: #if !defined(PETSC_HAVE_MPIUNI)
 10: static PetscErrorCode MatStashScatterBegin_BTS(Mat,MatStash*,PetscInt*);
 11: static PetscErrorCode MatStashScatterGetMesg_BTS(MatStash*,PetscMPIInt*,PetscInt**,PetscInt**,PetscScalar**,PetscInt*);
 12: static PetscErrorCode MatStashScatterEnd_BTS(MatStash*);
 13: #endif

 15: /*
 16:   MatStashCreate_Private - Creates a stash, currently used for all the parallel
 17:   matrix implementations. The stash is where elements of a matrix destined
 18:   to be stored on other processors are kept until matrix assembly is done.

 20:   This is a simple-minded stash; it simply adds entries to the end of the stash.

 22:   Input Parameters:
 23:   comm - communicator, required for scatters.
 24:   bs   - stash block size, used when stashing blocks of values

 26:   Output Parameters:
 27:   stash    - the newly created stash
 28: */
 29: PetscErrorCode MatStashCreate_Private(MPI_Comm comm,PetscInt bs,MatStash *stash)
 30: {
 32:   PetscInt       max,*opt,nopt,i;
 33:   PetscBool      flg;

 36:   /* Require 2 tags, get the second using PetscCommGetNewTag() */
 37:   stash->comm = comm;

 39:   PetscCommGetNewTag(stash->comm,&stash->tag1);
 40:   PetscCommGetNewTag(stash->comm,&stash->tag2);
 41:   MPI_Comm_size(stash->comm,&stash->size);
 42:   MPI_Comm_rank(stash->comm,&stash->rank);
 43:   PetscMalloc1(2*stash->size,&stash->flg_v);
 44:   for (i=0; i<2*stash->size; i++) stash->flg_v[i] = -1;


 47:   nopt = stash->size;
 48:   PetscMalloc1(nopt,&opt);
 49:   PetscOptionsGetIntArray(NULL,NULL,"-matstash_initial_size",opt,&nopt,&flg);
 50:   if (flg) {
 51:     if (nopt == 1)                max = opt[0];
 52:     else if (nopt == stash->size) max = opt[stash->rank];
 53:     else if (stash->rank < nopt)  max = opt[stash->rank];
 54:     else                          max = 0; /* Use default */
 55:     stash->umax = max;
 56:   } else {
 57:     stash->umax = 0;
 58:   }
 59:   PetscFree(opt);
 60:   if (bs <= 0) bs = 1;

 62:   stash->bs         = bs;
 63:   stash->nmax       = 0;
 64:   stash->oldnmax    = 0;
 65:   stash->n          = 0;
 66:   stash->reallocs   = -1;
 67:   stash->space_head = NULL;
 68:   stash->space      = NULL;

 70:   stash->send_waits  = NULL;
 71:   stash->recv_waits  = NULL;
 72:   stash->send_status = NULL;
 73:   stash->nsends      = 0;
 74:   stash->nrecvs      = 0;
 75:   stash->svalues     = NULL;
 76:   stash->rvalues     = NULL;
 77:   stash->rindices    = NULL;
 78:   stash->nprocessed  = 0;
 79:   stash->reproduce   = PETSC_FALSE;
 80:   stash->blocktype   = MPI_DATATYPE_NULL;

 82:   PetscOptionsGetBool(NULL,NULL,"-matstash_reproduce",&stash->reproduce,NULL);
 83: #if !defined(PETSC_HAVE_MPIUNI)
 84:   flg  = PETSC_FALSE;
 85:   PetscOptionsGetBool(NULL,NULL,"-matstash_legacy",&flg,NULL);
 86:   if (!flg) {
 87:     stash->ScatterBegin   = MatStashScatterBegin_BTS;
 88:     stash->ScatterGetMesg = MatStashScatterGetMesg_BTS;
 89:     stash->ScatterEnd     = MatStashScatterEnd_BTS;
 90:     stash->ScatterDestroy = MatStashScatterDestroy_BTS;
 91:   } else {
 92: #endif
 93:     stash->ScatterBegin   = MatStashScatterBegin_Ref;
 94:     stash->ScatterGetMesg = MatStashScatterGetMesg_Ref;
 95:     stash->ScatterEnd     = MatStashScatterEnd_Ref;
 96:     stash->ScatterDestroy = NULL;
 97: #if !defined(PETSC_HAVE_MPIUNI)
 98:   }
 99: #endif
100:   return(0);
101: }
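/* Editorial sketch (not part of matstash.c): a parallel matrix implementation
   typically creates its stashes in its constructor, roughly as below; the actual
   call sites live in the individual Mat types (e.g. the MPIAIJ/MPIBAIJ code). */
#if 0
  MatStashCreate_Private(PetscObjectComm((PetscObject)mat),1,&mat->stash);   /* scalar stash */
  MatStashCreate_Private(PetscObjectComm((PetscObject)mat),bs,&mat->bstash); /* block stash with block size bs */
#endif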

103: /*
104:    MatStashDestroy_Private - Destroy the stash
105: */
106: PetscErrorCode MatStashDestroy_Private(MatStash *stash)
107: {

111:   PetscMatStashSpaceDestroy(&stash->space_head);
112:   if (stash->ScatterDestroy) {(*stash->ScatterDestroy)(stash);}

114:   stash->space = NULL;

116:   PetscFree(stash->flg_v);
117:   return(0);
118: }

120: /*
121:    MatStashScatterEnd_Private - This is called as the final stage of
122:    scatter. The final stages of message passing are done here, and
123:    all the memory used for message passing is cleaned up. This
124:    routine also resets the stash, and deallocates the memory used
125:    for the stash. It also keeps track of the current memory usage
126:    so that the same value can be used the next time through.
127: */
128: PetscErrorCode MatStashScatterEnd_Private(MatStash *stash)
129: {

133:   (*stash->ScatterEnd)(stash);
134:   return(0);
135: }

137: PETSC_INTERN PetscErrorCode MatStashScatterEnd_Ref(MatStash *stash)
138: {
140:   PetscInt       nsends=stash->nsends,bs2,oldnmax,i;
141:   MPI_Status     *send_status;

144:   for (i=0; i<2*stash->size; i++) stash->flg_v[i] = -1;
145:   /* wait on sends */
146:   if (nsends) {
147:     PetscMalloc1(2*nsends,&send_status);
148:     MPI_Waitall(2*nsends,stash->send_waits,send_status);
149:     PetscFree(send_status);
150:   }

 152:   /* Now update oldnmax to be approximately 10% more than the maximum n used, so that the
 153:      wastage of space is reduced the next time this stash is used.
 154:      Only update oldnmax if it increases */
155:   if (stash->n) {
156:     bs2     = stash->bs*stash->bs;
157:     oldnmax = ((int)(stash->n * 1.1) + 5)*bs2;
158:     if (oldnmax > stash->oldnmax) stash->oldnmax = oldnmax;
159:   }

161:   stash->nmax       = 0;
162:   stash->n          = 0;
163:   stash->reallocs   = -1;
164:   stash->nprocessed = 0;

166:   PetscMatStashSpaceDestroy(&stash->space_head);

168:   stash->space = NULL;

170:   PetscFree(stash->send_waits);
171:   PetscFree(stash->recv_waits);
172:   PetscFree2(stash->svalues,stash->sindices);
173:   PetscFree(stash->rvalues[0]);
174:   PetscFree(stash->rvalues);
175:   PetscFree(stash->rindices[0]);
176:   PetscFree(stash->rindices);
177:   return(0);
178: }
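/* Editorial worked example of the sizing heuristic above (illustrative numbers):
   if stash->n = 1000 entries were stashed with bs = 2 (so bs2 = 4), then
   oldnmax = ((int)(1000*1.1) + 5)*4 = 1105*4 = 4420, and stash->oldnmax is raised
   to 4420 if it was smaller, giving the next assembly roughly 10% headroom over
   the previous usage. */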

180: /*
 181:    MatStashGetInfo_Private - Gets the relevant statistics of the stash

183:    Input Parameters:
184:    stash    - the stash
185:    nstash   - the size of the stash. Indicates the number of values stored.
186:    reallocs - the number of additional mallocs incurred.

188: */
189: PetscErrorCode MatStashGetInfo_Private(MatStash *stash,PetscInt *nstash,PetscInt *reallocs)
190: {
191:   PetscInt bs2 = stash->bs*stash->bs;

194:   if (nstash) *nstash = stash->n*bs2;
195:   if (reallocs) {
196:     if (stash->reallocs < 0) *reallocs = 0;
197:     else                     *reallocs = stash->reallocs;
198:   }
199:   return(0);
200: }

202: /*
203:    MatStashSetInitialSize_Private - Sets the initial size of the stash

205:    Input Parameters:
206:    stash  - the stash
207:    max    - the value that is used as the max size of the stash.
208:             this value is used while allocating memory.
209: */
210: PetscErrorCode MatStashSetInitialSize_Private(MatStash *stash,PetscInt max)
211: {
213:   stash->umax = max;
214:   return(0);
215: }

217: /* MatStashExpand_Private - Expand the stash. This function is called
218:    when the space in the stash is not sufficient to add the new values
219:    being inserted into the stash.

221:    Input Parameters:
222:    stash - the stash
223:    incr  - the minimum increase requested

225:    Notes:
226:    This routine doubles the currently used memory.
227:  */
228: static PetscErrorCode MatStashExpand_Private(MatStash *stash,PetscInt incr)
229: {
231:   PetscInt       newnmax,bs2= stash->bs*stash->bs;

234:   /* allocate a larger stash */
235:   if (!stash->oldnmax && !stash->nmax) { /* new stash */
236:     if (stash->umax)                  newnmax = stash->umax/bs2;
237:     else                              newnmax = DEFAULT_STASH_SIZE/bs2;
 238:   } else if (!stash->nmax) { /* reusing stash */
239:     if (stash->umax > stash->oldnmax) newnmax = stash->umax/bs2;
240:     else                              newnmax = stash->oldnmax/bs2;
241:   } else                              newnmax = stash->nmax*2;
242:   if (newnmax  < (stash->nmax + incr)) newnmax += 2*incr;

244:   /* Get a MatStashSpace and attach it to stash */
245:   PetscMatStashSpaceGet(bs2,newnmax,&stash->space);
 246:   if (!stash->space_head) { /* new stash or reusing stash->oldnmax */
247:     stash->space_head = stash->space;
248:   }

250:   stash->reallocs++;
251:   stash->nmax = newnmax;
252:   return(0);
253: }
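/* Editorial worked example of the growth policy above (illustrative, assuming
   bs = 1 and no -matstash_initial_size option): the first expansion allocates
   newnmax = DEFAULT_STASH_SIZE = 10000 entries; once full, subsequent expansions
   double to 20000, 40000, ..., and if even the doubled size cannot hold the
   incoming incr entries, newnmax is bumped by a further 2*incr. */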
254: /*
 255:   MatStashValuesRow_Private - inserts values into the stash. This function
 256:   expects the values to be row-oriented. Multiple columns belonging to the same row
 257:   can be inserted with a single call to this function.

259:   Input Parameters:
260:   stash  - the stash
 261:   row    - the global row corresponding to the values
262:   n      - the number of elements inserted. All elements belong to the above row.
263:   idxn   - the global column indices corresponding to each of the values.
264:   values - the values inserted
265: */
266: PetscErrorCode MatStashValuesRow_Private(MatStash *stash,PetscInt row,PetscInt n,const PetscInt idxn[],const PetscScalar values[],PetscBool ignorezeroentries)
267: {
268:   PetscErrorCode     ierr;
269:   PetscInt           i,k,cnt = 0;
270:   PetscMatStashSpace space=stash->space;

273:   /* Check and see if we have sufficient memory */
274:   if (!space || space->local_remaining < n) {
275:     MatStashExpand_Private(stash,n);
276:   }
277:   space = stash->space;
278:   k     = space->local_used;
279:   for (i=0; i<n; i++) {
280:     if (ignorezeroentries && values && values[i] == 0.0) continue;
281:     space->idx[k] = row;
282:     space->idy[k] = idxn[i];
283:     space->val[k] = values ? values[i] : 0.0;
284:     k++;
285:     cnt++;
286:   }
287:   stash->n               += cnt;
288:   space->local_used      += cnt;
289:   space->local_remaining -= cnt;
290:   return(0);
291: }
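/* Editorial caller sketch (assumed pattern, not part of this file): in a
   MatSetValues_MPIXXX() implementation, rows outside the hypothetical local
   ownership range [rstart,rend) are diverted into the stash rather than being
   inserted into the local matrix. */
#if 0
  if (row < rstart || row >= rend) {   /* row owned by another process */
    MatStashValuesRow_Private(&mat->stash,row,n,cols,vals,PETSC_FALSE);
  }
#endif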

293: /*
 294:   MatStashValuesCol_Private - inserts values into the stash. This function
 295:   expects the values to be column-oriented. Multiple columns belonging to the same row
 296:   can be inserted with a single call to this function.

298:   Input Parameters:
299:   stash   - the stash
 300:   row     - the global row corresponding to the values
301:   n       - the number of elements inserted. All elements belong to the above row.
302:   idxn    - the global column indices corresponding to each of the values.
303:   values  - the values inserted
 304:   stepval - consecutive values are separated by a distance of stepval;
 305:             this happens because the input is column-oriented.
306: */
307: PetscErrorCode MatStashValuesCol_Private(MatStash *stash,PetscInt row,PetscInt n,const PetscInt idxn[],const PetscScalar values[],PetscInt stepval,PetscBool ignorezeroentries)
308: {
309:   PetscErrorCode     ierr;
310:   PetscInt           i,k,cnt = 0;
311:   PetscMatStashSpace space=stash->space;

314:   /* Check and see if we have sufficient memory */
315:   if (!space || space->local_remaining < n) {
316:     MatStashExpand_Private(stash,n);
317:   }
318:   space = stash->space;
319:   k     = space->local_used;
320:   for (i=0; i<n; i++) {
321:     if (ignorezeroentries && values && values[i*stepval] == 0.0) continue;
322:     space->idx[k] = row;
323:     space->idy[k] = idxn[i];
324:     space->val[k] = values ? values[i*stepval] : 0.0;
325:     k++;
326:     cnt++;
327:   }
328:   stash->n               += cnt;
329:   space->local_used      += cnt;
330:   space->local_remaining -= cnt;
331:   return(0);
332: }
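/* Editorial note on stepval (illustrative): for a column-oriented m-by-n input
   array, the n values of row i are values[i], values[i+m], values[i+2*m], ...,
   so the caller passes stepval = m and points at the row's first entry.
   Hypothetical sketch (grow, gcols, vals, i, m are assumed caller variables): */
#if 0
  MatStashValuesCol_Private(&mat->stash,grow,n,gcols,vals+i,m,PETSC_FALSE);
#endif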

334: /*
335:   MatStashValuesRowBlocked_Private - inserts blocks of values into the stash.
 336:   This function expects the values to be row-oriented. Multiple columns belonging
 337:   to the same block-row can be inserted with a single call to this function.
338:   This function extracts the sub-block of values based on the dimensions of
339:   the original input block, and the row,col values corresponding to the blocks.

341:   Input Parameters:
342:   stash  - the stash
 343:   row    - the global block-row corresponding to the values
344:   n      - the number of elements inserted. All elements belong to the above row.
345:   idxn   - the global block-column indices corresponding to each of the blocks of
346:            values. Each block is of size bs*bs.
347:   values - the values inserted
348:   rmax   - the number of block-rows in the original block.
 349:   cmax   - the number of block-columns in the original block.
350:   idx    - the index of the current block-row in the original block.
351: */
352: PetscErrorCode MatStashValuesRowBlocked_Private(MatStash *stash,PetscInt row,PetscInt n,const PetscInt idxn[],const PetscScalar values[],PetscInt rmax,PetscInt cmax,PetscInt idx)
353: {
354:   PetscErrorCode     ierr;
355:   PetscInt           i,j,k,bs2,bs=stash->bs,l;
356:   const PetscScalar  *vals;
357:   PetscScalar        *array;
358:   PetscMatStashSpace space=stash->space;

361:   if (!space || space->local_remaining < n) {
362:     MatStashExpand_Private(stash,n);
363:   }
364:   space = stash->space;
365:   l     = space->local_used;
366:   bs2   = bs*bs;
367:   for (i=0; i<n; i++) {
368:     space->idx[l] = row;
369:     space->idy[l] = idxn[i];
 370:     /* Now copy over the block of values. Store the values column-oriented.
 371:        This enables inserting multiple blocks belonging to a row with a single
 372:        function call */
373:     array = space->val + bs2*l;
374:     vals  = values + idx*bs2*n + bs*i;
375:     for (j=0; j<bs; j++) {
376:       for (k=0; k<bs; k++) array[k*bs] = values ? vals[k] : 0.0;
377:       array++;
378:       vals += cmax*bs;
379:     }
380:     l++;
381:   }
382:   stash->n               += n;
383:   space->local_used      += n;
384:   space->local_remaining -= n;
385:   return(0);
386: }
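/* Editorial note on the addressing above (illustrative): the row-oriented input
   holds bs-by-bs blocks; the block in block-row idx, block-column i starts at
   values + idx*bs2*n + bs*i, and its scalar rows are cmax*bs apart.  For bs = 2,
   the four scalars of one block v are stored column-oriented as
   val[0] = v(0,0), val[1] = v(1,0), val[2] = v(0,1), val[3] = v(1,1). */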

388: /*
389:   MatStashValuesColBlocked_Private - inserts blocks of values into the stash.
 390:   This function expects the values to be column-oriented. Multiple columns belonging
 391:   to the same block-row can be inserted with a single call to this function.
392:   This function extracts the sub-block of values based on the dimensions of
393:   the original input block, and the row,col values corresponding to the blocks.

395:   Input Parameters:
396:   stash  - the stash
 397:   row    - the global block-row corresponding to the values
398:   n      - the number of elements inserted. All elements belong to the above row.
399:   idxn   - the global block-column indices corresponding to each of the blocks of
400:            values. Each block is of size bs*bs.
401:   values - the values inserted
402:   rmax   - the number of block-rows in the original block.
 403:   cmax   - the number of block-columns in the original block.
404:   idx    - the index of the current block-row in the original block.
405: */
406: PetscErrorCode MatStashValuesColBlocked_Private(MatStash *stash,PetscInt row,PetscInt n,const PetscInt idxn[],const PetscScalar values[],PetscInt rmax,PetscInt cmax,PetscInt idx)
407: {
408:   PetscErrorCode     ierr;
409:   PetscInt           i,j,k,bs2,bs=stash->bs,l;
410:   const PetscScalar  *vals;
411:   PetscScalar        *array;
412:   PetscMatStashSpace space=stash->space;

415:   if (!space || space->local_remaining < n) {
416:     MatStashExpand_Private(stash,n);
417:   }
418:   space = stash->space;
419:   l     = space->local_used;
420:   bs2   = bs*bs;
421:   for (i=0; i<n; i++) {
422:     space->idx[l] = row;
423:     space->idy[l] = idxn[i];
 424:     /* Now copy over the block of values. Store the values column-oriented.
 425:      This enables inserting multiple blocks belonging to a row with a single
 426:      function call */
427:     array = space->val + bs2*l;
428:     vals  = values + idx*bs2*n + bs*i;
429:     for (j=0; j<bs; j++) {
430:       for (k=0; k<bs; k++) array[k] = values ? vals[k] : 0.0;
431:       array += bs;
432:       vals  += rmax*bs;
433:     }
434:     l++;
435:   }
436:   stash->n               += n;
437:   space->local_used      += n;
438:   space->local_remaining -= n;
439:   return(0);
440: }
441: /*
442:   MatStashScatterBegin_Private - Initiates the transfer of values to the
 443:   correct owners. This function goes through the stash, checks the
 444:   owner of each stashed value, and sends the values off to the owning
 445:   processes.

447:   Input Parameters:
448:   stash  - the stash
449:   owners - an array of size 'no-of-procs' which gives the ownership range
450:            for each node.

452:   Notes:
 453:     In the case of the blocked stash, the 'owners' array has the
 454:   ranges specified in blocked global indices; for the regular stash the
 455:   ranges are in regular global indices.
456: */
457: PetscErrorCode MatStashScatterBegin_Private(Mat mat,MatStash *stash,PetscInt *owners)
458: {

462:   (*stash->ScatterBegin)(mat,stash,owners);
463:   return(0);
464: }
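/* Editorial caller sketch (assumed pattern, not part of this file): the scalar
   stash is scattered with the matrix's row ownership ranges, while the block
   stash of the BAIJ types would use block-row ranges, roughly: */
#if 0
  MatStashScatterBegin_Private(mat,&mat->stash,mat->rmap->range);  /* regular stash, scalar row ranges */
  MatStashScatterBegin_Private(mat,&mat->bstash,rangebs);          /* block stash; rangebs is an assumed block-row range array */
#endif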

466: static PetscErrorCode MatStashScatterBegin_Ref(Mat mat,MatStash *stash,PetscInt *owners)
467: {
468:   PetscInt           *owner,*startv,*starti,tag1=stash->tag1,tag2=stash->tag2,bs2;
469:   PetscInt           size=stash->size,nsends;
470:   PetscErrorCode     ierr;
471:   PetscInt           count,*sindices,**rindices,i,j,idx,lastidx,l;
472:   PetscScalar        **rvalues,*svalues;
473:   MPI_Comm           comm = stash->comm;
474:   MPI_Request        *send_waits,*recv_waits,*recv_waits1,*recv_waits2;
475:   PetscMPIInt        *sizes,*nlengths,nreceives;
476:   PetscInt           *sp_idx,*sp_idy;
477:   PetscScalar        *sp_val;
478:   PetscMatStashSpace space,space_next;

481:   {                             /* make sure all processors are either in INSERTMODE or ADDMODE */
482:     InsertMode addv;
483:     MPIU_Allreduce((PetscEnum*)&mat->insertmode,(PetscEnum*)&addv,1,MPIU_ENUM,MPI_BOR,PetscObjectComm((PetscObject)mat));
484:     if (addv == (ADD_VALUES|INSERT_VALUES)) SETERRQ(PetscObjectComm((PetscObject)mat),PETSC_ERR_ARG_WRONGSTATE,"Some processors inserted others added");
485:     mat->insertmode = addv; /* in case this processor had no cache */
486:   }

488:   bs2 = stash->bs*stash->bs;

490:   /*  first count number of contributors to each processor */
491:   PetscCalloc1(size,&nlengths);
492:   PetscMalloc1(stash->n+1,&owner);

494:   i       = j    = 0;
495:   lastidx = -1;
496:   space   = stash->space_head;
497:   while (space) {
498:     space_next = space->next;
499:     sp_idx     = space->idx;
500:     for (l=0; l<space->local_used; l++) {
501:       /* if indices are NOT locally sorted, need to start search at the beginning */
502:       if (lastidx > (idx = sp_idx[l])) j = 0;
503:       lastidx = idx;
504:       for (; j<size; j++) {
505:         if (idx >= owners[j] && idx < owners[j+1]) {
506:           nlengths[j]++; owner[i] = j; break;
507:         }
508:       }
509:       i++;
510:     }
511:     space = space_next;
512:   }

514:   /* Now check what procs get messages - and compute nsends. */
515:   PetscCalloc1(size,&sizes);
516:   for (i=0, nsends=0; i<size; i++) {
517:     if (nlengths[i]) {
518:       sizes[i] = 1; nsends++;
519:     }
520:   }

522:   {PetscMPIInt *onodes,*olengths;
 523:    /* Determine the number of messages to expect, their lengths, and the ranks they come from */
524:    PetscGatherNumberOfMessages(comm,sizes,nlengths,&nreceives);
525:    PetscGatherMessageLengths(comm,nsends,nreceives,nlengths,&onodes,&olengths);
 526:    /* since row and col indices are sent together (clubbed), the lengths are multiplied by 2 */
527:    for (i=0; i<nreceives; i++) olengths[i] *=2;
528:    PetscPostIrecvInt(comm,tag1,nreceives,onodes,olengths,&rindices,&recv_waits1);
 529:    /* values have 'bs2' entries each (and remove the earlier factor of 2) */
530:    for (i=0; i<nreceives; i++) olengths[i] = olengths[i]*bs2/2;
531:    PetscPostIrecvScalar(comm,tag2,nreceives,onodes,olengths,&rvalues,&recv_waits2);
532:    PetscFree(onodes);
533:    PetscFree(olengths);}

535:   /* do sends:
536:       1) starts[i] gives the starting index in svalues for stuff going to
537:          the ith processor
538:   */
539:   PetscMalloc2(bs2*stash->n,&svalues,2*(stash->n+1),&sindices);
540:   PetscMalloc1(2*nsends,&send_waits);
541:   PetscMalloc2(size,&startv,size,&starti);
 542:   /* use 2 sends: one with all the values (all_a), the other with all the row and column indices (all_i and all_j) */
543:   startv[0] = 0; starti[0] = 0;
544:   for (i=1; i<size; i++) {
545:     startv[i] = startv[i-1] + nlengths[i-1];
546:     starti[i] = starti[i-1] + 2*nlengths[i-1];
547:   }

549:   i     = 0;
550:   space = stash->space_head;
551:   while (space) {
552:     space_next = space->next;
553:     sp_idx     = space->idx;
554:     sp_idy     = space->idy;
555:     sp_val     = space->val;
556:     for (l=0; l<space->local_used; l++) {
557:       j = owner[i];
558:       if (bs2 == 1) {
559:         svalues[startv[j]] = sp_val[l];
560:       } else {
561:         PetscInt    k;
562:         PetscScalar *buf1,*buf2;
563:         buf1 = svalues+bs2*startv[j];
564:         buf2 = space->val + bs2*l;
565:         for (k=0; k<bs2; k++) buf1[k] = buf2[k];
566:       }
567:       sindices[starti[j]]             = sp_idx[l];
568:       sindices[starti[j]+nlengths[j]] = sp_idy[l];
569:       startv[j]++;
570:       starti[j]++;
571:       i++;
572:     }
573:     space = space_next;
574:   }
575:   startv[0] = 0;
576:   for (i=1; i<size; i++) startv[i] = startv[i-1] + nlengths[i-1];

578:   for (i=0,count=0; i<size; i++) {
579:     if (sizes[i]) {
580:       MPI_Isend(sindices+2*startv[i],2*nlengths[i],MPIU_INT,i,tag1,comm,send_waits+count++);
581:       MPI_Isend(svalues+bs2*startv[i],bs2*nlengths[i],MPIU_SCALAR,i,tag2,comm,send_waits+count++);
582:     }
583:   }
584: #if defined(PETSC_USE_INFO)
585:   PetscInfo1(NULL,"No of messages: %d \n",nsends);
586:   for (i=0; i<size; i++) {
587:     if (sizes[i]) {
588:       PetscInfo2(NULL,"Mesg_to: %d: size: %d bytes\n",i,nlengths[i]*(bs2*sizeof(PetscScalar)+2*sizeof(PetscInt)));
589:     }
590:   }
591: #endif
592:   PetscFree(nlengths);
593:   PetscFree(owner);
594:   PetscFree2(startv,starti);
595:   PetscFree(sizes);

597:   /* recv_waits need to be contiguous for MatStashScatterGetMesg_Private() */
598:   PetscMalloc1(2*nreceives,&recv_waits);

600:   for (i=0; i<nreceives; i++) {
601:     recv_waits[2*i]   = recv_waits1[i];
602:     recv_waits[2*i+1] = recv_waits2[i];
603:   }
604:   stash->recv_waits = recv_waits;

606:   PetscFree(recv_waits1);
607:   PetscFree(recv_waits2);

609:   stash->svalues         = svalues;
610:   stash->sindices        = sindices;
611:   stash->rvalues         = rvalues;
612:   stash->rindices        = rindices;
613:   stash->send_waits      = send_waits;
614:   stash->nsends          = nsends;
615:   stash->nrecvs          = nreceives;
616:   stash->reproduce_count = 0;
617:   return(0);
618: }

620: /*
621:    MatStashScatterGetMesg_Private - This function waits on the receives posted
622:    in the function MatStashScatterBegin_Private() and returns one message at
623:    a time to the calling function. If no messages are left, it indicates this
624:    by setting flg = 0, else it sets flg = 1.

626:    Input Parameters:
627:    stash - the stash

629:    Output Parameters:
630:    nvals - the number of entries in the current message.
631:    rows  - an array of row indices (or blocked indices) corresponding to the values
 632:    cols  - an array of column indices (or blocked indices) corresponding to the values
633:    vals  - the values
 634:    flg   - 0 indicates no more messages are left, and the current call has no values associated.
635:            1 indicates that the current call successfully received a message, and the
636:              other output parameters nvals,rows,cols,vals are set appropriately.
637: */
638: PetscErrorCode MatStashScatterGetMesg_Private(MatStash *stash,PetscMPIInt *nvals,PetscInt **rows,PetscInt **cols,PetscScalar **vals,PetscInt *flg)
639: {

643:   (*stash->ScatterGetMesg)(stash,nvals,rows,cols,vals,flg);
644:   return(0);
645: }
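/* Editorial consumer sketch (assumed pattern, not part of this file): during
   MatAssemblyEnd the stash is drained one message at a time until flg comes back
   0, after which the scatter is finalized. */
#if 0
  MatStashScatterGetMesg_Private(&mat->stash,&n,&row,&col,&val,&flg);
  while (flg) {
    /* insert the n received (row[j],col[j],val[j]) triples into the local matrix here */
    MatStashScatterGetMesg_Private(&mat->stash,&n,&row,&col,&val,&flg);
  }
  MatStashScatterEnd_Private(&mat->stash);
#endif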

647: PETSC_INTERN PetscErrorCode MatStashScatterGetMesg_Ref(MatStash *stash,PetscMPIInt *nvals,PetscInt **rows,PetscInt **cols,PetscScalar **vals,PetscInt *flg)
648: {
650:   PetscMPIInt    i,*flg_v = stash->flg_v,i1,i2;
651:   PetscInt       bs2;
652:   MPI_Status     recv_status;
653:   PetscBool      match_found = PETSC_FALSE;

656:   *flg = 0; /* When a message is discovered this is reset to 1 */
657:   /* Return if no more messages to process */
658:   if (stash->nprocessed == stash->nrecvs) return(0);

660:   bs2 = stash->bs*stash->bs;
 661:   /* If a matching pair of receives is found, process them and return the data to
 662:      the calling function. Until then, keep receiving messages */
663:   while (!match_found) {
664:     if (stash->reproduce) {
665:       i    = stash->reproduce_count++;
666:       MPI_Wait(stash->recv_waits+i,&recv_status);
667:     } else {
668:       MPI_Waitany(2*stash->nrecvs,stash->recv_waits,&i,&recv_status);
669:     }
670:     if (recv_status.MPI_SOURCE < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Negative MPI source!");

672:     /* Now pack the received message into a structure which is usable by others */
673:     if (i % 2) {
674:       MPI_Get_count(&recv_status,MPIU_SCALAR,nvals);

676:       flg_v[2*recv_status.MPI_SOURCE] = i/2;

678:       *nvals = *nvals/bs2;
679:     } else {
680:       MPI_Get_count(&recv_status,MPIU_INT,nvals);

682:       flg_v[2*recv_status.MPI_SOURCE+1] = i/2;

684:       *nvals = *nvals/2; /* This message has both row indices and col indices */
685:     }

687:     /* Check if we have both messages from this proc */
688:     i1 = flg_v[2*recv_status.MPI_SOURCE];
689:     i2 = flg_v[2*recv_status.MPI_SOURCE+1];
690:     if (i1 != -1 && i2 != -1) {
691:       *rows = stash->rindices[i2];
692:       *cols = *rows + *nvals;
693:       *vals = stash->rvalues[i1];
694:       *flg  = 1;
695:       stash->nprocessed++;
696:       match_found = PETSC_TRUE;
697:     }
698:   }
699:   return(0);
700: }

702: #if !defined(PETSC_HAVE_MPIUNI)
703: typedef struct {
704:   PetscInt row;
705:   PetscInt col;
706:   PetscScalar vals[1];          /* Actually an array of length bs2 */
707: } MatStashBlock;
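/* Editorial note: vals[1] is the usual C89-style trailing-array trick; a block
   really occupies stash->blocktype_size bytes (computed in MatStashBlockTypeSetUp()
   below), so block b of a packed byte buffer is reached by pointer arithmetic: */
#if 0
  MatStashBlock *block_b = (MatStashBlock*)&sendblocks[b*stash->blocktype_size];
#endif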

709: static PetscErrorCode MatStashSortCompress_Private(MatStash *stash,InsertMode insertmode)
710: {
712:   PetscMatStashSpace space;
713:   PetscInt n = stash->n,bs = stash->bs,bs2 = bs*bs,cnt,*row,*col,*perm,rowstart,i;
714:   PetscScalar **valptr;

717:   PetscMalloc4(n,&row,n,&col,n,&valptr,n,&perm);
718:   for (space=stash->space_head,cnt=0; space; space=space->next) {
719:     for (i=0; i<space->local_used; i++) {
720:       row[cnt] = space->idx[i];
721:       col[cnt] = space->idy[i];
722:       valptr[cnt] = &space->val[i*bs2];
723:       perm[cnt] = cnt;          /* Will tell us where to find valptr after sorting row[] and col[] */
724:       cnt++;
725:     }
726:   }
727:   if (cnt != n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MatStash n %D, but counted %D entries",n,cnt);
728:   PetscSortIntWithArrayPair(n,row,col,perm);
729:   /* Scan through the rows, sorting each one, combining duplicates, and packing send buffers */
730:   for (rowstart=0,cnt=0,i=1; i<=n; i++) {
731:     if (i == n || row[i] != row[rowstart]) {         /* Sort the last row. */
732:       PetscInt colstart;
733:       PetscSortIntWithArray(i-rowstart,&col[rowstart],&perm[rowstart]);
734:       for (colstart=rowstart; colstart<i;) { /* Compress multiple insertions to the same location */
735:         PetscInt j,l;
736:         MatStashBlock *block;
737:         PetscSegBufferGet(stash->segsendblocks,1,&block);
738:         block->row = row[rowstart];
739:         block->col = col[colstart];
740:         PetscArraycpy(block->vals,valptr[perm[colstart]],bs2);
741:         for (j=colstart+1; j<i && col[j] == col[colstart]; j++) { /* Add any extra stashed blocks at the same (row,col) */
742:           if (insertmode == ADD_VALUES) {
743:             for (l=0; l<bs2; l++) block->vals[l] += valptr[perm[j]][l];
744:           } else {
745:             PetscArraycpy(block->vals,valptr[perm[j]],bs2);
746:           }
747:         }
748:         colstart = j;
749:       }
750:       rowstart = i;
751:     }
752:   }
753:   PetscFree4(row,col,valptr,perm);
754:   return(0);
755: }
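/* Editorial worked example of the sort/compress above (illustrative, bs = 1,
   ADD_VALUES): stashed entries (row,col,val) = (5,2,1.0), (3,7,2.0), (5,2,0.5),
   (3,1,4.0) are sorted to (3,1,4.0), (3,7,2.0), (5,2,1.0), (5,2,0.5); the two
   insertions at (5,2) are combined, so segsendblocks ends up holding the three
   blocks (3,1,4.0), (3,7,2.0), (5,2,1.5), ordered by row and column. */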

757: static PetscErrorCode MatStashBlockTypeSetUp(MatStash *stash)
758: {

762:   if (stash->blocktype == MPI_DATATYPE_NULL) {
763:     PetscInt     bs2 = PetscSqr(stash->bs);
764:     PetscMPIInt  blocklens[2];
765:     MPI_Aint     displs[2];
766:     MPI_Datatype types[2],stype;
 767:     /* Note that DummyBlock is a type having standard layout, even when PetscScalar is C++ std::complex.
 768:        std::complex itself has standard layout, so DummyBlock does too, recursively.
 769:        To be compatible with C++ std::complex, complex implementations on GPUs must also have standard layout,
 770:        though they can have different alignment, e.g., 16 bytes for double complex, instead of 8 bytes as in GCC's libstdc++.
771:        offsetof(type, member) only requires type to have standard layout. Ref. https://en.cppreference.com/w/cpp/types/offsetof.

773:        We can test if std::complex has standard layout with the following code:
774:        #include <iostream>
775:        #include <type_traits>
776:        #include <complex>
777:        int main() {
778:          std::cout << std::boolalpha;
779:          std::cout << std::is_standard_layout<std::complex<double> >::value << '\n';
780:        }
781:        Output: true
782:      */
783:     struct DummyBlock {PetscInt row,col; PetscScalar vals;};

785:     stash->blocktype_size = offsetof(struct DummyBlock,vals) + bs2*sizeof(PetscScalar);
786:     if (stash->blocktype_size % sizeof(PetscInt)) { /* Implies that PetscInt is larger and does not satisfy alignment without padding */
787:       stash->blocktype_size += sizeof(PetscInt) - stash->blocktype_size % sizeof(PetscInt);
788:     }
789:     PetscSegBufferCreate(stash->blocktype_size,1,&stash->segsendblocks);
790:     PetscSegBufferCreate(stash->blocktype_size,1,&stash->segrecvblocks);
791:     PetscSegBufferCreate(sizeof(MatStashFrame),1,&stash->segrecvframe);
792:     blocklens[0] = 2;
793:     blocklens[1] = bs2;
794:     displs[0] = offsetof(struct DummyBlock,row);
795:     displs[1] = offsetof(struct DummyBlock,vals);
796:     types[0] = MPIU_INT;
797:     types[1] = MPIU_SCALAR;
798:     MPI_Type_create_struct(2,blocklens,displs,types,&stype);
799:     MPI_Type_commit(&stype);
800:     MPI_Type_create_resized(stype,0,stash->blocktype_size,&stash->blocktype); /* MPI-2 */
801:     MPI_Type_commit(&stash->blocktype);
802:     MPI_Type_free(&stype);
803:   }
804:   return(0);
805: }
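/* Editorial worked example of the sizes above (illustrative, assuming an 8-byte
   PetscInt, a real 8-byte PetscScalar, and bs = 2, so bs2 = 4):
   offsetof(struct DummyBlock,vals) = 16, hence blocktype_size = 16 + 4*8 = 48
   bytes, already a multiple of sizeof(PetscInt), so no padding is added; the
   resized MPI datatype then has extent 48 so that consecutive packed blocks in
   the send buffer line up with successive datatype elements. */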

 807: /* Callback invoked after the target rank has initiated its receive of the rendezvous message.
808:  * Here we post the main sends.
809:  */
810: static PetscErrorCode MatStashBTSSend_Private(MPI_Comm comm,const PetscMPIInt tag[],PetscMPIInt rankid,PetscMPIInt rank,void *sdata,MPI_Request req[],void *ctx)
811: {
812:   MatStash *stash = (MatStash*)ctx;
813:   MatStashHeader *hdr = (MatStashHeader*)sdata;

817:   if (rank != stash->sendranks[rankid]) SETERRQ3(comm,PETSC_ERR_PLIB,"BTS Send rank %d does not match sendranks[%d] %d",rank,rankid,stash->sendranks[rankid]);
818:   MPI_Isend(stash->sendframes[rankid].buffer,hdr->count,stash->blocktype,rank,tag[0],comm,&req[0]);
819:   stash->sendframes[rankid].count = hdr->count;
820:   stash->sendframes[rankid].pending = 1;
821:   return(0);
822: }

824: /* Callback invoked by target after receiving rendezvous message.
825:  * Here we post the main recvs.
826:  */
827: static PetscErrorCode MatStashBTSRecv_Private(MPI_Comm comm,const PetscMPIInt tag[],PetscMPIInt rank,void *rdata,MPI_Request req[],void *ctx)
828: {
829:   MatStash *stash = (MatStash*)ctx;
830:   MatStashHeader *hdr = (MatStashHeader*)rdata;
831:   MatStashFrame *frame;

835:   PetscSegBufferGet(stash->segrecvframe,1,&frame);
836:   PetscSegBufferGet(stash->segrecvblocks,hdr->count,&frame->buffer);
837:   MPI_Irecv(frame->buffer,hdr->count,stash->blocktype,rank,tag[0],comm,&req[0]);
838:   frame->count = hdr->count;
839:   frame->pending = 1;
840:   return(0);
841: }

843: /*
844:  * owners[] contains the ownership ranges; may be indexed by either blocks or scalars
845:  */
846: static PetscErrorCode MatStashScatterBegin_BTS(Mat mat,MatStash *stash,PetscInt owners[])
847: {
849:   size_t nblocks;
850:   char *sendblocks;

853:   if (PetscDefined(USE_DEBUG)) { /* make sure all processors are either in INSERTMODE or ADDMODE */
854:     InsertMode addv;
855:     MPIU_Allreduce((PetscEnum*)&mat->insertmode,(PetscEnum*)&addv,1,MPIU_ENUM,MPI_BOR,PetscObjectComm((PetscObject)mat));
856:     if (addv == (ADD_VALUES|INSERT_VALUES)) SETERRQ(PetscObjectComm((PetscObject)mat),PETSC_ERR_ARG_WRONGSTATE,"Some processors inserted others added");
857:   }

859:   MatStashBlockTypeSetUp(stash);
860:   MatStashSortCompress_Private(stash,mat->insertmode);
861:   PetscSegBufferGetSize(stash->segsendblocks,&nblocks);
862:   PetscSegBufferExtractInPlace(stash->segsendblocks,&sendblocks);
863:   if (stash->first_assembly_done) { /* Set up sendhdrs and sendframes for each rank that we sent before */
864:     PetscInt i;
865:     size_t b;
866:     for (i=0,b=0; i<stash->nsendranks; i++) {
867:       stash->sendframes[i].buffer = &sendblocks[b*stash->blocktype_size];
868:       /* sendhdr is never actually sent, but the count is used by MatStashBTSSend_Private */
869:       stash->sendhdr[i].count = 0; /* Might remain empty (in which case we send a zero-sized message) if no values are communicated to that process */
870:       for (; b<nblocks; b++) {
871:         MatStashBlock *sendblock_b = (MatStashBlock*)&sendblocks[b*stash->blocktype_size];
872:         if (PetscUnlikely(sendblock_b->row < owners[stash->sendranks[i]])) SETERRQ2(stash->comm,PETSC_ERR_ARG_WRONG,"MAT_SUBSET_OFF_PROC_ENTRIES set, but row %D owned by %d not communicated in initial assembly",sendblock_b->row,stash->sendranks[i]);
873:         if (sendblock_b->row >= owners[stash->sendranks[i]+1]) break;
874:         stash->sendhdr[i].count++;
875:       }
876:     }
877:   } else {                      /* Dynamically count and pack (first time) */
878:     PetscInt sendno;
879:     size_t i,rowstart;

881:     /* Count number of send ranks and allocate for sends */
882:     stash->nsendranks = 0;
883:     for (rowstart=0; rowstart<nblocks;) {
884:       PetscInt owner;
885:       MatStashBlock *sendblock_rowstart = (MatStashBlock*)&sendblocks[rowstart*stash->blocktype_size];
886:       PetscFindInt(sendblock_rowstart->row,stash->size+1,owners,&owner);
887:       if (owner < 0) owner = -(owner+2);
888:       for (i=rowstart+1; i<nblocks; i++) { /* Move forward through a run of blocks with the same owner */
889:         MatStashBlock *sendblock_i = (MatStashBlock*)&sendblocks[i*stash->blocktype_size];
890:         if (sendblock_i->row >= owners[owner+1]) break;
891:       }
892:       stash->nsendranks++;
893:       rowstart = i;
894:     }
895:     PetscMalloc3(stash->nsendranks,&stash->sendranks,stash->nsendranks,&stash->sendhdr,stash->nsendranks,&stash->sendframes);

897:     /* Set up sendhdrs and sendframes */
898:     sendno = 0;
899:     for (rowstart=0; rowstart<nblocks;) {
900:       PetscInt owner;
901:       MatStashBlock *sendblock_rowstart = (MatStashBlock*)&sendblocks[rowstart*stash->blocktype_size];
902:       PetscFindInt(sendblock_rowstart->row,stash->size+1,owners,&owner);
903:       if (owner < 0) owner = -(owner+2);
904:       stash->sendranks[sendno] = owner;
905:       for (i=rowstart+1; i<nblocks; i++) { /* Move forward through a run of blocks with the same owner */
906:         MatStashBlock *sendblock_i = (MatStashBlock*)&sendblocks[i*stash->blocktype_size];
907:         if (sendblock_i->row >= owners[owner+1]) break;
908:       }
909:       stash->sendframes[sendno].buffer = sendblock_rowstart;
910:       stash->sendframes[sendno].pending = 0;
911:       stash->sendhdr[sendno].count = i - rowstart;
912:       sendno++;
913:       rowstart = i;
914:     }
915:     if (sendno != stash->nsendranks) SETERRQ2(stash->comm,PETSC_ERR_PLIB,"BTS counted %D sendranks, but %D sends",stash->nsendranks,sendno);
916:   }

918:   /* Encode insertmode on the outgoing messages. If we want to support more than two options, we would need a new
919:    * message or a dummy entry of some sort. */
920:   if (mat->insertmode == INSERT_VALUES) {
921:     size_t i;
922:     for (i=0; i<nblocks; i++) {
923:       MatStashBlock *sendblock_i = (MatStashBlock*)&sendblocks[i*stash->blocktype_size];
924:       sendblock_i->row = -(sendblock_i->row+1);
925:     }
926:   }

928:   if (stash->first_assembly_done) {
929:     PetscMPIInt i,tag;
930:     PetscCommGetNewTag(stash->comm,&tag);
931:     for (i=0; i<stash->nrecvranks; i++) {
932:       MatStashBTSRecv_Private(stash->comm,&tag,stash->recvranks[i],&stash->recvhdr[i],&stash->recvreqs[i],stash);
933:     }
934:     for (i=0; i<stash->nsendranks; i++) {
935:       MatStashBTSSend_Private(stash->comm,&tag,i,stash->sendranks[i],&stash->sendhdr[i],&stash->sendreqs[i],stash);
936:     }
937:     stash->use_status = PETSC_TRUE; /* Use count from message status. */
938:   } else {
939:     PetscCommBuildTwoSidedFReq(stash->comm,1,MPIU_INT,stash->nsendranks,stash->sendranks,(PetscInt*)stash->sendhdr,
940:                                       &stash->nrecvranks,&stash->recvranks,(PetscInt*)&stash->recvhdr,1,&stash->sendreqs,&stash->recvreqs,
941:                                       MatStashBTSSend_Private,MatStashBTSRecv_Private,stash);
942:     PetscMalloc2(stash->nrecvranks,&stash->some_indices,stash->nrecvranks,&stash->some_statuses);
943:     stash->use_status = PETSC_FALSE; /* Use count from header instead of from message. */
944:   }

946:   PetscSegBufferExtractInPlace(stash->segrecvframe,&stash->recvframes);
947:   stash->recvframe_active     = NULL;
948:   stash->recvframe_i          = 0;
949:   stash->some_i               = 0;
950:   stash->some_count           = 0;
951:   stash->recvcount            = 0;
952:   stash->first_assembly_done  = mat->assembly_subset; /* See the same logic in VecAssemblyBegin_MPI_BTS */
953:   stash->insertmode           = &mat->insertmode;
954:   return(0);
955: }

957: static PetscErrorCode MatStashScatterGetMesg_BTS(MatStash *stash,PetscMPIInt *n,PetscInt **row,PetscInt **col,PetscScalar **val,PetscInt *flg)
958: {
960:   MatStashBlock *block;

963:   *flg = 0;
964:   while (!stash->recvframe_active || stash->recvframe_i == stash->recvframe_count) {
965:     if (stash->some_i == stash->some_count) {
966:       if (stash->recvcount == stash->nrecvranks) return(0); /* Done */
967:       MPI_Waitsome(stash->nrecvranks,stash->recvreqs,&stash->some_count,stash->some_indices,stash->use_status?stash->some_statuses:MPI_STATUSES_IGNORE);
968:       stash->some_i = 0;
969:     }
970:     stash->recvframe_active = &stash->recvframes[stash->some_indices[stash->some_i]];
971:     stash->recvframe_count = stash->recvframe_active->count; /* From header; maximum count */
972:     if (stash->use_status) { /* Count what was actually sent */
973:       MPI_Get_count(&stash->some_statuses[stash->some_i],stash->blocktype,&stash->recvframe_count);
974:     }
975:     if (stash->recvframe_count > 0) { /* Check for InsertMode consistency */
976:       block = (MatStashBlock*)&((char*)stash->recvframe_active->buffer)[0];
977:       if (PetscUnlikely(*stash->insertmode == NOT_SET_VALUES)) *stash->insertmode = block->row < 0 ? INSERT_VALUES : ADD_VALUES;
978:       if (PetscUnlikely(*stash->insertmode == INSERT_VALUES && block->row >= 0)) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Assembling INSERT_VALUES, but rank %d requested ADD_VALUES",stash->recvranks[stash->some_indices[stash->some_i]]);
979:       if (PetscUnlikely(*stash->insertmode == ADD_VALUES && block->row < 0)) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Assembling ADD_VALUES, but rank %d requested INSERT_VALUES",stash->recvranks[stash->some_indices[stash->some_i]]);
980:     }
981:     stash->some_i++;
982:     stash->recvcount++;
983:     stash->recvframe_i = 0;
984:   }
985:   *n = 1;
986:   block = (MatStashBlock*)&((char*)stash->recvframe_active->buffer)[stash->recvframe_i*stash->blocktype_size];
987:   if (block->row < 0) block->row = -(block->row + 1);
988:   *row = &block->row;
989:   *col = &block->col;
990:   *val = block->vals;
991:   stash->recvframe_i++;
992:   *flg = 1;
993:   return(0);
994: }

996: static PetscErrorCode MatStashScatterEnd_BTS(MatStash *stash)
997: {

1001:   MPI_Waitall(stash->nsendranks,stash->sendreqs,MPI_STATUSES_IGNORE);
1002:   if (stash->first_assembly_done) { /* Reuse the communication contexts, so consolidate and reset segrecvblocks  */
1003:     void *dummy;
1004:     PetscSegBufferExtractInPlace(stash->segrecvblocks,&dummy);
1005:   } else {                      /* No reuse, so collect everything. */
1006:     MatStashScatterDestroy_BTS(stash);
1007:   }

1009:   /* Now update oldnmax to be approximately 10% more than the maximum n used, so that the
1010:      wastage of space is reduced the next time this stash is used.
1011:      Only update oldnmax if it increases */
1012:   if (stash->n) {
1013:     PetscInt bs2     = stash->bs*stash->bs;
1014:     PetscInt oldnmax = ((int)(stash->n * 1.1) + 5)*bs2;
1015:     if (oldnmax > stash->oldnmax) stash->oldnmax = oldnmax;
1016:   }

1018:   stash->nmax       = 0;
1019:   stash->n          = 0;
1020:   stash->reallocs   = -1;
1021:   stash->nprocessed = 0;

1023:   PetscMatStashSpaceDestroy(&stash->space_head);

1025:   stash->space = NULL;

1027:   return(0);
1028: }

1030: PetscErrorCode MatStashScatterDestroy_BTS(MatStash *stash)
1031: {

1035:   PetscSegBufferDestroy(&stash->segsendblocks);
1036:   PetscSegBufferDestroy(&stash->segrecvframe);
1037:   stash->recvframes = NULL;
1038:   PetscSegBufferDestroy(&stash->segrecvblocks);
1039:   if (stash->blocktype != MPI_DATATYPE_NULL) {
1040:     MPI_Type_free(&stash->blocktype);
1041:   }
1042:   stash->nsendranks = 0;
1043:   stash->nrecvranks = 0;
1044:   PetscFree3(stash->sendranks,stash->sendhdr,stash->sendframes);
1045:   PetscFree(stash->sendreqs);
1046:   PetscFree(stash->recvreqs);
1047:   PetscFree(stash->recvranks);
1048:   PetscFree(stash->recvhdr);
1049:   PetscFree2(stash->some_indices,stash->some_statuses);
1050:   return(0);
1051: }
1052: #endif