Actual source code: gs.c

petsc-3.7.7 2017-09-25
Report Typos and Errors
  2: /***********************************gs.c***************************************

  4: Author: Henry M. Tufo III

  6: e-mail: hmt@cs.brown.edu

  8: snail-mail:
  9: Division of Applied Mathematics
 10: Brown University
 11: Providence, RI 02912

 13: Last Modification:
 14: 6.21.97
 15: ************************************gs.c**************************************/

 17: /***********************************gs.c***************************************
 18: File Description:
 19: -----------------

 21: ************************************gs.c**************************************/

 23: #include <../src/ksp/pc/impls/tfs/tfs.h>

 25: /* default length of number of items via tree - doubles if exceeded */
 26: #define TREE_BUF_SZ 2048;
 27: #define GS_VEC_SZ   1



 31: /***********************************gs.c***************************************
 32: Type: struct gather_scatter_id
 33: ------------------------------

 35: ************************************gs.c**************************************/
 36: typedef struct gather_scatter_id {
 37:   PetscInt    id;
 38:   PetscInt    nel_min;
 39:   PetscInt    nel_max;
 40:   PetscInt    nel_sum;
 41:   PetscInt    negl;
 42:   PetscInt    gl_max;
 43:   PetscInt    gl_min;
 44:   PetscInt    repeats;
 45:   PetscInt    ordered;
 46:   PetscInt    positive;
 47:   PetscScalar *vals;

 49:   /* bit mask info */
 50:   PetscInt *my_proc_mask;
 51:   PetscInt mask_sz;
 52:   PetscInt *ngh_buf;
 53:   PetscInt ngh_buf_sz;
 54:   PetscInt *nghs;
 55:   PetscInt num_nghs;
 56:   PetscInt max_nghs;
 57:   PetscInt *pw_nghs;
 58:   PetscInt num_pw_nghs;
 59:   PetscInt *tree_nghs;
 60:   PetscInt num_tree_nghs;

 62:   PetscInt num_loads;

 64:   /* repeats == true -> local info */
 65:   PetscInt nel;         /* number of unique elememts */
 66:   PetscInt *elms;       /* of size nel */
 67:   PetscInt nel_total;
 68:   PetscInt *local_elms; /* of size nel_total */
 69:   PetscInt *companion;  /* of size nel_total */

 71:   /* local info */
 72:   PetscInt num_local_total;
 73:   PetscInt local_strength;
 74:   PetscInt num_local;
 75:   PetscInt *num_local_reduce;
 76:   PetscInt **local_reduce;
 77:   PetscInt num_local_gop;
 78:   PetscInt *num_gop_local_reduce;
 79:   PetscInt **gop_local_reduce;

 81:   /* pairwise info */
 82:   PetscInt    level;
 83:   PetscInt    num_pairs;
 84:   PetscInt    max_pairs;
 85:   PetscInt    loc_node_pairs;
 86:   PetscInt    max_node_pairs;
 87:   PetscInt    min_node_pairs;
 88:   PetscInt    avg_node_pairs;
 89:   PetscInt    *pair_list;
 90:   PetscInt    *msg_sizes;
 91:   PetscInt    **node_list;
 92:   PetscInt    len_pw_list;
 93:   PetscInt    *pw_elm_list;
 94:   PetscScalar *pw_vals;

 96:   MPI_Request *msg_ids_in;
 97:   MPI_Request *msg_ids_out;

 99:   PetscScalar *out;
100:   PetscScalar *in;
101:   PetscInt    msg_total;

103:   /* tree - crystal accumulator info */
104:   PetscInt max_left_over;
105:   PetscInt *pre;
106:   PetscInt *in_num;
107:   PetscInt *out_num;
108:   PetscInt **in_list;
109:   PetscInt **out_list;

111:   /* new tree work*/
112:   PetscInt    tree_nel;
113:   PetscInt    *tree_elms;
114:   PetscScalar *tree_buf;
115:   PetscScalar *tree_work;

117:   PetscInt tree_map_sz;
118:   PetscInt *tree_map_in;
119:   PetscInt *tree_map_out;

121:   /* current memory status */
122:   PetscInt gl_bss_min;
123:   PetscInt gl_perm_min;

125:   /* max segment size for PCTFS_gs_gop_vec() */
126:   PetscInt vec_sz;

128:   /* hack to make paul happy */
129:   MPI_Comm PCTFS_gs_comm;

131: } PCTFS_gs_id;

133: static PCTFS_gs_id *gsi_check_args(PetscInt *elms, PetscInt nel, PetscInt level);
134: static PetscErrorCode gsi_via_bit_mask(PCTFS_gs_id *gs);
135: static PetscErrorCode get_ngh_buf(PCTFS_gs_id *gs);
136: static PetscErrorCode set_pairwise(PCTFS_gs_id *gs);
137: static PCTFS_gs_id *gsi_new(void);
138: static PetscErrorCode set_tree(PCTFS_gs_id *gs);

140: /* same for all but vector flavor */
141: static PetscErrorCode PCTFS_gs_gop_local_out(PCTFS_gs_id *gs, PetscScalar *vals);
142: /* vector flavor */
143: static PetscErrorCode PCTFS_gs_gop_vec_local_out(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);

145: static PetscErrorCode PCTFS_gs_gop_vec_plus(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt step);
146: static PetscErrorCode PCTFS_gs_gop_vec_pairwise_plus(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt step);
147: static PetscErrorCode PCTFS_gs_gop_vec_local_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);
148: static PetscErrorCode PCTFS_gs_gop_vec_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);
149: static PetscErrorCode PCTFS_gs_gop_vec_tree_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);


152: static PetscErrorCode PCTFS_gs_gop_local_plus(PCTFS_gs_id *gs, PetscScalar *vals);
153: static PetscErrorCode PCTFS_gs_gop_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals);

155: static PetscErrorCode PCTFS_gs_gop_plus_hc(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt dim);
156: static PetscErrorCode PCTFS_gs_gop_pairwise_plus_hc(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt dim);
157: static PetscErrorCode PCTFS_gs_gop_tree_plus_hc(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt dim);

159: /* global vars */
160: /* from comm.c module */

162: static PetscInt num_gs_ids = 0;

164: /* should make this dynamic ... later */
165: static PetscInt msg_buf    =MAX_MSG_BUF;
166: static PetscInt vec_sz     =GS_VEC_SZ;
167: static PetscInt *tree_buf  =NULL;
168: static PetscInt tree_buf_sz=0;
169: static PetscInt ntree      =0;

171: /***************************************************************************/
172: PetscErrorCode PCTFS_gs_init_vec_sz(PetscInt size)
173: {
175:   vec_sz = size;
176:   return(0);
177: }

179: /******************************************************************************/
180: PetscErrorCode PCTFS_gs_init_msg_buf_sz(PetscInt buf_size)
181: {
183:   msg_buf = buf_size;
184:   return(0);
185: }

187: /******************************************************************************/
188: PCTFS_gs_id *PCTFS_gs_init(PetscInt *elms, PetscInt nel, PetscInt level)
189: {
190:   PCTFS_gs_id    *gs;
191:   MPI_Group      PCTFS_gs_group;
192:   MPI_Comm       PCTFS_gs_comm;

196:   /* ensure that communication package has been initialized */
197:   PCTFS_comm_init();


200:   /* determines if we have enough dynamic/semi-static memory */
201:   /* checks input, allocs and sets gd_id template            */
202:   gs = gsi_check_args(elms,nel,level);

204:   /* only bit mask version up and working for the moment    */
205:   /* LATER :: get int list version working for sparse pblms */
206:   gsi_via_bit_mask(gs);CHKERRABORT(PETSC_COMM_WORLD,ierr);


209:   MPI_Comm_group(MPI_COMM_WORLD,&PCTFS_gs_group);CHKERRABORT(PETSC_COMM_WORLD,ierr);
210:   MPI_Comm_create(MPI_COMM_WORLD,PCTFS_gs_group,&PCTFS_gs_comm);CHKERRABORT(PETSC_COMM_WORLD,ierr);

212:   gs->PCTFS_gs_comm=PCTFS_gs_comm;

214:   return(gs);
215: }

217: /******************************************************************************/
218: static PCTFS_gs_id *gsi_new(void)
219: {
221:   PCTFS_gs_id    *gs;
222:   gs   = (PCTFS_gs_id*) malloc(sizeof(PCTFS_gs_id));
223:   PetscMemzero(gs,sizeof(PCTFS_gs_id));CHKERRABORT(PETSC_COMM_WORLD,ierr);
224:   return(gs);
225: }

227: /******************************************************************************/
228: static PCTFS_gs_id *gsi_check_args(PetscInt *in_elms, PetscInt nel, PetscInt level)
229: {
230:   PetscInt       i, j, k, t2;
231:   PetscInt       *companion, *elms, *unique, *iptr;
232:   PetscInt       num_local=0, *num_to_reduce, **local_reduce;
233:   PetscInt       oprs[]   = {NON_UNIFORM,GL_MIN,GL_MAX,GL_ADD,GL_MIN,GL_MAX,GL_MIN,GL_B_AND};
234:   PetscInt       vals[sizeof(oprs)/sizeof(oprs[0])-1];
235:   PetscInt       work[sizeof(oprs)/sizeof(oprs[0])-1];
236:   PCTFS_gs_id    *gs;


240:   if (!in_elms) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"elms point to nothing!!!\n");
241:   if (nel<0)    SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"can't have fewer than 0 elms!!!\n");

243:   if (nel==0) { PetscInfo(0,"I don't have any elements!!!\n");CHKERRABORT(PETSC_COMM_WORLD,ierr); }

245:   /* get space for gs template */
246:   gs     = gsi_new();
247:   gs->id = ++num_gs_ids;

249:   /* hmt 6.4.99                                            */
250:   /* caller can set global ids that don't participate to 0 */
251:   /* PCTFS_gs_init ignores all zeros in elm list                 */
252:   /* negative global ids are still invalid                 */
253:   for (i=j=0; i<nel; i++) {
254:     if (in_elms[i]!=0) j++;
255:   }

257:   k=nel; nel=j;

259:   /* copy over in_elms list and create inverse map */
260:   elms      = (PetscInt*) malloc((nel+1)*sizeof(PetscInt));
261:   companion = (PetscInt*) malloc(nel*sizeof(PetscInt));

263:   for (i=j=0; i<k; i++) {
264:     if (in_elms[i]!=0) { elms[j] = in_elms[i]; companion[j++] = i; }
265:   }

267:   if (j!=nel) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"nel j mismatch!\n");

269:   /* pre-pass ... check to see if sorted */
270:   elms[nel] = INT_MAX;
271:   iptr      = elms;
272:   unique    = elms+1;
273:   j         =0;
274:   while (*iptr!=INT_MAX) {
275:     if (*iptr++>*unique++) { j=1; break; }
276:   }

278:   /* set up inverse map */
279:   if (j) {
280:     PetscInfo(0,"gsi_check_args() :: elm list *not* sorted!\n");CHKERRABORT(PETSC_COMM_WORLD,ierr);
281:     PCTFS_SMI_sort((void*)elms, (void*)companion, nel, SORT_INTEGER);CHKERRABORT(PETSC_COMM_WORLD,ierr);
282:   } else { PetscInfo(0,"gsi_check_args() :: elm list sorted!\n");CHKERRABORT(PETSC_COMM_WORLD,ierr); }
283:   elms[nel] = INT_MIN;

285:   /* first pass */
286:   /* determine number of unique elements, check pd */
287:   for (i=k=0; i<nel; i+=j) {
288:     t2 = elms[i];
289:     j  = ++i;

291:     /* clump 'em for now */
292:     while (elms[j]==t2) j++;

294:     /* how many together and num local */
295:     if (j-=i) { num_local++; k+=j; }
296:   }

298:   /* how many unique elements? */
299:   gs->repeats = k;
300:   gs->nel     = nel-k;


303:   /* number of repeats? */
304:   gs->num_local        = num_local;
305:   num_local           += 2;
306:   gs->local_reduce     = local_reduce=(PetscInt**)malloc(num_local*sizeof(PetscInt*));
307:   gs->num_local_reduce = num_to_reduce=(PetscInt*) malloc(num_local*sizeof(PetscInt));

309:   unique         = (PetscInt*) malloc((gs->nel+1)*sizeof(PetscInt));
310:   gs->elms       = unique;
311:   gs->nel_total  = nel;
312:   gs->local_elms = elms;
313:   gs->companion  = companion;

315:   /* compess map as well as keep track of local ops */
316:   for (num_local=i=j=0; i<gs->nel; i++) {
317:     k            = j;
318:     t2           = unique[i] = elms[j];
319:     companion[i] = companion[j];

321:     while (elms[j]==t2) j++;

323:     if ((t2=(j-k))>1) {
324:       /* number together */
325:       num_to_reduce[num_local] = t2++;

327:       iptr = local_reduce[num_local++] = (PetscInt*)malloc(t2*sizeof(PetscInt));

329:       /* to use binary searching don't remap until we check intersection */
330:       *iptr++ = i;

332:       /* note that we're skipping the first one */
333:       while (++k<j) *(iptr++) = companion[k];
334:       *iptr = -1;
335:     }
336:   }

338:   /* sentinel for ngh_buf */
339:   unique[gs->nel]=INT_MAX;

341:   /* for two partition sort hack */
342:   num_to_reduce[num_local]   = 0;
343:   local_reduce[num_local]    = NULL;
344:   num_to_reduce[++num_local] = 0;
345:   local_reduce[num_local]    = NULL;

347:   /* load 'em up */
348:   /* note one extra to hold NON_UNIFORM flag!!! */
349:   vals[2] = vals[1] = vals[0] = nel;
350:   if (gs->nel>0) {
351:     vals[3] = unique[0];
352:     vals[4] = unique[gs->nel-1];
353:   } else {
354:     vals[3] = INT_MAX;
355:     vals[4] = INT_MIN;
356:   }
357:   vals[5] = level;
358:   vals[6] = num_gs_ids;

360:   /* GLOBAL: send 'em out */
361:   PCTFS_giop(vals,work,sizeof(oprs)/sizeof(oprs[0])-1,oprs);CHKERRABORT(PETSC_COMM_WORLD,ierr);

363:   /* must be semi-pos def - only pairwise depends on this */
364:   /* LATER - remove this restriction */
365:   if (vals[3]<0) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"gsi_check_args() :: system not semi-pos def \n");
366:   if (vals[4]==INT_MAX) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"gsi_check_args() :: system ub too large !\n");

368:   gs->nel_min = vals[0];
369:   gs->nel_max = vals[1];
370:   gs->nel_sum = vals[2];
371:   gs->gl_min  = vals[3];
372:   gs->gl_max  = vals[4];
373:   gs->negl    = vals[4]-vals[3]+1;

375:   if (gs->negl<=0) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"gsi_check_args() :: system empty or neg :: %d\n");

377:   /* LATER :: add level == -1 -> program selects level */
378:   if (vals[5]<0) vals[5]=0;
379:   else if (vals[5]>PCTFS_num_nodes) vals[5]=PCTFS_num_nodes;
380:   gs->level = vals[5];

382:   return(gs);
383: }

385: /******************************************************************************/
386: static PetscErrorCode gsi_via_bit_mask(PCTFS_gs_id *gs)
387: {
388:   PetscInt       i, nel, *elms;
389:   PetscInt       t1;
390:   PetscInt       **reduce;
391:   PetscInt       *map;

395:   /* totally local removes ... PCTFS_ct_bits == 0 */
396:   get_ngh_buf(gs);

398:   if (gs->level) set_pairwise(gs);
399:   if (gs->max_left_over) set_tree(gs);

401:   /* intersection local and pairwise/tree? */
402:   gs->num_local_total      = gs->num_local;
403:   gs->gop_local_reduce     = gs->local_reduce;
404:   gs->num_gop_local_reduce = gs->num_local_reduce;

406:   map = gs->companion;

408:   /* is there any local compression */
409:   if (!gs->num_local) {
410:     gs->local_strength = NONE;
411:     gs->num_local_gop  = 0;
412:   } else {
413:     /* ok find intersection */
414:     map    = gs->companion;
415:     reduce = gs->local_reduce;
416:     for (i=0, t1=0; i<gs->num_local; i++, reduce++)
417:     {
418:       if ((PCTFS_ivec_binary_search(**reduce,gs->pw_elm_list,gs->len_pw_list)>=0)
419:           ||
420:           PCTFS_ivec_binary_search(**reduce,gs->tree_map_in,gs->tree_map_sz)>=0) {
421:         t1++;
422:         if (gs->num_local_reduce[i]<=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"nobody in list?");
423:         gs->num_local_reduce[i] *= -1;
424:       }
425:       **reduce=map[**reduce];
426:     }

428:     /* intersection is empty */
429:     if (!t1) {
430:       gs->local_strength = FULL;
431:       gs->num_local_gop  = 0;
432:     } else { /* intersection not empty */
433:       gs->local_strength = PARTIAL;

435:       PCTFS_SMI_sort((void*)gs->num_local_reduce, (void*)gs->local_reduce, gs->num_local + 1, SORT_INT_PTR);

437:       gs->num_local_gop        = t1;
438:       gs->num_local_total      =  gs->num_local;
439:       gs->num_local           -= t1;
440:       gs->gop_local_reduce     = gs->local_reduce;
441:       gs->num_gop_local_reduce = gs->num_local_reduce;

443:       for (i=0; i<t1; i++) {
444:         if (gs->num_gop_local_reduce[i]>=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"they aren't negative?");
445:         gs->num_gop_local_reduce[i] *= -1;
446:         gs->local_reduce++;
447:         gs->num_local_reduce++;
448:       }
449:       gs->local_reduce++;
450:       gs->num_local_reduce++;
451:     }
452:   }

454:   elms = gs->pw_elm_list;
455:   nel  = gs->len_pw_list;
456:   for (i=0; i<nel; i++) elms[i] = map[elms[i]];

458:   elms = gs->tree_map_in;
459:   nel  = gs->tree_map_sz;
460:   for (i=0; i<nel; i++) elms[i] = map[elms[i]];

462:   /* clean up */
463:   free((void*) gs->local_elms);
464:   free((void*) gs->companion);
465:   free((void*) gs->elms);
466:   free((void*) gs->ngh_buf);
467:   gs->local_elms = gs->companion = gs->elms = gs->ngh_buf = NULL;
468:   return(0);
469: }

471: /******************************************************************************/
472: static PetscErrorCode place_in_tree(PetscInt elm)
473: {
474:   PetscInt *tp, n;

477:   if (ntree==tree_buf_sz) {
478:     if (tree_buf_sz) {
479:       tp           = tree_buf;
480:       n            = tree_buf_sz;
481:       tree_buf_sz<<=1;
482:       tree_buf     = (PetscInt*)malloc(tree_buf_sz*sizeof(PetscInt));
483:       PCTFS_ivec_copy(tree_buf,tp,n);
484:       free(tp);
485:     } else {
486:       tree_buf_sz = TREE_BUF_SZ;
487:       tree_buf    = (PetscInt*)malloc(tree_buf_sz*sizeof(PetscInt));
488:     }
489:   }

491:   tree_buf[ntree++] = elm;
492:   return(0);
493: }

495: /******************************************************************************/
496: static PetscErrorCode get_ngh_buf(PCTFS_gs_id *gs)
497: {
498:   PetscInt       i, j, npw=0, ntree_map=0;
499:   PetscInt       p_mask_size, ngh_buf_size, buf_size;
500:   PetscInt       *p_mask, *sh_proc_mask, *pw_sh_proc_mask;
501:   PetscInt       *ngh_buf, *buf1, *buf2;
502:   PetscInt       offset, per_load, num_loads, or_ct, start, end;
503:   PetscInt       *ptr1, *ptr2, i_start, negl, nel, *elms;
504:   PetscInt       oper=GL_B_OR;
505:   PetscInt       *ptr3, *t_mask, level, ct1, ct2;

509:   /* to make life easier */
510:   nel   = gs->nel;
511:   elms  = gs->elms;
512:   level = gs->level;

514:   /* det #bytes needed for processor bit masks and init w/mask cor. to PCTFS_my_id */
515:   p_mask = (PetscInt*) malloc(p_mask_size=PCTFS_len_bit_mask(PCTFS_num_nodes));
516:   PCTFS_set_bit_mask(p_mask,p_mask_size,PCTFS_my_id);

518:   /* allocate space for masks and info bufs */
519:   gs->nghs       = sh_proc_mask = (PetscInt*) malloc(p_mask_size);
520:   gs->pw_nghs    = pw_sh_proc_mask = (PetscInt*) malloc(p_mask_size);
521:   gs->ngh_buf_sz = ngh_buf_size = p_mask_size*nel;
522:   t_mask         = (PetscInt*) malloc(p_mask_size);
523:   gs->ngh_buf    = ngh_buf = (PetscInt*) malloc(ngh_buf_size);

525:   /* comm buffer size ... memory usage bounded by ~2*msg_buf */
526:   /* had thought I could exploit rendezvous threshold */

528:   /* default is one pass */
529:   per_load      = negl  = gs->negl;
530:   gs->num_loads = num_loads = 1;
531:   i             = p_mask_size*negl;

533:   /* possible overflow on buffer size */
534:   /* overflow hack                    */
535:   if (i<0) i=INT_MAX;

537:   buf_size = PetscMin(msg_buf,i);

539:   /* can we do it? */
540:   if (p_mask_size>buf_size) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"get_ngh_buf() :: buf<pms :: %d>%d\n",p_mask_size,buf_size);

542:   /* get PCTFS_giop buf space ... make *only* one malloc */
543:   buf1 = (PetscInt*) malloc(buf_size<<1);

545:   /* more than one gior exchange needed? */
546:   if (buf_size!=i) {
547:     per_load      = buf_size/p_mask_size;
548:     buf_size      = per_load*p_mask_size;
549:     gs->num_loads = num_loads = negl/per_load + (negl%per_load>0);
550:   }


553:   /* convert buf sizes from #bytes to #ints - 32 bit only! */
554:   p_mask_size/=sizeof(PetscInt); ngh_buf_size/=sizeof(PetscInt); buf_size/=sizeof(PetscInt);

556:   /* find PCTFS_giop work space */
557:   buf2 = buf1+buf_size;

559:   /* hold #ints needed for processor masks */
560:   gs->mask_sz=p_mask_size;

562:   /* init buffers */
563:   PCTFS_ivec_zero(sh_proc_mask,p_mask_size);
564:   PCTFS_ivec_zero(pw_sh_proc_mask,p_mask_size);
565:   PCTFS_ivec_zero(ngh_buf,ngh_buf_size);

567:   /* HACK reset tree info */
568:   tree_buf    = NULL;
569:   tree_buf_sz = ntree = 0;

571:   /* ok do it */
572:   for (ptr1=ngh_buf,ptr2=elms,end=gs->gl_min,or_ct=i=0; or_ct<num_loads; or_ct++) {
573:     /* identity for bitwise or is 000...000 */
574:     PCTFS_ivec_zero(buf1,buf_size);

576:     /* load msg buffer */
577:     for (start=end,end+=per_load,i_start=i; (offset=*ptr2)<end; i++, ptr2++) {
578:       offset = (offset-start)*p_mask_size;
579:       PCTFS_ivec_copy(buf1+offset,p_mask,p_mask_size);
580:     }

582:     /* GLOBAL: pass buffer */
583:     PCTFS_giop(buf1,buf2,buf_size,&oper);


586:     /* unload buffer into ngh_buf */
587:     ptr2=(elms+i_start);
588:     for (ptr3=buf1,j=start; j<end; ptr3+=p_mask_size,j++) {
589:       /* I own it ... may have to pairwise it */
590:       if (j==*ptr2) {
591:         /* do i share it w/anyone? */
592:         ct1 = PCTFS_ct_bits((char*)ptr3,p_mask_size*sizeof(PetscInt));
593:         /* guess not */
594:         if (ct1<2) { ptr2++; ptr1+=p_mask_size; continue; }

596:         /* i do ... so keep info and turn off my bit */
597:         PCTFS_ivec_copy(ptr1,ptr3,p_mask_size);
598:         PCTFS_ivec_xor(ptr1,p_mask,p_mask_size);
599:         PCTFS_ivec_or(sh_proc_mask,ptr1,p_mask_size);

601:         /* is it to be done pairwise? */
602:         if (--ct1<=level) {
603:           npw++;

605:           /* turn on high bit to indicate pw need to process */
606:           *ptr2++ |= TOP_BIT;
607:           PCTFS_ivec_or(pw_sh_proc_mask,ptr1,p_mask_size);
608:           ptr1    += p_mask_size;
609:           continue;
610:         }

612:         /* get set for next and note that I have a tree contribution */
613:         /* could save exact elm index for tree here -> save a search */
614:         ptr2++; ptr1+=p_mask_size; ntree_map++;
615:       } else { /* i don't but still might be involved in tree */

617:         /* shared by how many? */
618:         ct1 = PCTFS_ct_bits((char*)ptr3,p_mask_size*sizeof(PetscInt));

620:         /* none! */
621:         if (ct1<2) continue;

623:         /* is it going to be done pairwise? but not by me of course!*/
624:         if (--ct1<=level) continue;
625:       }
626:       /* LATER we're going to have to process it NOW */
627:       /* nope ... tree it */
628:       place_in_tree(j);
629:     }
630:   }

632:   free((void*)t_mask);
633:   free((void*)buf1);

635:   gs->len_pw_list = npw;
636:   gs->num_nghs    = PCTFS_ct_bits((char*)sh_proc_mask,p_mask_size*sizeof(PetscInt));

638:   /* expand from bit mask list to int list and save ngh list */
639:   gs->nghs = (PetscInt*) malloc(gs->num_nghs * sizeof(PetscInt));
640:   PCTFS_bm_to_proc((char*)sh_proc_mask,p_mask_size*sizeof(PetscInt),gs->nghs);

642:   gs->num_pw_nghs = PCTFS_ct_bits((char*)pw_sh_proc_mask,p_mask_size*sizeof(PetscInt));

644:   oper         = GL_MAX;
645:   ct1          = gs->num_nghs;
646:   PCTFS_giop(&ct1,&ct2,1,&oper);
647:   gs->max_nghs = ct1;

649:   gs->tree_map_sz  = ntree_map;
650:   gs->max_left_over=ntree;

652:   free((void*)p_mask);
653:   free((void*)sh_proc_mask);
654:   return(0);
655: }

657: /******************************************************************************/
658: static PetscErrorCode set_pairwise(PCTFS_gs_id *gs)
659: {
660:   PetscInt       i, j;
661:   PetscInt       p_mask_size;
662:   PetscInt       *p_mask, *sh_proc_mask, *tmp_proc_mask;
663:   PetscInt       *ngh_buf, *buf2;
664:   PetscInt       offset;
665:   PetscInt       *msg_list, *msg_size, **msg_nodes, nprs;
666:   PetscInt       *pairwise_elm_list, len_pair_list=0;
667:   PetscInt       *iptr, t1, i_start, nel, *elms;
668:   PetscInt       ct;

672:   /* to make life easier */
673:   nel          = gs->nel;
674:   elms         = gs->elms;
675:   ngh_buf      = gs->ngh_buf;
676:   sh_proc_mask = gs->pw_nghs;

678:   /* need a few temp masks */
679:   p_mask_size   = PCTFS_len_bit_mask(PCTFS_num_nodes);
680:   p_mask        = (PetscInt*) malloc(p_mask_size);
681:   tmp_proc_mask = (PetscInt*) malloc(p_mask_size);

683:   /* set mask to my PCTFS_my_id's bit mask */
684:   PCTFS_set_bit_mask(p_mask,p_mask_size,PCTFS_my_id);

686:   p_mask_size /= sizeof(PetscInt);

688:   len_pair_list   = gs->len_pw_list;
689:   gs->pw_elm_list = pairwise_elm_list=(PetscInt*)malloc((len_pair_list+1)*sizeof(PetscInt));

691:   /* how many processors (nghs) do we have to exchange with? */
692:   nprs = gs->num_pairs = PCTFS_ct_bits((char*)sh_proc_mask,p_mask_size*sizeof(PetscInt));


695:   /* allocate space for PCTFS_gs_gop() info */
696:   gs->pair_list = msg_list  = (PetscInt*)  malloc(sizeof(PetscInt)*nprs);
697:   gs->msg_sizes = msg_size  = (PetscInt*)  malloc(sizeof(PetscInt)*nprs);
698:   gs->node_list = msg_nodes = (PetscInt**) malloc(sizeof(PetscInt*)*(nprs+1));

700:   /* init msg_size list */
701:   PCTFS_ivec_zero(msg_size,nprs);

703:   /* expand from bit mask list to int list */
704:   PCTFS_bm_to_proc((char*)sh_proc_mask,p_mask_size*sizeof(PetscInt),msg_list);

706:   /* keep list of elements being handled pairwise */
707:   for (i=j=0; i<nel; i++) {
708:     if (elms[i] & TOP_BIT) { elms[i] ^= TOP_BIT; pairwise_elm_list[j++] = i; }
709:   }
710:   pairwise_elm_list[j] = -1;

712:   gs->msg_ids_out       = (MPI_Request*)  malloc(sizeof(MPI_Request)*(nprs+1));
713:   gs->msg_ids_out[nprs] = MPI_REQUEST_NULL;
714:   gs->msg_ids_in        = (MPI_Request*)  malloc(sizeof(MPI_Request)*(nprs+1));
715:   gs->msg_ids_in[nprs]  = MPI_REQUEST_NULL;
716:   gs->pw_vals           = (PetscScalar*) malloc(sizeof(PetscScalar)*len_pair_list*vec_sz);

718:   /* find who goes to each processor */
719:   for (i_start=i=0; i<nprs; i++) {
720:     /* processor i's mask */
721:     PCTFS_set_bit_mask(p_mask,p_mask_size*sizeof(PetscInt),msg_list[i]);

723:     /* det # going to processor i */
724:     for (ct=j=0; j<len_pair_list; j++) {
725:       buf2 = ngh_buf+(pairwise_elm_list[j]*p_mask_size);
726:       PCTFS_ivec_and3(tmp_proc_mask,p_mask,buf2,p_mask_size);
727:       if (PCTFS_ct_bits((char*)tmp_proc_mask,p_mask_size*sizeof(PetscInt))) ct++;
728:     }
729:     msg_size[i] = ct;
730:     i_start     = PetscMax(i_start,ct);

732:     /*space to hold nodes in message to first neighbor */
733:     msg_nodes[i] = iptr = (PetscInt*) malloc(sizeof(PetscInt)*(ct+1));

735:     for (j=0;j<len_pair_list;j++) {
736:       buf2 = ngh_buf+(pairwise_elm_list[j]*p_mask_size);
737:       PCTFS_ivec_and3(tmp_proc_mask,p_mask,buf2,p_mask_size);
738:       if (PCTFS_ct_bits((char*)tmp_proc_mask,p_mask_size*sizeof(PetscInt))) *iptr++ = j;
739:     }
740:     *iptr = -1;
741:   }
742:   msg_nodes[nprs] = NULL;

744:   j                  = gs->loc_node_pairs=i_start;
745:   t1                 = GL_MAX;
746:   PCTFS_giop(&i_start,&offset,1,&t1);
747:   gs->max_node_pairs = i_start;

749:   i_start            = j;
750:   t1                 = GL_MIN;
751:   PCTFS_giop(&i_start,&offset,1,&t1);
752:   gs->min_node_pairs = i_start;

754:   i_start            = j;
755:   t1                 = GL_ADD;
756:   PCTFS_giop(&i_start,&offset,1,&t1);
757:   gs->avg_node_pairs = i_start/PCTFS_num_nodes + 1;

759:   i_start = nprs;
760:   t1      = GL_MAX;
761:   PCTFS_giop(&i_start,&offset,1,&t1);
762:   gs->max_pairs = i_start;


765:   /* remap pairwise in tail of gsi_via_bit_mask() */
766:   gs->msg_total = PCTFS_ivec_sum(gs->msg_sizes,nprs);
767:   gs->out       = (PetscScalar*) malloc(sizeof(PetscScalar)*gs->msg_total*vec_sz);
768:   gs->in        = (PetscScalar*) malloc(sizeof(PetscScalar)*gs->msg_total*vec_sz);

770:   /* reset malloc pool */
771:   free((void*)p_mask);
772:   free((void*)tmp_proc_mask);
773:   return(0);
774: }

776: /* to do pruned tree just save ngh buf copy for each one and decode here!
777: ******************************************************************************/
778: static PetscErrorCode set_tree(PCTFS_gs_id *gs)
779: {
780:   PetscInt i, j, n, nel;
781:   PetscInt *iptr_in, *iptr_out, *tree_elms, *elms;

784:   /* local work ptrs */
785:   elms = gs->elms;
786:   nel  = gs->nel;

788:   /* how many via tree */
789:   gs->tree_nel     = n = ntree;
790:   gs->tree_elms    = tree_elms = iptr_in = tree_buf;
791:   gs->tree_buf     = (PetscScalar*) malloc(sizeof(PetscScalar)*n*vec_sz);
792:   gs->tree_work    = (PetscScalar*) malloc(sizeof(PetscScalar)*n*vec_sz);
793:   j                = gs->tree_map_sz;
794:   gs->tree_map_in  = iptr_in  = (PetscInt*) malloc(sizeof(PetscInt)*(j+1));
795:   gs->tree_map_out = iptr_out = (PetscInt*) malloc(sizeof(PetscInt)*(j+1));

797:   /* search the longer of the two lists */
798:   /* note ... could save this info in get_ngh_buf and save searches */
799:   if (n<=nel) {
800:     /* bijective fct w/remap - search elm list */
801:     for (i=0; i<n; i++) {
802:       if ((j=PCTFS_ivec_binary_search(*tree_elms++,elms,nel))>=0) {*iptr_in++ = j; *iptr_out++ = i;}
803:     }
804:   } else {
805:     for (i=0; i<nel; i++) {
806:       if ((j=PCTFS_ivec_binary_search(*elms++,tree_elms,n))>=0) {*iptr_in++ = i; *iptr_out++ = j;}
807:     }
808:   }

810:   /* sentinel */
811:   *iptr_in = *iptr_out = -1;
812:   return(0);
813: }

815: /******************************************************************************/
816: static PetscErrorCode PCTFS_gs_gop_local_out(PCTFS_gs_id *gs,  PetscScalar *vals)
817: {
818:   PetscInt    *num, *map, **reduce;
819:   PetscScalar tmp;

822:   num    = gs->num_gop_local_reduce;
823:   reduce = gs->gop_local_reduce;
824:   while ((map = *reduce++)) {
825:     /* wall */
826:     if (*num == 2) {
827:       num++;
828:       vals[map[1]] = vals[map[0]];
829:     } else if (*num == 3) { /* corner shared by three elements */
830:       num++;
831:       vals[map[2]] = vals[map[1]] = vals[map[0]];
832:     } else if (*num == 4) { /* corner shared by four elements */
833:       num++;
834:       vals[map[3]] = vals[map[2]] = vals[map[1]] = vals[map[0]];
835:     } else { /* general case ... odd geoms ... 3D*/
836:       num++;
837:       tmp = *(vals + *map++);
838:       while (*map >= 0) *(vals + *map++) = tmp;
839:     }
840:   }
841:   return(0);
842: }

844: /******************************************************************************/
845: static PetscErrorCode PCTFS_gs_gop_local_plus(PCTFS_gs_id *gs,  PetscScalar *vals)
846: {
847:   PetscInt    *num, *map, **reduce;
848:   PetscScalar tmp;

851:   num    = gs->num_local_reduce;
852:   reduce = gs->local_reduce;
853:   while ((map = *reduce)) {
854:     /* wall */
855:     if (*num == 2) {
856:       num++; reduce++;
857:       vals[map[1]] = vals[map[0]] += vals[map[1]];
858:     } else if (*num == 3) { /* corner shared by three elements */
859:       num++; reduce++;
860:       vals[map[2]]=vals[map[1]]=vals[map[0]]+=(vals[map[1]]+vals[map[2]]);
861:     } else if (*num == 4) { /* corner shared by four elements */
862:       num++; reduce++;
863:       vals[map[1]]=vals[map[2]]=vals[map[3]]=vals[map[0]] += (vals[map[1]] + vals[map[2]] + vals[map[3]]);
864:     } else { /* general case ... odd geoms ... 3D*/
865:       num++;
866:       tmp = 0.0;
867:       while (*map >= 0) tmp += *(vals + *map++);

869:       map = *reduce++;
870:       while (*map >= 0) *(vals + *map++) = tmp;
871:     }
872:   }
873:   return(0);
874: }

876: /******************************************************************************/
877: static PetscErrorCode PCTFS_gs_gop_local_in_plus(PCTFS_gs_id *gs,  PetscScalar *vals)
878: {
879:   PetscInt    *num, *map, **reduce;
880:   PetscScalar *base;

883:   num    = gs->num_gop_local_reduce;
884:   reduce = gs->gop_local_reduce;
885:   while ((map = *reduce++)) {
886:     /* wall */
887:     if (*num == 2) {
888:       num++;
889:       vals[map[0]] += vals[map[1]];
890:     } else if (*num == 3) { /* corner shared by three elements */
891:       num++;
892:       vals[map[0]] += (vals[map[1]] + vals[map[2]]);
893:     } else if (*num == 4) { /* corner shared by four elements */
894:       num++;
895:       vals[map[0]] += (vals[map[1]] + vals[map[2]] + vals[map[3]]);
896:     } else { /* general case ... odd geoms ... 3D*/
897:       num++;
898:       base = vals + *map++;
899:       while (*map >= 0) *base += *(vals + *map++);
900:     }
901:   }
902:   return(0);
903: }

905: /******************************************************************************/
906: PetscErrorCode PCTFS_gs_free(PCTFS_gs_id *gs)
907: {
908:   PetscInt i;

911:   if (gs->nghs) free((void*) gs->nghs);
912:   if (gs->pw_nghs) free((void*) gs->pw_nghs);

914:   /* tree */
915:   if (gs->max_left_over) {
916:     if (gs->tree_elms) free((void*) gs->tree_elms);
917:     if (gs->tree_buf) free((void*) gs->tree_buf);
918:     if (gs->tree_work) free((void*) gs->tree_work);
919:     if (gs->tree_map_in) free((void*) gs->tree_map_in);
920:     if (gs->tree_map_out) free((void*) gs->tree_map_out);
921:   }

923:   /* pairwise info */
924:   if (gs->num_pairs) {
925:     /* should be NULL already */
926:     if (gs->ngh_buf) free((void*) gs->ngh_buf);
927:     if (gs->elms) free((void*) gs->elms);
928:     if (gs->local_elms) free((void*) gs->local_elms);
929:     if (gs->companion) free((void*) gs->companion);

931:     /* only set if pairwise */
932:     if (gs->vals) free((void*) gs->vals);
933:     if (gs->in) free((void*) gs->in);
934:     if (gs->out) free((void*) gs->out);
935:     if (gs->msg_ids_in) free((void*) gs->msg_ids_in);
936:     if (gs->msg_ids_out) free((void*) gs->msg_ids_out);
937:     if (gs->pw_vals) free((void*) gs->pw_vals);
938:     if (gs->pw_elm_list) free((void*) gs->pw_elm_list);
939:     if (gs->node_list) {
940:       for (i=0;i<gs->num_pairs;i++) {
941:         if (gs->node_list[i])  {
942:           free((void*) gs->node_list[i]);
943:         }
944:       }
945:       free((void*) gs->node_list);
946:     }
947:     if (gs->msg_sizes) free((void*) gs->msg_sizes);
948:     if (gs->pair_list) free((void*) gs->pair_list);
949:   }

951:   /* local info */
952:   if (gs->num_local_total>=0) {
953:     /*      for (i=0;i<gs->num_local_total;i++) */
954:     for (i=0;i<gs->num_local_total+1;i++) {
955:       if (gs->num_gop_local_reduce[i]) free((void*) gs->gop_local_reduce[i]);
956:     }
957:   }

959:   /* if intersection tree/pairwise and local isn't empty */
960:   if (gs->gop_local_reduce) free((void*) gs->gop_local_reduce);
961:   if (gs->num_gop_local_reduce) free((void*) gs->num_gop_local_reduce);

963:   free((void*) gs);
964:   return(0);
965: }

967: /******************************************************************************/
968: PetscErrorCode PCTFS_gs_gop_vec(PCTFS_gs_id *gs,  PetscScalar *vals,  const char *op,  PetscInt step)
969: {

973:   switch (*op) {
974:   case '+':
975:     PCTFS_gs_gop_vec_plus(gs,vals,step);
976:     break;
977:   default:
978:     PetscInfo1(0,"PCTFS_gs_gop_vec() :: %c is not a valid op\n",op[0]);
979:     PetscInfo(0,"PCTFS_gs_gop_vec() :: default :: plus\n");
980:     PCTFS_gs_gop_vec_plus(gs,vals,step);
981:     break;
982:   }
983:   return(0);
984: }

986: /******************************************************************************/
987: static PetscErrorCode PCTFS_gs_gop_vec_plus(PCTFS_gs_id *gs,  PetscScalar *vals,  PetscInt step)
988: {
990:   if (!gs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"PCTFS_gs_gop_vec() passed NULL gs handle!!!");

992:   /* local only operations!!! */
993:   if (gs->num_local) PCTFS_gs_gop_vec_local_plus(gs,vals,step);

995:   /* if intersection tree/pairwise and local isn't empty */
996:   if (gs->num_local_gop) {
997:     PCTFS_gs_gop_vec_local_in_plus(gs,vals,step);

999:     /* pairwise */
1000:     if (gs->num_pairs) PCTFS_gs_gop_vec_pairwise_plus(gs,vals,step);

1002:     /* tree */
1003:     else if (gs->max_left_over) PCTFS_gs_gop_vec_tree_plus(gs,vals,step);

1005:     PCTFS_gs_gop_vec_local_out(gs,vals,step);
1006:   } else { /* if intersection tree/pairwise and local is empty */
1007:     /* pairwise */
1008:     if (gs->num_pairs) PCTFS_gs_gop_vec_pairwise_plus(gs,vals,step);

1010:     /* tree */
1011:     else if (gs->max_left_over) PCTFS_gs_gop_vec_tree_plus(gs,vals,step);
1012:   }
1013:   return(0);
1014: }

1016: /******************************************************************************/
1017: static PetscErrorCode PCTFS_gs_gop_vec_local_plus(PCTFS_gs_id *gs,  PetscScalar *vals, PetscInt step)
1018: {
1019:   PetscInt    *num, *map, **reduce;
1020:   PetscScalar *base;

1023:   num    = gs->num_local_reduce;
1024:   reduce = gs->local_reduce;
1025:   while ((map = *reduce)) {
1026:     base = vals + map[0] * step;

1028:     /* wall */
1029:     if (*num == 2) {
1030:       num++; reduce++;
1031:       PCTFS_rvec_add (base,vals+map[1]*step,step);
1032:       PCTFS_rvec_copy(vals+map[1]*step,base,step);
1033:     } else if (*num == 3) { /* corner shared by three elements */
1034:       num++; reduce++;
1035:       PCTFS_rvec_add (base,vals+map[1]*step,step);
1036:       PCTFS_rvec_add (base,vals+map[2]*step,step);
1037:       PCTFS_rvec_copy(vals+map[2]*step,base,step);
1038:       PCTFS_rvec_copy(vals+map[1]*step,base,step);
1039:     } else if (*num == 4) { /* corner shared by four elements */
1040:       num++; reduce++;
1041:       PCTFS_rvec_add (base,vals+map[1]*step,step);
1042:       PCTFS_rvec_add (base,vals+map[2]*step,step);
1043:       PCTFS_rvec_add (base,vals+map[3]*step,step);
1044:       PCTFS_rvec_copy(vals+map[3]*step,base,step);
1045:       PCTFS_rvec_copy(vals+map[2]*step,base,step);
1046:       PCTFS_rvec_copy(vals+map[1]*step,base,step);
1047:     } else { /* general case ... odd geoms ... 3D */
1048:       num++;
1049:       while (*++map >= 0) PCTFS_rvec_add (base,vals+*map*step,step);

1051:       map = *reduce;
1052:       while (*++map >= 0) PCTFS_rvec_copy(vals+*map*step,base,step);

1054:       reduce++;
1055:     }
1056:   }
1057:   return(0);
1058: }

1060: /******************************************************************************/
1061: static PetscErrorCode PCTFS_gs_gop_vec_local_in_plus(PCTFS_gs_id *gs,  PetscScalar *vals, PetscInt step)
1062: {
1063:   PetscInt    *num, *map, **reduce;
1064:   PetscScalar *base;

1067:   num    = gs->num_gop_local_reduce;
1068:   reduce = gs->gop_local_reduce;
1069:   while ((map = *reduce++)) {
1070:     base = vals + map[0] * step;

1072:     /* wall */
1073:     if (*num == 2) {
1074:       num++;
1075:       PCTFS_rvec_add(base,vals+map[1]*step,step);
1076:     } else if (*num == 3) { /* corner shared by three elements */
1077:       num++;
1078:       PCTFS_rvec_add(base,vals+map[1]*step,step);
1079:       PCTFS_rvec_add(base,vals+map[2]*step,step);
1080:     } else if (*num == 4) { /* corner shared by four elements */
1081:       num++;
1082:       PCTFS_rvec_add(base,vals+map[1]*step,step);
1083:       PCTFS_rvec_add(base,vals+map[2]*step,step);
1084:       PCTFS_rvec_add(base,vals+map[3]*step,step);
1085:     } else { /* general case ... odd geoms ... 3D*/
1086:       num++;
1087:       while (*++map >= 0) PCTFS_rvec_add(base,vals+*map*step,step);
1088:     }
1089:   }
1090:   return(0);
1091: }

1093: /******************************************************************************/
1094: static PetscErrorCode PCTFS_gs_gop_vec_local_out(PCTFS_gs_id *gs,  PetscScalar *vals, PetscInt step)
1095: {
1096:   PetscInt    *num, *map, **reduce;
1097:   PetscScalar *base;

1100:   num    = gs->num_gop_local_reduce;
1101:   reduce = gs->gop_local_reduce;
1102:   while ((map = *reduce++)) {
1103:     base = vals + map[0] * step;

1105:     /* wall */
1106:     if (*num == 2) {
1107:       num++;
1108:       PCTFS_rvec_copy(vals+map[1]*step,base,step);
1109:     } else if (*num == 3) { /* corner shared by three elements */
1110:       num++;
1111:       PCTFS_rvec_copy(vals+map[1]*step,base,step);
1112:       PCTFS_rvec_copy(vals+map[2]*step,base,step);
1113:     } else if (*num == 4) { /* corner shared by four elements */
1114:       num++;
1115:       PCTFS_rvec_copy(vals+map[1]*step,base,step);
1116:       PCTFS_rvec_copy(vals+map[2]*step,base,step);
1117:       PCTFS_rvec_copy(vals+map[3]*step,base,step);
1118:     } else { /* general case ... odd geoms ... 3D*/
1119:       num++;
1120:       while (*++map >= 0) PCTFS_rvec_copy(vals+*map*step,base,step);
1121:     }
1122:   }
1123:   return(0);
1124: }

1126: /******************************************************************************/
1127: static PetscErrorCode PCTFS_gs_gop_vec_pairwise_plus(PCTFS_gs_id *gs,  PetscScalar *in_vals, PetscInt step)
1128: {
1129:   PetscScalar    *dptr1, *dptr2, *dptr3, *in1, *in2;
1130:   PetscInt       *iptr, *msg_list, *msg_size, **msg_nodes;
1131:   PetscInt       *pw, *list, *size, **nodes;
1132:   MPI_Request    *msg_ids_in, *msg_ids_out, *ids_in, *ids_out;
1133:   MPI_Status     status;
1134:   PetscBLASInt   i1 = 1,dstep;

1138:   /* strip and load s */
1139:   msg_list    = list     = gs->pair_list;
1140:   msg_size    = size     = gs->msg_sizes;
1141:   msg_nodes   = nodes    = gs->node_list;
1142:   iptr        = pw       = gs->pw_elm_list;
1143:   dptr1       = dptr3    = gs->pw_vals;
1144:   msg_ids_in  = ids_in   = gs->msg_ids_in;
1145:   msg_ids_out = ids_out  = gs->msg_ids_out;
1146:   dptr2                  = gs->out;
1147:   in1=in2                = gs->in;

1149:   /* post the receives */
1150:   /*  msg_nodes=nodes; */
1151:   do {
1152:     /* Should MPI_ANY_SOURCE be replaced by *list ? In that case do the
1153:         second one *list and do list++ afterwards */
1154:     MPI_Irecv(in1, *size *step, MPIU_SCALAR, MPI_ANY_SOURCE, MSGTAG1 + *list, gs->PCTFS_gs_comm, msg_ids_in);
1155:     list++;msg_ids_in++;
1156:     in1 += *size++ *step;
1157:   } while (*++msg_nodes);
1158:   msg_nodes=nodes;

1160:   /* load gs values into in out gs buffers */
1161:   while (*iptr >= 0) {
1162:     PCTFS_rvec_copy(dptr3,in_vals + *iptr*step,step);
1163:     dptr3+=step;
1164:     iptr++;
1165:   }

1167:   /* load out buffers and post the sends */
1168:   while ((iptr = *msg_nodes++)) {
1169:     dptr3 = dptr2;
1170:     while (*iptr >= 0) {
1171:       PCTFS_rvec_copy(dptr2,dptr1 + *iptr*step,step);
1172:       dptr2+=step;
1173:       iptr++;
1174:     }
1175:     MPI_Isend(dptr3, *msg_size *step, MPIU_SCALAR, *msg_list, MSGTAG1+PCTFS_my_id, gs->PCTFS_gs_comm, msg_ids_out);
1176:     msg_size++; msg_list++;msg_ids_out++;
1177:   }

1179:   /* tree */
1180:   if (gs->max_left_over) PCTFS_gs_gop_vec_tree_plus(gs,in_vals,step);

1182:   /* process the received data */
1183:   msg_nodes=nodes;
1184:   while ((iptr = *nodes++)) {
1185:     PetscScalar d1 = 1.0;

1187:     /* Should I check the return value of MPI_Wait() or status? */
1188:     /* Can this loop be replaced by a call to MPI_Waitall()? */
1189:     MPI_Wait(ids_in, &status);
1190:     ids_in++;
1191:     while (*iptr >= 0) {
1192:       PetscBLASIntCast(step,&dstep);
1193:       PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&dstep,&d1,in2,&i1,dptr1 + *iptr*step,&i1));
1194:       in2+=step;
1195:       iptr++;
1196:     }
1197:   }

1199:   /* replace vals */
1200:   while (*pw >= 0) {
1201:     PCTFS_rvec_copy(in_vals + *pw*step,dptr1,step);
1202:     dptr1+=step;
1203:     pw++;
1204:   }

1206:   /* clear isend message handles */
1207:   /* This changed for clarity though it could be the same */

1209:   /* Should I check the return value of MPI_Wait() or status? */
1210:   /* Can this loop be replaced by a call to MPI_Waitall()? */
1211:   while (*msg_nodes++) {
1212:     MPI_Wait(ids_out, &status);
1213:     ids_out++;
1214:   }
1215:   return(0);
1216: }

1218: /******************************************************************************/
1219: static PetscErrorCode PCTFS_gs_gop_vec_tree_plus(PCTFS_gs_id *gs,  PetscScalar *vals,  PetscInt step)
1220: {
1221:   PetscInt       size, *in, *out;
1222:   PetscScalar    *buf, *work;
1223:   PetscInt       op[] = {GL_ADD,0};
1224:   PetscBLASInt   i1   = 1;
1226:   PetscBLASInt   dstep;

1229:   /* copy over to local variables */
1230:   in   = gs->tree_map_in;
1231:   out  = gs->tree_map_out;
1232:   buf  = gs->tree_buf;
1233:   work = gs->tree_work;
1234:   size = gs->tree_nel*step;

1236:   /* zero out collection buffer */
1237:   PCTFS_rvec_zero(buf,size);


1240:   /* copy over my contributions */
1241:   while (*in >= 0) {
1242:     PetscBLASIntCast(step,&dstep);
1243:     PetscStackCallBLAS("BLAScopy",BLAScopy_(&dstep,vals + *in++ * step,&i1,buf + *out++ * step,&i1));
1244:   }

1246:   /* perform fan in/out on full buffer */
1247:   /* must change PCTFS_grop to handle the blas */
1248:   PCTFS_grop(buf,work,size,op);

1250:   /* reset */
1251:   in  = gs->tree_map_in;
1252:   out = gs->tree_map_out;

1254:   /* get the portion of the results I need */
1255:   while (*in >= 0) {
1256:     PetscBLASIntCast(step,&dstep);
1257:     PetscStackCallBLAS("BLAScopy",BLAScopy_(&dstep,buf + *out++ * step,&i1,vals + *in++ * step,&i1));
1258:   }
1259:   return(0);
1260: }

1262: /******************************************************************************/
1263: PetscErrorCode PCTFS_gs_gop_hc(PCTFS_gs_id *gs,  PetscScalar *vals,  const char *op,  PetscInt dim)
1264: {

1268:   switch (*op) {
1269:   case '+':
1270:     PCTFS_gs_gop_plus_hc(gs,vals,dim);
1271:     break;
1272:   default:
1273:     PetscInfo1(0,"PCTFS_gs_gop_hc() :: %c is not a valid op\n",op[0]);
1274:     PetscInfo(0,"PCTFS_gs_gop_hc() :: default :: plus\n");
1275:     PCTFS_gs_gop_plus_hc(gs,vals,dim);
1276:     break;
1277:   }
1278:   return(0);
1279: }

1281: /******************************************************************************/
1282: static PetscErrorCode PCTFS_gs_gop_plus_hc(PCTFS_gs_id *gs,  PetscScalar *vals, PetscInt dim)
1283: {
1285:   /* if there's nothing to do return */
1286:   if (dim<=0) return(0);

1288:   /* can't do more dimensions then exist */
1289:   dim = PetscMin(dim,PCTFS_i_log2_num_nodes);

1291:   /* local only operations!!! */
1292:   if (gs->num_local) PCTFS_gs_gop_local_plus(gs,vals);

1294:   /* if intersection tree/pairwise and local isn't empty */
1295:   if (gs->num_local_gop) {
1296:     PCTFS_gs_gop_local_in_plus(gs,vals);

1298:     /* pairwise will do tree inside ... */
1299:     if (gs->num_pairs) PCTFS_gs_gop_pairwise_plus_hc(gs,vals,dim); /* tree only */
1300:     else if (gs->max_left_over) PCTFS_gs_gop_tree_plus_hc(gs,vals,dim);

1302:     PCTFS_gs_gop_local_out(gs,vals);
1303:   } else { /* if intersection tree/pairwise and local is empty */
1304:     /* pairwise will do tree inside */
1305:     if (gs->num_pairs) PCTFS_gs_gop_pairwise_plus_hc(gs,vals,dim); /* tree */
1306:     else if (gs->max_left_over) PCTFS_gs_gop_tree_plus_hc(gs,vals,dim);
1307:   }
1308:   return(0);
1309: }

1311: /******************************************************************************/
1312: static PetscErrorCode PCTFS_gs_gop_pairwise_plus_hc(PCTFS_gs_id *gs,  PetscScalar *in_vals, PetscInt dim)
1313: {
1314:   PetscScalar    *dptr1, *dptr2, *dptr3, *in1, *in2;
1315:   PetscInt       *iptr, *msg_list, *msg_size, **msg_nodes;
1316:   PetscInt       *pw, *list, *size, **nodes;
1317:   MPI_Request    *msg_ids_in, *msg_ids_out, *ids_in, *ids_out;
1318:   MPI_Status     status;
1319:   PetscInt       i, mask=1;

1323:   for (i=1; i<dim; i++) { mask<<=1; mask++; }

1325:   /* strip and load s */
1326:   msg_list    = list     = gs->pair_list;
1327:   msg_size    = size     = gs->msg_sizes;
1328:   msg_nodes   = nodes    = gs->node_list;
1329:   iptr        = pw       = gs->pw_elm_list;
1330:   dptr1       = dptr3    = gs->pw_vals;
1331:   msg_ids_in  = ids_in   = gs->msg_ids_in;
1332:   msg_ids_out = ids_out  = gs->msg_ids_out;
1333:   dptr2       = gs->out;
1334:   in1         = in2      = gs->in;

1336:   /* post the receives */
1337:   /*  msg_nodes=nodes; */
1338:   do {
1339:     /* Should MPI_ANY_SOURCE be replaced by *list ? In that case do the
1340:         second one *list and do list++ afterwards */
1341:     if ((PCTFS_my_id|mask)==(*list|mask)) {
1342:       MPI_Irecv(in1, *size, MPIU_SCALAR, MPI_ANY_SOURCE, MSGTAG1 + *list, gs->PCTFS_gs_comm, msg_ids_in);
1343:       list++; msg_ids_in++;in1 += *size++;
1344:     } else { list++; size++; }
1345:   } while (*++msg_nodes);

1347:   /* load gs values into in out gs buffers */
1348:   while (*iptr >= 0) *dptr3++ = *(in_vals + *iptr++);

1350:   /* load out buffers and post the sends */
1351:   msg_nodes=nodes;
1352:   list     = msg_list;
1353:   while ((iptr = *msg_nodes++)) {
1354:     if ((PCTFS_my_id|mask)==(*list|mask)) {
1355:       dptr3 = dptr2;
1356:       while (*iptr >= 0) *dptr2++ = *(dptr1 + *iptr++);
1357:       /* CHECK PERSISTENT COMMS MODE FOR ALL THIS STUFF */
1358:       /* is msg_ids_out++ correct? */
1359:       MPI_Isend(dptr3, *msg_size, MPIU_SCALAR, *list, MSGTAG1+PCTFS_my_id, gs->PCTFS_gs_comm, msg_ids_out);
1360:       msg_size++;list++;msg_ids_out++;
1361:     } else {list++; msg_size++;}
1362:   }

1364:   /* do the tree while we're waiting */
1365:   if (gs->max_left_over) PCTFS_gs_gop_tree_plus_hc(gs,in_vals,dim);

1367:   /* process the received data */
1368:   msg_nodes=nodes;
1369:   list     = msg_list;
1370:   while ((iptr = *nodes++)) {
1371:     if ((PCTFS_my_id|mask)==(*list|mask)) {
1372:       /* Should I check the return value of MPI_Wait() or status? */
1373:       /* Can this loop be replaced by a call to MPI_Waitall()? */
1374:       MPI_Wait(ids_in, &status);
1375:       ids_in++;
1376:       while (*iptr >= 0) *(dptr1 + *iptr++) += *in2++;
1377:     }
1378:     list++;
1379:   }

1381:   /* replace vals */
1382:   while (*pw >= 0) *(in_vals + *pw++) = *dptr1++;

1384:   /* clear isend message handles */
1385:   /* This changed for clarity though it could be the same */
1386:   while (*msg_nodes++) {
1387:     if ((PCTFS_my_id|mask)==(*msg_list|mask)) {
1388:       /* Should I check the return value of MPI_Wait() or status? */
1389:       /* Can this loop be replaced by a call to MPI_Waitall()? */
1390:       MPI_Wait(ids_out, &status);
1391:       ids_out++;
1392:     }
1393:     msg_list++;
1394:   }
1395:   return(0);
1396: }

1398: /******************************************************************************/
1399: static PetscErrorCode PCTFS_gs_gop_tree_plus_hc(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt dim)
1400: {
1401:   PetscInt    size;
1402:   PetscInt    *in, *out;
1403:   PetscScalar *buf, *work;
1404:   PetscInt    op[] = {GL_ADD,0};

1407:   in   = gs->tree_map_in;
1408:   out  = gs->tree_map_out;
1409:   buf  = gs->tree_buf;
1410:   work = gs->tree_work;
1411:   size = gs->tree_nel;

1413:   PCTFS_rvec_zero(buf,size);

1415:   while (*in >= 0) *(buf + *out++) = *(vals + *in++);

1417:   in  = gs->tree_map_in;
1418:   out = gs->tree_map_out;

1420:   PCTFS_grop_hc(buf,work,size,op,dim);

1422:   while (*in >= 0) *(vals + *in++) = *(buf + *out++);
1423:   return(0);
1424: }