Actual source code: gs.c

petsc-3.3-p7 2013-05-11
  2: /***********************************gs.c***************************************

  4: Author: Henry M. Tufo III

  6: e-mail: hmt@cs.brown.edu

  8: snail-mail:
  9: Division of Applied Mathematics
 10: Brown University
 11: Providence, RI 02912

 13: Last Modification: 
 14: 6.21.97
 15: ************************************gs.c**************************************/

 17: /***********************************gs.c***************************************
 18: File Description:
 19: -----------------

 21: ************************************gs.c**************************************/

 23: #include <../src/ksp/pc/impls/tfs/tfs.h>

 25: /* default length of number of items via tree - doubles if exceeded */
 26: #define TREE_BUF_SZ 2048;
 27: #define GS_VEC_SZ   1



 31: /***********************************gs.c***************************************
 32: Type: struct gather_scatter_id 
 33: ------------------------------

 35: ************************************gs.c**************************************/
 36: typedef struct gather_scatter_id {
 37:   PetscInt id;
 38:   PetscInt nel_min;
 39:   PetscInt nel_max;
 40:   PetscInt nel_sum;
 41:   PetscInt negl;
 42:   PetscInt gl_max;
 43:   PetscInt gl_min;
 44:   PetscInt repeats;
 45:   PetscInt ordered;
 46:   PetscInt positive;
 47:   PetscScalar *vals;

 49:   /* bit mask info */
 50:   PetscInt *my_proc_mask;
 51:   PetscInt mask_sz;
 52:   PetscInt *ngh_buf;
 53:   PetscInt ngh_buf_sz;
 54:   PetscInt *nghs;
 55:   PetscInt num_nghs;
 56:   PetscInt max_nghs;
 57:   PetscInt *pw_nghs;
 58:   PetscInt num_pw_nghs;
 59:   PetscInt *tree_nghs;
 60:   PetscInt num_tree_nghs;

 62:   PetscInt num_loads;

 64:   /* repeats == true -> local info */
 65:   PetscInt nel;         /* number of unique elememts */
 66:   PetscInt *elms;       /* of size nel */
 67:   PetscInt nel_total;
 68:   PetscInt *local_elms; /* of size nel_total */
 69:   PetscInt *companion;  /* of size nel_total */

 71:   /* local info */
 72:   PetscInt num_local_total;
 73:   PetscInt local_strength;
 74:   PetscInt num_local;
 75:   PetscInt *num_local_reduce;
 76:   PetscInt **local_reduce;
 77:   PetscInt num_local_gop;
 78:   PetscInt *num_gop_local_reduce;
 79:   PetscInt **gop_local_reduce;

 81:   /* pairwise info */
 82:   PetscInt level;
 83:   PetscInt num_pairs;
 84:   PetscInt max_pairs;
 85:   PetscInt loc_node_pairs;
 86:   PetscInt max_node_pairs;
 87:   PetscInt min_node_pairs;
 88:   PetscInt avg_node_pairs;
 89:   PetscInt *pair_list;
 90:   PetscInt *msg_sizes;
 91:   PetscInt **node_list;
 92:   PetscInt len_pw_list;
 93:   PetscInt *pw_elm_list;
 94:   PetscScalar *pw_vals;

 96:   MPI_Request *msg_ids_in;
 97:   MPI_Request *msg_ids_out;

 99:   PetscScalar *out;
100:   PetscScalar *in;
101:   PetscInt msg_total;

103:   /* tree - crystal accumulator info */
104:   PetscInt max_left_over;
105:   PetscInt *pre;
106:   PetscInt *in_num;
107:   PetscInt *out_num;
108:   PetscInt **in_list;
109:   PetscInt **out_list;

111:   /* new tree work*/
112:   PetscInt  tree_nel;
113:   PetscInt *tree_elms;
114:   PetscScalar *tree_buf;
115:   PetscScalar *tree_work;

117:   PetscInt  tree_map_sz;
118:   PetscInt *tree_map_in;
119:   PetscInt *tree_map_out;

121:   /* current memory status */
122:   PetscInt gl_bss_min;
123:   PetscInt gl_perm_min;

125:   /* max segment size for PCTFS_gs_gop_vec() */
126:   PetscInt vec_sz;

128:   /* hack to make paul happy */
129:   MPI_Comm PCTFS_gs_comm;

131: } PCTFS_gs_id;

133: static PCTFS_gs_id *gsi_check_args(PetscInt *elms, PetscInt nel, PetscInt level);
134: static PetscErrorCode gsi_via_bit_mask(PCTFS_gs_id *gs);
135: static PetscErrorCode get_ngh_buf(PCTFS_gs_id *gs);
136: static PetscErrorCode set_pairwise(PCTFS_gs_id *gs);
137: static PCTFS_gs_id * gsi_new(void);
138: static PetscErrorCode set_tree(PCTFS_gs_id *gs);

140: /* same for all but vector flavor */
141: static PetscErrorCode PCTFS_gs_gop_local_out(PCTFS_gs_id *gs, PetscScalar *vals);
142: /* vector flavor */
143: static PetscErrorCode PCTFS_gs_gop_vec_local_out(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);

145: static PetscErrorCode PCTFS_gs_gop_vec_plus(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt step);
146: static PetscErrorCode PCTFS_gs_gop_vec_pairwise_plus(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt step);
147: static PetscErrorCode PCTFS_gs_gop_vec_local_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);
148: static PetscErrorCode PCTFS_gs_gop_vec_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);
149: static PetscErrorCode PCTFS_gs_gop_vec_tree_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);


152: static PetscErrorCode PCTFS_gs_gop_local_plus(PCTFS_gs_id *gs, PetscScalar *vals);
153: static PetscErrorCode PCTFS_gs_gop_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals);

155: static PetscErrorCode PCTFS_gs_gop_plus_hc(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt dim);
156: static PetscErrorCode PCTFS_gs_gop_pairwise_plus_hc(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt dim);
157: static PetscErrorCode PCTFS_gs_gop_tree_plus_hc(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt dim);

159: /* global vars */
160: /* from comm.c module */

162: static PetscInt num_gs_ids = 0;

164: /* should make this dynamic ... later */
165: static PetscInt msg_buf=MAX_MSG_BUF;
166: static PetscInt vec_sz=GS_VEC_SZ;
167: static PetscInt *tree_buf=NULL;
168: static PetscInt tree_buf_sz=0;
169: static PetscInt ntree=0;

171: /***************************************************************************/
172: PetscErrorCode PCTFS_gs_init_vec_sz(PetscInt size)
173: {
175:   vec_sz = size;
176:   return(0);
177: }

179: /******************************************************************************/
180: PetscErrorCode PCTFS_gs_init_msg_buf_sz(PetscInt buf_size)
181: {
183:   msg_buf = buf_size;
184:   return(0);
185: }

187: /******************************************************************************/
188: PCTFS_gs_id *PCTFS_gs_init( PetscInt *elms, PetscInt nel, PetscInt level)
189: {
190:    PCTFS_gs_id *gs;
191:   MPI_Group PCTFS_gs_group;
192:   MPI_Comm  PCTFS_gs_comm;

196:   /* ensure that communication package has been initialized */
197:   PCTFS_comm_init();


200:   /* determines if we have enough dynamic/semi-static memory */
201:   /* checks input, allocs and sets gd_id template            */
202:   gs = gsi_check_args(elms,nel,level);

204:   /* only bit mask version up and working for the moment    */
205:   /* LATER :: get int list version working for sparse pblms */
206:   gsi_via_bit_mask(gs);CHKERRABORT(PETSC_COMM_WORLD,ierr);


209:   MPI_Comm_group(MPI_COMM_WORLD,&PCTFS_gs_group);CHKERRABORT(PETSC_COMM_WORLD,ierr);
210:   MPI_Comm_create(MPI_COMM_WORLD,PCTFS_gs_group,&PCTFS_gs_comm);CHKERRABORT(PETSC_COMM_WORLD,ierr);
211:   gs->PCTFS_gs_comm=PCTFS_gs_comm;

213:   return(gs);
214: }

216: /******************************************************************************/
217: static PCTFS_gs_id *gsi_new(void)
218: {
220:   PCTFS_gs_id *gs;
221:   gs = (PCTFS_gs_id *) malloc(sizeof(PCTFS_gs_id));
222:   PetscMemzero(gs,sizeof(PCTFS_gs_id));CHKERRABORT(PETSC_COMM_WORLD,ierr);
223:   return(gs);
224: }

226: /******************************************************************************/
227: static PCTFS_gs_id * gsi_check_args(PetscInt *in_elms, PetscInt nel, PetscInt level)
228: {
229:    PetscInt i, j, k, t2;
230:   PetscInt *companion, *elms, *unique, *iptr;
231:   PetscInt num_local=0, *num_to_reduce, **local_reduce;
232:   PetscInt oprs[] = {NON_UNIFORM,GL_MIN,GL_MAX,GL_ADD,GL_MIN,GL_MAX,GL_MIN,GL_B_AND};
233:   PetscInt vals[sizeof(oprs)/sizeof(oprs[0])-1];
234:   PetscInt work[sizeof(oprs)/sizeof(oprs[0])-1];
235:   PCTFS_gs_id *gs;


239:   if (!in_elms) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"elms point to nothing!!!\n");
240:   if (nel<0) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"can't have fewer than 0 elms!!!\n");

242:   if (nel==0)
243:     {PetscInfo(0,"I don't have any elements!!!\n");CHKERRABORT(PETSC_COMM_WORLD,ierr);}

245:   /* get space for gs template */
246:   gs = gsi_new();
247:   gs->id = ++num_gs_ids;

249:   /* hmt 6.4.99                                            */
250:   /* caller can set global ids that don't participate to 0 */
251:   /* PCTFS_gs_init ignores all zeros in elm list                 */
252:   /* negative global ids are still invalid                 */
253:   for (i=j=0;i<nel;i++)
254:     {if (in_elms[i]!=0) {j++;}}

256:   k=nel; nel=j;

258:   /* copy over in_elms list and create inverse map */
259:   elms = (PetscInt*) malloc((nel+1)*sizeof(PetscInt));
260:   companion = (PetscInt*) malloc(nel*sizeof(PetscInt));

262:   for (i=j=0;i<k;i++)
263:     {
264:       if (in_elms[i]!=0)
265:         {elms[j] = in_elms[i]; companion[j++] = i;}
266:     }

268:   if (j!=nel) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"nel j mismatch!\n");

270:   /* pre-pass ... check to see if sorted */
271:   elms[nel] = INT_MAX;
272:   iptr = elms;
273:   unique = elms+1;
274:   j=0;
275:   while (*iptr!=INT_MAX)
276:     {
277:       if (*iptr++>*unique++)
278:         {j=1; break;}
279:     }

281:   /* set up inverse map */
282:   if (j)
283:     {
284:       PetscInfo(0,"gsi_check_args() :: elm list *not* sorted!\n");CHKERRABORT(PETSC_COMM_WORLD,ierr);
285:       PCTFS_SMI_sort((void*)elms, (void*)companion, nel, SORT_INTEGER);CHKERRABORT(PETSC_COMM_WORLD,ierr);
286:     }
287:   else
288:     {PetscInfo(0,"gsi_check_args() :: elm list sorted!\n");CHKERRABORT(PETSC_COMM_WORLD,ierr);}
289:   elms[nel] = INT_MIN;

291:   /* first pass */
292:   /* determine number of unique elements, check pd */
293:   for (i=k=0;i<nel;i+=j)
294:     {
295:       t2 = elms[i];
296:       j=++i;
297: 
298:       /* clump 'em for now */
299:       while (elms[j]==t2) {j++;}
300: 
301:       /* how many together and num local */
302:       if (j-=i)
303:         {num_local++; k+=j;}
304:     }

306:   /* how many unique elements? */
307:   gs->repeats=k;
308:   gs->nel = nel-k;


311:   /* number of repeats? */
312:   gs->num_local = num_local;
313:   num_local+=2;
314:   gs->local_reduce=local_reduce=(PetscInt **)malloc(num_local*sizeof(PetscInt*));
315:   gs->num_local_reduce=num_to_reduce=(PetscInt*) malloc(num_local*sizeof(PetscInt));

317:   unique = (PetscInt*) malloc((gs->nel+1)*sizeof(PetscInt));
318:   gs->elms = unique;
319:   gs->nel_total = nel;
320:   gs->local_elms = elms;
321:   gs->companion = companion;

323:   /* compess map as well as keep track of local ops */
324:   for (num_local=i=j=0;i<gs->nel;i++)
325:     {
326:       k=j;
327:       t2 = unique[i] = elms[j];
328:       companion[i] = companion[j];
329: 
330:       while (elms[j]==t2) {j++;}

332:       if ((t2=(j-k))>1)
333:         {
334:           /* number together */
335:           num_to_reduce[num_local] = t2++;
336:           iptr = local_reduce[num_local++] = (PetscInt*)malloc(t2*sizeof(PetscInt));

338:           /* to use binary searching don't remap until we check intersection */
339:           *iptr++ = i;
340: 
341:           /* note that we're skipping the first one */
342:           while (++k<j)
343:             {*(iptr++) = companion[k];}
344:           *iptr = -1;
345:         }
346:     }

348:   /* sentinel for ngh_buf */
349:   unique[gs->nel]=INT_MAX;

351:   /* for two partition sort hack */
352:   num_to_reduce[num_local] = 0;
353:   local_reduce[num_local] = NULL;
354:   num_to_reduce[++num_local] = 0;
355:   local_reduce[num_local] = NULL;

357:   /* load 'em up */
358:   /* note one extra to hold NON_UNIFORM flag!!! */
359:   vals[2] = vals[1] = vals[0] = nel;
360:   if (gs->nel>0)
361:     {
362:        vals[3] = unique[0];
363:        vals[4] = unique[gs->nel-1];
364:     }
365:   else
366:     {
367:        vals[3] = INT_MAX;
368:        vals[4] = INT_MIN;
369:     }
370:   vals[5] = level;
371:   vals[6] = num_gs_ids;

373:   /* GLOBAL: send 'em out */
374:   PCTFS_giop(vals,work,sizeof(oprs)/sizeof(oprs[0])-1,oprs);CHKERRABORT(PETSC_COMM_WORLD,ierr);

376:   /* must be semi-pos def - only pairwise depends on this */
377:   /* LATER - remove this restriction */
378:   if (vals[3]<0) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"gsi_check_args() :: system not semi-pos def \n");
379:   if (vals[4]==INT_MAX) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"gsi_check_args() :: system ub too large !\n");

381:   gs->nel_min = vals[0];
382:   gs->nel_max = vals[1];
383:   gs->nel_sum = vals[2];
384:   gs->gl_min  = vals[3];
385:   gs->gl_max  = vals[4];
386:   gs->negl    = vals[4]-vals[3]+1;

388:   if (gs->negl<=0) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"gsi_check_args() :: system empty or neg :: %d\n");
389: 
390:   /* LATER :: add level == -1 -> program selects level */
391:   if (vals[5]<0)
392:     {vals[5]=0;}
393:   else if (vals[5]>PCTFS_num_nodes)
394:     {vals[5]=PCTFS_num_nodes;}
395:   gs->level = vals[5];

397:   return(gs);
398: }

400: /******************************************************************************/
401: static PetscErrorCode gsi_via_bit_mask(PCTFS_gs_id *gs)
402: {
403:    PetscInt i, nel, *elms;
404:   PetscInt t1;
405:   PetscInt **reduce;
406:   PetscInt *map;

410:   /* totally local removes ... PCTFS_ct_bits == 0 */
411:   get_ngh_buf(gs);

413:   if (gs->level) set_pairwise(gs);
414:   if (gs->max_left_over) set_tree(gs);

416:   /* intersection local and pairwise/tree? */
417:   gs->num_local_total = gs->num_local;
418:   gs->gop_local_reduce = gs->local_reduce;
419:   gs->num_gop_local_reduce = gs->num_local_reduce;

421:   map = gs->companion;

423:   /* is there any local compression */
424:   if (!gs->num_local) {
425:     gs->local_strength = NONE;
426:     gs->num_local_gop = 0;
427:   } else {
428:       /* ok find intersection */
429:       map = gs->companion;
430:       reduce = gs->local_reduce;
431:       for (i=0, t1=0; i<gs->num_local; i++, reduce++)
432:         {
433:           if ((PCTFS_ivec_binary_search(**reduce,gs->pw_elm_list,gs->len_pw_list)>=0)
434:               ||
435:               PCTFS_ivec_binary_search(**reduce,gs->tree_map_in,gs->tree_map_sz)>=0)
436:             {
437:               t1++;
438:               if (gs->num_local_reduce[i]<=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"nobody in list?");
439:               gs->num_local_reduce[i] *= -1;
440:             }
441:            **reduce=map[**reduce];
442:         }

444:       /* intersection is empty */
445:       if (!t1)
446:         {
447:           gs->local_strength = FULL;
448:           gs->num_local_gop = 0;
449:         }
450:       /* intersection not empty */
451:       else
452:         {
453:           gs->local_strength = PARTIAL;
454:           PCTFS_SMI_sort((void*)gs->num_local_reduce, (void*)gs->local_reduce, gs->num_local + 1, SORT_INT_PTR);

456:           gs->num_local_gop = t1;
457:           gs->num_local_total =  gs->num_local;
458:           gs->num_local    -= t1;
459:           gs->gop_local_reduce = gs->local_reduce;
460:           gs->num_gop_local_reduce = gs->num_local_reduce;

462:           for (i=0; i<t1; i++)
463:             {
464:               if (gs->num_gop_local_reduce[i]>=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"they aren't negative?");
465:               gs->num_gop_local_reduce[i] *= -1;
466:               gs->local_reduce++;
467:               gs->num_local_reduce++;
468:             }
469:           gs->local_reduce++;
470:           gs->num_local_reduce++;
471:         }
472:     }

474:   elms = gs->pw_elm_list;
475:   nel  = gs->len_pw_list;
476:   for (i=0; i<nel; i++)
477:     {elms[i] = map[elms[i]];}

479:   elms = gs->tree_map_in;
480:   nel  = gs->tree_map_sz;
481:   for (i=0; i<nel; i++)
482:     {elms[i] = map[elms[i]];}

484:   /* clean up */
485:   free((void*) gs->local_elms);
486:   free((void*) gs->companion);
487:   free((void*) gs->elms);
488:   free((void*) gs->ngh_buf);
489:   gs->local_elms = gs->companion = gs->elms = gs->ngh_buf = NULL;
490:   return(0);
491: }

493: /******************************************************************************/
494: static PetscErrorCode place_in_tree( PetscInt elm)
495: {
496:    PetscInt *tp, n;

499:   if (ntree==tree_buf_sz)
500:     {
501:       if (tree_buf_sz)
502:         {
503:           tp = tree_buf;
504:           n = tree_buf_sz;
505:           tree_buf_sz<<=1;
506:           tree_buf = (PetscInt*)malloc(tree_buf_sz*sizeof(PetscInt));
507:           PCTFS_ivec_copy(tree_buf,tp,n);
508:           free(tp);
509:         }
510:       else
511:         {
512:           tree_buf_sz = TREE_BUF_SZ;
513:           tree_buf = (PetscInt*)malloc(tree_buf_sz*sizeof(PetscInt));
514:         }
515:     }

517:   tree_buf[ntree++] = elm;
518:   return(0);
519: }

521: /******************************************************************************/
522: static PetscErrorCode get_ngh_buf(PCTFS_gs_id *gs)
523: {
524:    PetscInt i, j, npw=0, ntree_map=0;
525:   PetscInt p_mask_size, ngh_buf_size, buf_size;
526:   PetscInt *p_mask, *sh_proc_mask, *pw_sh_proc_mask;
527:   PetscInt *ngh_buf, *buf1, *buf2;
528:   PetscInt offset, per_load, num_loads, or_ct, start, end;
529:   PetscInt *ptr1, *ptr2, i_start, negl, nel, *elms;
530:   PetscInt oper=GL_B_OR;
531:   PetscInt *ptr3, *t_mask, level, ct1, ct2;

535:   /* to make life easier */
536:   nel   = gs->nel;
537:   elms  = gs->elms;
538:   level = gs->level;
539: 
540:   /* det #bytes needed for processor bit masks and init w/mask cor. to PCTFS_my_id */
541:   p_mask = (PetscInt*) malloc(p_mask_size=PCTFS_len_bit_mask(PCTFS_num_nodes));
542:   PCTFS_set_bit_mask(p_mask,p_mask_size,PCTFS_my_id);

544:   /* allocate space for masks and info bufs */
545:   gs->nghs = sh_proc_mask = (PetscInt*) malloc(p_mask_size);
546:   gs->pw_nghs = pw_sh_proc_mask = (PetscInt*) malloc(p_mask_size);
547:   gs->ngh_buf_sz = ngh_buf_size = p_mask_size*nel;
548:   t_mask = (PetscInt*) malloc(p_mask_size);
549:   gs->ngh_buf = ngh_buf = (PetscInt*) malloc(ngh_buf_size);

551:   /* comm buffer size ... memory usage bounded by ~2*msg_buf */
552:   /* had thought I could exploit rendezvous threshold */

554:   /* default is one pass */
555:   per_load = negl  = gs->negl;
556:   gs->num_loads = num_loads = 1;
557:   i=p_mask_size*negl;

559:   /* possible overflow on buffer size */
560:   /* overflow hack                    */
561:   if (i<0) {i=INT_MAX;}

563:   buf_size = PetscMin(msg_buf,i);

565:   /* can we do it? */
566:   if (p_mask_size>buf_size) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"get_ngh_buf() :: buf<pms :: %d>%d\n",p_mask_size,buf_size);

568:   /* get PCTFS_giop buf space ... make *only* one malloc */
569:   buf1 = (PetscInt*) malloc(buf_size<<1);

571:   /* more than one gior exchange needed? */
572:   if (buf_size!=i)
573:     {
574:       per_load = buf_size/p_mask_size;
575:       buf_size = per_load*p_mask_size;
576:       gs->num_loads = num_loads = negl/per_load + (negl%per_load>0);
577:     }


580:   /* convert buf sizes from #bytes to #ints - 32 bit only! */
581:   p_mask_size/=sizeof(PetscInt); ngh_buf_size/=sizeof(PetscInt); buf_size/=sizeof(PetscInt);
582: 
583:   /* find PCTFS_giop work space */
584:   buf2 = buf1+buf_size;

586:   /* hold #ints needed for processor masks */
587:   gs->mask_sz=p_mask_size;

589:   /* init buffers */
590:   PCTFS_ivec_zero(sh_proc_mask,p_mask_size);
591:   PCTFS_ivec_zero(pw_sh_proc_mask,p_mask_size);
592:   PCTFS_ivec_zero(ngh_buf,ngh_buf_size);

594:   /* HACK reset tree info */
595:   tree_buf=NULL;
596:   tree_buf_sz=ntree=0;

598:   /* ok do it */
599:   for (ptr1=ngh_buf,ptr2=elms,end=gs->gl_min,or_ct=i=0; or_ct<num_loads; or_ct++)
600:     {
601:       /* identity for bitwise or is 000...000 */
602:       PCTFS_ivec_zero(buf1,buf_size);

604:       /* load msg buffer */
605:       for (start=end,end+=per_load,i_start=i; (offset=*ptr2)<end; i++, ptr2++)
606:         {
607:           offset = (offset-start)*p_mask_size;
608:           PCTFS_ivec_copy(buf1+offset,p_mask,p_mask_size);
609:         }

611:       /* GLOBAL: pass buffer */
612:       PCTFS_giop(buf1,buf2,buf_size,&oper);


615:       /* unload buffer into ngh_buf */
616:       ptr2=(elms+i_start);
617:       for(ptr3=buf1,j=start; j<end; ptr3+=p_mask_size,j++)
618:         {
619:           /* I own it ... may have to pairwise it */
620:           if (j==*ptr2)
621:             {
622:               /* do i share it w/anyone? */
623:               ct1 = PCTFS_ct_bits((char *)ptr3,p_mask_size*sizeof(PetscInt));
624:               /* guess not */
625:               if (ct1<2)
626:                 {ptr2++; ptr1+=p_mask_size; continue;}

628:               /* i do ... so keep info and turn off my bit */
629:               PCTFS_ivec_copy(ptr1,ptr3,p_mask_size);
630:               PCTFS_ivec_xor(ptr1,p_mask,p_mask_size);
631:               PCTFS_ivec_or(sh_proc_mask,ptr1,p_mask_size);
632: 
633:               /* is it to be done pairwise? */
634:               if (--ct1<=level)
635:                 {
636:                   npw++;
637: 
638:                   /* turn on high bit to indicate pw need to process */
639:                   *ptr2++ |= TOP_BIT;
640:                   PCTFS_ivec_or(pw_sh_proc_mask,ptr1,p_mask_size);
641:                   ptr1+=p_mask_size;
642:                   continue;
643:                 }

645:               /* get set for next and note that I have a tree contribution */
646:               /* could save exact elm index for tree here -> save a search */
647:               ptr2++; ptr1+=p_mask_size; ntree_map++;
648:             }
649:           /* i don't but still might be involved in tree */
650:           else
651:             {

653:               /* shared by how many? */
654:               ct1 = PCTFS_ct_bits((char *)ptr3,p_mask_size*sizeof(PetscInt));

656:               /* none! */
657:               if (ct1<2) continue;

659:               /* is it going to be done pairwise? but not by me of course!*/
660:               if (--ct1<=level) continue;
661:             }
662:           /* LATER we're going to have to process it NOW */
663:           /* nope ... tree it */
664:           place_in_tree(j);
665:         }
666:     }

668:   free((void*)t_mask);
669:   free((void*)buf1);

671:   gs->len_pw_list=npw;
672:   gs->num_nghs = PCTFS_ct_bits((char *)sh_proc_mask,p_mask_size*sizeof(PetscInt));

674:   /* expand from bit mask list to int list and save ngh list */
675:   gs->nghs = (PetscInt*) malloc(gs->num_nghs * sizeof(PetscInt));
676:   PCTFS_bm_to_proc((char *)sh_proc_mask,p_mask_size*sizeof(PetscInt),gs->nghs);

678:   gs->num_pw_nghs = PCTFS_ct_bits((char *)pw_sh_proc_mask,p_mask_size*sizeof(PetscInt));

680:   oper = GL_MAX;
681:   ct1 = gs->num_nghs;
682:   PCTFS_giop(&ct1,&ct2,1,&oper);
683:   gs->max_nghs = ct1;

685:   gs->tree_map_sz  = ntree_map;
686:   gs->max_left_over=ntree;

688:   free((void*)p_mask);
689:   free((void*)sh_proc_mask);
690:   return(0);
691: }

693: /******************************************************************************/
694: static PetscErrorCode set_pairwise(PCTFS_gs_id *gs)
695: {
696:    PetscInt i, j;
697:   PetscInt p_mask_size;
698:   PetscInt *p_mask, *sh_proc_mask, *tmp_proc_mask;
699:   PetscInt *ngh_buf, *buf2;
700:   PetscInt offset;
701:   PetscInt *msg_list, *msg_size, **msg_nodes, nprs;
702:   PetscInt *pairwise_elm_list, len_pair_list=0;
703:   PetscInt *iptr, t1, i_start, nel, *elms;
704:   PetscInt ct;

708:   /* to make life easier */
709:   nel  = gs->nel;
710:   elms = gs->elms;
711:   ngh_buf = gs->ngh_buf;
712:   sh_proc_mask  = gs->pw_nghs;

714:   /* need a few temp masks */
715:   p_mask_size   = PCTFS_len_bit_mask(PCTFS_num_nodes);
716:   p_mask        = (PetscInt*) malloc(p_mask_size);
717:   tmp_proc_mask = (PetscInt*) malloc(p_mask_size);

719:   /* set mask to my PCTFS_my_id's bit mask */
720:   PCTFS_set_bit_mask(p_mask,p_mask_size,PCTFS_my_id);

722:   p_mask_size /= sizeof(PetscInt);
723: 
724:   len_pair_list=gs->len_pw_list;
725:   gs->pw_elm_list=pairwise_elm_list=(PetscInt*)malloc((len_pair_list+1)*sizeof(PetscInt));

727:   /* how many processors (nghs) do we have to exchange with? */
728:   nprs=gs->num_pairs=PCTFS_ct_bits((char *)sh_proc_mask,p_mask_size*sizeof(PetscInt));


731:   /* allocate space for PCTFS_gs_gop() info */
732:   gs->pair_list = msg_list = (PetscInt *)  malloc(sizeof(PetscInt)*nprs);
733:   gs->msg_sizes = msg_size  = (PetscInt *)  malloc(sizeof(PetscInt)*nprs);
734:   gs->node_list = msg_nodes = (PetscInt **) malloc(sizeof(PetscInt*)*(nprs+1));

736:   /* init msg_size list */
737:   PCTFS_ivec_zero(msg_size,nprs);

739:   /* expand from bit mask list to int list */
740:   PCTFS_bm_to_proc((char *)sh_proc_mask,p_mask_size*sizeof(PetscInt),msg_list);
741: 
742:   /* keep list of elements being handled pairwise */
743:   for (i=j=0;i<nel;i++)
744:     {
745:       if (elms[i] & TOP_BIT)
746:         {elms[i] ^= TOP_BIT; pairwise_elm_list[j++] = i;}
747:     }
748:   pairwise_elm_list[j] = -1;

750:   gs->msg_ids_out = (MPI_Request *)  malloc(sizeof(MPI_Request)*(nprs+1));
751:   gs->msg_ids_out[nprs] = MPI_REQUEST_NULL;
752:   gs->msg_ids_in = (MPI_Request *)  malloc(sizeof(MPI_Request)*(nprs+1));
753:   gs->msg_ids_in[nprs] = MPI_REQUEST_NULL;
754:   gs->pw_vals = (PetscScalar *) malloc(sizeof(PetscScalar)*len_pair_list*vec_sz);

756:   /* find who goes to each processor */
757:   for (i_start=i=0;i<nprs;i++)
758:     {
759:       /* processor i's mask */
760:       PCTFS_set_bit_mask(p_mask,p_mask_size*sizeof(PetscInt),msg_list[i]);

762:       /* det # going to processor i */
763:       for (ct=j=0;j<len_pair_list;j++)
764:         {
765:           buf2 = ngh_buf+(pairwise_elm_list[j]*p_mask_size);
766:           PCTFS_ivec_and3(tmp_proc_mask,p_mask,buf2,p_mask_size);
767:           if (PCTFS_ct_bits((char *)tmp_proc_mask,p_mask_size*sizeof(PetscInt)))
768:             {ct++;}
769:         }
770:       msg_size[i] = ct;
771:       i_start = PetscMax(i_start,ct);

773:       /*space to hold nodes in message to first neighbor */
774:       msg_nodes[i] = iptr = (PetscInt*) malloc(sizeof(PetscInt)*(ct+1));

776:       for (j=0;j<len_pair_list;j++)
777:         {
778:           buf2 = ngh_buf+(pairwise_elm_list[j]*p_mask_size);
779:           PCTFS_ivec_and3(tmp_proc_mask,p_mask,buf2,p_mask_size);
780:           if (PCTFS_ct_bits((char *)tmp_proc_mask,p_mask_size*sizeof(PetscInt)))
781:             {*iptr++ = j;}
782:         }
783:       *iptr = -1;
784:     }
785:   msg_nodes[nprs] = NULL;

787:   j=gs->loc_node_pairs=i_start;
788:   t1 = GL_MAX;
789:   PCTFS_giop(&i_start,&offset,1,&t1);
790:   gs->max_node_pairs = i_start;

792:   i_start=j;
793:   t1 = GL_MIN;
794:   PCTFS_giop(&i_start,&offset,1,&t1);
795:   gs->min_node_pairs = i_start;

797:   i_start=j;
798:   t1 = GL_ADD;
799:   PCTFS_giop(&i_start,&offset,1,&t1);
800:   gs->avg_node_pairs = i_start/PCTFS_num_nodes + 1;

802:   i_start=nprs;
803:   t1 = GL_MAX;
804:   PCTFS_giop(&i_start,&offset,1,&t1);
805:   gs->max_pairs = i_start;


808:   /* remap pairwise in tail of gsi_via_bit_mask() */
809:   gs->msg_total = PCTFS_ivec_sum(gs->msg_sizes,nprs);
810:   gs->out = (PetscScalar *) malloc(sizeof(PetscScalar)*gs->msg_total*vec_sz);
811:   gs->in  = (PetscScalar *) malloc(sizeof(PetscScalar)*gs->msg_total*vec_sz);

813:   /* reset malloc pool */
814:   free((void*)p_mask);
815:   free((void*)tmp_proc_mask);
816:   return(0);
817: }

819: /* to do pruned tree just save ngh buf copy for each one and decode here!
820: ******************************************************************************/
821: static PetscErrorCode set_tree(PCTFS_gs_id *gs)
822: {
823:   PetscInt i, j, n, nel;
824:   PetscInt *iptr_in, *iptr_out, *tree_elms, *elms;

827:   /* local work ptrs */
828:   elms = gs->elms;
829:   nel     = gs->nel;

831:   /* how many via tree */
832:   gs->tree_nel  = n = ntree;
833:   gs->tree_elms = tree_elms = iptr_in = tree_buf;
834:   gs->tree_buf  = (PetscScalar *) malloc(sizeof(PetscScalar)*n*vec_sz);
835:   gs->tree_work = (PetscScalar *) malloc(sizeof(PetscScalar)*n*vec_sz);
836:   j=gs->tree_map_sz;
837:   gs->tree_map_in = iptr_in  = (PetscInt*) malloc(sizeof(PetscInt)*(j+1));
838:   gs->tree_map_out = iptr_out = (PetscInt*) malloc(sizeof(PetscInt)*(j+1));

840:   /* search the longer of the two lists */
841:   /* note ... could save this info in get_ngh_buf and save searches */
842:   if (n<=nel)
843:     {
844:       /* bijective fct w/remap - search elm list */
845:       for (i=0; i<n; i++)
846:         {
847:           if ((j=PCTFS_ivec_binary_search(*tree_elms++,elms,nel))>=0)
848:             {*iptr_in++ = j; *iptr_out++ = i;}
849:         }
850:     }
851:   else
852:     {
853:       for (i=0; i<nel; i++)
854:         {
855:           if ((j=PCTFS_ivec_binary_search(*elms++,tree_elms,n))>=0)
856:             {*iptr_in++ = i; *iptr_out++ = j;}
857:         }
858:     }

860:   /* sentinel */
861:   *iptr_in = *iptr_out = -1;
862:   return(0);
863: }

865: /******************************************************************************/
866: static PetscErrorCode PCTFS_gs_gop_local_out( PCTFS_gs_id *gs,  PetscScalar *vals)
867: {
868:   PetscInt *num, *map, **reduce;
869:   PetscScalar tmp;

872:   num    = gs->num_gop_local_reduce;
873:   reduce = gs->gop_local_reduce;
874:   while ((map = *reduce++))
875:     {
876:       /* wall */
877:       if (*num == 2)
878:         {
879:           num ++;
880:           vals[map[1]] = vals[map[0]];
881:         }
882:       /* corner shared by three elements */
883:       else if (*num == 3)
884:         {
885:           num ++;
886:           vals[map[2]] = vals[map[1]] = vals[map[0]];
887:         }
888:       /* corner shared by four elements */
889:       else if (*num == 4)
890:         {
891:           num ++;
892:           vals[map[3]] = vals[map[2]] = vals[map[1]] = vals[map[0]];
893:         }
894:       /* general case ... odd geoms ... 3D*/
895:       else
896:         {
897:           num++;
898:           tmp = *(vals + *map++);
899:           while (*map >= 0)
900:             {*(vals + *map++) = tmp;}
901:         }
902:     }
903:   return(0);
904: }

906: /******************************************************************************/
907: static PetscErrorCode PCTFS_gs_gop_local_plus( PCTFS_gs_id *gs,  PetscScalar *vals)
908: {
909:    PetscInt *num, *map, **reduce;
910:    PetscScalar tmp;

913:   num    = gs->num_local_reduce;
914:   reduce = gs->local_reduce;
915:   while ((map = *reduce))
916:     {
917:       /* wall */
918:       if (*num == 2)
919:         {
920:           num ++; reduce++;
921:           vals[map[1]] = vals[map[0]] += vals[map[1]];
922:         }
923:       /* corner shared by three elements */
924:       else if (*num == 3)
925:         {
926:           num ++; reduce++;
927:           vals[map[2]]=vals[map[1]]=vals[map[0]]+=(vals[map[1]]+vals[map[2]]);
928:         }
929:       /* corner shared by four elements */
930:       else if (*num == 4)
931:         {
932:           num ++; reduce++;
933:           vals[map[1]]=vals[map[2]]=vals[map[3]]=vals[map[0]] +=
934:                                  (vals[map[1]] + vals[map[2]] + vals[map[3]]);
935:         }
936:       /* general case ... odd geoms ... 3D*/
937:       else
938:         {
939:           num ++;
940:           tmp = 0.0;
941:           while (*map >= 0)
942:             {tmp += *(vals + *map++);}

944:           map = *reduce++;
945:           while (*map >= 0)
946:             {*(vals + *map++) = tmp;}
947:         }
948:     }
949:   return(0);
950: }

952: /******************************************************************************/
953: static PetscErrorCode PCTFS_gs_gop_local_in_plus( PCTFS_gs_id *gs,  PetscScalar *vals)
954: {
955:    PetscInt *num, *map, **reduce;
956:    PetscScalar *base;

959:   num    = gs->num_gop_local_reduce;
960:   reduce = gs->gop_local_reduce;
961:   while ((map = *reduce++))
962:     {
963:       /* wall */
964:       if (*num == 2)
965:         {
966:           num ++;
967:           vals[map[0]] += vals[map[1]];
968:         }
969:       /* corner shared by three elements */
970:       else if (*num == 3)
971:         {
972:           num ++;
973:           vals[map[0]] += (vals[map[1]] + vals[map[2]]);
974:         }
975:       /* corner shared by four elements */
976:       else if (*num == 4)
977:         {
978:           num ++;
979:           vals[map[0]] += (vals[map[1]] + vals[map[2]] + vals[map[3]]);
980:         }
981:       /* general case ... odd geoms ... 3D*/
982:       else
983:         {
984:           num++;
985:           base = vals + *map++;
986:           while (*map >= 0)
987:             {*base += *(vals + *map++);}
988:         }
989:     }
990:   return(0);
991: }

993: /******************************************************************************/
994: PetscErrorCode PCTFS_gs_free( PCTFS_gs_id *gs)
995: {
996:    PetscInt i;

999:   if (gs->nghs) {free((void*) gs->nghs);}
1000:   if (gs->pw_nghs) {free((void*) gs->pw_nghs);}

1002:   /* tree */
1003:   if (gs->max_left_over)
1004:     {
1005:       if (gs->tree_elms) {free((void*) gs->tree_elms);}
1006:       if (gs->tree_buf) {free((void*) gs->tree_buf);}
1007:       if (gs->tree_work) {free((void*) gs->tree_work);}
1008:       if (gs->tree_map_in) {free((void*) gs->tree_map_in);}
1009:       if (gs->tree_map_out) {free((void*) gs->tree_map_out);}
1010:     }

1012:   /* pairwise info */
1013:   if (gs->num_pairs)
1014:     {
1015:       /* should be NULL already */
1016:       if (gs->ngh_buf) {free((void*) gs->ngh_buf);}
1017:       if (gs->elms) {free((void*) gs->elms);}
1018:       if (gs->local_elms) {free((void*) gs->local_elms);}
1019:       if (gs->companion) {free((void*) gs->companion);}
1020: 
1021:       /* only set if pairwise */
1022:       if (gs->vals) {free((void*) gs->vals);}
1023:       if (gs->in) {free((void*) gs->in);}
1024:       if (gs->out) {free((void*) gs->out);}
1025:       if (gs->msg_ids_in) {free((void*) gs->msg_ids_in);}
1026:       if (gs->msg_ids_out) {free((void*) gs->msg_ids_out);}
1027:       if (gs->pw_vals) {free((void*) gs->pw_vals);}
1028:       if (gs->pw_elm_list) {free((void*) gs->pw_elm_list);}
1029:       if (gs->node_list)
1030:         {
1031:           for (i=0;i<gs->num_pairs;i++)
1032:             {if (gs->node_list[i]) {free((void*) gs->node_list[i]);}}
1033:           free((void*) gs->node_list);
1034:         }
1035:       if (gs->msg_sizes) {free((void*) gs->msg_sizes);}
1036:       if (gs->pair_list) {free((void*) gs->pair_list);}
1037:     }

1039:   /* local info */
1040:   if (gs->num_local_total>=0)
1041:     {
1042:       for (i=0;i<gs->num_local_total+1;i++)
1043:         /*      for (i=0;i<gs->num_local_total;i++) */
1044:         {
1045:           if (gs->num_gop_local_reduce[i])
1046:             {free((void*) gs->gop_local_reduce[i]);}
1047:         }
1048:     }

1050:   /* if intersection tree/pairwise and local isn't empty */
1051:   if (gs->gop_local_reduce) {free((void*) gs->gop_local_reduce);}
1052:   if (gs->num_gop_local_reduce) {free((void*) gs->num_gop_local_reduce);}

1054:   free((void*) gs);
1055:   return(0);
1056: }

1058: /******************************************************************************/
1059: PetscErrorCode PCTFS_gs_gop_vec( PCTFS_gs_id *gs,  PetscScalar *vals,  const char *op,  PetscInt step)
1060: {

1064:   switch (*op) {
1065:   case '+':
1066:     PCTFS_gs_gop_vec_plus(gs,vals,step);
1067:     break;
1068:   default:
1069:     PetscInfo1(0,"PCTFS_gs_gop_vec() :: %c is not a valid op",op[0]);
1070:     PetscInfo(0,"PCTFS_gs_gop_vec() :: default :: plus");
1071:     PCTFS_gs_gop_vec_plus(gs,vals,step);
1072:     break;
1073:   }
1074:   return(0);
1075: }

1077: /******************************************************************************/
1078: static PetscErrorCode PCTFS_gs_gop_vec_plus( PCTFS_gs_id *gs,  PetscScalar *vals,  PetscInt step)
1079: {
1081:   if (!gs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"PCTFS_gs_gop_vec() passed NULL gs handle!!!");

1083:   /* local only operations!!! */
1084:   if (gs->num_local)
1085:     {PCTFS_gs_gop_vec_local_plus(gs,vals,step);}

1087:   /* if intersection tree/pairwise and local isn't empty */
1088:   if (gs->num_local_gop)
1089:     {
1090:       PCTFS_gs_gop_vec_local_in_plus(gs,vals,step);

1092:       /* pairwise */
1093:       if (gs->num_pairs)
1094:         {PCTFS_gs_gop_vec_pairwise_plus(gs,vals,step);}

1096:       /* tree */
1097:       else if (gs->max_left_over)
1098:         {PCTFS_gs_gop_vec_tree_plus(gs,vals,step);}

1100:       PCTFS_gs_gop_vec_local_out(gs,vals,step);
1101:     }
1102:   /* if intersection tree/pairwise and local is empty */
1103:   else
1104:     {
1105:       /* pairwise */
1106:       if (gs->num_pairs)
1107:         {PCTFS_gs_gop_vec_pairwise_plus(gs,vals,step);}

1109:       /* tree */
1110:       else if (gs->max_left_over)
1111:         {PCTFS_gs_gop_vec_tree_plus(gs,vals,step);}
1112:     }
1113:   return(0);
1114: }

1116: /******************************************************************************/
1117: static PetscErrorCode PCTFS_gs_gop_vec_local_plus( PCTFS_gs_id *gs,  PetscScalar *vals, PetscInt step)
1118: {
1119:    PetscInt *num, *map, **reduce;
1120:    PetscScalar *base;

1123:   num    = gs->num_local_reduce;
1124:   reduce = gs->local_reduce;
1125:   while ((map = *reduce))
1126:     {
1127:       base = vals + map[0] * step;

1129:       /* wall */
1130:       if (*num == 2)
1131:         {
1132:           num++; reduce++;
1133:           PCTFS_rvec_add (base,vals+map[1]*step,step);
1134:           PCTFS_rvec_copy(vals+map[1]*step,base,step);
1135:         }
1136:       /* corner shared by three elements */
1137:       else if (*num == 3)
1138:         {
1139:           num++; reduce++;
1140:           PCTFS_rvec_add (base,vals+map[1]*step,step);
1141:           PCTFS_rvec_add (base,vals+map[2]*step,step);
1142:           PCTFS_rvec_copy(vals+map[2]*step,base,step);
1143:           PCTFS_rvec_copy(vals+map[1]*step,base,step);
1144:         }
1145:       /* corner shared by four elements */
1146:       else if (*num == 4)
1147:         {
1148:           num++; reduce++;
1149:           PCTFS_rvec_add (base,vals+map[1]*step,step);
1150:           PCTFS_rvec_add (base,vals+map[2]*step,step);
1151:           PCTFS_rvec_add (base,vals+map[3]*step,step);
1152:           PCTFS_rvec_copy(vals+map[3]*step,base,step);
1153:           PCTFS_rvec_copy(vals+map[2]*step,base,step);
1154:           PCTFS_rvec_copy(vals+map[1]*step,base,step);
1155:         }
1156:       /* general case ... odd geoms ... 3D */
1157:       else
1158:         {
1159:           num++;
1160:           while (*++map >= 0)
1161:             {PCTFS_rvec_add (base,vals+*map*step,step);}
1162: 
1163:           map = *reduce;
1164:           while (*++map >= 0)
1165:             {PCTFS_rvec_copy(vals+*map*step,base,step);}
1166: 
1167:           reduce++;
1168:         }
1169:     }
1170:   return(0);
1171: }

1173: /******************************************************************************/
1174: static PetscErrorCode PCTFS_gs_gop_vec_local_in_plus( PCTFS_gs_id *gs,  PetscScalar *vals, PetscInt step)
1175: {
1176:    PetscInt  *num, *map, **reduce;
1177:    PetscScalar *base;
1179:   num    = gs->num_gop_local_reduce;
1180:   reduce = gs->gop_local_reduce;
1181:   while ((map = *reduce++))
1182:     {
1183:       base = vals + map[0] * step;

1185:       /* wall */
1186:       if (*num == 2)
1187:         {
1188:           num ++;
1189:           PCTFS_rvec_add(base,vals+map[1]*step,step);
1190:         }
1191:       /* corner shared by three elements */
1192:       else if (*num == 3)
1193:         {
1194:           num ++;
1195:           PCTFS_rvec_add(base,vals+map[1]*step,step);
1196:           PCTFS_rvec_add(base,vals+map[2]*step,step);
1197:         }
1198:       /* corner shared by four elements */
1199:       else if (*num == 4)
1200:         {
1201:           num ++;
1202:           PCTFS_rvec_add(base,vals+map[1]*step,step);
1203:           PCTFS_rvec_add(base,vals+map[2]*step,step);
1204:           PCTFS_rvec_add(base,vals+map[3]*step,step);
1205:         }
1206:       /* general case ... odd geoms ... 3D*/
1207:       else
1208:         {
1209:           num++;
1210:           while (*++map >= 0)
1211:             {PCTFS_rvec_add(base,vals+*map*step,step);}
1212:         }
1213:     }
1214:   return(0);
1215: }

1217: /******************************************************************************/
1218: static PetscErrorCode PCTFS_gs_gop_vec_local_out( PCTFS_gs_id *gs,  PetscScalar *vals, PetscInt step)
1219: {
1220:    PetscInt *num, *map, **reduce;
1221:    PetscScalar *base;

1224:   num    = gs->num_gop_local_reduce;
1225:   reduce = gs->gop_local_reduce;
1226:   while ((map = *reduce++))
1227:     {
1228:       base = vals + map[0] * step;

1230:       /* wall */
1231:       if (*num == 2)
1232:         {
1233:           num ++;
1234:           PCTFS_rvec_copy(vals+map[1]*step,base,step);
1235:         }
1236:       /* corner shared by three elements */
1237:       else if (*num == 3)
1238:         {
1239:           num ++;
1240:           PCTFS_rvec_copy(vals+map[1]*step,base,step);
1241:           PCTFS_rvec_copy(vals+map[2]*step,base,step);
1242:         }
1243:       /* corner shared by four elements */
1244:       else if (*num == 4)
1245:         {
1246:           num ++;
1247:           PCTFS_rvec_copy(vals+map[1]*step,base,step);
1248:           PCTFS_rvec_copy(vals+map[2]*step,base,step);
1249:           PCTFS_rvec_copy(vals+map[3]*step,base,step);
1250:         }
1251:       /* general case ... odd geoms ... 3D*/
1252:       else
1253:         {
1254:           num++;
1255:           while (*++map >= 0)
1256:             {PCTFS_rvec_copy(vals+*map*step,base,step);}
1257:         }
1258:     }
1259:   return(0);
1260: }

1262: /******************************************************************************/
1263: static PetscErrorCode PCTFS_gs_gop_vec_pairwise_plus( PCTFS_gs_id *gs,  PetscScalar *in_vals, PetscInt step)
1264: {
1265:   PetscScalar *dptr1, *dptr2, *dptr3, *in1, *in2;
1266:   PetscInt *iptr, *msg_list, *msg_size, **msg_nodes;
1267:   PetscInt *pw, *list, *size, **nodes;
1268:   MPI_Request *msg_ids_in, *msg_ids_out, *ids_in, *ids_out;
1269:   MPI_Status status;
1270:   PetscBLASInt i1 = 1,dstep;

1274:   /* strip and load s */
1275:   msg_list =list         = gs->pair_list;
1276:   msg_size =size         = gs->msg_sizes;
1277:   msg_nodes=nodes        = gs->node_list;
1278:   iptr=pw                = gs->pw_elm_list;
1279:   dptr1=dptr3            = gs->pw_vals;
1280:   msg_ids_in  = ids_in   = gs->msg_ids_in;
1281:   msg_ids_out = ids_out  = gs->msg_ids_out;
1282:   dptr2                  = gs->out;
1283:   in1=in2                = gs->in;

1285:   /* post the receives */
1286:   /*  msg_nodes=nodes; */
1287:   do
1288:     {
1289:       /* Should MPI_ANY_SOURCE be replaced by *list ? In that case do the
1290:          second one *list and do list++ afterwards */
1291:       MPI_Irecv(in1, *size *step, MPIU_SCALAR, MPI_ANY_SOURCE, MSGTAG1 + *list, gs->PCTFS_gs_comm, msg_ids_in);
1292:       list++;msg_ids_in++;
1293:       in1 += *size++ *step;
1294:     }
1295:   while (*++msg_nodes);
1296:   msg_nodes=nodes;

1298:   /* load gs values into in out gs buffers */
1299:   while (*iptr >= 0)
1300:     {
1301:       PCTFS_rvec_copy(dptr3,in_vals + *iptr*step,step);
1302:       dptr3+=step;
1303:       iptr++;
1304:     }

1306:   /* load out buffers and post the sends */
1307:   while ((iptr = *msg_nodes++))
1308:     {
1309:       dptr3 = dptr2;
1310:       while (*iptr >= 0)
1311:         {
1312:           PCTFS_rvec_copy(dptr2,dptr1 + *iptr*step,step);
1313:           dptr2+=step;
1314:           iptr++;
1315:         }
1316:       MPI_Isend(dptr3, *msg_size *step, MPIU_SCALAR, *msg_list, MSGTAG1+PCTFS_my_id, gs->PCTFS_gs_comm, msg_ids_out);
1317:       msg_size++; msg_list++;msg_ids_out++;
1318:     }

1320:   /* tree */
1321:   if (gs->max_left_over)
1322:     {PCTFS_gs_gop_vec_tree_plus(gs,in_vals,step);}

1324:   /* process the received data */
1325:   msg_nodes=nodes;
1326:   while ((iptr = *nodes++)){
1327:     PetscScalar d1 = 1.0;
1328:       /* Should I check the return value of MPI_Wait() or status? */
1329:       /* Can this loop be replaced by a call to MPI_Waitall()? */
1330:       MPI_Wait(ids_in, &status);
1331:       ids_in++;
1332:       while (*iptr >= 0) {
1333:         dstep = PetscBLASIntCast(step);
1334:         BLASaxpy_(&dstep,&d1,in2,&i1,dptr1 + *iptr*step,&i1);
1335:         in2+=step;
1336:         iptr++;
1337:       }
1338:   }

1340:   /* replace vals */
1341:   while (*pw >= 0)
1342:     {
1343:       PCTFS_rvec_copy(in_vals + *pw*step,dptr1,step);
1344:       dptr1+=step;
1345:       pw++;
1346:     }

1348:   /* clear isend message handles */
1349:   /* This changed for clarity though it could be the same */
1350:   while (*msg_nodes++)
1351:     /* Should I check the return value of MPI_Wait() or status? */
1352:     /* Can this loop be replaced by a call to MPI_Waitall()? */
1353:     {MPI_Wait(ids_out, &status);ids_out++;}
1354: 

1356:   return(0);
1357: }

1359: /******************************************************************************/
1360: static PetscErrorCode PCTFS_gs_gop_vec_tree_plus( PCTFS_gs_id *gs,  PetscScalar *vals,  PetscInt step)
1361: {
1362:   PetscInt size, *in, *out;
1363:   PetscScalar *buf, *work;
1364:   PetscInt op[] = {GL_ADD,0};
1365:   PetscBLASInt i1 = 1;

1368:   /* copy over to local variables */
1369:   in   = gs->tree_map_in;
1370:   out  = gs->tree_map_out;
1371:   buf  = gs->tree_buf;
1372:   work = gs->tree_work;
1373:   size = gs->tree_nel*step;

1375:   /* zero out collection buffer */
1376:   PCTFS_rvec_zero(buf,size);


1379:   /* copy over my contributions */
1380:   while (*in >= 0)
1381:     {
1382:       PetscBLASInt dstep = PetscBLASIntCast(step);
1383:       BLAScopy_(&dstep,vals + *in++*step,&i1,buf + *out++*step,&i1);
1384:     }

1386:   /* perform fan in/out on full buffer */
1387:   /* must change PCTFS_grop to handle the blas */
1388:   PCTFS_grop(buf,work,size,op);

1390:   /* reset */
1391:   in   = gs->tree_map_in;
1392:   out  = gs->tree_map_out;

1394:   /* get the portion of the results I need */
1395:   while (*in >= 0)
1396:     {
1397:       PetscBLASInt dstep = PetscBLASIntCast(step);
1398:       BLAScopy_(&dstep,buf + *out++*step,&i1,vals + *in++*step,&i1);
1399:     }
1400:   return(0);
1401: }

1403: /******************************************************************************/
1404: PetscErrorCode PCTFS_gs_gop_hc( PCTFS_gs_id *gs,  PetscScalar *vals,  const char *op,  PetscInt dim)
1405: {

1409:   switch (*op) {
1410:   case '+':
1411:     PCTFS_gs_gop_plus_hc(gs,vals,dim);
1412:     break;
1413:   default:
1414:     PetscInfo1(0,"PCTFS_gs_gop_hc() :: %c is not a valid op",op[0]);
1415:     PetscInfo(0,"PCTFS_gs_gop_hc() :: default :: plus\n");
1416:     PCTFS_gs_gop_plus_hc(gs,vals,dim);
1417:     break;
1418:   }
1419:   return(0);
1420: }

1422: /******************************************************************************/
1423: static PetscErrorCode PCTFS_gs_gop_plus_hc( PCTFS_gs_id *gs,  PetscScalar *vals, PetscInt dim)
1424: {
1426:   /* if there's nothing to do return */
1427:   if (dim<=0)
1428:     {  return(0);}

1430:   /* can't do more dimensions then exist */
1431:   dim = PetscMin(dim,PCTFS_i_log2_num_nodes);

1433:   /* local only operations!!! */
1434:   if (gs->num_local)
1435:     {PCTFS_gs_gop_local_plus(gs,vals);}

1437:   /* if intersection tree/pairwise and local isn't empty */
1438:   if (gs->num_local_gop)
1439:     {
1440:       PCTFS_gs_gop_local_in_plus(gs,vals);

1442:       /* pairwise will do tree inside ... */
1443:       if (gs->num_pairs)
1444:         {PCTFS_gs_gop_pairwise_plus_hc(gs,vals,dim);}

1446:       /* tree only */
1447:       else if (gs->max_left_over)
1448:         {PCTFS_gs_gop_tree_plus_hc(gs,vals,dim);}
1449: 
1450:       PCTFS_gs_gop_local_out(gs,vals);
1451:     }
1452:   /* if intersection tree/pairwise and local is empty */
1453:   else
1454:     {
1455:       /* pairwise will do tree inside */
1456:       if (gs->num_pairs)
1457:         {PCTFS_gs_gop_pairwise_plus_hc(gs,vals,dim);}
1458: 
1459:       /* tree */
1460:       else if (gs->max_left_over)
1461:         {PCTFS_gs_gop_tree_plus_hc(gs,vals,dim);}
1462:     }
1463:   return(0);
1464: }

1466: /******************************************************************************/
1467: static PetscErrorCode PCTFS_gs_gop_pairwise_plus_hc( PCTFS_gs_id *gs,  PetscScalar *in_vals, PetscInt dim)
1468: {
1469:    PetscScalar *dptr1, *dptr2, *dptr3, *in1, *in2;
1470:    PetscInt *iptr, *msg_list, *msg_size, **msg_nodes;
1471:    PetscInt *pw, *list, *size, **nodes;
1472:   MPI_Request *msg_ids_in, *msg_ids_out, *ids_in, *ids_out;
1473:   MPI_Status status;
1474:   PetscInt i, mask=1;

1478:   for (i=1; i<dim; i++)
1479:     {mask<<=1; mask++;}


1482:   /* strip and load s */
1483:   msg_list =list         = gs->pair_list;
1484:   msg_size =size         = gs->msg_sizes;
1485:   msg_nodes=nodes        = gs->node_list;
1486:   iptr=pw                = gs->pw_elm_list;
1487:   dptr1=dptr3            = gs->pw_vals;
1488:   msg_ids_in  = ids_in   = gs->msg_ids_in;
1489:   msg_ids_out = ids_out  = gs->msg_ids_out;
1490:   dptr2                  = gs->out;
1491:   in1=in2                = gs->in;

1493:   /* post the receives */
1494:   /*  msg_nodes=nodes; */
1495:   do
1496:     {
1497:       /* Should MPI_ANY_SOURCE be replaced by *list ? In that case do the
1498:          second one *list and do list++ afterwards */
1499:       if ((PCTFS_my_id|mask)==(*list|mask))
1500:         {
1501:           MPI_Irecv(in1, *size, MPIU_SCALAR, MPI_ANY_SOURCE, MSGTAG1 + *list, gs->PCTFS_gs_comm, msg_ids_in);
1502:           list++; msg_ids_in++;in1 += *size++;
1503:         }
1504:       else
1505:         {list++; size++;}
1506:     }
1507:   while (*++msg_nodes);

1509:   /* load gs values into in out gs buffers */
1510:   while (*iptr >= 0)
1511:     {*dptr3++ = *(in_vals + *iptr++);}

1513:   /* load out buffers and post the sends */
1514:   msg_nodes=nodes;
1515:   list = msg_list;
1516:   while ((iptr = *msg_nodes++))
1517:     {
1518:       if ((PCTFS_my_id|mask)==(*list|mask))
1519:         {
1520:           dptr3 = dptr2;
1521:           while (*iptr >= 0)
1522:             {*dptr2++ = *(dptr1 + *iptr++);}
1523:           /* CHECK PERSISTENT COMMS MODE FOR ALL THIS STUFF */
1524:           /* is msg_ids_out++ correct? */
1525:           MPI_Isend(dptr3, *msg_size, MPIU_SCALAR, *list, MSGTAG1+PCTFS_my_id, gs->PCTFS_gs_comm, msg_ids_out);
1526:           msg_size++;list++;msg_ids_out++;
1527:         }
1528:       else
1529:         {list++; msg_size++;}
1530:     }

1532:   /* do the tree while we're waiting */
1533:   if (gs->max_left_over)
1534:     {PCTFS_gs_gop_tree_plus_hc(gs,in_vals,dim);}

1536:   /* process the received data */
1537:   msg_nodes=nodes;
1538:   list = msg_list;
1539:   while ((iptr = *nodes++))
1540:     {
1541:       if ((PCTFS_my_id|mask)==(*list|mask))
1542:         {
1543:           /* Should I check the return value of MPI_Wait() or status? */
1544:           /* Can this loop be replaced by a call to MPI_Waitall()? */
1545:           MPI_Wait(ids_in, &status);
1546:           ids_in++;
1547:           while (*iptr >= 0)
1548:             {*(dptr1 + *iptr++) += *in2++;}
1549:         }
1550:       list++;
1551:     }

1553:   /* replace vals */
1554:   while (*pw >= 0)
1555:     {*(in_vals + *pw++) = *dptr1++;}

1557:   /* clear isend message handles */
1558:   /* This changed for clarity though it could be the same */
1559:   while (*msg_nodes++)
1560:     {
1561:       if ((PCTFS_my_id|mask)==(*msg_list|mask))
1562:         {
1563:           /* Should I check the return value of MPI_Wait() or status? */
1564:           /* Can this loop be replaced by a call to MPI_Waitall()? */
1565:           MPI_Wait(ids_out, &status);
1566:           ids_out++;
1567:         }
1568:       msg_list++;
1569:     }

1571:   return(0);
1572: }

1574: /******************************************************************************/
1575: static PetscErrorCode PCTFS_gs_gop_tree_plus_hc(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt dim)
1576: {
1577:   PetscInt size;
1578:   PetscInt *in, *out;
1579:   PetscScalar *buf, *work;
1580:   PetscInt op[] = {GL_ADD,0};

1583:   in   = gs->tree_map_in;
1584:   out  = gs->tree_map_out;
1585:   buf  = gs->tree_buf;
1586:   work = gs->tree_work;
1587:   size = gs->tree_nel;

1589:   PCTFS_rvec_zero(buf,size);

1591:   while (*in >= 0)
1592:     {*(buf + *out++) = *(vals + *in++);}

1594:   in   = gs->tree_map_in;
1595:   out  = gs->tree_map_out;

1597:   PCTFS_grop_hc(buf,work,size,op,dim);

1599:   while (*in >= 0)
1600:     {*(vals + *in++) = *(buf + *out++);}
1601:   return(0);
1602: }