Actual source code: gs.c

  1: /*

  3: Author: Henry M. Tufo III

  5: e-mail: hmt@cs.brown.edu

  7: snail-mail:
  8: Division of Applied Mathematics
  9: Brown University
 10: Providence, RI 02912

 12: Last Modification:
 13: 6.21.97

 15: File Description:
 16: -----------------

 18: */

 20: #include <../src/ksp/pc/impls/tfs/tfs.h>

 22: /* default length of number of items via tree - doubles if exceeded */
 23: #define TREE_BUF_SZ 2048
 24: #define GS_VEC_SZ   1

/*
Type: struct gather_scatter_id
Per-handle state for the TFS gather/scatter engine: the element list
being communicated, local-reduction tables for repeated entries, the
pairwise message plan, and the tree (fan-in/fan-out) fallback data.
*/
typedef struct gather_scatter_id {
  PetscInt     id;      /* unique handle number (stamped from num_gs_ids) */
  PetscInt     nel_min; /* global min of nel over all ranks (set in gsi_check_args) */
  PetscInt     nel_max; /* global max of nel over all ranks */
  PetscInt     nel_sum; /* global sum of nel over all ranks */
  PetscInt     negl;    /* size of the global id range: gl_max - gl_min + 1 */
  PetscInt     gl_max;  /* largest global id on any rank */
  PetscInt     gl_min;  /* smallest global id on any rank */
  PetscInt     repeats; /* number of extra (duplicate) local entries */
  PetscInt     ordered;
  PetscInt     positive;
  PetscScalar *vals;

  /* bit mask info */
  PetscInt *my_proc_mask;
  PetscInt  mask_sz;    /* processor bit-mask length in PetscInts */
  PetscInt *ngh_buf;    /* per-element neighbor bit masks (freed after setup) */
  PetscInt  ngh_buf_sz;
  PetscInt *nghs;       /* expanded list of neighbor ranks */
  PetscInt  num_nghs;
  PetscInt  max_nghs;   /* global max of num_nghs */
  PetscInt *pw_nghs;    /* bit mask of pairwise neighbors */
  PetscInt  num_pw_nghs;
  PetscInt *tree_nghs;
  PetscInt  num_tree_nghs;

  PetscInt num_loads; /* number of giop exchange passes in get_ngh_buf() */

  /* repeats == true -> local info */
  PetscInt  nel;  /* number of unique elements */
  PetscInt *elms; /* of size nel */
  PetscInt  nel_total;
  PetscInt *local_elms; /* of size nel_total */
  PetscInt *companion;  /* of size nel_total */

  /* local info */
  PetscInt   num_local_total;
  PetscInt   local_strength; /* NONE / FULL / PARTIAL (see gsi_via_bit_mask) */
  PetscInt   num_local;
  PetscInt  *num_local_reduce;
  PetscInt **local_reduce; /* NULL-terminated list of -1-terminated index lists */
  PetscInt   num_local_gop;
  PetscInt  *num_gop_local_reduce;
  PetscInt **gop_local_reduce;

  /* pairwise info */
  PetscInt     level;     /* max #sharers handled pairwise; rest go via tree */
  PetscInt     num_pairs; /* number of ranks exchanged with pairwise */
  PetscInt     max_pairs;
  PetscInt     loc_node_pairs;
  PetscInt     max_node_pairs;
  PetscInt     min_node_pairs;
  PetscInt     avg_node_pairs;
  PetscInt    *pair_list; /* ranks to exchange with */
  PetscInt    *msg_sizes; /* #entries sent to each rank in pair_list */
  PetscInt   **node_list; /* per-rank -1-terminated lists into pw_elm_list */
  PetscInt     len_pw_list;
  PetscInt    *pw_elm_list;
  PetscScalar *pw_vals;

  MPI_Request *msg_ids_in;  /* nprs+1 requests, last is MPI_REQUEST_NULL */
  MPI_Request *msg_ids_out;

  PetscScalar *out; /* staging buffer for outgoing pairwise values */
  PetscScalar *in;  /* staging buffer for incoming pairwise values */
  PetscInt     msg_total;

  /* tree - crystal accumulator info */
  PetscInt   max_left_over; /* #entries that must go through the tree */
  PetscInt  *pre;
  PetscInt  *in_num;
  PetscInt  *out_num;
  PetscInt **in_list;
  PetscInt **out_list;

  /* new tree work*/
  PetscInt     tree_nel;
  PetscInt    *tree_elms;
  PetscScalar *tree_buf;
  PetscScalar *tree_work;

  PetscInt  tree_map_sz;
  PetscInt *tree_map_in;  /* local indices with a tree contribution */
  PetscInt *tree_map_out; /* matching positions in tree_elms */

  /* current memory status */
  PetscInt gl_bss_min;
  PetscInt gl_perm_min;

  /* max segment size for PCTFS_gs_gop_vec() */
  PetscInt vec_sz;

  /* hack to make paul happy */
  MPI_Comm PCTFS_gs_comm; /* private clone of MPI_COMM_WORLD, freed in PCTFS_gs_free */

} PCTFS_gs_id;

126: static PCTFS_gs_id   *gsi_check_args(PetscInt *elms, PetscInt nel, PetscInt level);
127: static PetscErrorCode gsi_via_bit_mask(PCTFS_gs_id *gs);
128: static PetscErrorCode get_ngh_buf(PCTFS_gs_id *gs);
129: static PetscErrorCode set_pairwise(PCTFS_gs_id *gs);
130: static PCTFS_gs_id   *gsi_new(void);
131: static PetscErrorCode set_tree(PCTFS_gs_id *gs);

133: /* same for all but vector flavor */
134: static PetscErrorCode PCTFS_gs_gop_local_out(PCTFS_gs_id *gs, PetscScalar *vals);
135: /* vector flavor */
136: static PetscErrorCode PCTFS_gs_gop_vec_local_out(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);

138: static PetscErrorCode PCTFS_gs_gop_vec_plus(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt step);
139: static PetscErrorCode PCTFS_gs_gop_vec_pairwise_plus(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt step);
140: static PetscErrorCode PCTFS_gs_gop_vec_local_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);
141: static PetscErrorCode PCTFS_gs_gop_vec_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);
142: static PetscErrorCode PCTFS_gs_gop_vec_tree_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);

144: static PetscErrorCode PCTFS_gs_gop_local_plus(PCTFS_gs_id *gs, PetscScalar *vals);
145: static PetscErrorCode PCTFS_gs_gop_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals);

147: static PetscErrorCode PCTFS_gs_gop_plus_hc(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt dim);
148: static PetscErrorCode PCTFS_gs_gop_pairwise_plus_hc(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt dim);
149: static PetscErrorCode PCTFS_gs_gop_tree_plus_hc(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt dim);

/* global vars */
/* from comm.c module */

/* number of handles created so far; stamps PCTFS_gs_id.id */
static PetscInt num_gs_ids = 0;

/* should make this dynamic ... later */
static PetscInt  msg_buf     = MAX_MSG_BUF; /* exchange buffer budget in bytes (see get_ngh_buf) */
static PetscInt  vec_sz      = GS_VEC_SZ;   /* segment size for the vector gop flavor */
static PetscInt *tree_buf    = NULL;        /* scratch accumulator filled by place_in_tree() */
static PetscInt  tree_buf_sz = 0;           /* capacity of tree_buf (entries) */
static PetscInt  ntree       = 0;           /* entries currently in tree_buf */

163: /***************************************************************************/
164: PetscErrorCode PCTFS_gs_init_vec_sz(PetscInt size)
165: {
166:   PetscFunctionBegin;
167:   vec_sz = size;
168:   PetscFunctionReturn(PETSC_SUCCESS);
169: }

171: /******************************************************************************/
172: PetscErrorCode PCTFS_gs_init_msg_buf_sz(PetscInt buf_size)
173: {
174:   PetscFunctionBegin;
175:   msg_buf = buf_size;
176:   PetscFunctionReturn(PETSC_SUCCESS);
177: }

179: /******************************************************************************/
180: PCTFS_gs_id *PCTFS_gs_init(PetscInt *elms, PetscInt nel, PetscInt level)
181: {
182:   PCTFS_gs_id *gs;
183:   MPI_Group    PCTFS_gs_group;
184:   MPI_Comm     PCTFS_gs_comm;

186:   /* ensure that communication package has been initialized */
187:   PetscCallAbort(PETSC_COMM_SELF, PCTFS_comm_init());

189:   /* determines if we have enough dynamic/semi-static memory */
190:   /* checks input, allocs and sets gd_id template            */
191:   gs = gsi_check_args(elms, nel, level);

193:   /* only bit mask version up and working for the moment    */
194:   /* LATER :: get int list version working for sparse pblms */
195:   PetscCallAbort(PETSC_COMM_WORLD, gsi_via_bit_mask(gs));

197:   PetscCallAbort(PETSC_COMM_WORLD, MPI_Comm_group(MPI_COMM_WORLD, &PCTFS_gs_group) ? PETSC_ERR_MPI : PETSC_SUCCESS);
198:   PetscCallAbort(PETSC_COMM_WORLD, MPI_Comm_create(MPI_COMM_WORLD, PCTFS_gs_group, &PCTFS_gs_comm) ? PETSC_ERR_MPI : PETSC_SUCCESS);
199:   PetscCallAbort(PETSC_COMM_WORLD, MPI_Group_free(&PCTFS_gs_group) ? PETSC_ERR_MPI : PETSC_SUCCESS);

201:   gs->PCTFS_gs_comm = PCTFS_gs_comm;

203:   return gs;
204: }

206: /******************************************************************************/
207: static PCTFS_gs_id *gsi_new(void)
208: {
209:   PCTFS_gs_id *gs;
210:   gs = (PCTFS_gs_id *)malloc(sizeof(PCTFS_gs_id));
211:   PetscCallAbort(PETSC_COMM_WORLD, PetscMemzero(gs, sizeof(PCTFS_gs_id)));
212:   return gs;
213: }

/******************************************************************************/
/* Validate the caller's element list and build the gs template:
   - drops zero entries (global id 0 means "does not participate");
   - sorts the surviving list (keeping a companion inverse map) if needed;
   - compresses repeated ids into local-reduction lists;
   - gathers global min/max/sum statistics via PCTFS_giop().
   Aborts (SETERRABORT) on invalid input; returns the new handle. */
static PCTFS_gs_id *gsi_check_args(PetscInt *in_elms, PetscInt nel, PetscInt level)
{
  PetscInt     i, j, k, t2;
  PetscInt    *companion, *elms, *unique, *iptr;
  PetscInt     num_local = 0, *num_to_reduce, **local_reduce;
  PetscInt     oprs[]    = {NON_UNIFORM, GL_MIN, GL_MAX, GL_ADD, GL_MIN, GL_MAX, GL_MIN, GL_B_AND};
  PetscInt     vals[PETSC_STATIC_ARRAY_LENGTH(oprs) - 1];
  PetscInt     work[PETSC_STATIC_ARRAY_LENGTH(oprs) - 1];
  PCTFS_gs_id *gs;

  if (!in_elms) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "elms point to nothing!!!\n");
  if (nel < 0) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "can't have fewer than 0 elms!!!\n");

  if (nel == 0) PetscCallAbort(PETSC_COMM_WORLD, PetscInfo(0, "I don't have any elements!!!\n"));

  /* get space for gs template */
  gs     = gsi_new();
  gs->id = ++num_gs_ids;

  /* hmt 6.4.99                                            */
  /* caller can set global ids that don't participate to 0 */
  /* PCTFS_gs_init ignores all zeros in elm list                 */
  /* negative global ids are still invalid                 */
  for (i = j = 0; i < nel; i++) {
    if (in_elms[i] != 0) j++;
  }

  /* k keeps the original length; nel becomes the nonzero count */
  k   = nel;
  nel = j;

  /* copy over in_elms list and create inverse map */
  elms      = (PetscInt *)malloc((nel + 1) * sizeof(PetscInt)); /* +1 slot for the sentinel */
  companion = (PetscInt *)malloc(nel * sizeof(PetscInt));

  for (i = j = 0; i < k; i++) {
    if (in_elms[i] != 0) {
      elms[j]        = in_elms[i];
      companion[j++] = i; /* companion[j] = original position of elms[j] */
    }
  }

  if (j != nel) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "nel j mismatch!\n");

  /* pre-pass ... check to see if sorted */
  elms[nel] = INT_MAX; /* sentinel stops the scan */
  iptr      = elms;
  unique    = elms + 1;
  j         = 0;
  while (*iptr != INT_MAX) {
    if (*iptr++ > *unique++) {
      j = 1; /* found an out-of-order pair */
      break;
    }
  }

  /* set up inverse map */
  if (j) {
    PetscCallAbort(PETSC_COMM_WORLD, PetscInfo(0, "gsi_check_args() :: elm list *not* sorted!\n"));
    /* sort ids and permute the companion map along with them */
    PetscCallAbort(PETSC_COMM_WORLD, PCTFS_SMI_sort((void *)elms, (void *)companion, nel, SORT_INTEGER));
  } else PetscCallAbort(PETSC_COMM_WORLD, PetscInfo(0, "gsi_check_args() :: elm list sorted!\n"));
  elms[nel] = INT_MIN; /* sentinel terminates the clumping loops below */

  /* first pass */
  /* determine number of unique elements, check pd */
  for (i = k = 0; i < nel; i += j) {
    t2 = elms[i];
    j  = ++i;

    /* clump 'em for now */
    while (elms[j] == t2) j++;

    /* how many together and num local */
    if (j -= i) {
      num_local++; /* one more group containing repeats */
      k += j;      /* k accumulates the number of extra copies */
    }
  }

  /* how many unique elements? */
  gs->repeats = k;
  gs->nel     = nel - k;

  /* number of repeats? */
  gs->num_local = num_local;
  num_local += 2; /* two extra slots for the partition-sort hack below */
  gs->local_reduce = local_reduce = (PetscInt **)malloc(num_local * sizeof(PetscInt *));
  gs->num_local_reduce = num_to_reduce = (PetscInt *)malloc(num_local * sizeof(PetscInt));

  unique         = (PetscInt *)malloc((gs->nel + 1) * sizeof(PetscInt));
  gs->elms       = unique;
  gs->nel_total  = nel;
  gs->local_elms = elms;
  gs->companion  = companion;

  /* compess map as well as keep track of local ops */
  for (num_local = i = j = 0; i < gs->nel; i++) {
    k  = j;
    t2 = unique[i] = elms[j]; /* i-th unique id */
    companion[i]   = companion[j];

    /* skip over the duplicates of this id */
    while (elms[j] == t2) j++;

    if ((t2 = (j - k)) > 1) {
      /* number together */
      num_to_reduce[num_local] = t2++;

      iptr = local_reduce[num_local++] = (PetscInt *)malloc(t2 * sizeof(PetscInt));

      /* to use binary searching don't remap until we check intersection */
      *iptr++ = i;

      /* note that we're skipping the first one */
      while (++k < j) *(iptr++) = companion[k];
      *iptr = -1; /* each reduction list is -1 terminated */
    }
  }

  /* sentinel for ngh_buf */
  unique[gs->nel] = INT_MAX;

  /* for two partition sort hack */
  num_to_reduce[num_local]   = 0;
  local_reduce[num_local]    = NULL;
  num_to_reduce[++num_local] = 0;
  local_reduce[num_local]    = NULL;

  /* load 'em up */
  /* note one extra to hold NON_UNIFORM flag!!! */
  vals[2] = vals[1] = vals[0] = nel;
  if (gs->nel > 0) {
    vals[3] = unique[0];           /* local min id (list is sorted) */
    vals[4] = unique[gs->nel - 1]; /* local max id */
  } else {
    vals[3] = INT_MAX;
    vals[4] = INT_MIN;
  }
  vals[5] = level;
  vals[6] = num_gs_ids;

  /* GLOBAL: send 'em out */
  PetscCallAbort(PETSC_COMM_WORLD, PCTFS_giop(vals, work, PETSC_STATIC_ARRAY_LENGTH(oprs) - 1, oprs));

  /* must be semi-pos def - only pairwise depends on this */
  /* LATER - remove this restriction */
  if (vals[3] < 0) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "gsi_check_args() :: system not semi-pos def \n");
  if (vals[4] == INT_MAX) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "gsi_check_args() :: system ub too large !\n");

  gs->nel_min = vals[0];
  gs->nel_max = vals[1];
  gs->nel_sum = vals[2];
  gs->gl_min  = vals[3];
  gs->gl_max  = vals[4];
  gs->negl    = vals[4] - vals[3] + 1;

  if (gs->negl <= 0) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "gsi_check_args() :: system empty or neg :: %" PetscInt_FMT "\n", gs->negl);

  /* LATER :: add level == -1 -> program selects level */
  if (vals[5] < 0) vals[5] = 0;
  else if (vals[5] > PCTFS_num_nodes) vals[5] = PCTFS_num_nodes;
  gs->level = vals[5];

  return gs;
}

/******************************************************************************/
/* Build the full communication schedule for a handle: neighbor discovery
   (get_ngh_buf), the pairwise plan (set_pairwise) and the tree plan
   (set_tree), then reconcile local-reduction groups that also appear in
   the pairwise/tree lists, and finally remap everything from sorted-id
   indices to original local dof indices via the companion map. */
static PetscErrorCode gsi_via_bit_mask(PCTFS_gs_id *gs)
{
  PetscInt   i, nel, *elms;
  PetscInt   t1;
  PetscInt **reduce;
  PetscInt  *map;

  PetscFunctionBegin;
  /* totally local removes ... PCTFS_ct_bits == 0 */
  PetscCall(get_ngh_buf(gs));

  if (gs->level) PetscCall(set_pairwise(gs));
  if (gs->max_left_over) PetscCall(set_tree(gs));

  /* intersection local and pairwise/tree? */
  gs->num_local_total      = gs->num_local;
  gs->gop_local_reduce     = gs->local_reduce;
  gs->num_gop_local_reduce = gs->num_local_reduce;

  map = gs->companion;

  /* is there any local compression */
  if (!gs->num_local) {
    gs->local_strength = NONE;
    gs->num_local_gop  = 0;
  } else {
    /* ok find intersection */
    map    = gs->companion;
    reduce = gs->local_reduce;
    for (i = 0, t1 = 0; i < gs->num_local; i++, reduce++) {
      if ((PCTFS_ivec_binary_search(**reduce, gs->pw_elm_list, gs->len_pw_list) >= 0) || PCTFS_ivec_binary_search(**reduce, gs->tree_map_in, gs->tree_map_sz) >= 0) {
        t1++;
        PetscCheck(gs->num_local_reduce[i] > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "nobody in list?");
        /* negate the count to tag this group as shared with pw/tree */
        gs->num_local_reduce[i] *= -1;
      }
      /* remap the group representative to its original dof index */
      **reduce = map[**reduce];
    }

    /* intersection is empty */
    if (!t1) {
      gs->local_strength = FULL;
      gs->num_local_gop  = 0;
    } else { /* intersection not empty */
      gs->local_strength = PARTIAL;

      /* sort so that the negated (shared) groups come first */
      PetscCall(PCTFS_SMI_sort((void *)gs->num_local_reduce, (void *)gs->local_reduce, gs->num_local + 1, SORT_INT_PTR));

      gs->num_local_gop   = t1;
      gs->num_local_total = gs->num_local;
      gs->num_local -= t1;
      gs->gop_local_reduce     = gs->local_reduce;
      gs->num_gop_local_reduce = gs->num_local_reduce;

      /* restore the true sizes of the shared groups, and advance the
         purely-local views past them (plus the NULL separator) */
      for (i = 0; i < t1; i++) {
        PetscCheck(gs->num_gop_local_reduce[i] < 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "they aren't negative?");
        gs->num_gop_local_reduce[i] *= -1;
        gs->local_reduce++;
        gs->num_local_reduce++;
      }
      gs->local_reduce++;
      gs->num_local_reduce++;
    }
  }

  /* remap the pairwise element list to original dof indices */
  elms = gs->pw_elm_list;
  nel  = gs->len_pw_list;
  for (i = 0; i < nel; i++) elms[i] = map[elms[i]];

  /* remap the tree input list likewise */
  elms = gs->tree_map_in;
  nel  = gs->tree_map_sz;
  for (i = 0; i < nel; i++) elms[i] = map[elms[i]];

  /* clean up */
  free((void *)gs->local_elms);
  free((void *)gs->companion);
  free((void *)gs->elms);
  free((void *)gs->ngh_buf);
  gs->local_elms = gs->companion = gs->elms = gs->ngh_buf = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

462: /******************************************************************************/
463: static PetscErrorCode place_in_tree(PetscInt elm)
464: {
465:   PetscInt *tp, n;

467:   PetscFunctionBegin;
468:   if (ntree == tree_buf_sz) {
469:     if (tree_buf_sz) {
470:       tp = tree_buf;
471:       n  = tree_buf_sz;
472:       tree_buf_sz <<= 1;
473:       tree_buf = (PetscInt *)malloc(tree_buf_sz * sizeof(PetscInt));
474:       PCTFS_ivec_copy(tree_buf, tp, n);
475:       free(tp);
476:     } else {
477:       tree_buf_sz = TREE_BUF_SZ;
478:       tree_buf    = (PetscInt *)malloc(tree_buf_sz * sizeof(PetscInt));
479:     }
480:   }

482:   tree_buf[ntree++] = elm;
483:   PetscFunctionReturn(PETSC_SUCCESS);
484: }

/******************************************************************************/
/* Discover, for every locally-held global id, which other ranks share it.
   Each rank contributes its processor bit mask at the id's slot in a
   global bitwise-OR exchange (PCTFS_giop), done in num_loads passes so the
   buffer stays within msg_buf bytes.  Ids shared by <= level ranks are
   tagged for pairwise exchange (TOP_BIT); the rest are queued for the
   tree via place_in_tree().  NOTE: sizes flip between bytes and
   PetscInt counts below — the /= sizeof(PetscInt) conversions matter. */
static PetscErrorCode get_ngh_buf(PCTFS_gs_id *gs)
{
  PetscInt  i, j, npw = 0, ntree_map = 0;
  PetscInt  p_mask_size, ngh_buf_size, buf_size;
  PetscInt *p_mask, *sh_proc_mask, *pw_sh_proc_mask;
  PetscInt *ngh_buf, *buf1, *buf2;
  PetscInt  offset, per_load, num_loads, or_ct, start, end;
  PetscInt *ptr1, *ptr2, i_start, negl, nel, *elms;
  PetscInt  oper = GL_B_OR;
  PetscInt *ptr3, *t_mask, level, ct1, ct2;

  PetscFunctionBegin;
  /* to make life easier */
  nel   = gs->nel;
  elms  = gs->elms;
  level = gs->level;

  /* det #bytes needed for processor bit masks and init w/mask cor. to PCTFS_my_id */
  p_mask = (PetscInt *)malloc(p_mask_size = PCTFS_len_bit_mask(PCTFS_num_nodes));
  PetscCall(PCTFS_set_bit_mask(p_mask, p_mask_size, PCTFS_my_id));

  /* allocate space for masks and info bufs (sizes are in bytes here) */
  gs->nghs = sh_proc_mask = (PetscInt *)malloc(p_mask_size);
  gs->pw_nghs = pw_sh_proc_mask = (PetscInt *)malloc(p_mask_size);
  gs->ngh_buf_sz = ngh_buf_size = p_mask_size * nel;
  t_mask                        = (PetscInt *)malloc(p_mask_size);
  gs->ngh_buf = ngh_buf = (PetscInt *)malloc(ngh_buf_size);

  /* comm buffer size ... memory usage bounded by ~2*msg_buf */
  /* had thought I could exploit rendezvous threshold */

  /* default is one pass */
  per_load = negl = gs->negl;
  gs->num_loads = num_loads = 1;
  i                         = p_mask_size * negl;

  /* possible overflow on buffer size */
  /* overflow hack                    */
  if (i < 0) i = INT_MAX;

  buf_size = PetscMin(msg_buf, i);

  /* can we do it? */
  PetscCheck(p_mask_size <= buf_size, PETSC_COMM_SELF, PETSC_ERR_PLIB, "get_ngh_buf() :: buf<pms :: %" PetscInt_FMT ">%" PetscInt_FMT, p_mask_size, buf_size);

  /* get PCTFS_giop buf space ... make *only* one malloc */
  buf1 = (PetscInt *)malloc(buf_size << 1);

  /* more than one gior exchange needed? */
  if (buf_size != i) {
    per_load      = buf_size / p_mask_size;      /* ids per pass */
    buf_size      = per_load * p_mask_size;      /* round down to whole masks */
    gs->num_loads = num_loads = negl / per_load + (negl % per_load > 0);
  }

  /* convert buf sizes from #bytes to #ints - 32-bit only! */
  p_mask_size /= sizeof(PetscInt);
  ngh_buf_size /= sizeof(PetscInt);
  buf_size /= sizeof(PetscInt);

  /* find PCTFS_giop work space */
  buf2 = buf1 + buf_size;

  /* hold #ints needed for processor masks */
  gs->mask_sz = p_mask_size;

  /* init buffers */
  PetscCall(PCTFS_ivec_zero(sh_proc_mask, p_mask_size));
  PetscCall(PCTFS_ivec_zero(pw_sh_proc_mask, p_mask_size));
  PetscCall(PCTFS_ivec_zero(ngh_buf, ngh_buf_size));

  /* HACK reset tree info */
  tree_buf    = NULL;
  tree_buf_sz = ntree = 0;

  /* ok do it */
  for (ptr1 = ngh_buf, ptr2 = elms, end = gs->gl_min, or_ct = i = 0; or_ct < num_loads; or_ct++) {
    /* identity for bitwise or is 000...000 */
    PetscCall(PCTFS_ivec_zero(buf1, buf_size));

    /* load msg buffer: drop my mask at the slot of every id in this window */
    for (start = end, end += per_load, i_start = i; (offset = *ptr2) < end; i++, ptr2++) {
      offset = (offset - start) * p_mask_size;
      PCTFS_ivec_copy(buf1 + offset, p_mask, p_mask_size);
    }

    /* GLOBAL: pass buffer */
    PetscCall(PCTFS_giop(buf1, buf2, buf_size, &oper));

    /* unload buffer into ngh_buf */
    ptr2 = (elms + i_start);
    for (ptr3 = buf1, j = start; j < end; ptr3 += p_mask_size, j++) {
      /* I own it ... may have to pairwise it */
      if (j == *ptr2) {
        /* do i share it w/anyone? (#set bits in the OR-ed mask) */
        ct1 = PCTFS_ct_bits((char *)ptr3, p_mask_size * sizeof(PetscInt));
        /* guess not */
        if (ct1 < 2) {
          ptr2++;
          ptr1 += p_mask_size;
          continue;
        }

        /* i do ... so keep info and turn off my bit */
        PCTFS_ivec_copy(ptr1, ptr3, p_mask_size);
        PetscCall(PCTFS_ivec_xor(ptr1, p_mask, p_mask_size));
        PetscCall(PCTFS_ivec_or(sh_proc_mask, ptr1, p_mask_size));

        /* is it to be done pairwise? */
        if (--ct1 <= level) {
          npw++;

          /* turn on high bit to indicate pw need to process */
          *ptr2++ |= TOP_BIT;
          PetscCall(PCTFS_ivec_or(pw_sh_proc_mask, ptr1, p_mask_size));
          ptr1 += p_mask_size;
          continue;
        }

        /* get set for next and note that I have a tree contribution */
        /* could save exact elm index for tree here -> save a search */
        ptr2++;
        ptr1 += p_mask_size;
        ntree_map++;
      } else { /* i don't but still might be involved in tree */

        /* shared by how many? */
        ct1 = PCTFS_ct_bits((char *)ptr3, p_mask_size * sizeof(PetscInt));

        /* none! */
        if (ct1 < 2) continue;

        /* is it going to be done pairwise? but not by me of course!*/
        if (--ct1 <= level) continue;
      }
      /* LATER we're going to have to process it NOW */
      /* nope ... tree it */
      PetscCall(place_in_tree(j));
    }
  }

  free((void *)t_mask);
  free((void *)buf1);

  gs->len_pw_list = npw;
  gs->num_nghs    = PCTFS_ct_bits((char *)sh_proc_mask, p_mask_size * sizeof(PetscInt));

  /* expand from bit mask list to int list and save ngh list */
  gs->nghs = (PetscInt *)malloc(gs->num_nghs * sizeof(PetscInt));
  PetscCall(PCTFS_bm_to_proc((char *)sh_proc_mask, p_mask_size * sizeof(PetscInt), gs->nghs));

  gs->num_pw_nghs = PCTFS_ct_bits((char *)pw_sh_proc_mask, p_mask_size * sizeof(PetscInt));

  /* global max of the neighbor count */
  oper = GL_MAX;
  ct1  = gs->num_nghs;
  PetscCall(PCTFS_giop(&ct1, &ct2, 1, &oper));
  gs->max_nghs = ct1;

  gs->tree_map_sz   = ntree_map;
  gs->max_left_over = ntree;

  free((void *)p_mask);
  free((void *)sh_proc_mask);
  PetscFunctionReturn(PETSC_SUCCESS);
}

/******************************************************************************/
/* Build the pairwise exchange plan from the TOP_BIT tags and neighbor
   masks produced by get_ngh_buf(): the list of elements exchanged
   pairwise, the per-neighbor message sizes and node lists, the MPI
   request arrays, and the staging buffers.  Also gathers global
   max/min/avg message statistics via PCTFS_giop(). */
static PetscErrorCode set_pairwise(PCTFS_gs_id *gs)
{
  PetscInt  i, j;
  PetscInt  p_mask_size;
  PetscInt *p_mask, *sh_proc_mask, *tmp_proc_mask;
  PetscInt *ngh_buf, *buf2;
  PetscInt  offset;
  PetscInt *msg_list, *msg_size, **msg_nodes, nprs;
  PetscInt *pairwise_elm_list, len_pair_list = 0;
  PetscInt *iptr, t1, i_start, nel, *elms;
  PetscInt  ct;

  PetscFunctionBegin;
  /* to make life easier */
  nel          = gs->nel;
  elms         = gs->elms;
  ngh_buf      = gs->ngh_buf;
  sh_proc_mask = gs->pw_nghs;

  /* need a few temp masks (sizes in bytes here) */
  p_mask_size   = PCTFS_len_bit_mask(PCTFS_num_nodes);
  p_mask        = (PetscInt *)malloc(p_mask_size);
  tmp_proc_mask = (PetscInt *)malloc(p_mask_size);

  /* set mask to my PCTFS_my_id's bit mask */
  PetscCall(PCTFS_set_bit_mask(p_mask, p_mask_size, PCTFS_my_id));

  /* from here on p_mask_size counts PetscInts, not bytes */
  p_mask_size /= sizeof(PetscInt);

  len_pair_list   = gs->len_pw_list;
  gs->pw_elm_list = pairwise_elm_list = (PetscInt *)malloc((len_pair_list + 1) * sizeof(PetscInt));

  /* how many processors (nghs) do we have to exchange with? */
  nprs = gs->num_pairs = PCTFS_ct_bits((char *)sh_proc_mask, p_mask_size * sizeof(PetscInt));

  /* allocate space for PCTFS_gs_gop() info */
  gs->pair_list = msg_list = (PetscInt *)malloc(sizeof(PetscInt) * nprs);
  gs->msg_sizes = msg_size = (PetscInt *)malloc(sizeof(PetscInt) * nprs);
  gs->node_list = msg_nodes = (PetscInt **)malloc(sizeof(PetscInt *) * (nprs + 1));

  /* init msg_size list */
  PetscCall(PCTFS_ivec_zero(msg_size, nprs));

  /* expand from bit mask list to int list */
  PetscCall(PCTFS_bm_to_proc((char *)sh_proc_mask, p_mask_size * sizeof(PetscInt), msg_list));

  /* keep list of elements being handled pairwise */
  for (i = j = 0; i < nel; i++) {
    if (elms[i] & TOP_BIT) {
      /* clear the marker set by get_ngh_buf() */
      elms[i] ^= TOP_BIT;
      pairwise_elm_list[j++] = i;
    }
  }
  pairwise_elm_list[j] = -1; /* -1 terminated */

  gs->msg_ids_out       = (MPI_Request *)malloc(sizeof(MPI_Request) * (nprs + 1));
  gs->msg_ids_out[nprs] = MPI_REQUEST_NULL;
  gs->msg_ids_in        = (MPI_Request *)malloc(sizeof(MPI_Request) * (nprs + 1));
  gs->msg_ids_in[nprs]  = MPI_REQUEST_NULL;
  gs->pw_vals           = (PetscScalar *)malloc(sizeof(PetscScalar) * len_pair_list * vec_sz);

  /* find who goes to each processor */
  for (i_start = i = 0; i < nprs; i++) {
    /* processor i's mask */
    PetscCall(PCTFS_set_bit_mask(p_mask, p_mask_size * sizeof(PetscInt), msg_list[i]));

    /* det # going to processor i */
    for (ct = j = 0; j < len_pair_list; j++) {
      buf2 = ngh_buf + (pairwise_elm_list[j] * p_mask_size);
      PetscCall(PCTFS_ivec_and3(tmp_proc_mask, p_mask, buf2, p_mask_size));
      if (PCTFS_ct_bits((char *)tmp_proc_mask, p_mask_size * sizeof(PetscInt))) ct++;
    }
    msg_size[i] = ct;
    i_start     = PetscMax(i_start, ct); /* track the largest message */

    /*space to hold nodes in message to first neighbor */
    msg_nodes[i] = iptr = (PetscInt *)malloc(sizeof(PetscInt) * (ct + 1));

    /* second pass: record which pairwise elements go to this neighbor */
    for (j = 0; j < len_pair_list; j++) {
      buf2 = ngh_buf + (pairwise_elm_list[j] * p_mask_size);
      PetscCall(PCTFS_ivec_and3(tmp_proc_mask, p_mask, buf2, p_mask_size));
      if (PCTFS_ct_bits((char *)tmp_proc_mask, p_mask_size * sizeof(PetscInt))) *iptr++ = j;
    }
    *iptr = -1; /* -1 terminated */
  }
  msg_nodes[nprs] = NULL; /* NULL terminated */

  /* gather global message statistics (max/min/avg of the local max) */
  j = gs->loc_node_pairs = i_start;
  t1                     = GL_MAX;
  PetscCall(PCTFS_giop(&i_start, &offset, 1, &t1));
  gs->max_node_pairs = i_start;

  i_start = j;
  t1      = GL_MIN;
  PetscCall(PCTFS_giop(&i_start, &offset, 1, &t1));
  gs->min_node_pairs = i_start;

  i_start = j;
  t1      = GL_ADD;
  PetscCall(PCTFS_giop(&i_start, &offset, 1, &t1));
  gs->avg_node_pairs = i_start / PCTFS_num_nodes + 1;

  i_start = nprs;
  t1      = GL_MAX;
  PetscCall(PCTFS_giop(&i_start, &offset, 1, &t1));
  gs->max_pairs = i_start;

  /* remap pairwise in tail of gsi_via_bit_mask() */
  gs->msg_total = PCTFS_ivec_sum(gs->msg_sizes, nprs);
  gs->out       = (PetscScalar *)malloc(sizeof(PetscScalar) * gs->msg_total * vec_sz);
  gs->in        = (PetscScalar *)malloc(sizeof(PetscScalar) * gs->msg_total * vec_sz);

  /* reset malloc pool */
  free((void *)p_mask);
  free((void *)tmp_proc_mask);
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* to do pruned tree just save ngh buf copy for each one and decode here!
******************************************************************************/
/* Build the tree (fan-in/fan-out) plan from the module-level accumulator
   filled by place_in_tree(): take ownership of tree_buf as gs->tree_elms,
   allocate the tree value buffers, and intersect the tree element list
   with the local element list to produce the -1-terminated tree_map_in
   (local index) / tree_map_out (tree slot) pair. */
static PetscErrorCode set_tree(PCTFS_gs_id *gs)
{
  PetscInt  i, j, n, nel;
  PetscInt *iptr_in, *iptr_out, *tree_elms, *elms;

  PetscFunctionBegin;
  /* local work ptrs */
  elms = gs->elms;
  nel  = gs->nel;

  /* how many via tree */
  gs->tree_nel = n = ntree;
  /* gs takes ownership of the module-level tree_buf accumulator */
  gs->tree_elms = tree_elms = iptr_in = tree_buf;
  gs->tree_buf                        = (PetscScalar *)malloc(sizeof(PetscScalar) * n * vec_sz);
  gs->tree_work                       = (PetscScalar *)malloc(sizeof(PetscScalar) * n * vec_sz);
  j                                   = gs->tree_map_sz;
  /* +1 slot for the -1 sentinel; iptr_in is repointed at the new map */
  gs->tree_map_in = iptr_in = (PetscInt *)malloc(sizeof(PetscInt) * (j + 1));
  gs->tree_map_out = iptr_out = (PetscInt *)malloc(sizeof(PetscInt) * (j + 1));

  /* search the longer of the two lists */
  /* note ... could save this info in get_ngh_buf and save searches */
  if (n <= nel) {
    /* bijective fct w/remap - search elm list */
    for (i = 0; i < n; i++) {
      if ((j = PCTFS_ivec_binary_search(*tree_elms++, elms, nel)) >= 0) {
        *iptr_in++  = j;
        *iptr_out++ = i;
      }
    }
  } else {
    /* fewer local elements than tree entries - search the tree list */
    for (i = 0; i < nel; i++) {
      if ((j = PCTFS_ivec_binary_search(*elms++, tree_elms, n)) >= 0) {
        *iptr_in++  = i;
        *iptr_out++ = j;
      }
    }
  }

  /* sentinel */
  *iptr_in = *iptr_out = -1;
  PetscFunctionReturn(PETSC_SUCCESS);
}

817: /******************************************************************************/
818: static PetscErrorCode PCTFS_gs_gop_local_out(PCTFS_gs_id *gs, PetscScalar *vals)
819: {
820:   PetscInt   *num, *map, **reduce;
821:   PetscScalar tmp;

823:   PetscFunctionBegin;
824:   num    = gs->num_gop_local_reduce;
825:   reduce = gs->gop_local_reduce;
826:   while ((map = *reduce++)) {
827:     /* wall */
828:     if (*num == 2) {
829:       num++;
830:       vals[map[1]] = vals[map[0]];
831:     } else if (*num == 3) { /* corner shared by three elements */
832:       num++;
833:       vals[map[2]] = vals[map[1]] = vals[map[0]];
834:     } else if (*num == 4) { /* corner shared by four elements */
835:       num++;
836:       vals[map[3]] = vals[map[2]] = vals[map[1]] = vals[map[0]];
837:     } else { /* general case ... odd geoms ... 3D*/
838:       num++;
839:       tmp = *(vals + *map++);
840:       while (*map >= 0) *(vals + *map++) = tmp;
841:     }
842:   }
843:   PetscFunctionReturn(PETSC_SUCCESS);
844: }

846: /******************************************************************************/
847: static PetscErrorCode PCTFS_gs_gop_local_plus(PCTFS_gs_id *gs, PetscScalar *vals)
848: {
849:   PetscInt   *num, *map, **reduce;
850:   PetscScalar tmp;

852:   PetscFunctionBegin;
853:   num    = gs->num_local_reduce;
854:   reduce = gs->local_reduce;
855:   while ((map = *reduce)) {
856:     /* wall */
857:     if (*num == 2) {
858:       num++;
859:       reduce++;
860:       vals[map[1]] = vals[map[0]] += vals[map[1]];
861:     } else if (*num == 3) { /* corner shared by three elements */
862:       num++;
863:       reduce++;
864:       vals[map[2]] = vals[map[1]] = vals[map[0]] += (vals[map[1]] + vals[map[2]]);
865:     } else if (*num == 4) { /* corner shared by four elements */
866:       num++;
867:       reduce++;
868:       vals[map[1]] = vals[map[2]] = vals[map[3]] = vals[map[0]] += (vals[map[1]] + vals[map[2]] + vals[map[3]]);
869:     } else { /* general case ... odd geoms ... 3D*/
870:       num++;
871:       tmp = 0.0;
872:       while (*map >= 0) tmp += *(vals + *map++);

874:       map = *reduce++;
875:       while (*map >= 0) *(vals + *map++) = tmp;
876:     }
877:   }
878:   PetscFunctionReturn(PETSC_SUCCESS);
879: }

881: /******************************************************************************/
882: static PetscErrorCode PCTFS_gs_gop_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals)
883: {
884:   PetscInt    *num, *map, **reduce;
885:   PetscScalar *base;

887:   PetscFunctionBegin;
888:   num    = gs->num_gop_local_reduce;
889:   reduce = gs->gop_local_reduce;
890:   while ((map = *reduce++)) {
891:     /* wall */
892:     if (*num == 2) {
893:       num++;
894:       vals[map[0]] += vals[map[1]];
895:     } else if (*num == 3) { /* corner shared by three elements */
896:       num++;
897:       vals[map[0]] += (vals[map[1]] + vals[map[2]]);
898:     } else if (*num == 4) { /* corner shared by four elements */
899:       num++;
900:       vals[map[0]] += (vals[map[1]] + vals[map[2]] + vals[map[3]]);
901:     } else { /* general case ... odd geoms ... 3D*/
902:       num++;
903:       base = vals + *map++;
904:       while (*map >= 0) *base += *(vals + *map++);
905:     }
906:   }
907:   PetscFunctionReturn(PETSC_SUCCESS);
908: }

910: /******************************************************************************/
911: PetscErrorCode PCTFS_gs_free(PCTFS_gs_id *gs)
912: {
913:   PetscInt i;

915:   PetscFunctionBegin;
916:   PetscCallMPI(MPI_Comm_free(&gs->PCTFS_gs_comm));
917:   if (gs->nghs) free((void *)gs->nghs);
918:   if (gs->pw_nghs) free((void *)gs->pw_nghs);

920:   /* tree */
921:   if (gs->max_left_over) {
922:     if (gs->tree_elms) free((void *)gs->tree_elms);
923:     if (gs->tree_buf) free((void *)gs->tree_buf);
924:     if (gs->tree_work) free((void *)gs->tree_work);
925:     if (gs->tree_map_in) free((void *)gs->tree_map_in);
926:     if (gs->tree_map_out) free((void *)gs->tree_map_out);
927:   }

929:   /* pairwise info */
930:   if (gs->num_pairs) {
931:     /* should be NULL already */
932:     if (gs->ngh_buf) free((void *)gs->ngh_buf);
933:     if (gs->elms) free((void *)gs->elms);
934:     if (gs->local_elms) free((void *)gs->local_elms);
935:     if (gs->companion) free((void *)gs->companion);

937:     /* only set if pairwise */
938:     if (gs->vals) free((void *)gs->vals);
939:     if (gs->in) free((void *)gs->in);
940:     if (gs->out) free((void *)gs->out);
941:     if (gs->msg_ids_in) free((void *)gs->msg_ids_in);
942:     if (gs->msg_ids_out) free((void *)gs->msg_ids_out);
943:     if (gs->pw_vals) free((void *)gs->pw_vals);
944:     if (gs->pw_elm_list) free((void *)gs->pw_elm_list);
945:     if (gs->node_list) {
946:       for (i = 0; i < gs->num_pairs; i++) {
947:         if (gs->node_list[i]) free((void *)gs->node_list[i]);
948:       }
949:       free((void *)gs->node_list);
950:     }
951:     if (gs->msg_sizes) free((void *)gs->msg_sizes);
952:     if (gs->pair_list) free((void *)gs->pair_list);
953:   }

955:   /* local info */
956:   if (gs->num_local_total >= 0) {
957:     for (i = 0; i < gs->num_local_total + 1; i++) {
958:       if (gs->num_gop_local_reduce[i]) free((void *)gs->gop_local_reduce[i]);
959:     }
960:   }

962:   /* if intersection tree/pairwise and local isn't empty */
963:   if (gs->gop_local_reduce) free((void *)gs->gop_local_reduce);
964:   if (gs->num_gop_local_reduce) free((void *)gs->num_gop_local_reduce);

966:   free((void *)gs);
967:   PetscFunctionReturn(PETSC_SUCCESS);
968: }

970: /******************************************************************************/
971: PetscErrorCode PCTFS_gs_gop_vec(PCTFS_gs_id *gs, PetscScalar *vals, const char *op, PetscInt step)
972: {
973:   PetscFunctionBegin;
974:   switch (*op) {
975:   case '+':
976:     PetscCall(PCTFS_gs_gop_vec_plus(gs, vals, step));
977:     break;
978:   default:
979:     PetscCall(PetscInfo(0, "PCTFS_gs_gop_vec() :: %c is not a valid op\n", op[0]));
980:     PetscCall(PetscInfo(0, "PCTFS_gs_gop_vec() :: default :: plus\n"));
981:     PetscCall(PCTFS_gs_gop_vec_plus(gs, vals, step));
982:     break;
983:   }
984:   PetscFunctionReturn(PETSC_SUCCESS);
985: }

987: /******************************************************************************/
988: static PetscErrorCode PCTFS_gs_gop_vec_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step)
989: {
990:   PetscFunctionBegin;
991:   PetscCheck(gs, PETSC_COMM_SELF, PETSC_ERR_PLIB, "PCTFS_gs_gop_vec() passed NULL gs handle!!!");

993:   /* local only operations!!! */
994:   if (gs->num_local) PetscCall(PCTFS_gs_gop_vec_local_plus(gs, vals, step));

996:   /* if intersection tree/pairwise and local isn't empty */
997:   if (gs->num_local_gop) {
998:     PetscCall(PCTFS_gs_gop_vec_local_in_plus(gs, vals, step));

1000:     /* pairwise */
1001:     if (gs->num_pairs) PetscCall(PCTFS_gs_gop_vec_pairwise_plus(gs, vals, step));

1003:     /* tree */
1004:     else if (gs->max_left_over) PetscCall(PCTFS_gs_gop_vec_tree_plus(gs, vals, step));

1006:     PetscCall(PCTFS_gs_gop_vec_local_out(gs, vals, step));
1007:   } else { /* if intersection tree/pairwise and local is empty */
1008:     /* pairwise */
1009:     if (gs->num_pairs) PetscCall(PCTFS_gs_gop_vec_pairwise_plus(gs, vals, step));

1011:     /* tree */
1012:     else if (gs->max_left_over) PetscCall(PCTFS_gs_gop_vec_tree_plus(gs, vals, step));
1013:   }
1014:   PetscFunctionReturn(PETSC_SUCCESS);
1015: }

1017: /******************************************************************************/
/* Vector analogue of PCTFS_gs_gop_local_plus(): each node owns a contiguous
   vector of length step, so groups are combined with PCTFS_rvec_add() and
   broadcast back with PCTFS_rvec_copy().  Group maps come from
   gs->local_reduce with lengths in gs->num_local_reduce. */
static PetscErrorCode PCTFS_gs_gop_vec_local_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step)
{
  PetscInt    *num, *map, **reduce;
  PetscScalar *base;

  PetscFunctionBegin;
  num    = gs->num_local_reduce;
  reduce = gs->local_reduce;
  while ((map = *reduce)) {
    /* base points at the group's first vector; all sums accumulate here */
    base = vals + map[0] * step;

    /* wall */
    if (*num == 2) {
      num++;
      reduce++;
      PetscCall(PCTFS_rvec_add(base, vals + map[1] * step, step));
      PetscCall(PCTFS_rvec_copy(vals + map[1] * step, base, step));
    } else if (*num == 3) { /* corner shared by three elements */
      num++;
      reduce++;
      PetscCall(PCTFS_rvec_add(base, vals + map[1] * step, step));
      PetscCall(PCTFS_rvec_add(base, vals + map[2] * step, step));
      PetscCall(PCTFS_rvec_copy(vals + map[2] * step, base, step));
      PetscCall(PCTFS_rvec_copy(vals + map[1] * step, base, step));
    } else if (*num == 4) { /* corner shared by four elements */
      num++;
      reduce++;
      PetscCall(PCTFS_rvec_add(base, vals + map[1] * step, step));
      PetscCall(PCTFS_rvec_add(base, vals + map[2] * step, step));
      PetscCall(PCTFS_rvec_add(base, vals + map[3] * step, step));
      PetscCall(PCTFS_rvec_copy(vals + map[3] * step, base, step));
      PetscCall(PCTFS_rvec_copy(vals + map[2] * step, base, step));
      PetscCall(PCTFS_rvec_copy(vals + map[1] * step, base, step));
    } else { /* general case ... odd geoms ... 3D */
      num++;
      /* *++map skips map[0] (that is base itself) in both passes; reduce is
         re-read for the broadcast pass and only advanced at the end */
      while (*++map >= 0) PetscCall(PCTFS_rvec_add(base, vals + *map * step, step));

      map = *reduce;
      while (*++map >= 0) PetscCall(PCTFS_rvec_copy(vals + *map * step, base, step));

      reduce++;
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

1064: /******************************************************************************/
1065: static PetscErrorCode PCTFS_gs_gop_vec_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step)
1066: {
1067:   PetscInt    *num, *map, **reduce;
1068:   PetscScalar *base;

1070:   PetscFunctionBegin;
1071:   num    = gs->num_gop_local_reduce;
1072:   reduce = gs->gop_local_reduce;
1073:   while ((map = *reduce++)) {
1074:     base = vals + map[0] * step;

1076:     /* wall */
1077:     if (*num == 2) {
1078:       num++;
1079:       PetscCall(PCTFS_rvec_add(base, vals + map[1] * step, step));
1080:     } else if (*num == 3) { /* corner shared by three elements */
1081:       num++;
1082:       PetscCall(PCTFS_rvec_add(base, vals + map[1] * step, step));
1083:       PetscCall(PCTFS_rvec_add(base, vals + map[2] * step, step));
1084:     } else if (*num == 4) { /* corner shared by four elements */
1085:       num++;
1086:       PetscCall(PCTFS_rvec_add(base, vals + map[1] * step, step));
1087:       PetscCall(PCTFS_rvec_add(base, vals + map[2] * step, step));
1088:       PetscCall(PCTFS_rvec_add(base, vals + map[3] * step, step));
1089:     } else { /* general case ... odd geoms ... 3D*/
1090:       num++;
1091:       while (*++map >= 0) PetscCall(PCTFS_rvec_add(base, vals + *map * step, step));
1092:     }
1093:   }
1094:   PetscFunctionReturn(PETSC_SUCCESS);
1095: }

1097: /******************************************************************************/
1098: static PetscErrorCode PCTFS_gs_gop_vec_local_out(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step)
1099: {
1100:   PetscInt    *num, *map, **reduce;
1101:   PetscScalar *base;

1103:   PetscFunctionBegin;
1104:   num    = gs->num_gop_local_reduce;
1105:   reduce = gs->gop_local_reduce;
1106:   while ((map = *reduce++)) {
1107:     base = vals + map[0] * step;

1109:     /* wall */
1110:     if (*num == 2) {
1111:       num++;
1112:       PetscCall(PCTFS_rvec_copy(vals + map[1] * step, base, step));
1113:     } else if (*num == 3) { /* corner shared by three elements */
1114:       num++;
1115:       PetscCall(PCTFS_rvec_copy(vals + map[1] * step, base, step));
1116:       PetscCall(PCTFS_rvec_copy(vals + map[2] * step, base, step));
1117:     } else if (*num == 4) { /* corner shared by four elements */
1118:       num++;
1119:       PetscCall(PCTFS_rvec_copy(vals + map[1] * step, base, step));
1120:       PetscCall(PCTFS_rvec_copy(vals + map[2] * step, base, step));
1121:       PetscCall(PCTFS_rvec_copy(vals + map[3] * step, base, step));
1122:     } else { /* general case ... odd geoms ... 3D*/
1123:       num++;
1124:       while (*++map >= 0) PetscCall(PCTFS_rvec_copy(vals + *map * step, base, step));
1125:     }
1126:   }
1127:   PetscFunctionReturn(PETSC_SUCCESS);
1128: }

1130: /******************************************************************************/
1131: static PetscErrorCode PCTFS_gs_gop_vec_pairwise_plus(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt step)
1132: {
1133:   PetscScalar *dptr1, *dptr2, *dptr3, *in1, *in2;
1134:   PetscInt    *iptr, *msg_list, *msg_size, **msg_nodes;
1135:   PetscInt    *pw, *list, *size, **nodes;
1136:   MPI_Request *msg_ids_in, *msg_ids_out, *ids_in, *ids_out;
1137:   MPI_Status   status;
1138:   PetscBLASInt i1 = 1, dstep;

1140:   PetscFunctionBegin;
1141:   /* strip and load s */
1142:   msg_list = list = gs->pair_list;
1143:   msg_size = size = gs->msg_sizes;
1144:   msg_nodes = nodes = gs->node_list;
1145:   iptr = pw = gs->pw_elm_list;
1146:   dptr1 = dptr3 = gs->pw_vals;
1147:   msg_ids_in = ids_in = gs->msg_ids_in;
1148:   msg_ids_out = ids_out = gs->msg_ids_out;
1149:   dptr2                 = gs->out;
1150:   in1 = in2 = gs->in;

1152:   /* post the receives */
1153:   /*  msg_nodes=nodes; */
1154:   do {
1155:     /* Should MPI_ANY_SOURCE be replaced by *list ? In that case do the
1156:         second one *list and do list++ afterwards */
1157:     PetscCallMPI(MPIU_Irecv(in1, *size * step, MPIU_SCALAR, MPI_ANY_SOURCE, MSGTAG1 + *list, gs->PCTFS_gs_comm, msg_ids_in));
1158:     list++;
1159:     msg_ids_in++;
1160:     in1 += *size++ * step;
1161:   } while (*++msg_nodes);
1162:   msg_nodes = nodes;

1164:   /* load gs values into in out gs buffers */
1165:   while (*iptr >= 0) {
1166:     PetscCall(PCTFS_rvec_copy(dptr3, in_vals + *iptr * step, step));
1167:     dptr3 += step;
1168:     iptr++;
1169:   }

1171:   /* load out buffers and post the sends */
1172:   while ((iptr = *msg_nodes++)) {
1173:     dptr3 = dptr2;
1174:     while (*iptr >= 0) {
1175:       PetscCall(PCTFS_rvec_copy(dptr2, dptr1 + *iptr * step, step));
1176:       dptr2 += step;
1177:       iptr++;
1178:     }
1179:     PetscCallMPI(MPIU_Isend(dptr3, *msg_size * step, MPIU_SCALAR, *msg_list, MSGTAG1 + PCTFS_my_id, gs->PCTFS_gs_comm, msg_ids_out));
1180:     msg_size++;
1181:     msg_list++;
1182:     msg_ids_out++;
1183:   }

1185:   /* tree */
1186:   if (gs->max_left_over) PetscCall(PCTFS_gs_gop_vec_tree_plus(gs, in_vals, step));

1188:   /* process the received data */
1189:   msg_nodes = nodes;
1190:   while ((iptr = *nodes++)) {
1191:     PetscScalar d1 = 1.0;

1193:     /* Should I check the return value of MPI_Wait() or status? */
1194:     /* Can this loop be replaced by a call to MPI_Waitall()? */
1195:     PetscCallMPI(MPI_Wait(ids_in, &status));
1196:     ids_in++;
1197:     while (*iptr >= 0) {
1198:       PetscCall(PetscBLASIntCast(step, &dstep));
1199:       PetscCallBLAS("BLASaxpy", BLASaxpy_(&dstep, &d1, in2, &i1, dptr1 + *iptr * step, &i1));
1200:       in2 += step;
1201:       iptr++;
1202:     }
1203:   }

1205:   /* replace vals */
1206:   while (*pw >= 0) {
1207:     PetscCall(PCTFS_rvec_copy(in_vals + *pw * step, dptr1, step));
1208:     dptr1 += step;
1209:     pw++;
1210:   }

1212:   /* clear isend message handles */
1213:   /* This changed for clarity though it could be the same */

1215:   /* Should I check the return value of MPI_Wait() or status? */
1216:   /* Can this loop be replaced by a call to MPI_Waitall()? */
1217:   while (*msg_nodes++) {
1218:     PetscCallMPI(MPI_Wait(ids_out, &status));
1219:     ids_out++;
1220:   }
1221:   PetscFunctionReturn(PETSC_SUCCESS);
1222: }

1224: /******************************************************************************/
1225: static PetscErrorCode PCTFS_gs_gop_vec_tree_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step)
1226: {
1227:   PetscInt     size, *in, *out;
1228:   PetscScalar *buf, *work;
1229:   PetscInt     op[] = {GL_ADD, 0};
1230:   PetscBLASInt i1   = 1;
1231:   PetscBLASInt dstep;

1233:   PetscFunctionBegin;
1234:   /* copy over to local variables */
1235:   in   = gs->tree_map_in;
1236:   out  = gs->tree_map_out;
1237:   buf  = gs->tree_buf;
1238:   work = gs->tree_work;
1239:   size = gs->tree_nel * step;

1241:   /* zero out collection buffer */
1242:   PetscCall(PCTFS_rvec_zero(buf, size));

1244:   /* copy over my contributions */
1245:   while (*in >= 0) {
1246:     PetscCall(PetscBLASIntCast(step, &dstep));
1247:     PetscCallBLAS("BLAScopy", BLAScopy_(&dstep, vals + *in++ * step, &i1, buf + *out++ * step, &i1));
1248:   }

1250:   /* perform fan in/out on full buffer */
1251:   /* must change PCTFS_grop to handle the blas */
1252:   PetscCall(PCTFS_grop(buf, work, size, op));

1254:   /* reset */
1255:   in  = gs->tree_map_in;
1256:   out = gs->tree_map_out;

1258:   /* get the portion of the results I need */
1259:   while (*in >= 0) {
1260:     PetscCall(PetscBLASIntCast(step, &dstep));
1261:     PetscCallBLAS("BLAScopy", BLAScopy_(&dstep, buf + *out++ * step, &i1, vals + *in++ * step, &i1));
1262:   }
1263:   PetscFunctionReturn(PETSC_SUCCESS);
1264: }

1266: /******************************************************************************/
1267: PetscErrorCode PCTFS_gs_gop_hc(PCTFS_gs_id *gs, PetscScalar *vals, const char *op, PetscInt dim)
1268: {
1269:   PetscFunctionBegin;
1270:   switch (*op) {
1271:   case '+':
1272:     PetscCall(PCTFS_gs_gop_plus_hc(gs, vals, dim));
1273:     break;
1274:   default:
1275:     PetscCall(PetscInfo(0, "PCTFS_gs_gop_hc() :: %c is not a valid op\n", op[0]));
1276:     PetscCall(PetscInfo(0, "PCTFS_gs_gop_hc() :: default :: plus\n"));
1277:     PetscCall(PCTFS_gs_gop_plus_hc(gs, vals, dim));
1278:     break;
1279:   }
1280:   PetscFunctionReturn(PETSC_SUCCESS);
1281: }

1283: /******************************************************************************/
1284: static PetscErrorCode PCTFS_gs_gop_plus_hc(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt dim)
1285: {
1286:   PetscFunctionBegin;
1287:   /* if there's nothing to do return */
1288:   if (dim <= 0) PetscFunctionReturn(PETSC_SUCCESS);

1290:   /* can't do more dimensions then exist */
1291:   dim = PetscMin(dim, PCTFS_i_log2_num_nodes);

1293:   /* local only operations!!! */
1294:   if (gs->num_local) PetscCall(PCTFS_gs_gop_local_plus(gs, vals));

1296:   /* if intersection tree/pairwise and local isn't empty */
1297:   if (gs->num_local_gop) {
1298:     PetscCall(PCTFS_gs_gop_local_in_plus(gs, vals));

1300:     /* pairwise will do tree inside ... */
1301:     if (gs->num_pairs) PetscCall(PCTFS_gs_gop_pairwise_plus_hc(gs, vals, dim)); /* tree only */
1302:     else if (gs->max_left_over) PetscCall(PCTFS_gs_gop_tree_plus_hc(gs, vals, dim));

1304:     PetscCall(PCTFS_gs_gop_local_out(gs, vals));
1305:   } else { /* if intersection tree/pairwise and local is empty */
1306:     /* pairwise will do tree inside */
1307:     if (gs->num_pairs) PetscCall(PCTFS_gs_gop_pairwise_plus_hc(gs, vals, dim)); /* tree */
1308:     else if (gs->max_left_over) PetscCall(PCTFS_gs_gop_tree_plus_hc(gs, vals, dim));
1309:   }
1310:   PetscFunctionReturn(PETSC_SUCCESS);
1311: }

1313: /******************************************************************************/
/* Pairwise "+" exchange restricted to a dim-dimensional hypercube of ranks:
   identical in structure to the full pairwise exchange, but every
   send/receive/wait is skipped for neighbors that lie outside this rank's
   subcube.  A neighbor is inside the subcube when its id agrees with
   PCTFS_my_id on all bits above the low dim bits (tested via the mask
   below).  Skipped neighbors still advance the list/size walkers so the
   parallel arrays stay in step. */
static PetscErrorCode PCTFS_gs_gop_pairwise_plus_hc(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt dim)
{
  PetscScalar *dptr1, *dptr2, *dptr3, *in1, *in2;
  PetscInt    *iptr, *msg_list, *msg_size, **msg_nodes;
  PetscInt    *pw, *list, *size, **nodes;
  MPI_Request *msg_ids_in, *msg_ids_out, *ids_in, *ids_out;
  MPI_Status   status;
  PetscInt     i, mask = 1;

  PetscFunctionBegin;
  /* build mask = 2^dim - 1 (the low dim bits set) */
  for (i = 1; i < dim; i++) {
    mask <<= 1;
    mask++;
  }

  /* strip and load s */
  msg_list = list = gs->pair_list;
  msg_size = size = gs->msg_sizes;
  msg_nodes = nodes = gs->node_list;
  iptr = pw = gs->pw_elm_list;
  dptr1 = dptr3 = gs->pw_vals;
  msg_ids_in = ids_in = gs->msg_ids_in;
  msg_ids_out = ids_out = gs->msg_ids_out;
  dptr2                 = gs->out;
  in1 = in2 = gs->in;

  /* post the receives */
  /*  msg_nodes=nodes; */
  do {
    /* Should MPI_ANY_SOURCE be replaced by *list ? In that case do the
        second one *list and do list++ afterwards */
    if ((PCTFS_my_id | mask) == (*list | mask)) {
      PetscCallMPI(MPIU_Irecv(in1, *size, MPIU_SCALAR, MPI_ANY_SOURCE, MSGTAG1 + *list, gs->PCTFS_gs_comm, msg_ids_in));
      list++;
      msg_ids_in++;
      in1 += *size++;
    } else {
      /* neighbor outside the subcube: skip it but keep walkers aligned */
      list++;
      size++;
    }
  } while (*++msg_nodes);

  /* load gs values into in out gs buffers */
  while (*iptr >= 0) *dptr3++ = *(in_vals + *iptr++);

  /* load out buffers and post the sends */
  msg_nodes = nodes;
  list      = msg_list;
  while ((iptr = *msg_nodes++)) {
    if ((PCTFS_my_id | mask) == (*list | mask)) {
      dptr3 = dptr2;
      while (*iptr >= 0) *dptr2++ = *(dptr1 + *iptr++);
      /* CHECK PERSISTENT COMMS MODE FOR ALL THIS STUFF */
      /* is msg_ids_out++ correct? */
      PetscCallMPI(MPIU_Isend(dptr3, *msg_size, MPIU_SCALAR, *list, MSGTAG1 + PCTFS_my_id, gs->PCTFS_gs_comm, msg_ids_out));
      msg_size++;
      list++;
      msg_ids_out++;
    } else {
      list++;
      msg_size++;
    }
  }

  /* do the tree while we're waiting */
  if (gs->max_left_over) PetscCall(PCTFS_gs_gop_tree_plus_hc(gs, in_vals, dim));

  /* process the received data */
  msg_nodes = nodes;
  list      = msg_list;
  while ((iptr = *nodes++)) {
    if ((PCTFS_my_id | mask) == (*list | mask)) {
      /* Should I check the return value of MPI_Wait() or status? */
      /* Can this loop be replaced by a call to MPI_Waitall()? */
      PetscCallMPI(MPI_Wait(ids_in, &status));
      ids_in++;
      while (*iptr >= 0) *(dptr1 + *iptr++) += *in2++;
    }
    list++;
  }

  /* replace vals */
  while (*pw >= 0) *(in_vals + *pw++) = *dptr1++;

  /* clear isend message handles */
  /* This changed for clarity though it could be the same */
  while (*msg_nodes++) {
    if ((PCTFS_my_id | mask) == (*msg_list | mask)) {
      /* Should I check the return value of MPI_Wait() or status? */
      /* Can this loop be replaced by a call to MPI_Waitall()? */
      PetscCallMPI(MPI_Wait(ids_out, &status));
      ids_out++;
    }
    msg_list++;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

1412: /******************************************************************************/
1413: static PetscErrorCode PCTFS_gs_gop_tree_plus_hc(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt dim)
1414: {
1415:   PetscInt     size;
1416:   PetscInt    *in, *out;
1417:   PetscScalar *buf, *work;
1418:   PetscInt     op[] = {GL_ADD, 0};

1420:   PetscFunctionBegin;
1421:   in   = gs->tree_map_in;
1422:   out  = gs->tree_map_out;
1423:   buf  = gs->tree_buf;
1424:   work = gs->tree_work;
1425:   size = gs->tree_nel;

1427:   PetscCall(PCTFS_rvec_zero(buf, size));

1429:   while (*in >= 0) *(buf + *out++) = *(vals + *in++);

1431:   in  = gs->tree_map_in;
1432:   out = gs->tree_map_out;

1434:   PetscCall(PCTFS_grop_hc(buf, work, size, op, dim));

1436:   while (*in >= 0) *(vals + *in++) = *(buf + *out++);
1437:   PetscFunctionReturn(PETSC_SUCCESS);
1438: }