Actual source code: gs.c
1: /*
3: Author: Henry M. Tufo III
5: e-mail: hmt@cs.brown.edu
7: snail-mail:
8: Division of Applied Mathematics
9: Brown University
10: Providence, RI 02912
12: Last Modification:
13: 6.21.97
15: File Description:
16: -----------------
18: */
20: #include <../src/ksp/pc/impls/tfs/tfs.h>
22: /* default length of number of items via tree - doubles if exceeded */
23: #define TREE_BUF_SZ 2048
24: #define GS_VEC_SZ 1
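/*
   Overview of the handle defined below: a PCTFS_gs_id records three ways a
   shared degree of freedom may be combined --
     - "local" lists (local_reduce/num_local_reduce) for ids repeated within
       this processor's own element list,
     - "pairwise" data (pair_list, node_list, message buffers) for ids shared
       with at most level other processors and exchanged point-to-point, and
     - "tree" data (tree_map_in/out, tree_buf) for everything left over,
       handled by a global fan-in/fan-out.
*/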
26: /*
27: Type: struct gather_scatter_id
28: */
29: typedef struct gather_scatter_id {
30: PetscInt id;
31: PetscInt nel_min;
32: PetscInt nel_max;
33: PetscInt nel_sum;
34: PetscInt negl;
35: PetscInt gl_max;
36: PetscInt gl_min;
37: PetscInt repeats;
38: PetscInt ordered;
39: PetscInt positive;
40: PetscScalar *vals;
42: /* bit mask info */
43: PetscInt *my_proc_mask;
44: PetscInt mask_sz;
45: PetscInt *ngh_buf;
46: PetscInt ngh_buf_sz;
47: PetscInt *nghs;
48: PetscInt num_nghs;
49: PetscInt max_nghs;
50: PetscInt *pw_nghs;
51: PetscInt num_pw_nghs;
52: PetscInt *tree_nghs;
53: PetscInt num_tree_nghs;
55: PetscInt num_loads;
57: /* repeats == true -> local info */
58: PetscInt nel; /* number of unique elements */
59: PetscInt *elms; /* of size nel */
60: PetscInt nel_total;
61: PetscInt *local_elms; /* of size nel_total */
62: PetscInt *companion; /* of size nel_total */
64: /* local info */
65: PetscInt num_local_total;
66: PetscInt local_strength;
67: PetscInt num_local;
68: PetscInt *num_local_reduce;
69: PetscInt **local_reduce;
70: PetscInt num_local_gop;
71: PetscInt *num_gop_local_reduce;
72: PetscInt **gop_local_reduce;
74: /* pairwise info */
75: PetscInt level;
76: PetscInt num_pairs;
77: PetscInt max_pairs;
78: PetscInt loc_node_pairs;
79: PetscInt max_node_pairs;
80: PetscInt min_node_pairs;
81: PetscInt avg_node_pairs;
82: PetscInt *pair_list;
83: PetscInt *msg_sizes;
84: PetscInt **node_list;
85: PetscInt len_pw_list;
86: PetscInt *pw_elm_list;
87: PetscScalar *pw_vals;
89: MPI_Request *msg_ids_in;
90: MPI_Request *msg_ids_out;
92: PetscScalar *out;
93: PetscScalar *in;
94: PetscInt msg_total;
96: /* tree - crystal accumulator info */
97: PetscInt max_left_over;
98: PetscInt *pre;
99: PetscInt *in_num;
100: PetscInt *out_num;
101: PetscInt **in_list;
102: PetscInt **out_list;
104: /* new tree work*/
105: PetscInt tree_nel;
106: PetscInt *tree_elms;
107: PetscScalar *tree_buf;
108: PetscScalar *tree_work;
110: PetscInt tree_map_sz;
111: PetscInt *tree_map_in;
112: PetscInt *tree_map_out;
114: /* current memory status */
115: PetscInt gl_bss_min;
116: PetscInt gl_perm_min;
118: /* max segment size for PCTFS_gs_gop_vec() */
119: PetscInt vec_sz;
121: /* hack to make paul happy */
122: MPI_Comm PCTFS_gs_comm;
124: } PCTFS_gs_id;
126: static PCTFS_gs_id *gsi_check_args(PetscInt *elms, PetscInt nel, PetscInt level);
127: static PetscErrorCode gsi_via_bit_mask(PCTFS_gs_id *gs);
128: static PetscErrorCode get_ngh_buf(PCTFS_gs_id *gs);
129: static PetscErrorCode set_pairwise(PCTFS_gs_id *gs);
130: static PCTFS_gs_id *gsi_new(void);
131: static PetscErrorCode set_tree(PCTFS_gs_id *gs);
133: /* same for all but vector flavor */
134: static PetscErrorCode PCTFS_gs_gop_local_out(PCTFS_gs_id *gs, PetscScalar *vals);
135: /* vector flavor */
136: static PetscErrorCode PCTFS_gs_gop_vec_local_out(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);
138: static PetscErrorCode PCTFS_gs_gop_vec_plus(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt step);
139: static PetscErrorCode PCTFS_gs_gop_vec_pairwise_plus(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt step);
140: static PetscErrorCode PCTFS_gs_gop_vec_local_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);
141: static PetscErrorCode PCTFS_gs_gop_vec_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);
142: static PetscErrorCode PCTFS_gs_gop_vec_tree_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);
144: static PetscErrorCode PCTFS_gs_gop_local_plus(PCTFS_gs_id *gs, PetscScalar *vals);
145: static PetscErrorCode PCTFS_gs_gop_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals);
147: static PetscErrorCode PCTFS_gs_gop_plus_hc(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt dim);
148: static PetscErrorCode PCTFS_gs_gop_pairwise_plus_hc(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt dim);
149: static PetscErrorCode PCTFS_gs_gop_tree_plus_hc(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt dim);
151: /* global vars */
152: /* from comm.c module */
154: static PetscInt num_gs_ids = 0;
156: /* should make this dynamic ... later */
157: static PetscInt msg_buf = MAX_MSG_BUF;
158: static PetscInt vec_sz = GS_VEC_SZ;
159: static PetscInt *tree_buf = NULL;
160: static PetscInt tree_buf_sz = 0;
161: static PetscInt ntree = 0;
163: /***************************************************************************/
164: PetscErrorCode PCTFS_gs_init_vec_sz(PetscInt size)
165: {
166: PetscFunctionBegin;
167: vec_sz = size;
168: PetscFunctionReturn(PETSC_SUCCESS);
169: }
171: /******************************************************************************/
172: PetscErrorCode PCTFS_gs_init_msg_buf_sz(PetscInt buf_size)
173: {
174: PetscFunctionBegin;
175: msg_buf = buf_size;
176: PetscFunctionReturn(PETSC_SUCCESS);
177: }
179: /******************************************************************************/
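/*
   Minimal usage sketch (illustrative only; glo_num, n, step and vals are
   hypothetical caller data, not names from this file):

     PCTFS_gs_id *h = PCTFS_gs_init(glo_num, n, level);   zeros in glo_num are skipped
     PCTFS_gs_gop_vec(h, vals, "+", step);                sum-combine shared entries
     PCTFS_gs_free(h);
*/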
180: PCTFS_gs_id *PCTFS_gs_init(PetscInt *elms, PetscInt nel, PetscInt level)
181: {
182: PCTFS_gs_id *gs;
183: MPI_Group PCTFS_gs_group;
184: MPI_Comm PCTFS_gs_comm;
186: /* ensure that communication package has been initialized */
187: PetscCallAbort(PETSC_COMM_SELF, PCTFS_comm_init());
189: /* determines if we have enough dynamic/semi-static memory */
190: /* checks input, allocs and sets gs_id template */
191: gs = gsi_check_args(elms, nel, level);
193: /* only bit mask version up and working for the moment */
194: /* LATER :: get int list version working for sparse pblms */
195: PetscCallAbort(PETSC_COMM_WORLD, gsi_via_bit_mask(gs));
197: PetscCallAbort(PETSC_COMM_WORLD, MPI_Comm_group(MPI_COMM_WORLD, &PCTFS_gs_group) ? PETSC_ERR_MPI : PETSC_SUCCESS);
198: PetscCallAbort(PETSC_COMM_WORLD, MPI_Comm_create(MPI_COMM_WORLD, PCTFS_gs_group, &PCTFS_gs_comm) ? PETSC_ERR_MPI : PETSC_SUCCESS);
199: PetscCallAbort(PETSC_COMM_WORLD, MPI_Group_free(&PCTFS_gs_group) ? PETSC_ERR_MPI : PETSC_SUCCESS);
201: gs->PCTFS_gs_comm = PCTFS_gs_comm;
203: return gs;
204: }
206: /******************************************************************************/
207: static PCTFS_gs_id *gsi_new(void)
208: {
209: PCTFS_gs_id *gs;
210: gs = (PCTFS_gs_id *)malloc(sizeof(PCTFS_gs_id));
211: PetscCallAbort(PETSC_COMM_WORLD, PetscMemzero(gs, sizeof(PCTFS_gs_id)));
212: return gs;
213: }
215: /******************************************************************************/
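/*
   gsi_check_args(): drops zero global ids, copies the survivors and builds the
   companion (inverse) map, sorts the list if it is not already sorted, collapses
   repeated ids into local_reduce lists, and finally exchanges the nel, gl_min,
   gl_max and level statistics across all processors with one PCTFS_giop() call.
*/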
216: static PCTFS_gs_id *gsi_check_args(PetscInt *in_elms, PetscInt nel, PetscInt level)
217: {
218: PetscInt i, j, k, t2;
219: PetscInt *companion, *elms, *unique, *iptr;
220: PetscInt num_local = 0, *num_to_reduce, **local_reduce;
221: PetscInt oprs[] = {NON_UNIFORM, GL_MIN, GL_MAX, GL_ADD, GL_MIN, GL_MAX, GL_MIN, GL_B_AND};
222: PetscInt vals[PETSC_STATIC_ARRAY_LENGTH(oprs) - 1];
223: PetscInt work[PETSC_STATIC_ARRAY_LENGTH(oprs) - 1];
224: PCTFS_gs_id *gs;
226: if (!in_elms) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "elms point to nothing!!!\n");
227: if (nel < 0) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "can't have fewer than 0 elms!!!\n");
229: if (nel == 0) PetscCallAbort(PETSC_COMM_WORLD, PetscInfo(0, "I don't have any elements!!!\n"));
231: /* get space for gs template */
232: gs = gsi_new();
233: gs->id = ++num_gs_ids;
235: /* hmt 6.4.99 */
236: /* caller can set global ids that don't participate to 0 */
237: /* PCTFS_gs_init ignores all zeros in elm list */
238: /* negative global ids are still invalid */
239: for (i = j = 0; i < nel; i++) {
240: if (in_elms[i] != 0) j++;
241: }
243: k = nel;
244: nel = j;
246: /* copy over in_elms list and create inverse map */
247: elms = (PetscInt *)malloc((nel + 1) * sizeof(PetscInt));
248: companion = (PetscInt *)malloc(nel * sizeof(PetscInt));
250: for (i = j = 0; i < k; i++) {
251: if (in_elms[i] != 0) {
252: elms[j] = in_elms[i];
253: companion[j++] = i;
254: }
255: }
257: if (j != nel) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "nel j mismatch!\n");
259: /* pre-pass ... check to see if sorted */
260: elms[nel] = INT_MAX;
261: iptr = elms;
262: unique = elms + 1;
263: j = 0;
264: while (*iptr != INT_MAX) {
265: if (*iptr++ > *unique++) {
266: j = 1;
267: break;
268: }
269: }
271: /* set up inverse map */
272: if (j) {
273: PetscCallAbort(PETSC_COMM_WORLD, PetscInfo(0, "gsi_check_args() :: elm list *not* sorted!\n"));
274: PetscCallAbort(PETSC_COMM_WORLD, PCTFS_SMI_sort((void *)elms, (void *)companion, nel, SORT_INTEGER));
275: } else PetscCallAbort(PETSC_COMM_WORLD, PetscInfo(0, "gsi_check_args() :: elm list sorted!\n"));
276: elms[nel] = INT_MIN;
278: /* first pass */
279: /* determine number of unique elements, check pd */
280: for (i = k = 0; i < nel; i += j) {
281: t2 = elms[i];
282: j = ++i;
284: /* clump 'em for now */
285: while (elms[j] == t2) j++;
287: /* how many together and num local */
288: if (j -= i) {
289: num_local++;
290: k += j;
291: }
292: }
294: /* number of repeated entries and number of unique elements */
295: gs->repeats = k;
296: gs->nel = nel - k;
298: /* number of elements with local repeats */
299: gs->num_local = num_local;
300: num_local += 2;
301: gs->local_reduce = local_reduce = (PetscInt **)malloc(num_local * sizeof(PetscInt *));
302: gs->num_local_reduce = num_to_reduce = (PetscInt *)malloc(num_local * sizeof(PetscInt));
304: unique = (PetscInt *)malloc((gs->nel + 1) * sizeof(PetscInt));
305: gs->elms = unique;
306: gs->nel_total = nel;
307: gs->local_elms = elms;
308: gs->companion = companion;
310: /* compress map as well as keep track of local ops */
311: for (num_local = i = j = 0; i < gs->nel; i++) {
312: k = j;
313: t2 = unique[i] = elms[j];
314: companion[i] = companion[j];
316: while (elms[j] == t2) j++;
318: if ((t2 = (j - k)) > 1) {
319: /* number together */
320: num_to_reduce[num_local] = t2++;
322: iptr = local_reduce[num_local++] = (PetscInt *)malloc(t2 * sizeof(PetscInt));
324: /* to use binary searching don't remap until we check intersection */
325: *iptr++ = i;
327: /* note that we're skipping the first one */
328: while (++k < j) *(iptr++) = companion[k];
329: *iptr = -1;
330: }
331: }
333: /* sentinel for ngh_buf */
334: unique[gs->nel] = INT_MAX;
336: /* for two partition sort hack */
337: num_to_reduce[num_local] = 0;
338: local_reduce[num_local] = NULL;
339: num_to_reduce[++num_local] = 0;
340: local_reduce[num_local] = NULL;
342: /* load 'em up */
343: /* note one extra to hold NON_UNIFORM flag!!! */
344: vals[2] = vals[1] = vals[0] = nel;
345: if (gs->nel > 0) {
346: vals[3] = unique[0];
347: vals[4] = unique[gs->nel - 1];
348: } else {
349: vals[3] = INT_MAX;
350: vals[4] = INT_MIN;
351: }
352: vals[5] = level;
353: vals[6] = num_gs_ids;
355: /* GLOBAL: send 'em out */
356: PetscCallAbort(PETSC_COMM_WORLD, PCTFS_giop(vals, work, PETSC_STATIC_ARRAY_LENGTH(oprs) - 1, oprs));
358: /* must be semi-pos def - only pairwise depends on this */
359: /* LATER - remove this restriction */
360: if (vals[3] < 0) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "gsi_check_args() :: system not semi-pos def \n");
361: if (vals[4] == INT_MAX) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "gsi_check_args() :: system ub too large !\n");
363: gs->nel_min = vals[0];
364: gs->nel_max = vals[1];
365: gs->nel_sum = vals[2];
366: gs->gl_min = vals[3];
367: gs->gl_max = vals[4];
368: gs->negl = vals[4] - vals[3] + 1;
370: if (gs->negl <= 0) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "gsi_check_args() :: system empty or neg :: %" PetscInt_FMT "\n", gs->negl);
372: /* LATER :: add level == -1 -> program selects level */
373: if (vals[5] < 0) vals[5] = 0;
374: else if (vals[5] > PCTFS_num_nodes) vals[5] = PCTFS_num_nodes;
375: gs->level = vals[5];
377: return gs;
378: }
380: /******************************************************************************/
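/*
   gsi_via_bit_mask(): builds the neighbor masks (get_ngh_buf), then the
   pairwise and tree machinery as needed.  Local reduction lists whose head
   element also appears in the pairwise or tree lists are flagged by negating
   their counts, sorted to the front, and exposed through
   gop_local_reduce/num_gop_local_reduce, while the purely local lists remain
   reachable through local_reduce.  The pairwise and tree element lists are
   remapped through the companion map and the setup scratch arrays are freed.
*/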
381: static PetscErrorCode gsi_via_bit_mask(PCTFS_gs_id *gs)
382: {
383: PetscInt i, nel, *elms;
384: PetscInt t1;
385: PetscInt **reduce;
386: PetscInt *map;
388: PetscFunctionBegin;
389: /* totally local removes ... PCTFS_ct_bits == 0 */
390: PetscCall(get_ngh_buf(gs));
392: if (gs->level) PetscCall(set_pairwise(gs));
393: if (gs->max_left_over) PetscCall(set_tree(gs));
395: /* intersection local and pairwise/tree? */
396: gs->num_local_total = gs->num_local;
397: gs->gop_local_reduce = gs->local_reduce;
398: gs->num_gop_local_reduce = gs->num_local_reduce;
400: map = gs->companion;
402: /* is there any local compression */
403: if (!gs->num_local) {
404: gs->local_strength = NONE;
405: gs->num_local_gop = 0;
406: } else {
407: /* ok find intersection */
408: map = gs->companion;
409: reduce = gs->local_reduce;
410: for (i = 0, t1 = 0; i < gs->num_local; i++, reduce++) {
411: if ((PCTFS_ivec_binary_search(**reduce, gs->pw_elm_list, gs->len_pw_list) >= 0) || PCTFS_ivec_binary_search(**reduce, gs->tree_map_in, gs->tree_map_sz) >= 0) {
412: t1++;
413: PetscCheck(gs->num_local_reduce[i] > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "nobody in list?");
414: gs->num_local_reduce[i] *= -1;
415: }
416: **reduce = map[**reduce];
417: }
419: /* intersection is empty */
420: if (!t1) {
421: gs->local_strength = FULL;
422: gs->num_local_gop = 0;
423: } else { /* intersection not empty */
424: gs->local_strength = PARTIAL;
426: PetscCall(PCTFS_SMI_sort((void *)gs->num_local_reduce, (void *)gs->local_reduce, gs->num_local + 1, SORT_INT_PTR));
428: gs->num_local_gop = t1;
429: gs->num_local_total = gs->num_local;
430: gs->num_local -= t1;
431: gs->gop_local_reduce = gs->local_reduce;
432: gs->num_gop_local_reduce = gs->num_local_reduce;
434: for (i = 0; i < t1; i++) {
435: PetscCheck(gs->num_gop_local_reduce[i] < 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "they aren't negative?");
436: gs->num_gop_local_reduce[i] *= -1;
437: gs->local_reduce++;
438: gs->num_local_reduce++;
439: }
440: gs->local_reduce++;
441: gs->num_local_reduce++;
442: }
443: }
445: elms = gs->pw_elm_list;
446: nel = gs->len_pw_list;
447: for (i = 0; i < nel; i++) elms[i] = map[elms[i]];
449: elms = gs->tree_map_in;
450: nel = gs->tree_map_sz;
451: for (i = 0; i < nel; i++) elms[i] = map[elms[i]];
453: /* clean up */
454: free((void *)gs->local_elms);
455: free((void *)gs->companion);
456: free((void *)gs->elms);
457: free((void *)gs->ngh_buf);
458: gs->local_elms = gs->companion = gs->elms = gs->ngh_buf = NULL;
459: PetscFunctionReturn(PETSC_SUCCESS);
460: }
462: /******************************************************************************/
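/*
   place_in_tree(): appends one global id to the module-level tree_buf, growing
   the buffer geometrically -- it starts at TREE_BUF_SZ entries and doubles
   (copy into a buffer twice as large, free the old one) whenever it fills up.
*/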
463: static PetscErrorCode place_in_tree(PetscInt elm)
464: {
465: PetscInt *tp, n;
467: PetscFunctionBegin;
468: if (ntree == tree_buf_sz) {
469: if (tree_buf_sz) {
470: tp = tree_buf;
471: n = tree_buf_sz;
472: tree_buf_sz <<= 1;
473: tree_buf = (PetscInt *)malloc(tree_buf_sz * sizeof(PetscInt));
474: PCTFS_ivec_copy(tree_buf, tp, n);
475: free(tp);
476: } else {
477: tree_buf_sz = TREE_BUF_SZ;
478: tree_buf = (PetscInt *)malloc(tree_buf_sz * sizeof(PetscInt));
479: }
480: }
482: tree_buf[ntree++] = elm;
483: PetscFunctionReturn(PETSC_SUCCESS);
484: }
486: /******************************************************************************/
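/*
   get_ngh_buf(): for each of my global ids, determines which other processors
   share it.  The global id range is swept in num_loads chunks; each chunk is a
   buffer of per-id processor bit masks combined across all processors by a
   global bitwise-or (PCTFS_giop with GL_B_OR).  Ids shared with at most level
   other processors are tagged with TOP_BIT for pairwise handling; the rest are
   queued for the tree via place_in_tree().  The resulting masks also yield the
   neighbor list gs->nghs and the pairwise neighbor mask gs->pw_nghs.
*/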
487: static PetscErrorCode get_ngh_buf(PCTFS_gs_id *gs)
488: {
489: PetscInt i, j, npw = 0, ntree_map = 0;
490: PetscInt p_mask_size, ngh_buf_size, buf_size;
491: PetscInt *p_mask, *sh_proc_mask, *pw_sh_proc_mask;
492: PetscInt *ngh_buf, *buf1, *buf2;
493: PetscInt offset, per_load, num_loads, or_ct, start, end;
494: PetscInt *ptr1, *ptr2, i_start, negl, nel, *elms;
495: PetscInt oper = GL_B_OR;
496: PetscInt *ptr3, *t_mask, level, ct1, ct2;
498: PetscFunctionBegin;
499: /* to make life easier */
500: nel = gs->nel;
501: elms = gs->elms;
502: level = gs->level;
504: /* det #bytes needed for processor bit masks and init w/mask cor. to PCTFS_my_id */
505: p_mask = (PetscInt *)malloc(p_mask_size = PCTFS_len_bit_mask(PCTFS_num_nodes));
506: PetscCall(PCTFS_set_bit_mask(p_mask, p_mask_size, PCTFS_my_id));
508: /* allocate space for masks and info bufs */
509: gs->nghs = sh_proc_mask = (PetscInt *)malloc(p_mask_size);
510: gs->pw_nghs = pw_sh_proc_mask = (PetscInt *)malloc(p_mask_size);
511: gs->ngh_buf_sz = ngh_buf_size = p_mask_size * nel;
512: t_mask = (PetscInt *)malloc(p_mask_size);
513: gs->ngh_buf = ngh_buf = (PetscInt *)malloc(ngh_buf_size);
515: /* comm buffer size ... memory usage bounded by ~2*msg_buf */
516: /* had thought I could exploit rendezvous threshold */
518: /* default is one pass */
519: per_load = negl = gs->negl;
520: gs->num_loads = num_loads = 1;
521: i = p_mask_size * negl;
523: /* possible overflow on buffer size */
524: /* overflow hack */
525: if (i < 0) i = INT_MAX;
527: buf_size = PetscMin(msg_buf, i);
529: /* can we do it? */
530: PetscCheck(p_mask_size <= buf_size, PETSC_COMM_SELF, PETSC_ERR_PLIB, "get_ngh_buf() :: buf<pms :: %" PetscInt_FMT ">%" PetscInt_FMT, p_mask_size, buf_size);
532: /* get PCTFS_giop buf space ... make *only* one malloc */
533: buf1 = (PetscInt *)malloc(buf_size << 1);
535: /* more than one gior exchange needed? */
536: if (buf_size != i) {
537: per_load = buf_size / p_mask_size;
538: buf_size = per_load * p_mask_size;
539: gs->num_loads = num_loads = negl / per_load + (negl % per_load > 0);
540: }
542: /* convert buf sizes from #bytes to #ints - 32-bit only! */
543: p_mask_size /= sizeof(PetscInt);
544: ngh_buf_size /= sizeof(PetscInt);
545: buf_size /= sizeof(PetscInt);
547: /* find PCTFS_giop work space */
548: buf2 = buf1 + buf_size;
550: /* hold #ints needed for processor masks */
551: gs->mask_sz = p_mask_size;
553: /* init buffers */
554: PetscCall(PCTFS_ivec_zero(sh_proc_mask, p_mask_size));
555: PetscCall(PCTFS_ivec_zero(pw_sh_proc_mask, p_mask_size));
556: PetscCall(PCTFS_ivec_zero(ngh_buf, ngh_buf_size));
558: /* HACK reset tree info */
559: tree_buf = NULL;
560: tree_buf_sz = ntree = 0;
562: /* ok do it */
563: for (ptr1 = ngh_buf, ptr2 = elms, end = gs->gl_min, or_ct = i = 0; or_ct < num_loads; or_ct++) {
564: /* identity for bitwise or is 000...000 */
565: PetscCall(PCTFS_ivec_zero(buf1, buf_size));
567: /* load msg buffer */
568: for (start = end, end += per_load, i_start = i; (offset = *ptr2) < end; i++, ptr2++) {
569: offset = (offset - start) * p_mask_size;
570: PCTFS_ivec_copy(buf1 + offset, p_mask, p_mask_size);
571: }
573: /* GLOBAL: pass buffer */
574: PetscCall(PCTFS_giop(buf1, buf2, buf_size, &oper));
576: /* unload buffer into ngh_buf */
577: ptr2 = (elms + i_start);
578: for (ptr3 = buf1, j = start; j < end; ptr3 += p_mask_size, j++) {
579: /* I own it ... may have to pairwise it */
580: if (j == *ptr2) {
581: /* do i share it w/anyone? */
582: ct1 = PCTFS_ct_bits((char *)ptr3, p_mask_size * sizeof(PetscInt));
583: /* guess not */
584: if (ct1 < 2) {
585: ptr2++;
586: ptr1 += p_mask_size;
587: continue;
588: }
590: /* i do ... so keep info and turn off my bit */
591: PCTFS_ivec_copy(ptr1, ptr3, p_mask_size);
592: PetscCall(PCTFS_ivec_xor(ptr1, p_mask, p_mask_size));
593: PetscCall(PCTFS_ivec_or(sh_proc_mask, ptr1, p_mask_size));
595: /* is it to be done pairwise? */
596: if (--ct1 <= level) {
597: npw++;
599: /* turn on high bit to indicate pw need to process */
600: *ptr2++ |= TOP_BIT;
601: PetscCall(PCTFS_ivec_or(pw_sh_proc_mask, ptr1, p_mask_size));
602: ptr1 += p_mask_size;
603: continue;
604: }
606: /* get set for next and note that I have a tree contribution */
607: /* could save exact elm index for tree here -> save a search */
608: ptr2++;
609: ptr1 += p_mask_size;
610: ntree_map++;
611: } else { /* i don't but still might be involved in tree */
613: /* shared by how many? */
614: ct1 = PCTFS_ct_bits((char *)ptr3, p_mask_size * sizeof(PetscInt));
616: /* none! */
617: if (ct1 < 2) continue;
619: /* is it going to be done pairwise? but not by me of course!*/
620: if (--ct1 <= level) continue;
621: }
622: /* LATER we're going to have to process it NOW */
623: /* nope ... tree it */
624: PetscCall(place_in_tree(j));
625: }
626: }
628: free((void *)t_mask);
629: free((void *)buf1);
631: gs->len_pw_list = npw;
632: gs->num_nghs = PCTFS_ct_bits((char *)sh_proc_mask, p_mask_size * sizeof(PetscInt));
634: /* expand from bit mask list to int list and save ngh list */
635: gs->nghs = (PetscInt *)malloc(gs->num_nghs * sizeof(PetscInt));
636: PetscCall(PCTFS_bm_to_proc((char *)sh_proc_mask, p_mask_size * sizeof(PetscInt), gs->nghs));
638: gs->num_pw_nghs = PCTFS_ct_bits((char *)pw_sh_proc_mask, p_mask_size * sizeof(PetscInt));
640: oper = GL_MAX;
641: ct1 = gs->num_nghs;
642: PetscCall(PCTFS_giop(&ct1, &ct2, 1, &oper));
643: gs->max_nghs = ct1;
645: gs->tree_map_sz = ntree_map;
646: gs->max_left_over = ntree;
648: free((void *)p_mask);
649: free((void *)sh_proc_mask);
650: PetscFunctionReturn(PETSC_SUCCESS);
651: }
653: /******************************************************************************/
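/*
   set_pairwise(): turns the pairwise information collected in get_ngh_buf()
   into explicit message plans.  It records which local ids go pairwise
   (pw_elm_list, found by the TOP_BIT tag), expands the pairwise neighbor mask
   into pair_list, and for every neighbor builds the list of participating ids
   (node_list) and its length (msg_sizes).  It also allocates the MPI_Request
   arrays and the packed send/receive buffers, and gathers min/max/avg message
   statistics with PCTFS_giop().
*/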
654: static PetscErrorCode set_pairwise(PCTFS_gs_id *gs)
655: {
656: PetscInt i, j;
657: PetscInt p_mask_size;
658: PetscInt *p_mask, *sh_proc_mask, *tmp_proc_mask;
659: PetscInt *ngh_buf, *buf2;
660: PetscInt offset;
661: PetscInt *msg_list, *msg_size, **msg_nodes, nprs;
662: PetscInt *pairwise_elm_list, len_pair_list = 0;
663: PetscInt *iptr, t1, i_start, nel, *elms;
664: PetscInt ct;
666: PetscFunctionBegin;
667: /* to make life easier */
668: nel = gs->nel;
669: elms = gs->elms;
670: ngh_buf = gs->ngh_buf;
671: sh_proc_mask = gs->pw_nghs;
673: /* need a few temp masks */
674: p_mask_size = PCTFS_len_bit_mask(PCTFS_num_nodes);
675: p_mask = (PetscInt *)malloc(p_mask_size);
676: tmp_proc_mask = (PetscInt *)malloc(p_mask_size);
678: /* set mask to my PCTFS_my_id's bit mask */
679: PetscCall(PCTFS_set_bit_mask(p_mask, p_mask_size, PCTFS_my_id));
681: p_mask_size /= sizeof(PetscInt);
683: len_pair_list = gs->len_pw_list;
684: gs->pw_elm_list = pairwise_elm_list = (PetscInt *)malloc((len_pair_list + 1) * sizeof(PetscInt));
686: /* how many processors (nghs) do we have to exchange with? */
687: nprs = gs->num_pairs = PCTFS_ct_bits((char *)sh_proc_mask, p_mask_size * sizeof(PetscInt));
689: /* allocate space for PCTFS_gs_gop() info */
690: gs->pair_list = msg_list = (PetscInt *)malloc(sizeof(PetscInt) * nprs);
691: gs->msg_sizes = msg_size = (PetscInt *)malloc(sizeof(PetscInt) * nprs);
692: gs->node_list = msg_nodes = (PetscInt **)malloc(sizeof(PetscInt *) * (nprs + 1));
694: /* init msg_size list */
695: PetscCall(PCTFS_ivec_zero(msg_size, nprs));
697: /* expand from bit mask list to int list */
698: PetscCall(PCTFS_bm_to_proc((char *)sh_proc_mask, p_mask_size * sizeof(PetscInt), msg_list));
700: /* keep list of elements being handled pairwise */
701: for (i = j = 0; i < nel; i++) {
702: if (elms[i] & TOP_BIT) {
703: elms[i] ^= TOP_BIT;
704: pairwise_elm_list[j++] = i;
705: }
706: }
707: pairwise_elm_list[j] = -1;
709: gs->msg_ids_out = (MPI_Request *)malloc(sizeof(MPI_Request) * (nprs + 1));
710: gs->msg_ids_out[nprs] = MPI_REQUEST_NULL;
711: gs->msg_ids_in = (MPI_Request *)malloc(sizeof(MPI_Request) * (nprs + 1));
712: gs->msg_ids_in[nprs] = MPI_REQUEST_NULL;
713: gs->pw_vals = (PetscScalar *)malloc(sizeof(PetscScalar) * len_pair_list * vec_sz);
715: /* find who goes to each processor */
716: for (i_start = i = 0; i < nprs; i++) {
717: /* processor i's mask */
718: PetscCall(PCTFS_set_bit_mask(p_mask, p_mask_size * sizeof(PetscInt), msg_list[i]));
720: /* det # going to processor i */
721: for (ct = j = 0; j < len_pair_list; j++) {
722: buf2 = ngh_buf + (pairwise_elm_list[j] * p_mask_size);
723: PetscCall(PCTFS_ivec_and3(tmp_proc_mask, p_mask, buf2, p_mask_size));
724: if (PCTFS_ct_bits((char *)tmp_proc_mask, p_mask_size * sizeof(PetscInt))) ct++;
725: }
726: msg_size[i] = ct;
727: i_start = PetscMax(i_start, ct);
729: /* space to hold nodes in message to neighbor i */
730: msg_nodes[i] = iptr = (PetscInt *)malloc(sizeof(PetscInt) * (ct + 1));
732: for (j = 0; j < len_pair_list; j++) {
733: buf2 = ngh_buf + (pairwise_elm_list[j] * p_mask_size);
734: PetscCall(PCTFS_ivec_and3(tmp_proc_mask, p_mask, buf2, p_mask_size));
735: if (PCTFS_ct_bits((char *)tmp_proc_mask, p_mask_size * sizeof(PetscInt))) *iptr++ = j;
736: }
737: *iptr = -1;
738: }
739: msg_nodes[nprs] = NULL;
741: j = gs->loc_node_pairs = i_start;
742: t1 = GL_MAX;
743: PetscCall(PCTFS_giop(&i_start, &offset, 1, &t1));
744: gs->max_node_pairs = i_start;
746: i_start = j;
747: t1 = GL_MIN;
748: PetscCall(PCTFS_giop(&i_start, &offset, 1, &t1));
749: gs->min_node_pairs = i_start;
751: i_start = j;
752: t1 = GL_ADD;
753: PetscCall(PCTFS_giop(&i_start, &offset, 1, &t1));
754: gs->avg_node_pairs = i_start / PCTFS_num_nodes + 1;
756: i_start = nprs;
757: t1 = GL_MAX;
758: PetscCall(PCTFS_giop(&i_start, &offset, 1, &t1));
759: gs->max_pairs = i_start;
761: /* remap pairwise in tail of gsi_via_bit_mask() */
762: gs->msg_total = PCTFS_ivec_sum(gs->msg_sizes, nprs);
763: gs->out = (PetscScalar *)malloc(sizeof(PetscScalar) * gs->msg_total * vec_sz);
764: gs->in = (PetscScalar *)malloc(sizeof(PetscScalar) * gs->msg_total * vec_sz);
766: /* reset malloc pool */
767: free((void *)p_mask);
768: free((void *)tmp_proc_mask);
769: PetscFunctionReturn(PETSC_SUCCESS);
770: }
772: /* to do pruned tree just save ngh buf copy for each one and decode here!
773: ******************************************************************************/
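/*
   set_tree(): sets up the fan-in/fan-out stage for ids that are not handled
   pairwise.  tree_elms is the list accumulated by place_in_tree(); tree_map_in
   and tree_map_out pair each of my local ids with its slot in the shared tree
   buffer, built by iterating the shorter of the two lists and binary-searching
   the longer.  Both maps are terminated by a -1 sentinel.
*/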
774: static PetscErrorCode set_tree(PCTFS_gs_id *gs)
775: {
776: PetscInt i, j, n, nel;
777: PetscInt *iptr_in, *iptr_out, *tree_elms, *elms;
779: PetscFunctionBegin;
780: /* local work ptrs */
781: elms = gs->elms;
782: nel = gs->nel;
784: /* how many via tree */
785: gs->tree_nel = n = ntree;
786: gs->tree_elms = tree_elms = iptr_in = tree_buf;
787: gs->tree_buf = (PetscScalar *)malloc(sizeof(PetscScalar) * n * vec_sz);
788: gs->tree_work = (PetscScalar *)malloc(sizeof(PetscScalar) * n * vec_sz);
789: j = gs->tree_map_sz;
790: gs->tree_map_in = iptr_in = (PetscInt *)malloc(sizeof(PetscInt) * (j + 1));
791: gs->tree_map_out = iptr_out = (PetscInt *)malloc(sizeof(PetscInt) * (j + 1));
793: /* search the longer of the two lists */
794: /* note ... could save this info in get_ngh_buf and save searches */
795: if (n <= nel) {
796: /* bijective fct w/remap - search elm list */
797: for (i = 0; i < n; i++) {
798: if ((j = PCTFS_ivec_binary_search(*tree_elms++, elms, nel)) >= 0) {
799: *iptr_in++ = j;
800: *iptr_out++ = i;
801: }
802: }
803: } else {
804: for (i = 0; i < nel; i++) {
805: if ((j = PCTFS_ivec_binary_search(*elms++, tree_elms, n)) >= 0) {
806: *iptr_in++ = i;
807: *iptr_out++ = j;
808: }
809: }
810: }
812: /* sentinel */
813: *iptr_in = *iptr_out = -1;
814: PetscFunctionReturn(PETSC_SUCCESS);
815: }
817: /******************************************************************************/
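/*
   The local "gop" kernels below walk the reduce lists built during setup.
   For each list, the *_in_plus routines accumulate every repeated copy into
   the first entry, the *_out routines copy that first entry back over the
   other copies, and the *_plus routines do both in one pass.  Lists of length
   2, 3 and 4 (walls and corners) are unrolled; longer lists fall through to
   the general loop.
*/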
818: static PetscErrorCode PCTFS_gs_gop_local_out(PCTFS_gs_id *gs, PetscScalar *vals)
819: {
820: PetscInt *num, *map, **reduce;
821: PetscScalar tmp;
823: PetscFunctionBegin;
824: num = gs->num_gop_local_reduce;
825: reduce = gs->gop_local_reduce;
826: while ((map = *reduce++)) {
827: /* wall */
828: if (*num == 2) {
829: num++;
830: vals[map[1]] = vals[map[0]];
831: } else if (*num == 3) { /* corner shared by three elements */
832: num++;
833: vals[map[2]] = vals[map[1]] = vals[map[0]];
834: } else if (*num == 4) { /* corner shared by four elements */
835: num++;
836: vals[map[3]] = vals[map[2]] = vals[map[1]] = vals[map[0]];
837: } else { /* general case ... odd geoms ... 3D*/
838: num++;
839: tmp = *(vals + *map++);
840: while (*map >= 0) *(vals + *map++) = tmp;
841: }
842: }
843: PetscFunctionReturn(PETSC_SUCCESS);
844: }
846: /******************************************************************************/
847: static PetscErrorCode PCTFS_gs_gop_local_plus(PCTFS_gs_id *gs, PetscScalar *vals)
848: {
849: PetscInt *num, *map, **reduce;
850: PetscScalar tmp;
852: PetscFunctionBegin;
853: num = gs->num_local_reduce;
854: reduce = gs->local_reduce;
855: while ((map = *reduce)) {
856: /* wall */
857: if (*num == 2) {
858: num++;
859: reduce++;
860: vals[map[1]] = vals[map[0]] += vals[map[1]];
861: } else if (*num == 3) { /* corner shared by three elements */
862: num++;
863: reduce++;
864: vals[map[2]] = vals[map[1]] = vals[map[0]] += (vals[map[1]] + vals[map[2]]);
865: } else if (*num == 4) { /* corner shared by four elements */
866: num++;
867: reduce++;
868: vals[map[1]] = vals[map[2]] = vals[map[3]] = vals[map[0]] += (vals[map[1]] + vals[map[2]] + vals[map[3]]);
869: } else { /* general case ... odd geoms ... 3D*/
870: num++;
871: tmp = 0.0;
872: while (*map >= 0) tmp += *(vals + *map++);
874: map = *reduce++;
875: while (*map >= 0) *(vals + *map++) = tmp;
876: }
877: }
878: PetscFunctionReturn(PETSC_SUCCESS);
879: }
881: /******************************************************************************/
882: static PetscErrorCode PCTFS_gs_gop_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals)
883: {
884: PetscInt *num, *map, **reduce;
885: PetscScalar *base;
887: PetscFunctionBegin;
888: num = gs->num_gop_local_reduce;
889: reduce = gs->gop_local_reduce;
890: while ((map = *reduce++)) {
891: /* wall */
892: if (*num == 2) {
893: num++;
894: vals[map[0]] += vals[map[1]];
895: } else if (*num == 3) { /* corner shared by three elements */
896: num++;
897: vals[map[0]] += (vals[map[1]] + vals[map[2]]);
898: } else if (*num == 4) { /* corner shared by four elements */
899: num++;
900: vals[map[0]] += (vals[map[1]] + vals[map[2]] + vals[map[3]]);
901: } else { /* general case ... odd geoms ... 3D*/
902: num++;
903: base = vals + *map++;
904: while (*map >= 0) *base += *(vals + *map++);
905: }
906: }
907: PetscFunctionReturn(PETSC_SUCCESS);
908: }
910: /******************************************************************************/
911: PetscErrorCode PCTFS_gs_free(PCTFS_gs_id *gs)
912: {
913: PetscInt i;
915: PetscFunctionBegin;
916: PetscCallMPI(MPI_Comm_free(&gs->PCTFS_gs_comm));
917: if (gs->nghs) free((void *)gs->nghs);
918: if (gs->pw_nghs) free((void *)gs->pw_nghs);
920: /* tree */
921: if (gs->max_left_over) {
922: if (gs->tree_elms) free((void *)gs->tree_elms);
923: if (gs->tree_buf) free((void *)gs->tree_buf);
924: if (gs->tree_work) free((void *)gs->tree_work);
925: if (gs->tree_map_in) free((void *)gs->tree_map_in);
926: if (gs->tree_map_out) free((void *)gs->tree_map_out);
927: }
929: /* pairwise info */
930: if (gs->num_pairs) {
931: /* should be NULL already */
932: if (gs->ngh_buf) free((void *)gs->ngh_buf);
933: if (gs->elms) free((void *)gs->elms);
934: if (gs->local_elms) free((void *)gs->local_elms);
935: if (gs->companion) free((void *)gs->companion);
937: /* only set if pairwise */
938: if (gs->vals) free((void *)gs->vals);
939: if (gs->in) free((void *)gs->in);
940: if (gs->out) free((void *)gs->out);
941: if (gs->msg_ids_in) free((void *)gs->msg_ids_in);
942: if (gs->msg_ids_out) free((void *)gs->msg_ids_out);
943: if (gs->pw_vals) free((void *)gs->pw_vals);
944: if (gs->pw_elm_list) free((void *)gs->pw_elm_list);
945: if (gs->node_list) {
946: for (i = 0; i < gs->num_pairs; i++) {
947: if (gs->node_list[i]) free((void *)gs->node_list[i]);
948: }
949: free((void *)gs->node_list);
950: }
951: if (gs->msg_sizes) free((void *)gs->msg_sizes);
952: if (gs->pair_list) free((void *)gs->pair_list);
953: }
955: /* local info */
956: if (gs->num_local_total >= 0) {
957: for (i = 0; i < gs->num_local_total + 1; i++) {
958: if (gs->num_gop_local_reduce[i]) free((void *)gs->gop_local_reduce[i]);
959: }
960: }
962: /* if intersection tree/pairwise and local isn't empty */
963: if (gs->gop_local_reduce) free((void *)gs->gop_local_reduce);
964: if (gs->num_gop_local_reduce) free((void *)gs->num_gop_local_reduce);
966: free((void *)gs);
967: PetscFunctionReturn(PETSC_SUCCESS);
968: }
970: /******************************************************************************/
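/*
   PCTFS_gs_gop_vec(): vector flavor of the gather-scatter.  Each element of the
   local list owns a contiguous block of step scalars in vals (vals[i*step] ..
   vals[i*step + step - 1]).  Only the "+" op is implemented here; any other op
   string logs a PetscInfo message and falls back to plus.
*/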
971: PetscErrorCode PCTFS_gs_gop_vec(PCTFS_gs_id *gs, PetscScalar *vals, const char *op, PetscInt step)
972: {
973: PetscFunctionBegin;
974: switch (*op) {
975: case '+':
976: PetscCall(PCTFS_gs_gop_vec_plus(gs, vals, step));
977: break;
978: default:
979: PetscCall(PetscInfo(0, "PCTFS_gs_gop_vec() :: %c is not a valid op\n", op[0]));
980: PetscCall(PetscInfo(0, "PCTFS_gs_gop_vec() :: default :: plus\n"));
981: PetscCall(PCTFS_gs_gop_vec_plus(gs, vals, step));
982: break;
983: }
984: PetscFunctionReturn(PETSC_SUCCESS);
985: }
987: /******************************************************************************/
988: static PetscErrorCode PCTFS_gs_gop_vec_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step)
989: {
990: PetscFunctionBegin;
991: PetscCheck(gs, PETSC_COMM_SELF, PETSC_ERR_PLIB, "PCTFS_gs_gop_vec() passed NULL gs handle!!!");
993: /* local only operations!!! */
994: if (gs->num_local) PetscCall(PCTFS_gs_gop_vec_local_plus(gs, vals, step));
996: /* if intersection tree/pairwise and local isn't empty */
997: if (gs->num_local_gop) {
998: PetscCall(PCTFS_gs_gop_vec_local_in_plus(gs, vals, step));
1000: /* pairwise */
1001: if (gs->num_pairs) PetscCall(PCTFS_gs_gop_vec_pairwise_plus(gs, vals, step));
1003: /* tree */
1004: else if (gs->max_left_over) PetscCall(PCTFS_gs_gop_vec_tree_plus(gs, vals, step));
1006: PetscCall(PCTFS_gs_gop_vec_local_out(gs, vals, step));
1007: } else { /* if intersection tree/pairwise and local is empty */
1008: /* pairwise */
1009: if (gs->num_pairs) PetscCall(PCTFS_gs_gop_vec_pairwise_plus(gs, vals, step));
1011: /* tree */
1012: else if (gs->max_left_over) PetscCall(PCTFS_gs_gop_vec_tree_plus(gs, vals, step));
1013: }
1014: PetscFunctionReturn(PETSC_SUCCESS);
1015: }
1017: /******************************************************************************/
1018: static PetscErrorCode PCTFS_gs_gop_vec_local_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step)
1019: {
1020: PetscInt *num, *map, **reduce;
1021: PetscScalar *base;
1023: PetscFunctionBegin;
1024: num = gs->num_local_reduce;
1025: reduce = gs->local_reduce;
1026: while ((map = *reduce)) {
1027: base = vals + map[0] * step;
1029: /* wall */
1030: if (*num == 2) {
1031: num++;
1032: reduce++;
1033: PetscCall(PCTFS_rvec_add(base, vals + map[1] * step, step));
1034: PetscCall(PCTFS_rvec_copy(vals + map[1] * step, base, step));
1035: } else if (*num == 3) { /* corner shared by three elements */
1036: num++;
1037: reduce++;
1038: PetscCall(PCTFS_rvec_add(base, vals + map[1] * step, step));
1039: PetscCall(PCTFS_rvec_add(base, vals + map[2] * step, step));
1040: PetscCall(PCTFS_rvec_copy(vals + map[2] * step, base, step));
1041: PetscCall(PCTFS_rvec_copy(vals + map[1] * step, base, step));
1042: } else if (*num == 4) { /* corner shared by four elements */
1043: num++;
1044: reduce++;
1045: PetscCall(PCTFS_rvec_add(base, vals + map[1] * step, step));
1046: PetscCall(PCTFS_rvec_add(base, vals + map[2] * step, step));
1047: PetscCall(PCTFS_rvec_add(base, vals + map[3] * step, step));
1048: PetscCall(PCTFS_rvec_copy(vals + map[3] * step, base, step));
1049: PetscCall(PCTFS_rvec_copy(vals + map[2] * step, base, step));
1050: PetscCall(PCTFS_rvec_copy(vals + map[1] * step, base, step));
1051: } else { /* general case ... odd geoms ... 3D */
1052: num++;
1053: while (*++map >= 0) PetscCall(PCTFS_rvec_add(base, vals + *map * step, step));
1055: map = *reduce;
1056: while (*++map >= 0) PetscCall(PCTFS_rvec_copy(vals + *map * step, base, step));
1058: reduce++;
1059: }
1060: }
1061: PetscFunctionReturn(PETSC_SUCCESS);
1062: }
1064: /******************************************************************************/
1065: static PetscErrorCode PCTFS_gs_gop_vec_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step)
1066: {
1067: PetscInt *num, *map, **reduce;
1068: PetscScalar *base;
1070: PetscFunctionBegin;
1071: num = gs->num_gop_local_reduce;
1072: reduce = gs->gop_local_reduce;
1073: while ((map = *reduce++)) {
1074: base = vals + map[0] * step;
1076: /* wall */
1077: if (*num == 2) {
1078: num++;
1079: PetscCall(PCTFS_rvec_add(base, vals + map[1] * step, step));
1080: } else if (*num == 3) { /* corner shared by three elements */
1081: num++;
1082: PetscCall(PCTFS_rvec_add(base, vals + map[1] * step, step));
1083: PetscCall(PCTFS_rvec_add(base, vals + map[2] * step, step));
1084: } else if (*num == 4) { /* corner shared by four elements */
1085: num++;
1086: PetscCall(PCTFS_rvec_add(base, vals + map[1] * step, step));
1087: PetscCall(PCTFS_rvec_add(base, vals + map[2] * step, step));
1088: PetscCall(PCTFS_rvec_add(base, vals + map[3] * step, step));
1089: } else { /* general case ... odd geoms ... 3D*/
1090: num++;
1091: while (*++map >= 0) PetscCall(PCTFS_rvec_add(base, vals + *map * step, step));
1092: }
1093: }
1094: PetscFunctionReturn(PETSC_SUCCESS);
1095: }
1097: /******************************************************************************/
1098: static PetscErrorCode PCTFS_gs_gop_vec_local_out(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step)
1099: {
1100: PetscInt *num, *map, **reduce;
1101: PetscScalar *base;
1103: PetscFunctionBegin;
1104: num = gs->num_gop_local_reduce;
1105: reduce = gs->gop_local_reduce;
1106: while ((map = *reduce++)) {
1107: base = vals + map[0] * step;
1109: /* wall */
1110: if (*num == 2) {
1111: num++;
1112: PetscCall(PCTFS_rvec_copy(vals + map[1] * step, base, step));
1113: } else if (*num == 3) { /* corner shared by three elements */
1114: num++;
1115: PetscCall(PCTFS_rvec_copy(vals + map[1] * step, base, step));
1116: PetscCall(PCTFS_rvec_copy(vals + map[2] * step, base, step));
1117: } else if (*num == 4) { /* corner shared by four elements */
1118: num++;
1119: PetscCall(PCTFS_rvec_copy(vals + map[1] * step, base, step));
1120: PetscCall(PCTFS_rvec_copy(vals + map[2] * step, base, step));
1121: PetscCall(PCTFS_rvec_copy(vals + map[3] * step, base, step));
1122: } else { /* general case ... odd geoms ... 3D*/
1123: num++;
1124: while (*++map >= 0) PetscCall(PCTFS_rvec_copy(vals + *map * step, base, step));
1125: }
1126: }
1127: PetscFunctionReturn(PETSC_SUCCESS);
1128: }
1130: /******************************************************************************/
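/*
   PCTFS_gs_gop_vec_pairwise_plus(): nonblocking pairwise exchange.  Receives
   are posted first (one per neighbor), the pairwise values are packed into
   gs->out and sent, the tree stage (if any) is overlapped with the
   communication, and each arriving buffer is accumulated into gs->pw_vals with
   BLASaxpy before the combined values are copied back into in_vals.  Finally
   the sends are completed with MPI_Wait.
*/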
1131: static PetscErrorCode PCTFS_gs_gop_vec_pairwise_plus(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt step)
1132: {
1133: PetscScalar *dptr1, *dptr2, *dptr3, *in1, *in2;
1134: PetscInt *iptr, *msg_list, *msg_size, **msg_nodes;
1135: PetscInt *pw, *list, *size, **nodes;
1136: MPI_Request *msg_ids_in, *msg_ids_out, *ids_in, *ids_out;
1137: MPI_Status status;
1138: PetscBLASInt i1 = 1, dstep;
1140: PetscFunctionBegin;
1141: /* strip and load the gs state into local work pointers */
1142: msg_list = list = gs->pair_list;
1143: msg_size = size = gs->msg_sizes;
1144: msg_nodes = nodes = gs->node_list;
1145: iptr = pw = gs->pw_elm_list;
1146: dptr1 = dptr3 = gs->pw_vals;
1147: msg_ids_in = ids_in = gs->msg_ids_in;
1148: msg_ids_out = ids_out = gs->msg_ids_out;
1149: dptr2 = gs->out;
1150: in1 = in2 = gs->in;
1152: /* post the receives */
1153: /* msg_nodes=nodes; */
1154: do {
1155: /* Should MPI_ANY_SOURCE be replaced by *list ? In that case do the
1156: second one *list and do list++ afterwards */
1157: PetscCallMPI(MPIU_Irecv(in1, *size * step, MPIU_SCALAR, MPI_ANY_SOURCE, MSGTAG1 + *list, gs->PCTFS_gs_comm, msg_ids_in));
1158: list++;
1159: msg_ids_in++;
1160: in1 += *size++ * step;
1161: } while (*++msg_nodes);
1162: msg_nodes = nodes;
1164: /* load gs values into the in/out gs buffers */
1165: while (*iptr >= 0) {
1166: PetscCall(PCTFS_rvec_copy(dptr3, in_vals + *iptr * step, step));
1167: dptr3 += step;
1168: iptr++;
1169: }
1171: /* load out buffers and post the sends */
1172: while ((iptr = *msg_nodes++)) {
1173: dptr3 = dptr2;
1174: while (*iptr >= 0) {
1175: PetscCall(PCTFS_rvec_copy(dptr2, dptr1 + *iptr * step, step));
1176: dptr2 += step;
1177: iptr++;
1178: }
1179: PetscCallMPI(MPIU_Isend(dptr3, *msg_size * step, MPIU_SCALAR, *msg_list, MSGTAG1 + PCTFS_my_id, gs->PCTFS_gs_comm, msg_ids_out));
1180: msg_size++;
1181: msg_list++;
1182: msg_ids_out++;
1183: }
1185: /* tree */
1186: if (gs->max_left_over) PetscCall(PCTFS_gs_gop_vec_tree_plus(gs, in_vals, step));
1188: /* process the received data */
1189: msg_nodes = nodes;
1190: while ((iptr = *nodes++)) {
1191: PetscScalar d1 = 1.0;
1193: /* Should I check the return value of MPI_Wait() or status? */
1194: /* Can this loop be replaced by a call to MPI_Waitall()? */
1195: PetscCallMPI(MPI_Wait(ids_in, &status));
1196: ids_in++;
1197: while (*iptr >= 0) {
1198: PetscCall(PetscBLASIntCast(step, &dstep));
1199: PetscCallBLAS("BLASaxpy", BLASaxpy_(&dstep, &d1, in2, &i1, dptr1 + *iptr * step, &i1));
1200: in2 += step;
1201: iptr++;
1202: }
1203: }
1205: /* replace vals */
1206: while (*pw >= 0) {
1207: PetscCall(PCTFS_rvec_copy(in_vals + *pw * step, dptr1, step));
1208: dptr1 += step;
1209: pw++;
1210: }
1212: /* clear isend message handles */
1213: /* This changed for clarity though it could be the same */
1215: /* Should I check the return value of MPI_Wait() or status? */
1216: /* Can this loop be replaced by a call to MPI_Waitall()? */
1217: while (*msg_nodes++) {
1218: PetscCallMPI(MPI_Wait(ids_out, &status));
1219: ids_out++;
1220: }
1221: PetscFunctionReturn(PETSC_SUCCESS);
1222: }
1224: /******************************************************************************/
1225: static PetscErrorCode PCTFS_gs_gop_vec_tree_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step)
1226: {
1227: PetscInt size, *in, *out;
1228: PetscScalar *buf, *work;
1229: PetscInt op[] = {GL_ADD, 0};
1230: PetscBLASInt i1 = 1;
1231: PetscBLASInt dstep;
1233: PetscFunctionBegin;
1234: /* copy over to local variables */
1235: in = gs->tree_map_in;
1236: out = gs->tree_map_out;
1237: buf = gs->tree_buf;
1238: work = gs->tree_work;
1239: size = gs->tree_nel * step;
1241: /* zero out collection buffer */
1242: PetscCall(PCTFS_rvec_zero(buf, size));
1244: /* copy over my contributions */
1245: while (*in >= 0) {
1246: PetscCall(PetscBLASIntCast(step, &dstep));
1247: PetscCallBLAS("BLAScopy", BLAScopy_(&dstep, vals + *in++ * step, &i1, buf + *out++ * step, &i1));
1248: }
1250: /* perform fan in/out on full buffer */
1251: /* must change PCTFS_grop to handle the blas */
1252: PetscCall(PCTFS_grop(buf, work, size, op));
1254: /* reset */
1255: in = gs->tree_map_in;
1256: out = gs->tree_map_out;
1258: /* get the portion of the results I need */
1259: while (*in >= 0) {
1260: PetscCall(PetscBLASIntCast(step, &dstep));
1261: PetscCallBLAS("BLAScopy", BLAScopy_(&dstep, buf + *out++ * step, &i1, vals + *in++ * step, &i1));
1262: }
1263: PetscFunctionReturn(PETSC_SUCCESS);
1264: }
1266: /******************************************************************************/
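/*
   The *_hc ("hypercube") variants below restrict the operation to a
   dim-dimensional sub-hypercube of processors: dim is clipped to
   PCTFS_i_log2_num_nodes, and a pairwise partner is used only when its id
   agrees with PCTFS_my_id above the low dim bits (the
   (PCTFS_my_id | mask) == (*list | mask) test).  The tree stage goes through
   PCTFS_grop_hc() with the same dim.
*/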
1267: PetscErrorCode PCTFS_gs_gop_hc(PCTFS_gs_id *gs, PetscScalar *vals, const char *op, PetscInt dim)
1268: {
1269: PetscFunctionBegin;
1270: switch (*op) {
1271: case '+':
1272: PetscCall(PCTFS_gs_gop_plus_hc(gs, vals, dim));
1273: break;
1274: default:
1275: PetscCall(PetscInfo(0, "PCTFS_gs_gop_hc() :: %c is not a valid op\n", op[0]));
1276: PetscCall(PetscInfo(0, "PCTFS_gs_gop_hc() :: default :: plus\n"));
1277: PetscCall(PCTFS_gs_gop_plus_hc(gs, vals, dim));
1278: break;
1279: }
1280: PetscFunctionReturn(PETSC_SUCCESS);
1281: }
1283: /******************************************************************************/
1284: static PetscErrorCode PCTFS_gs_gop_plus_hc(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt dim)
1285: {
1286: PetscFunctionBegin;
1287: /* if there's nothing to do return */
1288: if (dim <= 0) PetscFunctionReturn(PETSC_SUCCESS);
1290: /* can't do more dimensions than exist */
1291: dim = PetscMin(dim, PCTFS_i_log2_num_nodes);
1293: /* local only operations!!! */
1294: if (gs->num_local) PetscCall(PCTFS_gs_gop_local_plus(gs, vals));
1296: /* if intersection tree/pairwise and local isn't empty */
1297: if (gs->num_local_gop) {
1298: PetscCall(PCTFS_gs_gop_local_in_plus(gs, vals));
1300: /* pairwise will do tree inside ... */
1301: if (gs->num_pairs) PetscCall(PCTFS_gs_gop_pairwise_plus_hc(gs, vals, dim)); /* tree only */
1302: else if (gs->max_left_over) PetscCall(PCTFS_gs_gop_tree_plus_hc(gs, vals, dim));
1304: PetscCall(PCTFS_gs_gop_local_out(gs, vals));
1305: } else { /* if intersection tree/pairwise and local is empty */
1306: /* pairwise will do tree inside */
1307: if (gs->num_pairs) PetscCall(PCTFS_gs_gop_pairwise_plus_hc(gs, vals, dim)); /* tree */
1308: else if (gs->max_left_over) PetscCall(PCTFS_gs_gop_tree_plus_hc(gs, vals, dim));
1309: }
1310: PetscFunctionReturn(PETSC_SUCCESS);
1311: }
1313: /******************************************************************************/
1314: static PetscErrorCode PCTFS_gs_gop_pairwise_plus_hc(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt dim)
1315: {
1316: PetscScalar *dptr1, *dptr2, *dptr3, *in1, *in2;
1317: PetscInt *iptr, *msg_list, *msg_size, **msg_nodes;
1318: PetscInt *pw, *list, *size, **nodes;
1319: MPI_Request *msg_ids_in, *msg_ids_out, *ids_in, *ids_out;
1320: MPI_Status status;
1321: PetscInt i, mask = 1;
1323: PetscFunctionBegin;
1324: for (i = 1; i < dim; i++) {
1325: mask <<= 1;
1326: mask++;
1327: }
1329: /* strip and load the gs state into local work pointers */
1330: msg_list = list = gs->pair_list;
1331: msg_size = size = gs->msg_sizes;
1332: msg_nodes = nodes = gs->node_list;
1333: iptr = pw = gs->pw_elm_list;
1334: dptr1 = dptr3 = gs->pw_vals;
1335: msg_ids_in = ids_in = gs->msg_ids_in;
1336: msg_ids_out = ids_out = gs->msg_ids_out;
1337: dptr2 = gs->out;
1338: in1 = in2 = gs->in;
1340: /* post the receives */
1341: /* msg_nodes=nodes; */
1342: do {
1343: /* Should MPI_ANY_SOURCE be replaced by *list ? In that case do the
1344: second one *list and do list++ afterwards */
1345: if ((PCTFS_my_id | mask) == (*list | mask)) {
1346: PetscCallMPI(MPIU_Irecv(in1, *size, MPIU_SCALAR, MPI_ANY_SOURCE, MSGTAG1 + *list, gs->PCTFS_gs_comm, msg_ids_in));
1347: list++;
1348: msg_ids_in++;
1349: in1 += *size++;
1350: } else {
1351: list++;
1352: size++;
1353: }
1354: } while (*++msg_nodes);
1356: /* load gs values into the in/out gs buffers */
1357: while (*iptr >= 0) *dptr3++ = *(in_vals + *iptr++);
1359: /* load out buffers and post the sends */
1360: msg_nodes = nodes;
1361: list = msg_list;
1362: while ((iptr = *msg_nodes++)) {
1363: if ((PCTFS_my_id | mask) == (*list | mask)) {
1364: dptr3 = dptr2;
1365: while (*iptr >= 0) *dptr2++ = *(dptr1 + *iptr++);
1366: /* CHECK PERSISTENT COMMS MODE FOR ALL THIS STUFF */
1367: /* is msg_ids_out++ correct? */
1368: PetscCallMPI(MPIU_Isend(dptr3, *msg_size, MPIU_SCALAR, *list, MSGTAG1 + PCTFS_my_id, gs->PCTFS_gs_comm, msg_ids_out));
1369: msg_size++;
1370: list++;
1371: msg_ids_out++;
1372: } else {
1373: list++;
1374: msg_size++;
1375: }
1376: }
1378: /* do the tree while we're waiting */
1379: if (gs->max_left_over) PetscCall(PCTFS_gs_gop_tree_plus_hc(gs, in_vals, dim));
1381: /* process the received data */
1382: msg_nodes = nodes;
1383: list = msg_list;
1384: while ((iptr = *nodes++)) {
1385: if ((PCTFS_my_id | mask) == (*list | mask)) {
1386: /* Should I check the return value of MPI_Wait() or status? */
1387: /* Can this loop be replaced by a call to MPI_Waitall()? */
1388: PetscCallMPI(MPI_Wait(ids_in, &status));
1389: ids_in++;
1390: while (*iptr >= 0) *(dptr1 + *iptr++) += *in2++;
1391: }
1392: list++;
1393: }
1395: /* replace vals */
1396: while (*pw >= 0) *(in_vals + *pw++) = *dptr1++;
1398: /* clear isend message handles */
1399: /* This changed for clarity though it could be the same */
1400: while (*msg_nodes++) {
1401: if ((PCTFS_my_id | mask) == (*msg_list | mask)) {
1402: /* Should I check the return value of MPI_Wait() or status? */
1403: /* Can this loop be replaced by a call to MPI_Waitall()? */
1404: PetscCallMPI(MPI_Wait(ids_out, &status));
1405: ids_out++;
1406: }
1407: msg_list++;
1408: }
1409: PetscFunctionReturn(PETSC_SUCCESS);
1410: }
1412: /******************************************************************************/
1413: static PetscErrorCode PCTFS_gs_gop_tree_plus_hc(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt dim)
1414: {
1415: PetscInt size;
1416: PetscInt *in, *out;
1417: PetscScalar *buf, *work;
1418: PetscInt op[] = {GL_ADD, 0};
1420: PetscFunctionBegin;
1421: in = gs->tree_map_in;
1422: out = gs->tree_map_out;
1423: buf = gs->tree_buf;
1424: work = gs->tree_work;
1425: size = gs->tree_nel;
1427: PetscCall(PCTFS_rvec_zero(buf, size));
1429: while (*in >= 0) *(buf + *out++) = *(vals + *in++);
1431: in = gs->tree_map_in;
1432: out = gs->tree_map_out;
1434: PetscCall(PCTFS_grop_hc(buf, work, size, op, dim));
1436: while (*in >= 0) *(vals + *in++) = *(buf + *out++);
1437: PetscFunctionReturn(PETSC_SUCCESS);
1438: }