Actual source code: gs.c
petsc-3.14.6 2021-03-30
2: /***********************************gs.c***************************************
4: Author: Henry M. Tufo III
6: e-mail: hmt@cs.brown.edu
8: snail-mail:
9: Division of Applied Mathematics
10: Brown University
11: Providence, RI 02912
13: Last Modification:
14: 6.21.97
15: ************************************gs.c**************************************/
17: /***********************************gs.c***************************************
18: File Description:
19: -----------------
21: ************************************gs.c**************************************/
23: #include <../src/ksp/pc/impls/tfs/tfs.h>
25: /* default length of number of items via tree - doubles if exceeded */
26: #define TREE_BUF_SZ 2048;
27: #define GS_VEC_SZ 1
31: /***********************************gs.c***************************************
32: Type: struct gather_scatter_id
33: ------------------------------
35: ************************************gs.c**************************************/
36: typedef struct gather_scatter_id {
37: PetscInt id;
38: PetscInt nel_min;
39: PetscInt nel_max;
40: PetscInt nel_sum;
41: PetscInt negl;
42: PetscInt gl_max;
43: PetscInt gl_min;
44: PetscInt repeats;
45: PetscInt ordered;
46: PetscInt positive;
47: PetscScalar *vals;
49: /* bit mask info */
50: PetscInt *my_proc_mask;
51: PetscInt mask_sz;
52: PetscInt *ngh_buf;
53: PetscInt ngh_buf_sz;
54: PetscInt *nghs;
55: PetscInt num_nghs;
56: PetscInt max_nghs;
57: PetscInt *pw_nghs;
58: PetscInt num_pw_nghs;
59: PetscInt *tree_nghs;
60: PetscInt num_tree_nghs;
62: PetscInt num_loads;
64: /* repeats == true -> local info */
65: PetscInt nel; /* number of unique elememts */
66: PetscInt *elms; /* of size nel */
67: PetscInt nel_total;
68: PetscInt *local_elms; /* of size nel_total */
69: PetscInt *companion; /* of size nel_total */
71: /* local info */
72: PetscInt num_local_total;
73: PetscInt local_strength;
74: PetscInt num_local;
75: PetscInt *num_local_reduce;
76: PetscInt **local_reduce;
77: PetscInt num_local_gop;
78: PetscInt *num_gop_local_reduce;
79: PetscInt **gop_local_reduce;
81: /* pairwise info */
82: PetscInt level;
83: PetscInt num_pairs;
84: PetscInt max_pairs;
85: PetscInt loc_node_pairs;
86: PetscInt max_node_pairs;
87: PetscInt min_node_pairs;
88: PetscInt avg_node_pairs;
89: PetscInt *pair_list;
90: PetscInt *msg_sizes;
91: PetscInt **node_list;
92: PetscInt len_pw_list;
93: PetscInt *pw_elm_list;
94: PetscScalar *pw_vals;
96: MPI_Request *msg_ids_in;
97: MPI_Request *msg_ids_out;
99: PetscScalar *out;
100: PetscScalar *in;
101: PetscInt msg_total;
103: /* tree - crystal accumulator info */
104: PetscInt max_left_over;
105: PetscInt *pre;
106: PetscInt *in_num;
107: PetscInt *out_num;
108: PetscInt **in_list;
109: PetscInt **out_list;
111: /* new tree work*/
112: PetscInt tree_nel;
113: PetscInt *tree_elms;
114: PetscScalar *tree_buf;
115: PetscScalar *tree_work;
117: PetscInt tree_map_sz;
118: PetscInt *tree_map_in;
119: PetscInt *tree_map_out;
121: /* current memory status */
122: PetscInt gl_bss_min;
123: PetscInt gl_perm_min;
125: /* max segment size for PCTFS_gs_gop_vec() */
126: PetscInt vec_sz;
128: /* hack to make paul happy */
129: MPI_Comm PCTFS_gs_comm;
131: } PCTFS_gs_id;
133: static PCTFS_gs_id *gsi_check_args(PetscInt *elms, PetscInt nel, PetscInt level);
134: static PetscErrorCode gsi_via_bit_mask(PCTFS_gs_id *gs);
135: static PetscErrorCode get_ngh_buf(PCTFS_gs_id *gs);
136: static PetscErrorCode set_pairwise(PCTFS_gs_id *gs);
137: static PCTFS_gs_id *gsi_new(void);
138: static PetscErrorCode set_tree(PCTFS_gs_id *gs);
140: /* same for all but vector flavor */
141: static PetscErrorCode PCTFS_gs_gop_local_out(PCTFS_gs_id *gs, PetscScalar *vals);
142: /* vector flavor */
143: static PetscErrorCode PCTFS_gs_gop_vec_local_out(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);
145: static PetscErrorCode PCTFS_gs_gop_vec_plus(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt step);
146: static PetscErrorCode PCTFS_gs_gop_vec_pairwise_plus(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt step);
147: static PetscErrorCode PCTFS_gs_gop_vec_local_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);
148: static PetscErrorCode PCTFS_gs_gop_vec_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);
149: static PetscErrorCode PCTFS_gs_gop_vec_tree_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);
152: static PetscErrorCode PCTFS_gs_gop_local_plus(PCTFS_gs_id *gs, PetscScalar *vals);
153: static PetscErrorCode PCTFS_gs_gop_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals);
155: static PetscErrorCode PCTFS_gs_gop_plus_hc(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt dim);
156: static PetscErrorCode PCTFS_gs_gop_pairwise_plus_hc(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt dim);
157: static PetscErrorCode PCTFS_gs_gop_tree_plus_hc(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt dim);
159: /* global vars */
160: /* from comm.c module */
162: static PetscInt num_gs_ids = 0;
164: /* should make this dynamic ... later */
165: static PetscInt msg_buf =MAX_MSG_BUF;
166: static PetscInt vec_sz =GS_VEC_SZ;
167: static PetscInt *tree_buf =NULL;
168: static PetscInt tree_buf_sz=0;
169: static PetscInt ntree =0;
171: /***************************************************************************/
172: PetscErrorCode PCTFS_gs_init_vec_sz(PetscInt size)
173: {
175: vec_sz = size;
176: return(0);
177: }
179: /******************************************************************************/
180: PetscErrorCode PCTFS_gs_init_msg_buf_sz(PetscInt buf_size)
181: {
183: msg_buf = buf_size;
184: return(0);
185: }
187: /******************************************************************************/
188: PCTFS_gs_id *PCTFS_gs_init(PetscInt *elms, PetscInt nel, PetscInt level)
189: {
190: PCTFS_gs_id *gs;
191: MPI_Group PCTFS_gs_group;
192: MPI_Comm PCTFS_gs_comm;
196: /* ensure that communication package has been initialized */
197: PCTFS_comm_init();
200: /* determines if we have enough dynamic/semi-static memory */
201: /* checks input, allocs and sets gd_id template */
202: gs = gsi_check_args(elms,nel,level);
204: /* only bit mask version up and working for the moment */
205: /* LATER :: get int list version working for sparse pblms */
206: gsi_via_bit_mask(gs);CHKERRABORT(PETSC_COMM_WORLD,ierr);
209: MPI_Comm_group(MPI_COMM_WORLD,&PCTFS_gs_group);CHKERRABORT(PETSC_COMM_WORLD,ierr);
210: MPI_Comm_create(MPI_COMM_WORLD,PCTFS_gs_group,&PCTFS_gs_comm);CHKERRABORT(PETSC_COMM_WORLD,ierr);
211: MPI_Group_free(&PCTFS_gs_group);CHKERRABORT(PETSC_COMM_WORLD,ierr);
213: gs->PCTFS_gs_comm=PCTFS_gs_comm;
215: return(gs);
216: }
218: /******************************************************************************/
219: static PCTFS_gs_id *gsi_new(void)
220: {
222: PCTFS_gs_id *gs;
223: gs = (PCTFS_gs_id*) malloc(sizeof(PCTFS_gs_id));
224: PetscMemzero(gs,sizeof(PCTFS_gs_id));CHKERRABORT(PETSC_COMM_WORLD,ierr);
225: return(gs);
226: }
228: /******************************************************************************/
229: static PCTFS_gs_id *gsi_check_args(PetscInt *in_elms, PetscInt nel, PetscInt level)
230: {
231: PetscInt i, j, k, t2;
232: PetscInt *companion, *elms, *unique, *iptr;
233: PetscInt num_local=0, *num_to_reduce, **local_reduce;
234: PetscInt oprs[] = {NON_UNIFORM,GL_MIN,GL_MAX,GL_ADD,GL_MIN,GL_MAX,GL_MIN,GL_B_AND};
235: PetscInt vals[sizeof(oprs)/sizeof(oprs[0])-1];
236: PetscInt work[sizeof(oprs)/sizeof(oprs[0])-1];
237: PCTFS_gs_id *gs;
241: if (!in_elms) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"elms point to nothing!!!\n");
242: if (nel<0) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"can't have fewer than 0 elms!!!\n");
244: if (nel==0) { PetscInfo(0,"I don't have any elements!!!\n");CHKERRABORT(PETSC_COMM_WORLD,ierr); }
246: /* get space for gs template */
247: gs = gsi_new();
248: gs->id = ++num_gs_ids;
250: /* hmt 6.4.99 */
251: /* caller can set global ids that don't participate to 0 */
252: /* PCTFS_gs_init ignores all zeros in elm list */
253: /* negative global ids are still invalid */
254: for (i=j=0; i<nel; i++) {
255: if (in_elms[i]!=0) j++;
256: }
258: k=nel; nel=j;
260: /* copy over in_elms list and create inverse map */
261: elms = (PetscInt*) malloc((nel+1)*sizeof(PetscInt));
262: companion = (PetscInt*) malloc(nel*sizeof(PetscInt));
264: for (i=j=0; i<k; i++) {
265: if (in_elms[i]!=0) { elms[j] = in_elms[i]; companion[j++] = i; }
266: }
268: if (j!=nel) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"nel j mismatch!\n");
270: /* pre-pass ... check to see if sorted */
271: elms[nel] = INT_MAX;
272: iptr = elms;
273: unique = elms+1;
274: j =0;
275: while (*iptr!=INT_MAX) {
276: if (*iptr++>*unique++) { j=1; break; }
277: }
279: /* set up inverse map */
280: if (j) {
281: PetscInfo(0,"gsi_check_args() :: elm list *not* sorted!\n");CHKERRABORT(PETSC_COMM_WORLD,ierr);
282: PCTFS_SMI_sort((void*)elms, (void*)companion, nel, SORT_INTEGER);CHKERRABORT(PETSC_COMM_WORLD,ierr);
283: } else { PetscInfo(0,"gsi_check_args() :: elm list sorted!\n");CHKERRABORT(PETSC_COMM_WORLD,ierr); }
284: elms[nel] = INT_MIN;
286: /* first pass */
287: /* determine number of unique elements, check pd */
288: for (i=k=0; i<nel; i+=j) {
289: t2 = elms[i];
290: j = ++i;
292: /* clump 'em for now */
293: while (elms[j]==t2) j++;
295: /* how many together and num local */
296: if (j-=i) { num_local++; k+=j; }
297: }
299: /* how many unique elements? */
300: gs->repeats = k;
301: gs->nel = nel-k;
304: /* number of repeats? */
305: gs->num_local = num_local;
306: num_local += 2;
307: gs->local_reduce = local_reduce=(PetscInt**)malloc(num_local*sizeof(PetscInt*));
308: gs->num_local_reduce = num_to_reduce=(PetscInt*) malloc(num_local*sizeof(PetscInt));
310: unique = (PetscInt*) malloc((gs->nel+1)*sizeof(PetscInt));
311: gs->elms = unique;
312: gs->nel_total = nel;
313: gs->local_elms = elms;
314: gs->companion = companion;
316: /* compess map as well as keep track of local ops */
317: for (num_local=i=j=0; i<gs->nel; i++) {
318: k = j;
319: t2 = unique[i] = elms[j];
320: companion[i] = companion[j];
322: while (elms[j]==t2) j++;
324: if ((t2=(j-k))>1) {
325: /* number together */
326: num_to_reduce[num_local] = t2++;
328: iptr = local_reduce[num_local++] = (PetscInt*)malloc(t2*sizeof(PetscInt));
330: /* to use binary searching don't remap until we check intersection */
331: *iptr++ = i;
333: /* note that we're skipping the first one */
334: while (++k<j) *(iptr++) = companion[k];
335: *iptr = -1;
336: }
337: }
339: /* sentinel for ngh_buf */
340: unique[gs->nel]=INT_MAX;
342: /* for two partition sort hack */
343: num_to_reduce[num_local] = 0;
344: local_reduce[num_local] = NULL;
345: num_to_reduce[++num_local] = 0;
346: local_reduce[num_local] = NULL;
348: /* load 'em up */
349: /* note one extra to hold NON_UNIFORM flag!!! */
350: vals[2] = vals[1] = vals[0] = nel;
351: if (gs->nel>0) {
352: vals[3] = unique[0];
353: vals[4] = unique[gs->nel-1];
354: } else {
355: vals[3] = INT_MAX;
356: vals[4] = INT_MIN;
357: }
358: vals[5] = level;
359: vals[6] = num_gs_ids;
361: /* GLOBAL: send 'em out */
362: PCTFS_giop(vals,work,sizeof(oprs)/sizeof(oprs[0])-1,oprs);CHKERRABORT(PETSC_COMM_WORLD,ierr);
364: /* must be semi-pos def - only pairwise depends on this */
365: /* LATER - remove this restriction */
366: if (vals[3]<0) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"gsi_check_args() :: system not semi-pos def \n");
367: if (vals[4]==INT_MAX) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"gsi_check_args() :: system ub too large !\n");
369: gs->nel_min = vals[0];
370: gs->nel_max = vals[1];
371: gs->nel_sum = vals[2];
372: gs->gl_min = vals[3];
373: gs->gl_max = vals[4];
374: gs->negl = vals[4]-vals[3]+1;
376: if (gs->negl<=0) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"gsi_check_args() :: system empty or neg :: %d\n");
378: /* LATER :: add level == -1 -> program selects level */
379: if (vals[5]<0) vals[5]=0;
380: else if (vals[5]>PCTFS_num_nodes) vals[5]=PCTFS_num_nodes;
381: gs->level = vals[5];
383: return(gs);
384: }
386: /******************************************************************************/
387: static PetscErrorCode gsi_via_bit_mask(PCTFS_gs_id *gs)
388: {
389: PetscInt i, nel, *elms;
390: PetscInt t1;
391: PetscInt **reduce;
392: PetscInt *map;
396: /* totally local removes ... PCTFS_ct_bits == 0 */
397: get_ngh_buf(gs);
399: if (gs->level) set_pairwise(gs);
400: if (gs->max_left_over) set_tree(gs);
402: /* intersection local and pairwise/tree? */
403: gs->num_local_total = gs->num_local;
404: gs->gop_local_reduce = gs->local_reduce;
405: gs->num_gop_local_reduce = gs->num_local_reduce;
407: map = gs->companion;
409: /* is there any local compression */
410: if (!gs->num_local) {
411: gs->local_strength = NONE;
412: gs->num_local_gop = 0;
413: } else {
414: /* ok find intersection */
415: map = gs->companion;
416: reduce = gs->local_reduce;
417: for (i=0, t1=0; i<gs->num_local; i++, reduce++) {
418: if ((PCTFS_ivec_binary_search(**reduce,gs->pw_elm_list,gs->len_pw_list)>=0) || PCTFS_ivec_binary_search(**reduce,gs->tree_map_in,gs->tree_map_sz)>=0) {
419: t1++;
420: if (gs->num_local_reduce[i]<=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"nobody in list?");
421: gs->num_local_reduce[i] *= -1;
422: }
423: **reduce=map[**reduce];
424: }
426: /* intersection is empty */
427: if (!t1) {
428: gs->local_strength = FULL;
429: gs->num_local_gop = 0;
430: } else { /* intersection not empty */
431: gs->local_strength = PARTIAL;
433: PCTFS_SMI_sort((void*)gs->num_local_reduce, (void*)gs->local_reduce, gs->num_local + 1, SORT_INT_PTR);
435: gs->num_local_gop = t1;
436: gs->num_local_total = gs->num_local;
437: gs->num_local -= t1;
438: gs->gop_local_reduce = gs->local_reduce;
439: gs->num_gop_local_reduce = gs->num_local_reduce;
441: for (i=0; i<t1; i++) {
442: if (gs->num_gop_local_reduce[i]>=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"they aren't negative?");
443: gs->num_gop_local_reduce[i] *= -1;
444: gs->local_reduce++;
445: gs->num_local_reduce++;
446: }
447: gs->local_reduce++;
448: gs->num_local_reduce++;
449: }
450: }
452: elms = gs->pw_elm_list;
453: nel = gs->len_pw_list;
454: for (i=0; i<nel; i++) elms[i] = map[elms[i]];
456: elms = gs->tree_map_in;
457: nel = gs->tree_map_sz;
458: for (i=0; i<nel; i++) elms[i] = map[elms[i]];
460: /* clean up */
461: free((void*) gs->local_elms);
462: free((void*) gs->companion);
463: free((void*) gs->elms);
464: free((void*) gs->ngh_buf);
465: gs->local_elms = gs->companion = gs->elms = gs->ngh_buf = NULL;
466: return(0);
467: }
469: /******************************************************************************/
470: static PetscErrorCode place_in_tree(PetscInt elm)
471: {
472: PetscInt *tp, n;
475: if (ntree==tree_buf_sz) {
476: if (tree_buf_sz) {
477: tp = tree_buf;
478: n = tree_buf_sz;
479: tree_buf_sz<<=1;
480: tree_buf = (PetscInt*)malloc(tree_buf_sz*sizeof(PetscInt));
481: PCTFS_ivec_copy(tree_buf,tp,n);
482: free(tp);
483: } else {
484: tree_buf_sz = TREE_BUF_SZ;
485: tree_buf = (PetscInt*)malloc(tree_buf_sz*sizeof(PetscInt));
486: }
487: }
489: tree_buf[ntree++] = elm;
490: return(0);
491: }
493: /******************************************************************************/
494: static PetscErrorCode get_ngh_buf(PCTFS_gs_id *gs)
495: {
496: PetscInt i, j, npw=0, ntree_map=0;
497: PetscInt p_mask_size, ngh_buf_size, buf_size;
498: PetscInt *p_mask, *sh_proc_mask, *pw_sh_proc_mask;
499: PetscInt *ngh_buf, *buf1, *buf2;
500: PetscInt offset, per_load, num_loads, or_ct, start, end;
501: PetscInt *ptr1, *ptr2, i_start, negl, nel, *elms;
502: PetscInt oper=GL_B_OR;
503: PetscInt *ptr3, *t_mask, level, ct1, ct2;
507: /* to make life easier */
508: nel = gs->nel;
509: elms = gs->elms;
510: level = gs->level;
512: /* det #bytes needed for processor bit masks and init w/mask cor. to PCTFS_my_id */
513: p_mask = (PetscInt*) malloc(p_mask_size=PCTFS_len_bit_mask(PCTFS_num_nodes));
514: PCTFS_set_bit_mask(p_mask,p_mask_size,PCTFS_my_id);
516: /* allocate space for masks and info bufs */
517: gs->nghs = sh_proc_mask = (PetscInt*) malloc(p_mask_size);
518: gs->pw_nghs = pw_sh_proc_mask = (PetscInt*) malloc(p_mask_size);
519: gs->ngh_buf_sz = ngh_buf_size = p_mask_size*nel;
520: t_mask = (PetscInt*) malloc(p_mask_size);
521: gs->ngh_buf = ngh_buf = (PetscInt*) malloc(ngh_buf_size);
523: /* comm buffer size ... memory usage bounded by ~2*msg_buf */
524: /* had thought I could exploit rendezvous threshold */
526: /* default is one pass */
527: per_load = negl = gs->negl;
528: gs->num_loads = num_loads = 1;
529: i = p_mask_size*negl;
531: /* possible overflow on buffer size */
532: /* overflow hack */
533: if (i<0) i=INT_MAX;
535: buf_size = PetscMin(msg_buf,i);
537: /* can we do it? */
538: if (p_mask_size>buf_size) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"get_ngh_buf() :: buf<pms :: %d>%d\n",p_mask_size,buf_size);
540: /* get PCTFS_giop buf space ... make *only* one malloc */
541: buf1 = (PetscInt*) malloc(buf_size<<1);
543: /* more than one gior exchange needed? */
544: if (buf_size!=i) {
545: per_load = buf_size/p_mask_size;
546: buf_size = per_load*p_mask_size;
547: gs->num_loads = num_loads = negl/per_load + (negl%per_load>0);
548: }
551: /* convert buf sizes from #bytes to #ints - 32 bit only! */
552: p_mask_size/=sizeof(PetscInt); ngh_buf_size/=sizeof(PetscInt); buf_size/=sizeof(PetscInt);
554: /* find PCTFS_giop work space */
555: buf2 = buf1+buf_size;
557: /* hold #ints needed for processor masks */
558: gs->mask_sz=p_mask_size;
560: /* init buffers */
561: PCTFS_ivec_zero(sh_proc_mask,p_mask_size);
562: PCTFS_ivec_zero(pw_sh_proc_mask,p_mask_size);
563: PCTFS_ivec_zero(ngh_buf,ngh_buf_size);
565: /* HACK reset tree info */
566: tree_buf = NULL;
567: tree_buf_sz = ntree = 0;
569: /* ok do it */
570: for (ptr1=ngh_buf,ptr2=elms,end=gs->gl_min,or_ct=i=0; or_ct<num_loads; or_ct++) {
571: /* identity for bitwise or is 000...000 */
572: PCTFS_ivec_zero(buf1,buf_size);
574: /* load msg buffer */
575: for (start=end,end+=per_load,i_start=i; (offset=*ptr2)<end; i++, ptr2++) {
576: offset = (offset-start)*p_mask_size;
577: PCTFS_ivec_copy(buf1+offset,p_mask,p_mask_size);
578: }
580: /* GLOBAL: pass buffer */
581: PCTFS_giop(buf1,buf2,buf_size,&oper);
584: /* unload buffer into ngh_buf */
585: ptr2=(elms+i_start);
586: for (ptr3=buf1,j=start; j<end; ptr3+=p_mask_size,j++) {
587: /* I own it ... may have to pairwise it */
588: if (j==*ptr2) {
589: /* do i share it w/anyone? */
590: ct1 = PCTFS_ct_bits((char*)ptr3,p_mask_size*sizeof(PetscInt));
591: /* guess not */
592: if (ct1<2) { ptr2++; ptr1+=p_mask_size; continue; }
594: /* i do ... so keep info and turn off my bit */
595: PCTFS_ivec_copy(ptr1,ptr3,p_mask_size);
596: PCTFS_ivec_xor(ptr1,p_mask,p_mask_size);
597: PCTFS_ivec_or(sh_proc_mask,ptr1,p_mask_size);
599: /* is it to be done pairwise? */
600: if (--ct1<=level) {
601: npw++;
603: /* turn on high bit to indicate pw need to process */
604: *ptr2++ |= TOP_BIT;
605: PCTFS_ivec_or(pw_sh_proc_mask,ptr1,p_mask_size);
606: ptr1 += p_mask_size;
607: continue;
608: }
610: /* get set for next and note that I have a tree contribution */
611: /* could save exact elm index for tree here -> save a search */
612: ptr2++; ptr1+=p_mask_size; ntree_map++;
613: } else { /* i don't but still might be involved in tree */
615: /* shared by how many? */
616: ct1 = PCTFS_ct_bits((char*)ptr3,p_mask_size*sizeof(PetscInt));
618: /* none! */
619: if (ct1<2) continue;
621: /* is it going to be done pairwise? but not by me of course!*/
622: if (--ct1<=level) continue;
623: }
624: /* LATER we're going to have to process it NOW */
625: /* nope ... tree it */
626: place_in_tree(j);
627: }
628: }
630: free((void*)t_mask);
631: free((void*)buf1);
633: gs->len_pw_list = npw;
634: gs->num_nghs = PCTFS_ct_bits((char*)sh_proc_mask,p_mask_size*sizeof(PetscInt));
636: /* expand from bit mask list to int list and save ngh list */
637: gs->nghs = (PetscInt*) malloc(gs->num_nghs * sizeof(PetscInt));
638: PCTFS_bm_to_proc((char*)sh_proc_mask,p_mask_size*sizeof(PetscInt),gs->nghs);
640: gs->num_pw_nghs = PCTFS_ct_bits((char*)pw_sh_proc_mask,p_mask_size*sizeof(PetscInt));
642: oper = GL_MAX;
643: ct1 = gs->num_nghs;
644: PCTFS_giop(&ct1,&ct2,1,&oper);
645: gs->max_nghs = ct1;
647: gs->tree_map_sz = ntree_map;
648: gs->max_left_over=ntree;
650: free((void*)p_mask);
651: free((void*)sh_proc_mask);
652: return(0);
653: }
655: /******************************************************************************/
656: static PetscErrorCode set_pairwise(PCTFS_gs_id *gs)
657: {
658: PetscInt i, j;
659: PetscInt p_mask_size;
660: PetscInt *p_mask, *sh_proc_mask, *tmp_proc_mask;
661: PetscInt *ngh_buf, *buf2;
662: PetscInt offset;
663: PetscInt *msg_list, *msg_size, **msg_nodes, nprs;
664: PetscInt *pairwise_elm_list, len_pair_list=0;
665: PetscInt *iptr, t1, i_start, nel, *elms;
666: PetscInt ct;
670: /* to make life easier */
671: nel = gs->nel;
672: elms = gs->elms;
673: ngh_buf = gs->ngh_buf;
674: sh_proc_mask = gs->pw_nghs;
676: /* need a few temp masks */
677: p_mask_size = PCTFS_len_bit_mask(PCTFS_num_nodes);
678: p_mask = (PetscInt*) malloc(p_mask_size);
679: tmp_proc_mask = (PetscInt*) malloc(p_mask_size);
681: /* set mask to my PCTFS_my_id's bit mask */
682: PCTFS_set_bit_mask(p_mask,p_mask_size,PCTFS_my_id);
684: p_mask_size /= sizeof(PetscInt);
686: len_pair_list = gs->len_pw_list;
687: gs->pw_elm_list = pairwise_elm_list=(PetscInt*)malloc((len_pair_list+1)*sizeof(PetscInt));
689: /* how many processors (nghs) do we have to exchange with? */
690: nprs = gs->num_pairs = PCTFS_ct_bits((char*)sh_proc_mask,p_mask_size*sizeof(PetscInt));
693: /* allocate space for PCTFS_gs_gop() info */
694: gs->pair_list = msg_list = (PetscInt*) malloc(sizeof(PetscInt)*nprs);
695: gs->msg_sizes = msg_size = (PetscInt*) malloc(sizeof(PetscInt)*nprs);
696: gs->node_list = msg_nodes = (PetscInt**) malloc(sizeof(PetscInt*)*(nprs+1));
698: /* init msg_size list */
699: PCTFS_ivec_zero(msg_size,nprs);
701: /* expand from bit mask list to int list */
702: PCTFS_bm_to_proc((char*)sh_proc_mask,p_mask_size*sizeof(PetscInt),msg_list);
704: /* keep list of elements being handled pairwise */
705: for (i=j=0; i<nel; i++) {
706: if (elms[i] & TOP_BIT) { elms[i] ^= TOP_BIT; pairwise_elm_list[j++] = i; }
707: }
708: pairwise_elm_list[j] = -1;
710: gs->msg_ids_out = (MPI_Request*) malloc(sizeof(MPI_Request)*(nprs+1));
711: gs->msg_ids_out[nprs] = MPI_REQUEST_NULL;
712: gs->msg_ids_in = (MPI_Request*) malloc(sizeof(MPI_Request)*(nprs+1));
713: gs->msg_ids_in[nprs] = MPI_REQUEST_NULL;
714: gs->pw_vals = (PetscScalar*) malloc(sizeof(PetscScalar)*len_pair_list*vec_sz);
716: /* find who goes to each processor */
717: for (i_start=i=0; i<nprs; i++) {
718: /* processor i's mask */
719: PCTFS_set_bit_mask(p_mask,p_mask_size*sizeof(PetscInt),msg_list[i]);
721: /* det # going to processor i */
722: for (ct=j=0; j<len_pair_list; j++) {
723: buf2 = ngh_buf+(pairwise_elm_list[j]*p_mask_size);
724: PCTFS_ivec_and3(tmp_proc_mask,p_mask,buf2,p_mask_size);
725: if (PCTFS_ct_bits((char*)tmp_proc_mask,p_mask_size*sizeof(PetscInt))) ct++;
726: }
727: msg_size[i] = ct;
728: i_start = PetscMax(i_start,ct);
730: /*space to hold nodes in message to first neighbor */
731: msg_nodes[i] = iptr = (PetscInt*) malloc(sizeof(PetscInt)*(ct+1));
733: for (j=0;j<len_pair_list;j++) {
734: buf2 = ngh_buf+(pairwise_elm_list[j]*p_mask_size);
735: PCTFS_ivec_and3(tmp_proc_mask,p_mask,buf2,p_mask_size);
736: if (PCTFS_ct_bits((char*)tmp_proc_mask,p_mask_size*sizeof(PetscInt))) *iptr++ = j;
737: }
738: *iptr = -1;
739: }
740: msg_nodes[nprs] = NULL;
742: j = gs->loc_node_pairs=i_start;
743: t1 = GL_MAX;
744: PCTFS_giop(&i_start,&offset,1,&t1);
745: gs->max_node_pairs = i_start;
747: i_start = j;
748: t1 = GL_MIN;
749: PCTFS_giop(&i_start,&offset,1,&t1);
750: gs->min_node_pairs = i_start;
752: i_start = j;
753: t1 = GL_ADD;
754: PCTFS_giop(&i_start,&offset,1,&t1);
755: gs->avg_node_pairs = i_start/PCTFS_num_nodes + 1;
757: i_start = nprs;
758: t1 = GL_MAX;
759: PCTFS_giop(&i_start,&offset,1,&t1);
760: gs->max_pairs = i_start;
763: /* remap pairwise in tail of gsi_via_bit_mask() */
764: gs->msg_total = PCTFS_ivec_sum(gs->msg_sizes,nprs);
765: gs->out = (PetscScalar*) malloc(sizeof(PetscScalar)*gs->msg_total*vec_sz);
766: gs->in = (PetscScalar*) malloc(sizeof(PetscScalar)*gs->msg_total*vec_sz);
768: /* reset malloc pool */
769: free((void*)p_mask);
770: free((void*)tmp_proc_mask);
771: return(0);
772: }
774: /* to do pruned tree just save ngh buf copy for each one and decode here!
775: ******************************************************************************/
776: static PetscErrorCode set_tree(PCTFS_gs_id *gs)
777: {
778: PetscInt i, j, n, nel;
779: PetscInt *iptr_in, *iptr_out, *tree_elms, *elms;
782: /* local work ptrs */
783: elms = gs->elms;
784: nel = gs->nel;
786: /* how many via tree */
787: gs->tree_nel = n = ntree;
788: gs->tree_elms = tree_elms = iptr_in = tree_buf;
789: gs->tree_buf = (PetscScalar*) malloc(sizeof(PetscScalar)*n*vec_sz);
790: gs->tree_work = (PetscScalar*) malloc(sizeof(PetscScalar)*n*vec_sz);
791: j = gs->tree_map_sz;
792: gs->tree_map_in = iptr_in = (PetscInt*) malloc(sizeof(PetscInt)*(j+1));
793: gs->tree_map_out = iptr_out = (PetscInt*) malloc(sizeof(PetscInt)*(j+1));
795: /* search the longer of the two lists */
796: /* note ... could save this info in get_ngh_buf and save searches */
797: if (n<=nel) {
798: /* bijective fct w/remap - search elm list */
799: for (i=0; i<n; i++) {
800: if ((j=PCTFS_ivec_binary_search(*tree_elms++,elms,nel))>=0) {*iptr_in++ = j; *iptr_out++ = i;}
801: }
802: } else {
803: for (i=0; i<nel; i++) {
804: if ((j=PCTFS_ivec_binary_search(*elms++,tree_elms,n))>=0) {*iptr_in++ = i; *iptr_out++ = j;}
805: }
806: }
808: /* sentinel */
809: *iptr_in = *iptr_out = -1;
810: return(0);
811: }
813: /******************************************************************************/
814: static PetscErrorCode PCTFS_gs_gop_local_out(PCTFS_gs_id *gs, PetscScalar *vals)
815: {
816: PetscInt *num, *map, **reduce;
817: PetscScalar tmp;
820: num = gs->num_gop_local_reduce;
821: reduce = gs->gop_local_reduce;
822: while ((map = *reduce++)) {
823: /* wall */
824: if (*num == 2) {
825: num++;
826: vals[map[1]] = vals[map[0]];
827: } else if (*num == 3) { /* corner shared by three elements */
828: num++;
829: vals[map[2]] = vals[map[1]] = vals[map[0]];
830: } else if (*num == 4) { /* corner shared by four elements */
831: num++;
832: vals[map[3]] = vals[map[2]] = vals[map[1]] = vals[map[0]];
833: } else { /* general case ... odd geoms ... 3D*/
834: num++;
835: tmp = *(vals + *map++);
836: while (*map >= 0) *(vals + *map++) = tmp;
837: }
838: }
839: return(0);
840: }
842: /******************************************************************************/
843: static PetscErrorCode PCTFS_gs_gop_local_plus(PCTFS_gs_id *gs, PetscScalar *vals)
844: {
845: PetscInt *num, *map, **reduce;
846: PetscScalar tmp;
849: num = gs->num_local_reduce;
850: reduce = gs->local_reduce;
851: while ((map = *reduce)) {
852: /* wall */
853: if (*num == 2) {
854: num++; reduce++;
855: vals[map[1]] = vals[map[0]] += vals[map[1]];
856: } else if (*num == 3) { /* corner shared by three elements */
857: num++; reduce++;
858: vals[map[2]]=vals[map[1]]=vals[map[0]]+=(vals[map[1]]+vals[map[2]]);
859: } else if (*num == 4) { /* corner shared by four elements */
860: num++; reduce++;
861: vals[map[1]]=vals[map[2]]=vals[map[3]]=vals[map[0]] += (vals[map[1]] + vals[map[2]] + vals[map[3]]);
862: } else { /* general case ... odd geoms ... 3D*/
863: num++;
864: tmp = 0.0;
865: while (*map >= 0) tmp += *(vals + *map++);
867: map = *reduce++;
868: while (*map >= 0) *(vals + *map++) = tmp;
869: }
870: }
871: return(0);
872: }
874: /******************************************************************************/
875: static PetscErrorCode PCTFS_gs_gop_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals)
876: {
877: PetscInt *num, *map, **reduce;
878: PetscScalar *base;
881: num = gs->num_gop_local_reduce;
882: reduce = gs->gop_local_reduce;
883: while ((map = *reduce++)) {
884: /* wall */
885: if (*num == 2) {
886: num++;
887: vals[map[0]] += vals[map[1]];
888: } else if (*num == 3) { /* corner shared by three elements */
889: num++;
890: vals[map[0]] += (vals[map[1]] + vals[map[2]]);
891: } else if (*num == 4) { /* corner shared by four elements */
892: num++;
893: vals[map[0]] += (vals[map[1]] + vals[map[2]] + vals[map[3]]);
894: } else { /* general case ... odd geoms ... 3D*/
895: num++;
896: base = vals + *map++;
897: while (*map >= 0) *base += *(vals + *map++);
898: }
899: }
900: return(0);
901: }
903: /******************************************************************************/
904: PetscErrorCode PCTFS_gs_free(PCTFS_gs_id *gs)
905: {
906: PetscInt i;
910: MPI_Comm_free(&gs->PCTFS_gs_comm);
911: if (gs->nghs) free((void*) gs->nghs);
912: if (gs->pw_nghs) free((void*) gs->pw_nghs);
914: /* tree */
915: if (gs->max_left_over) {
916: if (gs->tree_elms) free((void*) gs->tree_elms);
917: if (gs->tree_buf) free((void*) gs->tree_buf);
918: if (gs->tree_work) free((void*) gs->tree_work);
919: if (gs->tree_map_in) free((void*) gs->tree_map_in);
920: if (gs->tree_map_out) free((void*) gs->tree_map_out);
921: }
923: /* pairwise info */
924: if (gs->num_pairs) {
925: /* should be NULL already */
926: if (gs->ngh_buf) free((void*) gs->ngh_buf);
927: if (gs->elms) free((void*) gs->elms);
928: if (gs->local_elms) free((void*) gs->local_elms);
929: if (gs->companion) free((void*) gs->companion);
931: /* only set if pairwise */
932: if (gs->vals) free((void*) gs->vals);
933: if (gs->in) free((void*) gs->in);
934: if (gs->out) free((void*) gs->out);
935: if (gs->msg_ids_in) free((void*) gs->msg_ids_in);
936: if (gs->msg_ids_out) free((void*) gs->msg_ids_out);
937: if (gs->pw_vals) free((void*) gs->pw_vals);
938: if (gs->pw_elm_list) free((void*) gs->pw_elm_list);
939: if (gs->node_list) {
940: for (i=0;i<gs->num_pairs;i++) {
941: if (gs->node_list[i]) {
942: free((void*) gs->node_list[i]);
943: }
944: }
945: free((void*) gs->node_list);
946: }
947: if (gs->msg_sizes) free((void*) gs->msg_sizes);
948: if (gs->pair_list) free((void*) gs->pair_list);
949: }
951: /* local info */
952: if (gs->num_local_total>=0) {
953: for (i=0;i<gs->num_local_total+1;i++) {
954: if (gs->num_gop_local_reduce[i]) free((void*) gs->gop_local_reduce[i]);
955: }
956: }
958: /* if intersection tree/pairwise and local isn't empty */
959: if (gs->gop_local_reduce) free((void*) gs->gop_local_reduce);
960: if (gs->num_gop_local_reduce) free((void*) gs->num_gop_local_reduce);
962: free((void*) gs);
963: return(0);
964: }
966: /******************************************************************************/
967: PetscErrorCode PCTFS_gs_gop_vec(PCTFS_gs_id *gs, PetscScalar *vals, const char *op, PetscInt step)
968: {
972: switch (*op) {
973: case '+':
974: PCTFS_gs_gop_vec_plus(gs,vals,step);
975: break;
976: default:
977: PetscInfo1(0,"PCTFS_gs_gop_vec() :: %c is not a valid op\n",op[0]);
978: PetscInfo(0,"PCTFS_gs_gop_vec() :: default :: plus\n");
979: PCTFS_gs_gop_vec_plus(gs,vals,step);
980: break;
981: }
982: return(0);
983: }
985: /******************************************************************************/
986: static PetscErrorCode PCTFS_gs_gop_vec_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step)
987: {
989: if (!gs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"PCTFS_gs_gop_vec() passed NULL gs handle!!!");
991: /* local only operations!!! */
992: if (gs->num_local) PCTFS_gs_gop_vec_local_plus(gs,vals,step);
994: /* if intersection tree/pairwise and local isn't empty */
995: if (gs->num_local_gop) {
996: PCTFS_gs_gop_vec_local_in_plus(gs,vals,step);
998: /* pairwise */
999: if (gs->num_pairs) PCTFS_gs_gop_vec_pairwise_plus(gs,vals,step);
1001: /* tree */
1002: else if (gs->max_left_over) PCTFS_gs_gop_vec_tree_plus(gs,vals,step);
1004: PCTFS_gs_gop_vec_local_out(gs,vals,step);
1005: } else { /* if intersection tree/pairwise and local is empty */
1006: /* pairwise */
1007: if (gs->num_pairs) PCTFS_gs_gop_vec_pairwise_plus(gs,vals,step);
1009: /* tree */
1010: else if (gs->max_left_over) PCTFS_gs_gop_vec_tree_plus(gs,vals,step);
1011: }
1012: return(0);
1013: }
1015: /******************************************************************************/
1016: static PetscErrorCode PCTFS_gs_gop_vec_local_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step)
1017: {
1018: PetscInt *num, *map, **reduce;
1019: PetscScalar *base;
1022: num = gs->num_local_reduce;
1023: reduce = gs->local_reduce;
1024: while ((map = *reduce)) {
1025: base = vals + map[0] * step;
1027: /* wall */
1028: if (*num == 2) {
1029: num++; reduce++;
1030: PCTFS_rvec_add (base,vals+map[1]*step,step);
1031: PCTFS_rvec_copy(vals+map[1]*step,base,step);
1032: } else if (*num == 3) { /* corner shared by three elements */
1033: num++; reduce++;
1034: PCTFS_rvec_add (base,vals+map[1]*step,step);
1035: PCTFS_rvec_add (base,vals+map[2]*step,step);
1036: PCTFS_rvec_copy(vals+map[2]*step,base,step);
1037: PCTFS_rvec_copy(vals+map[1]*step,base,step);
1038: } else if (*num == 4) { /* corner shared by four elements */
1039: num++; reduce++;
1040: PCTFS_rvec_add (base,vals+map[1]*step,step);
1041: PCTFS_rvec_add (base,vals+map[2]*step,step);
1042: PCTFS_rvec_add (base,vals+map[3]*step,step);
1043: PCTFS_rvec_copy(vals+map[3]*step,base,step);
1044: PCTFS_rvec_copy(vals+map[2]*step,base,step);
1045: PCTFS_rvec_copy(vals+map[1]*step,base,step);
1046: } else { /* general case ... odd geoms ... 3D */
1047: num++;
1048: while (*++map >= 0) PCTFS_rvec_add (base,vals+*map*step,step);
1050: map = *reduce;
1051: while (*++map >= 0) PCTFS_rvec_copy(vals+*map*step,base,step);
1053: reduce++;
1054: }
1055: }
1056: return(0);
1057: }
1059: /******************************************************************************/
1060: static PetscErrorCode PCTFS_gs_gop_vec_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step)
1061: {
1062: PetscInt *num, *map, **reduce;
1063: PetscScalar *base;
1066: num = gs->num_gop_local_reduce;
1067: reduce = gs->gop_local_reduce;
1068: while ((map = *reduce++)) {
1069: base = vals + map[0] * step;
1071: /* wall */
1072: if (*num == 2) {
1073: num++;
1074: PCTFS_rvec_add(base,vals+map[1]*step,step);
1075: } else if (*num == 3) { /* corner shared by three elements */
1076: num++;
1077: PCTFS_rvec_add(base,vals+map[1]*step,step);
1078: PCTFS_rvec_add(base,vals+map[2]*step,step);
1079: } else if (*num == 4) { /* corner shared by four elements */
1080: num++;
1081: PCTFS_rvec_add(base,vals+map[1]*step,step);
1082: PCTFS_rvec_add(base,vals+map[2]*step,step);
1083: PCTFS_rvec_add(base,vals+map[3]*step,step);
1084: } else { /* general case ... odd geoms ... 3D*/
1085: num++;
1086: while (*++map >= 0) PCTFS_rvec_add(base,vals+*map*step,step);
1087: }
1088: }
1089: return(0);
1090: }
1092: /******************************************************************************/
1093: static PetscErrorCode PCTFS_gs_gop_vec_local_out(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step)
1094: {
1095: PetscInt *num, *map, **reduce;
1096: PetscScalar *base;
1099: num = gs->num_gop_local_reduce;
1100: reduce = gs->gop_local_reduce;
1101: while ((map = *reduce++)) {
1102: base = vals + map[0] * step;
1104: /* wall */
1105: if (*num == 2) {
1106: num++;
1107: PCTFS_rvec_copy(vals+map[1]*step,base,step);
1108: } else if (*num == 3) { /* corner shared by three elements */
1109: num++;
1110: PCTFS_rvec_copy(vals+map[1]*step,base,step);
1111: PCTFS_rvec_copy(vals+map[2]*step,base,step);
1112: } else if (*num == 4) { /* corner shared by four elements */
1113: num++;
1114: PCTFS_rvec_copy(vals+map[1]*step,base,step);
1115: PCTFS_rvec_copy(vals+map[2]*step,base,step);
1116: PCTFS_rvec_copy(vals+map[3]*step,base,step);
1117: } else { /* general case ... odd geoms ... 3D*/
1118: num++;
1119: while (*++map >= 0) PCTFS_rvec_copy(vals+*map*step,base,step);
1120: }
1121: }
1122: return(0);
1123: }
1125: /******************************************************************************/
1126: static PetscErrorCode PCTFS_gs_gop_vec_pairwise_plus(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt step)
1127: {
1128: PetscScalar *dptr1, *dptr2, *dptr3, *in1, *in2;
1129: PetscInt *iptr, *msg_list, *msg_size, **msg_nodes;
1130: PetscInt *pw, *list, *size, **nodes;
1131: MPI_Request *msg_ids_in, *msg_ids_out, *ids_in, *ids_out;
1132: MPI_Status status;
1133: PetscBLASInt i1 = 1,dstep;
1137: /* strip and load s */
1138: msg_list = list = gs->pair_list;
1139: msg_size = size = gs->msg_sizes;
1140: msg_nodes = nodes = gs->node_list;
1141: iptr = pw = gs->pw_elm_list;
1142: dptr1 = dptr3 = gs->pw_vals;
1143: msg_ids_in = ids_in = gs->msg_ids_in;
1144: msg_ids_out = ids_out = gs->msg_ids_out;
1145: dptr2 = gs->out;
1146: in1=in2 = gs->in;
1148: /* post the receives */
1149: /* msg_nodes=nodes; */
1150: do {
1151: /* Should MPI_ANY_SOURCE be replaced by *list ? In that case do the
1152: second one *list and do list++ afterwards */
1153: MPI_Irecv(in1, *size *step, MPIU_SCALAR, MPI_ANY_SOURCE, MSGTAG1 + *list, gs->PCTFS_gs_comm, msg_ids_in);
1154: list++;msg_ids_in++;
1155: in1 += *size++ *step;
1156: } while (*++msg_nodes);
1157: msg_nodes=nodes;
1159: /* load gs values into in out gs buffers */
1160: while (*iptr >= 0) {
1161: PCTFS_rvec_copy(dptr3,in_vals + *iptr*step,step);
1162: dptr3+=step;
1163: iptr++;
1164: }
1166: /* load out buffers and post the sends */
1167: while ((iptr = *msg_nodes++)) {
1168: dptr3 = dptr2;
1169: while (*iptr >= 0) {
1170: PCTFS_rvec_copy(dptr2,dptr1 + *iptr*step,step);
1171: dptr2+=step;
1172: iptr++;
1173: }
1174: MPI_Isend(dptr3, *msg_size *step, MPIU_SCALAR, *msg_list, MSGTAG1+PCTFS_my_id, gs->PCTFS_gs_comm, msg_ids_out);
1175: msg_size++; msg_list++;msg_ids_out++;
1176: }
1178: /* tree */
1179: if (gs->max_left_over) PCTFS_gs_gop_vec_tree_plus(gs,in_vals,step);
1181: /* process the received data */
1182: msg_nodes=nodes;
1183: while ((iptr = *nodes++)) {
1184: PetscScalar d1 = 1.0;
1186: /* Should I check the return value of MPI_Wait() or status? */
1187: /* Can this loop be replaced by a call to MPI_Waitall()? */
1188: MPI_Wait(ids_in, &status);
1189: ids_in++;
1190: while (*iptr >= 0) {
1191: PetscBLASIntCast(step,&dstep);
1192: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&dstep,&d1,in2,&i1,dptr1 + *iptr*step,&i1));
1193: in2+=step;
1194: iptr++;
1195: }
1196: }
1198: /* replace vals */
1199: while (*pw >= 0) {
1200: PCTFS_rvec_copy(in_vals + *pw*step,dptr1,step);
1201: dptr1+=step;
1202: pw++;
1203: }
1205: /* clear isend message handles */
1206: /* This changed for clarity though it could be the same */
1208: /* Should I check the return value of MPI_Wait() or status? */
1209: /* Can this loop be replaced by a call to MPI_Waitall()? */
1210: while (*msg_nodes++) {
1211: MPI_Wait(ids_out, &status);
1212: ids_out++;
1213: }
1214: return(0);
1215: }
1217: /******************************************************************************/
1218: static PetscErrorCode PCTFS_gs_gop_vec_tree_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step)
1219: {
1220: PetscInt size, *in, *out;
1221: PetscScalar *buf, *work;
1222: PetscInt op[] = {GL_ADD,0};
1223: PetscBLASInt i1 = 1;
1225: PetscBLASInt dstep;
1228: /* copy over to local variables */
1229: in = gs->tree_map_in;
1230: out = gs->tree_map_out;
1231: buf = gs->tree_buf;
1232: work = gs->tree_work;
1233: size = gs->tree_nel*step;
1235: /* zero out collection buffer */
1236: PCTFS_rvec_zero(buf,size);
1239: /* copy over my contributions */
1240: while (*in >= 0) {
1241: PetscBLASIntCast(step,&dstep);
1242: PetscStackCallBLAS("BLAScopy",BLAScopy_(&dstep,vals + *in++ * step,&i1,buf + *out++ * step,&i1));
1243: }
1245: /* perform fan in/out on full buffer */
1246: /* must change PCTFS_grop to handle the blas */
1247: PCTFS_grop(buf,work,size,op);
1249: /* reset */
1250: in = gs->tree_map_in;
1251: out = gs->tree_map_out;
1253: /* get the portion of the results I need */
1254: while (*in >= 0) {
1255: PetscBLASIntCast(step,&dstep);
1256: PetscStackCallBLAS("BLAScopy",BLAScopy_(&dstep,buf + *out++ * step,&i1,vals + *in++ * step,&i1));
1257: }
1258: return(0);
1259: }
1261: /******************************************************************************/
1262: PetscErrorCode PCTFS_gs_gop_hc(PCTFS_gs_id *gs, PetscScalar *vals, const char *op, PetscInt dim)
1263: {
1267: switch (*op) {
1268: case '+':
1269: PCTFS_gs_gop_plus_hc(gs,vals,dim);
1270: break;
1271: default:
1272: PetscInfo1(0,"PCTFS_gs_gop_hc() :: %c is not a valid op\n",op[0]);
1273: PetscInfo(0,"PCTFS_gs_gop_hc() :: default :: plus\n");
1274: PCTFS_gs_gop_plus_hc(gs,vals,dim);
1275: break;
1276: }
1277: return(0);
1278: }
1280: /******************************************************************************/
1281: static PetscErrorCode PCTFS_gs_gop_plus_hc(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt dim)
1282: {
1284: /* if there's nothing to do return */
1285: if (dim<=0) return(0);
1287: /* can't do more dimensions then exist */
1288: dim = PetscMin(dim,PCTFS_i_log2_num_nodes);
1290: /* local only operations!!! */
1291: if (gs->num_local) PCTFS_gs_gop_local_plus(gs,vals);
1293: /* if intersection tree/pairwise and local isn't empty */
1294: if (gs->num_local_gop) {
1295: PCTFS_gs_gop_local_in_plus(gs,vals);
1297: /* pairwise will do tree inside ... */
1298: if (gs->num_pairs) PCTFS_gs_gop_pairwise_plus_hc(gs,vals,dim); /* tree only */
1299: else if (gs->max_left_over) PCTFS_gs_gop_tree_plus_hc(gs,vals,dim);
1301: PCTFS_gs_gop_local_out(gs,vals);
1302: } else { /* if intersection tree/pairwise and local is empty */
1303: /* pairwise will do tree inside */
1304: if (gs->num_pairs) PCTFS_gs_gop_pairwise_plus_hc(gs,vals,dim); /* tree */
1305: else if (gs->max_left_over) PCTFS_gs_gop_tree_plus_hc(gs,vals,dim);
1306: }
1307: return(0);
1308: }
1310: /******************************************************************************/
1311: static PetscErrorCode PCTFS_gs_gop_pairwise_plus_hc(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt dim)
1312: {
1313: PetscScalar *dptr1, *dptr2, *dptr3, *in1, *in2;
1314: PetscInt *iptr, *msg_list, *msg_size, **msg_nodes;
1315: PetscInt *pw, *list, *size, **nodes;
1316: MPI_Request *msg_ids_in, *msg_ids_out, *ids_in, *ids_out;
1317: MPI_Status status;
1318: PetscInt i, mask=1;
1322: for (i=1; i<dim; i++) { mask<<=1; mask++; }
1324: /* strip and load s */
1325: msg_list = list = gs->pair_list;
1326: msg_size = size = gs->msg_sizes;
1327: msg_nodes = nodes = gs->node_list;
1328: iptr = pw = gs->pw_elm_list;
1329: dptr1 = dptr3 = gs->pw_vals;
1330: msg_ids_in = ids_in = gs->msg_ids_in;
1331: msg_ids_out = ids_out = gs->msg_ids_out;
1332: dptr2 = gs->out;
1333: in1 = in2 = gs->in;
1335: /* post the receives */
1336: /* msg_nodes=nodes; */
1337: do {
1338: /* Should MPI_ANY_SOURCE be replaced by *list ? In that case do the
1339: second one *list and do list++ afterwards */
1340: if ((PCTFS_my_id|mask)==(*list|mask)) {
1341: MPI_Irecv(in1, *size, MPIU_SCALAR, MPI_ANY_SOURCE, MSGTAG1 + *list, gs->PCTFS_gs_comm, msg_ids_in);
1342: list++; msg_ids_in++;in1 += *size++;
1343: } else { list++; size++; }
1344: } while (*++msg_nodes);
1346: /* load gs values into in out gs buffers */
1347: while (*iptr >= 0) *dptr3++ = *(in_vals + *iptr++);
1349: /* load out buffers and post the sends */
1350: msg_nodes=nodes;
1351: list = msg_list;
1352: while ((iptr = *msg_nodes++)) {
1353: if ((PCTFS_my_id|mask)==(*list|mask)) {
1354: dptr3 = dptr2;
1355: while (*iptr >= 0) *dptr2++ = *(dptr1 + *iptr++);
1356: /* CHECK PERSISTENT COMMS MODE FOR ALL THIS STUFF */
1357: /* is msg_ids_out++ correct? */
1358: MPI_Isend(dptr3, *msg_size, MPIU_SCALAR, *list, MSGTAG1+PCTFS_my_id, gs->PCTFS_gs_comm, msg_ids_out);
1359: msg_size++;list++;msg_ids_out++;
1360: } else {list++; msg_size++;}
1361: }
1363: /* do the tree while we're waiting */
1364: if (gs->max_left_over) PCTFS_gs_gop_tree_plus_hc(gs,in_vals,dim);
1366: /* process the received data */
1367: msg_nodes=nodes;
1368: list = msg_list;
1369: while ((iptr = *nodes++)) {
1370: if ((PCTFS_my_id|mask)==(*list|mask)) {
1371: /* Should I check the return value of MPI_Wait() or status? */
1372: /* Can this loop be replaced by a call to MPI_Waitall()? */
1373: MPI_Wait(ids_in, &status);
1374: ids_in++;
1375: while (*iptr >= 0) *(dptr1 + *iptr++) += *in2++;
1376: }
1377: list++;
1378: }
1380: /* replace vals */
1381: while (*pw >= 0) *(in_vals + *pw++) = *dptr1++;
1383: /* clear isend message handles */
1384: /* This changed for clarity though it could be the same */
1385: while (*msg_nodes++) {
1386: if ((PCTFS_my_id|mask)==(*msg_list|mask)) {
1387: /* Should I check the return value of MPI_Wait() or status? */
1388: /* Can this loop be replaced by a call to MPI_Waitall()? */
1389: MPI_Wait(ids_out, &status);
1390: ids_out++;
1391: }
1392: msg_list++;
1393: }
1394: return(0);
1395: }
1397: /******************************************************************************/
1398: static PetscErrorCode PCTFS_gs_gop_tree_plus_hc(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt dim)
1399: {
1400: PetscInt size;
1401: PetscInt *in, *out;
1402: PetscScalar *buf, *work;
1403: PetscInt op[] = {GL_ADD,0};
1406: in = gs->tree_map_in;
1407: out = gs->tree_map_out;
1408: buf = gs->tree_buf;
1409: work = gs->tree_work;
1410: size = gs->tree_nel;
1412: PCTFS_rvec_zero(buf,size);
1414: while (*in >= 0) *(buf + *out++) = *(vals + *in++);
1416: in = gs->tree_map_in;
1417: out = gs->tree_map_out;
1419: PCTFS_grop_hc(buf,work,size,op,dim);
1421: while (*in >= 0) *(vals + *in++) = *(buf + *out++);
1422: return(0);
1423: }