Actual source code: gs.c
petsc-3.3-p7 2013-05-11
2: /***********************************gs.c***************************************
4: Author: Henry M. Tufo III
6: e-mail: hmt@cs.brown.edu
8: snail-mail:
9: Division of Applied Mathematics
10: Brown University
11: Providence, RI 02912
13: Last Modification:
14: 6.21.97
15: ************************************gs.c**************************************/
17: /***********************************gs.c***************************************
18: File Description:
19: -----------------
21: ************************************gs.c**************************************/
23: #include <../src/ksp/pc/impls/tfs/tfs.h>
25: /* default length of number of items via tree - doubles if exceeded */
26: #define TREE_BUF_SZ 2048;
27: #define GS_VEC_SZ 1
31: /***********************************gs.c***************************************
32: Type: struct gather_scatter_id
33: ------------------------------
35: ************************************gs.c**************************************/
36: typedef struct gather_scatter_id {
37: PetscInt id;
38: PetscInt nel_min;
39: PetscInt nel_max;
40: PetscInt nel_sum;
41: PetscInt negl;
42: PetscInt gl_max;
43: PetscInt gl_min;
44: PetscInt repeats;
45: PetscInt ordered;
46: PetscInt positive;
47: PetscScalar *vals;
49: /* bit mask info */
50: PetscInt *my_proc_mask;
51: PetscInt mask_sz;
52: PetscInt *ngh_buf;
53: PetscInt ngh_buf_sz;
54: PetscInt *nghs;
55: PetscInt num_nghs;
56: PetscInt max_nghs;
57: PetscInt *pw_nghs;
58: PetscInt num_pw_nghs;
59: PetscInt *tree_nghs;
60: PetscInt num_tree_nghs;
62: PetscInt num_loads;
64: /* repeats == true -> local info */
65: PetscInt nel; /* number of unique elememts */
66: PetscInt *elms; /* of size nel */
67: PetscInt nel_total;
68: PetscInt *local_elms; /* of size nel_total */
69: PetscInt *companion; /* of size nel_total */
71: /* local info */
72: PetscInt num_local_total;
73: PetscInt local_strength;
74: PetscInt num_local;
75: PetscInt *num_local_reduce;
76: PetscInt **local_reduce;
77: PetscInt num_local_gop;
78: PetscInt *num_gop_local_reduce;
79: PetscInt **gop_local_reduce;
81: /* pairwise info */
82: PetscInt level;
83: PetscInt num_pairs;
84: PetscInt max_pairs;
85: PetscInt loc_node_pairs;
86: PetscInt max_node_pairs;
87: PetscInt min_node_pairs;
88: PetscInt avg_node_pairs;
89: PetscInt *pair_list;
90: PetscInt *msg_sizes;
91: PetscInt **node_list;
92: PetscInt len_pw_list;
93: PetscInt *pw_elm_list;
94: PetscScalar *pw_vals;
96: MPI_Request *msg_ids_in;
97: MPI_Request *msg_ids_out;
99: PetscScalar *out;
100: PetscScalar *in;
101: PetscInt msg_total;
103: /* tree - crystal accumulator info */
104: PetscInt max_left_over;
105: PetscInt *pre;
106: PetscInt *in_num;
107: PetscInt *out_num;
108: PetscInt **in_list;
109: PetscInt **out_list;
111: /* new tree work*/
112: PetscInt tree_nel;
113: PetscInt *tree_elms;
114: PetscScalar *tree_buf;
115: PetscScalar *tree_work;
117: PetscInt tree_map_sz;
118: PetscInt *tree_map_in;
119: PetscInt *tree_map_out;
121: /* current memory status */
122: PetscInt gl_bss_min;
123: PetscInt gl_perm_min;
125: /* max segment size for PCTFS_gs_gop_vec() */
126: PetscInt vec_sz;
128: /* hack to make paul happy */
129: MPI_Comm PCTFS_gs_comm;
131: } PCTFS_gs_id;
133: static PCTFS_gs_id *gsi_check_args(PetscInt *elms, PetscInt nel, PetscInt level);
134: static PetscErrorCode gsi_via_bit_mask(PCTFS_gs_id *gs);
135: static PetscErrorCode get_ngh_buf(PCTFS_gs_id *gs);
136: static PetscErrorCode set_pairwise(PCTFS_gs_id *gs);
137: static PCTFS_gs_id * gsi_new(void);
138: static PetscErrorCode set_tree(PCTFS_gs_id *gs);
140: /* same for all but vector flavor */
141: static PetscErrorCode PCTFS_gs_gop_local_out(PCTFS_gs_id *gs, PetscScalar *vals);
142: /* vector flavor */
143: static PetscErrorCode PCTFS_gs_gop_vec_local_out(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);
145: static PetscErrorCode PCTFS_gs_gop_vec_plus(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt step);
146: static PetscErrorCode PCTFS_gs_gop_vec_pairwise_plus(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt step);
147: static PetscErrorCode PCTFS_gs_gop_vec_local_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);
148: static PetscErrorCode PCTFS_gs_gop_vec_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);
149: static PetscErrorCode PCTFS_gs_gop_vec_tree_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);
152: static PetscErrorCode PCTFS_gs_gop_local_plus(PCTFS_gs_id *gs, PetscScalar *vals);
153: static PetscErrorCode PCTFS_gs_gop_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals);
155: static PetscErrorCode PCTFS_gs_gop_plus_hc(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt dim);
156: static PetscErrorCode PCTFS_gs_gop_pairwise_plus_hc(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt dim);
157: static PetscErrorCode PCTFS_gs_gop_tree_plus_hc(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt dim);
159: /* global vars */
160: /* from comm.c module */
162: static PetscInt num_gs_ids = 0;
164: /* should make this dynamic ... later */
165: static PetscInt msg_buf=MAX_MSG_BUF;
166: static PetscInt vec_sz=GS_VEC_SZ;
167: static PetscInt *tree_buf=NULL;
168: static PetscInt tree_buf_sz=0;
169: static PetscInt ntree=0;
171: /***************************************************************************/
172: PetscErrorCode PCTFS_gs_init_vec_sz(PetscInt size)
173: {
175: vec_sz = size;
176: return(0);
177: }
179: /******************************************************************************/
180: PetscErrorCode PCTFS_gs_init_msg_buf_sz(PetscInt buf_size)
181: {
183: msg_buf = buf_size;
184: return(0);
185: }
187: /******************************************************************************/
188: PCTFS_gs_id *PCTFS_gs_init( PetscInt *elms, PetscInt nel, PetscInt level)
189: {
190: PCTFS_gs_id *gs;
191: MPI_Group PCTFS_gs_group;
192: MPI_Comm PCTFS_gs_comm;
196: /* ensure that communication package has been initialized */
197: PCTFS_comm_init();
200: /* determines if we have enough dynamic/semi-static memory */
201: /* checks input, allocs and sets gd_id template */
202: gs = gsi_check_args(elms,nel,level);
204: /* only bit mask version up and working for the moment */
205: /* LATER :: get int list version working for sparse pblms */
206: gsi_via_bit_mask(gs);CHKERRABORT(PETSC_COMM_WORLD,ierr);
209: MPI_Comm_group(MPI_COMM_WORLD,&PCTFS_gs_group);CHKERRABORT(PETSC_COMM_WORLD,ierr);
210: MPI_Comm_create(MPI_COMM_WORLD,PCTFS_gs_group,&PCTFS_gs_comm);CHKERRABORT(PETSC_COMM_WORLD,ierr);
211: gs->PCTFS_gs_comm=PCTFS_gs_comm;
213: return(gs);
214: }
216: /******************************************************************************/
217: static PCTFS_gs_id *gsi_new(void)
218: {
220: PCTFS_gs_id *gs;
221: gs = (PCTFS_gs_id *) malloc(sizeof(PCTFS_gs_id));
222: PetscMemzero(gs,sizeof(PCTFS_gs_id));CHKERRABORT(PETSC_COMM_WORLD,ierr);
223: return(gs);
224: }
226: /******************************************************************************/
227: static PCTFS_gs_id * gsi_check_args(PetscInt *in_elms, PetscInt nel, PetscInt level)
228: {
229: PetscInt i, j, k, t2;
230: PetscInt *companion, *elms, *unique, *iptr;
231: PetscInt num_local=0, *num_to_reduce, **local_reduce;
232: PetscInt oprs[] = {NON_UNIFORM,GL_MIN,GL_MAX,GL_ADD,GL_MIN,GL_MAX,GL_MIN,GL_B_AND};
233: PetscInt vals[sizeof(oprs)/sizeof(oprs[0])-1];
234: PetscInt work[sizeof(oprs)/sizeof(oprs[0])-1];
235: PCTFS_gs_id *gs;
239: if (!in_elms) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"elms point to nothing!!!\n");
240: if (nel<0) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"can't have fewer than 0 elms!!!\n");
242: if (nel==0)
243: {PetscInfo(0,"I don't have any elements!!!\n");CHKERRABORT(PETSC_COMM_WORLD,ierr);}
245: /* get space for gs template */
246: gs = gsi_new();
247: gs->id = ++num_gs_ids;
249: /* hmt 6.4.99 */
250: /* caller can set global ids that don't participate to 0 */
251: /* PCTFS_gs_init ignores all zeros in elm list */
252: /* negative global ids are still invalid */
253: for (i=j=0;i<nel;i++)
254: {if (in_elms[i]!=0) {j++;}}
256: k=nel; nel=j;
258: /* copy over in_elms list and create inverse map */
259: elms = (PetscInt*) malloc((nel+1)*sizeof(PetscInt));
260: companion = (PetscInt*) malloc(nel*sizeof(PetscInt));
262: for (i=j=0;i<k;i++)
263: {
264: if (in_elms[i]!=0)
265: {elms[j] = in_elms[i]; companion[j++] = i;}
266: }
268: if (j!=nel) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"nel j mismatch!\n");
270: /* pre-pass ... check to see if sorted */
271: elms[nel] = INT_MAX;
272: iptr = elms;
273: unique = elms+1;
274: j=0;
275: while (*iptr!=INT_MAX)
276: {
277: if (*iptr++>*unique++)
278: {j=1; break;}
279: }
281: /* set up inverse map */
282: if (j)
283: {
284: PetscInfo(0,"gsi_check_args() :: elm list *not* sorted!\n");CHKERRABORT(PETSC_COMM_WORLD,ierr);
285: PCTFS_SMI_sort((void*)elms, (void*)companion, nel, SORT_INTEGER);CHKERRABORT(PETSC_COMM_WORLD,ierr);
286: }
287: else
288: {PetscInfo(0,"gsi_check_args() :: elm list sorted!\n");CHKERRABORT(PETSC_COMM_WORLD,ierr);}
289: elms[nel] = INT_MIN;
291: /* first pass */
292: /* determine number of unique elements, check pd */
293: for (i=k=0;i<nel;i+=j)
294: {
295: t2 = elms[i];
296: j=++i;
297:
298: /* clump 'em for now */
299: while (elms[j]==t2) {j++;}
300:
301: /* how many together and num local */
302: if (j-=i)
303: {num_local++; k+=j;}
304: }
306: /* how many unique elements? */
307: gs->repeats=k;
308: gs->nel = nel-k;
311: /* number of repeats? */
312: gs->num_local = num_local;
313: num_local+=2;
314: gs->local_reduce=local_reduce=(PetscInt **)malloc(num_local*sizeof(PetscInt*));
315: gs->num_local_reduce=num_to_reduce=(PetscInt*) malloc(num_local*sizeof(PetscInt));
317: unique = (PetscInt*) malloc((gs->nel+1)*sizeof(PetscInt));
318: gs->elms = unique;
319: gs->nel_total = nel;
320: gs->local_elms = elms;
321: gs->companion = companion;
323: /* compess map as well as keep track of local ops */
324: for (num_local=i=j=0;i<gs->nel;i++)
325: {
326: k=j;
327: t2 = unique[i] = elms[j];
328: companion[i] = companion[j];
329:
330: while (elms[j]==t2) {j++;}
332: if ((t2=(j-k))>1)
333: {
334: /* number together */
335: num_to_reduce[num_local] = t2++;
336: iptr = local_reduce[num_local++] = (PetscInt*)malloc(t2*sizeof(PetscInt));
338: /* to use binary searching don't remap until we check intersection */
339: *iptr++ = i;
340:
341: /* note that we're skipping the first one */
342: while (++k<j)
343: {*(iptr++) = companion[k];}
344: *iptr = -1;
345: }
346: }
348: /* sentinel for ngh_buf */
349: unique[gs->nel]=INT_MAX;
351: /* for two partition sort hack */
352: num_to_reduce[num_local] = 0;
353: local_reduce[num_local] = NULL;
354: num_to_reduce[++num_local] = 0;
355: local_reduce[num_local] = NULL;
357: /* load 'em up */
358: /* note one extra to hold NON_UNIFORM flag!!! */
359: vals[2] = vals[1] = vals[0] = nel;
360: if (gs->nel>0)
361: {
362: vals[3] = unique[0];
363: vals[4] = unique[gs->nel-1];
364: }
365: else
366: {
367: vals[3] = INT_MAX;
368: vals[4] = INT_MIN;
369: }
370: vals[5] = level;
371: vals[6] = num_gs_ids;
373: /* GLOBAL: send 'em out */
374: PCTFS_giop(vals,work,sizeof(oprs)/sizeof(oprs[0])-1,oprs);CHKERRABORT(PETSC_COMM_WORLD,ierr);
376: /* must be semi-pos def - only pairwise depends on this */
377: /* LATER - remove this restriction */
378: if (vals[3]<0) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"gsi_check_args() :: system not semi-pos def \n");
379: if (vals[4]==INT_MAX) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"gsi_check_args() :: system ub too large !\n");
381: gs->nel_min = vals[0];
382: gs->nel_max = vals[1];
383: gs->nel_sum = vals[2];
384: gs->gl_min = vals[3];
385: gs->gl_max = vals[4];
386: gs->negl = vals[4]-vals[3]+1;
388: if (gs->negl<=0) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"gsi_check_args() :: system empty or neg :: %d\n");
389:
390: /* LATER :: add level == -1 -> program selects level */
391: if (vals[5]<0)
392: {vals[5]=0;}
393: else if (vals[5]>PCTFS_num_nodes)
394: {vals[5]=PCTFS_num_nodes;}
395: gs->level = vals[5];
397: return(gs);
398: }
400: /******************************************************************************/
401: static PetscErrorCode gsi_via_bit_mask(PCTFS_gs_id *gs)
402: {
403: PetscInt i, nel, *elms;
404: PetscInt t1;
405: PetscInt **reduce;
406: PetscInt *map;
410: /* totally local removes ... PCTFS_ct_bits == 0 */
411: get_ngh_buf(gs);
413: if (gs->level) set_pairwise(gs);
414: if (gs->max_left_over) set_tree(gs);
416: /* intersection local and pairwise/tree? */
417: gs->num_local_total = gs->num_local;
418: gs->gop_local_reduce = gs->local_reduce;
419: gs->num_gop_local_reduce = gs->num_local_reduce;
421: map = gs->companion;
423: /* is there any local compression */
424: if (!gs->num_local) {
425: gs->local_strength = NONE;
426: gs->num_local_gop = 0;
427: } else {
428: /* ok find intersection */
429: map = gs->companion;
430: reduce = gs->local_reduce;
431: for (i=0, t1=0; i<gs->num_local; i++, reduce++)
432: {
433: if ((PCTFS_ivec_binary_search(**reduce,gs->pw_elm_list,gs->len_pw_list)>=0)
434: ||
435: PCTFS_ivec_binary_search(**reduce,gs->tree_map_in,gs->tree_map_sz)>=0)
436: {
437: t1++;
438: if (gs->num_local_reduce[i]<=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"nobody in list?");
439: gs->num_local_reduce[i] *= -1;
440: }
441: **reduce=map[**reduce];
442: }
444: /* intersection is empty */
445: if (!t1)
446: {
447: gs->local_strength = FULL;
448: gs->num_local_gop = 0;
449: }
450: /* intersection not empty */
451: else
452: {
453: gs->local_strength = PARTIAL;
454: PCTFS_SMI_sort((void*)gs->num_local_reduce, (void*)gs->local_reduce, gs->num_local + 1, SORT_INT_PTR);
456: gs->num_local_gop = t1;
457: gs->num_local_total = gs->num_local;
458: gs->num_local -= t1;
459: gs->gop_local_reduce = gs->local_reduce;
460: gs->num_gop_local_reduce = gs->num_local_reduce;
462: for (i=0; i<t1; i++)
463: {
464: if (gs->num_gop_local_reduce[i]>=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"they aren't negative?");
465: gs->num_gop_local_reduce[i] *= -1;
466: gs->local_reduce++;
467: gs->num_local_reduce++;
468: }
469: gs->local_reduce++;
470: gs->num_local_reduce++;
471: }
472: }
474: elms = gs->pw_elm_list;
475: nel = gs->len_pw_list;
476: for (i=0; i<nel; i++)
477: {elms[i] = map[elms[i]];}
479: elms = gs->tree_map_in;
480: nel = gs->tree_map_sz;
481: for (i=0; i<nel; i++)
482: {elms[i] = map[elms[i]];}
484: /* clean up */
485: free((void*) gs->local_elms);
486: free((void*) gs->companion);
487: free((void*) gs->elms);
488: free((void*) gs->ngh_buf);
489: gs->local_elms = gs->companion = gs->elms = gs->ngh_buf = NULL;
490: return(0);
491: }
493: /******************************************************************************/
494: static PetscErrorCode place_in_tree( PetscInt elm)
495: {
496: PetscInt *tp, n;
499: if (ntree==tree_buf_sz)
500: {
501: if (tree_buf_sz)
502: {
503: tp = tree_buf;
504: n = tree_buf_sz;
505: tree_buf_sz<<=1;
506: tree_buf = (PetscInt*)malloc(tree_buf_sz*sizeof(PetscInt));
507: PCTFS_ivec_copy(tree_buf,tp,n);
508: free(tp);
509: }
510: else
511: {
512: tree_buf_sz = TREE_BUF_SZ;
513: tree_buf = (PetscInt*)malloc(tree_buf_sz*sizeof(PetscInt));
514: }
515: }
517: tree_buf[ntree++] = elm;
518: return(0);
519: }
521: /******************************************************************************/
522: static PetscErrorCode get_ngh_buf(PCTFS_gs_id *gs)
523: {
524: PetscInt i, j, npw=0, ntree_map=0;
525: PetscInt p_mask_size, ngh_buf_size, buf_size;
526: PetscInt *p_mask, *sh_proc_mask, *pw_sh_proc_mask;
527: PetscInt *ngh_buf, *buf1, *buf2;
528: PetscInt offset, per_load, num_loads, or_ct, start, end;
529: PetscInt *ptr1, *ptr2, i_start, negl, nel, *elms;
530: PetscInt oper=GL_B_OR;
531: PetscInt *ptr3, *t_mask, level, ct1, ct2;
535: /* to make life easier */
536: nel = gs->nel;
537: elms = gs->elms;
538: level = gs->level;
539:
540: /* det #bytes needed for processor bit masks and init w/mask cor. to PCTFS_my_id */
541: p_mask = (PetscInt*) malloc(p_mask_size=PCTFS_len_bit_mask(PCTFS_num_nodes));
542: PCTFS_set_bit_mask(p_mask,p_mask_size,PCTFS_my_id);
544: /* allocate space for masks and info bufs */
545: gs->nghs = sh_proc_mask = (PetscInt*) malloc(p_mask_size);
546: gs->pw_nghs = pw_sh_proc_mask = (PetscInt*) malloc(p_mask_size);
547: gs->ngh_buf_sz = ngh_buf_size = p_mask_size*nel;
548: t_mask = (PetscInt*) malloc(p_mask_size);
549: gs->ngh_buf = ngh_buf = (PetscInt*) malloc(ngh_buf_size);
551: /* comm buffer size ... memory usage bounded by ~2*msg_buf */
552: /* had thought I could exploit rendezvous threshold */
554: /* default is one pass */
555: per_load = negl = gs->negl;
556: gs->num_loads = num_loads = 1;
557: i=p_mask_size*negl;
559: /* possible overflow on buffer size */
560: /* overflow hack */
561: if (i<0) {i=INT_MAX;}
563: buf_size = PetscMin(msg_buf,i);
565: /* can we do it? */
566: if (p_mask_size>buf_size) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"get_ngh_buf() :: buf<pms :: %d>%d\n",p_mask_size,buf_size);
568: /* get PCTFS_giop buf space ... make *only* one malloc */
569: buf1 = (PetscInt*) malloc(buf_size<<1);
571: /* more than one gior exchange needed? */
572: if (buf_size!=i)
573: {
574: per_load = buf_size/p_mask_size;
575: buf_size = per_load*p_mask_size;
576: gs->num_loads = num_loads = negl/per_load + (negl%per_load>0);
577: }
580: /* convert buf sizes from #bytes to #ints - 32 bit only! */
581: p_mask_size/=sizeof(PetscInt); ngh_buf_size/=sizeof(PetscInt); buf_size/=sizeof(PetscInt);
582:
583: /* find PCTFS_giop work space */
584: buf2 = buf1+buf_size;
586: /* hold #ints needed for processor masks */
587: gs->mask_sz=p_mask_size;
589: /* init buffers */
590: PCTFS_ivec_zero(sh_proc_mask,p_mask_size);
591: PCTFS_ivec_zero(pw_sh_proc_mask,p_mask_size);
592: PCTFS_ivec_zero(ngh_buf,ngh_buf_size);
594: /* HACK reset tree info */
595: tree_buf=NULL;
596: tree_buf_sz=ntree=0;
598: /* ok do it */
599: for (ptr1=ngh_buf,ptr2=elms,end=gs->gl_min,or_ct=i=0; or_ct<num_loads; or_ct++)
600: {
601: /* identity for bitwise or is 000...000 */
602: PCTFS_ivec_zero(buf1,buf_size);
604: /* load msg buffer */
605: for (start=end,end+=per_load,i_start=i; (offset=*ptr2)<end; i++, ptr2++)
606: {
607: offset = (offset-start)*p_mask_size;
608: PCTFS_ivec_copy(buf1+offset,p_mask,p_mask_size);
609: }
611: /* GLOBAL: pass buffer */
612: PCTFS_giop(buf1,buf2,buf_size,&oper);
615: /* unload buffer into ngh_buf */
616: ptr2=(elms+i_start);
617: for(ptr3=buf1,j=start; j<end; ptr3+=p_mask_size,j++)
618: {
619: /* I own it ... may have to pairwise it */
620: if (j==*ptr2)
621: {
622: /* do i share it w/anyone? */
623: ct1 = PCTFS_ct_bits((char *)ptr3,p_mask_size*sizeof(PetscInt));
624: /* guess not */
625: if (ct1<2)
626: {ptr2++; ptr1+=p_mask_size; continue;}
628: /* i do ... so keep info and turn off my bit */
629: PCTFS_ivec_copy(ptr1,ptr3,p_mask_size);
630: PCTFS_ivec_xor(ptr1,p_mask,p_mask_size);
631: PCTFS_ivec_or(sh_proc_mask,ptr1,p_mask_size);
632:
633: /* is it to be done pairwise? */
634: if (--ct1<=level)
635: {
636: npw++;
637:
638: /* turn on high bit to indicate pw need to process */
639: *ptr2++ |= TOP_BIT;
640: PCTFS_ivec_or(pw_sh_proc_mask,ptr1,p_mask_size);
641: ptr1+=p_mask_size;
642: continue;
643: }
645: /* get set for next and note that I have a tree contribution */
646: /* could save exact elm index for tree here -> save a search */
647: ptr2++; ptr1+=p_mask_size; ntree_map++;
648: }
649: /* i don't but still might be involved in tree */
650: else
651: {
653: /* shared by how many? */
654: ct1 = PCTFS_ct_bits((char *)ptr3,p_mask_size*sizeof(PetscInt));
656: /* none! */
657: if (ct1<2) continue;
659: /* is it going to be done pairwise? but not by me of course!*/
660: if (--ct1<=level) continue;
661: }
662: /* LATER we're going to have to process it NOW */
663: /* nope ... tree it */
664: place_in_tree(j);
665: }
666: }
668: free((void*)t_mask);
669: free((void*)buf1);
671: gs->len_pw_list=npw;
672: gs->num_nghs = PCTFS_ct_bits((char *)sh_proc_mask,p_mask_size*sizeof(PetscInt));
674: /* expand from bit mask list to int list and save ngh list */
675: gs->nghs = (PetscInt*) malloc(gs->num_nghs * sizeof(PetscInt));
676: PCTFS_bm_to_proc((char *)sh_proc_mask,p_mask_size*sizeof(PetscInt),gs->nghs);
678: gs->num_pw_nghs = PCTFS_ct_bits((char *)pw_sh_proc_mask,p_mask_size*sizeof(PetscInt));
680: oper = GL_MAX;
681: ct1 = gs->num_nghs;
682: PCTFS_giop(&ct1,&ct2,1,&oper);
683: gs->max_nghs = ct1;
685: gs->tree_map_sz = ntree_map;
686: gs->max_left_over=ntree;
688: free((void*)p_mask);
689: free((void*)sh_proc_mask);
690: return(0);
691: }
693: /******************************************************************************/
694: static PetscErrorCode set_pairwise(PCTFS_gs_id *gs)
695: {
696: PetscInt i, j;
697: PetscInt p_mask_size;
698: PetscInt *p_mask, *sh_proc_mask, *tmp_proc_mask;
699: PetscInt *ngh_buf, *buf2;
700: PetscInt offset;
701: PetscInt *msg_list, *msg_size, **msg_nodes, nprs;
702: PetscInt *pairwise_elm_list, len_pair_list=0;
703: PetscInt *iptr, t1, i_start, nel, *elms;
704: PetscInt ct;
708: /* to make life easier */
709: nel = gs->nel;
710: elms = gs->elms;
711: ngh_buf = gs->ngh_buf;
712: sh_proc_mask = gs->pw_nghs;
714: /* need a few temp masks */
715: p_mask_size = PCTFS_len_bit_mask(PCTFS_num_nodes);
716: p_mask = (PetscInt*) malloc(p_mask_size);
717: tmp_proc_mask = (PetscInt*) malloc(p_mask_size);
719: /* set mask to my PCTFS_my_id's bit mask */
720: PCTFS_set_bit_mask(p_mask,p_mask_size,PCTFS_my_id);
722: p_mask_size /= sizeof(PetscInt);
723:
724: len_pair_list=gs->len_pw_list;
725: gs->pw_elm_list=pairwise_elm_list=(PetscInt*)malloc((len_pair_list+1)*sizeof(PetscInt));
727: /* how many processors (nghs) do we have to exchange with? */
728: nprs=gs->num_pairs=PCTFS_ct_bits((char *)sh_proc_mask,p_mask_size*sizeof(PetscInt));
731: /* allocate space for PCTFS_gs_gop() info */
732: gs->pair_list = msg_list = (PetscInt *) malloc(sizeof(PetscInt)*nprs);
733: gs->msg_sizes = msg_size = (PetscInt *) malloc(sizeof(PetscInt)*nprs);
734: gs->node_list = msg_nodes = (PetscInt **) malloc(sizeof(PetscInt*)*(nprs+1));
736: /* init msg_size list */
737: PCTFS_ivec_zero(msg_size,nprs);
739: /* expand from bit mask list to int list */
740: PCTFS_bm_to_proc((char *)sh_proc_mask,p_mask_size*sizeof(PetscInt),msg_list);
741:
742: /* keep list of elements being handled pairwise */
743: for (i=j=0;i<nel;i++)
744: {
745: if (elms[i] & TOP_BIT)
746: {elms[i] ^= TOP_BIT; pairwise_elm_list[j++] = i;}
747: }
748: pairwise_elm_list[j] = -1;
750: gs->msg_ids_out = (MPI_Request *) malloc(sizeof(MPI_Request)*(nprs+1));
751: gs->msg_ids_out[nprs] = MPI_REQUEST_NULL;
752: gs->msg_ids_in = (MPI_Request *) malloc(sizeof(MPI_Request)*(nprs+1));
753: gs->msg_ids_in[nprs] = MPI_REQUEST_NULL;
754: gs->pw_vals = (PetscScalar *) malloc(sizeof(PetscScalar)*len_pair_list*vec_sz);
756: /* find who goes to each processor */
757: for (i_start=i=0;i<nprs;i++)
758: {
759: /* processor i's mask */
760: PCTFS_set_bit_mask(p_mask,p_mask_size*sizeof(PetscInt),msg_list[i]);
762: /* det # going to processor i */
763: for (ct=j=0;j<len_pair_list;j++)
764: {
765: buf2 = ngh_buf+(pairwise_elm_list[j]*p_mask_size);
766: PCTFS_ivec_and3(tmp_proc_mask,p_mask,buf2,p_mask_size);
767: if (PCTFS_ct_bits((char *)tmp_proc_mask,p_mask_size*sizeof(PetscInt)))
768: {ct++;}
769: }
770: msg_size[i] = ct;
771: i_start = PetscMax(i_start,ct);
773: /*space to hold nodes in message to first neighbor */
774: msg_nodes[i] = iptr = (PetscInt*) malloc(sizeof(PetscInt)*(ct+1));
776: for (j=0;j<len_pair_list;j++)
777: {
778: buf2 = ngh_buf+(pairwise_elm_list[j]*p_mask_size);
779: PCTFS_ivec_and3(tmp_proc_mask,p_mask,buf2,p_mask_size);
780: if (PCTFS_ct_bits((char *)tmp_proc_mask,p_mask_size*sizeof(PetscInt)))
781: {*iptr++ = j;}
782: }
783: *iptr = -1;
784: }
785: msg_nodes[nprs] = NULL;
787: j=gs->loc_node_pairs=i_start;
788: t1 = GL_MAX;
789: PCTFS_giop(&i_start,&offset,1,&t1);
790: gs->max_node_pairs = i_start;
792: i_start=j;
793: t1 = GL_MIN;
794: PCTFS_giop(&i_start,&offset,1,&t1);
795: gs->min_node_pairs = i_start;
797: i_start=j;
798: t1 = GL_ADD;
799: PCTFS_giop(&i_start,&offset,1,&t1);
800: gs->avg_node_pairs = i_start/PCTFS_num_nodes + 1;
802: i_start=nprs;
803: t1 = GL_MAX;
804: PCTFS_giop(&i_start,&offset,1,&t1);
805: gs->max_pairs = i_start;
808: /* remap pairwise in tail of gsi_via_bit_mask() */
809: gs->msg_total = PCTFS_ivec_sum(gs->msg_sizes,nprs);
810: gs->out = (PetscScalar *) malloc(sizeof(PetscScalar)*gs->msg_total*vec_sz);
811: gs->in = (PetscScalar *) malloc(sizeof(PetscScalar)*gs->msg_total*vec_sz);
813: /* reset malloc pool */
814: free((void*)p_mask);
815: free((void*)tmp_proc_mask);
816: return(0);
817: }
819: /* to do pruned tree just save ngh buf copy for each one and decode here!
820: ******************************************************************************/
821: static PetscErrorCode set_tree(PCTFS_gs_id *gs)
822: {
823: PetscInt i, j, n, nel;
824: PetscInt *iptr_in, *iptr_out, *tree_elms, *elms;
827: /* local work ptrs */
828: elms = gs->elms;
829: nel = gs->nel;
831: /* how many via tree */
832: gs->tree_nel = n = ntree;
833: gs->tree_elms = tree_elms = iptr_in = tree_buf;
834: gs->tree_buf = (PetscScalar *) malloc(sizeof(PetscScalar)*n*vec_sz);
835: gs->tree_work = (PetscScalar *) malloc(sizeof(PetscScalar)*n*vec_sz);
836: j=gs->tree_map_sz;
837: gs->tree_map_in = iptr_in = (PetscInt*) malloc(sizeof(PetscInt)*(j+1));
838: gs->tree_map_out = iptr_out = (PetscInt*) malloc(sizeof(PetscInt)*(j+1));
840: /* search the longer of the two lists */
841: /* note ... could save this info in get_ngh_buf and save searches */
842: if (n<=nel)
843: {
844: /* bijective fct w/remap - search elm list */
845: for (i=0; i<n; i++)
846: {
847: if ((j=PCTFS_ivec_binary_search(*tree_elms++,elms,nel))>=0)
848: {*iptr_in++ = j; *iptr_out++ = i;}
849: }
850: }
851: else
852: {
853: for (i=0; i<nel; i++)
854: {
855: if ((j=PCTFS_ivec_binary_search(*elms++,tree_elms,n))>=0)
856: {*iptr_in++ = i; *iptr_out++ = j;}
857: }
858: }
860: /* sentinel */
861: *iptr_in = *iptr_out = -1;
862: return(0);
863: }
865: /******************************************************************************/
866: static PetscErrorCode PCTFS_gs_gop_local_out( PCTFS_gs_id *gs, PetscScalar *vals)
867: {
868: PetscInt *num, *map, **reduce;
869: PetscScalar tmp;
872: num = gs->num_gop_local_reduce;
873: reduce = gs->gop_local_reduce;
874: while ((map = *reduce++))
875: {
876: /* wall */
877: if (*num == 2)
878: {
879: num ++;
880: vals[map[1]] = vals[map[0]];
881: }
882: /* corner shared by three elements */
883: else if (*num == 3)
884: {
885: num ++;
886: vals[map[2]] = vals[map[1]] = vals[map[0]];
887: }
888: /* corner shared by four elements */
889: else if (*num == 4)
890: {
891: num ++;
892: vals[map[3]] = vals[map[2]] = vals[map[1]] = vals[map[0]];
893: }
894: /* general case ... odd geoms ... 3D*/
895: else
896: {
897: num++;
898: tmp = *(vals + *map++);
899: while (*map >= 0)
900: {*(vals + *map++) = tmp;}
901: }
902: }
903: return(0);
904: }
906: /******************************************************************************/
907: static PetscErrorCode PCTFS_gs_gop_local_plus( PCTFS_gs_id *gs, PetscScalar *vals)
908: {
909: PetscInt *num, *map, **reduce;
910: PetscScalar tmp;
913: num = gs->num_local_reduce;
914: reduce = gs->local_reduce;
915: while ((map = *reduce))
916: {
917: /* wall */
918: if (*num == 2)
919: {
920: num ++; reduce++;
921: vals[map[1]] = vals[map[0]] += vals[map[1]];
922: }
923: /* corner shared by three elements */
924: else if (*num == 3)
925: {
926: num ++; reduce++;
927: vals[map[2]]=vals[map[1]]=vals[map[0]]+=(vals[map[1]]+vals[map[2]]);
928: }
929: /* corner shared by four elements */
930: else if (*num == 4)
931: {
932: num ++; reduce++;
933: vals[map[1]]=vals[map[2]]=vals[map[3]]=vals[map[0]] +=
934: (vals[map[1]] + vals[map[2]] + vals[map[3]]);
935: }
936: /* general case ... odd geoms ... 3D*/
937: else
938: {
939: num ++;
940: tmp = 0.0;
941: while (*map >= 0)
942: {tmp += *(vals + *map++);}
944: map = *reduce++;
945: while (*map >= 0)
946: {*(vals + *map++) = tmp;}
947: }
948: }
949: return(0);
950: }
952: /******************************************************************************/
953: static PetscErrorCode PCTFS_gs_gop_local_in_plus( PCTFS_gs_id *gs, PetscScalar *vals)
954: {
955: PetscInt *num, *map, **reduce;
956: PetscScalar *base;
959: num = gs->num_gop_local_reduce;
960: reduce = gs->gop_local_reduce;
961: while ((map = *reduce++))
962: {
963: /* wall */
964: if (*num == 2)
965: {
966: num ++;
967: vals[map[0]] += vals[map[1]];
968: }
969: /* corner shared by three elements */
970: else if (*num == 3)
971: {
972: num ++;
973: vals[map[0]] += (vals[map[1]] + vals[map[2]]);
974: }
975: /* corner shared by four elements */
976: else if (*num == 4)
977: {
978: num ++;
979: vals[map[0]] += (vals[map[1]] + vals[map[2]] + vals[map[3]]);
980: }
981: /* general case ... odd geoms ... 3D*/
982: else
983: {
984: num++;
985: base = vals + *map++;
986: while (*map >= 0)
987: {*base += *(vals + *map++);}
988: }
989: }
990: return(0);
991: }
993: /******************************************************************************/
994: PetscErrorCode PCTFS_gs_free( PCTFS_gs_id *gs)
995: {
996: PetscInt i;
999: if (gs->nghs) {free((void*) gs->nghs);}
1000: if (gs->pw_nghs) {free((void*) gs->pw_nghs);}
1002: /* tree */
1003: if (gs->max_left_over)
1004: {
1005: if (gs->tree_elms) {free((void*) gs->tree_elms);}
1006: if (gs->tree_buf) {free((void*) gs->tree_buf);}
1007: if (gs->tree_work) {free((void*) gs->tree_work);}
1008: if (gs->tree_map_in) {free((void*) gs->tree_map_in);}
1009: if (gs->tree_map_out) {free((void*) gs->tree_map_out);}
1010: }
1012: /* pairwise info */
1013: if (gs->num_pairs)
1014: {
1015: /* should be NULL already */
1016: if (gs->ngh_buf) {free((void*) gs->ngh_buf);}
1017: if (gs->elms) {free((void*) gs->elms);}
1018: if (gs->local_elms) {free((void*) gs->local_elms);}
1019: if (gs->companion) {free((void*) gs->companion);}
1020:
1021: /* only set if pairwise */
1022: if (gs->vals) {free((void*) gs->vals);}
1023: if (gs->in) {free((void*) gs->in);}
1024: if (gs->out) {free((void*) gs->out);}
1025: if (gs->msg_ids_in) {free((void*) gs->msg_ids_in);}
1026: if (gs->msg_ids_out) {free((void*) gs->msg_ids_out);}
1027: if (gs->pw_vals) {free((void*) gs->pw_vals);}
1028: if (gs->pw_elm_list) {free((void*) gs->pw_elm_list);}
1029: if (gs->node_list)
1030: {
1031: for (i=0;i<gs->num_pairs;i++)
1032: {if (gs->node_list[i]) {free((void*) gs->node_list[i]);}}
1033: free((void*) gs->node_list);
1034: }
1035: if (gs->msg_sizes) {free((void*) gs->msg_sizes);}
1036: if (gs->pair_list) {free((void*) gs->pair_list);}
1037: }
1039: /* local info */
1040: if (gs->num_local_total>=0)
1041: {
1042: for (i=0;i<gs->num_local_total+1;i++)
1043: /* for (i=0;i<gs->num_local_total;i++) */
1044: {
1045: if (gs->num_gop_local_reduce[i])
1046: {free((void*) gs->gop_local_reduce[i]);}
1047: }
1048: }
1050: /* if intersection tree/pairwise and local isn't empty */
1051: if (gs->gop_local_reduce) {free((void*) gs->gop_local_reduce);}
1052: if (gs->num_gop_local_reduce) {free((void*) gs->num_gop_local_reduce);}
1054: free((void*) gs);
1055: return(0);
1056: }
1058: /******************************************************************************/
1059: PetscErrorCode PCTFS_gs_gop_vec( PCTFS_gs_id *gs, PetscScalar *vals, const char *op, PetscInt step)
1060: {
1064: switch (*op) {
1065: case '+':
1066: PCTFS_gs_gop_vec_plus(gs,vals,step);
1067: break;
1068: default:
1069: PetscInfo1(0,"PCTFS_gs_gop_vec() :: %c is not a valid op",op[0]);
1070: PetscInfo(0,"PCTFS_gs_gop_vec() :: default :: plus");
1071: PCTFS_gs_gop_vec_plus(gs,vals,step);
1072: break;
1073: }
1074: return(0);
1075: }
1077: /******************************************************************************/
1078: static PetscErrorCode PCTFS_gs_gop_vec_plus( PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step)
1079: {
1081: if (!gs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"PCTFS_gs_gop_vec() passed NULL gs handle!!!");
1083: /* local only operations!!! */
1084: if (gs->num_local)
1085: {PCTFS_gs_gop_vec_local_plus(gs,vals,step);}
1087: /* if intersection tree/pairwise and local isn't empty */
1088: if (gs->num_local_gop)
1089: {
1090: PCTFS_gs_gop_vec_local_in_plus(gs,vals,step);
1092: /* pairwise */
1093: if (gs->num_pairs)
1094: {PCTFS_gs_gop_vec_pairwise_plus(gs,vals,step);}
1096: /* tree */
1097: else if (gs->max_left_over)
1098: {PCTFS_gs_gop_vec_tree_plus(gs,vals,step);}
1100: PCTFS_gs_gop_vec_local_out(gs,vals,step);
1101: }
1102: /* if intersection tree/pairwise and local is empty */
1103: else
1104: {
1105: /* pairwise */
1106: if (gs->num_pairs)
1107: {PCTFS_gs_gop_vec_pairwise_plus(gs,vals,step);}
1109: /* tree */
1110: else if (gs->max_left_over)
1111: {PCTFS_gs_gop_vec_tree_plus(gs,vals,step);}
1112: }
1113: return(0);
1114: }
1116: /******************************************************************************/
1117: static PetscErrorCode PCTFS_gs_gop_vec_local_plus( PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step)
1118: {
1119: PetscInt *num, *map, **reduce;
1120: PetscScalar *base;
1123: num = gs->num_local_reduce;
1124: reduce = gs->local_reduce;
1125: while ((map = *reduce))
1126: {
1127: base = vals + map[0] * step;
1129: /* wall */
1130: if (*num == 2)
1131: {
1132: num++; reduce++;
1133: PCTFS_rvec_add (base,vals+map[1]*step,step);
1134: PCTFS_rvec_copy(vals+map[1]*step,base,step);
1135: }
1136: /* corner shared by three elements */
1137: else if (*num == 3)
1138: {
1139: num++; reduce++;
1140: PCTFS_rvec_add (base,vals+map[1]*step,step);
1141: PCTFS_rvec_add (base,vals+map[2]*step,step);
1142: PCTFS_rvec_copy(vals+map[2]*step,base,step);
1143: PCTFS_rvec_copy(vals+map[1]*step,base,step);
1144: }
1145: /* corner shared by four elements */
1146: else if (*num == 4)
1147: {
1148: num++; reduce++;
1149: PCTFS_rvec_add (base,vals+map[1]*step,step);
1150: PCTFS_rvec_add (base,vals+map[2]*step,step);
1151: PCTFS_rvec_add (base,vals+map[3]*step,step);
1152: PCTFS_rvec_copy(vals+map[3]*step,base,step);
1153: PCTFS_rvec_copy(vals+map[2]*step,base,step);
1154: PCTFS_rvec_copy(vals+map[1]*step,base,step);
1155: }
1156: /* general case ... odd geoms ... 3D */
1157: else
1158: {
1159: num++;
1160: while (*++map >= 0)
1161: {PCTFS_rvec_add (base,vals+*map*step,step);}
1162:
1163: map = *reduce;
1164: while (*++map >= 0)
1165: {PCTFS_rvec_copy(vals+*map*step,base,step);}
1166:
1167: reduce++;
1168: }
1169: }
1170: return(0);
1171: }
1173: /******************************************************************************/
1174: static PetscErrorCode PCTFS_gs_gop_vec_local_in_plus( PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step)
1175: {
1176: PetscInt *num, *map, **reduce;
1177: PetscScalar *base;
1179: num = gs->num_gop_local_reduce;
1180: reduce = gs->gop_local_reduce;
1181: while ((map = *reduce++))
1182: {
1183: base = vals + map[0] * step;
1185: /* wall */
1186: if (*num == 2)
1187: {
1188: num ++;
1189: PCTFS_rvec_add(base,vals+map[1]*step,step);
1190: }
1191: /* corner shared by three elements */
1192: else if (*num == 3)
1193: {
1194: num ++;
1195: PCTFS_rvec_add(base,vals+map[1]*step,step);
1196: PCTFS_rvec_add(base,vals+map[2]*step,step);
1197: }
1198: /* corner shared by four elements */
1199: else if (*num == 4)
1200: {
1201: num ++;
1202: PCTFS_rvec_add(base,vals+map[1]*step,step);
1203: PCTFS_rvec_add(base,vals+map[2]*step,step);
1204: PCTFS_rvec_add(base,vals+map[3]*step,step);
1205: }
1206: /* general case ... odd geoms ... 3D*/
1207: else
1208: {
1209: num++;
1210: while (*++map >= 0)
1211: {PCTFS_rvec_add(base,vals+*map*step,step);}
1212: }
1213: }
1214: return(0);
1215: }
1217: /******************************************************************************/
1218: static PetscErrorCode PCTFS_gs_gop_vec_local_out( PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step)
1219: {
1220: PetscInt *num, *map, **reduce;
1221: PetscScalar *base;
1224: num = gs->num_gop_local_reduce;
1225: reduce = gs->gop_local_reduce;
1226: while ((map = *reduce++))
1227: {
1228: base = vals + map[0] * step;
1230: /* wall */
1231: if (*num == 2)
1232: {
1233: num ++;
1234: PCTFS_rvec_copy(vals+map[1]*step,base,step);
1235: }
1236: /* corner shared by three elements */
1237: else if (*num == 3)
1238: {
1239: num ++;
1240: PCTFS_rvec_copy(vals+map[1]*step,base,step);
1241: PCTFS_rvec_copy(vals+map[2]*step,base,step);
1242: }
1243: /* corner shared by four elements */
1244: else if (*num == 4)
1245: {
1246: num ++;
1247: PCTFS_rvec_copy(vals+map[1]*step,base,step);
1248: PCTFS_rvec_copy(vals+map[2]*step,base,step);
1249: PCTFS_rvec_copy(vals+map[3]*step,base,step);
1250: }
1251: /* general case ... odd geoms ... 3D*/
1252: else
1253: {
1254: num++;
1255: while (*++map >= 0)
1256: {PCTFS_rvec_copy(vals+*map*step,base,step);}
1257: }
1258: }
1259: return(0);
1260: }
1262: /******************************************************************************/
1263: static PetscErrorCode PCTFS_gs_gop_vec_pairwise_plus( PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt step)
1264: {
1265: PetscScalar *dptr1, *dptr2, *dptr3, *in1, *in2;
1266: PetscInt *iptr, *msg_list, *msg_size, **msg_nodes;
1267: PetscInt *pw, *list, *size, **nodes;
1268: MPI_Request *msg_ids_in, *msg_ids_out, *ids_in, *ids_out;
1269: MPI_Status status;
1270: PetscBLASInt i1 = 1,dstep;
1274: /* strip and load s */
1275: msg_list =list = gs->pair_list;
1276: msg_size =size = gs->msg_sizes;
1277: msg_nodes=nodes = gs->node_list;
1278: iptr=pw = gs->pw_elm_list;
1279: dptr1=dptr3 = gs->pw_vals;
1280: msg_ids_in = ids_in = gs->msg_ids_in;
1281: msg_ids_out = ids_out = gs->msg_ids_out;
1282: dptr2 = gs->out;
1283: in1=in2 = gs->in;
1285: /* post the receives */
1286: /* msg_nodes=nodes; */
1287: do
1288: {
1289: /* Should MPI_ANY_SOURCE be replaced by *list ? In that case do the
1290: second one *list and do list++ afterwards */
1291: MPI_Irecv(in1, *size *step, MPIU_SCALAR, MPI_ANY_SOURCE, MSGTAG1 + *list, gs->PCTFS_gs_comm, msg_ids_in);
1292: list++;msg_ids_in++;
1293: in1 += *size++ *step;
1294: }
1295: while (*++msg_nodes);
1296: msg_nodes=nodes;
1298: /* load gs values into in out gs buffers */
1299: while (*iptr >= 0)
1300: {
1301: PCTFS_rvec_copy(dptr3,in_vals + *iptr*step,step);
1302: dptr3+=step;
1303: iptr++;
1304: }
1306: /* load out buffers and post the sends */
1307: while ((iptr = *msg_nodes++))
1308: {
1309: dptr3 = dptr2;
1310: while (*iptr >= 0)
1311: {
1312: PCTFS_rvec_copy(dptr2,dptr1 + *iptr*step,step);
1313: dptr2+=step;
1314: iptr++;
1315: }
1316: MPI_Isend(dptr3, *msg_size *step, MPIU_SCALAR, *msg_list, MSGTAG1+PCTFS_my_id, gs->PCTFS_gs_comm, msg_ids_out);
1317: msg_size++; msg_list++;msg_ids_out++;
1318: }
1320: /* tree */
1321: if (gs->max_left_over)
1322: {PCTFS_gs_gop_vec_tree_plus(gs,in_vals,step);}
1324: /* process the received data */
1325: msg_nodes=nodes;
1326: while ((iptr = *nodes++)){
1327: PetscScalar d1 = 1.0;
1328: /* Should I check the return value of MPI_Wait() or status? */
1329: /* Can this loop be replaced by a call to MPI_Waitall()? */
1330: MPI_Wait(ids_in, &status);
1331: ids_in++;
1332: while (*iptr >= 0) {
1333: dstep = PetscBLASIntCast(step);
1334: BLASaxpy_(&dstep,&d1,in2,&i1,dptr1 + *iptr*step,&i1);
1335: in2+=step;
1336: iptr++;
1337: }
1338: }
1340: /* replace vals */
1341: while (*pw >= 0)
1342: {
1343: PCTFS_rvec_copy(in_vals + *pw*step,dptr1,step);
1344: dptr1+=step;
1345: pw++;
1346: }
1348: /* clear isend message handles */
1349: /* This changed for clarity though it could be the same */
1350: while (*msg_nodes++)
1351: /* Should I check the return value of MPI_Wait() or status? */
1352: /* Can this loop be replaced by a call to MPI_Waitall()? */
1353: {MPI_Wait(ids_out, &status);ids_out++;}
1354:
1356: return(0);
1357: }
1359: /******************************************************************************/
1360: static PetscErrorCode PCTFS_gs_gop_vec_tree_plus( PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step)
1361: {
1362: PetscInt size, *in, *out;
1363: PetscScalar *buf, *work;
1364: PetscInt op[] = {GL_ADD,0};
1365: PetscBLASInt i1 = 1;
1368: /* copy over to local variables */
1369: in = gs->tree_map_in;
1370: out = gs->tree_map_out;
1371: buf = gs->tree_buf;
1372: work = gs->tree_work;
1373: size = gs->tree_nel*step;
1375: /* zero out collection buffer */
1376: PCTFS_rvec_zero(buf,size);
1379: /* copy over my contributions */
1380: while (*in >= 0)
1381: {
1382: PetscBLASInt dstep = PetscBLASIntCast(step);
1383: BLAScopy_(&dstep,vals + *in++*step,&i1,buf + *out++*step,&i1);
1384: }
1386: /* perform fan in/out on full buffer */
1387: /* must change PCTFS_grop to handle the blas */
1388: PCTFS_grop(buf,work,size,op);
1390: /* reset */
1391: in = gs->tree_map_in;
1392: out = gs->tree_map_out;
1394: /* get the portion of the results I need */
1395: while (*in >= 0)
1396: {
1397: PetscBLASInt dstep = PetscBLASIntCast(step);
1398: BLAScopy_(&dstep,buf + *out++*step,&i1,vals + *in++*step,&i1);
1399: }
1400: return(0);
1401: }
1403: /******************************************************************************/
1404: PetscErrorCode PCTFS_gs_gop_hc( PCTFS_gs_id *gs, PetscScalar *vals, const char *op, PetscInt dim)
1405: {
1409: switch (*op) {
1410: case '+':
1411: PCTFS_gs_gop_plus_hc(gs,vals,dim);
1412: break;
1413: default:
1414: PetscInfo1(0,"PCTFS_gs_gop_hc() :: %c is not a valid op",op[0]);
1415: PetscInfo(0,"PCTFS_gs_gop_hc() :: default :: plus\n");
1416: PCTFS_gs_gop_plus_hc(gs,vals,dim);
1417: break;
1418: }
1419: return(0);
1420: }
1422: /******************************************************************************/
1423: static PetscErrorCode PCTFS_gs_gop_plus_hc( PCTFS_gs_id *gs, PetscScalar *vals, PetscInt dim)
1424: {
1426: /* if there's nothing to do return */
1427: if (dim<=0)
1428: { return(0);}
1430: /* can't do more dimensions then exist */
1431: dim = PetscMin(dim,PCTFS_i_log2_num_nodes);
1433: /* local only operations!!! */
1434: if (gs->num_local)
1435: {PCTFS_gs_gop_local_plus(gs,vals);}
1437: /* if intersection tree/pairwise and local isn't empty */
1438: if (gs->num_local_gop)
1439: {
1440: PCTFS_gs_gop_local_in_plus(gs,vals);
1442: /* pairwise will do tree inside ... */
1443: if (gs->num_pairs)
1444: {PCTFS_gs_gop_pairwise_plus_hc(gs,vals,dim);}
1446: /* tree only */
1447: else if (gs->max_left_over)
1448: {PCTFS_gs_gop_tree_plus_hc(gs,vals,dim);}
1449:
1450: PCTFS_gs_gop_local_out(gs,vals);
1451: }
1452: /* if intersection tree/pairwise and local is empty */
1453: else
1454: {
1455: /* pairwise will do tree inside */
1456: if (gs->num_pairs)
1457: {PCTFS_gs_gop_pairwise_plus_hc(gs,vals,dim);}
1458:
1459: /* tree */
1460: else if (gs->max_left_over)
1461: {PCTFS_gs_gop_tree_plus_hc(gs,vals,dim);}
1462: }
1463: return(0);
1464: }
1466: /******************************************************************************/
1467: static PetscErrorCode PCTFS_gs_gop_pairwise_plus_hc( PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt dim)
1468: {
1469: PetscScalar *dptr1, *dptr2, *dptr3, *in1, *in2;
1470: PetscInt *iptr, *msg_list, *msg_size, **msg_nodes;
1471: PetscInt *pw, *list, *size, **nodes;
1472: MPI_Request *msg_ids_in, *msg_ids_out, *ids_in, *ids_out;
1473: MPI_Status status;
1474: PetscInt i, mask=1;
1478: for (i=1; i<dim; i++)
1479: {mask<<=1; mask++;}
1482: /* strip and load s */
1483: msg_list =list = gs->pair_list;
1484: msg_size =size = gs->msg_sizes;
1485: msg_nodes=nodes = gs->node_list;
1486: iptr=pw = gs->pw_elm_list;
1487: dptr1=dptr3 = gs->pw_vals;
1488: msg_ids_in = ids_in = gs->msg_ids_in;
1489: msg_ids_out = ids_out = gs->msg_ids_out;
1490: dptr2 = gs->out;
1491: in1=in2 = gs->in;
1493: /* post the receives */
1494: /* msg_nodes=nodes; */
1495: do
1496: {
1497: /* Should MPI_ANY_SOURCE be replaced by *list ? In that case do the
1498: second one *list and do list++ afterwards */
1499: if ((PCTFS_my_id|mask)==(*list|mask))
1500: {
1501: MPI_Irecv(in1, *size, MPIU_SCALAR, MPI_ANY_SOURCE, MSGTAG1 + *list, gs->PCTFS_gs_comm, msg_ids_in);
1502: list++; msg_ids_in++;in1 += *size++;
1503: }
1504: else
1505: {list++; size++;}
1506: }
1507: while (*++msg_nodes);
1509: /* load gs values into in out gs buffers */
1510: while (*iptr >= 0)
1511: {*dptr3++ = *(in_vals + *iptr++);}
1513: /* load out buffers and post the sends */
1514: msg_nodes=nodes;
1515: list = msg_list;
1516: while ((iptr = *msg_nodes++))
1517: {
1518: if ((PCTFS_my_id|mask)==(*list|mask))
1519: {
1520: dptr3 = dptr2;
1521: while (*iptr >= 0)
1522: {*dptr2++ = *(dptr1 + *iptr++);}
1523: /* CHECK PERSISTENT COMMS MODE FOR ALL THIS STUFF */
1524: /* is msg_ids_out++ correct? */
1525: MPI_Isend(dptr3, *msg_size, MPIU_SCALAR, *list, MSGTAG1+PCTFS_my_id, gs->PCTFS_gs_comm, msg_ids_out);
1526: msg_size++;list++;msg_ids_out++;
1527: }
1528: else
1529: {list++; msg_size++;}
1530: }
1532: /* do the tree while we're waiting */
1533: if (gs->max_left_over)
1534: {PCTFS_gs_gop_tree_plus_hc(gs,in_vals,dim);}
1536: /* process the received data */
1537: msg_nodes=nodes;
1538: list = msg_list;
1539: while ((iptr = *nodes++))
1540: {
1541: if ((PCTFS_my_id|mask)==(*list|mask))
1542: {
1543: /* Should I check the return value of MPI_Wait() or status? */
1544: /* Can this loop be replaced by a call to MPI_Waitall()? */
1545: MPI_Wait(ids_in, &status);
1546: ids_in++;
1547: while (*iptr >= 0)
1548: {*(dptr1 + *iptr++) += *in2++;}
1549: }
1550: list++;
1551: }
1553: /* replace vals */
1554: while (*pw >= 0)
1555: {*(in_vals + *pw++) = *dptr1++;}
1557: /* clear isend message handles */
1558: /* This changed for clarity though it could be the same */
1559: while (*msg_nodes++)
1560: {
1561: if ((PCTFS_my_id|mask)==(*msg_list|mask))
1562: {
1563: /* Should I check the return value of MPI_Wait() or status? */
1564: /* Can this loop be replaced by a call to MPI_Waitall()? */
1565: MPI_Wait(ids_out, &status);
1566: ids_out++;
1567: }
1568: msg_list++;
1569: }
1571: return(0);
1572: }
1574: /******************************************************************************/
1575: static PetscErrorCode PCTFS_gs_gop_tree_plus_hc(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt dim)
1576: {
1577: PetscInt size;
1578: PetscInt *in, *out;
1579: PetscScalar *buf, *work;
1580: PetscInt op[] = {GL_ADD,0};
1583: in = gs->tree_map_in;
1584: out = gs->tree_map_out;
1585: buf = gs->tree_buf;
1586: work = gs->tree_work;
1587: size = gs->tree_nel;
1589: PCTFS_rvec_zero(buf,size);
1591: while (*in >= 0)
1592: {*(buf + *out++) = *(vals + *in++);}
1594: in = gs->tree_map_in;
1595: out = gs->tree_map_out;
1597: PCTFS_grop_hc(buf,work,size,op,dim);
1599: while (*in >= 0)
1600: {*(vals + *in++) = *(buf + *out++);}
1601: return(0);
1602: }