Actual source code: threadcommred.c
petsc-3.5.4 2015-05-23
/* This file contains code for threaded reductions */
#include <petsc-private/threadcommimpl.h> /*I "petscthreadcomm.h" I*/
/*@C
   PetscThreadReductionBegin - Initiates a threaded reduction and returns a
   reduction object to be passed to PetscThreadCommRunKernel

   Input Parameters:
+  comm  - the MPI communicator
.  op    - the reduction operation
.  type  - the data type for the reduction
-  nreds - number of reductions

   Output Parameters:
.  redout - the reduction context

   Level: developer

   Notes:
   See include/petscthreadcomm.h for the available reduction operations.

   To be called from the main thread before calling PetscThreadCommRunKernel().

.seealso: PetscThreadReductionKernelPost(), PetscThreadReductionKernelEnd(), PetscThreadReductionEnd()
@*/
PetscErrorCode PetscThreadReductionBegin(MPI_Comm comm,PetscThreadCommReductionOp op,PetscDataType type,PetscInt nreds,PetscThreadCommReduction *redout)
{
  PetscErrorCode           ierr;
  PetscThreadComm          tcomm;
  PetscInt                 i;
  PetscThreadCommRedCtx    redctx;
  PetscThreadCommReduction red;

  PetscFunctionBegin;
  ierr = PetscCommGetThreadComm(comm,&tcomm);CHKERRQ(ierr);
  red  = tcomm->red;
  if (red->ctr+nreds > PETSC_REDUCTIONS_MAX) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Reductions in operation: %D Max. allowed: %D",red->ctr+nreds,PETSC_REDUCTIONS_MAX);
  for (i=red->ctr; i<red->ctr+nreds; i++) {
    redctx             = &red->redctx[i];
    redctx->op         = op;
    redctx->type       = type;
    redctx->red_status = THREADCOMM_REDUCTION_NEW;
    redctx->tcomm      = tcomm;
  }
  red->nreds += nreds;
  red->ctr    = red->ctr+nreds;
  *redout     = red;
  PetscFunctionReturn(0);
}
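A minimal main-thread usage sketch follows. ComputeSum and SumKernel are illustrative names, not PETSc API, and the launch assumes the variadic PetscThreadCommRunKernel() interface declared in petscthreadcomm.h; this is a sketch of the Begin/RunKernel/End pattern, not a definitive implementation.

#include <petscthreadcomm.h>

extern PetscErrorCode SumKernel(PetscInt trank,PetscThreadCommReduction red); /* posts a per-thread partial sum */

PetscErrorCode ComputeSum(MPI_Comm comm,PetscScalar *sum)
{
  PetscThreadCommReduction red;
  PetscErrorCode           ierr;

  /* Reserve one THREADCOMM_SUM reduction before launching the kernel */
  ierr = PetscThreadReductionBegin(comm,THREADCOMM_SUM,PETSC_SCALAR,1,&red);CHKERRQ(ierr);
  ierr = PetscThreadCommRunKernel(comm,(PetscThreadKernel)SumKernel,1,red);CHKERRQ(ierr);
  /* Collect the combined result on the main thread */
  ierr = PetscThreadReductionEnd(red,sum);CHKERRQ(ierr);
  return 0;
}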
/*
   PetscThreadCommReductionDestroy - Destroys the reduction context

   Input Parameters:
.  red - the reduction context
*/
PetscErrorCode PetscThreadCommReductionDestroy(PetscThreadCommReduction red)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (!red) PetscFunctionReturn(0);
  ierr = PetscFree(red->redctx[0].thread_status);CHKERRQ(ierr);
  ierr = PetscFree(red->redctx[0].local_red);CHKERRQ(ierr);
  ierr = PetscFree(red->redctx);CHKERRQ(ierr);
  ierr = PetscFree(red->thread_ctr);CHKERRQ(ierr);
  ierr = PetscFree(red);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
/*
   PetscThreadReductionKernelPost - Posts a thread's local contribution to a reduction

   Input Parameters:
+  trank - rank of the calling thread
.  red   - the reduction context
-  lred  - local contribution from the thread

   Level: developer

   Notes:
   This routine posts the local reduction of each thread in the reduction context and
   updates its reduction status.

   PetscThreadReductionBegin() must be called before launching the kernel.
*/
PetscErrorCode PetscThreadReductionKernelPost(PetscInt trank,PetscThreadCommReduction red,void *lred)
{
  PetscThreadCommRedCtx redctx=&red->redctx[red->thread_ctr[trank]];
  red->thread_ctr[trank] = (red->thread_ctr[trank]+1)%PETSC_REDUCTIONS_MAX;

  if (PetscReadOnce(int,redctx->red_status) != THREADCOMM_REDUCTION_NEW) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Did not call PetscThreadReductionBegin() before calling PetscThreadCommRunKernel()");

  if (redctx->op == THREADCOMM_MAXLOC || redctx->op == THREADCOMM_MINLOC) {
    /* For maxloc/minloc, lred holds two entries {value, location}; the location is
       stored in the second half of local_red */
    switch (redctx->type) {
    case PETSC_INT:
      ((PetscInt*)redctx->local_red)[trank]                             = ((PetscInt*)lred)[0];
      ((PetscInt*)redctx->local_red)[redctx->tcomm->nworkThreads+trank] = ((PetscInt*)lred)[1];
      break;
#if defined(PETSC_USE_COMPLEX)
    /* Without complex scalars PETSC_REAL equals PETSC_SCALAR, so PETSC_REAL is a
       distinct case only in complex builds */
    case PETSC_REAL:
      ((PetscReal*)redctx->local_red)[trank]                             = ((PetscReal*)lred)[0];
      ((PetscReal*)redctx->local_red)[redctx->tcomm->nworkThreads+trank] = ((PetscReal*)lred)[1];
      break;
#endif
    case PETSC_SCALAR:
      ((PetscScalar*)redctx->local_red)[trank]                             = ((PetscScalar*)lred)[0];
      ((PetscScalar*)redctx->local_red)[redctx->tcomm->nworkThreads+trank] = ((PetscScalar*)lred)[1];
      break;
    default:
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Unknown datatype provided for kernel reduction");
      break;
    }
  } else {
    switch (redctx->type) {
    case PETSC_INT:
      ((PetscInt*)redctx->local_red)[trank] = *(PetscInt*)lred;
      break;
#if defined(PETSC_USE_COMPLEX)
    case PETSC_REAL:
      ((PetscReal*)redctx->local_red)[trank] = *(PetscReal*)lred;
      break;
#endif
    case PETSC_SCALAR:
      ((PetscScalar*)redctx->local_red)[trank] = *(PetscScalar*)lred;
      break;
    default:
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Unknown datatype provided for kernel reduction");
      break;
    }
  }
  redctx->thread_status[trank] = THREADCOMM_THREAD_POSTED_LOCALRED;
  return 0;
}
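Sketches of the kernel side (illustrative names; each assumes the thread has already computed its share of the work). For maxloc/minloc the posted buffer must hold two entries, {value, location}, as the code above shows:

PetscErrorCode SumKernel(PetscInt trank,PetscThreadCommReduction red)
{
  PetscScalar lsum = 0.0;
  /* ... accumulate this thread's portion of the data into lsum ... */
  PetscThreadReductionKernelPost(trank,red,&lsum);
  return 0;
}

PetscErrorCode MaxlocKernel(PetscInt trank,PetscThreadCommReduction red)
{
  PetscScalar lmax[2];
  lmax[0] = 0.0; /* this thread's maximum value       */
  lmax[1] = 0.0; /* index at which the maximum occurs */
  /* ... scan this thread's portion, updating lmax[0] and lmax[1] ... */
  PetscThreadReductionKernelPost(trank,red,lmax);
  return 0;
}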
/* Completes the given reduction */
PetscErrorCode PetscThreadReductionEnd_Private(PetscThreadCommRedCtx redctx,void *outdata)
{
  /* Check whether all threads have posted their contributions */
  PetscBool wait=PETSC_TRUE;
  PetscInt  i;

  /* Busy-wait until every thread's status shows a posted local reduction;
     PetscReadOnce forces a volatile read so the flag is not cached */
  while (wait) {
    for (i=0; i < redctx->tcomm->nworkThreads; i++) {
      if (PetscReadOnce(int,redctx->thread_status[i]) != THREADCOMM_THREAD_POSTED_LOCALRED) {
        wait = PETSC_TRUE;
        break;
      }
      wait = PETSC_FALSE;
    }
  }

  /* Apply the reduction operation */
  switch (redctx->op) {
  case THREADCOMM_SUM:
    if (redctx->type == PETSC_REAL) {
      PetscReal red_sum=0.0;
      for (i=0; i < redctx->tcomm->nworkThreads; i++) red_sum += ((PetscReal*)redctx->local_red)[i];
      PetscMemcpy(outdata,&red_sum,sizeof(PetscReal));
      break;
    }
    if (redctx->type == PETSC_SCALAR) {
      PetscScalar red_sum=0.0;
      for (i=0; i < redctx->tcomm->nworkThreads; i++) red_sum += ((PetscScalar*)redctx->local_red)[i];
      PetscMemcpy(outdata,&red_sum,sizeof(PetscScalar));
      break;
    }
    if (redctx->type == PETSC_INT) {
      PetscInt red_sum=0;
      for (i=0; i < redctx->tcomm->nworkThreads; i++) red_sum += ((PetscInt*)redctx->local_red)[i];
      PetscMemcpy(outdata,&red_sum,sizeof(PetscInt));
    }
    break;
  case THREADCOMM_PROD:
    if (redctx->type == PETSC_REAL) {
      PetscReal red_prod=1.0;   /* a product must start at 1, not 0 */
      for (i=0; i < redctx->tcomm->nworkThreads; i++) red_prod *= ((PetscReal*)redctx->local_red)[i];
      PetscMemcpy(outdata,&red_prod,sizeof(PetscReal));
      break;
    }
    if (redctx->type == PETSC_SCALAR) {
      PetscScalar red_prod=1.0;
      for (i=0; i < redctx->tcomm->nworkThreads; i++) red_prod *= ((PetscScalar*)redctx->local_red)[i];
      PetscMemcpy(outdata,&red_prod,sizeof(PetscScalar));
      break;
    }
    if (redctx->type == PETSC_INT) {
      PetscInt red_prod=1;
      for (i=0; i < redctx->tcomm->nworkThreads; i++) red_prod *= ((PetscInt*)redctx->local_red)[i];
      PetscMemcpy(outdata,&red_prod,sizeof(PetscInt));
    }
    break;
  case THREADCOMM_MIN:
#if defined(PETSC_USE_COMPLEX)
    if (redctx->type == PETSC_REAL) {
      PetscReal min = ((PetscReal*)redctx->local_red)[0];
      for (i=1; i < redctx->tcomm->nworkThreads; i++) {
        if (((PetscReal*)redctx->local_red)[i] < min) min = ((PetscReal*)redctx->local_red)[i];
      }
      PetscMemcpy(outdata,&min,sizeof(PetscReal));
      break;
    }
#endif
    if (redctx->type == PETSC_SCALAR) {
      PetscScalar min = ((PetscScalar*)redctx->local_red)[0];
      for (i=1; i < redctx->tcomm->nworkThreads; i++) {
        if (PetscRealPart(((PetscScalar*)redctx->local_red)[i]) < PetscRealPart(min)) min = ((PetscScalar*)redctx->local_red)[i];
      }
      PetscMemcpy(outdata,&min,sizeof(PetscScalar));
      break;
    }
    if (redctx->type == PETSC_INT) {
      PetscInt min = ((PetscInt*)redctx->local_red)[0];
      for (i=1; i < redctx->tcomm->nworkThreads; i++) {
        if (((PetscInt*)redctx->local_red)[i] < min) min = ((PetscInt*)redctx->local_red)[i];
      }
      PetscMemcpy(outdata,&min,sizeof(PetscInt));
    }
    break;
  case THREADCOMM_MAX:
#if defined(PETSC_USE_COMPLEX)
    if (redctx->type == PETSC_REAL) {
      PetscReal max = ((PetscReal*)redctx->local_red)[0];
      for (i=1; i < redctx->tcomm->nworkThreads; i++) {
        if (((PetscReal*)redctx->local_red)[i] > max) max = ((PetscReal*)redctx->local_red)[i];
      }
      PetscMemcpy(outdata,&max,sizeof(PetscReal));
      break;
    }
#endif
    if (redctx->type == PETSC_SCALAR) {
      PetscScalar max = ((PetscScalar*)redctx->local_red)[0];
      for (i=1; i < redctx->tcomm->nworkThreads; i++) {
        if (PetscRealPart(((PetscScalar*)redctx->local_red)[i]) > PetscRealPart(max)) max = ((PetscScalar*)redctx->local_red)[i];
      }
      PetscMemcpy(outdata,&max,sizeof(PetscScalar));
      break;
    }
    if (redctx->type == PETSC_INT) {
      PetscInt max = ((PetscInt*)redctx->local_red)[0];
      for (i=1; i < redctx->tcomm->nworkThreads; i++) {
        if (((PetscInt*)redctx->local_red)[i] > max) max = ((PetscInt*)redctx->local_red)[i];
      }
      PetscMemcpy(outdata,&max,sizeof(PetscInt));
    }
    break;
  case THREADCOMM_MAXLOC:
#if defined(PETSC_USE_COMPLEX)
    if (redctx->type == PETSC_REAL) {
      PetscReal maxloc[2];
      maxloc[0] = ((PetscReal*)redctx->local_red)[0];
      maxloc[1] = ((PetscReal*)redctx->local_red)[redctx->tcomm->nworkThreads];
      for (i=1; i < redctx->tcomm->nworkThreads; i++) {
        if (((PetscReal*)redctx->local_red)[i] > maxloc[0]) {
          maxloc[0] = ((PetscReal*)redctx->local_red)[i];
          maxloc[1] = ((PetscReal*)redctx->local_red)[redctx->tcomm->nworkThreads+i];
        }
      }
      PetscMemcpy(outdata,maxloc,2*sizeof(PetscReal));
      break;
    }
#endif
    if (redctx->type == PETSC_SCALAR) {
      PetscScalar maxloc[2];
      maxloc[0] = ((PetscScalar*)redctx->local_red)[0];
      maxloc[1] = ((PetscScalar*)redctx->local_red)[redctx->tcomm->nworkThreads];
      for (i=1; i < redctx->tcomm->nworkThreads; i++) {
        if (PetscRealPart(((PetscScalar*)redctx->local_red)[i]) > PetscRealPart(maxloc[0])) {
          maxloc[0] = ((PetscScalar*)redctx->local_red)[i];
          maxloc[1] = ((PetscScalar*)redctx->local_red)[redctx->tcomm->nworkThreads+i];
        }
      }
      PetscMemcpy(outdata,maxloc,2*sizeof(PetscScalar));
      break;
    }
    if (redctx->type == PETSC_INT) {
      PetscInt maxloc[2];
      maxloc[0] = ((PetscInt*)redctx->local_red)[0];
      maxloc[1] = ((PetscInt*)redctx->local_red)[redctx->tcomm->nworkThreads];
      for (i=1; i < redctx->tcomm->nworkThreads; i++) {
        if (((PetscInt*)redctx->local_red)[i] > maxloc[0]) {
          maxloc[0] = ((PetscInt*)redctx->local_red)[i];
          maxloc[1] = ((PetscInt*)redctx->local_red)[redctx->tcomm->nworkThreads+i];
        }
      }
      PetscMemcpy(outdata,maxloc,2*sizeof(PetscInt));
    }
    break;
  case THREADCOMM_MINLOC:
#if defined(PETSC_USE_COMPLEX)
    if (redctx->type == PETSC_REAL) {
      PetscReal minloc[2];
      minloc[0] = ((PetscReal*)redctx->local_red)[0];
      minloc[1] = ((PetscReal*)redctx->local_red)[redctx->tcomm->nworkThreads];
      for (i=1; i < redctx->tcomm->nworkThreads; i++) {
        if (((PetscReal*)redctx->local_red)[i] < minloc[0]) {
          minloc[0] = ((PetscReal*)redctx->local_red)[i];
          minloc[1] = ((PetscReal*)redctx->local_red)[redctx->tcomm->nworkThreads+i];
        }
      }
      PetscMemcpy(outdata,minloc,2*sizeof(PetscReal));
      break;
    }
#endif
    if (redctx->type == PETSC_SCALAR) {
      PetscScalar minloc[2];
      minloc[0] = ((PetscScalar*)redctx->local_red)[0];
      minloc[1] = ((PetscScalar*)redctx->local_red)[redctx->tcomm->nworkThreads];
      for (i=1; i < redctx->tcomm->nworkThreads; i++) {
        if (PetscRealPart(((PetscScalar*)redctx->local_red)[i]) < PetscRealPart(minloc[0])) {
          minloc[0] = ((PetscScalar*)redctx->local_red)[i];
          minloc[1] = ((PetscScalar*)redctx->local_red)[redctx->tcomm->nworkThreads+i];
        }
      }
      PetscMemcpy(outdata,minloc,2*sizeof(PetscScalar));
      break;
    }
    if (redctx->type == PETSC_INT) {
      PetscInt minloc[2];
      minloc[0] = ((PetscInt*)redctx->local_red)[0];
      minloc[1] = ((PetscInt*)redctx->local_red)[redctx->tcomm->nworkThreads];
      for (i=1; i < redctx->tcomm->nworkThreads; i++) {
        if (((PetscInt*)redctx->local_red)[i] < minloc[0]) {
          minloc[0] = ((PetscInt*)redctx->local_red)[i];
          minloc[1] = ((PetscInt*)redctx->local_red)[redctx->tcomm->nworkThreads+i];
        }
      }
      PetscMemcpy(outdata,minloc,2*sizeof(PetscInt));
    }
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Undefined thread reduction operation");
    break;
  }
  return 0;
}
/*@C
   PetscThreadReductionEnd - Completes the given reduction

   Input Parameters:
+  red     - the reduction context
-  outdata - location to hold the reduction result

   Level: developer

   Notes:
   To be called by the main thread only.
@*/
PetscErrorCode PetscThreadReductionEnd(PetscThreadCommReduction red,void *outdata)
{
  PetscErrorCode        ierr;
  PetscThreadCommRedCtx redctx;
  PetscInt              i;

  PetscFunctionBegin;
  redctx = &red->redctx[red->ctr-red->nreds];
  ierr   = PetscThreadReductionEnd_Private(redctx,outdata);CHKERRQ(ierr);
  redctx->red_status = THREADCOMM_REDUCTION_COMPLETE;
  red->nreds--;
  if (!red->nreds) {
    /* Reset the counters */
    red->ctr = 0;
    for (i=0; i<redctx->tcomm->nworkThreads; i++) red->thread_ctr[i] = 0;
  }
  PetscFunctionReturn(0);
}
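When ending a maxloc/minloc reduction, outdata must provide room for two entries of the reduction's datatype, {value, location}. A sketch (GetMax is an illustrative name, not PETSc API; for PETSC_SCALAR reductions the location is stored as a scalar, hence the conversion):

PetscErrorCode GetMax(PetscThreadCommReduction red,PetscScalar *maxval,PetscInt *loc)
{
  PetscScalar    result[2];
  PetscErrorCode ierr;

  ierr = PetscThreadReductionEnd(red,result);CHKERRQ(ierr);
  *maxval = result[0];                          /* the global maximum          */
  *loc    = (PetscInt)PetscRealPart(result[1]); /* location posted with it     */
  return 0;
}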
/*
   PetscThreadReductionKernelEnd - Finishes a reduction operation

   Input Parameters:
+  trank   - rank of the calling thread
.  red     - the reduction context
-  outdata - the reduction result

   Level: developer

   Notes:
   This should be called from kernels only if the reduction needs to be
   completed while in the kernel for some future operation.
*/
PetscErrorCode PetscThreadReductionKernelEnd(PetscInt trank,PetscThreadCommReduction red,void *outdata)
{
  PetscThreadCommRedCtx redctx=&red->redctx[red->ctr];

  if (PetscReadOnce(int,redctx->tcomm->leader) == trank) {
    PetscThreadReductionEnd_Private(redctx,outdata);
    redctx->red_status = THREADCOMM_REDUCTION_COMPLETE;
    red->ctr++;
  }

  /* Wait till the leader performs the reduction so that the other threads
     can also see the reduction result */
  while (PetscReadOnce(int,redctx->red_status) != THREADCOMM_REDUCTION_COMPLETE) ;
  redctx->thread_status[trank] = THREADCOMM_THREAD_WAITING_FOR_NEWRED;
  return 0;
}
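A sketch of completing a reduction inside a kernel, for the case where every thread needs the result for a later step (NormalizeKernel is an illustrative name; only the leader performs the combine, then all threads read the same total):

PetscErrorCode NormalizeKernel(PetscInt trank,PetscThreadCommReduction red)
{
  PetscScalar lsum = 0.0,total;
  /* ... accumulate this thread's partial sum into lsum ... */
  PetscThreadReductionKernelPost(trank,red,&lsum);
  PetscThreadReductionKernelEnd(trank,red,&total); /* every thread sees total */
  /* ... all threads may now use total, e.g. to scale their share of the data ... */
  return 0;
}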
/*
   PetscThreadCommReductionCreate - Allocates and initializes the reduction context

   Input Parameters:
.  tcomm - the thread communicator

   Output Parameters:
.  newred - the created reduction context
*/
PetscErrorCode PetscThreadCommReductionCreate(PetscThreadComm tcomm,PetscThreadCommReduction *newred)
{
  PetscErrorCode           ierr;
  PetscThreadCommReduction redout;
  PetscThreadCommRedCtx    redctx;
  PetscInt                 i,j;

  PetscFunctionBegin;
  ierr = PetscNew(&redout);CHKERRQ(ierr);
  redout->nreds = 0;
  redout->ctr   = 0;
  ierr = PetscMalloc1(PETSC_REDUCTIONS_MAX,(struct _p_PetscThreadCommRedCtx**)&redout->redctx);CHKERRQ(ierr);
  ierr = PetscMalloc1(PETSC_REDUCTIONS_MAX*tcomm->nworkThreads,&redout->redctx[0].thread_status);CHKERRQ(ierr);
  /* Note that the size of local_red is twice the number of threads. The first half holds
     the local reductions from each thread while the second half is used only for maxloc
     and minloc operations to hold the local max and min locations:
         [ v_0 ... v_{n-1} | loc_0 ... loc_{n-1} ],  n = tcomm->nworkThreads
  */
  ierr = PetscMalloc1(PETSC_REDUCTIONS_MAX*2*tcomm->nworkThreads,(PetscScalar**)&redout->redctx[0].local_red);CHKERRQ(ierr);
  for (i=0; i < PETSC_REDUCTIONS_MAX; i++) {
    redctx                = &redout->redctx[i];
    redctx->thread_status = redout->redctx[0].thread_status + i*tcomm->nworkThreads;
    for (j=0; j<tcomm->nworkThreads; j++) redctx->thread_status[j] = THREADCOMM_THREAD_WAITING_FOR_NEWRED;
    redctx->local_red  = (char*)redout->redctx[0].local_red + i*2*tcomm->nworkThreads*sizeof(PetscScalar);
    redctx->red_status = THREADCOMM_REDUCTION_NONE;
  }
  ierr = PetscMalloc1(tcomm->nworkThreads,&redout->thread_ctr);CHKERRQ(ierr);
  for (j=0; j<tcomm->nworkThreads; j++) redout->thread_ctr[j] = 0;
  *newred = redout;
  PetscFunctionReturn(0);
}