Actual source code: threadcommred.c

petsc-3.4.5 2014-06-29
  1: /* This file contains code for threaded reductions */
  2: #include <petsc-private/threadcommimpl.h>      /*I "petscthreadcomm.h" I*/

  6: /*@C
  7:    PetscThreadReductionBegin - Initiates a threaded reduction and returns a
  8:                                reduction object to be passed to PetscThreadCommRunKernel

 10:    Input Parameters:
 11: +  comm - the MPI comm
 12: .  op   - the reduction operation
 13: .  type - the data type for reduction
 14: -  nreds - Number of reductions

 16:    Output Parameters:
 17: .  redout  - the reduction context

 19:    Level: developer

 21:    Notes:
 22:    See include/petscthreadcomm.h for the available reduction operations

 24:    To be called from the main thread before calling PetscThreadCommRunKernel

 26: .seealso: PetscThreadCommReductionKernelPost(), PetscThreadCommReductionKernelEnd(), PetscThreadCommReductionEnd()
 27: @*/
 28: PetscErrorCode PetscThreadReductionBegin(MPI_Comm comm,PetscThreadCommReductionOp op, PetscDataType type,PetscInt nreds,PetscThreadCommReduction *redout)
 29: {
 30:   PetscErrorCode           ierr;
 31:   PetscThreadComm          tcomm;
 32:   PetscInt                 i;
 33:   PetscThreadCommRedCtx    redctx;
 34:   PetscThreadCommReduction red;

 37:   PetscCommGetThreadComm(comm,&tcomm);
 38:   red  = tcomm->red;
 39:   if (red->ctr+nreds > PETSC_REDUCTIONS_MAX) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Reductions in operation: %D Max. allowed: %D",red->ctr+nreds,PETSC_REDUCTIONS_MAX);
 40:   for (i=red->ctr; i<red->ctr+nreds; i++) {
 41:     redctx             = &red->redctx[i];
 42:     redctx->op         = op;
 43:     redctx->type       = type;
 44:     redctx->red_status = THREADCOMM_REDUCTION_NEW;
 45:     redctx->tcomm      = tcomm;
 46:   }
 47:   red->nreds += nreds;
 48:   red->ctr    = red->ctr+nreds;
 49:   *redout     = red;
 50:   return(0);
 51: }

 55: /*
 56:    PetscThreadCommReductionDestroy - Destroys the reduction context

 58:    Input Parameters:
 59: .  red - the reduction context

 61: */
 62: PetscErrorCode PetscThreadCommReductionDestroy(PetscThreadCommReduction red)
 63: {

 67:   if (!red) return(0);

 69:   PetscFree(red->redctx[0].thread_status);
 70:   PetscFree(red->redctx[0].local_red);
 71:   PetscFree(red->redctx);
 72:   PetscFree(red->thread_ctr);
 73:   PetscFree(red);
 74:   return(0);
 75: }

 79: /*
 80:    PetscThreadReductionKernelPost - Begins a threaded reduction operation

 82:    Input Parameters:
 83: +  trank   - Rank of the calling thread
 84: .  red     - the reduction context
 85: .  lred    - local contribution from the thread

 87:    Level: developer

 89:    Notes:
 90:    This routine posts the local reduction of each thread in the reduction context and
 91:    updates its reduction status.

 93:    Must call PetscThreadReductionBegin before launching the kernel.
 94: */
 95: PetscErrorCode PetscThreadReductionKernelPost(PetscInt trank,PetscThreadCommReduction red,void *lred)
 96: {
 97:   PetscThreadCommRedCtx redctx=&red->redctx[red->thread_ctr[trank]];
 98:   red->thread_ctr[trank] = (red->thread_ctr[trank]+1)%PETSC_REDUCTIONS_MAX;

100:   if (PetscReadOnce(int,redctx->red_status) != THREADCOMM_REDUCTION_NEW) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Did not call PetscThreadReductionBegin() before calling PetscThreadCommRunKernel()");

102:   if (redctx->op == THREADCOMM_MAXLOC || redctx->op == THREADCOMM_MINLOC) {
103:     switch (redctx->type) {
104:     case PETSC_INT:
105:       ((PetscInt*)redctx->local_red)[trank]                             = ((PetscInt*)lred)[0];
106:       ((PetscInt*)redctx->local_red)[redctx->tcomm->nworkThreads+trank] = ((PetscInt*)lred)[1];
107:       break;
108: #if defined(PETSC_USE_COMPLEX)
109:     case PETSC_REAL:
110:       ((PetscReal*)redctx->local_red)[trank]                             = ((PetscReal*)lred)[0];
111:       ((PetscReal*)redctx->local_red)[redctx->tcomm->nworkThreads+trank] = ((PetscReal*)lred)[1];
112:       break;
113: #endif
114:     case PETSC_SCALAR:
115:       ((PetscScalar*)redctx->local_red)[trank]                             = ((PetscScalar*)lred)[0];
116:       ((PetscScalar*)redctx->local_red)[redctx->tcomm->nworkThreads+trank] = ((PetscScalar*)lred)[1];
117:       break;
118:     default:
119:       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Unknown datatype provided for kernel reduction");
120:       break;
121:     }
122:   } else {
123:     switch (redctx->type) {
124:     case PETSC_INT:
125:       ((PetscInt*)redctx->local_red)[trank] = *(PetscInt*)lred;
126:       break;
127: #if defined(PETSC_USE_COMPLEX)
128:     case PETSC_REAL:
129:       ((PetscReal*)redctx->local_red)[trank] = *(PetscReal*)lred;
130:       break;
131: #endif
132:     case PETSC_SCALAR:
133:       ((PetscScalar*)redctx->local_red)[trank] = *(PetscScalar*)lred;
134:       break;
135:     default:
136:       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Unknown datatype provided for kernel reduction");
137:       break;
138:     }
139:   }
140:   redctx->thread_status[trank] = THREADCOMM_THREAD_POSTED_LOCALRED;
141:   return 0;
142: }

144: /* Completes the given reduction */
147: PetscErrorCode PetscThreadReductionEnd_Private(PetscThreadCommRedCtx redctx,void * outdata)
148: {
149:   /* Check whether all threads have posted their contributions */
150:   PetscBool wait=PETSC_TRUE;
151:   PetscInt  i;
152:   while (wait) {
153:     for (i=0; i < redctx->tcomm->nworkThreads; i++) {
154:       if (PetscReadOnce(int,redctx->thread_status[i]) != THREADCOMM_THREAD_POSTED_LOCALRED) {
155:         wait = PETSC_TRUE;
156:         break;
157:       }
158:       wait = PETSC_FALSE;
159:     }
160:   }

162:   /* Apply the reduction operation */
163:   switch (redctx->op) {
164:   case THREADCOMM_SUM:
165:     if (redctx->type == PETSC_REAL) {
166:       PetscReal red_sum=0.0;
167:       for (i=0; i < redctx->tcomm->nworkThreads; i++) red_sum += ((PetscReal*)redctx->local_red)[i];
168:       PetscMemcpy(outdata,&red_sum,sizeof(PetscReal));
169:       break;
170:     }
171:     if (redctx->type == PETSC_SCALAR) {
172:       PetscScalar red_sum=0.0;
173:       for (i=0; i < redctx->tcomm->nworkThreads; i++) red_sum += ((PetscScalar*)redctx->local_red)[i];
174:       PetscMemcpy(outdata,&red_sum,sizeof(PetscScalar));
175:       break;
176:     }
177:     if (redctx->type == PETSC_INT) {
178:       PetscInt red_sum=0;
179:       for (i=0; i < redctx->tcomm->nworkThreads; i++) red_sum += ((PetscInt*)redctx->local_red)[i];
180:       PetscMemcpy(outdata,&red_sum,sizeof(PetscInt));
181:     }
182:     break;
183:   case THREADCOMM_PROD:
184:     if (redctx->type == PETSC_REAL) {
185:       PetscReal red_prod=0.0;
186:       for (i=0; i < redctx->tcomm->nworkThreads; i++) red_prod *= ((PetscReal*)redctx->local_red)[i];
187:       PetscMemcpy(outdata,&red_prod,sizeof(PetscReal));
188:       break;
189:     }
190:     if (redctx->type == PETSC_SCALAR) {
191:       PetscScalar red_prod=0.0;
192:       for (i=0; i < redctx->tcomm->nworkThreads; i++) red_prod *= ((PetscScalar*)redctx->local_red)[i];
193:       PetscMemcpy(outdata,&red_prod,sizeof(PetscScalar));
194:       break;
195:     }
196:     if (redctx->type == PETSC_INT) {
197:       PetscInt red_prod=0.0;
198:       for (i=0; i < redctx->tcomm->nworkThreads; i++) red_prod *= ((PetscInt*)redctx->local_red)[i];
199:       PetscMemcpy(outdata,&red_prod,sizeof(PetscInt));
200:     }
201:     break;
202:   case THREADCOMM_MIN:
203: #if defined(PETSC_USE_COMPLEX)
204:     if (redctx->type == PETSC_REAL) {
205:       PetscReal min = ((PetscReal*)redctx->local_red)[0];
206:       for (i=1; i < redctx->tcomm->nworkThreads; i++) {
207:         if (((PetscReal*)redctx->local_red)[i] < min) min = ((PetscReal*)redctx->local_red)[i];
208:       }
209:       PetscMemcpy(outdata,&min,sizeof(PetscReal));
210:       break;
211:     }
212: #endif
213:     if (redctx->type == PETSC_SCALAR) {
214:       PetscScalar min = ((PetscScalar*)redctx->local_red)[0];
215:       for (i=1; i < redctx->tcomm->nworkThreads; i++) {
216:         if (PetscRealPart(((PetscScalar*)redctx->local_red)[i]) < PetscRealPart(min)) min = ((PetscScalar*)redctx->local_red)[i];
217:       }
218:       PetscMemcpy(outdata,&min,sizeof(PetscScalar));
219:       break;
220:     }
221:     if (redctx->type == PETSC_INT) {
222:       PetscInt min = ((PetscInt*)redctx->local_red)[0];
223:       for (i=1; i < redctx->tcomm->nworkThreads; i++) {
224:         if (((PetscInt*)redctx->local_red)[i] < min) min = ((PetscInt*)redctx->local_red)[i];
225:       }
226:       PetscMemcpy(outdata,&min,sizeof(PetscInt));
227:     }
228:     break;
229:   case THREADCOMM_MAX:
230: #if defined(PETSC_USE_COMPLEX)
231:     if (redctx->type == PETSC_REAL) {
232:       PetscReal max = ((PetscReal*)redctx->local_red)[0];
233:       for (i=1; i < redctx->tcomm->nworkThreads; i++) {
234:         if (((PetscReal*)redctx->local_red)[i] > max) max = ((PetscReal*)redctx->local_red)[i];
235:       }
236:       PetscMemcpy(outdata,&max,sizeof(PetscReal));
237:       break;
238:     }
239: #endif
240:     if (redctx->type == PETSC_SCALAR) {
241:       PetscScalar max = ((PetscScalar*)redctx->local_red)[0];
242:       for (i=1; i < redctx->tcomm->nworkThreads; i++) {
243:         if (PetscRealPart(((PetscScalar*)redctx->local_red)[i]) > PetscRealPart(max)) max = ((PetscScalar*)redctx->local_red)[i];
244:       }
245:       PetscMemcpy(outdata,&max,sizeof(PetscScalar));
246:       break;
247:     }
248:     if (redctx->type == PETSC_INT) {
249:       PetscInt max = ((PetscInt*)redctx->local_red)[0];
250:       for (i=1; i < redctx->tcomm->nworkThreads; i++) {
251:         if (((PetscInt*)redctx->local_red)[i] > max) max = ((PetscInt*)redctx->local_red)[i];
252:       }
253:       PetscMemcpy(outdata,&max,sizeof(PetscInt));
254:     }
255:     break;
256:   case THREADCOMM_MAXLOC:
257: #if defined(PETSC_USE_COMPLEX)
258:     if (redctx->type == PETSC_REAL) {
259:       PetscReal maxloc[2];
260:       maxloc[0] = ((PetscReal*)redctx->local_red)[0];
261:       maxloc[1] = ((PetscReal*)redctx->local_red)[redctx->tcomm->nworkThreads];
262:       for (i=1; i < redctx->tcomm->nworkThreads; i++) {
263:         if (((PetscReal*)redctx->local_red)[i] > maxloc[0]) {
264:           maxloc[0] = ((PetscReal*)redctx->local_red)[i];
265:           maxloc[1] = ((PetscReal*)redctx->local_red)[redctx->tcomm->nworkThreads+i];
266:         }
267:       }
268:       PetscMemcpy(outdata,maxloc,2*sizeof(PetscReal));
269:       break;
270:     }
271: #endif
272:     if (redctx->type == PETSC_SCALAR) {
273:       PetscScalar maxloc[2];
274:       maxloc[0] = ((PetscScalar*)redctx->local_red)[0];
275:       maxloc[1] = ((PetscScalar*)redctx->local_red)[redctx->tcomm->nworkThreads];
276:       for (i=1; i < redctx->tcomm->nworkThreads; i++) {
277:         if (PetscRealPart(((PetscScalar*)redctx->local_red)[i]) > PetscRealPart(maxloc[0])) {
278:           maxloc[0] = ((PetscScalar*)redctx->local_red)[i];
279:           maxloc[1] = ((PetscScalar*)redctx->local_red)[redctx->tcomm->nworkThreads+i];
280:         }
281:       }
282:       PetscMemcpy(outdata,maxloc,2*sizeof(PetscScalar));
283:       break;
284:     }
285:     if (redctx->type == PETSC_INT) {
286:       PetscInt maxloc[2];
287:       maxloc[0] = ((PetscInt*)redctx->local_red)[0];
288:       maxloc[1] = ((PetscInt*)redctx->local_red)[redctx->tcomm->nworkThreads];
289:       for (i=1; i < redctx->tcomm->nworkThreads; i++) {
290:         if (((PetscInt*)redctx->local_red)[i] > maxloc[0]) {
291:           maxloc[0] = ((PetscInt*)redctx->local_red)[i];
292:           maxloc[1] = ((PetscInt*)redctx->local_red)[redctx->tcomm->nworkThreads+i];
293:         }
294:       }
295:       PetscMemcpy(outdata,maxloc,2*sizeof(PetscInt));
296:     }
297:     break;
298:   case THREADCOMM_MINLOC:
299: #if defined(PETSC_USE_COMPLEX)
300:     if (redctx->type == PETSC_REAL) {
301:       PetscReal minloc[2];
302:       minloc[0] = ((PetscReal*)redctx->local_red)[0];
303:       minloc[1] = ((PetscReal*)redctx->local_red)[redctx->tcomm->nworkThreads];
304:       for (i=1; i < redctx->tcomm->nworkThreads; i++) {
305:         if (((PetscReal*)redctx->local_red)[i] < minloc[0]) {
306:           minloc[0] = ((PetscReal*)redctx->local_red)[i];
307:           minloc[1] = ((PetscReal*)redctx->local_red)[redctx->tcomm->nworkThreads+i];
308:         }
309:       }
310:       PetscMemcpy(outdata,minloc,2*sizeof(PetscReal));
311:       break;
312:     }
313: #endif
314:     if (redctx->type == PETSC_SCALAR) {
315:       PetscScalar minloc[2];
316:       minloc[0] = ((PetscScalar*)redctx->local_red)[0];
317:       minloc[1] = ((PetscScalar*)redctx->local_red)[redctx->tcomm->nworkThreads];
318:       for (i=1; i < redctx->tcomm->nworkThreads; i++) {
319:         if (PetscRealPart(((PetscScalar*)redctx->local_red)[i]) < PetscRealPart(minloc[0])) {
320:           minloc[0] = ((PetscScalar*)redctx->local_red)[i];
321:           minloc[1] = ((PetscScalar*)redctx->local_red)[redctx->tcomm->nworkThreads+i];
322:         }
323:       }
324:       PetscMemcpy(outdata,minloc,2*sizeof(PetscScalar));
325:       break;
326:     }
327:     if (redctx->type == PETSC_INT) {
328:       PetscInt minloc[2];
329:       minloc[0] = ((PetscInt*)redctx->local_red)[0];
330:       minloc[1] = ((PetscInt*)redctx->local_red)[redctx->tcomm->nworkThreads];
331:       for (i=1; i < redctx->tcomm->nworkThreads; i++) {
332:         if (((PetscInt*)redctx->local_red)[i] < minloc[0]) {
333:           minloc[0] = ((PetscInt*)redctx->local_red)[i];
334:           minloc[1] = ((PetscInt*)redctx->local_red)[redctx->tcomm->nworkThreads+i];
335:         }
336:       }
337:       PetscMemcpy(outdata,minloc,2*sizeof(PetscInt));
338:     }
339:     break;
340:   default:
341:     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Undefined thread reduction operation");
342:     break;
343:   }
344:   return 0;
345: }

349: /*@C
350:    PetscThreadReductionEnd - Completes the given reduction

352:    Input Parameters:
353: +  red     - the reduction context
354: .  outdata - the reduction result

356:    Level: developer

358:    Notes:
359:    To be called by the main thread only
360: @*/
361: PetscErrorCode PetscThreadReductionEnd(PetscThreadCommReduction red,void *outdata)
362: {
363:   PetscErrorCode        ierr;
364:   PetscThreadCommRedCtx redctx;
365:   PetscInt              i;

368:   redctx             = &red->redctx[red->ctr-red->nreds];
369:   PetscThreadReductionEnd_Private(redctx,outdata);
370:   redctx->red_status = THREADCOMM_REDUCTION_COMPLETE;
371:   red->nreds--;
372:   if (!red->nreds) {
373:     /* Reset the counters */
374:     red->ctr=0;
375:     for (i=0; i<redctx->tcomm->nworkThreads; i++) red->thread_ctr[i] = 0;
376:   }
377:   return(0);
378: }

382: /*
383:    PetscThreadReductionKernelEnd - Finishes a reduction operation

385:    Input Parameters:
386: +  trank   - Rank of the calling thread
387: .  red     - the reduction context
388: -  outdata - the reduction result

390:    Level: developer

392:    Notes: This should be called only from kernels only if the reduction needs to
393:    be completed while in the kernel for some future operation.

395: */
396: PetscErrorCode PetscThreadReductionKernelEnd(PetscInt trank,PetscThreadCommReduction red,void *outdata)
397: {
398:   PetscThreadCommRedCtx redctx=&red->redctx[red->ctr];

400:   if (PetscReadOnce(int,redctx->tcomm->leader) == trank) {
401:     PetscThreadReductionEnd_Private(redctx,outdata);
402:     redctx->red_status = THREADCOMM_REDUCTION_COMPLETE;
403:     red->ctr++;
404:   }

406:   /* Wait till the leader performs the reduction so that the other threads
407:      can also see the reduction result */
408:   while (PetscReadOnce(int,redctx->red_status) != THREADCOMM_REDUCTION_COMPLETE) ;
409:   redctx->thread_status[trank] = THREADCOMM_THREAD_WAITING_FOR_NEWRED;
410:   return 0;
411: }

415: /*
416:    PetscThreadCommReductionCreate - Allocates the reduction context and
417:                                    initializes it

419:    Input Parameters:
420: +  tcomm - the thread communicator
421: .  red   - the reduction context

423: */
424: PetscErrorCode PetscThreadCommReductionCreate(PetscThreadComm tcomm,PetscThreadCommReduction *newred)
425: {
426:   PetscErrorCode           ierr;
427:   PetscThreadCommReduction redout;
428:   PetscThreadCommRedCtx    redctx;
429:   PetscInt                 i,j;

432:   PetscNew(struct _p_PetscThreadCommReduction,&redout);
433:   redout->nreds=0;
434:   redout->ctr  = 0;
435:   PetscMalloc(PETSC_REDUCTIONS_MAX*sizeof(struct _p_PetscThreadCommRedCtx),&redout->redctx);
436:   PetscMalloc(PETSC_REDUCTIONS_MAX*tcomm->nworkThreads*sizeof(PetscInt),&redout->redctx[0].thread_status);
437:   /* Note that the size of local_red is twice the number of threads. The first half holds the local reductions
438:      from each thread while the second half is used only for maxloc and minloc operations to hold the local max and min locations
439:   */
440:   PetscMalloc(PETSC_REDUCTIONS_MAX*2*tcomm->nworkThreads*sizeof(PetscScalar),&redout->redctx[0].local_red);
441:   for (i=0; i < PETSC_REDUCTIONS_MAX; i++) {
442:     redctx                = &redout->redctx[i];
443:     redctx->thread_status = redout->redctx[0].thread_status + i*tcomm->nworkThreads;
444:     for (j=0; j<tcomm->nworkThreads; j++) redctx->thread_status[j] = THREADCOMM_THREAD_WAITING_FOR_NEWRED;
445:     redctx->local_red  = (char*)redout->redctx[0].local_red + i*2*tcomm->nworkThreads*sizeof(PetscScalar);
446:     redctx->red_status = THREADCOMM_REDUCTION_NONE;
447:   }
448:   PetscMalloc(tcomm->nworkThreads*sizeof(PetscInt),&redout->thread_ctr);
449:   for (j=0; j<tcomm->nworkThreads; j++) redout->thread_ctr[j] = 0;
450:   *newred = redout;
451:   return(0);
452: }