Actual source code: fftw.c

petsc-3.6.4 2016-04-12
Report Typos and Errors
  2: /*
  3:     Provides an interface to the FFTW package.
  4:     Testing examples can be found in ~src/mat/examples/tests
  5: */

  7: #include <../src/mat/impls/fft/fft.h>   /*I "petscmat.h" I*/
  8: EXTERN_C_BEGIN
  9: #include <fftw3-mpi.h>
 10: EXTERN_C_END

 12: typedef struct {
 13:   ptrdiff_t    ndim_fftw,*dim_fftw;
 14: #if defined(PETSC_USE_64BIT_INDICES)
 15:   fftw_iodim64 *iodims;
 16: #else
 17:   fftw_iodim   *iodims;
 18: #endif
 19:   PetscInt     partial_dim;
 20:   fftw_plan    p_forward,p_backward;
 21:   unsigned     p_flag; /* planner flags, FFTW_ESTIMATE,FFTW_MEASURE, FFTW_PATIENT, FFTW_EXHAUSTIVE */
 22:   PetscScalar  *finarray,*foutarray,*binarray,*boutarray; /* keep track of arrays becaue fftw plan should be
 23:                                                             executed for the arrays with which the plan was created */
 24: } Mat_FFTW;

 26: extern PetscErrorCode MatMult_SeqFFTW(Mat,Vec,Vec);
 27: extern PetscErrorCode MatMultTranspose_SeqFFTW(Mat,Vec,Vec);
 28: extern PetscErrorCode MatMult_MPIFFTW(Mat,Vec,Vec);
 29: extern PetscErrorCode MatMultTranspose_MPIFFTW(Mat,Vec,Vec);
 30: extern PetscErrorCode MatDestroy_FFTW(Mat);
 31: extern PetscErrorCode VecDestroy_MPIFFTW(Vec);

 33: /* MatMult_SeqFFTW performs forward DFT in parallel
 34:    Input parameter:
 35:      A - the matrix
 36:      x - the vector on which FDFT will be performed

 38:    Output parameter:
 39:      y - vector that stores result of FDFT
 40: */
 43: PetscErrorCode MatMult_SeqFFTW(Mat A,Vec x,Vec y)
 44: {
 46:   Mat_FFT        *fft  = (Mat_FFT*)A->data;
 47:   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
 48:   const PetscScalar *x_array;
 49:   PetscScalar    *y_array;
 50: #if defined(PETSC_USE_COMPLEX)
 51: #if defined(PETSC_USE_64BIT_INDICES) 
 52:   fftw_iodim64   *iodims;
 53: #else
 54:   fftw_iodim     *iodims;
 55: #endif
 56:   PetscInt       i;
 57: #endif
 58:   PetscInt       ndim = fft->ndim,*dim = fft->dim;

 61:   VecGetArrayRead(x,&x_array);
 62:   VecGetArray(y,&y_array);
 63:   if (!fftw->p_forward) { /* create a plan, then excute it */
 64:     switch (ndim) {
 65:     case 1:
 66: #if defined(PETSC_USE_COMPLEX)
 67:       fftw->p_forward = fftw_plan_dft_1d(dim[0],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
 68: #else
 69:       fftw->p_forward = fftw_plan_dft_r2c_1d(dim[0],(double*)x_array,(fftw_complex*)y_array,fftw->p_flag);
 70: #endif
 71:       break;
 72:     case 2:
 73: #if defined(PETSC_USE_COMPLEX)
 74:       fftw->p_forward = fftw_plan_dft_2d(dim[0],dim[1],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
 75: #else
 76:       fftw->p_forward = fftw_plan_dft_r2c_2d(dim[0],dim[1],(double*)x_array,(fftw_complex*)y_array,fftw->p_flag);
 77: #endif
 78:       break;
 79:     case 3:
 80: #if defined(PETSC_USE_COMPLEX)
 81:       fftw->p_forward = fftw_plan_dft_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
 82: #else
 83:       fftw->p_forward = fftw_plan_dft_r2c_3d(dim[0],dim[1],dim[2],(double*)x_array,(fftw_complex*)y_array,fftw->p_flag);
 84: #endif
 85:       break;
 86:     default:
 87: #if defined(PETSC_USE_COMPLEX)
 88:       iodims = fftw->iodims;
 89: #if defined(PETSC_USE_64BIT_INDICES)
 90:       if (ndim) {
 91:         iodims[ndim-1].n = (ptrdiff_t)dim[ndim-1];
 92:         iodims[ndim-1].is = iodims[ndim-1].os = 1;
 93:         for (i=ndim-2; i>=0; --i) {
 94:           iodims[i].n = (ptrdiff_t)dim[i];
 95:           iodims[i].is = iodims[i].os = iodims[i+1].is * iodims[i+1].n;
 96:         }
 97:       }
 98:       fftw->p_forward = fftw_plan_guru64_dft((int)ndim,(fftw_iodim64*)iodims,0,NULL,(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
 99: #else
100:       if (ndim) {
101:         iodims[ndim-1].n = (int)dim[ndim-1];
102:         iodims[ndim-1].is = iodims[ndim-1].os = 1;
103:         for (i=ndim-2; i>=0; --i) {
104:           iodims[i].n = (int)dim[i];
105:           iodims[i].is = iodims[i].os = iodims[i+1].is * iodims[i+1].n;
106:         }
107:       }
108:       fftw->p_forward = fftw_plan_guru_dft((int)ndim,(fftw_iodim*)iodims,0,NULL,(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
109: #endif

111: #else
112:       fftw->p_forward = fftw_plan_dft_r2c(ndim,(int*)dim,(double*)x_array,(fftw_complex*)y_array,fftw->p_flag);
113: #endif
114:       break;
115:     }
116:     fftw->finarray  = (PetscScalar *) x_array;
117:     fftw->foutarray = y_array;
118:     /* Warning: if (fftw->p_flag!==FFTW_ESTIMATE) The data in the in/out arrays is overwritten!
119:                 planning should be done before x is initialized! See FFTW manual sec2.1 or sec4 */
120:     fftw_execute(fftw->p_forward);
121:   } else { /* use existing plan */
122:     if (fftw->finarray != x_array || fftw->foutarray != y_array) { /* use existing plan on new arrays */
123:       fftw_execute_dft(fftw->p_forward,(fftw_complex*)x_array,(fftw_complex*)y_array);
124:     } else {
125:       fftw_execute(fftw->p_forward);
126:     }
127:   }
128:   VecRestoreArray(y,&y_array);
129:   VecRestoreArrayRead(x,&x_array);
130:   return(0);
131: }

133: /* MatMultTranspose_SeqFFTW performs serial backward DFT
134:    Input parameter:
135:      A - the matrix
136:      x - the vector on which BDFT will be performed

138:    Output parameter:
139:      y - vector that stores result of BDFT
140: */

144: PetscErrorCode MatMultTranspose_SeqFFTW(Mat A,Vec x,Vec y)
145: {
147:   Mat_FFT        *fft  = (Mat_FFT*)A->data;
148:   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
149:   const PetscScalar *x_array;
150:   PetscScalar    *y_array;
151:   PetscInt       ndim=fft->ndim,*dim=fft->dim;
152: #if defined(PETSC_USE_COMPLEX)
153: #if defined(PETSC_USE_64BIT_INDICES)
154:   fftw_iodim64   *iodims=fftw->iodims;
155: #else
156:   fftw_iodim     *iodims=fftw->iodims;
157: #endif
158: #endif
159: 
161:   VecGetArrayRead(x,&x_array);
162:   VecGetArray(y,&y_array);
163:   if (!fftw->p_backward) { /* create a plan, then excute it */
164:     switch (ndim) {
165:     case 1:
166: #if defined(PETSC_USE_COMPLEX)
167:       fftw->p_backward = fftw_plan_dft_1d(dim[0],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
168: #else
169:       fftw->p_backward= fftw_plan_dft_c2r_1d(dim[0],(fftw_complex*)x_array,(double*)y_array,fftw->p_flag);
170: #endif
171:       break;
172:     case 2:
173: #if defined(PETSC_USE_COMPLEX)
174:       fftw->p_backward = fftw_plan_dft_2d(dim[0],dim[1],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
175: #else
176:       fftw->p_backward= fftw_plan_dft_c2r_2d(dim[0],dim[1],(fftw_complex*)x_array,(double*)y_array,fftw->p_flag);
177: #endif
178:       break;
179:     case 3:
180: #if defined(PETSC_USE_COMPLEX)
181:       fftw->p_backward = fftw_plan_dft_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
182: #else
183:       fftw->p_backward= fftw_plan_dft_c2r_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(double*)y_array,fftw->p_flag);
184: #endif
185:       break;
186:     default:
187: #if defined(PETSC_USE_COMPLEX)
188: #if defined(PETSC_USE_64BIT_INDICES)
189:       fftw->p_backward = fftw_plan_guru64_dft((int)ndim,(fftw_iodim64*)iodims,0,NULL,(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
190: #else
191:       fftw->p_backward = fftw_plan_guru_dft((int)ndim,iodims,0,NULL,(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
192: #endif
193: #else
194:       fftw->p_backward= fftw_plan_dft_c2r((int)ndim,(int*)dim,(fftw_complex*)x_array,(double*)y_array,fftw->p_flag);
195: #endif
196:       break;
197:     }
198:     fftw->binarray  = (PetscScalar *) x_array;
199:     fftw->boutarray = y_array;
200:     fftw_execute(fftw->p_backward);
201:   } else { /* use existing plan */
202:     if (fftw->binarray != x_array || fftw->boutarray != y_array) { /* use existing plan on new arrays */
203:       fftw_execute_dft(fftw->p_backward,(fftw_complex*)x_array,(fftw_complex*)y_array);
204:     } else {
205:       fftw_execute(fftw->p_backward);
206:     }
207:   }
208:   VecRestoreArray(y,&y_array);
209:   VecRestoreArrayRead(x,&x_array);
210:   return(0);
211: }

213: /* MatMult_MPIFFTW performs forward DFT in parallel
214:    Input parameter:
215:      A - the matrix
216:      x - the vector on which FDFT will be performed

218:    Output parameter:
219:    y   - vector that stores result of FDFT
220: */
223: PetscErrorCode MatMult_MPIFFTW(Mat A,Vec x,Vec y)
224: {
226:   Mat_FFT        *fft  = (Mat_FFT*)A->data;
227:   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
228:   const PetscScalar *x_array;
229:   PetscScalar    *y_array;
230:   PetscInt       ndim=fft->ndim,*dim=fft->dim;
231:   MPI_Comm       comm;

234:   PetscObjectGetComm((PetscObject)A,&comm);
235:   VecGetArrayRead(x,&x_array);
236:   VecGetArray(y,&y_array);
237:   if (!fftw->p_forward) { /* create a plan, then excute it */
238:     switch (ndim) {
239:     case 1:
240: #if defined(PETSC_USE_COMPLEX)
241:       fftw->p_forward = fftw_mpi_plan_dft_1d(dim[0],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_FORWARD,fftw->p_flag);
242: #else
243:       SETERRQ(comm,PETSC_ERR_SUP,"not support for real numbers yet");
244: #endif
245:       break;
246:     case 2:
247: #if defined(PETSC_USE_COMPLEX) /* For complex transforms call fftw_mpi_plan_dft, for real transforms call fftw_mpi_plan_dft_r2c */
248:       fftw->p_forward = fftw_mpi_plan_dft_2d(dim[0],dim[1],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_FORWARD,fftw->p_flag);
249: #else
250:       fftw->p_forward = fftw_mpi_plan_dft_r2c_2d(dim[0],dim[1],(double*)x_array,(fftw_complex*)y_array,comm,FFTW_ESTIMATE);
251: #endif
252:       break;
253:     case 3:
254: #if defined(PETSC_USE_COMPLEX)
255:       fftw->p_forward = fftw_mpi_plan_dft_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_FORWARD,fftw->p_flag);
256: #else
257:       fftw->p_forward = fftw_mpi_plan_dft_r2c_3d(dim[0],dim[1],dim[2],(double*)x_array,(fftw_complex*)y_array,comm,FFTW_ESTIMATE);
258: #endif
259:       break;
260:     default:
261: #if defined(PETSC_USE_COMPLEX)
262:       fftw->p_forward = fftw_mpi_plan_dft(fftw->ndim_fftw,fftw->dim_fftw,(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_FORWARD,fftw->p_flag);
263: #else
264:       fftw->p_forward = fftw_mpi_plan_dft_r2c(fftw->ndim_fftw,fftw->dim_fftw,(double*)x_array,(fftw_complex*)y_array,comm,FFTW_ESTIMATE);
265: #endif
266:       break;
267:     }
268:     fftw->finarray  = (PetscScalar *) x_array;
269:     fftw->foutarray = y_array;
270:     /* Warning: if (fftw->p_flag!==FFTW_ESTIMATE) The data in the in/out arrays is overwritten!
271:                 planning should be done before x is initialized! See FFTW manual sec2.1 or sec4 */
272:     fftw_execute(fftw->p_forward);
273:   } else { /* use existing plan */
274:     if (fftw->finarray != x_array || fftw->foutarray != y_array) { /* use existing plan on new arrays */
275:       fftw_execute_dft(fftw->p_forward,(fftw_complex*)x_array,(fftw_complex*)y_array);
276:     } else {
277:       fftw_execute(fftw->p_forward);
278:     }
279:   }
280:   VecRestoreArray(y,&y_array);
281:   VecRestoreArrayRead(x,&x_array);
282:   return(0);
283: }

285: /* MatMultTranspose_MPIFFTW performs parallel backward DFT
286:    Input parameter:
287:      A - the matrix
288:      x - the vector on which BDFT will be performed

290:    Output parameter:
291:      y - vector that stores result of BDFT
292: */
295: PetscErrorCode MatMultTranspose_MPIFFTW(Mat A,Vec x,Vec y)
296: {
298:   Mat_FFT        *fft  = (Mat_FFT*)A->data;
299:   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
300:   const PetscScalar *x_array;
301:   PetscScalar    *y_array;
302:   PetscInt       ndim=fft->ndim,*dim=fft->dim;
303:   MPI_Comm       comm;

306:   PetscObjectGetComm((PetscObject)A,&comm);
307:   VecGetArrayRead(x,&x_array);
308:   VecGetArray(y,&y_array);
309:   if (!fftw->p_backward) { /* create a plan, then excute it */
310:     switch (ndim) {
311:     case 1:
312: #if defined(PETSC_USE_COMPLEX)
313:       fftw->p_backward = fftw_mpi_plan_dft_1d(dim[0],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_BACKWARD,fftw->p_flag);
314: #else
315:       SETERRQ(comm,PETSC_ERR_SUP,"not support for real numbers yet");
316: #endif
317:       break;
318:     case 2:
319: #if defined(PETSC_USE_COMPLEX) /* For complex transforms call fftw_mpi_plan_dft with flag FFTW_BACKWARD, for real transforms call fftw_mpi_plan_dft_c2r */
320:       fftw->p_backward = fftw_mpi_plan_dft_2d(dim[0],dim[1],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_BACKWARD,fftw->p_flag);
321: #else
322:       fftw->p_backward = fftw_mpi_plan_dft_c2r_2d(dim[0],dim[1],(fftw_complex*)x_array,(double*)y_array,comm,FFTW_ESTIMATE);
323: #endif
324:       break;
325:     case 3:
326: #if defined(PETSC_USE_COMPLEX)
327:       fftw->p_backward = fftw_mpi_plan_dft_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_BACKWARD,fftw->p_flag);
328: #else
329:       fftw->p_backward = fftw_mpi_plan_dft_c2r_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(double*)y_array,comm,FFTW_ESTIMATE);
330: #endif
331:       break;
332:     default:
333: #if defined(PETSC_USE_COMPLEX)
334:       fftw->p_backward = fftw_mpi_plan_dft(fftw->ndim_fftw,fftw->dim_fftw,(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_BACKWARD,fftw->p_flag);
335: #else
336:       fftw->p_backward = fftw_mpi_plan_dft_c2r(fftw->ndim_fftw,fftw->dim_fftw,(fftw_complex*)x_array,(double*)y_array,comm,FFTW_ESTIMATE);
337: #endif
338:       break;
339:     }
340:     fftw->binarray  = (PetscScalar *) x_array;
341:     fftw->boutarray = y_array;
342:     fftw_execute(fftw->p_backward);
343:   } else { /* use existing plan */
344:     if (fftw->binarray != x_array || fftw->boutarray != y_array) { /* use existing plan on new arrays */
345:       fftw_execute_dft(fftw->p_backward,(fftw_complex*)x_array,(fftw_complex*)y_array);
346:     } else {
347:       fftw_execute(fftw->p_backward);
348:     }
349:   }
350:   VecRestoreArray(y,&y_array);
351:   VecRestoreArrayRead(x,&x_array);
352:   return(0);
353: }

357: PetscErrorCode MatDestroy_FFTW(Mat A)
358: {
359:   Mat_FFT        *fft  = (Mat_FFT*)A->data;
360:   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;

364:   fftw_destroy_plan(fftw->p_forward);
365:   fftw_destroy_plan(fftw->p_backward);
366:   if (fftw->iodims) {
367:     free(fftw->iodims);
368:   }
369:   PetscFree(fftw->dim_fftw);
370:   PetscFree(fft->data);
371:   fftw_mpi_cleanup();
372:   return(0);
373: }

375: #include <../src/vec/vec/impls/mpi/pvecimpl.h>   /*I  "petscvec.h"   I*/
378: PetscErrorCode VecDestroy_MPIFFTW(Vec v)
379: {
381:   PetscScalar    *array;

384:   VecGetArray(v,&array);
385:   fftw_free((fftw_complex*)array);
386:   VecRestoreArray(v,&array);
387:   VecDestroy_MPI(v);
388:   return(0);
389: }

393: /*@
394:    MatCreateVecsFFTW - Get vector(s) compatible with the matrix, i.e. with the
395:      parallel layout determined by FFTW

397:    Collective on Mat

399:    Input Parameter:
400: .   A - the matrix

402:    Output Parameter:
403: +   x - (optional) input vector of forward FFTW
404: -   y - (optional) output vector of forward FFTW
405: -   z - (optional) output vector of backward FFTW

407:   Level: advanced

409:   Note: The parallel layout of output of forward FFTW is always same as the input
410:         of the backward FFTW. But parallel layout of the input vector of forward
411:         FFTW might not be same as the output of backward FFTW.
412:         Also note that we need to provide enough space while doing parallel real transform.
413:         We need to pad extra zeros at the end of last dimension. For this reason the one needs to
414:         invoke the routine fftw_mpi_local_size_transposed routines. Remember one has to change the
415:         last dimension from n to n/2+1 while invoking this routine. The number of zeros to be padded
416:         depends on if the last dimension is even or odd. If the last dimension is even need to pad two
417:         zeros if it is odd only one zero is needed.
418:         Lastly one needs some scratch space at the end of data set in each process. alloc_local
419:         figures out how much space is needed, i.e. it figures out the data+scratch space for
420:         each processor and returns that.

422: .seealso: MatCreateFFTW()
423: @*/
424: PetscErrorCode MatCreateVecsFFTW(Mat A,Vec *x,Vec *y,Vec *z)
425: {

429:   PetscTryMethod(A,"MatCreateVecsFFTW_C",(Mat,Vec*,Vec*,Vec*),(A,x,y,z));
430:   return(0);
431: }

435: PetscErrorCode  MatCreateVecsFFTW_FFTW(Mat A,Vec *fin,Vec *fout,Vec *bout)
436: {
438:   PetscMPIInt    size,rank;
439:   MPI_Comm       comm;
440:   Mat_FFT        *fft  = (Mat_FFT*)A->data;
441:   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
442:   PetscInt       N     = fft->N;
443:   PetscInt       ndim  = fft->ndim,*dim=fft->dim,n=fft->n;

448:   PetscObjectGetComm((PetscObject)A,&comm);

450:   MPI_Comm_size(comm, &size);
451:   MPI_Comm_rank(comm, &rank);
452:   if (size == 1) { /* sequential case */
453: #if defined(PETSC_USE_COMPLEX)
454:     if (fin) {VecCreateSeq(PETSC_COMM_SELF,N,fin);}
455:     if (fout) {VecCreateSeq(PETSC_COMM_SELF,N,fout);}
456:     if (bout) {VecCreateSeq(PETSC_COMM_SELF,N,bout);}
457: #else
458:     if (fin) {VecCreateSeq(PETSC_COMM_SELF,n,fin);}
459:     if (fout) {VecCreateSeq(PETSC_COMM_SELF,n,fout);}
460:     if (bout) {VecCreateSeq(PETSC_COMM_SELF,n,bout);}
461: #endif
462:   } else { /* parallel cases */
463:     ptrdiff_t    alloc_local,local_n0,local_0_start;
464:     ptrdiff_t    local_n1;
465:     fftw_complex *data_fout;
466:     ptrdiff_t    local_1_start;
467: #if defined(PETSC_USE_COMPLEX)
468:     fftw_complex *data_fin,*data_bout;
469: #else
470:     double    *data_finr,*data_boutr;
471:     PetscInt  n1,N1;
472:     ptrdiff_t temp;
473: #endif

475:     switch (ndim) {
476:     case 1:
477: #if !defined(PETSC_USE_COMPLEX)
478:       SETERRQ(comm,PETSC_ERR_SUP,"FFTW does not allow parallel real 1D transform");
479: #else
480:       alloc_local = fftw_mpi_local_size_1d(dim[0],comm,FFTW_FORWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
481:       if (fin) {
482:         data_fin  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
483:         VecCreateMPIWithArray(comm,1,local_n0,N,(const PetscScalar*)data_fin,fin);

485:         (*fin)->ops->destroy = VecDestroy_MPIFFTW;
486:       }
487:       if (fout) {
488:         data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
489:         VecCreateMPIWithArray(comm,1,local_n1,N,(const PetscScalar*)data_fout,fout);

491:         (*fout)->ops->destroy = VecDestroy_MPIFFTW;
492:       }
493:       alloc_local = fftw_mpi_local_size_1d(dim[0],comm,FFTW_BACKWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
494:       if (bout) {
495:         data_bout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
496:         VecCreateMPIWithArray(comm,1,local_n1,N,(const PetscScalar*)data_bout,bout);

498:         (*bout)->ops->destroy = VecDestroy_MPIFFTW;
499:       }
500:       break;
501: #endif
502:     case 2:
503: #if !defined(PETSC_USE_COMPLEX) /* Note that N1 is no more the product of individual dimensions */
504:       alloc_local =  fftw_mpi_local_size_2d_transposed(dim[0],dim[1]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
505:       N1          = 2*dim[0]*(dim[1]/2+1); n1 = 2*local_n0*(dim[1]/2+1);
506:       if (fin) {
507:         data_finr = (double*)fftw_malloc(sizeof(double)*alloc_local*2);
508:         VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_finr,fin);

510:         (*fin)->ops->destroy = VecDestroy_MPIFFTW;
511:       }
512:       if (bout) {
513:         data_boutr = (double*)fftw_malloc(sizeof(double)*alloc_local*2);
514:         VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_boutr,bout);

516:         (*bout)->ops->destroy = VecDestroy_MPIFFTW;
517:       }
518:       if (fout) {
519:         data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
520:         VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_fout,fout);

522:         (*fout)->ops->destroy = VecDestroy_MPIFFTW;
523:       }
524: #else
525:       /* Get local size */
526:       alloc_local = fftw_mpi_local_size_2d(dim[0],dim[1],comm,&local_n0,&local_0_start);
527:       if (fin) {
528:         data_fin = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
529:         VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fin,fin);

531:         (*fin)->ops->destroy = VecDestroy_MPIFFTW;
532:       }
533:       if (fout) {
534:         data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
535:         VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fout,fout);

537:         (*fout)->ops->destroy = VecDestroy_MPIFFTW;
538:       }
539:       if (bout) {
540:         data_bout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
541:         VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_bout,bout);

543:         (*bout)->ops->destroy = VecDestroy_MPIFFTW;
544:       }
545: #endif
546:       break;
547:     case 3:
548: #if !defined(PETSC_USE_COMPLEX)
549:       alloc_local =  fftw_mpi_local_size_3d_transposed(dim[0],dim[1],dim[2]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
550:       N1 = 2*dim[0]*dim[1]*(dim[2]/2+1); n1 = 2*local_n0*dim[1]*(dim[2]/2+1);
551:       if (fin) {
552:         data_finr = (double*)fftw_malloc(sizeof(double)*alloc_local*2);
553:         VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_finr,fin);

555:         (*fin)->ops->destroy = VecDestroy_MPIFFTW;
556:       }
557:       if (bout) {
558:         data_boutr=(double*)fftw_malloc(sizeof(double)*alloc_local*2);
559:         VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_boutr,bout);

561:         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
562:       }

564:       if (fout) {
565:         data_fout=(fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
566:         VecCreateMPIWithArray(comm,1,n1,N1,(PetscScalar*)data_fout,fout);

568:         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
569:       }
570: #else
571:       alloc_local = fftw_mpi_local_size_3d(dim[0],dim[1],dim[2],comm,&local_n0,&local_0_start);
572:       if (fin) {
573:         data_fin  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
574:         VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fin,fin);

576:         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
577:       }
578:       if (fout) {
579:         data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
580:         VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fout,fout);

582:         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
583:       }
584:       if (bout) {
585:         data_bout  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
586:         VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_bout,bout);

588:         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
589:       }
590: #endif
591:       break;
592:     default:
593: #if !defined(PETSC_USE_COMPLEX)
594:       temp = (fftw->dim_fftw)[fftw->ndim_fftw-1];

596:       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp/2 + 1;

598:       alloc_local = fftw_mpi_local_size_transposed(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
599:       N1          = 2*N*(PetscInt)((fftw->dim_fftw)[fftw->ndim_fftw-1])/((PetscInt) temp);

601:       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp;

603:       if (fin) {
604:         data_finr=(double*)fftw_malloc(sizeof(double)*alloc_local*2);
605:         VecCreateMPIWithArray(comm,1,(PetscInt)n,N1,(PetscScalar*)data_finr,fin);

607:         (*fin)->ops->destroy = VecDestroy_MPIFFTW;
608:       }
609:       if (bout) {
610:         data_boutr=(double*)fftw_malloc(sizeof(double)*alloc_local*2);
611:         VecCreateMPIWithArray(comm,1,(PetscInt)n,N1,(PetscScalar*)data_boutr,bout);

613:         (*bout)->ops->destroy = VecDestroy_MPIFFTW;
614:       }

616:       if (fout) {
617:         data_fout=(fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
618:         VecCreateMPIWithArray(comm,1,n,N1,(PetscScalar*)data_fout,fout);

620:         (*fout)->ops->destroy = VecDestroy_MPIFFTW;
621:       }
622: #else
623:       alloc_local = fftw_mpi_local_size(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start);
624:       if (fin) {
625:         data_fin  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
626:         VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fin,fin);

628:         (*fin)->ops->destroy = VecDestroy_MPIFFTW;
629:       }
630:       if (fout) {
631:         data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
632:         VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fout,fout);

634:         (*fout)->ops->destroy = VecDestroy_MPIFFTW;
635:       }
636:       if (bout) {
637:         data_bout  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
638:         VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_bout,bout);

640:         (*bout)->ops->destroy = VecDestroy_MPIFFTW;
641:       }
642: #endif
643:       break;
644:     }
645:   }
646:   return(0);
647: }

651: /*@
652:    VecScatterPetscToFFTW - Copies the PETSc vector to the vector that goes into FFTW block.

654:    Collective on Mat

656:    Input Parameters:
657: +  A - FFTW matrix
658: -  x - the PETSc vector

660:    Output Parameters:
661: .  y - the FFTW vector

663:   Options Database Keys:
664: . -mat_fftw_plannerflags - set FFTW planner flags

666:    Level: intermediate

668:    Note: For real parallel FFT, FFTW requires insertion of extra space at the end of last dimension. This required even when
669:          one is not doing in-place transform. The last dimension size must be changed to 2*(dim[last]/2+1) to accommodate these extra
670:          zeros. This routine does that job by scattering operation.

672: .seealso: VecScatterFFTWToPetsc()
673: @*/
674: PetscErrorCode VecScatterPetscToFFTW(Mat A,Vec x,Vec y)
675: {

679:   PetscTryMethod(A,"VecScatterPetscToFFTW_C",(Mat,Vec,Vec),(A,x,y));
680:   return(0);
681: }

685: PetscErrorCode VecScatterPetscToFFTW_FFTW(Mat A,Vec x,Vec y)
686: {
688:   MPI_Comm       comm;
689:   Mat_FFT        *fft  = (Mat_FFT*)A->data;
690:   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
691:   PetscInt       N     =fft->N;
692:   PetscInt       ndim  =fft->ndim,*dim=fft->dim;
693:   PetscInt       low;
694:   PetscMPIInt    rank,size;
695:   PetscInt       vsize,vsize1;
696:   ptrdiff_t      local_n0,local_0_start;
697:   ptrdiff_t      local_n1,local_1_start;
698:   VecScatter     vecscat;
699:   IS             list1,list2;
700: #if !defined(PETSC_USE_COMPLEX)
701:   PetscInt       i,j,k,partial_dim;
702:   PetscInt       *indx1, *indx2, tempindx, tempindx1;
703:   PetscInt       NM;
704:   ptrdiff_t      temp;
705: #endif

708:   PetscObjectGetComm((PetscObject)A,&comm);
709:   MPI_Comm_size(comm, &size);
710:   MPI_Comm_rank(comm, &rank);
711:   VecGetOwnershipRange(y,&low,NULL);

713:   if (size==1) {
714:     VecGetSize(x,&vsize);
715:     VecGetSize(y,&vsize1);
716:     ISCreateStride(PETSC_COMM_SELF,N,0,1,&list1);
717:     VecScatterCreate(x,list1,y,list1,&vecscat);
718:     VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
719:     VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
720:     VecScatterDestroy(&vecscat);
721:     ISDestroy(&list1);
722:   } else {
723:     switch (ndim) {
724:     case 1:
725: #if defined(PETSC_USE_COMPLEX)
726:       fftw_mpi_local_size_1d(dim[0],comm,FFTW_FORWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);

728:       ISCreateStride(comm,local_n0,local_0_start,1,&list1);
729:       ISCreateStride(comm,local_n0,low,1,&list2);
730:       VecScatterCreate(x,list1,y,list2,&vecscat);
731:       VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
732:       VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
733:       VecScatterDestroy(&vecscat);
734:       ISDestroy(&list1);
735:       ISDestroy(&list2);
736: #else
737:       SETERRQ(comm,PETSC_ERR_SUP,"FFTW does not support parallel 1D real transform");
738: #endif
739:       break;
740:     case 2:
741: #if defined(PETSC_USE_COMPLEX)
742:       fftw_mpi_local_size_2d(dim[0],dim[1],comm,&local_n0,&local_0_start);

744:       ISCreateStride(comm,local_n0*dim[1],local_0_start*dim[1],1,&list1);
745:       ISCreateStride(comm,local_n0*dim[1],low,1,&list2);
746:       VecScatterCreate(x,list1,y,list2,&vecscat);
747:       VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
748:       VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
749:       VecScatterDestroy(&vecscat);
750:       ISDestroy(&list1);
751:       ISDestroy(&list2);
752: #else
753:       fftw_mpi_local_size_2d_transposed(dim[0],dim[1]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);

755:       PetscMalloc1(((PetscInt)local_n0)*dim[1],&indx1);
756:       PetscMalloc1(((PetscInt)local_n0)*dim[1],&indx2);

758:       if (dim[1]%2==0) {
759:         NM = dim[1]+2;
760:       } else {
761:         NM = dim[1]+1;
762:       }
763:       for (i=0; i<local_n0; i++) {
764:         for (j=0; j<dim[1]; j++) {
765:           tempindx  = i*dim[1] + j;
766:           tempindx1 = i*NM + j;

768:           indx1[tempindx]=local_0_start*dim[1]+tempindx;
769:           indx2[tempindx]=low+tempindx1;
770:         }
771:       }

773:       ISCreateGeneral(comm,local_n0*dim[1],indx1,PETSC_COPY_VALUES,&list1);
774:       ISCreateGeneral(comm,local_n0*dim[1],indx2,PETSC_COPY_VALUES,&list2);

776:       VecScatterCreate(x,list1,y,list2,&vecscat);
777:       VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
778:       VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
779:       VecScatterDestroy(&vecscat);
780:       ISDestroy(&list1);
781:       ISDestroy(&list2);
782:       PetscFree(indx1);
783:       PetscFree(indx2);
784: #endif
785:       break;

787:     case 3:
788: #if defined(PETSC_USE_COMPLEX)
789:       fftw_mpi_local_size_3d(dim[0],dim[1],dim[2],comm,&local_n0,&local_0_start);

791:       ISCreateStride(comm,local_n0*dim[1]*dim[2],local_0_start*dim[1]*dim[2],1,&list1);
792:       ISCreateStride(comm,local_n0*dim[1]*dim[2],low,1,&list2);
793:       VecScatterCreate(x,list1,y,list2,&vecscat);
794:       VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
795:       VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
796:       VecScatterDestroy(&vecscat);
797:       ISDestroy(&list1);
798:       ISDestroy(&list2);
799: #else
800:       /* buggy, needs to be fixed. See src/mat/examples/tests/ex158.c */
801:       SETERRQ(comm,PETSC_ERR_SUP,"FFTW does not support parallel 3D real transform");
802:       fftw_mpi_local_size_3d_transposed(dim[0],dim[1],dim[2]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);

804:       PetscMalloc1(((PetscInt)local_n0)*dim[1]*dim[2],&indx1);
805:       PetscMalloc1(((PetscInt)local_n0)*dim[1]*dim[2],&indx2);

807:       if (dim[2]%2==0) NM = dim[2]+2;
808:       else             NM = dim[2]+1;

810:       for (i=0; i<local_n0; i++) {
811:         for (j=0; j<dim[1]; j++) {
812:           for (k=0; k<dim[2]; k++) {
813:             tempindx  = i*dim[1]*dim[2] + j*dim[2] + k;
814:             tempindx1 = i*dim[1]*NM + j*NM + k;

816:             indx1[tempindx]=local_0_start*dim[1]*dim[2]+tempindx;
817:             indx2[tempindx]=low+tempindx1;
818:           }
819:         }
820:       }

822:       ISCreateGeneral(comm,local_n0*dim[1]*dim[2],indx1,PETSC_COPY_VALUES,&list1);
823:       ISCreateGeneral(comm,local_n0*dim[1]*dim[2],indx2,PETSC_COPY_VALUES,&list2);
824:       VecScatterCreate(x,list1,y,list2,&vecscat);
825:       VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
826:       VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
827:       VecScatterDestroy(&vecscat);
828:       ISDestroy(&list1);
829:       ISDestroy(&list2);
830:       PetscFree(indx1);
831:       PetscFree(indx2);
832: #endif
833:       break;

835:     default:
836: #if defined(PETSC_USE_COMPLEX)
837:       fftw_mpi_local_size(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start);

839:       ISCreateStride(comm,local_n0*(fftw->partial_dim),local_0_start*(fftw->partial_dim),1,&list1);
840:       ISCreateStride(comm,local_n0*(fftw->partial_dim),low,1,&list2);
841:       VecScatterCreate(x,list1,y,list2,&vecscat);
842:       VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
843:       VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
844:       VecScatterDestroy(&vecscat);
845:       ISDestroy(&list1);
846:       ISDestroy(&list2);
847: #else
848:       /* buggy, needs to be fixed. See src/mat/examples/tests/ex158.c */
849:       SETERRQ(comm,PETSC_ERR_SUP,"FFTW does not support parallel DIM>3 real transform");
850:       temp = (fftw->dim_fftw)[fftw->ndim_fftw-1];

852:       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp/2 + 1;

854:       fftw_mpi_local_size_transposed(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);

856:       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp;

858:       partial_dim = fftw->partial_dim;

860:       PetscMalloc1(((PetscInt)local_n0)*partial_dim,&indx1);
861:       PetscMalloc1(((PetscInt)local_n0)*partial_dim,&indx2);

863:       if (dim[ndim-1]%2==0) NM = 2;
864:       else                  NM = 1;

866:       j = low;
867:       for (i=0,k=1; i<((PetscInt)local_n0)*partial_dim;i++,k++) {
868:         indx1[i] = local_0_start*partial_dim + i;
869:         indx2[i] = j;
870:         if (k%dim[ndim-1]==0) j+=NM;
871:         j++;
872:       }
873:       ISCreateGeneral(comm,local_n0*partial_dim,indx1,PETSC_COPY_VALUES,&list1);
874:       ISCreateGeneral(comm,local_n0*partial_dim,indx2,PETSC_COPY_VALUES,&list2);
875:       VecScatterCreate(x,list1,y,list2,&vecscat);
876:       VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
877:       VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
878:       VecScatterDestroy(&vecscat);
879:       ISDestroy(&list1);
880:       ISDestroy(&list2);
881:       PetscFree(indx1);
882:       PetscFree(indx2);
883: #endif
884:       break;
885:     }
886:   }
887:   return(0);
888: }

892: /*@
893:    VecScatterFFTWToPetsc - Converts FFTW output to the PETSc vector.

895:    Collective on Mat

897:     Input Parameters:
898: +   A - FFTW matrix
899: -   x - FFTW vector

901:    Output Parameters:
902: .  y - PETSc vector

904:    Level: intermediate

906:    Note: While doing real transform the FFTW output of backward DFT contains extra zeros at the end of last dimension.
907:          VecScatterFFTWToPetsc removes those extra zeros.

909: .seealso: VecScatterPetscToFFTW()
910: @*/
911: PetscErrorCode VecScatterFFTWToPetsc(Mat A,Vec x,Vec y)
912: {

916:   PetscTryMethod(A,"VecScatterFFTWToPetsc_C",(Mat,Vec,Vec),(A,x,y));
917:   return(0);
918: }

922: PetscErrorCode VecScatterFFTWToPetsc_FFTW(Mat A,Vec x,Vec y)
923: {
925:   MPI_Comm       comm;
926:   Mat_FFT        *fft  = (Mat_FFT*)A->data;
927:   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
928:   PetscInt       N     = fft->N;
929:   PetscInt       ndim  = fft->ndim,*dim=fft->dim;
930:   PetscInt       low;
931:   PetscMPIInt    rank,size;
932:   ptrdiff_t      local_n0,local_0_start;
933:   ptrdiff_t      local_n1,local_1_start;
934:   VecScatter     vecscat;
935:   IS             list1,list2;
936: #if !defined(PETSC_USE_COMPLEX)
937:   PetscInt       i,j,k,partial_dim;
938:   PetscInt       *indx1, *indx2, tempindx, tempindx1;
939:   PetscInt       NM;
940:   ptrdiff_t      temp;
941: #endif

944:   PetscObjectGetComm((PetscObject)A,&comm);
945:   MPI_Comm_size(comm, &size);
946:   MPI_Comm_rank(comm, &rank);
947:   VecGetOwnershipRange(x,&low,NULL);

949:   if (size==1) {
950:     ISCreateStride(comm,N,0,1,&list1);
951:     VecScatterCreate(x,list1,y,list1,&vecscat);
952:     VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
953:     VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
954:     VecScatterDestroy(&vecscat);
955:     ISDestroy(&list1);

957:   } else {
958:     switch (ndim) {
959:     case 1:
960: #if defined(PETSC_USE_COMPLEX)
961:       fftw_mpi_local_size_1d(dim[0],comm,FFTW_BACKWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);

963:       ISCreateStride(comm,local_n1,local_1_start,1,&list1);
964:       ISCreateStride(comm,local_n1,low,1,&list2);
965:       VecScatterCreate(x,list1,y,list2,&vecscat);
966:       VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
967:       VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
968:       VecScatterDestroy(&vecscat);
969:       ISDestroy(&list1);
970:       ISDestroy(&list2);
971: #else
972:       SETERRQ(comm,PETSC_ERR_SUP,"No support for real parallel 1D FFT");
973: #endif
974:       break;
975:     case 2:
976: #if defined(PETSC_USE_COMPLEX)
977:       fftw_mpi_local_size_2d(dim[0],dim[1],comm,&local_n0,&local_0_start);

979:       ISCreateStride(comm,local_n0*dim[1],local_0_start*dim[1],1,&list1);
980:       ISCreateStride(comm,local_n0*dim[1],low,1,&list2);
981:       VecScatterCreate(x,list2,y,list1,&vecscat);
982:       VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
983:       VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
984:       VecScatterDestroy(&vecscat);
985:       ISDestroy(&list1);
986:       ISDestroy(&list2);
987: #else
988:       fftw_mpi_local_size_2d_transposed(dim[0],dim[1]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);

990:       PetscMalloc1(((PetscInt)local_n0)*dim[1],&indx1);
991:       PetscMalloc1(((PetscInt)local_n0)*dim[1],&indx2);

993:       if (dim[1]%2==0) NM = dim[1]+2;
994:       else             NM = dim[1]+1;

996:       for (i=0; i<local_n0; i++) {
997:         for (j=0; j<dim[1]; j++) {
998:           tempindx = i*dim[1] + j;
999:           tempindx1 = i*NM + j;

1001:           indx1[tempindx]=local_0_start*dim[1]+tempindx;
1002:           indx2[tempindx]=low+tempindx1;
1003:         }
1004:       }

1006:       ISCreateGeneral(comm,local_n0*dim[1],indx1,PETSC_COPY_VALUES,&list1);
1007:       ISCreateGeneral(comm,local_n0*dim[1],indx2,PETSC_COPY_VALUES,&list2);

1009:       VecScatterCreate(x,list2,y,list1,&vecscat);
1010:       VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
1011:       VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
1012:       VecScatterDestroy(&vecscat);
1013:       ISDestroy(&list1);
1014:       ISDestroy(&list2);
1015:       PetscFree(indx1);
1016:       PetscFree(indx2);
1017: #endif
1018:       break;
1019:     case 3:
1020: #if defined(PETSC_USE_COMPLEX)
1021:       fftw_mpi_local_size_3d(dim[0],dim[1],dim[2],comm,&local_n0,&local_0_start);

1023:       ISCreateStride(comm,local_n0*dim[1]*dim[2],local_0_start*dim[1]*dim[2],1,&list1);
1024:       ISCreateStride(comm,local_n0*dim[1]*dim[2],low,1,&list2);
1025:       VecScatterCreate(x,list1,y,list2,&vecscat);
1026:       VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
1027:       VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
1028:       VecScatterDestroy(&vecscat);
1029:       ISDestroy(&list1);
1030:       ISDestroy(&list2);
1031: #else
1032:       fftw_mpi_local_size_3d_transposed(dim[0],dim[1],dim[2]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);

1034:       PetscMalloc1(((PetscInt)local_n0)*dim[1]*dim[2],&indx1);
1035:       PetscMalloc1(((PetscInt)local_n0)*dim[1]*dim[2],&indx2);

1037:       if (dim[2]%2==0) NM = dim[2]+2;
1038:       else             NM = dim[2]+1;

1040:       for (i=0; i<local_n0; i++) {
1041:         for (j=0; j<dim[1]; j++) {
1042:           for (k=0; k<dim[2]; k++) {
1043:             tempindx  = i*dim[1]*dim[2] + j*dim[2] + k;
1044:             tempindx1 = i*dim[1]*NM + j*NM + k;

1046:             indx1[tempindx]=local_0_start*dim[1]*dim[2]+tempindx;
1047:             indx2[tempindx]=low+tempindx1;
1048:           }
1049:         }
1050:       }

1052:       ISCreateGeneral(comm,local_n0*dim[1]*dim[2],indx1,PETSC_COPY_VALUES,&list1);
1053:       ISCreateGeneral(comm,local_n0*dim[1]*dim[2],indx2,PETSC_COPY_VALUES,&list2);

1055:       VecScatterCreate(x,list2,y,list1,&vecscat);
1056:       VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
1057:       VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
1058:       VecScatterDestroy(&vecscat);
1059:       ISDestroy(&list1);
1060:       ISDestroy(&list2);
1061:       PetscFree(indx1);
1062:       PetscFree(indx2);
1063: #endif
1064:       break;
1065:     default:
1066: #if defined(PETSC_USE_COMPLEX)
1067:       fftw_mpi_local_size(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start);

1069:       ISCreateStride(comm,local_n0*(fftw->partial_dim),local_0_start*(fftw->partial_dim),1,&list1);
1070:       ISCreateStride(comm,local_n0*(fftw->partial_dim),low,1,&list2);
1071:       VecScatterCreate(x,list1,y,list2,&vecscat);
1072:       VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
1073:       VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
1074:       VecScatterDestroy(&vecscat);
1075:       ISDestroy(&list1);
1076:       ISDestroy(&list2);
1077: #else
1078:       temp = (fftw->dim_fftw)[fftw->ndim_fftw-1];

1080:       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp/2 + 1;

1082:       fftw_mpi_local_size_transposed(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);

1084:       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp;

1086:       partial_dim = fftw->partial_dim;

1088:       PetscMalloc1(((PetscInt)local_n0)*partial_dim,&indx1);
1089:       PetscMalloc1(((PetscInt)local_n0)*partial_dim,&indx2);

1091:       if (dim[ndim-1]%2==0) NM = 2;
1092:       else                  NM = 1;

1094:       j = low;
1095:       for (i=0,k=1; i<((PetscInt)local_n0)*partial_dim; i++,k++) {
1096:         indx1[i] = local_0_start*partial_dim + i;
1097:         indx2[i] = j;
1098:         if (k%dim[ndim-1]==0) j+=NM;
1099:         j++;
1100:       }
1101:       ISCreateGeneral(comm,local_n0*partial_dim,indx1,PETSC_COPY_VALUES,&list1);
1102:       ISCreateGeneral(comm,local_n0*partial_dim,indx2,PETSC_COPY_VALUES,&list2);

1104:       VecScatterCreate(x,list2,y,list1,&vecscat);
1105:       VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
1106:       VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
1107:       VecScatterDestroy(&vecscat);
1108:       ISDestroy(&list1);
1109:       ISDestroy(&list2);
1110:       PetscFree(indx1);
1111:       PetscFree(indx2);
1112: #endif
1113:       break;
1114:     }
1115:   }
1116:   return(0);
1117: }

1121: /*
1122:     MatCreate_FFTW - Creates a matrix object that provides FFT via the external package FFTW

1124:   Options Database Keys:
1125: + -mat_fftw_plannerflags - set FFTW planner flags

1127:    Level: intermediate

1129: */
1130: PETSC_EXTERN PetscErrorCode MatCreate_FFTW(Mat A)
1131: {
1133:   MPI_Comm       comm;
1134:   Mat_FFT        *fft=(Mat_FFT*)A->data;
1135:   Mat_FFTW       *fftw;
1136:   PetscInt       n=fft->n,N=fft->N,ndim=fft->ndim,*dim=fft->dim;
1137:   const char     *plans[]={"FFTW_ESTIMATE","FFTW_MEASURE","FFTW_PATIENT","FFTW_EXHAUSTIVE"};
1138:   unsigned       iplans[]={FFTW_ESTIMATE,FFTW_MEASURE,FFTW_PATIENT,FFTW_EXHAUSTIVE};
1139:   PetscBool      flg;
1140:   PetscInt       p_flag,partial_dim=1,ctr;
1141:   PetscMPIInt    size,rank;
1142:   ptrdiff_t      *pdim;
1143:   ptrdiff_t      local_n1,local_1_start;
1144: #if !defined(PETSC_USE_COMPLEX)
1145:   ptrdiff_t      temp;
1146:   PetscInt       N1,tot_dim;
1147: #else
1148:   PetscInt       n1;
1149: #endif

1152:   PetscObjectGetComm((PetscObject)A,&comm);
1153:   MPI_Comm_size(comm, &size);
1154:   MPI_Comm_rank(comm, &rank);

1156:   fftw_mpi_init();
1157:   pdim    = (ptrdiff_t*)calloc(ndim,sizeof(ptrdiff_t));
1158:   pdim[0] = dim[0];
1159: #if !defined(PETSC_USE_COMPLEX)
1160:   tot_dim = 2*dim[0];
1161: #endif
1162:   for (ctr=1; ctr<ndim; ctr++) {
1163:     partial_dim *= dim[ctr];
1164:     pdim[ctr]    = dim[ctr];
1165: #if !defined(PETSC_USE_COMPLEX)
1166:     if (ctr==ndim-1) tot_dim *= (dim[ctr]/2+1);
1167:     else             tot_dim *= dim[ctr];
1168: #endif
1169:   }

1171:   if (size == 1) {
1172: #if defined(PETSC_USE_COMPLEX)
1173:     MatSetSizes(A,N,N,N,N);
1174:     n    = N;
1175: #else
1176:     MatSetSizes(A,tot_dim,tot_dim,tot_dim,tot_dim);
1177:     n    = tot_dim;
1178: #endif

1180:   } else {
1181:     ptrdiff_t local_n0,local_0_start;
1182:     switch (ndim) {
1183:     case 1:
1184: #if !defined(PETSC_USE_COMPLEX)
1185:       SETERRQ(comm,PETSC_ERR_SUP,"FFTW does not support parallel 1D real transform");
1186: #else
1187:       fftw_mpi_local_size_1d(dim[0],comm,FFTW_FORWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);

1189:       n    = (PetscInt)local_n0;
1190:       n1   = (PetscInt)local_n1;
1191:       MatSetSizes(A,n1,n,N,N);
1192: #endif
1193:       break;
1194:     case 2:
1195: #if defined(PETSC_USE_COMPLEX)
1196:       fftw_mpi_local_size_2d(dim[0],dim[1],comm,&local_n0,&local_0_start);
1197:       /*
1198:        PetscSynchronizedPrintf(comm,"[%d] MatCreateSeqFFTW: local_n0, local_0_start %d %d, N %d,dim %d, %d\n",rank,(PetscInt)local_n0*dim[1],(PetscInt)local_0_start,m,dim[0],dim[1]);
1199:        PetscSynchronizedFlush(comm,PETSC_STDOUT);
1200:        */
1201:       n    = (PetscInt)local_n0*dim[1];
1202:       MatSetSizes(A,n,n,N,N);
1203: #else
1204:       fftw_mpi_local_size_2d_transposed(dim[0],dim[1]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);

1206:       n    = 2*(PetscInt)local_n0*(dim[1]/2+1);
1207:       MatSetSizes(A,n,n,2*dim[0]*(dim[1]/2+1),2*dim[0]*(dim[1]/2+1));
1208: #endif
1209:       break;
1210:     case 3:
1211: #if defined(PETSC_USE_COMPLEX)
1212:       fftw_mpi_local_size_3d(dim[0],dim[1],dim[2],comm,&local_n0,&local_0_start);

1214:       n    = (PetscInt)local_n0*dim[1]*dim[2];
1215:       MatSetSizes(A,n,n,N,N);
1216: #else
1217:       fftw_mpi_local_size_3d_transposed(dim[0],dim[1],dim[2]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);

1219:       n   = 2*(PetscInt)local_n0*dim[1]*(dim[2]/2+1);
1220:       MatSetSizes(A,n,n,2*dim[0]*dim[1]*(dim[2]/2+1),2*dim[0]*dim[1]*(dim[2]/2+1));
1221: #endif
1222:       break;
1223:     default:
1224: #if defined(PETSC_USE_COMPLEX)
1225:       fftw_mpi_local_size(ndim,pdim,comm,&local_n0,&local_0_start);

1227:       n    = (PetscInt)local_n0*partial_dim;
1228:       MatSetSizes(A,n,n,N,N);
1229: #else
1230:       temp = pdim[ndim-1];

1232:       pdim[ndim-1] = temp/2 + 1;

1234:       fftw_mpi_local_size_transposed(ndim,pdim,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);

1236:       n  = 2*(PetscInt)local_n0*partial_dim*pdim[ndim-1]/temp;
1237:       N1 = 2*N*(PetscInt)pdim[ndim-1]/((PetscInt) temp);

1239:       pdim[ndim-1] = temp;

1241:       MatSetSizes(A,n,n,N1,N1);
1242: #endif
1243:       break;
1244:     }
1245:   }
1246:   PetscObjectChangeTypeName((PetscObject)A,MATFFTW);
1247:   PetscNewLog(A,&fftw);
1248:   fft->data = (void*)fftw;

1250:   fft->n            = n;
1251:   fftw->ndim_fftw   = (ptrdiff_t)ndim; /* This is dimension of fft */
1252:   fftw->partial_dim = partial_dim;

1254:   PetscMalloc1(ndim, &(fftw->dim_fftw));
1255:   if (size == 1) {
1256: #if defined(PETSC_USE_64BIT_INDICES)
1257:     fftw->iodims = (fftw_iodim64 *) malloc(sizeof(fftw_iodim64) * ndim);
1258: #else
1259:     fftw->iodims = (fftw_iodim *) malloc(sizeof(fftw_iodim) * ndim);
1260: #endif
1261:   }

1263:   for (ctr=0;ctr<ndim;ctr++) (fftw->dim_fftw)[ctr]=dim[ctr];

1265:   fftw->p_forward  = 0;
1266:   fftw->p_backward = 0;
1267:   fftw->p_flag     = FFTW_ESTIMATE;

1269:   if (size == 1) {
1270:     A->ops->mult          = MatMult_SeqFFTW;
1271:     A->ops->multtranspose = MatMultTranspose_SeqFFTW;
1272:   } else {
1273:     A->ops->mult          = MatMult_MPIFFTW;
1274:     A->ops->multtranspose = MatMultTranspose_MPIFFTW;
1275:   }
1276:   fft->matdestroy = MatDestroy_FFTW;
1277:   A->assembled    = PETSC_TRUE;
1278:   A->preallocated = PETSC_TRUE;

1280:   PetscObjectComposeFunction((PetscObject)A,"MatCreateVecsFFTW_C",MatCreateVecsFFTW_FFTW);
1281:   PetscObjectComposeFunction((PetscObject)A,"VecScatterPetscToFFTW_C",VecScatterPetscToFFTW_FFTW);
1282:   PetscObjectComposeFunction((PetscObject)A,"VecScatterFFTWToPetsc_C",VecScatterFFTWToPetsc_FFTW);

1284:   /* get runtime options */
1285:   PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"FFTW Options","Mat");
1286:   PetscOptionsEList("-mat_fftw_plannerflags","Planner Flags","None",plans,4,plans[0],&p_flag,&flg);
1287:   if (flg) {
1288:     fftw->p_flag = iplans[p_flag];
1289:   }
1290:   PetscOptionsEnd();
1291:   return(0);
1292: }