Actual source code: fftw.c

petsc-3.7.7 2017-09-25
Report Typos and Errors
  2: /*
  3:     Provides an interface to the FFTW package.
  4:     Testing examples can be found in ~src/mat/examples/tests
  5: */

  7: #include <../src/mat/impls/fft/fft.h>   /*I "petscmat.h" I*/
  8: EXTERN_C_BEGIN
  9: #include <fftw3-mpi.h>
 10: EXTERN_C_END

 12: typedef struct {
 13:   ptrdiff_t    ndim_fftw,*dim_fftw;
 14: #if defined(PETSC_USE_64BIT_INDICES)
 15:   fftw_iodim64 *iodims;
 16: #else
 17:   fftw_iodim   *iodims;
 18: #endif
 19:   PetscInt     partial_dim;
 20:   fftw_plan    p_forward,p_backward;
 21:   unsigned     p_flag; /* planner flags, FFTW_ESTIMATE,FFTW_MEASURE, FFTW_PATIENT, FFTW_EXHAUSTIVE */
 22:   PetscScalar  *finarray,*foutarray,*binarray,*boutarray; /* keep track of arrays becaue fftw plan should be
 23:                                                             executed for the arrays with which the plan was created */
 24: } Mat_FFTW;

 26: extern PetscErrorCode MatMult_SeqFFTW(Mat,Vec,Vec);
 27: extern PetscErrorCode MatMultTranspose_SeqFFTW(Mat,Vec,Vec);
 28: extern PetscErrorCode MatMult_MPIFFTW(Mat,Vec,Vec);
 29: extern PetscErrorCode MatMultTranspose_MPIFFTW(Mat,Vec,Vec);
 30: extern PetscErrorCode MatDestroy_FFTW(Mat);
 31: extern PetscErrorCode VecDestroy_MPIFFTW(Vec);

 33: /* MatMult_SeqFFTW performs forward DFT in parallel
 34:    Input parameter:
 35:      A - the matrix
 36:      x - the vector on which FDFT will be performed

 38:    Output parameter:
 39:      y - vector that stores result of FDFT
 40: */
 43: PetscErrorCode MatMult_SeqFFTW(Mat A,Vec x,Vec y)
 44: {
 46:   Mat_FFT        *fft  = (Mat_FFT*)A->data;
 47:   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
 48:   const PetscScalar *x_array;
 49:   PetscScalar    *y_array;
 50: #if defined(PETSC_USE_COMPLEX)
 51: #if defined(PETSC_USE_64BIT_INDICES) 
 52:   fftw_iodim64   *iodims;
 53: #else
 54:   fftw_iodim     *iodims;
 55: #endif
 56:   PetscInt       i;
 57: #endif
 58:   PetscInt       ndim = fft->ndim,*dim = fft->dim;

 61:   VecGetArrayRead(x,&x_array);
 62:   VecGetArray(y,&y_array);
 63:   if (!fftw->p_forward) { /* create a plan, then excute it */
 64:     switch (ndim) {
 65:     case 1:
 66: #if defined(PETSC_USE_COMPLEX)
 67:       fftw->p_forward = fftw_plan_dft_1d(dim[0],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
 68: #else
 69:       fftw->p_forward = fftw_plan_dft_r2c_1d(dim[0],(double*)x_array,(fftw_complex*)y_array,fftw->p_flag);
 70: #endif
 71:       break;
 72:     case 2:
 73: #if defined(PETSC_USE_COMPLEX)
 74:       fftw->p_forward = fftw_plan_dft_2d(dim[0],dim[1],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
 75: #else
 76:       fftw->p_forward = fftw_plan_dft_r2c_2d(dim[0],dim[1],(double*)x_array,(fftw_complex*)y_array,fftw->p_flag);
 77: #endif
 78:       break;
 79:     case 3:
 80: #if defined(PETSC_USE_COMPLEX)
 81:       fftw->p_forward = fftw_plan_dft_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
 82: #else
 83:       fftw->p_forward = fftw_plan_dft_r2c_3d(dim[0],dim[1],dim[2],(double*)x_array,(fftw_complex*)y_array,fftw->p_flag);
 84: #endif
 85:       break;
 86:     default:
 87: #if defined(PETSC_USE_COMPLEX)
 88:       iodims = fftw->iodims;
 89: #if defined(PETSC_USE_64BIT_INDICES)
 90:       if (ndim) {
 91:         iodims[ndim-1].n = (ptrdiff_t)dim[ndim-1];
 92:         iodims[ndim-1].is = iodims[ndim-1].os = 1;
 93:         for (i=ndim-2; i>=0; --i) {
 94:           iodims[i].n = (ptrdiff_t)dim[i];
 95:           iodims[i].is = iodims[i].os = iodims[i+1].is * iodims[i+1].n;
 96:         }
 97:       }
 98:       fftw->p_forward = fftw_plan_guru64_dft((int)ndim,(fftw_iodim64*)iodims,0,NULL,(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
 99: #else
100:       if (ndim) {
101:         iodims[ndim-1].n = (int)dim[ndim-1];
102:         iodims[ndim-1].is = iodims[ndim-1].os = 1;
103:         for (i=ndim-2; i>=0; --i) {
104:           iodims[i].n = (int)dim[i];
105:           iodims[i].is = iodims[i].os = iodims[i+1].is * iodims[i+1].n;
106:         }
107:       }
108:       fftw->p_forward = fftw_plan_guru_dft((int)ndim,(fftw_iodim*)iodims,0,NULL,(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
109: #endif

111: #else
112:       fftw->p_forward = fftw_plan_dft_r2c(ndim,(int*)dim,(double*)x_array,(fftw_complex*)y_array,fftw->p_flag);
113: #endif
114:       break;
115:     }
116:     fftw->finarray  = (PetscScalar *) x_array;
117:     fftw->foutarray = y_array;
118:     /* Warning: if (fftw->p_flag!==FFTW_ESTIMATE) The data in the in/out arrays is overwritten!
119:                 planning should be done before x is initialized! See FFTW manual sec2.1 or sec4 */
120:     fftw_execute(fftw->p_forward);
121:   } else { /* use existing plan */
122:     if (fftw->finarray != x_array || fftw->foutarray != y_array) { /* use existing plan on new arrays */
123: #if defined(PETSC_USE_COMPLEX)
124:       fftw_execute_dft(fftw->p_forward,(fftw_complex*)x_array,(fftw_complex*)y_array);
125: #else
126:       fftw_execute_dft_r2c(fftw->p_forward,(double*)x_array,(fftw_complex*)y_array);
127: #endif
128:     } else {
129:       fftw_execute(fftw->p_forward);
130:     }
131:   }
132:   VecRestoreArray(y,&y_array);
133:   VecRestoreArrayRead(x,&x_array);
134:   return(0);
135: }

137: /* MatMultTranspose_SeqFFTW performs serial backward DFT
138:    Input parameter:
139:      A - the matrix
140:      x - the vector on which BDFT will be performed

142:    Output parameter:
143:      y - vector that stores result of BDFT
144: */

148: PetscErrorCode MatMultTranspose_SeqFFTW(Mat A,Vec x,Vec y)
149: {
151:   Mat_FFT        *fft  = (Mat_FFT*)A->data;
152:   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
153:   const PetscScalar *x_array;
154:   PetscScalar    *y_array;
155:   PetscInt       ndim=fft->ndim,*dim=fft->dim;
156: #if defined(PETSC_USE_COMPLEX)
157: #if defined(PETSC_USE_64BIT_INDICES)
158:   fftw_iodim64   *iodims=fftw->iodims;
159: #else
160:   fftw_iodim     *iodims=fftw->iodims;
161: #endif
162: #endif
163: 
165:   VecGetArrayRead(x,&x_array);
166:   VecGetArray(y,&y_array);
167:   if (!fftw->p_backward) { /* create a plan, then excute it */
168:     switch (ndim) {
169:     case 1:
170: #if defined(PETSC_USE_COMPLEX)
171:       fftw->p_backward = fftw_plan_dft_1d(dim[0],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
172: #else
173:       fftw->p_backward= fftw_plan_dft_c2r_1d(dim[0],(fftw_complex*)x_array,(double*)y_array,fftw->p_flag);
174: #endif
175:       break;
176:     case 2:
177: #if defined(PETSC_USE_COMPLEX)
178:       fftw->p_backward = fftw_plan_dft_2d(dim[0],dim[1],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
179: #else
180:       fftw->p_backward= fftw_plan_dft_c2r_2d(dim[0],dim[1],(fftw_complex*)x_array,(double*)y_array,fftw->p_flag);
181: #endif
182:       break;
183:     case 3:
184: #if defined(PETSC_USE_COMPLEX)
185:       fftw->p_backward = fftw_plan_dft_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
186: #else
187:       fftw->p_backward= fftw_plan_dft_c2r_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(double*)y_array,fftw->p_flag);
188: #endif
189:       break;
190:     default:
191: #if defined(PETSC_USE_COMPLEX)
192: #if defined(PETSC_USE_64BIT_INDICES)
193:       fftw->p_backward = fftw_plan_guru64_dft((int)ndim,(fftw_iodim64*)iodims,0,NULL,(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
194: #else
195:       fftw->p_backward = fftw_plan_guru_dft((int)ndim,iodims,0,NULL,(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
196: #endif
197: #else
198:       fftw->p_backward= fftw_plan_dft_c2r((int)ndim,(int*)dim,(fftw_complex*)x_array,(double*)y_array,fftw->p_flag);
199: #endif
200:       break;
201:     }
202:     fftw->binarray  = (PetscScalar *) x_array;
203:     fftw->boutarray = y_array;
204:     fftw_execute(fftw->p_backward);
205:   } else { /* use existing plan */
206:     if (fftw->binarray != x_array || fftw->boutarray != y_array) { /* use existing plan on new arrays */
207: #if defined(PETSC_USE_COMPLEX)
208:       fftw_execute_dft(fftw->p_backward,(fftw_complex*)x_array,(fftw_complex*)y_array);
209: #else
210:       fftw_execute_dft_c2r(fftw->p_backward,(fftw_complex*)x_array,(double*)y_array);
211: #endif
212:     } else {
213:       fftw_execute(fftw->p_backward);
214:     }
215:   }
216:   VecRestoreArray(y,&y_array);
217:   VecRestoreArrayRead(x,&x_array);
218:   return(0);
219: }

221: /* MatMult_MPIFFTW performs forward DFT in parallel
222:    Input parameter:
223:      A - the matrix
224:      x - the vector on which FDFT will be performed

226:    Output parameter:
227:    y   - vector that stores result of FDFT
228: */
231: PetscErrorCode MatMult_MPIFFTW(Mat A,Vec x,Vec y)
232: {
234:   Mat_FFT        *fft  = (Mat_FFT*)A->data;
235:   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
236:   const PetscScalar *x_array;
237:   PetscScalar    *y_array;
238:   PetscInt       ndim=fft->ndim,*dim=fft->dim;
239:   MPI_Comm       comm;

242:   PetscObjectGetComm((PetscObject)A,&comm);
243:   VecGetArrayRead(x,&x_array);
244:   VecGetArray(y,&y_array);
245:   if (!fftw->p_forward) { /* create a plan, then excute it */
246:     switch (ndim) {
247:     case 1:
248: #if defined(PETSC_USE_COMPLEX)
249:       fftw->p_forward = fftw_mpi_plan_dft_1d(dim[0],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_FORWARD,fftw->p_flag);
250: #else
251:       SETERRQ(comm,PETSC_ERR_SUP,"not support for real numbers yet");
252: #endif
253:       break;
254:     case 2:
255: #if defined(PETSC_USE_COMPLEX) /* For complex transforms call fftw_mpi_plan_dft, for real transforms call fftw_mpi_plan_dft_r2c */
256:       fftw->p_forward = fftw_mpi_plan_dft_2d(dim[0],dim[1],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_FORWARD,fftw->p_flag);
257: #else
258:       fftw->p_forward = fftw_mpi_plan_dft_r2c_2d(dim[0],dim[1],(double*)x_array,(fftw_complex*)y_array,comm,FFTW_ESTIMATE);
259: #endif
260:       break;
261:     case 3:
262: #if defined(PETSC_USE_COMPLEX)
263:       fftw->p_forward = fftw_mpi_plan_dft_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_FORWARD,fftw->p_flag);
264: #else
265:       fftw->p_forward = fftw_mpi_plan_dft_r2c_3d(dim[0],dim[1],dim[2],(double*)x_array,(fftw_complex*)y_array,comm,FFTW_ESTIMATE);
266: #endif
267:       break;
268:     default:
269: #if defined(PETSC_USE_COMPLEX)
270:       fftw->p_forward = fftw_mpi_plan_dft(fftw->ndim_fftw,fftw->dim_fftw,(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_FORWARD,fftw->p_flag);
271: #else
272:       fftw->p_forward = fftw_mpi_plan_dft_r2c(fftw->ndim_fftw,fftw->dim_fftw,(double*)x_array,(fftw_complex*)y_array,comm,FFTW_ESTIMATE);
273: #endif
274:       break;
275:     }
276:     fftw->finarray  = (PetscScalar *) x_array;
277:     fftw->foutarray = y_array;
278:     /* Warning: if (fftw->p_flag!==FFTW_ESTIMATE) The data in the in/out arrays is overwritten!
279:                 planning should be done before x is initialized! See FFTW manual sec2.1 or sec4 */
280:     fftw_execute(fftw->p_forward);
281:   } else { /* use existing plan */
282:     if (fftw->finarray != x_array || fftw->foutarray != y_array) { /* use existing plan on new arrays */
283:       fftw_execute_dft(fftw->p_forward,(fftw_complex*)x_array,(fftw_complex*)y_array);
284:     } else {
285:       fftw_execute(fftw->p_forward);
286:     }
287:   }
288:   VecRestoreArray(y,&y_array);
289:   VecRestoreArrayRead(x,&x_array);
290:   return(0);
291: }

293: /* MatMultTranspose_MPIFFTW performs parallel backward DFT
294:    Input parameter:
295:      A - the matrix
296:      x - the vector on which BDFT will be performed

298:    Output parameter:
299:      y - vector that stores result of BDFT
300: */
303: PetscErrorCode MatMultTranspose_MPIFFTW(Mat A,Vec x,Vec y)
304: {
306:   Mat_FFT        *fft  = (Mat_FFT*)A->data;
307:   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
308:   const PetscScalar *x_array;
309:   PetscScalar    *y_array;
310:   PetscInt       ndim=fft->ndim,*dim=fft->dim;
311:   MPI_Comm       comm;

314:   PetscObjectGetComm((PetscObject)A,&comm);
315:   VecGetArrayRead(x,&x_array);
316:   VecGetArray(y,&y_array);
317:   if (!fftw->p_backward) { /* create a plan, then excute it */
318:     switch (ndim) {
319:     case 1:
320: #if defined(PETSC_USE_COMPLEX)
321:       fftw->p_backward = fftw_mpi_plan_dft_1d(dim[0],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_BACKWARD,fftw->p_flag);
322: #else
323:       SETERRQ(comm,PETSC_ERR_SUP,"not support for real numbers yet");
324: #endif
325:       break;
326:     case 2:
327: #if defined(PETSC_USE_COMPLEX) /* For complex transforms call fftw_mpi_plan_dft with flag FFTW_BACKWARD, for real transforms call fftw_mpi_plan_dft_c2r */
328:       fftw->p_backward = fftw_mpi_plan_dft_2d(dim[0],dim[1],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_BACKWARD,fftw->p_flag);
329: #else
330:       fftw->p_backward = fftw_mpi_plan_dft_c2r_2d(dim[0],dim[1],(fftw_complex*)x_array,(double*)y_array,comm,FFTW_ESTIMATE);
331: #endif
332:       break;
333:     case 3:
334: #if defined(PETSC_USE_COMPLEX)
335:       fftw->p_backward = fftw_mpi_plan_dft_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_BACKWARD,fftw->p_flag);
336: #else
337:       fftw->p_backward = fftw_mpi_plan_dft_c2r_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(double*)y_array,comm,FFTW_ESTIMATE);
338: #endif
339:       break;
340:     default:
341: #if defined(PETSC_USE_COMPLEX)
342:       fftw->p_backward = fftw_mpi_plan_dft(fftw->ndim_fftw,fftw->dim_fftw,(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_BACKWARD,fftw->p_flag);
343: #else
344:       fftw->p_backward = fftw_mpi_plan_dft_c2r(fftw->ndim_fftw,fftw->dim_fftw,(fftw_complex*)x_array,(double*)y_array,comm,FFTW_ESTIMATE);
345: #endif
346:       break;
347:     }
348:     fftw->binarray  = (PetscScalar *) x_array;
349:     fftw->boutarray = y_array;
350:     fftw_execute(fftw->p_backward);
351:   } else { /* use existing plan */
352:     if (fftw->binarray != x_array || fftw->boutarray != y_array) { /* use existing plan on new arrays */
353:       fftw_execute_dft(fftw->p_backward,(fftw_complex*)x_array,(fftw_complex*)y_array);
354:     } else {
355:       fftw_execute(fftw->p_backward);
356:     }
357:   }
358:   VecRestoreArray(y,&y_array);
359:   VecRestoreArrayRead(x,&x_array);
360:   return(0);
361: }

365: PetscErrorCode MatDestroy_FFTW(Mat A)
366: {
367:   Mat_FFT        *fft  = (Mat_FFT*)A->data;
368:   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;

372:   fftw_destroy_plan(fftw->p_forward);
373:   fftw_destroy_plan(fftw->p_backward);
374:   if (fftw->iodims) {
375:     free(fftw->iodims);
376:   }
377:   PetscFree(fftw->dim_fftw);
378:   PetscFree(fft->data);
379:   fftw_mpi_cleanup();
380:   return(0);
381: }

383: #include <../src/vec/vec/impls/mpi/pvecimpl.h>   /*I  "petscvec.h"   I*/
386: PetscErrorCode VecDestroy_MPIFFTW(Vec v)
387: {
389:   PetscScalar    *array;

392:   VecGetArray(v,&array);
393:   fftw_free((fftw_complex*)array);
394:   VecRestoreArray(v,&array);
395:   VecDestroy_MPI(v);
396:   return(0);
397: }

401: /*@
402:    MatCreateVecsFFTW - Get vector(s) compatible with the matrix, i.e. with the
403:      parallel layout determined by FFTW

405:    Collective on Mat

407:    Input Parameter:
408: .   A - the matrix

410:    Output Parameter:
411: +   x - (optional) input vector of forward FFTW
412: -   y - (optional) output vector of forward FFTW
413: -   z - (optional) output vector of backward FFTW

415:   Level: advanced

417:   Note: The parallel layout of output of forward FFTW is always same as the input
418:         of the backward FFTW. But parallel layout of the input vector of forward
419:         FFTW might not be same as the output of backward FFTW.
420:         Also note that we need to provide enough space while doing parallel real transform.
421:         We need to pad extra zeros at the end of last dimension. For this reason the one needs to
422:         invoke the routine fftw_mpi_local_size_transposed routines. Remember one has to change the
423:         last dimension from n to n/2+1 while invoking this routine. The number of zeros to be padded
424:         depends on if the last dimension is even or odd. If the last dimension is even need to pad two
425:         zeros if it is odd only one zero is needed.
426:         Lastly one needs some scratch space at the end of data set in each process. alloc_local
427:         figures out how much space is needed, i.e. it figures out the data+scratch space for
428:         each processor and returns that.

430: .seealso: MatCreateFFTW()
431: @*/
432: PetscErrorCode MatCreateVecsFFTW(Mat A,Vec *x,Vec *y,Vec *z)
433: {

437:   PetscUseMethod(A,"MatCreateVecsFFTW_C",(Mat,Vec*,Vec*,Vec*),(A,x,y,z));
438:   return(0);
439: }

443: PetscErrorCode  MatCreateVecsFFTW_FFTW(Mat A,Vec *fin,Vec *fout,Vec *bout)
444: {
446:   PetscMPIInt    size,rank;
447:   MPI_Comm       comm;
448:   Mat_FFT        *fft  = (Mat_FFT*)A->data;
449:   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
450:   PetscInt       N     = fft->N;
451:   PetscInt       ndim  = fft->ndim,*dim=fft->dim,n=fft->n;

456:   PetscObjectGetComm((PetscObject)A,&comm);

458:   MPI_Comm_size(comm, &size);
459:   MPI_Comm_rank(comm, &rank);
460:   if (size == 1) { /* sequential case */
461: #if defined(PETSC_USE_COMPLEX)
462:     if (fin) {VecCreateSeq(PETSC_COMM_SELF,N,fin);}
463:     if (fout) {VecCreateSeq(PETSC_COMM_SELF,N,fout);}
464:     if (bout) {VecCreateSeq(PETSC_COMM_SELF,N,bout);}
465: #else
466:     if (fin) {VecCreateSeq(PETSC_COMM_SELF,n,fin);}
467:     if (fout) {VecCreateSeq(PETSC_COMM_SELF,n,fout);}
468:     if (bout) {VecCreateSeq(PETSC_COMM_SELF,n,bout);}
469: #endif
470:   } else { /* parallel cases */
471:     ptrdiff_t    alloc_local,local_n0,local_0_start;
472:     ptrdiff_t    local_n1;
473:     fftw_complex *data_fout;
474:     ptrdiff_t    local_1_start;
475: #if defined(PETSC_USE_COMPLEX)
476:     fftw_complex *data_fin,*data_bout;
477: #else
478:     double    *data_finr,*data_boutr;
479:     PetscInt  n1,N1;
480:     ptrdiff_t temp;
481: #endif

483:     switch (ndim) {
484:     case 1:
485: #if !defined(PETSC_USE_COMPLEX)
486:       SETERRQ(comm,PETSC_ERR_SUP,"FFTW does not allow parallel real 1D transform");
487: #else
488:       alloc_local = fftw_mpi_local_size_1d(dim[0],comm,FFTW_FORWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
489:       if (fin) {
490:         data_fin  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
491:         VecCreateMPIWithArray(comm,1,local_n0,N,(const PetscScalar*)data_fin,fin);

493:         (*fin)->ops->destroy = VecDestroy_MPIFFTW;
494:       }
495:       if (fout) {
496:         data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
497:         VecCreateMPIWithArray(comm,1,local_n1,N,(const PetscScalar*)data_fout,fout);

499:         (*fout)->ops->destroy = VecDestroy_MPIFFTW;
500:       }
501:       alloc_local = fftw_mpi_local_size_1d(dim[0],comm,FFTW_BACKWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
502:       if (bout) {
503:         data_bout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
504:         VecCreateMPIWithArray(comm,1,local_n1,N,(const PetscScalar*)data_bout,bout);

506:         (*bout)->ops->destroy = VecDestroy_MPIFFTW;
507:       }
508:       break;
509: #endif
510:     case 2:
511: #if !defined(PETSC_USE_COMPLEX) /* Note that N1 is no more the product of individual dimensions */
512:       alloc_local =  fftw_mpi_local_size_2d_transposed(dim[0],dim[1]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
513:       N1          = 2*dim[0]*(dim[1]/2+1); n1 = 2*local_n0*(dim[1]/2+1);
514:       if (fin) {
515:         data_finr = (double*)fftw_malloc(sizeof(double)*alloc_local*2);
516:         VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_finr,fin);

518:         (*fin)->ops->destroy = VecDestroy_MPIFFTW;
519:       }
520:       if (bout) {
521:         data_boutr = (double*)fftw_malloc(sizeof(double)*alloc_local*2);
522:         VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_boutr,bout);

524:         (*bout)->ops->destroy = VecDestroy_MPIFFTW;
525:       }
526:       if (fout) {
527:         data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
528:         VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_fout,fout);

530:         (*fout)->ops->destroy = VecDestroy_MPIFFTW;
531:       }
532: #else
533:       /* Get local size */
534:       alloc_local = fftw_mpi_local_size_2d(dim[0],dim[1],comm,&local_n0,&local_0_start);
535:       if (fin) {
536:         data_fin = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
537:         VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fin,fin);

539:         (*fin)->ops->destroy = VecDestroy_MPIFFTW;
540:       }
541:       if (fout) {
542:         data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
543:         VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fout,fout);

545:         (*fout)->ops->destroy = VecDestroy_MPIFFTW;
546:       }
547:       if (bout) {
548:         data_bout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
549:         VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_bout,bout);

551:         (*bout)->ops->destroy = VecDestroy_MPIFFTW;
552:       }
553: #endif
554:       break;
555:     case 3:
556: #if !defined(PETSC_USE_COMPLEX)
557:       alloc_local =  fftw_mpi_local_size_3d_transposed(dim[0],dim[1],dim[2]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
558:       N1 = 2*dim[0]*dim[1]*(dim[2]/2+1); n1 = 2*local_n0*dim[1]*(dim[2]/2+1);
559:       if (fin) {
560:         data_finr = (double*)fftw_malloc(sizeof(double)*alloc_local*2);
561:         VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_finr,fin);

563:         (*fin)->ops->destroy = VecDestroy_MPIFFTW;
564:       }
565:       if (bout) {
566:         data_boutr=(double*)fftw_malloc(sizeof(double)*alloc_local*2);
567:         VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_boutr,bout);

569:         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
570:       }

572:       if (fout) {
573:         data_fout=(fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
574:         VecCreateMPIWithArray(comm,1,n1,N1,(PetscScalar*)data_fout,fout);

576:         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
577:       }
578: #else
579:       alloc_local = fftw_mpi_local_size_3d(dim[0],dim[1],dim[2],comm,&local_n0,&local_0_start);
580:       if (fin) {
581:         data_fin  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
582:         VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fin,fin);

584:         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
585:       }
586:       if (fout) {
587:         data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
588:         VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fout,fout);

590:         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
591:       }
592:       if (bout) {
593:         data_bout  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
594:         VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_bout,bout);

596:         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
597:       }
598: #endif
599:       break;
600:     default:
601: #if !defined(PETSC_USE_COMPLEX)
602:       temp = (fftw->dim_fftw)[fftw->ndim_fftw-1];

604:       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp/2 + 1;

606:       alloc_local = fftw_mpi_local_size_transposed(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
607:       N1          = 2*N*(PetscInt)((fftw->dim_fftw)[fftw->ndim_fftw-1])/((PetscInt) temp);

609:       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp;

611:       if (fin) {
612:         data_finr=(double*)fftw_malloc(sizeof(double)*alloc_local*2);
613:         VecCreateMPIWithArray(comm,1,(PetscInt)n,N1,(PetscScalar*)data_finr,fin);

615:         (*fin)->ops->destroy = VecDestroy_MPIFFTW;
616:       }
617:       if (bout) {
618:         data_boutr=(double*)fftw_malloc(sizeof(double)*alloc_local*2);
619:         VecCreateMPIWithArray(comm,1,(PetscInt)n,N1,(PetscScalar*)data_boutr,bout);

621:         (*bout)->ops->destroy = VecDestroy_MPIFFTW;
622:       }

624:       if (fout) {
625:         data_fout=(fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
626:         VecCreateMPIWithArray(comm,1,n,N1,(PetscScalar*)data_fout,fout);

628:         (*fout)->ops->destroy = VecDestroy_MPIFFTW;
629:       }
630: #else
631:       alloc_local = fftw_mpi_local_size(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start);
632:       if (fin) {
633:         data_fin  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
634:         VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fin,fin);

636:         (*fin)->ops->destroy = VecDestroy_MPIFFTW;
637:       }
638:       if (fout) {
639:         data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
640:         VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fout,fout);

642:         (*fout)->ops->destroy = VecDestroy_MPIFFTW;
643:       }
644:       if (bout) {
645:         data_bout  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
646:         VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_bout,bout);

648:         (*bout)->ops->destroy = VecDestroy_MPIFFTW;
649:       }
650: #endif
651:       break;
652:     }
653:   }
654:   return(0);
655: }

659: /*@
660:    VecScatterPetscToFFTW - Copies the PETSc vector to the vector that goes into FFTW block.

662:    Collective on Mat

664:    Input Parameters:
665: +  A - FFTW matrix
666: -  x - the PETSc vector

668:    Output Parameters:
669: .  y - the FFTW vector

671:   Options Database Keys:
672: . -mat_fftw_plannerflags - set FFTW planner flags

674:    Level: intermediate

676:    Note: For real parallel FFT, FFTW requires insertion of extra space at the end of last dimension. This required even when
677:          one is not doing in-place transform. The last dimension size must be changed to 2*(dim[last]/2+1) to accommodate these extra
678:          zeros. This routine does that job by scattering operation.

680: .seealso: VecScatterFFTWToPetsc()
681: @*/
682: PetscErrorCode VecScatterPetscToFFTW(Mat A,Vec x,Vec y)
683: {

687:   PetscUseMethod(A,"VecScatterPetscToFFTW_C",(Mat,Vec,Vec),(A,x,y));
688:   return(0);
689: }

693: PetscErrorCode VecScatterPetscToFFTW_FFTW(Mat A,Vec x,Vec y)
694: {
696:   MPI_Comm       comm;
697:   Mat_FFT        *fft  = (Mat_FFT*)A->data;
698:   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
699:   PetscInt       N     =fft->N;
700:   PetscInt       ndim  =fft->ndim,*dim=fft->dim;
701:   PetscInt       low;
702:   PetscMPIInt    rank,size;
703:   PetscInt       vsize,vsize1;
704:   ptrdiff_t      local_n0,local_0_start;
705:   ptrdiff_t      local_n1,local_1_start;
706:   VecScatter     vecscat;
707:   IS             list1,list2;
708: #if !defined(PETSC_USE_COMPLEX)
709:   PetscInt       i,j,k,partial_dim;
710:   PetscInt       *indx1, *indx2, tempindx, tempindx1;
711:   PetscInt       NM;
712:   ptrdiff_t      temp;
713: #endif

716:   PetscObjectGetComm((PetscObject)A,&comm);
717:   MPI_Comm_size(comm, &size);
718:   MPI_Comm_rank(comm, &rank);
719:   VecGetOwnershipRange(y,&low,NULL);

721:   if (size==1) {
722:     VecGetSize(x,&vsize);
723:     VecGetSize(y,&vsize1);
724:     ISCreateStride(PETSC_COMM_SELF,N,0,1,&list1);
725:     VecScatterCreate(x,list1,y,list1,&vecscat);
726:     VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
727:     VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
728:     VecScatterDestroy(&vecscat);
729:     ISDestroy(&list1);
730:   } else {
731:     switch (ndim) {
732:     case 1:
733: #if defined(PETSC_USE_COMPLEX)
734:       fftw_mpi_local_size_1d(dim[0],comm,FFTW_FORWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);

736:       ISCreateStride(comm,local_n0,local_0_start,1,&list1);
737:       ISCreateStride(comm,local_n0,low,1,&list2);
738:       VecScatterCreate(x,list1,y,list2,&vecscat);
739:       VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
740:       VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
741:       VecScatterDestroy(&vecscat);
742:       ISDestroy(&list1);
743:       ISDestroy(&list2);
744: #else
745:       SETERRQ(comm,PETSC_ERR_SUP,"FFTW does not support parallel 1D real transform");
746: #endif
747:       break;
748:     case 2:
749: #if defined(PETSC_USE_COMPLEX)
750:       fftw_mpi_local_size_2d(dim[0],dim[1],comm,&local_n0,&local_0_start);

752:       ISCreateStride(comm,local_n0*dim[1],local_0_start*dim[1],1,&list1);
753:       ISCreateStride(comm,local_n0*dim[1],low,1,&list2);
754:       VecScatterCreate(x,list1,y,list2,&vecscat);
755:       VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
756:       VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
757:       VecScatterDestroy(&vecscat);
758:       ISDestroy(&list1);
759:       ISDestroy(&list2);
760: #else
761:       fftw_mpi_local_size_2d_transposed(dim[0],dim[1]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);

763:       PetscMalloc1(((PetscInt)local_n0)*dim[1],&indx1);
764:       PetscMalloc1(((PetscInt)local_n0)*dim[1],&indx2);

766:       if (dim[1]%2==0) {
767:         NM = dim[1]+2;
768:       } else {
769:         NM = dim[1]+1;
770:       }
771:       for (i=0; i<local_n0; i++) {
772:         for (j=0; j<dim[1]; j++) {
773:           tempindx  = i*dim[1] + j;
774:           tempindx1 = i*NM + j;

776:           indx1[tempindx]=local_0_start*dim[1]+tempindx;
777:           indx2[tempindx]=low+tempindx1;
778:         }
779:       }

781:       ISCreateGeneral(comm,local_n0*dim[1],indx1,PETSC_COPY_VALUES,&list1);
782:       ISCreateGeneral(comm,local_n0*dim[1],indx2,PETSC_COPY_VALUES,&list2);

784:       VecScatterCreate(x,list1,y,list2,&vecscat);
785:       VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
786:       VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
787:       VecScatterDestroy(&vecscat);
788:       ISDestroy(&list1);
789:       ISDestroy(&list2);
790:       PetscFree(indx1);
791:       PetscFree(indx2);
792: #endif
793:       break;

795:     case 3:
796: #if defined(PETSC_USE_COMPLEX)
797:       fftw_mpi_local_size_3d(dim[0],dim[1],dim[2],comm,&local_n0,&local_0_start);

799:       ISCreateStride(comm,local_n0*dim[1]*dim[2],local_0_start*dim[1]*dim[2],1,&list1);
800:       ISCreateStride(comm,local_n0*dim[1]*dim[2],low,1,&list2);
801:       VecScatterCreate(x,list1,y,list2,&vecscat);
802:       VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
803:       VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
804:       VecScatterDestroy(&vecscat);
805:       ISDestroy(&list1);
806:       ISDestroy(&list2);
807: #else
808:       /* buggy, needs to be fixed. See src/mat/examples/tests/ex158.c */
809:       SETERRQ(comm,PETSC_ERR_SUP,"FFTW does not support parallel 3D real transform");
810:       fftw_mpi_local_size_3d_transposed(dim[0],dim[1],dim[2]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);

812:       PetscMalloc1(((PetscInt)local_n0)*dim[1]*dim[2],&indx1);
813:       PetscMalloc1(((PetscInt)local_n0)*dim[1]*dim[2],&indx2);

815:       if (dim[2]%2==0) NM = dim[2]+2;
816:       else             NM = dim[2]+1;

818:       for (i=0; i<local_n0; i++) {
819:         for (j=0; j<dim[1]; j++) {
820:           for (k=0; k<dim[2]; k++) {
821:             tempindx  = i*dim[1]*dim[2] + j*dim[2] + k;
822:             tempindx1 = i*dim[1]*NM + j*NM + k;

824:             indx1[tempindx]=local_0_start*dim[1]*dim[2]+tempindx;
825:             indx2[tempindx]=low+tempindx1;
826:           }
827:         }
828:       }

830:       ISCreateGeneral(comm,local_n0*dim[1]*dim[2],indx1,PETSC_COPY_VALUES,&list1);
831:       ISCreateGeneral(comm,local_n0*dim[1]*dim[2],indx2,PETSC_COPY_VALUES,&list2);
832:       VecScatterCreate(x,list1,y,list2,&vecscat);
833:       VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
834:       VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
835:       VecScatterDestroy(&vecscat);
836:       ISDestroy(&list1);
837:       ISDestroy(&list2);
838:       PetscFree(indx1);
839:       PetscFree(indx2);
840: #endif
841:       break;

843:     default:
844: #if defined(PETSC_USE_COMPLEX)
845:       fftw_mpi_local_size(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start);

847:       ISCreateStride(comm,local_n0*(fftw->partial_dim),local_0_start*(fftw->partial_dim),1,&list1);
848:       ISCreateStride(comm,local_n0*(fftw->partial_dim),low,1,&list2);
849:       VecScatterCreate(x,list1,y,list2,&vecscat);
850:       VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
851:       VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
852:       VecScatterDestroy(&vecscat);
853:       ISDestroy(&list1);
854:       ISDestroy(&list2);
855: #else
856:       /* buggy, needs to be fixed. See src/mat/examples/tests/ex158.c */
857:       SETERRQ(comm,PETSC_ERR_SUP,"FFTW does not support parallel DIM>3 real transform");
858:       temp = (fftw->dim_fftw)[fftw->ndim_fftw-1];

860:       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp/2 + 1;

862:       fftw_mpi_local_size_transposed(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);

864:       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp;

866:       partial_dim = fftw->partial_dim;

868:       PetscMalloc1(((PetscInt)local_n0)*partial_dim,&indx1);
869:       PetscMalloc1(((PetscInt)local_n0)*partial_dim,&indx2);

871:       if (dim[ndim-1]%2==0) NM = 2;
872:       else                  NM = 1;

874:       j = low;
875:       for (i=0,k=1; i<((PetscInt)local_n0)*partial_dim;i++,k++) {
876:         indx1[i] = local_0_start*partial_dim + i;
877:         indx2[i] = j;
878:         if (k%dim[ndim-1]==0) j+=NM;
879:         j++;
880:       }
881:       ISCreateGeneral(comm,local_n0*partial_dim,indx1,PETSC_COPY_VALUES,&list1);
882:       ISCreateGeneral(comm,local_n0*partial_dim,indx2,PETSC_COPY_VALUES,&list2);
883:       VecScatterCreate(x,list1,y,list2,&vecscat);
884:       VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
885:       VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
886:       VecScatterDestroy(&vecscat);
887:       ISDestroy(&list1);
888:       ISDestroy(&list2);
889:       PetscFree(indx1);
890:       PetscFree(indx2);
891: #endif
892:       break;
893:     }
894:   }
895:   return(0);
896: }

900: /*@
901:    VecScatterFFTWToPetsc - Converts FFTW output to the PETSc vector.

903:    Collective on Mat

905:     Input Parameters:
906: +   A - FFTW matrix
907: -   x - FFTW vector

909:    Output Parameters:
910: .  y - PETSc vector

912:    Level: intermediate

914:    Note: While doing real transform the FFTW output of backward DFT contains extra zeros at the end of last dimension.
915:          VecScatterFFTWToPetsc removes those extra zeros.

917: .seealso: VecScatterPetscToFFTW()
918: @*/
919: PetscErrorCode VecScatterFFTWToPetsc(Mat A,Vec x,Vec y)
920: {

924:   PetscUseMethod(A,"VecScatterFFTWToPetsc_C",(Mat,Vec,Vec),(A,x,y));
925:   return(0);
926: }

930: PetscErrorCode VecScatterFFTWToPetsc_FFTW(Mat A,Vec x,Vec y)
931: {
933:   MPI_Comm       comm;
934:   Mat_FFT        *fft  = (Mat_FFT*)A->data;
935:   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
936:   PetscInt       N     = fft->N;
937:   PetscInt       ndim  = fft->ndim,*dim=fft->dim;
938:   PetscInt       low;
939:   PetscMPIInt    rank,size;
940:   ptrdiff_t      local_n0,local_0_start;
941:   ptrdiff_t      local_n1,local_1_start;
942:   VecScatter     vecscat;
943:   IS             list1,list2;
944: #if !defined(PETSC_USE_COMPLEX)
945:   PetscInt       i,j,k,partial_dim;
946:   PetscInt       *indx1, *indx2, tempindx, tempindx1;
947:   PetscInt       NM;
948:   ptrdiff_t      temp;
949: #endif

952:   PetscObjectGetComm((PetscObject)A,&comm);
953:   MPI_Comm_size(comm, &size);
954:   MPI_Comm_rank(comm, &rank);
955:   VecGetOwnershipRange(x,&low,NULL);

957:   if (size==1) {
958:     ISCreateStride(comm,N,0,1,&list1);
959:     VecScatterCreate(x,list1,y,list1,&vecscat);
960:     VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
961:     VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
962:     VecScatterDestroy(&vecscat);
963:     ISDestroy(&list1);

965:   } else {
966:     switch (ndim) {
967:     case 1:
968: #if defined(PETSC_USE_COMPLEX)
969:       fftw_mpi_local_size_1d(dim[0],comm,FFTW_BACKWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);

971:       ISCreateStride(comm,local_n1,local_1_start,1,&list1);
972:       ISCreateStride(comm,local_n1,low,1,&list2);
973:       VecScatterCreate(x,list1,y,list2,&vecscat);
974:       VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
975:       VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
976:       VecScatterDestroy(&vecscat);
977:       ISDestroy(&list1);
978:       ISDestroy(&list2);
979: #else
980:       SETERRQ(comm,PETSC_ERR_SUP,"No support for real parallel 1D FFT");
981: #endif
982:       break;
983:     case 2:
984: #if defined(PETSC_USE_COMPLEX)
985:       fftw_mpi_local_size_2d(dim[0],dim[1],comm,&local_n0,&local_0_start);

987:       ISCreateStride(comm,local_n0*dim[1],local_0_start*dim[1],1,&list1);
988:       ISCreateStride(comm,local_n0*dim[1],low,1,&list2);
989:       VecScatterCreate(x,list2,y,list1,&vecscat);
990:       VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
991:       VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
992:       VecScatterDestroy(&vecscat);
993:       ISDestroy(&list1);
994:       ISDestroy(&list2);
995: #else
996:       fftw_mpi_local_size_2d_transposed(dim[0],dim[1]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);

998:       PetscMalloc1(((PetscInt)local_n0)*dim[1],&indx1);
999:       PetscMalloc1(((PetscInt)local_n0)*dim[1],&indx2);

1001:       if (dim[1]%2==0) NM = dim[1]+2;
1002:       else             NM = dim[1]+1;

1004:       for (i=0; i<local_n0; i++) {
1005:         for (j=0; j<dim[1]; j++) {
1006:           tempindx = i*dim[1] + j;
1007:           tempindx1 = i*NM + j;

1009:           indx1[tempindx]=local_0_start*dim[1]+tempindx;
1010:           indx2[tempindx]=low+tempindx1;
1011:         }
1012:       }

1014:       ISCreateGeneral(comm,local_n0*dim[1],indx1,PETSC_COPY_VALUES,&list1);
1015:       ISCreateGeneral(comm,local_n0*dim[1],indx2,PETSC_COPY_VALUES,&list2);

1017:       VecScatterCreate(x,list2,y,list1,&vecscat);
1018:       VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
1019:       VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
1020:       VecScatterDestroy(&vecscat);
1021:       ISDestroy(&list1);
1022:       ISDestroy(&list2);
1023:       PetscFree(indx1);
1024:       PetscFree(indx2);
1025: #endif
1026:       break;
1027:     case 3:
1028: #if defined(PETSC_USE_COMPLEX)
1029:       fftw_mpi_local_size_3d(dim[0],dim[1],dim[2],comm,&local_n0,&local_0_start);

1031:       ISCreateStride(comm,local_n0*dim[1]*dim[2],local_0_start*dim[1]*dim[2],1,&list1);
1032:       ISCreateStride(comm,local_n0*dim[1]*dim[2],low,1,&list2);
1033:       VecScatterCreate(x,list1,y,list2,&vecscat);
1034:       VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
1035:       VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
1036:       VecScatterDestroy(&vecscat);
1037:       ISDestroy(&list1);
1038:       ISDestroy(&list2);
1039: #else
1040:       fftw_mpi_local_size_3d_transposed(dim[0],dim[1],dim[2]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);

1042:       PetscMalloc1(((PetscInt)local_n0)*dim[1]*dim[2],&indx1);
1043:       PetscMalloc1(((PetscInt)local_n0)*dim[1]*dim[2],&indx2);

1045:       if (dim[2]%2==0) NM = dim[2]+2;
1046:       else             NM = dim[2]+1;

1048:       for (i=0; i<local_n0; i++) {
1049:         for (j=0; j<dim[1]; j++) {
1050:           for (k=0; k<dim[2]; k++) {
1051:             tempindx  = i*dim[1]*dim[2] + j*dim[2] + k;
1052:             tempindx1 = i*dim[1]*NM + j*NM + k;

1054:             indx1[tempindx]=local_0_start*dim[1]*dim[2]+tempindx;
1055:             indx2[tempindx]=low+tempindx1;
1056:           }
1057:         }
1058:       }

1060:       ISCreateGeneral(comm,local_n0*dim[1]*dim[2],indx1,PETSC_COPY_VALUES,&list1);
1061:       ISCreateGeneral(comm,local_n0*dim[1]*dim[2],indx2,PETSC_COPY_VALUES,&list2);

1063:       VecScatterCreate(x,list2,y,list1,&vecscat);
1064:       VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
1065:       VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
1066:       VecScatterDestroy(&vecscat);
1067:       ISDestroy(&list1);
1068:       ISDestroy(&list2);
1069:       PetscFree(indx1);
1070:       PetscFree(indx2);
1071: #endif
1072:       break;
1073:     default:
1074: #if defined(PETSC_USE_COMPLEX)
1075:       fftw_mpi_local_size(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start);

1077:       ISCreateStride(comm,local_n0*(fftw->partial_dim),local_0_start*(fftw->partial_dim),1,&list1);
1078:       ISCreateStride(comm,local_n0*(fftw->partial_dim),low,1,&list2);
1079:       VecScatterCreate(x,list1,y,list2,&vecscat);
1080:       VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
1081:       VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
1082:       VecScatterDestroy(&vecscat);
1083:       ISDestroy(&list1);
1084:       ISDestroy(&list2);
1085: #else
1086:       temp = (fftw->dim_fftw)[fftw->ndim_fftw-1];

1088:       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp/2 + 1;

1090:       fftw_mpi_local_size_transposed(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);

1092:       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp;

1094:       partial_dim = fftw->partial_dim;

1096:       PetscMalloc1(((PetscInt)local_n0)*partial_dim,&indx1);
1097:       PetscMalloc1(((PetscInt)local_n0)*partial_dim,&indx2);

1099:       if (dim[ndim-1]%2==0) NM = 2;
1100:       else                  NM = 1;

1102:       j = low;
1103:       for (i=0,k=1; i<((PetscInt)local_n0)*partial_dim; i++,k++) {
1104:         indx1[i] = local_0_start*partial_dim + i;
1105:         indx2[i] = j;
1106:         if (k%dim[ndim-1]==0) j+=NM;
1107:         j++;
1108:       }
1109:       ISCreateGeneral(comm,local_n0*partial_dim,indx1,PETSC_COPY_VALUES,&list1);
1110:       ISCreateGeneral(comm,local_n0*partial_dim,indx2,PETSC_COPY_VALUES,&list2);

1112:       VecScatterCreate(x,list2,y,list1,&vecscat);
1113:       VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
1114:       VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
1115:       VecScatterDestroy(&vecscat);
1116:       ISDestroy(&list1);
1117:       ISDestroy(&list2);
1118:       PetscFree(indx1);
1119:       PetscFree(indx2);
1120: #endif
1121:       break;
1122:     }
1123:   }
1124:   return(0);
1125: }

1129: /*
1130:     MatCreate_FFTW - Creates a matrix object that provides FFT via the external package FFTW

1132:   Options Database Keys:
1133: + -mat_fftw_plannerflags - set FFTW planner flags

1135:    Level: intermediate

1137: */
1138: PETSC_EXTERN PetscErrorCode MatCreate_FFTW(Mat A)
1139: {
1141:   MPI_Comm       comm;
1142:   Mat_FFT        *fft=(Mat_FFT*)A->data;
1143:   Mat_FFTW       *fftw;
1144:   PetscInt       n=fft->n,N=fft->N,ndim=fft->ndim,*dim=fft->dim;
1145:   const char     *plans[]={"FFTW_ESTIMATE","FFTW_MEASURE","FFTW_PATIENT","FFTW_EXHAUSTIVE"};
1146:   unsigned       iplans[]={FFTW_ESTIMATE,FFTW_MEASURE,FFTW_PATIENT,FFTW_EXHAUSTIVE};
1147:   PetscBool      flg;
1148:   PetscInt       p_flag,partial_dim=1,ctr;
1149:   PetscMPIInt    size,rank;
1150:   ptrdiff_t      *pdim;
1151:   ptrdiff_t      local_n1,local_1_start;
1152: #if !defined(PETSC_USE_COMPLEX)
1153:   ptrdiff_t      temp;
1154:   PetscInt       N1,tot_dim;
1155: #else
1156:   PetscInt       n1;
1157: #endif

1160:   PetscObjectGetComm((PetscObject)A,&comm);
1161:   MPI_Comm_size(comm, &size);
1162:   MPI_Comm_rank(comm, &rank);

1164:   fftw_mpi_init();
1165:   pdim    = (ptrdiff_t*)calloc(ndim,sizeof(ptrdiff_t));
1166:   pdim[0] = dim[0];
1167: #if !defined(PETSC_USE_COMPLEX)
1168:   tot_dim = 2*dim[0];
1169: #endif
1170:   for (ctr=1; ctr<ndim; ctr++) {
1171:     partial_dim *= dim[ctr];
1172:     pdim[ctr]    = dim[ctr];
1173: #if !defined(PETSC_USE_COMPLEX)
1174:     if (ctr==ndim-1) tot_dim *= (dim[ctr]/2+1);
1175:     else             tot_dim *= dim[ctr];
1176: #endif
1177:   }

1179:   if (size == 1) {
1180: #if defined(PETSC_USE_COMPLEX)
1181:     MatSetSizes(A,N,N,N,N);
1182:     n    = N;
1183: #else
1184:     MatSetSizes(A,tot_dim,tot_dim,tot_dim,tot_dim);
1185:     n    = tot_dim;
1186: #endif

1188:   } else {
1189:     ptrdiff_t local_n0,local_0_start;
1190:     switch (ndim) {
1191:     case 1:
1192: #if !defined(PETSC_USE_COMPLEX)
1193:       SETERRQ(comm,PETSC_ERR_SUP,"FFTW does not support parallel 1D real transform");
1194: #else
1195:       fftw_mpi_local_size_1d(dim[0],comm,FFTW_FORWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);

1197:       n    = (PetscInt)local_n0;
1198:       n1   = (PetscInt)local_n1;
1199:       MatSetSizes(A,n1,n,N,N);
1200: #endif
1201:       break;
1202:     case 2:
1203: #if defined(PETSC_USE_COMPLEX)
1204:       fftw_mpi_local_size_2d(dim[0],dim[1],comm,&local_n0,&local_0_start);
1205:       /*
1206:        PetscSynchronizedPrintf(comm,"[%d] MatCreateSeqFFTW: local_n0, local_0_start %d %d, N %d,dim %d, %d\n",rank,(PetscInt)local_n0*dim[1],(PetscInt)local_0_start,m,dim[0],dim[1]);
1207:        PetscSynchronizedFlush(comm,PETSC_STDOUT);
1208:        */
1209:       n    = (PetscInt)local_n0*dim[1];
1210:       MatSetSizes(A,n,n,N,N);
1211: #else
1212:       fftw_mpi_local_size_2d_transposed(dim[0],dim[1]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);

1214:       n    = 2*(PetscInt)local_n0*(dim[1]/2+1);
1215:       MatSetSizes(A,n,n,2*dim[0]*(dim[1]/2+1),2*dim[0]*(dim[1]/2+1));
1216: #endif
1217:       break;
1218:     case 3:
1219: #if defined(PETSC_USE_COMPLEX)
1220:       fftw_mpi_local_size_3d(dim[0],dim[1],dim[2],comm,&local_n0,&local_0_start);

1222:       n    = (PetscInt)local_n0*dim[1]*dim[2];
1223:       MatSetSizes(A,n,n,N,N);
1224: #else
1225:       fftw_mpi_local_size_3d_transposed(dim[0],dim[1],dim[2]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);

1227:       n   = 2*(PetscInt)local_n0*dim[1]*(dim[2]/2+1);
1228:       MatSetSizes(A,n,n,2*dim[0]*dim[1]*(dim[2]/2+1),2*dim[0]*dim[1]*(dim[2]/2+1));
1229: #endif
1230:       break;
1231:     default:
1232: #if defined(PETSC_USE_COMPLEX)
1233:       fftw_mpi_local_size(ndim,pdim,comm,&local_n0,&local_0_start);

1235:       n    = (PetscInt)local_n0*partial_dim;
1236:       MatSetSizes(A,n,n,N,N);
1237: #else
1238:       temp = pdim[ndim-1];

1240:       pdim[ndim-1] = temp/2 + 1;

1242:       fftw_mpi_local_size_transposed(ndim,pdim,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);

1244:       n  = 2*(PetscInt)local_n0*partial_dim*pdim[ndim-1]/temp;
1245:       N1 = 2*N*(PetscInt)pdim[ndim-1]/((PetscInt) temp);

1247:       pdim[ndim-1] = temp;

1249:       MatSetSizes(A,n,n,N1,N1);
1250: #endif
1251:       break;
1252:     }
1253:   }
1254:   PetscObjectChangeTypeName((PetscObject)A,MATFFTW);
1255:   PetscNewLog(A,&fftw);
1256:   fft->data = (void*)fftw;

1258:   fft->n            = n;
1259:   fftw->ndim_fftw   = (ptrdiff_t)ndim; /* This is dimension of fft */
1260:   fftw->partial_dim = partial_dim;

1262:   PetscMalloc1(ndim, &(fftw->dim_fftw));
1263:   if (size == 1) {
1264: #if defined(PETSC_USE_64BIT_INDICES)
1265:     fftw->iodims = (fftw_iodim64 *) malloc(sizeof(fftw_iodim64) * ndim);
1266: #else
1267:     fftw->iodims = (fftw_iodim *) malloc(sizeof(fftw_iodim) * ndim);
1268: #endif
1269:   }

1271:   for (ctr=0;ctr<ndim;ctr++) (fftw->dim_fftw)[ctr]=dim[ctr];

1273:   fftw->p_forward  = 0;
1274:   fftw->p_backward = 0;
1275:   fftw->p_flag     = FFTW_ESTIMATE;

1277:   if (size == 1) {
1278:     A->ops->mult          = MatMult_SeqFFTW;
1279:     A->ops->multtranspose = MatMultTranspose_SeqFFTW;
1280:   } else {
1281:     A->ops->mult          = MatMult_MPIFFTW;
1282:     A->ops->multtranspose = MatMultTranspose_MPIFFTW;
1283:   }
1284:   fft->matdestroy = MatDestroy_FFTW;
1285:   A->assembled    = PETSC_TRUE;
1286:   A->preallocated = PETSC_TRUE;

1288:   PetscObjectComposeFunction((PetscObject)A,"MatCreateVecsFFTW_C",MatCreateVecsFFTW_FFTW);
1289:   PetscObjectComposeFunction((PetscObject)A,"VecScatterPetscToFFTW_C",VecScatterPetscToFFTW_FFTW);
1290:   PetscObjectComposeFunction((PetscObject)A,"VecScatterFFTWToPetsc_C",VecScatterFFTWToPetsc_FFTW);

1292:   /* get runtime options */
1293:   PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"FFTW Options","Mat");
1294:   PetscOptionsEList("-mat_fftw_plannerflags","Planner Flags","None",plans,4,plans[0],&p_flag,&flg);
1295:   if (flg) {
1296:     fftw->p_flag = iplans[p_flag];
1297:   }
1298:   PetscOptionsEnd();
1299:   return(0);
1300: }