Actual source code: fftw.c

petsc-3.4.5 2014-06-29
  2: /*
  3:     Provides an interface to the FFTW package.
  4:     Testing examples can be found in ~src/mat/examples/tests
  5: */

  7: #include <../src/mat/impls/fft/fft.h>   /*I "petscmat.h" I*/
  8: EXTERN_C_BEGIN
  9: #include <fftw3-mpi.h>
 10: EXTERN_C_END

/* Private context for a MATFFTW matrix; hangs off Mat_FFT->data. */
 12: typedef struct {
      /* FFTW-MPI wants dimensions as ptrdiff_t; ndim_fftw/dim_fftw mirror fft->ndim/fft->dim */
 13:   ptrdiff_t   ndim_fftw,*dim_fftw;
      /* product of all but the first (MPI-distributed) dimension -- used to build scatters */
 14:   PetscInt    partial_dim;
      /* plans are created lazily on the first MatMult / MatMultTranspose */
 15:   fftw_plan   p_forward,p_backward;
 16:   unsigned    p_flag; /* planner flags, FFTW_ESTIMATE,FFTW_MEASURE, FFTW_PATIENT, FFTW_EXHAUSTIVE */
 17:   PetscScalar *finarray,*foutarray,*binarray,*boutarray; /* keep track of arrays because fftw plan should be
 18:                                                             executed for the arrays with which the plan was created */
 19: } Mat_FFTW;

/* Forward declarations of the FFTW implementations of the Mat/Vec operations defined below */
 21: extern PetscErrorCode MatMult_SeqFFTW(Mat,Vec,Vec);
 22: extern PetscErrorCode MatMultTranspose_SeqFFTW(Mat,Vec,Vec);
 23: extern PetscErrorCode MatMult_MPIFFTW(Mat,Vec,Vec);
 24: extern PetscErrorCode MatMultTranspose_MPIFFTW(Mat,Vec,Vec);
 25: extern PetscErrorCode MatDestroy_FFTW(Mat);
 26: extern PetscErrorCode VecDestroy_MPIFFTW(Vec);

 28: /* MatMult_SeqFFTW performs forward DFT in parallel
 29:    Input parameter:
 30:      A - the matrix
 31:      x - the vector on which FDFT will be performed

 33:    Output parameter:
 34:      y - vector that stores result of FDFT
 35: */
 38: PetscErrorCode MatMult_SeqFFTW(Mat A,Vec x,Vec y)
 39: {
 41:   Mat_FFT        *fft  = (Mat_FFT*)A->data;
 42:   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
 43:   PetscScalar    *x_array,*y_array;
 44:   PetscInt       ndim=fft->ndim,*dim=fft->dim;

 47:   VecGetArray(x,&x_array);
 48:   VecGetArray(y,&y_array);
 49:   if (!fftw->p_forward) { /* create a plan, then excute it */
 50:     switch (ndim) {
 51:     case 1:
 52: #if defined(PETSC_USE_COMPLEX)
 53:       fftw->p_forward = fftw_plan_dft_1d(dim[0],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
 54: #else
 55:       fftw->p_forward = fftw_plan_dft_r2c_1d(dim[0],(double*)x_array,(fftw_complex*)y_array,fftw->p_flag);
 56: #endif
 57:       break;
 58:     case 2:
 59: #if defined(PETSC_USE_COMPLEX)
 60:       fftw->p_forward = fftw_plan_dft_2d(dim[0],dim[1],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
 61: #else
 62:       fftw->p_forward = fftw_plan_dft_r2c_2d(dim[0],dim[1],(double*)x_array,(fftw_complex*)y_array,fftw->p_flag);
 63: #endif
 64:       break;
 65:     case 3:
 66: #if defined(PETSC_USE_COMPLEX)
 67:       fftw->p_forward = fftw_plan_dft_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
 68: #else
 69:       fftw->p_forward = fftw_plan_dft_r2c_3d(dim[0],dim[1],dim[2],(double*)x_array,(fftw_complex*)y_array,fftw->p_flag);
 70: #endif
 71:       break;
 72:     default:
 73: #if defined(PETSC_USE_COMPLEX)
 74:       fftw->p_forward = fftw_plan_dft(ndim,dim,(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
 75: #else
 76:       fftw->p_forward = fftw_plan_dft_r2c(ndim,dim,(double*)x_array,(fftw_complex*)y_array,fftw->p_flag);
 77: #endif
 78:       break;
 79:     }
 80:     fftw->finarray  = x_array;
 81:     fftw->foutarray = y_array;
 82:     /* Warning: if (fftw->p_flag!==FFTW_ESTIMATE) The data in the in/out arrays is overwritten!
 83:                 planning should be done before x is initialized! See FFTW manual sec2.1 or sec4 */
 84:     fftw_execute(fftw->p_forward);
 85:   } else { /* use existing plan */
 86:     if (fftw->finarray != x_array || fftw->foutarray != y_array) { /* use existing plan on new arrays */
 87:       fftw_execute_dft(fftw->p_forward,(fftw_complex*)x_array,(fftw_complex*)y_array);
 88:     } else {
 89:       fftw_execute(fftw->p_forward);
 90:     }
 91:   }
 92:   VecRestoreArray(y,&y_array);
 93:   VecRestoreArray(x,&x_array);
 94:   return(0);
 95: }

 97: /* MatMultTranspose_SeqFFTW performs serial backward DFT
 98:    Input parameter:
 99:      A - the matrix
100:      x - the vector on which BDFT will be performed

102:    Output parameter:
103:      y - vector that stores result of BDFT
104: */

108: PetscErrorCode MatMultTranspose_SeqFFTW(Mat A,Vec x,Vec y)
109: {
111:   Mat_FFT        *fft  = (Mat_FFT*)A->data;
112:   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
113:   PetscScalar    *x_array,*y_array;
114:   PetscInt       ndim=fft->ndim,*dim=fft->dim;

117:   VecGetArray(x,&x_array);
118:   VecGetArray(y,&y_array);
119:   if (!fftw->p_backward) { /* create a plan, then excute it */
120:     switch (ndim) {
121:     case 1:
122: #if defined(PETSC_USE_COMPLEX)
123:       fftw->p_backward = fftw_plan_dft_1d(dim[0],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
124: #else
125:       fftw->p_backward= fftw_plan_dft_c2r_1d(dim[0],(fftw_complex*)x_array,(double*)y_array,fftw->p_flag);
126: #endif
127:       break;
128:     case 2:
129: #if defined(PETSC_USE_COMPLEX)
130:       fftw->p_backward = fftw_plan_dft_2d(dim[0],dim[1],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
131: #else
132:       fftw->p_backward= fftw_plan_dft_c2r_2d(dim[0],dim[1],(fftw_complex*)x_array,(double*)y_array,fftw->p_flag);
133: #endif
134:       break;
135:     case 3:
136: #if defined(PETSC_USE_COMPLEX)
137:       fftw->p_backward = fftw_plan_dft_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
138: #else
139:       fftw->p_backward= fftw_plan_dft_c2r_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(double*)y_array,fftw->p_flag);
140: #endif
141:       break;
142:     default:
143: #if defined(PETSC_USE_COMPLEX)
144:       fftw->p_backward = fftw_plan_dft(ndim,dim,(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
145: #else
146:       fftw->p_backward= fftw_plan_dft_c2r(ndim,dim,(fftw_complex*)x_array,(double*)y_array,fftw->p_flag);
147: #endif
148:       break;
149:     }
150:     fftw->binarray  = x_array;
151:     fftw->boutarray = y_array;
152:     fftw_execute(fftw->p_backward);
153:   } else { /* use existing plan */
154:     if (fftw->binarray != x_array || fftw->boutarray != y_array) { /* use existing plan on new arrays */
155:       fftw_execute_dft(fftw->p_backward,(fftw_complex*)x_array,(fftw_complex*)y_array);
156:     } else {
157:       fftw_execute(fftw->p_backward);
158:     }
159:   }
160:   VecRestoreArray(y,&y_array);
161:   VecRestoreArray(x,&x_array);
162:   return(0);
163: }

165: /* MatMult_MPIFFTW performs forward DFT in parallel
166:    Input parameter:
167:      A - the matrix
168:      x - the vector on which FDFT will be performed

170:    Output parameter:
171:    y   - vector that stores result of FDFT
172: */
175: PetscErrorCode MatMult_MPIFFTW(Mat A,Vec x,Vec y)
176: {
178:   Mat_FFT        *fft  = (Mat_FFT*)A->data;
179:   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
180:   PetscScalar    *x_array,*y_array;
181:   PetscInt       ndim=fft->ndim,*dim=fft->dim;
182:   MPI_Comm       comm;

185:   PetscObjectGetComm((PetscObject)A,&comm);
186:   VecGetArray(x,&x_array);
187:   VecGetArray(y,&y_array);
188:   if (!fftw->p_forward) { /* create a plan, then excute it */
189:     switch (ndim) {
190:     case 1:
191: #if defined(PETSC_USE_COMPLEX)
192:       fftw->p_forward = fftw_mpi_plan_dft_1d(dim[0],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_FORWARD,fftw->p_flag);
193: #else
194:       SETERRQ(comm,PETSC_ERR_SUP,"not support for real numbers yet");
195: #endif
196:       break;
197:     case 2:
198: #if defined(PETSC_USE_COMPLEX) /* For complex transforms call fftw_mpi_plan_dft, for real transforms call fftw_mpi_plan_dft_r2c */
199:       fftw->p_forward = fftw_mpi_plan_dft_2d(dim[0],dim[1],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_FORWARD,fftw->p_flag);
200: #else
201:       fftw->p_forward = fftw_mpi_plan_dft_r2c_2d(dim[0],dim[1],(double*)x_array,(fftw_complex*)y_array,comm,FFTW_ESTIMATE);
202: #endif
203:       break;
204:     case 3:
205: #if defined(PETSC_USE_COMPLEX)
206:       fftw->p_forward = fftw_mpi_plan_dft_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_FORWARD,fftw->p_flag);
207: #else
208:       fftw->p_forward = fftw_mpi_plan_dft_r2c_3d(dim[0],dim[1],dim[2],(double*)x_array,(fftw_complex*)y_array,comm,FFTW_ESTIMATE);
209: #endif
210:       break;
211:     default:
212: #if defined(PETSC_USE_COMPLEX)
213:       fftw->p_forward = fftw_mpi_plan_dft(fftw->ndim_fftw,fftw->dim_fftw,(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_FORWARD,fftw->p_flag);
214: #else
215:       fftw->p_forward = fftw_mpi_plan_dft_r2c(fftw->ndim_fftw,fftw->dim_fftw,(double*)x_array,(fftw_complex*)y_array,comm,FFTW_ESTIMATE);
216: #endif
217:       break;
218:     }
219:     fftw->finarray  = x_array;
220:     fftw->foutarray = y_array;
221:     /* Warning: if (fftw->p_flag!==FFTW_ESTIMATE) The data in the in/out arrays is overwritten!
222:                 planning should be done before x is initialized! See FFTW manual sec2.1 or sec4 */
223:     fftw_execute(fftw->p_forward);
224:   } else { /* use existing plan */
225:     if (fftw->finarray != x_array || fftw->foutarray != y_array) { /* use existing plan on new arrays */
226:       fftw_execute_dft(fftw->p_forward,(fftw_complex*)x_array,(fftw_complex*)y_array);
227:     } else {
228:       fftw_execute(fftw->p_forward);
229:     }
230:   }
231:   VecRestoreArray(y,&y_array);
232:   VecRestoreArray(x,&x_array);
233:   return(0);
234: }

236: /* MatMultTranspose_MPIFFTW performs parallel backward DFT
237:    Input parameter:
238:      A - the matrix
239:      x - the vector on which BDFT will be performed

241:    Output parameter:
242:      y - vector that stores result of BDFT
243: */
246: PetscErrorCode MatMultTranspose_MPIFFTW(Mat A,Vec x,Vec y)
247: {
249:   Mat_FFT        *fft  = (Mat_FFT*)A->data;
250:   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
251:   PetscScalar    *x_array,*y_array;
252:   PetscInt       ndim=fft->ndim,*dim=fft->dim;
253:   MPI_Comm       comm;

256:   PetscObjectGetComm((PetscObject)A,&comm);
257:   VecGetArray(x,&x_array);
258:   VecGetArray(y,&y_array);
259:   if (!fftw->p_backward) { /* create a plan, then excute it */
260:     switch (ndim) {
261:     case 1:
262: #if defined(PETSC_USE_COMPLEX)
263:       fftw->p_backward = fftw_mpi_plan_dft_1d(dim[0],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_BACKWARD,fftw->p_flag);
264: #else
265:       SETERRQ(comm,PETSC_ERR_SUP,"not support for real numbers yet");
266: #endif
267:       break;
268:     case 2:
269: #if defined(PETSC_USE_COMPLEX) /* For complex transforms call fftw_mpi_plan_dft with flag FFTW_BACKWARD, for real transforms call fftw_mpi_plan_dft_c2r */
270:       fftw->p_backward = fftw_mpi_plan_dft_2d(dim[0],dim[1],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_BACKWARD,fftw->p_flag);
271: #else
272:       fftw->p_backward = fftw_mpi_plan_dft_c2r_2d(dim[0],dim[1],(fftw_complex*)x_array,(double*)y_array,comm,FFTW_ESTIMATE);
273: #endif
274:       break;
275:     case 3:
276: #if defined(PETSC_USE_COMPLEX)
277:       fftw->p_backward = fftw_mpi_plan_dft_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_BACKWARD,fftw->p_flag);
278: #else
279:       fftw->p_backward = fftw_mpi_plan_dft_c2r_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(double*)y_array,comm,FFTW_ESTIMATE);
280: #endif
281:       break;
282:     default:
283: #if defined(PETSC_USE_COMPLEX)
284:       fftw->p_backward = fftw_mpi_plan_dft(fftw->ndim_fftw,fftw->dim_fftw,(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_BACKWARD,fftw->p_flag);
285: #else
286:       fftw->p_backward = fftw_mpi_plan_dft_c2r(fftw->ndim_fftw,fftw->dim_fftw,(fftw_complex*)x_array,(double*)y_array,comm,FFTW_ESTIMATE);
287: #endif
288:       break;
289:     }
290:     fftw->binarray  = x_array;
291:     fftw->boutarray = y_array;
292:     fftw_execute(fftw->p_backward);
293:   } else { /* use existing plan */
294:     if (fftw->binarray != x_array || fftw->boutarray != y_array) { /* use existing plan on new arrays */
295:       fftw_execute_dft(fftw->p_backward,(fftw_complex*)x_array,(fftw_complex*)y_array);
296:     } else {
297:       fftw_execute(fftw->p_backward);
298:     }
299:   }
300:   VecRestoreArray(y,&y_array);
301:   VecRestoreArray(x,&x_array);
302:   return(0);
303: }

307: PetscErrorCode MatDestroy_FFTW(Mat A)
308: {
309:   Mat_FFT        *fft  = (Mat_FFT*)A->data;
310:   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;

314:   fftw_destroy_plan(fftw->p_forward);
315:   fftw_destroy_plan(fftw->p_backward);
316:   PetscFree(fftw->dim_fftw);
317:   PetscFree(fft->data);
318:   fftw_mpi_cleanup();
319:   return(0);
320: }

322: #include <../src/vec/vec/impls/mpi/pvecimpl.h>   /*I  "petscvec.h"   I*/
325: PetscErrorCode VecDestroy_MPIFFTW(Vec v)
326: {
328:   PetscScalar    *array;

331:   VecGetArray(v,&array);
332:   fftw_free((fftw_complex*)array);
333:   VecRestoreArray(v,&array);
334:   VecDestroy_MPI(v);
335:   return(0);
336: }

340: /*@
341:    MatGetVecsFFTW - Get vector(s) compatible with the matrix, i.e. with the
342:      parallel layout determined by FFTW
 
344:    Collective on Mat
 
346:    Input Parameter:
347: .   A - the matrix
 
349:    Output Parameters:
350: +   x - (optional) input vector of forward FFTW
351: .   y - (optional) output vector of forward FFTW
352: -   z - (optional) output vector of backward FFTW
 
354:   Level: advanced
 
356:   Note: The parallel layout of output of forward FFTW is always same as the input
357:         of the backward FFTW. But parallel layout of the input vector of forward
358:         FFTW might not be same as the output of backward FFTW.
359:         Also note that we need to provide enough space while doing parallel real transform.
360:         We need to pad extra zeros at the end of last dimension. For this reason one needs to
361:         invoke the fftw_mpi_local_size_transposed routine. Remember one has to change the
362:         last dimension from n to n/2+1 while invoking this routine. The number of zeros to be padded
363:         depends on if the last dimension is even or odd. If the last dimension is even need to pad two
364:         zeros if it is odd only one zero is needed.
365:         Lastly one needs some scratch space at the end of data set in each process. alloc_local
366:         figures out how much space is needed, i.e. it figures out the data+scratch space for
367:         each processor and returns that.
 
369: .seealso: MatCreateFFTW()
370: @*/
371: PetscErrorCode MatGetVecsFFTW(Mat A,Vec *x,Vec *y,Vec *z)
372: {
 
      /* Dispatch to the implementation (MatGetVecsFFTW_FFTW) composed on A; a no-op
         if the method was never composed on this Mat type */
376:   PetscTryMethod(A,"MatGetVecsFFTW_C",(Mat,Vec*,Vec*,Vec*),(A,x,y,z));
377:   return(0);
378: }

/* MatGetVecsFFTW_FFTW - creates work vectors whose local sizes and layouts match what
   FFTW expects for this matrix.  Sequential: plain Seq vectors.  Parallel: the storage
   is obtained with fftw_malloc() (so the VecDestroy_MPIFFTW hook can fftw_free it) and
   sized by the fftw_mpi_local_size_* routines, whose return value includes FFTW's
   per-process scratch space. */
382: PetscErrorCode  MatGetVecsFFTW_FFTW(Mat A,Vec *fin,Vec *fout,Vec *bout)
383: {
385:   PetscMPIInt    size,rank;
386:   MPI_Comm       comm;
387:   Mat_FFT        *fft  = (Mat_FFT*)A->data;
388:   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
389:   PetscInt       N     = fft->N;
390:   PetscInt       ndim  = fft->ndim,*dim=fft->dim,n=fft->n;

395:   PetscObjectGetComm((PetscObject)A,&comm);

397:   MPI_Comm_size(comm, &size);
398:   MPI_Comm_rank(comm, &rank);
399:   if (size == 1) { /* sequential case */
      /* complex build: global length N; real build: the (padded) length n set at Mat creation */
400: #if defined(PETSC_USE_COMPLEX)
401:     if (fin) {VecCreateSeq(PETSC_COMM_SELF,N,fin);}
402:     if (fout) {VecCreateSeq(PETSC_COMM_SELF,N,fout);}
403:     if (bout) {VecCreateSeq(PETSC_COMM_SELF,N,bout);}
404: #else
405:     if (fin) {VecCreateSeq(PETSC_COMM_SELF,n,fin);}
406:     if (fout) {VecCreateSeq(PETSC_COMM_SELF,n,fout);}
407:     if (bout) {VecCreateSeq(PETSC_COMM_SELF,n,bout);}
408: #endif
409:   } else { /* parallel cases */
410:     ptrdiff_t    alloc_local,local_n0,local_0_start;
411:     ptrdiff_t    local_n1;
412:     fftw_complex *data_fout;
413:     ptrdiff_t    local_1_start;
414: #if defined(PETSC_USE_COMPLEX)
415:     fftw_complex *data_fin,*data_bout;
416: #else
417:     double    *data_finr,*data_boutr;
418:     PetscInt  n1,N1;
419:     ptrdiff_t temp;
420: #endif

422:     switch (ndim) {
423:     case 1:
424: #if !defined(PETSC_USE_COMPLEX)
425:       SETERRQ(comm,PETSC_ERR_SUP,"FFTW does not allow parallel real 1D transform");
426: #else
      /* 1D: forward and backward transforms may distribute input/output differently,
         hence two separate local-size queries below */
427:       alloc_local = fftw_mpi_local_size_1d(dim[0],comm,FFTW_FORWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
428:       if (fin) {
429:         data_fin  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
430:         VecCreateMPIWithArray(comm,1,local_n0,N,(const PetscScalar*)data_fin,fin);

432:         (*fin)->ops->destroy = VecDestroy_MPIFFTW;
433:       }
434:       if (fout) {
435:         data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
436:         VecCreateMPIWithArray(comm,1,local_n1,N,(const PetscScalar*)data_fout,fout);

438:         (*fout)->ops->destroy = VecDestroy_MPIFFTW;
439:       }
440:       alloc_local = fftw_mpi_local_size_1d(dim[0],comm,FFTW_BACKWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
441:       if (bout) {
442:         data_bout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
443:         VecCreateMPIWithArray(comm,1,local_n1,N,(const PetscScalar*)data_bout,bout);

445:         (*bout)->ops->destroy = VecDestroy_MPIFFTW;
446:       }
447:       break;
448: #endif
449:     case 2:
450: #if !defined(PETSC_USE_COMPLEX) /* Note that N1 is no more the product of individual dimensions */
      /* real 2D: last dimension is padded to 2*(dim[1]/2+1) doubles per row */
451:       alloc_local =  fftw_mpi_local_size_2d_transposed(dim[0],dim[1]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
452:       N1          = 2*dim[0]*(dim[1]/2+1); n1 = 2*local_n0*(dim[1]/2+1);
453:       if (fin) {
454:         data_finr = (double*)fftw_malloc(sizeof(double)*alloc_local*2);
455:         VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_finr,fin);

457:         (*fin)->ops->destroy = VecDestroy_MPIFFTW;
458:       }
459:       if (bout) {
460:         data_boutr = (double*)fftw_malloc(sizeof(double)*alloc_local*2);
461:         VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_boutr,bout);

463:         (*bout)->ops->destroy = VecDestroy_MPIFFTW;
464:       }
465:       if (fout) {
466:         data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
467:         VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_fout,fout);

469:         (*fout)->ops->destroy = VecDestroy_MPIFFTW;
470:       }
471: #else
472:       /* Get local size */
473:       alloc_local = fftw_mpi_local_size_2d(dim[0],dim[1],comm,&local_n0,&local_0_start);
474:       if (fin) {
475:         data_fin = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
476:         VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fin,fin);

478:         (*fin)->ops->destroy = VecDestroy_MPIFFTW;
479:       }
480:       if (fout) {
481:         data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
482:         VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fout,fout);

484:         (*fout)->ops->destroy = VecDestroy_MPIFFTW;
485:       }
486:       if (bout) {
487:         data_bout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
488:         VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_bout,bout);

490:         (*bout)->ops->destroy = VecDestroy_MPIFFTW;
491:       }
492: #endif
493:       break;
494:     case 3:
495: #if !defined(PETSC_USE_COMPLEX)
      /* real 3D: same padding idea as 2D, applied to the last dimension */
496:       alloc_local =  fftw_mpi_local_size_3d_transposed(dim[0],dim[1],dim[2]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
497:       N1 = 2*dim[0]*dim[1]*(dim[2]/2+1); n1 = 2*local_n0*dim[1]*(dim[2]/2+1);
498:       if (fin) {
499:         data_finr = (double*)fftw_malloc(sizeof(double)*alloc_local*2);
500:         VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_finr,fin);

502:         (*fin)->ops->destroy = VecDestroy_MPIFFTW;
503:       }
504:       if (bout) {
505:         data_boutr=(double*)fftw_malloc(sizeof(double)*alloc_local*2);
506:         VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_boutr,bout);

508:         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
509:       }

511:       if (fout) {
512:         data_fout=(fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
513:         VecCreateMPIWithArray(comm,1,n1,N1,(PetscScalar*)data_fout,fout);

515:         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
516:       }
517: #else
518:       alloc_local = fftw_mpi_local_size_3d(dim[0],dim[1],dim[2],comm,&local_n0,&local_0_start);
519:       if (fin) {
520:         data_fin  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
521:         VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fin,fin);

523:         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
524:       }
525:       if (fout) {
526:         data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
527:         VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fout,fout);

529:         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
530:       }
531:       if (bout) {
532:         data_bout  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
533:         VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_bout,bout);

535:         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
536:       }
537: #endif
538:       break;
539:     default:
540: #if !defined(PETSC_USE_COMPLEX)
      /* real N-D: temporarily replace the last entry of dim_fftw with its padded value
         (temp/2+1) for the local-size query, then restore it */
541:       temp = (fftw->dim_fftw)[fftw->ndim_fftw-1];

543:       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp/2 + 1;

545:       alloc_local = fftw_mpi_local_size_transposed(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
546:       N1          = 2*N*(PetscInt)((fftw->dim_fftw)[fftw->ndim_fftw-1])/((PetscInt) temp);

548:       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp;

550:       if (fin) {
551:         data_finr=(double*)fftw_malloc(sizeof(double)*alloc_local*2);
552:         VecCreateMPIWithArray(comm,1,(PetscInt)n,N1,(PetscScalar*)data_finr,fin);

554:         (*fin)->ops->destroy = VecDestroy_MPIFFTW;
555:       }
556:       if (bout) {
557:         data_boutr=(double*)fftw_malloc(sizeof(double)*alloc_local*2);
558:         VecCreateMPIWithArray(comm,1,(PetscInt)n,N1,(PetscScalar*)data_boutr,bout);

560:         (*bout)->ops->destroy = VecDestroy_MPIFFTW;
561:       }

563:       if (fout) {
564:         data_fout=(fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
565:         VecCreateMPIWithArray(comm,1,n,N1,(PetscScalar*)data_fout,fout);

567:         (*fout)->ops->destroy = VecDestroy_MPIFFTW;
568:       }
569: #else
570:       alloc_local = fftw_mpi_local_size(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start);
571:       if (fin) {
572:         data_fin  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
573:         VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fin,fin);

575:         (*fin)->ops->destroy = VecDestroy_MPIFFTW;
576:       }
577:       if (fout) {
578:         data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
579:         VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fout,fout);

581:         (*fout)->ops->destroy = VecDestroy_MPIFFTW;
582:       }
583:       if (bout) {
584:         data_bout  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
585:         VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_bout,bout);

587:         (*bout)->ops->destroy = VecDestroy_MPIFFTW;
588:       }
589: #endif
590:       break;
591:     }
592:   }
593:   return(0);
594: }

598: /*@
599:    VecScatterPetscToFFTW - Copies the PETSc vector to the vector that goes into FFTW block.

601:    Collective on Mat

603:    Input Parameters:
604: +  A - FFTW matrix
605: -  x - the PETSc vector

607:    Output Parameters:
608: .  y - the FFTW vector

610:   Options Database Keys:
611: . -mat_fftw_plannerflags - set FFTW planner flags

613:    Level: intermediate

615:    Note: For real parallel FFT, FFTW requires insertion of extra space at the end of last dimension. This is required even when
616:          one is not doing in-place transform. The last dimension size must be changed to 2*(dim[last]/2+1) to accommodate these extra
617:          zeros. This routine does that job by a scattering operation.

619: .seealso: VecScatterFFTWToPetsc()
620: @*/
621: PetscErrorCode VecScatterPetscToFFTW(Mat A,Vec x,Vec y)
622: {

      /* Dispatch to VecScatterPetscToFFTW_FFTW composed on A; a no-op otherwise */
626:   PetscTryMethod(A,"VecScatterPetscToFFTW_C",(Mat,Vec,Vec),(A,x,y));
627:   return(0);
628: }

/* VecScatterPetscToFFTW_FFTW - builds and applies a VecScatter that copies the
   natural-layout PETSc vector x into the FFTW-layout vector y.  In the real
   (non-complex) parallel branches the last dimension of y is padded to
   2*(dim[last]/2+1) doubles per row, so the target index for each source entry
   skips the padding slots at the end of every row. */
632: PetscErrorCode VecScatterPetscToFFTW_FFTW(Mat A,Vec x,Vec y)
633: {
635:   MPI_Comm       comm;
636:   Mat_FFT        *fft  = (Mat_FFT*)A->data;
637:   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
638:   PetscInt       N     =fft->N;
639:   PetscInt       ndim  =fft->ndim,*dim=fft->dim;
640:   PetscInt       low;
641:   PetscInt       rank,size,vsize,vsize1;
642:   ptrdiff_t      alloc_local,local_n0,local_0_start;
643:   ptrdiff_t      local_n1,local_1_start;
644:   VecScatter     vecscat;
645:   IS             list1,list2;
646: #if !defined(PETSC_USE_COMPLEX)
647:   PetscInt       i,j,k,partial_dim;
648:   PetscInt       *indx1, *indx2, tempindx, tempindx1;
649:   PetscInt       N1, n1,NM;
650:   ptrdiff_t      temp;
651: #endif

654:   PetscObjectGetComm((PetscObject)A,&comm);
655:   MPI_Comm_size(comm, &size);
656:   MPI_Comm_rank(comm, &rank);
      /* low = first global index owned by this process in the destination vector y */
657:   VecGetOwnershipRange(y,&low,NULL);

659:   if (size==1) {
      /* NOTE(review): vsize/vsize1 are fetched but never used below -- looks vestigial */
660:     VecGetSize(x,&vsize);
661:     VecGetSize(y,&vsize1);
662:     ISCreateStride(PETSC_COMM_SELF,N,0,1,&list1);
663:     VecScatterCreate(x,list1,y,list1,&vecscat);
664:     VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
665:     VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
666:     VecScatterDestroy(&vecscat);
667:     ISDestroy(&list1);
668:   } else {
669:     switch (ndim) {
670:     case 1:
671: #if defined(PETSC_USE_COMPLEX)
      /* local-size query is used only for local_n0/local_0_start here */
672:       alloc_local = fftw_mpi_local_size_1d(dim[0],comm,FFTW_FORWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);

674:       ISCreateStride(comm,local_n0,local_0_start,1,&list1);
675:       ISCreateStride(comm,local_n0,low,1,&list2);
676:       VecScatterCreate(x,list1,y,list2,&vecscat);
677:       VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
678:       VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
679:       VecScatterDestroy(&vecscat);
680:       ISDestroy(&list1);
681:       ISDestroy(&list2);
682: #else
683:       SETERRQ(comm,PETSC_ERR_SUP,"FFTW does not support parallel 1D real transform");
684: #endif
685:       break;
686:     case 2:
687: #if defined(PETSC_USE_COMPLEX)
688:       alloc_local = fftw_mpi_local_size_2d(dim[0],dim[1],comm,&local_n0,&local_0_start);

690:       ISCreateStride(comm,local_n0*dim[1],local_0_start*dim[1],1,&list1);
691:       ISCreateStride(comm,local_n0*dim[1],low,1,&list2);
692:       VecScatterCreate(x,list1,y,list2,&vecscat);
693:       VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
694:       VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
695:       VecScatterDestroy(&vecscat);
696:       ISDestroy(&list1);
697:       ISDestroy(&list2);
698: #else
699:       alloc_local =  fftw_mpi_local_size_2d_transposed(dim[0],dim[1]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);

701:       N1   = 2*dim[0]*(dim[1]/2+1); n1 = 2*local_n0*(dim[1]/2+1);
702:       PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*dim[1],&indx1);
703:       PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*dim[1],&indx2);

      /* NM = padded row length in doubles: dim[1] plus 2 (even) or 1 (odd) padding slots */
705:       if (dim[1]%2==0) {
706:         NM = dim[1]+2;
707:       } else {
708:         NM = dim[1]+1;
709:       }
      /* indx1 = natural (unpadded) global source indices; indx2 = padded target indices */
710:       for (i=0; i<local_n0; i++) {
711:         for (j=0; j<dim[1]; j++) {
712:           tempindx  = i*dim[1] + j;
713:           tempindx1 = i*NM + j;

715:           indx1[tempindx]=local_0_start*dim[1]+tempindx;
716:           indx2[tempindx]=low+tempindx1;
717:         }
718:       }

720:       ISCreateGeneral(comm,local_n0*dim[1],indx1,PETSC_COPY_VALUES,&list1);
721:       ISCreateGeneral(comm,local_n0*dim[1],indx2,PETSC_COPY_VALUES,&list2);

723:       VecScatterCreate(x,list1,y,list2,&vecscat);
724:       VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
725:       VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
726:       VecScatterDestroy(&vecscat);
727:       ISDestroy(&list1);
728:       ISDestroy(&list2);
729:       PetscFree(indx1);
730:       PetscFree(indx2);
731: #endif
732:       break;

734:     case 3:
735: #if defined(PETSC_USE_COMPLEX)
736:       alloc_local = fftw_mpi_local_size_3d(dim[0],dim[1],dim[2],comm,&local_n0,&local_0_start);

738:       ISCreateStride(comm,local_n0*dim[1]*dim[2],local_0_start*dim[1]*dim[2],1,&list1);
739:       ISCreateStride(comm,local_n0*dim[1]*dim[2],low,1,&list2);
740:       VecScatterCreate(x,list1,y,list2,&vecscat);
741:       VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
742:       VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
743:       VecScatterDestroy(&vecscat);
744:       ISDestroy(&list1);
745:       ISDestroy(&list2);
746: #else
747:       alloc_local =  fftw_mpi_local_size_3d_transposed(dim[0],dim[1],dim[2]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);

749:       N1 = 2*dim[0]*dim[1]*(dim[2]/2+1); n1 = 2*local_n0*dim[1]*(dim[2]/2+1);

751:       PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*dim[1]*dim[2],&indx1);
752:       PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*dim[1]*dim[2],&indx2);

      /* padded length of the last dimension, as in the 2D real case */
754:       if (dim[2]%2==0) NM = dim[2]+2;
755:       else             NM = dim[2]+1;

757:       for (i=0; i<local_n0; i++) {
758:         for (j=0; j<dim[1]; j++) {
759:           for (k=0; k<dim[2]; k++) {
760:             tempindx  = i*dim[1]*dim[2] + j*dim[2] + k;
761:             tempindx1 = i*dim[1]*NM + j*NM + k;

763:             indx1[tempindx]=local_0_start*dim[1]*dim[2]+tempindx;
764:             indx2[tempindx]=low+tempindx1;
765:           }
766:         }
767:       }

769:       ISCreateGeneral(comm,local_n0*dim[1]*dim[2],indx1,PETSC_COPY_VALUES,&list1);
770:       ISCreateGeneral(comm,local_n0*dim[1]*dim[2],indx2,PETSC_COPY_VALUES,&list2);
771:       VecScatterCreate(x,list1,y,list2,&vecscat);
772:       VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
773:       VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
774:       VecScatterDestroy(&vecscat);
775:       ISDestroy(&list1);
776:       ISDestroy(&list2);
777:       PetscFree(indx1);
778:       PetscFree(indx2);
779: #endif
780:       break;

782:     default:
783: #if defined(PETSC_USE_COMPLEX)
784:       alloc_local = fftw_mpi_local_size(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start);

786:       ISCreateStride(comm,local_n0*(fftw->partial_dim),local_0_start*(fftw->partial_dim),1,&list1);
787:       ISCreateStride(comm,local_n0*(fftw->partial_dim),low,1,&list2);
788:       VecScatterCreate(x,list1,y,list2,&vecscat);
789:       VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
790:       VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
791:       VecScatterDestroy(&vecscat);
792:       ISDestroy(&list1);
793:       ISDestroy(&list2);
794: #else
      /* N-D real case: temporarily swap in the padded last dimension for the
         local-size query, then restore it (same trick as MatGetVecsFFTW_FFTW) */
795:       temp = (fftw->dim_fftw)[fftw->ndim_fftw-1];

797:       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp/2 + 1;

799:       alloc_local = fftw_mpi_local_size_transposed(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);

801:       N1 = 2*N*(PetscInt)((fftw->dim_fftw)[fftw->ndim_fftw-1])/((PetscInt) temp);

803:       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp;

805:       partial_dim = fftw->partial_dim;

807:       PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*partial_dim,&indx1);
808:       PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*partial_dim,&indx2);

      /* NM = number of padding slots to skip after each last-dimension row */
810:       if (dim[ndim-1]%2==0) NM = 2;
811:       else                  NM = 1;

      /* j walks the padded target indices, jumping NM extra slots at each row end */
813:       j = low;
814:       for (i=0,k=1; i<((PetscInt)local_n0)*partial_dim;i++,k++) {
815:         indx1[i] = local_0_start*partial_dim + i;
816:         indx2[i] = j;
817:         if (k%dim[ndim-1]==0) j+=NM;
818:         j++;
819:       }
820:       ISCreateGeneral(comm,local_n0*partial_dim,indx1,PETSC_COPY_VALUES,&list1);
821:       ISCreateGeneral(comm,local_n0*partial_dim,indx2,PETSC_COPY_VALUES,&list2);
822:       VecScatterCreate(x,list1,y,list2,&vecscat);
823:       VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
824:       VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
825:       VecScatterDestroy(&vecscat);
826:       ISDestroy(&list1);
827:       ISDestroy(&list2);
828:       PetscFree(indx1);
829:       PetscFree(indx2);
830: #endif
831:       break;
832:     }
833:   }
834:   return(0);
835: }

839: /*@
840:    VecScatterFFTWToPetsc - Converts FFTW output to the PETSc vector.

842:    Collective on Mat

844:     Input Parameters:
845: +   A - FFTW matrix
846: -   x - FFTW vector

848:    Output Parameter:
849: .  y - PETSc vector

851:    Level: intermediate

853:    Note: For real transforms, the FFTW output of the backward DFT contains extra padding zeros at the end of the last dimension.
854:          VecScatterFFTWToPetsc removes those extra zeros.

856: .seealso: VecScatterPetscToFFTW()
857: @*/
858: PetscErrorCode VecScatterFFTWToPetsc(Mat A,Vec x,Vec y)
859: {

       /* Dispatch to the "VecScatterFFTWToPetsc_C" method composed on A
          (VecScatterFFTWToPetsc_FFTW for MATFFTW); PetscTryMethod is a
          no-op when no such method is composed on the object. */
863:   PetscTryMethod(A,"VecScatterFFTWToPetsc_C",(Mat,Vec,Vec),(A,x,y));
864:   return(0);
865: }

/*
   VecScatterFFTWToPetsc_FFTW - MATFFTW implementation of VecScatterFFTWToPetsc().

   Scatters the FFTW-layout vector x into the natural PETSc layout of y.
   For real (r2c/c2r) transforms FFTW pads the last dimension to
   2*(dim[last]/2+1) entries per row; the index sets built below skip
   that padding.  In the complex case the layouts differ only in their
   parallel distribution, so stride index sets suffice.

   Fix: rank and size were declared PetscInt, but MPI_Comm_size/MPI_Comm_rank
   take int* — with 64-bit PetscInt that is a type mismatch.  Use PetscMPIInt,
   consistent with MatCreate_FFTW.
*/
869: PetscErrorCode VecScatterFFTWToPetsc_FFTW(Mat A,Vec x,Vec y)
870: {
872:   MPI_Comm       comm;
873:   Mat_FFT        *fft  = (Mat_FFT*)A->data;
874:   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
875:   PetscInt       N     = fft->N;
876:   PetscInt       ndim  = fft->ndim,*dim=fft->dim;
877:   PetscInt       low;
878:   PetscMPIInt    rank,size;  /* MPI_Comm_size/rank require int-sized arguments */
879:   ptrdiff_t      alloc_local,local_n0,local_0_start;
880:   ptrdiff_t      local_n1,local_1_start;
881:   VecScatter     vecscat;
882:   IS             list1,list2;
883: #if !defined(PETSC_USE_COMPLEX)
884:   PetscInt  i,j,k,partial_dim;
885:   PetscInt  *indx1, *indx2, tempindx, tempindx1;
886:   PetscInt  N1, n1,NM;
887:   ptrdiff_t temp;
888: #endif

891:   PetscObjectGetComm((PetscObject)A,&comm);
892:   MPI_Comm_size(comm, &size);
893:   MPI_Comm_rank(comm, &rank);
894:   VecGetOwnershipRange(x,&low,NULL);  /* first global index of this process's piece of x */

896:   if (size==1) {
       /* Sequential: FFTW and PETSc layouts coincide; copy all N entries one-to-one. */
897:     ISCreateStride(comm,N,0,1,&list1);
898:     VecScatterCreate(x,list1,y,list1,&vecscat);
899:     VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
900:     VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
901:     VecScatterDestroy(&vecscat);
902:     ISDestroy(&list1);

904:   } else {
905:     switch (ndim) {
906:     case 1:
907: #if defined(PETSC_USE_COMPLEX)
       /* For the 1-D c2c MPI transform the BACKWARD output distribution is
          returned in (local_n1,local_1_start); alloc_local is not needed here. */
908:       alloc_local = fftw_mpi_local_size_1d(dim[0],comm,FFTW_BACKWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);

910:       ISCreateStride(comm,local_n1,local_1_start,1,&list1);
911:       ISCreateStride(comm,local_n1,low,1,&list2);
912:       VecScatterCreate(x,list1,y,list2,&vecscat);
913:       VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
914:       VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
915:       VecScatterDestroy(&vecscat);
916:       ISDestroy(&list1);
917:       ISDestroy(&list2);
918: #else
919:       SETERRQ(comm,PETSC_ERR_SUP,"No support for real parallel 1D FFT");
920: #endif
921:       break;
922:     case 2:
923: #if defined(PETSC_USE_COMPLEX)
924:       alloc_local = fftw_mpi_local_size_2d(dim[0],dim[1],comm,&local_n0,&local_0_start);

926:       ISCreateStride(comm,local_n0*dim[1],local_0_start*dim[1],1,&list1);
927:       ISCreateStride(comm,local_n0*dim[1],low,1,&list2);
928:       VecScatterCreate(x,list2,y,list1,&vecscat);
929:       VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
930:       VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
931:       VecScatterDestroy(&vecscat);
932:       ISDestroy(&list1);
933:       ISDestroy(&list2);
934: #else
935:       alloc_local =  fftw_mpi_local_size_2d_transposed(dim[0],dim[1]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);

937:       N1 = 2*dim[0]*(dim[1]/2+1); n1 = 2*local_n0*(dim[1]/2+1);

939:       PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*dim[1],&indx1);
940:       PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*dim[1],&indx2);

       /* NM = padded row length 2*(dim[1]/2+1) of the real-transform output */
942:       if (dim[1]%2==0) NM = dim[1]+2;
943:       else             NM = dim[1]+1;

945:       for (i=0; i<local_n0; i++) {
946:         for (j=0; j<dim[1]; j++) {
947:           tempindx = i*dim[1] + j;
948:           tempindx1 = i*NM + j;

950:           indx1[tempindx]=local_0_start*dim[1]+tempindx;
951:           indx2[tempindx]=low+tempindx1;
952:         }
953:       }

955:       ISCreateGeneral(comm,local_n0*dim[1],indx1,PETSC_COPY_VALUES,&list1);
956:       ISCreateGeneral(comm,local_n0*dim[1],indx2,PETSC_COPY_VALUES,&list2);

       /* list2 indexes the padded FFTW vector x, list1 the natural PETSc vector y */
958:       VecScatterCreate(x,list2,y,list1,&vecscat);
959:       VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
960:       VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
961:       VecScatterDestroy(&vecscat);
962:       ISDestroy(&list1);
963:       ISDestroy(&list2);
964:       PetscFree(indx1);
965:       PetscFree(indx2);
966: #endif
967:       break;
968:     case 3:
969: #if defined(PETSC_USE_COMPLEX)
970:       alloc_local = fftw_mpi_local_size_3d(dim[0],dim[1],dim[2],comm,&local_n0,&local_0_start);

972:       ISCreateStride(comm,local_n0*dim[1]*dim[2],local_0_start*dim[1]*dim[2],1,&list1);
973:       ISCreateStride(comm,local_n0*dim[1]*dim[2],low,1,&list2);
974:       VecScatterCreate(x,list1,y,list2,&vecscat);
975:       VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
976:       VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
977:       VecScatterDestroy(&vecscat);
978:       ISDestroy(&list1);
979:       ISDestroy(&list2);
980: #else

982:       alloc_local = fftw_mpi_local_size_3d_transposed(dim[0],dim[1],dim[2]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);

984:       N1 = 2*dim[0]*dim[1]*(dim[2]/2+1); n1 = 2*local_n0*dim[1]*(dim[2]/2+1);

986:       PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*dim[1]*dim[2],&indx1);
987:       PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*dim[1]*dim[2],&indx2);

       /* NM = padded last-dimension length 2*(dim[2]/2+1) of the real-transform output */
989:       if (dim[2]%2==0) NM = dim[2]+2;
990:       else             NM = dim[2]+1;

992:       for (i=0; i<local_n0; i++) {
993:         for (j=0; j<dim[1]; j++) {
994:           for (k=0; k<dim[2]; k++) {
995:             tempindx  = i*dim[1]*dim[2] + j*dim[2] + k;
996:             tempindx1 = i*dim[1]*NM + j*NM + k;

998:             indx1[tempindx]=local_0_start*dim[1]*dim[2]+tempindx;
999:             indx2[tempindx]=low+tempindx1;
1000:           }
1001:         }
1002:       }

1004:       ISCreateGeneral(comm,local_n0*dim[1]*dim[2],indx1,PETSC_COPY_VALUES,&list1);
1005:       ISCreateGeneral(comm,local_n0*dim[1]*dim[2],indx2,PETSC_COPY_VALUES,&list2);

1007:       VecScatterCreate(x,list2,y,list1,&vecscat);
1008:       VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
1009:       VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
1010:       VecScatterDestroy(&vecscat);
1011:       ISDestroy(&list1);
1012:       ISDestroy(&list2);
1013:       PetscFree(indx1);
1014:       PetscFree(indx2);
1015: #endif
1016:       break;
1017:     default:
1018: #if defined(PETSC_USE_COMPLEX)
1019:       alloc_local = fftw_mpi_local_size(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start);

1021:       ISCreateStride(comm,local_n0*(fftw->partial_dim),local_0_start*(fftw->partial_dim),1,&list1);
1022:       ISCreateStride(comm,local_n0*(fftw->partial_dim),low,1,&list2);
1023:       VecScatterCreate(x,list1,y,list2,&vecscat);
1024:       VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
1025:       VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
1026:       VecScatterDestroy(&vecscat);
1027:       ISDestroy(&list1);
1028:       ISDestroy(&list2);
1029: #else
       /* Temporarily replace the last dimension by its r2c size to query the
          transposed distribution, then restore it. */
1030:       temp = (fftw->dim_fftw)[fftw->ndim_fftw-1];

1032:       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp/2 + 1;

1034:       alloc_local = fftw_mpi_local_size_transposed(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);

1036:       N1 = 2*N*(PetscInt)((fftw->dim_fftw)[fftw->ndim_fftw-1])/((PetscInt) temp);

1038:       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp;

1040:       partial_dim = fftw->partial_dim;

1042:       PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*partial_dim,&indx1);
1043:       PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*partial_dim,&indx2);

       /* NM = number of padding entries after each last-dimension row */
1045:       if (dim[ndim-1]%2==0) NM = 2;
1046:       else                  NM = 1;

       /* j walks the padded FFTW layout of x, skipping NM pad entries per row */
1048:       j = low;
1049:       for (i=0,k=1; i<((PetscInt)local_n0)*partial_dim; i++,k++) {
1050:         indx1[i] = local_0_start*partial_dim + i;
1051:         indx2[i] = j;
1052:         if (k%dim[ndim-1]==0) j+=NM;
1053:         j++;
1054:       }
1055:       ISCreateGeneral(comm,local_n0*partial_dim,indx1,PETSC_COPY_VALUES,&list1);
1056:       ISCreateGeneral(comm,local_n0*partial_dim,indx2,PETSC_COPY_VALUES,&list2);

1058:       VecScatterCreate(x,list2,y,list1,&vecscat);
1059:       VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
1060:       VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
1061:       VecScatterDestroy(&vecscat);
1062:       ISDestroy(&list1);
1063:       ISDestroy(&list2);
1064:       PetscFree(indx1);
1065:       PetscFree(indx2);
1066: #endif
1067:       break;
1068:     }
1069:   }
1070:   return(0);
1071: }

1075: /*
1076:     MatCreate_FFTW - Creates a matrix object that provides FFT via the external package FFTW

1078:   Options Database Key:
1079: . -mat_fftw_plannerflags - set the FFTW planner flags (FFTW_ESTIMATE, FFTW_MEASURE, FFTW_PATIENT, or FFTW_EXHAUSTIVE)

1081:    Level: intermediate

1083: */
/*
   MatCreate_FFTW - type constructor for MATFFTW.

   Sets the matrix's local/global sizes from the FFTW data distribution
   (which, for parallel and real transforms, differs from the logical
   transform size N because of FFTW's slab distribution and r2c padding),
   installs the MatMult/MatMultTranspose implementations, composes the
   scatter helpers, and reads the planner-flag option.

   Fix: the scratch array pdim (a ptrdiff_t copy of dim[] needed by the
   fftw_mpi_local_size* queries) was calloc'd but never freed — a leak on
   every MATFFTW creation.  It is freed before returning.  (Error paths
   taken via SETERRQ still leak this small array.)
*/
1084: PETSC_EXTERN PetscErrorCode MatCreate_FFTW(Mat A)
1085: {
1087:   MPI_Comm       comm;
1088:   Mat_FFT        *fft=(Mat_FFT*)A->data;
1089:   Mat_FFTW       *fftw;
1090:   PetscInt       n         =fft->n,N=fft->N,ndim=fft->ndim,*dim = fft->dim;
1091:   const char     *p_flags[]={"FFTW_ESTIMATE","FFTW_MEASURE","FFTW_PATIENT","FFTW_EXHAUSTIVE"};
1092:   PetscBool      flg;
1093:   PetscInt       p_flag,partial_dim=1,ctr;
1094:   PetscMPIInt    size,rank;
1095:   ptrdiff_t      *pdim;
1096:   ptrdiff_t      local_n1,local_1_start;
1097: #if !defined(PETSC_USE_COMPLEX)
1098:   ptrdiff_t temp;
1099:   PetscInt  N1,tot_dim;
1100: #else
1101:   PetscInt n1;
1102: #endif

1105:   PetscObjectGetComm((PetscObject)A,&comm);
1106:   MPI_Comm_size(comm, &size);
1107:   MPI_Comm_rank(comm, &rank);

1109:   fftw_mpi_init();
1110:   pdim    = (ptrdiff_t*)calloc(ndim,sizeof(ptrdiff_t));
1111:   pdim[0] = dim[0];
1112: #if !defined(PETSC_USE_COMPLEX)
1113:   tot_dim = 2*dim[0];
1114: #endif
      /* partial_dim = product of all dimensions except the first (slab size);
         tot_dim (real case) = padded total 2*dim[0]*...*(dim[last]/2+1) */
1115:   for (ctr=1; ctr<ndim; ctr++) {
1116:     partial_dim *= dim[ctr];
1117:     pdim[ctr]    = dim[ctr];
1118: #if !defined(PETSC_USE_COMPLEX)
1119:     if (ctr==ndim-1) tot_dim *= (dim[ctr]/2+1);
1120:     else             tot_dim *= dim[ctr];
1121: #endif
1122:   }

1124:   if (size == 1) {
1125: #if defined(PETSC_USE_COMPLEX)
1126:     MatSetSizes(A,N,N,N,N);
1127:     n    = N;
1128: #else
1129:     MatSetSizes(A,tot_dim,tot_dim,tot_dim,tot_dim);
1130:     n    = tot_dim;
1131: #endif

1133:   } else {
1134:     ptrdiff_t alloc_local,local_n0,local_0_start;
1135:     switch (ndim) {
1136:     case 1:
1137: #if !defined(PETSC_USE_COMPLEX)
1138:       SETERRQ(comm,PETSC_ERR_SUP,"FFTW does not support parallel 1D real transform");
1139: #else
1140:       alloc_local = fftw_mpi_local_size_1d(dim[0],comm,FFTW_FORWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);

1142:       n    = (PetscInt)local_n0;
1143:       n1   = (PetscInt)local_n1;
1144:       MatSetSizes(A,n1,n,N,N);
1145: #endif
1146:       break;
1147:     case 2:
1148: #if defined(PETSC_USE_COMPLEX)
1149:       alloc_local = fftw_mpi_local_size_2d(dim[0],dim[1],comm,&local_n0,&local_0_start);
1150:       /*
1151:        PetscSynchronizedPrintf(comm,"[%d] MatCreateSeqFFTW: local_n0, local_0_start %d %d, N %d,dim %d, %d\n",rank,(PetscInt)local_n0*dim[1],(PetscInt)local_0_start,m,dim[0],dim[1]);
1152:        PetscSynchronizedFlush(comm);
1153:        */
1154:       n    = (PetscInt)local_n0*dim[1];
1155:       MatSetSizes(A,n,n,N,N);
1156: #else
1157:       alloc_local = fftw_mpi_local_size_2d_transposed(dim[0],dim[1]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);

1159:       n    = 2*(PetscInt)local_n0*(dim[1]/2+1);
1160:       MatSetSizes(A,n,n,2*dim[0]*(dim[1]/2+1),2*dim[0]*(dim[1]/2+1));
1161: #endif
1162:       break;
1163:     case 3:
1164: #if defined(PETSC_USE_COMPLEX)
1165:       alloc_local = fftw_mpi_local_size_3d(dim[0],dim[1],dim[2],comm,&local_n0,&local_0_start);

1167:       n    = (PetscInt)local_n0*dim[1]*dim[2];
1168:       MatSetSizes(A,n,n,N,N);
1169: #else
1170:       alloc_local = fftw_mpi_local_size_3d_transposed(dim[0],dim[1],dim[2]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);

1172:       n   = 2*(PetscInt)local_n0*dim[1]*(dim[2]/2+1);
1173:       MatSetSizes(A,n,n,2*dim[0]*dim[1]*(dim[2]/2+1),2*dim[0]*dim[1]*(dim[2]/2+1));
1174: #endif
1175:       break;
1176:     default:
1177: #if defined(PETSC_USE_COMPLEX)
1178:       alloc_local = fftw_mpi_local_size(ndim,pdim,comm,&local_n0,&local_0_start);

1180:       n    = (PetscInt)local_n0*partial_dim;
1181:       MatSetSizes(A,n,n,N,N);
1182: #else
      /* Temporarily replace the last dimension by its r2c size for the
         transposed-distribution query, then restore it. */
1183:       temp = pdim[ndim-1];

1185:       pdim[ndim-1] = temp/2 + 1;

1187:       alloc_local = fftw_mpi_local_size_transposed(ndim,pdim,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);

1189:       n  = 2*(PetscInt)local_n0*partial_dim*pdim[ndim-1]/temp;
1190:       N1 = 2*N*(PetscInt)pdim[ndim-1]/((PetscInt) temp);

1192:       pdim[ndim-1] = temp;

1194:       MatSetSizes(A,n,n,N1,N1);
1195: #endif
1196:       break;
1197:     }
1198:   }
1199:   PetscObjectChangeTypeName((PetscObject)A,MATFFTW);
1200:   PetscNewLog(A,Mat_FFTW,&fftw);
1201:   fft->data = (void*)fftw;

1203:   fft->n            = n;
1204:   fftw->ndim_fftw   = (ptrdiff_t)ndim; /* This is dimension of fft */
1205:   fftw->partial_dim = partial_dim;

      /* dim_fftw is a separately owned ptrdiff_t copy of dim[] kept for plan creation */
1207:   PetscMalloc(ndim*sizeof(ptrdiff_t), (ptrdiff_t*)&(fftw->dim_fftw));

1209:   for (ctr=0;ctr<ndim;ctr++) (fftw->dim_fftw)[ctr]=dim[ctr];

      /* Plans are created lazily on first MatMult/MatMultTranspose */
1211:   fftw->p_forward  = 0;
1212:   fftw->p_backward = 0;
1213:   fftw->p_flag     = FFTW_ESTIMATE;

1215:   if (size == 1) {
1216:     A->ops->mult          = MatMult_SeqFFTW;
1217:     A->ops->multtranspose = MatMultTranspose_SeqFFTW;
1218:   } else {
1219:     A->ops->mult          = MatMult_MPIFFTW;
1220:     A->ops->multtranspose = MatMultTranspose_MPIFFTW;
1221:   }
1222:   fft->matdestroy = MatDestroy_FFTW;
1223:   A->assembled    = PETSC_TRUE;
1224:   A->preallocated = PETSC_TRUE;

1226:   PetscObjectComposeFunction((PetscObject)A,"MatGetVecsFFTW_C",MatGetVecsFFTW_FFTW);
1227:   PetscObjectComposeFunction((PetscObject)A,"VecScatterPetscToFFTW_C",VecScatterPetscToFFTW_FFTW);
1228:   PetscObjectComposeFunction((PetscObject)A,"VecScatterFFTWToPetsc_C",VecScatterFFTWToPetsc_FFTW);

1230:   /* get runtime options */
1231:   PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"FFTW Options","Mat");
1232:   PetscOptionsEList("-mat_fftw_plannerflags","Planner Flags","None",p_flags,4,p_flags[0],&p_flag,&flg);
1233:   if (flg) fftw->p_flag = (unsigned)p_flag;
1234:   PetscOptionsEnd();
      free(pdim);  /* scratch copy of dim[] used only by the size queries above */
1235:   return(0);
1236: }