Actual source code: fftw.c

petsc-3.13.6 2020-09-29
Report Typos and Errors

  2: /*
  3:     Provides an interface to the FFTW package.
  4:     Testing examples can be found in ~src/mat/tests
  5: */

  7:  #include <../src/mat/impls/fft/fft.h>
  8: EXTERN_C_BEGIN
  9: #include <fftw3-mpi.h>
 10: EXTERN_C_END

/* Private context for MATFFTW matrices; hangs off Mat_FFT->data. */
typedef struct {
  ptrdiff_t    ndim_fftw,*dim_fftw;   /* dimensionality and dimension sizes in FFTW's native ptrdiff_t form */
#if defined(PETSC_USE_64BIT_INDICES)
  fftw_iodim64 *iodims;               /* guru-interface dimension descriptors (64-bit-index build) */
#else
  fftw_iodim   *iodims;               /* guru-interface dimension descriptors */
#endif
  PetscInt     partial_dim;           /* NOTE(review): set outside this chunk — presumably a partial product of dims; confirm at the creation routine */
  fftw_plan    p_forward,p_backward;  /* lazily created, cached FFTW plans */
  unsigned     p_flag; /* planner flags, FFTW_ESTIMATE,FFTW_MEASURE, FFTW_PATIENT, FFTW_EXHAUSTIVE */
  PetscScalar  *finarray,*foutarray,*binarray,*boutarray; /* keep track of arrays because an fftw plan should be
                                                            executed for the arrays with which the plan was created */
} Mat_FFTW;

 26: extern PetscErrorCode MatMult_SeqFFTW(Mat,Vec,Vec);
 27: extern PetscErrorCode MatMultTranspose_SeqFFTW(Mat,Vec,Vec);
 28: extern PetscErrorCode MatMult_MPIFFTW(Mat,Vec,Vec);
 29: extern PetscErrorCode MatMultTranspose_MPIFFTW(Mat,Vec,Vec);
 30: extern PetscErrorCode MatDestroy_FFTW(Mat);
 31: extern PetscErrorCode VecDestroy_MPIFFTW(Vec);
 32: extern PetscErrorCode MatCreateVecsFFTW_FFTW(Mat,Vec*,Vec*,Vec*);

/* MatMult_SeqFFTW performs serial forward DFT
   Input parameters:
     A - the matrix
     x - the vector on which FDFT will be performed

   Output parameter:
     y - vector that stores result of FDFT
*/
PetscErrorCode MatMult_SeqFFTW(Mat A,Vec x,Vec y)
{
  Mat_FFT        *fft  = (Mat_FFT*)A->data;
  Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
  const PetscScalar *x_array;
  PetscScalar    *y_array;
#if defined(PETSC_USE_COMPLEX)
#if defined(PETSC_USE_64BIT_INDICES)
  fftw_iodim64   *iodims;
#else
  fftw_iodim     *iodims;
#endif
  PetscInt       i;
#endif
  PetscInt       ndim = fft->ndim,*dim = fft->dim;

  VecGetArrayRead(x,&x_array);
  VecGetArray(y,&y_array);
  if (!fftw->p_forward) { /* create a plan, then execute it */
    switch (ndim) {
    case 1:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_forward = fftw_plan_dft_1d(dim[0],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
#else
      fftw->p_forward = fftw_plan_dft_r2c_1d(dim[0],(double*)x_array,(fftw_complex*)y_array,fftw->p_flag);
#endif
      break;
    case 2:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_forward = fftw_plan_dft_2d(dim[0],dim[1],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
#else
      fftw->p_forward = fftw_plan_dft_r2c_2d(dim[0],dim[1],(double*)x_array,(fftw_complex*)y_array,fftw->p_flag);
#endif
      break;
    case 3:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_forward = fftw_plan_dft_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
#else
      fftw->p_forward = fftw_plan_dft_r2c_3d(dim[0],dim[1],dim[2],(double*)x_array,(fftw_complex*)y_array,fftw->p_flag);
#endif
      break;
    default:
#if defined(PETSC_USE_COMPLEX)
      iodims = fftw->iodims;
#if defined(PETSC_USE_64BIT_INDICES)
      /* Fill iodims as a row-major layout: innermost (last) dimension has
         unit stride, outer strides are the product of all inner extents. */
      if (ndim) {
        iodims[ndim-1].n = (ptrdiff_t)dim[ndim-1];
        iodims[ndim-1].is = iodims[ndim-1].os = 1;
        for (i=ndim-2; i>=0; --i) {
          iodims[i].n = (ptrdiff_t)dim[i];
          iodims[i].is = iodims[i].os = iodims[i+1].is * iodims[i+1].n;
        }
      }
      fftw->p_forward = fftw_plan_guru64_dft((int)ndim,(fftw_iodim64*)iodims,0,NULL,(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
#else
      /* Same row-major stride computation as the 64-bit branch, with int fields */
      if (ndim) {
        iodims[ndim-1].n = (int)dim[ndim-1];
        iodims[ndim-1].is = iodims[ndim-1].os = 1;
        for (i=ndim-2; i>=0; --i) {
          iodims[i].n = (int)dim[i];
          iodims[i].is = iodims[i].os = iodims[i+1].is * iodims[i+1].n;
        }
      }
      fftw->p_forward = fftw_plan_guru_dft((int)ndim,(fftw_iodim*)iodims,0,NULL,(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
#endif

#else
      fftw->p_forward = fftw_plan_dft_r2c(ndim,(int*)dim,(double*)x_array,(fftw_complex*)y_array,fftw->p_flag);
#endif
      break;
    }
    /* Remember which arrays the plan was created with, so later calls can
       detect a change of arrays and use the new-array execute interface. */
    fftw->finarray  = (PetscScalar *) x_array;
    fftw->foutarray = y_array;
    /* Warning: if (fftw->p_flag != FFTW_ESTIMATE) the data in the in/out arrays is overwritten!
                planning should be done before x is initialized! See FFTW manual sec2.1 or sec4 */
    fftw_execute(fftw->p_forward);
  } else { /* use existing plan */
    if (fftw->finarray != x_array || fftw->foutarray != y_array) { /* use existing plan on new arrays */
#if defined(PETSC_USE_COMPLEX)
      fftw_execute_dft(fftw->p_forward,(fftw_complex*)x_array,(fftw_complex*)y_array);
#else
      fftw_execute_dft_r2c(fftw->p_forward,(double*)x_array,(fftw_complex*)y_array);
#endif
    } else {
      fftw_execute(fftw->p_forward);
    }
  }
  VecRestoreArray(y,&y_array);
  VecRestoreArrayRead(x,&x_array);
  return(0);
}

136: /* MatMultTranspose_SeqFFTW performs serial backward DFT
137:    Input parameter:
138:      A - the matrix
139:      x - the vector on which BDFT will be performed

141:    Output parameter:
142:      y - vector that stores result of BDFT
143: */

PetscErrorCode MatMultTranspose_SeqFFTW(Mat A,Vec x,Vec y)
{
  Mat_FFT        *fft  = (Mat_FFT*)A->data;
  Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
  const PetscScalar *x_array;
  PetscScalar    *y_array;
  PetscInt       ndim=fft->ndim,*dim=fft->dim;
#if defined(PETSC_USE_COMPLEX)
#if defined(PETSC_USE_64BIT_INDICES)
  fftw_iodim64   *iodims=fftw->iodims;
#else
  fftw_iodim     *iodims=fftw->iodims;
#endif
#endif

  VecGetArrayRead(x,&x_array);
  VecGetArray(y,&y_array);
  if (!fftw->p_backward) { /* create a plan, then execute it */
    switch (ndim) {
    case 1:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_backward = fftw_plan_dft_1d(dim[0],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
#else
      fftw->p_backward= fftw_plan_dft_c2r_1d(dim[0],(fftw_complex*)x_array,(double*)y_array,fftw->p_flag);
#endif
      break;
    case 2:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_backward = fftw_plan_dft_2d(dim[0],dim[1],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
#else
      fftw->p_backward= fftw_plan_dft_c2r_2d(dim[0],dim[1],(fftw_complex*)x_array,(double*)y_array,fftw->p_flag);
#endif
      break;
    case 3:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_backward = fftw_plan_dft_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
#else
      fftw->p_backward= fftw_plan_dft_c2r_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(double*)y_array,fftw->p_flag);
#endif
      break;
    default:
#if defined(PETSC_USE_COMPLEX)
      /* NOTE(review): the guru path reuses iodims as filled in by
         MatMult_SeqFFTW's default case; if the backward transform is applied
         before any forward transform for ndim>3, iodims may be uninitialized
         — confirm against the creation routine. */
#if defined(PETSC_USE_64BIT_INDICES)
      fftw->p_backward = fftw_plan_guru64_dft((int)ndim,(fftw_iodim64*)iodims,0,NULL,(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
#else
      fftw->p_backward = fftw_plan_guru_dft((int)ndim,iodims,0,NULL,(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
#endif
#else
      fftw->p_backward= fftw_plan_dft_c2r((int)ndim,(int*)dim,(fftw_complex*)x_array,(double*)y_array,fftw->p_flag);
#endif
      break;
    }
    /* Remember the plan's arrays so later calls can detect array changes */
    fftw->binarray  = (PetscScalar *) x_array;
    fftw->boutarray = y_array;
    fftw_execute(fftw->p_backward);
  } else { /* use existing plan */
    if (fftw->binarray != x_array || fftw->boutarray != y_array) { /* use existing plan on new arrays */
#if defined(PETSC_USE_COMPLEX)
      fftw_execute_dft(fftw->p_backward,(fftw_complex*)x_array,(fftw_complex*)y_array);
#else
      fftw_execute_dft_c2r(fftw->p_backward,(fftw_complex*)x_array,(double*)y_array);
#endif
    } else {
      fftw_execute(fftw->p_backward);
    }
  }
  VecRestoreArray(y,&y_array);
  VecRestoreArrayRead(x,&x_array);
  return(0);
}

218: /* MatMult_MPIFFTW performs forward DFT in parallel
219:    Input parameter:
220:      A - the matrix
221:      x - the vector on which FDFT will be performed

223:    Output parameter:
224:    y   - vector that stores result of FDFT
225: */
PetscErrorCode MatMult_MPIFFTW(Mat A,Vec x,Vec y)
{
  Mat_FFT        *fft  = (Mat_FFT*)A->data;
  Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
  const PetscScalar *x_array;
  PetscScalar    *y_array;
  PetscInt       ndim=fft->ndim,*dim=fft->dim;
  MPI_Comm       comm;

  PetscObjectGetComm((PetscObject)A,&comm);
  VecGetArrayRead(x,&x_array);
  VecGetArray(y,&y_array);
  if (!fftw->p_forward) { /* create a plan, then execute it */
    switch (ndim) {
    case 1:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_forward = fftw_mpi_plan_dft_1d(dim[0],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_FORWARD,fftw->p_flag);
#else
      /* FFTW-MPI does not support a 1D parallel real transform */
      SETERRQ(comm,PETSC_ERR_SUP,"not support for real numbers yet");
#endif
      break;
    case 2:
#if defined(PETSC_USE_COMPLEX) /* For complex transforms call fftw_mpi_plan_dft, for real transforms call fftw_mpi_plan_dft_r2c */
      fftw->p_forward = fftw_mpi_plan_dft_2d(dim[0],dim[1],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_FORWARD,fftw->p_flag);
#else
      /* NOTE(review): the real-transform branches hardcode FFTW_ESTIMATE
         instead of honoring fftw->p_flag — confirm whether intentional */
      fftw->p_forward = fftw_mpi_plan_dft_r2c_2d(dim[0],dim[1],(double*)x_array,(fftw_complex*)y_array,comm,FFTW_ESTIMATE);
#endif
      break;
    case 3:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_forward = fftw_mpi_plan_dft_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_FORWARD,fftw->p_flag);
#else
      fftw->p_forward = fftw_mpi_plan_dft_r2c_3d(dim[0],dim[1],dim[2],(double*)x_array,(fftw_complex*)y_array,comm,FFTW_ESTIMATE);
#endif
      break;
    default:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_forward = fftw_mpi_plan_dft(fftw->ndim_fftw,fftw->dim_fftw,(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_FORWARD,fftw->p_flag);
#else
      fftw->p_forward = fftw_mpi_plan_dft_r2c(fftw->ndim_fftw,fftw->dim_fftw,(double*)x_array,(fftw_complex*)y_array,comm,FFTW_ESTIMATE);
#endif
      break;
    }
    /* Remember the plan's arrays so later calls can detect array changes */
    fftw->finarray  = (PetscScalar *) x_array;
    fftw->foutarray = y_array;
    /* Warning: if (fftw->p_flag != FFTW_ESTIMATE) the data in the in/out arrays is overwritten!
                planning should be done before x is initialized! See FFTW manual sec2.1 or sec4 */
    fftw_execute(fftw->p_forward);
  } else { /* use existing plan */
    if (fftw->finarray != x_array || fftw->foutarray != y_array) { /* use existing plan on new arrays */
      /* NOTE(review): unlike the Seq version, this calls fftw_execute_dft even
         in real-scalar builds where the plan is r2c — confirm correctness */
      fftw_execute_dft(fftw->p_forward,(fftw_complex*)x_array,(fftw_complex*)y_array);
    } else {
      fftw_execute(fftw->p_forward);
    }
  }
  VecRestoreArray(y,&y_array);
  VecRestoreArrayRead(x,&x_array);
  return(0);
}

288: /* MatMultTranspose_MPIFFTW performs parallel backward DFT
289:    Input parameter:
290:      A - the matrix
291:      x - the vector on which BDFT will be performed

293:    Output parameter:
294:      y - vector that stores result of BDFT
295: */
PetscErrorCode MatMultTranspose_MPIFFTW(Mat A,Vec x,Vec y)
{
  Mat_FFT        *fft  = (Mat_FFT*)A->data;
  Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
  const PetscScalar *x_array;
  PetscScalar    *y_array;
  PetscInt       ndim=fft->ndim,*dim=fft->dim;
  MPI_Comm       comm;

  PetscObjectGetComm((PetscObject)A,&comm);
  VecGetArrayRead(x,&x_array);
  VecGetArray(y,&y_array);
  if (!fftw->p_backward) { /* create a plan, then execute it */
    switch (ndim) {
    case 1:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_backward = fftw_mpi_plan_dft_1d(dim[0],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_BACKWARD,fftw->p_flag);
#else
      /* FFTW-MPI does not support a 1D parallel real transform */
      SETERRQ(comm,PETSC_ERR_SUP,"not support for real numbers yet");
#endif
      break;
    case 2:
#if defined(PETSC_USE_COMPLEX) /* For complex transforms call fftw_mpi_plan_dft with flag FFTW_BACKWARD, for real transforms call fftw_mpi_plan_dft_c2r */
      fftw->p_backward = fftw_mpi_plan_dft_2d(dim[0],dim[1],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_BACKWARD,fftw->p_flag);
#else
      /* NOTE(review): the real-transform branches hardcode FFTW_ESTIMATE
         instead of honoring fftw->p_flag — confirm whether intentional */
      fftw->p_backward = fftw_mpi_plan_dft_c2r_2d(dim[0],dim[1],(fftw_complex*)x_array,(double*)y_array,comm,FFTW_ESTIMATE);
#endif
      break;
    case 3:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_backward = fftw_mpi_plan_dft_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_BACKWARD,fftw->p_flag);
#else
      fftw->p_backward = fftw_mpi_plan_dft_c2r_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(double*)y_array,comm,FFTW_ESTIMATE);
#endif
      break;
    default:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_backward = fftw_mpi_plan_dft(fftw->ndim_fftw,fftw->dim_fftw,(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_BACKWARD,fftw->p_flag);
#else
      fftw->p_backward = fftw_mpi_plan_dft_c2r(fftw->ndim_fftw,fftw->dim_fftw,(fftw_complex*)x_array,(double*)y_array,comm,FFTW_ESTIMATE);
#endif
      break;
    }
    /* Remember the plan's arrays so later calls can detect array changes */
    fftw->binarray  = (PetscScalar *) x_array;
    fftw->boutarray = y_array;
    fftw_execute(fftw->p_backward);
  } else { /* use existing plan */
    if (fftw->binarray != x_array || fftw->boutarray != y_array) { /* use existing plan on new arrays */
      /* NOTE(review): calls fftw_execute_dft even in real-scalar builds where
         the plan is c2r — confirm correctness */
      fftw_execute_dft(fftw->p_backward,(fftw_complex*)x_array,(fftw_complex*)y_array);
    } else {
      fftw_execute(fftw->p_backward);
    }
  }
  VecRestoreArray(y,&y_array);
  VecRestoreArrayRead(x,&x_array);
  return(0);
}

356: PetscErrorCode MatDestroy_FFTW(Mat A)
357: {
358:   Mat_FFT        *fft  = (Mat_FFT*)A->data;
359:   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;

363:   fftw_destroy_plan(fftw->p_forward);
364:   fftw_destroy_plan(fftw->p_backward);
365:   if (fftw->iodims) {
366:     free(fftw->iodims);
367:   }
368:   PetscFree(fftw->dim_fftw);
369:   PetscFree(fft->data);
370:   fftw_mpi_cleanup();
371:   return(0);
372: }

374:  #include <../src/vec/vec/impls/mpi/pvecimpl.h>
375: PetscErrorCode VecDestroy_MPIFFTW(Vec v)
376: {
378:   PetscScalar    *array;

381:   VecGetArray(v,&array);
382:   fftw_free((fftw_complex*)array);
383:   VecRestoreArray(v,&array);
384:   VecDestroy_MPI(v);
385:   return(0);
386: }


/* VecDuplicate_FFTW_fin - duplicates a forward-input vector by asking the
   composed FFT matrix for a fresh vector with the same (fin) layout. */
static PetscErrorCode VecDuplicate_FFTW_fin(Vec fin,Vec *fin_new)
{
  Mat            A;

  /* the owning FFT matrix was composed onto the vector at creation time */
  PetscObjectQuery((PetscObject)fin,"FFTmatrix",(PetscObject*)&A);
  MatCreateVecsFFTW_FFTW(A,fin_new,NULL,NULL);
  return(0);
}

/* VecDuplicate_FFTW_fout - duplicates a forward-output vector with the same
   (fout) parallel layout, via the composed FFT matrix. */
static PetscErrorCode VecDuplicate_FFTW_fout(Vec fout,Vec *fout_new)
{
  Mat            A;

  /* the owning FFT matrix was composed onto the vector at creation time */
  PetscObjectQuery((PetscObject)fout,"FFTmatrix",(PetscObject*)&A);
  MatCreateVecsFFTW_FFTW(A,NULL,fout_new,NULL);
  return(0);
}

/* VecDuplicate_FFTW_bout - duplicates a backward-output vector with the same
   (bout) parallel layout, via the composed FFT matrix. */
static PetscErrorCode VecDuplicate_FFTW_bout(Vec bout, Vec *bout_new)
{
  Mat            A;

  /* the owning FFT matrix was composed onto the vector at creation time */
  PetscObjectQuery((PetscObject)bout,"FFTmatrix",(PetscObject*)&A);
  MatCreateVecsFFTW_FFTW(A,NULL,NULL,bout_new);
  return(0);
}


423: /*@
424:    MatCreateVecsFFTW - Get vector(s) compatible with the matrix, i.e. with the
425:      parallel layout determined by FFTW

427:    Collective on Mat

429:    Input Parameter:
430: .   A - the matrix

   Output Parameters:
+   x - (optional) input vector of forward FFTW
.   y - (optional) output vector of forward FFTW
-   z - (optional) output vector of backward FFTW

437:   Level: advanced

439:   Note: The parallel layout of output of forward FFTW is always same as the input
440:         of the backward FFTW. But parallel layout of the input vector of forward
441:         FFTW might not be same as the output of backward FFTW.
        Also note that we need to provide enough space while doing parallel real transform.
        We need to pad extra zeros at the end of last dimension. For this reason one needs to
        invoke the routine fftw_mpi_local_size_transposed routines. Remember one has to change the
        last dimension from n to n/2+1 while invoking this routine. The number of zeros to be padded
        depends on whether the last dimension is even or odd: if it is even, two zeros must be
        padded; if it is odd, only one zero is needed.
448:         Lastly one needs some scratch space at the end of data set in each process. alloc_local
449:         figures out how much space is needed, i.e. it figures out the data+scratch space for
450:         each processor and returns that.

452: .seealso: MatCreateFFT()
453: @*/
PetscErrorCode MatCreateVecsFFTW(Mat A,Vec *x,Vec *y,Vec *z)
{
  /* dispatch to the implementation registered on A (MatCreateVecsFFTW_FFTW) */
  PetscUseMethod(A,"MatCreateVecsFFTW_C",(Mat,Vec*,Vec*,Vec*),(A,x,y,z));
  return(0);
}

463: PetscErrorCode  MatCreateVecsFFTW_FFTW(Mat A,Vec *fin,Vec *fout,Vec *bout)
464: {
466:   PetscMPIInt    size,rank;
467:   MPI_Comm       comm;
468:   Mat_FFT        *fft  = (Mat_FFT*)A->data;
469:   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
470:   PetscInt       N     = fft->N;
471:   PetscInt       ndim  = fft->ndim,*dim=fft->dim,n=fft->n;

476:   PetscObjectGetComm((PetscObject)A,&comm);

478:   MPI_Comm_size(comm, &size);
479:   MPI_Comm_rank(comm, &rank);
480:   if (size == 1) { /* sequential case */
481: #if defined(PETSC_USE_COMPLEX)
482:     if (fin)  {VecCreateSeq(PETSC_COMM_SELF,N,fin);}
483:     if (fout) {VecCreateSeq(PETSC_COMM_SELF,N,fout);}
484:     if (bout) {VecCreateSeq(PETSC_COMM_SELF,N,bout);}
485: #else
486:     if (fin) {VecCreateSeq(PETSC_COMM_SELF,n,fin);}
487:     if (fout) {VecCreateSeq(PETSC_COMM_SELF,n,fout);}
488:     if (bout) {VecCreateSeq(PETSC_COMM_SELF,n,bout);}
489: #endif
490:   } else { /* parallel cases */
491:     ptrdiff_t    alloc_local,local_n0,local_0_start;
492:     ptrdiff_t    local_n1;
493:     fftw_complex *data_fout;
494:     ptrdiff_t    local_1_start;
495: #if defined(PETSC_USE_COMPLEX)
496:     fftw_complex *data_fin,*data_bout;
497: #else
498:     double    *data_finr,*data_boutr;
499:     PetscInt  n1,N1;
500:     ptrdiff_t temp;
501: #endif

503:     switch (ndim) {
504:     case 1:
505: #if !defined(PETSC_USE_COMPLEX)
506:       SETERRQ(comm,PETSC_ERR_SUP,"FFTW does not allow parallel real 1D transform");
507: #else
508:       alloc_local = fftw_mpi_local_size_1d(dim[0],comm,FFTW_FORWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
509:       if (fin) {
510:         data_fin  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
511:         VecCreateMPIWithArray(comm,1,local_n0,N,(const PetscScalar*)data_fin,fin);
512:         PetscObjectCompose((PetscObject)*fin,"FFTmatrix",(PetscObject)A);
513:         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
514:         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
515:       }
516:       if (fout) {
517:         data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
518:         VecCreateMPIWithArray(comm,1,local_n1,N,(const PetscScalar*)data_fout,fout);
519:         PetscObjectCompose((PetscObject)*fout,"FFTmatrix",(PetscObject)A);
520:         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
521:         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
522:       }
523:       alloc_local = fftw_mpi_local_size_1d(dim[0],comm,FFTW_BACKWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
524:       if (bout) {
525:         data_bout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
526:         VecCreateMPIWithArray(comm,1,local_n1,N,(const PetscScalar*)data_bout,bout);
527:         PetscObjectCompose((PetscObject)*bout,"FFTmatrix",(PetscObject)A);
528:         (*bout)->ops->duplicate = VecDuplicate_FFTW_fout;
529:         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
530:       }
531:       break;
532: #endif
533:     case 2:
534: #if !defined(PETSC_USE_COMPLEX) /* Note that N1 is no more the product of individual dimensions */
535:       alloc_local =  fftw_mpi_local_size_2d_transposed(dim[0],dim[1]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
536:       N1          = 2*dim[0]*(dim[1]/2+1); n1 = 2*local_n0*(dim[1]/2+1);
537:       if (fin) {
538:         data_finr = (double*)fftw_malloc(sizeof(double)*alloc_local*2);
539:         VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_finr,fin);
540:         PetscObjectCompose((PetscObject)*fin,"FFTmatrix",(PetscObject)A);
541:         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
542:         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
543:       }
544:       if (fout) {
545:         data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
546:         VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_fout,fout);
547:         PetscObjectCompose((PetscObject)*fout,"FFTmatrix",(PetscObject)A);
548:         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
549:         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
550:       }
551:       if (bout) {
552:         data_boutr = (double*)fftw_malloc(sizeof(double)*alloc_local*2);
553:         VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_boutr,bout);
554:         PetscObjectCompose((PetscObject)*bout,"FFTmatrix",(PetscObject)A);
555:         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
556:         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
557:       }
558: #else
559:       /* Get local size */
560:       alloc_local = fftw_mpi_local_size_2d(dim[0],dim[1],comm,&local_n0,&local_0_start);
561:       if (fin) {
562:         data_fin = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
563:         VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fin,fin);
564:         PetscObjectCompose((PetscObject)*fin,"FFTmatrix",(PetscObject)A);
565:         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
566:         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
567:       }
568:       if (fout) {
569:         data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
570:         VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fout,fout);
571:         PetscObjectCompose((PetscObject)*fout,"FFTmatrix",(PetscObject)A);
572:         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
573:         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
574:       }
575:       if (bout) {
576:         data_bout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
577:         VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_bout,bout);
578:         PetscObjectCompose((PetscObject)*bout,"FFTmatrix",(PetscObject)A);
579:         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
580:         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
581:       }
582: #endif
583:       break;
584:     case 3:
585: #if !defined(PETSC_USE_COMPLEX)
586:       alloc_local =  fftw_mpi_local_size_3d_transposed(dim[0],dim[1],dim[2]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
587:       N1 = 2*dim[0]*dim[1]*(dim[2]/2+1); n1 = 2*local_n0*dim[1]*(dim[2]/2+1);
588:       if (fin) {
589:         data_finr = (double*)fftw_malloc(sizeof(double)*alloc_local*2);
590:         VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_finr,fin);
591:         PetscObjectCompose((PetscObject)*fin,"FFTmatrix",(PetscObject)A);
592:         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
593:         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
594:       }
595:       if (fout) {
596:         data_fout=(fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
597:         VecCreateMPIWithArray(comm,1,n1,N1,(PetscScalar*)data_fout,fout);
598:         PetscObjectCompose((PetscObject)*fout,"FFTmatrix",(PetscObject)A);
599:         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
600:         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
601:       }
602:       if (bout) {
603:         data_boutr=(double*)fftw_malloc(sizeof(double)*alloc_local*2);
604:         VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_boutr,bout);
605:         PetscObjectCompose((PetscObject)*bout,"FFTmatrix",(PetscObject)A);
606:         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
607:         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
608:       }
609: #else
610:       alloc_local = fftw_mpi_local_size_3d(dim[0],dim[1],dim[2],comm,&local_n0,&local_0_start);
611:       if (fin) {
612:         data_fin  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
613:         VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fin,fin);
614:         PetscObjectCompose((PetscObject)*fin,"FFTmatrix",(PetscObject)A);
615:         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
616:         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
617:       }
618:       if (fout) {
619:         data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
620:         VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fout,fout);
621:         PetscObjectCompose((PetscObject)*fout,"FFTmatrix",(PetscObject)A);
622:         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
623:         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
624:       }
625:       if (bout) {
626:         data_bout  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
627:         VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_bout,bout);
628:         PetscObjectCompose((PetscObject)*bout,"FFTmatrix",(PetscObject)A);
629:         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
630:         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
631:       }
632: #endif
633:       break;
634:     default:
635: #if !defined(PETSC_USE_COMPLEX)
636:       temp = (fftw->dim_fftw)[fftw->ndim_fftw-1];

638:       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp/2 + 1;

640:       alloc_local = fftw_mpi_local_size_transposed(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
641:       N1          = 2*N*(PetscInt)((fftw->dim_fftw)[fftw->ndim_fftw-1])/((PetscInt) temp);

643:       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp;

645:       if (fin) {
646:         data_finr=(double*)fftw_malloc(sizeof(double)*alloc_local*2);
647:         VecCreateMPIWithArray(comm,1,(PetscInt)n,N1,(PetscScalar*)data_finr,fin);
648:         PetscObjectCompose((PetscObject)*fin,"FFTmatrix",(PetscObject)A);
649:         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
650:         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
651:       }
652:       if (fout) {
653:         data_fout=(fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
654:         VecCreateMPIWithArray(comm,1,n,N1,(PetscScalar*)data_fout,fout);
655:         PetscObjectCompose((PetscObject)*fout,"FFTmatrix",(PetscObject)A);
656:         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
657:         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
658:       }
659:       if (bout) {
660:         data_boutr=(double*)fftw_malloc(sizeof(double)*alloc_local*2);
661:         VecCreateMPIWithArray(comm,1,(PetscInt)n,N1,(PetscScalar*)data_boutr,bout);
662:         PetscObjectCompose((PetscObject)*bout,"FFTmatrix",(PetscObject)A);
663:         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
664:         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
665:       }
666: #else
667:       alloc_local = fftw_mpi_local_size(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start);
668:       if (fin) {
669:         data_fin  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
670:         VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fin,fin);
671:         PetscObjectCompose((PetscObject)*fin,"FFTmatrix",(PetscObject)A);
672:         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
673:         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
674:       }
675:       if (fout) {
676:         data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
677:         VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fout,fout);
678:         PetscObjectCompose((PetscObject)*fout,"FFTmatrix",(PetscObject)A);
679:         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
680:         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
681:       }
682:       if (bout) {
683:         data_bout  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
684:         VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_bout,bout);
685:         PetscObjectCompose((PetscObject)*bout,"FFTmatrix",(PetscObject)A);
686:         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
687:         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
688:       }
689: #endif
690:       break;
691:     }
692:     /* fftw vectors have their data array allocated by fftw_malloc, such that v->array=xxx but
693:        v->array_allocated=NULL. A regular replacearray call won't free the memory and only causes
694:        memory leaks. We void these pointers here to avoid acident uses.
695:      */
696:     if (fin)  (*fin)->ops->replacearray = NULL;
697:     if (fout) (*fout)->ops->replacearray = NULL;
698:     if (bout) (*bout)->ops->replacearray = NULL;
699:   }
700:   return(0);
701: }

703: /*@
704:    VecScatterPetscToFFTW - Copies the PETSc vector to the vector that goes into FFTW block.

706:    Collective on Mat

708:    Input Parameters:
709: +  A - FFTW matrix
710: -  x - the PETSc vector

712:    Output Parameters:
713: .  y - the FFTW vector

715:   Options Database Keys:
716: . -mat_fftw_plannerflags - set FFTW planner flags

718:    Level: intermediate

   Note: For a real parallel FFT, FFTW requires insertion of extra space at the end of the last dimension. This is required even when
         one is not doing an in-place transform. The last dimension size must be changed to 2*(dim[last]/2+1) to accommodate these extra
         zeros. This routine does that job by a scattering operation.

724: .seealso: VecScatterFFTWToPetsc()
725: @*/
PetscErrorCode VecScatterPetscToFFTW(Mat A,Vec x,Vec y)
{
  /* dispatch to the implementation registered on A (VecScatterPetscToFFTW_FFTW) */
  PetscUseMethod(A,"VecScatterPetscToFFTW_C",(Mat,Vec,Vec),(A,x,y));
  return(0);
}

735: PetscErrorCode VecScatterPetscToFFTW_FFTW(Mat A,Vec x,Vec y)
736: {
738:   MPI_Comm       comm;
739:   Mat_FFT        *fft  = (Mat_FFT*)A->data;
740:   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
741:   PetscInt       N     =fft->N;
742:   PetscInt       ndim  =fft->ndim,*dim=fft->dim;
743:   PetscInt       low;
744:   PetscMPIInt    rank,size;
745:   PetscInt       vsize,vsize1;
746:   ptrdiff_t      local_n0,local_0_start;
747:   ptrdiff_t      local_n1,local_1_start;
748:   VecScatter     vecscat;
749:   IS             list1,list2;
750: #if !defined(PETSC_USE_COMPLEX)
751:   PetscInt       i,j,k,partial_dim;
752:   PetscInt       *indx1, *indx2, tempindx, tempindx1;
753:   PetscInt       NM;
754:   ptrdiff_t      temp;
755: #endif

758:   PetscObjectGetComm((PetscObject)A,&comm);
759:   MPI_Comm_size(comm, &size);
760:   MPI_Comm_rank(comm, &rank);
761:   VecGetOwnershipRange(y,&low,NULL);

763:   if (size==1) {
764:     VecGetSize(x,&vsize);
765:     VecGetSize(y,&vsize1);
766:     ISCreateStride(PETSC_COMM_SELF,N,0,1,&list1);
767:     VecScatterCreate(x,list1,y,list1,&vecscat);
768:     VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
769:     VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
770:     VecScatterDestroy(&vecscat);
771:     ISDestroy(&list1);
772:   } else {
773:     switch (ndim) {
774:     case 1:
775: #if defined(PETSC_USE_COMPLEX)
776:       fftw_mpi_local_size_1d(dim[0],comm,FFTW_FORWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);

778:       ISCreateStride(comm,local_n0,local_0_start,1,&list1);
779:       ISCreateStride(comm,local_n0,low,1,&list2);
780:       VecScatterCreate(x,list1,y,list2,&vecscat);
781:       VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
782:       VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
783:       VecScatterDestroy(&vecscat);
784:       ISDestroy(&list1);
785:       ISDestroy(&list2);
786: #else
787:       SETERRQ(comm,PETSC_ERR_SUP,"FFTW does not support parallel 1D real transform");
788: #endif
789:       break;
790:     case 2:
791: #if defined(PETSC_USE_COMPLEX)
792:       fftw_mpi_local_size_2d(dim[0],dim[1],comm,&local_n0,&local_0_start);

794:       ISCreateStride(comm,local_n0*dim[1],local_0_start*dim[1],1,&list1);
795:       ISCreateStride(comm,local_n0*dim[1],low,1,&list2);
796:       VecScatterCreate(x,list1,y,list2,&vecscat);
797:       VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
798:       VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
799:       VecScatterDestroy(&vecscat);
800:       ISDestroy(&list1);
801:       ISDestroy(&list2);
802: #else
803:       fftw_mpi_local_size_2d_transposed(dim[0],dim[1]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);

805:       PetscMalloc1(((PetscInt)local_n0)*dim[1],&indx1);
806:       PetscMalloc1(((PetscInt)local_n0)*dim[1],&indx2);

808:       if (dim[1]%2==0) {
809:         NM = dim[1]+2;
810:       } else {
811:         NM = dim[1]+1;
812:       }
813:       for (i=0; i<local_n0; i++) {
814:         for (j=0; j<dim[1]; j++) {
815:           tempindx  = i*dim[1] + j;
816:           tempindx1 = i*NM + j;

818:           indx1[tempindx]=local_0_start*dim[1]+tempindx;
819:           indx2[tempindx]=low+tempindx1;
820:         }
821:       }

823:       ISCreateGeneral(comm,local_n0*dim[1],indx1,PETSC_COPY_VALUES,&list1);
824:       ISCreateGeneral(comm,local_n0*dim[1],indx2,PETSC_COPY_VALUES,&list2);

826:       VecScatterCreate(x,list1,y,list2,&vecscat);
827:       VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
828:       VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
829:       VecScatterDestroy(&vecscat);
830:       ISDestroy(&list1);
831:       ISDestroy(&list2);
832:       PetscFree(indx1);
833:       PetscFree(indx2);
834: #endif
835:       break;

837:     case 3:
838: #if defined(PETSC_USE_COMPLEX)
839:       fftw_mpi_local_size_3d(dim[0],dim[1],dim[2],comm,&local_n0,&local_0_start);

841:       ISCreateStride(comm,local_n0*dim[1]*dim[2],local_0_start*dim[1]*dim[2],1,&list1);
842:       ISCreateStride(comm,local_n0*dim[1]*dim[2],low,1,&list2);
843:       VecScatterCreate(x,list1,y,list2,&vecscat);
844:       VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
845:       VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
846:       VecScatterDestroy(&vecscat);
847:       ISDestroy(&list1);
848:       ISDestroy(&list2);
849: #else
850:       /* buggy, needs to be fixed. See src/mat/tests/ex158.c */
851:       SETERRQ(comm,PETSC_ERR_SUP,"FFTW does not support parallel 3D real transform");
852:       fftw_mpi_local_size_3d_transposed(dim[0],dim[1],dim[2]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);

854:       PetscMalloc1(((PetscInt)local_n0)*dim[1]*dim[2],&indx1);
855:       PetscMalloc1(((PetscInt)local_n0)*dim[1]*dim[2],&indx2);

857:       if (dim[2]%2==0) NM = dim[2]+2;
858:       else             NM = dim[2]+1;

860:       for (i=0; i<local_n0; i++) {
861:         for (j=0; j<dim[1]; j++) {
862:           for (k=0; k<dim[2]; k++) {
863:             tempindx  = i*dim[1]*dim[2] + j*dim[2] + k;
864:             tempindx1 = i*dim[1]*NM + j*NM + k;

866:             indx1[tempindx]=local_0_start*dim[1]*dim[2]+tempindx;
867:             indx2[tempindx]=low+tempindx1;
868:           }
869:         }
870:       }

872:       ISCreateGeneral(comm,local_n0*dim[1]*dim[2],indx1,PETSC_COPY_VALUES,&list1);
873:       ISCreateGeneral(comm,local_n0*dim[1]*dim[2],indx2,PETSC_COPY_VALUES,&list2);
874:       VecScatterCreate(x,list1,y,list2,&vecscat);
875:       VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
876:       VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
877:       VecScatterDestroy(&vecscat);
878:       ISDestroy(&list1);
879:       ISDestroy(&list2);
880:       PetscFree(indx1);
881:       PetscFree(indx2);
882: #endif
883:       break;

885:     default:
886: #if defined(PETSC_USE_COMPLEX)
887:       fftw_mpi_local_size(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start);

889:       ISCreateStride(comm,local_n0*(fftw->partial_dim),local_0_start*(fftw->partial_dim),1,&list1);
890:       ISCreateStride(comm,local_n0*(fftw->partial_dim),low,1,&list2);
891:       VecScatterCreate(x,list1,y,list2,&vecscat);
892:       VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
893:       VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
894:       VecScatterDestroy(&vecscat);
895:       ISDestroy(&list1);
896:       ISDestroy(&list2);
897: #else
898:       /* buggy, needs to be fixed. See src/mat/tests/ex158.c */
899:       SETERRQ(comm,PETSC_ERR_SUP,"FFTW does not support parallel DIM>3 real transform");
900:       temp = (fftw->dim_fftw)[fftw->ndim_fftw-1];

902:       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp/2 + 1;

904:       fftw_mpi_local_size_transposed(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);

906:       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp;

908:       partial_dim = fftw->partial_dim;

910:       PetscMalloc1(((PetscInt)local_n0)*partial_dim,&indx1);
911:       PetscMalloc1(((PetscInt)local_n0)*partial_dim,&indx2);

913:       if (dim[ndim-1]%2==0) NM = 2;
914:       else                  NM = 1;

916:       j = low;
917:       for (i=0,k=1; i<((PetscInt)local_n0)*partial_dim;i++,k++) {
918:         indx1[i] = local_0_start*partial_dim + i;
919:         indx2[i] = j;
920:         if (k%dim[ndim-1]==0) j+=NM;
921:         j++;
922:       }
923:       ISCreateGeneral(comm,local_n0*partial_dim,indx1,PETSC_COPY_VALUES,&list1);
924:       ISCreateGeneral(comm,local_n0*partial_dim,indx2,PETSC_COPY_VALUES,&list2);
925:       VecScatterCreate(x,list1,y,list2,&vecscat);
926:       VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
927:       VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
928:       VecScatterDestroy(&vecscat);
929:       ISDestroy(&list1);
930:       ISDestroy(&list2);
931:       PetscFree(indx1);
932:       PetscFree(indx2);
933: #endif
934:       break;
935:     }
936:   }
937:   return(0);
938: }

940: /*@
941:    VecScatterFFTWToPetsc - Converts FFTW output to the PETSc vector.

943:    Collective on Mat

945:     Input Parameters:
946: +   A - FFTW matrix
947: -   x - FFTW vector

949:    Output Parameters:
950: .  y - PETSc vector

952:    Level: intermediate

   Note: When performing a real transform, the FFTW output of the backward DFT contains extra zeros at the end of the last dimension.
         VecScatterFFTWToPetsc removes those extra zeros.

957: .seealso: VecScatterPetscToFFTW()
958: @*/
959: PetscErrorCode VecScatterFFTWToPetsc(Mat A,Vec x,Vec y)
960: {

964:   PetscUseMethod(A,"VecScatterFFTWToPetsc_C",(Mat,Vec,Vec),(A,x,y));
965:   return(0);
966: }

968: PetscErrorCode VecScatterFFTWToPetsc_FFTW(Mat A,Vec x,Vec y)
969: {
971:   MPI_Comm       comm;
972:   Mat_FFT        *fft  = (Mat_FFT*)A->data;
973:   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
974:   PetscInt       N     = fft->N;
975:   PetscInt       ndim  = fft->ndim,*dim=fft->dim;
976:   PetscInt       low;
977:   PetscMPIInt    rank,size;
978:   ptrdiff_t      local_n0,local_0_start;
979:   ptrdiff_t      local_n1,local_1_start;
980:   VecScatter     vecscat;
981:   IS             list1,list2;
982: #if !defined(PETSC_USE_COMPLEX)
983:   PetscInt       i,j,k,partial_dim;
984:   PetscInt       *indx1, *indx2, tempindx, tempindx1;
985:   PetscInt       NM;
986:   ptrdiff_t      temp;
987: #endif

990:   PetscObjectGetComm((PetscObject)A,&comm);
991:   MPI_Comm_size(comm, &size);
992:   MPI_Comm_rank(comm, &rank);
993:   VecGetOwnershipRange(x,&low,NULL);

995:   if (size==1) {
996:     ISCreateStride(comm,N,0,1,&list1);
997:     VecScatterCreate(x,list1,y,list1,&vecscat);
998:     VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
999:     VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
1000:     VecScatterDestroy(&vecscat);
1001:     ISDestroy(&list1);

1003:   } else {
1004:     switch (ndim) {
1005:     case 1:
1006: #if defined(PETSC_USE_COMPLEX)
1007:       fftw_mpi_local_size_1d(dim[0],comm,FFTW_BACKWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);

1009:       ISCreateStride(comm,local_n1,local_1_start,1,&list1);
1010:       ISCreateStride(comm,local_n1,low,1,&list2);
1011:       VecScatterCreate(x,list1,y,list2,&vecscat);
1012:       VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
1013:       VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
1014:       VecScatterDestroy(&vecscat);
1015:       ISDestroy(&list1);
1016:       ISDestroy(&list2);
1017: #else
1018:       SETERRQ(comm,PETSC_ERR_SUP,"No support for real parallel 1D FFT");
1019: #endif
1020:       break;
1021:     case 2:
1022: #if defined(PETSC_USE_COMPLEX)
1023:       fftw_mpi_local_size_2d(dim[0],dim[1],comm,&local_n0,&local_0_start);

1025:       ISCreateStride(comm,local_n0*dim[1],local_0_start*dim[1],1,&list1);
1026:       ISCreateStride(comm,local_n0*dim[1],low,1,&list2);
1027:       VecScatterCreate(x,list2,y,list1,&vecscat);
1028:       VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
1029:       VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
1030:       VecScatterDestroy(&vecscat);
1031:       ISDestroy(&list1);
1032:       ISDestroy(&list2);
1033: #else
1034:       fftw_mpi_local_size_2d_transposed(dim[0],dim[1]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);

1036:       PetscMalloc1(((PetscInt)local_n0)*dim[1],&indx1);
1037:       PetscMalloc1(((PetscInt)local_n0)*dim[1],&indx2);

1039:       if (dim[1]%2==0) NM = dim[1]+2;
1040:       else             NM = dim[1]+1;

1042:       for (i=0; i<local_n0; i++) {
1043:         for (j=0; j<dim[1]; j++) {
1044:           tempindx = i*dim[1] + j;
1045:           tempindx1 = i*NM + j;

1047:           indx1[tempindx]=local_0_start*dim[1]+tempindx;
1048:           indx2[tempindx]=low+tempindx1;
1049:         }
1050:       }

1052:       ISCreateGeneral(comm,local_n0*dim[1],indx1,PETSC_COPY_VALUES,&list1);
1053:       ISCreateGeneral(comm,local_n0*dim[1],indx2,PETSC_COPY_VALUES,&list2);

1055:       VecScatterCreate(x,list2,y,list1,&vecscat);
1056:       VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
1057:       VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
1058:       VecScatterDestroy(&vecscat);
1059:       ISDestroy(&list1);
1060:       ISDestroy(&list2);
1061:       PetscFree(indx1);
1062:       PetscFree(indx2);
1063: #endif
1064:       break;
1065:     case 3:
1066: #if defined(PETSC_USE_COMPLEX)
1067:       fftw_mpi_local_size_3d(dim[0],dim[1],dim[2],comm,&local_n0,&local_0_start);

1069:       ISCreateStride(comm,local_n0*dim[1]*dim[2],local_0_start*dim[1]*dim[2],1,&list1);
1070:       ISCreateStride(comm,local_n0*dim[1]*dim[2],low,1,&list2);
1071:       VecScatterCreate(x,list1,y,list2,&vecscat);
1072:       VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
1073:       VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
1074:       VecScatterDestroy(&vecscat);
1075:       ISDestroy(&list1);
1076:       ISDestroy(&list2);
1077: #else
1078:       fftw_mpi_local_size_3d_transposed(dim[0],dim[1],dim[2]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);

1080:       PetscMalloc1(((PetscInt)local_n0)*dim[1]*dim[2],&indx1);
1081:       PetscMalloc1(((PetscInt)local_n0)*dim[1]*dim[2],&indx2);

1083:       if (dim[2]%2==0) NM = dim[2]+2;
1084:       else             NM = dim[2]+1;

1086:       for (i=0; i<local_n0; i++) {
1087:         for (j=0; j<dim[1]; j++) {
1088:           for (k=0; k<dim[2]; k++) {
1089:             tempindx  = i*dim[1]*dim[2] + j*dim[2] + k;
1090:             tempindx1 = i*dim[1]*NM + j*NM + k;

1092:             indx1[tempindx]=local_0_start*dim[1]*dim[2]+tempindx;
1093:             indx2[tempindx]=low+tempindx1;
1094:           }
1095:         }
1096:       }

1098:       ISCreateGeneral(comm,local_n0*dim[1]*dim[2],indx1,PETSC_COPY_VALUES,&list1);
1099:       ISCreateGeneral(comm,local_n0*dim[1]*dim[2],indx2,PETSC_COPY_VALUES,&list2);

1101:       VecScatterCreate(x,list2,y,list1,&vecscat);
1102:       VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
1103:       VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
1104:       VecScatterDestroy(&vecscat);
1105:       ISDestroy(&list1);
1106:       ISDestroy(&list2);
1107:       PetscFree(indx1);
1108:       PetscFree(indx2);
1109: #endif
1110:       break;
1111:     default:
1112: #if defined(PETSC_USE_COMPLEX)
1113:       fftw_mpi_local_size(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start);

1115:       ISCreateStride(comm,local_n0*(fftw->partial_dim),local_0_start*(fftw->partial_dim),1,&list1);
1116:       ISCreateStride(comm,local_n0*(fftw->partial_dim),low,1,&list2);
1117:       VecScatterCreate(x,list1,y,list2,&vecscat);
1118:       VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
1119:       VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
1120:       VecScatterDestroy(&vecscat);
1121:       ISDestroy(&list1);
1122:       ISDestroy(&list2);
1123: #else
1124:       temp = (fftw->dim_fftw)[fftw->ndim_fftw-1];

1126:       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp/2 + 1;

1128:       fftw_mpi_local_size_transposed(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);

1130:       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp;

1132:       partial_dim = fftw->partial_dim;

1134:       PetscMalloc1(((PetscInt)local_n0)*partial_dim,&indx1);
1135:       PetscMalloc1(((PetscInt)local_n0)*partial_dim,&indx2);

1137:       if (dim[ndim-1]%2==0) NM = 2;
1138:       else                  NM = 1;

1140:       j = low;
1141:       for (i=0,k=1; i<((PetscInt)local_n0)*partial_dim; i++,k++) {
1142:         indx1[i] = local_0_start*partial_dim + i;
1143:         indx2[i] = j;
1144:         if (k%dim[ndim-1]==0) j+=NM;
1145:         j++;
1146:       }
1147:       ISCreateGeneral(comm,local_n0*partial_dim,indx1,PETSC_COPY_VALUES,&list1);
1148:       ISCreateGeneral(comm,local_n0*partial_dim,indx2,PETSC_COPY_VALUES,&list2);

1150:       VecScatterCreate(x,list2,y,list1,&vecscat);
1151:       VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
1152:       VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
1153:       VecScatterDestroy(&vecscat);
1154:       ISDestroy(&list1);
1155:       ISDestroy(&list2);
1156:       PetscFree(indx1);
1157:       PetscFree(indx2);
1158: #endif
1159:       break;
1160:     }
1161:   }
1162:   return(0);
1163: }

1165: /*
1166:     MatCreate_FFTW - Creates a matrix object that provides FFT via the external package FFTW

1168:   Options Database Keys:
1169: + -mat_fftw_plannerflags - set FFTW planner flags

1171:    Level: intermediate

1173: */
1174: PETSC_EXTERN PetscErrorCode MatCreate_FFTW(Mat A)
1175: {
1177:   MPI_Comm       comm;
1178:   Mat_FFT        *fft=(Mat_FFT*)A->data;
1179:   Mat_FFTW       *fftw;
1180:   PetscInt       n=fft->n,N=fft->N,ndim=fft->ndim,*dim=fft->dim;
1181:   const char     *plans[]={"FFTW_ESTIMATE","FFTW_MEASURE","FFTW_PATIENT","FFTW_EXHAUSTIVE"};
1182:   unsigned       iplans[]={FFTW_ESTIMATE,FFTW_MEASURE,FFTW_PATIENT,FFTW_EXHAUSTIVE};
1183:   PetscBool      flg;
1184:   PetscInt       p_flag,partial_dim=1,ctr;
1185:   PetscMPIInt    size,rank;
1186:   ptrdiff_t      *pdim;
1187:   ptrdiff_t      local_n1,local_1_start;
1188: #if !defined(PETSC_USE_COMPLEX)
1189:   ptrdiff_t      temp;
1190:   PetscInt       N1,tot_dim;
1191: #else
1192:   PetscInt       n1;
1193: #endif

1196:   PetscObjectGetComm((PetscObject)A,&comm);
1197:   MPI_Comm_size(comm, &size);
1198:   MPI_Comm_rank(comm, &rank);

1200:   fftw_mpi_init();
1201:   pdim    = (ptrdiff_t*)calloc(ndim,sizeof(ptrdiff_t));
1202:   pdim[0] = dim[0];
1203: #if !defined(PETSC_USE_COMPLEX)
1204:   tot_dim = 2*dim[0];
1205: #endif
1206:   for (ctr=1; ctr<ndim; ctr++) {
1207:     partial_dim *= dim[ctr];
1208:     pdim[ctr]    = dim[ctr];
1209: #if !defined(PETSC_USE_COMPLEX)
1210:     if (ctr==ndim-1) tot_dim *= (dim[ctr]/2+1);
1211:     else             tot_dim *= dim[ctr];
1212: #endif
1213:   }

1215:   if (size == 1) {
1216: #if defined(PETSC_USE_COMPLEX)
1217:     MatSetSizes(A,N,N,N,N);
1218:     n    = N;
1219: #else
1220:     MatSetSizes(A,tot_dim,tot_dim,tot_dim,tot_dim);
1221:     n    = tot_dim;
1222: #endif

1224:   } else {
1225:     ptrdiff_t local_n0,local_0_start;
1226:     switch (ndim) {
1227:     case 1:
1228: #if !defined(PETSC_USE_COMPLEX)
1229:       SETERRQ(comm,PETSC_ERR_SUP,"FFTW does not support parallel 1D real transform");
1230: #else
1231:       fftw_mpi_local_size_1d(dim[0],comm,FFTW_FORWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);

1233:       n    = (PetscInt)local_n0;
1234:       n1   = (PetscInt)local_n1;
1235:       MatSetSizes(A,n1,n,N,N);
1236: #endif
1237:       break;
1238:     case 2:
1239: #if defined(PETSC_USE_COMPLEX)
1240:       fftw_mpi_local_size_2d(dim[0],dim[1],comm,&local_n0,&local_0_start);
1241:       /*
1242:        PetscSynchronizedPrintf(comm,"[%d] MatCreateSeqFFTW: local_n0, local_0_start %d %d, N %d,dim %d, %d\n",rank,(PetscInt)local_n0*dim[1],(PetscInt)local_0_start,m,dim[0],dim[1]);
1243:        PetscSynchronizedFlush(comm,PETSC_STDOUT);
1244:        */
1245:       n    = (PetscInt)local_n0*dim[1];
1246:       MatSetSizes(A,n,n,N,N);
1247: #else
1248:       fftw_mpi_local_size_2d_transposed(dim[0],dim[1]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);

1250:       n    = 2*(PetscInt)local_n0*(dim[1]/2+1);
1251:       MatSetSizes(A,n,n,2*dim[0]*(dim[1]/2+1),2*dim[0]*(dim[1]/2+1));
1252: #endif
1253:       break;
1254:     case 3:
1255: #if defined(PETSC_USE_COMPLEX)
1256:       fftw_mpi_local_size_3d(dim[0],dim[1],dim[2],comm,&local_n0,&local_0_start);

1258:       n    = (PetscInt)local_n0*dim[1]*dim[2];
1259:       MatSetSizes(A,n,n,N,N);
1260: #else
1261:       fftw_mpi_local_size_3d_transposed(dim[0],dim[1],dim[2]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);

1263:       n   = 2*(PetscInt)local_n0*dim[1]*(dim[2]/2+1);
1264:       MatSetSizes(A,n,n,2*dim[0]*dim[1]*(dim[2]/2+1),2*dim[0]*dim[1]*(dim[2]/2+1));
1265: #endif
1266:       break;
1267:     default:
1268: #if defined(PETSC_USE_COMPLEX)
1269:       fftw_mpi_local_size(ndim,pdim,comm,&local_n0,&local_0_start);

1271:       n    = (PetscInt)local_n0*partial_dim;
1272:       MatSetSizes(A,n,n,N,N);
1273: #else
1274:       temp = pdim[ndim-1];

1276:       pdim[ndim-1] = temp/2 + 1;

1278:       fftw_mpi_local_size_transposed(ndim,pdim,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);

1280:       n  = 2*(PetscInt)local_n0*partial_dim*pdim[ndim-1]/temp;
1281:       N1 = 2*N*(PetscInt)pdim[ndim-1]/((PetscInt) temp);

1283:       pdim[ndim-1] = temp;

1285:       MatSetSizes(A,n,n,N1,N1);
1286: #endif
1287:       break;
1288:     }
1289:   }
1290:   free(pdim);
1291:   PetscObjectChangeTypeName((PetscObject)A,MATFFTW);
1292:   PetscNewLog(A,&fftw);
1293:   fft->data = (void*)fftw;

1295:   fft->n            = n;
1296:   fftw->ndim_fftw   = (ptrdiff_t)ndim; /* This is dimension of fft */
1297:   fftw->partial_dim = partial_dim;

1299:   PetscMalloc1(ndim, &(fftw->dim_fftw));
1300:   if (size == 1) {
1301: #if defined(PETSC_USE_64BIT_INDICES)
1302:     fftw->iodims = (fftw_iodim64 *) malloc(sizeof(fftw_iodim64) * ndim);
1303: #else
1304:     fftw->iodims = (fftw_iodim *) malloc(sizeof(fftw_iodim) * ndim);
1305: #endif
1306:   }

1308:   for (ctr=0;ctr<ndim;ctr++) (fftw->dim_fftw)[ctr]=dim[ctr];

1310:   fftw->p_forward  = 0;
1311:   fftw->p_backward = 0;
1312:   fftw->p_flag     = FFTW_ESTIMATE;

1314:   if (size == 1) {
1315:     A->ops->mult          = MatMult_SeqFFTW;
1316:     A->ops->multtranspose = MatMultTranspose_SeqFFTW;
1317:   } else {
1318:     A->ops->mult          = MatMult_MPIFFTW;
1319:     A->ops->multtranspose = MatMultTranspose_MPIFFTW;
1320:   }
1321:   fft->matdestroy = MatDestroy_FFTW;
1322:   A->assembled    = PETSC_TRUE;
1323:   A->preallocated = PETSC_TRUE;

1325:   PetscObjectComposeFunction((PetscObject)A,"MatCreateVecsFFTW_C",MatCreateVecsFFTW_FFTW);
1326:   PetscObjectComposeFunction((PetscObject)A,"VecScatterPetscToFFTW_C",VecScatterPetscToFFTW_FFTW);
1327:   PetscObjectComposeFunction((PetscObject)A,"VecScatterFFTWToPetsc_C",VecScatterFFTWToPetsc_FFTW);

1329:   /* get runtime options */
1330:   PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"FFTW Options","Mat");
1331:   PetscOptionsEList("-mat_fftw_plannerflags","Planner Flags","None",plans,4,plans[0],&p_flag,&flg);
1332:   if (flg) {
1333:     fftw->p_flag = iplans[p_flag];
1334:   }
1335:   PetscOptionsEnd();
1336:   return(0);
1337: }