Actual source code: fftw.c


/*
    Provides an interface to the FFTW package.
    Testing examples can be found in ~src/mat/tests
*/

#include <../src/mat/impls/fft/fft.h>
EXTERN_C_BEGIN
#if !PetscDefined(HAVE_MPIUNI)
#include <fftw3-mpi.h>
#else
#include <fftw3.h>
#endif
EXTERN_C_END

typedef struct {
  ptrdiff_t    ndim_fftw,*dim_fftw;
#if defined(PETSC_USE_64BIT_INDICES)
  fftw_iodim64 *iodims;
#else
  fftw_iodim   *iodims;
#endif
  PetscInt     partial_dim;
  fftw_plan    p_forward,p_backward;
  unsigned     p_flag; /* planner flags: FFTW_ESTIMATE, FFTW_MEASURE, FFTW_PATIENT, FFTW_EXHAUSTIVE */
  PetscScalar  *finarray,*foutarray,*binarray,*boutarray; /* keep track of the arrays because an fftw plan should only be
                                                             executed for the arrays with which the plan was created */
} Mat_FFTW;

extern PetscErrorCode MatMult_SeqFFTW(Mat,Vec,Vec);
extern PetscErrorCode MatMultTranspose_SeqFFTW(Mat,Vec,Vec);
extern PetscErrorCode MatDestroy_FFTW(Mat);
extern PetscErrorCode MatCreateVecsFFTW_FFTW(Mat,Vec*,Vec*,Vec*);
#if !PetscDefined(HAVE_MPIUNI)
extern PetscErrorCode MatMult_MPIFFTW(Mat,Vec,Vec);
extern PetscErrorCode MatMultTranspose_MPIFFTW(Mat,Vec,Vec);
extern PetscErrorCode VecDestroy_MPIFFTW(Vec);
#endif

/*
   MatMult_SeqFFTW performs the sequential forward DFT

   Input Parameters:
     A - the matrix
     x - the vector on which the forward DFT will be performed

   Output Parameter:
     y - the vector that stores the result of the forward DFT
*/
PetscErrorCode MatMult_SeqFFTW(Mat A,Vec x,Vec y)
{
  Mat_FFT           *fft  = (Mat_FFT*)A->data;
  Mat_FFTW          *fftw = (Mat_FFTW*)fft->data;
  const PetscScalar *x_array;
  PetscScalar       *y_array;
#if defined(PETSC_USE_COMPLEX)
#if defined(PETSC_USE_64BIT_INDICES)
  fftw_iodim64      *iodims;
#else
  fftw_iodim        *iodims;
#endif
  PetscInt          i;
#endif
  PetscInt          ndim = fft->ndim,*dim = fft->dim;

  VecGetArrayRead(x,&x_array);
  VecGetArray(y,&y_array);
  if (!fftw->p_forward) { /* create a plan, then execute it */
    switch (ndim) {
    case 1:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_forward = fftw_plan_dft_1d(dim[0],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
#else
      fftw->p_forward = fftw_plan_dft_r2c_1d(dim[0],(double*)x_array,(fftw_complex*)y_array,fftw->p_flag);
#endif
      break;
    case 2:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_forward = fftw_plan_dft_2d(dim[0],dim[1],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
#else
      fftw->p_forward = fftw_plan_dft_r2c_2d(dim[0],dim[1],(double*)x_array,(fftw_complex*)y_array,fftw->p_flag);
#endif
      break;
    case 3:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_forward = fftw_plan_dft_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
#else
      fftw->p_forward = fftw_plan_dft_r2c_3d(dim[0],dim[1],dim[2],(double*)x_array,(fftw_complex*)y_array,fftw->p_flag);
#endif
      break;
    default:
#if defined(PETSC_USE_COMPLEX)
      iodims = fftw->iodims;
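      /* Fill the FFTW "guru" iodims for an ndim-dimensional row-major array:
         the innermost dimension has stride 1, and each outer stride is the
         product of the sizes of all dimensions inside it */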
#if defined(PETSC_USE_64BIT_INDICES)
      if (ndim) {
        iodims[ndim-1].n  = (ptrdiff_t)dim[ndim-1];
        iodims[ndim-1].is = iodims[ndim-1].os = 1;
        for (i=ndim-2; i>=0; --i) {
          iodims[i].n  = (ptrdiff_t)dim[i];
          iodims[i].is = iodims[i].os = iodims[i+1].is * iodims[i+1].n;
        }
      }
      fftw->p_forward = fftw_plan_guru64_dft((int)ndim,(fftw_iodim64*)iodims,0,NULL,(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
#else
      if (ndim) {
        iodims[ndim-1].n  = (int)dim[ndim-1];
        iodims[ndim-1].is = iodims[ndim-1].os = 1;
        for (i=ndim-2; i>=0; --i) {
          iodims[i].n  = (int)dim[i];
          iodims[i].is = iodims[i].os = iodims[i+1].is * iodims[i+1].n;
        }
      }
      fftw->p_forward = fftw_plan_guru_dft((int)ndim,(fftw_iodim*)iodims,0,NULL,(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
#endif

#else
      fftw->p_forward = fftw_plan_dft_r2c(ndim,(int*)dim,(double*)x_array,(fftw_complex*)y_array,fftw->p_flag);
#endif
      break;
    }
    fftw->finarray  = (PetscScalar*)x_array;
    fftw->foutarray = y_array;
    /* Warning: if (fftw->p_flag != FFTW_ESTIMATE) the data in the in/out arrays is overwritten during planning!
                Planning should therefore be done before x is initialized; see the FFTW manual, Secs. 2.1 and 4 */
    fftw_execute(fftw->p_forward);
  } else { /* use existing plan */
    if (fftw->finarray != x_array || fftw->foutarray != y_array) { /* use existing plan on new arrays */
#if defined(PETSC_USE_COMPLEX)
      fftw_execute_dft(fftw->p_forward,(fftw_complex*)x_array,(fftw_complex*)y_array);
#else
      fftw_execute_dft_r2c(fftw->p_forward,(double*)x_array,(fftw_complex*)y_array);
#endif
    } else {
      fftw_execute(fftw->p_forward);
    }
  }
  VecRestoreArray(y,&y_array);
  VecRestoreArrayRead(x,&x_array);
  return 0;
}

/*
   MatMultTranspose_SeqFFTW performs the sequential backward DFT

   Input Parameters:
     A - the matrix
     x - the vector on which the backward DFT will be performed

   Output Parameter:
     y - the vector that stores the result of the backward DFT
*/

PetscErrorCode MatMultTranspose_SeqFFTW(Mat A,Vec x,Vec y)
{
  Mat_FFT           *fft  = (Mat_FFT*)A->data;
  Mat_FFTW          *fftw = (Mat_FFTW*)fft->data;
  const PetscScalar *x_array;
  PetscScalar       *y_array;
  PetscInt          ndim = fft->ndim,*dim = fft->dim;
#if defined(PETSC_USE_COMPLEX)
#if defined(PETSC_USE_64BIT_INDICES)
  fftw_iodim64      *iodims = fftw->iodims;
#else
  fftw_iodim        *iodims = fftw->iodims;
#endif
#endif

  VecGetArrayRead(x,&x_array);
  VecGetArray(y,&y_array);
  if (!fftw->p_backward) { /* create a plan, then execute it */
    switch (ndim) {
    case 1:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_backward = fftw_plan_dft_1d(dim[0],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
#else
      fftw->p_backward = fftw_plan_dft_c2r_1d(dim[0],(fftw_complex*)x_array,(double*)y_array,fftw->p_flag);
#endif
      break;
    case 2:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_backward = fftw_plan_dft_2d(dim[0],dim[1],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
#else
      fftw->p_backward = fftw_plan_dft_c2r_2d(dim[0],dim[1],(fftw_complex*)x_array,(double*)y_array,fftw->p_flag);
#endif
      break;
    case 3:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_backward = fftw_plan_dft_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
#else
      fftw->p_backward = fftw_plan_dft_c2r_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(double*)y_array,fftw->p_flag);
#endif
      break;
    default:
#if defined(PETSC_USE_COMPLEX)
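      /* reuse the guru iodims, which MatMult_SeqFFTW fills in when it creates the forward plan */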
#if defined(PETSC_USE_64BIT_INDICES)
      fftw->p_backward = fftw_plan_guru64_dft((int)ndim,(fftw_iodim64*)iodims,0,NULL,(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
#else
      fftw->p_backward = fftw_plan_guru_dft((int)ndim,iodims,0,NULL,(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
#endif
#else
      fftw->p_backward = fftw_plan_dft_c2r((int)ndim,(int*)dim,(fftw_complex*)x_array,(double*)y_array,fftw->p_flag);
#endif
      break;
    }
    fftw->binarray  = (PetscScalar*)x_array;
    fftw->boutarray = y_array;
    fftw_execute(fftw->p_backward);
  } else { /* use existing plan */
    if (fftw->binarray != x_array || fftw->boutarray != y_array) { /* use existing plan on new arrays */
#if defined(PETSC_USE_COMPLEX)
      fftw_execute_dft(fftw->p_backward,(fftw_complex*)x_array,(fftw_complex*)y_array);
#else
      fftw_execute_dft_c2r(fftw->p_backward,(fftw_complex*)x_array,(double*)y_array);
#endif
    } else {
      fftw_execute(fftw->p_backward);
    }
  }
  VecRestoreArray(y,&y_array);
  VecRestoreArrayRead(x,&x_array);
  return 0;
}

#if !PetscDefined(HAVE_MPIUNI)
/*
   MatMult_MPIFFTW performs the parallel forward DFT

   Input Parameters:
     A - the matrix
     x - the vector on which the forward DFT will be performed

   Output Parameter:
     y - the vector that stores the result of the forward DFT
*/
PetscErrorCode MatMult_MPIFFTW(Mat A,Vec x,Vec y)
{
  Mat_FFT           *fft  = (Mat_FFT*)A->data;
  Mat_FFTW          *fftw = (Mat_FFTW*)fft->data;
  const PetscScalar *x_array;
  PetscScalar       *y_array;
  PetscInt          ndim = fft->ndim,*dim = fft->dim;
  MPI_Comm          comm;

  PetscObjectGetComm((PetscObject)A,&comm);
  VecGetArrayRead(x,&x_array);
  VecGetArray(y,&y_array);
  if (!fftw->p_forward) { /* create a plan, then execute it */
    switch (ndim) {
    case 1:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_forward = fftw_mpi_plan_dft_1d(dim[0],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_FORWARD,fftw->p_flag);
#else
      SETERRQ(comm,PETSC_ERR_SUP,"No support for real numbers yet");
#endif
      break;
    case 2:
#if defined(PETSC_USE_COMPLEX) /* For complex transforms call fftw_mpi_plan_dft, for real transforms call fftw_mpi_plan_dft_r2c */
      fftw->p_forward = fftw_mpi_plan_dft_2d(dim[0],dim[1],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_FORWARD,fftw->p_flag);
#else
      fftw->p_forward = fftw_mpi_plan_dft_r2c_2d(dim[0],dim[1],(double*)x_array,(fftw_complex*)y_array,comm,FFTW_ESTIMATE);
#endif
      break;
    case 3:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_forward = fftw_mpi_plan_dft_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_FORWARD,fftw->p_flag);
#else
      fftw->p_forward = fftw_mpi_plan_dft_r2c_3d(dim[0],dim[1],dim[2],(double*)x_array,(fftw_complex*)y_array,comm,FFTW_ESTIMATE);
#endif
      break;
    default:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_forward = fftw_mpi_plan_dft(fftw->ndim_fftw,fftw->dim_fftw,(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_FORWARD,fftw->p_flag);
#else
      fftw->p_forward = fftw_mpi_plan_dft_r2c(fftw->ndim_fftw,fftw->dim_fftw,(double*)x_array,(fftw_complex*)y_array,comm,FFTW_ESTIMATE);
#endif
      break;
    }
    fftw->finarray  = (PetscScalar*)x_array;
    fftw->foutarray = y_array;
    /* Warning: if (fftw->p_flag != FFTW_ESTIMATE) the data in the in/out arrays is overwritten during planning!
                Planning should therefore be done before x is initialized; see the FFTW manual, Secs. 2.1 and 4 */
    fftw_execute(fftw->p_forward);
  } else { /* use existing plan */
    if (fftw->finarray != x_array || fftw->foutarray != y_array) { /* use existing plan on new arrays */
      fftw_execute_dft(fftw->p_forward,(fftw_complex*)x_array,(fftw_complex*)y_array);
    } else {
      fftw_execute(fftw->p_forward);
    }
  }
  VecRestoreArray(y,&y_array);
  VecRestoreArrayRead(x,&x_array);
  return 0;
}

/*
   MatMultTranspose_MPIFFTW performs the parallel backward DFT

   Input Parameters:
     A - the matrix
     x - the vector on which the backward DFT will be performed

   Output Parameter:
     y - the vector that stores the result of the backward DFT
*/
PetscErrorCode MatMultTranspose_MPIFFTW(Mat A,Vec x,Vec y)
{
  Mat_FFT           *fft  = (Mat_FFT*)A->data;
  Mat_FFTW          *fftw = (Mat_FFTW*)fft->data;
  const PetscScalar *x_array;
  PetscScalar       *y_array;
  PetscInt          ndim = fft->ndim,*dim = fft->dim;
  MPI_Comm          comm;

  PetscObjectGetComm((PetscObject)A,&comm);
  VecGetArrayRead(x,&x_array);
  VecGetArray(y,&y_array);
  if (!fftw->p_backward) { /* create a plan, then execute it */
    switch (ndim) {
    case 1:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_backward = fftw_mpi_plan_dft_1d(dim[0],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_BACKWARD,fftw->p_flag);
#else
      SETERRQ(comm,PETSC_ERR_SUP,"No support for real numbers yet");
#endif
      break;
    case 2:
#if defined(PETSC_USE_COMPLEX) /* For complex transforms call fftw_mpi_plan_dft with flag FFTW_BACKWARD, for real transforms call fftw_mpi_plan_dft_c2r */
      fftw->p_backward = fftw_mpi_plan_dft_2d(dim[0],dim[1],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_BACKWARD,fftw->p_flag);
#else
      fftw->p_backward = fftw_mpi_plan_dft_c2r_2d(dim[0],dim[1],(fftw_complex*)x_array,(double*)y_array,comm,FFTW_ESTIMATE);
#endif
      break;
    case 3:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_backward = fftw_mpi_plan_dft_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_BACKWARD,fftw->p_flag);
#else
      fftw->p_backward = fftw_mpi_plan_dft_c2r_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(double*)y_array,comm,FFTW_ESTIMATE);
#endif
      break;
    default:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_backward = fftw_mpi_plan_dft(fftw->ndim_fftw,fftw->dim_fftw,(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_BACKWARD,fftw->p_flag);
#else
      fftw->p_backward = fftw_mpi_plan_dft_c2r(fftw->ndim_fftw,fftw->dim_fftw,(fftw_complex*)x_array,(double*)y_array,comm,FFTW_ESTIMATE);
#endif
      break;
    }
    fftw->binarray  = (PetscScalar*)x_array;
    fftw->boutarray = y_array;
    fftw_execute(fftw->p_backward);
  } else { /* use existing plan */
    if (fftw->binarray != x_array || fftw->boutarray != y_array) { /* use existing plan on new arrays */
      fftw_execute_dft(fftw->p_backward,(fftw_complex*)x_array,(fftw_complex*)y_array);
    } else {
      fftw_execute(fftw->p_backward);
    }
  }
  VecRestoreArray(y,&y_array);
  VecRestoreArrayRead(x,&x_array);
  return 0;
}
#endif

PetscErrorCode MatDestroy_FFTW(Mat A)
{
  Mat_FFT  *fft  = (Mat_FFT*)A->data;
  Mat_FFTW *fftw = (Mat_FFTW*)fft->data;

  /* plans exist only after a multiply has been performed */
  if (fftw->p_forward)  fftw_destroy_plan(fftw->p_forward);
  if (fftw->p_backward) fftw_destroy_plan(fftw->p_backward);
  if (fftw->iodims) free(fftw->iodims);
  PetscFree(fftw->dim_fftw);
  PetscFree(fft->data);
#if !PetscDefined(HAVE_MPIUNI)
  fftw_mpi_cleanup();
#endif
  return 0;
}

#if !PetscDefined(HAVE_MPIUNI)
#include <../src/vec/vec/impls/mpi/pvecimpl.h>
PetscErrorCode VecDestroy_MPIFFTW(Vec v)
{
  PetscScalar *array;

  VecGetArray(v,&array);
  fftw_free((fftw_complex*)array);
  VecRestoreArray(v,&array);
  VecDestroy_MPI(v);
  return 0;
}
#endif

#if !PetscDefined(HAVE_MPIUNI)
static PetscErrorCode VecDuplicate_FFTW_fin(Vec fin,Vec *fin_new)
{
  Mat A;

  PetscObjectQuery((PetscObject)fin,"FFTmatrix",(PetscObject*)&A);
  MatCreateVecsFFTW_FFTW(A,fin_new,NULL,NULL);
  return 0;
}

static PetscErrorCode VecDuplicate_FFTW_fout(Vec fout,Vec *fout_new)
{
  Mat A;

  PetscObjectQuery((PetscObject)fout,"FFTmatrix",(PetscObject*)&A);
  MatCreateVecsFFTW_FFTW(A,NULL,fout_new,NULL);
  return 0;
}

static PetscErrorCode VecDuplicate_FFTW_bout(Vec bout,Vec *bout_new)
{
  Mat A;

  PetscObjectQuery((PetscObject)bout,"FFTmatrix",(PetscObject*)&A);
  MatCreateVecsFFTW_FFTW(A,NULL,NULL,bout_new);
  return 0;
}
#endif

/*@
   MatCreateVecsFFTW - Get vector(s) compatible with the matrix, i.e. with the
     parallel layout determined by FFTW

   Collective on Mat

   Input Parameter:
.   A - the matrix

   Output Parameters:
+   x - (optional) input vector of forward FFTW
.   y - (optional) output vector of forward FFTW
-   z - (optional) output vector of backward FFTW

  Level: advanced

  Note: The parallel layout of the output of the forward FFTW is always the same as the input
        of the backward FFTW, but the parallel layout of the input vector of the forward FFTW
        might not be the same as the output of the backward FFTW.
        Also note that enough space must be provided when doing a parallel real transform:
        extra zeros must be padded at the end of the last dimension, and for this reason one
        needs to invoke the fftw_mpi_local_size_transposed routine, changing the last
        dimension from n to n/2+1 when doing so. The number of zeros to be padded depends on
        whether the last dimension is even or odd: if it is even, two zeros are needed; if it
        is odd, only one.
        Lastly, some scratch space is needed at the end of the data set on each process;
        alloc_local figures out how much space is needed, i.e. it computes the data+scratch
        space for each process and returns that.
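        For example, for an even last dimension n = 4 the padded length is
        2*(4/2+1) = 6, i.e. two extra zeros per row, while for an odd n = 5 it is
        2*(5/2+1) = 6, i.e. one extra zero.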

.seealso: MatCreateFFT()
@*/
PetscErrorCode MatCreateVecsFFTW(Mat A,Vec *x,Vec *y,Vec *z)
{
  PetscUseMethod(A,"MatCreateVecsFFTW_C",(Mat,Vec*,Vec*,Vec*),(A,x,y,z));
  return 0;
}

PetscErrorCode MatCreateVecsFFTW_FFTW(Mat A,Vec *fin,Vec *fout,Vec *bout)
{
  PetscMPIInt size,rank;
  MPI_Comm    comm;
  Mat_FFT     *fft = (Mat_FFT*)A->data;

  PetscObjectGetComm((PetscObject)A,&comm);

  MPI_Comm_size(comm,&size);
  MPI_Comm_rank(comm,&rank);
  if (size == 1) { /* sequential case */
#if defined(PETSC_USE_COMPLEX)
    if (fin)  VecCreateSeq(PETSC_COMM_SELF,fft->N,fin);
    if (fout) VecCreateSeq(PETSC_COMM_SELF,fft->N,fout);
    if (bout) VecCreateSeq(PETSC_COMM_SELF,fft->N,bout);
#else
    if (fin)  VecCreateSeq(PETSC_COMM_SELF,fft->n,fin);
    if (fout) VecCreateSeq(PETSC_COMM_SELF,fft->n,fout);
    if (bout) VecCreateSeq(PETSC_COMM_SELF,fft->n,bout);
#endif
#if !PetscDefined(HAVE_MPIUNI)
  } else { /* parallel cases */
    Mat_FFTW     *fftw = (Mat_FFTW*)fft->data;
    PetscInt     ndim  = fft->ndim,*dim = fft->dim;
    ptrdiff_t    alloc_local,local_n0,local_0_start;
    ptrdiff_t    local_n1;
    fftw_complex *data_fout;
    ptrdiff_t    local_1_start;
#if defined(PETSC_USE_COMPLEX)
    fftw_complex *data_fin,*data_bout;
#else
    double       *data_finr,*data_boutr;
    PetscInt     n1,N1;
    ptrdiff_t    temp;
#endif

    switch (ndim) {
    case 1:
#if !defined(PETSC_USE_COMPLEX)
      SETERRQ(comm,PETSC_ERR_SUP,"FFTW does not allow parallel real 1D transform");
#else
      alloc_local = fftw_mpi_local_size_1d(dim[0],comm,FFTW_FORWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
      if (fin) {
        data_fin = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
        VecCreateMPIWithArray(comm,1,local_n0,fft->N,(const PetscScalar*)data_fin,fin);
        PetscObjectCompose((PetscObject)*fin,"FFTmatrix",(PetscObject)A);
        (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
        (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
      }
      if (fout) {
        data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
        VecCreateMPIWithArray(comm,1,local_n1,fft->N,(const PetscScalar*)data_fout,fout);
        PetscObjectCompose((PetscObject)*fout,"FFTmatrix",(PetscObject)A);
        (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
        (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
      }
      alloc_local = fftw_mpi_local_size_1d(dim[0],comm,FFTW_BACKWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
      if (bout) {
        data_bout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
        VecCreateMPIWithArray(comm,1,local_n1,fft->N,(const PetscScalar*)data_bout,bout);
        PetscObjectCompose((PetscObject)*bout,"FFTmatrix",(PetscObject)A);
        (*bout)->ops->duplicate = VecDuplicate_FFTW_bout; /* was VecDuplicate_FFTW_fout, an apparent copy-paste slip */
        (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
      }
      break;
#endif
    case 2:
#if !defined(PETSC_USE_COMPLEX) /* Note that N1 is no longer the product of the individual dimensions */
      alloc_local = fftw_mpi_local_size_2d_transposed(dim[0],dim[1]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
      N1          = 2*dim[0]*(dim[1]/2+1); n1 = 2*local_n0*(dim[1]/2+1);
      if (fin) {
        data_finr = (double*)fftw_malloc(sizeof(double)*alloc_local*2);
        VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_finr,fin);
        PetscObjectCompose((PetscObject)*fin,"FFTmatrix",(PetscObject)A);
        (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
        (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
      }
      if (fout) {
        data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
        VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_fout,fout);
        PetscObjectCompose((PetscObject)*fout,"FFTmatrix",(PetscObject)A);
        (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
        (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
      }
      if (bout) {
        data_boutr = (double*)fftw_malloc(sizeof(double)*alloc_local*2);
        VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_boutr,bout);
        PetscObjectCompose((PetscObject)*bout,"FFTmatrix",(PetscObject)A);
        (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
        (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
      }
#else
      /* Get local size */
      alloc_local = fftw_mpi_local_size_2d(dim[0],dim[1],comm,&local_n0,&local_0_start);
      if (fin) {
        data_fin = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
        VecCreateMPIWithArray(comm,1,fft->n,fft->N,(const PetscScalar*)data_fin,fin);
        PetscObjectCompose((PetscObject)*fin,"FFTmatrix",(PetscObject)A);
        (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
        (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
      }
      if (fout) {
        data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
        VecCreateMPIWithArray(comm,1,fft->n,fft->N,(const PetscScalar*)data_fout,fout);
        PetscObjectCompose((PetscObject)*fout,"FFTmatrix",(PetscObject)A);
        (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
        (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
      }
      if (bout) {
        data_bout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
        VecCreateMPIWithArray(comm,1,fft->n,fft->N,(const PetscScalar*)data_bout,bout);
        PetscObjectCompose((PetscObject)*bout,"FFTmatrix",(PetscObject)A);
        (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
        (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
      }
#endif
      break;
    case 3:
#if !defined(PETSC_USE_COMPLEX)
      alloc_local = fftw_mpi_local_size_3d_transposed(dim[0],dim[1],dim[2]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
      N1 = 2*dim[0]*dim[1]*(dim[2]/2+1); n1 = 2*local_n0*dim[1]*(dim[2]/2+1);
      if (fin) {
        data_finr = (double*)fftw_malloc(sizeof(double)*alloc_local*2);
        VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_finr,fin);
        PetscObjectCompose((PetscObject)*fin,"FFTmatrix",(PetscObject)A);
        (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
        (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
      }
      if (fout) {
        data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
        VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_fout,fout);
        PetscObjectCompose((PetscObject)*fout,"FFTmatrix",(PetscObject)A);
        (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
        (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
      }
      if (bout) {
        data_boutr = (double*)fftw_malloc(sizeof(double)*alloc_local*2);
        VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_boutr,bout);
        PetscObjectCompose((PetscObject)*bout,"FFTmatrix",(PetscObject)A);
        (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
        (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
      }
#else
      alloc_local = fftw_mpi_local_size_3d(dim[0],dim[1],dim[2],comm,&local_n0,&local_0_start);
      if (fin) {
        data_fin = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
        VecCreateMPIWithArray(comm,1,fft->n,fft->N,(const PetscScalar*)data_fin,fin);
        PetscObjectCompose((PetscObject)*fin,"FFTmatrix",(PetscObject)A);
        (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
        (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
      }
      if (fout) {
        data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
        VecCreateMPIWithArray(comm,1,fft->n,fft->N,(const PetscScalar*)data_fout,fout);
        PetscObjectCompose((PetscObject)*fout,"FFTmatrix",(PetscObject)A);
        (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
        (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
      }
      if (bout) {
        data_bout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
        VecCreateMPIWithArray(comm,1,fft->n,fft->N,(const PetscScalar*)data_bout,bout);
        PetscObjectCompose((PetscObject)*bout,"FFTmatrix",(PetscObject)A);
        (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
        (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
      }
#endif
      break;
    default:
#if !defined(PETSC_USE_COMPLEX)
      temp = (fftw->dim_fftw)[fftw->ndim_fftw-1];
      (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp/2 + 1;
      alloc_local = fftw_mpi_local_size_transposed(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
      N1          = 2*fft->N*(PetscInt)((fftw->dim_fftw)[fftw->ndim_fftw-1])/((PetscInt)temp);
      (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp;

      if (fin) {
        data_finr = (double*)fftw_malloc(sizeof(double)*alloc_local*2);
        VecCreateMPIWithArray(comm,1,fft->n,N1,(PetscScalar*)data_finr,fin);
        PetscObjectCompose((PetscObject)*fin,"FFTmatrix",(PetscObject)A);
        (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
        (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
      }
      if (fout) {
        data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
        VecCreateMPIWithArray(comm,1,fft->n,N1,(PetscScalar*)data_fout,fout);
        PetscObjectCompose((PetscObject)*fout,"FFTmatrix",(PetscObject)A);
        (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
        (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
      }
      if (bout) {
        data_boutr = (double*)fftw_malloc(sizeof(double)*alloc_local*2);
        VecCreateMPIWithArray(comm,1,fft->n,N1,(PetscScalar*)data_boutr,bout);
        PetscObjectCompose((PetscObject)*bout,"FFTmatrix",(PetscObject)A);
        (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
        (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
      }
#else
      alloc_local = fftw_mpi_local_size(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start);
      if (fin) {
        data_fin = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
        VecCreateMPIWithArray(comm,1,fft->n,fft->N,(const PetscScalar*)data_fin,fin);
        PetscObjectCompose((PetscObject)*fin,"FFTmatrix",(PetscObject)A);
        (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
        (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
      }
      if (fout) {
        data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
        VecCreateMPIWithArray(comm,1,fft->n,fft->N,(const PetscScalar*)data_fout,fout);
        PetscObjectCompose((PetscObject)*fout,"FFTmatrix",(PetscObject)A);
        (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
        (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
      }
      if (bout) {
        data_bout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
        VecCreateMPIWithArray(comm,1,fft->n,fft->N,(const PetscScalar*)data_bout,bout);
        PetscObjectCompose((PetscObject)*bout,"FFTmatrix",(PetscObject)A);
        (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
        (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
      }
#endif
      break;
    }
    /* FFTW vectors have their data array allocated by fftw_malloc, so that v->array is set but
       v->array_allocated is NULL. A regular replacearray call would not free the memory and would
       only cause memory leaks. We void these pointers here to avoid accidental use.
    */
    if (fin)  (*fin)->ops->replacearray  = NULL;
    if (fout) (*fout)->ops->replacearray = NULL;
    if (bout) (*bout)->ops->replacearray = NULL;
#endif
  }
  return 0;
}

/*@
   VecScatterPetscToFFTW - Copies a PETSc vector to the vector that goes into the FFTW block.

   Collective on Mat

   Input Parameters:
+  A - FFTW matrix
-  x - the PETSc vector

   Output Parameter:
.  y - the FFTW vector

  Options Database Keys:
. -mat_fftw_plannerflags - set FFTW planner flags

   Level: intermediate

   Note: For a parallel real FFT, FFTW requires insertion of extra space at the end of the last dimension.
         This is required even when one is not doing an in-place transform. The last dimension size must be
         changed to 2*(dim[last]/2+1) to accommodate these extra zeros. This routine does that job by a
         scattering operation.

.seealso: VecScatterFFTWToPetsc()
@*/
PetscErrorCode VecScatterPetscToFFTW(Mat A,Vec x,Vec y)
{
  PetscUseMethod(A,"VecScatterPetscToFFTW_C",(Mat,Vec,Vec),(A,x,y));
  return 0;
}
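
/*
   A minimal usage sketch of the scatter/multiply round trip (an illustration under
   assumptions, not taken from the PETSc tests: u and v are hypothetical regular PETSc
   vectors of compatible sizes, n0 and n1 are the global grid sizes, and error checking
   is omitted):

     Mat      A;
     Vec      x,y,z;
     PetscInt dim[2] = {n0,n1};

     MatCreateFFT(PETSC_COMM_WORLD,2,dim,MATFFTW,&A);
     MatCreateVecsFFTW(A,&x,&y,&z);   // vectors with the parallel layouts FFTW expects
     VecScatterPetscToFFTW(A,u,x);    // copy u into the (possibly padded) FFTW vector x
     MatMult(A,x,y);                  // forward DFT
     MatMultTranspose(A,y,z);         // backward DFT (unnormalized)
     VecScatterFFTWToPetsc(A,z,v);    // remove the padding while copying into v
     MatDestroy(&A);
*/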

PetscErrorCode VecScatterPetscToFFTW_FFTW(Mat A,Vec x,Vec y)
{
  MPI_Comm    comm;
  Mat_FFT     *fft = (Mat_FFT*)A->data;
  PetscInt    low;
  PetscMPIInt rank,size;
  PetscInt    vsize,vsize1;
  VecScatter  vecscat;
  IS          list1;

  PetscObjectGetComm((PetscObject)A,&comm);
  MPI_Comm_size(comm,&size);
  MPI_Comm_rank(comm,&rank);
  VecGetOwnershipRange(y,&low,NULL);

  if (size==1) {
    VecGetSize(x,&vsize);
    VecGetSize(y,&vsize1);
    ISCreateStride(PETSC_COMM_SELF,fft->N,0,1,&list1);
    VecScatterCreate(x,list1,y,list1,&vecscat);
    VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
    VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
    VecScatterDestroy(&vecscat);
    ISDestroy(&list1);
#if !PetscDefined(HAVE_MPIUNI)
  } else {
    Mat_FFTW  *fftw = (Mat_FFTW*)fft->data;
    PetscInt  ndim  = fft->ndim,*dim = fft->dim;
    ptrdiff_t local_n0,local_0_start;
    ptrdiff_t local_n1,local_1_start;
    IS        list2;
#if !defined(PETSC_USE_COMPLEX)
    PetscInt  i,j,k,partial_dim;
    PetscInt  *indx1,*indx2,tempindx,tempindx1;
    PetscInt  NM;
    ptrdiff_t temp;
#endif

    switch (ndim) {
    case 1:
#if defined(PETSC_USE_COMPLEX)
      fftw_mpi_local_size_1d(dim[0],comm,FFTW_FORWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);

      ISCreateStride(comm,local_n0,local_0_start,1,&list1);
      ISCreateStride(comm,local_n0,low,1,&list2);
      VecScatterCreate(x,list1,y,list2,&vecscat);
      VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
      VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
      VecScatterDestroy(&vecscat);
      ISDestroy(&list1);
      ISDestroy(&list2);
#else
      SETERRQ(comm,PETSC_ERR_SUP,"FFTW does not support parallel 1D real transform");
#endif
      break;
    case 2:
#if defined(PETSC_USE_COMPLEX)
      fftw_mpi_local_size_2d(dim[0],dim[1],comm,&local_n0,&local_0_start);

      ISCreateStride(comm,local_n0*dim[1],local_0_start*dim[1],1,&list1);
      ISCreateStride(comm,local_n0*dim[1],low,1,&list2);
      VecScatterCreate(x,list1,y,list2,&vecscat);
      VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
      VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
      VecScatterDestroy(&vecscat);
      ISDestroy(&list1);
      ISDestroy(&list2);
#else
      fftw_mpi_local_size_2d_transposed(dim[0],dim[1]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);

      PetscMalloc1(((PetscInt)local_n0)*dim[1],&indx1);
      PetscMalloc1(((PetscInt)local_n0)*dim[1],&indx2);

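      /* NM is the padded row length used by FFTW's r2c storage: each row of the
         last dimension occupies 2*(dim[1]/2+1) reals, i.e. dim[1]+2 entries when
         dim[1] is even and dim[1]+1 when it is odd */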
      if (dim[1]%2==0) {
        NM = dim[1]+2;
      } else {
        NM = dim[1]+1;
      }
      for (i=0; i<local_n0; i++) {
        for (j=0; j<dim[1]; j++) {
          tempindx  = i*dim[1] + j;
          tempindx1 = i*NM + j;

          indx1[tempindx] = local_0_start*dim[1]+tempindx;
          indx2[tempindx] = low+tempindx1;
        }
      }

      ISCreateGeneral(comm,local_n0*dim[1],indx1,PETSC_COPY_VALUES,&list1);
      ISCreateGeneral(comm,local_n0*dim[1],indx2,PETSC_COPY_VALUES,&list2);

      VecScatterCreate(x,list1,y,list2,&vecscat);
      VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
      VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
      VecScatterDestroy(&vecscat);
      ISDestroy(&list1);
      ISDestroy(&list2);
      PetscFree(indx1);
      PetscFree(indx2);
#endif
      break;

    case 3:
#if defined(PETSC_USE_COMPLEX)
      fftw_mpi_local_size_3d(dim[0],dim[1],dim[2],comm,&local_n0,&local_0_start);

      ISCreateStride(comm,local_n0*dim[1]*dim[2],local_0_start*dim[1]*dim[2],1,&list1);
      ISCreateStride(comm,local_n0*dim[1]*dim[2],low,1,&list2);
      VecScatterCreate(x,list1,y,list2,&vecscat);
      VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
      VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
      VecScatterDestroy(&vecscat);
      ISDestroy(&list1);
      ISDestroy(&list2);
#else
      /* buggy, needs to be fixed. See src/mat/tests/ex158.c */
      SETERRQ(comm,PETSC_ERR_SUP,"FFTW does not support parallel 3D real transform");
      fftw_mpi_local_size_3d_transposed(dim[0],dim[1],dim[2]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);

      PetscMalloc1(((PetscInt)local_n0)*dim[1]*dim[2],&indx1);
      PetscMalloc1(((PetscInt)local_n0)*dim[1]*dim[2],&indx2);

      if (dim[2]%2==0) NM = dim[2]+2;
      else             NM = dim[2]+1;

      for (i=0; i<local_n0; i++) {
        for (j=0; j<dim[1]; j++) {
          for (k=0; k<dim[2]; k++) {
            tempindx  = i*dim[1]*dim[2] + j*dim[2] + k;
            tempindx1 = i*dim[1]*NM + j*NM + k;

            indx1[tempindx] = local_0_start*dim[1]*dim[2]+tempindx;
            indx2[tempindx] = low+tempindx1;
          }
        }
      }

      ISCreateGeneral(comm,local_n0*dim[1]*dim[2],indx1,PETSC_COPY_VALUES,&list1);
      ISCreateGeneral(comm,local_n0*dim[1]*dim[2],indx2,PETSC_COPY_VALUES,&list2);
      VecScatterCreate(x,list1,y,list2,&vecscat);
      VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
      VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
      VecScatterDestroy(&vecscat);
      ISDestroy(&list1);
      ISDestroy(&list2);
      PetscFree(indx1);
      PetscFree(indx2);
#endif
      break;

    default:
#if defined(PETSC_USE_COMPLEX)
      fftw_mpi_local_size(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start);

      ISCreateStride(comm,local_n0*(fftw->partial_dim),local_0_start*(fftw->partial_dim),1,&list1);
      ISCreateStride(comm,local_n0*(fftw->partial_dim),low,1,&list2);
      VecScatterCreate(x,list1,y,list2,&vecscat);
      VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
      VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
      VecScatterDestroy(&vecscat);
      ISDestroy(&list1);
      ISDestroy(&list2);
#else
      /* buggy, needs to be fixed. See src/mat/tests/ex158.c */
      SETERRQ(comm,PETSC_ERR_SUP,"FFTW does not support parallel DIM>3 real transform");
      temp = (fftw->dim_fftw)[fftw->ndim_fftw-1];

      (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp/2 + 1;

      fftw_mpi_local_size_transposed(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);

      (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp;

      partial_dim = fftw->partial_dim;

      PetscMalloc1(((PetscInt)local_n0)*partial_dim,&indx1);
      PetscMalloc1(((PetscInt)local_n0)*partial_dim,&indx2);

      if (dim[ndim-1]%2==0) NM = 2;
      else                  NM = 1;

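      /* walk the source indices contiguously while j tracks the destination (FFTW)
         index, skipping the NM padding entries at the end of each last-dimension row */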
      j = low;
      for (i=0,k=1; i<((PetscInt)local_n0)*partial_dim; i++,k++) {
        indx1[i] = local_0_start*partial_dim + i;
        indx2[i] = j;
        if (k%dim[ndim-1]==0) j += NM;
        j++;
      }
      ISCreateGeneral(comm,local_n0*partial_dim,indx1,PETSC_COPY_VALUES,&list1);
      ISCreateGeneral(comm,local_n0*partial_dim,indx2,PETSC_COPY_VALUES,&list2);
      VecScatterCreate(x,list1,y,list2,&vecscat);
      VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
      VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
      VecScatterDestroy(&vecscat);
      ISDestroy(&list1);
      ISDestroy(&list2);
      PetscFree(indx1);
      PetscFree(indx2);
#endif
      break;
    }
#endif
  }
  return 0;
}

/*@
   VecScatterFFTWToPetsc - Converts the FFTW output to a PETSc vector.

   Collective on Mat

   Input Parameters:
+  A - FFTW matrix
-  x - FFTW vector

   Output Parameter:
.  y - PETSc vector

   Level: intermediate

   Note: When doing a real transform, the FFTW output of the backward DFT contains extra zeros at the end
         of the last dimension. VecScatterFFTWToPetsc removes those extra zeros.

.seealso: VecScatterPetscToFFTW()
@*/
PetscErrorCode VecScatterFFTWToPetsc(Mat A,Vec x,Vec y)
{
  PetscUseMethod(A,"VecScatterFFTWToPetsc_C",(Mat,Vec,Vec),(A,x,y));
  return 0;
}

PetscErrorCode VecScatterFFTWToPetsc_FFTW(Mat A,Vec x,Vec y)
{
  MPI_Comm    comm;
  Mat_FFT     *fft = (Mat_FFT*)A->data;
  PetscInt    low;
  PetscMPIInt rank,size;
  VecScatter  vecscat;
  IS          list1;

  PetscObjectGetComm((PetscObject)A,&comm);
  MPI_Comm_size(comm,&size);
  MPI_Comm_rank(comm,&rank);
  VecGetOwnershipRange(x,&low,NULL);

  if (size==1) {
    ISCreateStride(comm,fft->N,0,1,&list1);
    VecScatterCreate(x,list1,y,list1,&vecscat);
    VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
    VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
    VecScatterDestroy(&vecscat);
    ISDestroy(&list1);

#if !PetscDefined(HAVE_MPIUNI)
  } else {
    Mat_FFTW  *fftw = (Mat_FFTW*)fft->data;
    PetscInt  ndim  = fft->ndim,*dim = fft->dim;
    ptrdiff_t local_n0,local_0_start;
    ptrdiff_t local_n1,local_1_start;
    IS        list2;
#if !defined(PETSC_USE_COMPLEX)
    PetscInt  i,j,k,partial_dim;
    PetscInt  *indx1,*indx2,tempindx,tempindx1;
    PetscInt  NM;
    ptrdiff_t temp;
#endif
    switch (ndim) {
    case 1:
#if defined(PETSC_USE_COMPLEX)
      fftw_mpi_local_size_1d(dim[0],comm,FFTW_BACKWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);

      ISCreateStride(comm,local_n1,local_1_start,1,&list1);
      ISCreateStride(comm,local_n1,low,1,&list2);
      VecScatterCreate(x,list1,y,list2,&vecscat);
      VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
      VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
      VecScatterDestroy(&vecscat);
      ISDestroy(&list1);
      ISDestroy(&list2);
#else
      SETERRQ(comm,PETSC_ERR_SUP,"No support for real parallel 1D FFT");
#endif
      break;
    case 2:
#if defined(PETSC_USE_COMPLEX)
      fftw_mpi_local_size_2d(dim[0],dim[1],comm,&local_n0,&local_0_start);

      ISCreateStride(comm,local_n0*dim[1],local_0_start*dim[1],1,&list1);
      ISCreateStride(comm,local_n0*dim[1],low,1,&list2);
      VecScatterCreate(x,list2,y,list1,&vecscat);
      VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
      VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
      VecScatterDestroy(&vecscat);
      ISDestroy(&list1);
      ISDestroy(&list2);
#else
      fftw_mpi_local_size_2d_transposed(dim[0],dim[1]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);

      PetscMalloc1(((PetscInt)local_n0)*dim[1],&indx1);
      PetscMalloc1(((PetscInt)local_n0)*dim[1],&indx2);

      if (dim[1]%2==0) NM = dim[1]+2;
      else             NM = dim[1]+1;

      for (i=0; i<local_n0; i++) {
        for (j=0; j<dim[1]; j++) {
          tempindx  = i*dim[1] + j;
          tempindx1 = i*NM + j;

          indx1[tempindx] = local_0_start*dim[1]+tempindx;
          indx2[tempindx] = low+tempindx1;
        }
      }

      ISCreateGeneral(comm,local_n0*dim[1],indx1,PETSC_COPY_VALUES,&list1);
      ISCreateGeneral(comm,local_n0*dim[1],indx2,PETSC_COPY_VALUES,&list2);

      VecScatterCreate(x,list2,y,list1,&vecscat);
      VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
      VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
      VecScatterDestroy(&vecscat);
      ISDestroy(&list1);
      ISDestroy(&list2);
      PetscFree(indx1);
      PetscFree(indx2);
#endif
      break;
    case 3:
#if defined(PETSC_USE_COMPLEX)
      fftw_mpi_local_size_3d(dim[0],dim[1],dim[2],comm,&local_n0,&local_0_start);

      ISCreateStride(comm,local_n0*dim[1]*dim[2],local_0_start*dim[1]*dim[2],1,&list1);
      ISCreateStride(comm,local_n0*dim[1]*dim[2],low,1,&list2);
      VecScatterCreate(x,list1,y,list2,&vecscat);
      VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
      VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
      VecScatterDestroy(&vecscat);
      ISDestroy(&list1);
      ISDestroy(&list2);
#else
      fftw_mpi_local_size_3d_transposed(dim[0],dim[1],dim[2]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);

      PetscMalloc1(((PetscInt)local_n0)*dim[1]*dim[2],&indx1);
      PetscMalloc1(((PetscInt)local_n0)*dim[1]*dim[2],&indx2);

      if (dim[2]%2==0) NM = dim[2]+2;
      else             NM = dim[2]+1;

      for (i=0; i<local_n0; i++) {
        for (j=0; j<dim[1]; j++) {
          for (k=0; k<dim[2]; k++) {
            tempindx  = i*dim[1]*dim[2] + j*dim[2] + k;
            tempindx1 = i*dim[1]*NM + j*NM + k;

            indx1[tempindx] = local_0_start*dim[1]*dim[2]+tempindx;
            indx2[tempindx] = low+tempindx1;
          }
        }
      }

      ISCreateGeneral(comm,local_n0*dim[1]*dim[2],indx1,PETSC_COPY_VALUES,&list1);
      ISCreateGeneral(comm,local_n0*dim[1]*dim[2],indx2,PETSC_COPY_VALUES,&list2);

      VecScatterCreate(x,list2,y,list1,&vecscat);
      VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
      VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
      VecScatterDestroy(&vecscat);
      ISDestroy(&list1);
      ISDestroy(&list2);
      PetscFree(indx1);
      PetscFree(indx2);
#endif
      break;
    default:
#if defined(PETSC_USE_COMPLEX)
      fftw_mpi_local_size(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start);

      ISCreateStride(comm,local_n0*(fftw->partial_dim),local_0_start*(fftw->partial_dim),1,&list1);
      ISCreateStride(comm,local_n0*(fftw->partial_dim),low,1,&list2);
      VecScatterCreate(x,list1,y,list2,&vecscat);
      VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
      VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
      VecScatterDestroy(&vecscat);
      ISDestroy(&list1);
      ISDestroy(&list2);
#else
      temp = (fftw->dim_fftw)[fftw->ndim_fftw-1];

      (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp/2 + 1;

      fftw_mpi_local_size_transposed(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);

      (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp;

      partial_dim = fftw->partial_dim;

      PetscMalloc1(((PetscInt)local_n0)*partial_dim,&indx1);
      PetscMalloc1(((PetscInt)local_n0)*partial_dim,&indx2);

      if (dim[ndim-1]%2==0) NM = 2;
      else                  NM = 1;

      j = low;
      for (i=0,k=1; i<((PetscInt)local_n0)*partial_dim; i++,k++) {
        indx1[i] = local_0_start*partial_dim + i;
        indx2[i] = j;
        if (k%dim[ndim-1]==0) j += NM;
        j++;
      }
      ISCreateGeneral(comm,local_n0*partial_dim,indx1,PETSC_COPY_VALUES,&list1);
      ISCreateGeneral(comm,local_n0*partial_dim,indx2,PETSC_COPY_VALUES,&list2);

      VecScatterCreate(x,list2,y,list1,&vecscat);
      VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
      VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
      VecScatterDestroy(&vecscat);
      ISDestroy(&list1);
      ISDestroy(&list2);
      PetscFree(indx1);
      PetscFree(indx2);
#endif
      break;
    }
#endif
  }
  return 0;
}

/*
    MatCreate_FFTW - Creates a matrix object that provides FFTs via the external package FFTW

  Options Database Keys:
. -mat_fftw_plannerflags - set the FFTW planner flags

   Level: intermediate
*/
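/* For example, an application that creates a MATFFTW matrix can select a more
   thorough planner at runtime (myapp is a hypothetical executable name):

     ./myapp -mat_fftw_plannerflags FFTW_MEASURE
*/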
PETSC_EXTERN PetscErrorCode MatCreate_FFTW(Mat A)
{
  MPI_Comm    comm;
  Mat_FFT     *fft = (Mat_FFT*)A->data;
  Mat_FFTW    *fftw;
  PetscInt    ndim = fft->ndim,*dim = fft->dim;
  const char  *plans[] = {"FFTW_ESTIMATE","FFTW_MEASURE","FFTW_PATIENT","FFTW_EXHAUSTIVE"};
  unsigned    iplans[] = {FFTW_ESTIMATE,FFTW_MEASURE,FFTW_PATIENT,FFTW_EXHAUSTIVE};
  PetscBool   flg;
  PetscInt    p_flag,partial_dim=1,ctr;
  PetscMPIInt size,rank;
  ptrdiff_t   *pdim;
#if !defined(PETSC_USE_COMPLEX)
  PetscInt    tot_dim;
#endif

  PetscObjectGetComm((PetscObject)A,&comm);
  MPI_Comm_size(comm,&size);
  MPI_Comm_rank(comm,&rank);

#if !PetscDefined(HAVE_MPIUNI)
  fftw_mpi_init();
#endif
  pdim    = (ptrdiff_t*)calloc(ndim,sizeof(ptrdiff_t));
  pdim[0] = dim[0];
#if !defined(PETSC_USE_COMPLEX)
  tot_dim = 2*dim[0];
#endif
  for (ctr=1; ctr<ndim; ctr++) {
    partial_dim *= dim[ctr];
    pdim[ctr]    = dim[ctr];
#if !defined(PETSC_USE_COMPLEX)
    if (ctr==ndim-1) tot_dim *= (dim[ctr]/2+1);
    else             tot_dim *= dim[ctr];
#endif
  }

  if (size == 1) {
#if defined(PETSC_USE_COMPLEX)
    MatSetSizes(A,fft->N,fft->N,fft->N,fft->N);
    fft->n = fft->N;
#else
    MatSetSizes(A,tot_dim,tot_dim,tot_dim,tot_dim);
    fft->n = tot_dim;
#endif
#if !PetscDefined(HAVE_MPIUNI)
  } else {
    ptrdiff_t local_n0,local_0_start,local_n1,local_1_start;
#if !defined(PETSC_USE_COMPLEX)
    ptrdiff_t temp;
    PetscInt  N1;
#endif

    switch (ndim) {
    case 1:
#if !defined(PETSC_USE_COMPLEX)
      SETERRQ(comm,PETSC_ERR_SUP,"FFTW does not support parallel 1D real transform");
#else
      fftw_mpi_local_size_1d(dim[0],comm,FFTW_FORWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
      fft->n = (PetscInt)local_n0;
      MatSetSizes(A,local_n1,fft->n,fft->N,fft->N);
#endif
      break;
    case 2:
#if defined(PETSC_USE_COMPLEX)
      fftw_mpi_local_size_2d(dim[0],dim[1],comm,&local_n0,&local_0_start);
      fft->n = (PetscInt)local_n0*dim[1];
      MatSetSizes(A,fft->n,fft->n,fft->N,fft->N);
#else
      fftw_mpi_local_size_2d_transposed(dim[0],dim[1]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);

      fft->n = 2*(PetscInt)local_n0*(dim[1]/2+1);
      MatSetSizes(A,fft->n,fft->n,2*dim[0]*(dim[1]/2+1),2*dim[0]*(dim[1]/2+1));
#endif
      break;
    case 3:
#if defined(PETSC_USE_COMPLEX)
      fftw_mpi_local_size_3d(dim[0],dim[1],dim[2],comm,&local_n0,&local_0_start);

      fft->n = (PetscInt)local_n0*dim[1]*dim[2];
      MatSetSizes(A,fft->n,fft->n,fft->N,fft->N);
#else
      fftw_mpi_local_size_3d_transposed(dim[0],dim[1],dim[2]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);

      fft->n = 2*(PetscInt)local_n0*dim[1]*(dim[2]/2+1);
      MatSetSizes(A,fft->n,fft->n,2*dim[0]*dim[1]*(dim[2]/2+1),2*dim[0]*dim[1]*(dim[2]/2+1));
#endif
      break;
    default:
#if defined(PETSC_USE_COMPLEX)
      fftw_mpi_local_size(ndim,pdim,comm,&local_n0,&local_0_start);

      fft->n = (PetscInt)local_n0*partial_dim;
      MatSetSizes(A,fft->n,fft->n,fft->N,fft->N);
#else
      temp = pdim[ndim-1];

      pdim[ndim-1] = temp/2 + 1;

      fftw_mpi_local_size_transposed(ndim,pdim,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);

      fft->n = 2*(PetscInt)local_n0*partial_dim*pdim[ndim-1]/temp;
      N1     = 2*fft->N*(PetscInt)pdim[ndim-1]/((PetscInt)temp);

      pdim[ndim-1] = temp;

      MatSetSizes(A,fft->n,fft->n,N1,N1);
#endif
      break;
    }
#endif
  }
  free(pdim);
  PetscObjectChangeTypeName((PetscObject)A,MATFFTW);
  PetscNewLog(A,&fftw);
  fft->data = (void*)fftw;

  fftw->ndim_fftw   = (ptrdiff_t)ndim; /* the dimension of the FFT */
  fftw->partial_dim = partial_dim;

  PetscMalloc1(ndim,&(fftw->dim_fftw));
  if (size == 1) {
#if defined(PETSC_USE_64BIT_INDICES)
    fftw->iodims = (fftw_iodim64*)malloc(sizeof(fftw_iodim64)*ndim);
#else
    fftw->iodims = (fftw_iodim*)malloc(sizeof(fftw_iodim)*ndim);
#endif
  }

  for (ctr=0; ctr<ndim; ctr++) (fftw->dim_fftw)[ctr] = dim[ctr];

  fftw->p_forward  = NULL;
  fftw->p_backward = NULL;
  fftw->p_flag     = FFTW_ESTIMATE;

  if (size == 1) {
    A->ops->mult          = MatMult_SeqFFTW;
    A->ops->multtranspose = MatMultTranspose_SeqFFTW;
#if !PetscDefined(HAVE_MPIUNI)
  } else {
    A->ops->mult          = MatMult_MPIFFTW;
    A->ops->multtranspose = MatMultTranspose_MPIFFTW;
#endif
  }
  fft->matdestroy = MatDestroy_FFTW;
  A->assembled    = PETSC_TRUE;
  A->preallocated = PETSC_TRUE;

  PetscObjectComposeFunction((PetscObject)A,"MatCreateVecsFFTW_C",MatCreateVecsFFTW_FFTW);
  PetscObjectComposeFunction((PetscObject)A,"VecScatterPetscToFFTW_C",VecScatterPetscToFFTW_FFTW);
  PetscObjectComposeFunction((PetscObject)A,"VecScatterFFTWToPetsc_C",VecScatterFFTWToPetsc_FFTW);

  /* get runtime options */
  PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"FFTW Options","Mat");
  PetscOptionsEList("-mat_fftw_plannerflags","Planner Flags","None",plans,4,plans[0],&p_flag,&flg);
  if (flg) fftw->p_flag = iplans[p_flag];
  PetscOptionsEnd();
  return 0;
}