Actual source code: fftw.c

petsc-3.9.4 2018-09-11
Report Typos and Errors

  2: /*
  3:     Provides an interface to the FFTW package.
  4:     Testing examples can be found in ~src/mat/examples/tests
  5: */

  7:  #include <../src/mat/impls/fft/fft.h>
  8: EXTERN_C_BEGIN
  9: #include <fftw3-mpi.h>
 10: EXTERN_C_END

 12: typedef struct {
 13:   ptrdiff_t    ndim_fftw,*dim_fftw;
 14: #if defined(PETSC_USE_64BIT_INDICES)
 15:   fftw_iodim64 *iodims;
 16: #else
 17:   fftw_iodim   *iodims;
 18: #endif
 19:   PetscInt     partial_dim;
 20:   fftw_plan    p_forward,p_backward;
 21:   unsigned     p_flag; /* planner flags, FFTW_ESTIMATE,FFTW_MEASURE, FFTW_PATIENT, FFTW_EXHAUSTIVE */
 22:   PetscScalar  *finarray,*foutarray,*binarray,*boutarray; /* keep track of arrays becaue fftw plan should be
 23:                                                             executed for the arrays with which the plan was created */
 24: } Mat_FFTW;

 26: extern PetscErrorCode MatMult_SeqFFTW(Mat,Vec,Vec);
 27: extern PetscErrorCode MatMultTranspose_SeqFFTW(Mat,Vec,Vec);
 28: extern PetscErrorCode MatMult_MPIFFTW(Mat,Vec,Vec);
 29: extern PetscErrorCode MatMultTranspose_MPIFFTW(Mat,Vec,Vec);
 30: extern PetscErrorCode MatDestroy_FFTW(Mat);
 31: extern PetscErrorCode VecDestroy_MPIFFTW(Vec);

 33: /* MatMult_SeqFFTW performs forward DFT in parallel
 34:    Input parameter:
 35:      A - the matrix
 36:      x - the vector on which FDFT will be performed

 38:    Output parameter:
 39:      y - vector that stores result of FDFT
 40: */
 41: PetscErrorCode MatMult_SeqFFTW(Mat A,Vec x,Vec y)
 42: {
 44:   Mat_FFT        *fft  = (Mat_FFT*)A->data;
 45:   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
 46:   const PetscScalar *x_array;
 47:   PetscScalar    *y_array;
 48: #if defined(PETSC_USE_COMPLEX)
 49: #if defined(PETSC_USE_64BIT_INDICES) 
 50:   fftw_iodim64   *iodims;
 51: #else
 52:   fftw_iodim     *iodims;
 53: #endif
 54:   PetscInt       i;
 55: #endif
 56:   PetscInt       ndim = fft->ndim,*dim = fft->dim;

 59:   VecGetArrayRead(x,&x_array);
 60:   VecGetArray(y,&y_array);
 61:   if (!fftw->p_forward) { /* create a plan, then excute it */
 62:     switch (ndim) {
 63:     case 1:
 64: #if defined(PETSC_USE_COMPLEX)
 65:       fftw->p_forward = fftw_plan_dft_1d(dim[0],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
 66: #else
 67:       fftw->p_forward = fftw_plan_dft_r2c_1d(dim[0],(double*)x_array,(fftw_complex*)y_array,fftw->p_flag);
 68: #endif
 69:       break;
 70:     case 2:
 71: #if defined(PETSC_USE_COMPLEX)
 72:       fftw->p_forward = fftw_plan_dft_2d(dim[0],dim[1],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
 73: #else
 74:       fftw->p_forward = fftw_plan_dft_r2c_2d(dim[0],dim[1],(double*)x_array,(fftw_complex*)y_array,fftw->p_flag);
 75: #endif
 76:       break;
 77:     case 3:
 78: #if defined(PETSC_USE_COMPLEX)
 79:       fftw->p_forward = fftw_plan_dft_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
 80: #else
 81:       fftw->p_forward = fftw_plan_dft_r2c_3d(dim[0],dim[1],dim[2],(double*)x_array,(fftw_complex*)y_array,fftw->p_flag);
 82: #endif
 83:       break;
 84:     default:
 85: #if defined(PETSC_USE_COMPLEX)
 86:       iodims = fftw->iodims;
 87: #if defined(PETSC_USE_64BIT_INDICES)
 88:       if (ndim) {
 89:         iodims[ndim-1].n = (ptrdiff_t)dim[ndim-1];
 90:         iodims[ndim-1].is = iodims[ndim-1].os = 1;
 91:         for (i=ndim-2; i>=0; --i) {
 92:           iodims[i].n = (ptrdiff_t)dim[i];
 93:           iodims[i].is = iodims[i].os = iodims[i+1].is * iodims[i+1].n;
 94:         }
 95:       }
 96:       fftw->p_forward = fftw_plan_guru64_dft((int)ndim,(fftw_iodim64*)iodims,0,NULL,(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
 97: #else
 98:       if (ndim) {
 99:         iodims[ndim-1].n = (int)dim[ndim-1];
100:         iodims[ndim-1].is = iodims[ndim-1].os = 1;
101:         for (i=ndim-2; i>=0; --i) {
102:           iodims[i].n = (int)dim[i];
103:           iodims[i].is = iodims[i].os = iodims[i+1].is * iodims[i+1].n;
104:         }
105:       }
106:       fftw->p_forward = fftw_plan_guru_dft((int)ndim,(fftw_iodim*)iodims,0,NULL,(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
107: #endif

109: #else
110:       fftw->p_forward = fftw_plan_dft_r2c(ndim,(int*)dim,(double*)x_array,(fftw_complex*)y_array,fftw->p_flag);
111: #endif
112:       break;
113:     }
114:     fftw->finarray  = (PetscScalar *) x_array;
115:     fftw->foutarray = y_array;
116:     /* Warning: if (fftw->p_flag!==FFTW_ESTIMATE) The data in the in/out arrays is overwritten!
117:                 planning should be done before x is initialized! See FFTW manual sec2.1 or sec4 */
118:     fftw_execute(fftw->p_forward);
119:   } else { /* use existing plan */
120:     if (fftw->finarray != x_array || fftw->foutarray != y_array) { /* use existing plan on new arrays */
121: #if defined(PETSC_USE_COMPLEX)
122:       fftw_execute_dft(fftw->p_forward,(fftw_complex*)x_array,(fftw_complex*)y_array);
123: #else
124:       fftw_execute_dft_r2c(fftw->p_forward,(double*)x_array,(fftw_complex*)y_array);
125: #endif
126:     } else {
127:       fftw_execute(fftw->p_forward);
128:     }
129:   }
130:   VecRestoreArray(y,&y_array);
131:   VecRestoreArrayRead(x,&x_array);
132:   return(0);
133: }

135: /* MatMultTranspose_SeqFFTW performs serial backward DFT
136:    Input parameter:
137:      A - the matrix
138:      x - the vector on which BDFT will be performed

140:    Output parameter:
141:      y - vector that stores result of BDFT
142: */

144: PetscErrorCode MatMultTranspose_SeqFFTW(Mat A,Vec x,Vec y)
145: {
147:   Mat_FFT        *fft  = (Mat_FFT*)A->data;
148:   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
149:   const PetscScalar *x_array;
150:   PetscScalar    *y_array;
151:   PetscInt       ndim=fft->ndim,*dim=fft->dim;
152: #if defined(PETSC_USE_COMPLEX)
153: #if defined(PETSC_USE_64BIT_INDICES)
154:   fftw_iodim64   *iodims=fftw->iodims;
155: #else
156:   fftw_iodim     *iodims=fftw->iodims;
157: #endif
158: #endif
159: 
161:   VecGetArrayRead(x,&x_array);
162:   VecGetArray(y,&y_array);
163:   if (!fftw->p_backward) { /* create a plan, then excute it */
164:     switch (ndim) {
165:     case 1:
166: #if defined(PETSC_USE_COMPLEX)
167:       fftw->p_backward = fftw_plan_dft_1d(dim[0],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
168: #else
169:       fftw->p_backward= fftw_plan_dft_c2r_1d(dim[0],(fftw_complex*)x_array,(double*)y_array,fftw->p_flag);
170: #endif
171:       break;
172:     case 2:
173: #if defined(PETSC_USE_COMPLEX)
174:       fftw->p_backward = fftw_plan_dft_2d(dim[0],dim[1],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
175: #else
176:       fftw->p_backward= fftw_plan_dft_c2r_2d(dim[0],dim[1],(fftw_complex*)x_array,(double*)y_array,fftw->p_flag);
177: #endif
178:       break;
179:     case 3:
180: #if defined(PETSC_USE_COMPLEX)
181:       fftw->p_backward = fftw_plan_dft_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
182: #else
183:       fftw->p_backward= fftw_plan_dft_c2r_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(double*)y_array,fftw->p_flag);
184: #endif
185:       break;
186:     default:
187: #if defined(PETSC_USE_COMPLEX)
188: #if defined(PETSC_USE_64BIT_INDICES)
189:       fftw->p_backward = fftw_plan_guru64_dft((int)ndim,(fftw_iodim64*)iodims,0,NULL,(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
190: #else
191:       fftw->p_backward = fftw_plan_guru_dft((int)ndim,iodims,0,NULL,(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
192: #endif
193: #else
194:       fftw->p_backward= fftw_plan_dft_c2r((int)ndim,(int*)dim,(fftw_complex*)x_array,(double*)y_array,fftw->p_flag);
195: #endif
196:       break;
197:     }
198:     fftw->binarray  = (PetscScalar *) x_array;
199:     fftw->boutarray = y_array;
200:     fftw_execute(fftw->p_backward);
201:   } else { /* use existing plan */
202:     if (fftw->binarray != x_array || fftw->boutarray != y_array) { /* use existing plan on new arrays */
203: #if defined(PETSC_USE_COMPLEX)
204:       fftw_execute_dft(fftw->p_backward,(fftw_complex*)x_array,(fftw_complex*)y_array);
205: #else
206:       fftw_execute_dft_c2r(fftw->p_backward,(fftw_complex*)x_array,(double*)y_array);
207: #endif
208:     } else {
209:       fftw_execute(fftw->p_backward);
210:     }
211:   }
212:   VecRestoreArray(y,&y_array);
213:   VecRestoreArrayRead(x,&x_array);
214:   return(0);
215: }

217: /* MatMult_MPIFFTW performs forward DFT in parallel
218:    Input parameter:
219:      A - the matrix
220:      x - the vector on which FDFT will be performed

222:    Output parameter:
223:    y   - vector that stores result of FDFT
224: */
225: PetscErrorCode MatMult_MPIFFTW(Mat A,Vec x,Vec y)
226: {
228:   Mat_FFT        *fft  = (Mat_FFT*)A->data;
229:   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
230:   const PetscScalar *x_array;
231:   PetscScalar    *y_array;
232:   PetscInt       ndim=fft->ndim,*dim=fft->dim;
233:   MPI_Comm       comm;

236:   PetscObjectGetComm((PetscObject)A,&comm);
237:   VecGetArrayRead(x,&x_array);
238:   VecGetArray(y,&y_array);
239:   if (!fftw->p_forward) { /* create a plan, then excute it */
240:     switch (ndim) {
241:     case 1:
242: #if defined(PETSC_USE_COMPLEX)
243:       fftw->p_forward = fftw_mpi_plan_dft_1d(dim[0],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_FORWARD,fftw->p_flag);
244: #else
245:       SETERRQ(comm,PETSC_ERR_SUP,"not support for real numbers yet");
246: #endif
247:       break;
248:     case 2:
249: #if defined(PETSC_USE_COMPLEX) /* For complex transforms call fftw_mpi_plan_dft, for real transforms call fftw_mpi_plan_dft_r2c */
250:       fftw->p_forward = fftw_mpi_plan_dft_2d(dim[0],dim[1],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_FORWARD,fftw->p_flag);
251: #else
252:       fftw->p_forward = fftw_mpi_plan_dft_r2c_2d(dim[0],dim[1],(double*)x_array,(fftw_complex*)y_array,comm,FFTW_ESTIMATE);
253: #endif
254:       break;
255:     case 3:
256: #if defined(PETSC_USE_COMPLEX)
257:       fftw->p_forward = fftw_mpi_plan_dft_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_FORWARD,fftw->p_flag);
258: #else
259:       fftw->p_forward = fftw_mpi_plan_dft_r2c_3d(dim[0],dim[1],dim[2],(double*)x_array,(fftw_complex*)y_array,comm,FFTW_ESTIMATE);
260: #endif
261:       break;
262:     default:
263: #if defined(PETSC_USE_COMPLEX)
264:       fftw->p_forward = fftw_mpi_plan_dft(fftw->ndim_fftw,fftw->dim_fftw,(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_FORWARD,fftw->p_flag);
265: #else
266:       fftw->p_forward = fftw_mpi_plan_dft_r2c(fftw->ndim_fftw,fftw->dim_fftw,(double*)x_array,(fftw_complex*)y_array,comm,FFTW_ESTIMATE);
267: #endif
268:       break;
269:     }
270:     fftw->finarray  = (PetscScalar *) x_array;
271:     fftw->foutarray = y_array;
272:     /* Warning: if (fftw->p_flag!==FFTW_ESTIMATE) The data in the in/out arrays is overwritten!
273:                 planning should be done before x is initialized! See FFTW manual sec2.1 or sec4 */
274:     fftw_execute(fftw->p_forward);
275:   } else { /* use existing plan */
276:     if (fftw->finarray != x_array || fftw->foutarray != y_array) { /* use existing plan on new arrays */
277:       fftw_execute_dft(fftw->p_forward,(fftw_complex*)x_array,(fftw_complex*)y_array);
278:     } else {
279:       fftw_execute(fftw->p_forward);
280:     }
281:   }
282:   VecRestoreArray(y,&y_array);
283:   VecRestoreArrayRead(x,&x_array);
284:   return(0);
285: }

287: /* MatMultTranspose_MPIFFTW performs parallel backward DFT
288:    Input parameter:
289:      A - the matrix
290:      x - the vector on which BDFT will be performed

292:    Output parameter:
293:      y - vector that stores result of BDFT
294: */
295: PetscErrorCode MatMultTranspose_MPIFFTW(Mat A,Vec x,Vec y)
296: {
298:   Mat_FFT        *fft  = (Mat_FFT*)A->data;
299:   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
300:   const PetscScalar *x_array;
301:   PetscScalar    *y_array;
302:   PetscInt       ndim=fft->ndim,*dim=fft->dim;
303:   MPI_Comm       comm;

306:   PetscObjectGetComm((PetscObject)A,&comm);
307:   VecGetArrayRead(x,&x_array);
308:   VecGetArray(y,&y_array);
309:   if (!fftw->p_backward) { /* create a plan, then excute it */
310:     switch (ndim) {
311:     case 1:
312: #if defined(PETSC_USE_COMPLEX)
313:       fftw->p_backward = fftw_mpi_plan_dft_1d(dim[0],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_BACKWARD,fftw->p_flag);
314: #else
315:       SETERRQ(comm,PETSC_ERR_SUP,"not support for real numbers yet");
316: #endif
317:       break;
318:     case 2:
319: #if defined(PETSC_USE_COMPLEX) /* For complex transforms call fftw_mpi_plan_dft with flag FFTW_BACKWARD, for real transforms call fftw_mpi_plan_dft_c2r */
320:       fftw->p_backward = fftw_mpi_plan_dft_2d(dim[0],dim[1],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_BACKWARD,fftw->p_flag);
321: #else
322:       fftw->p_backward = fftw_mpi_plan_dft_c2r_2d(dim[0],dim[1],(fftw_complex*)x_array,(double*)y_array,comm,FFTW_ESTIMATE);
323: #endif
324:       break;
325:     case 3:
326: #if defined(PETSC_USE_COMPLEX)
327:       fftw->p_backward = fftw_mpi_plan_dft_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_BACKWARD,fftw->p_flag);
328: #else
329:       fftw->p_backward = fftw_mpi_plan_dft_c2r_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(double*)y_array,comm,FFTW_ESTIMATE);
330: #endif
331:       break;
332:     default:
333: #if defined(PETSC_USE_COMPLEX)
334:       fftw->p_backward = fftw_mpi_plan_dft(fftw->ndim_fftw,fftw->dim_fftw,(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_BACKWARD,fftw->p_flag);
335: #else
336:       fftw->p_backward = fftw_mpi_plan_dft_c2r(fftw->ndim_fftw,fftw->dim_fftw,(fftw_complex*)x_array,(double*)y_array,comm,FFTW_ESTIMATE);
337: #endif
338:       break;
339:     }
340:     fftw->binarray  = (PetscScalar *) x_array;
341:     fftw->boutarray = y_array;
342:     fftw_execute(fftw->p_backward);
343:   } else { /* use existing plan */
344:     if (fftw->binarray != x_array || fftw->boutarray != y_array) { /* use existing plan on new arrays */
345:       fftw_execute_dft(fftw->p_backward,(fftw_complex*)x_array,(fftw_complex*)y_array);
346:     } else {
347:       fftw_execute(fftw->p_backward);
348:     }
349:   }
350:   VecRestoreArray(y,&y_array);
351:   VecRestoreArrayRead(x,&x_array);
352:   return(0);
353: }

355: PetscErrorCode MatDestroy_FFTW(Mat A)
356: {
357:   Mat_FFT        *fft  = (Mat_FFT*)A->data;
358:   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;

362:   fftw_destroy_plan(fftw->p_forward);
363:   fftw_destroy_plan(fftw->p_backward);
364:   if (fftw->iodims) {
365:     free(fftw->iodims);
366:   }
367:   PetscFree(fftw->dim_fftw);
368:   PetscFree(fft->data);
369:   fftw_mpi_cleanup();
370:   return(0);
371: }

373:  #include <../src/vec/vec/impls/mpi/pvecimpl.h>
374: PetscErrorCode VecDestroy_MPIFFTW(Vec v)
375: {
377:   PetscScalar    *array;

380:   VecGetArray(v,&array);
381:   fftw_free((fftw_complex*)array);
382:   VecRestoreArray(v,&array);
383:   VecDestroy_MPI(v);
384:   return(0);
385: }

387: /*@
388:    MatCreateVecsFFTW - Get vector(s) compatible with the matrix, i.e. with the
389:      parallel layout determined by FFTW

391:    Collective on Mat

393:    Input Parameter:
394: .   A - the matrix

396:    Output Parameter:
397: +   x - (optional) input vector of forward FFTW
398: -   y - (optional) output vector of forward FFTW
399: -   z - (optional) output vector of backward FFTW

401:   Level: advanced

403:   Note: The parallel layout of output of forward FFTW is always same as the input
404:         of the backward FFTW. But parallel layout of the input vector of forward
405:         FFTW might not be same as the output of backward FFTW.
406:         Also note that we need to provide enough space while doing parallel real transform.
407:         We need to pad extra zeros at the end of last dimension. For this reason the one needs to
408:         invoke the routine fftw_mpi_local_size_transposed routines. Remember one has to change the
409:         last dimension from n to n/2+1 while invoking this routine. The number of zeros to be padded
410:         depends on if the last dimension is even or odd. If the last dimension is even need to pad two
411:         zeros if it is odd only one zero is needed.
412:         Lastly one needs some scratch space at the end of data set in each process. alloc_local
413:         figures out how much space is needed, i.e. it figures out the data+scratch space for
414:         each processor and returns that.

416: .seealso: MatCreateFFTW()
417: @*/
418: PetscErrorCode MatCreateVecsFFTW(Mat A,Vec *x,Vec *y,Vec *z)
419: {

423:   PetscUseMethod(A,"MatCreateVecsFFTW_C",(Mat,Vec*,Vec*,Vec*),(A,x,y,z));
424:   return(0);
425: }

427: PetscErrorCode  MatCreateVecsFFTW_FFTW(Mat A,Vec *fin,Vec *fout,Vec *bout)
428: {
430:   PetscMPIInt    size,rank;
431:   MPI_Comm       comm;
432:   Mat_FFT        *fft  = (Mat_FFT*)A->data;
433:   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
434:   PetscInt       N     = fft->N;
435:   PetscInt       ndim  = fft->ndim,*dim=fft->dim,n=fft->n;

440:   PetscObjectGetComm((PetscObject)A,&comm);

442:   MPI_Comm_size(comm, &size);
443:   MPI_Comm_rank(comm, &rank);
444:   if (size == 1) { /* sequential case */
445: #if defined(PETSC_USE_COMPLEX)
446:     if (fin) {VecCreateSeq(PETSC_COMM_SELF,N,fin);}
447:     if (fout) {VecCreateSeq(PETSC_COMM_SELF,N,fout);}
448:     if (bout) {VecCreateSeq(PETSC_COMM_SELF,N,bout);}
449: #else
450:     if (fin) {VecCreateSeq(PETSC_COMM_SELF,n,fin);}
451:     if (fout) {VecCreateSeq(PETSC_COMM_SELF,n,fout);}
452:     if (bout) {VecCreateSeq(PETSC_COMM_SELF,n,bout);}
453: #endif
454:   } else { /* parallel cases */
455:     ptrdiff_t    alloc_local,local_n0,local_0_start;
456:     ptrdiff_t    local_n1;
457:     fftw_complex *data_fout;
458:     ptrdiff_t    local_1_start;
459: #if defined(PETSC_USE_COMPLEX)
460:     fftw_complex *data_fin,*data_bout;
461: #else
462:     double    *data_finr,*data_boutr;
463:     PetscInt  n1,N1;
464:     ptrdiff_t temp;
465: #endif

467:     switch (ndim) {
468:     case 1:
469: #if !defined(PETSC_USE_COMPLEX)
470:       SETERRQ(comm,PETSC_ERR_SUP,"FFTW does not allow parallel real 1D transform");
471: #else
472:       alloc_local = fftw_mpi_local_size_1d(dim[0],comm,FFTW_FORWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
473:       if (fin) {
474:         data_fin  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
475:         VecCreateMPIWithArray(comm,1,local_n0,N,(const PetscScalar*)data_fin,fin);

477:         (*fin)->ops->destroy = VecDestroy_MPIFFTW;
478:       }
479:       if (fout) {
480:         data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
481:         VecCreateMPIWithArray(comm,1,local_n1,N,(const PetscScalar*)data_fout,fout);

483:         (*fout)->ops->destroy = VecDestroy_MPIFFTW;
484:       }
485:       alloc_local = fftw_mpi_local_size_1d(dim[0],comm,FFTW_BACKWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
486:       if (bout) {
487:         data_bout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
488:         VecCreateMPIWithArray(comm,1,local_n1,N,(const PetscScalar*)data_bout,bout);

490:         (*bout)->ops->destroy = VecDestroy_MPIFFTW;
491:       }
492:       break;
493: #endif
494:     case 2:
495: #if !defined(PETSC_USE_COMPLEX) /* Note that N1 is no more the product of individual dimensions */
496:       alloc_local =  fftw_mpi_local_size_2d_transposed(dim[0],dim[1]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
497:       N1          = 2*dim[0]*(dim[1]/2+1); n1 = 2*local_n0*(dim[1]/2+1);
498:       if (fin) {
499:         data_finr = (double*)fftw_malloc(sizeof(double)*alloc_local*2);
500:         VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_finr,fin);

502:         (*fin)->ops->destroy = VecDestroy_MPIFFTW;
503:       }
504:       if (bout) {
505:         data_boutr = (double*)fftw_malloc(sizeof(double)*alloc_local*2);
506:         VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_boutr,bout);

508:         (*bout)->ops->destroy = VecDestroy_MPIFFTW;
509:       }
510:       if (fout) {
511:         data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
512:         VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_fout,fout);

514:         (*fout)->ops->destroy = VecDestroy_MPIFFTW;
515:       }
516: #else
517:       /* Get local size */
518:       alloc_local = fftw_mpi_local_size_2d(dim[0],dim[1],comm,&local_n0,&local_0_start);
519:       if (fin) {
520:         data_fin = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
521:         VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fin,fin);

523:         (*fin)->ops->destroy = VecDestroy_MPIFFTW;
524:       }
525:       if (fout) {
526:         data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
527:         VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fout,fout);

529:         (*fout)->ops->destroy = VecDestroy_MPIFFTW;
530:       }
531:       if (bout) {
532:         data_bout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
533:         VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_bout,bout);

535:         (*bout)->ops->destroy = VecDestroy_MPIFFTW;
536:       }
537: #endif
538:       break;
539:     case 3:
540: #if !defined(PETSC_USE_COMPLEX)
541:       alloc_local =  fftw_mpi_local_size_3d_transposed(dim[0],dim[1],dim[2]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
542:       N1 = 2*dim[0]*dim[1]*(dim[2]/2+1); n1 = 2*local_n0*dim[1]*(dim[2]/2+1);
543:       if (fin) {
544:         data_finr = (double*)fftw_malloc(sizeof(double)*alloc_local*2);
545:         VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_finr,fin);

547:         (*fin)->ops->destroy = VecDestroy_MPIFFTW;
548:       }
549:       if (bout) {
550:         data_boutr=(double*)fftw_malloc(sizeof(double)*alloc_local*2);
551:         VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_boutr,bout);

553:         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
554:       }

556:       if (fout) {
557:         data_fout=(fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
558:         VecCreateMPIWithArray(comm,1,n1,N1,(PetscScalar*)data_fout,fout);

560:         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
561:       }
562: #else
563:       alloc_local = fftw_mpi_local_size_3d(dim[0],dim[1],dim[2],comm,&local_n0,&local_0_start);
564:       if (fin) {
565:         data_fin  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
566:         VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fin,fin);

568:         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
569:       }
570:       if (fout) {
571:         data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
572:         VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fout,fout);

574:         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
575:       }
576:       if (bout) {
577:         data_bout  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
578:         VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_bout,bout);

580:         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
581:       }
582: #endif
583:       break;
584:     default:
585: #if !defined(PETSC_USE_COMPLEX)
586:       temp = (fftw->dim_fftw)[fftw->ndim_fftw-1];

588:       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp/2 + 1;

590:       alloc_local = fftw_mpi_local_size_transposed(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
591:       N1          = 2*N*(PetscInt)((fftw->dim_fftw)[fftw->ndim_fftw-1])/((PetscInt) temp);

593:       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp;

595:       if (fin) {
596:         data_finr=(double*)fftw_malloc(sizeof(double)*alloc_local*2);
597:         VecCreateMPIWithArray(comm,1,(PetscInt)n,N1,(PetscScalar*)data_finr,fin);

599:         (*fin)->ops->destroy = VecDestroy_MPIFFTW;
600:       }
601:       if (bout) {
602:         data_boutr=(double*)fftw_malloc(sizeof(double)*alloc_local*2);
603:         VecCreateMPIWithArray(comm,1,(PetscInt)n,N1,(PetscScalar*)data_boutr,bout);

605:         (*bout)->ops->destroy = VecDestroy_MPIFFTW;
606:       }

608:       if (fout) {
609:         data_fout=(fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
610:         VecCreateMPIWithArray(comm,1,n,N1,(PetscScalar*)data_fout,fout);

612:         (*fout)->ops->destroy = VecDestroy_MPIFFTW;
613:       }
614: #else
615:       alloc_local = fftw_mpi_local_size(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start);
616:       if (fin) {
617:         data_fin  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
618:         VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fin,fin);

620:         (*fin)->ops->destroy = VecDestroy_MPIFFTW;
621:       }
622:       if (fout) {
623:         data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
624:         VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fout,fout);

626:         (*fout)->ops->destroy = VecDestroy_MPIFFTW;
627:       }
628:       if (bout) {
629:         data_bout  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
630:         VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_bout,bout);

632:         (*bout)->ops->destroy = VecDestroy_MPIFFTW;
633:       }
634: #endif
635:       break;
636:     }
637:   }
638:   return(0);
639: }

641: /*@
642:    VecScatterPetscToFFTW - Copies the PETSc vector to the vector that goes into FFTW block.

644:    Collective on Mat

646:    Input Parameters:
647: +  A - FFTW matrix
648: -  x - the PETSc vector

650:    Output Parameters:
651: .  y - the FFTW vector

653:   Options Database Keys:
654: . -mat_fftw_plannerflags - set FFTW planner flags

656:    Level: intermediate

658:    Note: For real parallel FFT, FFTW requires insertion of extra space at the end of last dimension. This required even when
659:          one is not doing in-place transform. The last dimension size must be changed to 2*(dim[last]/2+1) to accommodate these extra
660:          zeros. This routine does that job by scattering operation.

662: .seealso: VecScatterFFTWToPetsc()
663: @*/
664: PetscErrorCode VecScatterPetscToFFTW(Mat A,Vec x,Vec y)
665: {

669:   PetscUseMethod(A,"VecScatterPetscToFFTW_C",(Mat,Vec,Vec),(A,x,y));
670:   return(0);
671: }

673: PetscErrorCode VecScatterPetscToFFTW_FFTW(Mat A,Vec x,Vec y)
674: {
676:   MPI_Comm       comm;
677:   Mat_FFT        *fft  = (Mat_FFT*)A->data;
678:   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
679:   PetscInt       N     =fft->N;
680:   PetscInt       ndim  =fft->ndim,*dim=fft->dim;
681:   PetscInt       low;
682:   PetscMPIInt    rank,size;
683:   PetscInt       vsize,vsize1;
684:   ptrdiff_t      local_n0,local_0_start;
685:   ptrdiff_t      local_n1,local_1_start;
686:   VecScatter     vecscat;
687:   IS             list1,list2;
688: #if !defined(PETSC_USE_COMPLEX)
689:   PetscInt       i,j,k,partial_dim;
690:   PetscInt       *indx1, *indx2, tempindx, tempindx1;
691:   PetscInt       NM;
692:   ptrdiff_t      temp;
693: #endif

696:   PetscObjectGetComm((PetscObject)A,&comm);
697:   MPI_Comm_size(comm, &size);
698:   MPI_Comm_rank(comm, &rank);
699:   VecGetOwnershipRange(y,&low,NULL);

701:   if (size==1) {
702:     VecGetSize(x,&vsize);
703:     VecGetSize(y,&vsize1);
704:     ISCreateStride(PETSC_COMM_SELF,N,0,1,&list1);
705:     VecScatterCreate(x,list1,y,list1,&vecscat);
706:     VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
707:     VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
708:     VecScatterDestroy(&vecscat);
709:     ISDestroy(&list1);
710:   } else {
711:     switch (ndim) {
712:     case 1:
713: #if defined(PETSC_USE_COMPLEX)
714:       fftw_mpi_local_size_1d(dim[0],comm,FFTW_FORWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);

716:       ISCreateStride(comm,local_n0,local_0_start,1,&list1);
717:       ISCreateStride(comm,local_n0,low,1,&list2);
718:       VecScatterCreate(x,list1,y,list2,&vecscat);
719:       VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
720:       VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
721:       VecScatterDestroy(&vecscat);
722:       ISDestroy(&list1);
723:       ISDestroy(&list2);
724: #else
725:       SETERRQ(comm,PETSC_ERR_SUP,"FFTW does not support parallel 1D real transform");
726: #endif
727:       break;
728:     case 2:
729: #if defined(PETSC_USE_COMPLEX)
730:       fftw_mpi_local_size_2d(dim[0],dim[1],comm,&local_n0,&local_0_start);

732:       ISCreateStride(comm,local_n0*dim[1],local_0_start*dim[1],1,&list1);
733:       ISCreateStride(comm,local_n0*dim[1],low,1,&list2);
734:       VecScatterCreate(x,list1,y,list2,&vecscat);
735:       VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
736:       VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
737:       VecScatterDestroy(&vecscat);
738:       ISDestroy(&list1);
739:       ISDestroy(&list2);
740: #else
741:       fftw_mpi_local_size_2d_transposed(dim[0],dim[1]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);

743:       PetscMalloc1(((PetscInt)local_n0)*dim[1],&indx1);
744:       PetscMalloc1(((PetscInt)local_n0)*dim[1],&indx2);

746:       if (dim[1]%2==0) {
747:         NM = dim[1]+2;
748:       } else {
749:         NM = dim[1]+1;
750:       }
751:       for (i=0; i<local_n0; i++) {
752:         for (j=0; j<dim[1]; j++) {
753:           tempindx  = i*dim[1] + j;
754:           tempindx1 = i*NM + j;

756:           indx1[tempindx]=local_0_start*dim[1]+tempindx;
757:           indx2[tempindx]=low+tempindx1;
758:         }
759:       }

761:       ISCreateGeneral(comm,local_n0*dim[1],indx1,PETSC_COPY_VALUES,&list1);
762:       ISCreateGeneral(comm,local_n0*dim[1],indx2,PETSC_COPY_VALUES,&list2);

764:       VecScatterCreate(x,list1,y,list2,&vecscat);
765:       VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
766:       VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
767:       VecScatterDestroy(&vecscat);
768:       ISDestroy(&list1);
769:       ISDestroy(&list2);
770:       PetscFree(indx1);
771:       PetscFree(indx2);
772: #endif
773:       break;

775:     case 3:
776: #if defined(PETSC_USE_COMPLEX)
777:       fftw_mpi_local_size_3d(dim[0],dim[1],dim[2],comm,&local_n0,&local_0_start);

779:       ISCreateStride(comm,local_n0*dim[1]*dim[2],local_0_start*dim[1]*dim[2],1,&list1);
780:       ISCreateStride(comm,local_n0*dim[1]*dim[2],low,1,&list2);
781:       VecScatterCreate(x,list1,y,list2,&vecscat);
782:       VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
783:       VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
784:       VecScatterDestroy(&vecscat);
785:       ISDestroy(&list1);
786:       ISDestroy(&list2);
787: #else
788:       /* buggy, needs to be fixed. See src/mat/examples/tests/ex158.c */
789:       SETERRQ(comm,PETSC_ERR_SUP,"FFTW does not support parallel 3D real transform");
790:       fftw_mpi_local_size_3d_transposed(dim[0],dim[1],dim[2]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);

792:       PetscMalloc1(((PetscInt)local_n0)*dim[1]*dim[2],&indx1);
793:       PetscMalloc1(((PetscInt)local_n0)*dim[1]*dim[2],&indx2);

795:       if (dim[2]%2==0) NM = dim[2]+2;
796:       else             NM = dim[2]+1;

798:       for (i=0; i<local_n0; i++) {
799:         for (j=0; j<dim[1]; j++) {
800:           for (k=0; k<dim[2]; k++) {
801:             tempindx  = i*dim[1]*dim[2] + j*dim[2] + k;
802:             tempindx1 = i*dim[1]*NM + j*NM + k;

804:             indx1[tempindx]=local_0_start*dim[1]*dim[2]+tempindx;
805:             indx2[tempindx]=low+tempindx1;
806:           }
807:         }
808:       }

810:       ISCreateGeneral(comm,local_n0*dim[1]*dim[2],indx1,PETSC_COPY_VALUES,&list1);
811:       ISCreateGeneral(comm,local_n0*dim[1]*dim[2],indx2,PETSC_COPY_VALUES,&list2);
812:       VecScatterCreate(x,list1,y,list2,&vecscat);
813:       VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
814:       VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
815:       VecScatterDestroy(&vecscat);
816:       ISDestroy(&list1);
817:       ISDestroy(&list2);
818:       PetscFree(indx1);
819:       PetscFree(indx2);
820: #endif
821:       break;

823:     default:
824: #if defined(PETSC_USE_COMPLEX)
825:       fftw_mpi_local_size(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start);

827:       ISCreateStride(comm,local_n0*(fftw->partial_dim),local_0_start*(fftw->partial_dim),1,&list1);
828:       ISCreateStride(comm,local_n0*(fftw->partial_dim),low,1,&list2);
829:       VecScatterCreate(x,list1,y,list2,&vecscat);
830:       VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
831:       VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
832:       VecScatterDestroy(&vecscat);
833:       ISDestroy(&list1);
834:       ISDestroy(&list2);
835: #else
836:       /* buggy, needs to be fixed. See src/mat/examples/tests/ex158.c */
837:       SETERRQ(comm,PETSC_ERR_SUP,"FFTW does not support parallel DIM>3 real transform");
838:       temp = (fftw->dim_fftw)[fftw->ndim_fftw-1];

840:       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp/2 + 1;

842:       fftw_mpi_local_size_transposed(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);

844:       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp;

846:       partial_dim = fftw->partial_dim;

848:       PetscMalloc1(((PetscInt)local_n0)*partial_dim,&indx1);
849:       PetscMalloc1(((PetscInt)local_n0)*partial_dim,&indx2);

851:       if (dim[ndim-1]%2==0) NM = 2;
852:       else                  NM = 1;

854:       j = low;
855:       for (i=0,k=1; i<((PetscInt)local_n0)*partial_dim;i++,k++) {
856:         indx1[i] = local_0_start*partial_dim + i;
857:         indx2[i] = j;
858:         if (k%dim[ndim-1]==0) j+=NM;
859:         j++;
860:       }
861:       ISCreateGeneral(comm,local_n0*partial_dim,indx1,PETSC_COPY_VALUES,&list1);
862:       ISCreateGeneral(comm,local_n0*partial_dim,indx2,PETSC_COPY_VALUES,&list2);
863:       VecScatterCreate(x,list1,y,list2,&vecscat);
864:       VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
865:       VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
866:       VecScatterDestroy(&vecscat);
867:       ISDestroy(&list1);
868:       ISDestroy(&list2);
869:       PetscFree(indx1);
870:       PetscFree(indx2);
871: #endif
872:       break;
873:     }
874:   }
875:   return(0);
876: }

878: /*@
879:    VecScatterFFTWToPetsc - Converts FFTW output to the PETSc vector.

881:    Collective on Mat

883:     Input Parameters:
884: +   A - FFTW matrix
885: -   x - FFTW vector

887:    Output Parameters:
888: .  y - PETSc vector

890:    Level: intermediate

892:    Note: While doing real transform the FFTW output of backward DFT contains extra zeros at the end of last dimension.
893:          VecScatterFFTWToPetsc removes those extra zeros.

895: .seealso: VecScatterPetscToFFTW()
896: @*/
897: PetscErrorCode VecScatterFFTWToPetsc(Mat A,Vec x,Vec y)
898: {

902:   PetscUseMethod(A,"VecScatterFFTWToPetsc_C",(Mat,Vec,Vec),(A,x,y));
903:   return(0);
904: }

906: PetscErrorCode VecScatterFFTWToPetsc_FFTW(Mat A,Vec x,Vec y)
907: {
909:   MPI_Comm       comm;
910:   Mat_FFT        *fft  = (Mat_FFT*)A->data;
911:   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
912:   PetscInt       N     = fft->N;
913:   PetscInt       ndim  = fft->ndim,*dim=fft->dim;
914:   PetscInt       low;
915:   PetscMPIInt    rank,size;
916:   ptrdiff_t      local_n0,local_0_start;
917:   ptrdiff_t      local_n1,local_1_start;
918:   VecScatter     vecscat;
919:   IS             list1,list2;
920: #if !defined(PETSC_USE_COMPLEX)
921:   PetscInt       i,j,k,partial_dim;
922:   PetscInt       *indx1, *indx2, tempindx, tempindx1;
923:   PetscInt       NM;
924:   ptrdiff_t      temp;
925: #endif

928:   PetscObjectGetComm((PetscObject)A,&comm);
929:   MPI_Comm_size(comm, &size);
930:   MPI_Comm_rank(comm, &rank);
931:   VecGetOwnershipRange(x,&low,NULL);

933:   if (size==1) {
934:     ISCreateStride(comm,N,0,1,&list1);
935:     VecScatterCreate(x,list1,y,list1,&vecscat);
936:     VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
937:     VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
938:     VecScatterDestroy(&vecscat);
939:     ISDestroy(&list1);

941:   } else {
942:     switch (ndim) {
943:     case 1:
944: #if defined(PETSC_USE_COMPLEX)
945:       fftw_mpi_local_size_1d(dim[0],comm,FFTW_BACKWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);

947:       ISCreateStride(comm,local_n1,local_1_start,1,&list1);
948:       ISCreateStride(comm,local_n1,low,1,&list2);
949:       VecScatterCreate(x,list1,y,list2,&vecscat);
950:       VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
951:       VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
952:       VecScatterDestroy(&vecscat);
953:       ISDestroy(&list1);
954:       ISDestroy(&list2);
955: #else
956:       SETERRQ(comm,PETSC_ERR_SUP,"No support for real parallel 1D FFT");
957: #endif
958:       break;
959:     case 2:
960: #if defined(PETSC_USE_COMPLEX)
961:       fftw_mpi_local_size_2d(dim[0],dim[1],comm,&local_n0,&local_0_start);

963:       ISCreateStride(comm,local_n0*dim[1],local_0_start*dim[1],1,&list1);
964:       ISCreateStride(comm,local_n0*dim[1],low,1,&list2);
965:       VecScatterCreate(x,list2,y,list1,&vecscat);
966:       VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
967:       VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
968:       VecScatterDestroy(&vecscat);
969:       ISDestroy(&list1);
970:       ISDestroy(&list2);
971: #else
972:       fftw_mpi_local_size_2d_transposed(dim[0],dim[1]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);

974:       PetscMalloc1(((PetscInt)local_n0)*dim[1],&indx1);
975:       PetscMalloc1(((PetscInt)local_n0)*dim[1],&indx2);

977:       if (dim[1]%2==0) NM = dim[1]+2;
978:       else             NM = dim[1]+1;

980:       for (i=0; i<local_n0; i++) {
981:         for (j=0; j<dim[1]; j++) {
982:           tempindx = i*dim[1] + j;
983:           tempindx1 = i*NM + j;

985:           indx1[tempindx]=local_0_start*dim[1]+tempindx;
986:           indx2[tempindx]=low+tempindx1;
987:         }
988:       }

990:       ISCreateGeneral(comm,local_n0*dim[1],indx1,PETSC_COPY_VALUES,&list1);
991:       ISCreateGeneral(comm,local_n0*dim[1],indx2,PETSC_COPY_VALUES,&list2);

993:       VecScatterCreate(x,list2,y,list1,&vecscat);
994:       VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
995:       VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
996:       VecScatterDestroy(&vecscat);
997:       ISDestroy(&list1);
998:       ISDestroy(&list2);
999:       PetscFree(indx1);
1000:       PetscFree(indx2);
1001: #endif
1002:       break;
1003:     case 3:
1004: #if defined(PETSC_USE_COMPLEX)
1005:       fftw_mpi_local_size_3d(dim[0],dim[1],dim[2],comm,&local_n0,&local_0_start);

1007:       ISCreateStride(comm,local_n0*dim[1]*dim[2],local_0_start*dim[1]*dim[2],1,&list1);
1008:       ISCreateStride(comm,local_n0*dim[1]*dim[2],low,1,&list2);
1009:       VecScatterCreate(x,list1,y,list2,&vecscat);
1010:       VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
1011:       VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
1012:       VecScatterDestroy(&vecscat);
1013:       ISDestroy(&list1);
1014:       ISDestroy(&list2);
1015: #else
1016:       fftw_mpi_local_size_3d_transposed(dim[0],dim[1],dim[2]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);

1018:       PetscMalloc1(((PetscInt)local_n0)*dim[1]*dim[2],&indx1);
1019:       PetscMalloc1(((PetscInt)local_n0)*dim[1]*dim[2],&indx2);

1021:       if (dim[2]%2==0) NM = dim[2]+2;
1022:       else             NM = dim[2]+1;

1024:       for (i=0; i<local_n0; i++) {
1025:         for (j=0; j<dim[1]; j++) {
1026:           for (k=0; k<dim[2]; k++) {
1027:             tempindx  = i*dim[1]*dim[2] + j*dim[2] + k;
1028:             tempindx1 = i*dim[1]*NM + j*NM + k;

1030:             indx1[tempindx]=local_0_start*dim[1]*dim[2]+tempindx;
1031:             indx2[tempindx]=low+tempindx1;
1032:           }
1033:         }
1034:       }

1036:       ISCreateGeneral(comm,local_n0*dim[1]*dim[2],indx1,PETSC_COPY_VALUES,&list1);
1037:       ISCreateGeneral(comm,local_n0*dim[1]*dim[2],indx2,PETSC_COPY_VALUES,&list2);

1039:       VecScatterCreate(x,list2,y,list1,&vecscat);
1040:       VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
1041:       VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
1042:       VecScatterDestroy(&vecscat);
1043:       ISDestroy(&list1);
1044:       ISDestroy(&list2);
1045:       PetscFree(indx1);
1046:       PetscFree(indx2);
1047: #endif
1048:       break;
1049:     default:
1050: #if defined(PETSC_USE_COMPLEX)
1051:       fftw_mpi_local_size(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start);

1053:       ISCreateStride(comm,local_n0*(fftw->partial_dim),local_0_start*(fftw->partial_dim),1,&list1);
1054:       ISCreateStride(comm,local_n0*(fftw->partial_dim),low,1,&list2);
1055:       VecScatterCreate(x,list1,y,list2,&vecscat);
1056:       VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
1057:       VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
1058:       VecScatterDestroy(&vecscat);
1059:       ISDestroy(&list1);
1060:       ISDestroy(&list2);
1061: #else
1062:       temp = (fftw->dim_fftw)[fftw->ndim_fftw-1];

1064:       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp/2 + 1;

1066:       fftw_mpi_local_size_transposed(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);

1068:       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp;

1070:       partial_dim = fftw->partial_dim;

1072:       PetscMalloc1(((PetscInt)local_n0)*partial_dim,&indx1);
1073:       PetscMalloc1(((PetscInt)local_n0)*partial_dim,&indx2);

1075:       if (dim[ndim-1]%2==0) NM = 2;
1076:       else                  NM = 1;

1078:       j = low;
1079:       for (i=0,k=1; i<((PetscInt)local_n0)*partial_dim; i++,k++) {
1080:         indx1[i] = local_0_start*partial_dim + i;
1081:         indx2[i] = j;
1082:         if (k%dim[ndim-1]==0) j+=NM;
1083:         j++;
1084:       }
1085:       ISCreateGeneral(comm,local_n0*partial_dim,indx1,PETSC_COPY_VALUES,&list1);
1086:       ISCreateGeneral(comm,local_n0*partial_dim,indx2,PETSC_COPY_VALUES,&list2);

1088:       VecScatterCreate(x,list2,y,list1,&vecscat);
1089:       VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
1090:       VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
1091:       VecScatterDestroy(&vecscat);
1092:       ISDestroy(&list1);
1093:       ISDestroy(&list2);
1094:       PetscFree(indx1);
1095:       PetscFree(indx2);
1096: #endif
1097:       break;
1098:     }
1099:   }
1100:   return(0);
1101: }

1103: /*
1104:     MatCreate_FFTW - Creates a matrix object that provides FFT via the external package FFTW

1106:   Options Database Keys:
1107: + -mat_fftw_plannerflags - set FFTW planner flags

1109:    Level: intermediate

1111: */
1112: PETSC_EXTERN PetscErrorCode MatCreate_FFTW(Mat A)
1113: {
1115:   MPI_Comm       comm;
1116:   Mat_FFT        *fft=(Mat_FFT*)A->data;
1117:   Mat_FFTW       *fftw;
1118:   PetscInt       n=fft->n,N=fft->N,ndim=fft->ndim,*dim=fft->dim;
1119:   const char     *plans[]={"FFTW_ESTIMATE","FFTW_MEASURE","FFTW_PATIENT","FFTW_EXHAUSTIVE"};
1120:   unsigned       iplans[]={FFTW_ESTIMATE,FFTW_MEASURE,FFTW_PATIENT,FFTW_EXHAUSTIVE};
1121:   PetscBool      flg;
1122:   PetscInt       p_flag,partial_dim=1,ctr;
1123:   PetscMPIInt    size,rank;
1124:   ptrdiff_t      *pdim;
1125:   ptrdiff_t      local_n1,local_1_start;
1126: #if !defined(PETSC_USE_COMPLEX)
1127:   ptrdiff_t      temp;
1128:   PetscInt       N1,tot_dim;
1129: #else
1130:   PetscInt       n1;
1131: #endif

1134:   PetscObjectGetComm((PetscObject)A,&comm);
1135:   MPI_Comm_size(comm, &size);
1136:   MPI_Comm_rank(comm, &rank);

1138:   fftw_mpi_init();
1139:   pdim    = (ptrdiff_t*)calloc(ndim,sizeof(ptrdiff_t));
1140:   pdim[0] = dim[0];
1141: #if !defined(PETSC_USE_COMPLEX)
1142:   tot_dim = 2*dim[0];
1143: #endif
1144:   for (ctr=1; ctr<ndim; ctr++) {
1145:     partial_dim *= dim[ctr];
1146:     pdim[ctr]    = dim[ctr];
1147: #if !defined(PETSC_USE_COMPLEX)
1148:     if (ctr==ndim-1) tot_dim *= (dim[ctr]/2+1);
1149:     else             tot_dim *= dim[ctr];
1150: #endif
1151:   }

1153:   if (size == 1) {
1154: #if defined(PETSC_USE_COMPLEX)
1155:     MatSetSizes(A,N,N,N,N);
1156:     n    = N;
1157: #else
1158:     MatSetSizes(A,tot_dim,tot_dim,tot_dim,tot_dim);
1159:     n    = tot_dim;
1160: #endif

1162:   } else {
1163:     ptrdiff_t local_n0,local_0_start;
1164:     switch (ndim) {
1165:     case 1:
1166: #if !defined(PETSC_USE_COMPLEX)
1167:       SETERRQ(comm,PETSC_ERR_SUP,"FFTW does not support parallel 1D real transform");
1168: #else
1169:       fftw_mpi_local_size_1d(dim[0],comm,FFTW_FORWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);

1171:       n    = (PetscInt)local_n0;
1172:       n1   = (PetscInt)local_n1;
1173:       MatSetSizes(A,n1,n,N,N);
1174: #endif
1175:       break;
1176:     case 2:
1177: #if defined(PETSC_USE_COMPLEX)
1178:       fftw_mpi_local_size_2d(dim[0],dim[1],comm,&local_n0,&local_0_start);
1179:       /*
1180:        PetscSynchronizedPrintf(comm,"[%d] MatCreateSeqFFTW: local_n0, local_0_start %d %d, N %d,dim %d, %d\n",rank,(PetscInt)local_n0*dim[1],(PetscInt)local_0_start,m,dim[0],dim[1]);
1181:        PetscSynchronizedFlush(comm,PETSC_STDOUT);
1182:        */
1183:       n    = (PetscInt)local_n0*dim[1];
1184:       MatSetSizes(A,n,n,N,N);
1185: #else
1186:       fftw_mpi_local_size_2d_transposed(dim[0],dim[1]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);

1188:       n    = 2*(PetscInt)local_n0*(dim[1]/2+1);
1189:       MatSetSizes(A,n,n,2*dim[0]*(dim[1]/2+1),2*dim[0]*(dim[1]/2+1));
1190: #endif
1191:       break;
1192:     case 3:
1193: #if defined(PETSC_USE_COMPLEX)
1194:       fftw_mpi_local_size_3d(dim[0],dim[1],dim[2],comm,&local_n0,&local_0_start);

1196:       n    = (PetscInt)local_n0*dim[1]*dim[2];
1197:       MatSetSizes(A,n,n,N,N);
1198: #else
1199:       fftw_mpi_local_size_3d_transposed(dim[0],dim[1],dim[2]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);

1201:       n   = 2*(PetscInt)local_n0*dim[1]*(dim[2]/2+1);
1202:       MatSetSizes(A,n,n,2*dim[0]*dim[1]*(dim[2]/2+1),2*dim[0]*dim[1]*(dim[2]/2+1));
1203: #endif
1204:       break;
1205:     default:
1206: #if defined(PETSC_USE_COMPLEX)
1207:       fftw_mpi_local_size(ndim,pdim,comm,&local_n0,&local_0_start);

1209:       n    = (PetscInt)local_n0*partial_dim;
1210:       MatSetSizes(A,n,n,N,N);
1211: #else
1212:       temp = pdim[ndim-1];

1214:       pdim[ndim-1] = temp/2 + 1;

1216:       fftw_mpi_local_size_transposed(ndim,pdim,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);

1218:       n  = 2*(PetscInt)local_n0*partial_dim*pdim[ndim-1]/temp;
1219:       N1 = 2*N*(PetscInt)pdim[ndim-1]/((PetscInt) temp);

1221:       pdim[ndim-1] = temp;

1223:       MatSetSizes(A,n,n,N1,N1);
1224: #endif
1225:       break;
1226:     }
1227:   }
1228:   free(pdim);
1229:   PetscObjectChangeTypeName((PetscObject)A,MATFFTW);
1230:   PetscNewLog(A,&fftw);
1231:   fft->data = (void*)fftw;

1233:   fft->n            = n;
1234:   fftw->ndim_fftw   = (ptrdiff_t)ndim; /* This is dimension of fft */
1235:   fftw->partial_dim = partial_dim;

1237:   PetscMalloc1(ndim, &(fftw->dim_fftw));
1238:   if (size == 1) {
1239: #if defined(PETSC_USE_64BIT_INDICES)
1240:     fftw->iodims = (fftw_iodim64 *) malloc(sizeof(fftw_iodim64) * ndim);
1241: #else
1242:     fftw->iodims = (fftw_iodim *) malloc(sizeof(fftw_iodim) * ndim);
1243: #endif
1244:   }

1246:   for (ctr=0;ctr<ndim;ctr++) (fftw->dim_fftw)[ctr]=dim[ctr];

1248:   fftw->p_forward  = 0;
1249:   fftw->p_backward = 0;
1250:   fftw->p_flag     = FFTW_ESTIMATE;

1252:   if (size == 1) {
1253:     A->ops->mult          = MatMult_SeqFFTW;
1254:     A->ops->multtranspose = MatMultTranspose_SeqFFTW;
1255:   } else {
1256:     A->ops->mult          = MatMult_MPIFFTW;
1257:     A->ops->multtranspose = MatMultTranspose_MPIFFTW;
1258:   }
1259:   fft->matdestroy = MatDestroy_FFTW;
1260:   A->assembled    = PETSC_TRUE;
1261:   A->preallocated = PETSC_TRUE;

1263:   PetscObjectComposeFunction((PetscObject)A,"MatCreateVecsFFTW_C",MatCreateVecsFFTW_FFTW);
1264:   PetscObjectComposeFunction((PetscObject)A,"VecScatterPetscToFFTW_C",VecScatterPetscToFFTW_FFTW);
1265:   PetscObjectComposeFunction((PetscObject)A,"VecScatterFFTWToPetsc_C",VecScatterFFTWToPetsc_FFTW);

1267:   /* get runtime options */
1268:   PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"FFTW Options","Mat");
1269:   PetscOptionsEList("-mat_fftw_plannerflags","Planner Flags","None",plans,4,plans[0],&p_flag,&flg);
1270:   if (flg) {
1271:     fftw->p_flag = iplans[p_flag];
1272:   }
1273:   PetscOptionsEnd();
1274:   return(0);
1275: }