Actual source code: fftw.c

petsc-3.3-p7 2013-05-11
  2: /*
  3:     Provides an interface to the FFTW package.
  4:     Testing examples can be found in ~src/mat/examples/tests
  5: */

  7: #include <../src/mat/impls/fft/fft.h>   /*I "petscmat.h" I*/
  8: EXTERN_C_BEGIN
  9: #include <fftw3-mpi.h>
 10: EXTERN_C_END

 12: typedef struct {
 13:   ptrdiff_t   ndim_fftw,*dim_fftw;
 14:   PetscInt    partial_dim;
 15:   fftw_plan   p_forward,p_backward;
 16:   unsigned    p_flag; /* planner flags, FFTW_ESTIMATE,FFTW_MEASURE, FFTW_PATIENT, FFTW_EXHAUSTIVE */
 17:   PetscScalar *finarray,*foutarray,*binarray,*boutarray; /* keep track of arrays because an fftw plan should be
 18:                                                             executed on the arrays with which the plan was created */
 19: } Mat_FFTW;

 21: extern PetscErrorCode MatMult_SeqFFTW(Mat,Vec,Vec);
 22: extern PetscErrorCode MatMultTranspose_SeqFFTW(Mat,Vec,Vec);
 23: extern PetscErrorCode MatMult_MPIFFTW(Mat,Vec,Vec);
 24: extern PetscErrorCode MatMultTranspose_MPIFFTW(Mat,Vec,Vec);
 25: extern PetscErrorCode MatDestroy_FFTW(Mat);
 26: extern PetscErrorCode VecDestroy_MPIFFTW(Vec);

 28: /* MatMult_SeqFFTW performs serial forward DFT
 29:    Input parameter:
 30:      A - the matrix
 31:      x - the vector on which FDFT will be performed

 33:    Output parameter:
 34:      y - vector that stores result of FDFT
 35: */
 38: PetscErrorCode MatMult_SeqFFTW(Mat A,Vec x,Vec y)
 39: {
 41:   Mat_FFT        *fft  = (Mat_FFT*)A->data;
 42:   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
 43:   PetscScalar    *x_array,*y_array;
 44:   PetscInt       ndim=fft->ndim,*dim=fft->dim;

 47:   VecGetArray(x,&x_array);
 48:   VecGetArray(y,&y_array);
 49:   if (!fftw->p_forward){ /* create a plan, then execute it */
 50:     switch (ndim){
 51:     case 1:
 52: #if defined(PETSC_USE_COMPLEX)
 53:       fftw->p_forward = fftw_plan_dft_1d(dim[0],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
 54: #else
 55:       fftw->p_forward = fftw_plan_dft_r2c_1d(dim[0],(double *)x_array,(fftw_complex*)y_array,fftw->p_flag);
 56: #endif
 57:       break;
 58:     case 2:
 59: #if defined(PETSC_USE_COMPLEX)
 60:       fftw->p_forward = fftw_plan_dft_2d(dim[0],dim[1],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
 61: #else
 62:       fftw->p_forward = fftw_plan_dft_r2c_2d(dim[0],dim[1],(double *)x_array,(fftw_complex*)y_array,fftw->p_flag);
 63: #endif
 64:       break;
 65:     case 3:
 66: #if defined(PETSC_USE_COMPLEX)
 67:       fftw->p_forward = fftw_plan_dft_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
 68: #else
 69:       fftw->p_forward = fftw_plan_dft_r2c_3d(dim[0],dim[1],dim[2],(double *)x_array,(fftw_complex*)y_array,fftw->p_flag);
 70: #endif
 71:       break;
 72:     default:
 73: #if defined(PETSC_USE_COMPLEX)
 74:       fftw->p_forward = fftw_plan_dft(ndim,dim,(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
 75: #else
 76:       fftw->p_forward = fftw_plan_dft_r2c(ndim,dim,(double *)x_array,(fftw_complex*)y_array,fftw->p_flag);
 77: #endif
 78:       break;
 79:     }
 80:     fftw->finarray  = x_array;
 81:     fftw->foutarray = y_array;
 82:     /* Warning: if (fftw->p_flag != FFTW_ESTIMATE) the data in the in/out arrays is overwritten!
 83:                 Planning should be done before x is initialized! See FFTW manual sec. 2.1 or sec. 4 */
 84:     fftw_execute(fftw->p_forward);
 85:   } else { /* use existing plan */
 86:     if (fftw->finarray != x_array || fftw->foutarray != y_array){ /* use existing plan on new arrays */
 87:       fftw_execute_dft(fftw->p_forward,(fftw_complex*)x_array,(fftw_complex*)y_array);
 88:     } else {
 89:       fftw_execute(fftw->p_forward);
 90:     }
 91:   }
 92:   VecRestoreArray(y,&y_array);
 93:   VecRestoreArray(x,&x_array);
 94:   return(0);
 95: }
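
/* Illustrative sketch (not part of the original source): driving the routine
   above through the public interface. The 1D length 16 is an assumption made
   only for this example; everything else is the standard PETSc idiom.

     Mat            A;
     Vec            x,y;
     PetscInt       dim[1] = {16};
     PetscErrorCode ierr;

     ierr = MatCreateFFT(PETSC_COMM_SELF,1,dim,MATFFTW,&A);CHKERRQ(ierr);
     ierr = MatGetVecsFFTW(A,&x,&y,PETSC_NULL);CHKERRQ(ierr);
     ierr = VecSet(x,1.0);CHKERRQ(ierr);
     ierr = MatMult(A,x,y);CHKERRQ(ierr);      (dispatches to MatMult_SeqFFTW)
     ierr = VecDestroy(&x);CHKERRQ(ierr);
     ierr = VecDestroy(&y);CHKERRQ(ierr);
     ierr = MatDestroy(&A);CHKERRQ(ierr);
*/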

 97: /* MatMultTranspose_SeqFFTW performs serial backward DFT 
 98:    Input parameter:
 99:      A - the matrix
100:      x - the vector on which BDFT will be performed

102:    Output parameter:
103:      y - vector that stores result of BDFT
104: */

108: PetscErrorCode MatMultTranspose_SeqFFTW(Mat A,Vec x,Vec y)
109: {
111:   Mat_FFT        *fft = (Mat_FFT*)A->data;
112:   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
113:   PetscScalar    *x_array,*y_array;
114:   PetscInt       ndim=fft->ndim,*dim=fft->dim;

117:   VecGetArray(x,&x_array);
118:   VecGetArray(y,&y_array);
119:   if (!fftw->p_backward){ /* create a plan, then execute it */
120:     switch (ndim){
121:     case 1:
122: #if defined(PETSC_USE_COMPLEX)
123:       fftw->p_backward = fftw_plan_dft_1d(dim[0],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
124: #else
125:       fftw->p_backward = fftw_plan_dft_c2r_1d(dim[0],(fftw_complex*)x_array,(double *)y_array,fftw->p_flag);
126: #endif
127:       break;
128:     case 2:
129: #if defined(PETSC_USE_COMPLEX)
130:       fftw->p_backward = fftw_plan_dft_2d(dim[0],dim[1],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
131: #else
132:       fftw->p_backward = fftw_plan_dft_c2r_2d(dim[0],dim[1],(fftw_complex*)x_array,(double *)y_array,fftw->p_flag);
133: #endif
134:       break;
135:     case 3:
136: #if defined(PETSC_USE_COMPLEX)
137:       fftw->p_backward = fftw_plan_dft_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
138: #else
139:       fftw->p_backward = fftw_plan_dft_c2r_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(double *)y_array,fftw->p_flag);
140: #endif
141:       break;
142:     default:
143: #if defined(PETSC_USE_COMPLEX)
144:       fftw->p_backward = fftw_plan_dft(ndim,dim,(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
145: #else
146:       fftw->p_backward = fftw_plan_dft_c2r(ndim,dim,(fftw_complex*)x_array,(double *)y_array,fftw->p_flag);
147: #endif
148:       break;
149:     }
150:     fftw->binarray  = x_array;
151:     fftw->boutarray = y_array;
152:     fftw_execute(fftw->p_backward);
153:   } else { /* use existing plan */
154:     if (fftw->binarray != x_array || fftw->boutarray != y_array){ /* use existing plan on new arrays */
155:       fftw_execute_dft(fftw->p_backward,(fftw_complex*)x_array,(fftw_complex*)y_array);
156:     } else {
157:       fftw_execute(fftw->p_backward);
158:     }
159:   }
160:   VecRestoreArray(y,&y_array);
161:   VecRestoreArray(x,&x_array);
162:   return(0);
163: }
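
/* Note (a sketch based on FFTW's documented convention, not code from this
   file): the backward transform above is unnormalized, so a forward DFT
   followed by a backward DFT scales the data by N = fft->N, the total number
   of grid points. The original vector is recovered by rescaling:

     ierr = MatMult(A,x,y);CHKERRQ(ierr);               (forward DFT)
     ierr = MatMultTranspose(A,y,z);CHKERRQ(ierr);      (backward DFT)
     ierr = VecScale(z,1.0/N);CHKERRQ(ierr);            (now z equals x)
*/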

165: /* MatMult_MPIFFTW performs forward DFT in parallel
166:    Input parameter:
167:      A - the matrix
168:      x - the vector on which FDFT will be performed

170:    Output parameter:
171:      y - vector that stores result of FDFT
172: */
175: PetscErrorCode MatMult_MPIFFTW(Mat A,Vec x,Vec y)
176: {
178:   Mat_FFT        *fft  = (Mat_FFT*)A->data;
179:   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
180:   PetscScalar    *x_array,*y_array;
181:   PetscInt       ndim=fft->ndim,*dim=fft->dim;
182:   MPI_Comm       comm=((PetscObject)A)->comm;

185:   VecGetArray(x,&x_array);
186:   VecGetArray(y,&y_array);
187:   if (!fftw->p_forward){ /* create a plan, then execute it */
188:     switch (ndim){
189:     case 1:
190: #if defined(PETSC_USE_COMPLEX)
191:       fftw->p_forward = fftw_mpi_plan_dft_1d(dim[0],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_FORWARD,fftw->p_flag);
192: #else
193:       SETERRQ(comm,PETSC_ERR_SUP,"no support for real numbers yet");
194: #endif 
195:       break;
196:     case 2:
197: #if defined(PETSC_USE_COMPLEX) /* For complex transforms call fftw_mpi_plan_dft, for real transforms call fftw_mpi_plan_dft_r2c */ 
198:       fftw->p_forward = fftw_mpi_plan_dft_2d(dim[0],dim[1],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_FORWARD,fftw->p_flag);
199: #else
200:       fftw->p_forward = fftw_mpi_plan_dft_r2c_2d(dim[0],dim[1],(double *)x_array,(fftw_complex*)y_array,comm,FFTW_ESTIMATE);
201: #endif 
202:       break;
203:     case 3:
204: #if defined(PETSC_USE_COMPLEX)
205:       fftw->p_forward = fftw_mpi_plan_dft_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_FORWARD,fftw->p_flag);
206: #else
207:       fftw->p_forward = fftw_mpi_plan_dft_r2c_3d(dim[0],dim[1],dim[2],(double *)x_array,(fftw_complex*)y_array,comm,FFTW_ESTIMATE);
208: #endif 
209:       break;
210:     default:
211: #if defined(PETSC_USE_COMPLEX)
212:       fftw->p_forward = fftw_mpi_plan_dft(fftw->ndim_fftw,fftw->dim_fftw,(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_FORWARD,fftw->p_flag);
213: #else
214:       fftw->p_forward = fftw_mpi_plan_dft_r2c(fftw->ndim_fftw,fftw->dim_fftw,(double *)x_array,(fftw_complex*)y_array,comm,FFTW_ESTIMATE);
215: #endif 
216:       break;
217:     }
218:     fftw->finarray  = x_array;
219:     fftw->foutarray = y_array;
220:     /* Warning: if (fftw->p_flag != FFTW_ESTIMATE) the data in the in/out arrays is overwritten!
221:                 Planning should be done before x is initialized! See FFTW manual sec. 2.1 or sec. 4 */
222:     fftw_execute(fftw->p_forward);
223:   } else { /* use existing plan */
224:     if (fftw->finarray != x_array || fftw->foutarray != y_array){ /* use existing plan on new arrays */
225:       fftw_execute_dft(fftw->p_forward,(fftw_complex*)x_array,(fftw_complex*)y_array);
226:     } else {
227:       fftw_execute(fftw->p_forward);
228:     }
229:   }
230:   VecRestoreArray(y,&y_array);
231:   VecRestoreArray(x,&x_array);
232:   return(0);
233: }

235: /* MatMultTranspose_MPIFFTW performs parallel backward DFT    
236:    Input parameter:
237:      A - the matrix
238:      x - the vector on which BDFT will be performed

240:    Output parameter:
241:      y - vector that stores result of BDFT
242: */
245: PetscErrorCode MatMultTranspose_MPIFFTW(Mat A,Vec x,Vec y)
246: {
248:   Mat_FFT        *fft  = (Mat_FFT*)A->data;
249:   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
250:   PetscScalar    *x_array,*y_array;
251:   PetscInt       ndim=fft->ndim,*dim=fft->dim;
252:   MPI_Comm       comm=((PetscObject)A)->comm;

255:   VecGetArray(x,&x_array);
256:   VecGetArray(y,&y_array);
257:   if (!fftw->p_backward){ /* create a plan, then execute it */
258:     switch (ndim){
259:     case 1:
260: #if defined(PETSC_USE_COMPLEX)
261:       fftw->p_backward = fftw_mpi_plan_dft_1d(dim[0],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_BACKWARD,fftw->p_flag);
262: #else
263:       SETERRQ(comm,PETSC_ERR_SUP,"no support for real numbers yet");
264: #endif 
265:       break;
266:     case 2:
267: #if defined(PETSC_USE_COMPLEX) /* For complex transforms call fftw_mpi_plan_dft with flag FFTW_BACKWARD, for real transforms call fftw_mpi_plan_dft_c2r */ 
268:       fftw->p_backward = fftw_mpi_plan_dft_2d(dim[0],dim[1],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_BACKWARD,fftw->p_flag);
269: #else
270:       fftw->p_backward = fftw_mpi_plan_dft_c2r_2d(dim[0],dim[1],(fftw_complex*)x_array,(double *)y_array,comm,FFTW_ESTIMATE);
271: #endif 
272:       break;
273:     case 3:
274: #if defined(PETSC_USE_COMPLEX)
275:       fftw->p_backward = fftw_mpi_plan_dft_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_BACKWARD,fftw->p_flag);
276: #else
277:       fftw->p_backward = fftw_mpi_plan_dft_c2r_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(double *)y_array,comm,FFTW_ESTIMATE);
278: #endif 
279:       break;
280:     default:
281: #if defined(PETSC_USE_COMPLEX)
282:       fftw->p_backward = fftw_mpi_plan_dft(fftw->ndim_fftw,fftw->dim_fftw,(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_BACKWARD,fftw->p_flag);
283: #else
284:       fftw->p_backward = fftw_mpi_plan_dft_c2r(fftw->ndim_fftw,fftw->dim_fftw,(fftw_complex*)x_array,(double *)y_array,comm,FFTW_ESTIMATE);
285: #endif 
286:       break;
287:     }
288:     fftw->binarray  = x_array;
289:     fftw->boutarray = y_array;
290:     fftw_execute(fftw->p_backward);
291:   } else { /* use existing plan */
292:     if (fftw->binarray != x_array || fftw->boutarray != y_array){ /* use existing plan on new arrays */
293:       fftw_execute_dft(fftw->p_backward,(fftw_complex*)x_array,(fftw_complex*)y_array);
294:     } else {
295:       fftw_execute(fftw->p_backward);
296:     }
297:   }
298:   VecRestoreArray(y,&y_array);
299:   VecRestoreArray(x,&x_array);
300:   return(0);
301: }

305: PetscErrorCode MatDestroy_FFTW(Mat A)
306: {
307:   Mat_FFT        *fft = (Mat_FFT*)A->data;
308:   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;

312:   fftw_destroy_plan(fftw->p_forward);
313:   fftw_destroy_plan(fftw->p_backward);
314:   PetscFree(fftw->dim_fftw);
315:   PetscFree(fft->data);
316:   fftw_mpi_cleanup();
317:   return(0);
318: }

320: #include <../src/vec/vec/impls/mpi/pvecimpl.h>   /*I  "petscvec.h"   I*/
323: PetscErrorCode VecDestroy_MPIFFTW(Vec v)
324: {
325:   PetscErrorCode  ierr;
326:   PetscScalar     *array;

329:   VecGetArray(v,&array);
330:   fftw_free((fftw_complex*)array);
331:   VecRestoreArray(v,&array);
332:   VecDestroy_MPI(v);
333:   return(0);
334: }
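
/* Note: the array is released with fftw_free() rather than PetscFree() because
   the vectors created in MatGetVecsFFTW_FFTW() below wrap memory obtained from
   fftw_malloc(). */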

338: /*@
339:    MatGetVecsFFTW - Get vector(s) compatible with the matrix, i.e. with the
340:      parallel layout determined by FFTW

342:    Collective on Mat

344:    Input Parameter:
345: .   A - the matrix

347:    Output Parameter:
348: +   x - (optional) input vector of forward FFTW
349: .   y - (optional) output vector of forward FFTW
350: -   z - (optional) output vector of backward FFTW

352:   Level: advanced

354:   Note: The parallel layout of the forward FFTW output is always the same as that
355:         of the backward FFTW input, but the parallel layout of the forward FFTW
356:         input need not match that of the backward FFTW output.
357:         Also note that a parallel real transform needs extra space: zeros must be
358:         padded at the end of the last dimension. For this reason one has to invoke
359:         fftw_mpi_local_size_transposed() with the last dimension changed from n to
360:         n/2+1. The number of padded zeros depends on whether the last dimension is
361:         even or odd: two zeros for an even dimension, one for an odd one (e.g. n=4 is
362:         stored as 2*(4/2+1)=6 entries, two extra; n=5 as 2*(5/2+1)=6, one extra).
363:         Lastly, some scratch space is needed at the end of the data set on each
364:         process; alloc_local figures out how much, i.e. it computes the data+scratch
365:         space needed on each process and returns that.

367: .seealso: MatCreateFFTW()
368: @*/
369: PetscErrorCode MatGetVecsFFTW(Mat A,Vec *x,Vec *y,Vec *z)
370: {

374:   PetscTryMethod(A,"MatGetVecsFFTW_C",(Mat,Vec*,Vec*,Vec*),(A,x,y,z));
375:   return(0);
376: }
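
/* Illustrative sketch (the 2D sizes are assumptions made for this example):
   obtaining work vectors whose parallel layout matches the FFTW matrix.

     Mat      A;
     Vec      x,y,z;             (x: forward input, y: forward output, z: backward output)
     PetscInt dim[2] = {32,32};

     ierr = MatCreateFFT(PETSC_COMM_WORLD,2,dim,MATFFTW,&A);CHKERRQ(ierr);
     ierr = MatGetVecsFFTW(A,&x,&y,&z);CHKERRQ(ierr);

   Any of the three output arguments may be passed as PETSC_NULL when the
   corresponding vector is not needed.
*/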

378: EXTERN_C_BEGIN
381: PetscErrorCode  MatGetVecsFFTW_FFTW(Mat A,Vec *fin,Vec *fout,Vec *bout)
382: {
384:   PetscMPIInt    size,rank;
385:   MPI_Comm       comm=((PetscObject)A)->comm;
386:   Mat_FFT        *fft = (Mat_FFT*)A->data;
387:   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
388:   PetscInt       N=fft->N;
389:   PetscInt       ndim=fft->ndim,*dim=fft->dim,n=fft->n;


395:   MPI_Comm_size(comm, &size);
396:   MPI_Comm_rank(comm, &rank);
397:   if (size == 1){ /* sequential case */
398: #if defined(PETSC_USE_COMPLEX)
399:     if (fin) {VecCreateSeq(PETSC_COMM_SELF,N,fin);}
400:     if (fout){VecCreateSeq(PETSC_COMM_SELF,N,fout);}
401:     if (bout){VecCreateSeq(PETSC_COMM_SELF,N,bout);}
402: #else
403:     if (fin) {VecCreateSeq(PETSC_COMM_SELF,n,fin);}
404:     if (fout){VecCreateSeq(PETSC_COMM_SELF,n,fout);}
405:     if (bout){VecCreateSeq(PETSC_COMM_SELF,n,bout);}
406: #endif
407:   } else { /* parallel cases */
408:     ptrdiff_t      alloc_local,local_n0,local_0_start;
409:     ptrdiff_t      local_n1;
410:     fftw_complex   *data_fout;
411:     ptrdiff_t      local_1_start;
412: #if defined(PETSC_USE_COMPLEX)
413:     fftw_complex   *data_fin,*data_bout;
414: #else
415:     double         *data_finr,*data_boutr;
416:     PetscInt       n1,N1;
417:     ptrdiff_t      temp;
418: #endif

420:     switch (ndim){
421:     case 1:
422: #if !defined(PETSC_USE_COMPLEX)
423:       SETERRQ(comm,PETSC_ERR_SUP,"FFTW does not allow parallel real 1D transform");
424: #else
425:       alloc_local = fftw_mpi_local_size_1d(dim[0],comm,FFTW_FORWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
426:       if (fin) {
427:         data_fin  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
428:         VecCreateMPIWithArray(comm,1,local_n0,N,(const PetscScalar*)data_fin,fin);
429:         (*fin)->ops->destroy = VecDestroy_MPIFFTW;
430:       }
431:       if (fout) {
432:         data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
433:         VecCreateMPIWithArray(comm,1,local_n1,N,(const PetscScalar*)data_fout,fout);
434:         (*fout)->ops->destroy = VecDestroy_MPIFFTW;
435:       }
436:       alloc_local = fftw_mpi_local_size_1d(dim[0],comm,FFTW_BACKWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
437:       if (bout) {
438:         data_bout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
439:         VecCreateMPIWithArray(comm,1,local_n1,N,(const PetscScalar*)data_bout,bout);
440:         (*bout)->ops->destroy = VecDestroy_MPIFFTW;
441:       }
442:       break;
443: #endif
444:     case 2:
445: #if !defined(PETSC_USE_COMPLEX) /* Note that N1 is no more the product of individual dimensions */
446:       alloc_local =  fftw_mpi_local_size_2d_transposed(dim[0],dim[1]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
447:       N1 = 2*dim[0]*(dim[1]/2+1); n1 = 2*local_n0*(dim[1]/2+1);
448:       if (fin) {
449:         data_finr=(double *)fftw_malloc(sizeof(double)*alloc_local*2);
450:         VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_finr,fin);
451:         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
452:       }
453:       if (bout) {
454:         data_boutr=(double *)fftw_malloc(sizeof(double)*alloc_local*2);
455:         VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_boutr,bout);
456:         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
457:       }
458:       if (fout) {
459:         data_fout=(fftw_complex *)fftw_malloc(sizeof(fftw_complex)*alloc_local);
460:         VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_fout,fout);
461:         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
462:       }
463: #else
464:       /* Get local size */
465:       alloc_local = fftw_mpi_local_size_2d(dim[0],dim[1],comm,&local_n0,&local_0_start);
466:       if (fin) {
467:         data_fin  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
468:         VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fin,fin);
469:         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
470:       }
471:       if (fout) {
472:         data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
473:         VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fout,fout);
474:         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
475:       }
476:       if (bout) {
477:         data_bout  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
478:         VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_bout,bout);
479:         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
480:       }
481: #endif
482:       break;
483:     case 3:
484: #if !defined(PETSC_USE_COMPLEX)
485:       alloc_local =  fftw_mpi_local_size_3d_transposed(dim[0],dim[1],dim[2]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
486:       N1 = 2*dim[0]*dim[1]*(dim[2]/2+1); n1 = 2*local_n0*dim[1]*(dim[2]/2+1);
487:       if (fin) {
488:         data_finr=(double *)fftw_malloc(sizeof(double)*alloc_local*2);
489:         VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_finr,fin);
490:         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
491:       }
492:       if (bout) {
493:         data_boutr=(double *)fftw_malloc(sizeof(double)*alloc_local*2);
494:         VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_boutr,bout);
495:         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
496:       }
497: 
498:       if (fout) {
499:         data_fout=(fftw_complex *)fftw_malloc(sizeof(fftw_complex)*alloc_local);
500:         VecCreateMPIWithArray(comm,1,n1,N1,(PetscScalar*)data_fout,fout);
501:         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
502:       }
503: #else
504:       alloc_local = fftw_mpi_local_size_3d(dim[0],dim[1],dim[2],comm,&local_n0,&local_0_start);
505:       if (fin) {
506:         data_fin  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
507:         VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fin,fin);
508:         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
509:       }
510:       if (fout) {
511:         data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
512:         VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fout,fout);
513:         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
514:       }
515:       if (bout) {
516:         data_bout  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
517:         VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_bout,bout);
518:         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
519:       }
520: #endif
521:       break;
522:     default:
523: #if !defined(PETSC_USE_COMPLEX)
524:       temp = (fftw->dim_fftw)[fftw->ndim_fftw-1];
525:       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp/2 + 1;
526:       alloc_local = fftw_mpi_local_size_transposed(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
527:       N1 = 2*N*(PetscInt)((fftw->dim_fftw)[fftw->ndim_fftw-1])/((PetscInt) temp);
528:       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp;

530:       if (fin) {
531:         data_finr=(double *)fftw_malloc(sizeof(double)*alloc_local*2);
532:         VecCreateMPIWithArray(comm,1,(PetscInt)n,N1,(PetscScalar*)data_finr,fin);
533:         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
534:       }
535:       if (bout) {
536:         data_boutr=(double *)fftw_malloc(sizeof(double)*alloc_local*2);
537:         VecCreateMPIWithArray(comm,1,(PetscInt)n,N1,(PetscScalar*)data_boutr,bout);
538:         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
539:       }
540: 
541:       if (fout) {
542:         data_fout=(fftw_complex *)fftw_malloc(sizeof(fftw_complex)*alloc_local);
543:         VecCreateMPIWithArray(comm,1,n,N1,(PetscScalar*)data_fout,fout);
544:         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
545:       }
546: #else
547:       alloc_local = fftw_mpi_local_size(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start);
548:       if (fin) {
549:         data_fin  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
550:         VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fin,fin);
551:         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
552:       }
553:       if (fout) {
554:         data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
555:         VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fout,fout);
556:         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
557:       }
558:       if (bout) {
559:         data_bout  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
560:         VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_bout,bout);
561:         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
562:       }
563: #endif
564:       break;
565:     }
566:   }
567:   return(0);
568: }
569: EXTERN_C_END

573: /*@
574:    VecScatterPetscToFFTW - Copies a PETSc vector into the vector that goes into the FFTW block.

576:    Collective on Mat

578:    Input Parameters:
579: +  A - FFTW matrix
580: -  x - the PETSc vector 

582:    Output Parameters:
583: .  y - the FFTW vector 

585:   Options Database Keys:
586: . -mat_fftw_plannerflags - set FFTW planner flags

588:    Level: intermediate

590:    Note: For a parallel real FFT, FFTW requires extra space at the end of the last dimension. This is required even when
591:          the transform is not done in place. The last dimension size must be changed to 2*(dim[last]/2+1) to accommodate
592:          these extra zeros. This routine does that job via a scatter operation.

594: .seealso: VecScatterFFTWToPetsc()
595: @*/
596: PetscErrorCode VecScatterPetscToFFTW(Mat A,Vec x,Vec y)
597: {

601:   PetscTryMethod(A,"VecScatterPetscToFFTW_C",(Mat,Vec,Vec),(A,x,y));
602:   return(0);
603: }

605: EXTERN_C_BEGIN
608: PetscErrorCode VecScatterPetscToFFTW_FFTW(Mat A,Vec x,Vec y)
609: {
611:   MPI_Comm       comm=((PetscObject)A)->comm;
612:   Mat_FFT        *fft = (Mat_FFT*)A->data;
613:   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
614:   PetscInt       N=fft->N;
615:   PetscInt       ndim=fft->ndim,*dim=fft->dim;
616:   PetscInt       low;
617:   PetscInt       rank,size,vsize,vsize1;
618:   ptrdiff_t      alloc_local,local_n0,local_0_start;
619:   ptrdiff_t      local_n1,local_1_start;
620:   VecScatter     vecscat;
621:   IS             list1,list2;
622: #if !defined(PETSC_USE_COMPLEX)
623:   PetscInt       i,j,k,partial_dim;
624:   PetscInt       *indx1, *indx2, tempindx, tempindx1;
625:   PetscInt       N1, n1 ,NM;
626:   ptrdiff_t      temp;
627: #endif

630:   MPI_Comm_size(comm, &size);
631:   MPI_Comm_rank(comm, &rank);
632:   VecGetOwnershipRange(y,&low,PETSC_NULL);

634:   if (size==1){
635:     VecGetSize(x,&vsize);
636:     VecGetSize(y,&vsize1);
637:     ISCreateStride(PETSC_COMM_SELF,N,0,1,&list1);
638:     VecScatterCreate(x,list1,y,list1,&vecscat);
639:     VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
640:     VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
641:     VecScatterDestroy(&vecscat);
642:     ISDestroy(&list1);
643:   } else {
644:     switch (ndim){
645:     case 1:
646: #if defined(PETSC_USE_COMPLEX)
647:       alloc_local = fftw_mpi_local_size_1d(dim[0],comm,FFTW_FORWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
648:       ISCreateStride(comm,local_n0,local_0_start,1,&list1);
649:       ISCreateStride(comm,local_n0,low,1,&list2);
650:       VecScatterCreate(x,list1,y,list2,&vecscat);
651:       VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
652:       VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
653:       VecScatterDestroy(&vecscat);
654:       ISDestroy(&list1);
655:       ISDestroy(&list2);
656: #else
657:       SETERRQ(comm,PETSC_ERR_SUP,"FFTW does not support parallel 1D real transform");
658: #endif
659:       break;
660:     case 2:
661: #if defined(PETSC_USE_COMPLEX)
662:       alloc_local = fftw_mpi_local_size_2d(dim[0],dim[1],comm,&local_n0,&local_0_start);
663:       ISCreateStride(comm,local_n0*dim[1],local_0_start*dim[1],1,&list1);
664:       ISCreateStride(comm,local_n0*dim[1],low,1,&list2);
665:       VecScatterCreate(x,list1,y,list2,&vecscat);
666:       VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
667:       VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
668:       VecScatterDestroy(&vecscat);
669:       ISDestroy(&list1);
670:       ISDestroy(&list2);
671: #else
672:       alloc_local =  fftw_mpi_local_size_2d_transposed(dim[0],dim[1]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
673:       N1 = 2*dim[0]*(dim[1]/2+1); n1 = 2*local_n0*(dim[1]/2+1);
674:       PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*dim[1],&indx1);
675:       PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*dim[1],&indx2);
676: 
677:       if (dim[1]%2==0){
678:         NM = dim[1]+2;
679:       } else {
680:         NM = dim[1]+1;
681:       }
682:       for (i=0;i<local_n0;i++){
683:          for (j=0;j<dim[1];j++){
684:            tempindx = i*dim[1] + j;
685:            tempindx1 = i*NM + j;
686:            indx1[tempindx]=local_0_start*dim[1]+tempindx;
687:            indx2[tempindx]=low+tempindx1;
688:          }
689:       }
690: 
691:       ISCreateGeneral(comm,local_n0*dim[1],indx1,PETSC_COPY_VALUES,&list1);
692:       ISCreateGeneral(comm,local_n0*dim[1],indx2,PETSC_COPY_VALUES,&list2);

694:       VecScatterCreate(x,list1,y,list2,&vecscat);
695:       VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
696:       VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
697:       VecScatterDestroy(&vecscat);
698:       ISDestroy(&list1);
699:       ISDestroy(&list2);
700:       PetscFree(indx1);
701:       PetscFree(indx2);
702: #endif
703:       break;

705:     case 3:
706: #if defined(PETSC_USE_COMPLEX)
707:       alloc_local = fftw_mpi_local_size_3d(dim[0],dim[1],dim[2],comm,&local_n0,&local_0_start);
708:       ISCreateStride(comm,local_n0*dim[1]*dim[2],local_0_start*dim[1]*dim[2],1,&list1);
709:       ISCreateStride(comm,local_n0*dim[1]*dim[2],low,1,&list2);
710:       VecScatterCreate(x,list1,y,list2,&vecscat);
711:       VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
712:       VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
713:       VecScatterDestroy(&vecscat);
714:       ISDestroy(&list1);
715:       ISDestroy(&list2);
716: #else
717:       alloc_local =  fftw_mpi_local_size_3d_transposed(dim[0],dim[1],dim[2]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
718:       N1 = 2*dim[0]*dim[1]*(dim[2]/2+1); n1 = 2*local_n0*dim[1]*(dim[2]/2+1);

720:       PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*dim[1]*dim[2],&indx1);
721:       PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*dim[1]*dim[2],&indx2);

723:       if (dim[2]%2==0)
724:         NM = dim[2]+2;
725:       else
726:         NM = dim[2]+1;

728:       for (i=0;i<local_n0;i++){
729:         for (j=0;j<dim[1];j++){
730:           for (k=0;k<dim[2];k++){
731:             tempindx = i*dim[1]*dim[2] + j*dim[2] + k;
732:             tempindx1 = i*dim[1]*NM + j*NM + k;
733:             indx1[tempindx]=local_0_start*dim[1]*dim[2]+tempindx;
734:             indx2[tempindx]=low+tempindx1;
735:           }
736:         }
737:       }

739:       ISCreateGeneral(comm,local_n0*dim[1]*dim[2],indx1,PETSC_COPY_VALUES,&list1);
740:       ISCreateGeneral(comm,local_n0*dim[1]*dim[2],indx2,PETSC_COPY_VALUES,&list2);
741:       VecScatterCreate(x,list1,y,list2,&vecscat);
742:       VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
743:       VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
744:       VecScatterDestroy(&vecscat);
745:       ISDestroy(&list1);
746:       ISDestroy(&list2);
747:       PetscFree(indx1);
748:       PetscFree(indx2);
749: #endif
750:       break;

752:     default:
753: #if defined(PETSC_USE_COMPLEX)
754:       alloc_local = fftw_mpi_local_size(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start);
755:       ISCreateStride(comm,local_n0*(fftw->partial_dim),local_0_start*(fftw->partial_dim),1,&list1);
756:       ISCreateStride(comm,local_n0*(fftw->partial_dim),low,1,&list2);
757:       VecScatterCreate(x,list1,y,list2,&vecscat);
758:       VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
759:       VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
760:       VecScatterDestroy(&vecscat);
761:       ISDestroy(&list1);
762:       ISDestroy(&list2);
763: #else
764:       temp = (fftw->dim_fftw)[fftw->ndim_fftw-1];
765:       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp/2 + 1;
766:       alloc_local = fftw_mpi_local_size_transposed(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
767:       N1 = 2*N*(PetscInt)((fftw->dim_fftw)[fftw->ndim_fftw-1])/((PetscInt) temp);
768:       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp;

770:       partial_dim = fftw->partial_dim;
771: 
772:       PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*partial_dim,&indx1);
773:       PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*partial_dim,&indx2);

775:       if (dim[ndim-1]%2==0)
776:         NM = 2;
777:       else
778:         NM = 1;

780:       j = low;
781:       for (i=0,k=1; i<((PetscInt)local_n0)*partial_dim;i++,k++){
782:         indx1[i] = local_0_start*partial_dim + i;
783:         indx2[i] = j;
784:         if (k%dim[ndim-1]==0){ j+=NM;}
785:         j++;
786:       }
787:       ISCreateGeneral(comm,local_n0*partial_dim,indx1,PETSC_COPY_VALUES,&list1);
788:       ISCreateGeneral(comm,local_n0*partial_dim,indx2,PETSC_COPY_VALUES,&list2);
789:       VecScatterCreate(x,list1,y,list2,&vecscat);
790:       VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
791:       VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
792:       VecScatterDestroy(&vecscat);
793:       ISDestroy(&list1);
794:       ISDestroy(&list2);
795:       PetscFree(indx1);
796:       PetscFree(indx2);
797: #endif
798:       break;
799:     }
800:   }
801:   return(0);
802: }
803: EXTERN_C_END

807: /*@
808:    VecScatterFFTWToPetsc - Converts the FFTW output back to a PETSc vector.

810:    Collective on Mat

812:     Input Parameters:
813: +   A - FFTW matrix
814: -   x - FFTW vector

816:    Output Parameters:
817: .  y - PETSc vector 

819:    Level: intermediate

821:    Note: For a real transform, the FFTW output of the backward DFT contains extra zeros at the end of the last dimension.
822:          VecScatterFFTWToPetsc removes those extra zeros.
823:    
824: .seealso: VecScatterPetscToFFTW()
825: @*/
826: PetscErrorCode VecScatterFFTWToPetsc(Mat A,Vec x,Vec y)
827: {
830:   PetscTryMethod(A,"VecScatterFFTWToPetsc_C",(Mat,Vec,Vec),(A,x,y));
831:   return(0);
832: }
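
/* Illustrative sketch (a full round trip; input/output are assumed user vectors
   of global length N = fft->N, and x,y,z come from MatGetVecsFFTW()): the two
   scatters bracket the transforms so a PETSc vector in the natural layout can
   be used despite FFTW's padded parallel layout for real transforms.

     ierr = VecScatterPetscToFFTW(A,input,x);CHKERRQ(ierr);    (pad into FFTW layout)
     ierr = MatMult(A,x,y);CHKERRQ(ierr);                      (forward DFT)
     ierr = MatMultTranspose(A,y,z);CHKERRQ(ierr);             (backward DFT)
     ierr = VecScatterFFTWToPetsc(A,z,output);CHKERRQ(ierr);   (strip the padding)
     ierr = VecScale(output,1.0/N);CHKERRQ(ierr);              (undo FFTW's scaling by N)
*/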

834: EXTERN_C_BEGIN
837: PetscErrorCode VecScatterFFTWToPetsc_FFTW(Mat A,Vec x,Vec y)
838: {
840:   MPI_Comm       comm=((PetscObject)A)->comm;
841:   Mat_FFT        *fft = (Mat_FFT*)A->data;
842:   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
843:   PetscInt       N=fft->N;
844:   PetscInt       ndim=fft->ndim,*dim=fft->dim;
845:   PetscInt       low;
846:   PetscInt       rank,size;
847:   ptrdiff_t      alloc_local,local_n0,local_0_start;
848:   ptrdiff_t      local_n1,local_1_start;
849:   VecScatter     vecscat;
850:   IS             list1,list2;
851: #if !defined(PETSC_USE_COMPLEX)
852:   PetscInt       i,j,k,partial_dim;
853:   PetscInt       *indx1, *indx2, tempindx, tempindx1;
854:   PetscInt       N1, n1 ,NM;
855:   ptrdiff_t      temp;
856: #endif

859:   MPI_Comm_size(comm, &size);
860:   MPI_Comm_rank(comm, &rank);
861:   VecGetOwnershipRange(x,&low,PETSC_NULL);
862: 
863:   if (size==1){
864:     ISCreateStride(comm,N,0,1,&list1);
865:     VecScatterCreate(x,list1,y,list1,&vecscat);
866:     VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
867:     VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
868:     VecScatterDestroy(&vecscat);
869:     ISDestroy(&list1);

871:   } else {
872:     switch (ndim){
873:     case 1:
874: #if defined(PETSC_USE_COMPLEX)
875:       alloc_local = fftw_mpi_local_size_1d(dim[0],comm,FFTW_BACKWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
876:       ISCreateStride(comm,local_n1,local_1_start,1,&list1);
877:       ISCreateStride(comm,local_n1,low,1,&list2);
878:       VecScatterCreate(x,list1,y,list2,&vecscat);
879:       VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
880:       VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
881:       VecScatterDestroy(&vecscat);
882:       ISDestroy(&list1);
883:       ISDestroy(&list2);
884: #else
885:       SETERRQ(comm,PETSC_ERR_SUP,"No support for real parallel 1D FFT");
886: #endif
887:       break;
888:     case 2:
889: #if defined(PETSC_USE_COMPLEX)
890:       alloc_local = fftw_mpi_local_size_2d(dim[0],dim[1],comm,&local_n0,&local_0_start);
891:       ISCreateStride(comm,local_n0*dim[1],local_0_start*dim[1],1,&list1);
892:       ISCreateStride(comm,local_n0*dim[1],low,1,&list2);
893:       VecScatterCreate(x,list2,y,list1,&vecscat);
894:       VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
895:       VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
896:       VecScatterDestroy(&vecscat);
897:       ISDestroy(&list1);
898:       ISDestroy(&list2);
899: #else
900:       alloc_local =  fftw_mpi_local_size_2d_transposed(dim[0],dim[1]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
901:       N1 = 2*dim[0]*(dim[1]/2+1); n1 = 2*local_n0*(dim[1]/2+1);

903:       PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*dim[1],&indx1);
904:       PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*dim[1],&indx2);

906:       if (dim[1]%2==0)
907:         NM = dim[1]+2;
908:       else
909:         NM = dim[1]+1;
910: 
911:       for (i=0;i<local_n0;i++){
912:         for (j=0;j<dim[1];j++){
913:           tempindx = i*dim[1] + j;
914:           tempindx1 = i*NM + j;
915:           indx1[tempindx]=local_0_start*dim[1]+tempindx;
916:           indx2[tempindx]=low+tempindx1;
917:         }
918:       }

920:       ISCreateGeneral(comm,local_n0*dim[1],indx1,PETSC_COPY_VALUES,&list1);
921:       ISCreateGeneral(comm,local_n0*dim[1],indx2,PETSC_COPY_VALUES,&list2);

923:       VecScatterCreate(x,list2,y,list1,&vecscat);
924:       VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
925:       VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
926:       VecScatterDestroy(&vecscat);
927:       ISDestroy(&list1);
928:       ISDestroy(&list2);
929:       PetscFree(indx1);
930:       PetscFree(indx2);
931: #endif
932:       break;
933:     case 3:
934: #if defined(PETSC_USE_COMPLEX)
935:       alloc_local = fftw_mpi_local_size_3d(dim[0],dim[1],dim[2],comm,&local_n0,&local_0_start);
936:       ISCreateStride(comm,local_n0*dim[1]*dim[2],local_0_start*dim[1]*dim[2],1,&list1);
937:       ISCreateStride(comm,local_n0*dim[1]*dim[2],low,1,&list2);
938:       VecScatterCreate(x,list1,y,list2,&vecscat);
939:       VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
940:       VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
941:       VecScatterDestroy(&vecscat);
942:       ISDestroy(&list1);
943:       ISDestroy(&list2);
944: #else

946:       alloc_local =  fftw_mpi_local_size_3d_transposed(dim[0],dim[1],dim[2]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
947:       N1 = 2*dim[0]*dim[1]*(dim[2]/2+1); n1 = 2*local_n0*dim[1]*(dim[2]/2+1);

949:       PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*dim[1]*dim[2],&indx1);
950:       PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*dim[1]*dim[2],&indx2);

952:       if (dim[2]%2==0)
953:         NM = dim[2]+2;
954:       else
955:         NM = dim[2]+1;

957:       for (i=0;i<local_n0;i++){
958:         for (j=0;j<dim[1];j++){
959:           for (k=0;k<dim[2];k++){
960:             tempindx = i*dim[1]*dim[2] + j*dim[2] + k;
961:             tempindx1 = i*dim[1]*NM + j*NM + k;
962:             indx1[tempindx]=local_0_start*dim[1]*dim[2]+tempindx;
963:             indx2[tempindx]=low+tempindx1;
964:           }
965:         }
966:       }

968:       ISCreateGeneral(comm,local_n0*dim[1]*dim[2],indx1,PETSC_COPY_VALUES,&list1);
969:       ISCreateGeneral(comm,local_n0*dim[1]*dim[2],indx2,PETSC_COPY_VALUES,&list2);

971:       VecScatterCreate(x,list2,y,list1,&vecscat);
972:       VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
973:       VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
974:       VecScatterDestroy(&vecscat);
975:       ISDestroy(&list1);
976:       ISDestroy(&list2);
977:       PetscFree(indx1);
978:       PetscFree(indx2);
979: #endif
980:       break;
981:     default:
982: #if defined(PETSC_USE_COMPLEX)
983:       alloc_local = fftw_mpi_local_size(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start);
984:       ISCreateStride(comm,local_n0*(fftw->partial_dim),local_0_start*(fftw->partial_dim),1,&list1);
985:       ISCreateStride(comm,local_n0*(fftw->partial_dim),low,1,&list2);
986:       VecScatterCreate(x,list1,y,list2,&vecscat);
987:       VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
988:       VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
989:       VecScatterDestroy(&vecscat);
990:       ISDestroy(&list1);
991:       ISDestroy(&list2);
992: #else
993:       temp = (fftw->dim_fftw)[fftw->ndim_fftw-1];
994:       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp/2 + 1;
995:       alloc_local = fftw_mpi_local_size_transposed(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
996:       N1 = 2*N*(PetscInt)((fftw->dim_fftw)[fftw->ndim_fftw-1])/((PetscInt) temp);
997:       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp;

999:       partial_dim = fftw->partial_dim;

1001:       PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*partial_dim,&indx1);
1002:       PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*partial_dim,&indx2);

1004:       if (dim[ndim-1]%2==0)
1005:         NM = 2;
1006:       else
1007:         NM = 1;

1009:       j = low;
1010:       for (i=0,k=1; i<((PetscInt)local_n0)*partial_dim;i++,k++){
1011:         indx1[i] = local_0_start*partial_dim + i;
1012:         indx2[i] = j;
1013:         if (k%dim[ndim-1]==0)j+=NM;
1014:         j++;
1015:       }
1016:       ISCreateGeneral(comm,local_n0*partial_dim,indx1,PETSC_COPY_VALUES,&list1);
1017:       ISCreateGeneral(comm,local_n0*partial_dim,indx2,PETSC_COPY_VALUES,&list2);

1019:       VecScatterCreate(x,list2,y,list1,&vecscat);
1020:       VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
1021:       VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);
1022:       VecScatterDestroy(&vecscat);
1023:       ISDestroy(&list1);
1024:       ISDestroy(&list2);
1025:       PetscFree(indx1);
1026:       PetscFree(indx2);
1027: #endif
1028:       break;
1029:     }
1030:   }
1031:   return(0);
1032: }
1033: EXTERN_C_END

1035: EXTERN_C_BEGIN
1038: /*
1039:     MatCreate_FFTW - Creates a matrix object that provides FFT via the external package FFTW

1041:   Options Database Keys:
1042: . -mat_fftw_plannerflags - set FFTW planner flags

1044:    Level: intermediate
1045:    
1046: */
1047: PetscErrorCode MatCreate_FFTW(Mat A)
1048: {
1050:   MPI_Comm       comm=((PetscObject)A)->comm;
1051:   Mat_FFT        *fft=(Mat_FFT*)A->data;
1052:   Mat_FFTW       *fftw;
1053:   PetscInt       n=fft->n,N=fft->N,ndim=fft->ndim,*dim = fft->dim;
1054:   const char     *p_flags[]={"FFTW_ESTIMATE","FFTW_MEASURE","FFTW_PATIENT","FFTW_EXHAUSTIVE"};
1055:   PetscBool      flg;
1056:   PetscInt       p_flag,partial_dim=1,ctr;
1057:   PetscMPIInt    size,rank;
1058:   ptrdiff_t      *pdim;
1059:   ptrdiff_t      local_n1,local_1_start;
1060: #if !defined(PETSC_USE_COMPLEX)
1061:    ptrdiff_t     temp;
1062:    PetscInt      N1,tot_dim;
1063: #else 
1064:    PetscInt      n1;
1065: #endif

1068:   MPI_Comm_size(comm, &size);
1069:   MPI_Comm_rank(comm, &rank);

1071:   fftw_mpi_init();
1072:   pdim = (ptrdiff_t *)calloc(ndim,sizeof(ptrdiff_t));
1073:   pdim[0] = dim[0];
1074: #if !defined(PETSC_USE_COMPLEX)
1075:   tot_dim = 2*dim[0];
1076: #endif
1077:   for (ctr=1;ctr<ndim;ctr++){
1078:     partial_dim *= dim[ctr];
1079:     pdim[ctr] = dim[ctr];
1080: #if !defined(PETSC_USE_COMPLEX)
1081:     if (ctr==ndim-1)
1082:       tot_dim *= (dim[ctr]/2+1);
1083:     else
1084:       tot_dim *= dim[ctr];
1085: #endif
1086:   }
1087: 
1088:   if (size == 1) {
1089: #if defined(PETSC_USE_COMPLEX)
1090:     MatSetSizes(A,N,N,N,N);
1091:     n = N;
1092: #else
1093:     MatSetSizes(A,tot_dim,tot_dim,tot_dim,tot_dim);
1094:     n = tot_dim;
1095: #endif

1097:   } else {
1098:     ptrdiff_t alloc_local,local_n0,local_0_start;
1099:     switch (ndim){
1100:     case 1:
1101: #if !defined(PETSC_USE_COMPLEX)
1102:       SETERRQ(comm,PETSC_ERR_SUP,"FFTW does not support parallel 1D real transform");
1103: #else
1104:       alloc_local = fftw_mpi_local_size_1d(dim[0],comm,FFTW_FORWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
1105:       n = (PetscInt)local_n0;
1106:       n1 = (PetscInt) local_n1;
1107:       MatSetSizes(A,n1,n,N,N);
1108: #endif
1109:       break;
1110:     case 2:
1111: #if defined(PETSC_USE_COMPLEX)
1112:       alloc_local = fftw_mpi_local_size_2d(dim[0],dim[1],comm,&local_n0,&local_0_start);
1113:       /*
1114:        PetscSynchronizedPrintf(comm,"[%d] MatCreateSeqFFTW: local_n0, local_0_start %d %d, N %d,dim %d, %d\n",rank,(PetscInt)local_n0*dim[1],(PetscInt)local_0_start,m,dim[0],dim[1]);
1115:        PetscSynchronizedFlush(comm);
1116:        */
1117:       n = (PetscInt)local_n0*dim[1];
1118:       MatSetSizes(A,n,n,N,N);
1119: #else
1120:       alloc_local = fftw_mpi_local_size_2d_transposed(dim[0],dim[1]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
1121:       n = 2*(PetscInt)local_n0*(dim[1]/2+1);
1122:       MatSetSizes(A,n,n,2*dim[0]*(dim[1]/2+1),2*dim[0]*(dim[1]/2+1));
1123: #endif 
1124:       break;
1125:     case 3:
1126: #if defined(PETSC_USE_COMPLEX)
1127:       alloc_local = fftw_mpi_local_size_3d(dim[0],dim[1],dim[2],comm,&local_n0,&local_0_start);
1128:       n = (PetscInt)local_n0*dim[1]*dim[2];
1129:       MatSetSizes(A,n,n,N,N);
1130: #else
1131:       alloc_local = fftw_mpi_local_size_3d_transposed(dim[0],dim[1],dim[2]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
1132:       n = 2*(PetscInt)local_n0*dim[1]*(dim[2]/2+1);
1133:       MatSetSizes(A,n,n,2*dim[0]*dim[1]*(dim[2]/2+1),2*dim[0]*dim[1]*(dim[2]/2+1));
1134: #endif 
1135:       break;
1136:     default:
1137: #if defined(PETSC_USE_COMPLEX)
1138:       alloc_local = fftw_mpi_local_size(ndim,pdim,comm,&local_n0,&local_0_start);
1139:       n = (PetscInt)local_n0*partial_dim;
1140:       MatSetSizes(A,n,n,N,N);
1141: #else
1142:       temp = pdim[ndim-1];
1143:       pdim[ndim-1] = temp/2 + 1;
1144:       alloc_local = fftw_mpi_local_size_transposed(ndim,pdim,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
1145:       n = 2*(PetscInt)local_n0*partial_dim*pdim[ndim-1]/temp;
1146:       N1 = 2*N*(PetscInt)pdim[ndim-1]/((PetscInt) temp);
1147:       pdim[ndim-1] = temp;
1148:       MatSetSizes(A,n,n,N1,N1);
1149: #endif  
1150:       break;
1151:     }
1152:   }
1153:   PetscObjectChangeTypeName((PetscObject)A,MATFFTW);
1154:   PetscNewLog(A,Mat_FFTW,&fftw);
1155:   fft->data = (void*)fftw;
1156: 
1157:   fft->n            = n;
1158:   fftw->ndim_fftw   = (ptrdiff_t)ndim; /* this is the dimension of the fft */
1159:   fftw->partial_dim = partial_dim;
1160:   PetscMalloc(ndim*sizeof(ptrdiff_t), (ptrdiff_t *)&(fftw->dim_fftw));
1161:   for(ctr=0;ctr<ndim;ctr++) (fftw->dim_fftw)[ctr]=dim[ctr];
1162: 
1163:   fftw->p_forward  = 0;
1164:   fftw->p_backward = 0;
1165:   fftw->p_flag     = FFTW_ESTIMATE;

1167:   if (size == 1){
1168:     A->ops->mult          = MatMult_SeqFFTW;
1169:     A->ops->multtranspose = MatMultTranspose_SeqFFTW;
1170:   } else {
1171:     A->ops->mult          = MatMult_MPIFFTW;
1172:     A->ops->multtranspose = MatMultTranspose_MPIFFTW;
1173:   }
1174:   fft->matdestroy = MatDestroy_FFTW;
1175:   A->assembled    = PETSC_TRUE;
1176:   A->preallocated = PETSC_TRUE;
1177:   PetscObjectComposeFunctionDynamic((PetscObject)A,"MatGetVecsFFTW_C","MatGetVecsFFTW_FFTW",MatGetVecsFFTW_FFTW);
1178:   PetscObjectComposeFunctionDynamic((PetscObject)A,"VecScatterPetscToFFTW_C","VecScatterPetscToFFTW_FFTW",VecScatterPetscToFFTW_FFTW);
1179:   PetscObjectComposeFunctionDynamic((PetscObject)A,"VecScatterFFTWToPetsc_C","VecScatterFFTWToPetsc_FFTW",VecScatterFFTWToPetsc_FFTW);
1180: 
1181:   /* get runtime options */
1182:   PetscOptionsBegin(((PetscObject)A)->comm,((PetscObject)A)->prefix,"FFTW Options","Mat");
1183:     PetscOptionsEList("-mat_fftw_plannerflags","Planner Flags","None",p_flags,4,p_flags[0],&p_flag,&flg);
1184:     if (flg) {fftw->p_flag = (unsigned)p_flag;}
1185:   PetscOptionsEnd();
1186:   return(0);
1187: }
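
/* Example (command line, illustrative; the executable name is a placeholder):
   selecting a more aggressive planner at runtime,

     ./myapp -mat_fftw_plannerflags FFTW_MEASURE

   Note that with any flag other than FFTW_ESTIMATE, FFTW overwrites the in/out
   arrays during planning, so plans should be created before the input vector is
   filled (see the warnings in MatMult_SeqFFTW() and MatMult_MPIFFTW() above). */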
1188: EXTERN_C_END