Actual source code: seqaijpthread.c

petsc-3.3-p7 2013-05-11
  2: #include <petsc-private/vecimpl.h>
  3: #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
  4: #include <../src/sys/objects/pthread/pthreadimpl.h>
  5: #include <../src/mat/impls/aij/seq/seqpthread/seqaijpthread.h>

  7: static PetscInt    mats_created=0;
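/*
   Note: mats_created counts the live MATSEQAIJPTHREAD matrices.  The shared
   per-thread argument buffers mat_kerneldatap[] and mat_pdata[] used below are
   presumably declared in seqaijpthread.h (included above); they are allocated
   when the first matrix is created and freed when the last one is destroyed,
   see MatCreate_SeqAIJPThread() and MatDestroy_SeqAIJPThread().
*/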

  9: PetscErrorCode MatRealPart_Kernel(void* arg)
 10: {
 11:   Mat_KernelData    *data=(Mat_KernelData*)arg;
 12:   const PetscInt    *ai = (const PetscInt*)data->ai;
 13:   MatScalar         *aabase = data->aa,*aa;
 14:   PetscInt          nrows=data->nrows;
 15:   PetscInt          nz,i;

 17: 
 18:   nz = ai[nrows] - ai[0];
 19:   aa = aabase + ai[0];
 20:   for(i=0;i<nz;i++) aa[i] = PetscRealPart(aa[i]);

 22:   return(0);
 23: }

 27: PetscErrorCode MatRealPart_SeqAIJPThread(Mat A)
 28: {
 29:   PetscErrorCode     ierr;
 30:   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
 31:   PetscThreadsLayout tmap=A->rmap->tmap;
 32:   PetscInt           *trstarts = tmap->trstarts;
 33:   PetscInt           i;
 34:   MatScalar          *aa=a->a;
 35:   PetscInt           *ai=a->i;


 39:   for(i=0; i < tmap->nthreads; i++) {
 40:     mat_kerneldatap[i].ai = ai + trstarts[i];
 41:     mat_kerneldatap[i].aa = aa;
 42:     mat_kerneldatap[i].nrows = trstarts[i+1] - trstarts[i];
 43:     mat_pdata[i] = &mat_kerneldatap[i];
 44:   }
 45:   PetscThreadsRunKernel(MatRealPart_Kernel,(void**)mat_pdata,tmap->nthreads,tmap->affinity);

 47:   a->idiagvalid = PETSC_FALSE;
 48:   a->ibdiagvalid = PETSC_FALSE;
 49:   return(0);
 50: }
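/*
   All *_SeqAIJPThread routines in this file follow the decomposition pattern
   shown above: the threads layout tmap->trstarts[] splits the rows into
   contiguous blocks, thread i owning rows trstarts[i] .. trstarts[i+1]-1.
   Each kernel receives the row pointer shifted to its block (ai + trstarts[i]),
   so that inside the kernel

       nz = ai[nrows] - ai[0];
       aa = aabase + ai[0];

   give the number of nonzeros in, and the first stored value of, exactly the
   entries owned by that thread.  PetscThreadsRunKernel() then runs the kernel
   on tmap->nthreads threads with the CPU affinities held in tmap->affinity.
*/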

 52: PetscErrorCode MatImaginaryPart_Kernel(void* arg)
 53: {
 54:   Mat_KernelData    *data=(Mat_KernelData*)arg;
 55:   const PetscInt    *ai = (const PetscInt*)data->ai;
 56:   MatScalar         *aabase = data->aa,*aa;
 57:   PetscInt          nrows=data->nrows;
 58:   PetscInt          nz,i;

 60: 
 61:   nz = ai[nrows] - ai[0];
 62:   aa = aabase + ai[0];
 63:   for(i=0;i<nz;i++) aa[i] = PetscImaginaryPart(aa[i]);

 65:   return(0);
 66: }

 70: PetscErrorCode MatImaginaryPart_SeqAIJPThread(Mat A)
 71: {
 72:   PetscErrorCode     ierr;
 73:   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
 74:   PetscThreadsLayout tmap=A->rmap->tmap;
 75:   PetscInt           *trstarts = tmap->trstarts;
 76:   PetscInt           i;
 77:   MatScalar          *aa=a->a;
 78:   PetscInt           *ai=a->i;


 82:   for(i=0; i < tmap->nthreads; i++) {
 83:     mat_kerneldatap[i].ai = ai + trstarts[i];
 84:     mat_kerneldatap[i].aa = aa;
 85:     mat_kerneldatap[i].nrows = trstarts[i+1] - trstarts[i];
 86:     mat_pdata[i] = &mat_kerneldatap[i];
 87:   }
 88:   PetscThreadsRunKernel(MatImaginaryPart_Kernel,(void**)mat_pdata,tmap->nthreads,tmap->affinity);

 90:   a->idiagvalid = PETSC_FALSE;
 91:   a->ibdiagvalid = PETSC_FALSE;
 92:   return(0);
 93: }

 95: PetscErrorCode MatFactorGetDiagonal_Kernel(void* arg)
 96: {
 97:   Mat_KernelData    *data=(Mat_KernelData*)arg;
 98:   const PetscInt    *adiag = (const PetscInt*)data->adiag;
 99:   const MatScalar   *aa = (const MatScalar*)data->aa;
100:   PetscScalar       *x = (PetscScalar*)data->x;
101:   PetscInt           nrows=(PetscInt)data->nrows,i;

103: 
104:   for(i=0;i < nrows;i++) {
105:     x[i] = 1.0/aa[adiag[i]];
106:   }
107:   return(0);
108: }

110: PetscErrorCode MatGetDiagonal_Kernel(void* arg)
111: {
112:   Mat_KernelData    *data=(Mat_KernelData*)arg;
113:   const PetscInt    *ai = (const PetscInt*)data->ai,*aj = (const PetscInt*)data->aj;
114:   const MatScalar   *aa = (const MatScalar*)data->aa;
115:   PetscScalar       *x = (PetscScalar*)data->x;
116:   PetscInt           nrows=(PetscInt)data->nrows;
117:   PetscInt           i,j,row;
118:   PetscInt           rstart=(PetscInt)data->rstart;

120: 
121:   for(i=0;i < nrows;i++) {
122:     row = rstart+i;
123:     for(j=ai[i]; j < ai[i+1];j++) {
124:       if(aj[j] == row) {
125:         x[i] = aa[j];
126:         break;
127:       }
128:     }
129:   }
130:   return(0);
131: }

135: PetscErrorCode MatGetDiagonal_SeqAIJPThread(Mat A,Vec v)
136: {
137:   PetscErrorCode     ierr;
138:   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
139:   PetscThreadsLayout tmap=A->rmap->tmap;
140:   PetscInt           *trstarts=tmap->trstarts;
141:   PetscScalar        *x;
142:   MatScalar          *aa=a->a;
143:   PetscInt           *ai=a->i,*aj=a->j;
144:   PetscInt           i,n;

147:   VecGetLocalSize(v,&n);
148:   if (n != A->rmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");

150:   if(A->factortype == MAT_FACTOR_ILU || A->factortype == MAT_FACTOR_LU) {
151:     PetscInt *diag=a->diag;
152:     VecGetArray(v,&x);
153:     for(i=0;i < tmap->nthreads; i++) {
154:       mat_kerneldatap[i].nrows = trstarts[i+1] - trstarts[i];
155:       mat_kerneldatap[i].aa    = aa;
156:       mat_kerneldatap[i].adiag = diag + trstarts[i];
157:       mat_kerneldatap[i].x     = x + trstarts[i];
158:       mat_pdata[i]             = &mat_kerneldatap[i];
159:     }
160:     PetscThreadsRunKernel(MatFactorGetDiagonal_Kernel,(void**)mat_pdata,tmap->nthreads,tmap->affinity);
161:     VecRestoreArray(v,&x);
162:     return(0);
163:   }

165:   VecSet(v,0.0);
166:   VecGetArray(v,&x);
167:   for(i=0;i < tmap->nthreads; i++) {
168:     mat_kerneldatap[i].ai     = ai + trstarts[i];
169:     mat_kerneldatap[i].rstart = trstarts[i];
170:     mat_kerneldatap[i].aj     = aj;
171:     mat_kerneldatap[i].aa     = aa;
172:     mat_kerneldatap[i].nrows  = trstarts[i+1]-trstarts[i];
173:     mat_kerneldatap[i].x      = x + trstarts[i];
174:     mat_pdata[i]              = &mat_kerneldatap[i];
175:   }
176:   PetscThreadsRunKernel(MatGetDiagonal_Kernel,(void**)mat_pdata,tmap->nthreads,tmap->affinity);
177:   VecRestoreArray(v,&x);
178:   return(0);
179: }

181: PetscErrorCode MatZeroEntries_Kernel(void* arg)
182: {
183:   Mat_KernelData    *data=(Mat_KernelData*)arg;
184:   const PetscInt    *ai = (const PetscInt*)data->ai;
185:   MatScalar         *aabase = data->aa;
186:   MatScalar         *aa;
187:   PetscInt          nrows=data->nrows;
188:   PetscInt          nz;

190: 
191:   nz = ai[nrows] - ai[0];
192:   aa = aabase + ai[0];
193:   PetscMemzero(aa,nz*sizeof(PetscScalar));

195:   return(0);
196: }

200: PetscErrorCode MatZeroEntries_SeqAIJPThread(Mat A)
201: {
202:   PetscErrorCode     ierr;
203:   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
204:   PetscThreadsLayout tmap=A->rmap->tmap;
205:   PetscInt           *trstarts = tmap->trstarts;
206:   PetscInt           i;
207:   MatScalar          *aa=a->a;
208:   PetscInt           *ai=a->i;


212:   for(i=0; i < tmap->nthreads; i++) {
213:     mat_kerneldatap[i].ai = ai + trstarts[i];
214:     mat_kerneldatap[i].aa = aa;
215:     mat_kerneldatap[i].nrows = trstarts[i+1] - trstarts[i];
216:     mat_pdata[i] = &mat_kerneldatap[i];
217:   }
218: 
219:   PetscThreadsRunKernel(MatZeroEntries_Kernel,(void**)mat_pdata,tmap->nthreads,tmap->affinity);
220: 
221:   return(0);
222: }

224: PetscErrorCode MatMult_Kernel(void* arg)
225: {
226:   Mat_KernelData    *data=(Mat_KernelData*)arg;
227:   const PetscInt    *ai = (const PetscInt*)data->ai,*ajbase = (const PetscInt*)data->aj,*aj;
228:   const MatScalar   *aabase = (const MatScalar*)data->aa,*aa;
229:   const PetscScalar *x = (const PetscScalar*)data->x;
230:   PetscScalar       *y = data->y;
231:   PetscInt           nrows = data->nrows;
232:   PetscInt           nz,i;
233:   PetscScalar        sum;

235: 
236:   data->nonzerorow = 0;
237:   for(i=0;i<nrows;i++) {
238:     nz = ai[i+1] - ai[i];
239:     aj = ajbase + ai[i];
240:     aa = aabase + ai[i];
241:     sum = 0.0;
242:     data->nonzerorow += (nz>0);
243:     PetscSparseDensePlusDot(sum,x,aa,aj,nz);
244:     y[i] = sum;
245:   }
246:   return(0);
247: }

251: PetscErrorCode MatMult_SeqAIJPThread(Mat A,Vec xx,Vec yy)
252: {
253:   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
254:   PetscThreadsLayout tmap=A->rmap->tmap;
255:   PetscInt          *trstarts=tmap->trstarts;
256:   PetscScalar       *x,*y;
257:   PetscErrorCode    ierr;
258:   MatScalar         *aa = a->a;
259:   PetscInt          *ai = a->i, *aj = a->j;
260:   PetscInt          i,nonzerorow=0;


264:   if(a->compressedrow.use) {
265:     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Compressed row format not supported yet");
266:   }
267:   VecGetArray(xx,&x);
268:   VecGetArray(yy,&y);

270:   for(i=0;i < tmap->nthreads;i++) {
271:     mat_kerneldatap[i].ai    = ai + trstarts[i];
272:     mat_kerneldatap[i].aa    = aa;
273:     mat_kerneldatap[i].aj    = aj;
274:     mat_kerneldatap[i].nrows = trstarts[i+1] - trstarts[i];
275:     mat_kerneldatap[i].x     = x;
276:     mat_kerneldatap[i].y     = y + trstarts[i];
277:     mat_pdata[i]             = &mat_kerneldatap[i];
278:   }
279:   PetscThreadsRunKernel(MatMult_Kernel,(void**)mat_pdata,tmap->nthreads,tmap->affinity);

281:   for(i=0;i< tmap->nthreads;i++) nonzerorow += mat_kerneldatap[i].nonzerorow;

283:   PetscLogFlops(2.0*a->nz - nonzerorow);
284:   VecRestoreArray(xx,&x);
285:   VecRestoreArray(yy,&y);

287:   return(0);
288: }
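/*
   Note on MatMult_SeqAIJPThread(): each thread writes a disjoint slice of the
   output (y + trstarts[i]) but reads the whole input array x, so no locking is
   needed; the per-thread nonzerorow counts are summed afterwards so that
   PetscLogFlops() can record 2*nz - nonzerorow flops, the usual AIJ count.

   Minimal usage sketch (assumes A is an assembled n-by-n MATSEQAIJPTHREAD
   matrix; x and y are hypothetical work vectors); MatMult() dispatches to the
   routine above:

       Vec x,y;
       ierr = VecCreateSeq(PETSC_COMM_SELF,n,&x);CHKERRQ(ierr);
       ierr = VecDuplicate(x,&y);CHKERRQ(ierr);
       ierr = VecSet(x,1.0);CHKERRQ(ierr);
       ierr = MatMult(A,x,y);CHKERRQ(ierr);
*/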

290: PetscErrorCode MatMultAdd_Kernel(void* arg)
291: {
292:   Mat_KernelData    *data=(Mat_KernelData*)arg;
293:   const PetscInt    *ai = (const PetscInt*)data->ai,*aj = (const PetscInt*)data->aj;
294:   const MatScalar   *aa = (const MatScalar*)data->aa;
295:   const PetscScalar *x = (const PetscScalar*)data->x;
296:   PetscScalar       *y = data->y;
297:   PetscScalar       *z = data->z;
298:   PetscInt           nrows = data->nrows;
299:   PetscInt           nz,i;
300:   PetscScalar        sum;

302: 
303:   data->nonzerorow = 0;
304:   for(i=0;i<nrows;i++) {
305:     nz = ai[i+1] - ai[i];
306:     aj = data->aj + ai[i];
307:     aa = data->aa + ai[i];
308:     sum = y[i];
309:     data->nonzerorow += (nz>0);
310:     PetscSparseDensePlusDot(sum,x,aa,aj,nz);
311:     z[i] = sum;
312:   }
313:   return(0);
314: }
315: 
318: PetscErrorCode MatMultAdd_SeqAIJPThread(Mat A,Vec xx,Vec yy,Vec zz)
319: {
320:   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
321:   PetscThreadsLayout tmap=A->rmap->tmap;
322:   PetscInt          *trstarts=tmap->trstarts;
323:   PetscScalar       *x,*y,*z;
324:   PetscErrorCode    ierr;
325:   MatScalar         *aa = a->a;
326:   PetscInt          *ai = a->i, *aj = a->j;
327:   PetscInt          i,nonzerorow=0;


331:   if(a->compressedrow.use) {
332:     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Compressed row format not supported yet");
333:   }
334:   VecGetArray(xx,&x);
335:   VecGetArray(yy,&y);
336:   if(zz != yy) {
337:     VecGetArray(zz,&z);
338:   } else z = y;

340:   for(i=0;i < tmap->nthreads;i++) {
341:     mat_kerneldatap[i].ai    = ai + trstarts[i];
342:     mat_kerneldatap[i].aa    = aa;
343:     mat_kerneldatap[i].aj    = aj;
344:     mat_kerneldatap[i].nrows = trstarts[i+1]-trstarts[i];
345:     mat_kerneldatap[i].x     = x;
346:     mat_kerneldatap[i].y     = y + trstarts[i];
347:     mat_kerneldatap[i].z     = z + trstarts[i];
348:     mat_pdata[i]             = &mat_kerneldatap[i];
349:   }
350:   PetscThreadsRunKernel(MatMultAdd_Kernel,(void**)mat_pdata,tmap->nthreads,tmap->affinity);

352:   for(i=0;i< tmap->nthreads;i++) nonzerorow += mat_kerneldatap[i].nonzerorow;

354:   PetscLogFlops(2.0*a->nz - nonzerorow);
355:   VecRestoreArray(xx,&x);
356:   VecRestoreArray(yy,&y);
357:   if(zz != yy) {
358:     VecRestoreArray(zz,&z);
359:   }

361:   return(0);
362: }

364: PetscErrorCode MatMarkDiagonal_Kernel(void* arg)
365: {
366:   Mat_KernelData    *data=(Mat_KernelData*)arg;
367:   const PetscInt    *ai = (const PetscInt*)data->ai,*aj = (const PetscInt*)data->aj;
368:   PetscInt          *adiag=(PetscInt*)data->adiag;
369:   PetscInt           nrows=(PetscInt)data->nrows;
370:   PetscInt           i,j,row;
371:   PetscInt           rstart=(PetscInt)data->rstart;

373: 
374:   for(i=0;i < nrows;i++) {
375:     row = rstart+i;
376:     for(j=ai[i]; j < ai[i+1];j++) {
377:       if(aj[j] == row) {
378:         adiag[i] = j;
379:         break;
380:       }
381:     }
382:   }
383:   return(0);
384: }

388: PetscErrorCode MatMarkDiagonal_SeqAIJPThread(Mat A)
389: {
390:   PetscErrorCode     ierr;
391:   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
392:   PetscThreadsLayout tmap=A->rmap->tmap;
393:   PetscInt           *trstarts=tmap->trstarts;
394:   PetscInt           *ai=a->i,*aj=a->j;
395:   PetscInt           i,m=A->rmap->n;


399:   if(!a->diag) {
400:     PetscMalloc(m*sizeof(PetscInt),&a->diag);
401:     PetscLogObjectMemory(A,m*sizeof(PetscInt));
402:   }

404:   for(i=0;i < tmap->nthreads; i++) {
405:     mat_kerneldatap[i].nrows  = trstarts[i+1] - trstarts[i];
406:     mat_kerneldatap[i].adiag  = a->diag + trstarts[i];
407:     mat_kerneldatap[i].ai     = ai + trstarts[i];
408:     mat_kerneldatap[i].aj     = aj;
409:     mat_kerneldatap[i].rstart = trstarts[i];
410:     mat_pdata[i]              = &mat_kerneldatap[i];
411:   }
412:   PetscThreadsRunKernel(MatMarkDiagonal_Kernel,(void**)mat_pdata,tmap->nthreads,tmap->affinity);
413:   return(0);
414: }

416: PetscErrorCode MatFindZeroDiagonalCount_Kernel(void* arg)
417: {
418:   Mat_KernelData     *data=(Mat_KernelData*)arg;
419:   const PetscInt     *aj = (const PetscInt*)data->aj;
420:   const MatScalar    *aa = (const MatScalar*)data->aa;
421:   const PetscInt     *adiag = (const PetscInt*)data->adiag;
422:   PetscInt           nrows = (PetscInt)data->nrows;
423:   PetscInt           i,row;
424:   PetscInt           rstart = (PetscInt)data->rstart;

426: 
427:   for(i=0;i < nrows; i++) {
428:     row = rstart+i;
429:     if((aj[adiag[i]] != row) || (aa[adiag[i]] == 0.0)) data->nzerodiags++;
430:   }
431:   return(0);
432: }

434: PetscErrorCode MatFindZeroDiagonals_Kernel(void* arg)
435: {
436:   Mat_KernelData     *data=(Mat_KernelData*)arg;
437:   const PetscInt     *aj = (const PetscInt*)data->aj;
438:   const MatScalar    *aa = (const MatScalar*)data->aa;
439:   const PetscInt     *adiag = (const PetscInt*)data->adiag;
440:   PetscInt           nrows = (PetscInt)data->nrows;
441:   PetscInt           i,row;
442:   PetscInt           rstart = (PetscInt)data->rstart,cnt=0;

444: 
445:   for(i=0;i < nrows; i++) {
446:     row = rstart+i;
447:     if((aj[adiag[i]] != row) || (aa[adiag[i]] == 0.0)) data->zerodiags[cnt++] = row;
448:   }
449:   return(0);
450: }

454: PetscErrorCode MatFindZeroDiagonals_SeqAIJPThread(Mat A,IS *zrows)
455: {
456:   Mat_SeqAIJ          *a = (Mat_SeqAIJ*)A->data;
457:   PetscThreadsLayout  tmap=A->rmap->tmap;
458:   PetscInt            *trstarts = tmap->trstarts;
459:   PetscInt            i,cnt=0,*rows,ctr=0;
460:   PetscInt            *aj = a->j,*diag;
461:   MatScalar           *aa = a->a;
462:   PetscErrorCode      ierr;

465:   MatMarkDiagonal_SeqAIJPThread(A);
466:   diag = a->diag;

468:   /* Find zero diagonals count */
469:   for(i=0;i < tmap->nthreads;i++) {
470:     mat_kerneldatap[i].aj      = aj;
471:     mat_kerneldatap[i].nrows   = trstarts[i+1]-trstarts[i];
472:     mat_kerneldatap[i].aa      = aa;
473:     mat_kerneldatap[i].rstart  = trstarts[i];
474:     mat_kerneldatap[i].nzerodiags = 0;
475:     mat_kerneldatap[i].adiag   = diag + trstarts[i];
476:     mat_pdata[i]               = &mat_kerneldatap[i];
477:   }
478:   PetscThreadsRunKernel(MatFindZeroDiagonalCount_Kernel,(void**)mat_pdata,tmap->nthreads,tmap->affinity);

480:   for(i=0;i < tmap->nthreads;i++) cnt += mat_kerneldatap[i].nzerodiags;
481:   PetscMalloc(cnt*sizeof(PetscInt),&rows);

483:   /* Get zero diagonals */
484:   for(i=0;i < tmap->nthreads;i++) {
485:     mat_kerneldatap[i].aj      = aj;
486:     mat_kerneldatap[i].nrows   = trstarts[i+1]-trstarts[i];
487:     mat_kerneldatap[i].aa      = aa;
488:     mat_kerneldatap[i].rstart  = trstarts[i];
489:     mat_kerneldatap[i].zerodiags = rows + ctr;
490:     mat_kerneldatap[i].adiag   = diag + trstarts[i];
491:     mat_pdata[i]               = &mat_kerneldatap[i];
492:     ctr += mat_kerneldatap[i].nzerodiags;
493:   }
494:   PetscThreadsRunKernel(MatFindZeroDiagonals_Kernel,(void**)mat_pdata,tmap->nthreads,tmap->affinity);

496:   ISCreateGeneral(((PetscObject)A)->comm,cnt,rows,PETSC_OWN_POINTER,zrows);
497:   return(0);
498: }
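/*
   MatFindZeroDiagonals_SeqAIJPThread() uses a two-pass scheme: the first
   kernel only counts the zero or absent diagonals per thread (nzerodiags),
   then rows[] is allocated with the total count and each thread is given an
   offset into it (the running sum ctr), so the second kernel can write its
   entries without any synchronization.  The IS takes ownership of rows[]
   through PETSC_OWN_POINTER.
*/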
499: 
500: PetscErrorCode MatMissingDiagonal_Kernel(void* arg)
501: {
502:   Mat_KernelData    *data=(Mat_KernelData*)arg;
503:   const PetscInt    *aj = (const PetscInt*)data->aj;
504:   PetscInt          *adiag=(PetscInt*)data->adiag;
505:   PetscInt           nrows=(PetscInt)data->nrows;
506:   PetscInt           i,row;
507:   PetscInt           rstart=(PetscInt)data->rstart;

509: 
510:   data->missing_diag = PETSC_FALSE;
511:   for(i=0; i < nrows; i++) {
512:     row = rstart + i;
513:     if(aj[adiag[i]] != row) {
514:       data->missing_diag = PETSC_TRUE;
515:       if(data->find_d) data->d =  row;
516:       break;
517:     }
518:   }
519:   return(0);
520: }

524: PetscErrorCode MatMissingDiagonal_SeqAIJPThread(Mat A, PetscBool *missing,PetscInt *d)
525: {
526:   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
527:   PetscErrorCode     ierr;
528:   PetscThreadsLayout tmap=A->rmap->tmap;
529:   PetscInt           *trstarts=tmap->trstarts;
530:   PetscInt           *aj=a->j,*diag,i;

533:   *missing = PETSC_FALSE;
534:   if(A->rmap->n > 0 && !aj) {
535:     *missing = PETSC_TRUE;
536:     if (d) *d = 0;
537:     PetscInfo(A,"Matrix has no entries therefore is missing diagonal");
538:   } else {
539:     diag = a->diag;
540:     for(i=0; i < tmap->nthreads; i++) {
541:       mat_kerneldatap[i].nrows = trstarts[i+1] - trstarts[i];
542:       mat_kerneldatap[i].adiag = diag + trstarts[i];
543:       mat_kerneldatap[i].aj    = aj;
544:       mat_kerneldatap[i].rstart = trstarts[i];
545:       mat_kerneldatap[i].missing_diag = PETSC_FALSE;
546:       mat_kerneldatap[i].find_d = PETSC_FALSE;
547:       if(d) mat_kerneldatap[i].find_d = PETSC_TRUE;
548:       mat_pdata[i]  = &mat_kerneldatap[i];
549:     }
550:     PetscThreadsRunKernel(MatMissingDiagonal_Kernel,(void**)mat_pdata,tmap->nthreads,tmap->affinity);

552:     for(i=0;i < tmap->nthreads;i++) {
553:       if(mat_kerneldatap[i].missing_diag) {
554:         *missing = PETSC_TRUE;
555:         if(d) *d = mat_kerneldatap[i].d;
556:         PetscInfo1(A,"Matrix is missing diagonal number %D",mat_kerneldatap[i].d);
557:         break;
558:       }
559:     }
560:   }
561:   return(0);
562: }

564: PetscErrorCode MatDiagonalScale_Kernel(void* arg)
565: {
566:   Mat_KernelData    *data=(Mat_KernelData*)arg;
567:   const PetscInt    *ai = (const PetscInt*)data->ai,*aj = (const PetscInt*)data->aj;
568:   MatScalar         *aa = (MatScalar*)data->aa;
569:   const PetscScalar *ll = (const PetscScalar*)data->x;
570:   const PetscScalar *rr = (const PetscScalar*)data->y;
571:   PetscInt           nrows = data->nrows;
572:   PetscInt           i,j;

574:   if(ll) {
575:     for(i=0; i < nrows; i++) {
576:       for(j=ai[i]; j < ai[i+1]; j++) aa[j] *= ll[i];
577:     }
578:   }

580:   if(rr) {
581:     for(i=0; i < nrows; i++) {
582:       for(j=ai[i]; j < ai[i+1]; j++) aa[j] *= rr[aj[j]];
583:     }
584:   }
585:   return(0);
586: }

590: PetscErrorCode MatDiagonalScale_SeqAIJPThread(Mat A,Vec ll,Vec rr)
591: {
592:   PetscErrorCode     ierr;
593:   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
594:   PetscThreadsLayout tmap=A->rmap->tmap;
595:   PetscInt           *trstarts=tmap->trstarts;
596:   PetscScalar        *l,*r;
597:   MatScalar          *aa=a->a;
598:   PetscInt           *ai=a->i,*aj=a->j;
599:   PetscInt           i,m = A->rmap->n, n = A->cmap->n,nz = a->nz;

602:   if(ll) {
603:     VecGetLocalSize(ll,&m);
604:     if (m != A->rmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
605:     VecGetArray(ll,&l);
606:   }
607:   if(rr) {
608:     VecGetLocalSize(rr,&n);
609:     if (n != A->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
610:     VecGetArray(rr,&r);
611:   }

613:   for(i=0;i< tmap->nthreads;i++) {
614:     mat_kerneldatap[i].nrows = trstarts[i+1] - trstarts[i];
615:     mat_kerneldatap[i].ai    = ai + trstarts[i];
616:     mat_kerneldatap[i].aj    = aj;
617:     mat_kerneldatap[i].aa    = aa;
618:     if(ll) mat_kerneldatap[i].x = l + trstarts[i];
619:     else mat_kerneldatap[i].x = PETSC_NULL;
620:     if(rr) mat_kerneldatap[i].y = r;
621:     else mat_kerneldatap[i].y = PETSC_NULL;
622:     mat_pdata[i] = &mat_kerneldatap[i];
623:   }
624:   PetscThreadsRunKernel(MatDiagonalScale_Kernel,(void**)mat_pdata,tmap->nthreads,tmap->affinity);

626:   if(ll) {
627:     VecRestoreArray(ll,&l);
628:     PetscLogFlops(nz);
629:   }
630:   if(rr) {
631:     VecRestoreArray(rr,&r);
632:     PetscLogFlops(nz);
633:   }
634:   a->idiagvalid = PETSC_FALSE;
635:   a->ibdiagvalid = PETSC_FALSE;
636:   return(0);
637: }
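/*
   In MatDiagonalScale_SeqAIJPThread() the left scaling vector is handed to
   each thread as its own slice (l + trstarts[i], indexed by local row), while
   the right scaling vector is passed whole because a row's column indices can
   reach any column.

   Usage sketch (assumes vectors l and r of lengths A->rmap->n and A->cmap->n;
   the call computes A <- diag(l) A diag(r)):

       ierr = MatDiagonalScale(A,l,r);CHKERRQ(ierr);
*/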
638: 
639: PetscErrorCode MatDiagonalSet_Kernel(void* arg)
640: {
641:   Mat_KernelData    *data=(Mat_KernelData*)arg;
642:   MatScalar         *aa = (MatScalar*)data->aa;
643:   const PetscInt    *adiag = (const PetscInt*)data->adiag;
644:   PetscInt          nrows = data->nrows;
645:   PetscInt          i;
646:   PetscScalar       *x = (PetscScalar*)data->x;
647:   InsertMode        is = data->is;
648: 
649:   if(is == INSERT_VALUES) {
650:     for(i=0; i < nrows; i++) {
651:       aa[adiag[i]] = x[i];
652:     }
653:   } else {
654:     for(i=0;i < nrows; i++) {
655:       aa[adiag[i]] += x[i];
656:     }
657:   }
658:   return(0);
659: }

663: PetscErrorCode MatDiagonalSet_SeqAIJPThread(Mat Y,Vec D,InsertMode is)
664: {
665:   PetscErrorCode     ierr;
666:   Mat_SeqAIJ         *aij = (Mat_SeqAIJ*)Y->data;
667:   PetscThreadsLayout tmap=Y->rmap->tmap;
668:   PetscInt           *trstarts=tmap->trstarts;
669:   MatScalar          *aa = aij->a;
670:   PetscInt           *diag;
671:   PetscInt           i;
672:   PetscBool          missing;
673:   PetscScalar        *v;

676:   if(Y->assembled) {
677:     MatMissingDiagonal_SeqAIJPThread(Y,&missing,PETSC_NULL);
678:     if(!missing) {
679:       diag = aij->diag;
680:       VecGetArray(D,&v);
681:       for(i=0; i < tmap->nthreads;i++) {
682:         mat_kerneldatap[i].nrows = trstarts[i+1] - trstarts[i];
683:         mat_kerneldatap[i].aa    = aa;
684:         mat_kerneldatap[i].adiag = diag + trstarts[i];
685:         mat_kerneldatap[i].x     = v + trstarts[i];
686:         mat_kerneldatap[i].is    = is;
687:         mat_pdata[i]             = &mat_kerneldatap[i];
688:       }
689:       PetscThreadsRunKernel(MatDiagonalSet_Kernel,(void**)mat_pdata,tmap->nthreads,tmap->affinity);

691:       VecRestoreArray(D,&v);
692:       return(0);
693:     }
694:     aij->idiagvalid = PETSC_FALSE;
695:     aij->ibdiagvalid = PETSC_FALSE;
696:   }
697:   MatDiagonalSet_Default(Y,D,is);
698:   return(0);
699: }

703: PetscErrorCode MatSetUp_SeqAIJPThread(Mat A)
704: {

708:   MatSeqAIJPThreadSetPreallocation_SeqAIJPThread(A,PETSC_DEFAULT,0);
709:   return(0);
710: }

714: static PetscErrorCode MatView_SeqAIJPThread(Mat A,PetscViewer viewer)
715: {
717:   PetscBool iascii;
718:   PetscViewerFormat format;

721:   PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii);
722:   PetscViewerGetFormat(viewer,&format);
723:   if (iascii && (format == PETSC_VIEWER_ASCII_FACTOR_INFO || format == PETSC_VIEWER_ASCII_INFO)) {
724:     PetscInt nthreads;
725:     MatGetNThreads(A,&nthreads);
726:     PetscViewerASCIIPrintf(viewer,"nthreads=%D\n",nthreads);
727:   }
728:   MatView_SeqAIJ(A,viewer);
729:   return(0);
730: }


735: PetscErrorCode MatDestroy_SeqAIJPThread(Mat A)
736: {
737:   PetscErrorCode      ierr;


741:   if(!A->rmap->refcnt) {
742:     PetscThreadsLayoutDestroy(&A->rmap->tmap);
743:   }

745:   mats_created--;
746:   /* Free the kernel data structure on the destruction of the last matrix */
747:   if(!mats_created) {
748:     PetscFree(mat_kerneldatap);
749:     PetscFree(mat_pdata);
750:   }
751:   MatDestroy_SeqAIJ(A);
752:   return(0);
753: }

757: /*@
758:    MatSetNThreads - Set the number of threads to be used for matrix operations.

760:    Not Collective, but it is usually desirable to use the same number of threads per process

762:    Input Parameters:
763: +  A - the matrix
764: -  nthreads - number of threads

766:    Level: intermediate

768:    Concepts: matrix^setting number of threads

770: .seealso: MatCreateSeqAIJPThread()
771: @*/
772: PetscErrorCode MatSetNThreads(Mat A,PetscInt nthreads)
773: {
774:   PetscErrorCode     ierr;
775:   PetscThreadsLayout tmap=A->rmap->tmap;
776:   PetscInt           nworkThreads=PetscMaxThreads+PetscMainThreadShareWork;

779: 
780:   if(!tmap) {
781:     PetscThreadsLayoutCreate(&tmap);
782:     A->rmap->tmap = tmap;
783:   }

785:   if(nthreads == PETSC_DECIDE) {
786:     tmap->nthreads = nworkThreads;
787:     PetscOptionsInt("-mat_threads","Set number of threads to be used for matrix operations","MatSetNThreads",nworkThreads,&tmap->nthreads,PETSC_NULL);
788:     if(tmap->nthreads > nworkThreads) {
789:       SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ, "Mat A: threads requested %D, Max. threads initialized %D",tmap->nthreads,nworkThreads);
790:     }
791:   } else {
792:     if(nthreads > nworkThreads) {
793:       SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ, "Mat A: threads requested %D, Max. threads initialized %D",nthreads,nworkThreads);
794:     }
795:     tmap->nthreads = nthreads;
796:   }
797: 
798:   return(0);
799: }
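/*
   Usage sketch for MatSetNThreads() (A is a hypothetical matrix of type
   MATSEQAIJPTHREAD that has already been created); pass an explicit count

       ierr = MatSetNThreads(A,4);CHKERRQ(ierr);

   or pass PETSC_DECIDE to take the count from the -mat_threads option, capped
   at PetscMaxThreads+PetscMainThreadShareWork as checked above.
*/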

803: /*@
804:    MatGetNThreads - Get the number of threads used for matrix operations.

806:    Not Collective

808:    Input Parameter:
809: .  A - the matrix

811:    Output Parameter:
812: .  nthreads - number of threads

814:    Level: intermediate

816:    Concepts: matrix^getting number of threads

818: .seealso: MatCreateSeqAIJPThread(), MatSetNThreads()
819: @*/
820: PetscErrorCode MatGetNThreads(Mat A,PetscInt *nthreads)
821: {
822:   PetscThreadsLayout tmap=A->rmap->tmap;

826:   if (tmap) *nthreads = tmap->nthreads - PetscMainThreadShareWork;
827:   else *nthreads = 1;
828:   return(0);
829: }
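/*
   Usage sketch for MatGetNThreads():

       PetscInt nthreads;
       ierr = MatGetNThreads(A,&nthreads);CHKERRQ(ierr);
       ierr = PetscPrintf(PETSC_COMM_SELF,"matrix uses %D thread(s)\n",nthreads);CHKERRQ(ierr);
*/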

833: /*
834:    MatSetThreadAffinities - Sets the CPU affinities of matrix threads.

836:    Input Parameters:
837: +  A - the matrix
838: -  affinities - list of cpu affinities for threads.

840:    Notes:
841:    Must set affinities for all the threads used with the matrix.
842:    size(affinities[]) = nthreads
843:    Use affinities[] = PETSC_NULL for PETSc to set the thread affinities.

845:    Options Database Keys:
846: .  -mat_thread_affinities - Comma-separated list of thread affinities

848:    Level: intermediate

850:    Concepts: matrices^thread cpu affinity

852: .seealso: MatPThreadGetThreadAffinities()
853: */
854: PetscErrorCode MatSetThreadAffinities(Mat A,const PetscInt affinities[])
855: {
856:   PetscErrorCode    ierr;
857:   PetscThreadsLayout tmap=A->rmap->tmap;
858:   PetscInt          nmax=PetscMaxThreads+PetscMainThreadShareWork;
859:   PetscBool         flg;


863:   if(!tmap) {
864:     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must set the number of threads before setting thread affinities");
865:   }

867:   PetscMalloc(tmap->nthreads*sizeof(PetscInt),&tmap->affinity);

869:   if(affinities == PETSC_NULL) {
870:     PetscInt *thread_affinities;
871:     PetscMalloc(nmax*sizeof(PetscInt),&thread_affinities);
872:     PetscMemzero(thread_affinities,nmax*sizeof(PetscInt));
873:     /* Check if run-time option is set */
874:     PetscOptionsIntArray("-mat_thread_affinities","Set CPU affinity for each thread","MatSetThreadAffinities",thread_affinities,&nmax,&flg);
875:     if(flg) {
876:       if(nmax != tmap->nthreads) {
877:         SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Must set affinities for all threads, matrix A Threads = %D, CPU affinities set = %D",tmap->nthreads,nmax);
878:       }
879:       PetscMemcpy(tmap->affinity,thread_affinities,tmap->nthreads*sizeof(PetscInt));
880:     } else {
881:       /* Reuse the core affinities already set for the first tmap->nthreads threads */
882:       PetscMemcpy(tmap->affinity,PetscThreadsCoreAffinities,tmap->nthreads*sizeof(PetscInt));
883:     }
884:     PetscFree(thread_affinities);
885:   } else {
886:     /* Set user provided affinities */
887:     PetscMemcpy(tmap->affinity,affinities,tmap->nthreads*sizeof(PetscInt));
888:   }
889:   return(0);
890: }
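/*
   Usage sketch for MatSetThreadAffinities() (hypothetical CPU ids; the number
   of threads must be set first and one affinity is required per thread, or
   pass PETSC_NULL to let PETSc choose):

       const PetscInt affinities[] = {0,1,2,3};
       ierr = MatSetNThreads(A,4);CHKERRQ(ierr);
       ierr = MatSetThreadAffinities(A,affinities);CHKERRQ(ierr);
*/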

892: EXTERN_C_BEGIN
895: PetscErrorCode MatSeqAIJPThreadSetPreallocation_SeqAIJPThread(Mat A, PetscInt nz,const PetscInt nnz[])
896: {
898:   PetscThreadsLayout tmap=A->rmap->tmap;

901:   MatSeqAIJSetPreallocation_SeqAIJ(A, nz, nnz);

903:   tmap->N = A->rmap->n;
904:   PetscThreadsLayoutSetUp(tmap);

906:   MatZeroEntries_SeqAIJPThread(A);
907:   return(0);
908: }
909: EXTERN_C_END

911: EXTERN_C_BEGIN
914: PetscErrorCode MatCreate_SeqAIJPThread(Mat B)
915: {
916:   PetscErrorCode     ierr;
917:   Mat_SeqAIJ         *b;
918:   PetscThreadsLayout tmap=B->rmap->tmap;

921:   PetscThreadsInitialize(PetscMaxThreads);
922:   MatCreate_SeqAIJ(B);
923:   b = (Mat_SeqAIJ*)B->data;
924:   /* Set inodes off */
925:   b->inode.use = PETSC_FALSE;

927:   if(!B->rmap->tmap) {
928:     PetscThreadsLayoutCreate(&B->rmap->tmap);
929:     tmap = B->rmap->tmap;
930:   }

932:   /* Set the number of threads */
933:   if(tmap->nthreads == PETSC_DECIDE) {
934:     MatSetNThreads(B,PETSC_DECIDE);
935:   }
936:   /* Set thread affinities */
937:   if(!tmap->affinity) {
938:     MatSetThreadAffinities(B,PETSC_NULL);
939:   }

941:   B->ops->destroy         = MatDestroy_SeqAIJPThread;
942:   B->ops->mult            = MatMult_SeqAIJPThread;
943:   B->ops->multadd         = MatMultAdd_SeqAIJPThread;
944:   B->ops->setup           = MatSetUp_SeqAIJPThread;
945:   B->ops->zeroentries     = MatZeroEntries_SeqAIJPThread;
946:   B->ops->realpart        = MatRealPart_SeqAIJPThread;
947:   B->ops->imaginarypart   = MatImaginaryPart_SeqAIJPThread;
948:   B->ops->getdiagonal     = MatGetDiagonal_SeqAIJPThread;
949:   B->ops->missingdiagonal = MatMissingDiagonal_SeqAIJPThread;
950:   B->ops->findzerodiagonals = MatFindZeroDiagonals_SeqAIJPThread;
951:   B->ops->diagonalscale     = MatDiagonalScale_SeqAIJPThread;
952:   B->ops->diagonalset       = MatDiagonalSet_SeqAIJPThread;
953:   B->ops->view = MatView_SeqAIJPThread;

955:   PetscObjectComposeFunctionDynamic((PetscObject)B,"MatSeqAIJSetPreallocation_C","",PETSC_NULL);
956:   PetscObjectComposeFunctionDynamic((PetscObject)B,"MatSeqAIJSetPreallocation_C","MatSeqAIJPThreadSetPreallocation_SeqAIJPThread",MatSeqAIJPThreadSetPreallocation_SeqAIJPThread);

958:   if(mats_created == 0) {
959:     PetscMalloc((PetscMaxThreads+PetscMainThreadShareWork)*sizeof(Mat_KernelData),&mat_kerneldatap);
960:     PetscMalloc((PetscMaxThreads+PetscMainThreadShareWork)*sizeof(Mat_KernelData*),&mat_pdata);
961:   }
962:   mats_created++;

964:   PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJPTHREAD);
965:   return(0);
966: }
967: EXTERN_C_END
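/*
   MatCreate_SeqAIJPThread() is the constructor that MatSetType() invokes for
   the MATSEQAIJPTHREAD type, so the usual way to obtain such a matrix is a
   sketch along these lines (m, n, nz are hypothetical sizes):

       ierr = MatCreate(PETSC_COMM_SELF,&A);CHKERRQ(ierr);
       ierr = MatSetSizes(A,m,n,m,n);CHKERRQ(ierr);
       ierr = MatSetType(A,MATSEQAIJPTHREAD);CHKERRQ(ierr);
       ierr = MatSeqAIJSetPreallocation(A,nz,PETSC_NULL);CHKERRQ(ierr);

   The last call reaches MatSeqAIJPThreadSetPreallocation_SeqAIJPThread()
   through the "MatSeqAIJSetPreallocation_C" function composed above.
*/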

971: /*
972:    MatCreateSeqAIJPThread - Creates a sparse matrix in AIJ (compressed row) format
973:    (the default parallel PETSc format) using POSIX threads.  For good matrix assembly performance
974:    the user should preallocate the matrix storage by setting the parameter nz
975:    (or the array nnz).  By setting these parameters accurately, performance
976:    during matrix assembly can be increased by more than a factor of 50.

978:    Collective on MPI_Comm

980:    Input Parameters:
981: +  comm - MPI communicator, set to PETSC_COMM_SELF
982: .  m - number of rows
983: .  n - number of columns
984: .  nthreads - number of threads for matrix operations
985: .  nz - number of nonzeros per row (same for all rows)
986: -  nnz - array containing the number of nonzeros in the various rows 
987:          (possibly different for each row) or PETSC_NULL

989:    Output Parameter:
990: .  A - the matrix 

992:    It is recommended that one use the MatCreate(), MatSetSizes(), MatSetType() and/or MatSetFromOptions(),
993:    MatXXXXSetPreallocation() paradigm instead of this routine directly.
994:    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation()]

996:    Notes:
997:    If nnz is given then nz is ignored

999:    The AIJ format (also called the Yale sparse matrix format or
1000:    compressed row storage), is fully compatible with standard Fortran 77
1001:    storage.  That is, the stored row and column indices can begin at
1002:    either one (as in Fortran) or zero.  See the users' manual for details.

1004:    Specify the preallocated storage with either nz or nnz (not both).
1005:    Set nz=PETSC_DEFAULT and nnz=PETSC_NULL for PETSc to control dynamic memory 
1006:    allocation.  For large problems you MUST preallocate memory or you 
1007:    will get TERRIBLE performance, see the users' manual chapter on matrices.

1009:    By default, this format uses inodes (identical nodes) when possible, to 
1010:    improve numerical efficiency of matrix-vector products and solves. We 
1011:    search for consecutive rows with the same nonzero structure, thereby
1012:    reusing matrix information to achieve increased efficiency.

1014:    Options Database Keys:
1015: +  -mat_no_inode  - Do not use inodes
1016: -  -mat_inode_limit <limit> - Sets inode limit (max limit=5)

1018:    Level: intermediate

1020: .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays()

1022: */
1023: PetscErrorCode  MatCreateSeqAIJPThread(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nthreads,PetscInt nz,const PetscInt nnz[],Mat *A)
1024: {

1028:   MatCreate(comm,A);
1029:   MatSetSizes(*A,m,n,m,n);
1030:   MatSetType(*A,MATSEQAIJPTHREAD);
1031:   MatSetNThreads(*A,nthreads);
1032:   MatSeqAIJPThreadSetPreallocation_SeqAIJPThread(*A,nz,nnz);
1033:   return(0);
1034: }
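/*
   End-to-end usage sketch for MatCreateSeqAIJPThread(): an illustrative
   tridiagonal matrix assembled with 2 threads (boundary rows omitted for
   brevity; all values are hypothetical):

       Mat            A;
       PetscInt       i,n=100,col[3];
       PetscScalar    v[3]={-1.0,2.0,-1.0};
       PetscErrorCode ierr;

       ierr = MatCreateSeqAIJPThread(PETSC_COMM_SELF,n,n,2,3,PETSC_NULL,&A);CHKERRQ(ierr);
       for (i=1; i<n-1; i++) {
         col[0] = i-1; col[1] = i; col[2] = i+1;
         ierr = MatSetValues(A,1,&i,3,col,v,INSERT_VALUES);CHKERRQ(ierr);
       }
       ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
       ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
       ierr = MatDestroy(&A);CHKERRQ(ierr);
*/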