Actual source code: seqaijpthread.c
petsc-3.3-p7 2013-05-11
2: #include <petsc-private/vecimpl.h>
3: #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
4: #include <../src/sys/objects/pthread/pthreadimpl.h>
5: #include <../src/mat/impls/aij/seq/seqpthread/seqaijpthread.h>
/* Count of live MATSEQAIJPTHREAD matrices: the shared kernel-argument buffers
   (mat_kerneldatap/mat_pdata) are allocated when the first matrix is created
   and freed when the last one is destroyed (see MatCreate/MatDestroy below). */
static PetscInt mats_created=0;
9: PetscErrorCode MatRealPart_Kernel(void* arg)
10: {
11: Mat_KernelData *data=(Mat_KernelData*)arg;
12: const PetscInt *ai = (const PetscInt*)data->ai;
13: MatScalar *aabase = data->aa,*aa;
14: PetscInt nrows=data->nrows;
15: PetscInt nz,i;
17:
18: nz = ai[nrows] - ai[0];
19: aa = aabase + ai[0];
20: for(i=0;i<nz;i++) aa[i] = PetscRealPart(aa[i]);
22: return(0);
23: }
/* MatRealPart_SeqAIJPThread - Replaces every stored value of A with its real
   part, partitioning the rows over the thread layout attached to A->rmap. */
PetscErrorCode MatRealPart_SeqAIJPThread(Mat A)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscThreadsLayout tmap=A->rmap->tmap;
  PetscInt           *trstarts = tmap->trstarts; /* first row owned by each thread */
  PetscInt           i;
  MatScalar          *aa=a->a;
  PetscInt           *ai=a->i;

  /* Thread i handles rows trstarts[i]..trstarts[i+1]-1; ai is passed offset so
     the kernel can locate its slice of aa from the row pointers alone. */
  for(i=0; i < tmap->nthreads; i++) {
    mat_kerneldatap[i].ai = ai + trstarts[i];
    mat_kerneldatap[i].aa = aa;
    mat_kerneldatap[i].nrows = trstarts[i+1] - trstarts[i];
    mat_pdata[i] = &mat_kerneldatap[i];
  }
  PetscThreadsRunKernel(MatRealPart_Kernel,(void**)mat_pdata,tmap->nthreads,tmap->affinity);
  /* Cached (block) inverse diagonals are stale once the values change. */
  a->idiagvalid = PETSC_FALSE;
  a->ibdiagvalid = PETSC_FALSE;
  return(0);
}
52: PetscErrorCode MatImaginaryPart_Kernel(void* arg)
53: {
54: Mat_KernelData *data=(Mat_KernelData*)arg;
55: const PetscInt *ai = (const PetscInt*)data->ai;
56: MatScalar *aabase = data->aa,*aa;
57: PetscInt nrows=data->nrows;
58: PetscInt nz,i;
60:
61: nz = ai[nrows] - ai[0];
62: aa = aabase + ai[0];
63: for(i=0;i<nz;i++) aa[i] = PetscImaginaryPart(aa[i]);
65: return(0);
66: }
/* MatImaginaryPart_SeqAIJPThread - Replaces every stored value of A with its
   imaginary part, partitioning the rows over the thread layout on A->rmap. */
PetscErrorCode MatImaginaryPart_SeqAIJPThread(Mat A)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscThreadsLayout tmap=A->rmap->tmap;
  PetscInt           *trstarts = tmap->trstarts; /* first row owned by each thread */
  PetscInt           i;
  MatScalar          *aa=a->a;
  PetscInt           *ai=a->i;

  /* Same per-thread partitioning as MatRealPart_SeqAIJPThread: ai is offset so
     each kernel instance finds its value slice via its row pointers. */
  for(i=0; i < tmap->nthreads; i++) {
    mat_kerneldatap[i].ai = ai + trstarts[i];
    mat_kerneldatap[i].aa = aa;
    mat_kerneldatap[i].nrows = trstarts[i+1] - trstarts[i];
    mat_pdata[i] = &mat_kerneldatap[i];
  }
  PetscThreadsRunKernel(MatImaginaryPart_Kernel,(void**)mat_pdata,tmap->nthreads,tmap->affinity);
  /* Values changed: invalidate cached (block) inverse diagonals. */
  a->idiagvalid = PETSC_FALSE;
  a->ibdiagvalid = PETSC_FALSE;
  return(0);
}
95: PetscErrorCode MatFactorGetDiagonal_Kernel(void* arg)
96: {
97: Mat_KernelData *data=(Mat_KernelData*)arg;
98: const PetscInt *adiag = (const PetscInt*)data->adiag;
99: const MatScalar *aa = (const MatScalar*)data->aa;
100: PetscScalar *x = (PetscScalar*)data->x;
101: PetscInt nrows=(PetscInt)data->nrows,i;
103:
104: for(i=0;i < nrows;i++) {
105: x[i] = 1.0/aa[adiag[i]];
106: }
107: return(0);
108: }
110: PetscErrorCode MatGetDiagonal_Kernel(void* arg)
111: {
112: Mat_KernelData *data=(Mat_KernelData*)arg;
113: const PetscInt *ai = (const PetscInt*)data->ai,*aj = (const PetscInt*)data->aj;
114: const MatScalar *aa = (const MatScalar*)data->aa;
115: PetscScalar *x = (PetscScalar*)data->x;
116: PetscInt nrows=(PetscInt)data->nrows;
117: PetscInt i,j,row;
118: PetscInt rstart=(PetscInt)data->rstart;
120:
121: for(i=0;i < nrows;i++) {
122: row = rstart+i;
123: for(j=ai[i]; j < ai[i+1];j++) {
124: if(aj[j] == row) {
125: x[i] = aa[j];
126: break;
127: }
128: }
129: }
130: return(0);
131: }
/* MatGetDiagonal_SeqAIJPThread - Extracts the diagonal of A into v in parallel.
   For LU/ILU-factored matrices the diagonal slots recorded in a->diag are used
   and the kernel writes 1/aa[diag[i]]; otherwise each thread scans its rows
   for the diagonal column, with v pre-zeroed for rows lacking one. */
PetscErrorCode MatGetDiagonal_SeqAIJPThread(Mat A,Vec v)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscThreadsLayout tmap=A->rmap->tmap;
  PetscInt           *trstarts=tmap->trstarts;
  PetscScalar        *x;
  MatScalar          *aa=a->a;
  PetscInt           *ai=a->i,*aj=a->j;
  PetscInt           i,n;

  VecGetLocalSize(v,&n);
  if (n != A->rmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");

  if(A->factortype == MAT_FACTOR_ILU || A->factortype == MAT_FACTOR_LU) {
    /* Factored path: diagonal locations are already recorded in a->diag. */
    PetscInt *diag=a->diag;
    VecGetArray(v,&x);
    for(i=0;i < tmap->nthreads; i++) {
      mat_kerneldatap[i].nrows = trstarts[i+1] - trstarts[i];
      mat_kerneldatap[i].aa = aa;
      mat_kerneldatap[i].adiag = diag + trstarts[i];
      mat_kerneldatap[i].x = x + trstarts[i];
      mat_pdata[i] = &mat_kerneldatap[i];
    }
    PetscThreadsRunKernel(MatFactorGetDiagonal_Kernel,(void**)mat_pdata,tmap->nthreads,tmap->affinity);
    VecRestoreArray(v,&x);
    return(0);
  }

  /* Unfactored path: zero v first so rows without a stored diagonal entry
     report 0.0, then let each thread search its own rows. */
  VecSet(v,0.0);
  VecGetArray(v,&x);
  for(i=0;i < tmap->nthreads; i++) {
    mat_kerneldatap[i].ai = ai + trstarts[i];
    mat_kerneldatap[i].rstart = trstarts[i];
    mat_kerneldatap[i].aj = aj;
    mat_kerneldatap[i].aa = aa;
    mat_kerneldatap[i].nrows = trstarts[i+1]-trstarts[i];
    mat_kerneldatap[i].x = x + trstarts[i];
    mat_pdata[i] = &mat_kerneldatap[i];
  }
  PetscThreadsRunKernel(MatGetDiagonal_Kernel,(void**)mat_pdata,tmap->nthreads,tmap->affinity);
  VecRestoreArray(v,&x);
  return(0);
}
181: PetscErrorCode MatZeroEntries_Kernel(void* arg)
182: {
183: Mat_KernelData *data=(Mat_KernelData*)arg;
184: const PetscInt *ai = (const PetscInt*)data->ai;
185: MatScalar *aabase = data->aa;
186: MatScalar *aa;
187: PetscInt nrows=data->nrows;
188: PetscInt nz;
190:
191: nz = ai[nrows] - ai[0];
192: aa = aabase + ai[0];
193: PetscMemzero(aa,nz*sizeof(PetscScalar));
195: return(0);
196: }
/* MatZeroEntries_SeqAIJPThread - Sets all stored values of A to zero without
   changing the nonzero pattern; each thread zeroes its own chunk of a->a. */
PetscErrorCode MatZeroEntries_SeqAIJPThread(Mat A)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscThreadsLayout tmap=A->rmap->tmap;
  PetscInt           *trstarts = tmap->trstarts; /* first row owned by each thread */
  PetscInt           i;
  MatScalar          *aa=a->a;
  PetscInt           *ai=a->i;

  /* Thread i zeroes the values of rows trstarts[i]..trstarts[i+1]-1. */
  for(i=0; i < tmap->nthreads; i++) {
    mat_kerneldatap[i].ai = ai + trstarts[i];
    mat_kerneldatap[i].aa = aa;
    mat_kerneldatap[i].nrows = trstarts[i+1] - trstarts[i];
    mat_pdata[i] = &mat_kerneldatap[i];
  }

  PetscThreadsRunKernel(MatZeroEntries_Kernel,(void**)mat_pdata,tmap->nthreads,tmap->affinity);

  return(0);
}
/* Thread kernel for y = A*x on this thread's rows.
   data->ai is the thread-local slice of the row pointers (ai[i]..ai[i+1]
   delimit local row i within the global aj/aa arrays); x is the full input
   vector while y points at this thread's output slice. Also counts the
   thread's nonzero rows so the driver can log flops accurately. */
PetscErrorCode MatMult_Kernel(void* arg)
{
  Mat_KernelData    *data=(Mat_KernelData*)arg;
  const PetscInt    *ai = (const PetscInt*)data->ai,*ajbase = (const PetscInt*)data->aj,*aj;
  const MatScalar   *aabase = (const MatScalar*)data->aa,*aa;
  const PetscScalar *x = (const PetscScalar*)data->x;
  PetscScalar       *y = data->y;
  PetscInt          nrows = data->nrows;
  PetscInt          nz,i;
  PetscScalar       sum;

  data->nonzerorow = 0;
  for(i=0;i<nrows;i++) {
    nz = ai[i+1] - ai[i];
    aj = ajbase + ai[i];
    aa = aabase + ai[i];
    sum = 0.0;
    data->nonzerorow += (nz>0);
    /* sum += dot(aa[0..nz), x[aj[0..nz)]) -- PETSc's unrolled sparse dot macro */
    PetscSparseDensePlusDot(sum,x,aa,aj,nz);
    y[i] = sum;
  }
  return(0);
}
251: PetscErrorCode MatMult_SeqAIJPThread(Mat A,Vec xx,Vec yy)
252: {
253: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
254: PetscThreadsLayout tmap=A->rmap->tmap;
255: PetscInt *trstarts=tmap->trstarts;
256: PetscScalar *x,*y;
257: PetscErrorCode ierr;
258: MatScalar *aa = a->a;
259: PetscInt *ai = a->i, *aj = a->j;
260: PetscInt i,nonzerorow=0;;
264: if(a->compressedrow.use) {
265: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Compressed row format not supported yet");
266: }
267: VecGetArray(xx,&x);
268: VecGetArray(yy,&y);
270: for(i=0;i < tmap->nthreads;i++) {
271: mat_kerneldatap[i].ai = ai + trstarts[i];
272: mat_kerneldatap[i].aa = aa;
273: mat_kerneldatap[i].aj = aj;
274: mat_kerneldatap[i].nrows = trstarts[i+1] - trstarts[i];
275: mat_kerneldatap[i].x = x;
276: mat_kerneldatap[i].y = y + trstarts[i];
277: mat_pdata[i] = &mat_kerneldatap[i];
278: }
279: PetscThreadsRunKernel(MatMult_Kernel,(void**)mat_pdata,tmap->nthreads,tmap->affinity);
281: for(i=0;i< tmap->nthreads;i++) nonzerorow += mat_kerneldatap[i].nonzerorow;
283: PetscLogFlops(2.0*a->nz - nonzerorow);
284: VecRestoreArray(xx,&x);
285: VecRestoreArray(yy,&y);
287: return(0);
288: }
290: PetscErrorCode MatMultAdd_Kernel(void* arg)
291: {
292: Mat_KernelData *data=(Mat_KernelData*)arg;
293: const PetscInt *ai = (const PetscInt*)data->ai,*aj = (const PetscInt*)data->aj;
294: const MatScalar *aa = (const MatScalar*)data->aa;
295: const PetscScalar *x = (const PetscScalar*)data->x;
296: PetscScalar *y = data->y;
297: PetscScalar *z = data->z;
298: PetscInt nrows = data->nrows;
299: PetscInt nz,i;
300: PetscScalar sum;
302:
303: data->nonzerorow = 0;
304: for(i=0;i<nrows;i++) {
305: nz = ai[i+1] - ai[i];
306: aj = data->aj + ai[i];
307: aa = data->aa + ai[i];
308: sum = y[i];
309: data->nonzerorow += (nz>0);
310: PetscSparseDensePlusDot(sum,x,aa,aj,nz);
311: z[i] = sum;
312: }
313: return(0);
314: }
315:
/* MatMultAdd_SeqAIJPThread - zz = yy + A*xx with rows split across threads.
   Handles zz == yy by aliasing z to the y array instead of fetching zz twice.
   Compressed-row storage is not supported. */
PetscErrorCode MatMultAdd_SeqAIJPThread(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscThreadsLayout tmap=A->rmap->tmap;
  PetscInt           *trstarts=tmap->trstarts;
  PetscScalar        *x,*y,*z;
  PetscErrorCode     ierr;
  MatScalar          *aa = a->a;
  PetscInt           *ai = a->i, *aj = a->j;
  PetscInt           i,nonzerorow=0;;

  if(a->compressedrow.use) {
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Compressed row format not supported yet");
  }
  VecGetArray(xx,&x);
  VecGetArray(yy,&y);
  /* When zz aliases yy, write in place through the y array. */
  if(zz != yy) {
    VecGetArray(zz,&z);
  } else z = y;

  /* Thread i handles rows trstarts[i]..trstarts[i+1]-1; it reads all of x but
     only its own slices of y and z, so threads never write the same entry. */
  for(i=0;i < tmap->nthreads;i++) {
    mat_kerneldatap[i].ai = ai + trstarts[i];
    mat_kerneldatap[i].aa = aa;
    mat_kerneldatap[i].aj = aj;
    mat_kerneldatap[i].nrows = trstarts[i+1]-trstarts[i];
    mat_kerneldatap[i].x = x;
    mat_kerneldatap[i].y = y + trstarts[i];
    mat_kerneldatap[i].z = z + trstarts[i];
    mat_pdata[i] = &mat_kerneldatap[i];
  }
  PetscThreadsRunKernel(MatMultAdd_Kernel,(void**)mat_pdata,tmap->nthreads,tmap->affinity);

  /* Gather per-thread nonzero-row counts for flop accounting. */
  for(i=0;i< tmap->nthreads;i++) nonzerorow += mat_kerneldatap[i].nonzerorow;
  PetscLogFlops(2.0*a->nz - nonzerorow);
  VecRestoreArray(xx,&x);
  VecRestoreArray(yy,&y);
  if(zz != yy) {
    VecRestoreArray(zz,&z);
  }
  return(0);
}
364: PetscErrorCode MatMarkDiagonal_Kernel(void* arg)
365: {
366: Mat_KernelData *data=(Mat_KernelData*)arg;
367: const PetscInt *ai = (const PetscInt*)data->ai,*aj = (const PetscInt*)data->aj;
368: PetscInt *adiag=(PetscInt*)data->adiag;
369: PetscInt nrows=(PetscInt)data->nrows;
370: PetscInt i,j,row;
371: PetscInt rstart=(PetscInt)data->rstart;
373:
374: for(i=0;i < nrows;i++) {
375: row = rstart+i;
376: for(j=ai[i]; j < ai[i+1];j++) {
377: if(aj[j] == row) {
378: adiag[i] = j;
379: break;
380: }
381: }
382: }
383: return(0);
384: }
/* MatMarkDiagonal_SeqAIJPThread - Builds a->diag, the index into a->j/a->a of
   each row's diagonal entry, splitting the search across threads.
   NOTE(review): the freshly malloc'd a->diag relies entirely on
   MatMarkDiagonal_Kernel for initialization -- confirm every entry is written
   (including rows with no diagonal entry) before a->diag is consumed. */
PetscErrorCode MatMarkDiagonal_SeqAIJPThread(Mat A)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscThreadsLayout tmap=A->rmap->tmap;
  PetscInt           *trstarts=tmap->trstarts;
  PetscInt           *ai=a->i,*aj=a->j;
  PetscInt           i,m=A->rmap->n;

  /* Allocate the diagonal index array on first use. */
  if(!a->diag) {
    PetscMalloc(m*sizeof(PetscInt),&a->diag);
    PetscLogObjectMemory(A,m*sizeof(PetscInt));
  }

  /* Thread i marks diagonals of rows trstarts[i]..trstarts[i+1]-1. */
  for(i=0;i < tmap->nthreads; i++) {
    mat_kerneldatap[i].nrows = trstarts[i+1] - trstarts[i];
    mat_kerneldatap[i].adiag = a->diag + trstarts[i];
    mat_kerneldatap[i].ai = ai + trstarts[i];
    mat_kerneldatap[i].aj = aj;
    mat_kerneldatap[i].rstart = trstarts[i];
    mat_pdata[i] = &mat_kerneldatap[i];
  }
  PetscThreadsRunKernel(MatMarkDiagonal_Kernel,(void**)mat_pdata,tmap->nthreads,tmap->affinity);
  return(0);
}
/* Thread kernel: count the rows in this thread's range whose diagonal entry
   is missing (aj[adiag[i]] != row) or exactly zero. The count accumulates in
   data->nzerodiags, which the driver resets to 0 beforehand.
   NOTE(review): aj[adiag[i]] and aa[adiag[i]] are read unconditionally -- if
   adiag[i] can be an end-of-row sentinel rather than a valid in-row index,
   this may read past the row (or array); verify against the initialization
   performed by MatMarkDiagonal. */
PetscErrorCode MatFindZeroDiagonalCount_Kernel(void* arg)
{
  Mat_KernelData  *data=(Mat_KernelData*)arg;
  const PetscInt  *aj = (const PetscInt*)data->aj;
  const MatScalar *aa = (const MatScalar*)data->aa;
  const PetscInt  *adiag = (const PetscInt*)data->adiag;
  PetscInt        nrows = (PetscInt)data->nrows;
  PetscInt        i,row;
  PetscInt        rstart = (PetscInt)data->rstart;

  for(i=0;i < nrows; i++) {
    row = rstart+i;
    if((aj[adiag[i]] != row) || (aa[adiag[i]] == 0.0)) data->nzerodiags++;
  }
  return(0);
}
434: PetscErrorCode MatFindZeroDiagonals_Kernel(void* arg)
435: {
436: Mat_KernelData *data=(Mat_KernelData*)arg;
437: const PetscInt *aj = (const PetscInt*)data->aj;
438: const MatScalar *aa = (const MatScalar*)data->aa;
439: const PetscInt *adiag = (const PetscInt*)data->adiag;
440: PetscInt nrows = (PetscInt)data->nrows;
441: PetscInt i,row;
442: PetscInt rstart = (PetscInt)data->rstart,cnt=0;;
444:
445: for(i=0;i < nrows; i++) {
446: row = rstart+i;
447: if((aj[adiag[i]] != row) || (aa[adiag[i]] == 0.0)) data->zerodiags[cnt++] = row;
448: }
449: return(0);
450: }
/* MatFindZeroDiagonals_SeqAIJPThread - Builds an index set of all rows whose
   diagonal entry is absent or zero. Two parallel passes: first each thread
   counts its zero diagonals, then (after sizing the output with the counts)
   each thread writes its row indices into its reserved segment of 'rows'.
   Ownership of 'rows' passes to the IS (PETSC_OWN_POINTER). */
PetscErrorCode MatFindZeroDiagonals_SeqAIJPThread(Mat A,IS *zrows)
{
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscThreadsLayout tmap=A->rmap->tmap;
  PetscInt           *trstarts = tmap->trstarts;
  PetscInt           i,cnt=0,*rows,ctr=0;
  PetscInt           *aj = a->j,*diag;
  MatScalar          *aa = a->a;
  PetscErrorCode     ierr;

  /* Ensure a->diag holds the diagonal positions before scanning them. */
  MatMarkDiagonal_SeqAIJPThread(A);
  diag = a->diag;

  /* Find zero diagonals count */
  for(i=0;i < tmap->nthreads;i++) {
    mat_kerneldatap[i].aj = aj;
    mat_kerneldatap[i].nrows = trstarts[i+1]-trstarts[i];
    mat_kerneldatap[i].aa = aa;
    mat_kerneldatap[i].rstart = trstarts[i];
    mat_kerneldatap[i].nzerodiags = 0;
    mat_kerneldatap[i].adiag = diag + trstarts[i];
    mat_pdata[i] = &mat_kerneldatap[i];
  }
  PetscThreadsRunKernel(MatFindZeroDiagonalCount_Kernel,(void**)mat_pdata,tmap->nthreads,tmap->affinity);

  for(i=0;i < tmap->nthreads;i++) cnt += mat_kerneldatap[i].nzerodiags;
  PetscMalloc(cnt*sizeof(PetscInt),&rows);

  /* Get zero diagonals: ctr is the running prefix sum of the per-thread
     counts, giving each thread a disjoint slice of 'rows' to fill. */
  for(i=0;i < tmap->nthreads;i++) {
    mat_kerneldatap[i].aj = aj;
    mat_kerneldatap[i].nrows = trstarts[i+1]-trstarts[i];
    mat_kerneldatap[i].aa = aa;
    mat_kerneldatap[i].rstart = trstarts[i];
    mat_kerneldatap[i].zerodiags = rows + ctr;
    mat_kerneldatap[i].adiag = diag + trstarts[i];
    mat_pdata[i] = &mat_kerneldatap[i];
    ctr += mat_kerneldatap[i].nzerodiags;
  }
  PetscThreadsRunKernel(MatFindZeroDiagonals_Kernel,(void**)mat_pdata,tmap->nthreads,tmap->affinity);

  ISCreateGeneral(((PetscObject)A)->comm,cnt,rows,PETSC_OWN_POINTER,zrows);
  return(0);
}
499:
500: PetscErrorCode MatMissingDiagonal_Kernel(void* arg)
501: {
502: Mat_KernelData *data=(Mat_KernelData*)arg;
503: const PetscInt *aj = (const PetscInt*)data->aj;
504: PetscInt *adiag=(PetscInt*)data->adiag;
505: PetscInt nrows=(PetscInt)data->nrows;
506: PetscInt i,row;
507: PetscInt rstart=(PetscInt)data->rstart;
509:
510: data->missing_diag = PETSC_FALSE;
511: for(i=0; i < nrows; i++) {
512: row = rstart + i;
513: if(aj[adiag[i]] != row) {
514: data->missing_diag = PETSC_TRUE;
515: if(data->find_d) data->d = row;
516: break;
517: }
518: }
519: return(0);
520: }
/* MatMissingDiagonal_SeqAIJPThread - Reports whether any row of A lacks a
   stored diagonal entry; optionally returns the first such row in d.
   A nonempty matrix with no column array at all is trivially missing its
   diagonal. Otherwise each thread checks its rows against a->diag and the
   first thread (in thread order) reporting a miss supplies the row index. */
PetscErrorCode MatMissingDiagonal_SeqAIJPThread(Mat A, PetscBool *missing,PetscInt *d)
{
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode     ierr;
  PetscThreadsLayout tmap=A->rmap->tmap;
  PetscInt           *trstarts=tmap->trstarts;
  PetscInt           *aj=a->j,*diag,i;

  *missing = PETSC_FALSE;
  if(A->rmap->n > 0 && !aj) {
    /* No entries stored at all. */
    *missing = PETSC_TRUE;
    if (d) *d = 0;
    PetscInfo(A,"Matrix has no entries therefore is missing diagonal");
  } else {
    /* NOTE(review): a->diag is consumed here; presumably MatMarkDiagonal ran
       earlier -- confirm callers guarantee that. */
    diag = a->diag;
    for(i=0; i < tmap->nthreads; i++) {
      mat_kerneldatap[i].nrows = trstarts[i+1] - trstarts[i];
      mat_kerneldatap[i].adiag = diag + trstarts[i];
      mat_kerneldatap[i].aj = aj;
      mat_kerneldatap[i].rstart = trstarts[i];
      mat_kerneldatap[i].missing_diag = PETSC_FALSE;
      mat_kerneldatap[i].find_d = PETSC_FALSE;;
      if(d) mat_kerneldatap[i].find_d = PETSC_TRUE;
      mat_pdata[i] = &mat_kerneldatap[i];
    }
    PetscThreadsRunKernel(MatMissingDiagonal_Kernel,(void**)mat_pdata,tmap->nthreads,tmap->affinity);

    /* Take the first thread's hit so the reported row is the smallest found. */
    for(i=0;i < tmap->nthreads;i++) {
      if(mat_kerneldatap[i].missing_diag) {
        *missing = PETSC_TRUE;
        if(d) *d = mat_kerneldatap[i].d;
        PetscInfo1(A,"Matrix is missing diagonal number %D",mat_kerneldatap[i].d);
        break;
      }
    }
  }
  return(0);
}
564: PetscErrorCode MatDiagonalScale_Kernel(void* arg)
565: {
566: Mat_KernelData *data=(Mat_KernelData*)arg;
567: const PetscInt *ai = (const PetscInt*)data->ai,*aj = (const PetscInt*)data->aj;
568: MatScalar *aa = (MatScalar*)data->aa;
569: const PetscScalar *ll = (const PetscScalar*)data->x;
570: const PetscScalar *rr = (const PetscScalar*)data->y;
571: PetscInt nrows = data->nrows;
572: PetscInt i,j;
574: if(ll) {
575: for(i=0; i < nrows; i++) {
576: for(j=ai[i]; j < ai[i+1]; j++) aa[j] *= ll[i];
577: }
578: }
580: if(rr) {
581: for(i=0; i < nrows; i++) {
582: for(j=ai[i]; j < ai[i+1]; j++) aa[j] *= rr[aj[j]];
583: }
584: }
585: return(0);
586: }
/* MatDiagonalScale_SeqAIJPThread - Computes A = diag(ll)*A*diag(rr) in
   parallel; either vector may be PETSC_NULL to skip that side's scaling.
   Each thread's left-scaling slice is offset to its rows, while the full
   right-scaling array is shared (columns are global). */
PetscErrorCode MatDiagonalScale_SeqAIJPThread(Mat A,Vec ll,Vec rr)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscThreadsLayout tmap=A->rmap->tmap;
  PetscInt           *trstarts=tmap->trstarts;
  PetscScalar        *l,*r;
  MatScalar          *aa=a->a;
  PetscInt           *ai=a->i,*aj=a->j;
  PetscInt           i,m = A->rmap->n, n = A->cmap->n,nz = a->nz;

  if(ll) {
    VecGetLocalSize(ll,&m);
    if (m != A->rmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
    VecGetArray(ll,&l);
  }
  if(rr) {
    VecGetLocalSize(rr,&n);
    if (n != A->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
    VecGetArray(rr,&r);
  }

  /* x carries the (offset) left factors, y the full right factors; a NULL
     tells the kernel to skip that scaling. */
  for(i=0;i< tmap->nthreads;i++) {
    mat_kerneldatap[i].nrows = trstarts[i+1] - trstarts[i];
    mat_kerneldatap[i].ai = ai + trstarts[i];
    mat_kerneldatap[i].aj = aj;
    mat_kerneldatap[i].aa = aa;
    if(ll) mat_kerneldatap[i].x = l + trstarts[i];
    else mat_kerneldatap[i].x = PETSC_NULL;
    if(rr) mat_kerneldatap[i].y = r;
    else mat_kerneldatap[i].y = PETSC_NULL;
    mat_pdata[i] = &mat_kerneldatap[i];
  }
  PetscThreadsRunKernel(MatDiagonalScale_Kernel,(void**)mat_pdata,tmap->nthreads,tmap->affinity);

  /* One multiply per stored entry for each applied scaling. */
  if(ll) {
    VecRestoreArray(ll,&l);
    PetscLogFlops(nz);
  }
  if(rr) {
    VecRestoreArray(rr,&r);
    PetscLogFlops(nz);
  }
  /* Values changed: invalidate cached (block) inverse diagonals. */
  a->idiagvalid = PETSC_FALSE;
  a->ibdiagvalid = PETSC_FALSE;
  return(0);
}
638:
639: PetscErrorCode MatDiagonalSet_Kernel(void* arg)
640: {
641: Mat_KernelData *data=(Mat_KernelData*)arg;
642: MatScalar *aa = (MatScalar*)data->aa;
643: const PetscInt *adiag = (const PetscInt*)data->adiag;
644: PetscInt nrows = data->nrows;
645: PetscInt i;
646: PetscScalar *x = (PetscScalar*)data->x;
647: InsertMode is = data->is;
648:
649: if(is == INSERT_VALUES) {
650: for(i=0; i < nrows; i++) {
651: aa[adiag[i]] = x[i];
652: }
653: } else {
654: for(i=0;i < nrows; i++) {
655: aa[adiag[i]] += x[i];
656: }
657: }
658: return(0);
659: }
/* MatDiagonalSet_SeqAIJPThread - Inserts or adds the vector D into the
   diagonal of Y. Fast threaded path when Y is assembled and every diagonal
   entry is already present in the nonzero pattern; otherwise falls back to
   MatDiagonalSet_Default (which can introduce new entries).
   NOTE(review): the fast path returns without clearing idiagvalid/ibdiagvalid
   even though it modifies the diagonal values -- confirm whether the cached
   inverse diagonals should be invalidated there too. */
PetscErrorCode MatDiagonalSet_SeqAIJPThread(Mat Y,Vec D,InsertMode is)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJ         *aij = (Mat_SeqAIJ*)Y->data;
  PetscThreadsLayout tmap=Y->rmap->tmap;
  PetscInt           *trstarts=tmap->trstarts;
  MatScalar          *aa = aij->a;
  PetscInt           *diag;
  PetscInt           i;
  PetscBool          missing;
  PetscScalar        *v;

  if(Y->assembled) {
    MatMissingDiagonal_SeqAIJPThread(Y,&missing,PETSC_NULL);
    if(!missing) {
      /* All diagonal slots exist: update them in place, in parallel. */
      diag = aij->diag;
      VecGetArray(D,&v);
      for(i=0; i < tmap->nthreads;i++) {
        mat_kerneldatap[i].nrows = trstarts[i+1] - trstarts[i];
        mat_kerneldatap[i].aa = aa;
        mat_kerneldatap[i].adiag = diag + trstarts[i];
        mat_kerneldatap[i].x = v + trstarts[i];
        mat_kerneldatap[i].is = is;
        mat_pdata[i] = &mat_kerneldatap[i];
      }
      PetscThreadsRunKernel(MatDiagonalSet_Kernel,(void**)mat_pdata,tmap->nthreads,tmap->affinity);
      VecRestoreArray(D,&v);
      return(0);
    }
    aij->idiagvalid = PETSC_FALSE;
    aij->ibdiagvalid = PETSC_FALSE;
  }
  /* Slow path: may allocate new nonzeros for missing diagonal entries. */
  MatDiagonalSet_Default(Y,D,is);
  return(0);
}
/* MatSetUp_SeqAIJPThread - Default setup: preallocates with PETSC_DEFAULT
   nonzeros per row and no per-row nonzero counts. */
PetscErrorCode MatSetUp_SeqAIJPThread(Mat A)
{
  MatSeqAIJPThreadSetPreallocation_SeqAIJPThread(A,PETSC_DEFAULT,0);
  return(0);
}
/* MatView_SeqAIJPThread - Views the matrix; for ASCII info/factor-info
   formats, first prints the thread count, then defers to MatView_SeqAIJ. */
static PetscErrorCode MatView_SeqAIJPThread(Mat A,PetscViewer viewer)
{
  PetscBool         iascii;
  PetscViewerFormat format;

  PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii);
  PetscViewerGetFormat(viewer,&format);
  if (iascii && (format == PETSC_VIEWER_ASCII_FACTOR_INFO || format == PETSC_VIEWER_ASCII_INFO)) {
    PetscInt nthreads;
    MatGetNThreads(A,&nthreads);
    PetscViewerASCIIPrintf(viewer,"nthreads=%D\n",nthreads);
  }
  /* All remaining output is identical to the sequential AIJ viewer. */
  MatView_SeqAIJ(A,viewer);
  return(0);
}
/* MatDestroy_SeqAIJPThread - Destroys the matrix: releases the thread layout
   when no other object still references the row map, frees the shared
   kernel-argument buffers with the last live pthread matrix, then defers to
   MatDestroy_SeqAIJ for the AIJ storage itself. */
PetscErrorCode MatDestroy_SeqAIJPThread(Mat A)
{
  PetscErrorCode ierr;

  if(!A->rmap->refcnt) {
    PetscThreadsLayoutDestroy(&A->rmap->tmap);
  }

  mats_created--;
  /* Free the kernel data structure on the destruction of the last matrix */
  if(!mats_created) {
    PetscFree(mat_kerneldatap);
    PetscFree(mat_pdata);
  }
  MatDestroy_SeqAIJ(A);
  return(0);
}
757: /*@
758: MatSetNThreads - Set the number of threads to be used for matrix operations.
760: Not Collective, but it is usually desirable to use the same number of threads per process
762: Input Parameters
763: + A - the matrix
764: - nthreads - number of threads
766: Level: intermediate
768: Concepts: matrix^setting number of threads
770: .seealso: MatCreateSeqAIJPThread()
771: @*/
/* MatSetNThreads - (see manual page above) Sets the thread count in the
   matrix's thread layout, creating the layout on demand. PETSC_DECIDE uses
   all initialized worker threads (plus the main thread when it shares work),
   overridable via -mat_threads; any request above the initialized maximum is
   an error. */
PetscErrorCode MatSetNThreads(Mat A,PetscInt nthreads)
{
  PetscErrorCode     ierr;
  PetscThreadsLayout tmap=A->rmap->tmap;
  PetscInt           nworkThreads=PetscMaxThreads+PetscMainThreadShareWork; /* hard upper bound */

  /* Lazily attach a thread layout to the row map. */
  if(!tmap) {
    PetscThreadsLayoutCreate(&tmap);
    A->rmap->tmap = tmap;
  }

  if(nthreads == PETSC_DECIDE) {
    tmap->nthreads = nworkThreads;
    /* Allow a run-time override of the default. */
    PetscOptionsInt("-mat_threads","Set number of threads to be used for matrix operations","MatSetNThreads",nworkThreads,&tmap->nthreads,PETSC_NULL);
    if(tmap->nthreads > nworkThreads) {
      SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ, "Mat A: threads requested %D, Max. threads initialized %D",tmap->nthreads,nworkThreads);
    }
  } else {
    if(nthreads > nworkThreads) {
      SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ, "Mat A: threads requested %D, Max. threads initialized %D",nthreads,nworkThreads);
    }
    tmap->nthreads = nthreads;
  }

  return(0);
}
803: /*@
804: MatGetNThreads - Get the number of threads used for matrix operations.
806: Not Collective
808: Input Parameter
809: . A - the matrix
811: Output Parameter:
812: . nthreads - number of threads
814: Level: intermediate
816: Concepts: matrix^getting number of threads
818: .seealso: MatCreateSeqAIJPThread(), MatSetNThreads()
819: @*/
820: PetscErrorCode MatGetNThreads(Mat A,PetscInt *nthreads)
821: {
822: PetscThreadsLayout tmap=A->rmap->tmap;
826: if (tmap) *nthreads = tmap->nthreads - PetscMainThreadShareWork;
827: else *nthreads = 1;
828: return(0);
829: }
833: /*
834: MatSetThreadAffinities - Sets the CPU affinities of matrix threads.
836: Input Parameters
837: + A - the matrix
838: - affinities - list of cpu affinities for threads.
840: Notes:
841: Must set affinities for all the threads used with the matrix.
842: size(affinities[]) = nthreads
843: Use affinities[] = PETSC_NULL for PETSc to set the thread affinities.
845: Options Database Keys:
846: + -mat_thread_affinities - Comma separated list of thread affinities
848: Level: intermediate
850: Concepts: matrices^thread cpu affinity
852: .seealso: MatPThreadGetThreadAffinities()
853: */
/* MatSetThreadAffinities - (see comment above) Stores per-thread CPU
   affinities in the matrix's thread layout. With affinities==PETSC_NULL they
   come from -mat_thread_affinities if given, else from the global
   PetscThreadsCoreAffinities table.
   NOTE(review): tmap->affinity is malloc'd unconditionally -- confirm a prior
   allocation cannot exist here (otherwise it leaks); likewise the SETERRQ2 on
   a bad -mat_thread_affinities length returns without freeing
   thread_affinities (PETSc treats SETERRQ as fatal, so this may be accepted
   style). */
PetscErrorCode MatSetThreadAffinities(Mat A,const PetscInt affinities[])
{
  PetscErrorCode     ierr;
  PetscThreadsLayout tmap=A->rmap->tmap;
  PetscInt           nmax=PetscMaxThreads+PetscMainThreadShareWork;
  PetscBool          flg;

  if(!tmap) {
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must set the number of threads before setting thread affinities");
  }

  PetscMalloc(tmap->nthreads*sizeof(PetscInt),&tmap->affinity);

  if(affinities == PETSC_NULL) {
    PetscInt *thread_affinities;
    PetscMalloc(nmax*sizeof(PetscInt),&thread_affinities);
    PetscMemzero(thread_affinities,nmax*sizeof(PetscInt));
    /* Check if run-time option is set */
    PetscOptionsIntArray("-mat_thread_affinities","Set CPU affinity for each thread","MatSetThreadAffinities",thread_affinities,&nmax,&flg);
    if(flg) {
      if(nmax != tmap->nthreads) {
        SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Must set affinities for all threads, matrix A Threads = %D, CPU affinities set = %D",tmap->nthreads,nmax);
      }
      PetscMemcpy(tmap->affinity,thread_affinities,tmap->nthreads*sizeof(PetscInt));
    } else {
      /* Reuse the core affinities set for first s->nthreads */
      PetscMemcpy(tmap->affinity,PetscThreadsCoreAffinities,tmap->nthreads*sizeof(PetscInt));
    }
    PetscFree(thread_affinities);
  } else {
    /* Set user provided affinities */
    PetscMemcpy(tmap->affinity,affinities,tmap->nthreads*sizeof(PetscInt));
  }
  return(0);
}
892: EXTERN_C_BEGIN
/* MatSeqAIJPThreadSetPreallocation_SeqAIJPThread - Preallocates via the
   standard SeqAIJ routine, then sizes the thread layout over the row count
   and zeroes the new storage in parallel. */
PetscErrorCode MatSeqAIJPThreadSetPreallocation_SeqAIJPThread(Mat A, PetscInt nz,const PetscInt nnz[])
{
  PetscThreadsLayout tmap=A->rmap->tmap;

  MatSeqAIJSetPreallocation_SeqAIJ(A, nz, nnz);

  /* Partition the (now known) number of rows across the threads. */
  tmap->N = A->rmap->n;
  PetscThreadsLayoutSetUp(tmap);

  MatZeroEntries_SeqAIJPThread(A);
  return(0);
}
909: EXTERN_C_END
911: EXTERN_C_BEGIN
/* MatCreate_SeqAIJPThread - Type constructor for MATSEQAIJPTHREAD: builds on
   MatCreate_SeqAIJ, disables inodes, sets up the thread layout (count and
   affinities), installs the threaded operation table, redirects the
   preallocation composed function, and allocates the shared kernel-argument
   buffers with the first matrix created. */
PetscErrorCode MatCreate_SeqAIJPThread(Mat B)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJ         *b;
  PetscThreadsLayout tmap=B->rmap->tmap;

  PetscThreadsInitialize(PetscMaxThreads);
  MatCreate_SeqAIJ(B);
  b = (Mat_SeqAIJ*)B->data;
  /* Set inodes off */
  b->inode.use = PETSC_FALSE;

  if(!B->rmap->tmap) {
    PetscThreadsLayoutCreate(&B->rmap->tmap);
    tmap = B->rmap->tmap;
  }

  /* Set the number of threads */
  if(tmap->nthreads == PETSC_DECIDE) {
    MatSetNThreads(B,PETSC_DECIDE);
  }
  /* Set thread affinities */
  if(!tmap->affinity) {
    MatSetThreadAffinities(B,PETSC_NULL);
  }

  /* Override the SeqAIJ operations that have threaded implementations. */
  B->ops->destroy = MatDestroy_SeqAIJPThread;
  B->ops->mult = MatMult_SeqAIJPThread;
  B->ops->multadd = MatMultAdd_SeqAIJPThread;
  B->ops->setup = MatSetUp_SeqAIJPThread;
  B->ops->zeroentries = MatZeroEntries_SeqAIJPThread;
  B->ops->realpart = MatRealPart_SeqAIJPThread;
  B->ops->imaginarypart = MatImaginaryPart_SeqAIJPThread;
  B->ops->getdiagonal = MatGetDiagonal_SeqAIJPThread;
  B->ops->missingdiagonal = MatMissingDiagonal_SeqAIJPThread;
  B->ops->findzerodiagonals = MatFindZeroDiagonals_SeqAIJPThread;
  B->ops->diagonalscale = MatDiagonalScale_SeqAIJPThread;
  B->ops->diagonalset = MatDiagonalSet_SeqAIJPThread;
  B->ops->view = MatView_SeqAIJPThread;

  /* Remove the SeqAIJ preallocation function, then compose the pthread one. */
  PetscObjectComposeFunctionDynamic((PetscObject)B,"MatSeqAIJSetPreallocation_C","",PETSC_NULL);
  PetscObjectComposeFunctionDynamic((PetscObject)B,"MatSeqAIJSetPreallocation_C","MatSeqAIJPThreadSetPreallocation_SeqAIJPThread",MatSeqAIJPThreadSetPreallocation_SeqAIJPThread);

  /* Shared kernel-argument buffers: allocated once, freed by the destructor
     of the last MATSEQAIJPTHREAD matrix (see mats_created). */
  if(mats_created == 0) {
    PetscMalloc((PetscMaxThreads+PetscMainThreadShareWork)*sizeof(Mat_KernelData),&mat_kerneldatap);
    PetscMalloc((PetscMaxThreads+PetscMainThreadShareWork)*sizeof(Mat_KernelData*),&mat_pdata);
  }
  mats_created++;

  PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJPTHREAD);
  return(0);
}
967: EXTERN_C_END
971: /*
972: MatCreateSeqAIJPThread - Creates a sparse matrix in AIJ (compressed row) format
973: (the default parallel PETSc format) using posix threads. For good matrix assembly performance
974: the user should preallocate the matrix storage by setting the parameter nz
975: (or the array nnz). By setting these parameters accurately, performance
976: during matrix assembly can be increased by more than a factor of 50.
978: Collective on MPI_Comm
980: Input Parameters:
981: + comm - MPI communicator, set to PETSC_COMM_SELF
982: . m - number of rows
983: . n - number of columns
984: . nthreads - number of threads for matrix operations
985: . nz - number of nonzeros per row (same for all rows)
986: - nnz - array containing the number of nonzeros in the various rows
987: (possibly different for each row) or PETSC_NULL
989: Output Parameter:
990: . A - the matrix
992: It is recommended that one use the MatCreate(), MatSetSizes(), MatSetType() and/or MatSetFromOptions(),
993: MatXXXXSetPreallocation() paradigm instead of this routine directly.
994: [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
996: Notes:
997: If nnz is given then nz is ignored
999: The AIJ format (also called the Yale sparse matrix format or
1000: compressed row storage), is fully compatible with standard Fortran 77
1001: storage. That is, the stored row and column indices can begin at
1002: either one (as in Fortran) or zero. See the users' manual for details.
1004: Specify the preallocated storage with either nz or nnz (not both).
1005: Set nz=PETSC_DEFAULT and nnz=PETSC_NULL for PETSc to control dynamic memory
1006: allocation. For large problems you MUST preallocate memory or you
1007: will get TERRIBLE performance, see the users' manual chapter on matrices.
1009: By default, this format uses inodes (identical nodes) when possible, to
1010: improve numerical efficiency of matrix-vector products and solves. We
1011: search for consecutive rows with the same nonzero structure, thereby
1012: reusing matrix information to achieve increased efficiency.
1014: Options Database Keys:
1015: + -mat_no_inode - Do not use inodes
1016: - -mat_inode_limit <limit> - Sets inode limit (max limit=5)
1018: Level: intermediate
1020: .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays()
1022: */
1023: PetscErrorCode MatCreateSeqAIJPThread(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nthreads,PetscInt nz,const PetscInt nnz[],Mat *A)
1024: {
1028: MatCreate(comm,A);
1029: MatSetSizes(*A,m,n,m,n);
1030: MatSetType(*A,MATSEQAIJ);
1031: MatSetNThreads(*A,nthreads);
1032: MatSeqAIJPThreadSetPreallocation_SeqAIJPThread(*A,nz,nnz);
1033: return(0);
1034: }