Actual source code: baijsolvnat4.c
1: #include <../src/mat/impls/baij/seq/baij.h>
2: #include <petsc/private/kernels/blockinvert.h>
4: /*
5: Special case where the matrix was ILU(0) factored in the natural
6: ordering. This eliminates the need for the column and row permutation.
7: */
8: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
9: {
10: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
11: PetscInt n = a->mbs;
12: const PetscInt *ai = a->i, *aj = a->j;
13: const PetscInt *diag = a->diag;
14: const MatScalar *aa = a->a;
15: PetscScalar *x;
16: const PetscScalar *b;
18: PetscFunctionBegin;
19: PetscCall(VecGetArrayRead(bb, &b));
20: PetscCall(VecGetArray(xx, &x));
22: #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
23: {
24: static PetscScalar w[2000]; /* very BAD need to fix */
25: fortransolvebaij4_(&n, x, ai, aj, diag, aa, b, w);
26: }
27: #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
28: fortransolvebaij4unroll_(&n, x, ai, aj, diag, aa, b);
29: #else
30: {
31: PetscScalar s1, s2, s3, s4, x1, x2, x3, x4;
32: const MatScalar *v;
33: PetscInt jdx, idt, idx, nz, i, ai16;
34: const PetscInt *vi;
36: /* forward solve the lower triangular */
37: idx = 0;
38: x[0] = b[0];
39: x[1] = b[1];
40: x[2] = b[2];
41: x[3] = b[3];
42: for (i = 1; i < n; i++) {
43: v = aa + 16 * ai[i];
44: vi = aj + ai[i];
45: nz = diag[i] - ai[i];
46: idx += 4;
47: s1 = b[idx];
48: s2 = b[1 + idx];
49: s3 = b[2 + idx];
50: s4 = b[3 + idx];
51: while (nz--) {
52: jdx = 4 * (*vi++);
53: x1 = x[jdx];
54: x2 = x[1 + jdx];
55: x3 = x[2 + jdx];
56: x4 = x[3 + jdx];
57: s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
58: s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
59: s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
60: s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
61: v += 16;
62: }
63: x[idx] = s1;
64: x[1 + idx] = s2;
65: x[2 + idx] = s3;
66: x[3 + idx] = s4;
67: }
68: /* backward solve the upper triangular */
69: idt = 4 * (n - 1);
70: for (i = n - 1; i >= 0; i--) {
71: ai16 = 16 * diag[i];
72: v = aa + ai16 + 16;
73: vi = aj + diag[i] + 1;
74: nz = ai[i + 1] - diag[i] - 1;
75: s1 = x[idt];
76: s2 = x[1 + idt];
77: s3 = x[2 + idt];
78: s4 = x[3 + idt];
79: while (nz--) {
80: idx = 4 * (*vi++);
81: x1 = x[idx];
82: x2 = x[1 + idx];
83: x3 = x[2 + idx];
84: x4 = x[3 + idx];
85: s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
86: s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
87: s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
88: s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
89: v += 16;
90: }
91: v = aa + ai16;
92: x[idt] = v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4;
93: x[1 + idt] = v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4;
94: x[2 + idt] = v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4;
95: x[3 + idt] = v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4;
96: idt -= 4;
97: }
98: }
99: #endif
101: PetscCall(VecRestoreArrayRead(bb, &b));
102: PetscCall(VecRestoreArray(xx, &x));
103: PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n));
104: PetscFunctionReturn(PETSC_SUCCESS);
105: }
107: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A, Vec bb, Vec xx)
108: {
109: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
110: const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
111: PetscInt i, k, nz, idx, jdx, idt;
112: const PetscInt bs = A->rmap->bs, bs2 = a->bs2;
113: const MatScalar *aa = a->a, *v;
114: PetscScalar *x;
115: const PetscScalar *b;
116: PetscScalar s1, s2, s3, s4, x1, x2, x3, x4;
118: PetscFunctionBegin;
119: PetscCall(VecGetArrayRead(bb, &b));
120: PetscCall(VecGetArray(xx, &x));
121: /* forward solve the lower triangular */
122: idx = 0;
123: x[0] = b[idx];
124: x[1] = b[1 + idx];
125: x[2] = b[2 + idx];
126: x[3] = b[3 + idx];
127: for (i = 1; i < n; i++) {
128: v = aa + bs2 * ai[i];
129: vi = aj + ai[i];
130: nz = ai[i + 1] - ai[i];
131: idx = bs * i;
132: s1 = b[idx];
133: s2 = b[1 + idx];
134: s3 = b[2 + idx];
135: s4 = b[3 + idx];
136: for (k = 0; k < nz; k++) {
137: jdx = bs * vi[k];
138: x1 = x[jdx];
139: x2 = x[1 + jdx];
140: x3 = x[2 + jdx];
141: x4 = x[3 + jdx];
142: s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
143: s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
144: s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
145: s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
147: v += bs2;
148: }
150: x[idx] = s1;
151: x[1 + idx] = s2;
152: x[2 + idx] = s3;
153: x[3 + idx] = s4;
154: }
156: /* backward solve the upper triangular */
157: for (i = n - 1; i >= 0; i--) {
158: v = aa + bs2 * (adiag[i + 1] + 1);
159: vi = aj + adiag[i + 1] + 1;
160: nz = adiag[i] - adiag[i + 1] - 1;
161: idt = bs * i;
162: s1 = x[idt];
163: s2 = x[1 + idt];
164: s3 = x[2 + idt];
165: s4 = x[3 + idt];
167: for (k = 0; k < nz; k++) {
168: idx = bs * vi[k];
169: x1 = x[idx];
170: x2 = x[1 + idx];
171: x3 = x[2 + idx];
172: x4 = x[3 + idx];
173: s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
174: s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
175: s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
176: s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
178: v += bs2;
179: }
180: /* x = inv_diagonal*x */
181: x[idt] = v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4;
182: x[1 + idt] = v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4;
183: x[2 + idt] = v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4;
184: x[3 + idt] = v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4;
185: }
187: PetscCall(VecRestoreArrayRead(bb, &b));
188: PetscCall(VecRestoreArray(xx, &x));
189: PetscCall(PetscLogFlops(2.0 * bs2 * (a->nz) - bs * A->cmap->n));
190: PetscFunctionReturn(PETSC_SUCCESS);
191: }