Actual source code: aijAssemble.cu

petsc-3.6.1 2015-08-06
Report Typos and Errors
  1: #define PETSC_SKIP_COMPLEX

  3: #include <petscconf.h>
  4: PETSC_CUDA_EXTERN_C_BEGIN
  5: #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
  6: #include <petscbt.h>
  7: #include <../src/vec/vec/impls/dvecimpl.h>
  8: #include <petsc/private/vecimpl.h>
  9: PETSC_CUDA_EXTERN_C_END
 10: #undef VecType
 11: #include <../src/mat/impls/aij/seq/seqcusp/cuspmatimpl.h>

 13: #include <thrust/reduce.h>
 14: #include <thrust/inner_product.h>

 16: #include <cusp/array1d.h>
 17: #include <cusp/print.h>
 18: #include <cusp/coo_matrix.h>

 20: #include <cusp/io/matrix_market.h>

 22: #include <thrust/iterator/counting_iterator.h>
 23: #include <thrust/iterator/transform_iterator.h>
 24: #include <thrust/iterator/permutation_iterator.h>
 25: #include <thrust/functional.h>

 27: // this example illustrates how to make repeated access to a range of values
 28: // examples:
 29: //   repeated_range([0, 1, 2, 3], 1) -> [0, 1, 2, 3]
 30: //   repeated_range([0, 1, 2, 3], 2) -> [0, 0, 1, 1, 2, 2, 3, 3]
 31: //   repeated_range([0, 1, 2, 3], 3) -> [0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]
 32: //   ...

 34: template <typename Iterator>
 35: class repeated_range
 36: {
 37: public:

 39:   typedef typename thrust::iterator_difference<Iterator>::type difference_type;

 41:   struct repeat_functor : public thrust::unary_function<difference_type,difference_type>
 42:   {
 43:     difference_type repeats;

 45:     repeat_functor(difference_type repeats) : repeats(repeats) {}

 47:     __host__ __device__
 48:     difference_type operator()(const difference_type &i) const {
 49:       return i / repeats;
 50:     }
 51:   };

 53:   typedef typename thrust::counting_iterator<difference_type>                   CountingIterator;
 54:   typedef typename thrust::transform_iterator<repeat_functor, CountingIterator> TransformIterator;
 55:   typedef typename thrust::permutation_iterator<Iterator,TransformIterator>     PermutationIterator;

 57:   // type of the repeated_range iterator
 58:   typedef PermutationIterator iterator;

 60:   // construct repeated_range for the range [first,last)
 61:   repeated_range(Iterator first, Iterator last, difference_type repeats) : first(first), last(last), repeats(repeats) {}

 63:   iterator begin(void) const
 64:   {
 65:     return PermutationIterator(first, TransformIterator(CountingIterator(0), repeat_functor(repeats)));
 66:   }

 68:   iterator end(void) const
 69:   {
 70:     return begin() + repeats * (last - first);
 71:   }

 73: protected:
 74:   difference_type repeats;
 75:   Iterator        first;
 76:   Iterator        last;

 78: };

 80: // this example illustrates how to repeat blocks in a range multiple times
 81: // examples:
 82: //   tiled_range([0, 1, 2, 3], 2)    -> [0, 1, 2, 3, 0, 1, 2, 3]
 83: //   tiled_range([0, 1, 2, 3], 4, 2) -> [0, 1, 2, 3, 0, 1, 2, 3]
 84: //   tiled_range([0, 1, 2, 3], 2, 2) -> [0, 1, 0, 1, 2, 3, 2, 3]
 85: //   tiled_range([0, 1, 2, 3], 2, 3) -> [0, 1, 0, 1 0, 1, 2, 3, 2, 3, 2, 3]
 86: //   ...

 88: template <typename Iterator>
 89: class tiled_range
 90: {
 91: public:

 93:   typedef typename thrust::iterator_difference<Iterator>::type difference_type;

 95:   struct tile_functor : public thrust::unary_function<difference_type,difference_type>
 96:   {
 97:     difference_type repeats;
 98:     difference_type tile_size;

100:     tile_functor(difference_type repeats, difference_type tile_size) : tile_size(tile_size), repeats(repeats) {}

102:     __host__ __device__
103:     difference_type operator()(const difference_type &i) const {
104:       return tile_size * (i / (tile_size * repeats)) + i % tile_size;
105:     }
106:   };

108:   typedef typename thrust::counting_iterator<difference_type>                   CountingIterator;
109:   typedef typename thrust::transform_iterator<tile_functor, CountingIterator>   TransformIterator;
110:   typedef typename thrust::permutation_iterator<Iterator,TransformIterator>     PermutationIterator;

112:   // type of the tiled_range iterator
113:   typedef PermutationIterator iterator;

115:   // construct repeated_range for the range [first,last)
116:   tiled_range(Iterator first, Iterator last, difference_type repeats)
117:     : first(first), last(last), repeats(repeats), tile_size(last - first) {}

119:   tiled_range(Iterator first, Iterator last, difference_type repeats, difference_type tile_size)
120:     : first(first), last(last), repeats(repeats), tile_size(tile_size)
121:   {
122:     // ASSERT((last - first) % tile_size == 0)
123:   }

125:   iterator begin(void) const
126:   {
127:     return PermutationIterator(first, TransformIterator(CountingIterator(0), tile_functor(repeats, tile_size)));
128:   }

130:   iterator end(void) const
131:   {
132:     return begin() + repeats * (last - first);
133:   }

135: protected:
136:   difference_type repeats;
137:   difference_type tile_size;
138:   Iterator        first;
139:   Iterator        last;
140: };

142: typedef cusp::device_memory memSpace;
143: typedef int IndexType;
144: typedef PetscScalar ValueType;
145: typedef cusp::array1d<IndexType, memSpace> IndexArray;
146: typedef cusp::array1d<ValueType, memSpace> ValueArray;
147: typedef IndexArray::iterator IndexArrayIterator;
148: typedef ValueArray::iterator ValueArrayIterator;

150: // Ne: Number of elements
151: // Nl: Number of dof per element
154: PetscErrorCode MatSetValuesBatch_SeqAIJCUSP(Mat J, PetscInt Ne, PetscInt Nl, PetscInt *elemRows, const PetscScalar *elemMats)
155: {
156:   size_t         N  = Ne * Nl;
157:   size_t         No = Ne * Nl*Nl;
158:   PetscInt       Nr; // Number of rows

161:   // copy elemRows and elemMat to device
162:   IndexArray d_elemRows(elemRows, elemRows + N);
163:   ValueArray d_elemMats(elemMats, elemMats + No);

166:   MatGetSize(J, &Nr, NULL);
167:   // allocate storage for "fat" COO representation of matrix
168:   PetscInfo1(J, "Making COO matrix of size %d\n", Nr);
169:   cusp::coo_matrix<IndexType,ValueType, memSpace> COO(Nr, Nr, No);

171:   // repeat elemRows entries Nl times
172:   PetscInfo(J, "Making row indices\n");
173:   repeated_range<IndexArrayIterator> rowInd(d_elemRows.begin(), d_elemRows.end(), Nl);
174:   thrust::copy(rowInd.begin(), rowInd.end(), COO.row_indices.begin());

176:   // tile rows of elemRows Nl times
177:   PetscInfo(J, "Making column indices\n");
178:   tiled_range<IndexArrayIterator> colInd(d_elemRows.begin(), d_elemRows.end(), Nl, Nl);
179:   thrust::copy(colInd.begin(), colInd.end(), COO.column_indices.begin());

181:   // copy values from elemMats into COO structure (could be avoided)
182:   thrust::copy(d_elemMats.begin(), d_elemMats.end(), COO.values.begin());

184:   // For MPIAIJ, split this into two COO matrices, and return both
185:   //   Need the column map

187:   // print the "fat" COO representation
188: #if !defined(PETSC_USE_COMPLEX)
189:   if (PetscLogPrintInfo) cusp::print(COO);
190: #endif
191:   // sort COO format by (i,j), this is the most costly step
192:   PetscInfo(J, "Sorting rows and columns\n");
193: #if 1
194:   COO.sort_by_row_and_column();
195: #else
196:   {
197:     PetscInfo(J, "  Making permutation\n");
198:     IndexArray permutation(No);
199:     thrust::sequence(permutation.begin(), permutation.end());

201:     // compute permutation and sort by (I,J)
202:     {
203:       PetscInfo(J, "  Sorting columns\n");
204:       IndexArray temp(No);
205:       thrust::copy(COO.column_indices.begin(), COO.column_indices.end(), temp.begin());
206:       thrust::stable_sort_by_key(temp.begin(), temp.end(), permutation.begin());
207:       PetscInfo(J, "    Sorted columns\n");
208:       if (PetscLogPrintInfo) {
209:         for (IndexArrayIterator t_iter = temp.begin(), p_iter = permutation.begin(); t_iter != temp.end(); ++t_iter, ++p_iter) {
210:           PetscInfo2(J, "%d(%d)\n", *t_iter, *p_iter);
211:         }
212:       }

214:       PetscInfo(J, "  Copying rows\n");
215:       //cusp::copy(COO.row_indices, temp);
216:       thrust::copy(COO.row_indices.begin(), COO.row_indices.end(), temp.begin());
217:       PetscInfo(J, "  Gathering rows\n");
218:       thrust::gather(permutation.begin(), permutation.end(), temp.begin(), COO.row_indices.begin());
219:       PetscInfo(J, "  Sorting rows\n");
220:       thrust::stable_sort_by_key(COO.row_indices.begin(), COO.row_indices.end(), permutation.begin());

222:       PetscInfo(J, "  Gathering columns\n");
223:       cusp::copy(COO.column_indices, temp);
224:       thrust::gather(permutation.begin(), permutation.end(), temp.begin(), COO.column_indices.begin());
225:     }

227:     // use permutation to reorder the values
228:     {
229:       PetscInfo(J, "  Sorting values\n");
230:       ValueArray temp(COO.values);
231:       cusp::copy(COO.values, temp);
232:       thrust::gather(permutation.begin(), permutation.end(), temp.begin(), COO.values.begin());
233:     }
234:   }
235: #endif

237:   // print the "fat" COO representation
238: #if !defined(PETSC_USE_COMPLEX)
239:   if (PetscLogPrintInfo) cusp::print(COO);
240: #endif
241:   // compute number of unique (i,j) entries
242:   //   this counts the number of changes as we move along the (i,j) list
243:   PetscInfo(J, "Computing number of unique entries\n");
244:   size_t num_entries = thrust::inner_product
245:                          (thrust::make_zip_iterator(thrust::make_tuple(COO.row_indices.begin(), COO.column_indices.begin())),
246:                          thrust::make_zip_iterator(thrust::make_tuple(COO.row_indices.end (),  COO.column_indices.end()))   - 1,
247:                          thrust::make_zip_iterator(thrust::make_tuple(COO.row_indices.begin(), COO.column_indices.begin())) + 1,
248:                          size_t(1),
249:                          thrust::plus<size_t>(),
250:                          thrust::not_equal_to< thrust::tuple<IndexType,IndexType> >());

252:   // allocate COO storage for final matrix
253:   PetscInfo(J, "Allocating compressed matrix\n");
254:   cusp::coo_matrix<IndexType, ValueType, memSpace> A(Nr, Nr, num_entries);

256:   // sum values with the same (i,j) index
257:   // XXX thrust::reduce_by_key is unoptimized right now, so we provide a SpMV-based one in cusp::detail
258:   //     the Cusp one is 2x faster, but still not optimal
259:   // This could possibly be done in-place
260:   PetscInfo(J, "Compressing matrix\n");
261:   thrust::reduce_by_key(thrust::make_zip_iterator(thrust::make_tuple(COO.row_indices.begin(), COO.column_indices.begin())),
262:                         thrust::make_zip_iterator(thrust::make_tuple(COO.row_indices.end(),   COO.column_indices.end())),
263:                         COO.values.begin(),
264:                         thrust::make_zip_iterator(thrust::make_tuple(A.row_indices.begin(), A.column_indices.begin())),
265:                         A.values.begin(),
266:                         thrust::equal_to< thrust::tuple<IndexType,IndexType> >(),
267:                         thrust::plus<ValueType>());

269:   // print the final matrix
270: #if !defined(PETSC_USE_COMPLEX)
271:   if (PetscLogPrintInfo) cusp::print(A);
272: #endif
273:   //std::cout << "Writing matrix" << std::endl;
274:   //cusp::io::write_matrix_market_file(A, "A.mtx");

276:   PetscInfo(J, "Converting to PETSc matrix\n");
277:   MatSetType(J, MATSEQAIJCUSP);
278:   //cusp::csr_matrix<PetscInt,PetscScalar,cusp::device_memory> Jgpu;
279:   CUSPMATRIX *Jgpu = new CUSPMATRIX;
280:   cusp::convert(A, *Jgpu);
281: #if !defined(PETSC_USE_COMPLEX)
282:   if (PetscLogPrintInfo) cusp::print(*Jgpu);
283: #endif
284:   PetscInfo(J, "Copying to CPU matrix\n");
285:   MatCUSPCopyFromGPU(J, Jgpu);
286:   return(0);
287: }