Actual source code: aijAssemble.cu

petsc-3.3-p7 2013-05-11
#include "petscconf.h"
PETSC_CUDA_EXTERN_C_BEGIN
#include "../src/mat/impls/aij/seq/aij.h"
#include "petscbt.h"
#include "../src/vec/vec/impls/dvecimpl.h"
#include "petsc-private/vecimpl.h"
PETSC_CUDA_EXTERN_C_END
#undef VecType
#include "../src/mat/impls/aij/seq/seqcusp/cuspmatimpl.h"

#include <thrust/reduce.h>
#include <thrust/inner_product.h>

#include <cusp/array1d.h>
#include <cusp/print.h>
#include <cusp/coo_matrix.h>

#include <cusp/io/matrix_market.h>

#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/functional.h>

// this example illustrates how to make repeated access to a range of values
// examples:
//   repeated_range([0, 1, 2, 3], 1) -> [0, 1, 2, 3]
//   repeated_range([0, 1, 2, 3], 2) -> [0, 0, 1, 1, 2, 2, 3, 3]
//   repeated_range([0, 1, 2, 3], 3) -> [0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]
//   ...

template <typename Iterator>
class repeated_range
{
    public:

    typedef typename thrust::iterator_difference<Iterator>::type difference_type;

    struct repeat_functor : public thrust::unary_function<difference_type,difference_type>
    {
        difference_type repeats;

        repeat_functor(difference_type repeats)
            : repeats(repeats) {}

        __host__ __device__
        difference_type operator()(const difference_type& i) const
        {
            return i / repeats;
        }
    };

    typedef typename thrust::counting_iterator<difference_type>                   CountingIterator;
    typedef typename thrust::transform_iterator<repeat_functor, CountingIterator> TransformIterator;
    typedef typename thrust::permutation_iterator<Iterator,TransformIterator>     PermutationIterator;

    // type of the repeated_range iterator
    typedef PermutationIterator iterator;

    // construct repeated_range for the range [first,last)
    repeated_range(Iterator first, Iterator last, difference_type repeats)
        : first(first), last(last), repeats(repeats) {}

    iterator begin(void) const
    {
        return PermutationIterator(first, TransformIterator(CountingIterator(0), repeat_functor(repeats)));
    }

    iterator end(void) const
    {
        return begin() + repeats * (last - first);
    }

    protected:
    difference_type repeats;
    Iterator first;
    Iterator last;

};
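
// Usage sketch (an illustrative addition, not part of the original PETSc
// source): a tiny driver showing how repeated_range is consumed, mirroring
// the Thrust repeated_range example.  The name repeated_range_demo and the
// extra includes are assumptions for this sketch; it is fenced with "#if 0"
// so the translation unit is unchanged.
#if 0
#include <thrust/device_vector.h>
#include <thrust/sequence.h>
#include <thrust/copy.h>
#include <iostream>
#include <iterator>
static void repeated_range_demo(void)
{
  thrust::device_vector<int> data(4);
  thrust::sequence(data.begin(), data.end());  // data = [0, 1, 2, 3]
  // view each entry three times without materializing the 12-entry expansion
  repeated_range<thrust::device_vector<int>::iterator> r(data.begin(), data.end(), 3);
  // prints: 0 0 0 1 1 1 2 2 2 3 3 3
  thrust::copy(r.begin(), r.end(), std::ostream_iterator<int>(std::cout, " "));
}
#endif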

// this example illustrates how to repeat blocks in a range multiple times
// examples (arguments after the range are repeats, then tile_size):
//   tiled_range([0, 1, 2, 3], 2)    -> [0, 1, 2, 3, 0, 1, 2, 3]
//   tiled_range([0, 1, 2, 3], 2, 4) -> [0, 1, 2, 3, 0, 1, 2, 3]
//   tiled_range([0, 1, 2, 3], 2, 2) -> [0, 1, 0, 1, 2, 3, 2, 3]
//   tiled_range([0, 1, 2, 3], 3, 2) -> [0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3]
//   ...

template <typename Iterator>
class tiled_range
{
    public:

    typedef typename thrust::iterator_difference<Iterator>::type difference_type;

    struct tile_functor : public thrust::unary_function<difference_type,difference_type>
    {
        difference_type repeats;
        difference_type tile_size;

        tile_functor(difference_type repeats, difference_type tile_size)
            : tile_size(tile_size), repeats(repeats) {}

        __host__ __device__
        difference_type operator()(const difference_type& i) const
        {
            return tile_size * (i / (tile_size * repeats)) + i % tile_size;
        }
    };

    typedef typename thrust::counting_iterator<difference_type>                   CountingIterator;
    typedef typename thrust::transform_iterator<tile_functor, CountingIterator>   TransformIterator;
    typedef typename thrust::permutation_iterator<Iterator,TransformIterator>     PermutationIterator;

    // type of the tiled_range iterator
    typedef PermutationIterator iterator;

    // construct tiled_range for the range [first,last)
    tiled_range(Iterator first, Iterator last, difference_type repeats)
        : first(first), last(last), repeats(repeats), tile_size(last - first) {}

    tiled_range(Iterator first, Iterator last, difference_type repeats, difference_type tile_size)
        : first(first), last(last), repeats(repeats), tile_size(tile_size)
    {
      // ASSERT((last - first) % tile_size == 0)
    }

    iterator begin(void) const
    {
        return PermutationIterator(first, TransformIterator(CountingIterator(0), tile_functor(repeats, tile_size)));
    }

    iterator end(void) const
    {
        return begin() + repeats * (last - first);
    }

    protected:
    difference_type repeats;
    difference_type tile_size;
    Iterator first;
    Iterator last;
};
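
// Usage sketch (an illustrative addition, not part of the original PETSc
// source): with an explicit tile_size, each tile_size-long block is repeated
// before moving to the next block.  The name tiled_range_demo and the extra
// includes are assumptions; fenced with "#if 0" as above.
#if 0
#include <thrust/device_vector.h>
#include <thrust/sequence.h>
#include <thrust/copy.h>
#include <iostream>
#include <iterator>
static void tiled_range_demo(void)
{
  thrust::device_vector<int> data(4);
  thrust::sequence(data.begin(), data.end());  // data = [0, 1, 2, 3]
  // repeats = 2, tile_size = 2: each 2-long block appears twice
  tiled_range<thrust::device_vector<int>::iterator> t(data.begin(), data.end(), 2, 2);
  // prints: 0 1 0 1 2 3 2 3
  thrust::copy(t.begin(), t.end(), std::ostream_iterator<int>(std::cout, " "));
}
#endif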

typedef cusp::device_memory memSpace;
typedef int   IndexType;
typedef float ValueType;
typedef cusp::array1d<IndexType, memSpace> IndexArray;
typedef cusp::array1d<ValueType, memSpace> ValueArray;
typedef IndexArray::iterator IndexArrayIterator;
typedef ValueArray::iterator ValueArrayIterator;
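
// Note: ValueType is float while the incoming element matrices are
// PetscScalar, so the device copy below converts each value (and narrows
// it when PETSc is configured with double-precision scalars).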

// Ne: Number of elements
// Nl: Number of dof per element
PetscErrorCode MatSetValuesBatch_SeqAIJCUSP(Mat J, PetscInt Ne, PetscInt Nl, PetscInt *elemRows, const PetscScalar *elemMats)
{
  size_t   N  = Ne * Nl;
  size_t   No = Ne * Nl * Nl;
  PetscInt Nr; // Number of rows

  // copy elemRows and elemMats to device
  IndexArray d_elemRows(elemRows, elemRows + N);
  ValueArray d_elemMats(elemMats, elemMats + No);

  MatGetSize(J, &Nr, PETSC_NULL);
  // allocate storage for "fat" COO representation of matrix
  PetscInfo1(J, "Making COO matrix of size %d\n", Nr);
  cusp::coo_matrix<IndexType, ValueType, memSpace> COO(Nr, Nr, No);

  // repeat elemRows entries Nl times
  PetscInfo(J, "Making row indices\n");
  repeated_range<IndexArrayIterator> rowInd(d_elemRows.begin(), d_elemRows.end(), Nl);
  thrust::copy(rowInd.begin(), rowInd.end(), COO.row_indices.begin());

  // tile each Nl-long block of elemRows Nl times
  PetscInfo(J, "Making column indices\n");
  tiled_range<IndexArrayIterator> colInd(d_elemRows.begin(), d_elemRows.end(), Nl, Nl);
  thrust::copy(colInd.begin(), colInd.end(), COO.column_indices.begin());
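
  // Worked example (illustrative): for Ne = 1, Nl = 2, elemRows = [r0, r1],
  // rowInd = [r0, r0, r1, r1] and colInd = [r0, r1, r0, r1], i.e. the four
  // (row, column) pairs of the 2x2 element matrix in row-major order,
  // matching the layout of elemMats copied next.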

  // copy values from elemMats into COO structure (could be avoided)
  thrust::copy(d_elemMats.begin(), d_elemMats.end(), COO.values.begin());

  // For MPIAIJ, split this into two COO matrices, and return both
  //   Need the column map

  // print the "fat" COO representation
  if (PetscLogPrintInfo) {cusp::print(COO);}

  // sort COO format by (i,j), this is the most costly step
  PetscInfo(J, "Sorting rows and columns\n");
#if 1
  COO.sort_by_row_and_column();
#else
  {
    PetscInfo(J, "  Making permutation\n");
    IndexArray permutation(No);
    thrust::sequence(permutation.begin(), permutation.end());

    // compute permutation and sort by (I,J)
    {
        PetscInfo(J, "  Sorting columns\n");
        IndexArray temp(No);
        thrust::copy(COO.column_indices.begin(), COO.column_indices.end(), temp.begin());
        thrust::stable_sort_by_key(temp.begin(), temp.end(), permutation.begin());
        PetscInfo(J, "    Sorted columns\n");
        if (PetscLogPrintInfo) {
          for (IndexArrayIterator t_iter = temp.begin(), p_iter = permutation.begin(); t_iter != temp.end(); ++t_iter, ++p_iter) {
            // cast forces the device-to-host copy and hands varargs a plain int
            PetscInfo2(J, "%d(%d)\n", (int) *t_iter, (int) *p_iter);
          }
        }

        PetscInfo(J, "  Copying rows\n");
        //cusp::copy(COO.row_indices, temp);
        thrust::copy(COO.row_indices.begin(), COO.row_indices.end(), temp.begin());
        PetscInfo(J, "  Gathering rows\n");
        thrust::gather(permutation.begin(), permutation.end(), temp.begin(), COO.row_indices.begin());
        PetscInfo(J, "  Sorting rows\n");
        thrust::stable_sort_by_key(COO.row_indices.begin(), COO.row_indices.end(), permutation.begin());

        PetscInfo(J, "  Gathering columns\n");
        cusp::copy(COO.column_indices, temp);
        thrust::gather(permutation.begin(), permutation.end(), temp.begin(), COO.column_indices.begin());
    }

    // use permutation to reorder the values
    {
        PetscInfo(J, "  Sorting values\n");
        ValueArray temp(COO.values);
        cusp::copy(COO.values, temp);
        thrust::gather(permutation.begin(), permutation.end(), temp.begin(), COO.values.begin());
    }
  }
#endif

  // print the "fat" COO representation
  if (PetscLogPrintInfo) {cusp::print(COO);}

  // compute number of unique (i,j) entries
  //   this counts the number of changes as we move along the (i,j) list
  PetscInfo(J, "Computing number of unique entries\n");
  size_t num_entries = thrust::inner_product
    (thrust::make_zip_iterator(thrust::make_tuple(COO.row_indices.begin(), COO.column_indices.begin())),
     thrust::make_zip_iterator(thrust::make_tuple(COO.row_indices.end(),   COO.column_indices.end()))   - 1,
     thrust::make_zip_iterator(thrust::make_tuple(COO.row_indices.begin(), COO.column_indices.begin())) + 1,
     size_t(1),
     thrust::plus<size_t>(),
     thrust::not_equal_to< thrust::tuple<IndexType,IndexType> >());
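
  // Illustrative example: for sorted pairs [(0,0), (0,0), (0,1), (1,1)] the
  // adjacent-pair comparison yields 2 changes; adding the initial value 1
  // for the first entry gives num_entries = 3.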

  // allocate COO storage for final matrix
  PetscInfo(J, "Allocating compressed matrix\n");
  cusp::coo_matrix<IndexType, ValueType, memSpace> A(Nr, Nr, num_entries);

  // sum values with the same (i,j) index
  // XXX thrust::reduce_by_key is unoptimized right now, so we provide a SpMV-based one in cusp::detail
  //     the Cusp one is 2x faster, but still not optimal
  // This could possibly be done in-place
  PetscInfo(J, "Compressing matrix\n");
  thrust::reduce_by_key(thrust::make_zip_iterator(thrust::make_tuple(COO.row_indices.begin(), COO.column_indices.begin())),
     thrust::make_zip_iterator(thrust::make_tuple(COO.row_indices.end(),   COO.column_indices.end())),
     COO.values.begin(),
     thrust::make_zip_iterator(thrust::make_tuple(A.row_indices.begin(), A.column_indices.begin())),
     A.values.begin(),
     thrust::equal_to< thrust::tuple<IndexType,IndexType> >(),
     thrust::plus<ValueType>());
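
  // Illustrative example: with sorted keys [(0,0), (0,0), (0,1)] and values
  // [a, b, c], reduce_by_key emits keys [(0,0), (0,1)] and values [a+b, c],
  // summing duplicate (i,j) contributions as matrix assembly requires.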

  // print the final matrix
  if (PetscLogPrintInfo) {cusp::print(A);}

  //std::cout << "Writing matrix" << std::endl;
  //cusp::io::write_matrix_market_file(A, "A.mtx");

  PetscInfo(J, "Converting to PETSc matrix\n");
  MatSetType(J, MATSEQAIJCUSP);
  //cusp::csr_matrix<PetscInt,PetscScalar,cusp::device_memory> Jgpu;
  CUSPMATRIX *Jgpu = new CUSPMATRIX;
  cusp::convert(A, *Jgpu);
  if (PetscLogPrintInfo) {cusp::print(*Jgpu);}
  PetscInfo(J, "Copying to CPU matrix\n");
  MatCUSPCopyFromGPU(J, Jgpu);
  return(0);
}