4 #if __CUDACC_VER_MAJOR__ >= 11
7 #include <namd_cub/cub.cuh>
12 #ifdef NAMD_HIP //NAMD_HIP
13 #include <hip/hip_runtime.h>
14 #include <hipcub/hipcub.hpp>
18 #include "HipDefines.h"
19 #include "SequencerCUDAKernel.h"
20 #include "MShakeKernel.h"
22 #ifdef NODEGROUP_FORCE_REGISTER
24 #define OVERALLOC 2.0f
26 void NAMD_die(const char *);
29 __constant__ double c_pairlistTrigger;
30 __constant__ double c_pairlistGrow;
31 __constant__ double c_pairlistShrink;
33 __device__ void reset_atomic_counter(unsigned int *counter) {
39 template <int T_MAX_FORCE_NUMBER, int T_DOGLOBAL, int T_DOCUDAGLOBAL>
40 __global__ void accumulateForceToSOAKernelMGPU(
42 CudaLocalRecord* localRecords,
43 const double * __restrict f_bond,
44 const double * __restrict f_bond_nbond,
45 const double * __restrict f_bond_slow,
47 const float4 * __restrict f_nbond,
48 const float4 * __restrict f_nbond_slow,
49 const CudaForce* __restrict f_slow,
50 double * __restrict d_f_global_x,
51 double * __restrict d_f_global_y,
52 double * __restrict d_f_global_z,
53 double * __restrict d_f_normal_x,
54 double * __restrict d_f_normal_y,
55 double * __restrict d_f_normal_z,
56 double * __restrict d_f_nbond_x,
57 double * __restrict d_f_nbond_y,
58 double * __restrict d_f_nbond_z,
59 double * __restrict d_f_slow_x,
60 double * __restrict d_f_slow_y,
61 double * __restrict d_f_slow_z,
62 const int * __restrict patchUnsortOrder,
63 const Lattice lattice,
64 unsigned int** __restrict deviceQueues,
65 unsigned int* __restrict queueCounters,
66 unsigned int* __restrict tbcatomic
68 __shared__ CudaLocalRecord s_record;
69 using AccessType = int32_t;
70 AccessType* s_record_buffer = (AccessType*) &s_record;
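// The per-patch CudaLocalRecord is staged into shared memory cooperatively: each
// thread copies 32-bit chunks of the record, so the patch metadata is read from
// global memory only once per block. This assumes sizeof(CudaLocalRecord) is a
// multiple of sizeof(AccessType), presumably followed by a __syncthreads() before
// s_record is read.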
72 for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
73 // Read in the CudaLocalRecord using multiple threads. This should
75 for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
76 s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
80 const int numAtoms = s_record.numAtoms;
81 const int offset = s_record.bufferOffset;
82 const int offsetNB = s_record.bufferOffsetNBPad;
84 double f_slow_x, f_slow_y, f_slow_z;
85 double f_nbond_x, f_nbond_y, f_nbond_z;
86 double f_normal_x, f_normal_y, f_normal_z;
88 for (int i = threadIdx.x; i < numAtoms; i += blockDim.x) {
89 if (T_MAX_FORCE_NUMBER >= 1) {
90 int unorder = patchUnsortOrder[offset + i];
91 // gather from sorted nonbonded force array
92 float4 fnbond = f_nbond[offsetNB + unorder];
93 // set (medium) nonbonded force accumulators
94 f_nbond_x = (double)fnbond.x;
95 f_nbond_y = (double)fnbond.y;
96 f_nbond_z = (double)fnbond.z;
97 if (T_MAX_FORCE_NUMBER == 2) {
98 // gather from sorted nonbonded slow force array
99 float4 fnbslow = f_nbond_slow[offsetNB + unorder];
100 // accumulate slow force contributions from nonbonded calculation
101 f_slow_x = (double)fnbslow.x;
102 f_slow_y = (double)fnbslow.y;
103 f_slow_z = (double)fnbslow.z;
106 // gather from strided bond force array
107 // set (fast) normal force accumulators
108 if(T_DOGLOBAL || T_DOCUDAGLOBAL) {
109 // Global forces are stored in d_f_global; add them to the bonded force
110 f_normal_x = d_f_global_x[offset + i] + f_bond[offset + i];
111 f_normal_y = d_f_global_y[offset + i] + f_bond[offset + i + forceStride];
112 f_normal_z = d_f_global_z[offset + i] + f_bond[offset + i + 2*forceStride];
113 if (T_DOCUDAGLOBAL) {
114 d_f_global_x[offset + i] = 0;
115 d_f_global_y[offset + i] = 0;
116 d_f_global_z[offset + i] = 0;
120 f_normal_x = f_bond[offset + i];
121 f_normal_y = f_bond[offset + i + forceStride];
122 f_normal_z = f_bond[offset + i + 2*forceStride];
124 if (T_MAX_FORCE_NUMBER >= 1) {
125 // gather from strided bond nonbonded force array
126 // accumulate (medium) nonbonded force accumulators
127 f_nbond_x += f_bond_nbond[offset + i];
128 f_nbond_y += f_bond_nbond[offset + i + forceStride];
129 f_nbond_z += f_bond_nbond[offset + i + 2*forceStride];
130 if (T_MAX_FORCE_NUMBER == 2) {
131 // gather from strided bond slow force array
132 // accumulate slow force accumulators
133 f_slow_x += f_bond_slow[offset + i];
134 f_slow_y += f_bond_slow[offset + i + forceStride];
135 f_slow_z += f_bond_slow[offset + i + 2*forceStride];
138 // set normal, nonbonded, and slow SOA force buffers
139 d_f_normal_x[offset + i] = f_normal_x;
140 d_f_normal_y[offset + i] = f_normal_y;
141 d_f_normal_z[offset + i] = f_normal_z;
142 if (T_MAX_FORCE_NUMBER >= 1) {
143 d_f_nbond_x[offset + i] = f_nbond_x;
144 d_f_nbond_y[offset + i] = f_nbond_y;
145 d_f_nbond_z[offset + i] = f_nbond_z;
146 if (T_MAX_FORCE_NUMBER == 2){
147 d_f_slow_x[offset + i] = f_slow_x;
148 d_f_slow_y[offset + i] = f_slow_y;
149 d_f_slow_z[offset + i] = f_slow_z;
157 // DMC This was in the accumulateForceToSOAKernelMGPU (commented out)
160 if(threadIdx.x == 0){
161 // Need another value here
162 unsigned int value = atomicInc(&tbcatomic[4], gridDim.x);
163 isLastBlockDone = (value == (gridDim.x - 1));
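// atomicInc returns the counter's previous value, so only the last block to
// reach this point sees gridDim.x - 1; that block is the one that goes on to
// notify the peer devices through their queues below.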
167 // thread0 flags everything
168 if(threadIdx.x == 0){
169 for(int i = 0; i < nDev; i++ ){
170 if (i == devID) continue;
171 unsigned int value = atomicInc(& (queueCounters[i]), nDev);
172 printf(" Device[%d] queue[%d] pos[%d]\n", devID, i, value);
174 deviceQueues[i][value] = devID; // flags in other device's queue that we're ready for processing
175 __threadfence_system();
177 // sets tcbCounter back to zero
179 printf(" GPU[%d] finished accumulating SOA buffers\n", devID);
184 template <int MAX_FORCE_NUMBER, int T_DOGLOBAL, int T_DOCUDAGLOBAL>
185 __global__ void accumulateForceToSOAKernel(
186 const int numPatches,
187 CudaLocalRecord* localRecords,
188 const double * __restrict f_bond,
189 const double * __restrict f_bond_nbond,
190 const double * __restrict f_bond_slow,
192 const float4 * __restrict f_nbond,
193 const float4 * __restrict f_nbond_slow,
194 const CudaForce* __restrict f_slow,
195 double * __restrict d_f_global_x,
196 double * __restrict d_f_global_y,
197 double * __restrict d_f_global_z,
198 double * __restrict d_f_normal_x,
199 double * __restrict d_f_normal_y,
200 double * __restrict d_f_normal_z,
201 double * __restrict d_f_nbond_x,
202 double * __restrict d_f_nbond_y,
203 double * __restrict d_f_nbond_z,
204 double * __restrict d_f_slow_x,
205 double * __restrict d_f_slow_y,
206 double * __restrict d_f_slow_z,
207 const int * __restrict patchUnsortOrder,
208 const Lattice lattice
210 __shared__ CudaLocalRecord s_record;
211 using AccessType = int32_t;
212 AccessType* s_record_buffer = (AccessType*) &s_record;
214 for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
215 // Read in the CudaLocalRecord using multiple threads. This should
217 for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
218 s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
222 const int numAtoms = s_record.numAtoms;
223 const int offset = s_record.bufferOffset;
224 const int offsetNB = s_record.bufferOffsetNBPad;
226 double f_slow_x, f_slow_y, f_slow_z;
227 double f_nbond_x, f_nbond_y, f_nbond_z;
228 double f_normal_x, f_normal_y, f_normal_z;
230 for (int i = threadIdx.x; i < numAtoms; i += blockDim.x) {
231 if (MAX_FORCE_NUMBER == 2) {
232 CudaForce f = f_slow[offset + i];
233 double3 f_scaled = lattice.scale_force(Vector((double)f.x, (double)f.y, (double)f.z));
234 // set slow force accumulators
235 f_slow_x = f_scaled.x;
236 f_slow_y = f_scaled.y;
237 f_slow_z = f_scaled.z;
239 if (MAX_FORCE_NUMBER >= 1) {
240 int unorder = patchUnsortOrder[offset + i];
241 // gather from sorted nonbonded force array
242 float4 fnbond = f_nbond[offsetNB + unorder];
243 // set (medium) nonbonded force accumulators
244 f_nbond_x = (double)fnbond.x;
245 f_nbond_y = (double)fnbond.y;
246 f_nbond_z = (double)fnbond.z;
247 if (MAX_FORCE_NUMBER == 2) {
248 // gather from sorted nonbonded slow force array
249 float4 fnbslow = f_nbond_slow[offsetNB + unorder];
250 // accumulate slow force contributions from nonbonded calculation
251 f_slow_x += (double)fnbslow.x;
252 f_slow_y += (double)fnbslow.y;
253 f_slow_z += (double)fnbslow.z;
256 // gather from strided bond force array
257 // set (fast) normal force accumulators
258 if(T_DOGLOBAL || T_DOCUDAGLOBAL) {
259 // Global forces are stored in d_f_global; add them to the bonded force
260 f_normal_x = d_f_global_x[offset + i] + f_bond[offset + i];
261 f_normal_y = d_f_global_y[offset + i] + f_bond[offset + i + forceStride];
262 f_normal_z = d_f_global_z[offset + i] + f_bond[offset + i + 2*forceStride];
263 if (T_DOCUDAGLOBAL) {
264 d_f_global_x[offset + i] = 0;
265 d_f_global_y[offset + i] = 0;
266 d_f_global_z[offset + i] = 0;
271 f_normal_x = f_bond[offset + i];
272 f_normal_y = f_bond[offset + i + forceStride];
273 f_normal_z = f_bond[offset + i + 2*forceStride];
276 if (MAX_FORCE_NUMBER >= 1) {
277 // gather from strided bond nonbonded force array
278 // accumulate (medium) nonbonded force accumulators
279 f_nbond_x += f_bond_nbond[offset + i];
280 f_nbond_y += f_bond_nbond[offset + i + forceStride];
281 f_nbond_z += f_bond_nbond[offset + i + 2*forceStride];
282 if (MAX_FORCE_NUMBER == 2) {
283 // gather from strided bond slow force array
284 // accumulate slow force accumulators
285 f_slow_x += f_bond_slow[offset + i];
286 f_slow_y += f_bond_slow[offset + i + forceStride];
287 f_slow_z += f_bond_slow[offset + i + 2*forceStride];
290 // set normal, nonbonded, and slow SOA force buffers
291 d_f_normal_x[offset + i] = f_normal_x;
292 d_f_normal_y[offset + i] = f_normal_y;
293 d_f_normal_z[offset + i] = f_normal_z;
294 if (MAX_FORCE_NUMBER >= 1) {
295 d_f_nbond_x[offset + i] = f_nbond_x;
296 d_f_nbond_y[offset + i] = f_nbond_y;
297 d_f_nbond_z[offset + i] = f_nbond_z;
298 if (MAX_FORCE_NUMBER == 2) {
299 d_f_slow_x[offset + i] = f_slow_x;
300 d_f_slow_y[offset + i] = f_slow_y;
301 d_f_slow_z[offset + i] = f_slow_z;
309 __global__ void accumulatePMEForces(
311 const CudaForce* f_slow,
315 const int* patchOffsets,
318 int tid = threadIdx.x + (blockDim.x * blockIdx.x);
319 double f_slow_x, f_slow_y, f_slow_z;
322 CudaForce f = f_slow[tid];
324 double3 f_scaled = lat.scale_force(Vector((double)f.x, (double)f.y, (double)f.z));
325 f_slow_x = f_scaled.x;
326 f_slow_y = f_scaled.y;
327 f_slow_z = f_scaled.z;
329 d_f_slow_x[tid] += f_slow_x;
330 d_f_slow_y[tid] += f_slow_y;
331 d_f_slow_z[tid] += f_slow_z;
335 void SequencerCUDAKernel::accumulateForceToSOA(
337 const int doCudaGlobal,
338 const int maxForceNumber,
339 const int numPatches,
341 CudaLocalRecord* localRecords,
342 const double* f_bond,
343 const double* f_bond_nbond,
344 const double* f_bond_slow,
346 const float4* f_nbond,
347 const float4* f_nbond_slow,
348 const CudaForce* f_slow,
349 double* d_f_global_x,
350 double* d_f_global_y,
351 double* d_f_global_z,
352 double* d_f_normal_x,
353 double* d_f_normal_y,
354 double* d_f_normal_z,
361 const int* patchUnsortOrder,
362 const Lattice lattice,
363 unsigned int** deviceQueues,
364 unsigned int* queueCounters,
365 unsigned int* tbcatomic,
368 // ASSERT( 0 <= maxForceNumber && maxForceNumber <= 2 );
369 const int options = (maxForceNumber << 2) + (doGlobal << 1) + doCudaGlobal;
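// The three runtime flags are packed into one integer so a single switch can pick
// the matching template instantiation. For example, maxForceNumber = 2,
// doGlobal = 1, doCudaGlobal = 0 gives options = (2<<2) + (1<<1) + 0 = 10, which
// matches case ((2<<2) + (1<<1) + (0<<0)) and expands to CALL_MGPU(2, 1, 0).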
371 #define CALL_MGPU(T_MAX_FORCE_NUMBER, T_DOGLOBAL, T_DOCUDAGLOBAL) \
372 accumulateForceToSOAKernelMGPU<T_MAX_FORCE_NUMBER, T_DOGLOBAL, T_DOCUDAGLOBAL> \
373 <<<numPatches, PATCH_BLOCKS, 0, stream>>> \
374 ( numPatches, localRecords, \
375 f_bond, f_bond_nbond, f_bond_slow, forceStride, \
376 f_nbond, f_nbond_slow, f_slow, \
377 d_f_global_x, d_f_global_y, d_f_global_z, \
378 d_f_normal_x, d_f_normal_y, d_f_normal_z, \
379 d_f_nbond_x, d_f_nbond_y, d_f_nbond_z, \
380 d_f_slow_x, d_f_slow_y, d_f_slow_z, \
381 patchUnsortOrder, lattice, \
382 deviceQueues, queueCounters, tbcatomic \
385 case ((0<<2) + (0<<1) + (0<<0)): CALL_MGPU(0, 0, 0); break;
386 case ((0<<2) + (0<<1) + (1<<0)): CALL_MGPU(0, 0, 1); break;
387 case ((0<<2) + (1<<1) + (1<<0)): CALL_MGPU(0, 1, 1); break;
388 case ((0<<2) + (1<<1) + (0<<0)): CALL_MGPU(0, 1, 0); break;
389 case ((1<<2) + (0<<1) + (0<<0)): CALL_MGPU(1, 0, 0); break;
390 case ((1<<2) + (0<<1) + (1<<0)): CALL_MGPU(1, 0, 1); break;
391 case ((1<<2) + (1<<1) + (1<<0)): CALL_MGPU(1, 1, 1); break;
392 case ((1<<2) + (1<<1) + (0<<0)): CALL_MGPU(1, 1, 0); break;
393 case ((2<<2) + (0<<1) + (0<<0)): CALL_MGPU(2, 0, 0); break;
394 case ((2<<2) + (0<<1) + (1<<0)): CALL_MGPU(2, 0, 1); break;
395 case ((2<<2) + (1<<1) + (1<<0)): CALL_MGPU(2, 1, 1); break;
396 case ((2<<2) + (1<<1) + (0<<0)): CALL_MGPU(2, 1, 0); break;
398 const std::string error = "Unsupported options in SequencerCUDAKernel::accumulateForceToSOA, no kernel is called. Options are maxForceNumber = " + std::to_string(maxForceNumber) + ", doGlobal = " + std::to_string(doGlobal) + ", doCudaGlobal = " + std::to_string(doCudaGlobal) + "\n";
399 NAMD_bug(error.c_str());
404 #define CALL(T_MAX_FORCE_NUMBER, T_DOGLOBAL, T_DOCUDAGLOBAL) \
405 accumulateForceToSOAKernel<T_MAX_FORCE_NUMBER, T_DOGLOBAL, T_DOCUDAGLOBAL> \
406 <<<numPatches, PATCH_BLOCKS, 0, stream>>> \
407 ( numPatches, localRecords, \
408 f_bond, f_bond_nbond, f_bond_slow, forceStride, \
409 f_nbond, f_nbond_slow, f_slow, \
410 d_f_global_x, d_f_global_y, d_f_global_z, \
411 d_f_normal_x, d_f_normal_y, d_f_normal_z, \
412 d_f_nbond_x, d_f_nbond_y, d_f_nbond_z, \
413 d_f_slow_x, d_f_slow_y, d_f_slow_z, \
414 patchUnsortOrder, lattice \
417 case ((0<<2) + (0<<1) + (0<<0)): CALL(0, 0, 0); break;
418 case ((0<<2) + (0<<1) + (1<<0)): CALL(0, 0, 1); break;
419 case ((0<<2) + (1<<1) + (1<<0)): CALL(0, 1, 1); break;
420 case ((0<<2) + (1<<1) + (0<<0)): CALL(0, 1, 0); break;
421 case ((1<<2) + (0<<1) + (0<<0)): CALL(1, 0, 0); break;
422 case ((1<<2) + (0<<1) + (1<<0)): CALL(1, 0, 1); break;
423 case ((1<<2) + (1<<1) + (1<<0)): CALL(1, 1, 1); break;
424 case ((1<<2) + (1<<1) + (0<<0)): CALL(1, 1, 0); break;
425 case ((2<<2) + (0<<1) + (0<<0)): CALL(2, 0, 0); break;
426 case ((2<<2) + (0<<1) + (1<<0)): CALL(2, 0, 1); break;
427 case ((2<<2) + (1<<1) + (1<<0)): CALL(2, 1, 1); break;
428 case ((2<<2) + (1<<1) + (0<<0)): CALL(2, 1, 0); break;
430 const std::string error = "Unsupported options in SequencerCUDAKernel::accumulateForceToSOA, no kernel is called. Options are maxForceNumber = " + std::to_string(maxForceNumber) + ", doGlobal = " + std::to_string(doGlobal) + ", doCudaGlobal = " + std::to_string(doCudaGlobal) + "\n";
431 NAMD_bug(error.c_str());
438 template <int T_MAX_FORCE_NUMBER>
439 __global__ void mergeForcesFromPeersKernel(
440 const int numPatches,
441 const int devID, // GPU ID
442 CudaLocalRecord* localRecords,
443 CudaPeerRecord* peerRecords,
445 double** __restrict f_normal_x,
446 double** __restrict f_normal_y,
447 double** __restrict f_normal_z,
448 double** __restrict f_nbond_x,
449 double** __restrict f_nbond_y,
450 double** __restrict f_nbond_z,
451 double** __restrict f_slow_x,
452 double** __restrict f_slow_y,
453 double** __restrict f_slow_z,
454 const CudaForce* __restrict pmeForces
456 __shared__ CudaLocalRecord s_record;
457 using AccessType = int32_t;
458 AccessType* s_record_buffer = (AccessType*) &s_record;
460 for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
461 // Read in the CudaLocalRecord using multiple threads. This should
463 for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
464 s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
468 const int numAtoms = s_record.numAtoms;
469 const int dstOffset = s_record.bufferOffset;
470 const int numPeerRecords = s_record.numPeerRecord;
471 const int peerRecordStartIndexs = s_record.peerRecordStartIndex;
473 const int inlinePeers = min(numPeerRecords, CudaLocalRecord::num_inline_peer);
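// Up to CudaLocalRecord::num_inline_peer peer records are stored inline in the
// record (already resident in shared memory); any remaining peers are read from
// the separate peerRecords array in the second loop below.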
475 for (int peer = 0; peer < inlinePeers; peer++) {
476 const int srcDevice = s_record.inline_peers[peer].deviceIndex;
477 const int srcOffset = s_record.inline_peers[peer].bufferOffset;
479 for (int i = threadIdx.x; i < numAtoms; i += blockDim.x){
480 double f_x = f_normal_x[srcDevice][srcOffset + i];
481 double f_y = f_normal_y[srcDevice][srcOffset + i];
482 double f_z = f_normal_z[srcDevice][srcOffset + i];
484 f_normal_x[devID][dstOffset + i] += f_x;
485 f_normal_y[devID][dstOffset + i] += f_y;
486 f_normal_z[devID][dstOffset + i] += f_z;
488 if(T_MAX_FORCE_NUMBER >= 1){
489 // We don't need the nonbonded offset here since
490 // this isn't the actual nonbonded buffer
491 f_x = f_nbond_x[srcDevice][srcOffset + i];
492 f_y = f_nbond_y[srcDevice][srcOffset + i];
493 f_z = f_nbond_z[srcDevice][srcOffset + i];
495 f_nbond_x[devID][dstOffset + i] += f_x;
496 f_nbond_y[devID][dstOffset + i] += f_y;
497 f_nbond_z[devID][dstOffset + i] += f_z;
499 if(T_MAX_FORCE_NUMBER == 2){
500 f_x = f_slow_x[srcDevice][srcOffset + i];
501 f_y = f_slow_y[srcDevice][srcOffset + i];
502 f_z = f_slow_z[srcDevice][srcOffset + i];
504 f_slow_x[devID][dstOffset + i] += f_x;
505 f_slow_y[devID][dstOffset + i] += f_y;
506 f_slow_z[devID][dstOffset + i] += f_z;
512 for (int peer = inlinePeers; peer < numPeerRecords; peer++) {
513 const int srcDevice = peerRecords[peerRecordStartIndexs + peer].deviceIndex;
514 const int srcOffset = peerRecords[peerRecordStartIndexs + peer].bufferOffset;
516 for (int i = threadIdx.x; i < numAtoms; i += blockDim.x){
517 double f_x = f_normal_x[srcDevice][srcOffset + i];
518 double f_y = f_normal_y[srcDevice][srcOffset + i];
519 double f_z = f_normal_z[srcDevice][srcOffset + i];
521 f_normal_x[devID][dstOffset + i] += f_x;
522 f_normal_y[devID][dstOffset + i] += f_y;
523 f_normal_z[devID][dstOffset + i] += f_z;
525 if(T_MAX_FORCE_NUMBER >= 1){
526 // We don't need the nonbonded offset here since
527 // this isn't the actual nonbonded buffer
528 f_x = f_nbond_x[srcDevice][srcOffset + i];
529 f_y = f_nbond_y[srcDevice][srcOffset + i];
530 f_z = f_nbond_z[srcDevice][srcOffset + i];
532 f_nbond_x[devID][dstOffset + i] += f_x;
533 f_nbond_y[devID][dstOffset + i] += f_y;
534 f_nbond_z[devID][dstOffset + i] += f_z;
536 if(T_MAX_FORCE_NUMBER == 2){
537 f_x = f_slow_x[srcDevice][srcOffset + i];
538 f_y = f_slow_y[srcDevice][srcOffset + i];
539 f_z = f_slow_z[srcDevice][srcOffset + i];
541 f_slow_x[devID][dstOffset + i] += f_x;
542 f_slow_y[devID][dstOffset + i] += f_y;
543 f_slow_z[devID][dstOffset + i] += f_z;
549 // Merge PME forces here instead of in the fetch forcing kernel
550 if(T_MAX_FORCE_NUMBER == 2){
551 for(int i = threadIdx.x; i < numAtoms; i += blockDim.x){
552 CudaForce f = pmeForces[dstOffset + i];
554 double3 f_scaled = lat.scale_force(
555 Vector((double)f.x, (double)f.y, (double)f.z));
556 f_slow_x[devID][dstOffset + i] += f_scaled.x;
557 f_slow_y[devID][dstOffset + i] += f_scaled.y;
558 f_slow_z[devID][dstOffset + i] += f_scaled.z;
565 void SequencerCUDAKernel::mergeForcesFromPeers(
567 const int maxForceNumber,
569 const int numPatchesHomeAndProxy,
570 const int numPatchesHome,
580 const CudaForce* pmeForces,
581 CudaLocalRecord* localRecords,
582 CudaPeerRecord* peerRecords,
583 std::vector<int>& atomCounts,
586 // atom-based kernel here
587 //int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
588 int grid = (numPatchesHome);
589 int blocks = PATCH_BLOCKS;
590 // launch a single-threaded grid for debugging
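// pmeForces appears to be one contiguous buffer laid out in devID order, so this
// device's slice starts after the atoms owned by all lower-numbered devices; the
// loop below accumulates that prefix sum into offset, which is then applied as
// pmeForces + offset in the launches.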
593 for (int i = 0; i < devID; i++) {
594 offset += atomCounts[i];
597 switch (maxForceNumber) {
599 mergeForcesFromPeersKernel<0><<<grid, blocks, 0, stream>>>(
600 numPatchesHome, devID, localRecords, peerRecords,
602 f_normal_x, f_normal_y, f_normal_z,
603 f_nbond_x, f_nbond_y, f_nbond_z,
604 f_slow_x, f_slow_y, f_slow_z, pmeForces + offset
608 mergeForcesFromPeersKernel<1><<<grid, blocks, 0, stream>>>(
609 numPatchesHome, devID, localRecords, peerRecords,
611 f_normal_x, f_normal_y, f_normal_z,
612 f_nbond_x, f_nbond_y, f_nbond_z,
613 f_slow_x, f_slow_y, f_slow_z, pmeForces + offset
617 mergeForcesFromPeersKernel<2><<<grid, blocks, 0, stream>>>(
618 numPatchesHome, devID, localRecords, peerRecords,
620 f_normal_x, f_normal_y, f_normal_z,
621 f_nbond_x, f_nbond_y, f_nbond_z,
622 f_slow_x, f_slow_y, f_slow_z, pmeForces + offset
628 // JM NOTE: This is a fused version of accumulateForceToSOA + addForceToMomentum.
629 // addForceToMomentum does very little math and is memory-bandwidth bound, so
630 // fusing the two kernels lets us keep the forces in registers and reuse
631 // them directly for the velocity update.
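// Schematically (a sketch of the idea, not the exact call sequence), the unfused path is
//   accumulateForceToSOA<<<...>>>(...);   // writes d_f_* out to global memory
//   addForceToMomentum<<<...>>>(...);     // re-reads d_f_* to update the velocities
// whereas here the freshly accumulated f_normal/f_nbond/f_slow values stay in
// registers and feed
//   v += scaling * recipMass * (f_slow*dt_slow + f_nbond*dt_nbond + f_normal*dt_normal)
// directly at the end of the per-atom loop.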
632 template <int MAX_FORCE_NUMBER, int T_DOGLOBAL, int T_DOCUDAGLOBAL, int DO_FIXED>
633 __global__ void accumulateForceKick(
634 const int numPatches,
635 CudaLocalRecord* localRecords,
636 const double * __restrict f_bond,
637 const double * __restrict f_bond_nbond,
638 const double * __restrict f_bond_slow,
640 const float4 * __restrict f_nbond,
641 const float4 * __restrict f_nbond_slow,
642 const CudaForce* __restrict f_slow,
643 double * __restrict d_f_global_x,
644 double * __restrict d_f_global_y,
645 double * __restrict d_f_global_z,
646 double * __restrict d_f_normal_x,
647 double * __restrict d_f_normal_y,
648 double * __restrict d_f_normal_z,
649 double * __restrict d_f_nbond_x,
650 double * __restrict d_f_nbond_y,
651 double * __restrict d_f_nbond_z,
652 double * __restrict d_f_slow_x,
653 double * __restrict d_f_slow_y,
654 double * __restrict d_f_slow_z,
655 const int * __restrict patchUnsortOrder,
656 const Lattice lattice,
660 const double * __restrict recipMass,
661 const int* __restrict atomFixed,
662 const double dt_normal,
663 const double dt_nbond,
664 const double dt_slow,
667 __shared__ CudaLocalRecord s_record;
668 using AccessType = int32_t;
669 AccessType* s_record_buffer = (AccessType*) &s_record;
671 for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
672 // Read in the CudaLocalRecord using multiple threads. This should
674 for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
675 s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
679 const int numAtoms = s_record.numAtoms;
680 const int offset = s_record.bufferOffset;
681 const int offsetNB = s_record.bufferOffsetNBPad;
683 double f_slow_x, f_slow_y, f_slow_z;
684 double f_nbond_x, f_nbond_y, f_nbond_z;
685 double f_normal_x, f_normal_y, f_normal_z;
687 for (int i = threadIdx.x; i < numAtoms; i += blockDim.x) {
698 if (MAX_FORCE_NUMBER == 2) {
699 CudaForce f = f_slow[offset + i];
701 double3 f_scaled = lattice.scale_force(
702 Vector((double)f.x, (double)f.y, (double)f.z));
704 // set slow force accumulators
705 f_slow_x = f_scaled.x;
706 f_slow_y = f_scaled.y;
707 f_slow_z = f_scaled.z;
709 if (MAX_FORCE_NUMBER >= 1) {
710 int unorder = patchUnsortOrder[offset + i];
711 // gather from sorted nonbonded force array
712 float4 fnbond = f_nbond[offsetNB + unorder];
713 // set (medium) nonbonded force accumulators
714 f_nbond_x = (double)fnbond.x;
715 f_nbond_y = (double)fnbond.y;
716 f_nbond_z = (double)fnbond.z;
717 if (MAX_FORCE_NUMBER == 2) {
718 // gather from sorted nonbonded slow force array
719 float4 fnbslow = f_nbond_slow[offsetNB + unorder];
720 // accumulate slow force contributions from nonbonded calculation
721 f_slow_x += (double)fnbslow.x;
722 f_slow_y += (double)fnbslow.y;
723 f_slow_z += (double)fnbslow.z;
726 // gather from strided bond force array
727 // set (fast) normal force accumulators
728 if(T_DOGLOBAL || T_DOCUDAGLOBAL) {
729 // Global forces are stored in d_f_global; add them to the bonded force
730 f_normal_x = d_f_global_x[offset + i] + f_bond[offset + i];
731 f_normal_y = d_f_global_y[offset + i] + f_bond[offset + i + forceStride];
732 f_normal_z = d_f_global_z[offset + i] + f_bond[offset + i + 2*forceStride];
733 if (T_DOCUDAGLOBAL) {
734 d_f_global_x[offset + i] = 0;
735 d_f_global_y[offset + i] = 0;
736 d_f_global_z[offset + i] = 0;
742 f_normal_x = f_bond[offset + i];
743 f_normal_y = f_bond[offset + i + forceStride];
744 f_normal_z = f_bond[offset + i + 2*forceStride];
747 if (MAX_FORCE_NUMBER >= 1) {
748 // gather from strided bond nonbonded force array
749 // accumulate (medium) nonbonded force accumulators
750 f_nbond_x += f_bond_nbond[offset + i];
751 f_nbond_y += f_bond_nbond[offset + i + forceStride];
752 f_nbond_z += f_bond_nbond[offset + i + 2*forceStride];
753 if (MAX_FORCE_NUMBER == 2) {
754 // gather from strided bond slow force array
755 // accumulate slow force accumulators
756 f_slow_x += f_bond_slow[offset + i];
757 f_slow_y += f_bond_slow[offset + i + forceStride];
758 f_slow_z += f_bond_slow[offset + i + 2*forceStride];
761 // set normal, nonbonded, and slow SOA force buffers
762 d_f_normal_x[offset + i] = f_normal_x;
763 d_f_normal_y[offset + i] = f_normal_y;
764 d_f_normal_z[offset + i] = f_normal_z;
765 if (MAX_FORCE_NUMBER >= 1) {
766 d_f_nbond_x[offset + i] = f_nbond_x;
767 d_f_nbond_y[offset + i] = f_nbond_y;
768 d_f_nbond_z[offset + i] = f_nbond_z;
769 if (MAX_FORCE_NUMBER == 2) {
770 d_f_slow_x[offset + i] = f_slow_x;
771 d_f_slow_y[offset + i] = f_slow_y;
772 d_f_slow_z[offset + i] = f_slow_z;
776 /* Velocity updates */
777 double rp = recipMass[offset + i];
779 vx = ((f_slow_x * dt_slow) + (f_nbond_x * dt_nbond) + (f_normal_x * dt_normal)) * scaling * rp;
780 vy = ((f_slow_y * dt_slow) + (f_nbond_y * dt_nbond) + (f_normal_y * dt_normal)) * scaling * rp;
781 vz = ((f_slow_z * dt_slow) + (f_nbond_z * dt_nbond) + (f_normal_z * dt_normal)) * scaling * rp;
783 d_vel_x[offset + i] += vx;
784 d_vel_y[offset + i] += vy;
785 d_vel_z[offset + i] += vz;
787 if (!atomFixed[offset + i]) {
788 d_vel_x[offset + i] += vx;
789 d_vel_y[offset + i] += vy;
790 d_vel_z[offset + i] += vz;
792 d_vel_x[offset + i] = 0;
793 d_vel_y[offset + i] = 0;
794 d_vel_z[offset + i] = 0;
804 template <int MAX_FORCE_NUMBER>
805 __global__ void accumulateForceKick(
806 const double * __restrict f_bond,
807 const double * __restrict f_bond_nbond,
808 const double * __restrict f_bond_slow,
810 const PatchRecord * __restrict bond_pr,
811 const float4 * __restrict f_nbond,
812 const float4 * __restrict f_nbond_slow,
813 const CudaPatchRecord * __restrict nbond_pr,
814 const CudaForce * __restrict f_slow,
815 double *d_f_normal_x,
816 double *d_f_normal_y,
817 double *d_f_normal_z,
827 const double * __restrict recipMass,
828 const double dt_normal,
829 const double dt_nbond,
830 const double dt_slow,
831 const double scaling,
832 const int * __restrict patchIDs,
833 const int * __restrict patchOffsets,
834 const int * __restrict patchUnsortOrder,
835 const CudaLattice lat)
837 int tid = threadIdx.x;
838 int stride = blockDim.x;
840 __shared__ int sh_patchID;
841 __shared__ int sh_patchOffset;
842 // number of atoms per patch should be same no matter
843 // which data structure we access it through
844 __shared__ int sh_natoms;
845 __shared__ int sh_bondForceOffset;
846 __shared__ int sh_nbondForceOffset;
850 // number of atoms per patch should be same no matter
851 // which data structure we access it through
854 int nbondForceOffset;
856 if(threadIdx.x == 0){
857 sh_patchID = patchIDs[blockIdx.x];
858 sh_patchOffset = patchOffsets[blockIdx.x];
859 // number of atoms per patch should be same no matter
860 // which data structure we access it through
861 sh_natoms = bond_pr[sh_patchID].numAtoms;
862 sh_bondForceOffset = bond_pr[sh_patchID].atomStart;
863 if(MAX_FORCE_NUMBER >= 1){
864 sh_nbondForceOffset = nbond_pr[sh_patchID].atomStart;
870 // pull stuff to registers
871 // patchID = sh_patchID;
872 patchOffset = sh_patchOffset;
874 bondForceOffset = sh_bondForceOffset;
875 nbondForceOffset = (MAX_FORCE_NUMBER >= 1) ? sh_nbondForceOffset : 0;
877 double r1x, r1y, r1z, r2x, r2y, r2z, r3x, r3y, r3z;
879 if (MAX_FORCE_NUMBER == 2) {
891 double f_slow_x, f_slow_y, f_slow_z;
892 double f_nbond_x, f_nbond_y, f_nbond_z;
893 double f_normal_x, f_normal_y, f_normal_z;
896 for (int i = tid; i < natoms; i += stride) {
908 // Global forces is stored in d_f_normal, just need to add bonded force
909 f_normal_x = d_f_normal_x[patchOffset + i];
910 f_normal_y = d_f_normal_y[patchOffset + i];
911 f_normal_z = d_f_normal_z[patchOffset + i];
914 if (MAX_FORCE_NUMBER == 2) {
915 CudaForce f = f_slow[patchOffset + i];
916 double fx = (double) f.x;
917 double fy = (double) f.y;
918 double fz = (double) f.z;
919 // set slow force accumulators
920 f_slow_x = (fx*r1x + fy*r2x + fz*r3x);
921 f_slow_y = (fx*r1y + fy*r2y + fz*r3y);
922 f_slow_z = (fx*r1z + fy*r2z + fz*r3z);
924 if (MAX_FORCE_NUMBER >= 1) {
925 int unorder = patchUnsortOrder[patchOffset + i];
926 // gather from sorted nonbonded force array
927 float4 fnbond = f_nbond[nbondForceOffset + unorder];
928 // set (medium) nonbonded force accumulators
929 f_nbond_x = (double)fnbond.x;
930 f_nbond_y = (double)fnbond.y;
931 f_nbond_z = (double)fnbond.z;
932 if (MAX_FORCE_NUMBER == 2) {
933 // gather from sorted nonbonded slow force array
934 float4 fnbslow = f_nbond_slow[nbondForceOffset + unorder];
935 // accumulate slow force contributions from nonbonded calculation
936 f_slow_x += (double)fnbslow.x;
937 f_slow_y += (double)fnbslow.y;
938 f_slow_z += (double)fnbslow.z;
941 // gather from strided bond force array
942 // set (fast) normal force accumulators
943 f_normal_x += f_bond[bondForceOffset + i];
944 f_normal_y += f_bond[bondForceOffset + i + forceStride];
945 f_normal_z += f_bond[bondForceOffset + i + 2*forceStride];
946 if (MAX_FORCE_NUMBER >= 1) {
947 // gather from strided bond nonbonded force array
948 // accumulate (medium) nonbonded force accumulators
949 f_nbond_x += f_bond_nbond[bondForceOffset + i];
950 f_nbond_y += f_bond_nbond[bondForceOffset + i + forceStride];
951 f_nbond_z += f_bond_nbond[bondForceOffset + i + 2*forceStride];
952 if (MAX_FORCE_NUMBER == 2) {
953 // gather from strided bond slow force array
954 // accumulate slow force accumulators
955 f_slow_x += f_bond_slow[bondForceOffset + i];
956 f_slow_y += f_bond_slow[bondForceOffset + i + forceStride];
957 f_slow_z += f_bond_slow[bondForceOffset + i + 2*forceStride];
960 // set normal, nonbonded, and slow SOA force buffers
961 d_f_normal_x[patchOffset + i] = f_normal_x;
962 d_f_normal_y[patchOffset + i] = f_normal_y;
963 d_f_normal_z[patchOffset + i] = f_normal_z;
965 if (MAX_FORCE_NUMBER >= 1) {
966 d_f_nbond_x[patchOffset + i] = f_nbond_x;
967 d_f_nbond_y[patchOffset + i] = f_nbond_y;
968 d_f_nbond_z[patchOffset + i] = f_nbond_z;
969 if (MAX_FORCE_NUMBER == 2) {
970 d_f_slow_x[patchOffset + i] = f_slow_x;
971 d_f_slow_y[patchOffset + i] = f_slow_y;
972 d_f_slow_z[patchOffset + i] = f_slow_z;
976 /* Velocity updates */
977 double rp = recipMass[patchOffset + i];
978 vx = ((f_slow_x * dt_slow) + (f_nbond_x * dt_nbond) + (f_normal_x * dt_normal)) * scaling * rp;
979 vy = ((f_slow_y * dt_slow) + (f_nbond_y * dt_nbond) + (f_normal_y * dt_normal)) * scaling * rp;
980 vz = ((f_slow_z * dt_slow) + (f_nbond_z * dt_nbond) + (f_normal_z * dt_normal)) * scaling * rp;
982 d_vel_x[patchOffset + i] += vx;
983 d_vel_y[patchOffset + i] += vy;
984 d_vel_z[patchOffset + i] += vz;
990 void SequencerCUDAKernel::accumulate_force_kick(
991 const bool doFixedAtoms,
993 const int doCudaGlobal,
994 const int maxForceNumber,
995 const int numPatches,
996 CudaLocalRecord* localRecords,
997 const double* f_bond,
998 const double* f_bond_nbond,
999 const double* f_bond_slow,
1001 const float4* f_nbond,
1002 const float4* f_nbond_slow,
1003 const CudaForce* f_slow,
1004 double* d_f_global_x,
1005 double* d_f_global_y,
1006 double* d_f_global_z,
1007 double* d_f_normal_x,
1008 double* d_f_normal_y,
1009 double* d_f_normal_z,
1010 double* d_f_nbond_x,
1011 double* d_f_nbond_y,
1012 double* d_f_nbond_z,
1019 const double* recipMass,
1020 const int* d_atomFixed,
1021 const double dt_normal,
1022 const double dt_nbond,
1023 const double dt_slow,
1024 const double scaling,
1025 const int* patchUnsortOrder,
1026 const Lattice lattice,
1029 // ASSERT( 0 <= maxForceNumber && maxForceNumber <= 2 );
1030 const int options = (maxForceNumber << 3) + (doGlobal << 2) + (doCudaGlobal << 1) + int(doFixedAtoms);
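// Example decode: maxForceNumber = 2, doGlobal = 0, doCudaGlobal = 1, doFixedAtoms = 0
// packs to (2<<3) + (0<<2) + (1<<1) + 0 = 18, i.e. case 18 below, which expands to
// CALL(2, 0, 1, 0).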
1031 #define CALL(MAX_FORCE_NUMBER, DOGLOBAL, DOCUDAGLOBAL, DOFIXED) \
1032 accumulateForceKick<MAX_FORCE_NUMBER, DOGLOBAL, DOCUDAGLOBAL, DOFIXED> \
1033 <<<numPatches, PATCH_BLOCKS, 0, stream>>> \
1034 ( numPatches, localRecords, \
1035 f_bond, f_bond_nbond, f_bond_slow, forceStride, \
1036 f_nbond, f_nbond_slow, f_slow, \
1037 d_f_global_x, d_f_global_y, d_f_global_z, \
1038 d_f_normal_x, d_f_normal_y, d_f_normal_z, \
1039 d_f_nbond_x, d_f_nbond_y, d_f_nbond_z, \
1040 d_f_slow_x, d_f_slow_y, d_f_slow_z, \
1041 patchUnsortOrder, lattice, \
1042 d_vel_x, d_vel_y, d_vel_z, recipMass, \
1043 d_atomFixed, dt_normal, dt_nbond, dt_slow, scaling)
1045 * Generated by the following python script:
1046 * #!/usr/bin/env python3
1047 * for maxForceNumber in range(3):
1048 * for doGlobal in range(2):
1049 * for doCudaGlobal in range(2):
1050 * for doFixedAtoms in range(2):
1051 * option = (maxForceNumber << 3) + (doGlobal << 2) + (doCudaGlobal << 1) + int(doFixedAtoms)
1052 * print(f' case {option}: CALL ({maxForceNumber}, {doGlobal}, {doCudaGlobal}, {doFixedAtoms}); break;')
1055 case 0: CALL (0, 0, 0, 0); break;
1056 case 1: CALL (0, 0, 0, 1); break;
1057 case 2: CALL (0, 0, 1, 0); break;
1058 case 3: CALL (0, 0, 1, 1); break;
1059 case 4: CALL (0, 1, 0, 0); break;
1060 case 5: CALL (0, 1, 0, 1); break;
1061 case 6: CALL (0, 1, 1, 0); break;
1062 case 7: CALL (0, 1, 1, 1); break;
1063 case 8: CALL (1, 0, 0, 0); break;
1064 case 9: CALL (1, 0, 0, 1); break;
1065 case 10: CALL (1, 0, 1, 0); break;
1066 case 11: CALL (1, 0, 1, 1); break;
1067 case 12: CALL (1, 1, 0, 0); break;
1068 case 13: CALL (1, 1, 0, 1); break;
1069 case 14: CALL (1, 1, 1, 0); break;
1070 case 15: CALL (1, 1, 1, 1); break;
1071 case 16: CALL (2, 0, 0, 0); break;
1072 case 17: CALL (2, 0, 0, 1); break;
1073 case 18: CALL (2, 0, 1, 0); break;
1074 case 19: CALL (2, 0, 1, 1); break;
1075 case 20: CALL (2, 1, 0, 0); break;
1076 case 21: CALL (2, 1, 0, 1); break;
1077 case 22: CALL (2, 1, 1, 0); break;
1078 case 23: CALL (2, 1, 1, 1); break;
1080 const std::string error =
1081 "Unsupported options in SequencerCUDAKernel::accumulateForceToSOA, no kernel is called. Options are maxForceNumber = " +
1082 std::to_string(maxForceNumber) +
1083 ", doGlobal = " + std::to_string(doGlobal) +
1084 ", doFixedAtoms = " + std::to_string(doFixedAtoms) +
1085 ", doCudaGlobal = " + std::to_string(doCudaGlobal) + "\n";
1086 NAMD_bug(error.c_str());
1092 template <bool doSlow>
1093 __global__ void copyFNBondToSOA(float4* __restrict__ f_nbond,
1094 float4* __restrict__ f_nbond_slow,
1101 const int* __restrict__ patchIDs,
1102 const int* __restrict__ patchOffsets,
1103 const int* __restrict__ patchUnsortOrder,
1104 const CudaPatchRecord* __restrict__ nbondIndexPerPatch)
1107 int stride = blockDim.x;
1108 int pid = patchIDs[blockIdx.x];
1110 // nbondIndexPerPatch is ordered by the nonbonded compute's local patch index and
1111 // is device-wide, so its ordering differs from PatchMap's. The global patchID
1112 // from PatchMap is not the right index into it; the local compute index is
1113 // needed instead. nbondIndexPerPatch is the same for multi- and single-PE runs
1114 // (only the force storage differs), and since patches never migrate this
1115 // mapping should not change. An index following the computes' ordering can be
1116 // built once in PatchMap.
1124 int globalForceOffset = nbondIndexPerPatch[pid].atomStart;
1125 int patchOffset = patchOffsets[blockIdx.x];
1126 int natom = nbondIndexPerPatch[pid].numAtoms;
1127 for(int i = threadIdx.x; i < natom; i+= stride){
1128 unorder = patchUnsortOrder[patchOffset + i];
1129 float4 force = f_nbond[globalForceOffset + unorder];
1131 f_nbond_x[patchOffset + i] = (double)force.x;
1132 f_nbond_y[patchOffset + i] = (double)force.y;
1133 f_nbond_z[patchOffset + i] = (double)force.z;
1136 // XXX Accumulation here works because each f_slow_* buffer
1137 // is first initialized in copyFSlowToSOA.
1138 float4 f_nb_slow = f_nbond_slow[globalForceOffset + unorder];
1139 f_slow_x[patchOffset + i] += (double)f_nb_slow.x;
1140 f_slow_y[patchOffset + i] += (double)f_nb_slow.y;
1141 f_slow_z[patchOffset + i] += (double)f_nb_slow.z;
1146 void SequencerCUDAKernel::copy_nbond_forces(int numPatches,
1148 float4* f_nbond_slow,
1155 const int* patchIDs,
1156 const int* patchOffsets,
1157 const int* patchUnsortOrder,
1158 const CudaPatchRecord* pr,
1160 cudaStream_t stream){
1163 copyFNBondToSOA<1><<< numPatches, PATCH_BLOCKS, 0, stream >>>(f_nbond, f_nbond_slow, f_nbond_x,
1164 f_nbond_y, f_nbond_z, f_slow_x, f_slow_y, f_slow_z,
1165 patchIDs, patchOffsets, patchUnsortOrder, pr);
1167 copyFNBondToSOA<0><<< numPatches, PATCH_BLOCKS, 0, stream >>>(f_nbond, f_nbond_slow, f_nbond_x,
1168 f_nbond_y, f_nbond_z, f_slow_x, f_slow_y, f_slow_z,
1169 patchIDs, patchOffsets, patchUnsortOrder, pr);
1174 // This kernel is intended to fuse copyFNBondToSOA, copyFBondToSOA, addForceToMomentum and updateVelocities.
1175 // XXX TODO: missing slow-force support
1176 template<bool T_DONBOND, bool T_DOSLOW>
1177 __global__ void fetchForcesAndUpdateVelocitiesKernel(
1178 const double scaling,
1182 /* copyFbondToSOA parameters*/
1183 const double* __restrict__ f_bond, // forces from bond compute
1184 const double* __restrict__ f_bond_nbond, // forces from bond compute
1185 double* __restrict__ f_bond_x,
1186 double* __restrict__ f_bond_y,
1187 double* __restrict__ f_bond_z,
1188 double* __restrict__ f_nbond_x,
1189 double* __restrict__ f_nbond_y,
1190 double* __restrict__ f_nbond_z,
1191 double* __restrict__ f_slow_x,
1192 double* __restrict__ f_slow_y,
1193 double* __restrict__ f_slow_z,
1195 const PatchRecord* __restrict__ b_pr,
1196 const int* __restrict__ patchIDs,
1197 const int* __restrict__ patchOffsets,
1198 /* copyFNbondToSOA parameters*/
1199 const float4* __restrict__ f_nbond, // forces from nonbonded compute
1200 const float4* __restrict__ f_nbond_slow, // slow forces from nonbonded compute
1201 const int* __restrict__ patchSortOrder,
1202 const CudaPatchRecord* __restrict__ nbondIndexPerPatch,
1203 /* copyFSlowToSOA */
1204 const CudaForce* __restrict__ f_slow,
1205 const int* __restrict__ patchPositions,
1206 const int* __restrict__ pencilPatchIndex,
1207 // (reuses patchOffsets and patchIDs declared above for the copyFSlowToSOA step)
1209 const int* __restrict__ slow_patchIDs,
1210 const CudaLattice* __restrict__ lattices,
1211 /* addForceToMomentum */
1212 const double * __restrict recipMass,
1213 double * __restrict vel_x,
1214 double * __restrict vel_y,
1215 double * __restrict vel_z,
1216 /* updateRigidArrays */
1217 const double * __restrict pos_x,
1218 const double * __restrict pos_y,
1219 const double * __restrict pos_z,
1220 double * __restrict velNew_x,
1221 double * __restrict velNew_y,
1222 double * __restrict velNew_z,
1223 double * __restrict posNew_x,
1224 double * __restrict posNew_y,
1225 double * __restrict posNew_z
1228 int stride = blockDim.x;
1229 int pid = patchIDs[blockIdx.x];
1231 int globalForceOffset = nbondIndexPerPatch[pid].atomStart;
1232 int b_forceOffset = b_pr[pid].atomStart;
1234 int patchOffset = patchOffsets[blockIdx.x];
1235 int natom = nbondIndexPerPatch[pid].numAtoms;
1240 for(int i = threadIdx.x; i < natom; i += stride){
1242 if(T_DONBOND) f_nb = {0};
1243 if(T_DOSLOW) f_s = {0};
1244 // fetch the bonded forces first
1245 f_b.x = f_bond[b_forceOffset + i];
1246 f_b.y = f_bond[b_forceOffset + i + forceStride];
1247 f_b.z = f_bond[b_forceOffset + i + 2*forceStride];
1250 f_nb = f_nbond[globalForceOffset + i];
1251 f_nb.x += f_bond_nbond[b_forceOffset + i];
1252 f_nb.y += f_bond_nbond[b_forceOffset + i + forceStride];
1253 f_nb.z += f_bond_nbond[b_forceOffset + i + 2*forceStride];
1255 // addForceToMomentum now
1256 // this striding is not good
1257 double vx = vel_x[patchOffset + i];
1258 double vy = vel_y[patchOffset + i];
1259 double vz = vel_z[patchOffset + i];
1260 double rmass = recipMass[patchOffset + i];
1261 switch(maxForceNumber){
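// Presumably the same intentional fall-through pattern as in
// addForceToMomentumKernel below: each case adds its force contribution and then
// falls through to the faster force terms.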
1262 // XXX TODO: Case 2 for slow forces
1264 dt_nbond *= scaling;
1265 vx += f_nb.x * rmass * dt_nbond;
1266 vy += f_nb.y * rmass * dt_nbond;
1267 vz += f_nb.z * rmass * dt_nbond;
1269 dt_normal *= scaling;
1270 vx += f_b.x * rmass * dt_normal;
1271 vy += f_b.y * rmass * dt_normal;
1272 vz += f_b.z * rmass * dt_normal;
1275 // updateRigidArrays
1277 posNew_x[patchOffset + i] = pos_x[patchOffset + i] + (vx * dt);
1278 posNew_y[patchOffset + i] = pos_y[patchOffset + i] + (vy * dt);
1279 posNew_z[patchOffset + i] = pos_z[patchOffset + i] + (vz * dt);
1280 vel_x[patchOffset + i] = vx;
1281 vel_y[patchOffset + i] = vy;
1282 vel_z[patchOffset + i] = vz;
1283 velNew_x[patchOffset + i] = vx;
1284 velNew_y[patchOffset + i] = vy;
1285 velNew_z[patchOffset + i] = vz;
1286 f_normal_x[patchOffset + i] = f_b.x;
1287 f_normal_y[patchOffset + i] = f_b.y;
1288 f_normal_z[patchOffset + i] = f_b.z;
1291 order = patchSortOrder[patchOffset + i];
1292 f_nbond_x[patchOffset + order] = f_nb.x;
1293 f_nbond_y[patchOffset + order] = f_nb.y;
1294 f_nbond_z[patchOffset + order] = f_nb.z;
1299 void SequencerCUDAKernel::fetchForcesAndUpdateVelocities(int numPatches,
1302 const double scaling,
1306 const int maxForceNumber,
1307 /* copyFbondToSOA parameters*/
1308 const double* __restrict__ f_bond, // forces from bond compute
1309 const double* __restrict__ f_bond_nbond, // forces from bond compute
1320 const PatchRecord* b_pr,
1321 const int* patchIDs,
1322 const int* patchOffsets,
1323 /* copyFNbondToSOA parameters*/
1324 const float4* f_nbond, // forces from nonbonded compute
1325 const float4* f_nbond_slow, // slow forces from nonbonded compute
1326 const int* patchSortOrder,
1327 const CudaPatchRecord* nbondIndexPerPatch,
1328 /* copyFSlowToSOA */
1329 const CudaForce* f_slow,
1330 const int* patchPositions,
1331 const int* pencilPatchIndex,
1332 // (reuses patchOffsets and patchIDs declared above for the copyFSlowToSOA step)
1334 const int* slow_patchIDs,
1335 const CudaLattice* lattices,
1336 /* addForceToMomentum */
1337 const double * recipMass,
1341 /* updateRigidArrays */
1342 const double * pos_x,
1343 const double * pos_y,
1344 const double * pos_z,
1351 cudaStream_t stream){
1353 // TODO: reduce the number of arguments in this function
1355 int grid = numPatches;
1357 // XXX TODO finish this
1360 fetchForcesAndUpdateVelocitiesKernel<true, true><<<numPatches, 128, 0, stream>>>(/* TODO: arguments */);
1362 fetchForcesAndUpdateVelocitiesKernel<true, false><<<numPatches, 128, 0, stream>>>(/* TODO: arguments */);
1365 fetchForcesAndUpdateVelocitiesKernel<false, false><<<numPatches, 128, 0, stream>>>(/* TODO: arguments */);
1370 // JM: I need a function to do the pairlistCheck
1371 template<bool T_ISPERIODIC>
1372 __global__ void pairListMarginCheckKernel(
1373 const int numPatches,
1374 CudaLocalRecord* localRecords,
1375 const double* __restrict pos_x,
1376 const double* __restrict pos_y,
1377 const double* __restrict pos_z,
1378 const double* __restrict pos_old_x,
1379 const double* __restrict pos_old_y,
1380 const double* __restrict pos_old_z,
1381 const double3* __restrict awayDists, // for margin check
1382 const Lattice lattice,
1383 const Lattice latticeOld,
1384 const double3* __restrict patchMins,
1385 const double3* __restrict patchMaxes,
1386 const double3* __restrict patchCenter,
1387 const CudaMInfo* __restrict mInfo,
1388 unsigned int* __restrict tbcatomic,
1389 double* patchMaxAtomMovement,
1390 double* h_patchMaxAtomMovement,
1391 double* patchNewTolerance,
1392 double* h_patchNewTolerance,
1393 const double minSize,
1394 const double cutoff,
1395 const double sysdima,
1396 const double sysdimb,
1397 const double sysdimc,
1398 unsigned int* __restrict h_marginViolations,
1399 unsigned int* __restrict h_periodicCellSmall,
1400 const bool rescalePairlistTolerance
1402 __shared__ CudaLocalRecord s_record;
1403 using AccessType = int32_t;
1404 AccessType* s_record_buffer = (AccessType*) &s_record;
1406 for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
1407 // Read in the CudaLocalRecord using multiple threads. This should
1409 for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
1410 s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
1414 const int pid = s_record.patchID;
1417 // Each patch has its own maximum corner displacement (max_cd). Since there is
1418 // one block per patch, we compute it here by checking the eight corners of the
1419 // patch box, once per patch.
1421 double cd, max_cd, max_pd;
1422 __shared__ double minx, miny, minz, maxx, maxy, maxz;
1423 double3 corner, corner_unscaled, corner2_unscaled;
1427 const int stride = blockDim.x;
1429 const int ind = patchIndex;
1430 double3 min = patchMins[ind];
1431 double3 max = patchMaxes[ind];
1432 double3 aDist = awayDists[ind];
1434 const int start = s_record.bufferOffset;
1435 const int numAtoms = s_record.numAtoms;
1436 double3 center = patchCenter[ind];
1438 double3 center_cur = lattice.unscale(center);
1439 double3 center_old = latticeOld.unscale(center);
1441 center_cur.x -= center_old.x;
1442 center_cur.y -= center_old.y;
1443 center_cur.z -= center_old.z;
1447 if(threadIdx.x == 0){
1448 if (rescalePairlistTolerance) patchNewTolerance[ind] *= (1.0f - c_pairlistShrink);
1450 // doMargincheck -> checks whether the periodic cell has become too small.
1452 // These values are already in registers, so the overhead here is small.
1453 // If this condition triggers, the simulation will stop.
1454 mValue.x = (0.5) * (aDist.x - cutoff / sysdima);
1455 mValue.y = (0.5) * (aDist.y - cutoff / sysdimb);
1456 mValue.z = (0.5) * (aDist.z - cutoff / sysdimc);
1457 minx = min.x - mValue.x;
1458 miny = min.y - mValue.y;
1459 minz = min.z - mValue.z;
1460 maxx = max.x + mValue.x;
1461 maxy = max.y + mValue.y;
1462 maxz = max.z + mValue.z;
1464 for(int i = 0; i < 2; i++){
1465 for(int j =0 ; j< 2; j++){
1466 for(int k = 0; k < 2; k++){
1467 corner.x = (i ? min.x : max.x);
1468 corner.y = (j ? min.y : max.y);
1469 corner.z = (k ? min.z : max.z);
1470 corner_unscaled = lattice.unscale(corner);
1471 corner2_unscaled = latticeOld.unscale(corner);
1472 corner.x = (corner_unscaled.x - corner2_unscaled.x) - center_cur.x;
1473 corner.y = (corner_unscaled.y - corner2_unscaled.y) - center_cur.y;
1474 corner.z = (corner_unscaled.z - corner2_unscaled.z) - center_cur.z;
1475 cd = corner.x * corner.x +
1476 corner.y * corner.y + corner.z * corner.z;
1477 if (cd > max_cd) max_cd = cd;
1484 // We now have max_cd for this patch; next compute max_pd, the largest
1485 // per-atom displacement (numAtoms and the atom start offset were already
1486 // read from s_record above).
1488 unsigned int mc = 0;
1489 for(int i = threadIdx.x; i < numAtoms; i += stride){
1490 // Here I can also check for margin violations - we need the aAwayDists
1492 pos.x = pos_x[start + i];
1493 pos.y = pos_y[start + i];
1494 pos.z = pos_z[start + i];
1495 s = lattice.scale(pos);
1497 // This path applies if the system is periodic in A, B and C:
1498 // if any of these clauses is true, the atom needs to migrate
1499 mc += ((s.x < minx || s.x >= maxx) || (s.y < miny || s.y >= maxy) || (s.z < minz || s.z >= maxz)) ? 1 : 0;
1501 int xdev, ydev, zdev;
1502 // If the system is not periodic, we need to consult the mInfo data structure to determine migrations
1503 if (s.x < minx) xdev = 0;
1504 else if (maxx <= s.x) xdev = 2;
1507 if (s.y < miny) ydev = 0;
1508 else if (maxy <= s.y) ydev = 2;
1511 if (s.z < minz) zdev = 0;
1512 else if (maxz <= s.z) zdev = 2;
1515 if(xdev != 1 || ydev != 1 || zdev != 1){
1516 // only read mInfo (an expensive global-memory access) when some *dev differs from 1, i.e. the atom may leave the patch
1517 int destPatch = mInfo[ind].destPatchID[xdev][ydev][zdev];
1518 if(destPatch != -1 && destPatch != pid) mc += 1; // atom needs to migrate
1521 corner.x = (pos.x - pos_old_x[start + i]) - center_cur.x;
1522 corner.y = (pos.y - pos_old_y[start + i]) - center_cur.y;
1523 corner.z = (pos.z - pos_old_z[start + i]) - center_cur.z;
1524 cd = corner.x * corner.x +
1525 corner.y * corner.y + corner.z * corner.z;
1526 if(cd > max_pd) max_pd = cd;
1528 // JM NOTE: The atomic add to host-mapped memory is expensive, but if there are margin violations the simulation is already going badly
1531 atomicAdd(h_marginViolations, mc);
1534 typedef cub::BlockReduce<BigReal, PATCH_BLOCKS> BlockReduce;
1535 __shared__ typename BlockReduce::TempStorage temp_storage;
1536 max_pd = BlockReduce(temp_storage).Reduce(max_pd, cub::Max());
1537 if(threadIdx.x == 0){
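// Combined upper bound on atom drift since the last pairlist build: the patch
// corner displacement due to the lattice change, sqrt(max_cd), plus the largest
// per-atom displacement within the patch, sqrt(max_pd).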
1538 max_pd = sqrt(max_cd) + sqrt(max_pd);
1539 // this might not be needed since the host-mapped values are updated below anyway
1540 patchMaxAtomMovement[ind] = max_pd;
1541 if(max_pd > (1.f - c_pairlistTrigger) * patchNewTolerance[ind]){
1542 patchNewTolerance[ind] *= (1.f + c_pairlistGrow);
1544 if(max_pd > patchNewTolerance[ind]){
1545 patchNewTolerance[ind] = max_pd / (1.f - c_pairlistTrigger);
1547 // printf("patchNewTolerance[%d] = %lf %lf\n", ind,
1548 // patchNewTolerance[ind], patchMaxAtomMovement[ind]);
1549 h_patchMaxAtomMovement[ind] = max_pd; // Host-mapped update
1550 h_patchNewTolerance[ind] = patchNewTolerance[ind];
1553 // Checks if periodic cell has become too small and flags it
1554 if(threadIdx.x == 0){
1555 if ( ( aDist.x*sysdima < minSize*0.9999 ) ||
1556 ( aDist.y*sysdimb < minSize*0.9999 ) ||
1557 ( aDist.z*sysdimc < minSize*0.9999 ) ||
1558 ( mValue.x < -0.0001 ) ||
1559 ( mValue.y < -0.0001 ) ||
1560 ( mValue.z < -0.0001)) {
1561 *h_periodicCellSmall = 1;
1567 void SequencerCUDAKernel::PairListMarginCheck(const int numPatches,
1568 CudaLocalRecord* localRecords,
1569 const double* pos_x,
1570 const double* pos_y,
1571 const double* pos_z,
1572 const double* pos_old_x,
1573 const double* pos_old_y,
1574 const double* pos_old_z,
1575 const double3* awayDists, // for margin check
1576 const Lattice lattices,
1577 const Lattice lattices_old,
1578 const double3* __restrict patchMins,
1579 const double3* __restrict patchMaxes,
1580 const double3* __restrict patchCenter,
1581 const CudaMInfo* __restrict mInfo,
1582 unsigned int* __restrict tbcatomic,
1583 const double pairlistTrigger,
1584 const double pairlistGrow,
1585 const double pairlistShrink,
1586 double* __restrict patchMaxAtomMovement,
1587 double* __restrict h_patchMaxAtomMovement,
1588 double* __restrict patchNewTolerance,
1589 double* __restrict h_patchNewTolerance,
1590 const double minSize,
1591 const double cutoff,
1592 const double sysdima,
1593 const double sysdimb,
1594 const double sysdimc,
1595 unsigned int* h_marginViolations,
1596 unsigned int* h_periodicCellSmall,
1597 const bool rescalePairlistTolerance,
1598 const bool isPeriodic,
1599 cudaStream_t stream){
1600 int grid = numPatches;
1602 if(!this->intConstInit){
1603 this->intConstInit = true;
1604 cudaCheck(cudaMemcpyToSymbol(c_pairlistTrigger, &pairlistTrigger, sizeof(double)));
1605 cudaCheck(cudaMemcpyToSymbol(c_pairlistGrow, &pairlistGrow, sizeof(double)));
1606 cudaCheck(cudaMemcpyToSymbol(c_pairlistShrink, &pairlistShrink, sizeof(double)));
1610 pairListMarginCheckKernel<true><<<grid, PATCH_BLOCKS, 0, stream>>>(
1611 numPatches, localRecords,
1612 pos_x, pos_y, pos_z, pos_old_x,
1613 pos_old_y, pos_old_z, awayDists, lattices, lattices_old,
1614 patchMins, patchMaxes, patchCenter, mInfo, tbcatomic,
1615 patchMaxAtomMovement, h_patchMaxAtomMovement,
1616 patchNewTolerance, h_patchNewTolerance,
1617 minSize, cutoff, sysdima, sysdimb, sysdimc, h_marginViolations, h_periodicCellSmall,
1618 rescalePairlistTolerance);
1621 pairListMarginCheckKernel<false><<<grid, PATCH_BLOCKS, 0, stream>>>(
1622 numPatches, localRecords,
1623 pos_x, pos_y, pos_z, pos_old_x,
1624 pos_old_y, pos_old_z, awayDists, lattices, lattices_old,
1625 patchMins, patchMaxes, patchCenter, mInfo, tbcatomic,
1626 patchMaxAtomMovement, h_patchMaxAtomMovement,
1627 patchNewTolerance, h_patchNewTolerance,
1628 minSize, cutoff, sysdima, sysdimb, sysdimc, h_marginViolations, h_periodicCellSmall,
1629 rescalePairlistTolerance);
1633 template <bool doNbond, bool doSlow>
1634 __global__ void copyFBondToSOA(double *f_bond,
1635 double *f_bond_nbond,
1636 double *f_bond_slow,
1646 const int forceStride,
1647 const PatchRecord *pr,
1648 const int *patchIDs,
1649 const int *patchOffsets)
1651 // Each block handles one patch: we need the strided bonded forces plus the
1652 // patch's atomStart in the bonded force buffer (forceOffset) and its offset
1653 // into the SOA buffers (patchOffset). An earlier note questioned whether the
1654 // bonded forces gathered here are correct.
1656 int stride = blockDim.x;
1657 int patchID = patchIDs[blockIdx.x];
1658 // pr is indexed by the global patchID; check that this indexing remains valid
1659 // when running with +p (multiple PEs)
1660 int natoms = pr[patchID].numAtoms;
1661 int forceOffset = pr[patchID].atomStart;
1662 int patchOffset = patchOffsets[blockIdx.x];
1664 for(int i = threadIdx.x; i < natoms; i+=stride){
1666 f_bond_x[patchOffset + i] = f_bond[forceOffset + i];
1667 f_bond_y[patchOffset + i] = f_bond[forceOffset + i + forceStride];
1668 f_bond_z[patchOffset + i] = f_bond[forceOffset + i + 2*forceStride];
1671 // XXX Accumulation here works because each f_nbond_* buffer
1672 // is first initialized in copyFNBondToSOA.
1673 f_nbond_x[patchOffset + i] += f_bond_nbond[forceOffset + i];
1674 f_nbond_y[patchOffset + i] += f_bond_nbond[forceOffset + i + forceStride];
1675 f_nbond_z[patchOffset + i] += f_bond_nbond[forceOffset + i + 2*forceStride];
1679 // XXX Accumulation here works because each f_slow_* buffer
1680 // is first initialized in copyFSlowToSOA.
1681 f_slow_x[patchOffset + i] += f_bond_slow[forceOffset + i];
1682 f_slow_y[patchOffset + i] += f_bond_slow[forceOffset + i + forceStride];
1683 f_slow_z[patchOffset + i] += f_bond_slow[forceOffset + i + 2*forceStride];
1688 void SequencerCUDAKernel::copy_bond_forces(int numPatches,
1690 double* f_bond_nbond,
1691 double* f_bond_slow,
1701 int forceStride, //if stridedForces
1703 const int* patchIDs,
1704 const int* patchOffsets,
1707 cudaStream_t stream)
1710 copyFBondToSOA<true, true><<< numPatches, PATCH_BLOCKS, 0, stream >>>(f_bond, f_bond_nbond,
1711 f_bond_slow, f_bond_x, f_bond_y, f_bond_z,
1712 f_nbond_x, f_nbond_y, f_nbond_z,
1713 f_slow_x, f_slow_y, f_slow_z, forceStride,
1714 pr, patchIDs, patchOffsets);
1716 copyFBondToSOA<true, false><<< numPatches, PATCH_BLOCKS, 0, stream >>>(f_bond, f_bond_nbond,
1717 f_bond_slow, f_bond_x, f_bond_y, f_bond_z,
1718 f_nbond_x, f_nbond_y, f_nbond_z,
1719 f_slow_x, f_slow_y, f_slow_z, forceStride,
1720 pr, patchIDs, patchOffsets);
1722 copyFBondToSOA<false, false><<< numPatches, PATCH_BLOCKS, 0, stream >>>(f_bond, f_bond_nbond,
1723 f_bond_slow, f_bond_x, f_bond_y, f_bond_z,
1724 f_nbond_x, f_nbond_y, f_nbond_z,
1725 f_slow_x, f_slow_y, f_slow_z, forceStride,
1726 pr, patchIDs, patchOffsets);
1730 __global__ void copyFSlowToSOA(const CudaForce* __restrict__ f_slow,
1734 const int* __restrict__ patchOffsets,
1735 const Lattice* __restrict__ lattices)
1737 int tid = threadIdx.x;
1738 int stride = blockDim.x;
1739 int patchOffset = patchOffsets[blockIdx.x];
1740 int numAtoms = patchOffsets[blockIdx.x + 1] - patchOffset;
1742 Lattice lat = lattices[0];
1744 for(int i = tid; i < numAtoms; i += stride){
1745 CudaForce f = f_slow[patchOffset + i];
1747 double3 f_scaled = lat.scale_force(
1748 Vector((double)f.x, (double)f.y, (double)f.z));
1750 // XXX Instead of accumulating slow force (+=), set them here (=)!
1751 f_slow_x[patchOffset + i] = f_scaled.x;
1752 f_slow_y[patchOffset + i] = f_scaled.y;
1753 f_slow_z[patchOffset + i] = f_scaled.z;
1757 void SequencerCUDAKernel::copy_slow_forces(int numPatches,
1758 const CudaForce* f_slow,
1762 const int* d_patchOffsets,
1763 const Lattice *lattices,
1764 cudaStream_t stream){
1766 copyFSlowToSOA<<<numPatches, PATCH_BLOCKS, 0, stream>>>(f_slow,
1767 f_slow_x, f_slow_y, f_slow_z, d_patchOffsets, lattices);
1770 __forceinline__ __device__ void zero_cudaTensor(cudaTensor &v)
1783 template <int MAX_FORCE_NUMBER, bool DO_VEL_RESCALING, bool DO_FIXED>
1784 __global__ void addForceToMomentumKernel(const double scaling,
1788 double velrescaling, // for stochastic velocity rescaling
1789 const double * __restrict recipMass,
1790 const double * __restrict f_normal_x,
1791 const double * __restrict f_normal_y,
1792 const double * __restrict f_normal_z,
1793 const double * __restrict f_nbond_x,
1794 const double * __restrict f_nbond_y,
1795 const double * __restrict f_nbond_z,
1796 const double * __restrict f_slow_x,
1797 const double * __restrict f_slow_y,
1798 const double * __restrict f_slow_z,
1799 double * __restrict vel_x,
1800 double * __restrict vel_y,
1801 double * __restrict vel_z,
1802 const int * __restrict atomFixed,
1805 int i = threadIdx.x + blockIdx.x*blockDim.x;
1808 switch (MAX_FORCE_NUMBER) {
1811 vel_x[i] += f_slow_x[i] * recipMass[i] * dt_slow;
1812 vel_y[i] += f_slow_y[i] * recipMass[i] * dt_slow;
1813 vel_z[i] += f_slow_z[i] * recipMass[i] * dt_slow;
1814 // fall through because we will always have the "faster" forces
1816 dt_nbond *= scaling;
1817 vel_x[i] += f_nbond_x[i] * recipMass[i] * dt_nbond;
1818 vel_y[i] += f_nbond_y[i] * recipMass[i] * dt_nbond;
1819 vel_z[i] += f_nbond_z[i] * recipMass[i] * dt_nbond;
1820 // fall through because we will always have the "faster" forces
1822 dt_normal *= scaling;
1823 vel_x[i] += f_normal_x[i] * recipMass[i] * dt_normal;
1824 vel_y[i] += f_normal_y[i] * recipMass[i] * dt_normal;
1825 vel_z[i] += f_normal_z[i] * recipMass[i] * dt_normal;
1831 switch (MAX_FORCE_NUMBER) {
1833 vx += f_slow_x[i] * dt_slow;
1834 vy += f_slow_y[i] * dt_slow;
1835 vz += f_slow_z[i] * dt_slow;
1836 // fall through because we will always have the "faster" forces
1838 vx += f_nbond_x[i] * dt_nbond;
1839 vy += f_nbond_y[i] * dt_nbond;
1840 vz += f_nbond_z[i] * dt_nbond;
1841 // fall through because we will always have the "faster" forces
1843 vx += f_normal_x[i] * dt_normal;
1844 vy += f_normal_y[i] * dt_normal;
1845 vz += f_normal_z[i] * dt_normal;
1847 vx *= scaling * recipMass[i];
1848 vy *= scaling * recipMass[i];
1849 vz *= scaling * recipMass[i];
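// Accumulating dt-weighted forces into (vx, vy, vz) and scaling once by
// scaling * recipMass[i] is equivalent to applying
// v += scaling * dt_level * f_level / m separately for each force level.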
1851 if (DO_VEL_RESCALING) {
1852 vel_x[i] = velrescaling*vel_x[i] + vx;
1853 vel_y[i] = velrescaling*vel_y[i] + vy;
1854 vel_z[i] = velrescaling*vel_z[i] + vz;
1861 } else { // fixed atoms
1862 if (!atomFixed[i]) {
1863 if (DO_VEL_RESCALING) {
1864 vel_x[i] = velrescaling*vel_x[i] + vx;
1865 vel_y[i] = velrescaling*vel_y[i] + vy;
1866 vel_z[i] = velrescaling*vel_z[i] + vz;
1884 // JM: This sets the cudaAtom vector from the nonbonded compute
1885 template <bool t_doNbond, bool t_doHomePatches>
1886 __global__ void setComputePositionsKernel(
1887 CudaLocalRecord* localRecords,
1888 CudaPeerRecord* peerRecords,
1889 const double3* __restrict patchCenter,
1890 const int* __restrict patchSortOrder,
1891 const int numPatches,
1892 const unsigned int devID,
1894 const double charge_scaling,
1895 const double* __restrict d_pos_x,
1896 const double* __restrict d_pos_y,
1897 const double* __restrict d_pos_z,
1898 const float* __restrict charges,
1899 double** __restrict d_peer_pos_x,
1900 double** __restrict d_peer_pos_y,
1901 double** __restrict d_peer_pos_z,
1902 float4* __restrict nb_atoms,
1903 float4* __restrict b_atoms,
1904 float4* __restrict s_atoms
1907 __shared__ CudaLocalRecord s_record;
1908 using AccessType = int32_t;
1909 AccessType* s_record_buffer = (AccessType*) &s_record;
1911 for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
1912 // Read in the CudaLocalRecord using multiple threads
1914 for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
1915 s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
1920 //int soapid = globalToLocalID[record.patchID];
1921 center = patchCenter[patchIndex];
1922 double3 ucenter = lat.unscale(center);
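// patchCenter is evidently stored in scaled (lattice) coordinates; unscale()
// converts it to Cartesian so it can be subtracted from the positions below,
// producing patch-centered compute positions.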
1924 const int numAtoms = s_record.numAtoms;
1925 const int dstOffset = s_record.bufferOffset;
1926 const int dstOffsetNB = s_record.bufferOffsetNBPad;
1928 int srcDevice, srcOffset;
1929 const double *srcPosX, *srcPosY, *srcPosZ;
1930 if (t_doHomePatches) {
1932 srcOffset = dstOffset;
1937 srcDevice = s_record.inline_peers[0].deviceIndex;
1938 srcOffset = s_record.inline_peers[0].bufferOffset;
1939 srcPosX = d_peer_pos_x[srcDevice];
1940 srcPosY = d_peer_pos_y[srcDevice];
1941 srcPosZ = d_peer_pos_z[srcDevice];
1945 for (int i = threadIdx.x; i < numAtoms; i += blockDim.x) {
1946 const int order = patchSortOrder[dstOffset + i]; // Should this be order or unorder?
1947 atom.x = srcPosX[srcOffset + order] - ucenter.x;
1948 atom.y = srcPosY[srcOffset + order] - ucenter.y;
1949 atom.z = srcPosZ[srcOffset + order] - ucenter.z;
1950 atom.w = charges[dstOffset + order] * charge_scaling;
1952 b_atoms[dstOffset + order] = atom;
1953 if (t_doNbond) nb_atoms[dstOffsetNB + i] = atom;
1957 if (threadIdx.x / WARPSIZE == 0) {
1958 const int to_write = (((numAtoms+32-1)/32)*32) - numAtoms; // WARPSIZE
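// Pad this patch's nonbonded buffer up to the next multiple of 32 by
// replicating the last atom, e.g. numAtoms = 100 gives
// to_write = 128 - 100 = 28 filler entries.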
1961 const int order = patchSortOrder[dstOffset + numAtoms - 1]; // Should this be order or unorder?
1962 lastAtom.x = srcPosX[srcOffset + order] - ucenter.x;
1963 lastAtom.y = srcPosY[srcOffset + order] - ucenter.y;
1964 lastAtom.z = srcPosZ[srcOffset + order] - ucenter.z;
1965 lastAtom.w = charges[dstOffset + order] * charge_scaling;
1967 if (threadIdx.x < to_write) {
1968 nb_atoms[dstOffsetNB+numAtoms+threadIdx.x] = lastAtom;
1976 template<bool t_doHomePatches, bool t_doFEP, bool t_doTI, bool t_doAlchDecouple, bool t_doAlchSoftCore>
1977 __global__ void setComputePositionsKernel_PME (
1978 const double* __restrict d_pos_x,
1979 const double* __restrict d_pos_y,
1980 const double* __restrict d_pos_z,
1981 const float* __restrict charges,
1982 double** __restrict d_peer_pos_x,
1983 double** __restrict d_peer_pos_y,
1984 double** __restrict d_peer_pos_z,
1985 float** __restrict d_peer_charge,
1986 int** __restrict d_peer_partition,
1987 const int* __restrict partition,
1988 const double charge_scaling,
1989 const int* __restrict s_patchPositions,
1990 const int* __restrict s_pencilPatchIndex,
1991 const int* __restrict s_patchIDs,
1993 float4* __restrict s_atoms,
1995 int* __restrict s_partition,
1997 const int peerDevice,
2000 const bool handleBoundary
2005 const int pmeBufferOffset = offset;
2006 const int srcBufferOffset = 0;
2007 const int srcDevice = peerDevice;
2009 // double precision calculation
2010 double px, py, pz, wx, wy, wz, q;
2012 for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < numAtoms; i += blockDim.x * gridDim.x) {
2013 // this gathers atoms in the same order as the PmeCuda code path
2014 if (t_doHomePatches) {
2015 q = (double)(charges[srcBufferOffset + i]);
2016 px = d_pos_x[srcBufferOffset + i];
2017 py = d_pos_y[srcBufferOffset + i];
2018 pz = d_pos_z[srcBufferOffset + i];
2019 if (t_doFEP || t_doTI) {
2020 part = partition[srcBufferOffset + i];
2023 q = (double)(d_peer_charge[srcDevice][srcBufferOffset + i]);
2024 px = d_peer_pos_x[srcDevice][srcBufferOffset + i];
2025 py = d_peer_pos_y[srcDevice][srcBufferOffset + i];
2026 pz = d_peer_pos_z[srcDevice][srcBufferOffset + i];
2027 if (t_doFEP || t_doTI) {
2028 part = d_peer_partition[srcDevice][srcBufferOffset + i];
2032 double3 w_vec = lat.scale(Vector(px, py, pz));
2037 if (handleBoundary) {
2038 wx = (wx - (floor(wx + 0.5) - 0.5));
2039 wy = (wy - (floor(wy + 0.5) - 0.5));
2040 wz = (wz - (floor(wz + 0.5) - 0.5));
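// Wraps each fractional coordinate into [0, 1): with u = w + 0.5 this is
// u - floor(u), e.g. w = 1.3 maps to 0.8 and w = -0.2 maps to 0.3.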
2047 if (handleBoundary) {
2048 foo.x = foo.x - 1.0f*(foo.x >= 1.0f);
2049 foo.y = foo.y - 1.0f*(foo.y >= 1.0f);
2050 foo.z = foo.z - 1.0f*(foo.z >= 1.0f);
2053 foo.w = (float) (charge_scaling * q);
2054 if (!t_doFEP && !t_doTI) {
2055 s_atoms[pmeBufferOffset + i] = foo;
2057 else { // alchemical multiple grids
2058 float4 foo_zero_charge = foo;
2059 foo_zero_charge.w = 0.0f;
2060 s_partition[pmeBufferOffset + i] = part;
2061 /* grid 0 grid 1 grid 2 grid 3 grid 4
2062 * non-alch (p = 0) Y Y N N Y
2063 * appearing (p = 1) Y N Y N N
2064 * disappearing (p = 2) N Y N Y N
2065 * Requirements of grids:
2066 * 1. t_doFEP || t_doTI : grid 0, grid 1
2067 * 2. t_doAlchDecouple : grid 2, grid 3
2068 * 3. t_doAlchSoftCore || t_doTI: grid 4
2069 * grid 4 can be s_atoms[i + 4 * numAtoms] (t_doAlchDecouple) or s_atoms[i + 2 * numAtoms] (!t_doAlchDecouple)
2070 * Although the zero-charge entries in the extra grids do not change on non-migration steps,
2071 * these copies cannot be avoided, because the atom positions can still change.
2072 * The non-zero charges may also change if they come from a QM engine or some new kind of force field.
2073 * So these branches on non-migration steps appear unavoidable.
2078 s_atoms[pmeBufferOffset + i] = foo;
2079 s_atoms[pmeBufferOffset + i + numTotalAtoms] = foo;
2080 if (t_doAlchDecouple) {
2081 s_atoms[pmeBufferOffset + i + 2 * numTotalAtoms] = foo_zero_charge;
2082 s_atoms[pmeBufferOffset + i + 3 * numTotalAtoms] = foo_zero_charge;
2083 if (t_doAlchSoftCore || t_doTI) {
2084 s_atoms[pmeBufferOffset + i + 4 * numTotalAtoms] = foo;
2087 if (t_doAlchSoftCore || t_doTI) {
2088 s_atoms[pmeBufferOffset + i + 2 * numTotalAtoms] = foo;
2095 s_atoms[pmeBufferOffset + i] = foo;
2096 s_atoms[pmeBufferOffset + i + numTotalAtoms] = foo_zero_charge;
2097 if (t_doAlchDecouple) {
2098 s_atoms[pmeBufferOffset + i + 2 * numTotalAtoms] = foo;
2099 s_atoms[pmeBufferOffset + i + 3 * numTotalAtoms] = foo_zero_charge;
2100 if (t_doAlchSoftCore || t_doTI) {
2101 s_atoms[pmeBufferOffset + i + 4 * numTotalAtoms] = foo_zero_charge;
2104 if (t_doAlchSoftCore || t_doTI) {
2105 s_atoms[pmeBufferOffset + i + 2 * numTotalAtoms] = foo_zero_charge;
2110 // disappearing atoms
2112 s_atoms[pmeBufferOffset + i] = foo_zero_charge;
2113 s_atoms[pmeBufferOffset + i + numTotalAtoms] = foo;
2114 if (t_doAlchDecouple) {
2115 s_atoms[pmeBufferOffset + i + 2 * numTotalAtoms] = foo_zero_charge;
2116 s_atoms[pmeBufferOffset + i + 3 * numTotalAtoms] = foo;
2117 if (t_doAlchSoftCore || t_doTI) {
2118 s_atoms[pmeBufferOffset + i + 4 * numTotalAtoms] = foo_zero_charge;
2121 if (t_doAlchSoftCore || t_doTI) {
2122 s_atoms[pmeBufferOffset + i + 2 * numTotalAtoms] = foo_zero_charge;
2132 void SequencerCUDAKernel::set_compute_positions(
2134 const bool isPmeDevice,
2136 const int numPatchesHomeAndProxy,
2137 const int numPatchesHome,
2142 const bool doAlchDecouple,
2143 const bool doAlchSoftCore,
2144 const bool handleBoundary,
2145 const double* d_pos_x,
2146 const double* d_pos_y,
2147 const double* d_pos_z,
2148 #ifndef NAMD_NCCL_ALLREDUCE
2149 double** d_peer_pos_x,
2150 double** d_peer_pos_y,
2151 double** d_peer_pos_z,
2152 float** d_peer_charge,
2153 int** d_peer_partition,
2155 const float* charges,
2156 const int* partition,
2157 const double charge_scaling,
2158 const double3* patchCenter,
2159 const int* s_patchPositions,
2160 const int* s_pencilPatchIndex,
2161 const int* s_patchIDs,
2162 const int* patchSortOrder,
2163 const Lattice lattice,
2169 CudaLocalRecord* localRecords,
2170 CudaPeerRecord* peerRecords,
2171 std::vector<int>& atomCounts,
2172 cudaStream_t stream)
2174 // Launch Local Set Compute Positions
2176 setComputePositionsKernel<true, true><<<numPatchesHome, PATCH_BLOCKS, 0, stream>>>(
2177 localRecords, peerRecords, patchCenter, patchSortOrder,
2178 numPatchesHome, devID, lattice, charge_scaling,
2179 d_pos_x, d_pos_y, d_pos_z, charges, d_peer_pos_x, d_peer_pos_y, d_peer_pos_z,
2180 nb_atoms, b_atoms, s_atoms
2183 setComputePositionsKernel<false, true><<<numPatchesHome, PATCH_BLOCKS, 0, stream>>>(
2184 localRecords, peerRecords, patchCenter, patchSortOrder,
2185 numPatchesHome, devID, lattice, charge_scaling,
2186 d_pos_x, d_pos_y, d_pos_z, charges, d_peer_pos_x, d_peer_pos_y, d_peer_pos_z,
2187 nb_atoms, b_atoms, s_atoms
2191 // Launch Peer Set Computes
2193 const int numProxyPatches = numPatchesHomeAndProxy - numPatchesHome;
2195 setComputePositionsKernel<true, false><<<numProxyPatches, PATCH_BLOCKS, 0, stream>>>(
2196 localRecords + numPatchesHome, peerRecords, patchCenter + numPatchesHome, patchSortOrder,
2197 numProxyPatches, devID, lattice, charge_scaling,
2198 d_pos_x, d_pos_y, d_pos_z, charges, d_peer_pos_x, d_peer_pos_y, d_peer_pos_z,
2199 nb_atoms, b_atoms, s_atoms
2202 setComputePositionsKernel<false, false><<<numProxyPatches, PATCH_BLOCKS, 0, stream>>>(
2203 localRecords + numPatchesHome, peerRecords, patchCenter + numPatchesHome, patchSortOrder,
2204 numProxyPatches, devID, lattice, charge_scaling,
2205 d_pos_x, d_pos_y, d_pos_z, charges, d_peer_pos_x, d_peer_pos_y, d_peer_pos_z,
2206 nb_atoms, b_atoms, s_atoms
2211 // Launch PME setComputes
2212 #define CALL(HOME, FEP, TI, ALCH_DECOUPLE, ALCH_SOFTCORE) \
2213 setComputePositionsKernel_PME<HOME, FEP, TI, ALCH_DECOUPLE, ALCH_SOFTCORE> \
2214 <<<pme_grid, PATCH_BLOCKS, 0, stream>>>( \
2215 d_pos_x, d_pos_y, d_pos_z, charges, \
2216 d_peer_pos_x, d_peer_pos_y, d_peer_pos_z, d_peer_charge, \
2217 d_peer_partition, partition, charge_scaling, \
2218 s_patchPositions, s_pencilPatchIndex, s_patchIDs, \
2219 lattice, s_atoms, numTotalAtoms, s_partition, \
2220 i, numAtoms, offset, handleBoundary \
2222 // Only run when PME long-range electrostatics is enabled (doSlow is true).
2223 // The partition array is needed for the alchemical calculations.
2224 if(doSlow && isPmeDevice) {
2226 for (int i = 0; i < nDev; i++) {
2227 const bool home = (i == devID);
2228 const int numAtoms = atomCounts[i];
2229 const int pme_grid = (numAtoms + PATCH_BLOCKS - 1) / PATCH_BLOCKS;
2230 const int options = (home << 4) + (doFEP << 3) + (doTI << 2) + (doAlchDecouple << 1) + doAlchSoftCore;
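// Example: home=1, doFEP=1, doTI=0, doAlchDecouple=1, doAlchSoftCore=0 packs
// to 16 + 8 + 2 = 26, which dispatches CALL(1, 1, 0, 1, 0) below.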
2233 case 0: CALL(0, 0, 0, 0, 0); break;
2234 case 1: CALL(0, 0, 0, 0, 1); break;
2235 case 2: CALL(0, 0, 0, 1, 0); break;
2236 case 3: CALL(0, 0, 0, 1, 1); break;
2237 case 4: CALL(0, 0, 1, 0, 0); break;
2238 case 5: CALL(0, 0, 1, 0, 1); break;
2239 case 6: CALL(0, 0, 1, 1, 0); break;
2240 case 7: CALL(0, 0, 1, 1, 1); break;
2241 case 8: CALL(0, 1, 0, 0, 0); break;
2242 case 9: CALL(0, 1, 0, 0, 1); break;
2243 case 10: CALL(0, 1, 0, 1, 0); break;
2244 case 11: CALL(0, 1, 0, 1, 1); break;
2245 case 12: CALL(0, 1, 1, 0, 0); break;
2246 case 13: CALL(0, 1, 1, 0, 1); break;
2247 case 14: CALL(0, 1, 1, 1, 0); break;
2248 case 15: CALL(0, 1, 1, 1, 1); break;
2249 case 16: CALL(1, 0, 0, 0, 0); break;
2250 case 17: CALL(1, 0, 0, 0, 1); break;
2251 case 18: CALL(1, 0, 0, 1, 0); break;
2252 case 19: CALL(1, 0, 0, 1, 1); break;
2253 case 20: CALL(1, 0, 1, 0, 0); break;
2254 case 21: CALL(1, 0, 1, 0, 1); break;
2255 case 22: CALL(1, 0, 1, 1, 0); break;
2256 case 23: CALL(1, 0, 1, 1, 1); break;
2257 case 24: CALL(1, 1, 0, 0, 0); break;
2258 case 25: CALL(1, 1, 0, 0, 1); break;
2259 case 26: CALL(1, 1, 0, 1, 0); break;
2260 case 27: CALL(1, 1, 0, 1, 1); break;
2261 case 28: CALL(1, 1, 1, 0, 0); break;
2262 case 29: CALL(1, 1, 1, 0, 1); break;
2263 case 30: CALL(1, 1, 1, 1, 0); break;
2264 case 31: CALL(1, 1, 1, 1, 1); break;
2266 NAMD_die("SequencerCUDAKernel::setComputePositions: no kernel called");
2275 template <bool t_doFEP, bool t_doTI, bool t_doAlchDecouple, bool t_doAlchSoftCore>
2276 __global__ void setPmePositionsKernel(
2277 double charge_scaling,
2278 const Lattice lattice,
2279 const double *pos_x,
2280 const double *pos_y,
2281 const double *pos_z,
2282 const float *charges,
2283 const int* partition,
2285 int* s_atoms_partition,
2288 int i = threadIdx.x + blockIdx.x*blockDim.x;
2290 Lattice lat = lattice;
2292 double wx, wy, wz, q;
2293 q = (double)(charges[i]);
2295 double3 w_vec = lat.scale(Vector(pos_x[i], pos_y[i], pos_z[i]));
2300 wx = (wx - (floor(wx + 0.5) - 0.5));
2301 wy = (wy - (floor(wy + 0.5) - 0.5));
2302 wz = (wz - (floor(wz + 0.5) - 0.5));
2306 foo.w = (float) (charge_scaling * q);
2307 foo.x = foo.x - 1.0f*(foo.x >= 1.0f);
2308 foo.y = foo.y - 1.0f*(foo.y >= 1.0f);
2309 foo.z = foo.z - 1.0f*(foo.z >= 1.0f);
2310 if (!t_doFEP && !t_doTI) {
2313 else { // alchemical multiple grids
2314 float4 foo_zero_charge = foo;
2315 foo_zero_charge.w = 0.0f;
2316 s_atoms_partition[i] = partition[i];
2317 /* grid 0 grid 1 grid 2 grid 3 grid 4
2318 * non-alch (p = 0) Y Y N N Y
2319 * appearing (p = 1) Y N Y N N
2320 * disappearing (p = 2) N Y N Y N
2321 * Requirements of grids:
2322 * 1. t_doFEP || t_doTI : grid 0, grid 1
2323 * 2. t_doAlchDecouple : grid 2, grid 3
2324 * 3. t_doAlchSoftCore || t_doTI: grid 4
2325 * grid 4 can be s_atoms[i + 4 * numAtoms] (t_doAlchDecouple) or s_atoms[i + 2 * numAtoms] (!t_doAlchDecouple)
2327 switch (partition[i]) {
2331 s_atoms[i + numAtoms] = foo;
2332 if (t_doAlchDecouple) {
2333 s_atoms[i + 2 * numAtoms] = foo_zero_charge;
2334 s_atoms[i + 3 * numAtoms] = foo_zero_charge;
2335 if (t_doAlchSoftCore || t_doTI) {
2336 s_atoms[i + 4 * numAtoms] = foo;
2339 if (t_doAlchSoftCore || t_doTI) {
2340 s_atoms[i + 2 * numAtoms] = foo;
2348 s_atoms[i + numAtoms] = foo_zero_charge;
2349 if (t_doAlchDecouple) {
2350 s_atoms[i + 2 * numAtoms] = foo;
2351 s_atoms[i + 3 * numAtoms] = foo_zero_charge;
2352 if (t_doAlchSoftCore || t_doTI) {
2353 s_atoms[i + 4 * numAtoms] = foo_zero_charge;
2356 if (t_doAlchSoftCore || t_doTI) {
2357 s_atoms[i + 2 * numAtoms] = foo_zero_charge;
2362 // disappearing atoms
2364 s_atoms[i] = foo_zero_charge;
2365 s_atoms[i + numAtoms] = foo;
2366 if (t_doAlchDecouple) {
2367 s_atoms[i + 2 * numAtoms] = foo_zero_charge;
2368 s_atoms[i + 3 * numAtoms] = foo;
2369 if (t_doAlchSoftCore || t_doTI) {
2370 s_atoms[i + 4 * numAtoms] = foo_zero_charge;
2373 if (t_doAlchSoftCore || t_doTI) {
2374 s_atoms[i + 2 * numAtoms] = foo_zero_charge;
2384 __global__ void maximumMoveKernel(const double maxvel2,
2385 const double * __restrict vel_x,
2386 const double * __restrict vel_y,
2387 const double * __restrict vel_z,
2391 int i = threadIdx.x + blockIdx.x*blockDim.x;
2393 double vel2 = vel_x[i]*vel_x[i] + vel_y[i]*vel_y[i] + vel_z[i]*vel_z[i];
2394 if (vel2 > maxvel2) {
2395 //If this ever happens, we're already screwed, so performance does not matter
2396 atomicAdd(killme, 1);
2401 void SequencerCUDAKernel::maximumMove(const double maxvel2,
2402 const double *vel_x,
2403 const double *vel_y,
2404 const double *vel_z,
2407 cudaStream_t stream)
2409 //cudaCheck(cudaMemset(killme, 0, sizeof(int)));
2410 int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
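// Standard ceiling division for the launch grid; e.g. assuming ATOM_BLOCKS is
// 128, numAtoms = 1000 gives grid = 8 blocks.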
2411 maximumMoveKernel<<<grid, ATOM_BLOCKS, 0, stream>>>(
2412 maxvel2, vel_x, vel_y, vel_z, killme, numAtoms);
2415 void SequencerCUDAKernel::addForceToMomentum(
2417 const double scaling,
2421 double velrescaling, // for stochastic velocity rescaling
2422 const double *recipMass,
2423 const double *f_normal_x,
2424 const double *f_normal_y,
2425 const double *f_normal_z,
2426 const double *f_nbond_x,
2427 const double *f_nbond_y,
2428 const double *f_nbond_z,
2429 const double *f_slow_x,
2430 const double *f_slow_y,
2431 const double *f_slow_z,
2435 const int *atomFixed,
2438 cudaStream_t stream)
2440 int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
2441 const bool doVelRescaling = (velrescaling != 1.0);
2442 int kernelCalled = 0;
2443 #define CALL(MAX_FORCE_NUMBER, DO_VEL_RESCALING, DO_FIXED) \
2445 addForceToMomentumKernel<MAX_FORCE_NUMBER, DO_VEL_RESCALING, DO_FIXED> \
2446 <<<grid, ATOM_BLOCKS, 0, stream>>>( \
2447 scaling, dt_normal, dt_nbond, dt_slow, velrescaling, \
2448 recipMass, f_normal_x, f_normal_y, f_normal_z, \
2449 f_nbond_x, f_nbond_y, f_nbond_z, \
2450 f_slow_x, f_slow_y, f_slow_z, \
2451 vel_x, vel_y, vel_z, atomFixed, \
2455 switch (maxForceNumber) {
2457 if (doVelRescaling && doFixedAtoms) CALL(2, true, true);
2458 if (doVelRescaling && !doFixedAtoms) CALL(2, true, false);
2459 if (!doVelRescaling && !doFixedAtoms) CALL(2, false, false);
2460 if (!doVelRescaling && doFixedAtoms) CALL(2, false, true);
2464 if (doVelRescaling && doFixedAtoms) CALL(1, true, true);
2465 if (doVelRescaling && !doFixedAtoms) CALL(1, true, false);
2466 if (!doVelRescaling && !doFixedAtoms) CALL(1, false, false);
2467 if (!doVelRescaling && doFixedAtoms) CALL(1, false, true);
2471 if (doVelRescaling && doFixedAtoms) CALL(0, true, true);
2472 if (doVelRescaling && !doFixedAtoms) CALL(0, true, false);
2473 if (!doVelRescaling && !doFixedAtoms) CALL(0, false, false);
2474 if (!doVelRescaling && doFixedAtoms) CALL(0, false, true);
2479 if (kernelCalled != 1) NAMD_bug("Incorrect number of calls to kernel in SequencerCUDAKernel::addForceToMomentum!\n");
2480 // cudaCheck(cudaGetLastError());
2483 template <bool copyPos, bool DO_FIXED>
2484 __global__ void addVelocityToPositionKernel(
2486 const int* __restrict atomFixed,
2487 const double * __restrict vel_x,
2488 const double * __restrict vel_y,
2489 const double * __restrict vel_z,
2490 double * __restrict pos_x,
2491 double * __restrict pos_y,
2492 double * __restrict pos_z,
2493 double * __restrict h_pos_x, // host-mapped vectors
2494 double * __restrict h_pos_y,
2495 double * __restrict h_pos_z,
2498 int i = threadIdx.x + blockIdx.x*blockDim.x;
2505 if (!atomFixed[i]) {
2526 void SequencerCUDAKernel::addVelocityToPosition(
2529 const int *atomFixed,
2530 const double *vel_x,
2531 const double *vel_y,
2532 const double *vel_z,
2541 cudaStream_t stream)
2543 int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
2544 #define CALL(copyPos, DO_FIXED) \
2545 addVelocityToPositionKernel<copyPos, DO_FIXED><<<grid, ATOM_BLOCKS, 0, stream>>>( \
2546 dt, atomFixed, vel_x, vel_y, vel_z, pos_x, pos_y, pos_z, \
2547 h_pos_x, h_pos_y, h_pos_z, numAtoms)
2548 if ( copyPositions && doFixedAtoms) CALL(true, true);
2549 else if (!copyPositions && doFixedAtoms) CALL(false, true);
2550 else if ( copyPositions && !doFixedAtoms) CALL(true, false);
2551 else if (!copyPositions && !doFixedAtoms) CALL(false, false);
2552 else NAMD_bug("No kernel was called in SequencerCUDAKernel::addVelocityToPosition!\n");
2554 //cudaCheck(cudaGetLastError());
2557 template <bool doFixed>
2558 __global__ void updateRigidArraysKernel(
2560 const int * __restrict atomFixed,
2561 const double * __restrict vel_x,
2562 const double * __restrict vel_y,
2563 const double * __restrict vel_z,
2564 const double * __restrict pos_x,
2565 const double * __restrict pos_y,
2566 const double * __restrict pos_z,
2567 double * __restrict velNew_x,
2568 double * __restrict velNew_y,
2569 double * __restrict velNew_z,
2570 double * __restrict posNew_x,
2571 double * __restrict posNew_y,
2572 double * __restrict posNew_z,
2575 int i = threadIdx.x + blockIdx.x*blockDim.x;
2577 double vx = vel_x[i];
2578 double vy = vel_y[i];
2579 double vz = vel_z[i];
2581 posNew_x[i] = pos_x[i] + (vx * dt);
2582 posNew_y[i] = pos_y[i] + (vy * dt);
2583 posNew_z[i] = pos_z[i] + (vz * dt);
2585 if (!atomFixed[i]) {
2586 posNew_x[i] = pos_x[i] + (vx * dt);
2587 posNew_y[i] = pos_y[i] + (vy * dt);
2588 posNew_z[i] = pos_z[i] + (vz * dt);
2590 posNew_x[i] = pos_x[i];
2591 posNew_y[i] = pos_y[i];
2592 posNew_z[i] = pos_z[i];
2602 // JM NOTE: Optimize this kernel further to improve global memory access pattern
2603 __global__ void centerOfMassKernel(
2604 const double * __restrict coor_x,
2605 const double * __restrict coor_y,
2606 const double * __restrict coor_z,
2607 double * __restrict cm_x, // center of mass is atom-sized
2608 double * __restrict cm_y,
2609 double * __restrict cm_z,
2610 const float * __restrict mass,
2611 const int * __restrict hydrogenGroupSize,
2614 int i = threadIdx.x + blockIdx.x*blockDim.x;
2616 int hgs = hydrogenGroupSize[i];
2619 // missing fixed atoms
2621 BigReal reg_cm_x = 0;
2622 BigReal reg_cm_y = 0;
2623 BigReal reg_cm_z = 0;
2624 for ( j = i; j < (i+hgs); ++j ) {
2626 reg_cm_x += mass[j] * coor_x[j];
2627 reg_cm_y += mass[j] * coor_y[j];
2628 reg_cm_z += mass[j] * coor_z[j];
2630 BigReal inv_m_cm = 1.0/m_cm;
2631 reg_cm_x *= inv_m_cm;
2632 reg_cm_y *= inv_m_cm;
2633 reg_cm_z *= inv_m_cm;
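// reg_cm_* now holds the hydrogen group's center of mass,
// cm = sum_j(m_j * r_j) / m_cm, with the total group mass m_cm accumulated in
// the same loop.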
2635 for(j = i ; j < (i + hgs); j++){
2644 void SequencerCUDAKernel::centerOfMass(
2645 const double *coor_x,
2646 const double *coor_y,
2647 const double *coor_z,
2652 const int* hydrogenGroupSize,
2656 int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
2658 centerOfMassKernel<<<grid, ATOM_BLOCKS, 0, stream>>>(coor_x, coor_y, coor_z,
2659 cm_x, cm_y, cm_z, mass, hydrogenGroupSize, numAtoms);
2663 void SequencerCUDAKernel::updateRigidArrays(
2666 const int *atomFixed,
2667 const double *vel_x,
2668 const double *vel_y,
2669 const double *vel_z,
2670 const double *pos_x,
2671 const double *pos_y,
2672 const double *pos_z,
2680 cudaStream_t stream)
2682 int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
2684 updateRigidArraysKernel<true><<<grid, ATOM_BLOCKS, 0, stream>>>(
2685 dt, atomFixed, vel_x, vel_y, vel_z, pos_x, pos_y, pos_z,
2686 velNew_x, velNew_y, velNew_z,
2687 posNew_x, posNew_y, posNew_z, numAtoms);
2689 updateRigidArraysKernel<false><<<grid, ATOM_BLOCKS, 0, stream>>>(
2690 dt, atomFixed, vel_x, vel_y, vel_z, pos_x, pos_y, pos_z,
2691 velNew_x, velNew_y, velNew_z,
2692 posNew_x, posNew_y, posNew_z, numAtoms);
2693 // cudaCheck(cudaGetLastError());
2696 template<int BLOCK_SIZE, bool DO_FIXED>
2697 __global__ void submitHalfKernel(
2698 // const int* __restrict atomFixed,
2699 const double * __restrict vel_x,
2700 const double * __restrict vel_y,
2701 const double * __restrict vel_z,
2702 const double * __restrict vcm_x,
2703 const double * __restrict vcm_y,
2704 const double * __restrict vcm_z,
2705 const float * __restrict mass,
2706 BigReal *kineticEnergy,
2707 BigReal *intKineticEnergy,
2709 cudaTensor *intVirialNormal,
2710 BigReal *h_kineticEnergy,
2711 BigReal *h_intKineticEnergy,
2712 cudaTensor *h_virial,
2713 cudaTensor *h_intVirialNormal,
2714 int *hydrogenGroupSize,
2716 unsigned int* tbcatomic)
2718 BigReal k = 0, intK = 0;
2721 zero_cudaTensor(intV);
2722 int i = threadIdx.x + blockIdx.x*blockDim.x;
2723 int totaltb = gridDim.x;
2724 __shared__ bool isLastBlockDone;
2727 if(threadIdx.x == 0){
2728 isLastBlockDone = 0;
2735 (vel_x[i]*vel_x[i] + vel_y[i]*vel_y[i] + vel_z[i]*vel_z[i]);
2736 v.xx = mass[i] * vel_x[i] * vel_x[i];
2738 v.xy = mass[i] * vel_x[i] * vel_y[i];
2739 v.xz = mass[i] * vel_x[i] * vel_z[i];
2741 v.yx = mass[i] * vel_y[i] * vel_x[i];
2742 v.yy = mass[i] * vel_y[i] * vel_y[i];
2744 v.yz = mass[i] * vel_y[i] * vel_z[i];
2746 v.zx = mass[i] * vel_z[i] * vel_x[i];
2747 v.zy = mass[i] * vel_z[i] * vel_y[i];
2748 v.zz = mass[i] * vel_z[i] * vel_z[i];
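// Per-atom contribution to the kinetic-energy (virial) tensor,
// v_ab = m * vel_a * vel_b.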
2751 int hgs = hydrogenGroupSize[i];
2757 for (int j = i; j < (i+hgs); j++) {
2759 v_cm_x += mass[j] * vel_x[j];
2760 v_cm_y += mass[j] * vel_y[j];
2761 v_cm_z += mass[j] * vel_z[j];
2763 BigReal recip_m_cm = 1.0 / m_cm;
2764 v_cm_x *= recip_m_cm;
2765 v_cm_y *= recip_m_cm;
2766 v_cm_z *= recip_m_cm;
2768 for (int j = i; j < (i+hgs); j++) {
2769 BigReal dv_x = vel_x[j] - v_cm_x;
2770 BigReal dv_y = vel_y[j] - v_cm_y;
2771 BigReal dv_z = vel_z[j] - v_cm_z;
2773 (vel_x[j] * dv_x + vel_y[j] * dv_y + vel_z[j] * dv_z);
2774 intV.xx += mass[j] * vel_x[j] * dv_x;
2775 intV.xy += mass[j] * vel_x[j] * dv_y;
2776 intV.xz += mass[j] * vel_x[j] * dv_z;
2777 intV.yx += mass[j] * vel_y[j] * dv_x;
2778 intV.yy += mass[j] * vel_y[j] * dv_y;
2779 intV.yz += mass[j] * vel_y[j] * dv_z;
2780 intV.zx += mass[j] * vel_z[j] * dv_x;
2781 intV.zy += mass[j] * vel_z[j] * dv_y;
2782 intV.zz += mass[j] * vel_z[j] * dv_z;
2786 // JM: New version with centers of mass calculated by a separate kernel
2787 BigReal v_cm_x = vcm_x[i];
2788 BigReal v_cm_y = vcm_y[i];
2789 BigReal v_cm_z = vcm_z[i];
2790 BigReal dv_x = vel_x[i] - v_cm_x;
2791 BigReal dv_y = vel_y[i] - v_cm_y;
2792 BigReal dv_z = vel_z[i] - v_cm_z;
2794 (vel_x[i] * dv_x + vel_y[i] * dv_y + vel_z[i] * dv_z);
2795 intV.xx += mass[i] * vel_x[i] * dv_x;
2796 intV.xy += mass[i] * vel_x[i] * dv_y;
2797 intV.xz += mass[i] * vel_x[i] * dv_z;
2798 intV.yx += mass[i] * vel_y[i] * dv_x;
2799 intV.yy += mass[i] * vel_y[i] * dv_y;
2800 intV.yz += mass[i] * vel_y[i] * dv_z;
2801 intV.zx += mass[i] * vel_z[i] * dv_x;
2802 intV.zy += mass[i] * vel_z[i] * dv_y;
2803 intV.zz += mass[i] * vel_z[i] * dv_z;
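// The internal virial uses group-relative velocities dv = v - v_cm, so a
// rigid translation of the whole hydrogen group contributes nothing to intV.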
2807 typedef cub::BlockReduce<BigReal, BLOCK_SIZE> BlockReduce;
2808 __shared__ typename BlockReduce::TempStorage temp_storage;
2809 k = BlockReduce(temp_storage).Sum(k);
2811 v.xx = BlockReduce(temp_storage).Sum(v.xx);
2814 v.xy = BlockReduce(temp_storage).Sum(v.xy);
2816 v.xz = BlockReduce(temp_storage).Sum(v.xz);
2819 v.yx = BlockReduce(temp_storage).Sum(v.yx);
2821 v.yy = BlockReduce(temp_storage).Sum(v.yy);
2824 v.yz = BlockReduce(temp_storage).Sum(v.yz);
2827 v.zx = BlockReduce(temp_storage).Sum(v.zx);
2829 v.zy = BlockReduce(temp_storage).Sum(v.zy);
2831 v.zz = BlockReduce(temp_storage).Sum(v.zz);
2833 intK = BlockReduce(temp_storage).Sum(intK);
2835 intV.xx = BlockReduce(temp_storage).Sum(intV.xx);
2837 intV.xy = BlockReduce(temp_storage).Sum(intV.xy);
2839 intV.xz = BlockReduce(temp_storage).Sum(intV.xz);
2841 intV.yx = BlockReduce(temp_storage).Sum(intV.yx);
2843 intV.yy = BlockReduce(temp_storage).Sum(intV.yy);
2845 intV.yz = BlockReduce(temp_storage).Sum(intV.yz);
2847 intV.zx = BlockReduce(temp_storage).Sum(intV.zx);
2849 intV.zy = BlockReduce(temp_storage).Sum(intV.zy);
2851 intV.zz = BlockReduce(temp_storage).Sum(intV.zz);
2854 if (threadIdx.x == 0) {
2855 const int bin = blockIdx.x % ATOMIC_BINS;
2857 atomicAdd(&kineticEnergy[bin], k);
2858 atomicAdd(&virial[bin].xx, v.xx);
2860 atomicAdd(&virial[bin].xy, v.xy);
2861 atomicAdd(&virial[bin].xz, v.xz);
2863 atomicAdd(&virial[bin].yx, v.yx);
2864 atomicAdd(&virial[bin].yy, v.yy);
2866 atomicAdd(&virial[bin].yz, v.yz);
2868 atomicAdd(&virial[bin].zx, v.zx);
2869 atomicAdd(&virial[bin].zy, v.zy);
2870 atomicAdd(&virial[bin].zz, v.zz);
2871 atomicAdd(&intKineticEnergy[bin], intK);
2872 atomicAdd(&intVirialNormal[bin].xx, intV.xx);
2873 atomicAdd(&intVirialNormal[bin].xy, intV.xy);
2874 atomicAdd(&intVirialNormal[bin].xz, intV.xz);
2875 atomicAdd(&intVirialNormal[bin].yx, intV.yx);
2876 atomicAdd(&intVirialNormal[bin].yy, intV.yy);
2877 atomicAdd(&intVirialNormal[bin].yz, intV.yz);
2878 atomicAdd(&intVirialNormal[bin].zx, intV.zx);
2879 atomicAdd(&intVirialNormal[bin].zy, intV.zy);
2880 atomicAdd(&intVirialNormal[bin].zz, intV.zz);
2883 unsigned int value = atomicInc(&tbcatomic[1], totaltb);
2884 isLastBlockDone = (value == (totaltb -1));
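// tbcatomic[1] counts finished thread blocks; the block that observes
// totaltb - 1 is the last one and performs the final reduction below.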
2889 if(isLastBlockDone){
2890 if(threadIdx.x < ATOMIC_BINS){
2891 const int bin = threadIdx.x;
2893 double k = kineticEnergy[bin];
2894 double intK = intKineticEnergy[bin];
2895 cudaTensor v = virial[bin];
2896 cudaTensor intV = intVirialNormal[bin];
2898 // sets device scalars back to zero
2899 kineticEnergy[bin] = 0.0;
2900 intKineticEnergy[bin] = 0.0;
2901 zero_cudaTensor(virial[bin]);
2902 zero_cudaTensor(intVirialNormal[bin]);
2904 if(ATOMIC_BINS > 1){
2905 typedef cub::WarpReduce<double, (ATOMIC_BINS > 1 ? ATOMIC_BINS : 2)> WarpReduce;
2906 typedef cub::WarpReduce<cudaTensor, (ATOMIC_BINS > 1 ? ATOMIC_BINS : 2)> WarpReduceT;
2907 __shared__ typename WarpReduce::TempStorage tempStorage;
2908 __shared__ typename WarpReduceT::TempStorage tempStorageT;
2910 k = WarpReduce(tempStorage).Sum(k);
2911 intK = WarpReduce(tempStorage).Sum(intK);
2912 v = WarpReduceT(tempStorageT).Sum(v);
2913 intV = WarpReduceT(tempStorageT).Sum(intV);
2916 if(threadIdx.x == 0){
2917 h_kineticEnergy[0] = k;
2918 h_intKineticEnergy[0] = intK;
2920 h_intVirialNormal[0] = intV;
2922 //resets atomic counter
2923 reset_atomic_counter(&tbcatomic[1]);
2930 void SequencerCUDAKernel::submitHalf(
2931 const bool doFixedAtoms,
2932 const double *vel_x,
2933 const double *vel_y,
2934 const double *vel_z,
2935 const double *vcm_x,
2936 const double *vcm_y,
2937 const double *vcm_z,
2939 BigReal *kineticEnergy,
2940 BigReal *intKineticEnergy,
2942 cudaTensor *intVirialNormal,
2943 BigReal *h_kineticEnergy,
2944 BigReal *h_intKineticEnergy,
2945 cudaTensor *h_virial,
2946 cudaTensor *h_intVirialNormal,
2947 int *hydrogenGroupSize,
2949 unsigned int* tbcatomic,
2950 cudaStream_t stream)
2952 int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
2954 submitHalfKernel<ATOM_BLOCKS, true><<<grid, ATOM_BLOCKS, 0, stream>>>(
2955 vel_x, vel_y, vel_z,
2956 vcm_x, vcm_y, vcm_z, mass,
2957 kineticEnergy, intKineticEnergy,
2958 virial, intVirialNormal, h_kineticEnergy, h_intKineticEnergy,
2959 h_virial, h_intVirialNormal,
2960 hydrogenGroupSize, numAtoms, tbcatomic);
2962 submitHalfKernel<ATOM_BLOCKS, false><<<grid, ATOM_BLOCKS, 0, stream>>>(
2963 vel_x, vel_y, vel_z,
2964 vcm_x, vcm_y, vcm_z, mass,
2965 kineticEnergy, intKineticEnergy,
2966 virial, intVirialNormal, h_kineticEnergy, h_intKineticEnergy,
2967 h_virial, h_intVirialNormal,
2968 hydrogenGroupSize, numAtoms, tbcatomic);
2969 //cudaCheck(cudaGetLastError());
2972 __global__ void scaleCoordinateUseGroupPressureKernel(
2973 double * __restrict pos_x,
2974 double * __restrict pos_y,
2975 double * __restrict pos_z,
2977 int *hydrogenGroupSize,
2982 int i = threadIdx.x + blockIdx.x * blockDim.x;
2984 int hgs = hydrogenGroupSize[i];
2987 // missing fixed atoms implementation
2992 // calculate the center of mass
2993 for ( j = i; j < (i+hgs); ++j ) {
2995 r_cm_x += mass[j] * pos_x[j];
2996 r_cm_y += mass[j] * pos_y[j];
2997 r_cm_z += mass[j] * pos_z[j];
2999 BigReal inv_m_cm = 1.0/m_cm;
3003 // scale the center of mass with factor
3005 double tx = r_cm_x - origin.x;
3006 double ty = r_cm_y - origin.y;
3007 double tz = r_cm_z - origin.z;
3008 // apply transformation
3009 double new_r_cm_x = factor.xx*tx + factor.xy*ty + factor.xz*tz;
3010 double new_r_cm_y = factor.yx*tx + factor.yy*ty + factor.yz*tz;
3011 double new_r_cm_z = factor.zx*tx + factor.zy*ty + factor.zz*tz;
3013 new_r_cm_x += origin.x;
3014 new_r_cm_y += origin.y;
3015 new_r_cm_z += origin.z;
3016 // translation vector from the old COM to the new COM
3017 double delta_r_cm_x = new_r_cm_x - r_cm_x;
3018 double delta_r_cm_y = new_r_cm_y - r_cm_y;
3019 double delta_r_cm_z = new_r_cm_z - r_cm_z;
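// Every atom in the group receives the same COM displacement, so the
// intra-group geometry (bond lengths and angles) is unchanged by the scaling.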
3020 // shift the hydrogen group with translation vector
3021 for (j = i; j < (i+hgs); j++) {
3022 pos_x[j] += delta_r_cm_x;
3023 pos_y[j] += delta_r_cm_y;
3024 pos_z[j] += delta_r_cm_z;
3030 __global__ void scaleCoordinateNoGroupPressureKernel(
3031 double * __restrict pos_x,
3032 double * __restrict pos_y,
3033 double * __restrict pos_z,
3038 int i = threadIdx.x + blockIdx.x * blockDim.x;
3040 // missing fixed atoms implementation
3042 double tx = pos_x[i] - origin.x;
3043 double ty = pos_y[i] - origin.y;
3044 double tz = pos_z[i] - origin.z;
3045 // apply transformation
3046 double ftx = factor.xx*tx + factor.xy*ty + factor.xz*tz;
3047 double fty = factor.yx*tx + factor.yy*ty + factor.yz*tz;
3048 double ftz = factor.zx*tx + factor.zy*ty + factor.zz*tz;
3050 pos_x[i] = ftx + origin.x;
3051 pos_y[i] = fty + origin.y;
3052 pos_z[i] = ftz + origin.z;
3056 __global__ void SetAtomIndexOrderKernel(
3061 int i = threadIdx.x + blockIdx.x * blockDim.x;
3063 int atomIndex = id[i];
3064 idOrder[atomIndex] = i;
3068 __global__ void scaleCoordinateUsingGCKernel(
3069 double* __restrict pos_x,
3070 double* __restrict pos_y,
3071 double* __restrict pos_z,
3072 const int* __restrict idOrder,
3073 const int* __restrict moleculeStartIndex,
3074 const int* __restrict moleculeAtom,
3075 const cudaTensor factor,
3076 const cudaVector origin,
3077 const Lattice oldLattice,
3078 const Lattice newLattice,
3079 const char3* __restrict transform,
3080 const int numMolecules,
3081 const int numLargeMolecules)
3083 // missing fixed atoms implementation
3084 int startIdx, endIdx, i, j, jmapped, atomIndex;
3085 double3 position, r_gc, new_r_gc, delta_r_gc;
3087 r_gc.x = 0; r_gc.y = 0; r_gc.z = 0;
3088 if (blockIdx.x < numLargeMolecules){ //large molecule case
3090 startIdx = moleculeStartIndex[i];
3091 endIdx = moleculeStartIndex[i + 1];
3092 __shared__ double3 sh_gc;
3093 double inv_length = 1.0/(double)(endIdx - startIdx);
3094 // calculate the geometric center
3095 for (j = startIdx + threadIdx.x; j < endIdx; j += blockDim.x) {
3096 atomIndex = moleculeAtom[j];
3097 jmapped = idOrder[atomIndex];
3098 tr = transform[jmapped];
3099 position.x = pos_x[jmapped];
3100 position.y = pos_y[jmapped];
3101 position.z = pos_z[jmapped];
3102 //Unwrap the coordinate with oldLattice
3103 position = oldLattice.reverse_transform(position ,tr);
3104 r_gc.x += position.x;
3105 r_gc.y += position.y;
3106 r_gc.z += position.z;
3109 typedef cub::BlockReduce<double, 64> BlockReduce;
3110 __shared__ typename BlockReduce::TempStorage temp_storage;
3112 r_gc.x = BlockReduce(temp_storage).Sum(r_gc.x);
3114 r_gc.y = BlockReduce(temp_storage).Sum(r_gc.y);
3116 r_gc.z = BlockReduce(temp_storage).Sum(r_gc.z);
3119 if (threadIdx.x == 0) {
3120 sh_gc.x = r_gc.x * inv_length;
3121 sh_gc.y = r_gc.y * inv_length;
3122 sh_gc.z = r_gc.z * inv_length;
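// sh_gc now holds the geometric center of this large molecule: the
// block-wide sum above divided by the number of atoms in the molecule.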
3126 // scale the geometric center with factor
3128 double tx = sh_gc.x - origin.x;
3129 double ty = sh_gc.y - origin.y;
3130 double tz = sh_gc.z - origin.z;
3131 // apply transformation
3132 new_r_gc.x = factor.xx*tx + factor.xy*ty + factor.xz*tz;
3133 new_r_gc.y = factor.yx*tx + factor.yy*ty + factor.yz*tz;
3134 new_r_gc.z = factor.zx*tx + factor.zy*ty + factor.zz*tz;
3136 new_r_gc.x += origin.x;
3137 new_r_gc.y += origin.y;
3138 new_r_gc.z += origin.z;
3139 // translation vector from old GC to new GC
3140 delta_r_gc.x = new_r_gc.x - sh_gc.x;
3141 delta_r_gc.y = new_r_gc.y - sh_gc.y;
3142 delta_r_gc.z = new_r_gc.z - sh_gc.z;
3144 // shift the atoms in molecule with translation vector
3145 for (j = startIdx + threadIdx.x; j < endIdx; j += blockDim.x) {
3146 atomIndex = moleculeAtom[j];
3147 jmapped = idOrder[atomIndex];
3148 tr = transform[jmapped];
3149 position.x = pos_x[jmapped];
3150 position.y = pos_y[jmapped];
3151 position.z = pos_z[jmapped];
3152 //Unwrap the coordinate with oldLattice
3153 position = oldLattice.reverse_transform(position, tr);
3154 position.x += delta_r_gc.x;
3155 position.y += delta_r_gc.y;
3156 position.z += delta_r_gc.z;
3157 // wrap the coordinate with new lattice parameter
3158 position = newLattice.apply_transform(position, tr);
3159 pos_x[jmapped] = position.x;
3160 pos_y[jmapped] = position.y;
3161 pos_z[jmapped] = position.z;
3163 } else { //Small molecule
3164 // the first numLargeMolecules blocks have been assigned to large molecules
3165 i = numLargeMolecules + threadIdx.x +
3166 (blockIdx.x - numLargeMolecules) * blockDim.x;
3168 if (i < numMolecules) {
3169 startIdx = moleculeStartIndex[i];
3170 endIdx = moleculeStartIndex[i+1];
3171 double inv_length = 1.0/(double)(endIdx - startIdx);
3173 // calculate the geometric center
3174 for ( j = startIdx; j < endIdx; j++ ) {
3175 atomIndex = moleculeAtom[j];
3176 jmapped = idOrder[atomIndex];
3177 tr = transform[jmapped];
3178 position.x = pos_x[jmapped];
3179 position.y = pos_y[jmapped];
3180 position.z = pos_z[jmapped];
3181 //Unwrap the coordinate with oldLattice
3182 position = oldLattice.reverse_transform(position, tr);
3183 r_gc.x += position.x;
3184 r_gc.y += position.y;
3185 r_gc.z += position.z;
3188 r_gc.x *= inv_length;
3189 r_gc.y *= inv_length;
3190 r_gc.z *= inv_length;
3192 // scale the geometric center with factor
3194 double tx = r_gc.x - origin.x;
3195 double ty = r_gc.y - origin.y;
3196 double tz = r_gc.z - origin.z;
3197 // apply transformation
3198 new_r_gc.x = factor.xx*tx + factor.xy*ty + factor.xz*tz;
3199 new_r_gc.y = factor.yx*tx + factor.yy*ty + factor.yz*tz;
3200 new_r_gc.z = factor.zx*tx + factor.zy*ty + factor.zz*tz;
3202 new_r_gc.x += origin.x;
3203 new_r_gc.y += origin.y;
3204 new_r_gc.z += origin.z;
3205 // translation vector from old GC to new GC
3206 delta_r_gc.x = new_r_gc.x - r_gc.x;
3207 delta_r_gc.y = new_r_gc.y - r_gc.y;
3208 delta_r_gc.z = new_r_gc.z - r_gc.z;
3210 // shift the atoms in molecule with translation vector
3211 for (j = startIdx; j < endIdx; j++) {
3212 atomIndex = moleculeAtom[j];
3213 jmapped = idOrder[atomIndex];
3215 tr = transform[jmapped];
3216 position.x = pos_x[jmapped];
3217 position.y = pos_y[jmapped];
3218 position.z = pos_z[jmapped];
3219 // unwrap the coordinate with oldLattice
3220 position = oldLattice.reverse_transform(position, tr);
3221 position.x += delta_r_gc.x;
3222 position.y += delta_r_gc.y;
3223 position.z += delta_r_gc.z;
3224 // wrap the coordinate with new lattice parameter
3225 position = newLattice.apply_transform(position, tr);
3226 pos_x[jmapped] = position.x;
3227 pos_y[jmapped] = position.y;
3228 pos_z[jmapped] = position.z;
3234 template <bool doFixed>
3235 __global__ void langevinPistonUseGroupPressureKernel(
3236 const int* __restrict atomFixed,
3237 const int* __restrict groupFixed,
3238 const Lattice lattice,
3239 const char3* transform,
3240 const double* __restrict fixedPosition_x,
3241 const double* __restrict fixedPosition_y,
3242 const double* __restrict fixedPosition_z,
3243 double * __restrict pos_x,
3244 double * __restrict pos_y,
3245 double * __restrict pos_z,
3246 double * __restrict vel_x,
3247 double * __restrict vel_y,
3248 double * __restrict vel_z,
3250 int *hydrogenGroupSize,
3258 int i = threadIdx.x + blockIdx.x*blockDim.x;
3260 int hgs = hydrogenGroupSize[i];
3264 if (groupFixed[i]) {
3265 for ( j = i; j < (i+hgs); ++j ) {
3266 const double3 pos_origin{fixedPosition_x[j], fixedPosition_y[j], fixedPosition_z[j]};
3267 const auto pos_trans = lattice.apply_transform(pos_origin, transform[j]);
3268 pos_x[j] = pos_trans.x;
3269 pos_y[j] = pos_trans.y;
3270 pos_z[j] = pos_trans.z;
3282 for ( j = i; j < (i+hgs); ++j ) {
3284 if (atomFixed[j]) continue;
3287 r_cm_x += mass[j] * pos_x[j];
3288 r_cm_y += mass[j] * pos_y[j];
3289 r_cm_z += mass[j] * pos_z[j];
3290 v_cm_x += mass[j] * vel_x[j];
3291 v_cm_y += mass[j] * vel_y[j];
3292 v_cm_z += mass[j] * vel_z[j];
3294 BigReal inv_m_cm = 1.0/m_cm;
3299 double tx = r_cm_x - origin.x;
3300 double ty = r_cm_y - origin.y;
3301 double tz = r_cm_z - origin.z;
3302 double new_r_cm_x = factor.xx*tx + factor.xy*ty + factor.xz*tz;
3303 double new_r_cm_y = factor.yx*tx + factor.yy*ty + factor.yz*tz;
3304 double new_r_cm_z = factor.zx*tx + factor.zy*ty + factor.zz*tz;
3305 new_r_cm_x += origin.x;
3306 new_r_cm_y += origin.y;
3307 new_r_cm_z += origin.z;
3309 double delta_r_cm_x = new_r_cm_x - r_cm_x;
3310 double delta_r_cm_y = new_r_cm_y - r_cm_y;
3311 double delta_r_cm_z = new_r_cm_z - r_cm_z;
3315 double delta_v_cm_x = ( velFactor_x - 1 ) * v_cm_x;
3316 double delta_v_cm_y = ( velFactor_y - 1 ) * v_cm_y;
3317 double delta_v_cm_z = ( velFactor_z - 1 ) * v_cm_z;
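// Only the group's COM velocity is rescaled: each atom receives the same
// increment, v_j += (velFactor - 1) * v_cm, so internal velocities relative to
// the COM are left intact.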
3318 for (j = i; j < (i+hgs); j++) {
3321 const double3 pos_origin{fixedPosition_x[j], fixedPosition_y[j], fixedPosition_z[j]};
3322 const auto pos_trans = lattice.apply_transform(pos_origin, transform[j]);
3323 pos_x[j] = pos_trans.x;
3324 pos_y[j] = pos_trans.y;
3325 pos_z[j] = pos_trans.z;
3329 pos_x[j] += delta_r_cm_x;
3330 pos_y[j] += delta_r_cm_y;
3331 pos_z[j] += delta_r_cm_z;
3332 vel_x[j] += delta_v_cm_x;
3333 vel_y[j] += delta_v_cm_y;
3334 vel_z[j] += delta_v_cm_z;
3341 template <bool doFixed>
3342 __global__ void langevinPistonNoGroupPressureKernel(
3343 const int* __restrict atomFixed,
3344 const Lattice lattice,
3345 const char3* transform,
3346 const double* __restrict fixedPosition_x,
3347 const double* __restrict fixedPosition_y,
3348 const double* __restrict fixedPosition_z,
3349 double * __restrict pos_x,
3350 double * __restrict pos_y,
3351 double * __restrict pos_z,
3352 double * __restrict vel_x,
3353 double * __restrict vel_y,
3354 double * __restrict vel_z,
3362 int i = threadIdx.x + blockIdx.x*blockDim.x;
3366 const double3 pos_origin{fixedPosition_x[i], fixedPosition_y[i], fixedPosition_z[i]};
3367 const auto pos_trans = lattice.apply_transform(pos_origin, transform[i]);
3368 pos_x[i] = pos_trans.x;
3369 pos_y[i] = pos_trans.y;
3370 pos_z[i] = pos_trans.z;
3374 double tx = pos_x[i] - origin.x;
3375 double ty = pos_y[i] - origin.y;
3376 double tz = pos_z[i] - origin.z;
3377 double ftx = factor.xx*tx + factor.xy*ty + factor.xz*tz;
3378 double fty = factor.yx*tx + factor.yy*ty + factor.yz*tz;
3379 double ftz = factor.zx*tx + factor.zy*ty + factor.zz*tz;
3380 pos_x[i] = ftx + origin.x;
3381 pos_y[i] = fty + origin.y;
3382 pos_z[i] = ftz + origin.z;
3383 vel_x[i] *= velFactor_x;
3384 vel_y[i] *= velFactor_y;
3385 vel_z[i] *= velFactor_z;
3389 void SequencerCUDAKernel::scaleCoordinateWithFactor(
3394 int *hydrogenGroupSize,
3397 int useGroupPressure,
3399 cudaStream_t stream)
3401 int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
3402 if (useGroupPressure) {
3403 scaleCoordinateUseGroupPressureKernel<<<grid, ATOM_BLOCKS, 0, stream>>>(
3404 pos_x, pos_y, pos_z, mass, hydrogenGroupSize, factor, origin, numAtoms);
3406 scaleCoordinateNoGroupPressureKernel<<<grid, ATOM_BLOCKS, 0, stream>>>(
3407 pos_x, pos_y, pos_z, factor, origin, numAtoms);
3412 void SequencerCUDAKernel::SetAtomIndexOrder(
3416 cudaStream_t stream)
3418 int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
3419 SetAtomIndexOrderKernel<<<grid, ATOM_BLOCKS, 0, stream>>>(
3420 id, idOrder, numAtoms);
3423 void SequencerCUDAKernel::scaleCoordinateUsingGC(
3428 const int *moleculeStartIndex,
3429 const int *moleculeAtom,
3430 const cudaTensor factor,
3431 const cudaVector origin,
3432 const Lattice oldLattice,
3433 const Lattice newLattice,
3434 const char3 *transform,
3435 const int numMolecules,
3436 const int numLargeMolecules,
3437 cudaStream_t stream)
3439 // We want each thread to compute the geometric center of a molecule with
3440 // fewer than 128 atoms, and one thread block to handle each molecule
3441 // with more than 128 atoms.
3442 int numThreadsPerBlock = 64;
3443 int grid = ((numMolecules - numLargeMolecules + numThreadsPerBlock - 1) /
3444 numThreadsPerBlock) + numLargeMolecules;
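// e.g. with 10 large molecules and 1000 small ones this launches
// ceil(1000 / 64) + 10 = 26 blocks: one block per large molecule plus one
// thread per small molecule.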
3445 scaleCoordinateUsingGCKernel<<<grid, numThreadsPerBlock, 0, stream>>>(
3446 pos_x, pos_y, pos_z, idOrder, moleculeStartIndex,
3447 moleculeAtom, factor, origin, oldLattice, newLattice,
3448 transform, numMolecules, numLargeMolecules);
3452 void SequencerCUDAKernel::langevinPiston(
3453 const bool doFixedAtoms,
3454 const int* atomFixed,
3455 const int* groupFixed,
3456 const char3* transform,
3457 const Lattice lattice,
3458 const double* fixedPosition_x,
3459 const double* fixedPosition_y,
3460 const double* fixedPosition_z,
3468 int *hydrogenGroupSize,
3474 int useGroupPressure,
3476 cudaStream_t stream)
3478 int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
3480 double *h_pos_x = (double*)malloc(sizeof(double)*numAtoms);
3481 double *h_pos_y = (double*)malloc(sizeof(double)*numAtoms);
3482 double *h_pos_z = (double*)malloc(sizeof(double)*numAtoms);
3484 double *h_vel_x = (double*)malloc(sizeof(double)*numAtoms);
3485 double *h_vel_y = (double*)malloc(sizeof(double)*numAtoms);
3486 double *h_vel_z = (double*)malloc(sizeof(double)*numAtoms);
3488 copy_DtoH_sync<double>(pos_x, h_pos_x, numAtoms);
3489 copy_DtoH_sync<double>(pos_y, h_pos_y, numAtoms);
3490 copy_DtoH_sync<double>(pos_z, h_pos_z, numAtoms);
3492 copy_DtoH_sync<double>(vel_x, h_vel_x, numAtoms);
3493 copy_DtoH_sync<double>(vel_y, h_vel_y, numAtoms);
3494 copy_DtoH_sync<double>(vel_z, h_vel_z, numAtoms);
3496 fprintf(stderr, "velFactors = %lf %lf %lf\n",
3497 velFactor_x, velFactor_y, velFactor_z);
3498 for(int i = 0; i < numAtoms; i++){
3499 fprintf(stderr, "%lf %lf %lf %lf %lf %lf\n",
3500 h_pos_x[i], h_pos_y[i], h_pos_z[i],
3501 h_vel_x[i], h_vel_y[i], h_vel_z[i]);
3511 if (useGroupPressure) {
3513 langevinPistonUseGroupPressureKernel<true><<<grid, ATOM_BLOCKS, 0, stream>>>(
3514 atomFixed, groupFixed, lattice, transform,
3515 fixedPosition_x, fixedPosition_y, fixedPosition_z,
3516 pos_x, pos_y, pos_z, vel_x, vel_y, vel_z, mass, hydrogenGroupSize,
3517 factor, origin, velFactor_x, velFactor_y, velFactor_z, numAtoms);
3519 langevinPistonUseGroupPressureKernel<false><<<grid, ATOM_BLOCKS, 0, stream>>>(
3520 atomFixed, groupFixed, lattice, transform,
3521 fixedPosition_x, fixedPosition_y, fixedPosition_z,
3522 pos_x, pos_y, pos_z, vel_x, vel_y, vel_z, mass, hydrogenGroupSize,
3523 factor, origin, velFactor_x, velFactor_y, velFactor_z, numAtoms);
3527 langevinPistonNoGroupPressureKernel<true><<<grid, ATOM_BLOCKS, 0, stream>>>(
3528 atomFixed, lattice, transform,
3529 fixedPosition_x, fixedPosition_y, fixedPosition_z,
3530 pos_x, pos_y, pos_z, vel_x, vel_y, vel_z,
3531 factor, origin, velFactor_x, velFactor_y, velFactor_z, numAtoms);
3533 langevinPistonNoGroupPressureKernel<false><<<grid, ATOM_BLOCKS, 0, stream>>>(
3534 atomFixed, lattice, transform,
3535 fixedPosition_x, fixedPosition_y, fixedPosition_z,
3536 pos_x, pos_y, pos_z, vel_x, vel_y, vel_z,
3537 factor, origin, velFactor_x, velFactor_y, velFactor_z, numAtoms);
3540 //cudaCheck(cudaGetLastError());
3542 h_pos_x = (double*)malloc(sizeof(double)*numAtoms);
3543 h_pos_y = (double*)malloc(sizeof(double)*numAtoms);
3544 h_pos_z = (double*)malloc(sizeof(double)*numAtoms);
3546 h_vel_x = (double*)malloc(sizeof(double)*numAtoms);
3547 h_vel_y = (double*)malloc(sizeof(double)*numAtoms);
3548 h_vel_z = (double*)malloc(sizeof(double)*numAtoms);
3550 copy_DtoH_sync<double>(pos_x, h_pos_x, numAtoms);
3551 copy_DtoH_sync<double>(pos_y, h_pos_y, numAtoms);
3552 copy_DtoH_sync<double>(pos_z, h_pos_z, numAtoms);
3554 copy_DtoH_sync<double>(vel_x, h_vel_x, numAtoms);
3555 copy_DtoH_sync<double>(vel_y, h_vel_y, numAtoms);
3556 copy_DtoH_sync<double>(vel_z, h_vel_z, numAtoms);
3558 for(int i = 0; i < numAtoms; i++){
3559 fprintf(stderr, "%lf %lf %lf %lf %lf %lf\n",
3560 h_pos_x[i], h_pos_y[i], h_pos_z[i],
3561 h_vel_x[i], h_vel_y[i], h_vel_z[i]);
3566 template <int BLOCK_SIZE>
3567 __global__ void submitReduction1Kernel(
3568 double * __restrict pos_x,
3569 double * __restrict pos_y,
3570 double * __restrict pos_z,
3571 double * __restrict vel_x,
3572 double * __restrict vel_y,
3573 double * __restrict vel_z,
3574 float * __restrict mass,
3575 // TODO: wrap these scalars in a struct
3576 BigReal *kineticEnergy,
3577 BigReal *momentum_x,
3578 BigReal *momentum_y,
3579 BigReal *momentum_z,
3580 BigReal *angularMomentum_x,
3581 BigReal *angularMomentum_y,
3582 BigReal *angularMomentum_z,
3586 BigReal *h_kineticEnergy,
3587 BigReal *h_momentum_x,
3588 BigReal *h_momentum_y,
3589 BigReal *h_momentum_z,
3590 BigReal *h_angularMomentum_x,
3591 BigReal *h_angularMomentum_y,
3592 BigReal *h_angularMomentum_z,
3593 unsigned int* tbcatomic,
3596 BigReal m_x = 0, m_y = 0, m_z = 0,
3597 a_x = 0, a_y = 0, a_z = 0;
3598 int i = threadIdx.x + blockIdx.x*blockDim.x;
3599 int totaltb = gridDim.x;
3600 __shared__ bool isLastBlockDone;
3602 if(threadIdx.x == 0){
3603 isLastBlockDone = 0;
3608 // scalar kineticEnergy += mass[i] * dot_product(vel[i], vel[i])
3610 // (vel_x[i]*vel_x[i] + vel_y[i]*vel_y[i] + vel_z[i]*vel_z[i]);
3612 // vector momentum += mass[i] * vel[i]
3613 m_x += mass[i] * vel_x[i];
3614 m_y += mass[i] * vel_y[i];
3615 m_z += mass[i] * vel_z[i];
3617 // vector dpos = pos[i] - origin
3618 BigReal dpos_x = pos_x[i] - origin_x;
3619 BigReal dpos_y = pos_y[i] - origin_y;
3620 BigReal dpos_z = pos_z[i] - origin_z;
3622 // vector angularMomentum += mass[i] * cross_product(dpos, vel[i])
3623 a_x += mass[i] * (dpos_y*vel_z[i] - dpos_z*vel_y[i]);
3624 a_y += mass[i] * (dpos_z*vel_x[i] - dpos_x*vel_z[i]);
3625 a_z += mass[i] * (dpos_x*vel_y[i] - dpos_y*vel_x[i]);
3627 typedef cub::BlockReduce<BigReal, BLOCK_SIZE> BlockReduce;
3628 __shared__ typename BlockReduce::TempStorage temp_storage;
3629 // k = BlockReduce(temp_storage).Sum(k);
3632 m_x = BlockReduce(temp_storage).Sum(m_x);
3634 m_y = BlockReduce(temp_storage).Sum(m_y);
3636 m_z = BlockReduce(temp_storage).Sum(m_z);
3638 a_x = BlockReduce(temp_storage).Sum(a_x);
3640 a_y = BlockReduce(temp_storage).Sum(a_y);
3642 a_z = BlockReduce(temp_storage).Sum(a_z);
3645 if (threadIdx.x == 0) {
3646 const int bin = blockIdx.x % ATOMIC_BINS;
3648 // atomicAdd(&kineticEnergy[bin], k);
3649 atomicAdd(&momentum_x[bin], m_x);
3650 atomicAdd(&momentum_y[bin], m_y);
3651 atomicAdd(&momentum_z[bin], m_z);
3652 atomicAdd(&angularMomentum_x[bin], a_x);
3653 atomicAdd(&angularMomentum_y[bin], a_y);
3654 atomicAdd(&angularMomentum_z[bin], a_z);
3656 unsigned int value = atomicInc(&tbcatomic[0], totaltb);
3657 isLastBlockDone = (value == (totaltb-1));
3663 if(isLastBlockDone){
3664 if(threadIdx.x < ATOMIC_BINS){
3665 const int bin = threadIdx.x;
3667 double m_x = momentum_x[bin];
3668 double m_y = momentum_y[bin];
3669 double m_z = momentum_z[bin];
3670 double a_x = angularMomentum_x[bin];
3671 double a_y = angularMomentum_y[bin];
3672 double a_z = angularMomentum_z[bin];
3674 // sets device scalars back to zero
3675 kineticEnergy[0] = 0.0;
3676 momentum_x[bin] = 0.0;
3677 momentum_y[bin] = 0.0;
3678 momentum_z[bin] = 0.0;
3679 angularMomentum_x[bin] = 0.0;
3680 angularMomentum_y[bin] = 0.0;
3681 angularMomentum_z[bin] = 0.0;
3683 if(ATOMIC_BINS > 1){
3684 typedef cub::WarpReduce<double, (ATOMIC_BINS > 1 ? ATOMIC_BINS : 2)> WarpReduce;
3685 __shared__ typename WarpReduce::TempStorage tempStorage;
3687 m_x = WarpReduce(tempStorage).Sum(m_x);
3688 m_y = WarpReduce(tempStorage).Sum(m_y);
3689 m_z = WarpReduce(tempStorage).Sum(m_z);
3690 a_x = WarpReduce(tempStorage).Sum(a_x);
3691 a_y = WarpReduce(tempStorage).Sum(a_y);
3692 a_z = WarpReduce(tempStorage).Sum(a_z);
3695 if(threadIdx.x == 0){
3696 h_momentum_x[0] = m_x;
3697 h_momentum_y[0] = m_y;
3698 h_momentum_z[0] = m_z;
3699 h_angularMomentum_x[0] = a_x;
3700 h_angularMomentum_y[0] = a_y;
3701 h_angularMomentum_z[0] = a_z;
3703 //resets atomic counter
3704 reset_atomic_counter(&tbcatomic[0]);
3712 // JM: does addForceToMomentum, maximumMove, and addVelocityToPosition
3713 template <bool DO_VEL_RESCALING, bool DO_FIXED>
3714 __global__ void velocityVerlet1Kernel(
3716 const double scaling,
3717 const double dt_normal,
3718 const double dt_nbond,
3719 const double dt_slow,
3720 const double velrescaling, // for stochastic velocity rescaling
3721 const double* __restrict recipMass,
3722 double* __restrict vel_x,
3723 double* __restrict vel_y,
3724 double* __restrict vel_z,
3725 const double maxvel2,
3727 double* __restrict pos_x,
3728 double* __restrict pos_y,
3729 double* __restrict pos_z,
3733 double* __restrict f_normal_x,
3734 double* __restrict f_normal_y,
3735 double* __restrict f_normal_z,
3736 double* __restrict f_nbond_x,
3737 double* __restrict f_nbond_y,
3738 double* __restrict f_nbond_z,
3739 double* __restrict f_slow_x,
3740 double* __restrict f_slow_y,
3741 double* __restrict f_slow_z,
3742 const int* atomFixed,
3744 const int maxForceNumber
3746 // fusion of addForceToMomentum, maximumMove, addVelocityToPosition
3747 double dt, dt_b, dt_nb, dt_s;
3748 int i = threadIdx.x + blockIdx.x*blockDim.x;
3752 double velx = vel_x[i];
3753 double vely = vel_y[i];
3754 double velz = vel_z[i];
3755 if (DO_VEL_RESCALING) {
3756 velx *= velrescaling;
3757 vely *= velrescaling;
3758 velz *= velrescaling;
3760 // JM NOTE: these need to be patch-centered
3761 double posx = pos_x[i];
3762 double posy = pos_y[i];
3763 double posz = pos_z[i];
3764 double rmass = recipMass[i];
3765 /* addForceToMomentum*/
3766 // keep velocities in registers so I can access them faster when calculating positions!
3767 switch(maxForceNumber){
3769 dt_s = dt_slow * scaling;
3770 velx += f_slow_x[i] * rmass * dt_s;
3771 vely += f_slow_y[i] * rmass * dt_s;
3772 velz += f_slow_z[i] * rmass * dt_s;
3773 // f_slow_x[i] = 0.0;
3774 // f_slow_y[i] = 0.0;
3775 // f_slow_z[i] = 0.0;
3777 dt_nb = dt_nbond * scaling;
3778 velx += f_nbond_x[i] * rmass * dt_nb;
3779 vely += f_nbond_y[i] * rmass * dt_nb;
3780 velz += f_nbond_z[i] * rmass * dt_nb;
3781 // f_nbond_x[i] = 0.0;
3782 // f_nbond_y[i] = 0.0;
3783 // f_nbond_z[i] = 0.0;
3785 dt_b = dt_normal * scaling;
3786 velx += f_normal_x[i] * rmass * dt_b;
3787 vely += f_normal_y[i] * rmass * dt_b;
3788 velz += f_normal_z[i] * rmass * dt_b;
3789 // f_normal_x[i] = 0.0;
3790 // f_normal_y[i] = 0.0;
3791 // f_normal_z[i] = 0.0;
3794 // --------------------------------------------------------
3796 // -- MaximumMove --
3797 double vel2 = velx * velx + vely * vely + velz * velz;
3798 if(vel2 > maxvel2) atomicAdd(h_killme, 1); // stops dynamics if true, perf does not matter
3799 // --------------------------------------------------------
3801 // -- AddVelocityToPosition --
3802 // ---------------------------------------------------------
3814 // Haochuan: for fixed atoms, we need to clear their velocities (see HomePatch::addForceToMomentum)
3815 if (!atomFixed[i]) {
3835 void SequencerCUDAKernel::velocityVerlet1(
3836 const bool doFixedAtoms,
3838 const double scaling,
3839 const double dt_normal,
3840 const double dt_nbond,
3841 const double dt_slow,
3842 const double velrescaling, // for stochastic velocity rescaling
3843 const double *recipMass,
3864 const int* atomFixed,
3866 const int maxForceNumber,
3867 cudaStream_t stream)
3869 int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
3870 const bool doVelRescaling = (velrescaling != 1.0);
3871 #define CALL(DO_VEL_RESCALING, DO_FIXED) \
3872 velocityVerlet1Kernel<DO_VEL_RESCALING, DO_FIXED> \
3873 <<<grid, ATOM_BLOCKS, 0, stream>>>( \
3874 step, scaling, dt_normal, dt_nbond, dt_slow, velrescaling, recipMass, \
3875 vel_x, vel_y, vel_z, maxvel2, h_killme, \
3876 pos_x, pos_y, pos_z, h_pos_x, h_pos_y, h_pos_z, \
3877 f_normal_x, f_normal_y, f_normal_z, \
3878 f_nbond_x, f_nbond_y, f_nbond_z, f_slow_x, f_slow_y, f_slow_z, \
3879 atomFixed, numAtoms, maxForceNumber \
3881 if ( doVelRescaling && doFixedAtoms) CALL(true, true);
3882 else if ( doVelRescaling && !doFixedAtoms) CALL(true, false);
3883 else if (!doVelRescaling && doFixedAtoms) CALL(false, true);
3884 else if (!doVelRescaling && !doFixedAtoms) CALL(false, false);
3886 NAMD_bug("No kernel was called in SequencerCUDAKernel::velocityVerlet1!\n");
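// Dispatch note: the CALL macro above expands to one of four velocityVerlet1Kernel
// instantiations, <DO_VEL_RESCALING, DO_FIXED> = (doVelRescaling, doFixedAtoms).
// Folding the flags into template parameters lets the compiler drop the rescaling
// multiply and the fixed-atom branch from the per-atom loop when they are not needed.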
3891 void SequencerCUDAKernel::submitReduction1(
3899 BigReal *kineticEnergy,
3900 BigReal *momentum_x,
3901 BigReal *momentum_y,
3902 BigReal *momentum_z,
3903 BigReal *angularMomentum_x,
3904 BigReal *angularMomentum_y,
3905 BigReal *angularMomentum_z,
3909 BigReal *h_kineticEnergy,
3910 BigReal *h_momentum_x,
3911 BigReal *h_momentum_y,
3912 BigReal *h_momentum_z,
3913 BigReal *h_angularMomentum_x,
3914 BigReal *h_angularMomentum_y,
3915 BigReal *h_angularMomentum_z,
3916 unsigned int* tbcatomic,
3918 cudaStream_t stream)
3920 int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
3921 submitReduction1Kernel<ATOM_BLOCKS><<<grid, ATOM_BLOCKS, 0, stream>>>(
3922 pos_x, pos_y, pos_z, vel_x, vel_y, vel_z, mass,
3923 kineticEnergy, momentum_x, momentum_y, momentum_z,
3924 angularMomentum_x, angularMomentum_y, angularMomentum_z,
3925 origin_x, origin_y, origin_z, h_kineticEnergy, h_momentum_x, h_momentum_y,
3926 h_momentum_z, h_angularMomentum_x, h_angularMomentum_y, h_angularMomentum_z,
3927 tbcatomic, numAtoms);
3928 //cudaCheck(cudaGetLastError());
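// Reduction pattern shared by these kernels (see submitReduction2Kernel below):
// each thread accumulates locally, cub::BlockReduce collapses the block, thread 0
// atomicAdd's the block result into one of ATOMIC_BINS device bins, and the last
// block to finish (detected via atomicInc on tbcatomic) drains the bins and writes
// the host-mapped h_* outputs.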
3931 template <int BLOCK_SIZE, bool DO_FIXED, bool DO_MTS>
3932 __global__ void submitReduction2Kernel(
3933 const int* __restrict atomFixed,
3934 const double * __restrict pos_x,
3935 const double * __restrict pos_y,
3936 const double * __restrict pos_z,
3937 const double * __restrict vel_x,
3938 const double * __restrict vel_y,
3939 const double * __restrict vel_z,
3940 const double * __restrict rcm_x,
3941 const double * __restrict rcm_y,
3942 const double * __restrict rcm_z,
3943 const double * __restrict vcm_x,
3944 const double * __restrict vcm_y,
3945 const double * __restrict vcm_z,
3946 const double * __restrict f_normal_x,
3947 const double * __restrict f_normal_y,
3948 const double * __restrict f_normal_z,
3949 const double * __restrict f_nbond_x,
3950 const double * __restrict f_nbond_y,
3951 const double * __restrict f_nbond_z,
3952 const double * __restrict f_slow_x,
3953 const double * __restrict f_slow_y,
3954 const double * __restrict f_slow_z,
3955 const float * __restrict mass,
3956 const int * __restrict hydrogenGroupSize,
3957 BigReal *kineticEnergy,
3958 BigReal *h_kineticEnergy,
3959 BigReal *intKineticEnergy,
3960 BigReal *h_intKineticEnergy,
3961 cudaTensor *intVirialNormal,
3962 cudaTensor *intVirialNbond,
3963 cudaTensor *intVirialSlow,
3964 cudaTensor *h_intVirialNormal,
3965 cudaTensor *h_intVirialNbond,
3966 cudaTensor *h_intVirialSlow,
3967 cudaTensor* rigidVirial,
3968 cudaTensor* h_rigidVirial,
3969 unsigned int *tbcatomic,
3971 const int maxForceNumber)
3975 cudaTensor intVNormal, intVNbond, intVSlow;
3976 zero_cudaTensor(intVNormal);
3978 zero_cudaTensor(intVNbond);
3979 zero_cudaTensor(intVSlow);
3981 int i = threadIdx.x + blockIdx.x*blockDim.x;
3982 __shared__ bool isLastBlockDone;
3983 int totaltb = gridDim.x;
3985 if(threadIdx.x == 0){
3986 isLastBlockDone = false;
3992 int hgs = hydrogenGroupSize[i];
3993 // hydrogenGroupSize[i] gives the size of the hydrogen group whose parent atom is i; the loop below walks that whole group
4003 for ( j = i; j < (i+hgs); ++j ) {
4004 r_mass[j -i ] = mass[j];
4005 r_pos[j - i].x = pos_x[j];
4006 r_pos[j - i].y = pos_y[j];
4007 r_pos[j - i].z = pos_z[j];
4008 r_vel[j - i].x = vel_x[j];
4009 r_vel[j - i].y = vel_y[j];
4010 r_vel[j - i].z = vel_z[j];
4012 // r_cm_x += mass[j] * pos_x[j];
4013 // r_cm_y += mass[j] * pos_y[j];
4014 // r_cm_z += mass[j] * pos_z[j];
4015 // v_cm_x += mass[j] * vel_x[j];
4016 // v_cm_y += mass[j] * vel_y[j];
4017 // v_cm_z += mass[j] * vel_z[j];
4018 m_cm += r_mass[j - i];
4019 r_cm_x += r_mass[j - i] * r_pos[j-i].x;
4020 r_cm_y += r_mass[j - i] * r_pos[j-i].y;
4021 r_cm_z += r_mass[j - i] * r_pos[j-i].z;
4022 v_cm_x += r_mass[j - i] * r_vel[j-i].x;
4023 v_cm_y += r_mass[j - i] * r_vel[j-i].y;
4024 v_cm_z += r_mass[j - i] * r_vel[j-i].z;
4026 BigReal inv_m_cm = 1.0/m_cm;
4034 // XXX removed pairInteraction
4035 for ( j = i; j < (i+hgs); ++j ) {
4036 // XXX removed fixed atoms
4038 // vector vel[j] used twice below
4039 BigReal v_x = r_vel[j-i].x;
4040 BigReal v_y = r_vel[j-i].y;
4041 BigReal v_z = r_vel[j-i].z;
4043 // vector dv = vel[j] - v_cm
4044 BigReal dv_x = v_x - v_cm_x;
4045 BigReal dv_y = v_y - v_cm_y;
4046 BigReal dv_z = v_z - v_cm_z;
4048 // scalar intKineticEnergy += mass[j] * dot_product(v, dv)
4050 // (v_x * dv_x + v_y * dv_y + v_z * dv_z);
4051 intK += r_mass[j-i] *
4052 (v_x * dv_x + v_y * dv_y + v_z * dv_z);
4054 // vector dr = pos[j] - r_cm
4055 // BigReal dr_x = pos_x[j] - r_cm_x;
4056 // BigReal dr_y = pos_y[j] - r_cm_y;
4057 // BigReal dr_z = pos_z[j] - r_cm_z;
4059 BigReal dr_x = r_pos[j -i].x - r_cm_x;
4060 BigReal dr_y = r_pos[j -i].y - r_cm_y;
4061 BigReal dr_z = r_pos[j -i].z - r_cm_z;
4063 // tensor intVirialNormal += outer_product(f_normal[j], dr)
4065 // the dominant cost here is the strided global memory access pattern;
4066 // fixing that is the main remaining optimization for this function
4067 intVNormal.xx += f_normal_x[j] * dr_x;
4068 intVNormal.xy += f_normal_x[j] * dr_y;
4069 intVNormal.xz += f_normal_x[j] * dr_z;
4070 intVNormal.yx += f_normal_y[j] * dr_x;
4071 intVNormal.yy += f_normal_y[j] * dr_y;
4072 intVNormal.yz += f_normal_y[j] * dr_z;
4073 intVNormal.zx += f_normal_z[j] * dr_x;
4074 intVNormal.zy += f_normal_z[j] * dr_y;
4075 intVNormal.zz += f_normal_z[j] * dr_z;
4077 if (maxForceNumber >= 1) {
4078 // tensor intVirialNbond += outer_product(f_nbond[j], dr)
4079 intVNbond.xx += f_nbond_x[j] * dr_x;
4080 intVNbond.xy += f_nbond_x[j] * dr_y;
4081 intVNbond.xz += f_nbond_x[j] * dr_z;
4082 intVNbond.yx += f_nbond_y[j] * dr_x;
4083 intVNbond.yy += f_nbond_y[j] * dr_y;
4084 intVNbond.yz += f_nbond_y[j] * dr_z;
4085 intVNbond.zx += f_nbond_z[j] * dr_x;
4086 intVNbond.zy += f_nbond_z[j] * dr_y;
4087 intVNbond.zz += f_nbond_z[j] * dr_z;
4090 if (maxForceNumber >= 2) {
4091 // tensor intVirialSlow += outer_product(f_slow[j], dr)
4092 intVSlow.xx += f_slow_x[j] * dr_x;
4093 intVSlow.xy += f_slow_x[j] * dr_y;
4094 intVSlow.xz += f_slow_x[j] * dr_z;
4095 intVSlow.yx += f_slow_y[j] * dr_x;
4096 intVSlow.yy += f_slow_y[j] * dr_y;
4097 intVSlow.yz += f_slow_y[j] * dr_z;
4098 intVSlow.zx += f_slow_z[j] * dr_x;
4099 intVSlow.zy += f_slow_z[j] * dr_y;
4100 intVSlow.zz += f_slow_z[j] * dr_z;
4105 // Haochuan: I need to skip the fixed atoms
4106 if (!DO_FIXED || (DO_FIXED && !atomFixed[i])) {
4107 BigReal r_cm_x = rcm_x[i];
4108 BigReal r_cm_y = rcm_y[i];
4109 BigReal r_cm_z = rcm_z[i];
4110 BigReal v_cm_x = vcm_x[i];
4111 BigReal v_cm_y = vcm_y[i];
4112 BigReal v_cm_z = vcm_z[i];
4114 BigReal v_x = vel_x[i];
4115 BigReal v_y = vel_y[i];
4116 BigReal v_z = vel_z[i];
4118 // vector dv = vel[j] - v_cm
4119 BigReal dv_x = v_x - v_cm_x;
4120 BigReal dv_y = v_y - v_cm_y;
4121 BigReal dv_z = v_z - v_cm_z;
4123 // scalar intKineticEnergy += mass[j] * dot_product(v, dv)
4125 // (v_x * dv_x + v_y * dv_y + v_z * dv_z);
4127 (v_x * v_x + v_y*v_y + v_z*v_z);
4129 (v_x * dv_x + v_y * dv_y + v_z * dv_z);
4131 // vector dr = pos[j] - r_cm
4132 // BigReal dr_x = pos_x[j] - r_cm_x;
4133 // BigReal dr_y = pos_y[j] - r_cm_y;
4134 // BigReal dr_z = pos_z[j] - r_cm_z;
4136 BigReal dr_x = pos_x[i] - r_cm_x;
4137 BigReal dr_y = pos_y[i] - r_cm_y;
4138 BigReal dr_z = pos_z[i] - r_cm_z;
4140 // tensor intVirialNormal += outer_product(f_normal[j], dr)
4142 // the dominant cost here is the strided global memory access pattern;
4143 // fixing that is the main remaining optimization for this function
4144 intVNormal.xx += f_normal_x[i] * dr_x;
4145 intVNormal.xy += f_normal_x[i] * dr_y;
4146 intVNormal.xz += f_normal_x[i] * dr_z;
4147 intVNormal.yx += f_normal_y[i] * dr_x;
4148 intVNormal.yy += f_normal_y[i] * dr_y;
4149 intVNormal.yz += f_normal_y[i] * dr_z;
4150 intVNormal.zx += f_normal_z[i] * dr_x;
4151 intVNormal.zy += f_normal_z[i] * dr_y;
4152 intVNormal.zz += f_normal_z[i] * dr_z;
4154 if (maxForceNumber >= 1) {
4155 // tensor intVirialNbond += outer_product(f_nbond[j], dr)
4156 // Haochuan: For a MTS simulation we have to calculate intVNbond separately
4158 intVNbond.xx += f_nbond_x[i] * dr_x;
4159 intVNbond.xy += f_nbond_x[i] * dr_y;
4160 intVNbond.xz += f_nbond_x[i] * dr_z;
4161 intVNbond.yx += f_nbond_y[i] * dr_x;
4162 intVNbond.yy += f_nbond_y[i] * dr_y;
4163 intVNbond.yz += f_nbond_y[i] * dr_z;
4164 intVNbond.zx += f_nbond_z[i] * dr_x;
4165 intVNbond.zy += f_nbond_z[i] * dr_y;
4166 intVNbond.zz += f_nbond_z[i] * dr_z;
4168 // If this is not a MTS simulation, then we can sum the internal virials into a single one (intVNormal)
4169 intVNormal.xx += f_nbond_x[i] * dr_x;
4170 intVNormal.xy += f_nbond_x[i] * dr_y;
4171 intVNormal.xz += f_nbond_x[i] * dr_z;
4172 intVNormal.yx += f_nbond_y[i] * dr_x;
4173 intVNormal.yy += f_nbond_y[i] * dr_y;
4174 intVNormal.yz += f_nbond_y[i] * dr_z;
4175 intVNormal.zx += f_nbond_z[i] * dr_x;
4176 intVNormal.zy += f_nbond_z[i] * dr_y;
4177 intVNormal.zz += f_nbond_z[i] * dr_z;
4181 if (maxForceNumber >= 2) {
4182 // tensor intVirialSlow += outer_product(f_slow[j], dr)
4184 intVSlow.xx += f_slow_x[i] * dr_x;
4185 intVSlow.xy += f_slow_x[i] * dr_y;
4186 intVSlow.xz += f_slow_x[i] * dr_z;
4187 intVSlow.yx += f_slow_y[i] * dr_x;
4188 intVSlow.yy += f_slow_y[i] * dr_y;
4189 intVSlow.yz += f_slow_y[i] * dr_z;
4190 intVSlow.zx += f_slow_z[i] * dr_x;
4191 intVSlow.zy += f_slow_z[i] * dr_y;
4192 intVSlow.zz += f_slow_z[i] * dr_z;
4194 intVNormal.xx += f_slow_x[i] * dr_x;
4195 intVNormal.xy += f_slow_x[i] * dr_y;
4196 intVNormal.xz += f_slow_x[i] * dr_z;
4197 intVNormal.yx += f_slow_y[i] * dr_x;
4198 intVNormal.yy += f_slow_y[i] * dr_y;
4199 intVNormal.yz += f_slow_y[i] * dr_z;
4200 intVNormal.zx += f_slow_z[i] * dr_x;
4201 intVNormal.zy += f_slow_z[i] * dr_y;
4202 intVNormal.zz += f_slow_z[i] * dr_z;
4209 typedef cub::BlockReduce<BigReal, BLOCK_SIZE> BlockReduce;
4210 __shared__ typename BlockReduce::TempStorage temp_storage;
4211 // XXX TODO: handwriting these reductions could avoid much of the
4212 // overhead of the repeated CUB BlockReduce calls
4214 K = BlockReduce(temp_storage).Sum(K);
4216 intK = BlockReduce(temp_storage).Sum(intK);
4218 intVNormal.xx = BlockReduce(temp_storage).Sum(intVNormal.xx);
4220 intVNormal.xy = BlockReduce(temp_storage).Sum(intVNormal.xy);
4222 intVNormal.xz = BlockReduce(temp_storage).Sum(intVNormal.xz);
4224 intVNormal.yx = BlockReduce(temp_storage).Sum(intVNormal.yx);
4226 intVNormal.yy = BlockReduce(temp_storage).Sum(intVNormal.yy);
4228 intVNormal.yz = BlockReduce(temp_storage).Sum(intVNormal.yz);
4230 intVNormal.zx = BlockReduce(temp_storage).Sum(intVNormal.zx);
4232 intVNormal.zy = BlockReduce(temp_storage).Sum(intVNormal.zy);
4234 intVNormal.zz = BlockReduce(temp_storage).Sum(intVNormal.zz);
4237 if (maxForceNumber >= 1) {
4238 intVNbond.xx = BlockReduce(temp_storage).Sum(intVNbond.xx);
4240 intVNbond.xy = BlockReduce(temp_storage).Sum(intVNbond.xy);
4242 intVNbond.xz = BlockReduce(temp_storage).Sum(intVNbond.xz);
4244 intVNbond.yx = BlockReduce(temp_storage).Sum(intVNbond.yx);
4246 intVNbond.yy = BlockReduce(temp_storage).Sum(intVNbond.yy);
4248 intVNbond.yz = BlockReduce(temp_storage).Sum(intVNbond.yz);
4250 intVNbond.zx = BlockReduce(temp_storage).Sum(intVNbond.zx);
4252 intVNbond.zy = BlockReduce(temp_storage).Sum(intVNbond.zy);
4254 intVNbond.zz = BlockReduce(temp_storage).Sum(intVNbond.zz);
4257 if (maxForceNumber >= 2) {
4258 intVSlow.xx = BlockReduce(temp_storage).Sum(intVSlow.xx);
4260 intVSlow.xy = BlockReduce(temp_storage).Sum(intVSlow.xy);
4262 intVSlow.xz = BlockReduce(temp_storage).Sum(intVSlow.xz);
4264 intVSlow.yx = BlockReduce(temp_storage).Sum(intVSlow.yx);
4266 intVSlow.yy = BlockReduce(temp_storage).Sum(intVSlow.yy);
4268 intVSlow.yz = BlockReduce(temp_storage).Sum(intVSlow.yz);
4270 intVSlow.zx = BlockReduce(temp_storage).Sum(intVSlow.zx);
4272 intVSlow.zy = BlockReduce(temp_storage).Sum(intVSlow.zy);
4274 intVSlow.zz = BlockReduce(temp_storage).Sum(intVSlow.zz);
4280 if (threadIdx.x == 0) {
4281 const int bin = blockIdx.x % ATOMIC_BINS;
4283 atomicAdd(&kineticEnergy[bin], K);
4284 atomicAdd(&intKineticEnergy[bin], intK);
4285 atomicAdd(&intVirialNormal[bin].xx, intVNormal.xx);
4286 atomicAdd(&intVirialNormal[bin].xy, intVNormal.xy);
4287 atomicAdd(&intVirialNormal[bin].xz, intVNormal.xz);
4288 atomicAdd(&intVirialNormal[bin].yx, intVNormal.yx);
4289 atomicAdd(&intVirialNormal[bin].yy, intVNormal.yy);
4290 atomicAdd(&intVirialNormal[bin].yz, intVNormal.yz);
4291 atomicAdd(&intVirialNormal[bin].zx, intVNormal.zx);
4292 atomicAdd(&intVirialNormal[bin].zy, intVNormal.zy);
4293 atomicAdd(&intVirialNormal[bin].zz, intVNormal.zz);
4295 if (maxForceNumber >= 1) {
4296 atomicAdd(&intVirialNbond[bin].xx, intVNbond.xx);
4297 atomicAdd(&intVirialNbond[bin].xy, intVNbond.xy);
4298 atomicAdd(&intVirialNbond[bin].xz, intVNbond.xz);
4299 atomicAdd(&intVirialNbond[bin].yx, intVNbond.yx);
4300 atomicAdd(&intVirialNbond[bin].yy, intVNbond.yy);
4301 atomicAdd(&intVirialNbond[bin].yz, intVNbond.yz);
4302 atomicAdd(&intVirialNbond[bin].zx, intVNbond.zx);
4303 atomicAdd(&intVirialNbond[bin].zy, intVNbond.zy);
4304 atomicAdd(&intVirialNbond[bin].zz, intVNbond.zz);
4306 if (maxForceNumber >= 2) {
4307 atomicAdd(&intVirialSlow[bin].xx, intVSlow.xx);
4308 atomicAdd(&intVirialSlow[bin].xy, intVSlow.xy);
4309 atomicAdd(&intVirialSlow[bin].xz, intVSlow.xz);
4310 atomicAdd(&intVirialSlow[bin].yx, intVSlow.yx);
4311 atomicAdd(&intVirialSlow[bin].yy, intVSlow.yy);
4312 atomicAdd(&intVirialSlow[bin].yz, intVSlow.yz);
4313 atomicAdd(&intVirialSlow[bin].zx, intVSlow.zx);
4314 atomicAdd(&intVirialSlow[bin].zy, intVSlow.zy);
4315 atomicAdd(&intVirialSlow[bin].zz, intVSlow.zz);
4319 unsigned int value = atomicInc(&tbcatomic[3], totaltb);
4320 isLastBlockDone = (value == (totaltb -1 ));
4323 // This finalization computes the internal pressure terms; the host-mapped update
4324 // below has been flagged as a bottleneck, though that is worth re-profiling
4325 // before optimizing it
4326 if(isLastBlockDone){
4327 if(threadIdx.x < ATOMIC_BINS){
4328 const int bin = threadIdx.x;
4330 double k = kineticEnergy[bin];
4331 double intK = intKineticEnergy[bin];
4332 cudaTensor intVNormal = intVirialNormal[bin];
4334 if (maxForceNumber >= 1) {
4335 intVNbond = intVirialNbond[bin];
4337 if (maxForceNumber >= 2) {
4338 intVSlow = intVirialSlow[bin];
4341 cudaTensor rigidV = rigidVirial[bin];
4343 // sets device scalars back to zero
4344 kineticEnergy[bin] = 0.0;
4345 intKineticEnergy[bin] = 0.0;
4346 zero_cudaTensor(intVirialNormal[bin]);
4347 zero_cudaTensor(intVirialNbond[bin]);
4348 zero_cudaTensor(intVirialSlow[bin]);
4349 zero_cudaTensor(rigidVirial[bin]);
4351 if(ATOMIC_BINS > 1){
4352 typedef cub::WarpReduce<double, (ATOMIC_BINS > 1 ? ATOMIC_BINS : 2)> WarpReduce;
4353 typedef cub::WarpReduce<cudaTensor, (ATOMIC_BINS > 1 ? ATOMIC_BINS : 2)> WarpReduceT;
4354 __shared__ typename WarpReduce::TempStorage tempStorage;
4355 __shared__ typename WarpReduceT::TempStorage tempStorageT;
4357 // Per https://nvidia.github.io/cccl/cub/developer_overview.html#temporary-storage-usage,
4358 // the shared temp storage must not be reused until all lanes have finished, hence the warp sync after each reduction.
4359 k = WarpReduce(tempStorage).Sum(k); NAMD_WARP_SYNC(WARP_FULL_MASK);
4360 intK = WarpReduce(tempStorage).Sum(intK); NAMD_WARP_SYNC(WARP_FULL_MASK);
4361 intVNormal = WarpReduceT(tempStorageT).Sum(intVNormal); NAMD_WARP_SYNC(WARP_FULL_MASK);
4362 rigidV = WarpReduceT(tempStorageT).Sum(rigidV); NAMD_WARP_SYNC(WARP_FULL_MASK);
4364 if (maxForceNumber >= 1) {
4365 intVNbond = WarpReduceT(tempStorageT).Sum(intVNbond); NAMD_WARP_SYNC(WARP_FULL_MASK);
4367 if (maxForceNumber >= 2) {
4368 intVSlow = WarpReduceT(tempStorageT).Sum(intVSlow); NAMD_WARP_SYNC(WARP_FULL_MASK);
4373 if(threadIdx.x == 0){
4374 h_kineticEnergy[0] = k;
4375 h_intKineticEnergy[0] = intK;
4376 h_intVirialNormal[0] = intVNormal;
4378 if (maxForceNumber >= 1) {
4379 h_intVirialNbond[0] = intVNbond;
4381 if (maxForceNumber >= 2) {
4382 h_intVirialSlow[0] = intVSlow;
4385 h_rigidVirial[0] = rigidV;
4387 //resets atomic counter
4388 reset_atomic_counter(&tbcatomic[3]);
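// Note on DO_MTS: with multiple timestepping the nonbonded and slow internal virials
// must be reported separately (intVNbond / intVSlow); without MTS they are folded
// directly into intVNormal above, so only one tensor has to be reduced and copied out.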
4394 void SequencerCUDAKernel::submitReduction2(
4395 const bool doFixedAtoms,
4396 const int* atomFixed,
4397 const double *pos_x,
4398 const double *pos_y,
4399 const double *pos_z,
4400 const double *vel_x,
4401 const double *vel_y,
4402 const double *vel_z,
4403 const double *rcm_x,
4404 const double *rcm_y,
4405 const double *rcm_z,
4406 const double *vcm_x,
4407 const double *vcm_y,
4408 const double *vcm_z,
4409 const double *f_normal_x,
4410 const double *f_normal_y,
4411 const double *f_normal_z,
4412 const double *f_nbond_x,
4413 const double *f_nbond_y,
4414 const double *f_nbond_z,
4415 const double *f_slow_x,
4416 const double *f_slow_y,
4417 const double *f_slow_z,
4419 int *hydrogenGroupSize,
4420 BigReal *kineticEnergy,
4421 BigReal *h_kineticEnergy,
4422 BigReal *intKineticEnergy,
4423 BigReal *h_intKineticEnergy,
4424 cudaTensor *intVirialNormal,
4425 cudaTensor *intVirialNbond,
4426 cudaTensor *intVirialSlow,
4427 cudaTensor *h_intVirialNormal,
4428 cudaTensor *h_intVirialNbond,
4429 cudaTensor *h_intVirialSlow,
4430 cudaTensor* rigidVirial,
4431 cudaTensor* h_rigidVirial,
4432 unsigned int *tbcatomic,
4436 cudaStream_t stream)
4439 int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
4440 #define CALL(DO_FIXED, DO_MTS) \
4441 submitReduction2Kernel<ATOM_BLOCKS, DO_FIXED, DO_MTS><<<grid, ATOM_BLOCKS, 0, stream>>>( \
4442 atomFixed, pos_x, pos_y, pos_z, vel_x, vel_y, vel_z, \
4443 rcm_x, rcm_y, rcm_z, vcm_x, vcm_y, vcm_z, \
4444 f_normal_x, f_normal_y, f_normal_z, \
4445 f_nbond_x, f_nbond_y, f_nbond_z, \
4446 f_slow_x, f_slow_y, f_slow_z, \
4447 mass, hydrogenGroupSize, \
4448 kineticEnergy, h_kineticEnergy, \
4449 intKineticEnergy, h_intKineticEnergy, \
4450 intVirialNormal, intVirialNbond, intVirialSlow, \
4451 h_intVirialNormal, h_intVirialNbond, h_intVirialSlow, rigidVirial, \
4452 h_rigidVirial, tbcatomic, numAtoms, maxForceNumber)
4454 if (doMTS) CALL(true, true);
4455 else CALL(true, false);
4457 else if (!doFixedAtoms) {
4458 if (doMTS) CALL(false, true);
4459 else CALL(false, false);
4461 else NAMD_bug("No kernel was called in SequencerCUDAKernel::submitReduction2!\n");
4463 // cudaCheck(cudaGetLastError());
4466 __global__ void langevinVelocitiesBBK1Kernel(
4468 const float * __restrict langevinParam,
4469 double * __restrict vel_x,
4470 double * __restrict vel_y,
4471 double * __restrict vel_z,
4474 BigReal dt = timestep * (0.001 * TIMEFACTOR);
4475 int i = threadIdx.x + blockIdx.x*blockDim.x;
4477 BigReal dt_gamma = dt * langevinParam[i];
4478 BigReal scaling = 1. - 0.5 * dt_gamma;
4479 vel_x[i] *= scaling;
4480 vel_y[i] *= scaling;
4481 vel_z[i] *= scaling;
4485 void SequencerCUDAKernel::langevinVelocitiesBBK1(
4487 const float *langevinParam,
4492 cudaStream_t stream)
4494 int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
4495 langevinVelocitiesBBK1Kernel<<<grid, ATOM_BLOCKS, 0, stream>>>(
4496 timestep, langevinParam, vel_x, vel_y, vel_z, numAtoms);
4497 // cudaCheck(cudaGetLastError());
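// BBK first half-kick as implemented above (per atom):
//   dt_gamma = dt * langevinParam[i]   // langevinParam is the per-atom friction
//   v       *= 1 - 0.5 * dt_gamma
// The complementary scaling and the random kick are applied in langevinVelocitiesBBK2 below.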
4500 __global__ void langevinVelocitiesBBK2Kernel(
4502 const float * __restrict langScalVelBBK2,
4503 const float * __restrict langScalRandBBK2,
4504 const float * __restrict gaussrand_x,
4505 const float * __restrict gaussrand_y,
4506 const float * __restrict gaussrand_z,
4507 double * __restrict vel_x,
4508 double * __restrict vel_y,
4509 double * __restrict vel_z,
4512 int i = threadIdx.x + blockIdx.x*blockDim.x;
4514 vel_x[i] += gaussrand_x[i] * langScalRandBBK2[i];
4515 vel_y[i] += gaussrand_y[i] * langScalRandBBK2[i];
4516 vel_z[i] += gaussrand_z[i] * langScalRandBBK2[i];
4517 vel_x[i] *= langScalVelBBK2[i];
4518 vel_y[i] *= langScalVelBBK2[i];
4519 vel_z[i] *= langScalVelBBK2[i];
4525 void SequencerCUDAKernel::langevinVelocitiesBBK2(
4527 const float *langScalVelBBK2,
4528 const float *langScalRandBBK2,
4536 const int numAtomsGlobal,
4538 curandGenerator_t gen,
4539 cudaStream_t stream)
4542 // Note: requesting n == nAtomsDevice + 1 has produced misaligned-address errors,
4543 // so for multi-GPU runs the full gaussrand array is generated here without striding for now.
4545 // Set the generator's stream so the (cu/hip)RAND kernels launch on this stream.
4546 curandCheck(curandSetStream(gen, stream));
4547 // array buffers have to be even length for pseudorandom normal distribution
4548 // choose n to be the smallest even number >= numAtoms
4549 // we can choose 1 larger than numAtoms since allocation is > numAtoms
4550 int n = (numAtomsGlobal + 1) & (~1);
4552 curandCheck(curandGenerateNormal(gen, gaussrand_x, n, 0, 1));
4553 curandCheck(curandGenerateNormal(gen, gaussrand_y, n, 0, 1));
4554 curandCheck(curandGenerateNormal(gen, gaussrand_z, n, 0, 1));
4555 int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
4556 langevinVelocitiesBBK2Kernel<<<grid, ATOM_BLOCKS, 0, stream>>>(
4557 timestep, langScalVelBBK2, langScalRandBBK2,
4558 gaussrand_x + stride, gaussrand_y + stride, gaussrand_z + stride,
4559 vel_x, vel_y, vel_z, numAtoms);
4560 // cudaCheck(cudaGetLastError());
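// BBK second half-kick as implemented above (per atom):
//   v = (v + R * langScalRandBBK2[i]) * langScalVelBBK2[i],   R ~ N(0,1) from (cu/hip)RAND.
// curandGenerateNormal requires an even count, hence n is rounded up to an even number;
// the +stride offset presumably selects this device's slice of the random buffer in
// multi-GPU runs (it is passed in by the caller and not computed here).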
4563 template <bool doFixed>
4564 __global__ void reassignVelocitiesKernel(
4565 const BigReal timestep,
4566 const int* __restrict atomFixed,
4567 const float * __restrict gaussrand_x,
4568 const float * __restrict gaussrand_y,
4569 const float * __restrict gaussrand_z,
4570 double * __restrict vel_x,
4571 double * __restrict vel_y,
4572 double * __restrict vel_z,
4573 const double* __restrict d_recipMass,
4577 //! note we are NOT supporting LES (locally enhanced sampling) here
4578 //! hence no switch on the partition to apply tempfactor
4581 int i = threadIdx.x + blockIdx.x*blockDim.x;
4589 vel_x[i]=gaussrand_x[i]*sqrt(kbT*d_recipMass[i]);
4590 vel_y[i]=gaussrand_y[i]*sqrt(kbT*d_recipMass[i]);
4591 vel_z[i]=gaussrand_z[i]*sqrt(kbT*d_recipMass[i]);
4594 vel_x[i]=gaussrand_x[i]*sqrt(kbT*d_recipMass[i]);
4595 vel_y[i]=gaussrand_y[i]*sqrt(kbT*d_recipMass[i]);
4596 vel_z[i]=gaussrand_z[i]*sqrt(kbT*d_recipMass[i]);
4602 void SequencerCUDAKernel::reassignVelocities(
4603 const BigReal timestep,
4604 const bool doFixedAtoms,
4605 const int* atomFixed,
4612 const double* d_recipMass,
4615 const int numAtomsGlobal,
4617 curandGenerator_t gen,
4618 cudaStream_t stream)
4621 curandCheck(curandSetStream(gen, stream));
4622 // array buffers have to be even length for pseudorandom normal distribution
4623 // choose n to be the smallest even number >= numAtoms
4624 // we can choose 1 larger than numAtoms since allocation is > numAtoms
4625 int n = (numAtomsGlobal + 1) & (~1);
4627 curandCheck(curandGenerateNormal(gen, gaussrand_x, n, 0, 1));
4628 curandCheck(curandGenerateNormal(gen, gaussrand_y, n, 0, 1));
4629 curandCheck(curandGenerateNormal(gen, gaussrand_z, n, 0, 1));
4630 int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
4631 #define CALL(DOFIXED) \
4632 reassignVelocitiesKernel<DOFIXED><<<grid, ATOM_BLOCKS, 0, stream>>>( \
4633 timestep, atomFixed, \
4634 gaussrand_x + stride, gaussrand_y + stride, gaussrand_z + stride, \
4635 vel_x, vel_y, vel_z, d_recipMass, kbT, numAtoms)
4636 if (doFixedAtoms) CALL(true);
4639 cudaCheck(cudaStreamSynchronize(stream));
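// Velocity reassignment draws each component from a Maxwell-Boltzmann distribution,
//   v = N(0,1) * sqrt(kbT / m)   (sqrt(kbT * d_recipMass[i]) above),
// reusing the even-length RNG batch trick of the Langevin kernels. The doFixed template
// flag presumably gives fixed atoms special treatment on the code paths not shown here.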
4645 __global__ void updateVelocities(const int nRattles,
4648 const int * __restrict settleList,
4649 const CudaRattleElem * __restrict rattleList,
4650 const double * __restrict pos_x,
4651 const double * __restrict pos_y,
4652 const double * __restrict pos_z,
4653 const double * __restrict posNew_x,
4654 const double * __restrict posNew_y,
4655 const double * __restrict posNew_z,
4660 int tid = threadIdx.x + blockIdx.x*blockDim.x;
4661 int stride = gridDim.x*blockDim.x;
4664 for(int i = tid; i < nSettles; i += stride){
4665 int ig = settleList[i];
4666 // Recover the constrained velocities from the corrected positions
4667 // for the three atoms of this SETTLE (water) group
4668 velNew_x[ig] = (posNew_x[ig] - pos_x[ig]) * invdt;
4669 velNew_y[ig] = (posNew_y[ig] - pos_y[ig]) * invdt;
4670 velNew_z[ig] = (posNew_z[ig] - pos_z[ig]) * invdt;
4672 velNew_x[ig+1] = (posNew_x[ig+1] - pos_x[ig+1]) * invdt;
4673 velNew_y[ig+1] = (posNew_y[ig+1] - pos_y[ig+1]) * invdt;
4674 velNew_z[ig+1] = (posNew_z[ig+1] - pos_z[ig+1]) * invdt;
4676 velNew_x[ig+2] = (posNew_x[ig+2] - pos_x[ig+2]) * invdt;
4677 velNew_y[ig+2] = (posNew_y[ig+2] - pos_y[ig+2]) * invdt;
4678 velNew_z[ig+2] = (posNew_z[ig+2] - pos_z[ig+2]) * invdt;
4682 for(int i = tid; i < nRattles; i += stride){
4683 int ig = rattleList[i].ig;
4684 int icnt = rattleList[i].icnt;
4686 fixed[0] = false; fixed[1] = false;
4687 fixed[2] = false; fixed[3] = false;
4688 for(int j = 0; j < icnt; j++){
4689 //Gets two positions
4690 int ia = rattleList[i].params[j].ia;
4691 int ib = rattleList[i].params[j].ib;
4692 //checks if any of these positions have been updated yet
4694 velNew_x[ig+ia] = (posNew_x[ig+ia] - pos_x[ig+ia]) * invdt;
4695 velNew_y[ig+ia] = (posNew_y[ig+ia] - pos_y[ig+ia]) * invdt;
4696 velNew_z[ig+ia] = (posNew_z[ig+ia] - pos_z[ig+ia]) * invdt;
4700 velNew_x[ig+ib] = (posNew_x[ig+ib] - pos_x[ig+ib]) * invdt;
4701 velNew_y[ig+ib] = (posNew_y[ig+ib] - pos_y[ig+ib]) * invdt;
4702 velNew_z[ig+ib] = (posNew_z[ig+ib] - pos_z[ig+ib]) * invdt;
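// In both loops above the constrained velocity is recovered by finite difference,
//   velNew = (posNew - pos) / dt,
// so whatever position correction SETTLE/RATTLE applied is reflected in the velocities.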
4709 //JM: Function that adds a correction to the bonded forces
4710 template<bool doEnergy, bool doFixed>
4711 __global__ void addRattleForce(int numAtoms,
4713 const int* __restrict atomFixed,
4714 const float * __restrict mass,
4715 const double * __restrict pos_x,
4716 const double * __restrict pos_y,
4717 const double * __restrict pos_z,
4718 double * __restrict vel_x,
4719 double * __restrict vel_y,
4720 double * __restrict vel_z,
4721 double * __restrict velNew_x,
4722 double * __restrict velNew_y,
4723 double * __restrict velNew_z,
4724 double * __restrict f_normal_x,
4725 double * __restrict f_normal_y,
4726 double * __restrict f_normal_z,
4727 cudaTensor* __restrict virial,
4728 cudaTensor* __restrict h_virial,
4729 unsigned int* tbcatomic){
4732 // zero_cudaTensor(lVirial);
4733 int i = threadIdx.x + blockIdx.x*blockDim.x;
4737 lVirial.xx = 0.0; lVirial.xy = 0.0; lVirial.xz = 0.0;
4738 lVirial.yx = 0.0; lVirial.yy = 0.0; lVirial.yz = 0.0;
4739 lVirial.zx = 0.0; lVirial.zy = 0.0; lVirial.zz = 0.0;
4740 //for(int i = tid; i < numAtoms; i += stride){
4742 df[0] = (velNew_x[i] - vel_x[i]) * ((double)mass[i] * invdt);
4743 df[1] = (velNew_y[i] - vel_y[i]) * ((double)mass[i] * invdt);
4744 df[2] = (velNew_z[i] - vel_z[i]) * ((double)mass[i] * invdt);
4746 pos[0] = pos_x[i]; pos[1] = pos_y[i]; pos[2] = pos_z[i];
4747 lVirial.xx += df[0] * pos[0];
4748 lVirial.xy += df[0] * pos[1];
4749 lVirial.xz += df[0] * pos[2];
4750 lVirial.yx += df[1] * pos[0];
4751 lVirial.yy += df[1] * pos[1];
4752 lVirial.yz += df[1] * pos[2];
4753 lVirial.zx += df[2] * pos[0];
4754 lVirial.zy += df[2] * pos[1];
4755 lVirial.zz += df[2] * pos[2];
4758 //Updates force and velocities
4759 f_normal_x[i] += df[0];
4760 f_normal_y[i] += df[1];
4761 f_normal_z[i] += df[2];
4763 // instead of copying I can swap the pointers
4764 vel_x[i] = velNew_x[i];
4765 vel_y[i] = velNew_y[i];
4766 vel_z[i] = velNew_z[i];
4769 // Make sure every thread in the block has set its lVirial,
4770 // then block-reduce each component of the virial tensor
4771 // to get consistent values
4772 // The virial tensor is supposed to be symmetric
4773 // XXX TODO: Handwrite the reductions to avoid syncthreads
4775 typedef cub::BlockReduce<BigReal, ATOM_BLOCKS> BlockReduce;
4776 __shared__ typename BlockReduce::TempStorage temp_storage;
4777 lVirial.xx = BlockReduce(temp_storage).Sum(lVirial.xx); __syncthreads();
4779 lVirial.xy = BlockReduce(temp_storage).Sum(lVirial.xy); __syncthreads();
4780 lVirial.xz = BlockReduce(temp_storage).Sum(lVirial.xz); __syncthreads();
4782 lVirial.yx = BlockReduce(temp_storage).Sum(lVirial.yx); __syncthreads();
4783 lVirial.yy = BlockReduce(temp_storage).Sum(lVirial.yy);
4786 lVirial.yz = BlockReduce(temp_storage).Sum(lVirial.yz); __syncthreads();
4788 lVirial.zx = BlockReduce(temp_storage).Sum(lVirial.zx); __syncthreads();
4789 lVirial.zy = BlockReduce(temp_storage).Sum(lVirial.zy); __syncthreads();
4790 lVirial.zz = BlockReduce(temp_storage).Sum(lVirial.zz); __syncthreads();
4792 // Every block now holds a locally reduced virial;
4793 // thread 0 of each block atomically adds it into the global (binned) virial
4794 if(threadIdx.x == 0){
4795 const int bin = blockIdx.x % ATOMIC_BINS;
4797 atomicAdd(&virial[bin].xx, lVirial.xx);
4799 atomicAdd(&virial[bin].xy, lVirial.xy);
4800 atomicAdd(&virial[bin].xz, lVirial.xz);
4802 atomicAdd(&virial[bin].yx, lVirial.yx);
4803 atomicAdd(&virial[bin].yy, lVirial.yy);
4805 atomicAdd(&virial[bin].yz, lVirial.yz);
4807 atomicAdd(&virial[bin].zx, lVirial.zx);
4808 atomicAdd(&virial[bin].zy, lVirial.zy);
4809 atomicAdd(&virial[bin].zz, lVirial.zz);
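// addRattleForce thus converts the constraint solver's velocity correction into an
// effective force, df = m * (velNew - vel) / dt, adds it to f_normal, and accumulates
// its virial contribution (the outer product of df and pos) into the binned virial.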
4815 template <bool fillIndexes>
4816 __global__ void buildRattleParams(const int size,
4820 //Do I need new vectors for holding the rattle forces? Let's do this for now
4821 const float * __restrict mass,
4822 const int * __restrict hydrogenGroupSize,
4823 const float * __restrict rigidBondLength,
4824 const int * __restrict atomFixed,
4825 CudaRattleElem* rattleList,
4828 int tid = threadIdx.x + blockIdx.x*blockDim.x;
4829 int stride = gridDim.x * blockDim.x;
4838 for(int k = tid; k < size; k += stride){
4840 i = rattleIndexes[k];
4844 hgs = hydrogenGroupSize[ig];
4845 if(hgs == 1) continue; // single-atom group: nothing to constrain, skip early
4848 for(int j = 0; j < hgs; j++){
4849 rmass[j] = (mass[ig + j] > 0.f ? __frcp_rn(mass[ig+j]) : 0.f);
4850 fixed[j] = atomFixed[ig+j];
4856 float tmp = rigidBondLength[ig];
4857 // A water group is handled by SETTLE unless one of its atoms is fixed; skip it here if none are fixed
4859 if(!anyfixed) continue;
4860 if( !(fixed[1] && fixed[2]) ){
4861 dsq[icnt] = tmp*tmp;
4867 rattleIndexes[i] = i;
4870 for(int j = 1; j < hgs; j++){
4871 if((tmp = rigidBondLength[ig+j]) > 0){
4872 if( !(fixed[0] && fixed[j])){
4873 dsq[icnt] = tmp * tmp;
4879 // This is inefficient: it does an atomicAdd for each rattle
4880 // Improve this later
4881 rattleList[k].ig = ig;
4882 rattleList[k].icnt = icnt;
4883 for(int j = 0; j < icnt; j++){
4886 rattleList[k].params[j].ia = ia;
4887 rattleList[k].params[j].ib = ib;
4888 rattleList[k].params[j].dsq = dsq[j];
4889 rattleList[k].params[j].rma = rmass[ia];
4890 rattleList[k].params[j].rmb = rmass[ib];
4896 void buildRattleLists(
4897 const bool doFixedAtoms,
4902 const int *hydrogenGroupSize,
4903 const float *rigidBondLength,
4904 const int *atomFixed,
4906 size_t& settleListSize,
4908 size_t& consFailureSize,
4909 CudaRattleElem **rattleList,
4910 size_t& rattleListSize,
4918 char*& d_rattleList_temp_storage,
4919 size_t& temp_storage_bytes,
4920 int*& rattleIndexes,
4921 size_t& rattleIndexes_size,
4923 cudaStream_t stream){
4926 //thrust::device_ptr<const int> h_dev = thrust::device_pointer_cast(hydrogenGroupSize)
4927 size_t st_hgi_size = (size_t)(hgi_size);
4928 reallocate_device<int>(&hgi, &st_hgi_size, natoms, OVERALLOC);
4929 hgi_size = (size_t)(st_hgi_size);
4930 size_t temp_length = 1;
4931 reallocate_device<int>(&d_nHG, &temp_length, 1);
4932 reallocate_device<int>(&d_nSettles, &temp_length, 1);
4933 reallocate_device<int>(&d_nRattles, &temp_length, 1);
4935 copy_DtoD<int>(hydrogenGroupSize, hgi, natoms, stream);
4936 size_t new_storage_bytes = 0;
4937 // Removes zeroes from HGI array
4938 // Pass NULL to first CUB call to get temp buffer size
4939 cudaCheck(cub::DeviceSelect::If(NULL, new_storage_bytes, hgi, hgi,
4940 d_nHG, natoms, notZero(), stream));
4941 reallocate_device<char>(&d_rattleList_temp_storage, &temp_storage_bytes, new_storage_bytes, OVERALLOC);
4942 // DC: There were issues where reallocate wanted int types and CUB wanted unsigned ints
4943 new_storage_bytes = temp_storage_bytes;
4944 cudaCheck( cub::DeviceSelect::If(d_rattleList_temp_storage, new_storage_bytes, hgi, hgi,
4945 d_nHG, natoms, notZero(), stream));
4948 copy_DtoH<int>(d_nHG, &temp, 1, stream );
4951 cudaCheck(cudaStreamSynchronize(stream));
4953 // Exclusive scan to build an array of indices
4954 new_storage_bytes = 0;
4955 cub::DeviceScan::ExclusiveSum(NULL,
4956 new_storage_bytes, hgi, hgi, nHG, stream);
4957 reallocate_device<char>((char**) &d_rattleList_temp_storage, &temp_storage_bytes, new_storage_bytes, OVERALLOC);
4958 new_storage_bytes = temp_storage_bytes;
4959 cub::DeviceScan::ExclusiveSum(d_rattleList_temp_storage, new_storage_bytes,
4960 hgi, hgi, nHG, stream);
4962 size_t st_tempListSize = (size_t)(settleListSize);
4963 reallocate_device<int>(settleList, &st_tempListSize, nHG, OVERALLOC);
4964 settleListSize = (size_t)(st_tempListSize);
4966 st_tempListSize = (size_t)(rattleListSize);
4967 reallocate_device<CudaRattleElem>(rattleList, &st_tempListSize, nHG, OVERALLOC);
4968 rattleListSize = (size_t)(st_tempListSize);
4970 st_tempListSize = (size_t)(consFailureSize);
4971 reallocate_device<int>(consFailure, &st_tempListSize, nHG, OVERALLOC);
4972 consFailureSize = (size_t)(st_tempListSize);
4974 st_tempListSize = (size_t)(rattleIndexes_size);
4975 reallocate_device<int>(&rattleIndexes, &st_tempListSize, nHG, OVERALLOC);
4976 rattleIndexes_size = (size_t)(st_tempListSize);
4979 cudaCheck(cudaMemsetAsync(*rattleList, 0, sizeof(CudaRattleElem)*nHG, stream));
4980 cudaCheck(cudaMemsetAsync(rattleIndexes,-1, sizeof(int)*nHG, stream));
4981 cudaCheck(cudaMemsetAsync(*settleList, 0, sizeof(int)*nHG, stream));
4983 ///thrust::device_vector<int> hgi(natoms);
4984 new_storage_bytes = 0;
4986 isWater<true> comp(rigidBondLength, hydrogenGroupSize, atomFixed);
4987 // builds SettleList with isWater functor
4988 cub::DeviceSelect::If(NULL, new_storage_bytes, hgi, *settleList,
4989 d_nSettles, nHG, comp, stream);
4990 reallocate_device<char>((char**) &d_rattleList_temp_storage, &temp_storage_bytes, new_storage_bytes, OVERALLOC);
4991 new_storage_bytes = temp_storage_bytes;
4992 cub::DeviceSelect::If(d_rattleList_temp_storage, new_storage_bytes, hgi, *settleList,
4993 d_nSettles, nHG, comp, stream);
4995 isWater<false> comp(rigidBondLength, hydrogenGroupSize, atomFixed);
4996 // builds SettleList with isWater functor
4997 cub::DeviceSelect::If(NULL, new_storage_bytes, hgi, *settleList,
4998 d_nSettles, nHG, comp, stream);
4999 reallocate_device<char>((char**) &d_rattleList_temp_storage, &temp_storage_bytes, new_storage_bytes, OVERALLOC);
5000 new_storage_bytes = temp_storage_bytes;
5001 cub::DeviceSelect::If(d_rattleList_temp_storage, new_storage_bytes, hgi, *settleList,
5002 d_nSettles, nHG, comp, stream);
5005 // nSettles now holds the number of SETTLE (water) groups
5006 copy_DtoH<int>(d_nSettles, &nSettles, 1, stream);
5007 // fprintf(stderr, "nSettles = %d", nSettles);
5009 // First pass of buildRattleParams (fillIndexes=true): record in rattleIndexes which hydrogen groups need RATTLE
5010 buildRattleParams<true><<<128, 128, 0, stream>>>(nHG, dt, invdt,
5011 hgi, mass, hydrogenGroupSize,
5012 rigidBondLength, atomFixed, *rattleList, rattleIndexes);
5013 cudaCheck(cudaGetLastError());
5015 // Removing -1 from rattleIndexes
5016 new_storage_bytes = 0;
5017 cub::DeviceSelect::If(NULL, new_storage_bytes,
5018 rattleIndexes, rattleIndexes, d_nRattles, nHG, validRattle(), stream);
5019 reallocate_device<char>((char**) &d_rattleList_temp_storage, &temp_storage_bytes, new_storage_bytes, OVERALLOC);
5020 new_storage_bytes = temp_storage_bytes;
5021 cub::DeviceSelect::If(d_rattleList_temp_storage, new_storage_bytes,
5022 rattleIndexes, rattleIndexes, d_nRattles, nHG, validRattle(), stream);
5023 copy_DtoH<int>(d_nRattles, &nRattles, 1, stream);
5025 // Calculates rattleParams on rattleIndexes
5026 buildRattleParams<false><<<128, 128, 0, stream>>>(nRattles, dt, invdt,
5027 hgi, mass, hydrogenGroupSize,
5028 rigidBondLength, atomFixed, *rattleList, rattleIndexes);
5030 cudaCheck(cudaGetLastError());
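// buildRattleLists pipeline, as sketched by the calls above:
//   1. cub::DeviceSelect::If(notZero)   - compact hydrogenGroupSize into hgi (one entry per group)
//   2. cub::DeviceScan::ExclusiveSum    - turn group sizes into starting atom indices
//   3. cub::DeviceSelect::If(isWater)   - extract SETTLE-able water groups into settleList
//   4. buildRattleParams<true>          - mark hydrogen groups that need RATTLE in rattleIndexes
//   5. DeviceSelect::If(validRattle) + buildRattleParams<false> - compact the indexes and fill rattleList
// Each CUB call uses the usual two-phase pattern: a NULL-buffer call to query the temp
// storage size, reallocation, then the real run on the same stream.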
5033 // XXX TODO: JM: Memory access pattern in this function is bad, since we have to
5034 // access memory by the hydrogen groups. Improve it later
5036 void SequencerCUDAKernel::rattle1(
5037 const bool doFixedAtoms,
5038 const bool doEnergy,
5039 const bool pressure, // pressure is only false for startRun1 and startRun2 right now
5059 const int *hydrogenGroupSize,
5060 const float *rigidBondLength,
5062 const int *atomFixed,
5064 size_t& settleListSize,
5065 int **consFailure_d,
5066 size_t& consFailureSize,
5067 CudaRattleElem **rattleList,
5068 size_t& rattleListSize,
5072 cudaTensor *h_virial,
5073 unsigned int* tbcatomic,
5075 SettleParameters *sp,
5078 const WaterModel water_model,
5079 cudaStream_t stream)
5081 //Calls the necessary functions to enforce rigid bonds
5082 int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
5084 cudaCheck(cudaStreamSynchronize(stream));
5085 if(migration || !firstRattleDone){
5086 // (Re)build the SETTLE/RATTLE lists after migration or on first use
5088 doFixedAtoms, numAtoms, dt, invdt, mass, hydrogenGroupSize,
5089 rigidBondLength, atomFixed, settleList, settleListSize,
5090 consFailure_d, consFailureSize, rattleList, rattleListSize,
5092 d_nHG, d_nSettles, d_nRattles, hgi, hgi_size,
5093 d_rattleList_temp_storage, temp_storage_bytes,
5094 rattleIndexes, rattleIndexes_size,
5096 firstRattleDone = true;
5101 this->updateRigidArrays(
5102 doFixedAtoms, dt, atomFixed,
5103 vel_x, vel_y, vel_z,
5104 pos_x, pos_y, pos_z,
5105 velNew_x, velNew_y, velNew_z,
5106 posNew_x, posNew_y, posNew_z,
5110 // if(*nSettle != 0){
5113 numAtoms, dt, invdt, *nSettle,
5114 vel_x, vel_y, vel_z,
5115 pos_x, pos_y, pos_z,
5116 velNew_x, velNew_y, velNew_z,
5117 posNew_x, posNew_y, posNew_z,
5118 f_normal_x, f_normal_y, f_normal_z, virial, mass,
5119 hydrogenGroupSize, rigidBondLength, atomFixed,
5120 *settleList, sp, water_model, stream);
5126 doEnergy, doFixedAtoms, *rattleList, *nRattle, hydrogenGroupSize,
5127 atomFixed, pos_x, pos_y, pos_z,
5128 posNew_x, posNew_y, posNew_z,
5129 vel_x, vel_y, vel_z,
5130 velNew_x, velNew_y,velNew_z,
5131 f_normal_x, f_normal_y, f_normal_z, virial, mass,
5132 invdt, tol2, maxiter, *consFailure_d,
5133 consFailure, stream);
5136 Rattle1Kernel<<<grid, ATOM_BLOCKS, 0, stream>>>(numAtoms,
5137 dt, invdt, *nSettle,
5138 vel_x, vel_y, vel_z,
5139 pos_x, pos_y, pos_z,
5140 velNew_x, velNew_y, velNew_z,
5141 posNew_x, posNew_y, posNew_z,
5142 f_normal_x, f_normal_y, f_normal_z,
5144 hydrogenGroupSize, rigidBondLength, atomFixed,
5146 *rattleList, *nRattle, tol2, maxiter, *consFailure_d);
5151 // only for the first time step
5152 copy_DtoD<double>(posNew_x, pos_x, numAtoms, stream);
5153 copy_DtoD<double>(posNew_y, pos_y, numAtoms, stream);
5154 copy_DtoD<double>(posNew_z, pos_z, numAtoms, stream);
5155 cudaCheck(cudaStreamSynchronize(stream));
5156 } else if (pressure == 0) { // pressure is false for the first timesteps
5157 copy_DtoD<double>(velNew_x, vel_x, numAtoms, stream);
5158 copy_DtoD<double>(velNew_y, vel_y, numAtoms, stream);
5159 copy_DtoD<double>(velNew_z, vel_z, numAtoms, stream);
5160 cudaCheck(cudaStreamSynchronize(stream));
5162 #define CALL(DOENERGY, DOFIXED) \
5163 addRattleForce<DOENERGY, DOFIXED><<<grid, ATOM_BLOCKS, 0 ,stream>>>( \
5164 numAtoms, invdt, atomFixed, mass, \
5165 pos_x, pos_y, pos_z, \
5166 vel_x, vel_y, vel_z, \
5167 velNew_x, velNew_y, velNew_z, \
5168 f_normal_x, f_normal_y, f_normal_z, virial, h_virial, tbcatomic)
5169 if (doEnergy && doFixedAtoms) CALL(true, true);
5170 else if (doEnergy && !doFixedAtoms) CALL(true, false);
5171 else if (!doEnergy && doFixedAtoms) CALL(false, true);
5172 else if (!doEnergy && !doFixedAtoms) CALL(false, false);
5173 else NAMD_bug("addRattleForce was not called in SequencerCUDAKernel::rattle1!\n");
5178 // Call the fused SETTLE/RATTLE kernel path, which skips the virial tensor update
5179 // Compute the number of SETTLE and SHAKE thread blocks
5180 int nSettleBlocks = (*nSettle + 128 -1 ) / 128;
5181 int nShakeBlocks = (*nRattle + 128 -1 ) / 128;
5182 // int nTotalBlocks = (nSettleBlocks + nShakeBlocks);
5183 if (doFixedAtoms) NAMD_bug("CallRattle1Kernel does not support fixed atoms!\n");
5185 numAtoms, dt, invdt, *nSettle,
5186 vel_x, vel_y, vel_z,
5187 pos_x, pos_y, pos_z,
5188 f_normal_x, f_normal_y, f_normal_z,
5189 mass, hydrogenGroupSize, rigidBondLength, atomFixed,
5191 *rattleList, *nRattle, tol2, maxiter, *consFailure_d, nSettleBlocks,
5192 nShakeBlocks, water_model, stream
5199 template <bool T_NORMALIZED, bool T_DOENERGY>
5200 __global__ void eFieldKernel(
5202 const double3 eField,
5203 const double eFieldOmega,
5204 const double eFieldPhi,
5207 const char3* __restrict transform,
5208 const float* __restrict charges,
5209 const double* __restrict pos_x,
5210 const double* __restrict pos_y,
5211 const double* __restrict pos_z,
5212 double* __restrict f_normal_x,
5213 double* __restrict f_normal_y,
5214 double* __restrict f_normal_z,
5215 double3* __restrict d_extForce,
5216 cudaTensor* __restrict d_extVirial,
5217 double* __restrict d_extEnergy,
5218 double3* __restrict h_extForce,
5219 cudaTensor* __restrict h_extVirial,
5220 double* __restrict h_extEnergy,
5221 unsigned int* tbcatomic)
5223 int i = threadIdx.x + blockIdx.x*blockDim.x;
5226 // This can all be moved outside of the kernel
5229 double eCos = cos(eFieldOmega * t - eFieldPhi);
5230 double charge = charges[i];
5231 double3 r_extForce = {0, 0, 0};
5232 double r_extEnergy = 0;
5233 cudaTensor r_extVirial = {0};
5234 int totaltb = gridDim.x;
5235 __shared__ bool isLastBlockDone;
5236 eField1.x = eCos * eField.x;
5237 eField1.y = eCos * eField.y;
5238 eField1.z = eCos * eField.z;
5240 if(threadIdx.x == 0) {
5241 isLastBlockDone = 0;
5247 // This can be moved outside of the kernel
5249 eField1 = Vector(lat.a_r()* (Vector)eField1, lat.b_r()* (Vector)eField1, lat.c_r()* (Vector)eField1);
5252 fx = charge * eField1.x;
5253 fy = charge * eField1.y;
5254 fz = charge * eField1.z;
5256 f_normal_x[i] += fx;
5257 f_normal_y[i] += fy;
5258 f_normal_z[i] += fz;
5261 const char3 tr = transform[i];
5267 // all threads in the block compute their contribution to the energy, external force, and external virial
5268 vpos = lat.reverse_transform(vpos, tr);
5269 double3 o = lat.origin();
5270 r_extEnergy -= (fx * (vpos.x - o.x)) + (fy * (vpos.y - o.y)) + (fz * (vpos.z - o.z));
5277 // outer product to calculate a virial here
5278 r_extVirial.xx = fx * vpos.x;
5279 r_extVirial.xy = fx * vpos.y;
5280 r_extVirial.xz = fx * vpos.z;
5281 r_extVirial.yx = fy * vpos.x;
5282 r_extVirial.yy = fy * vpos.y;
5283 r_extVirial.yz = fy * vpos.z;
5284 r_extVirial.zx = fz * vpos.x;
5285 r_extVirial.zz = fz * vpos.z;
5286 r_extVirial.zy = fz * vpos.y;
5293 // use cub to reduce energies
5294 typedef cub::BlockReduce<double, ATOM_BLOCKS> BlockReduce;
5295 __shared__ typename BlockReduce::TempStorage temp_storage;
5296 r_extEnergy = BlockReduce(temp_storage).Sum(r_extEnergy);
5299 // Do the remaining reductions
5301 r_extForce.x = BlockReduce(temp_storage).Sum(r_extForce.x);
5303 r_extForce.y = BlockReduce(temp_storage).Sum(r_extForce.y);
5305 r_extForce.z = BlockReduce(temp_storage).Sum(r_extForce.z);
5308 r_extVirial.xx = BlockReduce(temp_storage).Sum(r_extVirial.xx);
5310 r_extVirial.xy = BlockReduce(temp_storage).Sum(r_extVirial.xy);
5312 r_extVirial.xz = BlockReduce(temp_storage).Sum(r_extVirial.xz);
5314 r_extVirial.yx = BlockReduce(temp_storage).Sum(r_extVirial.yx);
5316 r_extVirial.yy = BlockReduce(temp_storage).Sum(r_extVirial.yy);
5318 r_extVirial.yz = BlockReduce(temp_storage).Sum(r_extVirial.yz);
5320 r_extVirial.zx = BlockReduce(temp_storage).Sum(r_extVirial.zx);
5322 r_extVirial.zy = BlockReduce(temp_storage).Sum(r_extVirial.zy);
5324 r_extVirial.zz = BlockReduce(temp_storage).Sum(r_extVirial.zz);
5328 // Thread 0 atomically adds the block's partial sums into the global (binned) accumulators
5329 if(threadIdx.x == 0 ){
5330 const int bin = blockIdx.x % ATOMIC_BINS;
5332 // printf("adding %lf to d_extEnergy\n", r_extEnergy);
5333 atomicAdd(&d_extEnergy[bin], r_extEnergy);
5336 // add force and virial as well
5337 atomicAdd(&d_extForce[bin].x, r_extForce.x);
5338 atomicAdd(&d_extForce[bin].y, r_extForce.y);
5339 atomicAdd(&d_extForce[bin].z, r_extForce.z);
5341 atomicAdd(&d_extVirial[bin].xx, r_extVirial.xx);
5342 atomicAdd(&d_extVirial[bin].xy, r_extVirial.xy);
5343 atomicAdd(&d_extVirial[bin].xz, r_extVirial.xz);
5345 atomicAdd(&d_extVirial[bin].yx, r_extVirial.yx);
5346 atomicAdd(&d_extVirial[bin].yy, r_extVirial.yy);
5347 atomicAdd(&d_extVirial[bin].yz, r_extVirial.yz);
5349 atomicAdd(&d_extVirial[bin].zx, r_extVirial.zx);
5350 atomicAdd(&d_extVirial[bin].zy, r_extVirial.zy);
5351 atomicAdd(&d_extVirial[bin].zz, r_extVirial.zz);
5355 unsigned int value = atomicInc(&tbcatomic[2], totaltb);
5356 isLastBlockDone = (value == (totaltb -1));
5360 if(isLastBlockDone){
5361 if(threadIdx.x < ATOMIC_BINS){
5362 const int bin = threadIdx.x;
5364 double e = d_extEnergy[bin];
5368 f = d_extForce[bin];
5369 v = d_extVirial[bin];
5372 // sets device scalars back to zero
5373 d_extEnergy[bin] = 0.0;
5375 d_extForce[bin] = make_double3(0.0, 0.0, 0.0);
5376 zero_cudaTensor(d_extVirial[bin]);
5379 if(ATOMIC_BINS > 1){
5380 typedef cub::WarpReduce<double, (ATOMIC_BINS > 1 ? ATOMIC_BINS : 2)> WarpReduce;
5381 typedef cub::WarpReduce<cudaTensor, (ATOMIC_BINS > 1 ? ATOMIC_BINS : 2)> WarpReduceT;
5382 __shared__ typename WarpReduce::TempStorage tempStorage;
5383 __shared__ typename WarpReduceT::TempStorage tempStorageT;
5385 e = WarpReduce(tempStorage).Sum(e);
5387 f.x = WarpReduce(tempStorage).Sum(f.x);
5388 f.y = WarpReduce(tempStorage).Sum(f.y);
5389 f.z = WarpReduce(tempStorage).Sum(f.z);
5390 v = WarpReduceT(tempStorageT).Sum(v);
5394 if(threadIdx.x == 0){
5401 //resets atomic counter
5402 reset_atomic_counter(&(tbcatomic[2]));
5409 // JM NOTE: Apply external electric field to every atom in the system equally
5410 void SequencerCUDAKernel::apply_Efield(
5412 const bool normalized,
5413 const bool doEnergy,
5414 const double3 eField,
5415 const double eFieldOmega,
5416 const double eFieldPhi,
5419 const char3* transform,
5420 const float* charges,
5421 const double* pos_x,
5422 const double* pos_y,
5423 const double* pos_z,
5427 double3* d_extForce,
5428 cudaTensor* d_extVirial,
5429 double* d_extEnergy,
5430 double3* h_extForce,
5431 cudaTensor* h_extVirial,
5432 double* h_extEnergy,
5433 unsigned int* tbcatomic,
5436 int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
5439 eFieldKernel<true, true><<<grid, ATOM_BLOCKS, 0, stream>>>(numAtoms,
5440 eField, eFieldOmega, eFieldPhi, t, lat, transform, charges,
5441 pos_x, pos_y, pos_z,
5442 f_normal_x, f_normal_y, f_normal_z, d_extForce, d_extVirial,
5443 d_extEnergy, h_extForce, h_extVirial, h_extEnergy, tbcatomic);
5445 eFieldKernel<true, false><<<grid, ATOM_BLOCKS, 0, stream>>>(numAtoms,
5446 eField, eFieldOmega, eFieldPhi, t, lat, transform, charges,
5447 pos_x, pos_y, pos_z,
5448 f_normal_x, f_normal_y, f_normal_z, d_extForce, d_extVirial,
5449 d_extEnergy, h_extForce, h_extVirial, h_extEnergy, tbcatomic);
5453 eFieldKernel<false, true><<<grid, ATOM_BLOCKS, 0, stream>>>(numAtoms,
5454 eField, eFieldOmega, eFieldPhi, t, lat, transform, charges,
5455 pos_x, pos_y, pos_z,
5456 f_normal_x, f_normal_y, f_normal_z, d_extForce, d_extVirial,
5457 d_extEnergy, h_extForce, h_extVirial, h_extEnergy, tbcatomic);
5459 eFieldKernel<false, false><<<grid, ATOM_BLOCKS, 0, stream>>>(numAtoms,
5460 eField, eFieldOmega, eFieldPhi, t, lat, transform, charges,
5461 pos_x, pos_y, pos_z,
5462 f_normal_x, f_normal_y, f_normal_z, d_extForce, d_extVirial,
5463 d_extEnergy, h_extForce, h_extVirial, h_extEnergy, tbcatomic);
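// Dispatch note: the four launches above cover the <T_NORMALIZED, T_DOENERGY> template
// combinations. When T_NORMALIZED is set the field vector is presumably given in scaled
// (lattice) coordinates and converted with the reciprocal lattice vectors inside the
// kernel; when T_DOENERGY is set the block-wide energy/force/virial reductions are performed.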
5468 SequencerCUDAKernel::SequencerCUDAKernel() {
5469 firstRattleDone = false;
5470 intConstInit = false;
5479 d_rattleList_temp_storage = NULL;
5480 temp_storage_bytes = 0;
5482 rattleIndexes = NULL;
5483 rattleIndexes_size = 0;
5486 SequencerCUDAKernel::~SequencerCUDAKernel() {
5487 deallocate_device<int>(&d_nHG);
5488 deallocate_device<int>(&d_nSettles);
5489 deallocate_device<int>(&d_nRattles);
5490 deallocate_device<int>(&hgi);
5491 deallocate_device<int>(&rattleIndexes);
5492 deallocate_device<char>(&d_rattleList_temp_storage);
5502 void SequencerCUDAKernel::set_pme_positions(
5504 const bool isPmeDevice,
5506 const int numPatchesHomeAndProxy,
5507 const int numPatchesHome,
5512 const bool doAlchDecouple,
5513 const bool doAlchSoftCore,
5514 const bool handleBoundary,
5515 const double* d_pos_x,
5516 const double* d_pos_y,
5517 const double* d_pos_z,
5518 #ifndef NAMD_NCCL_ALLREDUCE
5519 double** d_peer_pos_x,
5520 double** d_peer_pos_y,
5521 double** d_peer_pos_z,
5522 float** d_peer_charge,
5523 int** d_peer_partition,
5525 const float* charges,
5526 const int* partition,
5527 const double charge_scaling,
5528 const double3* patchCenter,
5529 const int* s_patchPositions,
5530 const int* s_pencilPatchIndex,
5531 const int* s_patchIDs,
5532 const int* patchSortOrder,
5533 const Lattice lattice,
5539 CudaLocalRecord* localRecords,
5540 CudaPeerRecord* peerRecords,
5541 std::vector<int>& atomCounts,
5544 // Launch PME setComputes
5545 #define CALL(HOME, FEP, TI, ALCH_DECOUPLE, ALCH_SOFTCORE) \
5546 setComputePositionsKernel_PME<HOME, FEP, TI, ALCH_DECOUPLE, ALCH_SOFTCORE> \
5547 <<<pme_grid, PATCH_BLOCKS, 0, stream>>>( \
5548 d_pos_x, d_pos_y, d_pos_z, charges, \
5549 d_peer_pos_x, d_peer_pos_y, d_peer_pos_z, d_peer_charge, \
5550 d_peer_partition, partition, charge_scaling, \
5551 s_patchPositions, s_pencilPatchIndex, s_patchIDs, \
5552 lattice, s_atoms, numTotalAtoms, s_partition, \
5553 i, numAtoms, offset, handleBoundary \
5555 // Only when PME long-range electrostatics is enabled (doSlow is true).
5556 // The partition array is needed for alchemical calculations.
5557 if(doSlow && isPmeDevice) {
5559 for (int i = 0; i < nDev; i++) {
5560 const bool home = (i == devID);
5561 const int numAtoms = atomCounts[i];
5562 const int pme_grid = (numAtoms + PATCH_BLOCKS - 1) / PATCH_BLOCKS;
5563 const int options = (home << 4) + (doFEP << 3) + (doTI << 2) + (doAlchDecouple << 1) + doAlchSoftCore;
5566 case 0: CALL(0, 0, 0, 0, 0); break;
5567 case 1: CALL(0, 0, 0, 0, 1); break;
5568 case 2: CALL(0, 0, 0, 1, 0); break;
5569 case 3: CALL(0, 0, 0, 1, 1); break;
5570 case 4: CALL(0, 0, 1, 0, 0); break;
5571 case 5: CALL(0, 0, 1, 0, 1); break;
5572 case 6: CALL(0, 0, 1, 1, 0); break;
5573 case 7: CALL(0, 0, 1, 1, 1); break;
5574 case 8: CALL(0, 1, 0, 0, 0); break;
5575 case 9: CALL(0, 1, 0, 0, 1); break;
5576 case 10: CALL(0, 1, 0, 1, 0); break;
5577 case 11: CALL(0, 1, 0, 1, 1); break;
5578 case 12: CALL(0, 1, 1, 0, 0); break;
5579 case 13: CALL(0, 1, 1, 0, 1); break;
5580 case 14: CALL(0, 1, 1, 1, 0); break;
5581 case 15: CALL(0, 1, 1, 1, 1); break;
5582 case 16: CALL(1, 0, 0, 0, 0); break;
5583 case 17: CALL(1, 0, 0, 0, 1); break;
5584 case 18: CALL(1, 0, 0, 1, 0); break;
5585 case 19: CALL(1, 0, 0, 1, 1); break;
5586 case 20: CALL(1, 0, 1, 0, 0); break;
5587 case 21: CALL(1, 0, 1, 0, 1); break;
5588 case 22: CALL(1, 0, 1, 1, 0); break;
5589 case 23: CALL(1, 0, 1, 1, 1); break;
5590 case 24: CALL(1, 1, 0, 0, 0); break;
5591 case 25: CALL(1, 1, 0, 0, 1); break;
5592 case 26: CALL(1, 1, 0, 1, 0); break;
5593 case 27: CALL(1, 1, 0, 1, 1); break;
5594 case 28: CALL(1, 1, 1, 0, 0); break;
5595 case 29: CALL(1, 1, 1, 0, 1); break;
5596 case 30: CALL(1, 1, 1, 1, 0); break;
5597 case 31: CALL(1, 1, 1, 1, 1); break;
5599 NAMD_die("SequencerCUDAKernel::setComputePositions: no kernel called");
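// The switch above decodes the 'options' bitfield,
//   options = (home << 4) | (doFEP << 3) | (doTI << 2) | (doAlchDecouple << 1) | doAlchSoftCore,
// so each case number, read in binary, supplies the five template arguments of
// setComputePositionsKernel_PME in that order.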
5608 template <bool doGlobal, bool doForcesOutput>
5609 __global__ void copyForcesToHostSOAKernel(
5610 const int numPatches,
5611 CudaLocalRecord* localRecords,
5612 const int maxForceNumber,
5613 const double* d_f_normal_x,
5614 const double* d_f_normal_y,
5615 const double* d_f_normal_z,
5616 const double* d_f_nbond_x,
5617 const double* d_f_nbond_y,
5618 const double* d_f_nbond_z,
5619 const double* d_f_slow_x,
5620 const double* d_f_slow_y,
5621 const double* d_f_slow_z,
5622 PatchDataSOA* d_HostPatchDataSOA
5624 __shared__ CudaLocalRecord s_record;
5625 using AccessType = int32_t;
5626 AccessType* s_record_buffer = (AccessType*) &s_record;
5628 for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
5629 // Read in the CudaLocalRecord using multiple threads. This should
5631 for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
5632 s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
5636 const int numAtoms = s_record.numAtoms;
5637 const int offset = s_record.bufferOffset;
5639 for (int i = threadIdx.x; i < numAtoms; i += blockDim.x) {
5640 if (maxForceNumber >= 0) {
5641 d_HostPatchDataSOA[patchIndex].f_normal_x[i] = d_f_normal_x[offset + i];
5642 d_HostPatchDataSOA[patchIndex].f_normal_y[i] = d_f_normal_y[offset + i];
5643 d_HostPatchDataSOA[patchIndex].f_normal_z[i] = d_f_normal_z[offset + i];
5645 if (maxForceNumber >= 1) {
5647 d_HostPatchDataSOA[patchIndex].f_saved_nbond_x[i] = d_f_nbond_x[offset + i];
5648 d_HostPatchDataSOA[patchIndex].f_saved_nbond_y[i] = d_f_nbond_y[offset + i];
5649 d_HostPatchDataSOA[patchIndex].f_saved_nbond_z[i] = d_f_nbond_z[offset + i];
5651 if (doForcesOutput) {
5652 d_HostPatchDataSOA[patchIndex].f_nbond_x[i] = d_f_nbond_x[offset + i];
5653 d_HostPatchDataSOA[patchIndex].f_nbond_y[i] = d_f_nbond_y[offset + i];
5654 d_HostPatchDataSOA[patchIndex].f_nbond_z[i] = d_f_nbond_z[offset + i];
5657 if (maxForceNumber >= 2) {
5659 d_HostPatchDataSOA[patchIndex].f_saved_slow_x[i] = d_f_slow_x[offset + i];
5660 d_HostPatchDataSOA[patchIndex].f_saved_slow_y[i] = d_f_slow_y[offset + i];
5661 d_HostPatchDataSOA[patchIndex].f_saved_slow_z[i] = d_f_slow_z[offset + i];
5663 if (doForcesOutput) {
5664 d_HostPatchDataSOA[patchIndex].f_slow_x[i] = d_f_slow_x[offset + i];
5665 d_HostPatchDataSOA[patchIndex].f_slow_y[i] = d_f_slow_y[offset + i];
5666 d_HostPatchDataSOA[patchIndex].f_slow_z[i] = d_f_slow_z[offset + i];
5675 void SequencerCUDAKernel::copyForcesToHostSOA(
5676 const int numPatches,
5677 CudaLocalRecord* localRecords,
5678 const int maxForceNumber,
5679 const double* d_f_normal_x,
5680 const double* d_f_normal_y,
5681 const double* d_f_normal_z,
5682 const double* d_f_nbond_x,
5683 const double* d_f_nbond_y,
5684 const double* d_f_nbond_z,
5685 const double* d_f_slow_x,
5686 const double* d_f_slow_y,
5687 const double* d_f_slow_z,
5688 PatchDataSOA* d_HostPatchDataSOA,
5689 const bool doGlobal,
5690 const bool doForcesOutput,
5693 #define CALL(GLOBAL, FORCESOUTPUT) \
5694 do {copyForcesToHostSOAKernel<GLOBAL, FORCESOUTPUT><<<numPatches, PATCH_BLOCKS, 0, stream>>>( \
5707 d_HostPatchDataSOA \
5709 if (doGlobal && !doForcesOutput) CALL(true, false);
5710 else if (doGlobal && doForcesOutput) CALL(true, true);
5711 else if (!doGlobal && doForcesOutput) CALL(false, true);
5712 else NAMD_bug("No kernel is called in copyForcesToHostSOA");
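// Note: the (doGlobal == false, doForcesOutput == false) combination is rejected because
// this copy is only needed when host-side force buffers are required, presumably for the
// global-forces path or for force output. The f_saved_* arrays always receive the
// nonbonded/slow forces, while f_nbond_*/f_slow_* are filled only when doForcesOutput is set.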
5716 __global__ void copyPositionsToHostSOAKernel(
5717 const int numPatches,
5718 CudaLocalRecord* localRecords,
5719 const double* pos_x,
5720 const double* pos_y,
5721 const double* pos_z,
5722 PatchDataSOA* d_HostPatchDataSOA
5724 __shared__ CudaLocalRecord s_record;
5725 using AccessType = int32_t;
5726 AccessType* s_record_buffer = (AccessType*) &s_record;
5728 for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
5729 // Read in the CudaLocalRecord using multiple threads. This should
5731 for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
5732 s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
5736 const int numAtoms = s_record.numAtoms;
5737 const int offset = s_record.bufferOffset;
5739 for (int i = threadIdx.x; i < numAtoms; i += blockDim.x) {
5740 d_HostPatchDataSOA[patchIndex].pos_x[i] = pos_x[offset + i];
5741 d_HostPatchDataSOA[patchIndex].pos_y[i] = pos_y[offset + i];
5742 d_HostPatchDataSOA[patchIndex].pos_z[i] = pos_z[offset + i];
5748 void SequencerCUDAKernel::copyPositionsToHostSOA(
5749 const int numPatches,
5750 CudaLocalRecord* localRecords,
5751 const double* pos_x,
5752 const double* pos_y,
5753 const double* pos_z,
5754 PatchDataSOA* d_HostPatchDataSOA,
5757 copyPositionsToHostSOAKernel<<<numPatches, PATCH_BLOCKS, 0, stream>>>(
5760 pos_x, pos_y, pos_z,
5765 // Adapted from HomePatch::redistrib_lp_water_force
5766 /* Redistribute forces from the massless lonepair charge particle onto
5767 * the other atoms of the water.
5769 * This is done using the same algorithm that CHARMM uses for TIP4P lone pairs.
5771 * Pass by reference the forces (O H1 H2 LP) to be modified,
5772 * pass by constant reference the corresponding positions.
template <bool doVirial>
__device__ void redistrib_lp_water_force(
    double f_ox[3], double f_h1[3], double f_h2[3], double f_lp[3],
    const double p_ox[3], const double p_h1[3], const double p_h2[3],
    const double p_lp[3], cudaTensor& virial) {
  // Accumulate force adjustments
  double fad_ox[3] = {};
  double fad_h[3] = {};
  // Calculate the radial component of the force and add it to the oxygen
  const double r_ox_lp[3] = {p_lp[0] - p_ox[0],
                             p_lp[1] - p_ox[1],
                             p_lp[2] - p_ox[2]};
  double invlen2_r_ox_lp = rnorm3d(r_ox_lp[0], r_ox_lp[1], r_ox_lp[2]);
  invlen2_r_ox_lp *= invlen2_r_ox_lp;
  const double rad_factor =
    (f_lp[0] * r_ox_lp[0] + f_lp[1] * r_ox_lp[1] + f_lp[2] * r_ox_lp[2]) * invlen2_r_ox_lp;
  const double f_rad[3] = {r_ox_lp[0] * rad_factor,
                           r_ox_lp[1] * rad_factor,
                           r_ox_lp[2] * rad_factor};
  fad_ox[0] += f_rad[0];
  fad_ox[1] += f_rad[1];
  fad_ox[2] += f_rad[2];
  // Calculate the angular component
  const double r_hcom_ox[3] = {
    p_ox[0] - ( (p_h1[0] + p_h2[0]) * 0.5 ),
    p_ox[1] - ( (p_h1[1] + p_h2[1]) * 0.5 ),
    p_ox[2] - ( (p_h1[2] + p_h2[2]) * 0.5 )};
  const double f_ang[3] = {
    f_lp[0] - f_rad[0],
    f_lp[1] - f_rad[1],
    f_lp[2] - f_rad[2]};
  // now split this component onto the other atoms
  const double len_r_ox_lp = norm3d(r_ox_lp[0], r_ox_lp[1], r_ox_lp[2]);
  const double invlen_r_hcom_ox = rnorm3d(
    r_hcom_ox[0], r_hcom_ox[1], r_hcom_ox[2]);
  const double oxcomp =
    (norm3d(r_hcom_ox[0], r_hcom_ox[1], r_hcom_ox[2]) - len_r_ox_lp) *
    invlen_r_hcom_ox;
  const double hydcomp = 0.5 * len_r_ox_lp * invlen_r_hcom_ox;
  fad_ox[0] += (f_ang[0] * oxcomp);
  fad_ox[1] += (f_ang[1] * oxcomp);
  fad_ox[2] += (f_ang[2] * oxcomp);
  fad_h[0] += (f_ang[0] * hydcomp);
  fad_h[1] += (f_ang[1] * hydcomp);
  fad_h[2] += (f_ang[2] * hydcomp);
  if (doVirial) {
    virial.xx = fad_ox[0] * p_ox[0] + fad_h[0] * p_h1[0] + fad_h[0] * p_h2[0] - f_lp[0] * p_lp[0];
    virial.xy = fad_ox[0] * p_ox[1] + fad_h[0] * p_h1[1] + fad_h[0] * p_h2[1] - f_lp[0] * p_lp[1];
    virial.xz = fad_ox[0] * p_ox[2] + fad_h[0] * p_h1[2] + fad_h[0] * p_h2[2] - f_lp[0] * p_lp[2];
    virial.yx = fad_ox[1] * p_ox[0] + fad_h[1] * p_h1[0] + fad_h[1] * p_h2[0] - f_lp[1] * p_lp[0];
    virial.yy = fad_ox[1] * p_ox[1] + fad_h[1] * p_h1[1] + fad_h[1] * p_h2[1] - f_lp[1] * p_lp[1];
    virial.yz = fad_ox[1] * p_ox[2] + fad_h[1] * p_h1[2] + fad_h[1] * p_h2[2] - f_lp[1] * p_lp[2];
    virial.zx = fad_ox[2] * p_ox[0] + fad_h[2] * p_h1[0] + fad_h[2] * p_h2[0] - f_lp[2] * p_lp[0];
    virial.zy = fad_ox[2] * p_ox[1] + fad_h[2] * p_h1[1] + fad_h[2] * p_h2[1] - f_lp[2] * p_lp[1];
    virial.zz = fad_ox[2] * p_ox[2] + fad_h[2] * p_h1[2] + fad_h[2] * p_h2[2] - f_lp[2] * p_lp[2];
  }
  // Apply the adjustments to the oxygen and the two hydrogens
  f_ox[0] += fad_ox[0];
  f_ox[1] += fad_ox[1];
  f_ox[2] += fad_ox[2];
  f_h1[0] += fad_h[0];
  f_h1[1] += fad_h[1];
  f_h1[2] += fad_h[2];
  f_h2[0] += fad_h[0];
  f_h2[1] += fad_h[1];
  f_h2[2] += fad_h[2];
}
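/* One thread per atom. A lonepair site is identified by a near-zero mass
 * (d_mass[i] < 0.01f); the kernel assumes the water is stored consecutively
 * as O, H1, H2, LP, so the parent atoms are at i-3, i-2, i-1. The per-thread
 * virial corrections are block-reduced and accumulated into d_virial. */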
template <bool doVirial>
__global__ void redistributeTip4pForcesKernel2(
  double* d_f_x,
  double* d_f_y,
  double* d_f_z,
  cudaTensor* d_virial,
  const double* d_pos_x,
  const double* d_pos_y,
  const double* d_pos_z,
  const float* d_mass,
  const int numAtoms)
{
  const int i = threadIdx.x + blockIdx.x * blockDim.x;
  cudaTensor lVirial = {0};
  if (i < numAtoms) {
    if (d_mass[i] < 0.01f) {
      double f_ox[3] = {d_f_x[i-3], d_f_y[i-3], d_f_z[i-3]};
      double f_h1[3] = {d_f_x[i-2], d_f_y[i-2], d_f_z[i-2]};
      double f_h2[3] = {d_f_x[i-1], d_f_y[i-1], d_f_z[i-1]};
      double f_lp[3] = {d_f_x[i], d_f_y[i], d_f_z[i]};
      const double p_ox[3] = {d_pos_x[i-3], d_pos_y[i-3], d_pos_z[i-3]};
      const double p_h1[3] = {d_pos_x[i-2], d_pos_y[i-2], d_pos_z[i-2]};
      const double p_h2[3] = {d_pos_x[i-1], d_pos_y[i-1], d_pos_z[i-1]};
      const double p_lp[3] = {d_pos_x[i], d_pos_y[i], d_pos_z[i]};
      redistrib_lp_water_force<doVirial>(
        f_ox, f_h1, f_h2, f_lp, p_ox, p_h1, p_h2, p_lp, lVirial);
      // copy the force back
      d_f_x[i-3] = f_ox[0];
      d_f_x[i-2] = f_h1[0];
      d_f_x[i-1] = f_h2[0];

      d_f_y[i-3] = f_ox[1];
      d_f_y[i-2] = f_h1[1];
      d_f_y[i-1] = f_h2[1];

      d_f_z[i-3] = f_ox[2];
      d_f_z[i-2] = f_h1[2];
      d_f_z[i-1] = f_h2[2];
    }
  }
  if (doVirial) {
    typedef cub::BlockReduce<BigReal, ATOM_BLOCKS> BlockReduce;
    __shared__ typename BlockReduce::TempStorage temp_storage;
    lVirial.xx = BlockReduce(temp_storage).Sum(lVirial.xx);
    __syncthreads();
    lVirial.xy = BlockReduce(temp_storage).Sum(lVirial.xy);
    __syncthreads();
    lVirial.xz = BlockReduce(temp_storage).Sum(lVirial.xz);
    __syncthreads();
    lVirial.yx = BlockReduce(temp_storage).Sum(lVirial.yx);
    __syncthreads();
    lVirial.yy = BlockReduce(temp_storage).Sum(lVirial.yy);
    __syncthreads();
    lVirial.yz = BlockReduce(temp_storage).Sum(lVirial.yz);
    __syncthreads();
    lVirial.zx = BlockReduce(temp_storage).Sum(lVirial.zx);
    __syncthreads();
    lVirial.zy = BlockReduce(temp_storage).Sum(lVirial.zy);
    __syncthreads();
    lVirial.zz = BlockReduce(temp_storage).Sum(lVirial.zz);

    if (threadIdx.x == 0) {
      atomicAdd(&(d_virial->xx), lVirial.xx);
      atomicAdd(&(d_virial->xy), lVirial.xy);
      atomicAdd(&(d_virial->xz), lVirial.xz);
      atomicAdd(&(d_virial->yx), lVirial.yx);
      atomicAdd(&(d_virial->yy), lVirial.yy);
      atomicAdd(&(d_virial->yz), lVirial.yz);
      atomicAdd(&(d_virial->zx), lVirial.zx);
      atomicAdd(&(d_virial->zy), lVirial.zy);
      atomicAdd(&(d_virial->zz), lVirial.zz);
    }
  }
}
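/* The host wrapper below launches one kernel per force class. The switch
 * appears to rely on intentional fall-through: maxForceNumber == 2 processes
 * the slow, nonbonded, and normal force arrays in turn, while 0 touches only
 * the normal forces. */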
void SequencerCUDAKernel::redistributeTip4pForces(
  double* d_f_normal_x,
  double* d_f_normal_y,
  double* d_f_normal_z,
  double* d_f_nbond_x,
  double* d_f_nbond_y,
  double* d_f_nbond_z,
  double* d_f_slow_x,
  double* d_f_slow_y,
  double* d_f_slow_z,
  cudaTensor* d_virial_normal,
  cudaTensor* d_virial_nbond,
  cudaTensor* d_virial_slow,
  const double* d_pos_x,
  const double* d_pos_y,
  const double* d_pos_z,
  const float* d_mass,
  const int numAtoms,
  const bool doVirial,
  const int maxForceNumber,
  cudaStream_t stream)
{
  const int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
  switch (maxForceNumber) {
    case 2:
      if (doVirial) redistributeTip4pForcesKernel2<true><<<grid, ATOM_BLOCKS, 0, stream>>>(d_f_slow_x, d_f_slow_y, d_f_slow_z, d_virial_slow, d_pos_x, d_pos_y, d_pos_z, d_mass, numAtoms);
      else redistributeTip4pForcesKernel2<false><<<grid, ATOM_BLOCKS, 0, stream>>>(d_f_slow_x, d_f_slow_y, d_f_slow_z, d_virial_slow, d_pos_x, d_pos_y, d_pos_z, d_mass, numAtoms);
      // fall through
    case 1:
      if (doVirial) redistributeTip4pForcesKernel2<true><<<grid, ATOM_BLOCKS, 0, stream>>>(d_f_nbond_x, d_f_nbond_y, d_f_nbond_z, d_virial_nbond, d_pos_x, d_pos_y, d_pos_z, d_mass, numAtoms);
      else redistributeTip4pForcesKernel2<false><<<grid, ATOM_BLOCKS, 0, stream>>>(d_f_nbond_x, d_f_nbond_y, d_f_nbond_z, d_virial_nbond, d_pos_x, d_pos_y, d_pos_z, d_mass, numAtoms);
      // fall through
    case 0:
      if (doVirial) redistributeTip4pForcesKernel2<true><<<grid, ATOM_BLOCKS, 0, stream>>>(d_f_normal_x, d_f_normal_y, d_f_normal_z, d_virial_normal, d_pos_x, d_pos_y, d_pos_z, d_mass, numAtoms);
      else redistributeTip4pForcesKernel2<false><<<grid, ATOM_BLOCKS, 0, stream>>>(d_f_normal_x, d_f_normal_y, d_f_normal_z, d_virial_normal, d_pos_x, d_pos_y, d_pos_z, d_mass, numAtoms);
  }
}
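/* Per-atom outer product used by the fixed-atom virial correction below:
 * tensor_out = factor * f_i (outer) r_i, written component-wise. The helper
 * overwrites tensor_out; the caller is responsible for any accumulation. */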
__forceinline__ __device__ void calcVirial(
  const int i, const double factor,
  const double* __restrict f_x, const double* __restrict f_y, const double* __restrict f_z,
  const double* __restrict r_x, const double* __restrict r_y, const double* __restrict r_z,
  cudaTensor& tensor_out) {
  tensor_out.xx = factor * f_x[i] * r_x[i];
  tensor_out.xy = factor * f_x[i] * r_y[i];
  tensor_out.xz = factor * f_x[i] * r_z[i];
  tensor_out.yx = factor * f_y[i] * r_x[i];
  tensor_out.yy = factor * f_y[i] * r_y[i];
  tensor_out.yz = factor * f_y[i] * r_z[i];
  tensor_out.zx = factor * f_z[i] * r_x[i];
  tensor_out.zy = factor * f_z[i] * r_y[i];
  tensor_out.zz = factor * f_z[i] * r_z[i];
}
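/* Fixed atoms are never moved by the integrator, but the forces they feel
 * still enter the pressure. For each fixed atom this kernel subtracts
 * f (outer) r, evaluated at the fixed reference positions, from the normal,
 * nonbonded, and slow virials and accumulates the corresponding external
 * force sums; doNbond/doSlow select which force classes are present. */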
template <bool doNbond, bool doSlow>
__global__ void calcFixVirialKernel(
  const int numAtoms,
  const int* __restrict d_atomFixed,
  const double* __restrict d_fixedPosition_x,
  const double* __restrict d_fixedPosition_y,
  const double* __restrict d_fixedPosition_z,
  const double* __restrict d_f_normal_x,
  const double* __restrict d_f_normal_y,
  const double* __restrict d_f_normal_z,
  const double* __restrict d_f_nbond_x,
  const double* __restrict d_f_nbond_y,
  const double* __restrict d_f_nbond_z,
  const double* __restrict d_f_slow_x,
  const double* __restrict d_f_slow_y,
  const double* __restrict d_f_slow_z,
  cudaTensor* __restrict d_virial_normal,
  cudaTensor* __restrict d_virial_nbond,
  cudaTensor* __restrict d_virial_slow,
  double3* __restrict d_extForce_normal,
  double3* __restrict d_extForce_nbond,
  double3* __restrict d_extForce_slow)
{
  const int i = threadIdx.x + blockIdx.x * blockDim.x;
  cudaTensor vir_normal = {0};
  cudaTensor vir_nbond = {0};
  cudaTensor vir_slow = {0};
  double3 f_sum_normal = {0, 0, 0};
  double3 f_sum_nbond;
  if (doNbond) {
    f_sum_nbond = {0, 0, 0};
  }
  double3 f_sum_slow;
  if (doSlow) {
    f_sum_slow = {0, 0, 0};
  }
  if (i < numAtoms) {
    if (d_atomFixed[i]) {
      calcVirial(i, -1.0, d_f_normal_x, d_f_normal_y, d_f_normal_z,
                 d_fixedPosition_x, d_fixedPosition_y, d_fixedPosition_z, vir_normal);
      f_sum_normal.x = -1.0 * d_f_normal_x[i];
      f_sum_normal.y = -1.0 * d_f_normal_y[i];
      f_sum_normal.z = -1.0 * d_f_normal_z[i];
      if (doNbond) {
        calcVirial(i, -1.0, d_f_nbond_x, d_f_nbond_y, d_f_nbond_z,
                   d_fixedPosition_x, d_fixedPosition_y, d_fixedPosition_z, vir_nbond);
        f_sum_nbond.x = -1.0 * d_f_nbond_x[i];
        f_sum_nbond.y = -1.0 * d_f_nbond_y[i];
        f_sum_nbond.z = -1.0 * d_f_nbond_z[i];
      }
      if (doSlow) {
        calcVirial(i, -1.0, d_f_slow_x, d_f_slow_y, d_f_slow_z,
                   d_fixedPosition_x, d_fixedPosition_y, d_fixedPosition_z, vir_slow);
        f_sum_slow.x = -1.0 * d_f_slow_x[i];
        f_sum_slow.y = -1.0 * d_f_slow_y[i];
        f_sum_slow.z = -1.0 * d_f_slow_z[i];
      }
    }
  }
  // Haochuan: is it OK to use lambda to reduce the cudaTensor as a whole instead of component-wise?
  typedef cub::BlockReduce<cudaTensor, ATOM_BLOCKS> BlockReduceCudaTensor;
  __shared__ typename BlockReduceCudaTensor::TempStorage temp_storage;
  vir_normal = BlockReduceCudaTensor(temp_storage).Reduce(vir_normal, [](const cudaTensor& a, const cudaTensor& b){return a+b;});
  __syncthreads();
  if (doNbond) {
    vir_nbond = BlockReduceCudaTensor(temp_storage).Reduce(vir_nbond, [](const cudaTensor& a, const cudaTensor& b){return a+b;});
    __syncthreads();
  }
  if (doSlow) {
    vir_slow = BlockReduceCudaTensor(temp_storage).Reduce(vir_slow, [](const cudaTensor& a, const cudaTensor& b){return a+b;});
    __syncthreads();
  }
  typedef cub::BlockReduce<double3, ATOM_BLOCKS> BlockReduceDouble3;
  __shared__ typename BlockReduceDouble3::TempStorage temp_storage2;
  f_sum_normal = BlockReduceDouble3(temp_storage2).Reduce(f_sum_normal, [](const double3& a, const double3& b){return a+b;});
  __syncthreads();
  if (doNbond) {
    f_sum_nbond = BlockReduceDouble3(temp_storage2).Reduce(f_sum_nbond, [](const double3& a, const double3& b){return a+b;});
    __syncthreads();
  }
  if (doSlow) {
    f_sum_slow = BlockReduceDouble3(temp_storage2).Reduce(f_sum_slow, [](const double3& a, const double3& b){return a+b;});
  }
  if (threadIdx.x == 0) {
    atomicAdd(&(d_virial_normal->xx), vir_normal.xx);
    atomicAdd(&(d_virial_normal->xy), vir_normal.xy);
    atomicAdd(&(d_virial_normal->xz), vir_normal.xz);
    atomicAdd(&(d_virial_normal->yx), vir_normal.yx);
    atomicAdd(&(d_virial_normal->yy), vir_normal.yy);
    atomicAdd(&(d_virial_normal->yz), vir_normal.yz);
    atomicAdd(&(d_virial_normal->zx), vir_normal.zx);
    atomicAdd(&(d_virial_normal->zy), vir_normal.zy);
    atomicAdd(&(d_virial_normal->zz), vir_normal.zz);
    atomicAdd(&(d_extForce_normal->x), f_sum_normal.x);
    atomicAdd(&(d_extForce_normal->y), f_sum_normal.y);
    atomicAdd(&(d_extForce_normal->z), f_sum_normal.z);
    if (doNbond) {
      atomicAdd(&(d_virial_nbond->xx), vir_nbond.xx);
      atomicAdd(&(d_virial_nbond->xy), vir_nbond.xy);
      atomicAdd(&(d_virial_nbond->xz), vir_nbond.xz);
      atomicAdd(&(d_virial_nbond->yx), vir_nbond.yx);
      atomicAdd(&(d_virial_nbond->yy), vir_nbond.yy);
      atomicAdd(&(d_virial_nbond->yz), vir_nbond.yz);
      atomicAdd(&(d_virial_nbond->zx), vir_nbond.zx);
      atomicAdd(&(d_virial_nbond->zy), vir_nbond.zy);
      atomicAdd(&(d_virial_nbond->zz), vir_nbond.zz);
      atomicAdd(&(d_extForce_nbond->x), f_sum_nbond.x);
      atomicAdd(&(d_extForce_nbond->y), f_sum_nbond.y);
      atomicAdd(&(d_extForce_nbond->z), f_sum_nbond.z);
    }
    if (doSlow) {
      atomicAdd(&(d_virial_slow->xx), vir_slow.xx);
      atomicAdd(&(d_virial_slow->xy), vir_slow.xy);
      atomicAdd(&(d_virial_slow->xz), vir_slow.xz);
      atomicAdd(&(d_virial_slow->yx), vir_slow.yx);
      atomicAdd(&(d_virial_slow->yy), vir_slow.yy);
      atomicAdd(&(d_virial_slow->yz), vir_slow.yz);
      atomicAdd(&(d_virial_slow->zx), vir_slow.zx);
      atomicAdd(&(d_virial_slow->zy), vir_slow.zy);
      atomicAdd(&(d_virial_slow->zz), vir_slow.zz);
      atomicAdd(&(d_extForce_slow->x), f_sum_slow.x);
      atomicAdd(&(d_extForce_slow->y), f_sum_slow.y);
      atomicAdd(&(d_extForce_slow->z), f_sum_slow.z);
    }
  }
}
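/* Copies the current nonbonded and slow force arrays into the corresponding
 * "saved" buffers, one thread per atom; presumably the saved copies are
 * reused on later timesteps (e.g., across a multiple time-stepping cycle)
 * instead of being recomputed. Which classes are copied depends on
 * maxForceNumber. */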
__global__ void copyForcesToDeviceKernel(
  const int numAtoms,
  const double* d_f_nbond_x,
  const double* d_f_nbond_y,
  const double* d_f_nbond_z,
  const double* d_f_slow_x,
  const double* d_f_slow_y,
  const double* d_f_slow_z,
  double* d_f_saved_nbond_x,
  double* d_f_saved_nbond_y,
  double* d_f_saved_nbond_z,
  double* d_f_saved_slow_x,
  double* d_f_saved_slow_y,
  double* d_f_saved_slow_z,
  const int maxForceNumber)
{
  const int i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i < numAtoms) {
    if (maxForceNumber >= 2) {
      d_f_saved_slow_x[i] = d_f_slow_x[i];
      d_f_saved_slow_y[i] = d_f_slow_y[i];
      d_f_saved_slow_z[i] = d_f_slow_z[i];
    }
    if (maxForceNumber >= 1) {
      d_f_saved_nbond_x[i] = d_f_nbond_x[i];
      d_f_saved_nbond_y[i] = d_f_nbond_y[i];
      d_f_saved_nbond_z[i] = d_f_nbond_z[i];
    }
  }
}
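/* Host wrapper for the fixed-atom virial correction. doNbond/doSlow are
 * derived from maxForceNumber (1 enables the nonbonded class, 2 additionally
 * enables the slow, long-range class), matching the force classes evaluated
 * on this step. */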
void SequencerCUDAKernel::calcFixVirial(
  const int maxForceNumber,
  const int numAtoms,
  const int* d_atomFixed,
  const double* d_fixedPosition_x,
  const double* d_fixedPosition_y,
  const double* d_fixedPosition_z,
  const double* d_f_normal_x,
  const double* d_f_normal_y,
  const double* d_f_normal_z,
  const double* d_f_nbond_x,
  const double* d_f_nbond_y,
  const double* d_f_nbond_z,
  const double* d_f_slow_x,
  const double* d_f_slow_y,
  const double* d_f_slow_z,
  cudaTensor* d_virial_normal,
  cudaTensor* d_virial_nbond,
  cudaTensor* d_virial_slow,
  double3* d_extForce_normal,
  double3* d_extForce_nbond,
  double3* d_extForce_slow,
  cudaStream_t stream)
{
  const int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
#define CALL(doNbond, doSlow) \
  calcFixVirialKernel<doNbond, doSlow><<<grid, ATOM_BLOCKS, 0, stream>>>( \
    numAtoms, d_atomFixed, \
    d_fixedPosition_x, d_fixedPosition_y, d_fixedPosition_z, \
    d_f_normal_x, d_f_normal_y, d_f_normal_z, \
    d_f_nbond_x, d_f_nbond_y, d_f_nbond_z, \
    d_f_slow_x, d_f_slow_y, d_f_slow_z, \
    d_virial_normal, d_virial_nbond, d_virial_slow, \
    d_extForce_normal, d_extForce_nbond, d_extForce_slow \
  )
  switch (maxForceNumber) {
    case 0: CALL(false, false); break;
    case 1: CALL(true, false); break;
    case 2: CALL(true, true); break;
    default: NAMD_bug("No kernel is called in SequencerCUDAKernel::calcFixVirial!\n");
  }
#undef CALL
}
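/* Thin launcher for copyForcesToDeviceKernel; the per-atom copies of the
 * nonbonded and slow force arrays into their "saved" counterparts are gated
 * on maxForceNumber inside the kernel. */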
void SequencerCUDAKernel::copyForcesToDevice(
  const int numAtoms,
  const double* d_f_nbond_x,
  const double* d_f_nbond_y,
  const double* d_f_nbond_z,
  const double* d_f_slow_x,
  const double* d_f_slow_y,
  const double* d_f_slow_z,
  double* d_f_saved_nbond_x,
  double* d_f_saved_nbond_y,
  double* d_f_saved_nbond_z,
  double* d_f_saved_slow_x,
  double* d_f_saved_slow_y,
  double* d_f_saved_slow_z,
  const int maxForceNumber,
  cudaStream_t stream)
{
  const int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
  copyForcesToDeviceKernel<<<grid, ATOM_BLOCKS, 0, stream>>>(
    numAtoms, d_f_nbond_x, d_f_nbond_y, d_f_nbond_z,
    d_f_slow_x, d_f_slow_y, d_f_slow_z,
    d_f_saved_nbond_x, d_f_saved_nbond_y, d_f_saved_nbond_z,
    d_f_saved_slow_x, d_f_saved_slow_y, d_f_saved_slow_z,
    maxForceNumber);
}

#endif // NODEGROUP_FORCE_REGISTER