4 #if __CUDACC_VER_MAJOR__ >= 11
7 #include <namd_cub/cub.cuh>
12 #ifdef NAMD_HIP //NAMD_HIP
13 #include <hip/hip_runtime.h>
14 #include <hipcub/hipcub.hpp>
18 #include "HipDefines.h"
19 #include "SequencerCUDAKernel.h"
20 #include "MShakeKernel.h"
22 #ifdef NODEGROUP_FORCE_REGISTER
24 #define OVERALLOC 2.0f
26 void NAMD_die(const char *);
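// Pairlist tuning parameters kept in constant memory. They are copied in once
// from the host in SequencerCUDAKernel::PairListMarginCheck (cudaMemcpyToSymbol)
// and read by pairListMarginCheckKernel further below.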
29 __constant__ double c_pairlistTrigger;
30 __constant__ double c_pairlistGrow;
31 __constant__ double c_pairlistShrink;
33 __device__ void reset_atomic_counter(unsigned int *counter) {
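// accumulateForceToSOAKernelMGPU: one thread block per patch. Gathers the strided
// bonded forces, the sorted nonbonded forces and (for T_MAX_FORCE_NUMBER == 2) the
// slow contributions from the bonded/nonbonded kernels, optionally adds the global
// forces, and writes the sums into this device's SOA force buffers. The last block
// to finish then flags the peer devices through queueCounters/deviceQueues.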
39 template <int T_MAX_FORCE_NUMBER, int T_DOGLOBAL, int T_DOCUDAGLOBAL>
40 __global__ void accumulateForceToSOAKernelMGPU(
42 CudaLocalRecord* localRecords,
43 const double * __restrict f_bond,
44 const double * __restrict f_bond_nbond,
45 const double * __restrict f_bond_slow,
47 const float4 * __restrict f_nbond,
48 const float4 * __restrict f_nbond_slow,
49 const CudaForce* __restrict f_slow,
50 double * __restrict d_f_global_x,
51 double * __restrict d_f_global_y,
52 double * __restrict d_f_global_z,
53 double * __restrict d_f_normal_x,
54 double * __restrict d_f_normal_y,
55 double * __restrict d_f_normal_z,
56 double * __restrict d_f_nbond_x,
57 double * __restrict d_f_nbond_y,
58 double * __restrict d_f_nbond_z,
59 double * __restrict d_f_slow_x,
60 double * __restrict d_f_slow_y,
61 double * __restrict d_f_slow_z,
62 const int * __restrict patchUnsortOrder,
63 const Lattice lattice,
64 unsigned int** __restrict deviceQueues,
65 unsigned int* __restrict queueCounters,
66 unsigned int* __restrict tbcatomic
68 __shared__ CudaLocalRecord s_record;
69 using AccessType = int32_t;
70 AccessType* s_record_buffer = (AccessType*) &s_record;
72 for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
73 // Read in the CudaLocalRecord using multiple threads. This should
75 for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
76 s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
80 const int numAtoms = s_record.numAtoms;
81 const int offset = s_record.bufferOffset;
82 const int offsetNB = s_record.bufferOffsetNBPad;
84 double f_slow_x, f_slow_y, f_slow_z;
85 double f_nbond_x, f_nbond_y, f_nbond_z;
86 double f_normal_x, f_normal_y, f_normal_z;
88 for (int i = threadIdx.x; i < numAtoms; i += blockDim.x) {
89 if (T_MAX_FORCE_NUMBER >= 1) {
90 int unorder = patchUnsortOrder[offset + i];
91 // gather from sorted nonbonded force array
92 float4 fnbond = f_nbond[offsetNB + unorder];
93 // set (medium) nonbonded force accumulators
94 f_nbond_x = (double)fnbond.x;
95 f_nbond_y = (double)fnbond.y;
96 f_nbond_z = (double)fnbond.z;
97 if (T_MAX_FORCE_NUMBER == 2) {
98 // gather from sorted nonbonded slow force array
99 float4 fnbslow = f_nbond_slow[offsetNB + unorder];
100 // accumulate slow force contributions from nonbonded calculation
101 f_slow_x = (double)fnbslow.x;
102 f_slow_y = (double)fnbslow.y;
103 f_slow_z = (double)fnbslow.z;
106 // gather from strided bond force array
107 // set (fast) normal force accumulators
108 if(T_DOGLOBAL || T_DOCUDAGLOBAL) {
109 // Global forces are stored in d_f_global; add them to the bonded force
110 f_normal_x = d_f_global_x[offset + i] + f_bond[offset + i];
111 f_normal_y = d_f_global_y[offset + i] + f_bond[offset + i + forceStride];
112 f_normal_z = d_f_global_z[offset + i] + f_bond[offset + i + 2*forceStride];
113 if (T_DOCUDAGLOBAL) {
114 d_f_global_x[offset + i] = 0;
115 d_f_global_y[offset + i] = 0;
116 d_f_global_z[offset + i] = 0;
120 f_normal_x = f_bond[offset + i];
121 f_normal_y = f_bond[offset + i + forceStride];
122 f_normal_z = f_bond[offset + i + 2*forceStride];
124 if (T_MAX_FORCE_NUMBER >= 1) {
125 // gather from strided bond nonbonded force array
126 // accumulate (medium) nonbonded force accumulators
127 f_nbond_x += f_bond_nbond[offset + i];
128 f_nbond_y += f_bond_nbond[offset + i + forceStride];
129 f_nbond_z += f_bond_nbond[offset + i + 2*forceStride];
130 if (T_MAX_FORCE_NUMBER == 2) {
131 // gather from strided bond slow force array
132 // accumulate slow force accumulators
133 f_slow_x += f_bond_slow[offset + i];
134 f_slow_y += f_bond_slow[offset + i + forceStride];
135 f_slow_z += f_bond_slow[offset + i + 2*forceStride];
138 // set normal, nonbonded, and slow SOA force buffers
139 d_f_normal_x[offset + i] = f_normal_x;
140 d_f_normal_y[offset + i] = f_normal_y;
141 d_f_normal_z[offset + i] = f_normal_z;
142 if (T_MAX_FORCE_NUMBER >= 1) {
143 d_f_nbond_x[offset + i] = f_nbond_x;
144 d_f_nbond_y[offset + i] = f_nbond_y;
145 d_f_nbond_z[offset + i] = f_nbond_z;
146 if (T_MAX_FORCE_NUMBER == 2){
147 d_f_slow_x[offset + i] = f_slow_x;
148 d_f_slow_y[offset + i] = f_slow_y;
149 d_f_slow_z[offset + i] = f_slow_z;
157 // DMC: this block was previously commented out in accumulateForceToSOAKernelMGPU
160 if(threadIdx.x == 0){
161 // Need another value here
162 unsigned int value = atomicInc(&tbcatomic[4], gridDim.x);
163 isLastBlockDone = (value == (gridDim.x - 1));
167 // thread0 flags everything
168 if(threadIdx.x == 0){
169 for(int i = 0; i < nDev; i++ ){
170 if (i == devID) continue;
171 unsigned int value = atomicInc(& (queueCounters[i]), nDev);
172 printf(" Device[%d] queue[%d] pos[%d]\n", devID, i, value);
174 deviceQueues[i][value] = devID; // flags in other device's queue that we're ready for processing
175 __threadfence_system();
177 // reset the tbcatomic counter back to zero
179 printf(" GPU[%d] finished accumulating SOA buffers\n", devID);
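// accumulateForceToSOAKernel: single-GPU variant of the kernel above. The
// accumulation is identical, except that for MAX_FORCE_NUMBER == 2 the
// lattice-scaled slow (PME) force in f_slow is folded in directly, since there
// is no later peer-merge step.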
184 template <int MAX_FORCE_NUMBER, int T_DOGLOBAL, int T_DOCUDAGLOBAL>
185 __global__ void accumulateForceToSOAKernel(
186 const int numPatches,
187 CudaLocalRecord* localRecords,
188 const double * __restrict f_bond,
189 const double * __restrict f_bond_nbond,
190 const double * __restrict f_bond_slow,
192 const float4 * __restrict f_nbond,
193 const float4 * __restrict f_nbond_slow,
194 const CudaForce* __restrict f_slow,
195 double * __restrict d_f_global_x,
196 double * __restrict d_f_global_y,
197 double * __restrict d_f_global_z,
198 double * __restrict d_f_normal_x,
199 double * __restrict d_f_normal_y,
200 double * __restrict d_f_normal_z,
201 double * __restrict d_f_nbond_x,
202 double * __restrict d_f_nbond_y,
203 double * __restrict d_f_nbond_z,
204 double * __restrict d_f_slow_x,
205 double * __restrict d_f_slow_y,
206 double * __restrict d_f_slow_z,
207 const int * __restrict patchUnsortOrder,
208 const Lattice lattice
210 __shared__ CudaLocalRecord s_record;
211 using AccessType = int32_t;
212 AccessType* s_record_buffer = (AccessType*) &s_record;
214 for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
215 // Read in the CudaLocalRecord using multiple threads. This should
217 for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
218 s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
222 const int numAtoms = s_record.numAtoms;
223 const int offset = s_record.bufferOffset;
224 const int offsetNB = s_record.bufferOffsetNBPad;
226 double f_slow_x, f_slow_y, f_slow_z;
227 double f_nbond_x, f_nbond_y, f_nbond_z;
228 double f_normal_x, f_normal_y, f_normal_z;
230 for (int i = threadIdx.x; i < numAtoms; i += blockDim.x) {
231 if (MAX_FORCE_NUMBER == 2) {
232 CudaForce f = f_slow[offset + i];
233 double3 f_scaled = lattice.scale_force(Vector((double)f.x, (double)f.y, (double)f.z));
234 // set slow force accumulators
235 f_slow_x = f_scaled.x;
236 f_slow_y = f_scaled.y;
237 f_slow_z = f_scaled.z;
239 if (MAX_FORCE_NUMBER >= 1) {
240 int unorder = patchUnsortOrder[offset + i];
241 // gather from sorted nonbonded force array
242 float4 fnbond = f_nbond[offsetNB + unorder];
243 // set (medium) nonbonded force accumulators
244 f_nbond_x = (double)fnbond.x;
245 f_nbond_y = (double)fnbond.y;
246 f_nbond_z = (double)fnbond.z;
247 if (MAX_FORCE_NUMBER == 2) {
248 // gather from sorted nonbonded slow force array
249 float4 fnbslow = f_nbond_slow[offsetNB + unorder];
250 // accumulate slow force contributions from nonbonded calculation
251 f_slow_x += (double)fnbslow.x;
252 f_slow_y += (double)fnbslow.y;
253 f_slow_z += (double)fnbslow.z;
256 // gather from strided bond force array
257 // set (fast) normal force accumulators
258 if(T_DOGLOBAL || T_DOCUDAGLOBAL) {
259 // Global forces are stored in d_f_global; add them to the bonded force
260 f_normal_x = d_f_global_x[offset + i] + f_bond[offset + i];
261 f_normal_y = d_f_global_y[offset + i] + f_bond[offset + i + forceStride];
262 f_normal_z = d_f_global_z[offset + i] + f_bond[offset + i + 2*forceStride];
263 if (T_DOCUDAGLOBAL) {
264 d_f_global_x[offset + i] = 0;
265 d_f_global_y[offset + i] = 0;
266 d_f_global_z[offset + i] = 0;
271 f_normal_x = f_bond[offset + i];
272 f_normal_y = f_bond[offset + i + forceStride];
273 f_normal_z = f_bond[offset + i + 2*forceStride];
276 if (MAX_FORCE_NUMBER >= 1) {
277 // gather from strided bond nonbonded force array
278 // accumulate (medium) nonbonded force accumulators
279 f_nbond_x += f_bond_nbond[offset + i];
280 f_nbond_y += f_bond_nbond[offset + i + forceStride];
281 f_nbond_z += f_bond_nbond[offset + i + 2*forceStride];
282 if (MAX_FORCE_NUMBER == 2) {
283 // gather from strided bond slow force array
284 // accumulate slow force accumulators
285 f_slow_x += f_bond_slow[offset + i];
286 f_slow_y += f_bond_slow[offset + i + forceStride];
287 f_slow_z += f_bond_slow[offset + i + 2*forceStride];
290 // set normal, nonbonded, and slow SOA force buffers
291 d_f_normal_x[offset + i] = f_normal_x;
292 d_f_normal_y[offset + i] = f_normal_y;
293 d_f_normal_z[offset + i] = f_normal_z;
294 if (MAX_FORCE_NUMBER >= 1) {
295 d_f_nbond_x[offset + i] = f_nbond_x;
296 d_f_nbond_y[offset + i] = f_nbond_y;
297 d_f_nbond_z[offset + i] = f_nbond_z;
298 if (MAX_FORCE_NUMBER == 2) {
299 d_f_slow_x[offset + i] = f_slow_x;
300 d_f_slow_y[offset + i] = f_slow_y;
301 d_f_slow_z[offset + i] = f_slow_z;
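// accumulatePMEForces: one thread per atom; adds the lattice-scaled PME forces
// on top of the slow-force SOA buffers.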
309 __global__ void accumulatePMEForces(
311 const CudaForce* f_slow,
315 const int* patchOffsets,
318 int tid = threadIdx.x + (blockDim.x * blockIdx.x);
319 double f_slow_x, f_slow_y, f_slow_z;
322 CudaForce f = f_slow[tid];
324 double3 f_scaled = lat.scale_force(Vector((double)f.x, (double)f.y, (double)f.z));
325 f_slow_x = f_scaled.x;
326 f_slow_y = f_scaled.y;
327 f_slow_z = f_scaled.z;
329 d_f_slow_x[tid] += f_slow_x;
330 d_f_slow_y[tid] += f_slow_y;
331 d_f_slow_z[tid] += f_slow_z;
335 void SequencerCUDAKernel::accumulateForceToSOA(
337 const int doCudaGlobal,
338 const int maxForceNumber,
339 const int numPatches,
341 CudaLocalRecord* localRecords,
342 const double* f_bond,
343 const double* f_bond_nbond,
344 const double* f_bond_slow,
346 const float4* f_nbond,
347 const float4* f_nbond_slow,
348 const CudaForce* f_slow,
349 double* d_f_global_x,
350 double* d_f_global_y,
351 double* d_f_global_z,
352 double* d_f_normal_x,
353 double* d_f_normal_y,
354 double* d_f_normal_z,
361 const int* patchUnsortOrder,
362 const Lattice lattice,
363 unsigned int** deviceQueues,
364 unsigned int* queueCounters,
365 unsigned int* tbcatomic,
368 // ASSERT( 0 <= maxForceNumber && maxForceNumber <= 2 );
369 const int options = (maxForceNumber << 2) + (doGlobal << 1) + doCudaGlobal;
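// Example of the encoding: maxForceNumber = 2, doGlobal = 1, doCudaGlobal = 0
// packs to (2<<2) + (1<<1) + 0 = 10, which the switch below dispatches to
// CALL_MGPU(2, 1, 0).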
371 #define CALL_MGPU(T_MAX_FORCE_NUMBER, T_DOGLOBAL, T_DOCUDAGLOBAL) \
372 accumulateForceToSOAKernelMGPU<T_MAX_FORCE_NUMBER, T_DOGLOBAL, T_DOCUDAGLOBAL> \
373 <<<numPatches, PATCH_BLOCKS, 0, stream>>> \
374 ( numPatches, localRecords, \
375 f_bond, f_bond_nbond, f_bond_slow, forceStride, \
376 f_nbond, f_nbond_slow, f_slow, \
377 d_f_global_x, d_f_global_y, d_f_global_z, \
378 d_f_normal_x, d_f_normal_y, d_f_normal_z, \
379 d_f_nbond_x, d_f_nbond_y, d_f_nbond_z, \
380 d_f_slow_x, d_f_slow_y, d_f_slow_z, \
381 patchUnsortOrder, lattice, \
382 deviceQueues, queueCounters, tbcatomic \
385 case ((0<<2) + (0<<1) + (0<<0)): CALL_MGPU(0, 0, 0); break;
386 case ((0<<2) + (0<<1) + (1<<0)): CALL_MGPU(0, 0, 1); break;
387 case ((0<<2) + (1<<1) + (1<<0)): CALL_MGPU(0, 1, 1); break;
388 case ((0<<2) + (1<<1) + (0<<0)): CALL_MGPU(0, 1, 0); break;
389 case ((1<<2) + (0<<1) + (0<<0)): CALL_MGPU(1, 0, 0); break;
390 case ((1<<2) + (0<<1) + (1<<0)): CALL_MGPU(1, 0, 1); break;
391 case ((1<<2) + (1<<1) + (1<<0)): CALL_MGPU(1, 1, 1); break;
392 case ((1<<2) + (1<<1) + (0<<0)): CALL_MGPU(1, 1, 0); break;
393 case ((2<<2) + (0<<1) + (0<<0)): CALL_MGPU(2, 0, 0); break;
394 case ((2<<2) + (0<<1) + (1<<0)): CALL_MGPU(2, 0, 1); break;
395 case ((2<<2) + (1<<1) + (1<<0)): CALL_MGPU(2, 1, 1); break;
396 case ((2<<2) + (1<<1) + (0<<0)): CALL_MGPU(2, 1, 0); break;
398 const std::string error = "Unsupported options in SequencerCUDAKernel::accumulateForceToSOA, no kernel is called. Options are maxForceNumber = " + std::to_string(maxForceNumber) + ", doGlobal = " + std::to_string(doGlobal) + ", doCudaGlobal = " + std::to_string(doCudaGlobal) + "\n";
399 NAMD_bug(error.c_str());
404 #define CALL(T_MAX_FORCE_NUMBER, T_DOGLOBAL, T_DOCUDAGLOBAL) \
405 accumulateForceToSOAKernel<T_MAX_FORCE_NUMBER, T_DOGLOBAL, T_DOCUDAGLOBAL> \
406 <<<numPatches, PATCH_BLOCKS, 0, stream>>> \
407 ( numPatches, localRecords, \
408 f_bond, f_bond_nbond, f_bond_slow, forceStride, \
409 f_nbond, f_nbond_slow, f_slow, \
410 d_f_global_x, d_f_global_y, d_f_global_z, \
411 d_f_normal_x, d_f_normal_y, d_f_normal_z, \
412 d_f_nbond_x, d_f_nbond_y, d_f_nbond_z, \
413 d_f_slow_x, d_f_slow_y, d_f_slow_z, \
414 patchUnsortOrder, lattice \
417 case ((0<<2) + (0<<1) + (0<<0)): CALL(0, 0, 0); break;
418 case ((0<<2) + (0<<1) + (1<<0)): CALL(0, 0, 1); break;
419 case ((0<<2) + (1<<1) + (1<<0)): CALL(0, 1, 1); break;
420 case ((0<<2) + (1<<1) + (0<<0)): CALL(0, 1, 0); break;
421 case ((1<<2) + (0<<1) + (0<<0)): CALL(1, 0, 0); break;
422 case ((1<<2) + (0<<1) + (1<<0)): CALL(1, 0, 1); break;
423 case ((1<<2) + (1<<1) + (1<<0)): CALL(1, 1, 1); break;
424 case ((1<<2) + (1<<1) + (0<<0)): CALL(1, 1, 0); break;
425 case ((2<<2) + (0<<1) + (0<<0)): CALL(2, 0, 0); break;
426 case ((2<<2) + (0<<1) + (1<<0)): CALL(2, 0, 1); break;
427 case ((2<<2) + (1<<1) + (1<<0)): CALL(2, 1, 1); break;
428 case ((2<<2) + (1<<1) + (0<<0)): CALL(2, 1, 0); break;
430 const std::string error = "Unsupported options in SequencerCUDAKernel::accumulateForceToSOA, no kernel is called. Options are maxForceNumber = " + std::to_string(maxForceNumber) + ", doGlobal = " + std::to_string(doGlobal) + ", doCudaGlobal = " + std::to_string(doCudaGlobal) + "\n";
431 NAMD_bug(error.c_str());
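// mergeForcesFromPeersKernel: one block per home patch. Adds the partial
// normal/nbond/slow force sums produced by peer devices into this device's
// buffers, first from the inline peer records embedded in CudaLocalRecord and
// then from the spill-over entries in peerRecords; for T_MAX_FORCE_NUMBER == 2
// the lattice-scaled PME forces are folded in at the end.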
438 template <int T_MAX_FORCE_NUMBER>
439 __global__ void mergeForcesFromPeersKernel(
440 const int numPatches,
441 const int devID, // GPU ID
442 CudaLocalRecord* localRecords,
443 CudaPeerRecord* peerRecords,
445 double** __restrict f_normal_x,
446 double** __restrict f_normal_y,
447 double** __restrict f_normal_z,
448 double** __restrict f_nbond_x,
449 double** __restrict f_nbond_y,
450 double** __restrict f_nbond_z,
451 double** __restrict f_slow_x,
452 double** __restrict f_slow_y,
453 double** __restrict f_slow_z,
454 const CudaForce* __restrict pmeForces
456 __shared__ CudaLocalRecord s_record;
457 using AccessType = int32_t;
458 AccessType* s_record_buffer = (AccessType*) &s_record;
460 for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
461 // Read in the CudaLocalRecord using multiple threads. This should
463 for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
464 s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
468 const int numAtoms = s_record.numAtoms;
469 const int dstOffset = s_record.bufferOffset;
470 const int numPeerRecords = s_record.numPeerRecord;
471 const int peerRecordStartIndexs = s_record.peerRecordStartIndex;
473 const int inlinePeers = min(numPeerRecords, CudaLocalRecord::num_inline_peer);
475 for (int peer = 0; peer < inlinePeers; peer++) {
476 const int srcDevice = s_record.inline_peers[peer].deviceIndex;
477 const int srcOffset = s_record.inline_peers[peer].bufferOffset;
479 for (int i = threadIdx.x; i < numAtoms; i += blockDim.x){
480 double f_x = f_normal_x[srcDevice][srcOffset + i];
481 double f_y = f_normal_y[srcDevice][srcOffset + i];
482 double f_z = f_normal_z[srcDevice][srcOffset + i];
484 f_normal_x[devID][dstOffset + i] += f_x;
485 f_normal_y[devID][dstOffset + i] += f_y;
486 f_normal_z[devID][dstOffset + i] += f_z;
488 if(T_MAX_FORCE_NUMBER >= 1){
489 // We don't need the nonbonded offset here since
490 // this isn't the actual nonbonded buffer
491 f_x = f_nbond_x[srcDevice][srcOffset + i];
492 f_y = f_nbond_y[srcDevice][srcOffset + i];
493 f_z = f_nbond_z[srcDevice][srcOffset + i];
495 f_nbond_x[devID][dstOffset + i] += f_x;
496 f_nbond_y[devID][dstOffset + i] += f_y;
497 f_nbond_z[devID][dstOffset + i] += f_z;
499 if(T_MAX_FORCE_NUMBER == 2){
500 f_x = f_slow_x[srcDevice][srcOffset + i];
501 f_y = f_slow_y[srcDevice][srcOffset + i];
502 f_z = f_slow_z[srcDevice][srcOffset + i];
504 f_slow_x[devID][dstOffset + i] += f_x;
505 f_slow_y[devID][dstOffset + i] += f_y;
506 f_slow_z[devID][dstOffset + i] += f_z;
512 for (int peer = inlinePeers; peer < numPeerRecords; peer++) {
513 const int srcDevice = peerRecords[peerRecordStartIndexs + peer].deviceIndex;
514 const int srcOffset = peerRecords[peerRecordStartIndexs + peer].bufferOffset;
516 for (int i = threadIdx.x; i < numAtoms; i += blockDim.x){
517 double f_x = f_normal_x[srcDevice][srcOffset + i];
518 double f_y = f_normal_y[srcDevice][srcOffset + i];
519 double f_z = f_normal_z[srcDevice][srcOffset + i];
521 f_normal_x[devID][dstOffset + i] += f_x;
522 f_normal_y[devID][dstOffset + i] += f_y;
523 f_normal_z[devID][dstOffset + i] += f_z;
525 if(T_MAX_FORCE_NUMBER >= 1){
526 // We don't need the nonbonded offset here since
527 // this isn't the actual nonbonded buffer
528 f_x = f_nbond_x[srcDevice][srcOffset + i];
529 f_y = f_nbond_y[srcDevice][srcOffset + i];
530 f_z = f_nbond_z[srcDevice][srcOffset + i];
532 f_nbond_x[devID][dstOffset + i] += f_x;
533 f_nbond_y[devID][dstOffset + i] += f_y;
534 f_nbond_z[devID][dstOffset + i] += f_z;
536 if(T_MAX_FORCE_NUMBER == 2){
537 f_x = f_slow_x[srcDevice][srcOffset + i];
538 f_y = f_slow_y[srcDevice][srcOffset + i];
539 f_z = f_slow_z[srcDevice][srcOffset + i];
541 f_slow_x[devID][dstOffset + i] += f_x;
542 f_slow_y[devID][dstOffset + i] += f_y;
543 f_slow_z[devID][dstOffset + i] += f_z;
549 // Merge PME forces here instead of in the force-fetching kernel
550 if(T_MAX_FORCE_NUMBER == 2){
551 for(int i = threadIdx.x; i < numAtoms; i += blockDim.x){
552 CudaForce f = pmeForces[dstOffset + i];
554 double3 f_scaled = lat.scale_force(
555 Vector((double)f.x, (double)f.y, (double)f.z));
556 f_slow_x[devID][dstOffset + i] += f_scaled.x;
557 f_slow_y[devID][dstOffset + i] += f_scaled.y;
558 f_slow_z[devID][dstOffset + i] += f_scaled.z;
565 void SequencerCUDAKernel::mergeForcesFromPeers(
567 const int maxForceNumber,
569 const int numPatchesHomeAndProxy,
570 const int numPatchesHome,
580 const CudaForce* pmeForces,
581 CudaLocalRecord* localRecords,
582 CudaPeerRecord* peerRecords,
583 std::vector<int>& atomCounts,
586 // atom-based kernel here
587 //int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
588 int grid = (numPatchesHome);
589 int blocks = PATCH_BLOCKS;
590 // launch a single-threaded grid for debugging
593 for (int i = 0; i < devID; i++) {
594 offset += atomCounts[i];
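// pmeForces appears to hold the PME forces of all devices back to back, so
// skipping the atoms owned by lower-numbered devices lands the pointer on this
// device's slice.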
597 switch (maxForceNumber) {
599 mergeForcesFromPeersKernel<0><<<grid, blocks, 0, stream>>>(
600 numPatchesHome, devID, localRecords, peerRecords,
602 f_normal_x, f_normal_y, f_normal_z,
603 f_nbond_x, f_nbond_y, f_nbond_z,
604 f_slow_x, f_slow_y, f_slow_z, pmeForces + offset
608 mergeForcesFromPeersKernel<1><<<grid, blocks, 0, stream>>>(
609 numPatchesHome, devID, localRecords, peerRecords,
611 f_normal_x, f_normal_y, f_normal_z,
612 f_nbond_x, f_nbond_y, f_nbond_z,
613 f_slow_x, f_slow_y, f_slow_z, pmeForces + offset
617 mergeForcesFromPeersKernel<2><<<grid, blocks, 0, stream>>>(
618 numPatchesHome, devID, localRecords, peerRecords,
620 f_normal_x, f_normal_y, f_normal_z,
621 f_nbond_x, f_nbond_y, f_nbond_z,
622 f_slow_x, f_slow_y, f_slow_z, pmeForces + offset
628 // JM NOTE: This is a fused version of accumulateForceToSOA + addForceToMomentum.
629 // addForceToMomentum barely has any math and is memory-bandwidth bound, so
630 // fusing these kernels allows us to keep the forces in registers and
631 // reuse their values for the velocity update
632 template <int MAX_FORCE_NUMBER, int T_DOGLOBAL, int T_DOCUDAGLOBAL, int DO_FIXED>
633 __global__ void accumulateForceKick(
634 const int numPatches,
635 CudaLocalRecord* localRecords,
636 const double * __restrict f_bond,
637 const double * __restrict f_bond_nbond,
638 const double * __restrict f_bond_slow,
640 const float4 * __restrict f_nbond,
641 const float4 * __restrict f_nbond_slow,
642 const CudaForce* __restrict f_slow,
643 double * __restrict d_f_global_x,
644 double * __restrict d_f_global_y,
645 double * __restrict d_f_global_z,
646 double * __restrict d_f_normal_x,
647 double * __restrict d_f_normal_y,
648 double * __restrict d_f_normal_z,
649 double * __restrict d_f_nbond_x,
650 double * __restrict d_f_nbond_y,
651 double * __restrict d_f_nbond_z,
652 double * __restrict d_f_slow_x,
653 double * __restrict d_f_slow_y,
654 double * __restrict d_f_slow_z,
655 const int * __restrict patchUnsortOrder,
656 const Lattice lattice,
660 const double * __restrict recipMass,
661 const int* __restrict atomFixed,
662 const double dt_normal,
663 const double dt_nbond,
664 const double dt_slow,
667 __shared__ CudaLocalRecord s_record;
668 using AccessType = int32_t;
669 AccessType* s_record_buffer = (AccessType*) &s_record;
671 for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
672 // Read in the CudaLocalRecord using multiple threads. This should
674 for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
675 s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
679 const int numAtoms = s_record.numAtoms;
680 const int offset = s_record.bufferOffset;
681 const int offsetNB = s_record.bufferOffsetNBPad;
683 double f_slow_x, f_slow_y, f_slow_z;
684 double f_nbond_x, f_nbond_y, f_nbond_z;
685 double f_normal_x, f_normal_y, f_normal_z;
687 for (int i = threadIdx.x; i < numAtoms; i += blockDim.x) {
698 if (MAX_FORCE_NUMBER == 2) {
699 CudaForce f = f_slow[offset + i];
701 double3 f_scaled = lattice.scale_force(
702 Vector((double)f.x, (double)f.y, (double)f.z));
704 // set slow force accumulators
705 f_slow_x = f_scaled.x;
706 f_slow_y = f_scaled.y;
707 f_slow_z = f_scaled.z;
709 if (MAX_FORCE_NUMBER >= 1) {
710 int unorder = patchUnsortOrder[offset + i];
711 // gather from sorted nonbonded force array
712 float4 fnbond = f_nbond[offsetNB + unorder];
713 // set (medium) nonbonded force accumulators
714 f_nbond_x = (double)fnbond.x;
715 f_nbond_y = (double)fnbond.y;
716 f_nbond_z = (double)fnbond.z;
717 if (MAX_FORCE_NUMBER == 2) {
718 // gather from sorted nonbonded slow force array
719 float4 fnbslow = f_nbond_slow[offsetNB + unorder];
720 // accumulate slow force contributions from nonbonded calculation
721 f_slow_x += (double)fnbslow.x;
722 f_slow_y += (double)fnbslow.y;
723 f_slow_z += (double)fnbslow.z;
726 // gather from strided bond force array
727 // set (fast) normal force accumulators
728 if(T_DOGLOBAL || T_DOCUDAGLOBAL) {
729 // Global forces are stored in d_f_global; add them to the bonded force
730 f_normal_x = d_f_global_x[offset + i] + f_bond[offset + i];
731 f_normal_y = d_f_global_y[offset + i] + f_bond[offset + i + forceStride];
732 f_normal_z = d_f_global_z[offset + i] + f_bond[offset + i + 2*forceStride];
733 if (T_DOCUDAGLOBAL) {
734 d_f_global_x[offset + i] = 0;
735 d_f_global_y[offset + i] = 0;
736 d_f_global_z[offset + i] = 0;
742 f_normal_x = f_bond[offset + i];
743 f_normal_y = f_bond[offset + i + forceStride];
744 f_normal_z = f_bond[offset + i + 2*forceStride];
747 if (MAX_FORCE_NUMBER >= 1) {
748 // gather from strided bond nonbonded force array
749 // accumulate (medium) nonbonded force accumulators
750 f_nbond_x += f_bond_nbond[offset + i];
751 f_nbond_y += f_bond_nbond[offset + i + forceStride];
752 f_nbond_z += f_bond_nbond[offset + i + 2*forceStride];
753 if (MAX_FORCE_NUMBER == 2) {
754 // gather from strided bond slow force array
755 // accumulate slow force accumulators
756 f_slow_x += f_bond_slow[offset + i];
757 f_slow_y += f_bond_slow[offset + i + forceStride];
758 f_slow_z += f_bond_slow[offset + i + 2*forceStride];
761 // set normal, nonbonded, and slow SOA force buffers
762 d_f_normal_x[offset + i] = f_normal_x;
763 d_f_normal_y[offset + i] = f_normal_y;
764 d_f_normal_z[offset + i] = f_normal_z;
765 if (MAX_FORCE_NUMBER >= 1) {
766 d_f_nbond_x[offset + i] = f_nbond_x;
767 d_f_nbond_y[offset + i] = f_nbond_y;
768 d_f_nbond_z[offset + i] = f_nbond_z;
769 if (MAX_FORCE_NUMBER == 2) {
770 d_f_slow_x[offset + i] = f_slow_x;
771 d_f_slow_y[offset + i] = f_slow_y;
772 d_f_slow_z[offset + i] = f_slow_z;
776 /* Velocity updates */
777 double rp = recipMass[offset + i];
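// Velocity update (the addForceToMomentum part of the fused kernel):
// dv = scaling * (f_slow*dt_slow + f_nbond*dt_nbond + f_normal*dt_normal) / m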
779 vx = ((f_slow_x * dt_slow) + (f_nbond_x * dt_nbond) + (f_normal_x * dt_normal)) * scaling * rp;
780 vy = ((f_slow_y * dt_slow) + (f_nbond_y * dt_nbond) + (f_normal_y * dt_normal)) * scaling * rp;
781 vz = ((f_slow_z * dt_slow) + (f_nbond_z * dt_nbond) + (f_normal_z * dt_normal)) * scaling * rp;
783 d_vel_x[offset + i] += vx;
784 d_vel_y[offset + i] += vy;
785 d_vel_z[offset + i] += vz;
787 if (!atomFixed[offset + i]) {
788 d_vel_x[offset + i] += vx;
789 d_vel_y[offset + i] += vy;
790 d_vel_z[offset + i] += vz;
792 d_vel_x[offset + i] = 0;
793 d_vel_y[offset + i] = 0;
794 d_vel_z[offset + i] = 0;
804 template <int MAX_FORCE_NUMBER>
805 __global__ void accumulateForceKick(
806 const double * __restrict f_bond,
807 const double * __restrict f_bond_nbond,
808 const double * __restrict f_bond_slow,
810 const PatchRecord * __restrict bond_pr,
811 const float4 * __restrict f_nbond,
812 const float4 * __restrict f_nbond_slow,
813 const CudaPatchRecord * __restrict nbond_pr,
814 const CudaForce * __restrict f_slow,
815 double *d_f_normal_x,
816 double *d_f_normal_y,
817 double *d_f_normal_z,
827 const double * __restrict recipMass,
828 const double dt_normal,
829 const double dt_nbond,
830 const double dt_slow,
831 const double scaling,
832 const int * __restrict patchIDs,
833 const int * __restrict patchOffsets,
834 const int * __restrict patchUnsortOrder,
835 const CudaLattice lat)
837 int tid = threadIdx.x;
838 int stride = blockDim.x;
840 __shared__ int sh_patchID;
841 __shared__ int sh_patchOffset;
842 // number of atoms per patch should be same no matter
843 // which data structure we access it through
844 __shared__ int sh_natoms;
845 __shared__ int sh_bondForceOffset;
846 __shared__ int sh_nbondForceOffset;
850 // number of atoms per patch should be same no matter
851 // which data structure we access it through
854 int nbondForceOffset;
856 if(threadIdx.x == 0){
857 sh_patchID = patchIDs[blockIdx.x];
858 sh_patchOffset = patchOffsets[blockIdx.x];
859 // number of atoms per patch should be same no matter
860 // which data structure we access it through
861 sh_natoms = bond_pr[sh_patchID].numAtoms;
862 sh_bondForceOffset = bond_pr[sh_patchID].atomStart;
863 if(MAX_FORCE_NUMBER >= 1){
864 sh_nbondForceOffset = nbond_pr[sh_patchID].atomStart;
870 // pull stuff to registers
871 // patchID = sh_patchID;
872 patchOffset = sh_patchOffset;
874 bondForceOffset = sh_bondForceOffset;
875 nbondForceOffset = (MAX_FORCE_NUMBER >= 1) ? sh_nbondForceOffset : 0;
877 double r1x, r1y, r1z, r2x, r2y, r2z, r3x, r3y, r3z;
879 if (MAX_FORCE_NUMBER == 2) {
891 double f_slow_x, f_slow_y, f_slow_z;
892 double f_nbond_x, f_nbond_y, f_nbond_z;
893 double f_normal_x, f_normal_y, f_normal_z;
896 for (int i = tid; i < natoms; i += stride) {
908 // Global forces are already stored in d_f_normal; just add the bonded force
909 f_normal_x = d_f_normal_x[patchOffset + i];
910 f_normal_y = d_f_normal_y[patchOffset + i];
911 f_normal_z = d_f_normal_z[patchOffset + i];
914 if (MAX_FORCE_NUMBER == 2) {
915 CudaForce f = f_slow[patchOffset + i];
916 double fx = (double) f.x;
917 double fy = (double) f.y;
918 double fz = (double) f.z;
919 // set slow force accumulators
920 f_slow_x = (fx*r1x + fy*r2x + fz*r3x);
921 f_slow_y = (fx*r1y + fy*r2y + fz*r3y);
922 f_slow_z = (fx*r1z + fy*r2z + fz*r3z);
924 if (MAX_FORCE_NUMBER >= 1) {
925 int unorder = patchUnsortOrder[patchOffset + i];
926 // gather from sorted nonbonded force array
927 float4 fnbond = f_nbond[nbondForceOffset + unorder];
928 // set (medium) nonbonded force accumulators
929 f_nbond_x = (double)fnbond.x;
930 f_nbond_y = (double)fnbond.y;
931 f_nbond_z = (double)fnbond.z;
932 if (MAX_FORCE_NUMBER == 2) {
933 // gather from sorted nonbonded slow force array
934 float4 fnbslow = f_nbond_slow[nbondForceOffset + unorder];
935 // accumulate slow force contributions from nonbonded calculation
936 f_slow_x += (double)fnbslow.x;
937 f_slow_y += (double)fnbslow.y;
938 f_slow_z += (double)fnbslow.z;
941 // gather from strided bond force array
942 // set (fast) normal force accumulators
943 f_normal_x += f_bond[bondForceOffset + i];
944 f_normal_y += f_bond[bondForceOffset + i + forceStride];
945 f_normal_z += f_bond[bondForceOffset + i + 2*forceStride];
946 if (MAX_FORCE_NUMBER >= 1) {
947 // gather from strided bond nonbonded force array
948 // accumulate (medium) nonbonded force accumulators
949 f_nbond_x += f_bond_nbond[bondForceOffset + i];
950 f_nbond_y += f_bond_nbond[bondForceOffset + i + forceStride];
951 f_nbond_z += f_bond_nbond[bondForceOffset + i + 2*forceStride];
952 if (MAX_FORCE_NUMBER == 2) {
953 // gather from strided bond slow force array
954 // accumulate slow force accumulators
955 f_slow_x += f_bond_slow[bondForceOffset + i];
956 f_slow_y += f_bond_slow[bondForceOffset + i + forceStride];
957 f_slow_z += f_bond_slow[bondForceOffset + i + 2*forceStride];
960 // set normal, nonbonded, and slow SOA force buffers
961 d_f_normal_x[patchOffset + i] = f_normal_x;
962 d_f_normal_y[patchOffset + i] = f_normal_y;
963 d_f_normal_z[patchOffset + i] = f_normal_z;
965 if (MAX_FORCE_NUMBER >= 1) {
966 d_f_nbond_x[patchOffset + i] = f_nbond_x;
967 d_f_nbond_y[patchOffset + i] = f_nbond_y;
968 d_f_nbond_z[patchOffset + i] = f_nbond_z;
969 if (MAX_FORCE_NUMBER == 2) {
970 d_f_slow_x[patchOffset + i] = f_slow_x;
971 d_f_slow_y[patchOffset + i] = f_slow_y;
972 d_f_slow_z[patchOffset + i] = f_slow_z;
976 /* Velocity updates */
977 double rp = recipMass[patchOffset + i];
978 vx = ((f_slow_x * dt_slow) + (f_nbond_x * dt_nbond) + (f_normal_x * dt_normal)) * scaling * rp;
979 vy = ((f_slow_y * dt_slow) + (f_nbond_y * dt_nbond) + (f_normal_y * dt_normal)) * scaling * rp;
980 vz = ((f_slow_z * dt_slow) + (f_nbond_z * dt_nbond) + (f_normal_z * dt_normal)) * scaling * rp;
982 d_vel_x[patchOffset + i] += vx;
983 d_vel_y[patchOffset + i] += vy;
984 d_vel_z[patchOffset + i] += vz;
990 void SequencerCUDAKernel::accumulate_force_kick(
991 const bool doFixedAtoms,
993 const int doCudaGlobal,
994 const int maxForceNumber,
995 const int numPatches,
996 CudaLocalRecord* localRecords,
997 const double* f_bond,
998 const double* f_bond_nbond,
999 const double* f_bond_slow,
1001 const float4* f_nbond,
1002 const float4* f_nbond_slow,
1003 const CudaForce* f_slow,
1004 double* d_f_global_x,
1005 double* d_f_global_y,
1006 double* d_f_global_z,
1007 double* d_f_normal_x,
1008 double* d_f_normal_y,
1009 double* d_f_normal_z,
1010 double* d_f_nbond_x,
1011 double* d_f_nbond_y,
1012 double* d_f_nbond_z,
1019 const double* recipMass,
1020 const int* d_atomFixed,
1021 const double dt_normal,
1022 const double dt_nbond,
1023 const double dt_slow,
1024 const double scaling,
1025 const int* patchUnsortOrder,
1026 const Lattice lattice,
1029 // ASSERT( 0 <= maxForceNumber && maxForceNumber <= 2 );
1030 const int options = (maxForceNumber << 3) + (doGlobal << 2) + (doCudaGlobal << 1) + int(doFixedAtoms);
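// Example of the encoding: maxForceNumber = 2, doGlobal = 1, doCudaGlobal = 0,
// doFixedAtoms = 1 packs to (2<<3) + (1<<2) + (0<<1) + 1 = 21, i.e. case 21:
// CALL(2, 1, 0, 1) below.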
1031 #define CALL(MAX_FORCE_NUMBER, DOGLOBAL, DOCUDAGLOBAL, DOFIXED) \
1032 accumulateForceKick<MAX_FORCE_NUMBER, DOGLOBAL, DOCUDAGLOBAL, DOFIXED> \
1033 <<<numPatches, PATCH_BLOCKS, 0, stream>>> \
1034 ( numPatches, localRecords, \
1035 f_bond, f_bond_nbond, f_bond_slow, forceStride, \
1036 f_nbond, f_nbond_slow, f_slow, \
1037 d_f_global_x, d_f_global_y, d_f_global_z, \
1038 d_f_normal_x, d_f_normal_y, d_f_normal_z, \
1039 d_f_nbond_x, d_f_nbond_y, d_f_nbond_z, \
1040 d_f_slow_x, d_f_slow_y, d_f_slow_z, \
1041 patchUnsortOrder, lattice, \
1042 d_vel_x, d_vel_y, d_vel_z, recipMass, \
1043 d_atomFixed, dt_normal, dt_nbond, dt_slow, scaling)
1045 * Generated by the following python script:
1046 * #!/usr/bin/env python3
1047 * for maxForceNumber in range(3):
1048 * for doGlobal in range(2):
1049 * for doCudaGlobal in range(2):
1050 * for doFixedAtoms in range(2):
1051 * option = (maxForceNumber << 3) + (doGlobal << 2) + (doCudaGlobal << 1) + int(doFixedAtoms)
1052 * print(f' case {option}: CALL ({maxForceNumber}, {doGlobal}, {doCudaGlobal}, {doFixedAtoms}); break;')
1055 case 0: CALL (0, 0, 0, 0); break;
1056 case 1: CALL (0, 0, 0, 1); break;
1057 case 2: CALL (0, 0, 1, 0); break;
1058 case 3: CALL (0, 0, 1, 1); break;
1059 case 4: CALL (0, 1, 0, 0); break;
1060 case 5: CALL (0, 1, 0, 1); break;
1061 case 6: CALL (0, 1, 1, 0); break;
1062 case 7: CALL (0, 1, 1, 1); break;
1063 case 8: CALL (1, 0, 0, 0); break;
1064 case 9: CALL (1, 0, 0, 1); break;
1065 case 10: CALL (1, 0, 1, 0); break;
1066 case 11: CALL (1, 0, 1, 1); break;
1067 case 12: CALL (1, 1, 0, 0); break;
1068 case 13: CALL (1, 1, 0, 1); break;
1069 case 14: CALL (1, 1, 1, 0); break;
1070 case 15: CALL (1, 1, 1, 1); break;
1071 case 16: CALL (2, 0, 0, 0); break;
1072 case 17: CALL (2, 0, 0, 1); break;
1073 case 18: CALL (2, 0, 1, 0); break;
1074 case 19: CALL (2, 0, 1, 1); break;
1075 case 20: CALL (2, 1, 0, 0); break;
1076 case 21: CALL (2, 1, 0, 1); break;
1077 case 22: CALL (2, 1, 1, 0); break;
1078 case 23: CALL (2, 1, 1, 1); break;
1080 const std::string error =
1081 "Unsupported options in SequencerCUDAKernel::accumulateForceToSOA, no kernel is called. Options are maxForceNumber = " +
1082 std::to_string(maxForceNumber) +
1083 ", doGlobal = " + std::to_string(doGlobal) +
1084 ", doFixedAtoms = " + std::to_string(doFixedAtoms) +
1085 ", doCudaGlobal = " + std::to_string(doCudaGlobal) + "\n";
1086 NAMD_bug(error.c_str());
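// copyFNBondToSOA: one block per patch. Un-sorts the float4 nonbonded forces back
// into patch order, widens them to double, and (when doSlow is set) accumulates the
// nonbonded slow contribution on top of the values written by copyFSlowToSOA.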
1092 template <bool doSlow>
1093 __global__ void copyFNBondToSOA(float4* __restrict__ f_nbond,
1094 float4* __restrict__ f_nbond_slow,
1101 const int* __restrict__ patchIDs,
1102 const int* __restrict__ patchOffsets,
1103 const int* __restrict__ patchUnsortOrder,
1104 const CudaPatchRecord* __restrict__ nbondIndexPerPatch)
1107 int stride = blockDim.x;
1108 int pid = patchIDs[blockIdx.x];
1110 // nbondIndexPerPatch is ordered by the nonbonded compute's local patch index
1111 // and is device-wide, so it can hold more patches than this PE's PatchMap and
1112 // its ordering differs from the PatchMap ordering. Each patchID from the
1113 // PatchMap therefore has to be mapped onto the compute-local index before it
1114 // can be used to index nbondIndexPerPatch; the structure itself is the same
1115 // for single- and multi-PE runs, only the force storage changes. Since patches
1116 // never migrate, that mapping should remain valid for the whole run.
1117 // One option is to build an index in PatchMap that follows the ordering used
1118 // inside the computes.
1124 int globalForceOffset = nbondIndexPerPatch[pid].atomStart;
1125 int patchOffset = patchOffsets[blockIdx.x];
1126 int natom = nbondIndexPerPatch[pid].numAtoms;
1127 for(int i = threadIdx.x; i < natom; i+= stride){
1128 unorder = patchUnsortOrder[patchOffset + i];
1129 float4 force = f_nbond[globalForceOffset + unorder];
1131 f_nbond_x[patchOffset + i] = (double)force.x;
1132 f_nbond_y[patchOffset + i] = (double)force.y;
1133 f_nbond_z[patchOffset + i] = (double)force.z;
1136 // XXX Accumulation here works because each f_slow_* buffer
1137 // is first initialized in copyFSlowToSOA.
1138 float4 f_nb_slow = f_nbond_slow[globalForceOffset + unorder];
1139 f_slow_x[patchOffset + i] += (double)f_nb_slow.x;
1140 f_slow_y[patchOffset + i] += (double)f_nb_slow.y;
1141 f_slow_z[patchOffset + i] += (double)f_nb_slow.z;
1146 void SequencerCUDAKernel::copy_nbond_forces(int numPatches,
1148 float4* f_nbond_slow,
1155 const int* patchIDs,
1156 const int* patchOffsets,
1157 const int* patchUnsortOrder,
1158 const CudaPatchRecord* pr,
1160 cudaStream_t stream){
1163 copyFNBondToSOA<1><<< numPatches, PATCH_BLOCKS, 0, stream >>>(f_nbond, f_nbond_slow, f_nbond_x,
1164 f_nbond_y, f_nbond_z, f_slow_x, f_slow_y, f_slow_z,
1165 patchIDs, patchOffsets, patchUnsortOrder, pr);
1167 copyFNBondToSOA<0><<< numPatches, PATCH_BLOCKS, 0, stream >>>(f_nbond, f_nbond_slow, f_nbond_x,
1168 f_nbond_y, f_nbond_z, f_slow_x, f_slow_y, f_slow_z,
1169 patchIDs, patchOffsets, patchUnsortOrder, pr);
1174 // This function is intended to fuse copyFNBondToSOA, copyFBondToSOA, addForceToMomentum and updateVelocities
1175 // XXX TODO: missing slow forces support
1176 template<bool T_DONBOND, bool T_DOSLOW>
1177 __global__ void fetchForcesAndUpdateVelocitiesKernel(
1178 const double scaling,
1182 /* copyFbondToSOA parameters*/
1183 const double* __restrict__ f_bond, // forces from bond compute
1184 const double* __restrict__ f_bond_nbond, // forces from bond compute
1185 double* __restrict__ f_bond_x,
1186 double* __restrict__ f_bond_y,
1187 double* __restrict__ f_bond_z,
1188 double* __restrict__ f_nbond_x,
1189 double* __restrict__ f_nbond_y,
1190 double* __restrict__ f_nbond_z,
1191 double* __restrict__ f_slow_x,
1192 double* __restrict__ f_slow_y,
1193 double* __restrict__ f_slow_z,
1195 const PatchRecord* __restrict__ b_pr,
1196 const int* __restrict__ patchIDs,
1197 const int* __restrict__ patchOffsets,
1198 /* copyFNbondToSOA parameters*/
1199 const float4* __restrict__ f_nbond, // forces from bbond compute
1200 const float4* __restrict__ f_nbond_slow, // forces from bond compute
1201 const int* __restrict__ patchSortOrder,
1202 const CudaPatchRecord* __restrict__ nbondIndexPerPatch,
1203 /* copyFSlowToSOA */
1204 const CudaForce* __restrict__ f_slow,
1205 const int* __restrict__ patchPositions,
1206 const int* __restrict__ pencilPatchIndex,
1207 const int* __restrict__ slow_patchOffsets, // renamed: patchOffsets/patchIDs already appear in the copyFbondToSOA group above
1209 const int* __restrict__ slow_patchIDs,
1210 const CudaLattice* __restrict__ lattices,
1211 /* addForceToMomentum */
1212 const double * __restrict recipMass,
1213 double * __restrict vel_x,
1214 double * __restrict vel_y,
1215 double * __restrict vel_z,
1216 /* updateRigidArrays */
1217 const double * __restrict pos_x,
1218 const double * __restrict pos_y,
1219 const double * __restrict pos_z,
1220 double * __restrict velNew_x,
1221 double * __restrict velNew_y,
1222 double * __restrict velNew_z,
1223 double * __restrict posNew_x,
1224 double * __restrict posNew_y,
1225 double * __restrict posNew_z
1228 int stride = blockDim.x;
1229 int pid = patchIDs[blockIdx.x];
1231 int globalForceOffset = nbondIndexPerPatch[pid].atomStart;
1232 int b_forceOffset = b_pr[pid].atomStart;
1234 int patchOffset = patchOffsets[blockIdx.x];
1235 int natom = nbondIndexPerPatch[pid].numAtoms;
1240 for(int i = threadIdx.x; i < natom; i += stride){
1242 if(T_DONBOND) f_nb = {0};
1243 if(T_DOSLOW) f_s = {0};
1244 // fetch the bonded forces first
1245 f_b.x = f_bond[b_forceOffset + i];
1246 f_b.y = f_bond[b_forceOffset + i + forceStride];
1247 f_b.z = f_bond[b_forceOffset + i + 2*forceStride];
1250 f_nb = f_nbond[globalForceOffset + i];
1251 f_nb.x += f_bond_nbond[b_forceOffset + i];
1252 f_nb.y += f_bond_nbond[b_forceOffset + i + forceStride];
1253 f_nb.z += f_bond_nbond[b_forceOffset + i + 2*forceStride];
1255 // addForceToMomentum now
1256 // this striding is not good
1257 double vx = vel_x[patchOffset + i];
1258 double vy = vel_y[patchOffset + i];
1259 double vz = vel_z[patchOffset + i];
1260 double rmass = recipMass[patchOffset + i];
1261 switch(maxForceNumber){
1262 // XXX TODO: Case 2 for slow forces
1264 dt_nbond *= scaling;
1265 vx += f_nb.x * rmass * dt_nbond;
1266 vy += f_nb.y * rmass * dt_nbond;
1267 vz += f_nb.z * rmass * dt_nbond;
1269 dt_normal *= scaling;
1270 vx += f_b.x * rmass * dt_normal;
1271 vy += f_b.y * rmass * dt_normal;
1272 vz += f_b.z * rmass * dt_normal;
1275 // updateRigidArrays
1277 posNew_x[patchOffset + i] = pos_x[patchOffset + i] + (vx * dt);
1278 posNew_y[patchOffset + i] = pos_y[patchOffset + i] + (vy * dt);
1279 posNew_z[patchOffset + i] = pos_z[patchOffset + i] + (vz * dt);
1280 vel_x[patchOffset + i] = vx;
1281 vel_y[patchOffset + i] = vy;
1282 vel_z[patchOffset + i] = vz;
1283 velNew_x[patchOffset + i] = vx;
1284 velNew_y[patchOffset + i] = vy;
1285 velNew_z[patchOffset + i] = vz;
1286 f_normal_x[patchOffset + i] = f_b.x;
1287 f_normal_y[patchOffset + i] = f_b.y;
1288 f_normal_z[patchOffset + i] = f_b.z;
1291 order = patchSortOrder[patchOffset + i];
1292 f_nbond_x[patchOffset + order] = f_nb.x;
1293 f_nbond_y[patchOffset + order] = f_nb.y;
1294 f_nbond_z[patchOffset + order] = f_nb.z;
1299 void SequencerCUDAKernel::fetchForcesAndUpdateVelocities(int numPatches,
1302 const double scaling,
1306 const int maxForceNumber,
1307 /* copyFbondToSOA parameters*/
1308 const double* __restrict__ f_bond, // forces from bond compute
1309 const double* __restrict__ f_bond_nbond, // forces from bond compute
1320 const PatchRecord* b_pr,
1321 const int* patchIDs,
1322 const int* patchOffsets,
1323 /* copyFNbondToSOA parameters*/
1324 const float4* f_nbond, // forces from bbond compute
1325 const float4* f_nbond_slow, // forces from bond compute
1326 const int* patchSortOrder,
1327 const CudaPatchRecord* nbondIndexPerPatch,
1328 /* copyFSlowToSOA */
1329 const CudaForce* f_slow,
1330 const int* patchPositions,
1331 const int* pencilPatchIndex,
1332 const int* slow_patchOffsets, // renamed: patchOffsets/patchIDs already appear above
1334 const int* slow_patchIDs,
1335 const CudaLattice* lattices,
1336 /* addForceToMomentum */
1337 const double * recipMass,
1341 /* updateRigidArrays */
1342 const double * pos_x,
1343 const double * pos_y,
1344 const double * pos_z,
1351 cudaStream_t stream){
1353 // reduce the amount of arguments in this function
1355 int grid = numPatches;
1357 // XXX TODO finish this
1360 fetchForcesAndUpdateVelocitiesKernel<true, true><<<numPatches, 128, 0, stream>>>(/* arguments TODO */);
1362 fetchForcesAndUpdateVelocitiesKernel<true, false><<<numPatches, 128, 0, stream>>>(/* arguments TODO */);
1365 fetchForcesAndUpdateVelocitiesKernel<false, false><<<numPatches, 128, 0, stream>>>(/* arguments TODO */);
1370 // JM: I need a function to do the pairlistCheck
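// The kernel below runs one block per patch: it measures how far the patch corners
// and atoms have moved since the last pairlist build, grows or shrinks the per-patch
// pairlist tolerance, counts margin violations, and flags a periodic cell that has
// become too small.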
1371 template<bool T_ISPERIODIC>
1372 __global__ void pairListMarginCheckKernel(
1373 const int numPatches,
1374 CudaLocalRecord* localRecords,
1375 const double* __restrict pos_x,
1376 const double* __restrict pos_y,
1377 const double* __restrict pos_z,
1378 const double* __restrict pos_old_x,
1379 const double* __restrict pos_old_y,
1380 const double* __restrict pos_old_z,
1381 const double3* __restrict awayDists, // for margin check
1382 const Lattice lattice,
1383 const Lattice latticeOld,
1384 const double3* __restrict patchMins,
1385 const double3* __restrict patchMaxes,
1386 const double3* __restrict patchCenter,
1387 const CudaMInfo* __restrict mInfo,
1388 unsigned int* __restrict tbcatomic,
1389 double* patchMaxAtomMovement,
1390 double* h_patchMaxAtomMovement,
1391 double* patchNewTolerance,
1392 double* h_patchNewTolerance,
1393 const double minSize,
1394 const double cutoff,
1395 const double sysdima,
1396 const double sysdimb,
1397 const double sysdimc,
1398 unsigned int* __restrict h_marginViolations,
1399 unsigned int* __restrict h_periodicCellSmall,
1400 const bool rescalePairlistTolerance
1402 __shared__ CudaLocalRecord s_record;
1403 using AccessType = int32_t;
1404 AccessType* s_record_buffer = (AccessType*) &s_record;
1406 for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
1407 // Read in the CudaLocalRecord using multiple threads. This should
1409 for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
1410 s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
1414 const int pid = s_record.patchID;
1417 // Each patch has its own max_cd. Since there is one block per patch, the patch
1418 // data is gathered here and the eight corners of the patch bounding box are
1419 // checked once per patch.
1421 double cd, max_cd, max_pd;
1422 __shared__ double minx, miny, minz, maxx, maxy, maxz;
1423 double3 corner, corner_unscaled, corner2_unscaled;
1427 const int stride = blockDim.x;
1429 const int ind = patchIndex;
1430 double3 min = patchMins[ind];
1431 double3 max = patchMaxes[ind];
1432 double3 aDist = awayDists[ind];
1434 const int start = s_record.bufferOffset;
1435 const int numAtoms = s_record.numAtoms;
1436 double3 center = patchCenter[ind];
1438 double3 center_cur = lattice.unscale(center);
1439 double3 center_old = latticeOld.unscale(center);
1441 center_cur.x -= center_old.x;
1442 center_cur.y -= center_old.y;
1443 center_cur.z -= center_old.z;
1447 if(threadIdx.x == 0){
1448 if (rescalePairlistTolerance) patchNewTolerance[ind] *= (1.0f - c_pairlistShrink);
1450 // doMarginCheck: checks whether the periodic cell has become too small.
1452 // These values are already in registers, so the overhead here is small;
1453 // if this condition is triggered, the simulation will stop.
1454 mValue.x = (0.5) * (aDist.x - cutoff / sysdima);
1455 mValue.y = (0.5) * (aDist.y - cutoff / sysdimb);
1456 mValue.z = (0.5) * (aDist.z - cutoff / sysdimc);
1457 minx = min.x - mValue.x;
1458 miny = min.y - mValue.y;
1459 minz = min.z - mValue.z;
1460 maxx = max.x + mValue.x;
1461 maxy = max.y + mValue.y;
1462 maxz = max.z + mValue.z;
1464 for(int i = 0; i < 2; i++){
1465 for(int j =0 ; j< 2; j++){
1466 for(int k = 0; k < 2; k++){
1467 corner.x = (i ? min.x : max.x);
1468 corner.y = (j ? min.y : max.y);
1469 corner.z = (k ? min.z : max.z);
1470 corner_unscaled = lattice.unscale(corner);
1471 corner2_unscaled = latticeOld.unscale(corner);
1472 corner.x = (corner_unscaled.x - corner2_unscaled.x) - center_cur.x;
1473 corner.y = (corner_unscaled.y - corner2_unscaled.y) - center_cur.y;
1474 corner.z = (corner_unscaled.z - corner2_unscaled.z) - center_cur.z;
1475 cd = corner.x * corner.x +
1476 corner.y * corner.y + corner.z * corner.z;
1477 if (cd > max_cd) max_cd = cd;
1484 // With max_cd known for this patch, now compute max_pd over its atoms,
1485 // using the per-patch atom count (numAtoms) and buffer offset (start) above.
1488 unsigned int mc = 0;
1489 for(int i = threadIdx.x; i < numAtoms; i += stride){
1490 // Margin violations are also checked here, using the away distances (awayDists)
1492 pos.x = pos_x[start + i];
1493 pos.y = pos_y[start + i];
1494 pos.z = pos_z[start + i];
1495 s = lattice.scale(pos);
1497 // This path applies when the system is periodic in A, B and C:
1498 // if any of these clauses is true, the atom needs to migrate
1499 mc += ((s.x < minx || s.x >= maxx) || (s.y < miny || s.y >= maxy) || (s.z < minz || s.z >= maxz)) ? 1 : 0;
1501 int xdev, ydev, zdev;
1502 // If the system is not periodic, the mInfo data structure is consulted to determine migrations
1503 if (s.x < minx) xdev = 0;
1504 else if (maxx <= s.x) xdev = 2;
1507 if (s.y < miny) ydev = 0;
1508 else if (maxy <= s.y) ydev = 2;
1511 if (s.z < minz) zdev = 0;
1512 else if (maxz <= s.z) zdev = 2;
1515 if(xdev != 1 || ydev != 1 || zdev != 1){
1516 // only read mInfo (an expensive global-memory access) when some *dev differs from 1, i.e. the atom may leave the patch
1517 int destPatch = mInfo[ind].destPatchID[xdev][ydev][zdev];
1518 if(destPatch != -1 && destPatch != pid) mc += 1; // atom needs to migrate
1521 corner.x = (pos.x - pos_old_x[start + i]) - center_cur.x;
1522 corner.y = (pos.y - pos_old_y[start + i]) - center_cur.y;
1523 corner.z = (pos.z - pos_old_z[start + i]) - center_cur.z;
1524 cd = corner.x * corner.x +
1525 corner.y * corner.y + corner.z * corner.z;
1526 if(cd > max_pd) max_pd = cd;
1528 // JM NOTE: The atomicAdd to host-mapped memory is expensive, but if there are margin violations the simulation is already going badly
1531 atomicAdd(h_marginViolations, mc);
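// Block-wide max reduction of the per-thread squared displacements. cub::Max() is
// used for HIP and CCCL 2.x or older; newer CCCL versions provide cuda::maximum().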
1534 typedef cub::BlockReduce<BigReal, PATCH_BLOCKS> BlockReduce;
1535 __shared__ typename BlockReduce::TempStorage temp_storage;
1536 #if defined(NAMD_HIP) || (NAMD_CCCL_MAJOR_VERSION <= 2)
1537 max_pd = BlockReduce(temp_storage).Reduce(max_pd, cub::Max());
1539 max_pd = BlockReduce(temp_storage).Reduce(max_pd, cuda::maximum());
1541 if(threadIdx.x == 0){
1542 max_pd = sqrt(max_cd) + sqrt(max_pd);
1543 // this might not be needed since the host-mapped values are updated anyway
1544 patchMaxAtomMovement[ind] = max_pd;
1545 if(max_pd > (1.f - c_pairlistTrigger) * patchNewTolerance[ind]){
1546 patchNewTolerance[ind] *= (1.f + c_pairlistGrow);
1548 if(max_pd > patchNewTolerance[ind]){
1549 patchNewTolerance[ind] = max_pd / (1.f - c_pairlistTrigger);
1551 // printf("patchNewTolerance[%d] = %lf %lf\n", ind,
1552 // patchNewTolerance[ind], patchMaxAtomMovement[ind]);
1553 h_patchMaxAtomMovement[ind] = max_pd; // Host-mapped update
1554 h_patchNewTolerance[ind] = patchNewTolerance[ind];
1557 // Checks if periodic cell has become too small and flags it
1558 if(threadIdx.x == 0){
1559 if ( ( aDist.x*sysdima < minSize*0.9999 ) ||
1560 ( aDist.y*sysdimb < minSize*0.9999 ) ||
1561 ( aDist.z*sysdimc < minSize*0.9999 ) ||
1562 ( mValue.x < -0.0001 ) ||
1563 ( mValue.y < -0.0001 ) ||
1564 ( mValue.z < -0.0001)) {
1565 *h_periodicCellSmall = 1;
1571 void SequencerCUDAKernel::PairListMarginCheck(const int numPatches,
1572 CudaLocalRecord* localRecords,
1573 const double* pos_x,
1574 const double* pos_y,
1575 const double* pos_z,
1576 const double* pos_old_x,
1577 const double* pos_old_y,
1578 const double* pos_old_z,
1579 const double3* awayDists, // for margin check
1580 const Lattice lattices,
1581 const Lattice lattices_old,
1582 const double3* __restrict patchMins,
1583 const double3* __restrict patchMaxes,
1584 const double3* __restrict patchCenter,
1585 const CudaMInfo* __restrict mInfo,
1586 unsigned int* __restrict tbcatomic,
1587 const double pairlistTrigger,
1588 const double pairlistGrow,
1589 const double pairlistShrink,
1590 double* __restrict patchMaxAtomMovement,
1591 double* __restrict h_patchMaxAtomMovement,
1592 double* __restrict patchNewTolerance,
1593 double* __restrict h_patchNewTolerance,
1594 const double minSize,
1595 const double cutoff,
1596 const double sysdima,
1597 const double sysdimb,
1598 const double sysdimc,
1599 unsigned int* h_marginViolations,
1600 unsigned int* h_periodicCellSmall,
1601 const bool rescalePairlistTolerance,
1602 const bool isPeriodic,
1603 cudaStream_t stream){
1604 int grid = numPatches;
1606 if(!this->intConstInit){
1607 this->intConstInit = true;
1608 cudaCheck(cudaMemcpyToSymbol(c_pairlistTrigger, &pairlistTrigger, sizeof(double)));
1609 cudaCheck(cudaMemcpyToSymbol(c_pairlistGrow, &pairlistGrow, sizeof(double)));
1610 cudaCheck(cudaMemcpyToSymbol(c_pairlistShrink, &pairlistShrink, sizeof(double)));
1614 pairListMarginCheckKernel<true><<<grid, PATCH_BLOCKS, 0, stream>>>(
1615 numPatches, localRecords,
1616 pos_x, pos_y, pos_z, pos_old_x,
1617 pos_old_y, pos_old_z, awayDists, lattices, lattices_old,
1618 patchMins, patchMaxes, patchCenter, mInfo, tbcatomic,
1619 patchMaxAtomMovement, h_patchMaxAtomMovement,
1620 patchNewTolerance, h_patchNewTolerance,
1621 minSize, cutoff, sysdima, sysdimb, sysdimc, h_marginViolations, h_periodicCellSmall,
1622 rescalePairlistTolerance);
1625 pairListMarginCheckKernel<false><<<grid, PATCH_BLOCKS, 0, stream>>>(
1626 numPatches, localRecords,
1627 pos_x, pos_y, pos_z, pos_old_x,
1628 pos_old_y, pos_old_z, awayDists, lattices, lattices_old,
1629 patchMins, patchMaxes, patchCenter, mInfo, tbcatomic,
1630 patchMaxAtomMovement, h_patchMaxAtomMovement,
1631 patchNewTolerance, h_patchNewTolerance,
1632 minSize, cutoff, sysdima, sysdimb, sysdimc, h_marginViolations, h_periodicCellSmall,
1633 rescalePairlistTolerance);
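// copyFBondToSOA: one block per patch. Copies the strided bonded forces into the
// per-patch SOA buffers and, when enabled, accumulates the bonded contributions to
// the nonbonded and slow force buffers.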
1637 template <bool doNbond, bool doSlow>
1638 __global__ void copyFBondToSOA(double *f_bond,
1639 double *f_bond_nbond,
1640 double *f_bond_slow,
1650 const int forceStride,
1651 const PatchRecord *pr,
1652 const int *patchIDs,
1653 const int *patchOffsets)
1655 // Working over the entire PatchMap: each block handles one patch and needs the
1656 // strided bonded forces plus the patch's atomStart. forceOffset indexes the
1657 // bonded compute's force buffer, while patchOffset indexes the SOA buffers.
1658 // (An earlier debugging note flagged the bonded forces as looking wrong here.)
1660 int stride = blockDim.x;
1661 int patchID = patchIDs[blockIdx.x];
1662 // This data structure must be safe to index by patchID; patchID appears to be
1663 // the correct index here, though it is unclear whether that changes with +p.
1664 int natoms = pr[patchID].numAtoms;
1665 int forceOffset = pr[patchID].atomStart;
1666 int patchOffset = patchOffsets[blockIdx.x];
1668 for(int i = threadIdx.x; i < natoms; i+=stride){
1670 f_bond_x[patchOffset + i] = f_bond[forceOffset + i];
1671 f_bond_y[patchOffset + i] = f_bond[forceOffset + i + forceStride];
1672 f_bond_z[patchOffset + i] = f_bond[forceOffset + i + 2*forceStride];
1675 // XXX Accumulation here works because each f_nbond_* buffer
1676 // is first initialized in copyFNBondToSOA.
1677 f_nbond_x[patchOffset + i] += f_bond_nbond[forceOffset + i];
1678 f_nbond_y[patchOffset + i] += f_bond_nbond[forceOffset + i + forceStride];
1679 f_nbond_z[patchOffset + i] += f_bond_nbond[forceOffset + i + 2*forceStride];
1683 // XXX Accumulation here works because each f_slow_* buffer
1684 // is first initialized in copyFSlowToSOA.
1685 f_slow_x[patchOffset + i] += f_bond_slow[forceOffset + i];
1686 f_slow_y[patchOffset + i] += f_bond_slow[forceOffset + i + forceStride];
1687 f_slow_z[patchOffset + i] += f_bond_slow[forceOffset + i + 2*forceStride];
1692 void SequencerCUDAKernel::copy_bond_forces(int numPatches,
1694 double* f_bond_nbond,
1695 double* f_bond_slow,
1705 int forceStride, //if stridedForces
1707 const int* patchIDs,
1708 const int* patchOffsets,
1711 cudaStream_t stream)
1714 copyFBondToSOA<true, true><<< numPatches, PATCH_BLOCKS, 0, stream >>>(f_bond, f_bond_nbond,
1715 f_bond_slow, f_bond_x, f_bond_y, f_bond_z,
1716 f_nbond_x, f_nbond_y, f_nbond_z,
1717 f_slow_x, f_slow_y, f_slow_z, forceStride,
1718 pr, patchIDs, patchOffsets);
1720 copyFBondToSOA<true, false><<< numPatches, PATCH_BLOCKS, 0, stream >>>(f_bond, f_bond_nbond,
1721 f_bond_slow, f_bond_x, f_bond_y, f_bond_z,
1722 f_nbond_x, f_nbond_y, f_nbond_z,
1723 f_slow_x, f_slow_y, f_slow_z, forceStride,
1724 pr, patchIDs, patchOffsets);
1726 copyFBondToSOA<false, false><<< numPatches, PATCH_BLOCKS, 0, stream >>>(f_bond, f_bond_nbond,
1727 f_bond_slow, f_bond_x, f_bond_y, f_bond_z,
1728 f_nbond_x, f_nbond_y, f_nbond_z,
1729 f_slow_x, f_slow_y, f_slow_z, forceStride,
1730 pr, patchIDs, patchOffsets);
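// copyFSlowToSOA: writes (rather than accumulates) the lattice-scaled slow forces,
// establishing the baseline that copyFNBondToSOA and copyFBondToSOA add onto.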
1734 __global__ void copyFSlowToSOA(const CudaForce* __restrict__ f_slow,
1738 const int* __restrict__ patchOffsets,
1739 const Lattice* __restrict__ lattices)
1741 int tid = threadIdx.x;
1742 int stride = blockDim.x;
1743 int patchOffset = patchOffsets[blockIdx.x];
1744 int numAtoms = patchOffsets[blockIdx.x + 1] - patchOffset;
1746 Lattice lat = lattices[0];
1748 for(int i = tid; i < numAtoms; i += stride){
1749 CudaForce f = f_slow[patchOffset + i];
1751 double3 f_scaled = lat.scale_force(
1752 Vector((double)f.x, (double)f.y, (double)f.z));
1754 // XXX Instead of accumulating slow force (+=), set them here (=)!
1755 f_slow_x[patchOffset + i] = f_scaled.x;
1756 f_slow_y[patchOffset + i] = f_scaled.y;
1757 f_slow_z[patchOffset + i] = f_scaled.z;
1761 void SequencerCUDAKernel::copy_slow_forces(int numPatches,
1762 const CudaForce* f_slow,
1766 const int* d_patchOffsets,
1767 const Lattice *lattices,
1768 cudaStream_t stream){
1770 copyFSlowToSOA<<<numPatches, PATCH_BLOCKS, 0, stream>>>(f_slow,
1771 f_slow_x, f_slow_y, f_slow_z, d_patchOffsets, lattices);
1774 __forceinline__ __device__ void zero_cudaTensor(cudaTensor &v)
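// addForceToMomentumKernel: one thread per atom. Applies the velocity kick
// dv = scaling * (f_normal*dt_normal + f_nbond*dt_nbond + f_slow*dt_slow) / m,
// optionally multiplying the old velocity by velrescaling (stochastic velocity
// rescaling) and leaving fixed atoms untouched when DO_FIXED is set.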
1787 template <int MAX_FORCE_NUMBER, bool DO_VEL_RESCALING, bool DO_FIXED>
1788 __global__ void addForceToMomentumKernel(const double scaling,
1792 double velrescaling, // for stochastic velocity rescaling
1793 const double * __restrict recipMass,
1794 const double * __restrict f_normal_x,
1795 const double * __restrict f_normal_y,
1796 const double * __restrict f_normal_z,
1797 const double * __restrict f_nbond_x,
1798 const double * __restrict f_nbond_y,
1799 const double * __restrict f_nbond_z,
1800 const double * __restrict f_slow_x,
1801 const double * __restrict f_slow_y,
1802 const double * __restrict f_slow_z,
1803 double * __restrict vel_x,
1804 double * __restrict vel_y,
1805 double * __restrict vel_z,
1806 const int * __restrict atomFixed,
1809 int i = threadIdx.x + blockIdx.x*blockDim.x;
1812 switch (maxForceNumber) {
1815 vel_x[i] += f_slow_x[i] * recipMass[i] * dt_slow;
1816 vel_y[i] += f_slow_y[i] * recipMass[i] * dt_slow;
1817 vel_z[i] += f_slow_z[i] * recipMass[i] * dt_slow;
1818 // fall through because we will always have the "faster" forces
1820 dt_nbond *= scaling;
1821 vel_x[i] += f_nbond_x[i] * recipMass[i] * dt_nbond;
1822 vel_y[i] += f_nbond_y[i] * recipMass[i] * dt_nbond;
1823 vel_z[i] += f_nbond_z[i] * recipMass[i] * dt_nbond;
1824 // fall through because we will always have the "faster" forces
1826 dt_normal *= scaling;
1827 vel_x[i] += f_normal_x[i] * recipMass[i] * dt_normal;
1828 vel_y[i] += f_normal_y[i] * recipMass[i] * dt_normal;
1829 vel_z[i] += f_normal_z[i] * recipMass[i] * dt_normal;
1835 switch (MAX_FORCE_NUMBER) {
1837 vx += f_slow_x[i] * dt_slow;
1838 vy += f_slow_y[i] * dt_slow;
1839 vz += f_slow_z[i] * dt_slow;
1840 // fall through because we will always have the "faster" forces
1842 vx += f_nbond_x[i] * dt_nbond;
1843 vy += f_nbond_y[i] * dt_nbond;
1844 vz += f_nbond_z[i] * dt_nbond;
1845 // fall through because we will always have the "faster" forces
1847 vx += f_normal_x[i] * dt_normal;
1848 vy += f_normal_y[i] * dt_normal;
1849 vz += f_normal_z[i] * dt_normal;
1851 vx *= scaling * recipMass[i];
1852 vy *= scaling * recipMass[i];
1853 vz *= scaling * recipMass[i];
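    // Accumulating the raw force * dt increments first and applying scaling * recipMass[i] once at
    // the end is algebraically equivalent to the per-term update above, with fewer multiplications
    // per force level.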
1855 if (DO_VEL_RESCALING) {
1856 vel_x[i] = velrescaling*vel_x[i] + vx;
1857 vel_y[i] = velrescaling*vel_y[i] + vy;
1858 vel_z[i] = velrescaling*vel_z[i] + vz;
1865 } else { // fixed atoms
1866 if (!atomFixed[i]) {
1867 if (DO_VEL_RESCALING) {
1868 vel_x[i] = velrescaling*vel_x[i] + vx;
1869 vel_y[i] = velrescaling*vel_y[i] + vy;
1870 vel_z[i] = velrescaling*vel_z[i] + vz;
1888 // JM: This sets the cudaAtom vector from the nonbonded compute
1889 template <bool t_doNbond, bool t_doHomePatches>
1890 __global__ void setComputePositionsKernel(
1891 CudaLocalRecord* localRecords,
1892 CudaPeerRecord* peerRecords,
1893 const double3* __restrict patchCenter,
1894 const int* __restrict patchSortOrder,
1895 const int numPatches,
1896 const unsigned int devID,
1898 const double charge_scaling,
1899 const double* __restrict d_pos_x,
1900 const double* __restrict d_pos_y,
1901 const double* __restrict d_pos_z,
1902 const float* __restrict charges,
1903 double** __restrict d_peer_pos_x,
1904 double** __restrict d_peer_pos_y,
1905 double** __restrict d_peer_pos_z,
1906 float4* __restrict nb_atoms,
1907 float4* __restrict b_atoms,
1908 float4* __restrict s_atoms
1911 __shared__ CudaLocalRecord s_record;
1912 using AccessType = int32_t;
1913 AccessType* s_record_buffer = (AccessType*) &s_record;
1915 for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
1916 // Read in the CudaLocalRecord using multiple threads
1918 for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
1919 s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
1924 //int soapid = globalToLocalID[record.patchID];
1925 center = patchCenter[patchIndex];
1926 double3 ucenter = lat.unscale(center);
1928 const int numAtoms = s_record.numAtoms;
1929 const int dstOffset = s_record.bufferOffset;
1930 const int dstOffsetNB = s_record.bufferOffsetNBPad;
1932 int srcDevice, srcOffset;
1933 const double *srcPosX, *srcPosY, *srcPosZ;
1934 if (t_doHomePatches) {
1936 srcOffset = dstOffset;
1941 srcDevice = s_record.inline_peers[0].deviceIndex;
1942 srcOffset = s_record.inline_peers[0].bufferOffset;
1943 srcPosX = d_peer_pos_x[srcDevice];
1944 srcPosY = d_peer_pos_y[srcDevice];
1945 srcPosZ = d_peer_pos_z[srcDevice];
1949 for (int i = threadIdx.x; i < numAtoms; i += blockDim.x) {
1950 const int order = patchSortOrder[dstOffset + i]; // Should this be order or unorder?
1951 atom.x = srcPosX[srcOffset + order] - ucenter.x;
1952 atom.y = srcPosY[srcOffset + order] - ucenter.y;
1953 atom.z = srcPosZ[srcOffset + order] - ucenter.z;
1954 atom.w = charges[dstOffset + order] * charge_scaling;
1956 b_atoms[dstOffset + order] = atom;
1957 if (t_doNbond) nb_atoms[dstOffsetNB + i] = atom;
1961 if (threadIdx.x / WARPSIZE == 0) {
1962 const int to_write = (((numAtoms+32-1)/32)*32) - numAtoms; // WARPSIZE
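      // Pads the sorted nonbonded atom list up to the next multiple of 32 (WARPSIZE) by replicating
      // the last atom below; e.g. numAtoms = 45 rounds up to 64, so to_write = 19 padding entries
      // are written.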
1965 const int order = patchSortOrder[dstOffset + numAtoms - 1]; // Should this be order or unorder?
1966 lastAtom.x = srcPosX[srcOffset + order] - ucenter.x;
1967 lastAtom.y = srcPosY[srcOffset + order] - ucenter.y;
1968 lastAtom.z = srcPosZ[srcOffset + order] - ucenter.z;
1969 lastAtom.w = charges[dstOffset + order] * charge_scaling;
1971 if (threadIdx.x < to_write) {
1972 nb_atoms[dstOffsetNB+numAtoms+threadIdx.x] = lastAtom;
1980 template<bool t_doHomePatches, bool t_doFEP, bool t_doTI, bool t_doAlchDecouple, bool t_doAlchSoftCore>
1981 __global__ void setComputePositionsKernel_PME (
1982 const double* __restrict d_pos_x,
1983 const double* __restrict d_pos_y,
1984 const double* __restrict d_pos_z,
1985 const float* __restrict charges,
1986 double** __restrict d_peer_pos_x,
1987 double** __restrict d_peer_pos_y,
1988 double** __restrict d_peer_pos_z,
1989 float** __restrict d_peer_charge,
1990 int** __restrict d_peer_partition,
1991 const int* __restrict partition,
1992 const double charge_scaling,
1993 const int* __restrict s_patchPositions,
1994 const int* __restrict s_pencilPatchIndex,
1995 const int* __restrict s_patchIDs,
1997 float4* __restrict s_atoms,
1999 int* __restrict s_partition,
2001 const int peerDevice,
2004 const bool handleBoundary
2009 const int pmeBufferOffset = offset;
2010 const int srcBufferOffset = 0;
2011 const int srcDevice = peerDevice;
2013 // double precision calculation
2014 double px, py, pz, wx, wy, wz, q;
2016 for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < numAtoms; i += blockDim.x * gridDim.x) {
2017     // this gets atoms in the same order as the PmeCuda code path
2018 if (t_doHomePatches) {
2019 q = (double)(charges[srcBufferOffset + i]);
2020 px = d_pos_x[srcBufferOffset + i];
2021 py = d_pos_y[srcBufferOffset + i];
2022 pz = d_pos_z[srcBufferOffset + i];
2023 if (t_doFEP || t_doTI) {
2024 part = partition[srcBufferOffset + i];
2027 q = (double)(d_peer_charge[srcDevice][srcBufferOffset + i]);
2028 px = d_peer_pos_x[srcDevice][srcBufferOffset + i];
2029 py = d_peer_pos_y[srcDevice][srcBufferOffset + i];
2030 pz = d_peer_pos_z[srcDevice][srcBufferOffset + i];
2031 if (t_doFEP || t_doTI) {
2032 part = d_peer_partition[srcDevice][srcBufferOffset + i];
2036 double3 w_vec = lat.scale(Vector(px, py, pz));
2041 if (handleBoundary) {
2042 wx = (wx - (floor(wx + 0.5) - 0.5));
2043 wy = (wy - (floor(wy + 0.5) - 0.5));
2044 wz = (wz - (floor(wz + 0.5) - 0.5));
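      // w - (floor(w + 0.5) - 0.5) equals frac(w + 0.5), wrapping each scaled coordinate into [0, 1);
      // e.g. wx = -0.3 maps to 0.2 and wx = 0.6 maps to 0.1.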
2051 if (handleBoundary) {
2052 foo.x = foo.x - 1.0f*(foo.x >= 1.0f);
2053 foo.y = foo.y - 1.0f*(foo.y >= 1.0f);
2054 foo.z = foo.z - 1.0f*(foo.z >= 1.0f);
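      // Guards against a coordinate landing exactly on 1.0f after the conversion to float,
      // keeping it in [0, 1).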
2057 foo.w = (float) (charge_scaling * q);
2058 if (!t_doFEP && !t_doTI) {
2059 s_atoms[pmeBufferOffset + i] = foo;
2061 else { // alchemical multiple grids
2062 float4 foo_zero_charge = foo;
2063 foo_zero_charge.w = 0.0f;
2064 s_partition[pmeBufferOffset + i] = part;
2065 /* grid 0 grid 1 grid 2 grid 3 grid 4
2066 * non-alch (p = 0) Y Y N N Y
2067 * appearing (p = 1) Y N Y N N
2068 * disappearing (p = 2) N Y N Y N
2069 * Requirements of grids:
2070 * 1. t_doFEP || t_doTI : grid 0, grid 1
2071 * 2. t_doAlchDecouple : grid 2, grid 3
2072 * 3. t_doAlchSoftCore || t_doTI: grid 4
2073        * grid 4 can be s_atoms[i + 4 * numTotalAtoms] (t_doAlchDecouple) or s_atoms[i + 2 * numTotalAtoms] (!t_doAlchDecouple)
2074        * Although the atoms with zero charge in the extra grids do not change during non-migration steps,
2075        * these copies cannot be avoided because the atom positions can still change.
2076        * The non-zero charges may also change if they come from a QM engine or some new kind of force field,
2077        * so this branching on non-migration steps appears unavoidable.
2082 s_atoms[pmeBufferOffset + i] = foo;
2083 s_atoms[pmeBufferOffset + i + numTotalAtoms] = foo;
2084 if (t_doAlchDecouple) {
2085 s_atoms[pmeBufferOffset + i + 2 * numTotalAtoms] = foo_zero_charge;
2086 s_atoms[pmeBufferOffset + i + 3 * numTotalAtoms] = foo_zero_charge;
2087 if (t_doAlchSoftCore || t_doTI) {
2088 s_atoms[pmeBufferOffset + i + 4 * numTotalAtoms] = foo;
2091 if (t_doAlchSoftCore || t_doTI) {
2092 s_atoms[pmeBufferOffset + i + 2 * numTotalAtoms] = foo;
2099 s_atoms[pmeBufferOffset + i] = foo;
2100 s_atoms[pmeBufferOffset + i + numTotalAtoms] = foo_zero_charge;
2101 if (t_doAlchDecouple) {
2102 s_atoms[pmeBufferOffset + i + 2 * numTotalAtoms] = foo;
2103 s_atoms[pmeBufferOffset + i + 3 * numTotalAtoms] = foo_zero_charge;
2104 if (t_doAlchSoftCore || t_doTI) {
2105 s_atoms[pmeBufferOffset + i + 4 * numTotalAtoms] = foo_zero_charge;
2108 if (t_doAlchSoftCore || t_doTI) {
2109 s_atoms[pmeBufferOffset + i + 2 * numTotalAtoms] = foo_zero_charge;
2114 // disappearing atoms
2116 s_atoms[pmeBufferOffset + i] = foo_zero_charge;
2117 s_atoms[pmeBufferOffset + i + numTotalAtoms] = foo;
2118 if (t_doAlchDecouple) {
2119 s_atoms[pmeBufferOffset + i + 2 * numTotalAtoms] = foo_zero_charge;
2120 s_atoms[pmeBufferOffset + i + 3 * numTotalAtoms] = foo;
2121 if (t_doAlchSoftCore || t_doTI) {
2122 s_atoms[pmeBufferOffset + i + 4 * numTotalAtoms] = foo_zero_charge;
2125 if (t_doAlchSoftCore || t_doTI) {
2126 s_atoms[pmeBufferOffset + i + 2 * numTotalAtoms] = foo_zero_charge;
2136 void SequencerCUDAKernel::set_compute_positions(
2138 const bool isPmeDevice,
2140 const int numPatchesHomeAndProxy,
2141 const int numPatchesHome,
2146 const bool doAlchDecouple,
2147 const bool doAlchSoftCore,
2148 const bool handleBoundary,
2149 const double* d_pos_x,
2150 const double* d_pos_y,
2151 const double* d_pos_z,
2152 #ifndef NAMD_NCCL_ALLREDUCE
2153 double** d_peer_pos_x,
2154 double** d_peer_pos_y,
2155 double** d_peer_pos_z,
2156 float** d_peer_charge,
2157 int** d_peer_partition,
2159 const float* charges,
2160 const int* partition,
2161 const double charge_scaling,
2162 const double3* patchCenter,
2163 const int* s_patchPositions,
2164 const int* s_pencilPatchIndex,
2165 const int* s_patchIDs,
2166 const int* patchSortOrder,
2167 const Lattice lattice,
2173 CudaLocalRecord* localRecords,
2174 CudaPeerRecord* peerRecords,
2175 std::vector<int>& atomCounts,
2176 cudaStream_t stream)
2178 // Launch Local Set Compute Positions
2180 setComputePositionsKernel<true, true><<<numPatchesHome, PATCH_BLOCKS, 0, stream>>>(
2181 localRecords, peerRecords, patchCenter, patchSortOrder,
2182 numPatchesHome, devID, lattice, charge_scaling,
2183 d_pos_x, d_pos_y, d_pos_z, charges, d_peer_pos_x, d_peer_pos_y, d_peer_pos_z,
2184 nb_atoms, b_atoms, s_atoms
2187 setComputePositionsKernel<false, true><<<numPatchesHome, PATCH_BLOCKS, 0, stream>>>(
2188 localRecords, peerRecords, patchCenter, patchSortOrder,
2189 numPatchesHome, devID, lattice, charge_scaling,
2190 d_pos_x, d_pos_y, d_pos_z, charges, d_peer_pos_x, d_peer_pos_y, d_peer_pos_z,
2191 nb_atoms, b_atoms, s_atoms
2195 // Launch Peer Set Computes
2197 const int numProxyPatches = numPatchesHomeAndProxy - numPatchesHome;
2199 setComputePositionsKernel<true, false><<<numProxyPatches, PATCH_BLOCKS, 0, stream>>>(
2200 localRecords + numPatchesHome, peerRecords, patchCenter + numPatchesHome, patchSortOrder,
2201 numProxyPatches, devID, lattice, charge_scaling,
2202 d_pos_x, d_pos_y, d_pos_z, charges, d_peer_pos_x, d_peer_pos_y, d_peer_pos_z,
2203 nb_atoms, b_atoms, s_atoms
2206     setComputePositionsKernel<false, false><<<numProxyPatches, PATCH_BLOCKS, 0, stream>>>(
2207 localRecords + numPatchesHome, peerRecords, patchCenter + numPatchesHome, patchSortOrder,
2208 numProxyPatches, devID, lattice, charge_scaling,
2209 d_pos_x, d_pos_y, d_pos_z, charges, d_peer_pos_x, d_peer_pos_y, d_peer_pos_z,
2210 nb_atoms, b_atoms, s_atoms
2215 // Launch PME setComputes
2216 #define CALL(HOME, FEP, TI, ALCH_DECOUPLE, ALCH_SOFTCORE) \
2217 setComputePositionsKernel_PME<HOME, FEP, TI, ALCH_DECOUPLE, ALCH_SOFTCORE> \
2218 <<<pme_grid, PATCH_BLOCKS, 0, stream>>>( \
2219 d_pos_x, d_pos_y, d_pos_z, charges, \
2220 d_peer_pos_x, d_peer_pos_y, d_peer_pos_z, d_peer_charge, \
2221 d_peer_partition, partition, charge_scaling, \
2222 s_patchPositions, s_pencilPatchIndex, s_patchIDs, \
2223 lattice, s_atoms, numTotalAtoms, s_partition, \
2224 i, numAtoms, offset, handleBoundary \
2226   // Only when PME long-range electrostatics is enabled (doSlow is true).
2227   // The partition is needed for alchemical calculations.
2228 if(doSlow && isPmeDevice) {
2230 for (int i = 0; i < nDev; i++) {
2231 const bool home = (i == devID);
2232 const int numAtoms = atomCounts[i];
2233 const int pme_grid = (numAtoms + PATCH_BLOCKS - 1) / PATCH_BLOCKS;
2234 const int options = (home << 4) + (doFEP << 3) + (doTI << 2) + (doAlchDecouple << 1) + doAlchSoftCore;
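      // e.g. home = 1, doFEP = 1, doTI = 0, doAlchDecouple = 0, doAlchSoftCore = 0 packs to
      // options = 16 + 8 = 24, which dispatches CALL(1, 1, 0, 0, 0) below.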
2237 case 0: CALL(0, 0, 0, 0, 0); break;
2238 case 1: CALL(0, 0, 0, 0, 1); break;
2239 case 2: CALL(0, 0, 0, 1, 0); break;
2240 case 3: CALL(0, 0, 0, 1, 1); break;
2241 case 4: CALL(0, 0, 1, 0, 0); break;
2242 case 5: CALL(0, 0, 1, 0, 1); break;
2243 case 6: CALL(0, 0, 1, 1, 0); break;
2244 case 7: CALL(0, 0, 1, 1, 1); break;
2245 case 8: CALL(0, 1, 0, 0, 0); break;
2246 case 9: CALL(0, 1, 0, 0, 1); break;
2247 case 10: CALL(0, 1, 0, 1, 0); break;
2248 case 11: CALL(0, 1, 0, 1, 1); break;
2249 case 12: CALL(0, 1, 1, 0, 0); break;
2250 case 13: CALL(0, 1, 1, 0, 1); break;
2251 case 14: CALL(0, 1, 1, 1, 0); break;
2252 case 15: CALL(0, 1, 1, 1, 1); break;
2253 case 16: CALL(1, 0, 0, 0, 0); break;
2254 case 17: CALL(1, 0, 0, 0, 1); break;
2255 case 18: CALL(1, 0, 0, 1, 0); break;
2256 case 19: CALL(1, 0, 0, 1, 1); break;
2257 case 20: CALL(1, 0, 1, 0, 0); break;
2258 case 21: CALL(1, 0, 1, 0, 1); break;
2259 case 22: CALL(1, 0, 1, 1, 0); break;
2260 case 23: CALL(1, 0, 1, 1, 1); break;
2261 case 24: CALL(1, 1, 0, 0, 0); break;
2262 case 25: CALL(1, 1, 0, 0, 1); break;
2263 case 26: CALL(1, 1, 0, 1, 0); break;
2264 case 27: CALL(1, 1, 0, 1, 1); break;
2265 case 28: CALL(1, 1, 1, 0, 0); break;
2266 case 29: CALL(1, 1, 1, 0, 1); break;
2267 case 30: CALL(1, 1, 1, 1, 0); break;
2268 case 31: CALL(1, 1, 1, 1, 1); break;
2270 NAMD_die("SequencerCUDAKernel::setComputePositions: no kernel called");
2279 template <bool t_doFEP, bool t_doTI, bool t_doAlchDecouple, bool t_doAlchSoftCore>
2280 __global__ void setPmePositionsKernel(
2281 double charge_scaling,
2282 const Lattice lattice,
2283 const double *pos_x,
2284 const double *pos_y,
2285 const double *pos_z,
2286 const float *charges,
2287 const int* partition,
2289 int* s_atoms_partition,
2292 int i = threadIdx.x + blockIdx.x*blockDim.x;
2294 Lattice lat = lattice;
2296 double wx, wy, wz, q;
2297 q = (double)(charges[i]);
2299 double3 w_vec = lat.scale(Vector(pos_x[i], pos_y[i], pos_z[i]));
2304 wx = (wx - (floor(wx + 0.5) - 0.5));
2305 wy = (wy - (floor(wy + 0.5) - 0.5));
2306 wz = (wz - (floor(wz + 0.5) - 0.5));
2310 foo.w = (float) (charge_scaling * q);
2311 foo.x = foo.x - 1.0f*(foo.x >= 1.0f);
2312 foo.y = foo.y - 1.0f*(foo.y >= 1.0f);
2313 foo.z = foo.z - 1.0f*(foo.z >= 1.0f);
2314 if (!t_doFEP && !t_doTI) {
2317 else { // alchemical multiple grids
2318 float4 foo_zero_charge = foo;
2319 foo_zero_charge.w = 0.0f;
2320 s_atoms_partition[i] = partition[i];
2321 /* grid 0 grid 1 grid 2 grid 3 grid 4
2322 * non-alch (p = 0) Y Y N N Y
2323 * appearing (p = 1) Y N Y N N
2324 * disappearing (p = 2) N Y N Y N
2325 * Requirements of grids:
2326 * 1. t_doFEP || t_doTI : grid 0, grid 1
2327 * 2. t_doAlchDecouple : grid 2, grid 3
2328 * 3. t_doAlchSoftCore || t_doTI: grid 4
2329 * grid 4 can be s_atoms[i + 4 * numAtoms] (t_doAlchDecouple) or s_atoms[i + 2 * numAtoms] (!t_doAlchDecouple)
2331 switch (partition[i]) {
2335 s_atoms[i + numAtoms] = foo;
2336 if (t_doAlchDecouple) {
2337 s_atoms[i + 2 * numAtoms] = foo_zero_charge;
2338 s_atoms[i + 3 * numAtoms] = foo_zero_charge;
2339 if (t_doAlchSoftCore || t_doTI) {
2340 s_atoms[i + 4 * numAtoms] = foo;
2343 if (t_doAlchSoftCore || t_doTI) {
2344 s_atoms[i + 2 * numAtoms] = foo;
2352 s_atoms[i + numAtoms] = foo_zero_charge;
2353 if (t_doAlchDecouple) {
2354 s_atoms[i + 2 * numAtoms] = foo;
2355 s_atoms[i + 3 * numAtoms] = foo_zero_charge;
2356 if (t_doAlchSoftCore || t_doTI) {
2357 s_atoms[i + 4 * numAtoms] = foo_zero_charge;
2360 if (t_doAlchSoftCore || t_doTI) {
2361 s_atoms[i + 2 * numAtoms] = foo_zero_charge;
2366 // disappearing atoms
2368 s_atoms[i] = foo_zero_charge;
2369 s_atoms[i + numAtoms] = foo;
2370 if (t_doAlchDecouple) {
2371 s_atoms[i + 2 * numAtoms] = foo_zero_charge;
2372 s_atoms[i + 3 * numAtoms] = foo;
2373 if (t_doAlchSoftCore || t_doTI) {
2374 s_atoms[i + 4 * numAtoms] = foo_zero_charge;
2377 if (t_doAlchSoftCore || t_doTI) {
2378 s_atoms[i + 2 * numAtoms] = foo_zero_charge;
2388 __global__ void maximumMoveKernel(const double maxvel2,
2389 const double * __restrict vel_x,
2390 const double * __restrict vel_y,
2391 const double * __restrict vel_z,
2395 int i = threadIdx.x + blockIdx.x*blockDim.x;
2397 double vel2 = vel_x[i]*vel_x[i] + vel_y[i]*vel_y[i] + vel_z[i]*vel_z[i];
2398 if (vel2 > maxvel2) {
2399     // If this ever happens the run is already failing, so performance here does not matter
2400 atomicAdd(killme, 1);
2405 void SequencerCUDAKernel::maximumMove(const double maxvel2,
2406 const double *vel_x,
2407 const double *vel_y,
2408 const double *vel_z,
2411 cudaStream_t stream)
2413 //cudaCheck(cudaMemset(killme, 0, sizeof(int)));
2414 int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
2415 maximumMoveKernel<<<grid, ATOM_BLOCKS, 0, stream>>>(
2416 maxvel2, vel_x, vel_y, vel_z, killme, numAtoms);
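  /* A minimal host-side usage sketch. The variable names and the parameter order of the killme and
   * numAtoms arguments are assumptions for illustration, not the definitive call site:
   *
   *   int zero = 0, h_killme = 0;
   *   cudaMemcpyAsync(d_killme, &zero, sizeof(int), cudaMemcpyHostToDevice, stream);   // clear flag
   *   kernel.maximumMove(maxvel2, d_vel_x, d_vel_y, d_vel_z, d_killme, numAtoms, stream);
   *   cudaMemcpyAsync(&h_killme, d_killme, sizeof(int), cudaMemcpyDeviceToHost, stream);
   *   cudaStreamSynchronize(stream);
   *   if (h_killme) NAMD_die("atoms moving too fast");                                 // abort the run
   */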
2419 void SequencerCUDAKernel::addForceToMomentum(
2421 const double scaling,
2425 double velrescaling, // for stochastic velocity rescaling
2426 const double *recipMass,
2427 const double *f_normal_x,
2428 const double *f_normal_y,
2429 const double *f_normal_z,
2430 const double *f_nbond_x,
2431 const double *f_nbond_y,
2432 const double *f_nbond_z,
2433 const double *f_slow_x,
2434 const double *f_slow_y,
2435 const double *f_slow_z,
2439 const int *atomFixed,
2442 cudaStream_t stream)
2444 int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
2445 const bool doVelRescaling = (velrescaling != 1.0);
2446 int kernelCalled = 0;
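  // velrescaling == 1.0 means no stochastic velocity rescaling, so the cheaper instantiation is chosen.
  // CALL is assumed to also increment kernelCalled, so the check after the switch can verify that
  // exactly one instantiation was launched.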
2447 #define CALL(MAX_FORCE_NUMBER, DO_VEL_RESCALING, DO_FIXED) \
2449 addForceToMomentumKernel<MAX_FORCE_NUMBER, DO_VEL_RESCALING, DO_FIXED> \
2450 <<<grid, ATOM_BLOCKS, 0, stream>>>( \
2451 scaling, dt_normal, dt_nbond, dt_slow, velrescaling, \
2452 recipMass, f_normal_x, f_normal_y, f_normal_z, \
2453 f_nbond_x, f_nbond_y, f_nbond_z, \
2454 f_slow_x, f_slow_y, f_slow_z, \
2455 vel_x, vel_y, vel_z, atomFixed, \
2459 switch (maxForceNumber) {
2461 if (doVelRescaling && doFixedAtoms) CALL(2, true, true);
2462 if (doVelRescaling && !doFixedAtoms) CALL(2, true, false);
2463 if (!doVelRescaling && !doFixedAtoms) CALL(2, false, false);
2464 if (!doVelRescaling && doFixedAtoms) CALL(2, false, true);
2468 if (doVelRescaling && doFixedAtoms) CALL(1, true, true);
2469 if (doVelRescaling && !doFixedAtoms) CALL(1, true, false);
2470 if (!doVelRescaling && !doFixedAtoms) CALL(1, false, false);
2471 if (!doVelRescaling && doFixedAtoms) CALL(1, false, true);
2475 if (doVelRescaling && doFixedAtoms) CALL(0, true, true);
2476 if (doVelRescaling && !doFixedAtoms) CALL(0, true, false);
2477 if (!doVelRescaling && !doFixedAtoms) CALL(0, false, false);
2478 if (!doVelRescaling && doFixedAtoms) CALL(0, false, true);
2483 if (kernelCalled != 1) NAMD_bug("Incorrect number of calls to kernel in SequencerCUDAKernel::addForceToMomentum!\n");
2484 // cudaCheck(cudaGetLastError());
2487 template <bool copyPos, bool DO_FIXED>
2488 __global__ void addVelocityToPositionKernel(
2490 const int* __restrict atomFixed,
2491 const double * __restrict vel_x,
2492 const double * __restrict vel_y,
2493 const double * __restrict vel_z,
2494 double * __restrict pos_x,
2495 double * __restrict pos_y,
2496 double * __restrict pos_z,
2497 double * __restrict h_pos_x, // host-mapped vectors
2498 double * __restrict h_pos_y,
2499 double * __restrict h_pos_z,
2502 int i = threadIdx.x + blockIdx.x*blockDim.x;
2509 if (!atomFixed[i]) {
2530 void SequencerCUDAKernel::addVelocityToPosition(
2533 const int *atomFixed,
2534 const double *vel_x,
2535 const double *vel_y,
2536 const double *vel_z,
2545 cudaStream_t stream)
2547 int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
2548 #define CALL(copyPos, DO_FIXED) \
2549 addVelocityToPositionKernel<copyPos, DO_FIXED><<<grid, ATOM_BLOCKS, 0, stream>>>( \
2550 dt, atomFixed, vel_x, vel_y, vel_z, pos_x, pos_y, pos_z, \
2551 h_pos_x, h_pos_y, h_pos_z, numAtoms)
2552 if ( copyPositions && doFixedAtoms) CALL(true, true);
2553 else if (!copyPositions && doFixedAtoms) CALL(false, true);
2554 else if ( copyPositions && !doFixedAtoms) CALL(true, false);
2555 else if (!copyPositions && !doFixedAtoms) CALL(false, false);
2556 else NAMD_bug("No kernel was called in SequencerCUDAKernel::addVelocityToPosition!\n");
2558 //cudaCheck(cudaGetLastError());
2561 template <bool doFixed>
2562 __global__ void updateRigidArraysKernel(
2564 const int * __restrict atomFixed,
2565 const double * __restrict vel_x,
2566 const double * __restrict vel_y,
2567 const double * __restrict vel_z,
2568 const double * __restrict pos_x,
2569 const double * __restrict pos_y,
2570 const double * __restrict pos_z,
2571 double * __restrict velNew_x,
2572 double * __restrict velNew_y,
2573 double * __restrict velNew_z,
2574 double * __restrict posNew_x,
2575 double * __restrict posNew_y,
2576 double * __restrict posNew_z,
2579 int i = threadIdx.x + blockIdx.x*blockDim.x;
2581 double vx = vel_x[i];
2582 double vy = vel_y[i];
2583 double vz = vel_z[i];
2585 posNew_x[i] = pos_x[i] + (vx * dt);
2586 posNew_y[i] = pos_y[i] + (vy * dt);
2587 posNew_z[i] = pos_z[i] + (vz * dt);
2589 if (!atomFixed[i]) {
2590 posNew_x[i] = pos_x[i] + (vx * dt);
2591 posNew_y[i] = pos_y[i] + (vy * dt);
2592 posNew_z[i] = pos_z[i] + (vz * dt);
2594 posNew_x[i] = pos_x[i];
2595 posNew_y[i] = pos_y[i];
2596 posNew_z[i] = pos_z[i];
2606 // JM NOTE: Optimize this kernel further to improve global memory access pattern
2607 __global__ void centerOfMassKernel(
2608 const double * __restrict coor_x,
2609 const double * __restrict coor_y,
2610 const double * __restrict coor_z,
2611 double * __restrict cm_x, // center of mass is atom-sized
2612 double * __restrict cm_y,
2613 double * __restrict cm_z,
2614 const float * __restrict mass,
2615 const int * __restrict hydrogenGroupSize,
2618 int i = threadIdx.x + blockIdx.x*blockDim.x;
2620 int hgs = hydrogenGroupSize[i];
2623 // missing fixed atoms
2625 BigReal reg_cm_x = 0;
2626 BigReal reg_cm_y = 0;
2627 BigReal reg_cm_z = 0;
2628 for ( j = i; j < (i+hgs); ++j ) {
2630 reg_cm_x += mass[j] * coor_x[j];
2631 reg_cm_y += mass[j] * coor_y[j];
2632 reg_cm_z += mass[j] * coor_z[j];
2634 BigReal inv_m_cm = 1.0/m_cm;
2635 reg_cm_x *= inv_m_cm;
2636 reg_cm_y *= inv_m_cm;
2637 reg_cm_z *= inv_m_cm;
2639 for(j = i ; j < (i + hgs); j++){
2648 void SequencerCUDAKernel::centerOfMass(
2649 const double *coor_x,
2650 const double *coor_y,
2651 const double *coor_z,
2656 const int* hydrogenGroupSize,
2660 int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
2662 centerOfMassKernel<<<grid, ATOM_BLOCKS, 0, stream>>>(coor_x, coor_y, coor_z,
2663 cm_x, cm_y, cm_z, mass, hydrogenGroupSize, numAtoms);
2667 void SequencerCUDAKernel::updateRigidArrays(
2670 const int *atomFixed,
2671 const double *vel_x,
2672 const double *vel_y,
2673 const double *vel_z,
2674 const double *pos_x,
2675 const double *pos_y,
2676 const double *pos_z,
2684 cudaStream_t stream)
2686 int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
2688 updateRigidArraysKernel<true><<<grid, ATOM_BLOCKS, 0, stream>>>(
2689 dt, atomFixed, vel_x, vel_y, vel_z, pos_x, pos_y, pos_z,
2690 velNew_x, velNew_y, velNew_z,
2691 posNew_x, posNew_y, posNew_z, numAtoms);
2693 updateRigidArraysKernel<false><<<grid, ATOM_BLOCKS, 0, stream>>>(
2694 dt, atomFixed, vel_x, vel_y, vel_z, pos_x, pos_y, pos_z,
2695 velNew_x, velNew_y, velNew_z,
2696 posNew_x, posNew_y, posNew_z, numAtoms);
2697 // cudaCheck(cudaGetLastError());
2700 template<int BLOCK_SIZE, bool DO_FIXED>
2701 __global__ void submitHalfKernel(
2702 // const int* __restrict atomFixed,
2703 const double * __restrict vel_x,
2704 const double * __restrict vel_y,
2705 const double * __restrict vel_z,
2706 const double * __restrict vcm_x,
2707 const double * __restrict vcm_y,
2708 const double * __restrict vcm_z,
2709 const float * __restrict mass,
2710 BigReal *kineticEnergy,
2711 BigReal *intKineticEnergy,
2713 cudaTensor *intVirialNormal,
2714 BigReal *h_kineticEnergy,
2715 BigReal *h_intKineticEnergy,
2716 cudaTensor *h_virial,
2717 cudaTensor *h_intVirialNormal,
2718 int *hydrogenGroupSize,
2720 unsigned int* tbcatomic)
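  // Reduction strategy: each thread accumulates its own contributions, cub::BlockReduce collapses
  // them per thread block, the per-block sums are atomicAdd'ed into ATOMIC_BINS staging bins, and
  // the last block to finish folds the bins together and writes the totals to the h_* outputs.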
2722 BigReal k = 0, intK = 0;
2725 zero_cudaTensor(intV);
2726 int i = threadIdx.x + blockIdx.x*blockDim.x;
2727 int totaltb = gridDim.x;
2728 __shared__ bool isLastBlockDone;
2731 if(threadIdx.x == 0){
2732 isLastBlockDone = 0;
2739 (vel_x[i]*vel_x[i] + vel_y[i]*vel_y[i] + vel_z[i]*vel_z[i]);
2740 v.xx = mass[i] * vel_x[i] * vel_x[i];
2742 v.xy = mass[i] * vel_x[i] * vel_y[i];
2743 v.xz = mass[i] * vel_x[i] * vel_z[i];
2745 v.yx = mass[i] * vel_y[i] * vel_x[i];
2746 v.yy = mass[i] * vel_y[i] * vel_y[i];
2748 v.yz = mass[i] * vel_y[i] * vel_z[i];
2750 v.zx = mass[i] * vel_z[i] * vel_x[i];
2751 v.zy = mass[i] * vel_z[i] * vel_y[i];
2752 v.zz = mass[i] * vel_z[i] * vel_z[i];
2755 int hgs = hydrogenGroupSize[i];
2761 for (int j = i; j < (i+hgs); j++) {
2763 v_cm_x += mass[j] * vel_x[j];
2764 v_cm_y += mass[j] * vel_y[j];
2765 v_cm_z += mass[j] * vel_z[j];
2767 BigReal recip_m_cm = 1.0 / m_cm;
2768 v_cm_x *= recip_m_cm;
2769 v_cm_y *= recip_m_cm;
2770 v_cm_z *= recip_m_cm;
2772 for (int j = i; j < (i+hgs); j++) {
2773 BigReal dv_x = vel_x[j] - v_cm_x;
2774 BigReal dv_y = vel_y[j] - v_cm_y;
2775 BigReal dv_z = vel_z[j] - v_cm_z;
2777 (vel_x[j] * dv_x + vel_y[j] * dv_y + vel_z[j] * dv_z);
2778 intV.xx += mass[j] * vel_x[j] * dv_x;
2779 intV.xy += mass[j] * vel_x[j] * dv_y;
2780 intV.xz += mass[j] * vel_x[j] * dv_z;
2781 intV.yx += mass[j] * vel_y[j] * dv_x;
2782 intV.yy += mass[j] * vel_y[j] * dv_y;
2783 intV.yz += mass[j] * vel_y[j] * dv_z;
2784 intV.zx += mass[j] * vel_z[j] * dv_x;
2785 intV.zy += mass[j] * vel_z[j] * dv_y;
2786 intV.zz += mass[j] * vel_z[j] * dv_z;
2790 // JM: New version with centers of mass calculated by a separate kernel
2791 BigReal v_cm_x = vcm_x[i];
2792 BigReal v_cm_y = vcm_y[i];
2793 BigReal v_cm_z = vcm_z[i];
2794 BigReal dv_x = vel_x[i] - v_cm_x;
2795 BigReal dv_y = vel_y[i] - v_cm_y;
2796 BigReal dv_z = vel_z[i] - v_cm_z;
2798 (vel_x[i] * dv_x + vel_y[i] * dv_y + vel_z[i] * dv_z);
2799 intV.xx += mass[i] * vel_x[i] * dv_x;
2800 intV.xy += mass[i] * vel_x[i] * dv_y;
2801 intV.xz += mass[i] * vel_x[i] * dv_z;
2802 intV.yx += mass[i] * vel_y[i] * dv_x;
2803 intV.yy += mass[i] * vel_y[i] * dv_y;
2804 intV.yz += mass[i] * vel_y[i] * dv_z;
2805 intV.zx += mass[i] * vel_z[i] * dv_x;
2806 intV.zy += mass[i] * vel_z[i] * dv_y;
2807 intV.zz += mass[i] * vel_z[i] * dv_z;
2811 typedef cub::BlockReduce<BigReal, BLOCK_SIZE> BlockReduce;
2812 __shared__ typename BlockReduce::TempStorage temp_storage;
2813 k = BlockReduce(temp_storage).Sum(k);
2815 v.xx = BlockReduce(temp_storage).Sum(v.xx);
2818 v.xy = BlockReduce(temp_storage).Sum(v.xy);
2820 v.xz = BlockReduce(temp_storage).Sum(v.xz);
2823 v.yx = BlockReduce(temp_storage).Sum(v.yx);
2825 v.yy = BlockReduce(temp_storage).Sum(v.yy);
2828 v.yz = BlockReduce(temp_storage).Sum(v.yz);
2831 v.zx = BlockReduce(temp_storage).Sum(v.zx);
2833 v.zy = BlockReduce(temp_storage).Sum(v.zy);
2835 v.zz = BlockReduce(temp_storage).Sum(v.zz);
2837 intK = BlockReduce(temp_storage).Sum(intK);
2839 intV.xx = BlockReduce(temp_storage).Sum(intV.xx);
2841 intV.xy = BlockReduce(temp_storage).Sum(intV.xy);
2843 intV.xz = BlockReduce(temp_storage).Sum(intV.xz);
2845 intV.yx = BlockReduce(temp_storage).Sum(intV.yx);
2847 intV.yy = BlockReduce(temp_storage).Sum(intV.yy);
2849 intV.yz = BlockReduce(temp_storage).Sum(intV.yz);
2851 intV.zx = BlockReduce(temp_storage).Sum(intV.zx);
2853 intV.zy = BlockReduce(temp_storage).Sum(intV.zy);
2855 intV.zz = BlockReduce(temp_storage).Sum(intV.zz);
2858 if (threadIdx.x == 0) {
2859 const int bin = blockIdx.x % ATOMIC_BINS;
2861 atomicAdd(&kineticEnergy[bin], k);
2862 atomicAdd(&virial[bin].xx, v.xx);
2864 atomicAdd(&virial[bin].xy, v.xy);
2865 atomicAdd(&virial[bin].xz, v.xz);
2867 atomicAdd(&virial[bin].yx, v.yx);
2868 atomicAdd(&virial[bin].yy, v.yy);
2870 atomicAdd(&virial[bin].yz, v.yz);
2872 atomicAdd(&virial[bin].zx, v.zx);
2873 atomicAdd(&virial[bin].zy, v.zy);
2874 atomicAdd(&virial[bin].zz, v.zz);
2875 atomicAdd(&intKineticEnergy[bin], intK);
2876 atomicAdd(&intVirialNormal[bin].xx, intV.xx);
2877 atomicAdd(&intVirialNormal[bin].xy, intV.xy);
2878 atomicAdd(&intVirialNormal[bin].xz, intV.xz);
2879 atomicAdd(&intVirialNormal[bin].yx, intV.yx);
2880 atomicAdd(&intVirialNormal[bin].yy, intV.yy);
2881 atomicAdd(&intVirialNormal[bin].yz, intV.yz);
2882 atomicAdd(&intVirialNormal[bin].zx, intV.zx);
2883 atomicAdd(&intVirialNormal[bin].zy, intV.zy);
2884 atomicAdd(&intVirialNormal[bin].zz, intV.zz);
2887 unsigned int value = atomicInc(&tbcatomic[1], totaltb);
2888 isLastBlockDone = (value == (totaltb -1));
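      // atomicInc returns the previous counter value, so the block that observes totaltb - 1 knows
      // every other block has already contributed its partial sums and performs the final reduction
      // below.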
2893 if(isLastBlockDone){
2894 if(threadIdx.x < ATOMIC_BINS){
2895 const int bin = threadIdx.x;
2897 double k = kineticEnergy[bin];
2898 double intK = intKineticEnergy[bin];
2899 cudaTensor v = virial[bin];
2900 cudaTensor intV = intVirialNormal[bin];
2902 // sets device scalars back to zero
2903 kineticEnergy[bin] = 0.0;
2904 intKineticEnergy[bin] = 0.0;
2905 zero_cudaTensor(virial[bin]);
2906 zero_cudaTensor(intVirialNormal[bin]);
2908 if(ATOMIC_BINS > 1){
2909 typedef cub::WarpReduce<double, (ATOMIC_BINS > 1 ? ATOMIC_BINS : 2)> WarpReduce;
2910 typedef cub::WarpReduce<cudaTensor, (ATOMIC_BINS > 1 ? ATOMIC_BINS : 2)> WarpReduceT;
2911 __shared__ typename WarpReduce::TempStorage tempStorage;
2912 __shared__ typename WarpReduceT::TempStorage tempStorageT;
2914 k = WarpReduce(tempStorage).Sum(k);
2915 intK = WarpReduce(tempStorage).Sum(intK);
2916 v = WarpReduceT(tempStorageT).Sum(v);
2917 intV = WarpReduceT(tempStorageT).Sum(intV);
2920 if(threadIdx.x == 0){
2921 h_kineticEnergy[0] = k;
2922 h_intKineticEnergy[0] = intK;
2924 h_intVirialNormal[0] = intV;
2926 //resets atomic counter
2927 reset_atomic_counter(&tbcatomic[1]);
2934 void SequencerCUDAKernel::submitHalf(
2935 const bool doFixedAtoms,
2936 const double *vel_x,
2937 const double *vel_y,
2938 const double *vel_z,
2939 const double *vcm_x,
2940 const double *vcm_y,
2941 const double *vcm_z,
2943 BigReal *kineticEnergy,
2944 BigReal *intKineticEnergy,
2946 cudaTensor *intVirialNormal,
2947 BigReal *h_kineticEnergy,
2948 BigReal *h_intKineticEnergy,
2949 cudaTensor *h_virial,
2950 cudaTensor *h_intVirialNormal,
2951 int *hydrogenGroupSize,
2953 unsigned int* tbcatomic,
2954 cudaStream_t stream)
2956 int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
2958 submitHalfKernel<ATOM_BLOCKS, true><<<grid, ATOM_BLOCKS, 0, stream>>>(
2959 vel_x, vel_y, vel_z,
2960 vcm_x, vcm_y, vcm_z, mass,
2961 kineticEnergy, intKineticEnergy,
2962 virial, intVirialNormal, h_kineticEnergy, h_intKineticEnergy,
2963 h_virial, h_intVirialNormal,
2964 hydrogenGroupSize, numAtoms, tbcatomic);
2966 submitHalfKernel<ATOM_BLOCKS, false><<<grid, ATOM_BLOCKS, 0, stream>>>(
2967 vel_x, vel_y, vel_z,
2968 vcm_x, vcm_y, vcm_z, mass,
2969 kineticEnergy, intKineticEnergy,
2970 virial, intVirialNormal, h_kineticEnergy, h_intKineticEnergy,
2971 h_virial, h_intVirialNormal,
2972 hydrogenGroupSize, numAtoms, tbcatomic);
2973 //cudaCheck(cudaGetLastError());
2976 __global__ void scaleCoordinateUseGroupPressureKernel(
2977 double * __restrict pos_x,
2978 double * __restrict pos_y,
2979 double * __restrict pos_z,
2981 int *hydrogenGroupSize,
2986 int i = threadIdx.x + blockIdx.x * blockDim.x;
2988 int hgs = hydrogenGroupSize[i];
2991 // missing fixed atoms implementation
2996 // calculate the center of mass
2997 for ( j = i; j < (i+hgs); ++j ) {
2999 r_cm_x += mass[j] * pos_x[j];
3000 r_cm_y += mass[j] * pos_y[j];
3001 r_cm_z += mass[j] * pos_z[j];
3003 BigReal inv_m_cm = 1.0/m_cm;
3007 // scale the center of mass with factor
3009 double tx = r_cm_x - origin.x;
3010 double ty = r_cm_y - origin.y;
3011 double tz = r_cm_z - origin.z;
3012 // apply transformation
3013 double new_r_cm_x = factor.xx*tx + factor.xy*ty + factor.xz*tz;
3014 double new_r_cm_y = factor.yx*tx + factor.yy*ty + factor.yz*tz;
3015 double new_r_cm_z = factor.zx*tx + factor.zy*ty + factor.zz*tz;
3017 new_r_cm_x += origin.x;
3018 new_r_cm_y += origin.y;
3019 new_r_cm_z += origin.z;
3020 // translation vector from old COM and new COM
3021 double delta_r_cm_x = new_r_cm_x - r_cm_x;
3022 double delta_r_cm_y = new_r_cm_y - r_cm_y;
3023 double delta_r_cm_z = new_r_cm_z - r_cm_z;
3024 // shift the hydrogen group with translation vector
3025 for (j = i; j < (i+hgs); j++) {
3026 pos_x[j] += delta_r_cm_x;
3027 pos_y[j] += delta_r_cm_y;
3028 pos_z[j] += delta_r_cm_z;
3034 __global__ void scaleCoordinateNoGroupPressureKernel(
3035 double * __restrict pos_x,
3036 double * __restrict pos_y,
3037 double * __restrict pos_z,
3042 int i = threadIdx.x + blockIdx.x * blockDim.x;
3044 // missing fixed atoms implementation
3046 double tx = pos_x[i] - origin.x;
3047 double ty = pos_y[i] - origin.y;
3048 double tz = pos_z[i] - origin.z;
3049 // apply transformation
3050 double ftx = factor.xx*tx + factor.xy*ty + factor.xz*tz;
3051 double fty = factor.yx*tx + factor.yy*ty + factor.yz*tz;
3052 double ftz = factor.zx*tx + factor.zy*ty + factor.zz*tz;
3054 pos_x[i] = ftx + origin.x;
3055 pos_y[i] = fty + origin.y;
3056 pos_z[i] = ftz + origin.z;
3060 __global__ void SetAtomIndexOrderKernel(
3065 int i = threadIdx.x + blockIdx.x * blockDim.x;
3067 int atomIndex = id[i];
3068 idOrder[atomIndex] = i;
3072 __global__ void scaleCoordinateUsingGCKernel(
3073 double* __restrict pos_x,
3074 double* __restrict pos_y,
3075 double* __restrict pos_z,
3076 const int* __restrict idOrder,
3077 const int* __restrict moleculeStartIndex,
3078 const int* __restrict moleculeAtom,
3079 const cudaTensor factor,
3080 const cudaVector origin,
3081 const Lattice oldLattice,
3082 const Lattice newLattice,
3083 const char3* __restrict transform,
3084 const int numMolecules,
3085 const int numLargeMolecules)
3087 // missing fixed atoms implementation
3088 int startIdx, endIdx, i, j, jmapped, atomIndex;
3089 double3 position, r_gc, new_r_gc, delta_r_gc;
3091 r_gc.x = 0; r_gc.y = 0; r_gc.z = 0;
3092 if (blockIdx.x < numLargeMolecules){ //large molecule case
3094 startIdx = moleculeStartIndex[i];
3095 endIdx = moleculeStartIndex[i + 1];
3096 __shared__ double3 sh_gc;
3097 double inv_length = 1.0/(double)(endIdx - startIdx);
3098 // calculate the geometric center
3099 for (j = startIdx + threadIdx.x; j < endIdx; j += blockDim.x) {
3100 atomIndex = moleculeAtom[j];
3101 jmapped = idOrder[atomIndex];
3102 tr = transform[jmapped];
3103 position.x = pos_x[jmapped];
3104 position.y = pos_y[jmapped];
3105 position.z = pos_z[jmapped];
3106 //Unwrap the coordinate with oldLattice
3107 position = oldLattice.reverse_transform(position ,tr);
3108 r_gc.x += position.x;
3109 r_gc.y += position.y;
3110 r_gc.z += position.z;
3113 typedef cub::BlockReduce<double, 64> BlockReduce;
3114 __shared__ typename BlockReduce::TempStorage temp_storage;
3116 r_gc.x = BlockReduce(temp_storage).Sum(r_gc.x);
3118 r_gc.y = BlockReduce(temp_storage).Sum(r_gc.y);
3120 r_gc.z = BlockReduce(temp_storage).Sum(r_gc.z);
3123 if (threadIdx.x == 0) {
3124 sh_gc.x = r_gc.x * inv_length;
3125 sh_gc.y = r_gc.y * inv_length;
3126 sh_gc.z = r_gc.z * inv_length;
3130 // scale the geometric center with factor
3132 double tx = sh_gc.x - origin.x;
3133 double ty = sh_gc.y - origin.y;
3134 double tz = sh_gc.z - origin.z;
3135 // apply transformation
3136 new_r_gc.x = factor.xx*tx + factor.xy*ty + factor.xz*tz;
3137 new_r_gc.y = factor.yx*tx + factor.yy*ty + factor.yz*tz;
3138 new_r_gc.z = factor.zx*tx + factor.zy*ty + factor.zz*tz;
3140 new_r_gc.x += origin.x;
3141 new_r_gc.y += origin.y;
3142 new_r_gc.z += origin.z;
3143 // translation vector from old GC to new GC
3144 delta_r_gc.x = new_r_gc.x - sh_gc.x;
3145 delta_r_gc.y = new_r_gc.y - sh_gc.y;
3146 delta_r_gc.z = new_r_gc.z - sh_gc.z;
3148 // shift the atoms in molecule with translation vector
3149 for (j = startIdx + threadIdx.x; j < endIdx; j += blockDim.x) {
3150 atomIndex = moleculeAtom[j];
3151 jmapped = idOrder[atomIndex];
3152 tr = transform[jmapped];
3153 position.x = pos_x[jmapped];
3154 position.y = pos_y[jmapped];
3155 position.z = pos_z[jmapped];
3156 //Unwrap the coordinate with oldLattice
3157 position = oldLattice.reverse_transform(position, tr);
3158 position.x += delta_r_gc.x;
3159 position.y += delta_r_gc.y;
3160 position.z += delta_r_gc.z;
3161 // wrap the coordinate with new lattice parameter
3162 position = newLattice.apply_transform(position, tr);
3163 pos_x[jmapped] = position.x;
3164 pos_y[jmapped] = position.y;
3165 pos_z[jmapped] = position.z;
3167 } else { //Small molecule
3168 // numLargeMolecules block has been assigned to large molecule
3169 i = numLargeMolecules + threadIdx.x +
3170 (blockIdx.x - numLargeMolecules) * blockDim.x;
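    // e.g. with numLargeMolecules = 10 and blockDim.x = 64, block 11 / thread 3 handles
    // molecule i = 10 + 3 + (11 - 10) * 64 = 77.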
3172 if (i < numMolecules) {
3173 startIdx = moleculeStartIndex[i];
3174 endIdx = moleculeStartIndex[i+1];
3175 double inv_length = 1.0/(double)(endIdx - startIdx);
3177 // calculate the geometric center
3178 for ( j = startIdx; j < endIdx; j++ ) {
3179 atomIndex = moleculeAtom[j];
3180 jmapped = idOrder[atomIndex];
3181 tr = transform[jmapped];
3182 position.x = pos_x[jmapped];
3183 position.y = pos_y[jmapped];
3184 position.z = pos_z[jmapped];
3185 //Unwrap the coordinate with oldLattice
3186 position = oldLattice.reverse_transform(position, tr);
3187 r_gc.x += position.x;
3188 r_gc.y += position.y;
3189 r_gc.z += position.z;
3192 r_gc.x *= inv_length;
3193 r_gc.y *= inv_length;
3194 r_gc.z *= inv_length;
3196 // scale the geometric center with factor
3198 double tx = r_gc.x - origin.x;
3199 double ty = r_gc.y - origin.y;
3200 double tz = r_gc.z - origin.z;
3201 // apply transformation
3202 new_r_gc.x = factor.xx*tx + factor.xy*ty + factor.xz*tz;
3203 new_r_gc.y = factor.yx*tx + factor.yy*ty + factor.yz*tz;
3204 new_r_gc.z = factor.zx*tx + factor.zy*ty + factor.zz*tz;
3206 new_r_gc.x += origin.x;
3207 new_r_gc.y += origin.y;
3208 new_r_gc.z += origin.z;
3209 // translation vector from old GC to new GC
3210 delta_r_gc.x = new_r_gc.x - r_gc.x;
3211 delta_r_gc.y = new_r_gc.y - r_gc.y;
3212 delta_r_gc.z = new_r_gc.z - r_gc.z;
3214 // shift the atoms in molecule with translation vector
3215 for (j = startIdx; j < endIdx; j++) {
3216 atomIndex = moleculeAtom[j];
3217 jmapped = idOrder[atomIndex];
3219 tr = transform[jmapped];
3220 position.x = pos_x[jmapped];
3221 position.y = pos_y[jmapped];
3222 position.z = pos_z[jmapped];
3223 // unwrap the coordinate with oldLattice
3224 position = oldLattice.reverse_transform(position, tr);
3225 position.x += delta_r_gc.x;
3226 position.y += delta_r_gc.y;
3227 position.z += delta_r_gc.z;
3228 // wrap the coordinate with new lattice parameter
3229 position = newLattice.apply_transform(position, tr);
3230 pos_x[jmapped] = position.x;
3231 pos_y[jmapped] = position.y;
3232 pos_z[jmapped] = position.z;
3238 template <bool doFixed>
3239 __global__ void langevinPistonUseGroupPressureKernel(
3240 const int* __restrict atomFixed,
3241 const int* __restrict groupFixed,
3242 const Lattice lattice,
3243 const char3* transform,
3244 const double* __restrict fixedPosition_x,
3245 const double* __restrict fixedPosition_y,
3246 const double* __restrict fixedPosition_z,
3247 double * __restrict pos_x,
3248 double * __restrict pos_y,
3249 double * __restrict pos_z,
3250 double * __restrict vel_x,
3251 double * __restrict vel_y,
3252 double * __restrict vel_z,
3254 int *hydrogenGroupSize,
3262 int i = threadIdx.x + blockIdx.x*blockDim.x;
3264 int hgs = hydrogenGroupSize[i];
3268 if (groupFixed[i]) {
3269 for ( j = i; j < (i+hgs); ++j ) {
3270 const double3 pos_origin{fixedPosition_x[j], fixedPosition_y[j], fixedPosition_z[j]};
3271 const auto pos_trans = lattice.apply_transform(pos_origin, transform[j]);
3272 pos_x[j] = pos_trans.x;
3273 pos_y[j] = pos_trans.y;
3274 pos_z[j] = pos_trans.z;
3286 for ( j = i; j < (i+hgs); ++j ) {
3288 if (atomFixed[j]) continue;
3291 r_cm_x += mass[j] * pos_x[j];
3292 r_cm_y += mass[j] * pos_y[j];
3293 r_cm_z += mass[j] * pos_z[j];
3294 v_cm_x += mass[j] * vel_x[j];
3295 v_cm_y += mass[j] * vel_y[j];
3296 v_cm_z += mass[j] * vel_z[j];
3298 BigReal inv_m_cm = 1.0/m_cm;
3303 double tx = r_cm_x - origin.x;
3304 double ty = r_cm_y - origin.y;
3305 double tz = r_cm_z - origin.z;
3306 double new_r_cm_x = factor.xx*tx + factor.xy*ty + factor.xz*tz;
3307 double new_r_cm_y = factor.yx*tx + factor.yy*ty + factor.yz*tz;
3308 double new_r_cm_z = factor.zx*tx + factor.zy*ty + factor.zz*tz;
3309 new_r_cm_x += origin.x;
3310 new_r_cm_y += origin.y;
3311 new_r_cm_z += origin.z;
3313 double delta_r_cm_x = new_r_cm_x - r_cm_x;
3314 double delta_r_cm_y = new_r_cm_y - r_cm_y;
3315 double delta_r_cm_z = new_r_cm_z - r_cm_z;
3319 double delta_v_cm_x = ( velFactor_x - 1 ) * v_cm_x;
3320 double delta_v_cm_y = ( velFactor_y - 1 ) * v_cm_y;
3321 double delta_v_cm_z = ( velFactor_z - 1 ) * v_cm_z;
3322 for (j = i; j < (i+hgs); j++) {
3325 const double3 pos_origin{fixedPosition_x[j], fixedPosition_y[j], fixedPosition_z[j]};
3326 const auto pos_trans = lattice.apply_transform(pos_origin, transform[j]);
3327 pos_x[j] = pos_trans.x;
3328 pos_y[j] = pos_trans.y;
3329 pos_z[j] = pos_trans.z;
3333 pos_x[j] += delta_r_cm_x;
3334 pos_y[j] += delta_r_cm_y;
3335 pos_z[j] += delta_r_cm_z;
3336 vel_x[j] += delta_v_cm_x;
3337 vel_y[j] += delta_v_cm_y;
3338 vel_z[j] += delta_v_cm_z;
3345 template <bool doFixed>
3346 __global__ void langevinPistonNoGroupPressureKernel(
3347 const int* __restrict atomFixed,
3348 const Lattice lattice,
3349 const char3* transform,
3350 const double* __restrict fixedPosition_x,
3351 const double* __restrict fixedPosition_y,
3352 const double* __restrict fixedPosition_z,
3353 double * __restrict pos_x,
3354 double * __restrict pos_y,
3355 double * __restrict pos_z,
3356 double * __restrict vel_x,
3357 double * __restrict vel_y,
3358 double * __restrict vel_z,
3366 int i = threadIdx.x + blockIdx.x*blockDim.x;
3370 const double3 pos_origin{fixedPosition_x[i], fixedPosition_y[i], fixedPosition_z[i]};
3371 const auto pos_trans = lattice.apply_transform(pos_origin, transform[i]);
3372 pos_x[i] = pos_trans.x;
3373 pos_y[i] = pos_trans.y;
3374 pos_z[i] = pos_trans.z;
3378 double tx = pos_x[i] - origin.x;
3379 double ty = pos_y[i] - origin.y;
3380 double tz = pos_z[i] - origin.z;
3381 double ftx = factor.xx*tx + factor.xy*ty + factor.xz*tz;
3382 double fty = factor.yx*tx + factor.yy*ty + factor.yz*tz;
3383 double ftz = factor.zx*tx + factor.zy*ty + factor.zz*tz;
3384 pos_x[i] = ftx + origin.x;
3385 pos_y[i] = fty + origin.y;
3386 pos_z[i] = ftz + origin.z;
3387 vel_x[i] *= velFactor_x;
3388 vel_y[i] *= velFactor_y;
3389 vel_z[i] *= velFactor_z;
3393 void SequencerCUDAKernel::scaleCoordinateWithFactor(
3398 int *hydrogenGroupSize,
3401 int useGroupPressure,
3403 cudaStream_t stream)
3405 int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
3406 if (useGroupPressure) {
3407 scaleCoordinateUseGroupPressureKernel<<<grid, ATOM_BLOCKS, 0, stream>>>(
3408 pos_x, pos_y, pos_z, mass, hydrogenGroupSize, factor, origin, numAtoms);
3410 scaleCoordinateNoGroupPressureKernel<<<grid, ATOM_BLOCKS, 0, stream>>>(
3411 pos_x, pos_y, pos_z, factor, origin, numAtoms);
3416 void SequencerCUDAKernel::SetAtomIndexOrder(
3420 cudaStream_t stream)
3422 int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
3423 SetAtomIndexOrderKernel<<<grid, ATOM_BLOCKS, 0, stream>>>(
3424 id, idOrder, numAtoms);
3427 void SequencerCUDAKernel::scaleCoordinateUsingGC(
3432 const int *moleculeStartIndex,
3433 const int *moleculeAtom,
3434 const cudaTensor factor,
3435 const cudaVector origin,
3436 const Lattice oldLattice,
3437 const Lattice newLattice,
3438 const char3 *transform,
3439 const int numMolecules,
3440 const int numLargeMolecules,
3441 cudaStream_t stream)
3443   // we want each thread to calculate the geometric center for molecules with
3444   // fewer than 128 atoms, and one thread block to calculate each molecule
3445   // with more than 128 atoms
3446 int numThreadsPerBlock = 64;
3447 int grid = ((numMolecules - numLargeMolecules + numThreadsPerBlock - 1) /
3448 numThreadsPerBlock) + numLargeMolecules;
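  // e.g. numMolecules = 1000 and numLargeMolecules = 10 gives ceil(990 / 64) + 10 = 16 + 10 = 26 blocks.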
3449 scaleCoordinateUsingGCKernel<<<grid, numThreadsPerBlock, 0, stream>>>(
3450 pos_x, pos_y, pos_z, idOrder, moleculeStartIndex,
3451 moleculeAtom, factor, origin, oldLattice, newLattice,
3452 transform, numMolecules, numLargeMolecules);
3456 void SequencerCUDAKernel::langevinPiston(
3457 const bool doFixedAtoms,
3458 const int* atomFixed,
3459 const int* groupFixed,
3460 const char3* transform,
3461 const Lattice lattice,
3462 const double* fixedPosition_x,
3463 const double* fixedPosition_y,
3464 const double* fixedPosition_z,
3472 int *hydrogenGroupSize,
3478 int useGroupPressure,
3480 cudaStream_t stream)
3482 int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
3484 double *h_pos_x = (double*)malloc(sizeof(double)*numAtoms);
3485 double *h_pos_y = (double*)malloc(sizeof(double)*numAtoms);
3486 double *h_pos_z = (double*)malloc(sizeof(double)*numAtoms);
3488 double *h_vel_x = (double*)malloc(sizeof(double)*numAtoms);
3489 double *h_vel_y = (double*)malloc(sizeof(double)*numAtoms);
3490 double *h_vel_z = (double*)malloc(sizeof(double)*numAtoms);
3492 copy_DtoH_sync<double>(pos_x, h_pos_x, numAtoms);
3493 copy_DtoH_sync<double>(pos_y, h_pos_y, numAtoms);
3494 copy_DtoH_sync<double>(pos_z, h_pos_z, numAtoms);
3496 copy_DtoH_sync<double>(vel_x, h_vel_x, numAtoms);
3497 copy_DtoH_sync<double>(vel_y, h_vel_y, numAtoms);
3498 copy_DtoH_sync<double>(vel_z, h_vel_z, numAtoms);
3500 fprintf(stderr, "velFactors = %lf %lf %lf\n",
3501 velFactor_x, velFactor_y, velFactor_z);
3502 for(int i = 0; i < numAtoms; i++){
3503 fprintf(stderr, "%lf %lf %lf %lf %lf %lf\n",
3504 h_pos_x[i], h_pos_y[i], h_pos_z[i],
3505 h_vel_x[i], h_vel_y[i], h_vel_z[i]);
3515 if (useGroupPressure) {
3517 langevinPistonUseGroupPressureKernel<true><<<grid, ATOM_BLOCKS, 0, stream>>>(
3518 atomFixed, groupFixed, lattice, transform,
3519 fixedPosition_x, fixedPosition_y, fixedPosition_z,
3520 pos_x, pos_y, pos_z, vel_x, vel_y, vel_z, mass, hydrogenGroupSize,
3521 factor, origin, velFactor_x, velFactor_y, velFactor_z, numAtoms);
3523 langevinPistonUseGroupPressureKernel<false><<<grid, ATOM_BLOCKS, 0, stream>>>(
3524 atomFixed, groupFixed, lattice, transform,
3525 fixedPosition_x, fixedPosition_y, fixedPosition_z,
3526 pos_x, pos_y, pos_z, vel_x, vel_y, vel_z, mass, hydrogenGroupSize,
3527 factor, origin, velFactor_x, velFactor_y, velFactor_z, numAtoms);
3531 langevinPistonNoGroupPressureKernel<true><<<grid, ATOM_BLOCKS, 0, stream>>>(
3532 atomFixed, lattice, transform,
3533 fixedPosition_x, fixedPosition_y, fixedPosition_z,
3534 pos_x, pos_y, pos_z, vel_x, vel_y, vel_z,
3535 factor, origin, velFactor_x, velFactor_y, velFactor_z, numAtoms);
3537 langevinPistonNoGroupPressureKernel<false><<<grid, ATOM_BLOCKS, 0, stream>>>(
3538 atomFixed, lattice, transform,
3539 fixedPosition_x, fixedPosition_y, fixedPosition_z,
3540 pos_x, pos_y, pos_z, vel_x, vel_y, vel_z,
3541 factor, origin, velFactor_x, velFactor_y, velFactor_z, numAtoms);
3544 //cudaCheck(cudaGetLastError());
3546 h_pos_x = (double*)malloc(sizeof(double)*numAtoms);
3547 h_pos_y = (double*)malloc(sizeof(double)*numAtoms);
3548 h_pos_z = (double*)malloc(sizeof(double)*numAtoms);
3550 h_vel_x = (double*)malloc(sizeof(double)*numAtoms);
3551 h_vel_y = (double*)malloc(sizeof(double)*numAtoms);
3552 h_vel_z = (double*)malloc(sizeof(double)*numAtoms);
3554 copy_DtoH_sync<double>(pos_x, h_pos_x, numAtoms);
3555 copy_DtoH_sync<double>(pos_y, h_pos_y, numAtoms);
3556 copy_DtoH_sync<double>(pos_z, h_pos_z, numAtoms);
3558 copy_DtoH_sync<double>(vel_x, h_vel_x, numAtoms);
3559 copy_DtoH_sync<double>(vel_y, h_vel_y, numAtoms);
3560 copy_DtoH_sync<double>(vel_z, h_vel_z, numAtoms);
3562 for(int i = 0; i < numAtoms; i++){
3563 fprintf(stderr, "%lf %lf %lf %lf %lf %lf\n",
3564 h_pos_x[i], h_pos_y[i], h_pos_z[i],
3565 h_vel_x[i], h_vel_y[i], h_vel_z[i]);
3570 template <int BLOCK_SIZE>
3571 __global__ void submitReduction1Kernel(
3572 double * __restrict pos_x,
3573 double * __restrict pos_y,
3574 double * __restrict pos_z,
3575 double * __restrict vel_x,
3576 double * __restrict vel_y,
3577 double * __restrict vel_z,
3578 float * __restrict mass,
3579 // TODO: wrap these scalars in a struct
3580 BigReal *kineticEnergy,
3581 BigReal *momentum_x,
3582 BigReal *momentum_y,
3583 BigReal *momentum_z,
3584 BigReal *angularMomentum_x,
3585 BigReal *angularMomentum_y,
3586 BigReal *angularMomentum_z,
3590 BigReal *h_kineticEnergy,
3591 BigReal *h_momentum_x,
3592 BigReal *h_momentum_y,
3593 BigReal *h_momentum_z,
3594 BigReal *h_angularMomentum_x,
3595 BigReal *h_angularMomentum_y,
3596 BigReal *h_angularMomentum_z,
3597 unsigned int* tbcatomic,
3600 BigReal m_x = 0, m_y = 0, m_z = 0,
3601 a_x = 0, a_y = 0, a_z = 0;
3602 int i = threadIdx.x + blockIdx.x*blockDim.x;
3603 int totaltb = gridDim.x;
3604 __shared__ bool isLastBlockDone;
3606 if(threadIdx.x == 0){
3607 isLastBlockDone = 0;
3612 // scalar kineticEnergy += mass[i] * dot_product(vel[i], vel[i])
3614 // (vel_x[i]*vel_x[i] + vel_y[i]*vel_y[i] + vel_z[i]*vel_z[i]);
3616 // vector momentum += mass[i] * vel[i]
3617 m_x += mass[i] * vel_x[i];
3618 m_y += mass[i] * vel_y[i];
3619 m_z += mass[i] * vel_z[i];
3621 // vector dpos = pos[i] - origin
3622 BigReal dpos_x = pos_x[i] - origin_x;
3623 BigReal dpos_y = pos_y[i] - origin_y;
3624 BigReal dpos_z = pos_z[i] - origin_z;
3626 // vector angularMomentum += mass[i] * cross_product(dpos, vel[i])
3627 a_x += mass[i] * (dpos_y*vel_z[i] - dpos_z*vel_y[i]);
3628 a_y += mass[i] * (dpos_z*vel_x[i] - dpos_x*vel_z[i]);
3629 a_z += mass[i] * (dpos_x*vel_y[i] - dpos_y*vel_x[i]);
3631 typedef cub::BlockReduce<BigReal, BLOCK_SIZE> BlockReduce;
3632 __shared__ typename BlockReduce::TempStorage temp_storage;
3633 // k = BlockReduce(temp_storage).Sum(k);
3636 m_x = BlockReduce(temp_storage).Sum(m_x);
3638 m_y = BlockReduce(temp_storage).Sum(m_y);
3640 m_z = BlockReduce(temp_storage).Sum(m_z);
3642 a_x = BlockReduce(temp_storage).Sum(a_x);
3644 a_y = BlockReduce(temp_storage).Sum(a_y);
3646 a_z = BlockReduce(temp_storage).Sum(a_z);
3649 if (threadIdx.x == 0) {
3650 const int bin = blockIdx.x % ATOMIC_BINS;
3652 // atomicAdd(&kineticEnergy[bin], k);
3653 atomicAdd(&momentum_x[bin], m_x);
3654 atomicAdd(&momentum_y[bin], m_y);
3655 atomicAdd(&momentum_z[bin], m_z);
3656 atomicAdd(&angularMomentum_x[bin], a_x);
3657 atomicAdd(&angularMomentum_y[bin], a_y);
3658 atomicAdd(&angularMomentum_z[bin], a_z);
3660 unsigned int value = atomicInc(&tbcatomic[0], totaltb);
3661 isLastBlockDone = (value == (totaltb-1));
3667 if(isLastBlockDone){
3668 if(threadIdx.x < ATOMIC_BINS){
3669 const int bin = threadIdx.x;
3671 double m_x = momentum_x[bin];
3672 double m_y = momentum_y[bin];
3673 double m_z = momentum_z[bin];
3674 double a_x = angularMomentum_x[bin];
3675 double a_y = angularMomentum_y[bin];
3676 double a_z = angularMomentum_z[bin];
3678 // sets device scalars back to zero
3679 kineticEnergy[0] = 0.0;
3680 momentum_x[bin] = 0.0;
3681 momentum_y[bin] = 0.0;
3682 momentum_z[bin] = 0.0;
3683 angularMomentum_x[bin] = 0.0;
3684 angularMomentum_y[bin] = 0.0;
3685 angularMomentum_z[bin] = 0.0;
3687 if(ATOMIC_BINS > 1){
3688 typedef cub::WarpReduce<double, (ATOMIC_BINS > 1 ? ATOMIC_BINS : 2)> WarpReduce;
3689 __shared__ typename WarpReduce::TempStorage tempStorage;
3691 m_x = WarpReduce(tempStorage).Sum(m_x);
3692 m_y = WarpReduce(tempStorage).Sum(m_y);
3693 m_z = WarpReduce(tempStorage).Sum(m_z);
3694 a_x = WarpReduce(tempStorage).Sum(a_x);
3695 a_y = WarpReduce(tempStorage).Sum(a_y);
3696 a_z = WarpReduce(tempStorage).Sum(a_z);
3699 if(threadIdx.x == 0){
3700 h_momentum_x[0] = m_x;
3701 h_momentum_y[0] = m_y;
3702 h_momentum_z[0] = m_z;
3703 h_angularMomentum_x[0] = a_x;
3704 h_angularMomentum_y[0] = a_y;
3705 h_angularMomentum_z[0] = a_z;
3707 //resets atomic counter
3708 reset_atomic_counter(&tbcatomic[0]);
3716 // JM: does addForceToMomentum, maximumMove, and addVelocityToPosition
3717 template <bool DO_VEL_RESCALING, bool DO_FIXED>
3718 __global__ void velocityVerlet1Kernel(
3720 const double scaling,
3721 const double dt_normal,
3722 const double dt_nbond,
3723 const double dt_slow,
3724 const double velrescaling, // for stochastic velocity rescaling
3725 const double* __restrict recipMass,
3726 double* __restrict vel_x,
3727 double* __restrict vel_y,
3728 double* __restrict vel_z,
3729 const double maxvel2,
3731 double* __restrict pos_x,
3732 double* __restrict pos_y,
3733 double* __restrict pos_z,
3737 double* __restrict f_normal_x,
3738 double* __restrict f_normal_y,
3739 double* __restrict f_normal_z,
3740 double* __restrict f_nbond_x,
3741 double* __restrict f_nbond_y,
3742 double* __restrict f_nbond_z,
3743 double* __restrict f_slow_x,
3744 double* __restrict f_slow_y,
3745 double* __restrict f_slow_z,
3746 const int* atomFixed,
3748 const int maxForceNumber
3750 // fusion of addForceToMomentum, maximumMove, addVelocityToPosition
3751 double dt, dt_b, dt_nb, dt_s;
3752 int i = threadIdx.x + blockIdx.x*blockDim.x;
3756 double velx = vel_x[i];
3757 double vely = vel_y[i];
3758 double velz = vel_z[i];
3759 if (DO_VEL_RESCALING) {
3760 velx *= velrescaling;
3761 vely *= velrescaling;
3762 velz *= velrescaling;
3764 // JM NOTE: these need to be patch-centered
3765 double posx = pos_x[i];
3766 double posy = pos_y[i];
3767 double posz = pos_z[i];
3768 double rmass = recipMass[i];
3769 /* addForceToMomentum*/
3770     // keep velocities in registers for faster access when updating positions below
3771 switch(maxForceNumber){
3773 dt_s = dt_slow * scaling;
3774 velx += f_slow_x[i] * rmass * dt_s;
3775 vely += f_slow_y[i] * rmass * dt_s;
3776 velz += f_slow_z[i] * rmass * dt_s;
3777 // f_slow_x[i] = 0.0;
3778 // f_slow_y[i] = 0.0;
3779 // f_slow_z[i] = 0.0;
3781 dt_nb = dt_nbond * scaling;
3782 velx += f_nbond_x[i] * rmass * dt_nb;
3783 vely += f_nbond_y[i] * rmass * dt_nb;
3784 velz += f_nbond_z[i] * rmass * dt_nb;
3785 // f_nbond_x[i] = 0.0;
3786 // f_nbond_y[i] = 0.0;
3787 // f_nbond_z[i] = 0.0;
3789 dt_b = dt_normal * scaling;
3790 velx += f_normal_x[i] * rmass * dt_b;
3791 vely += f_normal_y[i] * rmass * dt_b;
3792 velz += f_normal_z[i] * rmass * dt_b;
3793 // f_normal_x[i] = 0.0;
3794 // f_normal_y[i] = 0.0;
3795 // f_normal_z[i] = 0.0;
3798 // --------------------------------------------------------
3800 // -- MaximumMove --
3801 double vel2 = velx * velx + vely * vely + velz * velz;
3802 if(vel2 > maxvel2) atomicAdd(h_killme, 1); // stops dynamics if true, perf does not matter
3803 // --------------------------------------------------------
3805 // -- AddVelocityToPosition --
3806 // ---------------------------------------------------------
3818 // Haochuan: for fixed atoms, we need to clear their velocities (see HomePatch::addForceToMomentum)
3819 if (!atomFixed[i]) {
3839 void SequencerCUDAKernel::velocityVerlet1(
3840 const bool doFixedAtoms,
3842 const double scaling,
3843 const double dt_normal,
3844 const double dt_nbond,
3845 const double dt_slow,
3846 const double velrescaling, // for stochastic velocity rescaling
3847 const double *recipMass,
3868 const int* atomFixed,
3870 const int maxForceNumber,
3871 cudaStream_t stream)
3873 int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
3874 const bool doVelRescaling = (velrescaling != 1.0);
3875 #define CALL(DO_VEL_RESCALING, DO_FIXED) \
3876 velocityVerlet1Kernel<DO_VEL_RESCALING, DO_FIXED> \
3877 <<<grid, ATOM_BLOCKS, 0, stream>>>( \
3878 step, scaling, dt_normal, dt_nbond, dt_slow, velrescaling, recipMass, \
3879 vel_x, vel_y, vel_z, maxvel2, h_killme, \
3880 pos_x, pos_y, pos_z, h_pos_x, h_pos_y, h_pos_z, \
3881 f_normal_x, f_normal_y, f_normal_z, \
3882 f_nbond_x, f_nbond_y, f_nbond_z, f_slow_x, f_slow_y, f_slow_z, \
3883 atomFixed, numAtoms, maxForceNumber \
3885 if ( doVelRescaling && doFixedAtoms) CALL(true, true);
3886 else if ( doVelRescaling && !doFixedAtoms) CALL(true, false);
3887 else if (!doVelRescaling && doFixedAtoms) CALL(false, true);
3888 else if (!doVelRescaling && !doFixedAtoms) CALL(false, false);
3890 NAMD_bug("No kernel was called in SequencerCUDAKernel::velocityVerlet1!\n");
3895 void SequencerCUDAKernel::submitReduction1(
3903 BigReal *kineticEnergy,
3904 BigReal *momentum_x,
3905 BigReal *momentum_y,
3906 BigReal *momentum_z,
3907 BigReal *angularMomentum_x,
3908 BigReal *angularMomentum_y,
3909 BigReal *angularMomentum_z,
3913 BigReal *h_kineticEnergy,
3914 BigReal *h_momentum_x,
3915 BigReal *h_momentum_y,
3916 BigReal *h_momentum_z,
3917 BigReal *h_angularMomentum_x,
3918 BigReal *h_angularMomentum_y,
3919 BigReal *h_angularMomentum_z,
3920 unsigned int* tbcatomic,
3922 cudaStream_t stream)
3924 int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
3925 submitReduction1Kernel<ATOM_BLOCKS><<<grid, ATOM_BLOCKS, 0, stream>>>(
3926 pos_x, pos_y, pos_z, vel_x, vel_y, vel_z, mass,
3927 kineticEnergy, momentum_x, momentum_y, momentum_z,
3928 angularMomentum_x, angularMomentum_y, angularMomentum_z,
3929 origin_x, origin_y, origin_z, h_kineticEnergy, h_momentum_x, h_momentum_y,
3930 h_momentum_z, h_angularMomentum_x, h_angularMomentum_y, h_angularMomentum_z,
3931 tbcatomic, numAtoms);
3932 //cudaCheck(cudaGetLastError());
3935 template <int BLOCK_SIZE, bool DO_FIXED, bool DO_MTS>
3936 __global__ void submitReduction2Kernel(
3937 const int* __restrict atomFixed,
3938 const double * __restrict pos_x,
3939 const double * __restrict pos_y,
3940 const double * __restrict pos_z,
3941 const double * __restrict vel_x,
3942 const double * __restrict vel_y,
3943 const double * __restrict vel_z,
3944 const double * __restrict rcm_x,
3945 const double * __restrict rcm_y,
3946 const double * __restrict rcm_z,
3947 const double * __restrict vcm_x,
3948 const double * __restrict vcm_y,
3949 const double * __restrict vcm_z,
3950 const double * __restrict f_normal_x,
3951 const double * __restrict f_normal_y,
3952 const double * __restrict f_normal_z,
3953 const double * __restrict f_nbond_x,
3954 const double * __restrict f_nbond_y,
3955 const double * __restrict f_nbond_z,
3956 const double * __restrict f_slow_x,
3957 const double * __restrict f_slow_y,
3958 const double * __restrict f_slow_z,
3959 const float * __restrict mass,
3960 const int * __restrict hydrogenGroupSize,
3961 BigReal *kineticEnergy,
3962 BigReal *h_kineticEnergy,
3963 BigReal *intKineticEnergy,
3964 BigReal *h_intKineticEnergy,
3965 cudaTensor *intVirialNormal,
3966 cudaTensor *intVirialNbond,
3967 cudaTensor *intVirialSlow,
3968 cudaTensor *h_intVirialNormal,
3969 cudaTensor *h_intVirialNbond,
3970 cudaTensor *h_intVirialSlow,
3971 cudaTensor* rigidVirial,
3972 cudaTensor* h_rigidVirial,
3973 unsigned int *tbcatomic,
3975 const int maxForceNumber)
3979 cudaTensor intVNormal, intVNbond, intVSlow;
3980 zero_cudaTensor(intVNormal);
3982 zero_cudaTensor(intVNbond);
3983 zero_cudaTensor(intVSlow);
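// Accumulate kinetic energy, internal kinetic energy (relative to the center-of-mass velocity),
// and the internal virial tensors; block-level CUB reductions are binned atomically, and the
// last block to finish combines the bins and writes the host-mapped results.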
3985 int i = threadIdx.x + blockIdx.x*blockDim.x;
3986 __shared__ bool isLastBlockDone;
3987 int totaltb = gridDim.x;
3989 if(threadIdx.x == 0){
3990 isLastBlockDone = false;
3996 int hgs = hydrogenGroupSize[i];
3997 // hydrogenGroupSize gives the number of atoms in this group; gather its masses, positions, and velocities below
4007 for ( j = i; j < (i+hgs); ++j ) {
4008 r_mass[j -i ] = mass[j];
4009 r_pos[j - i].x = pos_x[j];
4010 r_pos[j - i].y = pos_y[j];
4011 r_pos[j - i].z = pos_z[j];
4012 r_vel[j - i].x = vel_x[j];
4013 r_vel[j - i].y = vel_y[j];
4014 r_vel[j - i].z = vel_z[j];
4016 // r_cm_x += mass[j] * pos_x[j];
4017 // r_cm_y += mass[j] * pos_y[j];
4018 // r_cm_z += mass[j] * pos_z[j];
4019 // v_cm_x += mass[j] * vel_x[j];
4020 // v_cm_y += mass[j] * vel_y[j];
4021 // v_cm_z += mass[j] * vel_z[j];
4022 m_cm += r_mass[j - i];
4023 r_cm_x += r_mass[j - i] * r_pos[j-i].x;
4024 r_cm_y += r_mass[j - i] * r_pos[j-i].y;
4025 r_cm_z += r_mass[j - i] * r_pos[j-i].z;
4026 v_cm_x += r_mass[j - i] * r_vel[j-i].x;
4027 v_cm_y += r_mass[j - i] * r_vel[j-i].y;
4028 v_cm_z += r_mass[j - i] * r_vel[j-i].z;
4030 BigReal inv_m_cm = 1.0/m_cm;
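// reciprocal of the group mass, used below to normalize the mass-weighted sums into the group center of mass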
4038 // XXX removed pairInteraction
4039 for ( j = i; j < (i+hgs); ++j ) {
4040 // XXX removed fixed atoms
4042 // vector vel[j] used twice below
4043 BigReal v_x = r_vel[j-i].x;
4044 BigReal v_y = r_vel[j-i].y;
4045 BigReal v_z = r_vel[j-i].z;
4047 // vector dv = vel[j] - v_cm
4048 BigReal dv_x = v_x - v_cm_x;
4049 BigReal dv_y = v_y - v_cm_y;
4050 BigReal dv_z = v_z - v_cm_z;
4052 // scalar intKineticEnergy += mass[j] * dot_product(v, dv)
4054 // (v_x * dv_x + v_y * dv_y + v_z * dv_z);
4055 intK += r_mass[j-i] *
4056 (v_x * dv_x + v_y * dv_y + v_z * dv_z);
4058 // vector dr = pos[j] - r_cm
4059 // BigReal dr_x = pos_x[j] - r_cm_x;
4060 // BigReal dr_y = pos_y[j] - r_cm_y;
4061 // BigReal dr_z = pos_z[j] - r_cm_z;
4063 BigReal dr_x = r_pos[j -i].x - r_cm_x;
4064 BigReal dr_y = r_pos[j -i].y - r_cm_y;
4065 BigReal dr_z = r_pos[j -i].z - r_cm_z;
4067 // tensor intVirialNormal += outer_product(f_normal[j], dr)
4069 // note: this function will not get any faster until the
4070 // global memory access pattern is fixed
4071 intVNormal.xx += f_normal_x[j] * dr_x;
4072 intVNormal.xy += f_normal_x[j] * dr_y;
4073 intVNormal.xz += f_normal_x[j] * dr_z;
4074 intVNormal.yx += f_normal_y[j] * dr_x;
4075 intVNormal.yy += f_normal_y[j] * dr_y;
4076 intVNormal.yz += f_normal_y[j] * dr_z;
4077 intVNormal.zx += f_normal_z[j] * dr_x;
4078 intVNormal.zy += f_normal_z[j] * dr_y;
4079 intVNormal.zz += f_normal_z[j] * dr_z;
4081 if (maxForceNumber >= 1) {
4082 // tensor intVirialNbond += outer_product(f_nbond[j], dr)
4083 intVNbond.xx += f_nbond_x[j] * dr_x;
4084 intVNbond.xy += f_nbond_x[j] * dr_y;
4085 intVNbond.xz += f_nbond_x[j] * dr_z;
4086 intVNbond.yx += f_nbond_y[j] * dr_x;
4087 intVNbond.yy += f_nbond_y[j] * dr_y;
4088 intVNbond.yz += f_nbond_y[j] * dr_z;
4089 intVNbond.zx += f_nbond_z[j] * dr_x;
4090 intVNbond.zy += f_nbond_z[j] * dr_y;
4091 intVNbond.zz += f_nbond_z[j] * dr_z;
4094 if (maxForceNumber >= 2) {
4095 // tensor intVirialSlow += outer_product(f_slow[j], dr)
4096 intVSlow.xx += f_slow_x[j] * dr_x;
4097 intVSlow.xy += f_slow_x[j] * dr_y;
4098 intVSlow.xz += f_slow_x[j] * dr_z;
4099 intVSlow.yx += f_slow_y[j] * dr_x;
4100 intVSlow.yy += f_slow_y[j] * dr_y;
4101 intVSlow.yz += f_slow_y[j] * dr_z;
4102 intVSlow.zx += f_slow_z[j] * dr_x;
4103 intVSlow.zy += f_slow_z[j] * dr_y;
4104 intVSlow.zz += f_slow_z[j] * dr_z;
4109 // Haochuan: I need to skip the fixed atoms
4110 if (!DO_FIXED || !atomFixed[i]) {
4111 BigReal r_cm_x = rcm_x[i];
4112 BigReal r_cm_y = rcm_y[i];
4113 BigReal r_cm_z = rcm_z[i];
4114 BigReal v_cm_x = vcm_x[i];
4115 BigReal v_cm_y = vcm_y[i];
4116 BigReal v_cm_z = vcm_z[i];
4118 BigReal v_x = vel_x[i];
4119 BigReal v_y = vel_y[i];
4120 BigReal v_z = vel_z[i];
4122 // vector dv = vel[j] - v_cm
4123 BigReal dv_x = v_x - v_cm_x;
4124 BigReal dv_y = v_y - v_cm_y;
4125 BigReal dv_z = v_z - v_cm_z;
4127 // scalar intKineticEnergy += mass[j] * dot_product(v, dv)
4129 // (v_x * dv_x + v_y * dv_y + v_z * dv_z);
4131 (v_x * v_x + v_y*v_y + v_z*v_z);
4133 (v_x * dv_x + v_y * dv_y + v_z * dv_z);
4135 // vector dr = pos[j] - r_cm
4136 // BigReal dr_x = pos_x[j] - r_cm_x;
4137 // BigReal dr_y = pos_y[j] - r_cm_y;
4138 // BigReal dr_z = pos_z[j] - r_cm_z;
4140 BigReal dr_x = pos_x[i] - r_cm_x;
4141 BigReal dr_y = pos_y[i] - r_cm_y;
4142 BigReal dr_z = pos_z[i] - r_cm_z;
4144 // tensor intVirialNormal += outer_product(f_normal[j], dr)
4146 // note: this function will not get any faster until the
4147 // global memory access pattern is fixed
4148 intVNormal.xx += f_normal_x[i] * dr_x;
4149 intVNormal.xy += f_normal_x[i] * dr_y;
4150 intVNormal.xz += f_normal_x[i] * dr_z;
4151 intVNormal.yx += f_normal_y[i] * dr_x;
4152 intVNormal.yy += f_normal_y[i] * dr_y;
4153 intVNormal.yz += f_normal_y[i] * dr_z;
4154 intVNormal.zx += f_normal_z[i] * dr_x;
4155 intVNormal.zy += f_normal_z[i] * dr_y;
4156 intVNormal.zz += f_normal_z[i] * dr_z;
4158 if (maxForceNumber >= 1) {
4159 // tensor intVirialNbond += outer_product(f_nbond[j], dr)
4160 // Haochuan: For a MTS simulation we have to calculate intVNbond separately
4162 intVNbond.xx += f_nbond_x[i] * dr_x;
4163 intVNbond.xy += f_nbond_x[i] * dr_y;
4164 intVNbond.xz += f_nbond_x[i] * dr_z;
4165 intVNbond.yx += f_nbond_y[i] * dr_x;
4166 intVNbond.yy += f_nbond_y[i] * dr_y;
4167 intVNbond.yz += f_nbond_y[i] * dr_z;
4168 intVNbond.zx += f_nbond_z[i] * dr_x;
4169 intVNbond.zy += f_nbond_z[i] * dr_y;
4170 intVNbond.zz += f_nbond_z[i] * dr_z;
4172 // If this is not a MTS simulation, then we can sum the internal virials into a single one (intVNormal)
4173 intVNormal.xx += f_nbond_x[i] * dr_x;
4174 intVNormal.xy += f_nbond_x[i] * dr_y;
4175 intVNormal.xz += f_nbond_x[i] * dr_z;
4176 intVNormal.yx += f_nbond_y[i] * dr_x;
4177 intVNormal.yy += f_nbond_y[i] * dr_y;
4178 intVNormal.yz += f_nbond_y[i] * dr_z;
4179 intVNormal.zx += f_nbond_z[i] * dr_x;
4180 intVNormal.zy += f_nbond_z[i] * dr_y;
4181 intVNormal.zz += f_nbond_z[i] * dr_z;
4185 if (maxForceNumber >= 2) {
4186 // tensor intVirialSlow += outer_product(f_slow[j], dr)
4188 intVSlow.xx += f_slow_x[i] * dr_x;
4189 intVSlow.xy += f_slow_x[i] * dr_y;
4190 intVSlow.xz += f_slow_x[i] * dr_z;
4191 intVSlow.yx += f_slow_y[i] * dr_x;
4192 intVSlow.yy += f_slow_y[i] * dr_y;
4193 intVSlow.yz += f_slow_y[i] * dr_z;
4194 intVSlow.zx += f_slow_z[i] * dr_x;
4195 intVSlow.zy += f_slow_z[i] * dr_y;
4196 intVSlow.zz += f_slow_z[i] * dr_z;
4198 intVNormal.xx += f_slow_x[i] * dr_x;
4199 intVNormal.xy += f_slow_x[i] * dr_y;
4200 intVNormal.xz += f_slow_x[i] * dr_z;
4201 intVNormal.yx += f_slow_y[i] * dr_x;
4202 intVNormal.yy += f_slow_y[i] * dr_y;
4203 intVNormal.yz += f_slow_y[i] * dr_z;
4204 intVNormal.zx += f_slow_z[i] * dr_x;
4205 intVNormal.zy += f_slow_z[i] * dr_y;
4206 intVNormal.zz += f_slow_z[i] * dr_z;
4213 typedef cub::BlockReduce<BigReal, BLOCK_SIZE> BlockReduce;
4214 __shared__ typename BlockReduce::TempStorage temp_storage;
4215 // XXX TODO: hand-written reductions might avoid much of the
4216 // overhead of the repeated CUB block reductions
4218 K = BlockReduce(temp_storage).Sum(K);
4220 intK = BlockReduce(temp_storage).Sum(intK);
4222 intVNormal.xx = BlockReduce(temp_storage).Sum(intVNormal.xx);
4224 intVNormal.xy = BlockReduce(temp_storage).Sum(intVNormal.xy);
4226 intVNormal.xz = BlockReduce(temp_storage).Sum(intVNormal.xz);
4228 intVNormal.yx = BlockReduce(temp_storage).Sum(intVNormal.yx);
4230 intVNormal.yy = BlockReduce(temp_storage).Sum(intVNormal.yy);
4232 intVNormal.yz = BlockReduce(temp_storage).Sum(intVNormal.yz);
4234 intVNormal.zx = BlockReduce(temp_storage).Sum(intVNormal.zx);
4236 intVNormal.zy = BlockReduce(temp_storage).Sum(intVNormal.zy);
4238 intVNormal.zz = BlockReduce(temp_storage).Sum(intVNormal.zz);
4241 if (maxForceNumber >= 1) {
4242 intVNbond.xx = BlockReduce(temp_storage).Sum(intVNbond.xx);
4244 intVNbond.xy = BlockReduce(temp_storage).Sum(intVNbond.xy);
4246 intVNbond.xz = BlockReduce(temp_storage).Sum(intVNbond.xz);
4248 intVNbond.yx = BlockReduce(temp_storage).Sum(intVNbond.yx);
4250 intVNbond.yy = BlockReduce(temp_storage).Sum(intVNbond.yy);
4252 intVNbond.yz = BlockReduce(temp_storage).Sum(intVNbond.yz);
4254 intVNbond.zx = BlockReduce(temp_storage).Sum(intVNbond.zx);
4256 intVNbond.zy = BlockReduce(temp_storage).Sum(intVNbond.zy);
4258 intVNbond.zz = BlockReduce(temp_storage).Sum(intVNbond.zz);
4261 if (maxForceNumber >= 2) {
4262 intVSlow.xx = BlockReduce(temp_storage).Sum(intVSlow.xx);
4264 intVSlow.xy = BlockReduce(temp_storage).Sum(intVSlow.xy);
4266 intVSlow.xz = BlockReduce(temp_storage).Sum(intVSlow.xz);
4268 intVSlow.yx = BlockReduce(temp_storage).Sum(intVSlow.yx);
4270 intVSlow.yy = BlockReduce(temp_storage).Sum(intVSlow.yy);
4272 intVSlow.yz = BlockReduce(temp_storage).Sum(intVSlow.yz);
4274 intVSlow.zx = BlockReduce(temp_storage).Sum(intVSlow.zx);
4276 intVSlow.zy = BlockReduce(temp_storage).Sum(intVSlow.zy);
4278 intVSlow.zz = BlockReduce(temp_storage).Sum(intVSlow.zz);
4284 if (threadIdx.x == 0) {
4285 const int bin = blockIdx.x % ATOMIC_BINS;
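// partial sums are scattered across ATOMIC_BINS bins (indexed by block) to reduce atomic contention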
4287 atomicAdd(&kineticEnergy[bin], K);
4288 atomicAdd(&intKineticEnergy[bin], intK);
4289 atomicAdd(&intVirialNormal[bin].xx, intVNormal.xx);
4290 atomicAdd(&intVirialNormal[bin].xy, intVNormal.xy);
4291 atomicAdd(&intVirialNormal[bin].xz, intVNormal.xz);
4292 atomicAdd(&intVirialNormal[bin].yx, intVNormal.yx);
4293 atomicAdd(&intVirialNormal[bin].yy, intVNormal.yy);
4294 atomicAdd(&intVirialNormal[bin].yz, intVNormal.yz);
4295 atomicAdd(&intVirialNormal[bin].zx, intVNormal.zx);
4296 atomicAdd(&intVirialNormal[bin].zy, intVNormal.zy);
4297 atomicAdd(&intVirialNormal[bin].zz, intVNormal.zz);
4299 if (maxForceNumber >= 1) {
4300 atomicAdd(&intVirialNbond[bin].xx, intVNbond.xx);
4301 atomicAdd(&intVirialNbond[bin].xy, intVNbond.xy);
4302 atomicAdd(&intVirialNbond[bin].xz, intVNbond.xz);
4303 atomicAdd(&intVirialNbond[bin].yx, intVNbond.yx);
4304 atomicAdd(&intVirialNbond[bin].yy, intVNbond.yy);
4305 atomicAdd(&intVirialNbond[bin].yz, intVNbond.yz);
4306 atomicAdd(&intVirialNbond[bin].zx, intVNbond.zx);
4307 atomicAdd(&intVirialNbond[bin].zy, intVNbond.zy);
4308 atomicAdd(&intVirialNbond[bin].zz, intVNbond.zz);
4310 if (maxForceNumber >= 2) {
4311 atomicAdd(&intVirialSlow[bin].xx, intVSlow.xx);
4312 atomicAdd(&intVirialSlow[bin].xy, intVSlow.xy);
4313 atomicAdd(&intVirialSlow[bin].xz, intVSlow.xz);
4314 atomicAdd(&intVirialSlow[bin].yx, intVSlow.yx);
4315 atomicAdd(&intVirialSlow[bin].yy, intVSlow.yy);
4316 atomicAdd(&intVirialSlow[bin].yz, intVSlow.yz);
4317 atomicAdd(&intVirialSlow[bin].zx, intVSlow.zx);
4318 atomicAdd(&intVirialSlow[bin].zy, intVSlow.zy);
4319 atomicAdd(&intVirialSlow[bin].zz, intVSlow.zz);
4323 unsigned int value = atomicInc(&tbcatomic[3], totaltb);
4324 isLastBlockDone = (value == (totaltb -1 ));
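// atomicInc returns the previous counter value, so the block that observes totaltb-1 is the
// last one to finish and performs the final cross-bin reduction below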
4327 // The last block computes the internal pressures; doing this via a host-mapped
4328 // update can be a serious bottleneck.
4329 // XXX TODO: profile to confirm whether this is actually the bottleneck.
4330 if(isLastBlockDone){
4331 if(threadIdx.x < ATOMIC_BINS){
4332 const int bin = threadIdx.x;
4334 double k = kineticEnergy[bin];
4335 double intK = intKineticEnergy[bin];
4336 cudaTensor intVNormal = intVirialNormal[bin];
4338 if (maxForceNumber >= 1) {
4339 intVNbond = intVirialNbond[bin];
4341 if (maxForceNumber >= 2) {
4342 intVSlow = intVirialSlow[bin];
4345 cudaTensor rigidV = rigidVirial[bin];
4347 // sets device scalars back to zero
4348 kineticEnergy[bin] = 0.0;
4349 intKineticEnergy[bin] = 0.0;
4350 zero_cudaTensor(intVirialNormal[bin]);
4351 zero_cudaTensor(intVirialNbond[bin]);
4352 zero_cudaTensor(intVirialSlow[bin]);
4353 zero_cudaTensor(rigidVirial[bin]);
4355 if(ATOMIC_BINS > 1){
4356 typedef cub::WarpReduce<double, (ATOMIC_BINS > 1 ? ATOMIC_BINS : 2)> WarpReduce;
4357 typedef cub::WarpReduce<cudaTensor, (ATOMIC_BINS > 1 ? ATOMIC_BINS : 2)> WarpReduceT;
4358 __shared__ typename WarpReduce::TempStorage tempStorage;
4359 __shared__ typename WarpReduceT::TempStorage tempStorageT;
4361 // According to https://nvidia.github.io/cccl/cub/developer_overview.html#temporary-storage-usage,
4362 // a warp-level sync is required between reuses of the temp storage, hence the NAMD_WARP_SYNC calls below.
4363 k = WarpReduce(tempStorage).Sum(k); NAMD_WARP_SYNC(WARP_FULL_MASK);
4364 intK = WarpReduce(tempStorage).Sum(intK); NAMD_WARP_SYNC(WARP_FULL_MASK);
4365 intVNormal = WarpReduceT(tempStorageT).Sum(intVNormal); NAMD_WARP_SYNC(WARP_FULL_MASK);
4366 rigidV = WarpReduceT(tempStorageT).Sum(rigidV); NAMD_WARP_SYNC(WARP_FULL_MASK);
4368 if (maxForceNumber >= 1) {
4369 intVNbond = WarpReduceT(tempStorageT).Sum(intVNbond); NAMD_WARP_SYNC(WARP_FULL_MASK);
4371 if (maxForceNumber >= 2) {
4372 intVSlow = WarpReduceT(tempStorageT).Sum(intVSlow); NAMD_WARP_SYNC(WARP_FULL_MASK);
4377 if(threadIdx.x == 0){
4378 h_kineticEnergy[0] = k;
4379 h_intKineticEnergy[0] = intK;
4380 h_intVirialNormal[0] = intVNormal;
4382 if (maxForceNumber >= 1) {
4383 h_intVirialNbond[0] = intVNbond;
4385 if (maxForceNumber >= 2) {
4386 h_intVirialSlow[0] = intVSlow;
4389 h_rigidVirial[0] = rigidV;
4391 //resets atomic counter
4392 reset_atomic_counter(&tbcatomic[3]);
4398 void SequencerCUDAKernel::submitReduction2(
4399 const bool doFixedAtoms,
4400 const int* atomFixed,
4401 const double *pos_x,
4402 const double *pos_y,
4403 const double *pos_z,
4404 const double *vel_x,
4405 const double *vel_y,
4406 const double *vel_z,
4407 const double *rcm_x,
4408 const double *rcm_y,
4409 const double *rcm_z,
4410 const double *vcm_x,
4411 const double *vcm_y,
4412 const double *vcm_z,
4413 const double *f_normal_x,
4414 const double *f_normal_y,
4415 const double *f_normal_z,
4416 const double *f_nbond_x,
4417 const double *f_nbond_y,
4418 const double *f_nbond_z,
4419 const double *f_slow_x,
4420 const double *f_slow_y,
4421 const double *f_slow_z,
4423 int *hydrogenGroupSize,
4424 BigReal *kineticEnergy,
4425 BigReal *h_kineticEnergy,
4426 BigReal *intKineticEnergy,
4427 BigReal *h_intKineticEnergy,
4428 cudaTensor *intVirialNormal,
4429 cudaTensor *intVirialNbond,
4430 cudaTensor *intVirialSlow,
4431 cudaTensor *h_intVirialNormal,
4432 cudaTensor *h_intVirialNbond,
4433 cudaTensor *h_intVirialSlow,
4434 cudaTensor* rigidVirial,
4435 cudaTensor* h_rigidVirial,
4436 unsigned int *tbcatomic,
4440 cudaStream_t stream)
4443 int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
4444 #define CALL(DO_FIXED, DO_MTS) \
4445 submitReduction2Kernel<ATOM_BLOCKS, DO_FIXED, DO_MTS><<<grid, ATOM_BLOCKS, 0, stream>>>( \
4446 atomFixed, pos_x, pos_y, pos_z, vel_x, vel_y, vel_z, \
4447 rcm_x, rcm_y, rcm_z, vcm_x, vcm_y, vcm_z, \
4448 f_normal_x, f_normal_y, f_normal_z, \
4449 f_nbond_x, f_nbond_y, f_nbond_z, \
4450 f_slow_x, f_slow_y, f_slow_z, \
4451 mass, hydrogenGroupSize, \
4452 kineticEnergy, h_kineticEnergy, \
4453 intKineticEnergy, h_intKineticEnergy, \
4454 intVirialNormal, intVirialNbond, intVirialSlow, \
4455 h_intVirialNormal, h_intVirialNbond, h_intVirialSlow, rigidVirial, \
4456 h_rigidVirial, tbcatomic, numAtoms, maxForceNumber)
4458 if (doMTS) CALL(true, true);
4459 else CALL(true, false);
4461 else if (!doFixedAtoms) {
4462 if (doMTS) CALL(false, true);
4463 else CALL(false, false);
4465 else NAMD_bug("No kernel was called in SequencerCUDAKernel::submitReduction2!\n");
4467 // cudaCheck(cudaGetLastError());
4470 __global__ void langevinVelocitiesBBK1Kernel(
4472 const float * __restrict langevinParam,
4473 double * __restrict vel_x,
4474 double * __restrict vel_y,
4475 double * __restrict vel_z,
4478 BigReal dt = timestep * (0.001 * TIMEFACTOR);
4479 int i = threadIdx.x + blockIdx.x*blockDim.x;
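// BBK half-step damping: scale velocities by (1 - gamma*dt/2), with gamma = langevinParam[i]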
4481 BigReal dt_gamma = dt * langevinParam[i];
4482 BigReal scaling = 1. - 0.5 * dt_gamma;
4483 vel_x[i] *= scaling;
4484 vel_y[i] *= scaling;
4485 vel_z[i] *= scaling;
4489 void SequencerCUDAKernel::langevinVelocitiesBBK1(
4491 const float *langevinParam,
4496 cudaStream_t stream)
4498 int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
4499 langevinVelocitiesBBK1Kernel<<<grid, ATOM_BLOCKS, 0, stream>>>(
4500 timestep, langevinParam, vel_x, vel_y, vel_z, numAtoms);
4501 // cudaCheck(cudaGetLastError());
4504 __global__ void langevinVelocitiesBBK2Kernel(
4506 const float * __restrict langScalVelBBK2,
4507 const float * __restrict langScalRandBBK2,
4508 const float * __restrict gaussrand_x,
4509 const float * __restrict gaussrand_y,
4510 const float * __restrict gaussrand_z,
4511 double * __restrict vel_x,
4512 double * __restrict vel_y,
4513 double * __restrict vel_z,
4516 int i = threadIdx.x + blockIdx.x*blockDim.x;
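// BBK second half: add the Gaussian random kick (scaled by langScalRandBBK2), then apply the velocity damping factor langScalVelBBK2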
4518 vel_x[i] += gaussrand_x[i] * langScalRandBBK2[i];
4519 vel_y[i] += gaussrand_y[i] * langScalRandBBK2[i];
4520 vel_z[i] += gaussrand_z[i] * langScalRandBBK2[i];
4521 vel_x[i] *= langScalVelBBK2[i];
4522 vel_y[i] *= langScalVelBBK2[i];
4523 vel_z[i] *= langScalVelBBK2[i];
4529 void SequencerCUDAKernel::langevinVelocitiesBBK2(
4531 const float *langScalVelBBK2,
4532 const float *langScalRandBBK2,
4540 const int numAtomsGlobal,
4542 curandGenerator_t gen,
4543 cudaStream_t stream)
4546 // Note: generating with n == nAtomsDevice + 1 was observed to trigger a misaligned-address error,
4547 // so the full gaussrand array is generated without striding for multi-GPU simulations for now.
4549 // Set the generator stream (hiprandSetStream under HIP) so RNG launches run on the current stream.
4550 curandCheck(curandSetStream(gen, stream));
4551 // array buffers have to be even length for pseudorandom normal distribution
4552 // choose n to be the smallest even number >= numAtoms
4553 // we can choose 1 larger than numAtoms since allocation is > numAtoms
4554 int n = (numAtomsGlobal + 1) & (~1);
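// (x + 1) & ~1 rounds x up to the next even number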
4556 curandCheck(curandGenerateNormal(gen, gaussrand_x, n, 0, 1));
4557 curandCheck(curandGenerateNormal(gen, gaussrand_y, n, 0, 1));
4558 curandCheck(curandGenerateNormal(gen, gaussrand_z, n, 0, 1));
4559 int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
4560 langevinVelocitiesBBK2Kernel<<<grid, ATOM_BLOCKS, 0, stream>>>(
4561 timestep, langScalVelBBK2, langScalRandBBK2,
4562 gaussrand_x + stride, gaussrand_y + stride, gaussrand_z + stride,
4563 vel_x, vel_y, vel_z, numAtoms);
4564 // cudaCheck(cudaGetLastError());
4567 template <bool doFixed>
4568 __global__ void reassignVelocitiesKernel(
4569 const BigReal timestep,
4570 const int* __restrict atomFixed,
4571 const float * __restrict gaussrand_x,
4572 const float * __restrict gaussrand_y,
4573 const float * __restrict gaussrand_z,
4574 double * __restrict vel_x,
4575 double * __restrict vel_y,
4576 double * __restrict vel_z,
4577 const double* __restrict d_recipMass,
4581 //! note we are NOT supporting LES (locally enhanced sampling) here
4582 //! hence no switch on the partition to apply tempfactor
4585 int i = threadIdx.x + blockIdx.x*blockDim.x;
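// Maxwell-Boltzmann resampling: each velocity component is a unit Gaussian scaled by sqrt(kbT/m)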
4593 vel_x[i]=gaussrand_x[i]*sqrt(kbT*d_recipMass[i]);
4594 vel_y[i]=gaussrand_y[i]*sqrt(kbT*d_recipMass[i]);
4595 vel_z[i]=gaussrand_z[i]*sqrt(kbT*d_recipMass[i]);
4598 vel_x[i]=gaussrand_x[i]*sqrt(kbT*d_recipMass[i]);
4599 vel_y[i]=gaussrand_y[i]*sqrt(kbT*d_recipMass[i]);
4600 vel_z[i]=gaussrand_z[i]*sqrt(kbT*d_recipMass[i]);
4606 void SequencerCUDAKernel::reassignVelocities(
4607 const BigReal timestep,
4608 const bool doFixedAtoms,
4609 const int* atomFixed,
4616 const double* d_recipMass,
4619 const int numAtomsGlobal,
4621 curandGenerator_t gen,
4622 cudaStream_t stream)
4625 curandCheck(curandSetStream(gen, stream));
4626 // array buffers have to be even length for pseudorandom normal distribution
4627 // choose n to be the smallest even number >= numAtoms
4628 // we can choose 1 larger than numAtoms since allocation is > numAtoms
4629 int n = (numAtomsGlobal + 1) & (~1);
4631 curandCheck(curandGenerateNormal(gen, gaussrand_x, n, 0, 1));
4632 curandCheck(curandGenerateNormal(gen, gaussrand_y, n, 0, 1));
4633 curandCheck(curandGenerateNormal(gen, gaussrand_z, n, 0, 1));
4634 int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
4635 #define CALL(DOFIXED) \
4636 reassignVelocitiesKernel<DOFIXED><<<grid, ATOM_BLOCKS, 0, stream>>>( \
4637 timestep, atomFixed, \
4638 gaussrand_x + stride, gaussrand_y + stride, gaussrand_z + stride, \
4639 vel_x, vel_y, vel_z, d_recipMass, kbT, numAtoms)
4640 if (doFixedAtoms) CALL(true);
4643 cudaCheck(cudaStreamSynchronize(stream));
4649 __global__ void updateVelocities(const int nRattles,
4652 const int * __restrict settleList,
4653 const CudaRattleElem * __restrict rattleList,
4654 const double * __restrict pos_x,
4655 const double * __restrict pos_y,
4656 const double * __restrict pos_z,
4657 const double * __restrict posNew_x,
4658 const double * __restrict posNew_y,
4659 const double * __restrict posNew_z,
4664 int tid = threadIdx.x + blockIdx.x*blockDim.x;
4665 int stride = gridDim.x*blockDim.x;
4668 for(int i = tid; i < nSettles; i += stride){
4669 int ig = settleList[i];
4670 // Recompute velocities from the constrained position update
4671 // for the three atoms of this SETTLE group
4672 velNew_x[ig] = (posNew_x[ig] - pos_x[ig]) * invdt;
4673 velNew_y[ig] = (posNew_y[ig] - pos_y[ig]) * invdt;
4674 velNew_z[ig] = (posNew_z[ig] - pos_z[ig]) * invdt;
4676 velNew_x[ig+1] = (posNew_x[ig+1] - pos_x[ig+1]) * invdt;
4677 velNew_y[ig+1] = (posNew_y[ig+1] - pos_y[ig+1]) * invdt;
4678 velNew_z[ig+1] = (posNew_z[ig+1] - pos_z[ig+1]) * invdt;
4680 velNew_x[ig+2] = (posNew_x[ig+2] - pos_x[ig+2]) * invdt;
4681 velNew_y[ig+2] = (posNew_y[ig+2] - pos_y[ig+2]) * invdt;
4682 velNew_z[ig+2] = (posNew_z[ig+2] - pos_z[ig+2]) * invdt;
4686 for(int i = tid; i < nRattles; i += stride){
4687 int ig = rattleList[i].ig;
4688 int icnt = rattleList[i].icnt;
4690 fixed[0] = false; fixed[1] = false;
4691 fixed[2] = false; fixed[3] = false;
4692 for(int j = 0; j < icnt; j++){
4693 //Get the two atom indices of this constrained bond
4694 int ia = rattleList[i].params[j].ia;
4695 int ib = rattleList[i].params[j].ib;
4696 //checks if any of these positions have been updated yet
4698 velNew_x[ig+ia] = (posNew_x[ig+ia] - pos_x[ig+ia]) * invdt;
4699 velNew_y[ig+ia] = (posNew_y[ig+ia] - pos_y[ig+ia]) * invdt;
4700 velNew_z[ig+ia] = (posNew_z[ig+ia] - pos_z[ig+ia]) * invdt;
4704 velNew_x[ig+ib] = (posNew_x[ig+ib] - pos_x[ig+ib]) * invdt;
4705 velNew_y[ig+ib] = (posNew_y[ig+ib] - pos_y[ig+ib]) * invdt;
4706 velNew_z[ig+ib] = (posNew_z[ig+ib] - pos_z[ig+ib]) * invdt;
4713 //JM: Function that adds a correction to the bonded forces
4714 template<bool doEnergy, bool doFixed>
4715 __global__ void addRattleForce(int numAtoms,
4717 const int* __restrict atomFixed,
4718 const float * __restrict mass,
4719 const double * __restrict pos_x,
4720 const double * __restrict pos_y,
4721 const double * __restrict pos_z,
4722 double * __restrict vel_x,
4723 double * __restrict vel_y,
4724 double * __restrict vel_z,
4725 double * __restrict velNew_x,
4726 double * __restrict velNew_y,
4727 double * __restrict velNew_z,
4728 double * __restrict f_normal_x,
4729 double * __restrict f_normal_y,
4730 double * __restrict f_normal_z,
4731 cudaTensor* __restrict virial,
4732 cudaTensor* __restrict h_virial,
4733 unsigned int* tbcatomic){
4736 // zero_cudaTensor(lVirial);
4737 int i = threadIdx.x + blockIdx.x*blockDim.x;
4741 lVirial.xx = 0.0; lVirial.xy = 0.0; lVirial.xz = 0.0;
4742 lVirial.yx = 0.0; lVirial.yy = 0.0; lVirial.yz = 0.0;
4743 lVirial.zx = 0.0; lVirial.zy = 0.0; lVirial.zz = 0.0;
4744 //for(int i = tid; i < numAtoms; i += stride){
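// constraint force correction: df = m * (v_constrained - v_unconstrained) / dt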
4746 df[0] = (velNew_x[i] - vel_x[i]) * ((double)mass[i] * invdt);
4747 df[1] = (velNew_y[i] - vel_y[i]) * ((double)mass[i] * invdt);
4748 df[2] = (velNew_z[i] - vel_z[i]) * ((double)mass[i] * invdt);
4750 pos[0] = pos_x[i]; pos[1] = pos_y[i]; pos[2] = pos_z[i];
4751 lVirial.xx += df[0] * pos[0];
4752 lVirial.xy += df[0] * pos[1];
4753 lVirial.xz += df[0] * pos[2];
4754 lVirial.yx += df[1] * pos[0];
4755 lVirial.yy += df[1] * pos[1];
4756 lVirial.yz += df[1] * pos[2];
4757 lVirial.zx += df[2] * pos[0];
4758 lVirial.zy += df[2] * pos[1];
4759 lVirial.zz += df[2] * pos[2];
4762 //Updates force and velocities
4763 f_normal_x[i] += df[0];
4764 f_normal_y[i] += df[1];
4765 f_normal_z[i] += df[2];
4767 // XXX TODO: swap the pointers instead of copying
4768 vel_x[i] = velNew_x[i];
4769 vel_y[i] = velNew_y[i];
4770 vel_z[i] = velNew_z[i];
4773 // Make sure every thread in the block has set its lVirial,
4774 // then block-reduce each component of the virial tensor
4775 // to get consistent values.
4776 // The virial tensor is expected to be symmetric.
4777 // XXX TODO: Handwrite the reductions to avoid syncthreads
4779 typedef cub::BlockReduce<BigReal, ATOM_BLOCKS> BlockReduce;
4780 __shared__ typename BlockReduce::TempStorage temp_storage;
4781 lVirial.xx = BlockReduce(temp_storage).Sum(lVirial.xx); __syncthreads();
4783 lVirial.xy = BlockReduce(temp_storage).Sum(lVirial.xy); __syncthreads();
4784 lVirial.xz = BlockReduce(temp_storage).Sum(lVirial.xz); __syncthreads();
4786 lVirial.yx = BlockReduce(temp_storage).Sum(lVirial.yx); __syncthreads();
4787 lVirial.yy = BlockReduce(temp_storage).Sum(lVirial.yy);
4790 lVirial.yz = BlockReduce(temp_storage).Sum(lVirial.yz); __syncthreads();
4792 lVirial.zx = BlockReduce(temp_storage).Sum(lVirial.zx); __syncthreads();
4793 lVirial.zy = BlockReduce(temp_storage).Sum(lVirial.zy); __syncthreads();
4794 lVirial.zz = BlockReduce(temp_storage).Sum(lVirial.zz); __syncthreads();
4796 // Every block now holds a locally reduced virial;
4797 // thread 0 of each block atomically adds it into the global (binned) virial
4798 if(threadIdx.x == 0){
4799 const int bin = blockIdx.x % ATOMIC_BINS;
4801 atomicAdd(&virial[bin].xx, lVirial.xx);
4803 atomicAdd(&virial[bin].xy, lVirial.xy);
4804 atomicAdd(&virial[bin].xz, lVirial.xz);
4806 atomicAdd(&virial[bin].yx, lVirial.yx);
4807 atomicAdd(&virial[bin].yy, lVirial.yy);
4809 atomicAdd(&virial[bin].yz, lVirial.yz);
4811 atomicAdd(&virial[bin].zx, lVirial.zx);
4812 atomicAdd(&virial[bin].zy, lVirial.zy);
4813 atomicAdd(&virial[bin].zz, lVirial.zz);
4819 template <bool fillIndexes>
4820 __global__ void buildRattleParams(const int size,
4824 // XXX TODO: decide whether separate buffers are needed for the rattle forces; reuse these for now
4825 const float * __restrict mass,
4826 const int * __restrict hydrogenGroupSize,
4827 const float * __restrict rigidBondLength,
4828 const int * __restrict atomFixed,
4829 CudaRattleElem* rattleList,
4832 int tid = threadIdx.x + blockIdx.x*blockDim.x;
4833 int stride = gridDim.x * blockDim.x;
4842 for(int k = tid; k < size; k += stride){
4844 i = rattleIndexes[k];
4848 hgs = hydrogenGroupSize[ig];
4849 if(hgs == 1) continue; // single-atom group: nothing to constrain
4852 for(int j = 0; j < hgs; j++){
4853 rmass[j] = (mass[ig + j] > 0.f ? __frcp_rn(mass[ig+j]) : 0.f);
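// reciprocal mass; stored as 0 (rather than inf) for massless particles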
4854 fixed[j] = atomFixed[ig+j];
4860 float tmp = rigidBondLength[ig];
4861 //We bail here if hydrogens are not fixed
4863 if(!anyfixed) continue;
4864 if( !(fixed[1] && fixed[2]) ){
4865 dsq[icnt] = tmp*tmp;
4871 rattleIndexes[i] = i;
4874 for(int j = 1; j < hgs; j++){
4875 if((tmp = rigidBondLength[ig+j]) > 0){
4876 if( !(fixed[0] && fixed[j])){
4877 dsq[icnt] = tmp * tmp;
4883 // XXX TODO: this performs an atomicAdd for each rattle, which is costly;
4884 // improve it later
4885 rattleList[k].ig = ig;
4886 rattleList[k].icnt = icnt;
4887 for(int j = 0; j < icnt; j++){
4890 rattleList[k].params[j].ia = ia;
4891 rattleList[k].params[j].ib = ib;
4892 rattleList[k].params[j].dsq = dsq[j];
4893 rattleList[k].params[j].rma = rmass[ia];
4894 rattleList[k].params[j].rmb = rmass[ib];
4900 void buildRattleLists(
4901 const bool doFixedAtoms,
4906 const int *hydrogenGroupSize,
4907 const float *rigidBondLength,
4908 const int *atomFixed,
4910 size_t& settleListSize,
4912 size_t& consFailureSize,
4913 CudaRattleElem **rattleList,
4914 size_t& rattleListSize,
4922 char*& d_rattleList_temp_storage,
4923 size_t& temp_storage_bytes,
4924 int*& rattleIndexes,
4925 size_t& rattleIndexes_size,
4927 cudaStream_t stream){
4930 //thrust::device_ptr<const int> h_dev = thrust::device_pointer_cast(hydrogenGroupSize)
4931 size_t st_hgi_size = (size_t)(hgi_size);
4932 reallocate_device<int>(&hgi, &st_hgi_size, natoms, OVERALLOC);
4933 hgi_size = (size_t)(st_hgi_size);
4934 size_t temp_length = 1;
4935 reallocate_device<int>(&d_nHG, &temp_length, 1);
4936 reallocate_device<int>(&d_nSettles, &temp_length, 1);
4937 reallocate_device<int>(&d_nRattles, &temp_length, 1);
4939 copy_DtoD<int>(hydrogenGroupSize, hgi, natoms, stream);
4940 size_t new_storage_bytes = 0;
4941 // Removes zeroes from HGI array
4942 // Pass NULL to first CUB call to get temp buffer size
4943 cudaCheck(cub::DeviceSelect::If(NULL, new_storage_bytes, hgi, hgi,
4944 d_nHG, natoms, notZero(), stream));
4945 reallocate_device<char>(&d_rattleList_temp_storage, &temp_storage_bytes, new_storage_bytes, OVERALLOC);
4946 // DC: There were issues where reallocate wanted int types and CUB wanted unsigned ints
4947 new_storage_bytes = temp_storage_bytes;
4948 cudaCheck( cub::DeviceSelect::If(d_rattleList_temp_storage, new_storage_bytes, hgi, hgi,
4949 d_nHG, natoms, notZero(), stream));
4952 copy_DtoH<int>(d_nHG, &temp, 1, stream );
4955 cudaCheck(cudaStreamSynchronize(stream));
4957 // Exclusive scan to build an array of indices
4958 new_storage_bytes = 0;
4959 cub::DeviceScan::ExclusiveSum(NULL,
4960 new_storage_bytes, hgi, hgi, nHG, stream);
4961 reallocate_device<char>((char**) &d_rattleList_temp_storage, &temp_storage_bytes, new_storage_bytes, OVERALLOC);
4962 new_storage_bytes = temp_storage_bytes;
4963 cub::DeviceScan::ExclusiveSum(d_rattleList_temp_storage, new_storage_bytes,
4964 hgi, hgi, nHG, stream);
4966 size_t st_tempListSize = (size_t)(settleListSize);
4967 reallocate_device<int>(settleList, &st_tempListSize, nHG, OVERALLOC);
4968 settleListSize = (size_t)(st_tempListSize);
4970 st_tempListSize = (size_t)(rattleListSize);
4971 reallocate_device<CudaRattleElem>(rattleList, &st_tempListSize, nHG, OVERALLOC);
4972 rattleListSize = (size_t)(st_tempListSize);
4974 st_tempListSize = (size_t)(consFailureSize);
4975 reallocate_device<int>(consFailure, &st_tempListSize, nHG, OVERALLOC);
4976 consFailureSize = (size_t)(st_tempListSize);
4978 st_tempListSize = (size_t)(rattleIndexes_size);
4979 reallocate_device<int>(&rattleIndexes, &st_tempListSize, nHG, OVERALLOC);
4980 rattleIndexes_size = (size_t)(st_tempListSize);
4983 cudaCheck(cudaMemsetAsync(*rattleList, 0, sizeof(CudaRattleElem)*nHG, stream));
4984 cudaCheck(cudaMemsetAsync(rattleIndexes,-1, sizeof(int)*nHG, stream));
4985 cudaCheck(cudaMemsetAsync(*settleList, 0, sizeof(int)*nHG, stream));
4987 ///thrust::device_vector<int> hgi(natoms);
4988 new_storage_bytes = 0;
4990 isWater<true> comp(rigidBondLength, hydrogenGroupSize, atomFixed);
4991 // builds SettleList with isWater functor
4992 cub::DeviceSelect::If(NULL, new_storage_bytes, hgi, *settleList,
4993 d_nSettles, nHG, comp, stream);
4994 reallocate_device<char>((char**) &d_rattleList_temp_storage, &temp_storage_bytes, new_storage_bytes, OVERALLOC);
4995 new_storage_bytes = temp_storage_bytes;
4996 cub::DeviceSelect::If(d_rattleList_temp_storage, new_storage_bytes, hgi, *settleList,
4997 d_nSettles, nHG, comp, stream);
4999 isWater<false> comp(rigidBondLength, hydrogenGroupSize, atomFixed);
5000 // builds SettleList with isWater functor
5001 cub::DeviceSelect::If(NULL, new_storage_bytes, hgi, *settleList,
5002 d_nSettles, nHG, comp, stream);
5003 reallocate_device<char>((char**) &d_rattleList_temp_storage, &temp_storage_bytes, new_storage_bytes, OVERALLOC);
5004 new_storage_bytes = temp_storage_bytes;
5005 cub::DeviceSelect::If(d_rattleList_temp_storage, new_storage_bytes, hgi, *settleList,
5006 d_nSettles, nHG, comp, stream);
5009 // nSettles now holds the number of SETTLE water groups
5010 copy_DtoH<int>(d_nSettles, &nSettles, 1, stream);
5011 // fprintf(stderr, "nSettles = %d", nSettles);
5013 // First pass of buildRattleParams (fillIndexes = true): mark the rattle groups in rattleIndexes
5014 buildRattleParams<true><<<128, 128, 0, stream>>>(nHG, dt, invdt,
5015 hgi, mass, hydrogenGroupSize,
5016 rigidBondLength, atomFixed, *rattleList, rattleIndexes);
5017 cudaCheck(cudaGetLastError());
5019 // Removing -1 from rattleIndexes
5020 new_storage_bytes = 0;
5021 cub::DeviceSelect::If(NULL, new_storage_bytes,
5022 rattleIndexes, rattleIndexes, d_nRattles, nHG, validRattle(), stream);
5023 reallocate_device<char>((char**) &d_rattleList_temp_storage, &temp_storage_bytes, new_storage_bytes, OVERALLOC);
5024 new_storage_bytes = temp_storage_bytes;
5025 cub::DeviceSelect::If(d_rattleList_temp_storage, new_storage_bytes,
5026 rattleIndexes, rattleIndexes, d_nRattles, nHG, validRattle(), stream);
5027 copy_DtoH<int>(d_nRattles, &nRattles, 1, stream);
5029 // Second pass (fillIndexes = false): compute the rattle parameters for the compacted rattleIndexes
5030 buildRattleParams<false><<<128, 128, 0, stream>>>(nRattles, dt, invdt,
5031 hgi, mass, hydrogenGroupSize,
5032 rigidBondLength, atomFixed, *rattleList, rattleIndexes);
5034 cudaCheck(cudaGetLastError());
5037 // XXX TODO: JM: Memory access pattern in this function is bad, since we have to
5038 // access memory by the hydrogen groups. Improve it later
5040 void SequencerCUDAKernel::rattle1(
5041 const bool doFixedAtoms,
5042 const bool doEnergy,
5043 const bool pressure, // pressure is only false for startRun1 and startRun2 right now
5063 const int *hydrogenGroupSize,
5064 const float *rigidBondLength,
5066 const int *atomFixed,
5068 size_t& settleListSize,
5069 int **consFailure_d,
5070 size_t& consFailureSize,
5071 CudaRattleElem **rattleList,
5072 size_t& rattleListSize,
5076 cudaTensor *h_virial,
5077 unsigned int* tbcatomic,
5079 SettleParameters *sp,
5082 const WaterModel water_model,
5083 cudaStream_t stream)
5085 //Calls the necessary functions to enforce rigid bonds
5086 int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
5088 cudaCheck(cudaStreamSynchronize(stream));
5089 if(migration || !firstRattleDone){
5090 // (Re)build the SETTLE/RATTLE lists (and reallocate their buffers) here
5092 doFixedAtoms, numAtoms, dt, invdt, mass, hydrogenGroupSize,
5093 rigidBondLength, atomFixed, settleList, settleListSize,
5094 consFailure_d, consFailureSize, rattleList, rattleListSize,
5096 d_nHG, d_nSettles, d_nRattles, hgi, hgi_size,
5097 d_rattleList_temp_storage, temp_storage_bytes,
5098 rattleIndexes, rattleIndexes_size,
5100 firstRattleDone = true;
5105 this->updateRigidArrays(
5106 doFixedAtoms, dt, atomFixed,
5107 vel_x, vel_y, vel_z,
5108 pos_x, pos_y, pos_z,
5109 velNew_x, velNew_y, velNew_z,
5110 posNew_x, posNew_y, posNew_z,
5114 // if(*nSettle != 0){
5117 numAtoms, dt, invdt, *nSettle,
5118 vel_x, vel_y, vel_z,
5119 pos_x, pos_y, pos_z,
5120 velNew_x, velNew_y, velNew_z,
5121 posNew_x, posNew_y, posNew_z,
5122 f_normal_x, f_normal_y, f_normal_z, virial, mass,
5123 hydrogenGroupSize, rigidBondLength, atomFixed,
5124 *settleList, sp, water_model, stream);
5130 doEnergy, doFixedAtoms, *rattleList, *nRattle, hydrogenGroupSize,
5131 atomFixed, pos_x, pos_y, pos_z,
5132 posNew_x, posNew_y, posNew_z,
5133 vel_x, vel_y, vel_z,
5134 velNew_x, velNew_y,velNew_z,
5135 f_normal_x, f_normal_y, f_normal_z, virial, mass,
5136 invdt, tol2, maxiter, *consFailure_d,
5137 consFailure, stream);
5140 Rattle1Kernel<<<grid, ATOM_BLOCKS, 0, stream>>>(numAtoms,
5141 dt, invdt, *nSettle,
5142 vel_x, vel_y, vel_z,
5143 pos_x, pos_y, pos_z,
5144 velNew_x, velNew_y, velNew_z,
5145 posNew_x, posNew_y, posNew_z,
5146 f_normal_x, f_normal_y, f_normal_z,
5148 hydrogenGroupSize, rigidBondLength, atomFixed,
5150 *rattleList, *nRattle, tol2, maxiter, *consFailure_d);
5155 // only for the first time step
5156 copy_DtoD<double>(posNew_x, pos_x, numAtoms, stream);
5157 copy_DtoD<double>(posNew_y, pos_y, numAtoms, stream);
5158 copy_DtoD<double>(posNew_z, pos_z, numAtoms, stream);
5159 cudaCheck(cudaStreamSynchronize(stream));
5160 }else if (pressure == 0){ // pressure is zero for the first timesteps
5161 copy_DtoD<double>(velNew_x, vel_x, numAtoms, stream);
5162 copy_DtoD<double>(velNew_y, vel_y, numAtoms, stream);
5163 copy_DtoD<double>(velNew_z, vel_z, numAtoms, stream);
5164 cudaCheck(cudaStreamSynchronize(stream));
5166 #define CALL(DOENERGY, DOFIXED) \
5167 addRattleForce<DOENERGY, DOFIXED><<<grid, ATOM_BLOCKS, 0 ,stream>>>( \
5168 numAtoms, invdt, atomFixed, mass, \
5169 pos_x, pos_y, pos_z, \
5170 vel_x, vel_y, vel_z, \
5171 velNew_x, velNew_y, velNew_z, \
5172 f_normal_x, f_normal_y, f_normal_z, virial, h_virial, tbcatomic)
5173 if (doEnergy && doFixedAtoms) CALL(true, true);
5174 else if (doEnergy && !doFixedAtoms) CALL(true, false);
5175 else if (!doEnergy && doFixedAtoms) CALL(false, true);
5176 else if (!doEnergy && !doFixedAtoms) CALL(false, false);
5177 else NAMD_bug("addRattleForce was not called in SequencerCUDAKernel::rattle1!\n");
5182 // Call the fused rattle kernel without any virial tensor update
5183 // Compute the number of SETTLE and SHAKE blocks
5184 int nSettleBlocks = (*nSettle + 128 -1 ) / 128;
5185 int nShakeBlocks = (*nRattle + 128 -1 ) / 128;
5186 // int nTotalBlocks = (nSettleBlocks + nShakeBlocks);
5187 if (doFixedAtoms) NAMD_bug("CallRattle1Kernel does not support fixed atoms!\n");
5189 numAtoms, dt, invdt, *nSettle,
5190 vel_x, vel_y, vel_z,
5191 pos_x, pos_y, pos_z,
5192 f_normal_x, f_normal_y, f_normal_z,
5193 mass, hydrogenGroupSize, rigidBondLength, atomFixed,
5195 *rattleList, *nRattle, tol2, maxiter, *consFailure_d, nSettleBlocks,
5196 nShakeBlocks, water_model, stream
5203 template <bool T_NORMALIZED, bool T_DOENERGY>
5204 __global__ void eFieldKernel(
5206 const double3 eField,
5207 const double eFieldOmega,
5208 const double eFieldPhi,
5211 const char3* __restrict transform,
5212 const float* __restrict charges,
5213 const double* __restrict pos_x,
5214 const double* __restrict pos_y,
5215 const double* __restrict pos_z,
5216 double* __restrict f_normal_x,
5217 double* __restrict f_normal_y,
5218 double* __restrict f_normal_z,
5219 double3* __restrict d_extForce,
5220 cudaTensor* __restrict d_extVirial,
5221 double* __restrict d_extEnergy,
5222 double3* __restrict h_extForce,
5223 cudaTensor* __restrict h_extVirial,
5224 double* __restrict h_extEnergy,
5225 unsigned int* tbcatomic)
5227 int i = threadIdx.x + blockIdx.x*blockDim.x;
5230 // This can all be moved outside of the kernel
5233 double eCos = cos(eFieldOmega * t - eFieldPhi);
5234 double charge = charges[i];
5235 double3 r_extForce = {0, 0, 0};
5236 double r_extEnergy = 0;
5237 cudaTensor r_extVirial = {0};
5238 int totaltb = gridDim.x;
5239 __shared__ bool isLastBlockDone;
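// time-dependent field amplitude: E(t) = E0 * cos(eFieldOmega*t - eFieldPhi)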
5240 eField1.x = eCos * eField.x;
5241 eField1.y = eCos * eField.y;
5242 eField1.z = eCos * eField.z;
5244 if(threadIdx.x == 0) {
5245 isLastBlockDone = false;
5251 // This can be moved outside of the kernel
5253 eField1 = Vector(lat.a_r()* (Vector)eField1, lat.b_r()* (Vector)eField1, lat.c_r()* (Vector)eField1);
5256 fx = charge * eField1.x;
5257 fy = charge * eField1.y;
5258 fz = charge * eField1.z;
5260 f_normal_x[i] += fx;
5261 f_normal_y[i] += fy;
5262 f_normal_z[i] += fz;
5265 const char3 tr = transform[i];
5271 // all threads in the block compute their contributions to the energy, extForce, and extVirial
5272 vpos = lat.reverse_transform(vpos, tr);
5273 double3 o = lat.origin();
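// energy contribution of this atom: -F . (r - origin)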
5274 r_extEnergy -= (fx * (vpos.x - o.x)) + (fy * (vpos.y - o.y)) + (fz * (vpos.z - o.z));
5281 // outer product to calculate a virial here
5282 r_extVirial.xx = fx * vpos.x;
5283 r_extVirial.xy = fx * vpos.y;
5284 r_extVirial.xz = fx * vpos.z;
5285 r_extVirial.yx = fy * vpos.x;
5286 r_extVirial.yy = fy * vpos.y;
5287 r_extVirial.yz = fy * vpos.z;
5288 r_extVirial.zx = fz * vpos.x;
5289 r_extVirial.zz = fz * vpos.z;
5290 r_extVirial.zy = fz * vpos.y;
5297 // use cub to reduce energies
5298 typedef cub::BlockReduce<double, ATOM_BLOCKS> BlockReduce;
5299 __shared__ typename BlockReduce::TempStorage temp_storage;
5300 r_extEnergy = BlockReduce(temp_storage).Sum(r_extEnergy);
5303 // Do the remaining reductions
5305 r_extForce.x = BlockReduce(temp_storage).Sum(r_extForce.x);
5307 r_extForce.y = BlockReduce(temp_storage).Sum(r_extForce.y);
5309 r_extForce.z = BlockReduce(temp_storage).Sum(r_extForce.z);
5312 r_extVirial.xx = BlockReduce(temp_storage).Sum(r_extVirial.xx);
5314 r_extVirial.xy = BlockReduce(temp_storage).Sum(r_extVirial.xy);
5316 r_extVirial.xz = BlockReduce(temp_storage).Sum(r_extVirial.xz);
5318 r_extVirial.yx = BlockReduce(temp_storage).Sum(r_extVirial.yx);
5320 r_extVirial.yy = BlockReduce(temp_storage).Sum(r_extVirial.yy);
5322 r_extVirial.yz = BlockReduce(temp_storage).Sum(r_extVirial.yz);
5324 r_extVirial.zx = BlockReduce(temp_storage).Sum(r_extVirial.zx);
5326 r_extVirial.zy = BlockReduce(temp_storage).Sum(r_extVirial.zy);
5328 r_extVirial.zz = BlockReduce(temp_storage).Sum(r_extVirial.zz);
5332 // atomically accumulate the block results into global memory (binned)
5333 if(threadIdx.x == 0 ){
5334 const int bin = blockIdx.x % ATOMIC_BINS;
5336 // printf("adding %lf to d_extEnergy\n", r_extEnergy);
5337 atomicAdd(&d_extEnergy[bin], r_extEnergy);
5340 // add force and virial as well
5341 atomicAdd(&d_extForce[bin].x, r_extForce.x);
5342 atomicAdd(&d_extForce[bin].y, r_extForce.y);
5343 atomicAdd(&d_extForce[bin].z, r_extForce.z);
5345 atomicAdd(&d_extVirial[bin].xx, r_extVirial.xx);
5346 atomicAdd(&d_extVirial[bin].xy, r_extVirial.xy);
5347 atomicAdd(&d_extVirial[bin].xz, r_extVirial.xz);
5349 atomicAdd(&d_extVirial[bin].yx, r_extVirial.yx);
5350 atomicAdd(&d_extVirial[bin].yy, r_extVirial.yy);
5351 atomicAdd(&d_extVirial[bin].yz, r_extVirial.yz);
5353 atomicAdd(&d_extVirial[bin].zx, r_extVirial.zx);
5354 atomicAdd(&d_extVirial[bin].zy, r_extVirial.zy);
5355 atomicAdd(&d_extVirial[bin].zz, r_extVirial.zz);
5359 unsigned int value = atomicInc(&tbcatomic[2], totaltb);
5360 isLastBlockDone = (value == (totaltb -1));
5364 if(isLastBlockDone){
5365 if(threadIdx.x < ATOMIC_BINS){
5366 const int bin = threadIdx.x;
5368 double e = d_extEnergy[bin];
5372 f = d_extForce[bin];
5373 v = d_extVirial[bin];
5376 // sets device scalars back to zero
5377 d_extEnergy[bin] = 0.0;
5379 d_extForce[bin] = make_double3(0.0, 0.0, 0.0);
5380 zero_cudaTensor(d_extVirial[bin]);
5383 if(ATOMIC_BINS > 1){
5384 typedef cub::WarpReduce<double, (ATOMIC_BINS > 1 ? ATOMIC_BINS : 2)> WarpReduce;
5385 typedef cub::WarpReduce<cudaTensor, (ATOMIC_BINS > 1 ? ATOMIC_BINS : 2)> WarpReduceT;
5386 __shared__ typename WarpReduce::TempStorage tempStorage;
5387 __shared__ typename WarpReduceT::TempStorage tempStorageT;
5389 e = WarpReduce(tempStorage).Sum(e);
5391 f.x = WarpReduce(tempStorage).Sum(f.x);
5392 f.y = WarpReduce(tempStorage).Sum(f.y);
5393 f.z = WarpReduce(tempStorage).Sum(f.z);
5394 v = WarpReduceT(tempStorageT).Sum(v);
5398 if(threadIdx.x == 0){
5405 //resets atomic counter
5406 reset_atomic_counter(&(tbcatomic[2]));
5413 // JM NOTE: Apply external electric field to every atom in the system equally
5414 void SequencerCUDAKernel::apply_Efield(
5416 const bool normalized,
5417 const bool doEnergy,
5418 const double3 eField,
5419 const double eFieldOmega,
5420 const double eFieldPhi,
5423 const char3* transform,
5424 const float* charges,
5425 const double* pos_x,
5426 const double* pos_y,
5427 const double* pos_z,
5431 double3* d_extForce,
5432 cudaTensor* d_extVirial,
5433 double* d_extEnergy,
5434 double3* h_extForce,
5435 cudaTensor* h_extVirial,
5436 double* h_extEnergy,
5437 unsigned int* tbcatomic,
5440 int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
5443 eFieldKernel<true, true><<<grid, ATOM_BLOCKS, 0, stream>>>(numAtoms,
5444 eField, eFieldOmega, eFieldPhi, t, lat, transform, charges,
5445 pos_x, pos_y, pos_z,
5446 f_normal_x, f_normal_y, f_normal_z, d_extForce, d_extVirial,
5447 d_extEnergy, h_extForce, h_extVirial, h_extEnergy, tbcatomic);
5449 eFieldKernel<true, false><<<grid, ATOM_BLOCKS, 0, stream>>>(numAtoms,
5450 eField, eFieldOmega, eFieldPhi, t, lat, transform, charges,
5451 pos_x, pos_y, pos_z,
5452 f_normal_x, f_normal_y, f_normal_z, d_extForce, d_extVirial,
5453 d_extEnergy, h_extForce, h_extVirial, h_extEnergy, tbcatomic);
5457 eFieldKernel<false, true><<<grid, ATOM_BLOCKS, 0, stream>>>(numAtoms,
5458 eField, eFieldOmega, eFieldPhi, t, lat, transform, charges,
5459 pos_x, pos_y, pos_z,
5460 f_normal_x, f_normal_y, f_normal_z, d_extForce, d_extVirial,
5461 d_extEnergy, h_extForce, h_extVirial, h_extEnergy, tbcatomic);
5463 eFieldKernel<false, false><<<grid, ATOM_BLOCKS, 0, stream>>>(numAtoms,
5464 eField, eFieldOmega, eFieldPhi, t, lat, transform, charges,
5465 pos_x, pos_y, pos_z,
5466 f_normal_x, f_normal_y, f_normal_z, d_extForce, d_extVirial,
5467 d_extEnergy, h_extForce, h_extVirial, h_extEnergy, tbcatomic);
5472 SequencerCUDAKernel::SequencerCUDAKernel() {
5473 firstRattleDone = false;
5474 intConstInit = false;
5483 d_rattleList_temp_storage = NULL;
5484 temp_storage_bytes = 0;
5486 rattleIndexes = NULL;
5487 rattleIndexes_size = 0;
5490 SequencerCUDAKernel::~SequencerCUDAKernel() {
5491 deallocate_device<int>(&d_nHG);
5492 deallocate_device<int>(&d_nSettles);
5493 deallocate_device<int>(&d_nRattles);
5494 deallocate_device<int>(&hgi);
5495 deallocate_device<int>(&rattleIndexes);
5496 deallocate_device<char>(&d_rattleList_temp_storage);
5506 void SequencerCUDAKernel::set_pme_positions(
5508 const bool isPmeDevice,
5510 const int numPatchesHomeAndProxy,
5511 const int numPatchesHome,
5516 const bool doAlchDecouple,
5517 const bool doAlchSoftCore,
5518 const bool handleBoundary,
5519 const double* d_pos_x,
5520 const double* d_pos_y,
5521 const double* d_pos_z,
5522 #ifndef NAMD_NCCL_ALLREDUCE
5523 double** d_peer_pos_x,
5524 double** d_peer_pos_y,
5525 double** d_peer_pos_z,
5526 float** d_peer_charge,
5527 int** d_peer_partition,
5529 const float* charges,
5530 const int* partition,
5531 const double charge_scaling,
5532 const double3* patchCenter,
5533 const int* s_patchPositions,
5534 const int* s_pencilPatchIndex,
5535 const int* s_patchIDs,
5536 const int* patchSortOrder,
5537 const Lattice lattice,
5543 CudaLocalRecord* localRecords,
5544 CudaPeerRecord* peerRecords,
5545 std::vector<int>& atomCounts,
5548 // Launch PME setComputes
5549 #define CALL(HOME, FEP, TI, ALCH_DECOUPLE, ALCH_SOFTCORE) \
5550 setComputePositionsKernel_PME<HOME, FEP, TI, ALCH_DECOUPLE, ALCH_SOFTCORE> \
5551 <<<pme_grid, PATCH_BLOCKS, 0, stream>>>( \
5552 d_pos_x, d_pos_y, d_pos_z, charges, \
5553 d_peer_pos_x, d_peer_pos_y, d_peer_pos_z, d_peer_charge, \
5554 d_peer_partition, partition, charge_scaling, \
5555 s_patchPositions, s_pencilPatchIndex, s_patchIDs, \
5556 lattice, s_atoms, numTotalAtoms, s_partition, \
5557 i, numAtoms, offset, handleBoundary \
5559 // Only when PME long-range electrostatics is enabled (doSlow is true).
5560 // The partition is needed for alchemical calculations.
5561 if(doSlow && isPmeDevice) {
5563 for (int i = 0; i < nDev; i++) {
5564 const bool home = (i == devID);
5565 const int numAtoms = atomCounts[i];
5566 const int pme_grid = (numAtoms + PATCH_BLOCKS - 1) / PATCH_BLOCKS;
5567 const int options = (home << 4) + (doFEP << 3) + (doTI << 2) + (doAlchDecouple << 1) + doAlchSoftCore;
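// pack the five booleans into a 5-bit selector (home|FEP|TI|alchDecouple|alchSoftCore) used by the switch below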
5570 case 0: CALL(0, 0, 0, 0, 0); break;
5571 case 1: CALL(0, 0, 0, 0, 1); break;
5572 case 2: CALL(0, 0, 0, 1, 0); break;
5573 case 3: CALL(0, 0, 0, 1, 1); break;
5574 case 4: CALL(0, 0, 1, 0, 0); break;
5575 case 5: CALL(0, 0, 1, 0, 1); break;
5576 case 6: CALL(0, 0, 1, 1, 0); break;
5577 case 7: CALL(0, 0, 1, 1, 1); break;
5578 case 8: CALL(0, 1, 0, 0, 0); break;
5579 case 9: CALL(0, 1, 0, 0, 1); break;
5580 case 10: CALL(0, 1, 0, 1, 0); break;
5581 case 11: CALL(0, 1, 0, 1, 1); break;
5582 case 12: CALL(0, 1, 1, 0, 0); break;
5583 case 13: CALL(0, 1, 1, 0, 1); break;
5584 case 14: CALL(0, 1, 1, 1, 0); break;
5585 case 15: CALL(0, 1, 1, 1, 1); break;
5586 case 16: CALL(1, 0, 0, 0, 0); break;
5587 case 17: CALL(1, 0, 0, 0, 1); break;
5588 case 18: CALL(1, 0, 0, 1, 0); break;
5589 case 19: CALL(1, 0, 0, 1, 1); break;
5590 case 20: CALL(1, 0, 1, 0, 0); break;
5591 case 21: CALL(1, 0, 1, 0, 1); break;
5592 case 22: CALL(1, 0, 1, 1, 0); break;
5593 case 23: CALL(1, 0, 1, 1, 1); break;
5594 case 24: CALL(1, 1, 0, 0, 0); break;
5595 case 25: CALL(1, 1, 0, 0, 1); break;
5596 case 26: CALL(1, 1, 0, 1, 0); break;
5597 case 27: CALL(1, 1, 0, 1, 1); break;
5598 case 28: CALL(1, 1, 1, 0, 0); break;
5599 case 29: CALL(1, 1, 1, 0, 1); break;
5600 case 30: CALL(1, 1, 1, 1, 0); break;
5601 case 31: CALL(1, 1, 1, 1, 1); break;
5603 NAMD_die("SequencerCUDAKernel::setComputePositions: no kernel called");
5612 template <bool doGlobal, bool doForcesOutput>
5613 __global__ void copyForcesToHostSOAKernel(
5614 const int numPatches,
5615 CudaLocalRecord* localRecords,
5616 const int maxForceNumber,
5617 const double* d_f_normal_x,
5618 const double* d_f_normal_y,
5619 const double* d_f_normal_z,
5620 const double* d_f_nbond_x,
5621 const double* d_f_nbond_y,
5622 const double* d_f_nbond_z,
5623 const double* d_f_slow_x,
5624 const double* d_f_slow_y,
5625 const double* d_f_slow_z,
5626 PatchDataSOA* d_HostPatchDataSOA
5628 __shared__ CudaLocalRecord s_record;
5629 using AccessType = int32_t;
5630 AccessType* s_record_buffer = (AccessType*) &s_record;
5632 for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
5633 // Read in the CudaLocalRecord using multiple threads. This should
5635 for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
5636 s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
5640 const int numAtoms = s_record.numAtoms;
5641 const int offset = s_record.bufferOffset;
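// maxForceNumber selects which force levels are copied back: 0 = normal only, 1 = + nonbonded, 2 = + slow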
5643 for (int i = threadIdx.x; i < numAtoms; i += blockDim.x) {
5644 if (maxForceNumber >= 0) {
5645 d_HostPatchDataSOA[patchIndex].f_normal_x[i] = d_f_normal_x[offset + i];
5646 d_HostPatchDataSOA[patchIndex].f_normal_y[i] = d_f_normal_y[offset + i];
5647 d_HostPatchDataSOA[patchIndex].f_normal_z[i] = d_f_normal_z[offset + i];
5649 if (maxForceNumber >= 1) {
5651 d_HostPatchDataSOA[patchIndex].f_saved_nbond_x[i] = d_f_nbond_x[offset + i];
5652 d_HostPatchDataSOA[patchIndex].f_saved_nbond_y[i] = d_f_nbond_y[offset + i];
5653 d_HostPatchDataSOA[patchIndex].f_saved_nbond_z[i] = d_f_nbond_z[offset + i];
5655 if (doForcesOutput) {
5656 d_HostPatchDataSOA[patchIndex].f_nbond_x[i] = d_f_nbond_x[offset + i];
5657 d_HostPatchDataSOA[patchIndex].f_nbond_y[i] = d_f_nbond_y[offset + i];
5658 d_HostPatchDataSOA[patchIndex].f_nbond_z[i] = d_f_nbond_z[offset + i];
5661 if (maxForceNumber >= 2) {
5663 d_HostPatchDataSOA[patchIndex].f_saved_slow_x[i] = d_f_slow_x[offset + i];
5664 d_HostPatchDataSOA[patchIndex].f_saved_slow_y[i] = d_f_slow_y[offset + i];
5665 d_HostPatchDataSOA[patchIndex].f_saved_slow_z[i] = d_f_slow_z[offset + i];
5667 if (doForcesOutput) {
5668 d_HostPatchDataSOA[patchIndex].f_slow_x[i] = d_f_slow_x[offset + i];
5669 d_HostPatchDataSOA[patchIndex].f_slow_y[i] = d_f_slow_y[offset + i];
5670 d_HostPatchDataSOA[patchIndex].f_slow_z[i] = d_f_slow_z[offset + i];
5679 void SequencerCUDAKernel::copyForcesToHostSOA(
5680 const int numPatches,
5681 CudaLocalRecord* localRecords,
5682 const int maxForceNumber,
5683 const double* d_f_normal_x,
5684 const double* d_f_normal_y,
5685 const double* d_f_normal_z,
5686 const double* d_f_nbond_x,
5687 const double* d_f_nbond_y,
5688 const double* d_f_nbond_z,
5689 const double* d_f_slow_x,
5690 const double* d_f_slow_y,
5691 const double* d_f_slow_z,
5692 PatchDataSOA* d_HostPatchDataSOA,
5693 const bool doGlobal,
5694 const bool doForcesOutput,
5697 #define CALL(GLOBAL, FORCESOUTPUT) \
5698 do {copyForcesToHostSOAKernel<GLOBAL, FORCESOUTPUT><<<numPatches, PATCH_BLOCKS, 0, stream>>>( \
5711 d_HostPatchDataSOA \
5713 if (doGlobal && !doForcesOutput) CALL(true, false);
5714 else if (doGlobal && doForcesOutput) CALL(true, true);
5715 else if (!doGlobal && doForcesOutput) CALL(false, true);
5716 else NAMD_bug("No kernel is called in copyForcesToHostSOA");
5720 __global__ void copyPositionsToHostSOAKernel(
5721 const int numPatches,
5722 CudaLocalRecord* localRecords,
5723 const double* pos_x,
5724 const double* pos_y,
5725 const double* pos_z,
5726 PatchDataSOA* d_HostPatchDataSOA
5728 __shared__ CudaLocalRecord s_record;
5729 using AccessType = int32_t;
5730 AccessType* s_record_buffer = (AccessType*) &s_record;
5732 for (int patchIndex = blockIdx.x; patchIndex < numPatches; patchIndex += gridDim.x) {
5733 // Read in the CudaLocalRecord using multiple threads. This should
5735 for (int i = threadIdx.x; i < sizeof(CudaLocalRecord)/sizeof(AccessType); i += blockDim.x) {
5736 s_record_buffer[i] = ((AccessType*) &(localRecords[patchIndex]))[i];
5740 const int numAtoms = s_record.numAtoms;
5741 const int offset = s_record.bufferOffset;
5743 for (int i = threadIdx.x; i < numAtoms; i += blockDim.x) {
5744 d_HostPatchDataSOA[patchIndex].pos_x[i] = pos_x[offset + i];
5745 d_HostPatchDataSOA[patchIndex].pos_y[i] = pos_y[offset + i];
5746 d_HostPatchDataSOA[patchIndex].pos_z[i] = pos_z[offset + i];
5752 void SequencerCUDAKernel::copyPositionsToHostSOA(
5753 const int numPatches,
5754 CudaLocalRecord* localRecords,
5755 const double* pos_x,
5756 const double* pos_y,
5757 const double* pos_z,
5758 PatchDataSOA* d_HostPatchDataSOA,
5761 copyPositionsToHostSOAKernel<<<numPatches, PATCH_BLOCKS, 0, stream>>>(
5764 pos_x, pos_y, pos_z,
5769 // Adapted from HomePatch::redistrib_lp_water_force
5770 /* Redistribute forces from the massless lonepair charge particle onto
5771 * the other atoms of the water.
5773 * This is done using the same algorithm CHARMM uses for TIP4P lone pairs.
5775 * Pass by reference the forces (O H1 H2 LP) to be modified,
5776 * pass by constant reference the corresponding positions.
template <bool doVirial>
__device__ void redistrib_lp_water_force(
  double f_ox[3], double f_h1[3], double f_h2[3], double f_lp[3],
  const double p_ox[3], const double p_h1[3], const double p_h2[3],
  const double p_lp[3], cudaTensor& virial) {
  // Accumulate force adjustments
  double fad_ox[3] = {};
  double fad_h[3] = {};
  // Calculate the radial component of the force and add it to the oxygen
  const double r_ox_lp[3] = {p_lp[0] - p_ox[0],
                             p_lp[1] - p_ox[1],
                             p_lp[2] - p_ox[2]};
  double invlen2_r_ox_lp = rnorm3d(r_ox_lp[0], r_ox_lp[1], r_ox_lp[2]);
  invlen2_r_ox_lp *= invlen2_r_ox_lp;
  const double rad_factor =
    (f_lp[0] * r_ox_lp[0] + f_lp[1] * r_ox_lp[1] + f_lp[2] * r_ox_lp[2]) * invlen2_r_ox_lp;
  const double f_rad[3] = {r_ox_lp[0] * rad_factor,
                           r_ox_lp[1] * rad_factor,
                           r_ox_lp[2] * rad_factor};
  fad_ox[0] += f_rad[0];
  fad_ox[1] += f_rad[1];
  fad_ox[2] += f_rad[2];
  // Calculate the angular component
  const double r_hcom_ox[3] = {
    p_ox[0] - ( (p_h1[0] + p_h2[0]) * 0.5 ),
    p_ox[1] - ( (p_h1[1] + p_h2[1]) * 0.5 ),
    p_ox[2] - ( (p_h1[2] + p_h2[2]) * 0.5 )};
  const double f_ang[3] = {
    f_lp[0] - f_rad[0],
    f_lp[1] - f_rad[1],
    f_lp[2] - f_rad[2]};
  // now split this component onto the other atoms
  const double len_r_ox_lp = norm3d(r_ox_lp[0], r_ox_lp[1], r_ox_lp[2]);
  const double invlen_r_hcom_ox = rnorm3d(
    r_hcom_ox[0], r_hcom_ox[1], r_hcom_ox[2]);
  const double oxcomp =
    (norm3d(r_hcom_ox[0], r_hcom_ox[1], r_hcom_ox[2]) - len_r_ox_lp) *
    invlen_r_hcom_ox;
  const double hydcomp = 0.5 * len_r_ox_lp * invlen_r_hcom_ox;
  fad_ox[0] += (f_ang[0] * oxcomp);
  fad_ox[1] += (f_ang[1] * oxcomp);
  fad_ox[2] += (f_ang[2] * oxcomp);
  fad_h[0] += (f_ang[0] * hydcomp);
  fad_h[1] += (f_ang[1] * hydcomp);
  fad_h[2] += (f_ang[2] * hydcomp);
  if (doVirial) {
    virial.xx = fad_ox[0] * p_ox[0] + fad_h[0] * p_h1[0] + fad_h[0] * p_h2[0] - f_lp[0] * p_lp[0];
    virial.xy = fad_ox[0] * p_ox[1] + fad_h[0] * p_h1[1] + fad_h[0] * p_h2[1] - f_lp[0] * p_lp[1];
    virial.xz = fad_ox[0] * p_ox[2] + fad_h[0] * p_h1[2] + fad_h[0] * p_h2[2] - f_lp[0] * p_lp[2];
    virial.yx = fad_ox[1] * p_ox[0] + fad_h[1] * p_h1[0] + fad_h[1] * p_h2[0] - f_lp[1] * p_lp[0];
    virial.yy = fad_ox[1] * p_ox[1] + fad_h[1] * p_h1[1] + fad_h[1] * p_h2[1] - f_lp[1] * p_lp[1];
    virial.yz = fad_ox[1] * p_ox[2] + fad_h[1] * p_h1[2] + fad_h[1] * p_h2[2] - f_lp[1] * p_lp[2];
    virial.zx = fad_ox[2] * p_ox[0] + fad_h[2] * p_h1[0] + fad_h[2] * p_h2[0] - f_lp[2] * p_lp[0];
    virial.zy = fad_ox[2] * p_ox[1] + fad_h[2] * p_h1[1] + fad_h[2] * p_h2[1] - f_lp[2] * p_lp[1];
    virial.zz = fad_ox[2] * p_ox[2] + fad_h[2] * p_h1[2] + fad_h[2] * p_h2[2] - f_lp[2] * p_lp[2];
  }
  // Apply the accumulated adjustments to the oxygen and both hydrogens
  f_ox[0] += fad_ox[0];
  f_ox[1] += fad_ox[1];
  f_ox[2] += fad_ox[2];
  f_h1[0] += fad_h[0];
  f_h1[1] += fad_h[1];
  f_h1[2] += fad_h[2];
  f_h2[0] += fad_h[0];
  f_h2[1] += fad_h[1];
  f_h2[2] += fad_h[2];
}
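// Each water is assumed to be stored contiguously as (O, H1, H2, LP); the
// kernel below identifies the lone pair by its (near-)zero mass and then
// reaches back to indices i-3, i-2, i-1 for the O, H1 and H2 entries of the
// same water before redistributing the lone-pair force onto them.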
template <bool doVirial>
__global__ void redistributeTip4pForcesKernel2(
  double* d_f_x,
  double* d_f_y,
  double* d_f_z,
  cudaTensor* d_virial,
  const double* d_pos_x,
  const double* d_pos_y,
  const double* d_pos_z,
  const float* d_mass,
  const int numAtoms)
{
  const int i = threadIdx.x + blockIdx.x * blockDim.x;
  cudaTensor lVirial = {0};
  if (i < numAtoms) {
    if (d_mass[i] < 0.01f) {
      double f_ox[3] = {d_f_x[i-3], d_f_y[i-3], d_f_z[i-3]};
      double f_h1[3] = {d_f_x[i-2], d_f_y[i-2], d_f_z[i-2]};
      double f_h2[3] = {d_f_x[i-1], d_f_y[i-1], d_f_z[i-1]};
      double f_lp[3] = {d_f_x[i], d_f_y[i], d_f_z[i]};
      const double p_ox[3] = {d_pos_x[i-3], d_pos_y[i-3], d_pos_z[i-3]};
      const double p_h1[3] = {d_pos_x[i-2], d_pos_y[i-2], d_pos_z[i-2]};
      const double p_h2[3] = {d_pos_x[i-1], d_pos_y[i-1], d_pos_z[i-1]};
      const double p_lp[3] = {d_pos_x[i], d_pos_y[i], d_pos_z[i]};
      redistrib_lp_water_force<doVirial>(
        f_ox, f_h1, f_h2, f_lp, p_ox, p_h1, p_h2, p_lp, lVirial);
      // copy the force back
      d_f_x[i-3] = f_ox[0];
      d_f_x[i-2] = f_h1[0];
      d_f_x[i-1] = f_h2[0];

      d_f_y[i-3] = f_ox[1];
      d_f_y[i-2] = f_h1[1];
      d_f_y[i-1] = f_h2[1];

      d_f_z[i-3] = f_ox[2];
      d_f_z[i-2] = f_h1[2];
      d_f_z[i-1] = f_h2[2];
    }
  }
  if (doVirial) {
    typedef cub::BlockReduce<BigReal, ATOM_BLOCKS> BlockReduce;
    __shared__ typename BlockReduce::TempStorage temp_storage;
    lVirial.xx = BlockReduce(temp_storage).Sum(lVirial.xx);
    __syncthreads();
    lVirial.xy = BlockReduce(temp_storage).Sum(lVirial.xy);
    __syncthreads();
    lVirial.xz = BlockReduce(temp_storage).Sum(lVirial.xz);
    __syncthreads();
    lVirial.yx = BlockReduce(temp_storage).Sum(lVirial.yx);
    __syncthreads();
    lVirial.yy = BlockReduce(temp_storage).Sum(lVirial.yy);
    __syncthreads();
    lVirial.yz = BlockReduce(temp_storage).Sum(lVirial.yz);
    __syncthreads();
    lVirial.zx = BlockReduce(temp_storage).Sum(lVirial.zx);
    __syncthreads();
    lVirial.zy = BlockReduce(temp_storage).Sum(lVirial.zy);
    __syncthreads();
    lVirial.zz = BlockReduce(temp_storage).Sum(lVirial.zz);

    if (threadIdx.x == 0) {
      atomicAdd(&(d_virial->xx), lVirial.xx);
      atomicAdd(&(d_virial->xy), lVirial.xy);
      atomicAdd(&(d_virial->xz), lVirial.xz);
      atomicAdd(&(d_virial->yx), lVirial.yx);
      atomicAdd(&(d_virial->yy), lVirial.yy);
      atomicAdd(&(d_virial->yz), lVirial.yz);
      atomicAdd(&(d_virial->zx), lVirial.zx);
      atomicAdd(&(d_virial->zy), lVirial.zy);
      atomicAdd(&(d_virial->zz), lVirial.zz);
    }
  }
}
void SequencerCUDAKernel::redistributeTip4pForces(
  double* d_f_normal_x,
  double* d_f_normal_y,
  double* d_f_normal_z,
  double* d_f_nbond_x,
  double* d_f_nbond_y,
  double* d_f_nbond_z,
  double* d_f_slow_x,
  double* d_f_slow_y,
  double* d_f_slow_z,
  cudaTensor* d_virial_normal,
  cudaTensor* d_virial_nbond,
  cudaTensor* d_virial_slow,
  const double* d_pos_x,
  const double* d_pos_y,
  const double* d_pos_z,
  const float* d_mass,
  const int numAtoms,
  const bool doVirial,
  const int maxForceNumber,
  cudaStream_t stream)
{
  const int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
  // Intentional fall-through: every force level at or below maxForceNumber
  // has its lone-pair forces redistributed.
  switch (maxForceNumber) {
    case 2:
      if (doVirial) redistributeTip4pForcesKernel2<true><<<grid, ATOM_BLOCKS, 0, stream>>>(d_f_slow_x, d_f_slow_y, d_f_slow_z, d_virial_slow, d_pos_x, d_pos_y, d_pos_z, d_mass, numAtoms);
      else redistributeTip4pForcesKernel2<false><<<grid, ATOM_BLOCKS, 0, stream>>>(d_f_slow_x, d_f_slow_y, d_f_slow_z, d_virial_slow, d_pos_x, d_pos_y, d_pos_z, d_mass, numAtoms);
    case 1:
      if (doVirial) redistributeTip4pForcesKernel2<true><<<grid, ATOM_BLOCKS, 0, stream>>>(d_f_nbond_x, d_f_nbond_y, d_f_nbond_z, d_virial_nbond, d_pos_x, d_pos_y, d_pos_z, d_mass, numAtoms);
      else redistributeTip4pForcesKernel2<false><<<grid, ATOM_BLOCKS, 0, stream>>>(d_f_nbond_x, d_f_nbond_y, d_f_nbond_z, d_virial_nbond, d_pos_x, d_pos_y, d_pos_z, d_mass, numAtoms);
    case 0:
      if (doVirial) redistributeTip4pForcesKernel2<true><<<grid, ATOM_BLOCKS, 0, stream>>>(d_f_normal_x, d_f_normal_y, d_f_normal_z, d_virial_normal, d_pos_x, d_pos_y, d_pos_z, d_mass, numAtoms);
      else redistributeTip4pForcesKernel2<false><<<grid, ATOM_BLOCKS, 0, stream>>>(d_f_normal_x, d_f_normal_y, d_f_normal_z, d_virial_normal, d_pos_x, d_pos_y, d_pos_z, d_mass, numAtoms);
  }
}
__forceinline__ __device__ void calcVirial(
  const int i, const double factor,
  const double* __restrict f_x, const double* __restrict f_y, const double* __restrict f_z,
  const double* __restrict r_x, const double* __restrict r_y, const double* __restrict r_z,
  cudaTensor& tensor_out) {
  tensor_out.xx = factor * f_x[i] * r_x[i];
  tensor_out.xy = factor * f_x[i] * r_y[i];
  tensor_out.xz = factor * f_x[i] * r_z[i];
  tensor_out.yx = factor * f_y[i] * r_x[i];
  tensor_out.yy = factor * f_y[i] * r_y[i];
  tensor_out.yz = factor * f_y[i] * r_z[i];
  tensor_out.zx = factor * f_z[i] * r_x[i];
  tensor_out.zy = factor * f_z[i] * r_y[i];
  tensor_out.zz = factor * f_z[i] * r_z[i];
}
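// calcVirial fills tensor_out with the outer product of atom i's force and
// position, scaled by factor.  The kernel below calls it with factor = -1.0
// and the fixed reference positions, so each fixed atom's contribution is
// subtracted from the virial tensors while -f is accumulated into the
// external-force sums.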
template <bool doNbond, bool doSlow>
__global__ void calcFixVirialKernel(
  const int numAtoms,
  const int* __restrict d_atomFixed,
  const double* __restrict d_fixedPosition_x,
  const double* __restrict d_fixedPosition_y,
  const double* __restrict d_fixedPosition_z,
  const double* __restrict d_f_normal_x,
  const double* __restrict d_f_normal_y,
  const double* __restrict d_f_normal_z,
  const double* __restrict d_f_nbond_x,
  const double* __restrict d_f_nbond_y,
  const double* __restrict d_f_nbond_z,
  const double* __restrict d_f_slow_x,
  const double* __restrict d_f_slow_y,
  const double* __restrict d_f_slow_z,
  cudaTensor* __restrict d_virial_normal,
  cudaTensor* __restrict d_virial_nbond,
  cudaTensor* __restrict d_virial_slow,
  double3* __restrict d_extForce_normal,
  double3* __restrict d_extForce_nbond,
  double3* __restrict d_extForce_slow)
{
  const int i = threadIdx.x + blockIdx.x * blockDim.x;
  cudaTensor vir_normal = {0};
  cudaTensor vir_nbond = {0};
  cudaTensor vir_slow = {0};
  double3 f_sum_normal = {0, 0, 0};
  double3 f_sum_nbond;
  if (doNbond) {
    f_sum_nbond = {0, 0, 0};
  }
  double3 f_sum_slow;
  if (doSlow) {
    f_sum_slow = {0, 0, 0};
  }
  if (i < numAtoms) {
    if (d_atomFixed[i]) {
      calcVirial(i, -1.0, d_f_normal_x, d_f_normal_y, d_f_normal_z,
                 d_fixedPosition_x, d_fixedPosition_y, d_fixedPosition_z, vir_normal);
      f_sum_normal.x = -1.0 * d_f_normal_x[i];
      f_sum_normal.y = -1.0 * d_f_normal_y[i];
      f_sum_normal.z = -1.0 * d_f_normal_z[i];
      if (doNbond) {
        calcVirial(i, -1.0, d_f_nbond_x, d_f_nbond_y, d_f_nbond_z,
                   d_fixedPosition_x, d_fixedPosition_y, d_fixedPosition_z, vir_nbond);
        f_sum_nbond.x = -1.0 * d_f_nbond_x[i];
        f_sum_nbond.y = -1.0 * d_f_nbond_y[i];
        f_sum_nbond.z = -1.0 * d_f_nbond_z[i];
      }
      if (doSlow) {
        calcVirial(i, -1.0, d_f_slow_x, d_f_slow_y, d_f_slow_z,
                   d_fixedPosition_x, d_fixedPosition_y, d_fixedPosition_z, vir_slow);
        f_sum_slow.x = -1.0 * d_f_slow_x[i];
        f_sum_slow.y = -1.0 * d_f_slow_y[i];
        f_sum_slow.z = -1.0 * d_f_slow_z[i];
      }
    }
  }
  // Haochuan: is it OK to use lambda to reduce the cudaTensor as a whole instead of component-wise??
  typedef cub::BlockReduce<cudaTensor, ATOM_BLOCKS> BlockReduceCudaTensor;
  __shared__ typename BlockReduceCudaTensor::TempStorage temp_storage;
  vir_normal = BlockReduceCudaTensor(temp_storage).Reduce(vir_normal, [](const cudaTensor& a, const cudaTensor& b){return a+b;});
  __syncthreads();
  if (doNbond) {
    vir_nbond = BlockReduceCudaTensor(temp_storage).Reduce(vir_nbond, [](const cudaTensor& a, const cudaTensor& b){return a+b;});
    __syncthreads();
  }
  if (doSlow) {
    vir_slow = BlockReduceCudaTensor(temp_storage).Reduce(vir_slow, [](const cudaTensor& a, const cudaTensor& b){return a+b;});
    __syncthreads();
  }
  typedef cub::BlockReduce<double3, ATOM_BLOCKS> BlockReduceDouble3;
  __shared__ typename BlockReduceDouble3::TempStorage temp_storage2;
  f_sum_normal = BlockReduceDouble3(temp_storage2).Reduce(f_sum_normal, [](const double3& a, const double3& b){return a+b;});
  __syncthreads();
  if (doNbond) {
    f_sum_nbond = BlockReduceDouble3(temp_storage2).Reduce(f_sum_nbond, [](const double3& a, const double3& b){return a+b;});
    __syncthreads();
  }
  if (doSlow) {
    f_sum_slow = BlockReduceDouble3(temp_storage2).Reduce(f_sum_slow, [](const double3& a, const double3& b){return a+b;});
    __syncthreads();
  }
  if (threadIdx.x == 0) {
    atomicAdd(&(d_virial_normal->xx), vir_normal.xx);
    atomicAdd(&(d_virial_normal->xy), vir_normal.xy);
    atomicAdd(&(d_virial_normal->xz), vir_normal.xz);
    atomicAdd(&(d_virial_normal->yx), vir_normal.yx);
    atomicAdd(&(d_virial_normal->yy), vir_normal.yy);
    atomicAdd(&(d_virial_normal->yz), vir_normal.yz);
    atomicAdd(&(d_virial_normal->zx), vir_normal.zx);
    atomicAdd(&(d_virial_normal->zy), vir_normal.zy);
    atomicAdd(&(d_virial_normal->zz), vir_normal.zz);
    atomicAdd(&(d_extForce_normal->x), f_sum_normal.x);
    atomicAdd(&(d_extForce_normal->y), f_sum_normal.y);
    atomicAdd(&(d_extForce_normal->z), f_sum_normal.z);
    if (doNbond) {
      atomicAdd(&(d_virial_nbond->xx), vir_nbond.xx);
      atomicAdd(&(d_virial_nbond->xy), vir_nbond.xy);
      atomicAdd(&(d_virial_nbond->xz), vir_nbond.xz);
      atomicAdd(&(d_virial_nbond->yx), vir_nbond.yx);
      atomicAdd(&(d_virial_nbond->yy), vir_nbond.yy);
      atomicAdd(&(d_virial_nbond->yz), vir_nbond.yz);
      atomicAdd(&(d_virial_nbond->zx), vir_nbond.zx);
      atomicAdd(&(d_virial_nbond->zy), vir_nbond.zy);
      atomicAdd(&(d_virial_nbond->zz), vir_nbond.zz);
      atomicAdd(&(d_extForce_nbond->x), f_sum_nbond.x);
      atomicAdd(&(d_extForce_nbond->y), f_sum_nbond.y);
      atomicAdd(&(d_extForce_nbond->z), f_sum_nbond.z);
    }
    if (doSlow) {
      atomicAdd(&(d_virial_slow->xx), vir_slow.xx);
      atomicAdd(&(d_virial_slow->xy), vir_slow.xy);
      atomicAdd(&(d_virial_slow->xz), vir_slow.xz);
      atomicAdd(&(d_virial_slow->yx), vir_slow.yx);
      atomicAdd(&(d_virial_slow->yy), vir_slow.yy);
      atomicAdd(&(d_virial_slow->yz), vir_slow.yz);
      atomicAdd(&(d_virial_slow->zx), vir_slow.zx);
      atomicAdd(&(d_virial_slow->zy), vir_slow.zy);
      atomicAdd(&(d_virial_slow->zz), vir_slow.zz);
      atomicAdd(&(d_extForce_slow->x), f_sum_slow.x);
      atomicAdd(&(d_extForce_slow->y), f_sum_slow.y);
      atomicAdd(&(d_extForce_slow->z), f_sum_slow.z);
    }
  }
}
__global__ void copyForcesToDeviceKernel(
  const int numAtoms,
  const double* d_f_nbond_x,
  const double* d_f_nbond_y,
  const double* d_f_nbond_z,
  const double* d_f_slow_x,
  const double* d_f_slow_y,
  const double* d_f_slow_z,
  double* d_f_saved_nbond_x,
  double* d_f_saved_nbond_y,
  double* d_f_saved_nbond_z,
  double* d_f_saved_slow_x,
  double* d_f_saved_slow_y,
  double* d_f_saved_slow_z,
  const int maxForceNumber
) {
  const int i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i < numAtoms) {
    if (maxForceNumber >= 2) {
      d_f_saved_slow_x[i] = d_f_slow_x[i];
      d_f_saved_slow_y[i] = d_f_slow_y[i];
      d_f_saved_slow_z[i] = d_f_slow_z[i];
    }
    if (maxForceNumber >= 1) {
      d_f_saved_nbond_x[i] = d_f_nbond_x[i];
      d_f_saved_nbond_y[i] = d_f_nbond_y[i];
      d_f_saved_nbond_z[i] = d_f_nbond_z[i];
    }
  }
}
void SequencerCUDAKernel::calcFixVirial(
  const int maxForceNumber,
  const int numAtoms,
  const int* d_atomFixed,
  const double* d_fixedPosition_x,
  const double* d_fixedPosition_y,
  const double* d_fixedPosition_z,
  const double* d_f_normal_x,
  const double* d_f_normal_y,
  const double* d_f_normal_z,
  const double* d_f_nbond_x,
  const double* d_f_nbond_y,
  const double* d_f_nbond_z,
  const double* d_f_slow_x,
  const double* d_f_slow_y,
  const double* d_f_slow_z,
  cudaTensor* d_virial_normal,
  cudaTensor* d_virial_nbond,
  cudaTensor* d_virial_slow,
  double3* d_extForce_normal,
  double3* d_extForce_nbond,
  double3* d_extForce_slow,
  cudaStream_t stream)
{
  const int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
#define CALL(doNbond, doSlow) \
  calcFixVirialKernel<doNbond, doSlow><<<grid, ATOM_BLOCKS, 0, stream>>>( \
    numAtoms, d_atomFixed, \
    d_fixedPosition_x, d_fixedPosition_y, d_fixedPosition_z, \
    d_f_normal_x, d_f_normal_y, d_f_normal_z, \
    d_f_nbond_x, d_f_nbond_y, d_f_nbond_z, \
    d_f_slow_x, d_f_slow_y, d_f_slow_z, \
    d_virial_normal, d_virial_nbond, d_virial_slow, \
    d_extForce_normal, d_extForce_nbond, d_extForce_slow \
  )
  switch (maxForceNumber) {
    case 0: CALL(false, false); break;
    case 1: CALL(true, false); break;
    case 2: CALL(true, true); break;
    default: NAMD_bug("No kernel is called in SequencerCUDAKernel::calcFixVirial!\n");
  }
#undef CALL
}
void SequencerCUDAKernel::copyForcesToDevice(
  const int numAtoms,
  const double* d_f_nbond_x,
  const double* d_f_nbond_y,
  const double* d_f_nbond_z,
  const double* d_f_slow_x,
  const double* d_f_slow_y,
  const double* d_f_slow_z,
  double* d_f_saved_nbond_x,
  double* d_f_saved_nbond_y,
  double* d_f_saved_nbond_z,
  double* d_f_saved_slow_x,
  double* d_f_saved_slow_y,
  double* d_f_saved_slow_z,
  const int maxForceNumber,
  cudaStream_t stream)
{
  const int grid = (numAtoms + ATOM_BLOCKS - 1) / ATOM_BLOCKS;
  copyForcesToDeviceKernel<<<grid, ATOM_BLOCKS, 0, stream>>>(
    numAtoms, d_f_nbond_x, d_f_nbond_y, d_f_nbond_z,
    d_f_slow_x, d_f_slow_y, d_f_slow_z,
    d_f_saved_nbond_x, d_f_saved_nbond_y, d_f_saved_nbond_z,
    d_f_saved_slow_x, d_f_saved_slow_y, d_f_saved_slow_z,
    maxForceNumber);
}

#endif // NODEGROUP_FORCE_REGISTER