// FFTW 2 lacks the float-prefixed allocators, so map them to the generic ones.
#define fftwf_malloc fftw_malloc
#define fftwf_free fftw_free
#ifdef NAMD_FFTW_NO_TYPE_PREFIX
// ...
#include "ComputePmeMgr.decl.h"
// ...
#include "ComputeMgr.decl.h"
// ...
#define MIN_DEBUG_LEVEL 3
// ...
#include "ckhashtable.h"
// ...
#include "ComputeMoaMgr.decl.h"
// ...

#if defined(NAMD_CUDA) || defined(NAMD_HIP)
// Report the failing Pe, host, and CUDA device along with the error string.
  if ((err = cudaGetLastError()) != cudaSuccess) {
    char host[128];
    gethostname(host, 128);  host[127] = 0;
    char devstr[128] = "";
    int devnum = 0;
    if ( cudaGetDevice(&devnum) == cudaSuccess ) {
      sprintf(devstr, " device %d", devnum);
    }
    cudaDeviceProp deviceProp;
    if ( cudaGetDeviceProperties(&deviceProp, devnum) == cudaSuccess ) {
      sprintf(devstr, " device %d pci %x:%x:%x", devnum,
          deviceProp.pciDomainID, deviceProp.pciBusID, deviceProp.pciDeviceID);
    }
    char errmsg[1024];
    sprintf(errmsg, "CUDA error %s on Pe %d (%s%s): %s",
        msg, CkMyPe(), host, devstr, cudaGetErrorString(err));
#include <cuda_runtime.h>
// ...
#include <hip/hip_runtime.h>
// ...
#define __thread __declspec(thread)
// ...
#define SQRT_PI 1.7724538509055160273  /* sqrt(pi) */
// ...
#if CMK_PERSISTENT_COMM
#define USE_PERSISTENT 1
#endif
// ...
#if (defined(NAMD_HIP) || defined(NAMD_CUDA)) && defined(MEM_OPT_VERSION)
#define USE_NODE_PAR_RECEIVE 1
#endif
// ...

class PmePencilMap : public CBase_PmePencilMap {
public:
  PmePencilMap(int i_a, int i_b, int n_b, int n, int *d)
    : ia(i_a), ib(i_b), nb(n_b),
      size(n), data(newcopyint(n,d)) {
  }
  // ...
  virtual int procNum(int, const CkArrayIndex &i) {
    // home pe for this pencil, stored in row-major order
    return data[ i.data()[ia] * nb + i.data()[ib] ];
  }
  // (signature restored per the Charm++ array-map interface)
  virtual void populateInitial(int, CkArrayOptions &, void *msg, CkArrMgr *mgr) {
    int mype = CkMyPe();
    for ( int i=0; i < size; ++i ) {
      if ( data[i] == mype ) {
        CkArrayIndex3D ai(0,0,0);
        ai.data()[ia] = i / nb;
        ai.data()[ib] = i % nb;
        if ( procNum(0,ai) != mype ) NAMD_bug("PmePencilMap is inconsistent");
        if ( ! msg ) NAMD_bug("PmePencilMap multiple pencils on a pe?");
        mgr->insertInitial(ai,msg);
        msg = 0;
      }
    }
    mgr->doneInserting();
    if ( msg ) CkFreeMsg(msg);
  }
private:
  const int ia, ib, nb, size;
  const int* const data;
  static int* newcopyint(int n, int *d) {
    int *newd = new int[n];
    memcpy(newd, d, n*sizeof(int));
    return newd;
  }
};
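// Added sketch (standalone model, not from the source): PmePencilMap keeps one
// pe per pencil in a row-major table, with ia/ib selecting which two components
// of the 3D array index vary.  For a z-pencil map built with ia=0, ib=1,
// nb=yBlocks, element (x,y,0) lives at data[x*yBlocks + y]:
//
//   int pencilHomePe(const int *data, int nb, int a, int b) {
//     return data[a*nb + b];  // row-major (a,b) -> pe
//   }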
  CProxy_PmePencilMap xm;
  CProxy_PmePencilMap ym;
  CProxy_PmePencilMap zm;
  { // try a pe on my node
    int node = CmiMyNode();
    int firstpe = CmiNodeFirst(node);
    int nodeSize = CmiNodeSize(node);
    int myrank = CkMyRank();
    for ( int i=0; i<nodeSize; ++i ) {
      int pe = firstpe + (myrank+i)%nodeSize;
      // ...
    }
  }
  { // try a pe on my physical node
    int *pelist;
    int nodeSize;
    CmiGetPesOnPhysicalNode(CmiPhysicalNodeID(CkMyPe()), &pelist, &nodeSize);
    int myrank = 0;
    for ( int i=0; i<nodeSize; ++i ) {
      if ( pelist[i] == CkMyPe() ) myrank = i;
    }
    for ( int i=0; i<nodeSize; ++i ) {
      int pe = pelist[(myrank+i)%nodeSize];
      // ...
    }
  }
  { // fall back to any pe
    int mype = CkMyPe();
    int npes = CkNumPes();
    for ( int i=0; i<npes; ++i ) {
      int pe = (mype+i)%npes;
      // ...
    }
  }
  NAMD_bug("findRecipEvirPe() failed!");
  int ncpus = CkNumPes();
  // ...
  for ( int i=0; i<numGridPes; ++i ) {
    // ...
  }
  std::sort(gridPeMap,gridPeMap+numGridPes);
  int firstTransPe = ncpus - numGridPes - numTransPes;
  if ( firstTransPe < 0 ) {
    // ...
    if ( ncpus > numTransPes ) firstTransPe = 1;
  }
  for ( int i=0; i<numTransPes; ++i ) {
    // ...
  }
  std::sort(transPeMap,transPeMap+numTransPes);
bool generateBGLORBPmePeList(int *pemap, int numPes,
                             int *block_pes=0, int nbpes=0);
// ...
// compare/order ints by their bit-reversed binary value
static inline int compare_bit_reversed(int a, int b) {
  int d = a ^ b;
  int c = 1;
  if ( d ) while ( ! (d & c) ) {
    c = c << 1;
  }
  return (a & c) - (b & c);
}
// ...
  int d = a ^ b;
  int c = 1;
  if ( d ) while ( ! (d & c) ) {
    c = c << 1;
  }
  // ...

struct sortop_bit_reversed {
  inline bool operator() (int a, int b) const {
    // ... (true when a precedes b in bit-reversed order)
  }
};
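// Added sketch (hypothetical data): assuming the elided comparator body returns
// true when a precedes b in bit-reversed order, sorting scatters consecutive
// ranks across the machine:
//
//   int pes[8] = {0,1,2,3,4,5,6,7};
//   std::sort(pes, pes+8, sortop_bit_reversed());
//   // resulting order: 0 4 2 6 1 5 3 7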
  void initialize_pencils(CkQdMsg*);
  void activate_pencils(CkQdMsg*);
  void recvArrays(CProxy_PmeXPencil, CProxy_PmeYPencil, CProxy_PmeZPencil);
  void initialize_computes();
  // ...
  void sendData(Lattice &, int sequence);
  void sendDataPart(int first, int last, Lattice &, int sequence, int sourcepe, int errors);
  // ...
  void sendPencils(Lattice &, int sequence);
  void sendPencilsPart(int first, int last, Lattice &, int sequence, int sourcepe);
  // ...
  void gridCalc1(void);
  void sendTransBarrier(void);
  void sendTransSubset(int first, int last);
  void sendTrans(void);
  // ...
  void gridCalc2(void);
#ifdef OPENATOM_VERSION
  void gridCalc2Moa(void);
#endif // OPENATOM_VERSION
  void gridCalc2R(void);
  // ...
  void sendUntrans(void);
  void sendUntransSubset(int first, int last);
  // ...
  void gridCalc3(void);
  void sendUngrid(void);
  void sendUngridSubset(int first, int last);
  // ...
  void ungridCalc(void);
  // ...
  void addRecipEvirClient(void);
  void submitReductions();

#if 0 && USE_PERSISTENT
  void setup_recvgrid_persistent();
#endif
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
  // ...
  void chargeGridSubmitted(Lattice &lattice, int sequence);
  // ...
  void cuda_submit_charges(Lattice &lattice, int sequence);
  // ...
  void sendChargeGridReady();
  // ...
  void pollChargeGridReady();
  void pollForcesReady();
  void recvChargeGridReady();
  void chargeGridReady(Lattice &lattice, int sequence);
#endif
  // ...
#if 0 && USE_PERSISTENT
  PersistentHandle *recvGrid_handle;
#endif
  // ...
  CProxy_ComputePmeMgr pmeProxy;
  CProxy_ComputePmeMgr pmeProxyDir;
  CProxy_NodePmeMgr pmeNodeProxy;
  // ...
  void addCompute(ComputePme *c) {  // (context reconstructed)
    if ( ! pmeComputes.size() ) initialize_computes();
    // ...
  }
#ifdef NAMD_FFTW_3
  fftwf_plan *forward_plan_x, *backward_plan_x;
  fftwf_plan *forward_plan_yz, *backward_plan_yz;
  // ...
#else
  fftw_plan forward_plan_x, backward_plan_x;
  rfftwnd_plan forward_plan_yz, backward_plan_yz;
#endif
  // ...
  int qsize, fsize, bsize;
  // ...
  int ungridForcesCount;

#if defined(NAMD_CUDA) || defined(NAMD_HIP)
#define NUM_STREAMS 1
  // ...
  int f_data_mgr_alloc;
  float *f_data_mgr_host;
  float *f_data_mgr_dev;
  // ...
  float *bspline_coeffs_dev;
  float *bspline_dcoeffs_dev;
#endif
  int recipEvirClients;
  // ...
  int myGridPe, myGridNode;
  int myTransPe, myTransNode;
  // ...
  int compute_sequence;
  // ...
  int sendTransBarrier_received;
  // ...
  int xBlocks, yBlocks, zBlocks;
  CProxy_PmeXPencil xPencil;
  CProxy_PmeYPencil yPencil;
  CProxy_PmeZPencil zPencil;
  // ...
  int numPencilsActive;
  int strayChargeErrors;
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
  // ...
#endif
  void sendDataHelper(int);
  void sendPencilsHelper(int);
  // ...
  void registerXPencil(CkArrayIndex3D, PmeXPencil *);
  void registerYPencil(CkArrayIndex3D, PmeYPencil *);
  void registerZPencil(CkArrayIndex3D, PmeZPencil *);
  // ...
  void recvPencilMapProxies(CProxy_PmePencilMap _xm, CProxy_PmePencilMap _ym,
                            CProxy_PmePencilMap _zm) {
    xm=_xm; ym=_ym; zm=_zm;
  }
  CProxy_PmePencilMap xm;
  CProxy_PmePencilMap ym;
  CProxy_PmePencilMap zm;
  // ...
  CProxy_ComputePmeMgr mgrProxy;
  // ...
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
  // ...
#endif
  CProxy_PmeXPencil xPencil;
  CProxy_PmeYPencil yPencil;
  CProxy_PmeZPencil zPencil;
  CkHashtableT<CkArrayIndex3D,PmeXPencil*> xPencilObj;
  CkHashtableT<CkArrayIndex3D,PmeYPencil*> yPencilObj;
  CkHashtableT<CkArrayIndex3D,PmeZPencil*> zPencilObj;
  // ...
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
  cudaEvent_t end_charge_memset;
  cudaEvent_t end_all_pme_kernels;
  cudaEvent_t end_potential_memcpy;
#endif
NodePmeMgr::~NodePmeMgr() {
  // ...
  delete [] mgrObjects;
}

void NodePmeMgr::initialize() {
  CProxy_ComputePmeMgr proxy = CkpvAccess(BOCclass_group).computePmeMgr;
  mgrObjects[CkMyRank()] = proxy.ckLocalBranch();
  if ( CkMyRank() == 0 ) {
    // ...
    mgrObject = proxy.ckLocalBranch();
  }
}
// ...
  mgrObject->fwdSharedTrans(msg);    // (recvTrans forwards to the node's manager)
// ...
  mgrObject->fwdSharedUntrans(msg);  // (recvUntrans forwards likewise)
// ...
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
  masterPmeMgr->recvUngrid(msg);
#else
  NAMD_bug("NodePmeMgr::recvUngrid called in non-CUDA build.");
#endif
// ...
  xPencilObj.put(idx)=obj;
// ...
  yPencilObj.put(idx)=obj;
// ...
  zPencilObj.put(idx)=obj;
// ...
ComputePmeMgr::ComputePmeMgr() : pmeProxy(thisgroup),
				 pmeProxyDir(thisgroup) {

  CkpvAccess(BOCclass_group).computePmeMgr = thisgroup;
  pmeNodeProxy = CkpvAccess(BOCclass_group).nodePmeMgr;
  nodePmeMgr = pmeNodeProxy[CkMyNode()].ckLocalBranch();
  // ...
  pmeNodeProxy.ckLocalBranch()->initialize();
  // ...
  if ( CmiMyRank() == 0 ) {
    // ...
  }
  // ...
  sendTransBarrier_received = 0;
  // ...
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
  // ...
  if ( CmiMyRank() == 0 ) {
    // ...
#if CUDA_VERSION >= 5050 || defined(NAMD_HIP)
    int leastPriority, greatestPriority;
    cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority);
    // ...
#define CUDA_STREAM_CREATE(X) cudaStreamCreateWithPriority(X,cudaStreamDefault,greatestPriority)
#else
#define CUDA_STREAM_CREATE(X) cudaStreamCreate(X)
#endif
    // ...
  }
  cudaEventCreateWithFlags(&end_charges,cudaEventDisableTiming);
  // ...
  f_data_mgr_alloc = 0;
  // ...
#define CUDA_EVENT_ID_PME_CHARGES 80
#define CUDA_EVENT_ID_PME_FORCES 81
#define CUDA_EVENT_ID_PME_TICK 82
#define CUDA_EVENT_ID_PME_COPY 83
#define CUDA_EVENT_ID_PME_KERNEL 84
  if ( 0 == CkMyPe() ) {
    // ...
  }
#endif
  // ...
  recipEvirClients = 0;
void ComputePmeMgr::recvArrays(
	CProxy_PmeXPencil x, CProxy_PmeYPencil y, CProxy_PmeZPencil z) {
  xPencil = x;  yPencil = y;  zPencil = z;

  if ( CmiMyRank() == 0 ) {
    pmeNodeProxy.ckLocalBranch()->xPencil=x;
    pmeNodeProxy.ckLocalBranch()->yPencil=y;
    pmeNodeProxy.ckLocalBranch()->zPencil=z;
  }
}
struct Coord
{
  int x, y, z;
  Coord(): x(0), y(0), z(0) {}
  Coord(int a, int b, int c): x(a), y(b), z(c) {}
};
extern void SFC_grid(int xdim, int ydim, int zdim,
                     int xdim1, int ydim1, int zdim1, vector<Coord> &result);

// reorder procs to follow the space-filling curve in result
void sort_sfc(SortableResizeArray<int> &procs, TopoManager &tmgr, vector<Coord> &result)
{
  SortableResizeArray<int> newprocs(procs.size());
  int num = 0;
  for ( int i=0; i<result.size(); i++ ) {
    Coord &c = result[i];
    for ( int j=0; j<procs.size(); j++ ) {
      int pe = procs[j];
      int x, y, z, t;
      tmgr.rankToCoordinates(pe, x, y, z, t);
      if ( x==c.x && y==c.y && z==c.z )
        newprocs[num++] = pe;
    }
  }
  CmiAssert(newprocs.size() == procs.size());
  procs = newprocs;
}

int find_level_grid(int x)
{
  // ...
}

CmiNodeLock tmgr_lock;
  tmgr_lock = CmiCreateLock();
  // ...
  gridPeMap = new int[CkNumPes()];
  transPeMap = new int[CkNumPes()];
  recipPeDest = new int[CkNumPes()];
  gridPeOrder = new int[CkNumPes()];
  gridNodeOrder = new int[CkNumNodes()];
  transNodeOrder = new int[CkNumNodes()];
  // ...
  if (CkMyRank() == 0) {
    // ...
  }
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
  // ...
    NAMD_die("PME offload requires exactly one CUDA device per process.  Use \"PMEOffload no\".");
  // ...
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, dev);
    // ...
    if ( deviceProp.major < 2 )
      NAMD_die("PME offload requires CUDA device of compute capability 2.0 or higher.  Use \"PMEOffload no\".");
  // ...
#endif
  else if ( simParams->PMEPencils > 0 ) usePencils = 1;
  // ...
    int nrps = simParams->PMEProcessors;
    if ( nrps <= 0 ) nrps = CkNumPes();
    if ( nrps > CkNumPes() ) nrps = CkNumPes();
    // ...
    int maxslabs = 1 + (dimx - 1) / simParams->PMEMinSlices;
    if ( maxslabs > nrps ) maxslabs = nrps;
    // ...
    if ( maxpencils > nrps ) maxpencils = nrps;
    if ( maxpencils > 3 * maxslabs ) usePencils = 1;
  // ...
    int nrps = simParams->PMEProcessors;
    if ( nrps <= 0 ) nrps = CkNumPes();
    if ( nrps > CkNumPes() ) nrps = CkNumPes();
    // ...
      xBlocks = yBlocks = zBlocks = simParams->PMEPencils;
    // ...
      if ( nb2 > nrps ) nb2 = nrps;
      if ( nb2 < 1 ) nb2 = 1;
      int nb = (int) sqrt((float)nb2);
      if ( nb < 1 ) nb = 1;
      xBlocks = zBlocks = nb;
    int bx = 1 + ( dimx - 1 ) / xBlocks;
    xBlocks = 1 + ( dimx - 1 ) / bx;
    // ...
    int by = 1 + ( dimy - 1 ) / yBlocks;
    yBlocks = 1 + ( dimy - 1 ) / by;
    // ...
    int dimz = simParams->PMEGridSizeZ / 2 + 1;  // complex dimension after r2c FFT
    int bz = 1 + ( dimz - 1 ) / zBlocks;
    zBlocks = 1 + ( dimz - 1 ) / bz;
    // ...
    if ( xBlocks * yBlocks > CkNumPes() ) {
      NAMD_die("PME pencils xBlocks * yBlocks > numPes");
    }
    if ( xBlocks * zBlocks > CkNumPes() ) {
      NAMD_die("PME pencils xBlocks * zBlocks > numPes");
    }
    if ( yBlocks * zBlocks > CkNumPes() ) {
      NAMD_die("PME pencils yBlocks * zBlocks > numPes");
    }
    iout << iINFO << "PME using " << xBlocks << " x " <<
      yBlocks << " x " << zBlocks <<
      " pencil grid for FFT and reciprocal sum.\n" << endi;
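// Added sketch (not from the source): the pencil sizing above uses the
// ceil-division idiom twice: bx = ceil(dimx/xBlocks) picks a block size, then
// xBlocks = ceil(dimx/bx) shrinks the block count so no pencil is empty.
//
//   int ceil_div(int n, int k) { return 1 + (n - 1) / k; }  // n, k > 0
//   // bx = ceil_div(dimx, xBlocks);  xBlocks = ceil_div(dimx, bx);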
    int minslices = simParams->PMEMinSlices;
    // ...
    int nrpx = ( dimx + minslices - 1 ) / minslices;
    // ...
    int nrpy = ( dimy + minslices - 1 ) / minslices;
    // ...
    int nrpp = CkNumPes();
    // ...
    if ( nrpp < nrpx ) nrpx = nrpp;
    if ( nrpp < nrpy ) nrpy = nrpp;
    // ...
    if ( nrps > CkNumPes() ) nrps = CkNumPes();
    if ( nrps > 0 ) nrpx = nrps;
    if ( nrps > 0 ) nrpy = nrps;
    // ...
    int bx = ( dimx + nrpx - 1 ) / nrpx;
    nrpx = ( dimx + bx - 1 ) / bx;
    int by = ( dimy + nrpy - 1 ) / nrpy;
    nrpy = ( dimy + by - 1 ) / by;
    if ( bx != ( dimx + nrpx - 1 ) / nrpx )
      NAMD_bug("Error in selecting number of PME processors.");
    if ( by != ( dimy + nrpy - 1 ) / nrpy )
      NAMD_bug("Error in selecting number of PME processors.");
    // ...
    iout << iINFO << "PME using " << numGridPes << " and " << numTransPes <<
      " processors for FFT and reciprocal sum.\n" << endi;
  int sum_npes = numTransPes + numGridPes;
  int max_npes = (numTransPes > numGridPes)?numTransPes:numGridPes;

#if 0 // USE_TOPOMAP
  // ...
  if(tmgr.hasMultipleProcsPerNode())
    // ...
  if(CkNumPes() > 2*sum_npes + patch_pes) {
    done = generateBGLORBPmePeList(transPeMap, numTransPes);
    done &= generateBGLORBPmePeList(gridPeMap, numGridPes, transPeMap, numTransPes);
  }
  // ...
  if(CkNumPes() > 2*max_npes + patch_pes) {
    done = generateBGLORBPmePeList(transPeMap, max_npes);
    gridPeMap = transPeMap;
  }
  // ...
#endif
    int i;
    for ( i=0; i<numGridPes && i<10; ++i ) {
      iout << " " << gridPeMap[i];
    }
    if ( i < numGridPes ) iout << " ...";
    // ...
    iout << iINFO << "PME TRANS LOCATIONS:";
    for ( i=0; i<numTransPes && i<10; ++i ) {
      iout << " " << transPeMap[i];
    }
    if ( i < numTransPes ) iout << " ...";
    // ...
  int node = -1;
  int real_node = -1;
  for ( i=0; i<numGridPes; ++i ) {
    if ( gridPeMap[i] == CkMyPe() ) myGridPe = i;
    // ...
    int real_node_i = CkNodeOf(gridPeMap[i]);
    if ( real_node_i == real_node ) {
      gridNodeInfo[node].npe += 1;
    } else {
      real_node = real_node_i;
      ++node;
      gridNodeInfo[node].real_node = real_node;
      // ...
      gridNodeInfo[node].npe = 1;
    }
    if ( CkMyNode() == real_node_i ) myGridNode = node;
  }
  numGridNodes = node + 1;
  // ...
  node = -1;  real_node = -1;
  for ( i=0; i<numTransPes; ++i ) {
    if ( transPeMap[i] == CkMyPe() ) myTransPe = i;
    // ...
    int real_node_i = CkNodeOf(transPeMap[i]);
    if ( real_node_i == real_node ) {
      transNodeInfo[node].npe += 1;
    } else {
      real_node = real_node_i;
      ++node;
      transNodeInfo[node].real_node = real_node;
      // ...
      transNodeInfo[node].npe = 1;
    }
    if ( CkMyNode() == real_node_i ) myTransNode = node;
  }
  numTransNodes = node + 1;
  // ...
  iout << iINFO << "PME USING " << numGridNodes << " GRID NODES AND "
       << numTransNodes << " TRANS NODES\n" << endi;
  for ( i = 0; i < numGridPes; ++i ) {
    gridPeOrder[i] = i;
  }
  if ( myGridPe < 0 ) {
    rand.reorder(gridPeOrder,numGridPes);
  } else {  // put self last
    gridPeOrder[myGridPe] = numGridPes-1;
    gridPeOrder[numGridPes-1] = myGridPe;
    rand.reorder(gridPeOrder,numGridPes-1);
  }
  for ( i = 0; i < numGridNodes; ++i ) {
    gridNodeOrder[i] = i;
  }
  if ( myGridNode < 0 ) {
    rand.reorder(gridNodeOrder,numGridNodes);
  } else {  // put self last
    gridNodeOrder[myGridNode] = numGridNodes-1;
    gridNodeOrder[numGridNodes-1] = myGridNode;
    rand.reorder(gridNodeOrder,numGridNodes-1);
  }
  for ( i = 0; i < numTransNodes; ++i ) {
    transNodeOrder[i] = i;
  }
  if ( myTransNode < 0 ) {
    rand.reorder(transNodeOrder,numTransNodes);
  } else {  // put self last
    transNodeOrder[myTransNode] = numTransNodes-1;
    transNodeOrder[numTransNodes-1] = myTransNode;
    rand.reorder(transNodeOrder,numTransNodes-1);
  }
  myGrid.dim3 = 2 * (myGrid.K3/2 + 1);  // padded real z dimension for in-place r2c
  // ...
  if ( ! usePencils ) {
    myGrid.block1 = ( myGrid.K1 + numGridPes - 1 ) / numGridPes;
    myGrid.block2 = ( myGrid.K2 + numTransPes - 1 ) / numTransPes;
    // ...
  } else {
    myGrid.block1 = ( myGrid.K1 + xBlocks - 1 ) / xBlocks;
    myGrid.block2 = ( myGrid.K2 + yBlocks - 1 ) / yBlocks;
    myGrid.block3 = ( myGrid.K3/2 + 1 + zBlocks - 1 ) / zBlocks;  // complex
  }
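// Added note: for an in-place r2c FFT the real z dimension is padded to
// dim3 = 2*(K3/2+1) floats so that K3/2+1 complex values fit in the same
// buffer, e.g. K3=64 -> dim3=66.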
  int ncpus = CkNumPes();
  // ...
  for ( int icpu=0; icpu<ncpus; ++icpu ) {
    // ...
    else nopatches.add(ri);
    // ...
  }
  // ...
  int *tmp = new int[patches.size()];
  int nn = patches.size();
  for (i=0;i<nn;i++) tmp[i] = patches[i];
  // ...
  for (i=0;i<nn;i++) patches.add(tmp[i]);
  // ...
  tmp = new int[nopatches.size()];
  nn = nopatches.size();
  for (i=0;i<nn;i++) tmp[i] = nopatches[i];
  // ...
  for (i=0;i<nn;i++) nopatches.add(tmp[i]);
  // ...
  int npens = xBlocks*yBlocks;
  if ( npens % ncpus == 0 ) useZero = 1;
  if ( npens == nopatches.size() + 1 ) useZero = 1;
  npens += xBlocks*zBlocks;
  if ( npens % ncpus == 0 ) useZero = 1;
  if ( npens == nopatches.size() + 1 ) useZero = 1;
  npens += yBlocks*zBlocks;
  if ( npens % ncpus == 0 ) useZero = 1;
  if ( npens == nopatches.size() + 1 ) useZero = 1;
  for ( i=nopatches.size()-1; i>=0; --i ) pmeprocs.add(nopatches[i]);
  // ...
  for ( i=patches.size()-1; i>=0; --i ) pmeprocs.add(patches[i]);
  // ...
  int npes = pmeprocs.size();
  for ( i=0; i<xBlocks*yBlocks; ++i, ++pe ) zprocs[i] = pmeprocs[pe%npes];
  if ( i>1 && zprocs[0] == zprocs[i-1] ) zprocs[0] = 0;
#if !USE_RANDOM_TOPO
  // ...
#endif
  for ( i=0; i<xBlocks*zBlocks; ++i, ++pe ) yprocs[i] = pmeprocs[pe%npes];
  if ( i>1 && yprocs[0] == yprocs[i-1] ) yprocs[0] = 0;
#if !USE_RANDOM_TOPO
  // ...
#endif
  for ( i=0; i<yBlocks*zBlocks; ++i, ++pe ) xprocs[i] = pmeprocs[pe%npes];
  if ( i>1 && xprocs[0] == xprocs[i-1] ) xprocs[0] = 0;
#if !USE_RANDOM_TOPO
  // ...
#endif
  // ...
  int xdim = tmgr.getDimNX();
  int ydim = tmgr.getDimNY();
  int zdim = tmgr.getDimNZ();
  int xdim1 = find_level_grid(xdim);
  int ydim1 = find_level_grid(ydim);
  int zdim1 = find_level_grid(zdim);
  // ...
  printf("xdim: %d %d %d, %d %d %d\n", xdim, ydim, zdim, xdim1, ydim1, zdim1);
  // ...
  vector<Coord> result;
  SFC_grid(xdim, ydim, zdim, xdim1, ydim1, zdim1, result);
  sort_sfc(xprocs, tmgr, result);
  sort_sfc(yprocs, tmgr, result);
  sort_sfc(zprocs, tmgr, result);
  // ...
  CmiUnlock(tmgr_lock);
  iout << iINFO << "PME Z PENCIL LOCATIONS:";
  for ( i=0; i<zprocs.size() && i<10; ++i ) {
    // ...
    tmgr.rankToCoordinates(zprocs[i], x, y, z, t);
    iout << " " << zprocs[i] << "(" << x << " " << y << " " << z << ")";
    // ...
    iout << " " << zprocs[i];
    // ...
  }
  if ( i < zprocs.size() ) iout << " ...";
  // ...
  if (CkMyRank() == 0) {
    for (pe=0, x = 0; x < xBlocks; ++x)
      for (y = 0; y < yBlocks; ++y, ++pe ) {
        // ...
      }
  }
  iout << iINFO << "PME Y PENCIL LOCATIONS:";
  for ( i=0; i<yprocs.size() && i<10; ++i ) {
    // ...
    tmgr.rankToCoordinates(yprocs[i], x, y, z, t);
    iout << " " << yprocs[i] << "(" << x << " " << y << " " << z << ")";
    // ...
    iout << " " << yprocs[i];
    // ...
  }
  if ( i < yprocs.size() ) iout << " ...";
  // ...
  if (CkMyRank() == 0) {
    for (pe=0, z = 0; z < zBlocks; ++z )
      for (x = 0; x < xBlocks; ++x, ++pe ) {
        // ...
      }
  }
  iout << iINFO << "PME X PENCIL LOCATIONS:";
  for ( i=0; i<xprocs.size() && i<10; ++i ) {
    // ...
    tmgr.rankToCoordinates(xprocs[i], x, y, z, t);
    iout << " " << xprocs[i] << "(" << x << " " << y << " " << z << ")";
    // ...
    iout << " " << xprocs[i];
    // ...
  }
  if ( i < xprocs.size() ) iout << " ...";
  // ...
  if (CkMyRank() == 0) {
    for (pe=0, y = 0; y < yBlocks; ++y )
      for (z = 0; z < zBlocks; ++z, ++pe ) {
        // ...
      }
  }
  if ( CkMyPe() == 0 ){
#if !USE_RANDOM_TOPO
    // ...
#endif
    CProxy_PmePencilMap zm = CProxy_PmePencilMap::ckNew(0,1,yBlocks,xBlocks*yBlocks,zprocs.begin());
    CProxy_PmePencilMap ym;
    // ... (layout choice)
      ym = CProxy_PmePencilMap::ckNew(0,2,zBlocks,zBlocks*xBlocks,yprocs.begin());
    // ...
      ym = CProxy_PmePencilMap::ckNew(2,0,xBlocks,zBlocks*xBlocks,yprocs.begin());
    CProxy_PmePencilMap xm;
    // ... (layout choice)
      xm = CProxy_PmePencilMap::ckNew(2,1,yBlocks,yBlocks*zBlocks,xprocs.begin());
    // ...
      xm = CProxy_PmePencilMap::ckNew(1,2,zBlocks,yBlocks*zBlocks,xprocs.begin());
    pmeNodeProxy.recvPencilMapProxies(xm,ym,zm);
    CkArrayOptions zo(xBlocks,yBlocks,1);  zo.setMap(zm);
    CkArrayOptions yo(xBlocks,1,zBlocks);  yo.setMap(ym);
    CkArrayOptions xo(1,yBlocks,zBlocks);  xo.setMap(xm);
    zo.setAnytimeMigration(false);  zo.setStaticInsertion(true);
    yo.setAnytimeMigration(false);  yo.setStaticInsertion(true);
    xo.setAnytimeMigration(false);  xo.setStaticInsertion(true);
    zPencil = CProxy_PmeZPencil::ckNew(zo);
    yPencil = CProxy_PmeYPencil::ckNew(yo);
    xPencil = CProxy_PmeXPencil::ckNew(xo);
    // ...
    zPencil = CProxy_PmeZPencil::ckNew();
    yPencil = CProxy_PmeYPencil::ckNew();
    xPencil = CProxy_PmeXPencil::ckNew();
    for (pe=0, x = 0; x < xBlocks; ++x)
      for (y = 0; y < yBlocks; ++y, ++pe ) {
        zPencil(x,y,0).insert(zprocs[pe]);
      }
    zPencil.doneInserting();
    // ...
    for (pe=0, x = 0; x < xBlocks; ++x)
      for (z = 0; z < zBlocks; ++z, ++pe ) {
        yPencil(x,0,z).insert(yprocs[pe]);
      }
    yPencil.doneInserting();
    // ...
    for (pe=0, y = 0; y < yBlocks; ++y )
      for (z = 0; z < zBlocks; ++z, ++pe ) {
        xPencil(0,y,z).insert(xprocs[pe]);
      }
    xPencil.doneInserting();
    // ...
    pmeProxy.recvArrays(xPencil,yPencil,zPencil);
  }
  // ...
  msgdata.grid = myGrid;
  for ( pe = 0; pe < numGridPes; ++pe ) {
    // ...
    if ( nx > myGrid.K1 ) nx = myGrid.K1;
    localInfo[pe].nx = nx - localInfo[pe].x_start;
  }
  for ( pe = 0; pe < numTransPes; ++pe ) {
    // ...
    if ( ny > myGrid.K2 ) ny = myGrid.K2;
    // ...
  }
  // ...
  int numNodes = CkNumPes();
  int *source_flags = new int[numNodes];
  int node;
  for ( node=0; node<numNodes; ++node ) {
    source_flags[node] = 0;
    recipPeDest[node] = 0;
  }
  for ( int pid=0; pid < numPatches; ++pid ) {
    int pnode = patchMap->node(pid);
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
    if ( offload ) pnode = CkNodeFirst(CkNodeOf(pnode));
#endif
    int shift1 = (myGrid.K1 + myGrid.order - 1)/2;
    // ...
    BigReal margina = 0.5 * ( patchdim - cutoff ) / sysdima;
    // ...
    int min1 = ((int) floor(myGrid.K1 * (minx - margina))) + shift1 - myGrid.order + 1;
    int max1 = ((int) floor(myGrid.K1 * (maxx + margina))) + shift1;
    for ( int i=min1; i<=max1; ++i ) {
      int ix = i;
      while ( ix >= myGrid.K1 ) ix -= myGrid.K1;
      while ( ix < 0 ) ix += myGrid.K1;
      // flag nodes that send charges to our slab
      if ( myGridPe >= 0 && ix >= localInfo[myGridPe].x_start &&
           ix < localInfo[myGridPe].x_start + localInfo[myGridPe].nx ) {
        source_flags[pnode] = 1;
      }
      // flag slabs our patches send to
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
      if ( offload ) {
        if ( pnode == CkNodeFirst(CkMyNode()) ) {
          recipPeDest[ix / myGrid.block1] = 1;
        }
      } else
#endif
      if ( pnode == CkMyPe() ) {
        recipPeDest[ix / myGrid.block1] = 1;
      }
    }
  }
  int numSourcesSamePhysicalNode = 0;
  numSources = 0;
  numDestRecipPes = 0;
  for ( node=0; node<numNodes; ++node ) {
    if ( source_flags[node] ) ++numSources;
    if ( recipPeDest[node] ) ++numDestRecipPes;
    if ( source_flags[node] && CmiPeOnSamePhysicalNode(node,CkMyPe()) ) ++numSourcesSamePhysicalNode;
  }
  // ...
  CkPrintf("pe %5d pme %5d of %5d on same physical node\n",
           CkMyPe(), numSourcesSamePhysicalNode, numSources);
  iout << iINFO << "PME " << CkMyPe() << " sources:";
  for ( node=0; node<numNodes; ++node ) {
    if ( source_flags[node] ) iout << " " << node;
  }
  // ...
  delete [] source_flags;
  // ...
  ungrid_count = numDestRecipPes;
  // ...
  sendTransBarrier_received = 0;
  // ...
  if ( myGridPe < 0 && myTransPe < 0 ) return;
  if ( myTransPe >= 0 ) {
    // ...
    pmeProxy[recipEvirPe].addRecipEvirClient();
  }
  // ...
  if ( myTransPe >= 0 ) {
    // ...
#ifdef OPENATOM_VERSION
    CProxy_ComputeMoaMgr moaProxy(CkpvAccess(BOCclass_group).computeMoaMgr);
    // ...
    myKSpace = new PmeKSpace(myGrid, k2_start, k2_end, 0, myGrid.dim3/2, moaProxy);
    // ...
    myKSpace = new PmeKSpace(myGrid, k2_start, k2_end, 0, myGrid.dim3/2);
    // ...
#else  // OPENATOM_VERSION
    myKSpace = new PmeKSpace(myGrid, k2_start, k2_end, 0, myGrid.dim3/2);
#endif // OPENATOM_VERSION
  }
  // ...
  int local_size = myGrid.block1 * myGrid.K2 * myGrid.dim3;
  int local_size_2 = myGrid.block2 * myGrid.K1 * myGrid.dim3;
  if ( local_size < local_size_2 ) local_size = local_size_2;
  qgrid = new float[local_size*numGrids];
  if ( numGridPes > 1 || numTransPes > 1 ) {
    kgrid = new float[local_size*numGrids];
  } else {
    kgrid = qgrid;  // (implied by the later "if ( kgrid != qgrid )" delete)
  }
  qgrid_size = local_size;
  // ...
  if ( myGridPe >= 0 ) {
    qgrid_start = localInfo[myGridPe].x_start * myGrid.K2 * myGrid.dim3;
    qgrid_len = localInfo[myGridPe].nx * myGrid.K2 * myGrid.dim3;
    fgrid_start = localInfo[myGridPe].x_start * myGrid.K2;
    fgrid_len = localInfo[myGridPe].nx * myGrid.K2;
  }
  int n[3]; n[0] = myGrid.K1; n[1] = myGrid.K2; n[2] = myGrid.K3;
  // ...
#ifdef NAMD_FFTW_3
  work = new fftwf_complex[n[0]];
  int fftwFlags = simParams->FFTWPatient ? FFTW_PATIENT :
                  simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE;
  if ( myGridPe >= 0 ) {
    forward_plan_yz = new fftwf_plan[numGrids];
    backward_plan_yz = new fftwf_plan[numGrids];
  }
  if ( myTransPe >= 0 ) {
    forward_plan_x = new fftwf_plan[numGrids];
    backward_plan_x = new fftwf_plan[numGrids];
  }
  if ( ! CkMyPe() ) iout << iINFO << "Optimizing 4 FFT steps.  1..." << endi;
  if ( myGridPe >= 0 ) {
    for ( int g=0; g<numGrids; ++g ) {
      forward_plan_yz[g] = fftwf_plan_many_dft_r2c(2, n+1,
          localInfo[myGridPe].nx,
          qgrid + qgrid_size * g, /* ...embed/stride args elided... */
          (fftwf_complex *)(qgrid + qgrid_size * g), /* ... */
          fftwFlags);
    }
  }

  int zdim = myGrid.dim3;
  // ...
  if ( ! CkMyPe() ) iout << " 2..." << endi;
  if ( myTransPe >= 0 ) {
    for ( int g=0; g<numGrids; ++g ) {
      forward_plan_x[g] = fftwf_plan_many_dft(1, n, xStride,
          (fftwf_complex *)(kgrid+qgrid_size*g), /* ... */
          (fftwf_complex *)(kgrid+qgrid_size*g), /* ... */
          FFTW_FORWARD, fftwFlags);
    }
  }
  // ...
  if ( ! CkMyPe() ) iout << " 3..." << endi;
  if ( myTransPe >= 0 ) {
    for ( int g=0; g<numGrids; ++g ) {
      backward_plan_x[g] = fftwf_plan_many_dft(1, n, xStride,
          (fftwf_complex *)(kgrid+qgrid_size*g), /* ... */
          (fftwf_complex *)(kgrid+qgrid_size*g), /* ... */
          FFTW_BACKWARD, fftwFlags);
    }
  }
  // ...
  if ( ! CkMyPe() ) iout << " 4..." << endi;
  if ( myGridPe >= 0 ) {
    for ( int g=0; g<numGrids; ++g ) {
      backward_plan_yz[g] = fftwf_plan_many_dft_c2r(2, n+1,
          localInfo[myGridPe].nx,
          (fftwf_complex *)(qgrid + qgrid_size * g), /* ... */
          qgrid + qgrid_size * g, /* ... */
          fftwFlags);
    }
  }
  if ( ! CkMyPe() ) iout << " Done.\n" << endi;
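// Added sketch (standalone, not from the source): the shape of a batched 2D
// r2c plan like the ones above, out-of-place on a toy nx * K2 * K3 grid.
// All names are local to this sketch.
//
//   #include <fftw3.h>
//   fftwf_plan make_yz_plan(float *in, fftwf_complex *out,
//                           int nx, int K2, int K3) {
//     int n[2] = { K2, K3 };
//     return fftwf_plan_many_dft_r2c(
//         2, n, nx,                     // rank-2 transform, nx slabs
//         in,  NULL, 1, K2*K3,          // contiguous real slabs
//         out, NULL, 1, K2*(K3/2+1),    // contiguous complex slabs
//         FFTW_ESTIMATE);
//   }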
#else  // FFTW 2
  work = new fftw_complex[n[0]];

  if ( ! CkMyPe() ) iout << iINFO << "Optimizing 4 FFT steps.  1..." << endi;
  if ( myGridPe >= 0 ) {
    forward_plan_yz = rfftwnd_create_plan_specific(2, n+1, FFTW_REAL_TO_COMPLEX,
        ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
        | FFTW_IN_PLACE | FFTW_USE_WISDOM, qgrid, 1, 0, 0);
  }
  if ( ! CkMyPe() ) iout << " 2..." << endi;
  if ( myTransPe >= 0 ) {
    forward_plan_x = fftw_create_plan_specific(n[0], FFTW_FORWARD,
        ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
        | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) kgrid,
        /* ...stride args elided... */);
  }
  if ( ! CkMyPe() ) iout << " 3..." << endi;
  if ( myTransPe >= 0 ) {
    backward_plan_x = fftw_create_plan_specific(n[0], FFTW_BACKWARD,
        ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
        | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) kgrid,
        /* ...stride args elided... */);
  }
  if ( ! CkMyPe() ) iout << " 4..." << endi;
  if ( myGridPe >= 0 ) {
    backward_plan_yz = rfftwnd_create_plan_specific(2, n+1, FFTW_COMPLEX_TO_REAL,
        ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
        | FFTW_IN_PLACE | FFTW_USE_WISDOM, qgrid, 1, 0, 0);
  }
  if ( ! CkMyPe() ) iout << " Done.\n" << endi;
#endif // NAMD_FFTW_3
#else  // no FFTW at all
  NAMD_die("Sorry, FFTW must be compiled in to use PME.");
#endif

  if ( myGridPe >= 0 && numSources == 0 )
    NAMD_bug("PME grid elements exist without sources.");
  grid_count = numSources;
  memset( (void*) qgrid, 0, qgrid_size * numGrids * sizeof(float) );
  trans_count = numGridPes;
}
void ComputePmeMgr::initialize_pencils(CkQdMsg *msg) {
  // ...
  if ( ! usePencils ) return;
  // ...
  pencilActive = new char[xBlocks*yBlocks];
  for ( int i=0; i<xBlocks; ++i ) {
    for ( int j=0; j<yBlocks; ++j ) {
      pencilActive[i*yBlocks+j] = 0;
    }
  }
  for ( int pid=0; pid < numPatches; ++pid ) {
    int pnode = patchMap->node(pid);
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
    if ( offload ) {
      if ( CkNodeOf(pnode) != CkMyNode() ) continue;
    } else
#endif
    if ( pnode != CkMyPe() ) continue;

    int shift1 = (myGrid.K1 + myGrid.order - 1)/2;
    int shift2 = (myGrid.K2 + myGrid.order - 1)/2;
    // ...
    BigReal margina = 0.5 * ( patchdim - cutoff ) / sysdima;
    // ...
    int min1 = ((int) floor(myGrid.K1 * (minx - margina))) + shift1 - myGrid.order + 1;
    int max1 = ((int) floor(myGrid.K1 * (maxx + margina))) + shift1;
    // ...
    BigReal marginb = 0.5 * ( patchdim - cutoff ) / sysdimb;
    // ...
    int min2 = ((int) floor(myGrid.K2 * (miny - marginb))) + shift2 - myGrid.order + 1;
    int max2 = ((int) floor(myGrid.K2 * (maxy + marginb))) + shift2;

    for ( int i=min1; i<=max1; ++i ) {
      int ix = i;
      while ( ix >= myGrid.K1 ) ix -= myGrid.K1;
      while ( ix < 0 ) ix += myGrid.K1;
      for ( int j=min2; j<=max2; ++j ) {
        int jy = j;
        while ( jy >= myGrid.K2 ) jy -= myGrid.K2;
        while ( jy < 0 ) jy += myGrid.K2;
        pencilActive[(ix / myGrid.block1)*yBlocks + (jy / myGrid.block2)] = 1;
      }
    }
  }
  numPencilsActive = 0;
  for ( int i=0; i<xBlocks; ++i ) {
    for ( int j=0; j<yBlocks; ++j ) {
      if ( pencilActive[i*yBlocks+j] ) {
        ++numPencilsActive;
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
        // ...
#endif
        zPencil(i,j,0).dummyRecvGrid(CkMyPe(),0);
      }
    }
  }
  activePencils = new ijpair[numPencilsActive];
  numPencilsActive = 0;
  for ( int i=0; i<xBlocks; ++i ) {
    for ( int j=0; j<yBlocks; ++j ) {
      if ( pencilActive[i*yBlocks+j] ) {
        activePencils[numPencilsActive++] = ijpair(i,j);
      }
    }
  }
  // ...
  rand.reorder(activePencils,numPencilsActive);
  // ...
  ungrid_count = numPencilsActive;
void ComputePmeMgr::activate_pencils(CkQdMsg *msg) {
  // ...
  if ( ! usePencils ) return;
  if ( CkMyPe() == 0 ) zPencil.dummyRecvGrid(CkMyPe(),1);
}

ComputePmeMgr::~ComputePmeMgr() {
  if ( CmiMyRank() == 0 ) {
    // ...
  }
  // ...
  delete [] localInfo;
  delete [] gridNodeInfo;
  delete [] transNodeInfo;
  delete [] gridPeMap;
  delete [] transPeMap;
  delete [] recipPeDest;
  delete [] gridPeOrder;
  delete [] gridNodeOrder;
  delete [] transNodeOrder;
  // ...
  if ( kgrid != qgrid ) delete [] kgrid;
  // ...
  delete [] gridmsg_reuse;
  // ...
  for ( int i=0; i<q_count; ++i) {
    delete [] q_list[i];
  }
void ComputePmeMgr::recvGrid(PmeGridMsg *msg) {
  // ...
  if ( grid_count == 0 ) {
    NAMD_bug("Message order failure in ComputePmeMgr::recvGrid\n");
  }
  if ( grid_count == numSources ) {
    // ... (first message this step)
  }

  int zdim = myGrid.dim3;
  int zlistlen = msg->zlistlen;
  int *zlist = msg->zlist;
  float *qmsg = msg->qgrid;
  for ( int g=0; g<numGrids; ++g ) {
    char *f = msg->fgrid + fgrid_len * g;
    float *q = qgrid + qgrid_size * g;
    for ( int i=0; i<fgrid_len; ++i ) {
      if ( f[i] ) {
        for ( int k=0; k<zlistlen; ++k ) {
          q[zlist[k]] += *(qmsg++);
        }
      }
      q += zdim;
    }
  }

  gridmsg_reuse[numSources-grid_count] = msg;
  --grid_count;

  if ( grid_count == 0 ) {
    pmeProxyDir[CkMyPe()].gridCalc1();
    if ( useBarrier ) pmeProxyDir[0].sendTransBarrier();
  }
}
#ifdef MANUAL_DEBUG_FFTW3

/* utility function for manual debugging */
void dumpMatrixFloat(const char *infilename, float *matrix, int xdim, int ydim, int zdim, int pe)
{
  char fmt[1000];
  char filename[1000];
  strncpy(fmt,infilename,999);
  strncat(fmt,"_%d.out",999);
  sprintf(filename,fmt, pe);
  FILE *loutfile = fopen(filename, "w");
#ifdef PAIRCALC_TEST_DUMP
  fprintf(loutfile,"%d\n",ydim);
#endif
  fprintf(loutfile,"%d %d %d\n",xdim,ydim,zdim);
  for(int i=0;i<xdim;i++)
    for(int j=0;j<ydim;j++)
      for(int k=0;k<zdim;k++)
        fprintf(loutfile,"%d %d %d %.8f\n",i,j,k,matrix[i*zdim*ydim+j*zdim+k]);
  fclose(loutfile);
}
void dumpMatrixFloat3(const char *infilename, float *matrix, int xdim, int ydim, int zdim, int x, int y, int z)
{
  char fmt[1000];
  char filename[1000];
  strncpy(fmt,infilename,999);
  strncat(fmt,"_%d_%d_%d.out",999);
  sprintf(filename,fmt, x,y,z);
  FILE *loutfile = fopen(filename, "w");
  CkAssert(loutfile!=NULL);
  CkPrintf("opened %s for dump\n",filename);
  fprintf(loutfile,"%d %d %d\n",xdim,ydim,zdim);
  for(int i=0;i<xdim;i++)
    for(int j=0;j<ydim;j++)
      for(int k=0;k<zdim;k++)
        fprintf(loutfile,"%d %d %d %.8f\n",i,j,k,matrix[i*zdim*ydim+j*zdim+k]);
  fclose(loutfile);
}

#endif // MANUAL_DEBUG_FFTW3
void ComputePmeMgr::gridCalc1(void) {
  // ... (forward FFT in y and z)
  for ( int g=0; g<numGrids; ++g ) {
#ifdef NAMD_FFTW_3
    fftwf_execute(forward_plan_yz[g]);
#else
    rfftwnd_real_to_complex(forward_plan_yz, localInfo[myGridPe].nx,
        qgrid + qgrid_size * g, 1, myGrid.dim2 * myGrid.dim3, 0, 0, 0);
#endif
  }
  // ...
  if ( ! useBarrier ) pmeProxyDir[CkMyPe()].sendTrans();
}

void ComputePmeMgr::sendTransBarrier(void) {
  sendTransBarrier_received += 1;
  // ...
  if ( sendTransBarrier_received < numGridPes ) return;
  sendTransBarrier_received = 0;
  for ( int i=0; i<numGridPes; ++i ) {
    pmeProxyDir[gridPeMap[i]].sendTrans();
  }
}
static inline void PmeSlabSendTrans(int first, int last, void *result, int paraNum, void *param) {
  ComputePmeMgr *mgr = (ComputePmeMgr *)param;
  mgr->sendTransSubset(first, last);
}

void ComputePmeMgr::sendTrans(void) {
  // ...
  untrans_count = numTransPes;

#if CMK_SMP && USE_CKLOOP
  // ...
  CkLoop_Parallelize(PmeSlabSendTrans, 1, (void *)this, CkMyNodeSize(), 0, numTransNodes-1, 0);
  // ...
#endif
  // ...
}
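// Added sketch (not from the source): CkLoop_Parallelize drives a plain-
// function "trampoline" like PmeSlabSendTrans above, which recovers the
// object from the void* parameter.  Worker/doRange are hypothetical names:
//
//   static void runChunk(int first, int last, void *, int, void *param) {
//     ((Worker *)param)->doRange(first, last);
//   }
//   // CkLoop_Parallelize(runChunk, 1, (void *)this, nthreads, 0, nItems-1, 0);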
void ComputePmeMgr::sendTransSubset(int first, int last) {
  // ...
  int zdim = myGrid.dim3;
  int nx = localInfo[myGridPe].nx;
  int x_start = localInfo[myGridPe].x_start;
  int slicelen = myGrid.K2 * zdim;

  ComputePmeMgr **mgrObjects = pmeNodeProxy.ckLocalBranch()->mgrObjects;

#if CMK_BLUEGENEL
  CmiNetworkProgressAfter (0);
#endif

  // send data for transpose
  for ( int j=first; j<=last; j++ ) {
    int node = transNodeOrder[j];  // different order on each node
    int pe = transNodeInfo[node].pe_start;
    int npe = transNodeInfo[node].npe;
    int totlen = 0;
    if ( node != myTransNode ) for ( int i=0; i<npe; ++i, ++pe) {
      // ... (sum up message length)
    }
    // ... (allocate newmsg)
    for ( int g=0; g<numGrids; ++g ) {
      float *qmsg = newmsg->qgrid + nx * totlen * g;
      pe = transNodeInfo[node].pe_start;
      for ( int i=0; i<npe; ++i, ++pe) {
        // ...
        if ( node == myTransNode ) {
          // ... (m = manager local to this pe; copy directly into its kgrid)
          qmsg = m->kgrid + m->qgrid_size * g + x_start*cpylen;
        }
        // ...
        for ( int x = 0; x < nx; ++x ) {
          CmiMemcpy((void*)qmsg, (void*)q, cpylen*sizeof(float));
          // ...
        }
      }
    }

    if ( node == myTransNode ) newmsg->nx = 0;
    if ( npe > 1 ) {
      if ( node == myTransNode ) fwdSharedTrans(newmsg);
      else pmeNodeProxy[transNodeInfo[node].real_node].recvTrans(newmsg);
    } else pmeProxy[transPeMap[transNodeInfo[node].pe_start]].recvTrans(newmsg);
  }
}
void ComputePmeMgr::fwdSharedTrans(PmeTransMsg *msg) {
  // ... (share one message with every rank on this node)
  int pe = transNodeInfo[myTransNode].pe_start;
  int npe = transNodeInfo[myTransNode].npe;
  CmiNodeLock lock = CmiCreateLock();
  int *count = new int; *count = npe;
  for ( int i=0; i<npe; ++i, ++pe) {
    // ... (allocate shmsg, set msg and lock)
    shmsg->count = count;
    // ...
    pmeProxy[transPeMap[pe]].recvSharedTrans(shmsg);
  }
}

void ComputePmeMgr::recvSharedTrans(PmeSharedTransMsg *msg) {
  // ...
  CmiLock(msg->lock);
  int count = --(*msg->count);
  CmiUnlock(msg->lock);
  if ( count == 0 ) {
    CmiDestroyLock(msg->lock);
    delete msg->count;
    delete msg->msg;
  }
  delete msg;
}
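// Added sketch (not from the source): the lock+counter pair above is a
// node-level shared-message refcount: every rank processes the same payload,
// and the last one to decrement the counter frees it.
//
//   CmiLock(lock);
//   int remaining = --(*count);
//   CmiUnlock(lock);
//   if ( remaining == 0 ) { CmiDestroyLock(lock); delete count; delete payload; }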
void ComputePmeMgr::procTrans(PmeTransMsg *msg) {
  // ...
  if ( trans_count == numGridPes ) {
    // ... (first message this step)
  }

  if ( msg->nx ) {
    int zdim = myGrid.dim3;
    NodePmeInfo &nodeInfo(transNodeInfo[myTransNode]);
    int first_pe = nodeInfo.pe_start;
    int last_pe = first_pe+nodeInfo.npe-1;
    // ...
    for ( int g=0; g<numGrids; ++g ) {
      CmiMemcpy((void*)(kgrid + qgrid_size * g + x_start*ny*zdim),
                (void*)(msg->qgrid + nx*(ny_msg*g+y_skip)*zdim),
                nx*ny*zdim*sizeof(float));
    }
  }

  --trans_count;

  if ( trans_count == 0 ) {
    pmeProxyDir[CkMyPe()].gridCalc2();
  }
}

void ComputePmeMgr::gridCalc2(void) {
#if CMK_BLUEGENEL
  CmiNetworkProgressAfter (0);
#endif

  int zdim = myGrid.dim3;
  // ... (1D FFT along x)
  for ( int g=0; g<numGrids; ++g ) {
#ifdef NAMD_FFTW_3
    fftwf_execute(forward_plan_x[g]);
#else
    fftw(forward_plan_x, ny * zdim / 2, (fftw_complex *)(kgrid+qgrid_size*g),
         ny * zdim / 2, 1, work, 1, 0);
#endif
  }
#ifdef OPENATOM_VERSION
  // ...
#endif // OPENATOM_VERSION
  // ...
#ifdef OPENATOM_VERSION
  // ...
#endif // OPENATOM_VERSION
}

#ifdef OPENATOM_VERSION
void ComputePmeMgr::gridCalc2Moa(void) {
  // ...
  int zdim = myGrid.dim3;
  // ...
  CProxy_ComputeMoaMgr moaProxy(CkpvAccess(BOCclass_group).computeMoaMgr);
  // ...
  for ( int g=0; g<numGrids; ++g ) {
#ifdef OPENATOM_VERSION_DEBUG
    CkPrintf("Sending recQ on processor %d \n", CkMyPe());
    for ( int i=0; i<=(ny * zdim / 2); ++i)
      CkPrintf("PE, g,fftw_q,k*q*g, kgrid, qgrid_size value %d pre-send = %d, %d, %f %f, %d, \n", i, CkMyPe(), g, (kgrid+qgrid_size*g)[i], kgrid[i], qgrid_size);
#endif // OPENATOM_VERSION_DEBUG
    CkCallback resumePme(CkIndex_ComputePmeMgr::gridCalc2R(), thishandle);
    moaProxy[CkMyPe()].recvQ(g, numGrids, (ny * zdim / 2), (kgrid+qgrid_size*g), resumePme);
  }
}
#endif // OPENATOM_VERSION

void ComputePmeMgr::gridCalc2R(void) {
  int useCkLoop = 0;
#if CMK_SMP && USE_CKLOOP
  if ( /* ... CkLoop enabled for k-space ... */
       && CkNumPes() >= 2 * numTransPes ) {
    useCkLoop = 1;
  }
#endif

  int zdim = myGrid.dim3;
  // ...
  for ( int g=0; g<numGrids; ++g ) {
    // ... (dispersion grid)
    //   ..., lattice, LJewaldcof, &(recip_evir2[g][1]), useCkLoop);
    // ... (electrostatic grid)
    //   ..., lattice, ewaldcof, &(recip_evir2[g][1]), useCkLoop);
    // ... (backward 1D FFT along x)
#ifdef NAMD_FFTW_3
    fftwf_execute(backward_plan_x[g]);
#else
    fftw(backward_plan_x, ny * zdim / 2, (fftw_complex *)(kgrid+qgrid_size*g),
         ny * zdim / 2, 1, work, 1, 0);
#endif
  }
  // ...
  pmeProxyDir[CkMyPe()].sendUntrans();
}
void ComputePmeMgr::sendUntrans(void) {
  trans_count = numGridPes;

  // ... (send energy and virial to the collector pe)
  for ( int g=0; g<numGrids; ++g ) {
    newmsg->evir[g] = recip_evir2[g];
  }
  // ...
  CmiEnableUrgentSend(1);
  pmeProxy[recipEvirPe].recvRecipEvir(newmsg);
  CmiEnableUrgentSend(0);

#if CMK_SMP && USE_CKLOOP
  // ...
  CkLoop_Parallelize(PmeSlabSendUntrans, 1, (void *)this, CkMyNodeSize(), 0, numGridNodes-1, 0);
  // ...
#endif
  // ...
}
void ComputePmeMgr::sendUntransSubset(int first, int last) {
  int zdim = myGrid.dim3;
  // ...
  int slicelen = myGrid.K2 * zdim;

  ComputePmeMgr **mgrObjects = pmeNodeProxy.ckLocalBranch()->mgrObjects;

#if CMK_BLUEGENEL
  CmiNetworkProgressAfter (0);
#endif

  // send data for reverse transpose
  for ( int j=first; j<=last; j++ ) {
    int node = gridNodeOrder[j];  // different order on each node
    int pe = gridNodeInfo[node].pe_start;
    int npe = gridNodeInfo[node].npe;
    int totlen = 0;
    if ( node != myGridNode ) for ( int i=0; i<npe; ++i, ++pe) {
      LocalPmeInfo &li = localInfo[pe];
      int cpylen = li.nx * zdim;
      totlen += cpylen;
    }
    // ... (allocate newmsg)
    for ( int g=0; g<numGrids; ++g ) {
      float *qmsg = newmsg->qgrid + ny * totlen * g;
      pe = gridNodeInfo[node].pe_start;
      for ( int i=0; i<npe; ++i, ++pe) {
        LocalPmeInfo &li = localInfo[pe];
        if ( node == myGridNode ) {
          // ... (m = manager local to this pe; copy directly into its qgrid)
          qmsg = m->qgrid + m->qgrid_size * g + y_start * zdim;
          float *q = kgrid + qgrid_size*g + li.x_start*ny*zdim;
          int cpylen = ny * zdim;
          for ( int x = 0; x < li.nx; ++x ) {
            CmiMemcpy((void*)qmsg, (void*)q, cpylen*sizeof(float));
            // ...
          }
        } else {
          CmiMemcpy((void*)qmsg,
                    (void*)(kgrid + qgrid_size*g + li.x_start*ny*zdim),
                    li.nx*ny*zdim*sizeof(float));
          qmsg += li.nx*ny*zdim;
        }
      }
    }

    if ( node == myGridNode ) newmsg->ny = 0;
    if ( npe > 1 ) {
      if ( node == myGridNode ) fwdSharedUntrans(newmsg);
      else pmeNodeProxy[gridNodeInfo[node].real_node].recvUntrans(newmsg);
    } else pmeProxy[gridPeMap[gridNodeInfo[node].pe_start]].recvUntrans(newmsg);
  }
}
void ComputePmeMgr::fwdSharedUntrans(PmeUntransMsg *msg) {
  // ...
  int pe = gridNodeInfo[myGridNode].pe_start;
  int npe = gridNodeInfo[myGridNode].npe;
  CmiNodeLock lock = CmiCreateLock();
  int *count = new int; *count = npe;
  for ( int i=0; i<npe; ++i, ++pe) {
    // ... (allocate shmsg, set msg and lock)
    shmsg->count = count;
    // ...
    pmeProxy[gridPeMap[pe]].recvSharedUntrans(shmsg);
  }
}

void ComputePmeMgr::recvSharedUntrans(PmeSharedUntransMsg *msg) {
  // ...
  CmiLock(msg->lock);
  int count = --(*msg->count);
  CmiUnlock(msg->lock);
  if ( count == 0 ) {
    CmiDestroyLock(msg->lock);
    delete msg->count;
    delete msg->msg;
  }
  delete msg;
}
void ComputePmeMgr::procUntrans(PmeUntransMsg *msg) {
#if CMK_BLUEGENEL
  CmiNetworkProgressAfter (0);
#endif

  // ...
  if ( msg->ny ) {
    int zdim = myGrid.dim3;
    int last_pe = first_pe+nodeInfo.npe-1;
    int x_skip = localInfo[myGridPe].x_start
               - localInfo[first_pe].x_start;
    int nx_msg = localInfo[last_pe].x_start
               + localInfo[last_pe].nx
               - localInfo[first_pe].x_start;
    int nx = localInfo[myGridPe].nx;
    // ...
    int slicelen = myGrid.K2 * zdim;
    int cpylen = ny * zdim;
    for ( int g=0; g<numGrids; ++g ) {
      float *q = qgrid + qgrid_size * g + y_start * zdim;
      float *qmsg = msg->qgrid + (nx_msg*g+x_skip) * cpylen;
      for ( int x = 0; x < nx; ++x ) {
        CmiMemcpy((void*)q, (void*)qmsg, cpylen*sizeof(float));
        q += slicelen;
        qmsg += cpylen;
      }
    }
  }

  --untrans_count;

  if ( untrans_count == 0 ) {
    pmeProxyDir[CkMyPe()].gridCalc3();
  }
}
void ComputePmeMgr::gridCalc3(void) {
  // ... (backward FFT in y and z)
  for ( int g=0; g<numGrids; ++g ) {
#ifdef NAMD_FFTW_3
    fftwf_execute(backward_plan_yz[g]);
#else
    rfftwnd_complex_to_real(backward_plan_yz, localInfo[myGridPe].nx,
        (fftw_complex *) (qgrid + qgrid_size * g),
        1, myGrid.dim2 * myGrid.dim3 / 2, 0, 0, 0);
#endif
  }
  // ...
  pmeProxyDir[CkMyPe()].sendUngrid();
}

void ComputePmeMgr::sendUngrid(void) {
#if CMK_SMP && USE_CKLOOP
  // ...
  CkLoop_Parallelize(PmeSlabSendUngrid, 1, (void *)this, CkMyNodeSize(), 0, numSources-1, 1);
  // ...
#endif
  // ...
  grid_count = numSources;
  memset( (void*) qgrid, 0, qgrid_size * numGrids * sizeof(float) );
}
void ComputePmeMgr::sendUngridSubset(int first, int last) {
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
  // ...
#endif

  for ( int j=first; j<=last; ++j ) {
    // ... (reuse the grid message received from this source)
    int zdim = myGrid.dim3;
    int flen = newmsg->len;
    int fstart = newmsg->start;
    int zlistlen = newmsg->zlistlen;
    int *zlist = newmsg->zlist;
    float *qmsg = newmsg->qgrid;
    for ( int g=0; g<numGrids; ++g ) {
      char *f = newmsg->fgrid + fgrid_len * g;
      float *q = qgrid + qgrid_size * g + (fstart-fgrid_start) * zdim;
      for ( int i=0; i<flen; ++i ) {
        if ( f[i] ) {
          for ( int k=0; k<zlistlen; ++k ) {
            *(qmsg++) = q[zlist[k]];
          }
        }
        q += zdim;
      }
    }
    // ...
    CmiEnableUrgentSend(1);
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
    if ( offload ) {
      pmeNodeProxy[CkNodeOf(pe)].recvUngrid(newmsg);
    } else
#endif
    pmeProxyDir[pe].recvUngrid(newmsg);
    CmiEnableUrgentSend(0);
  }
}
void ComputePmeMgr::recvUngrid(PmeGridMsg *msg) {
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
  // ...
#endif
  if ( ungrid_count == 0 ) {
    NAMD_bug("Message order failure in ComputePmeMgr::recvUngrid\n");
  }
  // ...
}
// ...
  if ( msg ) delete msg;
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
  // ...
#endif
  if ( ungrid_count == 0 ) {
    NAMD_bug("Message order failure in ComputePmeMgr::recvUngrid\n");
  }
  int uc = --ungrid_count;
  // ...
  if ( ungrid_count == 0 ) {
    pmeProxyDir[CkMyPe()].ungridCalc();
  }
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
#define count_limit 1000000
#define CUDA_POLL(FN,ARG) CcdCallFnAfter(FN,ARG,0.1)
#define EVENT_STRIDE 10

// poll handler for the asynchronous per-block force copies
// (scaffolding reconstructed around the surviving fragments)
void cuda_check_pme_forces(void *arg, double walltime) {
  // ...
  if ( err == cudaSuccess ) {
    // ... (one EVENT_STRIDE block of forces is ready; send them on)
  } else if ( err != cudaErrorNotReady ) {
    char errmsg[256];
    sprintf(errmsg, "in cuda_check_pme_forces for event %d after polling %d times over %f s on seq %d",
        /* ...args elided... */);
    // ...
  } else /* give up after count_limit polls */ {
    char errmsg[256];
    sprintf(errmsg, "cuda_check_pme_forces for event %d polled %d times over %f s on seq %d",
        /* ...args elided... */);
    // ...
  }
}
this == masterPmeMgr ) {
2565 double before = CmiWallTimer();
2567 cudaMemcpyAsync(v_data_dev, q_data_host, q_data_size, cudaMemcpyHostToDevice, 0 );
2568 cudaEventRecord(nodePmeMgr->end_potential_memcpy, 0 );
2570 cudaEventSynchronize(nodePmeMgr->end_potential_memcpy);
2571 cuda_errcheck(
"in ComputePmeMgr::ungridCalc after potential memcpy");
2574 const int myrank = CkMyRank();
2575 for (
int i=0; i<CkMyNodeSize(); ++i ) {
2586 for (
int i=0; i<n; ++i ) {
2587 cudaEventCreateWithFlags(&
end_forces[i],cudaEventDisableTiming);
2593 cudaMallocHost((
void**) &afn_host, 3*pcsz*
sizeof(
float*));
2594 cudaMalloc((
void**) &afn_dev, 3*pcsz*
sizeof(
float*));
2598 for (
int i=0; i<pcsz; ++i ) {
2602 if ( totn > f_data_mgr_alloc ) {
2603 if ( f_data_mgr_alloc ) {
2604 CkPrintf(
"Expanding CUDA forces allocation because %d > %d\n", totn, f_data_mgr_alloc);
2605 cudaFree(f_data_mgr_dev);
2606 cudaFreeHost(f_data_mgr_host);
2608 f_data_mgr_alloc = 1.2 * (totn + 100);
2609 cudaMalloc((
void**) &f_data_mgr_dev, 3*f_data_mgr_alloc*
sizeof(
float));
2610 cudaMallocHost((
void**) &f_data_mgr_host, 3*f_data_mgr_alloc*
sizeof(
float));
2614 float *f_dev = f_data_mgr_dev;
2615 float *f_host = f_data_mgr_host;
2616 for (
int i=0; i<pcsz; ++i ) {
2621 afn_host[3*i+1] = f_dev;
2622 afn_host[3*i+2] = f_dev + n;
2627 double before = CmiWallTimer();
2628 cudaMemcpyAsync(afn_dev, afn_host, 3*pcsz*
sizeof(
float*), cudaMemcpyHostToDevice, streams[stream]);
2629 cuda_errcheck(
"in ComputePmeMgr::ungridCalc after force pointer memcpy");
2631 cudaStreamWaitEvent(streams[stream], nodePmeMgr->end_potential_memcpy, 0);
2632 cuda_errcheck(
"in ComputePmeMgr::ungridCalc after wait for potential memcpy");
2635 for (
int i=0; i<pcsz; ++i ) {
2638 int dimy = pcsz - i;
2642 for (
int j=0; j<dimy; ++j ) {
2645 if ( n > maxn ) maxn = n;
2648 before = CmiWallTimer();
2651 v_arr_dev, afn_dev+3*i, dimy, maxn,
2656 cuda_errcheck(
"in ComputePmeMgr::ungridCalc after force kernel submit");
2658 before = CmiWallTimer();
2660 cudaMemcpyDeviceToHost, streams[stream]);
2662 cudaDeviceSynchronize();
2663 fprintf(stderr,
"i = %d\n", i);
2664 for(
int k=0; k < subtotn*3; k++)
2666 fprintf(stderr,
"f_data_host[%d][%d] = %f\n", i, k,
2670 cuda_errcheck(
"in ComputePmeMgr::ungridCalc after force memcpy submit");
2673 cuda_errcheck(
"in ComputePmeMgr::ungridCalc after end_forces event");
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
  // ...
  pmeProxy[this_pe].pollForcesReady();
  // ...
#endif
  ungrid_count = (usePencils ? numPencilsActive : numDestRecipPes );
}
void ComputePmeMgr::pollForcesReady() {
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
  // ...
#else
  NAMD_bug("ComputePmeMgr::pollForcesReady() called in non-CUDA build.");
#endif
}

ComputePme::ComputePme(ComputeID c, PatchID pid) : Compute(c), patchID(pid)  // (signature reconstructed)
{
  DebugM(4,"ComputePme created.\n");
  // ...
  CProxy_ComputePmeMgr::ckLocalBranch(
	CkpvAccess(BOCclass_group).computePmeMgr)->addCompute(this);
  // ...
  myGrid.dim3 = 2 * (myGrid.K3/2 + 1);
  // ...
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
  cuda_atoms_offset = 0;
  // ...
#endif
}
// ...
    NAMD_bug("ComputePme used with unknown patch.");
// ...
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
  // ...
#endif
  ungridForcesCount = 0;
// ...

void ComputePmeMgr::initialize_computes() {
  // ...
  strayChargeErrors = 0;

#if defined(NAMD_CUDA) || defined(NAMD_HIP)
  // ...
  int pe = master_pe = CkNodeFirst(CkMyNode());
  for ( int i=0; i<CkMyNodeSize(); ++i, ++pe ) {
    // ... (prefer a pe with patches that is not the nonbonded master)
    if ( master_pe < 1 && pe != deviceCUDA->getMasterPe() ) master_pe = pe;
    // ...
  }
  // ...
    NAMD_bug("ComputePmeMgr::initialize_computes() master_pe has no patches.");
  // ...
  masterPmeMgr = nodePmeMgr->mgrObjects[master_pe - CkNodeFirst(CkMyNode())];
  // ...
  nodePmeMgr->masterPmeMgr = masterPmeMgr;
#endif
  qsize = myGrid.K1 * myGrid.dim2 * myGrid.dim3;
  fsize = myGrid.K1 * myGrid.dim2;
  if ( myGrid.K2 != myGrid.dim2 ) NAMD_bug("PME myGrid.K2 != myGrid.dim2");
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
  // ...
#endif
  q_arr = new float*[fsize*numGrids];
  memset( (void*) q_arr, 0, fsize*numGrids * sizeof(float*) );
  q_list = new float*[fsize*numGrids];
  memset( (void*) q_list, 0, fsize*numGrids * sizeof(float*) );
2827 for (
int n=fsize*
numGrids, i=0; i<n; ++i ) f_arr[i] = 2;
2830 char *f = f_arr + g*fsize;
2834 int block1 = ( K1 + xBlocks - 1 ) / xBlocks;
2835 int block2 = ( K2 + yBlocks - 1 ) / yBlocks;
2836 int dim2 = myGrid.
dim2;
2837 for (
int ap=0; ap<numPencilsActive; ++ap) {
2838 int ib = activePencils[ap].
i;
2839 int jb = activePencils[ap].
j;
2840 int ibegin = ib*block1;
2841 int iend = ibegin + block1;
if ( iend > K1 ) iend = K1;
2842 int jbegin = jb*block2;
2843 int jend = jbegin + block2;
if ( jend > K2 ) jend = K2;
2844 int flen =
numGrids * (iend - ibegin) * (jend - jbegin);
2845 for (
int i=ibegin; i<iend; ++i ) {
2846 for (
int j=jbegin; j<jend; ++j ) {
2852 int block1 = ( myGrid.
K1 + numGridPes - 1 ) / numGridPes;
2853 bsize = block1 * myGrid.
dim2 * myGrid.
dim3;
2854 for (
int pe=0; pe<numGridPes; pe++) {
2855 if ( ! recipPeDest[pe] )
continue;
2856 int start = pe * bsize;
2858 if ( start >= qsize ) { start = 0; len = 0; }
2859 if ( start + len > qsize ) { len = qsize - start; }
2860 int zdim = myGrid.
dim3;
2861 int fstart = start / zdim;
2862 int flen = len / zdim;
2863 memset(f + fstart, 0, flen*
sizeof(
char));
2868 #if defined(NAMD_CUDA) || defined(NAMD_HIP) 2874 int f_alloc_count = 0;
2875 for (
int n=fsize, i=0; i<n; ++i ) {
2876 if ( f_arr[i] == 0 ) {
2882 q_arr =
new float*[fsize*
numGrids];
2883 memset( (
void*) q_arr, 0, fsize*
numGrids *
sizeof(
float*) );
2885 float **q_arr_dev_host =
new float*[fsize];
2886 cudaMalloc((
void**) &q_arr_dev, fsize *
sizeof(
float*));
2888 float **v_arr_dev_host =
new float*[fsize];
2889 cudaMalloc((
void**) &v_arr_dev, fsize *
sizeof(
float*));
2891 int q_stride = myGrid.
K3+myGrid.
order-1;
2892 q_data_size = f_alloc_count * q_stride *
sizeof(float);
2893 ffz_size = (fsize + q_stride) *
sizeof(
int);
2896 cudaMallocHost((
void**) &q_data_host, q_data_size+ffz_size);
2897 ffz_host = (
int*)(((
char*)q_data_host) + q_data_size);
2898 cudaMalloc((
void**) &q_data_dev, q_data_size+ffz_size);
2899 ffz_dev = (
int*)(((
char*)q_data_dev) + q_data_size);
2900 cudaMalloc((
void**) &v_data_dev, q_data_size);
2902 cudaMemset(q_data_dev, 0, q_data_size + ffz_size);
2903 cudaEventCreateWithFlags(&(nodePmeMgr->end_charge_memset),cudaEventDisableTiming);
2904 cudaEventRecord(nodePmeMgr->end_charge_memset, 0);
2905 cudaEventCreateWithFlags(&(nodePmeMgr->end_all_pme_kernels),cudaEventDisableTiming);
2906 cudaEventCreateWithFlags(&(nodePmeMgr->end_potential_memcpy),cudaEventDisableTiming);
2909 for (
int n=fsize, i=0; i<n; ++i ) {
2910 if ( f_arr[i] == 0 ) {
2911 q_arr[i] = q_data_host + f_alloc_count * q_stride;
2912 q_arr_dev_host[i] = q_data_dev + f_alloc_count * q_stride;
2913 v_arr_dev_host[i] = v_data_dev + f_alloc_count * q_stride;
2917 q_arr_dev_host[i] = 0;
2918 v_arr_dev_host[i] = 0;
2922 cudaMemcpy(q_arr_dev, q_arr_dev_host, fsize *
sizeof(
float*), cudaMemcpyHostToDevice);
2923 cudaMemcpy(v_arr_dev, v_arr_dev_host, fsize *
sizeof(
float*), cudaMemcpyHostToDevice);
2924 delete [] q_arr_dev_host;
2925 delete [] v_arr_dev_host;
2927 f_arr =
new char[fsize + q_stride];
2928 fz_arr = f_arr + fsize;
2929 memset(f_arr, 0, fsize + q_stride);
2930 memset(ffz_host, 0, (fsize + q_stride)*
sizeof(
int));
2937 #define XCOPY(X) masterPmeMgr->X = X; 2938 XCOPY(bspline_coeffs_dev)
2939 XCOPY(bspline_dcoeffs_dev)
2956 #define XCOPY(X) X = masterPmeMgr->X; 2957 XCOPY(bspline_coeffs_dev)
2958 XCOPY(bspline_dcoeffs_dev)
  fz_arr = new char[myGrid.K3+myGrid.order-1];
  // ...
#if 0 && USE_PERSISTENT
  recvGrid_handle = NULL;
#endif
// ...

ComputePme::~ComputePme()
{
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
  // ...
#endif
  for ( int g=0; g<numGridsMax; ++g ) delete myRealSpace[g];
}
#if 0 && USE_PERSISTENT
void ComputePmeMgr::setup_recvgrid_persistent()
{
  int K1 = myGrid.K1;
  int K2 = myGrid.K2;
  int dim2 = myGrid.dim2;
  int dim3 = myGrid.dim3;
  int block1 = myGrid.block1;
  int block2 = myGrid.block2;

  CkArray *zPencil_local = zPencil.ckLocalBranch();
  recvGrid_handle = (PersistentHandle*) malloc( sizeof(PersistentHandle) * numPencilsActive);
  for ( int ap=0; ap<numPencilsActive; ++ap) {
    int ib = activePencils[ap].i;
    int jb = activePencils[ap].j;
    int ibegin = ib*block1;
    int iend = ibegin + block1;  if ( iend > K1 ) iend = K1;
    int jbegin = jb*block2;
    int jend = jbegin + block2;  if ( jend > K2 ) jend = K2;
    int flen = numGrids * (iend - ibegin) * (jend - jbegin);
    // ...
    int fcount = 0;
    for ( int g=0; g<numGrids; ++g ) {
      char *f = f_arr + g*fsize;
      for ( int i=ibegin; i<iend; ++i ) {
        for ( int j=jbegin; j<jend; ++j ) {
          fcount += f[i*dim2+j];
        }
      }
    }
    // ...
    int zlistlen = 0;
    for ( int i=0; i<myGrid.K3; ++i ) {
      if ( fz_arr[i] ) ++zlistlen;
    }
    int hd = ( fcount? 1 : 0 );
    int peer = zPencil_local->homePe(CkArrayIndex3D(ib, jb, 0));
    // ...
    int compress_size = sizeof(float)*hd*fcount*zlistlen;
    int size = compress_start + compress_size + PRIORITY_SIZE/8+6;
    recvGrid_handle[ap] = CmiCreateCompressPersistentSize(peer, size, compress_start, compress_size, CMI_FLOATING);
  }
}
#endif
int ComputePme::noWork() {
  // ...
  if ( ! myMgr->ungridForcesCount && ! myMgr->recipEvirCount ) return 0;  // work to do
  myMgr->heldComputes.add(this);
  // ...
  positionBox->skip();
  // ...
  myMgr->noWorkCount = 0;
  myMgr->reduction->submit();
  // ...
}

void ComputePmeMgr::recvRecipEvir(PmeEvirMsg *msg) {
  // ...
  for ( int g=0; g<numGrids; ++g ) {
    evir[g] += msg->evir[g];
  }
  // ...
}
  numLocalQMAtoms = 0;
  for ( int paIter=0; paIter<patch->getNumAtoms(); paIter++ ) {
    if ( qmAtomGroup[xExt[paIter].id] != 0 ) {
      numLocalQMAtoms++;
    }
  }
  // ...
  if (qmLoclIndx != 0)
    delete [] qmLoclIndx;
  if (qmLocalCharges != 0)
    delete [] qmLocalCharges;

  qmLoclIndx = new int[numLocalQMAtoms] ;
  qmLocalCharges = new Real[numLocalQMAtoms] ;

  // ...
  int procAtms = 0;
  for ( int paIter=0; paIter<patch->getNumAtoms(); paIter++ ) {
    // ...
    for ( int i=0; i<numQMAtms; i++ ) {
      if (qmAtmIndx[i] == xExt[paIter].id) {
        qmLoclIndx[procAtms] = paIter ;
        qmLocalCharges[procAtms] = qmAtmChrg[i];
        procAtms++;
        break;
      }
    }
    // ...
    if (procAtms == numLocalQMAtoms)
      break;
  }
void ComputePme::doWork()
{
  DebugM(4,"Entering ComputePme::doWork().\n");
  // ...
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
  // ...
  if ( ! --(myMgr->ungridForcesCount) && ! myMgr->recipEvirCount ) myMgr->submitReductions();
  // ...
#endif
  // ...
#ifdef TRACE_COMPUTE_OBJECTS
  double traceObjStartTime = CmiWallTimer();
#endif
  // ...
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
  // ...
#endif
  // allocate storage for local atom data
  localData = localData_alloc.begin();
  localPartition_alloc.resize(numLocalAtoms);
  localPartition = localPartition_alloc.begin();
  // ...
  localGridData[g] = localData + numLocalAtoms*(g+extraGrids);
  // ...
  unsigned char * part_ptr = localPartition;
  // ...
  if ( patch->flags.doMolly ) {
    positionBox->close(&x);
    x = avgPositionBox->open();
  }
  // ...
  for( int i=0; i<numAtoms; ++i )
  {
    // ...
    data_ptr->cg = coulomb_sqrt * x[i].charge;
    // ...
  }
  // ...
  for( int i=0; i<numLocalQMAtoms; ++i )
  {
    localData[qmLoclIndx[i]].cg = coulomb_sqrt * qmLocalCharges[i];
  }
  // ...
  if ( patch->flags.doMolly ) { avgPositionBox->close(&x); }
  else { positionBox->close(&x); }
  // sort local atoms into the per-grid arrays; which partitions feed which
  // grid depends on the alchemical / LES / pair-interaction configuration
  // ...
    for( int i=0; i<numLocalAtoms; ++i ) {
      if ( localPartition[i] == 0 || localPartition[i] == (g+1) || localPartition[i] == (g+3) ) {
        // ...
        lgd[nga++] = localData[i];
      }
    }
    numGridAtoms[g] = nga;
  // ...
    for( int i=0; i<numLocalAtoms; ++i ) {
      if ( localPartition[i] == 0 ) {
        lgd[nga++] = localData[i];
      }
    }
    numGridAtoms[g] = nga;
  // ...
  for ( g=0; g<2; ++g ) {
    // ...
    for( int i=0; i<numLocalAtoms; ++i ) {
      if ( localPartition[i] == 0 || localPartition[i] == (g+1) ) {
        lgd[nga++] = localData[i];
      }
    }
    numGridAtoms[g] = nga;
  }
  for ( g=2; g<4; ++g ) {
    // ...
    for( int i=0; i<numLocalAtoms; ++i ) {
      if ( localPartition[i] == (g-1) ) {
        lgd[nga++] = localData[i];
      }
    }
    numGridAtoms[g] = nga;
  }
  // ...
    for( int i=0; i<numLocalAtoms; ++i ) {
      if ( localPartition[i] == 0 ) {
        lgd[nga++] = localData[i];
      }
    }
    numGridAtoms[g] = nga;
  // ...
    for( int i=0; i<numLocalAtoms; ++i ) {
      if ( localPartition[i] == 1 ) {
        lgd[nga++] = localData[i];
      }
    }
    numGridAtoms[g] = nga;
  // ...
    for( int i=0; i<numLocalAtoms; ++i ) {
      if ( localPartition[i] == 1 || localPartition[i] == 2 ) {
        lgd[nga++] = localData[i];
      }
    }
    numGridAtoms[g] = nga;
  for ( g=1; g<3; ++g ) {
    // ...
    for( int i=0; i<numLocalAtoms; ++i ) {
      if ( localPartition[i] == g ) {
        lgd[nga++] = localData[i];
      }
    }
    numGridAtoms[g] = nga;
  }
  // ...
  // LJ-PME: both grids see all atoms
  if ( numGrids != 2 ) NAMD_bug("ComputePme::doWork assertion for LJ-PME failed");
  // ...
  numGridAtoms[0] = numLocalAtoms;
  numGridAtoms[1] = numLocalAtoms;
  // ...
  for ( int i=0; i < numLocalAtoms; ++i ) {
    lgd[i].x = localData[i].x;
    lgd[i].y = localData[i].y;
    lgd[i].z = localData[i].z;
    // ...
  }
  // ...
  // single grid: all atoms
  numGridAtoms[0] = numLocalAtoms;
  if ( ! myMgr->doWorkCount ) {
    // ... (first compute to run this step clears shared state)
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
    // ...
#endif
    memset( (void*) myMgr->fz_arr, 0, (myGrid.K3+myGrid.order-1) * sizeof(char) );
    // ...
    for ( int i=0; i<myMgr->q_count; ++i) {
      memset( (void*) (myMgr->q_list[i]), 0, (myGrid.K3+myGrid.order-1) * sizeof(float) );
    }
    // ...
    myMgr->strayChargeErrors = 0;
    // ...
    myMgr->compute_sequence = sequence();
  }

  if ( sequence() != myMgr->compute_sequence ) NAMD_bug("ComputePme sequence mismatch in doWork()");
  // ...
  int strayChargeErrors = 0;
3384 for (
int i=0; i<numGridAtoms[g]; ++i) {
3385 selfEnergy += data_ptr->
cg * data_ptr->
cg;
3390 double alpha6 = LJewaldcof * LJewaldcof * LJewaldcof;
3391 alpha6 = alpha6 * alpha6;
3392 selfEnergy *= (1./12.) * alpha6;
3394 selfEnergy *= -1. * ewaldcof /
SQRT_PI;
3396 myMgr->evir[g][0] += selfEnergy;
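// Added note: the electrostatic branch above is the standard Ewald self-energy
// correction, E_self = -(ewaldcof/sqrt(pi)) * sum_i q_i^2, which removes each
// charge's interaction with its own smeared image.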
  for ( g=0; g<numGrids; ++g ) {
    float **q = myMgr->q_arr + g*myMgr->fsize;
    char *f = myMgr->f_arr + g*myMgr->fsize;
    // ...
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
    if ( offload ) {
      // ... (first allocation)
      cudaMallocHost((void**) &(myMgr->a_data_host), 7*na*sizeof(float));
      cudaMalloc((void**) &(myMgr->a_data_dev), 7*na*sizeof(float));
      // ...
      int n = numGridAtoms[g];
      // ... (grow the allocation if this compute's atoms do not fit)
        CkPrintf("Pe %d expanding CUDA PME atoms allocation because %d > %d\n",
                 /* ...args elided... */);
        // ...
        const float *a_data_host_old = myMgr->a_data_host;
        cudaMallocHost((void**) &(myMgr->a_data_host), 7*na*sizeof(float));
        // ...
        memcpy(myMgr->a_data_host, a_data_host_old, 7*cuda_atoms_offset*sizeof(float));
        cudaFreeHost((void*) a_data_host_old);
        // ...
        cudaMalloc((void**) &(myMgr->a_data_dev), 7*na*sizeof(float));
      // ...
      float *a_data_host = myMgr->a_data_host + 7 * cuda_atoms_offset;
      data_ptr = localGridData[g];
      double order_1 = myGrid.order - 1;
      double K1 = myGrid.K1;
      double K2 = myGrid.K2;
      double K3 = myGrid.K3;
      int found_negative = 0;
      for ( int i=0; i<n; ++i ) {
        if ( data_ptr[i].x < 0 || data_ptr[i].y < 0 || data_ptr[i].z < 0 ) {
          found_negative = 1;
          // ...
        }
        double x_int = (int) data_ptr[i].x;
        double y_int = (int) data_ptr[i].y;
        double z_int = (int) data_ptr[i].z;
        a_data_host[7*i  ] = data_ptr[i].x - x_int;  // fractional part
        a_data_host[7*i+1] = data_ptr[i].y - y_int;
        a_data_host[7*i+2] = data_ptr[i].z - z_int;
        a_data_host[7*i+3] = data_ptr[i].cg;
        x_int -= order_1;  if ( x_int < 0 ) x_int += K1;
        y_int -= order_1;  if ( y_int < 0 ) y_int += K2;
        z_int -= order_1;  if ( z_int < 0 ) z_int += K3;
        a_data_host[7*i+4] = x_int;   // wrapped integer cell
        a_data_host[7*i+5] = y_int;
        a_data_host[7*i+6] = z_int;
      }
      if ( found_negative ) NAMD_bug("found negative atom coordinate in ComputePme::doWork");
      // ...
    } else
#endif // NAMD_CUDA
    {
      myRealSpace[g]->fill_charges(q, myMgr->q_list, myMgr->q_count,
          strayChargeErrors, f, myMgr->fz_arr, localGridData[g]);
    }
  }
  myMgr->strayChargeErrors += strayChargeErrors;

#ifdef TRACE_COMPUTE_OBJECTS
  // ...
#endif

  if ( --(myMgr->doWorkCount) == 0 ) {
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
void ComputePmeMgr::cuda_submit_charges(Lattice &lattice, int sequence) {
  // ...
  const double before = CmiWallTimer();
  // async copy of the staged atoms, then the spreading kernel:
  //   cudaMemcpyAsync(..., cudaMemcpyHostToDevice, streams[stream]);
  const double after = CmiWallTimer();

  cudaStreamWaitEvent(streams[stream], nodePmeMgr->end_charge_memset, 0);

  // charge-spreading kernel launch; surviving argument fragment:
  //   ( ..., q_arr_dev, ffz_dev, ffz_dev + fsize, ... )

  const double after2 = CmiWallTimer();
  // ...
}

void cuda_check_pme_charges(void *arg, double walltime) {
  ComputePmeMgr *argp = (ComputePmeMgr *) arg;

  cudaError_t err = cudaEventQuery(argp->end_charges);
  if ( err == cudaSuccess ) {
    // ... (charge grid copy is complete; continue)
  } else if ( err != cudaErrorNotReady ) {
    char errmsg[256];
    sprintf(errmsg, "in cuda_check_pme_charges after polling %d times over %f s on seq %d",
        /* ...args elided... */);
    // ...
  } else /* give up after count_limit polls */ {
    char errmsg[256];
    sprintf(errmsg, "cuda_check_pme_charges polled %d times over %f s on seq %d",
        /* ...args elided... */);
    // ...
  }
}
  double before = CmiWallTimer();
  cudaEventRecord(nodePmeMgr->end_all_pme_kernels, 0);
  cudaStreamWaitEvent(streams[stream], nodePmeMgr->end_all_pme_kernels, 0);
  cudaMemcpyAsync(q_data_host, q_data_dev, q_data_size+ffz_size,
      cudaMemcpyDeviceToHost, streams[stream]);
  // ...
  cudaEventRecord(masterPmeMgr->end_charges, streams[stream]);
  cudaMemsetAsync(q_data_dev, 0, q_data_size + ffz_size, streams[stream]);
  cudaEventRecord(nodePmeMgr->end_charge_memset, streams[stream]);
  // ...
  pmeProxy[master_pe].pollChargeGridReady();
  // ...

void ComputePmeMgr::sendChargeGridReady() {
  for ( int i=0; i<CkMyNodeSize(); ++i ) {
    ComputePmeMgr *mgr = nodePmeMgr->mgrObjects[i];
    int cs = mgr->pmeComputes.size();
    if ( cs ) {
      mgr->ungridForcesCount = cs;
      mgr->recipEvirCount = mgr->recipEvirClients;
      // ...
    }
  }
  pmeProxy[master_pe].recvChargeGridReady();
}

void ComputePmeMgr::pollChargeGridReady() {
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
  // ...
#else
  NAMD_bug("ComputePmeMgr::pollChargeGridReady() called in non-CUDA build.");
#endif
}
void ComputePmeMgr::chargeGridReady(Lattice &lattice, int sequence) {
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
  if ( offload ) {
    int errcount = 0;
    int q_stride = myGrid.K3+myGrid.order-1;
    for ( int n=fsize+q_stride, j=fsize; j<n; ++j ) {
      f_arr[j] = ffz_host[j];
      if ( ffz_host[j] & ~1 ) ++errcount;
    }
    if ( errcount ) NAMD_bug("bad flag in ComputePmeMgr::chargeGridReady");
  }
#endif
  recipEvirCount = recipEvirClients;
  // ...
  for ( int j=0; j<myGrid.order-1; ++j ) {
    fz_arr[j] |= fz_arr[myGrid.K3+j];
  }
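// Added note: charges spread near the +z boundary spill into the order-1
// ghost cells at indices K3..K3+order-2; the loop above folds those flags
// (and, later, the charge columns themselves) back onto 0..order-2 to
// implement periodic wraparound.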
3658 #if 0 && USE_PERSISTENT 3659 if (recvGrid_handle== NULL) setup_recvgrid_persistent();
3663 int dim2 = myGrid.
dim2;
3664 int dim3 = myGrid.
dim3;
3665 int block1 = myGrid.
block1;
3666 int block2 = myGrid.
block2;
  NodePmeMgr *npMgr = pmeNodeProxy[CkMyNode()].ckLocalBranch();

  for ( int ap=first; ap<=last; ++ap ) {
    int ib = activePencils[ap].i;
    int jb = activePencils[ap].j;
    int ibegin = ib*block1;
    int iend = ibegin + block1;  if ( iend > K1 ) iend = K1;
    int jbegin = jb*block2;
    int jend = jbegin + block2;  if ( jend > K2 ) jend = K2;
    int flen = numGrids * (iend - ibegin) * (jend - jbegin);

    int fcount = 0;
    for ( int g=0; g<numGrids; ++g ) {
      char *f = f_arr + g*fsize;
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
      if ( offload ) {
        int errcount = 0;
        for ( int i=ibegin; i<iend; ++i ) {
          for ( int j=jbegin; j<jend; ++j ) {
            int k = i*dim2+j;
            f[k] = ffz_host[k];
            fcount += f[k];
            if ( ffz_host[k] & ~1 ) ++errcount;
          }
        }
        if ( errcount ) NAMD_bug("bad flag in ComputePmeMgr::sendPencilsPart");
      } else
#endif
      for ( int i=ibegin; i<iend; ++i ) {
        for ( int j=jbegin; j<jend; ++j ) {
          fcount += f[i*dim2+j];
        }
      }
    }

#ifdef NETWORK_PROGRESS
    CmiNetworkProgress();
#endif

    if ( ! pencilActive[ib*yBlocks+jb] )
      NAMD_bug("PME activePencils list inconsistent");

    int zlistlen = 0;
    for ( int i=0; i<myGrid.K3; ++i ) {
      if ( fz_arr[i] ) ++zlistlen;
    }

    int hd = ( fcount ? 1 : 0 );  // has data?

    PmeGridMsg *msg = new ( hd*zlistlen, hd*flen,
        hd*fcount*zlistlen, PRIORITY_SIZE ) PmeGridMsg;
    msg->sourceNode = sourcepe;
    msg->hasData = hd;
    msg->lattice = lattice;
    if ( hd ) {
#if 0
      msg->start = fstart;
      msg->len = flen;
#else
      msg->start = -1;   // obsolete?
      msg->len = -1;   // obsolete?
#endif
      msg->zlistlen = zlistlen;
      int *zlist = msg->zlist;
      zlistlen = 0;
      for ( int i=0; i<myGrid.K3; ++i ) {
        if ( fz_arr[i] ) zlist[zlistlen++] = i;
      }
      char *fmsg = msg->fgrid;
      float *qmsg = msg->qgrid;
      for ( int g=0; g<numGrids; ++g ) {
        char *f = f_arr + g*fsize;
        float **q = q_arr + g*fsize;
        for ( int i=ibegin; i<iend; ++i ) {
          for ( int j=jbegin; j<jend; ++j ) {
            *(fmsg++) = f[i*dim2+j];
            if ( f[i*dim2+j] ) {
              for ( int h=0; h<myGrid.order-1; ++h ) {
                q[i*dim2+j][h] += q[i*dim2+j][myGrid.K3+h];  // periodic wrap in z
              }
              for ( int k=0; k<zlistlen; ++k ) {
                *(qmsg++) = q[i*dim2+j][zlist[k]];
              }
            }
          }
        }
      }
    }

    msg->sequence = compute_sequence;
    SET_PRIORITY(msg,compute_sequence,PME_GRID_PRIORITY)
    CmiEnableUrgentSend(1);
#if USE_NODE_PAR_RECEIVE
    msg->destElem = CkArrayIndex3D(ib,jb,0);
    CProxy_PmePencilMap lzm = npMgr->zm;
    int destproc = lzm.ckLocalBranch()->procNum(0, msg->destElem);
    int destnode = CmiNodeOf(destproc);
#if USE_PERSISTENT
    CmiUsePersistentHandle(&recvGrid_handle[ap], 1);
#endif
    pmeNodeProxy[destnode].recvZGrid(msg);
#if USE_PERSISTENT
    CmiUsePersistentHandle(NULL, 0);
#endif
#else
#if USE_PERSISTENT
    CmiUsePersistentHandle(&recvGrid_handle[ap], 1);
#endif
    zPencil(ib,jb,0).recvGrid(msg);
#if USE_PERSISTENT
    CmiUsePersistentHandle(NULL, 0);
#endif
#endif
    CmiEnableUrgentSend(0);
  }
}
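// Message-size note: a pencil message carries only the occupied parts of the
// grid.  fgrid flags which (x,y) columns have data, zlist holds the indices
// of the nonzero z-planes, and qgrid then stores fcount*zlistlen floats, so a
// sparsely charged pencil costs far less than shipping the full
// (iend-ibegin)*(jend-jbegin)*K3 volume.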
void NodePmeMgr::sendPencilsHelper(int iter) {
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
  ComputePmeMgr *obj = masterPmeMgr;
  obj->sendPencilsPart(iter, iter, *obj->sendDataHelper_lattice,
                       obj->sendDataHelper_sequence, obj->sendDataHelper_sourcepe);
#else
  NAMD_bug("NodePmeMgr::sendPencilsHelper called in non-CUDA build");
#endif
}
void ComputePmeMgr::sendPencils(Lattice &lattice, int sequence) {

  sendDataHelper_lattice = &lattice;
  sendDataHelper_sequence = sequence;
  sendDataHelper_sourcepe = CkMyPe();

#if defined(NAMD_CUDA) || defined(NAMD_HIP)
  if ( offload ) {
    for ( int ap=0; ap < numPencilsActive; ++ap ) {
#if CMK_MULTICORE
      // nodegroup messages on multicore are delivered to the sending pe
      int ib = activePencils[ap].i;
      int jb = activePencils[ap].j;
      int destproc = nodePmeMgr->zm.ckLocalBranch()->procNum(0, CkArrayIndex3D(ib,jb,0));
      pmeProxy[destproc].sendPencilsHelper(ap);
#else
      pmeNodeProxy[CkMyNode()].sendPencilsHelper(ap);
#endif
    }
  } else
#endif
  {
    sendPencilsPart(0,numPencilsActive-1,lattice,sequence,CkMyPe());
  }
  if ( strayChargeErrors ) {
    strayChargeErrors = 0;
    iout << iERROR << "Stray PME grid charges detected: "
         << CkMyPe() << " sending to (x,y)";
    int K1 = myGrid.K1;
    int K2 = myGrid.K2;
    int dim2 = myGrid.dim2;
    int block1 = myGrid.block1;
    int block2 = myGrid.block2;
    for ( int ib=0; ib<xBlocks; ++ib ) {
      for ( int jb=0; jb<yBlocks; ++jb ) {
        int ibegin = ib*block1;
        int iend = ibegin + block1;  if ( iend > K1 ) iend = K1;
        int jbegin = jb*block2;
        int jend = jbegin + block2;  if ( jend > K2 ) jend = K2;
        int flen = numGrids * (iend - ibegin) * (jend - jbegin);

        for ( int g=0; g<numGrids; ++g ) {
          char *f = f_arr + g*fsize;
          if ( ! pencilActive[ib*yBlocks+jb] ) {
            for ( int i=ibegin; i<iend; ++i ) {
              for ( int j=jbegin; j<jend; ++j ) {
                if ( f[i*dim2+j] == 3 ) {
                  f[i*dim2+j] = 2;
                  iout << " (" << i << "," << j << ")";
                }
              }
            }
          }
        }
      }
    }
    iout << "\n" << endi;
  }
}
void ComputePmeMgr::copyPencils(PmeGridMsg *msg) {

  int K1 = myGrid.K1;
  int K2 = myGrid.K2;
  int dim2 = myGrid.dim2;
  int dim3 = myGrid.dim3;
  int block1 = myGrid.block1;
  int block2 = myGrid.block2;

  int ib = msg->sourceNode / yBlocks;
  int jb = msg->sourceNode % yBlocks;

  int ibegin = ib*block1;
  int iend = ibegin + block1;  if ( iend > K1 ) iend = K1;
  int jbegin = jb*block2;
  int jend = jbegin + block2;  if ( jend > K2 ) jend = K2;

  int zlistlen = msg->zlistlen;
  int *zlist = msg->zlist;
  float *qmsg = msg->qgrid;
  for ( int g=0; g<numGrids; ++g ) {
    char *f = f_arr + g*fsize;
    float **q = q_arr + g*fsize;
    for ( int i=ibegin; i<iend; ++i ) {
      for ( int j=jbegin; j<jend; ++j ) {
        if ( f[i*dim2+j] ) {
          f[i*dim2+j] = 0;
          for ( int k=0; k<zlistlen; ++k ) {
            q[i*dim2+j][zlist[k]] = *(qmsg++);
          }
          for ( int h=0; h<myGrid.order-1; ++h ) {
            q[i*dim2+j][myGrid.K3+h] = q[i*dim2+j][h];  // re-expand periodic wrap
          }
        }
      }
    }
  }
}
void ComputePmeMgr::sendDataPart(int first, int last, Lattice &lattice,
                                 int sequence, int sourcepe, int errors) {

  CProxy_ComputePmeMgr pmeProxy(CkpvAccess(BOCclass_group).computePmeMgr);

  for ( int j=first; j<=last; j++ ) {
    int pe = gridPeOrder[j];  // different order
    if ( ! recipPeDest[pe] && ! errors ) continue;
    int start = pe * bsize;
    int len = bsize;
    if ( start >= qsize ) { start = 0; len = 0; }
    if ( start + len > qsize ) { len = qsize - start; }
    int zdim = myGrid.dim3;
    int fstart = start / zdim;
    int flen = len / zdim;
    int fcount = 0;
    int i;

    int g;
    for ( g=0; g<numGrids; ++g ) {
      char *f = f_arr + fstart + g*fsize;
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
      if ( offload ) {
        int errcount = 0;
        for ( i=0; i<flen; ++i ) {
          f[i] = ffz_host[fstart+i];
          fcount += f[i];
          if ( ffz_host[fstart+i] & ~1 ) ++errcount;
        }
        if ( errcount ) NAMD_bug("bad flag in ComputePmeMgr::sendDataPart");
      } else
#endif
      for ( i=0; i<flen; ++i ) {
        fcount += f[i];
      }
      if ( ! recipPeDest[pe] ) {
        int errfound = 0;
        for ( i=0; i<flen; ++i ) {
          if ( f[i] == 3 ) { errfound = 1; break; }
        }
        if ( errfound ) {
          iout << iERROR << "Stray PME grid charges detected: "
               << sourcepe << " sending to " << gridPeMap[pe] << " for planes";
          int iz = -1;
          for ( i=0; i<flen; ++i ) {
            if ( f[i] == 3 ) {
              f[i] = 2;
              int jz = (i+fstart)/myGrid.K2;
              if ( iz != jz ) { iout << " " << jz;  iz = jz; }
            }
          }
          iout << "\n" << endi;
        }
      }
    }

#ifdef NETWORK_PROGRESS
    CmiNetworkProgress();
#endif

    if ( ! recipPeDest[pe] ) continue;

    int zlistlen = 0;
    for ( i=0; i<myGrid.K3; ++i ) {
      if ( fz_arr[i] ) ++zlistlen;
    }

    PmeGridMsg *msg = new (zlistlen, flen*numGrids,
                           fcount*zlistlen, PRIORITY_SIZE) PmeGridMsg;

    msg->sourceNode = sourcepe;
    msg->lattice = lattice;
    msg->start = fstart;
    msg->len = flen;
    msg->zlistlen = zlistlen;
    int *zlist = msg->zlist;
    zlistlen = 0;
    for ( i=0; i<myGrid.K3; ++i ) {
      if ( fz_arr[i] ) zlist[zlistlen++] = i;
    }
    float *qmsg = msg->qgrid;
    for ( g=0; g<numGrids; ++g ) {
      char *f = f_arr + fstart + g*fsize;
      CmiMemcpy((void*)(msg->fgrid+g*flen),(void*)f,flen*sizeof(char));
      float **q = q_arr + fstart + g*fsize;
      for ( i=0; i<flen; ++i ) {
        if ( f[i] ) {
          for ( int h=0; h<myGrid.order-1; ++h ) {
            q[i][h] += q[i][myGrid.K3+h];  // periodic wrap in z
          }
          for ( int k=0; k<zlistlen; ++k ) {
            *(qmsg++) = q[i][zlist[k]];
          }
        }
      }
    }

    msg->sequence = compute_sequence;
    SET_PRIORITY(msg,compute_sequence,PME_GRID_PRIORITY)
    pmeProxy[gridPeMap[pe]].recvGrid(msg);
  }
}
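// Slab (non-pencil) path: each grid PE owns bsize charge entries, i.e. a
// contiguous run of flen = len/zdim (x,y) columns starting at fstart =
// start/zdim, compressed with the same zlist scheme as the pencil path above.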
void NodePmeMgr::sendDataHelper(int iter) {
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
  ComputePmeMgr *obj = masterPmeMgr;
  obj->sendDataPart(iter, iter, *obj->sendDataHelper_lattice,
                    obj->sendDataHelper_sequence, obj->sendDataHelper_sourcepe,
                    obj->sendDataHelper_errors);
#else
  NAMD_bug("NodePmeMgr::sendDataHelper called in non-CUDA build");
#endif
}
void ComputePmeMgr::sendData(Lattice &lattice, int sequence) {

  sendDataHelper_lattice = &lattice;
  sendDataHelper_sequence = sequence;
  sendDataHelper_sourcepe = CkMyPe();
  sendDataHelper_errors = strayChargeErrors;
  strayChargeErrors = 0;

#if defined(NAMD_CUDA) || defined(NAMD_HIP)
  if ( offload ) {
    for ( int i=0; i < numGridPes; ++i ) {
      int pe = gridPeOrder[i];  // different order
      if ( ! recipPeDest[pe] && ! sendDataHelper_errors ) continue;
#if CMK_MULTICORE
      pmeProxy[gridPeMap[pe]].sendDataHelper(i);
#else
      pmeNodeProxy[CkMyNode()].sendDataHelper(i);
#endif
    }
  } else
#endif
  {
    sendDataPart(0,numGridPes-1,lattice,sequence,CkMyPe(),sendDataHelper_errors);
  }
}
void ComputePmeMgr::copyResults(PmeGridMsg *msg) {

  int zdim = myGrid.dim3;
  int flen = msg->len;
  int fstart = msg->start;
  int zlistlen = msg->zlistlen;
  int *zlist = msg->zlist;
  float *qmsg = msg->qgrid;
  for ( int g=0; g<numGrids; ++g ) {
    char *f = msg->fgrid + g*flen;
    float **q = q_arr + fstart + g*fsize;
    for ( int i=0; i<flen; ++i ) {
      if ( f[i] ) {
        f[i] = 0;
        for ( int k=0; k<zlistlen; ++k ) {
          q[i][zlist[k]] = *(qmsg++);
        }
        for ( int h=0; h<myGrid.order-1; ++h ) {
          q[i][myGrid.K3+h] = q[i][h];  // re-expand periodic wrap
        }
      }
    }
  }
}
void ComputePme::ungridForces() {

  if ( sequence() != myMgr->compute_sequence )
    NAMD_bug("ComputePme sequence mismatch in ungridForces()");

  SimParameters *simParams = Node::Object()->simParameters;

  localResults_alloc.resize(numLocalAtoms * ((numGrids>1 || selfOn)?2:1));
  Vector *localResults = localResults_alloc.begin();
  Vector *gridResults;
  if ( alchOn || lesOn || selfOn || pairOn ) {
    for ( int i=0; i<numLocalAtoms; ++i ) { localResults[i] = 0.; }
    gridResults = localResults + numLocalAtoms;
  } else {
    gridResults = localResults;
  }

  Vector pairForce = 0.;
  Lattice &lattice = patch->flags.lattice;
  for ( int g=0; g<numGrids; ++g ) {
#ifdef NETWORK_PROGRESS
    CmiNetworkProgress();
#endif
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
    if ( offload ) {
      int errfound = 0;
      for ( int n=numGridAtoms[g], i=0; i<n; ++i ) {
        // Neither isnan() nor x != x worked on Cray; compare the bit pattern.
        if ( ((int*)f_data_host)[3*i] == 0x7fffffff ) { errfound = 1; }  // CUDA NaN
        gridResults[i].x = f_data_host[3*i];
        gridResults[i].y = f_data_host[3*i+1];
        gridResults[i].z = f_data_host[3*i+2];
      }
      if ( errfound ) {
        int errcount = 0;
        for ( int n=numGridAtoms[g], i=0; i<n; ++i ) {
          float f = f_data_host[3*i];
          if ( ((int*)f_data_host)[3*i] == 0x7fffffff ) {  // CUDA NaN
            ++errcount;
            gridResults[i] = 0.;
          }
        }
        iout << iERROR << "Stray PME grid charges detected: "
             << errcount << " atoms on pe " << CkMyPe() << "\n" << endi;
      }
    } else
#endif
    {
      myRealSpace[g]->compute_forces(myMgr->q_arr+g*myMgr->fsize, localGridData[g], gridResults);
    }
    scale_forces(gridResults, numGridAtoms[g], lattice);

    // fold this grid's contribution into the local accumulation buffer
    gridResults += numLocalAtoms;
    for ( int i=0; i<numLocalAtoms; i++ ) {
      localResults[i] += gridResults[i];
    }
    BigReal scale = 1.;
    if ( alchOn ) {
      BigReal elecLambdaUp, elecLambdaDown;
      BigReal alchLambda = simParams->getCurrentLambda(patch->flags.step);
      myMgr->alchLambda = alchLambda;
      BigReal alchLambda2 = simParams->getCurrentLambda2(patch->flags.step);
      myMgr->alchLambda2 = alchLambda2;
      elecLambdaUp = simParams->getElecLambda(alchLambda);
      elecLambdaDown = simParams->getElecLambda(1. - alchLambda);

      if ( g == 0 ) scale = elecLambdaUp;
      else if ( g == 1 ) scale = elecLambdaDown;
      else if ( g == 2 ) scale = (elecLambdaUp + elecLambdaDown - 1)*(-1);

      if ( alchDecouple ) {
        if ( g == 2 ) scale = 1 - elecLambdaUp;
        else if ( g == 3 ) scale = 1 - elecLambdaDown;
        else if ( g == 4 ) scale = (elecLambdaUp + elecLambdaDown - 1)*(-1);
      }

      int nga = 0;
      if ( ! alchDecouple ) {
        if ( g < 2 ) {
          for ( int i=0; i<numLocalAtoms; ++i ) {
            if ( localPartition[i] == 0 || localPartition[i] == (g+1) || localPartition[i] == (g+3) ) {
              // g=0: partitions 0,1,3;  g=1: partitions 0,2,4
              localResults[i] += gridResults[nga++] * scale;
            }
          }
        } else {
          for ( int i=0; i<numLocalAtoms; ++i ) {
            if ( localPartition[i] == 0 ) {
              // g=2: partition 0 only
              localResults[i] += gridResults[nga++] * scale;
            }
          }
        }
      } else {  // alchDecouple
        if ( g < 2 ) {
          for ( int i=0; i<numLocalAtoms; ++i ) {
            if ( localPartition[i] == 0 || localPartition[i] == (g+1) ) {
              localResults[i] += gridResults[nga++] * scale;
            }
          }
        } else {
          for ( int i=0; i<numLocalAtoms; ++i ) {
            if ( localPartition[i] == (g-1) || localPartition[i] == (g-4) ) {
              // g=2: partitions 1,3;  g=3: partitions 2,4
              localResults[i] += gridResults[nga++] * scale;
            }
          }
        }
      }
    } else if ( lesOn ) {
      BigReal alchLambda = simParams->getCurrentLambda(patch->flags.step);
      myMgr->alchLambda = alchLambda;
      BigReal alchLambda2 = simParams->getCurrentLambda2(patch->flags.step);
      myMgr->alchLambda2 = alchLambda2;
      if ( g == 0 ) scale = alchLambda;
      else if ( g == 1 ) scale = 1. - alchLambda;
      int nga = 0;
      for ( int i=0; i<numLocalAtoms; ++i ) {
        if ( localPartition[i] == 0 || localPartition[i] == (g+1) ) {
          localResults[i] += gridResults[nga++] * scale;
        }
      }
    } else if ( selfOn ) {
      int nga = 0;
      for ( int i=0; i<numLocalAtoms; ++i ) {
        if ( localPartition[i] == 1 ) {
          pairForce += gridResults[nga];  // should add up to almost zero
          localResults[i] += gridResults[nga++];
        }
      }
    } else if ( pairOn ) {
      if ( g == 0 ) {
        int nga = 0;
        for ( int i=0; i<numLocalAtoms; ++i ) {
          if ( localPartition[i] == 1 ) {
            pairForce += gridResults[nga];
          }
          if ( localPartition[i] == 1 || localPartition[i] == 2 ) {
            localResults[i] += gridResults[nga++];
          }
        }
      } else if ( g == 1 ) {
        int nga = 0;
        for ( int i=0; i<numLocalAtoms; ++i ) {
          if ( localPartition[i] == g ) {
            pairForce -= gridResults[nga];  // should add up to almost zero
            localResults[i] -= gridResults[nga++];
          }
        }
      } else {
        int nga = 0;
        for ( int i=0; i<numLocalAtoms; ++i ) {
          if ( localPartition[i] == g ) {
            localResults[i] -= gridResults[nga++];
          }
        }
      }
    }
  }  // loop over grids
  Vector *results_ptr = localResults;

  // add in forces
  {
    Results *r = forceBox->open();
    Force *f = r->f[Results::slow];
    int numAtoms = patch->getNumAtoms();

    if ( ! myMgr->strayChargeErrors && ! simParams->commOnly ) {
      for ( int i=0; i<numAtoms; ++i ) {
        f[i].x += results_ptr->x;
        f[i].y += results_ptr->y;
        f[i].z += results_ptr->z;
        ++results_ptr;
      }
    }
    forceBox->close(&r);
  }
}
void ComputePmeMgr::submitReductions() {

  SimParameters *simParams = Node::Object()->simParameters;

  for ( int g=0; g<numGrids; ++g ) {

    BigReal scale = 1.;
    if ( alchOn ) {
      BigReal elecLambdaUp, elecLambdaDown;
      if ( alchLambda < 0 || alchLambda > 1 ) {
        NAMD_bug("ComputePmeMgr::submitReductions alchLambda out of range");
      }
      elecLambdaUp = simParams->getElecLambda(alchLambda);
      elecLambdaDown = simParams->getElecLambda(1-alchLambda);
      if ( g == 0 ) scale = elecLambdaUp;
      else if ( g == 1 ) scale = elecLambdaDown;
      else if ( g == 2 ) scale = (elecLambdaUp + elecLambdaDown - 1)*(-1);
      if ( alchDecouple ) {
        if ( g == 2 ) scale = 1-elecLambdaUp;
        else if ( g == 3 ) scale = 1-elecLambdaDown;
        else if ( g == 4 ) scale = (elecLambdaUp + elecLambdaDown - 1)*(-1);
      }
    } else if ( lesOn ) {
      scale = 1.0 / lesFactor;
    } else if ( pairOn ) {
      scale = ( g == 0 ? 1. : -1. );
    }

    reduction->item(REDUCTION_VIRIAL_SLOW_XX) += evir[g][1] * scale;
    reduction->item(REDUCTION_VIRIAL_SLOW_XY) += evir[g][2] * scale;
    reduction->item(REDUCTION_VIRIAL_SLOW_XZ) += evir[g][3] * scale;
    reduction->item(REDUCTION_VIRIAL_SLOW_YX) += evir[g][2] * scale;
    reduction->item(REDUCTION_VIRIAL_SLOW_YY) += evir[g][4] * scale;
    reduction->item(REDUCTION_VIRIAL_SLOW_YZ) += evir[g][5] * scale;
    reduction->item(REDUCTION_VIRIAL_SLOW_ZX) += evir[g][3] * scale;
    reduction->item(REDUCTION_VIRIAL_SLOW_ZY) += evir[g][5] * scale;
    reduction->item(REDUCTION_VIRIAL_SLOW_ZZ) += evir[g][6] * scale;
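    // evir packs one energy plus the six unique components of the symmetric
    // virial tensor: [E, vxx, vxy, vxz, vyy, vyz, vzz].  The mirrored
    // assignments above (XY and YX both from evir[g][2], XZ/ZX from [3],
    // YZ/ZY from [5]) expand it to the full 3x3 reduction entries.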
    if ( alchFepOn ) {
      BigReal elecLambda2Up=0.0, elecLambda2Down=0.0;
      elecLambda2Up = simParams->getElecLambda(alchLambda2);
      elecLambda2Down = simParams->getElecLambda(1.-alchLambda2);
      BigReal scale2 = 0.;
      if ( g == 0 ) scale2 = elecLambda2Up;
      else if ( g == 1 ) scale2 = elecLambda2Down;
      else if ( g == 2 ) scale2 = (elecLambda2Up + elecLambda2Down - 1)*(-1);
      if ( alchDecouple && g == 2 ) scale2 = 1 - elecLambda2Up;
      else if ( alchDecouple && g == 3 ) scale2 = 1 - elecLambda2Down;
      else if ( alchDecouple && g == 4 ) scale2 = (elecLambda2Up + elecLambda2Down - 1)*(-1);
    }
  }
  reduction->submit();

  for ( int i=0; i<heldComputes.size(); ++i ) {
    WorkDistrib::messageEnqueueWork(heldComputes[i]);
  }
  heldComputes.resize(0);
}
const static unsigned int NAMDPrimes[] = {
  // prime stride table (full initializer not recovered in this listing)
};
#define NPRIMES (sizeof(NAMDPrimes)/sizeof(unsigned int))

bool generateBGLORBPmePeList(int *pemap, int numPes,
                             int *block_pes, int nbpes) {

  // Torus-aware placement: mark pes that already have patches or blocked
  // pes, then stride through the torus with a prime to spread PME pes out.
  int *pmemap = new int [CkNumPes()];
  if ( ! pmemap ) return false;

  TopoManager tmgr;
  PatchMap *patchMap = PatchMap::Object();
  memset(pmemap, 0, sizeof(int) * CkNumPes());

  for ( int count = 0; count < CkNumPes(); count++ ) {
    if ( count < nbpes )
      pmemap[block_pes[count]] = 1;
    if ( patchMap->numPatchesOnNode(count) ) {
      pmemap[count] = 1;
      // also avoid the second logical processor on each node
      if ( tmgr.hasMultipleProcsPerNode() ) {
        pmemap[(count + CkNumPes()/2) % CkNumPes()] = 1;
      }
    }
  }

  CProxy_Node nd(CkpvAccess(BOCclass_group).node);
  Node *node = nd.ckLocalBranch();

  DimensionMap dm;
  dm.x = 0;  dm.y = 1;  dm.z = 2;
  int xsize = 0, ysize = 0, zsize = 0;
  xsize = tmgr.getDimNX();
  ysize = tmgr.getDimNY();
  zsize = tmgr.getDimNZ();
  int nx = xsize, ny = ysize, nz = zsize;

  findOptimalDimensions(xsize, ysize, zsize, nx, ny, nz, dm);

  int group_size = numPes/nx;
  if ( numPes % nx ) group_size++;

  int my_prime = NAMDPrimes[0];
  int density = (ny * nz)/group_size + 1;
  int count = 0;

  // choose a prime larger than the density
  for ( count = 0; count < NPRIMES; count++ ) {
    if ( density < NAMDPrimes[count] ) {
      my_prime = NAMDPrimes[count];
      break;
    }
  }
  if ( count == NPRIMES )
    my_prime = NAMDPrimes[NPRIMES-1];

  int npme_pes = 0;
  int gcount = 0;
  int coord[3];

  for ( int x = 0; x < nx; x++ ) {
    coord[0] = (x + nx/2) % nx;

    for ( count=0; count < group_size && npme_pes < numPes; count++ ) {
      int dest = (count + 1) * my_prime;
      dest = dest % (ny * nz);

      coord[2] = dest / ny;
      coord[1] = dest - coord[2] * ny;

      int destPe = coord[dm.x] + coord[dm.y] * xsize +
                   coord[dm.z] * xsize * ysize;

      if ( pmemap[destPe] == 0 ) {
        pemap[gcount++] = destPe;
        pmemap[destPe] = 1;
        if ( tmgr.hasMultipleProcsPerNode() )
          pmemap[(destPe + CkNumPes()/2) % CkNumPes()] = 1;
        npme_pes++;
      } else {
        // collision: probe nearby torus positions
        for ( int pos = 1; pos < ny * nz; pos++ ) {
          coord[2] += pos / ny;
          coord[1] += pos % ny;
          coord[2] = coord[2] % nz;
          coord[1] = coord[1] % ny;

          int newdest = coord[dm.x] + coord[dm.y] * xsize +
                        coord[dm.z] * xsize * ysize;

          if ( pmemap[newdest] == 0 ) {
            pemap[gcount++] = newdest;
            pmemap[newdest] = 1;
            if ( tmgr.hasMultipleProcsPerNode() )
              pmemap[(newdest + CkNumPes()/2) % CkNumPes()] = 1;
            npme_pes++;
            break;
          }
        }
      }

      if ( gcount == numPes )
        gcount = 0;
      if ( npme_pes >= numPes )
        break;
    }
    if ( npme_pes >= numPes )
      break;
  }

  delete [] pmemap;

  if ( npme_pes != numPes )
    return false;

  return true;
}
    trans_handle = untrans_handle = ungrid_handle = NULL;

    for ( int i=0; i<nBlocks; ++i ) send_order[i] = i;

#ifndef CmiMemoryAtomicType
#endif

    PersistentHandle *trans_handle;
    PersistentHandle *untrans_handle;
    PersistentHandle *ungrid_handle;
  PmeZPencil_SDAG_CODE

  ~PmeZPencil() {
#ifdef NAMD_FFTW
#ifdef NAMD_FFTW_3
    delete [] forward_plans;
    delete [] backward_plans;
#endif
#endif
  }

#ifdef NAMD_FFTW
#ifdef NAMD_FFTW_3
  fftwf_plan forward_plan, backward_plan;
  int numPlans;
  fftwf_plan *forward_plans, *backward_plans;
#else
  rfftwnd_plan forward_plan, backward_plan;
#endif
#endif
  void setup_persistent() {
    int hd = 1;  // ( hasData ? 1 : 0 );
    int zBlocks = initdata.zBlocks;
    int block3 = initdata.grid.block3;
    int dim3 = initdata.grid.dim3;
    CkArray *yPencil_local = initdata.yPencil.ckLocalBranch();
    CmiAssert(yPencil_local);
    trans_handle = (PersistentHandle*) malloc(sizeof(PersistentHandle) * zBlocks);
    for ( int isend=0; isend<zBlocks; ++isend ) {
      int kb = send_order[isend];
      int nz1 = block3;
      if ( (kb+1)*block3 > dim3/2 ) nz1 = dim3/2 - kb*block3;
      int peer = yPencil_local->homePe(CkArrayIndex3D(thisIndex.x, 0, kb));
      int size = sizeof(PmeTransMsg) + sizeof(float)*hd*nx*ny*nz1*2 + sizeof(envelope) + PRIORITY_SIZE/8 + 24;
      int compress_start = sizeof(PmeTransMsg) + sizeof(envelope);
      int compress_size = sizeof(float)*hd*nx*ny*nz1*2;
      trans_handle[isend] = CmiCreateCompressPersistentSize(peer, size, compress_start, compress_size, CMI_FLOATING);
    }
  }

  void setup_ungrid_persistent()
  {
    ungrid_handle = (PersistentHandle*) malloc(sizeof(PersistentHandle) * grid_msgs.size());
    int limsg;
    for ( limsg=0; limsg < grid_msgs.size(); ++limsg ) {
      int peer = grid_msgs[limsg]->sourceNode;
    }
  }
  PmeYPencil_SDAG_CODE

#ifdef NAMD_FFTW
#ifdef NAMD_FFTW_3
  fftwf_plan forward_plan, backward_plan;
#else
  fftw_plan forward_plan, backward_plan;
#endif
#endif
  void setup_persistent() {
    int hd = 1;  // ( hasData ? 1 : 0 );
    int yBlocks = initdata.yBlocks;
    int block2 = initdata.grid.block2;
    int K2 = initdata.grid.K2;
    CkArray *xPencil_local = initdata.xPencil.ckLocalBranch();
    CkArray *zPencil_local = initdata.zPencil.ckLocalBranch();

    trans_handle = (PersistentHandle*) malloc(sizeof(PersistentHandle) * yBlocks);
    for ( int isend=0; isend<yBlocks; ++isend ) {
      int jb = send_order[isend];
      int ny1 = block2;
      if ( (jb+1)*block2 > K2 ) ny1 = K2 - jb*block2;
      int peer = xPencil_local->homePe(CkArrayIndex3D(0, jb, thisIndex.z));
      int size = sizeof(PmeTransMsg) + sizeof(float)*hd*nx*ny1*nz*2 + sizeof(envelope) + PRIORITY_SIZE/8 + 24;
      int compress_start = sizeof(PmeTransMsg) + sizeof(envelope);
      int compress_size = sizeof(float)*hd*nx*ny1*nz*2;
      trans_handle[isend] = CmiCreateCompressPersistentSize(peer, size, compress_start, compress_size, CMI_FLOATING);
    }

    untrans_handle = (PersistentHandle*) malloc(sizeof(PersistentHandle) * yBlocks);
    for ( int isend=0; isend<yBlocks; ++isend ) {
      int jb = send_order[isend];
      int ny1 = block2;
      if ( (jb+1)*block2 > K2 ) ny1 = K2 - jb*block2;
      int peer = zPencil_local->homePe(CkArrayIndex3D(thisIndex.x, jb, 0));
      int size = sizeof(PmeUntransMsg) + sizeof(float)*nx*ny1*nz*2 + sizeof(envelope) + PRIORITY_SIZE/8 + 24;
      int compress_start = sizeof(PmeUntransMsg) + sizeof(envelope);
      int compress_size = sizeof(float)*nx*ny1*nz*2;
      untrans_handle[isend] = CmiCreateCompressPersistentSize(peer, size, compress_start, compress_size, CMI_FLOATING);
    }
  }
  PmeXPencil_SDAG_CODE

  ~PmeXPencil() {
#ifdef NAMD_FFTW
#ifdef NAMD_FFTW_3
    delete [] forward_plans;
    delete [] backward_plans;
#endif
#endif
  }

#ifdef NAMD_FFTW
#ifdef NAMD_FFTW_3
  fftwf_plan forward_plan, backward_plan;
  int numPlans;
  fftwf_plan *forward_plans, *backward_plans;
#else
  fftw_plan forward_plan, backward_plan;
#endif
#endif
  void setup_persistent() {
    int xBlocks = initdata.xBlocks;
    int block1 = initdata.grid.block1;
    int K1 = initdata.grid.K1;
    CkArray *yPencil_local = initdata.yPencil.ckLocalBranch();

    untrans_handle = (PersistentHandle*) malloc(sizeof(PersistentHandle) * xBlocks);
    for ( int isend=0; isend<xBlocks; ++isend ) {
      int ib = send_order[isend];
      int nx1 = block1;
      if ( (ib+1)*block1 > K1 ) nx1 = K1 - ib*block1;
      int peer = yPencil_local->procNum(CkArrayIndex3D(ib, 0, thisIndex.z));
      int size = sizeof(PmeUntransMsg) + sizeof(float)*nx1*ny*nz*2 + sizeof(envelope) + PRIORITY_SIZE/8 + 24;
      int compress_start = sizeof(PmeUntransMsg) + sizeof(envelope);
      int compress_size = sizeof(float)*nx1*ny*nz*2;
      untrans_handle[isend] = CmiCreateCompressPersistentSize(peer, size, compress_start, compress_size, CMI_FLOATING);
    }
  }
void PmeZPencil::fft_init() {
  CProxy_Node nd(CkpvAccess(BOCclass_group).node);
  Node *node = nd.ckLocalBranch();
  SimParameters *simParams = node->simParameters;

#if USE_NODE_PAR_RECEIVE
  ((NodePmeMgr *)CkLocalNodeBranch(initdata.pmeNodeProxy))->registerZPencil(thisIndex,this);
#endif

  int K1 = initdata.grid.K1;
  int K2 = initdata.grid.K2;
  int K3 = initdata.grid.K3;
  int dim3 = initdata.grid.dim3;
  int block1 = initdata.grid.block1;
  int block2 = initdata.grid.block2;

  nx = block1;
  if ( (thisIndex.x + 1) * block1 > K1 ) nx = K1 - thisIndex.x * block1;
  ny = block2;
  if ( (thisIndex.y + 1) * block2 > K2 ) ny = K2 - thisIndex.y * block2;

  data = new float[nx*ny*dim3];
  work = new float[dim3];

  order_init(initdata.zBlocks);

#ifdef NAMD_FFTW
  CmiLock(ComputePmeMgr::fftw_plan_lock);
#ifdef NAMD_FFTW_3
  int fftwFlags = simParams->FFTWPatient ? FFTW_PATIENT :
                  simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE;
  int sizeLines = nx*ny;
  int planLineSizes[1];
  planLineSizes[0] = K3;
  int ndim = dim3;          // storage length of each real line
  int ndimHalf = ndim/2;    // same line viewed as complex values
  forward_plan = fftwf_plan_many_dft_r2c(1, planLineSizes, sizeLines,
        (float *) data, NULL, 1, ndim,
        (fftwf_complex *) data, NULL, 1, ndimHalf,
        fftwFlags);
  backward_plan = fftwf_plan_many_dft_c2r(1, planLineSizes, sizeLines,
        (fftwf_complex *) data, NULL, 1, ndimHalf,
        (float *) data, NULL, 1, ndim,
        fftwFlags);
#if CMK_SMP && USE_CKLOOP
  if ( simParams->useCkLoop ) {
    // How many FFT plans to create?  The grain-size issue.
    numPlans = (nx<=ny?nx:ny);
    if ( numPlans < CkMyNodeSize() ) numPlans = (nx>=ny?nx:ny);
    if ( numPlans < CkMyNodeSize() ) numPlans = sizeLines;
    int howmany = sizeLines/numPlans;
    forward_plans = new fftwf_plan[numPlans];
    backward_plans = new fftwf_plan[numPlans];
    for ( int i=0; i<numPlans; i++ ) {
      int dimStride = i*ndim*howmany;
      int dimHalfStride = i*ndimHalf*howmany;
      forward_plans[i] = fftwf_plan_many_dft_r2c(1, planLineSizes, howmany,
            ((float *) data)+dimStride, NULL, 1, ndim,
            ((fftwf_complex *) data)+dimHalfStride, NULL, 1, ndimHalf,
            fftwFlags);
      backward_plans[i] = fftwf_plan_many_dft_c2r(1, planLineSizes, howmany,
            ((fftwf_complex *) data)+dimHalfStride, NULL, 1, ndimHalf,
            ((float *) data)+dimStride, NULL, 1, ndim,
            fftwFlags);
    }
  } else
#endif
  {
    forward_plans = NULL;
    backward_plans = NULL;
  }
#else
  forward_plan = rfftwnd_create_plan_specific(1, &K3, FFTW_REAL_TO_COMPLEX,
        ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
        | FFTW_IN_PLACE | FFTW_USE_WISDOM, data, 1, work, 1);
  backward_plan = rfftwnd_create_plan_specific(1, &K3, FFTW_COMPLEX_TO_REAL,
        ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
        | FFTW_IN_PLACE | FFTW_USE_WISDOM, data, 1, work, 1);
#endif
  CmiUnlock(ComputePmeMgr::fftw_plan_lock);
#else
  NAMD_die("Sorry, FFTW must be compiled in to use PME.");
#endif

#if USE_NODE_PAR_RECEIVE
  evir = 0.;
  memset(data, 0, sizeof(float) * nx*ny*dim3);
#endif
}
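// In-place r2c layout used above: each K3-point real line is padded to dim3
// floats with dim3 = 2*(K3/2+1), so the K3/2+1 complex outputs of the
// transform fit in the same storage.  Example: K3 = 108 gives dim3 = 110
// floats per line, reinterpreted as 55 complex values after the forward FFT;
// hence the ndim (real) vs ndimHalf (complex) distances in the plans.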
void PmeYPencil::fft_init() {
  CProxy_Node nd(CkpvAccess(BOCclass_group).node);
  Node *node = nd.ckLocalBranch();
  SimParameters *simParams = node->simParameters;

#if USE_NODE_PAR_RECEIVE
  ((NodePmeMgr *)CkLocalNodeBranch(initdata.pmeNodeProxy))->registerYPencil(thisIndex,this);
#endif

  int K1 = initdata.grid.K1;
  int K2 = initdata.grid.K2;
  int dim3 = initdata.grid.dim3;
  int block1 = initdata.grid.block1;
  int block3 = initdata.grid.block3;

  nx = block1;
  if ( (thisIndex.x + 1) * block1 > K1 ) nx = K1 - thisIndex.x * block1;
  nz = block3;
  if ( (thisIndex.z+1)*block3 > dim3/2 ) nz = dim3/2 - thisIndex.z*block3;

  data = new float[nx*K2*nz*2];
  work = new float[2*K2];

#ifdef NAMD_FFTW
  CmiLock(ComputePmeMgr::fftw_plan_lock);
#ifdef NAMD_FFTW_3
  int sizeLines = nx*nz;
  int planLineSizes[1];
  planLineSizes[0] = K2;
  int fftwFlags = simParams->FFTWPatient ? FFTW_PATIENT :
                  simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE;
  forward_plan = fftwf_plan_many_dft(1, planLineSizes, sizeLines,
        (fftwf_complex *) data, NULL, sizeLines, 1,
        (fftwf_complex *) data, NULL, sizeLines, 1,
        FFTW_FORWARD, fftwFlags);
  backward_plan = fftwf_plan_many_dft(1, planLineSizes, sizeLines,
        (fftwf_complex *) data, NULL, sizeLines, 1,
        (fftwf_complex *) data, NULL, sizeLines, 1,
        FFTW_BACKWARD, fftwFlags);
  CkAssert(forward_plan != NULL);
  CkAssert(backward_plan != NULL);
#else
  forward_plan = fftw_create_plan_specific(K2, FFTW_FORWARD,
        ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
        | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) data,
        nz, (fftw_complex *) work, 1);
  backward_plan = fftw_create_plan_specific(K2, FFTW_BACKWARD,
        ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
        | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) data,
        nz, (fftw_complex *) work, 1);
#endif
  CmiUnlock(ComputePmeMgr::fftw_plan_lock);
#else
  NAMD_die("Sorry, FFTW must be compiled in to use PME.");
#endif
#if USE_NODE_PAR_RECEIVE
  evir = 0.;
  CmiMemoryWriteFence();
#endif
}

void PmeYPencil::node_process_trans(PmeTransMsg *msg) {
  recv_trans(msg);
  int limsg;
  CmiMemoryAtomicFetchAndInc(imsg,limsg);
  if ( limsg+1 == initdata.yBlocks ) {
    if ( hasData ) {
      forward_fft();
      send_trans();
    }
    imsg = 0;
    CmiMemoryWriteFence();
  }
}

void PmeYPencil::node_process_untrans(PmeUntransMsg *msg) {
  if ( msg ) {
    if ( ! msg->hasData )
      NAMD_bug("PmeYPencil::node_process_untrans non-null msg but not hasData");
    recv_untrans(msg);
  } else if ( hasData )
    NAMD_bug("PmeYPencil::node_process_untrans hasData but null msg");
  int limsg;
  CmiMemoryAtomicFetchAndInc(imsgb,limsg);
  if ( limsg+1 == initdata.yBlocks ) {
    if ( hasData ) backward_fft();
    send_untrans();
    imsgb = 0;
    CmiMemoryWriteFence();
  }
}
#define DEBUG_NODE_PAR_RECV 0

void NodePmeMgr::recvXTrans(PmeTransMsg *msg) {
  PmeXPencil *target = xPencilObj.get(msg->destElem);
#if DEBUG_NODE_PAR_RECV
  if ( target == NULL ) CkAbort("xpencil in recvXTrans not found, debug registration");
#endif
  target->node_process_trans(msg);
}

void NodePmeMgr::recvYTrans(PmeTransMsg *msg) {
  PmeYPencil *target = yPencilObj.get(msg->destElem);
#if DEBUG_NODE_PAR_RECV
  if ( target == NULL ) CkAbort("ypencil in recvYTrans not found, debug registration");
#endif
  target->node_process_trans(msg);
}

void NodePmeMgr::recvYUntrans(PmeUntransMsg *msg) {
  PmeYPencil *target = yPencilObj.get(msg->destElem);
#if DEBUG_NODE_PAR_RECV
  if ( target == NULL ) CkAbort("ypencil in recvYUntrans not found, debug registration");
#endif
  target->node_process_untrans(msg);
}

void NodePmeMgr::recvZUntrans(PmeUntransMsg *msg) {
  PmeZPencil *target = zPencilObj.get(msg->destElem);
#if DEBUG_NODE_PAR_RECV
  if ( target == NULL ) CkAbort("zpencil in recvZUntrans not found, debug registration");
#endif
  target->node_process_untrans(msg);
}

void NodePmeMgr::recvZGrid(PmeGridMsg *msg) {
  PmeZPencil *target = zPencilObj.get(msg->destElem);
#if DEBUG_NODE_PAR_RECV
  if ( target == NULL ) CkAbort("zpencil in recvZGrid not found, debug registration");
#endif
  target->node_process_grid(msg);
}
void PmeXPencil::fft_init() {
  CProxy_Node nd(CkpvAccess(BOCclass_group).node);
  Node *node = nd.ckLocalBranch();
  SimParameters *simParams = node->simParameters;

#if USE_NODE_PAR_RECEIVE
  ((NodePmeMgr *)CkLocalNodeBranch(initdata.pmeNodeProxy))->registerXPencil(thisIndex,this);
#endif

  int K1 = initdata.grid.K1;
  int K2 = initdata.grid.K2;
  int dim3 = initdata.grid.dim3;
  int block2 = initdata.grid.block2;
  int block3 = initdata.grid.block3;

  ny = block2;
  if ( (thisIndex.y + 1) * block2 > K2 ) ny = K2 - thisIndex.y * block2;
  nz = block3;
  if ( (thisIndex.z+1)*block3 > dim3/2 ) nz = dim3/2 - thisIndex.z*block3;

  data = new float[K1*ny*nz*2];
  work = new float[2*K1];

#ifdef NAMD_FFTW
  CmiLock(ComputePmeMgr::fftw_plan_lock);
#ifdef NAMD_FFTW_3
  int fftwFlags = simParams->FFTWPatient ? FFTW_PATIENT :
                  simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE;
  int sizeLines = ny*nz;
  int planLineSizes[1];
  planLineSizes[0] = K1;
  forward_plan = fftwf_plan_many_dft(1, planLineSizes, sizeLines,
        (fftwf_complex *) data, NULL, sizeLines, 1,
        (fftwf_complex *) data, NULL, sizeLines, 1,
        FFTW_FORWARD, fftwFlags);
  backward_plan = fftwf_plan_many_dft(1, planLineSizes, sizeLines,
        (fftwf_complex *) data, NULL, sizeLines, 1,
        (fftwf_complex *) data, NULL, sizeLines, 1,
        FFTW_BACKWARD, fftwFlags);
#if CMK_SMP && USE_CKLOOP
  if ( simParams->useCkLoop ) {
    numPlans = CkMyNodeSize();  // default grain (assumed initialization)
    if ( sizeLines/numPlans < 4 ) numPlans = 1;
    int howmany = sizeLines/numPlans;
    forward_plans = new fftwf_plan[numPlans];
    backward_plans = new fftwf_plan[numPlans];
    for ( int i=0; i<numPlans; i++ ) {
      int curStride = i*howmany;
      forward_plans[i] = fftwf_plan_many_dft(1, planLineSizes, howmany,
            ((fftwf_complex *) data)+curStride, NULL, sizeLines, 1,
            ((fftwf_complex *) data)+curStride, NULL, sizeLines, 1,
            FFTW_FORWARD, fftwFlags);
      backward_plans[i] = fftwf_plan_many_dft(1, planLineSizes, howmany,
            ((fftwf_complex *) data)+curStride, NULL, sizeLines, 1,
            ((fftwf_complex *) data)+curStride, NULL, sizeLines, 1,
            FFTW_BACKWARD, fftwFlags);
    }
  } else
#endif
  {
    forward_plans = NULL;
    backward_plans = NULL;
  }
#else
  forward_plan = fftw_create_plan_specific(K1, FFTW_FORWARD,
        ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
        | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) data,
        ny*nz, (fftw_complex *) work, 1);
  backward_plan = fftw_create_plan_specific(K1, FFTW_BACKWARD,
        ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
        | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) data,
        ny*nz, (fftw_complex *) work, 1);
#endif
  CmiUnlock(ComputePmeMgr::fftw_plan_lock);
#else
  NAMD_die("Sorry, FFTW must be compiled in to use PME.");
#endif

  myKSpace = new PmeKSpace(initdata.grid,
        thisIndex.y*block2, thisIndex.y*block2 + ny,
        thisIndex.z*block3, thisIndex.z*block3 + nz);
}
void PmeZPencil::recv_grid(const PmeGridMsg *msg) {
  int dim3 = initdata.grid.dim3;
#if ! USE_NODE_PAR_RECEIVE
  if ( imsg == 0 ) {
    lattice = msg->lattice;
    sequence = msg->sequence;
    memset(data, 0, sizeof(float)*nx*ny*dim3);
  }
#endif
  if ( msg->hasData ) {
    const int zlistlen = msg->zlistlen;
#ifdef NAMD_KNL
    int * __restrict msg_zlist = msg->zlist;
    int * __restrict zlist = (int*)__builtin_assume_aligned(work_zlist.begin(), 64);
    for ( int k=0; k<zlistlen; ++k ) {
      zlist[k] = msg_zlist[k];
    }
#else
    int * __restrict zlist = msg->zlist;
#endif
    char * __restrict fmsg = msg->fgrid;
    float * __restrict qmsg = msg->qgrid;
    float * __restrict d = data;
    int numGrids = 1;  // pencil FFT doesn't support multiple grids
    for ( int g=0; g<numGrids; ++g ) {
      for ( int i=0; i<nx; ++i ) {
        for ( int j=0; j<ny; ++j, d += dim3 ) {
          if ( *(fmsg++) ) {
            for ( int k=0; k<zlistlen; ++k ) {
              d[zlist[k]] += *(qmsg++);
            }
          }
        }
      }
    }
  }
}
static inline void PmeXZPencilFFT(int first, int last, void *result,
                                  int paraNum, void *param) {
#ifdef NAMD_FFTW
#ifdef NAMD_FFTW_3
  fftwf_plan *plans = (fftwf_plan *)param;
  for ( int i=first; i<=last; i++ ) fftwf_execute(plans[i]);
#endif
#endif
}
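// One pre-built plan per CkLoop chunk (rather than a single plan covering all
// lines) lets each helper thread call plain fftwf_execute on its own slice.
// In FFTW3, executing distinct plans concurrently is thread-safe; only plan
// creation and destruction must be serialized, which is why the pencils guard
// those with ComputePmeMgr::fftw_plan_lock.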
void PmeZPencil::forward_fft() {
  evir = 0.;
#ifdef FFTCHECK
  int dim3 = initdata.grid.dim3;
  float std_base = 100. * (thisIndex.x+1.) + 10. * (thisIndex.y+1.);
  float *d = data;
  for ( int i=0; i<nx; ++i ) {
    for ( int j=0; j<ny; ++j, d += dim3 ) {
      for ( int k=0; k<dim3; ++k ) {
        d[k] = 10. * (10. * (10. * std_base + i) + j) + k;
      }
    }
  }
#endif
#ifdef MANUAL_DEBUG_FFTW3
  dumpMatrixFloat3("fw_z_b", data, nx, ny, initdata.grid.dim3, thisIndex.x, thisIndex.y, thisIndex.z);
#endif
#ifdef NAMD_FFTW
#ifdef NAMD_FFTW_3
#if CMK_SMP && USE_CKLOOP
  int useCkLoop = Node::Object()->simParameters->useCkLoop;
  if ( useCkLoop >= CKLOOP_CTRL_PME_FORWARDFFT
       && CkNumPes() >= 2 * initdata.xBlocks * initdata.yBlocks ) {
    CkLoop_Parallelize(PmeXZPencilFFT, 1, (void *)forward_plans, CkMyNodeSize(), 0, numPlans-1);
  } else
#endif
  fftwf_execute(forward_plan);
#else
  rfftwnd_real_to_complex(forward_plan, nx*ny,
        data, 1, initdata.grid.dim3, (fftw_real *) work, 1, 0);
#endif
#endif
#ifdef MANUAL_DEBUG_FFTW3
  dumpMatrixFloat3("fw_z_a", data, nx, ny, initdata.grid.dim3, thisIndex.x, thisIndex.y, thisIndex.z);
#endif
#ifdef ZEROCHECK
  int dim3z = initdata.grid.dim3;
  float *dz = data;
  for ( int i=0; i<nx; ++i ) {
    for ( int j=0; j<ny; ++j, dz += dim3z ) {
      for ( int k=0; k<dim3z; ++k ) {
        if ( dz[k] == 0. ) CkPrintf("0 in Z at %d %d %d %d %d %d %d %d\n",
              thisIndex.x, thisIndex.y, i, j, k, nx, ny, dim3z);
      }
    }
  }
#endif
}
void PmeZPencil::send_subset_trans(int fromIdx, int toIdx) {
  int block3 = initdata.grid.block3;
  int dim3 = initdata.grid.dim3;
  for ( int isend=fromIdx; isend<=toIdx; ++isend ) {
    int kb = send_order[isend];
    int nz = block3;
    if ( (kb+1)*block3 > dim3/2 ) nz = dim3/2 - kb*block3;
    int hd = ( hasData ? 1 : 0 );
    PmeTransMsg *msg = new (hd*nx*ny*nz*2,PRIORITY_SIZE) PmeTransMsg;
    msg->lattice = lattice;
    msg->sourceNode = thisIndex.y;
    msg->hasData = hasData;
    msg->nx = ny;
    if ( hasData ) {
      float *md = msg->qgrid;
      const float *d = data;
      for ( int i=0; i<nx; ++i ) {
        for ( int j=0; j<ny; ++j, d += dim3 ) {
          for ( int k=kb*block3; k<(kb*block3+nz); ++k ) {
            *(md++) = d[2*k];
            *(md++) = d[2*k+1];
          }
        }
      }
    }
    msg->sequence = sequence;
    SET_PRIORITY(msg,sequence,PME_TRANS_PRIORITY)
    CmiEnableUrgentSend(1);
#if USE_NODE_PAR_RECEIVE
    msg->destElem = CkArrayIndex3D(thisIndex.x,0,kb);
#if USE_PERSISTENT
    CmiUsePersistentHandle(&trans_handle[isend], 1);
#endif
    initdata.pmeNodeProxy[CmiNodeOf(initdata.ym.ckLocalBranch()->procNum(0,msg->destElem))].recvYTrans(msg);
#if USE_PERSISTENT
    CmiUsePersistentHandle(NULL, 0);
#endif
#else
#if USE_PERSISTENT
    CmiUsePersistentHandle(&trans_handle[isend], 1);
#endif
    initdata.yPencil(thisIndex.x,0,kb).recvTrans(msg);
#if USE_PERSISTENT
    CmiUsePersistentHandle(NULL, 0);
#endif
#endif
    CmiEnableUrgentSend(0);
  }
}
void PmeZPencil::send_trans() {
#if USE_PERSISTENT
  if ( trans_handle == NULL ) setup_persistent();
#endif
#if CMK_SMP && USE_CKLOOP
  int useCkLoop = Node::Object()->simParameters->useCkLoop;
  if ( useCkLoop >= CKLOOP_CTRL_PME_SENDTRANS
       && CkNumPes() >= 2 * initdata.xBlocks * initdata.yBlocks ) {
    CkLoop_Parallelize(PmeZPencilSendTrans, 1, (void *)this, CkMyNodeSize(), 0, initdata.zBlocks-1, 1);
    return;
  }
#endif
  int zBlocks = initdata.zBlocks;
  int block3 = initdata.grid.block3;
  int dim3 = initdata.grid.dim3;
  for ( int isend=0; isend<zBlocks; ++isend ) {
    int kb = send_order[isend];
    int nz = block3;
    if ( (kb+1)*block3 > dim3/2 ) nz = dim3/2 - kb*block3;
    int hd = ( hasData ? 1 : 0 );
    PmeTransMsg *msg = new (hd*nx*ny*nz*2,PRIORITY_SIZE) PmeTransMsg;
    msg->lattice = lattice;
    msg->sourceNode = thisIndex.y;
    msg->hasData = hasData;
    msg->nx = ny;
    if ( hasData ) {
      float *md = msg->qgrid;
      const float *d = data;
      for ( int i=0; i<nx; ++i ) {
        for ( int j=0; j<ny; ++j, d += dim3 ) {
          for ( int k=kb*block3; k<(kb*block3+nz); ++k ) {
            *(md++) = d[2*k];
            *(md++) = d[2*k+1];
          }
        }
      }
    }
    msg->sequence = sequence;
    SET_PRIORITY(msg,sequence,PME_TRANS_PRIORITY)
    CmiEnableUrgentSend(1);
#if USE_NODE_PAR_RECEIVE
    msg->destElem = CkArrayIndex3D(thisIndex.x,0,kb);
#if USE_PERSISTENT
    CmiUsePersistentHandle(&trans_handle[isend], 1);
#endif
    initdata.pmeNodeProxy[CmiNodeOf(initdata.ym.ckLocalBranch()->procNum(0,msg->destElem))].recvYTrans(msg);
#if USE_PERSISTENT
    CmiUsePersistentHandle(NULL, 0);
#endif
#else
#if USE_PERSISTENT
    CmiUsePersistentHandle(&trans_handle[isend], 1);
#endif
    initdata.yPencil(thisIndex.x,0,kb).recvTrans(msg);
#if USE_PERSISTENT
    CmiUsePersistentHandle(NULL, 0);
#endif
#endif
    CmiEnableUrgentSend(0);
  }
}
void PmeYPencil::recv_trans(const PmeTransMsg *msg) {
  if ( imsg == 0 ) {
    lattice = msg->lattice;
    sequence = msg->sequence;
  }
  int block2 = initdata.grid.block2;
  int K2 = initdata.grid.K2;
  int jb = msg->sourceNode;
  int ny = msg->nx;
  if ( msg->hasData ) {
    const float *md = msg->qgrid;
    float *d = data;
    for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
      for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
        for ( int k=0; k<nz; ++k ) {
#ifdef ZEROCHECK
          if ( (*md) == 0. ) CkPrintf("0 in ZY at %d %d %d %d %d %d %d %d %d\n",
                thisIndex.x, jb, thisIndex.z, i, j, k, nx, ny, nz);
#endif
          d[2*(j*nz+k)] = *(md++);
          d[2*(j*nz+k)+1] = *(md++);
        }
      }
    }
  } else {
    float *d = data;
    for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
      for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
        for ( int k=0; k<nz; ++k ) {
          d[2*(j*nz+k)] = 0;
          d[2*(j*nz+k)+1] = 0;
        }
      }
    }
  }
}
void PmeYPencil::forward_subset_fft(int fromIdx, int toIdx) {
#ifdef NAMD_FFTW
#ifdef NAMD_FFTW_3
  for ( int i=fromIdx; i<=toIdx; i++ ) {
    fftwf_execute_dft(forward_plan,
        ((fftwf_complex *) data) + i * nz * initdata.grid.K2,
        ((fftwf_complex *) data) + i * nz * initdata.grid.K2);
  }
#endif
#endif
}

void PmeYPencil::forward_fft() {
  evir = 0.;
#ifdef MANUAL_DEBUG_FFTW3
  dumpMatrixFloat3("fw_y_b", data, nx, initdata.grid.K2, nz, thisIndex.x, thisIndex.y, thisIndex.z);
#endif
#ifdef NAMD_FFTW
#ifdef NAMD_FFTW_3
#if CMK_SMP && USE_CKLOOP
  int useCkLoop = Node::Object()->simParameters->useCkLoop;
  if ( useCkLoop >= CKLOOP_CTRL_PME_FORWARDFFT
       && CkNumPes() >= 2 * initdata.xBlocks * initdata.zBlocks ) {
    CkLoop_Parallelize(PmeYPencilForwardFFT, 1, (void *)this, CkMyNodeSize(), 0, nx-1);
  } else
#endif
  for ( int i=0; i<nx; ++i ) {
    fftwf_execute_dft(forward_plan,
        ((fftwf_complex *) data) + i * nz * initdata.grid.K2,
        ((fftwf_complex *) data) + i * nz * initdata.grid.K2);
  }
#else
  for ( int i=0; i<nx; ++i ) {
    fftw(forward_plan, nz,
        ((fftw_complex *) data) + i * nz * initdata.grid.K2,
        nz, 1, (fftw_complex *) work, 1, 0);
  }
#endif
#endif
#ifdef MANUAL_DEBUG_FFTW3
  dumpMatrixFloat3("fw_y_a", data, nx, initdata.grid.dim2, nz, thisIndex.x, thisIndex.y, thisIndex.z);
#endif
}
void PmeYPencil::send_subset_trans(int fromIdx, int toIdx) {
  int block2 = initdata.grid.block2;
  int K2 = initdata.grid.K2;
  for ( int isend=fromIdx; isend<=toIdx; ++isend ) {
    int jb = send_order[isend];
    int ny = block2;
    if ( (jb+1)*block2 > K2 ) ny = K2 - jb*block2;
    int hd = ( hasData ? 1 : 0 );
    PmeTransMsg *msg = new (hd*nx*ny*nz*2,PRIORITY_SIZE) PmeTransMsg;
    msg->lattice = lattice;
    msg->sourceNode = thisIndex.x;
    msg->hasData = hasData;
    msg->nx = nx;
    if ( hasData ) {
      float *md = msg->qgrid;
      const float *d = data;
      for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
        for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
          for ( int k=0; k<nz; ++k ) {
            *(md++) = d[2*(j*nz+k)];
            *(md++) = d[2*(j*nz+k)+1];
#ifdef ZEROCHECK
            if ( *(md-2) == 0. ) CkPrintf("send 0 in YX at %d %d %d %d %d %d %d %d %d\n",
                  thisIndex.x, jb, thisIndex.z, i, j, k, nx, ny, nz);
#endif
          }
        }
      }
      if ( md != msg->qgrid + nx*ny*nz*2 ) CkPrintf("error in YX at %d %d %d\n",
            thisIndex.x, jb, thisIndex.z);
    }
    msg->sequence = sequence;
    SET_PRIORITY(msg,sequence,PME_TRANS2_PRIORITY)
    CmiEnableUrgentSend(1);
#if USE_NODE_PAR_RECEIVE
    msg->destElem = CkArrayIndex3D(0,jb,thisIndex.z);
#if USE_PERSISTENT
    CmiUsePersistentHandle(&trans_handle[isend], 1);
#endif
    initdata.pmeNodeProxy[CmiNodeOf(initdata.xm.ckLocalBranch()->procNum(0,msg->destElem))].recvXTrans(msg);
#if USE_PERSISTENT
    CmiUsePersistentHandle(NULL, 0);
#endif
#else
#if USE_PERSISTENT
    CmiUsePersistentHandle(&trans_handle[isend], 1);
#endif
    initdata.xPencil(0,jb,thisIndex.z).recvTrans(msg);
#if USE_PERSISTENT
    CmiUsePersistentHandle(NULL, 0);
#endif
#endif
    CmiEnableUrgentSend(0);
  }
}
void PmeYPencil::send_trans() {
#if USE_PERSISTENT
  if ( trans_handle == NULL ) setup_persistent();
#endif
#if CMK_SMP && USE_CKLOOP
  int useCkLoop = Node::Object()->simParameters->useCkLoop;
  if ( useCkLoop >= CKLOOP_CTRL_PME_SENDTRANS
       && CkNumPes() >= 2 * initdata.xBlocks * initdata.zBlocks ) {
    CkLoop_Parallelize(PmeYPencilSendTrans, 1, (void *)this, CkMyNodeSize(), 0, initdata.yBlocks-1, 1);
    return;
  }
#endif
  int yBlocks = initdata.yBlocks;
  int block2 = initdata.grid.block2;
  int K2 = initdata.grid.K2;
  for ( int isend=0; isend<yBlocks; ++isend ) {
    int jb = send_order[isend];
    int ny = block2;
    if ( (jb+1)*block2 > K2 ) ny = K2 - jb*block2;
    int hd = ( hasData ? 1 : 0 );
    PmeTransMsg *msg = new (hd*nx*ny*nz*2,PRIORITY_SIZE) PmeTransMsg;
    msg->lattice = lattice;
    msg->sourceNode = thisIndex.x;
    msg->hasData = hasData;
    msg->nx = nx;
    if ( hasData ) {
      float *md = msg->qgrid;
      const float *d = data;
      for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
        for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
          for ( int k=0; k<nz; ++k ) {
            *(md++) = d[2*(j*nz+k)];
            *(md++) = d[2*(j*nz+k)+1];
#ifdef ZEROCHECK
            if ( *(md-2) == 0. ) CkPrintf("send 0 in YX at %d %d %d %d %d %d %d %d %d\n",
                  thisIndex.x, jb, thisIndex.z, i, j, k, nx, ny, nz);
#endif
          }
        }
      }
      if ( md != msg->qgrid + nx*ny*nz*2 ) CkPrintf("error in YX at %d %d %d\n",
            thisIndex.x, jb, thisIndex.z);
    }
    msg->sequence = sequence;
    SET_PRIORITY(msg,sequence,PME_TRANS2_PRIORITY)
    CmiEnableUrgentSend(1);
#if USE_NODE_PAR_RECEIVE
    msg->destElem = CkArrayIndex3D(0,jb,thisIndex.z);
#if USE_PERSISTENT
    CmiUsePersistentHandle(&trans_handle[isend], 1);
#endif
    initdata.pmeNodeProxy[CmiNodeOf(initdata.xm.ckLocalBranch()->procNum(0,msg->destElem))].recvXTrans(msg);
#if USE_PERSISTENT
    CmiUsePersistentHandle(NULL, 0);
#endif
#else
#if USE_PERSISTENT
    CmiUsePersistentHandle(&trans_handle[isend], 1);
#endif
    initdata.xPencil(0,jb,thisIndex.z).recvTrans(msg);
#if USE_PERSISTENT
    CmiUsePersistentHandle(NULL, 0);
#endif
#endif
    CmiEnableUrgentSend(0);
  }
}
void PmeXPencil::node_process_trans(PmeTransMsg *msg) {
  recv_trans(msg);
  int limsg;
  CmiMemoryAtomicFetchAndInc(imsg,limsg);
  if ( limsg+1 == initdata.xBlocks ) {
    imsg = 0;
    CmiMemoryWriteFence();
  }
}
void PmeXPencil::recv_trans(const PmeTransMsg *msg) {
  if ( imsg == 0 ) {
    lattice = msg->lattice;
    sequence = msg->sequence;
  }
  int block1 = initdata.grid.block1;
  int ib = msg->sourceNode;
  int nx = msg->nx;
  if ( msg->hasData ) {
    const float *md = msg->qgrid;
    for ( int i=ib*block1; i<(ib*block1+nx); ++i ) {
      float *d = data + i*ny*nz*2;
      for ( int j=0; j<ny; ++j, d += nz*2 ) {
        for ( int k=0; k<nz; ++k ) {
#ifdef ZEROCHECK
          if ( (*md) == 0. ) CkPrintf("0 in YX at %d %d %d %d %d %d %d %d %d\n",
                ib, thisIndex.y, thisIndex.z, i, j, k, nx, ny, nz);
#endif
          d[2*k] = *(md++);
          d[2*k+1] = *(md++);
        }
      }
    }
  } else {
    for ( int i=ib*block1; i<(ib*block1+nx); ++i ) {
      float *d = data + i*ny*nz*2;
      for ( int j=0; j<ny; ++j, d += nz*2 ) {
        for ( int k=0; k<nz; ++k ) {
          d[2*k] = 0;
          d[2*k+1] = 0;
        }
      }
    }
  }
}
void PmeXPencil::forward_fft() {
#ifdef MANUAL_DEBUG_FFTW3
  dumpMatrixFloat3("fw_x_b", data, initdata.grid.K1, ny, nz, thisIndex.x, thisIndex.y, thisIndex.z);
#endif
#ifdef NAMD_FFTW
#ifdef NAMD_FFTW_3
#if CMK_SMP && USE_CKLOOP
  int useCkLoop = Node::Object()->simParameters->useCkLoop;
  if ( useCkLoop >= CKLOOP_CTRL_PME_FORWARDFFT
       && CkNumPes() >= 2 * initdata.yBlocks * initdata.zBlocks ) {
    CkLoop_Parallelize(PmeXZPencilFFT, 1, (void *)forward_plans, CkMyNodeSize(), 0, numPlans-1);
  } else
#endif
  fftwf_execute(forward_plan);
#else
  fftw(forward_plan, ny*nz,
       ((fftw_complex *) data),
       ny*nz, 1, (fftw_complex *) work, 1, 0);
#endif
#endif
}
void PmeXPencil::pme_kspace() {
#ifdef MANUAL_DEBUG_FFTW3
#endif
  evir = 0.;
#ifdef FFTCHECK
  return;
#endif
  BigReal ewaldcof = ComputeNonbondedUtil::ewaldcof;
  int useCkLoop = 0;
#if CMK_SMP && USE_CKLOOP
  if ( Node::Object()->simParameters->useCkLoop >= CKLOOP_CTRL_PME_KSPACE
       && CkNumPes() >= 2 * initdata.yBlocks * initdata.zBlocks ) {
    useCkLoop = 1;
  }
#endif
  int numGrids = 1;
  for ( int g=0; g<numGrids; ++g ) {
    evir[0] = myKSpace->compute_energy(data+0*g,
          lattice, ewaldcof, &(evir[1]), useCkLoop);
  }
#if USE_NODE_PAR_RECEIVE
  CmiMemoryWriteFence();
#endif
}
void PmeXPencil::backward_fft() {
#ifdef MANUAL_DEBUG_FFTW3
  dumpMatrixFloat3("bw_x_b", data, initdata.grid.K1, ny, nz, thisIndex.x, thisIndex.y, thisIndex.z);
#endif
#ifdef NAMD_FFTW
#ifdef NAMD_FFTW_3
#if CMK_SMP && USE_CKLOOP
  int useCkLoop = Node::Object()->simParameters->useCkLoop;
  if ( useCkLoop >= CKLOOP_CTRL_PME_BACKWARDFFT
       && CkNumPes() >= 2 * initdata.yBlocks * initdata.zBlocks ) {
    CkLoop_Parallelize(PmeXZPencilFFT, 1, (void *)backward_plans, CkMyNodeSize(), 0, numPlans-1);
  } else
#endif
  fftwf_execute(backward_plan);
#else
  fftw(backward_plan, ny*nz,
       ((fftw_complex *) data),
       ny*nz, 1, (fftw_complex *) work, 1, 0);
#endif
#endif
}
int isend=fromIdx; isend<=toIdx; isend++) {
5844 CmiEnableUrgentSend(1);
5846 #if USE_NODE_PAR_RECEIVE 5851 CmiEnableUrgentSend(0);
5855 if ( (ib+1)*block1 > K1 ) nx = K1 - ib*block1;
5859 float *md = msg->
qgrid;
5860 for (
int i=ib*block1; i<(ib*block1+nx); ++i ) {
5862 for (
int j=0; j<
ny; ++j, d +=
nz*2 ) {
5863 for (
int k=0; k<
nz; ++k ) {
5870 CmiEnableUrgentSend(1);
5871 #if USE_NODE_PAR_RECEIVE 5872 msg->
destElem=CkArrayIndex3D(ib,0, thisIndex.z);
5874 CmiUsePersistentHandle(&untrans_handle[isend], 1);
5878 CmiUsePersistentHandle(NULL, 0);
5889 CmiEnableUrgentSend(0);
void PmeXPencil::send_untrans() {
  { // send energy and virial
    int numGrids = 1;
    PmeEvirMsg *newmsg = new (numGrids, PRIORITY_SIZE) PmeEvirMsg;
    newmsg->evir[0] = evir;
    SET_PRIORITY(newmsg,sequence,PME_UNGRID_PRIORITY)
    CmiEnableUrgentSend(1);
    initdata.pmeProxy[recipEvirPe].recvRecipEvir(newmsg);
    CmiEnableUrgentSend(0);
  }

#if USE_PERSISTENT
  if ( untrans_handle == NULL ) setup_persistent();
#endif
#if CMK_SMP && USE_CKLOOP
  int useCkLoop = Node::Object()->simParameters->useCkLoop;
  if ( useCkLoop >= CKLOOP_CTRL_PME_SENDUNTRANS
       && CkNumPes() >= 2 * initdata.yBlocks * initdata.zBlocks ) {
#if USE_NODE_PAR_RECEIVE
    CkLoop_Parallelize(PmeXPencilSendUntrans, 1, (void *)this, CkMyNodeSize(), 0, initdata.xBlocks-1, 1);
#else
    CkLoop_Parallelize(PmeXPencilSendUntrans, 1, (void *)this, CkMyNodeSize(), 0, initdata.xBlocks-1);
#endif
    return;
  }
#endif
  int xBlocks = initdata.xBlocks;
  int block1 = initdata.grid.block1;
  int K1 = initdata.grid.K1;
  for ( int isend=0; isend<xBlocks; ++isend ) {
    int ib = send_order[isend];
    if ( ! needs_reply[ib] ) {
      PmeAckMsg *ackmsg = new (PRIORITY_SIZE) PmeAckMsg;
      SET_PRIORITY(ackmsg,sequence,PME_UNTRANS_PRIORITY)
      CmiEnableUrgentSend(1);
#if USE_NODE_PAR_RECEIVE
      initdata.yPencil(ib,0,thisIndex.z).recvNodeAck(ackmsg);
#else
      initdata.yPencil(ib,0,thisIndex.z).recvAck(ackmsg);
#endif
      CmiEnableUrgentSend(0);
      continue;
    }
    int nx = block1;
    if ( (ib+1)*block1 > K1 ) nx = K1 - ib*block1;
    PmeUntransMsg *msg = new (nx*ny*nz*2,PRIORITY_SIZE) PmeUntransMsg;
    msg->sourceNode = thisIndex.y;
    msg->ny = ny;
    float *md = msg->qgrid;
    for ( int i=ib*block1; i<(ib*block1+nx); ++i ) {
      float *d = data + i*ny*nz*2;
      for ( int j=0; j<ny; ++j, d += nz*2 ) {
        for ( int k=0; k<nz; ++k ) {
          *(md++) = d[2*k];
          *(md++) = d[2*k+1];
        }
      }
    }
    SET_PRIORITY(msg,sequence,PME_UNTRANS_PRIORITY)
    CmiEnableUrgentSend(1);
#if USE_NODE_PAR_RECEIVE
    msg->destElem = CkArrayIndex3D(ib,0, thisIndex.z);
#if USE_PERSISTENT
    CmiUsePersistentHandle(&untrans_handle[isend], 1);
#endif
    initdata.pmeNodeProxy[CmiNodeOf(initdata.ym.ckLocalBranch()->procNum(0,msg->destElem))].recvYUntrans(msg);
#if USE_PERSISTENT
    CmiUsePersistentHandle(NULL, 0);
#endif
#else
#if USE_PERSISTENT
    CmiUsePersistentHandle(&untrans_handle[isend], 1);
#endif
    initdata.yPencil(ib,0,thisIndex.z).recvUntrans(msg);
#if USE_PERSISTENT
    CmiUsePersistentHandle(NULL, 0);
#endif
#endif
    CmiEnableUrgentSend(0);
  }
}
void PmeYPencil::recv_untrans(const PmeUntransMsg *msg) {
  int block2 = initdata.grid.block2;
  int K2 = initdata.grid.K2;
  int jb = msg->sourceNode;
  int ny = msg->ny;
  const float *md = msg->qgrid;
  float *d = data;
  for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
#if CMK_BLUEGENEL
    CmiNetworkProgress();
#endif
    for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
      for ( int k=0; k<nz; ++k ) {
#ifdef ZEROCHECK
        if ( (*md) == 0. ) CkPrintf("0 in XY at %d %d %d %d %d %d %d %d %d\n",
              thisIndex.x, jb, thisIndex.z, i, j, k, nx, ny, nz);
#endif
        d[2*(j*nz+k)] = *(md++);
        d[2*(j*nz+k)+1] = *(md++);
      }
    }
  }
}
void PmeYPencil::backward_subset_fft(int fromIdx, int toIdx) {
#ifdef NAMD_FFTW
#ifdef NAMD_FFTW_3
  for ( int i=fromIdx; i<=toIdx; i++ ) {
    fftwf_execute_dft(backward_plan,
        ((fftwf_complex *) data) + i * nz * initdata.grid.K2,
        ((fftwf_complex *) data) + i * nz * initdata.grid.K2);
  }
#endif
#endif
}

void PmeYPencil::backward_fft() {
#ifdef MANUAL_DEBUG_FFTW3
  dumpMatrixFloat3("bw_y_b", data, nx, initdata.grid.K2, nz, thisIndex.x, thisIndex.y, thisIndex.z);
#endif
#ifdef NAMD_FFTW
#ifdef NAMD_FFTW_3
#if CMK_SMP && USE_CKLOOP
  int useCkLoop = Node::Object()->simParameters->useCkLoop;
  if ( useCkLoop >= CKLOOP_CTRL_PME_BACKWARDFFT
       && CkNumPes() >= 2 * initdata.xBlocks * initdata.zBlocks ) {
    CkLoop_Parallelize(PmeYPencilBackwardFFT, 1, (void *)this, CkMyNodeSize(), 0, nx-1);
  } else
#endif
  for ( int i=0; i<nx; ++i ) {
#if CMK_BLUEGENEL
    CmiNetworkProgress();
#endif
    fftwf_execute_dft(backward_plan,
        ((fftwf_complex *) data) + i * nz * initdata.grid.K2,
        ((fftwf_complex *) data) + i * nz * initdata.grid.K2);
  }
#else
  for ( int i=0; i<nx; ++i ) {
#if CMK_BLUEGENEL
    CmiNetworkProgress();
#endif
    fftw(backward_plan, nz,
        ((fftw_complex *) data) + i * nz * initdata.grid.K2,
        nz, 1, (fftw_complex *) work, 1, 0);
  }
#endif
#endif
#ifdef MANUAL_DEBUG_FFTW3
  dumpMatrixFloat3("bw_y_a", data, nx, initdata.grid.K2, nz, thisIndex.x, thisIndex.y, thisIndex.z);
#endif
}
void PmeYPencil::send_subset_untrans(int fromIdx, int toIdx) {
  int block2 = initdata.grid.block2;
  int K2 = initdata.grid.K2;
  for ( int isend=fromIdx; isend<=toIdx; isend++ ) {
    int jb = send_order[isend];
    if ( ! needs_reply[jb] ) {
      PmeAckMsg *ackmsg = new (PRIORITY_SIZE) PmeAckMsg;
      SET_PRIORITY(ackmsg,sequence,PME_UNTRANS2_PRIORITY)
      CmiEnableUrgentSend(1);
#if USE_NODE_PAR_RECEIVE
      initdata.zPencil(thisIndex.x,jb,0).recvNodeAck(ackmsg);
#else
      initdata.zPencil(thisIndex.x,jb,0).recvAck(ackmsg);
#endif
      CmiEnableUrgentSend(0);
      continue;
    }
    int ny = block2;
    if ( (jb+1)*block2 > K2 ) ny = K2 - jb*block2;
    PmeUntransMsg *msg = new (nx*ny*nz*2,PRIORITY_SIZE) PmeUntransMsg;
    msg->sourceNode = thisIndex.z;
    msg->ny = nz;
    float *md = msg->qgrid;
    const float *d = data;
    for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
      for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
        for ( int k=0; k<nz; ++k ) {
          *(md++) = d[2*(j*nz+k)];
          *(md++) = d[2*(j*nz+k)+1];
        }
      }
    }
    SET_PRIORITY(msg,sequence,PME_UNTRANS2_PRIORITY)
    CmiEnableUrgentSend(1);
#if USE_NODE_PAR_RECEIVE
    msg->destElem = CkArrayIndex3D( thisIndex.x, jb, 0);
#if USE_PERSISTENT
    CmiUsePersistentHandle(&untrans_handle[isend], 1);
#endif
    initdata.pmeNodeProxy[CmiNodeOf(initdata.zm.ckLocalBranch()->procNum(0,msg->destElem))].recvZUntrans(msg);
#if USE_PERSISTENT
    CmiUsePersistentHandle(NULL, 0);
#endif
#else
#if USE_PERSISTENT
    CmiUsePersistentHandle(&untrans_handle[isend], 1);
#endif
    initdata.zPencil(thisIndex.x,jb,0).recvUntrans(msg);
#if USE_PERSISTENT
    CmiUsePersistentHandle(NULL, 0);
#endif
#endif
    CmiEnableUrgentSend(0);
  }
}
void PmeYPencil::send_untrans() {
#if USE_PERSISTENT
  if ( untrans_handle == NULL ) setup_persistent();
#endif
#if CMK_SMP && USE_CKLOOP
  int useCkLoop = Node::Object()->simParameters->useCkLoop;
  if ( useCkLoop >= CKLOOP_CTRL_PME_SENDUNTRANS
       && CkNumPes() >= 2 * initdata.xBlocks * initdata.zBlocks ) {
#if USE_NODE_PAR_RECEIVE
    CkLoop_Parallelize(PmeYPencilSendUntrans, 1, (void *)this, CkMyNodeSize(), 0, initdata.yBlocks-1, 1);
#else
    CkLoop_Parallelize(PmeYPencilSendUntrans, 1, (void *)this, CkMyNodeSize(), 0, initdata.yBlocks-1);
#endif
    return;
  }
#endif
  int yBlocks = initdata.yBlocks;
  int block2 = initdata.grid.block2;
  int K2 = initdata.grid.K2;
  for ( int isend=0; isend<yBlocks; ++isend ) {
    int jb = send_order[isend];
    if ( ! needs_reply[jb] ) {
      PmeAckMsg *ackmsg = new (PRIORITY_SIZE) PmeAckMsg;
      SET_PRIORITY(ackmsg,sequence,PME_UNTRANS2_PRIORITY)
      CmiEnableUrgentSend(1);
#if USE_NODE_PAR_RECEIVE
      initdata.zPencil(thisIndex.x,jb,0).recvNodeAck(ackmsg);
#else
      initdata.zPencil(thisIndex.x,jb,0).recvAck(ackmsg);
#endif
      CmiEnableUrgentSend(0);
      continue;
    }
    int ny = block2;
    if ( (jb+1)*block2 > K2 ) ny = K2 - jb*block2;
    PmeUntransMsg *msg = new (nx*ny*nz*2,PRIORITY_SIZE) PmeUntransMsg;
    msg->sourceNode = thisIndex.z;
    msg->ny = nz;
    float *md = msg->qgrid;
    const float *d = data;
    for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
      for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
        for ( int k=0; k<nz; ++k ) {
          *(md++) = d[2*(j*nz+k)];
          *(md++) = d[2*(j*nz+k)+1];
        }
      }
    }
    SET_PRIORITY(msg,sequence,PME_UNTRANS2_PRIORITY)
    CmiEnableUrgentSend(1);
#if USE_NODE_PAR_RECEIVE
    msg->destElem = CkArrayIndex3D( thisIndex.x, jb, 0);
#if USE_PERSISTENT
    CmiUsePersistentHandle(&untrans_handle[isend], 1);
#endif
    initdata.pmeNodeProxy[CmiNodeOf(initdata.zm.ckLocalBranch()->procNum(0,msg->destElem))].recvZUntrans(msg);
#if USE_PERSISTENT
    CmiUsePersistentHandle(NULL, 0);
#endif
#else
#if USE_PERSISTENT
    CmiUsePersistentHandle(&untrans_handle[isend], 1);
#endif
    initdata.zPencil(thisIndex.x,jb,0).recvUntrans(msg);
#if USE_PERSISTENT
    CmiUsePersistentHandle(NULL, 0);
#endif
#endif
    CmiEnableUrgentSend(0);
  }
#if USE_NODE_PAR_RECEIVE
  CmiMemoryWriteFence();
#endif
}
void PmeZPencil::recv_untrans(const PmeUntransMsg *msg) {
#if ! USE_NODE_PAR_RECEIVE
  if ( imsg == 0 ) evir = 0.;
#endif
  int block3 = initdata.grid.block3;
  int dim3 = initdata.grid.dim3;
  int kb = msg->sourceNode;
  int nz = msg->ny;
  const float *md = msg->qgrid;
  float *d = data;
  for ( int i=0; i<nx; ++i ) {
#if CMK_BLUEGENEL
    CmiNetworkProgress();
#endif
    for ( int j=0; j<ny; ++j, d += dim3 ) {
      for ( int k=kb*block3; k<(kb*block3+nz); ++k ) {
#ifdef ZEROCHECK
        if ( (*md) == 0. ) CkPrintf("0 in YZ at %d %d %d %d %d %d %d %d %d\n",
              thisIndex.x, thisIndex.y, kb, i, j, k, nx, ny, nz);
#endif
        d[2*k] = *(md++);
        d[2*k+1] = *(md++);
      }
    }
  }
}

void PmeZPencil::backward_fft() {
#ifdef MANUAL_DEBUG_FFTW3
  dumpMatrixFloat3("bw_z_b", data, nx, ny, initdata.grid.dim3, thisIndex.x, thisIndex.y, thisIndex.z);
#endif
#ifdef NAMD_FFTW
#ifdef NAMD_FFTW_3
#if CMK_SMP && USE_CKLOOP
  int useCkLoop = Node::Object()->simParameters->useCkLoop;
  if ( useCkLoop >= CKLOOP_CTRL_PME_BACKWARDFFT
       && CkNumPes() >= 2 * initdata.xBlocks * initdata.yBlocks ) {
    CkLoop_Parallelize(PmeXZPencilFFT, 1, (void *)backward_plans, CkMyNodeSize(), 0, numPlans-1);
  } else
#endif
  fftwf_execute(backward_plan);
#else
  rfftwnd_complex_to_real(backward_plan, nx*ny,
        (fftw_complex *) data, 1, initdata.grid.dim3/2, (fftw_complex *) work, 1, 0);
#endif
#endif
#ifdef MANUAL_DEBUG_FFTW3
  dumpMatrixFloat3("bw_z_a", data, nx, ny, initdata.grid.dim3, thisIndex.x, thisIndex.y, thisIndex.z);
#endif
#if CMK_BLUEGENEL
  CmiNetworkProgress();
#endif

#ifdef FFTCHECK
  int dim3 = initdata.grid.dim3;
  int K1 = initdata.grid.K1;
  int K2 = initdata.grid.K2;
  int K3 = initdata.grid.K3;
  float scale = 1. / (1. * K1 * K2 * K3);
  float maxerr = 0.;
  float maxstd = 0.;
  int mi, mj, mk;  mi = mj = mk = -1;
  float std_base = 100. * (thisIndex.x+1.) + 10. * (thisIndex.y+1.);
  const float *d = data;
  for ( int i=0; i<nx; ++i ) {
    for ( int j=0; j<ny; ++j, d += dim3 ) {
      for ( int k=0; k<K3; ++k ) {
        float std = 10. * (10. * (10. * std_base + i) + j) + k;
        float err = scale * d[k] - std;
        if ( fabsf(err) > fabsf(maxerr) ) {
          maxerr = err;
          maxstd = std;
          mi = i;  mj = j;  mk = k;
        }
      }
    }
  }
  CkPrintf("pencil %d %d max error %f at %d %d %d (should be %f)\n",
           thisIndex.x, thisIndex.y, maxerr, mi, mj, mk, maxstd);
#endif
}
void PmeZPencil::send_all_ungrid() {
#if CMK_SMP && USE_CKLOOP
  int useCkLoop = Node::Object()->simParameters->useCkLoop;
  if ( useCkLoop >= CKLOOP_CTRL_PME_SENDUNTRANS
       && CkNumPes() >= 2 * initdata.xBlocks * initdata.yBlocks ) {
    CkLoop_Parallelize(PmeZPencilSendUngrid, 1, (void *)this, CkMyNodeSize(), 0, grid_msgs.size()-1, 1);
    return;
  }
#endif
  send_subset_ungrid(0, grid_msgs.size()-1);
}

void PmeZPencil::send_subset_ungrid(int fromIdx, int toIdx) {
  for ( int limsg=fromIdx; limsg <=toIdx; ++limsg ) {
    send_ungrid(grid_msgs[limsg]);
  }
}

void PmeZPencil::send_ungrid(PmeGridMsg *msg) {
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
  const int UNGRID_PRIORITY = ( offload ? PME_OFFLOAD_UNGRID_PRIORITY : PME_UNGRID_PRIORITY );
#else
  const int UNGRID_PRIORITY = PME_UNGRID_PRIORITY;
#endif
  int pe = msg->sourceNode;
  if ( ! msg->hasData ) {
    delete msg;
    PmeAckMsg *ackmsg = new (PRIORITY_SIZE) PmeAckMsg;
    SET_PRIORITY(ackmsg,sequence,UNGRID_PRIORITY)
    CmiEnableUrgentSend(1);
    initdata.pmeProxy[pe].recvAck(ackmsg);
    CmiEnableUrgentSend(0);
    return;
  }
  if ( ! hasData )
    NAMD_bug("PmeZPencil::send_ungrid msg->hasData but not pencil->hasData");
  msg->sourceNode = thisIndex.x * initdata.yBlocks + thisIndex.y;
  int dim3 = initdata.grid.dim3;
  int zlistlen = msg->zlistlen;
  int *zlist = msg->zlist;
  char *fmsg = msg->fgrid;
  float *qmsg = msg->qgrid;
  float *d = data;
  int numGrids = 1;  // pencil FFT doesn't support multiple grids
  for ( int g=0; g<numGrids; ++g ) {
#if CMK_BLUEGENEL
    CmiNetworkProgress();
#endif
    for ( int i=0; i<nx; ++i ) {
      for ( int j=0; j<ny; ++j, d += dim3 ) {
        if ( *(fmsg++) ) {
          for ( int k=0; k<zlistlen; ++k ) {
            *(qmsg++) = d[zlist[k]];
          }
        }
      }
    }
  }
  SET_PRIORITY(msg,sequence,UNGRID_PRIORITY)
  CmiEnableUrgentSend(1);
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
  if ( offload ) {
    initdata.pmeNodeProxy[CmiNodeOf(pe)].recvUngrid(msg);
  } else
#endif
  initdata.pmeProxy[pe].recvUngrid(msg);
  CmiEnableUrgentSend(0);
}
void PmeZPencil::node_process_grid(PmeGridMsg *msg) {
#if USE_NODE_PAR_RECEIVE
  CmiLock(ComputePmeMgr::fftw_plan_lock);
  CmiMemoryReadFence();
#endif
  if ( msg->hasData ) hasData = 1;
  recv_grid(msg);
  int limsg;
  CmiMemoryAtomicFetchAndInc(imsg,limsg);
  grid_msgs[limsg] = msg;
  if ( limsg+1 == grid_msgs.size() ) {
    if ( hasData ) {
      forward_fft();
      send_trans();
    }
    imsg = 0;
    CmiMemoryWriteFence();
  }
#if USE_NODE_PAR_RECEIVE
  CmiUnlock(ComputePmeMgr::fftw_plan_lock);
  CmiMemoryWriteFence();
#endif
}
void PmeZPencil::node_process_untrans(PmeUntransMsg *msg) {
  if ( msg ) {
    if ( ! hasData )
      NAMD_bug("PmeZPencil::node_process_untrans non-null msg but not hasData");
    recv_untrans(msg);
  } else if ( hasData )
    NAMD_bug("PmeZPencil::node_process_untrans hasData but null msg");
#if USE_NODE_PAR_RECEIVE
  CmiMemoryWriteFence();
#endif
  int limsg;
  CmiMemoryAtomicFetchAndInc(imsgb,limsg);
  if ( limsg+1 == initdata.zBlocks ) {
#if USE_NODE_PAR_RECEIVE
    CmiMemoryReadFence();
#endif
    if ( hasData ) backward_fft();
    send_all_ungrid();
    hasData = 0;
    imsgb = 0;
    CmiMemoryWriteFence();
  }
}

#if USE_NODE_PAR_RECEIVE
  if ( CkMyRank() ) return;
#endif

  } else if ( lesOn ) {

#include "ComputePmeMgr.def.h"