#define fftwf_malloc fftw_malloc
#define fftwf_free fftw_free
#ifdef NAMD_FFTW_NO_TYPE_PREFIX
#include "ComputePmeMgr.decl.h"
#include "ComputeMgr.decl.h"
#define MIN_DEBUG_LEVEL 3
#include "ckhashtable.h"
#include "ComputeMoaMgr.decl.h"
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
  if ((err = cudaGetLastError()) != cudaSuccess) {
    gethostname(host, 128);  host[127] = 0;
    char devstr[128] = "";
    if ( cudaGetDevice(&devnum) == cudaSuccess ) {
      sprintf(devstr, " device %d", devnum);
    cudaDeviceProp deviceProp;
    if ( cudaGetDeviceProperties(&deviceProp, devnum) == cudaSuccess ) {
      sprintf(devstr, " device %d pci %x:%x:%x", devnum,
        deviceProp.pciDomainID, deviceProp.pciBusID, deviceProp.pciDeviceID);
    sprintf(errmsg, "CUDA error %s on Pe %d (%s%s): %s", msg, CkMyPe(), host, devstr, cudaGetErrorString(err));
#include <cuda_runtime.h>
#include <hip/hip_runtime.h>
#define __thread __declspec(thread)
#define SQRT_PI 1.7724538509055160273
#if CMK_PERSISTENT_COMM
#define USE_PERSISTENT 1
#if (defined(NAMD_HIP) || defined(NAMD_CUDA)) && defined(MEM_OPT_VERSION)
#define USE_NODE_PAR_RECEIVE 1
    : ia(i_a), ib(i_b), nb(n_b),
      size(n), data(newcopyint(n,d)) {
  virtual int procNum(int, const CkArrayIndex &i) {
    return data[ i.data()[ia] * nb + i.data()[ib] ];
    for ( int i=0; i < size; ++i ) {
      if ( data[i] == mype ) {
        CkArrayIndex3D ai(0,0,0);
        ai.data()[ia] = i / nb;
        ai.data()[ib] = i % nb;
        if ( procNum(0,ai) != mype ) NAMD_bug("PmePencilMap is inconsistent");
        if ( ! msg ) NAMD_bug("PmePencilMap multiple pencils on a pe?");
        mgr->insertInitial(ai,msg);
    mgr->doneInserting();
    if ( msg ) CkFreeMsg(msg);
  const int ia, ib, nb, size;
  const int* const data;
  static int* newcopyint(int n, int *d) {
    int *newd = new int[n];
    memcpy(newd, d, n*sizeof(int));
  CProxy_PmePencilMap xm;
  CProxy_PmePencilMap ym;
  CProxy_PmePencilMap zm;
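// The PE-search loops below appear to implement findRecipEvirPe(): try ranks
// on this node first, then on this physical node, then all PEs, starting from
// this rank so choices spread across the machine; failure is a fatal bug.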
    int node = CmiMyNode();
    int firstpe = CmiNodeFirst(node);
    int nodeSize = CmiNodeSize(node);
    int myrank = CkMyRank();
    for ( int i=0; i<nodeSize; ++i ) {
      int pe = firstpe + (myrank+i)%nodeSize;
    CmiGetPesOnPhysicalNode(CmiPhysicalNodeID(CkMyPe()), &pelist, &nodeSize);
    for ( int i=0; i<nodeSize; ++i ) {
      if ( pelist[i] == CkMyPe() ) myrank = i;
    for ( int i=0; i<nodeSize; ++i ) {
      int pe = pelist[(myrank+i)%nodeSize];
    int npes = CkNumPes();
    for ( int i=0; i<npes; ++i ) {
      int pe = (mype+i)%npes;
  NAMD_bug("findRecipEvirPe() failed!");
  int ncpus = CkNumPes();
  for ( int i=0; i<numGridPes; ++i ) {
  std::sort(gridPeMap,gridPeMap+numGridPes);
  int firstTransPe = ncpus - numGridPes - numTransPes;
  if ( firstTransPe < 0 ) {
    if ( ncpus > numTransPes ) firstTransPe = 1;
  for ( int i=0; i<numTransPes; ++i ) {
  std::sort(transPeMap,transPeMap+numTransPes);
bool generateBGLORBPmePeList(int *pemap, int numPes, int *block_pes=0,
  if ( d ) while ( ! (d & c) ) {
  return (a & c) - (b & c);
  if ( d ) while ( ! (d & c) ) {
  inline bool operator() (int a, int b) const {
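// ComputePmeMgr entry methods, roughly in the order of one PME step:
// deposit charges (sendData/sendPencils), forward FFT plus transpose
// (gridCalc1, sendTrans), reciprocal-space work (gridCalc2*), inverse
// transpose and FFT (sendUntrans, gridCalc3), then force interpolation
// (sendUngrid, ungridCalc) and energy/virial reductions.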
  void initialize_pencils(CkQdMsg*);
  void activate_pencils(CkQdMsg*);
  void recvArrays(CProxy_PmeXPencil, CProxy_PmeYPencil, CProxy_PmeZPencil);
  void initialize_computes();
  void sendData(Lattice &, int sequence);
  void sendDataPart(int first, int last, Lattice &, int sequence, int sourcepe, int errors);
  void sendPencils(Lattice &, int sequence);
  void sendPencilsPart(int first, int last, Lattice &, int sequence, int sourcepe);
  void gridCalc1(void);
  void sendTransBarrier(void);
  void sendTransSubset(int first, int last);
  void sendTrans(void);
  void gridCalc2(void);
#ifdef OPENATOM_VERSION
  void gridCalc2Moa(void);
#endif // OPENATOM_VERSION
  void gridCalc2R(void);
  void sendUntrans(void);
  void sendUntransSubset(int first, int last);
  void gridCalc3(void);
  void sendUngrid(void);
  void sendUngridSubset(int first, int last);
  void ungridCalc(void);
  void addRecipEvirClient(void);
  void submitReductions();
#if 0 && USE_PERSISTENT
  void setup_recvgrid_persistent();
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
  void chargeGridSubmitted(Lattice &lattice, int sequence);
  void cuda_submit_charges(Lattice &lattice, int sequence);
  void sendChargeGridReady();
  void pollChargeGridReady();
  void pollForcesReady();
  void recvChargeGridReady();
  void chargeGridReady(Lattice &lattice, int sequence);
#if 0 && USE_PERSISTENT
  PersistentHandle *recvGrid_handle;
  CProxy_ComputePmeMgr pmeProxy;
  CProxy_ComputePmeMgr pmeProxyDir;
  CProxy_NodePmeMgr pmeNodeProxy;
    if ( ! pmeComputes.size() ) initialize_computes();
  fftwf_plan *forward_plan_x, *backward_plan_x;
  fftwf_plan *forward_plan_yz, *backward_plan_yz;
  fftw_plan forward_plan_x, backward_plan_x;
  rfftwnd_plan forward_plan_yz, backward_plan_yz;
  int qsize, fsize, bsize;
  int ungridForcesCount;
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
#define NUM_STREAMS 1
  int f_data_mgr_alloc;
  float *f_data_mgr_host;
  float *f_data_mgr_dev;
  float *bspline_coeffs_dev;
  float *bspline_dcoeffs_dev;
  int recipEvirClients;
  int myGridPe, myGridNode;
  int myTransPe, myTransNode;
  int compute_sequence;
  int sendTransBarrier_received;
  int xBlocks, yBlocks, zBlocks;
  CProxy_PmeXPencil xPencil;
  CProxy_PmeYPencil yPencil;
  CProxy_PmeZPencil zPencil;
  int numPencilsActive;
  int strayChargeErrors;
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
  void sendDataHelper(int);
  void sendPencilsHelper(int);
  void registerXPencil(CkArrayIndex3D, PmeXPencil *);
  void registerYPencil(CkArrayIndex3D, PmeYPencil *);
  void registerZPencil(CkArrayIndex3D, PmeZPencil *);
    xm=_xm; ym=_ym; zm=_zm;
  CProxy_PmePencilMap xm;
  CProxy_PmePencilMap ym;
  CProxy_PmePencilMap zm;
  CProxy_ComputePmeMgr mgrProxy;
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
  CProxy_PmeXPencil xPencil;
  CProxy_PmeYPencil yPencil;
  CProxy_PmeZPencil zPencil;
  CkHashtableT<CkArrayIndex3D,PmeXPencil*> xPencilObj;
  CkHashtableT<CkArrayIndex3D,PmeYPencil*> yPencilObj;
  CkHashtableT<CkArrayIndex3D,PmeZPencil*> zPencilObj;
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
  cudaEvent_t end_charge_memset;
  cudaEvent_t end_all_pme_kernels;
  cudaEvent_t end_potential_memcpy;
  delete [] mgrObjects;
  CProxy_ComputePmeMgr proxy = CkpvAccess(BOCclass_group).computePmeMgr;
  mgrObjects[CkMyRank()] = proxy.ckLocalBranch();
  if ( CkMyRank() == 0 ) {
    mgrObject = proxy.ckLocalBranch();
  mgrObject->fwdSharedTrans(msg);
  mgrObject->fwdSharedUntrans(msg);
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
  masterPmeMgr->recvUngrid(msg);
  NAMD_bug("NodePmeMgr::recvUngrid called in non-CUDA build.");
  xPencilObj.put(idx)=obj;
  yPencilObj.put(idx)=obj;
  zPencilObj.put(idx)=obj;
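// ComputePmeMgr constructor: registers this group with the BOC table, caches
// the node-level NodePmeMgr branch, and (for CUDA/HIP builds) creates the
// streams and timing-disabled events used to order PME kernels and copies.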
    pmeProxyDir(thisgroup) {
  CkpvAccess(BOCclass_group).computePmeMgr = thisgroup;
  pmeNodeProxy = CkpvAccess(BOCclass_group).nodePmeMgr;
  nodePmeMgr = pmeNodeProxy[CkMyNode()].ckLocalBranch();
  pmeNodeProxy.ckLocalBranch()->initialize();
  if ( CmiMyRank() == 0 ) {
  sendTransBarrier_received = 0;
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
  if ( CmiMyRank() == 0 ) {
#if CUDA_VERSION >= 5050 || defined(NAMD_HIP)
    int leastPriority, greatestPriority;
    cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority);
#define CUDA_STREAM_CREATE(X) cudaStreamCreateWithPriority(X,cudaStreamDefault,greatestPriority)
#define CUDA_STREAM_CREATE(X) cudaStreamCreate(X)
  cudaEventCreateWithFlags(&end_charges,cudaEventDisableTiming);
  f_data_mgr_alloc = 0;
#define CUDA_EVENT_ID_PME_CHARGES 80
#define CUDA_EVENT_ID_PME_FORCES 81
#define CUDA_EVENT_ID_PME_TICK 82
#define CUDA_EVENT_ID_PME_COPY 83
#define CUDA_EVENT_ID_PME_KERNEL 84
  if ( 0 == CkMyPe() ) {
  recipEvirClients = 0;
    CProxy_PmeXPencil x, CProxy_PmeYPencil y, CProxy_PmeZPencil z) {
  xPencil = x; yPencil = y; zPencil = z;
  pmeNodeProxy.ckLocalBranch()->xPencil=x;
  pmeNodeProxy.ckLocalBranch()->yPencil=y;
  pmeNodeProxy.ckLocalBranch()->zPencil=z;
  Coord(): x(0), y(0), z(0) {}
  Coord(int a, int b, int c): x(a), y(b), z(c) {}
extern void SFC_grid(int xdim, int ydim, int zdim, int xdim1, int ydim1, int zdim1, vector<Coord> &result);
  for ( int i=0; i<result.size(); i++) {
    Coord &c = result[i];
    for ( int j=0; j<procs.size(); j++) {
      tmgr.rankToCoordinates(pe, x, y, z, t);
      if (x==c.x && y==c.y && z==c.z)
        newprocs[num++] = pe;
  CmiAssert(newprocs.size() == procs.size());
int find_level_grid(int x)
CmiNodeLock tmgr_lock;
  tmgr_lock = CmiCreateLock();
  gridPeMap = new int[CkNumPes()];
  transPeMap = new int[CkNumPes()];
  recipPeDest = new int[CkNumPes()];
  gridPeOrder = new int[CkNumPes()];
  gridNodeOrder = new int[CkNumNodes()];
  transNodeOrder = new int[CkNumNodes()];
  if (CkMyRank() == 0) {
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
    NAMD_die("PME offload requires exactly one CUDA device per process. Use \"PMEOffload no\".");
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, dev);
    if ( deviceProp.major < 2 )
      NAMD_die("PME offload requires CUDA device of compute capability 2.0 or higher. Use \"PMEOffload no\".");
  else if ( simParams->PMEPencils > 0 ) usePencils = 1;
    if ( nrps <= 0 ) nrps = CkNumPes();
    if ( nrps > CkNumPes() ) nrps = CkNumPes();
    int maxslabs = 1 + (dimx - 1) / simParams->PMEMinSlices;
    if ( maxslabs > nrps ) maxslabs = nrps;
    if ( maxpencils > nrps ) maxpencils = nrps;
    if ( maxpencils > 3 * maxslabs ) usePencils = 1;
    if ( nrps <= 0 ) nrps = CkNumPes();
    if ( nrps > CkNumPes() ) nrps = CkNumPes();
    xBlocks = yBlocks = zBlocks = simParams->PMEPencils;
    if ( nb2 > nrps ) nb2 = nrps;
    if ( nb2 < 1 ) nb2 = 1;
    int nb = (int) sqrt((float)nb2);
    if ( nb < 1 ) nb = 1;
    xBlocks = zBlocks = nb;
    int bx = 1 + ( dimx - 1 ) / xBlocks;
    xBlocks = 1 + ( dimx - 1 ) / bx;
    int by = 1 + ( dimy - 1 ) / yBlocks;
    yBlocks = 1 + ( dimy - 1 ) / by;
    int dimz = simParams->PMEGridSizeZ / 2 + 1;
    int bz = 1 + ( dimz - 1 ) / zBlocks;
    zBlocks = 1 + ( dimz - 1 ) / bz;
    if ( xBlocks * yBlocks > CkNumPes() ) {
      NAMD_die("PME pencils xBlocks * yBlocks > numPes");
    if ( xBlocks * zBlocks > CkNumPes() ) {
      NAMD_die("PME pencils xBlocks * zBlocks > numPes");
    if ( yBlocks * zBlocks > CkNumPes() ) {
      NAMD_die("PME pencils yBlocks * zBlocks > numPes");
  iout << iINFO << "PME using " << xBlocks << " x " <<
    yBlocks << " x " << zBlocks <<
    " pencil grid for FFT and reciprocal sum.\n" << endi;
  int minslices = simParams->PMEMinSlices;
  int nrpx = ( dimx + minslices - 1 ) / minslices;
  int nrpy = ( dimy + minslices - 1 ) / minslices;
  int nrpp = CkNumPes();
  if ( nrpp < nrpx ) nrpx = nrpp;
  if ( nrpp < nrpy ) nrpy = nrpp;
  if ( nrps > CkNumPes() ) nrps = CkNumPes();
  if ( nrps > 0 ) nrpx = nrps;
  if ( nrps > 0 ) nrpy = nrps;
  int bx = ( dimx + nrpx - 1 ) / nrpx;
  nrpx = ( dimx + bx - 1 ) / bx;
  int by = ( dimy + nrpy - 1 ) / nrpy;
  nrpy = ( dimy + by - 1 ) / by;
  if ( bx != ( dimx + nrpx - 1 ) / nrpx )
    NAMD_bug("Error in selecting number of PME processors.");
  if ( by != ( dimy + nrpy - 1 ) / nrpy )
    NAMD_bug("Error in selecting number of PME processors.");
  iout << iINFO << "PME using " << numGridPes << " and " << numTransPes <<
    " processors for FFT and reciprocal sum.\n" << endi;
  int sum_npes = numTransPes + numGridPes;
  int max_npes = (numTransPes > numGridPes)?numTransPes:numGridPes;
#if 0 // USE_TOPOMAP
  if(tmgr.hasMultipleProcsPerNode())
  if(CkNumPes() > 2*sum_npes + patch_pes) {
    done = generateBGLORBPmePeList(transPeMap, numTransPes);
    done &= generateBGLORBPmePeList(gridPeMap, numGridPes, transPeMap, numTransPes);
  if(CkNumPes() > 2 *max_npes + patch_pes) {
    done = generateBGLORBPmePeList(transPeMap, max_npes);
    gridPeMap = transPeMap;
  for ( i=0; i<numGridPes && i<10; ++i ) {
    iout << " " << gridPeMap[i];
  if ( i < numGridPes ) iout << " ...";
  iout << iINFO << "PME TRANS LOCATIONS:";
  for ( i=0; i<numTransPes && i<10; ++i ) {
    iout << " " << transPeMap[i];
  if ( i < numTransPes ) iout << " ...";
  for ( i=0; i<numGridPes; ++i ) {
    if ( gridPeMap[i] == CkMyPe() ) myGridPe = i;
    int real_node_i = CkNodeOf(gridPeMap[i]);
    if ( real_node_i == real_node ) {
      gridNodeInfo[node].npe += 1;
      real_node = real_node_i;
      gridNodeInfo[node].real_node = real_node;
      gridNodeInfo[node].npe = 1;
    if ( CkMyNode() == real_node_i ) myGridNode = node;
  numGridNodes = node + 1;
  for ( i=0; i<numTransPes; ++i ) {
    if ( transPeMap[i] == CkMyPe() ) myTransPe = i;
    int real_node_i = CkNodeOf(transPeMap[i]);
    if ( real_node_i == real_node ) {
      transNodeInfo[node].npe += 1;
      real_node = real_node_i;
      transNodeInfo[node].real_node = real_node;
      transNodeInfo[node].npe = 1;
    if ( CkMyNode() == real_node_i ) myTransNode = node;
  numTransNodes = node + 1;
  iout << iINFO << "PME USING " << numGridNodes << " GRID NODES AND "
       << numTransNodes << " TRANS NODES\n" << endi;
  for ( i = 0; i < numGridPes; ++i ) {
  if ( myGridPe < 0 ) {
    rand.reorder(gridPeOrder,numGridPes);
    gridPeOrder[myGridPe] = numGridPes-1;
    gridPeOrder[numGridPes-1] = myGridPe;
    rand.reorder(gridPeOrder,numGridPes-1);
  for ( i = 0; i < numGridNodes; ++i ) {
    gridNodeOrder[i] = i;
  if ( myGridNode < 0 ) {
    rand.reorder(gridNodeOrder,numGridNodes);
    gridNodeOrder[myGridNode] = numGridNodes-1;
    gridNodeOrder[numGridNodes-1] = myGridNode;
    rand.reorder(gridNodeOrder,numGridNodes-1);
  for ( i = 0; i < numTransNodes; ++i ) {
    transNodeOrder[i] = i;
  if ( myTransNode < 0 ) {
    rand.reorder(transNodeOrder,numTransNodes);
    transNodeOrder[myTransNode] = numTransNodes-1;
    transNodeOrder[numTransNodes-1] = myTransNode;
    rand.reorder(transNodeOrder,numTransNodes-1);
  myGrid.dim3 = 2 * (myGrid.K3/2 + 1);
  if ( ! usePencils ) {
    myGrid.block1 = ( myGrid.K1 + numGridPes - 1 ) / numGridPes;
    myGrid.block2 = ( myGrid.K2 + numTransPes - 1 ) / numTransPes;
    myGrid.block1 = ( myGrid.K1 + xBlocks - 1 ) / xBlocks;
    myGrid.block2 = ( myGrid.K2 + yBlocks - 1 ) / yBlocks;
    myGrid.block3 = ( myGrid.K3/2 + 1 + zBlocks - 1 ) / zBlocks;
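// dim3 = 2*(K3/2+1) pads the z dimension so each real-space column can hold
// the K3/2+1 complex coefficients of an in-place real-to-complex FFT; the
// block1/2/3 values are again ceiling divisions of the grid over PE counts.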
  int ncpus = CkNumPes();
  for ( int icpu=0; icpu<ncpus; ++icpu ) {
    else nopatches.add(ri);
  int *tmp = new int[patches.size()];
  int nn = patches.size();
  for (i=0;i<nn;i++) tmp[i] = patches[i];
  for (i=0;i<nn;i++) patches.add(tmp[i]);
  tmp = new int[nopatches.size()];
  nn = nopatches.size();
  for (i=0;i<nn;i++) tmp[i] = nopatches[i];
  for (i=0;i<nn;i++) nopatches.add(tmp[i]);
  int npens = xBlocks*yBlocks;
  if ( npens % ncpus == 0 ) useZero = 1;
  if ( npens == nopatches.size() + 1 ) useZero = 1;
  npens += xBlocks*zBlocks;
  if ( npens % ncpus == 0 ) useZero = 1;
  if ( npens == nopatches.size() + 1 ) useZero = 1;
  npens += yBlocks*zBlocks;
  if ( npens % ncpus == 0 ) useZero = 1;
  if ( npens == nopatches.size() + 1 ) useZero = 1;
  for ( i=nopatches.size()-1; i>=0; --i ) pmeprocs.add(nopatches[i]);
  for ( i=patches.size()-1; i>=0; --i ) pmeprocs.add(patches[i]);
  int npes = pmeprocs.size();
  for ( i=0; i<xBlocks*yBlocks; ++i, ++pe ) zprocs[i] = pmeprocs[pe%npes];
  if ( i>1 && zprocs[0] == zprocs[i-1] ) zprocs[0] = 0;
#if !USE_RANDOM_TOPO
  for ( i=0; i<xBlocks*zBlocks; ++i, ++pe ) yprocs[i] = pmeprocs[pe%npes];
  if ( i>1 && yprocs[0] == yprocs[i-1] ) yprocs[0] = 0;
#if !USE_RANDOM_TOPO
  for ( i=0; i<yBlocks*zBlocks; ++i, ++pe ) xprocs[i] = pmeprocs[pe%npes];
  if ( i>1 && xprocs[0] == xprocs[i-1] ) xprocs[0] = 0;
#if !USE_RANDOM_TOPO
  int xdim = tmgr.getDimNX();
  int ydim = tmgr.getDimNY();
  int zdim = tmgr.getDimNZ();
  int xdim1 = find_level_grid(xdim);
  int ydim1 = find_level_grid(ydim);
  int zdim1 = find_level_grid(zdim);
    printf("xdim: %d %d %d, %d %d %d\n", xdim, ydim, zdim, xdim1, ydim1, zdim1);
  vector<Coord> result;
  SFC_grid(xdim, ydim, zdim, xdim1, ydim1, zdim1, result);
  sort_sfc(xprocs, tmgr, result);
  sort_sfc(yprocs, tmgr, result);
  sort_sfc(zprocs, tmgr, result);
  CmiUnlock(tmgr_lock);
  iout << iINFO << "PME Z PENCIL LOCATIONS:";
  for ( i=0; i<zprocs.size() && i<10; ++i ) {
      tmgr.rankToCoordinates(zprocs[i], x,y, z, t);
      iout << " " << zprocs[i] << "(" << x << " " << y << " " << z << ")";
      iout << " " << zprocs[i];
  if ( i < zprocs.size() ) iout << " ...";
  if (CkMyRank() == 0) {
    for (pe=0, x = 0; x < xBlocks; ++x)
      for (y = 0; y < yBlocks; ++y, ++pe ) {
  iout << iINFO << "PME Y PENCIL LOCATIONS:";
  for ( i=0; i<yprocs.size() && i<10; ++i ) {
      tmgr.rankToCoordinates(yprocs[i], x,y, z, t);
      iout << " " << yprocs[i] << "(" << x << " " << y << " " << z << ")";
      iout << " " << yprocs[i];
  if ( i < yprocs.size() ) iout << " ...";
  if (CkMyRank() == 0) {
    for (pe=0, z = 0; z < zBlocks; ++z )
      for (x = 0; x < xBlocks; ++x, ++pe ) {
  iout << iINFO << "PME X PENCIL LOCATIONS:";
  for ( i=0; i<xprocs.size() && i<10; ++i ) {
      tmgr.rankToCoordinates(xprocs[i], x,y, z, t);
      iout << " " << xprocs[i] << "(" << x << " " << y << " " << z << ")";
      iout << " " << xprocs[i];
  if ( i < xprocs.size() ) iout << " ...";
  if ( CkMyPe() == 0 ){
#if !USE_RANDOM_TOPO
    CProxy_PmePencilMap zm = CProxy_PmePencilMap::ckNew(0,1,yBlocks,xBlocks*yBlocks,zprocs.begin());
    CProxy_PmePencilMap ym;
      ym = CProxy_PmePencilMap::ckNew(0,2,zBlocks,zBlocks*xBlocks,yprocs.begin());
      ym = CProxy_PmePencilMap::ckNew(2,0,xBlocks,zBlocks*xBlocks,yprocs.begin());
    CProxy_PmePencilMap xm;
      xm = CProxy_PmePencilMap::ckNew(2,1,yBlocks,yBlocks*zBlocks,xprocs.begin());
      xm = CProxy_PmePencilMap::ckNew(1,2,zBlocks,yBlocks*zBlocks,xprocs.begin());
    pmeNodeProxy.recvPencilMapProxies(xm,ym,zm);
    CkArrayOptions zo(xBlocks,yBlocks,1);  zo.setMap(zm);
    CkArrayOptions yo(xBlocks,1,zBlocks);  yo.setMap(ym);
    CkArrayOptions xo(1,yBlocks,zBlocks);  xo.setMap(xm);
    zo.setAnytimeMigration(false);  zo.setStaticInsertion(true);
    yo.setAnytimeMigration(false);  yo.setStaticInsertion(true);
    xo.setAnytimeMigration(false);  xo.setStaticInsertion(true);
    zPencil = CProxy_PmeZPencil::ckNew(zo);
    yPencil = CProxy_PmeYPencil::ckNew(yo);
    xPencil = CProxy_PmeXPencil::ckNew(xo);
    zPencil = CProxy_PmeZPencil::ckNew();
    yPencil = CProxy_PmeYPencil::ckNew();
    xPencil = CProxy_PmeXPencil::ckNew();
    for (pe=0, x = 0; x < xBlocks; ++x)
      for (y = 0; y < yBlocks; ++y, ++pe ) {
        zPencil(x,y,0).insert(zprocs[pe]);
    zPencil.doneInserting();
    for (pe=0, x = 0; x < xBlocks; ++x)
      for (z = 0; z < zBlocks; ++z, ++pe ) {
        yPencil(x,0,z).insert(yprocs[pe]);
    yPencil.doneInserting();
    for (pe=0, y = 0; y < yBlocks; ++y )
      for (z = 0; z < zBlocks; ++z, ++pe ) {
        xPencil(0,y,z).insert(xprocs[pe]);
    xPencil.doneInserting();
  pmeProxy.recvArrays(xPencil,yPencil,zPencil);
  msgdata.grid = myGrid;
  for ( pe = 0; pe < numGridPes; ++pe ) {
    if ( nx > myGrid.K1 ) nx = myGrid.K1;
    localInfo[pe].nx = nx - localInfo[pe].x_start;
  for ( pe = 0; pe < numTransPes; ++pe ) {
    if ( ny > myGrid.K2 ) ny = myGrid.K2;
  int numNodes = CkNumPes();
  int *source_flags = new int[numNodes];
  for ( node=0; node<numNodes; ++node ) {
    source_flags[node] = 0;
    recipPeDest[node] = 0;
  for ( int pid=0; pid < numPatches; ++pid ) {
    int pnode = patchMap->node(pid);
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
    if ( offload ) pnode = CkNodeFirst(CkNodeOf(pnode));
    int shift1 = (myGrid.K1 + myGrid.order - 1)/2;
    BigReal margina = 0.5 * ( patchdim - cutoff ) / sysdima;
    int min1 = ((int) floor(myGrid.K1 * (minx - margina))) + shift1 - myGrid.order + 1;
    int max1 = ((int) floor(myGrid.K1 * (maxx + margina))) + shift1;
    for ( int i=min1; i<=max1; ++i ) {
      while ( ix >= myGrid.K1 ) ix -= myGrid.K1;
      while ( ix < 0 ) ix += myGrid.K1;
      if ( myGridPe >= 0 && ix >= localInfo[myGridPe].x_start &&
           ix < localInfo[myGridPe].x_start + localInfo[myGridPe].nx ) {
        source_flags[pnode] = 1;
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
      if ( pnode == CkNodeFirst(CkMyNode()) ) {
        recipPeDest[ix / myGrid.block1] = 1;
      if ( pnode == CkMyPe() ) {
        recipPeDest[ix / myGrid.block1] = 1;
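// source_flags[node] marks nodes whose patches overlap this PE's slab and
// will therefore send it charge-grid messages; recipPeDest marks which grid
// PEs this PE (or, with offload, this node) must send its own charges to.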
  int numSourcesSamePhysicalNode = 0;
  numDestRecipPes = 0;
  for ( node=0; node<numNodes; ++node ) {
    if ( source_flags[node] ) ++numSources;
    if ( recipPeDest[node] ) ++numDestRecipPes;
    if ( source_flags[node] && CmiPeOnSamePhysicalNode(node,CkMyPe()) ) ++numSourcesSamePhysicalNode;
    CkPrintf("pe %5d pme %5d of %5d on same physical node\n",
             CkMyPe(), numSourcesSamePhysicalNode, numSources);
    iout << iINFO << "PME " << CkMyPe() << " sources:";
    for ( node=0; node<numNodes; ++node ) {
      if ( source_flags[node] ) iout << " " << node;
  delete [] source_flags;
  ungrid_count = numDestRecipPes;
  sendTransBarrier_received = 0;
  if ( myGridPe < 0 && myTransPe < 0 ) return;
  if ( myTransPe >= 0 ) {
    pmeProxy[recipEvirPe].addRecipEvirClient();
  if ( myTransPe >= 0 ) {
#ifdef OPENATOM_VERSION
      CProxy_ComputeMoaMgr moaProxy(CkpvAccess(BOCclass_group).computeMoaMgr);
      myKSpace = new PmeKSpace(myGrid, k2_start, k2_end, 0, myGrid.dim3/2, moaProxy);
      myKSpace = new PmeKSpace(myGrid, k2_start, k2_end, 0, myGrid.dim3/2);
#else  // OPENATOM_VERSION
    myKSpace = new PmeKSpace(myGrid, k2_start, k2_end, 0, myGrid.dim3/2);
#endif // OPENATOM_VERSION
  int local_size = myGrid.block1 * myGrid.K2 * myGrid.dim3;
  int local_size_2 = myGrid.block2 * myGrid.K1 * myGrid.dim3;
  if ( local_size < local_size_2 ) local_size = local_size_2;
  qgrid = new float[local_size*numGrids];
  if ( numGridPes > 1 || numTransPes > 1 ) {
    kgrid = new float[local_size*numGrids];
  qgrid_size = local_size;
  if ( myGridPe >= 0 ) {
    qgrid_start = localInfo[myGridPe].x_start * myGrid.K2 * myGrid.dim3;
    qgrid_len = localInfo[myGridPe].nx * myGrid.K2 * myGrid.dim3;
    fgrid_start = localInfo[myGridPe].x_start * myGrid.K2;
    fgrid_len = localInfo[myGridPe].nx * myGrid.K2;
  int n[3]; n[0] = myGrid.K1; n[1] = myGrid.K2; n[2] = myGrid.K3;
  work = new fftwf_complex[n[0]];
  int fftwFlags = simParams->FFTWPatient ? FFTW_PATIENT : simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE ;
  if ( myGridPe >= 0 ) {
    forward_plan_yz=new fftwf_plan[numGrids];
    backward_plan_yz=new fftwf_plan[numGrids];
  if ( myTransPe >= 0 ) {
    forward_plan_x=new fftwf_plan[numGrids];
    backward_plan_x=new fftwf_plan[numGrids];
  if ( ! CkMyPe() ) iout << iINFO << "Optimizing 4 FFT steps. 1..." << endi;
  if ( myGridPe >= 0 ) {
      forward_plan_yz[g] = fftwf_plan_many_dft_r2c(2, n+1,
        localInfo[myGridPe].nx,
        qgrid + qgrid_size * g,
        (qgrid + qgrid_size * g),
  int zdim = myGrid.dim3;
  if ( ! CkMyPe() ) iout << " 2..." << endi;
  if ( myTransPe >= 0 ) {
      forward_plan_x[g] = fftwf_plan_many_dft(1, n, xStride,
        (kgrid+qgrid_size*g),
        (kgrid+qgrid_size*g),
        FFTW_FORWARD,fftwFlags);
  if ( ! CkMyPe() ) iout << " 3..." << endi;
  if ( myTransPe >= 0 ) {
      backward_plan_x[g] = fftwf_plan_many_dft(1, n, xStride,
        (kgrid+qgrid_size*g),
        (kgrid+qgrid_size*g),
        FFTW_BACKWARD, fftwFlags);
  if ( ! CkMyPe() ) iout << " 4..." << endi;
  if ( myGridPe >= 0 ) {
      backward_plan_yz[g] = fftwf_plan_many_dft_c2r(2, n+1,
        localInfo[myGridPe].nx,
        (qgrid + qgrid_size * g),
        qgrid + qgrid_size * g,
  if ( ! CkMyPe() ) iout << " Done.\n" << endi;
  work = new fftw_complex[n[0]];
  if ( ! CkMyPe() ) iout << iINFO << "Optimizing 4 FFT steps. 1..." << endi;
  if ( myGridPe >= 0 ) {
    forward_plan_yz = rfftwnd_create_plan_specific(2, n+1, FFTW_REAL_TO_COMPLEX,
      ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
      | FFTW_IN_PLACE | FFTW_USE_WISDOM, qgrid, 1, 0, 0);
  if ( ! CkMyPe() ) iout << " 2..." << endi;
  if ( myTransPe >= 0 ) {
    forward_plan_x = fftw_create_plan_specific(n[0], FFTW_FORWARD,
      ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
      | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) kgrid,
  if ( ! CkMyPe() ) iout << " 3..." << endi;
  if ( myTransPe >= 0 ) {
    backward_plan_x = fftw_create_plan_specific(n[0], FFTW_BACKWARD,
      ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
      | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) kgrid,
  if ( ! CkMyPe() ) iout << " 4..." << endi;
  if ( myGridPe >= 0 ) {
    backward_plan_yz = rfftwnd_create_plan_specific(2, n+1, FFTW_COMPLEX_TO_REAL,
      ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
      | FFTW_IN_PLACE | FFTW_USE_WISDOM, qgrid, 1, 0, 0);
  if ( ! CkMyPe() ) iout << " Done.\n" << endi;
  NAMD_die("Sorry, FFTW must be compiled in to use PME.");
  if ( myGridPe >= 0 && numSources == 0 )
    NAMD_bug("PME grid elements exist without sources.");
  grid_count = numSources;
  memset( (void*) qgrid, 0, qgrid_size * numGrids * sizeof(float) );
  trans_count = numGridPes;
  if ( ! usePencils ) return;
  pencilActive = new char[xBlocks*yBlocks];
  for ( int i=0; i<xBlocks; ++i ) {
    for ( int j=0; j<yBlocks; ++j ) {
      pencilActive[i*yBlocks+j] = 0;
  for ( int pid=0; pid < numPatches; ++pid ) {
    int pnode = patchMap->node(pid);
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
      if ( CkNodeOf(pnode) != CkMyNode() ) continue;
    if ( pnode != CkMyPe() ) continue;
    int shift1 = (myGrid.K1 + myGrid.order - 1)/2;
    int shift2 = (myGrid.K2 + myGrid.order - 1)/2;
    BigReal margina = 0.5 * ( patchdim - cutoff ) / sysdima;
    int min1 = ((int) floor(myGrid.K1 * (minx - margina))) + shift1 - myGrid.order + 1;
    int max1 = ((int) floor(myGrid.K1 * (maxx + margina))) + shift1;
    BigReal marginb = 0.5 * ( patchdim - cutoff ) / sysdimb;
    int min2 = ((int) floor(myGrid.K2 * (miny - marginb))) + shift2 - myGrid.order + 1;
    int max2 = ((int) floor(myGrid.K2 * (maxy + marginb))) + shift2;
    for ( int i=min1; i<=max1; ++i ) {
      while ( ix >= myGrid.K1 ) ix -= myGrid.K1;
      while ( ix < 0 ) ix += myGrid.K1;
      for ( int j=min2; j<=max2; ++j ) {
        while ( jy >= myGrid.K2 ) jy -= myGrid.K2;
        while ( jy < 0 ) jy += myGrid.K2;
        pencilActive[(ix / myGrid.block1)*yBlocks + (jy / myGrid.block2)] = 1;
  numPencilsActive = 0;
  for ( int i=0; i<xBlocks; ++i ) {
    for ( int j=0; j<yBlocks; ++j ) {
      if ( pencilActive[i*yBlocks+j] ) {
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
        zPencil(i,j,0).dummyRecvGrid(CkMyPe(),0);
  activePencils = new ijpair[numPencilsActive];
  numPencilsActive = 0;
  for ( int i=0; i<xBlocks; ++i ) {
    for ( int j=0; j<yBlocks; ++j ) {
      if ( pencilActive[i*yBlocks+j] ) {
        activePencils[numPencilsActive++] = ijpair(i,j);
  rand.reorder(activePencils,numPencilsActive);
  ungrid_count = numPencilsActive;
  if ( ! usePencils ) return;
  if ( CkMyPe() == 0 ) zPencil.dummyRecvGrid(CkMyPe(),1);
  if ( CmiMyRank() == 0 ) {
  delete [] localInfo;
  delete [] gridNodeInfo;
  delete [] transNodeInfo;
  delete [] gridPeMap;
  delete [] transPeMap;
  delete [] recipPeDest;
  delete [] gridPeOrder;
  delete [] gridNodeOrder;
  delete [] transNodeOrder;
  if ( kgrid != qgrid ) delete [] kgrid;
  delete [] gridmsg_reuse;
  for ( int i=0; i<q_count; ++i) {
    delete [] q_list[i];
  if ( grid_count == 0 ) {
    NAMD_bug("Message order failure in ComputePmeMgr::recvGrid\n");
  if ( grid_count == numSources ) {
  int zdim = myGrid.dim3;
  int *zlist = msg->zlist;
  float *qmsg = msg->qgrid;
    char *f = msg->fgrid + fgrid_len * g;
    float *q = qgrid + qgrid_size * g;
    for ( int i=0; i<fgrid_len; ++i ) {
        for ( int k=0; k<zlistlen; ++k ) {
          q[zlist[k]] += *(qmsg++);
  gridmsg_reuse[numSources-grid_count] = msg;
  if ( grid_count == 0 ) {
    pmeProxyDir[CkMyPe()].gridCalc1();
    if ( useBarrier ) pmeProxyDir[0].sendTransBarrier();
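// recvGrid accumulates incoming slabs: each message carries only the occupied
// z-columns, listed once in zlist, so q[zlist[k]] += *(qmsg++) expands the
// compressed stream back onto this PE's slab; messages are kept for reuse.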
#ifdef MANUAL_DEBUG_FFTW3
void dumpMatrixFloat(const char *infilename, float *matrix, int xdim, int ydim, int zdim, int pe)
  char filename[1000];
  strncpy(fmt,infilename,999);
  strncat(fmt,"_%d.out",999);
  sprintf(filename,fmt, pe);
  FILE *loutfile = fopen(filename, "w");
#ifdef PAIRCALC_TEST_DUMP
  fprintf(loutfile,"%d\n",ydim);
  fprintf(loutfile,"%d %d %d\n",xdim,ydim, zdim);
  for(int i=0;i<xdim;i++)
    for(int j=0;j<ydim;j++)
      for(int k=0;k<zdim;k++)
        fprintf(loutfile,"%d %d %d %.8f\n",i,j,k,matrix[i*zdim*ydim+j*zdim +k]);
void dumpMatrixFloat3(const char *infilename, float *matrix, int xdim, int ydim, int zdim, int x, int y, int z)
  char filename[1000];
  strncpy(fmt,infilename,999);
  strncat(fmt,"_%d_%d_%d.out",999);
  sprintf(filename,fmt, x,y,z);
  FILE *loutfile = fopen(filename, "w");
  CkAssert(loutfile!=NULL);
  CkPrintf("opened %s for dump\n",filename);
  fprintf(loutfile,"%d %d %d\n",xdim,ydim, zdim);
  for(int i=0;i<xdim;i++)
    for(int j=0;j<ydim;j++)
      for(int k=0;k<zdim;k++)
        fprintf(loutfile,"%d %d %d %.8f\n",i,j,k,matrix[i*zdim*ydim+j*zdim +k]);
    fftwf_execute(forward_plan_yz[g]);
    rfftwnd_real_to_complex(forward_plan_yz, localInfo[myGridPe].nx,
      qgrid + qgrid_size * g, 1, myGrid.dim2 * myGrid.dim3, 0, 0, 0);
  if ( ! useBarrier ) pmeProxyDir[CkMyPe()].sendTrans();
  sendTransBarrier_received += 1;
  if ( sendTransBarrier_received < numGridPes ) return;
  sendTransBarrier_received = 0;
  for ( int i=0; i<numGridPes; ++i ) {
    pmeProxyDir[gridPeMap[i]].sendTrans();
static inline void PmeSlabSendTrans(int first, int last, void *result, int paraNum, void *param) {
  untrans_count = numTransPes;
#if CMK_SMP && USE_CKLOOP
    CkLoop_Parallelize(PmeSlabSendTrans, 1, (void *)this, CkMyNodeSize(), 0, numTransNodes-1, 0);
  int zdim = myGrid.dim3;
  int nx = localInfo[myGridPe].nx;
  int x_start = localInfo[myGridPe].x_start;
  int slicelen = myGrid.K2 * zdim;
  ComputePmeMgr **mgrObjects = pmeNodeProxy.ckLocalBranch()->mgrObjects;
    CmiNetworkProgressAfter (0);
  for ( int j=first; j<=last; j++) {
    int node = transNodeOrder[j];
    int pe = transNodeInfo[node].pe_start;
    int npe = transNodeInfo[node].npe;
    if ( node != myTransNode ) for ( int i=0; i<npe; ++i, ++pe) {
      float *qmsg = newmsg->qgrid + nx * totlen * g;
      for ( int i=0; i<npe; ++i, ++pe) {
        if ( node == myTransNode ) {
          qmsg = m->kgrid + m->qgrid_size * g + x_start*cpylen;
          for ( int x = 0; x < nx; ++x ) {
            CmiMemcpy((void*)qmsg, (void*)q, cpylen*sizeof(float));
    if ( node == myTransNode ) newmsg->nx = 0;
      else pmeNodeProxy[transNodeInfo[node].real_node].recvTrans(newmsg);
    } else pmeProxy[transPeMap[transNodeInfo[node].pe_start]].recvTrans(newmsg);
  int pe = transNodeInfo[myTransNode].pe_start;
  int npe = transNodeInfo[myTransNode].npe;
  CmiNodeLock lock = CmiCreateLock();
  int *count = new int; *count = npe;
  for ( int i=0; i<npe; ++i, ++pe) {
    shmsg->count = count;
    pmeProxy[transPeMap[pe]].recvSharedTrans(shmsg);
  int count = --(*msg->count);
  CmiUnlock(msg->lock);
    CmiDestroyLock(msg->lock);
  if ( trans_count == numGridPes ) {
  int zdim = myGrid.dim3;
  NodePmeInfo &nodeInfo(transNodeInfo[myTransNode]);
  int last_pe = first_pe+nodeInfo.npe-1;
    CmiMemcpy((void*)(kgrid + qgrid_size * g + x_start*ny*zdim),
              (void*)(msg->qgrid + nx*(ny_msg*g+y_skip)*zdim),
              nx*ny*zdim*sizeof(float));
  if ( trans_count == 0 ) {
    pmeProxyDir[CkMyPe()].gridCalc2();
    CmiNetworkProgressAfter (0);
  int zdim = myGrid.dim3;
    fftwf_execute(forward_plan_x[g]);
    fftw(forward_plan_x, ny * zdim / 2, (fftw_complex *)(kgrid+qgrid_size*g),
         ny * zdim / 2, 1, work, 1, 0);
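// gridCalc2: with the data transposed so x is contiguous, run the 1D FFT
// along x (FFTW3 per-grid plans, or the strided legacy fftw call), completing
// the forward 3D transform before the reciprocal-space sum.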
#ifdef OPENATOM_VERSION
#endif // OPENATOM_VERSION
#ifdef OPENATOM_VERSION
#endif // OPENATOM_VERSION
#ifdef OPENATOM_VERSION
void ComputePmeMgr::gridCalc2Moa(void) {
  int zdim = myGrid.dim3;
  CProxy_ComputeMoaMgr moaProxy(CkpvAccess(BOCclass_group).computeMoaMgr);
#ifdef OPENATOM_VERSION_DEBUG
  CkPrintf("Sending recQ on processor %d \n", CkMyPe());
  for ( int i=0; i<=(ny * zdim / 2); ++i)
    CkPrintf("PE, g,fftw_q,k*q*g, kgrid, qgrid_size value %d pre-send = %d, %d, %f %f, %d, \n", i, CkMyPe(), g, (kgrid+qgrid_size*g)[i], kgrid[i], qgrid_size);
#endif // OPENATOM_VERSION_DEBUG
  CkCallback resumePme(CkIndex_ComputePmeMgr::gridCalc2R(), thishandle);
  moaProxy[CkMyPe()].recvQ(g,numGrids,(ny * zdim / 2),(kgrid+qgrid_size*g), resumePme);
#endif // OPENATOM_VERSION
#if CMK_SMP && USE_CKLOOP
      && CkNumPes() >= 2 * numTransPes ) {
  int zdim = myGrid.dim3;
      lattice, ewaldcof, &(recip_evir2[g][1]), useCkLoop);
    fftwf_execute(backward_plan_x[g]);
    fftw(backward_plan_x, ny * zdim / 2, (fftw_complex *)(kgrid+qgrid_size*g),
         ny * zdim / 2, 1, work, 1, 0);
  pmeProxyDir[CkMyPe()].sendUntrans();
  trans_count = numGridPes;
    newmsg->evir[g] = recip_evir2[g];
  CmiEnableUrgentSend(1);
  pmeProxy[recipEvirPe].recvRecipEvir(newmsg);
  CmiEnableUrgentSend(0);
#if CMK_SMP && USE_CKLOOP
    CkLoop_Parallelize(PmeSlabSendUntrans, 1, (void *)this, CkMyNodeSize(), 0, numGridNodes-1, 0);
  int zdim = myGrid.dim3;
  int slicelen = myGrid.K2 * zdim;
  ComputePmeMgr **mgrObjects = pmeNodeProxy.ckLocalBranch()->mgrObjects;
    CmiNetworkProgressAfter (0);
  for ( int j=first; j<=last; j++) {
    int node = gridNodeOrder[j];
    int pe = gridNodeInfo[node].pe_start;
    int npe = gridNodeInfo[node].npe;
    if ( node != myGridNode ) for ( int i=0; i<npe; ++i, ++pe) {
      int cpylen = li.nx * zdim;
      float *qmsg = newmsg->qgrid + ny * totlen * g;
      for ( int i=0; i<npe; ++i, ++pe) {
        if ( node == myGridNode ) {
          qmsg = m->qgrid + m->qgrid_size * g + y_start * zdim;
          float *q = kgrid + qgrid_size*g + li.x_start*ny*zdim;
          int cpylen = ny * zdim;
          for ( int x = 0; x < li.nx; ++x ) {
            CmiMemcpy((void*)qmsg, (void*)q, cpylen*sizeof(float));
          CmiMemcpy((void*)qmsg,
                    (void*)(kgrid + qgrid_size*g + li.x_start*ny*zdim),
                    li.nx*ny*zdim*sizeof(float));
          qmsg += li.nx*ny*zdim;
    if ( node == myGridNode ) newmsg->ny = 0;
      else pmeNodeProxy[gridNodeInfo[node].real_node].recvUntrans(newmsg);
    } else pmeProxy[gridPeMap[gridNodeInfo[node].pe_start]].recvUntrans(newmsg);
  int pe = gridNodeInfo[myGridNode].pe_start;
  int npe = gridNodeInfo[myGridNode].npe;
  CmiNodeLock lock = CmiCreateLock();
  int *count = new int; *count = npe;
  for ( int i=0; i<npe; ++i, ++pe) {
    shmsg->count = count;
    pmeProxy[gridPeMap[pe]].recvSharedUntrans(shmsg);
  int count = --(*msg->count);
  CmiUnlock(msg->lock);
    CmiDestroyLock(msg->lock);
    CmiNetworkProgressAfter (0);
  int zdim = myGrid.dim3;
  int last_pe = first_pe+nodeInfo.npe-1;
  int x_skip = localInfo[myGridPe].x_start
             - localInfo[first_pe].x_start;
  int nx_msg = localInfo[last_pe].x_start
             + localInfo[last_pe].nx
             - localInfo[first_pe].x_start;
  int nx = localInfo[myGridPe].nx;
  int slicelen = myGrid.K2 * zdim;
  int cpylen = ny * zdim;
    float *q = qgrid + qgrid_size * g + y_start * zdim;
    float *qmsg = msg->qgrid + (nx_msg*g+x_skip) * cpylen;
    for ( int x = 0; x < nx; ++x ) {
      CmiMemcpy((void*)q, (void*)qmsg, cpylen*sizeof(float));
  if ( untrans_count == 0 ) {
    pmeProxyDir[CkMyPe()].gridCalc3();
    fftwf_execute(backward_plan_yz[g]);
    rfftwnd_complex_to_real(backward_plan_yz, localInfo[myGridPe].nx,
      (fftw_complex *) (qgrid + qgrid_size * g),
      1, myGrid.dim2 * myGrid.dim3 / 2, 0, 0, 0);
  pmeProxyDir[CkMyPe()].sendUngrid();
#if CMK_SMP && USE_CKLOOP
    CkLoop_Parallelize(PmeSlabSendUngrid, 1, (void *)this, CkMyNodeSize(), 0, numSources-1, 1);
  grid_count = numSources;
  memset( (void*) qgrid, 0, qgrid_size * numGrids * sizeof(float) );
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
  for ( int j=first; j<=last; ++j ) {
    int zdim = myGrid.dim3;
    int flen = newmsg->len;
    int fstart = newmsg->start;
    int *zlist = newmsg->zlist;
    float *qmsg = newmsg->qgrid;
      char *f = newmsg->fgrid + fgrid_len * g;
      float *q = qgrid + qgrid_size * g + (fstart-fgrid_start) * zdim;
      for ( int i=0; i<flen; ++i ) {
          for ( int k=0; k<zlistlen; ++k ) {
            *(qmsg++) = q[zlist[k]];
    CmiEnableUrgentSend(1);
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
      pmeNodeProxy[CkNodeOf(pe)].recvUngrid(newmsg);
    pmeProxyDir[pe].recvUngrid(newmsg);
    CmiEnableUrgentSend(0);
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
  if ( ungrid_count == 0 ) {
    NAMD_bug("Message order failure in ComputePmeMgr::recvUngrid\n");
  if ( msg ) delete msg;
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
  if ( ungrid_count == 0 ) {
    NAMD_bug("Message order failure in ComputePmeMgr::recvUngrid\n");
  int uc = --ungrid_count;
  if ( ungrid_count == 0 ) {
    pmeProxyDir[CkMyPe()].ungridCalc();
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
#define count_limit 1000000
#define CUDA_POLL(FN,ARG) CcdCallFnAfter(FN,ARG,0.1)
#define EVENT_STRIDE 10
  if ( err == cudaSuccess ) {
  } else if ( err != cudaErrorNotReady ) {
    sprintf(errmsg,"in cuda_check_pme_forces for event %d after polling %d times over %f s on seq %d",
    sprintf(errmsg,"cuda_check_pme_forces for event %d polled %d times over %f s on seq %d",
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
  if ( this == masterPmeMgr ) {
    double before = CmiWallTimer();
    cudaMemcpyAsync(v_data_dev, q_data_host, q_data_size, cudaMemcpyHostToDevice, 0 );
    cudaEventRecord(nodePmeMgr->end_potential_memcpy, 0 );
    cudaEventSynchronize(nodePmeMgr->end_potential_memcpy);
    cuda_errcheck("in ComputePmeMgr::ungridCalc after potential memcpy");
  const int myrank = CkMyRank();
  for ( int i=0; i<CkMyNodeSize(); ++i ) {
    for ( int i=0; i<n; ++i ) {
      cudaEventCreateWithFlags(&end_forces[i],cudaEventDisableTiming);
    cudaMallocHost((void**) &afn_host, 3*pcsz*sizeof(float*));
    cudaMalloc((void**) &afn_dev, 3*pcsz*sizeof(float*));
  for ( int i=0; i<pcsz; ++i ) {
  if ( totn > f_data_mgr_alloc ) {
    if ( f_data_mgr_alloc ) {
      CkPrintf("Expanding CUDA forces allocation because %d > %d\n", totn, f_data_mgr_alloc);
      cudaFree(f_data_mgr_dev);
      cudaFreeHost(f_data_mgr_host);
    f_data_mgr_alloc = 1.2 * (totn + 100);
    cudaMalloc((void**) &f_data_mgr_dev, 3*f_data_mgr_alloc*sizeof(float));
    cudaMallocHost((void**) &f_data_mgr_host, 3*f_data_mgr_alloc*sizeof(float));
  float *f_dev = f_data_mgr_dev;
  float *f_host = f_data_mgr_host;
  for ( int i=0; i<pcsz; ++i ) {
    afn_host[3*i+1] = f_dev;
    afn_host[3*i+2] = f_dev + n;
  double before = CmiWallTimer();
  cudaMemcpyAsync(afn_dev, afn_host, 3*pcsz*sizeof(float*), cudaMemcpyHostToDevice, streams[stream]);
  cuda_errcheck("in ComputePmeMgr::ungridCalc after force pointer memcpy");
  cudaStreamWaitEvent(streams[stream], nodePmeMgr->end_potential_memcpy, 0);
  cuda_errcheck("in ComputePmeMgr::ungridCalc after wait for potential memcpy");
  for ( int i=0; i<pcsz; ++i ) {
    int dimy = pcsz - i;
    for ( int j=0; j<dimy; ++j ) {
      if ( n > maxn ) maxn = n;
    before = CmiWallTimer();
        v_arr_dev, afn_dev+3*i, dimy, maxn,
    cuda_errcheck("in ComputePmeMgr::ungridCalc after force kernel submit");
    before = CmiWallTimer();
        cudaMemcpyDeviceToHost, streams[stream]);
    cudaDeviceSynchronize();
    fprintf(stderr, "i = %d\n", i);
    for( int k=0; k < subtotn*3; k++)
      fprintf(stderr, "f_data_host[%d][%d] = %f\n", i, k,
    cuda_errcheck("in ComputePmeMgr::ungridCalc after force memcpy submit");
  cuda_errcheck("in ComputePmeMgr::ungridCalc after end_forces event");
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
  pmeProxy[this_pe].pollForcesReady();
  ungrid_count = (usePencils ? numPencilsActive : numDestRecipPes );
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
  NAMD_bug("ComputePmeMgr::pollForcesReady() called in non-CUDA build.");
  DebugM(4,"ComputePme created.\n");
  CProxy_ComputePmeMgr::ckLocalBranch(
      CkpvAccess(BOCclass_group).computePmeMgr)->addCompute(this);
  myGrid.dim3 = 2 * (myGrid.K3/2 + 1);
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
  cuda_atoms_offset = 0;
  NAMD_bug("ComputePme used with unknown patch.");
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
  ungridForcesCount = 0;
  strayChargeErrors = 0;
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
    int pe = master_pe = CkNodeFirst(CkMyNode());
    for ( int i=0; i<CkMyNodeSize(); ++i, ++pe ) {
      if ( master_pe < 1 && pe != deviceCUDA->getMasterPe() ) master_pe = pe;
    NAMD_bug("ComputePmeMgr::initialize_computes() master_pe has no patches.");
    masterPmeMgr = nodePmeMgr->mgrObjects[master_pe - CkNodeFirst(CkMyNode())];
  nodePmeMgr->masterPmeMgr = masterPmeMgr;
  qsize = myGrid.K1 * myGrid.dim2 * myGrid.dim3;
  fsize = myGrid.K1 * myGrid.dim2;
  if ( myGrid.K2 != myGrid.dim2 ) NAMD_bug("PME myGrid.K2 != myGrid.dim2");
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
    q_arr = new float*[fsize*numGrids];
    memset( (void*) q_arr, 0, fsize*numGrids * sizeof(float*) );
    q_list = new float*[fsize*numGrids];
    memset( (void*) q_list, 0, fsize*numGrids * sizeof(float*) );
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
  if ( cudaFirst || ! offload ) {
    for ( int n=fsize*numGrids, i=0; i<n; ++i ) f_arr[i] = 2;
      char *f = f_arr + g*fsize;
      int block1 = ( K1 + xBlocks - 1 ) / xBlocks;
      int block2 = ( K2 + yBlocks - 1 ) / yBlocks;
      int dim2 = myGrid.dim2;
      for ( int ap=0; ap<numPencilsActive; ++ap) {
        int ib = activePencils[ap].i;
        int jb = activePencils[ap].j;
        int ibegin = ib*block1;
        int iend = ibegin + block1;  if ( iend > K1 ) iend = K1;
        int jbegin = jb*block2;
        int jend = jbegin + block2;  if ( jend > K2 ) jend = K2;
        int flen = numGrids * (iend - ibegin) * (jend - jbegin);
        for ( int i=ibegin; i<iend; ++i ) {
          for ( int j=jbegin; j<jend; ++j ) {
      int block1 = ( myGrid.K1 + numGridPes - 1 ) / numGridPes;
      bsize = block1 * myGrid.dim2 * myGrid.dim3;
      for ( int pe=0; pe<numGridPes; pe++) {
        if ( ! recipPeDest[pe] ) continue;
        int start = pe * bsize;
        if ( start >= qsize ) { start = 0; len = 0; }
        if ( start + len > qsize ) { len = qsize - start; }
        int zdim = myGrid.dim3;
        int fstart = start / zdim;
        int flen = len / zdim;
        memset(f + fstart, 0, flen*sizeof(char));
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
    int f_alloc_count = 0;
    for ( int n=fsize, i=0; i<n; ++i ) {
      if ( f_arr[i] == 0 ) {
    q_arr = new float*[fsize*numGrids];
    memset( (void*) q_arr, 0, fsize*numGrids * sizeof(float*) );
    float **q_arr_dev_host = new float*[fsize];
    cudaMalloc((void**) &q_arr_dev, fsize * sizeof(float*));
    float **v_arr_dev_host = new float*[fsize];
    cudaMalloc((void**) &v_arr_dev, fsize * sizeof(float*));
    int q_stride = myGrid.K3+myGrid.order-1;
    q_data_size = f_alloc_count * q_stride * sizeof(float);
    ffz_size = (fsize + q_stride) * sizeof(int);
    cudaMallocHost((void**) &q_data_host, q_data_size+ffz_size);
    ffz_host = (int*)(((char*)q_data_host) + q_data_size);
    cudaMalloc((void**) &q_data_dev, q_data_size+ffz_size);
    ffz_dev = (int*)(((char*)q_data_dev) + q_data_size);
    cudaMalloc((void**) &v_data_dev, q_data_size);
    cudaMemset(q_data_dev, 0, q_data_size + ffz_size);
    cudaEventCreateWithFlags(&(nodePmeMgr->end_charge_memset),cudaEventDisableTiming);
    cudaEventRecord(nodePmeMgr->end_charge_memset, 0);
    cudaEventCreateWithFlags(&(nodePmeMgr->end_all_pme_kernels),cudaEventDisableTiming);
    cudaEventCreateWithFlags(&(nodePmeMgr->end_potential_memcpy),cudaEventDisableTiming);
    for ( int n=fsize, i=0; i<n; ++i ) {
      if ( f_arr[i] == 0 ) {
        q_arr[i] = q_data_host + f_alloc_count * q_stride;
        q_arr_dev_host[i] = q_data_dev + f_alloc_count * q_stride;
        v_arr_dev_host[i] = v_data_dev + f_alloc_count * q_stride;
        q_arr_dev_host[i] = 0;
        v_arr_dev_host[i] = 0;
    cudaMemcpy(q_arr_dev, q_arr_dev_host, fsize * sizeof(float*), cudaMemcpyHostToDevice);
    cudaMemcpy(v_arr_dev, v_arr_dev_host, fsize * sizeof(float*), cudaMemcpyHostToDevice);
    delete [] q_arr_dev_host;
    delete [] v_arr_dev_host;
    f_arr = new char[fsize + q_stride];
    fz_arr = f_arr + fsize;
    memset(f_arr, 0, fsize + q_stride);
    memset(ffz_host, 0, (fsize + q_stride)*sizeof(int));
#define XCOPY(X) masterPmeMgr->X = X;
    XCOPY(bspline_coeffs_dev)
    XCOPY(bspline_dcoeffs_dev)
#define XCOPY(X) X = masterPmeMgr->X;
    XCOPY(bspline_coeffs_dev)
    XCOPY(bspline_dcoeffs_dev)
  fz_arr = new char[myGrid.K3+myGrid.order-1];
#if 0 && USE_PERSISTENT
  recvGrid_handle = NULL;
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
  for ( int g=0; g<numGridsMax; ++g ) delete myRealSpace[g];
#if 0 && USE_PERSISTENT
void ComputePmeMgr::setup_recvgrid_persistent()
  int dim2 = myGrid.dim2;
  int dim3 = myGrid.dim3;
  int block1 = myGrid.block1;
  int block2 = myGrid.block2;
  CkArray *zPencil_local = zPencil.ckLocalBranch();
  recvGrid_handle = (PersistentHandle*) malloc( sizeof(PersistentHandle) * numPencilsActive);
  for ( int ap=0; ap<numPencilsActive; ++ap) {
    int ib = activePencils[ap].i;
    int jb = activePencils[ap].j;
    int ibegin = ib*block1;
    int iend = ibegin + block1;  if ( iend > K1 ) iend = K1;
    int jbegin = jb*block2;
    int jend = jbegin + block2;  if ( jend > K2 ) jend = K2;
    int flen = numGrids * (iend - ibegin) * (jend - jbegin);
      char *f = f_arr + g*fsize;
      for ( int i=ibegin; i<iend; ++i ) {
        for ( int j=jbegin; j<jend; ++j ) {
          fcount += f[i*dim2+j];
    for ( int i=0; i<myGrid.K3; ++i ) {
      if ( fz_arr[i] ) ++zlistlen;
    int hd = ( fcount? 1 : 0 );
    int peer = zPencil_local->homePe(CkArrayIndex3D(ib, jb, 0));
    int compress_size = sizeof(float)*hd*fcount*zlistlen;
    int size = compress_start + compress_size + PRIORITY_SIZE/8+6;
    recvGrid_handle[ap] = CmiCreateCompressPersistentSize(peer, size, compress_start, compress_size, CMI_FLOATING);
  if ( ! myMgr->ungridForcesCount && ! myMgr->recipEvirCount ) return 0;
  myMgr->heldComputes.add(this);
    positionBox->skip();
    myMgr->noWorkCount = 0;
    myMgr->reduction->submit();
    evir[g] += msg->evir[g];
  numLocalQMAtoms = 0;
  for ( int paIter=0; paIter<patch->getNumAtoms(); paIter++) {
    if ( qmAtomGroup[xExt[paIter].id] != 0 ) {
  if (qmLoclIndx != 0)
    delete [] qmLoclIndx;
  if (qmLocalCharges != 0)
    delete [] qmLocalCharges;
  qmLoclIndx = new int[numLocalQMAtoms] ;
  qmLocalCharges = new Real[numLocalQMAtoms] ;
  for ( int paIter=0; paIter<patch->getNumAtoms(); paIter++) {
    for ( int i=0; i<numQMAtms; i++) {
      if (qmAtmIndx[i] == xExt[paIter].id) {
        qmLoclIndx[procAtms] = paIter ;
        qmLocalCharges[procAtms] = qmAtmChrg[i];
    if (procAtms == numLocalQMAtoms)
  DebugM(4,"Entering ComputePme::doWork().\n");
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
    if ( ! --(myMgr->ungridForcesCount) && ! myMgr->recipEvirCount ) myMgr->submitReductions();
#ifdef TRACE_COMPUTE_OBJECTS
  double traceObjStartTime = CmiWallTimer();
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
  localData = localData_alloc.begin();
  localPartition_alloc.resize(numLocalAtoms);
  localPartition = localPartition_alloc.begin();
    localGridData[g] = localData + numLocalAtoms*(g+1);
  unsigned char * part_ptr = localPartition;
    positionBox->close(&x);
    x = avgPositionBox->open();
  for ( int i=0; i<numAtoms; ++i)
    data_ptr->cg = coulomb_sqrt * x[i].charge;
  for ( int i=0; i<numLocalQMAtoms; ++i)
    localData[qmLoclIndx[i]].cg = coulomb_sqrt * qmLocalCharges[i];
  else { positionBox->close(&x); }
    for ( int i=0; i<numLocalAtoms; ++i) {
      if ( localPartition[i] == 0 || localPartition[i] == (g+1) || localPartition[i] == (g+3)) {
        lgd[nga++] = localData[i];
    numGridAtoms[g] = nga;
    for ( int i=0; i<numLocalAtoms; ++i) {
      if ( localPartition[i] == 0 ) {
        lgd[nga++] = localData[i];
    numGridAtoms[g] = nga;
    for ( g=0; g<2; ++g ) {
      for ( int i=0; i<numLocalAtoms; ++i) {
        if ( localPartition[i] == 0 || localPartition[i] == (g+1) ) {
          lgd[nga++] = localData[i];
      numGridAtoms[g] = nga;
    for (g=2 ; g<4 ; ++g ) {
      for ( int i=0; i<numLocalAtoms; ++i) {
        if ( localPartition[i] == (g-1) ) {
          lgd[nga++] = localData[i];
      numGridAtoms[g] = nga;
    for ( int i=0; i<numLocalAtoms; ++i) {
      if ( localPartition[i] == 0 ) {
        lgd[nga++] = localData[i];
    numGridAtoms[g] = nga;
    for ( int i=0; i<numLocalAtoms; ++i) {
      if ( localPartition[i] == 1 ) {
        lgd[nga++] = localData[i];
    numGridAtoms[g] = nga;
    for ( int i=0; i<numLocalAtoms; ++i) {
      if ( localPartition[i] == 1 || localPartition[i] == 2 ) {
        lgd[nga++] = localData[i];
    numGridAtoms[g] = nga;
    for ( g=1; g<3; ++g ) {
      for ( int i=0; i<numLocalAtoms; ++i) {
        if ( localPartition[i] == g ) {
          lgd[nga++] = localData[i];
      numGridAtoms[g] = nga;
    localGridData[0] = localData;
    numGridAtoms[0] = numLocalAtoms;
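// At this point localData has been partitioned into per-grid atom lists
// (localGridData/numGridAtoms); which partitions feed which grid depends on
// the alchemical/LES configuration selected by the branches above.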
  if ( ! myMgr->doWorkCount ) {
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
      memset( (void*) myMgr->fz_arr, 0, (myGrid.K3+myGrid.order-1) * sizeof(char) );
      for ( int i=0; i<myMgr->q_count; ++i) {
        memset( (void*) (myMgr->q_list[i]), 0, (myGrid.K3+myGrid.order-1) * sizeof(float) );
    myMgr->strayChargeErrors = 0;
    myMgr->compute_sequence = sequence();
  if ( sequence() != myMgr->compute_sequence ) NAMD_bug("ComputePme sequence mismatch in doWork()");
  int strayChargeErrors = 0;
    data_ptr = localGridData[g];
    for (i=0; i<numGridAtoms[g]; ++i)
      selfEnergy += data_ptr->cg * data_ptr->cg;
    selfEnergy *= -1. * ewaldcof / SQRT_PI;
    myMgr->evir[g][0] += selfEnergy;
    float **q = myMgr->q_arr + g*myMgr->fsize;
    char *f = myMgr->f_arr + g*myMgr->fsize;
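// CUDA path: atoms are packed seven floats apiece -- fractional grid offsets
// (x,y,z), the scaled charge cg, then the starting grid indices shifted back
// by order-1 and wrapped into [0,K) -- the layout consumed by the charge
// spreading kernel.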
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
      cudaMallocHost((void**) &(myMgr->a_data_host), 7*na*sizeof(float));
      cudaMalloc((void**) &(myMgr->a_data_dev), 7*na*sizeof(float));
    int n = numGridAtoms[g];
      CkPrintf("Pe %d expanding CUDA PME atoms allocation because %d > %d\n",
      const float *a_data_host_old = myMgr->a_data_host;
      cudaMallocHost((void**) &(myMgr->a_data_host), 7*na*sizeof(float));
      memcpy(myMgr->a_data_host, a_data_host_old, 7*cuda_atoms_offset*sizeof(float));
      cudaFreeHost((void*) a_data_host_old);
      cudaMalloc((void**) &(myMgr->a_data_dev), 7*na*sizeof(float));
    float *a_data_host = myMgr->a_data_host + 7 * cuda_atoms_offset;
    data_ptr = localGridData[g];
    double order_1 = myGrid.order - 1;
    double K1 = myGrid.K1;
    double K2 = myGrid.K2;
    double K3 = myGrid.K3;
    int found_negative = 0;
    for ( int i=0; i<n; ++i ) {
      if ( data_ptr[i].x < 0 || data_ptr[i].y < 0 || data_ptr[i].z < 0 ) {
      double x_int = (int) data_ptr[i].x;
      double y_int = (int) data_ptr[i].y;
      double z_int = (int) data_ptr[i].z;
      a_data_host[7*i  ] = data_ptr[i].x - x_int;
      a_data_host[7*i+1] = data_ptr[i].y - y_int;
      a_data_host[7*i+2] = data_ptr[i].z - z_int;
      a_data_host[7*i+3] = data_ptr[i].cg;
      x_int -= order_1;  if ( x_int < 0 ) x_int += K1;
      y_int -= order_1;  if ( y_int < 0 ) y_int += K2;
      z_int -= order_1;  if ( z_int < 0 ) z_int += K3;
      a_data_host[7*i+4] = x_int;
      a_data_host[7*i+5] = y_int;
      a_data_host[7*i+6] = z_int;
    if ( found_negative ) NAMD_bug("found negative atom coordinate in ComputePme::doWork");
      myRealSpace[g]->fill_charges(q, myMgr->q_list, myMgr->q_count, strayChargeErrors, f, myMgr->fz_arr, localGridData[g]);
  myMgr->strayChargeErrors += strayChargeErrors;
#ifdef TRACE_COMPUTE_OBJECTS
  if ( --(myMgr->doWorkCount) == 0 ) {
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
    const double before = CmiWallTimer();
        cudaMemcpyHostToDevice, streams[stream]);
    const double after = CmiWallTimer();
    cudaStreamWaitEvent(streams[stream], nodePmeMgr->end_charge_memset, 0);
        q_arr_dev, ffz_dev, ffz_dev + fsize,
    const double after2 = CmiWallTimer();
  cudaError_t err = cudaEventQuery(argp->end_charges);
  if ( err == cudaSuccess ) {
  } else if ( err != cudaErrorNotReady ) {
    sprintf(errmsg,"in cuda_check_pme_charges after polling %d times over %f s on seq %d",
    sprintf(errmsg,"cuda_check_pme_charges polled %d times over %f s on seq %d",
  double before = CmiWallTimer();
  cudaEventRecord(nodePmeMgr->end_all_pme_kernels, 0);
  cudaStreamWaitEvent(streams[stream], nodePmeMgr->end_all_pme_kernels, 0);
  cudaMemcpyAsync(q_data_host, q_data_dev, q_data_size+ffz_size,
      cudaMemcpyDeviceToHost, streams[stream]);
  cudaEventRecord(masterPmeMgr->end_charges, streams[stream]);
  cudaMemsetAsync(q_data_dev, 0, q_data_size + ffz_size, streams[stream]);
  cudaEventRecord(nodePmeMgr->end_charge_memset, streams[stream]);
  pmeProxy[master_pe].pollChargeGridReady();
  for ( int i=0; i<CkMyNodeSize(); ++i ) {
    mgr->ungridForcesCount = cs;
    mgr->recipEvirCount = mgr->recipEvirClients;
  pmeProxy[master_pe].recvChargeGridReady();
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
  NAMD_bug("ComputePmeMgr::pollChargeGridReady() called in non-CUDA build.");
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
    int q_stride = myGrid.K3+myGrid.order-1;
    for ( int n=fsize+q_stride, j=fsize; j<n; ++j) {
      f_arr[j] = ffz_host[j];
      if ( ffz_host[j] & ~1 ) ++errcount;
    if ( errcount ) NAMD_bug("bad flag in ComputePmeMgr::chargeGridReady");
  recipEvirCount = recipEvirClients;
  for ( int j=0; j<myGrid.order-1; ++j) {
    fz_arr[j] |= fz_arr[myGrid.K3+j];
3611 #if 0 && USE_PERSISTENT 3612 if (recvGrid_handle== NULL) setup_recvgrid_persistent();
3616 int dim2 = myGrid.
dim2;
3617 int dim3 = myGrid.
dim3;
3618 int block1 = myGrid.
block1;
3619 int block2 = myGrid.
block2;
3622 NodePmeMgr *npMgr = pmeNodeProxy[CkMyNode()].ckLocalBranch();
3624 for (
int ap=first; ap<=last; ++ap) {
3625 int ib = activePencils[ap].
i;
3626 int jb = activePencils[ap].
j;
3627 int ibegin = ib*block1;
3628 int iend = ibegin + block1;
if ( iend > K1 ) iend = K1;
3629 int jbegin = jb*block2;
3630 int jend = jbegin + block2;
if ( jend > K2 ) jend = K2;
3631 int flen =
numGrids * (iend - ibegin) * (jend - jbegin);
  char *f = f_arr + g*fsize;
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
  for ( int i=ibegin; i<iend; ++i ) {
    for ( int j=jbegin; j<jend; ++j ) {
      // ...
      if ( ffz_host[k] & ~1 ) ++errcount;
    }
  }
  if ( errcount ) NAMD_bug("bad flag in ComputePmeMgr::sendPencilsPart");
#endif
  for ( int i=ibegin; i<iend; ++i ) {
    for ( int j=jbegin; j<jend; ++j ) {
      fcount += f[i*dim2+j];
    }
  }
#ifdef NETWORK_PROGRESS
  CmiNetworkProgress();
#endif
  if ( ! pencilActive[ib*yBlocks+jb] )
    NAMD_bug("PME activePencils list inconsistent");
  // ...
  for ( int i=0; i<myGrid.K3; ++i ) {
    if ( fz_arr[i] ) ++zlistlen;
  }
  int hd = ( fcount ? 1 : 0 );  // has data?
  PmeGridMsg *msg = new ( hd*zlistlen, hd*flen,
      /* remaining varsize arguments elided */ ) PmeGridMsg;
  // ...
  msg->start = fstart;
  // ...
  int *zlist = msg->zlist;
  for ( int i=0; i<myGrid.K3; ++i ) {
    if ( fz_arr[i] ) zlist[zlistlen++] = i;
  }
  char *fmsg = msg->fgrid;
  float *qmsg = msg->qgrid;
  // ...
  char *f = f_arr + g*fsize;
  float **q = q_arr + g*fsize;
  for ( int i=ibegin; i<iend; ++i ) {
    for ( int j=jbegin; j<jend; ++j ) {
      *(fmsg++) = f[i*dim2+j];
      // ...
      for ( int h=0; h<myGrid.order-1; ++h ) {
        q[i*dim2+j][h] += q[i*dim2+j][myGrid.K3+h];  // fold periodic pad into the cell
      }
      for ( int k=0; k<zlistlen; ++k ) {
        *(qmsg++) = q[i*dim2+j][zlist[k]];           // gather only flagged z planes
      }
    }
  }
  CmiEnableUrgentSend(1);
#if USE_NODE_PAR_RECEIVE
  msg->destElem = CkArrayIndex3D(ib,jb,0);
  CProxy_PmePencilMap lzm = npMgr->zm;
  int destproc = lzm.ckLocalBranch()->procNum(0, msg->destElem);
  int destnode = CmiNodeOf(destproc);
  // ...
  CmiUsePersistentHandle(&recvGrid_handle[ap], 1);
  pmeNodeProxy[destnode].recvZGrid(msg);
  CmiUsePersistentHandle(NULL, 0);
#else
  // ...
  CmiUsePersistentHandle(&recvGrid_handle[ap], 1);
  zPencil(ib,jb,0).recvGrid(msg);
  CmiUsePersistentHandle(NULL, 0);
#endif
  CmiEnableUrgentSend(0);
}
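// [Editor's note] The send above uses the Charm++ persistent-handle idiom that
// recurs throughout this file: select a handle, perform exactly one send that
// should travel over it, then deselect. Pattern only; proxy and message names
// are placeholders, not NAMD API:
//
//   CmiUsePersistentHandle(&handles[i], 1);   // route the next send via handle i
//   proxy[dest].recv(msg);                    // the one send that uses it
//   CmiUsePersistentHandle(NULL, 0);          // restore the normal send path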
// --- sendPencilsHelper / sendPencils ---
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
// ...
#else
NAMD_bug("NodePmeMgr::sendPencilsHelper called in non-CUDA build");
#endif

#if defined(NAMD_CUDA) || defined(NAMD_HIP)
for ( int ap=0; ap < numPencilsActive; ++ap ) {
  // ...
  int ib = activePencils[ap].i;
  int jb = activePencils[ap].j;
  int destproc = nodePmeMgr->zm.ckLocalBranch()->procNum(0, CkArrayIndex3D(ib,jb,0));
  pmeProxy[destproc].sendPencilsHelper(ap);
  // ... (alternate path:)
  pmeNodeProxy[CkMyNode()].sendPencilsHelper(ap);
}
#endif
if ( strayChargeErrors ) {
  strayChargeErrors = 0;
  iout << iERROR << "Stray PME grid charges detected: "
       << CkMyPe() << " sending to (x,y)";
  // ...
  int dim2 = myGrid.dim2;
  int block1 = myGrid.block1;
  int block2 = myGrid.block2;
  for ( int ib=0; ib<xBlocks; ++ib ) {
    for ( int jb=0; jb<yBlocks; ++jb ) {
      int ibegin = ib*block1;
      int iend = ibegin + block1;  if ( iend > K1 ) iend = K1;
      int jbegin = jb*block2;
      int jend = jbegin + block2;  if ( jend > K2 ) jend = K2;
      int flen = numGrids * (iend - ibegin) * (jend - jbegin);
      // ...
      char *f = f_arr + g*fsize;
      if ( ! pencilActive[ib*yBlocks+jb] ) {
        for ( int i=ibegin; i<iend; ++i ) {
          for ( int j=jbegin; j<jend; ++j ) {
            if ( f[i*dim2+j] == 3 ) {
              // ...
              iout << " (" << i << "," << j << ")";
            }
          }
        }
      }
    }
  }
}
// --- receive path: scatter pencil message planes back into q_arr ---
int dim2 = myGrid.dim2;
int dim3 = myGrid.dim3;
int block1 = myGrid.block1;
int block2 = myGrid.block2;
// ...
int ibegin = ib*block1;
int iend = ibegin + block1;  if ( iend > K1 ) iend = K1;
int jbegin = jb*block2;
int jend = jbegin + block2;  if ( jend > K2 ) jend = K2;
// ...
int *zlist = msg->zlist;
float *qmsg = msg->qgrid;
// ...
char *f = f_arr + g*fsize;
float **q = q_arr + g*fsize;
for ( int i=ibegin; i<iend; ++i ) {
  for ( int j=jbegin; j<jend; ++j ) {
    // ...
    for ( int k=0; k<zlistlen; ++k ) {
      q[i*dim2+j][zlist[k]] = *(qmsg++);          // scatter flagged z planes back
    }
    for ( int h=0; h<myGrid.order-1; ++h ) {
      q[i*dim2+j][myGrid.K3+h] = q[i*dim2+j][h];  // re-mirror periodic pad
    }
  }
}
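// [Editor's sketch] The send/receive pair above is a sparse gather/scatter
// keyed by zlist. Self-contained restatement; function and parameter names
// are illustrative, not NAMD API:
static void gather_zplanes(const float *col, const int *zlist, int zlen, float *out) {
  for ( int k=0; k<zlen; ++k ) out[k] = col[zlist[k]];       // pack flagged planes
}
static void scatter_zplanes(float *col, const int *zlist, int zlen,
                            const float *in, int K3, int order) {
  for ( int k=0; k<zlen; ++k ) col[zlist[k]] = in[k];        // unpack flagged planes
  for ( int h=0; h<order-1; ++h ) col[K3+h] = col[h];        // refresh wrap-around pad
}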
// ComputePmeMgr::sendDataPart():
CProxy_ComputePmeMgr pmeProxy(CkpvAccess(BOCclass_group).computePmeMgr);
for ( int j=first; j<=last; j++ ) {
  int pe = gridPeOrder[j];
  if ( ! recipPeDest[pe] && ! errors ) continue;
  int start = pe * bsize;
  // ...
  if ( start >= qsize ) { start = 0; len = 0; }
  if ( start + len > qsize ) { len = qsize - start; }
  int zdim = myGrid.dim3;
  int fstart = start / zdim;
  int flen = len / zdim;
  // ...
  char *f = f_arr + fstart + g*fsize;
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
  for ( i=0; i<flen; ++i ) {
    f[i] = ffz_host[fstart+i];
    // ...
    if ( ffz_host[fstart+i] & ~1 ) ++errcount;
  }
  if ( errcount ) NAMD_bug("bad flag in ComputePmeMgr::sendDataPart");
#endif
  for ( i=0; i<flen; ++i ) {
    // ...
  }
  if ( ! recipPeDest[pe] ) {
    // ...
    for ( i=0; i<flen; ++i ) {
      // ...
    }
    iout << iERROR << "Stray PME grid charges detected: "
         << sourcepe << " sending to " << gridPeMap[pe] << " for planes";
    // ...
    for ( i=0; i<flen; ++i ) {
      // ...
      int jz = (i+fstart)/myGrid.K2;
      if ( iz != jz ) { iout << " " << jz; iz = jz; }
    }
  }
#ifdef NETWORK_PROGRESS
  CmiNetworkProgress();
#endif
  if ( ! recipPeDest[pe] ) continue;
  // ...
  for ( i=0; i<myGrid.K3; ++i ) {
    if ( fz_arr[i] ) ++zlistlen;
  }
  // ...
  msg->start = fstart;
  // ...
  int *zlist = msg->zlist;
  for ( i=0; i<myGrid.K3; ++i ) {
    if ( fz_arr[i] ) zlist[zlistlen++] = i;
  }
  float *qmsg = msg->qgrid;
  // ...
  char *f = f_arr + fstart + g*fsize;
  CmiMemcpy((void*)(msg->fgrid+g*flen), (void*)f, flen*sizeof(char));
  float **q = q_arr + fstart + g*fsize;
  for ( i=0; i<flen; ++i ) {
    // ...
    for ( int h=0; h<myGrid.order-1; ++h ) {
      q[i][h] += q[i][myGrid.K3+h];
    }
    for ( int k=0; k<zlistlen; ++k ) {
      *(qmsg++) = q[i][zlist[k]];
    }
  }
  pmeProxy[gridPeMap[pe]].recvGrid(msg);
}
// --- sendDataHelper / sendData ---
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
// ...
#else
NAMD_bug("NodePmeMgr::sendDataHelper called in non-CUDA build");
#endif

strayChargeErrors = 0;
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
for ( int i=0; i < numGridPes; ++i ) {
  int pe = gridPeOrder[i];
  // ...
  pmeProxy[gridPeMap[pe]].sendDataHelper(i);
  // ... (alternate path:)
  pmeNodeProxy[CkMyNode()].sendDataHelper(i);
}
#endif
// --- receive path: scatter slab message planes back into q_arr ---
int zdim = myGrid.dim3;
int flen = msg->len;
int fstart = msg->start;
// ...
int *zlist = msg->zlist;
float *qmsg = msg->qgrid;
// ...
char *f = msg->fgrid + g*flen;
float **q = q_arr + fstart + g*fsize;
for ( int i=0; i<flen; ++i ) {
  // ...
  for ( int k=0; k<zlistlen; ++k ) {
    q[i][zlist[k]] = *(qmsg++);
  }
  for ( int h=0; h<myGrid.order-1; ++h ) {
    q[i][myGrid.K3+h] = q[i][h];
  }
}
// ComputePme::ungridForces():
if ( sequence() != myMgr->compute_sequence )
  NAMD_bug("ComputePme sequence mismatch in ungridForces()");
// ...
Vector *localResults = localResults_alloc.begin();
// ...
for ( int i=0; i<numLocalAtoms; ++i ) { localResults[i] = 0.; }
gridResults = localResults + numLocalAtoms;
// ... (single-grid case:)
gridResults = localResults;

#ifdef NETWORK_PROGRESS
CmiNetworkProgress();
#endif

#if defined(NAMD_CUDA) || defined(NAMD_HIP)
for ( int n=numGridAtoms[g], i=0; i<n; ++i ) {
  // ...
  if ( ((int*)f_data_host)[3*i] == 0x7fffffff ) { errfound = 1; }  // apparent stray-charge sentinel
  gridResults[i].x = f_data_host[3*i];
  gridResults[i].y = f_data_host[3*i+1];
  gridResults[i].z = f_data_host[3*i+2];
}
// ... (error pass, zeroing flagged atoms:)
for ( int n=numGridAtoms[g], i=0; i<n; ++i ) {
  float f = f_data_host[3*i];
  if ( ((int*)f_data_host)[3*i] == 0x7fffffff ) {
    // ...
    gridResults[i] = 0.;
  }
}
iout << iERROR << "Stray PME grid charges detected: "
     << errcount << " atoms on pe " << CkMyPe() << "\n" << endi;
#endif
// ...
myRealSpace[g]->compute_forces(myMgr->q_arr+g*myMgr->fsize, localGridData[g], gridResults);
// --- alchemical scaling of per-grid forces ---
BigReal elecLambdaUp, elecLambdaDown;
// ...
myMgr->alchLambda = alchLambda;
// ...
myMgr->alchLambda2 = alchLambda2;
elecLambdaUp = simParams->getElecLambda(alchLambda);
elecLambdaDown = simParams->getElecLambda(1. - alchLambda);
// ...
if ( g == 0 ) scale = elecLambdaUp;
else if ( g == 1 ) scale = elecLambdaDown;
else if ( g == 2 ) scale = (elecLambdaUp + elecLambdaDown - 1)*(-1);
// ... (judging from the parallel scale2 logic below, this chain is the alchDecouple case:)
if ( g == 2 ) scale = 1 - elecLambdaUp;
else if ( g == 3 ) scale = 1 - elecLambdaDown;
else if ( g == 4 ) scale = (elecLambdaUp + elecLambdaDown - 1)*(-1);
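// [Editor's sketch] Compact restatement of the per-grid scale selection above,
// assuming (as the parallel scale2 logic in submitReductions below suggests)
// that the g==2..4 chain applies when alchDecouple is set:
static double pme_grid_scale(int g, double lUp, double lDown, bool alchDecouple) {
  if ( g == 0 ) return lUp;
  if ( g == 1 ) return lDown;
  if ( ! alchDecouple ) {
    if ( g == 2 ) return -(lUp + lDown - 1);
  } else {
    if ( g == 2 ) return 1 - lUp;
    if ( g == 3 ) return 1 - lDown;
    if ( g == 4 ) return -(lUp + lDown - 1);
  }
  return 1.;
}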
for ( int i=0; i<numLocalAtoms; ++i ) {
  if ( localPartition[i] == 0 || localPartition[i] == (g+1) || localPartition[i] == (g+3) ) {
    // ...
    localResults[i] += gridResults[nga++] * scale;
  }
}
// ...
for ( int i=0; i<numLocalAtoms; ++i ) {
  if ( localPartition[i] == 0 ) {
    // ...
    localResults[i] += gridResults[nga++] * scale;
  }
}
// ...
for ( int i=0; i<numLocalAtoms; ++i ) {
  if ( localPartition[i] == 0 || localPartition[i] == (g+1) ) {
    // ...
    localResults[i] += gridResults[nga++] * scale;
  }
}
// ...
for ( int i=0; i<numLocalAtoms; ++i ) {
  if ( localPartition[i] == (g-1) || localPartition[i] == (g-4) ) {
    // ...
    localResults[i] += gridResults[nga++] * scale;
  }
}
// ...
} else if ( lesOn ) {
  // ...
  myMgr->alchLambda = alchLambda;
  // ...
  myMgr->alchLambda2 = alchLambda2;
  if ( g == 0 ) scale = alchLambda;
  else if ( g == 1 ) scale = 1. - alchLambda;
  // ...
} else if ( lesOn ) {
  // ...
  for ( int i=0; i<numLocalAtoms; ++i ) {
    if ( localPartition[i] == 0 || localPartition[i] == (g+1) ) {
      localResults[i] += gridResults[nga++] * scale;
    }
  }
  // ... (pair-interaction branches, selected by grid g:)
  for ( int i=0; i<numLocalAtoms; ++i ) {
    if ( localPartition[i] == 1 ) {
      pairForce += gridResults[nga];
      localResults[i] += gridResults[nga++];
    }
  }
  // ...
  for ( int i=0; i<numLocalAtoms; ++i ) {
    if ( localPartition[i] == 1 ) {
      pairForce += gridResults[nga];
    }
    // ...
    if ( localPartition[i] == 1 || localPartition[i] == 2 ) {
      localResults[i] += gridResults[nga++];
    }
  }
} else if ( g == 1 ) {
  // ...
  for ( int i=0; i<numLocalAtoms; ++i ) {
    if ( localPartition[i] == g ) {
      pairForce -= gridResults[nga];    // this grid's contribution is subtracted
      localResults[i] -= gridResults[nga++];
    }
  }
  // ...
  for ( int i=0; i<numLocalAtoms; ++i ) {
    if ( localPartition[i] == g ) {
      localResults[i] -= gridResults[nga++];
    }
  }
}
Vector *results_ptr = localResults;
// ...
if ( ! myMgr->strayChargeErrors && ! simParams->commOnly ) {
  for ( int i=0; i<numAtoms; ++i ) {
    f[i].x += results_ptr->x;
    f[i].y += results_ptr->y;
    f[i].z += results_ptr->z;
    // ...
  }
}
forceBox->close(&r);
// ComputePmeMgr::submitReductions():
BigReal elecLambdaUp, elecLambdaDown;
// ...
if ( alchLambda < 0 || alchLambda > 1 ) {
  NAMD_bug("ComputePmeMgr::submitReductions alchLambda out of range");
}
// ...
elecLambdaUp = simParams->getElecLambda(alchLambda);
elecLambdaDown = simParams->getElecLambda(1-alchLambda);
if ( g == 0 ) scale = elecLambdaUp;
else if ( g == 1 ) scale = elecLambdaDown;
else if ( g == 2 ) scale = (elecLambdaUp + elecLambdaDown - 1)*(-1);
// ... (alchDecouple case:)
if ( g == 2 ) scale = 1-elecLambdaUp;
else if ( g == 3 ) scale = 1-elecLambdaDown;
else if ( g == 4 ) scale = (elecLambdaUp + elecLambdaDown - 1)*(-1);
// ...
} else if ( lesOn ) {
  // ...
  scale = ( g == 0 ? 1. : -1. );
}
4267 reduction->
item(REDUCTION_VIRIAL_SLOW_XX) += evir[g][1] * scale;
4268 reduction->
item(REDUCTION_VIRIAL_SLOW_XY) += evir[g][2] * scale;
4269 reduction->
item(REDUCTION_VIRIAL_SLOW_XZ) += evir[g][3] * scale;
4270 reduction->
item(REDUCTION_VIRIAL_SLOW_YX) += evir[g][2] * scale;
4271 reduction->
item(REDUCTION_VIRIAL_SLOW_YY) += evir[g][4] * scale;
4272 reduction->
item(REDUCTION_VIRIAL_SLOW_YZ) += evir[g][5] * scale;
4273 reduction->
item(REDUCTION_VIRIAL_SLOW_ZX) += evir[g][3] * scale;
4274 reduction->
item(REDUCTION_VIRIAL_SLOW_ZY) += evir[g][5] * scale;
4275 reduction->
item(REDUCTION_VIRIAL_SLOW_ZZ) += evir[g][6] * scale;
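// [Editor's sketch] evir[g][1..6] holds the six unique components of the
// symmetric slow virial (xx, xy, xz, yy, yz, zz); the nine additions above
// mirror xy->yx, xz->zx, yz->zy. Equivalent standalone form (names are
// illustrative):
static void add_symmetric_virial(double v[3][3], const double *e, double scale) {
  v[0][0] += e[1]*scale;  v[0][1] += e[2]*scale;  v[0][2] += e[3]*scale;
  v[1][0] += e[2]*scale;  v[1][1] += e[4]*scale;  v[1][2] += e[5]*scale;
  v[2][0] += e[3]*scale;  v[2][1] += e[5]*scale;  v[2][2] += e[6]*scale;
}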
// --- scaling at the alternate lambda (alchLambda2) ---
BigReal elecLambda2Up=0.0, elecLambda2Down=0.0;
elecLambda2Up = simParams->getElecLambda(alchLambda2);
elecLambda2Down = simParams->getElecLambda(1.-alchLambda2);
if ( g == 0 ) scale2 = elecLambda2Up;
else if ( g == 1 ) scale2 = elecLambda2Down;
else if ( g == 2 ) scale2 = (elecLambda2Up + elecLambda2Down - 1)*(-1);
if ( alchDecouple && g == 2 ) scale2 = 1 - elecLambda2Up;
else if ( alchDecouple && g == 3 ) scale2 = 1 - elecLambda2Down;
else if ( alchDecouple && g == 4 ) scale2 = (elecLambda2Up + elecLambda2Down - 1)*(-1);
// ...
for ( int i=0; i<heldComputes.size(); ++i ) {
  // ...
}
const static unsigned int NAMDPrimes[] = {
  // ... (table of primes elided in extraction)
};

bool generateBGLORBPmePeList(int *pemap, int numPes,
                             int *block_pes, int nbpes) {
  // ...
  int *pmemap = new int [CkNumPes()];
  // ...
  memset(pmemap, 0, sizeof(int) * CkNumPes());
  for ( int count = 0; count < CkNumPes(); count++ ) {
    // ...
    pmemap[block_pes[count]] = 1;
    // ...
    if ( tmgr.hasMultipleProcsPerNode() ) {
      pmemap[(count + CkNumPes()/2) % CkNumPes()] = 1;
    }
  }
  // ...
  CProxy_Node nd(CkpvAccess(BOCclass_group).node);
  Node *node = nd.ckLocalBranch();
  // ...
  int xsize = 0, ysize = 0, zsize = 0;
  xsize = tmgr.getDimNX();
  ysize = tmgr.getDimNY();
  zsize = tmgr.getDimNZ();
  int nx = xsize, ny = ysize, nz = zsize;
  // ...
  findOptimalDimensions(xsize, ysize, zsize, nx, ny, nz, dm);
  // ...
  int group_size = numPes/nx;
  // ...
  int my_prime = NAMDPrimes[0];
  int density = (ny * nz)/group_size + 1;
  // pick the first prime that exceeds the expected pick density
  for ( count = 0; count < NPRIMES; count++ ) {
    if ( density < NAMDPrimes[count] ) {
      my_prime = NAMDPrimes[count];
      break;
    }
  }
  if ( count == NPRIMES )
    my_prime = NAMDPrimes[NPRIMES-1];
  // ...
  for ( int x = 0; x < nx; x++ ) {
    coord[0] = (x + nx/2) % nx;
    // ...
    for ( count=0; count < group_size && npme_pes < numPes; count++ ) {
      int dest = (count + 1) * my_prime;
      dest = dest % (ny * nz);
      // ...
      coord[2] = dest / ny;
      coord[1] = dest - coord[2] * ny;
      // ...
      int destPe = coord[dm.x] + coord[dm.y] * xsize +
                   coord[dm.z] * xsize * ysize;
      if ( pmemap[destPe] == 0 ) {
        pemap[gcount++] = destPe;
        // ...
        if ( tmgr.hasMultipleProcsPerNode() )
          pmemap[(destPe + CkNumPes()/2) % CkNumPes()] = 1;
        // ...
      }
      // ... (on collision, probe subsequent slots:)
      for ( int pos = 1; pos < ny * nz; pos++ ) {
        // ...
        coord[2] += pos / ny;
        coord[1] += pos % ny;
        // ...
        coord[2] = coord[2] % nz;
        coord[1] = coord[1] % ny;
        // ...
        int newdest = coord[dm.x] + coord[dm.y] * xsize +
                      coord[dm.z] * xsize * ysize;
        if ( pmemap[newdest] == 0 ) {
          pemap[gcount++] = newdest;
          pmemap[newdest] = 1;
          // ...
          if ( tmgr.hasMultipleProcsPerNode() )
            pmemap[(newdest + CkNumPes()/2) % CkNumPes()] = 1;
          // ...
        }
      }
    }
    // ...
    if ( gcount == numPes ) /* ... */;
    if ( npme_pes >= numPes ) /* ... */;
  }
  // ...
  if ( npme_pes != numPes ) /* ... */;
  // ...
}
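// [Editor's sketch] The placement loop above walks the ny*nz plane of the
// torus with a prime stride chosen to exceed the pick density, so consecutive
// picks land far apart before any slot repeats. The visiting order in
// isolation (illustrative names):
static void prime_stride_order(int ny, int nz, int prime, int count, int *order_out) {
  int slots = ny * nz;
  for ( int c=0; c<count; ++c ) {
    // picks stay distinct as long as gcd(prime, slots) == 1
    order_out[c] = ((c + 1) * prime) % slots;
  }
}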
trans_handle = untrans_handle = ungrid_handle = NULL;
// ...
for ( int i=0; i<nBlocks; ++i ) send_order[i] = i;
// ...
#ifndef CmiMemoryAtomicType
// ...
#endif
PersistentHandle *trans_handle;
PersistentHandle *untrans_handle;
PersistentHandle *ungrid_handle;
// --- class PmeZPencil ---
PmeZPencil_SDAG_CODE
// ...
delete [] forward_plans;
delete [] backward_plans;
// ...
// (FFTW3 build:)
fftwf_plan forward_plan, backward_plan;
// ...
fftwf_plan *forward_plans, *backward_plans;
// (FFTW2 build:)
rfftwnd_plan forward_plan, backward_plan;
// ...
void setup_persistent() {
  // ...
  CmiAssert(yPencil_local);
  trans_handle = (PersistentHandle*) malloc(sizeof(PersistentHandle) * zBlocks);
  for ( int isend=0; isend<zBlocks; ++isend ) {
    // ...
    if ( (kb+1)*block3 > dim3/2 ) nz1 = dim3/2 - kb*block3;
    int peer = yPencil_local->homePe(CkArrayIndex3D(thisIndex.x, 0, kb));
    // ...
    int compress_start = sizeof(PmeTransMsg) + sizeof(envelope);
    int compress_size = sizeof(float)*hd*nx*ny*nz1*2;
    trans_handle[isend] = CmiCreateCompressPersistentSize(peer, size,
        compress_start, compress_size, CMI_FLOATING);
  }
}

void setup_ungrid_persistent()
{
  ungrid_handle = (PersistentHandle*) malloc(sizeof(PersistentHandle) * grid_msgs.size());
  // ...
  for ( limsg=0; limsg < grid_msgs.size(); ++limsg ) {
    int peer = grid_msgs[limsg]->sourceNode;
    // ...
  }
}
// --- class PmeYPencil ---
PmeYPencil_SDAG_CODE
// ...
// (FFTW3 build:)
fftwf_plan forward_plan, backward_plan;
// (FFTW2 build:)
fftw_plan forward_plan, backward_plan;
// ...
void setup_persistent() {
  // ...
  trans_handle = (PersistentHandle*) malloc(sizeof(PersistentHandle) * yBlocks);
  for ( int isend=0; isend<yBlocks; ++isend ) {
    // ...
    if ( (jb+1)*block2 > K2 ) ny1 = K2 - jb*block2;
    int peer = xPencil_local->homePe(CkArrayIndex3D(0, jb, thisIndex.z));
    // ...
    int compress_start = sizeof(PmeTransMsg) + sizeof(envelope);
    int compress_size = sizeof(float)*hd*nx*ny1*nz*2;
    trans_handle[isend] = CmiCreateCompressPersistentSize(peer, size,
        compress_start, compress_size, CMI_FLOATING);
  }
  // ...
  untrans_handle = (PersistentHandle*) malloc(sizeof(PersistentHandle) * yBlocks);
  for ( int isend=0; isend<yBlocks; ++isend ) {
    // ...
    if ( (jb+1)*block2 > K2 ) ny1 = K2 - jb*block2;
    int peer = zPencil_local->homePe(CkArrayIndex3D(thisIndex.x, jb, 0));
    // ...
    int compress_start = sizeof(PmeUntransMsg) + sizeof(envelope);
    int compress_size = sizeof(float)*nx*ny1*nz*2;
    untrans_handle[isend] = CmiCreateCompressPersistentSize(peer, size,
        compress_start, compress_size, CMI_FLOATING);
  }
}
// --- class PmeXPencil ---
PmeXPencil_SDAG_CODE
// ...
delete [] forward_plans;
delete [] backward_plans;
// ...
fftwf_plan *forward_plans, *backward_plans;
// ...
void setup_persistent() {
  // ...
  untrans_handle = (PersistentHandle*) malloc(sizeof(PersistentHandle) * xBlocks);
  for ( int isend=0; isend<xBlocks; ++isend ) {
    // ...
    if ( (ib+1)*block1 > K1 ) nx1 = K1 - ib*block1;
    int peer = yPencil_local->procNum(CkArrayIndex3D(ib, 0, thisIndex.z));
    // ...
    int compress_start = sizeof(PmeUntransMsg) + sizeof(envelope);
    int compress_size = sizeof(float)*nx1*ny*nz*2;
    untrans_handle[isend] = CmiCreateCompressPersistentSize(peer, size,
        compress_start, compress_size, CMI_FLOATING);
  }
}
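// [Editor's note] All three setup_persistent() variants size the compression
// window the same way: skip the Charm++ envelope plus the fixed message
// header, and compress only the float payload (interleaved complex pairs,
// hence the factor of 2). Pattern, with a placeholder message type:
//
//   int compress_start = sizeof(SomePmeMsg) + sizeof(envelope);  // skip headers
//   int compress_size  = sizeof(float) * nElems * 2;             // payload only
//   handle = CmiCreateCompressPersistentSize(peer, size,
//                compress_start, compress_size, CMI_FLOATING);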
// --- z-pencil FFT setup ---
CProxy_Node nd(CkpvAccess(BOCclass_group).node);
Node *node = nd.ckLocalBranch();
// ...
#if USE_NODE_PAR_RECEIVE
// ...
#endif
if ( (thisIndex.x + 1) * block1 > K1 ) nx = K1 - thisIndex.x * block1;
// ...
if ( (thisIndex.y + 1) * block2 > K2 ) ny = K2 - thisIndex.y * block2;
// ...
work = new float[dim3];
// ...
// (FFTW3 build:)
int fftwFlags = simParams->FFTWPatient ? FFTW_PATIENT :
                simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE;
int sizeLines = nx*ny;
int planLineSizes[1];
planLineSizes[0] = K3;
// ...
int ndimHalf = ndim/2;
forward_plan = fftwf_plan_many_dft_r2c(1, planLineSizes, sizeLines,
    (float *) data, NULL, 1, /* ... */
    (fftwf_complex *) data, NULL, 1, /* ... */);
backward_plan = fftwf_plan_many_dft_c2r(1, planLineSizes, sizeLines,
    (fftwf_complex *) data, NULL, 1, /* ... */
    (float *) data, NULL, 1, /* ... */);
#if CMK_SMP && USE_CKLOOP
// ...
numPlans = (nx<=ny ? nx : ny);
if ( numPlans < CkMyNodeSize() ) numPlans = (nx>=ny ? nx : ny);
if ( numPlans < CkMyNodeSize() ) numPlans = sizeLines;
int howmany = sizeLines/numPlans;
forward_plans = new fftwf_plan[numPlans];
backward_plans = new fftwf_plan[numPlans];
for ( int i=0; i<numPlans; i++ ) {
  int dimStride = i*ndim*howmany;
  int dimHalfStride = i*ndimHalf*howmany;
  forward_plans[i] = fftwf_plan_many_dft_r2c(1, planLineSizes, howmany,
      ((float *) data)+dimStride, NULL, 1, /* ... */
      ((fftwf_complex *) data)+dimHalfStride, NULL, 1, /* ... */);
  backward_plans[i] = fftwf_plan_many_dft_c2r(1, planLineSizes, howmany,
      ((fftwf_complex *) data)+dimHalfStride, NULL, 1, /* ... */
      ((float *) data)+dimStride, NULL, 1, /* ... */);
}
// ... (no CkLoop:)
forward_plans = NULL;
backward_plans = NULL;
#endif
// (FFTW2 build:)
forward_plan = rfftwnd_create_plan_specific(1, &K3, FFTW_REAL_TO_COMPLEX,
    ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
    | FFTW_IN_PLACE | FFTW_USE_WISDOM, data, 1, work, 1);
backward_plan = rfftwnd_create_plan_specific(1, &K3, FFTW_COMPLEX_TO_REAL,
    ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
    | FFTW_IN_PLACE | FFTW_USE_WISDOM, data, 1, work, 1);
// ... (no FFTW compiled in:)
NAMD_die("Sorry, FFTW must be compiled in to use PME.");

#if USE_NODE_PAR_RECEIVE
// ...
memset(data, 0, sizeof(float) * nx*ny*dim3);
#endif
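// [Editor's sketch] The z-pencil plan above batches nx*ny contiguous real
// lines of length K3 through FFTW's advanced interface. A minimal standalone
// equivalent, assuming <fftw3.h>, single precision, and the usual in-place
// r2c layout (each line padded to 2*(K3/2+1) floats):
static fftwf_plan make_zline_plan(float *data, int K3, int nlines) {
  int n[1] = { K3 };
  int ndim = 2 * (K3/2 + 1);      // padded so the complex result fits in place
  return fftwf_plan_many_dft_r2c(1, n, nlines,
      data, NULL, 1, ndim,                      // input: stride 1, dist ndim
      (fftwf_complex *) data, NULL, 1, ndim/2,  // output shares the buffer
      FFTW_ESTIMATE);
}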
// --- y-pencil FFT setup ---
CProxy_Node nd(CkpvAccess(BOCclass_group).node);
Node *node = nd.ckLocalBranch();
// ...
#if USE_NODE_PAR_RECEIVE
// ...
#endif
if ( (thisIndex.x + 1) * block1 > K1 ) nx = K1 - thisIndex.x * block1;
// ...
if ( (thisIndex.z+1)*block3 > dim3/2 ) nz = dim3/2 - thisIndex.z*block3;
// ...
work = new float[2*K2];
// ...
// (FFTW3 build:)
int planLineSizes[1];
planLineSizes[0] = K2;
int fftwFlags = simParams->FFTWPatient ? FFTW_PATIENT :
                simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE;
forward_plan = fftwf_plan_many_dft(1, planLineSizes, sizeLines,
    (fftwf_complex *) data, NULL, sizeLines, 1,
    (fftwf_complex *) data, NULL, sizeLines, 1,
    /* sign and flags elided */);
backward_plan = fftwf_plan_many_dft(1, planLineSizes, sizeLines,
    (fftwf_complex *) data, NULL, sizeLines, 1,
    (fftwf_complex *) data, NULL, sizeLines, 1,
    /* sign and flags elided */);
CkAssert(forward_plan != NULL);
CkAssert(backward_plan != NULL);
// (FFTW2 build:)
forward_plan = fftw_create_plan_specific(K2, FFTW_FORWARD,
    ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
    | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) data,
    nz, (fftw_complex *) work, 1);
backward_plan = fftw_create_plan_specific(K2, FFTW_BACKWARD,
    ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
    | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) data,
    nz, (fftw_complex *) work, 1);
// ... (no FFTW compiled in:)
NAMD_die("Sorry, FFTW must be compiled in to use PME.");
#if USE_NODE_PAR_RECEIVE
// ...
CmiMemoryWriteFence();
#endif

// --- node_process_trans / node_process_untrans ---
CmiMemoryAtomicFetchAndInc(imsg, limsg);
// ...
CmiMemoryWriteFence();
// ...
if ( ! hasData ) NAMD_bug("PmeYPencil::node_process_untrans non-null msg but not hasData");
// ...
} else if ( hasData ) NAMD_bug("PmeYPencil::node_process_untrans hasData but null msg");
// ...
CmiMemoryAtomicFetchAndInc(imsgb, limsg);
// ...
CmiMemoryWriteFence();
#define DEBUG_NODE_PAR_RECV 0

#if DEBUG_NODE_PAR_RECV
// ...
CkAbort("xpencil in recvXTrans not found, debug registration");
#endif
// ...
#if DEBUG_NODE_PAR_RECV
// ...
CkAbort("ypencil in recvYTrans not found, debug registration");
#endif
// ...
#if DEBUG_NODE_PAR_RECV
// ...
CkAbort("ypencil in recvYUntrans not found, debug registration");
#endif
// ...
#if DEBUG_NODE_PAR_RECV
// ...
CkAbort("zpencil in recvZUntrans not found, debug registration");
#endif
// ...
#if DEBUG_NODE_PAR_RECV
// ...
CkAbort("zpencil in recvZGrid not found, debug registration");
#endif
// --- x-pencil FFT setup ---
CProxy_Node nd(CkpvAccess(BOCclass_group).node);
Node *node = nd.ckLocalBranch();
// ...
#if USE_NODE_PAR_RECEIVE
// ...
#endif
if ( (thisIndex.y + 1) * block2 > K2 ) ny = K2 - thisIndex.y * block2;
// ...
if ( (thisIndex.z+1)*block3 > dim3/2 ) nz = dim3/2 - thisIndex.z*block3;
// ...
work = new float[2*K1];
// ...
// (FFTW3 build:)
int fftwFlags = simParams->FFTWPatient ? FFTW_PATIENT :
                simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE;
int sizeLines = ny*nz;
int planLineSizes[1];
planLineSizes[0] = K1;
forward_plan = fftwf_plan_many_dft(1, planLineSizes, sizeLines,
    (fftwf_complex *) data, NULL, sizeLines, 1,
    (fftwf_complex *) data, NULL, sizeLines, 1,
    /* sign and flags elided */);
backward_plan = fftwf_plan_many_dft(1, planLineSizes, sizeLines,
    (fftwf_complex *) data, NULL, sizeLines, 1,
    (fftwf_complex *) data, NULL, sizeLines, 1,
    /* sign and flags elided */);
#if CMK_SMP && USE_CKLOOP
// ...
if ( sizeLines/numPlans < 4 ) numPlans = 1;
int howmany = sizeLines/numPlans;
forward_plans = new fftwf_plan[numPlans];
backward_plans = new fftwf_plan[numPlans];
for ( int i=0; i<numPlans; i++ ) {
  int curStride = i*howmany;
  forward_plans[i] = fftwf_plan_many_dft(1, planLineSizes, howmany,
      ((fftwf_complex *) data)+curStride, NULL, sizeLines, 1,
      ((fftwf_complex *) data)+curStride, NULL, sizeLines, 1,
      /* sign and flags elided */);
  backward_plans[i] = fftwf_plan_many_dft(1, planLineSizes, howmany,
      ((fftwf_complex *) data)+curStride, NULL, sizeLines, 1,
      ((fftwf_complex *) data)+curStride, NULL, sizeLines, 1,
      /* sign and flags elided */);
}
// ... (no CkLoop:)
forward_plans = NULL;
backward_plans = NULL;
#endif
// (FFTW2 build:)
forward_plan = fftw_create_plan_specific(K1, FFTW_FORWARD,
    ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
    | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) data,
    /* ... */);
backward_plan = fftw_create_plan_specific(K1, FFTW_BACKWARD,
    ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
    | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) data,
    /* ... */);
// ... (no FFTW compiled in:)
NAMD_die("Sorry, FFTW must be compiled in to use PME.");
// ... (tail of an initialization call; the leading part was lost in extraction:)
/* ... */ thisIndex.y*block2, thisIndex.y*block2 + ny,
          thisIndex.z*block3, thisIndex.z*block3 + nz);
// --- z-pencil grid receive ---
#if ! USE_NODE_PAR_RECEIVE
memset(data, 0, sizeof(float)*nx*ny*dim3);
#endif
// ...
// (vectorized build: copy the index list into an aligned work buffer:)
int * __restrict msg_zlist = msg->zlist;
int * __restrict zlist = (int*) __builtin_assume_aligned(work_zlist.begin(),
    /* alignment constant elided */);
for ( int k=0; k<zlistlen; ++k ) {
  zlist[k] = msg_zlist[k];
}
// ... (otherwise use the message buffer directly:)
int * __restrict zlist = msg->zlist;
// ...
char * __restrict fmsg = msg->fgrid;
float * __restrict qmsg = msg->qgrid;
float * __restrict d = data;
// ...
for ( int g=0; g<numGrids; ++g ) {
  for ( int i=0; i<nx; ++i ) {
    for ( int j=0; j<ny; ++j, d += dim3 ) {
      // ...
      for ( int k=0; k<zlistlen; ++k ) {
        d[zlist[k]] += *(qmsg++);   // accumulate sparse z planes into the pencil
      }
    }
  }
}
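// [Editor's note] Copying msg->zlist into work_zlist before the scatter loop
// above lets the compiler treat the index array as aligned and non-aliased
// (__restrict), which helps it vectorize the d[zlist[k]] accumulation.
// Minimal form of the builtin (GCC/Clang; the 64-byte alignment shown here is
// an assumption -- the actual constant was lost in extraction):
//
//   int * __restrict zlist =
//       (int *) __builtin_assume_aligned(work_zlist.begin(), 64);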
static inline void PmeXZPencilFFT(int first, int last, void *result,
                                  int paraNum, void *param) {
  // CkLoop worker: execute this PE's share of the per-chunk FFTW plans
  fftwf_plan *plans = (fftwf_plan *) param;
  for ( int i=first; i<=last; i++ ) fftwf_execute(plans[i]);
}
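// [Editor's note] This helper matches the CkLoop work-function signature used
// by the calls below, e.g.
//   CkLoop_Parallelize(PmeXZPencilFFT, 1, (void *)forward_plans,
//                      CkMyNodeSize(), 0, numPlans-1);
// CkLoop splits the inclusive index range [0, numPlans-1] across the PEs of
// the node; each invocation receives its [first, last] chunk plus the shared
// `param` pointer. `result` is unused here because no reduction is requested.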
// --- z-pencil forward FFT ---
// (test-data fill, enabled only in FFT-check builds:)
float std_base = 100. * (thisIndex.x+1.) + 10. * (thisIndex.y+1.);
for ( int i=0; i<nx; ++i ) {
  for ( int j=0; j<ny; ++j, d += dim3 ) {
    for ( int k=0; k<dim3; ++k ) {
      d[k] = 10. * (10. * (10. * std_base + i) + j) + k;
    }
  }
}

#ifdef MANUAL_DEBUG_FFTW3
dumpMatrixFloat3("fw_z_b", data, nx, ny, initdata.grid.dim3,
                 thisIndex.x, thisIndex.y, thisIndex.z);
#endif
#if CMK_SMP && USE_CKLOOP
// ...
CkLoop_Parallelize(PmeXZPencilFFT, 1, (void *)forward_plans,
                   CkMyNodeSize(), 0, numPlans-1);
#endif
// (single-plan FFTW3 path:)
fftwf_execute(forward_plan);
// (FFTW2 path; trailing arguments elided:)
rfftwnd_real_to_complex(forward_plan, nx*ny, /* ... */);
#ifdef MANUAL_DEBUG_FFTW3
dumpMatrixFloat3("fw_z_a", data, nx, ny, initdata.grid.dim3,
                 thisIndex.x, thisIndex.y, thisIndex.z);
#endif
// (zero-check diagnostic:)
for ( int i=0; i<nx; ++i ) {
  for ( int j=0; j<ny; ++j, d += dim3 ) {
    for ( int k=0; k<dim3; ++k ) {
      if ( d[k] == 0. ) CkPrintf("0 in Z at %d %d %d %d %d %d %d %d %d\n",
                                 thisIndex.x, thisIndex.y, i, j, k, nx, ny, dim3);
    }
  }
}
// --- z-pencil transpose send (subset) ---
for ( int isend=fromIdx; isend<=toIdx; ++isend ) {
  // ...
  if ( (kb+1)*block3 > dim3/2 ) nz = dim3/2 - kb*block3;
  // ...
  float *md = msg->qgrid;
  const float *d = data;
  for ( int i=0; i<nx; ++i ) {
    for ( int j=0; j<ny; ++j, d += dim3 ) {
      for ( int k=kb*block3; k<(kb*block3+nz); ++k ) {
        // ... (copy complex pairs into the message)
      }
    }
  }
  // ...
  CmiEnableUrgentSend(1);
#if USE_NODE_PAR_RECEIVE
  msg->destElem = CkArrayIndex3D(thisIndex.x, 0, kb);
  // ...
  CmiUsePersistentHandle(&trans_handle[isend], 1);
  // ... (node-level send)
  CmiUsePersistentHandle(NULL, 0);
#else
  CmiUsePersistentHandle(&trans_handle[isend], 1);
  // ... (array-element send)
  CmiUsePersistentHandle(NULL, 0);
#endif
  CmiEnableUrgentSend(0);
}
// --- z-pencil transpose send ---
#if USE_PERSISTENT
if ( trans_handle == NULL ) setup_persistent();
#endif
#if CMK_SMP && USE_CKLOOP
// ... (optionally split the sends across the node with CkLoop)
#endif
for ( int isend=0; isend<zBlocks; ++isend ) {
  // ...
  if ( (kb+1)*block3 > dim3/2 ) nz = dim3/2 - kb*block3;
  // ...
  float *md = msg->qgrid;
  const float *d = data;
  for ( int i=0; i<nx; ++i ) {
    for ( int j=0; j<ny; ++j, d += dim3 ) {
      for ( int k=kb*block3; k<(kb*block3+nz); ++k ) {
        // ...
      }
    }
  }
  // ...
  CmiEnableUrgentSend(1);
#if USE_NODE_PAR_RECEIVE
  msg->destElem = CkArrayIndex3D(thisIndex.x, 0, kb);
  // ...
  CmiUsePersistentHandle(&trans_handle[isend], 1);
  // ...
  CmiUsePersistentHandle(NULL, 0);
#else
  CmiUsePersistentHandle(&trans_handle[isend], 1);
  // ...
  CmiUsePersistentHandle(NULL, 0);
#endif
  CmiEnableUrgentSend(0);
}
5410 const float *md = msg->
qgrid;
5412 for (
int i=0; i<nx; ++i, d += K2*nz*2 ) {
5413 for (
int j=jb*block2; j<(jb*block2+ny); ++j ) {
5414 for (
int k=0; k<nz; ++k ) {
5416 if ( (*md) == 0. ) CkPrintf(
"0 in ZY at %d %d %d %d %d %d %d %d %d\n",
5417 thisIndex.x, jb, thisIndex.z, i, j, k, nx, ny, nz);
5419 d[2*(j*nz+k)] = *(md++);
5420 d[2*(j*nz+k)+1] = *(md++);
5426 for (
int i=0; i<nx; ++i, d += K2*nz*2 ) {
5427 for (
int j=jb*block2; j<(jb*block2+ny); ++j ) {
5428 for (
int k=0; k<nz; ++k ) {
5430 d[2*(j*nz+k)+1] = 0;
// --- y-pencil forward FFT (CkLoop chunk helper) ---
for ( int i=fromIdx; i<=toIdx; i++ ) {
  fftwf_execute_dft(forward_plan, ((fftwf_complex *) data) + i /* * stride */,
                    /* output pointer elided */);
}

#ifdef MANUAL_DEBUG_FFTW3
dumpMatrixFloat3("fw_y_b", data, nx, initdata.grid.K2, nz,
                 thisIndex.x, thisIndex.y, thisIndex.z);
#endif
#if CMK_SMP && USE_CKLOOP
// ...
#endif
// (FFTW3, per-plane execute:)
for ( int i=0; i<nx; ++i ) {
  fftwf_execute_dft(forward_plan, ((fftwf_complex *) data) + i /* * stride */,
                    /* output pointer elided */);
}
// (FFTW2 path:)
for ( int i=0; i<nx; ++i ) {
  fftw(forward_plan, nz,
       /* ... */, nz, 1, (fftw_complex *) work, 1, 0);
}
#ifdef MANUAL_DEBUG_FFTW3
dumpMatrixFloat3("fw_y_a", data, nx, initdata.grid.dim2, nz,
                 thisIndex.x, thisIndex.y, thisIndex.z);
#endif
// --- y-pencil transpose send (subset) ---
for ( int isend=fromIdx; isend<=toIdx; ++isend ) {
  // ...
  if ( (jb+1)*block2 > K2 ) ny = K2 - jb*block2;
  // ...
  float *md = msg->qgrid;
  const float *d = data;
  for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
    for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
      for ( int k=0; k<nz; ++k ) {
        *(md++) = d[2*(j*nz+k)];
        *(md++) = d[2*(j*nz+k)+1];
        // ... (zero-check diagnostic:)
        if ( *(md-2) == 0. ) CkPrintf("send 0 in YX at %d %d %d %d %d %d %d %d %d\n",
                                      thisIndex.x, jb, thisIndex.z, i, j, k, nx, ny, nz);
      }
    }
  }
  if ( md != msg->qgrid + nx*ny*nz*2 )
    CkPrintf("error in YX at %d %d %d\n", thisIndex.x, jb, thisIndex.z);
  // ...
  CmiEnableUrgentSend(1);
#if USE_NODE_PAR_RECEIVE
  msg->destElem = CkArrayIndex3D(0, jb, thisIndex.z);
  // ...
  CmiUsePersistentHandle(&trans_handle[isend], 1);
  // ...
  CmiUsePersistentHandle(NULL, 0);
#else
  CmiUsePersistentHandle(&trans_handle[isend], 1);
  // ...
  CmiUsePersistentHandle(NULL, 0);
#endif
  CmiEnableUrgentSend(0);
}
// --- y-pencil transpose send ---
#if USE_PERSISTENT
if ( trans_handle == NULL ) setup_persistent();
#endif
#if CMK_SMP && USE_CKLOOP
// ...
#endif
for ( int isend=0; isend<yBlocks; ++isend ) {
  // ...
  if ( (jb+1)*block2 > K2 ) ny = K2 - jb*block2;
  // ...
  float *md = msg->qgrid;
  const float *d = data;
  for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
    for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
      for ( int k=0; k<nz; ++k ) {
        *(md++) = d[2*(j*nz+k)];
        *(md++) = d[2*(j*nz+k)+1];
        // ...
        if ( *(md-2) == 0. ) CkPrintf("send 0 in YX at %d %d %d %d %d %d %d %d %d\n",
                                      thisIndex.x, jb, thisIndex.z, i, j, k, nx, ny, nz);
      }
    }
  }
  if ( md != msg->qgrid + nx*ny*nz*2 )
    CkPrintf("error in YX at %d %d %d\n", thisIndex.x, jb, thisIndex.z);
  // ...
  CmiEnableUrgentSend(1);
#if USE_NODE_PAR_RECEIVE
  msg->destElem = CkArrayIndex3D(0, jb, thisIndex.z);
  // ...
  CmiUsePersistentHandle(&trans_handle[isend], 1);
  // ...
  CmiUsePersistentHandle(NULL, 0);
#else
  CmiUsePersistentHandle(&trans_handle[isend], 1);
  // ...
  CmiUsePersistentHandle(NULL, 0);
#endif
  CmiEnableUrgentSend(0);
}
// --- x-pencil node_process_trans ---
CmiMemoryAtomicFetchAndInc(imsg, limsg);
// ...
CmiMemoryWriteFence();
5657 const float *md = msg->
qgrid;
5658 for (
int i=ib*block1; i<(ib*block1+nx); ++i ) {
5660 for (
int j=0; j<
ny; ++j, d +=
nz*2 ) {
5661 for (
int k=0; k<
nz; ++k ) {
5663 if ( (*md) == 0. ) CkPrintf(
"0 in YX at %d %d %d %d %d %d %d %d %d\n",
5664 ib, thisIndex.y, thisIndex.z, i, j, k, nx,
ny,
nz);
5672 for (
int i=ib*block1; i<(ib*block1+nx); ++i ) {
5674 for (
int j=0; j<
ny; ++j, d +=
nz*2 ) {
5675 for (
int k=0; k<
nz; ++k ) {
// --- x-pencil forward FFT ---
#ifdef MANUAL_DEBUG_FFTW3
// ...
#endif
#if CMK_SMP && USE_CKLOOP
// ...
CkLoop_Parallelize(PmeXZPencilFFT, 1, (void *)forward_plans,
                   CkMyNodeSize(), 0, numPlans-1);
#endif
// (FFTW2 path; leading arguments lost in extraction:)
fftw(/* forward_plan, ... */ ((fftw_complex *) data),
     ny*nz, 1, (fftw_complex *) work, 1, 0);

// --- k-space evaluation, one pass per grid ---
#ifdef MANUAL_DEBUG_FFTW3
// ...
#endif
#if CMK_SMP && USE_CKLOOP
// ...
#endif
for ( int g=0; g<numGrids; ++g ) {
  // ...
}
#if USE_NODE_PAR_RECEIVE
CmiMemoryWriteFence();
#endif

// --- x-pencil backward FFT ---
#ifdef MANUAL_DEBUG_FFTW3
// ...
#endif
#if CMK_SMP && USE_CKLOOP
// ...
CkLoop_Parallelize(PmeXZPencilFFT, 1, (void *)backward_plans,
                   CkMyNodeSize(), 0, numPlans-1);
#endif
// (FFTW2 path; leading arguments lost in extraction:)
fftw(/* backward_plan, ... */ ((fftw_complex *) data),
     ny*nz, 1, (fftw_complex *) work, 1, 0);

// --- x-pencil untranspose send (subset) ---
#ifdef MANUAL_DEBUG_FFTW3
// ...
#endif
for ( int isend=fromIdx; isend<=toIdx; isend++ ) {
  // ...
  CmiEnableUrgentSend(1);
  // ...
#if USE_NODE_PAR_RECEIVE
  // ...
#endif
  CmiEnableUrgentSend(0);
5796 if ( (ib+1)*block1 > K1 ) nx = K1 - ib*block1;
5800 float *md = msg->
qgrid;
5801 for (
int i=ib*block1; i<(ib*block1+nx); ++i ) {
5803 for (
int j=0; j<
ny; ++j, d +=
nz*2 ) {
5804 for (
int k=0; k<
nz; ++k ) {
5811 CmiEnableUrgentSend(1);
5812 #if USE_NODE_PAR_RECEIVE 5813 msg->
destElem=CkArrayIndex3D(ib,0, thisIndex.z);
5815 CmiUsePersistentHandle(&untrans_handle[isend], 1);
5819 CmiUsePersistentHandle(NULL, 0);
5830 CmiEnableUrgentSend(0);
5841 CmiEnableUrgentSend(1);
5843 CmiEnableUrgentSend(0);
5847 if (untrans_handle == NULL) setup_persistent();
5849 #if CMK_SMP && USE_CKLOOP 5855 #if USE_NODE_PAR_RECEIVE