#define fftwf_malloc fftw_malloc
#define fftwf_free fftw_free
#ifdef NAMD_FFTW_NO_TYPE_PREFIX
#include "ComputePmeMgr.decl.h"
#include "ComputeMgr.decl.h"
#define MIN_DEBUG_LEVEL 3
#include "ckhashtable.h"
#include "ComputeMoaMgr.decl.h"
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
#include <cuda_runtime.h>
#include <hip/hip_runtime.h>
#define __thread __declspec(thread)
#define SQRT_PI 1.7724538509055160273
#if CMK_PERSISTENT_COMM
#define USE_PERSISTENT 1
#if (defined(NAMD_HIP) || defined(NAMD_CUDA)) && defined(MEM_OPT_VERSION)
#define USE_NODE_PAR_RECEIVE 1
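// Note: the fragment below appears to belong to the PmePencilMap group,
// which maps a 2D array of pencil indices onto processor ranks through a
// flattened lookup table (data[i*nb + j]).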
    : ia(i_a), ib(i_b), nb(n_b),
      size(n), data(newcopyint(n,d)) {
  virtual int procNum(int, const CkArrayIndex &i) {
    return data[ i.data()[ia] * nb + i.data()[ib] ];
    for ( int i=0; i < size; ++i ) {
      if ( data[i] == mype ) {
        CkArrayIndex3D ai(0,0,0);
        ai.data()[ia] = i / nb;
        ai.data()[ib] = i % nb;
        if ( procNum(0,ai) != mype ) NAMD_bug("PmePencilMap is inconsistent");
        if ( ! msg ) NAMD_bug("PmePencilMap multiple pencils on a pe?");
        mgr->insertInitial(ai,msg);
    mgr->doneInserting();
    if ( msg ) CkFreeMsg(msg);
  const int ia, ib, nb, size;
  const int* const data;
  static int* newcopyint(int n, int *d) {
    int *newd = new int[n];
    memcpy(newd, d, n*sizeof(int));
  CProxy_PmePencilMap xm;
  CProxy_PmePencilMap ym;
  CProxy_PmePencilMap zm;
    int node = CmiMyNode();
    int firstpe = CmiNodeFirst(node);
    int nodeSize = CmiNodeSize(node);
    int myrank = CkMyRank();
    for ( int i=0; i<nodeSize; ++i ) {
      int pe = firstpe + (myrank+i)%nodeSize;
    CmiGetPesOnPhysicalNode(CmiPhysicalNodeID(CkMyPe()), &pelist, &nodeSize);
    for ( int i=0; i<nodeSize; ++i ) {
      if ( pelist[i] == CkMyPe() ) myrank = i;
    for ( int i=0; i<nodeSize; ++i ) {
      int pe = pelist[(myrank+i)%nodeSize];
  int npes = CkNumPes();
  for ( int i=0; i<npes; ++i ) {
    int pe = (mype+i)%npes;
  NAMD_bug("findRecipEvirPe() failed!");
  int ncpus = CkNumPes();
  for ( int i=0; i<numGridPes; ++i ) {
  std::sort(gridPeMap,gridPeMap+numGridPes);
  int firstTransPe = ncpus - numGridPes - numTransPes;
  if ( firstTransPe < 0 ) {
    if ( ncpus > numTransPes ) firstTransPe = 1;
  for ( int i=0; i<numTransPes; ++i ) {
  std::sort(transPeMap,transPeMap+numTransPes);
bool generateBGLORBPmePeList(int *pemap, int numPes, int *block_pes=0,
  if ( d ) while ( ! (d & c) ) {
  return (a & c) - (b & c);
  if ( d ) while ( ! (d & c) ) {
  inline bool operator() (int a, int b) const {
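// The ComputePmeMgr entry methods declared below form the PME pipeline as it
// reads from this file: charge deposition (sendData/sendPencils), forward
// FFTs (gridCalc1/gridCalc2), the reciprocal-space sum, backward FFTs
// (gridCalc3), and force interpolation (ungridCalc), with the sendTrans /
// sendUntrans / sendUngrid families moving slabs between grid and transpose
// processors.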
  void initialize_pencils(CkQdMsg*);
  void activate_pencils(CkQdMsg*);
  void recvArrays(CProxy_PmeXPencil, CProxy_PmeYPencil, CProxy_PmeZPencil);
  void initialize_computes();
  void sendData(Lattice &, int sequence);
  void sendDataPart(int first, int last, Lattice &, int sequence, int sourcepe, int errors);
  void sendPencils(Lattice &, int sequence);
  void sendPencilsPart(int first, int last, Lattice &, int sequence, int sourcepe);
  void gridCalc1(void);
  void sendTransBarrier(void);
  void sendTransSubset(int first, int last);
  void sendTrans(void);
  void gridCalc2(void);
#ifdef OPENATOM_VERSION
  void gridCalc2Moa(void);
#endif // OPENATOM_VERSION
  void gridCalc2R(void);
  void sendUntrans(void);
  void sendUntransSubset(int first, int last);
  void gridCalc3(void);
  void sendUngrid(void);
  void sendUngridSubset(int first, int last);
  void ungridCalc(void);
  void addRecipEvirClient(void);
  void submitReductions();
#if 0 && USE_PERSISTENT
  void setup_recvgrid_persistent();
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
  void chargeGridSubmitted(Lattice &lattice, int sequence);
  void cuda_submit_charges(Lattice &lattice, int sequence);
  void sendChargeGridReady();
  void pollChargeGridReady();
  void pollForcesReady();
  void recvChargeGridReady();
  void chargeGridReady(Lattice &lattice, int sequence);
#if 0 && USE_PERSISTENT
  PersistentHandle *recvGrid_handle;
  CProxy_ComputePmeMgr pmeProxy;
  CProxy_ComputePmeMgr pmeProxyDir;
  CProxy_NodePmeMgr pmeNodeProxy;
    if ( ! pmeComputes.size() ) initialize_computes();
  fftwf_plan *forward_plan_x, *backward_plan_x;
  fftwf_plan *forward_plan_yz, *backward_plan_yz;
  fftw_plan forward_plan_x, backward_plan_x;
  rfftwnd_plan forward_plan_yz, backward_plan_yz;
  int qsize, fsize, bsize;
  int ungridForcesCount;
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
#define NUM_STREAMS 1
  int f_data_mgr_alloc;
  float *f_data_mgr_host;
  float *f_data_mgr_dev;
  float *bspline_coeffs_dev;
  float *bspline_dcoeffs_dev;
  int recipEvirClients;
  int myGridPe, myGridNode;
  int myTransPe, myTransNode;
  int compute_sequence;
  int sendTransBarrier_received;
  int xBlocks, yBlocks, zBlocks;
  CProxy_PmeXPencil xPencil;
  CProxy_PmeYPencil yPencil;
  CProxy_PmeZPencil zPencil;
  int numPencilsActive;
  int strayChargeErrors;
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
  void sendDataHelper(int);
  void sendPencilsHelper(int);
  void registerXPencil(CkArrayIndex3D, PmeXPencil *);
  void registerYPencil(CkArrayIndex3D, PmeYPencil *);
  void registerZPencil(CkArrayIndex3D, PmeZPencil *);
    xm=_xm; ym=_ym; zm=_zm;
  CProxy_PmePencilMap xm;
  CProxy_PmePencilMap ym;
  CProxy_PmePencilMap zm;
  CProxy_ComputePmeMgr mgrProxy;
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
  CProxy_PmeXPencil xPencil;
  CProxy_PmeYPencil yPencil;
  CProxy_PmeZPencil zPencil;
  CkHashtableT<CkArrayIndex3D,PmeXPencil*> xPencilObj;
  CkHashtableT<CkArrayIndex3D,PmeYPencil*> yPencilObj;
  CkHashtableT<CkArrayIndex3D,PmeZPencil*> zPencilObj;
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
  cudaEvent_t end_charge_memset;
  cudaEvent_t end_all_pme_kernels;
  cudaEvent_t end_potential_memcpy;
  delete [] mgrObjects;
  CProxy_ComputePmeMgr proxy = CkpvAccess(BOCclass_group).computePmeMgr;
  mgrObjects[CkMyRank()] = proxy.ckLocalBranch();
  if ( CkMyRank() == 0 ) {
    mgrObject = proxy.ckLocalBranch();
  mgrObject->fwdSharedTrans(msg);
  mgrObject->fwdSharedUntrans(msg);
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
  masterPmeMgr->recvUngrid(msg);
  NAMD_bug("NodePmeMgr::recvUngrid called in non-CUDA build.");
  xPencilObj.put(idx)=obj;
  yPencilObj.put(idx)=obj;
  zPencilObj.put(idx)=obj;
    pmeProxyDir(thisgroup) {
  CkpvAccess(BOCclass_group).computePmeMgr = thisgroup;
  pmeNodeProxy = CkpvAccess(BOCclass_group).nodePmeMgr;
  nodePmeMgr = pmeNodeProxy[CkMyNode()].ckLocalBranch();
    pmeNodeProxy.ckLocalBranch()->initialize();
  if ( CmiMyRank() == 0 ) {
  sendTransBarrier_received = 0;
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
  if ( CmiMyRank() == 0 ) {
#if CUDA_VERSION >= 5050 || defined(NAMD_HIP)
    int leastPriority, greatestPriority;
    cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority);
#define CUDA_STREAM_CREATE(X) cudaStreamCreateWithPriority(X,cudaStreamDefault,greatestPriority)
#define CUDA_STREAM_CREATE(X) cudaStreamCreate(X)
  cudaEventCreateWithFlags(&end_charges,cudaEventDisableTiming);
  f_data_mgr_alloc = 0;
#define CUDA_EVENT_ID_PME_CHARGES 80
#define CUDA_EVENT_ID_PME_FORCES 81
#define CUDA_EVENT_ID_PME_TICK 82
#define CUDA_EVENT_ID_PME_COPY 83
#define CUDA_EVENT_ID_PME_KERNEL 84
  if ( 0 == CkMyPe() ) {
  recipEvirClients = 0;
    CProxy_PmeXPencil x, CProxy_PmeYPencil y, CProxy_PmeZPencil z) {
  xPencil = x;  yPencil = y;  zPencil = z;
    pmeNodeProxy.ckLocalBranch()->xPencil=x;
    pmeNodeProxy.ckLocalBranch()->yPencil=y;
    pmeNodeProxy.ckLocalBranch()->zPencil=z;
  Coord(): x(0), y(0), z(0) {}
  Coord(int a, int b, int c): x(a), y(b), z(c) {}
extern void SFC_grid(int xdim, int ydim, int zdim, int xdim1, int ydim1, int zdim1, vector<Coord> &result);
  for ( int i=0; i<result.size(); i++) {
    Coord &c = result[i];
    for ( int j=0; j<procs.size(); j++) {
      tmgr.rankToCoordinates(pe, x, y, z, t);
      if (x==c.x && y==c.y && z==c.z)
        newprocs[num++] = pe;
  CmiAssert(newprocs.size() == procs.size());
int find_level_grid(int x)
CmiNodeLock tmgr_lock;
  tmgr_lock = CmiCreateLock();
  gridPeMap = new int[CkNumPes()];
  transPeMap = new int[CkNumPes()];
  recipPeDest = new int[CkNumPes()];
  gridPeOrder = new int[CkNumPes()];
  gridNodeOrder = new int[CkNumNodes()];
  transNodeOrder = new int[CkNumNodes()];
  if (CkMyRank() == 0) {
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
      NAMD_die("PME offload requires exactly one CUDA device per process.  Use \"PMEOffload no\".");
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, dev);
    if ( deviceProp.major < 2 )
      NAMD_die("PME offload requires CUDA device of compute capability 2.0 or higher.  Use \"PMEOffload no\".");
  else if ( simParams->PMEPencils > 0 ) usePencils = 1;
    if ( nrps <= 0 ) nrps = CkNumPes();
    if ( nrps > CkNumPes() ) nrps = CkNumPes();
    int maxslabs = 1 + (dimx - 1) / simParams->PMEMinSlices;
    if ( maxslabs > nrps ) maxslabs = nrps;
    if ( maxpencils > nrps ) maxpencils = nrps;
    if ( maxpencils > 3 * maxslabs ) usePencils = 1;
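  // Heuristic, as read from the surrounding code: compare the maximum number
  // of usable slabs against the maximum number of usable pencils and prefer
  // the pencil decomposition when it exposes at least 3x more parallelism.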
    if ( nrps <= 0 ) nrps = CkNumPes();
    if ( nrps > CkNumPes() ) nrps = CkNumPes();
    xBlocks = yBlocks = zBlocks = simParams->PMEPencils;
      if ( nb2 > nrps ) nb2 = nrps;
      if ( nb2 < 1 ) nb2 = 1;
      int nb = (int) sqrt((float)nb2);
      if ( nb < 1 ) nb = 1;
      xBlocks = zBlocks = nb;
    int bx = 1 + ( dimx - 1 ) / xBlocks;
    xBlocks = 1 + ( dimx - 1 ) / bx;
    int by = 1 + ( dimy - 1 ) / yBlocks;
    yBlocks = 1 + ( dimy - 1 ) / by;
    int bz = 1 + ( dimz - 1 ) / zBlocks;
    zBlocks = 1 + ( dimz - 1 ) / bz;
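    // Block sizes use ceiling division, and each block count is then
    // recomputed from the block size so no empty blocks remain: e.g.
    // dimx=10 with xBlocks=8 gives bx=2, and the recomputation reduces
    // xBlocks to 5.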
    if ( xBlocks * yBlocks > CkNumPes() ) {
      NAMD_die("PME pencils xBlocks * yBlocks > numPes");
    if ( xBlocks * zBlocks > CkNumPes() ) {
      NAMD_die("PME pencils xBlocks * zBlocks > numPes");
    if ( yBlocks * zBlocks > CkNumPes() ) {
      NAMD_die("PME pencils yBlocks * zBlocks > numPes");
    iout << iINFO << "PME using " << xBlocks << " x " << yBlocks << " x " << zBlocks <<
      " pencil grid for FFT and reciprocal sum.\n" << endi;
    int nrpx = ( dimx + minslices - 1 ) / minslices;
    int nrpy = ( dimy + minslices - 1 ) / minslices;
    int nrpp = CkNumPes();
    if ( nrpp < nrpx ) nrpx = nrpp;
    if ( nrpp < nrpy ) nrpy = nrpp;
    if ( nrps > CkNumPes() ) nrps = CkNumPes();
    if ( nrps > 0 ) nrpx = nrps;
    if ( nrps > 0 ) nrpy = nrps;
    int bx = ( dimx + nrpx - 1 ) / nrpx;
    nrpx = ( dimx + bx - 1 ) / bx;
    int by = ( dimy + nrpy - 1 ) / nrpy;
    nrpy = ( dimy + by - 1 ) / by;
    if ( bx != ( dimx + nrpx - 1 ) / nrpx )
      NAMD_bug("Error in selecting number of PME processors.");
    if ( by != ( dimy + nrpy - 1 ) / nrpy )
      NAMD_bug("Error in selecting number of PME processors.");
    iout << iINFO << "PME using " << numGridPes << " and " << numTransPes <<
      " processors for FFT and reciprocal sum.\n" << endi;
  int sum_npes = numTransPes + numGridPes;
  int max_npes = (numTransPes > numGridPes)?numTransPes:numGridPes;
#if 0 // USE_TOPOMAP
    if(tmgr.hasMultipleProcsPerNode())
    if(CkNumPes() > 2*sum_npes + patch_pes) {
      done = generateBGLORBPmePeList(transPeMap, numTransPes);
      done &= generateBGLORBPmePeList(gridPeMap, numGridPes, transPeMap, numTransPes);
    if(CkNumPes() > 2 *max_npes + patch_pes) {
      done = generateBGLORBPmePeList(transPeMap, max_npes);
      gridPeMap = transPeMap;
    for ( i=0; i<numGridPes && i<10; ++i ) {
      iout << " " << gridPeMap[i];
    if ( i < numGridPes ) iout << " ...";
    iout << iINFO << "PME TRANS LOCATIONS:";
    for ( i=0; i<numTransPes && i<10; ++i ) {
      iout << " " << transPeMap[i];
    if ( i < numTransPes ) iout << " ...";
  for ( i=0; i<numGridPes; ++i ) {
    if ( gridPeMap[i] == CkMyPe() ) myGridPe = i;
    int real_node_i = CkNodeOf(gridPeMap[i]);
    if ( real_node_i == real_node ) {
      gridNodeInfo[node].npe += 1;
      real_node = real_node_i;
      gridNodeInfo[node].real_node = real_node;
      gridNodeInfo[node].npe = 1;
    if ( CkMyNode() == real_node_i ) myGridNode = node;
  numGridNodes = node + 1;
  for ( i=0; i<numTransPes; ++i ) {
    if ( transPeMap[i] == CkMyPe() ) myTransPe = i;
    int real_node_i = CkNodeOf(transPeMap[i]);
    if ( real_node_i == real_node ) {
      transNodeInfo[node].npe += 1;
      real_node = real_node_i;
      transNodeInfo[node].real_node = real_node;
      transNodeInfo[node].npe = 1;
    if ( CkMyNode() == real_node_i ) myTransNode = node;
  numTransNodes = node + 1;
    iout << iINFO << "PME USING " << numGridNodes << " GRID NODES AND "
         << numTransNodes << " TRANS NODES\n" << endi;
  for ( i = 0; i < numGridPes; ++i ) {
  if ( myGridPe < 0 ) {
    rand.reorder(gridPeOrder,numGridPes);
    gridPeOrder[myGridPe] = numGridPes-1;
    gridPeOrder[numGridPes-1] = myGridPe;
    rand.reorder(gridPeOrder,numGridPes-1);
  for ( i = 0; i < numGridNodes; ++i ) {
    gridNodeOrder[i] = i;
  if ( myGridNode < 0 ) {
    rand.reorder(gridNodeOrder,numGridNodes);
    gridNodeOrder[myGridNode] = numGridNodes-1;
    gridNodeOrder[numGridNodes-1] = myGridNode;
    rand.reorder(gridNodeOrder,numGridNodes-1);
  for ( i = 0; i < numTransNodes; ++i ) {
    transNodeOrder[i] = i;
  if ( myTransNode < 0 ) {
    rand.reorder(transNodeOrder,numTransNodes);
    transNodeOrder[myTransNode] = numTransNodes-1;
    transNodeOrder[numTransNodes-1] = myTransNode;
    rand.reorder(transNodeOrder,numTransNodes-1);
  myGrid.dim3 = 2 * (myGrid.K3/2 + 1);
  if ( ! usePencils ) {
    myGrid.block1 = ( myGrid.K1 + numGridPes - 1 ) / numGridPes;
    myGrid.block2 = ( myGrid.K2 + numTransPes - 1 ) / numTransPes;
    myGrid.block1 = ( myGrid.K1 + xBlocks - 1 ) / xBlocks;
    myGrid.block2 = ( myGrid.K2 + yBlocks - 1 ) / yBlocks;
    myGrid.block3 = ( myGrid.K3/2 + 1 + zBlocks - 1 ) / zBlocks;
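  // dim3 = 2*(K3/2+1) pads the z dimension so an in-place real-to-complex
  // FFT fits: K3 reals transform to K3/2+1 complex values, which occupy
  // 2*(K3/2+1) floats.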
  int ncpus = CkNumPes();
  for ( int icpu=0; icpu<ncpus; ++icpu ) {
    else nopatches.add(ri);
  int *tmp = new int[patches.size()];
  int nn = patches.size();
  for (i=0;i<nn;i++) tmp[i] = patches[i];
  for (i=0;i<nn;i++) patches.add(tmp[i]);
  tmp = new int[nopatches.size()];
  nn = nopatches.size();
  for (i=0;i<nn;i++) tmp[i] = nopatches[i];
  for (i=0;i<nn;i++) nopatches.add(tmp[i]);
  int npens = xBlocks*yBlocks;
  if ( npens % ncpus == 0 ) useZero = 1;
  if ( npens == nopatches.size() + 1 ) useZero = 1;
  npens += xBlocks*zBlocks;
  if ( npens % ncpus == 0 ) useZero = 1;
  if ( npens == nopatches.size() + 1 ) useZero = 1;
  npens += yBlocks*zBlocks;
  if ( npens % ncpus == 0 ) useZero = 1;
  if ( npens == nopatches.size() + 1 ) useZero = 1;
  for ( i=nopatches.size()-1; i>=0; --i ) pmeprocs.add(nopatches[i]);
  for ( i=patches.size()-1; i>=0; --i ) pmeprocs.add(patches[i]);
  int npes = pmeprocs.size();
  for ( i=0; i<xBlocks*yBlocks; ++i, ++pe ) zprocs[i] = pmeprocs[pe%npes];
  if ( i>1 && zprocs[0] == zprocs[i-1] ) zprocs[0] = 0;
#if !USE_RANDOM_TOPO
  for ( i=0; i<xBlocks*zBlocks; ++i, ++pe ) yprocs[i] = pmeprocs[pe%npes];
  if ( i>1 && yprocs[0] == yprocs[i-1] ) yprocs[0] = 0;
#if !USE_RANDOM_TOPO
  for ( i=0; i<yBlocks*zBlocks; ++i, ++pe ) xprocs[i] = pmeprocs[pe%npes];
  if ( i>1 && xprocs[0] == xprocs[i-1] ) xprocs[0] = 0;
#if !USE_RANDOM_TOPO
    int xdim = tmgr.getDimNX();
    int ydim = tmgr.getDimNY();
    int zdim = tmgr.getDimNZ();
    int xdim1 = find_level_grid(xdim);
    int ydim1 = find_level_grid(ydim);
    int zdim1 = find_level_grid(zdim);
      printf("xdim: %d %d %d, %d %d %d\n", xdim, ydim, zdim, xdim1, ydim1, zdim1);
    vector<Coord> result;
    SFC_grid(xdim, ydim, zdim, xdim1, ydim1, zdim1, result);
    sort_sfc(xprocs, tmgr, result);
    sort_sfc(yprocs, tmgr, result);
    sort_sfc(zprocs, tmgr, result);
    CmiUnlock(tmgr_lock);
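    // On torus networks the pencil processor lists are reordered along a
    // space-filling curve over the (x,y,z) machine coordinates, which should
    // keep communicating pencils physically close; tmgr_lock presumably
    // guards the topology-manager calls, which are not thread safe.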
  iout << iINFO << "PME Z PENCIL LOCATIONS:";
  for ( i=0; i<zprocs.size() && i<10; ++i ) {
    tmgr.rankToCoordinates(zprocs[i], x,y, z, t);
    iout << " " << zprocs[i] << "(" << x << " " << y << " " << z << ")";
    iout << " " << zprocs[i];
  if ( i < zprocs.size() ) iout << " ...";
  if (CkMyRank() == 0) {
    for (pe=0, x = 0; x < xBlocks; ++x)
      for (y = 0; y < yBlocks; ++y, ++pe ) {
  iout << iINFO << "PME Y PENCIL LOCATIONS:";
  for ( i=0; i<yprocs.size() && i<10; ++i ) {
    tmgr.rankToCoordinates(yprocs[i], x,y, z, t);
    iout << " " << yprocs[i] << "(" << x << " " << y << " " << z << ")";
    iout << " " << yprocs[i];
  if ( i < yprocs.size() ) iout << " ...";
  if (CkMyRank() == 0) {
    for (pe=0, z = 0; z < zBlocks; ++z )
      for (x = 0; x < xBlocks; ++x, ++pe ) {
  iout << iINFO << "PME X PENCIL LOCATIONS:";
  for ( i=0; i<xprocs.size() && i<10; ++i ) {
    tmgr.rankToCoordinates(xprocs[i], x,y, z, t);
    iout << " " << xprocs[i] << "(" << x << " " << y << " " << z << ")";
    iout << " " << xprocs[i];
  if ( i < xprocs.size() ) iout << " ...";
  if (CkMyRank() == 0) {
    for (pe=0, y = 0; y < yBlocks; ++y )
      for (z = 0; z < zBlocks; ++z, ++pe ) {
  if ( CkMyPe() == 0 ){
#if !USE_RANDOM_TOPO
    CProxy_PmePencilMap zm = CProxy_PmePencilMap::ckNew(0,1,yBlocks,xBlocks*yBlocks,zprocs.begin());
    CProxy_PmePencilMap ym;
      ym = CProxy_PmePencilMap::ckNew(0,2,zBlocks,zBlocks*xBlocks,yprocs.begin());
      ym = CProxy_PmePencilMap::ckNew(2,0,xBlocks,zBlocks*xBlocks,yprocs.begin());
    CProxy_PmePencilMap xm;
      xm = CProxy_PmePencilMap::ckNew(2,1,yBlocks,yBlocks*zBlocks,xprocs.begin());
      xm = CProxy_PmePencilMap::ckNew(1,2,zBlocks,yBlocks*zBlocks,xprocs.begin());
    pmeNodeProxy.recvPencilMapProxies(xm,ym,zm);
    CkArrayOptions zo(xBlocks,yBlocks,1);  zo.setMap(zm);
    CkArrayOptions yo(xBlocks,1,zBlocks);  yo.setMap(ym);
    CkArrayOptions xo(1,yBlocks,zBlocks);  xo.setMap(xm);
    zo.setAnytimeMigration(false);  zo.setStaticInsertion(true);
    yo.setAnytimeMigration(false);  yo.setStaticInsertion(true);
    xo.setAnytimeMigration(false);  xo.setStaticInsertion(true);
    zPencil = CProxy_PmeZPencil::ckNew(zo);
    yPencil = CProxy_PmeYPencil::ckNew(yo);
    xPencil = CProxy_PmeXPencil::ckNew(xo);
    zPencil = CProxy_PmeZPencil::ckNew();
    yPencil = CProxy_PmeYPencil::ckNew();
    xPencil = CProxy_PmeXPencil::ckNew();
    for (pe=0, x = 0; x < xBlocks; ++x)
      for (y = 0; y < yBlocks; ++y, ++pe ) {
        zPencil(x,y,0).insert(zprocs[pe]);
    zPencil.doneInserting();
    for (pe=0, x = 0; x < xBlocks; ++x)
      for (z = 0; z < zBlocks; ++z, ++pe ) {
        yPencil(x,0,z).insert(yprocs[pe]);
    yPencil.doneInserting();
    for (pe=0, y = 0; y < yBlocks; ++y )
      for (z = 0; z < zBlocks; ++z, ++pe ) {
        xPencil(0,y,z).insert(xprocs[pe]);
    xPencil.doneInserting();
  pmeProxy.recvArrays(xPencil,yPencil,zPencil);
  msgdata.grid = myGrid;
  for ( pe = 0; pe < numGridPes; ++pe ) {
    if ( nx > myGrid.K1 ) nx = myGrid.K1;
    localInfo[pe].nx = nx - localInfo[pe].x_start;
  for ( pe = 0; pe < numTransPes; ++pe ) {
    if ( ny > myGrid.K2 ) ny = myGrid.K2;
  int numNodes = CkNumPes();
  int *source_flags = new int[numNodes];
  for ( node=0; node<numNodes; ++node ) {
    source_flags[node] = 0;
    recipPeDest[node] = 0;
    int pnode = patchMap->node(pid);
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
    if ( offload ) pnode = CkNodeFirst(CkNodeOf(pnode));
    int shift1 = (myGrid.K1 + myGrid.order - 1)/2;
    BigReal margina = 0.5 * ( patchdim - cutoff ) / sysdima;
    int min1 = ((int) floor(myGrid.K1 * (minx - margina))) + shift1 - myGrid.order + 1;
    int max1 = ((int) floor(myGrid.K1 * (maxx + margina))) + shift1;
    for ( int i=min1; i<=max1; ++i ) {
      while ( ix >= myGrid.K1 ) ix -= myGrid.K1;
      while ( ix < 0 ) ix += myGrid.K1;
      if ( myGridPe >= 0 && ix >= localInfo[myGridPe].x_start &&
           ix < localInfo[myGridPe].x_start + localInfo[myGridPe].nx ) {
        source_flags[pnode] = 1;
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
      if ( pnode == CkNodeFirst(CkMyNode()) ) {
        recipPeDest[ix / myGrid.block1] = 1;
      if ( pnode == CkMyPe() ) {
        recipPeDest[ix / myGrid.block1] = 1;
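    // For each patch this computes the range of grid planes [min1,max1] its
    // atoms can touch (patch extent plus migration margin, widened by the
    // B-spline footprint of myGrid.order points), wraps the indices
    // periodically, and flags the owning node as a source for this slab.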
  int numSourcesSamePhysicalNode = 0;
  numDestRecipPes = 0;
  for ( node=0; node<numNodes; ++node ) {
    if ( source_flags[node] ) ++numSources;
    if ( recipPeDest[node] ) ++numDestRecipPes;
    if ( source_flags[node] && CmiPeOnSamePhysicalNode(node,CkMyPe()) ) ++numSourcesSamePhysicalNode;
    CkPrintf("pe %5d pme %5d of %5d on same physical node\n",
             CkMyPe(), numSourcesSamePhysicalNode, numSources);
    iout << iINFO << "PME " << CkMyPe() << " sources:";
    for ( node=0; node<numNodes; ++node ) {
      if ( source_flags[node] ) iout << " " << node;
  delete [] source_flags;
  ungrid_count = numDestRecipPes;
  sendTransBarrier_received = 0;
  if ( myGridPe < 0 && myTransPe < 0 ) return;
  if ( myTransPe >= 0 ) {
    pmeProxy[recipEvirPe].addRecipEvirClient();
  if ( myTransPe >= 0 ) {
#ifdef OPENATOM_VERSION
    if ( simParams->openatomOn ) {
      CProxy_ComputeMoaMgr moaProxy(CkpvAccess(BOCclass_group).computeMoaMgr);
      myKSpace = new PmeKSpace(myGrid, k2_start, k2_end, 0, myGrid.dim3/2, moaProxy);
      myKSpace = new PmeKSpace(myGrid, k2_start, k2_end, 0, myGrid.dim3/2);
#else // OPENATOM_VERSION
    myKSpace = new PmeKSpace(myGrid, k2_start, k2_end, 0, myGrid.dim3/2);
#endif // OPENATOM_VERSION
  int local_size = myGrid.block1 * myGrid.K2 * myGrid.dim3;
  int local_size_2 = myGrid.block2 * myGrid.K1 * myGrid.dim3;
  if ( local_size < local_size_2 ) local_size = local_size_2;
  qgrid = new float[local_size*numGrids];
  if ( numGridPes > 1 || numTransPes > 1 ) {
    kgrid = new float[local_size*numGrids];
  qgrid_size = local_size;
  if ( myGridPe >= 0 ) {
    qgrid_start = localInfo[myGridPe].x_start * myGrid.K2 * myGrid.dim3;
    qgrid_len = localInfo[myGridPe].nx * myGrid.K2 * myGrid.dim3;
    fgrid_start = localInfo[myGridPe].x_start * myGrid.K2;
    fgrid_len = localInfo[myGridPe].nx * myGrid.K2;
  int n[3]; n[0] = myGrid.K1; n[1] = myGrid.K2; n[2] = myGrid.K3;
  work = new fftwf_complex[n[0]];
  int fftwFlags = simParams->FFTWPatient ? FFTW_PATIENT : simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE ;
  if ( myGridPe >= 0 ) {
    forward_plan_yz = new fftwf_plan[numGrids];
    backward_plan_yz = new fftwf_plan[numGrids];
  if ( myTransPe >= 0 ) {
    forward_plan_x = new fftwf_plan[numGrids];
    backward_plan_x = new fftwf_plan[numGrids];
  if ( ! CkMyPe() ) iout << iINFO << "Optimizing 4 FFT steps.  1..." << endi;
  if ( myGridPe >= 0 ) {
      forward_plan_yz[g] = fftwf_plan_many_dft_r2c(2, n+1,
          localInfo[myGridPe].nx,
          qgrid + qgrid_size * g,
          (qgrid + qgrid_size * g),
  int zdim = myGrid.dim3;
  if ( ! CkMyPe() ) iout << " 2..." << endi;
  if ( myTransPe >= 0 ) {
      forward_plan_x[g] = fftwf_plan_many_dft(1, n, xStride,
          (kgrid+qgrid_size*g),
          (kgrid+qgrid_size*g),
          FFTW_FORWARD,fftwFlags);
  if ( ! CkMyPe() ) iout << " 3..." << endi;
  if ( myTransPe >= 0 ) {
      backward_plan_x[g] = fftwf_plan_many_dft(1, n, xStride,
          (kgrid+qgrid_size*g),
          (kgrid+qgrid_size*g),
          FFTW_BACKWARD, fftwFlags);
  if ( ! CkMyPe() ) iout << " 4..." << endi;
  if ( myGridPe >= 0 ) {
      backward_plan_yz[g] = fftwf_plan_many_dft_c2r(2, n+1,
          localInfo[myGridPe].nx,
          (qgrid + qgrid_size * g),
          qgrid + qgrid_size * g,
  if ( ! CkMyPe() ) iout << " Done.\n" << endi;
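  // The four plan sets above correspond to the four FFT steps of the slab
  // decomposition: forward 2D yz transforms on grid PEs, forward and
  // backward strided 1D x transforms on trans PEs (operating on the
  // transposed kgrid), and backward 2D yz transforms on grid PEs.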
  work = new fftw_complex[n[0]];
  if ( ! CkMyPe() ) iout << iINFO << "Optimizing 4 FFT steps.  1..." << endi;
  if ( myGridPe >= 0 ) {
    forward_plan_yz = rfftwnd_create_plan_specific(2, n+1, FFTW_REAL_TO_COMPLEX,
        ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
        | FFTW_IN_PLACE | FFTW_USE_WISDOM, qgrid, 1, 0, 0);
  if ( ! CkMyPe() ) iout << " 2..." << endi;
  if ( myTransPe >= 0 ) {
    forward_plan_x = fftw_create_plan_specific(n[0], FFTW_FORWARD,
        ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
        | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) kgrid,
  if ( ! CkMyPe() ) iout << " 3..." << endi;
  if ( myTransPe >= 0 ) {
    backward_plan_x = fftw_create_plan_specific(n[0], FFTW_BACKWARD,
        ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
        | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) kgrid,
  if ( ! CkMyPe() ) iout << " 4..." << endi;
  if ( myGridPe >= 0 ) {
    backward_plan_yz = rfftwnd_create_plan_specific(2, n+1, FFTW_COMPLEX_TO_REAL,
        ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
        | FFTW_IN_PLACE | FFTW_USE_WISDOM, qgrid, 1, 0, 0);
  if ( ! CkMyPe() ) iout << " Done.\n" << endi;
  NAMD_die("Sorry, FFTW must be compiled in to use PME.");
  if ( myGridPe >= 0 && numSources == 0 )
    NAMD_bug("PME grid elements exist without sources.");
  grid_count = numSources;
  memset( (void*) qgrid, 0, qgrid_size * numGrids * sizeof(float) );
  trans_count = numGridPes;
  if ( ! usePencils ) return;
  pencilActive = new char[xBlocks*yBlocks];
  for ( int i=0; i<xBlocks; ++i ) {
    for ( int j=0; j<yBlocks; ++j ) {
      pencilActive[i*yBlocks+j] = 0;
    int pnode = patchMap->node(pid);
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
      if ( CkNodeOf(pnode) != CkMyNode() ) continue;
    if ( pnode != CkMyPe() ) continue;
    int shift1 = (myGrid.K1 + myGrid.order - 1)/2;
    int shift2 = (myGrid.K2 + myGrid.order - 1)/2;
    BigReal margina = 0.5 * ( patchdim - cutoff ) / sysdima;
    int min1 = ((int) floor(myGrid.K1 * (minx - margina))) + shift1 - myGrid.order + 1;
    int max1 = ((int) floor(myGrid.K1 * (maxx + margina))) + shift1;
    BigReal marginb = 0.5 * ( patchdim - cutoff ) / sysdimb;
    int min2 = ((int) floor(myGrid.K2 * (miny - marginb))) + shift2 - myGrid.order + 1;
    int max2 = ((int) floor(myGrid.K2 * (maxy + marginb))) + shift2;
    for ( int i=min1; i<=max1; ++i ) {
      while ( ix >= myGrid.K1 ) ix -= myGrid.K1;
      while ( ix < 0 ) ix += myGrid.K1;
      for ( int j=min2; j<=max2; ++j ) {
        while ( jy >= myGrid.K2 ) jy -= myGrid.K2;
        while ( jy < 0 ) jy += myGrid.K2;
        pencilActive[(ix / myGrid.block1)*yBlocks + (jy / myGrid.block2)] = 1;
  numPencilsActive = 0;
  for ( int i=0; i<xBlocks; ++i ) {
    for ( int j=0; j<yBlocks; ++j ) {
      if ( pencilActive[i*yBlocks+j] ) {
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
        zPencil(i,j,0).dummyRecvGrid(CkMyPe(),0);
  activePencils = new ijpair[numPencilsActive];
  numPencilsActive = 0;
  for ( int i=0; i<xBlocks; ++i ) {
    for ( int j=0; j<yBlocks; ++j ) {
      if ( pencilActive[i*yBlocks+j] ) {
        activePencils[numPencilsActive++] = ijpair(i,j);
  rand.reorder(activePencils,numPencilsActive);
  ungrid_count = numPencilsActive;
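  // A z pencil (i,j) is active for this PE if any local patch can deposit
  // charge into its block of the grid; the active pencils are then listed
  // and randomly reordered, presumably to spread message traffic.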
  if ( ! usePencils ) return;
  if ( CkMyPe() == 0 ) zPencil.dummyRecvGrid(CkMyPe(),1);
  if ( CmiMyRank() == 0 ) {
  delete [] localInfo;
  delete [] gridNodeInfo;
  delete [] transNodeInfo;
  delete [] gridPeMap;
  delete [] transPeMap;
  delete [] recipPeDest;
  delete [] gridPeOrder;
  delete [] gridNodeOrder;
  delete [] transNodeOrder;
  if ( kgrid != qgrid ) delete [] kgrid;
  delete [] gridmsg_reuse;
  for ( int i=0; i<q_count; ++i) {
    delete [] q_list[i];
  if ( grid_count == 0 ) {
    NAMD_bug("Message order failure in ComputePmeMgr::recvGrid\n");
  if ( grid_count == numSources ) {
  int zdim = myGrid.dim3;
  int *zlist = msg->zlist;
  float *qmsg = msg->qgrid;
    char *f = msg->fgrid + fgrid_len * g;
    float *q = qgrid + qgrid_size * g;
    for ( int i=0; i<fgrid_len; ++i ) {
        for ( int k=0; k<zlistlen; ++k ) {
          q[zlist[k]] += *(qmsg++);
  gridmsg_reuse[numSources-grid_count] = msg;
  if ( grid_count == 0 ) {
    pmeProxyDir[CkMyPe()].gridCalc1();
    if ( useBarrier ) pmeProxyDir[0].sendTransBarrier();
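  // Incoming grid messages are sparse: qgrid carries values only for the z
  // indices named in zlist, so accumulation walks the compressed stream and
  // scatters with q[zlist[k]] += *(qmsg++).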
#ifdef MANUAL_DEBUG_FFTW3
void dumpMatrixFloat(const char *infilename, float *matrix, int xdim, int ydim, int zdim, int pe)
  char filename[1000];
  strncpy(fmt,infilename,999);
  strncat(fmt,"_%d.out",999);
  sprintf(filename,fmt, pe);
  FILE *loutfile = fopen(filename, "w");
#ifdef PAIRCALC_TEST_DUMP
  fprintf(loutfile,"%d\n",ydim);
  fprintf(loutfile,"%d %d %d\n",xdim,ydim, zdim);
  for(int i=0;i<xdim;i++)
    for(int j=0;j<ydim;j++)
      for(int k=0;k<zdim;k++)
        fprintf(loutfile,"%d %d %d %.8f\n",i,j,k,matrix[i*zdim*ydim+j*zdim +k]);
void dumpMatrixFloat3(const char *infilename, float *matrix, int xdim, int ydim, int zdim, int x, int y, int z)
  char filename[1000];
  strncpy(fmt,infilename,999);
  strncat(fmt,"_%d_%d_%d.out",999);
  sprintf(filename,fmt, x,y,z);
  FILE *loutfile = fopen(filename, "w");
  CkAssert(loutfile!=NULL);
  CkPrintf("opened %s for dump\n",filename);
  fprintf(loutfile,"%d %d %d\n",xdim,ydim, zdim);
  for(int i=0;i<xdim;i++)
    for(int j=0;j<ydim;j++)
      for(int k=0;k<zdim;k++)
        fprintf(loutfile,"%d %d %d %.8f\n",i,j,k,matrix[i*zdim*ydim+j*zdim +k]);
    fftwf_execute(forward_plan_yz[g]);
    rfftwnd_real_to_complex(forward_plan_yz, localInfo[myGridPe].nx,
        qgrid + qgrid_size * g, 1, myGrid.dim2 * myGrid.dim3, 0, 0, 0);
  if ( ! useBarrier ) pmeProxyDir[CkMyPe()].sendTrans();
  sendTransBarrier_received += 1;
  if ( sendTransBarrier_received < numGridPes ) return;
  sendTransBarrier_received = 0;
  for ( int i=0; i<numGridPes; ++i ) {
    pmeProxyDir[gridPeMap[i]].sendTrans();
static inline void PmeSlabSendTrans(int first, int last, void *result, int paraNum, void *param) {
  untrans_count = numTransPes;
#if CMK_SMP && USE_CKLOOP
    CkLoop_Parallelize(PmeSlabSendTrans, 1, (void *)this, CkMyNodeSize(), 0, numTransNodes-1, 0);
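// With CkLoop enabled, the per-node sendTrans work is split across the ranks
// of the SMP node by parallelizing over transpose-node indices
// 0..numTransNodes-1.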
  int zdim = myGrid.dim3;
  int nx = localInfo[myGridPe].nx;
  int x_start = localInfo[myGridPe].x_start;
  int slicelen = myGrid.K2 * zdim;
  ComputePmeMgr **mgrObjects = pmeNodeProxy.ckLocalBranch()->mgrObjects;
    CmiNetworkProgressAfter (0);
  for ( int j=first; j<=last; j++) {
    int node = transNodeOrder[j];
    int pe = transNodeInfo[node].pe_start;
    int npe = transNodeInfo[node].npe;
    if ( node != myTransNode ) for ( int i=0; i<npe; ++i, ++pe) {
      float *qmsg = newmsg->qgrid + nx * totlen * g;
      for ( int i=0; i<npe; ++i, ++pe) {
        if ( node == myTransNode ) {
          qmsg = m->kgrid + m->qgrid_size * g + x_start*cpylen;
          for ( int x = 0; x < nx; ++x ) {
            CmiMemcpy((void*)qmsg, (void*)q, cpylen*sizeof(float));
    if ( node == myTransNode ) newmsg->nx = 0;
      else pmeNodeProxy[transNodeInfo[node].real_node].recvTrans(newmsg);
    } else pmeProxy[transPeMap[transNodeInfo[node].pe_start]].recvTrans(newmsg);
  int pe = transNodeInfo[myTransNode].pe_start;
  int npe = transNodeInfo[myTransNode].npe;
  CmiNodeLock lock = CmiCreateLock();
  int *count = new int; *count = npe;
  for ( int i=0; i<npe; ++i, ++pe) {
    shmsg->count = count;
    pmeProxy[transPeMap[pe]].recvSharedTrans(shmsg);
  int count = --(*msg->count);
  CmiUnlock(msg->lock);
    CmiDestroyLock(msg->lock);
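  // Shared-transpose messages are reference counted: each node-local peer
  // decrements *msg->count under msg->lock, and the last one to finish
  // destroys the lock (and presumably frees the shared message).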
  if ( trans_count == numGridPes ) {
  int zdim = myGrid.dim3;
  NodePmeInfo &nodeInfo(transNodeInfo[myTransNode]);
  int last_pe = first_pe+nodeInfo.npe-1;
    CmiMemcpy((void*)(kgrid + qgrid_size * g + x_start*ny*zdim),
        (void*)(msg->qgrid + nx*(ny_msg*g+y_skip)*zdim),
        nx*ny*zdim*sizeof(float));
  if ( trans_count == 0 ) {
    pmeProxyDir[CkMyPe()].gridCalc2();
    CmiNetworkProgressAfter (0);
  int zdim = myGrid.dim3;
    fftwf_execute(forward_plan_x[g]);
    fftw(forward_plan_x, ny * zdim / 2, (fftw_complex *)(kgrid+qgrid_size*g),
        ny * zdim / 2, 1, work, 1, 0);
#ifdef OPENATOM_VERSION
#endif // OPENATOM_VERSION
#ifdef OPENATOM_VERSION
#endif // OPENATOM_VERSION
#ifdef OPENATOM_VERSION
void ComputePmeMgr::gridCalc2Moa(void) {
  int zdim = myGrid.dim3;
  CProxy_ComputeMoaMgr moaProxy(CkpvAccess(BOCclass_group).computeMoaMgr);
#ifdef OPENATOM_VERSION_DEBUG
  CkPrintf("Sending recQ on processor %d \n", CkMyPe());
  for ( int i=0; i<=(ny * zdim / 2); ++i)
    CkPrintf("PE, g,fftw_q,k*q*g, kgrid, qgrid_size value %d pre-send = %d, %d, %f %f, %d, \n", i, CkMyPe(), g, (kgrid+qgrid_size*g)[i], kgrid[i], qgrid_size);
#endif // OPENATOM_VERSION_DEBUG
  CkCallback resumePme(CkIndex_ComputePmeMgr::gridCalc2R(), thishandle);
  moaProxy[CkMyPe()].recvQ(g,numGrids,(ny * zdim / 2),(kgrid+qgrid_size*g), resumePme);
#endif // OPENATOM_VERSION
#if CMK_SMP && USE_CKLOOP
      && CkNumPes() >= 2 * numTransPes ) {
  int zdim = myGrid.dim3;
        lattice, ewaldcof, &(recip_evir2[g][1]), useCkLoop);
    fftwf_execute(backward_plan_x[g]);
    fftw(backward_plan_x, ny * zdim / 2, (fftw_complex *)(kgrid+qgrid_size*g),
        ny * zdim / 2, 1, work, 1, 0);
  pmeProxyDir[CkMyPe()].sendUntrans();
  trans_count = numGridPes;
    newmsg->evir[g] = recip_evir2[g];
    CmiEnableUrgentSend(1);
    pmeProxy[recipEvirPe].recvRecipEvir(newmsg);
    CmiEnableUrgentSend(0);
#if CMK_SMP && USE_CKLOOP
    CkLoop_Parallelize(PmeSlabSendUntrans, 1, (void *)this, CkMyNodeSize(), 0, numGridNodes-1, 0);
  int zdim = myGrid.dim3;
  int slicelen = myGrid.K2 * zdim;
  ComputePmeMgr **mgrObjects = pmeNodeProxy.ckLocalBranch()->mgrObjects;
    CmiNetworkProgressAfter (0);
  for ( int j=first; j<=last; j++) {
    int node = gridNodeOrder[j];
    int pe = gridNodeInfo[node].pe_start;
    int npe = gridNodeInfo[node].npe;
    if ( node != myGridNode ) for ( int i=0; i<npe; ++i, ++pe) {
      int cpylen = li.nx * zdim;
    float *qmsg = newmsg->qgrid + ny * totlen * g;
    for ( int i=0; i<npe; ++i, ++pe) {
      if ( node == myGridNode ) {
        qmsg = m->qgrid + m->qgrid_size * g + y_start * zdim;
        float *q = kgrid + qgrid_size*g + li.x_start*ny*zdim;
        int cpylen = ny * zdim;
        for ( int x = 0; x < li.nx; ++x ) {
          CmiMemcpy((void*)qmsg, (void*)q, cpylen*sizeof(float));
        CmiMemcpy((void*)qmsg,
            (void*)(kgrid + qgrid_size*g + li.x_start*ny*zdim),
            li.nx*ny*zdim*sizeof(float));
        qmsg += li.nx*ny*zdim;
    if ( node == myGridNode ) newmsg->ny = 0;
      else pmeNodeProxy[gridNodeInfo[node].real_node].recvUntrans(newmsg);
    } else pmeProxy[gridPeMap[gridNodeInfo[node].pe_start]].recvUntrans(newmsg);
  int pe = gridNodeInfo[myGridNode].pe_start;
  int npe = gridNodeInfo[myGridNode].npe;
  CmiNodeLock lock = CmiCreateLock();
  int *count = new int; *count = npe;
  for ( int i=0; i<npe; ++i, ++pe) {
    shmsg->count = count;
    pmeProxy[gridPeMap[pe]].recvSharedUntrans(shmsg);
  int count = --(*msg->count);
  CmiUnlock(msg->lock);
    CmiDestroyLock(msg->lock);
    CmiNetworkProgressAfter (0);
  int zdim = myGrid.dim3;
  int last_pe = first_pe+nodeInfo.npe-1;
  int x_skip = localInfo[myGridPe].x_start
             - localInfo[first_pe].x_start;
  int nx_msg = localInfo[last_pe].x_start
             + localInfo[last_pe].nx
             - localInfo[first_pe].x_start;
  int nx = localInfo[myGridPe].nx;
  int slicelen = myGrid.K2 * zdim;
  int cpylen = ny * zdim;
    float *q = qgrid + qgrid_size * g + y_start * zdim;
    float *qmsg = msg->qgrid + (nx_msg*g+x_skip) * cpylen;
    for ( int x = 0; x < nx; ++x ) {
      CmiMemcpy((void*)q, (void*)qmsg, cpylen*sizeof(float));
  if ( untrans_count == 0 ) {
    pmeProxyDir[CkMyPe()].gridCalc3();
    fftwf_execute(backward_plan_yz[g]);
    rfftwnd_complex_to_real(backward_plan_yz, localInfo[myGridPe].nx,
        (fftw_complex *) (qgrid + qgrid_size * g),
        1, myGrid.dim2 * myGrid.dim3 / 2, 0, 0, 0);
  pmeProxyDir[CkMyPe()].sendUngrid();
#if CMK_SMP && USE_CKLOOP
    CkLoop_Parallelize(PmeSlabSendUngrid, 1, (void *)this, CkMyNodeSize(), 0, numSources-1, 1);
  grid_count = numSources;
  memset( (void*) qgrid, 0, qgrid_size * numGrids * sizeof(float) );
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
  for ( int j=first; j<=last; ++j ) {
    int zdim = myGrid.dim3;
    int flen = newmsg->len;
    int fstart = newmsg->start;
    int *zlist = newmsg->zlist;
    float *qmsg = newmsg->qgrid;
      char *f = newmsg->fgrid + fgrid_len * g;
      float *q = qgrid + qgrid_size * g + (fstart-fgrid_start) * zdim;
      for ( int i=0; i<flen; ++i ) {
          for ( int k=0; k<zlistlen; ++k ) {
            *(qmsg++) = q[zlist[k]];
    CmiEnableUrgentSend(1);
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
      pmeNodeProxy[CkNodeOf(pe)].recvUngrid(newmsg);
    pmeProxyDir[pe].recvUngrid(newmsg);
    CmiEnableUrgentSend(0);
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
  if ( ungrid_count == 0 ) {
    NAMD_bug("Message order failure in ComputePmeMgr::recvUngrid\n");
  if ( msg ) delete msg;
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
  if ( ungrid_count == 0 ) {
    NAMD_bug("Message order failure in ComputePmeMgr::recvUngrid\n");
  int uc = --ungrid_count;
  if ( ungrid_count == 0 ) {
    pmeProxyDir[CkMyPe()].ungridCalc();
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
#define count_limit 1000000
#define CUDA_POLL(FN,ARG) CcdCallFnAfter(FN,ARG,0.1)
#define EVENT_STRIDE 10
void cudaDie(const char *msg, cudaError_t err=cudaSuccess);
  if ( err == cudaSuccess ) {
  } else if ( err != cudaErrorNotReady ) {
    sprintf(errmsg,"in cuda_check_pme_forces for event %d after polling %d times over %f s on seq %d",
    sprintf(errmsg,"cuda_check_pme_forces for event %d polled %d times over %f s on seq %d",
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
    if ( this == masterPmeMgr ) {
      double before = CmiWallTimer();
      cudaMemcpyAsync(v_data_dev, q_data_host, q_data_size, cudaMemcpyHostToDevice, 0 );
      cudaEventRecord(nodePmeMgr->end_potential_memcpy, 0 );
      cudaEventSynchronize(nodePmeMgr->end_potential_memcpy);
      cuda_errcheck("in ComputePmeMgr::ungridCalc after potential memcpy");
    const int myrank = CkMyRank();
    for ( int i=0; i<CkMyNodeSize(); ++i ) {
      for ( int i=0; i<n; ++i ) {
        cudaEventCreateWithFlags(&end_forces[i],cudaEventDisableTiming);
      cudaMallocHost((void**) &afn_host, 3*pcsz*sizeof(float*));
      cudaMalloc((void**) &afn_dev, 3*pcsz*sizeof(float*));
    for ( int i=0; i<pcsz; ++i ) {
    if ( totn > f_data_mgr_alloc ) {
      if ( f_data_mgr_alloc ) {
        CkPrintf("Expanding CUDA forces allocation because %d > %d\n", totn, f_data_mgr_alloc);
        cudaFree(f_data_mgr_dev);
        cudaFreeHost(f_data_mgr_host);
      f_data_mgr_alloc = 1.2 * (totn + 100);
      cudaMalloc((void**) &f_data_mgr_dev, 3*f_data_mgr_alloc*sizeof(float));
      cudaMallocHost((void**) &f_data_mgr_host, 3*f_data_mgr_alloc*sizeof(float));
    float *f_dev = f_data_mgr_dev;
    float *f_host = f_data_mgr_host;
    for ( int i=0; i<pcsz; ++i ) {
      afn_host[3*i+1] = f_dev;
      afn_host[3*i+2] = f_dev + n;
    double before = CmiWallTimer();
    cudaMemcpyAsync(afn_dev, afn_host, 3*pcsz*sizeof(float*), cudaMemcpyHostToDevice, streams[stream]);
    cuda_errcheck("in ComputePmeMgr::ungridCalc after force pointer memcpy");
    cudaStreamWaitEvent(streams[stream], nodePmeMgr->end_potential_memcpy, 0);
    cuda_errcheck("in ComputePmeMgr::ungridCalc after wait for potential memcpy");
    for ( int i=0; i<pcsz; ++i ) {
      int dimy = pcsz - i;
      for ( int j=0; j<dimy; ++j ) {
        if ( n > maxn ) maxn = n;
      before = CmiWallTimer();
          v_arr_dev, afn_dev+3*i, dimy, maxn,
      cuda_errcheck("in ComputePmeMgr::ungridCalc after force kernel submit");
      before = CmiWallTimer();
          cudaMemcpyDeviceToHost, streams[stream]);
      cuda_errcheck("in ComputePmeMgr::ungridCalc after force memcpy submit");
      cuda_errcheck("in ComputePmeMgr::ungridCalc after end_forces event");
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
    pmeProxy[this_pe].pollForcesReady();
  ungrid_count = (usePencils ? numPencilsActive : numDestRecipPes );
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
  NAMD_bug("ComputePmeMgr::pollForcesReady() called in non-CUDA build.");
  DebugM(4,"ComputePme created.\n");
  CProxy_ComputePmeMgr::ckLocalBranch(
	CkpvAccess(BOCclass_group).computePmeMgr)->addCompute(this);
  myGrid.dim3 = 2 * (myGrid.K3/2 + 1);
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
  cuda_atoms_offset = 0;
  NAMD_bug("ComputePme used with unknown patch.");
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
  ungridForcesCount = 0;
  strayChargeErrors = 0;
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
    int pe = master_pe = CkNodeFirst(CkMyNode());
    for ( int i=0; i<CkMyNodeSize(); ++i, ++pe ) {
      if ( master_pe < 1 && pe != deviceCUDA->getMasterPe() ) master_pe = pe;
      NAMD_bug("ComputePmeMgr::initialize_computes() master_pe has no patches.");
    masterPmeMgr = nodePmeMgr->mgrObjects[master_pe - CkNodeFirst(CkMyNode())];
    nodePmeMgr->masterPmeMgr = masterPmeMgr;
  qsize = myGrid.K1 * myGrid.dim2 * myGrid.dim3;
  fsize = myGrid.K1 * myGrid.dim2;
  if ( myGrid.K2 != myGrid.dim2 ) NAMD_bug("PME myGrid.K2 != myGrid.dim2");
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
    q_arr = new float*[fsize*numGrids];
    memset( (void*) q_arr, 0, fsize*numGrids * sizeof(float*) );
    q_list = new float*[fsize*numGrids];
    memset( (void*) q_list, 0, fsize*numGrids * sizeof(float*) );
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
  if ( cudaFirst || ! offload ) {
    for ( int n=fsize*numGrids, i=0; i<n; ++i ) f_arr[i] = 2;
    char *f = f_arr + g*fsize;
      int block1 = ( K1 + xBlocks - 1 ) / xBlocks;
      int block2 = ( K2 + yBlocks - 1 ) / yBlocks;
      int dim2 = myGrid.dim2;
      for ( int ap=0; ap<numPencilsActive; ++ap) {
        int ib = activePencils[ap].i;
        int jb = activePencils[ap].j;
        int ibegin = ib*block1;
        int iend = ibegin + block1;  if ( iend > K1 ) iend = K1;
        int jbegin = jb*block2;
        int jend = jbegin + block2;  if ( jend > K2 ) jend = K2;
        int flen = numGrids * (iend - ibegin) * (jend - jbegin);
        for ( int i=ibegin; i<iend; ++i ) {
          for ( int j=jbegin; j<jend; ++j ) {
      int block1 = ( myGrid.K1 + numGridPes - 1 ) / numGridPes;
      bsize = block1 * myGrid.dim2 * myGrid.dim3;
      for ( int pe=0; pe<numGridPes; pe++) {
        if ( ! recipPeDest[pe] ) continue;
        int start = pe * bsize;
        if ( start >= qsize ) { start = 0; len = 0; }
        if ( start + len > qsize ) { len = qsize - start; }
        int zdim = myGrid.dim3;
        int fstart = start / zdim;
        int flen = len / zdim;
        memset(f + fstart, 0, flen*sizeof(char));
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
    int f_alloc_count = 0;
    for ( int n=fsize, i=0; i<n; ++i ) {
      if ( f_arr[i] == 0 ) {
    q_arr = new float*[fsize*numGrids];
    memset( (void*) q_arr, 0, fsize*numGrids * sizeof(float*) );
    float **q_arr_dev_host = new float*[fsize];
    cudaMalloc((void**) &q_arr_dev, fsize * sizeof(float*));
    float **v_arr_dev_host = new float*[fsize];
    cudaMalloc((void**) &v_arr_dev, fsize * sizeof(float*));
    int q_stride = myGrid.K3+myGrid.order-1;
    q_data_size = f_alloc_count * q_stride * sizeof(float);
    ffz_size = (fsize + q_stride) * sizeof(int);
    cudaMallocHost((void**) &q_data_host, q_data_size+ffz_size);
    ffz_host = (int*)(((char*)q_data_host) + q_data_size);
    cudaMalloc((void**) &q_data_dev, q_data_size+ffz_size);
    ffz_dev = (int*)(((char*)q_data_dev) + q_data_size);
    cudaMalloc((void**) &v_data_dev, q_data_size);
    cudaMemset(q_data_dev, 0, q_data_size + ffz_size);
    cudaEventCreateWithFlags(&(nodePmeMgr->end_charge_memset),cudaEventDisableTiming);
    cudaEventRecord(nodePmeMgr->end_charge_memset, 0);
    cudaEventCreateWithFlags(&(nodePmeMgr->end_all_pme_kernels),cudaEventDisableTiming);
    cudaEventCreateWithFlags(&(nodePmeMgr->end_potential_memcpy),cudaEventDisableTiming);
    for ( int n=fsize, i=0; i<n; ++i ) {
      if ( f_arr[i] == 0 ) {
        q_arr[i] = q_data_host + f_alloc_count * q_stride;
        q_arr_dev_host[i] = q_data_dev + f_alloc_count * q_stride;
        v_arr_dev_host[i] = v_data_dev + f_alloc_count * q_stride;
        q_arr_dev_host[i] = 0;
        v_arr_dev_host[i] = 0;
    cudaMemcpy(q_arr_dev, q_arr_dev_host, fsize * sizeof(float*), cudaMemcpyHostToDevice);
    cudaMemcpy(v_arr_dev, v_arr_dev_host, fsize * sizeof(float*), cudaMemcpyHostToDevice);
    delete [] q_arr_dev_host;
    delete [] v_arr_dev_host;
    f_arr = new char[fsize + q_stride];
    fz_arr = f_arr + fsize;
    memset(f_arr, 0, fsize + q_stride);
    memset(ffz_host, 0, (fsize + q_stride)*sizeof(int));
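    // Host-side charge storage is a single pinned (cudaMallocHost)
    // allocation: q_data_size bytes of charge columns followed by ffz_size
    // bytes of flag ints, so one async memcpy can move grid data and flags
    // together.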
#define XCOPY(X) masterPmeMgr->X = X;
    XCOPY(bspline_coeffs_dev)
    XCOPY(bspline_dcoeffs_dev)
#define XCOPY(X) X = masterPmeMgr->X;
    XCOPY(bspline_coeffs_dev)
    XCOPY(bspline_dcoeffs_dev)
    fz_arr = new char[myGrid.K3+myGrid.order-1];
#if 0 && USE_PERSISTENT
  recvGrid_handle = NULL;
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
  for ( int g=0; g<numGridsMax; ++g ) delete myRealSpace[g];
#if 0 && USE_PERSISTENT
void ComputePmeMgr::setup_recvgrid_persistent()
  int dim2 = myGrid.dim2;
  int dim3 = myGrid.dim3;
  int block1 = myGrid.block1;
  int block2 = myGrid.block2;
  CkArray *zPencil_local = zPencil.ckLocalBranch();
  recvGrid_handle = (PersistentHandle*) malloc( sizeof(PersistentHandle) * numPencilsActive);
  for ( int ap=0; ap<numPencilsActive; ++ap) {
    int ib = activePencils[ap].i;
    int jb = activePencils[ap].j;
    int ibegin = ib*block1;
    int iend = ibegin + block1;  if ( iend > K1 ) iend = K1;
    int jbegin = jb*block2;
    int jend = jbegin + block2;  if ( jend > K2 ) jend = K2;
    int flen = numGrids * (iend - ibegin) * (jend - jbegin);
      char *f = f_arr + g*fsize;
      for ( int i=ibegin; i<iend; ++i ) {
        for ( int j=jbegin; j<jend; ++j ) {
          fcount += f[i*dim2+j];
    for ( int i=0; i<myGrid.K3; ++i ) {
      if ( fz_arr[i] ) ++zlistlen;
    int hd = ( fcount? 1 : 0 );
    int peer = zPencil_local->homePe(CkArrayIndex3D(ib, jb, 0));
    int compress_start = sizeof(PmeGridMsg) + sizeof(envelope) + sizeof(int)*hd*zlistlen + sizeof(char)*hd*flen + sizeof(PmeReduction)*hd*numGrids ;
    int compress_size = sizeof(float)*hd*fcount*zlistlen;
    int size = compress_start + compress_size + PRIORITY_SIZE/8+6;
    recvGrid_handle[ap] = CmiCreateCompressPersistentSize(peer, size, compress_start, compress_size, CMI_FLOATING);
  if ( ! myMgr->ungridForcesCount && ! myMgr->recipEvirCount ) return 0;
  myMgr->heldComputes.add(this);
  positionBox->skip();
  myMgr->noWorkCount = 0;
  myMgr->reduction->submit();
    evir[g] += msg->evir[g];
  numLocalQMAtoms = 0;
  for ( int paIter=0; paIter<patch->getNumAtoms(); paIter++) {
    if ( qmAtomGroup[xExt[paIter].id] != 0 ) {
  if (qmLoclIndx != 0)
    delete [] qmLoclIndx;
  if (qmLocalCharges != 0)
    delete [] qmLocalCharges;
  qmLoclIndx = new int[numLocalQMAtoms] ;
  qmLocalCharges = new Real[numLocalQMAtoms] ;
  for ( int paIter=0; paIter<patch->getNumAtoms(); paIter++) {
    for ( int i=0; i<numQMAtms; i++) {
      if (qmAtmIndx[i] == xExt[paIter].id) {
        qmLoclIndx[procAtms] = paIter ;
        qmLocalCharges[procAtms] = qmAtmChrg[i];
  if (procAtms == numLocalQMAtoms)
  DebugM(4,"Entering ComputePme::doWork().\n");
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
    if ( ! --(myMgr->ungridForcesCount) && ! myMgr->recipEvirCount ) myMgr->submitReductions();
#ifdef TRACE_COMPUTE_OBJECTS
  double traceObjStartTime = CmiWallTimer();
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
  localData_alloc.resize(numLocalAtoms*(numGrids+ ((numGrids>1 || selfOn)?1:0)));
  localData = localData_alloc.begin();
  localPartition_alloc.resize(numLocalAtoms);
  localPartition = localPartition_alloc.begin();
    localGridData[g] = localData + numLocalAtoms*(g+1);
  unsigned char * part_ptr = localPartition;
    positionBox->close(&x);
    x = avgPositionBox->open();
  for ( int i=0; i<numAtoms; ++i)
    data_ptr->cg = coulomb_sqrt * x[i].charge;
  for ( int i=0; i<numLocalQMAtoms; ++i)
    localData[qmLoclIndx[i]].cg = coulomb_sqrt * qmLocalCharges[i];
  else { positionBox->close(&x); }
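  // Charges are pre-scaled by coulomb_sqrt (presumably sqrt of the Coulomb
  // constant times any dielectric/scaling factors, defined elsewhere) so
  // that a product of two scaled charges carries the full Coulomb prefactor.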
3185 for(
int i=0; i<numLocalAtoms; ++i) {
3186 if ( localPartition[i] == 0 || localPartition[i] == (g+1) || localPartition[i] == (g+3)) {
3189 lgd[nga++] = localData[i];
3192 numGridAtoms[g] = nga;
3195 for(
int i=0; i<numLocalAtoms; ++i) {
3196 if ( localPartition[i] == 0 ) {
3198 lgd[nga++] = localData[i];
3201 numGridAtoms[g] = nga;
3211 for ( g=0; g<2; ++g ) {
3214 for(
int i=0; i<numLocalAtoms; ++i) {
3215 if ( localPartition[i] == 0 || localPartition[i] == (g+1) ) {
3216 lgd[nga++] = localData[i];
3219 numGridAtoms[g] = nga;
3221 for (g=2 ; g<4 ; ++g ) {
3224 for(
int i=0; i<numLocalAtoms; ++i) {
3225 if ( localPartition[i] == (g-1) ) {
3226 lgd[nga++] = localData[i];
3229 numGridAtoms[g] = nga;
3235 for(
int i=0; i<numLocalAtoms; ++i) {
3236 if ( localPartition[i] == 0 ) {
3237 lgd[nga++] = localData[i];
3240 numGridAtoms[g] = nga;
3243 if ( numGrids != 1 )
NAMD_bug(
"ComputePme::doWork assertion 1 failed");
3247 for(
int i=0; i<numLocalAtoms; ++i) {
3248 if ( localPartition[i] == 1 ) {
3249 lgd[nga++] = localData[i];
3252 numGridAtoms[g] = nga;
3254 if ( numGrids != 3 )
NAMD_bug(
"ComputePme::doWork assertion 2 failed");
3258 for(
int i=0; i<numLocalAtoms; ++i) {
3259 if ( localPartition[i] == 1 || localPartition[i] == 2 ) {
3260 lgd[nga++] = localData[i];
3263 numGridAtoms[g] = nga;
3264 for ( g=1; g<3; ++g ) {
3267 for(
int i=0; i<numLocalAtoms; ++i) {
3268 if ( localPartition[i] == g ) {
3269 lgd[nga++] = localData[i];
3272 numGridAtoms[g] = nga;
3275 if ( numGrids != 1 )
NAMD_bug(
"ComputePme::doWork assertion 3 failed");
3276 localGridData[0] = localData;
3277 numGridAtoms[0] = numLocalAtoms;
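    // The branches above select which atoms contribute to each PME grid
    // based on their partition tag; multiple grids appear to be used for
    // alchemical (FEP/TI) and least-squares/pair-interaction modes, with
    // partition 0 atoms shared across grids.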
  if ( ! myMgr->doWorkCount ) {
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
      memset( (void*) myMgr->fz_arr, 0, (myGrid.K3+myGrid.order-1) * sizeof(char) );
      for ( int i=0; i<myMgr->q_count; ++i) {
        memset( (void*) (myMgr->q_list[i]), 0, (myGrid.K3+myGrid.order-1) * sizeof(float) );
    myMgr->strayChargeErrors = 0;
    myMgr->compute_sequence = sequence();
  if ( sequence() != myMgr->compute_sequence ) NAMD_bug("ComputePme sequence mismatch in doWork()");
  int strayChargeErrors = 0;
    data_ptr = localGridData[g];
    for (i=0; i<numGridAtoms[g]; ++i)
      selfEnergy += data_ptr->cg * data_ptr->cg;
    selfEnergy *= -1. * ewaldcof / SQRT_PI;
    myMgr->evir[g][0] += selfEnergy;
    float **q = myMgr->q_arr + g*myMgr->fsize;
    char *f = myMgr->f_arr + g*myMgr->fsize;
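    // Standard Ewald self-energy correction: E_self = -(ewaldcof/sqrt(pi))
    // * sum_i q_i^2, computed with the pre-scaled charges (cg) and added
    // into the reciprocal-space energy accumulator.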
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
      cudaMallocHost((void**) &(myMgr->a_data_host), 7*na*sizeof(float));
      cudaMalloc((void**) &(myMgr->a_data_dev), 7*na*sizeof(float));
    int n = numGridAtoms[g];
      CkPrintf("Pe %d expanding CUDA PME atoms allocation because %d > %d\n",
      const float *a_data_host_old = myMgr->a_data_host;
      cudaMallocHost((void**) &(myMgr->a_data_host), 7*na*sizeof(float));
      memcpy(myMgr->a_data_host, a_data_host_old, 7*cuda_atoms_offset*sizeof(float));
      cudaFreeHost((void*) a_data_host_old);
      cudaMalloc((void**) &(myMgr->a_data_dev), 7*na*sizeof(float));
    float *a_data_host = myMgr->a_data_host + 7 * cuda_atoms_offset;
    data_ptr = localGridData[g];
    double order_1 = myGrid.order - 1;
    double K1 = myGrid.K1;
    double K2 = myGrid.K2;
    double K3 = myGrid.K3;
    int found_negative = 0;
    for ( int i=0; i<n; ++i ) {
      if ( data_ptr[i].x < 0 || data_ptr[i].y < 0 || data_ptr[i].z < 0 ) {
      double x_int = (int) data_ptr[i].x;
      double y_int = (int) data_ptr[i].y;
      double z_int = (int) data_ptr[i].z;
      a_data_host[7*i  ] = data_ptr[i].x - x_int;
      a_data_host[7*i+1] = data_ptr[i].y - y_int;
      a_data_host[7*i+2] = data_ptr[i].z - z_int;
      a_data_host[7*i+3] = data_ptr[i].cg;
      x_int -= order_1;  if ( x_int < 0 ) x_int += K1;
      y_int -= order_1;  if ( y_int < 0 ) y_int += K2;
      z_int -= order_1;  if ( z_int < 0 ) z_int += K3;
      a_data_host[7*i+4] = x_int;
      a_data_host[7*i+5] = y_int;
      a_data_host[7*i+6] = z_int;
    if ( found_negative ) NAMD_bug("found negative atom coordinate in ComputePme::doWork");
    myRealSpace[g]->fill_charges(q, myMgr->q_list, myMgr->q_count, strayChargeErrors, f, myMgr->fz_arr, localGridData[g]);
  myMgr->strayChargeErrors += strayChargeErrors;
#ifdef TRACE_COMPUTE_OBJECTS
  if ( --(myMgr->doWorkCount) == 0 ) {
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
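// The packing loop above stores seven floats per atom for the CUDA
// charge-spreading kernel: the fractional offsets within a grid cell
// (x,y,z), the scaled charge, and the integer home-cell indices shifted by
// order-1 and wrapped back into [0,K).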
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
    const double before = CmiWallTimer();
        cudaMemcpyHostToDevice, streams[stream]);
    const double after = CmiWallTimer();
    cudaStreamWaitEvent(streams[stream], nodePmeMgr->end_charge_memset, 0);
        q_arr_dev, ffz_dev, ffz_dev + fsize,
    const double after2 = CmiWallTimer();
  cudaError_t err = cudaEventQuery(argp->end_charges);
  if ( err == cudaSuccess ) {
  } else if ( err != cudaErrorNotReady ) {
    sprintf(errmsg,"in cuda_check_pme_charges after polling %d times over %f s on seq %d",
    sprintf(errmsg,"cuda_check_pme_charges polled %d times over %f s on seq %d",
    double before = CmiWallTimer();
    cudaEventRecord(nodePmeMgr->end_all_pme_kernels, 0);
    cudaStreamWaitEvent(streams[stream], nodePmeMgr->end_all_pme_kernels, 0);
    cudaMemcpyAsync(q_data_host, q_data_dev, q_data_size+ffz_size,
        cudaMemcpyDeviceToHost, streams[stream]);
    cudaEventRecord(masterPmeMgr->end_charges, streams[stream]);
    cudaMemsetAsync(q_data_dev, 0, q_data_size + ffz_size, streams[stream]);
    cudaEventRecord(nodePmeMgr->end_charge_memset, streams[stream]);
    CProxy_ComputeMgr cm(CkpvAccess(BOCclass_group).computeMgr);
    pmeProxy[master_pe].pollChargeGridReady();
  for ( int i=0; i<CkMyNodeSize(); ++i ) {
    mgr->ungridForcesCount = cs;
    mgr->recipEvirCount = mgr->recipEvirClients;
  pmeProxy[master_pe].recvChargeGridReady();
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
  NAMD_bug("ComputePmeMgr::pollChargeGridReady() called in non-CUDA build.");
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
  int q_stride = myGrid.K3+myGrid.order-1;
  for ( int n=fsize+q_stride, j=fsize; j<n; ++j) {
    f_arr[j] = ffz_host[j];
    if ( ffz_host[j] & ~1 ) ++errcount;
  if ( errcount ) NAMD_bug("bad flag in ComputePmeMgr::chargeGridReady");
  recipEvirCount = recipEvirClients;
  for ( int j=0; j<myGrid.order-1; ++j) {
    fz_arr[j] |= fz_arr[myGrid.K3+j];
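  // Because charge spreading writes into order-1 wrapped planes past K3,
  // the flags for z indices K3..K3+order-2 are folded back onto 0..order-2
  // here; the grid values themselves appear to be folded later, when
  // messages are packed.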
#if 0 && USE_PERSISTENT
  if (recvGrid_handle == NULL) setup_recvgrid_persistent();
  int dim2 = myGrid.dim2;
  int dim3 = myGrid.dim3;
  int block1 = myGrid.block1;
  int block2 = myGrid.block2;
  NodePmeMgr *npMgr = pmeNodeProxy[CkMyNode()].ckLocalBranch();
3599 for (
int ap=first; ap<=last; ++ap) {
3600 int ib = activePencils[ap].
i;
3601 int jb = activePencils[ap].
j;
3602 int ibegin = ib*block1;
3603 int iend = ibegin + block1;
if ( iend > K1 ) iend = K1;
3604 int jbegin = jb*block2;
3605 int jend = jbegin + block2;
if ( jend > K2 ) jend = K2;
3606 int flen = numGrids * (iend - ibegin) * (jend - jbegin);
3610 char *f = f_arr + g*fsize;
3611 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
3614 for (
int i=ibegin; i<iend; ++i ) {
3615 for (
int j=jbegin; j<jend; ++j ) {
3619 if ( ffz_host[k] & ~1 ) ++errcount;
3622 if ( errcount )
NAMD_bug(
"bad flag in ComputePmeMgr::sendPencilsPart");
3625 for (
int i=ibegin; i<iend; ++i ) {
3626 for (
int j=jbegin; j<jend; ++j ) {
3627 fcount += f[i*dim2+j];
3632 #ifdef NETWORK_PROGRESS
3633 CmiNetworkProgress();
3636 if ( ! pencilActive[ib*yBlocks+jb] )
3637 NAMD_bug(
"PME activePencils list inconsistent");
    int zlistlen = 0;
    for ( int i=0; i<myGrid.K3; ++i ) {
      if ( fz_arr[i] ) ++zlistlen;
    }

    int hd = ( fcount? 1 : 0 );  // has data?
    PmeGridMsg *msg = new ( hd*zlistlen, hd*flen,
        hd*fcount*zlistlen, PRIORITY_SIZE ) PmeGridMsg;
    // ...
    msg->start = fstart;
    // ...
    msg->zlistlen = zlistlen;
    int *zlist = msg->zlist;
    zlistlen = 0;
    for ( int i=0; i<myGrid.K3; ++i ) {
      if ( fz_arr[i] ) zlist[zlistlen++] = i;
    }
    char *fmsg = msg->fgrid;
    float *qmsg = msg->qgrid;
    for ( int g=0; g<numGrids; ++g ) {
      char *f = f_arr + g*fsize;
      float **q = q_arr + g*fsize;
      for ( int i=ibegin; i<iend; ++i ) {
        for ( int j=jbegin; j<jend; ++j ) {
          *(fmsg++) = f[i*dim2+j];
          if ( f[i*dim2+j] ) {
            for (int h=0; h<myGrid.order-1; ++h) {
              q[i*dim2+j][h] += q[i*dim2+j][myGrid.K3+h];
            }
            for ( int k=0; k<zlistlen; ++k ) {
              *(qmsg++) = q[i*dim2+j][zlist[k]];
            }
          }
        }
      }
    }
    CmiEnableUrgentSend(1);
#if USE_NODE_PAR_RECEIVE
    msg->destElem = CkArrayIndex3D(ib,jb,0);
    CProxy_PmePencilMap lzm = npMgr->zm;
    int destproc = lzm.ckLocalBranch()->procNum(0, msg->destElem);
    int destnode = CmiNodeOf(destproc);
    CmiUsePersistentHandle(&recvGrid_handle[ap], 1);
    pmeNodeProxy[destnode].recvZGrid(msg);
    CmiUsePersistentHandle(NULL, 0);
#else
    CmiUsePersistentHandle(&recvGrid_handle[ap], 1);
    zPencil(ib,jb,0).recvGrid(msg);
    CmiUsePersistentHandle(NULL, 0);
#endif
    CmiEnableUrgentSend(0);
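    // Grid messages go out "urgent" so they bypass normal queue ordering,
    // and (when persistent communication is compiled in) over preallocated
    // persistent channels; the handle is bound just before the send and
    // unbound right after so unrelated messages are unaffected.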
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
  // ...
#else
  NAMD_bug("NodePmeMgr::sendPencilsHelper called in non-CUDA build");
#endif
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
  if ( offload ) {
    for ( int ap=0; ap < numPencilsActive; ++ap ) {
#if CMK_MULTICORE
      // nodegroup messages on multicore are delivered to the sending pe
      int ib = activePencils[ap].i;
      int jb = activePencils[ap].j;
      int destproc = nodePmeMgr->zm.ckLocalBranch()->procNum(0, CkArrayIndex3D(ib,jb,0));
      pmeProxy[destproc].sendPencilsHelper(ap);
#else
      pmeNodeProxy[CkMyNode()].sendPencilsHelper(ap);
#endif
    }
  } else
#endif
  {
    sendPencilsPart(0,numPencilsActive-1,lattice,sequence,CkMyPe());
  }
  if ( strayChargeErrors ) {
    strayChargeErrors = 0;
    iout << iERROR << "Stray PME grid charges detected: "
         << CkMyPe() << " sending to (x,y)";
    int K1 = myGrid.K1;
    int K2 = myGrid.K2;
    int dim2 = myGrid.dim2;
    int block1 = myGrid.block1;
    int block2 = myGrid.block2;
    for (int ib=0; ib<xBlocks; ++ib) {
      for (int jb=0; jb<yBlocks; ++jb) {
        int ibegin = ib*block1;
        int iend = ibegin + block1;  if ( iend > K1 ) iend = K1;
        int jbegin = jb*block2;
        int jend = jbegin + block2;  if ( jend > K2 ) jend = K2;
        int flen = numGrids * (iend - ibegin) * (jend - jbegin);

        for ( int g=0; g<numGrids; ++g ) {
          char *f = f_arr + g*fsize;
          if ( ! pencilActive[ib*yBlocks+jb] ) {
            for ( int i=ibegin; i<iend; ++i ) {
              for ( int j=jbegin; j<jend; ++j ) {
                if ( f[i*dim2+j] == 3 ) {
                  iout << " (" << i << "," << j << ")";
                }
              }
            }
          }
        }
      }
    }
    iout << "\n" << endi;
  }
  int K1 = myGrid.K1;
  int K2 = myGrid.K2;
  int dim2 = myGrid.dim2;
  int dim3 = myGrid.dim3;
  int block1 = myGrid.block1;
  int block2 = myGrid.block2;
  // ...
  int ibegin = ib*block1;
  int iend = ibegin + block1;  if ( iend > K1 ) iend = K1;
  int jbegin = jb*block2;
  int jend = jbegin + block2;  if ( jend > K2 ) jend = K2;

  int zlistlen = msg->zlistlen;
  int *zlist = msg->zlist;
  float *qmsg = msg->qgrid;
  for ( int g=0; g<numGrids; ++g ) {
    char *f = f_arr + g*fsize;
    float **q = q_arr + g*fsize;
    for ( int i=ibegin; i<iend; ++i ) {
      for ( int j=jbegin; j<jend; ++j ) {
        if ( f[i*dim2+j] ) {
          for ( int k=0; k<zlistlen; ++k ) {
            q[i*dim2+j][zlist[k]] = *(qmsg++);
          }
          for (int h=0; h<myGrid.order-1; ++h) {
            q[i*dim2+j][myGrid.K3+h] = q[i*dim2+j][h];
          }
        }
      }
    }
  }
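  // Mirror of the fold in chargeGridReady: after the owned z-range is copied
  // in, the first order-1 planes are replicated past K3 so that B-spline
  // interpolation can read the periodic wrap without bounds checks.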
  CProxy_ComputePmeMgr pmeProxy(CkpvAccess(BOCclass_group).computePmeMgr);
  int i;
  for (int j=first; j<=last; j++) {
    int pe = gridPeOrder[j];  // different order
    if ( ! recipPeDest[pe] && ! errors ) continue;
    int start = pe * bsize;
    int len = bsize;
    if ( start >= qsize ) { start = 0; len = 0; }
    if ( start + len > qsize ) { len = qsize - start; }
    int zdim = myGrid.dim3;
    int fstart = start / zdim;
    int flen = len / zdim;
    int fcount = 0;
    for ( int g=0; g<numGrids; ++g ) {
      char *f = f_arr + fstart + g*fsize;
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
      if ( offload ) {
        int errcount = 0;
        for ( i=0; i<flen; ++i ) {
          f[i] = ffz_host[fstart+i];
          fcount += f[i];
          if ( ffz_host[fstart+i] & ~1 ) ++errcount;
        }
        if ( errcount ) NAMD_bug("bad flag in ComputePmeMgr::sendDataPart");
      } else
#endif
      for ( i=0; i<flen; ++i ) {
        fcount += f[i];
      }
      if ( ! recipPeDest[pe] ) {
        int errfound = 0;
        for ( i=0; i<flen; ++i ) {
          if ( f[i] == 3 ) {
            f[i] = 2;
            ++errfound;
          }
        }
        if ( errfound ) {
          iout << iERROR << "Stray PME grid charges detected: "
               << sourcepe << " sending to " << gridPeMap[pe] << " for planes";
          int iz = -1;
          for ( i=0; i<flen; ++i ) {
            if ( f[i] == 2 ) {
              f[i] = 3;
              int jz = (i+fstart)/myGrid.K2;
              if ( iz != jz ) { iout << " " << jz; iz = jz; }
            }
          }
          iout << "\n" << endi;
        }
      }
    }
#ifdef NETWORK_PROGRESS
    CmiNetworkProgress();
#endif

    if ( ! recipPeDest[pe] ) continue;
    int zlistlen = 0;
    for ( i=0; i<myGrid.K3; ++i ) {
      if ( fz_arr[i] ) ++zlistlen;
    }

    PmeGridMsg *msg = new ( zlistlen, flen*numGrids,
        fcount*zlistlen, PRIORITY_SIZE ) PmeGridMsg;
    // ...
    msg->start = fstart;
    msg->len = flen;
    msg->zlistlen = zlistlen;
    int *zlist = msg->zlist;
    zlistlen = 0;
    for ( i=0; i<myGrid.K3; ++i ) {
      if ( fz_arr[i] ) zlist[zlistlen++] = i;
    }
    float *qmsg = msg->qgrid;
    for ( int g=0; g<numGrids; ++g ) {
      char *f = f_arr + fstart + g*fsize;
      CmiMemcpy((void*)(msg->fgrid+g*flen),(void*)f,flen*sizeof(char));
      float **q = q_arr + fstart + g*fsize;
      for ( i=0; i<flen; ++i ) {
        if ( f[i] ) {
          for (int h=0; h<myGrid.order-1; ++h) {
            q[i][h] += q[i][myGrid.K3+h];
          }
          for (int k=0; k<zlistlen; ++k ) {
            *(qmsg++) = q[i][zlist[k]];
          }
        }
      }
    }

    pmeProxy[gridPeMap[pe]].recvGrid(msg);
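    // Only occupied parts of the grid travel in the message: fgrid carries
    // the per-column flags, and qgrid carries, for each flagged column, just
    // the zlistlen planes that any local charge touched, keeping per-step
    // communication proportional to occupied grid volume rather than to the
    // full K1*K2*K3 grid.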
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
  // ...
#else
  NAMD_bug("NodePmeMgr::sendDataHelper called in non-CUDA build");
#endif
  strayChargeErrors = 0;

#if defined(NAMD_CUDA) || defined(NAMD_HIP)
  if ( offload ) {
    for ( int i=0; i < numGridPes; ++i ) {
      int pe = gridPeOrder[i];  // different order
      // ...
#if CMK_MULTICORE
      pmeProxy[gridPeMap[pe]].sendDataHelper(i);
#else
      pmeNodeProxy[CkMyNode()].sendDataHelper(i);
#endif
    }
  } else
#endif
  {
    sendDataPart(0, numGridPes-1, lattice, sequence, CkMyPe(), sendDataHelper_errors);
  }
  int zdim = myGrid.dim3;
  int flen = msg->len;
  int fstart = msg->start;
  int zlistlen = msg->zlistlen;
  int *zlist = msg->zlist;
  float *qmsg = msg->qgrid;
  for ( int g=0; g<numGrids; ++g ) {
    char *f = msg->fgrid + g*flen;
    float **q = q_arr + fstart + g*fsize;
    for ( int i=0; i<flen; ++i ) {
      if ( f[i] ) {
        for ( int k=0; k<zlistlen; ++k ) {
          q[i][zlist[k]] = *(qmsg++);
        }
        for (int h=0; h<myGrid.order-1; ++h) {
          q[i][myGrid.K3+h] = q[i][h];
        }
      }
    }
  }
  if ( sequence() != myMgr->compute_sequence )
    NAMD_bug("ComputePme sequence mismatch in ungridForces()");
  // ...
  localResults_alloc.resize(numLocalAtoms* ((numGrids>1 || selfOn)?2:1));
  Vector *localResults = localResults_alloc.begin();
  Vector *gridResults;
  if ( numGrids > 1 || selfOn ) {
    for(int i=0; i<numLocalAtoms; ++i) { localResults[i] = 0.; }
    gridResults = localResults + numLocalAtoms;
  } else {
    gridResults = localResults;
  }
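  // When several grids are in play (alchemical, LES, or pair decompositions)
  // the buffer is doubled: per-grid forces land in gridResults and are then
  // folded into localResults with grid-dependent weights below.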
  for ( int g=0; g<numGrids; ++g ) {
#ifdef NETWORK_PROGRESS
    CmiNetworkProgress();
#endif

#if defined(NAMD_CUDA) || defined(NAMD_HIP)
    if ( offload ) {
      int errfound = 0;
      for ( int n=numGridAtoms[g], i=0; i<n; ++i ) {
        // the bit pattern 0x7fffffff marks an atom whose charge fell off
        // this pe's portion of the grid (a stray charge)
        if ( ((int*)f_data_host)[3*i] == 0x7fffffff ) { errfound = 1; }
        gridResults[i].x = f_data_host[3*i];
        gridResults[i].y = f_data_host[3*i+1];
        gridResults[i].z = f_data_host[3*i+2];
      }
      if ( errfound ) {
        int errcount = 0;
        for ( int n=numGridAtoms[g], i=0; i<n; ++i ) {
          float f = f_data_host[3*i];
          if ( ((int*)f_data_host)[3*i] == 0x7fffffff ) {
            ++errcount;
            gridResults[i] = 0.;
          }
        }
        iout << iERROR << "Stray PME grid charges detected: "
             << errcount << " atoms on pe " << CkMyPe() << "\n" << endi;
      }
    } else
#endif
    {
      myRealSpace[g]->compute_forces(myMgr->q_arr+g*myMgr->fsize,
                                     localGridData[g], gridResults);
    }
    BigReal scale = 1.;
    if ( alchOn ) {
      BigReal elecLambdaUp, elecLambdaDown;
      BigReal alchLambda = simParams->getCurrentLambda(step);
      myMgr->alchLambda = alchLambda;
      BigReal alchLambda2 = simParams->getCurrentLambda2(step);
      myMgr->alchLambda2 = alchLambda2;
      elecLambdaUp = simParams->getElecLambda(alchLambda);
      elecLambdaDown = simParams->getElecLambda(1. - alchLambda);

      if ( g == 0 ) scale = elecLambdaUp;
      else if ( g == 1 ) scale = elecLambdaDown;
      else if ( g == 2 ) scale = (elecLambdaUp + elecLambdaDown - 1)*(-1);

      if ( alchDecouple ) {
        if ( g == 2 ) scale = 1 - elecLambdaUp;
        else if ( g == 3 ) scale = 1 - elecLambdaDown;
        else if ( g == 4 ) scale = (elecLambdaUp + elecLambdaDown - 1)*(-1);
      }

      int nga = 0;
      if ( !alchDecouple ) {
        if ( g < 2 ) {
          for(int i=0; i<numLocalAtoms; ++i) {
            if ( localPartition[i] == 0 || localPartition[i] == (g+1) || localPartition[i] == (g+3) ) {
              // g=0: partitions 0, 1, 3;  g=1: partitions 0, 2, 4
              localResults[i] += gridResults[nga++] * scale;
            }
          }
        } else {
          for(int i=0; i<numLocalAtoms; ++i) {
            if ( localPartition[i] == 0 ) {
              // g=2: partition 0 only
              localResults[i] += gridResults[nga++] * scale;
            }
          }
        }
      } else {  // alchDecouple
        if ( g < 2 ) {
          for(int i=0; i<numLocalAtoms; ++i) {
            if ( localPartition[i] == 0 || localPartition[i] == (g+1) ) {
              // g=0: partitions 0, 1;  g=1: partitions 0, 2
              localResults[i] += gridResults[nga++] * scale;
            }
          }
        } else {
          for(int i=0; i<numLocalAtoms; ++i) {
            if ( localPartition[i] == (g-1) || localPartition[i] == (g-4) ) {
              // g=2: partition 1;  g=3: partition 2;  g=4: partitions 3, 0
              localResults[i] += gridResults[nga++] * scale;
            }
          }
        }
      }
    } else if ( lesOn ) {
      BigReal alchLambda = simParams->getCurrentLambda(step);
      myMgr->alchLambda = alchLambda;
      BigReal alchLambda2 = simParams->getCurrentLambda2(step);
      myMgr->alchLambda2 = alchLambda2;
      if ( g == 0 ) scale = alchLambda;
      else if ( g == 1 ) scale = 1. - alchLambda;
      int nga = 0;
      for(int i=0; i<numLocalAtoms; ++i) {
        if ( localPartition[i] == 0 || localPartition[i] == (g+1) ) {
          localResults[i] += gridResults[nga++] * scale;
        }
      }
    } else if ( selfOn ) {
      int nga = 0;
      for(int i=0; i<numLocalAtoms; ++i) {
        if ( localPartition[i] == 1 ) {
          pairForce += gridResults[nga];  // should add up to almost zero
          localResults[i] += gridResults[nga++];
        }
      }
    } else if ( pairOn ) {
      if ( g == 0 ) {
        int nga = 0;
        for(int i=0; i<numLocalAtoms; ++i) {
          if ( localPartition[i] == 1 ) {
            pairForce += gridResults[nga];
          }
          if ( localPartition[i] == 1 || localPartition[i] == 2 ) {
            localResults[i] += gridResults[nga++];
          }
        }
      } else if ( g == 1 ) {
        int nga = 0;
        for(int i=0; i<numLocalAtoms; ++i) {
          if ( localPartition[i] == g ) {
            pairForce -= gridResults[nga];  // should add up to almost zero
            localResults[i] -= gridResults[nga++];
          }
        }
      } else {
        int nga = 0;
        for(int i=0; i<numLocalAtoms; ++i) {
          if ( localPartition[i] == g ) {
            localResults[i] -= gridResults[nga++];
          }
        }
      }
    }
  }  // end loop over grids
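  // Bookkeeping note: gridResults holds forces only for the atoms that were
  // actually sent to grid g, so nga advances in step with the partition
  // filter; the same filter must be applied here as when the charges were
  // submitted, or the force-to-atom pairing slips.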
  Vector *results_ptr = localResults;
  // ...
  if ( ! myMgr->strayChargeErrors && ! simParams->commOnly ) {
    for(int i=0; i<numAtoms; ++i) {
      f[i].x += results_ptr->x;
      f[i].y += results_ptr->y;
      f[i].z += results_ptr->z;
      ++results_ptr;
    }
  }

  forceBox->close(&r);
  // ComputePmeMgr::submitReductions()
  for ( int g=0; g<numGrids; ++g ) {
    BigReal scale = 1.;
    if ( alchOn ) {
      BigReal elecLambdaUp, elecLambdaDown;
      // alchLambda is set on each step in ungridForces()
      if ( alchLambda < 0 || alchLambda > 1 ) {
        NAMD_bug("ComputePmeMgr::submitReductions alchLambda out of range");
      }
      elecLambdaUp = simParams->getElecLambda(alchLambda);
      elecLambdaDown = simParams->getElecLambda(1-alchLambda);
      if ( g == 0 ) scale = elecLambdaUp;
      else if ( g == 1 ) scale = elecLambdaDown;
      else if ( g == 2 ) scale = (elecLambdaUp + elecLambdaDown - 1)*(-1);
      if ( alchDecouple ) {
        if ( g == 2 ) scale = 1-elecLambdaUp;
        else if ( g == 3 ) scale = 1-elecLambdaDown;
        else if ( g == 4 ) scale = (elecLambdaUp + elecLambdaDown - 1)*(-1);
      }
    } else if ( lesOn ) {
      // ...
    } else if ( pairOn ) {
      scale = ( g == 0 ? 1. : -1. );
    }
    reduction->item(REDUCTION_VIRIAL_SLOW_XX) += evir[g][1] * scale;
    reduction->item(REDUCTION_VIRIAL_SLOW_XY) += evir[g][2] * scale;
    reduction->item(REDUCTION_VIRIAL_SLOW_XZ) += evir[g][3] * scale;
    reduction->item(REDUCTION_VIRIAL_SLOW_YX) += evir[g][2] * scale;
    reduction->item(REDUCTION_VIRIAL_SLOW_YY) += evir[g][4] * scale;
    reduction->item(REDUCTION_VIRIAL_SLOW_YZ) += evir[g][5] * scale;
    reduction->item(REDUCTION_VIRIAL_SLOW_ZX) += evir[g][3] * scale;
    reduction->item(REDUCTION_VIRIAL_SLOW_ZY) += evir[g][5] * scale;
    reduction->item(REDUCTION_VIRIAL_SLOW_ZZ) += evir[g][6] * scale;
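    // evir packs energy plus the six independent components of the symmetric
    // virial tensor: [0]=energy, [1]=xx, [2]=xy, [3]=xz, [4]=yy, [5]=yz,
    // [6]=zz; the off-diagonal entries are mirrored into both slots of the
    // full 3x3 reduction above.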
    BigReal scale2 = 0.;
    BigReal elecLambda2Up=0.0, elecLambda2Down=0.0;
    // ...
    if ( g == 0 ) scale2 = elecLambda2Up;
    else if ( g == 1 ) scale2 = elecLambda2Down;
    else if ( g == 2 ) scale2 = (elecLambda2Up + elecLambda2Down - 1)*(-1);
    if ( alchDecouple && g == 2 ) scale2 = 1 - elecLambda2Up;
    else if ( alchDecouple && g == 3 ) scale2 = 1 - elecLambda2Down;
    else if ( alchDecouple && g == 4 ) scale2 = (elecLambda2Up + elecLambda2Down - 1)*(-1);
  for ( int i=0; i<heldComputes.size(); ++i ) {
    // ...
  }
const static unsigned int NAMDPrimes[] = {
  // ... (table of primes consumed by generateBGLORBPmePeList below)
};
bool generateBGLORBPmePeList(int *pemap, int numPes,
                             int *block_pes, int nbpes) {
  // ...
  TopoManager tmgr;
  int *pmemap = new int [CkNumPes()];
  if ( ! pmemap ) return false;
  memset(pmemap, 0, sizeof(int) * CkNumPes());

  // mark pes already taken by the block list, and their node partners
  for(int count = 0; count < CkNumPes(); count++) {
    if(count < nbpes)
      pmemap[block_pes[count]] = 1;
    if(tmgr.hasMultipleProcsPerNode()) {
      pmemap[(count + CkNumPes()/2)% CkNumPes()] = 1;
    }
  }
  // ...
  CProxy_Node nd(CkpvAccess(BOCclass_group).node);
  Node *node = nd.ckLocalBranch();
  // ...
  int xsize = 0, ysize = 0, zsize = 0;
  xsize = tmgr.getDimNX();
  ysize = tmgr.getDimNY();
  zsize = tmgr.getDimNZ();

  int nx = xsize, ny = ysize, nz = zsize;
  DimensionMap dm;
  // ...
  findOptimalDimensions(xsize, ysize, zsize, nx, ny, nz, dm);

  // group_size processors must be allocated on each YZ plane
  int group_size = numPes/nx;
  if(numPes % nx) group_size++;

  int my_prime = NAMDPrimes[0];
  int density = (ny * nz)/group_size + 1;
  int count = 0;

  // choose the closest prime above the density
  for(count = 0; count < NPRIMES; count ++) {
    if(density < NAMDPrimes[count]) {
      my_prime = NAMDPrimes[count];
      break;
    }
  }
  if(count == NPRIMES)
    my_prime = NAMDPrimes[NPRIMES-1];

  int gcount = 0;
  int npme_pes = 0;
  int coord[3];

  for(int x = 0; x < nx; x++) {
    coord[0] = (x + nx/2)%nx;

    for(count=0; count < group_size && npme_pes < numPes; count++) {
      int dest = (count + 1) * my_prime;
      dest = dest % (ny * nz);

      coord[2] = dest / ny;
      coord[1] = dest - coord[2] * ny;

      // locate the processor in the actual grid
      int destPe = coord[dm.x] + coord[dm.y] * xsize +
          coord[dm.z] * xsize* ysize;

      if(pmemap[destPe] == 0) {
        pemap[gcount++] = destPe;
        pmemap[destPe] = 1;
        if(tmgr.hasMultipleProcsPerNode())
          pmemap[(destPe + CkNumPes()/2) % CkNumPes()] = 1;
        npme_pes++;
      } else {
        // probe forward through the YZ plane for a free slot
        for(int pos = 1; pos < ny * nz; pos++) {
          coord[2] += pos / ny;
          coord[1] += pos % ny;
          coord[2] = coord[2] % nz;
          coord[1] = coord[1] % ny;

          int newdest = coord[dm.x] + coord[dm.y] * xsize +
              coord[dm.z] * xsize * ysize;

          if(pmemap[newdest] == 0) {
            pemap[gcount++] = newdest;
            pmemap[newdest] = 1;
            if(tmgr.hasMultipleProcsPerNode())
              pmemap[(newdest + CkNumPes()/2) % CkNumPes()] = 1;
            npme_pes++;
            break;
          }
        }
      }
    }

    if(gcount == numPes)
      gcount = 0;
    if(npme_pes >= numPes)
      break;
  }

  delete [] pmemap;

  if(npme_pes != numPes)
    return false;
  return true;
}
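// A prime stride is very likely coprime to ny*nz, so successive dest values
// cycle broadly through each YZ plane of the torus instead of clustering,
// spreading PME pes roughly evenly before the fallback linear probe has to
// resolve collisions.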
  trans_handle = untrans_handle = ungrid_handle = NULL;
  // ...
  for ( int i=0; i<nBlocks; ++i ) send_order[i] = i;
  // ...
#ifndef CmiMemoryAtomicType
  // ... (fallback when atomic fetch-and-increment is unavailable)
#endif
  // ...
  PersistentHandle *trans_handle;
  PersistentHandle *untrans_handle;
  PersistentHandle *ungrid_handle;
  // ...

  PmeZPencil_SDAG_CODE
  // ...
  ~PmeZPencil() {
    // ...
    delete [] forward_plans;
    delete [] backward_plans;
    // ...
  }
  // ...
#ifdef NAMD_FFTW_3
  fftwf_plan forward_plan, backward_plan;
  fftwf_plan *forward_plans, *backward_plans;
#else
  rfftwnd_plan forward_plan, backward_plan;
#endif
  void setup_persistent() {
    // ...
    CmiAssert(yPencil_local);
    trans_handle = (PersistentHandle*) malloc( sizeof(PersistentHandle) * zBlocks);
    for ( int isend=0; isend<zBlocks; ++isend ) {
      int kb = send_order[isend];
      int nz1 = block3;
      if ( (kb+1)*block3 > dim3/2 ) nz1 = dim3/2 - kb*block3;
      int peer = yPencil_local->homePe(CkArrayIndex3D(thisIndex.x, 0, kb));
      int size = sizeof(PmeTransMsg) + sizeof(float)*hd*nx*ny*nz1*2
          + sizeof(envelope) + PRIORITY_SIZE/8+24;
      int compress_start = sizeof(PmeTransMsg) + sizeof(envelope);
      int compress_size = sizeof(float)*hd*nx*ny*nz1*2;
      trans_handle[isend] = CmiCreateCompressPersistentSize(peer, size,
          compress_start, compress_size, CMI_FLOATING);
    }
  }
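  // CmiCreateCompressPersistentSize sets up a fixed-size persistent channel;
  // compress_start/compress_size delimit the region compressed as
  // floating-point data (the grid payload), while the message header and
  // Charm++ envelope before it go through unmodified.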
  void setup_ungrid_persistent()
  {
    ungrid_handle = (PersistentHandle*) malloc( sizeof(PersistentHandle) * grid_msgs.size());
    int limsg;
    for ( limsg=0; limsg < grid_msgs.size(); ++limsg ) {
      int peer = grid_msgs[limsg]->sourceNode;
      // ...
    }
  }
  PmeYPencil_SDAG_CODE
  // ...
#ifdef NAMD_FFTW_3
  fftwf_plan forward_plan, backward_plan;
#else
  fftw_plan forward_plan, backward_plan;
#endif
  void setup_persistent() {
    // ...
    trans_handle = (PersistentHandle*) malloc( sizeof(PersistentHandle) * yBlocks);
    for ( int isend=0; isend<yBlocks; ++isend ) {
      int jb = send_order[isend];
      int ny1 = block2;
      if ( (jb+1)*block2 > K2 ) ny1 = K2 - jb*block2;
      int peer = xPencil_local->homePe(CkArrayIndex3D(0, jb, thisIndex.z));
      int size = sizeof(PmeTransMsg) + sizeof(float)*hd*nx*ny1*nz*2
          + sizeof(envelope) + PRIORITY_SIZE/8+24;
      int compress_start = sizeof(PmeTransMsg) + sizeof(envelope);
      int compress_size = sizeof(float)*hd*nx*ny1*nz*2;
      trans_handle[isend] = CmiCreateCompressPersistentSize(peer, size,
          compress_start, compress_size, CMI_FLOATING);
    }

    untrans_handle = (PersistentHandle*) malloc( sizeof(PersistentHandle) * yBlocks);
    for ( int isend=0; isend<yBlocks; ++isend ) {
      int jb = send_order[isend];
      int ny1 = block2;
      if ( (jb+1)*block2 > K2 ) ny1 = K2 - jb*block2;
      int peer = zPencil_local->homePe(CkArrayIndex3D(thisIndex.x, jb, 0));
      int size = sizeof(PmeUntransMsg) + sizeof(float)*nx*ny1*nz*2
          + sizeof(envelope) + PRIORITY_SIZE/8+24;
      int compress_start = sizeof(PmeUntransMsg) + sizeof(envelope);
      int compress_size = sizeof(float)*nx*ny1*nz*2;
      untrans_handle[isend] = CmiCreateCompressPersistentSize(peer, size,
          compress_start, compress_size, CMI_FLOATING);
    }
  }
  PmeXPencil_SDAG_CODE
  // ...
  ~PmeXPencil() {
    // ...
    delete [] forward_plans;
    delete [] backward_plans;
    // ...
  }
  // ...
  fftwf_plan *forward_plans, *backward_plans;
  void setup_persistent() {
    // ...
    untrans_handle = (PersistentHandle*) malloc( sizeof(PersistentHandle) * xBlocks);
    for ( int isend=0; isend<xBlocks; ++isend ) {
      int ib = send_order[isend];
      int nx1 = block1;
      if ( (ib+1)*block1 > K1 ) nx1 = K1 - ib*block1;
      int peer = yPencil_local->procNum(CkArrayIndex3D(ib, 0, thisIndex.z));
      int size = sizeof(PmeUntransMsg) +
          sizeof(float)*nx1*ny*nz*2 + sizeof(envelope) + PRIORITY_SIZE/8+24;
      int compress_start = sizeof(PmeUntransMsg) + sizeof(envelope);
      int compress_size = sizeof(float)*nx1*ny*nz*2;
      untrans_handle[isend] = CmiCreateCompressPersistentSize(peer, size,
          compress_start, compress_size, CMI_FLOATING);
    }
  }
void PmeZPencil::fft_init() {
  CProxy_Node nd(CkpvAccess(BOCclass_group).node);
  Node *node = nd.ckLocalBranch();
  SimParameters *simParams = node->simParameters;

#if USE_NODE_PAR_RECEIVE
  // ...
#endif

  // ...
  nx = block1;
  if ( (thisIndex.x + 1) * block1 > K1 ) nx = K1 - thisIndex.x * block1;
  ny = block2;
  if ( (thisIndex.y + 1) * block2 > K2 ) ny = K2 - thisIndex.y * block2;

  data = new float[nx*ny*dim3];
  work = new float[dim3];
  // ...
#ifdef NAMD_FFTW
#ifdef NAMD_FFTW_3
  int fftwFlags = simParams->FFTWPatient ? FFTW_PATIENT :
                  simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE;
  int sizeLines = nx*ny;
  int planLineSizes[1];
  planLineSizes[0] = K3;
  int ndim = initdata.grid.dim3;  // storage length of each z-line
  int ndimHalf = ndim/2;
  forward_plan = fftwf_plan_many_dft_r2c(1, planLineSizes, sizeLines,
      (float *) data, NULL, 1, ndim,
      (fftwf_complex *) data, NULL, 1, ndimHalf,
      fftwFlags);
  backward_plan = fftwf_plan_many_dft_c2r(1, planLineSizes, sizeLines,
      (fftwf_complex *) data, NULL, 1, ndimHalf,
      (float *) data, NULL, 1, ndim,
      fftwFlags);

#if CMK_SMP && USE_CKLOOP
  if ( simParams->useCkLoop ) {
    // split the lines into enough per-chunk plans to keep the node busy
    numPlans = (nx<=ny?nx:ny);
    if ( numPlans < CkMyNodeSize() ) numPlans = (nx>=ny?nx:ny);
    if ( numPlans < CkMyNodeSize() ) numPlans = sizeLines;
    int howmany = sizeLines/numPlans;
    forward_plans = new fftwf_plan[numPlans];
    backward_plans = new fftwf_plan[numPlans];
    for(int i=0; i<numPlans; i++) {
      int dimStride = i*ndim*howmany;
      int dimHalfStride = i*ndimHalf*howmany;
      forward_plans[i] = fftwf_plan_many_dft_r2c(1, planLineSizes, howmany,
          ((float *) data)+dimStride, NULL, 1, ndim,
          ((fftwf_complex *) data)+dimHalfStride, NULL, 1, ndimHalf,
          fftwFlags);
      backward_plans[i] = fftwf_plan_many_dft_c2r(1, planLineSizes, howmany,
          ((fftwf_complex *) data)+dimHalfStride, NULL, 1, ndimHalf,
          ((float *) data)+dimStride, NULL, 1, ndim,
          fftwFlags);
    }
  } else
#endif
  {
    forward_plans = NULL;
    backward_plans = NULL;
  }
#else
  forward_plan = rfftwnd_create_plan_specific(1, &K3, FFTW_REAL_TO_COMPLEX,
      ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
      | FFTW_IN_PLACE | FFTW_USE_WISDOM, data, 1, work, 1);
  backward_plan = rfftwnd_create_plan_specific(1, &K3, FFTW_COMPLEX_TO_REAL,
      ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
      | FFTW_IN_PLACE | FFTW_USE_WISDOM, data, 1, work, 1);
#endif
#else
  NAMD_die("Sorry, FFTW must be compiled in to use PME.");
#endif

#if USE_NODE_PAR_RECEIVE
  memset(data, 0, sizeof(float) * nx*ny*dim3);
#endif
}
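// Layout note for the r2c/c2r plans above: dim3 is evidently K3 padded so
// that the K3/2+1 complex outputs (ndimHalf = dim3/2 complex values, two
// floats each) fit in place over the real input; hence the real stride
// ndim and the complex stride ndimHalf between consecutive z-lines.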
void PmeYPencil::fft_init() {
  CProxy_Node nd(CkpvAccess(BOCclass_group).node);
  Node *node = nd.ckLocalBranch();
  SimParameters *simParams = node->simParameters;

#if USE_NODE_PAR_RECEIVE
  // ...
#endif

  // ...
  nx = block1;
  if ( (thisIndex.x + 1) * block1 > K1 ) nx = K1 - thisIndex.x * block1;
  nz = block3;
  if ( (thisIndex.z+1)*block3 > dim3/2 ) nz = dim3/2 - thisIndex.z*block3;

  work = new float[2*K2];
  // ...
#ifdef NAMD_FFTW
#ifdef NAMD_FFTW_3
  int sizeLines = nz;
  int planLineSizes[1];
  planLineSizes[0] = K2;
  int fftwFlags = simParams->FFTWPatient ? FFTW_PATIENT :
                  simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE;
  forward_plan = fftwf_plan_many_dft(1, planLineSizes, sizeLines,
      (fftwf_complex *) data, NULL, sizeLines, 1,
      (fftwf_complex *) data, NULL, sizeLines, 1,
      FFTW_FORWARD, fftwFlags);
  backward_plan = fftwf_plan_many_dft(1, planLineSizes, sizeLines,
      (fftwf_complex *) data, NULL, sizeLines, 1,
      (fftwf_complex *) data, NULL, sizeLines, 1,
      FFTW_BACKWARD, fftwFlags);
  CkAssert(forward_plan != NULL);
  CkAssert(backward_plan != NULL);
#else
  forward_plan = fftw_create_plan_specific(K2, FFTW_FORWARD,
      ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
      | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) data,
      nz, (fftw_complex *) work, 1);
  backward_plan = fftw_create_plan_specific(K2, FFTW_BACKWARD,
      ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
      | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) data,
      nz, (fftw_complex *) work, 1);
#endif
#else
  NAMD_die("Sorry, FFTW must be compiled in to use PME.");
#endif

#if USE_NODE_PAR_RECEIVE
  // ...
  CmiMemoryWriteFence();
#endif
}
  // node_process_trans: atomically count arriving transpose messages
  int limsg;
  CmiMemoryAtomicFetchAndInc(imsg,limsg);
  // ...
  CmiMemoryWriteFence();
  // ...

  // node_process_untrans: sanity-check the msg/hasData pairing
  if ( msg ) {
    if ( ! hasData )
      NAMD_bug("PmeYPencil::node_process_untrans non-null msg but not hasData");
    // ...
  } else if ( hasData )
    NAMD_bug("PmeYPencil::node_process_untrans hasData but null msg");
  CmiMemoryAtomicFetchAndInc(imsgb,limsg);
  // ...
  CmiMemoryWriteFence();
#define DEBUG_NODE_PAR_RECV 0

#if DEBUG_NODE_PAR_RECV
  if ( xpencil == NULL )
    CkAbort("xpencil in recvXTrans not found, debug registration");
#endif
  // ...
#if DEBUG_NODE_PAR_RECV
  if ( ypencil == NULL )
    CkAbort("ypencil in recvYTrans not found, debug registration");
#endif
  // ...
#if DEBUG_NODE_PAR_RECV
  if ( ypencil == NULL )
    CkAbort("ypencil in recvYUntrans not found, debug registration");
#endif
  // ...
#if DEBUG_NODE_PAR_RECV
  if ( zpencil == NULL )
    CkAbort("zpencil in recvZUntrans not found, debug registration");
#endif
  // ...
#if DEBUG_NODE_PAR_RECV
  if ( zpencil == NULL )
    CkAbort("zpencil in recvZGrid not found, debug registration");
#endif
void PmeXPencil::fft_init() {
  CProxy_Node nd(CkpvAccess(BOCclass_group).node);
  Node *node = nd.ckLocalBranch();
  SimParameters *simParams = node->simParameters;

#if USE_NODE_PAR_RECEIVE
  // ...
#endif

  // ...
  ny = block2;
  if ( (thisIndex.y + 1) * block2 > K2 ) ny = K2 - thisIndex.y * block2;
  nz = block3;
  if ( (thisIndex.z+1)*block3 > dim3/2 ) nz = dim3/2 - thisIndex.z*block3;

  work = new float[2*K1];
  // ...
#ifdef NAMD_FFTW
#ifdef NAMD_FFTW_3
  int fftwFlags = simParams->FFTWPatient ? FFTW_PATIENT :
                  simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE;
  int sizeLines = ny*nz;
  int planLineSizes[1];
  planLineSizes[0] = K1;
  forward_plan = fftwf_plan_many_dft(1, planLineSizes, sizeLines,
      (fftwf_complex *) data, NULL, sizeLines, 1,
      (fftwf_complex *) data, NULL, sizeLines, 1,
      FFTW_FORWARD, fftwFlags);
  backward_plan = fftwf_plan_many_dft(1, planLineSizes, sizeLines,
      (fftwf_complex *) data, NULL, sizeLines, 1,
      (fftwf_complex *) data, NULL, sizeLines, 1,
      FFTW_BACKWARD, fftwFlags);

#if CMK_SMP && USE_CKLOOP
  if ( simParams->useCkLoop ) {
    // split the ny*nz lines into per-chunk plans, as in PmeZPencil
    numPlans = (ny<=nz?ny:nz);
    // ...
    if ( sizeLines/numPlans < 4 ) numPlans = 1;
    int howmany = sizeLines/numPlans;
    forward_plans = new fftwf_plan[numPlans];
    backward_plans = new fftwf_plan[numPlans];
    for(int i=0; i<numPlans; i++) {
      int curStride = i*howmany;
      forward_plans[i] = fftwf_plan_many_dft(1, planLineSizes, howmany,
          ((fftwf_complex *) data)+curStride, NULL, sizeLines, 1,
          ((fftwf_complex *) data)+curStride, NULL, sizeLines, 1,
          FFTW_FORWARD, fftwFlags);
      backward_plans[i] = fftwf_plan_many_dft(1, planLineSizes, howmany,
          ((fftwf_complex *) data)+curStride, NULL, sizeLines, 1,
          ((fftwf_complex *) data)+curStride, NULL, sizeLines, 1,
          FFTW_BACKWARD, fftwFlags);
    }
  } else
#endif
  {
    forward_plans = NULL;
    backward_plans = NULL;
  }
#else
  forward_plan = fftw_create_plan_specific(K1, FFTW_FORWARD,
      ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
      | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) data,
      ny*nz, (fftw_complex *) work, 1);
  backward_plan = fftw_create_plan_specific(K1, FFTW_BACKWARD,
      ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
      | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) data,
      ny*nz, (fftw_complex *) work, 1);
#endif
#else
  NAMD_die("Sorry, FFTW must be compiled in to use PME.");
#endif

  myKSpace = new PmeKSpace(initdata.grid,
      thisIndex.y*block2, thisIndex.y*block2 + ny,
      thisIndex.z*block3, thisIndex.z*block3 + nz);
}
#if ! USE_NODE_PAR_RECEIVE
  memset(data, 0, sizeof(float)*nx*ny*dim3);
#endif

  int zlistlen = msg->zlistlen;
#ifdef __INTEL_COMPILER
  // copy zlist into cache-aligned scratch so the compiler can vectorize
  int * __restrict msg_zlist = msg->zlist;
  int * __restrict zlist = work_zlist.begin();
  __assume_aligned(zlist,64);
  for ( int k=0; k<zlistlen; ++k ) {
    zlist[k] = msg_zlist[k];
  }
#else
  int * __restrict zlist = msg->zlist;
#endif
  char * __restrict fmsg = msg->fgrid;
  float * __restrict qmsg = msg->qgrid;
  float * __restrict d = data;
  int numGrids = 1;  // pencil FFT doesn't support multiple grids
  for ( int g=0; g<numGrids; ++g ) {
    for ( int i=0; i<nx; ++i ) {
      for ( int j=0; j<ny; ++j, d += dim3 ) {
        if ( *(fmsg++) ) {
          for ( int k=0; k<zlistlen; ++k ) {
            d[zlist[k]] += *(qmsg++);
          }
        }
      }
    }
  }
static inline void PmeXZPencilFFT(int first, int last, void *result,
                                  int paraNum, void *param){
  fftwf_plan *plans = (fftwf_plan *)param;
  for(int i=first; i<=last; i++) fftwf_execute(plans[i]);
}
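// CkLoop worker signature: CkLoop_Parallelize() below hands each node-level
// worker thread a [first,last] chunk of the per-chunk FFTW plans created in
// fft_init, so one logical pencil FFT runs on all pes of an SMP node.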
  float std_base = 100. * (thisIndex.x+1.) + 10. * (thisIndex.y+1.);
  float *d = data;
  for ( int i=0; i<nx; ++i ) {
    for ( int j=0; j<ny; ++j, d += dim3 ) {
      for ( int k=0; k<dim3; ++k ) {
        d[k] = 10. * (10. * (10. * std_base + i) + j) + k;
      }
    }
  }
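  // debug fill: each element encodes its pencil (x,y) and local (i,j,k) in
  // its decimal digits, so a misrouted value can be traced by inspection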
#ifdef MANUAL_DEBUG_FFTW3
  dumpMatrixFloat3("fw_z_b", data, nx, ny, initdata.grid.dim3,
                   thisIndex.x, thisIndex.y, thisIndex.z);
#endif
#ifdef NAMD_FFTW_3
#if CMK_SMP && USE_CKLOOP
  if ( forward_plans ) {
    CkLoop_Parallelize(PmeXZPencilFFT, 1, (void *)forward_plans,
                       CkMyNodeSize(), 0, numPlans-1);
  } else
#endif
  {
    fftwf_execute(forward_plan);
  }
#else
  rfftwnd_real_to_complex(forward_plan, nx*ny,
      data, 1, initdata.grid.dim3, /* out args ignored for in-place */ NULL, 0, 0);
#endif
#ifdef MANUAL_DEBUG_FFTW3
  dumpMatrixFloat3("fw_z_a", data, nx, ny, initdata.grid.dim3,
                   thisIndex.x, thisIndex.y, thisIndex.z);
#endif
#ifdef ZEROCHECK
  float *d = data;
  for ( int i=0; i<nx; ++i ) {
    for ( int j=0; j<ny; ++j, d += dim3 ) {
      for ( int k=0; k<dim3; ++k ) {
        if ( d[k] == 0. ) CkPrintf("0 in Z at %d %d %d %d %d %d %d %d\n",
            thisIndex.x, thisIndex.y, i, j, k, nx, ny, dim3);
      }
    }
  }
#endif
  for ( int isend=fromIdx; isend<=toIdx; ++isend ) {
    int kb = send_order[isend];
    int nz = block3;
    if ( (kb+1)*block3 > dim3/2 ) nz = dim3/2 - kb*block3;
    int hd = hasData ? 1 : 0;
    PmeTransMsg *msg = new (hd*nx*ny*nz*2, PRIORITY_SIZE) PmeTransMsg;
    // ...
    if ( hasData ) {
      float *md = msg->qgrid;
      const float *d = data;
      for ( int i=0; i<nx; ++i ) {
        for ( int j=0; j<ny; ++j, d += dim3 ) {
          for ( int k=kb*block3; k<(kb*block3+nz); ++k ) {
            *(md++) = d[2*k];
            *(md++) = d[2*k+1];
          }
        }
      }
    }
    // ...
    CmiEnableUrgentSend(1);
#if USE_NODE_PAR_RECEIVE
    msg->destElem = CkArrayIndex3D(thisIndex.x,0,kb);
    CmiUsePersistentHandle(&trans_handle[isend], 1);
    // ... send via node-level receiver ...
    CmiUsePersistentHandle(NULL, 0);
#else
    CmiUsePersistentHandle(&trans_handle[isend], 1);
    // ... send to y pencil (thisIndex.x, 0, kb) ...
    CmiUsePersistentHandle(NULL, 0);
#endif
    CmiEnableUrgentSend(0);
  }
#if USE_PERSISTENT
  if (trans_handle == NULL) setup_persistent();
#endif
#if CMK_SMP && USE_CKLOOP
  // ... (optional CkLoop parallelization of the per-block sends)
#endif
  for ( int isend=0; isend<zBlocks; ++isend ) {
    int kb = send_order[isend];
    int nz = block3;
    if ( (kb+1)*block3 > dim3/2 ) nz = dim3/2 - kb*block3;
    int hd = hasData ? 1 : 0;
    PmeTransMsg *msg = new (hd*nx*ny*nz*2, PRIORITY_SIZE) PmeTransMsg;
    // ...
    if ( hasData ) {
      float *md = msg->qgrid;
      const float *d = data;
      for ( int i=0; i<nx; ++i ) {
        for ( int j=0; j<ny; ++j, d += dim3 ) {
          for ( int k=kb*block3; k<(kb*block3+nz); ++k ) {
            *(md++) = d[2*k];
            *(md++) = d[2*k+1];
          }
        }
      }
    }
    // ...
    CmiEnableUrgentSend(1);
#if USE_NODE_PAR_RECEIVE
    msg->destElem = CkArrayIndex3D(thisIndex.x,0,kb);
    CmiUsePersistentHandle(&trans_handle[isend], 1);
    // ... send via node-level receiver ...
    CmiUsePersistentHandle(NULL, 0);
#else
    CmiUsePersistentHandle(&trans_handle[isend], 1);
    // ... send to y pencil ...
    CmiUsePersistentHandle(NULL, 0);
#endif
    CmiEnableUrgentSend(0);
  }
  const float *md = msg->qgrid;
  float *d = data;
  if ( msg->hasData ) {
    for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
      for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
        for ( int k=0; k<nz; ++k ) {
#ifdef ZEROCHECK
          if ( (*md) == 0. ) CkPrintf("0 in ZY at %d %d %d %d %d %d %d %d %d\n",
              thisIndex.x, jb, thisIndex.z, i, j, k, nx, ny, nz);
#endif
          d[2*(j*nz+k)] = *(md++);
          d[2*(j*nz+k)+1] = *(md++);
        }
      }
    }
  } else {
    for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
      for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
        for ( int k=0; k<nz; ++k ) {
          d[2*(j*nz+k)] = 0;
          d[2*(j*nz+k)+1] = 0;
        }
      }
    }
  }
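  // A source pencil with hasData == false still participates in the
  // transpose so the dataflow stays uniform; its region is zero-filled
  // rather than copied, leaving later FFT stages oblivious to which
  // sources were empty.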
  // forward_fft_subset: CkLoop chunk over x-slabs
  for(int i=fromIdx; i<=toIdx; i++){
    fftwf_execute_dft(forward_plan,
        ((fftwf_complex *) data) + i * initdata.grid.K2 * nz,
        ((fftwf_complex *) data) + i * initdata.grid.K2 * nz);
  }
  // ...

#ifdef MANUAL_DEBUG_FFTW3
  dumpMatrixFloat3("fw_y_b", data, nx, initdata.grid.K2, nz,
                   thisIndex.x, thisIndex.y, thisIndex.z);
#endif
#ifdef NAMD_FFTW_3
#if CMK_SMP && USE_CKLOOP
  // ... (CkLoop path dispatches the per-slab transforms above)
#endif
  for ( int i=0; i<nx; ++i ) {
    fftwf_execute_dft(forward_plan,
        ((fftwf_complex *) data) + i * initdata.grid.K2 * nz,
        ((fftwf_complex *) data) + i * initdata.grid.K2 * nz);
  }
#else
  for ( int i=0; i<nx; ++i ) {
    fftw(forward_plan, nz,
        ((fftw_complex *) data) + i * initdata.grid.K2 * nz,
        nz, 1, (fftw_complex *) work, 1, 0);
  }
#endif
#ifdef MANUAL_DEBUG_FFTW3
  dumpMatrixFloat3("fw_y_a", data, nx, initdata.grid.dim2, nz,
                   thisIndex.x, thisIndex.y, thisIndex.z);
#endif
  for ( int isend=fromIdx; isend<=toIdx; ++isend ) {
    int jb = send_order[isend];
    int ny = block2;
    if ( (jb+1)*block2 > K2 ) ny = K2 - jb*block2;
    // ...
    if ( hasData ) {
      float *md = msg->qgrid;
      const float *d = data;
      for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
        for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
          for ( int k=0; k<nz; ++k ) {
            *(md++) = d[2*(j*nz+k)];
            *(md++) = d[2*(j*nz+k)+1];
#ifdef ZEROCHECK
            if ( *(md-2) == 0. ) CkPrintf("send 0 in YX at %d %d %d %d %d %d %d %d %d\n",
                thisIndex.x, jb, thisIndex.z, i, j, k, nx, ny, nz);
#endif
          }
        }
      }
      if ( md != msg->qgrid + nx*ny*nz*2 ) CkPrintf("error in YX at %d %d %d\n",
          thisIndex.x, jb, thisIndex.z);
    }
    // ...
    CmiEnableUrgentSend(1);
#if USE_NODE_PAR_RECEIVE
    msg->destElem = CkArrayIndex3D(0,jb,thisIndex.z);
    CmiUsePersistentHandle(&trans_handle[isend], 1);
    // ... send via node-level receiver ...
    CmiUsePersistentHandle(NULL, 0);
#else
    CmiUsePersistentHandle(&trans_handle[isend], 1);
    // ... send to x pencil (0, jb, thisIndex.z) ...
    CmiUsePersistentHandle(NULL, 0);
#endif
    CmiEnableUrgentSend(0);
  }
#if USE_PERSISTENT
  if (trans_handle == NULL) setup_persistent();
#endif
#if CMK_SMP && USE_CKLOOP
  // ... (optional CkLoop parallelization of the sends)
#endif
  for ( int isend=0; isend<yBlocks; ++isend ) {
    int jb = send_order[isend];
    int ny = block2;
    if ( (jb+1)*block2 > K2 ) ny = K2 - jb*block2;
    // ...
    if ( hasData ) {
      float *md = msg->qgrid;
      const float *d = data;
      for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
        for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
          for ( int k=0; k<nz; ++k ) {
            *(md++) = d[2*(j*nz+k)];
            *(md++) = d[2*(j*nz+k)+1];
#ifdef ZEROCHECK
            if ( *(md-2) == 0. ) CkPrintf("send 0 in YX at %d %d %d %d %d %d %d %d %d\n",
                thisIndex.x, jb, thisIndex.z, i, j, k, nx, ny, nz);
#endif
          }
        }
      }
      if ( md != msg->qgrid + nx*ny*nz*2 ) CkPrintf("error in YX at %d %d %d\n",
          thisIndex.x, jb, thisIndex.z);
    }
    // ...
    CmiEnableUrgentSend(1);
#if USE_NODE_PAR_RECEIVE
    msg->destElem = CkArrayIndex3D(0,jb,thisIndex.z);
    CmiUsePersistentHandle(&trans_handle[isend], 1);
    // ... send via node-level receiver ...
    CmiUsePersistentHandle(NULL, 0);
#else
    CmiUsePersistentHandle(&trans_handle[isend], 1);
    // ... send to x pencil ...
    CmiUsePersistentHandle(NULL, 0);
#endif
    CmiEnableUrgentSend(0);
  }
  int limsg;
  CmiMemoryAtomicFetchAndInc(imsg,limsg);
  // ...
  CmiMemoryWriteFence();
  const float *md = msg->qgrid;
  if ( msg->hasData ) {
    for ( int i=ib*block1; i<(ib*block1+nx); ++i ) {
      float *d = data + i*ny*nz*2;
      for ( int j=0; j<ny; ++j, d += nz*2 ) {
        for ( int k=0; k<nz; ++k ) {
#ifdef ZEROCHECK
          if ( (*md) == 0. ) CkPrintf("0 in YX at %d %d %d %d %d %d %d %d %d\n",
              ib, thisIndex.y, thisIndex.z, i, j, k, nx, ny, nz);
#endif
          d[2*k] = *(md++);
          d[2*k+1] = *(md++);
        }
      }
    }
  } else {
    for ( int i=ib*block1; i<(ib*block1+nx); ++i ) {
      float *d = data + i*ny*nz*2;
      for ( int j=0; j<ny; ++j, d += nz*2 ) {
        for ( int k=0; k<nz; ++k ) {
          d[2*k] = 0;
          d[2*k+1] = 0;
        }
      }
    }
  }
#ifdef MANUAL_DEBUG_FFTW3
  dumpMatrixFloat3("fw_x_b", data, initdata.grid.K1, ny, nz,
                   thisIndex.x, thisIndex.y, thisIndex.z);
#endif
#ifdef NAMD_FFTW_3
#if CMK_SMP && USE_CKLOOP
  if ( forward_plans ) {
    CkLoop_Parallelize(PmeXZPencilFFT, 1, (void *)forward_plans,
                       CkMyNodeSize(), 0, numPlans-1);
  } else
#endif
  {
    fftwf_execute(forward_plan);
  }
#else
  fftw(forward_plan, ny*nz,
      ((fftw_complex *) data), ny*nz, 1, (fftw_complex *) work, 1, 0);
#endif
#ifdef MANUAL_DEBUG_FFTW3
  dumpMatrixFloat3("fw_x_a", data, initdata.grid.K1, ny, nz,
                   thisIndex.x, thisIndex.y, thisIndex.z);
#endif
#if CMK_SMP && USE_CKLOOP
  // ... (optional CkLoop parallelization of the k-space sum)
#endif
  for ( int g=0; g<numGrids; ++g ) {
    // ... evir[g] accumulated via myKSpace->compute_energy(...)
  }
#if USE_NODE_PAR_RECEIVE
  CmiMemoryWriteFence();
#endif
#ifdef MANUAL_DEBUG_FFTW3
  dumpMatrixFloat3("bw_x_b", data, initdata.grid.K1, ny, nz,
                   thisIndex.x, thisIndex.y, thisIndex.z);
#endif
#ifdef NAMD_FFTW_3
#if CMK_SMP && USE_CKLOOP
  if ( backward_plans ) {
    CkLoop_Parallelize(PmeXZPencilFFT, 1, (void *)backward_plans,
                       CkMyNodeSize(), 0, numPlans-1);
  } else
#endif
  {
    fftwf_execute(backward_plan);
  }
#else
  fftw(backward_plan, ny*nz,
      ((fftw_complex *) data), ny*nz, 1, (fftw_complex *) work, 1, 0);
#endif
#ifdef MANUAL_DEBUG_FFTW3
  dumpMatrixFloat3("bw_x_a", data, initdata.grid.K1, ny, nz,
                   thisIndex.x, thisIndex.y, thisIndex.z);
#endif