9#pragma message "Requested to compile LJFunctorAVX but AVX is not available!"
42template <
class Particle_T,
bool applyShift =
false,
bool useMixing =
false,
44 bool countFLOPs =
false,
bool relevantForTuning =
true>
47 calculateGlobals, countFLOPs, relevantForTuning>> {
48 using SoAArraysType =
typename Particle_T::SoAArraysType;
65 calculateGlobals, countFLOPs, relevantForTuning>>(cutoff),
66 _cutoffSquared{_mm256_set1_pd(cutoff * cutoff)},
67 _cutoffSquaredAoS(cutoff * cutoff),
68 _potentialEnergySum{0.},
69 _virialSum{0., 0., 0.},
71 _postProcessed{
false} {
72 if (calculateGlobals) {
75 if constexpr (countFLOPs) {
76 AutoPasLog(DEBUG,
"Using LJFunctorAVX with countFLOPs but FLOP counting is not implemented.");
81 relevantForTuning>>(cutoff) {
95 static_assert(not useMixing,
96 "Mixing without a ParticlePropertiesLibrary is not possible! Use a different constructor or set "
108 static_assert(useMixing,
109 "Not using Mixing but using a ParticlePropertiesLibrary is not allowed! Use a different constructor "
110 "or set mixing to true.");
111 _PPLibrary = &particlePropertiesLibrary;
114 std::string
getName() final {
return "LJFunctorAVX"; }
119 return useNewton3 == autopas::FunctorN3Modes::Newton3Only or useNewton3 == autopas::FunctorN3Modes::Both;
123 return useNewton3 == autopas::FunctorN3Modes::Newton3Off or useNewton3 == autopas::FunctorN3Modes::Both;
126 inline void AoSFunctor(Particle_T &i, Particle_T &j,
bool newton3)
final {
127 using namespace autopas::utils::ArrayMath::literals;
128 if (i.isDummy() or j.isDummy()) {
131 auto sigmaSquared = _sigmaSquaredAoS;
132 auto epsilon24 = _epsilon24AoS;
133 auto shift6 = _shift6AoS;
134 if constexpr (useMixing) {
137 if constexpr (applyShift) {
141 auto dr = i.getR() - j.getR();
144 if (dr2 > _cutoffSquaredAoS) {
148 double invdr2 = 1. / dr2;
149 double lj6 = sigmaSquared * invdr2;
150 lj6 = lj6 * lj6 * lj6;
151 double lj12 = lj6 * lj6;
152 double lj12m6 = lj12 - lj6;
153 double fac = epsilon24 * (lj12 + lj12m6) * invdr2;
160 if (calculateGlobals) {
164 auto virial = dr * f;
165 double potentialEnergy6 = epsilon24 * lj12m6 + shift6;
169 _aosThreadData[threadnum].potentialEnergySum += potentialEnergy6;
170 _aosThreadData[threadnum].virialSum += virial;
173 if (newton3 and j.isOwned()) {
174 _aosThreadData[threadnum].potentialEnergySum += potentialEnergy6;
175 _aosThreadData[threadnum].virialSum += virial;
192 const bool newton3)
final {
194 SoAFunctorPairImpl<true>(soa1, soa2);
196 SoAFunctorPairImpl<false>(soa1, soa2);
207 if (soa.
size() == 0)
return;
209 const auto *
const __restrict xptr = soa.template begin<Particle_T::AttributeNames::posX>();
210 const auto *
const __restrict yptr = soa.template begin<Particle_T::AttributeNames::posY>();
211 const auto *
const __restrict zptr = soa.template begin<Particle_T::AttributeNames::posZ>();
213 const auto *
const __restrict ownedStatePtr = soa.template begin<Particle_T::AttributeNames::ownershipState>();
215 auto *
const __restrict fxptr = soa.template begin<Particle_T::AttributeNames::forceX>();
216 auto *
const __restrict fyptr = soa.template begin<Particle_T::AttributeNames::forceY>();
217 auto *
const __restrict fzptr = soa.template begin<Particle_T::AttributeNames::forceZ>();
219 const auto *
const __restrict typeIDptr = soa.template begin<Particle_T::AttributeNames::typeId>();
221 __m256d virialSumX = _mm256_setzero_pd();
222 __m256d virialSumY = _mm256_setzero_pd();
223 __m256d virialSumZ = _mm256_setzero_pd();
224 __m256d potentialEnergySum = _mm256_setzero_pd();
228 for (
size_t i = soa.
size() - 1; (long)i >= 0; --i) {
234 static_assert(std::is_same_v<std::underlying_type_t<autopas::OwnershipState>, int64_t>,
235 "OwnershipStates underlying type should be int64_t!");
238 __m256i ownedStateI = _mm256_set1_epi64x(
static_cast<int64_t
>(ownedStatePtr[i]));
240 __m256d fxacc = _mm256_setzero_pd();
241 __m256d fyacc = _mm256_setzero_pd();
242 __m256d fzacc = _mm256_setzero_pd();
244 const __m256d x1 = _mm256_broadcast_sd(&xptr[i]);
245 const __m256d y1 = _mm256_broadcast_sd(&yptr[i]);
246 const __m256d z1 = _mm256_broadcast_sd(&zptr[i]);
252 for (; j < (i & ~(vecLength - 1)); j += 4) {
253 SoAKernel<true, false>(j, ownedStateI,
reinterpret_cast<const int64_t *
>(ownedStatePtr), x1, y1, z1, xptr, yptr,
254 zptr, fxptr, fyptr, fzptr, &typeIDptr[i], &typeIDptr[j], fxacc, fyacc, fzacc,
255 &virialSumX, &virialSumY, &virialSumZ, &potentialEnergySum, 0);
259 const int rest = (int)(i & (vecLength - 1));
261 SoAKernel<true, true>(j, ownedStateI,
reinterpret_cast<const int64_t *
>(ownedStatePtr), x1, y1, z1, xptr, yptr,
262 zptr, fxptr, fyptr, fzptr, &typeIDptr[i], &typeIDptr[j], fxacc, fyacc, fzacc, &virialSumX,
263 &virialSumY, &virialSumZ, &potentialEnergySum, rest);
267 const __m256d hSumfxfy = _mm256_hadd_pd(fxacc, fyacc);
268 const __m256d hSumfz = _mm256_hadd_pd(fzacc, fzacc);
270 const __m128d hSumfxfyLow = _mm256_extractf128_pd(hSumfxfy, 0);
271 const __m128d hSumfzLow = _mm256_extractf128_pd(hSumfz, 0);
273 const __m128d hSumfxfyHigh = _mm256_extractf128_pd(hSumfxfy, 1);
274 const __m128d hSumfzHigh = _mm256_extractf128_pd(hSumfz, 1);
276 const __m128d sumfxfyVEC = _mm_add_pd(hSumfxfyLow, hSumfxfyHigh);
277 const __m128d sumfzVEC = _mm_add_pd(hSumfzLow, hSumfzHigh);
279 const double sumfx = sumfxfyVEC[0];
280 const double sumfy = sumfxfyVEC[1];
281 const double sumfz = _mm_cvtsd_f64(sumfzVEC);
288 if constexpr (calculateGlobals) {
292 const __m256d hSumVirialxy = _mm256_hadd_pd(virialSumX, virialSumY);
293 const __m128d hSumVirialxyLow = _mm256_extractf128_pd(hSumVirialxy, 0);
294 const __m128d hSumVirialxyHigh = _mm256_extractf128_pd(hSumVirialxy, 1);
295 const __m128d hSumVirialxyVec = _mm_add_pd(hSumVirialxyHigh, hSumVirialxyLow);
298 const __m256d hSumVirialzPotentialEnergy = _mm256_hadd_pd(virialSumZ, potentialEnergySum);
299 const __m128d hSumVirialzPotentialEnergyLow = _mm256_extractf128_pd(hSumVirialzPotentialEnergy, 0);
300 const __m128d hSumVirialzPotentialEnergyHigh = _mm256_extractf128_pd(hSumVirialzPotentialEnergy, 1);
301 const __m128d hSumVirialzPotentialEnergyVec =
302 _mm_add_pd(hSumVirialzPotentialEnergyHigh, hSumVirialzPotentialEnergyLow);
306 _mm_store_pd(&globals[0], hSumVirialxyVec);
307 _mm_store_pd(&globals[2], hSumVirialzPotentialEnergyVec);
309 _aosThreadData[threadnum].virialSum[0] += globals[0];
310 _aosThreadData[threadnum].virialSum[1] += globals[1];
311 _aosThreadData[threadnum].virialSum[2] += globals[2];
312 _aosThreadData[threadnum].potentialEnergySum += globals[3];
317 template <
bool newton3>
320 if (soa1.
size() == 0 || soa2.
size() == 0)
return;
322 const auto *
const __restrict x1ptr = soa1.template begin<Particle_T::AttributeNames::posX>();
323 const auto *
const __restrict y1ptr = soa1.template begin<Particle_T::AttributeNames::posY>();
324 const auto *
const __restrict z1ptr = soa1.template begin<Particle_T::AttributeNames::posZ>();
325 const auto *
const __restrict x2ptr = soa2.template begin<Particle_T::AttributeNames::posX>();
326 const auto *
const __restrict y2ptr = soa2.template begin<Particle_T::AttributeNames::posY>();
327 const auto *
const __restrict z2ptr = soa2.template begin<Particle_T::AttributeNames::posZ>();
329 const auto *
const __restrict ownedStatePtr1 = soa1.template begin<Particle_T::AttributeNames::ownershipState>();
330 const auto *
const __restrict ownedStatePtr2 = soa2.template begin<Particle_T::AttributeNames::ownershipState>();
332 auto *
const __restrict fx1ptr = soa1.template begin<Particle_T::AttributeNames::forceX>();
333 auto *
const __restrict fy1ptr = soa1.template begin<Particle_T::AttributeNames::forceY>();
334 auto *
const __restrict fz1ptr = soa1.template begin<Particle_T::AttributeNames::forceZ>();
335 auto *
const __restrict fx2ptr = soa2.template begin<Particle_T::AttributeNames::forceX>();
336 auto *
const __restrict fy2ptr = soa2.template begin<Particle_T::AttributeNames::forceY>();
337 auto *
const __restrict fz2ptr = soa2.template begin<Particle_T::AttributeNames::forceZ>();
339 const auto *
const __restrict typeID1ptr = soa1.template begin<Particle_T::AttributeNames::typeId>();
340 const auto *
const __restrict typeID2ptr = soa2.template begin<Particle_T::AttributeNames::typeId>();
342 __m256d virialSumX = _mm256_setzero_pd();
343 __m256d virialSumY = _mm256_setzero_pd();
344 __m256d virialSumZ = _mm256_setzero_pd();
345 __m256d potentialEnergySum = _mm256_setzero_pd();
347 for (
unsigned int i = 0; i < soa1.
size(); ++i) {
353 __m256d fxacc = _mm256_setzero_pd();
354 __m256d fyacc = _mm256_setzero_pd();
355 __m256d fzacc = _mm256_setzero_pd();
357 static_assert(std::is_same_v<std::underlying_type_t<autopas::OwnershipState>, int64_t>,
358 "OwnershipStates underlying type should be int64_t!");
361 __m256i ownedStateI = _mm256_set1_epi64x(
static_cast<int64_t
>(ownedStatePtr1[i]));
363 const __m256d x1 = _mm256_broadcast_sd(&x1ptr[i]);
364 const __m256d y1 = _mm256_broadcast_sd(&y1ptr[i]);
365 const __m256d z1 = _mm256_broadcast_sd(&z1ptr[i]);
369 for (; j < (soa2.
size() & ~(vecLength - 1)); j += 4) {
370 SoAKernel<newton3, false>(j, ownedStateI,
reinterpret_cast<const int64_t *
>(ownedStatePtr2), x1, y1, z1, x2ptr,
371 y2ptr, z2ptr, fx2ptr, fy2ptr, fz2ptr, &typeID1ptr[i], &typeID2ptr[j], fxacc, fyacc,
372 fzacc, &virialSumX, &virialSumY, &virialSumZ, &potentialEnergySum, 0);
374 const int rest = (int)(soa2.
size() & (vecLength - 1));
376 SoAKernel<newton3, true>(j, ownedStateI,
reinterpret_cast<const int64_t *
>(ownedStatePtr2), x1, y1, z1, x2ptr,
377 y2ptr, z2ptr, fx2ptr, fy2ptr, fz2ptr, &typeID1ptr[i], &typeID2ptr[j], fxacc, fyacc,
378 fzacc, &virialSumX, &virialSumY, &virialSumZ, &potentialEnergySum, rest);
381 const __m256d hSumfxfy = _mm256_hadd_pd(fxacc, fyacc);
382 const __m256d hSumfz = _mm256_hadd_pd(fzacc, fzacc);
384 const __m128d hSumfxfyLow = _mm256_extractf128_pd(hSumfxfy, 0);
385 const __m128d hSumfzLow = _mm256_extractf128_pd(hSumfz, 0);
387 const __m128d hSumfxfyHigh = _mm256_extractf128_pd(hSumfxfy, 1);
388 const __m128d hSumfzHigh = _mm256_extractf128_pd(hSumfz, 1);
390 const __m128d sumfxfyVEC = _mm_add_pd(hSumfxfyLow, hSumfxfyHigh);
391 const __m128d sumfzVEC = _mm_add_pd(hSumfzLow, hSumfzHigh);
393 const double sumfx = sumfxfyVEC[0];
394 const double sumfy = sumfxfyVEC[1];
395 const double sumfz = _mm_cvtsd_f64(sumfzVEC);
402 if constexpr (calculateGlobals) {
406 const __m256d hSumVirialxy = _mm256_hadd_pd(virialSumX, virialSumY);
407 const __m128d hSumVirialxyLow = _mm256_extractf128_pd(hSumVirialxy, 0);
408 const __m128d hSumVirialxyHigh = _mm256_extractf128_pd(hSumVirialxy, 1);
409 const __m128d hSumVirialxyVec = _mm_add_pd(hSumVirialxyHigh, hSumVirialxyLow);
412 const __m256d hSumVirialzPotentialEnergy = _mm256_hadd_pd(virialSumZ, potentialEnergySum);
413 const __m128d hSumVirialzPotentialEnergyLow = _mm256_extractf128_pd(hSumVirialzPotentialEnergy, 0);
414 const __m128d hSumVirialzPotentialEnergyHigh = _mm256_extractf128_pd(hSumVirialzPotentialEnergy, 1);
415 const __m128d hSumVirialzPotentialEnergyVec =
416 _mm_add_pd(hSumVirialzPotentialEnergyHigh, hSumVirialzPotentialEnergyLow);
420 _mm_store_pd(&globals[0], hSumVirialxyVec);
421 _mm_store_pd(&globals[2], hSumVirialzPotentialEnergyVec);
423 _aosThreadData[threadnum].virialSum[0] += globals[0];
424 _aosThreadData[threadnum].virialSum[1] += globals[1];
425 _aosThreadData[threadnum].virialSum[2] += globals[2];
426 _aosThreadData[threadnum].potentialEnergySum += globals[3];
461 template <
bool newton3,
bool remainderIsMasked>
462 inline void SoAKernel(
const size_t j,
const __m256i ownedStateI,
const int64_t *
const __restrict ownedStatePtr2,
463 const __m256d &x1,
const __m256d &y1,
const __m256d &z1,
const double *
const __restrict x2ptr,
464 const double *
const __restrict y2ptr,
const double *
const __restrict z2ptr,
465 double *
const __restrict fx2ptr,
double *
const __restrict fy2ptr,
466 double *
const __restrict fz2ptr,
const size_t *
const typeID1ptr,
const size_t *
const typeID2ptr,
467 __m256d &fxacc, __m256d &fyacc, __m256d &fzacc, __m256d *virialSumX, __m256d *virialSumY,
468 __m256d *virialSumZ, __m256d *potentialEnergySum,
const unsigned int rest = 0) {
469 __m256d epsilon24s = _epsilon24;
470 __m256d sigmaSquareds = _sigmaSquared;
471 __m256d shift6s = _shift6;
474 epsilon24s = _mm256_set_pd(
475 not remainderIsMasked or rest > 3 ? _PPLibrary->
getMixing24Epsilon(*typeID1ptr, *(typeID2ptr + 3)) : 0,
476 not remainderIsMasked or rest > 2 ? _PPLibrary->getMixing24Epsilon(*typeID1ptr, *(typeID2ptr + 2)) : 0,
477 not remainderIsMasked or rest > 1 ? _PPLibrary->getMixing24Epsilon(*typeID1ptr, *(typeID2ptr + 1)) : 0,
478 _PPLibrary->getMixing24Epsilon(*typeID1ptr, *(typeID2ptr + 0)));
479 sigmaSquareds = _mm256_set_pd(
480 not remainderIsMasked or rest > 3 ? _PPLibrary->
getMixingSigmaSquared(*typeID1ptr, *(typeID2ptr + 3)) : 0,
481 not remainderIsMasked or rest > 2 ? _PPLibrary->getMixingSigmaSquared(*typeID1ptr, *(typeID2ptr + 2)) : 0,
482 not remainderIsMasked or rest > 1 ? _PPLibrary->getMixingSigmaSquared(*typeID1ptr, *(typeID2ptr + 1)) : 0,
483 _PPLibrary->getMixingSigmaSquared(*typeID1ptr, *(typeID2ptr + 0)));
484 if constexpr (applyShift) {
485 shift6s = _mm256_set_pd(
486 (not remainderIsMasked or rest > 3) ? _PPLibrary->
getMixingShift6(*typeID1ptr, *(typeID2ptr + 3)) : 0,
487 (not remainderIsMasked or rest > 2) ? _PPLibrary->getMixingShift6(*typeID1ptr, *(typeID2ptr + 2)) : 0,
488 (not remainderIsMasked or rest > 1) ? _PPLibrary->getMixingShift6(*typeID1ptr, *(typeID2ptr + 1)) : 0,
489 _PPLibrary->getMixingShift6(*typeID1ptr, *(typeID2ptr + 0)));
493 const __m256d x2 = remainderIsMasked ? _mm256_maskload_pd(&x2ptr[j], _masks[rest - 1]) : _mm256_loadu_pd(&x2ptr[j]);
494 const __m256d y2 = remainderIsMasked ? _mm256_maskload_pd(&y2ptr[j], _masks[rest - 1]) : _mm256_loadu_pd(&y2ptr[j]);
495 const __m256d z2 = remainderIsMasked ? _mm256_maskload_pd(&z2ptr[j], _masks[rest - 1]) : _mm256_loadu_pd(&z2ptr[j]);
497 const __m256d drx = _mm256_sub_pd(x1, x2);
498 const __m256d dry = _mm256_sub_pd(y1, y2);
499 const __m256d drz = _mm256_sub_pd(z1, z2);
501 const __m256d drx2 = _mm256_mul_pd(drx, drx);
502 const __m256d dry2 = _mm256_mul_pd(dry, dry);
503 const __m256d drz2 = _mm256_mul_pd(drz, drz);
505 const __m256d dr2PART = _mm256_add_pd(drx2, dry2);
506 const __m256d dr2 = _mm256_add_pd(dr2PART, drz2);
511 const __m256d cutoffMask = _mm256_cmp_pd(dr2, _cutoffSquared, _CMP_LE_OS);
514 const __m256i ownedStateJ = remainderIsMasked
515 ? _mm256_castpd_si256(_mm256_maskload_pd(
516 reinterpret_cast<double const *
>(&ownedStatePtr2[j]), _masks[rest - 1]))
517 : _mm256_loadu_si256(reinterpret_cast<const __m256i *>(&ownedStatePtr2[j]));
519 const __m256d dummyMask = _mm256_cmp_pd(_mm256_castsi256_pd(ownedStateJ), _zero, _CMP_NEQ_UQ);
520 const __m256d cutoffDummyMask = _mm256_and_pd(cutoffMask, dummyMask);
523 if (_mm256_movemask_pd(cutoffDummyMask) == 0) {
527 const __m256d invdr2 = _mm256_div_pd(_one, dr2);
528 const __m256d lj2 = _mm256_mul_pd(sigmaSquareds, invdr2);
529 const __m256d lj4 = _mm256_mul_pd(lj2, lj2);
530 const __m256d lj6 = _mm256_mul_pd(lj2, lj4);
531 const __m256d lj12 = _mm256_mul_pd(lj6, lj6);
532 const __m256d lj12m6 = _mm256_sub_pd(lj12, lj6);
533 const __m256d lj12m6alj12 = _mm256_add_pd(lj12m6, lj12);
534 const __m256d lj12m6alj12e = _mm256_mul_pd(lj12m6alj12, epsilon24s);
535 const __m256d fac = _mm256_mul_pd(lj12m6alj12e, invdr2);
537 const __m256d facMasked =
538 remainderIsMasked ? _mm256_and_pd(fac, _mm256_and_pd(cutoffDummyMask, _mm256_castsi256_pd(_masks[rest - 1])))
539 : _mm256_and_pd(fac, cutoffDummyMask);
541 const __m256d fx = _mm256_mul_pd(drx, facMasked);
542 const __m256d fy = _mm256_mul_pd(dry, facMasked);
543 const __m256d fz = _mm256_mul_pd(drz, facMasked);
545 fxacc = _mm256_add_pd(fxacc, fx);
546 fyacc = _mm256_add_pd(fyacc, fy);
547 fzacc = _mm256_add_pd(fzacc, fz);
550 if constexpr (newton3) {
552 remainderIsMasked ? _mm256_maskload_pd(&fx2ptr[j], _masks[rest - 1]) : _mm256_loadu_pd(&fx2ptr[j]);
554 remainderIsMasked ? _mm256_maskload_pd(&fy2ptr[j], _masks[rest - 1]) : _mm256_loadu_pd(&fy2ptr[j]);
556 remainderIsMasked ? _mm256_maskload_pd(&fz2ptr[j], _masks[rest - 1]) : _mm256_loadu_pd(&fz2ptr[j]);
558 const __m256d fx2new = _mm256_sub_pd(fx2, fx);
559 const __m256d fy2new = _mm256_sub_pd(fy2, fy);
560 const __m256d fz2new = _mm256_sub_pd(fz2, fz);
562 remainderIsMasked ? _mm256_maskstore_pd(&fx2ptr[j], _masks[rest - 1], fx2new)
563 : _mm256_storeu_pd(&fx2ptr[j], fx2new);
564 remainderIsMasked ? _mm256_maskstore_pd(&fy2ptr[j], _masks[rest - 1], fy2new)
565 : _mm256_storeu_pd(&fy2ptr[j], fy2new);
566 remainderIsMasked ? _mm256_maskstore_pd(&fz2ptr[j], _masks[rest - 1], fz2new)
567 : _mm256_storeu_pd(&fz2ptr[j], fz2new);
570 if constexpr (calculateGlobals) {
572 const __m256d virialX = _mm256_mul_pd(fx, drx);
573 const __m256d virialY = _mm256_mul_pd(fy, dry);
574 const __m256d virialZ = _mm256_mul_pd(fz, drz);
577 const __m256d potentialEnergy = wrapperFMA(epsilon24s, lj12m6, shift6s);
579 const __m256d potentialEnergyMasked =
581 ? _mm256_and_pd(potentialEnergy, _mm256_and_pd(cutoffDummyMask, _mm256_castsi256_pd(_masks[rest - 1])))
582 : _mm256_and_pd(potentialEnergy, cutoffDummyMask);
585 _mm256_cmp_pd(_mm256_castsi256_pd(ownedStateI), _mm256_castsi256_pd(_ownedStateOwnedMM256i), _CMP_EQ_UQ);
586 __m256d energyFactor = _mm256_blendv_pd(_zero, _one, ownedMaskI);
587 if constexpr (newton3) {
589 _mm256_cmp_pd(_mm256_castsi256_pd(ownedStateJ), _mm256_castsi256_pd(_ownedStateOwnedMM256i), _CMP_EQ_UQ);
590 energyFactor = _mm256_add_pd(energyFactor, _mm256_blendv_pd(_zero, _one, ownedMaskJ));
592 *potentialEnergySum = wrapperFMA(energyFactor, potentialEnergyMasked, *potentialEnergySum);
593 *virialSumX = wrapperFMA(energyFactor, virialX, *virialSumX);
594 *virialSumY = wrapperFMA(energyFactor, virialY, *virialSumY);
595 *virialSumZ = wrapperFMA(energyFactor, virialZ, *virialSumZ);
610 bool newton3)
final {
611 if (soa.
size() == 0 or neighborList.empty())
return;
613 SoAFunctorVerletImpl<true>(soa, indexFirst, neighborList);
615 SoAFunctorVerletImpl<false>(soa, indexFirst, neighborList);
620 template <
bool newton3>
624 const auto *
const __restrict ownedStatePtr = soa.template begin<Particle_T::AttributeNames::ownershipState>();
629 const auto *
const __restrict xptr = soa.template begin<Particle_T::AttributeNames::posX>();
630 const auto *
const __restrict yptr = soa.template begin<Particle_T::AttributeNames::posY>();
631 const auto *
const __restrict zptr = soa.template begin<Particle_T::AttributeNames::posZ>();
633 auto *
const __restrict fxptr = soa.template begin<Particle_T::AttributeNames::forceX>();
634 auto *
const __restrict fyptr = soa.template begin<Particle_T::AttributeNames::forceY>();
635 auto *
const __restrict fzptr = soa.template begin<Particle_T::AttributeNames::forceZ>();
637 const auto *
const __restrict typeIDptr = soa.template begin<Particle_T::AttributeNames::typeId>();
640 __m256d virialSumX = _mm256_setzero_pd();
641 __m256d virialSumY = _mm256_setzero_pd();
642 __m256d virialSumZ = _mm256_setzero_pd();
643 __m256d potentialEnergySum = _mm256_setzero_pd();
644 __m256d fxacc = _mm256_setzero_pd();
645 __m256d fyacc = _mm256_setzero_pd();
646 __m256d fzacc = _mm256_setzero_pd();
649 const auto x1 = _mm256_broadcast_sd(&xptr[indexFirst]);
650 const auto y1 = _mm256_broadcast_sd(&yptr[indexFirst]);
651 const auto z1 = _mm256_broadcast_sd(&zptr[indexFirst]);
654 __m256i ownedStateI = _mm256_set1_epi64x(
static_cast<int64_t
>(ownedStatePtr[indexFirst]));
656 alignas(64) std::array<double, vecLength> x2tmp{};
657 alignas(64) std::array<double, vecLength> y2tmp{};
658 alignas(64) std::array<double, vecLength> z2tmp{};
659 alignas(64) std::array<double, vecLength> fx2tmp{};
660 alignas(64) std::array<double, vecLength> fy2tmp{};
661 alignas(64) std::array<double, vecLength> fz2tmp{};
662 alignas(64) std::array<size_t, vecLength> typeID2tmp{};
663 alignas(64) std::array<autopas::OwnershipState, vecLength> ownedStates2tmp{};
672 for (; j < (neighborList.size() & ~(vecLength - 1)); j += vecLength) {
684 for (
size_t vecIndex = 0; vecIndex < vecLength; ++vecIndex) {
685 x2tmp[vecIndex] = xptr[neighborList[j + vecIndex]];
686 y2tmp[vecIndex] = yptr[neighborList[j + vecIndex]];
687 z2tmp[vecIndex] = zptr[neighborList[j + vecIndex]];
688 if constexpr (newton3) {
689 fx2tmp[vecIndex] = fxptr[neighborList[j + vecIndex]];
690 fy2tmp[vecIndex] = fyptr[neighborList[j + vecIndex]];
691 fz2tmp[vecIndex] = fzptr[neighborList[j + vecIndex]];
693 typeID2tmp[vecIndex] = typeIDptr[neighborList[j + vecIndex]];
694 ownedStates2tmp[vecIndex] = ownedStatePtr[neighborList[j + vecIndex]];
697 SoAKernel<newton3, false>(0, ownedStateI,
reinterpret_cast<const int64_t *
>(ownedStates2tmp.data()), x1, y1, z1,
698 x2tmp.data(), y2tmp.data(), z2tmp.data(), fx2tmp.data(), fy2tmp.data(), fz2tmp.data(),
699 &typeIDptr[indexFirst], typeID2tmp.data(), fxacc, fyacc, fzacc, &virialSumX,
700 &virialSumY, &virialSumZ, &potentialEnergySum, 0);
702 if constexpr (newton3) {
703 for (
size_t vecIndex = 0; vecIndex < vecLength; ++vecIndex) {
704 fxptr[neighborList[j + vecIndex]] = fx2tmp[vecIndex];
705 fyptr[neighborList[j + vecIndex]] = fy2tmp[vecIndex];
706 fzptr[neighborList[j + vecIndex]] = fz2tmp[vecIndex];
713 const auto rest =
static_cast<int>(neighborList.size() & (vecLength - 1));
727 for (
size_t vecIndex = 0; vecIndex < rest; ++vecIndex) {
728 x2tmp[vecIndex] = xptr[neighborList[j + vecIndex]];
729 y2tmp[vecIndex] = yptr[neighborList[j + vecIndex]];
730 z2tmp[vecIndex] = zptr[neighborList[j + vecIndex]];
732 if constexpr (newton3) {
733 fx2tmp[vecIndex] = fxptr[neighborList[j + vecIndex]];
734 fy2tmp[vecIndex] = fyptr[neighborList[j + vecIndex]];
735 fz2tmp[vecIndex] = fzptr[neighborList[j + vecIndex]];
737 typeID2tmp[vecIndex] = typeIDptr[neighborList[j + vecIndex]];
738 ownedStates2tmp[vecIndex] = ownedStatePtr[neighborList[j + vecIndex]];
741 SoAKernel<newton3, true>(0, ownedStateI,
reinterpret_cast<const int64_t *
>(ownedStates2tmp.data()), x1, y1, z1,
742 x2tmp.data(), y2tmp.data(), z2tmp.data(), fx2tmp.data(), fy2tmp.data(), fz2tmp.data(),
743 &typeIDptr[indexFirst], typeID2tmp.data(), fxacc, fyacc, fzacc, &virialSumX, &virialSumY,
744 &virialSumZ, &potentialEnergySum, rest);
746 if constexpr (newton3) {
747 for (
size_t vecIndex = 0; vecIndex < rest; ++vecIndex) {
748 fxptr[neighborList[j + vecIndex]] = fx2tmp[vecIndex];
749 fyptr[neighborList[j + vecIndex]] = fy2tmp[vecIndex];
750 fzptr[neighborList[j + vecIndex]] = fz2tmp[vecIndex];
756 const __m256d hSumfxfy = _mm256_hadd_pd(fxacc, fyacc);
757 const __m256d hSumfz = _mm256_hadd_pd(fzacc, fzacc);
759 const __m128d hSumfxfyLow = _mm256_extractf128_pd(hSumfxfy, 0);
760 const __m128d hSumfzLow = _mm256_extractf128_pd(hSumfz, 0);
762 const __m128d hSumfxfyHigh = _mm256_extractf128_pd(hSumfxfy, 1);
763 const __m128d hSumfzHigh = _mm256_extractf128_pd(hSumfz, 1);
765 const __m128d sumfxfyVEC = _mm_add_pd(hSumfxfyLow, hSumfxfyHigh);
766 const __m128d sumfzVEC = _mm_add_pd(hSumfzLow, hSumfzHigh);
768 const double sumfx = sumfxfyVEC[0];
769 const double sumfy = sumfxfyVEC[1];
770 const double sumfz = _mm_cvtsd_f64(sumfzVEC);
772 fxptr[indexFirst] += sumfx;
773 fyptr[indexFirst] += sumfy;
774 fzptr[indexFirst] += sumfz;
776 if constexpr (calculateGlobals) {
780 const __m256d hSumVirialxy = _mm256_hadd_pd(virialSumX, virialSumY);
781 const __m128d hSumVirialxyLow = _mm256_extractf128_pd(hSumVirialxy, 0);
782 const __m128d hSumVirialxyHigh = _mm256_extractf128_pd(hSumVirialxy, 1);
783 const __m128d hSumVirialxyVec = _mm_add_pd(hSumVirialxyHigh, hSumVirialxyLow);
786 const __m256d hSumVirialzPotentialEnergy = _mm256_hadd_pd(virialSumZ, potentialEnergySum);
787 const __m128d hSumVirialzPotentialEnergyLow = _mm256_extractf128_pd(hSumVirialzPotentialEnergy, 0);
788 const __m128d hSumVirialzPotentialEnergyHigh = _mm256_extractf128_pd(hSumVirialzPotentialEnergy, 1);
789 const __m128d hSumVirialzPotentialEnergyVec =
790 _mm_add_pd(hSumVirialzPotentialEnergyHigh, hSumVirialzPotentialEnergyLow);
794 _mm_store_pd(&globals[0], hSumVirialxyVec);
795 _mm_store_pd(&globals[2], hSumVirialzPotentialEnergyVec);
797 _aosThreadData[threadnum].virialSum[0] += globals[0];
798 _aosThreadData[threadnum].virialSum[1] += globals[1];
799 _aosThreadData[threadnum].virialSum[2] += globals[2];
800 _aosThreadData[threadnum].potentialEnergySum += globals[3];
811 return std::array<typename Particle_T::AttributeNames, 9>{Particle_T::AttributeNames::id,
812 Particle_T::AttributeNames::posX,
813 Particle_T::AttributeNames::posY,
814 Particle_T::AttributeNames::posZ,
815 Particle_T::AttributeNames::forceX,
816 Particle_T::AttributeNames::forceY,
817 Particle_T::AttributeNames::forceZ,
818 Particle_T::AttributeNames::typeId,
819 Particle_T::AttributeNames::ownershipState};
826 return std::array<typename Particle_T::AttributeNames, 6>{
827 Particle_T::AttributeNames::id, Particle_T::AttributeNames::posX,
828 Particle_T::AttributeNames::posY, Particle_T::AttributeNames::posZ,
829 Particle_T::AttributeNames::typeId, Particle_T::AttributeNames::ownershipState};
836 return std::array<typename Particle_T::AttributeNames, 3>{
837 Particle_T::AttributeNames::forceX, Particle_T::AttributeNames::forceY, Particle_T::AttributeNames::forceZ};
851 _potentialEnergySum = 0.;
852 _virialSum = {0., 0., 0.};
853 _postProcessed =
false;
854 for (
size_t i = 0; i < _aosThreadData.size(); ++i) {
855 _aosThreadData[i].setZero();
864 using namespace autopas::utils::ArrayMath::literals;
866 if (_postProcessed) {
868 "Already postprocessed, endTraversal(bool newton3) was called twice without calling initTraversal().");
870 if (calculateGlobals) {
871 for (
size_t i = 0; i < _aosThreadData.size(); ++i) {
872 _potentialEnergySum += _aosThreadData[i].potentialEnergySum;
873 _virialSum += _aosThreadData[i].virialSum;
877 _potentialEnergySum *= 0.5;
881 _potentialEnergySum /= 6.;
882 _postProcessed =
true;
884 AutoPasLog(DEBUG,
"Final potential energy {}", _potentialEnergySum);
885 AutoPasLog(DEBUG,
"Final virial {}", _virialSum[0] + _virialSum[1] + _virialSum[2]);
894 if (not calculateGlobals) {
896 "Trying to get potential energy even though calculateGlobals is false. If you want this functor to calculate "
898 "values, please specify calculateGlobals to be true.");
900 if (not _postProcessed) {
902 "Cannot get potential energy, because endTraversal was not called.");
904 return _potentialEnergySum;
912 if (not calculateGlobals) {
914 "Trying to get virial even though calculateGlobals is false. If you want this functor to calculate global "
915 "values, please specify calculateGlobals to be true.");
917 if (not _postProcessed) {
919 "Cannot get virial, because endTraversal was not called.");
921 return _virialSum[0] + _virialSum[1] + _virialSum[2];
934 _epsilon24 = _mm256_set1_pd(epsilon24);
935 _sigmaSquared = _mm256_set1_pd(sigmaSquared);
936 if constexpr (applyShift) {
937 _shift6 = _mm256_set1_pd(
940 _shift6 = _mm256_setzero_pd();
944 _epsilon24AoS = epsilon24;
945 _sigmaSquaredAoS = sigmaSquared;
946 if constexpr (applyShift) {
962 inline __m256d wrapperFMA(
const __m256d &factorA,
const __m256d &factorB,
const __m256d &summandC) {
964 return _mm256_fmadd_pd(factorA, factorB, summandC);
966 const __m256d tmp = _mm256_mul_pd(factorA, factorB);
967 return _mm256_add_pd(summandC, tmp);
978 class AoSThreadData {
980 AoSThreadData() : virialSum{0., 0., 0.}, potentialEnergySum{0.}, __remainingTo64{} {}
982 virialSum = {0., 0., 0.};
983 potentialEnergySum = 0.;
987 std::array<double, 3> virialSum;
988 double potentialEnergySum;
992 double __remainingTo64[(64 - 4 *
sizeof(double)) /
sizeof(double)];
995 static_assert(
sizeof(AoSThreadData) % 64 == 0,
"AoSThreadData has wrong size");
998 const __m256d _zero{_mm256_set1_pd(0.)};
999 const __m256d _one{_mm256_set1_pd(1.)};
1000 const __m256i _vindex = _mm256_set_epi64x(0, 1, 3, 4);
1001 const __m256i _masks[3]{
1002 _mm256_set_epi64x(0, 0, 0, -1),
1003 _mm256_set_epi64x(0, 0, -1, -1),
1004 _mm256_set_epi64x(0, -1, -1, -1),
1006 const __m256i _ownedStateDummyMM256i{0x0};
1008 const __m256d _cutoffSquared{};
1009 __m256d _shift6 = _mm256_setzero_pd();
1010 __m256d _epsilon24{};
1011 __m256d _sigmaSquared{};
1014 const double _cutoffSquaredAoS = 0;
1015 double _epsilon24AoS, _sigmaSquaredAoS, _shift6AoS = 0;
1020 double _potentialEnergySum;
1023 std::array<double, 3> _virialSum;
1026 std::vector<AoSThreadData> _aosThreadData;
1029 bool _postProcessed;
1033 constexpr static size_t vecLength = 4;
#define AutoPasLog(lvl, fmt,...)
Macro for logging providing common meta information without filename.
Definition: Logger.h:24
This class stores the (physical) properties of molecule types, and, in the case of multi-site molecul...
Definition: ParticlePropertiesLibrary.h:28
floatType getMixingShift6(intType i, intType j) const
Returns precomputed mixed shift * 6 for one pair of site types.
Definition: ParticlePropertiesLibrary.h:262
floatType getMixingSigmaSquared(intType i, intType j) const
Returns precomputed mixed squared sigma for one pair of site types.
Definition: ParticlePropertiesLibrary.h:252
floatType getMixing24Epsilon(intType i, intType j) const
Returns the precomputed mixed epsilon * 24.
Definition: ParticlePropertiesLibrary.h:224
static double calcShift6(double epsilon24, double sigmaSquared, double cutoffSquared)
Calculate the shift multiplied 6 of the lennard jones potential from given cutoff,...
Definition: ParticlePropertiesLibrary.h:575
AlignedAllocator class.
Definition: AlignedAllocator.h:29
Functor base class.
Definition: Functor.h:40
PairwiseFunctor class.
Definition: PairwiseFunctor.h:31
View on a fixed part of a SoA between a start index and an end index.
Definition: SoAView.h:23
size_t size() const
Returns the number of particles in the view.
Definition: SoAView.h:83
Default exception class for autopas exceptions.
Definition: ExceptionHandler.h:115
static void exception(const Exception e)
Handle an exception derived by std::exception.
Definition: ExceptionHandler.h:63
A functor to handle lennard-jones interactions between two particles (molecules).
Definition: LJFunctorAVX.h:47
bool allowsNewton3() final
Specifies whether the functor is capable of Newton3-like functors.
Definition: LJFunctorAVX.h:118
void initTraversal() final
Reset the global values.
Definition: LJFunctorAVX.h:850
LJFunctorAVX(double cutoff, ParticlePropertiesLibrary< double, size_t > &particlePropertiesLibrary)
Constructor for Functor with mixing active.
Definition: LJFunctorAVX.h:106
std::string getName() final
Returns name of functor.
Definition: LJFunctorAVX.h:114
double getVirial()
Get the virial.
Definition: LJFunctorAVX.h:911
double getPotentialEnergy()
Get the potential Energy.
Definition: LJFunctorAVX.h:893
static constexpr auto getComputedAttr()
Get attributes computed by this functor.
Definition: LJFunctorAVX.h:835
LJFunctorAVX(double cutoff)
Constructor for Functor with mixing disabled.
Definition: LJFunctorAVX.h:94
static constexpr bool getMixing()
Definition: LJFunctorAVX.h:844
void SoAFunctorSingle(autopas::SoAView< SoAArraysType > soa, bool newton3) final
PairwiseFunctor for structure of arrays (SoA)
Definition: LJFunctorAVX.h:184
void SoAFunctorVerlet(autopas::SoAView< SoAArraysType > soa, const size_t indexFirst, const std::vector< size_t, autopas::AlignedAllocator< size_t > > &neighborList, bool newton3) final
PairwiseFunctor for structure of arrays (SoA) for neighbor lists.
Definition: LJFunctorAVX.h:608
void AoSFunctor(Particle_T &i, Particle_T &j, bool newton3) final
PairwiseFunctor for arrays of structures (AoS).
Definition: LJFunctorAVX.h:126
void SoAFunctorPair(autopas::SoAView< SoAArraysType > soa1, autopas::SoAView< SoAArraysType > soa2, const bool newton3) final
PairwiseFunctor for structure of arrays (SoA)
Definition: LJFunctorAVX.h:191
bool allowsNonNewton3() final
Specifies whether the functor is capable of non-Newton3-like functors.
Definition: LJFunctorAVX.h:122
static constexpr auto getNeededAttr()
Get attributes needed for computation.
Definition: LJFunctorAVX.h:810
static constexpr auto getNeededAttr(std::false_type)
Get attributes needed for computation without N3 optimization.
Definition: LJFunctorAVX.h:825
bool isRelevantForTuning() final
Specifies whether the functor should be considered for the auto-tuning process.
Definition: LJFunctorAVX.h:116
void setParticleProperties(double epsilon24, double sigmaSquared)
Sets the particle properties constants for this functor.
Definition: LJFunctorAVX.h:932
LJFunctorAVX()=delete
Deleted default constructor.
void endTraversal(bool newton3) final
Accumulates global values, e.g.
Definition: LJFunctorAVX.h:863
constexpr T dot(const std::array< T, SIZE > &a, const std::array< T, SIZE > &b)
Generates the dot product of two arrays.
Definition: ArrayMath.h:233
int autopas_get_max_threads()
Dummy for omp_get_max_threads() when no OpenMP is available.
Definition: WrapOpenMP.h:144
@ dummy
Dummy or deleted state, a particle with this state is not an actual particle!
@ owned
Owned state, a particle with this state is an actual particle and owned by the current AutoPas object...
FunctorN3Modes
Newton 3 modes for the Functor.
Definition: Functor.h:22
int autopas_get_thread_num()
Dummy for omp_set_lock() when no OpenMP is available.
Definition: WrapOpenMP.h:132